[med-svn] [mptp] 01/02: New upstream version 0.2.2
Andreas Tille
tille at debian.org
Wed Mar 15 20:59:50 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mptp.
commit 00b45602af7b126be44e7bddd7ad2e31cfa345ec
Author: Andreas Tille <tille at debian.org>
Date: Wed Mar 15 21:59:12 2017 +0100
New upstream version 0.2.2
---
.gitignore | 22 +
.travis.yml | 7 +
ChangeLog.md | 22 +
LICENSE.txt | 661 ++++++++++++
Makefile.am | 3 +
README.md | 251 +++++
autogen.sh | 3 +
completion/Makefile.am | 4 +
completion/mptp | 27 +
configure.ac | 94 ++
man/Makefile.am | 23 +
man/mptp.1 | 355 +++++++
src/Makefile.am | 29 +
src/aic.c | 1128 +++++++++++++++++++++
src/arch.c | 75 ++
src/auto.c | 340 +++++++
src/dp.c | 358 +++++++
src/fasta.c | 305 ++++++
src/lex_rtree.l | 86 ++
src/lex_utree.l | 86 ++
src/likelihood.c | 55 +
src/maps.c | 82 ++
src/mptp.c | 626 ++++++++++++
src/mptp.h | 419 ++++++++
src/multirun.c | 362 +++++++
src/output.c | 73 ++
src/parse_rtree.y | 198 ++++
src/parse_utree.y | 221 ++++
src/python/compare.py | 90 ++
src/python/create_delimit_results.py | 54 +
src/python/create_delimit_results_simu_data.py | 54 +
src/python/create_scoring_results.py | 286 ++++++
src/python/create_scoring_results_with_gmyc.py | 329 ++++++
src/python/create_scoring_results_without_gmyc.py | 314 ++++++
src/python/create_scoring_results_without_ptp.py | 316 ++++++
src/python/create_subsets.py | 84 ++
src/python/extract_trees.py | 22 +
src/python/plotscript | 323 ++++++
src/python/plotscript_without_gmyc | 294 ++++++
src/python/rewrite_species_result_file_GMYC.py | 48 +
src/python/rewrite_species_result_file_PTP.py | 46 +
src/random.c | 128 +++
src/rtree.c | 684 +++++++++++++
src/svg.c | 404 ++++++++
src/svg_landscape.c | 246 +++++
src/util.c | 179 ++++
src/utree.c | 614 +++++++++++
47 files changed, 10430 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..29f63f1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+*.a
+*.o
+*.pdf
+*~
+.deps
+.dirstamp
+/aclocal.m4
+/autom4te.cache
+/bin
+/compile
+/config.h
+/config.h.in
+/config.log
+/config.status
+/configure
+/depcomp
+/install-sh
+/missing
+/stamp-h1
+Makefile
+Makefile.in
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e04b388
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,7 @@
+language: c
+
+compiler:
+ - gcc
+ - clang
+
+script: ./autogen.sh && ./configure && make && make check
diff --git a/ChangeLog.md b/ChangeLog.md
new file mode 100644
index 0000000..139f47f
--- /dev/null
+++ b/ChangeLog.md
@@ -0,0 +1,22 @@
+# Change Log
+All notable changes to `mptp` will be documented in this file.
+This project adheres to [Semantic Versioning](http://semver.org/).
+
+## [0.2.2] - 2017-01-31
+### Fixed
+ - Regular expressions now allow scientific notation when parsing branch lengths
+ - Improved accuracy of ASV score (takes into account tip species)
+ - Memory leaks when parsing incorrectly formatted trees
+
+## [0.2.1] - 2016-10-18
+### Fixed
+ - Updated ASV to consider only coalescent roots of ML delimitation
+ - Assertion stopping mptp when using random starting delimitations for MCMC
+
+## [0.2.0] - 2016-09-27
+### Fixed
+ - Floating point exception error when constructing random trees caused by
+ division by zero
+ - Allocation with malloc caused uninitialized variables when converting unrooted
+ tree to rooted for the MCMC method
+ - Sample size for the AIC with a correction for finite sample sizes
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..dba13ed
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..567b1d7
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,3 @@
+AUTOMAKE_OPTIONS = foreign
+SUBDIRS = src man completion
+EXTRA_DIST = autogen.sh LICENSE.txt README.md ChangeLog.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7fb5416
--- /dev/null
+++ b/README.md
@@ -0,0 +1,251 @@
+# Species Delimitation
+
+[![License](https://img.shields.io/badge/license-AGPL-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html)
+[![Build Status](https://travis-ci.org/Pas-Kapli/mptp.svg?branch=master)](https://travis-ci.com/Pas-Kapli/mptp)
+
+## Introduction
+
+The aim of this project is to implement a fast species delimitation method,
+based on PTP (Zhang et al. 2013). The new tool should:
+
+* have an open source code with an appropriate open source license.
+* 64-bit multi-threaded design that handles very large datasets.
+
+We have implemented a tool called mPTP which can handle very large biodiversity
+datasets. It implements a fast method to compute the ML delimitation from an
+inferred phylogenetic tree of the samples. Using MCMC, it also computes the
+support values for each clade, which can be used to assess the confidence of
+the ML delimitation.
+
+**ML delimitation** mPTP implements two flavours of the point-estimate
+solution. First, it implements the original method from (Zhang et al. 2013)
+where all within-species processes are modelled with a single exponential
+distribution. mPTP uses a dynamic programming implementation which estimates
+the ML delimitation faster and more accurately than the original PTP. The
+dynamic programming implementation has similar properties as (Gulek et al.
+2010). See the [wiki](https://github.com/Pas-Kapli/mptp/wiki) for more
+information. The second method assumes a distinct exponential distribution for
+the branching events of each of the delimited species allowing it to fit to a
+wider range of empirical datasets.
+
+**MCMC method** mPTP generates support values for each clade. They represent
+the ratio of the number of samples for which a particular node was in the
+between-species process, to the total number of samples.
+
+## Compilation instructions
+
+**Cloning the repo** Clone the repo and build the executable and the documentation using
+the following commands.
+
+```bash
+git clone https://github.com/Pas-Kapli/mptp.git
+cd mptp
+./autogen.sh
+./configure
+make
+make install # as root, or run sudo make install
+```
+
+You will need [GNU Bison](http://www.gnu.org/software/bison/) and
+[Flex](http://flex.sourceforge.net/) installed on your system. When using the
+cloned repository version, you will also need
+[autoconf](https://www.gnu.org/software/autoconf/autoconf.html) and
+[automake](https://www.gnu.org/software/automake/) installed. Optionally, you
+will need the [GNU Scientific Library](http://www.gnu.org/software/gsl/) for
+the likelihood ratio test. If it is not available on your system, the ratio
+test will be disabled.
+
+On a Debian-based Linux system, the four packages can be installed
+using the command
+
+```bash
+sudo apt-get install libgsl0-dev flex bison autotools-dev
+```
+
+Optionally, you can install the bash auto-completion for mptp. To do that,
+replace the `./configure` step above with
+```bash
+./configure --with-bash-completions=DIR
+```
+where `DIR` is the directory where bash autocompletion is stored. You can use
+`pkg-config` as follows:
+```bash
+./configure --with-bash-completions=`pkg-config --variable=completionsdir bash-completion`
+```
+
+**Source distribution** To download the source distribution from a
+[release](https://github.com/Pas-Kapli/mptp/releases) and build the executable
+and the documentation, use the following commands:
+
+```bash
+wget https://github.com/Pas-Kapli/mptp/releases/download/v0.2.2/mptp-src-0.2.2.tar.gz
+tar zxvf mptp-src-0.2.2.tar.gz
+cd mptp-src-0.2.2
+./configure
+make
+make install # as root, or run sudo make install
+```
+
+Note that, similarly to cloning the repository, you will need [GNU
+Bison](http://www.gnu.org/software/bison/) and
+[Flex](http://flex.sourceforge.net/) installed on your system, and optionally,
+the [GNU Scientific Library](http://www.gnu.org/software/gsl/). However, you
+do not need [autoconf](https://www.gnu.org/software/autoconf/autoconf.html) and
+[automake](https://www.gnu.org/software/automake/) installed (note the missing `./autogen.sh`).
+See also the notes for installing the bash auto-completion, as described in
+the *Cloning the repo* section.
+
+
+**Binary distribution** Starting with version 0.2.0, binary distribution files
+(.tar.gz) for GNU/Linux on x86-64 containing pre-compiled binaries as well as
+the documentation (man and pdf files) will be made available as part of each
+[release](https://github.com/Pas-Kapli/mptp/releases). The included executables
+currently are not compiled with [`libgsl`](http://www.gnu.org/software/gsl/)
+support. This means, Likelihood Ratio Test (LRT) is disabled for the
+single-rate PTP model. However, we intend to implement dynamic loading for
+`libgsl` and therefore this issue will disappear in the next releases. Until then, please
+consider compiling from source in order to enable `libgsl`.
+
+To use the pre-compiled binary, download the appropriate executable for your
+system using the following commands if you are using a Linux system:
+
+```bash
+wget https://github.com/Pas-Kapli/mptp/releases/download/v0.2.2/mptp-0.2.2-linux-x86_64.tar.gz
+tar zxvf mptp-0.2.2-linux-x86_64.tar.gz
+```
+
+You will now have the binary distribution in a folder called
+`mptp-0.2.2-linux-x86_64` in which you will find three subfolders `bin`, `man`
+and `doc`. We recommend making a copy or a symbolic link to the mptp binary
+`bin/mptp` in a folder included in your `$PATH`, and a copy or a symbolic link
+to the mptp man page `man/mptp.1` in a folder included in your `$MANPATH`. The
+PDF version of the manual is available in `doc/mptp_manual.pdf`.
+
+
+
+## Command-line options
+
+General options:
+
+* `--help`
+* `--version`
+* `--quiet`
+* `--tree_show`
+* `--multi`
+* `--single`
+* `--ml`
+* `--mcmc INT`
+* `--mcmc_sample INT`
+* `--mcmc_log`
+* `--mcmc_burnin INT`
+* `--mcmc_startnull`
+* `--mcmc_startrandom`
+* `--mcmc_startml`
+* `--mcmc_credible REAL`
+* `--mcmc_runs INT`
+* `--outgroup TAXA`
+* `--outgroup_crop`
+* `--minbr REAL`
+* `--minbr_auto FILENAME`
+* `--pvalue REAL`
+* `--precision INT`
+
+Input and output options:
+
+* `--tree_file FILENAME`
+* `--output_file FILENAME`
+
+Visualization options:
+
+* `--svg_width INT`
+* `--svg_fontsize INT`
+* `--svg_tipspacing INT`
+* `--svg_legend_ratio <0..1>`
+* `--svg_nolegend`
+* `--svg_marginleft INT`
+* `--svg_marginright INT`
+* `--svg_margintop INT`
+* `--svg_marginbottom INT`
+* `--svg_inner_radius INT`
+
+## Usage example
+
+```bash
+mptp --ml --multi --tree_file testTree --output_file out --outgroup A,C --tree_show
+mptp --mcmc 50000000 --multi --mcmc_sample 1000000 --mcmc_burnin 1000000 --tree_file tree.newick --output_file out
+```
+
+## Documentation
+
+If `mptp` was installed according to the [Compilation
+instructions](https://github.com/Pas-Kapli/mptp#compilation-instructions) you
+can access the man pages by:
+
+```bash
+man mptp
+```
+
+A comprehensive documentation is also available in the [wiki](https://github.com/Pas-Kapli/mptp/wiki).
+
+## License and third party licenses
+
+The code is currently licensed under the [GNU Affero General Public License version 3](http://www.gnu.org/licenses/agpl-3.0.en.html).
+
+## Code
+
+ File | Description
+--------------------|----------------
+**arch.c** | Architecture specific code (Mac/Linux).
+**auto.c** | Code for auto-detecting minimum branch length.
+**aic.c** | Code for Bayesian Single- and multi-rate PTP.
+**mptp.c** | Main file handling command-line parameters and executing corresponding parts.
+**mptp.h** | MPTP Header file.
+**dp.c** | Single- and multi-rate DP heuristics for solving the PTP problem.
+**fasta.c** | Code for reading FASTA files.
+**lex_rtree.l** | Lexical analyzer parsing newick rooted trees.
+**lex_utree.l** | Lexical analyzer parsing newick unrooted trees.
+**likelihood.c** | Likelihood related functions.
+**Makefile.am** | Automake file for generating Makefile.in.
+**maps.c** | Character mapping arrays for converting sequences to the internal representation.
+**multirun.c** | Functions to execute multiple MCMC runs and compute ASD of support values.
+**output.c** | Output related files.
+**parse_rtree.y** | Functions for parsing rooted trees in newick format.
+**parse_utree.y** | Functions for parsing unrooted trees in newick format.
+**random.c** | Functions for creating a random delimitation.
+**rtree.c** | Rooted tree manipulation functions.
+**svg.c** | SVG visualization of delimited tree.
+**svg_landscape.c** | SVG visualization of likelihood landscape.
+**util.c** | Various common utility functions.
+**utree.c** | Unrooted tree manipulation functions.
+
+## The team
+
+* Paschalia Kapli
+* Sarah Lutteropp
+* Kassian Kobert
+* Pavlos Pavlidis
+* Jiajie Zhang
+* Alexandros Stamatakis
+* Tomáš Flouri
+
+# References
+
+* Zhang J., Kapli P., Pavlidis P., Stamatakis A. (2013)
+**A general species delimitation method with applications to phylogenetic placements.**
+*Bioinformatics*, 29(22):2869-2876.
+doi:[10.1093/bioinformatics/btt499](http://dx.doi.org/10.1093/bioinformatics/btt499)
+
+* Nguyen XV, Epps J., Bailey J. (2010)
+**Information Theoretic Measures for Clustering Comparison: Variants, Properties, Normalization and Correction for Chance.**
+*Journal of Machine Learning Research*, 11:2837-2854.
+[PDF](http://www.jmlr.org/papers/volume11/vinh10a/vinh10a.pdf)
+
+* Gulek M., Toroslu IH. (2010)
+**A dynamic programming algorithm for tree-like weighted set packing problem.**
+*Information Sciences*, 180(20):3974-3979.
+doi:[10.1016/j.ins.2010.06.035](http://dx.doi.org/10.1016/j.ins.2010.06.035)
+
+* Powell JR. (2012)
+**Accounting for uncertainty in species delineation during the analysis of environmental DNA sequence data.**
+*Methods in Ecology and Evolution*, 3(1):1-11.
+doi:[10.1111/j.2041-210X.2011.00122.x](http://dx.doi.org/10.1111/j.2041-210X.2011.00122.x)
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..d73da49
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+autoreconf --force --install
+
diff --git a/completion/Makefile.am b/completion/Makefile.am
new file mode 100644
index 0000000..71e7319
--- /dev/null
+++ b/completion/Makefile.am
@@ -0,0 +1,4 @@
+if HAVE_BASH_COMPLETIONS
+ bashcompletiondir = $(bash_completions_dir)
+ dist_bashcompletion_DATA = mptp
+endif
diff --git a/completion/mptp b/completion/mptp
new file mode 100644
index 0000000..8e4382e
--- /dev/null
+++ b/completion/mptp
@@ -0,0 +1,27 @@
+_mptp()
+{ # Completion handler for mptp: fills COMPREPLY with candidates for the current word.
+ local cur prev opts
+ COMPREPLY=()
+ cur="${COMP_WORDS[COMP_CWORD]}"
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
+ opts="--help --version --tree_show --multi --single --ml --mcmc --mcmc_sample
+ --mcmc_log --mcmc_burnin --mcmc_runs --mcmc_credible --mcmc_startnull
+ --mcmc_startrandom --mcmc_startml --pvalue --minbr --minbr_auto --outgroup
+ --outgroup_crop --quiet --precision --seed --tree_file --output_file
+ --svg_width --svg_fontsize --svg_tipspacing --svg_legend_ratio --svg_nolegend
+ --svg_marginleft --svg_marginright --svg_margintop --svg_marginbottom
+ --svg_inner_radius"
+ # Options that take an input file complete to file names (_filedir is expected from bash-completion).
+ case "${prev}" in
+ '--tree_file'|'--minbr_auto')
+ #COMPREPLY=( $(compgen -f ${cur}) )
+ _filedir
+ return 0
+ ;;
+ *)
+ ;;
+ esac
+ # Otherwise offer every known option matching the word typed so far.
+ COMPREPLY=( $(compgen -W "${opts}" -- "${cur}") )
+}
+complete -F _mptp mptp
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..7e8fa14
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,94 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.63])
+AC_INIT([mptp], [0.2.2], [Tomas.Flouri at h-its.org])
+AM_INIT_AUTOMAKE([subdir-objects])
+AC_LANG([C])
+AC_CONFIG_SRCDIR([src/mptp.c])
+AC_CONFIG_HEADERS([config.h])
+AC_CANONICAL_HOST
+
+# Checks for programs.
+AC_PROG_CC
+AC_PROG_RANLIB
+AC_PROG_SED
+AC_PROG_LEX
+if test "x$LEX" != xflex; then
+ AC_MSG_ERROR(could not find required installation of FLEX)
+fi
+
+AC_PROG_YACC
+if test "x$YACC" != x"bison -y"; then
+ AC_MSG_ERROR(could not find required installation of BISON)
+fi
+
+AC_PROG_INSTALL
+
+# Checks for header files.
+AC_CHECK_HEADERS([assert.h stdio.h stdarg.h string.h getopt.h stdlib.h regex.h ctype.h locale.h limits.h string.h sys/time.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_INLINE
+AC_TYPE_SIZE_T
+
+# Checks for library functions.
+AC_FUNC_MALLOC
+AC_FUNC_STRTOD
+AC_FUNC_ALLOCA
+AC_FUNC_REALLOC
+AC_CHECK_FUNCS([memmove memcpy gettimeofday memchr memset pow regcomp strcasecmp strchr strcspn sysinfo])
+
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([gslcblas], [cblas_dgemm])
+AC_CHECK_LIB([gsl], [gsl_cdf_chisq_P])
+
+# Bash completions: installed only when --with-bash-completions=DIR names a target directory.
+AC_ARG_WITH([bash-completions],
+ AS_HELP_STRING([--with-bash-completions=[DIR]], [Bash completions directory [default=no]]),
+ [with_bash_completions="$withval"],
+ [with_bash_completions="no"]
+)
+AS_CASE([$with_bash_completions],
+# [yes], [PKG_CHECK_VAR([bash_completions_dir], [bash-completion], [completionsdir], [], [AC_MSG_ERROR([bash completions not found])])],
+ [no], [bash_completions_dir=],
+ [bash_completions_dir="$with_bash_completions"]
+)
+AC_SUBST([bash_completions_dir])
+AM_CONDITIONAL(HAVE_BASH_COMPLETIONS, test -n "$bash_completions_dir")
+AS_IF([test -n "$bash_completions_dir"],
+ [bash_completions_output="${bash_completions_dir}"],
+ [bash_completions_output=no]
+)
+
+have_ps2pdf=no
+AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation]))
+AS_IF([test "x$enable_pdfman" != "xno"], [
+ have_ps2pdf=yes
+ AC_CHECK_PROG(HAVE_PS2PDF, ps2pdf, yes, no)
+ if test "x$HAVE_PS2PDF" = "xno"; then
+ AC_MSG_WARN([*** ps2pdf is required to build a PDF version of the manual])
+ have_ps2pdf=no
+ fi
+])
+
+AM_CONDITIONAL(HAVE_PS2PDF, test "x${have_ps2pdf}" = "xyes")
+AM_PROG_CC_C_O
+
+AC_CONFIG_FILES([Makefile
+ src/Makefile
+ man/Makefile
+ completion/Makefile])
+
+AC_OUTPUT
+
+AC_MSG_RESULT([
+ $PACKAGE $VERSION
+
+ Target: $host_os $host_cpu
+ Compiler: ${CC}
+ CFLAGS: ${CFLAGS} ${CPPFLAGS}
+ LIBS: ${LIBS} ${LDFLAGS}
+
+ Continue with 'make' command
+])
diff --git a/man/Makefile.am b/man/Makefile.am
new file mode 100644
index 0000000..8cf24e8
--- /dev/null
+++ b/man/Makefile.am
@@ -0,0 +1,23 @@
+# Makefile for creating PDF manual from man file
+
+dist_man_MANS = mptp.1
+
+if HAVE_PS2PDF
+
+doc_DATA = mptp_manual.pdf
+# Unescape `\-` in the man source (re-encoded to ISO-8859-1 on Darwin), then pipe `man -t` into ps2pdf.
+mptp_manual.pdf : mptp.1
+ TEMP=$$(mktemp temp.XXXXXXXX) ; \
+ if [ "$$(uname)" = "Darwin" ] ; then \
+ ${SED} -e 's/\\-/-/g' $< | \
+ iconv -f UTF-8 -t ISO-8859-1 > $$TEMP ; \
+ else \
+ ${SED} -e 's/\\-/-/g' $< > $$TEMP ; \
+ fi ; \
+ man -t ./$$TEMP | ps2pdf -sPAPERSIZE=a4 - $@ ; \
+ rm $$TEMP
+
+CLEANFILES=mptp_manual.pdf
+
+endif
+
diff --git a/man/mptp.1 b/man/mptp.1
new file mode 100644
index 0000000..c685b2d
--- /dev/null
+++ b/man/mptp.1
@@ -0,0 +1,355 @@
+.\" -*- coding: utf-8 -*-
+.\" ============================================================================
+.TH mptp 1 "January 31, 2017" "mptp 0.2.2" "USER COMMANDS"
+.\" ============================================================================
+.SH NAME
+mptp \(em single-locus species delimitation
+.\" ============================================================================
+.SH SYNOPSIS
+.\" left justified, ragged right
+.ad l
+Maximum-likelihood species delimitation:
+.RS
+\fBmptp\fR \-\-ml (\-\-single | \-\-multi) \-\-tree_file \fInewickfile\fR
+\-\-output_file \fIoutputfile\fR [\fIoptions\fR]
+.PP
+.RE
+Species delimitation with support values:
+.RS
+\fBmptp\fR \-\-mcmc \fIpositive integer\fR (\-\-single | \-\-multi)
+(\-\-mcmc_startnull | \-\-mcmc_startrandom | \-\-mcmc_startml) \-\-mcmc_log
+\fIpositive integer\fR \-\-tree_file \fInewickfile\fR \-\-output_file
+\fIoutputfile\fR [\fIoptions\fR]
+.PP
+.RE
+.\" left and right justified (default)
+.ad b
+.\" ============================================================================
+.SH DESCRIPTION
+Species is one of the fundamental units of comparison in virtually all
+subfields of biology, from systematics to anatomy, development, ecology,
+evolution, genetics and molecular biology. The aim of \fBmptp\fR is to offer
+an open source tool to infer species boundaries on a given phylogenetic tree
+based on the Poisson Tree Process (PTP) and the Multiple Poisson Tree Process
+(mPTP) models.
+.PP
+\fBmptp\fR offers two methods for inferring species delimitation. First, a
+maximum-likelihood based method that uses a dynamic programming approach to
+infer an ML estimate. Second, an mcmc approach for sampling the space of
+possible delimitations providing the user with support values on the tree clades.
+Both approaches are available in two flavours: the PTP and the mPTP model. The
+PTP model is specified by using the \fIsingle\fR switch and the mPTP by using
+\fImulti\fR.
+.\" ============================================================================
+.SS Input
+The input for \fBmptp\fR is a newick file that contains one phylogenetic tree,
+i.e., branches express the expected number of substitutions per alignment site.
+.\" ============================================================================
+.SS Options
+\fBmptp\fR parses a large number of command-line options. For easier
+navigation, options are grouped below by theme.
+.PP
+General options:
+.RS
+.TP 9
+.B \-\-help
+Display help text and exit.
+.TP
+.B \-\-version
+Output version information and exit.
+.TP
+.B \-\-quiet
+Suppress all output to stdout except for warnings and fatal error messages.
+.TP
+.BI \-\-tree_file \0filename
+Input newick file that contains a phylogenetic tree. Can be rooted or unrooted.
+.TP
+.BI \-\-output_file \0filename
+Specifies the prefix used for generating output files. For maximum-likelihood
+species delimitation two files will be created. First, \fIfilename\fR.txt that
+contains the actual delimitation and \fIfilename\fR.svg that contains an SVG
+figure of the computed delimitation. For mcmc analyses, a file
+\fIfilename\fR.txt is created that contains the newick tree with support
+values.
+.TP
+.BI \-\-outgroup\~ "comma-separated list of taxa"
+All computations for species delimitation are carried out on rooted trees. This
+option is used only (and is required) in case an unrooted tree was specified
+with the \-\-tree_file option. \fImptp\fR roots the unrooted tree by
+splitting the branch leading to the most recent common ancestor (MRCA) of the
+comma-separated list of taxa into two branches of equal size and introducing a
+new node (the root of the new rooted tree) that connects these two branches.
+.TP
+.BI \-\-outgroup_crop
+Crops taxa specified with the \-\-outgroup option from the tree.
+.TP
+.BI \-\-minbr \0real
+Any branch lengths in the input tree smaller than or equal to \fIreal\fR are
+excluded (ignored) from the computations. In addition, for mcmc analyses,
+subtrees that exclusively consist of branch lengths smaller than or equal to
+\fIreal\fR are completely ignored from the proposals (support values for those
+clades are set to 0). (default: 0.0001)
+.TP
+.BI \-\-precision\~ "positive integer"
+Specifies the precision of the decimal part of floating point numbers on output
+(default: 7)
+.TP
+.BI \-\-minbr_auto \0filename
+Automatically detects the minimum branch length from the p-distances of the
+FASTA file \fIfilename\fR.
+.TP
+.BI \-\-tree_show
+Show an ASCII version of the processed input tree (i.e. after it is rooted by,
+potentially cropping, the outgroup).
+.RE
+.PP
+.\" ============================================================================
+Maximum-likelihood estimations:
+.PP
+.RS
+Estimating the maximum-likelihood delimitation is triggered by the switch
+\-\-ml followed by \-\-single (the PTP model) or \-\-ml \-\-multi (the mPTP
+model). Note that these two methods affect how the option \-\-output_file behaves
+and can be controlled using the \-\-minbr switch. Both methods require a
+rooted phylogenetic tree, however an unrooted tree may be specified in
+conjunction with the option \-\-outgroup. In this case, \fImptp\fR roots it at
+that outgroup (see General options, \-\-outgroup for more info). Note that both
+methods output an SVG depiction of the ML delimitation. See SVG Output for
+more information on adjusting and fine-tuning the SVG output.
+.PP
+Both methods discard branch lengths of size smaller than the size
+specified using the \-\-minbr option. The PTP model then attempts to find a
+connected subgraph of the rooted tree that (a) contains the root, and (b) the
+sum of likelihoods of fitting the edges of that subgraph in one exponential
+distribution and the remaining edges in another (exponential distribution) is
+maximized. With likelihood we mean the sums of the probability density function
+with the mean defined as the reciprocal of the average of edge lengths in the
+particular distribution.
+.PP
+.TP 9
+.B \-\-ml \-\-single
+Triggers the algorithm for computing an ML estimate of the delimitation using
+the PTP model.
+.TP
+.B \-\-ml \-\-multi
+Triggers the algorithm for computing an ML estimate of the delimitation using
+the mPTP model.
+.TP
+.B \-\-pvalue \0real
+Only used with the PTP model (specified with \-\-single). Sets the p-value for
+performing a likelihood ratio test. Note that there is no likelihood ratio test
+for the mPTP model, so the test is not performed there. (default: 0.001)
+.RE
+.PP
+.\" ============================================================================
+MCMC method:
+.PP
+.RS
+The MCMC method is triggered with the \-\-mcmc switch combined with either
+\-\-single (the PTP model) or \-\-multi (the mPTP model).
+.PP
+Some more stuff to write
+.PP
+.TP 9
+.B \-\-mcmc\~ "positive integer" \-\-single
+Triggers the algorithm for computing support values by taking the specified
+number of MCMC samples (delimitations) using the PTP model.
+.TP
+.B \-\-mcmc\~ "positive integer" \-\-multi
+Triggers the algorithm for computing support values by taking the specified
+number of MCMC samples (delimitations) using the mPTP model.
+.TP
+.B \-\-mcmc_sample\~ "positive integer"
+Sample only every n-th MCMC step.
+.TP
+.B \-\-mcmc_log
+Log the scores (log-likelihood) for each MCMC sample in a file and create an SVG
+plot.
+.TP
+.B \-\-mcmc_burnin\~ "positive integer"
+Ignore all MCMC samples generated before the specified step. (default: 1)
+.TP
+.B \-\-mcmc_runs\~ "positive integer"
+Perform multiple MCMC runs. If more than 1 run is specified, mptp will generate
+one seed for each run based on the provided seed using the \-\-seed switch.
+Output files will be generated for each run (default: 1)
+.TP
+.B \-\-mcmc_credible \0real
+Specify the probability (0.0 to 1.0) for which to generate the credible interval
+i.e., the probability the true number of species will fall within the credible
+interval given the observed data. (default: 0.95)
+.TP
+.B \-\-mcmc_startnull
+Start MCMC sampling from the null-model.
+.TP
+.B \-\-mcmc_startrandom
+Start MCMC sampling from a random delimitation.
+.TP
+.B \-\-mcmc_startml
+Start MCMC sampling from the ML delimitation.
+.TP
+.B \-\-seed\~ "positive integer"
+Specifies the seed for the pseudo-random number generator. (default: randomly
+generated based on system time)
+.RE
+.PP
+.\" ============================================================================
+SVG Output:
+.PP
+.RS
+The ML method generates one SVG file that visualizes the processed input tree
+(i.e. after it is rooted by, potentially cropping, the outgroup) and marks the
+subtrees corresponding to coalescent processes (the detected species groups)
+with red color, while the speciation process is colored green.
+.PP
+The MCMC method generates one SVG file per run visualizing the processed
+tree, and indicates the support value for each node, i.e., the percentage of
+MCMC samples (delimitations) in which the particular node was part of the
+speciation process. A value of 1 means it was always in the speciation process
+while a value of 0 means it was always in a coalescent process. The tree
+branches are colored according to the support values of descendant nodes; a
+support value of 0 is colored with red, 1 with black, and values in between
+are gradients of the two colors. Only support values above 0.5 are shown to
+avoid packed numbers in dense branching events. In addition, if \-\-mcmc_log is
+specified, an additional SVG image of log-likelihoods plots for each sampled
+delimitation is created.
+.PP
+.TP 9
+.B \-\-svg_width\~ "positive integer"
+Sets the total width (including margins) of the SVG in pixels. (default: 1920)
+.TP
+.B \-\-svg_fontsize\~ "positive integer"
+Size of font in SVG image. (default: 12)
+.TP
+.B \-\-svg_tipspacing\~ "positive integer"
+Vertical space in pixels between taxa in SVG tree. (default: 20)
+.TP
+.B \-\-svg_legend_ratio \0real
+Ratio (value between 0.0 and 1.0) of total tree length to be displayed as
+legend line. (default: 0.1)
+.TP
+.B \-\-svg_nolegend
+Hide legend.
+.TP
+.B \-\-svg_marginleft\~ "positive integer"
+Left margin in pixels. (default: 20)
+.TP
+.B \-\-svg_marginright\~ "positive integer"
+Right margin in pixels. (default: 20)
+.TP
+.B \-\-svg_margintop\~ "positive integer"
+Top margin in pixels. (default: 20)
+.TP
+.B \-\-svg_marginbottom\~ "positive integer"
+Bottom margin in pixels. (default: 20)
+.TP
+.B \-\-svg_inner_radius\~ "positive integer"
+Radius of inner nodes in pixels. (default: 0)
+.RE
+.PP
+.\" ============================================================================
+.SH EXAMPLES
+.PP
+Compute the maximum likelihood estimate using the mPTP model by discarding all
+branches with length below or equal to 0.0001
+.PP
+.RS
+\fBmptp\fR \-\-ml \-\-multi \-\-minbr 0.0001 \-\-tree_file \fInewick.txt\fR
+\-\-output_file \fIout\fR
+.RE
+.PP
+Run an MCMC analysis of 100 million steps with the mPTP model, that logs every
+one million-th step, ignores the first 2 million steps and discards all branches
+with lengths smaller or equal to 0.0001. Use 777 as seed. The chain will start
+from the ML delimitation (default).
+.PP
+.RS
+\fBmptp\fR \-\-mcmc 100000000 \-\-multi \-\-minbr 0.0001 \-\-tree_file
+\fInewick.txt\fR \-\-output_file \fIout\fR \-\-mcmc_log 1000000 \-\-mcmc_burnin
+2000000 \-\-seed 777
+.RE
+.PP
+Perform an MCMC analysis of 5 runs, each of 100 million steps with the mPTP
+model, log every one million-th step, ignore the first 2 million steps, and
+detect the minimum branch length by specifying the FASTA file alignment.fa that
+contains the alignment. Use 777 as seed. Start each run from a random
+delimitation.
+.PP
+.RS
+\fBmptp\fR \-\-mcmc 100000000 \-\-multi \-\-mcmc_runs 5 \-\-mcmc_log 1000000
+\-\-minbr_auto \fIalignment.fa\fR \-\-tree_file \fInewick.txt\fR
+\-\-output_file \fIout\fR \-\-mcmc_burnin 2000000 \-\-seed 777
+\-\-mcmc_startrandom
+.RE
+.PP
+.\"
+.\" ============================================================================
+.SH AUTHORS
+Implementation by Tomas Flouri, Sarah Lutteropp and Paschalia Kapli. Additional
+PTP and mPTP model authors include Kassian Kobert, Jiajie Zhang, Pavlos
+Pavlidis, and Alexandros Stamatakis.
+.SH REPORTING BUGS
+Submit suggestions and bug-reports at
+<https://github.com/Pas-Kapli/mptp/issues>, or e-mail Tomas Flouri
+<Tomas.Flouri at h-its.org>.
+.\" ============================================================================
+.SH AVAILABILITY
+Source code and binaries are available at
+<https://github.com/Pas-Kapli/mptp>.
+.\" ============================================================================
+.SH COPYRIGHT
+Copyright (C) 2015-2017, Tomas Flouri, Sarah Lutteropp, Paschalia Kapli
+.PP
+All rights reserved.
+.PP
+Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+Scientific Computing, Heidelberg Institute for Theoretical Studies,
+69118 Heidelberg, Germany
+.PP
+This software is licensed under the terms of the GNU Affero General Public
+License version 3.
+.PP
+\fBGNU Affero General Public License version 3\fR
+.PP
+This program is free software: you can redistribute it and/or modify it under
+the terms of the GNU Affero General Public License as published by the Free
+Software Foundation, either version 3 of the License, or (at your option) any
+later version.
+.PP
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+details.
+.PP
+You should have received a copy of the GNU Affero General Public License along
+with this program. If not, see <http://www.gnu.org/licenses/>.
+.SH VERSION HISTORY
+New features and important modifications of \fBmptp\fR (short lived or minor
+bug releases may not be mentioned):
+.RS
+.TP
+.BR v0.1.0\~ "released June 27th, 2016"
+First public release.
+.TP
+.BR v0.1.1\~ "released July 15th, 2016"
+Bug fix (now LRT test is not printed in output file when using \-\-multi).
+.TP
+.BR v0.2.0\~ "released September 27th, 2016"
+Fixed floating point exception error when constructing random trees, caused
+from dividing by zero. Changed allocation from malloc to calloc, as it caused
+uninitialized variables when converting unrooted trees to rooted when using the
+MCMC method. Fixed sample size for the AIC with a correction for finite sample
+sizes.
+.TP
+.BR v0.2.1\~ "released October 18th, 2016"
+Updated ASV to consider only coalescent roots of ML delimitation. Removed
+assertion stopping mptp when using random starting delimitations for the MCMC
+method.
+.TP
+.BR v0.2.2\~ "released January 31st, 2017"
+Fixed regular expressions to allow scientific notation for branch lengths when
+parsing trees. Improved the accuracy of ASV score by also taking into account
+tips forming coalescent roots. Fixed memory leaks that occur when parsing
+incorrectly formatted trees.
+.RE
+.LP
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..d33e607
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,29 @@
+bin_PROGRAMS = $(top_builddir)/bin/mptp
+
+libparse_utree_a_SOURCES = parse_utree.y lex_utree.l
+libparse_rtree_a_SOURCES = parse_rtree.y lex_rtree.l
+noinst_LIBRARIES = libparse_utree.a libparse_rtree.a
+
+
+AM_CFLAGS=-I${srcdir} -O3 -mtune=native -Wall -Wsign-compare -g ${LIBS}
+AM_YFLAGS = -d -p `${SED} -n 's/.*_\(.*\)/\1_/p' <<<"$*"`
+AM_LFLAGS = -o lex.yy.c
+
+__top_builddir__bin_mptp_LDADD = libparse_utree.a libparse_rtree.a
+__top_builddir__bin_mptp_SOURCES = arch.c \
+auto.c \
+aic.c \
+mptp.c \
+mptp.h \
+dp.c \
+fasta.c \
+likelihood.c \
+maps.c \
+multirun.c \
+output.c \
+random.c \
+rtree.c \
+svg.c \
+svg_landscape.c \
+util.c \
+utree.c
diff --git a/src/aic.c b/src/aic.c
new file mode 100644
index 0000000..836bfbb
--- /dev/null
+++ b/src/aic.c
@@ -0,0 +1,1128 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+typedef struct density_s
+{
+ double logl;
+ long species_count;
+} density_t;
+
+static rtree_t ** crnodes;
+static rtree_t ** snodes;
+
+static long crnodes_count = 0;
+static long snodes_count = 0;
+
+static long accept_count = 0;
+static FILE * fp_log = NULL;
+
+static long species_count = 0;
+
+static density_t * densities = NULL;
+
+static void mcmc_log(double logl, long sc)
+{
+ if (opt_mcmc_log)
+ fprintf(fp_log, "%f,%ld\n", logl, sc);
+}
+
+static int cb_desc(const void * va, const void * vb)
+{
+ const density_t * a = va;
+ const density_t * b = vb;
+
+ if (a->logl - b->logl < 0)
+ return 1;
+ else if (a->logl - b->logl > 0)
+ return -1;
+
+ return 0;
+}
+
+static void mcmc_init(rtree_t * root, long seed)
+{
+ long i;
+
+ crnodes = (rtree_t **)xmalloc((size_t)(root->leaves)*sizeof(rtree_t *));
+ snodes = (rtree_t **)xmalloc((size_t)(root->leaves)*sizeof(rtree_t *));
+
+ crnodes_count = 0;
+ snodes_count = 0;
+ accept_count = 0;
+
+ densities = (density_t *)xmalloc((size_t)(root->leaves+1)*sizeof(density_t));
+ memset(densities, 0, (size_t)(root->leaves+1) * sizeof(density_t));
+ for (i = 0; i < root->leaves+1; ++i)
+ densities[i].species_count = i;
+
+ /* open log file */
+ if (opt_mcmc_log)
+ fp_log = open_file_ext("log", seed);
+}
+
+static void init_null(rtree_t * root)
+{
+ int i;
+
+ rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) *
+ sizeof(rtree_t *));
+ rtree_query_innernodes(root, inner_node_list);
+
+ /* start mcmc analysis from null model */
+ for (i = 0; i < root->leaves - 1; ++i)
+ inner_node_list[i]->event = EVENT_COALESCENT;
+ free(inner_node_list);
+}
+
+static void mcmc_stats_init(rtree_t * root)
+{
+ int i;
+
+ rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) *
+ sizeof(rtree_t *));
+ rtree_query_innernodes(root, inner_node_list);
+
+ for (i = 0; i < root->leaves - 1; ++i)
+ {
+ if (inner_node_list[i]->event == EVENT_COALESCENT)
+ {
+ inner_node_list[i]->speciation_start = -1;
+ inner_node_list[i]->aic_weight_start = 0; // Just to initialize - it's not used
+ }
+ else
+ {
+ inner_node_list[i]->speciation_start = opt_mcmc_burnin-1;
+ inner_node_list[i]->aic_weight_start = 0; // This one should be used
+ }
+
+ inner_node_list[i]->speciation_count = 0;
+ }
+
+ free(inner_node_list);
+}
+
+static void hpd(long n, FILE * fp)
+{
+ long i;
+ long min, max;
+ double densities_sum = 0;
+ double acc_sum = 0;
+ long * indices = NULL;
+
+ indices = (long *)xmalloc((size_t)(n+2)*sizeof(long));
+ memset(indices, 0, (size_t)(n+2) * sizeof(long));
+
+ for (i = 1; i <= n; ++i)
+ densities_sum += densities[i].logl;
+
+ max = 0; min = n+1;
+ for (i = 1; i <= n; ++i)
+ {
+ acc_sum += densities[i].logl;
+ indices[densities[i].species_count] = 1;
+
+ if (densities[i].species_count < min)
+ min = densities[i].species_count;
+
+ if (densities[i].species_count > max)
+ max = densities[i].species_count;
+
+ if (acc_sum / densities_sum >= opt_mcmc_credible)
+ break;
+ }
+
+ fprintf(fp, "CCI (%ld,%ld)\n", min, max);
+ if (!opt_quiet)
+ fprintf(stdout, "CCI (%ld,%ld)\n", min, max);
+
+
+ fprintf(fp, "HPD ");
+ if (!opt_quiet)
+ printf("HPD ");
+ for (i = 1; i <= n+1; ++i)
+ {
+ if (indices[i] == 1 && indices[i-1] == 0)
+ {
+ fprintf(fp, "(%ld,", i);
+ if (!opt_quiet)
+ printf("(%ld,", i);
+ }
+ if (indices[i] == 0 && indices[i-1] == 1)
+ {
+ fprintf(fp, "%ld) ", i-1);
+ if (!opt_quiet)
+ printf("%ld) ", i-1);
+ }
+ }
+ fprintf(fp,"\n");
+ if (!opt_quiet)
+ printf("\n");
+ free(indices);
+
+}
+
+static void mcmc_finalize(rtree_t * root,
+ double mcmc_min_logl,
+ double mcmc_max_logl,
+ long seed,
+ double aic_weight_prefix_sum)
+{
+ long i;
+
+ if (!opt_quiet)
+ {
+ printf ("Minimum log-likelihood observed in mcmc run: %f\n", mcmc_min_logl);
+ printf ("Maximum log-likelihood observed in mcmc run: %f\n", mcmc_max_logl);
+ }
+
+ /* write support values to all nodes */
+ rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) *
+ sizeof(rtree_t *));
+ rtree_query_innernodes(root, inner_node_list);
+
+ for (i = 0; i < root->leaves - 1; ++i)
+ {
+ if (inner_node_list[i]->speciation_start != -1)
+ {
+ inner_node_list[i]->speciation_count = inner_node_list[i]->speciation_count +
+ opt_mcmc_steps -
+ inner_node_list[i]->speciation_start;
+ inner_node_list[i]->aic_support += aic_weight_prefix_sum - inner_node_list[i]->aic_weight_start;
+ }
+
+ inner_node_list[i]->aic_support /= aic_weight_prefix_sum;
+
+ inner_node_list[i]->support = inner_node_list[i]->aic_support;
+
+ /*inner_node_list[i]->support = inner_node_list[i]->speciation_count /
+ (double)(opt_mcmc_steps-opt_mcmc_burnin+1);*/
+ }
+
+ free(inner_node_list);
+ free(crnodes);
+ free(snodes);
+
+ if (opt_mcmc_log)
+ {
+ if (!opt_quiet)
+ fprintf(stdout, "Log written in %s.%ld.log ...\n", opt_outfile, seed);
+
+ fclose(fp_log);
+ }
+
+ FILE * fp_stats = open_file_ext("stats", seed);
+
+ double densities_sum = 0;
+ for (i = 1; i <= root->leaves; ++i)
+ densities_sum += densities[i].logl;
+
+ for (i = 1; i <= root->leaves; ++i)
+ {
+ fprintf(fp_stats,
+ "%ld,%f\n",
+ i,
+ (densities[i].logl/densities_sum)*100);
+ }
+
+ /* compute a HPD */
+ qsort(densities+1, (size_t)(root->leaves), sizeof(density_t), cb_desc);
+ hpd(root->leaves, fp_stats);
+
+ if (!opt_quiet)
+ fprintf(stdout,
+ "Statistics written in %s.%ld.stats ...\n",
+ opt_outfile,
+ seed);
+
+ fclose(fp_stats);
+ free(densities);
+}
+
+static void dp_recurse(rtree_t * node, int method)
+{
+ int k,j;
+
+ /* bottom-up recursion */
+
+ if (node->left) dp_recurse(node->left, method);
+ if (node->right) dp_recurse(node->right, method);
+
+ /* u_vec
+ *
+ / \
+ / \
+ v_vec * * w_vec */
+
+ dp_vector_t * u_vec = node->vector;
+
+ double spec_logl = loglikelihood(node->spec_edge_count,
+ node->spec_edgelen_sum);
+
+ u_vec[0].spec_edgelen_sum = 0;
+ u_vec[0].score_multi = node->coal_logl + spec_logl;
+ u_vec[0].score_single = node->coal_logl + spec_logl;
+ u_vec[0].coal_multi_logl = node->coal_logl;
+ u_vec[0].species_count = 1;
+ u_vec[0].filled = 1;
+
+ if (!node->left) return;
+
+ dp_vector_t * v_vec = node->left->vector;
+ dp_vector_t * w_vec = node->right->vector;
+
+ assert(node->spec_edge_count >= 0);
+
+ int u_edge_count = 0;
+ double u_edgelen_sum = 0;
+
+ /* check whether edges (u,v) and (u,w) are > min branch length */
+ if (node->left->length > opt_minbr)
+ {
+ u_edge_count++;
+ u_edgelen_sum += node->left->length;
+ }
+ if (node->right->length > opt_minbr)
+ {
+ u_edge_count++;
+ u_edgelen_sum += node->right->length;
+ }
+
+ for (j = 0; j <= node->left->edge_count; ++j)
+ {
+ for (k = 0; k <= node->right->edge_count; ++k)
+ {
+ /* if at least one of the two entries is not valid/filled, skip */
+ if (!v_vec[j].filled || !w_vec[k].filled) continue;
+
+ int i = j + k + u_edge_count;
+
+ /* set the number of species */
+ unsigned int u_species_count = v_vec[j].species_count +
+ w_vec[k].species_count;
+
+ /* compute multi-rate coalescent log-likelihood */
+ double coal_multi_logl = v_vec[j].coal_multi_logl +
+ w_vec[k].coal_multi_logl;
+
+ /* compute coalescent edge count and length sum of subtree u */
+ double u_spec_edgelen_sum = v_vec[j].spec_edgelen_sum +
+ w_vec[k].spec_edgelen_sum +
+ u_edgelen_sum;
+ int coal_edge_count = node->edge_count - i; /* change to int */
+ double coal_edgelen_sum = node->edgelen_sum - u_spec_edgelen_sum;
+
+
+ /* compute single-rate coalescent log-likelihood */
+ double coal_single_logl = loglikelihood(coal_edge_count,coal_edgelen_sum);
+
+ /* compute total speciation log-likelihood */
+ double spec_edgelen_sum = node->spec_edgelen_sum +
+ u_edgelen_sum +
+ v_vec[j].spec_edgelen_sum +
+ w_vec[k].spec_edgelen_sum;
+
+ int spec_edge_count = node->spec_edge_count + i;
+ assert(u_species_count > 0);
+ spec_logl = loglikelihood(spec_edge_count,spec_edgelen_sum);
+
+
+ /* compute single- and multi-rate scores */
+ double score_multi = coal_multi_logl + spec_logl;
+ double score_single = coal_single_logl + spec_logl;
+ double score = score_multi;
+ double best_score = u_vec[i].score_multi;
+
+ if (method == PTP_METHOD_SINGLE)
+ {
+ score = score_single;
+ best_score = u_vec[i].score_single;
+ }
+
+ if (!u_vec[i].filled || score > best_score)
+ {
+ u_vec[i].score_multi = score_multi;
+ u_vec[i].score_single = score_single;
+ u_vec[i].spec_edgelen_sum = u_spec_edgelen_sum;
+ u_vec[i].coal_multi_logl = coal_multi_logl;
+ u_vec[i].vec_left = j;
+ u_vec[i].vec_right = k;
+ u_vec[i].species_count = u_species_count;
+ u_vec[i].filled = 1;
+ }
+
+ }
+ }
+}
+
+static void backtrack_random(rtree_t * node,
+ bool *warning_minbr)
+
+{
+
+ node->mcmc_slot = -1;
+
+ if (node->event == EVENT_SPECIATION)
+ {
+ if (node->length <= opt_minbr && node->parent) *warning_minbr = true;
+
+ backtrack_random(node->left, warning_minbr);
+ backtrack_random(node->right, warning_minbr);
+
+ /* add to list of speciation nodes only if its two direct descendents
+ are coalescent roots and also the subtree at node has at least one
+ branch length greater than minbr */
+ if ((node->left->event == EVENT_COALESCENT) &&
+ (node->right->event == EVENT_COALESCENT) &&
+ (node->edge_count))
+ {
+ node->mcmc_slot = snodes_count;
+ snodes[snodes_count++] = node;
+ }
+
+ }
+ else
+ {
+
+ node->event = EVENT_COALESCENT;
+
+ /* add to list of coalescent roots in case it is not a tip AND if
+ the subtree rooted at node has at least one edge longer than minbr */
+ if (node->edge_count)
+ {
+ node->mcmc_slot = crnodes_count;
+ crnodes[crnodes_count++] = node;
+ }
+ }
+}
+
+static void backtrack(rtree_t * node,
+ long index,
+ bool *warning_minbr)
+
+{
+ dp_vector_t * vec = node->vector;
+
+ node->mcmc_slot = -1;
+
+ if ((vec[index].vec_left != -1) && (vec[index].vec_right != -1))
+ {
+ node->event = EVENT_SPECIATION;
+
+ if (node->length <= opt_minbr && node->parent) *warning_minbr = true;
+
+ backtrack(node->left, vec[index].vec_left, warning_minbr);
+ backtrack(node->right,vec[index].vec_right,warning_minbr);
+
+ /* add to list of speciation nodes only if its two direct descendents
+ are coalescent roots and also the subtree at node has at least one
+ branch length greater than minbr */
+ if ((node->left->event == EVENT_COALESCENT) &&
+ (node->right->event == EVENT_COALESCENT) &&
+ (node->edge_count))
+ {
+ node->mcmc_slot = snodes_count;
+ snodes[snodes_count++] = node;
+ }
+
+ }
+ else
+ {
+ node->event = EVENT_COALESCENT;
+
+ /* add to list of coalescent roots in case it is not a tip AND if
+ the subtree rooted at node has at least one edge longer than minbr */
+ if (node->edge_count)
+ {
+ node->mcmc_slot = crnodes_count;
+ crnodes[crnodes_count++] = node;
+ }
+ }
+}
+
+static void speciate(long r)
+{
+ /* CR S
+ * *
+ / \ -> / \
+ / \ / \
+ C * * C CR * * CR */
+
+
+ /* select the coalescent root at position r and split it into
+ two coalescent root nodes */
+
+ rtree_t * node = crnodes[r];
+
+ /* move the last node of the list to the position of the node
+ we just used */
+ if (r != (crnodes_count-1))
+ {
+ crnodes[r] = crnodes[crnodes_count-1];
+ crnodes[r]->mcmc_slot = r;
+ }
+ --crnodes_count;
+
+ /* eliminate parent from snodes if both its children were coalescent
+ roots, i.e. we had the case below:
+
+ S S
+ * *
+ / \ / \
+ / \ / \
+ CR * * CR -> CR * * S
+ / \ / \
+ / \ / \
+ C * * C CR * * CR
+
+ */
+ if (node->parent &&
+ node->parent->left->event == EVENT_COALESCENT &&
+ node->parent->right->event == EVENT_COALESCENT)
+ {
+ assert(node->parent->mcmc_slot != -1);
+ assert(node->edge_count);
+
+ /* perform the following only if the parent is not the last node
+ in the list */
+ if (node->parent->mcmc_slot != snodes_count-1)
+ {
+ /* set slot of last node in snodes to the slot we will place it */
+ snodes[snodes_count-1]->mcmc_slot = node->parent->mcmc_slot;
+
+ /* move this last node to its new slot */
+ snodes[node->parent->mcmc_slot] = snodes[snodes_count-1];
+ }
+
+ /* reset slot of the removed node and decrease count */
+ node->parent->mcmc_slot = -1;
+ --snodes_count;
+ }
+
+ /* add select node to the list of speciation nodes */
+ node->mcmc_slot = snodes_count;
+ snodes[snodes_count++] = node;
+ node->event = EVENT_SPECIATION;
+
+ /* add left child to coalescent roots unless it is a leaf OR the
+ tree rooted at node->left has all branch lengths smaller than minbr */
+ if (node->left->edge_count)
+ {
+ crnodes[crnodes_count] = node->left;
+ node->left->mcmc_slot = crnodes_count++;
+ }
+
+ /* add right child to coalescent roots unless it is a leaf OR the
+ tree rooted at node->right has all branch lengths smaller than minbr */
+ if (node->right->edge_count)
+ {
+ crnodes[crnodes_count] = node->right;
+ node->right->mcmc_slot = crnodes_count++;
+ }
+}
+
+static void coalesce(long r)
+{
+ /* S CR
+ * *
+ / \ -> / \
+ / \ / \
+ CR * * CR C * * C */
+
+ rtree_t * node = snodes[r];
+
+ /* move the last node of the list to the position of the node
+ we just used */
+ if (r != (snodes_count-1))
+ {
+ snodes[r] = snodes[snodes_count-1];
+ snodes[r]->mcmc_slot = r;
+ }
+ --snodes_count;
+
+ /* add the current node to the list of coalescent roots */
+ node->mcmc_slot = crnodes_count;
+ crnodes[crnodes_count++] = node;
+ node->event = EVENT_COALESCENT;
+
+ /* remove left child from coalescent roots unless it is a leaf OR the
+ tree rooted at node->left has all branch lengths smaller than minbr */
+ if (node->left->edge_count)
+ {
+ /* perform the following only if it is not the last node
+ in the list */
+ if (node->left->mcmc_slot != crnodes_count-1)
+ {
+ /* set slot of last node in crnodes to the slot we will place it */
+ crnodes[crnodes_count-1]->mcmc_slot = node->left->mcmc_slot;
+
+ /* move this last node to its new slot */
+ crnodes[node->left->mcmc_slot] = crnodes[crnodes_count-1];
+ }
+
+ /* reset slot of the removed node and decrease count */
+ node->left->mcmc_slot = -1;
+ crnodes_count--;
+ }
+
+ /* now do the same for the right child */
+ if (node->right->edge_count)
+ {
+ /* perform the following only if it is not the last node
+ in the list */
+ if (node->right->mcmc_slot != crnodes_count-1)
+ {
+ /* set slot of last node in crnodes to the slot we will place it */
+ crnodes[crnodes_count-1]->mcmc_slot = node->right->mcmc_slot;
+
+ /* move this last node to its new slot */
+ crnodes[node->right->mcmc_slot] = crnodes[crnodes_count-1];
+ }
+
+ /* reset slot of removed node and decrease count */
+ node->right->mcmc_slot = -1;
+ crnodes_count--;
+ }
+
+ /* if the parent of the node has two coalescent roots as children
+ now, then add it to snodes, i.e. the following case:
+
+ S S
+ * *
+ / \ / \
+ / \ / \
+ CR * * S -> CR * * CR
+ / \ / \
+ / \ / \
+ CR * * CR C * * C
+ */
+ if (node->parent &&
+ node->parent->left->event == EVENT_COALESCENT &&
+ node->parent->right->event == EVENT_COALESCENT)
+ {
+ assert(node->parent->mcmc_slot == -1);
+
+ /* set slot of parent */
+ node->parent->mcmc_slot = snodes_count;
+
+ /* place parent to the last slot in snodes and increase count */
+ snodes[snodes_count++] = node->parent;
+ }
+}
+
+static double aic_weight_nominator(double aic_score)
+{
+ return exp(-0.5 * aic_score);
+}
+
+void aic_mcmc(rtree_t * tree,
+ long method,
+ unsigned short * rstate,
+ long seed,
+ double * mcmc_min_logl,
+ double * mcmc_max_logl)
+{
+ long i;
+ long best_index = 0;
+ long rand_long = 0;
+ double rand_double = 0;
+ double max = 0;
+ double logl = 0;
+
+ double aic_weight_prefix_sum = 0.0;
+
+ *mcmc_max_logl = 0;
+ *mcmc_min_logl = 0;
+
+ if (!opt_quiet)
+ fprintf(stdout,"Computing initial delimitation...\n");
+
+ /* check whether all edges are smaller or equal than minbr */
+ if (!tree->edge_count)
+ {
+ fprintf(stderr,"WARNING: All branch lengths are smaller or equal to the "
+ "threshold specified by --minbr. Delimitation equals to "
+ "the null model\n");
+ tree->support = 1;
+ tree->aic_support = 1;
+ tree->event = EVENT_COALESCENT;
+
+ return;
+ }
+
+ mcmc_init(tree, seed);
+
+ /* fill DP table */
+ dp_recurse(tree, method);
+
+ /* obtain best entry in the root DP table */
+ dp_vector_t * vec = tree->vector;
+ if (method == PTP_METHOD_MULTI)
+ {
+ max = vec[0].score_multi;
+ for (i = 1; i < tree->edge_count; i++)
+ {
+ if (max < vec[i].score_multi && vec[i].filled)
+ {
+ max = vec[i].score_multi;
+ best_index = i;
+ }
+ }
+ }
+ else
+ {
+ max = vec[0].score_single;
+ for (i = 1; i < tree->edge_count; i++)
+ {
+ //printf("vec[%d].score_single: %.6f\n", i, vec[i].score_single);
+ if (max < vec[i].score_single && vec[i].filled)
+ {
+ max = vec[i].score_single;
+ best_index = i;
+ }
+ }
+ }
+ species_count = vec[best_index].species_count;
+
+ double max_logl_aic = (method == PTP_METHOD_MULTI) ?
+ vec[best_index].score_multi : vec[best_index].score_single;
+ double max_aic = aic(max_logl_aic, species_count, tree->leaves+2);
+
+
+ long coal_edge_count = 0;
+ long spec_edge_count = 0;
+ double spec_edgelen_sum = 0;
+ double coal_edgelen_sum = 0;
+ double coal_score = 0;
+
+ if (opt_mcmc_startnull && opt_mcmc_startrandom)
+ {
+ fatal("Cannot specify --mcmc_startnull and --mcmc_startrandom together");
+ }
+ else if (opt_mcmc_startnull)
+ {
+ tree->event = EVENT_COALESCENT;
+
+ crnodes[crnodes_count++] = tree;
+ logl = tree->coal_logl;
+ best_index = 0;
+ species_count = 1;
+
+ /* set parameters */
+ coal_edge_count = tree->edge_count;
+ spec_edge_count = 0;
+ spec_edgelen_sum = 0;
+ coal_edgelen_sum = tree->edgelen_sum;
+ coal_score = tree->coal_logl;
+
+ /* set all nodes to coalescent */
+ init_null(tree);
+
+ /* log log-likelihood at step 0 */
+ if (opt_mcmc_burnin == 1)
+ mcmc_log(logl,species_count);
+
+
+ }
+ else if (opt_mcmc_startrandom)
+ {
+ bool warning_minbr = false;
+ logl = random_delimitation(tree,
+ &species_count,
+ &coal_edge_count,
+ &coal_edgelen_sum,
+ &spec_edge_count,
+ &spec_edgelen_sum,
+ &coal_score,
+ rstate);
+ backtrack_random(tree, &warning_minbr);
+ if (warning_minbr)
+ fprintf(stderr,"WARNING: A speciation edge is smaller than the specified "
+ "minimum branch length.\n");
+
+ /* log log-likelihood at step 0 */
+ if (opt_mcmc_burnin == 1)
+ mcmc_log(logl,species_count);
+ }
+ else
+ {
+ /* ML starting delimitation */
+ bool warning_minbr = false;
+ backtrack(tree, best_index, &warning_minbr);
+ if (warning_minbr)
+ fprintf(stderr,"WARNING: A speciation edge is smaller than the specified "
+ "minimum branch length.\n");
+
+ logl = (method == PTP_METHOD_MULTI) ?
+ vec[best_index].score_multi : vec[best_index].score_single;
+
+ /* log log-likelihood at step 0 */
+ if (opt_mcmc_burnin == 1)
+ mcmc_log(logl,species_count);
+ }
+
+ if (!opt_mcmc_startnull && !opt_mcmc_startrandom)
+ {
+ if (method == PTP_METHOD_SINGLE)
+ {
+ coal_edge_count = tree->edge_count - best_index;
+ spec_edge_count = best_index;
+ spec_edgelen_sum = tree->vector[best_index].spec_edgelen_sum;
+ coal_edgelen_sum = tree->edgelen_sum - spec_edgelen_sum;
+ }
+ else
+ {
+ spec_edge_count = best_index;
+ spec_edgelen_sum = tree->vector[best_index].spec_edgelen_sum;
+ coal_score = tree->vector[best_index].score_multi -
+ loglikelihood(spec_edge_count, spec_edgelen_sum);
+ }
+ }
+
+ *mcmc_max_logl = logl;
+ *mcmc_min_logl = logl;
+
+ if (!opt_quiet)
+ {
+ if (opt_mcmc_startnull)
+ fprintf(stdout, "Null model log-likelihood: %f\n", logl);
+ else if (opt_mcmc_startrandom)
+ fprintf(stdout, "Random delimitation log-likelihood: %f\n", logl);
+ else
+ fprintf(stdout, "ML delimitation log-likelihood: %f\n", logl);
+ }
+
+ if (opt_mcmc_burnin == 1)
+ {
+ //densities[species_count].logl += logl;
+ densities[species_count].logl += -aic(logl, species_count, tree->leaves+2);
+ }
+
+ if (opt_mcmc_sample == 1)
+ {
+ if (!opt_quiet)
+ printf("1 Log-L: %f\n", logl);
+ }
+
+ mcmc_stats_init(tree);
+
+ for (i = 1; i < opt_mcmc_steps; ++i)
+ {
+
+ /* throw a coin to decide whether to convert a coalescent root to a
+ speciation or the other way round */
+ rand_double = erand48(rstate);
+ int speciation = (rand_double >= 0.5) ? 1 : 0;
+
+ if ((speciation && crnodes_count) || (snodes_count == 0))
+ {
+
+ /* CR S
+ * *
+ / \ -> / \
+ / \ / \
+ C * * C CR * * CR */
+
+
+ /* select a coalescent root, split it into two coalescent nodes */
+ rand_long = nrand48(rstate);
+ long r = rand_long % crnodes_count;
+ rtree_t * node = crnodes[r];
+
+ /* store the count of crnodes for the Hastings ratio */
+ double old_crnodes_count = crnodes_count;
+
+ /* speciate */
+ speciate(r);
+
+ /* store the new count of snodes for the Hastings ratio */
+ double new_snodes_count = snodes_count;
+
+ /* TODO: distinguish between single- and multi-rate methods */
+
+ /* subtract the two edges (left and right) from the coalescent
+ distribution and add them to the speciation distribution */
+ unsigned int edge_count_diff = 0;
+ double edgelen_sum_diff = 0;
+ if (node->left->length > opt_minbr)
+ {
+ ++edge_count_diff;
+ edgelen_sum_diff += node->left->length;
+ }
+
+ if (node->right->length > opt_minbr)
+ {
+ ++edge_count_diff;
+ edgelen_sum_diff += node->right->length;
+ }
+
+ if (method == PTP_METHOD_SINGLE)
+ {
+ coal_edgelen_sum -= edgelen_sum_diff;
+ coal_edge_count -= edge_count_diff;
+ }
+ spec_edgelen_sum += edgelen_sum_diff;
+ spec_edge_count += edge_count_diff;
+
+ /* compute new log-likelihood */
+ double new_logl;
+ if (spec_edge_count == 0 || (method == PTP_METHOD_SINGLE && coal_edge_count == 0))
+ new_logl = tree->coal_logl;
+ else
+ {
+ assert((method == PTP_METHOD_MULTI) || (coal_edge_count > 0));
+ assert(spec_edge_count > 0);
+ if (method == PTP_METHOD_SINGLE)
+ new_logl = loglikelihood(coal_edge_count, coal_edgelen_sum) +
+ loglikelihood(spec_edge_count, spec_edgelen_sum);
+ else
+ new_logl = coal_score - node->coal_logl +
+ node->left->coal_logl + node->right->coal_logl +
+ loglikelihood(spec_edge_count, spec_edgelen_sum);
+
+ }
+
+ if (new_logl > *mcmc_max_logl)
+ *mcmc_max_logl = new_logl;
+ if (i+1 < opt_mcmc_burnin)
+ *mcmc_min_logl = *mcmc_max_logl;
+ else if (new_logl < *mcmc_min_logl)
+ *mcmc_min_logl = new_logl;
+
+
+ double aic_new_logl = -aic(new_logl, species_count+1, tree->leaves+2);
+ double aic_logl = -aic(logl, species_count, tree->leaves+2);
+
+ /* Hastings ratio */
+ double a = exp(aic_new_logl - aic_logl) * (old_crnodes_count / new_snodes_count);
+
+ /* update densities */
+ if (i+1 >= opt_mcmc_burnin)
+ {
+ //densities[species_count+1].logl += new_logl;
+ densities[species_count+1].logl += aic_new_logl;
+ }
+
+ /* decide whether to accept or reject proposal */
+ rand_double = erand48(rstate);
+ if (rand_double <= a)
+ {
+ /* accept */
+ if ((i+1) % opt_mcmc_sample == 0)
+ {
+ if (!opt_quiet)
+ printf("%ld Log-L: %f\n", i+1, new_logl);
+ if (i+1 >= opt_mcmc_burnin)
+ mcmc_log(new_logl,species_count+1);
+ }
+
+ /* update support values information */
+ if (i+1 >= opt_mcmc_burnin) {
+ node->speciation_start = i;
+ aic_weight_prefix_sum += aic_weight_nominator(-aic_new_logl/max_aic);
+ node->aic_weight_start = aic_weight_prefix_sum;
+ }
+ else
+ {
+ node->speciation_start = opt_mcmc_burnin;
+ }
+
+ accept_count++;
+ species_count++;
+ logl = new_logl;
+ if (method == PTP_METHOD_MULTI)
+ coal_score = coal_score - node->coal_logl +
+ node->left->coal_logl + node->right->coal_logl;
+ continue;
+ }
+ else
+ {
+ /* reject */
+ if ((i+1) % opt_mcmc_sample == 0)
+ {
+ if (!opt_quiet)
+ printf("%ld Log-L: %f\n", i+1, new_logl);
+ if (i+1 >= opt_mcmc_burnin)
+ mcmc_log(new_logl,species_count+1);
+ }
+
+ if (i+1 >= opt_mcmc_burnin)
+ node->speciation_count++;
+
+ if (method == PTP_METHOD_SINGLE)
+ {
+ coal_edgelen_sum += edgelen_sum_diff;
+ coal_edge_count += edge_count_diff;
+ }
+ spec_edgelen_sum -= edgelen_sum_diff;
+ spec_edge_count -= edge_count_diff;
+ coalesce(node->mcmc_slot);
+ }
+ }
+ else
+ {
+
+ /* S CR
+ * *
+ / \ -> / \
+ / \ / \
+ CR * * CR C * * C */
+
+ rand_long = nrand48(rstate);
+ long r = rand_long % snodes_count;
+ rtree_t * node = snodes[r];
+
+ /* store the count of snodes for the Hastings ratio */
+ double old_snodes_count = snodes_count;
+
+ /* coalesce */
+ coalesce(r);
+
+ double new_crnodes_count = crnodes_count;
+
+ /* TODO: distinguish between single- and multi-rate methods */
+
+ /* subtract the two edges (left and right) from the speciation
+ distribution and add them to the coalescent distribution */
+ int edge_count_diff = 0;
+ double edgelen_sum_diff = 0;
+ if (node->left->length > opt_minbr)
+ {
+ ++edge_count_diff;
+ edgelen_sum_diff += node->left->length;
+ }
+
+ if (node->right->length > opt_minbr)
+ {
+ ++edge_count_diff;
+ edgelen_sum_diff += node->right->length;
+ }
+ if (method == PTP_METHOD_SINGLE)
+ {
+ coal_edgelen_sum += edgelen_sum_diff;
+ coal_edge_count += edge_count_diff;
+ }
+ spec_edgelen_sum -= edgelen_sum_diff;
+ spec_edge_count -= edge_count_diff;
+
+ /* compute new log-likelihood */
+ double new_logl;
+ if (spec_edge_count == 0 || (method == PTP_METHOD_SINGLE && coal_edge_count == 0))
+ new_logl = tree->coal_logl;
+ else
+ {
+ assert((method == PTP_METHOD_MULTI) || (coal_edge_count > 0));
+ assert(spec_edge_count > 0);
+ if (method == PTP_METHOD_SINGLE)
+ new_logl = loglikelihood(coal_edge_count, coal_edgelen_sum) +
+ loglikelihood(spec_edge_count, spec_edgelen_sum);
+ else
+ new_logl = coal_score - node->left->coal_logl - node->right->coal_logl +
+ node->coal_logl +
+ loglikelihood(spec_edge_count, spec_edgelen_sum);
+
+ }
+
+ if (new_logl > *mcmc_max_logl)
+ *mcmc_max_logl = new_logl;
+ if (i+1 < opt_mcmc_burnin)
+ *mcmc_min_logl = *mcmc_max_logl;
+ else if (new_logl < *mcmc_min_logl)
+ *mcmc_min_logl = new_logl;
+
+ double aic_new_logl = -aic(new_logl, species_count-1, tree->leaves+2);
+ double aic_logl = -aic(logl, species_count, tree->leaves+2);
+
+ /* Hastings ratio */
+ double a = exp(aic_new_logl - aic_logl) * (old_snodes_count / new_crnodes_count);
+
+ /* update densities */
+ if (i+1 >= opt_mcmc_burnin)
+ {
+ //densities[species_count-1].logl += new_logl;
+ densities[species_count-1].logl += aic_new_logl;
+ }
+
+ /* decide whether to accept or reject proposal */
+ rand_double = erand48(rstate);
+ if (rand_double <= a)
+ {
+ /* accept */
+ if ((i+1) % opt_mcmc_sample == 0)
+ {
+ if (!opt_quiet)
+ printf("%ld Log-L: %f\n", i+1, new_logl);
+ if (i+1 >= opt_mcmc_burnin)
+ mcmc_log(new_logl,species_count-1);
+ }
+
+ /* update support values information */
+ if (i+1 >= opt_mcmc_burnin)
+ {
+ node->speciation_count = node->speciation_count +
+ i - node->speciation_start;
+ aic_weight_prefix_sum += aic_weight_nominator(-aic_new_logl/max_aic);
+ node->aic_support += aic_weight_prefix_sum - node->aic_weight_start;
+ }
+ node->speciation_start = -1;
+
+ accept_count++;
+ species_count--;
+ logl = new_logl;
+ if (method == PTP_METHOD_MULTI)
+ coal_score = coal_score - node->left->coal_logl - node->right->coal_logl +
+ node->coal_logl;
+
+ continue;
+ }
+ else
+ {
+ /* reject */
+ if ((i+1) % opt_mcmc_sample == 0)
+ {
+ if (!opt_quiet)
+ printf("%ld Log-L: %f\n", i+1, new_logl);
+ if (i+1 >= opt_mcmc_burnin)
+ mcmc_log(new_logl,species_count-1);
+ }
+ if (method == PTP_METHOD_SINGLE)
+ {
+ coal_edgelen_sum -= edgelen_sum_diff;
+ coal_edge_count -= edge_count_diff;
+ }
+ spec_edgelen_sum += edgelen_sum_diff;
+ spec_edge_count += edge_count_diff;
+ speciate(node->mcmc_slot);
+ if (i+1 >= opt_mcmc_burnin)
+ {
+ node->speciation_count--;
+ }
+ }
+ }
+ }
+
+ //printf("Acceptance: %ld\n", accept_count);
+ /* TODO: DEBUG variables for checking the max likelihood mcmc runs give.
+ Must be removed */
+ mcmc_finalize(tree, *mcmc_min_logl, *mcmc_max_logl, seed, aic_weight_prefix_sum);
+
+}
diff --git a/src/arch.c b/src/arch.c
new file mode 100644
index 0000000..797120d
--- /dev/null
+++ b/src/arch.c
@@ -0,0 +1,75 @@
+/*
+ Copyright (C) 2014-2015 Tomas Flouri, Torbjorn Rognes, Jeff Epler
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* Report the peak resident set size of the calling process in bytes, as
+   obtained from getrusage(RUSAGE_SELF). */
+unsigned long arch_get_memused()
+{
+  struct rusage usage;
+  unsigned long peak_rss;
+
+  getrusage(RUSAGE_SELF, &usage);
+  peak_rss = (unsigned long)(usage.ru_maxrss);
+
+#if !defined __APPLE__
+  /* Linux: ru_maxrss gives the size in kilobytes, so scale it to bytes */
+  peak_rss = peak_rss * 1024;
+#endif
+
+  /* Mac: ru_maxrss already gives the size in bytes */
+  return peak_rss;
+}
+
+/* Return the total amount of physical RAM in bytes. Calls fatal() when the
+   platform query fails. */
+unsigned long arch_get_memtotal()
+{
+#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
+
+  const long page_count = sysconf(_SC_PHYS_PAGES);
+  const long page_bytes = sysconf(_SC_PAGESIZE);
+
+  if ((page_count == -1) || (page_bytes == -1))
+    fatal("Cannot determine amount of RAM");
+
+  /* sysconf(3) notes that page size times page count can overflow, e.g.
+     when long is 32 bits wide and more than 4GB of RAM is installed. In
+     that case the result is capped at LONG_MAX; on LP64 systems such as
+     x86_64 Linux the cap is not expected to be reached in practice. */
+  if (page_bytes > LONG_MAX / page_count)
+    return LONG_MAX;
+
+  return (unsigned long)page_bytes * (unsigned long)page_count;
+
+#elif defined(__APPLE__)
+
+  /* query hw.memsize through sysctl */
+  int name [] = { CTL_HW, HW_MEMSIZE };
+  int64_t ram_bytes = 0;
+  size_t ram_size = sizeof(ram_bytes);
+
+  if (sysctl(name, 2, &ram_bytes, &ram_size, NULL, 0) == -1)
+    fatal("Cannot determine amount of RAM");
+
+  return ram_bytes;
+
+#else
+
+  /* fall back to sysinfo(): totalram is expressed in units of mem_unit */
+  struct sysinfo info;
+
+  if (sysinfo(&info) != 0)
+    fatal("Cannot determine amount of RAM");
+
+  return info.totalram * info.mem_unit;
+
+#endif
+}
diff --git a/src/auto.c b/src/auto.c
new file mode 100644
index 0000000..d3db87c
--- /dev/null
+++ b/src/auto.c
@@ -0,0 +1,340 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* branch-length threshold read by cb_short_trees(); detect_min_bl() assigns
+   it before every traversal */
+static double minbr;
+
+/* Lookup table indexed by an encoded state code. Only the codes 1, 2, 4 and
+   8 are flagged; all remaining entries are implicitly zero. pdist() counts a
+   site only when the codes of both sequences are flagged here. */
+static const unsigned int mask[256] =
+  {
+    [1] = 1,
+    [2] = 1,
+    [4] = 1,
+    [8] = 1
+  };
+
+/* Count the sites at which two encoded sequences differ.
+
+   a, b : encoded sequences, each holding at least len state codes
+   len  : number of sites to compare
+
+   A site contributes only when the codes of both sequences are flagged in
+   mask[] and the two codes are different. Returns the number of such
+   sites. */
+static int pdist(char * a, char * b, long len)
+{
+  long i;
+  int diff_count = 0;
+
+  for (i = 0; i < len; ++i)
+  {
+    /* Convert through unsigned char before indexing: plain char may be
+       signed, and a negative code would address memory before mask[]. */
+    const unsigned char code_a = (unsigned char)a[i];
+    const unsigned char code_b = (unsigned char)b[i];
+
+    if (mask[code_a] && mask[code_b] && (code_a != code_b))
+      diff_count++;
+  }
+
+  return diff_count;
+}
+
+/* Read every record of the FASTA file opt_pdist_file.
+
+   tip_nodes_count : number of records the file is required to contain
+   headers         : receives the header buffer of each record
+   seqdata         : receives the sequence buffer of each record
+
+   The buffers come from pll_fasta_getnext() and are stored in file order.
+   Returns the common sequence length. Calls fatal() if the file cannot be
+   opened or read, if the sequences differ in length, or if the number of
+   records does not equal tip_nodes_count. */
+static long load_fasta(int tip_nodes_count, char ** headers, char ** seqdata)
+{
+  char * hdr = NULL;
+  char * seq = NULL;
+  long hdrlen;
+  long seqlen;
+  long seqno;
+  long sites = -1;      /* -1 until the first record has been read */
+  int count = 0;
+
+  /* open FASTA file */
+  pll_fasta_t * fp = pll_fasta_open(opt_pdist_file, pll_map_fasta);
+  if (!fp)
+    fatal("Error opening file %s", opt_pdist_file);
+
+  /* read the records one by one, checking that all lengths agree */
+  while (pll_fasta_getnext(fp,&hdr,&hdrlen,&seq,&seqlen,&seqno))
+  {
+    if (count >= tip_nodes_count)
+      fatal("FASTA file contains more sequences than expected");
+
+    if (sites == -1)
+      sites = seqlen;
+    else if (sites != seqlen)
+      fatal("FASTA file does not contain equal size sequences\n");
+
+    headers[count] = hdr;
+    seqdata[count] = seq;
+    ++count;
+  }
+
+  /* the loop must have ended because of EOF, not because of an error */
+  if (pll_errno != PLL_ERROR_FILE_EOF)
+    fatal("Error while reading file %s", opt_pdist_file);
+
+  /* close FASTA file */
+  pll_fasta_close(fp);
+
+  if (sites == -1)
+    fatal("Unable to read alignment");
+
+  if (count != tip_nodes_count)
+    fatal("Some taxa are missing from FASTA file");
+
+  return sites;
+}
+
+/* qsort() comparator ordering doubles from smallest to largest. Returns -1,
+   0 or 1 when *a is smaller than, not ordered against / equal to, or larger
+   than *b. */
+static int cb_ascending(const void * a, const void * b)
+{
+  const double lhs = *(const double *)a;
+  const double rhs = *(const double *)b;
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+/* Node filter that accepts every node; detect_min_bl() passes it to
+   rtree_traverse_postorder() to collect all nodes of the tree. */
+static int cb_allnodes(rtree_t * node)
+{
+  /* the node itself is never inspected */
+  (void)node;
+
+  return 1;
+}
+
+/* Node filter used with rtree_traverse_postorder(): returns 1 for the inner
+   nodes that should be reported as roots of short subtrees and 0 for all
+   other nodes. Nodes whose subtree qualifies as short get their mark field
+   set to 1 as a side effect. The threshold is the file-level minbr. */
+static int cb_short_trees(rtree_t * node)
+{
+  /* tips are marked but never included in the list */
+  if (!node->left)
+  {
+    node->mark = 1;
+    return 0;
+  }
+
+  /* an inner node qualifies only when both children are marked and both
+     child edges are no longer than the threshold */
+  const int roots_short_subtree = node->left->mark &&
+                                  node->right->mark &&
+                                  node->left->length <= minbr &&
+                                  node->right->length <= minbr;
+  if (!roots_short_subtree)
+    return 0;
+
+  node->mark = 1;
+
+  /* the root has no parent, so it is always included */
+  if (!node->parent)
+    return 1;
+
+  /* if both edges below the parent are short as well, do not include the
+     current node in the list; otherwise include it */
+  if (node->parent->left->length <= minbr &&
+      node->parent->right->length <= minbr)
+    return 0;
+
+  return 1;
+}
+
+/* Create the process-wide libc hash table (hcreate/hsearch) and insert every
+   tip of the tree, keyed by the tip label and carrying the tip node as data.
+
+   root : root of the tree whose root->leaves tips are inserted
+
+   Calls fatal() if the table cannot be created or an entry cannot be
+   inserted. The caller is responsible for calling hdestroy(). */
+static void hash_tips(rtree_t * root)
+{
+  int i;
+
+  /* obtain an array of pointers to the tip nodes */
+  rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) *
+                                            sizeof(rtree_t *));
+  rtree_query_tipnodes(root, tipnodes);
+
+  /* create a libc hash table with room for twice the number of tips;
+     hcreate() returns 0 on failure and hsearch() must not be used then */
+  if (!hcreate(2*(size_t)(root->leaves)))
+    fatal("Unable to create hash table for tip labels");
+
+  /* populate the hash table with the tree tip labels */
+  for (i = 0; i < root->leaves; ++i)
+  {
+    ENTRY entry;
+    entry.key  = tipnodes[i]->label;
+    entry.data = (void *)(tipnodes[i]);
+
+    /* hsearch(ENTER) returns NULL when the entry could not be stored */
+    if (!hsearch(entry, ENTER))
+      fatal("Unable to insert tip label %s into hash table",
+            tipnodes[i]->label);
+  }
+  free(tipnodes);
+}
+
+
+/* Encode a sequence in place through a character map and attach it to a
+   tree node.
+
+   node     : node whose sequence field is set to the encoded buffer
+   sequence : buffer of seqlen characters, overwritten with state codes
+   seqlen   : number of sites
+   map      : table translating a character to its state code, where 0
+              denotes an illegal character
+
+   Calls fatal() on the first character that maps to 0. */
+static void set_encode_sequence(rtree_t * node,
+                                char * sequence,
+                                long seqlen,
+                                const unsigned int * map)
+{
+  unsigned int c;
+  long i;
+
+  /* iterate through sites and encode */
+  for (i = 0; i < seqlen; ++i)
+  {
+    /* Convert through unsigned char before indexing: plain char may be
+       signed, and a negative value would address memory before map[]. */
+    const unsigned char site = (unsigned char)sequence[i];
+
+    if ((c = map[site]) == 0)
+      fatal("Illegal state code in tip \"%c\"", sequence[i]);
+
+    /* the code has to fit back into a single byte */
+    assert(c < 256);
+
+    sequence[i] = (char)c;
+  }
+
+  /* set sequence to tip */
+  node->sequence = sequence;
+}
+
+/* Attach each loaded sequence to the tip whose label equals its FASTA
+   header. The tips are looked up in the libc hash table filled by
+   hash_tips(); each sequence is encoded with pll_map_nt while being
+   attached. Calls fatal() when a header matches no tip. */
+static void link_sequences(rtree_t * root, char ** headers, char ** sequence, long seqlen)
+{
+  int idx = 0;
+
+  while (idx < root->leaves)
+  {
+    ENTRY query;
+    ENTRY * hit;
+
+    query.key = headers[idx];
+    hit = hsearch(query,FIND);
+
+    if (hit == NULL)
+      fatal("Sequence with header %s does not appear in the tree", headers[idx]);
+
+    set_encode_sequence((rtree_t *)(hit->data), sequence[idx], seqlen, pll_map_nt);
+
+    ++idx;
+  }
+}
+
+
+/* Return 1 as soon as any two tips of the list have a non-zero pdist(),
+   and 0 when every pair has distance zero (or the list has fewer than two
+   tips). */
+static int all_pairwise_dist(rtree_t ** tip_node_list, int tip_list_count, long seqlen)
+{
+  int first;
+  int second;
+
+  for (first = 0; first < tip_list_count; ++first)
+  {
+    for (second = first+1; second < tip_list_count; ++second)
+    {
+      if (pdist(tip_node_list[first]->sequence,
+                tip_node_list[second]->sequence,
+                seqlen) != 0)
+        return 1;
+    }
+  }
+
+  return 0;
+}
+
+/* Print a suggested value for the --minbr option.
+
+   rtree : root of the tree; its tips must carry the labels used as headers
+           in the FASTA file opt_pdist_file
+
+   The alignment is loaded and linked to the tips, all branch lengths are
+   sorted in ascending order, and each of them (starting from the second
+   smallest) is tried as threshold minbr. For every threshold the roots of
+   the short subtrees are collected with cb_short_trees() and the search
+   stops at the first threshold for which some subtree contains two tips
+   with a non-zero p-distance. The result is written to stdout. */
+void detect_min_bl(rtree_t * rtree)
+{
+  rtree_t ** inner_node_list;
+  rtree_t ** tip_node_list = NULL;
+  int inner_list_count = 0;
+  int tip_list_count = 0;
+  int i,n;
+  char ** seqdata = NULL;
+  char ** headers = NULL;
+  long seqlen = 0;
+
+  /* for p-distance computation load an alignment from a FASTA file and map
+     the sequences to the tree tips */
+
+  if (!opt_quiet)
+    fprintf(stdout, "Parsing FASTA file %s...\n", opt_pdist_file);
+
+  /* allocate zero-initialized arrays to store FASTA headers and sequences;
+     load_fasta() writes through both, so a failed allocation must be
+     caught here */
+  headers = (char **)calloc((size_t)(rtree->leaves), sizeof(char *));
+  seqdata = (char **)calloc((size_t)(rtree->leaves), sizeof(char *));
+  if (!headers || !seqdata)
+    fatal("Unable to allocate enough memory.");
+
+  seqlen = load_fasta(rtree->leaves, headers, seqdata);
+
+  hash_tips(rtree);
+
+  /* find sequences in hash table and link them with the corresponding taxa */
+  link_sequences(rtree, headers, seqdata, seqlen);
+
+  /* destroy hash table */
+  hdestroy();
+
+  /* get inner nodes that are roots of the largest short subtrees. Short are
+     such subtrees where all branch lengths within them are less or equal to
+     the current threshold minbr. The largest such subtrees are those that
+     are not subtrees of short subtrees.
+  */
+  inner_node_list = (rtree_t **)xmalloc((size_t)(rtree->leaves-1) *
+                                        sizeof(rtree_t *));
+
+  double * branch_lengths = (double *)xmalloc((size_t)(2*rtree->leaves-1) *
+                                              sizeof(double));
+  rtree_t ** allnodes_list = (rtree_t **)xmalloc((size_t)(2*rtree->leaves-1) *
+                                                 sizeof(rtree_t *));
+  int allnodes_count;
+
+  /* get list of all nodes, extract branch lengths and sort them in ascending
+     order */
+  allnodes_count = rtree_traverse_postorder(rtree, cb_allnodes, allnodes_list);
+  assert(allnodes_count == 2*rtree->leaves-1);
+  for (i = 0; i < allnodes_count; ++i)
+    branch_lengths[i] = allnodes_list[i]->length;
+  qsort(branch_lengths, (size_t)allnodes_count, sizeof(double), cb_ascending);
+  free(allnodes_list);
+
+  printf("Computing all pairwise p-distances ...\n");
+
+  tip_node_list = (rtree_t **)xmalloc((size_t)(rtree->leaves) *
+                                      sizeof(rtree_t *));
+
+  int minfound = 0;
+  /* go through all branch lengths; the smallest one (index 0) is skipped */
+  for (n = 1; n < allnodes_count && !minfound; ++n)
+  {
+    minbr = branch_lengths[n];
+    inner_list_count = rtree_traverse_postorder(rtree,
+                                                cb_short_trees,
+                                                inner_node_list);
+
+    for (i = 0; i < inner_list_count && !minfound; ++i)
+    {
+      /* traverse the roots and grab the tips */
+      tip_list_count = rtree_query_tipnodes(inner_node_list[i], tip_node_list);
+      minfound = all_pairwise_dist(tip_node_list, tip_list_count, seqlen);
+      if (minfound) break;
+    }
+  }
+
+  /* The outer loop increments n once more after a hit, so the threshold
+     that produced the hit is branch_lengths[n-1]. */
+  if (minfound && n != 1)
+    printf("Minimum branch length (--minbr) should be set to %.10f\n", branch_lengths[n-1]);
+  else
+    printf("Minimum branch length (--minbr) should be set to 0\n");
+
+  free(branch_lengths);
+  free(inner_node_list);
+  free(tip_node_list);
+
+  /* release the buffers handed out by load_fasta() */
+  for (i = 0; i < rtree->leaves; ++i)
+  {
+    free(seqdata[i]);
+    free(headers[i]);
+  }
+  free(seqdata);
+  free(headers);
+}
diff --git a/src/dp.c b/src/dp.c
new file mode 100644
index 0000000..7588019
--- /dev/null
+++ b/src/dp.c
@@ -0,0 +1,358 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* running number of the species printed so far; incremented by backtrack()
+   and reset by dp_ptp() */
+static unsigned int species_iter = 0;
+
+/* Fill the DP table (node->vector) of every node in the subtree rooted at
+   node, children before parents.
+
+   node   : root of the subtree to process
+   method : PTP_METHOD_SINGLE makes score_single decide which candidate is
+            kept in a table entry; any other value makes score_multi decide
+
+   Entry i of a node's table is built from entry j of the left child and
+   entry k of the right child with i = j + k + (number of the two child
+   edges longer than opt_minbr). For each i the best-scoring combination
+   is kept together with the indices (vec_left, vec_right) needed for
+   backtracking. */
+static void dp_recurse(rtree_t * node, long method)
+{
+ int k,j;
+
+ /* bottom-up recursion */
+
+ if (node->left) dp_recurse(node->left, method);
+ if (node->right) dp_recurse(node->right, method);
+
+ /* u_vec
+ *
+ / \
+ / \
+ v_vec * * w_vec */
+
+ dp_vector_t * u_vec = node->vector;
+
+ double spec_logl = loglikelihood(node->spec_edge_count,
+ node->spec_edgelen_sum);
+
+ /* entry 0: the whole subtree forms one species (species_count is 1 and the
+    coalescent part is the node's own coal_logl) */
+ u_vec[0].spec_edgelen_sum = 0;
+ u_vec[0].score_multi = node->coal_logl + spec_logl;
+ u_vec[0].score_single = node->coal_logl + spec_logl;
+ u_vec[0].coal_multi_logl = node->coal_logl;
+ u_vec[0].species_count = 1;
+ u_vec[0].filled = 1;
+
+ /* tips have no children to combine, so only entry 0 is filled */
+ if (!node->left) return;
+
+ dp_vector_t * v_vec = node->left->vector;
+ dp_vector_t * w_vec = node->right->vector;
+
+ assert(node->spec_edge_count >= 0);
+
+ /* number and total length of the two child edges that exceed opt_minbr */
+ int u_edge_count = 0;
+ double u_edgelen_sum = 0;
+
+ /* check whether edges (u,v) and (u,w) are > min branch length */
+ if (node->left->length > opt_minbr)
+ {
+ u_edge_count++;
+ u_edgelen_sum += node->left->length;
+ }
+ if (node->right->length > opt_minbr)
+ {
+ u_edge_count++;
+ u_edgelen_sum += node->right->length;
+ }
+
+ /* combine every filled entry of the left table with every filled entry of
+    the right table */
+ for (j = 0; j <= node->left->edge_count; ++j)
+ {
+ for (k = 0; k <= node->right->edge_count; ++k)
+ {
+ /* if at least one of the two entries is not valid/filled, skip */
+ if (!v_vec[j].filled || !w_vec[k].filled) continue;
+
+ /* index of the entry of u that this combination contributes to */
+ int i = j + k + u_edge_count;
+
+ /* set the number of species */
+ unsigned int species_count = v_vec[j].species_count +
+ w_vec[k].species_count;
+
+ /* compute multi-rate coalescent log-likelihood */
+ double coal_multi_logl = v_vec[j].coal_multi_logl +
+ w_vec[k].coal_multi_logl;
+
+ /* compute coalescent edge count and length sum of subtree u */
+ double u_spec_edgelen_sum = v_vec[j].spec_edgelen_sum +
+ w_vec[k].spec_edgelen_sum +
+ u_edgelen_sum;
+ int coal_edge_count = node->edge_count - i; /* change to int */
+ double coal_edgelen_sum = node->edgelen_sum - u_spec_edgelen_sum;
+
+
+ /* compute single-rate coalescent log-likelihood */
+ double coal_single_logl = loglikelihood(coal_edge_count,coal_edgelen_sum);
+
+ /* compute total speciation log-likelihood */
+ double spec_edgelen_sum = node->spec_edgelen_sum +
+ u_edgelen_sum +
+ v_vec[j].spec_edgelen_sum +
+ w_vec[k].spec_edgelen_sum;
+
+ int spec_edge_count = node->spec_edge_count + i;
+ assert(species_count > 0);
+ spec_logl = loglikelihood(spec_edge_count,spec_edgelen_sum);
+
+
+ /* compute single- and multi-rate scores */
+ double score_multi = coal_multi_logl + spec_logl;
+ double score_single = coal_single_logl + spec_logl;
+ /* multi-rate values are the default; best_score is only meaningful when
+    u_vec[i].filled is set, which the test further down checks first */
+ double score = score_multi;
+ double best_score = u_vec[i].score_multi;
+
+ if (method == PTP_METHOD_SINGLE)
+ {
+ score = score_single;
+ best_score = u_vec[i].score_single;
+ }
+
+ /* keep the candidate if the entry is still empty or the candidate scores
+    strictly higher; vec_left/vec_right record where it came from */
+ if (!u_vec[i].filled || score > best_score)
+ {
+ u_vec[i].score_multi = score_multi;
+ u_vec[i].score_single = score_single;
+ u_vec[i].spec_edgelen_sum = u_spec_edgelen_sum;
+ u_vec[i].coal_multi_logl = coal_multi_logl;
+ u_vec[i].vec_left = j;
+ u_vec[i].vec_right = k;
+ u_vec[i].species_count = species_count;
+ u_vec[i].filled = 1;
+ }
+
+ }
+ }
+}
+
+/* Walk the DP tables from the given entry downwards, label the nodes and
+   print the resulting species.
+
+   node          : node whose table entry is followed
+   index         : entry of node->vector to follow
+   warning_minbr : set to true when a non-root speciation node has a branch
+                   length <= opt_minbr; never reset here
+   out           : stream receiving the species listing
+
+   An entry with both vec_left and vec_right set marks a speciation and is
+   followed into both children; any other entry marks the root of one
+   species, whose tips are printed under the next species number. */
+static void backtrack(rtree_t * node,
+                      int index,
+                      bool *warning_minbr,
+                      FILE * out)
+{
+  dp_vector_t * vec = node->vector;
+
+  if ((vec[index].vec_left != -1) && (vec[index].vec_right != -1))
+  {
+    node->event = EVENT_SPECIATION;
+
+    if (node->length <= opt_minbr && node->parent) *warning_minbr = true;
+
+    backtrack(node->left, vec[index].vec_left, warning_minbr, out);
+    backtrack(node->right,vec[index].vec_right,warning_minbr, out);
+  }
+  else
+  {
+    species_iter++;
+    node->event = EVENT_COALESCENT;
+
+    /* species_iter is unsigned int, so it is printed with %u */
+    fprintf(out, "\nSpecies %u:\n", species_iter);
+    rtree_print_tips(node,out);
+  }
+}
+
+/* Run the dynamic-programming delimitation on a tree and write the result.
+
+   tree   : root of the tree; dp_init() must have allocated the DP tables
+   method : PTP_METHOD_MULTI selects the root entry with the smallest AIC
+            score; otherwise the entry with the largest score_single is
+            selected and a likelihood ratio test against the null model is
+            performed
+
+   Statistics go to stdout (unless opt_quiet), the delimitation goes to the
+   file opened with open_file_ext("txt", opt_seed). When the likelihood
+   ratio test fails, the null model (one single species) is printed. */
+void dp_ptp(rtree_t * tree, long method)
+{
+  int i;
+  int lrt_pass;
+  int best_index = 0;
+  unsigned int species_count;
+  double max = 0;
+  double pvalue = -1;
+
+  /* reset species counter */
+  species_iter = 0;
+
+  /* fill DP table */
+  dp_recurse(tree, method);
+
+  /* obtain best entry in the root DP table */
+  /* NOTE(review): both loops below stop at tree->edge_count - 1 although the
+     table has tree->edge_count + 1 entries -- confirm that leaving out the
+     last entry is intended. */
+  dp_vector_t * vec = tree->vector;
+  if (method == PTP_METHOD_MULTI)
+  {
+    double min_aic_score = aic(vec[0].score_multi, vec[0].species_count, tree->leaves+2);
+    for (i = 1; i < tree->edge_count; i++)
+    {
+      if (vec[i].filled)
+      {
+        double aic_score = aic(vec[i].score_multi, vec[i].species_count, tree->leaves+2);
+        if (aic_score < min_aic_score)
+        {
+          min_aic_score = aic_score;
+          best_index = i;
+        }
+      }
+    }
+
+    /* report the score of the selected entry, mirroring the single-rate
+       branch; it would otherwise stay at the score of entry 0 */
+    max = vec[best_index].score_multi;
+  }
+  else
+  {
+    max = vec[0].score_single;
+    for (i = 1; i < tree->edge_count; i++)
+    {
+      if (max < vec[i].score_single && vec[i].filled)
+      {
+        max = vec[i].score_single;
+        best_index = i;
+      }
+    }
+  }
+
+  /* output some statistics */
+  if (!opt_quiet)
+  {
+    fprintf(stdout,
+            "Number of edges greater than minimum branch length: %d / %d\n",
+            tree->edge_count,
+            2 * tree->leaves - 2);
+    printf("Score Null Model: %.6f\n", tree->coal_logl);
+    fprintf(stdout, "Best score for single coalescent rate: %.6f\n",
+            vec[best_index].score_single);
+    fprintf(stdout, "Best score for multi coalescent rate: %.6f\n",
+            vec[best_index].score_multi);
+  }
+
+  /* do a Likelihood Ratio Test (lrt) and return the computed p-value */
+  species_count = vec[best_index].species_count;
+
+  // only do LRT for PTP, not for mPTP
+  lrt_pass = (method == PTP_METHOD_MULTI) ? 1 : lrt(tree->coal_logl,
+                                                    vec[best_index].score_single, 1, &pvalue);
+
+#ifndef HAVE_LIBGSL
+  fprintf(stderr, "WARNING: delimit was not compiled with libgsl. "
+                  "Likelihood ratio test disabled.\n");
+#endif
+
+#ifdef HAVE_LIBGSL
+  if (!opt_quiet && method == PTP_METHOD_SINGLE)
+    fprintf(stdout,"LRT computed p-value: %.6f\n", pvalue);
+#endif
+
+  /* initialize file name */
+  FILE * out = open_file_ext("txt", opt_seed);
+
+  if (!opt_quiet)
+    fprintf(stdout, "Writing delimitation file %s.txt ...\n", opt_outfile);
+
+  /* write information about delimitation to file */
+  output_info(out,
+              method,
+              tree->coal_logl,
+              max,
+              pvalue,
+              lrt_pass,
+              tree,
+              species_count);
+
+  /* if LRT passed, then back-track the DP table and print the delimitation,
+     otherwise print the null-model (one single species) */
+
+  if (lrt_pass)
+  {
+    bool warning_minbr = false;
+    backtrack(tree, best_index, &warning_minbr,out);
+    if (warning_minbr)
+      fprintf(stderr,"WARNING: A speciation edge is smaller than the specified "
+                     "minimum branch length.\n");
+  }
+  else
+  {
+    species_iter = 1;
+    fprintf(stdout, "LRT failed -- null-model is preferred and printed\n");
+    fprintf(out,"\nSpecies 1:\n");
+    rtree_print_tips(tree,out);
+  }
+
+  /* species_iter is unsigned int, so it is printed with %u */
+  if (!opt_quiet)
+    printf("Number of delimited species: %u\n", species_iter);
+
+  if (tree->edge_count == 0)
+    fprintf(stderr, "WARNING: The tree has no edges > %f. "
+                    "All edges have been ignored. \n", opt_minbr);
+
+  fclose(out);
+}
+
+/* Allocate and initialize the DP table of every node in the subtree rooted
+   at tree, and compute each node's coalescent log-likelihood.
+
+   Each table has edge_count + 1 zero-initialized entries whose vec_left and
+   vec_right are set to -1 (no backtracking information yet). Calls fatal()
+   when a table cannot be allocated. Release the tables with dp_free(). */
+void dp_init(rtree_t * tree)
+{
+  int i;
+
+  if (tree->left)  dp_init(tree->left);
+  if (tree->right) dp_init(tree->right);
+
+  /* checked before edge_count is used as an allocation size */
+  assert(tree->edge_count >= 0);
+
+  // TODO: Check whether this is the best way to handle those
+  // nasty zero-length edges.
+
+  tree->vector = calloc((size_t)(tree->edge_count + 1), sizeof(dp_vector_t));
+  if (!tree->vector)
+    fatal("Unable to allocate enough memory.");
+
+  for (i = 0; i <= tree->edge_count; i++)
+  {
+    tree->vector[i].vec_left  = -1;
+    tree->vector[i].vec_right = -1;
+  }
+
+  tree->coal_logl = loglikelihood(tree->edge_count,
+                                  tree->edgelen_sum);
+}
+
+/* Release the DP table of every node in the subtree rooted at tree. */
+void dp_free(rtree_t * tree)
+{
+  /* children first, then the node's own table */
+  if (tree->left != NULL)
+    dp_free(tree->left);
+
+  if (tree->right != NULL)
+    dp_free(tree->right);
+
+  /* free(NULL) is a no-op, so a missing table needs no special case */
+  free(tree->vector);
+}
+
+/* For every node of the subtree set spec_edge_count and spec_edgelen_sum to
+   the count and the total length of the edges longer than opt_minbr that
+   are direct child edges of the nodes on the path to the root, excluding
+   the node itself. Both values are 0 at a node without parent. A NULL node
+   is ignored. */
+void dp_set_pernode_spec_edges(rtree_t * node)
+{
+  if (!node) return;
+
+  node->spec_edge_count  = 0;
+  node->spec_edgelen_sum = 0;
+
+  if (node->parent)
+  {
+    rtree_t * up = node->parent;
+    const double child_len[2] = { up->left->length, up->right->length };
+    int side;
+
+    /* start from the totals accumulated at the parent ... */
+    node->spec_edge_count  = up->spec_edge_count;
+    node->spec_edgelen_sum = up->spec_edgelen_sum;
+
+    /* ... and add the parent's two child edges (left first, then right)
+       when they exceed the minimum branch length */
+    for (side = 0; side < 2; ++side)
+    {
+      if (child_len[side] > opt_minbr)
+      {
+        node->spec_edge_count++;
+        node->spec_edgelen_sum += child_len[side];
+      }
+    }
+  }
+
+  dp_set_pernode_spec_edges(node->left);
+  dp_set_pernode_spec_edges(node->right);
+}
diff --git a/src/fasta.c b/src/fasta.c
new file mode 100644
index 0000000..fb11deb
--- /dev/null
+++ b/src/fasta.c
@@ -0,0 +1,305 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Exelixis Lab, Heidelberg Institute for Theoretical Studies
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+#define MEMCHUNK 4096
+
+/* Please note that pll_fasta_getnext allocates a new buffer for the header
+ and a new buffer for the sequence on every call. On success both buffers
+ are handed over to the caller, who is responsible for freeing them. */
+
+/* Open a FASTA file and cache its first line.
+
+   filename : path of the file to open
+   map      : table giving the status of every input character; it is
+              stored in the handle and consulted by pll_fasta_getnext()
+
+   Returns a newly allocated handle, to be released with pll_fasta_close(),
+   or NULL on failure. On failure pll_errno and errmsg describe the error
+   and no resources remain allocated. */
+pll_fasta_t * pll_fasta_open(const char * filename, const unsigned int * map)
+{
+  int i;
+  pll_fasta_t * fd = (pll_fasta_t *)malloc(sizeof(pll_fasta_t));
+  if (!fd)
+  {
+    pll_errno = PLL_ERROR_MEM_ALLOC;
+    snprintf(errmsg, 200, "Unable to allocate enough memory.");
+    return NULL;
+  }
+
+  fd->lineno = 0;
+
+  fd->no = -1;
+
+  fd->chrstatus = map;
+
+  /* open file */
+  fd->fp = fopen(filename, "r");
+  if (!(fd->fp))
+  {
+    pll_errno = PLL_ERROR_FILE_OPEN;
+    snprintf(errmsg, 200, "Unable to open file (%s)", filename);
+    free(fd);
+    return NULL;
+  }
+
+  /* get filesize */
+  if (fseek(fd->fp, 0, SEEK_END))
+  {
+    pll_errno = PLL_ERROR_FILE_SEEK;
+    snprintf(errmsg, 200, "Unable to seek in file (%s)", filename);
+    goto fail;
+  }
+  fd->filesize = ftell(fd->fp);
+
+  rewind(fd->fp);
+
+  /* reset stripped char frequencies */
+  fd->stripped_count = 0;
+  for(i=0; i<256; i++)
+    fd->stripped[i] = 0;
+
+  /* cache the first line; an empty file is reported as an error */
+  fd->line[0] = 0;
+  if (!fgets(fd->line, PLL_LINEALLOC, fd->fp))
+  {
+    pll_errno = PLL_ERROR_FILE_SEEK;
+    snprintf(errmsg, 200, "Unable to read file (%s)", filename);
+    goto fail;
+  }
+  fd->lineno = 1;
+
+  return fd;
+
+fail:
+  /* the stream is open at this point and has to be closed together with
+     the handle */
+  fclose(fd->fp);
+  free(fd);
+  return NULL;
+}
+
+/* Move back to the start of the file, clear the stripped-character
+   statistics and cache the first line again. Returns PLL_SUCCESS, or
+   PLL_FAILURE with pll_errno and errmsg set when no line can be read. */
+int pll_fasta_rewind(pll_fasta_t * fd)
+{
+  int ch;
+
+  rewind(fd->fp);
+
+  /* reset stripped char frequencies */
+  fd->stripped_count = 0;
+  for (ch = 0; ch < 256; ++ch)
+    fd->stripped[ch] = 0;
+
+  /* cache the first line of the file */
+  fd->line[0] = 0;
+  if (fgets(fd->line, PLL_LINEALLOC, fd->fp))
+  {
+    fd->lineno = 1;
+    return PLL_SUCCESS;
+  }
+
+  pll_errno = PLL_ERROR_FILE_SEEK;
+  snprintf(errmsg, 200, "Unable to rewind and cache data");
+  return PLL_FAILURE;
+}
+
+/* Close the underlying stream and release the handle itself. */
+void pll_fasta_close(pll_fasta_t * fd)
+{
+  FILE * stream = fd->fp;
+
+  fclose(stream);
+  free(fd);
+}
+
+/* Read the next record of a FASTA file.
+
+   fd       : handle obtained from pll_fasta_open()
+   head     : receives a newly allocated, NUL-terminated header (the text
+              after '>' up to the first '\r' or '\n')
+   head_len : receives the header length
+   seq      : receives a newly allocated, NUL-terminated sequence holding
+              the characters whose status in fd->chrstatus is 1
+   seq_len  : receives the sequence length
+   seqno    : receives the zero-based number of the record
+
+   Returns PLL_SUCCESS, in which case the caller owns *head and *seq and
+   must free() them. Returns PLL_FAILURE otherwise: pll_errno is set
+   (PLL_ERROR_FILE_EOF when no record is left), both buffers have been
+   freed and *head and *seq are NULL. */
+int pll_fasta_getnext(pll_fasta_t * fd, char ** head,
+                      long * head_len, char ** seq,
+                      long * seq_len, long * seqno)
+{
+  void * mem;
+  long head_alloc = MEMCHUNK;
+  long seq_alloc = MEMCHUNK;
+  long headerlen;
+
+  *head_len = 0;
+  *seq_len = 0;
+  *seq = NULL;
+
+  /* allocate sequence buffers */
+  *head = (char *)malloc((size_t)(head_alloc));
+  if (!(*head))
+  {
+    pll_errno = PLL_ERROR_MEM_ALLOC;
+    snprintf(errmsg, 200, "Unable to allocate enough memory.");
+    return PLL_FAILURE;
+  }
+
+  *seq = (char *)malloc((size_t)(seq_alloc));
+  if (!(*seq))
+  {
+    pll_errno = PLL_ERROR_MEM_ALLOC;
+    snprintf(errmsg, 200, "Unable to allocate enough memory.");
+    goto fail;
+  }
+
+  /* an empty cached line means the previous call consumed the last record */
+  if (!fd->line[0])
+  {
+    snprintf(errmsg, 200, "End of file\n");
+    pll_errno = PLL_ERROR_FILE_EOF;
+    goto fail;
+  }
+
+  /* read header */
+
+  if (fd->line[0] != '>')
+  {
+    pll_errno = PLL_ERROR_FASTA_INVALIDHEADER;
+    snprintf(errmsg, 200, "Illegal header line in query fasta file");
+    goto fail;
+  }
+
+  /* the header ends at the first carriage return if there is one, otherwise
+     at the first newline (or at the end of the cached line) */
+  if (strchr(fd->line+1,'\r'))
+    headerlen = xstrchrnul(fd->line+1, '\r') - (fd->line+1);
+  else
+    headerlen = xstrchrnul(fd->line+1, '\n') - (fd->line+1);
+
+  *head_len = headerlen;
+
+  if (headerlen + 1 > head_alloc)
+  {
+    head_alloc = headerlen + 1;
+    mem = realloc(*head, (size_t)(head_alloc));
+    if (!mem)
+    {
+      pll_errno = PLL_ERROR_MEM_ALLOC;
+      snprintf(errmsg, 200, "Unable to allocate enough memory.");
+      goto fail;
+    }
+    *head = (char *)mem;
+  }
+
+  memcpy(*head, fd->line + 1, (size_t)headerlen);
+  *(*head + headerlen) = 0;
+
+  /* get next line; at end of file the cached line simply stays empty */
+
+  fd->line[0] = 0;
+  if (!fgets(fd->line, PLL_LINEALLOC, fd->fp))
+  {
+    /* do nothing */
+  }
+  fd->lineno++;
+
+  /* read sequence */
+
+  *seq_len = 0;
+
+  while (fd->line[0] && (fd->line[0] != '>'))
+  {
+    char c;
+    char m;
+    char * p = fd->line;
+
+    while((c = *p++))
+    {
+      /* Index through unsigned char: plain char may be signed, and a
+         negative value would address memory before the tables. */
+      const unsigned char uc = (unsigned char)c;
+
+      m = (char) fd->chrstatus[uc];
+      switch(m)
+      {
+        case 0:
+          /* character to be stripped */
+          fd->stripped_count++;
+          fd->stripped[uc]++;
+          break;
+
+        case 1:
+          /* legal character */
+          if (*seq_len + 1 > seq_alloc)
+          {
+            seq_alloc += MEMCHUNK;
+            mem = realloc(*seq, (size_t)(seq_alloc));
+            if (!mem)
+            {
+              pll_errno = PLL_ERROR_MEM_ALLOC;
+              snprintf(errmsg, 200,
+                       "Unable to allocate enough memory.");
+              goto fail;
+            }
+            *seq = (char *)mem;
+          }
+          *(*seq + *seq_len) = c;
+          (*seq_len)++;
+
+          break;
+
+        case 2:
+          /* fatal character */
+          if (c>=32)
+          {
+            pll_errno = PLL_ERROR_FASTA_ILLEGALCHAR;
+            snprintf(errmsg, 200, "illegal character '%c' "
+                                  "on line %ld in the fasta file",
+                                  c, fd->lineno);
+          }
+          else
+          {
+            pll_errno = PLL_ERROR_FASTA_UNPRINTABLECHAR;
+            snprintf(errmsg, 200, "illegal unprintable character "
+                                  "%#.2x (hexadecimal) on line %ld "
+                                  "in the fasta file",
+                                  (unsigned int)uc, fd->lineno);
+          }
+          /* release both buffers like every other error path */
+          goto fail;
+
+        case 3:
+          /* silently stripped chars */
+          break;
+
+        default:
+          /* any other status value is ignored */
+          break;
+      }
+    }
+
+    fd->line[0] = 0;
+    if (!fgets(fd->line, PLL_LINEALLOC, fd->fp))
+    {
+      /* do nothing */
+    }
+    fd->lineno++;
+  }
+
+  /* add zero after sequence */
+
+  if (*seq_len + 1 > seq_alloc)
+  {
+    seq_alloc += MEMCHUNK;
+    mem = realloc(*seq, (size_t)seq_alloc);
+    if (!mem)
+    {
+      pll_errno = PLL_ERROR_MEM_ALLOC;
+      snprintf(errmsg, 200, "Unable to allocate enough memory.");
+      goto fail;
+    }
+    *seq = (char *)mem;
+  }
+  *(*seq + *seq_len) = 0;
+
+  fd->no++;
+  *seqno = fd->no;
+
+  return PLL_SUCCESS;
+
+fail:
+  /* common error exit: nothing is handed to the caller */
+  free(*head);
+  free(*seq);
+  *head = NULL;
+  *seq = NULL;
+  return PLL_FAILURE;
+}
+
+/* Return the file size recorded in the handle by pll_fasta_open(). */
+long pll_fasta_getfilesize(pll_fasta_t * fd)
+{
+  const long recorded_size = fd->filesize;
+
+  return recorded_size;
+}
+
+/* Return the current position of the underlying stream as reported by
+   ftell(). */
+long pll_fasta_getfilepos(pll_fasta_t * fd)
+{
+  const long position = ftell(fd->fp);
+
+  return position;
+}
diff --git a/src/lex_rtree.l b/src/lex_rtree.l
new file mode 100644
index 0000000..f7c7d2c
--- /dev/null
+++ b/src/lex_rtree.l
@@ -0,0 +1,86 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+%{
+#include "parse_rtree.h"
+#include "mptp.h"
+
+/* number of characters collected so far for the quoted label being scanned;
+   the scanner rules reset it to 0 at every opening quote */
+static size_t string_length = 0;
+
+/* Append srclen bytes of src to the label collected in rtree_lval.s.
+
+   dstlen : current length of the collected label; increased by srclen
+   src    : bytes to append (at least srclen of them)
+   srclen : number of bytes to append
+
+   A new buffer is allocated on every call and the previous one is freed
+   when it held collected text. Returns the new NUL-terminated label, which
+   is also stored in rtree_lval.s. */
+static char * append(size_t * dstlen, const char * src, size_t srclen)
+{
+  char * mem = (char *)xmalloc((*dstlen + srclen + 1)*sizeof(char));
+
+  /* While nothing has been collected, rtree_lval.s does not refer to a
+     buffer owned by this function, so it is neither read nor freed. */
+  if (*dstlen)
+  {
+    memcpy(mem,rtree_lval.s,*dstlen);
+    free(rtree_lval.s);
+  }
+
+  /* the length is known, so a plain byte copy plus explicit terminator */
+  memcpy(mem+(*dstlen),src,srclen);
+  mem[*dstlen+srclen] = 0;
+
+  rtree_lval.s = mem;
+  *dstlen += srclen;
+  return rtree_lval.s;
+}
+
+%}
+/* Scanner producing the tokens COLON, SEMICOLON, CPAR, OPAR, COMMA, NUMBER
+   and STRING. The exclusive start conditions apos and quot collect the
+   contents of '...' and "..." labels through append().
+   NOTE(review): an empty quoted label ('' or "") returns STRING without any
+   append() call, so rtree_lval.s is not assigned on that path -- confirm
+   that the parser copes with it. */
+%option noyywrap
+%option prefix="rtree_"
+%option nounput
+%option noinput
+%x apos
+%x quot
+
+%%
+  /* inside "...": an escaped double quote is kept together with its
+     backslash, a single quote is literal text, an unescaped double quote
+     ends the label */
+<quot>{
+\\\" { append(&string_length, "\\\"", 2); }
+\' { append(&string_length, "\'", 1); }
+\" { BEGIN(INITIAL); return STRING; }
+}
+
+  /* inside '...': the same with the roles of the two quote characters
+     swapped */
+<apos>{
+\\\' { append(&string_length, "\\\'", 2); }
+\" { append(&string_length, "\"", 1); }
+\' { BEGIN(INITIAL); return STRING; }
+}
+
+  /* shared by both quoted states: escape sequences are appended verbatim
+     (backslash included), as is any run of other characters, newlines
+     included */
+<apos,quot>{
+\\n { append(&string_length, "\\n", 2); }
+\\t { append(&string_length, "\\t", 2); }
+\\ { append(&string_length, "\\", 1); }
+\\\\ { append(&string_length, "\\\\", 2); }
+([^\"\'\\]|\n)+ { append(&string_length, rtree_text, rtree_leng); }
+}
+
+  /* punctuation tokens; an opening quote resets the collected length and
+     enters the matching quoted state */
+\: { return COLON; }
+\; { return SEMICOLON; }
+\) { return CPAR; }
+\( { return OPAR; }
+\, { return COMMA; }
+\" { string_length = 0; BEGIN(quot); }
+\' { string_length = 0; BEGIN(apos); }
+  /* integers and floating point numbers are both passed on as a copy of the
+     matched text in rtree_lval.d */
+[-+]?[0-9]+ { rtree_lval.d = xstrndup(rtree_text,rtree_leng);
+ return NUMBER; }
+[+-]?(([0-9]+[\.]?[0-9]*)|([0-9]*[\.]?[0-9]+))([eE][+-]?[0-9]+)? {
+ rtree_lval.d = xstrndup(rtree_text,rtree_leng);
+ return NUMBER; }
+  /* unquoted label: a copy of the matched text is stored in rtree_lval.s */
+[^ \'\",\(\):;\[\]\t\n\r][^ \t\n\r\)\(\[\]\,:;]* {
+ rtree_lval.s = xstrndup(rtree_text,rtree_leng);
+ return STRING; }
+  /* whitespace is skipped; any other character is reported through fatal() */
+[ \t\n\r] { ; }
+. { fatal("Syntax error (%c)\n", rtree_text[0]); }
+%%
diff --git a/src/lex_utree.l b/src/lex_utree.l
new file mode 100644
index 0000000..50af97f
--- /dev/null
+++ b/src/lex_utree.l
@@ -0,0 +1,86 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+%{
+#include "parse_utree.h"
+#include "mptp.h"
+
+/* number of characters collected so far for the quoted label being scanned;
+   the scanner rules reset it to 0 at every opening quote */
+static size_t string_length = 0;
+
+/* Append srclen bytes of src to the label collected in utree_lval.s.
+
+   dstlen : current length of the collected label; increased by srclen
+   src    : bytes to append (at least srclen of them)
+   srclen : number of bytes to append
+
+   A new buffer is allocated on every call and the previous one is freed
+   when it held collected text. Returns the new NUL-terminated label, which
+   is also stored in utree_lval.s. */
+static char * append(size_t * dstlen, const char * src, size_t srclen)
+{
+  char * mem = (char *)xmalloc((*dstlen + srclen + 1)*sizeof(char));
+
+  /* While nothing has been collected, utree_lval.s does not refer to a
+     buffer owned by this function, so it is neither read nor freed. */
+  if (*dstlen)
+  {
+    memcpy(mem,utree_lval.s,*dstlen);
+    free(utree_lval.s);
+  }
+
+  /* the length is known, so a plain byte copy plus explicit terminator */
+  memcpy(mem+(*dstlen),src,srclen);
+  mem[*dstlen+srclen] = 0;
+
+  utree_lval.s = mem;
+  *dstlen += srclen;
+  return utree_lval.s;
+}
+
+%}
+/* Scanner producing the tokens COLON, SEMICOLON, CPAR, OPAR, COMMA, NUMBER
+   and STRING. The exclusive start conditions apos and quot collect the
+   contents of '...' and "..." labels through append().
+   NOTE(review): an empty quoted label ('' or "") returns STRING without any
+   append() call, so utree_lval.s is not assigned on that path -- confirm
+   that the parser copes with it. */
+%option noyywrap
+%option prefix="utree_"
+%option nounput
+%option noinput
+%x apos
+%x quot
+
+%%
+  /* inside "...": an escaped double quote is kept together with its
+     backslash, a single quote is literal text, an unescaped double quote
+     ends the label */
+<quot>{
+\\\" { append(&string_length, "\\\"", 2); }
+\' { append(&string_length, "\'", 1); }
+\" { BEGIN(INITIAL); return STRING; }
+}
+
+  /* inside '...': the same with the roles of the two quote characters
+     swapped */
+<apos>{
+\\\' { append(&string_length, "\\\'", 2); }
+\" { append(&string_length, "\"", 1); }
+\' { BEGIN(INITIAL);return STRING;}
+}
+
+  /* shared by both quoted states: escape sequences are appended verbatim
+     (backslash included), as is any run of other characters, newlines
+     included */
+<apos,quot>{
+\\n { append(&string_length, "\\n", 2); }
+\\t { append(&string_length, "\\t", 2); }
+\\ { append(&string_length, "\\", 1); }
+\\\\ { append(&string_length, "\\\\", 2); }
+([^\"\'\\]|\n)+ { append(&string_length, utree_text, utree_leng); }
+}
+
+  /* punctuation tokens; an opening quote resets the collected length and
+     enters the matching quoted state */
+\: { return COLON; }
+\; { return SEMICOLON; }
+\) { return CPAR; }
+\( { return OPAR; }
+\, { return COMMA; }
+\" { string_length = 0; BEGIN(quot); }
+\' { string_length = 0; BEGIN(apos); }
+  /* integers and floating point numbers are both passed on as a copy of the
+     matched text in utree_lval.d */
+[-+]?[0-9]+ { utree_lval.d = xstrndup(utree_text,utree_leng);
+ return NUMBER; }
+[+-]?(([0-9]+[\.]?[0-9]*)|([0-9]*[\.]?[0-9]+))([eE][+-]?[0-9]+)? {
+ utree_lval.d = xstrndup(utree_text,utree_leng);
+ return NUMBER; }
+  /* unquoted label: a copy of the matched text is stored in utree_lval.s */
+[^ \'\",\(\):;\[\]\t\n\r][^ \t\n\r\)\(\[\]\,:;]* {
+ utree_lval.s = xstrndup(utree_text,utree_leng);
+ return STRING; }
+  /* whitespace is skipped; any other character is reported through fatal() */
+[ \t\n\r] { ; }
+. { fatal("Syntax error (%c)\n", utree_text[0]); }
+%%
diff --git a/src/likelihood.c b/src/likelihood.c
new file mode 100644
index 0000000..eb9a07b
--- /dev/null
+++ b/src/likelihood.c
@@ -0,0 +1,55 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+
+
+/* Log-likelihood term for edge_count edges whose lengths add up to
+   edgelen_sum: edge_count * (log(edge_count) - 1 - log(edgelen_sum)).
+   Returns 0 when there are no edges or when the sum is below __DBL_MIN__.
+   edge_count must not be negative (checked with assert). */
+double loglikelihood(long edge_count, double edgelen_sum)
+{
+  assert(edge_count >= 0);
+
+  if (!edge_count || edgelen_sum < __DBL_MIN__)
+    return 0;
+
+  const double n = edge_count;
+
+  return n * (log(n) - 1 - log(edgelen_sum));
+}
+
+/* Likelihood-ratio test of ptp_logl against nullmodel_logl with df degrees
+   of freedom.
+
+   With HAVE_LIBGSL the p-value is stored in *pvalue and the function returns
+   0 when it exceeds opt_pvalue, 1 otherwise.  Without HAVE_LIBGSL nothing is
+   computed: *pvalue is left untouched and 1 is returned. */
+int lrt(double nullmodel_logl, double ptp_logl, unsigned int df, double * pvalue)
+{
+  int passed = 1;
+
+#ifdef HAVE_LIBGSL
+  /* test statistic: twice the difference of the two log-likelihoods */
+  const double statistic = 2*(ptp_logl - nullmodel_logl);
+
+  /* http://docs.scipy.org/doc/scipy/reference/generated/scipy.special.chdtr.html */
+  *pvalue = 1 - gsl_cdf_chisq_P(statistic,df);
+
+  if ((*pvalue) > opt_pvalue)
+    passed = 0;
+#endif
+
+  return passed;
+}
+
+/* Information criterion -2*logl + 2*k + 2*k*(k+1)/(n-k-1) for log-likelihood
+   logl and n observations.  A k greater than 1 is incremented by one before
+   it is used.  There is no guard for n == k+1 (zero denominator). */
+double aic(double logl, long k, long n)
+{
+  const long params = (k > 1) ? k + 1 : k;
+  const double correction = (double)(2*params*(params + 1)) / (double)(n-params-1);
+
+  return -2*logl + 2*params + correction;
+}
diff --git a/src/maps.c b/src/maps.c
new file mode 100644
index 0000000..5b09a80
--- /dev/null
+++ b/src/maps.c
@@ -0,0 +1,82 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Exelixis Lab, Heidelberg Institute for Theoretical Studies
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* maps for encoding sequences */
+
+const unsigned int pll_map_nt[256] = /* nucleotide code for every byte value; 0 = no code assigned */
+ { /* values match a 4-bit mask A=1 C=2 G=4 T/U=8, ambiguity letters being the OR of their bases (M=3, R=5, ... B=14); N, O, X, '-' and '?' are 15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, /* bytes 32-47: only '-' (45) is set */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, /* bytes 48-63: only '?' (63) is set */
+ 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 15, /* @ A B C D E F G H I J K L M N O */
+ 0, 0, 5, 6, 8, 8, 7, 9, 15, 10, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W X Y Z [ \ ] ^ _ */
+ 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 15, /* lower case: same codes as the upper-case row */
+ 0, 0, 5, 6, 8, 8, 7, 9, 15, 10, 0, 0, 0, 0, 0, 0, /* lower case p-z */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* bytes 128-255: no codes */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+/*
+ map for fasta parsing (nucleotide data only)
+
+ legal symbols: - ? and abcdghkmnrstuvwy, also upper case
+ fatal symbols: . and ascii 0-31 except tab (9), newline (10 and 13), vt (11), formfeed (12)
+ silently stripped: chrs 9-13
+ stripped: every other byte, i.e. efijlopqxz (also upper case), digits, space,
+ the remaining punctuation such as !"#$&'()*+,/:;<=>@^_` and all bytes above 127
+
+ the class assigned to each byte is given by the table below
+*/
+
+const unsigned int pll_map_fasta[256] = /* character class for every byte value, see legend below */
+ {
+ /*
+ 0=stripped, 1=legal, 2=fatal, 3=silently stripped
+ @ A B C D E F G H I J K L M N O
+ P Q R S T U V W X Y Z [ \ ] ^ _
+ */
+
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, /* bytes 0-15: 9-13 silently stripped, the rest fatal */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* bytes 16-31: all fatal */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, /* bytes 32-47: '-' legal, '.' fatal */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* bytes 48-63: '?' legal */
+ 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, /* @ A-O */
+ 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* P-Z [ \ ] ^ _ */
+ 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, /* lower case: same classes as the upper-case row */
+ 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* lower case p-z */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* bytes 128-255: all stripped */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
diff --git a/src/mptp.c b/src/mptp.c
new file mode 100644
index 0000000..0a15718
--- /dev/null
+++ b/src/mptp.c
@@ -0,0 +1,626 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+static char * progname; /* set to argv[0] by args_init(), printed by cmd_help() */
+static char progheader[80]; /* banner line written by fillheader(), printed by show_header() */
+char * cmdline; /* allocated by getentirecommandline(), freed at the end of main() */
+
+/* global error message buffer */
+char errmsg[200] = {0};
+
+/* global pseudo-random number generator 48-bit state */
+unsigned short global_xsubi[3];
+
+/* number of mandatory options for the user to input */
+static const char mandatory_options_count = 2; /* compared in args_init() with how many of opt_treefile/opt_outfile are set */
+static const char * mandatory_options_list = " --tree_file --output_file";
+
+/* options */ /* defaults for most of these are assigned at the top of args_init() */
+int pll_errno;
+int opt_quiet;
+int opt_precision;
+int opt_svg_showlegend; /* cleared by --svg_nolegend */
+long opt_help;
+long opt_version;
+long opt_treeshow;
+long opt_method; /* PTP_METHOD_SINGLE or PTP_METHOD_MULTI */
+long opt_mcmc_sample;
+long opt_mcmc_steps; /* argument of --mcmc */
+long opt_mcmc_log;
+long opt_mcmc_startnull;
+long opt_mcmc_startrandom;
+long opt_mcmc_startml;
+long opt_mcmc_burnin;
+long opt_mcmc_runs;
+long opt_seed; /* defaults to the current time in args_init() */
+long opt_mcmc; /* set when --mcmc is given */
+long opt_ml;
+long opt_multi;
+long opt_single;
+long opt_crop; /* set by --outgroup_crop */
+long opt_svg; /* not assigned anywhere in this file */
+long opt_svg_width;
+long opt_svg_fontsize;
+long opt_svg_tipspace;
+long opt_svg_marginleft;
+long opt_svg_marginright;
+long opt_svg_margintop;
+long opt_svg_marginbottom;
+long opt_svg_inner_radius;
+double opt_mcmc_credible;
+double opt_svg_legend_ratio;
+double opt_pvalue;
+double opt_minbr;
+char * opt_treefile; /* the char * options point at optarg, i.e. into argv */
+char * opt_outfile;
+char * opt_outgroup;
+char * opt_pdist_file; /* argument of --minbr_auto */
+
+static struct option long_options[] = /* table handed to getopt_long_only() in args_init() */
+{ /* the numbered comment on each entry is its index, which is the case label used in the switch of args_init(); keep both in sync */
+ {"help", no_argument, 0, 0 }, /* 0 */
+ {"version", no_argument, 0, 0 }, /* 1 */
+ {"quiet", no_argument, 0, 0 }, /* 2 */
+ {"tree_file", required_argument, 0, 0 }, /* 3 */
+ {"tree_show", no_argument, 0, 0 }, /* 4 */
+ {"output_file", required_argument, 0, 0 }, /* 5 */
+ {"outgroup", required_argument, 0, 0 }, /* 6 */
+ {"pvalue", required_argument, 0, 0 }, /* 7 */
+ {"minbr", required_argument, 0, 0 }, /* 8 */
+ {"svg_width", required_argument, 0, 0 }, /* 9 */
+ {"svg_fontsize", required_argument, 0, 0 }, /* 10 */
+ {"svg_tipspacing", required_argument, 0, 0 }, /* 11 */
+ {"svg_legend_ratio", required_argument, 0, 0 }, /* 12 */
+ {"svg_nolegend", no_argument, 0, 0 }, /* 13 */
+ {"svg_marginleft", required_argument, 0, 0 }, /* 14 */
+ {"svg_marginright", required_argument, 0, 0 }, /* 15 */
+ {"svg_margintop", required_argument, 0, 0 }, /* 16 */
+ {"svg_marginbottom", required_argument, 0, 0 }, /* 17 */
+ {"svg_inner_radius", required_argument, 0, 0 }, /* 18 */
+ {"precision", required_argument, 0, 0 }, /* 19 */
+ {"mcmc_sample", required_argument, 0, 0 }, /* 20 */
+ {"mcmc_log", no_argument, 0, 0 }, /* 21 */
+ {"seed", required_argument, 0, 0 }, /* 22 */
+ {"mcmc_startnull", no_argument, 0, 0 }, /* 23 */
+ {"mcmc_burnin", required_argument, 0, 0 }, /* 24 */
+ {"mcmc_startrandom", no_argument, 0, 0 }, /* 25 */
+ {"mcmc_runs", required_argument, 0, 0 }, /* 26 */
+ {"minbr_auto", required_argument, 0, 0 }, /* 27 */ /* argument is stored in opt_pdist_file */
+ {"outgroup_crop", no_argument, 0, 0 }, /* 28 */
+ {"mcmc_credible", required_argument, 0, 0 }, /* 29 */
+ {"mcmc", required_argument, 0, 0 }, /* 30 */ /* argument is stored in opt_mcmc_steps */
+ {"ml", no_argument, 0, 0 }, /* 31 */
+ {"single", no_argument, 0, 0 }, /* 32 */
+ {"multi", no_argument, 0, 0 }, /* 33 */
+ {"mcmc_startml", no_argument, 0, 0 }, /* 34 */
+ { 0, 0, 0, 0 } /* terminating all-zero entry */
+};
+
+/* Parse the command line into the global opt_* settings.
+
+   Every option is first reset to its default; each long option reported by
+   getopt_long_only() is then dispatched on its index in long_options[].
+   The process exits with EXIT_FAILURE when getopt_long_only() ends with a
+   value other than -1, and terminates through fatal() when more than one
+   command is given, when more than one --mcmc_start* mode is given, when
+   --multi and --single are combined, or when a mandatory option is missing.
+   If no command is given at all, opt_help is switched on and the
+   mandatory-option check is skipped. */
+void args_init(int argc, char ** argv)
+{
+  int option_index = 0;
+  int c;
+  int mand_options = 0;
+  int commands = 0;
+
+  /* set defaults */
+
+  progname = argv[0];
+
+  opt_help = 0;
+  opt_version = 0;
+  opt_treeshow = 0;
+  opt_treefile = NULL;
+  opt_outfile = NULL;
+  opt_outgroup = NULL;
+  opt_pdist_file = NULL;
+  opt_quiet = 0;
+  opt_pvalue = 0.001;
+  opt_minbr = 0.0001;
+  opt_precision = 7;
+  opt_mcmc_steps = 0;
+  opt_mcmc_sample = 1000;
+  opt_mcmc_startnull = 0;
+  opt_mcmc_startrandom = 0;
+  opt_mcmc_startml = 0;
+  opt_mcmc_log = 0;
+  opt_mcmc_burnin = 1;
+  opt_mcmc_runs = 1;
+  opt_mcmc_credible = 0.95;
+  opt_seed = (long)time(NULL);
+  opt_crop = 0;
+  opt_ml = 0;
+  opt_mcmc = 0;
+  opt_method = PTP_METHOD_MULTI;
+  opt_multi = 0;
+  opt_single = 0;
+
+  opt_svg_width = 1920;
+  opt_svg_fontsize = 12;
+  opt_svg_tipspace = 20;
+  opt_svg_legend_ratio = 0.1;
+  opt_svg_showlegend = 1;
+  opt_svg_marginleft = 20;
+  opt_svg_marginright = 20;
+  opt_svg_margintop = 20;
+  opt_svg_marginbottom = 20;
+  opt_svg_inner_radius = 0;
+
+  while ((c = getopt_long_only(argc, argv, "", long_options, &option_index)) == 0)
+  {
+    char * end;
+    switch (option_index)
+    {
+      case 0:
+        opt_help = 1;
+        break;
+
+      case 1:
+        opt_version = 1;
+        break;
+
+      case 2:
+        opt_quiet = 1;
+        break;
+
+      case 3:
+        /* optarg points into argv, not at heap memory: when the option is
+           repeated the previous value is simply replaced, never freed */
+        opt_treefile = optarg;
+        break;
+
+      case 4:
+        opt_treeshow = 1;
+        break;
+
+      case 5:
+        opt_outfile = optarg;
+        break;
+
+      case 6:
+        opt_outgroup = optarg;
+        break;
+
+      case 7:
+        opt_pvalue = strtod(optarg, &end);
+        if (end == optarg) {
+          fatal("%s is not a valid number.\n", optarg);
+        }
+        break;
+
+      case 8:
+        opt_minbr = strtod(optarg, &end);
+        if (end == optarg) {
+          fatal("%s is not a valid number.\n", optarg);
+        }
+        break;
+
+      case 9:
+        /* opt_svg_width is a long like the other SVG sizes */
+        opt_svg_width = atol(optarg);
+        break;
+
+      case 10:
+        opt_svg_fontsize = atol(optarg);
+        break;
+
+      case 11:
+        opt_svg_tipspace = atol(optarg);
+        break;
+
+      case 12:
+        opt_svg_legend_ratio = atof(optarg);
+        break;
+
+      case 13:
+        opt_svg_showlegend = 0;
+        break;
+
+      case 14:
+        opt_svg_marginleft = atol(optarg);
+        break;
+
+      case 15:
+        opt_svg_marginright = atol(optarg);
+        break;
+
+      case 16:
+        opt_svg_margintop = atol(optarg);
+        break;
+
+      case 17:
+        opt_svg_marginbottom = atol(optarg);
+        break;
+
+      case 18:
+        opt_svg_inner_radius = atol(optarg);
+        break;
+
+      case 19:
+        opt_precision = atoi(optarg);
+        break;
+
+      case 20:
+        opt_mcmc_sample = atol(optarg);
+        break;
+
+      case 21:
+        opt_mcmc_log = 1;
+        break;
+
+      case 22:
+        opt_seed = atol(optarg);
+        break;
+
+      case 23:
+        opt_mcmc_startnull = 1;
+        break;
+
+      case 24:
+        opt_mcmc_burnin = atol(optarg);
+        break;
+
+      case 25:
+        opt_mcmc_startrandom = 1;
+        break;
+
+      case 26:
+        opt_mcmc_runs = atol(optarg);
+        break;
+
+      case 27:
+        /* as for --tree_file: optarg is not heap memory, nothing to free */
+        opt_pdist_file = optarg;
+        break;
+
+      case 28:
+        opt_crop = 1;
+        break;
+
+      case 29:
+        opt_mcmc_credible = atof(optarg);
+        break;
+
+      case 30:
+        opt_mcmc = 1;
+        opt_mcmc_steps = atol(optarg);
+        break;
+
+      case 31:
+        opt_ml = 1;
+        break;
+
+      case 32:
+        opt_method = PTP_METHOD_SINGLE;
+        opt_single = 1;
+        break;
+
+      case 33:
+        opt_method = PTP_METHOD_MULTI;
+        opt_multi = 1;
+        break;
+
+      case 34:
+        opt_mcmc_startml = 1;
+        break;
+
+      default:
+        fatal("Internal error in option parsing");
+    }
+  }
+
+  /* the loop ends with -1 once all arguments are consumed; anything else
+     means getopt_long_only() rejected an argument */
+  if (c != -1)
+    exit(EXIT_FAILURE);
+
+  /* check for mandatory options */
+  if (opt_treefile)
+    mand_options++;
+  if (opt_outfile)
+    mand_options++;
+
+  /* check for number of independent commands selected */
+  if (opt_version)
+    commands++;
+  if (opt_help)
+    commands++;
+  if (opt_pdist_file)
+    commands++;
+  if (opt_mcmc)
+    commands++;
+  if (opt_ml)
+    commands++;
+
+  /* if more than one independent command, fail */
+  if (commands > 1)
+    fatal("More than one command specified");
+
+  /* the MCMC starting points are mutually exclusive */
+  if (opt_mcmc_startrandom + opt_mcmc_startnull + opt_mcmc_startml > 1)
+    fatal("You can only select one out of --mcmc_startrandom, --mcmc_startnull, --mcmc_startml");
+
+  /* the two rate models are mutually exclusive */
+  if (opt_multi && opt_single)
+    fatal("You can either specify --multi or --single, but not both at once.");
+
+  /* if no command specified, turn on --help */
+  if (!commands)
+  {
+    opt_help = 1;
+    return;
+  }
+
+  /* check for mandatory options */
+  if (!opt_version && !opt_help)
+    if (mand_options != mandatory_options_count)
+      fatal("Mandatory options are:\n\n%s", mandatory_options_list);
+
+}
+
+/* Print the usage line (with the program name recorded by args_init) and
+   the list of all options to stderr.  Options that require an argument are
+   shown with a placeholder for it. */
+void cmd_help()
+{
+  fprintf(stderr,
+          "Usage: %s [OPTIONS]\n", progname);
+  fprintf(stderr,
+          "\n"
+          "Examples:\n"
+          " mptp --ml --multi --tree_file tree.newick --output_file output\n"
+          " mptp --mcmc 50000000 --multi --mcmc_sample 1000000 --mcmc_burnin 1000000 --tree_file tree.newick --output_file output\n\n"
+          "General options:\n"
+          " --help display help information.\n"
+          " --version display version information.\n"
+          " --tree_show display an ASCII version of the tree.\n"
+          " --multi Use one lambda per coalescent (this is default).\n"
+          " --single Use one lambda for all coalescent.\n"
+          " --ml Maximum-likelihood heuristic.\n"
+          " --mcmc INT Support values for the delimitation (INT steps).\n"
+          " --mcmc_sample INT Sample every INT iteration (default: 1000).\n"
+          " --mcmc_log Log samples and create SVG plot of log-likelihoods.\n"
+          " --mcmc_burnin INT Ignore all MCMC steps below threshold.\n"
+          " --mcmc_runs INT Perform multiple MCMC runs.\n"
+          " --mcmc_credible <0..1> Credible interval (default: 0.95).\n"
+          " --mcmc_startnull Start each run with the null model (one single species).\n"
+          " --mcmc_startrandom Start each run with a random delimitation.\n"
+          " --mcmc_startml Start each run with the delimitation obtained by the Maximum-likelihood heuristic.\n"
+          " --pvalue REAL Set p-value for LRT (default: 0.001)\n"
+          " --minbr REAL Set minimum branch length (default: 0.0001)\n"
+          " --minbr_auto FILENAME Detect minimum branch length from FASTA p-distances\n"
+          " --outgroup TAXA Root unrooted tree at outgroup (default: taxon with longest branch).\n"
+          " --outgroup_crop Crop outgroup from tree\n"
+          " --quiet only output warnings and fatal errors to stderr.\n"
+          " --precision INT Precision of floating point numbers on output (default: 7).\n"
+          /* --seed takes a mandatory argument (required_argument in long_options) */
+          " --seed INT Seed for pseudo-random number generator.\n"
+          "\n"
+          "Input and output options:\n"
+          " --tree_file FILENAME tree file in newick format.\n"
+          " --output_file FILENAME output file name.\n"
+          "\n"
+          "Visualization options:\n"
+          " --svg_width INT Width of SVG tree in pixels (default: 1920).\n"
+          " --svg_fontsize INT Size of font in SVG image. (default: 12)\n"
+          " --svg_tipspacing INT Vertical space between taxa in SVG tree (default: 20).\n"
+          " --svg_legend_ratio <0..1> Ratio of total tree length to be displayed as legend line.\n"
+          " --svg_nolegend Hides legend.\n"
+          " --svg_marginleft INT Left margin in pixels (default: 20).\n"
+          " --svg_marginright INT Right margin in pixels (default: 20).\n"
+          " --svg_margintop INT Top margin in pixels (default: 20).\n"
+          " --svg_marginbottom INT Bottom margin in pixels (default: 20).\n"
+          " --svg_inner_radius INT Radius of inner nodes in pixels (default: 0).\n"
+         );
+}
+
+/* Read the tree in opt_treefile and return it as a rooted tree.
+
+   The file is first parsed as a rooted tree.  If that fails it is parsed as
+   an unrooted tree, which is converted with utree_convert_rtree() (or with
+   utree_crop() when opt_crop is set) at the outgroup: the node returned by
+   utree_outgroup_lca() when opt_outgroup is set, otherwise the tip returned
+   by utree_longest_branchtip().  For an input that is already rooted the
+   outgroup is only looked at when opt_crop is set, in which case it is
+   removed with rtree_crop().  Progress messages go to stdout unless
+   opt_quiet is set; unrecoverable conditions terminate through fatal(). */
+static rtree_t * load_tree(void)
+{
+  /* parse tree */
+  if (!opt_quiet)
+    fprintf(stdout, "Parsing tree file...\n");
+
+  rtree_t * rtree = rtree_parse_newick(opt_treefile);
+
+  if (!rtree)
+  {
+    unsigned int tip_count;
+    utree_t * utree = utree_parse_newick(opt_treefile, &tip_count);
+    if (!utree)
+      fatal("Tree is neither unrooted nor rooted.");
+
+    if (!opt_quiet)
+    {
+      fprintf(stdout, "Loaded unrooted tree...\n");
+      fprintf(stdout, "Converting to rooted tree...\n");
+    }
+
+    utree_t * og_root = NULL;
+
+    /* if outgroup was not specified, get the tip with the longest branch */
+    if (!opt_outgroup)
+    {
+      og_root = utree_longest_branchtip(utree, tip_count);
+      assert(og_root);
+
+      /* purely informational, so it honours --quiet like the other messages */
+      if (!opt_quiet)
+        fprintf(stdout,
+                "Selected %s as outgroup based on longest tip-branch criterion\n",
+                og_root->label);
+    }
+    else
+    {
+      /* get LCA of out group */
+      og_root = utree_outgroup_lca(utree, tip_count);
+      if (!og_root)
+      {
+        utree_destroy(utree);
+        fatal("Outgroup must be a single tip or a list of all tips of a subtree");
+      }
+    }
+
+    if (opt_crop)
+    {
+      rtree = utree_crop(og_root);
+    }
+    else
+    {
+      rtree = utree_convert_rtree(og_root);
+    }
+
+    /* the unrooted tree is released once the rooted tree has been built */
+    utree_destroy(utree);
+  }
+  else
+  {
+    if (!opt_quiet)
+      fprintf(stdout, "Loaded rooted tree...\n");
+
+    if (opt_crop)
+    {
+      if (!opt_outgroup)
+        fatal("--outgroup must be specified when using --outgroup_crop.");
+
+      /* get LCA of outgroup */
+      rtree_t * og_root = get_outgroup_lca(rtree);
+
+      /* crop outgroup from tree */
+      rtree = rtree_crop(rtree,og_root);
+      if (!rtree)
+        fatal("Cropping the outgroup leads to less than two tips.");
+    }
+  }
+
+  return rtree;
+}
+
+/* Command behind opt_pdist_file: load the tree, pass it to detect_min_bl()
+   and release it again. */
+void cmd_auto()
+{
+  rtree_t * tree = load_tree();
+  detect_min_bl(tree);
+
+  /* the tree is owned by this function */
+  rtree_destroy(tree);
+}
+
+/* Command behind opt_ml: load the tree, run the dp_* steps on it with the
+   selected opt_method, optionally show the tree as ASCII, hand it to
+   cmd_svg() and release it. */
+void cmd_ml(void)
+{
+  rtree_t * tree = load_tree();
+
+  /* dp_free() is called before any of the output steps below */
+  dp_init(tree);
+  dp_set_pernode_spec_edges(tree);
+  dp_ptp(tree, opt_method);
+  dp_free(tree);
+
+  if (opt_treeshow)
+  {
+    rtree_show_ascii(tree);
+  }
+
+  cmd_svg(tree, opt_seed, "svg");
+
+  /* the tree is owned by this function */
+  rtree_destroy(tree);
+
+  if (!opt_quiet)
+  {
+    fprintf(stdout, "Done...\n");
+  }
+}
+
+/* Command behind opt_mcmc: validate the MCMC settings, load the tree and
+   pass it to multirun() with the selected opt_method.
+
+   Terminates through fatal() when the step count given to --mcmc is not
+   positive, when --mcmc_burnin is below 1 or above the step count, or when
+   --mcmc_credible lies outside [0,1].  The messages use the option names as
+   the user types them (see long_options). */
+void cmd_multirun(void)
+{
+  /* a negative count is as invalid as zero */
+  if (opt_mcmc_steps <= 0)
+    fatal("The number of steps specified after --mcmc must be a positive integer greater than zero");
+
+  if (opt_mcmc_burnin < 1 || opt_mcmc_burnin > opt_mcmc_steps)
+    fatal("--mcmc_burnin must be a positive integer smaller or equal to the number of steps given to --mcmc");
+
+  if (opt_mcmc_credible < 0 || opt_mcmc_credible > 1)
+    fatal("--mcmc_credible must be a real number between 0 and 1");
+
+  rtree_t * rtree = load_tree();
+
+  multirun(rtree, opt_method);
+
+  if (opt_treeshow)
+    rtree_show_ascii(rtree);
+
+  /* NOTE(review): unlike cmd_ml() and cmd_auto() the tree is not passed to
+     rtree_destroy() here - confirm whether multirun() takes care of it */
+
+  if (!opt_quiet)
+    fprintf(stdout, "Done...\n");
+
+}
+
+/* Store a copy of the whole command line in the global cmdline: every
+   argument of argv followed by a single space, NUL-terminated.  The buffer
+   comes from xmalloc(). */
+void getentirecommandline(int argc, char * argv[])
+{
+  int total = 0;
+  int k;
+
+  for (k = 0; k < argc; ++k)
+    total += strlen(argv[k]);
+
+  /* one separator per argument plus the terminator */
+  cmdline = (char *)xmalloc((size_t)(total + argc + 1));
+
+  char * cursor = cmdline;
+
+  for (k = 0; k < argc; ++k)
+  {
+    size_t arglen = strlen(argv[k]);
+
+    memcpy(cursor, argv[k], arglen);
+    cursor += arglen;
+    *cursor++ = ' ';
+  }
+
+  *cursor = 0;
+}
+
+/* Format the banner line into progheader (at most 80 bytes including the
+   terminator): program name, version and architecture, the value of
+   arch_get_memtotal() divided three times by 1024, and the number of online
+   processors reported by sysconf(). */
+void fillheader()
+{
+  const double mem_gb = arch_get_memtotal() / 1024.0 / 1024.0 / 1024.0;
+  const long cores = sysconf(_SC_NPROCESSORS_ONLN);
+
+  snprintf(progheader, 80,
+           "%s %s_%s, %1.fGB RAM, %ld cores",
+           PROG_NAME, PROG_VERSION, PROG_ARCH,
+           mem_gb,
+           cores);
+}
+
+/* Print the banner prepared by fillheader(), the project URL and an empty
+   line to stdout. */
+void show_header()
+{
+  fprintf(stdout, "%s\n", progheader);
+  fputs("https://github.com/Pas-Kapli/mptp\n", stdout);
+  fputs("\n", stdout);
+}
+
+/* Program entry point: prepare the banner and the command-line copy, parse
+   the options, print the banner, seed the random number generator and run
+   the first command that is selected (help, --minbr_auto, --mcmc, --ml, in
+   that order).  Always returns 0; failures terminate elsewhere. */
+int main (int argc, char * argv[])
+{
+  fillheader();
+  getentirecommandline(argc, argv);
+  args_init(argc, argv);
+  show_header();
+
+  /* init random number generator and maintain compatibility with srand48 */
+  random_init(global_xsubi,opt_seed);
+
+  /* only one branch runs; the order below decides which one */
+  if (opt_help)
+    cmd_help();
+  else if (opt_pdist_file)
+    cmd_auto();
+  else if (opt_mcmc)
+    cmd_multirun();
+  else if (opt_ml)
+    cmd_ml();
+
+  free(cmdline);
+
+  return 0;
+}
diff --git a/src/mptp.h b/src/mptp.h
new file mode 100644
index 0000000..efd3e3c
--- /dev/null
+++ b/src/mptp.h
@@ -0,0 +1,419 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <search.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <pthread.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <time.h>
+#include <limits.h>
+#include <locale.h>
+#include <math.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if (defined(HAVE_CONFIG_H) && defined(HAVE_LIBGSL))
+#include <gsl/gsl_cdf.h>
+#endif
+
+/* constants */
+
+#define PROG_NAME PACKAGE /* PACKAGE and PACKAGE_VERSION are not defined in this header; presumably they come from config.h - confirm */
+#define PROG_VERSION PACKAGE_VERSION
+
+#ifdef __APPLE__
+#define PROG_ARCH "macosx_x86_64" /* fixed strings: chosen only by __APPLE__, not by the actual CPU */
+#else
+#define PROG_ARCH "linux_x86_64"
+#endif
+
+#define PLL_FAILURE 0 /* generic status codes */
+#define PLL_SUCCESS 1
+#define PLL_LINEALLOC 2048 /* same value as LINEALLOC below */
+#define PLL_ERROR_FILE_OPEN 1 /* error codes; presumably the values stored in pll_errno - confirm in fasta.c */
+#define PLL_ERROR_FILE_SEEK 2
+#define PLL_ERROR_FILE_EOF 3
+#define PLL_ERROR_FASTA_ILLEGALCHAR 4
+#define PLL_ERROR_FASTA_UNPRINTABLECHAR 5
+#define PLL_ERROR_FASTA_INVALIDHEADER 6
+#define PLL_ERROR_MEM_ALLOC 7
+
+#define LINEALLOC 2048 /* size of the line buffer in pll_fasta_t */
+
+#define EVENT_SPECIATION 0 /* values of rtree_t.event */
+#define EVENT_COALESCENT 1
+
+#define PTP_METHOD_SINGLE 0 /* values of opt_method (set by --single / --multi) */
+#define PTP_METHOD_MULTI 1
+
+#define REGEX_REAL "([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)" /* pattern text for a real number with optional sign and exponent */
+
+/* structures and data types */
+
+typedef unsigned int UINT32; /* the names suggest 32/16/8 bits, but the widths are those of the underlying native types */
+typedef unsigned short WORD;
+typedef unsigned char BYTE;
+
+typedef struct dp_vector_s /* element type of the dynamic programming vector attached to each rtree_t node */
+{
+ /* sum of speciation edge lengths of current subtree */
+ double spec_edgelen_sum;
+
+ /* coalescent logl of subtree for multi lambda */
+ double coal_multi_logl;
+
+ /* best single- and multi-rate log-likelihood for current subtree */
+ double score_multi;
+ double score_single;
+
+ /* back-tracking information */
+ int vec_left; /* presumably the entry chosen in the left child's vector - confirm in dp.c */
+ int vec_right; /* presumably the entry chosen in the right child's vector - confirm in dp.c */
+
+ unsigned int species_count;
+ int filled; /* presumably non-zero once this entry holds a valid score - confirm in dp.c */
+} dp_vector_t;
+
+typedef struct utree_s /* node record of an unrooted tree */
+{
+ char * label; /* printed as the taxon name when a tip is selected as outgroup in load_tree() */
+ double length; /* branch length */
+ int height;
+ struct utree_s * next; /* NOTE(review): the roles of next/back are not visible in this header - see utree.c */
+ struct utree_s * back;
+
+ void * data; /* opaque pointer; its use is not defined in this header */
+
+ /* for finding the lca */
+ int mark;
+
+} utree_t;
+
+typedef struct rtree_s /* node record of a rooted binary tree */
+{
+ char * label;
+ double length; /* branch length */
+ struct rtree_s * left; /* children */
+ struct rtree_s * right;
+ struct rtree_s * parent; /* NULL is treated as "no parent" by extract_croots_recursive() in multirun.c */
+ int leaves;
+
+ /* number of edges within current subtree with lengths greater than opt_minbr
+ and corresponding sum */
+ int edge_count;
+ double edgelen_sum;
+ double coal_logl;
+
+ /* minimum number of speciation edges if current node is the start of a
+ coalescent event, and the respective sum of lengths */
+ int spec_edge_count;
+ double spec_edgelen_sum;
+
+ /* which process does this node belong to (coalescent or speciation) */
+ int event; /* EVENT_SPECIATION or EVENT_COALESCENT */
+
+ /* slot in which the node resides when doing mcmc analysis */
+ long mcmc_slot;
+ long speciation_start;
+ long speciation_count;
+ double aic_weight_start;
+ double aic_support;
+ double support;
+
+ /* dynamic programming vector */
+ dp_vector_t * vector;
+
+ /* auxiliary data */
+ void * data;
+
+ /* for generating random delimitations */
+ int max_species_count;
+
+ /* mark */
+ int mark;
+ char * sequence;
+
+} rtree_t;
+
+typedef struct pll_fasta /* state of an open FASTA file, used by the pll_fasta_* functions */
+{
+ FILE * fp;
+ char line[LINEALLOC]; /* fixed-size line buffer */
+ const unsigned int * chrstatus; /* presumably the 256-entry map passed to pll_fasta_open(), e.g. pll_map_fasta - confirm in fasta.c */
+ long no;
+ long filesize;
+ long lineno;
+ long stripped_count;
+ long stripped[256]; /* one counter per byte value */
+} pll_fasta_t;
+
+/* macros */
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) /* smaller of a and b; the selected argument is evaluated twice, so avoid side effects */
+#define MAX(a,b) ((a) > (b) ? (a) : (b)) /* larger of a and b; same double-evaluation caveat */
+
+/* options */
+
+extern int opt_quiet;
+extern int opt_precision;
+extern int opt_svg_showlegend;
+extern long opt_help;
+extern long opt_version;
+extern long opt_treeshow;
+extern long opt_mcmc_sample;
+extern long opt_mcmc_steps;
+extern long opt_mcmc_log;
+extern long opt_mcmc_startml;
+extern long opt_mcmc_startnull;
+extern long opt_mcmc_startrandom;
+extern long opt_mcmc_burnin;
+extern long opt_mcmc_runs;
+extern long opt_seed;
+extern long opt_mcmc;
+extern long opt_ml;
+extern long opt_multi;
+extern long opt_single;
+extern long opt_method;
+extern long opt_crop;
+extern long opt_svg;
+extern long opt_svg_width;
+extern long opt_svg_fontsize;
+extern long opt_svg_tipspace;
+extern long opt_svg_marginleft;
+extern long opt_svg_marginright;
+extern long opt_svg_margintop;
+extern long opt_svg_marginbottom;
+extern long opt_svg_inner_radius;
+extern double opt_mcmc_credible;
+extern double opt_svg_legend_ratio;
+extern double opt_pvalue;
+extern double opt_minbr;
+extern char * opt_treefile;
+extern char * opt_outfile;
+extern char * opt_outgroup;
+extern char * opt_pdist_file;
+extern char * cmdline;
+
+/* common data */
+
+extern char errmsg[200];
+
+extern int pll_errno;
+extern unsigned short global_xsubi[3];
+extern const unsigned int pll_map_nt[256];
+extern const unsigned int pll_map_fasta[256];
+
+extern long mmx_present;
+extern long sse_present;
+extern long sse2_present;
+extern long sse3_present;
+extern long ssse3_present;
+extern long sse41_present;
+extern long sse42_present;
+extern long popcnt_present;
+extern long avx_present;
+extern long avx2_present;
+
+/* functions in util.c */
+
+void fatal(const char * format, ...) __attribute__ ((noreturn));
+void progress_init(const char * prompt, unsigned long size);
+void progress_update(unsigned int progress);
+void progress_done(void);
+void * xmalloc(size_t size);
+void * xcalloc(size_t nmemb, size_t size);
+void * xrealloc(void *ptr, size_t size);
+char * xstrchrnul(char *s, int c);
+char * xstrdup(const char * s);
+char * xstrndup(const char * s, size_t len);
+long getusec(void);
+void show_rusage(void);
+FILE * xopen(const char * filename, const char * mode);
+void random_init(unsigned short * rstate, long seedval);
+
+/* functions in mptp.c */
+
+void args_init(int argc, char ** argv);
+void cmd_help(void);
+void getentirecommandline(int argc, char * argv[]);
+void fillheader(void);
+void show_header(void);
+void cmd_ml(void);
+void cmd_multirun(void);
+void cmd_auto(void);
+
+/* functions in parse_rtree.y */
+
+rtree_t * rtree_parse_newick(const char * filename);
+void rtree_destroy(rtree_t * root);
+
+/* functions in parse_utree.y */
+
+utree_t * utree_parse_newick(const char * filename, unsigned int * tip_count);
+
+void utree_destroy(utree_t * root);
+
+/* functions in utree.c */
+
+void utree_show_ascii(utree_t * tree);
+char * utree_export_newick(utree_t * root);
+int utree_query_tipnodes(utree_t * root, utree_t ** node_list);
+int utree_query_innernodes(utree_t * root, utree_t ** node_list);
+rtree_t * utree_convert_rtree(utree_t * root);
+int utree_traverse(utree_t * root,
+ int (*cbtrav)(utree_t *),
+ utree_t ** outbuffer);
+utree_t * utree_longest_branchtip(utree_t * node, unsigned int tip_count);
+utree_t * utree_outgroup_lca(utree_t * root, unsigned int tip_count);
+rtree_t * utree_crop(utree_t * lca);
+
+/* functions in rtree.c */
+
+void rtree_show_ascii(rtree_t * tree);
+char * rtree_export_newick(rtree_t * root);
+int rtree_query_tipnodes(rtree_t * root, rtree_t ** node_list);
+int rtree_query_innernodes(rtree_t * root, rtree_t ** node_list);
+void rtree_reset_info(rtree_t * root);
+void rtree_print_tips(rtree_t * node, FILE * out);
+int rtree_traverse(rtree_t * root,
+ int (*cbtrav)(rtree_t *),
+ unsigned short * rstate,
+ rtree_t ** outbuffer);
+rtree_t * rtree_clone(rtree_t * node, rtree_t * parent);
+int rtree_traverse_postorder(rtree_t * root,
+ int (*cbtrav)(rtree_t *),
+ rtree_t ** outbuffer);
+rtree_t ** rtree_tipstring_nodes(rtree_t * root,
+ char * tipstring,
+ unsigned int * tiplist_count);
+rtree_t * get_outgroup_lca(rtree_t * root);
+rtree_t * rtree_lca(rtree_t * root,
+ rtree_t ** tip_nodes,
+ unsigned int count);
+rtree_t * rtree_crop(rtree_t * root, rtree_t * crop_root);
+int rtree_height(rtree_t * root);
+
+/* functions in parse_rtree.y */
+
+rtree_t * rtree_parse_newick(const char * filename);
+
+/* functions in lca_utree.c */
+
+void lca_init(utree_t * root);
+utree_t * lca_compute(utree_t * tip1, utree_t * tip2);
+void lca_destroy(void);
+
+/* functions in arch.c */
+
+unsigned long arch_get_memused(void);
+unsigned long arch_get_memtotal(void);
+
+/* functions in dp.c */
+
+void dp_init(rtree_t * tree);
+void dp_free(rtree_t * tree);
+void dp_ptp(rtree_t * rtree, long method);
+void dp_set_pernode_spec_edges(rtree_t * node);
+
+/* functions in svg.c */
+
+void cmd_svg(rtree_t * rtree, long seed, const char * ext);
+
+/* functions in likelihood.c */
+
+double loglikelihood(long edge_count, double edgelen_sum);
+int lrt(double nullmodel_logl, double ptp_logl, unsigned int df, double * pvalue);
+double aic(double logl, long k, long n);
+
+/* functions in output.c */
+
+void output_info(FILE * out,
+ long method,
+ double nullmodel_logl,
+ double logl,
+ double pvalue,
+ int lrt_result,
+ rtree_t * root,
+ unsigned int species_count);
+
+FILE * open_file_ext(const char * extension, long seed);
+
+/* functions in svg_landscape.c */
+
+void svg_landscape(double mcmc_min_log, double mcmc_max_logl, long seed);
+void svg_landscape_combined(double mcmc_min_log, double mcmc_max_logl, long runs, long * seed);
+
+/* functions in random.c */
+
+double random_delimitation(rtree_t * root,
+ long * delimited_species,
+ long * coal_edge_count,
+ double * coal_edgelen_sum,
+ long * spec_edge_count,
+ double * spec_edgelen_sum,
+ double * coal_score,
+ unsigned short * rstate);
+
+/* functions in multirun.c */
+
+void multirun(rtree_t * root, long method);
+
+/* functions in fasta.c */
+
+pll_fasta_t * pll_fasta_open(const char * filename,
+ const unsigned int * map);
+
+int pll_fasta_getnext(pll_fasta_t * fd, char ** head,
+ long * head_len, char ** seq,
+ long * seq_len, long * seqno);
+
+void pll_fasta_close(pll_fasta_t * fd);
+
+long pll_fasta_getfilesize(pll_fasta_t * fd);
+
+long pll_fasta_getfilepos(pll_fasta_t * fd);
+
+int pll_fasta_rewind(pll_fasta_t * fd);
+
+/* functions in auto.c */
+
+void detect_min_bl(rtree_t * rtree);
+
+/* functions in aic.c */
+
+void aic_mcmc(rtree_t * tree,
+ long method,
+ unsigned short * rstate,
+ long seed,
+ double * mcmc_min_logl,
+ double * mcmc_max_logl);
diff --git a/src/multirun.c b/src/multirun.c
new file mode 100644
index 0000000..1a53016
--- /dev/null
+++ b/src/multirun.c
@@ -0,0 +1,362 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+#define MPTP_INNER_CROOT 1
+#define MPTP_TIP_CROOT 2
+
+/* Average support value of a delimitation against one set of support values.
+
+   mlcroots[k] is the flag of node k (0, MPTP_INNER_CROOT or MPTP_TIP_CROOT)
+   and support[k] the support value stored for the same position. Entries
+   flagged MPTP_INNER_CROOT contribute 1-support[k], entries flagged
+   MPTP_TIP_CROOT contribute support[k], every other entry is ignored.
+   The result is the sum of the contributions divided by the number of
+   flagged entries, so the caller has to make sure at least one entry is
+   flagged. */
+static double asv(int * mlcroots, double * support, int count)
+{
+  double total = 0;
+  int flagged = 0;
+  int k;
+
+  for (k = 0; k < count; ++k)
+  {
+    switch (mlcroots[k])
+    {
+      case MPTP_INNER_CROOT:
+        total += (1-support[k]);
+        break;
+      case MPTP_TIP_CROOT:
+        total += support[k];
+        break;
+      default:
+        /* unflagged entry: neither summed nor counted */
+        continue;
+    }
+    ++flagged;
+  }
+
+  return total / flagged;
+}
+
+/* Pre-order walk that stores one flag per visited node in outbuffer and
+   advances *index after each store. A node whose edge_count is zero is not
+   visited and receives no slot. */
+static void extract_croots_recursive(rtree_t * node,
+                                     int * index,
+                                     int * outbuffer)
+{
+  int flag = 0;
+
+  if (node->edge_count == 0)
+    return;
+
+  if (!node->parent)
+  {
+    /* no parent: flagged whenever the node itself is coalescent */
+    if (node->event == EVENT_COALESCENT)
+      flag = MPTP_INNER_CROOT;
+  }
+  else if (node->event == EVENT_COALESCENT)
+  {
+    /* coalescent node sitting directly below a speciation node */
+    if (node->parent->event == EVENT_SPECIATION)
+      flag = MPTP_INNER_CROOT;
+  }
+  else if (node->event == EVENT_SPECIATION)
+  {
+    /* speciation node with at least one child whose edge_count is zero */
+    if (node->left->edge_count == 0 || node->right->edge_count == 0)
+      flag = MPTP_TIP_CROOT;
+  }
+
+  outbuffer[*index] = flag;
+  *index = *index+1;
+
+  extract_croots_recursive(node->left, index, outbuffer);
+  extract_croots_recursive(node->right, index, outbuffer);
+}
+
+/* Fill outbuffer with one flag per visited node of the tree (written by
+   extract_croots_recursive) and return how many of the written flags are
+   non-zero. Returns -1 without touching outbuffer when root->edge_count
+   is zero. */
+static int extract_croots(rtree_t * root, int * outbuffer)
+{
+  int written = 0;
+  int nonzero = 0;
+  int k;
+
+  if (root->edge_count == 0)
+    return -1;
+
+  extract_croots_recursive(root, &written, outbuffer);
+
+  for (k = 0; k < written; ++k)
+    nonzero += (outbuffer[k] != 0);
+
+  return nonzero;
+}
+
+/* Pre-order walk that appends node->support to outbuffer for every node
+   whose edge_count is non-zero; *index is the next free slot and is
+   advanced after each store. */
+static void extract_support_recursive(rtree_t * node,
+                                      int * index,
+                                      double * outbuffer)
+{
+  if (node->edge_count)
+  {
+    int slot = *index;
+
+    outbuffer[slot] = node->support;
+    *index = slot + 1;
+
+    extract_support_recursive(node->left, index, outbuffer);
+    extract_support_recursive(node->right, index, outbuffer);
+  }
+}
+
+/* Copy the support values of the tree into outbuffer in pre-order and
+   return the number of values written, or -1 (outbuffer untouched) when
+   root->edge_count is zero. */
+static int extract_support(rtree_t * root, double * outbuffer)
+{
+  int written = 0;
+
+  if (root->edge_count == 0)
+    return -1;
+
+  extract_support_recursive(root, &written, outbuffer);
+  return written;
+}
+
+/* Run opt_mcmc_runs MCMC chains, one per independent copy of the tree, and
+   report on them:
+
+     - per run: Newick tree and SVG with support values, optional
+       log-likelihood landscape;
+     - optional combined landscape when more than one run was made;
+     - average support value of the ML delimitation against every run;
+     - average standard deviation of the support values among runs;
+     - Newick tree and SVG carrying the per-node mean support of all runs.
+
+   root is used as the tree of run 0; the remaining runs, the ML tree and
+   the combined tree work on clones.
+
+   NOTE(review): trees[0] is root itself and is passed to rtree_destroy()
+   together with the clones, so root is no longer valid when this function
+   returns - confirm that the caller neither uses nor frees it afterwards. */
+void multirun(rtree_t * root, long method)
+{
+  long i,j;
+  long * seeds;
+  rtree_t * mltree;
+  rtree_t * ctree;
+  rtree_t ** trees;
+  unsigned short ** rstates;
+  double * mcmc_min_logl;
+  double * mcmc_max_logl;
+
+  trees = (rtree_t **)xmalloc((size_t)opt_mcmc_runs * sizeof(rtree_t *));
+  trees[0] = root;
+
+  /* clone trees in order to have one independent tree per run */
+  for (i = 1; i < opt_mcmc_runs; ++i)
+    trees[i] = rtree_clone(root, NULL);
+  mltree = rtree_clone(root,NULL);
+  ctree = rtree_clone(root,NULL);
+
+  /* allocate memory for storing min and max logl for each run */
+  mcmc_min_logl = (double *)xmalloc((size_t)opt_mcmc_runs * sizeof(double));
+  mcmc_max_logl = (double *)xmalloc((size_t)opt_mcmc_runs * sizeof(double));
+
+  /* reset to zero */
+  memset(mcmc_min_logl, 0, (size_t)opt_mcmc_runs * sizeof(double));
+  memset(mcmc_max_logl, 0, (size_t)opt_mcmc_runs * sizeof(double));
+
+  /* generate one seed for each run */
+  seeds = (long *)xmalloc((size_t)opt_mcmc_runs * sizeof(long));
+  for (i = 0; i < opt_mcmc_runs; ++i)
+    seeds[i] = nrand48(global_xsubi);
+
+  /* a single run uses the user-visible seed so that its output files are
+     named after it */
+  if (opt_mcmc_runs == 1)
+    seeds[0] = opt_seed;
+
+  /* initialize states for random number generators; each state consists of
+     three unsigned shorts (element size, not pointer size) */
+  rstates = (unsigned short **)xmalloc((size_t)opt_mcmc_runs *
+                                       sizeof(unsigned short *));
+  for (i = 0; i < opt_mcmc_runs; ++i)
+    rstates[i] = (unsigned short *)xmalloc(3*sizeof(unsigned short));
+
+  /* initialize a pseudo-random number generator for each run */
+  for (i = 0; i < opt_mcmc_runs; ++i)
+    random_init(rstates[i], seeds[i]);
+
+  /* create an array for storing the sum of support values for each node
+     across all MCMC runs */
+  double * combined_val;
+  combined_val = (double *)xmalloc((size_t)(root->leaves-1) * sizeof(double));
+  memset(combined_val,0,(root->leaves-1)*sizeof(double));
+
+  rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) *
+                                                   sizeof(rtree_t *));
+
+  /* execute each run sequentially */
+  for (i = 0; i < opt_mcmc_runs; ++i)
+  {
+    dp_init(trees[i]);
+    dp_set_pernode_spec_edges(trees[i]);
+    if (!opt_quiet)
+      fprintf(stdout, "\nMCMC run %ld...\n", i);
+    aic_mcmc(trees[i],
+             method,
+             rstates[i],
+             seeds[i],
+             mcmc_min_logl+i,
+             mcmc_max_logl+i);
+    dp_free(trees[i]);
+
+    /* add up support values */
+    rtree_query_innernodes(trees[i], inner_node_list);
+    for (j = 0; j < trees[i]->leaves-1; ++j)
+      combined_val[j] += inner_node_list[j]->support;
+
+    /* print SVG log-likelihood landscape of current run given its
+       generated seed */
+    if (opt_mcmc_log)
+    {
+      svg_landscape(mcmc_min_logl[i], mcmc_max_logl[i], seeds[i]);
+    }
+
+    /* output SVG tree with support values for current run */
+    char * newick = rtree_export_newick(trees[i]);
+
+    if (!opt_quiet)
+      fprintf(stdout,
+              "Creating tree with support values in %s.%ld.tree ...\n",
+              opt_outfile,
+              seeds[i]);
+
+    FILE * newick_fp = open_file_ext("tree", seeds[i]);
+    fprintf(newick_fp, "%s\n", newick);
+    fclose(newick_fp);
+
+    cmd_svg(trees[i], seeds[i], "svg");
+
+    free(newick);
+  }
+
+  /* compute the min and max log-l values among all runs */
+  double min_logl = mcmc_min_logl[0];
+  double max_logl = mcmc_max_logl[0];
+  for (i = 1; i < opt_mcmc_runs; ++i)
+  {
+    if (mcmc_min_logl[i] < min_logl) min_logl = mcmc_min_logl[i];
+    if (mcmc_max_logl[i] > max_logl) max_logl = mcmc_max_logl[i];
+  }
+
+  /* generate the SVG log-likelihood landscape for all runs combined */
+  if (!opt_quiet && opt_mcmc_log && (opt_mcmc_runs > 1))
+    fprintf(stdout, "\nPreparing overall log-likelihood landscape ...\n");
+  if (opt_mcmc_log && (opt_mcmc_runs > 1))
+    svg_landscape_combined(min_logl, max_logl, opt_mcmc_runs, seeds);
+
+  /* free min and max logl arrays */
+  free(mcmc_min_logl);
+  free(mcmc_max_logl);
+
+  /* extract the support values of every run, then release the run's tree;
+     support_count keeps the count reported for the last run and is -1 when
+     the tree has edge_count zero */
+  double ** support = (double **)xmalloc((size_t)opt_mcmc_runs *
+                                         sizeof(double *));
+  int support_count = 0;
+  for (i = 0; i < opt_mcmc_runs; ++i)
+  {
+    support[i] = (double *)xmalloc((size_t)(trees[i]->leaves) * sizeof(double));
+    support_count = extract_support(trees[i], support[i]);
+    rtree_destroy(trees[i]);
+  }
+
+  /* compute ML tree */
+  dp_init(mltree);
+  dp_set_pernode_spec_edges(mltree);
+  dp_ptp(mltree, method);
+  int * mlcroots = (int *)xmalloc((size_t)(mltree->leaves) * sizeof(int));
+  int croots_count = extract_croots(mltree, mlcroots);
+
+  /* If any of the two following conditions hold then the ML solution is the
+     null-model in the following form:
+
+      0 : we have n species (n = tips)
+     -1 : we have one species
+
+     In this case, ASV is not informative and hence it is skipped */
+  if (croots_count == 0 || croots_count == -1)
+    fprintf(stderr, "WARNING: ML delimitation is the null-model - ASV is skipped\n");
+  else
+  {
+    for (i = 0; i < opt_mcmc_runs; ++i)
+    {
+      printf("ML average support based on run with seed %ld : %.17f\n",
+             seeds[i],
+             asv(mlcroots, support[i], support_count));
+    }
+  }
+
+  dp_free(mltree);
+  rtree_destroy(mltree);
+  free(mlcroots);
+
+  /* compute the standard deviation of each support value given the runs,
+     and then compute a consensus average standard deviation for all support
+     values */
+  double mean, var, stdev, avg_stdev = 0;
+  for (i = 0; i < support_count; ++i)
+  {
+    mean = var = stdev = 0;
+    for (j = 0; j < opt_mcmc_runs; ++j)
+      mean += support[j][i];
+
+    mean /= opt_mcmc_runs;
+
+    for (j = 0; j < opt_mcmc_runs; ++j)
+      var += (mean - support[j][i])*(mean - support[j][i]);
+
+    var /= opt_mcmc_runs;
+    stdev = sqrt(var);
+
+    avg_stdev += stdev;
+  }
+
+  /* support_count is -1 or 0 when no support value was extracted; leave the
+     average at zero instead of dividing by a non-positive count */
+  if (support_count > 0)
+    avg_stdev /= support_count;
+
+  if (!opt_quiet)
+    printf("Average standard deviation of support values among runs: %f\n",
+           avg_stdev);
+
+  /* compute the combined support values */
+  for (j = 0; j < ctree->leaves-1; ++j)
+    combined_val[j] /= opt_mcmc_runs;
+
+  /* query inner nodes and set the combined support values */
+  rtree_query_innernodes(ctree, inner_node_list);
+  for (j = 0; j < ctree->leaves-1; ++j)
+    inner_node_list[j]->support = combined_val[j];
+
+  /* deallocate the structures */
+  free(inner_node_list);
+  free(combined_val);
+
+  /* export the combined tree */
+  char * newick = rtree_export_newick(ctree);
+
+  if (!opt_quiet)
+    fprintf(stdout,
+            "Creating tree with combined support values in %s.%ld.combined.tree ...\n",
+            opt_outfile,
+            opt_seed);
+
+  /* open, write, close, free newick */
+  FILE * newick_fp = open_file_ext("combined.tree", opt_seed);
+  fprintf(newick_fp, "%s\n", newick);
+  fclose(newick_fp);
+  free(newick);
+
+  /* create an SVG of the combined tree with support values */
+  cmd_svg(ctree, opt_seed, "combined.svg");
+
+  /* destroy combined tree */
+  rtree_destroy(ctree);
+
+  /* deallocate support values array */
+  for (i = 0; i < opt_mcmc_runs; ++i)
+    free(support[i]);
+  free(support);
+
+  /* deallocate the random number generator states, the seeds and the array
+     of tree pointers (the trees themselves were destroyed above) */
+  for (i = 0; i < opt_mcmc_runs; ++i)
+    free(rstates[i]);
+  free(rstates);
+  free(seeds);
+  free(trees);
+}
diff --git a/src/output.c b/src/output.c
new file mode 100644
index 0000000..e7bc9da
--- /dev/null
+++ b/src/output.c
@@ -0,0 +1,73 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* Open an output file for writing and return its stream.
+
+   The name is "<opt_outfile>.<seed>.<extension>" when opt_mcmc is set and
+   "<opt_outfile>.<extension>" otherwise. Failure to build the name ends in
+   fatal(); the file itself is opened through xopen(). */
+FILE * open_file_ext(const char * extension, long seed)
+{
+  char * path = NULL;
+  int rc;
+
+  rc = opt_mcmc ?
+         asprintf(&path, "%s.%ld.%s", opt_outfile, seed, extension) :
+         asprintf(&path, "%s.%s", opt_outfile, extension);
+
+  if (rc == -1)
+    fatal("Unable to allocate enough memory.");
+
+  FILE * fp = xopen(path,"w");
+  free(path);
+
+  return fp;
+}
+
+/* Write the run summary to out: the command line, the number of edges
+   counted in root->edge_count out of 2*leaves-2, the null-model score, the
+   best score labelled "single" or "multi" according to method, and the
+   number of delimited species.
+
+   pvalue and lrt_result are only printed when the program is built with
+   HAVE_LIBGSL and method is PTP_METHOD_SINGLE. */
+void output_info(FILE * out,
+                 long method,
+                 double nullmodel_logl,
+                 double logl,
+                 double pvalue,
+                 int lrt_result,
+                 rtree_t * root,
+                 unsigned int species_count)
+{
+  fprintf(out, "Command: %s\n", cmdline);
+  /* NOTE(review): %d assumes edge_count and leaves have type int - confirm
+     against the rtree_t declaration */
+  fprintf(out,
+          "Number of edges greater than minimum branch length: %d / %d\n",
+          root->edge_count,
+          2 * root->leaves - 2);
+  fprintf(out, "Null-model score: %.6f\n", nullmodel_logl);
+  fprintf(out,
+          "Best score for %s coalescent rate: %.6f\n",
+          (method == PTP_METHOD_SINGLE) ?
+            "single" : "multi",
+          logl);
+#ifdef HAVE_LIBGSL
+  if (method == PTP_METHOD_SINGLE)
+  {
+    fprintf(out, "LRT computed p-value: %.6f\n", pvalue);
+    fprintf(out, "LRT: %s\n", lrt_result ? "passed" : "failed");
+  }
+#endif
+  /* species_count is unsigned, so it needs the unsigned conversion */
+  fprintf(out, "Number of delimited species: %u\n", species_count);
+}
diff --git a/src/parse_rtree.y b/src/parse_rtree.y
new file mode 100644
index 0000000..82206ee
--- /dev/null
+++ b/src/parse_rtree.y
@@ -0,0 +1,198 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+%{
+#include "mptp.h"
+
+extern int rtree_lex();
+extern FILE * rtree_in;
+extern void rtree_lex_destroy();
+
+/* Release a rooted tree bottom-up: both subtrees first, then the node's
+   data and label, finally the node itself. A NULL root is a no-op. */
+void rtree_destroy(rtree_t * root)
+{
+  if (root)
+  {
+    rtree_destroy(root->left);
+    rtree_destroy(root->right);
+
+    free(root->data);    /* free(NULL) does nothing, no guard needed */
+    free(root->label);
+    free(root);
+  }
+}
+
+
+/* Error callback with an empty body: both arguments are ignored, so the
+   message s is never printed or stored.
+   NOTE(review): presumably this is the yyerror hook of the generated
+   rtree_ parser, which would leave the return value of rtree_parse() as the
+   only failure signal - confirm. */
+static void rtree_error(rtree_t * tree, const char * s)
+{
+}
+
+%}
+
+
+/* semantic values: labels and numbers travel as heap-allocated strings,
+   subtrees as node pointers */
+%union
+{
+ char * s;
+ char * d;
+ struct rtree_s * tree;
+}
+
+%error-verbose
+/* the parser fills in the root node handed in by the caller */
+%parse-param {struct rtree_s * tree}
+/* values discarded by the parser are released through these destructors */
+%destructor { rtree_destroy($$); } subtree
+%destructor { free($$); } STRING
+%destructor { free($$); } NUMBER
+%destructor { free($$); } label
+
+%token OPAR
+%token CPAR
+%token COMMA
+%token COLON SEMICOLON
+%token<s> STRING
+%token<d> NUMBER
+%type<s> label optional_label
+%type<d> number optional_length
+%type<tree> subtree
+%start input
+%%
+
+/* whole tree: "(subtree,subtree)label:length;" - the two subtrees become
+   the children of the caller-provided root node */
+input: OPAR subtree COMMA subtree CPAR optional_label optional_length SEMICOLON
+{
+ tree->left = $2;
+ tree->right = $4;
+ tree->label = $6;
+ tree->length = $7 ? atof($7) : 0;
+ tree->leaves = $2->leaves + $4->leaves;
+ tree->parent = NULL;
+ tree->event = EVENT_COALESCENT;
+ tree->data = NULL;
+ free($7);
+
+ tree->left->parent = tree;
+ tree->right->parent = tree;
+
+ /* edge statistics: totals of both children, plus each child's own branch
+ when it is longer than opt_minbr */
+ tree->edge_count = $2->edge_count + $4->edge_count;
+ tree->edgelen_sum = $2->edgelen_sum + $4->edgelen_sum;
+ if ($2->length > opt_minbr)
+ {
+ tree->edge_count++;
+ tree->edgelen_sum += $2->length;
+ }
+ if ($4->length > opt_minbr)
+ {
+ tree->edge_count++;
+ tree->edgelen_sum += $4->length;
+ }
+
+ /* 1 when no edge was counted below this node, otherwise the sum of the
+ children's values */
+ tree->max_species_count = 1;
+ if (tree->edge_count > 0)
+ tree->max_species_count = $2->max_species_count + $4->max_species_count;
+
+ tree->mark = 0;
+};
+
+/* inner node: same bookkeeping as the root, but on a freshly allocated node;
+   its parent pointer is set by the enclosing rule */
+subtree: OPAR subtree COMMA subtree CPAR optional_label optional_length
+{
+ /* NOTE(review): the calloc result is used without a NULL check */
+ $$ = (rtree_t *)calloc(1, sizeof(rtree_t));
+ $$->left = $2;
+ $$->right = $4;
+ $$->label = $6;
+ $$->length = $7 ? atof($7) : 0;
+ $$->leaves = $2->leaves + $4->leaves;
+ $$->event = EVENT_COALESCENT;
+ free($7);
+
+ $$->left->parent = $$;
+ $$->right->parent = $$;
+
+ $$->edge_count = $2->edge_count + $4->edge_count;
+ $$->edgelen_sum = $2->edgelen_sum + $4->edgelen_sum;
+ if ($2->length > opt_minbr)
+ {
+ $$->edge_count++;
+ $$->edgelen_sum += $2->length;
+ }
+ if ($4->length > opt_minbr)
+ {
+ $$->edge_count++;
+ $$->edgelen_sum += $4->length;
+ }
+
+ $$->max_species_count = 1;
+ if ($$->edge_count > 0)
+ $$->max_species_count = $2->max_species_count + $4->max_species_count;
+ $$->mark = 0;
+ $$->data = NULL;
+}
+ | label optional_length
+{
+ /* tip: one leaf, no children, no edges counted below it */
+ /* NOTE(review): the calloc result is used without a NULL check */
+ $$ = (rtree_t *)calloc(1, sizeof(rtree_t));
+ $$->label = $1;
+ $$->length = $2 ? atof($2) : 0;
+ $$->left = NULL;
+ $$->right = NULL;
+ $$->leaves = 1;
+ $$->event = EVENT_COALESCENT;
+
+ $$->edge_count = 0;
+ $$->edgelen_sum = 0;
+
+ $$->max_species_count = 1;
+ $$->mark = 0;
+ $$->data = NULL;
+
+ free($2);
+};
+
+
+/* a missing label or length yields NULL; a length is a number after ':' */
+optional_label: {$$ = NULL;} | label {$$ = $1;};
+optional_length: {$$ = NULL;} | COLON number {$$ = $2;};
+label: STRING {$$=$1;} | NUMBER {$$=$1;};
+number: NUMBER {$$=$1;};
+
+%%
+
+/* Parse the file filename into a rooted tree.
+
+   Returns the root node, which the caller releases with rtree_destroy(),
+   or NULL on failure. errmsg is set when memory for the root cannot be
+   allocated or the file cannot be opened; a parse failure returns NULL
+   without touching errmsg. */
+rtree_t * rtree_parse_newick(const char * filename)
+{
+  struct rtree_s * tree;
+
+  /* the grammar action writes straight into this node, so it must exist */
+  tree = (rtree_t *)calloc(1, sizeof(rtree_t));
+  if (!tree)
+  {
+    snprintf(errmsg, 200, "Unable to allocate enough memory.");
+    return NULL;
+  }
+
+  rtree_in = fopen(filename, "r");
+  if (!rtree_in)
+  {
+    rtree_destroy(tree);
+    snprintf(errmsg, 200, "Unable to open file (%s)", filename);
+    return NULL;
+  }
+
+  /* a non-zero result means the input was rejected */
+  if (rtree_parse(tree))
+  {
+    rtree_destroy(tree);
+    tree = NULL;
+  }
+
+  /* the stream and the scanner are released on success and failure alike */
+  fclose(rtree_in);
+  rtree_lex_destroy();
+
+  return tree;
+}
diff --git a/src/parse_utree.y b/src/parse_utree.y
new file mode 100644
index 0000000..73bb2fb
--- /dev/null
+++ b/src/parse_utree.y
@@ -0,0 +1,221 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+%{
+#include "mptp.h"
+
+extern int utree_lex();
+extern FILE * utree_in;
+extern void utree_lex_destroy();
+
+static unsigned int tip_cnt = 0;
+
+/* Release the subtree entered through node. A node without a next pointer
+   is a tip; otherwise node, node->next and node->next->next form one inner
+   node whose label is freed once, after the subtrees behind the other two
+   members have been released. */
+static void dealloc_tree_recursive(utree_t * node)
+{
+  utree_t * second = node->next;
+
+  if (second)
+  {
+    utree_t * third = second->next;
+
+    dealloc_tree_recursive(second->back);
+    dealloc_tree_recursive(third->back);
+
+    free(third);
+    free(second);
+  }
+
+  free(node->label);
+  free(node);
+}
+
+/* Release an unrooted tree starting at root. NULL is a no-op and a node
+   without a next pointer is freed as a single node. Otherwise the subtrees
+   behind root->next, root->next->next (when present) and root (when its
+   back pointer is set) are released before the root's own members. */
+void utree_destroy(utree_t * root)
+{
+  utree_t * second;
+
+  if (!root) return;
+
+  second = root->next;
+  if (second)
+  {
+    dealloc_tree_recursive(second->back);
+    if (second->next)
+      dealloc_tree_recursive(second->next->back);
+    if (root->back)
+      dealloc_tree_recursive(root->back);
+  }
+
+  free(root->label);
+  if (second)
+  {
+    free(second->next);
+    free(second);
+  }
+  free(root);
+}
+
+/* Error callback with an empty body: both arguments are ignored, so the
+   message s is never printed or stored.
+   NOTE(review): presumably this is the yyerror hook of the generated
+   utree_ parser, which would leave the return value of utree_parse() as the
+   only failure signal - confirm. */
+static void utree_error(utree_t * tree, const char * s)
+{
+}
+
+%}
+
+/* semantic values: labels and numbers travel as heap-allocated strings,
+   subtrees as node pointers */
+%union
+{
+ char * s;
+ char * d;
+ struct utree_s * tree;
+}
+
+%error-verbose
+/* the parser fills in the root node handed in by the caller */
+%parse-param {struct utree_s * tree}
+/* NOTE(review): only subtree values have a destructor here; string values
+   dropped by the parser are not freed - confirm whether that is intended */
+%destructor { utree_destroy($$); } subtree
+
+%token OPAR
+%token CPAR
+%token COMMA
+%token COLON SEMICOLON
+%token<s> STRING
+%token<d> NUMBER
+%type<s> label optional_label
+%type<d> number optional_length
+%type<tree> subtree
+%start input
+%%
+
+/* whole tree: "(subtree,subtree,subtree)label:length;" - the caller's node
+   and two new nodes are linked into a cycle through next, and each of the
+   three is connected to one subtree through back */
+input: OPAR subtree COMMA subtree COMMA subtree CPAR optional_label optional_length SEMICOLON
+{
+ /* NOTE(review): calloc results are used without a NULL check */
+ tree->next = (utree_t *)calloc(1, sizeof(utree_t));
+
+ tree->next->next = (utree_t *)calloc(1, sizeof(utree_t));
+ tree->next->next->next = tree;
+
+
+ tree->back = $2;
+ tree->next->back = $4;
+ tree->next->next->back = $6;
+
+ $2->back = tree;
+ $4->back = tree->next;
+ $6->back = tree->next->next;
+
+ /* all three members share one label string */
+ tree->label = $8;
+ tree->next->label = $8;
+ tree->next->next->label = $8;
+
+ /* each member copies the length stored in the subtree it is linked to */
+ tree->length = $2->length;
+ tree->next->length = $4->length;
+ tree->next->next->length = $6->length;
+
+ /* one more than the largest of the three subtree heights */
+ tree->height = ($2->height > $4->height) ?
+ (($2->height > $6->height) ? $2->height + 1 : $6->height + 1) :
+ (($4->height > $6->height) ? $4->height + 1 : $6->height + 1);
+ tree->next->height = tree->height;
+ tree->next->next->height = tree->height;
+
+ /* a length given after the closing parenthesis is parsed but not stored */
+ free($9);
+};
+
+/* inner node: three linked nodes as above; $$ keeps the node's own length,
+   and its back pointer is set by the enclosing rule */
+subtree: OPAR subtree COMMA subtree CPAR optional_label optional_length
+{
+ /* NOTE(review): calloc results are used without a NULL check */
+ $$ = (utree_t *)calloc(1, sizeof(utree_t));
+
+ $$->next = (utree_t *)calloc(1, sizeof(utree_t));
+
+ $$->next->next = (utree_t *)calloc(1, sizeof(utree_t));
+ $$->next->next->next = $$;
+
+
+ $$->next->back = $2;
+ $$->next->next->back = $4;
+
+ $2->back = $$->next;
+ $4->back = $$->next->next;
+
+ $$->label = $6;
+ $$->next->label = $6;
+ $$->next->next->label = $6;
+ $$->length = $7 ? atof($7) : 0;
+ $$->height = ($2->height > $4->height) ?
+ $2->height + 1 : $4->height + 1;
+ $$->next->height = $$->height;
+ $$->next->next->height = $$->height;
+
+ $$->mark = 0;
+ $$->next->mark = 0;
+ $$->next->next->mark = 0;
+
+ free($7);
+
+ $$->next->length = $2->length;
+ $$->next->next->length = $4->length;
+
+}
+ | label optional_length
+{
+ /* tip: single node of height 0; every tip bumps the file-level counter */
+ /* NOTE(review): the calloc result is used without a NULL check */
+ $$ = (utree_t *)calloc(1, sizeof(utree_t));
+
+ $$->label = $1;
+ $$->length = $2 ? atof($2) : 0;
+ $$->next = NULL;
+ $$->height = 0;
+ $$->mark = 0;
+ tip_cnt++;
+ free($2);
+};
+
+
+/* a missing label or length yields NULL; a length is a number after ':' */
+optional_label: { $$ = NULL;} | label {$$ = $1;};
+optional_length: { $$ = NULL;} | COLON number {$$ = $2;};
+label: STRING { $$=$1;} | NUMBER {$$=$1;};
+number: NUMBER { $$=$1;};
+
+%%
+
+/* Parse the file filename into an unrooted tree.
+
+   On success the root node is returned (release it with utree_destroy())
+   and *tip_count receives the number of tips read. On failure NULL is
+   returned and *tip_count is left untouched. errmsg is set when memory for
+   the root cannot be allocated or the file cannot be opened; a parse
+   failure returns NULL without touching errmsg. */
+utree_t * utree_parse_newick(const char * filename, unsigned int * tip_count)
+{
+  struct utree_s * tree;
+
+  /* reset tip count */
+  tip_cnt = 0;
+
+  /* the grammar action writes straight into this node, so it must exist */
+  tree = (utree_t *)calloc(1, sizeof(utree_t));
+  if (!tree)
+  {
+    snprintf(errmsg, 200, "Unable to allocate enough memory.");
+    return NULL;
+  }
+
+  utree_in = fopen(filename, "r");
+  if (!utree_in)
+  {
+    utree_destroy(tree);
+    snprintf(errmsg, 200, "Unable to open file (%s)", filename);
+    return NULL;
+  }
+
+  /* a non-zero result means the input was rejected */
+  if (utree_parse(tree))
+  {
+    utree_destroy(tree);
+    tree = NULL;
+  }
+
+  /* the stream and the scanner are released on success and failure alike */
+  fclose(utree_in);
+  utree_lex_destroy();
+
+  if (tree)
+    *tip_count = tip_cnt;
+
+  return tree;
+}
diff --git a/src/python/compare.py b/src/python/compare.py
new file mode 100755
index 0000000..41c2026
--- /dev/null
+++ b/src/python/compare.py
@@ -0,0 +1,90 @@
+#! /usr/bin/env python
+
+import commands
+import time
+
+def evaluate(treeFile, rooted):
+ cmd_multi = './delimit --ptp_multi --tree_file ' + treeFile + ' --output_file foo'
+ cmd_single = './delimit --ptp_single --tree_file ' + treeFile + ' --output_file foo'
+ cmd_ptp_rooted = './PTP/PTP.py -t ' + treeFile + ' -p -minbr 0 -o output -pvalue 1'
+ cmd_ptp_unrooted = './PTP/PTP.py -t ' + treeFile + ' -p -minbr 0 -o output -pvalue 1 -r'
+
+ if (rooted):
+ programs = [cmd_multi, cmd_single, cmd_ptp_rooted]
+ cmd_ptp = cmd_ptp_rooted
+ else:
+ programs = [cmd_multi, cmd_single, cmd_ptp_unrooted]
+ cmd_ptp = cmd_ptp_unrooted
+
+ scores = {}
+ times = {}
+
+ print "Testing " + treeFile + "..."
+
+ # cmd_ptp:
+ ts = time.time()
+ ( stat, output ) = commands.getstatusoutput(cmd_ptp)
+ te = time.time()
+ times['ptp'] = te-ts
+ #print output
+ left = output.find("MAX logl: ")
+ right = output[left+10:].find("\n")
+ score = output[left+10:right+left+10]
+ scores['ptp'] = score
+
+ # cmd_multi:
+ ts = time.time()
+ ( stat, output ) = commands.getstatusoutput(cmd_multi)
+ te = time.time()
+ times['multi'] = te-ts
+ #print output
+ left = output.find("Best score found single: ")
+ right = output[left+25:].find("\n")
+ score = output[left+25:right+left+25]
+ scores['multi'] = score
+
+ # cmd_single:
+ ts = time.time()
+ ( stat, output ) = commands.getstatusoutput(cmd_single)
+ te = time.time()
+ times['single'] = te-ts
+ #print output
+ left = output.find("Best score found single: ")
+ right = output[left+25:].find("\n")
+ score = output[left+25:right+left+25]
+ scores['single'] = score
+
+ print 'scores: '
+ print scores
+ print 'times: '
+ print times
+ print '\n'
+
+ return scores
+
+def compare_rooted():
+ with open('tree_names_rooted') as f_rooted:
+ content = f_rooted.read().splitlines()
+ #gnuplotOut = open('workfile', 'w')
+ for i in range (0, len(content)):
+ scores = evaluate('trees/' + content[i], True)
+ #gnuplotOut.write(str(i) + ' ' + scores['ptp'] + ' ' + scores['multi'] + ' ' + scores['single'] + '\n')
+ #print evaluate('trees/' + name)
+ #gnuplotOut.close()
+ #commands.getstatusoutput('gnuplot plotscript')
+ f_rooted.close()
+
+def compare_unrooted():
+ with open('tree_names_unrooted') as f_unrooted:
+ content = f_unrooted.read().splitlines()
+ #gnuplotOut = open('workfile', 'w')
+ for i in range (0, len(content)):
+ scores = evaluate('trees/' + content[i], False)
+ #gnuplotOut.write(str(i) + ' ' + scores['ptp'] + ' ' + scores['multi'] + ' ' + scores['single'] + '\n')
+ #print evaluate('trees/' + name)
+ #gnuplotOut.close()
+ #commands.getstatusoutput('gnuplot plotscript')
+ f_unrooted.close()
+
+compare_unrooted()
+compare_rooted()
diff --git a/src/python/create_delimit_results.py b/src/python/create_delimit_results.py
new file mode 100755
index 0000000..a062b5e
--- /dev/null
+++ b/src/python/create_delimit_results.py
@@ -0,0 +1,54 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file):
+ try:
+ open(input_tree_file)
+
+ if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)):
+ os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file))
+ if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)):
+ os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file))
+ if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)):
+ os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file))
+ if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)):
+ os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file))
+
+ delimit_single_minbr_0_call = "./delimit --ml_single --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+ delimit_multi_minbr_0_call = "./delimit --ml_multi --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+ delimit_single_minbr_default_call = "./delimit --ml_single --tree_file " + input_tree_file + " --output_file foo"
+ delimit_multi_minbr_default_call = "./delimit --ml_multi --tree_file " + input_tree_file + " --output_file foo"
+
+ (stat_single_minbr_0, output_single_minbr_0) = commands.getstatusoutput(delimit_single_minbr_0_call)
+ (stat_multi_minbr_0, output_multi_minbr_0) = commands.getstatusoutput(delimit_multi_minbr_0_call)
+ (stat_single_minbr_default, output_single_minbr_default) = commands.getstatusoutput(delimit_single_minbr_default_call)
+ (stat_multi_minbr_default, output_multi_minbr_default) = commands.getstatusoutput(delimit_multi_minbr_default_call)
+
+ delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w')
+ delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w')
+ delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w')
+ delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w')
+
+ delimit_single_minbr_0_out.write(output_single_minbr_0)
+ delimit_multi_minbr_0_out.write(output_multi_minbr_0)
+ delimit_single_minbr_default_out.write(output_single_minbr_default)
+ delimit_multi_minbr_default_out.write(output_multi_minbr_default)
+
+ delimit_single_minbr_0_out.close()
+ delimit_multi_minbr_0_out.close()
+ delimit_single_minbr_default_out.close()
+ delimit_multi_minbr_default_out.close()
+ except IOError:
+ print "File not found: " + input_tree_file
+
+set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"]
+
+for set_name in set_names:
+ for i in range(1,101):
+ input_tree_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_" + set_name + "_" + str(i) + ".phy"
+ output_delimit_single_minbr_0_file = "similar_to_GMYC_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = "similar_to_GMYC_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = "similar_to_GMYC_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = "similar_to_GMYC_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file)
diff --git a/src/python/create_delimit_results_simu_data.py b/src/python/create_delimit_results_simu_data.py
new file mode 100755
index 0000000..fc30f2f
--- /dev/null
+++ b/src/python/create_delimit_results_simu_data.py
@@ -0,0 +1,54 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file):
+ try:
+ open(input_tree_file)
+
+ if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)):
+ os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file))
+ if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)):
+ os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file))
+ if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)):
+ os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file))
+ if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)):
+ os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file))
+
+ delimit_single_minbr_0_call = "./delimit --ml_single --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+ delimit_multi_minbr_0_call = "./delimit --ml_multi --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+ delimit_single_minbr_default_call = "./delimit --ml_single --tree_file " + input_tree_file + " --output_file foo"
+ delimit_multi_minbr_default_call = "./delimit --ml_multi --tree_file " + input_tree_file + " --output_file foo"
+
+ (stat_single_minbr_0, output_single_minbr_0) = commands.getstatusoutput(delimit_single_minbr_0_call)
+ (stat_multi_minbr_0, output_multi_minbr_0) = commands.getstatusoutput(delimit_multi_minbr_0_call)
+ (stat_single_minbr_default, output_single_minbr_default) = commands.getstatusoutput(delimit_single_minbr_default_call)
+ (stat_multi_minbr_default, output_multi_minbr_default) = commands.getstatusoutput(delimit_multi_minbr_default_call)
+
+ delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w')
+ delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w')
+ delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w')
+ delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w')
+
+ delimit_single_minbr_0_out.write(output_single_minbr_0)
+ delimit_multi_minbr_0_out.write(output_multi_minbr_0)
+ delimit_single_minbr_default_out.write(output_single_minbr_default)
+ delimit_multi_minbr_default_out.write(output_multi_minbr_default)
+
+ delimit_single_minbr_0_out.close()
+ delimit_multi_minbr_0_out.close()
+ delimit_single_minbr_default_out.close()
+ delimit_multi_minbr_default_out.close()
+ except IOError:
+ print "File not found: " + input_tree_file
+
+set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"]
+
+for set_name in set_names:
+ for i in range(1,101):
+ input_tree_file = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_0_file = "SimulB_C_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = "SimulB_C_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = "SimulB_C_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = "SimulB_C_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file)
diff --git a/src/python/create_scoring_results.py b/src/python/create_scoring_results.py
new file mode 100755
index 0000000..52dcb7e
--- /dev/null
+++ b/src/python/create_scoring_results.py
@@ -0,0 +1,286 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def extract_tree_score(input_text):
+    """Return the tree penalty score found in input_text as an int.
+
+    The first line starting with "Tree penalty score:" is used; its second
+    ': '-separated field is converted with int(). Returns None when no line
+    matches.
+    """
+    for candidate in input_text.split('\n'):
+        if not candidate.startswith("Tree penalty score:"):
+            continue
+        return int(candidate.split(': ')[1])
+    return None
+
+def extract_nmi_score(input_text):
+    """Return the NMI score found in input_text as a float.
+
+    Only the first line starting with "NMI score:" is parsed (second
+    ': '-separated field). Returns None when no line matches.
+    """
+    values = (float(row.split(': ')[1])
+              for row in input_text.split('\n')
+              if row.startswith("NMI score:"))
+    # The generator is lazy, so later matching lines are never converted.
+    return next(values, None)
+
+def extract_num_species(input_text):
+    """Return the species count reported for the input file as an int.
+
+    The first line starting with "Number of species in input file:" is used;
+    its second ': '-separated field is converted with int(). A count of
+    exactly 1 is flagged on stdout before being returned. Returns None when
+    no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Number of species in input file:"):
+            count = int(line.split(': ')[1])
+            # This sanity check used to sit after the return statement and
+            # therefore never executed.
+            if count == 1:
+                print "Baaaaad data"
+            return count
+    return None
+
+def extract_num_real_species(input_text):
+    """Return the number of real species found in input_text as an int.
+
+    Uses the first line starting with "Number of real species:" and parses
+    its second ': '-separated field. Returns None when no line matches.
+    """
+    hits = [row for row in input_text.split('\n')
+            if row.startswith("Number of real species:")]
+    if not hits:
+        return None
+    return int(hits[0].split(': ')[1])
+
+def extract_score_real_single(input_text):
+    """Return the "Score real single:" value found in input_text as a float.
+
+    Only the first matching line is considered; its second ': '-separated
+    field is converted with float(). Returns None when no line matches.
+    """
+    prefix = "Score real single:"
+    result = None
+    for text_line in input_text.split('\n'):
+        if text_line.startswith(prefix):
+            result = float(text_line.split(': ')[1])
+            break
+    return result
+
+def extract_score_real_multi(input_text):
+    """Return the "Score real multi:" value found in input_text as a float.
+
+    Only the first matching line is parsed (second ': '-separated field).
+    Returns None when no line matches.
+    """
+    values = (float(row.split(': ')[1])
+              for row in input_text.split('\n')
+              if row.startswith("Score real multi:"))
+    return next(values, None)
+
+def extract_score_input_single(input_text):
+    """Return the "Score input single:" value found in input_text as a float.
+
+    Uses the first matching line and parses its second ': '-separated
+    field. Returns None when no line matches.
+    """
+    hits = [row for row in input_text.split('\n')
+            if row.startswith("Score input single:")]
+    if not hits:
+        return None
+    return float(hits[0].split(': ')[1])
+
+def extract_score_input_multi(input_text):
+    """Return the "Score input multi:" value found in input_text as a float.
+
+    The first line starting with "Score input multi:" is used; its second
+    ': '-separated field is converted with float(). Returns None when no
+    line matches.
+    """
+    for candidate in input_text.split('\n'):
+        if not candidate.startswith("Score input multi:"):
+            continue
+        return float(candidate.split(': ')[1])
+    return None
+
+def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_0):
+    """Parse the captured scoring output of every program into score tables.
+
+    Each output_* argument is the text captured from one scoring run.
+
+    Returns the tuple (tree_scores, nmi_scores, num_species, single_scores,
+    multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0,
+    score_real_single_minbr_default, score_real_multi_minbr_default,
+    num_real_species). The first five entries are dicts keyed by program
+    name; any value whose line is missing from the text is None.
+
+    If input_tree_file cannot be opened, a message is printed and None is
+    returned.
+    """
+    try:
+        # The tree file is only probed for readability; close the handle
+        # right away instead of leaving it open.
+        open(input_tree_file).close()
+    except IOError:
+        print "File not found: " + input_tree_file
+        return None
+
+    outputs = [
+        ('delimit_single_minbr_0', output_delimit_single_minbr_0),
+        ('delimit_multi_minbr_0', output_delimit_multi_minbr_0),
+        ('delimit_single_minbr_default', output_delimit_single_minbr_default),
+        ('delimit_multi_minbr_default', output_delimit_multi_minbr_default),
+        ('PTP_minbr_0', output_PTP_minbr_0),
+    ]
+
+    tree_scores = {}
+    nmi_scores = {}
+    num_species = {}
+    single_scores = {}
+    multi_scores = {}
+    tables = [
+        (tree_scores, extract_tree_score),
+        (nmi_scores, extract_nmi_score),
+        (num_species, extract_num_species),
+        (single_scores, extract_score_input_single),
+        (multi_scores, extract_score_input_multi),
+    ]
+    # Metric-major order keeps the parse sequence of the former unrolled
+    # assignments, so a malformed value fails at the same point.
+    for table, extract in tables:
+        for program, text in outputs:
+            table[program] = extract(text)
+
+    # The "real" reference values are read from the delimit_single outputs.
+    score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0)
+    score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0)
+    score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default)
+    score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default)
+    num_real_species = extract_num_real_species(output_delimit_single_minbr_0)
+
+    return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species)
+
+def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_0_file):
+    """Score every delimitation result against the tree and store the reports.
+
+    Runs "./delimit --score <result> ... --tree_file <tree> --output_file foo"
+    once per input_*_file, writes the captured output of each run to the
+    matching output_*_file (creating its parent directory when missing) and
+    returns grab_scorings() applied to the captured texts.
+
+    On IOError a message naming input_tree_file is printed and None is
+    returned.
+    """
+    try:
+        # Readability probe only; close the handle instead of leaking it.
+        open(input_tree_file).close()
+
+        # (result file to score, extra command line option, report file)
+        jobs = [
+            (input_delimit_single_minbr_0_file, " --min_br 0", output_delimit_single_minbr_0_file),
+            (input_delimit_multi_minbr_0_file, " --min_br 0", output_delimit_multi_minbr_0_file),
+            (input_delimit_single_minbr_default_file, "", output_delimit_single_minbr_default_file),
+            (input_delimit_multi_minbr_default_file, "", output_delimit_multi_minbr_default_file),
+            (input_PTP_minbr_0_file, " --min_br 0", output_PTP_minbr_0_file),
+        ]
+
+        for _, _, report_file in jobs:
+            report_dir = os.path.dirname(report_file)
+            if not os.path.exists(report_dir):
+                os.makedirs(report_dir)
+
+        captured = []
+        for result_file, min_br_option, _ in jobs:
+            command = "./delimit --score " + result_file + min_br_option + " --tree_file " + input_tree_file + " --output_file foo"
+            # The exit status is still ignored: a failed run simply yields
+            # text without score lines.
+            captured.append(commands.getstatusoutput(command)[1])
+
+        for (_, _, report_file), text in zip(jobs, captured):
+            # 'with' closes the report even when the write fails.
+            with open(report_file, 'w') as report:
+                report.write(text)
+
+        return grab_scorings(input_tree_file, *captured)
+    except IOError:
+        print "File not found: " + input_tree_file
+
+set_names = ["1", "5", "10", "20", "40", "80", "160"]  # one row per entry is written to every workfile below
+names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_0']  # program keys of the per-program score dicts
+
+gnuplotOut_tree_scores = open('workfile_tree_scores', 'w')  # whitespace-separated data files, one per metric; presumably read by 'plotscript' - confirm
+gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w')
+gnuplotOut_single_scores = open('workfile_single_scores', 'w')
+gnuplotOut_multi_scores = open('workfile_multi_scores', 'w')
+gnuplotOut_num_species = open('workfile_num_species', 'w')
+
+for set_name in set_names:  # sum the scores of trees 1..100 of this set, average them, write one row per workfile
+ num_valid_indices = 0  # number of trees that contributed to the sums
+ average_tree_scores = {}
+ average_nmi_scores = {}
+ average_num_species = {}
+ average_single_scores = {}
+ average_multi_scores = {}
+ average_real_num_species = 0
+ average_real_score_single_minbr_0 = 0
+ average_real_score_multi_minbr_0 = 0
+ average_real_score_single_minbr_default = 0
+ average_real_score_multi_minbr_default = 0
+
+ for name in names:  # the "average_*" variables hold running sums until the division further down
+ average_tree_scores[name] = 0
+ average_nmi_scores[name] = 0
+ average_num_species[name] = 0
+ average_single_scores[name] = 0
+ average_multi_scores[name] = 0
+
+ for i in range(1,101):
+ if (set_name == "1"):  # set "1" uses tree file names without the "_set_<name>" infix
+ input_tree_file = "unique_taxa_trees_big_dataset/set_" + set_name + "/RAxML_inferred_trees_unique_taxa/rooted.inferred_unique_taxa." + str(i)
+ else:
+ input_tree_file = "unique_taxa_trees_big_dataset/set_" + set_name + "/RAxML_inferred_trees_unique_taxa/rooted.inferred_unique_taxa_set_" + set_name + "." + str(i)
+
+ try:
+ open(input_tree_file)  # existence probe only; an IOError is reported by the handler below
+
+ input_delimit_single_minbr_0_file = "unique_taxa_big_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_0_file = "unique_taxa_big_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_single_minbr_default_file = "unique_taxa_big_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_default_file = "unique_taxa_big_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_PTP_minbr_0_file = "unique_taxa_big_PTP_minbr_0/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt"
+
+ score_path = "unique_taxa_big_scoring_results/"
+ output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_PTP_minbr_0_file = score_path + "PTP_minbr_0/set_" + set_name + "/PTP_score_set_" + set_name + "." + str(i) + ".txt"
+
+ (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file [...]
+
+ try:
+ for name in names:
+ average_tree_scores[name] = average_tree_scores[name] + tree_scores[name]
+ average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name]
+ average_num_species[name] = average_num_species[name] + num_species[name]
+ average_single_scores[name] = average_single_scores[name] + single_scores[name]
+ average_multi_scores[name] = average_multi_scores[name] + multi_scores[name]
+ average_real_num_species = average_real_num_species + num_real_species
+ average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0
+ average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0
+ average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default
+ average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default
+ except:  # NOTE(review): catches everything; sums already updated before the failure are not rolled back
+ print "File is bad: " + input_tree_file
+ num_valid_indices = num_valid_indices - 1  # cancels the increment that follows, so a bad tree is not counted
+
+ num_valid_indices = num_valid_indices + 1
+ except IOError:
+ #1
+ print "File not found: " + input_tree_file
+
+ if (num_valid_indices > 0):  # with no usable tree the sums are written out undivided
+ for name in names:
+ average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices)  # turn each sum into a mean
+ average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices)
+ average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices)
+ average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices)
+ average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices)
+
+ #print "Set " + set_name + ": Average tree score " + name
+ #print average_tree_scores[name]
+ #print "Set " + set_name + ": Average NMI score " + name
+ #print average_nmi_scores[name]
+ #print "Set " + set_name + ": Average num species " + name
+ #print average_num_species[name]
+ #print "Set " + set_name + ": Average input score single " + name
+ #print average_single_scores[name]
+ #print "Set " + set_name + ": Average input score multi " + name
+ #print average_multi_scores[name]
+ average_real_num_species = float(average_real_num_species) / float(num_valid_indices)
+ average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices)
+ average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices)
+ average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices)
+ average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices)
+ #print "Set " + set_name + ": Average real num species "
+ #print average_real_num_species
+ #print "Set " + set_name + ": Average real score single "
+ #print average_real_score_single
+ #print "Set " + set_name + ": Average real score multi "
+ #print average_real_score_multi
+
+ gnuplotOut_tree_scores.write(set_name + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['PTP_minbr_0']) + '\n')
+
+ gnuplotOut_nmi_scores.write(set_name + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['PTP_minbr_0']) + '\n')
+
+ gnuplotOut_single_scores.write(set_name + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['PTP_minbr_0']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n')
+
+ gnuplotOut_multi_scores.write(set_name + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['PTP_minbr_0']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n')
+
+ gnuplotOut_num_species.write(set_name + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['PTP_minbr_0']) + ' ' + str(average_real_num_species) + '\n')
+
+gnuplotOut_tree_scores.close()  # close all workfiles before gnuplot is started
+gnuplotOut_nmi_scores.close()
+gnuplotOut_single_scores.close()
+gnuplotOut_multi_scores.close()
+gnuplotOut_num_species.close()
+
+commands.getstatusoutput('gnuplot plotscript')  # exit status and output of gnuplot are discarded
diff --git a/src/python/create_scoring_results_with_gmyc.py b/src/python/create_scoring_results_with_gmyc.py
new file mode 100755
index 0000000..8350436
--- /dev/null
+++ b/src/python/create_scoring_results_with_gmyc.py
@@ -0,0 +1,329 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def extract_tree_score(input_text):
+    """Return the tree penalty score found in input_text as an int.
+
+    The first line starting with "Tree penalty score:" is used; its second
+    ': '-separated field is converted with int(). Returns None when no line
+    matches.
+    """
+    for candidate in input_text.split('\n'):
+        if not candidate.startswith("Tree penalty score:"):
+            continue
+        return int(candidate.split(': ')[1])
+    return None
+
+def extract_nmi_score(input_text):
+    """Return the NMI score found in input_text as a float.
+
+    Only the first line starting with "NMI score:" is parsed (second
+    ': '-separated field). Returns None when no line matches.
+    """
+    values = (float(row.split(': ')[1])
+              for row in input_text.split('\n')
+              if row.startswith("NMI score:"))
+    # The generator is lazy, so later matching lines are never converted.
+    return next(values, None)
+
+def extract_num_species(input_text):
+    """Return the species count reported for the input file as an int.
+
+    The first line starting with "Number of species in input file:" is used;
+    its second ': '-separated field is converted with int(). A count of
+    exactly 1 is flagged on stdout before being returned. Returns None when
+    no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Number of species in input file:"):
+            count = int(line.split(': ')[1])
+            # This sanity check used to sit after the return statement and
+            # therefore never executed.
+            if count == 1:
+                print "Baaaaad data"
+            return count
+    return None
+
+def extract_num_real_species(input_text):
+    """Return the number of real species found in input_text as an int.
+
+    Uses the first line starting with "Number of real species:" and parses
+    its second ': '-separated field. Returns None when no line matches.
+    """
+    hits = [row for row in input_text.split('\n')
+            if row.startswith("Number of real species:")]
+    if not hits:
+        return None
+    return int(hits[0].split(': ')[1])
+
+def extract_score_real_single(input_text):
+    """Return the "Score real single:" value found in input_text as a float.
+
+    Only the first matching line is considered; its second ': '-separated
+    field is converted with float(). Returns None when no line matches.
+    """
+    prefix = "Score real single:"
+    result = None
+    for text_line in input_text.split('\n'):
+        if text_line.startswith(prefix):
+            result = float(text_line.split(': ')[1])
+            break
+    return result
+
+def extract_score_real_multi(input_text):
+    """Return the "Score real multi:" value found in input_text as a float.
+
+    Only the first matching line is parsed (second ': '-separated field).
+    Returns None when no line matches.
+    """
+    values = (float(row.split(': ')[1])
+              for row in input_text.split('\n')
+              if row.startswith("Score real multi:"))
+    return next(values, None)
+
+def extract_score_input_single(input_text):
+    """Return the "Score input single:" value found in input_text as a float.
+
+    Uses the first matching line and parses its second ': '-separated
+    field. Returns None when no line matches.
+    """
+    hits = [row for row in input_text.split('\n')
+            if row.startswith("Score input single:")]
+    if not hits:
+        return None
+    return float(hits[0].split(': ')[1])
+
+def extract_score_input_multi(input_text):
+    """Return the "Score input multi:" value found in input_text as a float.
+
+    The first line starting with "Score input multi:" is used; its second
+    ': '-separated field is converted with float(). Returns None when no
+    line matches.
+    """
+    for candidate in input_text.split('\n'):
+        if not candidate.startswith("Score input multi:"):
+            continue
+        return float(candidate.split(': ')[1])
+    return None
+
+def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_default, output_gmyc_minbr_0):
+    """Parse the captured scoring output of every program into score tables.
+
+    Each output_* argument is the text captured from one scoring run.
+
+    Returns the tuple (tree_scores, nmi_scores, num_species, single_scores,
+    multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0,
+    score_real_single_minbr_default, score_real_multi_minbr_default,
+    num_real_species). The first five entries are dicts keyed by program
+    name; any value whose line is missing from the text is None.
+
+    If input_tree_file cannot be opened, a message is printed and None is
+    returned.
+    """
+    try:
+        # The tree file is only probed for readability; close the handle
+        # right away instead of leaving it open.
+        open(input_tree_file).close()
+    except IOError:
+        print "File not found: " + input_tree_file
+        return None
+
+    outputs = [
+        ('delimit_single_minbr_0', output_delimit_single_minbr_0),
+        ('delimit_multi_minbr_0', output_delimit_multi_minbr_0),
+        ('delimit_single_minbr_default', output_delimit_single_minbr_default),
+        ('delimit_multi_minbr_default', output_delimit_multi_minbr_default),
+        ('PTP_minbr_default', output_PTP_minbr_default),
+        ('gmyc_minbr_0', output_gmyc_minbr_0),
+    ]
+
+    tree_scores = {}
+    nmi_scores = {}
+    num_species = {}
+    single_scores = {}
+    multi_scores = {}
+    tables = [
+        (tree_scores, extract_tree_score),
+        (nmi_scores, extract_nmi_score),
+        (num_species, extract_num_species),
+        (single_scores, extract_score_input_single),
+        (multi_scores, extract_score_input_multi),
+    ]
+    # Metric-major order keeps the parse sequence of the former unrolled
+    # assignments, so a malformed value fails at the same point.
+    for table, extract in tables:
+        for program, text in outputs:
+            table[program] = extract(text)
+
+    # The "real" reference values are read from the delimit_single outputs.
+    score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0)
+    score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0)
+    score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default)
+    score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default)
+    num_real_species = extract_num_real_species(output_delimit_single_minbr_0)
+
+    return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species)
+
+def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_default_file, output_gmyc_minbr_0_file):
+    """Score every delimitation result against the tree and store the reports.
+
+    Runs "./delimit --score <result> ... --tree_file <tree> --output_file foo"
+    once per input_*_file, writes the captured output of each run to the
+    matching output_*_file (creating its parent directory when missing) and
+    returns grab_scorings() applied to the captured texts.
+
+    On IOError a message naming input_tree_file is printed and None is
+    returned.
+    """
+    try:
+        # Readability probe only; close the handle instead of leaking it.
+        open(input_tree_file).close()
+
+        # (result file to score, extra command line option, report file)
+        # NOTE(review): the PTP "minbr_default" result is scored with
+        # "--min_br 0", exactly as before - confirm this is intended.
+        jobs = [
+            (input_delimit_single_minbr_0_file, " --min_br 0", output_delimit_single_minbr_0_file),
+            (input_delimit_multi_minbr_0_file, " --min_br 0", output_delimit_multi_minbr_0_file),
+            (input_delimit_single_minbr_default_file, "", output_delimit_single_minbr_default_file),
+            (input_delimit_multi_minbr_default_file, "", output_delimit_multi_minbr_default_file),
+            (input_PTP_minbr_default_file, " --min_br 0", output_PTP_minbr_default_file),
+            (input_gmyc_minbr_0_file, " --min_br 0", output_gmyc_minbr_0_file),
+        ]
+
+        for _, _, report_file in jobs:
+            report_dir = os.path.dirname(report_file)
+            if not os.path.exists(report_dir):
+                os.makedirs(report_dir)
+
+        captured = []
+        for result_file, min_br_option, _ in jobs:
+            command = "./delimit --score " + result_file + min_br_option + " --tree_file " + input_tree_file + " --output_file foo"
+            # The exit status is still ignored: a failed run simply yields
+            # text without score lines.
+            captured.append(commands.getstatusoutput(command)[1])
+
+        for (_, _, report_file), text in zip(jobs, captured):
+            # 'with' closes the report even when the write fails.
+            with open(report_file, 'w') as report:
+                report.write(text)
+
+        return grab_scorings(input_tree_file, *captured)
+    except IOError:
+        print "File not found: " + input_tree_file
+
+set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"]  # data sets iterated by the main loop
+names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_default', 'gmyc_minbr_0']  # program keys of the per-program score dicts
+
+gnuplotOut_tree_scores = open('workfile_tree_scores', 'w')  # the five aggregate workfiles are created (or truncated) up front
+gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w')
+gnuplotOut_single_scores = open('workfile_single_scores', 'w')
+gnuplotOut_multi_scores = open('workfile_multi_scores', 'w')
+gnuplotOut_num_species = open('workfile_num_species', 'w')
+
+for set_name in set_names:
+ gnuplotOut_tree_scores_current_set = open('workfile_tree_scores_' + set_name, 'w')
+ gnuplotOut_nmi_scores_current_set = open('workfile_nmi_scores_' + set_name, 'w')
+ gnuplotOut_single_scores_current_set = open('workfile_single_scores_' + set_name, 'w')
+ gnuplotOut_multi_scores_current_set = open('workfile_multi_scores_' + set_name, 'w')
+ gnuplotOut_num_species_current_set = open('workfile_num_species_' + set_name, 'w')
+ gnuplotOut_delta_species_current_set = open('workfile_delta_species_' + set_name, 'w')
+
+ num_valid_indices = 0
+ average_tree_scores = {}
+ average_nmi_scores = {}
+ average_num_species = {}
+ average_single_scores = {}
+ average_multi_scores = {}
+ average_real_num_species = 0
+ average_real_score_single_minbr_0 = 0
+ average_real_score_multi_minbr_0 = 0
+ average_real_score_single_minbr_default = 0
+ average_real_score_multi_minbr_default = 0
+
+ for name in names:
+ average_tree_scores[name] = 0
+ average_nmi_scores[name] = 0
+ average_num_species[name] = 0
+ average_single_scores[name] = 0
+ average_multi_scores[name] = 0
+ num_bad_guys = 0
+
+ for i in range(1,101):
+ input_tree_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_" + set_name + "_" + str(i) + ".phy"
+
+ try:
+ open(input_tree_file)
+
+ input_delimit_single_minbr_0_file = "similar_to_GMYC_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_0_file = "similar_to_GMYC_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_single_minbr_default_file = "similar_to_GMYC_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_default_file = "similar_to_GMYC_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_PTP_minbr_default_file = "similar_to_GMYC_PTP_minbr_default/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt"
+ input_gmyc_minbr_0_file = "similar_to_GMYC_gmyc_minbr_0/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt"
+
+ score_path = "similar_to_GMYC_scoring_results/"
+ output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_PTP_minbr_default_file = score_path + "PTP_minbr_default/set_" + set_name + "/PTP_score_set_" + set_name + "." + str(i) + ".txt"
+ output_gmyc_minbr_0_file = score_path + "gmyc_minbr_0/set_" + set_name + "/PTP_score_set_" + set_name + "." + str(i) + ".txt"
+
+ (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, ou [...]
+
+ gnuplotOut_tree_scores_current_set.write(str(i) + ' ' + str(tree_scores['delimit_single_minbr_0']) + ' ' + str(tree_scores['delimit_multi_minbr_0']) + ' ' + str(tree_scores['delimit_single_minbr_default']) + ' ' + str(tree_scores['delimit_multi_minbr_default']) + ' ' + str(tree_scores['PTP_minbr_default']) + ' ' + str(tree_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_nmi_scores_current_set.write(str(i) + ' ' + str(nmi_scores['delimit_single_minbr_0']) + ' ' + str(nmi_scores['delimit_multi_minbr_0']) + ' ' + str(nmi_scores['delimit_single_minbr_default']) + ' ' + str(nmi_scores['delimit_multi_minbr_default']) + ' ' + str(nmi_scores['PTP_minbr_default']) + ' ' + str(nmi_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_single_scores_current_set.write(str(i) + ' ' + str(single_scores['delimit_single_minbr_0']) + ' ' + str(single_scores['delimit_multi_minbr_0']) + ' ' + str(single_scores['delimit_single_minbr_default']) + ' ' + str(single_scores['delimit_multi_minbr_default']) + ' ' + str(single_scores['PTP_minbr_default']) + ' ' + str(score_real_single_minbr_0) + ' ' + str(single_scores['gmyc_minbr_0']) + ' ' + str(score_real_single_minbr_default) + '\n')
+
+ gnuplotOut_multi_scores_current_set.write(str(i) + ' ' + str(multi_scores['delimit_single_minbr_0']) + ' ' + str(multi_scores['delimit_multi_minbr_0']) + ' ' + str(multi_scores['delimit_single_minbr_default']) + ' ' + str(multi_scores['delimit_multi_minbr_default']) + ' ' + str(multi_scores['PTP_minbr_default']) + ' ' + str(score_real_multi_minbr_0) + ' ' + ' ' + str(multi_scores['gmyc_minbr_0']) + str(score_real_multi_minbr_default) + '\n')
+
+ gnuplotOut_num_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0']) + ' ' + str(num_species['delimit_multi_minbr_0']) + ' ' + str(num_species['delimit_single_minbr_default']) + ' ' + str(num_species['delimit_multi_minbr_default']) + ' ' + str(num_species['PTP_minbr_default']) + ' ' + str(num_species['gmyc_minbr_0']) + ' ' + str(num_real_species) + '\n')
+
+ gnuplotOut_delta_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_single_minbr_default'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_default'] - num_real_species) + ' ' + str(num_species['PTP_minbr_default'] - num_real_species) + ' ' + str(num_species['gmyc_minbr_0'] - num_real_species) + ' ' + str(num_real_spe [...]
+
+
+ try:
+ for name in names:
+ average_tree_scores[name] = average_tree_scores[name] + tree_scores[name]
+ average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name]
+ average_num_species[name] = average_num_species[name] + num_species[name]
+ average_single_scores[name] = average_single_scores[name] + single_scores[name]
+ average_multi_scores[name] = average_multi_scores[name] + multi_scores[name]
+ average_real_num_species = average_real_num_species + num_real_species
+ average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0
+ average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0
+ average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default
+ average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default
+ except:
+ print "File is bad: " + input_tree_file
+ num_valid_indices = num_valid_indices - 1
+ num_bad_guys = num_bad_guys + 1
+
+ num_valid_indices = num_valid_indices + 1
+ except IOError:
+ #1
+ print "File not found: " + input_tree_file
+
+ #print "Set " + set_name + ": Num bad guys " + str(num_bad_guys)
+ #print "Set " + set_name + ": Num good guys " + str(num_valid_indices)
+
+ if (num_valid_indices > 0):
+ for name in names:
+ average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices)
+ average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices)
+ average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices)
+ average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices)
+ average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices)
+
+ #print "Set " + set_name + ": Average tree score " + name
+ #print average_tree_scores[name]
+ #print "Set " + set_name + ": Average NMI score " + name
+ #print average_nmi_scores[name]
+ #print "Set " + set_name + ": Average num species " + name
+ #print average_num_species[name]
+ #print "Set " + set_name + ": Average input score single " + name
+ #print average_single_scores[name]
+ #print "Set " + set_name + ": Average input score multi " + name
+ #print average_multi_scores[name]
+ average_real_num_species = float(average_real_num_species) / float(num_valid_indices)
+ average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices)
+ average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices)
+ average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices)
+ average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices)
+ #print "Set " + set_name + ": Average real num species "
+ #print average_real_num_species
+ #print "Set " + set_name + ": Average real score single "
+ #print average_real_score_single
+ #print "Set " + set_name + ": Average real score multi "
+ #print average_real_score_multi
+
+ gnuplotOut_tree_scores.write(set_name[2:] + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['PTP_minbr_default']) + ' ' + str(average_tree_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_nmi_scores.write(set_name[2:] + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['PTP_minbr_default']) + ' ' + str(average_nmi_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_single_scores.write(set_name[2:] + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['PTP_minbr_default']) + ' ' + str(average_single_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_ [...]
+
+ gnuplotOut_multi_scores.write(set_name[2:] + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['PTP_minbr_default']) + ' ' + str(average_multi_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_def [...]
+
+ gnuplotOut_num_species.write(set_name[2:] + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['PTP_minbr_default']) + ' ' + str(average_num_species['gmyc_minbr_0']) + ' ' + str(average_real_num_species) + '\n')
+
+ gnuplotOut_tree_scores_current_set.close()
+ gnuplotOut_nmi_scores_current_set.close()
+ gnuplotOut_single_scores_current_set.close()
+ gnuplotOut_multi_scores_current_set.close()
+ gnuplotOut_num_species_current_set.close()
+ gnuplotOut_delta_species_current_set.close()
+
+gnuplotOut_tree_scores.close()
+gnuplotOut_nmi_scores.close()
+gnuplotOut_single_scores.close()
+gnuplotOut_multi_scores.close()
+gnuplotOut_num_species.close()
+
+commands.getstatusoutput('gnuplot plotscript')
diff --git a/src/python/create_scoring_results_without_gmyc.py b/src/python/create_scoring_results_without_gmyc.py
new file mode 100755
index 0000000..cabfcf1
--- /dev/null
+++ b/src/python/create_scoring_results_without_gmyc.py
@@ -0,0 +1,314 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def extract_tree_score(input_text):
+    """Return the tree penalty score found in the scoring output ``input_text``.
+
+    The first line starting with "Tree penalty score:" is used; the text
+    after ": " is converted with int().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Tree penalty score:"):
+            return int(line.split(': ')[1])
+    return None
+
+def extract_nmi_score(input_text):
+    """Return the NMI score found in the scoring output ``input_text``.
+
+    The first line starting with "NMI score:" is used; the text after ": "
+    is converted with float().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("NMI score:"):
+            return float(line.split(': ')[1])
+    return None
+
+def extract_num_species(input_text):
+    """Return the number of species reported for the scored input file.
+
+    The first line starting with "Number of species in input file:" is used;
+    the text after ": " is converted with int().  A count of exactly 1 is
+    reported on stdout as "Baaaaad data" before the value is returned (this
+    check used to sit after the return statement and could never run).
+    Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Number of species in input file:"):
+            count = int(line.split(': ')[1])
+            if count == 1:
+                print "Baaaaad data"
+            return count
+    return None
+
+def extract_num_real_species(input_text):
+    """Return the number of real species found in the scoring output.
+
+    The first line starting with "Number of real species:" is used; the text
+    after ": " is converted with int().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Number of real species:"):
+            return int(line.split(': ')[1])
+    return None
+
+def extract_score_real_single(input_text):
+    """Return the "Score real single" value found in the scoring output.
+
+    The first line starting with "Score real single:" is used; the text
+    after ": " is converted with float().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Score real single:"):
+            return float(line.split(': ')[1])
+    return None
+
+def extract_score_real_multi(input_text):
+    """Return the "Score real multi" value found in the scoring output.
+
+    The first line starting with "Score real multi:" is used; the text
+    after ": " is converted with float().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Score real multi:"):
+            return float(line.split(': ')[1])
+    return None
+
+def extract_score_input_single(input_text):
+    """Return the "Score input single" value found in the scoring output.
+
+    The first line starting with "Score input single:" is used; the text
+    after ": " is converted with float().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Score input single:"):
+            return float(line.split(': ')[1])
+    return None
+
+def extract_score_input_multi(input_text):
+    """Return the "Score input multi" value found in the scoring output.
+
+    The first line starting with "Score input multi:" is used; the text
+    after ": " is converted with float().  Returns None when no line matches.
+    """
+    for line in input_text.split('\n'):
+        if line.startswith("Score input multi:"):
+            return float(line.split(': ')[1])
+    return None
+
+def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_default):
+    """Parse the captured scoring output of the five runs.
+
+    input_tree_file is only probed for readability.  Each output_* argument
+    is the text captured from one scoring run and is handed to the
+    extract_* helpers of this script.
+
+    Returns the 10-tuple (tree_scores, nmi_scores, num_species,
+    single_scores, multi_scores, score_real_single_minbr_0,
+    score_real_multi_minbr_0, score_real_single_minbr_default,
+    score_real_multi_minbr_default, num_real_species); the first five
+    entries are dicts keyed by program name.  If the tree file cannot be
+    opened, "File not found: " plus the path is printed and None is returned.
+    """
+    try:
+        # Readability probe only: close the handle right away instead of
+        # leaving it open until garbage collection.
+        open(input_tree_file).close()
+    except IOError:
+        print "File not found: " + input_tree_file
+        return None
+
+    outputs = [
+        ('delimit_single_minbr_0', output_delimit_single_minbr_0),
+        ('delimit_multi_minbr_0', output_delimit_multi_minbr_0),
+        ('delimit_single_minbr_default', output_delimit_single_minbr_default),
+        ('delimit_multi_minbr_default', output_delimit_multi_minbr_default),
+        ('PTP_minbr_default', output_PTP_minbr_default),
+    ]
+
+    tree_scores = {}
+    nmi_scores = {}
+    num_species = {}
+    single_scores = {}
+    multi_scores = {}
+    for program, text in outputs:
+        tree_scores[program] = extract_tree_score(text)
+        nmi_scores[program] = extract_nmi_score(text)
+        num_species[program] = extract_num_species(text)
+        single_scores[program] = extract_score_input_single(text)
+        multi_scores[program] = extract_score_input_multi(text)
+
+    # The "real" reference values are all read from the delimit_single runs,
+    # the real multi scores included.
+    # NOTE(review): presumably every run reports both real scores - confirm.
+    score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0)
+    score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0)
+    score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default)
+    score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default)
+    num_real_species = extract_num_real_species(output_delimit_single_minbr_0)
+
+    return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species)
+
+def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_default_file):
+    """Score five delimitation result files against a tree and store the output.
+
+    For every input_*_file a "./delimit --score" command is run through the
+    shell with input_tree_file as --tree_file; the captured output is written
+    to the matching output_*_file (its directory is created when missing)
+    and all five outputs are then parsed by grab_scorings().
+
+    Returns whatever grab_scorings() returns.  On IOError anywhere in the
+    function, "File not found: " plus the tree path is printed and None is
+    returned.
+    """
+    try:
+        # Readability probe only; close the handle instead of leaking it.
+        open(input_tree_file).close()
+
+        # One entry per run: (file to score, extra options, destination).
+        jobs = [
+            (input_delimit_single_minbr_0_file, " --min_br 0", output_delimit_single_minbr_0_file),
+            (input_delimit_multi_minbr_0_file, " --min_br 0", output_delimit_multi_minbr_0_file),
+            (input_delimit_single_minbr_default_file, "", output_delimit_single_minbr_default_file),
+            (input_delimit_multi_minbr_default_file, "", output_delimit_multi_minbr_default_file),
+            # NOTE(review): the PTP run passes --min_br 0 despite its
+            # "minbr_default" name; kept unchanged - confirm it is intended.
+            (input_PTP_minbr_default_file, " --min_br 0", output_PTP_minbr_default_file),
+        ]
+
+        for _, _, destination in jobs:
+            directory = os.path.dirname(destination)
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+
+        captured = []
+        for score_input, options, _ in jobs:
+            command = "./delimit --score " + score_input + options + " --tree_file " + input_tree_file + " --output_file foo"
+            # Only the captured text is kept; the exit status is ignored.
+            captured.append(commands.getstatusoutput(command)[1])
+
+        for (_, _, destination), text in zip(jobs, captured):
+            # "with" closes the file even when the write fails.
+            with open(destination, 'w') as handle:
+                handle.write(text)
+
+        return grab_scorings(input_tree_file, captured[0], captured[1], captured[2], captured[3], captured[4])
+    except IOError:
+        print "File not found: " + input_tree_file
+        return None
+
+# Data sets to process.  Each name is inserted into the input and output
+# paths below; set_name[2:] (the name without its first two characters)
+# labels the matching row of the summary files.
+set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"]
+# Program variants; used as the keys of every score dictionary.
+names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_default']
+
+# Summary files that receive the per-set averages.
+# NOTE(review): presumably read by the gnuplot script run on the last line - confirm.
+gnuplotOut_tree_scores = open('workfile_tree_scores', 'w')
+gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w')
+gnuplotOut_single_scores = open('workfile_single_scores', 'w')
+gnuplotOut_multi_scores = open('workfile_multi_scores', 'w')
+gnuplotOut_num_species = open('workfile_num_species', 'w')
+
+for set_name in set_names:
+ # Per-set files that receive one row for every index i scored below.
+ gnuplotOut_tree_scores_current_set = open('workfile_tree_scores_' + set_name, 'w')
+ gnuplotOut_nmi_scores_current_set = open('workfile_nmi_scores_' + set_name, 'w')
+ gnuplotOut_single_scores_current_set = open('workfile_single_scores_' + set_name, 'w')
+ gnuplotOut_multi_scores_current_set = open('workfile_multi_scores_' + set_name, 'w')
+ gnuplotOut_num_species_current_set = open('workfile_num_species_' + set_name, 'w')
+ gnuplotOut_delta_species_current_set = open('workfile_delta_species_' + set_name, 'w')
+
+ # Running sums for this set; they are divided by num_valid_indices further down.
+ num_valid_indices = 0
+ average_tree_scores = {}
+ average_nmi_scores = {}
+ average_num_species = {}
+ average_single_scores = {}
+ average_multi_scores = {}
+ average_real_num_species = 0
+ average_real_score_single_minbr_0 = 0
+ average_real_score_multi_minbr_0 = 0
+ average_real_score_single_minbr_default = 0
+ average_real_score_multi_minbr_default = 0
+
+ for name in names:
+ average_tree_scores[name] = 0
+ average_nmi_scores[name] = 0
+ average_num_species[name] = 0
+ average_single_scores[name] = 0
+ average_multi_scores[name] = 0
+ num_bad_guys = 0
+
+ # Indices 1..100 are tried; a missing tree file is reported and skipped.
+ for i in range(1,101):
+ input_tree_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_" + set_name + "_" + str(i) + ".phy"
+
+ try:
+ # Existence probe: raises IOError (handled by the "except IOError" below)
+ # when the tree file cannot be opened.  The handle is never closed explicitly.
+ open(input_tree_file)
+
+ input_delimit_single_minbr_0_file = "similar_to_GMYC_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_0_file = "similar_to_GMYC_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_single_minbr_default_file = "similar_to_GMYC_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_default_file = "similar_to_GMYC_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_PTP_minbr_default_file = "similar_to_GMYC_PTP_minbr_default/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt"
+ # NOTE(review): input_gmyc_minbr_0_file is not among the visible arguments
+ # passed to create_scoring_results below - looks unused in this script; confirm.
+ input_gmyc_minbr_0_file = "similar_to_GMYC_gmyc_minbr_0/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt"
+
+ score_path = "similar_to_GMYC_scoring_results/"
+ output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_PTP_minbr_default_file = score_path + "PTP_minbr_default/set_" + set_name + "/PTP_score_set_" + set_name + "." + str(i) + ".txt"
+ # NOTE(review): create_scoring_results returns None after reporting an IOError;
+ # unpacking None here raises TypeError, which "except IOError" below does not catch.
+ (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_ [...]
+
+ # Per-index rows: the index i followed by one column per program variant.
+ gnuplotOut_tree_scores_current_set.write(str(i) + ' ' + str(tree_scores['delimit_single_minbr_0']) + ' ' + str(tree_scores['delimit_multi_minbr_0']) + ' ' + str(tree_scores['delimit_single_minbr_default']) + ' ' + str(tree_scores['delimit_multi_minbr_default']) + ' ' + str(tree_scores['PTP_minbr_default']) + '\n')
+
+ gnuplotOut_nmi_scores_current_set.write(str(i) + ' ' + str(nmi_scores['delimit_single_minbr_0']) + ' ' + str(nmi_scores['delimit_multi_minbr_0']) + ' ' + str(nmi_scores['delimit_single_minbr_default']) + ' ' + str(nmi_scores['delimit_multi_minbr_default']) + ' ' + str(nmi_scores['PTP_minbr_default']) + '\n')
+
+ # The two trailing columns are the real single scores (minbr_0, minbr_default).
+ gnuplotOut_single_scores_current_set.write(str(i) + ' ' + str(single_scores['delimit_single_minbr_0']) + ' ' + str(single_scores['delimit_multi_minbr_0']) + ' ' + str(single_scores['delimit_single_minbr_default']) + ' ' + str(single_scores['delimit_multi_minbr_default']) + ' ' + str(single_scores['PTP_minbr_default']) + ' ' + str(score_real_single_minbr_0) + ' ' + str(score_real_single_minbr_default) + '\n')
+
+ # The two trailing columns are the real multi scores (minbr_0, minbr_default).
+ gnuplotOut_multi_scores_current_set.write(str(i) + ' ' + str(multi_scores['delimit_single_minbr_0']) + ' ' + str(multi_scores['delimit_multi_minbr_0']) + ' ' + str(multi_scores['delimit_single_minbr_default']) + ' ' + str(multi_scores['delimit_multi_minbr_default']) + ' ' + str(multi_scores['PTP_minbr_default']) + ' ' + str(score_real_multi_minbr_0) + ' ' + str(score_real_multi_minbr_default) + '\n')
+
+ # The trailing column is the real number of species.
+ gnuplotOut_num_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0']) + ' ' + str(num_species['delimit_multi_minbr_0']) + ' ' + str(num_species['delimit_single_minbr_default']) + ' ' + str(num_species['delimit_multi_minbr_default']) + ' ' + str(num_species['PTP_minbr_default']) + ' ' + str(num_real_species) + '\n')
+
+ # Differences to the real species count; the last column is
+ # num_real_species - num_real_species, i.e. always 0.
+ gnuplotOut_delta_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_single_minbr_default'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_default'] - num_real_species) + ' ' + str(num_species['PTP_minbr_default'] - num_real_species) + ' ' + str(num_real_species - num_real_species) + '\n')
+
+
+ # Add this index's values to the running sums.  Any exception (bare except)
+ # marks the file as bad; the decrement in the handler cancels the increment
+ # that follows, so a bad file is not counted.  Sums that were already updated
+ # before the exception are not rolled back.
+ try:
+ for name in names:
+ average_tree_scores[name] = average_tree_scores[name] + tree_scores[name]
+ average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name]
+ average_num_species[name] = average_num_species[name] + num_species[name]
+ average_single_scores[name] = average_single_scores[name] + single_scores[name]
+ average_multi_scores[name] = average_multi_scores[name] + multi_scores[name]
+ average_real_num_species = average_real_num_species + num_real_species
+ average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0
+ average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0
+ average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default
+ average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default
+ except:
+ print "File is bad: " + input_tree_file
+ num_valid_indices = num_valid_indices - 1
+ num_bad_guys = num_bad_guys + 1
+
+ num_valid_indices = num_valid_indices + 1
+ except IOError:
+ #1
+ print "File not found: " + input_tree_file
+
+ #print "Set " + set_name + ": Num bad guys " + str(num_bad_guys)
+ #print "Set " + set_name + ": Num good guys " + str(num_valid_indices)
+
+ # Turn the running sums into averages over the counted indices.
+ if (num_valid_indices > 0):
+ for name in names:
+ average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices)
+ average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices)
+ average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices)
+ average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices)
+ average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices)
+
+ #print "Set " + set_name + ": Average tree score " + name
+ #print average_tree_scores[name]
+ #print "Set " + set_name + ": Average NMI score " + name
+ #print average_nmi_scores[name]
+ #print "Set " + set_name + ": Average num species " + name
+ #print average_num_species[name]
+ #print "Set " + set_name + ": Average input score single " + name
+ #print average_single_scores[name]
+ #print "Set " + set_name + ": Average input score multi " + name
+ #print average_multi_scores[name]
+ average_real_num_species = float(average_real_num_species) / float(num_valid_indices)
+ average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices)
+ average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices)
+ average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices)
+ average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices)
+ #print "Set " + set_name + ": Average real num species "
+ #print average_real_num_species
+ #print "Set " + set_name + ": Average real score single "
+ #print average_real_score_single
+ #print "Set " + set_name + ": Average real score multi "
+ #print average_real_score_multi
+
+ # Summary rows for this set: set_name[2:] followed by the averaged columns.
+ gnuplotOut_tree_scores.write(set_name[2:] + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['PTP_minbr_default']) + '\n')
+
+ gnuplotOut_nmi_scores.write(set_name[2:] + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['PTP_minbr_default']) + '\n')
+
+ gnuplotOut_single_scores.write(set_name[2:] + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['PTP_minbr_default']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n')
+
+ gnuplotOut_multi_scores.write(set_name[2:] + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['PTP_minbr_default']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n')
+
+ gnuplotOut_num_species.write(set_name[2:] + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['PTP_minbr_default']) + ' ' + str(average_real_num_species) + '\n')
+
+ gnuplotOut_tree_scores_current_set.close()
+ gnuplotOut_nmi_scores_current_set.close()
+ gnuplotOut_single_scores_current_set.close()
+ gnuplotOut_multi_scores_current_set.close()
+ gnuplotOut_num_species_current_set.close()
+ gnuplotOut_delta_species_current_set.close()
+
+gnuplotOut_tree_scores.close()
+gnuplotOut_nmi_scores.close()
+gnuplotOut_single_scores.close()
+gnuplotOut_multi_scores.close()
+gnuplotOut_num_species.close()
+
+# Run gnuplot on plotscript_without_gmyc; the returned (status, output) pair is discarded.
+commands.getstatusoutput('gnuplot plotscript_without_gmyc')
diff --git a/src/python/create_scoring_results_without_ptp.py b/src/python/create_scoring_results_without_ptp.py
new file mode 100755
index 0000000..e8b2fab
--- /dev/null
+++ b/src/python/create_scoring_results_without_ptp.py
@@ -0,0 +1,316 @@
+#! /usr/bin/env python
+import os
+import commands
+
+def extract_tree_score(input_text):
+    # Return the integer that follows "Tree penalty score:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Tree penalty score:"):
+            return int(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_nmi_score(input_text):
+    # Return the float that follows "NMI score:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("NMI score:"):
+            return float(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_num_species(input_text):
+    # Return the int after "Number of species in input file:", or None when the label is absent.
+    for row in input_text.split('\n'):
+        if row.startswith("Number of species in input file:"):
+            count = int(row.split(': ')[1])
+            if count == 1:  # warn before returning; this check used to sit after `return` and never ran
+                print "Baaaaad data"
+            return count
+
+def extract_num_real_species(input_text):
+    # Return the integer that follows "Number of real species:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Number of real species:"):
+            return int(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_score_real_single(input_text):
+    # Return the float that follows "Score real single:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Score real single:"):
+            return float(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_score_real_multi(input_text):
+    # Return the float that follows "Score real multi:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Score real multi:"):
+            return float(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_score_input_single(input_text):
+    # Return the float that follows "Score input single:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Score input single:"):
+            return float(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def extract_score_input_multi(input_text):
+    # Return the float that follows "Score input multi:" in the scorer output.
+    for row in input_text.split('\n'):
+        if row.startswith("Score input multi:"):
+            return float(row.split(': ')[1])
+    return None  # label absent (same result as falling off the end of the function)
+
+def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_gmyc_minbr_0):
+    try:
+        open(input_tree_file).close()  # existence probe only (raises IOError); close the handle instead of leaking it
+        # Parse the five scorer outputs (text, not paths) into per-program dicts; returns None when the tree file is missing.
+        tree_scores = {}
+        nmi_scores = {}
+        num_species = {}
+        single_scores = {}
+        multi_scores = {}
+        num_real_species = 0
+        score_real_single_minbr_0 = 0
+        score_real_multi_minbr_0 = 0
+        score_real_single_minbr_default = 0
+        score_real_multi_minbr_default = 0
+
+        tree_scores['delimit_single_minbr_0'] = extract_tree_score(output_delimit_single_minbr_0)
+        tree_scores['delimit_multi_minbr_0'] = extract_tree_score(output_delimit_multi_minbr_0)
+        tree_scores['delimit_single_minbr_default'] = extract_tree_score(output_delimit_single_minbr_default)
+        tree_scores['delimit_multi_minbr_default'] = extract_tree_score(output_delimit_multi_minbr_default)
+        tree_scores['gmyc_minbr_0'] = extract_tree_score(output_gmyc_minbr_0)
+
+        nmi_scores['delimit_single_minbr_0'] = extract_nmi_score(output_delimit_single_minbr_0)
+        nmi_scores['delimit_multi_minbr_0'] = extract_nmi_score(output_delimit_multi_minbr_0)
+        nmi_scores['delimit_single_minbr_default'] = extract_nmi_score(output_delimit_single_minbr_default)
+        nmi_scores['delimit_multi_minbr_default'] = extract_nmi_score(output_delimit_multi_minbr_default)
+        nmi_scores['gmyc_minbr_0'] = extract_nmi_score(output_gmyc_minbr_0)
+
+        num_species['delimit_single_minbr_0'] = extract_num_species(output_delimit_single_minbr_0)
+        num_species['delimit_multi_minbr_0'] = extract_num_species(output_delimit_multi_minbr_0)
+        num_species['delimit_single_minbr_default'] = extract_num_species(output_delimit_single_minbr_default)
+        num_species['delimit_multi_minbr_default'] = extract_num_species(output_delimit_multi_minbr_default)
+        num_species['gmyc_minbr_0'] = extract_num_species(output_gmyc_minbr_0)
+
+        single_scores['delimit_single_minbr_0'] = extract_score_input_single(output_delimit_single_minbr_0)
+        single_scores['delimit_multi_minbr_0'] = extract_score_input_single(output_delimit_multi_minbr_0)
+        single_scores['delimit_single_minbr_default'] = extract_score_input_single(output_delimit_single_minbr_default)
+        single_scores['delimit_multi_minbr_default'] = extract_score_input_single(output_delimit_multi_minbr_default)
+        single_scores['gmyc_minbr_0'] = extract_score_input_single(output_gmyc_minbr_0)
+
+        multi_scores['delimit_single_minbr_0'] = extract_score_input_multi(output_delimit_single_minbr_0)
+        multi_scores['delimit_multi_minbr_0'] = extract_score_input_multi(output_delimit_multi_minbr_0)
+        multi_scores['delimit_single_minbr_default'] = extract_score_input_multi(output_delimit_single_minbr_default)
+        multi_scores['delimit_multi_minbr_default'] = extract_score_input_multi(output_delimit_multi_minbr_default)
+        multi_scores['gmyc_minbr_0'] = extract_score_input_multi(output_gmyc_minbr_0)
+        # The "real" reference values are read from the two delimit-single outputs only.
+        score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0)
+        score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0)
+        score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default)
+        score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default)
+        num_real_species = extract_num_real_species(output_delimit_single_minbr_0)
+
+        return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species)
+    except IOError:
+        print "File not found: " + input_tree_file
+
+def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_gmyc_minbr_0_file):
+    try:
+        open(input_tree_file).close()  # existence probe only (raises IOError); close the handle instead of leaking it
+        if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)):
+            os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file))
+        if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)):
+            os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file))
+        if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)):
+            os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file))
+        if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)):
+            os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file))
+        if not os.path.exists(os.path.dirname(output_gmyc_minbr_0_file)):
+            os.makedirs(os.path.dirname(output_gmyc_minbr_0_file))
+        # Build the five "./delimit --score" command lines; only the *_minbr_0 variants pass "--min_br 0".
+        call_delimit_single_minbr_0 = "./delimit --score " + input_delimit_single_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+        call_delimit_multi_minbr_0 = "./delimit --score " + input_delimit_multi_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+        call_delimit_single_minbr_default = "./delimit --score " + input_delimit_single_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo"
+        call_delimit_multi_minbr_default = "./delimit --score " + input_delimit_multi_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo"
+        call_gmyc_minbr_0 = "./delimit --score " + input_gmyc_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo"
+        # Run each command through the shell; the exit statuses are captured but not inspected.
+        (stat_delimit_single_minbr_0, output_delimit_single_minbr_0) = commands.getstatusoutput(call_delimit_single_minbr_0)
+        (stat_delimit_multi_minbr_0, output_delimit_multi_minbr_0) = commands.getstatusoutput(call_delimit_multi_minbr_0)
+        (stat_delimit_single_minbr_default, output_delimit_single_minbr_default) = commands.getstatusoutput(call_delimit_single_minbr_default)
+        (stat_delimit_multi_minbr_default, output_delimit_multi_minbr_default) = commands.getstatusoutput(call_delimit_multi_minbr_default)
+        (stat_gmyc_minbr_0, output_gmyc_minbr_0) = commands.getstatusoutput(call_gmyc_minbr_0)
+        # Store every captured output verbatim in its score file.
+        delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w')
+        delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w')
+        delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w')
+        delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w')
+        gmyc_minbr_0_out = open(output_gmyc_minbr_0_file, 'w')
+
+        delimit_single_minbr_0_out.write(output_delimit_single_minbr_0)
+        delimit_multi_minbr_0_out.write(output_delimit_multi_minbr_0)
+        delimit_single_minbr_default_out.write(output_delimit_single_minbr_default)
+        delimit_multi_minbr_default_out.write(output_delimit_multi_minbr_default)
+        gmyc_minbr_0_out.write(output_gmyc_minbr_0)
+
+        delimit_single_minbr_0_out.close()
+        delimit_multi_minbr_0_out.close()
+        delimit_single_minbr_default_out.close()
+        delimit_multi_minbr_default_out.close()
+        gmyc_minbr_0_out.close()
+        # Hand the captured texts to grab_scorings; like it, this function returns None when the tree file is missing.
+        return grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_gmyc_minbr_0)
+    except IOError:
+        print "File not found: " + input_tree_file
+
+set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"]
+names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'gmyc_minbr_0']
+# Aggregate files: one row of per-set averages is appended to each of these for every set.
+gnuplotOut_tree_scores = open('workfile_tree_scores', 'w')
+gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w')
+gnuplotOut_single_scores = open('workfile_single_scores', 'w')
+gnuplotOut_multi_scores = open('workfile_multi_scores', 'w')
+gnuplotOut_num_species = open('workfile_num_species', 'w')
+
+for set_name in set_names:
+ gnuplotOut_tree_scores_current_set = open('workfile_tree_scores_' + set_name, 'w')
+ gnuplotOut_nmi_scores_current_set = open('workfile_nmi_scores_' + set_name, 'w')
+ gnuplotOut_single_scores_current_set = open('workfile_single_scores_' + set_name, 'w')
+ gnuplotOut_multi_scores_current_set = open('workfile_multi_scores_' + set_name, 'w')
+ gnuplotOut_num_species_current_set = open('workfile_num_species_' + set_name, 'w')
+ gnuplotOut_delta_species_current_set = open('workfile_delta_species_' + set_name, 'w')
+ # Accumulators for this set: they hold sums during the replicate loop and are divided by num_valid_indices afterwards.
+ num_valid_indices = 0
+ average_tree_scores = {}
+ average_nmi_scores = {}
+ average_num_species = {}
+ average_single_scores = {}
+ average_multi_scores = {}
+ average_real_num_species = 0
+ average_real_score_single_minbr_0 = 0
+ average_real_score_multi_minbr_0 = 0
+ average_real_score_single_minbr_default = 0
+ average_real_score_multi_minbr_default = 0
+
+ for name in names:
+ average_tree_scores[name] = 0
+ average_nmi_scores[name] = 0
+ average_num_species[name] = 0
+ average_single_scores[name] = 0
+ average_multi_scores[name] = 0
+ num_bad_guys = 0
+
+ for i in range(1,101):
+ input_tree_file = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt"
+
+ try:
+ open(input_tree_file)
+
+ input_delimit_single_minbr_0_file = "SimulB_C_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_0_file = "SimulB_C_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_single_minbr_default_file = "SimulB_C_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_delimit_multi_minbr_default_file = "SimulB_C_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt"
+ input_gmyc_minbr_0_file = "SimulB_C_gmyc_minbr_0/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt"
+
+ score_path = "SimulB_C_scoring_results/"
+ output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+ output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt"
+
+ output_gmyc_minbr_0_file = score_path + "gmyc_minbr_0/set_" + set_name + "/gmyc_score_set_" + set_name + "." + str(i) + ".txt"
+ # NOTE(review): the archived mail cut the next line off with "[...]"; its trailing arguments are restored from create_scoring_results' parameter order - confirm against upstream.
+ (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_gmyc_minbr_0_file)
+ # One row per replicate i goes into each per-set file, columns separated by single spaces.
+ gnuplotOut_tree_scores_current_set.write(str(i) + ' ' + str(tree_scores['delimit_single_minbr_0']) + ' ' + str(tree_scores['delimit_multi_minbr_0']) + ' ' + str(tree_scores['delimit_single_minbr_default']) + ' ' + str(tree_scores['delimit_multi_minbr_default']) + ' ' + str(tree_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_nmi_scores_current_set.write(str(i) + ' ' + str(nmi_scores['delimit_single_minbr_0']) + ' ' + str(nmi_scores['delimit_multi_minbr_0']) + ' ' + str(nmi_scores['delimit_single_minbr_default']) + ' ' + str(nmi_scores['delimit_multi_minbr_default']) + ' ' + str(nmi_scores['gmyc_minbr_0']) + '\n')
+ # NOTE(review): in the two per-replicate score rows the real minbr_0 score precedes the gmyc column, while the averaged rows put gmyc first - confirm the plotscript's column indices.
+ gnuplotOut_single_scores_current_set.write(str(i) + ' ' + str(single_scores['delimit_single_minbr_0']) + ' ' + str(single_scores['delimit_multi_minbr_0']) + ' ' + str(single_scores['delimit_single_minbr_default']) + ' ' + str(single_scores['delimit_multi_minbr_default']) + ' ' + str(score_real_single_minbr_0) + ' ' + str(single_scores['gmyc_minbr_0']) + ' ' + str(score_real_single_minbr_default) + '\n')
+ # Fix: a separator now sits between the gmyc and real-default values; they used to be written fused into one column.
+ gnuplotOut_multi_scores_current_set.write(str(i) + ' ' + str(multi_scores['delimit_single_minbr_0']) + ' ' + str(multi_scores['delimit_multi_minbr_0']) + ' ' + str(multi_scores['delimit_single_minbr_default']) + ' ' + str(multi_scores['delimit_multi_minbr_default']) + ' ' + str(score_real_multi_minbr_0) + ' ' + str(multi_scores['gmyc_minbr_0']) + ' ' + str(score_real_multi_minbr_default) + '\n')
+
+ gnuplotOut_num_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0']) + ' ' + str(num_species['delimit_multi_minbr_0']) + ' ' + str(num_species['delimit_single_minbr_default']) + ' ' + str(num_species['delimit_multi_minbr_default']) + ' ' + str(num_species['gmyc_minbr_0']) + ' ' + str(num_real_species) + '\n')
+
+ gnuplotOut_delta_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_single_minbr_default'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_default'] - num_real_species) + ' ' + str(num_species['gmyc_minbr_0'] - num_real_species) + ' ' + str(num_real_species - num_real_species) + '\n')
+
+ # A failure while accumulating (bare except below) counts the replicate as bad and cancels the +1 that follows it.
+ try:
+ for name in names:
+ average_tree_scores[name] = average_tree_scores[name] + tree_scores[name]
+ average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name]
+ average_num_species[name] = average_num_species[name] + num_species[name]
+ average_single_scores[name] = average_single_scores[name] + single_scores[name]
+ average_multi_scores[name] = average_multi_scores[name] + multi_scores[name]
+ average_real_num_species = average_real_num_species + num_real_species
+ average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0
+ average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0
+ average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default
+ average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default
+ except:
+ print "File is bad: " + input_tree_file
+ num_valid_indices = num_valid_indices - 1
+ num_bad_guys = num_bad_guys + 1
+
+ num_valid_indices = num_valid_indices + 1
+ except IOError:
+ #1
+ print "File not found: " + input_tree_file
+
+ #print "Set " + set_name + ": Num bad guys " + str(num_bad_guys)
+ #print "Set " + set_name + ": Num good guys " + str(num_valid_indices)
+
+ if (num_valid_indices > 0):
+ for name in names:
+ average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices)
+ average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices)
+ average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices)
+ average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices)
+ average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices)
+
+ #print "Set " + set_name + ": Average tree score " + name
+ #print average_tree_scores[name]
+ #print "Set " + set_name + ": Average NMI score " + name
+ #print average_nmi_scores[name]
+ #print "Set " + set_name + ": Average num species " + name
+ #print average_num_species[name]
+ #print "Set " + set_name + ": Average input score single " + name
+ #print average_single_scores[name]
+ #print "Set " + set_name + ": Average input score multi " + name
+ #print average_multi_scores[name]
+ average_real_num_species = float(average_real_num_species) / float(num_valid_indices)
+ average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices)
+ average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices)
+ average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices)
+ average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices)
+ #print "Set " + set_name + ": Average real num species "
+ #print average_real_num_species
+ #print "Set " + set_name + ": Average real score single "
+ #print average_real_score_single
+ #print "Set " + set_name + ": Average real score multi "
+ #print average_real_score_multi
+ # Summary rows are keyed by set_name[2:], i.e. the set name without its leading "Ne".
+ gnuplotOut_tree_scores.write(set_name[2:] + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_nmi_scores.write(set_name[2:] + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['gmyc_minbr_0']) + '\n')
+
+ gnuplotOut_single_scores.write(set_name[2:] + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n')
+
+ gnuplotOut_multi_scores.write(set_name[2:] + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n')
+
+ gnuplotOut_num_species.write(set_name[2:] + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['gmyc_minbr_0']) + ' ' + str(average_real_num_species) + '\n')
+
+ gnuplotOut_tree_scores_current_set.close()
+ gnuplotOut_nmi_scores_current_set.close()
+ gnuplotOut_single_scores_current_set.close()
+ gnuplotOut_multi_scores_current_set.close()
+ gnuplotOut_num_species_current_set.close()
+ gnuplotOut_delta_species_current_set.close()
+
+gnuplotOut_tree_scores.close()
+gnuplotOut_nmi_scores.close()
+gnuplotOut_single_scores.close()
+gnuplotOut_multi_scores.close()
+gnuplotOut_num_species.close()
+# All work files are closed at this point; render the plots.
+commands.getstatusoutput('gnuplot plotscript_without_ptp')
diff --git a/src/python/create_subsets.py b/src/python/create_subsets.py
new file mode 100755
index 0000000..2b255e8
--- /dev/null
+++ b/src/python/create_subsets.py
@@ -0,0 +1,84 @@
+#! /usr/bin/env python
+import os
+
+def create_subsets(alignmentFile, num_of_species, sum_of_species, num_basepairs, output_taxa_file, output_alignment_file, num_alignments):
+ try:
+ with open(alignmentFile) as f:
+ content = f.read().splitlines()
+ f.close()
+ speciesList = []
+ # Goal: take sum_of_species[k] taxa from each of num_of_species[k] species (largest species first), then write a taxa file and an alignment file.
+ for i in range(0,31):
+ emptyList = []
+ speciesList.append(emptyList)
+ # alignments maps taxon name -> sequence cut to num_basepairs; speciesList[n] collects the taxa whose name starts with "n.".
+ alignments = {}
+ for i in range(1, len(content)): # ignore first line
+ contentSplitted = content[i].split();
+ taxonName = contentSplitted[0]
+ alignments[taxonName] = contentSplitted[1][0:num_basepairs]
+ species = taxonName.split('.')[0]
+ speciesList[int(species)].append(taxonName)
+ # Sorted ascending by size, so walking i = 30..0 below visits the largest species first.
+ speciesListSorted = sorted(speciesList, key = len)
+
+ currentIdx = 0
+
+ selectedTaxa = []
+
+ found = 0
+ for i in range(30,-1,-1):
+ if currentIdx < len(num_of_species):
+ if len(speciesListSorted[i]) >= sum_of_species[currentIdx]:
+ found = found + 1
+ for j in range(0, sum_of_species[currentIdx]): # fix: start at 0 so exactly sum_of_species[currentIdx] taxa are taken (range(1, ...) took one fewer)
+ selectedTaxa.append(speciesListSorted[i][j])
+ else:
+ print "We had an error :("
+ if found == num_of_species[currentIdx]:
+ currentIdx = currentIdx + 1
+ found = 0
+
+ # write the solutions into the files
+ if not os.path.exists(os.path.dirname(output_taxa_file)):
+ os.makedirs(os.path.dirname(output_taxa_file))
+ taxaOut = open(output_taxa_file, 'w')
+ for taxon in selectedTaxa:
+ taxaOut.write(taxon + "\n")
+ taxaOut.close()
+
+ if not os.path.exists(os.path.dirname(output_alignment_file)):
+ os.makedirs(os.path.dirname(output_alignment_file))
+ alignmentOut = open(output_alignment_file, 'w')
+ alignmentOut.write(str(num_alignments) + " " + str(num_basepairs) + "\n")
+ for taxon in selectedTaxa:
+ alignmentOut.write(taxon + " "+ alignments[taxon] + "\n")
+ alignmentOut.close()
+ # True only when every size group was filled; a missing input file yields None (not False) via the handler below.
+ return (currentIdx >= len(num_of_species))
+ except IOError:
+ print "File not found: " + alignmentFile
+
+
+set_names = ["set_1", "set_5", "set_10", "set_20", "set_40", "set_80", "set_160"]
+num_of_species = [3, 6, 9, 12] # how many species to pick in each size group of the nonuniform subset
+size_of_species = [35, 25, 10, 2] # taxa required per species in the matching group (passed as sum_of_species)
+uniform_num = [30] # uniform subset: a single group of 30 species ...
+uniform_size = [12] # ... each requiring 12 taxa
+base_pairs = [100, 250, 500, 1000] # each value is passed as num_basepairs
+uniform_num_alignments = 360 # = 30 * 12; written into the first line of the uniform alignment files
+nonuniform_num_alignments = 369 # = 3*35 + 6*25 + 9*10 + 12*2; written into the first line of the nonuniform alignment files
+
+for set_name in set_names:
+ for i in range(1,101):
+ for bp in base_pairs:
+ output_nonuniform_taxa_file = "nonuniform/taxa/"+str(bp)+"/taxa.simulated_" + set_name + "_" + str(i)
+ output_nonuniform_alignment_file = "nonuniform/alignments/"+str(bp)+"/simulated_tree_" + set_name + "_" + str(i)
+ output_uniform_taxa_file = "uniform/taxa/"+str(bp)+"/taxa.simulated_" + set_name + "_" + str(i)
+ output_uniform_alignment_file = "uniform/alignments/"+str(bp)+"/simulated_tree_" + set_name + "_" + str(i)
+ # The same reduced alignment feeds both the nonuniform and the uniform subset for this set / replicate / length.
+ alignmentFile = "reduced_alignments/" + set_name + "/simulated_" + set_name + "_" + str(i) + ".phy.reduced"
+ if create_subsets(alignmentFile, num_of_species, size_of_species, bp, output_nonuniform_taxa_file, output_nonuniform_alignment_file, nonuniform_num_alignments) == False: # a missing input returns None, which does not compare equal to False
+ print "Found a file that does not fit our requirement :-("
+ if create_subsets(alignmentFile, uniform_num, uniform_size, bp, output_uniform_taxa_file, output_uniform_alignment_file, uniform_num_alignments) == False:
+ print "Found a file that does not fit our requirement :-("
diff --git a/src/python/extract_trees.py b/src/python/extract_trees.py
new file mode 100755
index 0000000..00d7270
--- /dev/null
+++ b/src/python/extract_trees.py
@@ -0,0 +1,22 @@
+#! /usr/bin/env python
+import os
+import commands
+
+set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"]
+# For every set, copy each of the first 100 lines of its tree file into a file of its own.
+for set_name in set_names:
+    try:
+        tree_path = "SimulB&C." + set_name + "_nospec.phy"
+        tree_file = open(tree_path)
+        lines = tree_file.readlines()
+        tree_file.close()  # everything is in memory now; release the source file before writing
+        for i in range(1,101): # only the first 100 trees
+            tree_destination = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt"
+            if not os.path.exists(os.path.dirname(tree_destination)):
+                os.makedirs(os.path.dirname(tree_destination))
+            tree_destination_file = open(tree_destination, 'w')
+            tree_destination_file.write(lines[i - 1])
+            tree_destination_file.close()  # fix: each per-tree file used to be left open
+
+    except IOError:
+        print "File not found: " + tree_path
diff --git a/src/python/plotscript b/src/python/plotscript
new file mode 100644
index 0000000..c7e05eb
--- /dev/null
+++ b/src/python/plotscript
@@ -0,0 +1,323 @@
+set terminal pngcairo size 800,600 nocrop enhanced font 'Verdana,11' # define axis
+set style line 11 linecolor rgb '#808080' linetype 1
+set border 3 back linestyle 11
+set tics nomirror
+
+# legend (key) placement
+#set key opaque
+set key outside
+
+# grid, drawn behind the data with its own line style
+set style line 12 linecolor rgb '#808080' linetype 0 linewidth 1
+set grid back linestyle 12
+
+# line styles 1-9: one colour per curve, referenced as "ls N" by the plot commands
+
+set style line 1 linecolor rgb '#0060ad' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- blue
+set style line 2 linecolor rgb '#8b1a0e' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- red
+set style line 3 linecolor rgb '#5e9c36' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- green
+set style line 4 linecolor rgb '#ffa500' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- orange
+set style line 5 linecolor rgb '#40e0d0' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- turquoise
+set style line 6 linecolor rgb '#9400d3' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- darkviolet
+set style line 7 linecolor rgb '#ff00ff' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- magenta
+set style line 8 linecolor rgb '#c0c0c0' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- silver
+set style line 9 linecolor rgb '#e6e6Fa' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- lavender
+
+set pointintervalbox 1
+
+set samples 300
+# Start of user script
+# ---------------------
+# Column number of each method in the workfile_* data files (column 1 supplies the x value).
+single_0 = 2
+multi_0 = 3
+single_default = 4
+multi_default = 5
+ptp_default = 6
+gmyc_0 = 7
+real = 8
+
+
+
+
+# Kassian Score
+# Average plot: x is the set number, one curve per method read from columns 2-7 of the workfile.
+set title "Average Kassian Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average tree score"
+
+ExtData1 = 'workfile_tree_scores'
+set output 'plots/average_tree_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "Tree Score"
+
+ExtData1_10000 = 'workfile_tree_scores_Ne10000'
+ExtData1_100000 = 'workfile_tree_scores_Ne100000'
+ExtData1_500000 = 'workfile_tree_scores_Ne500000'
+ExtData1_1000000 = 'workfile_tree_scores_Ne1000000'
+
+set title "Kassian Tree Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/tree_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/tree_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit single minbr default"
+set output 'plots/tree_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/tree_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa PTP minbr default"
+set output 'plots/tree_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa GMYC minbr 0"
+set output 'plots/tree_scores_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+
+
+# NMI score
+# Average plot: x is the set number, one curve per method read from columns 2-7 of the workfile.
+set title "Average NMI Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average NMI score"
+
+ExtData1 = 'workfile_nmi_scores'
+set output 'plots/average_nmi_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "NMI Score"
+
+ExtData1_10000 = 'workfile_nmi_scores_Ne10000'
+ExtData1_100000 = 'workfile_nmi_scores_Ne100000'
+ExtData1_500000 = 'workfile_nmi_scores_Ne500000'
+ExtData1_1000000 = 'workfile_nmi_scores_Ne1000000'
+
+set title "NMI Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/nmi_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/nmi_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit single minbr default"
+set output 'plots/nmi_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/nmi_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa PTP minbr default"
+set output 'plots/nmi_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa GMYC minbr 0"
+set output 'plots/nmi_scores_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+
+
+# number of species
+# Average plot: x is the set number, one curve per method (columns 2-7) plus the 'real' column 8.
+set title "Average Number of Species similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average number of species"
+
+ExtData1 = 'workfile_num_species'
+set output 'plots/average_num_species.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real' with linespoints ls 7
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "Number of Species"
+
+ExtData1_10000 = 'workfile_num_species_Ne10000'
+ExtData1_100000 = 'workfile_num_species_Ne100000'
+ExtData1_500000 = 'workfile_num_species_Ne500000'
+ExtData1_1000000 = 'workfile_num_species_Ne1000000'
+
+set title "Number of Species similar GMYC taxa delimit single minbr 0"
+set output 'plots/num_species_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit multi minbr 0"
+set output 'plots/num_species_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit single minbr default"
+set output 'plots/num_species_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit multi minbr default"
+set output 'plots/num_species_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa PTP minbr default"
+set output 'plots/num_species_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa real"
+set output 'plots/num_species_real.png'
+plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:real title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa GMYC minbr 0"
+set output 'plots/num_species_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+# delta number of species: per-method plots only, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+set xlabel "index"
+set ylabel "Delta Number of Species"
+
+ExtData1_10000 = 'workfile_delta_species_Ne10000'
+ExtData1_100000 = 'workfile_delta_species_Ne100000'
+ExtData1_500000 = 'workfile_delta_species_Ne500000'
+ExtData1_1000000 = 'workfile_delta_species_Ne1000000'
+
+set title "Delta Number of Species similar GMYC taxa delimit single minbr 0"
+set output 'plots/delta_species_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit multi minbr 0"
+set output 'plots/delta_species_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit single minbr default"
+set output 'plots/delta_species_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit multi minbr default"
+set output 'plots/delta_species_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa PTP minbr default"
+set output 'plots/delta_species_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa real"
+set output 'plots/delta_species_real.png'
+plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:real title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa GMYC minbr 0"
+set output 'plots/delta_species_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+
+# single lambda score: average plot, x is the set number, one curve per data column of the workfile
+
+set title "Average Single Lambda Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average single lambda score"
+
+ExtData1 = 'workfile_single_scores'
+set output 'plots/average_single_scores.png'
+# NOTE(review): the plot command below ends in "[...]" because this copy of the line is cut off; recover the full command from the upstream file before running.
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real minbr 0' with linespoin [...]
+
+
+
+# single lambda score, per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+set xlabel "index"
+set ylabel "Single Lambda Score"
+
+ExtData1_10000 = 'workfile_single_scores_Ne10000'
+ExtData1_100000 = 'workfile_single_scores_Ne100000'
+ExtData1_500000 = 'workfile_single_scores_Ne500000'
+ExtData1_1000000 = 'workfile_single_scores_Ne1000000'
+
+set title "Single Lambda Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/single_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/single_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit single minbr default"
+set output 'plots/single_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/single_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa PTP minbr default"
+set output 'plots/single_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa GMYC minbr 0"
+set output 'plots/single_scores_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+# multi lambda score: average plot, x is the set number, one curve per data column of the workfile
+set title "Average Multi Lambda Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average multi lambda score"
+
+ExtData1 = 'workfile_multi_scores'
+set output 'plots/average_multi_scores.png'
+# NOTE(review): the plot command below ends in "[...]" because this copy of the line is cut off; recover the full command from the upstream file before running.
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real minbr 0' with linespoin [...]
+
+
+
+
+# multi lambda score, per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+set xlabel "index"
+set ylabel "Multi Lambda Score"
+
+ExtData1_10000 = 'workfile_multi_scores_Ne10000'
+ExtData1_100000 = 'workfile_multi_scores_Ne100000'
+ExtData1_500000 = 'workfile_multi_scores_Ne500000'
+ExtData1_1000000 = 'workfile_multi_scores_Ne1000000'
+
+set title "Multi Lambda Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/multi_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/multi_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit single minbr default"
+set output 'plots/multi_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/multi_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa PTP minbr default"
+set output 'plots/multi_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa GMYC minbr 0"
+set output 'plots/multi_scores_GMYC_minbr_0.png'
+plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4
+
+
+
+reset;
diff --git a/src/python/plotscript_without_gmyc b/src/python/plotscript_without_gmyc
new file mode 100644
index 0000000..dabc50e
--- /dev/null
+++ b/src/python/plotscript_without_gmyc
@@ -0,0 +1,294 @@
+set terminal pngcairo size 800,600 nocrop enhanced font 'Verdana,11' # define axis
+set style line 11 linecolor rgb '#808080' linetype 1
+set border 3 back linestyle 11
+set tics nomirror
+
+# legend (key) placement
+#set key opaque
+set key outside
+
+# grid, drawn behind the data with its own line style
+set style line 12 linecolor rgb '#808080' linetype 0 linewidth 1
+set grid back linestyle 12
+
+# line styles 1-9: one colour per curve, referenced as "ls N" by the plot commands
+
+set style line 1 linecolor rgb '#0060ad' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- blue
+set style line 2 linecolor rgb '#8b1a0e' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- red
+set style line 3 linecolor rgb '#5e9c36' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- green
+set style line 4 linecolor rgb '#ffa500' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- orange
+set style line 5 linecolor rgb '#40e0d0' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- turquoise
+set style line 6 linecolor rgb '#9400d3' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- darkviolet
+set style line 7 linecolor rgb '#ff00ff' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- magenta
+set style line 8 linecolor rgb '#c0c0c0' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- silver
+set style line 9 linecolor rgb '#e6e6Fa' pointtype 7 pointsize 0.5 pointinterval -1 linetype 0 linewidth 2 # --- lavender
+
+set pointintervalbox 1
+
+set samples 300
+# Start of user script
+# ---------------------
+# Column number of each method in the workfile_* data files (column 1 supplies the x value).
+single_0 = 2
+multi_0 = 3
+single_default = 4
+multi_default = 5
+ptp_default = 6
+real = 7
+
+
+
+
+# Kassian Score
+# Average plot: x is the set number, one curve per method read from columns 2-6 of the workfile.
+set title "Average Kassian Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average tree score"
+
+ExtData1 = 'workfile_tree_scores'
+set output 'plots/average_tree_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "Tree Score"
+
+ExtData1_10000 = 'workfile_tree_scores_Ne10000'
+ExtData1_100000 = 'workfile_tree_scores_Ne100000'
+ExtData1_500000 = 'workfile_tree_scores_Ne500000'
+ExtData1_1000000 = 'workfile_tree_scores_Ne1000000'
+
+set title "Kassian Tree Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/tree_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/tree_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit single minbr default"
+set output 'plots/tree_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/tree_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Kassian Tree Score similar GMYC taxa PTP minbr default"
+set output 'plots/tree_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+
+
+# NMI score
+# Average plot: x is the set number, one curve per method read from columns 2-6 of the workfile.
+set title "Average NMI Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average NMI score"
+
+ExtData1 = 'workfile_nmi_scores'
+set output 'plots/average_nmi_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "NMI Score"
+
+ExtData1_10000 = 'workfile_nmi_scores_Ne10000'
+ExtData1_100000 = 'workfile_nmi_scores_Ne100000'
+ExtData1_500000 = 'workfile_nmi_scores_Ne500000'
+ExtData1_1000000 = 'workfile_nmi_scores_Ne1000000'
+
+set title "NMI Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/nmi_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/nmi_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit single minbr default"
+set output 'plots/nmi_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/nmi_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "NMI Score similar GMYC taxa PTP minbr default"
+set output 'plots/nmi_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+
+
+# number of species
+# Average plot: x is the set number, one curve per method (columns 2-6) plus the 'real' column 7.
+set title "Average Number of Species similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average number of species"
+
+ExtData1 = 'workfile_num_species'
+set output 'plots/average_num_species.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'real' with linespoints ls 7
+
+# Per-method plots: one PNG per method, one curve per Ne data file; each legend title names the Ne of the file it is read from.
+
+set xlabel "index"
+set ylabel "Number of Species"
+
+ExtData1_10000 = 'workfile_num_species_Ne10000'
+ExtData1_100000 = 'workfile_num_species_Ne100000'
+ExtData1_500000 = 'workfile_num_species_Ne500000'
+ExtData1_1000000 = 'workfile_num_species_Ne1000000'
+
+set title "Number of Species similar GMYC taxa delimit single minbr 0"
+set output 'plots/num_species_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit multi minbr 0"
+set output 'plots/num_species_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit single minbr default"
+set output 'plots/num_species_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa delimit multi minbr default"
+set output 'plots/num_species_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa PTP minbr default"
+set output 'plots/num_species_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Number of Species similar GMYC taxa real"
+set output 'plots/num_species_real.png'
+plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:real title 'Ne100000' with linespoints ls 2, ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4
+
+
+
+# Delta number of species per data set, one plot per method and one curve per
+# effective population size (Ne).  Each legend entry names its own data file.
+set xlabel "index"
+set ylabel "Delta Number of Species"
+
+ExtData1_10000 = 'workfile_delta_species_Ne10000'
+ExtData1_100000 = 'workfile_delta_species_Ne100000'
+ExtData1_500000 = 'workfile_delta_species_Ne500000'
+ExtData1_1000000 = 'workfile_delta_species_Ne1000000'
+
+set title "Delta Number of Species similar GMYC taxa delimit single minbr 0"
+set output 'plots/delta_species_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit multi minbr 0"
+set output 'plots/delta_species_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit single minbr default"
+set output 'plots/delta_species_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa delimit multi minbr default"
+set output 'plots/delta_species_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa PTP minbr default"
+set output 'plots/delta_species_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+set title "Delta Number of Species similar GMYC taxa real"
+set output 'plots/delta_species_real.png'
+plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:real title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4
+
+
+
+# single lambda score
+
+set title "Average Single Lambda Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average single lambda score"
+
+ExtData1 = 'workfile_single_scores'
+set output 'plots/average_single_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'real minbr 0' with linespoints ls 7, ExtData1 using 1:8 title 'real minbr default' with lin [...]
+
+
+
+
+# Single-lambda score per data set, one plot per method and one curve per
+# effective population size (Ne).  Each legend entry names its own data file.
+set xlabel "index"
+set ylabel "Single Lambda Score"
+
+ExtData1_10000 = 'workfile_single_scores_Ne10000'
+ExtData1_100000 = 'workfile_single_scores_Ne100000'
+ExtData1_500000 = 'workfile_single_scores_Ne500000'
+ExtData1_1000000 = 'workfile_single_scores_Ne1000000'
+
+set title "Single Lambda Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/single_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/single_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit single minbr default"
+set output 'plots/single_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/single_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Single Lambda Score similar GMYC taxa PTP minbr default"
+set output 'plots/single_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+
+set title "Average Multi Lambda Score similar GMYC taxa"
+set xlabel "Set number"
+set ylabel "Average multi lambda score"
+
+ExtData1 = 'workfile_multi_scores'
+set output 'plots/average_multi_scores.png'
+
+plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'real minbr 0' with linespoints ls 7, ExtData1 using 1:8 title 'real minbr default' with lin [...]
+
+
+
+
+
+# Multi-lambda score per data set, one plot per method and one curve per
+# effective population size (Ne).  Each legend entry names its own data file.
+set xlabel "index"
+set ylabel "Multi Lambda Score"
+
+ExtData1_10000 = 'workfile_multi_scores_Ne10000'
+ExtData1_100000 = 'workfile_multi_scores_Ne100000'
+ExtData1_500000 = 'workfile_multi_scores_Ne500000'
+ExtData1_1000000 = 'workfile_multi_scores_Ne1000000'
+
+set title "Multi Lambda Score similar GMYC taxa delimit single minbr 0"
+set output 'plots/multi_scores_delimit_single_minbr_0.png'
+plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit multi minbr 0"
+set output 'plots/multi_scores_delimit_multi_minbr_0.png'
+plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_0 title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit single minbr default"
+set output 'plots/multi_scores_delimit_single_minbr_default.png'
+plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:single_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa delimit multi minbr default"
+set output 'plots/multi_scores_delimit_multi_minbr_default.png'
+plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:multi_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4
+
+set title "Multi Lambda Score similar GMYC taxa PTP minbr default"
+set output 'plots/multi_scores_PTP_minbr_default.png'
+plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, \
+     ExtData1_100000 using 1:ptp_default title 'Ne100000' with linespoints ls 2, \
+     ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, \
+     ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4
+
+
+reset;
diff --git a/src/python/rewrite_species_result_file_GMYC.py b/src/python/rewrite_species_result_file_GMYC.py
new file mode 100755
index 0000000..d2d504d
--- /dev/null
+++ b/src/python/rewrite_species_result_file_GMYC.py
@@ -0,0 +1,48 @@
+#! /usr/bin/env python
+import os
+
+def rewrite_species_result(input_species_file, output_species_file):
+ try:
+ with open(input_species_file) as f:
+ content = f.read().splitlines()
+ f.close()
+
+ largestSpecies = 0
+ taxaList = []
+ assignments = {}
+
+ for i in range(1,151):
+ line = content[i]
+ line = " ".join(line.split())
+ species_idx = int(line.split(' ')[1])
+ taxon_name = line.split(' ')[2]
+ if (species_idx > largestSpecies):
+ assignments[species_idx] = []
+ largestSpecies = species_idx
+ assignments[species_idx].append(taxon_name)
+
+ if not os.path.exists(os.path.dirname(output_species_file)):
+ os.makedirs(os.path.dirname(output_species_file))
+ speciesOut = open(output_species_file, 'w')
+
+ speciesOut.write("Species 1:\n")
+ for j in range(0, len(assignments[1])):
+ speciesOut.write(assignments[1][j] + "\n")
+ for i in range(2, largestSpecies + 1):
+ speciesOut.write("\nSpecies " + str(i) + ":\n")
+ for j in range(0, len(assignments[i])):
+ speciesOut.write(assignments[i][j] + "\n")
+
+ speciesOut.close()
+ except IOError:
+ print "File not found: " + input_species_file
+
+set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"]
+
+#rewrite_species_result("gmyc_results_SimulB_C/set_Ne1e+05/gmyc_results_set_Ne1e+05.1.txt", "SimulB_C_gmyc_minbr_0/set_Ne1e+05/gmyc_results_set_Ne1e+05.1.txt")
+
+for set_name in set_names:
+ for i in range(1,101):
+ input_species_file = "gmyc_results_SimulB_C/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt"
+ output_species_file = "SimulB_C_gmyc_minbr_0/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt"
+ rewrite_species_result(input_species_file, output_species_file)
diff --git a/src/python/rewrite_species_result_file_PTP.py b/src/python/rewrite_species_result_file_PTP.py
new file mode 100755
index 0000000..513aa69
--- /dev/null
+++ b/src/python/rewrite_species_result_file_PTP.py
@@ -0,0 +1,46 @@
+#! /usr/bin/env python
+import os
+
+def rewrite_species_result(input_species_file, output_species_file):
+ try:
+ with open(input_species_file) as f:
+ content = f.read().splitlines()
+ f.close()
+
+ taxaListString = content[0].split(':')[1]
+ taxaList = taxaListString.split(',')
+ speciesList = content[1].split(',')
+
+ if not os.path.exists(os.path.dirname(output_species_file)):
+ os.makedirs(os.path.dirname(output_species_file))
+ speciesOut = open(output_species_file, 'w')
+
+ oldSpeciesIdx = speciesList[0]
+ speciesOut.write("Species " + oldSpeciesIdx + ":\n")
+ speciesOut.write(taxaList[0] + "\n")
+ for i in range(1,len(speciesList)):
+ if (speciesList[i] == oldSpeciesIdx):
+ speciesOut.write(taxaList[i] + "\n")
+ else:
+ oldSpeciesIdx = speciesList[i]
+ speciesOut.write("\nSpecies " + oldSpeciesIdx + ":\n")
+ speciesOut.write(taxaList[i] + "\n")
+
+ speciesOut.close()
+ except IOError:
+ print "File not found: " + input_species_file
+
+set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"]
+
+for set_name in set_names:
+ for i in range(1,101):
+ if set_name == "Ne10000":
+ input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_result_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt"
+ elif set_name == "Ne500000":
+ input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt"
+ elif set_name == "Ne100000":
+ input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt"
+ else:
+ input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_result." + str(i) + ".PTPPartitions.txt"
+ output_species_file = "similar_to_GMYC_PTP_minbr_default/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt"
+ rewrite_species_result(input_species_file, output_species_file)
diff --git a/src/random.c b/src/random.c
new file mode 100644
index 0000000..14ab609
--- /dev/null
+++ b/src/random.c
@@ -0,0 +1,128 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+static long min_species; /* set to 1 by random_delimitation(); incremented by cb_node_select() for every node marked as speciation */
+static long max_species; /* set to root->max_species_count; reduced by cb_node_select() for every node marked as coalescent */
+static long species_count; /* randomly drawn number of species the current delimitation must contain */
+static unsigned short * g_rstate; /* caller-supplied erand48() state, shared with the traversal callback */
+
+/* Traversal callback: decide whether `node` becomes the root of a coalescent
+   subtree (return 0) or stays a speciation node whose children are examined
+   next (return 1).  The choice is forced whenever the running bounds
+   min_species/max_species leave only one way to still reach species_count;
+   otherwise it is a fair coin flip drawn from g_rstate. */
+static int cb_node_select(rtree_t * node)
+{
+  int keep_speciating;
+
+  if (!node->edge_count) return 0;
+
+  if (min_species+1 > species_count)
+  {
+    /* one more speciation would overshoot: the node must be selected */
+    keep_speciating = 0;
+  }
+  else if (max_species - node->max_species_count + 1 < species_count)
+  {
+    /* selecting would make the target unreachable: must NOT select */
+    keep_speciating = 1;
+  }
+  else
+  {
+    /* both outcomes are feasible, throw a coin */
+    keep_speciating = (erand48(g_rstate) >= 0.5);
+  }
+
+  if (keep_speciating)
+  {
+    node->event = EVENT_SPECIATION;
+    min_species = min_species+1;
+    return 1;
+  }
+
+  node->event = EVENT_COALESCENT;
+  max_species = max_species - node->max_species_count + 1;
+  return 0;
+}
+
+/* Draw a random delimitation of the tree rooted at `root`.
+
+   A target number of species is drawn from `rstate`, the tree is traversed
+   with cb_node_select() to pick exactly that many coalescent roots, and the
+   edge counts / edge-length sums of the coalescent and speciation parts are
+   accumulated.
+
+   Outputs: *delimited_species (number of species), *coal_edge_count and
+   *coal_edgelen_sum (coalescent part), *spec_edge_count and *spec_edgelen_sum
+   (speciation part), *coal_score (sum of coal_logl over the selected nodes).
+
+   Returns the log-likelihood of the delimitation. */
+double random_delimitation(rtree_t * root,
+                           long * delimited_species,
+                           long * coal_edge_count,
+                           double * coal_edgelen_sum,
+                           long * spec_edge_count,
+                           double * spec_edgelen_sum,
+                           double * coal_score,
+                           unsigned short * rstate)
+{
+  int edge_count = 0;
+  long i;
+  long rand_long = 0;
+  double logl = 0;
+  double edgelen_sum = 0;
+
+  /* initialize */
+  min_species = 1;
+  max_species = root->max_species_count;
+  g_rstate = rstate;
+
+  rand_long = nrand48(rstate);
+  if (!root->max_species_count)
+    species_count = (rand_long % root->leaves) + 1;
+  else
+    species_count = (rand_long % root->max_species_count) + 1;
+
+  rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)species_count *
+                                                   sizeof(rtree_t *));
+
+  long count = rtree_traverse(root, cb_node_select, rstate, inner_node_list);
+
+  /* the buffer holds species_count slots; verify the traversal filled exactly
+     that many before any of them is read */
+  assert(count == species_count);
+
+  for (i = 0; i < count; ++i)
+  {
+    logl += inner_node_list[i]->coal_logl;
+    edge_count += inner_node_list[i]->edge_count;
+    edgelen_sum += inner_node_list[i]->edgelen_sum;
+  }
+  *coal_score = logl;
+
+  free(inner_node_list);
+
+  /* if we have PTP single logl is different */
+  if (opt_method == PTP_METHOD_SINGLE)
+    logl = loglikelihood(edge_count, edgelen_sum);
+
+  /* append speciation part log-likelihood */
+  logl += loglikelihood(root->edge_count - edge_count,
+                        root->edgelen_sum - edgelen_sum);
+
+  *delimited_species = species_count;
+  *coal_edge_count = edge_count;
+  *coal_edgelen_sum = edgelen_sum;
+  *spec_edge_count = root->edge_count - edge_count;
+  *spec_edgelen_sum = root->edgelen_sum - edgelen_sum;
+
+  return logl;
+}
diff --git a/src/rtree.c b/src/rtree.c
new file mode 100644
index 0000000..195a2c9
--- /dev/null
+++ b/src/rtree.c
@@ -0,0 +1,684 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+static int indend_space = 4; /* characters printed per indentation level by print_tree_recurse() */
+
+/* Print " <label> <branch length>" for one node and end the line. */
+static void print_node_info(rtree_t * tree)
+{
+  printf(" %s %f\n", tree->label, tree->length);
+}
+
+/* Print the indentation prefix for the first `columns` levels: a '|' where
+   the level is still flagged in active_node_order, a blank otherwise, each
+   padded to indend_space characters. */
+static void print_level_markers(const int * active_node_order, int columns)
+{
+  int level, pad;
+
+  for (level = 0; level < columns; ++level)
+  {
+    putchar(active_node_order[level] ? '|' : ' ');
+
+    for (pad = 1; pad < indend_space; ++pad)
+      putchar(' ');
+  }
+}
+
+/* Recursively print the subtree rooted at `tree` as ASCII art.
+   indend_level is the depth of `tree`; active_node_order[i] is non-zero while
+   a vertical bar must still be drawn at level i (1 = left child being
+   printed, 2 = right child being printed, 0 = finished). */
+static void print_tree_recurse(rtree_t * tree,
+                               int indend_level,
+                               int * active_node_order)
+{
+  int dash;
+
+  if (!tree) return;
+
+  /* spacer line carrying only the vertical bars */
+  print_level_markers(active_node_order, indend_level);
+  putchar('\n');
+
+  /* the node's own line: bars of the ancestors, then the "+---" connector */
+  print_level_markers(active_node_order, indend_level-1);
+
+  putchar('+');
+  for (dash = 1; dash < indend_space; ++dash)
+    putchar('-');
+  if (tree->left || tree->right) putchar('+');
+
+  print_node_info(tree);
+
+  /* the parent's right child has now been drawn: stop drawing its bar */
+  if (active_node_order[indend_level-1] == 2)
+    active_node_order[indend_level-1] = 0;
+
+  active_node_order[indend_level] = 1;
+  print_tree_recurse(tree->left, indend_level+1, active_node_order);
+
+  active_node_order[indend_level] = 2;
+  print_tree_recurse(tree->right, indend_level+1, active_node_order);
+}
+
+/* Return `indend` plus the number of nodes on the longest root-to-tip path
+   of the subtree rooted at `tree` (just `indend` for an empty subtree). */
+static int tree_indend_level(rtree_t * tree, int indend)
+{
+  int depth_left;
+  int depth_right;
+
+  if (!tree) return indend;
+
+  depth_left  = tree_indend_level(tree->left,  indend+1);
+  depth_right = tree_indend_level(tree->right, indend+1);
+
+  if (depth_left > depth_right)
+    return depth_left;
+
+  return depth_right;
+}
+
+/* Print the tree rooted at `tree` to stdout as ASCII art.  Does nothing for
+   a NULL tree. */
+void rtree_show_ascii(rtree_t * tree)
+{
+  int indend_max;
+  int * active_node_order;
+
+  if (!tree) return;
+
+  indend_max = tree_indend_level(tree,0);
+
+  /* one flag per indentation level; xmalloc (used throughout this file)
+     terminates on allocation failure instead of returning NULL */
+  active_node_order = (int *)xmalloc((size_t)(indend_max+1) * sizeof(int));
+  active_node_order[0] = 1;
+  active_node_order[1] = 1;
+
+  print_node_info(tree);
+  print_tree_recurse(tree->left, 1, active_node_order);
+  active_node_order[0] = 2;
+  print_tree_recurse(tree->right, 1, active_node_order);
+  free(active_node_order);
+}
+
+/* Build the newick string of the subtree rooted at `root`.  `terminator` is
+   appended after an inner node's "(..)support:length" text and is not used
+   for a node that lacks a child.  Returns a heap string the caller must
+   free, or NULL when root is NULL. */
+static char * rtree_newick_build(rtree_t * root, const char * terminator)
+{
+  char * newick;
+  char * support = NULL;
+  char * subtree1;
+  char * subtree2;
+
+  if (!root) return NULL;
+
+  /* a node missing either child is written as a tip */
+  if (!(root->left) || !(root->right))
+  {
+    if (asprintf(&newick, "%s:%f", root->label, root->length) == -1)
+      fatal("Unable to allocate enough memory.");
+    return newick;
+  }
+
+  subtree1 = rtree_newick_build(root->left, "");
+  subtree2 = rtree_newick_build(root->right, "");
+
+  /* support values are only emitted for MCMC runs */
+  if (opt_mcmc)
+    if (asprintf(&support, "%f", root->support) == -1)
+      fatal("Unable to allocate enough memory.");
+
+  if (asprintf(&newick, "(%s,%s)%s:%f%s", subtree1,
+                                          subtree2,
+                                          (opt_mcmc) ? support : "",
+                                          root->length,
+                                          terminator) == -1)
+    fatal("Unable to allocate enough memory.");
+
+  free(support);
+  free(subtree1);
+  free(subtree2);
+
+  return newick;
+}
+
+/* Newick text of a subtree, without the terminating ';'. */
+static char * rtree_export_newick_recursive(rtree_t * root)
+{
+  return rtree_newick_build(root, "");
+}
+
+/* Newick text of the whole tree; an inner root is terminated with ';'.
+   The caller owns (and must free) the returned string. */
+char * rtree_export_newick(rtree_t * root)
+{
+  return rtree_newick_build(root, ";");
+}
+
+/* Visit `node`: the callback is consulted exactly once.  When it returns 0
+   the node is appended to outbuffer and its subtree is not descended;
+   otherwise the two children (if any) are visited in a random order drawn
+   from rstate. */
+static void rtree_traverse_recursive(rtree_t * node,
+                                     int (*cbtrav)(rtree_t *),
+                                     int * index,
+                                     unsigned short * rstate,
+                                     rtree_t ** outbuffer)
+{
+  rtree_t * first;
+  rtree_t * second;
+
+  if (!cbtrav(node))
+  {
+    outbuffer[(*index)++] = node;
+    return;
+  }
+
+  /* tips have nothing below them */
+  if (!node->left) return;
+
+  if (erand48(rstate) >= 0.5)
+  {
+    first  = node->left;
+    second = node->right;
+  }
+  else
+  {
+    first  = node->right;
+    second = node->left;
+  }
+
+  rtree_traverse_recursive(first,  cbtrav, index, rstate, outbuffer);
+  rtree_traverse_recursive(second, cbtrav, index, rstate, outbuffer);
+}
+
+/* Traverse a rooted tree top-down starting at root.  At each node the
+   callback decides whether the subtree below it is traversed (non-zero) or
+   the node is stored in outbuffer (zero).  Returns the number of stored
+   nodes, or -1 when root is a tip. */
+int rtree_traverse(rtree_t * root,
+                   int (*cbtrav)(rtree_t *),
+                   unsigned short * rstate,
+                   rtree_t ** outbuffer)
+{
+  int stored = 0;
+
+  if (!root->left) return -1;
+
+  rtree_traverse_recursive(root, cbtrav, &stored, rstate, outbuffer);
+
+  return stored;
+}
+
+/* Post-order walk: both subtrees first, then the node itself, which is
+   appended to outbuffer when the callback returns non-zero. */
+static void rtree_traverse_postorder_recursive(rtree_t * node,
+                                               int (*cbtrav)(rtree_t *),
+                                               int * index,
+                                               rtree_t ** outbuffer)
+{
+  if (node)
+  {
+    rtree_traverse_postorder_recursive(node->left,  cbtrav, index, outbuffer);
+    rtree_traverse_postorder_recursive(node->right, cbtrav, index, outbuffer);
+
+    if (cbtrav(node))
+      outbuffer[(*index)++] = node;
+  }
+}
+
+
+/* Traverse a rooted tree in post-order; the callback decides for every node
+   whether it is placed in outbuffer.  Returns the number of stored nodes,
+   or -1 when root is a tip. */
+int rtree_traverse_postorder(rtree_t * root,
+                             int (*cbtrav)(rtree_t *),
+                             rtree_t ** outbuffer)
+{
+  int stored = 0;
+
+  if (!root->left) return -1;
+
+  rtree_traverse_postorder_recursive(root, cbtrav, &stored, outbuffer);
+
+  return stored;
+}
+
+/* Height helper: an empty subtree counts as 1, every node adds 1 to the
+   taller of its two subtrees. */
+static int rtree_height_recursive(rtree_t * node)
+{
+  int left_height;
+  int right_height;
+
+  if (!node) return 1;
+
+  left_height  = rtree_height_recursive(node->left);
+  right_height = rtree_height_recursive(node->right);
+
+  return 1 + ((left_height > right_height) ? left_height : right_height);
+}
+
+
+/* Height of the tree rooted at root, as computed by
+   rtree_height_recursive(). */
+int rtree_height(rtree_t * root)
+{
+  return rtree_height_recursive(root);
+}
+
+/* Append the tips below (or at) `node` to node_list, left subtree first. */
+static void rtree_query_tipnodes_recursive(rtree_t * node,
+                                           rtree_t ** node_list,
+                                           int * index)
+{
+  if (!node) return;
+
+  if (node->left)
+  {
+    rtree_query_tipnodes_recursive(node->left,  node_list, index);
+    rtree_query_tipnodes_recursive(node->right, node_list, index);
+    return;
+  }
+
+  /* a node without a left child is a tip */
+  node_list[(*index)++] = node;
+}
+
+/* Fill node_list with all tips of the tree in left-to-right order and
+   return how many were stored (0 for a NULL tree). */
+int rtree_query_tipnodes(rtree_t * root,
+                         rtree_t ** node_list)
+{
+  int tips = 0;
+
+  rtree_query_tipnodes_recursive(root, node_list, &tips);
+
+  return tips;
+}
+
+/* Append the inner nodes of the subtree rooted at `root` to node_list in
+   post-order (children before their parent).  Tips are skipped. */
+static void rtree_query_innernodes_recursive(rtree_t * root,
+                                             rtree_t ** node_list,
+                                             int * index)
+{
+  if (root && root->left)
+  {
+    rtree_query_innernodes_recursive(root->left,  node_list, index);
+    rtree_query_innernodes_recursive(root->right, node_list, index);
+
+    node_list[(*index)++] = root;
+  }
+}
+
+/* Fill node_list with all inner nodes of the tree in post-order (the root
+   comes last) and return how many were stored; 0 for a NULL tree or a
+   single tip. */
+int rtree_query_innernodes(rtree_t * root,
+                           rtree_t ** node_list)
+{
+  int inner = 0;
+
+  rtree_query_innernodes_recursive(root, node_list, &inner);
+
+  return inner;
+}
+
+/* Recompute leaves, edge_count and edgelen_sum for every node of the
+   subtree rooted at root.  Only branches longer than opt_minbr contribute
+   to edge_count and edgelen_sum. */
+void rtree_reset_info(rtree_t * root)
+{
+  rtree_t * child[2];
+  int i;
+
+  /* tip: one leaf, no edges below it */
+  if (!root->left)
+  {
+    root->leaves = 1;
+    root->edge_count = 0;
+    root->edgelen_sum = 0;
+    return;
+  }
+
+  child[0] = root->left;
+  child[1] = root->right;
+
+  rtree_reset_info(child[0]);
+  rtree_reset_info(child[1]);
+
+  root->leaves      = child[0]->leaves      + child[1]->leaves;
+  root->edge_count  = child[0]->edge_count  + child[1]->edge_count;
+  root->edgelen_sum = child[0]->edgelen_sum + child[1]->edgelen_sum;
+
+  /* add the two branches leading to the children, left one first */
+  for (i = 0; i < 2; ++i)
+  {
+    if (child[i]->length > opt_minbr)
+    {
+      root->edge_count++;
+      root->edgelen_sum += child[i]->length;
+    }
+  }
+}
+
+/* Write the label of every tip below (or at) node to out, one per line,
+   left subtree first. */
+void rtree_print_tips(rtree_t * node, FILE * out)
+{
+  if (!node->left && !node->right)
+  {
+    fprintf(out, "%s\n", node->label);
+    return;
+  }
+
+  if (node->left)
+    rtree_print_tips(node->left,out);
+  if (node->right)
+    rtree_print_tips(node->right,out);
+}
+
+
+/* Deep-copy the subtree rooted at node.  The copy's parent pointer is set
+   to `parent`, its data pointer is cleared and its label is duplicated;
+   all other fields are copied verbatim.  Returns NULL for a NULL node. */
+rtree_t * rtree_clone(rtree_t * node, rtree_t * parent)
+{
+  rtree_t * copy;
+
+  if (!node) return NULL;
+
+  copy = (rtree_t *)xcalloc(1,sizeof(rtree_t));
+  *copy = *node;
+
+  copy->parent = parent;
+  copy->data   = NULL;
+
+  /* the copy must own its label string */
+  if (node->label)
+    copy->label = xstrdup(node->label);
+
+  copy->left  = rtree_clone(node->left,  copy);
+  copy->right = rtree_clone(node->right, copy);
+
+  return copy;
+}
+
+/* Resolve a comma-separated list of tip labels to tip nodes.
+
+   root          -- tree whose tips are searched
+   tipstring     -- labels separated by ','; an empty label before a comma
+                    is a fatal error, a single trailing comma is tolerated
+   tiplist_count -- receives the number of entries stored in the result
+
+   Returns a heap-allocated array of node pointers in the order of the
+   labels; the caller must free it.  Terminates through fatal() when a label
+   is not a tip of the tree. */
+rtree_t ** rtree_tipstring_nodes(rtree_t * root,
+                                 char * tipstring,
+                                 unsigned int * tiplist_count)
+{
+  size_t i;
+  size_t tipstring_len = strlen(tipstring);
+  unsigned int k = 0;
+  unsigned int commas_count = 0;
+
+  char * taxon;
+  unsigned long taxon_len;
+
+  ENTRY * found = NULL;
+
+  /* commas + 1 is an upper bound on the number of labels */
+  for (i = 0; i < tipstring_len; ++i)
+    if (tipstring[i] == ',')
+      commas_count++;
+
+  rtree_t ** node_list = (rtree_t **)xmalloc((size_t)(root->leaves) *
+                                             sizeof(rtree_t *));
+  rtree_query_tipnodes(root, node_list);
+
+  rtree_t ** out_node_list = (rtree_t **)xmalloc((size_t)(commas_count+1) *
+                                                 sizeof(rtree_t *));
+
+  /* create a hashtable of tip labels */
+  if (!hcreate(2 * (size_t)(root->leaves)))
+    fatal("Unable to allocate enough memory.");
+
+  for (i = 0; i < (unsigned int)(root->leaves); ++i)
+  {
+    ENTRY entry;
+    entry.key  = node_list[i]->label;
+    entry.data = node_list[i];
+    hsearch(entry,ENTER);
+  }
+
+  char * s = tipstring;
+
+  while (*s)
+  {
+    /* get next tip */
+    taxon_len = strcspn(s, ",");
+    if (!taxon_len)
+      fatal("Erroneous prune list format (double comma)/taxon missing");
+
+    taxon = xstrndup(s, taxon_len);
+
+    /* search tip in hash table */
+    ENTRY query;
+    query.key = taxon;
+    found = hsearch(query,FIND);
+
+    if (!found)
+      fatal("Taxon %s does not appear in the tree", taxon);
+
+    /* store pointer in output list */
+    out_node_list[k++] = (rtree_t *)(found->data);
+
+    /* free tip label, and move to the beginning of next tip if available */
+    free(taxon);
+    s += taxon_len;
+    if (*s == ',')
+      s += 1;
+  }
+
+  /* kill the hash table */
+  hdestroy();
+
+  free(node_list);
+
+  /* report the number of entries actually stored: with a trailing comma or
+     an empty string this is less than commas_count + 1, and the remaining
+     slots are uninitialised */
+  *tiplist_count = k;
+
+  /* return tip node list */
+  return out_node_list;
+}
+
+/* Store in path the chain of nodes obtained by following parent pointers
+   from tip until the parent is NULL (tip first), and report the number of
+   stored nodes through path_len. */
+static void fill_path(rtree_t ** path, int * path_len, rtree_t * tip)
+{
+  int n = 0;
+  rtree_t * cursor;
+
+  for (cursor = tip; cursor; cursor = cursor->parent)
+    path[n++] = cursor;
+
+  *path_len = n;
+}
+
+/* Return the lowest common ancestor of the `count` (>= 2) nodes in
+   tip_nodes, all of which must belong to the tree rooted at root.  When
+   every entry refers to the same node, that node is returned. */
+rtree_t * rtree_lca(rtree_t * root,
+                    rtree_t ** tip_nodes,
+                    unsigned int count)
+{
+  unsigned int i;
+  rtree_t *** path;
+
+  assert(count >= 2);
+
+  /* no tip-to-root path is longer than the tree height; compute it once */
+  size_t max_path_len = (size_t)rtree_height(root);
+
+  /* allocate path arrays for count tip nodes */
+  path = (rtree_t ***)xmalloc((size_t)count *
+                              sizeof(rtree_t **));
+  int * path_len = (int *)xmalloc((size_t)count * sizeof(int));
+
+  /* for each tip node fill corresponding path array with all nodes
+     in the path to the root node and store the length of the path */
+  for (i = 0; i < count; ++i)
+  {
+    path[i] = (rtree_t **)xmalloc(max_path_len * sizeof(rtree_t *));
+
+    fill_path(path[i], &(path_len[i]), tip_nodes[i]);
+  }
+
+  /* walk all paths simultaneously from the root end towards the tips.
+     The LCA is the parent of the first position at which two paths differ */
+  rtree_t * lca = NULL;
+  while (!lca)
+  {
+    for (i = 0; i < count; ++i)
+    {
+      if (--path_len[i] < 0)
+      {
+        /* this path ended while all paths were still identical, so its
+           first node is the common ancestor; stops the index from going
+           negative when the same node is listed for every entry */
+        lca = path[i][0];
+        break;
+      }
+    }
+    if (lca) break;
+
+    for (i = 1; i < count; ++i)
+    {
+      if (path[i-1][path_len[i-1]] != path[i][path_len[i]])
+      {
+        lca = path[i][path_len[i]+1];
+        break;
+      }
+    }
+  }
+
+  /* free allocated memory */
+  for (i = 0; i < count; ++i)
+    free(path[i]);
+  free(path);
+  free(path_len);
+
+  return lca;
+}
+
+/* Resolve the tip labels in opt_outgroup and return the root of the
+   outgroup: the listed tip itself when there is only one, otherwise the
+   lowest common ancestor of all listed tips. */
+rtree_t * get_outgroup_lca(rtree_t * root)
+{
+  unsigned int tip_count;
+  rtree_t ** tips = rtree_tipstring_nodes(root, opt_outgroup, &tip_count);
+  rtree_t * outgroup_root;
+
+  outgroup_root = (tip_count > 1) ? rtree_lca(root, tips, tip_count)
+                                  : tips[0];
+
+  free(tips);
+
+  return outgroup_root;
+}
+
+rtree_t * rtree_crop(rtree_t * root, rtree_t * crop_root)
+{
+ /* Remove the subtree rooted at crop_root from the tree rooted at root and
+    return the root of the remaining tree. The detached nodes are handed to
+    rtree_destroy() and the leaf/edge summaries of the remaining tree are
+    recomputed with rtree_reset_info().
+
+    First check if the selected subtree can be cropped: at least two tips
+    must remain, otherwise NULL is returned and the tree is left untouched */
+ if (root->leaves - crop_root->leaves < 2)
+ return NULL;
+
+ /* subtree can be cropped, distinguish between two cases: */
+
+ if (crop_root->parent == root)
+ {
+
+ /* Case 1: crop_root is a child of the root
+
+           root
+            *
+           / \                         A
+        A *   * crop_root   ---->      *
+             / \
+            *   *
+
+ in this case the subtree rooted at crop_root is cropped, the root node is
+ eliminated and subtree rooted at A becomes the new tree
+ */
+
+ rtree_t * new_root;
+
+ if (root->left == crop_root)
+ {
+ new_root = root->right;
+ root->right = NULL; /* unlink A so that destroying root does not reach it */
+ }
+ else
+ {
+ new_root = root->left;
+ root->left = NULL; /* unlink A so that destroying root does not reach it */
+ }
+
+ rtree_destroy(root);
+
+ new_root->parent = NULL;
+ rtree_reset_info(new_root);
+
+ return new_root;
+ }
+
+ /* Case 2: crop_root lies deeper in the tree
+
+           root
+            *
+           / \
+        A *   -
+               \                       root
+                * B         ---->       *
+               / \                     / \
+            C *   * crop_root       A *   -
+                 / \                       \
+                *   *                       * C
+
+ in this case the subtree rooted at crop_root is cropped together with its
+ parent B; the sibling C takes the place of B below B's parent, the length
+ of B's branch is added to the branch of C, and the root stays the same
+ */
+
+ rtree_t * b = crop_root->parent;
+ rtree_t * c;
+
+ /* get C and break the link between B and C */
+ if (b->left == crop_root)
+ {
+ c = b->right;
+ b->right = NULL;
+ }
+ else
+ {
+ c = b->left;
+ b->left = NULL;
+ }
+
+ /* link the parent of B with C from both directions */
+ c->parent = b->parent;
+ if (b->parent->left == b)
+ b->parent->left = c;
+ else
+ b->parent->right = c;
+
+ c->length += b->length; /* C inherits the branch that led to B */
+
+ rtree_destroy(b); /* B still links to crop_root, no longer to C */
+ rtree_reset_info(root);
+
+ return root;
+}
diff --git a/src/svg.c b/src/svg.c
new file mode 100644
index 0000000..e673ff8
--- /dev/null
+++ b/src/svg.c
@@ -0,0 +1,404 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+/* map a value x to the percentage (1-x)*100; argument and result are fully
+   parenthesized so that compound arguments and surrounding operators cannot
+   change the meaning of the expansion */
+#define GRADIENT(x) ((1-(x))*100)
+
+static double scaler = 0;
+
+static long legend_spacing = 10;
+
+static FILE * svg_fp;
+
+static double max_font_len = 0;
+static double max_tree_len = 0;
+
+static double canvas_width;
+
+static char * const speciation_color = "#31a354";
+static char * const coalesence_color = "#ff0000";
+
+static int tip_occ = 0;
+
+typedef struct coord_s
+{
+ double x;
+ double y;
+} coord_t;
+
+/* Allocate a coord_t with xmalloc() and fill in both components; the caller
+   receives the new object. */
+static coord_t * create_coord(double x, double y)
+{
+  coord_t * point = (coord_t *)xmalloc(sizeof(coord_t));
+
+  point->y = y;
+  point->x = x;
+
+  return point;
+}
+
+/* Write an SVG <line> element running from (x1,y1) to (x2,y2) with the given
+   stroke color and stroke width to the file-scope stream svg_fp. */
+static void svg_line(double x1,
+ double y1,
+ double x2,
+ double y2,
+ const char * color,
+ double stroke_width)
+{
+ fprintf(svg_fp,
+ "<line x1=\"%f\" y1=\"%f\" x2=\"%f\" y2=\"%f\" "
+ "stroke=\"%s\" stroke-width=\"%f\" />\n",
+ x1, y1, x2, y2, color, stroke_width);
+}
+
+/* Write an SVG <circle> element centered at (cx,cy) with radius r to svg_fp;
+   the same color is used for both fill and stroke. */
+static void svg_circle(double cx, double cy, double r, const char * color)
+{
+ fprintf(svg_fp,
+ "<circle cx=\"%f\" cy=\"%f\" r=\"%f\" fill=\"%s\" "
+ "stroke=\"%s\" />\n",
+ cx, cy, r, color, color);
+ /* animation effect
+ fprintf(svg_fp, "<animate attributeName=\"r\" begin=\"mouseover\" dur=\"0.2\" fill=\"freeze\" from=\"%ld\" to=\"%ld\" />\n",
+ (long)r, (long)r+5);
+ fprintf(svg_fp, "<animate attributeName=\"r\" begin=\"mouseout\" dur=\"0.2\" fill=\"freeze\" to=\"%ld\" />\n</circle>\n",
+ (long)r);
+ */
+
+}
+
+/* Write an SVG <text> element to svg_fp. The element is emitted with
+   text-anchor="end", so (x,y) is the end point of the rendered text. */
+static void svg_text(double x, double y, long fontsize, const char * text)
+{
+ fprintf(svg_fp,
+ "<text x=\"%f\" y=\"%f\" font-size=\"%ld\" font-family=\"Arial;\" text-anchor=\"end\">"
+ "%s</text>\n",
+ x,y,fontsize,text);
+}
+
+/* Attach a freshly allocated coord_t to every node of the subtree (stored in
+   node->data) and fill in its x component in pre-order; y is left at 0.
+
+   The root is aligned with the left margin. Every other node is placed at
+   its parent's x plus its own branch length multiplied by the pixel scaler. */
+static void rtree_set_xcoord(rtree_t * node)
+{
+  double xpos;
+
+  if (node->parent)
+    xpos = node->length * scaler + ((coord_t *)(node->parent->data))->x;
+  else
+    xpos = opt_svg_marginleft;
+
+  node->data = (void *)create_coord(xpos, 0);
+
+  /* descend only below inner nodes; the parent's x is already in place */
+  if (node->left)
+  {
+    rtree_set_xcoord(node->left);
+    rtree_set_xcoord(node->right);
+  }
+}
+
+/* Select the colour used to draw a node.
+
+   With opt_mcmc set, the colour is a newly allocated "rgb(...)" string
+   computed from node->support and must be released by the caller with
+   free(). Otherwise one of the two static event colours is returned and must
+   not be freed. The pointer starts out as NULL so that it is never
+   indeterminate when assertions are compiled out. */
+static char * svg_node_color(rtree_t * node)
+{
+  char * color = NULL;
+
+  if (opt_mcmc)
+  {
+    if (asprintf(&color, "rgb(%f%%,%f%%,%f%%)",
+                 GRADIENT(node->support),
+                 0.0,
+                 0.0) == -1)
+      fatal("Unable to allocate enough memory.");
+  }
+  else if (node->event == EVENT_COALESCENT)
+    color = coalesence_color;
+  else if (node->event == EVENT_SPECIATION)
+    color = speciation_color;
+  else
+    assert(0);
+
+  return color;
+}
+
+/* Draw the decorations of an inner node located at (x,y): the vertical line
+   joining its two children (at heights ly and ry), the circle on the node
+   itself and, in MCMC mode, the support value when it exceeds 0.5. */
+static void svg_inner_node(rtree_t * node,
+                           double x,
+                           double y,
+                           double ly,
+                           double ry,
+                           double stroke_width)
+{
+  char * color = svg_node_color(node);
+
+  svg_line(x, ly, x, ry, color, stroke_width);
+  svg_circle(x, y, opt_svg_inner_radius, color);
+
+  if (!opt_mcmc)
+    return;
+
+  /* in MCMC mode the colour string was allocated by svg_node_color() */
+  free(color);
+
+  if (node->support > 0.5)
+  {
+    char * support;
+
+    if (asprintf(&support, "%.2f", node->support) == -1)
+      fatal("Unable to allocate enough memory.");
+
+    svg_text(x-5,y-5,opt_svg_fontsize,support);
+    free(support);
+  }
+}
+
+/* Plot the subtree rooted at node in post-order.
+
+   Tips are stacked vertically in the order they are visited (tracked by the
+   file-scope counter tip_occ); an inner node is placed midway between its
+   two children. For every non-root node the horizontal branch towards the
+   parent is drawn in the parent's colour and the node's y coordinate is
+   stored in its coord_t so that the parent can read it afterwards. Expects
+   rtree_set_xcoord() to have been run on the tree beforehand, since the x
+   coordinates are read from node->data. */
+static void svg_rtree_plot(rtree_t * node)
+{
+  double y;
+  double stroke_width = 3;
+
+  /* traverse tree in post-order */
+  if (node->left)
+  {
+    svg_rtree_plot(node->left);
+    svg_rtree_plot(node->right);
+  }
+
+  /* any node that has a parent, i.e. any node apart from the root */
+  if (node->parent)
+  {
+    char * branch_color;
+    double x,px;
+
+    x = ((coord_t *)(node->data))->x;
+    px = ((coord_t *)(node->parent->data))->x;
+
+    if (!node->left)
+    {
+      y = tip_occ * opt_svg_tipspace + opt_svg_margintop + legend_spacing;
+      tip_occ++;
+    }
+    else
+    {
+      double ly,ry;
+      ly = ((coord_t *)(node->left->data))->y;
+      ry = ((coord_t *)(node->right->data))->y;
+      y = (ly + ry) / 2.0;
+
+      /* vertical line, circle and optional support value */
+      svg_inner_node(node, x, y, ly, ry, stroke_width);
+    }
+
+    /* the branch towards the parent takes the parent's colour */
+    branch_color = svg_node_color(node->parent);
+
+    /* draw horizontal line */
+    svg_line(px,y,x,y,branch_color,stroke_width);
+    ((coord_t *)(node->data))->y = y;
+
+    if (opt_mcmc)
+      free(branch_color);
+
+    /* if node is a tip then print its label */
+    if (!node->left)
+    {
+      fprintf(svg_fp, "<text x=\"%f\" y=\"%f\" "
+              "font-size=\"%ld\" font-family=\"Arial;\">%s</text>\n",
+              x+5,
+              y+opt_svg_fontsize/3.0,
+              opt_svg_fontsize,
+              node->label);
+    }
+    else
+      fprintf(svg_fp, "\n");
+  }
+  else  /* the root node case */
+  {
+    double ly,ry,x;
+
+    /* a root without children has no connector to draw */
+    if (!node->left)
+      return;
+
+    ly = ((coord_t *)(node->left->data))->y;
+    ry = ((coord_t *)(node->right->data))->y;
+    y = (ly + ry) / 2.0;
+    x = opt_svg_marginleft;
+
+    svg_inner_node(node, x, y, ly, ry, stroke_width);
+  }
+}
+
+/* Compute the pixel scaler used to convert branch lengths to x offsets.
+
+   For every tip the root-to-tip path length is measured (the root's own
+   branch is excluded) and an estimate of the rendered label width is derived
+   from the font size. The scaler is the smallest value of
+   (canvas_width - label width) / path length over all tips, and max_font_len
+   is the label width of the tip that produced it. The longest path
+   encountered is recorded in max_tree_len. */
+static void rtree_scaler_init(rtree_t * root)
+{
+  double len = 0;
+  double label_len;
+  int i;
+
+  /* xmalloc aborts on failure, so the list is never dereferenced as NULL */
+  rtree_t ** node_list = (rtree_t **)xmalloc((size_t)(2 * root->leaves - 1) *
+                                             sizeof(rtree_t *));
+
+  rtree_query_tipnodes(root, node_list);
+
+  /* find longest path to root */
+
+  for (i = 0; i < root->leaves; ++i)
+  {
+    rtree_t * node = node_list[i];
+
+    len = 0;
+    while(node)
+    {
+      len += node->length;
+      node = node->parent;
+    }
+    /* subtract root length */
+    len -= root->length;
+
+    if (len > max_tree_len)
+      max_tree_len = len;
+
+    label_len = (opt_svg_fontsize / 1.5) *
+                (node_list[i]->label ? strlen(node_list[i]->label) : 0);
+
+    /* from here on len holds the candidate scaler for this tip */
+    len = (canvas_width - label_len) / len;
+    if (i == 0 || len < scaler)
+    {
+      scaler = len;
+      max_font_len = label_len;
+    }
+  }
+  free(node_list);
+}
+
+/* Write a complete SVG document for the tree rooted at root to svg_fp:
+   the <svg> header sized from the svg options and the number of leaves, an
+   optional scale legend (opt_svg_showlegend), the tree itself and the
+   closing tag. Also sets the file-scope canvas_width. */
+static void svg_rtree_init(rtree_t * root)
+{
+ long svg_height;
+
+ canvas_width = opt_svg_width - opt_svg_marginleft - opt_svg_marginright;
+
+ /* initialize pixel scaler (scaler) and compute max tree
+ length (max_tree_len) */
+ rtree_scaler_init(root);
+
+ /* one row of opt_svg_tipspace per leaf plus margins and legend spacing */
+ svg_height = opt_svg_margintop + legend_spacing + opt_svg_marginbottom +
+ opt_svg_tipspace * root->leaves;
+
+
+ /* print svg header tag with dimensions and grey border */
+ fprintf(svg_fp, "<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"%ld\" "
+ "height=\"%ld\" style=\"border: 1px solid #cccccc;\">\n",
+ opt_svg_width,
+ svg_height);
+
+ /* draw legend */
+ if (opt_svg_showlegend)
+ {
+ /* legend bar covering opt_svg_legend_ratio of the drawable tree width */
+ svg_line(opt_svg_marginleft,
+ 10,
+ (canvas_width - max_font_len)*opt_svg_legend_ratio +
+ opt_svg_marginleft,
+ 10,
+ speciation_color,
+ 3);
+
+ /* numeric label: the same ratio applied to the longest root-to-tip path */
+ fprintf(svg_fp, "<text x=\"%f\" y=\"%f\" font-size=\"%ld\" "
+ "font-family=\"Arial;\">%.*f</text>\n",
+ (canvas_width - max_font_len)*opt_svg_legend_ratio + opt_svg_marginleft + 5,
+ 20-opt_svg_fontsize/3.0,
+ (long)opt_svg_fontsize, opt_precision, max_tree_len * opt_svg_legend_ratio);
+ }
+
+ /* uncomment to print a dashed border to indicate margins */
+
+ /*
+ fprintf(svg_fp, "<rect x=\"%ld\" y=\"%ld\" width=\"%ld\" fill=\"none\" "
+ "height=\"%ld\" stroke=\"#999999\" stroke-dasharray=\"5,5\" "
+ "stroke-width=\"1\" />\n",
+ opt_svg_marginleft,
+ opt_svg_margintop + legend_spacing,
+ opt_svg_width - opt_svg_marginleft - opt_svg_marginright,
+ svg_height - opt_svg_margintop - legend_spacing - opt_svg_marginbottom);
+ */
+
+ /* x coordinates must exist before plotting, which reads them */
+ rtree_set_xcoord(root);
+
+ svg_rtree_plot(root);
+
+ fprintf(svg_fp, "</svg>\n");
+}
+
+
+/* Entry point for SVG output: announce the file being created (unless
+   opt_quiet is set), open the output stream with open_file_ext(), plot the
+   tree rooted at root and close the stream again. */
+void cmd_svg(rtree_t * root, long seed, const char * ext)
+{
+  /* tips are numbered from zero again for every plot */
+  tip_occ = 0;
+
+  if (!opt_quiet && opt_mcmc)
+    fprintf(stdout,
+            "Creating SVG delimitation file %s.%ld.svg ...\n",
+            opt_outfile,
+            seed);
+  else if (!opt_quiet)
+    fprintf(stdout,
+            "Creating SVG delimitation file %s.svg ...\n",
+            opt_outfile);
+
+  svg_fp = open_file_ext(ext, seed);
+  svg_rtree_init(root);
+  fclose(svg_fp);
+}
diff --git a/src/svg_landscape.c b/src/svg_landscape.c
new file mode 100644
index 0000000..16a6ae9
--- /dev/null
+++ b/src/svg_landscape.c
@@ -0,0 +1,246 @@
+/*
+ Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+
+static char line[LINEALLOC];
+
+static double originx = 133;
+
+static int xtics = 10;
+
+static long canvas_x1 = 130;
+static long canvas_x2 = 730;
+static long canvas_y1 = 10;
+static long canvas_y2 = 360;
+static int radius = 4;
+static int radius_mouseover = 10;
+
+static int color_index = 2;
+
+static char * const color10[] =
+ { "#1f77b4", "#ff7f0e",
+ "#2ca02c", "#d62728",
+ "#9467bd", "#8c564b",
+ "#e377c2", "#7f7f7f",
+ "#bcbd22", "#17becf"
+ };
+
+/* Write the fixed preamble of the log-likelihood plot to svg_fp: the opening
+   <svg> tag, an embedded CSS style sheet, the vertical and horizontal grid
+   lines, and the opening tag of the "surfaces" group. That group is left
+   open; svg_footer() prints the matching closing tag. */
+static void svg_header(FILE * svg_fp)
+{
+
+ fprintf(svg_fp,"<svg class=\"graph\" version=\"1.1\" "
+ "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
+ "xmlns=\"http://www.w3.org/2000/svg\">\n");
+ fprintf(svg_fp,"<style type=\"text/css\">\n");
+ fprintf(svg_fp,"<![CDATA[\n"
+ "svg.graph {\n"
+ " height: 500px;\n"
+ " width: 800px;\n"
+ " background: #f8f8f8;\n"
+ "}\n\n"
+ "svg.graph .grid {\n"
+ " stroke: #e5e5e5;\n"
+ " stroke-width: 1;\n"
+ "}\n\n"
+ "svg.graph .points {\n"
+ " stroke: white;\n"
+ " stroke-width: 3;\n"
+ "}\n\n"
+ "svg.graph .first_set {\n"
+ " fill: #00554d;\n"
+ "}\n\n"
+ "svg.graph .first_set_bar {\n"
+ " fill: #00554d;\n"
+ " stroke: #000000;\n"
+ "}\n\n"
+ "svg.graph .surfaces {\n"
+ " fill-opacity: 0.5;\n"
+ "}\n\n"
+ "svg.graph .grid.double {\n"
+ " stroke-opacity: 0.4;\n"
+ "}\n\n"
+ "svg.graph .labels {\n"
+ " font-family: Arial;\n"
+ " font-size: 12px;\n"
+ " kerning: 1;\n"
+ "}\n"
+ "svg.graph .labels.x-labels {\n"
+ " text-anchor: end;\n"
+ "}\n"
+ "svg.graph .labels.y-labels {\n"
+ " text-anchor: end;\n"
+ "}\n"
+ "]]>\n</style>\n");
+
+ /* print the vertical grid lines (hard-coded, 60 units apart, x = 130..730) */
+ fprintf(svg_fp, "<g class=\"grid x-grid\" id=\"xGrid\">\n"
+ " <line x1=\"130\" x2=\"130\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"190\" x2=\"190\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"250\" x2=\"250\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"310\" x2=\"310\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"370\" x2=\"370\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"430\" x2=\"430\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"490\" x2=\"490\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"550\" x2=\"550\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"610\" x2=\"610\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"670\" x2=\"670\" y1=\"10\" y2=\"380\"></line>\n"
+ " <line x1=\"730\" x2=\"730\" y1=\"10\" y2=\"380\"></line>\n"
+ "</g>\n");
+
+ /* print the horizontal grid lines (hard-coded, y = 10..360) */
+ fprintf(svg_fp, "<g class=\"grid y-grid\" id=\"yGrid\">\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"10\" y2=\"10\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"68\" y2=\"68\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"126\" y2=\"126\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"185\" y2=\"185\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"243\" y2=\"243\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"301\" y2=\"301\"></line>\n"
+ " <line x1=\"103\" x2=\"730\" y1=\"360\" y2=\"360\"></line>\n"
+ "</g>\n");
+ /* data points written by out_svg() go into this group */
+ fprintf(svg_fp, "<g class=\"surfaces\">\n");
+}
+
+/* Plot the samples of one run into the log-likelihood landscape.
+
+   Reads the file "<opt_outfile>.<seed>.log" line by line; every line is
+   expected to hold "<logl>,<species>". Each successfully parsed sample
+   becomes an animated <circle> written to svg_fp, with x derived from the
+   sample index and y from the log-likelihood scaled to the canvas. Lines
+   that do not parse are skipped and do not advance the sample index, so no
+   point is ever drawn from an unset log-likelihood value. */
+static void out_svg(FILE * svg_fp, double min_logl, double max_logl, long seed)
+{
+
+  /* the value range is stretched by 10% so the maximum stays below the top */
+  double scale = (max_logl - min_logl) * 1.1;
+
+  /* open data points file */
+  char * filename;
+  if (asprintf(&filename, "%s.%ld.%s", opt_outfile, seed, "log") == -1)
+    fatal("Unable to allocate enough memory.");
+  FILE * fp = xopen(filename,"r");
+  free(filename);
+
+  /* read and print data points to svg */
+  int i = 0;
+  while (fgets(line,LINEALLOC,fp))
+  {
+    double x,y;
+    double logl;
+    int species;
+
+    /* both fields must be converted, otherwise logl would be indeterminate */
+    if (sscanf(line,"%lf,%d\n",&logl,&species) != 2)
+      continue;
+
+    /* compute x point */
+    x = ((i*opt_mcmc_sample)/(double)(opt_mcmc_steps-opt_mcmc_burnin)) *
+        (canvas_x2 - canvas_x1) + canvas_x1;
+
+    /* compute y point */
+    y = (1 - (logl-min_logl)/scale) *
+        (canvas_y2-canvas_y1) +
+        canvas_y1;
+
+    /* print point */
+    fprintf(svg_fp,
+            "<circle cx=\"%f\" cy=\"%f\" r=\"%d\" fill=\"%s\" stroke=\"%s\" fill-opacity=\".5\" >\n"
+            "<animate attributeName=\"r\" begin=\"mouseover\" dur=\"0.2\" fill=\"freeze\" from=\"%d\" to=\"%d\" />\n"
+            "<animate attributeName=\"fill-opacity\" begin=\"mouseover\" dur=\"0.2\" fill=\"freeze\" from=\".5\" to=\"1\" />\n"
+            "<animate attributeName=\"r\" begin=\"mouseout\" dur=\"0.2\" fill=\"freeze\" to=\"%d\" />\n"
+            "<animate attributeName=\"fill-opacity\" begin=\"mouseout\" dur=\"0.2\" fill=\"freeze\" to=\".5\" />\n"
+            "</circle>\n",
+            x, y, radius, color10[color_index], color10[color_index], radius, radius_mouseover, radius);
+
+    ++i;
+
+  }
+  fclose(fp);
+}
+
+/* Finish the log-likelihood plot on svg_fp: close the group opened by
+   svg_header(), re-reference both grids so that they are drawn on top of the
+   data points, print the axis labels and close the <svg> element.
+   min_logl and max_logl are used to compute the y-axis label values. */
+static void svg_footer(FILE * svg_fp, double min_logl, double max_logl)
+{
+ /* same 10% stretch of the value range as used when plotting the points */
+ double scale = (max_logl - min_logl) * 1.1;
+ int i;
+
+ fprintf(svg_fp, "</g>\n");
+ /* bring gridlines to front */
+ fprintf(svg_fp,"<use class=\"grid double\" xlink:href=\"#xGrid\" style=\"\"></use>\n");
+ fprintf(svg_fp,"<use class=\"grid double\" xlink:href=\"#yGrid\" style=\"\"></use>\n");
+
+ /* x labels: the burn-in value at the origin, then xtics evenly spaced
+ labels up to opt_mcmc_steps */
+ fprintf(svg_fp, "<g class=\"labels x-labels\">\n");
+ fprintf(svg_fp, "<text transform=\"translate(%f,400)rotate(270)\">%ld</text>\n",
+ originx,
+ opt_mcmc_burnin);
+ for (i = 0; i < xtics; ++i)
+ {
+ fprintf(svg_fp,
+ "<text transform=\"translate(%f,400)rotate(270)\">%ld</text>\n",
+ originx + (i+1)*((canvas_x2 - canvas_x1)/(double)xtics),
+ (long)((i+1)*((opt_mcmc_steps-opt_mcmc_burnin)/(double)xtics)) +
+ opt_mcmc_burnin);
+ }
+ fprintf(svg_fp, "</g>\n");
+
+ /* y labels: seven values from min_logl + scale (top) down to min_logl */
+ fprintf(svg_fp, "<g class=\"labels y-labels\">\n");
+ fprintf(svg_fp, " <text x=\"100\" y=\"15\">%.3f</text>\n", min_logl + scale);
+ fprintf(svg_fp, " <text x=\"100\" y=\"73\">%.3f</text>\n", min_logl + 5*(scale)/6);
+ fprintf(svg_fp, " <text x=\"100\" y=\"131\">%.3f</text>\n", min_logl + 4*(scale)/6);
+ fprintf(svg_fp, " <text x=\"100\" y=\"190\">%.3f</text>\n", min_logl + 3*(scale)/6);
+ fprintf(svg_fp, " <text x=\"100\" y=\"248\">%.3f</text>\n", min_logl + 2*(scale)/6);
+ fprintf(svg_fp, " <text x=\"100\" y=\"307\">%.3f</text>\n", min_logl + scale/6);
+ fprintf(svg_fp, " <text x=\"100\" y=\"365\">%.3f</text>\n", min_logl);
+ fprintf(svg_fp, "</g>\n");
+
+ fprintf(svg_fp,"</svg>\n");
+}
+
+/* Create the log-likelihood plot of a single run: opens the output stream
+   with open_file_ext("logl.svg", seed) and writes header, the samples of the
+   run identified by seed, and footer. The points are drawn in the colour
+   currently selected by the file-scope color_index. */
+void svg_landscape(double mcmc_min_logl, double mcmc_max_logl, long seed)
+{
+ FILE * svg_fp = open_file_ext("logl.svg", seed);
+ if (!opt_quiet)
+ fprintf(stdout,
+ "Creating log-likelihood visualization in %s.%ld.logl.svg ...\n",
+ opt_outfile, seed);
+
+ svg_header(svg_fp);
+ out_svg(svg_fp, mcmc_min_logl, mcmc_max_logl, seed);
+ svg_footer(svg_fp, mcmc_min_logl, mcmc_max_logl);
+
+ fclose(svg_fp);
+}
+
+/* Create one log-likelihood plot containing the samples of several runs.
+   seed points to an array of `runs` seeds, each identifying the log file of
+   one run; the output stream is opened with opt_seed. Runs are coloured by
+   cycling through the ten entries of color10. */
+void svg_landscape_combined(double mcmc_min_logl,
+ double mcmc_max_logl,
+ long runs,
+ long *seed)
+{
+ long i;
+ FILE * svg_fp = open_file_ext("logl.svg", opt_seed);
+ if (!opt_quiet)
+ fprintf(stdout,
+ "Overall log-likelihood visualization in %s.%ld.logl.svg ...\n",
+ opt_outfile, opt_seed);
+
+ svg_header(svg_fp);
+
+ for (i = 0; i < runs; ++i)
+ {
+ /* color_index is file-scope state and is not restored afterwards, so it
+ keeps the value chosen for the last run */
+ color_index = i % 10;
+ out_svg(svg_fp, mcmc_min_logl, mcmc_max_logl, seed[i]);
+ }
+
+ svg_footer(svg_fp, mcmc_min_logl, mcmc_max_logl);
+
+ fclose(svg_fp);
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..b195b51
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,179 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+static const char * progress_prompt;
+static unsigned long progress_next;
+static unsigned long progress_size;
+static unsigned long progress_chunk;
+static const unsigned long progress_granularity = 200;
+
+/* Print a printf-style formatted message followed by a newline to stderr and
+   terminate the program with exit status 1. Does not return. */
+void fatal(const char * format, ...)
+{
+ va_list argptr;
+ va_start(argptr, format);
+ vfprintf(stderr, format, argptr);
+ va_end(argptr);
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+/* Start a progress indicator on stderr for a task of `size` steps, printing
+   the prompt with an initial 0%. Nothing is recorded or printed when
+   opt_quiet is set. The update interval is size / progress_granularity, but
+   never less than one step. */
+void progress_init(const char * prompt, unsigned long size)
+{
+  if (opt_quiet)
+    return;
+
+  progress_prompt = prompt;
+  progress_size = size;
+
+  if (size < progress_granularity)
+    progress_chunk = 1;
+  else
+    progress_chunk = size / progress_granularity;
+
+  progress_next = 0;
+  fprintf(stderr, "%s %.0f%%", prompt, 0.0);
+}
+
+/* Report that `progress` steps have been completed. The percentage on stderr
+   is only redrawn once the next update threshold has been reached, and
+   never when opt_quiet is set. */
+void progress_update(unsigned int progress)
+{
+  if (opt_quiet)
+    return;
+
+  if (progress < progress_next)
+    return;
+
+  fprintf(stderr, " \r%s %.0f%%", progress_prompt,
+          100.0 * progress / progress_size);
+  progress_next = progress + progress_chunk;
+}
+
+/* Finish the progress indicator by printing the prompt with 100% and a
+   newline to stderr; does nothing when opt_quiet is set. */
+void progress_done()
+{
+ if (!opt_quiet)
+ fprintf(stderr, " \r%s %.0f%%\n", progress_prompt, 100.0);
+}
+
+#if 0
+/* Disabled variant: allocate `size` bytes aligned to 16 bytes, aborting via
+   fatal() on failure. */
+void * xmalloc(size_t size)
+{
+  const size_t alignment = 16;
+  void * t = NULL;
+
+  /* posix_memalign reports failure with a non-zero error number, not -1 */
+  if (posix_memalign(& t, alignment, size) != 0)
+    fatal("Unable to allocate enough memory.");
+
+  if (!t)
+    fatal("Unable to allocate enough memory.");
+
+  return t;
+}
+#else
+/* Allocate `size` bytes with malloc(); on failure the program is terminated
+   through fatal(), so callers never receive NULL from a failed allocation. */
+void * xmalloc(size_t size)
+{
+  void * t;
+  t = malloc(size);
+  if (!t)
+    fatal("Unable to allocate enough memory.");
+
+  return t;
+}
+#endif
+
+/* calloc() wrapper: returns zero-initialized storage for nmemb elements of
+   `size` bytes each, terminating the program through fatal() when calloc()
+   returns NULL. */
+void * xcalloc(size_t nmemb, size_t size)
+{
+  void * block = calloc(nmemb,size);
+
+  if (block == NULL)
+    fatal("Unable to allocate enough memory.");
+
+  return block;
+}
+
+/* realloc() wrapper: resizes ptr to `size` bytes and returns the resulting
+   pointer, terminating the program through fatal() when realloc() returns
+   NULL. */
+void * xrealloc(void *ptr, size_t size)
+{
+  void * resized;
+
+  resized = realloc(ptr, size);
+  if (resized == NULL)
+    fatal("Unable to allocate enough memory.");
+
+  return resized;
+}
+
+/* Return a pointer to the first occurrence of character c in s, or to the
+   terminating NUL byte of s when c does not occur. */
+char * xstrchrnul(char *s, int c)
+{
+  const char wanted = (char)c;
+  char * cursor = s;
+
+  /* stop at the first match or at the terminator, whichever comes first */
+  while (*cursor != '\0' && *cursor != wanted)
+    ++cursor;
+
+  return cursor;
+}
+
+/* Return a copy of the NUL-terminated string s in storage obtained from
+   xmalloc(); the caller releases it with free(). */
+char * xstrdup(const char * s)
+{
+  size_t bytes = strlen(s) + 1;   /* characters plus terminator */
+  char * copy = (char *)xmalloc(bytes);
+
+  memcpy(copy, s, bytes);
+
+  return copy;
+}
+
+/* Return a NUL-terminated copy of at most len characters of s in a buffer of
+   len+1 bytes obtained from xmalloc(). When s is shorter than len, the rest
+   of the buffer is filled with NUL bytes. */
+char * xstrndup(const char * s, size_t len)
+{
+  char * copy = (char *)xmalloc(len+1);
+  size_t pos = 0;
+
+  /* copy characters until len is reached or s ends */
+  for (; pos < len && s[pos] != '\0'; ++pos)
+    copy[pos] = s[pos];
+
+  /* zero the remainder, including the terminator at index len */
+  for (; pos <= len; ++pos)
+    copy[pos] = '\0';
+
+  return copy;
+}
+
+/* Return the current time of day from gettimeofday() expressed in
+   microseconds, or 0 when gettimeofday() fails. */
+long getusec(void)
+{
+  struct timeval now;
+
+  if (gettimeofday(&now,0) != 0)
+    return 0;
+
+  return now.tv_sec * 1000000 + now.tv_usec;
+}
+
+/* Print the user and system CPU time and the maximum resident set size of
+   the current process, as reported by getrusage(RUSAGE_SELF), to stderr. */
+void show_rusage()
+{
+ struct rusage r_usage;
+ getrusage(RUSAGE_SELF, & r_usage);
+
+ /* seconds plus microseconds, combined into fractional seconds */
+ fprintf(stderr, "Time: %.3fs (user)", r_usage.ru_utime.tv_sec * 1.0 + (double) r_usage.ru_utime.tv_usec * 1.0e-6);
+ fprintf(stderr, " %.3fs (sys)", r_usage.ru_stime.tv_sec * 1.0 + r_usage.ru_stime.tv_usec * 1.0e-6);
+
+#if defined __APPLE__
+ /* Mac: ru_maxrss gives the size in bytes */
+ fprintf(stderr, " Memory: %.0fMB\n", r_usage.ru_maxrss * 1.0e-6);
+#else
+ /* Linux: ru_maxrss gives the size in kilobytes */
+ fprintf(stderr, " Memory: %.0fMB\n", r_usage.ru_maxrss * 1.0e-3);
+#endif
+}
+
+/* fopen() wrapper: open `filename` with the given mode and return the
+   stream. When the file cannot be opened the program is terminated through
+   fatal(), naming the file that was actually requested. */
+FILE * xopen(const char * filename, const char * mode)
+{
+  FILE * out = fopen(filename, mode);
+  if (!out)
+    fatal("Cannot open file %s", filename);
+
+  return out;
+}
+
+/* Fill the three-element state array rstate from seedval: element 0 gets
+   the constant 0x330e, element 1 the low 16 bits of the seed and element 2
+   the seed shifted right by 16 bits (truncated to unsigned short). */
+void random_init(unsigned short * rstate, long seedval)
+{
+  /* emulate drand48() */
+  rstate[2] = seedval >> 16;
+  rstate[1] = seedval & 0xffffl;
+  rstate[0] = 0x330e;
+}
diff --git a/src/utree.c b/src/utree.c
new file mode 100644
index 0000000..6fb6191
--- /dev/null
+++ b/src/utree.c
@@ -0,0 +1,614 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+static int indend_space = 4;
+
+/* Print the label and branch length of a node, followed by a newline.
+   Nodes without a label are printed with an empty label instead of passing
+   a NULL pointer to printf's %s conversion. */
+static void print_node_info(utree_t * tree)
+{
+  printf (" %s", tree->label ? tree->label : "");
+  printf (" %f", tree->length);
+  printf("\n");
+}
+
+/* Print the subtree reached through `tree` as ASCII art, one node per row
+   preceded by a spacer row.
+
+   indend_level is the depth of the node being printed. active_node_order
+   holds one state per depth and controls whether a vertical bar is drawn in
+   that column: non-zero draws a bar, 0 draws a blank. The value 2 is set
+   just before a node's second subtree is printed and is reset to 0 once
+   that subtree's first row has been emitted, so no bar is continued below
+   the last child. */
+static void print_tree_recurse(utree_t * tree,
+ int indend_level,
+ int * active_node_order)
+{
+ int i,j;
+
+ if (!tree) return;
+
+ /* spacer row: bars for all columns that are still active */
+ for (i = 0; i < indend_level; ++i)
+ {
+ if (active_node_order[i])
+ printf("|");
+ else
+ printf(" ");
+
+ for (j = 0; j < indend_space-1; ++j)
+ printf(" ");
+ }
+ printf("\n");
+
+ /* node row: bars for the enclosing columns only */
+ for (i = 0; i < indend_level-1; ++i)
+ {
+ if (active_node_order[i])
+ printf("|");
+ else
+ printf(" ");
+
+ for (j = 0; j < indend_space-1; ++j)
+ printf(" ");
+ }
+
+ /* connector of this node; inner nodes get a second '+' to branch from */
+ printf("+");
+ for (j = 0; j < indend_space-1; ++j)
+ printf ("-");
+ if (tree->next) printf("+");
+
+ print_node_info(tree);
+
+ /* this node was the last child at its depth: stop drawing that column */
+ if (active_node_order[indend_level-1] == 2)
+ active_node_order[indend_level-1] = 0;
+
+ if (tree->next)
+ {
+ active_node_order[indend_level] = 1;
+ print_tree_recurse(tree->next->back,
+ indend_level+1,
+ active_node_order);
+ active_node_order[indend_level] = 2;
+ print_tree_recurse(tree->next->next->back,
+ indend_level+1,
+ active_node_order);
+ }
+
+}
+
+/* Return the deepest indentation level found in the subtree reached through
+   `tree`, where `indend` is the level of the node itself; a tip counts one
+   level more than its own. */
+static int tree_indend_level(utree_t * tree, int indend)
+{
+  int first_depth;
+  int second_depth;
+
+  if (!tree->next)
+    return indend+1;
+
+  first_depth = tree_indend_level(tree->next->back, indend+1);
+  second_depth = tree_indend_level(tree->next->next->back, indend+1);
+
+  if (first_depth > second_depth)
+    return first_depth;
+
+  return second_depth;
+}
+
+/* Print an ASCII rendering of the unrooted tree to stdout. `tree` is
+   dereferenced as an inner node (tree->next is followed), and its three
+   neighbouring subtrees are printed one after the other. */
+void utree_show_ascii(utree_t * tree)
+{
+  int a, b;
+
+  /* the deepest level on either side bounds the number of state slots */
+  a = tree_indend_level(tree->back,1);
+  b = tree_indend_level(tree,0);
+  int max_indend_level = (a > b ? a : b);
+
+  /* xmalloc aborts on failure, so the writes below never hit a NULL pointer */
+  int * active_node_order = (int *)xmalloc((size_t)(max_indend_level+1) *
+                                           sizeof(int));
+  active_node_order[0] = 1;
+  active_node_order[1] = 1;
+
+  print_tree_recurse(tree->back, 1, active_node_order);
+  print_tree_recurse(tree->next->back, 1, active_node_order);
+  /* the third subtree is the last one: mark the first column as ending */
+  active_node_order[0] = 2;
+  print_tree_recurse(tree->next->next->back, 1, active_node_order);
+  free(active_node_order);
+}
+
+/* Build the newick string of the subtree reached through `root`. A tip is
+   rendered as "label:length", an inner node as "(sub1,sub2)label:length"
+   with an empty label when none is set. The returned string is allocated by
+   asprintf() and must be freed by the caller. */
+static char * newick_utree_recurse(utree_t * root)
+{
+  char * newick;
+  char * first;
+  char * second;
+
+  /* tip: nothing to recurse into */
+  if (!root->next)
+  {
+    if (asprintf(&newick, "%s:%f", root->label, root->length) == -1)
+      fatal("Unable to allocate enough memory.");
+    return newick;
+  }
+
+  first = newick_utree_recurse(root->next->back);
+  second = newick_utree_recurse(root->next->next->back);
+
+  if (asprintf(&newick, "(%s,%s)%s:%f", first,
+                                        second,
+                                        root->label ? root->label : "",
+                                        root->length) == -1)
+    fatal("Unable to allocate enough memory.");
+
+  /* the child strings have been copied into the result */
+  free(first);
+  free(second);
+
+  return newick;
+}
+
+/* Export an unrooted tree as a newick string with a trifurcation at the top
+   level: "(sub1,sub2,sub3)label:length;". `root` is dereferenced as an inner
+   node. Returns NULL when root is NULL; otherwise the string is allocated
+   by asprintf() and must be freed by the caller. */
+char * utree_export_newick(utree_t * root)
+{
+ char * newick;
+
+ if (!root) return NULL;
+
+ /* the three subtrees hanging off the three directions of the node */
+ char * subtree1 = newick_utree_recurse(root->back);
+ char * subtree2 = newick_utree_recurse(root->next->back);
+ char * subtree3 = newick_utree_recurse(root->next->next->back);
+
+ if (asprintf(&newick, "(%s,%s,%s)%s:%f;", subtree1,
+ subtree2,
+ subtree3,
+ root->label ? root->label : "",
+ root->length) == -1)
+ fatal("Unable to allocate enough memory.");
+ free(subtree1);
+ free(subtree2);
+ free(subtree3);
+
+ return (newick);
+
+}
+
+/* Collect the nodes of the subtree reached through `node` in post-order,
+   appending them to outbuffer and advancing *index. The callback is invoked
+   once per visited node before its children are considered; when it returns
+   zero the node is not stored and, for inner nodes, its subtree is skipped
+   entirely. */
+static void utree_traverse_recursive(utree_t * node,
+                                     int (*cbtrav)(utree_t *),
+                                     int * index,
+                                     utree_t ** outbuffer)
+{
+  /* a rejected node is neither stored nor descended into */
+  if (!cbtrav(node))
+    return;
+
+  if (node->next)
+  {
+    utree_traverse_recursive(node->next->back, cbtrav, index, outbuffer);
+    utree_traverse_recursive(node->next->next->back, cbtrav, index, outbuffer);
+  }
+
+  outbuffer[(*index)++] = node;
+}
+
+/* Traverse an unrooted tree starting at the inner node `root`, storing the
+   accepted nodes in post-order in outbuffer. Returns the number of nodes
+   stored, or -1 when root is a tip. */
+int utree_traverse(utree_t * root,
+ int (*cbtrav)(utree_t *),
+ utree_t ** outbuffer)
+{
+ int index = 0;
+
+ if (!root->next) return -1;
+
+ /* we will traverse an unrooted tree in the following way
+
+ 2
+ /
+ 1 --*
+ \
+ 3
+
+ at each node the callback function is called to decide whether we
+ are going to traverse the subtree rooted at the specific node */
+
+ utree_traverse_recursive(root->back, cbtrav, &index, outbuffer);
+ utree_traverse_recursive(root, cbtrav, &index, outbuffer);
+
+ return index;
+}
+
+/* Visit every node of the subtree reached through `node` in post-order.
+   Unlike utree_traverse_recursive(), the whole subtree is always walked; the
+   callback is invoked after the children have been processed and only
+   decides whether the node is appended to outbuffer. */
+static void utree_traverse_postorder_recursive(utree_t * node,
+                                               int (*cbtrav)(utree_t *),
+                                               int * index,
+                                               utree_t ** outbuffer)
+{
+  /* inner node: children first */
+  if (node->next)
+  {
+    utree_traverse_postorder_recursive(node->next->back, cbtrav, index, outbuffer);
+    utree_traverse_postorder_recursive(node->next->next->back, cbtrav, index, outbuffer);
+  }
+
+  if (!cbtrav(node))
+    return;
+
+  outbuffer[*index] = node;
+  *index = *index + 1;
+}
+
+/* Traversal callback that propagates marks upwards. Tips are rejected
+   (returns 0). An inner node is marked when either of the nodes across its
+   two other directions is marked, those marks are also copied onto the two
+   other records of the node, and the node's resulting mark is returned. */
+static int cb_outgroup(utree_t * node)
+{
+  utree_t * first;
+  utree_t * second;
+
+  /* if it's a tip */
+  if (!node->next)
+    return 0;
+
+  /* if inner node */
+  first = node->next;
+  second = node->next->next;
+
+  node->mark = (first->back->mark == 1 || second->back->mark == 1) ? 1 : 0;
+
+  first->mark = first->back->mark;
+  second->mark = second->back->mark;
+
+  return node->mark;
+}
+
+/* Walk the complete unrooted tree in post-order starting at the inner node
+   `root`, storing every node accepted by the callback in outbuffer. Returns
+   the number of nodes stored, or -1 when root is a tip. */
+static int utree_traverse_postorder(utree_t * root,
+ int (*cbtrav)(utree_t *),
+ utree_t ** outbuffer)
+{
+ int index = 0;
+
+ if (!root->next) return -1;
+
+ /* we will traverse an unrooted tree in the following way
+
+ 2
+ /
+ 1 --*
+ \
+ 3
+
+ every node is visited; the callback function is called on a node after
+ its subtree has been processed and decides only whether that node is
+ stored in the output buffer */
+
+ utree_traverse_postorder_recursive(root->back, cbtrav, &index, outbuffer);
+ utree_traverse_postorder_recursive(root, cbtrav, &index, outbuffer);
+
+ return index;
+}
+
+
+
+/* Append every tip of the subtree reached through `node` to node_list, in
+   the order the tips are encountered, advancing *index for each one. */
+static void utree_query_tipnodes_recursive(utree_t * node,
+                                           utree_t ** node_list,
+                                           int * index)
+{
+  if (node->next)
+  {
+    /* inner node: only its two subtrees can contribute tips */
+    utree_query_tipnodes_recursive(node->next->back, node_list, index);
+    utree_query_tipnodes_recursive(node->next->next->back, node_list, index);
+    return;
+  }
+
+  node_list[(*index)++] = node;
+}
+
+/* Store all tips of the unrooted tree in node_list and return how many were
+   stored (0 when root is NULL). root may be a tip, in which case the
+   traversal starts from the node at its back. The caller provides a
+   node_list large enough to hold every tip. */
+int utree_query_tipnodes(utree_t * root,
+ utree_t ** node_list)
+{
+ int index = 0;
+
+ if (!root) return 0;
+
+ /* start from an inner node so that all three directions can be followed */
+ if (!root->next) root = root->back;
+
+ utree_query_tipnodes_recursive(root->back, node_list, &index);
+
+ utree_query_tipnodes_recursive(root->next->back, node_list, &index);
+ utree_query_tipnodes_recursive(root->next->next->back, node_list, &index);
+
+ return index;
+}
+
+/* Append every inner node of the subtree reached through `node` to
+   node_list in post-order, advancing *index for each one; tips are
+   ignored. */
+static void utree_query_innernodes_recursive(utree_t * node,
+                                             utree_t ** node_list,
+                                             int * index)
+{
+  if (node->next)
+  {
+    /* postorder traversal: both subtrees before the node itself */
+    utree_query_innernodes_recursive(node->next->back, node_list, index);
+    utree_query_innernodes_recursive(node->next->next->back, node_list, index);
+
+    node_list[(*index)++] = node;
+  }
+}
+
+/* Store all inner nodes of the unrooted tree in node_list in post-order,
+   with the starting inner node last, and return how many were stored (0
+   when root is NULL). root may be a tip, in which case the traversal starts
+   from the node at its back. The caller provides a node_list large enough
+   to hold every inner node. */
+int utree_query_innernodes(utree_t * root,
+ utree_t ** node_list)
+{
+ int index = 0;
+
+ if (!root) return 0;
+ /* start from an inner node so that all three directions can be followed */
+ if (!root->next) root = root->back;
+
+ utree_query_innernodes_recursive(root->back, node_list, &index);
+
+ utree_query_innernodes_recursive(root->next->back, node_list, &index);
+ utree_query_innernodes_recursive(root->next->next->back, node_list, &index);
+
+ /* the starting node itself is not reached by the recursion above */
+ node_list[index++] = root;
+
+ return index;
+}
+
+/* Recursively clone the subtree reached through `unode` into newly
+   allocated rooted-tree nodes. Labels are duplicated, branch lengths copied
+   and every node starts as EVENT_COALESCENT. The parent pointer of the
+   returned node is left as zeroed by xcalloc(); callers link it
+   themselves. */
+static rtree_t * utree_rtree(utree_t * unode)
+{
+  rtree_t * clone = (rtree_t *)xcalloc(1,sizeof(rtree_t));
+
+  clone->event = EVENT_COALESCENT;
+  clone->label = unode->label ? xstrdup(unode->label) : NULL;
+  clone->length = unode->length;
+  clone->data = NULL;
+  clone->mark = 0;
+  clone->left = NULL;
+  clone->right = NULL;
+
+  /* inner node: clone both subtrees and hook them up */
+  if (unode->next)
+  {
+    clone->left = utree_rtree(unode->next->back);
+    clone->right = utree_rtree(unode->next->next->back);
+
+    clone->left->parent = clone;
+    clone->right->parent = clone;
+  }
+
+  return clone;
+}
+
+/* Return the tip with the longest branch among the tips of the tree. When
+   several tips share the maximum, the first one encountered wins; when no
+   tip has a positive branch length the first tip in the list is returned.
+   tip_count is used to size the temporary tip list. */
+utree_t * utree_longest_branchtip(utree_t * node, unsigned int tip_count)
+{
+  unsigned int best = 0;
+  unsigned int pos = 0;
+  double longest = 0;
+  utree_t * winner = NULL;
+
+  /* query tip nodes */
+  utree_t ** tips = (utree_t **)xcalloc(1,(size_t)tip_count * sizeof(utree_t *));
+  utree_query_tipnodes(node, tips);
+
+  while (pos < tip_count)
+  {
+    if (tips[pos]->length > longest)
+    {
+      longest = tips[pos]->length;
+      best = pos;
+    }
+    ++pos;
+  }
+
+  winner = tips[best];
+
+  free(tips);
+
+  return winner;
+}
+
+/* Build a rooted tree from the part of the unrooted tree that lies on the
+   far side of lca->back: a new root node is created and the two subtrees
+   hanging off lca->back are cloned as its children. Returns NULL when
+   lca->back is a tip. The unrooted tree itself is not modified. */
+rtree_t * utree_crop(utree_t * lca)
+{
+ /* is the back of the lca a tip? */
+ if (!lca->back->next)
+ return NULL;
+
+ rtree_t * root = (rtree_t *)xcalloc(1,sizeof(rtree_t));
+
+ /* clone the two subtrees */
+ root->left = utree_rtree(lca->back->next->back);
+ root->right = utree_rtree(lca->back->next->next->back);
+
+ /* NOTE(review): root->event is not assigned here and keeps the zero value
+ from xcalloc, whereas utree_convert_rtree sets EVENT_COALESCENT
+ explicitly - confirm this difference is intended */
+ root->parent = NULL;
+ root->length = 0;
+ root->label = NULL;
+ root->data = NULL;
+ root->mark = 0;
+
+ root->left->parent = root;
+ root->right->parent = root;
+
+ rtree_reset_info(root);
+
+ return root;
+}
+
+/* Convert an unrooted tree into a rooted one by placing a new root on the
+   branch between `outgroup` and outgroup->back. Both sides of that branch
+   are cloned as the children of the root and each child receives half of
+   the length it was cloned with. The unrooted tree itself is not
+   modified. */
+rtree_t * utree_convert_rtree(utree_t * outgroup)
+{
+ rtree_t * root = (rtree_t *)xcalloc(1,sizeof(rtree_t));
+ root->left = utree_rtree(outgroup);
+ root->right = utree_rtree(outgroup->back);
+
+ root->left->parent = root;
+ root->right->parent = root;
+
+ /* split the rooted branch between the two children */
+ root->left->length /= 2;
+ root->right->length /= 2;
+
+ root->label = NULL;
+ root->length = 0;
+ root->parent = NULL;
+ root->event = EVENT_COALESCENT;
+ root->data = NULL;
+ root->mark = 0;
+
+ /* reset per-node leaves and valid edges */
+ rtree_reset_info(root);
+
+ return root;
+
+}
+
+/* Translate a comma-separated list of tip labels into the corresponding tip
+   nodes of the unrooted tree.
+
+   root            : any node of the unrooted tree
+   tipstring       : comma-separated tip labels, e.g. "A,B,C"
+   utree_tip_count : number of tips in the tree (sizes the lookup tables)
+   tiplist_count   : output, number of entries in the returned list
+
+   Returns a newly allocated array of tip node pointers in the order the
+   labels appear in `tipstring`; the caller frees the array (not the nodes).
+   Calls fatal() if the list is empty, contains an empty item (leading,
+   doubled or trailing comma), or names a label that is not in the tree.
+
+   Uses the process-wide hcreate()/hsearch() table, so no other such table
+   may be active while this function runs. */
+static utree_t ** utree_tipstring_nodes(utree_t * root,
+                                        char * tipstring,
+                                        unsigned int utree_tip_count,
+                                        unsigned int * tiplist_count)
+{
+  unsigned int i;
+  unsigned int k = 0;
+  unsigned int commas_count = 0;
+  char * s;
+
+  /* count separators in one pass; this gives an upper bound of
+     commas_count + 1 items in the list */
+  for (s = tipstring; *s; ++s)
+    if (*s == ',')
+      commas_count++;
+
+  utree_t ** node_list = (utree_t **)xcalloc(1,(size_t)utree_tip_count *
+                                               sizeof(utree_t *));
+  utree_query_tipnodes(root, node_list);
+
+  utree_t ** out_node_list = (utree_t **)xcalloc(1,((size_t)commas_count+1) *
+                                                   sizeof(utree_t *));
+
+  /* create a hashtable of tip labels */
+  if (!hcreate(2 * (size_t)utree_tip_count))
+    fatal("Cannot create hash table of tip labels");
+
+  for (i = 0; i < utree_tip_count; ++i)
+  {
+    ENTRY entry;
+    entry.key  = node_list[i]->label;
+    entry.data = node_list[i];
+    hsearch(entry,ENTER);
+  }
+
+  s = tipstring;
+  while (*s)
+  {
+    /* get next tip */
+    size_t taxon_len = strcspn(s, ",");
+    if (!taxon_len)
+      fatal("Erroneous prune list format (double comma)/taxon missing");
+
+    char * taxon = xstrndup(s, taxon_len);
+
+    /* search tip in hash table */
+    ENTRY query;
+    query.key  = taxon;
+    query.data = NULL;
+    ENTRY * found = hsearch(query,FIND);
+
+    if (!found)
+      fatal("Taxon %s does not appear in the tree", taxon);
+
+    /* store pointer in output list */
+    out_node_list[k++] = (utree_t *)(found->data);
+
+    /* free tip label, and move to the beginning of next tip if available */
+    free(taxon);
+    s += taxon_len;
+    if (*s == ',')
+    {
+      s += 1;
+
+      /* a comma must be followed by another taxon; a trailing comma would
+         otherwise leave an unfilled (NULL) slot in the output list */
+      if (!*s)
+        fatal("Erroneous prune list format (double comma)/taxon missing");
+    }
+  }
+
+  /* an empty list names no taxon at all */
+  if (!k)
+    fatal("Erroneous prune list format (double comma)/taxon missing");
+
+  /* kill the hash table */
+  hdestroy();
+
+  free(node_list);
+
+  /* return number of tips actually stored in the list */
+  *tiplist_count = k;
+
+  /* return tip node list */
+  return out_node_list;
+}
+
+/* Find the node of the unrooted tree that roots the subtree formed by the
+   given tips.
+
+   tip_nodes       : list of tip nodes of the desired subtree
+   count           : number of entries in tip_nodes (callers pass at least
+                     two; element 0 is always dereferenced)
+   utree_tip_count : number of tips in the whole tree, used as the capacity
+                     of the temporary path array
+
+   Returns the record of the subtree's root node whose `mark` is not 1, or
+   NULL if the number of candidate root nodes differs from one, which means
+   the tips do not make up one complete subtree.
+
+   Side effect: sets `mark` on the given tips; marks are not cleared here.
+   Marking of inner nodes is presumably done by cb_outgroup during the
+   traversal (not visible in this function). */
+static utree_t * utree_lca(utree_t ** tip_nodes,
+                           unsigned int count,
+                           unsigned int utree_tip_count)
+{
+  unsigned int i;
+  int j;
+  utree_t * lca = NULL;
+  utree_t ** path;
+
+  /* allocate a path (an array of node pointers) */
+  path = (utree_t **)xcalloc(1,(size_t)utree_tip_count *
+                               sizeof(utree_t *));
+
+  /* mark all tip nodes */
+  for (i = 0; i < count; ++i)
+    tip_nodes[i]->mark = 1;
+
+  /* traverse the tree with the cb_outgroup callback to get the inner nodes
+     of the subtree formed by the outgroup */
+  int path_len = utree_traverse_postorder(tip_nodes[0]->back,
+                                          cb_outgroup,
+                                          path);
+
+
+  /* there must be exactly one inner node that does not have all three
+     directions marked. That one will be the root of the outgroup subtree */
+  int root_count = 0;
+  for (j = 0; j < path_len; ++j)
+    if (!(path[j]->mark && path[j]->next->mark && path[j]->next->next->mark))
+    {
+      root_count++;
+      lca = path[j];
+    }
+
+  /* deallocate path */
+  free(path);
+
+  /* if we had more than one inner nodes with less than three directions marked
+     then not all tips of a subtree were specified (invalid outgroup) */
+  if (root_count != 1) return NULL;
+
+  /* rotate around the node until we reach the record that is not marked;
+     at least one such record exists because the node passed the test above */
+  while (lca->mark == 1) lca = lca->next;
+
+  /* return the LCA */
+  return lca;
+}
+
+/* Resolve the comma-separated tip labels in the global opt_outgroup to the
+   node that roots the outgroup.
+
+   root      : any node of the unrooted tree
+   tip_count : number of tips in the tree
+
+   A single label yields that tip node itself. For several labels the result
+   of utree_lca() is returned, which is NULL unless *all* tips of the desired
+   subtree were specified. */
+utree_t * utree_outgroup_lca(utree_t * root, unsigned int tip_count)
+{
+  unsigned int og_count;
+  utree_t * result;
+
+  /* look up the tip nodes named in opt_outgroup */
+  utree_t ** og_list = utree_tipstring_nodes(root,
+                                             opt_outgroup,
+                                             tip_count,
+                                             &og_count);
+
+  /* one tip is its own subtree root; otherwise search for the LCA */
+  result = (og_count == 1) ? og_list[0]
+                           : utree_lca(og_list, og_count, tip_count);
+
+  /* the list was allocated for us; the nodes themselves stay in the tree */
+  free(og_list);
+
+  /* return the LCA (root of the outgroup subtree) */
+  return result;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mptp.git
More information about the debian-med-commit
mailing list