[med-svn] [qtltools] 02/03: Imported Upstream version 1.0+dfsg
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Thu Nov 24 21:00:27 UTC 2016
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository qtltools.
commit 288141def8e7fa73625a2dcefc5ab3cfd5a8157d
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Thu Nov 24 21:59:43 2016 +0100
Imported Upstream version 1.0+dfsg
---
LICENSE | 621 +++++++++++++++++++++
Makefile | 245 ++++++++
README | 8 +
lib/OTools/basic_stats.h | 87 +++
lib/OTools/compressed_io.h | 93 +++
lib/OTools/full_linear_regression.h | 102 ++++
lib/OTools/genomic_region.h | 103 ++++
lib/OTools/interval_tree.h | 274 +++++++++
lib/OTools/otools.h | 80 +++
lib/OTools/pca.h | 230 ++++++++
lib/OTools/random_number.h | 68 +++
lib/OTools/ranker.h | 238 ++++++++
lib/OTools/residualizer.h | 195 +++++++
lib/OTools/string_utils.h | 72 +++
lib/OTools/timer.h | 69 +++
lib/OTools/verbose.h | 121 ++++
script/plotTrans.R | 59 ++
script/runFDR_atrans.R | 25 +
script/runFDR_cis.R | 67 +++
script/runFDR_ftrans.R | 25 +
src/QTLtools.cpp | 133 +++++
src/common/data.cpp | 287 ++++++++++
src/common/data.h | 73 +++
src/common/filter.h | 87 +++
src/mode_ase/ase_data.h | 102 ++++
src/mode_ase/ase_main.cpp | 102 ++++
src/mode_ase/ase_read_genotypes.cpp | 118 ++++
src/mode_ase/ase_read_sequences.cpp | 180 ++++++
src/mode_bamstat/bamstat_data.h | 80 +++
src/mode_bamstat/bamstat_main.cpp | 80 +++
src/mode_bamstat/bamstat_read_annotations.cpp | 36 ++
src/mode_bamstat/bamstat_read_bam.cpp | 161 ++++++
src/mode_bamstat/bamstat_write_output.cpp | 23 +
src/mode_cis/cis_chunking.cpp | 190 +++++++
src/mode_cis/cis_collapse_phenotypes.cpp | 82 +++
src/mode_cis/cis_conditionnal_pass.cpp | 207 +++++++
src/mode_cis/cis_data.h | 184 ++++++
src/mode_cis/cis_initilization.cpp | 66 +++
src/mode_cis/cis_learn_beta.cpp | 103 ++++
src/mode_cis/cis_learn_dof.cpp | 99 ++++
src/mode_cis/cis_main.cpp | 201 +++++++
src/mode_cis/cis_management.cpp | 94 ++++
src/mode_cis/cis_nominal_pass.cpp | 135 +++++
src/mode_cis/cis_permutation_pass.cpp | 195 +++++++
src/mode_cis/cis_read_covariates.cpp | 56 ++
src/mode_cis/cis_read_genotypes.cpp | 215 +++++++
src/mode_cis/cis_read_phenotypes.cpp | 157 ++++++
src/mode_cis/cis_read_thresholds.cpp | 57 ++
src/mode_correct/correct_data.h | 66 +++
src/mode_correct/correct_main.cpp | 100 ++++
src/mode_correct/correct_management.cpp | 91 +++
src/mode_correct/correct_processing.cpp | 114 ++++
src/mode_correct/correct_read_covariates.cpp | 52 ++
src/mode_extract/extract_data.h | 56 ++
src/mode_extract/extract_main.cpp | 88 +++
src/mode_extract/extract_managment.cpp | 39 ++
src/mode_extract/extract_read_data.cpp | 198 +++++++
src/mode_extract/extract_write.cpp | 45 ++
src/mode_fdensity/fdensity_data.h | 63 +++
src/mode_fdensity/fdensity_main.cpp | 86 +++
src/mode_fdensity/fdensity_process.cpp | 84 +++
src/mode_fdensity/fdensity_read_annotation.cpp | 44 ++
src/mode_fdensity/fdensity_read_qtl.cpp | 42 ++
src/mode_fenrich/fenrich_data.h | 73 +++
src/mode_fenrich/fenrich_main.cpp | 86 +++
src/mode_fenrich/fenrich_management.cpp | 80 +++
src/mode_fenrich/fenrich_process.cpp | 64 +++
src/mode_fenrich/fenrich_read_annotation.cpp | 44 ++
src/mode_fenrich/fenrich_read_qtl.cpp | 38 ++
src/mode_fenrich/fenrich_read_tss.cpp | 42 ++
src/mode_genrich/genrich_binning_process.cpp | 61 ++
src/mode_genrich/genrich_data.h | 83 +++
src/mode_genrich/genrich_main.cpp | 106 ++++
src/mode_genrich/genrich_process.cpp | 110 ++++
src/mode_genrich/genrich_read_auxillliary_data.cpp | 66 +++
src/mode_genrich/genrich_read_phenotypes.cpp | 76 +++
.../genrich_read_reference_genotypes.cpp | 101 ++++
src/mode_genrich/genrich_routines.cpp | 52 ++
src/mode_match/match_data.h | 106 ++++
src/mode_match/match_main.cpp | 98 ++++
src/mode_match/match_managment.cpp | 32 ++
src/mode_match/match_process.cpp | 67 +++
src/mode_match/match_read_genotypes.cpp | 121 ++++
src/mode_match/match_read_sequences.cpp | 103 ++++
src/mode_pca/pca_data.h | 58 ++
src/mode_pca/pca_main.cpp | 102 ++++
src/mode_pca/pca_management.cpp | 64 +++
src/mode_pca/pca_pca.cpp | 240 ++++++++
src/mode_pca/pca_pca.h | 161 ++++++
src/mode_pca/pca_read_data.cpp | 287 ++++++++++
src/mode_quan/quan_chunking.cpp | 42 ++
src/mode_quan/quan_data.h | 277 +++++++++
src/mode_quan/quan_main.cpp | 166 ++++++
src/mode_quan/quan_management.cpp | 41 ++
src/mode_quan/quan_printResults.cpp | 136 +++++
src/mode_quan/quan_readBAM.cpp | 398 +++++++++++++
src/mode_quan/quan_readGTF.cpp | 102 ++++
src/mode_rtc/rtc_chunking.cpp | 167 ++++++
src/mode_rtc/rtc_collapse_phenotypes.cpp | 83 +++
src/mode_rtc/rtc_common.cpp | 492 ++++++++++++++++
src/mode_rtc/rtc_data.h | 506 +++++++++++++++++
src/mode_rtc/rtc_gwas_cis.cpp | 137 +++++
src/mode_rtc/rtc_gwas_trans.cpp | 135 +++++
src/mode_rtc/rtc_initilization.cpp | 80 +++
src/mode_rtc/rtc_main.cpp | 384 +++++++++++++
src/mode_rtc/rtc_management.cpp | 196 +++++++
src/mode_rtc/rtc_mergeQTL_cis.cpp | 148 +++++
src/mode_rtc/rtc_mergeQTL_trans.cpp | 148 +++++
src/mode_rtc/rtc_read_covariates.cpp | 55 ++
src/mode_rtc/rtc_read_genotypes.cpp | 430 ++++++++++++++
src/mode_rtc/rtc_read_get_hotspots.cpp | 113 ++++
src/mode_rtc/rtc_read_phenotypes.cpp | 181 ++++++
src/mode_rtc/rtc_sampling.cpp | 312 +++++++++++
src/mode_trans/trans_adjust.cpp | 52 ++
src/mode_trans/trans_analysis_pass.cpp | 117 ++++
src/mode_trans/trans_chunking.cpp | 31 +
src/mode_trans/trans_data.h | 148 +++++
src/mode_trans/trans_initilization.cpp | 52 ++
src/mode_trans/trans_learn_beta.cpp | 103 ++++
src/mode_trans/trans_main.cpp | 161 ++++++
src/mode_trans/trans_managment.cpp | 161 ++++++
src/mode_trans/trans_read_covariates.cpp | 67 +++
src/mode_trans/trans_read_phenotypes.cpp | 116 ++++
src/mode_trans/trans_read_samples.cpp | 93 +++
src/mode_union/union_data.h | 247 ++++++++
src/mode_union/union_initilization.cpp | 91 +++
src/mode_union/union_main.cpp | 153 +++++
src/mode_union/union_management.cpp | 257 +++++++++
src/mode_union/union_read_covariates.cpp | 57 ++
src/mode_union/union_read_genotypes.cpp | 374 +++++++++++++
src/mode_union/union_read_get_hotspots.cpp | 88 +++
src/mode_union/union_read_phenotypes.cpp | 153 +++++
src/mode_union/union_union.cpp | 170 ++++++
133 files changed, 17394 insertions(+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a0453
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,621 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..28851fe
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,245 @@
+##########################################
+# SET THESE 6 PATHS CORRECTLY TO COMPILE #
+##########################################
+BOOST_INC=
+BOOST_LIB=
+RMATH_INC=
+RMATH_LIB=
+HTSLD_INC=
+HTSLD_LIB=
+
+#COMPILER MODE C++11
+CXX=g++ -std=c++11
+
+#COMPILER FLAGS
+CXXFLAG_REL=-O2
+CXXFLAG_DBG=-g
+CXXFLAG_WRN=-Wall -Wextra -Wno-sign-compare -Wno-unused-local-typedefs -Wno-deprecated -Wno-unused-parameter
+
+#LINKER FLAGS
+LDFLAG_REL=-O2
+LDFLAG_DBG=-g
+
+#BASE LIBRARIES
+LIB_FLAGS=-Wl,-Bstatic -lz -lgsl -lblas -lbz2 -Wl,-Bdynamic -lm -lpthread
+
+#FILE LISTS
+BFILE=bin/QTLtools
+# The -name patterns are quoted so that the shell passes them to find verbatim
+# instead of expanding them against files lying in the current directory.
+# ':=' runs each find once instead of on every expansion of the variable.
+HFILE:=$(shell find src -name '*.h')
+TFILE:=$(shell find lib -name '*.h')
+CFILE:=$(shell find src -name '*.cpp')
+# One object per source file, flattened into obj/ (obj/<basename>.o).
+OFILE:=$(patsubst %.cpp,obj/%.o,$(notdir $(CFILE)))
+# Directories that hold the sources, searched by the pattern rules below.
+VPATH:=$(sort $(patsubst %/,%,$(dir $(CFILE))))
+
+#DEFAULT TARGET (BUILDS THE DEFAULT RELEASE VERSION BELOW)
+all: default
+
+#DEFAULT RELEASE VERSION
+default: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
+default: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+default: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+default: LDFLAG=$(LDFLAG_REL)
+default: $(BFILE)
+
+#UNIGE DESKTOP RELEASE VERSION
+desktop: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
+desktop: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
+desktop: HTSLD_INC=$(HOME)/Tools/htslib-1.3
+desktop: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
+desktop: BOOST_INC=/usr/include
+desktop: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+desktop: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
+desktop: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+desktop: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+desktop: LDFLAG=$(LDFLAG_REL)
+desktop: $(BFILE)
+
+#UNIGE DESKTOP DEBUG VERSION
+desktop-dbg: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
+desktop-dbg: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
+desktop-dbg: HTSLD_INC=$(HOME)/Tools/htslib-1.3
+desktop-dbg: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
+desktop-dbg: BOOST_INC=/usr/include
+desktop-dbg: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+desktop-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
+desktop-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+desktop-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+desktop-dbg: LDFLAG=$(LDFLAG_DBG)
+desktop-dbg: $(BFILE)
+
+#DELL LAPTOP RELEASE VERSION
+laptop: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
+laptop: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
+laptop: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
+laptop: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
+laptop: BOOST_INC=/usr/include
+laptop: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+laptop: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
+laptop: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+laptop: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+laptop: LDFLAG=$(LDFLAG_REL)
+laptop: $(BFILE)
+
+#DELL LAPTOP DEBUG VERSION
+laptop-dbg: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
+laptop-dbg: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
+laptop-dbg: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
+laptop-dbg: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
+laptop-dbg: BOOST_INC=/usr/include
+laptop-dbg: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+laptop-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
+laptop-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+laptop-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+laptop-dbg: LDFLAG=$(LDFLAG_DBG)
+laptop-dbg: $(BFILE)
+
+#VITAL-IT RELEASE VERSION
+cluster: RMATH_INC=/software/R/3.1.1/include
+cluster: RMATH_LIB=/software/R/3.1.1/lib64
+cluster: HTSLD_INC=/software/UHTS/Analysis/samtools/1.2/include
+cluster: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.2/lib64
+cluster: BOOST_INC=/software/include
+cluster: BOOST_LIB=/software/lib64
+cluster: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
+cluster: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+cluster: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+cluster: LDFLAG=$(LDFLAG_REL)
+cluster: $(BFILE)
+
+#VITAL-IT DEBUG VERSION
+cluster-dbg: RMATH_INC=/software/R/3.1.1/include
+cluster-dbg: RMATH_LIB=/software/R/3.1.1/lib64
+cluster-dbg: HTSLD_INC=/software/UHTS/Analysis/samtools/1.2/include
+cluster-dbg: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.2/lib64
+cluster-dbg: BOOST_INC=/software/include
+cluster-dbg: BOOST_LIB=/software/lib64
+cluster-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
+cluster-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+cluster-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
+cluster-dbg: LDFLAG=$(LDFLAG_DBG)
+cluster-dbg: $(BFILE)
+
+#MAC RELEASE VERSION
+mac: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
+mac: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
+mac: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
+mac: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
+mac: BOOST_INC=/opt/local/include
+mac: BOOST_LIB=/opt/local/lib
+mac: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
+mac: IFLAG=-Ilib/OTools -Ilib/ -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+mac: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams-mt.a $(BOOST_LIB)/libboost_program_options-mt.a
+mac: LDFLAG=$(LDFLAG_REL) -L /opt/local/lib
+mac: $(BFILE)
+
+#MAC DEBUG VERSION
+mac-dbg: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
+mac-dbg: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
+mac-dbg: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
+mac-dbg: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
+mac-dbg: BOOST_INC=/opt/local/include
+mac-dbg: BOOST_LIB=/opt/local/lib
+mac-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
+mac-dbg: IFLAG=-Ilib/OTools -Ilib/ -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
+mac-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams-mt.a $(BOOST_LIB)/libboost_program_options-mt.a
+mac-dbg: LDFLAG=$(LDFLAG_DBG) -L /opt/local/lib
+mac-dbg: $(BFILE)
+
+#COMPILATION RULES
+$(BFILE): $(OFILE)
+ $(CXX) $^ $(LIB_FILES) -o $@ $(LIB_FLAGS) $(LDFLAG)
+
+obj/QTLtools.o: src/QTLtools.cpp $(HFILE) $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/data.o: src/common/data.cpp src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/cis_%.o: cis_%.cpp cis_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/correct_%.o: correct_%.cpp correct_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/fenrich_%.o: fenrich_%.cpp fenrich_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/match_%.o: match_%.cpp match_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/trans_%.o: trans_%.cpp trans_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/rtc_%.o: rtc_%.cpp rtc_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/pca_%.o: pca_%.cpp pca_data.h pca_pca.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/genrich_%.o: genrich_%.cpp genrich_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/union_%.o: union_%.cpp union_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/extract_%.o: extract_%.cpp extract_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/quan_%.o: quan_%.cpp quan_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/ase_%.o: ase_%.cpp ase_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/bamstat_%.o: bamstat_%.cpp bamstat_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+obj/fdensity_%.o: fdensity_%.cpp fdensity_data.h src/common/data.h src/common/filter.h $(TFILE)
+ $(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
+
+# Cleaning targets never create a file of their own name, so they are declared
+# phony: they must still run when a file called e.g. 'clean' exists.
+.PHONY: clean clean-cis clean-correct clean-fenrich clean-genrich clean-match \
+	clean-trans clean-rtc clean-pca clean-extract clean-ase clean-union \
+	clean-quan clean-bamstat clean-fdensity
+
+# Remove every object file and the binary.
+clean:
+	rm -f obj/*.o $(BFILE)
+
+# Per-mode variants: remove the objects of one mode and the binary.
+clean-cis:
+	rm -f obj/cis_*.o $(BFILE)
+
+clean-correct:
+	rm -f obj/correct_*.o $(BFILE)
+
+clean-fenrich:
+	rm -f obj/fenrich_*.o $(BFILE)
+
+clean-genrich:
+	rm -f obj/genrich_*.o $(BFILE)
+
+clean-match:
+	rm -f obj/match_*.o $(BFILE)
+
+clean-trans:
+	rm -f obj/trans_*.o $(BFILE)
+
+clean-rtc:
+	rm -f obj/rtc_*.o $(BFILE)
+
+clean-pca:
+	rm -f obj/pca_*.o $(BFILE)
+
+clean-extract:
+	rm -f obj/extract_*.o $(BFILE)
+
+clean-ase:
+	rm -f obj/ase_*.o $(BFILE)
+
+clean-union:
+	rm -f obj/union_*.o $(BFILE)
+
+clean-quan:
+	rm -f obj/quan_*.o $(BFILE)
+
+clean-bamstat:
+	rm -f obj/bamstat_*.o $(BFILE)
+
+clean-fdensity:
+	rm -f obj/fdensity_*.o $(BFILE)
+
+
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..2c64161
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+QTLtools: a tool set for molecular QTL discovery and analysis
+
+ver: 1.0
+git: https://github.com/qtltools/qtltools
+web: https://qtltools.github.io/qtltools/
+
+developers: Olivier Delaneau (olivier.delaneau at gmail.com)
+ Halit Ongen (halit.ongen at unige.ch)
diff --git a/lib/OTools/basic_stats.h b/lib/OTools/basic_stats.h
new file mode 100644
index 0000000..436bf5a
--- /dev/null
+++ b/lib/OTools/basic_stats.h
@@ -0,0 +1,87 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _BASIC_STATS_H
+#define _BASIC_STATS_H
+
+#include <vector>
+
+/*
+ * Streaming accumulator for the mean and variance of a series of values.
+ * Values are folded in one at a time with Welford's recurrence, so the
+ * series itself is never stored.
+ */
+class basic_stats {
+protected:
+	uint32_t m_n;	// number of values pushed so far
+	double m_oldM;	// mean before the current update (equals m_newM between pushes)
+	double m_newM;	// running mean
+	double m_oldS;	// sum of squared deviations before the current update
+	double m_newS;	// running sum of squared deviations from the mean
+
+public:
+	// Empty accumulator.
+	basic_stats() {
+		clear();
+	}
+
+	// Accumulates every element of X, in order. X is taken by const
+	// reference so const vectors and temporaries can be summarised as well.
+	template <class T>
+	basic_stats(const vector < T > & X) {
+		clear();
+		for (size_t e = 0 ; e < X.size() ; e ++) push(X[e]);
+	}
+
+	// Forgets everything pushed so far.
+	void clear() {
+		m_n = 0;
+		m_oldM = 0;
+		m_newM = 0;
+		m_oldS = 0;
+		m_newS = 0;
+	}
+
+	// Folds one more value into the running mean and squared-deviation sum.
+	template <class T>
+	void push(T x) {
+		m_n++;
+		if (m_n == 1) {
+			// First value: the mean is the value itself, no spread yet.
+			m_oldM = m_newM = x;
+			m_oldS = m_newS = 0.0;
+		} else {
+			m_newM = m_oldM + (x - m_oldM)/m_n;
+			m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
+			m_oldM = m_newM;
+			m_oldS = m_newS;
+		}
+	}
+
+	// Number of values pushed.
+	int size() const {
+		return m_n;
+	}
+
+	// Mean of the pushed values; 0.0 when nothing was pushed.
+	double mean() const {
+		return (m_n > 0) ? m_newM : 0.0;
+	}
+
+	// Sample variance (n - 1 denominator); 0.0 with fewer than two values.
+	double variance() const {
+		return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
+	}
+
+	// Sample standard deviation.
+	double sd() const {
+		return sqrt( variance() );
+	}
+};
+
+#endif
diff --git a/lib/OTools/compressed_io.h b/lib/OTools/compressed_io.h
new file mode 100644
index 0000000..4b8a474
--- /dev/null
+++ b/lib/OTools/compressed_io.h
@@ -0,0 +1,93 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _COMPRESSED_IO_H
+#define _COMPRESSED_IO_H
+
+//STL INCLUDES
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+//BOOST INCLUDES
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filter/bzip2.hpp>
+
+/*
+ * Input stream over a file that is decompressed on the fly when its name
+ * ends in ".gz" or ".bz2"; any other name is read as is.
+ */
+class input_file : public boost::iostreams::filtering_istream {
+protected:
+	ifstream file_descriptor;
+
+public:
+	// Opens `filename`; check fail() afterwards to see whether it worked.
+	input_file(string filename) {
+		// Text after the last '.'; the whole name when it contains no '.'.
+		const string extension = filename.substr(filename.find_last_of(".") + 1);
+		const bool gzipped = (extension == "gz");
+		const bool bzipped = !gzipped && (extension == "bz2");
+
+		if (gzipped || bzipped) file_descriptor.open(filename.c_str(), ios::in | ios::binary);
+		else file_descriptor.open(filename.c_str());
+
+		// The decompressor goes in before the file so data flows through it.
+		if (gzipped) push(boost::iostreams::gzip_decompressor());
+		else if (bzipped) push(boost::iostreams::bzip2_decompressor());
+
+		if (!file_descriptor.fail()) push(file_descriptor);
+	}
+
+	~input_file() {
+		close();
+	}
+
+	// State of the underlying file stream.
+	bool fail() {
+		return file_descriptor.fail();
+	}
+
+	// Empties the filter chain and closes the file; does nothing when the
+	// file stream is in a failed state.
+	void close() {
+		if (file_descriptor.fail()) return;
+		if (!empty()) reset();
+		file_descriptor.close();
+	}
+};
+
+/*
+ * Output stream over a file that is compressed on the fly when its name
+ * ends in ".gz" or ".bz2"; any other name is written as is.
+ */
+class output_file : public boost::iostreams::filtering_ostream {
+protected:
+	ofstream file_descriptor;
+
+public:
+	// Opens `filename`; check fail() afterwards to see whether it worked.
+	output_file(string filename) {
+		// Text after the last '.'; the whole name when it contains no '.'.
+		const string extension = filename.substr(filename.find_last_of(".") + 1);
+		const bool gzipped = (extension == "gz");
+		const bool bzipped = !gzipped && (extension == "bz2");
+
+		if (gzipped || bzipped) file_descriptor.open(filename.c_str(), ios::out | ios::binary);
+		else file_descriptor.open(filename.c_str());
+
+		// The compressor goes in before the file so data flows through it.
+		if (gzipped) push(boost::iostreams::gzip_compressor());
+		else if (bzipped) push(boost::iostreams::bzip2_compressor());
+
+		if (!file_descriptor.fail()) push(file_descriptor);
+	}
+
+	~output_file() {
+		close();
+	}
+
+	// State of the underlying file stream.
+	bool fail() {
+		return file_descriptor.fail();
+	}
+
+	// Empties the filter chain and closes the file; does nothing when the
+	// file stream is in a failed state.
+	void close() {
+		if (file_descriptor.fail()) return;
+		if (!empty()) reset();
+		file_descriptor.close();
+	}
+};
+
+#endif
diff --git a/lib/OTools/full_linear_regression.h b/lib/OTools/full_linear_regression.h
new file mode 100644
index 0000000..c2a5664
--- /dev/null
+++ b/lib/OTools/full_linear_regression.h
@@ -0,0 +1,102 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef full_linear_regression_h
+#define full_linear_regression_h
+
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <cassert>
+#ifndef MATHLIB_STANDALONE
+#define MATHLIB_STANDALONE
+#endif
+#include <Rmath.h>
+
+using namespace std;
+
+
+/*
+ * Least-squares regression of Y on a single predictor X.
+ * The fitting constructor needs at least 3 observations and a non-constant
+ * X for every field to be defined: se and pval divide by (n - 2) and beta
+ * divides by n * sum(x^2) - sum(x)^2.
+ */
+class linReg{
+public:
+	double pval;		// upper-tail F(1, n-2) probability of (n-2) * R2 / (1 - R2)
+	double corr;		// coefficient of determination, printed as "R2"
+	double r;		// sqrt(corr) carrying the sign of beta
+	double beta;		// slope
+	double yIntercept;	// intercept
+	double se;		// sqrt(residual sum of squares / (n - 2))
+	vector < float > residuals,X,Y;
+
+	// Null result: no data, slope 0, p-value 1.
+	linReg() : pval(1), corr(0), r(0), beta(0), yIntercept(0), se(0) {}
+
+	// Wraps precomputed results. se is not supplied by the caller, so it is
+	// zeroed rather than left indeterminate (operator<< prints it).
+	linReg(double p ,double c, double b, double y , vector <float> &res)
+		: pval(p), corr(c), r(0), beta(b), yIntercept(y), se(0), residuals(res)
+	{
+		r = beta < 0.0 ? sqrt(corr) * -1.0 : sqrt(corr);
+	}
+
+	// Tab-separated dump of the data followed by the fitted quantities.
+	friend ostream& operator<<(ostream& out, const linReg& l){
+		out << "X";
+		for (size_t i = 0 ; i < l.X.size(); i++) out << "\t" << l.X[i];
+		out << "\n";
+		out << "Y";
+		for (size_t i = 0 ; i < l.Y.size(); i++) out << "\t" << l.Y[i];
+		out << "\n";
+		out << "Residuals";
+		for (size_t i = 0 ; i < l.residuals.size(); i++) out << "\t" << l.residuals[i];
+		out << "\n";
+		out << "R2\t" << l.corr << "\n";
+		out << "R\t" << l.r << "\n";
+		out << "Beta\t" << l.beta << "\n";
+		out << "SE\t" << l.se << "\n";
+		out << "Y-int\t" << l.yIntercept << "\n";
+		out << "Pval\t" << l.pval;
+		return out;
+	}
+
+	// Fits y = beta * x + yIntercept; x and y must have the same length.
+	linReg( vector <float > & x, vector <float > &y){
+		assert(x.size() == y.size());
+		X = x;
+		Y = y;
+		const size_t n = x.size();
+		const double dataSize = (double) n;
+		residuals.assign(n, 0.0f);
+		double sum_x = 0.0;	//sum of x values
+		double sum_y = 0.0;	//sum of y values
+		double sum_xy = 0.0;	//sum of x * y
+		double sum_xx = 0.0;	//sum of x^2
+		double sum_res = 0.0;	//total sum of squares of y around its mean
+		double sum_Yres = 0.0;	//residual sum of squares
+
+		for (size_t i = 0 ; i < n; i++){
+			sum_x += X[i];
+			sum_y += Y[i];
+			sum_xy += X[i] * Y[i];
+			sum_xx += X[i] * X[i];
+		}
+
+		const double AVGy = sum_y / dataSize;	//mean of y
+		const double AVGx = sum_x / dataSize;	//mean of x
+
+		beta = (dataSize * sum_xy - sum_x * sum_y) / (dataSize * sum_xx - sum_x*sum_x);
+		yIntercept = AVGy - beta * AVGx;
+
+		for (size_t i = 0 ; i < n; i++){
+			residuals[i] = Y[i] - (X[i] * beta + yIntercept);
+			sum_Yres += residuals[i] * residuals[i];
+			sum_res += pow(Y[i] - AVGy,2);
+		}
+		corr = (sum_res - sum_Yres) / sum_res;
+		r = beta < 0 ? sqrt(corr) * -1.0 : sqrt(corr);
+		se = sqrt(sum_Yres / (dataSize-2));
+		// pf(q, df1, df2, lower_tail = 0, log_p = 0): upper tail, plain probability.
+		pval = pf((dataSize-2) * corr / (1 - corr), 1, (dataSize-2), 0, 0);
+	}
+};
+
+#undef MATHLIB_STANDALONE
+#endif /* full_linear_regression_h */
diff --git a/lib/OTools/genomic_region.h b/lib/OTools/genomic_region.h
new file mode 100644
index 0000000..1db9cfd
--- /dev/null
+++ b/lib/OTools/genomic_region.h
@@ -0,0 +1,103 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _GENOMIC_REGION_H
+#define _GENOMIC_REGION_H
+
+#define POS_MIN 0000000000
+#define POS_MAX 1000000000
+
+#include <string>
+#include <exception>
+
+using namespace std;
+
+/*
+ * A chromosome name together with a [start, end] range of positions.
+ * A default-constructed region is named "NA" and spans POS_MIN..POS_MAX.
+ */
+class genomic_region {
+public:
+	string chr;
+	unsigned int start;
+	unsigned int end;
+
+	genomic_region() {
+		chr = "NA";
+		start = POS_MIN;
+		end = POS_MAX;
+	}
+
+	// Asserts that _start <= _end.
+	genomic_region(string _chr, unsigned int _start, unsigned int _end) {
+		chr = _chr;
+		start = _start;
+		end = _end;
+		assert(start <= end);
+	}
+
+	// Copy of `gr` widened by `window` on both sides; start is clamped at 0.
+	genomic_region(const genomic_region & gr, unsigned int window) {
+		chr = gr.chr;
+		if (window > gr.start) start = 0;
+		else start = gr.start - window;
+		end = gr.end + window;
+		assert(start <= end);
+	}
+
+	// True once a chromosome other than the "NA" placeholder is set.
+	bool isSet () const {
+		return (chr != "NA");
+	}
+
+	// "chr" when the range is the full default one, "chr:start-end" otherwise.
+	string get() const {
+		ostringstream s2( stringstream::out );
+		s2 << chr;
+		if (start != POS_MIN || end != POS_MAX) s2 << ":" << start << "-" << end;
+		return s2.str();
+	}
+
+	// Assigns the three fields; returns false when _start > _end.
+	bool set (string _chr, unsigned int _start, unsigned int _end) {
+		chr = _chr;
+		start = _start;
+		end = _end;
+		if (start > end) return false;
+		return true;
+	}
+
+	// Parses "chr", "chr:pos" or "chr:start-end".
+	// Returns false, leaving the region untouched, when the text cannot be
+	// interpreted, a coordinate is negative, or start > end.
+	bool parse(string str) {
+		const size_t colon = str.find(':');
+		if (colon == string::npos) {
+			// Bare chromosome name; a range without a chromosome is rejected.
+			if (str.find('-') != string::npos) return false;
+			chr = str;
+			return true;
+		}
+
+		// Only a '-' after the ':' separates start from end, so a '-' inside
+		// the chromosome name is not mistaken for the range separator.
+		const size_t dash = str.find('-', colon + 1);
+		int first = 0, last = 0;
+		try {
+			if (dash == string::npos) {
+				first = last = std::stoi(str.substr(colon + 1));
+			} else {
+				// substr takes a length, not an end position.
+				first = std::stoi(str.substr(colon + 1, dash - colon - 1));
+				last = std::stoi(str.substr(dash + 1));
+			}
+		} catch (const std::exception &) {
+			return false;
+		}
+
+		// Negative values would wrap around in the unsigned fields, and a
+		// reversed range would trip the assert of the window constructor.
+		if (first < 0 || last < 0 || first > last) return false;
+
+		chr = str.substr(0, colon);
+		start = first;
+		end = last;
+		return true;
+	}
+};
+
+#endif
diff --git a/lib/OTools/interval_tree.h b/lib/OTools/interval_tree.h
new file mode 100644
index 0000000..a6d8048
--- /dev/null
+++ b/lib/OTools/interval_tree.h
@@ -0,0 +1,274 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _INTERVAL_TREE_H
+#define _INTERVAL_TREE_H
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+using namespace std;
+
+
+// A [start, stop] range over keys of type K, tagged with a value of type T.
+template <class T, typename K = int>
+class Interval {
+public:
+	K start;
+	K stop;
+	T value;
+
+	// The value is copied into the interval.
+	Interval(K from, K to, const T& payload)
+		: start(from), stop(to), value(payload)
+	{
+	}
+};
+
+// Start key of an interval. NOTE(review): the result is converted to int
+// whatever K is.
+template <class T, typename K>
+int intervalStart(const Interval<T,K>& ival) {
+	return ival.start;
+}
+
+// Stop key of an interval. NOTE(review): the result is converted to int
+// whatever K is.
+template <class T, typename K>
+int intervalStop(const Interval<T,K>& ival) {
+	return ival.stop;
+}
+
+// Prints an interval as "Interval(start, stop): value".
+template <class T, typename K>
+ostream& operator<<(ostream& out, Interval<T,K>& ival) {
+	return out << "Interval(" << ival.start << ", " << ival.stop << "): " << ival.value;
+}
+
+// Comparison functor ordering intervals by increasing start key.
+template <class T, typename K = int>
+class IntervalStartSorter {
+public:
+	bool operator() (const Interval<T,K>& lhs, const Interval<T,K>& rhs) {
+		return lhs.start < rhs.start;
+	}
+};
+
+/*
+ * Centered interval tree. A node keeps the intervals that span its `center`
+ * key; intervals that stop before it go to `left`, intervals that start
+ * after it go to `right`. A node owns its children and deletes them when it
+ * is destroyed; copies are deep.
+ */
+template <class T, typename K = int>
+class IntervalTree {
+
+public:
+	typedef Interval<T,K> interval;
+	typedef vector<interval> intervalVector;
+	typedef IntervalTree<T,K> intervalTree;
+
+	intervalVector intervals;	// intervals held by this node, ordered by start
+	intervalTree* left;		// intervals stopping before center (NULL if none)
+	intervalTree* right;		// intervals starting after center (NULL if none)
+	int center;			// split key of this node
+
+	// Empty tree.
+	IntervalTree(void)
+		: left(NULL)
+		, right(NULL)
+		, center(0)
+	{ }
+
+	// Deep copy. Children are created with new: they are later released with
+	// delete, and they must be constructed before anything is assigned to them.
+	IntervalTree(const intervalTree& other)
+		: intervals(other.intervals)
+		, left(NULL)
+		, right(NULL)
+		, center(other.center)
+	{
+		if (other.left) left = new intervalTree(*other.left);
+		if (other.right) {
+			try {
+				right = new intervalTree(*other.right);
+			} catch (...) {
+				// The destructor does not run for a half-built object.
+				delete left;
+				throw;
+			}
+		}
+	}
+
+	// Deep-copy assignment. The copy is built first and then swapped in, so
+	// the previous children are released when `copy` goes out of scope and
+	// self-assignment is harmless.
+	IntervalTree<T,K>& operator=(const intervalTree& other) {
+		intervalTree copy(other);
+		intervals.swap(copy.intervals);
+		std::swap(left, copy.left);
+		std::swap(right, copy.right);
+		std::swap(center, copy.center);
+		return *this;
+	}
+
+	/*
+	 * Builds a tree from `ivals`.
+	 *  depth       maximum number of levels below which no more splitting occurs
+	 *  minbucket   a node with fewer intervals than this (and than maxbucket)
+	 *              is not split
+	 *  leftextent,
+	 *  rightextent bounds handed down to child nodes; both 0 marks the
+	 *              top-level call
+	 *  maxbucket   see minbucket
+	 * When the top-level call splits, `ivals` itself is sorted by start.
+	 */
+	IntervalTree(
+			intervalVector& ivals,
+			unsigned int depth = 16,
+			unsigned int minbucket = 64,
+			int leftextent = 0,
+			int rightextent = 0,
+			unsigned int maxbucket = 512
+			)
+		: left(NULL)
+		, right(NULL)
+		, center(0)
+	{
+		// Children receive their intervals already ordered by start; only the
+		// top-level input can be in arbitrary order.
+		const bool topLevel = (leftextent == 0 && rightextent == 0);
+		IntervalStartSorter<T,K> byStart;
+
+		--depth;
+		if (depth == 0 || ivals.empty() || (ivals.size() < minbucket && ivals.size() < maxbucket)) {
+			intervals = ivals;
+			// The queries skip a node when the key lies before
+			// intervals.front().start, so front() must hold the smallest start.
+			if (topLevel) sort(intervals.begin(), intervals.end(), byStart);
+			return;
+		}
+
+		if (topLevel) sort(ivals.begin(), ivals.end(), byStart);
+
+		int leftp = 0;
+		int rightp = 0;
+		if (leftextent || rightextent) {
+			leftp = leftextent;
+			rightp = rightextent;
+		} else {
+			leftp = ivals.front().start;
+			K maxStop = ivals.front().stop;
+			for (typename intervalVector::const_iterator i = ivals.begin(); i != ivals.end(); ++i) {
+				if (maxStop < i->stop) maxStop = i->stop;
+			}
+			rightp = maxStop;
+		}
+
+		// Split on the start of the median interval.
+		const int centerp = ivals.at(ivals.size() / 2).start;
+		center = centerp;
+
+		intervalVector lefts;
+		intervalVector rights;
+		for (typename intervalVector::const_iterator i = ivals.begin(); i != ivals.end(); ++i) {
+			if (i->stop < center) lefts.push_back(*i);
+			else if (i->start > center) rights.push_back(*i);
+			else intervals.push_back(*i);
+		}
+
+		if (!lefts.empty()) {
+			left = new intervalTree(lefts, depth, minbucket, leftp, centerp);
+		}
+		if (!rights.empty()) {
+			try {
+				right = new intervalTree(rights, depth, minbucket, centerp, rightp);
+			} catch (...) {
+				// The destructor does not run for a half-built object.
+				delete left;
+				throw;
+			}
+		}
+	}
+
+	// Appends to `overlapping` every interval sharing at least one key with
+	// [start, stop] (bounds included).
+	void findOverlapping(K start, K stop, intervalVector& overlapping) const {
+		if (!intervals.empty() && ! (stop < intervals.front().start)) {
+			for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) {
+				if (i->stop >= start && i->start <= stop) overlapping.push_back(*i);
+			}
+		}
+
+		if (left && start <= center) left->findOverlapping(start, stop, overlapping);
+		if (right && stop >= center) right->findOverlapping(start, stop, overlapping);
+	}
+
+	// Appends to `overlapping` every interval that contains `pos`.
+	void findOverlapping(K pos, intervalVector& overlapping) const {
+		if (!intervals.empty() && ! (pos < intervals.front().start)) {
+			for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) {
+				if (i->stop >= pos && i->start <= pos) overlapping.push_back(*i);
+			}
+		}
+
+		if (left && pos <= center) left->findOverlapping(pos, overlapping);
+		if (right && pos >= center) right->findOverlapping(pos, overlapping);
+	}
+
+	// Appends to `contained` every interval lying entirely within
+	// [start, stop] (bounds included).
+	void findContained(K start, K stop, intervalVector& contained) const {
+		if (!intervals.empty() && ! (stop < intervals.front().start)) {
+			for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) {
+				if (i->start >= start && i->stop <= stop) contained.push_back(*i);
+			}
+		}
+
+		if (left && start <= center) left->findContained(start, stop, contained);
+		if (right && stop >= center) right->findContained(start, stop, contained);
+	}
+
+	// True when at least one interval contains `pos`.
+	bool checkOverlapping(K pos) const {
+		if (!intervals.empty() && ! (pos < intervals.front().start)) {
+			for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) {
+				if (i->stop >= pos && i->start <= pos) return true;
+			}
+		}
+
+		if (left && pos <= center && left->checkOverlapping(pos)) return true;
+		return right && pos >= center && right->checkOverlapping(pos);
+	}
+
+	// Releases both subtrees, recursively; deleting NULL is a no-op.
+	~IntervalTree(void) {
+		delete left;
+		delete right;
+	}
+
+};
+
+#endif
diff --git a/lib/OTools/otools.h b/lib/OTools/otools.h
new file mode 100644
index 0000000..e375fb1
--- /dev/null
+++ b/lib/OTools/otools.h
@@ -0,0 +1,80 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _OLIVIER_TOOLS_H
+#define _OLIVIER_TOOLS_H
+
+//INCLUDE USEFUL STANDARD TEMPLATE LIBRARY (STL) HEADERS
+#include <vector>
+#include <list>
+#include <queue>
+#include <stack>
+#include <bitset>
+#include <set>
+#include <map>
+#include <unordered_set>
+#include <unordered_map>
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <exception>
+#include <cassert>
+#include <limits>
+
+//INCLUDE USEFUL BOOST HEADERS (BOOST)
+#include <boost/program_options.hpp>
+
+//INCLUDE HTS LIBRARY
+#include <htslib/hts.h>
+#include <htslib/kseq.h>
+#include <htslib/sam.h>
+extern "C" {
+ #include <htslib/vcf_sweep.h>
+ #include <htslib/synced_bcf_reader.h>
+ #include <htslib/vcf.h>
+ #include <htslib/vcfutils.h>
+}
+
+//INCLUDE RMATH LIBRARY
+#define MATHLIB_STANDALONE
+#include <Rmath.h>
+
+//INCLUDES BASE STUFFS
+#include "genomic_region.h"
+#include "interval_tree.h"
+#include "compressed_io.h"
+#include "random_number.h"
+#include "ranker.h"
+#include "residualizer.h"
+#include "pca.h"
+#include "full_linear_regression.h"
+#include <basic_stats.h>
+#include <string_utils.h>
+#include <timer.h>
+#include <verbose.h>
+
+//MAKE SOME TOOLS FULLY ACCESSIBLE THROUGHOUT THE SOFTWARE (rng, stb and vrb are global objects)
+#ifdef _DECLARE_TOOLBOX_HERE // when this macro is defined before inclusion, the objects are defined here
+ random_number_generator rng; // global random number generator
+ string_utils stb; // global string helper
+ verbose vrb; // global screen/log message printer
+#else // otherwise they are only declared; some other translation unit has to define them
+ extern random_number_generator rng;
+ extern string_utils stb;
+ extern verbose vrb;
+#endif
+
+#endif
+
diff --git a/lib/OTools/pca.h b/lib/OTools/pca.h
new file mode 100644
index 0000000..2d6d346
--- /dev/null
+++ b/lib/OTools/pca.h
@@ -0,0 +1,230 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _PCA_H
+#define _PCA_H
+
+//STL INCLUDES
+#include <vector>
+#include <string>
+
+//EIGEN INCLUDES
+#include <Eigen/Dense>
+#include <Eigen/SVD>
+#include <iterator>
+
+//EIGEN NAMESPACE
+using namespace Eigen;
+using namespace std;
+
+class pca { // Principal component analysis of a float matrix held in _xXf, computed with Eigen (SVD or eigen-decomposition)
+public:
+ unsigned int _nrows; // Number of rows in matrix x.
+ unsigned int _ncols; // Number of cols in matrix x (reduced by run() when zero-variance columns are dropped).
+ bool _is_center; // Whether the variables should be shifted to be zero centered
+ bool _is_scale; // Whether the variables should be scaled to have unit variance
+ bool _is_corr; // PCA with correlation matrix, not covariance
+ vector < unsigned int > _eliminated_columns; // Indexes of the zero-variance columns removed by run()
+ vector < float > _sd; // Standard deviation of each component
+ vector < float > _prop_of_var; // Proportion of variance
+ vector < float > _cum_prop; // Cumulative proportion
+ vector < float > _scores; // Rotated values (not written by any method of this class)
+ unsigned int _kaiser; // Number of PCs according to the Kaiser criterion (1 + index of the last component with sd >= 1)
+ unsigned int _thresh995; // Number of PCs according to the 99.5% cumulative variance threshold
+ Eigen::MatrixXf _xXf; // Initial matrix as Eigen MatrixXf structure
+ Eigen::MatrixXf _pcs; // Scores, transposed: one row per principal component, one column per row of _xXf
+
+ pca(int nrows, int ncols): _nrows(nrows), _ncols(ncols), _is_center(true), _is_scale (true), _is_corr(false), _kaiser(0), _thresh995(1) { // Allocates an nrows x ncols input matrix
+ _xXf.resize(nrows, ncols);
+ };
+
+ ~pca() { _xXf.resize(0, 0); }; // Shrinks the input matrix to 0 x 0
+
+ void fill (vector < vector < float > > & values, vector < unsigned int > & indexes) { // Column i of _xXf receives values[indexes[i]]; _xXf must already be large enough
+ for (int i = 0; i < indexes.size() ; i++) for (int s = 0 ; s < values[0].size() ; s++) _xXf(s, i) = values[indexes[i]][s];
+ }
+
+ void get(int i_pc, vector < float > & values) { // Copies row i_pc of _pcs into values, which must already hold at least _nrows elements
+ for (int s = 0 ; s < _nrows ; s ++) values[s] = _pcs(i_pc, s);
+ }
+
+ float getVariance (unsigned int k) { return _prop_of_var[k]; } // Proportion of variance of component k (no bounds check)
+
+ bool run(const bool is_corr, const bool is_center, const bool is_scale) { // Runs the PCA on _xXf; returns false when it cannot be computed
+ _ncols = _xXf.cols();
+ _nrows = _xXf.rows();
+ _is_corr = is_corr;
+ _is_center = is_center;
+ _is_scale = is_scale;
+
+ if ((1 == _ncols) || (1 == _nrows)) return false; // A single column or a single row is rejected
+
+ // Mean and standard deviation for each column
+ VectorXf mean_vector(_ncols);
+ mean_vector = _xXf.colwise().mean();
+ VectorXf sd_vector(_ncols);
+ unsigned int zero_sd_num = 0;
+ float denom = static_cast<float>((_nrows > 1)? _nrows - 1: 1); // n - 1 denominator, floored at 1
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ VectorXf curr_col = VectorXf::Constant(_nrows, mean_vector(i)); // mean(x) for column x
+ curr_col = _xXf.col(i) - curr_col; // x - mean(x)
+ curr_col = curr_col.array().square(); // (x-mean(x))^2
+ sd_vector(i) = sqrt((curr_col.sum())/denom);
+ if (0 == sd_vector(i)) {
+ zero_sd_num++;
+ }
+ }
+ if (1 > _ncols-zero_sd_num) return false; // Every column is constant: nothing left to analyse
+
+ // Delete columns where sd == 0
+ MatrixXf tmp(_nrows, _ncols-zero_sd_num);
+ VectorXf tmp_mean_vector(_ncols-zero_sd_num);
+ unsigned int curr_col_num = 0;
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ if (0 != sd_vector(i)) {
+ tmp.col(curr_col_num) = _xXf.col(i);
+ tmp_mean_vector(curr_col_num) = mean_vector(i);
+ curr_col_num++;
+ } else {
+ _eliminated_columns.push_back(i); // NOTE(review): never cleared, so repeated run() calls keep appending
+ }
+ }
+ _ncols -= zero_sd_num;
+ _xXf = tmp;
+ mean_vector = tmp_mean_vector;
+ tmp.resize(0, 0); tmp_mean_vector.resize(0);
+
+ // Shift to zero
+ if (true == _is_center) {
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ _xXf.col(i) -= VectorXf::Constant(_nrows, mean_vector(i));
+ }
+ }
+
+ // Scale to unit variance
+ if ( true == _is_scale) {
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ _xXf.col(i) /= sqrt(_xXf.col(i).array().square().sum()/denom); // Divides by the root mean square of the (possibly centered) column
+ }
+ }
+
+ // When _nrows < _ncols then svd will be used.
+ // If corr is true and _nrows > _ncols then will be used correlation matrix
+ // (TODO): What about covariance?
+ if ((_nrows < _ncols) || (false == _is_corr)) { // Singular Value Decomposition is on
+ JacobiSVD<MatrixXf> svd(_xXf, ComputeThinV);
+ VectorXf eigen_singular_values = svd.singularValues();
+ VectorXf tmp_vec = eigen_singular_values.array().square();
+ float tmp_sum = tmp_vec.sum();
+ tmp_vec /= tmp_sum; // Squared singular values normalised to sum to 1
+ // PC's standard deviation and
+ // PC's proportion of variance
+ _kaiser = 0;
+ unsigned int lim = (_nrows < _ncols)? _nrows : _ncols;
+ for (unsigned int i = 0; i < lim; ++i) {
+ _sd.push_back(eigen_singular_values(i)/sqrt(denom)); // NOTE(review): _sd, _prop_of_var and _cum_prop are not cleared in this branch
+ if (_sd[i] >= 1) {
+ _kaiser = i + 1;
+ }
+ _prop_of_var.push_back(tmp_vec(i));
+ }
+ tmp_vec.resize(0);
+ // PC's cumulative proportion
+ _thresh995 = 1;
+ _cum_prop.push_back(_prop_of_var[0]);
+ for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+ _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+ if (_cum_prop[i] < 0.995) {
+ _thresh995 = i+1;
+ }
+ }
+ // Scores
+ MatrixXf eigen_scores = _xXf * svd.matrixV();
+ _pcs = eigen_scores.transpose();
+ eigen_scores.resize(0, 0);
+ } else { // COR OR COV MATRICES ARE HERE
+ // Calculate covariance matrix
+ MatrixXf eigen_cov; // = MatrixXf::Zero(_ncols, _ncols);
+ VectorXf sds;
+ // (TODO) Should be weighted cov matrix, even if is_center == false
+ eigen_cov = (1.0 /(_nrows/*-1*/)) * _xXf.transpose() * _xXf; // Divides by _nrows, not _nrows - 1
+ sds = eigen_cov.diagonal().array().sqrt();
+ MatrixXf outer_sds = sds * sds.transpose();
+ eigen_cov = eigen_cov.array() / outer_sds.array(); // Element-wise division by sd_i * sd_j
+ outer_sds.resize(0, 0);
+
+ // ?If data matrix is scaled, covariance matrix is equal to correlation matrix
+ EigenSolver<MatrixXf> edc(eigen_cov);
+ VectorXf eigen_eigenvalues = edc.eigenvalues().real();
+ MatrixXf eigen_eigenvectors = edc.eigenvectors().real();
+
+
+ // The eigenvalues and eigenvectors are not sorted in any particular order.
+ // So, we should sort them
+ typedef pair<float, int> eigen_pair;
+ vector<eigen_pair> ep;
+ for (unsigned int i = 0 ; i < _ncols; ++i) {
+ ep.push_back(make_pair(eigen_eigenvalues(i), i)); // (eigenvalue, original column index)
+ }
+ sort(ep.begin(), ep.end()); // Ascending order by default
+ // Sort them all in descending order
+ MatrixXf eigen_eigenvectors_sorted = MatrixXf::Zero(eigen_eigenvectors.rows(), eigen_eigenvectors.cols());
+ VectorXf eigen_eigenvalues_sorted = VectorXf::Zero(_ncols);
+ int colnum = 0;
+ int i = ep.size()-1;
+ for (; i > -1; i--) { // Walks the ascending list backwards
+ eigen_eigenvalues_sorted(colnum) = ep[i].first;
+ eigen_eigenvectors_sorted.col(colnum++) += eigen_eigenvectors.col(ep[i].second);
+ }
+
+ // The unsorted arrays are not needed anymore
+ eigen_eigenvalues.resize(0);
+ eigen_eigenvectors.resize(0, 0);
+
+ _sd.clear(); _prop_of_var.clear(); _kaiser = 0;
+ float tmp_sum = eigen_eigenvalues_sorted.sum();
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ _sd.push_back(sqrt(eigen_eigenvalues_sorted(i)));
+ if (_sd[i] >= 1) {
+ _kaiser = i + 1;
+ }
+ _prop_of_var.push_back(eigen_eigenvalues_sorted(i)/tmp_sum);
+ }
+
+ // PC's cumulative proportion
+ _cum_prop.clear(); _thresh995 = 1;
+ _cum_prop.push_back(_prop_of_var[0]);
+ for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+ _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+ if (_cum_prop[i] < 0.995) {
+ _thresh995 = i+1;
+ }
+ }
+
+ // Scores for PCA with correlation matrix
+ // Scale before calculating new values
+ for (unsigned int i = 0; i < _ncols; ++i) {
+ _xXf.col(i) /= sds(i);
+ }
+ sds.resize(0);
+ MatrixXf eigen_scores = _xXf * eigen_eigenvectors_sorted;
+ _pcs = eigen_scores.transpose();
+ eigen_scores.resize(0, 0);
+ }
+ return true;
+ };
+};
+
+#endif
diff --git a/lib/OTools/random_number.h b/lib/OTools/random_number.h
new file mode 100644
index 0000000..db09fc4
--- /dev/null
+++ b/lib/OTools/random_number.h
@@ -0,0 +1,68 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _RANDOM_NUMBER_H
+#define _RANDOM_NUMBER_H
+
+#include <cfloat>
+#include <cstdint>
+#include <random>
+
+// Small wrapper around std::mt19937 that remembers its seed and draws uniform
+// integers and doubles from caller-supplied ranges.
+class random_number_generator {
+protected:
+    unsigned int seed;                 // seed last given to the engine
+    std::mt19937 randomEngine;
+    std::uniform_int_distribution < unsigned int > uniformDistributionInt;
+    std::uniform_real_distribution < double > uniformDistributionDouble;
+
+public:
+
+    // Seeds the engine. The member `seed` is initialised as well, so getSeed()
+    // is valid straight after construction (the constructor parameter shadows
+    // the member and the member used to be left uninitialised).
+    random_number_generator(unsigned int seed = 15052011) : seed(seed), randomEngine(seed), uniformDistributionInt(0, 32768), uniformDistributionDouble(0, 1.0) {
+    }
+
+    ~random_number_generator(){
+    }
+
+    // Stores _seed and re-seeds the engine with it.
+    void setSeed(unsigned int _seed) {
+        seed = _seed;
+        randomEngine.seed(seed);
+    }
+
+    // Returns the seed given at construction or by the last setSeed() call.
+    unsigned int getSeed() {
+        return seed;
+    }
+
+    // Gives direct access to the engine, e.g. for standard algorithms that take one.
+    std::mt19937 & getEngine() {
+        return randomEngine;
+    }
+
+    // Uniform integer in the closed range [imin, imax].
+    unsigned int getInt(unsigned int imin, unsigned int imax) {
+        return uniformDistributionInt(randomEngine, std::uniform_int_distribution < unsigned int > {imin, imax}.param());
+    }
+
+    // Uniform integer in [0, isize - 1].
+    // NOTE(review): isize == 0 wraps to the full unsigned range; callers should pass isize >= 1.
+    unsigned int getInt(unsigned int isize) {
+        return getInt(0, isize - 1);
+    }
+
+    // Uniform double drawn from std::uniform_real_distribution over (fmin, fmax).
+    double getDouble(double fmin, double fmax) {
+        return uniformDistributionDouble(randomEngine, std::uniform_real_distribution < double > {fmin, fmax}.param());
+    }
+
+    // Uniform double over (0.0, 1.0).
+    double getDouble() {
+        return getDouble(0.0, 1.0);
+    }
+};
+
+#endif
diff --git a/lib/OTools/ranker.h b/lib/OTools/ranker.h
new file mode 100644
index 0000000..84e6678
--- /dev/null
+++ b/lib/OTools/ranker.h
@@ -0,0 +1,238 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _RANKER_H
+#define _RANKER_H
+
+#include <vector>
+#include <string>
+#include <cstdint>
+#include <algorithm>
+
+using std::vector;
+using std::string;
+
+// Ordering, ranking and quantile helpers over vectors or raw arrays.
+// Empty inputs are handled: orders and ranks come back empty instead of
+// touching front()/end()-1 or &v[0] of an empty vector.
+namespace myranker {
+
+    // Ascending comparison policy: compare(a, b) is non-zero when a < b.
+    template <class T>
+    class lt {
+    public:
+        static int compare(T a, T b) {
+            return(a < b);
+        }
+    };
+
+    // Descending comparison policy: compare(a, b) is non-zero when a > b.
+    template <class T>
+    class gt {
+    public:
+        static int compare(T a, T b) {
+            return(a > b);
+        }
+    };
+
+    // Views `sz` values starting at `p` (not owned) and orders/ranks their
+    // indexes with comparison policy C. Usable as the comparator for std::sort.
+    template <class T, class C>
+    class ranker {
+    private:
+        const T* p;
+        uint32_t sz;
+
+        // Writes 0, 1, 2, ... into w; does nothing for an empty vector.
+        template <class S>
+        static void fill_identity(std::vector<S>& w) {
+            S next = 0;
+            for (typename std::vector<S>::iterator i = w.begin(); i != w.end(); ++i) {
+                *i = next;
+                next = next + 1;
+            }
+        }
+
+        // Assigns 1-based ranks to the first `limit` entries of the ordering `ord`.
+        // Ties (equal values) share a rank for "average", "min" and "max";
+        // any other method name gives consecutive ranks in sorted order.
+        template <class S>
+        void assign_ranks(std::vector<S>& w, const std::vector<uint32_t>& ord, uint32_t limit, const std::string& method) const {
+            const bool useAverage = (method == "average");
+            const bool useMin = (method == "min");
+            const bool useMax = (method == "max");
+            if (!useAverage && !useMin && !useMax) {
+                for (uint32_t c = 0; c < limit; ++c) w[ord[c]] = c + 1;
+                return;
+            }
+            for (uint32_t c = 0, reps; c < limit; c += reps) {
+                // reps = length of the run of equal values starting at sorted position c
+                reps = 1;
+                while (c + reps < limit && p[ord[c]] == p[ord[c + reps]]) ++reps;
+                for (uint32_t k = 0; k < reps; ++k) {
+                    if (useAverage) w[ord[c + k]] = S(2 * c + reps - 1) / 2 + 1;
+                    else if (useMin) w[ord[c + k]] = c + 1;
+                    else w[ord[c + k]] = c + reps;
+                }
+            }
+        }
+
+    public:
+        // data() instead of &v[0]: well defined for an empty vector.
+        ranker(const std::vector<T>& v) : p(v.data()), sz(v.size()) {
+        }
+
+        ranker(const T* tp, uint32_t s) : p(tp), sz(s) {
+        }
+
+        // Comparator on indexes: compares the values they refer to.
+        int operator()(uint32_t i1, uint32_t i2) const {
+            return(C::compare(p[i1],p[i2]));
+        }
+
+        // w receives the indexes 0..sz-1 sorted by value (std::sort, not stable).
+        template <class S>
+        void get_orders(std::vector<S>& w) const {
+            w.resize(sz);
+            fill_identity(w);
+            std::sort(w.begin(), w.end(), *this);
+        }
+
+        // w receives the indexes of the first `num` values in sorted order (num is capped at sz).
+        template <class S>
+        void get_partial_orders(std::vector<S>& w, uint32_t num) const {
+            if (num > sz) num = sz;
+            w.resize(sz);
+            fill_identity(w);
+            std::partial_sort(w.begin(), w.begin() + num, w.end(), *this);
+            w.resize(num);
+        }
+
+        // w[i] receives the 1-based rank of value i; see assign_ranks for `method`.
+        template <class S>
+        void get_ranks(std::vector<S>& w, const std::string& method) const {
+            w.resize(sz);
+            std::vector<uint32_t> tmp(w.size());
+            get_orders(tmp);
+            assign_ranks(w, tmp, sz, method);
+        }
+
+        // Like get_ranks, but only the first `num` sorted values are ranked;
+        // every other entry of w stays 0.
+        template <class S>
+        void get_partial_ranks(std::vector<S>& w, const std::string& method, size_t num) const {
+            if (num > sz) num = sz;
+            std::vector<uint32_t> tmp(sz);
+            get_partial_orders(tmp, num);
+            w.resize(sz);
+            std::fill(w.begin(), w.end(), 0);
+            assign_ranks(w, tmp, uint32_t(num), method);
+        }
+    };
+
+    // ---- Ascending (lt) front ends ----
+
+    template <class T, class S>
+    inline void rank(const std::vector<T>& v, std::vector<S>& w, const std::string& method = "average") {
+        ranker<T, lt<T> > r(v); r.get_ranks(w, method);
+    }
+
+    template <class T, class S>
+    inline void rank(const T* d, uint32_t size, std::vector<S>& w, const std::string& method = "average") {
+        ranker<T, lt<T> > r(d, size); r.get_ranks(w, method);
+    }
+
+    template <class T, class S>
+    inline void partial_rank(const std::vector<T>& v, std::vector<S>& w, uint32_t num, const std::string& method = "average") {
+        ranker<T, lt<T> > r(v); r.get_partial_ranks(w, method, num);
+    }
+
+    template <class T, class S>
+    inline void partial_rank(const T* d, uint32_t size, std::vector<S>& w, uint32_t num, const std::string& method = "average") {
+        ranker<T, lt<T> > r(d, size); r.get_partial_ranks(w, method, num);
+    }
+
+    template <class T, class S>
+    inline void order(const std::vector<T>& v, std::vector<S>& w) {
+        ranker<T, lt<T> > r(v); r.get_orders(w);
+    }
+
+    template <class T, class S>
+    inline void order(const T* d, uint32_t size, std::vector<S>& w) {
+        ranker<T, lt<T> > r(d, size); r.get_orders(w);
+    }
+
+    template <class T, class S>
+    inline void partial_order(const std::vector<T>& v, std::vector<S>& w, uint32_t num) {
+        ranker<T, lt<T> > r(v); r.get_partial_orders(w, num);
+    }
+
+    template <class T, class S>
+    inline void partial_order(const T* d, uint32_t size, std::vector<S>& w, uint32_t num) {
+        ranker<T, lt<T> > r(d, size); r.get_partial_orders(w, num);
+    }
+
+    // ---- Descending (gt) front ends ----
+
+    template <class T, class S>
+    inline void rankhigh(const std::vector<T>& v, std::vector<S>& w, const std::string& method = "average") {
+        ranker<T, gt<T> > r(v); r.get_ranks(w, method);
+    }
+
+    template <class T, class S>
+    inline void rankhigh(const T* d, uint32_t size, std::vector<S>& w, const std::string& method = "average") {
+        ranker<T, gt<T> > r(d, size); r.get_ranks(w, method);
+    }
+
+    template <class T, class S>
+    inline void partial_rankhigh(const std::vector<T>& v, std::vector<S>& w, uint32_t num, const std::string& method = "average") {
+        ranker<T, gt<T> > r(v); r.get_partial_ranks(w, method, num);
+    }
+
+    template <class T, class S>
+    inline void partial_rankhigh(const T* d, uint32_t size, std::vector<S>& w, uint32_t num, const std::string& method = "average") {
+        ranker<T, gt<T> > r(d, size); r.get_partial_ranks(w, method, num);
+    }
+
+    template <class T, class S>
+    inline void orderhigh(const std::vector<T>& v, std::vector<S>& w) {
+        ranker<T, gt<T> > r(v); r.get_orders(w);
+    }
+
+    template <class T, class S>
+    inline void orderhigh(const T* d, uint32_t size, std::vector<S>& w) {
+        ranker<T, gt<T> > r(d, size); r.get_orders(w);
+    }
+
+    template <class T, class S>
+    inline void partial_orderhigh(const std::vector<T>& v, std::vector<S>& w, uint32_t num) {
+        ranker<T, gt<T> > r(v); r.get_partial_orders(w, num);
+    }
+
+    template <class T, class S>
+    inline void partial_orderhigh(const T* d, uint32_t size, std::vector<S>& w, uint32_t num) {
+        ranker<T, gt<T> > r(d, size); r.get_partial_orders(w, num);
+    }
+
+    // Quantile q of the `size` values at d, linearly interpolated between the
+    // two neighbouring order statistics. Returns T(0) for size 0, and the
+    // minimum / maximum for q <= 0 / q >= 1. The input is not modified.
+    template <class T>
+    inline T quantile(const T* d, const uint32_t size, const double q) {
+        if (size == 0) return T(0);
+        if (size == 1) return d[0];
+        if (q <= 0) return *std::min_element(d, d + size);
+        if (q >= 1) return *std::max_element(d, d + size);
+
+        double pos = (size - 1) * q;
+        uint32_t ind = uint32_t(pos);
+        double delta = pos - ind;
+        // Work on a copy: nth_element reorders its range.
+        std::vector<T> w(size); std::copy(d, d + size, w.begin());
+        std::nth_element(w.begin(), w.begin() + ind, w.end());
+        T i1 = *(w.begin() + ind);
+        // q < 1 here, so ind <= size - 2 and the range below is never empty.
+        T i2 = *std::min_element(w.begin() + ind + 1, w.end());
+        return i1 * (1.0 - delta) + i2 * delta;
+    }
+
+    // Vector overload; data() keeps the empty-vector case well defined.
+    template <class T>
+    inline T quantile(const std::vector<T>& v, const double q) {
+        return quantile(v.data(), v.size(), q);
+    }
+} // namespace myranker
+
+#endif
diff --git a/lib/OTools/residualizer.h b/lib/OTools/residualizer.h
new file mode 100644
index 0000000..57b0853
--- /dev/null
+++ b/lib/OTools/residualizer.h
@@ -0,0 +1,195 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _RESIDUALIZER_H
+#define _RESIDUALIZER_H
+
+#define R_QR_TOLERANCE 1e-7
+
+//ERROR CODE FOR COVARIATE PARSING
+#define COV_OKAY 0
+#define COV_MIXD 1
+#define COV_DROP 2
+#define COV_NVAR 3
+#define COV_NCOV 4
+#define COV_CORR 5
+
+//STL INCLUDES
+#include <vector>
+#include <set>
+#include <sstream>
+#include <string>
+
+//EIGEN INCLUDES
+#include <Eigen/Dense>
+#include <Eigen/LU>
+
+//EIGEN NAMESPACE
+using namespace Eigen;
+
+// Regresses covariates out of per-sample data vectors.
+// covarM holds an intercept column of ones followed by one column per accepted
+// covariate; build() factorises it with a column-pivoting Householder QR and
+// residualize() replaces a data vector by its residuals from that fit.
+class residualizer {
+public:
+    unsigned int n_samples;
+    unsigned int n_covariates;                  // covariate columns in covarM, intercept excluded
+    MatrixXd covarM;                            // n_samples x (n_covariates + 1) design matrix
+    MatrixXd PQR_Q;                             // Q of the QR; only filled by build() when rank deficient
+    MatrixXd PQR_Q_A;                           // adjoint of Q; only filled by build() when rank deficient
+    ColPivHouseholderQR < MatrixXd > PQR;
+
+    // Starts with the intercept column only.
+    residualizer(int _n_samples) : n_samples (_n_samples) {
+        covarM.resize(n_samples,1);
+        covarM.col(0) = VectorXd::Ones(n_samples);
+        n_covariates = 0;
+    }
+
+    ~residualizer() {
+        n_samples = 0;
+        n_covariates = 0;
+        covarM.resize(0,0);
+        PQR_Q.resize(0,0);
+        PQR_Q_A.resize(0,0);
+    }
+
+    // Adds a covariate given as text, one entry per sample; "NA" marks a missing value.
+    // An all-numeric covariate becomes one column. A categorical covariate becomes one
+    // 0/1 column per level except the first (in set order). Missing entries are then
+    // replaced by the mean of the non-missing entries of the same column.
+    // Returns COV_OKAY, COV_MIXD (numeric and non-numeric entries mixed),
+    // COV_DROP (a single categorical level) or COV_NVAR (a resulting column is constant).
+    // NOTE(review): assumes covariate.size() == n_samples; this is not checked.
+    unsigned int push(std::vector < std::string > & covariate) {
+        std::set < std::string > factors;
+        std::set < unsigned int > i_yesmissing, i_nonmissing;
+
+        //MAP MISSING
+        for (unsigned int i = 0 ; i < covariate.size() ; i++) {
+            if (covariate[i] == "NA") i_yesmissing.insert(i);
+            else i_nonmissing.insert(i);
+        }
+
+        //TEST FOR NUMERIC
+        bool isNumeric = false, isAlphabetic = false;
+        for (std::set < unsigned int >::iterator itNM = i_nonmissing.begin(); itNM != i_nonmissing.end() ; ++itNM) {
+            float value;
+            std::istringstream in(covariate[*itNM]);
+            if (!(in >> value)) {
+                factors.insert(covariate[*itNM]);
+                isAlphabetic = true;
+            } else isNumeric = true;
+        }
+        if (isNumeric && isAlphabetic) return COV_MIXD;
+
+        //FILL IN VALUES
+        std::vector < std::vector < float > > additional_hcov;
+        if (factors.size() == 0) {
+            additional_hcov = std::vector < std::vector < float > > (1, std::vector < float > (n_samples, 0.0));
+            // Convert only the non-missing entries: std::stof("NA") throws
+            // std::invalid_argument, which used to abort before the imputation below.
+            for (std::set < unsigned int >::iterator itNM = i_nonmissing.begin(); itNM != i_nonmissing.end() ; ++itNM) {
+                additional_hcov[0][*itNM] = std::stof(covariate[*itNM]);
+            }
+        } else if (factors.size() > 1) {
+            // The first level is the reference and gets no indicator column.
+            factors.erase(factors.begin());
+            for (std::set < std::string > ::iterator itF = factors.begin(); itF != factors.end() ; itF++) {
+                additional_hcov.push_back(std::vector < float > (n_samples, 0.0));
+                for (unsigned int i = 0 ; i < n_samples ; i++) additional_hcov.back()[i] = (covariate[i] == (*itF));
+            }
+        } else return COV_DROP;
+
+        //IMPUTE MISSING
+        if (i_yesmissing.size() > 0) {
+            for (unsigned int c = 0 ; c < additional_hcov.size() ; c ++) {
+                double sum_row = 0;
+                for (std::set < unsigned int >::iterator itNM = i_nonmissing.begin(); itNM != i_nonmissing.end() ; ++itNM) sum_row += additional_hcov[c][*itNM];
+                for (std::set < unsigned int >::iterator itYM = i_yesmissing.begin(); itYM != i_yesmissing.end() ; ++itYM) additional_hcov[c][*itYM] = sum_row / i_nonmissing.size();
+            }
+        }
+
+        //ADD RESULTING COVARIATES
+        for (unsigned int c = 0 ; c < additional_hcov.size() ; c ++) if (!push(additional_hcov[c])) return COV_NVAR;
+
+        return COV_OKAY;
+    }
+
+    // Appends a numeric covariate as a new column of covarM.
+    // Returns false, adding nothing, when all values are identical.
+    bool push(std::vector < float > & covariate) {
+        bool isVariable = false;
+        for (unsigned int e = 1 ; e < covariate.size() ; e ++) if (covariate[e] != covariate[e-1]) isVariable = true;
+        if (!isVariable) return false;
+        n_covariates ++;
+        covarM.conservativeResize(n_samples, n_covariates+1);
+        for (unsigned int i = 0 ; i < n_samples ; i ++) covarM(i, n_covariates) = covariate[i];
+        return true;
+    }
+
+    // Factorises covarM. Returns COV_NCOV when no covariate was added and
+    // COV_CORR when the columns are linearly dependent at R_QR_TOLERANCE;
+    // in that case Q and its adjoint are cached for residualize().
+    unsigned int build() {
+        if (n_covariates == 0) return COV_NCOV;
+        PQR = ColPivHouseholderQR<MatrixXd>(covarM);
+        PQR.setThreshold(R_QR_TOLERANCE);
+        if (PQR.rank() != n_covariates + 1) {
+            PQR_Q = PQR.householderQ();
+            PQR_Q_A = PQR.householderQ().adjoint();
+            return COV_CORR;
+        }
+        return COV_OKAY;
+    }
+
+    // Replaces the first n_samples entries of data by their residuals.
+    // Returns COV_NCOV without covariates and COV_NVAR when data is constant.
+    unsigned int residualize(std::vector < float > & data) {
+        if (n_covariates == 0) return COV_NCOV;
+
+        bool isVariable = false;
+        for (unsigned int e = 1 ; e < data.size() ; e ++) if (data[e] != data[e-1]) isVariable = true;
+        if (!isVariable) return COV_NVAR;
+
+        //FILL IN DATA
+        VectorXd counts(n_samples);
+        for (unsigned int i = 0; i < n_samples ; i ++) counts(i) = data[i];
+
+        //CORRECTION
+        VectorXd e = residuals_of(counts);
+        for (int i = 0; i < e.size(); i ++) data[i] = static_cast<float>(e(i));
+
+        return COV_OKAY;
+    }
+
+    // Raw-array overload: data must point to at least n_samples floats.
+    unsigned int residualize(float * data) {
+        if (n_covariates == 0) return COV_NCOV;
+
+        bool isVariable = false;
+        for (unsigned int e = 1 ; e < n_samples ; e ++) if (data[e] != data[e-1]) isVariable = true;
+        if (!isVariable) return COV_NVAR;
+
+        //FILL IN DATA
+        VectorXd counts(n_samples);
+        for (unsigned int i = 0; i < n_samples ; i ++) counts(i) = data[i];
+
+        //CORRECTION
+        VectorXd e = residuals_of(counts);
+        for (int i = 0; i < e.size(); i ++) data[i] = static_cast<float>(e(i));
+
+        return COV_OKAY;
+    }
+
+private:
+    // Residuals of counts after fitting covarM. Full rank: solve the least-squares
+    // problem through the QR. Rank deficient: project onto the first rank() columns
+    // of Q by zeroing the remaining effects.
+    VectorXd residuals_of(const VectorXd & counts) const {
+        if (PQR.rank() == n_covariates + 1) {
+            VectorXd m_coef = PQR.solve(counts);
+            VectorXd fitted = covarM * m_coef;
+            VectorXd e = counts - fitted;
+            return e;
+        }
+        VectorXd effects(PQR_Q_A * counts);
+        effects.tail(n_samples - PQR.rank()).setZero();
+        VectorXd fitted = PQR_Q * effects;
+        VectorXd e = counts - fitted;
+        return e;
+    }
+};
+
+#endif
diff --git a/lib/OTools/string_utils.h b/lib/OTools/string_utils.h
new file mode 100644
index 0000000..ed61eec
--- /dev/null
+++ b/lib/OTools/string_utils.h
@@ -0,0 +1,72 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _STRING_UTILS_H
+#define _STRING_UTILS_H
+
+#include <sstream>
+#include <iomanip>
+#include <string>
+#include <vector>
+
+//using namespace std;
+
+// Stateless string helpers: tokenising, a numeric test and number-to-string formatting.
+// Standard names are qualified with std:: so the header does not depend on a
+// using-directive pulled in by some other header.
+class string_utils {
+public:
+    string_utils () {};
+    ~string_utils () {};
+
+    // Splits str on any character of sep into tokens (cleared first) and returns
+    // the token count. Runs of separators produce no empty tokens. At most
+    // n_max_tokens tokens are produced; the rest of str is ignored. A trailing
+    // '\r' is stripped from the last token.
+    // An empty str yields one empty token; a str made only of separators yields
+    // none (this case used to call back() on an empty vector).
+    int split(const std::string & str, std::vector < std::string > & tokens, std::string sep = " ", unsigned int n_max_tokens = 1000000) {
+        tokens.clear();
+        if (str == ""){
+            tokens.push_back("");
+            return tokens.size();
+        }
+        std::string::size_type p_last = str.find_first_not_of(sep, 0);
+        std::string::size_type p_curr = str.find_first_of(sep, p_last);
+        while ((std::string::npos != p_curr || std::string::npos != p_last) && tokens.size() < n_max_tokens) {
+            tokens.push_back(str.substr(p_last, p_curr - p_last));
+            p_last = str.find_first_not_of(sep, p_curr);
+            p_curr = str.find_first_of(sep, p_last);
+        }
+        if (!tokens.empty()) {
+            std::string & last = tokens.back();
+            if (!last.empty() && last[last.size()-1] == '\r') last = last.substr(0, last.size()-1);
+        }
+        return tokens.size();
+    }
+
+    // True when a float can be extracted from the start of str
+    // (text after the number is not examined).
+    bool numeric(std::string & str) {
+        float n;
+        std::istringstream in(str);
+        if (!(in >> n)) return false;
+        return true;
+    }
+
+    // Formats n with operator<<; prec >= 0 selects fixed notation with prec decimals.
+    template < class T >
+    std::string str(T n, int prec = -1) {
+        std::ostringstream ss( std::stringstream::out );
+        if (prec >= 0) { ss << std::setiosflags( std::ios::fixed ); ss.precision(prec); }
+        ss << n;
+        return ss.str();
+    }
+
+    // Formats the elements of v separated by single spaces; prec as above.
+    template < class T >
+    std::string str(std::vector < T > & v, int prec = -1) {
+        std::ostringstream ss( std::stringstream::out );
+        if (prec >= 0) { ss << std::setiosflags( std::ios::fixed ); ss.precision(prec); }
+        for (std::size_t e = 0 ; e < v.size() ; e ++) ss << (e>0?" ":"") << v[e] ;
+        return ss.str();
+    }
+};
+
+#endif
diff --git a/lib/OTools/timer.h b/lib/OTools/timer.h
new file mode 100644
index 0000000..4504d50
--- /dev/null
+++ b/lib/OTools/timer.h
@@ -0,0 +1,69 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _TIMER_H
+#define _TIMER_H
+
+#include <chrono>
+#include <ctime>
+#include <sstream>
+#include <iomanip>
+#include <string>
+
+// Wall-clock helper: seconds elapsed since construction, milliseconds elapsed
+// since the last clock() call, and a formatted local date string.
+class timer {
+protected:
+    std::chrono::time_point<std::chrono::high_resolution_clock> start_timing_clock, prev_timing_clock;
+
+public:
+    // Both reference points start at construction time, so rel_time() gives a
+    // meaningful value even if clock() has not been called yet.
+    timer () {
+        start_timing_clock = std::chrono::high_resolution_clock::now();
+        prev_timing_clock = start_timing_clock;
+    }
+
+    ~timer() {
+    }
+
+    // Resets the reference point used by rel_time().
+    void clock() {
+        prev_timing_clock = std::chrono::high_resolution_clock::now();
+    }
+
+    // Milliseconds since the last clock() call (or since construction).
+    unsigned int rel_time() {
+        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - prev_timing_clock).count();
+    }
+
+    // Whole seconds since construction.
+    unsigned int abs_time() {
+        return std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - start_timing_clock).count();
+    }
+
+    // Current local time formatted as "%d/%m/%Y - %X".
+    std::string date() {
+        char buffer[256];
+        std::time_t t = std::time(NULL);
+        std::tm tm = *std::localtime(&t);
+        std::strftime(buffer, 256, "%d/%m/%Y - %X", &tm);
+        return std::string(buffer);
+    }
+
+    /* More elegant GCC5 version
+    std::string date() {
+        auto now = std::chrono::system_clock::now();
+        auto in_time_t = std::chrono::system_clock::to_time_t(now);
+        std::stringstream ss;
+        ss << std::put_time(std::localtime(&in_time_t), "%d/%m/%Y - %X");
+        return ss.str();
+    }
+    */
+
+};
+
+#endif
diff --git a/lib/OTools/verbose.h b/lib/OTools/verbose.h
new file mode 100644
index 0000000..c9f267d
--- /dev/null
+++ b/lib/OTools/verbose.h
@@ -0,0 +1,121 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _VERBOSE_H
+#define _VERBOSE_H
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <cmath>
+
+using namespace std;
+
+class verbose {
+protected:
+ ofstream log;
+ bool verbose_on_screen;
+ bool verbose_on_log;
+ int prev_percent;
+ // log: optional log file; verbose_on_screen / verbose_on_log: output switches; prev_percent: last percentage drawn by progress().
+public:
+ verbose() {
+ verbose_on_screen = true;
+ verbose_on_log = false;
+ prev_percent = 0;
+ }
+ // Closes the log stream when the object is destroyed.
+ ~verbose() {
+ close_log();
+ }
+ // Opens fname as the log file; returns false if that fails, otherwise enables log output and returns true.
+ bool open_log(string fname) {
+ log.open(fname.c_str());
+ if (log.fail()) return false;
+ else return (verbose_on_log = true);
+ }
+ // Closes the log stream (the verbose_on_log switch is left untouched).
+ void close_log() {
+ log.close();
+ }
+ // Turns off screen output; log output is not affected.
+ void set_silent() {
+ verbose_on_screen = false;
+ }
+ // Writes s followed by a newline to every enabled output.
+ void print(string s) {
+ if (verbose_on_screen) cout << s << endl;
+ if (verbose_on_log) log << s << endl;
+ }
+ // Same as title(), except that the screen copy is wrapped in ANSI colour escape codes; the log copy is plain.
+ void ctitle(string s) {
+ if (verbose_on_screen) cout << endl << "\x1B[32m" << s << "\033[0m" << endl;
+ if (verbose_on_log) log << endl << s << endl;
+ }
+ // Writes a blank line and then s to every enabled output.
+ void title(string s) {
+ if (verbose_on_screen) cout << endl << s << endl;
+ if (verbose_on_log) log << endl << s << endl;
+ }
+ // Writes s as a " * " list item to every enabled output.
+ void bullet(string s) {
+ if (verbose_on_screen) cout << " * " << s << endl;
+ if (verbose_on_log) log << " * " << s << endl;
+ }
+ // Writes a "WARNING: " message (coloured tag on screen); execution continues.
+ void warning(string s) {
+ if (verbose_on_screen) cout << endl << "\x1B[33m" << "WARNING: " << "\033[0m" << s << endl;
+ if (verbose_on_log) log << endl << "WARNING: " << s << endl;
+ }
+ // Writes an "EXITED: " message, then terminates the process with EXIT_SUCCESS.
+ void leave(string s) {
+ if (verbose_on_screen) cout << endl << "\x1B[33m" << "EXITED: " << "\033[0m" << s << endl;
+ if (verbose_on_log) log << endl << "EXITED: " << s << endl;
+ exit(EXIT_SUCCESS);
+ }
+ // Writes an "ERROR: " message (to cout, not cerr), then terminates the process with EXIT_FAILURE.
+ void error(string s) {
+ if (verbose_on_screen) cout << endl << "\x1B[31m" << "ERROR: " << "\033[0m" << s << endl;
+ if (verbose_on_log) log << endl << "ERROR: " << s << endl;
+ exit(EXIT_FAILURE);
+ }
+ // Writes a "DONE: " message, then terminates the process with EXIT_SUCCESS.
+ void done(string s) {
+ if (verbose_on_screen) cout << endl << "\x1B[32m" << "DONE: " << "\033[0m" << s << endl;
+ if (verbose_on_log) log << endl << "DONE: " << s << endl;
+ exit(EXIT_SUCCESS);
+ }
+ // Screen-only progress bar; percent is a fraction (1.0 = 100 %). Redrawn only when the integer percentage grows, or after it has gone backwards.
+ void progress(float percent, int barWidth = 70) {
+ if (verbose_on_screen) {
+ int curr_percent = int(percent * 100.0);
+ if (prev_percent > curr_percent) prev_percent = -1;
+ if (curr_percent > prev_percent) {
+ int pos = barWidth * percent;
+ cout << "[";
+ for (int i = 0; i < barWidth; ++i) {
+ if (i < pos) cout << "=";
+ else if (i == pos) cout << ">";
+ else cout << " ";
+ }
+ cout << "] " << curr_percent << " %\r";
+ if (percent < 1.0) cout.flush();
+ else cout << endl;
+ prev_percent = curr_percent;
+ }
+ }
+ }
+};
+#endif
diff --git a/script/plotTrans.R b/script/plotTrans.R
new file mode 100644
index 0000000..8b418a4
--- /dev/null
+++ b/script/plotTrans.R
@@ -0,0 +1,59 @@
+#Read command line arguments (stop() is called directly: wrapped in try() the error was only printed and the script carried on)
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 5) stop("Incorrect number of arguments, usage> Rscript plotTrans.R qqplot.pdf nominal.hits.txt.gz nominal.bins.txt.gz permutation.hits.txt.gz permutation.bins.txt.gz")
+
+#Read data (N = nominal pass, P = permutation pass; h = best hits, b = bins)
+Nh=read.table(args[2], head=FALSE, stringsAsFactors=FALSE)
+Nb=read.table(args[3], head=FALSE, stringsAsFactors=FALSE)
+Ph=read.table(args[4], head=FALSE, stringsAsFactors=FALSE)
+Pb=read.table(args[5], head=FALSE, stringsAsFactors=FALSE)
+
+#Sort best hits by decreasing p-value (column V7)
+Nh=Nh[order(-Nh$V7), ]
+Ph=Ph[order(-Ph$V7), ]
+
+#Counts
+n_bins = nrow(Nb)
+n_tests = nrow(Nh) + sum(Nb$V6)
+
+#Cumulative sums of the per-bin counts (V6): cs0 = count before the bin, cs1 = count up to and including it
+Nb$cs1=cumsum(Nb$V6)
+Nb$cs0=c(0, Nb$cs1[-nrow(Nb)])
+Pb$cs1=cumsum(Pb$V6)
+Pb$cs0=c(0, Pb$cs1[-nrow(Pb)])
+
+#Get the null p-value of a rank: ranks beyond the binned tests map to a best hit, others are interpolated inside their bin
+pvalue_estimate <- function(idx, Ph, Pb) {
+ pvalue = -1.0;
+ n_binned = Pb$cs1[nrow(Pb)]
+ if (idx > n_binned) {
+ pvalue = Ph$V7[idx - n_binned]
+ } else {
+ nidx = which(Pb$cs0 < idx & idx <= Pb$cs1)
+ cs0 = Pb$cs0[nidx]
+ cs1 = Pb$cs1[nidx]
+ pv0 = Pb$V4[nidx]
+ pv1 = Pb$V5[nidx]
+
+ pvalue = pv0 - (pv0 - pv1) * (idx - cs0) / (cs1 - cs0)
+ }
+ return (pvalue);
+}
+
+#Build the QQplot data: one row per bin followed by one row per best hit (column 1 = nominal, column 2 = permutation)
+MP = matrix(0, nrow = n_bins + nrow(Nh), ncol = 2)
+for (b in seq_len(n_bins)) {
+ MP[b, 1] = (Nb$V4[b] + Nb$V5[b]) / 2
+ MP[b, 2] = pvalue_estimate(round((Nb$cs0[b] + Nb$cs1[b]) / 2), Ph, Pb)
+}
+for (h in seq_len(nrow(Nh))) {
+ MP[h+n_bins, 1] = Nh$V7[h]
+ MP[h+n_bins, 2] = pvalue_estimate(sum(Nb$V6) + h, Ph, Pb)
+}
+
+#Plot the QQplot
+pdf(args[1], 5, 5)
+plot(-log10(MP[, 2]), -log10(MP[, 1]), xlab="-log10(permutation P-values)", ylab="-log10(nominal P-values)", main="QQplot")
+abline(0, 1, col="red")
+dev.off()
+
diff --git a/script/runFDR_atrans.R b/script/runFDR_atrans.R
new file mode 100644
index 0000000..f80005c
--- /dev/null
+++ b/script/runFDR_atrans.R
@@ -0,0 +1,25 @@
+#Load qvalue package
+suppressMessages(library(qvalue))
+
+#Read command line arguments (stop() is called directly: wrapped in try() the error was only printed and the script carried on)
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 4) stop("Incorrect number of arguments, usage> Rscript runFDR_atrans.R adjusted.best.txt adjusted.hits.txt FDR output.txt")
+
+cat("\nProcessing QTLtools approximate trans output\n");
+cat(" * File best = [", args[1], "]\n");
+cat(" * File hits = [", args[2], "]\n");
+cat(" * FDR = [", args[3], "]\n");
+cat(" * Output = [", args[4], "]\n");
+
+#Threshold = smallest adjusted P-value (V2 of the best file) whose q-value exceeds the requested FDR
+B = read.table(args[1], head=FALSE, stringsAsFactors=FALSE)
+H = read.table(args[2], head=FALSE, stringsAsFactors=FALSE)
+FDR = as.numeric(args[3])
+B$qval = qvalue(B$V2)$qval
+threshold = min(B$V2[which(B$qval > FDR)])
+cat(" * Threshold of significance for adjusted P-values =" , threshold, "\n")
+
+cat("\nFiltering hits and output results\n");
+S = H[which(H$V8 <= threshold), ]
+cat(" * " , nrow(S) , " are significante out of ", nrow(H), "\n")
+write.table(S, args[4], quote=FALSE, row.names=FALSE, col.names=FALSE)
diff --git a/script/runFDR_cis.R b/script/runFDR_cis.R
new file mode 100644
index 0000000..b0de790
--- /dev/null
+++ b/script/runFDR_cis.R
@@ -0,0 +1,67 @@
+#Load qvalue package
+suppressMessages(library(qvalue))
+
+#Read command line arguments (stop() is called directly: wrapped in try() the error was only printed and the script carried on)
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) stop("Incorrect number of arguments, usage> Rscript runFDR_cis.R INPUT FDR OUTPUT")
+opt_input = args[1];
+opt_fdr = as.numeric(args[2]);
+opt_output = args[3];
+
+#Verbose
+cat("\nProcessing fastQTL output\n");
+cat(" * Input = [", opt_input, "]\n");
+cat(" * FDR = ", opt_fdr, "\n");
+cat(" * Output = [", opt_output, "]\n");
+
+#Read data (a 19-column input uses the indices below as is; any other layout shifts them by exon_offset = 2)
+cat("\nRead Input data\n");
+D = read.table(opt_input,hea=FALSE, stringsAsFactors=FALSE)
+exon_offset = ifelse(ncol(D) == 19, 0, 2)
+if (exon_offset == 2) cat(" * Gene level correction detected\n")
+MASK=!is.na(D[,18+exon_offset])
+Dnas=D[!MASK,]
+D = D[MASK,]
+cat(" * Number of molecular phenotypes =" , nrow(D), "\n")
+cat(" * Number of NA lines =" , nrow(Dnas), "\n")
+cat(" * Correlation between Beta approx. and Empirical p-values =", round(cor(D[, 18+exon_offset], D[, 19+exon_offset]), 4), "\n")
+
+#Run qvalue on pvalues for best signals
+cat("\nProcess Input data with Qvalue\n");
+MASK=!is.na(D[,18+exon_offset])
+Q = qvalue(D[MASK,19+exon_offset]);
+D$qval = NA;
+D$qval[MASK] = Q$qvalue;
+cat(" * Proportion of significant phenotypes =" , round((1 - Q$pi0) * 100, 2), "%\n")
+
+#Determine significance threshold (midpoint between the largest significant and the smallest non-significant p-value)
+cat("\nDetermine significance thresholds\n");
+set0 = D[which(D$qval <= opt_fdr),]
+set1 = D[which(D$qval > opt_fdr),]
+pthreshold = (sort(set1[,19+exon_offset])[1] - sort(-1.0 * set0[,19+exon_offset])[1]) / 2
+cat(" * Corrected p-value threshold = ", pthreshold, "\n")
+pval0 = qbeta(pthreshold, D[,14+exon_offset], D[,15+exon_offset], ncp = 0, lower.tail = TRUE, log.p = FALSE)
+test0 = qf(pval0, 1, D[,13+exon_offset], ncp = 0, lower.tail = FALSE, log.p = FALSE)
+corr0 = sqrt(test0 / (D[,13+exon_offset] + test0))
+test1 = D[,12+exon_offset] * corr0 * corr0 / (1 - corr0 * corr0)
+pval1 = pf(test1, 1, D[,12+exon_offset], ncp = 0, lower.tail = FALSE, log.p = FALSE)
+cat(" * pval0 = ", mean(pval0), " +/- ", sd(pval0), "\n")
+cat(" * test0 = ", mean(test0), " +/- ", sd(test0), "\n")
+cat(" * corr0 = ", mean(corr0), " +/- ", sd(corr0), "\n")
+cat(" * test1 = ", mean(test1), " +/- ", sd(test1), "\n")
+cat(" * pval1 = ", mean(pval1), " +/- ", sd(pval1), "\n")
+D$nthresholds = pval1
+
+#Write significant hits
+fout1=paste(opt_output, "significant.txt", sep=".")
+cat("\nWrite significant hits in [", fout1, "]\n");
+write.table(D[D$qval <= opt_fdr,], fout1, quote=FALSE, row.names=FALSE, col.names=FALSE)
+
+#Write thresholds (NA rows reuse the column they were masked on, which is NA for every one of them, instead of a fixed column 18)
+fout2=paste(opt_output, "thresholds.txt", sep=".")
+cat("\nWrite nominal thresholds in [", fout2, "]\n");
+D1=D[, c(1, 21+exon_offset)]
+D2=Dnas[, c(1, 18+exon_offset)]
+names(D2)=names(D1)
+D3=rbind(D1, D2)
+write.table(D3, fout2, quote=FALSE, row.names=FALSE, col.names=FALSE)
diff --git a/script/runFDR_ftrans.R b/script/runFDR_ftrans.R
new file mode 100644
index 0000000..bb677c2
--- /dev/null
+++ b/script/runFDR_ftrans.R
@@ -0,0 +1,25 @@
+#Read command line arguments (stop() is called directly: wrapped in try() the error was only printed and the script carried on)
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) stop("Incorrect number of arguments, usage> Rscript runFDR_ftrans.R nominal.hits.txt.gz permutation.hits.txt.gz output.txt")
+
+cat("\nProcessing QTLtools full trans output\n");
+cat(" * File hits nominal = [", args[1], "]\n");
+cat(" * File hits permute = [", args[2], "]\n");
+cat(" * Output = [", args[3], "]\n");
+
+#Read data
+Nh=read.table(args[1], head=FALSE, stringsAsFactors=FALSE)
+Ph=read.table(args[2], head=FALSE, stringsAsFactors=FALSE)
+
+#Sort best hits by increasing p-value (column V7)
+Nh=Nh[order(Nh$V7), ]
+Ph=Ph[order(Ph$V7), ]
+
+#Estimate FDR at rank h: permutation hits at least as significant, divided by the h nominal hits up to that rank
+Nh$fdr=1.0
+for (h in seq_len(nrow(Nh))) {
+ Nh$fdr[h] = sum(Ph$V7 <= Nh$V7[h]) / h
+}
+
+#OUTPUT
+write.table(Nh, args[3], quote=FALSE, row.names=FALSE, col.names=FALSE)
diff --git a/src/QTLtools.cpp b/src/QTLtools.cpp
new file mode 100644
index 0000000..d7f04b6
--- /dev/null
+++ b/src/QTLtools.cpp
@@ -0,0 +1,133 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "mode_cis/cis_data.h"
+#include "mode_trans/trans_data.h"
+#include "mode_match/match_data.h"
+#include "mode_fenrich/fenrich_data.h"
+#include "mode_correct/correct_data.h"
+#include "mode_rtc/rtc_data.h"
+#include "mode_pca/pca_data.h"
+#include "mode_genrich/genrich_data.h"
+#include "mode_extract/extract_data.h"
+#include "mode_ase/ase_data.h"
+#include "mode_quan/quan_data.h"
+#include "mode_union/union_data.h"
+#include "mode_bamstat/bamstat_data.h"
+#include "mode_fdensity/fdensity_data.h"
+
+void printModes(){ //Prints the usage banner followed by one line per available mode
+ static const char * const mode_help [] = {
+ " bamstat Calculate basic QC metrics for BAM/SAM",
+ " match Match VCF genotypes to BAM/SAM file",
+ " pca Calculate principal components for a BED/VCF/BCF file",
+ " correct Covariate correction of a BED file",
+ " cis cis QTL analysis",
+ " trans trans QTL analysis",
+ " fenrich Functional enrichment for QTLs",
+ " fdensity Functional density around QTLs",
+ " genrich GWAS enrichment for QTLs",
+ " rtc Regulatory Trait Concordance analysis",
+ " rtc-union Find the union of QTLs",
+ " extract Data extraction mode",
+ " quan Quantification mode",
+ " ase Measure allelic imbalance at every het genotype"
+ };
+ vrb.ctitle("Usage:"); vrb.print(" QTLtools [mode] [options]"); vrb.print(" eg: QTLtools cis --help");
+ vrb.ctitle("Available modes:"); for (unsigned int m = 0 ; m < sizeof(mode_help) / sizeof(mode_help[0]) ; m ++) vrb.print(mode_help[m]);
+}
+
+int main(int argc, char ** argv) {
+ //Sets up logging, prints the banner, then hands argv[2..] to the mode named by argv[1].
+ //1. Start timing
+ timer running_timer;
+
+ //2. Open LOG file if necessary (every argument is scanned so that a trailing --silent is honoured; --log needs a following value)
+ for (int a = 1 ; a < argc ; a ++) {
+ if ((strcmp(argv[a], "--log") == 0) && (a + 1 < argc) && !vrb.open_log(string(argv[a+1]))) vrb.error("Impossible to open log file!");
+ if (strcmp(argv[a], "--silent") == 0) vrb.set_silent();
+ }
+
+ //3. Print header on screen
+ vrb.ctitle("QTLtools");
+ vrb.bullet("Authors : Olivier DELANEAU / Halit ONGEN / Emmanouil DERMITZAKIS");
+ vrb.bullet("Contact : olivier.delaneau at gmail.com / halit.ongen at unige.ch / Emmanouil.Dermitzakis at unige.ch");
+ vrb.bullet("Webpage : https://qtltools.github.io/qtltools/");
+ vrb.bullet("Version : " + string(QTLTOOLS_VERSION));
+ vrb.bullet("Date : " + running_timer.date());
+
+ //4. Switch mode
+ vector < string > args;
+ if (argc < 2){
+ printModes();
+ exit(EXIT_SUCCESS);
+ }
+ for (int a = 2 ; a < argc ; a ++) args.push_back(string(argv[a]));
+
+ //5.1. CIS mode
+ if (strcmp(argv[1], "cis") == 0) cis_main(args);
+
+ //5.2. TRANS mode
+ else if (strcmp(argv[1], "trans") == 0) trans_main(args);
+
+ //5.3. MATCH mode
+ else if (strcmp(argv[1], "match") == 0) match_main(args);
+
+ //5.4. FENRICH mode
+ else if (strcmp(argv[1], "fenrich") == 0) fenrich_main(args);
+
+ //5.5. GENRICH mode
+ else if (strcmp(argv[1], "genrich") == 0) genrich_main(args);
+
+ //5.6. CORRECT mode
+ else if (strcmp(argv[1], "correct") == 0) correct_main(args);
+
+ //5.7. RTC mode
+ else if (strcmp(argv[1], "rtc") == 0) rtc_main(args);
+
+ //5.8. PCA mode
+ else if (strcmp(argv[1], "pca") == 0) pca_main(args);
+
+ //5.9. EXTRACT mode
+ else if (strcmp(argv[1], "extract") == 0) extract_main(args);
+
+ //5.10. RTC-UNION mode
+ else if (strcmp(argv[1], "rtc-union") == 0) union_main(args);
+
+ //5.11. QUANTIFICATION mode
+ else if (strcmp(argv[1], "quan") == 0) quan_main(args);
+
+ //5.12. ASE mode
+ else if (strcmp(argv[1], "ase") == 0) ase_main(args);
+
+ //5.13. BAMSTAT mode
+ else if (strcmp(argv[1], "bamstat") == 0) bamstat_main(args);
+
+ //5.14. FDENSITY mode
+ else if (strcmp(argv[1], "fdensity") == 0) fdensity_main(args);
+
+ //5.15. HELP or UNRECOGNIZED mode
+ else if (strcmp(argv[1], "--help") == 0) {
+ printModes();
+ exit(EXIT_SUCCESS);
+ } else {
+ printModes();
+ vrb.error("Unrecognized QTLtools mode!");
+ }
+
+ //6. Terminate
+ vrb.title("Running time: " + stb.str(running_timer.abs_time()) + " seconds");
+ vrb.close_log();
+}
diff --git a/src/common/data.cpp b/src/common/data.cpp
new file mode 100644
index 0000000..8a8eef2
--- /dev/null
+++ b/src/common/data.cpp
@@ -0,0 +1,287 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "data.h"
+
+//SAMPLE NAMES
+void data::readSampleFromVCF(string fname, bool silent) {
+ //Collects the sample IDs declared in the header of a VCF/BCF file and counts each ID
+ //accepted by filter_sample in sample_occurrence.
+ if (!silent) vrb.title("Read sample list from [" + fname + "]");
+ bcf_sweep_t * sweep = bcf_sweep_init(fname.c_str());
+ if (!sweep) vrb.error("Cannot open file!");
+ bcf_hdr_t * header = bcf_sweep_hdr(sweep);
+ if (!header) vrb.error("Cannot read vcf header!");
+ unsigned int kept = 0, dropped = 0;
+ unsigned int n_sample = bcf_hdr_nsamples(header);
+ for (int s = 0 ; s < n_sample ; s ++) {
+ string sid = string(header->samples[s]);
+ if (!filter_sample.check(sid)) {
+ dropped ++; continue;
+ }
+ sample_occurrence[sid] ++; //operator[] starts a new sample at 0, so it ends up at 1
+ kept ++;
+ }
+ if (!silent) {
+ if (dropped > 0) {
+ vrb.bullet("#samples included by user = " + stb.str(kept));
+ vrb.bullet("#samples excluded by user = " + stb.str(dropped));
+ } else vrb.bullet("#samples = " + stb.str(kept));
+ }
+ file_count ++;
+ bcf_sweep_destroy(sweep);
+}
+
+void data::readSampleFromBED(string fname, bool silent) {
+ //Reads the '#' header line of a BED file; columns 7 onwards are sample IDs, each one accepted by filter_sample is counted in sample_occurrence.
+ unsigned int n_included = 0, n_excluded = 0;
+ if (!silent) vrb.title("Reading sample list from [" + fname + "]");
+ htsFile *fp = hts_open(fname.c_str(),"r");
+ if (!fp) vrb.error("Cannot open file");
+ kstring_t str = {0,0,0};
+ vector < string > tokens;
+ if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read BED header!");
+ stb.split(string(str.s), tokens);
+ free(str.s); //the header was copied into tokens; the kstring buffer was never released before
+ if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+ for (size_t t = 6 ; t < tokens.size() ; t ++) {
+ const string & sid = tokens[t];
+ if (filter_sample.check(sid)) {
+ sample_occurrence[sid] ++; //a missing entry is value-initialised to 0 first
+ n_included ++;
+ } else n_excluded ++;
+ }
+ if (!silent){
+ if (n_excluded == 0) vrb.bullet("#samples = " + stb.str(n_included));
+ else {
+ vrb.bullet("#samples included by user = " + stb.str(n_included));
+ vrb.bullet("#samples excluded by user = " + stb.str(n_excluded));
+ }
+ }
+ file_count ++;
+ hts_close(fp);
+}
+
+void data::readSampleFromCOV(string fname, bool silent) {
+ //Takes the sample IDs from the header line of a covariate file (every column after the first); accepted IDs are counted in sample_occurrence.
+ if (!silent) vrb.title("Reading sample list from [" + fname + "]");
+ input_file fd (fname);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ string header;
+ getline(fd, header);
+ if (header.empty()) vrb.error("No header line detected!");
+ vector < string > fields;
+ stb.split(header, fields);
+ if (fields.size() < 2) vrb.error("Incorrect number of columns!");
+ unsigned int kept = 0, dropped = 0;
+ for (int f = 1 ; f < fields.size() ; f ++) {
+ if (!filter_sample.check(fields[f])) {
+ dropped ++;
+ continue;
+ }
+ sample_occurrence[fields[f]] ++; //operator[] starts a new sample at 0, so it ends up at 1
+ kept ++;
+ }
+ if (!silent) {
+ if (dropped > 0) {
+ vrb.bullet("#samples included by user = " + stb.str(kept));
+ vrb.bullet("#samples excluded by user = " + stb.str(dropped));
+ } else vrb.bullet("#samples = " + stb.str(kept));
+ }
+ file_count ++;
+ fd.close();
+}
+
+void data::readSampleFromTXT(string fname) {
+ //Reads a text file line by line; the first token of each line (as split by stb.split) is a sample ID, counted in sample_occurrence when filter_sample accepts it.
+ string buffer; vector < string > tokens;
+ unsigned int n_included = 0, n_excluded = 0;
+ vrb.title("Reading sample list from [" + fname + "]");
+ input_file fd (fname);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ while(getline(fd, buffer)) {
+ stb.split(buffer, tokens);
+ if (tokens.empty()) continue; //blank line: there is no tokens[0] to read
+ const string & sid = tokens[0];
+ if (filter_sample.check(sid)) {
+ sample_occurrence[sid] ++; //a missing entry is value-initialised to 0 first
+ n_included ++;
+ } else n_excluded ++;
+ }
+ if (n_excluded == 0) vrb.bullet("#samples = " + stb.str(n_included));
+ else {
+ vrb.bullet("#samples included by user = " + stb.str(n_included));
+ vrb.bullet("#samples excluded by user = " + stb.str(n_excluded));
+ }
+ file_count ++;
+ fd.close();
+}
+
+void data::readSampleFromSTR(string fname) {
+ //Despite its name, fname is split on "," and each piece is treated as a sample ID;
+ //IDs accepted by filter_sample are counted in sample_occurrence.
+ vrb.title("Parse sample list [" + fname + "]");
+ vector < string > ids;
+ stb.split(fname, ids, ",");
+ unsigned int kept = 0, dropped = 0;
+ for (int s = 0 ; s < ids.size() ; s ++) {
+ if (!filter_sample.check(ids[s])) { dropped ++; continue; }
+ sample_occurrence[ids[s]] ++; //operator[] starts a new sample at 0, so it ends up at 1
+ kept ++;
+ }
+ if (dropped > 0) {
+ vrb.bullet("#samples included by user = " + stb.str(kept));
+ vrb.bullet("#samples excluded by user = " + stb.str(dropped));
+ } else vrb.bullet("#samples = " + stb.str(kept));
+ file_count ++;
+}
+
+
+void data::mergeSampleLists(bool silent) {
+ //Keeps only the samples whose occurrence count equals the number of sources read so far (file_count)
+ if (!silent) {
+ vrb.title("Merging sample sets from all input files");
+ vrb.bullet("#files = " + stb.str(file_count));
+ }
+ unsigned int n_partial = 0;
+ map < string, unsigned int > :: iterator entry = sample_occurrence.begin();
+ for ( ; entry != sample_occurrence.end() ; ++ entry) {
+ if (entry->second != file_count) { n_partial ++; continue; }
+ sample_id.push_back(entry->first);
+ sample_count ++;
+ }
+ if (silent) return;
+ vrb.bullet("#samples in all files = " + stb.str(sample_count));
+ if (n_partial > 0) vrb.bullet("#samples NOT in all files (i.e. ignored) = " + stb.str(n_partial));
+}
+
+int data::findSample(string sid) { //linear scan: index of sid in sample_id, or -1 when it is absent
+ unsigned int s = 0; while (s < sample_count && sample_id[s] != sid) ++ s;
+ return (s < sample_count) ? static_cast < int > (s) : -1;
+}
+
+void data::declareBasicOptions() {
+ namespace bpo = boost::program_options; //local shorthand for the option-parsing namespace
+ bpo::options_description basics ("\x1B[32mBasics\33[0m");
+ basics.add_options()
+ ("help", "Produces option description")
+ ("seed", bpo::value< unsigned int >()->default_value(15112011), "Random number seed. Useful to replicate runs.")
+ ("log", bpo::value< string >(), "Output on screen goes to this file.")
+ ("silent", "Disable screen output");
+ bpo::options_description exclusions ("\x1B[32mData Exclusion/Inclusion\33[0m");
+ exclusions.add_options()
+ ("exclude-samples", bpo::value< string >(), "List of samples to exclude.")
+ ("include-samples", bpo::value< string >(), "List of samples to include.")
+ ("exclude-sites", bpo::value< string >(), "List of sites to exclude.")
+ ("include-sites", bpo::value< string >(), "List of sites to include.")
+ ("exclude-positions", bpo::value< string >(), "List of positions to exclude.")
+ ("include-positions", bpo::value< string >(), "List of positions to include.")
+ ("exclude-phenotypes", bpo::value< string >(), "List of phenotypes to exclude.")
+ ("include-phenotypes", bpo::value< string >(), "List of phenotypes to include.")
+ ("exclude-covariates", bpo::value< string >(), "List of covariates to exclude.")
+ ("include-covariates", bpo::value< string >(), "List of covariates to include.");
+ //Both groups are appended to the option set shared by every mode
+ option_descriptions.add(basics).add(exclusions);
+}
+
+void data::processBasicOptions() { //Loads the user-supplied inclusion/exclusion lists, then seeds the random number generator
+ if (options.count("exclude-samples") > 0) readSampleExclusion(options["exclude-samples"].as < string > ());
+ if (options.count("include-samples") > 0) readSampleInclusion(options["include-samples"].as < string > ());
+ if (options.count("exclude-sites") > 0) readGenotypeExclusion(options["exclude-sites"].as < string > ());
+ if (options.count("include-sites") > 0) readGenotypeInclusion(options["include-sites"].as < string > ());
+ if (options.count("exclude-positions") > 0) readPositionExclusion(options["exclude-positions"].as < string > ());
+ if (options.count("include-positions") > 0) readPositionInclusion(options["include-positions"].as < string > ());
+ if (options.count("exclude-phenotypes") > 0) readPhenotypeExclusion(options["exclude-phenotypes"].as < string > ());
+ if (options.count("include-phenotypes") > 0) readPhenotypeInclusion(options["include-phenotypes"].as < string > ());
+ if (options.count("exclude-covariates") > 0) readCovariateExclusion(options["exclude-covariates"].as < string > ());
+ if (options.count("include-covariates") > 0) readCovariateInclusion(options["include-covariates"].as < string > ());
+ vrb.title("Initialize random number generator");
+ unsigned int seed = options["seed"].as < unsigned int > ();
+ if (options["seed"].defaulted()) vrb.bullet("Built-in seed is 15112011");
+ else vrb.bullet("User specified seed is " + stb.str(seed));
+ rng.setSeed(seed);
+ vrb.bullet("First Integer = " + stb.str(rng.getInt(32768)));
+ vrb.bullet("First Double = " + stb.str(rng.getDouble()));
+}
+
+void data::readSampleExclusion(string fname){
+ //Loads the sample IDs to discard into filter_sample and reports how many were read
+ vrb.title("Read sample exclusion list [" + fname + "]");
+ int n_read = filter_sample.readExclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " samples"); else vrb.error("Cannot open file!");
+}
+
+void data::readSampleInclusion(string fname){
+ //Loads the sample IDs to keep into filter_sample and reports how many were read
+ vrb.title("Read sample inclusion list [" + fname + "]");
+ int n_read = filter_sample.readInclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " samples"); else vrb.error("Cannot open file!");
+}
+
+void data::readGenotypeExclusion(string fname){
+ //Loads the variant IDs to discard into filter_genotype and reports how many were read
+ vrb.title("Read variant exclusion list [" + fname + "]");
+ int n_read = filter_genotype.readExclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " variants"); else vrb.error("Cannot open file!");
+}
+
+void data::readGenotypeInclusion(string fname){
+ //Loads the variant IDs to keep into filter_genotype and reports how many were read
+ vrb.title("Read variant inclusion list [" + fname + "]");
+ int n_read = filter_genotype.readInclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " variants"); else vrb.error("Cannot open file!");
+}
+
+void data::readPositionExclusion(string fname){ //Loads the positions to discard into filter_position
+ vrb.title("Read position exclusion list [" + fname + "]");
+ int ret = filter_position.readExclusion(fname, true); //in position mode the reader returns -1 when the file cannot be opened, -2 when a line does not hold exactly 2 tokens
+ if (ret < 0) vrb.error((ret == -2) ? "Position list lines must have exactly 2 columns!" : "Cannot open file!");
+ else vrb.bullet(stb.str(ret) + " positions");
+}
+
+void data::readPositionInclusion(string fname){ //Loads the positions to keep into filter_position
+ vrb.title("Read position inclusion list [" + fname + "]");
+ int ret = filter_position.readInclusion(fname, true); //in position mode the reader returns -1 when the file cannot be opened, -2 when a line does not hold exactly 2 tokens
+ if (ret < 0) vrb.error((ret == -2) ? "Position list lines must have exactly 2 columns!" : "Cannot open file!");
+ else vrb.bullet(stb.str(ret) + " positions");
+}
+
+void data::readPhenotypeExclusion(string fname){
+ //Loads the phenotype IDs to discard into filter_phenotype and reports how many were read
+ vrb.title("Read phenotype exclusion list [" + fname + "]");
+ int n_read = filter_phenotype.readExclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " phenotypes"); else vrb.error("Cannot open file!");
+}
+
+void data::readPhenotypeInclusion(string fname){
+ //Loads the phenotype IDs to keep into filter_phenotype and reports how many were read
+ vrb.title("Read phenotype inclusion list [" + fname + "]");
+ int n_read = filter_phenotype.readInclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " phenotypes"); else vrb.error("Cannot open file!");
+}
+
+void data::readCovariateExclusion(string fname){
+ //Loads the covariate IDs to discard into filter_covariate and reports how many were read
+ vrb.title("Read covariate exclusion list [" + fname + "]");
+ int n_read = filter_covariate.readExclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " covariates"); else vrb.error("Cannot open file!");
+}
+
+void data::readCovariateInclusion(string fname){
+ //Loads the covariate IDs to keep into filter_covariate and reports how many were read
+ vrb.title("Read covariate inclusion list [" + fname + "]");
+ int n_read = filter_covariate.readInclusion(fname);
+ if (n_read >= 0) vrb.bullet(stb.str(n_read) + " covariates"); else vrb.error("Cannot open file!");
+}
diff --git a/src/common/data.h b/src/common/data.h
new file mode 100644
index 0000000..79d5e48
--- /dev/null
+++ b/src/common/data.h
@@ -0,0 +1,73 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _DATA_H
+#define _DATA_H
+
+#include "otools.h"
+#include "filter.h"
+
+#define QTLTOOLS_VERSION "1.0"
+
+class data {
+public:
+ //SAMPLES
+ unsigned sample_count; //number of entries in sample_id (incremented by mergeSampleLists)
+ vector < string > sample_id; //samples retained by mergeSampleLists
+ unsigned file_count; //number of sample sources read so far (incremented by every readSampleFrom*)
+ map < string, unsigned int > sample_occurrence; //per sample ID, number of sources in which it was seen
+
+ //FILTERS
+ filter filter_sample;
+ filter filter_phenotype;
+ filter filter_genotype;
+ filter filter_position;
+ filter filter_covariate;
+
+ //OPTIONS
+ boost::program_options::options_description option_descriptions;
+ boost::program_options::variables_map options;
+
+ //CONSTRUCTOR
+ data() { file_count = 0; sample_count = 0; };
+ ~data() { sample_id.clear(); sample_occurrence.clear(); };
+
+ //SAMPLE NAMES
+ void readSampleFromTXT(string);
+ void readSampleFromSTR(string); //the argument is a comma-separated list of IDs, not a file name
+ void readSampleFromVCF(string, bool silent = false);
+ void readSampleFromBED(string, bool silent = false);
+ void readSampleFromCOV(string, bool silent = false);
+ void mergeSampleLists(bool silent = false); //fills sample_id with the samples seen in every source
+ int findSample(string); //index in sample_id, or -1 when absent
+
+ //COMMON OPTIONS
+ void declareBasicOptions();
+ void processBasicOptions();
+
+ //INCLUSION/EXCLUSION LISTS
+ void readSampleExclusion(string);
+ void readSampleInclusion(string);
+ void readGenotypeExclusion(string);
+ void readGenotypeInclusion(string);
+ void readPositionExclusion(string);
+ void readPositionInclusion(string);
+ void readPhenotypeExclusion(string);
+ void readPhenotypeInclusion(string);
+ void readCovariateExclusion(string);
+ void readCovariateInclusion(string);
+};
+
+#endif
diff --git a/src/common/filter.h b/src/common/filter.h
new file mode 100644
index 0000000..88b8251
--- /dev/null
+++ b/src/common/filter.h
@@ -0,0 +1,87 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _DATA_FILTER_H
+#define _DATA_FILTER_H
+
+#include <unordered_set>
+#include <compressed_io.h>
+
+class filter {
+protected:
+ unordered_set < string > inclusion_map;
+ unordered_set < string > exclusion_map;
+ //An empty inclusion set accepts every ID and an empty exclusion set rejects none (see check()).
+public:
+ filter() {
+ }
+
+ ~filter() {
+ inclusion_map.clear();
+ exclusion_map.clear();
+ }
+ //Adds the IDs listed in file to the inclusion set (first token of a line, or "tok0_tok1" in position mode). Returns the number of IDs read, -1 if the file cannot be opened, -2 if a position line does not hold exactly 2 tokens.
+ int readInclusion(string file, bool position = false) {
+ unsigned int n_ids = 0;
+ string buffer; vector < string > tokens;
+ input_file fd(file);
+ if (fd.fail()) return -1;
+ while(getline(fd, buffer, '\n')) {
+ stb.split(buffer, tokens);
+ if (position && tokens.size() != 2) return -2;
+ if (tokens.empty()) continue; //blank line: there is no tokens[0] to register
+ if (position) inclusion_map.insert(tokens[0] + "_" + tokens[1]);
+ else inclusion_map.insert(tokens[0]);
+ n_ids++;
+ }
+ fd.close();
+ return n_ids;
+ }
+ //Same contract as readInclusion(), for the exclusion set.
+ int readExclusion(string file, bool position = false) {
+ unsigned int n_ids = 0;
+ string buffer;
+ vector < string > tokens;
+ input_file fd(file);
+ if (fd.fail()) return -1;
+ while(getline(fd, buffer, '\n')) {
+ stb.split(buffer, tokens);
+ if (position && tokens.size() != 2) return -2;
+ if (tokens.empty()) continue; //blank line: there is no tokens[0] to register
+ if (position) exclusion_map.insert(tokens[0] + "_" + tokens[1]);
+ else exclusion_map.insert(tokens[0]); //the bare first token is no longer also inserted in position mode
+ n_ids++;
+ }
+ fd.close();
+ return n_ids;
+ }
+ //True when id passes the filter: listed for inclusion (or no inclusion list at all) and not listed for exclusion.
+ bool check(string id) {
+ bool included = ((inclusion_map.size() == 0)?true:inclusion_map.count(id));
+ bool excluded = ((exclusion_map.size() == 0)?false:exclusion_map.count(id));
+ if (!included || excluded) return false;
+ return true;
+ }
+ //Registers a single ID for inclusion.
+ void addInclusion(string value){
+ inclusion_map.insert(value);
+ }
+ //Registers a single ID for exclusion.
+ void addExclusion(string value){
+ exclusion_map.insert(value);
+ }
+};
+
+#endif
diff --git a/src/mode_ase/ase_data.h b/src/mode_ase/ase_data.h
new file mode 100644
index 0000000..4564e67
--- /dev/null
+++ b/src/mode_ase/ase_data.h
@@ -0,0 +1,102 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _ASE_DATA_H
+#define _ASE_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+// One candidate site for allele specific expression: chromosome, variant ID,
+// position and the two alleles, each reduced to a single character.
+class ase_site {
+public:
+    unsigned int pos;
+    string chr, sid;
+    char ref, alt;
+
+    // Only the first character of _ref and of _alt is kept.
+    ase_site (string _chr, string _sid, unsigned int _pos, string _ref, string _alt) : pos(_pos), chr(_chr), sid(_sid), ref(_ref[0]), alt(_alt[0]) {
+    }
+};
+
+// State of the [ase] mode: filtering thresholds, the regions discovered in the
+// VCF and, for each region, the sites retained by readGenotypes().
+class ase_data : public data {
+public :
+    //PARAMETERS
+    unsigned int param_min_mapQ;    // minimal mapping quality of a read
+    unsigned int param_min_baseQ;   // minimal quality of a base
+    unsigned int param_min_cov;     // minimal ref+alt coverage for a site to be written
+    float param_min_gp;             // minimal genotype posterior probability
+    float param_min_iq;             // minimal imputation information score
+    float param_min_pval;           // binomial p-value threshold
+    bool param_dup_rd;              // set by ase_main from --filter-remove-duplicates
+
+    //DATA
+    vector < string > regions;                  // one entry per chromosome seen in the VCF
+    vector < vector < ase_site > > variants;    // variants[i] = sites kept for regions[i]
+
+    //CONSTRUCTOR/DESTRUCTOR
+    // Defaults used until ase_main overwrites them with the command line values.
+    // Note that the base quality default here (5) differs from the command line default (10).
+    ase_data() {
+        param_min_mapQ = 10;
+        param_min_baseQ = 5;
+        param_min_cov = 10;
+        param_min_pval = 1.0;
+        param_min_gp = 0.99;
+        param_min_iq = 0.90;
+        // Previously left uninitialised; true is the value ase_main assigns when
+        // --filter-remove-duplicates is not given.
+        param_dup_rd = true;
+    }
+
+    ~ase_data() {
+        regions.clear();
+        variants.clear();
+    }
+
+    //
+    void readGenotypes(string, string);
+    void readSequences(string, string);
+};
+
+void ase_main(vector < string > & );
+
+// Convert a nucleotide code (1, 2, 4, 8, 15) into the letter A, C, G, T or N.
+// Any other code yields (char)-1.
+inline char ase_getBase (int code) {
+    if (code == 1) return 'A';
+    if (code == 2) return 'C';
+    if (code == 4) return 'G';
+    if (code == 8) return 'T';
+    if (code == 15) return 'N';
+    return -1;
+}
+
+// Two-sided binomial test: p-value of observing x successes out of n trials
+// when the success probability is p.
+// NOTE(review): the structure and the 1e-07 relative tolerance look like a port
+// of R's binom.test; dbinom/pbinom are presumably the Rmath functions - confirm.
+inline double ase_binomialTest(int x, int n, float p) {
+    // Degenerate probabilities: the outcome is certain
+    if (p == 0) return (x == 0);
+    if (p == 1) return (x == n);
+
+    const double tolerance = 1 + 1e-07;
+    const double density_x = dbinom(x, n, p, 0);
+    const double mean = n * p;
+    if (x == mean) return 1.0;
+
+    // Count the outcomes on the other side of the mean that are at most as
+    // likely as x (up to the tolerance), then add both tail probabilities.
+    int n_extreme = 0;
+    if (x < mean) {
+        for (int k = (int)ceil (mean); k <= n ; k++) {
+            if (dbinom(k, n, p, 0) <= density_x * tolerance) n_extreme ++;
+        }
+        return pbinom(x, n, p, 1, 0) + pbinom(n - n_extreme, n, p, 0, 0);
+    }
+    for (int k = 0 ; k <= (int)floor(mean) ; k++) {
+        if (dbinom(k, n, p, 0) <= density_x * tolerance) n_extreme ++;
+    }
+    return pbinom(n_extreme - 1, n, p, 1, 0) + pbinom(x - 1, n, p, 0, 0);
+}
+
+#endif
diff --git a/src/mode_ase/ase_main.cpp b/src/mode_ase/ase_main.cpp
new file mode 100644
index 0000000..c5d9c87
--- /dev/null
+++ b/src/mode_ase/ase_main.cpp
@@ -0,0 +1,102 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "ase_data.h"
+
+// Entry point of the [ase] mode: declares and parses the command line options,
+// copies the filter values into an ase_data object, selects the target sample
+// and runs readGenotypes() followed by readSequences().
+// argv: command line tokens handed to the boost command line parser.
+void ase_main(vector < string > & argv) {
+    ase_data D;
+
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
+        ("bam", boost::program_options::value< string >(), "Sequence data in BAM/SAM format.")
+        ("ind", boost::program_options::value< string >(), "Sample to be processed.")
+        ("reg", boost::program_options::value< string >()->default_value(""), "Genomic region(s) to be processed.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_parameters ("\x1B[32mFilters\33[0m");
+    opt_parameters.add_options()
+        ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred mapping quality for a read to be considered.")
+        ("filter-base-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred quality for a base to be considered.")
+        ("filter-binomial-pvalue", boost::program_options::value< double >()->default_value(1.0, "1.0"), "Binomial p-value threshold for ASE in output.")
+        ("filter-minimal-coverage", boost::program_options::value< unsigned int >()->default_value(10), "Minimal coverage for a genotype to be considered.")
+        ("filter-imputation-qual", boost::program_options::value< double >()->default_value(0.90, "0.90"), "Minimal imputation information score for a variant to be considered.")
+        ("filter-imputation-prob", boost::program_options::value< double >()->default_value(0.99, "0.99"), "Minimal posterior probability for a genotype to be considered.")
+        ("filter-remove-duplicates", "Remove duplicate sequencing reads in the process.");
+
+    D.option_descriptions.add(opt_files).add(opt_parameters);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    // Parsed values are stored in D.options.
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [ase] command line :" << string(e.what()) << endl;
+        // A malformed command line is a failure: do not report success to the caller.
+        exit(EXIT_FAILURE);
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("CALLING ALLELE SPECIFIC SITES");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+    if (!D.options.count("bam")) vrb.error("Sequence data needs to be specified with --bam [file.bam]");
+    if (!D.options.count("ind")) vrb.error("Sample ID needs to be specified with --ind [sample_id]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+
+    //TO DO CHECK PARAMETER VALUES
+    D.param_min_mapQ = D.options["filter-mapping-quality"].as < unsigned int > ();
+    D.param_min_baseQ = D.options["filter-base-quality"].as < unsigned int > ();
+    D.param_min_cov = D.options["filter-minimal-coverage"].as < unsigned int > ();
+    D.param_min_pval = D.options["filter-binomial-pvalue"].as < double > ();
+    D.param_min_gp = D.options["filter-imputation-prob"].as < double > ();
+    D.param_min_iq = D.options["filter-imputation-qual"].as < double > ();
+    // NOTE(review): param_dup_rd is true when --filter-remove-duplicates is ABSENT,
+    // so the "Remove duplicate reads" bullet below prints 1 without the flag and 0
+    // with it. Confirm the intended polarity against ase_data::readSequences.
+    D.param_dup_rd = (D.options.count("filter-remove-duplicates") == 0);
+    vrb.bullet("Mapping quality >= " + stb.str(D.param_min_mapQ));
+    vrb.bullet("Base quality >= " + stb.str(D.param_min_baseQ));
+    vrb.bullet("Coverage >= " + stb.str(D.param_min_cov));
+    vrb.bullet("Binomial p-value threshold = " + stb.str(D.param_min_pval));
+    vrb.bullet("Genotype probability >= " + stb.str(D.param_min_gp));
+    vrb.bullet("Imputation quality >= " + stb.str(D.param_min_iq));
+    vrb.bullet("Remove duplicate reads = " + stb.str(D.param_dup_rd));
+
+    //------------------------------------------
+    // 5. READ FILES / INITIALIZE / RUN ANALYSIS
+    //------------------------------------------
+    D.processBasicOptions();
+    D.readSampleFromVCF(D.options["vcf"].as < string > ());
+    D.readSampleFromSTR(D.options["ind"].as < string > ());
+    D.mergeSampleLists();
+    // Exactly one sample must remain after merging the VCF samples with --ind
+    if (D.sample_count == 0) vrb.error("Could not find [" + D.options["ind"].as < string > () + "] in VCF/BCF file");
+    else if (D.sample_count >= 2) vrb.error("More than one sample specified with --ind");
+    else vrb.bullet("Target sample is [" + D.sample_id[0] + "]");
+    D.readGenotypes(D.options["vcf"].as < string > (), D.options["reg"].as < string > ());
+    D.readSequences(D.options["bam"].as < string > (), D.options["out"].as < string > ());
+}
diff --git a/src/mode_ase/ase_read_genotypes.cpp b/src/mode_ase/ase_read_genotypes.cpp
new file mode 100644
index 0000000..176275b
--- /dev/null
+++ b/src/mode_ase/ase_read_genotypes.cpp
@@ -0,0 +1,118 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "ase_data.h"
+
+// Scan a VCF/BCF file and keep, per chromosome, the bi-allelic single-nucleotide
+// sites at which the target sample (sample_id[0]) is heterozygous.
+// filename   : VCF/BCF file to read
+// str_regions: region(s) to restrict the scan to; empty string = whole file
+// Fills 'regions' (one entry per chromosome met) and 'variants' (sites per region)
+// and reports how many variants each filter removed.
+void ase_data::readGenotypes(string filename, string str_regions) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_snpv = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    int n_excludedG_impq = 0;
+    int n_excludedG_impp = 0;
+    int n_excludedG_homo = 0;
+    int n_excludedG_miss = 0;
+
+    vrb.title("Reading VCF [" + filename + "]");
+    bcf_srs_t * sr = bcf_sr_init();
+    sr->collapse = COLLAPSE_NONE;
+
+    //Jump to regions if necessary
+    if (str_regions.size() > 0) {
+        if (bcf_sr_set_regions(sr, str_regions.c_str(), 0) == -1) vrb.error("Failed to jump to region [" + str_regions + "]");
+        else vrb.bullet("scanning region(s) [" + str_regions + "]");
+    } else vrb.bullet("scanning full VCF file");
+
+    //Add readers
+    if(!(bcf_sr_add_reader (sr, filename.c_str()))) {
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("Not compressed with bgzip");
+        case idx_load_failed: vrb.error("Impossible to load index file");
+        case file_type_error: vrb.error("Unrecognized file format");
+        default: vrb.error("Unknown error when opening");
+        }
+    }
+
+    //Sample processing
+    int index_sample = -1;
+    unsigned int n_samples_in_file = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i = 0 ; i < n_samples_in_file ; i ++) if (strcmp(sr->readers[0].header->samples[i], sample_id[0].c_str()) == 0) index_sample = i;
+    if (index_sample < 0) vrb.error("Unexpected error: sample unfound!");
+    else vrb.bullet("index of [" + sample_id[0] + "] = " + stb.str(index_sample));
+
+    //Init needed data
+    // The *_arr buffers are allocated by the bcf_get_* calls below and released at the end
+    int ngp = 0, ngt = 0, niq = 0, ngt_arr = 0, ngp_arr = 0, niq_arr = 0;
+    int * gt_arr = NULL;
+    float * gp_arr = NULL, * iq_arr = NULL;
+    bcf1_t * line;
+
+    //Parse VCF
+    map < string, unsigned int > region_map;
+    map < string, unsigned int > :: iterator region_map_it;
+    while(bcf_sr_next_line (sr)) {
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele > 2) n_excludedG_mult ++;
+        else {
+            bcf_unpack(line, BCF_UN_STR);
+            string sid = string(line->d.id); //sid
+            if (!filter_genotype.check(sid)) n_excludedG_user ++;
+            else {
+                string curr_chr = bcf_hdr_id2name(sr->readers[0].header, line->rid); //chr
+                unsigned int pos = line->pos; //pos
+                string ref = string(line->d.allele[0]); //ref
+                string alt = string(line->d.allele[1]); //alt
+                niq = bcf_get_info_float(sr->readers[0].header, line, "IQ", &iq_arr, &niq_arr); //imp score
+
+                //Each chromosome met for the first time opens a new region
+                unsigned int region_idx;
+                region_map_it = region_map.find(curr_chr);
+                if (region_map_it == region_map.end()) {
+                    vrb.bullet("new chromosome discovered [" + curr_chr + "]");
+                    region_map.insert(pair < string, unsigned int > (curr_chr, regions.size()));
+                    region_idx = regions.size();
+                    regions.push_back(curr_chr);
+                    variants.push_back(vector < ase_site > ());
+                } else region_idx = region_map_it->second;
+
+                if (ref.size() > 1 || alt.size() > 1) n_excludedG_snpv ++;
+                else if (niq > 0 && iq_arr[0] < param_min_iq) n_excludedG_impq ++;
+                else {
+                    ngt = bcf_get_genotypes(sr->readers[0].header, line, & gt_arr, &ngt_arr);
+                    ngp = bcf_get_format_float(sr->readers[0].header, line,"GP", &gp_arr, &ngp_arr);
+                    if (ngt != n_samples_in_file * 2) n_excludedG_void ++;
+                    else if (gt_arr[2*index_sample+0] == bcf_gt_missing || gt_arr[2*index_sample+1] == bcf_gt_missing) n_excludedG_miss ++;
+                    //Excluded only when none of the three genotype probabilities reaches the threshold
+                    else if (ngp == 3 * n_samples_in_file && gp_arr[3*index_sample+0] != bcf_float_missing && gp_arr[3*index_sample+1] != bcf_float_missing && gp_arr[3*index_sample+2] != bcf_float_missing && gp_arr[3*index_sample+0] < param_min_gp && gp_arr[3*index_sample+1] < param_min_gp && gp_arr[3*index_sample+2] < param_min_gp) n_excludedG_impp ++;
+                    else if (bcf_gt_allele(gt_arr[2*index_sample+0]) == bcf_gt_allele(gt_arr[2*index_sample+1])) n_excludedG_homo ++;
+                    else {
+                        variants[region_idx].push_back(ase_site (curr_chr, sid, pos, ref, alt));
+                        n_includedG ++;
+                    }
+                }
+            }
+        }
+    }
+    vrb.bullet(stb.str(n_includedG) + " heterozygous genotypes included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_snpv > 0) vrb.bullet(stb.str(n_excludedG_snpv) + " multi-nucleotidic variants excluded");
+    if (n_excludedG_impq > 0) vrb.bullet(stb.str(n_excludedG_impq) + " badly imputed variants excluded");
+    if (n_excludedG_impp > 0) vrb.bullet(stb.str(n_excludedG_impp) + " badly imputed genotypes excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " variants without GT field excluded");
+    if (n_excludedG_miss > 0) vrb.bullet(stb.str(n_excludedG_miss) + " missing genotypes excluded");
+    if (n_excludedG_homo > 0) vrb.bullet(stb.str(n_excludedG_homo) + " homozygous genotypes excluded");
+    if (variants.size() == 0) vrb.leave("Cannot find usable variants in target region!");
+    //free(NULL) is a no-op, so buffers that were never allocated are fine here
+    free(gt_arr);
+    free(gp_arr);
+    free(iq_arr);
+    bcf_sr_destroy(sr);
+}
diff --git a/src/mode_ase/ase_read_sequences.cpp b/src/mode_ase/ase_read_sequences.cpp
new file mode 100644
index 0000000..472a1e3
--- /dev/null
+++ b/src/mode_ase/ase_read_sequences.cpp
@@ -0,0 +1,180 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "ase_data.h"
+
+// Handles shared with the pileup callback read_bam()
+struct aux_t {
+    samFile * fp;       // open SAM/BAM file
+    bam_hdr_t * hdr;    // header of that file
+    hts_itr_t * iter;   // region iterator, NULL when no region is set
+};
+
+// Pileup callback: fetch the next read into b, through the region iterator when
+// one is set and sequentially from the file otherwise. Returns the value of the
+// underlying htslib read call.
+static int read_bam(void *data, bam1_t *b) {
+    aux_t * handles = (aux_t*) data;
+    if (handles->iter) return sam_itr_next(handles->fp, handles->iter, b);
+    return sam_read1(handles->fp, handles->hdr, b);
+}
+
+// Pile up the reads of a BAM file over the sites kept by readGenotypes() and
+// write, for every site with enough ref+alt coverage, a VCF record holding the
+// read counts and the binomial test for allelic imbalance.
+// fbam: indexed BAM file to read
+// fvcf: VCF file to write; BGZF compressed when the name ends with "gz"
+void ase_data::readSequences(string fbam, string fvcf) {
+    aux_t * data = (aux_t *) malloc (sizeof(aux_t));
+    //malloc leaves the fields undefined: make sure iter is never read as garbage
+    data->iter = NULL;
+
+    vrb.title("Opening BAM file [" + fbam + "] and writing VCF file [" + fvcf + "]");
+
+    //BAM OPEN
+    data->fp = sam_open(fbam.c_str(), "r");
+    if (data->fp == 0) vrb.error("Cannot open BAM file!");
+    data->hdr = sam_hdr_read(data->fp);
+    if (data->hdr == 0) vrb.error("Cannot parse BAM header!");
+    hts_idx_t *idx = sam_index_load(data->fp, fbam.c_str());
+    if (idx == NULL) vrb.error("Cannot load BAM index!");
+
+    //VCF OPEN
+    htsFile * bcf_fd = NULL;
+    bool compressed_vcf = fvcf.substr(fvcf.find_last_of(".") + 1) == "gz";
+    if (compressed_vcf) bcf_fd = bcf_open(fvcf.c_str(), "wz");
+    else bcf_fd = bcf_open(fvcf.c_str(), "wu");
+    if (bcf_fd == NULL) vrb.error("Impossible to create VCF file");
+    else if (compressed_vcf) vrb.bullet("BGZIP for VCF compression is ON");
+    else vrb.bullet("BGZIP compression for VCF is OFF (add .gz to filename to activate it)");
+    bcf_hdr_t * bcf_hdr = bcf_hdr_init("w");
+    kstring_t str = {0,0,0};
+    ksprintf(&str, "##QTLtools ase Version=%s\n",QTLTOOLS_VERSION);
+    bcf_hdr_append(bcf_hdr, str.s);
+    free(str.s);
+
+    //VCF INFO
+    vrb.bullet("Writing VCF header [INFO, CONTIG, FORMAT, SAMPLES]");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=M_FAI,Number=1,Type=Integer,Description=\"Number of reads failing mapping filters\">");
+    if (!param_dup_rd) bcf_hdr_append(bcf_hdr,"##INFO=<ID=M_DUP,Number=1,Type=Integer,Description=\"Number of duplicate reads removed\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=M_SUC,Number=1,Type=Integer,Description=\"Number of reads passing mapping filters\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=B_DEL,Number=1,Type=Integer,Description=\"Number of reads with deletion\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=B_DIS,Number=1,Type=Integer,Description=\"Number of reads with base discordance\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=B_QUA,Number=1,Type=Integer,Description=\"Number of reads with low base quality\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=B_REF,Number=1,Type=Integer,Description=\"Number of reads matching REF allele\">");
+    bcf_hdr_append(bcf_hdr,"##INFO=<ID=B_ALT,Number=1,Type=Integer,Description=\"Number of reads matching ALT allele\">");
+
+    //VCF CONTIG
+    set < string > contig_set (regions.begin(), regions.end());
+    vrb.bullet("Writing " + stb.str(contig_set.size()) + " CONTIG fields");
+    for (set < string > :: iterator it_c = contig_set.begin() ; it_c != contig_set.end() ; ++ it_c) {
+        string tmp_str = "##contig=<ID=" + *it_c + ">";
+        bcf_hdr_append(bcf_hdr, tmp_str.c_str());
+    }
+
+    //VCF FORMAT
+    bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=AS,Number=1,Type=Float,Description=\"Binomial test for ASE (i.e. ref/(ref+alt) != 0.5)\">");
+
+    //VCF SAMPLES
+    bcf_hdr_add_sample(bcf_hdr, sample_id[0].c_str());
+    bcf_hdr_add_sample(bcf_hdr, NULL);
+
+    //VCF HEADER
+    bcf_hdr_write(bcf_fd, bcf_hdr);
+
+    //Loop across regions
+    bcf1_t * bcf_rec = bcf_init1();
+    for (int reg = 0; reg < regions.size() ; reg ++) {
+
+        //Jump to region
+        data->iter = sam_itr_querys(idx, data->hdr, regions[reg].c_str()); // set the iterator
+        if (data->iter == NULL) vrb.error("Problem jumping to region [" + regions[reg] + "]");
+        else vrb.bullet("Scanning region [" + regions[reg] + "]");
+
+        int beg = data->iter->beg;
+        int end = data->iter->end;
+
+        //Pile up reads
+        //i_site walks variants[reg] alongside the pileup positions
+        //NOTE(review): this assumes variants[reg] is sorted by position, as read from the VCF - confirm
+        const bam_pileup1_t * v_plp;
+        int n_plp = 0, tid, pos, i_site = 0;
+        bam_plp_t s_plp = bam_plp_init(read_bam, (void*)data);
+        while (((v_plp = bam_plp_auto(s_plp, &tid, &pos, &n_plp)) != 0) && i_site < variants[reg].size()) {
+            int chr = bam_name2id(data->hdr, variants[reg][i_site].chr.c_str());
+            if (pos < beg || pos >= end) continue;
+            while (i_site < variants[reg].size() && (chr != tid || pos > variants[reg][i_site].pos)) { i_site ++; }
+            //The loop above may have consumed every site: check the bound before indexing
+            if (i_site < variants[reg].size() && tid == chr && pos == variants[reg][i_site].pos) {
+                int m_fai = 0, m_dup = 0, m_suc = 0;
+                int b_del = 0, b_ref = 0, b_alt = 0, b_dis = 0, b_qua = 0;
+
+                //STEP1: Parse sequencing reads
+                for (int iread = 0 ; iread < n_plp ; iread ++) {
+                    bool failed_qc = false;
+                    const bam_pileup1_t * p = v_plp + iread;
+
+                    if (p->b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL)) failed_qc = true;
+                    else if ((int)p->b->core.qual < param_min_mapQ) failed_qc = true;
+                    else if (p->b->core.flag & BAM_FPAIRED) {
+                        if (!(p->b->core.flag & BAM_FPROPER_PAIR)) failed_qc = true;
+                        else if (p->b->core.flag & BAM_FMUNMAP) failed_qc = true;
+                        //Read and mate on the same strand. Both flags are reduced to booleans first:
+                        //the two masks are different bits, so comparing the raw values only matched
+                        //the forward/forward case.
+                        else if (((p->b->core.flag & BAM_FREVERSE) != 0) == ((p->b->core.flag & BAM_FMREVERSE) != 0)) failed_qc = true;
+                    }
+
+                    if (failed_qc) m_fai ++;
+                    else {
+                        if (p->b->core.flag & BAM_FDUP) m_dup ++;
+                        //NOTE(review): duplicates are used when param_dup_rd is false and skipped when it
+                        //is true; confirm this polarity against the command line handling in ase_main
+                        if (!param_dup_rd || !(p->b->core.flag & BAM_FDUP)) {
+                            if (p->is_del || p->is_refskip || p->indel == 1) b_del++;
+                            else if (bam_get_qual(p->b)[p->qpos] < param_min_baseQ) b_qua++;
+                            else {
+                                char base = ase_getBase(bam_seqi(bam_get_seq(p->b), p->qpos));
+                                bool isRef = (base == variants[reg][i_site].ref);
+                                bool isAlt = (base == variants[reg][i_site].alt);
+                                if (isRef) b_ref++;
+                                if (isAlt) b_alt++;
+                                if (!isRef && !isAlt) b_dis++;
+                            }
+                            m_suc ++;
+                        }
+                    }
+                }
+
+                //STEP2: Write VCF record
+                if ((b_ref + b_alt) >= param_min_cov) {
+                    bcf_clear1(bcf_rec);
+                    bcf_rec->rid = bcf_hdr_name2id(bcf_hdr, variants[reg][i_site].chr.c_str());
+                    bcf_rec->pos = variants[reg][i_site].pos;
+                    bcf_update_id(bcf_hdr, bcf_rec, variants[reg][i_site].sid.c_str());
+                    string str_alleles = "N,N";
+                    str_alleles[0] = variants[reg][i_site].ref;
+                    str_alleles[2] = variants[reg][i_site].alt;
+                    bcf_update_alleles_str(bcf_hdr, bcf_rec, str_alleles.c_str());
+                    bcf_rec->qual = 100;
+                    int tmpi = bcf_hdr_id2int(bcf_hdr, BCF_DT_ID, "PASS");
+                    bcf_update_filter(bcf_hdr, bcf_rec, &tmpi, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "M_FAI", &m_fai, 1);
+                    if (!param_dup_rd) bcf_update_info_int32(bcf_hdr, bcf_rec, "M_DUP", &m_dup, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "M_SUC", &m_suc, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "B_DEL", &b_del, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "B_DIS", &b_dis, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "B_QUA", &b_qua, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "B_REF", &b_ref, 1);
+                    bcf_update_info_int32(bcf_hdr, bcf_rec, "B_ALT", &b_alt, 1);
+                    float tmpf = ase_binomialTest(b_ref, b_ref+b_alt, 0.5);
+                    bcf_update_format_float(bcf_hdr, bcf_rec, "AS", &tmpf, 1);
+                    bcf_write1(bcf_fd, bcf_hdr, bcf_rec);
+                }
+            }
+        }
+        bam_plp_reset(s_plp);
+        bam_plp_destroy(s_plp);
+
+        //Release the iterator of this region before the next one replaces it
+        hts_itr_destroy(data->iter);
+        data->iter = NULL;
+    }
+
+    bam_hdr_destroy(data->hdr);
+    hts_idx_destroy(idx);
+    if (data->fp) sam_close(data->fp);
+    free(data);
+    bcf_destroy1(bcf_rec);
+    hts_close(bcf_fd);
+    bcf_hdr_destroy(bcf_hdr);
+}
diff --git a/src/mode_bamstat/bamstat_data.h b/src/mode_bamstat/bamstat_data.h
new file mode 100644
index 0000000..1c62eaf
--- /dev/null
+++ b/src/mode_bamstat/bamstat_data.h
@@ -0,0 +1,80 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _BAMSTAT_DATA_H
+#define _BAMSTAT_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+// One annotation interval together with the number of reads counted over it.
+class bamstat_region {
+public:
+    string chr;
+    unsigned int start;
+    unsigned int end;
+    unsigned int n_covering_reads;
+
+    // The read counter always starts at zero.
+    bamstat_region (string _chr, unsigned int _start, unsigned int _end) : chr(_chr), start(_start), end(_end), n_covering_reads(0) {
+    }
+
+    // Render the interval as "chr:start-end".
+    string toString() {
+        string label = chr;
+        label += ":";
+        label += stb.str(start);
+        label += "-";
+        label += stb.str(end);
+        return label;
+    }
+};
+
+// State of the [bamstat] mode: annotations, read filters and read counters.
+class bamstat_data : public data {
+public :
+
+    //DATA
+    vector < bamstat_region > R;            // annotations filled by readAnnotationsBED
+    bool param_dup_rd;                      // set by bamstat_main from --filter-keep-duplicates
+    unsigned int param_min_mapQ;            // minimal mapping quality of a read
+    unsigned int n_total_reads;
+    unsigned int n_mapped_reads;
+    unsigned int n_overlap_reads;
+    unsigned int n_overlap_annotations;
+    unsigned int n_keep;
+
+    //CONSTRUCTOR/DESTRUCTOR
+    // Every parameter and counter starts at false/zero.
+    bamstat_data () :
+        param_dup_rd(false),
+        param_min_mapQ(0),
+        n_total_reads(0),
+        n_mapped_reads(0),
+        n_overlap_reads(0),
+        n_overlap_annotations(0),
+        n_keep(0) {
+    }
+
+    ~bamstat_data () { }
+
+    //
+    int keepRead(bam1_t *);
+    void readAnnotationsBED(string);
+    void readSequences(string);
+    void writeOutput(string);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void bamstat_main(vector < string > &);
+
+
+#endif
diff --git a/src/mode_bamstat/bamstat_main.cpp b/src/mode_bamstat/bamstat_main.cpp
new file mode 100644
index 0000000..f70e690
--- /dev/null
+++ b/src/mode_bamstat/bamstat_main.cpp
@@ -0,0 +1,80 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "bamstat_data.h"
+
+// Entry point of the [bamstat] mode: declares and parses the command line
+// options, copies the filter values into a bamstat_data object, then reads the
+// BED annotations, reads the BAM file and writes the output.
+// argv: command line tokens handed to the boost command line parser.
+void bamstat_main(vector < string > & argv) {
+    bamstat_data D;
+
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("bam", boost::program_options::value< string >(), "Sequence data in BAM/SAM format.")
+        ("bed", boost::program_options::value< string >(), "Annotation data in BED format.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_parameters ("\x1B[32mFilters\33[0m");
+    opt_parameters.add_options()
+        ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred mapping quality for a read to be considered.")
+        ("filter-keep-duplicates", "Keep duplicate sequencing reads in the process.");
+
+    D.option_descriptions.add(opt_files).add(opt_parameters);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    // Parsed values are stored in D.options.
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [bamstat] command line :" << string(e.what()) << endl;
+        // A malformed command line is a failure: do not report success to the caller.
+        exit(EXIT_FAILURE);
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("CALCULATE BASIC QC METRICS FOR A BAM FILE");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("bam")) vrb.error("Sequence data needs to be specified with --bam [file.bam]");
+    if (!D.options.count("bed")) vrb.error("Annotation data needs to be specified with --bed [file.bed]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+
+    //TO DO CHECK PARAMETER VALUES
+    D.param_min_mapQ = D.options["filter-mapping-quality"].as < unsigned int > ();
+    // true when --filter-keep-duplicates is given
+    D.param_dup_rd = (D.options.count("filter-keep-duplicates") != 0);
+    vrb.bullet("Mapping quality >= " + stb.str(D.param_min_mapQ));
+    vrb.bullet("Keep duplicate reads = " + stb.str(D.param_dup_rd));
+
+    //------------------------------------------
+    // 5. READ FILES / INITIALIZE / RUN ANALYSIS
+    //------------------------------------------
+
+    D.processBasicOptions();
+    D.readAnnotationsBED(D.options["bed"].as < string > ());
+    D.readSequences(D.options["bam"].as < string > ());
+    D.writeOutput(D.options["out"].as < string > ());
+}
diff --git a/src/mode_bamstat/bamstat_read_annotations.cpp b/src/mode_bamstat/bamstat_read_annotations.cpp
new file mode 100644
index 0000000..bcb06fa
--- /dev/null
+++ b/src/mode_bamstat/bamstat_read_annotations.cpp
@@ -0,0 +1,36 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "bamstat_data.h"
+
+// Load the annotation intervals of a BED file into R.
+// fbed: file to read; every line needs at least three columns
+// (chromosome, start, end). Start and end are converted with atoi.
+void bamstat_data::readAnnotationsBED(string fbed) {
+    string buffer;
+    vector < string > tok;
+    vrb.title("Read BED annotations in [" + fbed + "]");
+    input_file fd (fbed);
+    // Report an unreadable file as such, not as a file without annotations
+    if (fd.fail()) vrb.error("Cannot open BED file!");
+    while (getline(fd, buffer)) {
+        stb.split(buffer, tok);
+        if (tok.size() < 3) vrb.error("Incorrect number of columns in BED file!");
+        R.push_back(bamstat_region(tok[0], atoi(tok[1].c_str()), atoi(tok[2].c_str())));
+    }
+    if (R.size() > 0) vrb.bullet("#annotations = " + stb.str(R.size()));
+    else vrb.error("Could not find any annotations in file!");
+    fd.close();
+}
+
+
+
+
+
diff --git a/src/mode_bamstat/bamstat_read_bam.cpp b/src/mode_bamstat/bamstat_read_bam.cpp
new file mode 100644
index 0000000..90e9a03
--- /dev/null
+++ b/src/mode_bamstat/bamstat_read_bam.cpp
@@ -0,0 +1,161 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "bamstat_data.h"
+
+int bamstat_data::keepRead(bam1_t * b) { // classify read b: returns 0 to keep it, or 1..8 naming the first filter it fails
+ if (b->core.flag & BAM_FUNMAP) return 1; // read is unmapped
+ if (b->core.flag & BAM_FSECONDARY) return 2; // secondary alignment
+ if (b->core.flag & BAM_FQCFAIL) return 3; // failed quality checks
+ if (b->core.flag & BAM_FDUP) return 4; // flagged as duplicate
+ // The next three filters only apply to reads flagged as paired
+ if (b->core.flag & BAM_FPAIRED) {
+ if (! (b->core.flag & BAM_FPROPER_PAIR)) return 5; // not a proper pair
+ if (b->core.flag & BAM_FMUNMAP) return 6; // mate is unmapped
+ if (((b->core.flag & BAM_FREVERSE) != 0) == ((b->core.flag & BAM_FMREVERSE) != 0)) return 7; // read and mate on the same strand; each masked bit is reduced to a boolean first because the two flags are different bits
+ }
+ if ((int)b->core.qual < param_min_mapQ) return 8; // mapping quality below the configured minimum
+ return 0;
+}
+
+void bamstat_data::readSequences(string fbam) { // two passes over the BAM file fbam: (1) filter and count all reads, (2) count the kept reads that overlap the regions stored in R
+ vrb.title("Opening BAM file [" + fbam + "]");
+ samFile * fd = sam_open(fbam.c_str(), "r");
+ if (fd == 0) vrb.error("Failed to open file");
+ vrb.bullet("reading header");
+ bam_hdr_t * header = sam_hdr_read(fd);
+ if (header == 0) vrb.error("Failed to read header");
+ vrb.bullet("reading index file");
+ hts_idx_t *idx = sam_index_load(fd, fbam.c_str());
+ if (idx == NULL) vrb.error("Failed to load index");
+ // Pass 1: read the file sequentially and tally the outcome of keepRead() for every record
+ vrb.bullet("STEP1: Iterate across reads");
+ bam1_t * b = bam_init1();
+ bool pair_end_seq = false; // becomes true as soon as one record carries BAM_FPAIRED
+ unsigned int n_unmap = 0; // keepRead() code 1
+ unsigned int n_second = 0; // keepRead() code 2
+ unsigned int n_qcfail = 0; // keepRead() code 3
+ unsigned int n_dup = 0; // keepRead() code 4
+ unsigned int n_proper = 0; // keepRead() code 5
+ unsigned int n_munmapped = 0; // keepRead() code 6
+ unsigned int n_strand = 0; // keepRead() code 7
+ unsigned int n_mappingqual = 0; // keepRead() code 8
+ unsigned int n_bothmappingqual = 0; // read names dropped after pass 1 because they were not kept exactly twice
+ unordered_map < string, pair < unsigned char, unsigned char > > M; // read name -> two small counters; pass 1 uses .first as the number of kept records with that name
+ unordered_map < string, pair < unsigned char, unsigned char > > :: iterator itM;
+ while (sam_read1(fd, header, b) >= 0) {
+ int error_code = keepRead(b);
+ pair_end_seq = pair_end_seq || (b->core.flag & BAM_FPAIRED);
+ if (error_code == 0) {
+ string uid = string(bam_get_qname(b));
+ itM = M.find(uid);
+ if (itM == M.end()) M.insert(make_pair(string(bam_get_qname(b)), make_pair(1,0)));
+ else itM->second.first ++;
+ n_keep++;
+ } else if (error_code == 1) n_unmap ++;
+ else if (error_code == 2) n_second ++;
+ else if (error_code == 3) n_qcfail ++;
+ else if (error_code == 4) {
+ if (!param_dup_rd) { // duplicates are still registered as kept unless param_dup_rd is set
+ string uid = string(bam_get_qname(b));
+ itM = M.find(uid);
+ if (itM == M.end()) M.insert(make_pair(string(bam_get_qname(b)), make_pair(1,0)));
+ else itM->second.first ++;
+ n_keep++;
+ }
+ n_dup ++; // counted in both cases
+ } else if (error_code == 5) n_proper ++;
+ else if (error_code == 6) n_munmapped ++;
+ else if (error_code == 7) n_strand ++;
+ else if (error_code == 8) n_mappingqual ++;
+ n_total_reads ++;
+ }
+ bam_destroy1(b);
+ // Report the pass 1 tallies; n_total_reads and n_keep are not declared or reset in this function
+ vrb.bullet(" + #n_total_reads = " + stb.str(n_total_reads));
+ vrb.bullet(" - #n_unmap_reads = " + stb.str(n_unmap));
+ vrb.bullet(" - #n_secondary_reads = " + stb.str(n_second));
+ vrb.bullet(" - #n_qcfail_reads = " + stb.str(n_qcfail));
+ if (param_dup_rd) vrb.bullet(" - #n_dup_reads = " + stb.str(n_dup));
+ else vrb.bullet(" #n_dup_reads = " + stb.str(n_dup));
+ if (pair_end_seq) vrb.bullet(" - #n_improperpairs_reads = " + stb.str(n_proper));
+ if (pair_end_seq) vrb.bullet(" - #n_mate_unmapped_reads = " + stb.str(n_munmapped));
+ if (pair_end_seq) vrb.bullet(" - #n_strand_pb_reads = " + stb.str(n_strand));
+ vrb.bullet(" - #n_mapping_qual_reads = " + stb.str(n_mappingqual));
+ vrb.bullet(" = #n_good_reads = " + stb.str(n_keep));
+ // Paired data: keep only the names seen exactly twice among kept records; in every case reset .first to 0 so pass 2 can reuse it
+ if (pair_end_seq) {
+ vrb.bullet(" = #n_total_pairs = " + stb.str(M.size()));
+ for (itM = M.begin() ; itM != M.end() ;) {
+ if (itM->second.first != 2) {
+ itM = M.erase(itM); // erase() hands back the next valid iterator
+ n_bothmappingqual ++;
+ } else {
+ itM->second.first = 0;
+ ++itM;
+ }
+ }
+ vrb.bullet(" - #n_both_not_mapped_pairs = " + stb.str(n_bothmappingqual));
+ vrb.bullet(" = #n_good_pairs = " + stb.str(M.size()));
+ } else for (itM = M.begin() ; itM != M.end() ; ++itM) itM->second.first = 0;
+ // Pass 2: query each region of R through the index and count the retained reads found there
+ ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ vrb.bullet("STEP2: Iterate across regions");
+ b = bam_init1();
+ for (int r = 0 ; r < R.size() ; r ++) {
+ string r_str = R[r].toString();
+ hts_itr_t *iter = sam_itr_querys(idx, header, r_str.c_str());
+ if (iter == NULL) vrb.error("Failed to parse region [" + r_str + "]");
+ else {
+ unsigned int n_tmp = 0; // incremented for every record returned, but never read afterwards
+ while (sam_itr_next(fd, iter, b) >= 0) {
+ string uid = string(bam_get_qname(b));
+ itM = M.find(uid);
+ if (itM != M.end()) { // only reads that survived pass 1 are counted
+ if (b->core.flag & BAM_FPAIRED) {
+ if ((b->core.flag & BAM_FREAD1) && itM->second.first < 255) itM->second.first ++; // .first counts read1 hits, capped at 255
+ if ((b->core.flag & BAM_FREAD2) && itM->second.second < 255) itM->second.second ++; // .second counts read2 hits, capped at 255
+ } else if (itM->second.first < 255) itM->second.first ++;
+ R[r].n_covering_reads ++;
+ }
+ n_tmp ++;
+ }
+ }
+ hts_itr_destroy(iter);
+
+ vrb.progress((r+1) * 1.0 / R.size());
+ }
+ bam_destroy1(b);
+
+ //Counting 'exonic' reads
+ vrb.title("Counting pairs overlapping annotations");
+ for (itM = M.begin() ; itM != M.end() ; ++ itM) {
+ if (itM->second.first > 0) n_overlap_reads ++; // one count per read name and per counter, however many regions were hit
+ if (itM->second.second > 0) n_overlap_reads ++;
+ }
+ vrb.bullet("#n_overlap_reads = " + stb.str(n_overlap_reads) + " / " + stb.str(n_keep));
+
+ //Counting covered regions
+ vrb.title("Counting annotations overlapped by reads");
+ for (int r = 0 ; r < R.size() ; r ++) {
+ if (R[r].n_covering_reads > 0) n_overlap_annotations ++;
+ }
+ vrb.bullet("#n_overlap_annotations = " + stb.str(n_overlap_annotations) + " / " + stb.str(R.size()));
+ // Release the index, the header and the file handle
+ hts_idx_destroy(idx);
+ bam_hdr_destroy(header);
+ sam_close(fd);
+
+}
diff --git a/src/mode_bamstat/bamstat_write_output.cpp b/src/mode_bamstat/bamstat_write_output.cpp
new file mode 100644
index 0000000..552e0ec
--- /dev/null
+++ b/src/mode_bamstat/bamstat_write_output.cpp
@@ -0,0 +1,23 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "bamstat_data.h"
+
+void bamstat_data::writeOutput(string fout) { // write the five summary counters to fout as a single space-separated line
+ output_file summary (fout);
+ summary << n_total_reads << " " << n_keep << " " << n_overlap_reads << " " << R.size() << " " << n_overlap_annotations << endl;
+ summary.close();
+}
+
diff --git a/src/mode_cis/cis_chunking.cpp b/src/mode_cis/cis_chunking.cpp
new file mode 100644
index 0000000..6c88ec0
--- /dev/null
+++ b/src/mode_cis/cis_chunking.cpp
@@ -0,0 +1,190 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+void cis_data::writeHeader(string fout) {
+ // Write the column-name line of the result file fout. The leading columns are
+ // shared by the three analysis modes (CIS_PERM, CIS_NOMI, CIS_COND); only the
+ // trailing, mode-specific columns differ. For any other value of mode the file
+ // is opened and closed without anything being written.
+ vrb.title("Writing header for output in [" + fout + "]");
+ output_file fdo (fout);
+ // Same open-failure check and message as runConditionalPass()
+ if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
+
+ const bool known_mode = (mode == CIS_PERM || mode == CIS_NOMI || mode == CIS_COND);
+ if (known_mode) {
+
+ // First column: phenotype ID, or group ID when phenotypes are grouped
+ if (grp_mode == GRP_NONE) fdo << "phe_id";
+ else fdo << "grp_id";
+
+ // Phenotype coordinates
+ fdo << " phe_chr phe_from phe_to phe_strd";
+
+ // Extra columns that depend on the aggregation method (none for GRP_NONE)
+ switch (grp_mode) {
+ case GRP_BEST: fdo << " phe_id n_phe_in_grp"; break;
+ case GRP_PCA1: fdo << " ve_by_pc1 n_phe_in_grp"; break;
+ case GRP_MEAN: fdo << " n_phe_in_grp"; break;
+ }
+
+ // Variant columns
+ fdo << " n_var_in_cis dist_phe_var var_id var_chr var_from var_to";
+
+ // Mode-specific result columns
+ if (mode == CIS_PERM) fdo << " dof1 dof2 bml1 bml2 nom_pval slope adj_emp_pval adj_beta_pval";
+ else if (mode == CIS_NOMI) fdo << " nom_pval slope best_hit";
+ else fdo << " rank fwd_pval fwd_slope fwd_best_hit fwd_sig bwd_pval bwd_slope bwd_best_hit bwd_sig";
+ fdo << endl;
+ }
+
+ fdo.close();
+}
+
+bool cis_data::setPhenotypeRegion(string reg) { // hand the region string reg to regionPhenotype.parse() and forward its result
+ return regionPhenotype.parse(reg);
+}
+
+// Closed genomic interval [start, end] on chromosome chr. setPhenotypeRegion(int, int)
+// builds one pgroup per phenotype (or per phenotype group), sorts them and merges the
+// overlapping ones before cutting the data into chunks.
+// All members are public; the class enforces no invariant such as start <= end.
+class pgroup {
+public:
+ int start, end; // interval bounds; overlap() treats both as inclusive
+ string chr; // chromosome name
+
+ // Build the interval [ps, pe] on chromosome pc.
+ pgroup(string pc, int ps, int pe) : start(ps), end(pe), chr(pc) {
+ }
+
+ // Extend the interval so that it also covers [ps, pe].
+ void merge(int ps, int pe) {
+ if (start > ps) start = ps;
+ if (end < pe) end = pe;
+ }
+
+ // Extend the interval so that it also covers p. The chromosome of p is not
+ // compared with chr here.
+ void merge(const pgroup & p) {
+ merge(p.start, p.end);
+ }
+
+ // True when p lies on the same chromosome and shares at least one position
+ // with this interval; intervals that merely touch at an end point overlap.
+ bool overlap(const pgroup & p) const {
+ if (chr != p.chr) return false;
+ return (start <= p.end && p.start <= end);
+ }
+
+ // Order by chromosome name first, then by start position. The end position
+ // takes no part in the ordering, so intervals with equal chr and start are
+ // equivalent for sorting purposes.
+ bool operator < (pgroup const & p) const {
+ if (chr < p.chr) return true;
+ if (chr > p.chr) return false;
+ return start < p.start;
+ }
+};
+
+void cis_data::setPhenotypeRegion(int k, int K) { // set regionPhenotype to the span of chunk k (accepted range 0 .. K-1) after cutting the phenotypes into at most K chunks
+ //STEP0: check input values
+ if (K < 1) vrb.error("Number of chunks needs to be > 0");
+ if (K > phenotype_count) vrb.error("Number of chunks (" + stb.str(K) + ") is greater than the number of phenotypes (" + stb.str(phenotype_count) + ")");
+ if (k < 0) vrb.error("Chunk index needs to be > 0");
+ if (k >= K) vrb.error("Chunk index needs to be smaller than the total number of chunks [=" + stb.str(K) + "]");
+
+ //STEP1: regroup by group (one interval per distinct group name, or one per phenotype when no groups are loaded)
+ vector < pgroup > v_pgroup;
+ if (phenotype_grp.size() > 0) {
+ map < string, int > grp2idx; // group name -> index of its interval in v_pgroup
+ map < string, int > :: iterator it_grp2idx;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ it_grp2idx = grp2idx.find (phenotype_grp[p]);
+ if (it_grp2idx == grp2idx.end()) {
+ grp2idx.insert(pair < string, int > (phenotype_grp[p], v_pgroup.size()));
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ } else v_pgroup[it_grp2idx->second].merge(phenotype_start[p], phenotype_end[p]); // NOTE(review): the chromosome of later group members is not compared with the first one - confirm groups never span chromosomes
+ }
+ } else {
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ }
+ }
+ sort(v_pgroup.begin(), v_pgroup.end());
+ // The intervals are now ordered by chromosome and start, so each one only needs comparing with the latest merged interval
+ //STEP2: merge overlapping groups
+ stack < pgroup > s_pgroup;
+ s_pgroup.push(v_pgroup[0]);
+ for (int i = 1 ; i < v_pgroup.size(); i++) {
+ pgroup ptop = s_pgroup.top(); // copy of the top element; it is pushed back after merging
+ if (!ptop.overlap(v_pgroup[i])) s_pgroup.push(v_pgroup[i]);
+ else {
+ ptop.merge(v_pgroup[i]);
+ s_pgroup.pop();
+ s_pgroup.push(ptop);
+ }
+ }
+ v_pgroup.clear();
+ while (!s_pgroup.empty()) {
+ v_pgroup.push_back(s_pgroup.top());
+ s_pgroup.pop();
+ }
+ sort(v_pgroup.begin(), v_pgroup.end()); // the stack was emptied top first, so the order has to be restored
+
+ //STEP3: build one cluster per chromosome (each cluster is a list of indexes into v_pgroup)
+ vector < vector < int > > cluster_idx;
+ map < string , int > chr2idx;
+ for (int p = 0 ; p < v_pgroup.size() ; p ++) {
+ map < string , int > :: iterator it_chr2idx = chr2idx.find(v_pgroup[p].chr);
+ if (it_chr2idx == chr2idx.end()) {
+ chr2idx.insert(make_pair(v_pgroup[p].chr, cluster_idx.size()));
+ cluster_idx.push_back(vector < int > (1, p));
+ } else cluster_idx[it_chr2idx->second].push_back(p);
+ }
+
+ //STEP4: split until number of chunks is reached
+ bool done = (cluster_idx.size() >= K);
+ while (!done) {
+ // Look for the largest cluster holding more than one interval (max_val starts at 1)
+ int max_idx = -1, max_val = 1;
+ for (int p = 0 ; p < cluster_idx.size() ; p ++) {
+ if (cluster_idx[p].size() > max_val) {
+ max_val = cluster_idx[p].size();
+ max_idx = p;
+ }
+ }
+ // Cut it in two halves; the second half becomes a new cluster at the end of cluster_idx
+ if (max_idx >= 0) {
+ int max_mid = cluster_idx[max_idx].size() / 2;
+ cluster_idx.push_back(vector < int > (cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end()));
+ cluster_idx[max_idx].erase(cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end());
+ if (cluster_idx.size() >= K) done = true;
+ } else done = true; // nothing left to split: fewer than K clusters remain
+ }
+
+ //STEP5: extract coordinates (smallest start and largest end over the intervals of cluster k)
+ if (k < cluster_idx.size()) {
+ regionPhenotype.chr = v_pgroup[cluster_idx[k][0]].chr;
+ regionPhenotype.start = 1000000000; // initial value for the running minimum
+ regionPhenotype.end = 0;
+ for (int c = 0 ; c < cluster_idx[k].size() ; c ++) {
+ if (v_pgroup[cluster_idx[k][c]].start < regionPhenotype.start) regionPhenotype.start = v_pgroup[cluster_idx[k][c]].start;
+ if (v_pgroup[cluster_idx[k][c]].end > regionPhenotype.end) regionPhenotype.end = v_pgroup[cluster_idx[k][c]].end;
+ }
+ } else vrb.leave("Empty chunk, no data to process!");
+}
diff --git a/src/mode_cis/cis_collapse_phenotypes.cpp b/src/mode_cis/cis_collapse_phenotypes.cpp
new file mode 100644
index 0000000..35e725e
--- /dev/null
+++ b/src/mode_cis/cis_collapse_phenotypes.cpp
@@ -0,0 +1,82 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+void cis_data::collapsePhenotypes() { // build group_idx / group_var / group_size and, for GRP_MEAN and GRP_PCA1, reduce every multi-phenotype group to its first phenotype
+ // Start from empty per-group containers: group_var and group_size are filled in lockstep with group_idx and indexed by group below
+ group_idx.clear();
+ group_var.clear();
+ group_size.clear();
+ //PASS0: check that groups are specified for aggregation methods
+ if (grp_mode != GRP_NONE && phenotype_grp.size() < (unsigned int) phenotype_count) vrb.error("Phenotype groups are required by the selected aggregation method");
+ //PASS1: regroup phenotypes by group ID
+ map < string, unsigned int > group_id; // group name -> index in group_idx
+ map < string, unsigned int >::iterator group_it;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (grp_mode != GRP_NONE) {
+ group_it = group_id.find(phenotype_grp[p]);
+ if (group_it == group_id.end()) {
+ group_idx.push_back(vector < unsigned int > (1, p));
+ group_var.push_back(1.0);
+ group_size.push_back(1);
+ group_id.insert(pair < string, unsigned int > (phenotype_grp[p], group_id.size()));
+ } else {
+ group_idx[group_it->second].push_back(p);
+ group_size[group_it->second]++;
+ }
+ } else { // no aggregation: every phenotype is its own group of size 1
+ group_idx.push_back(vector < unsigned int > (1, p));
+ group_var.push_back(1.0);
+ group_size.push_back(1);
+ }
+ }
+
+ //PASS2: sort & stats
+ basic_stats bspg; // collects the number of phenotypes per group
+ for (int g = 0 ; g < group_idx.size() ; g ++) {
+ sort(group_idx[g].begin(), group_idx[g].end());
+ bspg.push(group_idx[g].size());
+ }
+ if (grp_mode != GRP_NONE) {
+ vrb.title("Regrouping phenotypes within groups");
+ vrb.bullet("#phenotypes = " + stb.str(phenotype_count));
+ vrb.bullet("#groups = " + stb.str(group_idx.size()));
+ vrb.bullet("#phenotypes per group = " + stb.str(bspg.mean(), 2) + " +/-" + stb.str(bspg.sd(), 2));
+ }
+
+ //PASS3: pca1 and mean (group_size keeps the original member count even after group_idx is cut down to one entry)
+ basic_stats bsvg; // collects group_var of the groups that went through the PCA
+ for (int g = 0 ; g < group_idx.size() ; g ++) {
+ if (group_idx[g].size() > 1) {
+ if (grp_mode == GRP_MEAN) {
+ for (int s = 0 ; s < sample_count ; s ++) { // per-sample mean over the group, stored in the first phenotype of the group
+ for (int p = 1 ; p < group_idx[g].size() ; p ++) phenotype_val[group_idx[g][0]][s] += phenotype_val[group_idx[g][p]][s];
+ phenotype_val[group_idx[g][0]][s] /= group_idx[g].size();
+ }
+ group_idx[g].erase(group_idx[g].begin() + 1, group_idx[g].end());
+ } else if (grp_mode == GRP_PCA1) {
+ pca P (sample_count, group_idx[g].size());
+ P.fill(phenotype_val, group_idx[g]);
+ P.run(false, true, true);
+ P.get(0, phenotype_val[group_idx[g][0]]); // component 0 overwrites the first phenotype of the group
+ group_var[g] = P.getVariance(0);
+ bsvg.push(group_var[g]);
+ group_idx[g].erase(group_idx[g].begin() + 1, group_idx[g].end());
+ }
+ } else if (grp_mode == GRP_PCA1) group_var[g] = 1.0;
+ }
+ if (grp_mode == GRP_PCA1) vrb.bullet("variance explained by PC1 per group = " + stb.str(bsvg.mean(), 3) + " +/-" + stb.str(bsvg.sd(), 3));
+}
diff --git a/src/mode_cis/cis_conditionnal_pass.cpp b/src/mode_cis/cis_conditionnal_pass.cpp
new file mode 100644
index 0000000..7676633
--- /dev/null
+++ b/src/mode_cis/cis_conditionnal_pass.cpp
@@ -0,0 +1,207 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+void cis_data::runConditionalPass(string fout) {
+ // Conditional analysis, one phenotype group at a time: a forward pass finds successive signals, a backward pass re-tests each signal given all the others, and every association below the nominal threshold is written to fout.
+ //STEP0: INITIALIZATION OF IO
+ output_file fdo (fout);
+ if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
+
+ //STEP2: MAIN SWEEP THROUGH PHENOTYPES
+ for (unsigned int i_group = 0 ; i_group < group_idx.size() ; i_group ++) {
+
+ //STEP4: VERBOSE PROCESSED PHENOTYPES
+ if (grp_mode == GRP_NONE) vrb.title("Processing phenotype [" + phenotype_id[group_idx[i_group][0]] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+ else {
+ vrb.title("Processing group of phenotypes [" + phenotype_grp[group_idx[i_group][0]] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+ vrb.bullet("#phenotypes in group = " + stb.str(group_size[i_group]));
+ if (grp_mode == GRP_PCA1) vrb.bullet("variance explained by PC1 = " + stb.str(group_var[i_group], 3));
+ }
+
+ //STEP4: ENUMERATE ALL VARIANTS IN CIS
+ vector < unsigned int > variant_indexes; // indexes into the genotype_* vectors
+ vector < int > variant_distances; // signed distance to the phenotype, parallel to variant_indexes
+ for (unsigned int v = 0 ; v < genotype_count ; v ++) {
+ if (phenotype_chr[group_idx[i_group][0]] != genotype_chr[v]) continue;
+ int ps = (phenotype_start[group_idx[i_group][0]]>cis_window)?(phenotype_start[group_idx[i_group][0]]-cis_window):0;
+ int pe = phenotype_end[group_idx[i_group][0]] + cis_window;
+ // Keep the variant when [genotype_start, genotype_end] intersects the window [ps, pe] around the first phenotype of the group
+ if (genotype_start[v] <= pe && ps <= genotype_end[v]) {
+ int cisdistance = 0; // 0 when the variant overlaps the phenotype, negative when it ends before the phenotype start, positive otherwise
+ if (genotype_start[v] <= phenotype_end[group_idx[i_group][0]] && phenotype_start[group_idx[i_group][0]] <= genotype_end[v]) cisdistance = 0;
+ else if (genotype_end[v] < phenotype_start[group_idx[i_group][0]]) cisdistance = (genotype_end[v] - phenotype_start[group_idx[i_group][0]]);
+ else cisdistance = genotype_start[v] - phenotype_end[group_idx[i_group][0]];
+ if (phenotype_neg[group_idx[i_group][0]]) cisdistance *= -1; // sign flipped for phenotypes on the negative strand
+ variant_indexes.push_back(v);
+ variant_distances.push_back(cisdistance);
+ }
+ }
+ vrb.bullet("#variants in cis = " + stb.str(variant_indexes.size()));
+ vrb.bullet("Nominal significance threshold = " + stb.str(phenotype_threshold[group_idx[i_group][0]]));
+
+ //STEP5: VARIANTS IN CIS FOUND: PERFORM COMPUTATIONS
+ if (variant_indexes.size() > 0) {
+
+ //STEP6: FORWARD PASS
+ bool fdone = true;
+ unsigned int fhits = 0, fsignals = 0; // fhits: tests below threshold over all iterations; fsignals: iterations with at least one such test
+ vector < int > fbest_idx; // per iteration: genotype index of the smallest p-value
+ vector < double > fbest_pvalue; // per iteration: smallest p-value
+ vector < vector < double > > fpvalue, fslope; // per iteration: one entry per (variant, phenotype), laid out as v * group size + p
+ vector < vector < float > > phenotype_curr = vector < vector < float > > (group_idx[i_group].size());
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) phenotype_curr[p] = phenotype_val[group_idx[i_group][p]];
+ // Each iteration after the first one first removes the best variant of the previous iteration from the working phenotypes
+ do {
+ if (fsignals > 0) {
+ residualizer conditional_engine(sample_count);
+ conditional_engine.push(genotype_val[fbest_idx.back()]);
+ conditional_engine.build();
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ conditional_engine.residualize(phenotype_curr[p]);
+ normalTransform(phenotype_curr[p]);
+ }
+ }
+
+ fdone = true;
+ fbest_idx.push_back(-1);
+ fbest_pvalue.push_back(1.0);
+ fpvalue.push_back(vector < double > (group_idx[i_group].size() * variant_indexes.size(), 1.0));
+ fslope.push_back(vector < double > (group_idx[i_group].size() * variant_indexes.size(), 0.0));
+
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ unsigned int rel_idx = v * group_idx[i_group].size() + p;
+ regression(genotype_val[variant_indexes[v]], phenotype_curr[p], fpvalue.back()[rel_idx], fslope.back()[rel_idx]);
+ if (fpvalue.back()[rel_idx] <= phenotype_threshold[group_idx[i_group][p]]) {
+ if (fdone) fsignals ++; // first hit of this iteration: one more signal
+ fdone = false;
+ fhits ++;
+ }
+ if (fpvalue.back()[rel_idx] <= fbest_pvalue.back()) {
+ fbest_pvalue.back() = fpvalue.back()[rel_idx];
+ fbest_idx.back() = variant_indexes[v];
+ }
+ }
+ }
+ } while (!fdone);
+ // The last iteration found nothing below threshold: drop its results
+ fpvalue.pop_back();
+ fslope.pop_back();
+ fbest_idx.pop_back();
+ fbest_pvalue.pop_back();
+
+ vrb.bullet("Forward pass: [ni=" + stb.str(fsignals) + ", nh=" + stb.str(fhits) + "]");
+
+ //STEP7: IF THERE IS SIGNIFICANT QTLs
+ if (fsignals == 0) vrb.bullet("No backward pass");
+ else {
+
+ //STEP8: BACKWARD PASS
+ bool bdone = true;
+ unsigned int bhits = 0, bsignals = 0;
+ vector < double > bbest_pvalue = vector < double > (fsignals, 1.0);
+ vector < vector < double > > bpvalue = vector < vector < double > > (fsignals, vector < double > (group_idx[i_group].size() * variant_indexes.size(), 1.0));
+ vector < vector < double > > bslope = vector < vector < double > > (fsignals, vector < double > (group_idx[i_group].size() * variant_indexes.size(), 0.0));
+
+ for (unsigned int i_sig = 0 ; i_sig < fsignals ; i_sig ++) {
+
+ //Dump phenotypes
+ vector < vector < float > > phenotype_curr = vector < vector < float > > (group_idx[i_group].size());
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) phenotype_curr[p] = phenotype_val[group_idx[i_group][p]];
+
+ //Iterative correction (remove the best variant of every forward signal except i_sig)
+ for (unsigned int s = 0 ; s < fsignals ; s ++) {
+ if (s != i_sig) {
+ residualizer conditional_engine (sample_count);
+ conditional_engine.push(genotype_val[fbest_idx[s]]);
+ conditional_engine.build();
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ conditional_engine.residualize(phenotype_curr[p]);
+ normalTransform(phenotype_curr[p]);
+ }
+ }
+ }
+
+ bdone = true;
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ unsigned int rel_idx = v*group_idx[i_group].size() + p;
+ regression(genotype_val[variant_indexes[v]], phenotype_curr[p], bpvalue[i_sig][rel_idx], bslope[i_sig][rel_idx]);
+ if (bpvalue[i_sig][rel_idx] <= phenotype_threshold[group_idx[i_group][p]]) {
+ if (bdone) bsignals ++;
+ bdone = false;
+ bhits ++;
+ }
+ if (bpvalue[i_sig][rel_idx] <= bbest_pvalue[i_sig]) bbest_pvalue[i_sig] = bpvalue[i_sig][rel_idx];
+ }
+ }
+ }
+
+ vrb.bullet("Backward pass: [ni=" + stb.str(bsignals) + ", nh=" + stb.str(bhits) + "]");
+
+ //STEP9: WRITE OUTPUT
+ for (unsigned int i_sig = 0 ; i_sig < fsignals ; i_sig ++) {
+ vector < unsigned int > fbest_region, bbest_region; // every rel_idx tied for the best forward / backward p-value of this signal
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ unsigned int rel_idx = v * group_idx[i_group].size() + p;
+ if (fpvalue[i_sig][rel_idx] == fbest_pvalue[i_sig]) fbest_region.push_back(rel_idx);
+ if (bpvalue[i_sig][rel_idx] == bbest_pvalue[i_sig]) bbest_region.push_back(rel_idx);
+ }
+ }
+ random_shuffle(fbest_region.begin(), fbest_region.end()); // shuffle the whole list so that element 0, reported as best hit, is a random pick among ties (the range used to be empty)
+ random_shuffle(bbest_region.begin(), bbest_region.end());
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ unsigned int rel_idx = v * group_idx[i_group].size() + p;
+ if (fpvalue[i_sig][rel_idx] <= phenotype_threshold[group_idx[i_group][p]] || bpvalue[i_sig][rel_idx] <= phenotype_threshold[group_idx[i_group][p]]) {
+ if (grp_mode == GRP_NONE) fdo << phenotype_id[group_idx[i_group][p]];
+ else fdo << phenotype_grp[group_idx[i_group][p]];
+ fdo << " " << phenotype_chr[group_idx[i_group][p]];
+ fdo << " " << phenotype_start[group_idx[i_group][p]];
+ fdo << " " << phenotype_end[group_idx[i_group][p]];
+ fdo << " " << (phenotype_neg[group_idx[i_group][p]]?"-":"+");
+ switch (grp_mode) {
+ case GRP_BEST: fdo << " " << phenotype_id[group_idx[i_group][p]] << " " << stb.str(group_size[i_group]); break;
+ case GRP_PCA1: fdo << " " << stb.str(group_var[i_group], 3) << " " << stb.str(group_size[i_group]); break;
+ case GRP_MEAN: fdo << " " << stb.str(group_size[i_group]); break;
+ }
+ fdo << " " << variant_indexes.size();
+ fdo << " " << variant_distances[v];
+ fdo << " " << genotype_id[variant_indexes[v]];
+ fdo << " " << genotype_chr[variant_indexes[v]];
+ fdo << " " << genotype_start[variant_indexes[v]];
+ fdo << " " << genotype_end[variant_indexes[v]];
+ fdo << " " << i_sig;
+ fdo << " " << fpvalue[i_sig][rel_idx];
+ fdo << " " << fslope[i_sig][rel_idx];
+ fdo << " " << ((rel_idx == fbest_region[0])?"1":"0");
+ fdo << " " << ((fpvalue[i_sig][rel_idx] <= phenotype_threshold[group_idx[i_group][p]])?"1":"0");
+ fdo << " " << bpvalue[i_sig][rel_idx];
+ fdo << " " << bslope[i_sig][rel_idx];
+ fdo << " " << ((rel_idx == bbest_region[0])?"1":"0");
+ fdo << " " << ((bpvalue[i_sig][rel_idx] <= phenotype_threshold[group_idx[i_group][p]])?"1":"0");
+ fdo << endl;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ fdo.close();
+}
diff --git a/src/mode_cis/cis_data.h b/src/mode_cis/cis_data.h
new file mode 100644
index 0000000..001ac3c
--- /dev/null
+++ b/src/mode_cis/cis_data.h
@@ -0,0 +1,184 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _CIS_DATA_H
+#define _CIS_DATA_H
+
+//ANALYSIS MODES
+#define CIS_PERM 1
+#define CIS_NOMI 2
+#define CIS_COND 3
+
+//AGGREGATION MODES
+#define GRP_NONE 0
+#define GRP_BEST 1
+#define GRP_PCA1 2
+#define GRP_MEAN 3
+
+//INCLUDES
+#include "../common/data.h"
+
+class cis_data : public data { // parameters, data containers and routines of the cis analysis mode
+public:
+ //PARAMETERS
+ unsigned int mode; //analysis mode, compared against CIS_PERM / CIS_NOMI / CIS_COND
+ unsigned int grp_mode; //aggregation mode, compared against GRP_NONE / GRP_BEST / GRP_PCA1 / GRP_MEAN
+ unsigned int cis_window; //distance added on both sides of a phenotype when collecting variants in cis
+ unsigned int n_permutations; //presumably the number of permutations of the permutation pass - TODO confirm
+ double threshold; //NOTE(review): its use is not visible in this part of the code
+
+ //REGIONS
+ genomic_region regionPhenotype;
+ genomic_region regionGenotype;
+
+ //GENOTYPES
+ int genotype_count; //variant site number
+ vector < vector < float > > genotype_val; //variant site genotype dosages
+ vector < string > genotype_chr; //variant site chromosome
+ vector < string > genotype_id; //variant site IDs
+ vector < int > genotype_start; //variant site start positions
+ vector < int > genotype_end; //variant site end positions
+
+ //PHENOTYPES
+ int phenotype_count; //phenotype number
+ vector < vector < float > > phenotype_val; //phenotype values
+ vector < string > phenotype_id; //phenotype ids
+ vector < string > phenotype_chr; //phenotype chromosomes
+ vector < int > phenotype_start; //phenotype start positions
+ vector < int > phenotype_end; //phenotype end positions
+ vector < bool > phenotype_neg; //phenotype is on the negative strand
+ vector < string > phenotype_grp; //phenotype group
+ vector < double > phenotype_threshold; //phenotype nominal significance thresholds
+
+ //PHENOTYPE GROUPS
+ vector < vector < unsigned int > > group_idx; //group index to phenotype indexes
+ vector < double > group_var; //group variance explained by PC1
+ vector < int > group_size; //number of phenotypes in group
+
+ //COVARIATES & INTERACTION
+ int covariate_count; //covariate number
+ vector < vector < string > > covariate_val; //covariate values
+ vector < string > covariate_id; //covariate ids
+
+ //CONSTRUCTOR / DESTRUCTOR
+ cis_data();
+ ~cis_data();
+ void clear();
+
+ //DATA REGION
+ bool setPhenotypeRegion(string); //from a region string
+ void setPhenotypeRegion(int, int); //from a chunk index and a number of chunks
+
+ //READ DATA
+ void readGenotypes(string);
+ void readGenotypesVCF(string);
+ void readGenotypesBED(string);
+ void readPhenotypes(string);
+ void scanPhenotypes(string);
+ void readCovariates(string);
+ void readThresholds(string);
+
+ //GENOTYPE & PHENOTYPE MANAGEMENT
+ void clusterizePhenotypes(int);
+ void imputeGenotypes();
+ void imputePhenotypes();
+ void residualizePhenotypes();
+ void normalTransformPhenotypes();
+ void normalTransform(vector < float > &);
+ void normalize(vector < float > &);
+ void normalize(vector < vector < float > > &);
+ void collapsePhenotypes();
+
+ //OPTIMIZATION
+ int learnBetaParameters(vector < double > & pval, double & beta_shape1, double & beta_shape2);
+ int learnDegreeOfFreedom(vector < double > & corr, double &);
+
+ //COMPUTATION METHODS [ALL INLINES FOR SPEED]
+ double getCorrelation(vector < float > &, vector < float > &); //sum of element-wise products over sample_count entries
+ double getPvalue(double, double); //from a correlation and a number of degrees of freedom
+ double getPvalue(double, vector < double > &); //from a correlation and a set of correlations to compare it with
+ double getSlope(double, double, double); //from a correlation and two standard deviations
+ void regression(vector < float > & X, vector < float > & Y, double & pvalue, double & slope);
+
+ //ANALYSIS
+ void writeHeader(string);
+ void runNominalPass(string);
+ void runPermutationPass(string);
+ void runConditionalPass(string);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void cis_main(vector < string > &);
+
+//***************************************************************//
+//******************** INLINE FUNCTIONS *************************//
+//***************************************************************//
+
+inline double cis_data::getCorrelation(vector < float > & vec1, vector < float > & vec2) {
+ // Sum of element-wise products over the first sample_count entries of vec1 and vec2,
+ // accumulated in four separate partial sums that are added together at the end.
+ double acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+ int pos = 0;
+
+ // Full blocks of four samples
+ for (int blocks = (sample_count / 4) ; blocks != 0 ; -- blocks) {
+ acc0 += vec1[pos] * vec2[pos];
+ acc1 += vec1[pos+1] * vec2[pos+1];
+ acc2 += vec1[pos+2] * vec2[pos+2];
+ acc3 += vec1[pos+3] * vec2[pos+3];
+ pos += 4;
+ }
+
+ // Remaining 0 to 3 samples all go to the first accumulator, highest index first
+ for (int tail = (sample_count % 4) ; tail > 0 ; -- tail) {
+ acc0 += vec1[pos + tail - 1] * vec2[pos + tail - 1];
+ }
+
+ const double total = acc0 + acc1 + acc2 + acc3;
+ return total;
+}
+
+inline double cis_data::getPvalue(double corr, double df) { // p-value returned by pf() for correlation corr and df degrees of freedom, never smaller than numeric_limits<double>::min()
+ const double floor_value = std::numeric_limits<double>::min();
+ const double raw_pval = pf(df * corr * corr / (1 - corr * corr), 1, df, 0, 0);
+ return (raw_pval <= floor_value) ? floor_value : raw_pval;
+}
+
+inline double cis_data::getPvalue(double ncorr, vector < double > & pcorr) { // empirical p-value: (hits + 1) / (number of entries in pcorr + 1)
+ unsigned int as_extreme = 0; // entries of pcorr whose absolute value is at least that of ncorr
+ for (vector < double > :: iterator it = pcorr.begin() ; it != pcorr.end() ; ++ it) as_extreme += (abs(*it) >= abs(ncorr)) ? 1 : 0;
+ return ((as_extreme + 1) * 1.0 / (pcorr.size() + 1.0));
+}
+
+inline double cis_data::getSlope(double nominal_correlation, double gsd, double psd) { // correlation rescaled by psd / gsd, or 0 when either standard deviation is below 1e-16
+ const bool degenerate = (gsd < 1e-16 || psd < 1e-16);
+ return degenerate ? 0.0 : (nominal_correlation * psd / gsd);
+}
+
+inline void cis_data::regression(vector < float > & X, vector < float > & Y, double & pvalue, double & slope) {
+ // Fill pvalue and slope for the pair (X, Y); only copies of X and Y are handed to the helpers
+ vector < float > predictor = X, response = Y;
+ const double sd_predictor = basic_stats(predictor).sd(); // both standard deviations are taken before normalize() receives the copies
+ const double sd_response = basic_stats(response).sd();
+ normalize(predictor);
+ normalize(response);
+ const double r = getCorrelation(predictor, response);
+ pvalue = getPvalue(r, sample_count - 2);
+ slope = getSlope(r, sd_predictor, sd_response);
+}
+
+#endif
diff --git a/src/mode_cis/cis_initilization.cpp b/src/mode_cis/cis_initilization.cpp
new file mode 100644
index 0000000..ce9377d
--- /dev/null
+++ b/src/mode_cis/cis_initilization.cpp
@@ -0,0 +1,66 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+/**
+ * Builds an empty data set: all counters at zero, no permutations, no
+ * phenotype grouping, a zero-sized cis-window and a threshold of 1.0.
+ */
+cis_data::cis_data() {
+    //Data dimensions
+    sample_count = 0;
+    genotype_count = 0;
+    phenotype_count = 0;
+    covariate_count = 0;
+
+    //Analysis parameters
+    cis_window = 0.0;
+    threshold = 1.0;
+    n_permutations = 0;
+    grp_mode = GRP_NONE;
+}
+
+/**
+ * Empties every sample, genotype, phenotype, group and covariate container
+ * and resets the matching counters to zero. Analysis parameters (window,
+ * threshold, mode, ...) are not touched.
+ */
+void cis_data::clear() {
+    //Counters
+    sample_count = 0;
+    genotype_count = 0;
+    phenotype_count = 0;
+    covariate_count = 0;
+
+    //Samples
+    sample_id.clear();
+
+    //Genotypes
+    genotype_id.clear();
+    genotype_chr.clear();
+    genotype_start.clear();
+    genotype_end.clear();
+    genotype_val.clear();
+
+    //Phenotypes
+    phenotype_id.clear();
+    phenotype_grp.clear();
+    phenotype_chr.clear();
+    phenotype_start.clear();
+    phenotype_end.clear();
+    phenotype_neg.clear();
+    phenotype_val.clear();
+
+    //Phenotype groups
+    group_idx.clear();
+    group_var.clear();
+    group_size.clear();
+
+    //Covariates
+    covariate_id.clear();
+    covariate_val.clear();
+}
+
+/** Releases all loaded data by delegating to clear(). */
+cis_data::~cis_data() {
+    this->clear();
+}
+
+/**
+ * Regresses the covariates out of every phenotype, in place.
+ *
+ * All covariates are pushed into one residualizer, which is built once and
+ * then applied to each phenotype vector in turn.
+ */
+void cis_data::residualizePhenotypes() {
+    vrb.title("Residualize phenotypes for covariates");
+
+    residualizer covariate_engine (sample_count);
+    for (int c = 0 ; c < covariate_count ; c ++) {
+        covariate_engine.push(covariate_val[c]);
+    }
+    covariate_engine.build();
+
+    for (unsigned int p = 0 ; p < phenotype_count ; p ++) {
+        covariate_engine.residualize(phenotype_val[p]);
+    }
+
+    vrb.bullet("#covariates = " + stb.str(covariate_engine.n_covariates));
+}
+
diff --git a/src/mode_cis/cis_learn_beta.cpp b/src/mode_cis/cis_learn_beta.cpp
new file mode 100644
index 0000000..28aee9f
--- /dev/null
+++ b/src/mode_cis/cis_learn_beta.cpp
@@ -0,0 +1,103 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf_psi.h>
+#include <gsl/gsl_sf_gamma.h>
+
+// Admissible ranges for the two beta shape parameters. cis_betaLogLikelihood()
+// throws cis_learn_beta_exception as soon as the optimizer proposes a value
+// outside [BETA_SHAPE1_MIN, BETA_SHAPE1_MAX] or [BETA_SHAPE2_MIN, BETA_SHAPE2_MAX].
+#define BETA_SHAPE1_MIN 0.1
+#define BETA_SHAPE1_MAX 10
+#define BETA_SHAPE2_MIN 5
+#define BETA_SHAPE2_MAX 1000000 //to be changed for trans!
+
+
+/**
+ * Exception raised by the beta likelihood when a shape parameter leaves its
+ * admissible range. what() returns the message given at construction.
+ */
+class cis_learn_beta_exception : public std::exception {
+private:
+    string msg;
+
+public:
+    cis_learn_beta_exception(string _msg) : msg(_msg) { }
+
+    virtual ~cis_learn_beta_exception() throw() { }
+
+    virtual const char * what() const throw() {
+        return msg.c_str();
+    }
+};
+
+/**
+ * Negative beta log-likelihood, in the form expected by gsl_multimin.
+ *
+ * @param v       two-element vector: (shape1, shape2)
+ * @param params  array of three doubles; learnBetaParameters() fills it with
+ *                sum(log p), sum(log(1 - p)) and the number of p-values
+ * @return        -((shape1 - 1) * params[0] + (shape2 - 1) * params[1]
+ *                  - params[2] * lnBeta(shape1, shape2))
+ * @throws cis_learn_beta_exception when a shape is outside its
+ *         BETA_SHAPE*_MIN / BETA_SHAPE*_MAX range
+ */
+double cis_betaLogLikelihood(const gsl_vector *v, void *params) {
+    const double * stats = (double *) params;
+    const double beta_shape1 = gsl_vector_get(v, 0);
+    const double beta_shape2 = gsl_vector_get(v, 1);
+
+    //Range checks, first shape before second, lower bound before upper bound
+    if (beta_shape1 < BETA_SHAPE1_MIN) {
+        throw cis_learn_beta_exception("beta_shape1 too small [" + stb.str(beta_shape1, 3) + "]");
+    }
+    if (beta_shape1 > BETA_SHAPE1_MAX) {
+        throw cis_learn_beta_exception("beta_shape1 too large [" + stb.str(beta_shape1, 3) + "]");
+    }
+    if (beta_shape2 < BETA_SHAPE2_MIN) {
+        throw cis_learn_beta_exception("beta_shape2 too small [" + stb.str(beta_shape2, 3) + "]");
+    }
+    if (beta_shape2 > BETA_SHAPE2_MAX) {
+        throw cis_learn_beta_exception("beta_shape2 too large [" + stb.str(beta_shape2, 3) + "]");
+    }
+
+    const double term_shape1 = (beta_shape1 - 1) * stats[0];
+    const double term_shape2 = (beta_shape2 - 1) * stats[1];
+    const double term_norm = stats[2] * gsl_sf_lnbeta(beta_shape1, beta_shape2);
+    return -1.0 * (term_shape1 + term_shape2 - term_norm);
+}
+
+/**
+ * Maximum-likelihood fit of a beta distribution to a set of p-values.
+ *
+ * The negative log-likelihood cis_betaLogLikelihood() is minimized with the
+ * GSL simplex minimizer (nmsimplex2), starting from the incoming values of
+ * beta_shape1 / beta_shape2 with initial steps of one tenth of each.
+ *
+ * @param pval         p-values to fit; entries equal to 1.0 are overwritten
+ *                     with 0.99999999 so that log(1 - p) stays finite
+ * @param beta_shape1  in: starting value, out: fitted first shape
+ * @param beta_shape2  in: starting value, out: fitted second shape
+ * @return 1 when the simplex size test reported GSL_SUCCESS, 0 otherwise
+ * @throws cis_learn_beta_exception (from the objective) when the optimizer
+ *         leaves the admissible shape ranges. The GSL objects are released
+ *         before the exception is propagated, and the shapes are left as
+ *         they were on entry.
+ */
+int cis_data::learnBetaParameters(vector < double > & pval, double & beta_shape1, double & beta_shape2) {
+
+    //Sufficient statistics of the sample: sum(log p), sum(log(1-p)), n
+    double par [3];
+    par[0] = 0.0;
+    par[1] = 0.0;
+    for (size_t e = 0 ; e < pval.size(); e ++) {
+        if (pval[e] == 1.0) pval[e] = 0.99999999;
+        par[0] += log (pval[e]);
+        par[1] += log (1 - pval[e]);
+    }
+    par[2] = pval.size();
+
+    //Set starting point to the incoming (moment matching) estimates
+    gsl_vector * x = gsl_vector_alloc (2);
+    gsl_vector_set (x, 0, beta_shape1);
+    gsl_vector_set (x, 1, beta_shape2);
+
+    //Set initial step sizes to shape1 and shape2 scales
+    gsl_vector * ss = gsl_vector_alloc (2);
+    gsl_vector_set (ss, 0, beta_shape1/10);
+    gsl_vector_set (ss, 1, beta_shape2/10);
+
+    //Describe the function to minimize
+    gsl_multimin_function minex_func;
+    minex_func.n = 2;
+    minex_func.f = cis_betaLogLikelihood;
+    minex_func.params = par;
+
+    //Initialize optimization machinery
+    const gsl_multimin_fminimizer_type * T = gsl_multimin_fminimizer_nmsimplex2;
+    gsl_multimin_fminimizer * s = gsl_multimin_fminimizer_alloc (T, 2);
+
+    int status = GSL_CONTINUE;
+    try {
+        //Both _set() and _iterate() evaluate the objective, which may throw
+        gsl_multimin_fminimizer_set (s, &minex_func, x, ss);
+
+        //Optimization iteration
+        size_t iter = 0;
+        do {
+            iter++;
+            status = gsl_multimin_fminimizer_iterate(s);
+            if (status) break;
+            double size = gsl_multimin_fminimizer_size (s);
+            status = gsl_multimin_test_size (size, 0.01);
+        } while (status == GSL_CONTINUE && iter < 1000);
+
+        //Output new beta shape values
+        beta_shape1 = gsl_vector_get (s->x, 0);
+        beta_shape2 = gsl_vector_get (s->x, 1);
+    } catch (...) {
+        //Do not leak the GSL objects when the objective rejects a proposal
+        gsl_vector_free(x);
+        gsl_vector_free(ss);
+        gsl_multimin_fminimizer_free (s);
+        throw;
+    }
+
+    //Free allocated memory
+    gsl_vector_free(x);
+    gsl_vector_free(ss);
+    gsl_multimin_fminimizer_free (s);
+    return (status == GSL_SUCCESS);
+}
diff --git a/src/mode_cis/cis_learn_dof.cpp b/src/mode_cis/cis_learn_dof.cpp
new file mode 100644
index 0000000..1f52c39
--- /dev/null
+++ b/src/mode_cis/cis_learn_dof.cpp
@@ -0,0 +1,99 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf_psi.h>
+#include <gsl/gsl_sf_gamma.h>
+
+/**
+ * Bundle handed to the GSL objective degreeOfFreedom() through its void*
+ * parameter: the cis_data instance used to compute p-values, and a
+ * non-owning view (pointer + length) on the correlations to evaluate.
+ */
+struct data_to_function {
+    cis_data * D;   //object whose getPvalue() is called
+    int n;          //number of values readable through C
+    double * C;     //correlation values (not owned)
+
+    data_to_function (cis_data * _D, int _n, double * _C) : D(_D), n(_n), C(_C) { }
+};
+
+/**
+ * Objective minimized by cis_data::learnDegreeOfFreedom().
+ *
+ * For the candidate number of degrees of freedom v[0], every correlation in
+ * the bundle is converted to a p-value; the mean m and the sample variance s2
+ * of these p-values are then combined into m * (m * (1 - m) / s2 - 1), which
+ * is the moment-matching estimate of the first beta shape parameter. The
+ * function returns the distance of that estimate to 1.
+ *
+ * fabs() is used rather than an unqualified abs(), which may bind to
+ * int abs(int) and truncate the value.
+ *
+ * @param v       one-element vector holding the candidate degrees of freedom
+ * @param params  pointer to a data_to_function
+ */
+double degreeOfFreedom(const gsl_vector *v, void *params) {
+    data_to_function * d = (data_to_function *) params;
+    const double df = gsl_vector_get(v, 0);
+
+    //P-values of all correlations under the candidate df, and their mean
+    vector < double > pval = vector < double >(d->n, 0.0);
+    double mean = 0.0;
+    for (int c = 0 ; c < d->n ; c++) {
+        pval[c] = d->D->getPvalue(d->C[c], df);
+        mean += pval[c];
+    }
+    mean /= pval.size();
+
+    //Unbiased sample variance of the p-values
+    double variance = 0.0;
+    for (size_t c = 0 ; c < pval.size() ; c++) variance += (pval[c] - mean) * (pval[c] - mean);
+    variance /= (pval.size() - 1);
+
+    //Distance between the moment-matching first shape and 1
+    return fabs((mean * (mean * (1 - mean ) / variance - 1)) - 1.0);
+}
+
+/**
+ * Tunes the number of degrees of freedom used to turn correlations into
+ * p-values, by minimizing degreeOfFreedom() with the GSL simplex minimizer.
+ *
+ * @param corr  correlations used to evaluate the objective
+ * @param df    in: starting value, out: value held by the minimizer when the
+ *              loop stops (at most 20 iterations are run)
+ * @return 1 when the simplex size test reported GSL_SUCCESS, 0 otherwise
+ */
+int cis_data::learnDegreeOfFreedom(vector < double > & corr, double & df) {
+
+    //Set starting point to the incoming number of degrees of freedom
+    gsl_vector * x = gsl_vector_alloc (1);
+    gsl_vector_set (x, 0, df);
+
+    //Set initial step size to a tenth of the starting value
+    gsl_vector * ss = gsl_vector_alloc (1);
+    gsl_vector_set (ss, 0, df * 0.1);
+
+    //Data seen by the objective; it lives on the stack so nothing is leaked
+    data_to_function par (this, corr.size(), &corr[0]);
+
+    gsl_multimin_function minex_func;
+    minex_func.n = 1;
+    minex_func.f = degreeOfFreedom;
+    minex_func.params = (void*) &par;
+
+    //Initialize optimization machinery
+    const gsl_multimin_fminimizer_type * T = gsl_multimin_fminimizer_nmsimplex2;
+    gsl_multimin_fminimizer * s = gsl_multimin_fminimizer_alloc (T, 1);
+    gsl_multimin_fminimizer_set (s, &minex_func, x, ss);
+
+    //Optimization iteration
+    size_t iter = 0;
+    int status;
+    double size;
+    do {
+        iter++;
+        status = gsl_multimin_fminimizer_iterate(s);
+        if (status) break;
+        size = gsl_multimin_fminimizer_size (s);
+        status = gsl_multimin_test_size (size, 0.01);
+    } while (status == GSL_CONTINUE && iter < 20);
+
+    //Output new number of degrees of freedom
+    df = gsl_vector_get (s->x, 0);
+
+    //Free allocated memory
+    gsl_vector_free(x);
+    gsl_vector_free(ss);
+    gsl_multimin_fminimizer_free (s);
+    return (status == GSL_SUCCESS);
+}
diff --git a/src/mode_cis/cis_main.cpp b/src/mode_cis/cis_main.cpp
new file mode 100644
index 0000000..42dc335
--- /dev/null
+++ b/src/mode_cis/cis_main.cpp
@@ -0,0 +1,201 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#define _DECLARE_TOOLBOX_HERE
+#include "cis_data.h"
+
+/**
+ * Entry point of the [cis] mode: declares and parses the command-line
+ * options, validates them, loads genotypes / phenotypes / covariates and runs
+ * the requested pass (permutation, nominal or conditional).
+ *
+ * @param argv  command-line tokens of the [cis] sub-command
+ *
+ * The process exits with EXIT_FAILURE when the command line cannot be parsed
+ * and with EXIT_SUCCESS after printing the help. An unreadable genotype file
+ * is reported through vrb.error() instead of being dereferenced.
+ */
+void cis_main(vector < string > & argv) {
+    cis_data D;
+
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF/BED format.")
+        ("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
+        ("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+    opt_parameters.add_options()
+        ("normal", "Normal transform the phenotypes.")
+        ("window", boost::program_options::value< unsigned int >()->default_value(1000000), "Size of the cis-window.");
+
+    boost::program_options::options_description opt_modes ("\x1B[32mAnalysis type\33[0m");
+    opt_modes.add_options()
+        ("permute", boost::program_options::value< int >(), "MODE1: PERMUTATION PASS.")
+        ("nominal", boost::program_options::value< double >(), "MODE2: NOMINAL PASS.")
+        ("mapping", boost::program_options::value< string >(), "MODE3: MAPPING PASS.");
+
+    boost::program_options::options_description opt_aggr ("\x1B[32mPhenotype aggregation methods\33[0m");
+    opt_aggr.add_options()
+        ("grp-best", "Correct for multiple phenotypes within a group.")
+        ("grp-pca1", "Run PCA on phenotypes within a group and use PC1 for association testing.")
+        ("grp-mean", "Average phenotypes within a group and use the results for association testing.");
+
+    boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+    opt_parallel.add_options()
+        ("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
+        ("region", boost::program_options::value< string >(), "Region of interest.");
+
+    D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_modes).add(opt_aggr).add(opt_parallel);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [cis] command line :" << string(e.what()) << endl;
+        //A malformed command line is a failure: do not report success to the shell
+        exit(EXIT_FAILURE);
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("MAPPING QTL IN CIS");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+    if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+    int nParallel = D.options.count("chunk") + D.options.count("region");
+    if (nParallel > 1) vrb.error("Please, specify one of these options [--region, --chunk]");
+    int nMode = D.options.count("permute") + D.options.count("mapping") + D.options.count("nominal");
+    if (nMode != 1) vrb.error("Please, specify only one of these options [--permute, --nominal, --mapping]");
+    string outFile = D.options["out"].as < string > ();
+
+    //---------
+    // 5. MODES
+    //---------
+
+    //ONLY MODE1: PERMUTATION PASS
+    if (D.options.count("permute")) {
+        D.mode = CIS_PERM;
+        if (D.options["permute"].as < int >() < 10) vrb.error("Number of permutations is incorrect.");
+        vrb.bullet("TASK: Perform " + stb.str(D.options["permute"].as < int >()) + " permutations");
+        D.n_permutations = D.options["permute"].as < int >();
+    }
+    //ONLY MODE2: NOMINAL PASS
+    if (D.options.count("nominal")) {
+        D.mode = CIS_NOMI;
+        if (D.options["nominal"].as < double >() <= 0 || D.options["nominal"].as < double >() > 1.0) vrb.error("Significance threshold is outside of the range ]0,1]");
+        else vrb.bullet("TASK: Report all nominal associations with p <= " + stb.str(D.options["nominal"].as < double >()));
+        D.threshold = D.options["nominal"].as < double >();
+    }
+    //ONLY MODE3: MAPPING PASS
+    if (D.options.count("mapping")) {
+        D.mode = CIS_COND;
+        vrb.bullet("TASK: Perform a conditional pass");
+    }
+
+    //--------------
+    // 6. SET PARAMS
+    //--------------
+    if (D.options.count("cov")) vrb.bullet("Linear model: Phe ~ Var + Cov");
+    else vrb.bullet("Linear model: Phe ~ Var");
+    if (D.options["window"].as < unsigned int > () > 1000000000) vrb.error("Incorrect cis-window size!");
+    vrb.bullet("Cis-window size is " + stb.str(D.options["window"].as < unsigned int > ()) + " bp");
+    D.cis_window = D.options["window"].as < unsigned int > ();
+    if (D.options.count("chunk")) {
+        vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+        if (nChunk.size() != 2 || nChunk[0] > nChunk[1] || nChunk[0] < 0) vrb.error("Incorrect --chunk arguments!");
+        vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
+    } else if(D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");
+
+    //---------------------------
+    // 7. SET AGGREGATION METHODS
+    //---------------------------
+    int n_aggregation_methods = D.options.count("grp-best") + D.options.count("grp-pca1") + D.options.count("grp-mean");
+    if (n_aggregation_methods > 1) vrb.error("Only one of the --grp-XXX options is allowed");
+    if (D.options.count("grp-best")) {
+        vrb.bullet("Phenotypes are regrouped within groups [method: best]");
+        D.grp_mode = GRP_BEST;
+    } else if (D.options.count("grp-pca1")) {
+        vrb.bullet("Phenotypes are regrouped within groups [method: pca1]");
+        D.grp_mode = GRP_PCA1;
+    } else if (D.options.count("grp-mean")) {
+        vrb.bullet("Phenotypes are regrouped within groups [method: mean]");
+        D.grp_mode = GRP_MEAN;
+    } else {
+        D.grp_mode = GRP_NONE;
+    }
+
+    //--------------
+    // 8. SET REGION
+    //--------------
+    if (D.options.count("chunk")) {
+        vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+        if (nChunk[0] == 0) {
+            //Chunk 0 only writes the header of the output file
+            D.writeHeader(outFile);
+            return;
+        } else {
+            D.scanPhenotypes(D.options["bed"].as < string > ());
+            D.setPhenotypeRegion(nChunk[0] - 1, nChunk[1]);
+            D.clear();
+            D.regionGenotype = genomic_region(D.regionPhenotype, D.options["window"].as < unsigned int > ());
+        }
+    } else if (D.options.count("region")){
+        if (!D.setPhenotypeRegion(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
+        D.regionGenotype = genomic_region(D.regionPhenotype, D.options["window"].as < unsigned int > ());
+    }
+
+
+    //---------------
+    // 9. READ FILES
+    //---------------
+    D.processBasicOptions();
+    D.readSampleFromBED(D.options["bed"].as < string > ()); //Read samples in BED
+
+    //Probe the genotype file format; hts_open() returns NULL on failure
+    htsFile * fp = hts_open(D.options["vcf"].as < string > ().c_str(),"r");
+    if (fp == NULL) vrb.error("Cannot open file [" + D.options["vcf"].as < string > () + "]");
+    else {
+        if (fp->format.format == sam) D.readSampleFromBED(D.options["vcf"].as < string > ());
+        else D.readSampleFromVCF(D.options["vcf"].as < string > ());
+        hts_close(fp);
+    }
+
+    if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ()); //Read samples in COV
+    D.mergeSampleLists(); //Merge all sample lists
+    D.readPhenotypes(D.options["bed"].as < string > ()); //Read data in BED
+    D.readGenotypes(D.options["vcf"].as < string > ()); //Read data in VCF
+    if (D.options.count("cov")) D.readCovariates(D.options["cov"].as < string > ());
+    if (D.options.count("mapping")) D.readThresholds(D.options["mapping"].as < string > ());
+
+    //------------------------
+    // 10. INITIALIZE ANALYSIS
+    //------------------------
+    D.imputeGenotypes();
+    D.imputePhenotypes();
+    if (D.options.count("cov")) D.residualizePhenotypes();
+    D.collapsePhenotypes();
+    if (D.options.count("normal")) D.normalTransformPhenotypes();
+
+    //-----------------
+    // 11. RUN ANALYSIS
+    //-----------------
+    switch (D.mode) {
+    case CIS_PERM: D.runPermutationPass(outFile); break;
+    case CIS_NOMI: D.runNominalPass(outFile); break;
+    case CIS_COND: D.runConditionalPass(outFile); break;
+    }
+}
diff --git a/src/mode_cis/cis_management.cpp b/src/mode_cis/cis_management.cpp
new file mode 100644
index 0000000..79e8bfd
--- /dev/null
+++ b/src/mode_cis/cis_management.cpp
@@ -0,0 +1,94 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+/**
+ * Mean imputation of genotypes: for every variant, each entry equal to
+ * bcf_float_missing is replaced by the mean of the other entries of that
+ * variant.
+ */
+void cis_data::imputeGenotypes() {
+    for (int g = 0; g < genotype_count ; g ++) {
+        vector < float > & dosages = genotype_val[g];
+
+        //Mean over the observed samples only
+        double total = 0.0;
+        int n_observed = 0;
+        for (int s = 0; s < sample_count ; s ++) {
+            if (dosages[s] == bcf_float_missing) continue;
+            total += dosages[s];
+            n_observed ++;
+        }
+        total /= n_observed;
+
+        //Fill the gaps with that mean
+        for (int s = 0; s < sample_count ; s ++) {
+            if (dosages[s] == bcf_float_missing) dosages[s] = total;
+        }
+    }
+}
+
+/**
+ * Mean imputation of phenotypes: for every phenotype, each entry equal to
+ * bcf_float_missing is replaced by the mean of the other entries of that
+ * phenotype.
+ */
+void cis_data::imputePhenotypes() {
+    for (int p = 0; p < phenotype_count ; p ++) {
+        vector < float > & values = phenotype_val[p];
+
+        //Mean over the observed samples only
+        double total = 0.0;
+        int n_observed = 0;
+        for (int s = 0; s < sample_count; s ++) {
+            if (values[s] == bcf_float_missing) continue;
+            total += values[s];
+            n_observed ++;
+        }
+        total /= n_observed;
+
+        //Fill the gaps with that mean
+        for (int s = 0; s < sample_count ; s ++) {
+            if (values[s] == bcf_float_missing) values[s] = total;
+        }
+    }
+}
+
+/**
+ * Rank-based normal transform of V, in place.
+ *
+ * The values are ranked with myranker::rank(); each rank r is mapped to
+ * (r - 0.5) / (max(r - 0.5) + 0.5) and then through the standard normal
+ * quantile function qnorm().
+ */
+void cis_data::normalTransform(vector < float > & V) {
+    vector < float > ranks;
+    myranker::rank(V, ranks);
+
+    //Shift the ranks by one half and remember the largest shifted rank
+    double top = 0;
+    for (int s = 0 ; s < sample_count ; s ++) {
+        ranks[s] = ranks[s] - 0.5;
+        if (ranks[s] > top) top = ranks[s];
+    }
+    top = top + 0.5;
+
+    //Scale and map to normal quantiles
+    for (int s = 0 ; s < sample_count ; s ++) {
+        ranks[s] /= top;
+        V[s] = qnorm(ranks[s], 0.0, 1.0, 1, 0);
+    }
+}
+
+/** Applies normalTransform() to every phenotype, in place. */
+void cis_data::normalTransformPhenotypes() {
+    vrb.title("Match phenotypes to Normal distribution");
+    for (int p = 0; p < phenotype_count ; p ++) {
+        normalTransform(phenotype_val[p]);
+    }
+}
+
+/**
+ * Centres X on its mean and scales it to unit Euclidean norm, in place,
+ * over the first sample_count entries. A vector whose centred norm is 0 is
+ * only centred.
+ */
+void cis_data::normalize(vector < float > & X) {
+    //Centre
+    double centre = 0.0;
+    for (int s = 0; s < sample_count ; s ++) centre += X[s];
+    centre /= sample_count;
+
+    double sum_sq = 0.0;
+    for (int s = 0; s < sample_count ; s ++) {
+        X[s] -= centre;
+        sum_sq += X[s] * X[s];
+    }
+
+    //Scale, leaving constant vectors untouched
+    double norm = sqrt(sum_sq);
+    if (norm == 0) norm = 1;
+    for (int s = 0; s < sample_count ; s ++) X[s] /= norm;
+}
+
+/**
+ * Centres and scales every row of X to unit Euclidean norm, in place, by
+ * applying the single-vector normalize() to each row in turn.
+ */
+void cis_data::normalize(vector < vector < float > > & X) {
+    for (int x = 0 ; x < X.size() ; x++) {
+        vector < float > & row = X[x];
+        normalize(row);
+    }
+}
diff --git a/src/mode_cis/cis_nominal_pass.cpp b/src/mode_cis/cis_nominal_pass.cpp
new file mode 100644
index 0000000..10270a3
--- /dev/null
+++ b/src/mode_cis/cis_nominal_pass.cpp
@@ -0,0 +1,135 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+/**
+ * Nominal pass: tests every phenotype (or phenotype group) against all
+ * variants of its cis-window and writes the associations whose nominal
+ * p-value is at most `threshold` to the output file.
+ *
+ * Genotypes and phenotypes are standardized in place first; their standard
+ * deviations are kept so that slopes can be reported on the original scale.
+ * P-values and slopes of a group are stored in flat arrays indexed by
+ * [variant * group_size + phenotype].
+ *
+ * @param fout  path of the output file
+ */
+void cis_data::runNominalPass(string fout) {
+
+    //STEP0: INITIALIZATION OF IO
+    output_file fdo (fout);
+    if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
+
+    //STEP1: STANDARDIZE GENOTYPES IN PLACE, KEEPING THEIR SD
+    vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+    for (unsigned int v = 0 ; v < genotype_count ; v ++) {
+        genotype_sd[v] = basic_stats(genotype_val[v]).sd();
+        normalize(genotype_val[v]);
+    }
+
+    //STEP2: STANDARDIZE PHENOTYPES IN PLACE, KEEPING THEIR SD
+    vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+    for (unsigned int p = 0 ; p < phenotype_count ; p ++) {
+        phenotype_sd[p] = basic_stats(phenotype_val[p]).sd();
+        normalize(phenotype_val[p]);
+    }
+
+    //STEP3: MAIN SWEEP THROUGH PHENOTYPES
+    for (unsigned int i_group = 0 ; i_group < group_idx.size() ; i_group ++) {
+
+        //The first phenotype of the group provides the coordinates of the group
+        const unsigned int first = group_idx[i_group][0];
+        const size_t n_pheno = group_idx[i_group].size();
+
+        //STEP4: VERBOSE PROCESSED PHENOTYPES
+        if (grp_mode == GRP_NONE) vrb.title("Processing phenotype [" + phenotype_id[first] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+        else {
+            vrb.title("Processing group of phenotypes [" + phenotype_grp[first] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+            vrb.bullet("#phenotypes in group = " + stb.str(group_size[i_group]));
+            if (grp_mode == GRP_PCA1) vrb.bullet("variance explained by PC1 = " + stb.str(group_var[i_group], 3));
+        }
+
+        //STEP5: ENUMERATE ALL VARIANTS IN CIS
+        vector < unsigned int > variant_indexes;
+        vector < int > variant_distances;
+        for (unsigned int v = 0 ; v < genotype_count ; v ++) {
+            if (phenotype_chr[first] != genotype_chr[v]) continue;
+            int ps = (phenotype_start[first]>cis_window)?(phenotype_start[first]-cis_window):0;
+            int pe = phenotype_end[first] + cis_window;
+
+            if (genotype_start[v] <= pe && ps <= genotype_end[v]) {
+                //Signed distance to the phenotype, 0 when they overlap
+                int cisdistance = 0;
+                if (genotype_start[v] <= phenotype_end[first] && phenotype_start[first] <= genotype_end[v]) cisdistance = 0;
+                else if (genotype_end[v] < phenotype_start[first]) cisdistance = genotype_end[v] - phenotype_start[first];
+                else cisdistance = genotype_start[v] - phenotype_end[first];
+                if (phenotype_neg[first]) cisdistance *= -1;
+                variant_indexes.push_back(v);
+                variant_distances.push_back(cisdistance);
+            }
+        }
+        vrb.bullet("#variants in cis = " + stb.str(variant_indexes.size()));
+
+        //STEP6: VARIANTS IN CIS FOUND: FULL COMPUTATIONS
+        if (variant_indexes.size() > 0) {
+            double best_nominal_correlation = 0.0;
+            int best_nominal_variant_abs = -1;
+            int best_nominal_variant_rel = -1;
+            int best_nominal_phenotype_abs = -1;
+            int best_nominal_phenotype_rel = -1;
+            int best_nominal_distance = cis_window;
+            vector < double > pval_nom = vector < double > (variant_indexes.size() * n_pheno, 0.0);
+            vector < double > pval_slope = vector < double > (variant_indexes.size() * n_pheno, 0.0);
+
+            //STEP7: ASSOCIATION TESTING
+            const double dof_true = sample_count - 2;
+            for (unsigned int p = 0 ; p < n_pheno ; p ++) {
+                for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+                    double curr_correlation = getCorrelation(genotype_val[variant_indexes[v]], phenotype_val[group_idx[i_group][p]]);
+                    pval_nom[v*n_pheno+p] = getPvalue(curr_correlation, dof_true);
+                    pval_slope[v*n_pheno+p] = getSlope(curr_correlation, genotype_sd[variant_indexes[v]], phenotype_sd[group_idx[i_group][p]]);
+
+                    //Strongest correlation wins; ties go to the closest variant.
+                    //fabs() on doubles: an unqualified abs() may bind to int abs(int).
+                    const double curr_strength = fabs(curr_correlation);
+                    const double best_strength = fabs(best_nominal_correlation);
+                    if (curr_strength > best_strength || (curr_strength == best_strength && abs(variant_distances[v]) < abs(best_nominal_distance))) {
+                        best_nominal_correlation = curr_correlation;
+                        best_nominal_variant_rel = v;
+                        best_nominal_variant_abs = variant_indexes[v];
+                        best_nominal_phenotype_rel = p;
+                        best_nominal_phenotype_abs = group_idx[i_group][p];
+                        best_nominal_distance = variant_distances[v];
+                    }
+                }
+            }
+
+            //STEP8: VERBOSE BEST HIT
+            //Skipped when no pair was selected, so that index -1 is never read.
+            if (best_nominal_variant_rel >= 0) {
+                //Same [variant * group_size + phenotype] layout as used when filling the arrays
+                const size_t best_idx = best_nominal_variant_rel * n_pheno + best_nominal_phenotype_rel;
+                if (grp_mode == GRP_BEST)
+                    vrb.bullet("Best hit: [id=" + genotype_id[best_nominal_variant_abs] + ", d=" + stb.str(best_nominal_distance) + ", p=" + phenotype_id[best_nominal_phenotype_abs] + ", pv=" + stb.str(pval_nom[best_idx]) + ", s=" + stb.str(pval_slope[best_idx], 4) + "]");
+                else vrb.bullet("Best hit: [id=" + genotype_id[best_nominal_variant_abs] + ", d=" + stb.str(best_nominal_distance) + ", pv=" + stb.str(pval_nom[best_idx]) + ", s=" + stb.str(pval_slope[best_idx], 4) + "]");
+            }
+
+            //STEP9: PRINT RESULTS IN FILE
+            for (unsigned int p = 0 ; p < n_pheno ; p ++) {
+                for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+                    if (pval_nom[v * n_pheno + p] <= threshold) {
+                        if (grp_mode == GRP_NONE) fdo << phenotype_id[group_idx[i_group][p]];
+                        else fdo << phenotype_grp[group_idx[i_group][p]];
+                        fdo << " " << phenotype_chr[group_idx[i_group][p]];
+                        fdo << " " << phenotype_start[group_idx[i_group][p]];
+                        fdo << " " << phenotype_end[group_idx[i_group][p]];
+                        fdo << " " << (phenotype_neg[group_idx[i_group][p]]?"-":"+");
+                        switch (grp_mode) {
+                        case GRP_BEST: fdo << " " << phenotype_id[group_idx[i_group][p]] << " " << stb.str(group_size[i_group]); break;
+                        case GRP_PCA1: fdo << " " << stb.str(group_var[i_group], 3) << " " << stb.str(group_size[i_group]); break;
+                        case GRP_MEAN: fdo << " " << stb.str(group_size[i_group]); break;
+                        }
+                        fdo << " " << variant_indexes.size();
+                        fdo << " " << variant_distances[v];
+                        fdo << " " << genotype_id[variant_indexes[v]];
+                        fdo << " " << genotype_chr[variant_indexes[v]];
+                        fdo << " " << genotype_start[variant_indexes[v]];
+                        fdo << " " << genotype_end[variant_indexes[v]];
+                        fdo << " " << pval_nom[v * n_pheno + p];
+                        fdo << " " << pval_slope[v * n_pheno + p];
+                        fdo << " " << ((v == best_nominal_variant_rel && p == best_nominal_phenotype_rel)?"1":"0");
+                        fdo << endl;
+                    }
+                }
+            }
+        }
+    }
+    fdo.close();
+}
diff --git a/src/mode_cis/cis_permutation_pass.cpp b/src/mode_cis/cis_permutation_pass.cpp
new file mode 100644
index 0000000..52cd961
--- /dev/null
+++ b/src/mode_cis/cis_permutation_pass.cpp
@@ -0,0 +1,195 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+//Permutation pass: for each phenotype (or phenotype group) finds the best nominal cis variant, builds a permutation null and writes one result line to fout.
+void cis_data::runPermutationPass(string fout) {
+ //STEP0: INITIALIZATION OF IO
+ output_file fdo (fout);
+ if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
+
+ //STEP1: INITIALIZE A WORKING COPY OF GENOTYPES
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ for (unsigned int v = 0 ; v < genotype_count ; v ++) {
+ genotype_sd[v] = basic_stats(genotype_val[v]).sd(); //sd is recorded before normalize() is called; it is passed to getSlope() in STEP11
+ normalize(genotype_val[v]);
+ }
+
+ //STEP2: INITIALIZE A WORKING COPY OF PHENOTYPES
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ for (unsigned int p = 0 ; p < phenotype_count ; p ++) {
+ phenotype_sd[p] = basic_stats(phenotype_val[p]).sd(); //same ordering as for genotypes: sd first, then normalize()
+ normalize(phenotype_val[p]);
+ }
+
+ //STEP3: MAIN SWEEP THROUGH PHENOTYPES
+ for (unsigned int i_group = 0 ; i_group < group_idx.size() ; i_group ++) { //group_idx[i_group] holds the phenotype indexes of one group
+
+ //STEP4: VERBOSE PROCESSED PHENOTYPES
+ if (grp_mode == GRP_NONE) vrb.title("Processing phenotype [" + phenotype_id[group_idx[i_group][0]] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+ else {
+ vrb.title("Processing group of phenotypes [" + phenotype_grp[group_idx[i_group][0]] + "] [" + stb.str(i_group+1) + "/" + stb.str(group_idx.size()) + "]");
+ vrb.bullet("#phenotypes in group = " + stb.str(group_size[i_group]));
+ if (grp_mode == GRP_PCA1) vrb.bullet("variance explained by PC1 = " + stb.str(group_var[i_group], 3));
+ }
+
+ //STEP5: ENUMERATE ALL VARIANTS IN CIS (the window is taken around the first phenotype of the group)
+ vector < unsigned int > variant_indexes; //indexes into the genotype_* vectors
+ vector < int > variant_distances; //signed distance for the matching entry of variant_indexes
+ for (unsigned int v = 0 ; v < genotype_count ; v ++) {
+ if (phenotype_chr[group_idx[i_group][0]] != genotype_chr[v]) continue;
+ int ps = (phenotype_start[group_idx[i_group][0]]>cis_window)?(phenotype_start[group_idx[i_group][0]]-cis_window):0; //window start, clamped at 0
+ int pe = phenotype_end[group_idx[i_group][0]] + cis_window; //window end
+
+ if (genotype_start[v] <= pe && ps <= genotype_end[v]) { //variant interval overlaps [ps, pe]
+ int cisdistance = 0;
+ if (genotype_start[v] <= phenotype_end[group_idx[i_group][0]] && phenotype_start[group_idx[i_group][0]] <= genotype_end[v]) cisdistance = 0; //variant overlaps the phenotype itself
+ else if (genotype_end[v] < phenotype_start[group_idx[i_group][0]]) cisdistance = genotype_end[v] - phenotype_start[group_idx[i_group][0]]; //variant ends before the phenotype: negative value
+ else cisdistance = genotype_start[v] - phenotype_end[group_idx[i_group][0]]; //variant starts after the phenotype: positive value
+ if (phenotype_neg[group_idx[i_group][0]]) cisdistance *= -1; //sign is flipped for phenotypes flagged as negative strand
+ variant_indexes.push_back(v);
+ variant_distances.push_back(cisdistance);
+ }
+ }
+ vrb.bullet("#variants in cis = " + stb.str(variant_indexes.size()));
+
+ //STEP7: NO VARIANTS IN CIS: OUTPUT NAs
+ if (variant_indexes.size() == 0) {
+ if (grp_mode == GRP_NONE) fdo << phenotype_id[group_idx[i_group][0]];
+ else fdo << phenotype_grp[group_idx[i_group][0]];
+ fdo << " " << phenotype_chr[group_idx[i_group][0]];
+ fdo << " " << phenotype_start[group_idx[i_group][0]];
+ fdo << " " << phenotype_end[group_idx[i_group][0]];
+ fdo << " " << (phenotype_neg[group_idx[i_group][0]]?"-":"+");
+ switch (grp_mode) {
+ case GRP_BEST: fdo << " NA " << stb.str(group_size[i_group]); break;
+ case GRP_PCA1: fdo << " " << stb.str(group_var[i_group], 3) << " " << stb.str(group_size[i_group]); break;
+ case GRP_MEAN: fdo << " " << stb.str(group_size[i_group]); break;
+ }
+ fdo << " " << variant_indexes.size() << " NA NA NA NA NA NA NA NA NA NA NA NA NA" << endl; //NA placeholders stand in for the per-hit columns written in STEP18
+ }
+
+ //STEP8: VARIANTS IN CIS FOUND: FULL COMPUTATIONS
+ else {
+ double best_nominal_correlation = 0.0;
+ int best_nominal_variant_abs = -1; //NOTE(review): stays -1 if the test in STEP9 never succeeds (e.g. every comparison is false), yet it is used as an index from STEP11 on -- confirm this cannot happen
+ int best_nominal_phenotype_abs = -1;
+ int best_nominal_distance = cis_window;
+ vector < double > best_permuted_correlations = vector < double > (n_permutations, 0.0); //one entry per permutation
+
+ //STEP9: NOMINAL PASS
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ double curr_correlation = getCorrelation(genotype_val[variant_indexes[v]], phenotype_val[group_idx[i_group][p]]);
+ if (abs(curr_correlation) > abs(best_nominal_correlation) || (abs(curr_correlation) == abs(best_nominal_correlation) && abs(variant_distances[v]) < abs(best_nominal_distance))) { //larger |correlation| wins; on a tie the smaller |distance| wins
+ best_nominal_correlation = curr_correlation;
+ best_nominal_variant_abs = variant_indexes[v];
+ best_nominal_phenotype_abs = group_idx[i_group][p];
+ best_nominal_distance = variant_distances[v];
+ }
+ }
+ }
+
+ //STEP10: PERMUTATION PASS
+ vector < float > phenotype_curr = vector < float > (sample_count, 0.0);
+ vector < unsigned int > permuted_indexes = vector < unsigned int > (sample_count, 0);
+ for (unsigned int i = 0 ; i < sample_count ; i ++) permuted_indexes[i] = i;
+ for (int perm = 0 ; perm < n_permutations ; perm ++) {
+ shuffle(permuted_indexes.begin(), permuted_indexes.end(), rng.getEngine()); //one shuffle per permutation, shared by all phenotypes of the group; the index vector is not reset between permutations
+ for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
+ for (unsigned int i = 0 ; i < sample_count ; i ++) phenotype_curr[i] = phenotype_val[group_idx[i_group][p]][permuted_indexes[i]]; //phenotype values reordered by the current permutation
+ for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
+ double curr_correlation = getCorrelation(genotype_val[variant_indexes[v]], phenotype_curr);
+ if (abs(curr_correlation) > abs(best_permuted_correlations[perm])) best_permuted_correlations[perm] = curr_correlation; //keep the largest |correlation| of this permutation over all phenotypes and variants
+ }
+ }
+ }
+
+ //STEP11: COMPUTE BASIC STATS FOR BEST HIT
+ double dof_true = sample_count - 2;
+ double pval_nom = getPvalue(best_nominal_correlation, dof_true);
+ double pval_slope = getSlope(best_nominal_correlation, genotype_sd[best_nominal_variant_abs], phenotype_sd[best_nominal_phenotype_abs]); //despite its name this holds the value returned by getSlope()
+
+ //STEP12: VERBOSE BEST HIT
+ if (grp_mode == GRP_BEST)
+ vrb.bullet("Best hit: [id=" + genotype_id[best_nominal_variant_abs] + ", d=" + stb.str(best_nominal_distance) + ", p=" + phenotype_id[best_nominal_phenotype_abs] + ", pv=" + stb.str(pval_nom) + ", s=" + stb.str(pval_slope, 4) + "]");
+ else vrb.bullet("Best hit: [id=" + genotype_id[best_nominal_variant_abs] + ", d=" + stb.str(best_nominal_distance) + ", pv=" + stb.str(pval_nom) + ", s=" + stb.str(pval_slope, 4) + "]");
+
+ //STEP13: PROCESS DEGREES OF FREEDOM
+ double dof_esti = dof_true;
+ double variance_best_permuted_correlations = basic_stats(best_permuted_correlations).variance();
+ if (variance_best_permuted_correlations != 0) learnDegreeOfFreedom(best_permuted_correlations, dof_esti); //dof_esti keeps the value of dof_true when the permuted correlations have zero variance
+ //vrb.bullet("DOF: [t=" + stb.str(dof_true, 1) + ", e=" + stb.str(dof_esti, 1) + "]");
+
+ //STEP14: COMPUTE BEST PERMUTATION HIT
+ vector < double > best_permuted_pvalues = vector < double > (n_permutations, 0.0);
+ for (int perm = 0 ; perm < n_permutations ; perm ++) best_permuted_pvalues[perm] = getPvalue(best_permuted_correlations[perm], dof_esti);
+ double mean_best_permuted_pvalues, variance_best_permuted_pvalues;
+ mean_best_permuted_pvalues = basic_stats(best_permuted_pvalues).mean();
+ variance_best_permuted_pvalues = basic_stats(best_permuted_pvalues).variance();
+
+ //STEP15: LEARN BETA PARAMETERS
+ double beta_mm1 = mean_best_permuted_pvalues * (mean_best_permuted_pvalues * (1 - mean_best_permuted_pvalues ) / variance_best_permuted_pvalues - 1); //moment-matching value, also the fallback below; NOTE(review): divides by the p-value variance -- confirm it cannot be 0
+ double beta_mm2 = beta_mm1 * (1 / mean_best_permuted_pvalues - 1);
+ double beta_ml1 = beta_mm1; //starting values handed to learnBetaParameters()
+ double beta_ml2 = beta_mm2;
+ try {
+ learnBetaParameters(best_permuted_pvalues, beta_ml1, beta_ml2);
+ } catch (const std::exception & e) {
+ vrb.bullet("Maximum Likelihood estimation failed, use Moment Matching instead!");
+ beta_ml1 = beta_mm1; //restore the moment-matching values in case the failed call modified them
+ beta_ml2 = beta_mm2;
+ }
+ vrb.bullet("Beta parameters: [s1=" + stb.str(beta_ml1) + ", s2=" + stb.str(beta_ml2) +"]");
+
+ //STEP16: COMPUTE ADJUSTED PVALUES
+ double pval_emp = getPvalue(best_nominal_correlation, best_permuted_correlations); //overload taking the vector of permuted correlations
+ double pval_bml = pbeta(getPvalue(best_nominal_correlation, dof_esti), beta_ml1, beta_ml2, 1, 0); //NOTE(review): the trailing (1, 0) are presumably lower_tail and log_p flags -- confirm against the pbeta declaration
+
+ //STEP17: VERBOSE ADJUSTED PVALUES
+ vrb.bullet("Adjusted p-values: [emp=" + stb.str(pval_emp) + ", beta=" + stb.str(pval_bml) + "]");
+
+ //STEP18: PRINT RESULTS IN FILE
+ if (grp_mode == GRP_NONE) fdo << phenotype_id[group_idx[i_group][0]];
+ else fdo << phenotype_grp[group_idx[i_group][0]];
+ fdo << " " << phenotype_chr[group_idx[i_group][0]];
+ fdo << " " << phenotype_start[group_idx[i_group][0]];
+ fdo << " " << phenotype_end[group_idx[i_group][0]];
+ fdo << " " << (phenotype_neg[group_idx[i_group][0]]?"-":"+");
+ switch (grp_mode) {
+ case GRP_BEST: fdo << " " << phenotype_id[best_nominal_phenotype_abs] << " " << stb.str(group_size[i_group]); break;
+ case GRP_PCA1: fdo << " " << stb.str(group_var[i_group], 3) << " " << stb.str(group_size[i_group]); break;
+ case GRP_MEAN: fdo << " " << stb.str(group_size[i_group]); break;
+ }
+ fdo << " " << variant_indexes.size();
+ fdo << " " << best_nominal_distance;
+ fdo << " " << genotype_id[best_nominal_variant_abs];
+ fdo << " " << genotype_chr[best_nominal_variant_abs];
+ fdo << " " << genotype_start[best_nominal_variant_abs];
+ fdo << " " << genotype_end[best_nominal_variant_abs];
+ fdo << " " << dof_true;
+ fdo << " " << dof_esti;
+ fdo << " " << beta_ml1;
+ fdo << " " << beta_ml2;
+ fdo << " " << pval_nom;
+ fdo << " " << pval_slope;
+ fdo << " " << pval_emp;
+ fdo << " " << pval_bml;
+ fdo << endl;
+ }
+ }
+ fdo.close();
+}
diff --git a/src/mode_cis/cis_read_covariates.cpp b/src/mode_cis/cis_read_covariates.cpp
new file mode 100644
index 0000000..8e01b07
--- /dev/null
+++ b/src/mode_cis/cis_read_covariates.cpp
@@ -0,0 +1,56 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+//Loads the covariate table in fcov: the header row lists sample ids from column 2 on, every following row is one covariate (id in column 1, then one value per header sample).
+void cis_data::readCovariates(string fcov) {
+ string buffer; vector < string > str;
+ int n_includedS = 0;
+ int n_includedC = 0;
+ int n_excludedC = 0;
+ vector < int > mappingS; //mappingS[k] = findSample() result for header column k+1; negative values are skipped when storing
+
+ vrb.title("Reading covariates in [" + fcov + "]");
+ input_file fd (fcov);
+ if (fd.fail()) vrb.error("Cannot open file!");
+
+ //Read samples
+ getline(fd, buffer);
+ if (buffer.size() == 0) vrb.error("No header line detected!");
+ stb.split(buffer, str);
+ for (int t = 1 ; t < str.size() ; t ++) {
+ mappingS.push_back(findSample(str[t]));
+ if (mappingS.back() >= 0) n_includedS++;
+ }
+
+ //Read covariates
+ while(getline(fd, buffer)) {
+ stb.split(buffer, str);
+ if (str.size() < 2) vrb.error("Incorrect number of columns!");
+ if (filter_covariate.check(str[0])) {
+ if (mappingS.size() != (str.size() - 1)) vrb.error("Incorrect number of columns!"); //run-time check instead of assert (which vanishes under NDEBUG), so mappingS is never indexed out of range
+ covariate_val.push_back(vector < string > (sample_count));
+ for (int t = 1 ; t < str.size() ; t ++) if (mappingS[t-1] >= 0) covariate_val.back()[mappingS[t-1]] = str[t];
+ n_includedC ++;
+ } else n_excludedC ++;
+ }
+
+ //Finalise
+ covariate_count = n_includedC;
+ vrb.bullet(stb.str(n_includedC) + " covariates included");
+ if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariates excluded");
+ fd.close();
+}
diff --git a/src/mode_cis/cis_read_genotypes.cpp b/src/mode_cis/cis_read_genotypes.cpp
new file mode 100644
index 0000000..b2d3d6a
--- /dev/null
+++ b/src/mode_cis/cis_read_genotypes.cpp
@@ -0,0 +1,215 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+//Opens the genotype file once to read the format detected by htslib, then dispatches to the VCF/BCF reader or the BED reader.
+void cis_data::readGenotypes(string filename) {
+ vrb.title("Reading genotype data in [" + filename + "]");
+ htsFile * fp = hts_open(filename.c_str(),"r");
+ if (!fp) vrb.error("Cannot open file!"); //hts_open returns NULL on failure; fp must not be dereferenced (vrb.error is used as a fatal stop elsewhere in this file)
+ enum htsExactFormat fileformat = fp->format.format;
+ hts_close(fp);
+ if (fileformat == bcf || fileformat == vcf) {
+ vrb.bullet(string("File format detected: ") + ((fileformat == bcf)?"BCF":"VCF"));
+ readGenotypesVCF(filename);
+ } else if (fileformat == sam) {
+ //NOTE(review): a file reported as 'sam' is handled as BED here -- presumably how htslib classifies tab-delimited text; confirm
+ vrb.bullet("File format detected: BED");
+ readGenotypesBED(filename);
+ } else vrb.error("File format not supported!");
+}
+
+//Reads bi-allelic variants from a VCF/BCF file into the genotype_* vectors, using dosages (DS) when present and the sum of the two GT alleles otherwise.
+void cis_data::readGenotypesVCF(string fvcf) {
+ int n_includedG = 0;
+ int n_excludedG_mult = 0, n_excludedG_void = 0, n_excludedG_user = 0;
+ int n_includedS = 0;
+ vector < int > mappingS; //mappingS[i] = findSample() result for the i-th sample of the VCF header; negative values are skipped
+
+ //Opening files
+ bcf_srs_t * sr = bcf_sr_init();
+ if ( regionGenotype.chr != "NA"){
+ vrb.bullet("target region [" + regionGenotype.get() + "]");
+ if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
+ }
+ if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+ switch (sr->errnum) {
+ case not_bgzf: vrb.error("File not compressed with bgzip!"); break;
+ case idx_load_failed: vrb.error("Impossible to load index file!"); break;
+ case file_type_error: vrb.error("File format not detected by htslib!"); break;
+ default : vrb.error("Unknown error!");
+ }
+ }
+
+ //Sample processing
+ int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+ for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+ mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+ if (mappingS.back() >= 0) n_includedS++;
+ }
+ unsigned int linecount=0;
+ //Read genotype data (gt_arr, ds_arr and sl_arr are buffers allocated by htslib on demand and released in the Finalize section)
+ int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+ float * ds_arr = NULL;
+ bcf1_t * line;
+ while(bcf_sr_next_line (sr)) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ line = bcf_sr_get_line(sr, 0);
+ if (line->n_allele == 2) { //bi-allelic records only; others are counted as multi-allelic
+ ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+ nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+ if (nds == n_samples || ngt == 2*n_samples) { //needs one DS value per sample or two GT alleles per sample
+ bcf_unpack(line, BCF_UN_STR);
+ string sid = string(line->d.id);
+ string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
+ int pos = line->pos + 1; //htslib stores 0-based positions; kept 1-based here
+ if (filter_genotype.check(sid) && filter_position.check(chr + "_" + stb.str(pos))) {
+ genotype_id.push_back(sid);
+ genotype_chr.push_back(chr);
+ string genotype_ref = string(line->d.allele[0]);
+ genotype_start.push_back(pos);
+ nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+ if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]); //NOTE(review): nsl_arr is the buffer size kept by htslib, nsl the number of values read -- confirm that testing nsl_arr is intended
+ else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1); //no usable END: end = start + length of REF - 1
+ genotype_val.push_back(vector < float > (sample_count, 0.0));
+
+ for(int i = 0 ; i < n_samples ; i ++) {
+ if (mappingS[i] >= 0) {
+ if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
+ else {
+ if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
+ else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
+ }
+ }
+ }
+ n_includedG++;
+ } else n_excludedG_user ++;
+ } else n_excludedG_void ++;
+ } else n_excludedG_mult ++;
+ }
+
+ //Finalize
+ free(gt_arr);
+ free(ds_arr);
+ free(sl_arr); //buffer filled by bcf_get_info_int32 for the END tag; was previously leaked
+ bcf_sr_destroy(sr);
+ genotype_count = n_includedG;
+ vrb.bullet(stb.str(n_includedG) + " variants included");
+ if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+ if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+ if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+ if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
+}
+
+//Reads genotypes from a tabix-indexed BED file (columns: chr, start, end, id, then one value per sample) into the genotype_* vectors.
+void cis_data::readGenotypesBED(string fbed) {
+ int n_includedG = 0;
+ int n_excludedG_user = 0;
+ int n_includedS = 0;
+ int n_excludedS = 0;
+ int n_missingS = 0;
+ vector < int > mappingS; //mappingS[k] = internal sample index for file column k+4, negative when that column is ignored
+
+ //Opening files
+ htsFile *fp = hts_open(fbed.c_str(),"r");
+ if (!fp) vrb.error("Cannot open file!");
+ tbx_t * tbx = tbx_index_load(fbed.c_str());
+ if (!tbx) vrb.error("Cannot load index file!");
+ kstring_t str = {0,0,0};
+ if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+ //Process sample names
+ vector < string > tokens;
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 5) vrb.error("Incorrect number of columns!");
+ for (int i0 = 4 ; i0 < tokens.size() ; i0 ++) {
+ string sid = tokens[i0];
+ if (filter_sample.check(sid)) {
+ mappingS.push_back(findSample(sid));
+ if (mappingS.back() >= 0) n_includedS ++;
+ else n_missingS ++;
+ } else {
+ mappingS.push_back(-1);
+ n_excludedS ++;
+ }
+ }
+ vrb.bullet(stb.str(n_includedS) + " samples included");
+ if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
+ if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
+ if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");
+
+ unsigned int linecount = 0;
+
+ //Jump to interesting region
+ if (regionGenotype.chr != "NA"){
+ hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
+ vrb.bullet("target region [" + regionGenotype.get() + "]");
+ if (!itr) vrb.error("Cannot jump to region!");
+ while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 5 || tokens.size() > mappingS.size() + 4) vrb.error("Incorrect number of columns!"); //a row wider than the header would index mappingS out of range
+ if (filter_genotype.check(tokens[3])) {
+ genotype_id.push_back(tokens[3]);
+ genotype_chr.push_back(tokens[0]);
+ genotype_start.push_back(atoi(tokens[1].c_str()) + 1); //stored start is the file's start column + 1
+ genotype_end.push_back(atoi(tokens[2].c_str()));
+ genotype_val.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 4 ; t < tokens.size() ; t ++) {
+ if (mappingS[t-4] >= 0) {
+ if (tokens[t] == "NA") genotype_val.back()[mappingS[t-4]] = bcf_float_missing;
+ else genotype_val.back()[mappingS[t-4]] = atof(tokens[t].c_str());
+ }
+ }
+ n_includedG++;
+ } else n_excludedG_user ++;
+ }
+ tbx_itr_destroy(itr);
+ }else{
+ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 5 || tokens.size() > mappingS.size() + 4) vrb.error("Incorrect number of columns!"); //same guard as in the region branch
+ if (filter_genotype.check(tokens[3])) {
+ genotype_id.push_back(tokens[3]);
+ genotype_chr.push_back(tokens[0]);
+ genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+ genotype_end.push_back(atoi(tokens[2].c_str()));
+ genotype_val.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 4 ; t < tokens.size() ; t ++) {
+ if (mappingS[t-4] >= 0) {
+ if (tokens[t] == "NA") genotype_val.back()[mappingS[t-4]] = bcf_float_missing;
+ else genotype_val.back()[mappingS[t-4]] = atof(tokens[t].c_str());
+ }
+ }
+ n_includedG++;
+ } else n_excludedG_user ++;
+ }
+ }
+
+
+ //Finalize & verbose
+ free(str.s); //line buffer grown by hts_getline/tbx_itr_next; was previously leaked
+ tbx_destroy(tbx);
+ if (hts_close(fp)) vrb.error("Cannot properly close file!");
+ genotype_count = n_includedG;
+ vrb.bullet(stb.str(n_includedG) + " variants included");
+ if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+ if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
+}
diff --git a/src/mode_cis/cis_read_phenotypes.cpp b/src/mode_cis/cis_read_phenotypes.cpp
new file mode 100644
index 0000000..c26a757
--- /dev/null
+++ b/src/mode_cis/cis_read_phenotypes.cpp
@@ -0,0 +1,157 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+//Reads phenotypes from a tabix-indexed BED file (columns: chr, start, end, id, group, strand, then one value per sample) into the phenotype_* vectors.
+void cis_data::readPhenotypes(string fbed) {
+ int n_includedS = 0;
+ int n_includedP = 0;
+ int n_excludedP = 0;
+ int n_negativeStrd = 0;
+ vector < int > mappingS; //mappingS[k] = findSample() result for file column k+6; negative values are skipped
+
+ //Open BED file
+ vrb.title("Reading phenotype data in [" + fbed + "]");
+ htsFile *fp = hts_open(fbed.c_str(),"r");
+ if (!fp) vrb.error("Cannot open file");
+ tbx_t *tbx = tbx_index_load(fbed.c_str());
+ if (!tbx) vrb.error("Cannot open index file");
+ kstring_t str = {0,0,0};
+ if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+ //Process sample names
+ vector < string > tokens;
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+ for (int t = 6 ; t < tokens.size() ; t ++) {
+ mappingS.push_back(findSample(tokens[t]));
+ if (mappingS.back() >= 0) n_includedS++;
+ }
+
+ //Read phenotypes
+ unsigned int linecount =0;
+ if (regionPhenotype.chr != "NA"){
+ hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
+ vrb.bullet("target region [" + regionPhenotype.get() + "]");
+ if (!itr) vrb.error("Cannot jump to region!");
+ //Read data
+ while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 7 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!"); //a row wider than the header would index mappingS out of range
+ if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) { //filter on the id without grouping, on the group name otherwise
+ phenotype_id.push_back(tokens[3]);
+ phenotype_chr.push_back(tokens[0]);
+ phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); //stored start is the file's start column + 1
+ phenotype_end.push_back(atoi(tokens[2].c_str()));
+ if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ phenotype_neg.push_back(tokens[5] == "-");
+ if (phenotype_neg.back()) n_negativeStrd ++;
+ phenotype_val.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 6 ; t < tokens.size() ; t ++) {
+ if (mappingS[t-6] >= 0) {
+ if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+ else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+ }
+ }
+ n_includedP++;
+ } else n_excludedP ++;
+ }
+ tbx_itr_destroy(itr);
+ }else{
+ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ stb.split(string(str.s), tokens);
+ if (str.l && str.s[0] != tbx->conf.meta_char) { //skip empty lines and lines starting with the index's meta character
+ if (tokens.size() < 7 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!"); //same guard as in the region branch
+ if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
+ phenotype_id.push_back(tokens[3]);
+ phenotype_chr.push_back(tokens[0]);
+ phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+ phenotype_end.push_back(atoi(tokens[2].c_str()));
+ if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ phenotype_neg.push_back(tokens[5] == "-");
+ if (phenotype_neg.back()) n_negativeStrd ++;
+ phenotype_val.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 6 ; t < tokens.size() ; t ++) {
+ if (mappingS[t-6] >= 0) {
+ if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+ else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+ }
+ }
+ n_includedP++;
+ } else n_excludedP ++;
+ }
+ }
+ }
+
+ //Finalize & verbose
+ free(str.s); //line buffer grown by hts_getline/tbx_itr_next; was previously leaked
+ tbx_destroy(tbx);
+ if (hts_close(fp)) vrb.error("Cannot properly close file");
+ phenotype_count = phenotype_id.size();
+ vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+ if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+ if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
+ if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
+}
+
+//Scans a phenotype BED file and records only the annotation columns (id, chr, start, end, group, strand); no per-sample values are read.
+void cis_data::scanPhenotypes(string fbed) {
+ int n_includedP = 0, n_excludedP = 0, n_negativeStrd = 0;
+
+ //Open BED file
+ vrb.title("Scanning phenotype data in [" + fbed + "]");
+ htsFile *fp = hts_open(fbed.c_str(),"r");
+ if (!fp) vrb.error("Cannot open file");
+ tbx_t * tbx = tbx_index_load(fbed.c_str());
+ if (!tbx) vrb.error("Cannot open index file");
+
+ //Read header
+ kstring_t str = {0,0,0};
+ if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line"); //hts_getline signals failure with a negative value, so test <= 0 as readPhenotypes does
+
+ //Scan file
+ vector < string > tokens;
+ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+ if (str.l && str.s[0] != tbx->conf.meta_char) { //skip empty lines and lines starting with the index's meta character
+ stb.split(string(str.s), tokens);
+ if (tokens.size() < 6) vrb.error("Incorrect number of columns!"); //tokens[5] (strand) is read below, so six columns are required
+ if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
+ phenotype_id.push_back(tokens[3]);
+ phenotype_chr.push_back(tokens[0]);
+ phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); //stored start is the file's start column + 1
+ phenotype_end.push_back(atoi(tokens[2].c_str()));
+ if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ phenotype_neg.push_back(tokens[5] == "-");
+ if (phenotype_neg.back()) n_negativeStrd ++;
+ n_includedP++;
+ } else n_excludedP ++;
+ }
+ }
+
+ //Finalize & verbose
+ free(str.s); //line buffer grown by hts_getline; was previously leaked
+ tbx_destroy(tbx);
+ if (hts_close(fp)) vrb.error("Cannot properly close file");
+ phenotype_count = phenotype_id.size();
+ vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+ if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+ if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
+ if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
+}
diff --git a/src/mode_cis/cis_read_thresholds.cpp b/src/mode_cis/cis_read_thresholds.cpp
new file mode 100644
index 0000000..990e951
--- /dev/null
+++ b/src/mode_cis/cis_read_thresholds.cpp
@@ -0,0 +1,57 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "cis_data.h"
+
+//Reads per-phenotype (or per-group) nominal thresholds from fres (column 1: phenotype or group id, column 2: threshold or NA) into phenotype_threshold.
+void cis_data::readThresholds(string fres) {
+ string buffer; vector < string > tokens;
+
+ //1.0 Allocation
+ phenotype_threshold = vector < double > (phenotype_count, -1); //-1 is kept for phenotypes whose threshold is listed as NA
+ vector < bool > phenotype_mask = vector < bool > (phenotype_count, false); //true once a line of fres matched the phenotype
+
+ //2.0 Read results
+ vrb.title("Reading nominal thresholds in [" + fres + "]");
+ input_file fdr(fres);
+ if (fdr.fail()) vrb.error("Cannot open file!");
+
+ while (getline(fdr, buffer)) {
+ stb.split(buffer, tokens);
+ if (tokens.size() < 2) vrb.error("Incorrect number of columns!");
+ vector < int> phenotype_idx; //every phenotype matched by this line: all members of the group in group mode, the phenotype with that id otherwise
+
+ if (grp_mode != GRP_NONE) {
+ for (int p = 0 ; p < phenotype_count ; p ++) if (phenotype_grp[p] == tokens[0]) phenotype_idx.push_back(p);
+ } else {
+ for (int p = 0 ; p < phenotype_count; p ++) if (phenotype_id[p] == tokens[0]) phenotype_idx.push_back(p);
+ }
+
+ for (int i = 0 ; i < phenotype_idx.size() ; i++) {
+ if (tokens[1] != "NA") phenotype_threshold[phenotype_idx[i]] = atof(tokens[1].c_str());
+ phenotype_mask[phenotype_idx[i]] = true; //an NA entry still counts as "seen"
+ }
+ }
+ fdr.close();
+
+ //3.0 Make sure that each phenotype was matched by a line of the threshold file
+ int n_set= 0, n_unset = 0;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (phenotype_mask[p]) n_set ++;
+ else n_unset ++;
+ }
+ vrb.bullet("#phenotypes set = " + stb.str(n_set));
+ if (n_unset > 0) vrb.error("Cannot find thresholds for " + stb.str(n_unset) + " phenotypes!");
+}
diff --git a/src/mode_correct/correct_data.h b/src/mode_correct/correct_data.h
new file mode 100644
index 0000000..63fa01f
--- /dev/null
+++ b/src/mode_correct/correct_data.h
@@ -0,0 +1,66 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _CORRECT_DATA_H
+#define _CORRECT_DATA_H
+
+//ANALYSIS MODES
+#define CORRECT_VCF 1
+#define CORRECT_BED 2
+
+//INCLUDES
+#include "../common/data.h"
+
+class correct_data : public data { //state and operations of the [correct] mode; instantiated by correct_main()
+public:
+ //PARAMETERS
+ unsigned int mode; //NOTE(review): presumably one of CORRECT_VCF / CORRECT_BED -- confirm
+ bool normalize; //NOTE(review): presumably enables the normal transformation of the data -- confirm against correct_main()
+ bool residualize; //NOTE(review): presumably enables regressing out the covariates -- confirm
+
+ //COVARIATES
+ int covariate_count; //covariate number
+ vector < vector < string > > covariate_val; //covariate values
+ vector < string > covariate_id; //covariate ids
+ residualizer * covariate_engine; //covariate engine machinery
+
+ //CONSTRUCTOR / DESTRUCTOR
+ correct_data();
+ ~correct_data();
+ void clear();
+
+ //INITIALIZE
+ void initializeResidualizer();
+
+ //READ DATA
+ void readCovariates(string); //argument: NOTE(review) presumably the covariate file name -- confirm
+
+ //GENOTYPE & PHENOTYPE MANAGEMENT (each operation is overloaded for a vector and for a raw float buffer)
+ void imputeMissing(vector < float > &);
+ void normalTransform(vector < float > &);
+ void imputeMissing(float *);
+ void normalTransform(float *);
+
+ //PROCESS DATA
+ void processBED(string, string); //NOTE(review): the two strings are presumably input and output file names -- confirm
+ void processVCF(string, string);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void correct_main(vector < string > &);
+
+#endif
diff --git a/src/mode_correct/correct_main.cpp b/src/mode_correct/correct_main.cpp
new file mode 100644
index 0000000..ab9184a
--- /dev/null
+++ b/src/mode_correct/correct_main.cpp
@@ -0,0 +1,100 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "correct_data.h"
+
+void correct_main(vector < string > & argv) {
+    // Runs the "correct" mode: parses argv, loads samples and covariates, then rewrites the --bed or --vcf input into --out.
+    correct_data D;
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF/BED format.")
+        ("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
+        ("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_param ("\x1B[32mParameters\33[0m");
+    opt_param.add_options()
+        ("normal", "Normal transformation of the data.");
+
+    D.option_descriptions.add(opt_files).add(opt_param);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [correct] command line :" << string(e.what()) << endl;
+        exit(EXIT_FAILURE); //a malformed command line must not look like a successful run
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("CORRECTING GENOTYPES OR PHENOTYPES FOR COVARIATES");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if ((D.options.count("vcf") + D.options.count("bed")) != 1) vrb.error("One input file has to be specified using either --vcf [file.vcf] or --bed [file.bed]");
+    if ((D.options.count("cov") + D.options.count("normal")) < 1) vrb.error("At least one data transformation has to be specified with --cov [file.cov] and/or --normal");
+    if (!D.options.count("out")) vrb.error("Output file has to be specified using --out [file.out]"); //step 7 reads it unconditionally
+
+    //---------
+    // 5. MODES
+    //---------
+    //MODE1: correcting phenotypes
+    if (D.options.count("bed")) {
+        D.mode = CORRECT_BED;
+        if (D.options.count("cov")) { vrb.bullet("Correct phenotypes using linear model: Phe ~ Cov"); D.residualize = true; }
+        if (D.options.count("normal")) { vrb.bullet("Quantile normal transform phenotypes"); D.normalize = true; }
+    }
+    //MODE2: correcting genotypes
+    if (D.options.count("vcf")) {
+        D.mode = CORRECT_VCF;
+        if (D.options.count("cov")) { vrb.bullet("Correct genotypes using linear model: Gen ~ Cov"); D.residualize = true; }
+        if (D.options.count("normal")) { vrb.bullet("Quantile normal transform genotypes"); D.normalize = true; }
+    }
+
+    //---------------------------
+    // 6. READ FILES & INITIALIZE
+    //---------------------------
+    D.processBasicOptions();
+    if (D.options.count("bed")) D.readSampleFromBED(D.options["bed"].as < string > ());
+    if (D.options.count("vcf")) D.readSampleFromVCF(D.options["vcf"].as < string > ());
+    if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ());
+    D.mergeSampleLists();
+    if (D.options.count("cov")) {
+        D.readCovariates(D.options["cov"].as < string > ());
+        D.initializeResidualizer();
+    }
+
+    //----------------
+    // 7. RUN ANALYSIS
+    //----------------
+    if (D.options.count("bed")) D.processBED(D.options["bed"].as < string > (), D.options["out"].as < string > ());
+    if (D.options.count("vcf")) D.processVCF(D.options["vcf"].as < string > (), D.options["out"].as < string > ());
+}
diff --git a/src/mode_correct/correct_management.cpp b/src/mode_correct/correct_management.cpp
new file mode 100644
index 0000000..4504fe2
--- /dev/null
+++ b/src/mode_correct/correct_management.cpp
@@ -0,0 +1,91 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "correct_data.h"
+
+correct_data::correct_data() {
+    // Nothing is loaded yet: no samples, no covariates, no residualizer, no mode, no transformation.
+    sample_count = 0;
+    covariate_count = 0;
+    covariate_engine = NULL;
+    mode = 0;
+    normalize = residualize = false;
+}
+
+correct_data::~correct_data() {
+    // Reset the scalar state, empty the containers, then release the residualizer.
+    normalize = residualize = false;
+    sample_count = 0;
+    covariate_count = 0;
+    sample_id.clear();
+    covariate_id.clear();
+    covariate_val.clear();
+    delete covariate_engine; //deleting a null pointer is a no-op
+    covariate_engine = NULL;
+}
+
+void correct_data::imputeMissing(vector < float > & V) {
+    // Mean imputation in place: every entry equal to bcf_float_missing becomes the mean of the other entries.
+    double total = 0.0; int n_observed = 0;
+    for (size_t i = 0 ; i < V.size() ; ++i) { if (V[i] == bcf_float_missing) continue; total += V[i]; n_observed ++; }
+    total /= n_observed;
+    for (size_t i = 0 ; i < V.size() ; ++i) if (V[i] == bcf_float_missing) V[i] = total;
+}
+
+void correct_data::normalTransform(vector < float > & V) {
+    // Rank-based normal transform of the first sample_count entries of V, in place:
+    // each rank r is shifted to r - 0.5, divided by (largest shifted rank + 0.5), then mapped through qnorm.
+    vector < float > ranks;
+    myranker::rank(V, ranks);
+    double scale = 0;
+    // after this loop, scale holds the largest shifted rank
+    for (int i = 0 ; i < sample_count ; ++i) {
+        ranks[i] -= 0.5;
+        if (scale < ranks[i]) scale = ranks[i];
+    }
+    scale += 0.5;
+    for (int i = 0 ; i < sample_count ; ++i) { ranks[i] /= scale; V[i] = qnorm(ranks[i], 0.0, 1.0, 1, 0); }
+}
+
+void correct_data::imputeMissing(float * V) {
+    // Mean-imputes the sample_count values of V in place. A missing BCF float is a NaN bit pattern, which no == / != test can detect, hence bcf_float_is_missing().
+    double mean = 0.0; int c_mean = 0;
+    for (int s = 0; s < sample_count ; s ++) if (!bcf_float_is_missing(V[s]) && V[s] != bcf_float_missing) { mean += V[s]; c_mean ++; }
+    mean /= c_mean; //NaN when no value is observed
+    for (int s = 0; s < sample_count ; s ++) if (bcf_float_is_missing(V[s]) || V[s] == bcf_float_missing) V[s] = mean;
+}
+
+void correct_data::normalTransform(float * V) {
+    // Rank-based normal transform of the sample_count values in V, in place:
+    // each rank r is shifted to r - 0.5, divided by (largest shifted rank + 0.5), then mapped through qnorm.
+    vector < float > ranks;
+    myranker::rank(V, sample_count, ranks);
+    double scale = 0;
+    // after this loop, scale holds the largest shifted rank
+    for (int i = 0 ; i < sample_count ; ++i) {
+        ranks[i] -= 0.5;
+        if (scale < ranks[i]) scale = ranks[i];
+    }
+    scale += 0.5;
+    for (int i = 0 ; i < sample_count ; ++i) { ranks[i] /= scale; V[i] = qnorm(ranks[i], 0.0, 1.0, 1, 0); }
+}
+
+void correct_data::initializeResidualizer() { //creates covariate_engine and loads covariate_val into it
+    vrb.title("Initialize residualizer");
+    covariate_engine = new residualizer (sample_count);
+    for (int covariate = 0 ; covariate < covariate_count ; ++covariate) covariate_engine->push(covariate_val[covariate]);
+    covariate_engine->build(); //called once, after every covariate has been pushed
+    vrb.bullet("#covariates = " + stb.str(covariate_count));
+}
diff --git a/src/mode_correct/correct_processing.cpp b/src/mode_correct/correct_processing.cpp
new file mode 100644
index 0000000..77534bb
--- /dev/null
+++ b/src/mode_correct/correct_processing.cpp
@@ -0,0 +1,114 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "correct_data.h"
+
+void correct_data::processBED(string fin, string fout) {
+    // Corrects each phenotype of BED file fin (mean imputation, then optional residualization and normal transform)
+    // and writes it to fout with the six annotation columns followed by one value per sample, in sample_id order.
+    int n_includedP = 0, n_excludedP_user = 0;
+    //Open BED file
+    vrb.title("Reading phenotype data in [" + fin + "] and writing [" + fout + "]");
+    output_file fdo (fout.c_str());
+    if (fdo.fail()) vrb.error("Cannot open file for writing!");
+    htsFile *fp = hts_open(fin.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file for reading!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read header!");
+
+    //Map every input sample column to its index in sample_id (negative = sample not retained)
+    vector < int > mappingS;
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (int t = 6 ; t < tokens.size() ; t ++) mappingS.push_back(findSample(tokens[t]));
+
+    //Header: as many annotation columns as the data rows (6), then the samples in the order the values are written
+    fdo << "#chr\tstart\tend\tid\t" << tokens[4] << "\t" << tokens[5];
+    for (int s = 0 ; s < sample_count ; s++) fdo << "\t" << sample_id[s];
+    fdo << endl;
+
+    //Read phenotypes
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (!str.l) continue; //blank line: nothing to split
+        stb.split(string(str.s), tokens);
+        if (tokens.size() < 7 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+        if (filter_phenotype.check(tokens[3])) {
+            vector < float > values = vector < float > (sample_count, 0.0);
+            fdo << tokens[0] << "\t" << tokens[1] << "\t" << tokens[2] << "\t" << tokens[3] << "\t" << tokens[4] << "\t" << tokens[5];
+            for (int t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) values[mappingS[t-6]] = ((tokens[t] != "NA")?stof(tokens[t]):bcf_float_missing);
+            imputeMissing(values);
+            if (residualize) covariate_engine->residualize(values);
+            if (normalize) normalTransform(values);
+            for (int s = 0 ; s < sample_count ; s++) fdo << "\t" << values[s];
+            fdo << endl; n_includedP ++ ;
+        } else n_excludedP_user ++;
+    }
+    //Finalize & verbose
+    vrb.bullet(stb.str(n_includedP) + " phenotypes");
+    if (n_excludedP_user > 0) vrb.bullet(stb.str(n_excludedP_user) + " phenotypes excluded by user");
+    free(str.s); hts_close(fp); //the line buffer was allocated by hts_getline
+    fdo.close();
+}
+
+void correct_data::processVCF(string fin, string fout) {
+    // Corrects the DS dosages of every bi-allelic variant of fin (imputation, optional residualization / normal transform) and writes them to fout.
+    int n_includedG = 0, n_excludedG_user = 0, n_excludedG_miss = 0, n_excludedG_mult = 0;
+    vrb.title("Reading genotype data in [" + fin + "] and writing [" + fout + "]");
+    bcf_sweep_t * sw = bcf_sweep_init(fin.c_str());
+    if (!sw) vrb.error("Cannot open file for reading [" + fin + "]");
+    bcf_hdr_t * hdr_old = bcf_sweep_hdr(sw); //belongs to sw, must not be destroyed here
+    if (!hdr_old) vrb.error("Cannot read header!");
+    htsFile * fp = hts_open(fout.c_str(),"wz");
+    if (!fp) vrb.error("Cannot open file for writing [" + fout + "]");
+
+    //Update sample ids in hdr: keep only the retained samples, in sample_id order
+    int * imap = (int *) malloc(sample_count * sizeof(int));
+    char ** samples = (char **) malloc(sample_count * sizeof(char *));
+    for (int i = 0 ; i < sample_count ; i ++) {
+        int idx = bcf_hdr_id2int(hdr_old, BCF_DT_SAMPLE, sample_id[i].c_str());
+        if (idx < 0) vrb.error("Cannot find sample [" + sample_id[i] + "] in header!"); //a negative index must not be dereferenced
+        samples[i] = hdr_old->samples[idx];
+    }
+    bcf_hdr_t * hdr_new = bcf_hdr_subset(hdr_old, sample_count, samples, imap);
+    bcf_hdr_write(fp, hdr_new);
+
+    //Read the VCF
+    bcf1_t * rec = NULL; //bcf_sweep_fwd hands out records that belong to sw, so none is allocated here
+    int mDS = 0; float * vDS = NULL;
+    while ( (rec = bcf_sweep_fwd(sw)) ) {
+        bcf_subset(hdr_old, rec, sample_count, imap);
+        bcf_unpack(rec, BCF_UN_STR);
+        if (rec->n_allele != 2) n_excludedG_mult++;
+        else if (bcf_get_format_float(hdr_new, rec, "DS", &vDS, &mDS) != sample_count) n_excludedG_miss++;
+        else if (!filter_genotype.check(string(rec->d.id))) n_excludedG_user++;
+        else {
+            imputeMissing(vDS);
+            if (residualize) covariate_engine->residualize(vDS);
+            if (normalize) normalTransform(vDS);
+            bcf_update_format_float(hdr_new, rec, "DS", vDS, sample_count);
+            bcf_write1(fp, hdr_new, rec);
+            n_includedG ++;
+        }
+    }
+    free(vDS); free(samples); free(imap);
+    bcf_sweep_destroy(sw); //also releases hdr_old and the sweep's records
+    bcf_hdr_destroy(hdr_new);
+    if (hts_close(fp)) vrb.error("Cannot close properly file!");
+    vrb.bullet(stb.str(n_includedG) + " variants corrected");
+    if (n_excludedG_user) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_miss) vrb.bullet(stb.str(n_excludedG_miss) + " missing DS variants excluded");
+    if (n_excludedG_mult) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+}
diff --git a/src/mode_correct/correct_read_covariates.cpp b/src/mode_correct/correct_read_covariates.cpp
new file mode 100644
index 0000000..aff6c16
--- /dev/null
+++ b/src/mode_correct/correct_read_covariates.cpp
@@ -0,0 +1,52 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "correct_data.h"
+
+void correct_data::readCovariates(string fcov) {
+    // Loads the covariates of fcov that pass filter_covariate into covariate_val ([covariate][sample], kept as text)
+    // and sets covariate_count. Samples that are absent from the file keep the default value "0".
+    string buffer; vector < string > tokens;
+    vector < int > mappingS;
+    int n_includedC = 0, n_excludedC = 0;
+
+    vrb.title("Reading covariates in [" + fcov + "]");
+    input_file fd (fcov);
+    if (fd.fail()) vrb.error("Cannot open file!");
+
+    //Read samples: map every column after the first to its sample index (negative = not retained)
+    getline(fd, buffer);
+    if (buffer.size() == 0) vrb.error("No header line detected!");
+    stb.split(buffer, tokens);
+    for (int t = 1 ; t < tokens.size() ; t ++) mappingS.push_back(findSample(tokens[t]));
+
+    //Read covariates
+    while (getline(fd, buffer)) {
+        stb.split(buffer, tokens);
+        //a row wider than the header would index past the end of mappingS
+        if (tokens.size() < 2 || tokens.size() > mappingS.size() + 1) vrb.error("Incorrect number of columns!");
+        if (filter_covariate.check(tokens[0])) {
+            covariate_val.push_back(vector < string > (sample_count, "0"));
+            for (int t = 1 ; t < tokens.size() ; t ++) if (mappingS[t-1] >= 0) covariate_val.back()[mappingS[t-1]] = tokens[t];
+            n_includedC ++;
+        } else n_excludedC ++;
+    }
+
+    //Finalise
+    covariate_count = n_includedC;
+    vrb.bullet(stb.str(n_includedC) + " covariate(s) included");
+    if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariate(s) excluded by user");
+    fd.close();
+}
diff --git a/src/mode_extract/extract_data.h b/src/mode_extract/extract_data.h
new file mode 100644
index 0000000..ccf97b3
--- /dev/null
+++ b/src/mode_extract/extract_data.h
@@ -0,0 +1,56 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _EXTRACT_DATA_H
+#define _EXTRACT_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+class extract_data : public data { // state and operations of the "extract" mode, on top of the shared `data` base class
+public:
+
+ //REGIONS
+ genomic_region regionData; // filled from --region by extract_main; readBED/readVCF treat chr == "NA" as "no region"
+
+ //DATA (parallel vectors: entry v of each one describes the same variable)
+ vector < string > variable_id; // phenotype, variant or covariate id
+ vector < string > variable_chr; // chromosome; "NA" for covariates
+ vector < int > variable_start; // BED start + 1 or VCF pos + 1; -1 for covariates
+ vector < int > variable_end; // BED end, or VCF END / start + REF length - 1; -1 for covariates
+ vector < vector < string > > variable_val; // values as text, indexed [variable][sample]
+
+ //CONSTRUCTOR / DESTRUCTOR
+ extract_data() {}
+ ~extract_data() {}
+ void clear() { variable_val.clear(); } // empties only the values; the id/chr/start/end vectors are left as they are
+
+ //READ & WRITE DATA
+ void readBED(string); // argument: path of a tabix-indexed BED file
+ void readVCF(string); // argument: path of a VCF/BCF file
+ void readCOV(string); // argument: path of a covariate text file
+ void writeOUT(string); // argument: output prefix (".header.txt" and ".content.txt.gz" are appended)
+
+ //DATA MANAGMENT
+ void imputeMissing(); // replaces "NA" values by the per-variable mean
+
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void extract_main(vector < string > &);
+
+#endif
diff --git a/src/mode_extract/extract_main.cpp b/src/mode_extract/extract_main.cpp
new file mode 100644
index 0000000..4f1dd41
--- /dev/null
+++ b/src/mode_extract/extract_main.cpp
@@ -0,0 +1,88 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "extract_data.h"
+
+void extract_main(vector < string > & argv) {
+    // Runs the "extract" mode: parses argv, reads the requested BED/VCF/covariate data, imputes missing values and writes them under the --out prefix.
+    extract_data D;
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
+        ("bed", boost::program_options::value< vector < string > >()->multitoken(), "Phenotypes in BED format.")
+        ("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+    opt_parallel.add_options()
+        ("region", boost::program_options::value< string >(), "Region of interest.");
+
+    D.option_descriptions.add(opt_files).add(opt_parallel);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [extract] command line :" << string(e.what()) << endl;
+        exit(EXIT_FAILURE); //a malformed command line must not look like a successful run
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("DATA EXTRACTION");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if ((D.options.count("vcf") + D.options.count("bed") + D.options.count("cov")) == 0) vrb.error("At least one input file has to be specified using either --vcf [file.vcf], --bed [file.bed] or --cov [file.txt]");
+    if (!D.options.count("out")) vrb.error("Output file has to be specified using --out [file.out]"); //writeOUT reads it unconditionally
+    if (!D.options.count("region")) vrb.warning("Please use --region to speed up data extraction for phenotype and genotype data!");
+
+    //--------------
+    // 5. SET REGION
+    //--------------
+    if (D.options.count("region") && !D.regionData.parse(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
+
+    //--------------
+    // 6. READ FILES
+    //--------------
+    D.processBasicOptions();
+    vector < string > bed_list; //stays empty when --bed is not given (only --vcf and/or --cov)
+    if (D.options.count("bed")) bed_list = D.options["bed"].as < vector < string > > ();
+    for (int b = 0 ; b < bed_list.size() ; b ++) D.readSampleFromBED(bed_list[b]);
+    if (D.options.count("vcf")) D.readSampleFromVCF(D.options["vcf"].as < string > ());
+    if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ());
+    D.mergeSampleLists();
+
+    for (int b = 0 ; b < bed_list.size() ; b ++) D.readBED(bed_list[b]);
+    if (D.options.count("vcf")) D.readVCF(D.options["vcf"].as < string > ());
+    if (D.options.count("cov")) D.readCOV(D.options["cov"].as < string > ());
+
+    D.imputeMissing();
+    D.writeOUT(D.options["out"].as < string > ());
+}
diff --git a/src/mode_extract/extract_managment.cpp b/src/mode_extract/extract_managment.cpp
new file mode 100644
index 0000000..2ba312c
--- /dev/null
+++ b/src/mode_extract/extract_managment.cpp
@@ -0,0 +1,39 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "extract_data.h"
+
+void extract_data::imputeMissing() {
+    // Replaces every "NA" entry of variable_val by the mean of that variable's other entries and reports both counts.
+    unsigned int n_missing = 0, n_nmissing = 0;
+    vrb.title("Impute missing data with mean");
+    for (int v = 0; v < variable_val.size() ; v ++) {
+        vector < string > & row = variable_val[v];
+        double mean = 0.0; int c_mean = 0;
+        for (int s = 0; s < sample_count; s ++) {
+            if (row[s] == "NA") continue;
+            mean += atof(row[s].c_str());
+            c_mean ++;
+        }
+        mean /= c_mean;
+        for (int s = 0; s < sample_count ; s ++) {
+            if (row[s] != "NA") { n_nmissing ++; continue; }
+            row[s] = stb.str(mean);
+            n_missing ++;
+        }
+    }
+    vrb.bullet("#non_missing_data_points = " + stb.str(n_nmissing));
+    vrb.bullet("#imputed_data_points = " + stb.str(n_missing) + " (=" + stb.str(n_missing * 100.0 / (n_nmissing + n_missing)) + "%)");
+}
diff --git a/src/mode_extract/extract_read_data.cpp b/src/mode_extract/extract_read_data.cpp
new file mode 100644
index 0000000..291b702
--- /dev/null
+++ b/src/mode_extract/extract_read_data.cpp
@@ -0,0 +1,198 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "extract_data.h"
+
+void extract_data::readBED(string fbed) {
+    // Appends one variable per phenotype of the tabix-indexed BED file fbed that passes filter_phenotype (only regionData when a region is set).
+    int n_includedP = 0, n_excludedP = 0;
+    vector < int > mappingS;
+
+    //Open BED file
+    vrb.title("Reading phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t *tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (int t = 6 ; t < tokens.size() ; t ++) mappingS.push_back(findSample(tokens[t]));
+
+    //Read phenotypes
+    if (regionData.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionData.get().c_str());
+        vrb.bullet("target region [" + regionData.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        //Read data; a row wider than the header would index past the end of mappingS
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 7 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+            if (filter_phenotype.check(tokens[3])) {
+                variable_id.push_back(tokens[3]);
+                variable_chr.push_back(tokens[0]);
+                variable_start.push_back(atoi(tokens[1].c_str()) + 1);
+                variable_end.push_back(atoi(tokens[2].c_str()));
+                variable_val.push_back(vector < string > (sample_count));
+                for (int t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) variable_val.back()[mappingS[t-6]] = tokens[t];
+                n_includedP++;
+            } else n_excludedP ++;
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            stb.split(string(str.s), tokens);
+            if (str.l && str.s[0] != tbx->conf.meta_char) {
+                if (tokens.size() < 7 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+                if (filter_phenotype.check(tokens[3])) {
+                    variable_id.push_back(tokens[3]);
+                    variable_chr.push_back(tokens[0]);
+                    variable_start.push_back(atoi(tokens[1].c_str()) + 1);
+                    variable_end.push_back(atoi(tokens[2].c_str()));
+                    variable_val.push_back(vector < string > (sample_count));
+                    for (int t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) variable_val.back()[mappingS[t-6]] = tokens[t];
+                    n_includedP++;
+                } else n_excludedP ++;
+            }
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s); //line buffer allocated by hts_getline / tbx_itr_next
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (n_includedP == 0) vrb.warning("Cannot find phenotypes to extract!");
+}
+
+void extract_data::readVCF(string fvcf) {
+    // Appends one variable per bi-allelic variant of fvcf that passes filter_genotype (only regionData when a region is set).
+    // Values are the DS dosages when the record has one per sample, otherwise the GT allele counts; missing values become "NA".
+    int n_includedG = 0, n_excludedG_mult = 0, n_excludedG_void = 0, n_excludedG_user = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    vrb.title("Reading genotype data in [" + fvcf + "]");
+    bcf_srs_t * sr = bcf_sr_init();
+    if (regionData.chr != "NA"){
+        vrb.bullet("target region [" + regionData.get() + "]");
+        if (bcf_sr_set_regions(sr, regionData.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
+    }
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        switch (sr->errnum) {
+            case not_bgzf: vrb.error("File not compressed with bgzip!");
+            case idx_load_failed: vrb.error("Impossible to load index file!");
+            case file_type_error: vrb.error("File format not detected by htslib!");
+            default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+
+    //Read genotype data
+    unsigned int linecount=0;
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                if (filter_genotype.check(sid)) {
+                    variable_id.push_back(sid);
+                    variable_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
+                    string genotype_ref = string(line->d.allele[0]);
+                    variable_start.push_back(line->pos + 1);
+                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    if (nsl == 1) variable_end.push_back(sl_arr[0]); //test the returned count, not the buffer capacity
+                    else variable_end.push_back(variable_start.back() + genotype_ref.size() - 1);
+                    variable_val.push_back(vector < string > (sample_count));
+                    for(int i = 0 ; i < n_samples ; i ++) {
+                        if (mappingS[i] >= 0) {
+                            if (nds == n_samples) variable_val.back()[mappingS[i]] = bcf_float_is_missing(ds_arr[i]) ? string("NA") : stb.str(ds_arr[i]);
+                            else {
+                                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) variable_val.back()[mappingS[i]] = "NA";
+                                else variable_val.back()[mappingS[i]] = stb.str(bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]));
+                            }
+                        }
+                    }
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize
+    free(gt_arr);
+    free(ds_arr);
+    free(sl_arr);
+    bcf_sr_destroy(sr);
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    if (n_includedG == 0) vrb.leave("Cannot find genotypes to extract!");
+}
+
+void extract_data::readCOV(string fcov) {
+    // Appends one variable per covariate of fcov that passes filter_covariate; chr is "NA" and start/end are -1.
+    string buffer;
+    vector < string > str;
+    int n_includedC = 0, n_excludedC = 0;
+    vector < int > mappingS;
+
+    vrb.title("Reading covariates in [" + fcov + "]");
+    input_file fd (fcov);
+    if (fd.fail()) vrb.error("Cannot open file!");
+
+    //Read samples
+    getline(fd, buffer);
+    if (buffer.size() == 0) vrb.error("No header line detected!");
+    stb.split(buffer, str );
+    for (int t = 1 ; t < str.size() ; t ++) mappingS.push_back(findSample(str[t]));
+
+    //Read covariates; a row wider than the header would index past the end of mappingS
+    while(getline(fd, buffer)) {
+        stb.split(buffer, str);
+        if (str.size() < 2 || str.size() > mappingS.size() + 1) vrb.error("Incorrect number of columns!");
+        if (filter_covariate.check(str[0])) {
+            variable_id.push_back(str[0]);
+            variable_chr.push_back(string("NA"));
+            variable_start.push_back(-1);
+            variable_end.push_back(-1);
+            variable_val.push_back(vector < string > (sample_count));
+            for (int t = 1 ; t < str.size() ; t ++) if (mappingS[t-1] >= 0) variable_val.back()[mappingS[t-1]] = str[t];
+            n_includedC ++;
+        } else n_excludedC ++;
+    }
+
+    //Finalise
+    vrb.bullet(stb.str(n_includedC) + " covariates included");
+    if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariates excluded");
+    fd.close();
+}
diff --git a/src/mode_extract/extract_write.cpp b/src/mode_extract/extract_write.cpp
new file mode 100644
index 0000000..f092e8c
--- /dev/null
+++ b/src/mode_extract/extract_write.cpp
@@ -0,0 +1,45 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "extract_data.h"
+
+void extract_data::writeOUT(string fout) {
+    // Writes fout.header.txt (one line per variable: id chr start end) and fout.content.txt.gz (one line per sample: id, then one value per variable).
+    //Both streams are checked right after opening so that an unwritable path is reported instead of silently producing nothing
+    string filename_header = fout + ".header.txt";
+    vrb.title("Writing header in [" + filename_header + "]");
+    output_file fdh(filename_header);
+    if (fdh.fail()) vrb.error("Cannot open file for writing!");
+    fdh << "id chr start end" << endl;
+    for (int v = 0 ; v < variable_id.size() ; v ++) fdh << variable_id[v] << " " << variable_chr[v] << " " << variable_start[v] << " " << variable_end[v] << endl;
+    fdh.close();
+    vrb.bullet("#variables = " + stb.str(variable_id.size()));
+
+    string filename_content = fout + ".content.txt.gz";
+    vrb.title("Writing content in [" + filename_content + "]");
+    output_file fdc(filename_content);
+    if (fdc.fail()) vrb.error("Cannot open file for writing!");
+    fdc << "sample";
+    for (int v = 0 ; v < variable_id.size() ; v ++) fdc << " " << variable_id[v];
+    fdc << endl;
+
+    for (int s = 0 ; s < sample_count ; s ++) {
+        fdc << sample_id[s];
+        for (int v = 0 ; v < variable_id.size() ; v ++) fdc << " " << variable_val[v][s];
+        fdc << endl;
+    }
+    fdc.close();
+    vrb.bullet("#samples = " + stb.str(sample_count));
+}
diff --git a/src/mode_fdensity/fdensity_data.h b/src/mode_fdensity/fdensity_data.h
new file mode 100644
index 0000000..8a6042d
--- /dev/null
+++ b/src/mode_fdensity/fdensity_data.h
@@ -0,0 +1,63 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _FDENSITY_DATA_H
+#define _FDENSITY_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+class fdensity_data : public data {    // Data and routines of the fdensity mode
+public:
+    //PARAMETERS (filled from --window and --bin by fdensity_main)
+    int window, bin;
+
+    //Annotations (parallel vectors, one entry per annotation read)
+    int ann_count;                          //Annotation number
+    vector < int > ann_start;               //Annotation start position
+    vector < int > ann_end;                 //Annotation end position
+    vector < string > ann_chr;              //Annotation chromosome
+
+    //TSS (parallel vectors, one entry per record of the QTL file)
+    int tss_count;                          //Number of records
+    vector < string > tss_id;               //Identifier (4th column)
+    vector < string > tss_chr;              //Chromosome (1st column)
+    vector < int > tss_pos;                 //Position (2nd column + 1)
+    vector < bool > tss_neg;                //True when the 6th column is "-"
+
+    //TSS functional neighborhood: one tree per TSS, filled by buildIntervalTrees()
+    vector < IntervalTree < bool > > Itree;
+
+    //CONSTRUCTOR / DESTRUCTOR (scalars start at 0 so that none is ever read uninitialized)
+    fdensity_data() : window(0), bin(0), ann_count(0), tss_count(0) {}
+    ~fdensity_data() {}
+
+    //READ DATA
+    void readAnnotation(string);
+    void readQTL(string);
+
+    //DATA MANAGEMENT
+    void buildIntervalTrees();
+
+    //ANALYSIS
+    void runDensityCalculation(string);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS ************************//
+//***************************************************************//
+void fdensity_main(vector < string > &);
+
+#endif
diff --git a/src/mode_fdensity/fdensity_main.cpp b/src/mode_fdensity/fdensity_main.cpp
new file mode 100644
index 0000000..9d41357
--- /dev/null
+++ b/src/mode_fdensity/fdensity_main.cpp
@@ -0,0 +1,86 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fdensity_data.h"
+
+void fdensity_main(vector < string > & argv) {
+    fdensity_data D;
+    // Entry point of the fdensity mode: parses argv, loads annotations and QTLs, then writes the density profile.
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("bed", boost::program_options::value< string >(), "Functional annotations in BED format.")
+        ("qtl", boost::program_options::value< string >(), "QTL positions.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+    boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+    opt_parameters.add_options()
+        ("window", boost::program_options::value< int >()->default_value(1000000), "Window size arround TSS in bp.")
+        ("bin", boost::program_options::value< int >()->default_value(1000), "Bin size in bp.");
+    D.option_descriptions.add(opt_files).add(opt_parameters);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [fdensity] command line :" << string(e.what()) << endl;
+        exit(EXIT_FAILURE);    // a rejected command line must not report success to the caller
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("ANNOTATION DENSITY ARROUND QTLs");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("qtl")) vrb.error("QTL data needs to be specified with --qtl [file.bed]");
+    if (!D.options.count("bed")) vrb.error("Annotation data needs to be specified with --bed [file.bed]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+
+    //--------------
+    // 5. SET PARAMS
+    //--------------
+    D.window = D.options["window"].as < int > ();
+    D.bin = D.options["bin"].as < int > ();
+    if (D.window <= 0) vrb.error("Window size needs to be a positive number of bp, check --window");
+    if (D.bin <= 0) vrb.error("Bin size needs to be a positive number of bp, check --bin");    // the binning loop advances by bin
+    vrb.bullet("window = " + stb.str(D.options["window"].as < int > ()));
+    vrb.bullet("bin = " + stb.str(D.options["bin"].as < int > ()));
+
+    //---------------------------
+    // 6. READ FILES & INITIALIZE
+    //---------------------------
+    D.processBasicOptions();
+    D.readAnnotation(D.options["bed"].as < string > ());
+    D.readQTL(D.options["qtl"].as < string > ());
+    D.buildIntervalTrees();
+
+    //----------------
+    // 7. RUN ANALYSIS
+    //----------------
+    D.runDensityCalculation(D.options["out"].as < string > ());
+}
+
diff --git a/src/mode_fdensity/fdensity_process.cpp b/src/mode_fdensity/fdensity_process.cpp
new file mode 100644
index 0000000..b34edc6
--- /dev/null
+++ b/src/mode_fdensity/fdensity_process.cpp
@@ -0,0 +1,84 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fdensity_data.h"
+
+void fdensity_data::buildIntervalTrees() {
+    // Fills Itree with one interval tree per QTL/TSS holding the annotations that overlap [tss - window, tss + window],
+    // expressed relative to the TSS position and mirrored when the TSS lies on the negative strand.
+    typedef vector < Interval < bool > > interval_list;
+    vrb.title("Mapping annotations to QTLs ");
+    //1. Index chromosomes by order of first appearance (map::insert keeps the index of an already known chromosome)
+    map < string, int > chr_index;
+    for (int q = 0 ; q < tss_count ; q ++) {
+        chr_index.insert(pair < string, int > (tss_chr[q], chr_index.size()));
+    }
+    unsigned int n_chr = chr_index.size();
+    vrb.bullet("#detected chromosomes in QTL data = " + stb.str(n_chr));
+
+    //2. Group annotations per chromosome; those on chromosomes without any QTL are only counted
+    unsigned int n_skipped = 0;
+    vector < interval_list > per_chr (n_chr);
+    for (int i = 0 ; i < ann_count ; i ++) {
+        map < string, int >::iterator hit = chr_index.find(ann_chr[i]);
+        if (hit != chr_index.end()) per_chr[hit->second].push_back(Interval < bool > (ann_start[i], ann_end[i], true));
+        else n_skipped ++;
+    }
+    vrb.bullet("#annotations NOT mapped to QTL chromosomes = " + stb.str(n_skipped));
+    if ((ann_count - n_skipped) == 0) vrb.error("None of the annotations have been found to be located on the same chromosomes than QTL!");
+    else vrb.bullet("#annotations mapped to QTL chromosomes = " + stb.str(ann_count - n_skipped));
+    //3. One interval tree per chromosome
+    vector < IntervalTree < bool > > chr_tree = vector < IntervalTree < bool > > (n_chr, IntervalTree < bool > ());
+    for (unsigned int c = 0 ; c < n_chr ; c ++) chr_tree[c] = IntervalTree < bool > (per_chr[c]);
+    //4. Build functional neighborhoods
+    basic_stats cis_sizes;
+    Itree = vector < IntervalTree < bool > > (tss_count);
+    for (int q = 0 ; q < tss_count ; q ++) {
+        map < string, int >::iterator hit = chr_index.find(tss_chr[q]);
+        assert(hit != chr_index.end());
+        const int origin = tss_pos[q];
+        interval_list nearby, shifted;
+        chr_tree[hit->second].findOverlapping(origin - window, origin + window, nearby);
+        cis_sizes.push(nearby.size() * 1.0);
+        for (size_t i = 0 ; i < nearby.size() ; i ++) {
+            if (tss_neg[q]) shifted.push_back(Interval < bool > (-1 * (nearby[i].stop - origin), -1 * (nearby[i].start - origin), true));
+            else shifted.push_back(Interval < bool > (nearby[i].start - origin, nearby[i].stop - origin, true));
+        }
+        Itree[q] = IntervalTree < bool > (shifted);
+    }
+    vrb.bullet("#annotated cis-windows = " + stb.str(cis_sizes.size()));
+    vrb.bullet("#annotations per cis-window = " + stb.str(cis_sizes.mean(), 2) + " +/- " + stb.str(cis_sizes.sd(), 2));
+}
+
+
+void fdensity_data::runDensityCalculation(string fout) {
+    // Writes one "from to count" row per bin of width bin over [-window, window): count = annotations overlapping the bin, summed over all trees of Itree.
+    vrb.title("Density analysis");
+    if (bin <= 0) vrb.error("Bin size needs to be a positive number of bp!");    // otherwise w += bin below never reaches window
+    output_file fdo (fout.c_str());
+    if (fdo.fail()) vrb.error("Cannot open output file!");
+    for (int w = -1 * window ; w < window ; w += bin) {
+        int wfrom = w;
+        int wto = w + bin - 1;
+        int n_annotation = 0;
+        for (int t = 0 ; t < tss_count ; t ++) {
+            vector < Interval < bool > > ann_in_bin;
+            Itree[t].findOverlapping(wfrom, wto, ann_in_bin);
+            n_annotation += ann_in_bin.size();
+        }
+        fdo << wfrom << " " << wto << " " << n_annotation << endl;
+    }
+    fdo.close();
+}
diff --git a/src/mode_fdensity/fdensity_read_annotation.cpp b/src/mode_fdensity/fdensity_read_annotation.cpp
new file mode 100644
index 0000000..c2e87a6
--- /dev/null
+++ b/src/mode_fdensity/fdensity_read_annotation.cpp
@@ -0,0 +1,44 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fdensity_data.h"
+
+void fdensity_data::readAnnotation(string fbed) {
+    // Loads annotation intervals from a BED file into ann_chr / ann_start / ann_end and reports their number and summed length.
+    vrb.title("Reading annotation data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+
+    //Read annotations (lines starting with '#' are ignored)
+    long coverage = 0;
+    kstring_t str = {0,0,0};    // line buffer allocated by hts_getline, released after the loop
+    vector < string > tokens;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.s[0] != '#') {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 3) vrb.error("Incorrect number of columns!");
+            ann_start.push_back(atoi(tokens[1].c_str()) + 1);    // +1 shifts the 0-based BED start to a 1-based position
+            ann_end.push_back(atoi(tokens[2].c_str()));
+            ann_chr.push_back(tokens[0]);
+            coverage += ann_end.back() - ann_start.back() + 1;
+        }
+    }
+    free(str.s);    // the kstring_t buffer belongs to the caller; it used to be leaked
+    //Finalize & verbose
+    hts_close(fp);
+    ann_count = ann_chr.size();
+    vrb.bullet("#annotations = " + stb.str(ann_count));
+    vrb.bullet("coverage = " + stb.str(coverage));
+}
diff --git a/src/mode_fdensity/fdensity_read_qtl.cpp b/src/mode_fdensity/fdensity_read_qtl.cpp
new file mode 100644
index 0000000..e9bee0d
--- /dev/null
+++ b/src/mode_fdensity/fdensity_read_qtl.cpp
@@ -0,0 +1,42 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fdensity_data.h"
+
+void fdensity_data::readQTL(string fbed) {
+    // Loads QTL records from a BED-like file with at least 6 columns into tss_id / tss_chr / tss_pos / tss_neg.
+    vrb.title("Reading QTL in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+
+    //Read QTLs (lines starting with '#' are ignored)
+    kstring_t str = {0,0,0};    // line buffer allocated by hts_getline, released after the loop
+    vector < string > tokens;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.s[0] != '#') {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
+            tss_id.push_back(tokens[3]);
+            tss_chr.push_back(tokens[0]);
+            tss_pos.push_back(atoi(tokens[1].c_str()) + 1);    // +1 shifts the 0-based BED start to a 1-based position
+            tss_neg.push_back(tokens[5] == "-");
+        }
+    }
+    free(str.s);    // the kstring_t buffer belongs to the caller; it used to be leaked
+    //Finalize & verbose
+    hts_close(fp);
+    tss_count = tss_chr.size();
+    vrb.bullet("#QTLs = " + stb.str(tss_chr.size()));
+}
diff --git a/src/mode_fenrich/fenrich_data.h b/src/mode_fenrich/fenrich_data.h
new file mode 100644
index 0000000..2cc5879
--- /dev/null
+++ b/src/mode_fenrich/fenrich_data.h
@@ -0,0 +1,73 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _FENRICH_DATA_H
+#define _FENRICH_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+class fenrich_data : public data {    // Data and routines of the fenrich mode
+public:
+    //PARAMETERS (filled from --permute by fenrich_main)
+    unsigned int n_permutation;
+
+    //QTL (parallel vectors, one entry per QTL read)
+    int qtl_count;                          //QTL number
+    vector < int > qtl_pos;                 //QTL position minus the position of its TSS
+    vector < int > qtl_order;               //Index of the TSS paired with each QTL (rewritten by every permutation)
+
+    //Annotations (parallel vectors, one entry per annotation read)
+    int ann_count;                          //Annotation number
+    vector < int > ann_start;               //Annotation start position
+    vector < int > ann_end;                 //Annotation end position
+    vector < string > ann_chr;              //Annotation chromosome
+
+    //Tss (parallel vectors, one entry per record of the TSS file)
+    int tss_count;                          //Number of records
+    vector < string > tss_id;               //Identifier (4th column)
+    vector < string > tss_chr;              //Chromosome (1st column)
+    vector < int > tss_pos;                 //Position (2nd column + 1)
+    vector < bool > tss_neg;                //True when the 6th column is "-"
+
+    //QTL functional neighborhood: one tree per TSS, filled by mapAnnotation2QTL()
+    vector < IntervalTree < bool > > R;
+
+    //CONSTRUCTOR / DESTRUCTOR (scalars start at 0 so that none is ever read uninitialized)
+    fenrich_data() : n_permutation(0), qtl_count(0), ann_count(0), tss_count(0) {}
+    ~fenrich_data() {}
+
+    //READ DATA
+    void readQTL(string);
+    void readAnnotation(string);
+    void readTSS(string);
+
+    //DATA MANAGEMENT
+    int findTSS(string &);
+    void mapAnnotation2QTL();
+
+    //COMPUTATIONAL ROUTINES
+    unsigned int countOverlaps();
+
+    //ANALYSIS
+    void runEnrichmentPass(string);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS ************************//
+//***************************************************************//
+void fenrich_main(vector < string > &);
+
+#endif
diff --git a/src/mode_fenrich/fenrich_main.cpp b/src/mode_fenrich/fenrich_main.cpp
new file mode 100644
index 0000000..b682565
--- /dev/null
+++ b/src/mode_fenrich/fenrich_main.cpp
@@ -0,0 +1,86 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+void fenrich_main(vector < string > & argv) {
+    fenrich_data D;
+    // Entry point of the fenrich mode: parses argv, loads annotations, TSS and QTLs, then runs the permutation-based enrichment test.
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("qtl", boost::program_options::value< string >(), "QTL list in TXT format.")
+        ("bed", boost::program_options::value< string >(), "Functional annotations in BED format.")
+        ("tss", boost::program_options::value< string >(), "TSS positions.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+    boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+    opt_parameters.add_options()
+        ("permute", boost::program_options::value< unsigned int >()->default_value(1000), "Permutation number to empirically assess significance.");
+
+    D.option_descriptions.add(opt_files).add(opt_parameters);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [fenrich] command line :" << string(e.what()) << endl;
+        exit(EXIT_FAILURE);    // a rejected command line must not report success to the caller
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("MEASURE ENRICHMENT OF QTL WITHIN ANNOTATIONS");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("qtl")) vrb.error("QTL data needs to be specified with --qtl [file.txt]");
+    if (!D.options.count("tss")) vrb.error("TSS data needs to be specified with --tss [file.bed]");
+    if (!D.options.count("bed")) vrb.error("Annotation data needs to be specified with --bed [file.bed]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+
+    //--------------
+    // 5. SET PARAMS
+    //--------------
+    D.n_permutation = D.options["permute"].as < unsigned int > ();
+    if (D.n_permutation == 0) vrb.error("Number of permutations needs to be positive, check --permute");    // the null quantiles need at least one draw
+    vrb.bullet("#permutations = " + stb.str(D.options["permute"].as < unsigned int > ()));
+
+    //---------------------------
+    // 6. READ FILES & INITIALIZE
+    //---------------------------
+    D.processBasicOptions();
+    D.readAnnotation(D.options["bed"].as < string > ());
+    D.readTSS(D.options["tss"].as < string > ());
+    D.readQTL(D.options["qtl"].as < string > ());
+
+    //----------------
+    // 7. RUN ANALYSIS
+    //----------------
+    D.mapAnnotation2QTL();
+    D.runEnrichmentPass(D.options["out"].as < string > ());
+}
+
diff --git a/src/mode_fenrich/fenrich_management.cpp b/src/mode_fenrich/fenrich_management.cpp
new file mode 100644
index 0000000..b8abb48
--- /dev/null
+++ b/src/mode_fenrich/fenrich_management.cpp
@@ -0,0 +1,80 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+int fenrich_data::findTSS(string & tss_str) {    // Linear scan: index of the first TSS whose id equals tss_str, -1 when there is none
+    int idx = 0; while (idx < tss_count && tss_id[idx] != tss_str) ++ idx;
+    return (idx < tss_count) ? idx : -1;
+}
+
+void fenrich_data::mapAnnotation2QTL() {
+    // Fills R with one interval tree per TSS holding the annotations found around it, expressed relative to the
+    // TSS position and mirrored for negative-strand TSS; the search window is derived from the largest QTL-TSS distance.
+    vrb.title("Mapping annotations to TSS ");
+
+    //1. Enumerate chr
+    map < string, int > chr2idx;
+    for (int t = 0 ; t < tss_count ; t ++) {
+        map < string, int >::iterator itC = chr2idx.find(tss_chr[t]);
+        if (itC == chr2idx.end()) chr2idx.insert(pair < string, int > (tss_chr[t], chr2idx.size()));
+    }
+    unsigned int n_chr = chr2idx.size();
+    vrb.bullet("#detected chromosomes in TSS data = " + stb.str(n_chr));
+
+    //2. Build chromosomal interval trees
+    unsigned int chr_unfound = 0;
+    vector < vector < Interval < bool > > > Tvec = vector < vector < Interval < bool > > > (n_chr, vector < Interval < bool > > ());
+    for (int a = 0 ; a < ann_count ; a ++) {
+        map < string, int >::iterator itC = chr2idx.find(ann_chr[a]);
+        if (itC == chr2idx.end()) chr_unfound ++;
+        else Tvec[itC->second].push_back(Interval < bool > (ann_start[a], ann_end[a], true));
+    }
+    vrb.bullet("#annotations NOT mapped to TSS chromosomes = " + stb.str(chr_unfound));
+    if ((ann_count - chr_unfound) > 0) vrb.bullet("#annotations mapped to TSS chromosomes = " + stb.str(ann_count - chr_unfound));
+    else vrb.error("None of the annotations have been found to be located on the same chromosomes than TSS!");
+    vector < IntervalTree < bool > > Ttree = vector < IntervalTree < bool > > (n_chr, IntervalTree < bool > ());
+    for (unsigned int c = 0 ; c < n_chr ; c ++) Ttree[c] = IntervalTree < bool > (Tvec[c]);
+
+    //3. Work out cis-window size in QTL data (kept signed so that tss_pos - margin cannot wrap around for TSS near position 0)
+    int max_distance = 0;
+    for (int q = 0 ; q < qtl_count ; q ++) if (abs(qtl_pos[q]) > max_distance) max_distance = abs(qtl_pos[q]);
+    vrb.bullet("estimated cis-window size from the data = " + stb.str(max_distance));
+    const int margin = max_distance + 10000;    // same 10000 bp of slack on both sides as before
+    //4. Build functional neighborhoods
+    basic_stats Rstat;
+    R = vector < IntervalTree < bool > > (tss_count);
+    for (int t = 0 ; t < tss_count ; t ++) {
+        map < string, int >::iterator itC = chr2idx.find(tss_chr[t]);
+        assert(itC != chr2idx.end());
+        vector < Interval < bool > > ann_in_cis;
+        Ttree[itC->second].findOverlapping(tss_pos[t] - margin, tss_pos[t] + margin, ann_in_cis);
+        Rstat.push(ann_in_cis.size() * 1.0);
+        vector < Interval < bool > > Rvec;
+        if (!tss_neg[t]) for (size_t a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (ann_in_cis[a].start - tss_pos[t], ann_in_cis[a].stop - tss_pos[t], true));
+        else for (size_t a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (-1 * (ann_in_cis[a].stop - tss_pos[t]), -1 * (ann_in_cis[a].start - tss_pos[t]), true));    // mirroring swaps the ends, so the negated stop is the new start
+        R[t] = IntervalTree < bool > (Rvec);
+    }
+    vrb.bullet("#annotated cis-windows = " + stb.str(Rstat.size()));
+    vrb.bullet("#annotations per cis-window = " + stb.str(Rstat.mean(), 2) + " +/- " + stb.str(Rstat.sd(), 2));
+}
+
+unsigned int fenrich_data::countOverlaps() {
+    // Number of QTLs q for which the tree R[qtl_order[q]] reports an overlap at position qtl_pos[q].
+    // The result depends on the current content of qtl_order (observed pairing or a permuted one).
+    unsigned int hits = 0;
+    for (int q = 0 ; q < qtl_count ; ++ q) hits += R[qtl_order[q]].checkOverlapping(qtl_pos[q]) ? 1 : 0;
+    return hits;
+}
diff --git a/src/mode_fenrich/fenrich_process.cpp b/src/mode_fenrich/fenrich_process.cpp
new file mode 100644
index 0000000..34693ec
--- /dev/null
+++ b/src/mode_fenrich/fenrich_process.cpp
@@ -0,0 +1,64 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+void fenrich_data::runEnrichmentPass(string fout) {
+    // Compares the observed number of QTL/annotation overlaps with a null built by pairing QTLs with random TSS; writes one summary row to fout.
+    //1. Nominal pass
+    vrb.title("Enrichment analysis");
+    if (n_permutation == 0) vrb.error("At least one permutation is needed to build the null distribution!");
+    if (qtl_pos.size() > (size_t) tss_count) vrb.error("More QTLs than TSS: cannot assign a distinct random TSS to each QTL!");
+    unsigned int obs_overlaps = countOverlaps();
+    vrb.bullet("#observed overlaps = " + stb.str(obs_overlaps) + " / " + stb.str(qtl_count) + " (" + stb.str(obs_overlaps * 100.0 / qtl_count, 2) + "%)");
+    //2. Permutation pass: every QTL keeps its relative position but is paired with the neighborhood of a randomly drawn TSS
+    basic_stats null_stat;
+    vector < int > rorder = vector < int > (tss_count, -1);
+    for ( int t = 0 ; t < tss_count ; t ++) rorder[t] = t;
+    vector < unsigned int > null_overlaps;
+    for (unsigned int p = 0 ; p < n_permutation ; p++) {
+        random_shuffle(rorder.begin(), rorder.end());
+        qtl_order = vector < int > (rorder.begin() , rorder.begin() + qtl_pos.size());
+        unsigned int no = countOverlaps();
+        null_overlaps.push_back(no);
+        null_stat.push(no * 1.0);
+    }
+    sort(null_overlaps.begin(), null_overlaps.end());
+    vrb.bullet("#null overlaps = " + stb.str(null_stat.mean(), 2) + " +/- " + stb.str(null_stat.sd(), 2) + " (" + stb.str(null_stat.mean() * 100.0 / qtl_count, 2) + "% +/- " + stb.str(null_stat.sd() * 100.0 / qtl_count, 2) + "%)");
+    //3. calculate empirical p-value (two-sided, with 1 added to both numerator and denominator)
+    unsigned int n_smaller = 0, n_bigger = 0;
+    for (unsigned int p = 0 ; p < n_permutation ; p++) {
+        if (obs_overlaps >= null_overlaps[p]) n_smaller ++;
+        if (obs_overlaps <= null_overlaps[p]) n_bigger ++;
+    }
+    double epval = min((min(n_smaller, n_bigger) * 2.0 + 1) / (n_permutation + 1), 1.0);
+    vrb.bullet("empirical p-value = " + stb.str(epval));
+    //4. Compute odd ratios; quantile indexes are clamped because round(size * 0.975) equals size once there are 20 permutations or fewer
+    const size_t last = null_overlaps.size() - 1;
+    double obs_freq = obs_overlaps * 1.0 / qtl_count;
+    double exp_freq_med = null_overlaps[min((size_t) round(null_overlaps.size() * 0.500), last)] * 1.0 / qtl_count;
+    double exp_freq_upv = null_overlaps[min((size_t) round(null_overlaps.size() * 0.975), last)] * 1.0 / qtl_count;
+    double exp_freq_dnv = null_overlaps[min((size_t) round(null_overlaps.size() * 0.025), last)] * 1.0 / qtl_count;
+    double odd_ratio_med = (obs_freq * (1 - exp_freq_med)) / (exp_freq_med * (1 - obs_freq));
+    double odd_ratio_upv = (obs_freq * (1 - exp_freq_upv)) / (exp_freq_upv * (1 - obs_freq));
+    double odd_ratio_dnv = (obs_freq * (1 - exp_freq_dnv)) / (exp_freq_dnv * (1 - obs_freq));
+    vrb.bullet("Odd ratio = " + stb.str(odd_ratio_med, 4) + " [" + stb.str(odd_ratio_dnv, 4) + "," + stb.str(odd_ratio_upv, 4) + "]");
+
+    //5. Write output
+    output_file fdo (fout.c_str());
+    if (fdo.fail()) vrb.error("Cannot open output file!");
+    fdo << obs_overlaps << " " << qtl_count << " " << null_stat.mean() << " " << null_stat.sd() << " " << epval << " " << odd_ratio_dnv << " " << odd_ratio_med << " " << odd_ratio_upv << endl;
+    fdo.close();
+}
diff --git a/src/mode_fenrich/fenrich_read_annotation.cpp b/src/mode_fenrich/fenrich_read_annotation.cpp
new file mode 100644
index 0000000..303bab0
--- /dev/null
+++ b/src/mode_fenrich/fenrich_read_annotation.cpp
@@ -0,0 +1,44 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+void fenrich_data::readAnnotation(string fbed) {
+    // Loads annotation intervals from a BED file into ann_chr / ann_start / ann_end and reports their number and summed length.
+    vrb.title("Reading annotation data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+
+    //Read annotations (lines starting with '#' are ignored)
+    long coverage = 0;
+    kstring_t str = {0,0,0};    // line buffer allocated by hts_getline, released after the loop
+    vector < string > tokens;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.s[0] != '#') {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 3) vrb.error("Incorrect number of columns!");
+            ann_start.push_back(atoi(tokens[1].c_str()) + 1);    // +1 shifts the 0-based BED start to a 1-based position
+            ann_end.push_back(atoi(tokens[2].c_str()));
+            ann_chr.push_back(tokens[0]);
+            coverage += ann_end.back() - ann_start.back() + 1;
+        }
+    }
+    free(str.s);    // the kstring_t buffer belongs to the caller; it used to be leaked
+    //Finalize & verbose
+    hts_close(fp);
+    ann_count = ann_chr.size();
+    vrb.bullet("#annotations = " + stb.str(ann_count));
+    vrb.bullet("coverage = " + stb.str(coverage));
+}
diff --git a/src/mode_fenrich/fenrich_read_qtl.cpp b/src/mode_fenrich/fenrich_read_qtl.cpp
new file mode 100644
index 0000000..637c6e1
--- /dev/null
+++ b/src/mode_fenrich/fenrich_read_qtl.cpp
@@ -0,0 +1,38 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+void fenrich_data::readQTL(string fqtl) {
+    string buffer; vector < string > str;
+    // Loads the QTL list: column 2 is the variant position and column 5 the phenotype id, which has to match an id known to findTSS.
+    // NOTE(review): the position is used as read while readTSS stores BED start + 1 -- confirm both files share one coordinate convention.
+    vrb.title("Reading QTL in [" + fqtl + "]");
+    input_file fdq(fqtl);
+    if (fdq.fail()) vrb.error("Cannot open file!");
+    while (getline(fdq, buffer)) {
+        if (buffer.empty() || buffer[0] != '#') {    // no buffer[0] on an empty string; empty lines still reach the column check
+            stb.split(buffer, str);
+            if (str.size() < 6) vrb.error("Incorrect number of columns, observed = " + stb.str(str.size()) + " expected = 6");
+            int idx_tss = findTSS(str[4]);
+            if (idx_tss < 0) vrb.error("Unknown phenotype id!");
+            qtl_pos.push_back(atoi(str[1].c_str()) - tss_pos[idx_tss]);    // stored relative to the TSS of the phenotype
+            qtl_order.push_back(idx_tss);
+        }
+    }
+    fdq.close();
+    qtl_count = qtl_pos.size();
+    vrb.bullet("#qtl = " + stb.str(qtl_count));
+}
diff --git a/src/mode_fenrich/fenrich_read_tss.cpp b/src/mode_fenrich/fenrich_read_tss.cpp
new file mode 100644
index 0000000..1106cfb
--- /dev/null
+++ b/src/mode_fenrich/fenrich_read_tss.cpp
@@ -0,0 +1,42 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "fenrich_data.h"
+
+void fenrich_data::readTSS(string fbed) {
+    // Loads TSS records from a BED-like file with at least 6 columns into tss_id / tss_chr / tss_pos / tss_neg.
+    vrb.title("Reading TSS in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+
+    //Read TSS (lines starting with '#' are ignored)
+    kstring_t str = {0,0,0};    // line buffer allocated by hts_getline, released after the loop
+    vector < string > tokens;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.s[0] != '#') {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
+            tss_id.push_back(tokens[3]);
+            tss_chr.push_back(tokens[0]);
+            tss_pos.push_back(atoi(tokens[1].c_str()) + 1);    // +1 shifts the 0-based BED start to a 1-based position
+            tss_neg.push_back(tokens[5] == "-");
+        }
+    }
+    free(str.s);    // the kstring_t buffer belongs to the caller; it used to be leaked
+    //Finalize & verbose
+    hts_close(fp);
+    tss_count = tss_chr.size();
+    vrb.bullet("#TSS = " + stb.str(tss_chr.size()));
+}
diff --git a/src/mode_genrich/genrich_binning_process.cpp b/src/mode_genrich/genrich_binning_process.cpp
new file mode 100644
index 0000000..29c4ed5
--- /dev/null
+++ b/src/mode_genrich/genrich_binning_process.cpp
@@ -0,0 +1,61 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Assign every reference variant to a (MAF, distance) bin.
+ * Pass 1 walks the variants flagged in genotype_qtl: each one joins the first
+ * existing bin that contains it, or opens a new bin centred on its own MAF and
+ * distance (+/- bin_maf and +/- bin_distance, MAF bounds clipped to [0, 0.5]).
+ * Pass 2 walks the remaining variants and stores in genotype_bin the first bin
+ * that contains them, or -1 when none does.
+ */
+void genrich_data::binningAllVariants() {
+    vrb.title("Bin all variants");
+    vrb.bullet("Distance +/- " + stb.str(bin_distance) + "bp and MAF +/- " + stb.str(bin_maf, 4));
+
+    //Pass 1: build the bins from the QTL variants
+    for (int var = 0 ; var < genotype_chr.size() ; var ++) {
+        if (!genotype_qtl[var]) continue;
+
+        int hit = -1;
+        unsigned int n_known = bin_min_maf.size();
+        for (int k = 0 ; k < n_known ; k++) {
+            bool freq_ok = (genotype_maf[var] >= bin_min_maf[k] && genotype_maf[var] < bin_max_maf[k]);
+            bool dist_ok = (genotype_dist[var] >= bin_min_dist[k] && genotype_dist[var] < bin_max_dist[k]);
+            if (freq_ok && dist_ok) {
+                hit = k;
+                break;
+            }
+        }
+
+        if (hit < 0) {
+            //No bin contains this QTL: open a new one around it
+            hit = bin_min_maf.size();
+            bin_min_maf.push_back(genotype_maf[var] - bin_maf);
+            if (bin_min_maf.back() < 0) bin_min_maf.back() = 0;
+            bin_max_maf.push_back(genotype_maf[var] + bin_maf);
+            if (bin_max_maf.back() >= 0.5) bin_max_maf.back() = 0.5;
+            bin_min_dist.push_back(genotype_dist[var] - bin_distance);
+            bin_max_dist.push_back(genotype_dist[var] + bin_distance);
+        }
+        genotype_bin[var] = hit;
+    }
+    vrb.bullet("Number of bins made from QTL data = " + stb.str(bin_min_maf.size()));
+
+    //Pass 2: drop the non-QTL variants into the bins made above
+    unsigned int n_binned = 0, n_nbinned = 0;
+    for (int var = 0 ; var < genotype_chr.size() ; var ++) {
+        if (genotype_qtl[var]) continue;
+
+        int hit = -1;
+        unsigned int n_known = bin_min_maf.size();
+        for (int k = 0 ; k < n_known ; k++) {
+            bool freq_ok = (genotype_maf[var] >= bin_min_maf[k] && genotype_maf[var] < bin_max_maf[k]);
+            bool dist_ok = (genotype_dist[var] >= bin_min_dist[k] && genotype_dist[var] < bin_max_dist[k]);
+            if (freq_ok && dist_ok) {
+                hit = k;
+                break;
+            }
+        }
+
+        genotype_bin[var] = hit;
+        if (hit >= 0) n_binned ++;
+        else n_nbinned ++;
+    }
+    vrb.bullet("Number of reference variants falling within bins = " + stb.str(n_binned));
+    vrb.bullet("Number of reference variants outside of any bins = " + stb.str(n_nbinned));
+}
diff --git a/src/mode_genrich/genrich_data.h b/src/mode_genrich/genrich_data.h
new file mode 100644
index 0000000..62808d5
--- /dev/null
+++ b/src/mode_genrich/genrich_data.h
@@ -0,0 +1,83 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _GENRICH_DATA_H
+#define _GENRICH_DATA_H
+
+//ANALYSIS MODES
+
+//INCLUDES
+#include "../common/data.h"
+
+/*
+ * Data container for the [genrich] mode (GWAS enrichment of QTLs).
+ * Holds the analysis parameters, the per-variant vectors (all indexed by the
+ * same variant index), the per-chromosome interval trees built from the
+ * phenotype coordinates, and the (MAF, distance) bins derived from the QTLs.
+ */
+class genrich_data : public data {
+public:
+
+    //PARAMETERS
+    unsigned int n_permutations;    //number of null sets to sample
+    float threshold_ld;             //r2 at or above which two variants count as the same signal
+    float threshold_maf;            //minimal MAF for a reference variant to be kept
+    float bin_maf;                  //half-width of a bin on the MAF axis
+    unsigned int bin_distance;      //half-width of a bin on the distance axis (bp)
+
+    //DATA FOR CHROMOSOME ID
+    vector < string > chromosome_id;
+    unordered_map < string, unsigned int > chromosome_idx;
+
+    //DATA FOR PHENOTYPES
+    vector < IntervalTree < pair < bool, bool > > > phenotype_pos;    //one tree per chromosome
+
+    //DATA FOR VARIANTS
+    vector < unsigned int > genotype_chr;
+    vector < int > genotype_pos;
+    vector < float > genotype_maf;
+    vector < bool > genotype_qtl;
+    vector < bool > genotype_gwas;
+    vector < int > genotype_dist;
+    vector < int > genotype_bin;    //bin index, -1 when unassigned
+    unordered_map < string, unsigned int > genotype_uuid;    //"<chr>_<pos>" -> variant index
+    vector < vector < bool > > genotype_haps;
+
+    //QTL BINS
+    vector < float > bin_min_maf;
+    vector < float > bin_max_maf;
+    vector < int > bin_min_dist;
+    vector < int > bin_max_dist;
+
+    //CONSTRUCTOR / DESTRUCTOR
+    //Scalars start from the same defaults as the command line options declared in
+    //genrich_main, so that an object is never read with indeterminate parameters.
+    genrich_data() : n_permutations(1000), threshold_ld(0.5f), threshold_maf(0.01f), bin_maf(0.02f), bin_distance(5000) {}
+    ~genrich_data() {}
+
+    //READ DATA
+    void readReferenceGenotypes(string);
+    void readQTL(string fqtl);
+    void readGWAS(string fgwas);
+    void readPhenotypes(string fbed);
+
+    //PROCESSES
+    void binningAllVariants();
+    void overlapGWASandQTL(string);
+
+    //ROUTINES
+    bool isSameSignal(unsigned int, unsigned int);
+    int getDistance(unsigned int, int);
+    int findCHR (string &);
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void genrich_main(vector < string > &);
+
+#endif
diff --git a/src/mode_genrich/genrich_main.cpp b/src/mode_genrich/genrich_main.cpp
new file mode 100644
index 0000000..d35b4ac
--- /dev/null
+++ b/src/mode_genrich/genrich_main.cpp
@@ -0,0 +1,106 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Entry point of the [genrich] mode.
+ * Declares and parses the command line options, checks that all mandatory
+ * files are given and that the numeric parameters are usable, reads the input
+ * files, then bins the variants and computes the QTL/GWAS overlap.
+ * argv: command line tokens handed to the boost option parser.
+ */
+void genrich_main(vector < string > & argv) {
+    genrich_data D;
+
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions();
+
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("tss", boost::program_options::value< string >(), "Phenotype file used for the QTL mapping.")
+        ("qtl", boost::program_options::value< string >(), "List of QTLs in BED format.")
+        ("ref", boost::program_options::value< string >(), "1000 Genomes genotypes in VCF/BCF format.")
+        ("gwas", boost::program_options::value< string >(), "List of GWAS hits in BED format.")
+        ("out", boost::program_options::value< string >(), "Output filename.");
+
+    boost::program_options::options_description opt_param ("\x1B[32mParameters\33[0m");
+    opt_param.add_options()
+        ("threshold-maf", boost::program_options::value< double >()->default_value(0.01), "MAF filter for sites in 1000 Genomes.")
+        ("threshold-ld", boost::program_options::value< double >()->default_value(0.5), "Consider that a GWAS hit and a QTL belong to the same signal when r2 >= arg.")
+        ("bin-distance", boost::program_options::value < unsigned int > ()->default_value(5000), "Maximal distance to assume 2 QTL are within the same bin.")
+        ("bin-maf", boost::program_options::value < double > ()->default_value(0.02), "Maximal frequency difference to assume 2 QTL are within the same bin.")
+        ("permute", boost::program_options::value < unsigned int > ()->default_value(1000), "Number of null sets of variants to be samples.");
+
+    D.option_descriptions.add(opt_files).add(opt_param);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [genrich] command line :" << string(e.what()) << endl;
+        //A malformed command line is a failure: make it visible in the exit status
+        exit(EXIT_FAILURE);
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("COMPUTING GWAS ENRICHMENT FOR QTLs");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("ref")) vrb.error("Please specify a variant collection with --ref [file.vcf]");
+    if (!D.options.count("qtl")) vrb.error("Please specify a QTL list with --qtl [qtl.bed]");
+    if (!D.options.count("gwas")) vrb.error("Please specify a GWAS hits list with --gwas [hits.bed]");
+    if (!D.options.count("tss")) vrb.error("Please specify a list of reference positions with --tss [phenotype.txt]");
+    if (!D.options.count("out")) vrb.error("Please specify an output file with --out [out.txt]");
+
+    //------------------
+    // 5. SET PARAMETERS
+    //------------------
+    D.threshold_ld = D.options["threshold-ld"].as < double > ();
+    D.threshold_maf = D.options["threshold-maf"].as < double > ();
+    D.bin_distance = D.options["bin-distance"].as < unsigned int > ();
+    D.bin_maf = D.options["bin-maf"].as < double > ();
+    D.n_permutations = D.options["permute"].as < unsigned int > ();
+
+    //Reject values the analysis cannot work with: empty bins ([x, x) intervals) leave
+    //no null variant to sample and zero permutations leave no null distribution.
+    //NOTE(review): relies on vrb.error() not returning, as everywhere else in this function.
+    if (D.threshold_ld < 0 || D.threshold_ld > 1) vrb.error("--threshold-ld must be within [0,1]");
+    if (D.threshold_maf < 0 || D.threshold_maf > 0.5) vrb.error("--threshold-maf must be within [0,0.5]");
+    if (D.bin_distance == 0) vrb.error("--bin-distance must be greater than 0");
+    if (!(D.bin_maf > 0)) vrb.error("--bin-maf must be greater than 0");
+    if (D.n_permutations == 0) vrb.error("--permute must be greater than 0");
+
+    vrb.bullet("LD threshold = " + stb.str(D.threshold_ld));
+    vrb.bullet("MAF threshold = " + stb.str(D.threshold_maf));
+    vrb.bullet("Distance binning = " + stb.str(D.bin_distance));
+    vrb.bullet("MAF binning = " + stb.str(D.bin_maf));
+    vrb.bullet("#permutations = " + stb.str(D.n_permutations));
+
+    //--------------
+    // 6. READ FILES
+    //--------------
+    D.processBasicOptions();
+    D.readSampleFromVCF(D.options["ref"].as < string > ());
+    D.mergeSampleLists();
+    //Phenotypes first: the reference reader needs the chromosomes and interval trees
+    D.readPhenotypes(D.options["tss"].as < string > ());
+
+    D.readReferenceGenotypes(D.options["ref"].as < string > ());
+
+    //QTL and GWAS lists are matched against the reference variants read above
+    D.readQTL(D.options["qtl"].as < string > ());
+    D.readGWAS(D.options["gwas"].as < string > ());
+
+    //----------------
+    // 7. RUN ANALYSIS
+    //----------------
+    D.binningAllVariants();
+    D.overlapGWASandQTL(D.options["out"].as < string > ());
+}
diff --git a/src/mode_genrich/genrich_process.cpp b/src/mode_genrich/genrich_process.cpp
new file mode 100644
index 0000000..9a71f19
--- /dev/null
+++ b/src/mode_genrich/genrich_process.cpp
@@ -0,0 +1,110 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+//Index of the q-th quantile in a sorted array of n > 0 values.
+//round(n * q) reaches n for small n (e.g. n <= 20 with q = 0.975), hence the clamp.
+static inline size_t genrich_quantileIndex(size_t n, double q) {
+    size_t idx = (size_t)round(n * q);
+    return (idx < n) ? idx : (n - 1);
+}
+
+/*
+ * Test whether QTLs overlap GWAS hits more often than matched null variants.
+ *  1. counts the QTLs that are the same signal (isSameSignal) as any GWAS hit;
+ *  2. groups the non-QTL variants by bin (genotype_bin);
+ *  3. for each of the n_permutations rounds, replaces every QTL by a random
+ *     non-QTL variant of the same bin and counts the overlaps again;
+ *  4. derives an empirical p-value and odds ratios (median, 2.5% and 97.5%
+ *     quantiles of the null counts);
+ *  5. writes <fout>.summary.txt (one line) and <fout>.full.txt.gz (observed
+ *     count followed by the sorted null counts).
+ * Stops with an error when n_permutations is 0 or when a QTL falls in a bin
+ * without any null variant, since neither case can be sampled.
+ */
+void genrich_data::overlapGWASandQTL(string fout) {
+    //An empty null distribution cannot be summarized (quantiles would index an empty vector)
+    if (n_permutations == 0) vrb.error("Number of permutations must be greater than 0!");
+
+    vrb.title("Calculating observed overlap between QTLs and GWAS hits");
+    vector < unsigned int > qtl_idx, gwas_idx;
+    for (size_t v = 0 ; v < genotype_pos.size() ; v ++) {
+        if (genotype_qtl[v]) qtl_idx.push_back(v);
+        if (genotype_gwas[v]) gwas_idx.push_back(v);
+    }
+    vrb.bullet("#qtls=" + stb.str(qtl_idx.size()));
+    vrb.bullet("#gwas=" + stb.str(gwas_idx.size()));
+
+    //A QTL overlaps as soon as one GWAS hit is the same signal
+    vector < bool > overlap_qtl = vector < bool > (qtl_idx.size() , false);
+    for (size_t q = 0 ; q < qtl_idx.size() ; q ++) for (size_t g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], qtl_idx[q]);
+
+    unsigned int n_obs_overlap = 0;
+    for (size_t q = 0 ; q < qtl_idx.size() ; q ++) if (overlap_qtl[q]) n_obs_overlap ++;
+    vrb.bullet("#observed overlap=" + stb.str(n_obs_overlap) + " (=" + stb.str(n_obs_overlap * 100.0 / qtl_idx.size(), 2) + " %)");
+
+    vrb.title("Classifying null variants");
+    vector < vector < unsigned int > > null_sets = vector < vector < unsigned int > > (bin_min_maf.size());
+    for (size_t v = 0 ; v < genotype_pos.size() ; v ++) if (!genotype_qtl[v] && genotype_bin[v] >= 0) null_sets[genotype_bin[v]].push_back(v);
+    basic_stats bs_null_count;
+    for (size_t b = 0 ; b < null_sets.size() ; b ++) bs_null_count.push(null_sets[b].size());
+    vrb.bullet("#null variants per bin = " + stb.str(bs_null_count.mean(),3) + " +/-" + stb.str(bs_null_count.sd(), 3));
+
+    //Every QTL needs at least one null variant in its bin, otherwise sampling below
+    //would index an empty vector
+    for (size_t q = 0 ; q < qtl_idx.size() ; q ++) {
+        int idx_bin = genotype_bin[qtl_idx[q]];
+        if (idx_bin < 0 || null_sets[idx_bin].empty()) vrb.error("No null variant available to match the QTL at position " + stb.str(genotype_pos[qtl_idx[q]]) + "; try larger --bin-distance / --bin-maf");
+    }
+
+    vrb.title("Calculating overlap between NULL sets of variants and GWAS hits");
+    vrb.bullet("#permutations=" + stb.str(n_permutations));
+    basic_stats bs_null_overlap;
+    vector < unsigned int > n_exp_overlap = vector < unsigned int > (n_permutations, 0);
+    for (unsigned int p = 0 ; p < n_permutations ; p ++) {
+
+        //step1: sample one null variant per QTL, from the bin of that QTL
+        vector < unsigned int > seq_null_qtl;
+        for (size_t q = 0 ; q < qtl_idx.size() ; q ++) {
+            unsigned int idx_bin = genotype_bin[qtl_idx[q]];
+            unsigned int idx_rnd = rng.getInt(null_sets[idx_bin].size());
+            seq_null_qtl.push_back(null_sets[idx_bin][idx_rnd]);
+        }
+
+        //step2: work out overlap
+        overlap_qtl = vector < bool > (qtl_idx.size() , false);
+        for (size_t q = 0 ; q < qtl_idx.size() ; q ++) for (size_t g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], seq_null_qtl[q]);
+
+        //step3: count overlaps
+        for (size_t q = 0 ; q < qtl_idx.size() ; q ++) if (overlap_qtl[q]) n_exp_overlap[p] ++;
+        bs_null_overlap.push(n_exp_overlap[p]);
+        vrb.bullet("permutation = " + stb.str(p) + " overlaps = " + stb.str(n_exp_overlap[p]));
+    }
+    //Sorted so that quantiles can be read by index
+    sort(n_exp_overlap.begin(), n_exp_overlap.end());
+    vrb.bullet("#null overlap = " + stb.str(bs_null_overlap.mean(),3) + " +/-" + stb.str(bs_null_overlap.sd(), 3));
+
+    //Two-sided empirical p-value, with the +1 correction and capped at 1
+    unsigned int n_bigger = 0, n_smaller = 0;
+    for (unsigned int p = 0 ; p < n_permutations ; p++) {
+        if (n_obs_overlap <= n_exp_overlap[p]) n_bigger ++;
+        if (n_obs_overlap >= n_exp_overlap[p]) n_smaller ++;
+    }
+    double epval = min((min(n_smaller, n_bigger) * 2.0 + 1.0) / (n_permutations + 1.0), 1.0);
+    vrb.bullet("Empirical p-value = " + stb.str(epval));
+
+    double obs_freq = n_obs_overlap * 1.0 / qtl_idx.size();
+    double exp_freq_med = n_exp_overlap[genrich_quantileIndex(n_exp_overlap.size(), 0.500)] * 1.0 / qtl_idx.size();
+    double exp_freq_upv = n_exp_overlap[genrich_quantileIndex(n_exp_overlap.size(), 0.975)] * 1.0 / qtl_idx.size();
+    double exp_freq_dnv = n_exp_overlap[genrich_quantileIndex(n_exp_overlap.size(), 0.025)] * 1.0 / qtl_idx.size();
+    double odd_ratio_med = (obs_freq * (1 - exp_freq_med)) / (exp_freq_med * (1 - obs_freq));
+    double odd_ratio_upv = (obs_freq * (1 - exp_freq_upv)) / (exp_freq_upv * (1 - obs_freq));
+    double odd_ratio_dnv = (obs_freq * (1 - exp_freq_dnv)) / (exp_freq_dnv * (1 - obs_freq));
+    vrb.bullet("Odd ratio = " + stb.str(odd_ratio_med, 4) + " [" + stb.str(odd_ratio_dnv, 4) + "," + stb.str(odd_ratio_upv, 4) + "]");
+
+    //Summary: observed count, #QTL, null mean, null sd, p-value, odds ratios (low, median, high)
+    string fout_sum = fout + ".summary.txt";
+    vrb.title ("Writing summary of enrichment analysis in [" + fout_sum + "]");
+    output_file fdo_sum (fout_sum.c_str());
+    if (fdo_sum.fail()) vrb.error("Cannot open output file!");
+    fdo_sum << n_obs_overlap << " " << qtl_idx.size() << " " << bs_null_overlap.mean() << " " << bs_null_overlap.sd() << " " << epval << " " << odd_ratio_dnv << " " << odd_ratio_med << " " << odd_ratio_upv << endl;
+    fdo_sum.close();
+
+    //Full outcome: observed line first, then one line per (sorted) null count
+    string fout_full = fout + ".full.txt.gz";
+    vrb.title ("Writing full enrichment analysis outcome in [" + fout_full + "]");
+    output_file fdo_full (fout_full.c_str());
+    if (fdo_full.fail()) vrb.error("Cannot open output file!");
+    fdo_full << n_obs_overlap << " " << qtl_idx.size() << endl;
+    for (unsigned int p = 0 ; p < n_permutations ; p ++) fdo_full << n_exp_overlap[p] << " " << qtl_idx.size() << endl;
+    fdo_full.close();
+}
+
diff --git a/src/mode_genrich/genrich_read_auxillliary_data.cpp b/src/mode_genrich/genrich_read_auxillliary_data.cpp
new file mode 100644
index 0000000..2e4b7b0
--- /dev/null
+++ b/src/mode_genrich/genrich_read_auxillliary_data.cpp
@@ -0,0 +1,66 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Flag as QTL the reference variants listed in the file fqtl.
+ * Empty lines and lines starting with '#' are skipped. For every other line
+ * the first two columns are joined as "<col1>_<col2>" and looked up in
+ * genotype_uuid; a match sets genotype_qtl for that variant. The numbers of
+ * matched and unmatched lines are reported.
+ */
+void genrich_data::readQTL(string fqtl) {
+    string buffer;
+    vector < string > tokens;
+    unsigned int n_var_found = 0, n_var_unfound = 0;
+
+    vrb.title("Reading QTLs in [" + fqtl + "]");
+    input_file fd (fqtl);
+    while (getline(fd, buffer)) {
+        if (buffer.empty() || buffer[0] == '#') continue;
+        stb.split(buffer, tokens);
+        //Both key columns are mandatory: without this check tokens[1] is read out of bounds
+        if (tokens.size() < 2) vrb.error("Incorrect number of columns!");
+
+        string uid = tokens[0] + "_" + tokens[1];
+        unordered_map < string, unsigned int > :: iterator it_genotype_id = genotype_uuid.find(uid);
+        if (it_genotype_id == genotype_uuid.end()) n_var_unfound ++;
+        else {
+            genotype_qtl[it_genotype_id->second] = true;
+            n_var_found ++;
+        }
+    }
+    fd.close();
+    vrb.bullet("Number of QTL found in reference = " + stb.str(n_var_found));
+    vrb.bullet("Number of QTL not found in reference = " + stb.str(n_var_unfound));
+}
+
+/*
+ * Flag as GWAS hit the reference variants listed in the file fgwas.
+ * Empty lines and lines starting with '#' are skipped. For every other line
+ * the first two columns are joined as "<col1>_<col2>" and looked up in
+ * genotype_uuid; a match sets genotype_gwas for that variant. The numbers of
+ * matched and unmatched lines are reported.
+ */
+void genrich_data::readGWAS(string fgwas) {
+    string buffer;
+    vector < string > tokens;
+    unsigned int n_var_found = 0, n_var_unfound = 0;
+
+    vrb.title("Reading GWAS hits in [" + fgwas + "]");
+    input_file fd (fgwas);
+    while (getline(fd, buffer)) {
+        if (buffer.empty() || buffer[0] == '#') continue;
+        stb.split(buffer, tokens);
+        //Both key columns are mandatory: without this check tokens[1] is read out of bounds
+        if (tokens.size() < 2) vrb.error("Incorrect number of columns!");
+
+        string uid = tokens[0] + "_" + tokens[1];
+        unordered_map < string, unsigned int > :: iterator it_genotype_id = genotype_uuid.find(uid);
+        if (it_genotype_id == genotype_uuid.end()) n_var_unfound ++;
+        else {
+            genotype_gwas[it_genotype_id->second] = true;
+            n_var_found ++;
+        }
+    }
+    fd.close();
+    vrb.bullet("Number of GWAS hits found in reference = " + stb.str(n_var_found));
+    vrb.bullet("Number of GWAS hits not found in reference = " + stb.str(n_var_unfound));
+}
diff --git a/src/mode_genrich/genrich_read_phenotypes.cpp b/src/mode_genrich/genrich_read_phenotypes.cpp
new file mode 100644
index 0000000..1ca0c88
--- /dev/null
+++ b/src/mode_genrich/genrich_read_phenotypes.cpp
@@ -0,0 +1,76 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Read the phenotype coordinates from the BED file fbed and build one interval
+ * tree per chromosome in phenotype_pos.
+ * The first line must be a header starting with '#'. Every data line needs at
+ * least 7 columns; phenotypes rejected by filter_phenotype are counted and
+ * skipped. A phenotype is reduced to the midpoint of columns 2 and 3 and to
+ * its strand (column 6 equal to "-").
+ * Consecutive phenotypes of a chromosome delimit the intervals
+ * [previous midpoint, midpoint - 1]; the value of an interval holds the strand
+ * flags of its left and right phenotypes. The two outer intervals extend to
+ * -/+ 1000000000 with a false flag on the open side.
+ * NOTE(review): the interval construction compares neighbouring records only, so
+ * it presumably expects the file to be sorted by chromosome and position - confirm.
+ */
+void genrich_data::readPhenotypes(string fbed) {
+    int n_includedP = 0;
+    int n_excludedP = 0;
+    int n_negativeStrd = 0;
+    vector < string > tokens;
+
+    vector < int > vchr, vpos;
+    vector < bool > vneg;
+
+    //Open BED file
+    vrb.title("Reading phenotype coordinates in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#') vrb.error("Cannot read header line!");
+
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        //Skip blank and comment lines
+        if (!str.l || str.s[0] == '#') continue;
+        stb.split(string(str.s), tokens);
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        if (!filter_phenotype.check(tokens[3])) {
+            n_excludedP ++;
+            continue;
+        }
+
+        //Register the chromosome the first time it is seen
+        int chr_idx = findCHR(tokens[0]);
+        if (chr_idx < 0) {
+            chr_idx = chromosome_id.size();
+            chromosome_id.push_back(tokens[0]);
+            chromosome_idx.insert(pair < string , unsigned int > (tokens[0], chr_idx)) ;
+        }
+
+        vchr.push_back(chr_idx);
+        vpos.push_back((atoi(tokens[1].c_str()) + atoi(tokens[2].c_str()))/2);
+        vneg.push_back(tokens[5] == "-");
+        if (vneg.back()) n_negativeStrd ++;
+        n_includedP ++;
+    }
+    free(str.s);    //line buffer grown by hts_getline is owned by the caller
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
+    if (n_includedP == 0) vrb.leave("Cannot find phenotypes!");
+    vrb.bullet("Detected number of distinct chromosomes = " + stb.str(chromosome_id.size()));
+    vrb.bullet("Number of phenotypes on the negative strand = " + stb.str(n_negativeStrd));
+
+    vrb.title("Convert phenotypes coordinates into a set of interval trees");
+    vector < vector < Interval < pair < bool, bool > > > > Tvec = vector < vector < Interval < pair < bool, bool > > > > (chromosome_id.size(), vector < Interval < pair < bool, bool > > > ());
+    for (size_t t = 0 ; t < vpos.size() ; t ++) {
+        const bool first_on_chr = (t == 0 || vchr[t-1] != vchr[t]);
+        const bool last_on_chr = (t + 1 == vpos.size() || vchr[t] != vchr[t+1]);
+
+        //Interval ending just before this phenotype
+        if (first_on_chr) Tvec[vchr[t]].push_back(Interval < pair < bool, bool > > (-1000000000, vpos[t] - 1, pair < bool, bool >(false, vneg[t])));
+        else Tvec[vchr[t]].push_back(Interval < pair < bool, bool > > (vpos[t-1], vpos[t] - 1, pair < bool, bool >(vneg[t-1], vneg[t])));
+
+        //Interval starting at the last phenotype of the chromosome. Tested independently of
+        //first_on_chr so that a chromosome with a single phenotype is covered on both sides.
+        if (last_on_chr) Tvec[vchr[t]].push_back(Interval < pair < bool, bool > > (vpos[t], 1000000000, pair < bool, bool >(vneg[t], false)));
+    }
+    phenotype_pos = vector < IntervalTree < pair < bool, bool > > > (chromosome_id.size(), IntervalTree < pair < bool, bool > > ());
+    for (size_t c = 0 ; c < chromosome_id.size() ; c ++) phenotype_pos[c] = IntervalTree < pair < bool, bool > > (Tvec[c]);
+}
diff --git a/src/mode_genrich/genrich_read_reference_genotypes.cpp b/src/mode_genrich/genrich_read_reference_genotypes.cpp
new file mode 100644
index 0000000..c0fb6b4
--- /dev/null
+++ b/src/mode_genrich/genrich_read_reference_genotypes.cpp
@@ -0,0 +1,101 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Read the reference variants and haplotypes from the VCF/BCF file fvcf.
+ * A variant is kept when it has exactly 2 alleles, sits on a chromosome known
+ * to findCHR, provides 2 genotype values per sample and has a MAF (computed on
+ * the included samples only) of at least threshold_maf.
+ * For each kept variant the function stores chromosome index, 1-based position,
+ * MAF, getDistance() result and haplotypes, and registers "<chr>_<pos>" in
+ * genotype_uuid. genotype_qtl / genotype_gwas / genotype_bin are reset to
+ * false / false / -1 for all kept variants.
+ */
+void genrich_data::readReferenceGenotypes(string fvcf) {
+    vector < int > mappingS;
+
+    //Opening files
+    vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
+    bcf_srs_t * sr = bcf_sr_init();
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        //NOTE(review): cases fall through on purpose only if vrb.error() never returns - confirm
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("File not compressed with bgzip!");
+        case idx_load_failed: vrb.error("Impossible to load index file!");
+        case file_type_error: vrb.error("File format not detected by htslib!");
+        default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing: mappingS[i] is the findSample() result for VCF sample i,
+    //negative values mark samples that are ignored
+    int included_sample = 0;
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i = 0 ; i < n_samples ; i ++) {
+        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
+        if (mappingS.back() >= 0) included_sample ++;
+    }
+    vrb.bullet("#samples = " + stb.str(included_sample));
+
+    //Variant processing
+    unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0;
+    int ngt, ngt_arr = 0, *gt_arr = NULL;
+    bcf1_t * line;
+    while(bcf_sr_next_line (sr)) {
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            bcf_unpack(line, BCF_UN_STR);
+            string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
+            int chr_idx = findCHR(chr);
+            if (chr_idx >= 0) {
+                unsigned int pos = line->pos + 1;
+                //gt_arr is (re)allocated by htslib and reused across records
+                ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+                if (ngt == 2*n_samples) {
+                    //Alternative allele frequency over the included samples
+                    double freq = 0.0, tot = 0.0;
+                    for(int i = 0 ; i < n_samples ; i ++) {
+                        assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing);
+                        if (mappingS[i] >= 0) {
+                            freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
+                            tot += 2.0;
+                        }
+                    }
+                    double maf = freq / tot;
+                    if (maf > 0.5) maf = 1.0 - maf;
+                    if (maf >= threshold_maf) {
+                        int dist_tss = getDistance(chr_idx, pos);
+                        string tmp_id = chr + "_" + stb.str(pos);
+                        genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
+                        genotype_chr.push_back(chr_idx);
+                        genotype_pos.push_back(pos);
+                        genotype_maf.push_back(maf);
+                        genotype_dist.push_back(dist_tss);
+                        //NOTE(review): indexing by mappingS[i] assumes findSample() indices are
+                        //below included_sample - confirm against the sample merging code
+                        genotype_haps.push_back(vector < bool > (2 * included_sample, false));
+                        for(int i = 0 ; i < n_samples ; i ++) {
+                            if (mappingS[i] >= 0) {
+                                genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]);
+                                genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
+                            }
+                        }
+                    } else n_excludedV_rare ++;
+                } else n_excludedV_void ++;
+            } else n_excludedV_uchr ++;
+        } else n_excludedV_mult ++;
+
+        if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));
+
+        n_line ++;
+    }
+    genotype_qtl = vector < bool > (genotype_pos.size(), false);
+    genotype_gwas = vector < bool > (genotype_pos.size(), false);
+    genotype_bin = vector < int > (genotype_pos.size(), -1);
+
+    //Finalize
+    free(gt_arr);    //buffer allocated by bcf_get_genotypes is owned by the caller
+    bcf_sr_destroy(sr);
+    vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
+    if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
+    if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
+    if (n_excludedV_void > 0) vrb.bullet(stb.str(n_excludedV_void) + " variants without 2 genotype values per sample excluded");
+    if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
+}
diff --git a/src/mode_genrich/genrich_routines.cpp b/src/mode_genrich/genrich_routines.cpp
new file mode 100644
index 0000000..c8ed10e
--- /dev/null
+++ b/src/mode_genrich/genrich_routines.cpp
@@ -0,0 +1,52 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "genrich_data.h"
+
+/*
+ * Tell whether two reference variants belong to the same association signal.
+ * True when both indexes are equal. False when the variants are on different
+ * chromosomes or more than 1000000 bp apart. Otherwise the r2 between the two
+ * variants is computed from genotype_haps and compared to threshold_ld.
+ */
+bool genrich_data::isSameSignal(unsigned int idx1, unsigned int idx2) {
+    //Trivial outcomes first
+    if (idx1 == idx2) return true;
+    if (genotype_chr[idx1] != genotype_chr[idx2]) return false;
+    if (abs(genotype_pos[idx1] - genotype_pos[idx2]) > 1000000) return false;
+
+    //Count haplotypes carrying the alternative allele at variant 1, variant 2 and both
+    double freq_both = 0.0, freq_second = 0.0, freq_first = 0.0;
+    for (int h = 0 ; h < 2 * sample_count ; h ++) {
+        const bool a1 = genotype_haps[idx1][h];
+        const bool a2 = genotype_haps[idx2][h];
+        if (a1) freq_first ++;
+        if (a2) freq_second ++;
+        if (a1 && a2) freq_both ++;
+    }
+
+    //Turn the counts into frequencies
+    freq_both /= 2 * sample_count;
+    freq_second /= 2 * sample_count;
+    freq_first /= 2 * sample_count;
+
+    //r2 = D^2 / (p1 * (1 - p1) * p2 * (1 - p2))
+    double r2 = (freq_both - freq_first * freq_second) * (freq_both - freq_first * freq_second) / (freq_first * (1 - freq_first) * freq_second * (1 - freq_second));
+    return (r2 >= threshold_ld);
+}
+
+/*
+ * Signed distance between position pos and the closest bound of the interval
+ * of chromosome chr that contains it.
+ * Exactly one interval of phenotype_pos[chr] is expected to overlap pos
+ * (asserted). The distance to the closest bound (the stop bound on ties) is
+ * returned negated when the boolean attached to that bound is set.
+ */
+int genrich_data::getDistance(unsigned int chr, int pos) {
+    vector < Interval < pair < bool, bool > > > hits;
+    phenotype_pos[chr].findOverlapping(pos, hits);
+    assert(hits.size() == 1);
+
+    const int to_start = abs((int)pos - (int)hits[0].start);
+    const int to_stop = abs((int)pos - (int)hits[0].stop);
+
+    if (to_start < to_stop) {
+        if (hits[0].value.first) return -1 * to_start;
+        return to_start;
+    }
+    if (hits[0].value.second) return -1 * to_stop;
+    return to_stop;
+}
+
+/*
+ * Index registered for chromosome chr in chromosome_idx, or -1 when the
+ * chromosome is unknown.
+ */
+int genrich_data::findCHR (string & chr) {
+    unordered_map < string, unsigned int > :: iterator found = chromosome_idx.find(chr);
+    return (found != chromosome_idx.end()) ? found->second : -1;
+}
diff --git a/src/mode_match/match_data.h b/src/mode_match/match_data.h
new file mode 100644
index 0000000..83a0a5f
--- /dev/null
+++ b/src/mode_match/match_data.h
@@ -0,0 +1,106 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _MATCH_DATA_H
+#define _MATCH_DATA_H
+
+//INCLUDES
+#include "../common/data.h"
+
+/*
+ * A bi-allelic site together with its counters.
+ * ref / alt keep only the first character of the allele strings given to the
+ * constructor. All counters and idx start at 0.
+ */
+class site {
+public:
+    string chr;            //chromosome name
+    unsigned int pos;      //position
+    unsigned int idx;      //index of the site, 0 until assigned by the caller
+    char ref;              //first base of the reference allele
+    char alt;              //first base of the alternative allele
+    unsigned int cref;
+    unsigned int calt;
+    unsigned int cdisc;
+    unsigned int cqual;
+    unsigned int cdel;
+    unsigned int ctot;
+
+    //Build a site with zeroed counters. An empty allele string yields '\0' instead of
+    //reading past the end of the string.
+    site (string _chr, unsigned int _pos, string _ref, string _alt) :
+        chr(_chr),
+        pos(_pos),
+        idx(0),
+        ref(_ref.empty() ? '\0' : _ref[0]),
+        alt(_alt.empty() ? '\0' : _alt[0]),
+        cref(0),
+        calt(0),
+        cdisc(0),
+        cqual(0),
+        cdel(0),
+        ctot(0) {
+    }
+};
+
+/*
+ * Data container for the [match] mode: filtering parameters, the regions and
+ * sites to process, and the per-site reference/alternative genotype flags.
+ */
+class match_data : public data {
+public:
+    //PARAMETERS
+    unsigned int param_min_mapQ;
+    unsigned int param_min_baseQ;
+    unsigned int param_min_cov;
+    float param_min_pval;
+    float param_min_gp;
+    float param_min_iq;
+    bool param_dup_rd;
+
+    //DATA
+    vector < string > regions;
+    vector < vector < site > > sites;
+    vector < vector < vector < bool > > > gen_ref;
+    vector < vector < vector < bool > > > gen_alt;
+
+    //CONSTRUCTOR/DESTRUCTOR (defined outside of this header)
+    match_data();
+    ~match_data();
+
+    //I/O ROUTINES
+    void readGenotypes(string, string);
+    void readSequences(string);
+    void writeOutput(string);
+};
+
+void match_main(vector < string > & );
+
+/*
+ * Translate a base code into its letter: 1 -> 'A', 2 -> 'C', 4 -> 'G',
+ * 8 -> 'T', 15 -> 'N'. Any other code yields -1.
+ */
+inline char match_getBase (int code) {
+    if (code == 1) return 'A';
+    if (code == 2) return 'C';
+    if (code == 4) return 'G';
+    if (code == 8) return 'T';
+    if (code == 15) return 'N';
+    return -1;
+}
+
+/*
+ * Two-sided binomial test for x successes out of n trials with success
+ * probability p, built on dbinom() / pbinom().
+ * Degenerate p (0 or 1) returns 1 or 0 depending on whether x is the only
+ * possible outcome; x equal to the expectation n * p returns 1. Otherwise the
+ * tail holding x is added to the opposite tail made of the outcomes whose
+ * density does not exceed the density of x (with a 1e-07 relative tolerance).
+ */
+inline double match_binomialTest(int x, int n, float p) {
+    if (p == 0) return (x == 0);
+    if (p == 1) return (x == n);
+
+    const double tolerance = 1 + 1e-07;
+    const double density_obs = dbinom(x, n, p, 0);
+    const double expected = n * p;
+    if (x == expected) return 1.0;
+
+    //Number of outcomes, on the other side of the expectation, at most as likely as x
+    int n_extreme = 0;
+    if (x < expected) {
+        for (int k = (int)ceil (expected) ; k <= n ; k++) {
+            if (dbinom(k, n, p, 0) <= density_obs * tolerance) n_extreme ++;
+        }
+        return pbinom(x, n, p, 1, 0) + pbinom(n - n_extreme, n, p, 0, 0);
+    }
+
+    for (int k = 0 ; k <= (int)floor(expected) ; k++) {
+        if (dbinom(k, n, p, 0) <= density_obs * tolerance) n_extreme ++;
+    }
+    return pbinom(n_extreme - 1, n, p, 1, 0) + pbinom(x - 1, n, p, 0, 0);
+}
+
+#endif
diff --git a/src/mode_match/match_main.cpp b/src/mode_match/match_main.cpp
new file mode 100644
index 0000000..b1bac7e
--- /dev/null
+++ b/src/mode_match/match_main.cpp
@@ -0,0 +1,98 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "match_data.h"
+
+// Entry point of the match mode: declares and parses the options, checks them, then reads the VCF and the BAM and writes the report.
+void match_main(vector < string > & argv) {
+ match_data D;
+ //-------------------------
+ // 1. DECLARE ALL OPTIONS
+ //-------------------------
+ D.declareBasicOptions();
+ boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+ opt_files.add_options()
+ ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
+ ("bam", boost::program_options::value< string >(), "Sequence data in BAM/SAM format.")
+ ("reg", boost::program_options::value< string >()->default_value(""), "Genomic region(s) to be processed.")
+ ("out", boost::program_options::value< string >(), "Output file.");
+
+ boost::program_options::options_description opt_parameters ("\x1B[32mFilters\33[0m");
+ opt_parameters.add_options()
+ ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred mapping quality for a read to be considered.")
+ ("filter-base-quality", boost::program_options::value< unsigned int >()->default_value(5), "Minimal phred quality for a base to be considered.")
+ ("filter-binomial-pvalue", boost::program_options::value< double >()->default_value(0.05, "0.05"), "Binomial p-value threshold below which a het genotype is considered as exhibiting allelic imbalance.")
+ ("filter-minimal-coverage", boost::program_options::value< unsigned int >()->default_value(10), "Minimal coverage for a genotype to be considered.")
+ ("filter-imputation-qual", boost::program_options::value< double >()->default_value(0.90, "0.90"), "Minimal imputation information score for a variant to be considered.")
+ ("filter-imputation-prob", boost::program_options::value< double >()->default_value(0.99, "0.99"), "Minimal posterior probability for a genotype to be considered.")
+ ("filter-keep-duplicates", "Keep duplicate sequencing reads in the process.");
+
+ D.option_descriptions.add(opt_files).add(opt_parameters);
+
+ //-------------------
+ // 2. PARSE OPTIONS
+ //-------------------
+ // Parsed values are stored directly in D.options.
+ try {
+ boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+ boost::program_options::notify(D.options);
+ } catch ( const boost::program_options::error& e ) {
+ cerr << "Error parsing [match] command line :" << string(e.what()) << endl;
+ exit(EXIT_FAILURE); // a malformed command line must not report success to the caller
+ }
+
+ //---------------------
+ // 3. PRINT HELP/HEADER
+ //---------------------
+ vrb.ctitle("MATCHING SAMPLES BETWEEN GENOTYPES AND SEQUENCES");
+ if (D.options.count("help")) {
+ cout << D.option_descriptions << endl;
+ exit(EXIT_SUCCESS);
+ }
+
+ //-----------------
+ // 4. COMMON CHECKS
+ //-----------------
+ if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+ if (!D.options.count("bam")) vrb.error("Sequence data needs to be specified with --bam [file.bam]");
+ if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+
+ //TO DO CHECK PARAMETER VALUES
+ D.param_min_mapQ = D.options["filter-mapping-quality"].as < unsigned int > ();
+ D.param_min_baseQ = D.options["filter-base-quality"].as < unsigned int > ();
+ D.param_min_cov = D.options["filter-minimal-coverage"].as < unsigned int > ();
+ D.param_min_pval = D.options["filter-binomial-pvalue"].as < double > ();
+ D.param_min_gp = D.options["filter-imputation-prob"].as < double > ();
+ D.param_min_iq = D.options["filter-imputation-qual"].as < double > ();
+ D.param_dup_rd = (D.options.count("filter-keep-duplicates") != 0);
+ vrb.bullet("Mapping quality >= " + stb.str(D.param_min_mapQ));
+ vrb.bullet("Base quality >= " + stb.str(D.param_min_baseQ));
+ vrb.bullet("Coverage >= " + stb.str(D.param_min_cov));
+ vrb.bullet("Binomial p-value threshold = " + stb.str(D.param_min_pval));
+ vrb.bullet("Genotype probability >= " + stb.str(D.param_min_gp));
+ vrb.bullet("Imputation quality >= " + stb.str(D.param_min_iq));
+ vrb.bullet("Keep duplicate reads = " + stb.str(D.param_dup_rd));
+
+ //------------------------------------------
+ // 5. READ FILES / INITIALIZE / RUN ANALYSIS
+ //------------------------------------------
+
+ D.processBasicOptions();
+ D.readSampleFromVCF(D.options["vcf"].as < string > ());
+ D.mergeSampleLists();
+ D.readGenotypes(D.options["vcf"].as < string > (), D.options["reg"].as < string > ());
+ D.readSequences(D.options["bam"].as < string > ());
+ D.writeOutput(D.options["out"].as < string > ());
+}
diff --git a/src/mode_match/match_managment.cpp b/src/mode_match/match_managment.cpp
new file mode 100644
index 0000000..d5780f0
--- /dev/null
+++ b/src/mode_match/match_managment.cpp
@@ -0,0 +1,32 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "match_data.h"
+
+match_data::match_data() : param_dup_rd(false) { // same defaults as the match options; duplicates are dropped unless asked otherwise
+ param_min_mapQ = 10;
+ param_min_baseQ = 5;
+ param_min_cov = 10;
+ param_min_pval = 0.05;
+ param_min_gp = 0.99;
+ param_min_iq = 0.90;
+}
+
+match_data::~match_data() { // empties the four per-region containers of the object
+ gen_alt.clear();
+ gen_ref.clear();
+ sites.clear();
+ regions.clear();
+}
diff --git a/src/mode_match/match_process.cpp b/src/mode_match/match_process.cpp
new file mode 100644
index 0000000..309db27
--- /dev/null
+++ b/src/mode_match/match_process.cpp
@@ -0,0 +1,67 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "match_data.h"
+
+void match_data::writeOutput(string filename) {
+ // One line per sample: id, then n_mis_tot n_het_tot n_hom_tot n_het_cov n_hom_cov n_het_fit n_hom_fit n_het_ase.
+ vrb.title("Write summary report in [" + filename + "]");
+ output_file fd (filename);
+
+ for (int smp = 0 ; smp < sample_count ; smp ++) {
+ unsigned int n_mis_tot = 0, n_het_tot = 0, n_hom_tot = 0;
+ unsigned int n_het_cov = 0, n_hom_cov = 0;
+ unsigned int n_het_fit = 0, n_hom_fit = 0, n_het_ase = 0;
+ for (int reg = 0; reg < regions.size() ; reg ++) {
+ for (int v = 0 ; v < sites[reg].size() ; v ++) {
+ const site & S = sites[reg][v];
+ const bool has_ref = gen_ref[reg][v][smp];
+ const bool has_alt = gen_alt[reg][v][smp];
+ const unsigned int coverage = S.cref + S.calt;
+ const bool covered = (coverage >= param_min_cov);
+
+ //A. Missing genotype: neither allele flag is set
+ if (!has_ref && !has_alt) {
+ n_mis_tot ++;
+ continue;
+ }
+
+ //B. Homozygous: exactly one allele flag is set
+ if (has_ref != has_alt) {
+ n_hom_tot ++;
+ if (covered) n_hom_cov ++;
+ if (covered && (S.cref == 0 || S.calt == 0)) n_hom_fit ++;
+ continue;
+ }
+
+ //C. Heterozygous: both allele flags are set
+ n_het_tot ++;
+ if (!covered) continue;
+ n_het_cov ++;
+ if (S.cref == 0 || S.calt == 0) continue;
+ n_het_fit ++;
+ //reference count out of the covered reads, tested with p = 0.5
+ if (match_binomialTest(S.cref, coverage, 0.5) < param_min_pval) n_het_ase ++;
+ }
+ }
+
+ fd << sample_id[smp];
+ fd << " " << n_mis_tot << " " << n_het_tot << " " << n_hom_tot;
+ fd << " " << n_het_cov << " " << n_hom_cov;
+ fd << " " << n_het_fit << " " << n_hom_fit;
+ fd << " " << n_het_ase << endl;
+ }
+ fd.close();
+}
diff --git a/src/mode_match/match_read_genotypes.cpp b/src/mode_match/match_read_genotypes.cpp
new file mode 100644
index 0000000..9a73efc
--- /dev/null
+++ b/src/mode_match/match_read_genotypes.cpp
@@ -0,0 +1,121 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "match_data.h"
+
+void match_data::readGenotypes(string filename, string str_regions) { // loads the bi-allelic single-base variants of the VCF/BCF (restricted to str_regions when non-empty) into regions, sites, gen_ref and gen_alt
+ int n_includedG = 0;
+ int n_excludedG_mult = 0;
+ int n_excludedG_snpv = 0;
+ int n_excludedG_void = 0;
+ int n_excludedG_user = 0;
+ int n_excludedG_impq = 0;
+ vector < int > mappingS;
+
+ vrb.title("Reading VCF [" + filename + "]");
+ bcf_srs_t * sr = bcf_sr_init();
+ sr->collapse = COLLAPSE_NONE;
+
+ //Jump to regions if necessary
+ if (str_regions.size() > 0) {
+ if (bcf_sr_set_regions(sr, str_regions.c_str(), 0) == -1) vrb.error("Failed to jump to region [" + str_regions + "]");
+ else vrb.bullet("scanning region(s) [" + str_regions + "]");
+ } else vrb.bullet("scanning full VCF file");
+
+ //Add readers
+ if(!(bcf_sr_add_reader (sr, filename.c_str()))) {
+ switch (sr->errnum) {
+ case not_bgzf: vrb.error("Not compressed with bgzip");
+ case idx_load_failed: vrb.error("Impossible to load index file");
+ case file_type_error: vrb.error("Unrecognized file format");
+ default: vrb.error("Unknown error when opening");
+ }
+ }
+
+ //Sample processing
+ unsigned int n_samples_in_file = bcf_hdr_nsamples(sr->readers[0].header);
+ for (int i = 0 ; i < n_samples_in_file ; i ++) mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
+
+ //Init needed data
+ int ngp = 0, ngt = 0, niq = 0, ngt_arr = 0, ngp_arr = 0, niq_arr = 0;
+ int * gt_arr = NULL;
+ float * gp_arr = NULL, * iq_arr = NULL;
+ bcf1_t * line;
+
+ //Parse VCF
+ map < string, unsigned int > region_map;
+ while(bcf_sr_next_line (sr)) {
+ line = bcf_sr_get_line(sr, 0);
+ if (line->n_allele != 2) n_excludedG_mult ++; //exactly one ALT is required: d.allele[1] is read below
+ else {
+ bcf_unpack(line, BCF_UN_STR);
+ string sid = string(line->d.id);
+ if (!filter_genotype.check(sid)) n_excludedG_user ++;
+ else {
+ string curr_chr = bcf_hdr_id2name(sr->readers[0].header, line->rid); //chr
+ unsigned int pos = line->pos; //pos
+ string ref = string(line->d.allele[0]); //ref
+ string alt = string(line->d.allele[1]); //alt
+ niq = bcf_get_info_float(sr->readers[0].header, line, "IQ", &iq_arr, &niq_arr); //imp score
+ unsigned int region_idx;
+ map < string, unsigned int > :: iterator region_map_it = region_map.find(curr_chr);
+ if (region_map_it == region_map.end()) {
+ vrb.bullet("new chromosome discovered [" + curr_chr + "]");
+ region_map.insert(pair < string, unsigned int > (curr_chr, regions.size()));
+ region_idx = regions.size();
+ regions.push_back(curr_chr);
+ sites.push_back(vector < site > ());
+ gen_ref.push_back(vector < vector < bool > > ());
+ gen_alt.push_back(vector < vector < bool > > ());
+ } else region_idx = region_map_it->second;
+ if (ref.size() > 1 || alt.size() > 1) n_excludedG_snpv ++;
+ else if (niq > 0 && iq_arr[0] < param_min_iq) n_excludedG_impq ++;
+ else {
+ ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+ ngp = bcf_get_format_float(sr->readers[0].header, line,"GP", &gp_arr, &ngp_arr);
+ if (ngt != n_samples_in_file * 2) n_excludedG_void ++;
+ else {
+ sites[region_idx].push_back(site (curr_chr, pos, ref, alt));
+ gen_ref[region_idx].push_back(vector < bool > (sample_count, false));
+ gen_alt[region_idx].push_back(vector < bool > (sample_count, false));
+ for (int i = 0 ; i < n_samples_in_file ; i ++) {
+ if (mappingS[i] >= 0) {
+ bool miss = false;
+ if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) miss = true;
+ else if (ngp == 3 * n_samples_in_file && gp_arr[3*i+0] != bcf_float_missing && gp_arr[3*i+1] != bcf_float_missing && gp_arr[3*i+2] != bcf_float_missing && gp_arr[3*i+0] < param_min_gp && gp_arr[3*i+1] < param_min_gp && gp_arr[3*i+2] < param_min_gp) miss = true;
+ if (!miss) {
+ gen_ref[region_idx].back()[mappingS[i]] = !(bcf_gt_allele(gt_arr[2*i+0]) && bcf_gt_allele(gt_arr[2*i+1]));
+ gen_alt[region_idx].back()[mappingS[i]] = (bcf_gt_allele(gt_arr[2*i+0]) || bcf_gt_allele(gt_arr[2*i+1]));
+ }
+ }
+ }
+ n_includedG ++;
+ }
+ }
+ }
+ }
+ }
+ vrb.bullet(stb.str(n_includedG) + " variants included");
+ if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+ if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " non bi-allelic variants excluded");
+ if (n_excludedG_snpv > 0) vrb.bullet(stb.str(n_excludedG_snpv) + " multi-nucleotidic variants excluded");
+ if (n_excludedG_impq > 0) vrb.bullet(stb.str(n_excludedG_impq) + " badly imputed variants excluded");
+ if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " variants without GT field excluded");
+ if (sites.size() == 0) vrb.leave("Cannot find usable variants in target region!");
+ free(gt_arr);
+ free(gp_arr); //buffers grown by bcf_get_format_float / bcf_get_info_float belong to the caller
+ free(iq_arr);
+ bcf_sr_destroy(sr);
+}
diff --git a/src/mode_match/match_read_sequences.cpp b/src/mode_match/match_read_sequences.cpp
new file mode 100644
index 0000000..c7a3dcd
--- /dev/null
+++ b/src/mode_match/match_read_sequences.cpp
@@ -0,0 +1,103 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "match_data.h"
+
+struct aux_t { // state handed to read_bam through its void* argument
+ samFile * fp; // open SAM/BAM handle
+ bam_hdr_t * hdr; // header read from fp
+ hts_itr_t * iter; // region iterator; when NULL read_bam reads the file sequentially
+ int min_mapQ; // reads with a lower mapping quality are skipped
+ bool dup_rd; // when false, reads flagged as duplicates are skipped
+};
+
+static int read_bam(void *data, bam1_t *b) { // pileup callback: loads into b the next read passing every filter; a negative return means no read is left
+ aux_t * aux = (aux_t*) data;
+ int ret;
+ while (1) {
+ ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
+ if (ret < 0) break;
+ if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL)) continue;
+ if (!aux->dup_rd && (b->core.flag & BAM_FDUP)) continue;
+ if (b->core.flag & BAM_FPAIRED) {
+ if (! (b->core.flag & BAM_FPROPER_PAIR)) continue;
+ if (b->core.flag & BAM_FMUNMAP) continue;
+ if (!(b->core.flag & BAM_FREVERSE) == !(b->core.flag & BAM_FMREVERSE)) continue; // compared as booleans: the two masks are different bits, so the raw values only ever matched when both were 0
+ }
+ if ((int)b->core.qual < aux->min_mapQ) continue;
+ break;
+ }
+ return ret;
+}
+
+void match_data::readSequences(string fbam) { // piles up the reads of the indexed BAM over every region and fills the counters of each site
+ aux_t * data = (aux_t *) malloc (sizeof(aux_t));
+ data->iter = NULL; // malloc leaves the field indeterminate; keep it defined until a region is queried
+ vrb.title("Processing BAM file [" + fbam + "]");
+ data->fp = sam_open(fbam.c_str(), "r");
+ if (data->fp == 0) vrb.error("Cannot open file!");
+ data->min_mapQ = param_min_mapQ;
+ data->dup_rd = param_dup_rd;
+ data->hdr = sam_hdr_read(data->fp);
+ if (data->hdr == 0) vrb.error("Cannot parse header!");
+ hts_idx_t *idx = sam_index_load(data->fp, fbam.c_str());
+ if (idx == NULL) vrb.error("Cannot load index!");
+
+ //Loop across regions
+ for (int reg = 0; reg < regions.size() ; reg ++) {
+ //Jump to region
+ data->iter = sam_itr_querys(idx, data->hdr, regions[reg].c_str()); // set the iterator
+ if (data->iter == NULL) vrb.error("Problem jumping to region [" + regions[reg] + "]");
+ else vrb.bullet("scanning region [" + regions[reg] + "]");
+
+ int beg = data->iter->beg;
+ int end = data->iter->end;
+
+ //Pile up reads
+ const bam_pileup1_t * v_plp;
+ int n_plp = 0, tid, pos, i_site = 0;
+ bam_plp_t s_plp = bam_plp_init(read_bam, (void*)data);
+ while (((v_plp = bam_plp_auto(s_plp, &tid, &pos, &n_plp)) != 0) && i_site < sites[reg].size()) {
+ int chr = bam_name2id(data->hdr, sites[reg][i_site].chr.c_str());
+ if (pos < beg || pos >= end) continue;
+ while (i_site < sites[reg].size() && (chr != tid || pos > sites[reg][i_site].pos)) { i_site ++; }
+ if (i_site < sites[reg].size() && tid == chr && pos == sites[reg][i_site].pos) { // the skip loop above may have run past the last site
+ for (int red = 0 ; red < n_plp ; red ++) {
+ const bam_pileup1_t * p = v_plp + red;
+ sites[reg][i_site].ctot ++;
+ if (p->is_del || p->is_refskip || p->indel == 1) sites[reg][i_site].cdel ++;
+ else if (bam_get_qual(p->b)[p->qpos] < param_min_baseQ) sites[reg][i_site].cqual ++;
+ else {
+ char base = match_getBase(bam_seqi(bam_get_seq(p->b), p->qpos));
+ bool isRef = (base == sites[reg][i_site].ref);
+ bool isAlt = (base == sites[reg][i_site].alt);
+ if (isRef) sites[reg][i_site].cref ++;
+ if (isAlt) sites[reg][i_site].calt ++;
+ if (!isRef && !isAlt) sites[reg][i_site].cdisc ++;
+ }
+ }
+ }
+ }
+ bam_plp_reset(s_plp);
+ bam_plp_destroy(s_plp);
+ hts_itr_destroy(data->iter); // one iterator is allocated per region, so release it per region
+ data->iter = NULL;
+ }
+
+ bam_hdr_destroy(data->hdr);
+ hts_idx_destroy(idx);
+ if (data->fp) sam_close(data->fp);
+ free(data);
+}
diff --git a/src/mode_pca/pca_data.h b/src/mode_pca/pca_data.h
new file mode 100644
index 0000000..550e749
--- /dev/null
+++ b/src/mode_pca/pca_data.h
@@ -0,0 +1,58 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef pca_data_h
+#define pca_data_h
+
+#define __RESIZE_CHUNK__ 50000
+
+//INCLUDES
+#include "../common/data.h"
+#include "pca_pca.h"
+
+//PCA DATA
+class pca_data : public data {
+public:
+ double maf_cutoff; // --maf value, set by pca_main when --vcf is used
+ int distance_separator; // --distance value, set by pca_main when --vcf is used
+ Pca PCA; // holds the input matrix (_xXf) and the PCA results
+
+ int data_count; //data number
+ MatrixXf data_val; //data values
+ /*vector < string > data_id; //data ids
+ vector < string > data_chr; //data chromosomes
+ vector < int > data_start; //data start positions
+ vector < int > data_end; //data end positions*/
+
+ pca_data(){maf_cutoff = 0.0 ; distance_separator = 0; data_count = 0;} // data_count is read by imputeData, so it must not start indeterminate
+
+ void resizeData();
+ void finalizeData(int);
+ void imputeData();
+ void readData(string);
+ void readDataVCF(string);
+ void readDataBED(string);
+ void readDataPhenoBED(string);
+ void printPCA(string);
+
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS *************************//
+//***************************************************************//
+void pca_main(vector < string > &); // runs the pca mode on its command-line arguments (defined in pca_main.cpp)
+
+
+#endif /* pca_data_h */
diff --git a/src/mode_pca/pca_main.cpp b/src/mode_pca/pca_main.cpp
new file mode 100644
index 0000000..4edcb33
--- /dev/null
+++ b/src/mode_pca/pca_main.cpp
@@ -0,0 +1,102 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "pca_data.h"
+
+// Entry point of the pca mode: declares and parses the options, checks them, reads genotypes (--vcf) or phenotypes (--bed), then runs and prints the PCA.
+void pca_main(vector < string > & argv) {
+ pca_data D;
+ //-------------------------
+ // 1. DECLARE ALL OPTIONS
+ //-------------------------
+ D.declareBasicOptions(); //Mandatory
+ boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+ opt_files.add_options()
+ ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF/BED format.")
+ ("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
+ ("out", boost::program_options::value< string >(), "Output file prefix.");
+
+ boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+ opt_parameters.add_options()
+ ("center", "Center the quantifications or genotypes before PCA.")
+ ("scale", "Scale the quantifications or genotypes to unit variance before PCA.")
+ //("use-cor", "Use correlation rather than SVD in PCA (Valid only when number of samples is greater than number of phenotypes or genotypes.")
+ ("maf", boost::program_options::value< double >()->default_value(0.0, "0"), "Exclude sites with MAF less than this.")
+ ("distance", boost::program_options::value< unsigned int >()->default_value(0,"0"), "Only include sites separated with this many bp");
+
+ D.option_descriptions.add(opt_files).add(opt_parameters);
+
+ //-------------------
+ // 2. PARSE OPTIONS
+ //-------------------
+ try {
+ boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+ boost::program_options::notify(D.options);
+ } catch ( const boost::program_options::error& e ) {
+ cerr << "Error parsing [pca] command line :" << string(e.what()) << endl;
+ exit(EXIT_FAILURE); // a malformed command line must not report success to the caller
+ }
+
+ //---------------------
+ // 3. PRINT HELP/HEADER
+ //---------------------
+ vrb.ctitle("CONDUCT PCA ON DATA");
+ if (D.options.count("help")) {
+ cout << D.option_descriptions << endl;
+ exit(EXIT_SUCCESS);
+ }
+
+ //-----------------
+ // 4. COMMON CHECKS
+ //-----------------
+ if (!D.options.count("vcf") && !D.options.count("bed") ) vrb.error("Genotypes with --vcf [file.vcf] or phenotypes with --bed [file.bed] must be specified");
+ if (D.options.count("vcf") && D.options.count("bed") ) vrb.error("Provide only one of --bed or --vcf");
+ if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+ if (D.options.count("bed") && (D.options["distance"].as < unsigned int > () > 0 || D.options["maf"].as < double > () > 0)) vrb.error("--distance and --maf cannot be combined with --bed");
+
+ //--------------
+ // 5. SET PARAMS
+ //--------------
+ if (D.options.count("vcf")){
+ if (D.options["maf"].as < double > () < 0 || D.options["maf"].as < double > () > 1.0) vrb.error ("Incorrect --maf");
+ D.maf_cutoff = D.options["maf"].as < double > ();
+ vrb.bullet("MAF greater than " + stb.str(D.maf_cutoff));
+ D.distance_separator = D.options["distance"].as < unsigned int > ();
+ vrb.bullet("Sites every " + stb.str(D.distance_separator) + " bp");
+ }
+ string outFile = D.options["out"].as < string > ();
+ //--------------
+ // 6. READ FILES
+ //--------------
+ D.processBasicOptions();
+ if (D.options.count("bed")) D.readSampleFromBED(D.options["bed"].as < string > ()); //Read samples in BED
+ else {
+ htsFile * fp = hts_open(D.options["vcf"].as < string > ().c_str(),"r");
+ if (fp == NULL) vrb.error("Cannot open file [" + D.options["vcf"].as < string > () + "]"); // fp is dereferenced right below
+ if (fp->format.format == sam) D.readSampleFromBED(D.options["vcf"].as < string > ());
+ else D.readSampleFromVCF(D.options["vcf"].as < string > ());
+ hts_close(fp);
+ }
+ D.mergeSampleLists(); //Merge all sample lists
+ if (D.options.count("bed")) D.readDataPhenoBED(D.options["bed"].as < string > ()); //Read data in BED
+ else D.readData(D.options["vcf"].as < string > ()); //Read data in VCF
+
+ //-----------------
+ // 7. RUN ANALYSIS
+ //-----------------
+ D.imputeData();
+ D.PCA.Calculate(D.options.count("use-cor"), D.options.count("center") , D.options.count("scale") ); // "use-cor" is not declared above, so its count is 0
+ D.printPCA(outFile);
+}
\ No newline at end of file
diff --git a/src/mode_pca/pca_management.cpp b/src/mode_pca/pca_management.cpp
new file mode 100644
index 0000000..1b4b67d
--- /dev/null
+++ b/src/mode_pca/pca_management.cpp
@@ -0,0 +1,64 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "pca_data.h"
+
+
+void pca_data::resizeData(){ // adds __RESIZE_CHUNK__ columns to PCA._xXf; the row count is left unchanged
+ PCA._xXf.conservativeResize(NoChange,PCA._xXf.cols() + __RESIZE_CHUNK__);
+}
+
+void pca_data::finalizeData(int size){ // sets the column count of PCA._xXf to size; the row count is left unchanged
+ PCA._xXf.conservativeResize(NoChange,size);
+}
+
+void pca_data::imputeData() {
+ // Every missing entry of a column is replaced by the mean of the non-missing entries of that column.
+ for (int col = 0; col < data_count ; col ++) {
+ double total = 0.0;
+ int n_observed = 0;
+ for (int row = 0; row < sample_count ; row ++) {
+ if (PCA._xXf(row,col) == bcf_float_missing) continue;
+ total += PCA._xXf(row,col);
+ n_observed ++;
+ }
+ const double fill = total / n_observed;
+ for (int row = 0; row < sample_count ; row ++) if (PCA._xXf(row,col) == bcf_float_missing) PCA._xXf(row,col) = fill;
+ }
+}
+
+void pca_data::printPCA(string prefix){ // writes the components to <prefix>.pca and the summary statistics to <prefix>.pca_stats
+ output_file pca(prefix + ".pca");
+ output_file pcaStats(prefix + ".pca_stats");
+ pca << "SampleID"; // header line: SampleID followed by every sample id
+ for (int i = 0 ; i < sample_count; i++) pca << " " << sample_id[i];
+ pca<< endl;
+ IOFormat MF(StreamPrecision, DontAlignCols, " " , "\n", "", "", "", ""); // coefficients separated by a single space, no column alignment
+ for (int r = 0 ; r < PCA._pcs.rows(); r++ ) { // one output line per row of PCA._pcs
+ pca << prefix << "_" << PCA.is_center() << "_" << PCA.is_scale() << "_" << PCA._method + "_PC" + stb.str(r+1) + " " << PCA._pcs.row(r).format(MF) << endl;
+ }
+
+ pcaStats << "sd "; // each statistics line is a label followed by space-separated values
+ copy(PCA._sd.begin(), PCA._sd.end(),ostream_iterator<float>(pcaStats," "));
+ pcaStats << endl;
+ pcaStats << "prop_var ";
+ copy(PCA._prop_of_var.begin(), PCA._prop_of_var.end(),ostream_iterator<float>(pcaStats," "));
+ pcaStats << endl;
+ pcaStats << "cumm_prop ";
+ copy(PCA._cum_prop.begin(), PCA._cum_prop.end(),ostream_iterator<float>(pcaStats," "));
+ pcaStats << endl;
+ pcaStats << "#Kaiser criterion: PC #" << PCA._kaiser << endl; // the two criteria are written as '#'-prefixed trailer lines
+ pcaStats << "#Thresh995 criterion: PC #" << PCA._thresh995 << endl;
+}
diff --git a/src/mode_pca/pca_pca.cpp b/src/mode_pca/pca_pca.cpp
new file mode 100644
index 0000000..6509ba7
--- /dev/null
+++ b/src/mode_pca/pca_pca.cpp
@@ -0,0 +1,240 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "pca_pca.h"
+#include <iostream>
+#include <iterator>
+#include "Eigen/SVD"
+
+
+using namespace std;
+using namespace Eigen;
+
+// Run the principal component analysis on the matrix stored in _xXf
+// (rows x columns; statistics are computed per column).
+//
+//   is_corr    when true AND the matrix has at least as many rows as columns,
+//              the decomposition is an eigen-decomposition of the correlation
+//              matrix ("cor"); otherwise an SVD of the data matrix is used ("svd")
+//   is_center  subtract the column mean from every column
+//   is_scale   divide every column by its standard deviation
+//
+// Side effects: _xXf is modified in place (columns with sd == 0 are removed,
+// then optional centering/scaling; the "cor" branch additionally rescales the
+// columns). Results are stored in _method, _eliminated_columns, _sd,
+// _prop_of_var, _cum_prop, _kaiser, _thresh995 and _pcs (one row per component).
+// Problems are reported through vrb.error().
+void Pca::Calculate(const bool is_corr, const bool is_center, const bool is_scale) {
+    _ncols = _xXf.cols();
+    _nrows = _xXf.rows();
+    _is_corr = is_corr;
+    _is_center = is_center;
+    _is_scale = is_scale;
+
+    // Reset all result containers up front so that calling Calculate() more than
+    // once never appends to the results of a previous run, whichever branch runs.
+    _eliminated_columns.clear();
+    _sd.clear();
+    _prop_of_var.clear();
+    _cum_prop.clear();
+    _kaiser = 0;
+    _thresh995 = 1;
+
+    vrb.title("Calculating PCA with a matrix " + stb.str(_nrows) + " x " + stb.str(_ncols));
+
+    if ((1 == _ncols) || (1 == _nrows)) {
+        vrb.error("Row or column count equals 1!");
+    }
+    // Mean and standard deviation for each column
+    VectorXf mean_vector(_ncols);
+    mean_vector = _xXf.colwise().mean();
+    VectorXf sd_vector(_ncols);
+    unsigned int zero_sd_num = 0;
+    // Sample standard deviation: divide by (n - 1)
+    float denom = static_cast<float>((_nrows > 1)? _nrows - 1: 1);
+    for (unsigned int i = 0; i < _ncols; ++i) {
+        VectorXf curr_col = VectorXf::Constant(_nrows, mean_vector(i)); // mean(x) for column x
+        curr_col = _xXf.col(i) - curr_col; // x - mean(x)
+        curr_col = curr_col.array().square(); // (x-mean(x))^2
+        sd_vector(i) = sqrt((curr_col.sum())/denom);
+        if (0 == sd_vector(i)) {
+            zero_sd_num++;
+        }
+    }
+    vrb.bullet("Calculated mean and sd");
+    // If every column has sd == 0 there is nothing left to analyse
+    if (1 > _ncols-zero_sd_num) {
+        vrb.error("No variant data found!");
+    }
+
+    // Delete columns where sd == 0 and remember their original indices
+    MatrixXf tmp(_nrows, _ncols-zero_sd_num);
+    VectorXf tmp_mean_vector(_ncols-zero_sd_num);
+    unsigned int curr_col_num = 0;
+    for (unsigned int i = 0; i < _ncols; ++i) {
+        if (0 != sd_vector(i)) {
+            tmp.col(curr_col_num) = _xXf.col(i);
+            tmp_mean_vector(curr_col_num) = mean_vector(i);
+            curr_col_num++;
+        } else {
+            _eliminated_columns.push_back(i);
+        }
+    }
+    _ncols -= zero_sd_num;
+    _xXf = tmp;
+    mean_vector = tmp_mean_vector;
+    tmp.resize(0, 0); tmp_mean_vector.resize(0);
+
+    vrb.bullet( stb.str(zero_sd_num) + " sd==0 columns removed");
+
+    // Shift to zero
+    if (true == _is_center) {
+        for (unsigned int i = 0; i < _ncols; ++i) {
+            _xXf.col(i) -= VectorXf::Constant(_nrows, mean_vector(i));
+        }
+        vrb.bullet("Variables centered");
+    }else vrb.warning("Variables are NOT centered");
+
+    // Scale to unit variance
+    if ( true == _is_scale) {
+        for (unsigned int i = 0; i < _ncols; ++i) {
+            _xXf.col(i) /= sqrt(_xXf.col(i).array().square().sum()/denom);
+        }
+        vrb.bullet("Variables scaled to unit variance");
+    }else vrb.warning("Variables are NOT scaled");
+
+    // When _nrows < _ncols then svd will be used.
+    // If corr is true and _nrows >= _ncols then the correlation matrix is used.
+    // (TODO): What about covariance?
+    if ( (_nrows < _ncols) || (false == _is_corr)) { // Singular Value Decomposition is on
+        _method = "svd";
+        //if ( (_nrows < _ncols) && true == _is_corr) vrb.bullet("Number of rows is less than number of columns defaulting to SVD");
+        //if ( (_nrows >= _ncols) && false == _is_corr) vrb.bullet("Number of rows is greater than number of columns, but SVD is forced (--use-cor if you don't want this)");
+        vrb.bullet("Running SVD");
+        JacobiSVD<MatrixXf> svd(_xXf, ComputeThinV);
+        VectorXf eigen_singular_values = svd.singularValues();
+        VectorXf tmp_vec = eigen_singular_values.array().square();
+        vrb.bullet("SVD complete");
+        float tmp_sum = tmp_vec.sum();
+        tmp_vec /= tmp_sum;
+        // PC's standard deviation and
+        // PC's proportion of variance
+        _kaiser = 0;
+        unsigned int lim = (_nrows < _ncols)? _nrows : _ncols;
+        for (unsigned int i = 0; i < lim; ++i) {
+            _sd.push_back(eigen_singular_values(i)/sqrt(denom));
+            // Kaiser: index (1-based) of the last component with sd >= 1
+            if (_sd[i] >= 1) {
+                _kaiser = i + 1;
+            }
+            _prop_of_var.push_back(tmp_vec(i));
+        }
+        vrb.bullet("Calculated sd and var for PCs");
+        tmp_vec.resize(0);
+        // PC's cumulative proportion
+        _thresh995 = 1;
+        _cum_prop.push_back(_prop_of_var[0]);
+        for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+            _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+            // Last component whose cumulative proportion is still below 99.5%
+            if (_cum_prop[i] < 0.995) {
+                _thresh995 = i+1;
+            }
+        }
+        vrb.bullet("Calculated cumulative var for PCs");
+        // Scores: project the data on the right singular vectors
+        MatrixXf eigen_scores = _xXf * svd.matrixV();
+        _pcs = eigen_scores.transpose();
+        eigen_scores.resize(0, 0);
+        vrb.bullet("Done!");
+    } else { // COR OR COV MATRICES ARE HERE
+        _method = "cor";
+        vrb.bullet("PCA with correlation matrix");
+        // Calculate covariance matrix
+        MatrixXf eigen_cov; // = MatrixXf::Zero(_ncols, _ncols);
+        VectorXf sds;
+        // (TODO) Should be weighted cov matrix, even if is_center == false
+        eigen_cov = (1.0 /(_nrows/*-1*/)) * _xXf.transpose() * _xXf;
+        sds = eigen_cov.diagonal().array().sqrt();
+        // Normalise by the outer product of the sds to obtain correlations
+        MatrixXf outer_sds = sds * sds.transpose();
+        eigen_cov = eigen_cov.array() / outer_sds.array();
+        vrb.bullet("Covariance complete");
+        outer_sds.resize(0, 0);
+        // ?If data matrix is scaled, covariance matrix is equal to correlation matrix
+
+        EigenSolver<MatrixXf> edc(eigen_cov);
+        VectorXf eigen_eigenvalues = edc.eigenvalues().real();
+        MatrixXf eigen_eigenvectors = edc.eigenvectors().real();
+
+        // The eigenvalues and eigenvectors are not sorted in any particular order.
+        // So, we should sort them
+        typedef pair<float, int> eigen_pair;
+        vector<eigen_pair> ep;
+        for (unsigned int i = 0 ; i < _ncols; ++i) {
+            ep.push_back(make_pair(eigen_eigenvalues(i), i));
+        }
+        sort(ep.begin(), ep.end()); // Ascending order by default
+        // Sort them all in descending order
+        MatrixXf eigen_eigenvectors_sorted = MatrixXf::Zero(eigen_eigenvectors.rows(), eigen_eigenvectors.cols());
+        VectorXf eigen_eigenvalues_sorted = VectorXf::Zero(_ncols);
+        int colnum = 0;
+        int i = ep.size()-1;
+        for (; i > -1; i--) {
+            eigen_eigenvalues_sorted(colnum) = ep[i].first;
+            eigen_eigenvectors_sorted.col(colnum++) += eigen_eigenvectors.col(ep[i].second);
+        }
+
+        // The unsorted arrays are not needed anymore
+        eigen_eigenvalues.resize(0);
+        eigen_eigenvectors.resize(0, 0);
+
+        _kaiser = 0;
+        float tmp_sum = eigen_eigenvalues_sorted.sum();
+        for (unsigned int i = 0; i < _ncols; ++i) {
+            _sd.push_back(sqrt(eigen_eigenvalues_sorted(i)));
+            if (_sd[i] >= 1) {
+                _kaiser = i + 1;
+            }
+            _prop_of_var.push_back(eigen_eigenvalues_sorted(i)/tmp_sum);
+        }
+        vrb.bullet("Calculated sd and var for PCs");
+
+        // PC's cumulative proportion
+        _thresh995 = 1;
+        _cum_prop.push_back(_prop_of_var[0]);
+        for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+            _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+            if (_cum_prop[i] < 0.995) {
+                _thresh995 = i+1;
+            }
+        }
+        vrb.bullet("Calculated cumulative var for PCs");
+        // Scores for PCA with correlation matrix
+        // Scale before calculating new values
+        for (unsigned int i = 0; i < _ncols; ++i) {
+            _xXf.col(i) /= sds(i);
+        }
+        sds.resize(0);
+        MatrixXf eigen_scores = _xXf * eigen_eigenvectors_sorted;
+        _pcs = eigen_scores.transpose();
+        eigen_scores.resize(0, 0);
+        vrb.bullet("Done!");
+    }
+}
+// ---- Read-only accessors: each returns a copy of the corresponding member ----
+
+// Standard deviation of each principal component
+std::vector<float> Pca::sd(void) {
+    return _sd;
+}
+// Proportion of variance carried by each component
+std::vector<float> Pca::prop_of_var(void) {
+    return _prop_of_var;
+}
+// Cumulative proportion of variance
+std::vector<float> Pca::cum_prop(void) {
+    return _cum_prop;
+}
+// Content of the _scores member
+std::vector<float> Pca::scores(void) {
+    return _scores;
+}
+// Indices of the columns that were eliminated
+std::vector<unsigned int> Pca::eliminated_columns(void) {
+    return _eliminated_columns;
+}
+// Name of the decomposition method
+string Pca::method(void) {
+    return _method;
+}
+// Component number given by the Kaiser criterion
+unsigned int Pca::kaiser(void) {
+    return _kaiser;
+}
+// Component number given by the cumulative-variance threshold
+unsigned int Pca::thresh995(void) {
+    return _thresh995;
+}
+// Matrix dimensions
+unsigned int Pca::ncols(void) {
+    return _ncols;
+}
+unsigned int Pca::nrows(void) {
+    return _nrows;
+}
+// Scaling / centering flags
+bool Pca::is_scale(void) {
+    return _is_scale;
+}
+bool Pca::is_center(void) {
+    return _is_center;
+}
+// Default state: empty matrix, centering and scaling enabled, SVD as the
+// method (correlation-matrix mode off), criteria at their initial values.
+Pca::Pca(void)
+    : _nrows(0),
+      _ncols(0),
+      _is_center(true),
+      _is_scale(true),
+      _is_corr(false),
+      _method("svd"),
+      _kaiser(0),
+      _thresh995(1) {
+}
+// Drop the stored input data (vector form and Eigen matrix) on destruction.
+Pca::~Pca(void) {
+    _x.clear();
+    _xXf.resize(0, 0);
+}
diff --git a/src/mode_pca/pca_pca.h b/src/mode_pca/pca_pca.h
new file mode 100644
index 0000000..195f1e2
--- /dev/null
+++ b/src/mode_pca/pca_pca.h
@@ -0,0 +1,161 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef __fastqtl__pca__
+#define __fastqtl__pca__
+
+#include <vector>
+#include <string>
+#include "Eigen/Dense"
+#include "../common/data.h"
+
+// Principal component analysis of the matrix held in _xXf.
+// All members are public: callers fill _xXf, call Calculate(), then read the
+// result members directly or through the accessors below.
+class Pca {
+public:
+ std::vector<float> _x; // Initial matrix as vector filled by rows.
+ unsigned int _nrows, // Number of rows of _xXf, set by Calculate().
+ _ncols; // Number of columns of _xXf kept by Calculate() (columns with sd == 0 are removed).
+ bool _is_center, // Whether the variables should be shifted to be zero centered
+ _is_scale, // Whether the variables should be scaled to have unit variance
+ _is_corr; // Request PCA with the correlation matrix instead of SVD
+ std::string
+ _method; // "svd" or "cor", as set by Calculate()
+ std::vector<unsigned int>
+ _eliminated_columns; // Indices of the columns eliminated because their sd is 0
+ std::vector<float> _sd, // Standard deviation of each component
+ _prop_of_var, // Proportion of variance
+ _cum_prop, // Cumulative proportion
+ _scores; // Rotated values (NOTE(review): Calculate() stores the projected data in _pcs, not here - confirm usage)
+ unsigned int _kaiser, // Number of PCs according to the Kaiser criterion
+ _thresh995; // Number of PCs according to the 99.5% cumulative-variance threshold
+ Eigen::MatrixXf _xXf; // Input matrix as Eigen MatrixXf structure; modified in place by Calculate()
+ Eigen::MatrixXf _pcs; // Projected data written by Calculate(), one row per component
+ //! Performing PCA on _xXf
+ /*!
+ The main method for performing Principal Component Analysis.
+ The input is the _xXf member; its dimensions are read from the matrix itself.
+ \param is_corr Use the correlation matrix when the matrix has at least as many rows
+ as columns; otherwise (or when false) singular value decomposition is used
+ \param is_center Whether the variables should be shifted to be zero centered
+ \param is_scale Whether the variables should be scaled to have unit variance
+ \result
+ Nothing is returned; results are stored in the members of the class and
+ problems are reported through vrb.error()
+ */
+ void Calculate(const bool is_corr = true, const bool is_center = true, const bool is_scale = true);
+ //! Return number of rows in initial matrix
+ /*!
+ \result Number of rows in initial matrix
+ */
+ unsigned int nrows(void);
+ //! Return number of cols in the matrix
+ /*!
+ \result Number of cols in the matrix (after Calculate(), columns with sd == 0 are not counted)
+ */
+ unsigned int ncols(void);
+ //! If variables are centered
+ /*!
+ \result
+ true - variables are centered
+ false - otherwise
+ */
+ bool is_center(void);
+ //! If variables are scaled
+ /*!
+ \result
+ true - variables are scaled
+ false - otherwise
+ */
+ bool is_scale(void);
+ //! Method for calculation of principal components
+ /*!
+ There are different methods used. The most used is SVD.
+ Calculate() switches to the correlation matrix only when is_corr is true
+ and the matrix has at least as many rows as columns.
+ \result
+ "svd" - PCA with singular value decomposition
+ "cor" - PCA with correlation matrix
+ ("cov", PCA with covariance matrix, is not produced by Calculate())
+ */
+ std::string method(void);
+ //! Returns numbers of eliminated columns
+ /*!
+ If standard deviation of a column is equal to 0, the column should be rejected,
+ or PCA will fail.
+ \result Numbers of eliminated columns, empty vector otherwise
+ */
+ std::vector<unsigned int> eliminated_columns(void);
+ //! Standard deviation of each principal component
+ /*!
+ \result Vector of standard deviation for each principal component:
+ 1st element is sd for 1st PC, 2nd - for 2nd PC and so on.
+ */
+ std::vector<float> sd(void);
+ //! Proportion of variance
+ /*!
+ \result Vector with the proportion of variance of each component
+ */
+ std::vector<float> prop_of_var(void);
+ //! Cumulative proportion
+ /*!
+ \result Vector of cumulative proportions for each component
+ */
+ std::vector<float> cum_prop(void);
+ //! Principal component by the Kaiser criterion
+ /*!
+ Number of the last component with standard deviation of at least 1.
+ \result Number of the first components we should retain defined by the Kaiser criterion
+ */
+ unsigned int kaiser(void);
+ //! 99.5% threshold
+ /*!
+ Retain only the PCs whose cumulative proportion is less than 0.995
+ \result Number of PCs that should be retained with the 99.5% threshold criterion
+ */
+ unsigned int thresh995(void);
+ //! Rotated values (scores)
+ /*!
+ Returns the _scores member as a vector.
+ NOTE(review): Calculate() does not fill _scores; the projected data are in _pcs - confirm.
+ \result Vector of scores
+ */
+ std::vector<float> scores(void);
+ //! Class constructor
+ Pca(void);
+ //! Class destructor
+ ~Pca(void);
+};
+
+// Remove row `rowToRemove` from `matrix` in place: the rows below it are moved
+// up by one and the matrix is shrunk by one row.
+// An index at or beyond the last row simply drops the last row.
+// An empty matrix is left untouched (rows()-1 would otherwise wrap around in
+// unsigned arithmetic).
+inline void removeRow(Eigen::MatrixXf& matrix, unsigned int rowToRemove){
+    if (matrix.rows() == 0) return;
+
+    unsigned int numRows = matrix.rows()-1;
+    unsigned int numCols = matrix.cols();
+
+    // Source and destination blocks overlap inside the same matrix: evaluate the
+    // source into a temporary first so the copy cannot read already-written cells.
+    if( rowToRemove < numRows )
+        matrix.block(rowToRemove,0,numRows-rowToRemove,numCols) = matrix.block(rowToRemove+1,0,numRows-rowToRemove,numCols).eval();
+
+    matrix.conservativeResize(numRows,numCols);
+}
+
+
+// Remove column `colToRemove` from `matrix` in place: the columns to its right
+// are moved left by one and the matrix is shrunk by one column.
+// An index at or beyond the last column simply drops the last column.
+// A matrix without columns is left untouched (cols()-1 would otherwise wrap
+// around in unsigned arithmetic).
+inline void removeColumn(Eigen::MatrixXf& matrix, unsigned int colToRemove){
+    if (matrix.cols() == 0) return;
+
+    unsigned int numRows = matrix.rows();
+    unsigned int numCols = matrix.cols()-1;
+
+    // Source and destination blocks overlap inside the same matrix: evaluate the
+    // source into a temporary first so the copy cannot read already-written cells.
+    if( colToRemove < numCols )
+        matrix.block(0,colToRemove,numRows,numCols-colToRemove) = matrix.block(0,colToRemove+1,numRows,numCols-colToRemove).eval();
+
+    matrix.conservativeResize(numRows,numCols);
+}
+
+#endif /* defined(__fastqtl__pca__) */
diff --git a/src/mode_pca/pca_read_data.cpp b/src/mode_pca/pca_read_data.cpp
new file mode 100644
index 0000000..be14229
--- /dev/null
+++ b/src/mode_pca/pca_read_data.cpp
@@ -0,0 +1,287 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "pca_data.h"
+
+// Detect the format of `filename` with htslib and dispatch to the matching
+// reader: BCF and VCF go to readDataVCF(), anything htslib reports as `sam`
+// is handed to readDataBED(). Other formats are rejected through vrb.error().
+void pca_data::readData(string filename) {
+    vrb.title("Reading genotype data in [" + filename + "]");
+    htsFile * fp = hts_open(filename.c_str(),"r");
+    // hts_open returns NULL on failure; do not dereference it
+    if (!fp) vrb.error("Cannot open file!");
+    enum htsExactFormat fileformat = fp->format.format;
+    hts_close(fp);
+    if (fileformat == bcf) {
+        vrb.bullet("File format detected: BCF");
+        readDataVCF(filename);
+    } else if (fileformat == vcf) {
+        vrb.bullet("File format detected: VCF");
+        readDataVCF(filename);
+    } else if (fileformat == sam) {
+        // NOTE(review): BED input is apparently reported as `sam` by the format sniffer - confirm
+        vrb.bullet("File format detected: BED");
+        readDataBED(filename);
+    } else vrb.error("File format not supported!");
+}
+
+// Load genotypes or dosages from the VCF/BCF file `fvcf` into PCA._xXf
+// (one row per sample, one column per retained variant).
+// A variant is kept when it is bi-allelic, carries GT or DS for every sample,
+// passes filter_genotype, is on a new chromosome or at least distance_separator
+// away from the previously kept variant, and has an allele frequency within
+// [maf_cutoff, 1 - maf_cutoff]. Missing genotypes are stored as
+// bcf_float_missing. Sets data_count and trims the matrix to the kept columns.
+void pca_data::readDataVCF(string fvcf) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("File not compressed with bgzip!"); break;
+        case idx_load_failed: vrb.error("Impossible to load index file!"); break;
+        case file_type_error: vrb.error("File format not detected by htslib!"); break;
+        default : vrb.error("Unknown error!"); break;
+        }
+    }
+
+    //Sample processing: map every column of the file to a row of the matrix (or < 0)
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+
+    PCA._xXf.resize(sample_count, __RESIZE_CHUNK__);
+
+    //Read genotype data
+    //int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    unsigned int linecount = 0;
+    string pChr = "";
+    int pPos = 0;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            // gt_arr / ds_arr are (re)allocated by htslib, hence passed by address
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
+                int pos = line->pos + 1;
+                if (filter_genotype.check(sid) && ((pChr == chr && abs(pos - pPos) >= distance_separator) || pChr != chr)) {
+                    vector < float > temp(sample_count, 0.0);
+                    // Accumulate in floating point: dosages are fractional and an
+                    // integer accumulator would truncate every addition.
+                    double total = 0.0;
+                    int count = 0;
+                    for(int i = 0 ; i < n_samples ; i ++) {
+                        if (mappingS[i] >= 0) {
+                            if (nds > 0) {
+                                temp[mappingS[i]] = ds_arr[i];
+                                count+=2;
+                                total+=temp[mappingS[i]];
+                            } else {
+                                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) temp[mappingS[i]] = bcf_float_missing;
+                                else {
+                                    temp[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
+                                    count+=2;
+                                    total+=temp[mappingS[i]];
+                                }
+                            }
+                        }
+                    }
+                    // Alternative allele frequency over the observed alleles
+                    double af = total / (double) count;
+                    if (maf_cutoff > af || 1.0-maf_cutoff < af){
+                        n_excludedG_user ++;
+                        continue;
+                    }
+                    // Remember the last kept variant for the distance filter
+                    pChr = chr;
+                    pPos = pos;
+                    //data_id.push_back(sid);
+                    //data_chr.push_back(chr);
+                    //string genotype_ref = string(line->d.allele[0]);
+                    //data_start.push_back(pos);
+                    //nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    //if (nsl >= 0 && nsl_arr == 1) data_end.push_back(sl_arr[0]);
+                    //else data_end.push_back(data_start.back() + genotype_ref.size() - 1);
+                    if (n_includedG >= PCA._xXf.cols()) resizeData();
+                    for (int i = 0 ; i < temp.size(); i++) PCA._xXf(i,n_includedG) = temp[i];
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize
+    free(gt_arr);
+    free(ds_arr);
+    bcf_sr_destroy(sr);
+    data_count = n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    if (data_count == 0) vrb.leave("Cannot find genotypes in target region!");
+    finalizeData(n_includedG);
+}
+
+// Load genotype values from the tabix-indexed BED file `fbed` into PCA._xXf
+// (one row per sample, one column per retained variant).
+// The header line gives the sample names from the 7th column on. A variant is
+// kept when it passes filter_genotype, is on a new chromosome or at least
+// distance_separator away from the previously kept variant, and has an allele
+// frequency within [maf_cutoff, 1 - maf_cutoff]. "NA" cells are stored as
+// bcf_float_missing. Sets data_count and trims the matrix to the kept columns.
+void pca_data::readDataBED(string fbed) {
+    string buffer;
+    int n_includedG = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    int n_excludedS = 0;
+    int n_missingS = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot load index file!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names: one mapping entry per data column (-1 = ignore column)
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (int i0 = 6 ; i0 < tokens.size() ; i0 ++) {
+        string sid = tokens[i0];
+        if (filter_sample.check(sid)) {
+            mappingS.push_back(findSample(sid));
+            if (mappingS.back() >= 0) n_includedS ++;
+            else n_missingS ++;
+        } else {
+            mappingS.push_back(-1);
+            n_excludedS ++;
+        }
+    }
+    vrb.bullet(stb.str(n_includedS) + " samples included");
+    if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
+    if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
+    if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");
+
+    PCA._xXf.resize(sample_count,__RESIZE_CHUNK__);
+
+    unsigned int linecount = 0;
+    string pChr = "";
+    int pPos = 0;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        stb.split(string(str.s), tokens);
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        string chr = tokens[0];
+        int pos = atoi(tokens[1].c_str()) + 1;
+        if (filter_genotype.check(tokens[3]) && ((pChr == chr && abs(pos - pPos) >= distance_separator) || pChr != chr)) {
+            vector < float > temp(sample_count, 0.0);
+            // Accumulate in floating point: values may be fractional and an
+            // integer accumulator would truncate every addition.
+            double total = 0.0;
+            int count = 0;
+            for (int t = 6 ; t < tokens.size() ; t ++) {
+                if (mappingS[t-6] >= 0) {
+                    if (tokens[t] == "NA") temp[mappingS[t-6]] = bcf_float_missing;
+                    else {
+                        temp[mappingS[t-6]] = stof(tokens[t]);
+                        count+=2;
+                        total+=temp[mappingS[t-6]];
+                    }
+                }
+            }
+            double af = total / (double) count;
+            if (maf_cutoff > af || 1.0-maf_cutoff < af){
+                n_excludedG_user ++;
+                continue;
+            }
+            // Remember the last kept variant; without this the distance filter
+            // above never sees a previous position (same bookkeeping as readDataVCF).
+            pChr = chr;
+            pPos = pos;
+            //data_id.push_back(tokens[3]);
+            //data_chr.push_back(chr);
+            //data_start.push_back(pos);
+            //data_end.push_back(atoi(tokens[2].c_str()));
+            if (n_includedG >= PCA._xXf.cols()) resizeData();
+            for (int i = 0 ; i < temp.size(); i++) PCA._xXf(i,n_includedG) = temp[i];
+            n_includedG++;
+        } else n_excludedG_user ++;
+    }
+
+    //Finalize & verbose
+    free(str.s); // line buffer allocated by hts_getline
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file!");
+    data_count = n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (data_count == 0) vrb.leave("Cannot find variants in target region!");
+    finalizeData(n_includedG);
+}
+
+// Load phenotype values from the tabix-indexed BED file `fbed` into PCA._xXf
+// (one row per sample, one column per retained phenotype).
+// The header line gives the sample names from the 7th column on; phenotypes
+// are kept when they pass filter_phenotype. "NA" cells are stored as
+// bcf_float_missing. Sets data_count and trims the matrix to the kept columns.
+void pca_data::readDataPhenoBED(string fbed) {
+    int n_includedS = 0, n_excludedS = 0, n_excludedU = 0, n_excludedP = 0, n_includedP = 0;
+    vector < int > mappingS;
+
+    //Open BED file
+    vrb.title("Reading phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read header line");
+
+    //Process sample names
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (int t = 6 ; t < tokens.size() ; t ++) {
+        if (filter_sample.check(tokens[t])) {
+            mappingS.push_back(findSample(tokens[t]));
+            if (mappingS.back() < 0) n_excludedS ++;
+            else n_includedS ++;
+        } else {
+            // Keep one mapping entry per data column so that mappingS[t-6]
+            // stays aligned with column t when a sample is excluded by the user.
+            mappingS.push_back(-1);
+            n_excludedU ++;
+        }
+    }
+    vrb.bullet(stb.str(n_includedS) + " samples included");
+    if (n_excludedU > 0) vrb.bullet(stb.str(n_excludedU) + " samples excluded by user");
+    if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples without genotype data");
+    if (n_includedS != sample_count) vrb.error("Cannot find phenotype data for " + stb.str(sample_count - n_includedS) + " samples!");
+    PCA._xXf.resize(sample_count, __RESIZE_CHUNK__);
+
+    //Read phenotypes
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.l && str.s[0] != tbx->conf.meta_char) {
+            stb.split(string(str.s), tokens);
+            // tokens[3] is read below: reject short rows first
+            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+            if (filter_phenotype.check(tokens[3])) {
+                //data_id.push_back(tokens[3]);
+                //data_chr.push_back(tokens[0]);
+                //data_start.push_back(atoi(tokens[1].c_str()) + 1);
+                //data_end.push_back(atoi(tokens[2].c_str()));
+                if (n_includedP >= PCA._xXf.cols()) resizeData();
+                for (int t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) {
+                    if (tokens[t] == "NA") PCA._xXf(mappingS[t-6],n_includedP) = bcf_float_missing;
+                    else PCA._xXf(mappingS[t-6],n_includedP) = stof(tokens[t]);
+                }
+                n_includedP++;
+            } else n_excludedP ++;
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s); // line buffer allocated by hts_getline
+    tbx_destroy(tbx);
+    data_count = n_includedP;
+    vrb.bullet(stb.str(data_count) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (data_count == 0) vrb.leave("Cannot find phenotypes in target region!");
+    hts_close(fp);
+    finalizeData(n_includedP);
+}
+
+
diff --git a/src/mode_quan/quan_chunking.cpp b/src/mode_quan/quan_chunking.cpp
new file mode 100644
index 0000000..e125e56
--- /dev/null
+++ b/src/mode_quan/quan_chunking.cpp
@@ -0,0 +1,42 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+
+// Parse the region string `r` and keep only the gene groups overlapping it.
+// An unparsable region is reported through vrb.error().
+void quan_data::setRegion(string r){
+    if (!region.parse(r)) vrb.error("Unable to parse [" + r +"]");
+
+    vector < quan_gene_grp > kept;
+    for (int idx = 0 ; idx < gene_grps.size() ; idx++) {
+        if (!gene_grps[idx].overlap(region)) continue;
+        kept.push_back(gene_grps[idx]);
+    }
+    gene_grps = kept;
+
+    vrb.bullet("Number of gene groups in [" + region.get() +"] = " + stb.str(gene_grps.size()));
+}
+
+// Keep only the k-th of K consecutive chunks of gene_grps (k is 1-based).
+// Every chunk holds max_length groups except possibly the last one, which
+// receives the remainder. Invalid k/K are reported through vrb.error().
+void quan_data::setChunk(int k, int K){
+    //STEP0: check input values
+    if (K < 1) vrb.error("Number of chunks needs to be > 0");
+    if (K > gene_grps.size()) vrb.error("Number of chunks (" + stb.str(K) + ") is greater than the number of gene groups (" + stb.str(gene_grps.size()) + ")");
+    // k == 0 must be rejected too: (k-1) below would wrap around in unsigned arithmetic
+    if (k < 1) vrb.error("Chunk index needs to be > 0");
+    if (k > K) vrb.error("Chunk index needs to be smaller than or equal to the total number of chunks [=" + stb.str(K) + "]");
+
+    //STEP1: chunk length = size / K when it divides evenly, otherwise the
+    //largest l such that (K-1) full chunks still leave something for the last one
+    unsigned long int max_length =0 ;
+    if (gene_grps.size() % K == 0) max_length = gene_grps.size() / K;
+    else for ( unsigned long int l = 1 ; l * (K-1) < gene_grps.size(); l++ ) max_length = l;
+
+    //STEP2: extract [start_idx, end_idx)
+    unsigned long int start_idx = (k-1) * max_length;
+    unsigned long int end_idx = k * max_length;
+    if (end_idx > gene_grps.size()) end_idx = gene_grps.size();
+    gene_grps = vector < quan_gene_grp > (gene_grps.begin()+start_idx, gene_grps.begin()+end_idx);
+    vrb.bullet("Number of gene groups in chunk [" + stb.str(k) + " / " + stb.str(K) +"] = " + stb.str(gene_grps.size()));
+}
\ No newline at end of file
diff --git a/src/mode_quan/quan_data.h b/src/mode_quan/quan_data.h
new file mode 100644
index 0000000..314e980
--- /dev/null
+++ b/src/mode_quan/quan_data.h
@@ -0,0 +1,277 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _QUAN_DATA_H
+#define _QUAN_DATA_H
+
+#include "../common/data.h"
+
+
+class quan_stats{ // read tallies kept per BAM: one counter per filter outcome
+public:
+ map < string , char > failed; // read name -> one-letter code of the filter that rejected it
+ unsigned long int mapQ,mismatch,unmapped,unpaired,dup,notexon,good,total;
+ double exonic;
+ quan_stats() : mapQ(0), mismatch(0), unmapped(0), unpaired(0), dup(0), notexon(0), good(0), total(0), exonic(0.0) {} // every counter starts at zero
+};
+
+class quan_exon{ // one annotated exon: coordinates, owning gene and a read count per input file
+public:
+ unsigned int start;
+ unsigned int end;
+ unsigned int length;
+ string chr;
+ string name;
+ string gene_id;
+ string gene_name;
+ short int strand;
+ vector < double > read_count;
+ bool merged;
+ quan_exon() : start(0), end(0), length(0), chr("NA"), name("NA"), gene_id("NA"), gene_name("NA"), strand(0), merged(false) {}
+ ~quan_exon(){read_count.clear();}
+ quan_exon(string c , unsigned int s, unsigned int e, string gi, string gn, string st, unsigned int nof) : start(s), end(e), length(e - s + 1), chr(c), name(gi + "_" + stb.str(s) + "_" + stb.str(e)), gene_id(gi), gene_name(gn), strand(st == "-" ? -1 : 1), read_count(nof,0.0), merged(false) {}
+ quan_exon(string c , unsigned int s, unsigned int e, string gi, string gn, string st) : start(s), end(e), length(e - s + 1), chr(c), name(gi + "_" + stb.str(s) + "_" + stb.str(e)), gene_id(gi), gene_name(gn), strand(st == "-" ? -1 : 1), merged(false) {}
+ void allocate(unsigned int nof) { read_count.assign(nof,0.0); }
+
+ void resize(unsigned int s, unsigned int e){ start = s; end = e; merged = true; length = e - s + 1; name = gene_id + "_" + stb.str(s) + "_" + stb.str(e); }
+ void merge(vector < quan_exon > &exs){
+ unsigned int lo = start;
+ unsigned int hi = end;
+ for (size_t k = 0; k < exs.size(); ++k){
+ lo = min(lo, exs[k].start);
+ hi = max(hi, exs[k].end);
+ }
+ resize(lo,hi); // adopt the union of this exon and every exon in exs
+ }
+
+ bool overlap(unsigned int s, unsigned int e){
+ const bool disjoint = (s > end) || (e < start);
+ return !disjoint;
+ }
+
+ bool overlap(string c, unsigned int s, unsigned int e){
+ if (c != chr) return false;
+ return !(s > end || e < start);
+ }
+
+ bool operator < (quan_exon const & p) const {
+ // order by chromosome name first, then by start coordinate
+ const int by_chr = chr.compare(p.chr);
+ if (by_chr != 0) return by_chr < 0;
+ // same chromosome: the smaller start sorts first, equal starts are not "less"
+ return start < p.start;
+ }
+
+ friend ostream& operator<<(ostream& out, const quan_exon& p){
+ const char * sign = (p.strand == 1) ? "+" : "-";
+ return out << p.chr << "\t" << p.start << "\t" << p.end << "\t" << p.name << "\t" << p.length << "\t" << sign << "\t" << p.merged << endl;
+ }
+};
+
+class quan_gene{ // a gene: its merged exons, overall span, TSS and one read count per input file
+public:
+ string ID;
+ string name;
+ string chr;
+ string region; // "chr:start-end", rebuilt at the end of every assign()
+ unsigned int start;
+ unsigned int end;
+ unsigned int tss; // set in assign(): end when strand is -1, start otherwise
+ short int strand; // 0 until assign() copies the strand of an exon
+ unsigned int length; // running sum of the lengths of the exons currently stored
+ vector < double > read_count; // sized by allocate(nof)
+ vector < quan_exon > exons;
+ quan_gene(){chr="NA";name="NA";start=0;end=0;ID="NA";strand = 0;region="NA"; length = 0 ;tss=0;};
+ quan_gene(unsigned int nof){chr="NA";name="NA";start=0;end=0;ID="NA";strand = 0;region="NA"; length = 0 ;tss=0;read_count=vector < double >(nof,0.0);};
+ ~quan_gene(){exons.clear();read_count.clear();}
+ void allocate(unsigned int nof) { // zero nof counters on the gene and on each of its exons
+ read_count=vector < double >(nof,0.0);
+ for (int i = 0 ; i < exons.size(); i++) exons[i].allocate(nof);
+ }
+ void assign(quan_exon &e){ // add exon e to the gene, fusing it with any stored exon it overlaps; e itself may be resized
+ if (strand && e.strand != strand) vrb.error("Strand mismatch"); // each check only applies once the field is set
+ if (chr != "NA" && e.chr != chr) vrb.error("Chr mismatch");
+ if (name != "NA" && e.gene_name != name) vrb.error("Name mismatch");
+ if (ID != "NA" && e.gene_id != ID) vrb.error("ID mismatch");
+ strand = e.strand;
+ name = e.gene_name;
+ chr = e.chr;
+ ID = e.gene_id;
+ if (start == 0 || e.start < start ) start = e.start; // 0 doubles as "not set yet"
+ if (end == 0 || e.end > end) end = e.end;
+ tss = strand == -1 ? end : start;
+ vector < quan_exon > overlaping;
+ vector < int > overlaping_idx;
+ unsigned int tot = 0; // total length of the stored exons that e overlaps
+ sort(exons.begin(), exons.end()); // the range erase below relies on the overlapping exons being adjacent
+ for (int i =0 ; i < exons.size() ; i++){
+ if (exons[i].end >= e.start && exons[i].start <= e.end ){
+ overlaping.push_back(exons[i]);
+ overlaping_idx.push_back(i);
+ tot += exons[i].length;
+ }
+ }
+ if (overlaping.size()){
+ e.merge(overlaping); // grows e (the caller's object) to span every overlapping exon
+ exons.erase(exons.begin() + overlaping_idx[0], exons.begin() + (overlaping_idx.back() + 1)); // removes first..last overlapping index inclusive
+ length -= tot; // the removed exons are re-counted through e.length below
+ }
+ exons.push_back(e);
+ length += e.length;
+ region = chr + ":" + stb.str(start) + "-" + stb.str(end);
+ }
+
+ bool operator < (quan_gene const & p) const { // orders by chr, then by tss; equal tss is not "less"
+ if (chr < p.chr) return true;
+ if (chr > p.chr) return false;
+ if (tss < p.tss) return true;
+ if (tss >= p.tss) return false;
+ return false; // unreachable: the two tss tests above cover every case
+ }
+
+ bool overlap(unsigned int s, unsigned int e){ // closed-interval overlap with [start, end]; chromosome not checked
+ if (s <= end && e >= start) return true;
+ else return false;
+ }
+
+ bool overlap(string c, unsigned int s, unsigned int e){ // same test, additionally requiring c == chr
+ if (c == chr && s <= end && e >= start) return true;
+ else return false;
+ }
+
+ bool overlap(genomic_region &gr){ // same test against gr.chr / gr.start / gr.end
+ if (gr.chr == chr && gr.start <= end && gr.end >= start) return true;
+ else return false;
+ }
+
+ friend ostream& operator<<(ostream& out, quan_gene& g){ // prints the gene line, then one line per exon; note it sorts g.exons in place
+ out << g.chr << "\t" << g.start << "\t" << g.end << "\t" << g.ID << "\t" << g.length << "\t" << (g.strand == 1 ? "+" : "-") << "\t" << 2<<endl;
+ sort(g.exons.begin(), g.exons.end());
+ for (int i =0; i < g.exons.size(); i++ ) out << g.exons[i];
+ return out;
+ }
+
+};
+
+class quan_gene_grp{ // a set of genes handled together, with the span and region string of the whole set
+public:
+ vector < quan_gene > genes;
+ string chr;
+ unsigned int start;
+ unsigned int end;
+ string region;
+ quan_gene_grp() : chr(""), start(0), end(0), region("") {}
+ ~quan_gene_grp(){genes.clear();}
+ void allocate(unsigned int nof){ for (vector < quan_gene >::iterator gene = genes.begin(); gene != genes.end(); ++gene) gene->allocate(nof); }
+ bool overlap(unsigned int s, unsigned int e){
+ // closed intervals [s, e] and [start, end] share at least one position
+ return s <= end && e >= start;
+ }
+
+ bool overlap(string c, unsigned int s, unsigned int e){
+ // same interval test, restricted to this group's chromosome
+ return c == chr && s <= end && e >= start;
+ }
+
+ bool overlap(genomic_region &gr){
+ // same test, taking chromosome and bounds from gr
+ return gr.chr == chr && gr.start <= end && gr.end >= start;
+ }
+};
+
+typedef struct { // auxiliary data structure: htslib handles plus read filters for one open BAM, handed to read_bam() as void*
+ samFile * fp; // the file handle
+ bam_hdr_t * hdr; // the file header
+ hts_itr_t * iter; // NULL if a region not specified; read_bam() then reads sequentially with sam_read1
+ hts_idx_t * idx; // index handle used to build region iterators
+ int min_mapQ; // mapQ filter
+ bool dup_remove; // remove duplicates
+ int max_intron_length; // copied from quan_data; not read by read_bam()
+ double max_mismatch_count; // per-read mismatch limit, negative = filter off
+ double max_mismatch_count_total; // per-pair mismatch limit, negative = filter off
+} aux_tq;
+
+class quan_block{ // the aligned blocks of one read and the weight each block contributes to the counts
+public:
+ vector < unsigned int > starts; // per block: first covered position
+ vector < unsigned int > ends; // per block: last covered position
+ vector < unsigned int > lengths; // per block: number of covered positions
+ vector < double > block_overlap; // per block: weight, lowered by merge() for blocks shared with the mate
+ double total_contribution; // length-weighted mean of block_overlap; 1.0 until merge() runs
+ unsigned int read_length; // divisor used for total_contribution
+ unsigned int mmc; // mismatch count recorded for the read
+ bam1_core_t core;
+ quan_block(){read_length=0;mmc=0; total_contribution = 1.0;}
+ void merge(quan_block &B){ // down-weight the blocks of this read and of B that overlap each other, then refresh both totals
+ for (int i =0 ; i < starts.size(); i++){
+ for (int j =i; j < B.starts.size(); j++){
+ if(starts[i] <= B.ends[j] && ends[i] >= B.starts[j]){
+ unsigned int overlap = min(ends[i], B.ends[j]) - max(starts[i], B.starts[j]) + 1;
+ block_overlap[i] = 0.5 + ((double) (lengths[i] - overlap) / (double) lengths[i] * 0.5); // divide as doubles: the all-integer form truncated the ratio to 0, giving 0.5 for any overlap
+ B.block_overlap[j] = 0.5 + ((double) (B.lengths[j] - overlap) / (double) B.lengths[j] * 0.5); // i.e. shared bases count half, unshared bases count fully
+ break; // only the first overlapping block of B is considered for block i
+ }
+ }
+ }
+ double total = 0.0;
+ for (int i = 0 ; i < block_overlap.size(); i++) total += (double) lengths[i] * block_overlap[i];
+ total_contribution = total / (double) read_length;
+ total = 0.0;
+ for (int i = 0 ; i < B.block_overlap.size(); i++) total += (double) B.lengths[i] * B.block_overlap[i];
+ B.total_contribution = total / (double) B.read_length;
+ }
+ friend ostream& operator<<(ostream& out, quan_block& g){ // one debug line: totals first, then "start,end weight" per block
+ out << g.read_length << "\t" << g.mmc << "\t" << g.core.pos << "\t" << g.core.mpos << "\t" << g.total_contribution;
+ for (int i =0; i < g.starts.size(); i++ ) out << "\t" << g.starts[i] << "," << g.ends[i] << " " << g.block_overlap[i];
+ out << endl;
+ return out;
+ }
+};
+
+class quan_data : public data { // state of the quan mode: annotation, input BAMs, filters and per-sample results
+public:
+ map < string, unsigned int > genes_map;
+ vector < quan_gene > genes;
+ vector < quan_gene_grp > gene_grps; // the units that setRegion/setChunk filter and that the print functions walk
+ vector < string > bams; // values of --bam
+ vector < quan_stats > stats; // one entry per BAM, sized in readBams()
+ vector < string > samples; // names printed as column headers; must match bams in number
+ set < string > gene_types; // values of --gene-types
+
+ //FILTERS
+ int max_intron_length;
+ double max_mismatch_count,max_mismatch_count_total; // negative = filter off; read as a fraction of read length when fraction_mm / fraction_mmt is set
+ unsigned int min_mapQ,max_read_length;
+ bool dup_remove,proper_pair,check_consistency,debug,merge,fraction_mm,fraction_mmt;
+ quan_data(){max_intron_length=0;max_mismatch_count=0.0;max_mismatch_count_total=0.0;min_mapQ=0;max_read_length=0;dup_remove=false;proper_pair=false;check_consistency=false;debug=false;merge=true;fraction_mm=false;fraction_mmt=false;}
+
+ void setChunk(int,int); // keep one chunk (index, total) of gene_grps
+ void setRegion(string); // keep the gene groups overlapping a region string
+ genomic_region region; // parsed by setRegion()
+
+
+ void read_Sample_Names(vector < string > &); // fill samples from --samples tokens or from a file of names
+ void readGTF(string,unsigned int);
+ void groupGenes();
+ void readBams();
+ int read_bam(void *, bam1_t *, quan_stats &, unsigned int &mmc); // fetch the next read that passes the filters; negative when the input is exhausted
+ void printBEDcount(string); // write <prefix>.gene.count.bed and <prefix>.exon.count.bed
+ void printBEDrpkm(string); // write <prefix>.gene.rpkm.bed and <prefix>.exon.rpkm.bed
+ void printStats(string); // write <prefix>.stats
+};
+
+void quan_main(vector < string > & ); // entry point of the quan mode; receives the command-line tokens to parse
+
+#endif /* quan_data_h */
diff --git a/src/mode_quan/quan_main.cpp b/src/mode_quan/quan_main.cpp
new file mode 100644
index 0000000..61d23f0
--- /dev/null
+++ b/src/mode_quan/quan_main.cpp
@@ -0,0 +1,166 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+void quan_main(vector < string > & argv) { // quan mode driver: parse argv, configure a quan_data, read GTF + BAMs, write the outputs
+ quan_data D;
+ //-------------------------
+ // 1. DECLARE ALL OPTIONS
+ //-------------------------
+ D.declareBasicOptions();
+ boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+ opt_files.add_options()
+ ("gtf", boost::program_options::value< string >(), "Annotation in GTF format")
+ ("bam", boost::program_options::value< vector < string > > ()->multitoken(), "Sequence data in BAM/SAM format.")
+ ("samples",boost::program_options::value< vector < string > > ()->multitoken(), "Sample names or a file with sample names. [Optional]")
+ ("out-prefix", boost::program_options::value< string >(), "Output file prefix.");
+
+ boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+ opt_parameters.add_options()
+ ("rpkm", "Print RPKM values.")
+ ("debug", "Print debug info to stderr.")
+ ("gene-types", boost::program_options::value< vector < string > > ()->multitoken(), "Gene types to quantify. (Requires gene_type attribute in GTF. It will also use transcript_type if present).")
+ ("max-read-length", boost::program_options::value< unsigned int >()->default_value(1000), "Group genes separated by this much together. Set this larger than your read length");
+
+ boost::program_options::options_description opt_filters ("\x1B[32mFilters\33[0m");
+ opt_filters.add_options()
+ ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred mapping quality for a read to be considered.")
+ ("filter-mismatch", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum mismatches allowed in a read. If between 0 and 1 taken as the fraction of read length. (Requires NM attribute)")
+ ("filter-mismatch-total", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum total mismatches allowed in paired reads. If between 0 and 1 taken as the fraction of combined read length. (Requires NM attribute)")
+ ("check-proper-pairing", "If provided only properly paired reads according to the aligner that are in correct orientation will be considered. Otherwise all pairs in correct orientation will be considered.")
+ ("check-consistency", "If provided checks the consistency of split reads with annotation, rather than pure overlap of one of the blocks of the split read.")
+ ("no-merge", "If provided overlapping mate pairs will not be merged.")
+ ("filter-remove-duplicates", "Remove duplicate sequencing reads in the process.");
+
+ boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+ opt_parallel.add_options()
+ ("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
+ ("region", boost::program_options::value< string >(), "Region of interest.");
+
+ D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_filters).add(opt_parallel);
+
+ //-------------------
+ // 2. PARSE OPTIONS
+ //-------------------
+ // Parse straight into D.options, which every later step reads.
+ try {
+ boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+ boost::program_options::notify(D.options);
+ } catch ( const boost::program_options::error& e ) {
+ cerr << "Error parsing [quan] command line :" << string(e.what()) << endl;
+ exit(EXIT_FAILURE); // a rejected command line must not report success to the calling shell
+ }
+
+ //---------------------
+ // 3. PRINT HELP/HEADER
+ //---------------------
+ vrb.ctitle("QUANTIFY GENES AND EXONS FROM BAM FILES");
+ if (D.options.count("help")) {
+ cout << D.option_descriptions << endl;
+ exit(EXIT_SUCCESS);
+ }
+
+ //-----------------
+ // 4. COMMON CHECKS
+ //-----------------
+ if (!D.options.count("gtf")) vrb.error("Annotation data needs to be specified with --gtf [file.gtf]");
+ if (!D.options.count("bam")) vrb.error("Sequence data needs to be specified with --bam [file.bam]");
+ if (!D.options.count("out-prefix")) vrb.error("Output prefix needs to be specified with --out-prefix [prefix]");
+
+
+ D.min_mapQ = D.options["filter-mapping-quality"].as < unsigned int > ();
+ vrb.bullet("Minimum mapping quality: " + stb.str(D.min_mapQ));
+ D.max_read_length = D.options["max-read-length"].as < unsigned int > ();
+ vrb.bullet("Maximum read length: " + stb.str(D.max_read_length));
+ double intpart; // scratch output of modf(); only the fractional part is inspected
+
+ D.max_mismatch_count_total = D.options["filter-mismatch-total"].as < double > ();
+ if(D.max_mismatch_count_total >= 0 && modf(D.max_mismatch_count_total, &intpart) != 0.0) { // non-integer value: must be a fraction in (0,1)
+ if(D.max_mismatch_count_total > 1) vrb.error("--filter-mismatch-total cannot be greater than 1 when not an integer");
+ else D.fraction_mmt = true;
+ }
+ if ( D.max_mismatch_count_total >= 0) vrb.bullet("Maximum mismatch count per mate-pair: " + stb.str(D.max_mismatch_count_total));
+
+ D.max_mismatch_count = D.options["filter-mismatch"].as < double > ();
+ if(D.max_mismatch_count >= 0 && modf(D.max_mismatch_count, &intpart) != 0.0) { // same rule for the per-read limit
+ if(D.max_mismatch_count > 1) vrb.error("--filter-mismatch cannot be greater than 1 when not an integer");
+ else D.fraction_mm = true;
+ }
+ if (D.max_mismatch_count < 0 && D.max_mismatch_count_total >= 0 && !D.fraction_mmt) D.max_mismatch_count = D.max_mismatch_count_total; // an absolute pair limit also bounds each read
+ if ( D.max_mismatch_count >= 0) vrb.bullet("Maximum mismatch count per read: " + stb.str(D.max_mismatch_count));
+
+
+
+ if (D.options.count("check-proper-pairing")){
+ vrb.bullet("Checking properly paired flag");
+ D.proper_pair = true;
+ }
+
+ if (D.options.count("check-consistency")){
+ vrb.bullet("Checking if all blocks of a split read are consistent with the annotation");
+ D.check_consistency = true;
+ }
+
+ if (D.options.count("filter-remove-duplicates")){
+ vrb.bullet("Filtering reads flagged as duplicate");
+ D.dup_remove = true;
+ }
+
+ if (D.options.count("no-merge")){
+ vrb.bullet("Not merging overlapping mate pairs");
+ D.merge = false;
+ }
+
+ if (D.options.count("gene-types")){
+ vector < string > t = D.options["gene-types"].as < vector < string > > ();
+ D.gene_types = set < string > (t.begin(),t.end());
+ const char* const delim = " ";
+ ostringstream temp;
+ copy(D.gene_types.begin(), D.gene_types.end(), ostream_iterator<string>(temp, delim));
+ vrb.bullet("Genes included: " + temp.str());
+ }
+
+ if (D.options.count("debug")) D.debug = true;
+
+ int k=1,K=1; // chunk index and chunk total, only used when --chunk is given
+ if (D.options.count("chunk")) {
+ vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+ if (nChunk.size() != 2 || nChunk[0] > nChunk[1]) vrb.error("Incorrect --chunk arguments!");
+ vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
+ k=nChunk[0] , K = nChunk[1];
+ } else if(D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");
+
+ //TO DO CHECK PARAMETER VALUES
+
+
+ //------------------------------------------
+ // 5. READ FILES / INITIALIZE / RUN ANALYSIS
+ //------------------------------------------
+
+ D.processBasicOptions();
+ D.bams = D.options["bam"].as < vector < string > > ();
+ if (D.options.count("samples")) {
+ vector < string > n = D.options["samples"].as <vector < string > > ();
+ D.read_Sample_Names(n);
+ }else D.samples = D.bams; // without --samples the BAM paths serve as sample names
+ D.readGTF(D.options["gtf"].as < string > (),D.bams.size());
+ if (D.options.count("region")) D.setRegion(D.options["region"].as < string >()); // region filter first, chunking is applied to what remains
+ if (D.options.count("chunk")) D.setChunk(k, K);
+ D.readBams();
+ D.printBEDcount(D.options["out-prefix"].as < string > ());
+ if (D.options.count("rpkm")) D.printBEDrpkm(D.options["out-prefix"].as < string > ());
+ D.printStats(D.options["out-prefix"].as < string > ());
+}
diff --git a/src/mode_quan/quan_management.cpp b/src/mode_quan/quan_management.cpp
new file mode 100644
index 0000000..2a23d8d
--- /dev/null
+++ b/src/mode_quan/quan_management.cpp
@@ -0,0 +1,41 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+
+void quan_data::read_Sample_Names(vector <string> &names){ // fill samples from names: nothing given, one name or file, or an explicit list
+ if (names.empty()){
+ samples = bams; // nothing supplied: the BAM paths double as sample names
+ }else if (names.size() > 1){
+ samples = names; // several tokens: take them verbatim
+ }else{
+ input_file fd(names[0]);
+ if (!fd.fail()){ // the lone token opens as a file: expect one sample name per line
+ string line;
+ vector < string > fields;
+ while(getline(fd, line)) {
+ stb.split(line, fields);
+ if (fields.size()!=1) vrb.error("Expecting a single sample name per line in [" + names[0] + "] but got: " + line);
+ samples.push_back(fields[0]);
+ }
+ }else{
+ // Not an openable file: the token itself is the single sample name
+ samples.push_back(names[0]);
+ }
+ fd.close();
+ }
+ if (samples.size() != bams.size()) vrb.error("Sample names does not match with BAM files!");
+}
diff --git a/src/mode_quan/quan_printResults.cpp b/src/mode_quan/quan_printResults.cpp
new file mode 100644
index 0000000..1e75b88
--- /dev/null
+++ b/src/mode_quan/quan_printResults.cpp
@@ -0,0 +1,136 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+
+void quan_data::printBEDcount(string fout){ // write raw read counts to <prefix>.gene.count.bed and <prefix>.exon.count.bed
+ vrb.title("Printing counts");
+ // A trailing "gz"/"bz2" extension on fout is moved behind the fixed file suffixes
+ const string::size_type dot = fout.find_last_of(".");
+ const string tail = fout.substr(dot + 1);
+ string ext ="";
+ string prefix = fout;
+ if (tail == "gz" || tail == "bz2") {
+ ext = "." + tail;
+ prefix = fout.substr(0,dot);
+ }
+ // Both tables are opened before either is checked
+ output_file fdo(prefix+".gene.count.bed" + ext);
+ output_file fdoe(prefix+".exon.count.bed" + ext);
+ if (fdo.fail()) vrb.error("Cannot open file [" + prefix + ".gene.count.bed" + ext + "]");
+ if (fdoe.fail()) vrb.error("Cannot open file [" + prefix + ".exon.count.bed" + ext + "]");
+ fdo.precision(10);
+ fdoe.precision(10);
+ fdo << "#chr\tstart\tend\tgene\tlength\tstrand";
+ fdoe << "#chr\tstart\tend\texon\tgeneID\tstrand";
+ for (size_t s = 0 ; s < samples.size(); ++s) {fdo << "\t" << samples[s]; fdoe << "\t" << samples[s];}
+ fdo<<endl;
+ fdoe<<endl;
+ for (size_t gr = 0; gr < gene_grps.size(); ++gr){
+ for (size_t g = 0 ; g < gene_grps[gr].genes.size(); ++g){
+ quan_gene & G = gene_grps[gr].genes[g];
+ string chr = G.chr;
+ if (chr.compare(0,3,"chr") == 0) chr.erase(0,3); // drop a leading "chr"
+ const char * sign = (G.strand == -1) ? "-" : "+";
+ fdo << chr;
+ fdo << "\t" << G.tss-1 << "\t" << G.tss; // the interval is the single base at the TSS
+ fdo << "\t" << G.ID;
+ fdo << "\t" << G.length << "\t" << sign;
+ for (size_t b = 0 ; b < bams.size(); ++b) fdo << "\t" << G.read_count[b];
+ fdo << endl;
+ for (size_t e = 0 ; e < G.exons.size(); ++e){
+ quan_exon & X = G.exons[e];
+ // exon rows reuse the gene's TSS interval, ID and strand
+ fdoe << chr;
+ fdoe << "\t" << G.tss-1 << "\t" << G.tss;
+ fdoe << "\t" << X.name;
+ fdoe << "\t" << G.ID << "\t" << sign;
+ for (size_t b = 0 ; b < bams.size(); ++b) fdoe << "\t" << X.read_count[b];
+ fdoe << endl;
+ }
+ }
+ }
+}
+
+
+void quan_data::printBEDrpkm(string fout){ // write RPKM values to <prefix>.gene.rpkm.bed and <prefix>.exon.rpkm.bed
+ vrb.title("Printing RPKM");
+ // A trailing "gz"/"bz2" extension on fout is moved behind the fixed file suffixes
+ const string::size_type dot = fout.find_last_of(".");
+ const string tail = fout.substr(dot + 1);
+ string ext ="";
+ string prefix = fout;
+ if (tail == "gz" || tail == "bz2") {
+ ext = "." + tail;
+ prefix = fout.substr(0,dot);
+ }
+ // Both tables are opened before either is checked
+ output_file fdo(prefix+".gene.rpkm.bed" + ext);
+ output_file fdoe(prefix+".exon.rpkm.bed" + ext);
+ if (fdo.fail()) vrb.error("Cannot open file [" + prefix + ".gene.rpkm.bed" + ext + "]");
+ if (fdoe.fail()) vrb.error("Cannot open file [" + prefix + ".exon.rpkm.bed" + ext + "]");
+
+
+ fdo.precision(10);
+ fdoe.precision(10);
+ fdo << "#chr\tstart\tend\tgene\tlength\tstrand";
+ fdoe << "#chr\tstart\tend\texon\tgeneID\tstrand";
+ for (size_t s = 0 ; s < samples.size(); ++s) {fdo << "\t" << samples[s]; fdoe << "\t" << samples[s];}
+ fdo<<endl;
+ fdoe<<endl;
+ for (size_t gr = 0; gr < gene_grps.size(); ++gr){
+ for (size_t g = 0 ; g < gene_grps[gr].genes.size(); ++g){
+ quan_gene & G = gene_grps[gr].genes[g];
+ string chr = G.chr;
+ if (chr.compare(0,3,"chr") == 0) chr.erase(0,3); // drop a leading "chr"
+ const char * sign = (G.strand == -1) ? "-" : "+";
+ fdo << chr;
+ fdo << "\t" << G.tss-1 << "\t" << G.tss; // the interval is the single base at the TSS
+ fdo << "\t" << G.ID;
+ fdo << "\t" << G.length << "\t" << sign;
+ for (size_t b = 0 ; b < bams.size(); ++b) fdo << "\t" << ((G.read_count[b] * 1000.0) / (double) G.length) * (1000000.0 / (double)stats[b].exonic);
+ fdo << endl;
+ for (size_t e = 0 ; e < G.exons.size(); ++e){
+ quan_exon & X = G.exons[e];
+ // exon rows reuse the gene's TSS interval, ID and strand; scaling uses the exon's own length
+ fdoe << chr;
+ fdoe << "\t" << G.tss-1 << "\t" << G.tss;
+ fdoe << "\t" << X.name;
+ fdoe << "\t" << G.ID << "\t" << sign;
+ for (size_t b = 0 ; b < bams.size(); ++b) fdoe << "\t" << ((X.read_count[b] * 1000.0) / (double) X.length) * (1000000.0 / (double)stats[b].exonic);
+ fdoe << endl;
+ }
+ }
+ }
+}
+
+void quan_data::printStats(string fout){ // write the per-sample filter tallies to <prefix>.stats
+ vrb.title("Printing stats");
+ // A trailing "gz"/"bz2" extension on fout is moved behind the ".stats" suffix
+ const string::size_type dot = fout.find_last_of(".");
+ const string tail = fout.substr(dot + 1);
+ string ext ="";
+ string prefix = fout;
+ if (tail == "gz" || tail == "bz2") {
+ ext = "." + tail;
+ prefix = fout.substr(0,dot);
+ }
+ const string path = prefix+".stats" + ext;
+ output_file fdo(path);
+ if (fdo.fail()) vrb.error("Cannot open file [" + path + "]");
+ fdo << "sample\tunmmaped_in_genic_regions\tduplicate_reads_in_genic_regions\tfails_mapQ_in_genic_regions:" << min_mapQ <<"\tunpaired_in_genic_regions\tfails_mismatch_in_genic_regions:" << max_mismatch_count << ":" << max_mismatch_count_total << "\tgood_reads_in_genic_regions\tnot_exonic_in_genic_regions\texonic_in_genic_regions\ttotal_reads_in_genic_regions" << endl;
+ for (size_t s = 0 ; s < samples.size(); ++s) { const quan_stats & S = stats[s]; fdo << samples[s] <<"\t" << S.unmapped << "\t" << S.dup << "\t" << S.mapQ << "\t"<< S.unpaired <<"\t" << S.mismatch << "\t" << S.good << "\t" << S.notexon << "\t" << S.exonic << "\t" << S.total << endl; }
+}
diff --git a/src/mode_quan/quan_readBAM.cpp b/src/mode_quan/quan_readBAM.cpp
new file mode 100644
index 0000000..54a2e07
--- /dev/null
+++ b/src/mode_quan/quan_readBAM.cpp
@@ -0,0 +1,398 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+int quan_data::read_bam(void *data, bam1_t *b, quan_stats &f, unsigned int &mmc) { // load into b the next read that passes every filter; returns the last htslib status (negative once no read is left)
+ aux_tq * aux = (aux_tq*) data; // data is the aux_tq describing the open BAM and its filters
+ int ret;
+
+ while (1) {
+ ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
+ if (ret < 0) break;
+ f.total++;
+ string name = bam_get_qname(b);
+ if (f.failed.count(name)){ // a read with this name was rejected before: reject this one for the same reason
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ switch(f.failed[name]){
+ case 'u': f.unmapped++; break;
+ case 'd': f.dup++; break;
+ case 'm': f.mapQ++; break;
+ case 'p': f.unpaired++; break;
+ case 'M': f.mismatch++; break;
+ }
+ continue;
+ }
+
+ if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL)) { // unmapped, secondary and QC-failed reads all count as 'u'
+ f.failed[name] = 'u';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.unmapped++;
+ continue;
+ }
+
+ if (aux->dup_remove && (b->core.flag & BAM_FDUP)) {
+ f.failed[name] = 'd';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.dup++;
+ continue;
+ }
+
+ if ((int)b->core.qual < aux->min_mapQ) {
+ f.failed[name] = 'm';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.mapQ++;
+ continue;
+ }
+
+ if (b->core.flag & BAM_FPAIRED) {
+ if ((b->core.flag & BAM_FREVERSE) == (b->core.flag & BAM_FMREVERSE) || (b->core.flag & BAM_FMUNMAP) ){
+ f.failed[name] = 'p';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.unpaired++;
+ continue;
+ }
+ if (proper_pair && !(b->core.flag & BAM_FPROPER_PAIR)){
+ f.failed[name] = 'p';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.unpaired++;
+ continue;
+ }
+ }
+
+ if (aux->max_mismatch_count >= 0 || aux->max_mismatch_count_total >= 0){ // the NM tag is needed by the per-read and/or the per-pair filter
+ unsigned int mc = 0; // value of the NM tag, 0 when the tag is absent
+ uint8_t *s = bam_get_aux(b);
+ while (s+4 <= b->data + b->l_data) { // walk the aux fields: 2-byte key, 1-byte type, then the value
+ uint8_t type, key[3];
+ key[0] = s[0]; key[1] = s[1]; key[2] = '\0';
+ string keys((const char*)key);
+ s += 2; type = *s++;
+ if (type == 'A') {
+ if (keys=="NM") mc = *s;
+ ++s;
+ } else if (type == 'C') {
+ if (keys=="NM") mc = *s;
+ ++s;
+ } else if (type == 'c') {
+ if (keys=="NM") mc = *(int8_t*)s;
+ ++s;
+ } else if (type == 'S') {
+ if (s+2 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(uint16_t*)s;
+ s += 2;
+ } else break;
+ } else if (type == 's') {
+ if (s+2 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(int16_t*)s;
+ s += 2;
+ } else break;
+ } else if (type == 'I') {
+ if (s+4 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(uint32_t*)s;
+ s += 4;
+ } else break;
+ } else if (type == 'i') {
+ if (s+4 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(int32_t*)s;
+ s += 4;
+ } else break;
+ } else if (type == 'f') {
+ if (s+4 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(float*)s;
+ s += 4;
+ } else break;
+
+ } else if (type == 'd') {
+ if (s+8 <= b->data + b->l_data) {
+ if (keys=="NM") mc = *(double*)s;
+ s += 8;
+ }else break;
+ } else if (type == 'Z' || type == 'H') {
+ while (s < b->data + b->l_data && *s) s++; // skip to the terminating NUL
+ if (s >= b->data + b->l_data)
+ break;
+ ++s;
+ } else if (type == 'B') {
+ uint8_t sub_type = *(s++);
+ int32_t n;
+ memcpy(&n, s, 4);
+ s += 4; // now points to the start of the array
+ if (s + n >= b->data + b->l_data) // NOTE(review): n is an element count, not a byte count - confirm this bound
+ break;
+ for (int i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
+ if ('c' == sub_type) { ++s; }
+ else if ('C' == sub_type) { ++s; }
+ else if ('s' == sub_type) { s += 2; }
+ else if ('S' == sub_type) { s += 2; }
+ else if ('i' == sub_type) { s += 4; }
+ else if ('I' == sub_type) { s += 4; }
+ else if ('f' == sub_type) { s += 4; }
+ }
+ }
+ }
+ if (aux->max_mismatch_count >= 0 && ((!fraction_mm && mc > aux->max_mismatch_count) || (fraction_mm && (double) mc / (double) (b->core.l_qseq) > aux->max_mismatch_count))){ // guard added: with only the pair filter on, the limit is -1 and "mc > -1" rejected every read
+ f.failed[name] = 'M';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.mismatch++;
+ continue;
+ }
+ mmc = mc; // hand the count back so the caller can apply the per-pair limit
+ }
+ break; // b holds a read that passed every filter
+ }
+ return ret;
+}
+
+ void quan_data::readBams(){ // Counts reads per exon and per gene for every BAM file, one gene group (region) at a time; results go to read_count[bm] and stats[bm].
+ vrb.title("Reading BAMs");
+
+ //ALLOCATE
+ stats = vector < quan_stats >(bams.size());
+ for (int gr = 0 ; gr < gene_grps.size(); gr++) gene_grps[gr].allocate(bams.size());
+
+ for (int bm = 0 ; bm < bams.size(); bm++){
+ string fbam = bams[bm];
+ vrb.bullet(fbam + " [" + stb.str(bm+1) + " / " + stb.str(bams.size()) +"]");
+ aux_tq * data = (aux_tq *) malloc (sizeof(aux_tq)); // per-BAM state handed to read_bam(); freed at the end of this iteration
+ data->min_mapQ = min_mapQ;
+ data->dup_remove = dup_remove;
+ data->max_mismatch_count = max_mismatch_count;
+ data->max_mismatch_count_total = max_mismatch_count_total;
+ data->max_intron_length = max_intron_length;
+ data->fp = sam_open(fbam.c_str(), "r");
+ if (data->fp == 0) vrb.error("Cannot open file! [" + fbam + "]");
+ data->hdr = sam_hdr_read(data->fp);
+ if (data->hdr == 0) vrb.error("Cannot parse header![" + fbam + "]");
+ hts_idx_t *idx = sam_index_load(data->fp, fbam.c_str());
+ if (idx == NULL) vrb.error("Cannot load index![" + fbam + ".bai]");
+ data->idx = idx;
+ quan_stats stat; // counters for this BAM, stored in stats[bm] once all groups are done
+ for (int gr = 0 ; gr < gene_grps.size(); gr++){
+ vrb.progress( ((float) gr + 1.0) / (float) gene_grps.size());
+ data->iter = sam_itr_querys(data->idx, data->hdr, gene_grps[gr].region.c_str()); // set the iterator
+ if (data->iter == NULL) {
+ vrb.warning("Problem jumping to region [" + gene_grps[gr].region + "]");
+ hts_itr_destroy(data->iter);
+ continue;
+ }
+ map < string , quan_block > read_sink; // first-seen mates of paired reads, keyed by read name, waiting for their partner
+ bam1_t *b = bam_init1();
+ int r;
+ unsigned int mmc = 0; // passed to read_bam() for every record and copied into the block below
+ while((r=read_bam((void*)data,b,stat,mmc)) >= 0){
+ string name = bam_get_qname(b);
+ const bam1_core_t *c = &b->core;
+ if (c->n_cigar) { // cigar
+ quan_block B; // aligned blocks of the current record
+ B.mmc = mmc;
+ unsigned int bS = b->core.pos+1; // start of the block being built (core.pos + 1)
+ unsigned int bL = 0; // length of the block being built
+ uint32_t *cigar = bam_get_cigar(b);
+ for (int i = 0; i < c->n_cigar; ++i) {
+ int l = bam_cigar_oplen(cigar[i]);
+ char c = bam_cigar_opchr(cigar[i]);
+ if(c=='S' || c=='D' || c =='H' || c=='P') continue; // these operations do not extend the current block
+ else if (c=='N' && l){ // N closes the current block and skips l positions before the next one
+ B.starts.push_back(bS);
+ B.ends.push_back(bS+bL-1);
+ B.lengths.push_back(bL);
+ B.block_overlap.push_back(1.0);
+ B.read_length+=bL;
+ bS+=bL+l;
+ bL = 0;
+ }else bL += l;
+ }
+ B.starts.push_back(bS); // close the last (or only) block
+ B.ends.push_back(bS+bL-1);
+ B.lengths.push_back(bL);
+ B.block_overlap.push_back(1.0);
+ B.read_length+=bL;
+ B.core = b->core;
+ if (b->core.flag & BAM_FPAIRED){
+ if(read_sink.count(name)){ // mate already seen: A is the stored mate, B the current record
+ quan_block A = read_sink[name];
+ read_sink.erase(name);
+ if (max_mismatch_count_total >= 0 && ((!fraction_mmt && A.mmc + B.mmc > max_mismatch_count_total) || (fraction_mmt && (double) (A.mmc + B.mmc) / (double) (A.core.l_qseq + B.core.l_qseq) > max_mismatch_count_total) )){
+ if (debug){
+ cerr << "M\t" << name<<endl;
+ cerr << "M\t" << name<<endl;
+ }
+ stat.mismatch+=2;
+ continue;
+ }
+ if(A.core.mtid != B.core.tid || (A.core.flag & BAM_FREVERSE) || !(B.core.flag & BAM_FREVERSE) || (B.core.pos < A.core.mpos) ){ // pair sanity check; failures are counted as unpaired
+ stat.failed[name] = 'p';
+ if (debug) cerr << stat.failed[name] << "\t" << name << endl;
+ stat.unpaired++;
+ continue;
+ }
+ if (merge) A.merge(B);
+ stat.good +=2;
+ bool both_found = false; // set once the pair is assigned to at least one gene
+ for (int g = 0 ; g < gene_grps[gr].genes.size(); g++){
+ vector < int > exon_overlap1,exon_overlap2,exon_overlap1_length,exon_overlap2_length,exon_map1,exon_map2;
+ bool all_found1 = true, all_found2 = true , any_found1 = false, any_found2 = false;
+ unsigned long int exon_overlap1_length_total = 0 , exon_overlap2_length_total = 0;
+ for (int i = 0 ; i < A.starts.size() ; i++){ // match every block of mate A against the exons of gene g
+ int idx = -1;
+ if (gene_grps[gr].genes[g].overlap(A.starts[i],A.ends[i])){
+ for (int e = 0 ; e < gene_grps[gr].genes[g].exons.size();e++){
+ if(gene_grps[gr].genes[g].exons[e].overlap(A.starts[i],A.ends[i])){
+ idx = e;
+ any_found1 = true;
+ exon_overlap1.push_back(idx);
+ exon_overlap1_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, A.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, A.starts[i]) + 1);
+ exon_overlap1_length_total += exon_overlap1_length.back();
+ exon_map1.push_back(i);
+ }
+ }
+ }
+ if (idx == -1) all_found1 = false; // this block overlaps no exon of gene g
+ }
+ if (!all_found1 && check_consistency){
+ if (debug){
+ cerr << "NCONS\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << A;
+ cerr << "NCONS\t" << name<<endl;
+ }
+ continue;
+ }
+ if (!any_found1){
+ if (debug){
+ cerr << "NCONS\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << A;
+ cerr << "NCONS\t" << name<<endl;
+ }
+ continue;
+ }
+
+
+ for (int i = 0 ; i < B.starts.size(); i++){ // same matching for mate B
+ int idx = -1;
+ if (gene_grps[gr].genes[g].overlap(B.starts[i],B.ends[i])){
+ for (int e = 0 ; e < gene_grps[gr].genes[g].exons.size(); e++){
+ if(gene_grps[gr].genes[g].exons[e].overlap(B.starts[i],B.ends[i])){
+ idx = e;
+ any_found2 = true;
+ exon_overlap2.push_back(idx);
+ exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
+ exon_overlap2_length_total += exon_overlap2_length.back();
+ exon_map2.push_back(i);
+ }
+ }
+ }
+ if (idx == -1) all_found2 = false;
+ }
+ if (!all_found2 && check_consistency){
+ if (debug){
+ cerr << "NCONS\t" << name<<endl;
+ cerr << "NCONS\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << B;
+ }
+ continue;
+ }
+ if (!any_found2){
+ if(debug){
+ cerr << "NCONS\t" << name<<endl;
+ cerr << "NCONS\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << B;
+ }
+ continue;
+ }
+
+
+ both_found= true;
+ for (int i = 0 ; i < exon_overlap1.size(); i++) { // each exon receives its share of the mate's total overlap length
+ gene_grps[gr].genes[g].exons[exon_overlap1[i]].read_count[bm] += (double)exon_overlap1_length[i] / (double)exon_overlap1_length_total * A.block_overlap[exon_map1[i]];
+ if (debug) cerr << gene_grps[gr].genes[g].exons[exon_overlap1[i]].name << "\t" << name << "\t" <<(double)exon_overlap1_length[i] / (double)exon_overlap1_length_total * A.block_overlap[exon_map1[i]] << endl;
+ }
+ for (int i = 0 ; i < exon_overlap2.size(); i++) {
+ gene_grps[gr].genes[g].exons[exon_overlap2[i]].read_count[bm] += (double)exon_overlap2_length[i] / (double)exon_overlap2_length_total * B.block_overlap[exon_map2[i]];
+ if (debug) cerr << gene_grps[gr].genes[g].exons[exon_overlap2[i]].name << "\t" << name << "\t" <<(double)exon_overlap2_length[i] / (double)exon_overlap2_length_total * B.block_overlap[exon_map2[i]] << endl;
+ }
+ gene_grps[gr].genes[g].read_count[bm]+= A.total_contribution + B.total_contribution;
+ if (debug){
+ cerr << gene_grps[gr].genes[g].ID << "\t" << name << "\t" << A.total_contribution <<endl;
+ cerr << gene_grps[gr].genes[g].ID << "\t" << name << "\t" << B.total_contribution <<endl;
+ }
+
+ }//gene loop
+ if(both_found) stat.exonic+= A.total_contribution + B.total_contribution;
+ else stat.notexon+=2;
+ }else read_sink[name]= B; // first mate of the pair: park it until its partner arrives
+ }else{ // record without the paired flag: handled on its own
+ if (max_mismatch_count_total >= 0 && fraction_mmt && (double) (B.mmc) / (double) (B.core.l_qseq) > max_mismatch_count_total) {
+ stat.mismatch+=1;
+ continue;
+ }
+ stat.good +=1;
+ bool both_found = false;
+ for (int g = 0 ; g < gene_grps[gr].genes.size(); g++){
+ vector < int >exon_overlap2,exon_overlap2_length,exon_map2;
+ int exon_overlap2_length_total = 0;
+ bool all_found2 = true , any_found2 = false;
+ for (int i = 0 ; i < B.starts.size(); i++){
+ int idx = -1;
+ if (gene_grps[gr].genes[g].overlap(B.starts[i],B.ends[i])){
+ for (int e = 0 ; e < gene_grps[gr].genes[g].exons.size(); e++){
+ if(gene_grps[gr].genes[g].exons[e].overlap(B.starts[i],B.ends[i])){
+ idx = e;
+ any_found2 = true;
+ exon_overlap2.push_back(idx);
+ exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
+ exon_overlap2_length_total += exon_overlap2_length.back(); // add the length just pushed; i is the block index, not an index into exon_overlap2_length
+ exon_map2.push_back(i);
+ }
+ }
+ }
+ if (idx == -1) all_found2 = false;
+ }
+ if (!all_found2 && check_consistency) continue;
+ if (!any_found2) continue;
+
+ both_found = true;
+
+ for (int i = 0 ; i < exon_overlap2.size(); i++) {
+ gene_grps[gr].genes[g].exons[exon_overlap2[i]].read_count[bm] += (double)exon_overlap2_length[i] / (double)exon_overlap2_length_total * B.block_overlap[exon_map2[i]];
+ }
+ gene_grps[gr].genes[g].read_count[bm] += B.total_contribution;
+ }//gene loop
+ if(both_found) stat.exonic += B.total_contribution;
+ else stat.notexon+=1;
+ }
+ }//cigar
+ }//while loop
+ bam_destroy1(b);
+ stat.notexon += read_sink.size(); //Orphan mate pairs will not map to the same gene
+ if (debug) for (map < string , quan_block >::iterator it = read_sink.begin() ; it != read_sink.end(); it++) cerr << "O\t" << it->first << endl;
+ read_sink.clear();
+ hts_itr_destroy(data->iter);
+ } //gene_grp floop
+ stat.failed.clear();
+ stats[bm] = stat;
+ bam_hdr_destroy(data->hdr);
+ hts_idx_destroy(data->idx);
+ if (data->fp) sam_close(data->fp);
+ free(data);
+
+ }//bam loop
+
+ }
+
diff --git a/src/mode_quan/quan_readGTF.cpp b/src/mode_quan/quan_readGTF.cpp
new file mode 100644
index 0000000..dd0863f
--- /dev/null
+++ b/src/mode_quan/quan_readGTF.cpp
@@ -0,0 +1,102 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "quan_data.h"
+
+ void quan_data::readGTF(string fgtf, unsigned int nof){ // Reads exon records from a GTF file into genes, then calls groupGenes(); nof is not used in this body.
+ string buffer;
+ vector < string > str;
+
+ vrb.title("Reading exons in [" + fgtf + "]");
+ input_file fd (fgtf);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ int linecount = 0;
+ while(getline(fd, buffer)) {
+ linecount++;
+ if (linecount % 500000 == 0) vrb.bullet(stb.str(linecount) + " lines read");
+ if (buffer[0] == '#') continue; // skip lines starting with '#'
+ stb.split(buffer, str, " \t;"); // tokenize on space, tab and ';'
+ if (str.size() < 10) vrb.error("Incorrect number of columns: " + stb.str(str.size()));
+ if (str.size() % 2 ) vrb.error("Unmatched attributes: " + buffer); // 8 fixed tokens plus key/value pairs must give an even count
+ if (str[2] != "exon") continue; // only exon features are used
+ string chr = str[0];
+ unsigned int start = atoi(str[3].c_str());
+ unsigned int end = atoi(str[4].c_str());
+ string strand = str[6];
+ string gene_type="",gene_id="",gene_name="",trans_type="";
+ for (int i = 8 ; i < str.size(); i+=2 ){ // attributes are read as key (i) / value (i+1) token pairs from token 8 on
+ str[i+1].erase(remove(str[i+1].begin(), str[i+1].end(), '"'), str[i+1].end()); // strip quotes from the value
+ str[i+1].erase(remove(str[i+1].begin(), str[i+1].end(), ';'), str[i+1].end());
+ if (str[i] == "gene_name") gene_name = str[i+1];
+ if (str[i] == "gene_id") gene_id = str[i+1];
+ if (str[i] == "gene_type") gene_type = str[i+1];
+ if (str[i] == "transcript_type") trans_type = str[i+1];
+ }
+ if (gene_id=="") vrb.error("gene_id attribute is required: " + buffer);
+ if (gene_type != "" && gene_types.size() && !gene_types.count(gene_type)) continue; // when gene_types is non-empty, drop exons whose gene_type is not listed
+ if (trans_type != "" && gene_types.size() && !gene_types.count(trans_type)) continue; // transcript_type is checked against the same gene_types set
+ //cerr << gene_id << " " << gene_name << " " << chr << " " << start << " " << end << " " << strand << " " << type << endl;
+ quan_exon E(chr, start, end, gene_id, gene_name, strand);
+ if (!genes_map.count(gene_id)){ // gene_id not seen before: append a new gene and remember its index
+ genes_map[gene_id] = genes.size();
+ genes.push_back(quan_gene());
+ genes.back().assign(E);
+ }else genes[genes_map[gene_id]].assign(E); // otherwise hand the exon to the existing gene
+
+ }
+ groupGenes();
+ //for (int i = 0 ; i < genes.size(); i++) cout << genes[i];
+ }
+
+
+ void quan_data::groupGenes(){ // Sorts genes and packs consecutive ones into gene_grps; a new group starts on a chromosome change or when the gap to the running end exceeds max_read_length. Clears genes and genes_map at the end.
+ vrb.title("Sorting and grouping genes");
+ sort(genes.begin(),genes.end());
+ string pC =""; // chromosome of the group being built ("" before the first gene)
+ unsigned int pE = 0; // largest end seen in the group being built
+ unsigned int pS = UINT_MAX; // smallest start seen in the group being built
+ vector < quan_gene > temp; // genes of the group being built
+ for (int i = 0 ; i < genes.size(); i++){
+ if ( pC != "" && (genes[i].chr != pC || (genes[i].start > pE && genes[i].start - pE > max_read_length))){ // start > pE guard: the difference against unsigned pE would wrap for a gene starting before the running end and wrongly split overlapping genes
+ gene_grps.push_back(quan_gene_grp());
+ gene_grps.back().genes = temp;
+ gene_grps.back().chr = pC;
+ gene_grps.back().start = pS;
+ gene_grps.back().end = pE;
+ gene_grps.back().region = pC + ":" + stb.str(pS) + "-" + stb.str(pE);
+ temp.clear();
+ temp.push_back(genes[i]); // the current gene opens the next group
+ pC = genes[i].chr;
+ pE = genes[i].end;
+ pS = genes[i].start;
+ }else{ // same group: extend its span
+ pC = genes[i].chr;
+ pE = pE < genes[i].end ? genes[i].end : pE;
+ pS = pS < genes[i].start ? pS : genes[i].start;
+ temp.push_back(genes[i]);
+ }
+ }
+ gene_grps.push_back(quan_gene_grp()); // flush the last group; NOTE(review): this also runs when genes is empty, producing a group with no genes
+ gene_grps.back().genes = temp;
+ gene_grps.back().chr = pC;
+ gene_grps.back().start = pS;
+ gene_grps.back().end = pE;
+ gene_grps.back().region = pC + ":" + stb.str(pS) + "-" + stb.str(pE);
+ temp.clear();
+ vrb.bullet("Number of genes = " + stb.str(genes.size()));
+ vrb.bullet("Number of genes groups = " + stb.str(gene_grps.size()));
+ genes.clear();
+ genes_map.clear();
+ }
diff --git a/src/mode_rtc/rtc_chunking.cpp b/src/mode_rtc/rtc_chunking.cpp
new file mode 100644
index 0000000..38355e8
--- /dev/null
+++ b/src/mode_rtc/rtc_chunking.cpp
@@ -0,0 +1,167 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+ bool rtc_data::setPhenotypeRegion(string reg) { // Parses reg into regionPhenotype and returns the result of parse().
+ return regionPhenotype.parse(reg);
+ }
+
+ void rtc_data::deduceGenotypeRegion(int W) { // Sets regionGenotype to regionPhenotype widened by W on each side, using coldspot boundaries where getColdspot() returns an index.
+ regionGenotype.chr = regionPhenotype.chr;
+ int start = regionPhenotype.start - W;
+ if (start < 0) regionGenotype.start = 0; // clamp at 0
+ else{
+ int start_coldspot = getColdspot(regionGenotype.chr,start);
+ if (start_coldspot > 0 ) regionGenotype.start = all_coldspots[start_coldspot].start; // NOTE(review): index 0 falls through to the plain 'start' branch below; confirm this is intended
+ else if (start_coldspot == -1) regionGenotype.start = (coldspot_bins_p[regionGenotype.chr].rbegin()->second).back()->end; // -1: use the end of the last coldspot in the last bin of this chromosome
+ else regionGenotype.start = start;
+ }
+ int end = regionPhenotype.end + W;
+ int end_coldspot = getColdspot(regionGenotype.chr,end);
+ if (end_coldspot > 0 ) regionGenotype.end = all_coldspots[end_coldspot].end;
+ else if (end_coldspot == -1) regionGenotype.end = end + 1000000000; // -1: push the end far beyond 'end'
+ else regionGenotype.end = end;
+ }
+
+ class pgroup { // Closed interval [start,end] on a chromosome; supports merging, overlap testing and ordering.
+ public:
+ int start, end;
+ string chr;
+
+ pgroup(string pc, int ps, int pe) {
+ chr = pc;
+ start = ps;
+ end = pe;
+ }
+
+ void merge(int ps, int pe) { // grow this interval so that it also covers [ps,pe]
+ if (start > ps) start = ps;
+ if (end < pe) end = pe;
+ }
+
+ void merge(pgroup & p) { // grow this interval so that it also covers p; chromosomes are not compared here
+ if (start > p.start) start = p.start;
+ if (end < p.end) end = p.end;
+ }
+
+ bool overlap(pgroup & p) { // true when both intervals are on the same chromosome and intersect (touching endpoints count)
+ if (chr != p.chr) return false;
+ //cout << start << " " << end << " vs " << p.start << " " << p.end;
+ if (start <= p.end && p.start <= end) {
+ //cout << " Y" << endl;
+ return true;
+ } else {
+ //cout << " N" << endl;
+ return false;
+ }
+ }
+
+ bool operator < (pgroup const & p) const { // order by chromosome name, then by start
+ if (chr < p.chr) return true;
+ if (chr > p.chr) return false;
+ if (start < p.start) return true;
+ if (start >= p.start) return false;
+ return false; // not reached: one of the two start comparisons above always returns
+ }
+ };
+
+ void rtc_data::setPhenotypeRegion(int k, int K) { // Picks chunk k (0-based index into the clusters) out of K: builds per-chromosome clusters of phenotype intervals, splits them until K exist, and stores the span of cluster k in regionPhenotype.
+ //STEP0: check input values
+ if (K < 1) vrb.error("Number of chunks needs to be > 0");
+ if (K > phenotype_count) vrb.error("Number of chunks (" + stb.str(K) + ") is greater than the number of phenotypes (" + stb.str(phenotype_count) + ")");
+ if (k < 0) vrb.error("Chunk index needs to be > 0");
+ if (k >= K) vrb.error("Chunk index needs to be smaller than the total number of chunks [=" + stb.str(K) + "]");
+
+ //STEP1: regroup by group
+ vector < pgroup > v_pgroup;
+ if (phenotype_grp.size() > 0) { // groups given: one interval per group ID, widened to cover all its phenotypes
+ map < string, int > grp2idx;
+ map < string, int > :: iterator it_grp2idx;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ it_grp2idx = grp2idx.find (phenotype_grp[p]);
+ if (it_grp2idx == grp2idx.end()) {
+ grp2idx.insert(pair < string, int > (phenotype_grp[p], v_pgroup.size()));
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ } else v_pgroup[it_grp2idx->second].merge(phenotype_start[p], phenotype_end[p]);
+ }
+ } else { // no groups: one interval per phenotype
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ }
+ }
+ sort(v_pgroup.begin(), v_pgroup.end());
+
+ //STEP2: merge overlapping groups
+ stack < pgroup > s_pgroup;
+ s_pgroup.push(v_pgroup[0]);
+ for (int i = 1 ; i < v_pgroup.size(); i++) {
+ pgroup ptop = s_pgroup.top(); // copy of the top; pushed back below after merging
+ if (!ptop.overlap(v_pgroup[i])) s_pgroup.push(v_pgroup[i]);
+ else {
+ ptop.merge(v_pgroup[i]);
+ s_pgroup.pop();
+ s_pgroup.push(ptop);
+ }
+ }
+ v_pgroup.clear();
+ while (!s_pgroup.empty()) { // the stack is emptied in reverse order, hence the sort that follows
+ v_pgroup.push_back(s_pgroup.top());
+ s_pgroup.pop();
+ }
+ sort(v_pgroup.begin(), v_pgroup.end());
+
+ //STEP3: build one cluster per chromosome
+ vector < vector < int > > cluster_idx;
+ map < string , int > chr2idx;
+ for (int p = 0 ; p < v_pgroup.size() ; p ++) {
+ map < string , int > :: iterator it_chr2idx = chr2idx.find(v_pgroup[p].chr);
+ if (it_chr2idx == chr2idx.end()) {
+ chr2idx.insert(make_pair(v_pgroup[p].chr, cluster_idx.size()));
+ cluster_idx.push_back(vector < int > (1, p));
+ } else cluster_idx[it_chr2idx->second].push_back(p);
+ }
+
+ //STEP4: split until number of chunks is reached
+ bool done = (cluster_idx.size() >= K);
+ while (!done) {
+
+ int max_idx = -1, max_val = 1; // max_val starts at 1 so only clusters with more than one element can be chosen
+ for (int p = 0 ; p < cluster_idx.size() ; p ++) {
+ if (cluster_idx[p].size() > max_val) {
+ max_val = cluster_idx[p].size();
+ max_idx = p;
+ }
+ }
+
+ if (max_idx >= 0) { // split the largest cluster at its midpoint; the upper half becomes a new cluster
+ int max_mid = cluster_idx[max_idx].size() / 2;
+ cluster_idx.push_back(vector < int > (cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end()));
+ cluster_idx[max_idx].erase(cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end());
+ if (cluster_idx.size() >= K) done = true;
+ } else done = true; // nothing left to split
+ }
+
+ //STEP5: extract coordinates
+ if (k < cluster_idx.size()) {
+ regionPhenotype.chr = v_pgroup[cluster_idx[k][0]].chr;
+ regionPhenotype.start = 1000000000;
+ regionPhenotype.end = 0;
+ for (int c = 0 ; c < cluster_idx[k].size() ; c ++) { // take the min start and max end over the cluster
+ if (v_pgroup[cluster_idx[k][c]].start < regionPhenotype.start) regionPhenotype.start = v_pgroup[cluster_idx[k][c]].start;
+ if (v_pgroup[cluster_idx[k][c]].end > regionPhenotype.end) regionPhenotype.end = v_pgroup[cluster_idx[k][c]].end;
+ }
+ } else vrb.leave("Empty chunk, no data to process!"); // fewer clusters than k+1 could be built
+ }
diff --git a/src/mode_rtc/rtc_collapse_phenotypes.cpp b/src/mode_rtc/rtc_collapse_phenotypes.cpp
new file mode 100644
index 0000000..fc77858
--- /dev/null
+++ b/src/mode_rtc/rtc_collapse_phenotypes.cpp
@@ -0,0 +1,83 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+
+ void rtc_data::collapsePhenotypes() { // Fills group_idx with the phenotype indices of each group and, for GRP_MEAN / GRP_PCA1, collapses every multi-phenotype group into its first phenotype.
+ group_idx.clear();
+
+ //PASS0: check that groups are specified for aggragation methods
+
+
+
+ //PASS1: regroup phenotypes by group ID
+ //map < string, unsigned int > group_id;
+ map < string, unsigned int >::iterator group_it;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (grp_mode != GRP_NONE) {
+ group_it = group_id.find(phenotype_grp[p]);
+ if (group_it == group_id.end()) { // group ID not seen before: open a new group holding p
+ group_idx.push_back(vector < unsigned int > (1, p));
+ group_var.push_back(1.0);
+ group_size.push_back(1);
+ group_id.insert(pair < string, unsigned int > (phenotype_grp[p], group_id.size()));
+ } else {
+ group_idx[group_it->second].push_back(p);
+ group_size[group_it->second]++;
+ }
+ } else { // GRP_NONE: every phenotype is its own group
+ group_idx.push_back(vector < unsigned int > (1, p));
+ group_var.push_back(1.0);
+ group_size.push_back(1);
+ }
+ }
+
+ //PASS2: sort & stats
+ basic_stats bspg; // collects the group sizes for the summary printed below
+ for (int g = 0 ; g < group_idx.size() ; g ++) {
+ sort(group_idx[g].begin(), group_idx[g].end());
+ bspg.push(group_idx[g].size());
+ }
+ if (grp_mode != GRP_NONE) {
+ vrb.title("Regrouping phenotypes within groups");
+ vrb.bullet("#phenotypes = " + stb.str(phenotype_count));
+ vrb.bullet("#groups = " + stb.str(group_idx.size()));
+ vrb.bullet("#phenotypes per group = " + stb.str(bspg.mean(), 2) + " +/-" + stb.str(bspg.sd(), 2));
+ }
+
+ //PASS3: pca1 and mean
+ basic_stats bsvg; // collects group_var of the groups collapsed in GRP_PCA1 mode
+ for (int g = 0 ; g < group_idx.size() ; g ++) {
+ if (group_idx[g].size() > 1) {
+ if (grp_mode == GRP_MEAN) {
+ for (int s = 0 ; s < sample_count ; s ++) {
+ for (int p = 1 ; p < group_idx[g].size() ; p ++) phenotype_val[group_idx[g][0]][s] += phenotype_val[group_idx[g][p]][s]; // sum the group's values into its first phenotype
+ phenotype_val[group_idx[g][0]][s] /= group_idx[g].size(); // then divide by the group size
+ }
+ group_idx[g].erase(group_idx[g].begin() + 1, group_idx[g].end()); // keep only the first (collapsed) phenotype
+ } else if (grp_mode == GRP_PCA1) {
+ pca P (sample_count, group_idx[g].size());
+ P.fill(phenotype_val, group_idx[g]);
+ P.run(false, true, true);
+ P.get(0, phenotype_val[group_idx[g][0]]); // presumably writes component 0 into the group's first phenotype; confirm against pca::get
+ group_var[g] = P.getVariance(0);
+ bsvg.push(group_var[g]);
+ group_idx[g].erase(group_idx[g].begin() + 1, group_idx[g].end());
+ }
+ } else if (grp_mode == GRP_PCA1) group_var[g] = 1.0; // single-phenotype group
+ }
+ if (grp_mode == GRP_PCA1) vrb.bullet("variance explained by PC1 per group = " + stb.str(bsvg.mean(), 3) + " +/-" + stb.str(bsvg.sd(), 3));
+ }
diff --git a/src/mode_rtc/rtc_common.cpp b/src/mode_rtc/rtc_common.cpp
new file mode 100644
index 0000000..10dc2a6
--- /dev/null
+++ b/src/mode_rtc/rtc_common.cpp
@@ -0,0 +1,492 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+ void rtc_data::createTransLists(){ // Adds to filter_phenotype / filter_genotype the phenotypes and coldspot variants referenced by the pairs in pheno_eqtls.
+ vrb.title("Creating a list of variants and phenotypes for trans analyses");
+ unsigned int count = 0; // number of pairs that pass the test below
+ map < string ,vector < pairsToTestForRTC > >::iterator it;
+ for (it = pheno_eqtls.begin(); it != pheno_eqtls.end(); it++){
+ for (int p = 0 ; p < it->second.size(); p++){
+ filter_phenotype.addInclusion(phenotype_id[it->second[p].pheno_idx]); // added for every pair, including those skipped below
+ vector < int > genotype_idx_to_test;
+ if (it->second[p].eqtl_snp_coldspot_idx >= 0 && it->second[p].test_snp_coldspot_idx >= 0 && genotype_chr[it->second[p].test_snp_idx] == genotype_chr[it->second[p].eqtl_snp_idx] && (it->second[p].eqtl_snp_coldspot_idx == it->second[p].test_snp_coldspot_idx || it->second[p].Dprime >= Dprime_cutoff)){ // both variants in a coldspot, same chromosome, and either the same coldspot or Dprime >= cutoff
+ int si,ei; // smaller and larger of the two coldspot indices
+ if (it->second[p].eqtl_snp_coldspot_idx < it->second[p].test_snp_coldspot_idx){
+ si = it->second[p].eqtl_snp_coldspot_idx;
+ ei = it->second[p].test_snp_coldspot_idx;
+ }else{
+ ei = it->second[p].eqtl_snp_coldspot_idx;
+ si = it->second[p].test_snp_coldspot_idx;
+ }
+ for (int csi = si ; csi <= ei; csi++){ // collect the variants of every coldspot from si to ei inclusive
+ genotype_idx_to_test.insert(genotype_idx_to_test.end(),all_coldspots[csi].coldspot_variant_idx.begin(),all_coldspots[csi].coldspot_variant_idx.end());
+ }
+ }else{
+ continue;
+ }
+ count++;
+ for (int i = 0 ; i < genotype_idx_to_test.size(); i ++) filter_genotype.addInclusion(genotype_id[genotype_idx_to_test[i]]);
+ }
+ }
+ vrb.bullet(stb.str(count) + " actual RTC calculations found.");
+ }
+
+
+ vector <double> rtc_data::getDprimeRsquare(string chr1, int pos1, string chr2 , int pos2, string al1, string al2, int idx1 , int idx2){ // Returns {D', r^2} for two variants; an entry stays -9 when it cannot be computed.
+ vector <double > result(2,-9.0);
+
+ if(!calculate_Dprime_R2){ // D' disabled: only r^2 from getRsquare(), and only when both indices are valid
+ if ( idx1 >= 0 && idx2 >= 0) result[1] = getRsquare(idx1,idx2);
+ return result;
+ }
+
+ if (chr1 == chr2 && pos1 == pos2){ // same site
+ result = {1.0,1.0};
+ return result;
+ }
+ vector < string > geno1,geno2;
+ string alleles1,alleles2;
+ string region1 = chr1 + ":" + stb.str(pos1) + "-" + stb.str(pos1);
+ string region2 = chr2 + ":" + stb.str(pos2) + "-" + stb.str(pos2);
+ if (DprimeR2inMem >= 2){ // cache lookup, in either key order
+ if (DprimeRsquareSink.count(region1) && DprimeRsquareSink[region1].count(region2) ) return DprimeRsquareSink[region1][region2];
+ if (DprimeRsquareSink.count(region2) && DprimeRsquareSink[region2].count(region1) ) return DprimeRsquareSink[region2][region1]; // inner lookup must use region2; indexing [region1] here tested the wrong key and inserted an empty entry
+ }
+ int s1 = readGenotypesVCFStats(region1, alleles1, geno1);
+ int s2 = readGenotypesVCFStats(region2, alleles2, geno2);
+ //if (!s1) vrb.warning(region1 + " does not exist in [" +stats_vcf_file +"]");
+ //if (!s2) vrb.warning(region2 + " does not exist in [" +stats_vcf_file +"]");
+ if (s1 == 0) unfound_regions.insert(region1); // status 0 is recorded as "unfound", -1 as "unphased"
+ if (s2 == 0) unfound_regions.insert(region2);
+ if (s1 == -1) unphased_regions.insert(region1);
+ if (s2 == -1) unphased_regions.insert(region2);
+ if ((s1 == -1 || s2 == -1 )){
+ if(!options.count("individual-Dprime")){ // without this option, the first unphased site switches D' off for all later calls
+ calculate_Dprime_R2 = false;
+ vrb.warning("Unphased genotypes encounted stopping D' calculations");
+ }
+ if ( idx1 >= 0 && idx2 >= 0) result[1] = getRsquare(idx1,idx2);
+ }
+ if (s1 <= 0 || s2 <= 0){ // either site unusable: return r^2 only, D' stays -9
+ if ( idx1 >= 0 && idx2 >= 0) result[1] = getRsquare(idx1,idx2);
+ if (DprimeR2inMem >= 2) DprimeRsquareSink[region1][region2] = result;
+ return result;
+ }
+ //CALCULATE D' AND R2 TAKEN FROM VCFTOOLS variant_file_output.cpp
+ double x11=0, x12=0, x21=0, x22=0; // haplotype counts: first index for site 1, second for site 2; 1 = allele 0, 2 = any other allele
+ double X=0, X2=0, Y=0, Y2=0, XY=0;
+ double sx, sy;
+ double rel_x11, p1, p2, q1, q2, Dmax;
+ double var1, var2, cov12;
+ double r2,D, Dprime;
+ int chr_count = 0; // number of haplotypes counted
+ int allele1, allele2;
+ for ( int i = 0 ; i < geno1.size(); i++){
+ if (geno1[i] == "NA" || geno2[i] == "NA") continue;
+ int g1a1 = geno1[i][0] - '0'; // characters 0 and 1 of each genotype string are read as the two allele digits
+ int g1a2 = geno1[i][1] - '0';
+ int g2a1 = geno2[i][0] - '0';
+ int g2a2 = geno2[i][1] - '0';
+ for (unsigned int c=0; c<2; c++){ // one pass per haplotype
+ if (c==0){
+ allele1 = g1a1;
+ allele2 = g2a1;
+ }else{
+ allele1 = g1a2;
+ allele2 = g2a2;
+ }
+
+ if ((allele1 < 0) || (allele2 < 0))
+ continue;
+
+ if (allele1 == 0 && allele2 == 0){
+ x11++;
+ } else if (allele1 == 0 && allele2 != 0){
+ x12++;
+ } else if (allele1 != 0 && allele2 == 0){
+ x21++;
+ } else { // (allele1 !=0 && allele2 != 0)
+ x22++;
+ }
+
+ sx=0, sy=0; // indicators of allele 0 at each site
+ if (allele1 == 0)
+ sx += 1;
+
+ if (allele2 == 0)
+ sy += 1;
+
+ X += sx; Y += sy;
+ XY += sx*sy;
+ sx *= sx; sy *= sy;
+ X2 += sx;
+ Y2 += sy;
+
+ chr_count++;
+ }
+
+ }
+ rel_x11 = x11/double(chr_count);
+ p1 = (x11 + x12)/double(chr_count);
+ p2 = (x21 + x22)/double(chr_count);
+ q1 = (x11 + x21)/double(chr_count);
+ q2 = (x12 + x22)/double(chr_count);
+ D = rel_x11 - p1*q1;
+ if (D < 0)
+ Dmax = min(p1*q1,p2*q2);
+ else
+ Dmax = min(p1*q2,p2*q1);
+ Dprime = D/Dmax;
+
+ X /= chr_count; X2 /= chr_count;
+ Y /= chr_count; Y2 /= chr_count;
+ XY /= chr_count;
+ var1 = X2 - X*X;
+ var2 = Y2 - Y*Y;
+ cov12 = XY - X*Y;
+ //if (var1 == 0) vrb.warning(region1 + " has zero variance in [" +stats_vcf_file +"]");
+ //if (var2 == 0) vrb.warning(region2 + " has zero variance in [" +stats_vcf_file +"]");
+ if (var1 == 0) no_variance_regions.insert(region1);
+ if (var2 == 0) no_variance_regions.insert(region2); // var2 belongs to the second site, so region2 is the one to record
+ if (var1 == 0 || var2 == 0){ // a monomorphic site: both statistics stay at -9
+ if (DprimeR2inMem >= 2) DprimeRsquareSink[region1][region2] = result;
+ return result;
+ }
+ r2 = cov12 * cov12 / (var1 * var2);
+ result = {abs(Dprime),r2};
+ if (DprimeR2inMem >= 2) DprimeRsquareSink[region1][region2] = result;
+ if (al1 != "" && alleles1 != al1) unmatched_alleles.insert(region1 + " " + al1 +" "+ alleles1); // record allele strings that differ from the ones supplied by the caller
+ if (al2 != "" && alleles2 != al2) unmatched_alleles.insert(region2 + " " + al2 +" "+ alleles2);
+ return result;
+
+ }
+
+
+ void rtc_data::printPTTFR(){ // Writes every pair stored in pheno_eqtls to cout, each prefixed with its phenotype ID.
+ map < string ,vector < pairsToTestForRTC > >::iterator it;
+ for (it = pheno_eqtls.begin() ; it != pheno_eqtls.end(); it++)
+ for(int i = 0 ; i < (it->second).size() ; i++)
+ cout << phenotype_id[it->second[i].pheno_idx] << " " << (it->second)[i];
+ }
+
+ void rtc_data::mapVariantsToColdspots(){ // Appends each genotype index to the coldspot whose [start,end] contains the genotype's start position.
+ vrb.title("Mapping variants to coldspots");
+ for (int g = 0 ; g < genotype_count; g++ ){
+ if ((g+1) % 100000 == 0 ) vrb.bullet(stb.str(g+1) + " genotypes mapped");
+ if (coldspot_bins_p.find(genotype_chr[g]) != coldspot_bins_p.end()){ // genotypes on chromosomes without coldspot bins are skipped
+ int max = (coldspot_bins_p[genotype_chr[g]].rbegin()->second).back()->end; // end of the last coldspot in the last bin of this chromosome
+ if (genotype_start[g] > max){
+ //coldspot_end_idx[genotype_chr[g]].push_back(g);
+ //continue;
+ vrb.error("Genotype " + genotype_id[g] + " at " + genotype_chr[g] + " " + stb.str(genotype_start[g]) + " is outside the last coldspot on the chr [" + stb.str(max) + "]");
+ }
+ int bin = genotype_start[g] / bin_size; // bin that holds this position
+ if (coldspot_bins_p[genotype_chr[g]].find(bin) != coldspot_bins_p[genotype_chr[g]].end()){
+ for (int c = 0 ; c < coldspot_bins_p[genotype_chr[g]][bin].size(); c++){
+ if (genotype_start[g] >= coldspot_bins_p[genotype_chr[g]][bin][c]->start && genotype_start[g] <= coldspot_bins_p[genotype_chr[g]][bin][c]->end ){
+ coldspot_bins_p[genotype_chr[g]][bin][c]->coldspot_variant_idx.push_back(g);
+ break; // a variant is assigned to the first matching coldspot only
+ }
+ }
+ }
+ }
+ }
+ vrb.bullet(stb.str(genotype_count) + " genotypes mapped to coldspots");
+ }
+
+void rtc_data::calculateRTC(string fout){
+ vrb.title("Calculating RTC");
+ output_file fdo (fout);
+ if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
+ if (options.count("header") || (!options.count("chunk") && !options.count("region"))){
+ fdo <<"other_variant our_variant phenotype phenotype_group other_variant_chr other_variant_start other_variant_rank our_variant_chr our_variant_start our_variant_rank phenotype_chr phenotype_start distance_between_variants distance_between_other_variant_and_pheno other_variant_region_index our_variant_region_index region_start region_end variant_count_in_region RTC D' r^2";
+ if (options.count("sample")) fdo << " p_value unique_picks_H0 unique_picks_H1 rtc_bin_start rtc_bin_end rtc_bin_H0_proportion rtc_bin_H1_proportion median_r^2 median_H0 median_H1 H0 H1";
+ fdo << endl;
+ }
+ map < string ,vector < pairsToTestForRTC > >::iterator it;
+#ifdef __INTERVAL_CENTRIC_RTC
+ //Transform to coldspot centric
+ vrb.title("Transforming to interval centric view");
+ map < int , map <int, map <string, vector < pairsToTestForRTC > > > > cs_transform;
+ map < int , map <int, map <string, vector < pairsToTestForRTC > > > >::iterator sit;
+ map <int, map <string, vector < pairsToTestForRTC > > >::iterator eit;
+ for (it = pheno_eqtls.begin(); it != pheno_eqtls.end(); it++){
+ for (int p = 0 ; p < it->second.size(); p++){
+ if (it->second[p].eqtl_snp_coldspot_idx >= 0 && it->second[p].test_snp_coldspot_idx >= 0 && genotype_chr[it->second[p].test_snp_idx] == genotype_chr[it->second[p].eqtl_snp_idx] && (it->second[p].eqtl_snp_coldspot_idx == it->second[p].test_snp_coldspot_idx || it->second[p].Dprime >= Dprime_cutoff)){
+ int si,ei;
+ if (it->second[p].eqtl_snp_coldspot_idx < it->second[p].test_snp_coldspot_idx){
+ si = it->second[p].eqtl_snp_coldspot_idx;
+ ei = it->second[p].test_snp_coldspot_idx;
+ }else{
+ ei = it->second[p].eqtl_snp_coldspot_idx;
+ si = it->second[p].test_snp_coldspot_idx;
+ }
+ cs_transform[si][ei][it->first].push_back(it->second[p]);
+ }
+ }
+ }
+ unsigned int si_count =1;
+ unsigned int done = 0;
+ map < string ,vector < pairsToTestForRTC > >().swap(pheno_eqtls);
+ for (sit = cs_transform.begin() ; sit != cs_transform.end(); sit++){
+ //Deallocate memory
+ if (DprimeR2inMem == 2){
+ unordered_map < int , vector < float > >().swap(genotypeSink);
+ //unordered_map < int , unordered_map < int , double > >().swap(RsquareSink);
+ vector <float>().swap(RsquareSink);
+ }
+ for (eit = sit->second.begin() ; eit != sit->second.end(); eit++){
+ vrb.title("Processing intervals starting with index[" + stb.str(sit->first) + "] " + stb.str(si_count) + " / " + stb.str(cs_transform.size()) );
+ si_count++;
+ //if (sit->first != 24837) continue;
+ unsigned int count_pheno = 1;
+ pheno_eqtls = eit->second;
+ for (it = pheno_eqtls.begin(); it != pheno_eqtls.end(); it++){
+ //Deallocate memory
+ vrb.bullet("Processing phenotype [" + it->first + "] " + stb.str(count_pheno) + " / " + stb.str(pheno_eqtls.size()) );
+ vrb.bullet(stb.str(it->second.size()) + " pairwise tests");
+ count_pheno++;
+ for (int p = 0 ; p < it->second.size(); p++){
+ fdo << genotype_id[it->second[p].test_snp_idx] << " " ;
+ fdo << genotype_id[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << phenotype_id[it->second[p].pheno_idx] << " ";
+ fdo << it->first << " ";
+ fdo << genotype_chr[it->second[p].test_snp_idx] << " " ;
+ fdo << genotype_start[it->second[p].test_snp_idx] << " " ;
+ fdo << it->second[p].test_snp_rank << " ";
+ fdo << genotype_chr[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << genotype_start[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << it->second[p].eqtl_snp_rank << " ";
+ fdo << phenotype_chr[it->second[p].pheno_idx] << " ";
+ fdo << phenotype_start[it->second[p].pheno_idx] << " ";
+ fdo << abs(genotype_start[it->second[p].test_snp_idx] - genotype_start[it->second[p].eqtl_snp_idx]) << " ";
+ if (genotype_chr[it->second[p].test_snp_idx] == phenotype_chr[it->second[p].pheno_idx]) fdo << abs(genotype_start[it->second[p].test_snp_idx] - phenotype_start[it->second[p].pheno_idx]) << " ";
+ else fdo << "NA ";
+ fdo << it->second[p].test_snp_coldspot_idx << " " ;
+ fdo << it->second[p].eqtl_snp_coldspot_idx << " " ;
+ vector < int > genotype_idx_to_test;
+ int pI = options.count("debug") ? it->second[p].pheno_idx : -1 ;
+ int si,ei;
+ if (it->second[p].eqtl_snp_coldspot_idx < it->second[p].test_snp_coldspot_idx){
+ si = it->second[p].eqtl_snp_coldspot_idx;
+ ei = it->second[p].test_snp_coldspot_idx;
+ }else{
+ ei = it->second[p].eqtl_snp_coldspot_idx;
+ si = it->second[p].test_snp_coldspot_idx;
+ }
+ fdo << all_coldspots[si].start << " ";
+ fdo << all_coldspots[ei].end << " ";
+ for (int csi = si ; csi <= ei; csi++){
+ genotype_idx_to_test.insert(genotype_idx_to_test.end(),all_coldspots[csi].coldspot_variant_idx.begin(),all_coldspots[csi].coldspot_variant_idx.end());
+ }
+ fdo << genotype_idx_to_test.size() << " ";
+ if (it->second[p].test_snp_idx == it->second[p].eqtl_snp_idx){
+ if (sample_iterations > 0) fdo << "1 1 1 0 NA NA 1 1 0 1 NA NA NA NA NA" << endl;
+ else fdo << "1 1 1" << endl;
+ done++;
+ continue;
+ }
+
+ string extra;
+ if (it->second[p].other_conditional_signal_idx.size()){
+ extra = ", correcting for =";
+ for (int i = 0 ; i < it->second[p].other_conditional_signal_idx.size(); i++) extra += " " + genotype_id[it->second[p].other_conditional_signal_idx[i]];
+ }
+ vrb.bullet("Testing " + genotype_id[it->second[p].test_snp_idx] + " x " + genotype_id[it->second[p].eqtl_snp_idx] + " x " + phenotype_id[it->second[p].pheno_idx] + ", " + genotype_id[it->second[p].eqtl_snp_idx] + " rank = " + stb.str(it->second[p].eqtl_snp_rank) + extra + ", #variants = " + stb.str(genotype_idx_to_test.size()));
+ done++;
+ vector < double > corrs(genotype_idx_to_test.size());
+ double test_snp_corr = 0.0 ;
+ vector < float > genotype_eqtl;
+ if (genotypeSink.count(it->second[p].eqtl_snp_idx)){
+ genotype_eqtl = genotypeSink[it->second[p].eqtl_snp_idx];
+ }else{
+ genotype_eqtl = genotype_val[it->second[p].eqtl_snp_idx];
+ normalize(genotype_eqtl);
+ if(DprimeR2inMem >= 2) genotypeSink[it->second[p].eqtl_snp_idx] = genotype_eqtl;
+ }
+ vector < float > phenotype_eqtl;
+ vector < float > raw_phenotype_eqtl;
+
+ phenotype_eqtl = phenotype_val[it->second[p].pheno_idx];
+ if (it->second[p].other_conditional_signal_idx.size()){
+ residualizer covariate_engine (sample_count);
+ for (int o = 0 ; o < it->second[p].other_conditional_signal_idx.size(); o++){
+ covariate_engine.push(genotype_val[it->second[p].other_conditional_signal_idx[o]]);
+ }
+ covariate_engine.build();
+ covariate_engine.residualize(phenotype_eqtl);
+ }
+ if (sample_iterations > 0 ) {
+ raw_phenotype_eqtl = phenotype_eqtl;
+
+ }
+ if (options.count("normal")) normalTransform(phenotype_eqtl);
+ normalize(phenotype_eqtl);
+
+
+ for (int s = 0 ; s < genotype_idx_to_test.size() ; s++){
+ vector < float > test;
+ if (genotypeSink.count(genotype_idx_to_test[s])){
+ test = genotypeSink[genotype_idx_to_test[s]];
+ }else{
+ test = genotype_val[genotype_idx_to_test[s]];
+ normalize(test);
+ if(DprimeR2inMem >= 2) genotypeSink[genotype_idx_to_test[s]] = test;
+ }
+ vector <float> new_pheno = correct(test,phenotype_eqtl);
+ if (options.count("normal")) normalTransform(new_pheno);
+ normalize(new_pheno);
+ corrs[s] = abs(getCorrelation(genotype_eqtl, new_pheno));
+ if( genotype_idx_to_test[s] == it->second[p].test_snp_idx) test_snp_corr = corrs[s];
+ }
+ sort(corrs.begin(),corrs.end());
+ int rank = -1;
+ for (int i = 0 ; i<corrs.size() && corrs[i] <= test_snp_corr; i++) if(corrs[i] == test_snp_corr) rank = i;
+ double RTC = ((double) corrs.size() - (double) rank) / (double) corrs.size();
+ string dprime = it->second[p].Dprime != -9 ? stb.str(it->second[p].Dprime) : "NA";
+ string rsquared = it->second[p].R2 != -9 ? stb.str(it->second[p].R2) : "NA";
+ if (sample_iterations >0 ) {
+ sort(genotype_idx_to_test.begin(),genotype_idx_to_test.end());
+ rtc_sample_results res = sampleRTC(genotype_idx_to_test, raw_phenotype_eqtl, it->second[p].eqtl_snp_idx, RTC,pI);
+ fdo << RTC <<" " << dprime << " " << rsquared << " " << res.pval << " " << res.unique_h0 << " " << res.unique_h1 << " " << res.rtc_bin_start << " " << res.rtc_bin_end << " " << res.rtc_bin_h0_proportion << " " << res.rtc_bin_h1_proportion << " " << res.medianR2 << " " << res.median_h0 << " " << res.median_h1 << " " << res.h0 << " " << res.h1 << endl;
+ }else fdo << RTC <<" " << dprime << " " << rsquared << endl;
+ }
+ }
+ }
+ }
+#else
+ unsigned int count_pheno = 1;
+ unsigned int done = 0;
+ for (it = pheno_eqtls.begin(); it != pheno_eqtls.end(); it++){
+ //Deallocate memory
+ if (DprimeR2inMem == 2){
+ unordered_map < int , vector < float > >().swap(genotypeSink);
+ unordered_map < int , unordered_map < int , double > >().swap(RsquareSink);
+ }
+ vrb.title("Processing phenotype [" + it->first + "] " + stb.str(count_pheno) + " / " + stb.str(pheno_eqtls.size()) );
+ vrb.bullet(stb.str(it->second.size()) + " pairwise tests");
+ count_pheno++;
+ for (int p = 0 ; p < it->second.size(); p++){
+ fdo << genotype_id[it->second[p].test_snp_idx] << " " ;
+ fdo << genotype_id[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << phenotype_id[it->second[p].pheno_idx] << " ";
+ fdo << it->first << " ";
+ fdo << genotype_chr[it->second[p].test_snp_idx] << " " ;
+ fdo << genotype_start[it->second[p].test_snp_idx] << " " ;
+ fdo << it->second[p].test_snp_rank << " ";
+ fdo << genotype_chr[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << genotype_start[it->second[p].eqtl_snp_idx] << " " ;
+ fdo << it->second[p].eqtl_snp_rank << " ";
+ fdo << phenotype_chr[it->second[p].pheno_idx] << " ";
+ fdo << phenotype_start[it->second[p].pheno_idx] << " ";
+ fdo << abs(genotype_start[it->second[p].test_snp_idx] - genotype_start[it->second[p].eqtl_snp_idx]) << " ";
+ if (genotype_chr[it->second[p].test_snp_idx] == phenotype_chr[it->second[p].pheno_idx]) fdo << abs(genotype_start[it->second[p].test_snp_idx] - phenotype_start[it->second[p].pheno_idx]) << " ";
+ else fdo << "NA ";
+ fdo << it->second[p].test_snp_coldspot_idx << " " ;
+ fdo << it->second[p].eqtl_snp_coldspot_idx << " " ;
+ vector < int > genotype_idx_to_test;
+ int pI = options.count("debug") ? it->second[p].pheno_idx : -1 ;
+ if (it->second[p].eqtl_snp_coldspot_idx >= 0 && it->second[p].test_snp_coldspot_idx >= 0 && genotype_chr[it->second[p].test_snp_idx] == genotype_chr[it->second[p].eqtl_snp_idx] && (it->second[p].eqtl_snp_coldspot_idx == it->second[p].test_snp_coldspot_idx || it->second[p].Dprime >= Dprime_cutoff)){
+ if (it->second[p].test_snp_idx == it->second[p].eqtl_snp_idx){
+ fdo << all_coldspots[it->second[p].test_snp_coldspot_idx].start << " ";
+ fdo << all_coldspots[it->second[p].test_snp_coldspot_idx].end << " ";
+ if (sample_iterations > 0) fdo << "1 1 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA" << endl;
+ else fdo << "1 1 1" << endl;
+ done++;
+ continue;
+ }
+ int si,ei;
+ if (it->second[p].eqtl_snp_coldspot_idx < it->second[p].test_snp_coldspot_idx){
+ si = it->second[p].eqtl_snp_coldspot_idx;
+ ei = it->second[p].test_snp_coldspot_idx;
+ }else{
+ ei = it->second[p].eqtl_snp_coldspot_idx;
+ si = it->second[p].test_snp_coldspot_idx;
+ }
+ fdo << all_coldspots[si].start << " ";
+ fdo << all_coldspots[ei].end << " ";
+ for (int csi = si ; csi <= ei; csi++){
+ genotype_idx_to_test.insert(genotype_idx_to_test.end(),all_coldspots[csi].coldspot_variant_idx.begin(),all_coldspots[csi].coldspot_variant_idx.end());
+ }
+ }else{
+ string dprime = it->second[p].Dprime != -9 ? stb.str(it->second[p].Dprime) : "NA";
+ string rsquared = it->second[p].R2 != -9 ? stb.str(it->second[p].R2) : "NA";
+ if (sample_iterations > 0){
+ fdo << "NA NA NA " << dprime << " " << rsquared << " NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA" << endl;
+ }else{
+ fdo << "NA NA NA " << dprime << " " << rsquared << endl;
+ }
+ continue;
+ }
+ vrb.bullet("Testing " + genotype_id[it->second[p].test_snp_idx] + " x " + genotype_id[it->second[p].eqtl_snp_idx] + " x " + phenotype_id[it->second[p].pheno_idx] + " #variants = " + stb.str(genotype_idx_to_test.size()));
+ done++;
+ vector < double > corrs(genotype_idx_to_test.size());
+ double test_snp_corr = 0.0 ;
+ vector < float > genotype_eqtl;
+ if (genotypeSink.count(it->second[p].eqtl_snp_idx)){
+ genotype_eqtl = genotypeSink[it->second[p].eqtl_snp_idx];
+ }else{
+ genotype_eqtl = genotype_val[it->second[p].eqtl_snp_idx];
+ normalize(genotype_eqtl);
+ if(DprimeR2inMem >= 2) genotypeSink[it->second[p].eqtl_snp_idx] = genotype_eqtl;
+ }
+ vector < float > phenotype_eqtl;
+ vector < float > raw_phenotype_eqtl;
+
+ phenotype_eqtl = phenotype_val[it->second[p].pheno_idx];
+ if (it->second[p].other_conditional_signal_idx.size()){
+ residualizer covariate_engine (sample_count);
+ for (int o = 0 ; o < it->second[p].other_conditional_signal_idx.size(); o++){
+ covariate_engine.push(genotype_val[it->second[p].other_conditional_signal_idx[o]]);
+ }
+ covariate_engine.build();
+ covariate_engine.residualize(phenotype_eqtl);
+ }
+ if (sample_iterations > 0 ) {
+ raw_phenotype_eqtl = phenotype_eqtl;
+
+ }
+ if (options.count("normal")) normalTransform(phenotype_eqtl);
+ normalize(phenotype_eqtl);
+
+
+ for (int s = 0 ; s < genotype_idx_to_test.size() ; s++){
+ vector < float > test = genotype_val[genotype_idx_to_test[s]];
+ normalize(test);
+ vector <float> new_pheno = correct(test,phenotype_eqtl);
+ if (options.count("normal")) normalTransform(new_pheno);
+ normalize(new_pheno);
+ corrs[s] = abs(getCorrelation(genotype_eqtl, new_pheno));
+ if( genotype_idx_to_test[s] == it->second[p].test_snp_idx) test_snp_corr = corrs[s];
+ }
+ sort(corrs.begin(),corrs.end());
+ int rank = -1;
+ for (int i = 0 ; i<corrs.size() && corrs[i] <= test_snp_corr; i++) if(corrs[i] == test_snp_corr) rank = i;
+ double RTC = ((double) corrs.size() - (double) rank) / (double) corrs.size();
+ string dprime = it->second[p].Dprime != -9 ? stb.str(it->second[p].Dprime) : "NA";
+ string rsquared = it->second[p].R2 != -9 ? stb.str(it->second[p].R2) : "NA";
+ if (sample_iterations >0 ) {
+ rtc_sample_results res = sampleRTC(genotype_idx_to_test, raw_phenotype_eqtl, it->second[p].eqtl_snp_idx, RTC,pI);
+ fdo << RTC <<" " << dprime << " " << rsquared << " " << res.gtoe_h0 << " " << res.gt_h0 << " " << res.unique_h0 << " " << res.count_h0 << " " << res.gtoe_h1 << " " << res.gt_h1 << " " << res.unique_h1 << " " << res.count_h1 << " " << res.rtc_bin_start << " " << res.rtc_bin_end << " " << res.rtc_bin_h0_proportion << " " << res.rtc_bin_h1_proportion << " " << res.medianR2 << " " << res.median_h0 << " " << res.median_h1 << " " << res.h0 << " " << res.h1 << endl;
+ }else fdo << RTC <<" " << dprime << " " << rsquared << endl;
+ }
+ }
+#endif
+ vrb.print("\n\n * " + stb.str(done) + " actual RTCs calculated.");
+ fdo.close();
+}
+
diff --git a/src/mode_rtc/rtc_data.h b/src/mode_rtc/rtc_data.h
new file mode 100644
index 0000000..e9a2141
--- /dev/null
+++ b/src/mode_rtc/rtc_data.h
@@ -0,0 +1,506 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _RTC_DATA_H
+#define _RTC_DATA_H
+
+#define __INTERVAL_CENTRIC_RTC //do interval centric analysis
+
+//ANALYSIS MODES
+#define RTC_MODE1 1
+#define RTC_MODE2 2
+#define RTC_MODE3 3
+#define RTC_MODE4 4
+
+//AGGREGATION MODES
+#define GRP_NONE 0
+#define GRP_BEST 1
+#define GRP_PCA1 2
+#define GRP_MEAN 3
+
+#define __RTC_NA__ (0.0/0.0)
+
+//INCLUDES
+#include "../common/data.h"
+
+
+// Region record: chromosome, start/end coordinates, an index, a type tag and
+// the list of variant indexes attached to the region.
+class coldspot{
+public:
+    string chr;
+    int start;
+    int end;
+    int idx;
+    string type;
+    vector <int> coldspot_variant_idx;
+
+    // Default state: empty chromosome, -1 coordinates/index, type "NA".
+    coldspot() : chr(""), start(-1), end(-1), idx(-1), type("NA") {}
+
+    // c = chromosome, s = start, e = end, i = index, t = type.
+    coldspot(string c, int s, int e, int i,string t) : chr(c), start(s), end(e), idx(i), type(t) {}
+    //~coldspot(){coldspot_variant_idx.clear();}
+
+    // One-line dump of every field followed by the variant indexes.
+    friend ostream& operator<<(ostream& out, const coldspot& p){
+        out << "ID: " << p.idx;
+        out << " CHR: " << p.chr;
+        out << " START: " << p.start;
+        out << " END: " << p.end;
+        out << " TYPE: " << p.type;
+        out << " VARIANTS:";
+        for (vector <int>::const_iterator v = p.coldspot_variant_idx.begin(); v != p.coldspot_variant_idx.end(); ++v) {
+            out << " " << *v;
+        }
+        out << endl;
+        return out;
+    }
+
+    // Byte estimate: the three int coordinates/index plus the capacities of
+    // the two strings and of the variant index vector.
+    long long unsigned int getMemoryUsage(){
+        const size_t scalar_bytes = 3*sizeof(int);
+        const size_t string_bytes = type.capacity()*sizeof(char) + chr.capacity()*sizeof(char);
+        const size_t vector_bytes = coldspot_variant_idx.capacity()*sizeof(int);
+        return scalar_bytes + string_bytes + vector_bytes;
+    }
+};
+
+
+
+// One (test variant, eQTL variant, phenotype) combination queued for an RTC
+// computation, with the coldspot index of each variant, their D' / R2 and any
+// other conditional signals to regress out first.
+class pairsToTestForRTC{
+public:
+    int test_snp_idx;
+    int eqtl_snp_idx;
+    int test_snp_rank;
+    int eqtl_snp_rank;
+    vector < int > other_conditional_signal_idx;
+    int test_snp_coldspot_idx;
+    int eqtl_snp_coldspot_idx;
+    double Dprime;
+    double R2;
+    int pheno_idx;
+
+    // Default state: all indexes -1, ranks 0, Dprime -1.0, R2 0.0.
+    pairsToTestForRTC()
+        : test_snp_idx(-1), eqtl_snp_idx(-1), test_snp_rank(0), eqtl_snp_rank(0),
+          test_snp_coldspot_idx(-1), eqtl_snp_coldspot_idx(-1),
+          Dprime(-1.0), R2(0.0), pheno_idx(-1) {}
+
+    // tsi/esi = test/eQTL variant index, ocsi = other conditional signals (copied),
+    // tsci/esci = test/eQTL coldspot index, dp = D', r2 = R2,
+    // tr/er = test/eQTL rank, pidx = phenotype index.
+    pairsToTestForRTC(int tsi , int esi , vector <int> & ocsi, int tsci, int esci,double dp, double r2,int tr, int er , int pidx)
+        : test_snp_idx(tsi), eqtl_snp_idx(esi), test_snp_rank(tr), eqtl_snp_rank(er),
+          other_conditional_signal_idx(ocsi),
+          test_snp_coldspot_idx(tsci), eqtl_snp_coldspot_idx(esci),
+          Dprime(dp), R2(r2), pheno_idx(pidx) {}
+    //~pairsToTestForRTC(){other_conditional_signal_idx.clear();}
+
+    // One-line dump (ranks and phenotype index are not printed).
+    friend ostream& operator<<(ostream& out, const pairsToTestForRTC& p){
+        out << "TSI: " << p.test_snp_idx;
+        out << " ESI: " << p.eqtl_snp_idx;
+        out << " TSCI: " << p.test_snp_coldspot_idx;
+        out << " ESCI: " << p.eqtl_snp_coldspot_idx;
+        out << " D: " << p.Dprime;
+        out << " R: " << p.R2;
+        out << " O:";
+        for (vector <int>::const_iterator o = p.other_conditional_signal_idx.begin(); o != p.other_conditional_signal_idx.end(); ++o) {
+            out << " " << *o;
+        }
+        out << endl;
+        return out;
+    }
+
+    // Byte estimate: seven ints, two doubles and the capacity of the
+    // conditional-signal vector.
+    long long unsigned int getMemoryUsage(){
+        const size_t scalar_bytes = 7*sizeof(int) + 2*sizeof(double);
+        const size_t vector_bytes = other_conditional_signal_idx.capacity()*sizeof(int);
+        return scalar_bytes + vector_bytes;
+    }
+};
+
+// Result record returned by rtc_data::sampleRTC and written out by
+// calculateRTC when sampling is enabled.
+class rtc_sample_results{
+public:
+    double gtoe_h0,gt_h0,gtoe_h1,gt_h1,count_h0,count_h1,pval;
+    unsigned long int unique_h0,unique_h1,variants;
+    float medianR2, median_h0, median_h1;
+    double rtc_bin_start,rtc_bin_end,rtc_bin_h0_proportion, rtc_bin_h1_proportion;
+    string h0,h1;
+
+    // Defaults: statistics that have not been computed are NaN (__RTC_NA__),
+    // counters are zero, the RTC bin is [0,1] with proportions 1, strings empty.
+    rtc_sample_results(){
+        pval=gtoe_h0=__RTC_NA__;
+        gt_h0=__RTC_NA__;
+        gtoe_h1=__RTC_NA__;
+        gt_h1=__RTC_NA__;
+        count_h0=0.0;
+        count_h1=0.0;
+        unique_h0=0;
+        unique_h1=0;
+        variants=0;
+        median_h0 = median_h1 = medianR2=__RTC_NA__;
+        rtc_bin_start = 0.0;
+        rtc_bin_end=1.0;
+        rtc_bin_h0_proportion=1.0;
+        rtc_bin_h1_proportion=1.0;
+        h1=h0="";
+    }
+
+    // Byte estimate. All three unsigned long members are counted (the earlier
+    // formula counted only one), plus 11 doubles, 3 floats and both string
+    // capacities.
+    long long unsigned int getMemoryUsage(){
+        return 3*sizeof(unsigned long int) + 11*sizeof(double) + 3*sizeof(float) + h0.capacity()*sizeof(char) + h1.capacity()*sizeof(char);
+    }
+};
+
+// State and operations of the RTC analysis mode. Extends the shared `data`
+// base class (declared in ../common/data.h) with genotype/phenotype storage,
+// coldspot bookkeeping, the list of variant pairs to test and small caches
+// used while computing correlations.
+class rtc_data : public data {
+public:
+ //PARAMETERS
+ unsigned int mode;
+ // Values > 0 make readRTCline() match on the group column instead of the phenotype column.
+ unsigned int grp_mode;
+ unsigned int cis_window;
+ // Column indexes into a split QTLtools output line; readRTCline() reports them to the user as index+1.
+ int pvalue_column,variant_column,phenotype_column,rank_column,best_column,coldspot_count,group_column;
+ static const int bin_size = 1000000;
+ double Dprime_cutoff;
+ double R2_cutoff;
+ unsigned long int sample_iterations,max_sample_iterations;
+ // Identifiers/regions collected while reading; readRTCline() fills unfound_ids and unfound_phenotypes.
+ set <string> unphased_regions,unfound_regions,no_variance_regions,unmatched_alleles,unfound_ids,unfound_phenotypes;
+ // Caching level: getRsquare() stores normalized genotypes and r2 values only when this is >= 2.
+ unsigned int DprimeR2inMem;
+ static const int DprimePrintFreq = 10;
+ // readRTCline() warns when a line has more columns than this and --conditional was not given.
+ static const int normal_output_columns = 18;
+ bool calculate_Dprime_R2;
+
+ //ADDITIONAL VCF
+ filter stats_vcf_sample_filter;
+ string stats_vcf_file;
+ vector <int> stats_mappingS;
+ int stats_n_includedS;
+
+ //REGIONS
+ genomic_region regionPhenotype;
+ genomic_region regionGenotype;
+
+ //GENOTYPES
+ int genotype_count; //variant site number
+ vector < vector < float > > genotype_val; //variant site genotype dosages
+ vector < string > genotype_chr; //variant site chromosome
+ vector < string > genotype_id; //variant site IDs
+ vector < int > genotype_start; //variant site start positions
+ vector < int > genotype_end; //variant site end positions
+ unordered_map < string, int > genotype_id_to_idx;
+ vector < string > genotype_alleles;
+
+ //PHENOTYPES
+ int phenotype_count; //phenotype number
+ vector < vector < float > > phenotype_val; //phenotype values
+ vector < string > phenotype_id; //phenotype ids
+ vector < string > phenotype_grp; //phenotype groups
+ vector < string > phenotype_chr; //phenotype chromosomes
+ vector < int > phenotype_start; //phenotype start positions
+ vector < int > phenotype_end; //phenotype end positions
+ unordered_map < string, int > phenotype_id_to_idx;
+
+ //PHENOTYPE GROUPS
+ vector < vector < unsigned int > > group_idx; //group index to phenotype indexes
+ vector < double > group_var; //group variance explained by PC1
+ vector < int > group_size; //number of phenotypes in group
+ map < string, unsigned int > group_id;
+
+ //COVARIATES & INTERACTION
+ int covariate_count; //covariate number
+ vector < vector < string > > covariate_val; //covariate values
+ vector < string > covariate_id; //covariate ids
+
+ //RTC
+ map < string, map < int, vector <coldspot *> > > coldspot_bins_p;
+ vector < coldspot > all_coldspots;
+ //map < string, vector < int > > coldspot_end_idx;
+ // Pairs to test, keyed by a string (calculateRTC prints the key as the phenotype/group label).
+ map < string ,vector < pairsToTestForRTC > > pheno_eqtls;
+ unordered_map < string , unordered_map< string, vector <double> > > DprimeRsquareSink;
+ //unordered_map < string , vector < float > > phenotypeSink; //UNUSED
+ //unordered_map < string , vector < float > > phenotypeSinkRaw; //UNUSED
+ // Cache of normalized genotype vectors keyed by variant index (filled when DprimeR2inMem >= 2).
+ unordered_map < int , vector < float > > genotypeSink;
+ //unordered_map < int , unordered_map < int, float > > RsquareSink;
+ // Flat cache of pairwise r2 values indexed by getRsquare(int,int,int,int); NaN marks "not computed yet".
+ vector < float> RsquareSink;
+
+
+ //CONSTRUCTOR / DESTRUCTOR
+ rtc_data();
+ ~rtc_data();
+ void clear();
+
+ //DATA REGION
+ bool setPhenotypeRegion(string);
+ void setPhenotypeRegion(int, int);
+ void deduceGenotypeRegion(int);
+
+ //READ DATA
+ void readGenotypes(string);
+ void readGenotypesVCF(string);
+ void readGenotypesBED(string);
+ void scanGenotypes(string);
+ void scanGenotypesVCF(string);
+ void scanGenotypesBED(string);
+ int readGenotypesVCFStats(string, string &, vector <string> &);
+ void readPhenotypes(string);
+ void scanPhenotypes(string);
+ void readCovariates(string);
+ void readSampleInclusionStats(string);
+ void readSampleExclusionStats(string);
+ void copyIncludeExclude();
+ void setStatsVCF(string);
+ void checkStatsVCF();
+ void createTransLists();
+ void collapsePhenotypes();
+ // Three line parsers, defined inline at the bottom of this header.
+ // NOTE(review): "buufer" below is a typo for "buffer"; harmless in a declaration.
+ bool readRTCline(const string &buufer, set < int > &their);
+ bool readRTCline(const string &buffer, string &pheno, string &snp, string &best, string &group, int &rank);
+ bool readRTCline(const string &buffer, string &pheno, string &snp, string &group,unsigned int &line_count);
+
+ //GENOTYPE & PHENOTYPE MANAGEMENT
+ void clusterizePhenotypes(int);
+ void imputeGenotypes();
+ void imputePhenotypes();
+ void residualizePhenotypes();
+ void normalTransformPhenotypes();
+ void normalTransform(vector < float > &);
+ void normalize(vector < float > &);
+ void normalize(vector < vector < float > > &);
+ long long unsigned int getMemoryUsage();
+
+ //COMPUTATION METHODS [ALL INLINES FOR SPEED]
+ double getCorrelation(vector < float > &, vector < float > &);
+ double getCorrelation(vector < float > &, vector < float > &, int);
+ double getPvalue(double, double);
+ double getPvalue(double, vector < double > &);
+ double getSlope(double, double, double);
+ void regression(vector < float > & X, vector < float > & Y, double & slope);
+ float median(vector < float > &);
+
+ //ANALYSIS
+ void readHotspots(string);
+ int getColdspot(string, int);
+ void mapVariantsToColdspots();
+ void calculateRTC(string);
+ rtc_sample_results sampleRTC(vector <int> &, vector <float> &, int, double,int pI = -1);
+ void generatePhenotype (double,int, vector <float> &);
+ void generatePhenotype(vector<float> &, linReg &, vector <float>&);
+ void mergeqtl_cis_conditional(string, string);
+ void mergeqtl_cis(string, string);
+ void mergeqtl_trans_conditional(string, string);
+ void mergeqtl_trans(string, string);
+ void gwas_cis_conditional(string, string);
+ void gwas_cis(string, string);
+ void gwas_trans_conditional(string, string);
+ void gwas_trans(string, string);
+ vector <double> getDprimeRsquare(string,int,string,int, string al1 = "" , string al2 = "", int idx1 = -1 , int idx2 = -1);
+ double getRsquare(int,int,int,int);
+ double getRsquare(int,int);
+ void printPTTFR();
+ vector < float > correct( vector < float> , vector <float> );
+ int getBestVariant(vector <int> &, int, double &);
+ int bcf_all_phased(const bcf_hdr_t *, bcf1_t *);
+ void probability(vector < float > &, vector < float > & , double , rtc_sample_results &);
+
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS ************************//
+//***************************************************************//
+// Declaration only; the definition is not in this header.
+// NOTE(review): presumably the entry point of the rtc mode taking its command-line arguments - confirm at the definition.
+void rtc_main(vector < string > &);
+
+//***************************************************************//
+//******************** INLINE FUNCTIONS *************************//
+//***************************************************************//
+
+// Dot product of the first sample_count entries of vec1 and vec2.
+// NOTE(review): callers pass vectors that went through normalize(); presumably
+// that scaling makes this value the Pearson correlation - confirm in normalize().
+// The sum is unrolled by four into separate accumulators; leftover entries
+// (sample_count % 4) are added to the first accumulator, highest offset first.
+inline double rtc_data::getCorrelation(vector < float > & vec1, vector < float > & vec2) {
+    double acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+    int pos = 0;
+
+    for (int blocks = (sample_count / 4); blocks-- != 0; pos += 4) {
+        acc0 += vec1[pos] * vec2[pos];
+        acc1 += vec1[pos+1] * vec2[pos+1];
+        acc2 += vec1[pos+2] * vec2[pos+2];
+        acc3 += vec1[pos+3] * vec2[pos+3];
+    }
+
+    const int tail = (sample_count % 4);
+    for (int off = tail - 1; off >= 0; --off) {
+        acc0 += vec1[pos+off] * vec2[pos+off];
+    }
+
+    return acc0 + acc1 + acc2 + acc3;
+}
+
+// Same dot product as the two-argument overload, but over the first `sc`
+// entries instead of sample_count. Unrolled by four; the sc % 4 leftover
+// entries go into the first accumulator, highest offset first.
+inline double rtc_data::getCorrelation(vector < float > & vec1, vector < float > & vec2, int sc) {
+    double acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+    int pos = 0;
+
+    for (int blocks = (sc / 4); blocks-- != 0; pos += 4) {
+        acc0 += vec1[pos] * vec2[pos];
+        acc1 += vec1[pos+1] * vec2[pos+1];
+        acc2 += vec1[pos+2] * vec2[pos+2];
+        acc3 += vec1[pos+3] * vec2[pos+3];
+    }
+
+    const int tail = (sc % 4);
+    for (int off = tail - 1; off >= 0; --off) {
+        acc0 += vec1[pos+off] * vec2[pos+off];
+    }
+
+    return acc0 + acc1 + acc2 + acc3;
+}
+
+// Converts a correlation and its degrees of freedom into a p-value by forming
+// df * r^2 / (1 - r^2) and passing it to pf() with (1, df) degrees of freedom.
+// NOTE(review): pf looks like the Rmath F distribution function with
+// lower_tail = 0 and log_p = 0 - confirm against the linked math library.
+inline double rtc_data::getPvalue(double corr, double df) {
+    const double statistic = df * corr * corr / (1 - corr * corr);
+    return pf(statistic, 1, df, 0, 0);
+}
+
+// Empirical p-value: (hits + 1) / (N + 1), where hits is the number of entries
+// of pcorr whose absolute value is at least |ncorr| and N = pcorr.size().
+inline double rtc_data::getPvalue(double ncorr, vector < double > & pcorr) {
+    const double threshold = abs(ncorr);
+    unsigned int n_hits = 0;
+    for (vector < double >::const_iterator c = pcorr.begin(); c != pcorr.end(); ++c) {
+        if (abs(*c) >= threshold) n_hits++;
+    }
+    return ((n_hits + 1) * 1.0 / (pcorr.size() + 1.0));
+}
+
+// Regression slope from a correlation and the two standard deviations:
+// nominal_correlation * psd / gsd. Returns 0 when either deviation is below 1e-16.
+inline double rtc_data::getSlope(double nominal_correlation, double gsd, double psd) {
+    const bool degenerate = (gsd < 1e-16 || psd < 1e-16);
+    return degenerate ? 0.0 : nominal_correlation * psd / gsd;
+}
+
+// Writes into `slope` the slope obtained from getSlope() for Y against X.
+// Works on private copies of X and Y: their standard deviations are taken
+// first, then both copies are normalized and correlated.
+inline void rtc_data::regression(vector < float > & X, vector < float > & Y, double & slope) {
+    vector < float > x_copy = X;
+    vector < float > y_copy = Y;
+    // Standard deviations must be read before normalize() rescales the copies.
+    const double x_sd = basic_stats(x_copy).sd();
+    const double y_sd = basic_stats(y_copy).sd();
+    normalize(x_copy);
+    normalize(y_copy);
+    //pvalue = getPvalue(correlation, sample_count - 2);
+    slope = getSlope(getCorrelation(x_copy, y_copy), x_sd, y_sd);
+}
+
+// Squared correlation between genotype rows i and j. Normalized rows are taken
+// from genotypeSink when present; otherwise they are built from genotype_val
+// and stored back only when DprimeR2inMem >= 2.
+inline double rtc_data::getRsquare(int i, int j){
+    vector < float > row_i;
+    unordered_map < int , vector < float > >::iterator hit = genotypeSink.find(i);
+    if (hit != genotypeSink.end()){
+        row_i = hit->second;
+    }else{
+        row_i = genotype_val[i];
+        normalize(row_i);
+        if (DprimeR2inMem >=2) genotypeSink[i] = row_i;
+    }
+
+    // Looked up after the possible insertion above, so i == j reuses the cached row.
+    vector < float > row_j;
+    hit = genotypeSink.find(j);
+    if (hit != genotypeSink.end()){
+        row_j = hit->second;
+    }else{
+        row_j = genotype_val[j];
+        normalize(row_j);
+        if (DprimeR2inMem >=2) genotypeSink[j] = row_j;
+    }
+
+    const double r = getCorrelation(row_i , row_j ,row_i.size());
+    return r * r;
+}
+
+// Squared correlation between genotype rows i and j, with an optional flat
+// cache (RsquareSink) over a window of `size` variants beginning at `start`.
+//
+// The unordered pair is mapped to a slot of the strictly-upper-triangular
+// matrix stored row by row: with 1-based window positions first < second,
+//   slot = size*(size-1)/2 - (size-first)*(size-first+1)/2 + second - first - 1.
+// The cache is allocated lazily (NaN-filled) and only when DprimeR2inMem >= 2.
+//
+// Differences from the earlier version: the slot arithmetic is done in
+// long long, so windows of roughly 46k variants or more no longer overflow
+// int, and the cache write is bounds-checked in the same way as the cache
+// read, so a slot outside the allocated cache is skipped rather than written
+// out of range.
+inline double rtc_data::getRsquare(int i, int j,int start, int size){
+    if (i==j) return 1.0;
+
+    // 1-based positions inside the window, smaller one first.
+    const long long first  = (long long) (i > j ? j : i) - start + 1;
+    const long long second = (long long) (i > j ? i : j) - start + 1;
+    const long long n = size;
+    const long long total = n * (n - 1) / 2;
+    if (!RsquareSink.size() && DprimeR2inMem >=2) RsquareSink = vector <float> ((size_t) total, __RTC_NA__ );
+    const long long diff = (n - first) * (n - first + 1) / 2;
+    const long long index = total - diff + second - first - 1;
+
+    // True only when the slot really exists in the currently allocated cache.
+    const bool slot_exists = index >= 0 && index < (long long) RsquareSink.size();
+    if (slot_exists && !isnan(RsquareSink[index])) return RsquareSink[index];
+
+    vector < float > v1;
+    if (genotypeSink.count(i)){
+        v1 = genotypeSink[i];
+    }else{
+        v1 = genotype_val[i];
+        normalize(v1);
+        if (DprimeR2inMem >=2) genotypeSink[i] = v1;
+    }
+    vector < float > v2;
+    if (genotypeSink.count(j)){
+        v2 = genotypeSink[j];
+    }else{
+        v2 = genotype_val[j];
+        normalize(v2);
+        if (DprimeR2inMem >=2) genotypeSink[j] = v2;
+    }
+    double r = getCorrelation(v1 , v2 ,v1.size());
+    if (DprimeR2inMem >=2 && slot_exists) RsquareSink[index] = r * r;
+    return r * r;
+}
+
+// Among the variants listed in genotype_idx, returns the index of the one whose
+// normalized genotype has the largest absolute correlation with the phenotype
+// (rank-normal transformed first when the "normal" option is set). Returns -1
+// when no variant has a correlation strictly above zero. `pval` receives
+// getPvalue(best |r|, sample_count - 2).
+inline int rtc_data::getBestVariant(vector <int> &genotype_idx, int phenotype_idx, double &pval){
+    vector < float > target = phenotype_val[phenotype_idx];
+    if (options.count("normal")) normalTransform(target);
+    normalize(target);
+
+    int winner = -1;
+    double strongest = 0.0;
+    for (vector <int>::const_iterator g = genotype_idx.begin(); g != genotype_idx.end(); ++g){
+        vector < float > candidate = genotype_val[*g];
+        normalize(candidate);
+        const double strength = abs(getCorrelation(candidate,target));
+        if (!(strength > strongest)) continue;
+        strongest = strength;
+        winner = *g;
+    }
+    pval = getPvalue(strongest,sample_count-2);
+    return winner;
+}
+
+// Median of `scores`; NaN (__RTC_NA__) for an empty vector.
+// Side effect: `scores` is sorted in place.
+inline float rtc_data::median(vector <float> &scores){
+    const size_t n = scores.size();
+    if (n == 0) return __RTC_NA__;
+    sort(scores.begin(), scores.end());
+    const size_t mid = n / 2;
+    // Odd count: the middle element; even count: mean of the two middle elements.
+    if (n % 2 != 0) return scores[mid];
+    return (scores[mid - 1] + scores[mid]) / 2;
+}
+
+// Parses one line of a variant list: the first token is a variant ID.
+// Returns true and inserts the variant's genotype index into `their` when the
+// ID is known; otherwise records the ID in unfound_ids and returns false.
+// A line that yields no token at all is rejected with false (the earlier
+// version indexed the empty token list).
+inline bool rtc_data::readRTCline(const string &buffer, set < int > &their){
+    vector < string > str;
+    stb.split(buffer, str);
+    if (str.empty()) return false;
+    const string &snp = str[0];
+    if (!genotype_id_to_idx.count(snp) ) {
+        unfound_ids.insert(snp);
+        return false;
+    }
+    their.insert(genotype_id_to_idx[snp]);
+    return true;
+}
+
+// Parses one line of conditional QTLtools output.
+//
+// Two layouts are accepted:
+//  * "__UNION__ <pheno> <snp> <rank> ..." lines: the group is set to the
+//    phenotype and `best` is forced to "1"; lines whose variant starts with
+//    "__UNION_FILLER_" are skipped (return false);
+//  * regular lines, where the fields are taken from the configured
+//    phenotype/group/variant/best/rank columns.
+//
+// Returns true only when the line is flagged best == "1" and both the variant
+// and the phenotype (or the group when grp_mode > 0) are known. Unknown
+// identifiers are recorded in unfound_ids / unfound_phenotypes.
+// NOTE(review): the code relies on vrb.error() not returning - confirm.
+//
+// Changes from the earlier version: the rank-column error now reports
+// rank_column (it printed pvalue_column), and the duplicated size check was
+// removed.
+inline bool rtc_data::readRTCline(const string &buffer, string &pheno, string &snp, string &best, string &group, int &rank){
+    vector < string > str;
+    stb.split(buffer, str);
+    if (str.size() < 4) vrb.error("Wrong QTLtools output file format");
+    if (str[0] == "__UNION__"){
+        if (str[2].substr(0,15) == "__UNION_FILLER_") return false;
+        pheno = str[1];
+        snp = str[2];
+        best = "1";
+        rank = atoi(str[3].c_str());
+        group = pheno;
+    }else{
+        // A negative column converts to a huge unsigned value here and is therefore reported too.
+        if (rank_column >= str.size()) vrb.error("rank column = " + stb.str(rank_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (variant_column >= str.size()) vrb.error("variant column = " + stb.str(variant_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (phenotype_column >= str.size()) vrb.error("phenotype column = " + stb.str(phenotype_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (best_column >= str.size()) vrb.error("best column = " + stb.str(best_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (group_column >= str.size()) vrb.error("group column = " + stb.str(group_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        pheno = str[phenotype_column];
+        group = str[group_column];
+        snp = str[variant_column];
+        best = str[best_column];
+        rank = atoi(str[rank_column].c_str());
+    }
+    // With grouping enabled the group name is what must be known, otherwise the phenotype name.
+    string test = grp_mode > 0 ? group : pheno;
+    //cerr << pheno << " " << snp << " " << group << " " << test << " " << group_id.count(test) << endl;
+    if(best != "1"){
+        return false;
+    }else if (!genotype_id_to_idx.count(snp)){
+        unfound_ids.insert(snp);
+        if (!phenotype_id_to_idx.count(test) && !group_id.count(test)) unfound_phenotypes.insert(test);
+        return false;
+    }else if (!phenotype_id_to_idx.count(test) && !group_id.count(test)){
+        unfound_phenotypes.insert(test);
+        return false;
+    }
+    return true;
+}
+
+// Parses one line of (non-conditional) QTLtools output.
+//
+// "__UNION__ <pheno> <snp> ..." lines take the phenotype and variant from
+// tokens 1 and 2 (group = phenotype) and are skipped when the variant starts
+// with "__UNION_FILLER_". Regular lines use the configured phenotype, variant
+// and group columns; on the first regular line a warning is issued when the
+// column count suggests conditional output but --conditional was not given.
+//
+// line_count is incremented for every line that gets past the layout checks.
+// Returns true when both the variant and the phenotype (or the group when
+// grp_mode > 0) are known; unknown identifiers are recorded in unfound_ids /
+// unfound_phenotypes.
+//
+// Change from the earlier version: the token count is validated before
+// str[0], str[1] and str[2] are read.
+// NOTE(review): the code relies on vrb.error() not returning - confirm.
+inline bool rtc_data::readRTCline(const string &buffer, string &pheno, string &snp, string &group,unsigned int &line_count){
+    vector < string > str;
+    stb.split(buffer, str);
+    if (str.empty()) vrb.error("Wrong QTLtools output file format");
+    if (str[0] == "__UNION__"){
+        // The union layout needs at least the marker, the phenotype and the variant.
+        if (str.size() < 3) vrb.error("Wrong QTLtools output file format");
+        if (str[2].substr(0,15) == "__UNION_FILLER_") return false;
+        pheno = str[1];
+        snp = str[2];
+        group = pheno;
+    }else{
+        if (!line_count && str.size() > normal_output_columns && !options.count("conditional")) vrb.warning("Looks like a conditional QTLtools output yet no --conditional provided, is this desired?");
+        if (variant_column >= str.size()) vrb.error("variant column = " + stb.str(variant_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (phenotype_column >= str.size()) vrb.error("phenotype column = " + stb.str(phenotype_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        if (group_column >= str.size()) vrb.error("group column = " + stb.str(group_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+        pheno = str[phenotype_column];
+        snp = str[variant_column];
+        group = str[group_column];
+    }
+    line_count++;
+    // With grouping enabled the group name is what must be known, otherwise the phenotype name.
+    string test = grp_mode > 0 ? group : pheno;
+    if (!genotype_id_to_idx.count(snp)){
+        unfound_ids.insert(snp);
+        if (!phenotype_id_to_idx.count(test) && !group_id.count(test)) unfound_phenotypes.insert(test);
+        return false;
+    }else if (!phenotype_id_to_idx.count(test) && !group_id.count(test)){
+        unfound_phenotypes.insert(test);
+        return false;
+    }
+    return true;
+}
+
+#endif
diff --git a/src/mode_rtc/rtc_gwas_cis.cpp b/src/mode_rtc/rtc_gwas_cis.cpp
new file mode 100644
index 0000000..6c04967
--- /dev/null
+++ b/src/mode_rtc/rtc_gwas_cis.cpp
@@ -0,0 +1,137 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+void rtc_data::gwas_cis_conditional(string frtc1, string frtc2){
+    // Builds the list of GWAS-variant / conditional cis-QTL pairs to test: frtc1 is read as the GWAS variant list,
+    // frtc2 as conditional cis QTLtools output. Each QTL variant is paired with every GWAS variant on the same
+    // chromosome lying within cis_window of the phenotype start; pairs are appended to pheno_eqtls per group id.
+    string buffer;
+    set < int > theirs;
+    unsigned int count = 0;
+    vrb.title("Reading list of GWAS variants [" + frtc1 + "]");
+    input_file fd (frtc1);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    while(getline(fd, buffer)) readRTCline(buffer,theirs);
+    fd.close();
+    vector <int> their(theirs.begin(),theirs.end());
+    vrb.title("Reading conditional cis QTLtools output in [" + frtc2 + "]");
+    input_file fd2 (frtc2);
+    if (fd2.fail()) vrb.error("Cannot open file!");
+    map < string , vector < int > > file2,pheno2; // per group id: QTL variant indexes and, in parallel, phenotype indexes
+    map < string , map < int , int > > rank2; // per group id: variant index -> rank handed back by readRTCline
+    string pheno,snp,best,group;
+    int rank = 0;
+    while(getline(fd2, buffer)) {
+        if(!readRTCline(buffer,pheno,snp,best,group,rank)) continue;
+        const int snp_idx = genotype_id_to_idx[snp];
+        file2[group].push_back(snp_idx);
+        rank2[group][snp_idx] = rank;
+        pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+    }
+    fd2.close();
+    vrb.bullet(stb.str(file2.size()) + " phenotypes with eQTLs");
+    vrb.title("Merging and calculating D' and R2");
+    unsigned int event_count=0,pairs_tested=0;
+    for (map < string , vector < int > >::iterator it = file2.begin(); it != file2.end(); ++it){
+        event_count++;
+        if(event_count % DprimePrintFreq == 0) {
+            vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+            pairs_tested =0;
+        }
+        const string &grp = it->first; // bound by reference: no copy of the id or of the two vectors per group
+        const vector <int > &our = it->second;
+        const vector <int > &ourpi = pheno2[grp];
+        int phst = grp_mode > 0 ? phenotype_start[group_idx[group_id[grp]][0]] : phenotype_start[phenotype_id_to_idx[grp]];
+        for (size_t o = 0 ; o < our.size(); o++){
+            vector <int> others; // every other variant listed for the same group
+            for (size_t oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+            int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+            for (size_t t = 0 ;t < their.size(); t++){
+                // keep GWAS variants on the QTL variant's chromosome that lie within cis_window of the phenotype start
+                if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - phst) <= cis_window ){
+                    int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+                    vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]);
+                    pheno_eqtls[grp].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,rank2[grp][our[o]],ourpi[o]));
+                    count++;
+                    pairs_tested++;
+                }
+            }
+        }
+    }
+    vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+    if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
+
+
+void rtc_data::gwas_cis(string frtc1, string frtc2){
+    // Builds the list of GWAS-variant / cis-QTL pairs to test: frtc1 is read as the GWAS variant list, frtc2 as
+    // cis QTLtools output. Each QTL variant is paired with every GWAS variant on the same chromosome lying
+    // within cis_window of the phenotype start; pairs are appended to pheno_eqtls per group id.
+    string buffer;
+    set < int > theirs;
+    unsigned int count = 0;
+    vrb.title("Reading list of GWAS variants [" + frtc1 + "]");
+    input_file fd (frtc1);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    while(getline(fd, buffer)) readRTCline(buffer,theirs);
+    fd.close();
+    vector <int> their(theirs.begin(),theirs.end());
+    vrb.title("Reading cis QTLtools output in [" + frtc2 + "]");
+    input_file fd2 (frtc2);
+    if (fd2.fail()) vrb.error("Cannot open file!");
+    map < string , vector < int > > file2,pheno2; // per group id: QTL variant indexes and, in parallel, phenotype indexes
+    unsigned int line_count = 0;
+    string pheno,snp,group;
+    // keep only the lines whose variant and phenotype/group ids are both known (readRTCline returns false otherwise)
+    while(getline(fd2, buffer)) {
+        if(!readRTCline(buffer,pheno,snp,group,line_count)) continue;
+        file2[group].push_back(genotype_id_to_idx[snp]);
+        pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+    }
+    fd2.close();
+    vrb.bullet(stb.str(file2.size()) + " phenotypes with eQTLs");
+    vrb.title("Merging and calculating D' and R2");
+    unsigned int event_count=0,pairs_tested=0;
+    for (map < string , vector < int > >::iterator it = file2.begin(); it != file2.end(); ++it){
+        event_count++;
+        if(event_count % DprimePrintFreq == 0) {
+            vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+            pairs_tested =0;
+        }
+        const string &grp = it->first; // bound by reference: no copy of the id or of the two vectors per group
+        const vector <int > &our = it->second;
+        const vector <int > &ourpi = pheno2[grp];
+        int phst = grp_mode > 0 ? phenotype_start[group_idx[group_id[grp]][0]] : phenotype_start[phenotype_id_to_idx[grp]];
+        for (size_t o = 0 ; o < our.size(); o++){
+            vector <int> others; // every other variant listed for the same group
+            for (size_t oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+            int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+            for (size_t t = 0 ;t < their.size(); t++){
+                // keep GWAS variants on the QTL variant's chromosome that lie within cis_window of the phenotype start
+                if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - phst) <= cis_window ){
+                    int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+                    vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]);
+                    pheno_eqtls[grp].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,0,ourpi[o]));
+                    count++;
+                    pairs_tested++;
+                }
+            }
+        }
+    }
+    vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+    if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
diff --git a/src/mode_rtc/rtc_gwas_trans.cpp b/src/mode_rtc/rtc_gwas_trans.cpp
new file mode 100644
index 0000000..d26a3ce
--- /dev/null
+++ b/src/mode_rtc/rtc_gwas_trans.cpp
@@ -0,0 +1,135 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+void rtc_data::gwas_trans_conditional(string frtc1, string frtc2){
+    // Builds the list of GWAS-variant / conditional trans-QTL pairs to test: frtc1 is read as the GWAS variant
+    // list, frtc2 as conditional trans output. Each QTL variant is paired with every GWAS variant on the same
+    // chromosome lying within cis_window of the QTL variant itself; pairs are appended to pheno_eqtls per group id.
+    string buffer;
+    set < int > theirs;
+    unsigned int count = 0;
+    vrb.title("Reading list of GWAS variants [" + frtc1 + "]");
+    input_file fd (frtc1);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    while(getline(fd, buffer)) readRTCline(buffer,theirs);
+    fd.close();
+    vector <int> their(theirs.begin(),theirs.end());
+    vrb.title("Reading conditional trans fastQTL output in [" + frtc2 + "]");
+    input_file fd2 (frtc2);
+    if (fd2.fail()) vrb.error("Cannot open file!");
+    map < string , vector < int > > file2,pheno2; // per group id: QTL variant indexes and, in parallel, phenotype indexes
+    map < string , map < int , int > > rank2; // per group id: variant index -> rank handed back by readRTCline
+    string pheno,snp,best,group;
+    int rank = 0;
+    while(getline(fd2, buffer)) {
+        if(!readRTCline(buffer,pheno,snp,best,group,rank)) continue;
+        const int snp_idx = genotype_id_to_idx[snp];
+        file2[group].push_back(snp_idx);
+        rank2[group][snp_idx] = rank;
+        pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+    }
+    fd2.close();
+    vrb.bullet(stb.str(file2.size()) + " phenotypes with eQTLs");
+    vrb.title("Merging and calculating D' and R2");
+    unsigned int event_count=0,pairs_tested=0;
+    for (map < string , vector < int > >::iterator it = file2.begin(); it != file2.end(); ++it){
+        event_count++;
+        if(event_count % DprimePrintFreq == 0) {
+            vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+            pairs_tested =0;
+        }
+        const string &grp = it->first; // bound by reference: no copy of the id or of the two vectors per group
+        const vector <int > &our = it->second;
+        const vector <int > &ourpi = pheno2[grp];
+        for (size_t o = 0 ; o < our.size(); o++){
+            vector <int> others; // every other variant listed for the same group
+            for (size_t oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+            int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+            for (size_t t = 0 ;t < their.size(); t++){
+                // keep GWAS variants on the QTL variant's chromosome that lie within cis_window of the QTL variant
+                if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - genotype_start[our[o]]) <= cis_window ){
+                    int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+                    vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]);
+                    pheno_eqtls[grp].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,rank2[grp][our[o]],ourpi[o]));
+                    count++;
+                    pairs_tested++;
+                }
+            }
+        }
+    }
+    vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+    if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
+
+
+void rtc_data::gwas_trans(string frtc1, string frtc2){
+    // Builds the list of GWAS-variant / trans-QTL pairs to test: frtc1 is read as the GWAS variant list, frtc2 as
+    // trans QTLtools output. Each QTL variant is paired with every GWAS variant on the same chromosome lying
+    // within cis_window of the QTL variant itself; pairs are appended to pheno_eqtls per group id.
+    string buffer;
+    set < int > theirs;
+    unsigned int count = 0;
+    vrb.title("Reading list of GWAS variants [" + frtc1 + "]");
+    input_file fd (frtc1);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    while(getline(fd, buffer)) readRTCline(buffer,theirs);
+    fd.close();
+    vector <int> their(theirs.begin(),theirs.end());
+    vrb.title("Reading trans QTLtools output in [" + frtc2 + "]");
+    input_file fd2 (frtc2);
+    if (fd2.fail()) vrb.error("Cannot open file!");
+    map < string , vector < int > > file2,pheno2; // per group id: QTL variant indexes and, in parallel, phenotype indexes
+    unsigned int line_count=0;
+    string pheno,snp,group;
+    // keep only the lines whose variant and phenotype/group ids are both known (readRTCline returns false otherwise)
+    while(getline(fd2, buffer)) {
+        if(!readRTCline(buffer,pheno,snp,group,line_count)) continue;
+        file2[group].push_back(genotype_id_to_idx[snp]);
+        pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+    }
+    fd2.close();
+    vrb.bullet(stb.str(file2.size()) + " phenotypes with eQTLs");
+    vrb.title("Merging and calculating D' and R2");
+    unsigned int event_count=0,pairs_tested=0;
+    for (map < string , vector < int > >::iterator it = file2.begin(); it != file2.end(); ++it){
+        event_count++;
+        if(event_count % DprimePrintFreq == 0) {
+            vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+            pairs_tested =0;
+        }
+        const string &grp = it->first; // bound by reference: no copy of the id or of the two vectors per group
+        const vector <int > &our = it->second;
+        const vector <int > &ourpi = pheno2[grp];
+        for (size_t o = 0 ; o < our.size(); o++){
+            vector <int> others; // every other variant listed for the same group
+            for (size_t oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+            int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+            for (size_t t = 0 ;t < their.size(); t++){
+                // keep GWAS variants on the QTL variant's chromosome that lie within cis_window of the QTL variant
+                if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - genotype_start[our[o]]) <= cis_window ){
+                    int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+                    vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]);
+                    pheno_eqtls[grp].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,0,ourpi[o]));
+                    count++;
+                    pairs_tested++;
+                }
+            }
+        }
+    }
+    vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+    if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
diff --git a/src/mode_rtc/rtc_initilization.cpp b/src/mode_rtc/rtc_initilization.cpp
new file mode 100644
index 0000000..0908787
--- /dev/null
+++ b/src/mode_rtc/rtc_initilization.cpp
@@ -0,0 +1,80 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+rtc_data::rtc_data() { // Assigns built-in defaults; rtc_main later overwrites the window, column indexes, thresholds, sample_iterations and DprimeR2inMem from the parsed options.
+ cis_window = 0.0;
+ genotype_count = 0;
+ phenotype_count = 0;
+ covariate_count = 0;
+ pvalue_column = 17;
+ variant_column = 6; // variant/phenotype/group/rank/best columns are 0-based indexes (rtc_main stores the 1-based option value minus 1)
+ phenotype_column = 0;
+ group_column=0;
+ rank_column = 10;
+ best_column = 17;
+ coldspot_count = 0;
+ Dprime_cutoff = 0.4; // NOTE(review): differs from the --D-prime-threshold option default (2, displayed as "OFF") that rtc_main assigns
+ R2_cutoff = 0.5;
+ stats_n_includedS = 0;
+ sample_iterations = 0;
+ DprimeR2inMem = 0;
+ calculate_Dprime_R2 = true; // rtc_main sets this to false when Dprime_cutoff is greater than 1
+}
+
+void rtc_data::clear() { // Empties the loaded sample/genotype/phenotype/covariate data, coldspots, candidate pairs and id lookup maps and zeroes their counters; column indexes and thresholds are not modified here.
+ sample_count = 0;
+ sample_id.clear();
+ genotype_count = 0;
+ genotype_val.clear();
+ genotype_chr.clear();
+ genotype_id.clear();
+ genotype_start.clear();
+ genotype_end.clear();
+ phenotype_count = 0;
+ phenotype_val.clear();
+ phenotype_id.clear();
+ phenotype_chr.clear();
+ phenotype_start.clear();
+ phenotype_end.clear();
+ phenotype_grp.clear();
+ covariate_count = 0;
+ covariate_val.clear();
+ covariate_id.clear();
+ //coldspot_end_idx.clear();
+ pheno_eqtls.clear(); // candidate pairs collected by e.g. gwas_cis / gwas_trans
+ //for (int i = 0 ; i < all_coldspots_p.size(); i++) delete all_coldspots_p[i];
+ all_coldspots.clear(); // NOTE(review): the delete loop above is disabled; ownership of the coldspot objects is not visible from here
+ coldspot_bins_p.clear();
+ genotype_id_to_idx.clear();
+ phenotype_id_to_idx.clear();
+ coldspot_count = 0;
+}
+
+
+rtc_data::~rtc_data() { // Destructor: all cleanup is delegated to clear().
+ clear();
+}
+
+void rtc_data::residualizePhenotypes() { // Feeds every covariate to a residualizer, then residualizes each phenotype vector in place.
+    vrb.title("Residualize phenotypes for covariates");
+    residualizer cov_residualizer (sample_count);
+    for (int cov = 0 ; cov < covariate_count ; ++cov) cov_residualizer.push(covariate_val[cov]);
+    cov_residualizer.build(); // must follow the last push and precede the first residualize call, as in the original order
+    for (unsigned int phen = 0 ; phen < phenotype_count ; ++phen) cov_residualizer.residualize(phenotype_val[phen]);
+    vrb.bullet("#covariates = " + stb.str(covariate_count));
+}
+
diff --git a/src/mode_rtc/rtc_main.cpp b/src/mode_rtc/rtc_main.cpp
new file mode 100644
index 0000000..45b9103
--- /dev/null
+++ b/src/mode_rtc/rtc_main.cpp
@@ -0,0 +1,384 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+void rtc_main(vector < string > & argv) {
+ rtc_data D;
+
+ //-------------------------
+ // 1. DECLARE ALL OPTIONS
+ //-------------------------
+ D.declareBasicOptions(); //Mandatory
+
+ boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+ opt_files.add_options()
+ ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF/BED format.")
+ ("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
+ ("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+ ("hotspots", boost::program_options::value< string >(), "Recombination hotspots in BED format.")
+ ("out", boost::program_options::value< string >(), "Output file.")
+ ("stats-vcf", boost::program_options::value< string >(), "Genotypes from which D' and r2 are calculated in VCF/BCF format. (Defaults to --vcf, MUST HAVE PHASED GENOTYPES")
+ ("stats-vcf-include-samples", boost::program_options::value< string >(), "Include sample list for --stats-vcf")
+ ("stats-vcf-exclude-samples", boost::program_options::value< string >(), "Exclude sample list for --stats-vcf");
+
+ boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+ opt_parameters.add_options()
+ ("normal", "Normal transform the phenotypes.")
+ ("conditional", "Do conditional analysis.")
+ ("debug", "Print debugging info for sampling to STDERR.")
+ ("warnings", "Print all encountered individual warnings to log/STDOUT.")
+ ("header", "Add a header to the output file when --chunk or --region is active.")
+ ("individual-Dprime", "Will calculate D' on an individual variant basis. If not provided D' will not be calculated after first unphased genotype is encountered.")
+ ("mem",boost::program_options::value< unsigned int >()->default_value(0), "Keep results of calculations that may be used multiple times in memory. 0 = nothing in mem, 1 = only basic, 2 = all in mem but clean after unlikely to be reused, 3 = all in mem no cleaning")
+ ("mem-est", "Estimate memory usage and exit.")
+ ("window", boost::program_options::value< unsigned int >()->default_value(1000000), "Size of the cis-window.")
+ ("sample", boost::program_options::value< unsigned int >()->default_value(0), "Sample iterations to assess RTC significance.")
+ ("max-sample", boost::program_options::value< unsigned int >()->default_value(50,"--sample * 50"), "Max number of sample iterations trying to reach --sample before quitting. (Provide the actual number not the multiplier)")
+ ("R2-threshold", boost::program_options::value< double >()->default_value(0.5), "R2 threshold used in sampling")
+ ("D-prime-threshold", boost::program_options::value< double >()->default_value(2,"OFF"), "If the pairs of variants have a D' greater than this the RTC calculation is extended to multiple regions. (Assumes D' can be calculated");
+
+ boost::program_options::options_description opt_aggr ("\x1B[32mPhenotype aggregation methods\33[0m");
+ opt_aggr.add_options()
+ ("grp-best", "Correct for multiple phenotypes within a group.");
+
+ boost::program_options::options_description opt_columns ("\x1B[32mColumns (1-based)\33[0m");
+ opt_columns.add_options()
+ ("pheno-col", boost::program_options::value< unsigned int >()->default_value(1, "1 or 5 when --grp-best"), "Phenotype column")
+ ("geno-col", boost::program_options::value< unsigned int >()->default_value(8,"8 or 9 when --grp-best"), "Genotype column")
+ ("grp-col", boost::program_options::value< unsigned int >()->default_value(1), "Phenotype group column")
+ ("rank-col", boost::program_options::value< unsigned int >()->default_value(12,"12 or 13 when --grp-best"), "Conditional analysis rank column")
+ ("best-col", boost::program_options::value< unsigned int >()->default_value(19,"19 or 20 when --grp-best"), "Conditional analysis best variant column");
+
+ boost::program_options::options_description opt_modes ("\x1B[32mAnalysis type\33[0m");
+ opt_modes.add_options()
+ ("gwas-cis", boost::program_options::value< vector < string > >()->multitoken(), "MODE1: RTC for GWAS and cis-eQTL integration.")
+ ("gwas-trans", boost::program_options::value< vector < string > >()->multitoken(), "MODE2: RTC for GWAS and trans-eQTL integration.")
+ ("mergeQTL-cis", boost::program_options::value< vector < string > >()->multitoken(), "MODE3: RTC for cis-eQTL and cis-eQTL integration.")
+ ("mergeQTL-trans", boost::program_options::value< vector < string > >()->multitoken(), "MODE4: RTC for trans-eQTL and trans-eQTL integration.");
+
+ boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+ opt_parallel.add_options()
+ ("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed. Chunk 0 is a special chunk which only prints out the header.")
+ ("region", boost::program_options::value< string >(), "Region of interest.");
+
+ D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_aggr).add(opt_columns).add(opt_modes).add(opt_parallel);
+
+ //-------------------
+ // 2. PARSE OPTIONS
+ //-------------------
+ try {
+ boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+ boost::program_options::notify(D.options);
+ } catch ( const boost::program_options::error& e ) {
+ cerr << "Error parsing [rtc] command line :" << string(e.what()) << endl;
+ exit(0);
+ }
+
+ //---------------------
+ // 3. PRINT HELP/HEADER
+ //---------------------
+ vrb.ctitle("CALCULATE RTC SCORE");
+ if (D.options.count("help")) {
+ cout << D.option_descriptions << endl;
+ exit(EXIT_SUCCESS);
+ }
+
+
+ //-----------------
+ // 4. COMMON CHECKS
+ //-----------------
+ if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+ if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
+ if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+ if (!D.options.count("hotspots")) vrb.error("Output needs to be specified with --hotspots [file.bed]");
+ int nMode = D.options.count("gwas-cis") + D.options.count("gwas-trans") + D.options.count("mergeQTL-cis") + D.options.count("mergeQTL-trans");
+ if (nMode != 1) vrb.error("Please specify only one of these options [--gwas-cis, --gwas-trans, --mergeQTL-cis, --mergeQTL-trans]");
+ string outFile = D.options["out"].as < string > ();
+ if (D.options["pheno-col"].as < unsigned int > () < 1) vrb.error("--pheno-col must be greater than 0");
+ if (D.options["geno-col"].as < unsigned int > () < 1) vrb.error("--geno-col must be greater than 0");
+ if (D.options["rank-col"].as < unsigned int > () < 1) vrb.error("--rank-col must be greater than 0");
+ if (D.options["best-col"].as < unsigned int > () < 1) vrb.error("--best-col must be greater than 0");
+ if (D.options["grp-col"].as < unsigned int > () < 1) vrb.error("--grp-col must be greater than 0");
+ if (D.options["sample"].as <unsigned int> () == 0 && D.options.count("debug") ) vrb.error("--debug only applies when --sample");
+
+ //---------
+ // 5. MODES
+ //---------
+ vector < string > RTCfiles;
+ if (D.options.count("gwas-cis")) {
+ D.mode = RTC_MODE1;
+ vrb.bullet("TASK: RTC on GWAS variants and cis-eQTLs");
+ RTCfiles = D.options["gwas-cis"].as < vector < string > > ();
+ if (RTCfiles.size() != 2) vrb.error("Please provide 2 input files for --gwas-cis");
+ }
+
+ if (D.options.count("gwas-trans")) {
+ D.mode = RTC_MODE2;
+ vrb.bullet("TASK: RTC on GWAS variants and trans-eQTLs");
+ RTCfiles = D.options["gwas-trans"].as < vector < string > > ();
+ if (RTCfiles.size() != 2) vrb.error("Please provide 2 input files for --gwas-trans");
+ }
+
+ if (D.options.count("mergeQTL-cis")) {
+ D.mode = RTC_MODE3;
+ vrb.bullet("TASK: RTC on pairs of cis-QTLs");
+ RTCfiles = D.options["mergeQTL-cis"].as < vector < string > > ();
+ if (RTCfiles.size() != 2) vrb.error("Please provide 2 input files for --mergeQTL-cis");
+ }
+
+ if (D.options.count("mergeQTL-trans")) {
+ D.mode = RTC_MODE4;
+ vrb.bullet("TASK: RTC on pairs of trans-QTLs");
+ RTCfiles = D.options["mergeQTL-trans"].as < vector < string > > ();
+ if (RTCfiles.size() != 2) vrb.error("Please provide 2 input files for --mergeQTL-trans");
+ }
+
+ int n_aggregation_methods = D.options.count("grp-best") + D.options.count("grp-pca1") + D.options.count("grp-mean");
+ if (n_aggregation_methods > 1) vrb.error("Only one of the --grp-XXX options is allowed");
+ if (D.options.count("grp-best")) {
+ vrb.bullet("Phenotypes are regrouped within groups [method: best]");
+ D.grp_mode = GRP_BEST;
+ } else if (D.options.count("grp-pca1")) {
+ vrb.bullet("Phenotypes are regrouped within groups [method: pca1]");
+ D.grp_mode = GRP_PCA1;
+ } else if (D.options.count("grp-mean")) {
+ vrb.bullet("Phenotypes are regrouped within groups [method: mean]");
+ D.grp_mode = GRP_MEAN;
+ } else {
+ D.grp_mode = GRP_NONE;
+ }
+
+ //--------------
+ // 6. SET PARAMS
+ //--------------
+ if (D.options["window"].as < unsigned int > () > 1000000000) vrb.error("Incorrect cis-window size!");
+ vrb.bullet("Cis-window size is " + stb.str(D.options["window"].as < unsigned int > ()) + " bp");
+ D.cis_window = D.options["window"].as < unsigned int > ();
+ if (D.options.count("chunk")) {
+ vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+ if (nChunk.size() != 2 || nChunk[0] > nChunk[1]) vrb.error("Incorrect --chunk arguments!");
+ vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
+ if (nChunk[0] == 0){
+ output_file fdo(outFile);
+ if (fdo.fail()) vrb.error("Cannot open file [" + outFile + "]");
+ fdo <<"other_variant our_variant phenotype phenotype_group other_variant_chr other_variant_start other_variant_rank our_variant_chr our_variant_start our_variant_rank phenotype_chr phenotype_start distance_between_variants distance_between_other_variant_and_pheno other_variant_region_index our_variant_region_index region_start region_end variant_count_in_region RTC D' r^2";
+ if (D.options.count("sample")) fdo << " p_value unique_picks_H0 unique_picks_H1 rtc_bin_start rtc_bin_end rtc_bin_H0_proportion rtc_bin_H1_proportion median_r^2 median_H0 median_H1 H0 H1";
+ fdo << endl;
+ fdo.close();
+ vrb.leave("Header written");
+ }
+ } else if (D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");
+ if(D.options.count("conditional")) vrb.bullet("Doing conditional analysis.");
+ D.Dprime_cutoff = D.options["D-prime-threshold"].as < double > ();
+ D.R2_cutoff = D.options["R2-threshold"].as < double > ();
+ D.sample_iterations = D.options["sample"].as < unsigned int > ();
+ D.max_sample_iterations = D.options["max-sample"].defaulted() ? D.sample_iterations * D.options["max-sample"].as < unsigned int > () : D.options["max-sample"].as < unsigned int > ();
+ if(D.Dprime_cutoff < 0 ) vrb.error("Wrong D-prime threshold");
+ if(D.R2_cutoff < 0 || D.R2_cutoff > 1) vrb.error("Wrong R2 threshold");
+ if(D.sample_iterations) vrb.bullet(stb.str(D.sample_iterations) + " sample iterations.");
+ if(D.sample_iterations) vrb.bullet(stb.str(D.max_sample_iterations) + " max sample iterations.");
+ if(D.sample_iterations) vrb.bullet(stb.str(D.R2_cutoff) + " R2 threshold.");
+ if(D.Dprime_cutoff <= 1 )vrb.bullet(stb.str(D.Dprime_cutoff) + " D' threshold.");
+ else D.calculate_Dprime_R2 = false;
+ D.DprimeR2inMem = D.options["mem"].as < unsigned int > ();
+ if (D.grp_mode == GRP_BEST){
+ D.phenotype_column = D.options["pheno-col"].defaulted() ? 5 : D.options["pheno-col"].as < unsigned int > () - 1;
+ D.variant_column = D.options["geno-col"].defaulted() ? 8 : D.options["geno-col"].as < unsigned int > () - 1;
+ D.rank_column = D.options["rank-col"].defaulted() ? 12 : D.options["rank-col"].as < unsigned int > () - 1;
+ D.best_column = D.options["best-col"].defaulted() ? 19 : D.options["best-col"].as < unsigned int > () - 1;
+ D.group_column = D.options["grp-col"].defaulted() ? 0 : D.options["grp-col"].as < unsigned int > () - 1;
+ }else{
+ D.phenotype_column = D.options["pheno-col"].as < unsigned int > () - 1;
+ D.variant_column = D.options["geno-col"].as < unsigned int > () - 1;
+ D.rank_column = D.options["rank-col"].as < unsigned int > () - 1;
+ D.best_column = D.options["best-col"].as < unsigned int > () - 1;
+ D.group_column = D.options["grp-col"].defaulted() ? D.phenotype_column : D.options["grp-col"].as < unsigned int > () - 1;
+ }
+ vrb.bullet("Phenotype column (0-based) " + stb.str(D.phenotype_column));
+ vrb.bullet("Variant column (0-based) " + stb.str(D.variant_column));
+ vrb.bullet("Group column (0-based) " + stb.str(D.group_column));
+ if (D.options.count("conditional")){
+ vrb.bullet("Rank column (0-based) " + stb.str(D.rank_column));
+ vrb.bullet("Best column (0-based) " + stb.str(D.best_column));
+ }
+
+ //--------------
+ // 7. SET REGION
+ //--------------
+ if (D.options.count("chunk")) {
+ if (D.options.count("region")){
+ if (!D.setPhenotypeRegion(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
+ }
+ D.scanPhenotypes(D.options["bed"].as < string > ());
+ D.setPhenotypeRegion(D.options["chunk"].as < vector < int > > ()[0] - 1, D.options["chunk"].as < vector < int > > ()[1]);
+ //outFile += "." + D.regionPhenotype.get();
+ D.clear();
+ } else if (D.options.count("region")){
+ if (!D.setPhenotypeRegion(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
+ }
+
+
+ //---------------------------
+ // 8. READ FILES & INITIALIZE
+ //---------------------------
+ if (D.mode == RTC_MODE1 || D.mode == RTC_MODE3){
+ D.processBasicOptions(); //Mandatory
+ D.readHotspots(D.options["hotspots"].as < string > ());
+ D.deduceGenotypeRegion(D.options["window"].as < unsigned int > ());
+ D.readSampleFromBED(D.options["bed"].as < string > ()); //Read samples in BED
+ D.readSampleFromVCF(D.options["vcf"].as < string > ()); //Read samples in VCF
+ if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ()); //Read samples in COV
+ D.mergeSampleLists(); //Merge all sample lists
+ D.readPhenotypes(D.options["bed"].as < string > ()); //Read data in BED
+ D.readGenotypes(D.options["vcf"].as < string > ()); //Read data in VCF
+ if (D.options.count("cov")) D.readCovariates(D.options["cov"].as < string > ());
+ if (D.calculate_Dprime_R2){
+ if (D.options.count("stats-vcf") && D.options["stats-vcf"].as < string > () != D.options["vcf"].as < string > ()){
+ D.setStatsVCF(D.options["stats-vcf"].as < string > ());
+ if (D.options.count("stats-vcf-exclude-samples")) D.readSampleExclusionStats(D.options["stats-vcf-exclude-samples"].as <string> ());
+ if (D.options.count("stats-vcf-include-samples")) D.readSampleInclusionStats(D.options["stats-vcf-include-samples"].as <string> ());
+ }else{
+ D.setStatsVCF(D.options["vcf"].as < string > ());
+ D.copyIncludeExclude();
+ }
+ D.checkStatsVCF();
+ }
+ D.imputeGenotypes();
+ D.imputePhenotypes();
+ if (D.options.count("cov")) D.residualizePhenotypes();
+ if (D.options.count("normal")) D.normalTransformPhenotypes();
+ D.mapVariantsToColdspots();
+ D.collapsePhenotypes();
+ }else{
+ //SCAN TO FIND A LIST OF SNPS TO READ INTO MEMORY
+ D.processBasicOptions(); //Mandatory
+ D.readHotspots(D.options["hotspots"].as < string > ());
+ D.scanPhenotypes(D.options["bed"].as < string > ());
+ D.scanGenotypes(D.options["vcf"].as < string > ());
+ if (D.calculate_Dprime_R2){
+ if (D.options.count("stats-vcf") && D.options["stats-vcf"].as < string > () != D.options["vcf"].as < string > ()){
+ D.setStatsVCF(D.options["stats-vcf"].as < string > ());
+ if (D.options.count("stats-vcf-exclude-samples")) D.readSampleExclusionStats(D.options["stats-vcf-exclude-samples"].as <string> ());
+ if (D.options.count("stats-vcf-include-samples")) D.readSampleInclusionStats(D.options["stats-vcf-include-samples"].as <string> ());
+ }else{
+ D.setStatsVCF(D.options["vcf"].as < string > ());
+ D.copyIncludeExclude();
+ }
+ D.checkStatsVCF();
+ }
+ D.mapVariantsToColdspots();
+ switch (D.mode) {
+ case RTC_MODE2:
+ if (D.options.count("conditional")) D.gwas_trans_conditional(RTCfiles[0], RTCfiles[1]);
+ else D.gwas_trans(RTCfiles[0], RTCfiles[1]);
+ break;
+ case RTC_MODE4:
+ if (D.options.count("conditional")) {
+ D.mergeqtl_trans_conditional(RTCfiles[0], RTCfiles[1]);
+ D.createTransLists();
+ }else {
+ D.mergeqtl_trans(RTCfiles[0], RTCfiles[1]);
+ D.createTransLists();
+ }
+ break;
+ }
+ D.clear();
+ //DO THE FINAL READING
+ D.processBasicOptions(); //Mandatory
+ D.readHotspots(D.options["hotspots"].as < string > ());
+ D.readSampleFromBED(D.options["bed"].as < string > ()); //Read samples in BED
+ D.readSampleFromVCF(D.options["vcf"].as < string > ()); //Read samples in VCF
+ if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ()); //Read samples in COV
+ D.mergeSampleLists(); //Merge all sample lists
+ D.readPhenotypes(D.options["bed"].as < string > ()); //Read data in BED
+ D.readGenotypes(D.options["vcf"].as < string > ()); //Read data in VCF
+ if (D.options.count("cov")) D.readCovariates(D.options["cov"].as < string > ());
+ D.imputeGenotypes();
+ D.imputePhenotypes();
+ if (D.options.count("cov")) D.residualizePhenotypes();
+ if (D.options.count("normal")) D.normalTransformPhenotypes();
+ D.mapVariantsToColdspots();
+ D.collapsePhenotypes();
+ }
+
+ //----------------
+ // 9. RUN ANALYSIS
+ //----------------
+
+ switch (D.mode) {
+ case RTC_MODE1:
+ if (D.options.count("conditional")) D.gwas_cis_conditional(RTCfiles[0], RTCfiles[1]);
+ else D.gwas_cis(RTCfiles[0], RTCfiles[1]);
+ break;
+ case RTC_MODE2:
+ if (D.options.count("conditional")) D.gwas_trans_conditional(RTCfiles[0], RTCfiles[1]);
+ else D.gwas_trans(RTCfiles[0], RTCfiles[1]);
+ break;
+ case RTC_MODE3:
+ if (D.options.count("conditional")) D.mergeqtl_cis_conditional(RTCfiles[0], RTCfiles[1]);
+ else D.mergeqtl_cis(RTCfiles[0], RTCfiles[1]);
+ break;
+ case RTC_MODE4:
+ if (D.options.count("conditional")) D.mergeqtl_trans_conditional(RTCfiles[0], RTCfiles[1]);
+ else D.mergeqtl_trans(RTCfiles[0], RTCfiles[1]);
+ break;
+ }
+
+ //Estimate mem usage and exit
+ if (D.options.count("mem-est")){
+ vrb.leave("This is just an estimate: " + stb.str(D.getMemoryUsage()));
+ }
+
+ //Deallocate memory in DprimeRsquareSink
+ unordered_map < string , unordered_map< string, vector <double> > >().swap(D.DprimeRsquareSink);
+
+ //Print warnings
+ set <string>::iterator it;
+ if (D.unfound_regions.size()){
+ vrb.warning(stb.str(D.unfound_regions.size()) + " genotypes were missing from [" + D.stats_vcf_file + "]");
+ if( D.options.count("warnings") ) for (it = D.unfound_regions.begin(); it != D.unfound_regions.end(); it++) vrb.print(*it);
+ }
+ if (D.unphased_regions.size()){
+ vrb.warning(stb.str(D.unphased_regions.size()) + " genotypes were unphased in [" + D.stats_vcf_file + "]");
+ for (it = D.unphased_regions.begin(); it != D.unphased_regions.end(); it++) vrb.print(*it);
+ }
+ if (D.no_variance_regions.size()){
+ vrb.warning(stb.str(D.no_variance_regions.size()) + " genotypes had no variance in [" + D.stats_vcf_file + "]");
+ if( D.options.count("warnings") ) for (it = D.no_variance_regions.begin(); it != D.no_variance_regions.end(); it++) vrb.print(*it);
+ }
+ if (D.unmatched_alleles.size()){
+ vrb.warning(stb.str(D.unmatched_alleles.size()) + " genotypes had mismatching alleles between [" + D.options["vcf"].as < string > () + "] and [" + D.stats_vcf_file + "]");
+ if( D.options.count("warnings") ) for (it = D.unmatched_alleles.begin(); it != D.unmatched_alleles.end(); it++) vrb.print(*it);
+ }
+ if (D.unfound_ids.size()){
+ vrb.warning(stb.str(D.unfound_ids.size()) + " genotypes could not be found in [" + D.options["vcf"].as < string > () + "]");
+ if( D.options.count("warnings") ) for (it = D.unfound_ids.begin(); it != D.unfound_ids.end(); it++) vrb.print(*it);
+ }
+ if (D.unfound_phenotypes.size()){
+ vrb.warning(stb.str(D.unfound_phenotypes.size()) + " phenotypes could not be found in [" + D.options["bed"].as < string > () + "]");
+ if( D.options.count("warnings") ) for (it = D.unfound_phenotypes.begin(); it != D.unfound_phenotypes.end(); it++) vrb.print(*it);
+ }
+
+ //Deallocate memory in warnings
+ set < string >().swap(D.unfound_ids);
+ set < string >().swap(D.unfound_phenotypes);
+ set < string >().swap(D.unfound_regions);
+ set < string >().swap(D.unphased_regions);
+ set < string >().swap(D.unmatched_alleles);
+ set < string >().swap(D.no_variance_regions);
+
+ D.calculateRTC(outFile);
+}
diff --git a/src/mode_rtc/rtc_management.cpp b/src/mode_rtc/rtc_management.cpp
new file mode 100644
index 0000000..4d11f4c
--- /dev/null
+++ b/src/mode_rtc/rtc_management.cpp
@@ -0,0 +1,196 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+// Mean-impute genotypes: per variant, entries equal to bcf_float_missing or NaN become the mean of the other entries.
+void rtc_data::imputeGenotypes() {
+    vrb.title("Imputing missing genotypes");
+    for (int variant = 0; variant < genotype_count; ++variant) {
+        double observed_sum = 0.0;
+        int observed_n = 0;
+        for (int i = 0; i < sample_count; ++i) {
+            if (genotype_val[variant][i] == bcf_float_missing || std::isnan(genotype_val[variant][i])) continue;
+            observed_sum += genotype_val[variant][i];
+            ++observed_n;
+        }
+        const double fill = observed_sum / observed_n; // NOTE(review): this is 0.0 / 0 when no entry is observed
+        for (int i = 0; i < sample_count; ++i) if (genotype_val[variant][i] == bcf_float_missing || std::isnan(genotype_val[variant][i])) genotype_val[variant][i] = fill;
+    }
+}
+
+// Mean-impute phenotypes: per phenotype, entries equal to bcf_float_missing or NaN become the mean of the other entries.
+void rtc_data::imputePhenotypes() {
+    vrb.title("Imputing missing phenotypes");
+    for (int row = 0; row < phenotype_count; ++row) {
+        double observed_sum = 0.0;
+        int observed_n = 0;
+        for (int i = 0; i < sample_count; ++i) {
+            if (phenotype_val[row][i] == bcf_float_missing || std::isnan(phenotype_val[row][i])) continue;
+            observed_sum += phenotype_val[row][i];
+            ++observed_n;
+        }
+        const double fill = observed_sum / observed_n; // NOTE(review): this is 0.0 / 0 when no entry is observed
+        for (int i = 0; i < sample_count; ++i) if (phenotype_val[row][i] == bcf_float_missing || std::isnan(phenotype_val[row][i])) phenotype_val[row][i] = fill;
+    }
+}
+
+// In-place rank-based transform of V: the ranks from myranker::rank() are shifted by -0.5,
+// divided by (largest shifted rank + 0.5) and passed to qnorm(p, 0.0, 1.0, 1, 0)
+// (presumably the normal quantile function - confirm). Only the first sample_count entries are processed.
+void rtc_data::normalTransform(vector < float > & V) {
+    vector < float > ranks;
+    myranker::rank(V, ranks);
+    double top = 0;
+    for (int i = 0; i < sample_count; ++i) {
+        ranks[i] -= 0.5;
+        top = (ranks[i] > top) ? ranks[i] : top;
+    }
+    top += 0.5;
+    for (int i = 0; i < sample_count; ++i) { ranks[i] /= top; V[i] = qnorm(ranks[i], 0.0, 1.0, 1, 0); }
+}
+
+void rtc_data::normalTransformPhenotypes() { // Runs normalTransform() on each of the phenotype_count rows of phenotype_val.
+    vrb.title("Match phenotypes to Normal distribution");
+    for (int row = 0; row < phenotype_count; ++row) normalTransform(phenotype_val[row]);
+}
+
+// Center X on its mean, then scale it to unit Euclidean norm, all in place.
+// If the norm after centering is zero the divisor becomes 1, so X is only centered.
+void rtc_data::normalize(vector < float > & X) {
+    double center = 0.0;
+    double sq_norm = 0.0;
+    for (int i = 0; i < X.size(); ++i) center += X[i];
+    center /= X.size();
+    for (int i = 0; i < X.size(); ++i) { X[i] -= center; sq_norm += X[i] * X[i]; }
+    double scale = sqrt(sq_norm);
+    if (scale == 0) scale = 1;
+    for (int i = 0; i < X.size(); ++i) X[i] /= scale;
+}
+
+
+// Row-wise variant: center each row of X over its first sample_count entries and scale it to
+// unit Euclidean norm in place; a row whose norm is zero after centering is divided by 1.
+void rtc_data::normalize(vector < vector < float > > & X) {
+    for (int r = 0; r < X.size(); ++r) {
+        vector < float > & row = X[r];
+        double center = 0.0, sq_norm = 0.0;
+        for (int i = 0; i < sample_count; ++i) center += row[i];
+        center /= sample_count;
+        for (int i = 0; i < sample_count; ++i) { row[i] -= center; sq_norm += row[i] * row[i]; }
+        double scale = sqrt(sq_norm);
+        if (scale == 0) scale = 1;
+        for (int i = 0; i < sample_count; ++i) row[i] /= scale;
+    }
+}
+
+vector <float> rtc_data::correct(vector < float > X, vector < float > Y) { // Returns Y[s] - getCorrelation(X, Y) * X[s] for the first sample_count entries.
+    const double weight = getCorrelation(X, Y);
+    vector < float > residual(sample_count);
+    for (int i = 0; i < sample_count; ++i) residual[i] = Y[i] - weight * X[i];
+    return residual;
+}
+
+// Returns 1 when every sample flagged with stats_mappingS[i] == 1 has a phased GT in `line`, otherwise 0 (also 0 when the record has no GT field).
+// Haploid genotypes are considered phased; otherwise a sample counts as phased once a non-missing allele code has its lowest bit set.
+// Upstream note: ./. => not phased, .|. => phased. NOTE(review): missing alleles are skipped by the loop below - confirm the .|. case.
+// From vcfview.c
+int rtc_data::bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line){
+ bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
+ int all_phased = 1;
+ if ( fmt_ptr ){
+ int i, isample;
+ for (isample=0; isample<line->n_sample; isample++){
+ if (stats_mappingS[isample] != 1) continue; // only samples flagged 1 in stats_mappingS are examined
+ int sample_phased = 0;
+ #define BRANCH_INT(type_t,vector_end) { \
+ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
+ for (i=0; i<fmt_ptr->n; i++) \
+ { \
+ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
+ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
+ if ((p[i])&1) { \
+ sample_phased = 1; \
+ break; \
+ } \
+ } \
+ }
+ switch (fmt_ptr->type) { // instantiate the check for the integer width used to store GT
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; // any other storage type terminates the program
+ }
+ #undef BRANCH_INT
+ if (!sample_phased) { // one unphased sample settles the answer, so stop scanning
+ all_phased = 0;
+ break;
+ }
+ }
+ }else all_phased = 0; // no GT field: reported as not phased
+ return all_phased;
+}
+
+
+void rtc_data::readSampleExclusionStats(string fname){ // Loads fname as the exclusion list of stats_vcf_sample_filter.
+    vrb.title("Read sample exclusion list [" + fname + "]");
+    const int n_samples = stats_vcf_sample_filter.readExclusion(fname);
+    if (n_samples >= 0) vrb.bullet(stb.str(n_samples) + " samples");
+    else vrb.error("Cannot open file!"); // a negative return value is treated as a failure to open the list
+}
+
+void rtc_data::readSampleInclusionStats(string fname){ // Loads fname as the inclusion list of stats_vcf_sample_filter.
+    vrb.title("Read sample inclusion list [" + fname + "]");
+    const int n_samples = stats_vcf_sample_filter.readInclusion(fname);
+    if (n_samples >= 0) vrb.bullet(stb.str(n_samples) + " samples");
+    else vrb.error("Cannot open file!"); // a negative return value is treated as a failure to open the list
+}
+
+void rtc_data::copyIncludeExclude(){ // The stats VCF reuses the sample filter held in filter_sample.
+    this->stats_vcf_sample_filter = this->filter_sample;
+}
+
+void rtc_data::setStatsVCF(string file){ // Records `file` in stats_vcf_file; nothing is opened here.
+    this->stats_vcf_file = file;
+}
+
+// Estimate memory use in bytes: the resident set size (VmRSS in /proc/self/status, else the genotype and phenotype matrix
+// capacities), plus coldspot storage, the extra buffers counted when DprimeR2inMem is set, and one float per variant pair of the largest coldspot.
+long long unsigned int rtc_data::getMemoryUsage(){
+    long long unsigned int mem = 0;
+    ifstream file("/proc/self/status");
+    if (file.is_open()){
+        string buffer;
+        vector < string > str;
+        while(getline(file, buffer)) {
+            stb.split(buffer, str);
+            if (str.size() < 2 || str[0] != "VmRSS:") continue; // also skips lines too short to index safely
+            mem += strtoull(str[1].c_str(), NULL, 10) * 1024ULL; // VmRSS is reported in kB; 64-bit product (atoi() * 1024 overflowed int from 2 GiB on)
+            break;
+        }
+        file.close();
+    }else mem += (genotype_val.capacity() * genotype_val[0].capacity() + phenotype_val.capacity() * phenotype_val[0].capacity()) * sizeof(float);
+    long long unsigned int max_variants = 0; // number of variants in the largest coldspot
+    for (int c = 0 ; c < all_coldspots.size(); c++){
+        if (all_coldspots[c].coldspot_variant_idx.size() > max_variants) max_variants = all_coldspots[c].coldspot_variant_idx.size();
+        mem += all_coldspots[c].getMemoryUsage();
+    }
+    cerr << max_variants << endl;
+    if(DprimeR2inMem) mem += ((sample_iterations * 2 * max_variants * phenotype_val[0].capacity() * sizeof(float)) + (max_variants * genotype_val[0].capacity() * sizeof(float)) );
+    mem += max_variants * (max_variants - 1) / 2 * sizeof(float);
+    return mem;
+}
diff --git a/src/mode_rtc/rtc_mergeQTL_cis.cpp b/src/mode_rtc/rtc_mergeQTL_cis.cpp
new file mode 100644
index 0000000..2da18f4
--- /dev/null
+++ b/src/mode_rtc/rtc_mergeQTL_cis.cpp
@@ -0,0 +1,148 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+void rtc_data::mergeqtl_cis_conditional(string frtc1, string frtc2){ // Pairs the conditional cis hits of frtc1 and frtc2 that share a group and queues every pair in pheno_eqtls.
+ string buffer;
+ vector < string > str; // not referenced again in this function
+ map < string , vector < int > > file1; // group -> genotype indices read from frtc1
+ map < string , vector < int > > pheno1; // group -> phenotype indices read from frtc1 (filled, never read below)
+ map < string , map < int , int > > rank1; // group -> (genotype index -> rank) read from frtc1
+ map < string , vector < int > >::iterator it;
+ unsigned int count = 0; // total number of pairs queued
+ vrb.title("Reading conditional cis QTLtools output in [" + frtc1 + "]");
+ input_file fd (frtc1);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ string pheno,snp,best,group;
+ int rank;
+ while(getline(fd, buffer)) {
+ if(readRTCline(buffer,pheno,snp,best,group,rank)){ // lines for which readRTCline() returns false are ignored
+ file1[group].push_back(genotype_id_to_idx[snp]); // NOTE(review): operator[] lookup - confirm readRTCline() only accepts ids present in genotype_id_to_idx
+ rank1[group][genotype_id_to_idx[snp]] = rank;
+ pheno1[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd.close();
+
+ vrb.title("Reading conditional cis QTLtools output in [" + frtc2 + "]");
+ input_file fd2 (frtc2);
+ if (fd2.fail()) vrb.error("Cannot open file!");
+ map < string , vector < int > > file2; // group -> genotype indices read from frtc2
+ map < string , vector < int > > pheno2; // group -> phenotype indices read from frtc2, parallel to file2
+ map < string , map < int , int > > rank2; // group -> (genotype index -> rank) read from frtc2
+ while(getline(fd2, buffer)) {
+ if(readRTCline(buffer,pheno,snp,best,group,rank)){
+ if (file1.count(group)==0) continue; // keep only groups that also occur in frtc1
+ file2[group].push_back(genotype_id_to_idx[snp]);
+ rank2[group][genotype_id_to_idx[snp]] = rank;
+ pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd2.close();
+ vrb.bullet(stb.str(file2.size()) + " common phenotypes found");
+ vrb.title("Merging and calculating D' and R2");
+ unsigned int event_count=0,pairs_tested =0;
+ for (it = file2.begin(); it!= file2.end(); it++){
+ event_count++;
+ if(event_count % DprimePrintFreq == 0) { // progress line every DprimePrintFreq groups
+ vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+ pairs_tested =0; // the pair counter restarts after each progress line
+ }
+ vector <int > our = it->second; // genotype indices of this group in frtc2
+ string pheno = it->first; // group key; shadows the outer 'pheno'
+ vector < int > ourpi = pheno2[pheno]; // phenotype indices parallel to 'our'
+ vector <int> their = file1[pheno]; // genotype indices of the same group in frtc1
+ for (int o = 0 ; o < our.size(); o++){
+ vector <int> others; // every frtc2 variant of this group except our[o]
+ for (int oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+ int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+ for (int t = 0 ;t < their.size(); t++){
+ int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+ vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]); // empty allele strings are passed when genotype_alleles is empty
+ pheno_eqtls[pheno].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],rank1[pheno][their[t]],rank2[pheno][our[o]],ourpi[o])); // info[0] and info[1] come from getDprimeRsquare()
+
+ count++;
+ pairs_tested++;
+ }
+ }
+ }
+ vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+ if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
+
+void rtc_data::mergeqtl_cis(string frtc1, string frtc2){ // Pairs the cis hits of frtc1 and frtc2 that share a group and queues every pair in pheno_eqtls (both ranks are passed as 0).
+ string buffer;
+ vector < string > str; // not referenced again in this function
+ map < string , vector < int > > file1,pheno1; // group -> genotype / phenotype indices read from frtc1 (pheno1 is never read below)
+ map < string , vector < int > >::iterator it;
+ unsigned int count = 0; // total number of pairs queued
+ vrb.title("Reading cis QTLtools output in [" + frtc1 + "]");
+ input_file fd (frtc1);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ string pheno,snp,group;
+ unsigned int line_count = 0; // handed to readRTCline() and reset to 0 before the second file
+ while(getline(fd, buffer)) {
+ if(readRTCline(buffer,pheno,snp,group,line_count)){ // lines for which readRTCline() returns false are ignored
+ file1[group].push_back(genotype_id_to_idx[snp]); // NOTE(review): operator[] lookup - confirm readRTCline() only accepts ids present in genotype_id_to_idx
+ pheno1[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd.close();
+ line_count = 0;
+ vrb.title("Reading cis QTLtools output in [" + frtc2 + "]");
+ input_file fd2 (frtc2);
+ if (fd2.fail()) vrb.error("Cannot open file!");
+ map < string , vector < int > > file2,pheno2; // same layout as file1/pheno1, for frtc2
+ while(getline(fd2, buffer)) {
+ if(readRTCline(buffer,pheno,snp,group,line_count)){
+ if (file1.count(group)==0) continue; // keep only groups that also occur in frtc1
+ file2[group].push_back(genotype_id_to_idx[snp]);
+ pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd2.close();
+
+ vrb.bullet(stb.str(file2.size()) + " common phenotypes found");
+ vrb.title("Merging and calculating D' and R2");
+ unsigned int event_count=0,pairs_tested =0;
+ for (it = file2.begin(); it!= file2.end(); it++){
+ event_count++;
+ if(event_count % DprimePrintFreq == 0) { // progress line every DprimePrintFreq groups
+ vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+ pairs_tested =0; // the pair counter restarts after each progress line
+ }
+ vector <int > our = it->second; // genotype indices of this group in frtc2
+ string pheno = it->first; // group key; shadows the outer 'pheno'
+ vector < int > ourpi = pheno2[pheno]; // phenotype indices parallel to 'our'
+ vector <int> their = file1[pheno]; // genotype indices of the same group in frtc1
+ for (int o = 0 ; o < our.size(); o++){
+ vector <int> others; // every frtc2 variant of this group except our[o]
+ for (int oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+ int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+ for (int t = 0 ;t < their.size(); t++){
+ int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+ vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]); // empty allele strings are passed when genotype_alleles is empty
+ pheno_eqtls[pheno].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,0,ourpi[o])); // info[0] and info[1] come from getDprimeRsquare()
+ count++;
+ pairs_tested++;
+ }
+ }
+ }
+ vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+ if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+
+}
+
diff --git a/src/mode_rtc/rtc_mergeQTL_trans.cpp b/src/mode_rtc/rtc_mergeQTL_trans.cpp
new file mode 100644
index 0000000..9c3a0bb
--- /dev/null
+++ b/src/mode_rtc/rtc_mergeQTL_trans.cpp
@@ -0,0 +1,148 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+void rtc_data::mergeqtl_trans_conditional(string frtc1, string frtc2){ // Pairs the conditional trans hits of frtc1 and frtc2 that share a group and lie within cis_window of each other; pairs go to pheno_eqtls.
+ string buffer;
+ vector < string > str; // not referenced again in this function
+ map < string , vector < int > > file1,pheno1; // group -> genotype / phenotype indices read from frtc1 (pheno1 is never read below)
+ map < string , map < int , int > > rank1; // group -> (genotype index -> rank) read from frtc1
+ map < string , vector < int > >::iterator it;
+ unsigned int count = 0; // total number of pairs queued
+ vrb.title("Reading conditional trans QTLtools output in [" + frtc1 + "]");
+ input_file fd (frtc1);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ string pheno,snp,best,group;
+ int rank;
+ while(getline(fd, buffer)) {
+ if(readRTCline(buffer,pheno,snp,best,group,rank)){ // lines for which readRTCline() returns false are ignored
+ file1[group].push_back(genotype_id_to_idx[snp]); // NOTE(review): operator[] lookup - confirm readRTCline() only accepts ids present in genotype_id_to_idx
+ rank1[group][genotype_id_to_idx[snp]] = rank;
+ pheno1[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd.close();
+
+ vrb.title("Reading conditional trans QTLtools output in [" + frtc2 + "]");
+ input_file fd2 (frtc2);
+ if (fd2.fail()) vrb.error("Cannot open file!");
+ map < string , vector < int > > file2,pheno2; // same layout as file1/pheno1, for frtc2
+ map < string , map < int , int > > rank2; // group -> (genotype index -> rank) read from frtc2
+ while(getline(fd2, buffer)) {
+ if(readRTCline(buffer,pheno,snp,best,group,rank)){
+ if (file1.count(group)==0) continue; // keep only groups that also occur in frtc1
+ file2[group].push_back(genotype_id_to_idx[snp]);
+ rank2[group][genotype_id_to_idx[snp]] = rank;
+ pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd2.close();
+ vrb.bullet(stb.str(file2.size()) + " common phenotypes found");
+ vrb.title("Merging and calculating D' and R2");
+ unsigned int event_count=0,pairs_tested =0;
+ for (it = file2.begin(); it!= file2.end(); it++){
+ event_count++;
+ if(event_count % DprimePrintFreq == 0) { // progress line every DprimePrintFreq groups
+ vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+ pairs_tested =0; // the pair counter restarts after each progress line
+ }
+ vector <int > our = it->second; // genotype indices of this group in frtc2
+ string pheno = it->first; // group key; shadows the outer 'pheno'
+ vector < int > ourpi = pheno2[pheno]; // phenotype indices parallel to 'our'
+ vector <int> their = file1[pheno]; // genotype indices of the same group in frtc1
+ for (int o = 0 ; o < our.size(); o++){
+ vector <int> others; // every frtc2 variant of this group except our[o]
+ for (int oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+ int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+ for (int t = 0 ;t < their.size(); t++){
+ if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - genotype_start[our[o]]) <= cis_window ){ // only same-chromosome variants at most cis_window apart are paired
+ int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+ vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]); // empty allele strings are passed when genotype_alleles is empty
+ pheno_eqtls[pheno].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],rank1[pheno][their[t]],rank2[pheno][our[o]],ourpi[o])); // info[0] and info[1] come from getDprimeRsquare()
+ count++;
+ pairs_tested++;
+ }
+ }
+ }
+ }
+ vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+ if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+}
+
+void rtc_data::mergeqtl_trans(string frtc1, string frtc2){ // Pairs the trans hits of frtc1 and frtc2 that share a group and lie within cis_window of each other; pairs go to pheno_eqtls (both ranks are passed as 0).
+ string buffer;
+ vector < string > str; // not referenced again in this function
+ map < string , vector < int > > file1,pheno1; // group -> genotype / phenotype indices read from frtc1 (pheno1 is never read below)
+ map < string , vector < int > >::iterator it;
+ unsigned int count = 0; // total number of pairs queued
+ vrb.title("Reading trans QTLtools output in [" + frtc1 + "]");
+ input_file fd (frtc1);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ string pheno,snp,group;
+ unsigned int line_count = 0; // handed to readRTCline() and reset to 0 before the second file
+ while(getline(fd, buffer)) {
+ if(readRTCline(buffer,pheno,snp,group,line_count)){ // lines for which readRTCline() returns false are ignored
+ file1[group].push_back(genotype_id_to_idx[snp]); // NOTE(review): operator[] lookup - confirm readRTCline() only accepts ids present in genotype_id_to_idx
+ pheno1[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd.close();
+ line_count = 0;
+ vrb.title("Reading trans QTLtools output in [" + frtc2 + "]");
+ input_file fd2 (frtc2);
+ if (fd2.fail()) vrb.error("Cannot open file!");
+ map < string , vector < int > > file2,pheno2; // same layout as file1/pheno1, for frtc2
+ while(getline(fd2, buffer)) {
+ if(readRTCline(buffer,pheno,snp,group,line_count)){
+ if (file1.count(group)==0) continue; // keep only groups that also occur in frtc1
+ file2[group].push_back(genotype_id_to_idx[snp]);
+ pheno2[group].push_back(phenotype_id_to_idx[pheno]);
+ }
+ }
+ fd2.close();
+ vrb.bullet(stb.str(file2.size()) + " common phenotypes found");
+ vrb.title("Merging and calculating D' and R2");
+ unsigned int event_count=0,pairs_tested =0;
+ for (it = file2.begin(); it!= file2.end(); it++){
+ event_count++;
+ if(event_count % DprimePrintFreq == 0) { // progress line every DprimePrintFreq groups
+ vrb.bullet(stb.str(event_count) + " common phenotypes processed [" + stb.str(pairs_tested) + " pairs]");
+ pairs_tested =0; // the pair counter restarts after each progress line
+ }
+ vector <int > our = it->second; // genotype indices of this group in frtc2
+ string pheno = it->first; // group key; shadows the outer 'pheno'
+ vector < int > ourpi = pheno2[pheno]; // phenotype indices parallel to 'our'
+ vector <int> their = file1[pheno]; // genotype indices of the same group in frtc1
+ for (int o = 0 ; o < our.size(); o++){
+ vector <int> others; // every frtc2 variant of this group except our[o]
+ for (int oo = 0 ; oo < our.size(); oo++) if (o != oo) others.push_back(our[oo]);
+ int eqtl_snp_csi = getColdspot(genotype_chr[our[o]], genotype_start[our[o]]);
+ for (int t = 0 ;t < their.size(); t++){
+ if (genotype_chr[their[t]] == genotype_chr[our[o]] && abs(genotype_start[their[t]] - genotype_start[our[o]]) <= cis_window ){ // only same-chromosome variants at most cis_window apart are paired
+ int test_snp_csi = getColdspot(genotype_chr[their[t]], genotype_start[their[t]]);
+ vector <double> info= genotype_alleles.size() ? getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],genotype_alleles[their[t]], genotype_alleles[our[o]],their[t],our[o]) : getDprimeRsquare(genotype_chr[their[t]], genotype_start[their[t]] , genotype_chr[our[o]], genotype_start[our[o]],"","",their[t],our[o]); // empty allele strings are passed when genotype_alleles is empty
+ pheno_eqtls[pheno].push_back(pairsToTestForRTC(their[t], our[o], others, test_snp_csi, eqtl_snp_csi, info[0] , info[1],0,0,ourpi[o])); // info[0] and info[1] come from getDprimeRsquare()
+ count++;
+ pairs_tested++;
+ }
+ }
+ }
+ }
+ vrb.bullet(stb.str(count) + " potential merge events found for " + stb.str(pheno_eqtls.size()) + " phenotypes.");
+ if (pheno_eqtls.size() == 0 ) vrb.leave("No merge events found!");
+
+}
+
diff --git a/src/mode_rtc/rtc_read_covariates.cpp b/src/mode_rtc/rtc_read_covariates.cpp
new file mode 100644
index 0000000..8ace4c4
--- /dev/null
+++ b/src/mode_rtc/rtc_read_covariates.cpp
@@ -0,0 +1,55 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+// Load the covariate table from `fcov`. The header row names the samples (from column 2 on);
+// every following row is one covariate whose first field is its ID. Values belonging to samples
+// that findSample() does not know are dropped; rows rejected by filter_covariate are only counted.
+void rtc_data::readCovariates(string fcov) {
+    string buffer;
+    vector < string > str;
+    int n_includedS = 0;
+    int n_includedC = 0;
+    int n_excludedC = 0;
+    vector < int > mappingS;
+    vrb.title("Reading covariates in [" + fcov + "]");
+    input_file fd (fcov);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    //Read samples: mappingS[k] is the sample index of file column k+1, negative when the sample is not used
+    getline(fd, buffer);
+    if (buffer.size() == 0) vrb.error("No header line detected!");
+    stb.split(buffer, str );
+    for (int t = 1 ; t < str.size() ; t ++) {
+        mappingS.push_back(findSample(str[t]));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+    //Read covariates
+    while(getline(fd, buffer)) {
+        stb.split(buffer, str);
+        // A row wider than the header used to read mappingS out of range; it is now rejected explicitly.
+        if (str.size() < 2 || str.size() > mappingS.size() + 1) vrb.error("Incorrect number of columns!");
+        if (filter_covariate.check(str[0])) {
+            covariate_val.push_back(vector < string > (sample_count));
+            for (int t = 1 ; t < str.size() ; t ++) if (mappingS[t-1] >= 0) covariate_val.back()[mappingS[t-1]] = str[t];
+            n_includedC ++;
+        } else n_excludedC ++;
+    }
+    //Finalise
+    covariate_count = n_includedC;
+    vrb.bullet(stb.str(n_includedC) + " covariates included");
+    if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariates excluded");
+    fd.close();
+}
diff --git a/src/mode_rtc/rtc_read_genotypes.cpp b/src/mode_rtc/rtc_read_genotypes.cpp
new file mode 100644
index 0000000..6c40557
--- /dev/null
+++ b/src/mode_rtc/rtc_read_genotypes.cpp
@@ -0,0 +1,430 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+/*
+ * Open `filename` once to let htslib detect its format, then hand the file to the
+ * matching reader: readGenotypesVCF() for bcf/vcf, readGenotypesBED() for sam.
+ * Any other detected format is reported through vrb.error().
+ */
+void rtc_data::readGenotypes(string filename) {
+    vrb.title("Reading genotype data in [" + filename + "]");
+    htsFile * fp = hts_open(filename.c_str(),"r");
+    //hts_open() yields NULL when the file cannot be opened; fp is dereferenced right below
+    if (!fp) vrb.error("Cannot open file!");
+    enum htsExactFormat fileformat = fp->format.format;
+    hts_close(fp);
+    if (fileformat == bcf) {
+        vrb.bullet("File format detected: BCF");
+        readGenotypesVCF(filename);
+    } else if (fileformat == vcf) {
+        vrb.bullet("File format detected: VCF");
+        readGenotypesVCF(filename);
+    } else if (fileformat == sam) {
+        //NOTE(review): a `sam` detection is treated as BED here; presumably that is how htslib classifies the BED input - confirm
+        vrb.bullet("File format detected: BED");
+        readGenotypesBED(filename);
+    } else vrb.error("File format not supported!");
+}
+
+/*
+ * Open `filename` once to let htslib detect its format, then hand the file to the
+ * matching scanner: scanGenotypesVCF() for bcf/vcf, scanGenotypesBED() for sam.
+ * Any other detected format is reported through vrb.error().
+ */
+void rtc_data::scanGenotypes(string filename) {
+    vrb.title("Scanning genotype data in [" + filename + "]");
+    htsFile * fp = hts_open(filename.c_str(),"r");
+    //hts_open() yields NULL when the file cannot be opened; fp is dereferenced right below
+    if (!fp) vrb.error("Cannot open file!");
+    enum htsExactFormat fileformat = fp->format.format;
+    hts_close(fp);
+    if (fileformat == bcf) {
+        vrb.bullet("File format detected: BCF");
+        scanGenotypesVCF(filename);
+    } else if (fileformat == vcf) {
+        vrb.bullet("File format detected: VCF");
+        scanGenotypesVCF(filename);
+    } else if (fileformat == sam) {
+        //NOTE(review): a `sam` detection is treated as BED here; presumably that is how htslib classifies the BED input - confirm
+        vrb.bullet("File format detected: BED");
+        scanGenotypesBED(filename);
+    } else vrb.error("File format not supported!");
+}
+
+/*
+ * When D'/R2 computation is requested, verify that stats_vcf_file is detected as VCF or BCF.
+ * If it is not, emit a warning and switch calculate_Dprime_R2 off.
+ */
+void rtc_data::checkStatsVCF(){
+    if (calculate_Dprime_R2) {
+        htsFile * handle = hts_open(stats_vcf_file.c_str(),"r");
+        if (handle == NULL) vrb.error("Cannot open file");
+        //Only the detected format is needed, so the handle is released immediately
+        const enum htsExactFormat detected = handle->format.format;
+        hts_close(handle);
+        const bool usable = (detected == vcf || detected == bcf);
+        if (!usable) {
+            vrb.warning("Stats VCF must be in VCF/BCF format [" + stats_vcf_file + "] D' will NOT be calculated");
+            calculate_Dprime_R2 = false;
+        }
+    }
+}
+
+
+/*
+ * Load bi-allelic variants from the VCF/BCF file `fvcf` into the genotype_* containers.
+ *
+ * When regionGenotype.chr is not "NA" the synced reader is first restricted to
+ * regionGenotype.get(). Per record:
+ *  - n_allele != 2                              -> counted as multi-allelic, skipped
+ *  - neither nds == n_samples nor ngt == 2*n    -> counted as uninformative, skipped
+ *  - ID rejected by filter_genotype             -> counted as user-excluded, skipped
+ * Retained records store ID, chromosome, "REF/ALT", start (pos + 1), end (INFO/END when
+ * available, otherwise start + len(REF) - 1) and one float per mapped sample: DS when
+ * present, else the sum of the two GT allele indexes, or bcf_float_missing when either
+ * allele is missing.
+ */
+void rtc_data::readGenotypesVCF(string fvcf) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+    if ( regionGenotype.chr != "NA"){
+        vrb.bullet("target region [" + regionGenotype.get() + "]");
+        if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
+    }
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        //NOTE(review): no break between cases; this presumes vrb.error() does not return
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("File not compressed with bgzip!");
+        case idx_load_failed: vrb.error("Impossible to load index file!");
+        case file_type_error: vrb.error("File format not detected by htslib!");
+        default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing: mappingS[i] is the findSample() result for VCF sample column i (negative = skipped)
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+
+    //Read genotype data
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    unsigned int linecount = 0;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            //Destination buffer is passed by address so htslib can (re)allocate it
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                if (filter_genotype.check(sid)) {
+                    genotype_id.push_back(sid);
+                    genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
+                    string genotype_ref = string(line->d.allele[0]);
+                    genotype_alleles.push_back(genotype_ref + "/" + string(line->d.allele[1]));
+                    genotype_start.push_back(line->pos + 1);
+                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
+                    else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
+                    genotype_val.push_back(vector < float > (sample_count, 0.0));
+
+                    for(int i = 0 ; i < n_samples ; i ++) {
+                        if (mappingS[i] >= 0) {
+                            if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
+                            else {
+                                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
+                                else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
+                            }
+                        }
+                    }
+                    pair < string, int > temp (sid,n_includedG);
+                    genotype_id_to_idx.insert(temp);
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize: release every buffer htslib allocated on our behalf
+    free(gt_arr);
+    free(ds_arr);
+    free(sl_arr);
+    bcf_sr_destroy(sr);
+    genotype_count = n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
+}
+
+/*
+ * Load genotypes from the tabix-indexed BED file `fbed` into the genotype_* containers.
+ *
+ * Layout used by this reader: columns 0-2 are chr/start/end, column 3 is the variant ID and
+ * sample values start at column 6 ("NA" is stored as bcf_float_missing). The header line
+ * must start with the index meta character and names the samples in the same columns.
+ * Header samples rejected by filter_sample, or unknown to findSample(), are skipped; an
+ * error is raised unless all sample_count samples are found.
+ * When regionGenotype.chr is not "NA" only records of regionGenotype.get() are read.
+ */
+void rtc_data::readGenotypesBED(string fbed) {
+    int n_includedG = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    int n_excludedS = 0;
+    int n_missingS = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot load index file!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names
+    //Samples start at column 6, exactly like the data rows below that index mappingS[t-6]
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (size_t i0 = 6 ; i0 < tokens.size() ; i0 ++) {
+        string sid = tokens[i0];
+        if (filter_sample.check(sid)) {
+            mappingS.push_back(findSample(sid));
+            if (mappingS.back() >= 0) n_includedS ++;
+            else n_missingS ++;
+        } else {
+            mappingS.push_back(-1);
+            n_excludedS ++;
+        }
+    }
+    vrb.bullet(stb.str(n_includedS) + " samples included");
+    if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
+    if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
+    if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");
+
+    unsigned int linecount = 0;
+
+    //Jump to interesting region
+    if (regionGenotype.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
+        vrb.bullet("target region [" + regionGenotype.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+            //A row wider than the header would index past the end of mappingS
+            if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
+            if (filter_genotype.check(tokens[3])) {
+                genotype_id.push_back(tokens[3]);
+                genotype_chr.push_back(tokens[0]);
+                genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                genotype_end.push_back(atoi(tokens[2].c_str()));
+                genotype_val.push_back(vector < float > (sample_count, 0.0));
+                for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                    if (mappingS[t-6] >= 0) {
+                        if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                        else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                    }
+                }
+                pair < string, int > temp (tokens[3],n_includedG);
+                genotype_id_to_idx.insert(temp);
+                n_includedG++;
+            } else n_excludedG_user ++;
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+            //A row wider than the header would index past the end of mappingS
+            if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
+            if (filter_genotype.check(tokens[3])) {
+                genotype_id.push_back(tokens[3]);
+                genotype_chr.push_back(tokens[0]);
+                genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                genotype_end.push_back(atoi(tokens[2].c_str()));
+                genotype_val.push_back(vector < float > (sample_count, 0.0));
+                for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                    if (mappingS[t-6] >= 0) {
+                        if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                        else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                    }
+                }
+                pair < string, int > temp (tokens[3],n_includedG);
+                genotype_id_to_idx.insert(temp);
+                n_includedG++;
+            } else n_excludedG_user ++;
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s); //line buffer grown by hts_getline()/tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file!");
+    genotype_count = n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
+}
+
+
+
+
+/*
+ * Fetch the genotypes of one variant from stats_vcf_file.
+ *
+ * region  : "chr:pos-pos" string; a record is used only when its own chr:pos-pos string is
+ *           identical to it.
+ * alleles : set to "REF/ALT" of each bi-allelic record that passes the GT and phasing checks
+ *           (this happens before the region comparison).
+ * values  : resized to the number of samples accepted by stats_vcf_sample_filter and filled
+ *           with the two allele indexes concatenated, or "NA" when an allele is missing.
+ *
+ * Returns the number of sample genotypes written, 0 when the region cannot be set, and -1
+ * when a bi-allelic record lacks a full GT array or bcf_all_phased() rejects it.
+ * The sample mask (stats_mappingS / stats_n_includedS) is built on the first call only.
+ */
+int rtc_data::readGenotypesVCFStats(string region, string &alleles, vector < string > &values) {
+    int n_includedG = 0;
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+    if (bcf_sr_set_regions(sr, region.c_str(), 0) == -1) {
+        bcf_sr_destroy(sr); //do not leak the reader on the early exit
+        return 0;
+    }
+    if(!(bcf_sr_add_reader (sr, stats_vcf_file.c_str()))) {
+        //NOTE(review): no break between cases; this presumes vrb.error() does not return
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("File not compressed with bgzip!");
+        case idx_load_failed: vrb.error("Impossible to load index file!");
+        case file_type_error: vrb.error("File format not detected by htslib!");
+        default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    if (stats_mappingS.size() == 0){
+        for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+            stats_mappingS.push_back(stats_vcf_sample_filter.check(string(sr->readers[0].header->samples[i0])));
+            if (stats_mappingS.back() == 1) stats_n_includedS++;
+        }
+    }
+    values = vector < string >(stats_n_includedS,"NA");
+
+    //Read genotype data
+    int ngt, ngt_arr = 0, * gt_arr = NULL;
+    bcf1_t * line;
+    bool failed = false; //set instead of returning from inside the loop so cleanup always runs
+    while(bcf_sr_next_line (sr)) {
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele != 2) continue;
+        //Destination buffer is passed by address so htslib can (re)allocate it
+        ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+        if (ngt != 2*n_samples) { failed = true; break; }
+        bcf_unpack(line, BCF_UN_STR);
+        if (!bcf_all_phased(sr->readers[0].header, line)) { failed = true; break; }
+        alleles = string(line->d.allele[0]) + "/" + string(line->d.allele[1]);
+        string gs = stb.str(line->pos + 1);
+        string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
+        string tempRegion = chr + ":" + gs + "-" + gs;
+        if (region != tempRegion) continue;
+        int index = 0;
+        for(int i = 0 ; i < n_samples ; i ++) {
+            if (stats_mappingS[i] == 1) {
+                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) values[index] = "NA";
+                else values[index] = stb.str(bcf_gt_allele(gt_arr[2*i+0])) + stb.str(bcf_gt_allele(gt_arr[2*i+1]));
+                index++;
+                n_includedG++;
+            }
+        }
+    }
+
+    //Finalize
+    free(gt_arr);
+    bcf_sr_destroy(sr);
+    return failed ? -1 : n_includedG;
+}
+
+/*
+ * Scan the VCF/BCF file `fvcf` and record variant metadata only (ID, chromosome, start,
+ * end and the ID -> index map); no genotype values are stored.
+ *
+ * Records are filtered like in the full reader: n_allele != 2 is counted as multi-allelic,
+ * records with neither nds == n_samples nor ngt == 2*n_samples as uninformative, and IDs
+ * rejected by filter_genotype as user-excluded.
+ */
+void rtc_data::scanGenotypesVCF(string fvcf) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        //NOTE(review): no break between cases; this presumes vrb.error() does not return
+        switch (sr->errnum) {
+        case not_bgzf: vrb.error("File not compressed with bgzip!");
+        case idx_load_failed: vrb.error("Impossible to load index file!");
+        case file_type_error: vrb.error("File format not detected by htslib!");
+        default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+
+    //Read genotype data
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    unsigned int linecount = 0;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            //Destination buffer is passed by address so htslib can (re)allocate it
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                if (filter_genotype.check(sid)) {
+                    genotype_id.push_back(sid);
+                    genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
+                    string genotype_ref = string(line->d.allele[0]);
+                    genotype_start.push_back(line->pos + 1);
+                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
+                    else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
+                    pair < string, int > temp (sid,n_includedG);
+                    genotype_id_to_idx.insert(temp);
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize: release every buffer htslib allocated on our behalf
+    free(gt_arr);
+    free(ds_arr);
+    free(sl_arr);
+    bcf_sr_destroy(sr);
+    genotype_count = n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
+}
+
+/*
+ * Scan the tabix-indexed BED file `fbed` and record variant metadata only
+ * (ID, chromosome, start, end and the ID -> index map).
+ * Empty lines and lines starting with the index meta character are ignored.
+ */
+void rtc_data::scanGenotypesBED(string fbed) {
+    int n_includedG = 0;
+    int n_excludedG_user = 0;
+
+    //Opening files
+    htsFile * fp = hts_open(fbed.c_str(),"r");
+    if (fp == NULL) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (tbx == NULL) vrb.error("Cannot load index file!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Read genotype data
+    vector < string > fields;
+    unsigned int n_lines = 0;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        n_lines ++;
+        if (n_lines % 100000 == 0) vrb.bullet("Read " + stb.str(n_lines) + " lines");
+        stb.split(string(str.s), fields);
+
+        //Skip anything that is not a data record
+        if (!str.l || str.s[0] == tbx->conf.meta_char) continue;
+
+        if (fields.size() < 7) vrb.error("Incorrect number of columns!");
+        if (!filter_genotype.check(fields[3])) {
+            n_excludedG_user ++;
+            continue;
+        }
+        genotype_id.push_back(fields[3]);
+        genotype_chr.push_back(fields[0]);
+        genotype_start.push_back(atoi(fields[1].c_str()) + 1);
+        genotype_end.push_back(atoi(fields[2].c_str()));
+        pair < string, int > entry (fields[3], n_includedG);
+        genotype_id_to_idx.insert(entry);
+        n_includedG ++;
+    }
+
+    //Finalize & verbose
+    tbx_destroy(tbx);
+    genotype_count = n_includedG;
+    if (hts_close(fp)) vrb.error("Cannot properly close file!");
+    vrb.bullet(stb.str(n_includedG) + " variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_includedG == 0) vrb.leave("Cannot find variants in target region!");
+}
+
+
diff --git a/src/mode_rtc/rtc_read_get_hotspots.cpp b/src/mode_rtc/rtc_read_get_hotspots.cpp
new file mode 100644
index 0000000..273c6e8
--- /dev/null
+++ b/src/mode_rtc/rtc_read_get_hotspots.cpp
@@ -0,0 +1,113 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+/*
+ * Read recombination hotspots from `fcov` (columns: chr, start, end) and build the
+ * alternating list of intervals stored in all_coldspots.
+ *
+ * For every input line two intervals are appended: a "CS" interval running from the end of
+ * the previous hotspot on the same chromosome (or 0) up to this hotspot's start, and an
+ * "HS" interval [start + 1, end]. When the chromosome changes, and once more after the last
+ * line, a closing "CS" interval up to INT_MAX is appended for the previous chromosome.
+ * Each interval is registered in every bin of width bin_size it overlaps; pointers into
+ * all_coldspots are only taken once the vector has stopped growing.
+ *
+ * The input must be sorted by start within a chromosome; violations are reported through
+ * vrb.error(), as are malformed lines and an input without any hotspot.
+ */
+void rtc_data::readHotspots(string fcov) {
+    string buffer;
+    vector < string > str;
+    int idx = 0;
+    vrb.title("Reading hotspots in [" + fcov + "]");
+    input_file fd (fcov);
+    if(fd.fail()) vrb.error("Cannot open file");
+    //Bins hold indexes first, because push_back may relocate all_coldspots while reading
+    map < string , map < int , vector < unsigned int > > > _bins;
+    map < string , map < int , vector < unsigned int > > >::iterator oit;
+    map < int , vector < unsigned int > >::iterator iit;
+    coldspot prev;
+    bool any_hotspot = false;
+
+    //Read hotspots
+    while(getline(fd, buffer)) {
+        stb.split(buffer, str);
+        if (str.size() < 3) vrb.error("Wrong hotspot file format");
+
+        //Chromosome changed: close the previous chromosome with a coldspot up to INT_MAX
+        if (prev.chr != "" && prev.chr != str[0]){
+            all_coldspots.push_back(coldspot(prev.chr,prev.end + 1,INT_MAX,idx,"CS"));
+            int sb = (prev.end + 1) / bin_size;
+            int eb = INT_MAX / bin_size;
+            for (int b = sb; b <= eb; b++) _bins[prev.chr][b].push_back(all_coldspots.size()-1);
+            idx++;
+            coldspot_count++;
+        }
+
+        //Coldspot between the previous hotspot (or position 0) and this hotspot
+        int s = prev.chr == str[0] ? prev.end + 1 : 0 ;
+        int e = atoi(str[1].c_str());
+        int sb = s / bin_size;
+        int eb = e / bin_size;
+        all_coldspots.push_back(coldspot(str[0],s,e,idx,"CS"));
+        for (int b = sb; b <= eb; b++) _bins[str[0]][b].push_back(all_coldspots.size()-1);
+        idx++;
+        coldspot_count++;
+
+        //The hotspot itself
+        s = atoi(str[1].c_str())+1;
+        e = atoi(str[2].c_str());
+        if (e < s) vrb.error("Hotspot end cannot be less than start " + buffer);
+        //Compare against the PREVIOUS hotspot, i.e. before prev is overwritten below
+        if (prev.chr == str[0] && prev.start > s) vrb.error("Hotspots are not sorted at " + buffer);
+        prev = coldspot(str[0],s,e,-1,"NA");
+        any_hotspot = true;
+        sb = s / bin_size;
+        eb = e / bin_size;
+        all_coldspots.push_back(coldspot(str[0],s,e,idx,"HS"));
+        for (int b = sb; b <= eb; b++) _bins[str[0]][b].push_back(all_coldspots.size()-1);
+        idx++;
+        coldspot_count++;
+    }
+
+    //Close the last chromosome; skipped for an empty input so the check below can fire
+    if (any_hotspot) {
+        all_coldspots.push_back(coldspot(prev.chr,prev.end + 1,INT_MAX,idx,"CS"));
+        int sb = (prev.end + 1) / bin_size;
+        int eb = INT_MAX / bin_size;
+        for (int b = sb; b <= eb; b++) _bins[prev.chr][b].push_back(all_coldspots.size()-1);
+        idx++;
+        coldspot_count++;
+    }
+
+    //Finalise
+    if (!coldspot_count) vrb.error("No coldspots found");
+    for (oit = _bins.begin(); oit != _bins.end(); oit++){
+        for (iit = oit->second.begin() ; iit != oit->second.end(); iit++){
+            for (size_t i = 0 ; i < (iit->second).size(); i++) coldspot_bins_p[oit->first][iit->first].push_back( & all_coldspots[iit->second[i]]);
+        }
+    }
+    vrb.bullet(stb.str(coldspot_count) + " coldspots included");
+    fd.close();
+}
+
+
+/*
+ * Look up the interval registered for position `pos` on chromosome `chr`.
+ *
+ * Returns the idx of the first interval in the position's bin with start <= pos <= end, or
+ *   -1 : pos lies beyond the end of the last interval stored in the chromosome's last bin
+ *   -2 : the bin exists but none of its intervals contains pos (in a hotspot)
+ *   -3 : no bin exists for pos (in a hotspot, but should not happen since the bin is 1MB)
+ *   -4 : no hotspot found for this chromosome
+ */
+int rtc_data::getColdspot(string chr, int pos){
+    auto chr_it = coldspot_bins_p.find(chr);
+    if (chr_it == coldspot_bins_p.end()) return -4;
+
+    auto & chr_bins = chr_it->second;
+    const int last_end = (chr_bins.rbegin()->second).back()->end;
+    if (pos > last_end) return -1;
+
+    const int bin = pos / bin_size;
+    auto bin_it = chr_bins.find(bin);
+    if (bin_it == chr_bins.end()) return -3;
+
+    auto & candidates = bin_it->second;
+    for (size_t c = 0 ; c < candidates.size() ; c ++) {
+        const bool covers = candidates[c]->start <= pos && candidates[c]->end >= pos;
+        if (covers) return candidates[c]->idx;
+    }
+    return -2;
+}
diff --git a/src/mode_rtc/rtc_read_phenotypes.cpp b/src/mode_rtc/rtc_read_phenotypes.cpp
new file mode 100644
index 0000000..8a3ad1f
--- /dev/null
+++ b/src/mode_rtc/rtc_read_phenotypes.cpp
@@ -0,0 +1,181 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+/*
+ * Load phenotypes from the tabix-indexed BED file `fbed` into the phenotype_* containers.
+ *
+ * Layout used by this reader: columns 0-2 are chr/start/end, column 3 the phenotype ID,
+ * column 4 the group (stored only when grp_mode > 0) and sample values start at column 6
+ * ("NA" is stored as bcf_float_missing). The header line must start with the index meta
+ * character; its sample columns are resolved with findSample() and columns with a
+ * negative result are skipped.
+ * When regionPhenotype.chr is not "NA" only records of regionPhenotype.get() are read;
+ * otherwise the whole file is read and meta/empty lines are ignored.
+ */
+void rtc_data::readPhenotypes(string fbed) {
+    int n_includedP = 0;
+    int n_excludedP = 0;
+    vector < int > mappingS;
+
+    //Open BED file
+    vrb.title("Reading phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t *tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (size_t t = 6 ; t < tokens.size() ; t ++) mappingS.push_back(findSample(tokens[t]));
+    unsigned int linecount =0;
+
+    //Read phenotypes
+    if (regionPhenotype.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
+        vrb.bullet("target region [" + regionPhenotype.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        //Read data
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+            //A row wider than the header would index past the end of mappingS
+            if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
+            if (filter_phenotype.check(tokens[3])) {
+                phenotype_id.push_back(tokens[3]);
+                phenotype_chr.push_back(tokens[0]);
+                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                phenotype_end.push_back(atoi(tokens[2].c_str()));
+                phenotype_val.push_back(vector < float > (sample_count, 0.0));
+                if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+                for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                    if (mappingS[t-6] >= 0) {
+                        if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                        else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                    }
+                }
+                pair < string, int > temp (tokens[3],n_includedP);
+                phenotype_id_to_idx.insert(temp);
+                n_includedP++;
+            } else n_excludedP ++;
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            if (str.l && str.s[0] != tbx->conf.meta_char) {
+                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+                //A row wider than the header would index past the end of mappingS
+                if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
+                if (filter_phenotype.check(tokens[3])) {
+                    phenotype_id.push_back(tokens[3]);
+                    phenotype_chr.push_back(tokens[0]);
+                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                    phenotype_end.push_back(atoi(tokens[2].c_str()));
+                    phenotype_val.push_back(vector < float > (sample_count, 0.0));
+                    if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+                    for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                        if (mappingS[t-6] >= 0) {
+                            if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                            else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                        }
+                    }
+                    pair < string, int > temp (tokens[3],n_includedP);
+                    phenotype_id_to_idx.insert(temp);
+                    n_includedP++;
+                } else n_excludedP ++;
+            }
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s); //line buffer grown by hts_getline()/tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    phenotype_count = phenotype_id.size();
+    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
+}
+
+/*
+ * Scan the tabix-indexed BED file `fbed` and record phenotype metadata (ID, chromosome,
+ * start, end, optional group when grp_mode > 0, and the ID -> index map).
+ * When regionPhenotype.chr is not "NA" only records of regionPhenotype.get() are visited;
+ * otherwise the whole file is read and meta/empty lines are ignored.
+ */
+void rtc_data::scanPhenotypes(string fbed) {
+    int n_includedP = 0;
+    int n_excludedP = 0;
+
+    //Open BED file
+    vrb.title("Scanning phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+
+    //Read header (same `<= 0` test as the other BED readers in this class)
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");
+
+    //Scan file
+    vector < string > tokens;
+    unsigned int linecount =0;
+    //Read data
+    if (regionPhenotype.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
+        vrb.bullet("target region [" + regionPhenotype.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+            if (filter_phenotype.check(tokens[3])) {
+                phenotype_id.push_back(tokens[3]);
+                phenotype_chr.push_back(tokens[0]);
+                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                phenotype_end.push_back(atoi(tokens[2].c_str()));
+                //NOTE(review): only this branch allocates a phenotype_val row; the whole-file branch below does not - confirm which is intended
+                phenotype_val.push_back(vector < float > (sample_count, 0.0));
+                if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+                pair < string, int > temp (tokens[3],n_includedP);
+                phenotype_id_to_idx.insert(temp);
+                n_includedP++;
+            } else n_excludedP ++;
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            if (str.l && str.s[0] != tbx->conf.meta_char) {
+                stb.split(string(str.s), tokens);
+                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+                if (filter_phenotype.check(tokens[3])) {
+                    phenotype_id.push_back(tokens[3]);
+                    phenotype_chr.push_back(tokens[0]);
+                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                    phenotype_end.push_back(atoi(tokens[2].c_str()));
+                    if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+                    pair < string, int > temp (tokens[3],n_includedP);
+                    phenotype_id_to_idx.insert(temp);
+                    n_includedP++;
+                } else n_excludedP ++;
+            }
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s); //line buffer grown by hts_getline()/tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    phenotype_count = phenotype_id.size();
+    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
+}
diff --git a/src/mode_rtc/rtc_sampling.cpp b/src/mode_rtc/rtc_sampling.cpp
new file mode 100644
index 0000000..681cc17
--- /dev/null
+++ b/src/mode_rtc/rtc_sampling.cpp
@@ -0,0 +1,312 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "rtc_data.h"
+
+//Simulate a phenotype for genotype geno_idx: slope * genotype plus a constant offset plus one rnorm(0.0, 1.0) draw per sample.
+void rtc_data::generatePhenotype (double slope ,int geno_idx, vector <float> & new_pheno){
+    const double offset = 100.0;
+    new_pheno.assign(sample_count, 0.0);
+    for (int s = 0 ; s < sample_count ; ++s) {
+        //Deterministic part first, then the random draw, as a single left-to-right sum
+        const double noise_free = slope * genotype_val[geno_idx][s] + offset;
+        new_pheno[s] = noise_free + rnorm(0.0, 1.0);
+    }
+}
+
+//Build a pseudo-phenotype from predictor X: the fitted line of linreg plus a random permutation of its residuals.
+void rtc_data::generatePhenotype(vector<float> &X, linReg &linreg, vector <float> &np){
+    const int n_samples = X.size();
+    np.assign(n_samples, 0.0);
+
+    //Work on a private copy so the residuals stored in linreg keep their order
+    vector < float > shuffled = linreg.residuals;
+    random_shuffle(shuffled.begin(), shuffled.end());
+
+    for (int s = 0 ; s < n_samples ; ++s) {
+        np[s] = X[s] * linreg.beta + linreg.yIntercept + shuffled[s];
+    }
+}
+
+rtc_sample_results rtc_data::sampleRTC(vector <int> & genotype_idx, vector <float> & phenotype, int eqtl_idx, double RTC, int pI){
+ rtc_sample_results result;
+ if (genotype_idx.size() < 4) return result;
+ //double slope = 0.0;
+ //regression(genotype_val[eqtl_idx], phenotype, slope);
+ linReg linreg(genotype_val[eqtl_idx], phenotype);
+ double count = 0.0;
+ double count2 = 0.0;
+ double gt_h0 = 0.0, gtoe_h0 = 0.0 , gt_h1 = 0.0, gtoe_h1 = 0.0;
+ unordered_map < int , vector < vector < float > > > pseudo_phenos;
+ unordered_map < int , map < int , map < int, double > > > h0s;
+ unordered_map < int , map < int , map < int, double > > > h1s;
+ set < string > h0ss,h1ss;
+ int better_hit = 0, pseudo_hit = 0;
+ float medianR2;
+ vector < float > h0,h1;
+ long int trials = 0;
+ //calculate median r2
+ //if (pI >= 0){
+ vector < float > r2s;
+ for (int i =0 ; i < genotype_idx.size(); i++){
+ for (int j =i+1; j < genotype_idx.size(); j++){
+ r2s.push_back(getRsquare(genotype_idx[i], genotype_idx[j],genotype_idx[0],genotype_idx.size()));
+ }
+ }
+ medianR2 = median(r2s);
+ vector < float > ().swap(r2s);
+ //}
+ ////////////////////
+ while ( count < sample_iterations && trials < max_sample_iterations){
+ trials++;
+ //select random causal eQTL
+ int r_eqtl_causal = genotype_idx[rng.getInt(genotype_idx.size())];
+
+ //Find variants linked to this random eQTL
+ vector <int> possible_selections1;
+ for (int s = 0 ; s < genotype_idx.size(); s++){
+ if (genotype_idx[s] == r_eqtl_causal) continue;
+ if (getRsquare(r_eqtl_causal, genotype_idx[s],genotype_idx[0],genotype_idx.size()) >= R2_cutoff) possible_selections1.push_back(genotype_idx[s]);
+ }
+
+ //If there are no linked ones continue
+ if(!possible_selections1.size()) {
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H0 " << genotype_id[r_eqtl_causal] << " " << count << " " << sample_iterations << " 0 NA 0 NA NA " << gtoe_h0 << " " << gt_h0 << " " << count << " " << RTC << " " << medianR2<< " NA NA NA NA" <<endl;
+ continue;
+ }
+ //Select a random linked one
+ int r_eqtl = possible_selections1[rng.getInt(possible_selections1.size())];
+
+ //select second random causal
+ int r_other_causal = r_eqtl_causal;
+ while(r_other_causal == r_eqtl_causal || r_other_causal == r_eqtl) r_other_causal = genotype_idx[rng.getInt(genotype_idx.size())];
+
+ //Find variants linked to second random causal
+ vector <int> possible_selections2;
+ for (int s = 0 ; s < genotype_idx.size(); s++){
+ if (genotype_idx[s] == r_other_causal) continue;
+ if (genotype_idx[s] != r_eqtl_causal && genotype_idx[s] != r_eqtl && getRsquare(r_other_causal, genotype_idx[s],genotype_idx[0],genotype_idx.size()) >= R2_cutoff) possible_selections2.push_back(genotype_idx[s]);
+ }
+
+ //If there are no linked ones continue
+ if(!possible_selections2.size()) {
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H0 " << genotype_id[r_eqtl_causal] << " " << count << " " << sample_iterations << " " << possible_selections1.size() << " " << genotype_id[r_eqtl] << " 0 NA NA " << gtoe_h0 << " " << gt_h0 << " " << count << " " << RTC << " " << medianR2 << " NA NA NA NA"<< endl;
+ continue;
+ }
+ //Select a random linked one
+ int r_other = possible_selections2[rng.getInt(possible_selections2.size())];
+ count++;
+ h0ss.insert(stb.str(r_eqtl_causal) + stb.str(r_eqtl) + stb.str(r_other));
+ if(h0s.count(r_eqtl_causal) && h0s[r_eqtl_causal].count(r_eqtl) && h0s[r_eqtl_causal][r_eqtl].count(r_other)){
+ better_hit++;
+ h0.push_back(h0s[r_eqtl_causal][r_eqtl][r_other]);
+ if (h0s[r_eqtl_causal][r_eqtl][r_other] >= RTC) {
+ gtoe_h0++;
+ if (h0s[r_eqtl_causal][r_eqtl][r_other] > RTC) gt_h0++;
+ }
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H0 " << genotype_id[r_eqtl_causal] << " " << count << " " << sample_iterations << " " << possible_selections1.size() << " " << genotype_id[r_eqtl] << " " << possible_selections2.size() << " " << genotype_id[r_other] << " " << h0s[r_eqtl_causal][r_eqtl][r_other] << " " << gtoe_h0 << " " << gt_h0 << " " << count << " " << RTC << " " << medianR2 << " " << getRsquare(r_eqtl_causal, r_other_causal,genotype_id [...]
+ }else{
+ vector < double > corrs(genotype_idx.size());
+ double test_snp_corr = 0.0 ;
+ vector < float > genotype_eqtl;
+ if (genotypeSink.count(r_eqtl)){
+ genotype_eqtl = genotypeSink[r_eqtl];
+ }else{
+ genotype_eqtl = genotype_val[r_eqtl];
+ normalize(genotype_eqtl);
+ if (DprimeR2inMem >= 2) genotypeSink[r_eqtl] = genotype_eqtl;
+ }
+ if (pseudo_phenos.count(r_eqtl_causal)){
+ pseudo_hit++;
+ for (int s = 0 ; s < genotype_idx.size() ; s++){
+ corrs[s] = abs(getCorrelation(genotype_eqtl, pseudo_phenos[r_eqtl_causal][s]));
+ if( genotype_idx[s] == r_other) test_snp_corr = corrs[s];
+ }
+ }else{
+ vector < float > pseudo_pheno;
+ //generatePhenotype(slope, r_eqtl_causal, pseudo_pheno);
+ generatePhenotype(genotype_val[r_eqtl_causal], linreg, pseudo_pheno);
+ if (options.count("normal")) normalTransform(pseudo_pheno);
+ normalize(pseudo_pheno);
+ for (int s = 0 ; s < genotype_idx.size() ; s++){
+ vector < float > test;
+ if(genotypeSink.count(genotype_idx[s])){
+ test = genotypeSink[genotype_idx[s]];
+ }else{
+ test = genotype_val[genotype_idx[s]];
+ normalize(test);
+ if (DprimeR2inMem >= 2) genotypeSink[genotype_idx[s]] = test;
+ }
+ vector <float> new_pheno = correct(test,pseudo_pheno);
+ if (options.count("normal")) normalTransform(new_pheno);
+ normalize(new_pheno);
+ if (DprimeR2inMem) pseudo_phenos[r_eqtl_causal].push_back(new_pheno);
+ corrs[s] = abs(getCorrelation(genotype_eqtl, new_pheno));
+ if( genotype_idx[s] == r_other) test_snp_corr = corrs[s];
+ }
+ }
+
+ sort(corrs.begin(),corrs.end());
+ int rank = -1;
+ for (int i = 0 ; i<corrs.size() && corrs[i] <= test_snp_corr; i++) if(corrs[i] == test_snp_corr) rank = i;
+ double rtc = ((double) corrs.size() - (double) rank) / (double) corrs.size();
+ h0.push_back(rtc);
+ if (rtc >= RTC) {
+ gtoe_h0++;
+ if (rtc > RTC) gt_h0++;
+ }
+ if (DprimeR2inMem) h0s[r_eqtl_causal][r_eqtl][r_other]= rtc;
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H0 " << genotype_id[r_eqtl_causal] << " " << count << " " << sample_iterations << " " << possible_selections1.size() << " " << genotype_id[r_eqtl] << " " << possible_selections2.size() << " " << genotype_id[r_other] << " " << rtc << " " << gtoe_h0 << " " << gt_h0 << " " << count << " " << RTC << " " << medianR2 << " " << getRsquare(r_eqtl_causal, r_other_causal,genotype_idx[0],genotype_idx.size()) << " " [...]
+ }
+
+ }
+ trials = 0;
+ while ( count2 < sample_iterations && trials < max_sample_iterations ){
+ trials++;
+ //select random causal eQTL
+ int r_eqtl_causal = genotype_idx[rng.getInt(genotype_idx.size())];
+
+ //Find variants linked to this true eQTL
+ vector <int> possible_selections1;
+ for (int s = 0 ; s < genotype_idx.size(); s++){
+ if (genotype_idx[s] == r_eqtl_causal) continue;
+ if (getRsquare(r_eqtl_causal, genotype_idx[s],genotype_idx[0],genotype_idx.size()) >= R2_cutoff) possible_selections1.push_back(genotype_idx[s]);
+ }
+
+ //If there are less than 2 linked ones continue
+ if(possible_selections1.size() < 2) {
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H1 " << genotype_id[r_eqtl_causal] << " " << count2 << " " << sample_iterations << " " << possible_selections1.size() << " NA NA NA NA " << gtoe_h1 << " " << gt_h1 << " " << count2 << " " << RTC << " " << medianR2<< " NA NA NA NA"<<endl;
+ continue;
+ }
+ unsigned int rngi = rng.getInt(possible_selections1.size());
+ int r_eqtl = possible_selections1[rngi];
+ possible_selections1.erase(possible_selections1.begin() + rngi);
+ int r_other = possible_selections1[rng.getInt(possible_selections1.size())];
+ count2++;
+ h1ss.insert(stb.str(r_eqtl_causal) + stb.str(r_eqtl) + stb.str(r_other));
+ if(h1s.count(r_eqtl_causal) && h1s[r_eqtl_causal].count(r_eqtl) && h1s[r_eqtl_causal][r_eqtl].count(r_other)){
+ better_hit++;
+ h1.push_back(h1s[r_eqtl_causal][r_eqtl][r_other]);
+ if (h1s[r_eqtl_causal][r_eqtl][r_other] >= RTC) {
+ gtoe_h1++;
+ if(h1s[r_eqtl_causal][r_eqtl][r_other] > RTC) gt_h1++;
+ }
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H1 " << genotype_id[r_eqtl_causal] << " " << count2 << " " << sample_iterations << " " << possible_selections1.size() << " " << genotype_id[r_eqtl] << " " << "NA" << " " << genotype_id[r_other] << " " << h1s[r_eqtl_causal][r_eqtl][r_other] << " " << gtoe_h1 << " " << gt_h1 << " " << count2 << " " << RTC << " " << medianR2 << " NA " << getRsquare(r_eqtl_causal, r_eqtl,genotype_idx[0],genotype_idx.size [...]
+ }else{
+
+ vector < double > corrs(genotype_idx.size());
+ double test_snp_corr = 0.0 ;
+ vector < float > genotype_eqtl;
+ if (genotypeSink.count(r_eqtl)){
+ genotype_eqtl = genotypeSink[r_eqtl];
+ }else{
+ genotype_eqtl = genotype_val[r_eqtl];
+ normalize(genotype_eqtl);
+ if (DprimeR2inMem >= 2) genotypeSink[r_eqtl] = genotype_eqtl;
+ }
+ if (pseudo_phenos.count(r_eqtl_causal)){
+ pseudo_hit++;
+ for (int s = 0 ; s < genotype_idx.size() ; s++){
+ corrs[s] = abs(getCorrelation(genotype_eqtl, pseudo_phenos[r_eqtl_causal][s]));
+ if( genotype_idx[s] == r_other) test_snp_corr = corrs[s];
+ }
+ }else{
+ vector < float > pseudo_pheno;
+ //generatePhenotype(slope, r_eqtl_causal, pseudo_pheno);
+ generatePhenotype(genotype_val[r_eqtl_causal], linreg, pseudo_pheno);
+ if (options.count("normal")) normalTransform(pseudo_pheno);
+ normalize(pseudo_pheno);
+ for (int s = 0 ; s < genotype_idx.size() ; s++){
+ vector < float > test;
+ if(genotypeSink.count(genotype_idx[s])){
+ test = genotypeSink[genotype_idx[s]];
+ }else{
+ test = genotype_val[genotype_idx[s]];
+ normalize(test);
+ if (DprimeR2inMem >= 2) genotypeSink[genotype_idx[s]] = test;
+ }
+ vector <float> new_pheno = correct(test,pseudo_pheno);
+ if (options.count("normal")) normalTransform(new_pheno);
+ normalize(new_pheno);
+ if (DprimeR2inMem) pseudo_phenos[r_eqtl_causal].push_back(new_pheno);
+ corrs[s] = abs(getCorrelation(genotype_eqtl, new_pheno));
+ if( genotype_idx[s] == r_other) test_snp_corr = corrs[s];
+ }
+ }
+
+ sort(corrs.begin(),corrs.end());
+ int rank = -1;
+ for (int i = 0 ; i<corrs.size() && corrs[i] <= test_snp_corr; i++) if(corrs[i] == test_snp_corr) rank = i;
+ double rtc = ((double) corrs.size() - (double) rank) / (double) corrs.size();
+ h1.push_back(rtc);
+ if (rtc >= RTC) {
+ gtoe_h1++;
+ if(rtc > RTC) gt_h1++;
+ }
+ if (DprimeR2inMem) h1s[r_eqtl_causal][r_eqtl][r_other]= rtc;
+ if (pI >= 0) cerr << phenotype_id[pI] << "-" << genotype_id[eqtl_idx] << " H1 " << genotype_id[r_eqtl_causal] << " " << count2 << " " << sample_iterations << " " << possible_selections1.size() << " " << genotype_id[r_eqtl] << " " << "NA" << " " << genotype_id[r_other] << " " << rtc << " " << gtoe_h1 << " " << gt_h1 << " " << count2 << " " << RTC << " " << medianR2 << " NA " << getRsquare(r_eqtl_causal, r_eqtl) << " " << getRsquare(r_eqtl_causal, r_other) << " " << getRsquar [...]
+ }
+
+ }
+ //if (DprimeR2inMem) vrb.bullet(stb.str(better_hit) + " " + stb.str(pseudo_hit));
+ result.gtoe_h0 = gtoe_h0;
+ result.gt_h0 = gt_h0;
+ result.gtoe_h1 = gtoe_h1;
+ result.gt_h1 = gt_h1;
+ result.count_h0 = count;
+ result.count_h1 = count2;
+ result.unique_h0 = h0ss.size();
+ result.unique_h1 = h1ss.size();
+ result.medianR2 = medianR2;
+ result.median_h0 = median(h0);
+ result.median_h1 = median(h1);
+ result.pval = (gtoe_h0 + 1.0) / (count + 1.0);
+ const char * sep = ",";
+ stringstream h0sss,h1sss;
+ if(h0.size()){
+ copy(h0.begin(),h0.end(),ostream_iterator<float>(h0sss,sep));
+ result.h0 = h0sss.str();
+ //result.h0.pop_back();
+ result.h0.resize (result.h0.size () - 1);
+ } else result.h0 = "NA";
+ if(h1.size()){
+ copy(h1.begin(),h1.end(),ostream_iterator<float>(h1sss,sep));
+ result.h1 = h1sss.str();
+ result.h1.resize (result.h1.size () - 1);
+ } else result.h1 = "NA";
+ if (count && count2){
+ probability(h0,h1,RTC, result);
+ }
+ unordered_map < int , vector < vector < float > > >().swap(pseudo_phenos);
+ unordered_map < int , map < int , map < int, double > > >().swap(h0s);
+ unordered_map < int , map < int , map < int, double > > >().swap(h1s);
+ vector < float > ().swap(h1);
+ vector < float > ().swap(h0);
+ return result;
+}
+
+/*
+ * Describe the neighbourhood of the observed RTC within the pooled h0 and h1 values.
+ *
+ * The two vectors are pooled and sorted, the pooled value closest to RTC is located, and a
+ * bin reaching a tenth of the pooled size to either side of it (clamped to the ends) is taken.
+ * res receives the bin limits (rtc_bin_start, rtc_bin_end) and the fraction of h0 and of h1
+ * values falling inside the bin (rtc_bin_h0_proportion, rtc_bin_h1_proportion).
+ *
+ * res is left untouched when both vectors are empty. A proportion is 0/0 when only the
+ * corresponding vector is empty, exactly as before.
+ */
+void rtc_data::probability(vector < float > &h0, vector < float > &h1, double RTC, rtc_sample_results & res){
+    vector < float > all = h0;
+    all.insert(all.end(),h1.begin(),h1.end());
+    //Without any value there is no bin to describe and all[0] would be read out of bounds
+    if (all.empty()) return;
+    sort(all.begin(),all.end());
+
+    const long int n_all = all.size();
+    const long int step = n_all / 10;
+
+    //Locate the pooled value closest to RTC; the first one wins on ties
+    double diff = 2.0;
+    long int index = 0;
+    for (long int i = 0 ; i < n_all ; i++){
+        //Absolute value spelled out: an unqualified abs() may resolve to the integer overload and truncate to 0
+        double d = RTC - all[i];
+        if (d < 0) d = -d;
+        if (d < diff){
+            diff = d;
+            index = i;
+        }
+    }
+
+    //Bin limits, clamped to the first and last pooled value
+    const long int s = index - step >= 0 ? index - step : 0;
+    const long int e = index + step >= n_all ? n_all - 1 : index + step;
+    res.rtc_bin_start = all[s];
+    res.rtc_bin_end = all[e];
+
+    //Share of each distribution inside the bin (limits included)
+    double c_h0 = 0.0 , c_h1 = 0.0;
+    for (size_t i = 0 ; i < h0.size() ; i++) if (h0[i] >= all[s] && h0[i] <= all[e]) c_h0++;
+    for (size_t i = 0 ; i < h1.size() ; i++) if (h1[i] >= all[s] && h1[i] <= all[e]) c_h1++;
+    res.rtc_bin_h0_proportion = c_h0 / h0.size();
+    res.rtc_bin_h1_proportion = c_h1 / h1.size();
+}
diff --git a/src/mode_trans/trans_adjust.cpp b/src/mode_trans/trans_adjust.cpp
new file mode 100644
index 0000000..b265c79
--- /dev/null
+++ b/src/mode_trans/trans_adjust.cpp
@@ -0,0 +1,52 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+/*
+ * Fit the beta distribution used by getAdjustedPvalue() from a file of null p-values.
+ *
+ * fnull: path of a text file; the last field of every non-empty line is read as a p-value.
+ *
+ * The shape parameters are first estimated by moment matching from the mean and variance of
+ * the p-values, then refined with learnBetaParameters(). If that call throws, the moment
+ * matching estimates are kept. The result is stored in beta_ml1 and beta_ml2.
+ */
+void trans_data::buildNullDistribution(string fnull) {
+    string buffer;
+    vector < string > tokens;
+    vector < double > null_pvalues;
+
+    //Read the null p-values (last field of each line)
+    vrb.title("Reading null p-values in [" + fnull + "]");
+    input_file fd(fnull);
+    if (fd.fail()) vrb.error("Cannot open file!");
+    while (getline(fd, buffer)) {
+        stb.split(buffer, tokens);
+        //Blank line: nothing to parse, and back() on an empty vector is undefined
+        if (tokens.empty() || tokens.back().empty()) continue;
+        try {
+            //Parse as double: stof() throws out_of_range for p-values below the float range
+            null_pvalues.push_back(stod(tokens.back()));
+        } catch (const std::exception & e) {
+            vrb.error("Cannot parse null p-value [" + tokens.back() + "]");
+        }
+    }
+    fd.close();
+    vrb.bullet("#null p-values = " + stb.str(null_pvalues.size()));
+    //Mean and variance are undefined without data
+    if (null_pvalues.empty()) vrb.error("Cannot find any null p-value in [" + fnull + "]");
+
+    basic_stats null_stats(null_pvalues);
+    double mean_null_pvalues = null_stats.mean();
+    double variance_null_pvalues = null_stats.variance();
+    vrb.bullet("Mean=" + stb.str(mean_null_pvalues) + " Var=" + stb.str(variance_null_pvalues));
+
+    //Moment matching estimates, also used as starting point for the likelihood fit
+    double beta_mm1 = mean_null_pvalues * (mean_null_pvalues * (1 - mean_null_pvalues ) / variance_null_pvalues - 1);
+    double beta_mm2 = beta_mm1 * (1 / mean_null_pvalues - 1);
+    beta_ml1 = beta_mm1;
+    beta_ml2 = beta_mm2;
+    vrb.bullet("Beta parameters (MM) : [s1=" + stb.str(beta_ml1) + ", s2=" + stb.str(beta_ml2) +"]");
+
+    try {
+        learnBetaParameters(null_pvalues, beta_ml1, beta_ml2);
+    } catch (const std::exception & e) {
+        vrb.bullet("Maximum Likelihood estimation failed, use Moment Matching instead!");
+        beta_ml1 = beta_mm1;
+        beta_ml2 = beta_mm2;
+    }
+    vrb.bullet("Beta parameters (ML): [s1=" + stb.str(beta_ml1) + ", s2=" + stb.str(beta_ml2) +"]");
+}
diff --git a/src/mode_trans/trans_analysis_pass.cpp b/src/mode_trans/trans_analysis_pass.cpp
new file mode 100644
index 0000000..1abd621
--- /dev/null
+++ b/src/mode_trans/trans_analysis_pass.cpp
@@ -0,0 +1,117 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+/*
+ * Single sweep over the variants of fvcf, testing every biallelic variant against every
+ * phenotype that is on another chromosome or whose start lies more than cis_window away.
+ *
+ * fvcf: genotype file opened with bcf_sweep_init().
+ * fout: prefix of the three output files:
+ *   <fout>.hits.txt.gz  one line per test with |correlation| >= correlation_threshold
+ *   <fout>.bins.txt.gz  number of remaining tests per |correlation| bin (n_bins bins)
+ *   <fout>.best.txt.gz  adjusted and nominal p-value of the strongest |correlation| per phenotype
+ *
+ * The first variant that can be read decides whether GT or DS fields are used for the whole run.
+ */
+void trans_data::runTransPass(string fvcf, string fout) {
+    string fhits = fout + ".hits.txt.gz";
+    string fbins = fout + ".bins.txt.gz";
+    string fbest = fout + ".best.txt.gz";
+
+    vrb.title("Sweep through genotype data in [" + fvcf + "]");
+    bcf_sweep_t * sw = bcf_sweep_init(fvcf.c_str());
+    if (!sw) vrb.error("Cannot open file for reading [" + fvcf + "]");
+    bcf_hdr_t * hdr = bcf_sweep_hdr(sw);
+    if (!hdr) vrb.error("Cannot read header!");
+
+    unsigned long n_pos_tests = 0, n_variants = 0, n_curr_tests = 0, n_prev_tests = 0;
+    vector < double > best_hits = vector < double > (phenotype_count, 0);
+    vector < unsigned long > bins_hits = vector < unsigned long > (n_bins, 0);
+
+    int mDS = 0, mGT = 0;
+    float * vDS = NULL;
+    int * vGT = NULL;
+    bcf1_t * rec;
+    double step_bins = n_bins * 1.0 / correlation_threshold;
+
+    timer testing_speed_timer;
+    testing_speed_timer.clock();
+
+    int mode_GT_DS = -1; //-1 means unset / 0 means dosages / 1 means genotypes
+    output_file fdhits (fhits);
+    if (fdhits.fail()) vrb.error("Cannot open file for writing [" + fhits + "]");
+    while ( (rec = bcf_sweep_fwd(sw)) ) {
+        bcf_unpack(rec, BCF_UN_STR);
+        string vid = string(rec->d.id);
+        string chr = string(bcf_hdr_id2name(hdr, rec->rid));
+        int pos = rec->pos + 1;
+        if (rec->n_allele == 2) {
+            bool variant_is_to_be_processed = false;
+            if (mode_GT_DS != 0 && (bcf_get_genotypes(hdr, rec, &vGT, &mGT) == (2 * sample_count))) {
+                if (vDS == NULL) vDS = (float*)malloc(sample_count*sizeof(float));
+                computeDosages(vGT, vDS);
+                variant_is_to_be_processed = true;
+                mode_GT_DS = 1;
+            } else if (mode_GT_DS != 1 && (bcf_get_format_float(hdr, rec, "DS", &vDS, &mDS) == sample_count)) {
+                variant_is_to_be_processed = true;
+                mode_GT_DS = 0;
+            }
+
+            if (variant_is_to_be_processed) {
+                imputeGenotypes(vDS);
+                normalize(vDS);
+                for (int p = 0 ; p < phenotype_count ; p ++) {
+                    if (chr != phenotype_chr[p] || abs(phenotype_start[p] - pos) > cis_window) {
+                        double rcorr = fastCorrelation(phenotype_val[p], vDS);
+                        //fabs() always takes a double; abs() may resolve to the integer overload
+                        double acorr = fabs (rcorr);
+                        if (acorr >= correlation_threshold) {
+                            double npval = getNominalPvalue(acorr, sample_count - 2);
+                            double apval = getAdjustedPvalue(npval);
+                            fdhits << phenotype_id[p] << " " << phenotype_chr[p] << " " << phenotype_start[p] << " " << vid << " " << chr << " " << pos << " " << npval << " " << apval << " " << rcorr << endl;
+                            n_pos_tests ++;
+                        } else {
+                            //The scaled value is NaN when the correlation is undefined and can round up to n_bins:
+                            //skip the former, put the latter in the last bin, never write outside bins_hits
+                            double scaled_corr = floor(acorr * step_bins);
+                            if (scaled_corr >= 0 && n_bins > 0) {
+                                unsigned int idx_bin = (scaled_corr < n_bins) ? (unsigned int) scaled_corr : n_bins - 1;
+                                bins_hits[idx_bin] ++;
+                            }
+                        }
+                        if (acorr > best_hits[p]) best_hits[p] = acorr;
+
+                        n_curr_tests ++;
+
+                        //Progress report every million tests
+                        if (n_curr_tests % 1000000 == 0) {
+                            unsigned int elapsed_msecs = testing_speed_timer.rel_time();
+                            vrb.bullet("#variants=" + stb.str(n_variants) + "\t#hits=" + stb.str(n_pos_tests) + "/" + stb.str(n_curr_tests) + "\tspeed=" + stb.str((n_curr_tests - n_prev_tests) * 1.0 / (elapsed_msecs * 1000), 2) + "MT/s");
+                            testing_speed_timer.clock();
+                            n_prev_tests = n_curr_tests;
+                        }
+                    }
+                }
+            }
+        }
+        n_variants ++;
+    }
+    unsigned int elapsed_msecs = testing_speed_timer.rel_time();
+    vrb.bullet("#variants=" + stb.str(n_variants) + "\t#hits=" + stb.str(n_pos_tests) + "/" + stb.str(n_curr_tests) + "\tspeed=" + stb.str((n_curr_tests - n_prev_tests) * 1.0 / (elapsed_msecs * 1000), 2) + "MT/s");
+    n_prev_tests = n_curr_tests;
+    bcf_sweep_destroy(sw);
+    //The genotype and dosage buffers belong to this function; they were never released before
+    free(vGT);
+    free(vDS);
+    fdhits.close();
+
+    //Histogram of the tests that stayed below the threshold
+    output_file fdbins(fbins);
+    if (fdbins.fail()) vrb.error("Cannot open file for writing [" + fbins + "]");
+    for (int b = 0 ; b < n_bins ; b ++) {
+        double start_corr = (b * 1.0 / n_bins) * correlation_threshold;
+        double end_corr = ((b+1) * 1.0 / n_bins) * correlation_threshold;
+        fdbins << b << " " << start_corr << " " << end_corr << " " << getNominalPvalue(start_corr, sample_count - 2) << " " << getNominalPvalue(end_corr, sample_count - 2) << " " << bins_hits[b] << endl;
+    }
+    fdbins.close();
+
+    //Strongest association found for each phenotype
+    output_file fdbest(fbest);
+    if (fdbest.fail()) vrb.error("Cannot open file for writing [" + fbest + "]");
+    for (int p = 0 ; p < phenotype_count ; p ++) {
+        double npval = getNominalPvalue(best_hits[p], sample_count - 2);
+        double apval = getAdjustedPvalue(npval);
+        fdbest << phenotype_id[p] << " " << apval << " " << npval << endl;
+    }
+    fdbest.close();
+}
diff --git a/src/mode_trans/trans_chunking.cpp b/src/mode_trans/trans_chunking.cpp
new file mode 100644
index 0000000..b7319cb
--- /dev/null
+++ b/src/mode_trans/trans_chunking.cpp
@@ -0,0 +1,31 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+/*
+ * Choose the block of phenotype lines handled by chunk k out of K.
+ *
+ * Sets start_line and end_line so that chunk 1 starts at line 1 and consecutive chunks are
+ * contiguous. Every chunk spans max_length lines, except the last one (k == K), which always
+ * ends at phenotype_count so that no phenotype is left without a chunk.
+ *
+ * Previously, e.g. 16 phenotypes in 5 chunks gave chunks of 3 lines and line 16 belonged to
+ * no chunk at all. The limits of chunks 1..K-1 are unchanged.
+ */
+void trans_data::setPhenotypeLines(int k, int K) {
+    //STEP0: check input values
+    if (K < 1) vrb.error("Number of chunks needs to be > 0");
+    if (K > phenotype_count) vrb.error("Number of chunks (" + stb.str(K) + ") is greater than the number of phenotypes (" + stb.str(phenotype_count) + ")");
+    if (k < 0) vrb.error("Chunk index needs to be > 0");
+    if (k > K) vrb.error("Chunk index needs to be smaller than or equal to the total number of chunks [=" + stb.str(K) + "]");
+    //NOTE(review): k == 0 passes the checks although the message asks for > 0; left as is, confirm whether chunk 0 is meaningful here
+
+    //STEP1: chunk length
+    unsigned long int max_length = 0;
+    if (phenotype_count % K == 0) max_length = phenotype_count / K;
+    //Largest l with l * (K-1) < phenotype_count; K >= 2 here since the remainder is not zero
+    else max_length = (phenotype_count - 1) / (K - 1);
+
+    //STEP2: chunk limits
+    start_line = (k-1) * max_length + 1;
+    end_line = k * max_length;
+    if (k == K || end_line > phenotype_count) end_line = phenotype_count;
+}
\ No newline at end of file
diff --git a/src/mode_trans/trans_data.h b/src/mode_trans/trans_data.h
new file mode 100644
index 0000000..2efe01d
--- /dev/null
+++ b/src/mode_trans/trans_data.h
@@ -0,0 +1,148 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _TRANS_DATA_H
+#define _TRANS_DATA_H
+
+//ANALYSIS MODES
+#define TRANS_MODE1 1
+#define TRANS_MODE2 2
+#define TRANS_MODE3 3
+#define TRANS_MODE4 4
+
+//INCLUDES
+#include "../common/data.h"
+
+class trans_data : public data {
+public:
+ //PARAMETERS
+ unsigned int mode; //analysis mode; presumably one of TRANS_MODE1..TRANS_MODE4 (TODO confirm)
+ double cis_window; //a variant/phenotype pair on the same chromosome is tested only if |phenotype start - variant position| exceeds this
+ double correlation_threshold; //tests with |correlation| at or above this value are written out as hits
+ double beta_ml1, beta_ml2; //beta shape parameters used by getAdjustedPvalue(); 0 means not set
+ unsigned int n_bins; //number of bins counting the tests that stay below correlation_threshold
+ unsigned long int start_line,end_line; //range of phenotype lines of the current chunk, set by setPhenotypeLines()
+
+ //REGIONS
+ genomic_region regionPhenotype;
+
+ //PHENOTYPES
+ int phenotype_count; //number of phenotypes
+ vector < vector < float > > phenotype_val; //phenotype values, one vector per phenotype
+ vector < string > phenotype_id; //phenotype ids
+ vector < string > phenotype_chr; //phenotype chromosomes
+ vector < int > phenotype_start; //phenotype start positions
+ vector < int > phenotype_end; //phenotype end positions
+ vector < bool > phenotype_neg; //phenotype is on the negative strand
+
+ //COVARIATES
+ int covariate_count; //number of covariates
+ vector < vector < string > > covariate_val; //covariate values, kept as strings
+ vector < string > covariate_id; //covariate ids
+
+ //CONSTRUCTOR / DESTRUCTOR
+ trans_data();
+ ~trans_data();
+ void clear();
+
+
+ //READ OR GENERATE DATA
+ void readPhenotypes(string);
+ void scanPhenotypes(string);
+ void setPhenotypeLines(int,int); //(chunk index, number of chunks)
+ void readSampleFromVCF(string);
+ void checkSampleInBED(string);
+ void checkSampleInCOV(string);
+ void readCovariates(string);
+
+ //GENOTYPE & PHENOTYPE MANAGEMENT
+ void computeDosages(int *, float *);
+ void imputeGenotypes(float *);
+ void imputePhenotypes();
+ void normalizePhenotypes();
+ void normalize(float *);
+ void normalize(vector < float > &);
+ void residualizePhenotypes();
+ void shufflePhenotypes();
+ void samplePhenotypes(unsigned int);
+ void normalTranformPhenotypes(); //NOTE(review): spelled "Tranform"; the declared name is kept
+
+ //COMPUTATION METHODS [the two correlations and the two p-value getters are defined inline in this header]
+ double fastCorrelation(vector < float > &, float *); //inner product over sample_count values, four partial sums
+ double slowCorrelation(vector < float > &, float *); //same inner product, single running sum
+ double getNominalPvalue(double, double); //(correlation, degrees of freedom); never returns less than the smallest positive normalized double
+ double getAdjustedPvalue(double); //returns -1 while beta_ml1 or beta_ml2 is 0
+ void getCorrelationThreshold (double);
+ int learnBetaParameters(vector < double > & , double &, double &);
+ void buildNullDistribution(string); //reads null p-values from a file and sets beta_ml1/beta_ml2
+
+ //ANALYSIS
+ void runTransPass(string, string); //(genotype file, output file prefix)
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS ************************//
+//***************************************************************//
+void trans_main(vector < string > &);
+
+//***************************************************************//
+//******************** INLINE FUNCTIONS *************************//
+//***************************************************************//
+
+/*
+ * Inner product of P and G over the first sample_count entries.
+ * Samples are consumed four at a time into four independent partial sums; the samples left
+ * over at the end are added to the first partial sum.
+ */
+inline double trans_data::fastCorrelation(vector < float > & P, float * G) {
+    const int n_blocks = sample_count / 4;
+    const int n_tail = sample_count % 4;
+    double acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+
+    int pos = 0;
+    for (int b = 0 ; b < n_blocks ; ++b, pos += 4) {
+        acc0 += P[pos] * G[pos];
+        acc1 += P[pos+1] * G[pos+1];
+        acc2 += P[pos+2] * G[pos+2];
+        acc3 += P[pos+3] * G[pos+3];
+    }
+
+    //Leftover samples, last one first so that the summation order is unchanged
+    for (int t = n_tail - 1 ; t >= 0 ; --t) acc0 += P[pos+t] * G[pos+t];
+
+    return acc0 + acc1 + acc2 + acc3;
+}
+
+//Plain inner product of P and G over the first sample_count entries, kept in a single running sum.
+inline double trans_data::slowCorrelation(vector < float > & P, float * G) {
+    double total = 0.0;
+    int s = 0;
+    while (s < sample_count) {
+        total += P[s] * G[s];
+        ++s;
+    }
+    return total;
+}
+
+//Nominal p-value of correlation corr with df degrees of freedom, from pf() applied to
+//df * corr^2 / (1 - corr^2); the result is never smaller than the smallest positive normalized double.
+inline double trans_data::getNominalPvalue(double corr, double df) {
+    const double numerator = df * corr * corr;
+    const double denominator = 1 - corr * corr;
+    const double pval = pf(numerator / denominator, 1, df, 0, 0);
+    const double smallest = std::numeric_limits<double>::min();
+    return (pval <= smallest) ? smallest : pval;
+}
+
+//Adjusted p-value: pbeta() of pv under the shapes beta_ml1/beta_ml2, or -1 while either shape is still 0.
+inline double trans_data::getAdjustedPvalue(double pv) {
+    const bool shapes_are_set = (beta_ml1 != 0.0) && (beta_ml2 != 0.0);
+    if (!shapes_are_set) return -1;
+    return pbeta(pv, beta_ml1, beta_ml2, 1, 0);
+}
+
+#endif
diff --git a/src/mode_trans/trans_initilization.cpp b/src/mode_trans/trans_initilization.cpp
new file mode 100644
index 0000000..a193b19
--- /dev/null
+++ b/src/mode_trans/trans_initilization.cpp
@@ -0,0 +1,52 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+
+//Start from an empty data set with every scalar parameter zeroed.
+//Zero is only a defined placeholder for mode, cis_window and n_bins: they used to be left
+//indeterminate, and callers are still expected to set the values they need before running.
+trans_data::trans_data() {
+    sample_count = 0;
+    phenotype_count = 0;
+    covariate_count = 0;
+    mode = 0;
+    cis_window = 0;
+    n_bins = 0;
+    beta_ml1 = 0;
+    beta_ml2 = 0;
+    correlation_threshold = 0;
+    start_line=0;
+    end_line=0;
+}
+
+//Drop all samples, phenotypes and covariates and reset the beta shapes and the correlation threshold.
+//mode, cis_window, n_bins, start_line and end_line are left as they are.
+void trans_data::clear() {
+    //Samples
+    sample_count = 0;
+    sample_id.clear();
+
+    //Phenotypes (phenotype_neg was previously left behind)
+    phenotype_count = 0;
+    phenotype_val.clear();
+    phenotype_id.clear();
+    phenotype_chr.clear();
+    phenotype_start.clear();
+    phenotype_end.clear();
+    phenotype_neg.clear();
+
+    //Covariates
+    covariate_count = 0;
+    covariate_val.clear();
+    covariate_id.clear();
+
+    //Fitted / derived parameters
+    beta_ml1 = 0;
+    beta_ml2 = 0;
+    correlation_threshold = 0;
+}
+
+trans_data::~trans_data() { //destructor: only delegates to clear()
+ clear(); //empties the sample, phenotype and covariate containers and zeroes the counters
+}
\ No newline at end of file
diff --git a/src/mode_trans/trans_learn_beta.cpp b/src/mode_trans/trans_learn_beta.cpp
new file mode 100644
index 0000000..dc675b6
--- /dev/null
+++ b/src/mode_trans/trans_learn_beta.cpp
@@ -0,0 +1,103 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf_psi.h>
+#include <gsl/gsl_sf_gamma.h>
+
+#define BETA_SHAPE1_MIN 0.1 // trans_betaLogLikelihood throws when beta_shape1 is below this
+#define BETA_SHAPE1_MAX 10 // trans_betaLogLikelihood throws when beta_shape1 is above this
+#define BETA_SHAPE2_MIN 5 // trans_betaLogLikelihood throws when beta_shape2 is below this
+#define BETA_SHAPE2_MAX 100000000 // trans_betaLogLikelihood throws when beta_shape2 is above this
+
+
+// Exception thrown by trans_betaLogLikelihood when a beta shape leaves its allowed range; what() returns the message.
+class trans_learn_beta_exception : public std::exception {
+    string msg;
+public:
+    trans_learn_beta_exception(string _msg) : msg(_msg) { }
+    virtual ~trans_learn_beta_exception() throw() { }
+    virtual const char * what() const throw() { return msg.c_str(); }
+};
+
+// Negative beta log-likelihood used as the minimizer objective. v = (shape1, shape2); params points to three
+// doubles: sum of log(p), sum of log(1 - p) and the number of p-values. Throws when a shape is out of bounds.
+double trans_betaLogLikelihood(const gsl_vector *v, void *params) {
+    const double * sums = static_cast < const double * > (params);
+    double a = gsl_vector_get(v, 0), b = gsl_vector_get(v, 1);
+    if (a < BETA_SHAPE1_MIN) throw trans_learn_beta_exception("beta_shape1 too small [" + stb.str(a, 3) + "]");
+    if (a > BETA_SHAPE1_MAX) throw trans_learn_beta_exception("beta_shape1 too large [" + stb.str(a, 3) + "]");
+    if (b < BETA_SHAPE2_MIN) throw trans_learn_beta_exception("beta_shape2 too small [" + stb.str(b, 3) + "]");
+    if (b > BETA_SHAPE2_MAX) throw trans_learn_beta_exception("beta_shape2 too large [" + stb.str(b, 3) + "]");
+    const double log_lik = (a - 1) * sums[0] + (b - 1) * sums[1] - sums[2] * gsl_sf_lnbeta(a, b);
+    return -1.0 * log_lik;
+}
+
+// Fits beta shapes to `pval` by minimizing trans_betaLogLikelihood with GSL's nmsimplex2 minimizer.
+// beta_shape1/beta_shape2: in = starting point, out = last minimizer position. P-values equal to 1.0 are
+// overwritten in `pval` with 0.99999999 so that log(1 - p) stays finite. Returns 1 when the simplex size
+// test reported GSL_SUCCESS, 0 otherwise. Exceptions thrown by the likelihood callback are re-thrown
+// unchanged, but only after the GSL objects have been released (they used to leak on that path).
+int trans_data::learnBetaParameters(vector < double > & pval, double & beta_shape1, double & beta_shape2) {
+
+    //Set starting point to the shapes supplied by the caller
+    gsl_vector * x = gsl_vector_alloc (2);
+    gsl_vector_set (x, 0, beta_shape1);
+    gsl_vector_set (x, 1, beta_shape2);
+
+    //Set initial step sizes to a tenth of each starting shape
+    gsl_vector * ss = gsl_vector_alloc (2);
+    gsl_vector_set (ss, 0, beta_shape1/10);
+    gsl_vector_set (ss, 1, beta_shape2/10);
+
+    //Statistics handed to the likelihood: sum log(p), sum log(1-p), number of p-values
+    double par [3] = { 0.0, 0.0, 0.0 };
+    for (size_t e = 0 ; e < pval.size(); e ++) {
+        if (pval[e] == 1.0) pval[e] = 0.99999999;
+        par[0] += log (pval[e]);
+        par[1] += log (1 - pval[e]);
+    }
+    par[2] = pval.size();
+    gsl_multimin_function minex_func;
+    minex_func.n = 2;
+    minex_func.f = trans_betaLogLikelihood;
+    minex_func.params = par;
+
+    //Initialize optimization machinery
+    gsl_multimin_fminimizer * s = gsl_multimin_fminimizer_alloc (gsl_multimin_fminimizer_nmsimplex2, 2);
+    //Optimization iteration: at most 1000 steps, stops once the size test no longer returns GSL_CONTINUE
+    size_t iter = 0;
+    int status;
+    try {
+        gsl_multimin_fminimizer_set (s, &minex_func, x, ss);
+        do {
+            iter++;
+            status = gsl_multimin_fminimizer_iterate(s);
+            if (status) break;
+            status = gsl_multimin_test_size (gsl_multimin_fminimizer_size (s), 0.01);
+        } while (status == GSL_CONTINUE && iter < 1000);
+    } catch (...) {
+        gsl_vector_free(x); gsl_vector_free(ss); gsl_multimin_fminimizer_free (s);
+        throw;
+    }
+
+    //Output new beta shape values, then free allocated memory
+    beta_shape1 = gsl_vector_get (s->x, 0);
+    beta_shape2 = gsl_vector_get (s->x, 1);
+    gsl_vector_free(x); gsl_vector_free(ss); gsl_multimin_fminimizer_free (s);
+    return (status == GSL_SUCCESS);
+}
diff --git a/src/mode_trans/trans_main.cpp b/src/mode_trans/trans_main.cpp
new file mode 100644
index 0000000..5fa1c8d
--- /dev/null
+++ b/src/mode_trans/trans_main.cpp
@@ -0,0 +1,161 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+// Entry point of the trans mode. Declares and parses the command line, checks mandatory options and the
+// analysis mode, loads samples/phenotypes/covariates, prepares phenotypes for the chosen mode, runs the pass.
+void trans_main(vector < string > & argv) {
+    trans_data D;
+
+    //-------------------------
+    // 1. DECLARE ALL OPTIONS
+    //-------------------------
+    D.declareBasicOptions(); //Mandatory
+    boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+    opt_files.add_options()
+        ("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF format.")
+        ("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
+        ("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+        ("out", boost::program_options::value< string >(), "Output file.");
+
+    boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+    opt_parameters.add_options()
+        ("normal", "Quantile normalize phenotype data.")
+        ("window", boost::program_options::value< double >()->default_value(5e6, "5e6"), "Cis-window of variants to be excluded.")
+        ("threshold", boost::program_options::value< double >()->default_value(1e-5, "1e-5"), "P-value threshold below which hits are reported.")
+        ("bins", boost::program_options::value< unsigned int >()->default_value(1000), "Number of bins to use to categorize all p-values above --threshold.");
+
+    boost::program_options::options_description opt_modes ("\x1B[32mAnalysis type\33[0m");
+    opt_modes.add_options()
+        ("nominal", "MODE1: NOMINAL PASS [Pvalues are not adjusted].")
+        ("adjust", boost::program_options::value< string >(), "MODE2: ADJUSTED PASS [Pvalues are adjusted].")
+        ("permute", "MODE3: PERMUTATION PASS [Permute all phenotypes once].")
+        ("sample", boost::program_options::value< unsigned int >(), "MODE4: PERMUTATION PASS [Permute randomly chosen phenotypes multiple times].");
+
+    boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+    opt_parallel.add_options()
+        ("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed");
+
+    D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_modes).add(opt_parallel);
+
+    //-------------------
+    // 2. PARSE OPTIONS
+    //-------------------
+    try {
+        boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+        boost::program_options::notify(D.options);
+    } catch ( const boost::program_options::error& e ) {
+        cerr << "Error parsing [trans] command line :" << string(e.what()) << endl;
+        exit(EXIT_FAILURE);    //a rejected command line is a failure: report a non-zero status (was 0)
+    }
+
+    //---------------------
+    // 3. PRINT HELP/HEADER
+    //---------------------
+    vrb.ctitle("MAPPING QTL IN TRANS");
+    if (D.options.count("help")) {
+        cout << D.option_descriptions << endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    //-----------------
+    // 4. COMMON CHECKS
+    //-----------------
+    if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+    if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
+    if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
+    int nMode = D.options.count("nominal") + D.options.count("adjust") + D.options.count("permute") + D.options.count("sample");
+    if (nMode != 1) vrb.error("Please, specify only one of these options [--nominal, --adjust, --permute, --sample]");
+    if (D.options.count("chunk") && D.options.count("sample")) vrb.error("--chunk cannot be combined with --sample");
+
+    //---------
+    // 5. MODES
+    //---------
+    //MODE1: NOMINAL PASS NON ADJUSTED
+    if (D.options.count("nominal")) {
+        D.mode = TRANS_MODE1;
+        vrb.bullet("TASK: Perform a full nominal pass, do not adjust p-values");
+    }
+
+    //MODE2: NOMINAL PASS ADJUSTED
+    if (D.options.count("adjust")) {
+        D.mode = TRANS_MODE2;
+        vrb.bullet("TASK: Test and adjust p-values using [" + D.options["adjust"].as < string > () +"]");
+    }
+
+    //MODE3: PERMUTATION PASS
+    if (D.options.count("permute")) {
+        D.mode = TRANS_MODE3;
+        vrb.bullet("TASK: Permute all phenotype once and test");
+    }
+
+    //MODE4: PERMUTATION PASS
+    if (D.options.count("sample")) {
+        D.mode = TRANS_MODE4;
+        vrb.bullet("TASK: Permute randomly chosen phenotypes " + stb.str(D.options["sample"].as < unsigned int > ()) + " times and test");
+    }
+
+    //--------------
+    // 6. SET PARAMS
+    //--------------
+    if (D.options["window"].as < double > () <= 0 || D.options["window"].as < double > () > 1e9) vrb.error ("Incorrect cis-window size");
+    vrb.bullet("Cis-window size is " + stb.str((int)D.options["window"].as < double > ()) + " bp");
+    D.cis_window = D.options["window"].as < double > ();
+    if (D.options.count("chunk")) {
+        vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+        if (nChunk.size() != 2 || nChunk[0] > nChunk[1]) vrb.error("Incorrect --chunk arguments!");
+        vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
+    }
+    D.n_bins = D.options["bins"].as < unsigned int > ();
+    vrb.bullet("#bins = " + stb.str(D.n_bins));
+    vrb.bullet("threshold = " + stb.str(D.options["threshold"].as < double > ()));
+
+    D.processBasicOptions();
+    if (D.options.count("chunk")) {
+        D.scanPhenotypes(D.options["bed"].as < string > ());
+        D.setPhenotypeLines(D.options["chunk"].as < vector < int > > ()[0], D.options["chunk"].as < vector < int > > ()[1]);
+        D.clear();
+    }
+
+    //---------------------------
+    // 7. READ FILES & INITIALIZE
+    //---------------------------
+    //(processBasicOptions() has already been called above, before the optional chunk scan)
+    D.readSampleFromVCF(D.options["vcf"].as < string > ());
+    D.checkSampleInBED(D.options["bed"].as < string > ());
+    if (D.options.count("cov")) D.checkSampleInCOV(D.options["cov"].as < string > ());
+
+    D.readPhenotypes(D.options["bed"].as < string > ());
+    D.imputePhenotypes();
+
+    if (D.options.count("cov")) {
+        D.readCovariates(D.options["cov"].as < string > ());
+        D.residualizePhenotypes();
+    }
+    if (D.options.count("normal")) D.normalTranformPhenotypes();
+
+    if (D.options.count("permute")) D.shufflePhenotypes();
+    if (D.options.count("sample")) D.samplePhenotypes(D.options["sample"].as < unsigned int > ());
+    if (D.options.count("adjust")) D.buildNullDistribution(D.options["adjust"].as < string > ());
+    D.getCorrelationThreshold(D.options["threshold"].as < double > ());
+
+    D.normalizePhenotypes();
+
+    //----------------
+    // 8. RUN ANALYSIS
+    //----------------
+    D.runTransPass(D.options["vcf"].as < string > (), D.options["out"].as < string > ());
+}
diff --git a/src/mode_trans/trans_managment.cpp b/src/mode_trans/trans_managment.cpp
new file mode 100644
index 0000000..71c80e1
--- /dev/null
+++ b/src/mode_trans/trans_managment.cpp
@@ -0,0 +1,161 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+// Dosage per sample = sum of its two allele codes (bcf_gt_allele); bcf_float_missing flags any sample
+// with a missing genotype entry. vGT is read at 2 entries per sample, vDS is written at one per sample.
+void trans_data::computeDosages(int * vGT, float * vDS) {
+    for (int idx = 0; idx < sample_count; ++idx)
+        vDS[idx] = (vGT[2*idx] == bcf_gt_missing || vGT[2*idx+1] == bcf_gt_missing) ? bcf_float_missing : bcf_gt_allele(vGT[2*idx]) + bcf_gt_allele(vGT[2*idx+1]);
+}
+
+// Mean-imputes missing dosages in place: every entry equal to bcf_float_missing is replaced by the
+// mean of the non-missing entries (the first sample_count entries of G are scanned).
+void trans_data::imputeGenotypes(float * G) {
+    double total = 0.0; int observed = 0;
+    for (int idx = 0; idx < sample_count; ++idx) {
+        if (G[idx] == bcf_float_missing) continue;
+        total += G[idx];
+        ++observed;
+    }
+    const double fill = total / observed;
+    for (int idx = 0; idx < sample_count; ++idx) if (G[idx] == bcf_float_missing) G[idx] = fill;
+}
+
+// Mean-imputes every phenotype in place: values equal to bcf_float_missing (what readPhenotypes
+// stores for "NA") become the mean of that phenotype's non-missing samples.
+void trans_data::imputePhenotypes() {
+    for (int row = 0; row < phenotype_count; ++row) {
+        double total = 0.0; int observed = 0;
+        for (int idx = 0; idx < sample_count; ++idx) {
+            if (phenotype_val[row][idx] == bcf_float_missing) continue;
+            total += phenotype_val[row][idx];
+            ++observed;
+        }
+        const double fill = total / observed;
+        for (int idx = 0; idx < sample_count; ++idx) if (phenotype_val[row][idx] == bcf_float_missing) phenotype_val[row][idx] = fill;
+    }
+}
+
+// Centres the first sample_count entries of G on their mean and divides them by the Euclidean norm
+// of the centred values, in place. A norm of exactly 0 is replaced by 1 so no division by zero occurs.
+void trans_data::normalize(float * G) {
+    double centre = 0.0;
+    for (int idx = 0; idx < sample_count; ++idx) centre += G[idx];
+    centre /= sample_count;
+    double sq_norm = 0.0;
+    for (int idx = 0; idx < sample_count; ++idx) { G[idx] -= centre; sq_norm += G[idx] * G[idx]; }
+    double norm = sqrt(sq_norm);
+    if (norm == 0) norm = 1;
+    for (int idx = 0; idx < sample_count; ++idx) G[idx] /= norm;
+}
+
+// Vector overload: subtracts the mean of the first sample_count entries of G, then divides them by
+// the square root of the sum of squared centred values (a result of exactly 0 is treated as 1).
+// G is modified in place.
+void trans_data::normalize(vector < float > & G) {
+    double mu = 0.0, ss = 0.0;
+    for (int k = 0; k < sample_count; ++k) mu += G[k];
+    mu /= sample_count;
+    for (int k = 0; k < sample_count; ++k) { G[k] -= mu; ss += G[k] * G[k]; }
+    double scale = sqrt(ss);
+    if (scale == 0) scale = 1;
+    for (int k = 0; k < sample_count; ++k) G[k] /= scale;
+}
+
+void trans_data::normalizePhenotypes() {
+    for (int row = 0; row < phenotype_count; ++row) normalize(phenotype_val[row]);    // normalize() each phenotype in turn
+}
+
+// Rank-based normal transform applied to every phenotype in place: ranks from myranker::rank are
+// shifted by -0.5, divided by (largest shifted rank + 0.5) and passed through qnorm(., 0.0, 1.0, 1, 0).
+void trans_data::normalTranformPhenotypes() {
+    vrb.title("Normal transform phenotypes");
+    for (int row = 0; row < phenotype_count; ++row) {
+        vector < float > ranks;
+        myranker::rank(phenotype_val[row], ranks);
+        double top = 0;
+        for (int idx = 0; idx < sample_count; ++idx) {
+            ranks[idx] -= 0.5;
+            if (ranks[idx] > top) top = ranks[idx];
+        }
+        // Denominator is the largest shifted rank plus 0.5.
+        const double denom = top + 0.5;
+        for (int idx = 0; idx < sample_count; ++idx) { ranks[idx] /= denom; phenotype_val[row][idx] = qnorm(ranks[idx], 0.0, 1.0, 1, 0); }
+    }
+}
+
+void trans_data::shufflePhenotypes() {    // one random sample permutation, applied identically to every phenotype
+    vector < unsigned int > perm (sample_count, 0);
+    for (int idx = 0; idx < sample_count; ++idx) perm[idx] = idx;
+    shuffle(perm.begin(), perm.end(), rng.getEngine());    // order drawn with the engine returned by rng
+    vector < float > scratch (sample_count, 0.0);
+    for (unsigned int row = 0; row < phenotype_count; ++row) {
+        for (unsigned int idx = 0; idx < sample_count; ++idx) scratch[idx] = phenotype_val[row][perm[idx]];
+        phenotype_val[row] = scratch;
+    }
+}
+
+// Replaces the phenotype set by N phenotypes picked through rng.getInt(phenotype_count) (the same one may
+// be picked several times); each picked row gets its own shuffle of sample values. Fix: phenotype_neg,
+// which readPhenotypes fills in parallel with the other arrays, is now rebuilt too and stays aligned.
+void trans_data::samplePhenotypes(unsigned int N) {
+    vector < vector < float > > phenotype_val_tmp (N);
+    vector < string > phenotype_id_tmp (N), phenotype_chr_tmp (N);
+    vector < int > phenotype_start_tmp (N), phenotype_end_tmp (N);
+    const bool has_strand = (phenotype_neg.size() == static_cast < size_t > (phenotype_count));    // only resample flags that are aligned
+    decltype(phenotype_neg) phenotype_neg_tmp (has_strand ? N : 0);
+
+    for (unsigned int n = 0; n < N; n ++) {
+        // Same rng call order as before (index draw, then shuffle).
+        unsigned int ridx = rng.getInt(phenotype_count);
+        phenotype_val_tmp[n] = phenotype_val[ridx];
+        shuffle(phenotype_val_tmp[n].begin(), phenotype_val_tmp[n].end(), rng.getEngine());
+        phenotype_id_tmp[n] = phenotype_id[ridx]; phenotype_chr_tmp[n] = phenotype_chr[ridx];
+        phenotype_start_tmp[n] = phenotype_start[ridx]; phenotype_end_tmp[n] = phenotype_end[ridx];
+        if (has_strand) phenotype_neg_tmp[n] = phenotype_neg[ridx];
+    }
+    phenotype_count = N;
+    phenotype_val = phenotype_val_tmp; phenotype_id = phenotype_id_tmp; phenotype_chr = phenotype_chr_tmp;
+    phenotype_start = phenotype_start_tmp; phenotype_end = phenotype_end_tmp;
+    if (has_strand) phenotype_neg = phenotype_neg_tmp;
+}
+
+// Converts the p-value threshold into correlation_threshold. When beta shapes are set, the threshold is
+// first passed through qbeta(pvalue, beta_ml1, beta_ml2, 1, 0); the resulting p-value goes through
+// qf(., 1, sample_count - 2, 0, 0) and the F value f is turned into sqrt(f / (sample_count - 2 + f)).
+void trans_data::getCorrelationThreshold(double pvalue) {
+    vrb.title("Calculate correlation threshold");
+    vrb.bullet("thres = " + stb.str(pvalue));
+    // The un-adjusted path is taken only when both beta shapes are exactly 0.
+    const bool adjusted = !(beta_ml1 == 0 && beta_ml2 == 0);
+    double nominal = adjusted ? qbeta(pvalue, beta_ml1, beta_ml2, 1, 0) : pvalue;
+    if (adjusted) vrb.bullet("npvt = " + stb.str(nominal));
+    double fstat = qf(nominal, 1, sample_count - 2, 0, 0);
+    if (adjusted) vrb.bullet("nfst = " + stb.str(fstat));
+    correlation_threshold = sqrt(fstat / (sample_count - 2 + fstat));
+    vrb.bullet("corr = " + stb.str(correlation_threshold, 4));
+}
+
+void trans_data::residualizePhenotypes() {    // pushes every covariate into a residualizer, then residualizes each phenotype
+    vrb.title("Residualize phenotypes for covariates");
+    residualizer engine (sample_count);
+    for (int cov = 0; cov < covariate_count; ++cov) engine.push(covariate_val[cov]);
+    engine.build();    // must follow the pushes and precede any residualize() call, as in the original order
+    for (unsigned int row = 0; row < phenotype_count; ++row) engine.residualize(phenotype_val[row]);
+    vrb.bullet("#covariates = " + stb.str(covariate_count));
+}
+
diff --git a/src/mode_trans/trans_read_covariates.cpp b/src/mode_trans/trans_read_covariates.cpp
new file mode 100644
index 0000000..ccb9a9c
--- /dev/null
+++ b/src/mode_trans/trans_read_covariates.cpp
@@ -0,0 +1,67 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+// Reads the covariate file `fcov` into covariate_val / covariate_count. The header row lists sample ids
+// (its first token is skipped); each later row is one covariate: its id, then one value per header sample.
+// Fix: rows with more columns than the header are rejected instead of indexing past the sample mapping.
+void trans_data::readCovariates(string fcov) {
+    string buffer; vector < string > tokens;
+    int n_includedS = 0, n_excludedS = 0, n_missingS = 0;
+    int n_includedC = 0, n_excludedC = 0;
+    vector < int > mappingS;    // header column (t-1) -> index returned by findSample, or -1 when the column is ignored
+
+    vrb.title("Reading covariates in [" + fcov + "]");
+    input_file fd (fcov);
+    if (fd.fail()) vrb.error("Cannot open file!");
+
+    //Read samples: one mappingS entry per header column, so data columns can be indexed by position
+    getline(fd, buffer);
+    stb.split(buffer, tokens);
+    for (size_t t = 1 ; t < tokens.size() ; t ++) {
+        if (filter_sample.check(tokens[t])) {
+            mappingS.push_back(findSample(tokens[t]));
+            if (mappingS.back() >= 0) n_includedS ++;
+            else n_missingS ++;
+        } else {
+            mappingS.push_back(-1);
+            n_excludedS++;
+        }
+    }
+    vrb.bullet(stb.str(n_includedS) + " samples included");
+    if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
+    if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
+    if (n_includedS != sample_count) vrb.error("Cannot find covariates for " + stb.str(sample_count - n_includedS) + " samples!");
+
+    //Read covariates
+    while(getline(fd, buffer)) {
+        if (buffer.size() == 0) continue;
+        stb.split(buffer, tokens);
+        if (tokens.size() < 2 || tokens.size() > mappingS.size() + 1) vrb.error("Incorrect number of columns!");
+        if (filter_covariate.check(tokens[0])) {
+            covariate_val.push_back(vector < string > (sample_count));
+            for (size_t t = 1 ; t < tokens.size() ; t ++) if (mappingS[t-1] >= 0) covariate_val.back()[mappingS[t-1]] = tokens[t];
+            n_includedC ++;
+        } else n_excludedC ++;
+    }
+
+    //Finalise
+    covariate_count = n_includedC;
+    vrb.bullet(stb.str(n_includedC) + " covariate(s) included");
+    if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariate(s) excluded");
+    fd.close();
+}
+
diff --git a/src/mode_trans/trans_read_phenotypes.cpp b/src/mode_trans/trans_read_phenotypes.cpp
new file mode 100644
index 0000000..49c26c6
--- /dev/null
+++ b/src/mode_trans/trans_read_phenotypes.cpp
@@ -0,0 +1,116 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+// Loads phenotypes from the BED file `fbed`; lines before start_line are skipped and reading stops at end_line when it is non-zero.
+// Fixes: (1) header columns excluded by filter_sample now get a -1 entry in mappingS, so data column t
+// always maps through mappingS[t-6]; (2) data rows that are too short or longer than the header are rejected.
+void trans_data::readPhenotypes(string fbed) {
+    int n_includedS = 0, n_excludedS = 0, n_excludedU = 0, n_excludedP = 0, n_negativeStrd = 0;
+    vector < int > mappingS;    // BED sample column (t-6) -> index returned by findSample, or -1 when the column is ignored
+
+    //Open BED file
+    vrb.title("Reading phenotype data in [" + fbed + "] from line " + stb.str(start_line) + " to " + stb.str(end_line));
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read header line");
+
+    //Process sample names (exactly one mappingS entry per sample column)
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (size_t t = 6 ; t < tokens.size() ; t ++) {
+        if (filter_sample.check(tokens[t])) {
+            mappingS.push_back(findSample(tokens[t]));
+            if (mappingS.back() < 0) n_excludedS ++;
+            else n_includedS ++;
+        } else { mappingS.push_back(-1); n_excludedU ++; }
+    }
+    vrb.bullet(stb.str(n_includedS) + " samples included");
+    if (n_excludedU > 0) vrb.bullet(stb.str(n_excludedU) + " samples excluded by user");
+    if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples without genotype data");
+    if (n_includedS != sample_count) vrb.error("Cannot find phenotype data for " + stb.str(sample_count - n_includedS) + " samples!");
+    unsigned long int linecount = 1;    // counts phenotypes that passed filter_phenotype, compared against start_line/end_line
+    //Read phenotypes
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (str.l && str.s[0] != tbx->conf.meta_char) {
+            stb.split(string(str.s), tokens);
+            if (tokens.size() < 6 || tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+            if (filter_phenotype.check(tokens[3])) {
+                if (linecount >= start_line){
+                    phenotype_id.push_back(tokens[3]);
+                    phenotype_chr.push_back(tokens[0]);
+                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+                    phenotype_end.push_back(atoi(tokens[2].c_str()));
+                    phenotype_neg.push_back(tokens[5] == "-");
+                    if (phenotype_neg.back()) n_negativeStrd ++;
+                    phenotype_val.push_back(vector < float > (sample_count, 0.0));
+                    for (size_t t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) phenotype_val.back()[mappingS[t-6]] = (tokens[t] == "NA") ? bcf_float_missing : stof(tokens[t]);
+                }
+                if (end_line != 0 && linecount >= end_line) break;
+                linecount++;
+            } else n_excludedP ++;
+        }
+    }
+
+    //Finalize & verbose
+    tbx_destroy(tbx);
+    phenotype_count = phenotype_id.size();
+    vrb.bullet(stb.str(phenotype_count) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (n_negativeStrd > 0) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes is on the negative strand");
+    hts_close(fp);
+}
+
+
+// Counts the phenotypes of `fbed` that pass filter_phenotype, without storing any values, and writes
+// the count to phenotype_count. Requires the tabix index of the file; calls vrb.leave when no
+// phenotype passes the filter.
+void trans_data::scanPhenotypes(string fbed) {
+    int n_includedP = 0, n_excludedP = 0;
+
+    //Open BED file and its index
+    vrb.title("Reading phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+
+    //Read header (first character must be the index's meta character)
+    kstring_t str = {0,0,0};
+    if (!hts_getline(fp, KS_SEP_LINE, &str) || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");
+
+    //Scan file: tokens[3] is the phenotype id tested against the filter
+    vector < string > tokens;
+    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+        if (!str.l || str.s[0] == tbx->conf.meta_char) continue;
+        stb.split(string(str.s), tokens," ",4);
+        if (tokens.size() < 4) vrb.error("Incorrect number of columns!");
+        if (filter_phenotype.check(tokens[3])) ++n_includedP;
+        else ++n_excludedP;
+    }
+
+    //Finalize & verbose
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    phenotype_count = n_includedP;
+    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
+}
diff --git a/src/mode_trans/trans_read_samples.cpp b/src/mode_trans/trans_read_samples.cpp
new file mode 100644
index 0000000..de6f6f6
--- /dev/null
+++ b/src/mode_trans/trans_read_samples.cpp
@@ -0,0 +1,93 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "trans_data.h"
+
+// Appends every sample name found in the header of the VCF/BCF file `fname` to sample_id and bumps
+// sample_count once per appended name (entries already present are kept).
+void trans_data::readSampleFromVCF(string fname) {
+    vrb.title("Read sample list from [" + fname + "]");
+    bcf_sweep_t * sweep = bcf_sweep_init(fname.c_str());
+    if (!sweep) vrb.error("Cannot open file!");
+    bcf_hdr_t * header = bcf_sweep_hdr(sweep);
+    if (!header) vrb.error("Cannot read vcf header!");
+    const unsigned int total = bcf_hdr_nsamples(header);
+    for (unsigned int idx = 0; idx < total; ++idx, ++sample_count)
+        sample_id.push_back(string(header->samples[idx]));
+    vrb.bullet("#samples = " + stb.str(sample_count));
+    bcf_sweep_destroy(sweep);
+}
+
+// Checks that every known sample (sample_count entries, looked up with findSample) appears in the header of
+// the BED file `fname`; stops through vrb.error otherwise. Fix: the fatal message now says "does not allow".
+void trans_data::checkSampleInBED(string fname) {
+    unsigned int n_included = 0, n_excluded = 0, n_missing = 0;
+    vrb.title("Checking sample list in [" + fname + "]");
+    htsFile *fp = hts_open(fname.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    kstring_t str = {0,0,0}; vector < string > tokens;
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read BED header!");
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    vector < bool > sample_found = vector < bool > (sample_count, false);
+    for (size_t t = 6 ; t < tokens.size() ; t ++) {
+        int sidx = findSample(tokens[t]);
+        if (sidx < 0) { n_excluded ++; continue; }
+        sample_found[sidx] = true;
+        n_included ++;
+    }
+    for (int i = 0 ; i < sample_count ; i ++) if (!sample_found[i]) n_missing ++;
+    if (n_missing > 0) {
+        vrb.bullet("#samples included = " + stb.str(n_included));
+        vrb.bullet("#samples excluded = " + stb.str(n_excluded));
+        vrb.bullet("#samples missing = " + stb.str(n_missing));
+        vrb.error("Some samples have genotype data and no phenotype data. Trans analysis does not allow this for speed purposes.");
+    } else if (n_excluded > 0) {
+        vrb.bullet("#samples included = " + stb.str(n_included));
+        vrb.bullet("#samples excluded = " + stb.str(n_excluded));
+    } else vrb.bullet("#samples = " + stb.str(n_included));
+    hts_close(fp);
+}
+
+void trans_data::checkSampleInCOV(string fname) {
+ string buffer; vector < string > tokens;
+ unsigned int n_included = 0, n_excluded = 0, n_missing = 0;
+ vrb.title("Checking sample list in [" + fname + "]");
+ input_file fd (fname);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ getline(fd, buffer);
+ if (buffer.size() == 0) vrb.error("No header line detected!");
+ stb.split(buffer, tokens);
+ if (tokens.size() < 2) vrb.error("Incorrect number of columns!");
+ vector < bool > sample_found = vector < bool > (sample_count, false);
+ for (int t = 1 ; t < tokens.size() ; t ++) {
+ int sidx = findSample(tokens[t]);
+ if (sidx >= 0) {
+ sample_found[sidx] = true;
+ n_included ++;
+ } else n_excluded ++;
+ }
+ for (int i = 0 ; i < sample_count ; i ++) if (!sample_found[i]) n_missing ++;
+ if (n_missing > 0) {
+ vrb.bullet("#samples included = " + stb.str(n_included));
+ vrb.bullet("#samples excluded = " + stb.str(n_excluded));
+ vrb.bullet("#samples missing = " + stb.str(n_missing));
+ vrb.error("Some sample have genotype data and no phenotype data!");
+ } else if (n_excluded > 0) {
+ vrb.bullet("#samples included = " + stb.str(n_included));
+ vrb.bullet("#samples excluded = " + stb.str(n_excluded));
+ } else vrb.bullet("#samples = " + stb.str(n_included));
+ fd.close();
+}
diff --git a/src/mode_union/union_data.h b/src/mode_union/union_data.h
new file mode 100644
index 0000000..abcd7d0
--- /dev/null
+++ b/src/mode_union/union_data.h
@@ -0,0 +1,247 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#ifndef _UNION_DATA_H
+#define _UNION_DATA_H
+
+//INCLUDES
+#include <cmath>
+
+#include "../common/data.h"
+
+//One genomic interval as handled by the union mode: chromosome, start/end coordinates,
+//an integer identifier and a free-text type label.
+//A default-constructed object carries the placeholders "" / -1 / -1 / -1 / "NA".
+class coldspot_u{
+public:
+	string chr;
+	int start;
+	int end;
+	int idx;
+	string type;
+
+	coldspot_u() : chr(""), start(-1), end(-1), idx(-1), type("NA") {}
+
+	coldspot_u(string c, int s, int e, int i,string t) : chr(c), start(s), end(e), idx(i), type(t) {}
+
+	//Prints "ID: <idx> CHR: <chr> START: <start> END: <end> TYPE: <type>" followed by endl.
+	friend ostream& operator<<(ostream& out, const coldspot_u& p){
+		out << "ID: " << p.idx;
+		out << " CHR: " << p.chr;
+		out << " START: " << p.start;
+		out << " END: " << p.end;
+		out << " TYPE: " << p.type;
+		out << endl;
+		return out;
+	}
+};
+
+//Collection of union hits stored as parallel vectors: entry i of every vector belongs to hit i.
+//Streaming the object writes one "__UNION__ ..." line per hit.
+class results{
+public:
+	vector < string > pheno;
+	vector < string > geno;
+	vector < int > rank;
+	vector < double > pval;
+	vector < int > csi;
+	vector < string > regions;
+
+	//Append one hit; the arguments go to pheno, geno, rank, pval, csi and regions, in that order.
+	void assign(string p, string s, int r, double pv,int cs , string re){
+		pheno.push_back(p);
+		geno.push_back(s);
+		rank.push_back(r);
+		pval.push_back(pv);
+		csi.push_back(cs);
+		regions.push_back(re);
+	}
+
+	//Line layout: __UNION__ <pheno> <geno> <rank> 1 <pval> <csi> <regions>
+	friend ostream& operator<<(ostream& out, const results& r){
+		const size_t n_hits = r.pheno.size();
+		for (size_t h = 0 ; h < n_hits ; h ++) {
+			out << "__UNION__ " << r.pheno[h] << " " << r.geno[h] << " " << r.rank[h];
+			out << " 1 " << r.pval[h] << " " << r.csi[h] << " " << r.regions[h] << "\n";
+		}
+		return out;
+	}
+};
+
+
+//Plain container pairing a set of genotype vectors with their variant identifiers.
+//getBestVariant() reads genotypes[g] together with ids[g], so both vectors are expected to be index-aligned.
+struct genotypes_holder{
+ vector < vector < float> > genotypes; //one vector of per-sample values for each candidate variant
+ vector < string > ids; //identifier of the variant stored at the same index in genotypes
+};
+
+//Per-phenotype bookkeeping with one slot per input (the slot count is the third constructor argument).
+//Each slot keeps the variants assigned to it and their ranks; found[i] tells whether slot i received anything.
+//max_independent_signal tracks the largest number of entries seen in any single slot.
+class myPhenotype{
+public:
+	vector < vector < int > > ranks;
+	vector < vector < string > > genotypes;
+	vector < bool > found;
+	int max_independent_signal;
+	string pheno_chr;
+	int pheno_pos;
+
+	//Empty record: no slots, position -1, signal counter 0.
+	myPhenotype() : max_independent_signal(0), pheno_chr(""), pheno_pos(-1) {}
+
+	//Record located at c:p with f empty slots; the signal counter starts at 1.
+	myPhenotype(string c , int p , int f) :
+		ranks(f),
+		genotypes(f),
+		found(f, false),
+		max_independent_signal(1),
+		pheno_chr(c),
+		pheno_pos(p) {}
+
+	~myPhenotype(){
+		ranks.clear();
+		genotypes.clear();
+		found.clear();
+	}
+
+	//Store variant g with rank r in slot i (i is not range-checked) and update the signal counter.
+	void assign(string g , int r , int i){
+		genotypes[i].push_back(g);
+		ranks[i].push_back(r);
+		found[i] = true;
+		const size_t n_in_slot = ranks[i].size();
+		if (n_in_slot > static_cast < size_t > (max_independent_signal)) max_independent_signal = n_in_slot;
+	}
+};
+
+
+//State and operations of the "union" mode. Extends the common `data` base class with
+//genotype / phenotype / covariate storage, the coldspot tables and the routines used by
+//union_main() to build and search unions of QTL results.
+//NOTE(review): the class owns the raw pointers in all_coldspots_p (clear() deletes them and the
+//destructor calls clear()), but no copy constructor / assignment is declared - do not copy instances.
+class union_data : public data {
+public:
+ //PARAMETERS
+ //Column indices into the results files. union_main() stores variant/phenotype/rank/best as
+ //0-based values (command-line value minus one); the constructor sets the defaults.
+ int pvalue_column,variant_column,phenotype_column,rank_column,best_column,coldspot_count;
+ static const int bin_size = 1000000; //presumably the width of the bins keyed in coldspot_bins_p - confirm in readHotspots()
+ int no_of_files; //set by union_main() to the number of --results files
+
+
+ //REGIONS
+ genomic_region regionPhenotype; //filled by setPhenotypeRegion()
+ genomic_region regionGenotype; //filled by setGenotypeRegion() or derived by deduceGenotypeRegion()
+
+ //GENOTYPES
+ int genotype_count; //variant site number
+ vector < vector < float > > genotype_val; //variant site genotype dosages
+ vector < string > genotype_chr; //variant site chromosome
+ vector < string > genotype_id; //variant site IDs
+ vector < int > genotype_start; //variant site start positions
+ vector < int > genotype_end; //variant site end positions
+ unordered_map < string, int > genotype_id_to_idx;
+
+ //PHENOTYPES
+ int phenotype_count; //phenotype number
+ vector < vector < float > > phenotype_val; //phenotype values
+ vector < string > phenotype_id; //phenotype ids
+ vector < string > phenotype_grp; //phenotype groups
+ vector < string > phenotype_chr; //phenotype chromosomes
+ vector < int > phenotype_start; //phenotype start positions
+ vector < int > phenotype_end; //phenotype end positions
+ unordered_map < string, int > phenotype_id_to_idx;
+
+ //COVARIATES & INTERACTION
+ //Values are kept as text: readCovariates() stores one row per covariate with one string per sample.
+ int covariate_count; //covariate number
+ vector < vector < string > > covariate_val; //covariate values
+ vector < string > covariate_id; //covariate ids
+
+ //RTC
+ map < string, map < int, vector <coldspot_u *> > > coldspot_bins_p; //coldspot pointers grouped by a string key, then an int key (deduceGenotypeRegion() indexes the outer map by chromosome)
+ vector < coldspot_u *> all_coldspots_p; //owning list: every element is deleted in clear()
+ map < int , map < string , myPhenotype> > toUnite;
+
+ //CONSTRUCTOR / DESTRUCTOR
+ union_data();
+ ~union_data();
+ void clear(); //reset everything, including the coldspot tables
+ void clearNotHotspot(); //reset samples, genotypes, phenotypes and covariates but keep the coldspot tables
+ void clearSamples();
+
+ //DATA REGION
+ bool setPhenotypeRegion(string);
+ bool setGenotypeRegion(string);
+ void deduceGenotypeRegion(int);
+ void setPhenotypeRegion(int, int);
+
+ //READ DATA
+ void readGenotypes(string,string);
+ void readGenotypesVCF(string,string);
+ void readGenotypesBED(string,string);
+ void scanGenotypes(string);
+ void scanGenotypesVCF(string);
+ void scanGenotypesBED(string);
+ void readPhenotypes(string,string);
+ void scanPhenotypes(string);
+ void readCovariates(string);
+
+ //GENOTYPE & PHENOTYPE MANAGEMENT
+ void clusterizePhenotypes(int);
+ void imputeGenotypes();
+ void imputePhenotypes();
+ void residualizePhenotypes();
+ void normalTransformPhenotypes();
+ void normalTransform(vector < float > &);
+ void normalize(vector < float > &);
+ void normalize(vector < vector < float > > &);
+
+ //COMPUTATION METHODS [ALL INLINES FOR SPEED]
+ //Defined inline further down in this header.
+ double getCorrelation(vector < float > &, vector < float > &);
+ double getCorrelation(vector < float > &, vector < float > &, int);
+ double getPvalue(double, double);
+
+ //ANALYSIS
+ void readHotspots(string);
+ int getColdspot(string, int);
+ void mapVariantsToColdspots();
+ string getBestVariant(genotypes_holder&, int, double &);
+ void unions(string,int);
+ void unions_conditional(string,int);
+ void create_unions(vector <string> & , vector <string> & , vector <string> & , vector <string> &);
+ void find_unions(vector <string> & , vector <string> & , vector <string> & , vector <string> &);
+
+};
+
+//***************************************************************//
+//******************** DECLARE FUNCTIONS ************************//
+//***************************************************************//
+//Entry point of the union mode; receives the command-line tokens to parse.
+void union_main(vector < string > &);
+
+//***************************************************************//
+//******************** INLINE FUNCTIONS *************************//
+//***************************************************************//
+
+//Dot product of the first sample_count entries of vec1 and vec2.
+//For vectors that went through normalize() (centred, unit length) this is their Pearson correlation.
+//Both vectors must hold at least sample_count elements; no bounds check is performed.
+//The summation is implemented once, in the three-argument overload, instead of being duplicated here.
+inline double union_data::getCorrelation(vector < float > & vec1, vector < float > & vec2) {
+	return getCorrelation(vec1, vec2, sample_count);
+}
+
+//Dot product of the first sc entries of vec1 and vec2, unrolled four at a time.
+//Each of the four accumulators collects every fourth product; the (sc % 4) trailing products are
+//added to the first accumulator, last element first. For normalize()d inputs the result is
+//their Pearson correlation. Both vectors must hold at least sc elements (no bounds check).
+inline double union_data::getCorrelation(vector < float > & vec1, vector < float > & vec2, int sc) {
+	double acc[4] = {0, 0, 0, 0};
+	int pos = 0;
+
+	//Full blocks of four
+	for (int block = sc / 4 ; block != 0 ; block --) {
+		acc[0] += vec1[pos] * vec2[pos];
+		acc[1] += vec1[pos+1] * vec2[pos+1];
+		acc[2] += vec1[pos+2] * vec2[pos+2];
+		acc[3] += vec1[pos+3] * vec2[pos+3];
+		pos += 4;
+	}
+
+	//Remainder, highest offset first
+	for (int off = (sc % 4) - 1 ; off >= 0 ; off --) acc[0] += vec1[pos+off] * vec2[pos+off];
+
+	return acc[0] + acc[1] + acc[2] + acc[3];
+}
+
+//Converts a correlation coefficient into a p-value: the statistic df * r^2 / (1 - r^2)
+//is passed to pf() with 1 and df degrees of freedom.
+//NOTE(review): pf is presumably the F-distribution function of the R math library, called
+//here for the upper tail on the natural scale (last two arguments 0, 0) - confirm.
+inline double union_data::getPvalue(double corr, double df) {
+	const double explained = df * corr * corr;
+	const double unexplained = 1 - corr * corr;
+	return pf(explained / unexplained, 1, df, 0, 0);
+}
+
+//Among the candidate variants in `genotype`, find the one whose values have the largest absolute
+//correlation with phenotype `phenotype_idx`.
+//  genotype       candidate vectors and their IDs (index-aligned); left unmodified
+//  phenotype_idx  row of phenotype_val to test
+//  pval           output: p-value of the best absolute correlation, from getPvalue()
+//Returns the ID of the best variant, or "NA" when no candidate has a correlation above 0.
+inline string union_data::getBestVariant(genotypes_holder &genotype, int phenotype_idx, double &pval){
+	//Work on copies: normalTransform() / normalize() modify their argument in place
+	vector < float > y = phenotype_val[phenotype_idx];
+	if (options.count("normal")) normalTransform(y);
+	normalize(y);
+
+	double bestR = 0.0;
+	string bestV = "NA";
+	int size = 3;	//fallback used for the degrees of freedom when there is no candidate at all
+	for (size_t g = 0 ; g < genotype.genotypes.size() ; g ++) {
+		vector < float > x = genotype.genotypes[g];
+		size = x.size();
+		normalize(x);
+		//std::fabs keeps the value a double; an unqualified abs() can resolve to int abs(int) and truncate it
+		const double R = std::fabs(getCorrelation(x, y));
+		if (R > bestR) {
+			bestR = R;
+			bestV = genotype.ids[g];
+		}
+	}
+	pval = getPvalue(bestR, size - 2);
+	return bestV;
+}
+
+#endif
diff --git a/src/mode_union/union_initilization.cpp b/src/mode_union/union_initilization.cpp
new file mode 100644
index 0000000..ff6add9
--- /dev/null
+++ b/src/mode_union/union_initilization.cpp
@@ -0,0 +1,91 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+//Start with empty data sets and the default 0-based column layout:
+//phenotype 0, variant 6, rank 10, best 17, p-value 17.
+union_data::union_data() :
+	pvalue_column(17),
+	variant_column(6),
+	phenotype_column(0),
+	rank_column(10),
+	best_column(17),
+	coldspot_count(0),
+	no_of_files(0),
+	genotype_count(0),
+	phenotype_count(0),
+	covariate_count(0) {
+}
+
+//Reset the object to an empty state: samples, genotypes, phenotypes, covariates and the
+//coldspot tables. The coldspot objects referenced by all_coldspots_p are deleted here.
+void union_data::clear() {
+	//Samples
+	sample_count = 0;
+	sample_id.clear();
+
+	//Genotypes
+	genotype_count = 0;
+	genotype_val.clear();
+	genotype_chr.clear();
+	genotype_id.clear();
+	genotype_start.clear();
+	genotype_end.clear();
+	genotype_id_to_idx.clear();
+
+	//Phenotypes
+	phenotype_count = 0;
+	phenotype_val.clear();
+	phenotype_id.clear();
+	phenotype_grp.clear();	//used to be left behind, which could leave stale groups for setPhenotypeRegion(int, int)
+	phenotype_chr.clear();
+	phenotype_start.clear();
+	phenotype_end.clear();
+	phenotype_id_to_idx.clear();
+
+	//Covariates
+	covariate_count = 0;
+	covariate_val.clear();
+	covariate_id.clear();
+
+	//Coldspots: coldspot_bins_p only stores raw pointers and is simply emptied, the delete loop runs over all_coldspots_p
+	coldspot_bins_p.clear();
+	for (size_t i = 0 ; i < all_coldspots_p.size() ; i ++) delete all_coldspots_p[i];
+	all_coldspots_p.clear();
+	coldspot_count = 0;
+}
+
+//Reset samples, genotypes, phenotypes and covariates while keeping the coldspot tables
+//(coldspot_bins_p, all_coldspots_p, coldspot_count) untouched.
+void union_data::clearNotHotspot() {
+	//Samples
+	sample_count = 0;
+	sample_id.clear();
+
+	//Genotypes
+	genotype_count = 0;
+	genotype_val.clear();
+	genotype_chr.clear();
+	genotype_id.clear();
+	genotype_start.clear();
+	genotype_end.clear();
+	genotype_id_to_idx.clear();
+
+	//Phenotypes
+	phenotype_count = 0;
+	phenotype_val.clear();
+	phenotype_id.clear();
+	phenotype_grp.clear();	//used to be left behind, which could leave stale groups for setPhenotypeRegion(int, int)
+	phenotype_chr.clear();
+	phenotype_start.clear();
+	phenotype_end.clear();
+	phenotype_id_to_idx.clear();
+
+	//Covariates
+	covariate_count = 0;
+	covariate_val.clear();
+	covariate_id.clear();
+}
+
+
+//Releases everything through clear(), which also deletes the coldspot objects held in all_coldspots_p.
+union_data::~union_data() {
+ clear();
+}
+
+//Regress the loaded covariates out of every phenotype, in place.
+//All covariate rows are pushed into a residualizer sized for sample_count samples, the engine
+//is built once and then applied to each phenotype vector. No progress message is printed.
+void union_data::residualizePhenotypes() {
+	residualizer covariate_engine (sample_count);
+
+	//Feed the covariates
+	int c = 0;
+	while (c < covariate_count) {
+		covariate_engine.push(covariate_val[c]);
+		c ++;
+	}
+	covariate_engine.build();
+
+	//Correct the phenotypes
+	unsigned int p = 0;
+	while (p < phenotype_count) {
+		covariate_engine.residualize(phenotype_val[p]);
+		p ++;
+	}
+}
diff --git a/src/mode_union/union_main.cpp b/src/mode_union/union_main.cpp
new file mode 100644
index 0000000..43ede25
--- /dev/null
+++ b/src/mode_union/union_main.cpp
@@ -0,0 +1,153 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+//Entry point of the union mode.
+//Declares and parses the command-line options held in argv, validates them, reads the hotspot
+//file, optionally restricts the analysis to a chunk or region, and finally runs
+//create_unions() followed by find_unions() on the supplied results / VCF / BED / covariate files.
+void union_main(vector < string > & argv) {
+	union_data D;
+
+	//-------------------------
+	// 1. DECLARE ALL OPTIONS
+	//-------------------------
+	D.declareBasicOptions();	//Mandatory
+
+	boost::program_options::options_description opt_files ("\x1B[32mI/O\33[0m");
+	opt_files.add_options()
+		("vcf", boost::program_options::value< vector < string > >()->multitoken(), "Genotypes in VCF/BCF/BED format.")
+		("bed", boost::program_options::value< vector < string > >()->multitoken(), "Phenotypes in BED format.")
+		("cov", boost::program_options::value< vector < string > >()->multitoken(), "Covariates in TXT format.")
+		("hotspots", boost::program_options::value< string >(), "Hotspots in BED format.")
+		("results", boost::program_options::value< vector < string > >()->multitoken(), "QTLtools results file in TXT format.")
+		("out-suffix", boost::program_options::value< string >(), "If provided output files will be suffixed with this.");
+
+	boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
+	opt_parameters.add_options()
+		("force", "Force overwrite of union files.")
+		("normal", "Normal transform the phenotypes.")
+		("conditional", "Do conditional analysis.");
+
+	boost::program_options::options_description opt_columns ("\x1B[32mColumns (1-based)\33[0m");
+	opt_columns.add_options()
+		("pheno-col", boost::program_options::value< unsigned int >()->default_value(1), "Phenotype column")
+		("geno-col", boost::program_options::value< unsigned int >()->default_value(7), "Genotype column")
+		("rank-col", boost::program_options::value< unsigned int >()->default_value(11), "Conditional analysis rank column")
+		("best-col", boost::program_options::value< unsigned int >()->default_value(18), "Conditional analysis best variant column");
+
+	boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
+	opt_parallel.add_options()
+		("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
+		("region", boost::program_options::value< string >(), "Region of interest.")
+		("window", boost::program_options::value< unsigned int >()->default_value(1000000), "Size of the cis-window.");
+
+	D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_columns).add(opt_parallel);
+
+	//-------------------
+	// 2. PARSE OPTIONS
+	//-------------------
+	try {
+		boost::program_options::store(boost::program_options::command_line_parser(argv).options(D.option_descriptions).run(), D.options);
+		boost::program_options::notify(D.options);
+	} catch ( const boost::program_options::error& e ) {
+		cerr << "Error parsing [rtc-union] command line :" << string(e.what()) << endl;
+		//A command line that cannot be parsed is a failure: report it through the exit status
+		exit(EXIT_FAILURE);
+	}
+
+	//---------------------
+	// 3. PRINT HELP/HEADER
+	//---------------------
+	vrb.ctitle("CALCULATE UNION OF QTLS");
+	if (D.options.count("help")) {
+		cout << D.option_descriptions << endl;
+		exit(EXIT_SUCCESS);
+	}
+
+	//-----------------
+	// 4. COMMON CHECKS
+	//-----------------
+	if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
+	if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
+	if (!D.options.count("results")) vrb.error("Results needs to be specified with --results [file.txt]");
+	if (!D.options.count("hotspots")) vrb.error("Hotspots need to be specified with --hotspots [file.bed]");
+	if (D.options["pheno-col"].as < unsigned int > () < 1) vrb.error("--pheno-col must be greater than 0");
+	if (D.options["geno-col"].as < unsigned int > () < 1) vrb.error("--geno-col must be greater than 0");
+	if (D.options["rank-col"].as < unsigned int > () < 1) vrb.error("--rank-col must be greater than 0");
+	if (D.options["best-col"].as < unsigned int > () < 1) vrb.error("--best-col must be greater than 0");
+	vector < string > bedFiles = D.options["bed"].as < vector < string > > ();
+	vector < string > hitFiles = D.options["results"].as < vector < string > > ();
+	vector < string > vcfFiles = D.options["vcf"].as < vector < string > > ();
+	//--cov is optional: as<>() on an option that was not supplied throws, so read it only when present
+	vector < string > covFiles;
+	if (D.options.count("cov")) covFiles = D.options["cov"].as < vector < string > > ();
+	if (bedFiles.size() != hitFiles.size()) vrb.error("Unmatched --results and --bed files");
+	if (D.options.count("cov") && covFiles.size() != hitFiles.size()) vrb.error("Unmatched --results and --cov files");
+	if (vcfFiles.size() == 1) vrb.bullet("Single VCF provided, assuming common VCF file.");
+	else if (vcfFiles.size() != hitFiles.size()) vrb.error("Unmatched --results and --vcf files");
+	D.no_of_files = hitFiles.size();
+
+	//Refuse to overwrite existing output files unless --force is given.
+	//Output name = results file name without its directory + ".union" (+ optional suffix).
+	if (!D.options.count("force")) {
+		for (int i = 0 ; i < D.no_of_files ; i++) {
+			string name = hitFiles[i].substr(hitFiles[i].find_last_of("/") + 1) + ".union";
+			if (D.options.count("out-suffix")) name += D.options["out-suffix"].as <string> ();
+			ifstream file(name.c_str());
+			if (file.is_open()) {
+				file.close();
+				vrb.error("File [" + name + "] already exists use --force to overwrite");
+			}
+		}
+	}
+
+	//--------------
+	// 5. SET PARAMS
+	//--------------
+	if (D.options.count("chunk")) {
+		vector < int > nChunk = D.options["chunk"].as < vector < int > > ();
+		if (nChunk.size() != 2 || nChunk[0] > nChunk[1]) vrb.error("Incorrect --chunk arguments!");
+		vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
+	} else if (D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");
+	if (D.options.count("conditional")) vrb.bullet("Doing conditional analysis.");
+
+	//Columns are 1-based on the command line and 0-based internally
+	D.phenotype_column = D.options["pheno-col"].as < unsigned int > () - 1;
+	D.variant_column = D.options["geno-col"].as < unsigned int > () - 1;
+	D.rank_column = D.options["rank-col"].as < unsigned int > () - 1;
+	D.best_column = D.options["best-col"].as < unsigned int > () - 1;
+	vrb.bullet("Phenotype column (0-based) " + stb.str(D.phenotype_column));
+	vrb.bullet("Variant column (0-based) " + stb.str(D.variant_column));
+	if (D.options.count("conditional")) {
+		vrb.bullet("Rank column (0-based) " + stb.str(D.rank_column));
+		vrb.bullet("Best column (0-based) " + stb.str(D.best_column));
+	}
+
+	if (D.options.count("chunk") || D.options.count("region")) vrb.warning("--chunk or --region will not work for trans results");
+	if (D.options.count("chunk") && !D.options.count("out-suffix")) vrb.error("--out-suffix is required when --chunk. Otherwise output files will be overwritten.");
+	D.readHotspots(D.options["hotspots"].as < string > ());
+
+	//--------------
+	// 6. SET REGION
+	//--------------
+	if (D.options.count("chunk")) {
+		//Scan every phenotype file, derive the phenotype region of the requested chunk,
+		//then drop the scanned data again (the hotspot tables are kept)
+		for (int i = 0 ; i < D.no_of_files ; i++) D.scanPhenotypes(bedFiles[i]);
+		D.setPhenotypeRegion(D.options["chunk"].as < vector < int > > ()[0] - 1, D.options["chunk"].as < vector < int > > ()[1]);
+		D.clearNotHotspot();
+		D.deduceGenotypeRegion(D.options["window"].as < unsigned int > ());
+	} else if (D.options.count("region")) {
+		if (!D.setPhenotypeRegion(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
+		D.deduceGenotypeRegion(D.options["window"].as < unsigned int > ());
+	}
+
+	//-----------
+	// 7. RUN
+	//-----------
+	D.processBasicOptions();	//Mandatory
+	D.create_unions(hitFiles,vcfFiles,bedFiles,covFiles);
+	D.find_unions(hitFiles,vcfFiles,bedFiles,covFiles);
+	D.clear();
+}
diff --git a/src/mode_union/union_management.cpp b/src/mode_union/union_management.cpp
new file mode 100644
index 0000000..628ed5f
--- /dev/null
+++ b/src/mode_union/union_management.cpp
@@ -0,0 +1,257 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+//Replace every missing genotype value (bcf_float_missing) by the mean of the non-missing
+//values of the same variant.
+//NOTE(review): if all samples of a variant are missing, the divisor is 0 and the variant is
+//filled with NaN - confirm that callers never reach this case.
+void union_data::imputeGenotypes() {
+	vrb.title("Imputing missing genotypes");
+	for (int g = 0 ; g < genotype_count ; g ++) {
+		//Mean over the observed values
+		double total = 0.0;
+		int n_observed = 0;
+		for (int s = 0 ; s < sample_count ; s ++) {
+			if (genotype_val[g][s] == bcf_float_missing) continue;
+			total += genotype_val[g][s];
+			n_observed ++;
+		}
+		const double average = total / n_observed;
+
+		//Fill the gaps
+		for (int s = 0 ; s < sample_count ; s ++) {
+			if (genotype_val[g][s] == bcf_float_missing) genotype_val[g][s] = average;
+		}
+	}
+}
+
+//Replace every missing phenotype value (bcf_float_missing) by the mean of the non-missing
+//values of the same phenotype.
+//NOTE(review): if all samples of a phenotype are missing, the divisor is 0 and the phenotype
+//is filled with NaN - confirm that callers never reach this case.
+void union_data::imputePhenotypes() {
+	vrb.title("Imputing missing phenotypes");
+	for (int p = 0 ; p < phenotype_count ; p ++) {
+		//Mean over the observed values
+		double total = 0.0;
+		int n_observed = 0;
+		for (int s = 0 ; s < sample_count ; s ++) {
+			if (phenotype_val[p][s] == bcf_float_missing) continue;
+			total += phenotype_val[p][s];
+			n_observed ++;
+		}
+		const double average = total / n_observed;
+
+		//Fill the gaps
+		for (int s = 0 ; s < sample_count ; s ++) {
+			if (phenotype_val[p][s] == bcf_float_missing) phenotype_val[p][s] = average;
+		}
+	}
+}
+
+//Rank-based transform of V to normal quantiles, in place.
+//The values are ranked with myranker::rank(); each rank is shifted by -0.5 and divided by
+//(largest shifted rank + 0.5), and the resulting fraction is mapped through qnorm() with mean 0
+//and standard deviation 1. Only the first sample_count entries of V are processed.
+void union_data::normalTransform(vector < float > & V) {
+	vector < float > ranks;
+	myranker::rank(V, ranks);
+
+	//Shift the ranks and remember the largest one
+	double top = 0;
+	for (int s = 0 ; s < sample_count ; s ++) {
+		ranks[s] -= 0.5;
+		if (ranks[s] > top) top = ranks[s];
+	}
+	top += 0.5;
+
+	//Rank fraction -> normal quantile
+	for (int s = 0 ; s < sample_count ; s ++) {
+		ranks[s] /= top;
+		V[s] = qnorm(ranks[s], 0.0, 1.0, 1, 0);
+	}
+}
+
+//Apply normalTransform() to each of the phenotype_count phenotype vectors.
+void union_data::normalTransformPhenotypes() {
+	vrb.title("Match phenotypes to Normal distribution");
+	int p = 0;
+	while (p < phenotype_count) {
+		normalTransform(phenotype_val[p]);
+		p ++;
+	}
+}
+
+//Centre X on its mean and scale it to unit Euclidean length, in place.
+//A vector whose centred values are all zero is left at zero (the divisor falls back to 1).
+void union_data::normalize(vector < float > & X) {
+	const size_t n = X.size();
+
+	//Mean
+	double mean = 0.0;
+	for (size_t s = 0 ; s < n ; s ++) mean += X[s];
+	mean /= n;
+
+	//Centre and accumulate the squared length
+	double norm = 0.0;
+	for (size_t s = 0 ; s < n ; s ++) {
+		X[s] -= mean;
+		norm += X[s] * X[s];
+	}
+	norm = sqrt(norm);
+	if (norm == 0) norm = 1;
+
+	//Scale
+	for (size_t s = 0 ; s < n ; s ++) X[s] /= norm;
+}
+
+
+//Row-wise version of normalize(): every row of X is centred on its mean and scaled to unit
+//Euclidean length over its first sample_count entries, in place.
+//A row whose centred values are all zero is left at zero (the divisor falls back to 1).
+void union_data::normalize(vector < vector < float > > & X) {
+	for (size_t x = 0 ; x < X.size() ; x ++) {
+		vector < float > & row = X[x];
+
+		//Mean
+		double mean = 0.0;
+		for (int s = 0 ; s < sample_count ; s ++) mean += row[s];
+		mean /= sample_count;
+
+		//Centre and accumulate the squared length
+		double norm = 0.0;
+		for (int s = 0 ; s < sample_count ; s ++) {
+			row[s] -= mean;
+			norm += row[s] * row[s];
+		}
+		norm = sqrt(norm);
+		if (norm == 0) norm = 1;
+
+		//Scale
+		for (int s = 0 ; s < sample_count ; s ++) row[s] /= norm;
+	}
+}
+
+//Parse the region string `reg` into regionPhenotype and return the result of genomic_region::parse().
+//union_main() treats a false return as "impossible to interpret region".
+bool union_data::setPhenotypeRegion(string reg) {
+ return regionPhenotype.parse(reg);
+}
+
+//Parse the region string `reg` into regionGenotype and return the result of genomic_region::parse().
+bool union_data::setGenotypeRegion(string reg) {
+ return regionGenotype.parse(reg);
+}
+
+//Derive regionGenotype from regionPhenotype: same chromosome, boundaries moved outwards by W and
+//then adjusted using the value returned by getColdspot() for each boundary.
+//  lower bound: 0 when (phenotype start - W) is negative; otherwise the start of coldspot `hit`
+//               when hit > 0, the end of the last coldspot stored in the last bin of the chromosome
+//               when hit == -1, and the shifted position itself in every other case.
+//  upper bound: the end of coldspot `hit` when hit > 0, the shifted position + 1000000000 when
+//               hit == -1, and the shifted position itself in every other case.
+//NOTE(review): a return value of 0 is handled like "no adjustment"; check against getColdspot()
+//whether index 0 can be a valid coldspot. Also, the hit == -1 branch for the lower bound
+//dereferences rbegin() of coldspot_bins_p[chr], which assumes the chromosome has at least one bin.
+void union_data::deduceGenotypeRegion(int W) {
+	regionGenotype.chr = regionPhenotype.chr;
+	const string & chr = regionGenotype.chr;
+
+	//Lower bound
+	const int lower = regionPhenotype.start - W;
+	if (lower >= 0) {
+		const int hit = getColdspot(chr, lower);
+		if (hit > 0) regionGenotype.start = all_coldspots_p[hit]->start;
+		else if (hit == -1) regionGenotype.start = coldspot_bins_p[chr].rbegin()->second.back()->end;
+		else regionGenotype.start = lower;
+	} else regionGenotype.start = 0;
+
+	//Upper bound
+	const int upper = regionPhenotype.end + W;
+	const int hit = getColdspot(chr, upper);
+	if (hit > 0) regionGenotype.end = all_coldspots_p[hit]->end;
+	else if (hit == -1) regionGenotype.end = upper + 1000000000;
+	else regionGenotype.end = upper;
+}
+
+//Interval [start, end] on chromosome chr, used while chunking phenotypes.
+//Intervals can be widened (merge), tested for overlap on the same chromosome, and ordered by
+//chromosome name first and start position second.
+class pgroup {
+public:
+	int start, end;
+	string chr;
+
+	pgroup(string pc, int ps, int pe) : start(ps), end(pe), chr(pc) {}
+
+	//Widen the interval so that it also covers [ps, pe]
+	void merge(int ps, int pe) {
+		start = (ps < start) ? ps : start;
+		end = (pe > end) ? pe : end;
+	}
+
+	//Widen the interval so that it also covers p (the chromosome of p is not checked)
+	void merge(pgroup & p) {
+		merge(p.start, p.end);
+	}
+
+	//True when p lies on the same chromosome and shares at least one position with this interval
+	bool overlap(pgroup & p) {
+		return chr == p.chr && start <= p.end && p.start <= end;
+	}
+
+	//Order by chromosome name, then by start position
+	bool operator < (pgroup const & p) const {
+		if (chr != p.chr) return chr < p.chr;
+		return start < p.start;
+	}
+};
+
+//Split the scanned phenotypes into at most K genomic chunks and store the coordinates of
+//chunk k in regionPhenotype.
+//  k  0-based index of the chunk to extract (union_main() passes the user value minus one)
+//  K  requested number of chunks
+//Invalid arguments are reported through vrb.error(); when fewer than k+1 chunks can be formed
+//the function ends with vrb.leave("Empty chunk, ...").
+void union_data::setPhenotypeRegion(int k, int K) {
+ //STEP0: check input values
+ if (K < 1) vrb.error("Number of chunks needs to be > 0");
+ if (K > phenotype_count) vrb.error("Number of chunks (" + stb.str(K) + ") is greater than the number of phenotypes (" + stb.str(phenotype_count) + ")");
+ if (k < 0) vrb.error("Chunk index needs to be > 0");
+ if (k >= K) vrb.error("Chunk index needs to be smaller than the total number of chunks [=" + stb.str(K) + "]");
+
+ //STEP1: regroup by group
+ //With group labels, phenotypes sharing a label are collapsed into one interval spanning all of
+ //them (the chromosome is taken from the first member); without labels every phenotype is its own interval.
+ vector < pgroup > v_pgroup;
+ if (phenotype_grp.size() > 0) {
+ map < string, int > grp2idx;
+ map < string, int > :: iterator it_grp2idx;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ it_grp2idx = grp2idx.find (phenotype_grp[p]);
+ if (it_grp2idx == grp2idx.end()) {
+ grp2idx.insert(pair < string, int > (phenotype_grp[p], v_pgroup.size()));
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ } else v_pgroup[it_grp2idx->second].merge(phenotype_start[p], phenotype_end[p]);
+ }
+ } else {
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ v_pgroup.push_back(pgroup(phenotype_chr[p], phenotype_start[p], phenotype_end[p]));
+ }
+ }
+ sort(v_pgroup.begin(), v_pgroup.end());
+
+ //STEP2: merge overlapping groups
+ //The intervals are visited in sorted order; each one is either merged into the interval on top
+ //of the stack (when they overlap) or pushed as a new entry.
+ //v_pgroup[0] relies on STEP0 having rejected an empty phenotype set (K >= 1 > phenotype_count).
+ stack < pgroup > s_pgroup;
+ s_pgroup.push(v_pgroup[0]);
+ for (int i = 1 ; i < v_pgroup.size(); i++) {
+ pgroup ptop = s_pgroup.top();
+ if (!ptop.overlap(v_pgroup[i])) s_pgroup.push(v_pgroup[i]);
+ else {
+ ptop.merge(v_pgroup[i]);
+ s_pgroup.pop();
+ s_pgroup.push(ptop);
+ }
+ }
+ //Unloading the stack reverses the order, hence the second sort
+ v_pgroup.clear();
+ while (!s_pgroup.empty()) {
+ v_pgroup.push_back(s_pgroup.top());
+ s_pgroup.pop();
+ }
+ sort(v_pgroup.begin(), v_pgroup.end());
+
+ //STEP3: build one cluster per chromosome
+ //cluster_idx[c] lists the indices (into v_pgroup) of the intervals belonging to cluster c
+ vector < vector < int > > cluster_idx;
+ map < string , int > chr2idx;
+ for (int p = 0 ; p < v_pgroup.size() ; p ++) {
+ map < string , int > :: iterator it_chr2idx = chr2idx.find(v_pgroup[p].chr);
+ if (it_chr2idx == chr2idx.end()) {
+ chr2idx.insert(make_pair(v_pgroup[p].chr, cluster_idx.size()));
+ cluster_idx.push_back(vector < int > (1, p));
+ } else cluster_idx[it_chr2idx->second].push_back(p);
+ }
+
+ //STEP4: split until number of chunks is reached
+ //Repeatedly cut the largest cluster in two halves; stops when K clusters exist or when no
+ //cluster holds more than one interval (max_val starts at 1, so single-interval clusters are never picked).
+ bool done = (cluster_idx.size() >= K);
+ while (!done) {
+
+ int max_idx = -1, max_val = 1;
+ for (int p = 0 ; p < cluster_idx.size() ; p ++) {
+ if (cluster_idx[p].size() > max_val) {
+ max_val = cluster_idx[p].size();
+ max_idx = p;
+ }
+ }
+
+ if (max_idx >= 0) {
+ int max_mid = cluster_idx[max_idx].size() / 2;
+ cluster_idx.push_back(vector < int > (cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end()));
+ cluster_idx[max_idx].erase(cluster_idx[max_idx].begin() + max_mid, cluster_idx[max_idx].end());
+ if (cluster_idx.size() >= K) done = true;
+ } else done = true;
+ }
+
+ //STEP5: extract coordinates
+ //The region of chunk k spans from the smallest start to the largest end of its intervals
+ if (k < cluster_idx.size()) {
+ regionPhenotype.chr = v_pgroup[cluster_idx[k][0]].chr;
+ regionPhenotype.start = 1000000000;
+ regionPhenotype.end = 0;
+ for (int c = 0 ; c < cluster_idx[k].size() ; c ++) {
+ if (v_pgroup[cluster_idx[k][c]].start < regionPhenotype.start) regionPhenotype.start = v_pgroup[cluster_idx[k][c]].start;
+ if (v_pgroup[cluster_idx[k][c]].end > regionPhenotype.end) regionPhenotype.end = v_pgroup[cluster_idx[k][c]].end;
+ }
+ } else vrb.leave("Empty chunk, no data to process!");
+}
+
+//Forget the sample bookkeeping: counters back to zero, sample ID and occurrence lists emptied.
+void union_data::clearSamples(){
+	file_count = 0;
+	sample_count = 0;
+	sample_id.clear();
+	sample_occurrence.clear();
+}
diff --git a/src/mode_union/union_read_covariates.cpp b/src/mode_union/union_read_covariates.cpp
new file mode 100644
index 0000000..7e9583c
--- /dev/null
+++ b/src/mode_union/union_read_covariates.cpp
@@ -0,0 +1,57 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+//Load the covariate file `fcov`, replacing any covariates read before.
+//Expected layout: a header line whose fields after the first are sample IDs, followed by one
+//line per covariate (covariate ID first, then one value per header sample).
+//Covariates rejected by filter_covariate are skipped. Values of samples unknown to findSample()
+//are ignored; values are stored as text in covariate_val[covariate][sample index].
+//Errors (unreadable file, missing header, malformed row) are reported through vrb.error().
+//No progress message is printed.
+void union_data::readCovariates(string fcov) {
+	covariate_val.clear();
+	covariate_count = 0;
+
+	input_file fd (fcov);
+	if (fd.fail()) vrb.error("Cannot open file!");
+
+	//Read samples: mappingS[t-1] is the findSample() result for header column t (negative = not in the analysis)
+	string buffer;
+	vector < string > str;
+	vector < int > mappingS;
+	getline(fd, buffer);
+	if (buffer.size() == 0) vrb.error("No header line detected!");
+	stb.split(buffer, str);
+	for (size_t t = 1 ; t < str.size() ; t ++) mappingS.push_back(findSample(str[t]));
+
+	//Read covariates
+	int n_includedC = 0;
+	while (getline(fd, buffer)) {
+		stb.split(buffer, str);
+		//A row wider than the header would index mappingS past its end, so it is rejected like a too-short row
+		if (str.size() < 2 || str.size() > mappingS.size() + 1) vrb.error("Incorrect number of columns!");
+		if (!filter_covariate.check(str[0])) continue;
+
+		covariate_val.push_back(vector < string > (sample_count));
+		vector < string > & row = covariate_val.back();
+		for (size_t t = 1 ; t < str.size() ; t ++) {
+			if (mappingS[t-1] >= 0) row[mappingS[t-1]] = str[t];
+		}
+		n_includedC ++;
+	}
+
+	//Finalise
+	covariate_count = n_includedC;
+	fd.close();
+}
diff --git a/src/mode_union/union_read_genotypes.cpp b/src/mode_union/union_read_genotypes.cpp
new file mode 100644
index 0000000..f21735e
--- /dev/null
+++ b/src/mode_union/union_read_genotypes.cpp
@@ -0,0 +1,374 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+// Opens `filename` once with htslib to learn its format, then hands the file
+// and `region` to the matching reader: BCF and VCF go to readGenotypesVCF(),
+// files htslib reports as `sam` go to readGenotypesBED(). Any other format is
+// reported through vrb.error().
+void union_data::readGenotypes(string filename, string region) {
+    //vrb.title("Reading genotype data in [" + filename + "]");
+    htsFile * fp = hts_open(filename.c_str(),"r");
+    // hts_open() returns NULL on failure; the format field must not be read then.
+    if (!fp) vrb.error("Cannot open file!");
+    enum htsExactFormat fileformat = fp->format.format;
+    hts_close(fp);
+    if (fileformat == bcf) {
+        //vrb.bullet("File format detected: BCF");
+        readGenotypesVCF(filename,region);
+    } else if (fileformat == vcf) {
+        //vrb.bullet("File format detected: VCF");
+        readGenotypesVCF(filename,region);
+    } else if (fileformat == sam) {
+        //vrb.bullet("File format detected: BED");
+        readGenotypesBED(filename,region);
+    } else vrb.error("File format not supported!");
+}
+
+// Opens `filename` once with htslib to learn its format, then hands it to the
+// matching scanner: BCF and VCF go to scanGenotypesVCF(), files htslib reports
+// as `sam` go to scanGenotypesBED(). Any other format is reported through
+// vrb.error().
+void union_data::scanGenotypes(string filename) {
+    vrb.title("Scanning genotype data in [" + filename + "]");
+    htsFile * fp = hts_open(filename.c_str(),"r");
+    // hts_open() returns NULL on failure; the format field must not be read then.
+    if (!fp) vrb.error("Cannot open file!");
+    enum htsExactFormat fileformat = fp->format.format;
+    hts_close(fp);
+    if (fileformat == bcf) {
+        vrb.bullet("File format detected: BCF");
+        scanGenotypesVCF(filename);
+    } else if (fileformat == vcf) {
+        vrb.bullet("File format detected: VCF");
+        scanGenotypesVCF(filename);
+    } else if (fileformat == sam) {
+        vrb.bullet("File format detected: BED");
+        scanGenotypesBED(filename);
+    } else vrb.error("File format not supported!");
+}
+
+
+// Reads the variants of `fvcf` (VCF/BCF) that fall in `region` and rebuilds
+// the genotype_* containers and genotype_id_to_idx from scratch.
+// Only records with exactly two alleles are kept, and only when either the DS
+// format field has one value per sample or GT has two values per sample.
+// Per-sample values come from DS when it was read, otherwise from the sum of
+// the two GT allele indexes (a missing GT allele stores bcf_float_missing).
+// The end coordinate is INFO/END when available, else start + len(REF) - 1.
+// Samples for which findSample() returns a negative index are ignored.
+void union_data::readGenotypesVCF(string fvcf,string region) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    vector < int > mappingS;
+    genotype_id.clear();
+    genotype_chr.clear();
+    genotype_start.clear();
+    genotype_end.clear();
+    genotype_val.clear();
+    genotype_count=0;
+    genotype_id_to_idx.clear();
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+
+    //vrb.bullet("target region [" + regionGenotype.get() + "]");
+    //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
+    bcf_sr_set_regions(sr, region.c_str(), 0);
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        // No break between cases: presumably vrb.error() does not return -- TODO confirm.
+        switch (sr->errnum) {
+            case not_bgzf: vrb.error("File not compressed with bgzip!");
+            case idx_load_failed: vrb.error("Impossible to load index file!");
+            case file_type_error: vrb.error("File format not detected by htslib!");
+            default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+
+
+    //Read genotype data
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    unsigned int linecount = 0;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            // htslib (re)allocates gt_arr / ds_arr as needed and reports the capacity in *_arr.
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                if (filter_genotype.check(sid)) {
+                    genotype_id.push_back(sid);
+                    genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
+                    string genotype_ref = string(line->d.allele[0]);
+                    genotype_start.push_back(line->pos + 1);
+                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
+                    else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
+                    genotype_val.push_back(vector < float > (sample_count, 0.0));
+
+                    for(int i = 0 ; i < n_samples ; i ++) {
+                        if (mappingS[i] >= 0) {
+                            if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
+                            else {
+                                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
+                                else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
+                            }
+                        }
+                    }
+                    pair < string, int > temp (sid,n_includedG);
+                    genotype_id_to_idx.insert(temp);
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize
+    free(gt_arr);
+    free(ds_arr);
+    free(sl_arr);   // buffer filled by bcf_get_info_int32(); free(NULL) is a no-op
+    bcf_sr_destroy(sr);
+    genotype_count = n_includedG;
+    //vrb.bullet(stb.str(n_includedG) + " variants included");
+    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
+}
+
+// Reads the records of the tabix-indexed BED file `fbed` that fall in `region`
+// and rebuilds the genotype_* containers and genotype_id_to_idx from scratch.
+// The first line must start with the index's meta character; its columns 7+
+// are sample IDs, kept only if they pass filter_sample and are known to
+// findSample(). In data lines, columns 1-4 are chromosome, start, end and ID,
+// and columns 7+ are one value per header sample ("NA" is stored as
+// bcf_float_missing). The stored start is the BED start + 1.
+void union_data::readGenotypesBED(string fbed,string region) {
+    string buffer;
+    int n_includedG = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    int n_excludedS = 0;
+    int n_missingS = 0;
+    vector < int > mappingS;
+    genotype_id.clear();
+    genotype_chr.clear();
+    genotype_start.clear();
+    genotype_end.clear();
+    genotype_val.clear();
+    genotype_count=0;
+    genotype_id_to_idx.clear();
+    //Opening files
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot load index file!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (size_t i0 = 6 ; i0 < tokens.size() ; i0 ++) {
+        string sid = tokens[i0];
+        if (filter_sample.check(sid)) {
+            mappingS.push_back(findSample(sid));
+            if (mappingS.back() >= 0) n_includedS ++;
+            else n_missingS ++;
+        } else {
+            mappingS.push_back(-1);
+            n_excludedS ++;
+        }
+    }
+    //vrb.bullet(stb.str(n_includedS) + " samples included");
+    //if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
+    //if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
+    //if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");
+
+    unsigned int linecount = 0;
+
+    //Jump to interesting region
+
+    hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
+    //vrb.bullet("target region [" + regionGenotype.get() + "]");
+    //if (!itr) vrb.error("Cannot jump to region!");
+    while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        stb.split(string(str.s), tokens);
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        // A row wider than the header would index mappingS out of bounds below.
+        if (tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+        if (filter_genotype.check(tokens[3])) {
+            genotype_id.push_back(tokens[3]);
+            genotype_chr.push_back(tokens[0]);
+            genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+            genotype_end.push_back(atoi(tokens[2].c_str()));
+            genotype_val.push_back(vector < float > (sample_count, 0.0));
+            for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                if (mappingS[t-6] >= 0) {
+                    if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                    else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                }
+            }
+            pair < string, int > temp (tokens[3],n_includedG);
+            genotype_id_to_idx.insert(temp);
+            n_includedG++;
+        } else n_excludedG_user ++;
+    }
+    tbx_itr_destroy(itr);
+
+
+    //Finalize & verbose
+    free(str.s);   // line buffer grown by hts_getline() / tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file!");
+    genotype_count = n_includedG;
+    //vrb.bullet(stb.str(n_includedG) + " variants included");
+    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    //if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
+}
+
+
+// Scans the VCF/BCF file `fvcf` and appends to genotype_id / genotype_chr /
+// genotype_start / genotype_end every bi-allelic variant that is not already
+// in genotype_id_to_idx and passes filter_genotype. No per-sample values are
+// stored. The scan is restricted to regionGenotype unless its chr is "NA".
+// genotype_count is increased by the number of newly added variants.
+void union_data::scanGenotypesVCF(string fvcf) {
+    int n_includedG = 0;
+    int n_excludedG_mult = 0;
+    int n_excludedG_void = 0;
+    int n_excludedG_user = 0;
+    int n_includedS = 0;
+    vector < int > mappingS;
+
+    //Opening files
+    bcf_srs_t * sr = bcf_sr_init();
+
+    if ( regionGenotype.chr != "NA"){
+        vrb.bullet("target region [" + regionGenotype.get() + "]");
+        if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
+    }
+
+    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+        // No break between cases: presumably vrb.error() does not return -- TODO confirm.
+        switch (sr->errnum) {
+            case not_bgzf: vrb.error("File not compressed with bgzip!");
+            case idx_load_failed: vrb.error("Impossible to load index file!");
+            case file_type_error: vrb.error("File format not detected by htslib!");
+            default : vrb.error("Unknown error!");
+        }
+    }
+
+    //Sample processing
+    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+
+    //Read genotype data
+    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+    float * ds_arr = NULL;
+    bcf1_t * line;
+    unsigned int linecount = 0;
+    while(bcf_sr_next_line (sr)) {
+        linecount ++;
+        if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        line = bcf_sr_get_line(sr, 0);
+        if (line->n_allele == 2) {
+            // GT / DS are fetched only to decide whether the record is informative.
+            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
+            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+            if (nds == n_samples || ngt == 2*n_samples) {
+                bcf_unpack(line, BCF_UN_STR);
+                string sid = string(line->d.id);
+                // Already registered by an earlier scan: not counted again.
+                if (genotype_id_to_idx.count(sid)) continue;
+                if (filter_genotype.check(sid)) {
+                    genotype_id.push_back(sid);
+                    genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
+                    string genotype_ref = string(line->d.allele[0]);
+                    genotype_start.push_back(line->pos + 1);
+                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
+                    if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
+                    else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
+                    pair < string, int > temp (sid,genotype_id_to_idx.size());
+                    genotype_id_to_idx.insert(temp);
+                    n_includedG++;
+                } else n_excludedG_user ++;
+            } else n_excludedG_void ++;
+        } else n_excludedG_mult ++;
+    }
+
+    //Finalize
+    free(gt_arr);
+    free(ds_arr);
+    free(sl_arr);   // buffer filled by bcf_get_info_int32(); free(NULL) is a no-op
+    bcf_sr_destroy(sr);
+    genotype_count += n_includedG;
+    vrb.bullet(stb.str(n_includedG) + " new variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
+    if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
+    if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
+}
+
+// Scans the tabix-indexed BED file `fbed` and appends to genotype_id /
+// genotype_chr / genotype_start / genotype_end every record whose ID (column
+// 4) is not already in genotype_id_to_idx and passes filter_genotype. No
+// per-sample values are stored. When regionGenotype.chr is not "NA" only that
+// region is visited through the index; otherwise the whole file is read and
+// lines starting with the index's meta character are skipped.
+// genotype_count is increased by the number of newly added variants.
+void union_data::scanGenotypesBED(string fbed) {
+    string buffer;
+    int n_includedG = 0;
+    int n_excludedG_user = 0;
+
+    //Opening files
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file!");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot load index file!");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Read genotype data
+    vector < string > tokens;
+    unsigned int linecount = 0;
+
+    // Registers the record currently held in `tokens`; shared by both scan loops.
+    auto registerRecord = [&] () {
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        if (genotype_id_to_idx.count(tokens[3])) return;   // already known from an earlier scan
+        if (filter_genotype.check(tokens[3])) {
+            genotype_id.push_back(tokens[3]);
+            genotype_chr.push_back(tokens[0]);
+            genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+            genotype_end.push_back(atoi(tokens[2].c_str()));
+            pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
+            genotype_id_to_idx.insert(temp);
+            n_includedG++;
+        } else n_excludedG_user ++;
+    };
+
+    //Jump to interesting region
+    if (regionGenotype.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
+        vrb.bullet("target region [" + regionGenotype.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            linecount ++;
+            if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            registerRecord();
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            linecount ++;
+            if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            if (str.l && str.s[0] != tbx->conf.meta_char) {
+                stb.split(string(str.s), tokens);
+                registerRecord();
+            }
+        }
+    }
+
+    //Finalize & verbose
+    free(str.s);   // line buffer grown by hts_getline() / tbx_itr_next()
+    tbx_destroy(tbx);
+    genotype_count += n_includedG;
+    if (hts_close(fp)) vrb.error("Cannot properly close file!");
+    vrb.bullet(stb.str(n_includedG) + " new variants included");
+    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
+    // Test the cumulative count, as scanGenotypesVCF() does: a file that adds
+    // nothing new because all its variants are already known is not a failure.
+    if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
+}
+
+
diff --git a/src/mode_union/union_read_get_hotspots.cpp b/src/mode_union/union_read_get_hotspots.cpp
new file mode 100644
index 0000000..7c7bdc4
--- /dev/null
+++ b/src/mode_union/union_read_get_hotspots.cpp
@@ -0,0 +1,88 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+// Reads hotspot intervals (chromosome, start, end in the first three columns)
+// from `fcov` and registers an alternating series of regions per chromosome:
+//   "CS" from the previous hotspot's end + 1 (or 0) up to the hotspot's start,
+//   "HS" from the hotspot's start + 1 up to its end,
+//   and a final "CS" running to INT_MAX once a chromosome is finished.
+// Every region gets the next running index, is appended to all_coldspots_p and
+// is referenced from each bin (position / bin_size) it overlaps in
+// coldspot_bins_p. Hotspots must be sorted by start within a chromosome.
+void union_data::readHotspots(string fcov) {
+    string buffer;
+    vector < string > str;
+    int idx = 0;
+    vrb.title("Reading hotspots in [" + fcov + "]");
+    input_file fd (fcov);
+    if(fd.fail()) vrb.error("Cannot open file");
+    coldspot_u prev;
+
+    // Creates one region, stores it and links it from every bin it spans.
+    auto addRegion = [&] (string chr, int s, int e, const char * type) {
+        coldspot_u *pCs = new coldspot_u(chr,s,e,idx,type);
+        all_coldspots_p.push_back(pCs);
+        int sb = s / bin_size;
+        int eb = e / bin_size;
+        for (int b = sb; b <= eb; b++) coldspot_bins_p[chr][b].push_back(pCs);
+        idx++;
+        coldspot_count++;
+    };
+
+    //Read hotspots
+    while(getline(fd, buffer)) {
+        stb.split(buffer, str);
+        if (str.size() < 3) vrb.error("Wrong hotspot file format");
+        int hs_start = atoi(str[1].c_str()) + 1;
+        int hs_end = atoi(str[2].c_str());
+        if (hs_end < hs_start) vrb.error("Hotspot end cannot be smaller than start " + buffer);
+        // Compare against the previous hotspot BEFORE `prev` is replaced by
+        // the current one; otherwise the test compares a record with itself.
+        if (prev.chr == str[0] && prev.start > hs_start) vrb.error("Hotspots are not sorted at " + buffer);
+
+        // Chromosome changed: close the previous one with a trailing coldspot.
+        if (prev.chr != "" && prev.chr != str[0]) addRegion(prev.chr, prev.end + 1, INT_MAX, "CS");
+
+        // Coldspot preceding this hotspot, then the hotspot itself.
+        addRegion(str[0], prev.chr == str[0] ? prev.end + 1 : 0, hs_start - 1, "CS");
+        addRegion(str[0], hs_start, hs_end, "HS");
+        prev = coldspot_u(str[0],hs_start,hs_end,-1,"NA");
+    }
+
+    // Trailing coldspot of the last chromosome; skipped when no hotspot was
+    // read so that an empty file reaches the "No coldspots found" error.
+    if (prev.chr != "") addRegion(prev.chr, prev.end + 1, INT_MAX, "CS");
+
+    //Finalise
+    if (!coldspot_count) vrb.error("No coldspots found");
+    vrb.bullet(stb.str(coldspot_count) + " coldspots included");
+    fd.close();
+
+    //for (int i = 0 ; i < coldspot_count; i++ ) cerr << (*all_coldspots_p[i]);
+}
+
+
+// Looks up the region registered in coldspot_bins_p that contains position
+// `pos` on chromosome `chr` and returns its idx.
+// Negative return codes:
+//   -1  pos lies beyond the end of the last region in the chromosome's last bin
+//   -2  the bin for pos exists but none of its regions contains pos
+//   -3  no bin is registered for pos
+//   -4  nothing is registered for this chromosome
+int union_data::getColdspot(string chr, int pos){
+    auto chrIt = coldspot_bins_p.find(chr);
+    if (chrIt == coldspot_bins_p.end()) return -4; //no hospot found for this chr
+
+    auto & bins = chrIt->second;
+    const int lastEnd = bins.rbegin()->second.back()->end;
+    if (pos > lastEnd) return -1; //after the last hotspot on this chr
+
+    auto binIt = bins.find(pos / bin_size);
+    if (binIt == bins.end()) return -3; //in a hotspot BUT SINCE BIN IS 1MB SHOULD NOT HAPPEN
+
+    for (auto region : binIt->second) {
+        if (region->start <= pos && region->end >= pos) return region->idx;
+    }
+    return -2; //in a hotspot
+}
diff --git a/src/mode_union/union_read_phenotypes.cpp b/src/mode_union/union_read_phenotypes.cpp
new file mode 100644
index 0000000..a2654be
--- /dev/null
+++ b/src/mode_union/union_read_phenotypes.cpp
@@ -0,0 +1,153 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+// Reads the records of the tabix-indexed BED file `fbed` that fall in `region`
+// and rebuilds the phenotype_* containers and phenotype_id_to_idx from
+// scratch. The first line must start with the index's meta character; its
+// columns 7+ are sample IDs resolved with findSample(). In data lines, columns
+// 1-4 are chromosome, start, end and ID, and columns 7+ are one value per
+// header sample ("NA" is stored as bcf_float_missing). The stored start is the
+// BED start + 1. Records rejected by filter_phenotype are skipped.
+void union_data::readPhenotypes(string fbed, string region) {
+    int n_includedS = 0;
+    int n_includedP = 0;
+    int n_excludedP = 0;
+    vector < int > mappingS;
+    phenotype_id.clear();
+    phenotype_chr.clear();
+    phenotype_start.clear();
+    phenotype_end.clear();
+    phenotype_val.clear();
+    phenotype_count=0;
+    phenotype_id_to_idx.clear();
+    //Open BED file
+    //vrb.title("Reading phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t *tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+    kstring_t str = {0,0,0};
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
+
+    //Process sample names
+    vector < string > tokens;
+    stb.split(string(str.s), tokens);
+    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+    for (size_t t = 6 ; t < tokens.size() ; t ++) {
+        mappingS.push_back(findSample(tokens[t]));
+        if (mappingS.back() >= 0) n_includedS++;
+    }
+    unsigned int linecount =0;
+
+    //Read phenotypes
+    hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
+    //vrb.bullet("target region [" + regionPhenotype.get() + "]");
+    //if (!itr) vrb.error("Cannot jump to region!");
+    //Read data
+    while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+        linecount ++;
+        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+        stb.split(string(str.s), tokens);
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        // A row wider than the header would index mappingS out of bounds below.
+        if (tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
+        if (filter_phenotype.check(tokens[3])) {
+            phenotype_id.push_back(tokens[3]);
+            phenotype_chr.push_back(tokens[0]);
+            phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+            phenotype_end.push_back(atoi(tokens[2].c_str()));
+            phenotype_val.push_back(vector < float > (sample_count, 0.0));
+            for (size_t t = 6 ; t < tokens.size() ; t ++) {
+                if (mappingS[t-6] >= 0) {
+                    if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
+                    else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
+                }
+            }
+            pair < string, int > temp (tokens[3],n_includedP);
+            phenotype_id_to_idx.insert(temp);
+            n_includedP++;
+        } else n_excludedP ++;
+    }
+    tbx_itr_destroy(itr);
+    //Finalize & verbose
+    free(str.s);   // line buffer grown by hts_getline() / tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    phenotype_count = phenotype_id.size();
+    //vrb.bullet(stb.str(n_includedP) + " phenotypes included");
+    //if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    //if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
+}
+
+// Scans the tabix-indexed BED file `fbed` and appends to phenotype_id /
+// phenotype_chr / phenotype_start / phenotype_end every record whose ID
+// (column 4) is not already in phenotype_id_to_idx and passes
+// filter_phenotype. No per-sample values are stored. When regionPhenotype.chr
+// is not "NA" only that region is visited through the index; otherwise the
+// whole file is read and lines starting with the index's meta character are
+// skipped. phenotype_count is set to the total number of known phenotypes.
+void union_data::scanPhenotypes(string fbed) {
+    int n_includedP = 0;
+    int n_excludedP = 0;
+
+    //Open BED file
+    vrb.title("Scanning phenotype data in [" + fbed + "]");
+    htsFile *fp = hts_open(fbed.c_str(),"r");
+    if (!fp) vrb.error("Cannot open file");
+    tbx_t * tbx = tbx_index_load(fbed.c_str());
+    if (!tbx) vrb.error("Cannot open index file");
+
+    //Read header
+    kstring_t str = {0,0,0};
+    // Same `<= 0` test as the other BED readers in this directory: it rejects
+    // both a failed read and an empty first line.
+    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");
+
+    //Scan file
+    vector < string > tokens;
+    unsigned int linecount =0;
+
+    // Registers the record currently held in `tokens`; shared by both scan loops.
+    auto registerRecord = [&] () {
+        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
+        if (phenotype_id_to_idx.count(tokens[3])) return;   // already known from an earlier scan
+        if (filter_phenotype.check(tokens[3])) {
+            phenotype_id.push_back(tokens[3]);
+            phenotype_chr.push_back(tokens[0]);
+            phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
+            phenotype_end.push_back(atoi(tokens[2].c_str()));
+            pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
+            phenotype_id_to_idx.insert(temp);
+            n_includedP++;
+        } else n_excludedP ++;
+    };
+
+    if (regionPhenotype.chr != "NA"){
+        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
+        vrb.bullet("target region [" + regionPhenotype.get() + "]");
+        if (!itr) vrb.error("Cannot jump to region!");
+        //Read data
+        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            stb.split(string(str.s), tokens);
+            registerRecord();
+        }
+        tbx_itr_destroy(itr);
+    }else{
+        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
+            linecount ++;
+            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+            if (str.l && str.s[0] != tbx->conf.meta_char) {
+                stb.split(string(str.s), tokens);
+                registerRecord();
+            }
+        }
+    }
+    //Finalize & verbose
+    free(str.s);   // line buffer grown by hts_getline() / tbx_itr_next()
+    tbx_destroy(tbx);
+    if (hts_close(fp)) vrb.error("Cannot properly close file");
+    phenotype_count = phenotype_id.size();
+    vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
+    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
+    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
+}
diff --git a/src/mode_union/union_union.cpp b/src/mode_union/union_union.cpp
new file mode 100644
index 0000000..fdb174f
--- /dev/null
+++ b/src/mode_union/union_union.cpp
@@ -0,0 +1,170 @@
+/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
+#include "union_data.h"
+
+// Parses one QTLtools hit file and records each (coldspot, phenotype, variant) it lists.
+//   frtc1 : path of the hit file to read.
+//   i     : index of that file; forwarded to myPhenotype::assign.
+// A line is ignored when its variant or phenotype is absent from the current
+// genotype/phenotype indexes, or when getColdspot() returns a negative value
+// for the variant position. Every retained hit is stored with rank 0.
+void union_data::unions(string frtc1,int i){
+	vrb.title("Reading QTLtools output in [" + frtc1 + "]");
+	input_file fd (frtc1);
+	if (fd.fail()) vrb.error("Cannot open file!");
+
+	string line;
+	vector < string > fields;
+	while(getline(fd, line)) {
+		stb.split(line, fields);
+
+		// Validate the layout of the line before touching any column.
+		if (fields.size() < 2) vrb.error("Wrong fastQTL output file format");
+		if (variant_column >= fields.size()) vrb.error("variant column = " + stb.str(variant_column+1) + " but found " + stb.str(fields.size()) + " columns in the following line:\n" + line);
+		if (phenotype_column >= fields.size()) vrb.error("phenotype column = " + stb.str(phenotype_column+1) + " but found " + stb.str(fields.size()) + " columns in the following line:\n" + line);
+
+		string pheno = fields[phenotype_column];
+		string snp = fields[variant_column];
+
+		// Only hits whose variant AND phenotype were both indexed are usable.
+		const bool known = genotype_id_to_idx.count(snp) && phenotype_id_to_idx.count(pheno);
+		if (!known) continue;
+
+		int cs = getColdspot(genotype_chr[genotype_id_to_idx[snp]],genotype_start[genotype_id_to_idx[snp]]);
+		if (cs < 0) continue;
+
+		// Create the per-phenotype record on first sight, then attach the variant.
+		map < string, myPhenotype > & hits = toUnite[cs];
+		if (!hits.count(pheno)) {
+			hits[pheno] = myPhenotype(phenotype_chr[phenotype_id_to_idx[pheno]],phenotype_start[phenotype_id_to_idx[pheno]],no_of_files) ;
+		}
+		hits[pheno].assign(snp,0,i);
+	}
+	fd.close();
+}
+
+// Parses one QTLtools conditional-pass hit file and records each retained
+// (coldspot, phenotype, variant, rank) it lists.
+//   frtc1 : path of the hit file to read.
+//   i     : index of that file; forwarded to myPhenotype::assign.
+// A line is kept only when its variant and phenotype are present in the current
+// indexes, its "best" column equals "1", and getColdspot() returns a
+// non-negative value for the variant position.
+void union_data::unions_conditional(string frtc1,int i){
+	string buffer;
+	vector < string > str;
+	vrb.title("Reading QTLtools output in [" + frtc1 + "]");
+	input_file fd (frtc1);
+	if (fd.fail()) vrb.error("Cannot open file!");
+	while(getline(fd, buffer)) {
+		stb.split(buffer, str);
+		if (str.size() < 4) vrb.error("Wrong fastQTL output file format");
+		// The message must report rank_column (it previously printed pvalue_column).
+		if (rank_column >= str.size()) vrb.error("rank column = " + stb.str(rank_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+		if (variant_column >= str.size()) vrb.error("variant column = " + stb.str(variant_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+		if (phenotype_column >= str.size()) vrb.error("phenotype column = " + stb.str(phenotype_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+		if (best_column >= str.size()) vrb.error("best column = " + stb.str(best_column+1) + " but found " + stb.str(str.size()) + " columns in the following line:\n" + buffer);
+		string pheno = str[phenotype_column];
+		string snp = str[variant_column];
+		string best = str[best_column];
+		int rank = atoi(str[rank_column].c_str());
+		// Skip unknown variants/phenotypes and rows not flagged as best ("1").
+		if (!genotype_id_to_idx.count(snp) || !phenotype_id_to_idx.count(pheno) || best != "1" ) continue;
+		int cs = getColdspot(genotype_chr[genotype_id_to_idx[snp]],genotype_start[genotype_id_to_idx[snp]]);
+		if( cs < 0) continue;
+		// Create the per-phenotype record on first sight, then attach the variant with its rank.
+		if (!toUnite.count(cs) || !toUnite[cs].count(pheno) ) {
+			toUnite[cs][pheno] = myPhenotype(phenotype_chr[phenotype_id_to_idx[pheno]],phenotype_start[phenotype_id_to_idx[pheno]],no_of_files) ;
+		}
+		toUnite[cs][pheno].assign(snp,rank,i);
+	}
+	fd.close();
+}
+
+// First pass over the input files: for each file index, loads the sample lists,
+// scans phenotypes (and genotypes), then collects the hits of the matching hit file
+// through unions() or unions_conditional().
+//   hitFiles : one hit file per input index.
+//   vcfFiles : either one VCF per input index, or a single VCF used for every index.
+//   bedFiles : one phenotype BED per input index.
+//   covFiles : one covariate file per input index; only read when option "cov" is set.
+void union_data::create_unions(vector <string> &hitFiles , vector <string> &vcfFiles , vector <string> &bedFiles , vector <string> &covFiles){
+	const bool shared_vcf = (vcfFiles.size() == 1);
+	for (int file_idx = 0 ; file_idx < no_of_files; file_idx++){
+		// Build the sample set for this file.
+		readSampleFromBED(bedFiles[file_idx]);
+		string & vcf = (vcfFiles.size() > 1) ? vcfFiles[file_idx] : vcfFiles[0];
+		readSampleFromVCF(vcf);
+		if (options.count("cov")) readSampleFromCOV(covFiles[file_idx]);
+		mergeSampleLists();
+
+		scanPhenotypes(bedFiles[file_idx]);
+		// A shared VCF is scanned on the first iteration only.
+		if (!shared_vcf || file_idx == 0) scanGenotypes(vcfFiles[file_idx]);
+
+		if (options.count("conditional")) {
+			unions_conditional(hitFiles[file_idx],file_idx);
+		} else {
+			unions(hitFiles[file_idx],file_idx);
+		}
+		clearSamples();
+	}
+}
+
+// Second pass: for every (coldspot, phenotype) pair stored in toUnite, produces the
+// result rows of each input file and finally writes one
+// "<hit file basename>[out-suffix].union" file per input.
+//   hitFiles : hit files; only their basenames are used here, to name the outputs.
+//   vcfFiles : either one VCF per input index, or a single VCF used for every index.
+//   bedFiles : one phenotype BED per input index.
+//   covFiles : one covariate file per input index; only read when option "cov" is set.
+// For a file where P->found[i] is set, the stored variants/ranks are emitted and padded
+// up to P->max_independent_signal rows. Otherwise the phenotype and the coldspot
+// genotypes are re-read for that file and getBestVariant() supplies the row; filler rows
+// flag a missing phenotype or missing genotypes.
+void union_data::find_unions(vector <string> &hitFiles , vector <string> &vcfFiles , vector <string> &bedFiles , vector <string> &covFiles){
+	vrb.title("Find the best variant in all significant regions [" + stb.str(toUnite.size()) +"]");
+	map < int ,map <string, myPhenotype > >::iterator outer_it;
+	map <string, myPhenotype >::iterator inner_it;
+	vector < results > results_vector(no_of_files);
+	// With a single shared VCF every file uses genotype cache slot 0 and VCF 0.
+	const bool single_vcf = (vcfFiles.size() == 1);
+	int outer_count = 1;
+	for (outer_it = toUnite.begin(); outer_it != toUnite.end(); outer_it++){
+		if (outer_count % 10 ==0) vrb.bullet("Region [" + stb.str(outer_count) + "/" + stb.str(toUnite.size()) +"]");
+		outer_count++;
+		int cs = outer_it->first;
+		if (cs < 0) continue;
+		// Genotypes of this coldspot are cached here so they are read at most once per slot.
+		vector < genotypes_holder > geno_sink(no_of_files);
+		string regionG = all_coldspots_p[cs]->chr + ":" + stb.str(all_coldspots_p[cs]->start) + "-" + stb.str(all_coldspots_p[cs]->end);
+		for (inner_it = outer_it->second.begin(); inner_it != outer_it->second.end(); inner_it++){
+			string pheno = inner_it->first;
+			string chr = inner_it->second.pheno_chr;
+			int pos = inner_it->second.pheno_pos;
+			string regionP = chr + ":" + stb.str(pos) + "-" + stb.str(pos+1);
+			myPhenotype *P = &(inner_it->second);
+			for (int i =0; i < no_of_files;i++){
+				if (P->found[i]) {
+					// Emit the recorded variants, then pad to a fixed number of rows.
+					for (int u = 0 ; u < P->genotypes[i].size(); u++) results_vector[i].assign(pheno,P->genotypes[i][u],P->ranks[i][u],0.0,cs,regionG);
+					for (int u = P->genotypes[i].size() ; u < P->max_independent_signal ; u++) results_vector[i].assign(pheno,"__UNION_FILLER_MAX_INDEP__",-1,0.0,cs,regionG);
+				}else{
+					readSampleFromBED(bedFiles[i],true);
+					if (vcfFiles.size() > 1) readSampleFromVCF(vcfFiles[i],true);
+					else readSampleFromVCF(vcfFiles[0],true);
+					if (options.count("cov")) readSampleFromCOV(covFiles[i],true);
+					mergeSampleLists(true);
+					readPhenotypes(bedFiles[i],regionP);
+					if(!phenotype_id_to_idx.count(pheno)){
+						for (int u = 0 ; u < P->max_independent_signal ; u++) results_vector[i].assign(pheno,"__UNION_FILLER_MISS_PHENO__",-1,0.0,cs,regionG);
+						clearSamples();
+						continue;
+					}
+					if (options.count("cov")){
+						readCovariates(covFiles[i]);
+						residualizePhenotypes();
+					}
+					// Fix: the VCF list is indexed with idx rather than i. With a single VCF
+					// the list has one element, so vcfFiles[i] was out of bounds for i > 0.
+					// NOTE(review): with a shared VCF the cached genotypes come from whichever
+					// file filled slot 0 first - confirm sample sets match across files.
+					int idx = single_vcf ? 0 : i;
+					if (!geno_sink[idx].genotypes.size()){
+						readGenotypes(vcfFiles[idx],regionG);
+						geno_sink[idx].genotypes = genotype_val;
+						geno_sink[idx].ids = genotype_id;
+					}
+					int geno_count = geno_sink[idx].genotypes.size();
+					if (geno_count){
+						double pval;
+						string bestSNP = getBestVariant(geno_sink[idx],phenotype_id_to_idx[pheno], pval );
+						results_vector[i].assign(pheno,bestSNP, -1 , pval,cs,regionG);
+						for (int u = 1 ; u < P->max_independent_signal ; u++) results_vector[i].assign(pheno,"__UNION_FILLER_MAX_INDEP__",-1,0.0,cs,regionG);
+					}else{
+						for (int u = 0 ; u < P->max_independent_signal ; u++) results_vector[i].assign(pheno,"__UNION_FILLER_MISS_GENO__",-1,0.0,cs,regionG);
+					}
+					clearSamples();
+				}
+			}
+		}
+		geno_sink.clear();
+	}
+	for (int i = 0 ; i < no_of_files; i++){
+		// Output name = hit file basename (text after the last '/'), in the working directory.
+		string name = hitFiles[i].substr(hitFiles[i].find_last_of("/") + 1);
+		// Append the optional user-supplied suffix.
+		if (options.count("out-suffix")) name += options["out-suffix"].as <string> ();
+		output_file fout(name + ".union");
+		if (fout.fail()) vrb.error("Cannot open [" + name + ".union]");
+		fout << results_vector[i];
+	}
+
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/qtltools.git
More information about the debian-med-commit
mailing list