[med-svn] [fastqtl] 02/04: Imported Upstream version 2.184+dfsg
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Mon May 2 21:52:56 UTC 2016
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository fastqtl.
commit 8cf06313b080d920898507dc667b015c87877c61
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Mon May 2 23:44:22 2016 +0200
Imported Upstream version 2.184+dfsg
---
INSTALL | 142 ++++++
LICENCE | 621 ++++++++++++++++++++++++
Makefile | 79 ++++
README | 1 +
doc/img/fastqtl_checks.jpg | Bin 0 -> 188036 bytes
doc/img/fastqtl_screenshot_nominal.jpg | Bin 0 -> 274581 bytes
doc/img/fastqtl_screenshot_permutation.jpg | Bin 0 -> 399258 bytes
doc/index.html | 27 ++
doc/pages/cis_mapping.html | 35 ++
doc/pages/cis_nominal.html | 250 ++++++++++
doc/pages/cis_permutation.html | 354 ++++++++++++++
doc/pages/footer.html | 15 +
doc/pages/format_bed.html | 71 +++
doc/pages/format_cov.html | 52 ++
doc/pages/format_vcf.html | 63 +++
doc/pages/header.html | 13 +
doc/pages/homepage.html | 56 +++
doc/pages/install.html | 97 ++++
doc/pages/leftmenu.html | 37 ++
doc/pages/options.html | 247 ++++++++++
doc/pages/template.html | 51 ++
doc/script/print_last_modif_date.js | 3 +
doc/style/default.css | 228 +++++++++
example/covariates.txt.gz | Bin 0 -> 3379 bytes
example/genotypes.vcf.gz | Bin 0 -> 15890106 bytes
example/genotypes.vcf.gz.tbi | Bin 0 -> 26376 bytes
example/interaction.txt | 373 +++++++++++++++
example/phenotypes.bed.gz | Bin 0 -> 404725 bytes
example/phenotypes.bed.gz.tbi | Bin 0 -> 3703 bytes
script/calulateNominalPvalueThresholds.R | 33 ++
src/analysisMapping.cpp | 142 ++++++
src/analysisNominal.cpp | 69 +++
src/analysisPermutation.cpp | 136 ++++++
src/analysisPermutationInteraction.cpp | 126 +++++
src/analysisPermutationPerGroup.cpp | 160 +++++++
src/analysisPermutationSequence.cpp | 135 ++++++
src/commands.cpp | 52 ++
src/data.h | 210 +++++++++
src/df.cpp | 100 ++++
src/fastQTL.cpp | 290 ++++++++++++
src/management.cpp | 252 ++++++++++
src/mle.cpp | 88 ++++
src/readCovariates.cpp | 77 +++
src/readGenotypes.cpp | 117 +++++
src/readGroups.cpp | 54 +++
src/readInclusionsExclusions.cpp | 89 ++++
src/readInteractions.cpp | 51 ++
src/readPhenotypes.cpp | 119 +++++
src/readThresholds.cpp | 53 +++
src/region.h | 66 +++
src/residualizer.cpp | 142 ++++++
src/residualizer.h | 53 +++
src/utils/ranker.h | 224 +++++++++
src/utils/tabix.cpp | 107 +++++
src/utils/tabix.hpp | 41 ++
src/utils/utils.cpp | 731 +++++++++++++++++++++++++++++
src/utils/utils.h | 312 ++++++++++++
57 files changed, 6844 insertions(+)
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..96c707e
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,142 @@
+This document describes how to install/compile FastQTL on both Linux & Mac OS
+
+A. RUN FASTQTL ON LINUX
+B. COMPILE FASTQTL ON LINUX
+C. RUN FASTQTL ON MAC OS
+D. COMPILE FASTQTL ON MAC OS
+
+
+
+############################################################################################
+# A. RUN FASTQTL ON LINUX #
+############################################################################################
+
+Just run: ./bin/fastqtl.static --help
+
+If it works, you should get this screen output:
+
+-------------------------------
+Fast QTL
+ * Authors : Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS
+ * Contact : olivier.delaneau at gmail.com
+ * Webpage : http://fastqtl.sourceforge.net/
+ * Version : v2.0
+
+Basic options:
+ --help Produces this help
+ --silent Silent mode on terminal
+ --seed arg (=1434116729) Random number seed. Useful to replicate
+...
+Parallelization:
+ -K [ --chunk ] arg Specify which chunk needs to be
+ processed
+ --commands arg Generates all commands
+ -R [ --region ] arg Region of interest.
+-------------------------------
+
+If it doesn't run, i'm afraid you've got to compile the code (See section B).
+
+############################################################################################
+# B. COMPILE FASTQTL ON LINUX #
+############################################################################################
+
+(B.1) INSTALL MATH R LIBRARY
+ (1) Download R source code: wget http://cran.r-project.org/src/base/R-3/R-3.2.0.tar.gz
+ (2) Unzip R source code: tar xzvf R-3.2.0.tar.gz
+ (3) Go to R source code folder: cd R-3.2.0
+ (4) Configure Makefile: ./configure
+ (5) Go to R math library folder: cd src/nmath/standalone
+ (6) Compile the code: make
+ (7) Go 2 folder backward: cd ../..
+ (8) Save the current path: RMATH=$(pwd)
+
+(B.2.1) INSTALL BOOST (if not already installer, OPTION1: MANUALLY)
+ (1) Download BOOST: http://sourceforge.net/projects/boost/files/boost/1.58.0/boost_1_58_0.tar.gz/download
+ (2) Unzip the package: tar xzvf boost_1_58_0.tar.gz
+ (3) Go in the folder: cd boost_1_58_0
+ (4) Install required packages: ./bootstrap.sh --prefix=/home/olivier/Desktop/myInstall --with-libraries=iostreams,program_options
+
+(B.2.2) INSTALL BOOST (if not already installer, OPTION2: AUTOMATICALLY)
+ Many Linux distribution have BOOST package already built.
+ On Debian/Ubunutu, just run: sudo apt-get install libboost-dev
+ On Redhat/CentOS, just run: yum install boost-devel
+
+(B.3) INSTALL GSL (if not already installed)
+ Most Linux distribution have GSL package already built.
+ On Debian/Ubunutu, just run: sudo apt-get install libgsl0-dev
+ On Redhat/CentOS, just run: yum install gsl-devel
+
+
+(B.4) COMPILE FASTQTL
+ (1) Go to the fastqtl folder: cd FastQTL-2.165
+ (2) Compile the code running: make cleanall && make
+ This later step needs the RMATH variable in A.1.8 to be defined.
+ This can be specified here by: make RMATH=/my/path/to/r/src
+
+############################################################################################
+# C. RUN FASTQTL ON MACOS #
+############################################################################################
+
+(C.1) INSTALL HOMEBREW (if not already installed)
+ (1) Run: ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+
+(C.2) INSTALL BOOST (if not already installed)
+ (1) Run: brew install boost
+
+(C.3) INSTALL GSL (if not already installed)
+ (1) Run: brew install gsl
+
+(C.4) TEST FASTQTL:
+ (1) Run: ./bin/fastqtl.macos --help
+
+If it works, you should get this screen output:
+
+-------------------------------
+Fast QTL
+ * Authors : Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS
+ * Contact : olivier.delaneau at gmail.com
+ * Webpage : http://fastqtl.sourceforge.net/
+ * Version : v2.0
+
+Basic options:
+ --help Produces this help
+ --silent Silent mode on terminal
+ --seed arg (=1434116729) Random number seed. Useful to replicate
+...
+Parallelization:
+ -K [ --chunk ] arg Specify which chunk needs to be
+ processed
+ --commands arg Generates all commands
+ -R [ --region ] arg Region of interest.
+-------------------------------
+
+If it doesn't run, i'm afraid you've got to compile the code (See section D).
+
+############################################################################################
+# D. COMPILE FASTQTL ON MACOS #
+############################################################################################
+
+(D.1) INSTALL MATH R LIBRARY
+ (1) Download R source code: wget http://cran.r-project.org/src/base/R-3/R-3.2.0.tar.gz
+ (2) Unzip R source code: tar xzvf R-3.2.0.tar.gz
+ (3) Go to R source code folder: cd R-3.2.0
+ (4) Configure Makefile: ./configure
+ (5) Go to R math library folder: cd src/nmath/standalone
+ (6) Compile the code: make
+ (7) Go 2 folder backward: cd ../..
+ (8) Save the current path: RMATH=$(pwd)
+
+(D.2) INSTALL HOMEBREW (if not already installed)
+ (1) Run: ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+
+(D.3) INSTALL BOOST (if not already installed)
+ (1) Run: brew install boost
+
+(D.4) INSTALL GSL (if not already installed)
+ (1) Run: brew install gsl
+
+(D.5) COMPILE FASTQTL
+ (1) Go to the fastqtl folder: cd FastQTL-2.165
+ (2) Compile the code running: make cleanall && make macos
+ This later step needs the RMATH variable in D.1.8 to be defined.
+ This can be specified here by: make RMATH=/my/path/to/r/src
diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..94a0453
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,621 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a7b23ec
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,79 @@
+#PLEASE SPECIFY THE R path here where you built the R math library standalone
+RMATH=~/Software/R-3.1.3/src
+
+#compiler
+CXX=g++
+
+#internal paths
+VPATH=$(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)
+PATH_TABX=lib/Tabix
+PATH_EIGN=lib/Eigen
+
+#compiler flags
+CXXFLAG_OPTI=-O3 -D_FAST_CORRELATION
+CXXFLAG_DEBG=-g
+CXXFLAG_WARN=-Wall -Wextra -Wno-sign-compare
+CXXFLAG_MACX=-mmacosx-version-min=10.7 -stdlib=libc++
+
+#linker flags
+LDFLAG_OPTI=-O3
+LDFLAG_DEBG=-g
+LDFLAG_MACX=-mmacosx-version-min=10.7 -stdlib=libc++
+
+#includes
+INC_BASE=-Isrc -I$(PATH_TABX) -I$(PATH_EIGN)
+INC_MATH=-I$(RMATH)/include/
+INC_MACX=-I/usr/local/include/
+
+#libraries
+#LIB_BASE=-lm -lboost_iostreams -lboost_program_options -lz -lgsl -lblas
+LIB_BASE=-lm -lz -lboost_iostreams -lboost_program_options -lgsl -lblas
+LIB_MATH=$(RMATH)/nmath/standalone/libRmath.a
+LIB_TABX=$(PATH_TABX)/libtabix.a
+LIB_MACX=-L/usr/local/lib/
+
+#files (binary, objects, headers & sources)
+FILE_BIN=bin/fastQTL
+FILE_O=$(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done)
+FILE_H=$(shell find src -name *.h)
+FILE_CPP=$(shell find src -name *.cpp)
+
+#default
+all: linux
+
+#linux release
+linux: CXXFLAG=$(CXXFLAG_OPTI) $(CXXFLAG_WARN)
+linux: IFLAG=$(INC_BASE) $(INC_MATH)
+linux: LIB=$(LIB_MATH) $(LIB_TABX) $(LIB_BASE)
+linux: LDFLAG=$(LDFLAG_OPTI)
+linux: $(FILE_BIN)
+
+#macos release
+macos: CXXFLAG=$(CXXFLAG_OPTI) $(CXXFLAG_WARN) $(CXXFLAG_MACX)
+macos: IFLAG=$(INC_BASE) $(INC_MACX) $(INC_MATH)
+macos: LIB=$(LIB_MACX) $(LIB_MATH) $(LIB_TABX) $(LIB_BASE)
+macos: LDFLAG=$(LDFLAG_OPTI) $(LDFLAG_MACX)
+macos: $(FILE_BIN)
+
+#debug release
+debug: CXXFLAG=$(CXXFLAG_DEBG) $(CXXFLAG_WARN)
+debug: IFLAG=$(INC_BASE) $(INC_MATH)
+debug: LIB=$(LIB_MATH) $(LIB_TABX) $(LIB_BASE)
+debug: LDFLAG=$(LDFLAG_DEBG)
+debug: $(FILE_BIN)
+
+#compilation
+$(LIB_TABX):
+ cd $(PATH_TABX) && make && cd ../..
+
+$(FILE_BIN): $(FILE_O) $(LIB_TABX)
+ $(CXX) $(LDFLAG) $^ $(LIB) -o $@
+
+obj/%.o: %.cpp $(FILE_H)
+ $(CXX) $(CXXFLAG) -o $@ -c $< $(IFLAG)
+
+clean:
+ rm -f obj/*.o $(FILE_BIN)
+
+cleanall: clean
+ cd $(PATH_TABX) && make clean && cd ../..
diff --git a/README b/README
new file mode 100644
index 0000000..40028c4
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+Documentation can be found here: http://fastqtl.sourceforge.net/
diff --git a/doc/img/fastqtl_checks.jpg b/doc/img/fastqtl_checks.jpg
new file mode 100644
index 0000000..ad37f32
Binary files /dev/null and b/doc/img/fastqtl_checks.jpg differ
diff --git a/doc/img/fastqtl_screenshot_nominal.jpg b/doc/img/fastqtl_screenshot_nominal.jpg
new file mode 100644
index 0000000..2d75c11
Binary files /dev/null and b/doc/img/fastqtl_screenshot_nominal.jpg differ
diff --git a/doc/img/fastqtl_screenshot_permutation.jpg b/doc/img/fastqtl_screenshot_permutation.jpg
new file mode 100644
index 0000000..74844d6
Binary files /dev/null and b/doc/img/fastqtl_screenshot_permutation.jpg differ
diff --git a/doc/index.html b/doc/index.html
new file mode 100644
index 0000000..028cf97
--- /dev/null
+++ b/doc/index.html
@@ -0,0 +1,27 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html>
+
+<meta http-equiv="pragma" content="nocache">
+
+<head>
+<title>FastQTL</title>
+</head>
+
+<frameset rows="60,*,20" border="0">
+
+ <frame src="./pages/header.html" name="header">
+
+ <frameset cols="330,*" border="0" scrolling=no noresize>
+
+ <frame src="./pages/leftmenu.html" name="leftmenu">
+
+ <frame src="./pages/homepage.html" name="content">
+
+ </frameset>
+
+ <frame src="./pages/footer.html" name="footer">
+
+</frameset>
+
+</html>
\ No newline at end of file
diff --git a/doc/pages/cis_mapping.html b/doc/pages/cis_mapping.html
new file mode 100644
index 0000000..b422ec7
--- /dev/null
+++ b/doc/pages/cis_mapping.html
@@ -0,0 +1,35 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="item1">
+ <h1>In construction</h1>
+ <p>
+ Text1
+ </p>
+
+ <code>
+ Code1
+ </code>
+
+ <p>
+ Text2
+ </p>
+
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2014-10-25 12:59:59 +0200 (Sat, 25 Oct 2014) $"); </script></div>
+</div>
+</body>
+</html>
diff --git a/doc/pages/cis_nominal.html b/doc/pages/cis_nominal.html
new file mode 100644
index 0000000..f14c89b
--- /dev/null
+++ b/doc/pages/cis_nominal.html
@@ -0,0 +1,250 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="cis_map1">
+ <h1>Default nominal pass</h1>
+ <p>
+ To perform a simple nominal pass on the example data set, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.default.txt.gz
+ </code>
+
+ <p>
+ This produces this output on the screen when everything works correctly:
+ </p>
+
+ <img WIDTH=40% src="../img/fastqtl_screenshot_nominal.jpg"></img>
+
+ </div>
+
+ <div class="item" id="cis_map3">
+ <h1>Association testing parameters</h1>
+ <p>
+ Associations between genotype dosages and phenotype quantifications are measured with linear regressions (<a href="http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient">here</a>), similar to the R/lm function. This model assumes that phenotypes are normally distributed.
+ If your phenotype quantifications are not normally distributed, you can force them to match normal distributions N(0, 1) by using:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.quantile.txt.gz <b>--normal</b>
+ </code>
+
+ <p>
+ To change the cis-window size (i.e. the maximal distance spanned by phenotype-variant pairs to be considered for testing) from default value 1e6 bp to 2e6 bp, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.window2Mb.txt.gz <b>--window 2e6</b>
+ </code>
+
+ <p>
+ To change the seed of the random number generator, which is particularly useful to replicate an analysis, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.seed.txt.gz <b>--seed 123456789</b>
+ </code>
+
+ <p>
+ To add covariates in the linear regressions used for association testing, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.cov.txt.gz <b>--cov covariates.txt.gz</b>
+ </code>
+
+ <p>
+ The file <b>covariates.txt.gz</b> needs to be formatted as described <b>here</b>.
+ </p>
+ </div>
+
+ <div class="item" id="cis_map4">
+ <h1>Excluding/Including data</h1>
+ <p>
+ To exclude samples, variants, phenotypes or covariates from the analysis, you can use one of these options:
+ <ol>
+ <li>To exclude samples: <i>--exclude-samples file.exc</i></li>
+ <li>To exclude variants: <i>--exclude-sites file.exc</i></li>
+ <li>To exclude phenotypes: <i>--exclude-phenotypes file.exc</i></li>
+ <li>To exclude caovariates: <i>--exclude-covariates file.exc</i></li>
+ </ol>
+ </p>
+
+ <p>
+ For instance, if you want to ignore 3 samples when analyzing the example data set, first create a text file containing the IDs of the samples to be excluded, called here <i>file.exc</i>:
+ </p>
+
+ <code>
+ UNR1<br>
+ UNR2<br>
+ UNR3<br>
+ </code>
+
+ <p>
+ Then, add the following option to the command line:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --out nominals.sub.txt.gz <b>--exclude-samples file.exc</b>
+ </code>
+
+ <p>
+ Similarly to these 4 options for data exclusion, you can also specify the set of samples, variants, phenotypes and covariates you wich to include in the analysis using the options: <i>--include-samples, --include-sites, --include-phenotypes and --include-covariates</i>, respectively.
+ </p>
+ </div>
+
+
+ <div class="item" id="cis_map5">
+ <h1>Parallelization</h1>
+ <p>
+ As a first way to facilitate parallelization on compute cluster, we developed an option to run the analysis for just a chunk of molecular phenotypes.
+ The region of interest is specified with the standard <b>chr:start-end</b> format.
+ FastQTL extracts all phenotypes in this region, then all genotypes given the specified cis-window size and finally performs the analysis for this data subset.
+ For instance, to a nominal pass only for molecular phenotypes located on chr22 between coordinates 18Mb and 20Mb, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz <b>--region chr22:18000000-20000000</b> --out nominals.18M20M.txt.gz
+ </code>
+
+ <note>
+ This option requires both the genotype and phenotype files to be indexed with TABIX!
+ </note>
+
+ <p>
+ This strategy is quite similar to what is commonly used for genotype imputation, where only small chunks of data are imputed one at a time in seperate jobs.
+ However in practice, it is usually quite complicated to properly split the genome into a given number of chunks with correct coordinates.
+ To facilitate this, we embedded all coordinates into a chunk-based system such that you only need to specify the chunk index you want to run.
+ Then, splitting the genome into chunks, extraction of data, and analysis are automatically performed.
+ For instance, to run analysis on chunk number 25 when splitting the example data set (i.e. genome) into 30 chunks, just run:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.chunk25.txt.gz <b>--chunk 25 30</b>
+ </code>
+
+ <p>
+ If you want to submit the whole analysis into 30 jobs on your compute cluster, just run:
+ </p>
+
+ <code>
+ for j in <b>$(seq 1 30)</b>; do<br>
+ echo "fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.txt.gz <b>--chunk $j 30</b>" | qsub<br>
+ done
+ </code>
+
+ <p>
+ Here <b>qsub</b> needs to be changed according to the job submission system used (bsub, psub, etc...).
+ </p>
+
+ <note>
+ In this simple example, we only split the data into 30 chunks.
+ However, a realistic whole genome analysis would require to split the data in 200, 500 or even 1,000 chunks to fully use the capabilities of a modern compute cluster.
+ </note>
+
+ <p>
+ Finally, we also developed a slightly different parallelization option that, this time, allows to generate all required command lines and write them into a file.
+ Let take the same example as before, that is splitting the analysis into 10 jobs:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out results <b>--commands 10 commands.10.txt</b>
+ </code>
+
+ <p>
+ Now if you look at the file <b>commands.10.txt</b>, you'll see this:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:17517460-20748406 --region 22:17517460-20748406<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:36424473-39052635 --region 22:36424473-39052635<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:24407642-30163001 --region 22:24407642-30163001<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:42017123-45704850 --region 22:42017123-45704850<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:20792146-22307210 --region 22:20792146-22307210<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:39052641-39915701 --region 22:39052641-39915701<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:30163352-36044443 --region 22:30163352-36044443<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:45809500-51222092 --region 22:45809500-51222092<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:22337213-24322661 --region 22:22337213-24322661<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --out nominals.22:39928860-42017101 --region 22:39928860-42017101<br>
+ </code>
+
+ <p>
+ Where region coordinates are automatically determined given the total number of chunks specified.
+ You can then submit all these commands on a cluster using:
+ </p>
+
+ <code>
+ while read c; do<br>
+ echo $c | qsub<br>
+ done <b>< commands.10.txt</b>
+ </code>
+ </div>
+
+ <div class="item" id="cis_map7">
+ <h1>Output file format of a <b>nominal</b> pass</h1>
+
+ <p>
+ Once the analysis completed and all output file collected, you can easily concat and compress all of them into a single output file in order to ease downstream analysis with:
+ </p>
+
+ <code>
+ zcat nominals.chunk*.txt.gz | gzip -c > nominals.all.chunks.txt.gz
+ </code>
+
+ <p>
+ After having performed a nominal pass on the data and concatenating the output files, you end up with a file with 5 columns and N lines corresponding to all N phenotype-variant pairs tested.
+ For instance, if you tested 1,000 molecular phenotypes and for each there are 1,000 variants in cis, it means that you'll get 1,000,000 lines in the output files.
+ Hereafter a short example:
+ </p>
+
+ <code>
+ ENSG00000237438.1 snp_22_18516782 999322 0.602225<br>
+ ENSG00000237438.1 snp_22_18516997 999537 0.796906<br>
+ ENSG00000237438.1 snp_22_18517084 999624 0.20782<br>
+ ENSG00000237438.1 snp_22_18517312 999852 0.196428<br>
+ ENSG00000177663.8 snp_22_16566314 -999530 0.77477<br>
+ ENSG00000177663.8 snp_22_16566347 -999497 0.57854<br>
+ ENSG00000177663.8 snp_22_16566779 -999065 0.379964<br>
+ ENSG00000177663.8 snp_22_16580254 -985590 0.525688<br>
+ ENSG00000177663.8 snp_22_16581158 -984686 0.329372<br>
+ ENSG00000177663.8 snp_22_16581386 -984458 0.461748<br>
+ </code>
+
+ <p>
+ In this file, the 4 columns correspond to:
+ <ol>
+ <li>ID of the tested molecular phenotype (in this particular case, the gene ID)</li>
+ <li>ID of the tested variant (in this case a SNP)</li>
+ <li>Distance between the variant and the phenotype in bp</li>
+ <li>The nominal p-value of association</li>
+ </ol>
+ </p>
+
+ <p>
+ To make this file much smaller, you can output only significant phenotype-variant pairs with a pvalue below 0.001 for instance.
+ To do so, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 <b>--threshold 0.001</b> --out nominals.threshold.txt.gz
+ </code>
+
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2014-10-25 12:59:59 +0200 (Sat, 25 Oct 2014) $"); </script></div>
+</div>
+</body>
+</html>
diff --git a/doc/pages/cis_permutation.html b/doc/pages/cis_permutation.html
new file mode 100644
index 0000000..fcf47a0
--- /dev/null
+++ b/doc/pages/cis_permutation.html
@@ -0,0 +1,354 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="cis_perm1">
+ <h1>Default permutation pass</h1>
+ <p>
+ To perform a permutation based analysis of the example data set using a total of 1,000 permutations, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 <b>--permute 1000</b> --out permutations.default.txt.gz
+ </code>
+
+ <p>
+ This produces this output on the screen when everything works correctly:
+ </p>
+
+ <img WIDTH=40% src="../img/fastqtl_screenshot_permutation.jpg"></img>
+
+ <br>
+ <p>
+ You can increase the number of permutations to 10,000 using:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 <b>--permute 10000</b> --out permutations.10K.txt.gz
+ </code>
+
+ <p>
+ You can also perform an adaptive permutation pass on the example data set (i.e. with a number of permutations that varies in function of the significance).
+ To run between 100 and 100,000 permutations, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 <b>--permute 100 100000</b> --out permutations.adaptive.txt.gz
+ </code>
+
+ </div>
+
+ <div class="item" id="cis_map3">
+ <h1>Association testing parameters</h1>
+ <p>
+ Associations between genotype dosages and phenotype quantifications are measured with linear regressions (<a href="http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient">here</a>), similar to the R/lm function. This model assumes that phenotypes are normally distributed.
+ If your phenotype quantifications are not normally distributed, you can force them to match normal distributions N(0, 1) by using:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --permute 1000 --out permutations.quantile.txt.gz <b>--normal</b>
+ </code>
+
+ <p>
+ To change the cis-window size (i.e. the maximal distance spanned by phenotype-variant pairs to be considered for testing) from default value 1e6 bp to 2e6 bp, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --permute 1000 --out permutations.window2Mb.txt.gz <b>--window 2e6</b>
+ </code>
+
+ <p>
+ To change the seed of the random number generator, which is particularly useful to replicate an analysis, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --permute 1000 --out permutations.seed.txt.gz <b>--seed 123456789</b>
+ </code>
+
+ <p>
+ To add covariates in the linear regressions used for association testing, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --permute 1000 --out permutations.txt.gz <b>--cov covariates.txt.gz</b>
+ </code>
+
+ <p>
+ The file <b>covariates.txt.gz</b> needs to be formatted as described <b>here</b>.
+ </p>
+ </div>
+
+ <div class="item" id="cis_map4">
+ <h1>Excluding/Including data</h1>
+ <p>
+ To exclude samples, variants, phenotypes or covariates from the analysis, you can use one of these options:
+ <ol>
+ <li>To exclude samples: <i>--exclude-samples file.exc</i></li>
+ <li>To exclude variants: <i>--exclude-sites file.exc</i></li>
+ <li>To exclude phenotypes: <i>--exclude-phenotypes file.exc</i></li>
+ <li>To exclude caovariates: <i>--exclude-covariates file.exc</i></li>
+ </ol>
+ </p>
+
+ <p>
+ For instance, if you want to ignore 3 samples when analyzing the example data set, first create a text file containing the IDs of the samples to be excluded, called here <i>file.exc</i>:
+ </p>
+
+ <code>
+ UNR1<br>
+ UNR2<br>
+ UNR3<br>
+ </code>
+
+ <p>
+ Then, run the following command line:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --region 22:17000000-18000000 --permute 1000 --out permutations.sub.txt.gz <b>--exclude-samples file.exc</b>
+ </code>
+
+ <p>
+ Similarly to these 4 options for data exclusion, you can also specify the set of samples, variants, phenotypes and covariates to be included in the analysis using the options: <i>--include-samples, --include-sites, --include-phenotypes and --include-covariates</i>, respectively.
+ </p>
+ </div>
+
+ <div class="item" id="cis_map5">
+ <h1>Parallelization</h1>
+ <p>
+ As a first way to facilitate parallelization on compute cluster, we developed an option to run the analysis for just a chunk of molecular phenotypes.
+ The region of interest is specified with the standard <b>chr:start-end</b> format.
+ Then, FastQTL extracts all phenotypes in this region, then all genotypes given the specified cis-window size and finally perform the analysis for this data subset.
+ For instance, to run QTL mapping only for molecular phenotypes on chr22 with coordinates between 18Mb and 20Mb, use:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz <b>--region chr22:18000000-20000000</b> --permute 1000 --out permutations.18M20M.txt.gz
+ </code>
+
+ <note>
+ This option requires both the genotype and phenotype files to be indexed with TABIX!
+ </note>
+
+ <p>
+ This strategy is quite similar to what is commonly used for genotype imputation, where only small chunks of data are imputed one at a time in seperate jobs.
+ However in practice, it is usually quite complicated to properly split the genome into a given number of chunks with correct coordinates.
+ To facilitate this, we embedded all coordinates into a chunk-based system such that you only need to specify the chunk index you want to run.
+ Then, splitting the genome into chunks, extraction of data, and analysis are automatically performed.
+ For instance, to run analysis on chunk number 25 when splitting the example data set (i.e. genome) into 30 chunks, just run:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.chunk25.txt.gz <b>--chunk 25 30</b>
+ </code>
+
+ <p>
+ If you want to submit the whole analysis into 30 jobs on your compute cluster, just run:
+ </p>
+
+ <code>
+ for j in <b>$(seq 1 30)</b>; do<br>
+ echo "fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.txt.gz <b>--chunk $j 30</b>" | qsub<br>
+ done
+ </code>
+
+ <p>
+ Here <b>qsub</b> needs to be changed according to the job submission system used (bsub, psub, etc...).
+ </p>
+
+ <note>
+ In this simple example, we only split the data into 30 chunks.
+ However, a realistic whole genome analysis would require to split the data in 200, 500 or even 1,000 chunks to fully use the capabilities of a modern compute cluster.
+ </note>
+
+ <p>
+ Finally, we also developed a slightly different parallelization option that, this time, allows to generate all required command lines and write into a file.
+ Let take the same example as before, that is splitting the analysis into 10 jobs:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations <b>--commands 10 commands.10.txt</b>
+ </code>
+
+ <p>
+ Now if you look at the file <b>commands.10.txt</b>, you'll get this:
+ </p>
+
+ <code>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:17517460-20748406 --region 22:17517460-20748406<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:36424473-39052635 --region 22:36424473-39052635<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:24407642-30163001 --region 22:24407642-30163001<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:42017123-45704850 --region 22:42017123-45704850<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:20792146-22307210 --region 22:20792146-22307210<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:39052641-39915701 --region 22:39052641-39915701<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:30163352-36044443 --region 22:30163352-36044443<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:45809500-51222092 --region 22:45809500-51222092<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:22337213-24322661 --region 22:22337213-24322661<br>
+ fastQTL --vcf genotypes.vcf.gz --bed phenotypes.bed.gz --permute 1000 --out permutations.22:39928860-42017101 --region 22:39928860-42017101<br>
+ </code>
+
+ <p>
+ Where, region coordinates are automatically determined given the total number of chunks provided.
+ Then, you can submit all these commands on a cluster using for example:
+ </p>
+
+ <code>
+ while read c; do<br>
+ echo $c | qsub<br>
+ done <b>| commands.10.txt</b>
+ </code>
+ </div>
+
+ <div class="item" id="cis_map8">
+ <h1>Output file format of a <b>permutation</b> pass</h1>
+
+ <p>
+ Once the analysis completed and all output file collected, you can easily concat and compress all files into a single output file to ease downstream analysis with:
+ </p>
+
+ <code>
+ zcat permutations.chunk*.txt.gz | gzip -c > permutations.all.chunks.txt.gz
+ </code>
+
+ <p>
+ After having performed a permutation pass on the data and concatenating the output files, you end up with a file with 10 columns and M lines with M being the total number of tested molecular phenotypes.
+ For instance, if you tested 1,000 molecular phenotypes, it means that you'll get 1,000 lines in the output files.
+ Hereafter a short example:
+ </p>
+
+ <code>
+ ENSG00000237438.1 7966 0.93436 2871.25 376.072 snp_22_17542810 25350 5.48728e-13 0.000999001 4.38131e-09<br>
+ ENSG00000177663.8 8165 0.936488 3667.52 392.593 snp_22_17497295 -68549 0.000167489 0.376623 0.361675<br>
+ ENSG00000183307.3 8279 1.03157 2719.43 369.406 snp_22_17587975 -14282 3.11324e-08 0.000999001 6.78199e-05<br>
+ ENSG00000069998.8 8466 1.04834 2472.36 358.392 snp_22_17639837 -6340 5.01155e-11 0.000999001 1.26653e-07<br>
+ ENSG00000093072.10 8531 0.934923 5838.18 406.904 snp_22_17699299 -39826 9.028e-05 0.246753 0.245706<br>
+ </code>
+
+ <p>
+ In this file, the 10 columns correspond to:
+ <ol>
+ <li>ID of the tested molecular phenotype (in this particular case, the gene ID)</li>
+ <li>Number of variants tested in cis for this phenotype</li>
+ <li>MLE of the shape1 parameter of the Beta distribution</li>
+ <li>MLE of the shape2 parameter of the Beta distribution</li>
+ <li>Dummy [To be described later]</li>
+ <li>ID of the best variant found for this molecular phenotypes (i.e. with the smallest p-value)</li>
+ <li>Distance between the molecular phenotype - variant pair</li>
+ <li>The <b>nominal</b> p-value of association that quantifies how significant from 0, the regression coefficient is</li>
+ <li>A first <b>permutation</b> p-value directly obtained from the permutations with the direct method. This is basically a corrected version of the nominal p-value that accounts for the fact that multiple variants are tested per molecular phenotype.
+ <li>A second <b>permutation</b> p-value obtained via beta approximation. We advice to use this one in any downstream analysis.
+ </ol>
+ </p>
+ </div>
+
+ <div class="item" id="cis_map8">
+ <h1>Checking that the experiment went well</h1>
+
+ <p>
+ To check that the beta approximated permutation p-values are well estimated, load the data in R and make the following plot:
+ </p>
+
+ <code>
+ R> d = read.table("permutations.all.chunks.txt.gz", hea=F, stringsAsFactors=F)<br>
+ R> colnames(d) = c("pid", "nvar", "shape1", "shape2", "dummy", "sid", "dist", "npval", "ppval", "bpval")<br>
+ R> plot(d$ppval, d$bpval, xlab="Direct method", ylab="Beta approximation", main="Check plot")<br>
+ R> abline(0, 1, col="red")<br>
+ </code>
+
+ <img WIDTH=40% src="../img/fastqtl_checks.jpg"></img>
+
+ <br>
+ <p>
+ The expectation is to get all the points along the diagonal as on the plot above.
+ This shows that unsignificant beta pproximated p-values are well estimated and therefore that the estimations went well.
+ </p>
+ </div>
+
+ <div class="item" id="cis_map9">
+ <h1>Controlling for multiple phenotypes being tested</h1>
+ <p>
+ Once obtained all permutation p-values for all the tested molecular phenotypes, we need now to account for the fact that many molecular phenotypes are tested whole genome in order to determine the significant QTLs.
+ To do so, we propose here 3 approaches; from the most to the least stringent.
+ First, load the data in R using:
+ </p>
+
+ <code>
+ R> d = read.table("permutations.all.chunks.txt.gz", hea=F, stringsAsFactors=F)<br>
+ R> colnames(d) = c("pid", "nvar", "shape1", "shape2", "dummy", "sid", "dist", "npval", "ppval", "bpval")<br>
+ </code>
+
+ <br>
+ <h3>Bonferroni correction</h3>
+ <p>
+ Look <a href="http://en.wikipedia.org/wiki/Bonferroni_correction">here</a> for some details on the correction and <a href="https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html">here</a> for some details on the R function used in this example.
+ To apply the Bonferroni correction on the permutation p-values derived from beta approximation, use:
+ </p>
+
+ <code>
+ <b>R></b> d$bonferroni = p.adjust(d$bpval, method="bonferroni")
+ </code>
+
+ <p>
+ Then, you can extract all significant MP-QTL pairs at α=0.05 and write them to a file <b>permutations.all.chunks.bonferroni.txt</b> using:
+ </p>
+
+ <code>
+ <b>R></b> write.table(d[which(d$bonferroni <= 0.05), c(1,6)], "permutations.all.chunks.bonferroni.txt", quote=F, row.names=F, col.names=T)
+ </code>
+
+ <br>
+ <h3>Benjamini & Hochberg correction</h3>
+ <p>
+ Look <a href="http://en.wikipedia.org/wiki/False_discovery_rate">here</a> for some details on the correction and <a href="https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html">here</a> for some details on the R function used in this example.
+ To apply the Benjamini & Hochberg correction on the permutation p-values derived from beta approximation, use:
+ </p>
+
+ <code>
+ <b>R></b> d$bh = p.adjust(d$bpval, method="fdr")
+ </code>
+
+ <p>
+ Then, you can extract all significant MP-QTL pairs with a 10% false discovery rate (FDR) for instance and write them to a file <b>permutations.all.chunks.benjamini.txt</b> using:
+ </p>
+
+ <code>
+ <b>R></b> write.table(d[which(d$bh <= 0.10), c(1,6)], "permutations.all.chunks.benjamini.txt", quote=F, row.names=F, col.names=T)
+ </code>
+
+ <br>
+ <h3>Storey & Tibshirani correction</h3>
+ <p>
+ Look <a href="http://en.wikipedia.org/wiki/False_discovery_rate">here</a> for some details on the correction and <a href="http://svitsrv25.epfl.ch/R-doc/library/qvalue/html/qvalue.html">here</a> for some details on the R function used in this example.
+ To install the R/qvalue package on your system, look <a href="http://www.bioconductor.org/packages/release/bioc/html/qvalue.html">here</a>.
+ Then to apply the Storey & Tibshirani correction on the permutation p-values derived from beta approximation, use:
+ </p>
+
+ <code>
+ <b>R></b> library(qvalue)
+ <b>R></b> d$st = qvalue(d$bpval)$qvalues
+ </code>
+
+ <p>
+ Then, you can extract all significant MP-QTL pairs with a 10% false discovery rate (FDR) for instance and write them to a file <b>whole_analysis.storey.permutations</b> using:
+ </p>
+
+ <code>
+ <b>R></b> write.table(d[which(d$st <= 0.10), c(1,6)], "permutations.all.chunks.storey.txt", quote=F, row.names=F, col.names=T)
+ </code>
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2015-04-01 16:11:24 +0200 (Wed, 01 Apr 2015) $"); </script></div>
+</div>
+</body>
+</html>
diff --git a/doc/pages/footer.html b/doc/pages/footer.html
new file mode 100644
index 0000000..bbe27ce
--- /dev/null
+++ b/doc/pages/footer.html
@@ -0,0 +1,15 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+</head>
+
+<div class="footer">
+© 2015 <a href="homepage.html" target='content'> fastqtl.sourceforge.net</a> Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Emmanouil DERMITZAKIS
+</div>
+
+</html>
+
+
diff --git a/doc/pages/format_bed.html b/doc/pages/format_bed.html
new file mode 100644
index 0000000..6551d88
--- /dev/null
+++ b/doc/pages/format_bed.html
@@ -0,0 +1,71 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="bed1">
+ <h1>BED file</h1>
+ <p>
+ The <b>BED</b> file contains the phenotypic data in a UCSC BED derived format. It is basically a BED file with one additional column per sample.
+ Hereafter an example of 3 molecular phenotypes for 4 samples.
+ </p>
+
+ <code>
+ #Chr start end ID UNR1 UNR2 UNR3 UNR4
+ chr1 173863 173864 ENSG123 -0.50 0.82 -0.71 0.83<br>
+ chr1 685395 685396 ENSG456 -1.13 1.18 -0.03 0.11<br>
+ chr1 700304 700305 ENSG789 -1.18 1.32 -0.36 1.26<br>
+ </code>
+
+ <p>
+ This file is <i>TAB</i> delimited.
+ Each line corresponds to a single molecular phenotype.
+ The first 4 columns are:
+ </p>
+
+ <ol>
+ <li>Chromosome ID <i>[string]</i></li>
+ <li>Start genomic position of the phenotype (e.g. TSS) <i>[integer]</i></li>
+ <li>End genomic position of the phenotype (e.g. TSS) <i>[integer]</i></li>
+ <li>Phenotype ID <i>[string]</i></li>
+ </ol>
+
+ <p>
+ Then additional columns give phenotype quantifications for all samples.
+ Phenotype quantifications are encoded with floating numbers.
+ This file should have P lines and N+4 columns where P and N are the numbers of phenotypes and samples, respectively.
+ </p>
+ </div>
+
+ <div class="item" id="bed2">
+ <h1>Indexing BED file (required)</h1>
+ <p>
+ To feed FastQTL with BED files containing phenotypes, you need to index them with tabix first. Hereafter, the commands that does it:
+ </p>
+
+ <code>
+ bgzip phenotypes.bed && tabix -p bed phenotypes.bed.gz
+ </code>
+
+ <p>
+ Look <A href=http://samtools.sourceforge.net/tabix.shtml>here</A> for more details on Tabix and Bgzip command lines.
+ The above command line produces a file <b>phenotypes.bed.gz.tbi</b> that contains the index for <b>data.bed.gz</b>.
+ These tow files need to be together in the same folder in order for FastQTL do be able to also read the index file when reading <b>phenotypes.bed.gz</b>.
+ </p>
+ </div>
+
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2015-01-22 00:08:17 +0100 (Thu, 22 Jan 2015) $"); </script></div>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/doc/pages/format_cov.html b/doc/pages/format_cov.html
new file mode 100644
index 0000000..9472ba1
--- /dev/null
+++ b/doc/pages/format_cov.html
@@ -0,0 +1,52 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="cov1">
+ <h1>Covariate file</h1>
+ <p>
+ The <b>COV</b> file contains the covariate data in simple txt format.
+ Hereafter an example of 4 covariates for 4 samples.
+ </p>
+
+ <code>
+ id UNR1 UNR2 UNR3 UNR4<br>
+ PC1 -0.02 0.14 0.16 -0.02<br>
+ PC2 0.01 0.11 0.10 0.01<br>
+ PC3 0.03 0.05 0.08 0.07<br>
+ BIN 1 0 0 1
+ </code>
+
+ <p>
+ Herafter, some properties of this file:
+ <ol>
+ <li>The file is <i>TAB</i> delimited</li>
+ <li>First row gives the sample ID and each additional one corresponds to a single covariate</li>
+ <li>First column gives the covariate ID and each additional one corresponds to a sample</li>
+ <li>The file should have S+1 rows and C+1 columns where S and C are the numbers of samples and covariates, respectively.</li>
+ </ol>
+
+ <p>
+ Both quantitative and qualitative covariates are supported.
+ <b>Quantitative</b> covariates are assumed when only <b>numeric</b> values are provided.
+ <b>Qualitative</b> covariates are assumed when only <b>non-numeric</b> values are provided.
+ In practice, qualitative covariates with <i>F</i> factors are converted in <i>F-1</i> binary quantitative covariates.
+ </p>
+
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2014-09-22 15:58:58 +0200 (Mon, 22 Sep 2014) $"); </script></div>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/doc/pages/format_vcf.html b/doc/pages/format_vcf.html
new file mode 100644
index 0000000..dee6b40
--- /dev/null
+++ b/doc/pages/format_vcf.html
@@ -0,0 +1,63 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="vcf1">
+ <h1>VCF file format</h1>
+ <p>
+ The <b>VCF</b> file contains the genetic data (genotypes). Hereafter a minimal example:
+ </p>
+
+ <code>
+ ##fileformat=VCFv4.1<br>
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT UNR1 UNR2 UNR3 UNR4<br>
+ chr7 123 SNP1 A G 100 PASS INFO GT:DS 0/0:0.001 0/0:0.000 0/1:0.999 1/1:1.999<br>
+ chr7 456 SNP2 T C 100 PASS INFO GT:DS 0/0:0.001 0/0:0.000 0/1:1.100 0/0:0.100<br>
+ chr7 789 SNP3 A T 100 PASS INFO GT:DS 1/1:2.000 0/1:1.001 0/0:0.010 0/1:0.890<br>
+ </code>
+
+ <p>
+ A precise description of this file format can be found <A href=http://vcftools.sourceforge.net/specs.html>here</A>.
+ FastQTL needs at least one of the two following fields <b>GT</b> or <b>DS</b>.
+ It uses in priority the <b>DS</b> field and if absent, the <b>GT</b> field from which it derives the required dosages.
+ We strongly recommend to use dosages instead of fixed genotypes in order to account for imputation uncertainty.
+ </p>
+
+ <note>
+ Missing entries (<b>./.</b>, <b>./0</b> or <b>./1</b>) are internally imputed as mean dosage at the variant site.
+ </note>
+
+ </div>
+
+ <div class="item" id="vcf24">
+ <h1>Indexing VCF file (required)</h1>
+ <p>
+ To feed FastQTL with VCF files, you need to index them with tabix first. Hereafter, the commands that does it:
+ </p>
+
+ <code>
+ bgzip genotypes.vcf && tabix -p vcf genotypes.vcf.gz
+ </code>
+
+ <p>
+ Look <A href=http://samtools.sourceforge.net/tabix.shtml>here</A> for more details on Tabix and Bgzip command lines.
+ The above command line produces a file <b>genotypes.vcf.gz.tbi</b> that contains the index for <b>data.vcf.gz</b>.
+ These tow files need to be together in the same folder in order for FastQTL do be able to also read the index file when reading <b>genotypes.vcf.gz</b>.
+ </p>
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2015-01-22 00:08:17 +0100 (Thu, 22 Jan 2015) $"); </script></div>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/doc/pages/header.html b/doc/pages/header.html
new file mode 100644
index 0000000..16480ba
--- /dev/null
+++ b/doc/pages/header.html
@@ -0,0 +1,13 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+</head>
+
+<div class="header">
+<font size=10 color="#367EA6">F</font>ast<font size=10 color="#367EA6">QTL</font>
+</div>
+
+</html>
\ No newline at end of file
diff --git a/doc/pages/homepage.html b/doc/pages/homepage.html
new file mode 100644
index 0000000..08d83f3
--- /dev/null
+++ b/doc/pages/homepage.html
@@ -0,0 +1,56 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+ <div class="item">
+ <h1>General information</h1>
+ <p>
+ THIS WEB PAGE IS UNDER CONSTRUCTION. WE APOLOGIZE FOR INCOMPLETE INFORMATION.
+ </p>
+
+ <p>
+ <b>FastQTL</b> is a QTL mapper with several notable features:
+ <ul>
+ <li><b>Fast</b>; with a permutation scheme relying on Beta approximation. No need to perform millions of permutations to reach low significance levels, only 100 to 1,000 are needed!</li>
+ <li><b>Flexible</b>; association testing is implemented w/o qualitative/quantitative covariates. Phenotypes can be internally quantile normalized. </li>
+ <li><b>User-friendly</b>; only standard file formats are used and easy-to-use options implemented.</li>
+ <li><b>Cluster-friendly</b>; parallelization is easy to set up for large deployment on compute clusters.</li>
+ </ul>
+ </p>
+
+ <p>
+ <b>FastQTL</b> is developed by Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS.<br>
+ <b>FastQTL</b> is a project developed in the <a href="http://funpopgen.unige.ch/">FunPopGen lab</a>.
+ </p>
+ </div>
+
+ <div class="item">
+ <h1>Publications and projects using FastQTL</h1>
+ <p>
+ Paper to cite when FastQTL is used:<br>
+ <b>Ongen, H. et al. Fast and efficient QTL mapper for molecular phenotypes. Submitted.</b>
+ </p>
+
+ <p>
+ <b>FastQTL</b> is being used in the following projects:
+ <ul>
+ <li><a href="http://www.gtexportal.org/home/">GTEx</a></li>
+ <li>Sinergia (paper to come)</li>
+ </ul>
+ </p>
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2015-04-07 10:49:55 +0200 (Tue, 07 Apr 2015) $"); </script></div>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/doc/pages/install.html b/doc/pages/install.html
new file mode 100644
index 0000000..4ed9c70
--- /dev/null
+++ b/doc/pages/install.html
@@ -0,0 +1,97 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="item1">
+ <div class="options">
+ <h1>Downloading <b>FastQTL</b></h1>
+ <p>
+ Hereafter, the different versions of FastQTL.
+ </p>
+
+ <table width=100%>
+ <tr>
+ <th width=20%>Version</th>
+ <th width=80%>Description</th>
+ </tr>
+ <tr>
+ <td>
+ <a href="http://fastqtl.sourceforge.net/files/FastQTL-2.165.linux.tgz">v2.165 (Binary + Examples for Linux)</a>
+ </td>
+ <td>
+ Binary of the first version released to the public.
+ It includes functions to perform a nominal pass of association and permutations.
+ <b>Source code will be made available upon publication.</b>
+ If it doesn't work on your machine, please contact <b>olivier.delaneau at gmail.com</b> to request a version compiled on a different machine.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <a href="http://fastqtl.sourceforge.net/files/FastQTL-2.165.macos.tgz">v2.165 (Binary + Examples for MacOS > 10.6)</a>
+ </td>
+ <td>
+ Binary of the first version released to the public.
+ It includes functions to perform a nominal pass of association and permutations.
+ <b>Source code will be made available upon publication.</b>
+ If it doesn't work on your machine, please contact <b>olivier.delaneau at gmail.com</b> to request a version compiled on a different machine.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <a href="http://fastqtl.sourceforge.net/files/FastQTL-2.166.linux.tgz">v2.166 (Binary + Source + Examples for Linux)</a>
+ </td>
+ <td>
+ Binary of the second version released to the public.
+ It includes a new column in the output file : the slope of the linear regression.
+ <b>Source code is now available!</b>
+ Source code is located in the src folder. Instructions to compile the code can be found in the next section or in the INSTALL file provided with the tarball.
+ </td>
+ </tr>
+
+ </table>
+ </div>
+ </div>
+
+ <div class="item" id="item1">
+ <h1>Installing <b>FastQTL</b></h1>
+ <p>
+ In the <i>bin</i> folder is a static binary that should work on most machines.
+ If not, you can re-compile FastQTL following the following steps.
+ First, few libraries are needed:
+ </p>
+
+ <ul>
+ <li><b>The GNU Scientific Library:</b> This library is usually installed by default on most computers. FastQTL needs it for maximum likelihood estimations. Instructions to install it can be found <a href="http://www.gnu.org/software/gsl/">here</a>.</li>
+ <li><b>The Boost C++ libraries:</b> This collection of C++ libraries is also usually installed by default on most computers. FastQTL needs it to handle file descriptors and parse program options. Instructions to install it can be found <a href="http://www.boost.org/">here</a>.</li>
+ <li><b>The Zlib library:</b> This standard library allows gzipped files to be internally (un-)compressed by FastQTL. Instructions to install it can be found <a href="http://www.zlib.net/">here</a>.</li>
+ <li><b>The standalone Rmath library:</b> This (very useful) library provided by R contains all basic functions to simulate from standard probability distributions (ex: qnorm, pnorm, rnorm, etc ...). Instructions to install it can be found <a href="http://cran.r-project.org/doc/manuals/r-release/R-admin.html#The-standalone-Rmath-library">here</a>. FastQTL uses its F, Beta and Normal distribution routines. A pre-compiled version can be directly obtained by installing the package <i>r-mat [...]
+ </ul>
+
+ <p>
+ Once all libraries installed, just go in the FastQTL folder and compile the code using:
+ </p>
+
+ <code>
+ make
+ </code>
+
+ <p>
+ This creates a binary of the program in the <i>bin</i> folder nammed <i>fastQTL</i>.
+ </p>
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2014-10-25 12:59:59 +0200 (Sat, 25 Oct 2014) $"); </script></div>
+</div>
+</body>
+</html>
diff --git a/doc/pages/leftmenu.html b/doc/pages/leftmenu.html
new file mode 100644
index 0000000..a40f612
--- /dev/null
+++ b/doc/pages/leftmenu.html
@@ -0,0 +1,37 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+</head>
+
+<div class="sidenav">
+ <div class="item">
+ <h1>Basic information</h1>
+ <ul>
+ <li><a href="homepage.html" target='content'>Homepage</a></li>
+ <li><a href="install.html" target='content'>Download & Installation</a></li>
+ <li><a href="options.html" target='content'>Options list</a></li>
+ </ul>
+ </div>
+
+ <div class="item">
+ <h1>File formats</h1>
+ <ul>
+ <li><a href="format_vcf.html" target='content'>Genotypes</a></li>
+ <li><a href="format_bed.html" target='content'>Phenotypes</a></li>
+ <li><a href="format_cov.html" target='content'>Covariates</a></li>
+ </ul>
+ </div>
+
+ <div class="item">
+ <h1>Types of analysis</h1>
+ <ul>
+ <li><a href="cis_nominal.html" target='content'>A1: Nominal pass in <i>cis</i></a></li>
+ <li><a href="cis_permutation.html" target='content'>A2: Permutation pass in <i>cis</i></a></li>
+ <li><a href="cis_mapping.html" target='content'>A3: Mapping QTLs in <i>cis</i></a></li>
+ </ul>
+ </div>
+</div>
+</html>
diff --git a/doc/pages/options.html b/doc/pages/options.html
new file mode 100644
index 0000000..03b8e3b
--- /dev/null
+++ b/doc/pages/options.html
@@ -0,0 +1,247 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+ <div class="item">
+ <div class="options">
+ <h1><font size="6">fastqtl [options]</font></h1>
+
+ <p>
+ Hereafter, the complete list of options for FastQTL in order to map cisQTL.
+ </p>
+
+ <table width=100%>
+ <tr>
+ <th width=30%>Basic options</th>
+ <th width=70%>Description</th>
+ </tr>
+ <tr>
+ <td>
+ <b>--help</b><br>
+ -H
+ </td>
+ <td>
+ Print help about options.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--version</b><br>
+ -V
+ </td>
+ <td>
+ Print the FastQTL binary version.
+ </td>
+ </tr>
+ </table>
+
+ <br>
+ <br>
+ <table width=100%>
+ <tr>
+ <th width=30%>Input file options</th>
+ <th width=70%>Description</th>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--vcf file.vcf</b><br>
+ -V file.vcf<br>
+ -V file.vcf.gz
+ </td>
+ <td>
+ Genotypes in VCF 4.1 format<br>
+ Field <b>DS</b> is used as default for dosages.<br>
+ If no DS field is present, FQTL derives dosages from the <b>GT</b> field.<br>
+ Brief description <a href="vcf.html" target='content'>here</a>.<br>
+ Detailed description <A href=http://vcftools.sourceforge.net/specs.html>here</A>.<br>
+ This file needs to be compressed with BGZIP and indexed with <A href=http://sourceforge.net/projects/samtools/files/tabix/>TABIX</A>.
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--bed file.bed</b><br>
+ -B file.bed<br>
+ -B file.bed.gz
+ </td>
+ <td>
+ Phenotypes in UCSC BED extended format.<br>
+ Brief description <a href="bed.html" target='content'>here</a>.<br>
+ Detailed description <A href=http://genome.ucsc.edu/FAQ/FAQformat.html#format1>here</A>.
+ This file needs to be compressed with BGZIP and indexed with <A href=http://sourceforge.net/projects/samtools/files/tabix/>TABIX</A>.
+ </td>
+ </tr>
+ </table>
+
+ <br>
+ <br>
+ <table width=100%>
+ <tr>
+ <th width=30%>Output file options</th>
+ <th width=70%>Description</th>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--out file.results</b><br>
+ -O file.results.gz<br>
+ </td>
+ <td>
+ Output file.
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--log file.log</b><br>
+ -L file.log<br>
+ </td>
+ <td>
+ Log file. It is basically a copy of the screen output. If this option is not provided, the software will generate one automatically with a UUID based filename.
+ </td>
+ </tr>
+ </table>
+
+
+ <br>
+ <br>
+ <table width=100%>
+ <tr>
+ <th width=30%>Filtering options</th>
+ <th width=70%>Description</th>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--exclude-samples file.exc</b><br>
+ <b>--exclude-sites file.exc</b><br>
+ <b>--exclude-phenotypes file.exc</b><br>
+ <b>--exclude-covariates file.exc</b>
+ </td>
+ <td>
+ To specify files containing all samples, variants, phenotypes or covariates to be excluded from the analysis.
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--include-samples file.exc</b><br>
+ <b>--include-sites file.exc</b><br>
+ <b>--include-phenotypes file.exc</b><br>
+ <b>--include-covariates file.exc</b>
+ </td>
+ <td>
+ To specify files containing all samples, variants, phenotypes or covariates to be included from the analysis.
+ </td>
+ </tr>
+ </table>
+
+ <br>
+ <br>
+ <table width=100%>
+ <tr>
+ <th width=30%>Method parameters</th>
+ <th width=70%>Description</th>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--normal</b><br>
+ </td>
+ <td>
+ To perform quantile normalization on the phenotype quantifications to make them normally distributed.
+ Implemeted as the <i>rntransform</i> function of the GenABEL package. Can be found <A href=http://www.genabel.org/packages/GenABEL>here</A>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--window [float]</b><br>
+ </td>
+ <td>
+ Cis-window size. Default values is 1Mb (1e6).
+ It means that all variants within 1e6 bp of the phenotype location (e.g. TSS) is analyzed.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--threshold [float]</b><br>
+ </td>
+ <td>
+ To filter out all phenotype-variant pairs with a p-value above the specified threshold in the output of a nominal pass.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--permute [int]</b><br>
+ <b>--permute [int] [int]</b><br>
+ </td>
+ <td>
+ Perform permutations and report only best candidate QTL per MP. When one argument (=X) is given, a fixed number of permutations (=x) is performed.
+ When two arguments are given (=X1 and X2), the adaptive permutation scheme is used and between X1 and X2 permutations are performed depending on the significance of the tested MP.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--seed</b><br>
+ </td>
+ <td>
+ Random number generator seed. Useful to replicate runs of the software.
+ </td>
+ </tr>
+ </table>
+
+ <br>
+ <br>
+ <table width=100%>
+ <tr>
+ <th width=30%>Parallelization</th>
+ <th width=70%>Description</th>
+ </tr>
+ <tr>
+ <td>
+ <b>--region chr:start-end</b><br>
+ -R 12:1000000-2000000
+ </td>
+ <td>
+ Define a chunk of molecular phenotypes to be processed.
+ Genotype data is automatically extracted given the phenotype region and the cis-window size.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>--chunk [int] [int]</b>
+ </td>
+ <td>
+ Specify which chunk of data needs to be processed. Needs two arguments (=X1, X2). Tells to FastQTL to process chunk X1 out of X2 chunks.
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <b>--commands [int] file.commands</b>
+ </td>
+ <td>
+ Split the whole analysis in X jobs (first argument) and write the resulting X commands in <i>file.commands</i>.
+ </td>
+ </tr>
+
+ </table>
+
+ </div>
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2015-01-22 00:08:17 +0100 (Thu, 22 Jan 2015) $"); </script></div>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/doc/pages/template.html b/doc/pages/template.html
new file mode 100644
index 0000000..3964201
--- /dev/null
+++ b/doc/pages/template.html
@@ -0,0 +1,51 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>
+<meta http-equiv="Cache-Control" content="max-age=3600"/>
+<meta name="description" content="description"/>
+<meta name="keywords" content="keywords"/>
+<meta name="author" content="author"/>
+<link rel="stylesheet" type="text/css" href="../style/default.css" media="screen"/>
+<title>FastQTL</title>
+<script language="Javascript" src="../script/print_last_modif_date.js" ></script>
+</head>
+
+<body>
+<div class="content">
+
+ <div class="item" id="item1">
+ <h1>Title1</h1>
+ <p>
+ Text1.1
+ </p>
+
+ <code>
+ Code1
+ </code>
+
+ <p>
+ Text1.2
+ </p>
+
+ </div>
+
+ <div class="item" id="item2">
+ <h1>Title1</h1>
+ <p>
+ Text2.1
+ </p>
+
+ <code>
+ Code2
+ </code>
+
+ <p>
+ Text2.2
+ </p>
+
+ </div>
+
+ <div class="log"><script type="text/javascript"> print_last_modif_date("$Date: 2014-10-25 12:59:59 +0200 (Sat, 25 Oct 2014) $"); </script></div>
+</div>
+</body>
+</html>
diff --git a/doc/script/print_last_modif_date.js b/doc/script/print_last_modif_date.js
new file mode 100644
index 0000000..df001f4
--- /dev/null
+++ b/doc/script/print_last_modif_date.js
@@ -0,0 +1,3 @@
+function print_last_modif_date(v) {
+document.write("Last updated " + v.substr(7, 19));
+}
diff --git a/doc/style/default.css b/doc/style/default.css
new file mode 100644
index 0000000..6ded0c2
--- /dev/null
+++ b/doc/style/default.css
@@ -0,0 +1,228 @@
+/* COMMON */
+
+* {
+ margin: 0;
+ padding: 0;
+}
+
+a {
+ color: #36C;
+}
+
+a:hover {
+ color: #06F;
+}
+
+
+body {
+ color: #444;
+ font: normal 90% "Lucida Sans Unicode",sans-serif;
+ background-color: #EEE;
+ margin: 0;
+}
+
+input {
+ color: #555;
+ font: normal 1.1em "Lucida Sans Unicode",sans-serif;
+}
+
+p,note,code,ul, ol {
+ font-size: 1.2em;
+ padding-bottom: 1.2em;
+}
+
+h1 {
+ font-size: 1.4em;
+}
+
+.highligth {
+ color: #f00
+}
+
+code {
+ border: 1px solid #F0F0F0;
+ border-left: 5px solid #39F;
+ color: #555;
+ display: block;
+ /* font: normal 1.1em "Lucida Sans Unicode",serif; */
+ font: normal 1.2em "MS Courier New", monospace;
+ margin-bottom: 12px;
+ padding: 8px 10px;
+}
+
+code table {
+ border-collapse:collapse;
+ font: normal 1em "MS Courier New", monospace;
+}
+
+code th, td {
+ border:0px;
+ padding-right:20px;
+ text-align:left;
+}
+
+note {
+ border: 1px solid #F0F0F0;
+ border-left: 5px solid #F0F0F0;
+ display: block;
+ font: normal "Lucida Sans Unicode",sans-serif;
+ font-size: 1.3em;
+ margin-bottom: 12px;
+ padding: 8px 10px;
+}
+
+h1,h2,h3 {
+ color: #367EA6;
+}
+
+/* HEADER */
+
+.header {
+ color: #666;
+ background: #EEE;
+ font: bold 3em "MS Courier New", monospace;
+ margin-bottom: 0px;
+ text-align: center;
+}
+
+/* FOOTER */
+.footer {
+ background: #EEE;
+ color: #666;
+ font-size: 1.2em;
+ text-align: center;
+}
+.footer a {
+ color: #36C;
+ text-decoration: none;
+}
+.footer a:hover {
+ color: #06F;
+ text-decoration: underline;
+}
+
+/* LEFT MENU */
+.sidenav {
+ padding: 10px;
+}
+
+.sidenav .item {
+ font-size: 1em;
+ padding: 6px 12px;
+ border: 1px solid #CCC;
+ background: #FFF;
+ margin-bottom: 8px;
+}
+
+.sidenav h1 {
+ color: #367EA6;
+ margin-bottom: 8px;
+}
+
+.sidenav ul {
+ margin: 5px;
+ padding: 0px;
+}
+.sidenav li {
+ font-size: 0.8em;
+ list-style: none;
+}
+
+.sidenav a {
+ color: #367EA6;
+ text-decoration: none;
+}
+
+.sidenav a:hover {
+ color: #111;
+}
+
+.sidenav li a {
+ color: #666;
+ display: block;
+ font-size: 1em;
+ padding: 3px 6px 3px 14px;
+ text-decoration: none;
+}
+.sidenav li a:hover {
+ color: #111;
+}
+
+/* CONTENT */
+.content {
+ padding: 10px;
+}
+
+.content .item {
+ padding: 6px 12px;
+ border: 1px solid #CCC;
+ background: #FFF;
+ margin-bottom: 8px;
+}
+
+.content .log {
+ /*padding: 6px 12px;*/
+ border: 1px solid #CCC;
+ background: #FFF;
+ /*margin-bottom: 8px;*/
+ text-align: center;
+ width: 250px;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+.content .descr {
+ color: #333;
+ font-size: 0.8em;
+ margin-bottom: 6px;
+}
+
+.content h1 {
+ margin-bottom: 20px;
+}
+
+.content ul {
+ list-style-image: url(../img/li.gif);
+}
+
+.content li {
+ margin-left: 18px;
+}
+
+/* TABLE OPTION */
+
+.options table {
+ border: 1px solid #BBB;
+ border-collapse: collapse;
+}
+
+.options th {
+ font: bold 0.9em "MS Courier New", monospace;
+ color: #367EA6;
+ border: 1px solid #BBB;
+ text-align: left;
+ vertical-align: top;
+ padding: 5px;
+}
+
+.options td {
+ font: normal 0.8em "MS Courier New", monospace;
+ border: 1px solid #BBB;
+ text-align: left;
+ vertical-align: top;
+ padding: 0px 5px;
+}
+
+/* TABLE FORM */
+
+.form table {
+ border: 0px;
+ border-collapse: collapse;
+}
+
+.form td {
+ font: normal 0.8em "MS Courier New";
+ border: 0px;
+ text-align: left;
+ padding: 5px;
+}
\ No newline at end of file
diff --git a/example/covariates.txt.gz b/example/covariates.txt.gz
new file mode 100644
index 0000000..04056e3
Binary files /dev/null and b/example/covariates.txt.gz differ
diff --git a/example/genotypes.vcf.gz b/example/genotypes.vcf.gz
new file mode 100644
index 0000000..7eabd69
Binary files /dev/null and b/example/genotypes.vcf.gz differ
diff --git a/example/genotypes.vcf.gz.tbi b/example/genotypes.vcf.gz.tbi
new file mode 100644
index 0000000..f73258f
Binary files /dev/null and b/example/genotypes.vcf.gz.tbi differ
diff --git a/example/interaction.txt b/example/interaction.txt
new file mode 100644
index 0000000..7d3b134
--- /dev/null
+++ b/example/interaction.txt
@@ -0,0 +1,373 @@
+HG00096 0
+HG00097 1
+HG00099 0
+HG00100 1
+HG00101 0
+HG00102 1
+HG00103 0
+HG00104 1
+HG00105 0
+HG00106 1
+HG00108 0
+HG00109 1
+HG00110 0
+HG00111 1
+HG00112 0
+HG00114 1
+HG00115 0
+HG00116 1
+HG00117 0
+HG00118 1
+HG00119 0
+HG00120 1
+HG00121 0
+HG00122 1
+HG00123 0
+HG00124 1
+HG00125 0
+HG00126 1
+HG00127 0
+HG00128 1
+HG00129 0
+HG00130 1
+HG00131 0
+HG00132 1
+HG00133 0
+HG00134 1
+HG00135 0
+HG00136 1
+HG00137 0
+HG00138 1
+HG00139 0
+HG00141 1
+HG00142 0
+HG00143 1
+HG00145 0
+HG00146 1
+HG00148 0
+HG00149 1
+HG00150 0
+HG00151 1
+HG00152 0
+HG00154 1
+HG00155 0
+HG00156 1
+HG00157 0
+HG00158 1
+HG00159 0
+HG00160 1
+HG00171 0
+HG00173 1
+HG00174 0
+HG00176 1
+HG00177 0
+HG00178 1
+HG00179 0
+HG00180 1
+HG00181 0
+HG00182 1
+HG00183 0
+HG00185 1
+HG00186 0
+HG00187 1
+HG00188 0
+HG00189 1
+HG00231 0
+HG00232 1
+HG00233 0
+HG00234 1
+HG00235 0
+HG00236 1
+HG00238 0
+HG00239 1
+HG00240 0
+HG00242 1
+HG00243 0
+HG00244 1
+HG00245 0
+HG00246 1
+HG00247 0
+HG00249 1
+HG00250 0
+HG00251 1
+HG00252 0
+HG00253 1
+HG00255 0
+HG00256 1
+HG00257 0
+HG00258 1
+HG00259 0
+HG00260 1
+HG00261 0
+HG00262 1
+HG00263 0
+HG00264 1
+HG00265 0
+HG00266 1
+HG00267 0
+HG00268 1
+HG00269 0
+HG00271 1
+HG00272 0
+HG00273 1
+HG00274 0
+HG00275 1
+HG00276 0
+HG00277 1
+HG00278 0
+HG00280 1
+HG00281 0
+HG00282 1
+HG00284 0
+HG00285 1
+HG00306 0
+HG00308 1
+HG00309 0
+HG00310 1
+HG00311 0
+HG00312 1
+HG00313 0
+HG00315 1
+HG00319 0
+HG00320 1
+HG00321 0
+HG00323 1
+HG00324 0
+HG00325 1
+HG00326 0
+HG00327 1
+HG00328 0
+HG00329 1
+HG00330 0
+HG00331 1
+HG00332 0
+HG00334 1
+HG00335 0
+HG00336 1
+HG00337 0
+HG00338 1
+HG00339 0
+HG00341 1
+HG00342 0
+HG00343 1
+HG00344 0
+HG00345 1
+HG00346 0
+HG00349 1
+HG00350 0
+HG00351 1
+HG00353 0
+HG00355 1
+HG00356 0
+HG00358 1
+HG00359 0
+HG00360 1
+HG00361 0
+HG00362 1
+HG00364 0
+HG00365 1
+HG00366 0
+HG00367 1
+HG00369 0
+HG00371 1
+HG00372 0
+HG00373 1
+HG00375 0
+HG00376 1
+HG00377 0
+HG00378 1
+HG00379 0
+HG00380 1
+HG00381 0
+HG00382 1
+HG00383 0
+HG00384 1
+HG01334 0
+HG01789 1
+HG01790 0
+HG01791 1
+HG02215 0
+NA06984 1
+NA06985 0
+NA06986 1
+NA06989 0
+NA06994 1
+NA07037 0
+NA07048 1
+NA07051 0
+NA07056 1
+NA07346 0
+NA07347 1
+NA07357 0
+NA10847 1
+NA10851 0
+NA11829 1
+NA11830 0
+NA11831 1
+NA11832 0
+NA11840 1
+NA11843 0
+NA11881 1
+NA11892 0
+NA11893 1
+NA11894 0
+NA11918 1
+NA11920 0
+NA11930 1
+NA11931 0
+NA11992 1
+NA11993 0
+NA11994 1
+NA11995 0
+NA12004 1
+NA12005 0
+NA12006 1
+NA12043 0
+NA12044 1
+NA12045 0
+NA12058 1
+NA12144 0
+NA12154 1
+NA12155 0
+NA12156 1
+NA12234 0
+NA12249 1
+NA12272 0
+NA12273 1
+NA12275 0
+NA12282 1
+NA12283 0
+NA12286 1
+NA12287 0
+NA12340 1
+NA12341 0
+NA12342 1
+NA12347 0
+NA12348 1
+NA12383 0
+NA12399 1
+NA12400 0
+NA12413 1
+NA12489 0
+NA12546 1
+NA12716 0
+NA12717 1
+NA12718 0
+NA12749 1
+NA12750 0
+NA12751 1
+NA12760 0
+NA12761 1
+NA12762 0
+NA12763 1
+NA12775 0
+NA12776 1
+NA12777 0
+NA12778 1
+NA12812 0
+NA12813 1
+NA12814 0
+NA12815 1
+NA12827 0
+NA12829 1
+NA12830 0
+NA12842 1
+NA12843 0
+NA12872 1
+NA12873 0
+NA12874 1
+NA12889 0
+NA12890 1
+NA20502 0
+NA20503 1
+NA20504 0
+NA20505 1
+NA20506 0
+NA20507 1
+NA20508 0
+NA20509 1
+NA20510 0
+NA20512 1
+NA20513 0
+NA20514 1
+NA20515 0
+NA20516 1
+NA20517 0
+NA20518 1
+NA20519 0
+NA20520 1
+NA20521 0
+NA20524 1
+NA20525 0
+NA20527 1
+NA20528 0
+NA20529 1
+NA20530 0
+NA20531 1
+NA20532 0
+NA20534 1
+NA20535 0
+NA20536 1
+NA20537 0
+NA20538 1
+NA20539 0
+NA20540 1
+NA20541 0
+NA20542 1
+NA20543 0
+NA20544 1
+NA20581 0
+NA20582 1
+NA20585 0
+NA20586 1
+NA20588 0
+NA20589 1
+NA20752 0
+NA20754 1
+NA20756 0
+NA20757 1
+NA20758 0
+NA20759 1
+NA20760 0
+NA20761 1
+NA20765 0
+NA20766 1
+NA20768 0
+NA20769 1
+NA20770 0
+NA20771 1
+NA20772 0
+NA20773 1
+NA20774 0
+NA20778 1
+NA20783 0
+NA20785 1
+NA20786 0
+NA20787 1
+NA20790 0
+NA20792 1
+NA20795 0
+NA20796 1
+NA20797 0
+NA20798 1
+NA20799 0
+NA20800 1
+NA20801 0
+NA20802 1
+NA20803 0
+NA20804 1
+NA20805 0
+NA20806 1
+NA20807 0
+NA20808 1
+NA20809 0
+NA20810 1
+NA20811 0
+NA20812 1
+NA20813 0
+NA20814 1
+NA20815 0
+NA20816 1
+NA20819 0
+NA20826 1
+NA20828 0
diff --git a/example/phenotypes.bed.gz b/example/phenotypes.bed.gz
new file mode 100644
index 0000000..a38e962
Binary files /dev/null and b/example/phenotypes.bed.gz differ
diff --git a/example/phenotypes.bed.gz.tbi b/example/phenotypes.bed.gz.tbi
new file mode 100644
index 0000000..b307a37
Binary files /dev/null and b/example/phenotypes.bed.gz.tbi differ
diff --git a/script/calulateNominalPvalueThresholds.R b/script/calulateNominalPvalueThresholds.R
new file mode 100644
index 0000000..0e95c51
--- /dev/null
+++ b/script/calulateNominalPvalueThresholds.R
@@ -0,0 +1,33 @@
+suppressMessages(library(qvalue))
+
+args <- commandArgs(trailingOnly = TRUE)
+
+ifile = args[1]
+fdr = as.numeric(args[2]);
+
+cat("Processing fastQTL concatenated output [", ifile, "] controlling for FDR =", fdr * 100, "%\n");
+
+#Read data
+D = read.table(ifile, hea=FALSE, stringsAsFactors=FALSE)
+D = D[which(!is.na(D[, 11])),]
+cat(" * Number of molecular phenotypes =" , nrow(D), "\n")
+cat(" * Correlation between Beta approx. and Empirical p-values =", round(cor(D[, 9], D[, 10]), 4), "\n")
+
+#Run qvalue on pvalues for best signals
+Q = qvalue(D[, 11])
+D$qval = Q$qvalue
+cat(" * Proportion of significant phenotypes =" , round((1 - Q$pi0) * 100, 2), "%\n")
+
+#Determine significance threshold
+set0 = D[which(D$qval <= fdr),]
+set1 = D[which(D$qval > fdr),]
+pthreshold = (sort(set1$V11)[1] - sort(-1.0 * set0$V11)[1]) / 2
+cat(" * Corrected p-value threshold = ", pthreshold, "\n")
+
+#Calculate nominal pvalue thresholds
+D$nthresholds = qbeta(pthreshold, D$V3, D$V4, ncp = 0, lower.tail = TRUE, log.p = FALSE)
+
+#Write output
+write.table(D[, c(1, 13, 12)], args[3], quote=FALSE, row.names=FALSE, col.names=FALSE)
+
+cat("Done\n")
diff --git a/src/analysisMapping.cpp b/src/analysisMapping.cpp
new file mode 100644
index 0000000..17ece06
--- /dev/null
+++ b/src/analysisMapping.cpp
@@ -0,0 +1,142 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+#define MAX_ANALYSIS_DEPTH 10
+
+void data::runMapping(string fout, bool full) {
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ if (targetGenotypes.size() > 0) {
+
+ LOG.println(" * Nominal significance threshold = " + sutils::double2str(phenotype_threshold[p]));
+
+ // 1. Forward pass: Learn number of independent signals and Map the best candidates
+ vector < double > bestCorr = vector < double > (MAX_ANALYSIS_DEPTH, 0.0);
+ vector < double > uncorrected_pvalues = vector < double > (targetGenotypes.size(), 2.0);
+ vector < int > bestIndex;
+ bool done = false;
+
+ for (int i = 0 ; i < MAX_ANALYSIS_DEPTH && !done; i ++) {
+ int n_significant = 0;
+ vector < float > phenotype_curr = phenotype_orig[p];
+ copy(genotype_orig.begin() + targetGenotypes[0], genotype_orig.begin() + targetGenotypes.back() + 1, genotype_curr.begin() + targetGenotypes[0]);
+ vector < int > bestIndex_tmp = bestIndex;
+ bestIndex_tmp.push_back(-1);
+
+ //Covariates + Best signals
+ covariate_engine->clearSoft();
+ for (int h = 0 ; h < bestIndex.size() ; h ++) covariate_engine->pushSoft(genotype_orig[bestIndex[h]]);
+ covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ if (i == 0 || full || (!full && uncorrected_pvalues[g] <= phenotype_threshold[p])) {
+ covariate_engine->residualize(genotype_curr[targetGenotypes[g]]);
+ normalize(genotype_curr[targetGenotypes[g]]);
+ double corr = getCorrelation(genotype_curr[targetGenotypes[g]], phenotype_curr);
+ double pvalue = getPvalue(corr, sample_count - 2 - covariate_engine->nCovariates());
+ if (abs(corr) > abs(bestCorr[i])) {
+ bestCorr[i] = corr;
+ bestIndex_tmp[i] = targetGenotypes[g];
+ }
+ if (i == 0) uncorrected_pvalues[g] = pvalue;
+ if (pvalue <= phenotype_threshold[p]) n_significant ++;
+ }
+ }
+ if (n_significant == 0) done = true;
+ else bestIndex = bestIndex_tmp;
+ }
+ LOG.println(" * Number of independent signals found = " + sutils::int2str(bestIndex.size()));
+
+ if (bestIndex.size() == 1) {
+ int n_signals = 0;
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ if (uncorrected_pvalues[g] <= phenotype_threshold[p]) {
+ fdo << phenotype_id[p] << " 0 " << genotype_id[targetGenotypes[g]] << " " << targetDistances[g] << " " << uncorrected_pvalues[g] << " " << uncorrected_pvalues[g] << " " << (bestIndex[0] == targetGenotypes[g]) << endl;
+ n_signals ++;
+ }
+ }
+ LOG.println(" * Number of candidate QTLs reported for rank 1 = " + sutils::int2str(n_signals));
+ } else if (bestIndex.size() > 1) {
+ //2. Backward pass: Determine candidate variants and classify them
+ vector < vector < double > > corrected_pvalues = vector < vector < double > > (bestIndex.size(), vector < double > (targetGenotypes.size(), 2.0));
+ for (int i = bestIndex.size() - 1 ; i >= 0 ; i --) {
+ vector < float > phenotype_curr = phenotype_orig[p];
+ copy(genotype_orig.begin() + targetGenotypes[0], genotype_orig.begin() + targetGenotypes.back() + 1, genotype_curr.begin() + targetGenotypes[0]);
+
+ vector < int > bestIndexOthers = bestIndex;
+ bestIndexOthers.erase(bestIndexOthers.begin() + i);
+
+ covariate_engine->clearSoft();
+ for (int h = 0 ; h < bestIndexOthers.size() ; h ++) covariate_engine->pushSoft(genotype_orig[bestIndexOthers[h]]);
+ covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ if (full || (!full && uncorrected_pvalues[g] <= phenotype_threshold[p])) {
+ covariate_engine->residualize(genotype_curr[targetGenotypes[g]]);
+ normalize(genotype_curr[targetGenotypes[g]]);
+ double corr = getCorrelation(genotype_curr[targetGenotypes[g]], phenotype_curr);
+ corrected_pvalues[i][g] = getPvalue(corr, sample_count - 2 - covariate_engine->nCovariates());
+ }
+ }
+ }
+
+ //
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double min_pvalue = 1.1;
+ for (int i = 0; i < bestIndex.size() ; i ++) if (corrected_pvalues[i][g] < min_pvalue) min_pvalue = corrected_pvalues[i][g];
+ for (int i = 0; i < bestIndex.size() ; i ++) if (corrected_pvalues[i][g] != min_pvalue) {
+ corrected_pvalues[i][g] = 2.0;
+ }
+ }
+
+ //
+ vector < int > n_signals = vector < int > (bestIndex.size(), 0);
+ for (int i = 0; i < bestIndex.size() ; i ++) {
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ if ((bestIndex[i] == targetGenotypes[g]) || (corrected_pvalues[i][g] <= phenotype_threshold[p])) {
+ fdo << phenotype_id[p] << " " << i << " " << genotype_id[targetGenotypes[g]] << " " << targetDistances[g] << " " << uncorrected_pvalues[g] << " " << corrected_pvalues[i][g] << " " << (bestIndex[i] == targetGenotypes[g]) << endl;
+ n_signals [i] ++;
+ }
+ }
+ }
+
+ //
+ for (int i = 0; i < bestIndex.size() ; i ++)
+ LOG.println(" * Number of candidate QTLs reported for rank "+ sutils::int2str(i) + " = " + sutils::int2str(n_signals[i]));
+ }
+ }
+
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/analysisNominal.cpp b/src/analysisNominal.cpp
new file mode 100644
index 0000000..9eca9fd
--- /dev/null
+++ b/src/analysisNominal.cpp
@@ -0,0 +1,69 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runNominal(string fout, double threshold) {
+
+ //0. Prepare genotypes
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes & phenotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ covariate_engine->residualize(phenotype_orig);
+ }
+ for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
+ for (int p = 0 ; p < phenotype_count ; p ++) phenotype_sd[p] = RunningStat(phenotype_orig[p]).StandardDeviation();
+ normalize(genotype_orig);
+ normalize(phenotype_orig);
+
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Nominal pass: scan cis-window & compute statistics
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_orig[p]);
+ double pval = getPvalue(corr, sample_count - 2 - covariate_count);
+ double slope = getSlope(corr, phenotype_sd[p], genotype_sd[targetGenotypes[g]]);
+ if (pval <= threshold ) {
+ fdo << phenotype_id[p];
+ fdo << " " << genotype_id[targetGenotypes[g]];
+ fdo << " " << targetDistances[g];
+ fdo << " " << pval;
+ fdo << " " << slope;
+ fdo << endl;
+ }
+ }
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/analysisPermutation.cpp b/src/analysisPermutation.cpp
new file mode 100644
index 0000000..3f2a049
--- /dev/null
+++ b/src/analysisPermutation.cpp
@@ -0,0 +1,136 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runPermutation(string fout, vector < int > nPermutations) {
+
+ //0. Prepare genotypes
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ }
+ for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
+ normalize(genotype_orig);
+
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Copy original data
+ vector < float > phenotype_curr = phenotype_orig[p];
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ phenotype_sd[p] = RunningStat(phenotype_curr).StandardDeviation();
+ normalize(phenotype_curr);
+
+ //1.3. Nominal pass: scan cis-window & compute statistics
+ double bestCorr = 0.0;
+ int bestDistance = ___LI___, bestIndex = -1;
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < bestDistance)) {
+ bestCorr = corr;
+ bestDistance = targetDistances[g];
+ bestIndex = targetGenotypes[g];
+ }
+ }
+ if (targetGenotypes.size() > 0) LOG.println(" * Best correlation = " + sutils::double2str(bestCorr, 4));
+
+ //1.4. Permutation pass:
+ bool done = false;
+ int countPermutations = 0, nBetterCorrelation = 0;
+ vector < double > permCorr;
+ do {
+ double bestCperm = 0.0;
+ phenotype_curr = phenotype_orig[p];
+ random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCperm)) bestCperm = corr;
+ }
+ if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
+ permCorr.push_back(bestCperm);
+ countPermutations++;
+
+ if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
+ if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
+ if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
+ } while (!done);
+ if (targetGenotypes.size() > 0) LOG.println(" * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));
+
+ //1.5. Calculate effective DFs & Beta distribution parameters
+ vector < double > permPvalues;
+ double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
+ double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
+ fdo << phenotype_id[p] << " " << targetGenotypes.size();
+ if (targetGenotypes.size() > 0) {
+ //Estimate number of degrees of freedom
+ if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
+ learnDF(permCorr, true_df);
+ //LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
+ }
+ //Compute mean and variance of p-values
+ for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
+ mean /= permPvalues.size();
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
+ variance /= (permPvalues.size() - 1);
+ //Estimate shape1 & shape2
+ if (targetGenotypes.size() > 1 && mean != 1.0) {
+ beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
+ beta_shape2 = beta_shape1 * (1 / mean - 1);
+ if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
+ }
+ LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
+ fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
+ } else fdo << " NA NA NA";
+
+ //1.6. Writing results
+ if (targetGenotypes.size() > 0 && bestIndex >= 0) {
+ double pval_fdo = getPvalue(bestCorr, true_df);
+ double pval_nom = getPvalue(bestCorr, sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0));
+ double pval_slope = getSlope(bestCorr, phenotype_sd[p], genotype_sd[bestIndex]);
+ fdo << " " << genotype_id[bestIndex];
+ fdo << " " << bestDistance;
+ fdo << " " << pval_nom;
+ fdo << " " << pval_slope;
+ fdo << " " << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
+ fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
+ } else fdo << " NA NA NA NA NA NA";
+ fdo << endl;
+
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/analysisPermutationInteraction.cpp b/src/analysisPermutationInteraction.cpp
new file mode 100644
index 0000000..2402300
--- /dev/null
+++ b/src/analysisPermutationInteraction.cpp
@@ -0,0 +1,126 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runPermutationInteraction(string fout, int nPermutations) {
+
+ //0. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Loop over genotypes
+ double nominal_correlation = 0.0;
+ int nominal_variant = -1;
+ int nominal_distance = -1;
+ vector < double > permuted_correlations = vector < double > (nPermutations, 0.0);
+ vector < vector < float > > permuted_phenotypes = vector < vector < float > > (nPermutations, vector < float > (sample_count, 0));
+
+ for (int perm = 0 ; perm < nPermutations ; perm ++) {
+ permuted_phenotypes[perm] = phenotype_orig[p];
+ random_shuffle(permuted_phenotypes[perm].begin(), permuted_phenotypes[perm].end());
+ }
+
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+
+ //RESIDUALIZER
+ covariate_engine->clearSoft();
+ covariate_engine->pushSoft(genotype_orig[targetGenotypes[g]]);
+
+ //INTERACTION TERM
+ vector < float > interaction_curr = genotype_orig[targetGenotypes[g]];
+ for (int i = 0 ; i < sample_count ; i++) interaction_curr[i] *= interaction_val[i];
+ covariate_engine->residualize(interaction_curr);
+ normalize(interaction_curr);
+
+ //NOMINAL STEP
+ vector < float > phenotype_curr = phenotype_orig[p];
+ covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ double corr = getCorrelation(interaction_curr, phenotype_curr);
+ if (abs(corr) > abs(nominal_correlation)) {
+ nominal_correlation = corr;
+ nominal_variant = targetGenotypes[g];
+ nominal_distance = targetDistances[g];
+ }
+
+ //PERMUTATION PASS
+ for (int perm = 0 ; perm < nPermutations ; perm ++) {
+ random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
+ phenotype_curr = permuted_phenotypes[perm];
+ covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ corr = getCorrelation(interaction_curr, phenotype_curr);
+ if (abs(corr) > abs(permuted_correlations[perm])) permuted_correlations[perm] = corr;
+ }
+ }
+
+ //1.5. Calculate effective DFs & Beta distribution parameters
+ vector < double > permuted_pvalues = permuted_correlations;
+ double true_df = sample_count - 2 - covariate_engine->nCovariates();
+ double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
+ fdo << phenotype_id[p] << " " << targetGenotypes.size();
+ if (targetGenotypes.size() > 0) {
+ //Estimate number of degrees of freedom
+ if (putils::variance(permuted_correlations, putils::mean(permuted_correlations)) != 0.0) {
+ learnDF(permuted_correlations, true_df);
+ //LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
+ }
+ //Compute mean and variance of p-values
+ for (int c = 0 ; c < permuted_correlations.size() ; c ++) permuted_pvalues[c] = getPvalue(permuted_correlations[c], true_df);
+ mean = putils::mean(permuted_pvalues);
+ for (int pv = 0 ; pv < permuted_pvalues.size() ; pv++) variance += (permuted_pvalues[pv] - mean) * (permuted_pvalues[pv] - mean);
+ variance /= (permuted_pvalues.size() - 1);
+ //Estimate shape1 & shape2
+ if (targetGenotypes.size() > 1 && mean != 1.0) {
+ beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
+ beta_shape2 = beta_shape1 * (1 / mean - 1);
+ if (targetGenotypes.size() > 10) mleBeta(permuted_pvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
+ }
+ LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
+ fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
+ } else fdo << " NA NA NA";
+
+ //1.6. Writing results
+ if (targetGenotypes.size() > 0 && nominal_variant >= 0) {
+ double pval_fdo = getPvalue(nominal_correlation, true_df);
+ double pval_nom = getPvalue(nominal_correlation, sample_count - 2 - covariate_engine->nCovariates());
+ fdo << " " << genotype_id[nominal_variant];
+ fdo << " " << nominal_distance;
+ fdo << " " << pval_nom;
+ fdo << " " << getPvalue(nominal_correlation, permuted_correlations);
+ fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
+ } else fdo << " NA NA NA NA NA";
+ fdo << endl;
+
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/analysisPermutationPerGroup.cpp b/src/analysisPermutationPerGroup.cpp
new file mode 100644
index 0000000..8e23b60
--- /dev/null
+++ b/src/analysisPermutationPerGroup.cpp
@@ -0,0 +1,160 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runPermutationPerGroup(string fout, vector < int > nPermutations) {
+
+ //0. Prepare genotypes
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ }
+ normalize(genotype_orig);
+
+ //1. Prepare phenotype groups
+ LOG.println("\nBuilding groups");
+ vector < vector < int > > PG;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (p == 0 || phenotype_grp [p-1] != phenotype_grp [p]) PG.push_back(vector < int > (1, p));
+ else PG.back().push_back(p);
+ }
+ LOG .println(" * Number of groups = " + sutils::int2str(PG.size()));
+
+ //2. Loop over groups
+ ofile fdo (fout);
+ for (int g = 0 ; g < PG.size() ; g ++) {
+
+ LOG.println("\nProcessing group [" + phenotype_grp[PG[g][0]] + "]");
+ LOG.println(" * Number of MP in group = " + sutils::int2str(PG[g].size()) + " / " + sutils::int2str(PG[g]));
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int l = 0 ; l < genotype_count ; l ++) {
+ int cisdistance = genotype_pos[l] - phenotype_start[PG[g][0]];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(l);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Copy original data
+ vector < vector < float > > phenotype_curr = vector < vector < float > > (phenotype_orig.begin() + PG[g][0], phenotype_orig.begin() + PG[g].back() + 1);
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+
+ //1.3. Nominal pass: scan cis-window & compute statistics
+ double bestCorr = 0.0;
+ int bestDistance = ___LI___, bestIndex = -1;
+ string bestExon = "NA";
+ for (int p = 0 ; p < phenotype_curr.size() ; p ++) {
+ for (int l = 0 ; l < targetGenotypes.size() ; l ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[l]], phenotype_curr[p]);
+ if (corr > 1) {
+ cerr << endl;
+ cerr << "************************************************************************************************" << endl;
+ cerr << g << "/" << PG.size() << " " << p << "/" << phenotype_curr.size() << " " << l << "/" << targetGenotypes.size() << endl;
+ cerr << "-----Genotypes--------" << endl;
+ cerr << sutils::double2str(genotype_orig[targetGenotypes[l]]) << endl;
+ cerr << "-----Phenotypes-------" << endl;
+ cerr << sutils::double2str(phenotype_curr[p]) << endl;
+ }
+ if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[l]) < bestDistance)) {
+ bestCorr = corr;
+ bestDistance = targetDistances[l];
+ bestIndex = targetGenotypes[l];
+ bestExon = phenotype_id[PG[g][p]];
+ }
+ }
+ }
+ if (targetGenotypes.size() > 0) {
+
+ LOG.println(" * Best correlation = " + sutils::double2str(bestCorr, 4));
+ }
+
+ //1.4. Permutation pass:
+ bool done = false;
+ int countPermutations = 0, nBetterCorrelation = 0;
+ vector < double > permCorr;
+ do {
+ double bestCperm = 0.0;
+ copy(phenotype_orig.begin() + PG[g][0], phenotype_orig.begin() + PG[g].back() + 1, phenotype_curr.begin());
+ putils::random_shuffle(phenotype_curr);
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ for (int p = 0 ; p < phenotype_curr.size() ; p ++) {
+ for (int l = 0 ; l < targetGenotypes.size() ; l ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[l]], phenotype_curr[p]);
+ if (abs(corr) > abs(bestCperm)) bestCperm = corr;
+ }
+ }
+ if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
+ permCorr.push_back(bestCperm);
+ countPermutations++;
+
+ if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
+ if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
+ if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
+ } while (!done);
+ if (targetGenotypes.size() > 0) LOG.println(" * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));
+
+ //1.5. Calculate effective DFs & Beta distribution parameters
+ vector < double > permPvalues;
+ double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
+ double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
+ fdo << bestExon << " " << targetGenotypes.size();
+ if (targetGenotypes.size() > 0) {
+ //Estimate number of degrees of freedom
+ if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
+ learnDF(permCorr, true_df);
+ LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
+ }
+ //Compute mean and variance of p-values
+ for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
+ mean /= permPvalues.size();
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
+ variance /= (permPvalues.size() - 1);
+ //Estimate shape1 & shape2
+ if (targetGenotypes.size() > 1 && mean != 1.0) {
+ beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
+ beta_shape2 = beta_shape1 * (1 / mean - 1);
+ if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
+ }
+ LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
+ fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
+ } else fdo << " NA NA NA";
+
+ //1.6. Writing results
+ if (targetGenotypes.size() > 0 && bestIndex >= 0) {
+ double pval_fdo = getPvalue(bestCorr, true_df);
+ double pval_nom = getPvalue(bestCorr, sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0));
+ fdo << " " << genotype_id[bestIndex];
+ fdo << " " << bestDistance;
+ fdo << " " << pval_nom;
+ fdo << " " << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
+ fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
+ fdo << " " << phenotype_grp[PG[g][0]];
+ fdo << " " << PG[g].size();
+ } else fdo << " NA NA NA NA NA NA NA";
+ fdo << endl;
+
+ LOG.println(" * Progress = " + sutils::double2str((g+1) * 100.0 / PG.size(), 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/analysisPermutationSequence.cpp b/src/analysisPermutationSequence.cpp
new file mode 100644
index 0000000..e122827
--- /dev/null
+++ b/src/analysisPermutationSequence.cpp
@@ -0,0 +1,135 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runPermutation(string fout, string fperm) {
+ string buffer;
+ vector < string > tokens;
+
+ //Read perm sequence
+ ifile fdp (fperm);
+ vector < vector < unsigned int > > P;
+ while(getline(fdp, buffer)) {
+ P.push_back(vector < unsigned int > ());
+ sutils::tokenize(buffer, tokens);
+ for (int t = 0 ; t < tokens.size() ; t++ ) P.back().push_back(atoi(tokens[t].c_str()) - 1);
+ }
+
+ //0. Prepare genotypes
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ }
+ normalize(genotype_orig);
+
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Copy original data
+ vector < float > phenotype_curr = phenotype_orig[p];
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+
+ //1.3. Nominal pass: scan cis-window & compute statistics
+ double bestCorr = 0.0;
+ int bestDistance = ___LI___, bestIndex = -1;
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < bestDistance)) {
+ bestCorr = corr;
+ bestDistance = targetDistances[g];
+ bestIndex = targetGenotypes[g];
+ }
+ }
+ if (targetGenotypes.size() > 0) LOG.println(" * Best correlation = " + sutils::double2str(bestCorr, 4));
+
+ //1.4. Permutation pass:
+ int nBetterCorrelation = 0;
+ vector < double > permCorr;
+ for (int perm = 0 ; perm < P.size() ; perm ++) {
+ double bestCperm = 0.0;
+ autils::reorder(phenotype_orig[p], P[perm]);
+ phenotype_curr = phenotype_orig[p];
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCperm)) bestCperm = corr;
+ }
+ if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
+ permCorr.push_back(bestCperm);
+ }
+ if (targetGenotypes.size() > 0) LOG.println(" * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(P.size()));
+
+ //1.5. Calculate effective DFs & Beta distribution parameters
+ vector < double > permPvalues;
+ double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
+ double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
+ fdo << phenotype_id[p] << " " << targetGenotypes.size();
+ if (targetGenotypes.size() > 0) {
+ //Estimate number of degrees of freedom
+ if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
+ learnDF(permCorr, true_df);
+ //LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
+ }
+ //Compute mean and variance of p-values
+ for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
+ mean /= permPvalues.size();
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
+ variance /= (permPvalues.size() - 1);
+ //Estimate shape1 & shape2
+ if (targetGenotypes.size() > 1 && mean != 1.0) {
+ beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
+ beta_shape2 = beta_shape1 * (1 / mean - 1);
+ if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
+ }
+ LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
+ fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
+ } else fdo << " NA NA NA";
+
+ //1.6. Writing results
+ if (targetGenotypes.size() > 0 && bestIndex >= 0) {
+ double pval_fdo = getPvalue(bestCorr, true_df);
+ double pval_nom = getPvalue(bestCorr, sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0));
+ fdo << " " << genotype_id[bestIndex];
+ fdo << " " << bestDistance;
+ fdo << " " << pval_nom;
+ fdo << " " << (nBetterCorrelation + 1) * 1.0 / (P.size() + 1.0);
+ fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
+ } else fdo << " NA NA NA NA NA";
+ fdo << endl;
+
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
diff --git a/src/commands.cpp b/src/commands.cpp
new file mode 100644
index 0000000..af55aa8
--- /dev/null
+++ b/src/commands.cpp
@@ -0,0 +1,52 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+void data::writeCommands(string fout, int nChunks, int argc, char ** argv) {
+ LOG.println("\nGenerate " + sutils::int2str(nChunks) + " commands in [" + fout + "]");
+
+ vector < string > args;
+ for (int a = 0 ; a < argc ; a ++) args.push_back(argv[a]);
+
+ //map commands argument
+ int idx_commands_arg = -1;
+ for (int a = 0 ; a < argc ; a ++) if (args[a] == "--commands") idx_commands_arg = a;
+ args.erase (args.begin() + idx_commands_arg, args.begin() + idx_commands_arg + 3);
+
+ //map output argument
+ int idx_output_arg = -1;
+ string str_output_arg = "";
+ for (int a = 0 ; a < argc ; a ++) {
+ if (args[a] == "--out") {
+ idx_output_arg = a;
+ str_output_arg = string(args[a+1]);
+ }
+ }
+
+ //write commands [loop over regions]
+ ofile fd(fout);
+ for (int c = 0 ; c < nChunks ; c ++) {
+ setPhenotypeRegion(c);
+ string region = getPhenotypeRegion(c);
+
+ for (int a = 0 ; a < args.size() ; a ++ )
+ if (a == idx_output_arg + 1) fd << " " << str_output_arg + "." + region;
+ else fd << " " << args[a];
+
+ fd << " --region " << region << endl;
+ }
+}
diff --git a/src/data.h b/src/data.h
new file mode 100644
index 0000000..7a6e939
--- /dev/null
+++ b/src/data.h
@@ -0,0 +1,210 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef _DATA_H
+#define _DATA_H
+
+#define ___NA___ (0.0/0.0)
+#define ___LI___ 1000000000
+
+#define MATHLIB_STANDALONE
+
+#include "utils/utils.h"
+#include "region.h"
+#include "residualizer.h"
+#include <Rmath.h>
+
+class data {
+public:
+ //INCLUDE/EXCLUDE LISTS
+ set < string > sample_inclusion;
+ set < string > sample_exclusion;
+ set < string > genotype_inclusion;
+ set < string > genotype_exclusion;
+ set < string > phenotype_inclusion;
+ set < string > phenotype_exclusion;
+ set < string > covariate_inclusion;
+ set < string > covariate_exclusion;
+
+ //REGIONS
+ region regionPhenotype;
+ region regionGenotype;
+ float cis_window;
+
+ //SAMPLES
+ int sample_count; //sample number
+ vector < string > sample_id; //sample IDs
+
+ //GENOTYPES
+ int genotype_count; //variant site number
+ vector < vector < float > > genotype_orig; //original genotype dosages
+ vector < vector < float > > genotype_curr; //current genotype dosages
+ vector < string > genotype_chr; //variant site chromosome
+ vector < string > genotype_id; //variant site IDs
+ vector < int > genotype_pos; //variant site positions
+
+ //PHENOTYPES
+ int phenotype_count; //phenotype number
+ vector < vector < float > > phenotype_orig; //original phenotype values
+ vector < string > phenotype_id; //phenotype ids
+ vector < string > phenotype_grp; //phenotype groups
+ vector < string > phenotype_chr; //phenotype chromosomes
+ vector < int > phenotype_start; //phenotype start positions
+ vector < int > phenotype_end; //phenotype end positions
+ vector < vector < int > > phenotype_cluster; //phenotype cluster (parallel jobs)
+ vector < double > phenotype_threshold; //phenotype genome-wide significance thresholds
+
+ //COVARIATES
+ int covariate_count; //covariate number
+ vector < vector < string > > covariate_val; //covariate values
+ vector < string > covariate_id; //covariate ids
+ residualizer * covariate_engine;
+
+ //INTERACTION
+ vector < float > interaction_val; //interaction values
+
+ //CONSTRUCTOR / DESTRUCTOR
+ data();
+ void clear();
+
+ //READ EXCLUSION/INCLUSION LISTS (IO/readInclusionsExcusions.cpp)
+ void readSamplesToExclude(string);
+ void readSamplesToInclude(string);
+ void readGenotypesToExclude(string);
+ void readGenotypesToInclude(string);
+ void readPhenotypesToExclude(string);
+ void readPhenotypesToInclude(string);
+ void readCovariatesToExclude(string);
+ void readCovariatesToInclude(string);
+ bool checkSample(string &, bool checkDuplicates = true);
+ bool checkGenotype(string &);
+ bool checkPhenotype(string &, bool include = true);
+ bool checkCovariate(string &);
+
+ //REGIONS
+ bool setPhenotypeRegion(string);
+ void setPhenotypeRegion(int);
+ string getPhenotypeRegion(int);
+ void deduceGenotypeRegion(int);
+
+ //READ DATA
+ void readGenotypesVCF(string);
+ void readPhenotypes(string);
+ void scanPhenotypes(string);
+ void readCovariates(string);
+ void readThresholds(string);
+ void readGroups(string);
+ void readInteractions(string);
+
+ //GENERAL MANAGMENT
+ void clusterizePhenotypes(int);
+ void imputeGenotypes();
+ void imputePhenotypes();
+ void normalTranformPhenotypes();
+ void initResidualizer();
+ void rankTranformPhenotypes();
+ void rankTranformGenotypes();
+ void normalize(vector < float > &);
+ void normalize(vector < vector < float > > &);
+ void correct(vector < float > &, vector < float > &);
+ int mleBeta(vector < double > & pval, double & beta_shape1, double & beta_shape2);
+ int learnDF(vector < double > & corr, double &);
+
+ //COMPUTATION METHODS [ALL INLINES FOR SPEED]
+ double getCorrelation(vector < float > &, vector < float > &);
+ double getCorrelation(vector < float > &, vector < float > &, vector < int > &);
+ double getPvalue(double, double);
+ double getPvalue(double, vector < double > &);
+ double getSlope(double, double, double);
+
+ //ANALYSIS
+ void runNominal(string, double);
+ void runPermutation(string, vector < int >);
+ void runPermutation(string, string);
+ void runPermutationPerGroup(string, vector < int >);
+ void runMapping(string, bool);
+ void runPermutationInteraction(string, int);
+
+ //COMMANDS
+ void writeCommands(string, int, int, char **);
+};
+
+
+//***************************************************************//
+//******************** INLINE FUNCTIONS *************************//
+//***************************************************************//
+
+#ifdef _FAST_CORRELATION
+
+/*
+ * This implementation of inner_product is optimized for 64 bytes cache lines.
+ */
+inline double data::getCorrelation(vector < float > & vec1, vector < float > & vec2) {
+ int i = 0;
+ int repeat = (sample_count / 4);
+ int left = (sample_count % 4);
+ double sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+ while (repeat --) {
+ sum0 += vec1[i] * vec2[i];
+ sum1 += vec1[i+1] * vec2[i+1];
+ sum2 += vec1[i+2] * vec2[i+2];
+ sum3 += vec1[i+3] * vec2[i+3];
+ i += 4;
+ }
+
+ switch (left) {
+ case 3: sum0 += vec1[i+2] * vec2[i+2];
+ case 2: sum0 += vec1[i+1] * vec2[i+1];
+ case 1: sum0 += vec1[i+0] * vec2[i+0];
+ case 0: ;
+ }
+
+ return sum0 + sum1 + sum2 + sum3;
+}
+
+#else
+
+inline double data::getCorrelation(vector < float > & vec1, vector < float > & vec2) {
+ double corr = 0.0;
+ for (int s = 0 ; s < sample_count ; s ++) corr += vec1[s] * vec2[s];
+ return corr;
+}
+
+#endif
+
+inline double data::getCorrelation(vector < float > & vec1, vector < float > & vec2, vector < int > & indexes) {
+ double corr = 0.0;
+ for (int s = 0 ; s < sample_count ; s ++) corr += vec1[indexes[s]] * vec2[indexes[s]];
+ return corr;
+}
+
+inline double data::getPvalue(double corr, double df) {
+ return pf(df * corr * corr / (1 - corr * corr), 1, df,0,0);
+}
+
+inline double data::getPvalue(double ncorr, vector < double > & pcorr) {
+ unsigned int n_hits = 0;
+ for (int p = 0 ; p < pcorr.size() ; p++) if (abs(pcorr[p]) >= abs(ncorr)) n_hits++;
+ return ((n_hits + 1) * 1.0 / (pcorr.size() + 1.0));
+}
+
+inline double data::getSlope(double nominal_correlation, double psd, double gsd) {
+ if (gsd < 1e-16 || psd < 1e-16) return 0;
+ else return nominal_correlation * psd / gsd;
+}
+
+#endif
diff --git a/src/df.cpp b/src/df.cpp
new file mode 100644
index 0000000..da2b678
--- /dev/null
+++ b/src/df.cpp
@@ -0,0 +1,100 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf_psi.h>
+#include <gsl/gsl_sf_gamma.h>
+
+struct data_to_function {
+ data * D;
+ int n;
+ double * C;
+
+ data_to_function (data * _D, int _n, double * _C) {
+ D = _D;
+ n = _n;
+ C = _C;
+ }
+
+};
+
+double degreeOfFreedom(const gsl_vector *v, void *params) {
+ data_to_function * d = (data_to_function *) params;
+ vector < double > pval = vector < double >(d->n, 0.0);
+ double mean = 0.0;
+ for (int c = 0 ; c < d->n ; c++) {
+ pval[c] = d->D->getPvalue(d->C[c], gsl_vector_get(v, 0));
+ mean += pval[c];
+ }
+ mean /= pval.size();
+ double variance = 0.0;
+ for (int c = 0 ; c < pval.size() ; c++) variance += (pval[c] - mean) * (pval[c] - mean);
+ variance /= (pval.size() - 1);
+
+ double shape2 = abs((mean * (mean * (1 - mean ) / variance - 1)) - 1.0);
+ //cout << "O = " << mean << " " << shape2 << endl;
+ return shape2;
+}
+
+int data::learnDF(vector < double > & corr, double & df) {
+
+ //Set starting point to moment matching estimates
+ gsl_vector * x = gsl_vector_alloc (1);
+ gsl_vector_set (x, 0, df);
+
+ //Set initial step sizes to shape1 and shape2 scales
+ gsl_vector * ss = gsl_vector_alloc (1);
+ gsl_vector_set (ss, 0, df * 0.1);
+
+ //Initialize method and iterate
+ data_to_function * par = new data_to_function (this, corr.size(), &corr[0]);
+
+ gsl_multimin_function minex_func;
+ minex_func.n = 1;
+ minex_func.f = degreeOfFreedom;
+ minex_func.params = (void*)par;
+
+ //Initialize optimization machinery
+ const gsl_multimin_fminimizer_type * T = gsl_multimin_fminimizer_nmsimplex2;
+ gsl_multimin_fminimizer * s = gsl_multimin_fminimizer_alloc (T, 1);
+ gsl_multimin_fminimizer_set (s, &minex_func, x, ss);
+
+ //Optimization iteration
+ //cout << "\n ========================" << endl;
+ size_t iter = 0;
+ int status;
+ double size;
+ do {
+ iter++;
+ status = gsl_multimin_fminimizer_iterate(s);
+ if (status) break;
+ size = gsl_multimin_fminimizer_size (s);
+ status = gsl_multimin_test_size (size, 0.01);
+ //printf ("%d %10.3e f() = %7.10f size = %.10f\n", iter, gsl_vector_get (s->x, 0), s->fval, size);
+
+ } while (status == GSL_CONTINUE && iter < 20);
+
+ //Output new beta shape values
+ df = gsl_vector_get (s->x, 0);
+
+ //Free allocated memory
+ gsl_vector_free(x);
+ gsl_vector_free(ss);
+ gsl_multimin_fminimizer_free (s);
+ return (status == GSL_SUCCESS);
+}
diff --git a/src/fastQTL.cpp b/src/fastQTL.cpp
new file mode 100644
index 0000000..bebd436
--- /dev/null
+++ b/src/fastQTL.cpp
@@ -0,0 +1,290 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+int main(int argc, char ** argv) {
+ data D;
+
+ //-------------------------
+ // 1. DECLARE ALL OPTIONS
+ //-------------------------
+
+ bpo::options_description opt_basic ("\33[33mBasic options\33[0m");
+ opt_basic.add_options()
+ ("help", "Produces this help")
+ ("silent", "Silent mode on terminal")
+ ("seed", bpo::value< int >()->default_value(time(NULL)), "Random number seed. Useful to replicate runs.");
+
+ bpo::options_description opt_files ("\33[33mInput/Output files\33[0m");
+ opt_files.add_options()
+ ("log,L", bpo::value< string >()->default_value("fastQTL_date_time_UUID.log"), "Screen output is copied in this file.")
+ ("vcf,V", bpo::value< string >(), "Genotypes in VCF format.")
+ ("bed,B", bpo::value< string >(), "Phenotypes in BED format.")
+ ("cov,C", bpo::value< string >(), "Covariates in TXT format.")
+ ("grp,G", bpo::value< string >(), "Phenotype groups in TXT format.")
+ ("out,O", bpo::value< string >(), "Output file.");
+
+ bpo::options_description opt_exclusion ("\33[33mExclusion/Inclusion files\33[0m");
+ opt_exclusion.add_options()
+ ("exclude-samples", bpo::value< string >(), "List of samples to exclude.")
+ ("include-samples", bpo::value< string >(), "List of samples to include.")
+ ("exclude-sites", bpo::value< string >(), "List of sites to exclude.")
+ ("include-sites", bpo::value< string >(), "List of sites to include.")
+ ("exclude-phenotypes", bpo::value< string >(), "List of phenotypes to exclude.")
+ ("include-phenotypes", bpo::value< string >(), "List of phenotypes to include.")
+ ("exclude-covariates", bpo::value< string >(), "List of covariates to exclude.")
+ ("include-covariates", bpo::value< string >(), "List of covariates to include.");
+
+ bpo::options_description opt_parameters ("\33[33mParameters\33[0m");
+ opt_parameters.add_options()
+ ("normal", "Normal transform the phenotypes.")
+ ("window,W", bpo::value< double >()->default_value(1e6), "Cis-window size.")
+ ("threshold, T", bpo::value< double >()->default_value(1.0), "P-value threshold used in nominal pass of association");
+
+ bpo::options_description opt_modes ("\33[33mModes\33[0m");
+ opt_modes.add_options()
+ ("permute,P", bpo::value< vector < int > >()->multitoken(), "Permutation pass to calculate corrected p-values for molecular phenotypes.")
+ ("psequence", bpo::value< string >(), "Permutation sequence.")
+ ("map", bpo::value< string >(), "Map best QTL candidates per molecular phenotype.")
+ ("map-full", "Scan full cis-window to discover independent signals.")
+ ("interaction", bpo::value< string >(), "Test for interactions with variable specified in file.");
+
+ bpo::options_description opt_parallel ("\33[33mParallelization\33[0m");
+ opt_parallel.add_options()
+ ("chunk,K", bpo::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
+ ("commands", bpo::value< vector < string > >()->multitoken(), "Generates all commands")
+ ("region,R", bpo::value< string >(), "Region of interest.");
+
+ bpo::options_description descriptions;
+ descriptions.add(opt_basic).add(opt_files).add(opt_exclusion).add(opt_parameters).add(opt_modes).add(opt_parallel);
+
+ //-------------------
+ // 2. PARSE OPTIONS
+ //-------------------
+ bpo::variables_map options;
+ try {
+ bpo::store(bpo::command_line_parser(argc, argv).options(descriptions).run(), options);
+ bpo::notify(options);
+ } catch ( const boost::program_options::error& e ) {
+ cerr << "Error parsing command line :" << string(e.what()) << endl;
+ exit(0);
+ }
+
+ //-----------------------
+ // 3. PRINT HEADER/HELP
+ //-----------------------
+ if (! options.count("silent")) {
+ cout << endl;
+ cout << "\33[33mF\33[0mast \33[33mQTL\33[0m" << endl;
+ cout << " * Authors : Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS" << endl;
+ cout << " * Contact : olivier.delaneau at gmail.com" << endl;
+ cout << " * Webpage : http://fastqtl.sourceforge.net/" << endl;
+ cout << " * Version : v2.0" << endl;
+ if (options.count("help")) { cout << descriptions<< endl; exit(1); }
+ }
+
+ //--------------
+ // 4. LOG FILE
+ //--------------
+ struct timeval start_time, stop_time;
+ gettimeofday(&start_time, 0);
+ START_DATE = time(0);
+ //localtime(&START_DATE);
+ //string logfile = "fastQTL_" + sutils::date2str(&START_DATE, "%d%m%Y_%Hh%Mm%Ss") + "_" + putils::getRandomID() + ".log";
+ if (!options["log"].defaulted()) {
+ if (!LOG.open(options["log"].as < string > ())) {
+ cerr << "Impossible to open log file[" << options["log"].as < string > () << "] check writing permissions!" << endl;
+ exit(1);
+ }
+ } else LOG.muteL();
+ if (options.count("silent")) LOG.muteC();
+
+ //------------------------
+ // 5. OPTIONS COMBINATIONS
+ //------------------------
+ if (!options.count("vcf")) LOG.error("Genotype data needs to be specified with --vcf [file.vcf]");
+ if (!options.count("bed")) LOG.error("Phenotype data needs to be specified with --bed [file.bed]");
+ if (!options.count("out")) LOG.error("Output needs to be specified with --out [file.out]");
+
+ int nParallel = options.count("chunk") + options.count("commands") + options.count("region");
+ if (nParallel != 1) LOG.error("Please, specify one of these options [--region, --chunk, --commands]");
+
+ int nMode = options.count("permute") + options.count("map");
+ if (nMode > 1) LOG.error("Please, specify only one of these options [--permute, --map]");
+
+ //---------------
+ // 6. CHECK FILES
+ //---------------
+ if (!futils::isFile(options["vcf"].as < string > ())) LOG.error(options["vcf"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (!futils::isFile(options["bed"].as < string > ())) LOG.error(options["bed"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("cov") && !futils::isFile(options["cov"].as < string > ())) LOG.error(options["cov"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("interaction") && !futils::isFile(options["interaction"].as < string > ())) LOG.error(options["interaction"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("grp") && !futils::isFile(options["grp"].as < string > ())) LOG.error(options["grp"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("map") && !futils::isFile(options["map"].as < string > ())) LOG.error(options["map"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (!futils::createFile(options["out"].as < string > ())) LOG.error(options["out"].as < string > () + " is impossible to create, check writing permissions");
+
+ //-----------------------------------
+ // 6. CHECK INCLUSION/EXCLUSION FILES
+ //-----------------------------------
+ if (options.count("exclude-samples") && !futils::isFile(options["exclude-samples"].as < string > ())) LOG.error(options["exclude-samples"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("include-samples") && !futils::isFile(options["include-samples"].as < string > ())) LOG.error(options["include-samples"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("exclude-sites") && !futils::isFile(options["exclude-sites"].as < string > ())) LOG.error(options["exclude-sites"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("include-sites") && !futils::isFile(options["include-sites"].as < string > ())) LOG.error(options["include-sites"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("exclude-phenotypes") && !futils::isFile(options["exclude-phenotypes"].as < string > ())) LOG.error(options["exclude-phenotypes"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("include-phenotypes") && !futils::isFile(options["include-phenotypes"].as < string > ())) LOG.error(options["include-phenotypes"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("exclude-covariates") && !futils::isFile(options["exclude-covariates"].as < string > ())) LOG.error(options["exclude-covariates"].as < string > () + " is impossible to open, check file existence or reading permissions");
+ if (options.count("include-covariates") && !futils::isFile(options["include-covariates"].as < string > ())) LOG.error(options["include-covariates"].as < string > () + " is impossible to open, check file existence or reading permissions");
+
+ //----------------------------
+ // 7. CHECK METHODS/PARAMETERS
+ //----------------------------
+ if (options.count("interaction")) {
+ if (options.count("permute")) {
+ LOG.println("\nPerform permutation based interaction analysis (used to calculate corrected p-values for MPs)");
+ vector < int > nPerm = options["permute"].as < vector < int > > ();
+ if (nPerm.size() != 1) LOG.error("Interactions only work with a fixed number of permutations!");
+ else {
+ if (nPerm[0] < 50) LOG.warning("Permutation number seems to be low, check parameters");
+ LOG.println(" * Perform " + sutils::int2str(nPerm[0]) + " permutations");
+ }
+ } else LOG.error("Interactions only work with permutations!");
+ LOG.println(" * Test interaction with term from [" + options["interaction"].as < string > () + "]");
+ } else if (options.count("permute")) {
+ LOG.println("\nPerform permutation based analysis (used to calculate corrected p-values for MPs)");
+ vector < int > nPerm = options["permute"].as < vector < int > > ();
+ if (nPerm.size() > 3 || nPerm.size() < 1) LOG.error ("Option --permute takes 1, 2 or 3 arguments");
+ if (nPerm.size() == 1) {
+ if (nPerm[0] <= 0) LOG.error("Permutation number needs to be positive integer");
+ if (nPerm[0] < 50) LOG.warning("Permutation number seems to be low, check parameters");
+ LOG.println(" * Perform " + sutils::int2str(nPerm[0]) + " permutations");
+ } else if (nPerm.size() == 2) {
+ if (nPerm[0] <= 0 || nPerm[1] <= 0) LOG.error("Permutation number needs to be positive");
+ if (nPerm[1] <= nPerm[0]) LOG.error("For adaptive permutation scheme, arg1 needs to be smaller than arg2!");
+ LOG.println(" * Perform between " + sutils::int2str(nPerm[0]) + " and " + sutils::int2str(nPerm[1]) + " permutations");
+ } else {
+ if (nPerm[0] <= 0 || nPerm[1] <= 0 || nPerm[2] <= 0) LOG.error("Permutation number needs to be positive");
+ if (nPerm[2] <= nPerm[0]) LOG.error("For adaptive permutation scheme, arg1 needs to be smaller than arg3!");
+ if (nPerm[0] <= nPerm[1]) LOG.error("For adaptive permutation scheme, arg2 needs to be smaller than arg1!");
+ LOG.println(" * Perform between " + sutils::int2str(nPerm[0]) + " and " + sutils::int2str(nPerm[2]) + " permutations and stop when " + sutils::int2str(nPerm[1]) + " best associations are found");
+ }
+ if (options.count("grp")) LOG.println(" * Using MP groups from [" + options["grp"].as < string > () + "]");
+ } else if (options.count("map")) {
+ LOG.println("\nPerform conditional based analysis (used to map significant QTLs for MPs");
+ LOG.println(" * Using per MP p-value threshold from [" + options["map"].as < string > () + "]");
+ if (options.count("map-full")) LOG.println(" * Scanning all variants in cis and not only nominally significant ones");
+ } else {
+ LOG.println("\nPerform nominal analysis (used to get raw p-values of association)");
+ double threshold = options["threshold"].as < double > ();
+ if (threshold <= 0.0 || threshold > 1.0) LOG.error("Incorrect --threshold value : 0 < X <= 1");
+ LOG.println(" * Using p-value threshold = " + sutils::double2str(threshold, 10));
+ }
+
+ if (options["seed"].as < int > () < 0) LOG.error("Random number generator needs a positive seed value");
+ else srand(options["seed"].as < int > ());
+ LOG.println(" * Random number generator is seeded with " + sutils::int2str(options["seed"].as < int > ()));
+
+ if (options["window"].as < double > () <= 0) LOG.error ("Incorrect value for option --window (null or negative value)");
+ if (options["window"].as < double > () > 1e9) LOG.error ("Cis-window cannot be larger than 1e9bp");
+ LOG.println(" * Considering variants within " + sutils::double2str(options["window"].as < double > ()) + " bp of the MPs");
+ D.cis_window = options["window"].as < double > ();
+
+ if (options.count("chunk")) {
+ vector < int > nChunk = options["chunk"].as < vector < int > > ();
+ if (nChunk.size() != 2) LOG.error ("--chunk needs 2 integer arguments");
+ if (nChunk[0] > nChunk[1]) LOG.error ("arg1 for --chunk needs to be smaller or equal to arg2");
+ LOG.println (" * Chunk processed " + sutils::int2str(nChunk[0]) + " / " + sutils::int2str(nChunk[1]));
+ } else if (options.count("commands")) {
+ vector < string > nCommands = options["commands"].as < vector < string > > ();
+ if (nCommands.size() != 2) LOG.error ("--commands needs 2 arguments");
+ LOG.println (" * " + nCommands[0] + " commands output in [" + nCommands[1] +"]");
+ } else LOG.println (" * Focus on region [" + options["region"].as < string > () +"]");
+
+ //--------------------------------
+ // 7. READ EXCLUDE / INCLUDE FILES
+ //--------------------------------
+ if (options.count("exclude-samples")) D.readSamplesToExclude(options["exclude-samples"].as < string > ());
+ if (options.count("include-samples")) D.readSamplesToInclude(options["include-samples"].as < string > ());
+ if (options.count("exclude-sites")) D.readGenotypesToExclude(options["exclude-sites"].as < string > ());
+ if (options.count("include-sites")) D.readGenotypesToInclude(options["include-sites"].as < string > ());
+ if (options.count("exclude-phenotypes")) D.readPhenotypesToExclude(options["exclude-phenotypes"].as < string > ());
+ if (options.count("include-phenotypes")) D.readPhenotypesToInclude(options["include-phenotypes"].as < string > ());
+ if (options.count("exclude-covariates")) D.readCovariatesToExclude(options["exclude-covariates"].as < string > ());
+ if (options.count("include-covariates")) D.readCovariatesToInclude(options["include-covariates"].as < string > ());
+
+ if (options.count("commands")) {
+ //---------------------
+ // 8. GENERATE COMMANDS
+ //---------------------
+ int nChunks = atoi(options["commands"].as < vector < string > > ()[0].c_str());
+ D.scanPhenotypes(options["bed"].as < string > ());
+ D.clusterizePhenotypes(nChunks);
+ D.writeCommands(options["commands"].as < vector < string > > ()[1], nChunks, argc, argv);
+ } else {
+ //--------------
+ // 9. SET REGION
+ //--------------
+ if (options.count("chunk")) {
+ D.scanPhenotypes(options["bed"].as < string > ());
+ D.clusterizePhenotypes(options["chunk"].as < vector < int > > ()[1]);
+ D.setPhenotypeRegion(options["chunk"].as < vector < int > > ()[0] - 1);
+ D.clear();
+ } else if (!D.setPhenotypeRegion(options["region"].as < string > ())) LOG.error("Impossible to interpret region [" + options["region"].as < string > () + "]");
+ D.deduceGenotypeRegion(options["window"].as < double > ());
+
+ //---------------
+ // 10. READ FILES
+ //---------------
+ D.readPhenotypes(options["bed"].as < string > ());
+ D.readGenotypesVCF(options["vcf"].as < string > ());
+ if (options.count("cov")) D.readCovariates(options["cov"].as < string > ());
+ if (options.count("map")) D.readThresholds(options["map"].as < string > ());
+ if (options.count("grp")) D.readGroups(options["grp"].as < string > ());
+ if (options.count("interaction")) D.readInteractions(options["interaction"].as < string > ());
+
+ //------------------------
+ // 11. INITIALIZE ANALYSIS
+ //------------------------
+ D.imputeGenotypes();
+ D.imputePhenotypes();
+ if (options.count("normal")) D.normalTranformPhenotypes();
+ D.initResidualizer();
+
+ //-----------------
+ // 12. RUN ANALYSIS
+ //-----------------
+ if (options.count("interaction"))
+ D.runPermutationInteraction(options["out"].as < string > (), options["permute"].as < vector < int > > ()[0]);
+ else if (options.count("permute") && options.count("grp"))
+ D.runPermutationPerGroup(options["out"].as < string > (), options["permute"].as < vector < int > > ());
+ else if (options.count("permute")) {
+ if (options.count("psequence")) D.runPermutation(options["out"].as < string > (), options["psequence"].as < string > ());
+ else D.runPermutation(options["out"].as < string > (), options["permute"].as < vector < int > > ());
+ } else if (options.count("map"))
+ D.runMapping(options["out"].as < string > (), options.count("map-full"));
+ else
+ D.runNominal(options["out"].as < string > (), options["threshold"].as < double > ());
+ }
+
+ //----------------
+ // 13. TERMINATION
+ //----------------
+ D.clear();
+ gettimeofday(&stop_time, 0);
+ int n_seconds = (int)floor(stop_time.tv_sec - start_time.tv_sec);
+ LOG.println("\nRunning time: " + sutils::int2str(n_seconds) + " seconds");
+ if (!options["log"].defaulted()) LOG.close();
+}
diff --git a/src/management.cpp b/src/management.cpp
new file mode 100644
index 0000000..151151b
--- /dev/null
+++ b/src/management.cpp
@@ -0,0 +1,252 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+#define _GLOBAL
+
+#include "data.h"
+#include "utils/ranker.h"
+
+
+data::data() {
+ sample_count = 0;
+ genotype_count = 0;
+ phenotype_count = 0;
+ covariate_count = 0;
+ covariate_engine = NULL;
+}
+
+void data::clear() {
+ sample_count = 0;
+ sample_id.clear();
+ genotype_count = 0;
+ genotype_orig.clear();
+ genotype_curr.clear();
+ genotype_chr.clear();
+ genotype_id.clear();
+ genotype_pos.clear();
+ phenotype_count = 0;
+ phenotype_orig.clear();
+ phenotype_id.clear();
+ phenotype_chr.clear();
+ phenotype_start.clear();
+ phenotype_end.clear();
+ covariate_count = 0;
+ covariate_val.clear();
+ covariate_id.clear();
+}
+
+bool data::checkSample(string & id, bool checkDuplicate) {
+ bool included = ((sample_inclusion.size() == 0)?true:sample_inclusion.count(id));
+ bool excluded = ((sample_exclusion.size() == 0)?false:sample_exclusion.count(id));
+ if (!included || excluded) return false;
+ if (checkDuplicate) for (int s = 0 ; s < sample_id.size() ; s++) if (id == sample_id[s]) LOG.error("Duplicate sample id [" + id +"]");
+ return true;
+}
+
+bool data::checkGenotype(string & id) {
+ bool included = ((genotype_inclusion.size() == 0)?true:genotype_inclusion.count(id));
+ bool excluded = ((genotype_exclusion.size() == 0)?false:genotype_exclusion.count(id));
+ if (!included || excluded) return false;
+ //for (int g = 0 ; g < genotype_id.size() ; g++) if (id == genotype_id[g]) LOG.error("Duplicate variant site id [" + id + "]");
+ return true;
+}
+
+bool data::checkPhenotype(string & id, bool include) {
+ if (include) {
+ bool included = ((phenotype_inclusion.size() == 0)?true:phenotype_inclusion.count(id));
+ bool excluded = ((phenotype_exclusion.size() == 0)?false:phenotype_exclusion.count(id));
+ if (!included || excluded) return false;
+ }
+ for (int p = 0 ; p < phenotype_id.size() ; p++) if (id == phenotype_id[p]) LOG.error("Duplicate phenotype id [" + id + "]");
+ return true;
+}
+
+bool data::checkCovariate(string & id) {
+ bool included = ((covariate_inclusion.size() == 0)?true:covariate_inclusion.count(id));
+ bool excluded = ((covariate_exclusion.size() == 0)?false:covariate_exclusion.count(id));
+ if (!included || excluded) return false;
+ for (int c = 0 ; c < covariate_id.size() ; c++) if (id == covariate_id[c]) LOG.error("Duplicate covariate id [" + id + "]");
+ return true;
+}
+
+void data::clusterizePhenotypes(int K) {
+ //check K
+ if (K < 1) LOG.error("Number of chunks needs to be > 0");
+ if (K > phenotype_count) LOG.error("Number of chunks (" + sutils::int2str(K) + ") is greater than the number of phenotypes (" + sutils::int2str(phenotype_count) + ")");
+
+ //initialise (1 cluster = 1 chromosome)
+ phenotype_cluster = vector < vector < int > > (1, vector < int > (1, 0));
+ for (int p = 1 ; p < phenotype_count ; p ++) {
+ if (phenotype_chr[p] != phenotype_chr[p-1]) phenotype_cluster.push_back(vector < int > (1, p));
+ else phenotype_cluster.back().push_back(p);
+ }
+
+ //iterate (split cluster in the middle until K clusters are built)
+ if (phenotype_cluster.size() > K) LOG.error("Number of chunks needs to be > #chr");
+ if (phenotype_cluster.size() < K) {
+ int max_idx, max_val, max_mid;
+ do {
+ max_idx = -1;
+ max_val = +1;
+ max_mid = -1;
+ for (int k = 0 ; k < phenotype_cluster.size() ; k ++) {
+ if (phenotype_cluster[k].size() > max_val) {
+ max_val = phenotype_cluster[k].size();
+ max_idx = k;
+ }
+ }
+ if (max_idx >= 0) {
+ max_mid = max_val / 2;
+ while (max_mid > 1 && phenotype_end[phenotype_cluster[max_idx][max_mid-1]] >= phenotype_start[phenotype_cluster[max_idx][max_mid]]) max_mid --;
+ phenotype_cluster.push_back(vector < int > (phenotype_cluster[max_idx].begin() + max_mid, phenotype_cluster[max_idx].end()));
+ phenotype_cluster[max_idx].erase(phenotype_cluster[max_idx].begin() + max_mid, phenotype_cluster[max_idx].end());
+ }
+ } while (phenotype_cluster.size() < K && max_idx >= 0);
+ }
+}
+
+bool data::setPhenotypeRegion(string reg) {
+ return regionPhenotype.set(reg);
+}
+
+void data::setPhenotypeRegion(int k) {
+ regionPhenotype.chr = phenotype_chr[phenotype_cluster[k][0]];
+ regionPhenotype.start = phenotype_start[phenotype_cluster[k][0]];
+ regionPhenotype.end = phenotype_end[phenotype_cluster[k].back()];
+}
+
+string data::getPhenotypeRegion(int k) {
+ return string (phenotype_chr[phenotype_cluster[k][0]] + ":" + sutils::int2str(phenotype_start[phenotype_cluster[k][0]]) + "-" + sutils::int2str(phenotype_end[phenotype_cluster[k].back()]));
+}
+
+void data::deduceGenotypeRegion(int W) {
+ regionGenotype.chr = regionPhenotype.chr;
+ regionGenotype.start = regionPhenotype.start - W;
+ if (regionGenotype.start < 0) regionGenotype.start = 0;
+ regionGenotype.end = regionPhenotype.end + W;
+}
+
+void data::imputeGenotypes() {
+ LOG.println("\nImputing missing genotypes");
+ for (int g = 0; g < genotype_count ; g ++) {
+ double mean = 0.0;
+ int c_mean = 0;
+ for (int s = 0; s < sample_count ; s ++) {
+ if (genotype_orig[g][s] >= 0) {
+ mean += genotype_orig[g][s];
+ c_mean ++;
+ }
+ }
+ mean /= c_mean;
+ for (int s = 0; s < sample_count ; s ++) if (genotype_orig[g][s] < 0) genotype_orig[g][s] = mean;
+ }
+}
+
+void data::imputePhenotypes() {
+ LOG.println("\nImputing missing phenotypes");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ double mean = 0.0;
+ int c_mean = 0;
+ for (int s = 0; s < sample_count; s ++) {
+ if (!isnan(phenotype_orig[p][s])) {
+ mean += phenotype_orig [p][s];
+ c_mean ++;
+ }
+ }
+ mean /= c_mean;
+ for (int s = 0; s < sample_count ; s ++) if (isnan(phenotype_orig[p][s])) phenotype_orig[p][s] = mean;
+ }
+}
+
+void data::normalTranformPhenotypes() {
+ string method = "average";
+ LOG.println("\nQuantile normalise phenotypes to Normal distribution");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ vector < float > R;
+ myranker::rank(phenotype_orig[p], R, method);
+ double max = 0;
+ for (int s = 0 ; s < sample_count ; s ++) {
+ R[s] = R[s] - 0.5;
+ if (R[s] > max) max = R[s];
+ }
+ max = max + 0.5;
+ for (int s = 0 ; s < sample_count ; s ++) {
+ R[s] /= max;
+ phenotype_orig[p][s] = qnorm(R[s], 0.0, 1.0, 1, 0);
+ }
+ }
+}
+
+void data::initResidualizer() {
+ LOG.println("\nInitialize covariate ");
+ covariate_engine = new residualizer (sample_count);
+ for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->pushHard(covariate_val[c]);
+ if (interaction_val.size() > 0) covariate_engine->pushHard(interaction_val);
+}
+
+void data::rankTranformPhenotypes() {
+ string method = "average";
+ LOG.println("\nRanking phenotypes");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ vector < float > R;
+ myranker::rank(phenotype_orig[p], R, method);
+ phenotype_orig[p] = R;
+ }
+}
+
+void data::rankTranformGenotypes() {
+ string method = "average";
+ LOG.println("\nRanking genotypes");
+ for (int g = 0; g < genotype_count ; g ++) {
+ vector < float > R;
+ myranker::rank(genotype_orig[g], R, method);
+ genotype_orig[g] = R;
+ }
+}
+
+void data::normalize(vector < float > & X) {
+ double mean = 0.0, sum = 0.0;
+ for (int s = 0; s < sample_count ; s ++) mean += X[s];
+ mean /= sample_count;
+ for (int s = 0; s < sample_count ; s ++) {
+ X[s] -= mean;
+ sum += X[s] * X[s];
+ }
+ sum = sqrt(sum);
+ if (sum == 0) sum = 1;
+ for (int s = 0; s < sample_count ; s ++) X[s] /= sum;
+}
+
+void data::normalize(vector < vector < float > > & X) {
+ for (int x = 0 ; x < X.size() ; x++) {
+ double mean = 0.0, sum = 0.0;
+ for (int s = 0; s < sample_count ; s ++) mean += X[x][s];
+ mean /= sample_count;
+ for (int s = 0; s < sample_count ; s ++) {
+ X[x][s] -= mean;
+ sum += X[x][s] * X[x][s];
+ }
+ sum = sqrt(sum);
+ if (sum == 0) sum = 1;
+ for (int s = 0; s < sample_count ; s ++) X[x][s] /= sum;
+ }
+}
+
+void data::correct(vector < float > & X, vector < float > & Y) {
+ double corr = getCorrelation(X, Y);
+ for (int s = 0 ; s < sample_count ; s ++) Y[s] = Y[s] - corr * X[s];
+}
diff --git a/src/mle.cpp b/src/mle.cpp
new file mode 100644
index 0000000..54d4933
--- /dev/null
+++ b/src/mle.cpp
@@ -0,0 +1,88 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf_psi.h>
+#include <gsl/gsl_sf_gamma.h>
+
+#define BETA_SHAPE1_MIN 0.1
+#define BETA_SHAPE2_MIN 1
+#define BETA_SHAPE1_MAX 10
+#define BETA_SHAPE2_MAX 1000000 //to be changed for trans!
+
+double betaLogLikelihood(const gsl_vector *v, void *params) {
+ double * p = (double *) params;
+ double beta_shape1 = gsl_vector_get(v, 0);
+ double beta_shape2 = gsl_vector_get(v, 1);
+ return -1.0 * ((beta_shape1 - 1) * p[0] + (beta_shape2 - 1) * p[1] - p[2] * gsl_sf_lnbeta(beta_shape1, beta_shape2));
+}
+
+int data::mleBeta(vector < double > & pval, double & beta_shape1, double & beta_shape2) {
+
+ //Set starting point to moment matching estimates
+ gsl_vector * x = gsl_vector_alloc (2);
+ gsl_vector_set (x, 0, beta_shape1);
+ gsl_vector_set (x, 1, beta_shape2);
+
+ //Set initial step sizes to shape1 and shape2 scales
+ gsl_vector * ss = gsl_vector_alloc (2);
+ gsl_vector_set (ss, 0, beta_shape1/10);
+ gsl_vector_set (ss, 1, beta_shape2/10);
+
+ //Initialize method and iterate
+ double par [3];
+ par[0] = 0.0;
+ par[1] = 0.0;
+ for (int e = 0 ; e < pval.size(); e ++) {
+ if (pval[e] == 1.0) pval[e] = 0.99999999;
+ par[0] += log (pval[e]);
+ par[1] += log (1 - pval[e]);
+ }
+ par[2] = pval.size();
+ gsl_multimin_function minex_func;
+ minex_func.n = 2;
+ minex_func.f = betaLogLikelihood;
+ minex_func.params = par;
+
+ //Initialize optimization machinery
+ const gsl_multimin_fminimizer_type * T = gsl_multimin_fminimizer_nmsimplex2;
+ gsl_multimin_fminimizer * s = gsl_multimin_fminimizer_alloc (T, 2);
+ gsl_multimin_fminimizer_set (s, &minex_func, x, ss);
+
+ //Optimization iteration
+ size_t iter = 0;
+ int status;
+ double size;
+ do {
+ iter++;
+ status = gsl_multimin_fminimizer_iterate(s);
+ if (status) break;
+ size = gsl_multimin_fminimizer_size (s);
+ status = gsl_multimin_test_size (size, 0.01);
+ } while (status == GSL_CONTINUE && iter < 1000);
+
+ //Output new beta shape values
+ beta_shape1 = gsl_vector_get (s->x, 0);
+ beta_shape2 = gsl_vector_get (s->x, 1);
+
+ //Free allocated memory
+ gsl_vector_free(x);
+ gsl_vector_free(ss);
+ gsl_multimin_fminimizer_free (s);
+ return (status == GSL_SUCCESS);
+}
diff --git a/src/readCovariates.cpp b/src/readCovariates.cpp
new file mode 100644
index 0000000..7c03eed
--- /dev/null
+++ b/src/readCovariates.cpp
@@ -0,0 +1,77 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::readCovariates(string fcov) {
+ string buffer;
+ vector < string > str;
+ int n_includedS = 0;
+ int n_excludedS = 0;
+ int n_includedC = 0;
+ int n_excludedC = 0;
+ int n_missingS = 0;
+ vector < int > mappingS;
+
+ LOG.println("\nReading covariates in [" + fcov + "]");
+ ifile fd (fcov);
+
+ //Read samples
+ getline(fd, buffer);
+ if (buffer.size() == 0) LOG.error("No header line detected!");
+ sutils::tokenize(buffer, str, "\t");
+ for (int t = 1 ; t < str.size() ; t ++) {
+ if (checkSample(str[t], false)) {
+ int idx_sample = -1;
+ for (int i = 0 ; i < sample_count && idx_sample < 0 ; i++) if (sample_id[i] == sutils::remove_spaces(str[t])) idx_sample = i;
+ mappingS.push_back(idx_sample);
+ if (idx_sample >= 0) n_includedS ++;
+ else n_missingS ++;
+ } else {
+ mappingS.push_back(-1);
+ n_excludedS ++;
+ }
+ }
+ if (n_includedS != sample_count) LOG.error("Number of overlapping samples in covariate file is " + sutils::int2str(n_includedS) + " and should be " + sutils::int2str(sample_count));
+
+ //Read covariates
+ //int n_type0 = 0, n_type1 = 0;
+ while(getline(fd, buffer)) {
+ if (buffer.size() == 0) continue;
+ sutils::tokenize(buffer, str, "\t");
+ if (str.size() < 2) LOG.error("Wrong genotype covariate file format");
+ if (checkCovariate(str[0])) {
+ covariate_val.push_back(vector < string > (sample_count, "0"));
+ for (int t = 1 ; t < str.size() ; t ++) {
+ if (mappingS[t-1] >= 0) {
+ covariate_val.back()[mappingS[t-1]] = str[t];
+ }
+ }
+ n_includedC ++;
+ } else n_excludedC ++;
+ }
+
+ //Finalise
+ covariate_count = n_includedC;
+ LOG.println(" * " + sutils::int2str(n_includedS) + " samples included");
+ if (n_excludedS > 0) LOG.println(" * " + sutils::int2str(n_excludedS) + " samples excluded");
+ if (n_missingS > 0) LOG.println(" * " + sutils::int2str(n_missingS) + " samples excluded without phenotype data");
+ LOG.println(" * " + sutils::int2str(n_includedC) + " covariate(s) included");
+ if (n_excludedC > 0) LOG.println(" * " + sutils::int2str(n_excludedC) + " covariate(s) excluded");
+ fd.close();
+}
+
diff --git a/src/readGenotypes.cpp b/src/readGenotypes.cpp
new file mode 100644
index 0000000..94f432e
--- /dev/null
+++ b/src/readGenotypes.cpp
@@ -0,0 +1,117 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+#include "utils/tabix.hpp"
+
+
+void data::readGenotypesVCF(string fvcf) {
+ string buffer;
+ vector<string> str, field;
+ int n_includedG = 0;
+ int n_excludedG = 0;
+ int n_excludedF = 0;
+ int n_includedS = 0;
+ int n_excludedS = 0;
+ int n_missingS = 0;
+ int n_parsed = 0;
+ vector < int > mappingS;
+
+ //Initialise
+ LOG.println("\nReading genotype data in [" + fvcf + "] in VCF format");
+ if (!futils::isFile(fvcf + ".tbi")) LOG.error("index file missing [" + fvcf + ".tbi]");
+ Tabix fd (fvcf);
+
+ //Read samples
+ fd.getLastHeader(buffer);
+ if (buffer.size() == 0) LOG.error("No header line detected!");
+ sutils::tokenize(buffer, str);
+ if (str.size() < 10) LOG.error("Wrong VCF header format for sample ids");
+ for (int t = 9 ; t < str.size() ; t ++) {
+ if (checkSample(str[t], false)) {
+ int idx_sample = -1;
+ for (int i = 0 ; i < sample_count && idx_sample < 0 ; i++) if (sample_id[i] == str[t]) idx_sample = i;
+ mappingS.push_back(idx_sample);
+ if (idx_sample >= 0) n_includedS ++;
+ else n_missingS ++;
+ } else {
+ mappingS.push_back(-1);
+ n_excludedS ++;
+ }
+ }
+ if (n_includedS != sample_count) LOG.error("Genotype data does not overlap with phenotype data, check your files!");
+
+ //Read genotypes
+ if (!fd.setRegion(regionGenotype.str())) LOG.error("Failed to get region " + regionGenotype.str() + " in [" + fvcf + "]");
+ LOG.println(" * region = " + regionGenotype.str());
+ while (fd.getNextLine(buffer)) {
+ if (buffer.size() == 0) continue;
+ sutils::tokenize(buffer, str);
+ if (checkGenotype(str[2])) {
+ //Check VCF format
+ bool gt_field = false;
+ int idx_field = -1;
+ sutils::tokenize(str[8], field, ":");
+ for (int f = 0 ; f < field.size() ; f ++) if (field[f] == "DS") idx_field = f;
+ if (idx_field < 0) { for (int f = 0 ; f < field.size() ; f ++) if (field[f] == "GT") idx_field = f; gt_field = true; }
+ //Read data is format is correct
+ if (idx_field >= 0) {
+ genotype_id.push_back(str[2]);
+ genotype_chr.push_back(str[0]);
+ genotype_pos.push_back(atoi(str[1].c_str()));
+ genotype_orig.push_back(vector < float > (sample_count, 0.0));
+ genotype_curr.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 9 ; t < str.size() ; t ++) {
+ if (mappingS[t-9] >= 0) {
+ sutils::tokenize(str[t], field, ":");
+ if (str[t] == "." || str[t] == "NN" || str[t] == "NA") genotype_orig.back()[mappingS[t-9]] = -1.0;
+ else if (!gt_field) {
+ if (field[idx_field][0] == '.') genotype_orig.back()[mappingS[t-9]] = -1.0;
+ else {
+ float dosage = atof(field[idx_field].c_str());
+ //if (dosage < 0 || dosage > 2) LOG.error("Dosages must be between 0 and 2, check: " + field[idx_field]);
+ genotype_orig.back()[mappingS[t-9]] = dosage;
+ }
+ } else {
+ if (field[idx_field][0] == '.' || field[idx_field][2] == '.') genotype_orig.back()[mappingS[t-9]] = -1.0;
+ else {
+ int a0 = atoi(field[idx_field].substr(0, 1).c_str());
+ int a1 = atoi(field[idx_field].substr(2, 1).c_str());
+ int dosage = a0 + a1;
+ if (dosage < 0 || dosage > 2) LOG.error("Genotypes must be 00, 01, or 11, check: " + field[idx_field]);
+ genotype_orig.back()[mappingS[t-9]] = dosage;
+ }
+ }
+ }
+ }
+ n_includedG ++;
+ } else n_excludedF ++;
+ } else n_excludedG ++;
+ n_parsed++;
+ if (n_parsed % 100000 == 0) LOG.println(" * " + sutils::int2str(n_parsed) + " lines parsed");
+ }
+
+ //Finalise
+ genotype_count = n_includedG;
+ //LOG.println(" * region = " + regionGenotype.str());
+ LOG.println(" * " + sutils::int2str(n_includedS) + " samples included");
+ if (n_excludedS > 0) LOG.println(" * " + sutils::int2str(n_excludedS) + " samples excluded");
+ if (n_missingS > 0) LOG.println(" * " + sutils::int2str(n_excludedS) + " samples excluded without phenotype data");
+ LOG.println(" * " + sutils::int2str(n_includedG) + " sites included");
+ if (n_excludedG > 0) LOG.println(" * " + sutils::int2str(n_excludedG) + " sites excluded");
+ if (n_excludedF > 0) LOG.println(" * " + sutils::int2str(n_excludedF) + " sites excluded because of missing GT/DS field");
+ if (n_includedG <= 0) LOG.error("No genotypes in this region: " + regionPhenotype.str());
+}
diff --git a/src/readGroups.cpp b/src/readGroups.cpp
new file mode 100644
index 0000000..b50b5f5
--- /dev/null
+++ b/src/readGroups.cpp
@@ -0,0 +1,54 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+#include "utils/tabix.hpp"
+
+
+void data::readGroups(string fgrp) {
+ string buffer;
+ vector < string > str;
+
+ phenotype_grp = vector < string > (phenotype_count, "");
+ vector < bool > phenotype_mask = vector < bool > (phenotype_count, false);
+
+ //Initialise
+ LOG.println("\nReading phenotype groups in [" + fgrp + "]");
+ ifile fdg (fgrp);
+ while (getline(fdg, buffer)) {
+ sutils::tokenize(buffer, str);
+
+ if (str.size() < 2) LOG.error("Incorrect number of columns in [" + fgrp + "], observed = " + sutils::int2str(str.size()) + " expected == 2");
+
+ int phenotype_idx = -1;
+ for (int p = 0 ; p < phenotype_count && phenotype_idx < 0 ; p ++) if (phenotype_id[p] == str[0]) phenotype_idx = p;
+
+ if (phenotype_idx >= 0) {
+ phenotype_grp[phenotype_idx] = str[1];
+ phenotype_mask[phenotype_idx] = true;
+ }
+ }
+ fdg.close();
+
+ //3.0 Make sure that each MP has a qvalue
+ int n_set= 0, n_unset = 0;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (phenotype_mask[p]) n_set ++;
+ else n_unset ++;
+ }
+ LOG.println(" * #phenotypes set = " + sutils::int2str(n_set));
+ if (n_unset > 0) LOG.error(sutils::int2str(n_unset) + " phenotypes without groups!");
+}
diff --git a/src/readInclusionsExclusions.cpp b/src/readInclusionsExclusions.cpp
new file mode 100644
index 0000000..3d77404
--- /dev/null
+++ b/src/readInclusionsExclusions.cpp
@@ -0,0 +1,89 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+void data::readSamplesToExclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of samples to exclude in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) sample_exclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(sample_exclusion.size()) + " sample(s) found");
+ fd.close();
+}
+
+void data::readSamplesToInclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of samples to include in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) sample_inclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(sample_inclusion.size()) + " sample(s) found");
+ fd.close();
+}
+
+void data::readGenotypesToExclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of sites to exclude in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) genotype_exclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(genotype_exclusion.size()) + " site(s) found");
+ fd.close();
+}
+
+void data::readGenotypesToInclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of sites to include in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) genotype_inclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(genotype_inclusion.size()) + " site(s) found");
+ fd.close();
+}
+
+void data::readPhenotypesToExclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of phenotypes to exclude in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) phenotype_exclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(phenotype_exclusion.size()) + " phenotype(s) found");
+ fd.close();
+}
+
+void data::readPhenotypesToInclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of phenotypes to include in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) phenotype_inclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(phenotype_inclusion.size()) + " phenotype(s) found");
+ fd.close();
+}
+
+void data::readCovariatesToExclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of covariates to exclude in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) covariate_exclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(covariate_exclusion.size()) + " covariate(s) found");
+ fd.close();
+}
+
+void data::readCovariatesToInclude(string file) {
+ string buffer;
+ LOG.println("\nReading list of covariates to include in [" + file + "]");
+ ifile fd(file);
+ while(getline(fd, buffer, '\n')) covariate_inclusion.insert(buffer);
+ LOG.println(" * " + sutils::int2str(covariate_inclusion.size()) + " covariate(s) found");
+ fd.close();
+}
diff --git a/src/readInteractions.cpp b/src/readInteractions.cpp
new file mode 100644
index 0000000..75d8459
--- /dev/null
+++ b/src/readInteractions.cpp
@@ -0,0 +1,51 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::readInteractions(string fcov) {
+ string buffer;
+ vector < string > str;
+ int n_sample_found = 0;
+
+ LOG.println("\nReading interaction term in [" + fcov + "]");
+ ifile fd (fcov);
+
+ //Read interactions
+ interaction_val = vector < float >(sample_count, 0.0);
+ while(getline(fd, buffer)) {
+ sutils::tokenize(buffer, str, "\t");
+ if (str.size() != 2) LOG.error("Wrong interaction file format");
+
+ //
+ int idx_sample = -1;
+ for (int i = 0 ; i < sample_count && idx_sample < 0 ; i++) if (sample_id[i] == sutils::remove_spaces(str[0])) idx_sample = i;
+
+ //
+ if (idx_sample >=0) {
+ interaction_val[idx_sample] = atof(str[1].c_str());
+ n_sample_found ++;
+ }
+ }
+ if (n_sample_found != sample_count)
+ LOG.error("Number of overlapping samples in interaction file is " + sutils::int2str(n_sample_found) + " and should be " + sutils::int2str(sample_count));
+
+ //Finalise
+ LOG.println(" * " + sutils::int2str(n_sample_found) + " samples included");
+ fd.close();
+}
+
diff --git a/src/readPhenotypes.cpp b/src/readPhenotypes.cpp
new file mode 100644
index 0000000..46978ff
--- /dev/null
+++ b/src/readPhenotypes.cpp
@@ -0,0 +1,119 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+#include "utils/tabix.hpp"
+
+
+void data::readPhenotypes(string fbed) {
+ string buffer;
+ vector < string > str;
+ int n_includedP = 0;
+ int n_excludedP = 0;
+ int n_includedS = 0;
+ int n_excludedS = 0;
+ int n_parsed = 0;
+ vector < int > mappingS;
+
+ //Initialise
+ LOG.println("\nReading phenotype data in [" + fbed + "]");
+ if (!futils::isFile(fbed + ".tbi")) LOG.error("index file missing [" + fbed + ".tbi]");
+ Tabix fd (fbed);
+
+ //Read samples ids
+ fd.getLastHeader(buffer);
+ if (buffer.size() == 0) LOG.error("No header line detected");
+ sutils::tokenize(buffer, str);
+ if (str.size() < 5) LOG.error("Wrong BED header format for sample ids");
+ for (int t = 4 ; t < str.size() ; t ++) {
+ if (checkSample(str[t])) {
+ sample_id.push_back(str[t]);
+ mappingS.push_back(n_includedS);
+ n_includedS ++;
+ } else {
+ mappingS.push_back(-1);
+ n_excludedS ++;
+ }
+ }
+ sample_count = sample_id.size();
+
+ //Read phenotypes
+ if (!fd.setRegion(regionPhenotype.str())) LOG.error("Failed to get region " + regionPhenotype.str() + " in [" + fbed + "]");
+ while (fd.getNextLine(buffer)) {
+ if (buffer.size() == 0) continue;
+ sutils::tokenize(buffer, str);
+ if (str.size() < 5) LOG.error("Wrong BED file format 2 = " +sutils::int2str(str.size()));
+ if (checkPhenotype(str[3])) {
+ phenotype_id.push_back(str[3]);
+ phenotype_chr.push_back(str[0]);
+ phenotype_start.push_back(atoi(str[1].c_str()) + 1); //convert to 1-based, tabix works on 1-based coordinates and all other files are 1-based
+ phenotype_end.push_back(atoi(str[2].c_str()));
+ phenotype_orig.push_back(vector < float > (sample_count, 0.0));
+ for (int t = 4 ; t < str.size() ; t ++) {
+ if (mappingS[t-4] >= 0) {
+ if (str[t] == "NA") phenotype_orig.back()[mappingS[t-4]] = ___NA___;
+ else if (!sutils::isNumeric(str[t])) LOG.error("Phenotype encountered is not a number, check: [" + str[t] + "]");
+ else phenotype_orig.back()[mappingS[t-4]] = atof(str[t].c_str());
+ }
+ }
+ n_includedP++;
+ } else n_excludedP ++;
+ n_parsed++;
+ if (n_parsed % 100000 == 0) LOG.println(" * " + sutils::int2str(n_parsed) + " lines parsed");
+ }
+
+ //Finalise
+ phenotype_count = phenotype_id.size();
+ LOG.println(" * region = " + regionPhenotype.str());
+ LOG.println(" * " + sutils::int2str(n_includedS) + " samples included");
+ if (n_excludedS > 0) LOG.println(" * " + sutils::int2str(n_excludedS) + " samples excluded");
+ LOG.println(" * " + sutils::int2str(n_includedP) + " phenotypes included");
+ if (n_excludedP > 0) LOG.println(" * " + sutils::int2str(n_excludedP) + " phenotypes excluded");
+ if (phenotype_count == 0) LOG.error("No phenotypes in this region");
+}
+
+void data::scanPhenotypes(string fbed) {
+ string buffer;
+ vector < string > str;
+ int n_includedP = 0;
+ int n_duplicateP = 0;
+
+ //Initialise
+ LOG.println("\nScanning phenotype data in [" + fbed + "]");
+ if (!futils::isFile(fbed + ".tbi")) LOG.error("index file missing [" + fbed + ".tbi]");
+ Tabix fd (fbed);
+
+ //Read lines
+ fd.getLastHeader(buffer);
+ while (fd.getNextLine(buffer)) {
+ if (buffer.size() == 0) continue;
+ sutils::tokenize(buffer, str, 4);
+ if (str.size() != 4) LOG.error("Wrong BED file format");
+ if (checkPhenotype(str[3], false)) {
+ phenotype_id.push_back(str[3]);
+ phenotype_chr.push_back(str[0]);
+ phenotype_start.push_back(atoi(str[1].c_str()) + 1); //convert to 1-based, tabix works on 1-based coordinates and all other files are 1-based
+ phenotype_end.push_back(atoi(str[2].c_str()));
+ n_includedP++;
+ } else n_duplicateP ++;
+ }
+
+ //Finalise
+ phenotype_count = phenotype_id.size();
+ LOG.println(" * " + sutils::int2str(n_includedP) + " phenotypes");
+ if (n_duplicateP > 0) LOG.error("Duplicated phenotype IDs found n=" + sutils::int2str(n_duplicateP));
+ if (phenotype_count == 0) LOG.error("No phenotypes in this region");
+}
diff --git a/src/readThresholds.cpp b/src/readThresholds.cpp
new file mode 100644
index 0000000..184f1c6
--- /dev/null
+++ b/src/readThresholds.cpp
@@ -0,0 +1,53 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::readThresholds(string fres) {
+ string buffer;
+ vector < string > str;
+
+ //1.0 Allocation
+ phenotype_threshold = vector < double > (phenotype_count, -1);
+ vector < bool > phenotype_mask = vector < bool > (phenotype_count, false);
+
+ //2.0 Read results
+ LOG.println("\nReading Qvalues in [" + fres + "]");
+ ifile fdr(fres);
+ while (getline(fdr, buffer)) {
+ sutils::tokenize(buffer, str);
+ if (str.size() < 2) LOG.error("Incorrect number of columns in [" + fres + "], observed = " + sutils::int2str(str.size()) + " expected > 2");
+
+ int phenotype_idx = -1;
+ for (int p = 0 ; p < phenotype_count && phenotype_idx < 0 ; p ++) if (phenotype_id[p] == str[0]) phenotype_idx = p;
+
+ if (phenotype_idx >= 0) {
+ phenotype_threshold[phenotype_idx] = atof(str[1].c_str());
+ phenotype_mask[phenotype_idx] = true;
+ }
+ }
+ fdr.close();
+
+ //3.0 Make sure that each MP has a qvalue
+ int n_set= 0, n_unset = 0;
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+ if (phenotype_mask[p]) n_set ++;
+ else n_unset ++;
+ }
+ LOG.println(" * #phenotypes set = " + sutils::int2str(n_set));
+ if (n_unset > 0) LOG.error(sutils::int2str(n_unset) + " phenotypes without qvalues!");
+}
diff --git a/src/region.h b/src/region.h
new file mode 100644
index 0000000..fa3ceee
--- /dev/null
+++ b/src/region.h
@@ -0,0 +1,66 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef _REGION_H
+#define _REGION_H
+
+#define POS_MIN 0
+#define POS_MAX 1000000000
+
+class region {
+public:
+ string chr;
+ int start;
+ int end;
+
+ region() {
+ chr = "chrNA";
+ start = POS_MIN;
+ end = POS_MAX;
+ }
+
+ string str() {
+ ostringstream s2( stringstream::out );
+ s2 << chr << ":" << start << "-" << end;
+ return s2.str();
+ }
+
+ bool check (string _chr, int _pos) {
+ if (_chr == chr && _pos >= start && _pos < end) return true;
+ else return false;
+ }
+
+ bool set(string r) {
+ vector < string > chr_split, pos_split;
+ sutils::tokenize(r, chr_split, ":");
+ if (chr_split.size() == 2) {
+ chr = chr_split[0];
+ sutils::tokenize(chr_split[1], pos_split, "-");
+ if (pos_split.size() == 2) {
+ start = atoi(pos_split[0].c_str());
+ end = atoi(pos_split[1].c_str());
+ return true;
+ } else return false;
+ } else if (chr_split.size() == 1) {
+ chr = chr_split[0];
+ start = POS_MIN;
+ end = POS_MAX;
+ return true;
+ } else return false;
+ }
+};
+
+#endif
diff --git a/src/residualizer.cpp b/src/residualizer.cpp
new file mode 100644
index 0000000..e5701e4
--- /dev/null
+++ b/src/residualizer.cpp
@@ -0,0 +1,142 @@
+
+#define FASTQTL_USE_QR
+#define FASTQTL_QR_TOLERANCE 1e-7 //this is set to the same tolerance as R
+
+#include "residualizer.h"
+
+
+residualizer::residualizer(int _n_samples) : n_samples (_n_samples) {
+ matrices_uptodate = false;
+}
+
+residualizer::~residualizer() {
+ hard_covariates.clear();
+ soft_covariates.clear();
+ m_rank = 0;
+ covarM.resize(0,0);
+ PQR_Q.resize(0,0);
+ PQR_Q_A.resize(0,0);
+ matrices_uptodate = false;
+}
+
+int residualizer::nCovariates() {
+ return soft_covariates.size() + hard_covariates.size();
+}
+
+void residualizer::clearSoft() {
+ if (soft_covariates.size() > 0) {
+ soft_covariates.clear();
+ matrices_uptodate = false;
+ }
+}
+
+void residualizer::pushHard(vector < string > & covariate) {
+ //INIT
+ set < int > i_yesmissing, i_nonmissing;
+ set < string > factors;
+
+ //TEST FOR NUMERIC & MISSING
+ for (int i = 0 ; i < covariate.size() ; i++) {
+ if (covariate[i] == "NA") i_yesmissing.insert(i);
+ else {
+ if (!sutils::isNumeric(covariate[i])) factors.insert(covariate[i]);
+ i_nonmissing.insert(i);
+ }
+ }
+
+ //FILL IN VALUES
+ unsigned int starting_row = hard_covariates.size();
+ if (factors.size() == 1) LOG.warning("Non-numeric covariate with only 1 factor, covariate dropped!");
+ else if (factors.size() > 1) {
+ set < string >::iterator itF = factors.begin(); itF ++;
+ for (; itF != factors.end() ; ++itF) {
+ hard_covariates.push_back(vector < float > (n_samples, 0.0));
+ for (int i = 0 ; i < covariate.size() ; i++) if (covariate[i] == *itF) hard_covariates.back()[i] = 1.0;
+ }
+ } else {
+ hard_covariates.push_back(vector < float > (n_samples, 0.0));
+ for (int i = 0 ; i < covariate.size() ; i++) hard_covariates.back()[i] = atof(covariate[i].c_str());
+ }
+
+ //IMPUTE MISSING
+ if (i_yesmissing.size() > 0) {
+ for (int i_row = starting_row ; i_row < hard_covariates.size() ; i_row ++) {
+ float mean_row = 0;
+ for (set < int >::iterator itNM = i_nonmissing.begin(); itNM != i_nonmissing.end() ; ++itNM) mean_row += hard_covariates[i_row][*itNM];
+ mean_row /= i_nonmissing.size();
+ for (set < int >::iterator itYM = i_yesmissing.begin(); itYM != i_yesmissing.end() ; ++itYM) hard_covariates[i_row][*itYM] = mean_row;
+ }
+ }
+
+ matrices_uptodate = false;
+}
+
+void residualizer::pushSoft(vector < float > & covariate) {
+ for (int i = 1, same = 1 ; i < covariate.size(); i++) {
+ if (covariate[i] != covariate[i-1]) same = 0;
+ if (i == covariate.size() - 1 && same == 1) return;
+ }
+ soft_covariates.push_back(vector < float > (n_samples, 0.0));
+ for (int i = 0 ; i < covariate.size() ; i++) soft_covariates.back()[i] = covariate[i];
+ matrices_uptodate = false;
+}
+
+void residualizer::pushHard(vector < float > & covariate) {
+ for (int i = 1, same = 1 ; i < covariate.size(); i++) {
+ if (covariate[i] != covariate[i-1]) same = 0;
+ if (i == covariate.size() - 1 && same == 1) return;
+ }
+ hard_covariates.push_back(vector < float > (n_samples, 0.0));
+ for (int i = 0 ; i < covariate.size() ; i++) hard_covariates.back()[i] = covariate[i];
+ matrices_uptodate = false;
+}
+
+void residualizer::update() {
+ covarM.resize(n_samples,nCovariates() + 1);
+ covarM.col(0) = VectorXd::Ones(n_samples);
+ for(int h = 0; h < hard_covariates.size() ; h ++) for(int c = 0 ; c < n_samples ; c ++) covarM(c, h + 1) = hard_covariates[h][c];
+ for(int s = 0; s < soft_covariates.size() ; s ++) for(int c = 0 ; c < n_samples ; c ++) covarM(c, s + hard_covariates.size() + 1) = soft_covariates[s][c];
+ PQR = ColPivHouseholderQR<MatrixXd>(covarM);
+ PQR.setThreshold(FASTQTL_QR_TOLERANCE);
+ m_rank = PQR.rank();
+ if (m_rank != nCovariates()+1) {
+ PQR_Q = PQR.householderQ();
+ PQR_Q_A = PQR.householderQ().adjoint();
+ LOG.warning("Linear dependency between covariates [#dropped=" +sutils::int2str(nCovariates()+1-m_rank) + "]");
+ }
+ matrices_uptodate = true;
+}
+
+void residualizer::residualize(vector < float > & data) {
+ if (nCovariates() == 0) return;
+ //TEST IF DATA VARIABLE
+ for (int i = 1, same = 1 ; i < data.size(); i++) {
+ if (data[i] != data[i-1]) same = 0;
+ if (i == data.size() - 1 && same == 1) return;
+ }
+
+ //FILL IN DATA
+ VectorXd counts(n_samples);
+ for(int i = 0; i < n_samples ; i ++) counts(i) = (double) data[i];
+
+ //MATRICES UPDATE
+ if (!matrices_uptodate) update();
+
+ //CORRECTION
+ if (m_rank == nCovariates()+1) {
+ VectorXd m_coef = PQR.solve(counts);
+ VectorXd fitted = covarM * m_coef;
+ VectorXd e = counts - fitted;
+ for (int i = 0; i < e.size(); i ++) data[i] = e(i);
+ } else {
+ VectorXd effects(PQR_Q_A * counts);
+ effects.tail(n_samples - m_rank).setZero();
+ VectorXd fitted = PQR_Q * effects;
+ VectorXd e = counts - fitted;
+ for (int i = 0; i < e.size(); i ++) data[i] = (float)e(i);
+ }
+}
+
+void residualizer::residualize(vector < vector < float > > & data) {
+ for (int d = 0; d < data.size() ; d ++) residualize(data[d]);
+}
diff --git a/src/residualizer.h b/src/residualizer.h
new file mode 100644
index 0000000..3cddb8f
--- /dev/null
+++ b/src/residualizer.h
@@ -0,0 +1,53 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef _RESIDUALIZER_H
+#define _RESIDUALIZER_H
+
+#include <Dense>
+#include <LU>
+#include "utils/utils.h"
+
+using namespace Eigen;
+
+class residualizer {
+public:
+
+ int n_samples;
+ vector < vector < float > > hard_covariates;
+ vector < vector < float > > soft_covariates;
+
+ bool matrices_uptodate;
+ int m_rank;
+ MatrixXd covarM;
+ MatrixXd PQR_Q;
+ MatrixXd PQR_Q_A;
+ ColPivHouseholderQR < MatrixXd > PQR;
+
+ residualizer(int);
+ ~residualizer();
+
+ int nCovariates();
+ void clearSoft();
+ void pushHard(vector < string > &);
+ void pushHard(vector < float > &);
+ void pushSoft(vector < float > &);
+ void update();
+ void residualize(vector < float > &);
+ void residualize(vector < vector < float > > &);
+};
+
+#endif
diff --git a/src/utils/ranker.h b/src/utils/ranker.h
new file mode 100644
index 0000000..3e009e6
--- /dev/null
+++ b/src/utils/ranker.h
@@ -0,0 +1,224 @@
+/*
+FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef RANKER_H
+#define RANKER_H
+
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+using std::vector;
+using std::string;
+
+#ifndef uint
+typedef unsigned int uint;
+#endif
+
+namespace myranker{
+ template <class T>
+ class lt { public: static int compare(T a, T b) { return(a < b); } };
+ template <class T>
+ class gt { public: static int compare(T a, T b) { return(a > b); } };
+
+ template <class T, class C>
+ class ranker
+ {
+ private:
+ const T* p;
+ uint sz;
+
+ public:
+ ranker(const vector<T>& v) : p(&v[0]), sz(v.size()) { }
+ ranker(const T* tp, uint s) : p(tp), sz(s) { }
+
+ int operator()(uint i1, uint i2) const { return(C::compare(p[i1],p[i2])); }
+
+ template <class S>
+ void get_orders(vector<S>& w) const {
+ w.resize(sz);
+ w.front() = 0;
+ for (typename vector<S>::iterator i = w.begin(); i != w.end() - 1; ++i)
+ *(i + 1) = *i + 1;
+ std::sort(w.begin(), w.end(), *this);
+ }
+
+ template <class S>
+ void get_partial_orders(vector<S>& w, uint num) const {
+ if (num > sz) num = sz;
+ w.resize(sz);
+ w.front() = 0;
+ for (typename vector<S>::iterator i = w.begin(); i != w.end() - 1; ++i)
+ *(i + 1) = *i + 1;
+ std::partial_sort(w.begin(), w.begin() + num, w.end(), *this);
+ w.resize(num);
+ }
+
+ template <class S>
+ void get_ranks(vector<S>& w, const string& method) const {
+ w.resize(sz);
+ vector<uint> tmp(w.size());
+ get_orders(tmp);
+ if (method == "average") {
+ for (uint c = 0, reps; c < w.size(); c += reps) {
+ reps = 1;
+ while (c + reps < w.size() && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k) w[tmp[c + k]] = S(2 * c + reps - 1) / 2 + 1;
+ }
+ } else if (method == "min") {
+ for (uint c = 0, reps; c < w.size(); c += reps) {
+ reps = 1;
+ while (c + reps < w.size() && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k) w[tmp[c + k]] = c + 1;
+ }
+ } else if (method == "max") {
+ for (uint c = 0, reps; c < w.size(); c += reps) {
+ reps = 1;
+ while (c + reps < w.size() && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k) w[tmp[c + k]] = c + reps;
+ }
+ } else // default
+ for (uint c = 0; c < w.size(); ++c) w[tmp[c]] = c + 1;
+ }
+
+ template <class S>
+ void get_partial_ranks(vector<S>& w, const string& method, size_t num) const {
+ if (num > sz) num = sz;
+ vector<uint> tmp(sz);
+ get_partial_orders(tmp, num);
+ w.resize(sz);
+ fill(w.begin(), w.end(), 0);
+ if (method == "average") {
+ for (uint c = 0, reps; c < num; c += reps) { reps = 1;
+ while (c + reps < num && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k)
+ w[tmp[c + k]] = S(2 * c + reps - 1) / 2 + 1;
+ }
+ } else if (method == "min") {
+ for (uint c = 0, reps; c < num; c += reps) { reps = 1;
+ while (c + reps < num && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k) w[tmp[c + k]] = c + 1;
+ }
+ } else if (method == "max") {
+ for (uint c = 0, reps; c < num; c += reps) { reps = 1;
+ while (c + reps < num && p[tmp[c]] == p[tmp[c + reps]]) ++reps;
+ for (uint k = 0; k < reps; ++k) w[tmp[c + k]] = c + reps;
+ }
+ } else // default
+ for (uint c = 0; c < num; ++c) w[tmp[c]] = c + 1;
+ }
+
+ };
+
+ template <class T, class S>
+ inline void rank(const vector<T>& v, vector<S>& w,
+ const string& method = "average")
+ { ranker<T, lt<T> > r(v); r.get_ranks(w, method); }
+
+ template <class T, class S>
+ inline void rank(const T* d, uint size, vector<S>& w,
+ const string& method = "average")
+ { ranker<T, lt<T> > r(d, size); r.get_ranks(w, method); }
+
+ template <class T, class S>
+ inline void partial_rank(const vector<T>& v, vector<S>& w, uint num,
+ const string& method = "average")
+ { ranker<T, lt<T> > r(v); r.get_partial_ranks(w, method, num); }
+
+ template <class T, class S>
+ inline void partial_rank(const T* d, uint size, vector<S>& w, uint num,
+ const string& method = "average")
+ { ranker<T, lt<T> > r(d, size); r.get_partial_ranks(w, method, num); }
+
+ template <class T, class S>
+ inline void order(const vector<T>& v, vector<S>& w)
+ { ranker<T, lt<T> > r(v); r.get_orders(w); }
+
+ template <class T, class S>
+ inline void order(const T* d, uint size, vector<S>& w)
+ { ranker<T, lt<T> > r(d, size); r.get_orders(w); }
+
+ template <class T, class S>
+ inline void partial_order(const vector<T>& v, vector<S>& w, uint num)
+ { ranker<T, lt<T> > r(v); r.get_partial_orders(w, num); }
+
+ template <class T, class S>
+ inline void partial_order(const T* d, uint size, vector<S>& w, uint num)
+ { ranker<T, lt<T> > r(d, size); r.get_partial_orders(w, num); }
+
+ template <class T, class S>
+ inline void rankhigh(const vector<T>& v, vector<S>& w,
+ const string& method = "average")
+ { ranker<T, gt<T> > r(v); r.get_ranks(w, method); }
+
+ template <class T, class S>
+ inline void rankhigh(const T* d, uint size, vector<S>& w,
+ const string& method = "average")
+ { ranker<T, gt<T> > r(d, size); r.get_ranks(w, method); }
+
+ template <class T, class S>
+ inline void partial_rankhigh(const vector<T>& v, vector<S>& w, uint num,
+ const string& method = "average")
+ { ranker<T, gt<T> > r(v); r.get_partial_ranks(w, method, num); }
+
+ template <class T, class S>
+ inline void partial_rankhigh(const T* d, uint size, vector<S>& w, uint num,
+ const string& method = "average")
+ { ranker<T, gt<T> > r(d, size); r.get_partial_ranks(w, method, num); }
+
+ template <class T, class S>
+ inline void orderhigh(const vector<T>& v, vector<S>& w)
+ { ranker<T, gt<T> > r(v); r.get_orders(w); }
+
+ template <class T, class S>
+ inline void orderhigh(const T* d, uint size, vector<S>& w)
+ { ranker<T, gt<T> > r(d, size); r.get_orders(w); }
+
+ template <class T, class S>
+ inline void partial_orderhigh(const vector<T>& v, vector<S>& w, uint num)
+ { ranker<T, gt<T> > r(v); r.get_partial_orders(w, num); }
+
+ template <class T, class S>
+ inline void partial_orderhigh(const T* d, uint size, vector<S>& w, uint num)
+ { ranker<T, gt<T> > r(d, size); r.get_partial_orders(w, num); }
+
+ template <class T>
+ inline T quantile(const T* d, const uint size, const double q)
+ {
+ if (size == 0) return T(0);
+ if (size == 1) return d[0];
+ if (q <= 0) return *std::min_element(d, d + size);
+ if (q >= 1) return *std::max_element(d, d + size);
+
+ double pos = (size - 1) * q;
+ uint ind = uint(pos);
+ double delta = pos - ind;
+ vector<T> w(size); std::copy(d, d + size, w.begin());
+ std::nth_element(w.begin(), w.begin() + ind, w.end());
+ T i1 = *(w.begin() + ind);
+ T i2 = *std::min_element(w.begin() + ind + 1, w.end());
+ return i1 * (1.0 - delta) + i2 * delta;
+ }
+
+ template <class T>
+ inline T quantile(const vector<T>& v, const double q)
+ { return quantile(&v[0], v.size(), q); }
+};
+
+#endif
diff --git a/src/utils/tabix.cpp b/src/utils/tabix.cpp
new file mode 100644
index 0000000..bd1d53c
--- /dev/null
+++ b/src/utils/tabix.cpp
@@ -0,0 +1,107 @@
+/*
+ * This is a C++ wrapper around tabix project which abstracts some of the details of opening and jumping in tabix-indexed files.
+ * Author: Erik Garrison erik.garrison at gmail.com
+ */
+
+#include "tabix.hpp"
+
+Tabix::Tabix(void) { }
+
+Tabix::Tabix(string& file) {
+ filename = file;
+ const char* cfilename = file.c_str();
+ struct stat stat_tbi,stat_vcf;
+ char *fnidx = (char*) calloc(strlen(cfilename) + 5, 1);
+ strcat(strcpy(fnidx, cfilename), ".tbi");
+ if ( bgzf_is_bgzf(cfilename)!=1 )
+ {
+ cerr << "[tabix++] was bgzip used to compress this file? " << file << endl;
+ free(fnidx);
+ exit(1);
+ }
+ // Common source of errors: new VCF is used with an old index
+ stat(fnidx, &stat_tbi);
+ stat(cfilename, &stat_vcf);
+ if ( stat_vcf.st_mtime > stat_tbi.st_mtime )
+ {
+ cerr << "[tabix++] the index file is older than the vcf file. Please use '-f' to overwrite or reindex." << endl;
+ free(fnidx);
+ exit(1);
+ }
+ free(fnidx);
+
+ if ((t = ti_open(cfilename, 0)) == 0) {
+ cerr << "[tabix++] fail to open the data file." << endl;
+ exit(1);
+ }
+
+ if (ti_lazy_index_load(t) < 0) {
+ cerr << "[tabix++] failed to load the index file." << endl;
+ exit(1);
+ }
+
+ idxconf = ti_get_conf(t->idx);
+
+ // set up the iterator, defaults to the beginning
+ iter = ti_query(t, 0, 0, 0);
+
+}
+
+Tabix::~Tabix(void) {
+ ti_iter_destroy(iter);
+ ti_close(t);
+}
+
+void Tabix::getHeader(string& header) {
+ header.clear();
+ ti_iter_destroy(iter);
+ iter = ti_query(t, 0, 0, 0);
+ const char* s;
+ int len;
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ if ((int)(*s) != idxconf->meta_char) {
+ firstline = string(s); // stash this line
+ break;
+ } else {
+ header += string(s);
+ header += "\n";
+ }
+ }
+}
+
+void Tabix::getLastHeader(string & header) {
+ header.clear();
+ ti_iter_destroy(iter);
+ iter = ti_query(t, 0, 0, 0);
+ const char* s;
+ int len;
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ if ((int)(*s) != idxconf->meta_char) {
+ firstline = string(s); // stash this line
+ break;
+ } else header = string(s);
+ }
+}
+
+bool Tabix::setRegion(string region) {
+ if (ti_parse_region(t->idx, region.c_str(), &tid, &beg, &end) == 0) {
+ firstline.clear();
+ ti_iter_destroy(iter);
+ iter = ti_queryi(t, tid, beg, end);
+ return true;
+ } else return false;
+}
+
+bool Tabix::getNextLine(string & line) {
+ const char* s;
+ int len;
+ if (!firstline.empty()) {
+ line = firstline; // recovers line read if header is parsed
+ firstline.clear();
+ return true;
+ }
+ if ((s = ti_read(t, iter, &len)) != 0) {
+ line = string(s);
+ return true;
+ } else return false;
+}
diff --git a/src/utils/tabix.hpp b/src/utils/tabix.hpp
new file mode 100644
index 0000000..9b6013a
--- /dev/null
+++ b/src/utils/tabix.hpp
@@ -0,0 +1,41 @@
+/*
+ * This is a C++ wrapper around tabix project which abstracts some of the details of opening and jumping in tabix-indexed files.
+ * Author: Erik Garrison erik.garrison at gmail.com
+ */
+
+#ifndef _TABIX_HPP
+#define _TABIX_HPP
+
+#include <string>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <bgzf.h>
+#include <tabix.h>
+#include <iostream>
+
+
+using namespace std;
+
+class Tabix {
+
+ tabix_t *t;
+ ti_iter_t iter;
+ const ti_conf_t *idxconf;
+ int tid, beg, end;
+ string firstline;
+
+public:
+
+ string filename;
+
+ Tabix(void);
+ Tabix(string& file);
+ ~Tabix(void);
+
+ void getHeader(string& header);
+ void getLastHeader(string & header);
+ bool setRegion(string region);
+ bool getNextLine(string& line);
+};
+
+#endif
diff --git a/src/utils/utils.cpp b/src/utils/utils.cpp
new file mode 100644
index 0000000..890d513
--- /dev/null
+++ b/src/utils/utils.cpp
@@ -0,0 +1,731 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "utils.h"
+
+long seed = -123456789;
+pthread_mutex_t mutex_rng;
+
+namespace putils {
+ void initRandom(long s) {
+ seed = - s;
+ pthread_mutex_init(&mutex_rng, NULL);
+ }
+
+ double mean(vector < double > & X) {
+ double mean = 0.0;
+ for (int x = 0 ; x < X.size() ; x ++) mean += X[x];
+ mean /= X.size();
+ return mean;
+ }
+
+ double variance(vector < double > & X, double mean) {
+ double variance = 0.0;
+ for (int x = 0 ; x < X.size() ; x++) variance += (X[x] - mean) * (X[x] - mean);
+ variance /= (X.size() - 1);
+ return variance;
+ }
+
+ double mean(vector < float > & X) {
+ double mean = 0.0;
+ for (int x = 0 ; x < X.size() ; x ++) mean += X[x];
+ mean /= X.size();
+ return mean;
+ }
+
+ double variance(vector < float > & X, double mean) {
+ double variance = 0.0;
+ for (int x = 0 ; x < X.size() ; x++) variance += (X[x] - mean) * (X[x] - mean);
+ variance /= (X.size() - 1);
+ return variance;
+ }
+
+ bool isVariable(vector < float > & v) {
+ for (int i = 1 ; i < v.size(); i++) if (v[i] != v[i-1]) return true;
+ return false;
+ }
+
+ bool isVariable(vector < double > & v) {
+ for (int i = 1 ; i < v.size(); i++) if (v[i] != v[i-1]) return true;
+ return false;
+ }
+
+ double getRandom() {
+ pthread_mutex_lock(&mutex_rng);
+ int j;
+ long k;
+ static long iy=0;
+ static long iv[NTAB];
+ double temp;
+ if (seed <= 0 || !iy) {
+ if (-(seed) < 1) seed=1;
+ else seed = -(seed);
+ for (j=NTAB+7;j>=0;j--) {
+ k=(seed)/IQ;
+ seed=IA*(seed-k*IQ)-IR*k;
+ if (seed < 0) seed += IM;
+ if (j < NTAB) iv[j] = seed;
+ }
+ iy=iv[0];
+ }
+ k=(seed)/IQ;
+ seed=IA*(seed-k*IQ)-IR*k;
+ if (seed < 0) seed += IM;
+ j=iy/NDIV;
+ iy=iv[j];
+ iv[j] = seed;
+ temp=AM*iy;
+ pthread_mutex_unlock(&mutex_rng);
+ if (temp > RNMX) return RNMX;
+ else return temp;
+ }
+
+ string getRandomID(){
+ return boost::lexical_cast<string>(boost::uuids::random_generator()());
+ }
+
+ long getSeed() {
+ return seed;
+ }
+
+ int getRandom(int n) {
+ return (int)floor(getRandom() * n);
+ }
+
+ void bootstrap(int N, vector < int > & sample) {
+ sample = vector < int > (N, 0);
+ for (int e = 0 ; e < N ; e ++) sample[e] = getRandom(N);
+ }
+
+ void normalise(vector < double > & v) {
+ double sum = 0.0;
+ for (int i = 0 ; i < v.size() ; i++) sum += v[i];
+ if (sum != 0.0) for (int i = 0 ; i < v.size() ; i++) v[i] /= sum;
+ }
+
+ int sample(vector< double > & v, double sum) {
+ double csum = v[0];
+ double u = getRandom() * sum;
+ for (int i = 0; i < v.size() - 1; ++i) {
+ if ( u < csum ) return i;
+ csum += v[i+1];
+ }
+ return v.size() - 1;
+ }
+
+ void random_shuffle(vector < vector < float > > & X) {
+ int n = X[0].size();
+ vector < int > O;
+ for (int i = 0; i < n; i++) O.push_back(i);
+ random_shuffle(O.begin(), O.end());
+ vector < vector < float > > Xtmp = X;
+ for (int c = 0 ; c < X.size() ; c++) for (int e = 0 ; e < n ; e++) X[c][e] = Xtmp[c][O[e]];
+ }
+
+ double entropy(vector < double > & v) {
+ double e = 0.0;
+ for (int i = 0 ; i < v.size() ; i ++) {
+ if (v[i] > 0.0) e += v[i] * log(v[i]);
+ }
+ return e;
+ }
+
+ double KLdistance(vector < double > & P, vector < double > & Q) {
+ assert(P.size() == Q.size());
+ double d = 0.0;
+ for (int i = 0 ; i < Q.size() ; i ++) {
+ if (Q[i] > 0.0 && P[i] > 0.0) d += P[i] * (log(P[i]) - log(Q[i]));
+ }
+ return d;
+ }
+
+ double qnorm(double p, double mu, double sigma, int lower_tail, int log_p) {
+ double q, r, val;
+ q = p - 0.5;
+ if (fabs(q) <= .425) {
+ r = .180625 - q * q;
+ val = q * (((((((r * 2509.0809287301226727 + 33430.575583588128105) * r + 67265.770927008700853) * r + 45921.953931549871457) * r + 13731.693765509461125) * r + 1971.5909503065514427) * r + 133.14166789178437745) * r + 3.387132872796366608) / (((((((r * 5226.495278852854561 + 28729.085735721942674) * r + 39307.89580009271061) * r + 21213.794301586595867) * r + 5394.1960214247511077) * r + 687.1870074920579083) * r + 42.313330701600911252) * r + 1.);
+ } else { /* closer than 0.075 from {0,1} boundary */
+ if (q > 0) r = 1 - p;
+ else r = p;
+
+ r = sqrt(- ((log_p && ((lower_tail && q <= 0) || (!lower_tail && q > 0))) ? p : log(r)));
+
+ if (r <= 5.) { /* <==> min(p,1-p) >= exp(-25) ~= 1.3888e-11 */
+ r += -1.6;
+ val = (((((((r * 7.7454501427834140764e-4 + .0227238449892691845833) * r + .24178072517745061177) * r + 1.27045825245236838258) * r + 3.64784832476320460504) * r + 5.7694972214606914055) * r + 4.6303378461565452959) * r + 1.42343711074968357734) / (((((((r * 1.05075007164441684324e-9 + 5.475938084995344946e-4) * r + .0151986665636164571966) * r + .14810397642748007459) * r + .68976733498510000455) * r + 1.6763848301838038494) * r + 2.05319162663775882187) * r + 1.);
+ } else { /* very close to 0 or 1 */
+ r += -5.;
+ val = (((((((r * 2.01033439929228813265e-7 + 2.71155556874348757815e-5) * r + .0012426609473880784386) * r + .026532189526576123093) * r + .29656057182850489123) * r + 1.7848265399172913358) * r + 5.4637849111641143699) * r + 6.6579046435011037772) / (((((((r * 2.04426310338993978564e-15 + 1.4215117583164458887e-7)* r + 1.8463183175100546818e-5) * r + 7.868691311456132591e-4) * r + .0148753612908506148525) * r + .13692988092273580531) * r + .59983220655588793769) * r + 1.);
+ }
+ if (q < 0.0) val = -val;
+ }
+ return mu + sigma * val;
+ }
+};
+
+/******************************************************/
+/* UTILS ALGORITHM */
+/******************************************************/
+namespace autils {
+ int max(vector < double > & v) {
+ double max = -1e300;
+ int index_max = 0;
+ for (int i = 0 ; i < v.size() ; i ++)
+ if (v[i] > max) {
+ max = v[i];
+ index_max = i;
+ }
+ return index_max;
+ }
+
+ int max(vector < int > & v) {
+ int max = -1000000000;
+ int index_max = 0;
+ for (int i = 0 ; i < v.size() ; i ++)
+ if (v[i] > max) {
+ max = v[i];
+ index_max = i;
+ }
+ return index_max;
+ }
+
+ void reorder(vector < float > & v, vector < unsigned int > const & order ) {
+ vector < float > vtmp = v;
+ for (int e = 0 ; e < order.size() ; e ++) v[e] = vtmp[order[e]];
+ }
+
+ void findUniqueSet(vector < bool > & B, vector < int > & U) {
+ U.clear();
+ for (int b = 0 ; b < B.size() ; b ++)
+ if (B[b]) U.push_back(b);
+ //sort( U.begin(), U.end() );
+ //U.erase(unique( U.begin(), U.end() ), U.end());
+ }
+
+ void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB) {
+ if (B.size() < 2 * min || B.size() == 2) {
+ BB.push_back(B);
+ return;
+ }
+
+ int l = putils::getRandom(B.size() - 2 * min) + min;
+ vector < vector < int > > LB = vector < vector < int > > (B.begin(), B.begin() + l + 1);
+ vector < vector < int > > RB = vector < vector < int > > (B.begin() + l - 1, B.end());
+ vector < vector < vector < int > > > LBB;
+ vector < vector < vector < int > > > RBB;
+ decompose(min, LB, LBB);
+ decompose(min, RB, RBB);
+ BB = LBB;
+ BB.insert(BB.end(), RBB.begin(), RBB.end());
+ }
+
+ int checkDuo (int pa1, int pa2, int ca1, int ca2) {
+ if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 0; }
+ if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; }
+ if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; }
+ if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return -1; }
+ if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return 1; }
+ if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; }
+ if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; }
+ if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 1; }
+ if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 1; }
+ if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; }
+ if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; }
+ if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return 1; }
+ if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return -1; }
+ if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; }
+ if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; }
+ if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 0; }
+ return 0;
+ }
+
+ int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2) {
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 0; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return -1; }
+ if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 0; }
+ return 0;
+ }
+};
+
+/******************************************************/
+/* UTILS STRING */
+/******************************************************/
+namespace sutils {
+ int tokenize (string & str, vector < string > & tokens) {
+ tokens.clear();
+ string::size_type p_last = str.find_first_not_of(" ", 0);
+ string::size_type p_curr = str.find_first_of(" ", p_last);
+ while (string::npos != p_curr || string::npos != p_last) {
+ tokens.push_back(str.substr(p_last, p_curr - p_last));
+ p_last = str.find_first_not_of(" ", p_curr);
+ p_curr = str.find_first_of(" ", p_last);
+ }
+ if (tokens.back()[tokens.back().size()-1] == '\r')
+ tokens.back() = tokens.back().substr(0, tokens.back().size()-1);
+ return tokens.size();
+ }
+
+ int tokenize(string & str, vector < string > & tokens, int n_max_tokens) {
+ tokens.clear();
+ string::size_type p_last = str.find_first_not_of(" ", 0);
+ string::size_type p_curr = str.find_first_of(" ", p_last);
+ while ((string::npos != p_curr || string::npos != p_last) && tokens.size() < n_max_tokens) {
+ tokens.push_back(str.substr(p_last, p_curr - p_last));
+ p_last = str.find_first_not_of(" ", p_curr);
+ p_curr = str.find_first_of(" ", p_last);
+ }
+ return tokens.size();
+ }
+
+ int tokenize (string & str, vector < string > & tokens, string sep) {
+ tokens.clear();
+ string::size_type p_last = str.find_first_not_of(sep, 0);
+ string::size_type p_curr = str.find_first_of(sep, p_last);
+ while (string::npos != p_curr || string::npos != p_last) {
+ tokens.push_back(str.substr(p_last, p_curr - p_last));
+ p_last = str.find_first_not_of(sep, p_curr);
+ p_curr = str.find_first_of(sep, p_last);
+ }
+ if (tokens.back()[tokens.back().size()-1] == '\r')
+ tokens.back() = tokens.back().substr(0, tokens.back().size()-1);
+ return tokens.size();
+ }
+
+ bool isNumeric(string & str) {
+ float n;
+ std::istringstream in(str);
+ if (!(in >> n)) return false;
+ return true;
+ }
+
+ string int2str(int n) {
+ ostringstream s2( stringstream::out );
+ s2 << n;
+ return s2.str();
+ }
+
+ string int2str(vector < int > & v) {
+ ostringstream s2( stringstream::out );
+ for (int l = 0 ; l < v.size() ; l++) {
+ s2 << " " << v[l] ;
+ }
+ return s2.str();
+ }
+
+ string long2str(long int n) {
+ ostringstream s2( stringstream::out );
+ s2 << n;
+ return s2.str();
+ }
+
+ string double2str(double n) {
+ ostringstream s2;
+ s2 << n;
+ return s2.str();
+ }
+
+ string double2str(double n, int prc) {
+ ostringstream s2;
+ s2 << setiosflags( ios::fixed );
+ if ( prc > 0 ) s2.precision(prc);
+ s2 << n;
+ return s2.str();
+ }
+
+ string double2str(vector < double > &v) {
+ ostringstream s2;
+ for (int l = 0 ; l < v.size() ; l++) s2 << v[l] << ((l!=v.size()-1)?" ":"");
+ return s2.str();
+ }
+
+ string double2str(vector < double > &v, int prc) {
+ ostringstream s2;
+ s2 << setiosflags( ios::fixed );
+ if ( prc >= 0 ) s2.precision(prc);
+ for (int l = 0 ; l < v.size() ; l++) s2 << v[l] << ((l!=v.size()-1)?" ":"");
+ return s2.str();
+ }
+
+ string double2str(vector < float > &v) {
+ ostringstream s2;
+ for (int l = 0 ; l < v.size() ; l++) s2 << v[l] << ((l!=v.size()-1)?" ":"");
+ return s2.str();
+ }
+
+ string double2str(vector < float > &v, int prc) {
+ ostringstream s2;
+ s2 << setiosflags( ios::fixed );
+ if ( prc >= 0 ) s2.precision(prc);
+ for (int l = 0 ; l < v.size() ; l++) s2 << v[l] << ((l!=v.size()-1)?" ":"");
+ return s2.str();
+ }
+
+ string bool2str(vector<bool> & v) {
+ ostringstream s2( stringstream::out );
+ for (int l = 0 ; l < v.size() ; l++) {
+ if (v[l]) s2 << "1";
+ else s2 << "0";
+ }
+ return s2.str();
+ }
+
+ string date2str(time_t * t, string format) {
+ struct tm * timeinfo = localtime(t);
+ char buffer[128];
+ strftime(buffer, 128, format.c_str(), timeinfo);
+ ostringstream s2( stringstream::out );
+ s2 << buffer;
+ return s2.str();
+ }
+
+ string remove_spaces(const string &s) {
+ int last = s.size() - 1;
+ while (last >= 0 && s[last] == ' ') --last;
+ return s.substr(0, last + 1);
+ }
+};
+
+/******************************************************/
+/* UTILS FILE */
+/******************************************************/
+namespace futils {
+ bool isFile(string f) {
+ ifile inp;
+ inp.open(f.c_str(), ifstream::in);
+ if(inp.fail()) {
+ inp.clear(ios::failbit);
+ inp.close();
+ return false;
+ }
+ inp.close();
+ return true;
+ }
+
+ bool createFile(string f) {
+ ofile out;
+ //out.open(f.c_str(), ofstream::out);
+ out.open(f.c_str());
+ if (out.fail()) {
+ out.clear(ios::failbit);
+ out.close();
+ return false;
+ } else out << "";
+ out.close();
+ return true;
+ }
+
+ void checkFile(string f, bool readmode) {
+ if (readmode && !isFile(f)) LOG.error(f + " is impossible to open, check file existence or reading permissions");
+ if (!readmode && !createFile(f)) LOG.error(f + " is impossible to open, check writing permissions");
+ }
+
+ string extensionFile(string & filename) {
+ if (filename.find_last_of(".") != string::npos)
+ return filename.substr(filename.find_last_of(".") + 1);
+ return "";
+ }
+
+
+ void bool2binary(vector < bool > & V, ostream &fd) {
+ int nb = V.size();
+ fd.write((char*)&nb, sizeof(int));
+ int cpt_byte = 0;
+ int cpt_bin = 0;
+ int nb_byte = (int)ceil( (V.size() * 1.0) / 8);
+ while (cpt_byte < nb_byte) {
+ bitset<8> byte_bitset;
+ for (int l = 0; l < 8 && cpt_bin < V.size() ; l++) {
+ byte_bitset[l] = V[cpt_bin];
+ cpt_bin ++;
+ }
+ char byte_char = (char)byte_bitset.to_ulong();
+ fd.write(&byte_char, 1);
+ cpt_byte++;
+ }
+ }
+
+ bool binary2bool(vector < bool > & V, istream & fd) {
+ int nb;
+ fd.read((char*)&nb, sizeof(int));
+ if (!fd) return false;
+ int cpt_byte = 0;
+ int cpt_bin = 0;
+ int nb_byte = (int)ceil( (nb * 1.0) / 8);
+ V = vector < bool >(nb);
+ while (cpt_byte < nb_byte) {
+ char byte_char;
+ fd.read(&byte_char, 1);
+ if (!fd) return false;
+ bitset<8> byte_bitset = byte_char;
+ for (int l = 0; l < 8 && cpt_bin < nb ; l++) {
+ V[cpt_bin] = byte_bitset[l];
+ cpt_bin++;
+ }
+ cpt_byte++;
+ }
+ return true;
+ }
+};
+
+/******************************************************/
+/* INPUT FILE */
+/******************************************************/
+ifile::ifile() {
+}
+
+ifile::ifile(string filename , bool binary) {
+ open(filename, binary);
+}
+
+ifile::~ifile() {
+ close();
+}
+
+string ifile::name() {
+ return file;
+}
+
+bool ifile::open(string filename, bool binary) {
+ file = filename;
+ string ext = futils::extensionFile(filename);
+ if (ext == "gz") {
+ fd.open(file.c_str(), ios::in | ios::binary);
+ push(bio::gzip_decompressor());
+ } else if (ext == "bz2") {
+ fd.open(file.c_str(), ios::in | ios::binary);
+ push(bio::bzip2_decompressor());
+ } else if (binary) {
+ fd.open(file.c_str(), ios::in | ios::binary);
+ } else {
+ fd.open(file.c_str());
+ }
+ if (fd.fail()) return false;
+ push(fd);
+ return true;
+}
+
+bool ifile::readString(string & str) {
+ int s;
+ if (!read((char*)&s, sizeof(int))) return false;
+ char * buffer = new char [s + 1];
+ if (!read(buffer, s)) return false;
+ buffer[s] = '\0';
+ str = string(buffer);
+ delete [] buffer;
+ return true;
+}
+
+void ifile::close() {
+ if (!empty()) reset();
+ fd.close();
+}
+
+/******************************************************/
+/* OUTPUT FILE */
+/******************************************************/
+ofile::ofile() {
+}
+
+ofile::ofile(string filename , bool binary) {
+ open(filename, binary);
+}
+
+ofile::~ofile() {
+ close();
+}
+
+string ofile::name() {
+ return file;
+}
+
+bool ofile::open(string filename, bool binary) {
+ file = filename;
+ string ext = futils::extensionFile(filename);
+ if (ext == "gz") {
+ fd.open(file.c_str(), ios::out | ios::binary);
+ push(bio::gzip_compressor());
+ } else if (ext == "bz2") {
+ fd.open(file.c_str(), ios::out | ios::binary);
+ push(bio::bzip2_compressor());
+ } else if (binary) {
+ fd.open(file.c_str(), ios::out | ios::binary);
+ } else {
+ fd.open(file.c_str());
+ }
+ if (fd.fail()) return false;
+ push(fd);
+ return true;
+}
+
+void ofile::writeString(string & str) {
+ int s = str.size();
+ write((char*)&s, sizeof(int));
+ write(str.c_str(), s * sizeof(char));
+}
+
+void ofile::close() {
+ if (!empty()) reset();
+ fd.close();
+}
+
+/******************************************************/
+/* LOG FILE */
+/******************************************************/
+lfile::lfile() {
+ verboseC = true;
+ verboseL = true;
+}
+
+lfile::~lfile() {
+ close();
+}
+
+string lfile::name() {
+ return file;
+}
+
+bool lfile::open(string filename) {
+ file = filename;
+ if (futils::extensionFile(file) != "log") file += ".log";
+ fd.open(file.c_str());
+ if (fd.fail()) return false;
+ return true;
+}
+
+void lfile::close() {
+ fd.close();
+}
+
+string lfile::getPrefix() {
+ return file.substr(0, file.find_last_of("."));
+}
+
+void lfile::muteL() {
+ verboseL = false;
+}
+
+void lfile::unmuteL() {
+ verboseL = true;
+}
+
+void lfile::muteC() {
+ verboseC = false;
+}
+
+void lfile::unmuteC() {
+ verboseC = true;
+}
+
+
+void lfile::print(string s) {
+ if (verboseL) { fd << s; fd.flush(); }
+ if (verboseC) { cout << s; cout.flush(); }
+}
+
+void lfile::printC(string s) {
+ if (verboseC) { cout << s; cout.flush(); }
+}
+
+void lfile::printL(string s) {
+ if (verboseL) { fd << s; fd.flush(); }
+}
+
+void lfile::println(string s) {
+ if (verboseL) { fd << s << endl; }
+ if (verboseC) { cout << s << endl; }
+}
+
+void lfile::printlnC(string s) {
+ if (verboseC) { cout << s << endl; }
+}
+
+void lfile::printlnL(string s) {
+ if (verboseL) { fd << s << endl; }
+}
+
+void lfile::warning(string s) {
+ cout << endl << "\33[33mWARNING:\33[0m " << s << endl;
+ if (verboseL) fd << endl << "WARNING: " << s << endl;
+}
+
+void lfile::error(string s) {
+ cout << endl << "\33[33mERROR:\33[0m " << s << endl;
+ if (verboseL) fd << endl << "ERROR: " << s << endl;
+ close();
+ exit(1);
+}
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..fa9d71e
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,312 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef _UTILS_H
+#define _UTILS_H
+
+#define IA 16807
+#define IM 2147483647
+#define AM (1.0/IM)
+#define IQ 127773
+#define IR 2836
+#define NTAB 32
+#define NDIV (1+(IM-1)/NTAB)
+#define EPS 1.2e-7
+#define RNMX (1.0-EPS)
+#define PI 3.14159265358979323846
+
+#define LOW_POS_DOUBLE 1e-300
+#define BIG_POS_DOUBLE 1e300
+#define LOW_NEG_DOUBLE -1e-300
+#define BIG_NEG_DOUBLE -1e300
+#define LOW_POS_FLOAT 1e-8
+#define BIG_POS_FLOAT 1e8
+#define LOW_NEG_FLOAT -1e-8
+#define BIG_NEG_FLOAT -1e8
+#define BIG_POS_INT 1000000000
+#define BIG_NEG_INT -1000000000
+
+#define ___NA___ (0.0/0.0)
+
+#include <string>
+#include <complex>
+#include <vector>
+#include <queue>
+#include <map>
+#include <numeric>
+#include <bitset>
+#include <list>
+#include <bitset>
+#include <cmath>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <iomanip>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <exception>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filter/bzip2.hpp>
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+#include <boost/lexical_cast.hpp>
+
+
+using namespace std;
+namespace bio = boost::iostreams;
+namespace bpo = boost::program_options;
+namespace bid = boost::uuids;
+
+
+/******************************************************/
+/* UTILS RUNNING STATS */
+/******************************************************/
+
+class RunningStat {
+public:
+ RunningStat() : m_n(0) {}
+
+ RunningStat(vector < double > & X): m_n(0) {
+ for (unsigned int e = 0 ; e < X.size() ; e ++) Push(X[e]);
+ }
+
+ RunningStat(vector < float > & X): m_n(0) {
+ for (unsigned int e = 0 ; e < X.size() ; e ++) Push((double)X[e]);
+ }
+
+ void Clear() { m_n = 0; }
+
+ void Push(double x) {
+ m_n++;
+
+ if (m_n == 1) {
+ m_oldM = m_newM = x;
+ m_oldS = 0.0;
+ } else {
+ m_newM = m_oldM + (x - m_oldM)/m_n;
+ m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
+ m_oldM = m_newM;
+ m_oldS = m_newS;
+ }
+ }
+
+ int NumDataValues() const {
+ return m_n;
+ }
+
+ double Mean() const {
+ return (m_n > 0) ? m_newM : 0.0;
+ }
+
+ double Variance() const {
+ return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
+ }
+
+ double StandardDeviation() const {
+ return sqrt( Variance() );
+ }
+
+ void MeanStandardDeviation (double & mean, double & sd) {
+ mean = Mean();
+ sd = StandardDeviation();
+ }
+
+ void MeanVariance (double & mean, double & variance) {
+ mean = Mean();
+ variance = Variance();
+ }
+
+private:
+ int m_n;
+ double m_oldM, m_newM, m_oldS, m_newS;
+};
+
+
+/******************************************************/
+/* UTILS STATISTICS */
+/******************************************************/
+namespace putils {
+ double mean(vector < double > &);
+ double variance(vector < double > &, double );
+ double mean(vector < float > &);
+ double variance(vector < float > &, double );
+ bool isVariable(vector < float > &);
+ bool isVariable(vector < double > &);
+ void initRandom(long s);
+ double getRandom();
+ string getRandomID();
+ int getRandom(int);
+ void bootstrap(int, vector < int > &);
+ long getSeed();
+ void normalise(vector < double > & v);
+ void random_shuffle(vector < vector < float > > &);
+ int sample(vector< double > & v, double sum);
+ double entropy(vector < double > & v);
+ double KLdistance(vector < double > & P, vector < double > & Q);
+ double qnorm(double p, double mu, double sigma, int lower_tail, int log_p);
+};
+
+/******************************************************/
+/* UTILS ALGORITHM */
+/******************************************************/
+namespace autils {
+ int max(vector < double > & v);
+ int max(vector < int > & v);
+ void findUniqueSet(vector < bool > & B, vector < int > & U);
+ void reorder(vector < float > &, vector < unsigned int > const & ) ;
+ void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB);
+ int checkDuo (int pa1, int pa2, int ca1, int ca2);
+ int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2);
+};
+
+/******************************************************/
+/* UTILS STRING */
+/******************************************************/
+namespace sutils {
+ int tokenize(string &, vector < string > &);
+ int tokenize(string &, vector < string > &, int);
+ int tokenize(string &, vector < string > &, string);
+ string int2str(int n);
+ string int2str(vector < int > & v);
+ string long2str(long int n);
+ string double2str(double n);
+ string double2str(double n, int prc);
+ string double2str(vector < double > &v);
+ string double2str(vector < double > &v, int prc);
+ string double2str(vector < float > &v);
+ string double2str(vector < float > &v, int prc);
+ string bool2str(vector<bool> & v);
+ string date2str(time_t * t, string format);
+ bool isNumeric(string &);
+ string remove_spaces(const string &);
+};
+
+/******************************************************/
+/* UTILS FILE */
+/******************************************************/
+namespace futils {
+ bool isFile(string f);
+ bool createFile(string f);
+ void checkFile(string f, bool);
+ string extensionFile(string & filename);
+ void bool2binary(vector < bool > & V, ostream &fd);
+ bool binary2bool(vector < bool > & V, istream & fd);
+};
+
+
+/******************************************************/
+/* EXCEPTIONS */
+/******************************************************/
+class myException : public exception {
+public:
+ explicit myException(std::string msg) : msg_(msg) {}
+
+ virtual ~myException() throw() {}
+
+ virtual const char* what() const throw() {
+ return msg_.c_str();
+ }
+
+private:
+ std::string msg_;
+};
+
+/******************************************************/
+/* INPUT FILE */
+/******************************************************/
+class ifile : public bio::filtering_istream {
+private:
+ string file;
+ ifstream fd;
+
+public:
+ ifile();
+ ifile(string filename , bool binary = false);
+ ~ifile();
+ string name();
+ bool open(string filename, bool binary = false);
+ bool readString(string &);
+ void close();
+};
+
+/******************************************************/
+/* OUTPUT FILE */
+/******************************************************/
+class ofile : public bio::filtering_ostream {
+private:
+ string file;
+ ofstream fd;
+
+public:
+ ofile();
+ ofile(string filename , bool binary = false);
+ ~ofile();
+ string name();
+ bool open(string filename, bool binary = false);
+ void writeString(string &);
+ void close();
+};
+
+/******************************************************/
+/* LOG FILE */
+/******************************************************/
+class lfile {
+private:
+ string file;
+ ofstream fd;
+ bool verboseC;
+ bool verboseL;
+
+public:
+ lfile();
+ ~lfile();
+ string name();
+ bool open(string filename = "file.log");
+ void close();
+ string getPrefix();
+ void muteL();
+ void unmuteL();
+ void muteC();
+ void unmuteC();
+ void print(string s);
+ void printC(string s);
+ void printL(string s);
+ void println(string s);
+ void printlnC(string s);
+ void printlnL(string s);
+ void warning(string s);
+ void error(string s);
+};
+
+#ifdef _GLOBAL
+#define _EXTERN
+#else
+#define _EXTERN extern
+#endif
+
+_EXTERN lfile LOG;
+_EXTERN time_t START_DATE;
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastqtl.git
More information about the debian-med-commit
mailing list