[med-svn] [qtltools] 01/06: New upstream version 1.1+dfsg
Andreas Tille
tille at debian.org
Mon Jan 16 15:47:14 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository qtltools.
commit f241cdb35d16d35b6e763d93378ffba0b6ceca4b
Author: Andreas Tille <tille at debian.org>
Date: Mon Jan 16 10:04:57 2017 +0100
New upstream version 1.1+dfsg
---
Makefile | 81 +++++----------------------
README | 2 +-
lib/OTools/otools.h | 6 +-
src/QTLtools.cpp | 8 ++-
src/common/data.h | 2 +-
src/mode_cis/cis_chunking.cpp | 4 ++
src/mode_cis/cis_data.h | 2 +
src/mode_cis/cis_initilization.cpp | 1 +
src/mode_cis/cis_main.cpp | 21 ++++++-
src/mode_cis/cis_permutation_pass.cpp | 4 +-
src/mode_cis/cis_read_phenotypes.cpp | 9 ++-
src/mode_correct/correct_data.h | 8 ++-
src/mode_correct/correct_main.cpp | 9 ++-
src/mode_correct/correct_management.cpp | 8 ---
src/mode_correct/correct_processing.cpp | 34 +++++++++++-
src/mode_correct/correct_read_covariates.cpp | 81 +++++++++++++++++++++++++++
src/mode_fdensity/fdensity_process.cpp | 36 ++++++++++--
src/mode_fenrich/fenrich_management.cpp | 11 +++-
src/mode_match/match_process.cpp | 7 ++-
src/mode_quan/quan_chunking.cpp | 19 ++++++-
src/mode_quan/quan_data.h | 8 +--
src/mode_quan/quan_main.cpp | 22 +++++++-
src/mode_quan/quan_printResults.cpp | 2 +
src/mode_quan/quan_readBAM.cpp | 83 +++++++++++++++++-----------
src/mode_rtc/rtc_common.cpp | 2 +-
src/mode_trans/trans_chunking.cpp | 18 +++++-
26 files changed, 339 insertions(+), 149 deletions(-)
diff --git a/Makefile b/Makefile
index 28851fe..33b2436 100644
--- a/Makefile
+++ b/Makefile
@@ -9,17 +9,13 @@ HTSLD_INC=
HTSLD_LIB=
#COMPILER MODE C++11
-CXX=g++ -std=c++11
+CXX=g++ -std=c++0x
#COMPILER FLAGS
CXXFLAG_REL=-O2
CXXFLAG_DBG=-g
CXXFLAG_WRN=-Wall -Wextra -Wno-sign-compare -Wno-unused-local-typedefs -Wno-deprecated -Wno-unused-parameter
-#LINKER FLAGS
-LDFLAG_REL=-O2
-LDFLAG_DBG=-g
-
#BASE LIBRARIES
LIB_FLAGS=-Wl,-Bstatic -lz -lgsl -lblas -lbz2 -Wl,-Bdynamic -lm -lpthread
@@ -32,66 +28,25 @@ OFILE=$(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file
VPATH=$(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)
#DEFAULT VERSION (I.E. UNIGE DESKTOP RELEASE VERSION)
-all: default
-
-#DEFAULT RELEASE VERSION
-default: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
-default: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
-default: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-default: LDFLAG=$(LDFLAG_REL)
-default: $(BFILE)
+all: desktop
#UNIGE DESKTOP RELEASE VERSION
desktop: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
desktop: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
-desktop: HTSLD_INC=$(HOME)/Tools/htslib-1.3
-desktop: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
+desktop: HTSLD_INC=$(HOME)/Tools/htslib-1.3.1
+desktop: HTSLD_LIB=$(HOME)/Tools/htslib-1.3.1
desktop: BOOST_INC=/usr/include
-desktop: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+desktop: BOOST_LIB=/usr/lib
desktop: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
desktop: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
desktop: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-desktop: LDFLAG=$(LDFLAG_REL)
+desktop: LDFLAG=$(CXXFLAG_REL)
desktop: $(BFILE)
#UNIGE DESKTOP DEBUG VERSION
-desktop-dbg: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
-desktop-dbg: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
-desktop-dbg: HTSLD_INC=$(HOME)/Tools/htslib-1.3
-desktop-dbg: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
-desktop-dbg: BOOST_INC=/usr/include
-desktop-dbg: BOOST_LIB=/usr/lib/x86_64-linux-gnu
+desktop-dbg: desktop
desktop-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
-desktop-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
-desktop-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-desktop-dbg: LDFLAG=$(LDFLAG_DBG)
-desktop-dbg: $(BFILE)
-
-#DELL LAPTOP RELEASE VERSION
-laptop: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
-laptop: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
-laptop: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
-laptop: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
-laptop: BOOST_INC=/usr/include
-laptop: BOOST_LIB=/usr/lib/x86_64-linux-gnu
-laptop: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
-laptop: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
-laptop: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-laptop: LDFLAG=$(LDFLAG_REL)
-laptop: $(BFILE)
-
-#DELL LAPTOP DEBUG VERSION
-laptop-dbg: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
-laptop-dbg: RMATH_LIB=$(HOME)/Libraries/R-3.2.2/src/nmath/standalone
-laptop-dbg: HTSLD_INC=$(HOME)/Libraries/htslib-1.2.1
-laptop-dbg: HTSLD_LIB=$(HOME)/Libraries/htslib-1.2.1
-laptop-dbg: BOOST_INC=/usr/include
-laptop-dbg: BOOST_LIB=/usr/lib/x86_64-linux-gnu
-laptop-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
-laptop-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
-laptop-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-laptop-dbg: LDFLAG=$(LDFLAG_DBG)
-laptop-dbg: $(BFILE)
+desktop-dbg: LDFLAG=$(CXXFLAG_DBG)
#VITAL-IT RELEASE VERSION
cluster: RMATH_INC=/software/R/3.1.1/include
@@ -103,21 +58,13 @@ cluster: BOOST_LIB=/software/lib64
cluster: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
cluster: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
cluster: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-cluster: LDFLAG=$(LDFLAG_REL)
+cluster: LDFLAG=$(CXXFLAG_REL)
cluster: $(BFILE)
#VITAL-IT DEBUG VERSION
-cluster-dbg: RMATH_INC=/software/R/3.1.1/include
-cluster-dbg: RMATH_LIB=/software/R/3.1.1/lib64
-cluster-dbg: HTSLD_INC=/software/UHTS/Analysis/samtools/1.2/include
-cluster-dbg: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.2/lib64
-cluster-dbg: BOOST_INC=/software/include
-cluster-dbg: BOOST_LIB=/software/lib64
+cluster-dbg: cluster
cluster-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
-cluster-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
-cluster-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
-cluster-dbg: LDFLAG=$(LDFLAG_DBG)
-cluster-dbg: $(BFILE)
+cluster-dbg: LDFLAG=$(CXXFLAG_DBG)
#MAC RELEASE VERSION
mac: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
@@ -129,7 +76,7 @@ mac: BOOST_LIB=/opt/local/lib
mac: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
mac: IFLAG=-Ilib/OTools -Ilib/ -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
mac: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams-mt.a $(BOOST_LIB)/libboost_program_options-mt.a
-mac: LDFLAG=$(LDFLAG_REL) -L /opt/local/lib
+mac: LDFLAG=$(CXXFLAG_REL) -L /opt/local/lib
mac: $(BFILE)
#MAC DEBUG VERSION
@@ -142,7 +89,7 @@ mac-dbg: BOOST_LIB=/opt/local/lib
mac-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
mac-dbg: IFLAG=-Ilib/OTools -Ilib/ -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
mac-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams-mt.a $(BOOST_LIB)/libboost_program_options-mt.a
-mac-dbg: LDFLAG=$(LDFLAG_DBG) -L /opt/local/lib
+mac-dbg: LDFLAG=$(CXXFLAG_DBG) -L /opt/local/lib
mac-dbg: $(BFILE)
#COMPILATION RULES
@@ -242,4 +189,4 @@ clean-bamstat:
clean-fdensity:
rm -f obj/fdensity_*.o $(BFILE)
-
\ No newline at end of file
+
diff --git a/README b/README
index 2c64161..c61c13f 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
QTLtools: a tool set for molecular QTL discovery and analysis
-ver: 1.0
+ver: 1.1
git: https://github.com/qtltools/qtltools
web:https://qtltools.github.io/qtltools/
diff --git a/lib/OTools/otools.h b/lib/OTools/otools.h
index e375fb1..d544f66 100644
--- a/lib/OTools/otools.h
+++ b/lib/OTools/otools.h
@@ -67,9 +67,9 @@ extern "C" {
//MAKE SOME TOOL FULLY ACCESSIBLE THROUGHOUT THE SOFTWARE
#ifdef _DECLARE_TOOLBOX_HERE
- random_number_generator rng;
- string_utils stb;
- verbose vrb;
+ random_number_generator rng; //Random number generator
+ string_utils stb; //Utils for string manipulation
+ verbose vrb; //Verbose
#else
extern random_number_generator rng;
extern string_utils stb;
diff --git a/src/QTLtools.cpp b/src/QTLtools.cpp
index d7f04b6..3598b9a 100644
--- a/src/QTLtools.cpp
+++ b/src/QTLtools.cpp
@@ -34,7 +34,7 @@ void printModes(){
vrb.print(" eg: QTLtools cis --help");
vrb.ctitle("Available modes:");
vrb.print(" bamstat Calculate basic QC metrics for BAM/SAM");
- vrb.print(" match Match VCF genotypes to BAM/SAM file");
+ vrb.print(" mbv Match BAM to VCF ");
vrb.print(" pca Calculate principal components for a BED/VCF/BCF file");
vrb.print(" correct Covariate correction of a BED file");
vrb.print(" cis cis QTL analysis");
@@ -55,9 +55,11 @@ int main(int argc, char ** argv) {
timer running_timer;
//2. Open LOG file if necessary
+ bool match_mode = false;
for (int a = 1 ; a < argc - 1 ; a ++) {
if ((strcmp(argv[a], "--log") == 0) && !vrb.open_log(string(argv[a+1]))) vrb.error("Impossible to open log file!");
if (strcmp(argv[a], "--silent") == 0) vrb.set_silent();
+ if (strcmp(argv[a], "mbv") == 0) match_mode = true;
}
//3. Print header on screen
@@ -67,6 +69,8 @@ int main(int argc, char ** argv) {
vrb.bullet("Webpage : https://qtltools.github.io/qtltools/");
vrb.bullet("Version : " + string(QTLTOOLS_VERSION));
vrb.bullet("Date : " + running_timer.date());
+ if (!match_mode) vrb.bullet("Citation: A complete tool set for molecular QTL discovery and analysis, https://doi.org/10.1101/068635");
+ else vrb.bullet("Citation: MBV; a method to solve sample mislabeling and detect technical bias in large combined genotype and sequencing assay data sets");
//4. Switch mode
vector < string > args;
@@ -83,7 +87,7 @@ int main(int argc, char ** argv) {
else if (strcmp(argv[1], "trans") == 0) trans_main(args);
//5.3. MATCH mode
- else if (strcmp(argv[1], "match") == 0) match_main(args);
+ else if (strcmp(argv[1], "mbv") == 0) match_main(args);
//5.4. FENRICH mode
else if (strcmp(argv[1], "fenrich") == 0) fenrich_main(args);
diff --git a/src/common/data.h b/src/common/data.h
index 79d5e48..1a0ebba 100644
--- a/src/common/data.h
+++ b/src/common/data.h
@@ -19,7 +19,7 @@
#include "otools.h"
#include "filter.h"
-#define QTLTOOLS_VERSION "1.0"
+#define QTLTOOLS_VERSION "1.1"
class data {
public:
diff --git a/src/mode_cis/cis_chunking.cpp b/src/mode_cis/cis_chunking.cpp
index 6c88ec0..878f880 100644
--- a/src/mode_cis/cis_chunking.cpp
+++ b/src/mode_cis/cis_chunking.cpp
@@ -59,6 +59,10 @@ bool cis_data::setPhenotypeRegion(string reg) {
return regionPhenotype.parse(reg);
}
+bool cis_data::setGenotypeRegion(string reg) {
+ return regionGenotype.parse(reg);
+}
+
class pgroup {
public:
int start, end;
diff --git a/src/mode_cis/cis_data.h b/src/mode_cis/cis_data.h
index 001ac3c..cf47bad 100644
--- a/src/mode_cis/cis_data.h
+++ b/src/mode_cis/cis_data.h
@@ -42,6 +42,7 @@ public:
//REGIONS
genomic_region regionPhenotype;
genomic_region regionGenotype;
+ bool full_test;
//GENOTYPES
int genotype_count; //variant site number
@@ -79,6 +80,7 @@ public:
//DATA REGION
bool setPhenotypeRegion(string);
+ bool setGenotypeRegion(string);
void setPhenotypeRegion(int, int);
//READ DATA
diff --git a/src/mode_cis/cis_initilization.cpp b/src/mode_cis/cis_initilization.cpp
index ce9377d..a2bfa0a 100644
--- a/src/mode_cis/cis_initilization.cpp
+++ b/src/mode_cis/cis_initilization.cpp
@@ -24,6 +24,7 @@ cis_data::cis_data() {
genotype_count = 0;
phenotype_count = 0;
covariate_count = 0;
+ full_test = false;
}
void cis_data::clear() {
diff --git a/src/mode_cis/cis_main.cpp b/src/mode_cis/cis_main.cpp
index 42dc335..b9c28f9 100644
--- a/src/mode_cis/cis_main.cpp
+++ b/src/mode_cis/cis_main.cpp
@@ -51,7 +51,8 @@ void cis_main(vector < string > & argv) {
boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
opt_parallel.add_options()
("chunk", boost::program_options::value< vector < int > >()->multitoken(), "Specify which chunk needs to be processed")
- ("region", boost::program_options::value< string >(), "Region of interest.");
+ ("region", boost::program_options::value< string >(), "Region of interest.")
+ ("region-pair", boost::program_options::value< vector < string > >()->multitoken(), "Pairs of interest.");
D.option_descriptions.add(opt_files).add(opt_parameters).add(opt_modes).add(opt_aggr).add(opt_parallel);
@@ -81,8 +82,8 @@ void cis_main(vector < string > & argv) {
if (!D.options.count("vcf")) vrb.error("Genotype data needs to be specified with --vcf [file.vcf]");
if (!D.options.count("bed")) vrb.error("Phenotype data needs to be specified with --bed [file.bed]");
if (!D.options.count("out")) vrb.error("Output needs to be specified with --out [file.out]");
- int nParallel = D.options.count("chunk") + D.options.count("region");
- if (nParallel > 1) vrb.error("Please, specify one of these options [--region, --chunk]");
+ int nParallel = D.options.count("chunk") + D.options.count("region") + D.options.count("region-pair");
+ if (nParallel > 1) vrb.error("Please, specify one of these options [--region, --chunk, --region-pair]");
int nMode = D.options.count("permute") + D.options.count("mapping") + D.options.count("nominal");
if (nMode != 1) vrb.error("Please, specify only one of these options [--permute, --nominal, --mapping]");
string outFile = D.options["out"].as < string > ();
@@ -124,6 +125,12 @@ void cis_main(vector < string > & argv) {
if (nChunk.size() != 2 || nChunk[0] > nChunk[1] || nChunk[0] < 0) vrb.error("Incorrect --chunk arguments!");
vrb.bullet("Chunk = [" + stb.str(nChunk[0]) + "/" + stb.str(nChunk[1]) + "]");
} else if(D.options.count("region")) vrb.bullet("Region = [" + D.options["region"].as < string > () +"]");
+ else if(D.options.count("region-pair")) {
+ vector < string > regions = D.options["region-pair"].as < vector < string > > ();
+ vrb.bullet("Region for genotypes = [" + regions[0] +"]");
+ vrb.bullet("Region for phenotypes = [" + regions[1] +"]");
+ D.full_test = true;
+ }
//---------------------------
// 7. SET AGGREGATION METHODS
@@ -160,6 +167,14 @@ void cis_main(vector < string > & argv) {
} else if (D.options.count("region")){
if (!D.setPhenotypeRegion(D.options["region"].as < string > ())) vrb.error("Impossible to interpret region [" + D.options["region"].as < string > () + "]");
D.regionGenotype = genomic_region(D.regionPhenotype, D.options["window"].as < unsigned int > ());
+ } else if (D.options.count("region-pair")) {
+ vector < string > regions = D.options["region-pair"].as < vector < string > > ();
+ if (!D.setPhenotypeRegion(regions[0])) vrb.error("Impossible to interpret region [" + regions[0] + "]");
+ if (!D.setGenotypeRegion(regions[1])) vrb.error("Impossible to interpret region [" + regions[1] + "]");
+ D.regionGenotype = genomic_region(D.regionGenotype, D.options["window"].as < unsigned int > ());
+ D.regionPhenotype = genomic_region(D.regionPhenotype, D.options["window"].as < unsigned int > ());
+ D.full_test = true;
+ D.grp_mode = GRP_BEST;
}
diff --git a/src/mode_cis/cis_permutation_pass.cpp b/src/mode_cis/cis_permutation_pass.cpp
index 52cd961..200d58f 100644
--- a/src/mode_cis/cis_permutation_pass.cpp
+++ b/src/mode_cis/cis_permutation_pass.cpp
@@ -50,11 +50,11 @@ void cis_data::runPermutationPass(string fout) {
vector < unsigned int > variant_indexes;
vector < int > variant_distances;
for (unsigned int v = 0 ; v < genotype_count ; v ++) {
- if (phenotype_chr[group_idx[i_group][0]] != genotype_chr[v]) continue;
+ if (!full_test && phenotype_chr[group_idx[i_group][0]] != genotype_chr[v]) continue;
int ps = (phenotype_start[group_idx[i_group][0]]>cis_window)?(phenotype_start[group_idx[i_group][0]]-cis_window):0;
int pe = phenotype_end[group_idx[i_group][0]] + cis_window;
- if (genotype_start[v] <= pe && ps <= genotype_end[v]) {
+ if (full_test || (genotype_start[v] <= pe && ps <= genotype_end[v])) {
int cisdistance = 0;
if (genotype_start[v] <= phenotype_end[group_idx[i_group][0]] && phenotype_start[group_idx[i_group][0]] <= genotype_end[v]) cisdistance = 0;
else if (genotype_end[v] < phenotype_start[group_idx[i_group][0]]) cisdistance = genotype_end[v] - phenotype_start[group_idx[i_group][0]];
diff --git a/src/mode_cis/cis_read_phenotypes.cpp b/src/mode_cis/cis_read_phenotypes.cpp
index c26a757..c2440a5 100644
--- a/src/mode_cis/cis_read_phenotypes.cpp
+++ b/src/mode_cis/cis_read_phenotypes.cpp
@@ -59,7 +59,8 @@ void cis_data::readPhenotypes(string fbed) {
phenotype_chr.push_back(tokens[0]);
phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
phenotype_end.push_back(atoi(tokens[2].c_str()));
- if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
+ if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
phenotype_neg.push_back(tokens[5] == "-");
if (phenotype_neg.back()) n_negativeStrd ++;
phenotype_val.push_back(vector < float > (sample_count, 0.0));
@@ -85,7 +86,8 @@ void cis_data::readPhenotypes(string fbed) {
phenotype_chr.push_back(tokens[0]);
phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
phenotype_end.push_back(atoi(tokens[2].c_str()));
- if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
+ if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
phenotype_neg.push_back(tokens[5] == "-");
if (phenotype_neg.back()) n_negativeStrd ++;
phenotype_val.push_back(vector < float > (sample_count, 0.0));
@@ -138,7 +140,8 @@ void cis_data::scanPhenotypes(string fbed) {
phenotype_chr.push_back(tokens[0]);
phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
phenotype_end.push_back(atoi(tokens[2].c_str()));
- if (grp_mode > 0) phenotype_grp.push_back(tokens[4]);
+ if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
+ if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
phenotype_neg.push_back(tokens[5] == "-");
if (phenotype_neg.back()) n_negativeStrd ++;
n_includedP++;
diff --git a/src/mode_correct/correct_data.h b/src/mode_correct/correct_data.h
index 63fa01f..278d88c 100644
--- a/src/mode_correct/correct_data.h
+++ b/src/mode_correct/correct_data.h
@@ -34,18 +34,20 @@ public:
int covariate_count; //covariate number
vector < vector < string > > covariate_val; //covariate values
vector < string > covariate_id; //covariate ids
+ vector < string > covariate_target;
residualizer * covariate_engine; //covariate engine machinery
+ //QTL COVARIATES
+
+
//CONSTRUCTOR / DESTRUCTOR
correct_data();
~correct_data();
void clear();
- //INITIALIZE
- void initializeResidualizer();
-
//READ DATA
void readCovariates(string);
+ void readQTLCovariates(string, string);
//GENOTYPE & PHENOTYPE MANAGEMENT
void imputeMissing(vector < float > &);
diff --git a/src/mode_correct/correct_main.cpp b/src/mode_correct/correct_main.cpp
index ab9184a..f3f9fe2 100644
--- a/src/mode_correct/correct_main.cpp
+++ b/src/mode_correct/correct_main.cpp
@@ -28,6 +28,7 @@ void correct_main(vector < string > & argv) {
("vcf", boost::program_options::value< string >(), "Genotypes in VCF/BCF/BED format.")
("bed", boost::program_options::value< string >(), "Phenotypes in BED format.")
("cov", boost::program_options::value< string >(), "Covariates in TXT format.")
+ ("qtl", boost::program_options::value< vector < string > >()->multitoken(), "QTL covariates.")
("out", boost::program_options::value< string >(), "Output file.");
boost::program_options::options_description opt_param ("\x1B[32mParameters\33[0m");
@@ -87,9 +88,11 @@ void correct_main(vector < string > & argv) {
if (D.options.count("vcf")) D.readSampleFromVCF(D.options["vcf"].as < string > ());
if (D.options.count("cov")) D.readSampleFromCOV(D.options["cov"].as < string > ());
D.mergeSampleLists();
- if (D.options.count("cov")) {
- D.readCovariates(D.options["cov"].as < string > ());
- D.initializeResidualizer();
+ if (D.options.count("cov")) D.readCovariates(D.options["cov"].as < string > ());
+ if (D.options.count("qtl")) {
+ vector < string > files = D.options["qtl"].as < vector < string > > ();
+ assert(files .size() == 2);
+ D.readQTLCovariates(files[0], files[1]);
}
//----------------
diff --git a/src/mode_correct/correct_management.cpp b/src/mode_correct/correct_management.cpp
index 4504fe2..fa2f379 100644
--- a/src/mode_correct/correct_management.cpp
+++ b/src/mode_correct/correct_management.cpp
@@ -81,11 +81,3 @@ void correct_data::normalTransform(float * V) {
V[s] = qnorm(R[s], 0.0, 1.0, 1, 0);
}
}
-
-void correct_data::initializeResidualizer() {
- vrb.title("Initialize residualizer");
- covariate_engine = new residualizer (sample_count);
- for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->push(covariate_val[c]);
- covariate_engine->build();
- vrb.bullet("#covariates = " + stb.str(covariate_count));
-}
diff --git a/src/mode_correct/correct_processing.cpp b/src/mode_correct/correct_processing.cpp
index 77534bb..dd3e6f9 100644
--- a/src/mode_correct/correct_processing.cpp
+++ b/src/mode_correct/correct_processing.cpp
@@ -18,6 +18,14 @@
void correct_data::processBED(string fin, string fout) {
int n_includedP = 0, n_excludedP_user = 0;
+ vrb.title("Initialize residualizer");
+ if (covariate_target.size() == 0) {
+ covariate_engine = new residualizer (sample_count);
+ for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->push(covariate_val[c]);
+ covariate_engine->build();
+ vrb.bullet("#covariates = " + stb.str(covariate_count));
+ }
+
//Open BED file
vrb.title("Reading phenotype data in [" + fin + "] and writing [" + fout + "]");
output_file fdo (fout.c_str());
@@ -27,7 +35,7 @@ void correct_data::processBED(string fin, string fout) {
if (!fp) vrb.error("Cannot open file for reading!");
kstring_t str = {0,0,0};
if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != '#' ) vrb.error("Cannot read header!");
- fdo << "#chr\tstart\tend\tid";
+ fdo << "#chr\tstart\tend\tid\tgid\tstrd";
//Read and map sample names
vector < int > mappingS;
@@ -48,7 +56,21 @@ void correct_data::processBED(string fin, string fout) {
fdo << tokens[0] << "\t" << tokens[1] << "\t" << tokens[2] << "\t" << tokens[3] << "\t" << tokens[4] << "\t" << tokens[5];
for (int t = 6 ; t < tokens.size() ; t ++) if (mappingS[t-6] >= 0) values[mappingS[t-6]] = ((tokens[t] != "NA")?stof(tokens[t]):bcf_float_missing);
imputeMissing(values);
- if (residualize) covariate_engine->residualize(values);
+ if (residualize) {
+ if (covariate_target.size() > 0) {
+ covariate_engine = new residualizer (sample_count);
+ for (int c = 0 ; c < covariate_count ; c ++) {
+ if (covariate_target[c] == "ALL") covariate_engine->push(covariate_val[c]);
+ if (covariate_target[c] == tokens[4]) covariate_engine->push(covariate_val[c]);
+ }
+ covariate_engine->build();
+ }
+ covariate_engine->residualize(values);
+ if (covariate_target.size() > 0) {
+ delete covariate_engine;
+ covariate_engine = NULL;
+ }
+ }
if (normalize) normalTransform(values);
for (int s = 0 ; s < sample_count ; s++) fdo << "\t" << values[s];
n_includedP ++ ;
@@ -66,8 +88,14 @@ void correct_data::processBED(string fin, string fout) {
void correct_data::processVCF(string fin, string fout) {
int n_includedG = 0, n_excludedG_user = 0, n_excludedG_miss = 0, n_excludedG_mult = 0;
- vrb.title("Reading genotype data in [" + fin + "] and writing [" + fout + "]");
+ if (covariate_target.size() == 0) {
+ covariate_engine = new residualizer (sample_count);
+ for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->push(covariate_val[c]);
+ covariate_engine->build();
+ vrb.bullet("#covariates = " + stb.str(covariate_count));
+ }
+ vrb.title("Reading genotype data in [" + fin + "] and writing [" + fout + "]");
bcf_sweep_t * sw = bcf_sweep_init(fin.c_str());
if (!sw) vrb.error("Cannot open file for reading [" + fin + "]");
bcf_hdr_t * hdr_old = bcf_sweep_hdr(sw);
diff --git a/src/mode_correct/correct_read_covariates.cpp b/src/mode_correct/correct_read_covariates.cpp
index aff6c16..d71f1d3 100644
--- a/src/mode_correct/correct_read_covariates.cpp
+++ b/src/mode_correct/correct_read_covariates.cpp
@@ -50,3 +50,84 @@ void correct_data::readCovariates(string fcov) {
if (n_excludedC > 0) vrb.bullet(stb.str(n_excludedC) + " covariate(s) excluded by user");
fd.close();
}
+
+
+void correct_data::readQTLCovariates(string fqtl, string fvcf) {
+ string buffer; vector < string > tokens;
+
+ //File qtl
+ map < string, string > map_qtl;
+ vrb.title("Reading QTLs in [" + fqtl + "]");
+ input_file fd (fqtl);
+ if (fd.fail()) vrb.error("Cannot open file!");
+ while (getline(fd, buffer)) {
+ stb.split(buffer, tokens);
+ if (tokens.size() != 2) vrb.error("Wrong Incorrect number of columns, expected 2!");
+ map_qtl.insert(pair < string, string > (tokens[1], tokens[0]));
+ }
+ vrb.bullet(stb.str(map_qtl.size()) + " QTL(s) read");
+ fd.close();
+
+ //
+ for (int c = 0 ; c < covariate_count ; c ++) covariate_target.push_back("ALL");
+
+ //File VCF
+ vector < int > mappingS;
+ int n_includedG = 0, n_excludedG = 0, n_includedS = 0;
+ bcf_srs_t * sr = bcf_sr_init();
+ if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
+ switch (sr->errnum) {
+ case not_bgzf: vrb.error("File not compressed with bgzip!"); break;
+ case idx_load_failed: vrb.error("Impossible to load index file!"); break;
+ case file_type_error: vrb.error("File format not detected by htslib!"); break;
+ default : vrb.error("Unknown error!");
+ }
+ }
+ int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
+ for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
+ mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
+ if (mappingS.back() >= 0) n_includedS++;
+ }
+ unsigned int linecount=0;
+ int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
+ float * ds_arr = NULL;
+ bcf1_t * line;
+ while(bcf_sr_next_line (sr)) {
+ linecount ++;
+ if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
+ line = bcf_sr_get_line(sr, 0);
+ if (line->n_allele == 2) {
+ ngt = bcf_get_genotypes(sr->readers[0].header, line, >_arr, &ngt_arr);
+ nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
+ if (nds == n_samples || ngt == 2*n_samples) {
+ bcf_unpack(line, BCF_UN_STR);
+ string sid = string(line->d.id);
+
+ map < string, string > :: iterator itM = map_qtl.find(sid);
+
+ if (itM != map_qtl.end() && filter_covariate.check(sid)) {
+ covariate_val.push_back(vector < string > (sample_count, "0"));
+ for(int i = 0 ; i < n_samples ; i ++) {
+ if (mappingS[i] >= 0) {
+ if (nds > 0) covariate_val.back()[mappingS[i]] = stb.str(ds_arr[i]);
+ else {
+ if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) covariate_val.back()[mappingS[i]] ="NA";
+ else covariate_val.back()[mappingS[i]] = stb.str(bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]));
+ }
+ }
+ }
+ covariate_target.push_back(itM->second);
+ covariate_count++;
+ n_includedG++;
+ } else n_excludedG ++;
+ }
+ }
+ }
+
+ //Finalize
+ free(gt_arr);
+ free(ds_arr);
+ bcf_sr_destroy(sr);
+ vrb.bullet(stb.str(n_includedG) + " variants included");
+ if (n_excludedG > 0) vrb.bullet(stb.str(n_excludedG) + " variants excluded");
+}
diff --git a/src/mode_fdensity/fdensity_process.cpp b/src/mode_fdensity/fdensity_process.cpp
index b34edc6..0efba73 100644
--- a/src/mode_fdensity/fdensity_process.cpp
+++ b/src/mode_fdensity/fdensity_process.cpp
@@ -13,6 +13,7 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.*/
+
#include "fdensity_data.h"
void fdensity_data::buildIntervalTrees() {
@@ -45,6 +46,10 @@ void fdensity_data::buildIntervalTrees() {
for (int c = 0 ; c < n_chr ; c ++) Ctree[c] = IntervalTree < bool > (Ivec[c]);
//4. Build functional neighborhoods
+
+ int n_neg = 0, n_pos = 0, n_mid = 0;
+ int s_neg = 0, s_pos = 0, s_mid = 0;
+
basic_stats Rstat;
Itree = vector < IntervalTree < bool > > (tss_count);
for (int t = 0 ; t < tss_count ; t ++) {
@@ -52,17 +57,38 @@ void fdensity_data::buildIntervalTrees() {
assert(itC != chr2idx.end());
vector < Interval < bool > > ann_in_cis;
Ctree[itC->second].findOverlapping(tss_pos[t] - window, tss_pos[t] + window, ann_in_cis);
- Rstat.push(ann_in_cis.size() * 1.0);
+
+ IntervalStartSorter < bool > intervalStartSorter;
+ sort(ann_in_cis.begin(), ann_in_cis.end(), intervalStartSorter);
+
+ Rstat.push(ann_in_cis.size() * 1.0);
vector < Interval < bool > > Rvec;
- if (!tss_neg[t]) for (int a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (ann_in_cis[a].start - tss_pos[t], ann_in_cis[a].stop - tss_pos[t], true));
- else for (int a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (-1 * (ann_in_cis[a].stop - tss_pos[t]), -1 * (ann_in_cis[a].start - tss_pos[t]), true));
+
+ for (int a = 0 ; a < ann_in_cis.size() ; a ++) {
+
+ if (!tss_neg[t]) Rvec.push_back(Interval < bool > (ann_in_cis[a].start - tss_pos[t], ann_in_cis[a].stop - tss_pos[t], true));
+ else Rvec.push_back(Interval < bool > (-1 * (ann_in_cis[a].stop - tss_pos[t]), -1 * (ann_in_cis[a].start - tss_pos[t]), true));
+ assert(Rvec.back().start <= Rvec.back().stop);
+
+ if (Rvec.back().start < 0 && Rvec.back().stop < 0) {
+ s_neg += Rvec.back().stop - Rvec.back().start;
+ n_neg ++;
+ } else if (Rvec.back().start > 0 && Rvec.back().stop > 0) {
+ s_pos += Rvec.back().stop - Rvec.back().start;
+ n_pos ++;
+ } else {
+ s_mid += Rvec.back().stop - Rvec.back().start;
+ n_mid ++;
+ }
+ }
+ sort(Rvec.begin(), Rvec.end(), intervalStartSorter);
Itree[t] = IntervalTree < bool > (Rvec);
}
+
vrb.bullet("#annotated cis-windows = " + stb.str(Rstat.size()));
vrb.bullet("#annotations per cis-window = " + stb.str(Rstat.mean(), 2) + " +/- " + stb.str(Rstat.sd(), 2));
}
-
void fdensity_data::runDensityCalculation(string fout) {
vrb.title("Density analysis");
@@ -72,6 +98,7 @@ void fdensity_data::runDensityCalculation(string fout) {
int wto = w + bin - 1;
int n_annotation = 0;
+
for (int t = 0 ; t < tss_count ; t ++) {
vector < Interval < bool > > ann_in_bin;
Itree[t].findOverlapping(wfrom, wto, ann_in_bin);
@@ -80,5 +107,6 @@ void fdensity_data::runDensityCalculation(string fout) {
fdo << wfrom << " " << wto << " " << n_annotation << endl;
}
+
fdo.close();
}
diff --git a/src/mode_fenrich/fenrich_management.cpp b/src/mode_fenrich/fenrich_management.cpp
index b8abb48..edf8ab7 100644
--- a/src/mode_fenrich/fenrich_management.cpp
+++ b/src/mode_fenrich/fenrich_management.cpp
@@ -61,10 +61,17 @@ void fenrich_data::mapAnnotation2QTL() {
assert(itC != chr2idx.end());
vector < Interval < bool > > ann_in_cis;
Ttree[itC->second].findOverlapping(tss_pos[t] - max_distance - 10000, tss_pos[t] + max_distance + 10000, ann_in_cis);
+ IntervalStartSorter < bool > intervalStartSorter;
+ sort(ann_in_cis.begin(), ann_in_cis.end(), intervalStartSorter);
Rstat.push(ann_in_cis.size() * 1.0);
vector < Interval < bool > > Rvec;
- if (!tss_neg[t]) for (int a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (ann_in_cis[a].start - tss_pos[t], ann_in_cis[a].stop - tss_pos[t], true));
- else for (int a = 0 ; a < ann_in_cis.size() ; a ++) Rvec.push_back(Interval < bool > (-1 * (ann_in_cis[a].start - tss_pos[t]), -1 * (ann_in_cis[a].stop - tss_pos[t]), true));
+
+ for (int a = 0 ; a < ann_in_cis.size() ; a ++) {
+ if (!tss_neg[t]) Rvec.push_back(Interval < bool > (ann_in_cis[a].start - tss_pos[t], ann_in_cis[a].stop - tss_pos[t], true));
+ else Rvec.push_back(Interval < bool > (-1 * (ann_in_cis[a].stop - tss_pos[t]), -1 * (ann_in_cis[a].start - tss_pos[t]), true));
+ assert(Rvec.back().start <= Rvec.back().stop);
+ }
+ sort(Rvec.begin(), Rvec.end(), intervalStartSorter);
R[t] = IntervalTree < bool > (Rvec);
}
vrb.bullet("#annotated cis-windows = " + stb.str(Rstat.size()));
diff --git a/src/mode_match/match_process.cpp b/src/mode_match/match_process.cpp
index 309db27..e7c397a 100644
--- a/src/mode_match/match_process.cpp
+++ b/src/mode_match/match_process.cpp
@@ -20,6 +20,8 @@ void match_data::writeOutput(string filename) {
vrb.title("Write summary report in [" + filename + "]");
output_file fd (filename);
+ fd << "SampleID n_geno_missing n_het_total n_hom_total n_het_covered n_hom_covered n_het_consistent n_hom_consistent perc_het_consistent perc_hom_consistent n_het_in_ase" << endl;
+
for (int i = 0 ; i < sample_count ; i ++) {
unsigned int n_mis_tot = 0, n_het_tot = 0, n_hom_tot = 0, n_het_cov = 0, n_hom_cov = 0, n_het_fit = 0, n_hom_fit = 0, n_het_ase = 0;
for (int r = 0; r < regions.size() ; r ++) {
@@ -61,7 +63,10 @@ void match_data::writeOutput(string filename) {
fd << " " << n_hom_cov;
fd << " " << n_het_fit;
fd << " " << n_hom_fit;
- fd << " " << n_het_ase << endl;
+ fd << " " << n_het_fit/n_het_cov;
+ fd << " " << n_hom_fit/n_hom_cov;
+ fd << " " << n_het_ase;
+ fd << endl;
}
fd.close();
}
diff --git a/src/mode_quan/quan_chunking.cpp b/src/mode_quan/quan_chunking.cpp
index e125e56..f26d981 100644
--- a/src/mode_quan/quan_chunking.cpp
+++ b/src/mode_quan/quan_chunking.cpp
@@ -34,9 +34,22 @@ void quan_data::setChunk(int k, int K){
unsigned long int max_length =0 ;
if (gene_grps.size() % K == 0) max_length = gene_grps.size() / K;
else for ( unsigned long int l = 1 ; l * (K-1) < gene_grps.size(); l++ ) max_length = l;
- unsigned long int start_idx = (k-1) * max_length;
- unsigned long int end_idx = k * max_length;
+ unsigned long int start_idx, end_idx;
+ if (K * max_length < gene_grps.size()){
+ int diff = gene_grps.size() - (K * max_length);
+ if (k <= diff){
+ start_idx = (k-1) * (max_length + 1) + 1;
+ end_idx = k * (max_length + 1);
+ }else{
+ int prev = diff * (max_length + 1);
+ start_idx = (k-diff-1) * max_length + 1 + prev;
+ end_idx = (k-diff) * max_length + prev;
+ }
+ }else{
+ start_idx = (k-1) * max_length + 1;
+ end_idx = k * max_length;
+ }
if (end_idx > gene_grps.size()) end_idx = gene_grps.size();
gene_grps = vector < quan_gene_grp > (gene_grps.begin()+start_idx, gene_grps.begin()+end_idx);
vrb.bullet("Number of gene groups in chunk [" + stb.str(k) + " / " + stb.str(K) +"] = " + stb.str(gene_grps.size()));
-}
\ No newline at end of file
+}
diff --git a/src/mode_quan/quan_data.h b/src/mode_quan/quan_data.h
index 314e980..b7e0914 100644
--- a/src/mode_quan/quan_data.h
+++ b/src/mode_quan/quan_data.h
@@ -197,7 +197,7 @@ typedef struct { // auxiliary data structure
hts_itr_t * iter; // NULL if a region not specified
hts_idx_t * idx;
int min_mapQ; // mapQ filter
- bool dup_remove; // remove duplicates
+ bool dup_remove,fail_qc; // remove duplicates, failed QC reads
int max_intron_length;
double max_mismatch_count;
double max_mismatch_count_total;
@@ -253,9 +253,9 @@ public:
//FILTERS
int max_intron_length;
double max_mismatch_count,max_mismatch_count_total;
- unsigned int min_mapQ,max_read_length;
- bool dup_remove,proper_pair,check_consistency,debug,merge,fraction_mm,fraction_mmt;
- quan_data(){max_intron_length=0;max_mismatch_count=0.0;max_mismatch_count_total=0.0;min_mapQ=0;max_read_length=0;dup_remove=false;proper_pair=false;check_consistency=false;debug=false;merge=true;fraction_mm=false;fraction_mmt=false;}
+ unsigned int min_mapQ,max_read_length,min_exon;
+ bool dup_remove,proper_pair,check_consistency,debug,merge,fraction_mm,fraction_mmt,fail_qc,old_wrong_split;
+ quan_data(){max_intron_length=0;max_mismatch_count=0.0;max_mismatch_count_total=0.0;min_mapQ=0;max_read_length=0;dup_remove=false;proper_pair=false;check_consistency=false;debug=false;merge=true;fraction_mm=false;fraction_mmt=false;fail_qc=false;old_wrong_split=false;}
void setChunk(int,int);
void setRegion(string);
diff --git a/src/mode_quan/quan_main.cpp b/src/mode_quan/quan_main.cpp
index 61d23f0..dd0549d 100644
--- a/src/mode_quan/quan_main.cpp
+++ b/src/mode_quan/quan_main.cpp
@@ -37,12 +37,15 @@ void quan_main(vector < string > & argv) {
boost::program_options::options_description opt_filters ("\x1B[32mFilters\33[0m");
opt_filters.add_options()
- ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal phred mapping quality for a read to be considered.")
+ ("filter-mapping-quality", boost::program_options::value< unsigned int >()->default_value(10), "Minimal mapping quality for a read to be considered.")
("filter-mismatch", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum mismatches allowed in a read. If between 0 and 1 taken as the fraction of read length. (Requires NM attribute)")
("filter-mismatch-total", boost::program_options::value< double >()->default_value(-1.0,"OFF"), "Maximum total mismatches allowed in paired reads. If between 0 and 1 taken as the fraction of combined read length. (Requires NM attribute)")
("check-proper-pairing", "If provided only properly paired reads according to the aligner that are in correct orientation will be considered. Otherwise all pairs in correct orientation will be considered.")
("check-consistency", "If provided checks the consistency of split reads with annotation, rather than pure overlap of one of the blocks of the split read.")
("no-merge", "If provided overlapping mate pairs will not be merged.")
+ ("legacy-options", "Exactly replicate Dermitzakis lab original quantification script. (DO NOT USE UNLESS YOU HAVE A GOOD REASON). Sets --no-merge as well.")
+ ("filter-failed-qc", "Remove fastq reads that fail sequencing QC (as indicated by the sequencer)")
+ ("filter-min-exon", boost::program_options::value< unsigned int >()->default_value(0), "Minimal exon length to consider. Exons smaller than this will not be printed out in the exon quantifications, but will still count towards gene quantifications.")
("filter-remove-duplicates", "Remove duplicate sequencing reads in the process.");
boost::program_options::options_description opt_parallel ("\x1B[32mParallelization\33[0m");
@@ -119,11 +122,28 @@ void quan_main(vector < string > & argv) {
D.dup_remove = true;
}
+ if (D.options.count("filter-failed-qc")){
+ vrb.bullet("Filtering reads flagged as failing QC");
+ D.fail_qc = true;
+ }
+
if (D.options.count("no-merge")){
vrb.bullet("Not merging overlapping mate pairs");
D.merge = false;
}
+ if (D.options.count("legacy-options")){
+ if (!D.options.count("no-merge")) vrb.bullet("Not merging overlapping mate pairs");
+ D.min_exon = 2;
+ vrb.bullet("Excluding exons smaller than " + stb.str(D.min_exon) );
+ vrb.warning("You are using --legacy-options, do you know what you are doing?");
+ D.old_wrong_split = true;
+ D.merge = false;
+ }else{
+ D.min_exon = D.options["filter-min-exon"].as < unsigned int > ();
+ vrb.bullet("Excluding exons smaller than " + stb.str(D.min_exon) );
+ }
+
if (D.options.count("gene-types")){
vector < string > t = D.options["gene-types"].as < vector < string > > ();
D.gene_types = set < string > (t.begin(),t.end());
diff --git a/src/mode_quan/quan_printResults.cpp b/src/mode_quan/quan_printResults.cpp
index 1e75b88..84554de 100644
--- a/src/mode_quan/quan_printResults.cpp
+++ b/src/mode_quan/quan_printResults.cpp
@@ -52,6 +52,7 @@ void quan_data::printBEDcount(string fout){
for (int i = 0 ; i < bams.size(); i++) fdo << "\t" << gene_grps[gr].genes[g].read_count[i];
fdo << endl;
for (int e = 0 ; e < gene_grps[gr].genes[g].exons.size(); e++){
+ if (gene_grps[gr].genes[g].exons[e].length < min_exon) continue;
fdoe << chr;
fdoe << "\t" << gene_grps[gr].genes[g].tss-1;
fdoe << "\t" << gene_grps[gr].genes[g].tss;
@@ -104,6 +105,7 @@ void quan_data::printBEDrpkm(string fout){
for (int i = 0 ; i < bams.size(); i++) fdo << "\t" << ((gene_grps[gr].genes[g].read_count[i] * 1000.0) / (double) gene_grps[gr].genes[g].length) * (1000000.0 / (double)stats[i].exonic);
fdo << endl;
for (int e = 0 ; e < gene_grps[gr].genes[g].exons.size(); e++){
+ if (gene_grps[gr].genes[g].exons[e].length < min_exon) continue;
fdoe << chr;
fdoe << "\t" << gene_grps[gr].genes[g].tss-1;
fdoe << "\t" << gene_grps[gr].genes[g].tss;
diff --git a/src/mode_quan/quan_readBAM.cpp b/src/mode_quan/quan_readBAM.cpp
index 54a2e07..6d850f1 100644
--- a/src/mode_quan/quan_readBAM.cpp
+++ b/src/mode_quan/quan_readBAM.cpp
@@ -36,7 +36,14 @@ int quan_data::read_bam(void *data, bam1_t *b, quan_stats &f, unsigned int &mmc)
continue;
}
- if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL)) {
+ if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY)) {
+ f.failed[name] = 'u';
+ if (debug) cerr << f.failed[name] << "\t" << name << endl;
+ f.unmapped++;
+ continue;
+ }
+
+ if(aux->fail_qc && b->core.flag & BAM_FQCFAIL){
f.failed[name] = 'u';
if (debug) cerr << f.failed[name] << "\t" << name << endl;
f.unmapped++;
@@ -172,6 +179,7 @@ void quan_data::readBams(){
data->max_mismatch_count = max_mismatch_count;
data->max_mismatch_count_total = max_mismatch_count_total;
data->max_intron_length = max_intron_length;
+ data->fail_qc = fail_qc;
data->fp = sam_open(fbam.c_str(), "r");
if (data->fp == 0) vrb.error("Cannot open file! [" + fbam + "]");
data->hdr = sam_hdr_read(data->fp);
@@ -204,8 +212,8 @@ void quan_data::readBams(){
for (int i = 0; i < c->n_cigar; ++i) {
int l = bam_cigar_oplen(cigar[i]);
char c = bam_cigar_opchr(cigar[i]);
- if(c=='S' || c=='D' || c =='H' || c=='P') continue;
- else if (c=='N' && l){
+ if(c=='S' || c =='H' || c =='I' || c=='P') continue;
+ else if ( (c=='N' || c == 'D' ) && l){
B.starts.push_back(bS);
B.ends.push_back(bS+bL-1);
B.lengths.push_back(bL);
@@ -233,12 +241,13 @@ void quan_data::readBams(){
stat.mismatch+=2;
continue;
}
- if(A.core.mtid != B.core.tid || (A.core.flag & BAM_FREVERSE) || !(B.core.flag & BAM_FREVERSE) || (B.core.pos < A.core.mpos) ){
+ if(A.core.mtid != B.core.tid || (A.core.flag & BAM_FREVERSE) || !(B.core.flag & BAM_FREVERSE) || ( old_wrong_split && B.core.pos <= A.core.pos) || ( !old_wrong_split && B.core.pos < A.core.pos)){
stat.failed[name] = 'p';
if (debug) cerr << stat.failed[name] << "\t" << name << endl;
stat.unpaired++;
continue;
}
+
if (merge) A.merge(B);
stat.good +=2;
bool both_found = false;
@@ -254,7 +263,10 @@ void quan_data::readBams(){
idx = e;
any_found1 = true;
exon_overlap1.push_back(idx);
- exon_overlap1_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, A.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, A.starts[i]) + 1);
+ if ( old_wrong_split){
+ int mul = ( (int) A.starts[i] / 100000 != (int) A.ends[i] / 100000 && (int) gene_grps[gr].genes[g].exons[idx].start / 100000 != (int) gene_grps[gr].genes[g].exons[idx].end / 100000 ) ? 2 : 1;
+ exon_overlap1_length.push_back(gene_grps[gr].genes[g].exons[idx].end - A.starts[i] < A.ends[i] - A.starts[i] ? (gene_grps[gr].genes[g].exons[idx].end - A.starts[i] + 1) * mul : A.lengths[i] * mul);
+ }else exon_overlap1_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, A.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, A.starts[i]) + 1);
exon_overlap1_length_total += exon_overlap1_length.back();
exon_map1.push_back(i);
}
@@ -262,26 +274,8 @@ void quan_data::readBams(){
}
if (idx == -1) all_found1 = false;
}
- if (!all_found1 && check_consistency){
- if (debug){
- cerr << "NCONS\t" << name<<endl;
- cerr << gene_grps[gr].genes[g];
- cerr << A;
- cerr << "NCONS\t" << name<<endl;
- }
- continue;
- }
- if (!any_found1){
- if (debug){
- cerr << "NCONS\t" << name<<endl;
- cerr << gene_grps[gr].genes[g];
- cerr << A;
- cerr << "NCONS\t" << name<<endl;
- }
- continue;
- }
-
-
+
+
for (int i = 0 ; i < B.starts.size(); i++){
int idx = -1;
if (gene_grps[gr].genes[g].overlap(B.starts[i],B.ends[i])){
@@ -290,7 +284,10 @@ void quan_data::readBams(){
idx = e;
any_found2 = true;
exon_overlap2.push_back(idx);
- exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
+ if ( old_wrong_split) {
+ int mul = ( (int) B.starts[i] / 100000 != (int) B.ends[i] / 100000 && (int) gene_grps[gr].genes[g].exons[idx].start / 100000 != (int) gene_grps[gr].genes[g].exons[idx].end / 100000 ) ? 2 : 1;
+ exon_overlap2_length.push_back(gene_grps[gr].genes[g].exons[idx].end - B.starts[i] < B.ends[i] - B.starts[i] ? (gene_grps[gr].genes[g].exons[idx].end - B.starts[i] + 1) * mul: B.lengths[i] * mul);
+ }else exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
exon_overlap2_length_total += exon_overlap2_length.back();
exon_map2.push_back(i);
}
@@ -298,20 +295,41 @@ void quan_data::readBams(){
}
if (idx == -1) all_found2 = false;
}
+
+ if (!all_found1 && check_consistency){
+ if (debug){
+ cerr << "NCONSALL1\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << A;
+ cerr << B;
+ }
+ continue;
+ }
+ //for debugging
+ if (!any_found1){
+ if (debug){
+ cerr << "NCONSANY1\t" << name<<endl;
+ cerr << gene_grps[gr].genes[g];
+ cerr << A;
+ cerr << B;
+ }
+ continue;
+ }
+
if (!all_found2 && check_consistency){
if (debug){
- cerr << "NCONS\t" << name<<endl;
- cerr << "NCONS\t" << name<<endl;
+ cerr << "NCONSALL2\t" << name<<endl;
cerr << gene_grps[gr].genes[g];
+ cerr << A;
cerr << B;
}
continue;
}
if (!any_found2){
if(debug){
- cerr << "NCONS\t" << name<<endl;
- cerr << "NCONS\t" << name<<endl;
+ cerr << "NCONSANY2\t" << name<<endl;
cerr << gene_grps[gr].genes[g];
+ cerr << A;
cerr << B;
}
continue;
@@ -356,7 +374,10 @@ void quan_data::readBams(){
idx = e;
any_found2 = true;
exon_overlap2.push_back(idx);
- exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
+ if ( old_wrong_split) {
+ int mul = ( (int) B.starts[i] / 100000 != (int) B.ends[i] / 100000 && (int) gene_grps[gr].genes[g].exons[idx].start / 100000 != (int) gene_grps[gr].genes[g].exons[idx].end / 100000 ) ? 2 : 1;
+ exon_overlap2_length.push_back(gene_grps[gr].genes[g].exons[idx].end - B.starts[i] < B.ends[i] - B.starts[i] ? (gene_grps[gr].genes[g].exons[idx].end - B.starts[i] + 1) * mul: B.lengths[i] * mul);
+ } else exon_overlap2_length.push_back(min(gene_grps[gr].genes[g].exons[idx].end, B.ends[i]) - max(gene_grps[gr].genes[g].exons[idx].start, B.starts[i]) + 1);
exon_overlap2_length_total += exon_overlap2_length[i];
exon_map2.push_back(i);
}
diff --git a/src/mode_rtc/rtc_common.cpp b/src/mode_rtc/rtc_common.cpp
index 10dc2a6..154379e 100644
--- a/src/mode_rtc/rtc_common.cpp
+++ b/src/mode_rtc/rtc_common.cpp
@@ -214,7 +214,7 @@ void rtc_data::calculateRTC(string fout){
if (fdo.fail()) vrb.error("Cannot open file [" + fout + "]");
if (options.count("header") || (!options.count("chunk") && !options.count("region"))){
fdo <<"other_variant our_variant phenotype phenotype_group other_variant_chr other_variant_start other_variant_rank our_variant_chr our_variant_start our_variant_rank phenotype_chr phenotype_start distance_between_variants distance_between_other_variant_and_pheno other_variant_region_index our_variant_region_index region_start region_end variant_count_in_region RTC D' r^2";
- if (options.count("sample")) fdo << " p_value unique_picks_H0 unique_picks_H1 rtc_bin_start rtc_bin_end rtc_bin_H0_proportion rtc_bin_H1_proportion median_r^2 median_H0 median_H1 H0 H1";
+ if (options["sample"].as <unsigned int> ()) fdo << " p_value unique_picks_H0 unique_picks_H1 rtc_bin_start rtc_bin_end rtc_bin_H0_proportion rtc_bin_H1_proportion median_r^2 median_H0 median_H1 H0 H1";
fdo << endl;
}
map < string ,vector < pairsToTestForRTC > >::iterator it;
diff --git a/src/mode_trans/trans_chunking.cpp b/src/mode_trans/trans_chunking.cpp
index b7319cb..efa52e0 100644
--- a/src/mode_trans/trans_chunking.cpp
+++ b/src/mode_trans/trans_chunking.cpp
@@ -25,7 +25,19 @@ void trans_data::setPhenotypeLines(int k, int K) {
unsigned long int max_length =0 ;
if (phenotype_count % K == 0) max_length = phenotype_count / K;
else for ( unsigned long int l = 1 ; l * (K-1) < phenotype_count; l++ ) max_length = l;
- start_line = (k-1) * max_length + 1;
- end_line = k * max_length;
+ if (K * max_length < phenotype_count){
+ int diff = phenotype_count - (K * max_length);
+ if (k <= diff){
+ start_line = (k-1) * (max_length + 1) + 1;
+ end_line = k * (max_length + 1);
+ }else{
+ int prev = diff * (max_length + 1);
+ start_line = (k-diff-1) * max_length + 1 + prev;
+ end_line = (k-diff) * max_length + prev;
+ }
+ }else{
+ start_line = (k-1) * max_length + 1;
+ end_line = k * max_length;
+ }
if (end_line > phenotype_count) end_line = phenotype_count;
-}
\ No newline at end of file
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/qtltools.git
More information about the debian-med-commit
mailing list