[med-svn] [eagle] 01/06: New upstream version 2.4

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Fri Jan 12 22:38:14 UTC 2018


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository eagle.

commit 2d8ac15bd5c0ab3842a95b847dc44051024eb23e
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Fri Jan 12 22:53:00 2018 +0100

    New upstream version 2.4
---
 example/example.log          |  70 ++++++++++++-------------
 example/example_ref.log      |  15 +++---
 example/example_vcf.log      |  70 ++++++++++++-------------
 example/phased.vcf.gz        | Bin 200383 -> 200388 bytes
 example/target.phased.vcf.gz | Bin 21053 -> 21053 bytes
 src/Eagle.cpp                |  17 ++++++-
 src/Eagle.hpp                |   5 +-
 src/EagleMain.cpp            |  15 ++++--
 src/EaglePBWT.cpp            |  15 +++---
 src/EagleParams.cpp          |  11 +++-
 src/EagleParams.hpp          |   3 +-
 src/MapInterpolater.cpp      |  10 ++++
 src/SyncedVcfData.cpp        | 119 +++++++++++++++++++++++--------------------
 src/SyncedVcfData.hpp        |  19 +++----
 src/Version.hpp              |   4 +-
 15 files changed, 214 insertions(+), 159 deletions(-)

diff --git a/example/example.log b/example/example.log
index a916d76..adfb60e 100644
--- a/example/example.log
+++ b/example/example.log
@@ -1,12 +1,12 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.5              |
-                      |   August 2, 2017            |
+                      |   Eagle v2.4                |
+                      |   December 13, 2017         |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
 
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
 Distributed under the GNU GPLv3+ open source license.
 
 Command line options:
@@ -56,25 +56,25 @@ Setting --histFactor=1.00
 
 BEGINNING STEP 1
 
-Time for step 1: 0.373437
-Time for step 1 MN^2: 0.023717
+Time for step 1: 0.282519
+Time for step 1 MN^2: 0.0201082
 
-Making hard calls (time: 0.028342)
+Making hard calls (time: 0.0164621)
 
 
 BEGINNING STEP 2
 
 BATCH 1 OF 1
 Building hash tables
-.................................................................. (time: 0.050245)
+.................................................................. (time: 0.0826769)
 
 Phasing samples 1-379
-Time for phasing batch: 0.433878
+Time for phasing batch: 0.335781
 
-Making hard calls (time: 0.030308)
+Making hard calls (time: 0.0172811)
 
-Time for step 2: 0.514443
-Time for step 2 MN^2: 0.0593926
+Time for step 2: 0.435747
+Time for step 2 MN^2: 0.0391194
 
 
 BEGINNING STEP 3 (PBWT ITERS)
@@ -87,108 +87,108 @@ BEGINNING PBWT ITER 1
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 1.75117
+Time for phasing batch: 1.13401
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 1.58165
+Time for phasing batch: 1.04853
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 1.68042
+Time for phasing batch: 1.18956
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 1.6039
+Time for phasing batch: 1.1353
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 1.59436
+Time for phasing batch: 1.01419
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 1.64487
+Time for phasing batch: 1.05172
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 1.60414
+Time for phasing batch: 0.99322
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 1.51987
+Time for phasing batch: 0.99847
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 1.56367
+Time for phasing batch: 0.998471
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 1.5805
+Time for phasing batch: 1.05052
 
-Time for PBWT iter 1: 16.1246
+Time for PBWT iter 1: 10.6142
 
 BEGINNING PBWT ITER 2
 
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 2.70429
+Time for phasing batch: 1.76314
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 2.5743
+Time for phasing batch: 1.79145
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 2.70542
+Time for phasing batch: 1.8253
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 2.56365
+Time for phasing batch: 1.54563
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 2.50088
+Time for phasing batch: 1.54299
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 2.58336
+Time for phasing batch: 1.59936
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 2.59174
+Time for phasing batch: 1.55847
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 2.47758
+Time for phasing batch: 1.51538
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 2.48646
+Time for phasing batch: 1.58919
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 2.57226
+Time for phasing batch: 1.64308
 
-Time for PBWT iter 2: 25.76
+Time for PBWT iter 2: 16.3741
 Writing .haps.gz and .sample output
-Time for writing output: 0.306937
-Total elapsed time for analysis = 43.2534 sec
+Time for writing output: 0.281509
+Total elapsed time for analysis = 28.1508 sec
diff --git a/example/example_ref.log b/example/example_ref.log
index bb3c432..14faf91 100644
--- a/example/example_ref.log
+++ b/example/example_ref.log
@@ -1,12 +1,12 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.5              |
-                      |   August 2, 2017            |
+                      |   Eagle v2.4                |
+                      |   December 13, 2017         |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
 
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
 Distributed under the GNU GPLv3+ open source license.
 
 Command line options:
@@ -42,6 +42,9 @@ Genetic distance range:  10.292 cM
 Average # SNPs per cM:   42
 Number of <=(64-SNP, 1cM) segments: 9
 Average # SNPs per segment: 47
+
+Time for reading input: 6.18623 sec
+
 Fraction of heterozygous genotypes: 0.178114
 Typical span of default 100-het history length: 13.44 cM
 Setting --histFactor=1.00
@@ -56,15 +59,15 @@ PHASING ITER 1 OF 1
 
 Phasing target samples
 ................................................................................
-Time for phasing iter 1: 0.236665
+Time for phasing iter 1: 0.144591
 Writing vcf.gz output to target.phased.vcf.gz
 [W::vcf_parse] INFO 'AC' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'AN' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'DP' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'AFR_AF' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'EX_TARGET' is not defined in the header, assuming Type=String
-Time for writing output: 0.028465
-Total elapsed time for analysis = 9.34482 sec
+Time for writing output: 0.0239
+Total elapsed time for analysis = 6.35535 sec
 
 Mean phase confidence of each target individual:
 ID	PHASE_CONFIDENCE
diff --git a/example/example_vcf.log b/example/example_vcf.log
index c901b3c..19f7213 100644
--- a/example/example_vcf.log
+++ b/example/example_vcf.log
@@ -1,12 +1,12 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.5              |
-                      |   August 2, 2017            |
+                      |   Eagle v2.4                |
+                      |   December 13, 2017         |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
 
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
 Distributed under the GNU GPLv3+ open source license.
 
 Command line options:
@@ -39,25 +39,25 @@ Setting --histFactor=1.00
 
 BEGINNING STEP 1
 
-Time for step 1: 0.37492
-Time for step 1 MN^2: 0.0236487
+Time for step 1: 0.267283
+Time for step 1 MN^2: 0.0164427
 
-Making hard calls (time: 0.02865)
+Making hard calls (time: 0.0152462)
 
 
 BEGINNING STEP 2
 
 BATCH 1 OF 1
 Building hash tables
-.................................................................. (time: 0.0538409)
+.................................................................. (time: 0.0786691)
 
 Phasing samples 1-379
-Time for phasing batch: 0.435353
+Time for phasing batch: 0.331454
 
-Making hard calls (time: 0.0303631)
+Making hard calls (time: 0.0164559)
 
-Time for step 2: 0.519568
-Time for step 2 MN^2: 0.0592469
+Time for step 2: 0.426586
+Time for step 2 MN^2: 0.0375612
 
 
 BEGINNING STEP 3 (PBWT ITERS)
@@ -70,108 +70,108 @@ BEGINNING PBWT ITER 1
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 1.79524
+Time for phasing batch: 1.0778
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 1.61659
+Time for phasing batch: 1.0477
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 1.70906
+Time for phasing batch: 1.11441
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 1.63612
+Time for phasing batch: 1.06495
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 1.60907
+Time for phasing batch: 1.07034
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 1.65953
+Time for phasing batch: 1.13122
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 1.657
+Time for phasing batch: 1.07535
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 1.59278
+Time for phasing batch: 1.16462
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 1.61661
+Time for phasing batch: 1.15337
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 1.59879
+Time for phasing batch: 1.17788
 
-Time for PBWT iter 1: 16.4908
+Time for PBWT iter 1: 11.0777
 
 BEGINNING PBWT ITER 2
 
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 2.72667
+Time for phasing batch: 1.87655
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 2.55292
+Time for phasing batch: 1.6772
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 2.68511
+Time for phasing batch: 1.67862
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 2.57207
+Time for phasing batch: 1.57155
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 2.51047
+Time for phasing batch: 1.57551
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 2.6027
+Time for phasing batch: 1.64418
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 2.619
+Time for phasing batch: 1.60561
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 2.47177
+Time for phasing batch: 1.63349
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 2.52629
+Time for phasing batch: 1.78741
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 2.5489
+Time for phasing batch: 1.80977
 
-Time for PBWT iter 2: 25.8159
+Time for PBWT iter 2: 16.86
 Writing vcf.gz output to phased.vcf.gz
-Time for writing output: 0.316302
-Total elapsed time for analysis = 53.0844 sec
+Time for writing output: 0.204556
+Total elapsed time for analysis = 34.7048 sec
diff --git a/example/phased.vcf.gz b/example/phased.vcf.gz
index 67ee350..5ed3010 100644
Binary files a/example/phased.vcf.gz and b/example/phased.vcf.gz differ
diff --git a/example/target.phased.vcf.gz b/example/target.phased.vcf.gz
index 81664e0..ed6302c 100644
Binary files a/example/target.phased.vcf.gz and b/example/target.phased.vcf.gz differ
diff --git a/src/Eagle.cpp b/src/Eagle.cpp
index 6e185b5..b0eced0 100644
--- a/src/Eagle.cpp
+++ b/src/Eagle.cpp
@@ -2329,6 +2329,10 @@ namespace EAGLE {
       cerr << "Too few SNP segments for analysis" << endl;
       exit(1);
     }
+    if (Mseg64 > 50000U) {
+      cerr << "Too many SNP segments for analysis: " << Mseg64 << " (max = 50000)" << endl;
+      exit(1);
+    }
 
     uint seed = n0; // for rand_r()
 
@@ -3218,7 +3222,8 @@ namespace EAGLE {
 
   void Eagle::writeVcf(const string &tmpFile, const vector <bool> &isTmpPhased,
 		       const string &outFile, int chromX, double bpStart, double bpEnd,
-		       const string &writeMode, bool noImpMissing, int argc, char **argv) const {
+		       const string &writeMode, bool noImpMissing, bool keepMissingPloidyX,
+		       int argc, char **argv) const {
 
     htsFile *htsTmp = hts_open(tmpFile.c_str(), "r");
     if (htsTmp == NULL) {
@@ -3274,6 +3279,10 @@ namespace EAGLE {
 	    int ploidy = 2;
 	    int *ptr = tgt_gt + i*ploidy;
 
+	    // --keepMissingPloidyX: assume missing genotypes in target VCF have correct ploidy
+	    if (chrom == chromX && keepMissingPloidyX && bcf_gt_is_missing(ptr[0]))
+	      mostRecentPloidy[i] = (ptr[1] == bcf_int32_vector_end ? 1 : 2);
+
 	    if (chrom != chromX || (bcf_gt_is_missing(ptr[0]) && mostRecentPloidy[i] == 2)
 		|| ptr[1] != bcf_int32_vector_end) { // diploid... be careful about missing '.'
 	      mostRecentPloidy[i] = 2;
@@ -3355,7 +3364,7 @@ namespace EAGLE {
   // - does not delete tmpFile (now vcfFile with original input)
   void Eagle::writeVcfNonRef(const string &vcfFile, const string &outFile, int inputChrom,
 			     int chromX, double bpStart, double bpEnd, const string &writeMode,
-			     int argc, char **argv) const {
+			     bool keepMissingPloidyX, int argc, char **argv) const {
 
     htsFile *htsIn = hts_open(vcfFile.c_str(), "r");
     htsFile *out = hts_open(outFile.c_str(), writeMode.c_str());
@@ -3388,6 +3397,10 @@ namespace EAGLE {
 	  int ploidy = 2;
 	  int *ptr = tgt_gt + i*ploidy;
 
+	  // --keepMissingPloidyX: assume missing genotypes in target VCF have correct ploidy
+	  if (chrom == chromX && keepMissingPloidyX && bcf_gt_is_missing(ptr[0]))
+	    mostRecentPloidy[i] = (ptr[1] == bcf_int32_vector_end ? 1 : 2);
+
 	  if (chrom != chromX || (bcf_gt_is_missing(ptr[0]) && mostRecentPloidy[i] == 2)
 	      || ptr[1] != bcf_int32_vector_end) { // diploid... be careful about missing '.'
 	    mostRecentPloidy[i] = 2;
diff --git a/src/Eagle.hpp b/src/Eagle.hpp
index 5c43e0f..8e309a7 100644
--- a/src/Eagle.hpp
+++ b/src/Eagle.hpp
@@ -187,10 +187,11 @@ namespace EAGLE {
     void writeHapsGzSample(const std::string &prefix) const;
     void writeVcf(const std::string &tmpFile, const std::vector <bool> &isTmpPhased,
 		  const std::string &outFile, int chromX, double bpStart, double bpEnd,
-		  const std::string &writeMode, bool noImpMissing, int argc, char**argv) const;
+		  const std::string &writeMode, bool noImpMissing, bool keepMissingPloidyX,
+		  int argc, char**argv) const;
     void writeVcfNonRef(const std::string &vcfFile, const std::string &outFile, int inputChrom,
 			int chromX, double bpStart, double bpEnd, const std::string &writeMode,
-			int argc, char **argv) const;
+			bool keepMissingPloidyX, int argc, char **argv) const;
     void makeHardCalls(uint64 n0start, uint64 n0end, uint seed);
     void initRefIter(int refIter);
     void buildHashTables(int iter, int batch, int seed);
diff --git a/src/EagleMain.cpp b/src/EagleMain.cpp
index c8a2617..12ce800 100644
--- a/src/EagleMain.cpp
+++ b/src/EagleMain.cpp
@@ -67,11 +67,12 @@ void phaseWithRef(EagleParams &params, Timer &timer, double t0, int argc, char *
   string outFile = params.outPrefix + "." + params.vcfOutSuffix;
   vector < vector < pair <int, int> > > conPSall; double snpRate;
   vector <bool> isTmpPhased;
-  SyncedVcfData vcfData(params.vcfRef, params.vcfTarget, params.allowRefAltSwap, params.chrom,
-			params.chromX, params.bpStart-params.bpFlanking,
+  SyncedVcfData vcfData(params.vcfRef, params.vcfTarget, params.vcfExclude, params.allowRefAltSwap,
+			params.chrom, params.chromX, params.bpStart-params.bpFlanking,
 			params.bpEnd+params.bpFlanking, params.geneticMapFile,
 			params.cMmax==0 ? 1 : params.cMmax, tmpFile, params.vcfWriteMode,
 			params.usePS, conPSall, snpRate, params.outputUnphased, isTmpPhased);
+  cout << endl << "Time for reading input: " << timer.update_time() << " sec" << endl << endl;
 
   Eagle eagle(vcfData.getNref(), vcfData.getNtarget(), vcfData.getMseg64(),
 	      vcfData.getGenoBits(), vcfData.getSeg64cMvecs(), params.pErr);
@@ -187,7 +188,7 @@ void phaseWithRef(EagleParams &params, Timer &timer, double t0, int argc, char *
   timer.update_time();
   cout << "Writing " << params.vcfOutSuffix << " output to " << outFile << endl;
   eagle.writeVcf(tmpFile, isTmpPhased, outFile, params.chromX, params.bpStart, params.bpEnd,
-		 params.vcfWriteMode, params.noImpMissing, argc, argv);
+		 params.vcfWriteMode, params.noImpMissing, params.keepMissingPloidyX, argc, argv);
   cout << "Time for writing output: " << timer.update_time() << endl;
 
   cout << "Total elapsed time for analysis = " << (timer.get_time()-t0) << " sec" << endl;
@@ -215,7 +216,7 @@ int main(int argc, char *argv[]) {
   cout << "                      +-----------------------------+" << endl;
   cout << endl;
 
-  cout << "Copyright (C) 2015-2016 Harvard University." << endl;
+  cout << "Copyright (C) 2015-2017 Harvard University." << endl;
   cout << "Distributed under the GNU GPLv3+ open source license." << endl << endl;
 
   //cout << "Boost version: " << BOOST_LIB_VERSION << endl;
@@ -285,6 +286,9 @@ int main(int argc, char *argv[]) {
     int bpSpan = genoData.getSnps().back().physpos - genoData.getSnps()[0].physpos;
     double snpsPerMb = genoData.getSnps().size() / (bpSpan*1e-6);
     if (snpsPerMb > 1000) params.pbwtOnly = true;
+
+    if (genoData.getN() > 200000U) params.pbwtOnly = true;
+
     if (params.pbwtOnly) params.runStep2 = 0;
 
     // if --runStep2 hasn't yet been set, SNP density must be low; run Step 2 unless too few chunks
@@ -499,7 +503,8 @@ int main(int argc, char *argv[]) {
       string outFile = params.outPrefix + "." + params.vcfOutSuffix;
       cout << "Writing " << params.vcfOutSuffix << " output to " << outFile << endl;
       eagle.writeVcfNonRef(params.vcfFile, outFile, params.chrom, params.chromX, params.bpStart,
-			   params.bpEnd, params.vcfWriteMode, argc, argv);
+			   params.bpEnd, params.vcfWriteMode, params.keepMissingPloidyX, argc,
+			   argv);
     }
     else {
       cout << "Writing .haps.gz and .sample output" << endl; timer.update_time();
diff --git a/src/EaglePBWT.cpp b/src/EaglePBWT.cpp
index 9d20c8a..686af3b 100644
--- a/src/EaglePBWT.cpp
+++ b/src/EaglePBWT.cpp
@@ -529,7 +529,7 @@ namespace EAGLE {
       const uint64 m64jNext = t==(int) splits64j.size() ? Mseg64*64 : splits64j[t];
       vector <uint64> miss64j;
       for (uint64 m64j = m64jPrev+1; m64j < m64jNext; m64j++)
-	if (genos64j[m64j] == 3 && impMissing) // missing
+	if (maskSnps64j[m64j] && genos64j[m64j] == 3 && impMissing) // missing
 	  miss64j.push_back(m64j);
 
       // orient each hap pair wrt called phase
@@ -582,8 +582,9 @@ namespace EAGLE {
 
 	// initialize allele dosages
 	int nMiss = miss64j.size();
-	double alleleDoses[nMiss][2];
-	for (int m = 0; m < nMiss; m++) alleleDoses[m][0] = alleleDoses[m][1] = 0;
+	struct DosePair { double d[2]; };
+	vector <DosePair> alleleDoses(nMiss);
+	for (int m = 0; m < nMiss; m++) alleleDoses[m].d[0] = alleleDoses[m].d[1] = 0;
 
 	// process non-ends: call using longer of fwd, rev ref samples
 	double meanLens[2] = {0, 0};
@@ -595,7 +596,7 @@ namespace EAGLE {
 	for (uint k = 0; k < nonEndRefs[fbLong].size(); k++) {
 	  int refSeq = nonEndRefs[fbLong][k];
 	  for (int m = 0; m < nMiss; m++)
-	    alleleDoses[m][(haploBitsT[refSeq*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] += 1;
+	    alleleDoses[m].d[(haploBitsT[refSeq*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] += 1;
 	}
 	  
 	// compute mean lengths including ends=0 for phasing singletons later
@@ -653,9 +654,9 @@ namespace EAGLE {
 		else
 		  break;
 	      }
-	      alleleDoses[m][(haploBitsT[refSeqFwd*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
+	      alleleDoses[m].d[(haploBitsT[refSeqFwd*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
 		cMcum / cMtot;
-	      alleleDoses[m][(haploBitsT[refSeqRev*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
+	      alleleDoses[m].d[(haploBitsT[refSeqRev*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
 		1 - cMcum / cMtot;
 	    }
 	  }
@@ -664,7 +665,7 @@ namespace EAGLE {
 	// make final calls
 	for (int m = 0; m < nMiss; m++) {
 	  uint64 m64 = miss64j[m]/64, j = miss64j[m]&63;
-	  if (alleleDoses[m][0] >= alleleDoses[m][1])
+	  if (alleleDoses[m].d[0] >= alleleDoses[m].d[1])
 	    tmpHaploBitsT[(nTargetHap+h)*Mseg64 + m64] &= ~(1ULL<<j);
 	  else
 	    tmpHaploBitsT[(nTargetHap+h)*Mseg64 + m64] |= 1ULL<<j;
diff --git a/src/EagleParams.cpp b/src/EagleParams.cpp
index 2509c23..06ddf82 100644
--- a/src/EagleParams.cpp
+++ b/src/EagleParams.cpp
@@ -83,9 +83,13 @@ namespace EAGLE {
        "tabix-indexed [compressed] VCF/BCF file for target genotypes")
       ("vcfOutFormat", po::value<string>(&vcfOutFormat)->default_value("."),
        "b|u|z|v: compressed BCF (b), uncomp BCF (u), compressed VCF (z), uncomp VCF (v)")
-      ("noImpMissing", "disable imputation of missing ./. target genotypes")
+      ("noImpMissing", "disable imputation of missing target genotypes (. or ./.)")
       ("allowRefAltSwap", "allow swapping of REF/ALT in target vs. ref VCF")
       ("outputUnphased", "output unphased sites (target-only, multi-allelic, etc.)")
+      ("keepMissingPloidyX",
+       "assume missing genotypes have correct ploidy (.=haploid, ./.=diploid)")
+      ("vcfExclude", po::value<string>(&vcfExclude),
+       "tabix-indexed [compressed] VCF/BCF file containing variants to exclude from phasing")
       ;
 
     po::options_description bothModes("Region selection options");
@@ -229,6 +233,7 @@ namespace EAGLE {
       noImpMissing = vm.count("noImpMissing");
       allowRefAltSwap = vm.count("allowRefAltSwap");
       outputUnphased = vm.count("outputUnphased");
+      keepMissingPloidyX = vm.count("keepMissingPloidyX");
 
       if (vm.count("vcfRef") || vm.count("vcfTarget") || vm.count("vcf")) { // VCF mode
 	if (vm.count("vcf")) { // non-ref mode
@@ -254,6 +259,9 @@ namespace EAGLE {
 	    cerr << "ERROR: --pbwtIters cannot be greater than 1 if --noImpMissing is set" << endl;
 	    return false;
 	  }
+	  if (vcfRef.substr(vcfRef.length()-4) != string(".bcf")) {
+	    cerr << "WARNING: --vcfRef does not end in '.bcf'; BCF input is fastest" << endl;
+	  }
 	}
 
 	// vcf input checks for both ref and non-ref mode
@@ -365,6 +373,7 @@ namespace EAGLE {
       FileUtils::requireEmptyOrReadable(bedFile);
       FileUtils::requireEmptyOrReadable(vcfRef);
       FileUtils::requireEmptyOrReadable(vcfTarget);
+      FileUtils::requireEmptyOrReadable(vcfExclude);
       if (geneticMapFile != "USE_BIM") {
 	vector <string> reqHeader;
 	reqHeader.push_back("chr"); reqHeader.push_back("position");
diff --git a/src/EagleParams.hpp b/src/EagleParams.hpp
index 9470418..b53222f 100644
--- a/src/EagleParams.hpp
+++ b/src/EagleParams.hpp
@@ -30,7 +30,7 @@ namespace EAGLE {
   public:
 
     // main input files
-    std::string famFile, bimFile, bedFile, vcfFile, vcfRef, vcfTarget;
+    std::string famFile, bimFile, bedFile, vcfFile, vcfRef, vcfTarget, vcfExclude;
     int chrom, chromX;
 
     // optional reference map file for filling in genpos
@@ -45,6 +45,7 @@ namespace EAGLE {
     // outFormat b|u|z|v -> outSuffix bcf|bcf|vcf.gz|vcf, writeMode wb|wbu|wz|w
     bool noImpMissing;
     bool outputUnphased; // output unphased sites (target-only, multi-allelic, etc.)
+    bool keepMissingPloidyX; // assume missing genotypes in VCF have correct ploidy (.=haploid, ./.=diploid)
     int usePS; // use FORMAT:PS phase constraints: 1=soft, 2=harder
 
     bool allowRefAltSwap; // in reference-based phasing mode
diff --git a/src/MapInterpolater.cpp b/src/MapInterpolater.cpp
index b3d1f93..4ef80f4 100644
--- a/src/MapInterpolater.cpp
+++ b/src/MapInterpolater.cpp
@@ -62,6 +62,16 @@ namespace Genetics {
 	  cerr << "       " << chr << " " << bp << " " << rate << " " << gen << endl;
 	  exit(1);
 	}
+	if (bp<=0) {
+	  cerr << "ERROR: Genetic map positions must be positive:" << endl;
+	  cerr << "       " << chr << " " << bp << " " << rate << " " << gen << endl;
+	  exit(1);
+	}
+	if (bp==bp0) {
+	  cerr << "ERROR: Genetic map contains duplicate position:" << endl;
+	  cerr << "       " << chr << " " << bp << " " << rate << " " << gen << endl;
+	  exit(1);
+	}
 	chrBpToRateGen[make_pair(chr, bp)] = make_pair((gen-gen0)/(1e-6*(bp-bp0)), gen);
       }
       chr0 = chr; bp0 = bp; gen0 = gen;
diff --git a/src/SyncedVcfData.cpp b/src/SyncedVcfData.cpp
index d8b0b2b..7a2ccd9 100644
--- a/src/SyncedVcfData.cpp
+++ b/src/SyncedVcfData.cpp
@@ -159,17 +159,19 @@ namespace EAGLE {
       }
   }
 
+  // fills in chrom if chrom==0
   vector < pair <int, int> > SyncedVcfData::processVcfs
-  (const string &vcfRef, const string &vcfTarget, bool allowRefAltSwap, int chrom, int chromX,
-   double bpStart, double bpEnd, vector <bool> &hapsRef, vector <uchar> &genosTarget,
-   const string &tmpFile, const string &writeMode, int usePS,
+  (const string &vcfRef, const string &vcfTarget, const string &vcfExclude, bool allowRefAltSwap,
+   int &chrom, int chromX, double bpStart, double bpEnd, vector <bool> &hapsRef,
+   vector <uchar> &genosTarget, const string &tmpFile, const string &writeMode, int usePS,
    vector < vector < pair <int, int> > > &conPSall, bool outputUnphased,
    vector <bool> &isTmpPhased) {
 
     vector < pair <int, int> > chrBps;
 
     bcf_srs_t *sr = bcf_sr_init();
-    bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_BOTH_REF);
+    bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC,
+		   allowRefAltSwap ? BCF_SR_PAIR_BOTH_REF : BCF_SR_PAIR_EXACT);
     bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX);
 
     if ( chrom!=0 )
@@ -181,7 +183,7 @@ namespace EAGLE {
             ksprintf(&str,"X:%d-%d,chrX:%d-%d",(uint32_t)bpStart,(uint32_t)bpEnd,(uint32_t)bpStart,(uint32_t)bpEnd);
         }
         else
-            ksprintf(&str,"%d:%d-%d",chrom,(uint32_t)bpStart,(uint32_t)bpEnd);
+            ksprintf(&str,"%d:%d-%d,chr%d:%d-%d",chrom,(uint32_t)bpStart,(uint32_t)bpEnd,chrom,(uint32_t)bpStart,(uint32_t)bpEnd);
         if ( bcf_sr_set_regions(sr, str.s, 0)!=0 )
         {
             cerr << "ERROR: failed to initialize the region:" << str.s;
@@ -200,6 +202,12 @@ namespace EAGLE {
 	   << bcf_sr_strerror(sr->errnum) << endl;
       exit(1);
     }
+    bool useExclude = !vcfExclude.empty();
+    if (useExclude && !bcf_sr_add_reader(sr, vcfExclude.c_str())) {
+      cerr << "ERROR: Could not open " << vcfExclude << " for reading: "
+	   << bcf_sr_strerror(sr->errnum) << endl;
+      exit(1);
+    }
 
     bcf_hdr_t *ref_hdr = bcf_sr_get_header(sr, 0);
     bcf_hdr_t *tgt_hdr = bcf_sr_get_header(sr, 1);
@@ -229,7 +237,7 @@ namespace EAGLE {
     cout << "Target samples: Ntarget = " << Ntarget << endl;
 
     M = 0;
-    uint MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0,
+    uint Mexclude = 0, MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0,
       MmultiAllelicRef = 0, MmonomorphicRef = 0;
     uint MwithMissingRef = 0, MwithUnphasedRef = 0, MnotInRegion = 0, MnotOnChrom = 0;
     uint MrefAltError = 0, numRefAltSwaps = 0;
@@ -244,6 +252,11 @@ namespace EAGLE {
       {
 	bcf1_t *ref = bcf_sr_get_line(sr, 0);
 	bcf1_t *tgt = bcf_sr_get_line(sr, 1);
+	if (useExclude && bcf_sr_get_line(sr, 2)) {
+	  Mexclude++;
+	  if (outputUnphased && tgt) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+	  continue;
+	}
 	if ( !ref ) {
 	  //fprintf(stderr, "onlyT .. %s:%d\n", bcf_seqname(tgt_hdr, tgt), tgt->pos+1);
 	  bcf_unpack(tgt, BCF_UN_STR); // unpack thru ALT
@@ -261,53 +274,47 @@ namespace EAGLE {
 	}
 	//fprintf(stderr, "match .. %s:%d\n", bcf_seqname(ref_hdr, ref), ref->pos+1);
 
-	// filter out multi-allelic and monomorphic markers
+	// deal with multi-allelic and monomorphic markers
+	// drop multi-allelic target markers
 	int ntgt_gt = bcf_get_genotypes(tgt_hdr, tgt, &tgt_gt, &mtgt_gt);
 	if (tgt->n_allele > 2) {
 	  MmultiAllelicTgt++;
 	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
 	  continue;
 	}
-    if (ref->n_allele==1)
-    {
         // drop monomorphic reference markers
-        MmonomorphicRef++;
-	if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
-        continue;
-    }
-
-    // preserve monomorphic markers if biallelic in the reference panel
-    if (tgt->n_allele < 2)
-    {
-        bcf_update_alleles(tgt_hdr, tgt, (const char**)ref->d.allele,ref->n_allele);
-    }
+	if (ref->n_allele==1) {
+	  MmonomorphicRef++;
+	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+	  continue;
+	}
+	// preserve monomorphic markers if biallelic in the reference panel
+	if (tgt->n_allele < 2) {
+	  bcf_update_alleles(tgt_hdr, tgt, (const char**)ref->d.allele,ref->n_allele);
+	}
+	// drop multi-allelic reference markers
+	if (ref->n_allele > 2) {
+	  MmultiAllelicRef++;
+	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+	  continue;
+	}
 
+	// check for REF/ALT swaps
 	bool refAltSwap = false;
-
-        if (allowRefAltSwap) { // perform further error-checking
-	  if (ref->n_allele > 2) {
-	    MmultiAllelicRef++;
-	    if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
-	    continue;
-	  }
-	  /*
-	  printf("tgt REF=%s, ALT=%s   ref REF=%s, ALT=%s\n", tgt->d.allele[0], tgt->d.allele[1],
-		 ref->d.allele[0], ref->d.allele[1]);
-	  */
-	  if (strcmp(tgt->d.allele[0], ref->d.allele[0]) == 0 &&
-	      strcmp(tgt->d.allele[1], ref->d.allele[1]) == 0) {
-	    refAltSwap = false;
-	  }
-	  else if (strcmp(tgt->d.allele[0], ref->d.allele[1]) == 0 &&
-		   strcmp(tgt->d.allele[1], ref->d.allele[0]) == 0) {
-	    refAltSwap = true;
-	    numRefAltSwaps++;
-	  }
-	  else {
-	    MrefAltError++;
-	    if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
-	    continue;	    
-	  }
+	if (strcmp(tgt->d.allele[0], ref->d.allele[0]) == 0 &&
+	    strcmp(tgt->d.allele[1], ref->d.allele[1]) == 0) {
+	  refAltSwap = false;
+	}
+	else if (allowRefAltSwap && bcf_is_snp(tgt) && // allow swap if --allowRefAltSwap is set
+		 strcmp(tgt->d.allele[0], ref->d.allele[1]) == 0 &&
+		 strcmp(tgt->d.allele[1], ref->d.allele[0]) == 0) {
+	  refAltSwap = true;
+	  numRefAltSwaps++;
+	}
+	else {
+	  MrefAltError++;
+	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+	  continue;	    
 	}
 
     // Check the chromosome: if region was requested (chrom is set), synced
@@ -383,7 +390,7 @@ namespace EAGLE {
     cout << endl;
     cout << "SNPs ignored: " << MtargetOnly << " SNPs in target but not reference" << endl;
     if (MtargetOnly > M/10U)
-      cerr << "              --> WARNING: Check REF/ALT agreement between target and ref <--"
+      cerr << "              --> WARNING: Check REF/ALT agreement between target and ref? <--"
 	   << endl;
     cout << "              " << MrefOnly << " SNPs in reference but not target" << endl;
     if (MnotOnChrom)
@@ -397,7 +404,9 @@ namespace EAGLE {
     if (MmonomorphicRef)
       cout << "              " << MmonomorphicRef << " SNPs biallelic in target but monomorphic in ref" << endl;
     if (MrefAltError)
-      cout << "              " << MrefAltError << " SNPs with REF/ALT matching errors" << endl;
+      cout << "              " << MrefAltError << " SNPs with allele mismatches" << endl;
+    if (Mexclude)
+      cout << "              " << Mexclude << " SNPs excluded based on --vcfExclude" << endl;
     cout << endl;
     
     if (MwithMissingRef) {
@@ -492,20 +501,22 @@ namespace EAGLE {
    * reads ref+target vcf data
    * writes target[isec] to tmpFile
    * fills in cM coordinates and seg64cMvecs, genoBits
+   * fills in chrom if chrom==0
    */
-  SyncedVcfData::SyncedVcfData(const string &vcfRef, const string &vcfTarget, bool allowRefAltSwap,
-			       int chrom, int chromX, double bpStart, double bpEnd,
-			       const string &geneticMapFile, double cMmax, const string &tmpFile,
-			       const string &writeMode, int usePS,
-			       vector < vector < pair <int, int> > > &conPSall, double &snpRate,
-			       bool outputUnphased, vector <bool> &isTmpPhased) {
+  SyncedVcfData::SyncedVcfData
+  (const string &vcfRef, const string &vcfTarget, const string &vcfExclude, bool allowRefAltSwap,
+   int &chrom, int chromX, double bpStart, double bpEnd, const string &geneticMapFile,
+   double cMmax, const string &tmpFile, const string &writeMode, int usePS,
+   vector < vector < pair <int, int> > > &conPSall, double &snpRate, bool outputUnphased,
+   vector <bool> &isTmpPhased) {
 
     // perform synced read
     vector <bool> hapsRef;     // M*2*Nref
     vector <uchar> genosTarget; // M*Ntarget
     vector < pair <int, int> > chrBps = 
-      processVcfs(vcfRef, vcfTarget, allowRefAltSwap, chrom, chromX, bpStart, bpEnd, hapsRef,
-		  genosTarget, tmpFile, writeMode, usePS, conPSall, outputUnphased, isTmpPhased);
+      processVcfs(vcfRef, vcfTarget, vcfExclude, allowRefAltSwap, chrom, chromX, bpStart, bpEnd,
+		  hapsRef, genosTarget, tmpFile, writeMode, usePS, conPSall, outputUnphased,
+		  isTmpPhased);
 
     // interpolate genetic coordinates
     vector <double> cMs = processMap(chrBps, geneticMapFile);
diff --git a/src/SyncedVcfData.hpp b/src/SyncedVcfData.hpp
index 9d4543a..76fe66b 100644
--- a/src/SyncedVcfData.hpp
+++ b/src/SyncedVcfData.hpp
@@ -40,11 +40,12 @@ namespace EAGLE {
     std::vector <std::string> targetIDs;
 
     std::vector < std::pair <int, int> > processVcfs
-    (const std::string &vcfRef, const std::string &vcfTarget, bool allowRefAltSwap, int chrom,
-     int chromX, double bpStart, double bpEnd, std::vector <bool> &hapsRef,
-     std::vector <uchar> &genosTarget, const std::string &tmpFile, const std::string &writeMode,
-     int usePS, std::vector < std::vector < std::pair <int, int> > > &conPSall,
-     bool outputUnphased, std::vector <bool> &isTmpPhased);
+    (const std::string &vcfRef, const std::string &vcfTarget, const std::string &vcfExclude,
+     bool allowRefAltSwap, int &chrom, int chromX, double bpStart, double bpEnd,
+     std::vector <bool> &hapsRef, std::vector <uchar> &genosTarget, const std::string &tmpFile,
+     const std::string &writeMode, int usePS,
+     std::vector < std::vector < std::pair <int, int> > > &conPSall, bool outputUnphased,
+     std::vector <bool> &isTmpPhased);
     std::vector <double> processMap(std::vector < std::pair <int, int> > &chrBps,
 				    const std::string &geneticMapFile);
     void buildGenoBits(const std::vector <bool> &hapsRef, const std::vector <uchar> &genosTarget,
@@ -56,10 +57,10 @@ namespace EAGLE {
      * writes target[isec] to tmpFile
      * fills in cM coordinates and seg64cMvecs, genoBits
      */
-    SyncedVcfData(const std::string &vcfRef, const std::string &vcfTarget, bool allowRefAltSwap,
-		  int chrom, int chromX, double bpStart, double bpEnd,
-		  const std::string &geneticMapFile, double cMmax, const std::string &tmpFile,
-		  const std::string &writeMode, int usePS,
+    SyncedVcfData(const std::string &vcfRef, const std::string &vcfTarget,
+		  const std::string &vcfExclude, bool allowRefAltSwap, int &chrom, int chromX,
+		  double bpStart, double bpEnd, const std::string &geneticMapFile, double cMmax,
+		  const std::string &tmpFile, const std::string &writeMode, int usePS,
 		  std::vector < std::vector < std::pair <int, int> > > &conPSall, double &snpRate,
 		  bool outputUnphased, std::vector <bool> &isTmpPhased);
 
diff --git a/src/Version.hpp b/src/Version.hpp
index 07ad47e..00c4380 100644
--- a/src/Version.hpp
+++ b/src/Version.hpp
@@ -19,7 +19,7 @@
 #ifndef VERSION_HPP
 #define VERSION_HPP
 
-const char EAGLE_VERSION[] = "2.3.5";
-const char EAGLE_VERSION_DATE[] = "August 2, 2017";
+const char EAGLE_VERSION[] = "2.4";
+const char EAGLE_VERSION_DATE[] = "December 13, 2017";
 
 #endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/eagle.git



More information about the debian-med-commit mailing list