[med-svn] [eagle] 01/02: Imported Upstream version 2.3.2

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Thu Apr 13 21:48:02 UTC 2017


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository eagle.

commit 4625e661ec9b9feef169bc1d2f04f659f2149e18
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Thu Apr 13 23:47:05 2017 +0200

    Imported Upstream version 2.3.2
---
 example/example.log          |  68 ++++++++++++++++++++---------------------
 example/example_ref.log      |  16 +++++-----
 example/example_vcf.log      |  70 +++++++++++++++++++++----------------------
 example/phased.vcf.gz        | Bin 200385 -> 200384 bytes
 example/target.phased.vcf.gz | Bin 21050 -> 21053 bytes
 src/Eagle.cpp                |  22 ++++++++++++--
 src/Eagle.hpp                |   6 ++--
 src/EagleMain.cpp            |   5 ++--
 src/EagleParams.cpp          |   8 +++--
 src/EagleParams.hpp          |   1 +
 src/Makefile                 |   4 +--
 src/SyncedVcfData.cpp        |  63 ++++++++++++++++++++------------------
 src/SyncedVcfData.hpp        |   6 ++--
 src/Version.hpp              |   4 +--
 14 files changed, 149 insertions(+), 124 deletions(-)

diff --git a/example/example.log b/example/example.log
index bd2d755..65ea5a4 100644
--- a/example/example.log
+++ b/example/example.log
@@ -1,7 +1,7 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.1              |
-                      |   March 3, 2017             |
+                      |   Eagle v2.3.2              |
+                      |   March 23, 2017            |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
@@ -56,25 +56,25 @@ Setting --histFactor=1.00
 
 BEGINNING STEP 1
 
-Time for step 1: 0.380579
-Time for step 1 MN^2: 0.024638
+Time for step 1: 0.325358
+Time for step 1 MN^2: 0.0202137
 
-Making hard calls (time: 0.0283101)
+Making hard calls (time: 0.0190668)
 
 
 BEGINNING STEP 2
 
 BATCH 1 OF 1
 Building hash tables
-.................................................................. (time: 0.0659139)
+.................................................................. (time: 0.161656)
 
 Phasing samples 1-379
-Time for phasing batch: 0.466222
+Time for phasing batch: 0.384582
 
-Making hard calls (time: 0.029881)
+Making hard calls (time: 0.0192499)
 
-Time for step 2: 0.562044
-Time for step 2 MN^2: 0.0596148
+Time for step 2: 0.5655
+Time for step 2 MN^2: 0.050123
 
 
 BEGINNING STEP 3 (PBWT ITERS)
@@ -87,108 +87,108 @@ BEGINNING PBWT ITER 1
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 1.84304
+Time for phasing batch: 2.14832
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 1.67975
+Time for phasing batch: 1.47815
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 1.79137
+Time for phasing batch: 1.60704
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 1.71214
+Time for phasing batch: 1.69579
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 1.62954
+Time for phasing batch: 1.6183
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 1.67016
+Time for phasing batch: 1.5888
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 1.66139
+Time for phasing batch: 1.69022
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 1.6077
+Time for phasing batch: 1.39507
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 1.63602
+Time for phasing batch: 1.72671
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 1.69574
+Time for phasing batch: 1.57987
 
-Time for PBWT iter 1: 16.9269
+Time for PBWT iter 1: 16.5283
 
 BEGINNING PBWT ITER 2
 
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 2.81822
+Time for phasing batch: 2.41118
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 2.59811
+Time for phasing batch: 2.39643
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 2.79582
+Time for phasing batch: 2.82551
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 2.55964
+Time for phasing batch: 2.8835
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 2.70656
+Time for phasing batch: 3.0855
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 2.66884
+Time for phasing batch: 2.37932
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 2.66954
+Time for phasing batch: 2.45946
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 2.51312
+Time for phasing batch: 2.54506
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 2.58987
+Time for phasing batch: 2.71921
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 2.608
+Time for phasing batch: 2.58614
 
-Time for PBWT iter 2: 26.5278
+Time for PBWT iter 2: 26.2913
 Writing .haps.gz and .sample output
-Time for writing output: 0.354415
-Total elapsed time for analysis = 44.9045 sec
+Time for writing output: 0.359584
+Total elapsed time for analysis = 44.334 sec
diff --git a/example/example_ref.log b/example/example_ref.log
index f53b54d..ec27b0e 100644
--- a/example/example_ref.log
+++ b/example/example_ref.log
@@ -1,7 +1,7 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.1              |
-                      |   March 3, 2017             |
+                      |   Eagle v2.3.2              |
+                      |   March 23, 2017            |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
@@ -29,11 +29,9 @@ Target samples: Ntarget = 8
 [W::vcf_parse] INFO 'EX_TARGET' is not defined in the header, assuming Type=String
 SNPs to analyze: M = 430 SNPs in both target and reference
 
-SNPs ignored: 215 SNPs in target but not reference
-              --> WARNING: Check REF/ALT agreement between target and ref <--
+SNPs ignored: 0 SNPs in target but not reference
               215 SNPs in reference but not target
               0 multi-allelic SNPs in target
-              0 monomorphic SNPs in target
 
 Missing rate in target genotypes: 0.00116279
 
@@ -58,15 +56,15 @@ PHASING ITER 1 OF 1
 
 Phasing target samples
 ................................................................................
-Time for phasing iter 1: 0.236171
-Writing vcf output to target.phased.vcf
+Time for phasing iter 1: 0.181584
+Writing vcf.gz output to target.phased.vcf.gz
 [W::vcf_parse] INFO 'AC' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'AN' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'DP' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'AFR_AF' is not defined in the header, assuming Type=String
 [W::vcf_parse] INFO 'EX_TARGET' is not defined in the header, assuming Type=String
-Time for writing output: 0.051106
-Total elapsed time for analysis = 9.22206 sec
+Time for writing output: 0.0341809
+Total elapsed time for analysis = 6.69493 sec
 
 Mean phase confidence of each target individual:
 ID	PHASE_CONFIDENCE
diff --git a/example/example_vcf.log b/example/example_vcf.log
index 6932e15..4affef4 100644
--- a/example/example_vcf.log
+++ b/example/example_vcf.log
@@ -1,7 +1,7 @@
                       +-----------------------------+
                       |                             |
-                      |   Eagle v2.3.1              |
-                      |   March 3, 2017             |
+                      |   Eagle v2.3.2              |
+                      |   March 23, 2017            |
                       |   Po-Ru Loh                 |
                       |                             |
                       +-----------------------------+
@@ -39,25 +39,25 @@ Setting --histFactor=1.00
 
 BEGINNING STEP 1
 
-Time for step 1: 0.383094
-Time for step 1 MN^2: 0.024787
+Time for step 1: 0.252695
+Time for step 1 MN^2: 0.0173362
 
-Making hard calls (time: 0.0283229)
+Making hard calls (time: 0.018297)
 
 
 BEGINNING STEP 2
 
 BATCH 1 OF 1
 Building hash tables
-.................................................................. (time: 0.0526478)
+.................................................................. (time: 0.0494721)
 
 Phasing samples 1-379
-Time for phasing batch: 0.469969
+Time for phasing batch: 0.308583
 
-Making hard calls (time: 0.029753)
+Making hard calls (time: 0.019398)
 
-Time for step 2: 0.552398
-Time for step 2 MN^2: 0.0597794
+Time for step 2: 0.377476
+Time for step 2 MN^2: 0.0407339
 
 
 BEGINNING STEP 3 (PBWT ITERS)
@@ -70,108 +70,108 @@ BEGINNING PBWT ITER 1
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 1.77488
+Time for phasing batch: 1.59837
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 1.69304
+Time for phasing batch: 1.56651
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 1.74526
+Time for phasing batch: 1.463
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 1.64103
+Time for phasing batch: 1.61652
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 1.66061
+Time for phasing batch: 1.54799
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 1.66104
+Time for phasing batch: 1.63484
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 1.6719
+Time for phasing batch: 1.78759
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 1.58273
+Time for phasing batch: 1.4086
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 1.63833
+Time for phasing batch: 1.4037
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 1.6582
+Time for phasing batch: 1.66127
 
-Time for PBWT iter 1: 16.727
+Time for PBWT iter 1: 15.6884
 
 BEGINNING PBWT ITER 2
 
 BATCH 1 OF 10
 
 Phasing samples 1-37
-Time for phasing batch: 2.75391
+Time for phasing batch: 2.77417
 
 BATCH 2 OF 10
 
 Phasing samples 38-75
-Time for phasing batch: 2.66628
+Time for phasing batch: 2.84674
 
 BATCH 3 OF 10
 
 Phasing samples 76-113
-Time for phasing batch: 2.74627
+Time for phasing batch: 3.50577
 
 BATCH 4 OF 10
 
 Phasing samples 114-151
-Time for phasing batch: 2.55569
+Time for phasing batch: 2.62343
 
 BATCH 5 OF 10
 
 Phasing samples 152-189
-Time for phasing batch: 2.56301
+Time for phasing batch: 2.34182
 
 BATCH 6 OF 10
 
 Phasing samples 190-227
-Time for phasing batch: 2.63885
+Time for phasing batch: 2.38923
 
 BATCH 7 OF 10
 
 Phasing samples 228-265
-Time for phasing batch: 2.61097
+Time for phasing batch: 2.38542
 
 BATCH 8 OF 10
 
 Phasing samples 266-303
-Time for phasing batch: 2.51258
+Time for phasing batch: 2.40897
 
 BATCH 9 OF 10
 
 Phasing samples 304-341
-Time for phasing batch: 2.57989
+Time for phasing batch: 2.41107
 
 BATCH 10 OF 10
 
 Phasing samples 342-379
-Time for phasing batch: 2.57872
+Time for phasing batch: 2.17693
 
-Time for PBWT iter 2: 26.2062
-Writing vcf output to phased.vcf
-Time for writing output: 0.168121
-Total elapsed time for analysis = 53.6103 sec
+Time for PBWT iter 2: 25.8636
+Writing vcf.gz output to phased.vcf.gz
+Time for writing output: 0.245132
+Total elapsed time for analysis = 49.358 sec
diff --git a/example/phased.vcf.gz b/example/phased.vcf.gz
index 962c54b..76de287 100644
Binary files a/example/phased.vcf.gz and b/example/phased.vcf.gz differ
diff --git a/example/target.phased.vcf.gz b/example/target.phased.vcf.gz
index f74a9f3..2ad75cb 100644
Binary files a/example/target.phased.vcf.gz and b/example/target.phased.vcf.gz differ
diff --git a/src/Eagle.cpp b/src/Eagle.cpp
index 4860b60..3fe0fef 100644
--- a/src/Eagle.cpp
+++ b/src/Eagle.cpp
@@ -3216,9 +3216,9 @@ namespace EAGLE {
     bcf_hdr_sync(hdr);
   }
 
-  void Eagle::writeVcf(const string &tmpFile, const string &outFile, int chromX, double bpStart,
-		       double bpEnd, const string &writeMode, bool noImpMissing, int argc,
-		       char **argv) const {
+  void Eagle::writeVcf(const string &tmpFile, const vector <bool> &isTmpPhased,
+		       const string &outFile, int chromX, double bpStart, double bpEnd,
+		       const string &writeMode, bool noImpMissing, int argc, char **argv) const {
 
     htsFile *htsTmp = hts_open(tmpFile.c_str(), "r");
     if (htsTmp == NULL) {
@@ -3242,7 +3242,23 @@ namespace EAGLE {
 
     vector <int> mostRecentPloidy(N-Nref, 2);
 
+    int tmpLineNum = -1; // index in tmp file (corresponding to isTmpPhased)
     while (bcf_read(htsTmp, hdr, rec) >= 0) {
+      tmpLineNum++;
+      if (!isTmpPhased[tmpLineNum]) { // site was not phased; remove phase information and output
+	int bp = rec->pos+1;
+	if (bpStart <= bp && bp <= bpEnd) { // check if within output region
+	  int ntgt_gt = bcf_get_genotypes(hdr, rec, &tgt_gt, &mtgt_gt);
+	  for (int k = 0; k < ntgt_gt; k++)
+	    if (tgt_gt[k] != bcf_int32_vector_end && !bcf_gt_is_missing(tgt_gt[k])) {
+	      int idx = bcf_gt_allele(tgt_gt[k]); // allele index
+	      tgt_gt[k] = bcf_gt_unphased(idx); // convert allele index to bcf value (unphased)
+	    }
+	  bcf_update_genotypes(hdr, rec, tgt_gt, ntgt_gt);
+	  bcf_write(out, hdr, rec);
+	}
+	continue;
+      }
 
       int chrom = StringUtils::bcfNameToChrom(bcf_hdr_id2name(hdr, rec->rid), 1, chromX);
 
diff --git a/src/Eagle.hpp b/src/Eagle.hpp
index dfe7bb8..5c43e0f 100644
--- a/src/Eagle.hpp
+++ b/src/Eagle.hpp
@@ -185,9 +185,9 @@ namespace EAGLE {
     void outputSE(const std::vector <uint> &children, const std::vector <uint> &nF1s,
 		  const std::vector <uint> &nF2s, int step) const;
     void writeHapsGzSample(const std::string &prefix) const;
-    void writeVcf(const std::string &tmpFile, const std::string &outFile, int chromX,
-		  double bpStart, double bpEnd, const std::string &writeMode, bool noImpMissing,
-		  int argc, char**argv) const;
+    void writeVcf(const std::string &tmpFile, const std::vector <bool> &isTmpPhased,
+		  const std::string &outFile, int chromX, double bpStart, double bpEnd,
+		  const std::string &writeMode, bool noImpMissing, int argc, char**argv) const;
     void writeVcfNonRef(const std::string &vcfFile, const std::string &outFile, int inputChrom,
 			int chromX, double bpStart, double bpEnd, const std::string &writeMode,
 			int argc, char **argv) const;
diff --git a/src/EagleMain.cpp b/src/EagleMain.cpp
index d2c28b1..c8a2617 100644
--- a/src/EagleMain.cpp
+++ b/src/EagleMain.cpp
@@ -66,11 +66,12 @@ void phaseWithRef(EagleParams &params, Timer &timer, double t0, int argc, char *
   string tmpFile = params.outPrefix + ".unphased." + params.vcfOutSuffix;
   string outFile = params.outPrefix + "." + params.vcfOutSuffix;
   vector < vector < pair <int, int> > > conPSall; double snpRate;
+  vector <bool> isTmpPhased;
   SyncedVcfData vcfData(params.vcfRef, params.vcfTarget, params.allowRefAltSwap, params.chrom,
 			params.chromX, params.bpStart-params.bpFlanking,
 			params.bpEnd+params.bpFlanking, params.geneticMapFile,
 			params.cMmax==0 ? 1 : params.cMmax, tmpFile, params.vcfWriteMode,
-			params.usePS, conPSall, snpRate);
+			params.usePS, conPSall, snpRate, params.outputUnphased, isTmpPhased);
 
   Eagle eagle(vcfData.getNref(), vcfData.getNtarget(), vcfData.getMseg64(),
 	      vcfData.getGenoBits(), vcfData.getSeg64cMvecs(), params.pErr);
@@ -185,7 +186,7 @@ void phaseWithRef(EagleParams &params, Timer &timer, double t0, int argc, char *
 
   timer.update_time();
   cout << "Writing " << params.vcfOutSuffix << " output to " << outFile << endl;
-  eagle.writeVcf(tmpFile, outFile, params.chromX, params.bpStart, params.bpEnd,
+  eagle.writeVcf(tmpFile, isTmpPhased, outFile, params.chromX, params.bpStart, params.bpEnd,
 		 params.vcfWriteMode, params.noImpMissing, argc, argv);
   cout << "Time for writing output: " << timer.update_time() << endl;
 
diff --git a/src/EagleParams.cpp b/src/EagleParams.cpp
index 7eb084a..2509c23 100644
--- a/src/EagleParams.cpp
+++ b/src/EagleParams.cpp
@@ -85,6 +85,7 @@ namespace EAGLE {
        "b|u|z|v: compressed BCF (b), uncomp BCF (u), compressed VCF (z), uncomp VCF (v)")
       ("noImpMissing", "disable imputation of missing ./. target genotypes")
       ("allowRefAltSwap", "allow swapping of REF/ALT in target vs. ref VCF")
+      ("outputUnphased", "output unphased sites (target-only, multi-allelic, etc.)")
       ;
 
     po::options_description bothModes("Region selection options");
@@ -227,6 +228,7 @@ namespace EAGLE {
       noMapCheck = vm.count("noMapCheck");
       noImpMissing = vm.count("noImpMissing");
       allowRefAltSwap = vm.count("allowRefAltSwap");
+      outputUnphased = vm.count("outputUnphased");
 
       if (vm.count("vcfRef") || vm.count("vcfTarget") || vm.count("vcf")) { // VCF mode
 	if (vm.count("vcf")) { // non-ref mode
@@ -268,10 +270,10 @@ namespace EAGLE {
 	}
 	if (vcfOutFormat == "b") { vcfOutSuffix = "bcf"; vcfWriteMode = "wb"; }
 	else if (vcfOutFormat == "u") { vcfOutSuffix = "bcf"; vcfWriteMode = "wbu"; }
-	else if (vcfOutFormat == "z") { vcfOutSuffix = "vcf.gz"; vcfWriteMode = "wz"; }
-	else if (vcfOutFormat == "v" || vcfOutFormat == ".") {
-	  vcfOutSuffix = "vcf"; vcfWriteMode = "w";
+	else if (vcfOutFormat == "z" || vcfOutFormat == ".") {
+	  vcfOutSuffix = "vcf.gz"; vcfWriteMode = "wz";
 	}
+	else if (vcfOutFormat == "v") { vcfOutSuffix = "vcf"; vcfWriteMode = "w"; }
 	else {
 	  cerr << "ERROR: --vcfOutFormat must be one of {b,u,z,v}" << endl;
 	  return false;
diff --git a/src/EagleParams.hpp b/src/EagleParams.hpp
index a0b6eb2..9470418 100644
--- a/src/EagleParams.hpp
+++ b/src/EagleParams.hpp
@@ -44,6 +44,7 @@ namespace EAGLE {
     std::string vcfOutFormat, vcfOutSuffix, vcfWriteMode;
     // outFormat b|u|z|v -> outSuffix bcf|bcf|vcf.gz|vcf, writeMode wb|wbu|wz|w
     bool noImpMissing;
+    bool outputUnphased; // output unphased sites (target-only, multi-allelic, etc.)
     int usePS; // use FORMAT:PS phase constraints: 1=soft, 2=harder
 
     bool allowRefAltSwap; // in reference-based phasing mode
diff --git a/src/Makefile b/src/Makefile
index e17235d..d7731c0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -7,8 +7,8 @@
 LBLAS = -lopenblas # alternatively -llapack (just need sgemm_)
 BLAS_DIR = /opt/openblas/0.2.14/lib
 BOOST_INSTALL_DIR = /home/pl88/boost_1_58_0/install
-HTSLIB_DIR = /groups/price/poru/external_software/htslib/htslib-1.3
-
+HTSLIB_DIR = /groups/price/poru/external_software/htslib/htslib-1.4
+### NOTE: to reduce dependencies from htslib, configure htslib with: ./configure --disable-bz2 --disable-lzma --disable-plugins --disable-libcurl --disable-gcs --disable-s3
 
 ### these paths are used only for static linking
 ZLIB_STATIC_DIR = /opt/zlib-1.2.8/lib # probably unnecessary on most systems
diff --git a/src/SyncedVcfData.cpp b/src/SyncedVcfData.cpp
index 4e72e3d..0acc785 100644
--- a/src/SyncedVcfData.cpp
+++ b/src/SyncedVcfData.cpp
@@ -163,12 +163,14 @@ namespace EAGLE {
   (const string &vcfRef, const string &vcfTarget, bool allowRefAltSwap, int chrom, int chromX,
    double bpStart, double bpEnd, vector <bool> &hapsRef, vector <uchar> &genosTarget,
    const string &tmpFile, const string &writeMode, int usePS,
-   vector < vector < pair <int, int> > > &conPSall) {
+   vector < vector < pair <int, int> > > &conPSall, bool outputUnphased,
+   vector <bool> &isTmpPhased) {
 
     vector < pair <int, int> > chrBps;
 
     bcf_srs_t *sr = bcf_sr_init();
-    sr->require_index = 1;
+    bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_BOTH_REF);
+    bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX);
 
     if ( chrom!=0 )
     {
@@ -188,17 +190,6 @@ namespace EAGLE {
         free(str.s);
     }
 
-    // By default, the synced reader requires that CHR, POS and ALT are the same
-    // in both files. If this is too strict and SNP/indel/all lines with the same
-    // position should be considered as matching, uncomment:
-    //
-    //      sr->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
-    //
-    // See also examples in bcftools/vcfisec etc.
-
-    if (allowRefAltSwap)
-      sr->collapse = COLLAPSE_SNPS;
-
     if (!bcf_sr_add_reader(sr, vcfRef.c_str())) {
       cerr << "ERROR: Could not open " << vcfRef << " for reading: " << bcf_sr_strerror(sr->errnum)
 	   << endl;
@@ -238,7 +229,7 @@ namespace EAGLE {
     cout << "Target samples: Ntarget = " << Ntarget << endl;
 
     M = 0;
-    uint MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0, MmonomorphicTgt = 0,
+    uint MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0,
       MmultiAllelicRef = 0, MmonomorphicRef = 0;
     uint MwithMissingRef = 0, MwithUnphasedRef = 0, MnotInRegion = 0, MnotOnChrom = 0;
     uint MrefAltError = 0, numRefAltSwaps = 0;
@@ -258,14 +249,14 @@ namespace EAGLE {
 	  bcf_unpack(tgt, BCF_UN_STR); // unpack thru ALT
 	  if (strcmp(tgt->d.allele[1], ".") != 0) // report if polymorphic in target
 	    MtargetOnly++;
+	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
 	  continue;
 	}
 	if ( !tgt ) {
 	  //fprintf(stderr, "onlyR .. %s:%d\n", bcf_seqname(ref_hdr, ref), ref->pos+1);
 	  bcf_unpack(ref, BCF_UN_STR); // unpack thru ALT
 	  if (strcmp(ref->d.allele[1], ".") != 0) // report if polymorphic in ref
-	    MtargetOnly++;
-	  MrefOnly++;
+	    MrefOnly++;
 	  continue;
 	}
 	//fprintf(stderr, "match .. %s:%d\n", bcf_seqname(ref_hdr, ref), ref->pos+1);
@@ -274,26 +265,31 @@ namespace EAGLE {
 	int ntgt_gt = bcf_get_genotypes(tgt_hdr, tgt, &tgt_gt, &mtgt_gt);
 	if (tgt->n_allele > 2) {
 	  MmultiAllelicTgt++;
+	  if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
 	  continue;
 	}
-	if (tgt->n_allele < 2) {
-	  MmonomorphicTgt++;
-	  continue;
-	}
+    if (ref->n_allele==1)
+    {
+        // drop monomorphic reference markers
+        MmonomorphicRef++;
+	if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+        continue;
+    }
+
+    // preserve monomorphic markers if biallelic in the reference panel
+    if (tgt->n_allele < 2)
+    {
+        bcf_update_alleles(tgt_hdr, tgt, (const char**)ref->d.allele,ref->n_allele);
+    }
 
 	bool refAltSwap = false;
 
         if (allowRefAltSwap) { // perform further error-checking
 	  if (ref->n_allele > 2) {
 	    MmultiAllelicRef++;
+	    if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
 	    continue;
 	  }
-	  if (ref->n_allele < 2) {
-	    MmonomorphicRef++;
-	    continue;
-	  }
-	  bcf_unpack(tgt, BCF_UN_STR); // unpack thru ALT
-	  bcf_unpack(ref, BCF_UN_STR); // unpack thru ALT
 	  /*
 	  printf("tgt REF=%s, ALT=%s   ref REF=%s, ALT=%s\n", tgt->d.allele[0], tgt->d.allele[1],
 		 ref->d.allele[0], ref->d.allele[1]);
@@ -309,6 +305,7 @@ namespace EAGLE {
 	  }
 	  else {
 	    MrefAltError++;
+	    if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
 	    continue;	    
 	  }
 	}
@@ -366,6 +363,7 @@ namespace EAGLE {
 
 	// print the record
 	bcf_write(out, tgt_hdr, tgt);
+	isTmpPhased.push_back(true);
       }
 
     bcf_sr_destroy(sr);
@@ -394,7 +392,6 @@ namespace EAGLE {
       cout << "              " << MnotInRegion << " SNPs not in selected region (+ flanks)"
 	   << endl;
     cout << "              " << MmultiAllelicTgt << " multi-allelic SNPs in target" << endl;
-    cout << "              " << MmonomorphicTgt << " monomorphic SNPs in target" << endl;
     if (MmultiAllelicRef)
       cout << "              " << MmultiAllelicRef << " SNPs biallelic in target but multi-allelic in ref" << endl;
     if (MmonomorphicRef)
@@ -420,6 +417,13 @@ namespace EAGLE {
     cout << "Missing rate in target genotypes: " << GmissingTarget / (double) M / Ntarget << endl;
     cout << endl;
 
+    int numTmpUnphased = 0;
+    for (uint j = 0; j < isTmpPhased.size(); j++)
+      numTmpUnphased += !isTmpPhased[j];
+    if (numTmpUnphased)
+      cout << "SNPs excluded from phasing that will be kept in output (unphased): "
+	   << numTmpUnphased << endl << endl;
+
     if (M <= 1U) {
       cerr << endl << "ERROR: Target and ref have too few matching SNPs (M = " << M << ")" << endl;
       exit(1);
@@ -493,14 +497,15 @@ namespace EAGLE {
 			       int chrom, int chromX, double bpStart, double bpEnd,
 			       const string &geneticMapFile, double cMmax, const string &tmpFile,
 			       const string &writeMode, int usePS,
-			       vector < vector < pair <int, int> > > &conPSall, double &snpRate) {
+			       vector < vector < pair <int, int> > > &conPSall, double &snpRate,
+			       bool outputUnphased, vector <bool> &isTmpPhased) {
 
     // perform synced read
     vector <bool> hapsRef;     // M*2*Nref
     vector <uchar> genosTarget; // M*Ntarget
     vector < pair <int, int> > chrBps = 
       processVcfs(vcfRef, vcfTarget, allowRefAltSwap, chrom, chromX, bpStart, bpEnd, hapsRef,
-		  genosTarget, tmpFile, writeMode, usePS, conPSall);
+		  genosTarget, tmpFile, writeMode, usePS, conPSall, outputUnphased, isTmpPhased);
 
     // interpolate genetic coordinates
     vector <double> cMs = processMap(chrBps, geneticMapFile);
diff --git a/src/SyncedVcfData.hpp b/src/SyncedVcfData.hpp
index 99dcd73..9d4543a 100644
--- a/src/SyncedVcfData.hpp
+++ b/src/SyncedVcfData.hpp
@@ -43,7 +43,8 @@ namespace EAGLE {
     (const std::string &vcfRef, const std::string &vcfTarget, bool allowRefAltSwap, int chrom,
      int chromX, double bpStart, double bpEnd, std::vector <bool> &hapsRef,
      std::vector <uchar> &genosTarget, const std::string &tmpFile, const std::string &writeMode,
-     int usePS, std::vector < std::vector < std::pair <int, int> > > &conPSall);
+     int usePS, std::vector < std::vector < std::pair <int, int> > > &conPSall,
+     bool outputUnphased, std::vector <bool> &isTmpPhased);
     std::vector <double> processMap(std::vector < std::pair <int, int> > &chrBps,
 				    const std::string &geneticMapFile);
     void buildGenoBits(const std::vector <bool> &hapsRef, const std::vector <uchar> &genosTarget,
@@ -59,7 +60,8 @@ namespace EAGLE {
 		  int chrom, int chromX, double bpStart, double bpEnd,
 		  const std::string &geneticMapFile, double cMmax, const std::string &tmpFile,
 		  const std::string &writeMode, int usePS,
-		  std::vector < std::vector < std::pair <int, int> > > &conPSall, double &snpRate);
+		  std::vector < std::vector < std::pair <int, int> > > &conPSall, double &snpRate,
+		  bool outputUnphased, std::vector <bool> &isTmpPhased);
 
     ~SyncedVcfData();
 
diff --git a/src/Version.hpp b/src/Version.hpp
index f0d41d8..050a7fe 100644
--- a/src/Version.hpp
+++ b/src/Version.hpp
@@ -19,7 +19,7 @@
 #ifndef VERSION_HPP
 #define VERSION_HPP
 
-const char EAGLE_VERSION[] = "2.3.1";
-const char EAGLE_VERSION_DATE[] = "March 3, 2017";
+const char EAGLE_VERSION[] = "2.3.2";
+const char EAGLE_VERSION_DATE[] = "March 23, 2017";
 
 #endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/eagle.git



More information about the debian-med-commit mailing list