[med-svn] [eagle] 01/06: New upstream version 2.4
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Fri Jan 12 22:38:14 UTC 2018
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository eagle.
commit 2d8ac15bd5c0ab3842a95b847dc44051024eb23e
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Fri Jan 12 22:53:00 2018 +0100
New upstream version 2.4
---
example/example.log | 70 ++++++++++++-------------
example/example_ref.log | 15 +++---
example/example_vcf.log | 70 ++++++++++++-------------
example/phased.vcf.gz | Bin 200383 -> 200388 bytes
example/target.phased.vcf.gz | Bin 21053 -> 21053 bytes
src/Eagle.cpp | 17 ++++++-
src/Eagle.hpp | 5 +-
src/EagleMain.cpp | 15 ++++--
src/EaglePBWT.cpp | 15 +++---
src/EagleParams.cpp | 11 +++-
src/EagleParams.hpp | 3 +-
src/MapInterpolater.cpp | 10 ++++
src/SyncedVcfData.cpp | 119 +++++++++++++++++++++++--------------------
src/SyncedVcfData.hpp | 19 +++----
src/Version.hpp | 4 +-
15 files changed, 214 insertions(+), 159 deletions(-)
diff --git a/example/example.log b/example/example.log
index a916d76..adfb60e 100644
--- a/example/example.log
+++ b/example/example.log
@@ -1,12 +1,12 @@
+-----------------------------+
| |
- | Eagle v2.3.5 |
- | August 2, 2017 |
+ | Eagle v2.4 |
+ | December 13, 2017 |
| Po-Ru Loh |
| |
+-----------------------------+
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
Distributed under the GNU GPLv3+ open source license.
Command line options:
@@ -56,25 +56,25 @@ Setting --histFactor=1.00
BEGINNING STEP 1
-Time for step 1: 0.373437
-Time for step 1 MN^2: 0.023717
+Time for step 1: 0.282519
+Time for step 1 MN^2: 0.0201082
-Making hard calls (time: 0.028342)
+Making hard calls (time: 0.0164621)
BEGINNING STEP 2
BATCH 1 OF 1
Building hash tables
-.................................................................. (time: 0.050245)
+.................................................................. (time: 0.0826769)
Phasing samples 1-379
-Time for phasing batch: 0.433878
+Time for phasing batch: 0.335781
-Making hard calls (time: 0.030308)
+Making hard calls (time: 0.0172811)
-Time for step 2: 0.514443
-Time for step 2 MN^2: 0.0593926
+Time for step 2: 0.435747
+Time for step 2 MN^2: 0.0391194
BEGINNING STEP 3 (PBWT ITERS)
@@ -87,108 +87,108 @@ BEGINNING PBWT ITER 1
BATCH 1 OF 10
Phasing samples 1-37
-Time for phasing batch: 1.75117
+Time for phasing batch: 1.13401
BATCH 2 OF 10
Phasing samples 38-75
-Time for phasing batch: 1.58165
+Time for phasing batch: 1.04853
BATCH 3 OF 10
Phasing samples 76-113
-Time for phasing batch: 1.68042
+Time for phasing batch: 1.18956
BATCH 4 OF 10
Phasing samples 114-151
-Time for phasing batch: 1.6039
+Time for phasing batch: 1.1353
BATCH 5 OF 10
Phasing samples 152-189
-Time for phasing batch: 1.59436
+Time for phasing batch: 1.01419
BATCH 6 OF 10
Phasing samples 190-227
-Time for phasing batch: 1.64487
+Time for phasing batch: 1.05172
BATCH 7 OF 10
Phasing samples 228-265
-Time for phasing batch: 1.60414
+Time for phasing batch: 0.99322
BATCH 8 OF 10
Phasing samples 266-303
-Time for phasing batch: 1.51987
+Time for phasing batch: 0.99847
BATCH 9 OF 10
Phasing samples 304-341
-Time for phasing batch: 1.56367
+Time for phasing batch: 0.998471
BATCH 10 OF 10
Phasing samples 342-379
-Time for phasing batch: 1.5805
+Time for phasing batch: 1.05052
-Time for PBWT iter 1: 16.1246
+Time for PBWT iter 1: 10.6142
BEGINNING PBWT ITER 2
BATCH 1 OF 10
Phasing samples 1-37
-Time for phasing batch: 2.70429
+Time for phasing batch: 1.76314
BATCH 2 OF 10
Phasing samples 38-75
-Time for phasing batch: 2.5743
+Time for phasing batch: 1.79145
BATCH 3 OF 10
Phasing samples 76-113
-Time for phasing batch: 2.70542
+Time for phasing batch: 1.8253
BATCH 4 OF 10
Phasing samples 114-151
-Time for phasing batch: 2.56365
+Time for phasing batch: 1.54563
BATCH 5 OF 10
Phasing samples 152-189
-Time for phasing batch: 2.50088
+Time for phasing batch: 1.54299
BATCH 6 OF 10
Phasing samples 190-227
-Time for phasing batch: 2.58336
+Time for phasing batch: 1.59936
BATCH 7 OF 10
Phasing samples 228-265
-Time for phasing batch: 2.59174
+Time for phasing batch: 1.55847
BATCH 8 OF 10
Phasing samples 266-303
-Time for phasing batch: 2.47758
+Time for phasing batch: 1.51538
BATCH 9 OF 10
Phasing samples 304-341
-Time for phasing batch: 2.48646
+Time for phasing batch: 1.58919
BATCH 10 OF 10
Phasing samples 342-379
-Time for phasing batch: 2.57226
+Time for phasing batch: 1.64308
-Time for PBWT iter 2: 25.76
+Time for PBWT iter 2: 16.3741
Writing .haps.gz and .sample output
-Time for writing output: 0.306937
-Total elapsed time for analysis = 43.2534 sec
+Time for writing output: 0.281509
+Total elapsed time for analysis = 28.1508 sec
diff --git a/example/example_ref.log b/example/example_ref.log
index bb3c432..14faf91 100644
--- a/example/example_ref.log
+++ b/example/example_ref.log
@@ -1,12 +1,12 @@
+-----------------------------+
| |
- | Eagle v2.3.5 |
- | August 2, 2017 |
+ | Eagle v2.4 |
+ | December 13, 2017 |
| Po-Ru Loh |
| |
+-----------------------------+
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
Distributed under the GNU GPLv3+ open source license.
Command line options:
@@ -42,6 +42,9 @@ Genetic distance range: 10.292 cM
Average # SNPs per cM: 42
Number of <=(64-SNP, 1cM) segments: 9
Average # SNPs per segment: 47
+
+Time for reading input: 6.18623 sec
+
Fraction of heterozygous genotypes: 0.178114
Typical span of default 100-het history length: 13.44 cM
Setting --histFactor=1.00
@@ -56,15 +59,15 @@ PHASING ITER 1 OF 1
Phasing target samples
................................................................................
-Time for phasing iter 1: 0.236665
+Time for phasing iter 1: 0.144591
Writing vcf.gz output to target.phased.vcf.gz
[W::vcf_parse] INFO 'AC' is not defined in the header, assuming Type=String
[W::vcf_parse] INFO 'AN' is not defined in the header, assuming Type=String
[W::vcf_parse] INFO 'DP' is not defined in the header, assuming Type=String
[W::vcf_parse] INFO 'AFR_AF' is not defined in the header, assuming Type=String
[W::vcf_parse] INFO 'EX_TARGET' is not defined in the header, assuming Type=String
-Time for writing output: 0.028465
-Total elapsed time for analysis = 9.34482 sec
+Time for writing output: 0.0239
+Total elapsed time for analysis = 6.35535 sec
Mean phase confidence of each target individual:
ID PHASE_CONFIDENCE
diff --git a/example/example_vcf.log b/example/example_vcf.log
index c901b3c..19f7213 100644
--- a/example/example_vcf.log
+++ b/example/example_vcf.log
@@ -1,12 +1,12 @@
+-----------------------------+
| |
- | Eagle v2.3.5 |
- | August 2, 2017 |
+ | Eagle v2.4 |
+ | December 13, 2017 |
| Po-Ru Loh |
| |
+-----------------------------+
-Copyright (C) 2015-2016 Harvard University.
+Copyright (C) 2015-2017 Harvard University.
Distributed under the GNU GPLv3+ open source license.
Command line options:
@@ -39,25 +39,25 @@ Setting --histFactor=1.00
BEGINNING STEP 1
-Time for step 1: 0.37492
-Time for step 1 MN^2: 0.0236487
+Time for step 1: 0.267283
+Time for step 1 MN^2: 0.0164427
-Making hard calls (time: 0.02865)
+Making hard calls (time: 0.0152462)
BEGINNING STEP 2
BATCH 1 OF 1
Building hash tables
-.................................................................. (time: 0.0538409)
+.................................................................. (time: 0.0786691)
Phasing samples 1-379
-Time for phasing batch: 0.435353
+Time for phasing batch: 0.331454
-Making hard calls (time: 0.0303631)
+Making hard calls (time: 0.0164559)
-Time for step 2: 0.519568
-Time for step 2 MN^2: 0.0592469
+Time for step 2: 0.426586
+Time for step 2 MN^2: 0.0375612
BEGINNING STEP 3 (PBWT ITERS)
@@ -70,108 +70,108 @@ BEGINNING PBWT ITER 1
BATCH 1 OF 10
Phasing samples 1-37
-Time for phasing batch: 1.79524
+Time for phasing batch: 1.0778
BATCH 2 OF 10
Phasing samples 38-75
-Time for phasing batch: 1.61659
+Time for phasing batch: 1.0477
BATCH 3 OF 10
Phasing samples 76-113
-Time for phasing batch: 1.70906
+Time for phasing batch: 1.11441
BATCH 4 OF 10
Phasing samples 114-151
-Time for phasing batch: 1.63612
+Time for phasing batch: 1.06495
BATCH 5 OF 10
Phasing samples 152-189
-Time for phasing batch: 1.60907
+Time for phasing batch: 1.07034
BATCH 6 OF 10
Phasing samples 190-227
-Time for phasing batch: 1.65953
+Time for phasing batch: 1.13122
BATCH 7 OF 10
Phasing samples 228-265
-Time for phasing batch: 1.657
+Time for phasing batch: 1.07535
BATCH 8 OF 10
Phasing samples 266-303
-Time for phasing batch: 1.59278
+Time for phasing batch: 1.16462
BATCH 9 OF 10
Phasing samples 304-341
-Time for phasing batch: 1.61661
+Time for phasing batch: 1.15337
BATCH 10 OF 10
Phasing samples 342-379
-Time for phasing batch: 1.59879
+Time for phasing batch: 1.17788
-Time for PBWT iter 1: 16.4908
+Time for PBWT iter 1: 11.0777
BEGINNING PBWT ITER 2
BATCH 1 OF 10
Phasing samples 1-37
-Time for phasing batch: 2.72667
+Time for phasing batch: 1.87655
BATCH 2 OF 10
Phasing samples 38-75
-Time for phasing batch: 2.55292
+Time for phasing batch: 1.6772
BATCH 3 OF 10
Phasing samples 76-113
-Time for phasing batch: 2.68511
+Time for phasing batch: 1.67862
BATCH 4 OF 10
Phasing samples 114-151
-Time for phasing batch: 2.57207
+Time for phasing batch: 1.57155
BATCH 5 OF 10
Phasing samples 152-189
-Time for phasing batch: 2.51047
+Time for phasing batch: 1.57551
BATCH 6 OF 10
Phasing samples 190-227
-Time for phasing batch: 2.6027
+Time for phasing batch: 1.64418
BATCH 7 OF 10
Phasing samples 228-265
-Time for phasing batch: 2.619
+Time for phasing batch: 1.60561
BATCH 8 OF 10
Phasing samples 266-303
-Time for phasing batch: 2.47177
+Time for phasing batch: 1.63349
BATCH 9 OF 10
Phasing samples 304-341
-Time for phasing batch: 2.52629
+Time for phasing batch: 1.78741
BATCH 10 OF 10
Phasing samples 342-379
-Time for phasing batch: 2.5489
+Time for phasing batch: 1.80977
-Time for PBWT iter 2: 25.8159
+Time for PBWT iter 2: 16.86
Writing vcf.gz output to phased.vcf.gz
-Time for writing output: 0.316302
-Total elapsed time for analysis = 53.0844 sec
+Time for writing output: 0.204556
+Total elapsed time for analysis = 34.7048 sec
diff --git a/example/phased.vcf.gz b/example/phased.vcf.gz
index 67ee350..5ed3010 100644
Binary files a/example/phased.vcf.gz and b/example/phased.vcf.gz differ
diff --git a/example/target.phased.vcf.gz b/example/target.phased.vcf.gz
index 81664e0..ed6302c 100644
Binary files a/example/target.phased.vcf.gz and b/example/target.phased.vcf.gz differ
diff --git a/src/Eagle.cpp b/src/Eagle.cpp
index 6e185b5..b0eced0 100644
--- a/src/Eagle.cpp
+++ b/src/Eagle.cpp
@@ -2329,6 +2329,10 @@ namespace EAGLE {
cerr << "Too few SNP segments for analysis" << endl;
exit(1);
}
+ if (Mseg64 > 50000U) {
+ cerr << "Too many SNP segments for analysis: " << Mseg64 << " (max = 50000)" << endl;
+ exit(1);
+ }
uint seed = n0; // for rand_r()
@@ -3218,7 +3222,8 @@ namespace EAGLE {
void Eagle::writeVcf(const string &tmpFile, const vector <bool> &isTmpPhased,
const string &outFile, int chromX, double bpStart, double bpEnd,
- const string &writeMode, bool noImpMissing, int argc, char **argv) const {
+ const string &writeMode, bool noImpMissing, bool keepMissingPloidyX,
+ int argc, char **argv) const {
htsFile *htsTmp = hts_open(tmpFile.c_str(), "r");
if (htsTmp == NULL) {
@@ -3274,6 +3279,10 @@ namespace EAGLE {
int ploidy = 2;
int *ptr = tgt_gt + i*ploidy;
+ // --keepMissingPloidyX: assume missing genotypes in target VCF have correct ploidy
+ if (chrom == chromX && keepMissingPloidyX && bcf_gt_is_missing(ptr[0]))
+ mostRecentPloidy[i] = (ptr[1] == bcf_int32_vector_end ? 1 : 2);
+
if (chrom != chromX || (bcf_gt_is_missing(ptr[0]) && mostRecentPloidy[i] == 2)
|| ptr[1] != bcf_int32_vector_end) { // diploid... be careful about missing '.'
mostRecentPloidy[i] = 2;
@@ -3355,7 +3364,7 @@ namespace EAGLE {
// - does not delete tmpFile (now vcfFile with original input)
void Eagle::writeVcfNonRef(const string &vcfFile, const string &outFile, int inputChrom,
int chromX, double bpStart, double bpEnd, const string &writeMode,
- int argc, char **argv) const {
+ bool keepMissingPloidyX, int argc, char **argv) const {
htsFile *htsIn = hts_open(vcfFile.c_str(), "r");
htsFile *out = hts_open(outFile.c_str(), writeMode.c_str());
@@ -3388,6 +3397,10 @@ namespace EAGLE {
int ploidy = 2;
int *ptr = tgt_gt + i*ploidy;
+ // --keepMissingPloidyX: assume missing genotypes in target VCF have correct ploidy
+ if (chrom == chromX && keepMissingPloidyX && bcf_gt_is_missing(ptr[0]))
+ mostRecentPloidy[i] = (ptr[1] == bcf_int32_vector_end ? 1 : 2);
+
if (chrom != chromX || (bcf_gt_is_missing(ptr[0]) && mostRecentPloidy[i] == 2)
|| ptr[1] != bcf_int32_vector_end) { // diploid... be careful about missing '.'
mostRecentPloidy[i] = 2;
diff --git a/src/Eagle.hpp b/src/Eagle.hpp
index 5c43e0f..8e309a7 100644
--- a/src/Eagle.hpp
+++ b/src/Eagle.hpp
@@ -187,10 +187,11 @@ namespace EAGLE {
void writeHapsGzSample(const std::string &prefix) const;
void writeVcf(const std::string &tmpFile, const std::vector <bool> &isTmpPhased,
const std::string &outFile, int chromX, double bpStart, double bpEnd,
- const std::string &writeMode, bool noImpMissing, int argc, char**argv) const;
+ const std::string &writeMode, bool noImpMissing, bool keepMissingPloidyX,
+ int argc, char**argv) const;
void writeVcfNonRef(const std::string &vcfFile, const std::string &outFile, int inputChrom,
int chromX, double bpStart, double bpEnd, const std::string &writeMode,
- int argc, char **argv) const;
+ bool keepMissingPloidyX, int argc, char **argv) const;
void makeHardCalls(uint64 n0start, uint64 n0end, uint seed);
void initRefIter(int refIter);
void buildHashTables(int iter, int batch, int seed);
diff --git a/src/EagleMain.cpp b/src/EagleMain.cpp
index c8a2617..12ce800 100644
--- a/src/EagleMain.cpp
+++ b/src/EagleMain.cpp
@@ -67,11 +67,12 @@ void phaseWithRef(EagleParams ¶ms, Timer &timer, double t0, int argc, char *
string outFile = params.outPrefix + "." + params.vcfOutSuffix;
vector < vector < pair <int, int> > > conPSall; double snpRate;
vector <bool> isTmpPhased;
- SyncedVcfData vcfData(params.vcfRef, params.vcfTarget, params.allowRefAltSwap, params.chrom,
- params.chromX, params.bpStart-params.bpFlanking,
+ SyncedVcfData vcfData(params.vcfRef, params.vcfTarget, params.vcfExclude, params.allowRefAltSwap,
+ params.chrom, params.chromX, params.bpStart-params.bpFlanking,
params.bpEnd+params.bpFlanking, params.geneticMapFile,
params.cMmax==0 ? 1 : params.cMmax, tmpFile, params.vcfWriteMode,
params.usePS, conPSall, snpRate, params.outputUnphased, isTmpPhased);
+ cout << endl << "Time for reading input: " << timer.update_time() << " sec" << endl << endl;
Eagle eagle(vcfData.getNref(), vcfData.getNtarget(), vcfData.getMseg64(),
vcfData.getGenoBits(), vcfData.getSeg64cMvecs(), params.pErr);
@@ -187,7 +188,7 @@ void phaseWithRef(EagleParams ¶ms, Timer &timer, double t0, int argc, char *
timer.update_time();
cout << "Writing " << params.vcfOutSuffix << " output to " << outFile << endl;
eagle.writeVcf(tmpFile, isTmpPhased, outFile, params.chromX, params.bpStart, params.bpEnd,
- params.vcfWriteMode, params.noImpMissing, argc, argv);
+ params.vcfWriteMode, params.noImpMissing, params.keepMissingPloidyX, argc, argv);
cout << "Time for writing output: " << timer.update_time() << endl;
cout << "Total elapsed time for analysis = " << (timer.get_time()-t0) << " sec" << endl;
@@ -215,7 +216,7 @@ int main(int argc, char *argv[]) {
cout << " +-----------------------------+" << endl;
cout << endl;
- cout << "Copyright (C) 2015-2016 Harvard University." << endl;
+ cout << "Copyright (C) 2015-2017 Harvard University." << endl;
cout << "Distributed under the GNU GPLv3+ open source license." << endl << endl;
//cout << "Boost version: " << BOOST_LIB_VERSION << endl;
@@ -285,6 +286,9 @@ int main(int argc, char *argv[]) {
int bpSpan = genoData.getSnps().back().physpos - genoData.getSnps()[0].physpos;
double snpsPerMb = genoData.getSnps().size() / (bpSpan*1e-6);
if (snpsPerMb > 1000) params.pbwtOnly = true;
+
+ if (genoData.getN() > 200000U) params.pbwtOnly = true;
+
if (params.pbwtOnly) params.runStep2 = 0;
// if --runStep2 hasn't yet been set, SNP density must be low; run Step 2 unless too few chunks
@@ -499,7 +503,8 @@ int main(int argc, char *argv[]) {
string outFile = params.outPrefix + "." + params.vcfOutSuffix;
cout << "Writing " << params.vcfOutSuffix << " output to " << outFile << endl;
eagle.writeVcfNonRef(params.vcfFile, outFile, params.chrom, params.chromX, params.bpStart,
- params.bpEnd, params.vcfWriteMode, argc, argv);
+ params.bpEnd, params.vcfWriteMode, params.keepMissingPloidyX, argc,
+ argv);
}
else {
cout << "Writing .haps.gz and .sample output" << endl; timer.update_time();
diff --git a/src/EaglePBWT.cpp b/src/EaglePBWT.cpp
index 9d20c8a..686af3b 100644
--- a/src/EaglePBWT.cpp
+++ b/src/EaglePBWT.cpp
@@ -529,7 +529,7 @@ namespace EAGLE {
const uint64 m64jNext = t==(int) splits64j.size() ? Mseg64*64 : splits64j[t];
vector <uint64> miss64j;
for (uint64 m64j = m64jPrev+1; m64j < m64jNext; m64j++)
- if (genos64j[m64j] == 3 && impMissing) // missing
+ if (maskSnps64j[m64j] && genos64j[m64j] == 3 && impMissing) // missing
miss64j.push_back(m64j);
// orient each hap pair wrt called phase
@@ -582,8 +582,9 @@ namespace EAGLE {
// initialize allele dosages
int nMiss = miss64j.size();
- double alleleDoses[nMiss][2];
- for (int m = 0; m < nMiss; m++) alleleDoses[m][0] = alleleDoses[m][1] = 0;
+ struct DosePair { double d[2]; };
+ vector <DosePair> alleleDoses(nMiss);
+ for (int m = 0; m < nMiss; m++) alleleDoses[m].d[0] = alleleDoses[m].d[1] = 0;
// process non-ends: call using longer of fwd, rev ref samples
double meanLens[2] = {0, 0};
@@ -595,7 +596,7 @@ namespace EAGLE {
for (uint k = 0; k < nonEndRefs[fbLong].size(); k++) {
int refSeq = nonEndRefs[fbLong][k];
for (int m = 0; m < nMiss; m++)
- alleleDoses[m][(haploBitsT[refSeq*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] += 1;
+ alleleDoses[m].d[(haploBitsT[refSeq*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] += 1;
}
// compute mean lengths including ends=0 for phasing singletons later
@@ -653,9 +654,9 @@ namespace EAGLE {
else
break;
}
- alleleDoses[m][(haploBitsT[refSeqFwd*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
+ alleleDoses[m].d[(haploBitsT[refSeqFwd*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
cMcum / cMtot;
- alleleDoses[m][(haploBitsT[refSeqRev*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
+ alleleDoses[m].d[(haploBitsT[refSeqRev*Mseg64+miss64j[m]/64]>>(miss64j[m]&63))&1] +=
1 - cMcum / cMtot;
}
}
@@ -664,7 +665,7 @@ namespace EAGLE {
// make final calls
for (int m = 0; m < nMiss; m++) {
uint64 m64 = miss64j[m]/64, j = miss64j[m]&63;
- if (alleleDoses[m][0] >= alleleDoses[m][1])
+ if (alleleDoses[m].d[0] >= alleleDoses[m].d[1])
tmpHaploBitsT[(nTargetHap+h)*Mseg64 + m64] &= ~(1ULL<<j);
else
tmpHaploBitsT[(nTargetHap+h)*Mseg64 + m64] |= 1ULL<<j;
diff --git a/src/EagleParams.cpp b/src/EagleParams.cpp
index 2509c23..06ddf82 100644
--- a/src/EagleParams.cpp
+++ b/src/EagleParams.cpp
@@ -83,9 +83,13 @@ namespace EAGLE {
"tabix-indexed [compressed] VCF/BCF file for target genotypes")
("vcfOutFormat", po::value<string>(&vcfOutFormat)->default_value("."),
"b|u|z|v: compressed BCF (b), uncomp BCF (u), compressed VCF (z), uncomp VCF (v)")
- ("noImpMissing", "disable imputation of missing ./. target genotypes")
+ ("noImpMissing", "disable imputation of missing target genotypes (. or ./.)")
("allowRefAltSwap", "allow swapping of REF/ALT in target vs. ref VCF")
("outputUnphased", "output unphased sites (target-only, multi-allelic, etc.)")
+ ("keepMissingPloidyX",
+ "assume missing genotypes have correct ploidy (.=haploid, ./.=diploid)")
+ ("vcfExclude", po::value<string>(&vcfExclude),
+ "tabix-indexed [compressed] VCF/BCF file containing variants to exclude from phasing")
;
po::options_description bothModes("Region selection options");
@@ -229,6 +233,7 @@ namespace EAGLE {
noImpMissing = vm.count("noImpMissing");
allowRefAltSwap = vm.count("allowRefAltSwap");
outputUnphased = vm.count("outputUnphased");
+ keepMissingPloidyX = vm.count("keepMissingPloidyX");
if (vm.count("vcfRef") || vm.count("vcfTarget") || vm.count("vcf")) { // VCF mode
if (vm.count("vcf")) { // non-ref mode
@@ -254,6 +259,9 @@ namespace EAGLE {
cerr << "ERROR: --pbwtIters cannot be greater than 1 if --noImpMissing is set" << endl;
return false;
}
+ if (vcfRef.substr(vcfRef.length()-4) != string(".bcf")) {
+ cerr << "WARNING: --vcfRef does not end in '.bcf'; BCF input is fastest" << endl;
+ }
}
// vcf input checks for both ref and non-ref mode
@@ -365,6 +373,7 @@ namespace EAGLE {
FileUtils::requireEmptyOrReadable(bedFile);
FileUtils::requireEmptyOrReadable(vcfRef);
FileUtils::requireEmptyOrReadable(vcfTarget);
+ FileUtils::requireEmptyOrReadable(vcfExclude);
if (geneticMapFile != "USE_BIM") {
vector <string> reqHeader;
reqHeader.push_back("chr"); reqHeader.push_back("position");
diff --git a/src/EagleParams.hpp b/src/EagleParams.hpp
index 9470418..b53222f 100644
--- a/src/EagleParams.hpp
+++ b/src/EagleParams.hpp
@@ -30,7 +30,7 @@ namespace EAGLE {
public:
// main input files
- std::string famFile, bimFile, bedFile, vcfFile, vcfRef, vcfTarget;
+ std::string famFile, bimFile, bedFile, vcfFile, vcfRef, vcfTarget, vcfExclude;
int chrom, chromX;
// optional reference map file for filling in genpos
@@ -45,6 +45,7 @@ namespace EAGLE {
// outFormat b|u|z|v -> outSuffix bcf|bcf|vcf.gz|vcf, writeMode wb|wbu|wz|w
bool noImpMissing;
bool outputUnphased; // output unphased sites (target-only, multi-allelic, etc.)
+ bool keepMissingPloidyX; // assume missing genotypes in VCF have correct ploidy (.=haploid, ./.=diploid)
int usePS; // use FORMAT:PS phase constraints: 1=soft, 2=harder
bool allowRefAltSwap; // in reference-based phasing mode
diff --git a/src/MapInterpolater.cpp b/src/MapInterpolater.cpp
index b3d1f93..4ef80f4 100644
--- a/src/MapInterpolater.cpp
+++ b/src/MapInterpolater.cpp
@@ -62,6 +62,16 @@ namespace Genetics {
cerr << " " << chr << " " << bp << " " << rate << " " << gen << endl;
exit(1);
}
+ if (bp<=0) {
+ cerr << "ERROR: Genetic map positions must be positive:" << endl;
+ cerr << " " << chr << " " << bp << " " << rate << " " << gen << endl;
+ exit(1);
+ }
+ if (bp==bp0) {
+ cerr << "ERROR: Genetic map contains duplicate position:" << endl;
+ cerr << " " << chr << " " << bp << " " << rate << " " << gen << endl;
+ exit(1);
+ }
chrBpToRateGen[make_pair(chr, bp)] = make_pair((gen-gen0)/(1e-6*(bp-bp0)), gen);
}
chr0 = chr; bp0 = bp; gen0 = gen;
diff --git a/src/SyncedVcfData.cpp b/src/SyncedVcfData.cpp
index d8b0b2b..7a2ccd9 100644
--- a/src/SyncedVcfData.cpp
+++ b/src/SyncedVcfData.cpp
@@ -159,17 +159,19 @@ namespace EAGLE {
}
}
+ // fills in chrom if chrom==0
vector < pair <int, int> > SyncedVcfData::processVcfs
- (const string &vcfRef, const string &vcfTarget, bool allowRefAltSwap, int chrom, int chromX,
- double bpStart, double bpEnd, vector <bool> &hapsRef, vector <uchar> &genosTarget,
- const string &tmpFile, const string &writeMode, int usePS,
+ (const string &vcfRef, const string &vcfTarget, const string &vcfExclude, bool allowRefAltSwap,
+ int &chrom, int chromX, double bpStart, double bpEnd, vector <bool> &hapsRef,
+ vector <uchar> &genosTarget, const string &tmpFile, const string &writeMode, int usePS,
vector < vector < pair <int, int> > > &conPSall, bool outputUnphased,
vector <bool> &isTmpPhased) {
vector < pair <int, int> > chrBps;
bcf_srs_t *sr = bcf_sr_init();
- bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_BOTH_REF);
+ bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC,
+ allowRefAltSwap ? BCF_SR_PAIR_BOTH_REF : BCF_SR_PAIR_EXACT);
bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX);
if ( chrom!=0 )
@@ -181,7 +183,7 @@ namespace EAGLE {
ksprintf(&str,"X:%d-%d,chrX:%d-%d",(uint32_t)bpStart,(uint32_t)bpEnd,(uint32_t)bpStart,(uint32_t)bpEnd);
}
else
- ksprintf(&str,"%d:%d-%d",chrom,(uint32_t)bpStart,(uint32_t)bpEnd);
+ ksprintf(&str,"%d:%d-%d,chr%d:%d-%d",chrom,(uint32_t)bpStart,(uint32_t)bpEnd,chrom,(uint32_t)bpStart,(uint32_t)bpEnd);
if ( bcf_sr_set_regions(sr, str.s, 0)!=0 )
{
cerr << "ERROR: failed to initialize the region:" << str.s;
@@ -200,6 +202,12 @@ namespace EAGLE {
<< bcf_sr_strerror(sr->errnum) << endl;
exit(1);
}
+ bool useExclude = !vcfExclude.empty();
+ if (useExclude && !bcf_sr_add_reader(sr, vcfExclude.c_str())) {
+ cerr << "ERROR: Could not open " << vcfExclude << " for reading: "
+ << bcf_sr_strerror(sr->errnum) << endl;
+ exit(1);
+ }
bcf_hdr_t *ref_hdr = bcf_sr_get_header(sr, 0);
bcf_hdr_t *tgt_hdr = bcf_sr_get_header(sr, 1);
@@ -229,7 +237,7 @@ namespace EAGLE {
cout << "Target samples: Ntarget = " << Ntarget << endl;
M = 0;
- uint MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0,
+ uint Mexclude = 0, MtargetOnly = 0, MrefOnly = 0, MmultiAllelicTgt = 0,
MmultiAllelicRef = 0, MmonomorphicRef = 0;
uint MwithMissingRef = 0, MwithUnphasedRef = 0, MnotInRegion = 0, MnotOnChrom = 0;
uint MrefAltError = 0, numRefAltSwaps = 0;
@@ -244,6 +252,11 @@ namespace EAGLE {
{
bcf1_t *ref = bcf_sr_get_line(sr, 0);
bcf1_t *tgt = bcf_sr_get_line(sr, 1);
+ if (useExclude && bcf_sr_get_line(sr, 2)) {
+ Mexclude++;
+ if (outputUnphased && tgt) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+ continue;
+ }
if ( !ref ) {
//fprintf(stderr, "onlyT .. %s:%d\n", bcf_seqname(tgt_hdr, tgt), tgt->pos+1);
bcf_unpack(tgt, BCF_UN_STR); // unpack thru ALT
@@ -261,53 +274,47 @@ namespace EAGLE {
}
//fprintf(stderr, "match .. %s:%d\n", bcf_seqname(ref_hdr, ref), ref->pos+1);
- // filter out multi-allelic and monomorphic markers
+ // deal with multi-allelic and monomorphic markers
+ // drop multi-allelic target markers
int ntgt_gt = bcf_get_genotypes(tgt_hdr, tgt, &tgt_gt, &mtgt_gt);
if (tgt->n_allele > 2) {
MmultiAllelicTgt++;
if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
continue;
}
- if (ref->n_allele==1)
- {
// drop monomorphic reference markers
- MmonomorphicRef++;
- if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
- continue;
- }
-
- // preserve monomorphic markers if biallelic in the reference panel
- if (tgt->n_allele < 2)
- {
- bcf_update_alleles(tgt_hdr, tgt, (const char**)ref->d.allele,ref->n_allele);
- }
+ if (ref->n_allele==1) {
+ MmonomorphicRef++;
+ if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+ continue;
+ }
+ // preserve monomorphic markers if biallelic in the reference panel
+ if (tgt->n_allele < 2) {
+ bcf_update_alleles(tgt_hdr, tgt, (const char**)ref->d.allele,ref->n_allele);
+ }
+ // drop multi-allelic reference markers
+ if (ref->n_allele > 2) {
+ MmultiAllelicRef++;
+ if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+ continue;
+ }
+ // check for REF/ALT swaps
bool refAltSwap = false;
-
- if (allowRefAltSwap) { // perform further error-checking
- if (ref->n_allele > 2) {
- MmultiAllelicRef++;
- if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
- continue;
- }
- /*
- printf("tgt REF=%s, ALT=%s ref REF=%s, ALT=%s\n", tgt->d.allele[0], tgt->d.allele[1],
- ref->d.allele[0], ref->d.allele[1]);
- */
- if (strcmp(tgt->d.allele[0], ref->d.allele[0]) == 0 &&
- strcmp(tgt->d.allele[1], ref->d.allele[1]) == 0) {
- refAltSwap = false;
- }
- else if (strcmp(tgt->d.allele[0], ref->d.allele[1]) == 0 &&
- strcmp(tgt->d.allele[1], ref->d.allele[0]) == 0) {
- refAltSwap = true;
- numRefAltSwaps++;
- }
- else {
- MrefAltError++;
- if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
- continue;
- }
+ if (strcmp(tgt->d.allele[0], ref->d.allele[0]) == 0 &&
+ strcmp(tgt->d.allele[1], ref->d.allele[1]) == 0) {
+ refAltSwap = false;
+ }
+ else if (allowRefAltSwap && bcf_is_snp(tgt) && // allow swap if --allowRefAltSwap is set
+ strcmp(tgt->d.allele[0], ref->d.allele[1]) == 0 &&
+ strcmp(tgt->d.allele[1], ref->d.allele[0]) == 0) {
+ refAltSwap = true;
+ numRefAltSwaps++;
+ }
+ else {
+ MrefAltError++;
+ if (outputUnphased) { bcf_write(out, tgt_hdr, tgt); isTmpPhased.push_back(false); }
+ continue;
}
// Check the chromosome: if region was requested (chrom is set), synced
@@ -383,7 +390,7 @@ namespace EAGLE {
cout << endl;
cout << "SNPs ignored: " << MtargetOnly << " SNPs in target but not reference" << endl;
if (MtargetOnly > M/10U)
- cerr << " --> WARNING: Check REF/ALT agreement between target and ref <--"
+ cerr << " --> WARNING: Check REF/ALT agreement between target and ref? <--"
<< endl;
cout << " " << MrefOnly << " SNPs in reference but not target" << endl;
if (MnotOnChrom)
@@ -397,7 +404,9 @@ namespace EAGLE {
if (MmonomorphicRef)
cout << " " << MmonomorphicRef << " SNPs biallelic in target but monomorphic in ref" << endl;
if (MrefAltError)
- cout << " " << MrefAltError << " SNPs with REF/ALT matching errors" << endl;
+ cout << " " << MrefAltError << " SNPs with allele mismatches" << endl;
+ if (Mexclude)
+ cout << " " << Mexclude << " SNPs excluded based on --vcfExclude" << endl;
cout << endl;
if (MwithMissingRef) {
@@ -492,20 +501,22 @@ namespace EAGLE {
* reads ref+target vcf data
* writes target[isec] to tmpFile
* fills in cM coordinates and seg64cMvecs, genoBits
+ * fills in chrom if chrom==0
*/
- SyncedVcfData::SyncedVcfData(const string &vcfRef, const string &vcfTarget, bool allowRefAltSwap,
- int chrom, int chromX, double bpStart, double bpEnd,
- const string &geneticMapFile, double cMmax, const string &tmpFile,
- const string &writeMode, int usePS,
- vector < vector < pair <int, int> > > &conPSall, double &snpRate,
- bool outputUnphased, vector <bool> &isTmpPhased) {
+ SyncedVcfData::SyncedVcfData
+ (const string &vcfRef, const string &vcfTarget, const string &vcfExclude, bool allowRefAltSwap,
+ int &chrom, int chromX, double bpStart, double bpEnd, const string &geneticMapFile,
+ double cMmax, const string &tmpFile, const string &writeMode, int usePS,
+ vector < vector < pair <int, int> > > &conPSall, double &snpRate, bool outputUnphased,
+ vector <bool> &isTmpPhased) {
// perform synced read
vector <bool> hapsRef; // M*2*Nref
vector <uchar> genosTarget; // M*Ntarget
vector < pair <int, int> > chrBps =
- processVcfs(vcfRef, vcfTarget, allowRefAltSwap, chrom, chromX, bpStart, bpEnd, hapsRef,
- genosTarget, tmpFile, writeMode, usePS, conPSall, outputUnphased, isTmpPhased);
+ processVcfs(vcfRef, vcfTarget, vcfExclude, allowRefAltSwap, chrom, chromX, bpStart, bpEnd,
+ hapsRef, genosTarget, tmpFile, writeMode, usePS, conPSall, outputUnphased,
+ isTmpPhased);
// interpolate genetic coordinates
vector <double> cMs = processMap(chrBps, geneticMapFile);
diff --git a/src/SyncedVcfData.hpp b/src/SyncedVcfData.hpp
index 9d4543a..76fe66b 100644
--- a/src/SyncedVcfData.hpp
+++ b/src/SyncedVcfData.hpp
@@ -40,11 +40,12 @@ namespace EAGLE {
std::vector <std::string> targetIDs;
std::vector < std::pair <int, int> > processVcfs
- (const std::string &vcfRef, const std::string &vcfTarget, bool allowRefAltSwap, int chrom,
- int chromX, double bpStart, double bpEnd, std::vector <bool> &hapsRef,
- std::vector <uchar> &genosTarget, const std::string &tmpFile, const std::string &writeMode,
- int usePS, std::vector < std::vector < std::pair <int, int> > > &conPSall,
- bool outputUnphased, std::vector <bool> &isTmpPhased);
+ (const std::string &vcfRef, const std::string &vcfTarget, const std::string &vcfExclude,
+ bool allowRefAltSwap, int &chrom, int chromX, double bpStart, double bpEnd,
+ std::vector <bool> &hapsRef, std::vector <uchar> &genosTarget, const std::string &tmpFile,
+ const std::string &writeMode, int usePS,
+ std::vector < std::vector < std::pair <int, int> > > &conPSall, bool outputUnphased,
+ std::vector <bool> &isTmpPhased);
std::vector <double> processMap(std::vector < std::pair <int, int> > &chrBps,
const std::string &geneticMapFile);
void buildGenoBits(const std::vector <bool> &hapsRef, const std::vector <uchar> &genosTarget,
@@ -56,10 +57,10 @@ namespace EAGLE {
* writes target[isec] to tmpFile
* fills in cM coordinates and seg64cMvecs, genoBits
*/
- SyncedVcfData(const std::string &vcfRef, const std::string &vcfTarget, bool allowRefAltSwap,
- int chrom, int chromX, double bpStart, double bpEnd,
- const std::string &geneticMapFile, double cMmax, const std::string &tmpFile,
- const std::string &writeMode, int usePS,
+ SyncedVcfData(const std::string &vcfRef, const std::string &vcfTarget,
+ const std::string &vcfExclude, bool allowRefAltSwap, int &chrom, int chromX,
+ double bpStart, double bpEnd, const std::string &geneticMapFile, double cMmax,
+ const std::string &tmpFile, const std::string &writeMode, int usePS,
std::vector < std::vector < std::pair <int, int> > > &conPSall, double &snpRate,
bool outputUnphased, std::vector <bool> &isTmpPhased);
diff --git a/src/Version.hpp b/src/Version.hpp
index 07ad47e..00c4380 100644
--- a/src/Version.hpp
+++ b/src/Version.hpp
@@ -19,7 +19,7 @@
#ifndef VERSION_HPP
#define VERSION_HPP
-const char EAGLE_VERSION[] = "2.3.5";
-const char EAGLE_VERSION_DATE[] = "August 2, 2017";
+const char EAGLE_VERSION[] = "2.4";
+const char EAGLE_VERSION_DATE[] = "December 13, 2017";
#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/eagle.git
More information about the debian-med-commit
mailing list