[med-svn] [canu] 01/01: New upstream version 1.6+dfsg
Andreas Tille
tille at debian.org
Sat Sep 2 13:54:51 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to annotated tag upstream/1.6+dfsg
in repository canu.
commit 3dbd24eb7ac4a7479ac4f8bfba75146f5a8908ff
Author: Andreas Tille <tille at debian.org>
Date: Sat Sep 2 15:30:13 2017 +0200
New upstream version 1.6+dfsg
---
.github/ISSUE_TEMPLATE.md | 21 +
README.citation | 2 +-
README.md | 8 +-
addCopyrights-BuildData.pl | 1 +
addCopyrights.dat | 1020 ++++++++++++++++++++
addCopyrights.pl | 30 +-
buildRelease.sh | 35 +-
documentation/source/conf.py | 4 +-
documentation/source/faq.rst | 63 +-
documentation/source/index.rst | 2 +-
documentation/source/parameter-reference.rst | 39 +-
documentation/source/quick-start.rst | 271 ++----
documentation/source/tutorial.rst | 103 +-
src/AS_UTL/AS_UTL_fileIO.C | 35 +-
src/AS_UTL/AS_UTL_fileIO.H | 1 +
src/AS_UTL/AS_UTL_reverseComplement.C | 129 +--
src/AS_UTL/AS_UTL_reverseComplement.H | 6 +-
src/AS_UTL/bitPackedFile.C | 11 +-
src/AS_UTL/timeAndSize.C | 92 +-
src/AS_UTL/timeAndSize.H | 11 +-
src/AS_UTL/writeBuffer.H | 6 +-
src/AS_global.C | 6 +
src/Makefile | 21 +-
src/bogart/AS_BAT_AssemblyGraph.C | 5 +-
src/bogart/AS_BAT_BestOverlapGraph.C | 27 +-
src/bogart/AS_BAT_BestOverlapGraph.H | 1 +
src/bogart/AS_BAT_CreateUnitigs.C | 553 +++++++++--
src/bogart/AS_BAT_CreateUnitigs.H | 12 +-
src/bogart/AS_BAT_DropDeadEnds.C | 297 ++++++
...BAT_MarkRepeatReads.H => AS_BAT_DropDeadEnds.H} | 23 +-
src/bogart/AS_BAT_Instrumentation.C | 345 ++++---
src/bogart/AS_BAT_Logging.C | 6 +-
src/bogart/AS_BAT_Logging.H | 2 +-
src/bogart/AS_BAT_MarkRepeatReads.C | 85 +-
src/bogart/AS_BAT_MarkRepeatReads.H | 30 +-
src/bogart/AS_BAT_MergeOrphans.C | 919 ++++++++----------
src/bogart/AS_BAT_OptimizePositions.C | 508 ++++++++++
src/bogart/AS_BAT_Outputs.C | 17 +-
src/bogart/AS_BAT_OverlapCache.C | 644 ++++++------
src/bogart/AS_BAT_OverlapCache.H | 136 ++-
src/bogart/AS_BAT_PlaceContains.C | 4 +-
src/bogart/AS_BAT_PlaceReadUsingOverlaps.C | 647 ++++++-------
src/bogart/AS_BAT_PlaceReadUsingOverlaps.H | 18 +-
src/bogart/AS_BAT_PromoteToSingleton.C | 2 +
src/bogart/AS_BAT_ReadInfo.C | 12 +-
src/bogart/AS_BAT_ReadInfo.H | 16 +-
src/bogart/AS_BAT_SplitDiscontinuous.C | 4 +-
src/bogart/AS_BAT_TigGraph.C | 110 ++-
src/bogart/AS_BAT_TigVector.C | 6 +-
src/bogart/AS_BAT_TigVector.H | 2 +
src/bogart/AS_BAT_Unitig.C | 251 ++---
src/bogart/AS_BAT_Unitig.H | 111 ++-
src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C | 30 +-
src/bogart/bogart.C | 282 +++---
src/bogart/bogart.mk | 2 +
src/canu_version_update.pl | 4 +-
src/falcon_sense/falcon_sense.C | 4 +-
src/falcon_sense/libfalcon/falcon.C | 4 +
src/fastq-utilities/fastqSample.C | 8 +-
src/gfa/alignGFA.C | 671 +++++++++++--
src/gfa/bed.C | 173 ++++
src/gfa/{gfa.H => bed.H} | 68 +-
src/gfa/gfa.C | 62 +-
src/gfa/gfa.H | 4 +
src/main.mk | 3 +-
src/meryl/libmeryl.C | 34 +-
src/minimap/mmapConvert.C | 65 +-
src/overlapBasedTrimming/splitReads-trimBad.C | 11 +-
src/overlapBasedTrimming/splitReads-workUnit.C | 8 +-
src/overlapBasedTrimming/splitReads.C | 15 +-
src/overlapErrorAdjustment/findErrors.H | 2 +-
src/overlapInCore/libedlib/edlib.C | 205 ++--
src/overlapInCore/libedlib/edlib.H | 18 +-
src/overlapInCore/liboverlap/Binomial_Bound.C | 8 +
src/overlapInCore/overlapInCorePartition.C | 2 +
src/pipelines/canu.pl | 350 ++++---
src/pipelines/canu/Configure.pm | 127 ++-
src/pipelines/canu/Consensus.pm | 120 ++-
src/pipelines/canu/CorrectReads.pm | 29 +-
src/pipelines/canu/Defaults.pm | 712 +++++++-------
src/pipelines/canu/Execution.pm | 195 ++--
src/pipelines/canu/Gatekeeper.pm | 150 +--
src/pipelines/canu/Grid_LSF.pm | 2 +-
src/pipelines/canu/Grid_PBSTorque.pm | 22 +-
src/pipelines/canu/Meryl.pm | 16 +-
src/pipelines/canu/Output.pm | 7 +
src/pipelines/canu/OverlapBasedTrimming.pm | 3 +-
src/pipelines/canu/OverlapErrorAdjustment.pm | 46 +-
src/pipelines/canu/OverlapInCore.pm | 21 +-
src/pipelines/canu/OverlapMMap.pm | 127 ++-
src/pipelines/canu/OverlapMhap.pm | 43 +-
src/pipelines/canu/OverlapStore.pm | 214 ++--
src/pipelines/canu/Unitig.pm | 20 +-
src/stores/gatekeeperPartition.C | 77 +-
src/stores/gkStore.C | 16 +-
src/stores/ovStoreBucketizer.C | 2 +
src/stores/ovStoreDump.C | 18 +-
src/stores/ovStoreIndexer.C | 2 +-
src/stores/ovStoreSorter.C | 39 +-
src/stores/ovStoreWriter.C | 10 +-
src/stores/tgStore.C | 8 +-
src/stores/tgTig.C | 6 +-
src/utgcns/libNDFalcon/LICENSE | 36 -
src/utgcns/libNDFalcon/dw.C | 359 -------
src/utgcns/libNDFalcon/dw.H | 161 ---
src/utgcns/libcns/unitigConsensus.C | 752 ++++++++++-----
src/utgcns/libcns/unitigConsensus.H | 7 +-
src/utgcns/libpbutgcns/Alignment.C | 113 ---
src/utgcns/libpbutgcns/Alignment.H | 65 +-
src/utgcns/libpbutgcns/AlnGraphBoost.C | 8 +-
src/utgcns/libpbutgcns/AlnGraphBoost.H | 6 +-
src/utgcns/utgcns.C | 185 ++--
112 files changed, 8012 insertions(+), 4455 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000..137f31b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,21 @@
+**Remove this text before submitting your issue!**
+
+Include the canu command used, or at least tell us what options you've set.
+
+Include the output of `canu -version` if it isn't in any outputs. It's reported at the start of the logging, and just before any crash report.
+
+Include what system you're running on. MacOS, Linux, or other? In a virtual machine? On a grid?
+
+FORMATTING TIPS:
+
+Use `single backticks` to highlight words in text.
+```
+Use triple backticks surrounding any pasted-in text.
+This preserves
+ any
+ bizarre
+ formatting
+```
+Use the `Preview` button just above this space to see what the issue will look like.
+
+**Remove this text before submitting your issue!**
diff --git a/README.citation b/README.citation
index 4e922ed..579225b 100644
--- a/README.citation
+++ b/README.citation
@@ -1 +1 @@
-Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation. bioRxiv 071282; doi: http://dx.doi.org/10.1101/071282
+Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation. Genome Research 2017; doi: https://doi.org/10.1101/gr.215087.116
diff --git a/README.md b/README.md
index ccb8252..4b86ec4 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Canu
-Canu is a fork of the [Celera Assembler](http://wgs-assembler.sourceforge.net/wiki/index.php?title=Main_Page), designed for high-noise single-molecule sequencing (such as the [PacBio](http://www.pacb.com) [RS II](http://www.pacb.com/products-and-services/pacbio-systems/rsii/) or [Oxford Nanopore](https://www.nanoporetech.com/) [MinION](https://www.nanoporetech.com/products-services/minion-mki)).
+Canu is a fork of the [Celera Assembler](http://wgs-assembler.sourceforge.net/wiki/index.php?title=Main_Page), designed for high-noise single-molecule sequencing (such as the [PacBio](http://www.pacb.com) [RS II](http://www.pacb.com/products-and-services/pacbio-systems/rsii/)/[Sequel](http://www.pacb.com/products-and-services/pacbio-systems/sequel/) or [Oxford Nanopore](https://www.nanoporetech.com/) [MinION](https://nanoporetech.com/products)).
Canu is a hierarchical assembly pipeline which runs in four steps:
@@ -21,17 +21,17 @@ Alternatively, you can also build the latest unreleased from github:
## Learn:
-The [quick start](http://canu.readthedocs.io/en/stable/quick-start.html) will get you assembling quickly, while the [tutorial](http://canu.readthedocs.io/en/stable/tutorial.html) explains things in more detail.
+The [quick start](http://canu.readthedocs.io/en/latest/quick-start.html) will get you assembling quickly, while the [tutorial](http://canu.readthedocs.io/en/latest/tutorial.html) explains things in more detail.
## Run:
Brief command line help:
- ../<achitechture>/bin/canu
+ ../<architecture>/bin/canu
Full list of parameters:
../<architecture>/bin/canu -options
## Citation:
- - Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. [Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation](http://dx.doi.org/10.1101/071282). bioRxiv. (2016).
+ - Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. [Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation](https://doi.org/10.1101/gr.215087.116). Genome Research. (2017).
diff --git a/addCopyrights-BuildData.pl b/addCopyrights-BuildData.pl
index 41ff558..363377e 100644
--- a/addCopyrights-BuildData.pl
+++ b/addCopyrights-BuildData.pl
@@ -34,6 +34,7 @@ $stoppingCommits{"b2df5790f77d38cc31fe77a7f65360e02389f92e"} = 1; # 04 MAR 20
$stoppingCommits{"1ef335952342ef06ad1651a888f09c312f54dab8"} = 1; # 18 MAY 2016
$stoppingCommits{"bbbdcd063560e5f86006ee6b8b96d2d7b80bb750"} = 1; # 21 NOV 2016
$stoppingCommits{"64459fe33f97f6d23fe036ba1395743d0cdd03e4"} = 1; # 17 APR 2017
+$stoppingCommits{"9e9bd674b705f89817b07ff30067210c2d180f42"} = 1; # 14 AUG 2017
open(F, "< logs") or die "Failed to open 'logs': $!\n";
diff --git a/addCopyrights.dat b/addCopyrights.dat
index 21c0af7..9958697 100644
--- a/addCopyrights.dat
+++ b/addCopyrights.dat
@@ -11836,3 +11836,1023 @@ A src/bogart/AS_BAT_Unitig.C nihh20161121Brian P. Walenz
A addCopyrights-BuildData.pl nihh20161121Brian P. Walenz
A addCopyrights.dat nihh20161121Brian P. Walenz
A addCopyrights.pl nihh20161121Brian P. Walenz
+D src/bogart/AS_BAT_OptimizePositions.C src/bogart/AS_BAT_Unitig.C
+A src/pipelines/canu/Consensus.pm nihh20170811Brian P. Walenz
+A src/overlapBasedTrimming/splitReads.C nihh20170811Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170810Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170810Brian P. Walenz
+A src/bogart/bogart.C nihh20170810Brian P. Walenz
+A src/AS_UTL/timeAndSize.C nihh20170810Brian P. Walenz
+A src/AS_UTL/timeAndSize.H nihh20170810Brian P. Walenz
+A src/AS_global.C nihh20170810Brian P. Walenz
+A documentation/source/faq.rst nihh20170809Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20170809Brian P. Walenz
+A src/bogart/AS_BAT_OptimizePositions.C nihh20170809Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20170808Brian P. Walenz
+A src/bogart/AS_BAT_Logging.H nihh20170808Brian P. Walenz
+A src/bogart/AS_BAT_MergeOrphans.C nihh20170808Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170808Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170808Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170808Brian P. Walenz
+A src/bogart/bogart.C nihh20170808Brian P. Walenz
+A documentation/source/faq.rst nihh20170808Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170808Brian P. Walenz
+A documentation/source/quick-start.rst nihh20170808Brian P. Walenz
+A documentation/source/tutorial.rst nihh20170808Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170808Brian P. Walenz
+A src/overlapBasedTrimming/splitReads-workUnit.C nihh20170808Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170808Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20170804Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170802Brian P. Walenz
+A src/pipelines/canu.pl nihh20170802Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170802Brian P. Walenz
+A documentation/source/tutorial.rst nihh20170802Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170802Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170801Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170801Brian P. Walenz
+A README.md nihh20170731Sergey Koren
+A src/bogart/AS_BAT_OverlapCache.C nihh20170731Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170731Brian P. Walenz
+A src/bogart/bogart.C nihh20170728Brian P. Walenz
+A src/stores/gatekeeperPartition.C nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170728Brian P. Walenz
+A src/stores/gkStore.C nihh20170728Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20170728Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20170728Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170728Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170728Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.C nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20170727Brian P. Walenz
+A src/bogart/bogart.C nihh20170727Brian P. Walenz
+A documentation/source/quick-start.rst nihh20170727Sergey Koren
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170727Sergey Koren
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170727Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20170727Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170725Brian P. Walenz
+A src/minimap/mmapConvert.C nihh20170723Sergey Koren
+A src/pipelines/canu/OverlapMMap.pm nihh20170723Sergey Koren
+A src/bogart/AS_BAT_OptimizePositions.C nihh20170718Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170718Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170718Brian P. Walenz
+A src/bogart/AS_BAT_OptimizePositions.C nihh20170717Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20170717Brian P. Walenz
+A addCopyrights.dat nihh20170717Brian P. Walenz
+A src/bogart/AS_BAT_OptimizePositions.C nihh20170717Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170717Brian P. Walenz
+A src/bogart/bogart.mk nihh20170717Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170717Brian P. Walenz
+A src/bogart/bogart.C nihh20170717Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170717Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20170717Brian P. Walenz
+A src/bogart/bogart.C nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170714Brian P. Walenz
+A src/bogart/bogart.C nihh20170714Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170713Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170713Brian P. Walenz
+A src/pipelines/canu.pl nihh20170713Brian P. Walenz
+A src/pipelines/canu.pl nihh20170713Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_DropDeadEnds.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20170712Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170708Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170707Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20170707Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170706Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170705Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170705Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170705Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170705Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170703Brian P. Walenz
+A src/bogart/bogart.C nihh20170703Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170703Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170703Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170703Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170703Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170702Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_MergeOrphans.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20170629Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170629Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170629Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.H nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170629Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170629Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20170629Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20170628Brian P. Walenz
+A src/meryl/libmeryl.C nihh20170627Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20170627Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170627Brian P. Walenz
+A src/pipelines/canu.pl nihh20170627Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170627Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20170626Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170626Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20170626Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20170626Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20170626Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20170626Brian P. Walenz
+A src/stores/ovStoreIndexer.C nihh20170626Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20170626Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170626Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170626Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170624Brian P. Walenz
+A src/AS_UTL/AS_UTL_reverseComplement.C nihh20170624Brian P. Walenz
+A src/AS_UTL/AS_UTL_reverseComplement.H nihh20170624Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.H nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170623Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20170622Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170622Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.H nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170621Brian P. Walenz
+A src/bogart/bogart.C nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.H nihh20170621Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170620Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170620Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170615Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170615Brian P. Walenz
+A src/bogart/bogart.C nihh20170615Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170614Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.H nihh20170614Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20170614Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.H nihh20170614Brian P. Walenz
+A src/bogart/bogart.C nihh20170614Brian P. Walenz
+A src/bogart/bogart.C nihh20170614Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170614Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170614Brian P. Walenz
+A src/bogart/AS_BAT_DropDeadEnds.C nihh20170612Brian P. Walenz
+A src/bogart/AS_BAT_DropDeadEnds.C nihh20170612Brian P. Walenz
+A src/bogart/bogart.C nihh20170609Brian P. Walenz
+A src/pipelines/canu.pl nihh20170609Brian P. Walenz
+A src/overlapBasedTrimming/splitReads-trimBad.C nihh20170613Sergey Koren
+A src/fastq-utilities/fastqSample.C nihh20170613Sergey Koren
+A src/utgcns/libcns/unitigConsensus.C nihh20170612Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170612Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170612Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170612Brian P. Walenz
+A src/falcon_sense/libfalcon/falcon.C nihh20170609Sergey Koren
+A src/overlapInCore/libedlib/edlib.C nihh20170609Brian P. Walenz
+A src/overlapInCore/libedlib/edlib.H nihh20170609Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170608Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170607Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.H nihh20170607Brian P. Walenz
+A src/bogart/bogart.C nihh20170607Brian P. Walenz
+A src/bogart/bogart.C nihh20170607Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170607Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170607Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170607Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.H nihh20170607Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170607Brian P. Walenz
+A src/utgcns/libboost/boost/config/compiler/intel.hpp nihh20170606Sergey Koren
+A src/bogart/AS_BAT_DropDeadEnds.C nihh20170531Brian P. Walenz
+A src/bogart/AS_BAT_DropDeadEnds.H nihh20170531Brian P. Walenz
+A src/bogart/bogart.C nihh20170531Brian P. Walenz
+A src/bogart/bogart.mk nihh20170531Brian P. Walenz
+A src/bogart/bogart.C nihh20170530Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170530Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170530Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20170525Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20170525Brian P. Walenz
+A src/pipelines/canu.pl nihh20170522Brian P. Walenz
+A src/pipelines/canu.pl nihh20170522Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170519Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170518Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170518Brian P. Walenz
+A documentation/source/faq.rst nihh20170518Sergey Koren
+A src/AS_UTL/writeBuffer.H nihh20170517Sergey Koren
+A src/stores/gkStore.C nihh20170517Sergey Koren
+A .github/ISSUE_TEMPLATE.md nihh20170517Brian P. Walenz
+A src/pipelines/canu.pl nihh20170516Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170516Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170516Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170516Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170516Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170515Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170512Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170512Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170512Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170512Brian P. Walenz
+A src/gfa/gfa.C nihh20170512Brian P. Walenz
+A src/gfa/gfa.H nihh20170512Brian P. Walenz
+A src/gfa/gfa.C nihh20170512Brian P. Walenz
+A src/gfa/bed.C nihh20170512Brian P. Walenz
+A src/gfa/bed.H nihh20170512Brian P. Walenz
+A src/main.mk nihh20170512Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170512Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20170512Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170512Sergey Koren
+A src/canu_version_update.pl nihh20170511Brian P. Walenz
+A src/Makefile nihh20170511Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20170510Brian P. Walenz
+A src/stores/tgStore.C nihh20170510Brian P. Walenz
+A src/stores/tgTig.C nihh20170510Brian P. Walenz
+A src/bogart/AS_BAT_MergeOrphans.C nihh20170510Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20170509Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20170509Brian P. Walenz
+A src/overlapInCore/liboverlap/Binomial_Bound.C nihh20170509Brian P. Walenz
+A src/main.mk nihh20170509Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170509Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.H nihh20170509Brian P. Walenz
+A src/utgcns/libpbutgcns/Alignment.H nihh20170509Brian P. Walenz
+A src/utgcns/libpbutgcns/AlnGraphBoost.C nihh20170509Brian P. Walenz
+A src/utgcns/libpbutgcns/AlnGraphBoost.H nihh20170509Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170509Brian P. Walenz
+A src/overlapInCore/libedlib/edlib.C nihh20170508Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20170508Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170508Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170508Brian P. Walenz
+A src/utgcns/utgcns.C nihh20170508Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170509Brian P. Walenz
+A src/falcon_sense/falcon_sense.C nihh20170503Sergey Koren
+A src/pipelines/canu/Consensus.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170425Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170425Brian P. Walenz
+A documentation/source/index.rst nihh20170422Sergey Koren
+A README.md nihh20170420Sergey Koren
+A README.md nihh20170420Sergey Koren
+A README.citation nihh20170420Sergey Koren
+A buildRelease.sh nihh20170420Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170420Sergey Koren
+A src/pipelines/canu/Grid_LSF.pm nihh20170420Sergey Koren
+A src/pipelines/canu.pl nihh20170420Sergey Koren
+A src/pipelines/canu/Consensus.pm nihh20170420Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20170420Sergey Koren
+A documentation/source/faq.rst nihh20170417Sergey Koren
+A src/canu_version_update.pl nihh20170417Brian P. Walenz
+A addCopyrights-BuildData.pl nihh20170417Brian P. Walenz
+A addCopyrights.dat nihh20170417Brian P. Walenz
+A addCopyrights.pl nihh20170417Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20170417Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20170417Brian P. Walenz
+A src/bogart/AS_BAT_MergeOrphans.H nihh20170417Brian P. Walenz
+A src/falcon_sense/falcon_sense.C nihh20170417Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170417Brian P. Walenz
+A src/gfa/gfa.C nihh20170417Brian P. Walenz
+A src/gfa/gfa.H nihh20170417Brian P. Walenz
+A src/meryl/compare-counts.C nihh20170417Brian P. Walenz
+A src/meryl/maskMers.C nihh20170417Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-allocateMoreSpace.C nihh20170417Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-forward.C nihh20170417Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-reverse.C nihh20170417Brian P. Walenz
+A src/overlapInCore/overlapPair.C nihh20170417Brian P. Walenz
+A src/pipelines/canu-object-store.pl nihh20170417Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/Grid_DNANexus.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170417Brian P. Walenz
+A src/pipelines/canu/Report.pm nihh20170417Brian P. Walenz
+A src/pipelines/simple-repeat-test.pl nihh20170417Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170417Brian P. Walenz
+A src/pipelines/simple-repeat-test.pl nihh20170417Brian P. Walenz
+A src/stores/ovStore.C nihh20170417Brian P. Walenz
+A documentation/source/faq.rst nihh20170413Sergey Koren
+A documentation/source/conf.py nihh20170413Sergey Koren
+A documentation/source/quick-start.rst nihh20170413Sergey Koren
+A src/stores/ovStore.C nihh20170412Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170411Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170411Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170411Brian P. Walenz
+A src/AS_global.C nihh20170411Brian P. Walenz
+A src/canu_version_update.pl nihh20170411Brian P. Walenz
+A src/pipelines/canu/Grid_SGE.pm nihh20170407Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170407Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170407Brian P. Walenz
+A documentation/source/tutorial.rst nihh20170407Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170407Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170407Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170407Brian P. Walenz
+A src/pipelines/canu.pl nihh20170407Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170407Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170407Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170407Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170407Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170407Brian P. Walenz
+A src/stores/ovStore.C nihh20170406Brian P. Walenz
+A src/stores/ovStore.H nihh20170406Brian P. Walenz
+A documentation/source/faq.rst nihh20170405Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20170405Brian P. Walenz
+A src/meryl/estimate-mer-threshold.C nihh20170405Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170404Brian P. Walenz
+A src/gfa/gfa.C nihh20170404Brian P. Walenz
+A src/gfa/gfa.H nihh20170404Brian P. Walenz
+A src/gfa/alignGFA.C nihh20170404Brian P. Walenz
+A src/gfa/gfa.C nihh20170404Brian P. Walenz
+A src/gfa/gfa.H nihh20170404Brian P. Walenz
+A src/main.mk nihh20170404Brian P. Walenz
+A src/main.mk nihh20170404Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20170404Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20170404Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170403Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170330Brian P. Walenz
+A documentation/source/faq.rst nihh20170330Brian P. Walenz
+A src/stores/gatekeeperPartition.C nihh20170330Brian P. Walenz
+A src/pipelines/canu.pl nihh20170329Brian P. Walenz
+A src/stores/gatekeeperPartition.C nihh20170329Brian P. Walenz
+A src/stores/gkStore.C nihh20170329Brian P. Walenz
+A src/stores/gkStore.H nihh20170329Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170329Brian P. Walenz
+A src/canu_version_update.pl nihh20170329Brian P. Walenz
+A src/main.mk nihh20170329Brian P. Walenz
+A src/utgcns/alignGFA.C nihh20170329Brian P. Walenz
+A src/utgcns/alignGFA.mk nihh20170329Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20170329Brian P. Walenz
+A documentation/source/faq.rst nihh20170328Sergey Koren
+A documentation/source/conf.py nihh20170328Sergey Koren
+A src/pipelines/canu/Gatekeeper.pm nihh20170327Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20170327Brian P. Walenz
+A src/stores/tgTig.C nihh20170327Brian P. Walenz
+A src/stores/tgTig.H nihh20170327Brian P. Walenz
+A src/stores/tgTigMultiAlignDisplay.C nihh20170327Brian P. Walenz
+A src/pipelines/canu.pl nihh20170324Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/LICENSE nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/README nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/atomic.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/backtrace-supported.h nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/backtrace.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/backtrace.h nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/config.h nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/dwarf.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/elf.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/fileline.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/internal.h nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/make.out nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/make.sh nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/mmap.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/mmapio.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/posix.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/print.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/simple.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/sort.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/state.c nihh20170322Brian P. Walenz
+A src/AS_UTL/libbacktrace/unknown.c nihh20170322Brian P. Walenz
+A src/Makefile nihh20170322Brian P. Walenz
+A src/main.mk nihh20170322Brian P. Walenz
+A src/Makefile nihh20170322Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20170322Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.H nihh20170322Brian P. Walenz
+A src/AS_global.C nihh20170322Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20170322Brian P. Walenz
+A src/pipelines/canu.pl nihh20170321Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170321Brian P. Walenz
+A src/pipelines/canu.pl nihh20170321Brian P. Walenz
+A src/pipelines/canu.pl nihh20170321Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170321Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170321Brian P. Walenz
+A src/pipelines/canu.pl nihh20170320Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170320Brian P. Walenz
+A src/AS_global.C nihh20170320Brian P. Walenz
+A src/AS_global.C nihh20170320Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170320Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu.pl nihh20170320Brian P. Walenz
+A src/pipelines/canu/Report.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170320Brian P. Walenz
+A src/Makefile nihh20170320Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Report.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170320Brian P. Walenz
+A src/pipelines/canu/Grid_SGE.pm nihh20170316Brian P. Walenz
+A documentation/source/quick-start.rst nihh20170316Brian P. Walenz
+A documentation/source/faq.rst nihh20170316Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170315Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20170315Sergey Koren
+A src/pipelines/sanity/sanity.sh nihh20170314Brian P. Walenz
+A src/pipelines/sanity/medium.arabidopsis_thaliana.pacbio.p4c2.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/medium.arabidopsis_thaliana.pacbio.p5c3.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/medium.caenorhabditis_elegans.pacbio.p6c4.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/medium.drosophila_melanogaster.pacbio.p5c3.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-1000.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-5000.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.sra-3000.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.pacbio.p6.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p4.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p5.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_o157_h7_str_f8092b.pacbio.p4c2.average.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_o157_h7_str_f8092b.pacbio.p4c2.long.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.francisella_tularensis.pacbio.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_glbrcy22-3.pacbio.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_glbrcy22-3.pacbio.sra.spec nihh20170314Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_s288c.pacbio.spec nihh20170314Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170314Brian P. Walenz
+A src/pipelines/canu-object-store.pl nihh20170314Brian P. Walenz
+A src/overlapInCore/overlapPair.C nihh20170310Brian P. Walenz
+A src/overlapInCore/libedlib/edlib.C nihh20170310Brian P. Walenz
+A src/overlapInCore/libedlib/edlib.H nihh20170310Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170310Brian P. Walenz
+A src/pipelines/sanity/medium.caenorhabditis_elegans.pacbio.p6c4.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/sanity.NOTES nihh20170310Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.bacillus_anthracis_sterne.nanopore.34F2_NBI0483991.poretools.2D.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.all.2d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-1.2d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-2.2d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-pcr-1.2d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-pcr-2.2d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.r9.4.superlong.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.r9.SpotOn.1d.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.pacbio.p6.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p4.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p5.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_o157_h7_str_f8092b.pacbio.p4c2.long.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_s288c.nanopore.r7.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_s288c.nanopore.r9.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_w303.nanopore.poretools.2D.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/small.yersinia_pestis.nanopore.NBI0499872.poretools.2D.spec nihh20170310Brian P. Walenz
+A src/pipelines/sanity/success.caenorhabditis_elegans.sh nihh20170310Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170309Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20170309Sergey Koren
+A src/pipelines/canu/Meryl.pm nihh20170307Sergey Koren
+A src/pipelines/canu/OverlapMMap.pm nihh20170307Sergey Koren
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170303Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20170302Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170223Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170302Sergey Koren
+A src/pipelines/canu/OverlapStore.pm nihh20170227Brian P. Walenz
+A src/main.mk nihh20170224Brian P. Walenz
+A src/merTrim/merTrim.C nihh20170224Brian P. Walenz
+A src/merTrim/merTrim.mk nihh20170224Brian P. Walenz
+A src/merTrim/merTrimResult.H nihh20170224Brian P. Walenz
+A src/main.mk nihh20170224Brian P. Walenz
+A src/meryl/existDB.mk nihh20170224Brian P. Walenz
+A src/meryl/positionDB.mk nihh20170224Brian P. Walenz
+A src/main.mk nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB-create-from-fasta.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB-create-from-meryl.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB-create-from-sequence.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB-state.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/existDB.H nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB-access.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB-dump.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB-file.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB-mismatch.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB-sort.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB.C nihh20170224Brian P. Walenz
+A src/meryl/libkmer/positionDB.H nihh20170224Brian P. Walenz
+A addCopyrights.dat nihh20170224Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170224Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170223Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170223Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170223Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170222Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170222Brian P. Walenz
+A src/stores/tgStoreCoverageStat.C nihh20170222Brian P. Walenz
+A src/pipelines/canu.pl nihh20170222Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170222Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170222Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170222Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/Grid_DNANexus.pm nihh20170221Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170221Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20170221Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20170221Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20170221Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170221Brian P. Walenz
+A documentation/reST-markup-hints nihh20170221Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170221Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170220Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20170220Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170215Brian P. Walenz
+A src/Makefile nihh20170215Brian P. Walenz
+A src/pipelines/canu.pl nihh20170215Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Grid_Cloud.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170215Brian P. Walenz
+A src/pipelines/canu.pl nihh20170213Brian P. Walenz
+A src/Makefile nihh20170211Brian P. Walenz
+A src/pipelines/canu.pl nihh20170211Brian P. Walenz
+A src/pipelines/canu/Grid_DNANexus.pm nihh20170211Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170204Brian P. Walenz
+A src/utgcns/libNDFalcon/dw.C nihh20170214Sergey Koren
+A src/utgcns/libcns/unitigConsensus.C nihh20170214Sergey Koren
+A src/utgcns/libcns/unitigConsensus.C nihh20170214Sergey Koren
+A src/utgcns/libcns/unitigConsensus.C nihh20170214Sergey Koren
+A src/pipelines/canu.pl nihh20170213Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170213Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170209Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170209Sergey Koren
+A src/pipelines/canu/Consensus.pm nihh20170203Sergey Koren
+A src/utgcns/libcns/unitigConsensus.C nihh20170203Sergey Koren
+A src/utgcns/libcns/unitigConsensus.H nihh20170203Sergey Koren
+A src/utgcns/utgcns.C nihh20170203Sergey Koren
+A src/overlapInCore/liboverlap/prefixEditDistance-allocateMoreSpace.C nihh20170202Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-extend.C nihh20170202Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-forward.C nihh20170202Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance-reverse.C nihh20170202Brian P. Walenz
+A src/overlapInCore/liboverlap/prefixEditDistance.H nihh20170202Brian P. Walenz
+A src/overlapInCore/overlapInCore-Process_String_Overlaps.C nihh20170202Brian P. Walenz
+A src/pipelines/canu.pl nihh20170202Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170202Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170202Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170202Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170202Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20170201Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20170201Brian P. Walenz
+A src/stores/ovStoreHistogram.H nihh20170201Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170201Brian P. Walenz
+A documentation/source/tutorial.rst nihh20170201Brian P. Walenz
+A src/bogart/bogart.C nihh20170127Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170124Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170124Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170124Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170124Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170117Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170130Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170127Brian P. Walenz
+A documentation/source/quick-start.rst nihh20170127Brian P. Walenz
+A documentation/source/tutorial.rst nihh20170127Brian P. Walenz
+A src/pipelines/canu.pl nihh20170127Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170127Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170126Brian P. Walenz
+A src/pipelines/canu.pl nihh20170126Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170126Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170126Brian P. Walenz
+A src/falcon_sense/libfalcon/falcon.C nihh20170124Brian P. Walenz
+A src/falcon_sense/libfalcon/falcon.H nihh20170124Brian P. Walenz
+A src/falcon_sense/falcon_sense.C nihh20170124Brian P. Walenz
+A src/falcon_sense/libfalcon/falcon.C nihh20170124Brian P. Walenz
+A src/falcon_sense/libfalcon/falcon.H nihh20170124Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170123Brian P. Walenz
+A src/Makefile nihh20170123Brian P. Walenz
+A src/Makefile nihh20170123Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20170123Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20170120Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20170120Brian P. Walenz
+A src/pipelines/canu.pl nihh20170120Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20170120Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170120Brian P. Walenz
+A src/pipelines/canu.pl nihh20170119Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/OverlapBasedTrimming.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170119Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20170119Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20170116Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20170113Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20170112Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20170112Brian P. Walenz
+A src/bogart/bogart.C nihh20170112Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170110Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170110Brian P. Walenz
+A src/stores/ovStore.H nihh20170110Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20170110Brian P. Walenz
+A src/stores/ovStoreHistogram.H nihh20170110Brian P. Walenz
+A src/mhap/mhapConvert.C nihh20170110Brian P. Walenz
+A src/minimap/mmapConvert.C nihh20170110Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20170110Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20170110Brian P. Walenz
+A src/pipelines/parallel-ovl-store-test.sh nihh20170109Brian P. Walenz
+A documentation/reST-markup-hints nihh20170109Brian P. Walenz
+A src/pipelines/simple-repeat-test.pl nihh20170109Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20170109Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Grid_LSF.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Grid_SGE.pm nihh20170107Brian P. Walenz
+A src/pipelines/canu/Grid_Slurm.pm nihh20170107Brian P. Walenz
+A src/fastq-utilities/fastqSimulate.C nihh20170106Brian P. Walenz
+A src/falcon_sense/falcon_sense.mk nihh20170106Sergey Koren
+A src/falcon_sense/libfalcon/falcon.C nihh20170106Sergey Koren
+A src/falcon_sense/libfalcon/falcon.H nihh20170106Sergey Koren
+A src/main.mk nihh20170106Sergey Koren
+A src/overlapInCore/libedlib/edlib.C nihh20170106Sergey Koren
+A src/overlapInCore/libedlib/edlib.H nihh20170106Sergey Koren
+A documentation/source/parameter-reference.rst nihh20170106Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20170106Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170106Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20170106Brian P. Walenz
+A src/pipelines/canu.pl nihh20170106Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20170106Brian P. Walenz
+A src/pipelines/sanity/sanity.pl nihh20170104Brian P. Walenz
+A src/Makefile nihh20170104Brian P. Walenz
+A src/stores/ovOverlap.C nihh20170104Brian P. Walenz
+A src/stores/ovOverlap.H nihh20170104Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20170104Brian P. Walenz
+A src/stores/ovStore.H nihh20170104Brian P. Walenz
+A src/correction/generateCorrectionLayouts.C nihh20170103Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20170103Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20170103Brian P. Walenz
+A src/stores/ovStoreFilter.C nihh20170103Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.arabidopsis_thaliana.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.bacillus_anthracis_sterne.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.bibersteinia_trehalosi.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.drosophila_melanogaster.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_k12.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_ne92.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_o157_h7_str_f8092b.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.francisella_tularensis.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.yersinia_pestis_i195.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/small.bacillus_anthracis_sterne.nanopore.34F2_NBI0483991.poretools.2D.spec nihh20161225Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20161225Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_glbrcy22-3.pacbio.spec nihh20161225Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_glbrcy22-3.pacbio.sra.spec nihh20161225Brian P. Walenz
+A src/pipelines/sanity/small.saccharomyces_cerevisiae_s288c.pacbio.spec nihh20161225Brian P. Walenz
+A src/pipelines/sanity/success.saccharomyces_cerevisiae_s288c.sh nihh20161225Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161225Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20161222Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-1000.spec nihh20161222Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-5000.spec nihh20161222Brian P. Walenz
+A src/pipelines/sanity/medium.arabidopsis_thaliana.pacbio.p4c2.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/medium.arabidopsis_thaliana.pacbio.p5c3.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/medium.drosophila_melanogaster.pacbio.p5c3.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.bacillus_anthracis_sterne.nanopore.34F2_NBI0483991.poretools.2D.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-1000.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.h5-5000.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.bibersteinia_trehalosi.pacbio.sra-1000.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.all.2d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-1.2d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-2.2d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-pcr-1.2d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.map006-pcr-2.2d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.nanopore.r9.SpotOn.1d.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_k12.pacbio.p6.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p4.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_ne92.pacbio.p5.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_o157_h7_str_f8092b.pacbio.p4c2.average.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.escherichia_coli_o157_h7_str_f8092b.pacbio.p4c2.long.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.francisella_tularensis.pacbio.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/small.yersinia_pestis.nanopore.NBI0499872.poretools.2D.spec nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.arabidopsis_thaliana.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.bacillus_anthracis_sterne.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.bibersteinia_trehalosi.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.drosophila_melanogaster.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_k12.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_ne92.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.escherichia_coli_o157_h7_str_f8092b.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.francisella_tularensis.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/success.yersinia_pestis_i195.sh nihh20161220Brian P. Walenz
+A src/pipelines/sanity/sanity.pl nihh20161220Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161219Brian P. Walenz
+A src/stores/tgStoreLoad.C nihh20161219Brian P. Walenz
+A src/stores/tgStore.C nihh20161219Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20161219Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20161219Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20161219Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20161219Brian P. Walenz
+A src/meryl/estimate-mer-threshold.C nihh20161216Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20161216Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20161214Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161214Brian P. Walenz
+A src/meryl/meryl-build.C nihh20161214Brian P. Walenz
+A src/correction/generateCorrectionLayouts.C nihh20161214Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20161214Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20161214Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20161214Brian P. Walenz
+A buildRelease.sh nihh20161213Brian P. Walenz
+A buildRelease.sh nihh20161213Brian P. Walenz
+A README.licenses nihh20161213Brian P. Walenz
+A src/utgcns/libpbutgcns/LICENSE nihh20161213Brian P. Walenz
+A buildRelease.sh nihh20161213Brian P. Walenz
+A src/pipelines/canu.pl nihh20161212Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161212Brian P. Walenz
+A src/pipelines/canu/ErrorEstimate.pm nihh20161212Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161209Brian P. Walenz
+A src/stores/ovOverlap.C nihh20161208Brian P. Walenz
+A src/stores/ovOverlap.H nihh20161208Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20161208Brian P. Walenz
+A addCopyrights.dat nihh20161207Brian P. Walenz
+A addCopyrights.pl nihh20161207Brian P. Walenz
+A src/bogart/bogart.C nihh20161207Brian P. Walenz
+A src/bogart/bogart.mk nihh20161207Brian P. Walenz
+A src/main.mk nihh20161206Brian P. Walenz
+A src/meryl/maskMers.C nihh20161206Brian P. Walenz
+A src/meryl/maskMers.mk nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20161206Brian P. Walenz
+A src/bogart/bogart.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.C nihh20161206Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20161206Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20161205Brian P. Walenz
+A src/stores/ovStoreStats.C nihh20161205Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.H nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.H nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.H nihh20161202Brian P. Walenz
+A src/bogart/bogart.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.H nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.H nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.H nihh20161202Brian P. Walenz
+A src/bogart/bogart.C nihh20161202Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161201Brian P. Walenz
+A src/bogart/bogart.C nihh20161201Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161201Brian P. Walenz
+A src/stores/gkStore.C nihh20161130Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20161130Brian P. Walenz
+A src/stores/gkStore.C nihh20161130Brian P. Walenz
+A src/stores/ovStore.C nihh20161130Brian P. Walenz
+A src/fastq-utilities/fastqSimulate.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20161130Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapInCore-Build_Hash_Index.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapInCore.H nihh20161130Brian P. Walenz
+A src/stores/gkStore.C nihh20161130Brian P. Walenz
+A src/stores/ovOverlap.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20161130Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20161130Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161130Brian P. Walenz
+A src/erateEstimate/erateEstimate.C nihh20161130Brian P. Walenz
+A src/stores/ovStore.H nihh20161130Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20161130Brian P. Walenz
+A src/stores/ovStore.H nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20161130Brian P. Walenz
+A src/stores/ovStore.C nihh20161130Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20161129Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161129Brian P. Walenz
+A src/fastq-utilities/fastqSample.C nihh20161129Brian P. Walenz
+A src/fastq-utilities/fastqSimulate.C nihh20161129Brian P. Walenz
+A src/meryl/leaff-partition.C nihh20161129Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Read_Frags.C nihh20161129Brian P. Walenz
+A src/stores/gatekeeperCreate.C nihh20161129Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161129Brian P. Walenz
+A src/utgcns/stashContains.C nihh20161129Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161129Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps.H nihh20161129Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.H nihh20161129Brian P. Walenz
+A src/utgcns/libNDFalcon/dw.H nihh20161129Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.H nihh20161129Brian P. Walenz
+A src/main.mk nihh20161129Brian P. Walenz
+A src/stores/gkStore.H nihh20161129Brian P. Walenz
+A src/stores/gkStore.H nihh20161129Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20161129Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20161129Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20161129Brian P. Walenz
+A src/overlapInCore/overlapInCore.H nihh20161129Brian P. Walenz
+A src/bogart/bogart.C nihh20161129Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161129Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20161129Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161129Brian P. Walenz
+A src/stores/ovStoreIndexer.C nihh20161129Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20161129Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20161129Brian P. Walenz
+A src/falcon_sense/falcon_sense.C nihh20161128Brian P. Walenz
+A src/meryl/libmeryl.C nihh20161123Brian P. Walenz
+A src/meryl/meryl-build.C nihh20161122Brian P. Walenz
+A src/stores/tgTig.C nihh20161122Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161122Brian P. Walenz
+A src/overlapInCore/overlapInCore-Output.C nihh20161122Brian P. Walenz
+A src/stores/gatekeeperDumpFASTQ.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20161122Brian P. Walenz
+A src/meryl/leaff-statistics.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161122Brian P. Walenz
+A src/meryl/libleaff/gkStoreFile.C nihh20161122Brian P. Walenz
+A src/meryl/libleaff/fastqStdin.C nihh20161122Brian P. Walenz
+A src/stores/tgTigMultiAlignDisplay.C nihh20161122Brian P. Walenz
+A src/stores/ovOverlap.H nihh20161122Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20161122Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.H nihh20161122Brian P. Walenz
+A src/utgcns/libcns/abAbacus-refine.C nihh20161122Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20161122Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20161122Brian P. Walenz
+A src/AS_global.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161122Brian P. Walenz
+A src/bogart/bogart.C nihh20161122Brian P. Walenz
+A src/bogart/buildGraph.C nihh20161122Brian P. Walenz
+A src/bogus/bogus.C nihh20161122Brian P. Walenz
+A src/bogus/bogusness.C nihh20161122Brian P. Walenz
+A src/correction/filterCorrectionOverlaps.C nihh20161122Brian P. Walenz
+A src/correction/generateCorrectionLayouts.C nihh20161122Brian P. Walenz
+A src/erateEstimate/erateEstimate.C nihh20161122Brian P. Walenz
+A src/falcon_sense/createFalconSenseInputs.C nihh20161122Brian P. Walenz
+A src/fastq-utilities/fastqSample.C nihh20161122Brian P. Walenz
+A src/fastq-utilities/fastqSimulate.C nihh20161122Brian P. Walenz
+A src/merTrim/merTrim.C nihh20161122Brian P. Walenz
+A src/mercy/mercy.C nihh20161122Brian P. Walenz
+A src/meryl/compare-counts.C nihh20161122Brian P. Walenz
+A src/meryl/leaff-partition.C nihh20161122Brian P. Walenz
+A src/meryl/leaff.C nihh20161122Brian P. Walenz
+A src/meryl/libmeryl.C nihh20161122Brian P. Walenz
+A src/meryl/maskMers.C nihh20161122Brian P. Walenz
+A src/meryl/meryl-args.C nihh20161122Brian P. Walenz
+A src/meryl/meryl-build.C nihh20161122Brian P. Walenz
+A src/overlapBasedTrimming/splitReads.C nihh20161122Brian P. Walenz
+A src/overlapBasedTrimming/trimReads-bestEdge.C nihh20161122Brian P. Walenz
+A src/overlapBasedTrimming/trimReads.C nihh20161122Brian P. Walenz
+A src/overlapBasedTrimming/trimStat.H nihh20161122Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20161122Brian P. Walenz
+A src/stores/gatekeeperCreate.C nihh20161122Brian P. Walenz
+A src/stores/gatekeeperDumpFASTQ.C nihh20161122Brian P. Walenz
+A src/stores/gatekeeperPartition.C nihh20161122Brian P. Walenz
+A src/stores/gkStore.C nihh20161122Brian P. Walenz
+A src/stores/ovStore.C nihh20161122Brian P. Walenz
+A src/stores/ovStore.H nihh20161122Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreStats.C nihh20161122Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20161122Brian P. Walenz
+A src/stores/tgStore.C nihh20161122Brian P. Walenz
+A src/stores/tgStore.H nihh20161122Brian P. Walenz
+A src/stores/tgStoreCoverageStat.C nihh20161122Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161122Brian P. Walenz
+A src/stores/tgStoreFilter.C nihh20161122Brian P. Walenz
+A src/stores/tgTig.C nihh20161122Brian P. Walenz
+A src/stores/tgTigMultiAlignDisplay.C nihh20161122Brian P. Walenz
+A src/utgcns/utgcns.C nihh20161122Brian P. Walenz
+A src/stores/tgTig.C nihh20161122Brian P. Walenz
+A src/stores/tgTig.C nihh20161122Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20161122Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20161122Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20161122Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20161122Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161121Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161121Brian P. Walenz
+A addCopyrights-BuildData.pl nihh20161121Brian P. Walenz
+A addCopyrights.dat nihh20161121Brian P. Walenz
+A addCopyrights.pl nihh20161121Brian P. Walenz
diff --git a/addCopyrights.pl b/addCopyrights.pl
index 15a0c1f..6a47ccf 100644
--- a/addCopyrights.pl
+++ b/addCopyrights.pl
@@ -2,13 +2,36 @@
use strict;
-my @dateStrings = ( "???", "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" );
+# To run this:
+#
+# Update the copyright data file by appending info on new commits:
+# perl addCopyrights-BuildData.pl >> addCopyrights.dat
+#
+# Update copyright on each file, writing to new files:
+# perl addCopyrights.pl -test
+#
+# Update copyright on specific files by listing them at the end:
+# perl addCopyrights.pl -test src/bogart/bogart.C
+#
+# All files get rewritten, even if there are no changes. If not running in 'test' mode
+# you can use git to see what changes, and to verify they look sane.
+#
+# Once source files are updated, update addCopyright-BuildData.pl with the last
+# commit hash and commit those changes (both the dat and pl).
+#
+#
# If set, rename original files to name.ORIG, rewrite files with updated copyright text.
# If not, create new name.MODIFIED files with updated copyright text.
#
+
my $doForReal = 1;
+if ($ARGV[0] eq "-test") {
+ shift @ARGV;
+ $doForReal = 0;
+}
+
#
# The change data 'addCopyrights.dat' contains lines of two types:
#
@@ -26,7 +49,6 @@ my $doForReal = 1;
# of the original name need to be updated to the new name.
#
-
sub toList (@) {
my @all = sort { $a <=> $b } @_;
my $ret;
@@ -76,6 +98,8 @@ sub splitAC ($@) {
my @AC = @_;
my @AClist;
+ my @dateStrings = ( "???", "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" );
+
my %dates;
foreach my $ac (@AC) {
@@ -405,8 +429,6 @@ foreach my $file (@filesToProcess) {
if ($doForReal) {
my $perms = `stat -f %p $file`; chomp $perms; $perms = substr($perms, -3);
- #rename "$file", "$file.ORIG";
-
open(F, "> $file") or die "Failed to open '$file' for writing: $!\n";
print F @lines;
close(F);
diff --git a/buildRelease.sh b/buildRelease.sh
index a081df9..db4e86a 100644
--- a/buildRelease.sh
+++ b/buildRelease.sh
@@ -7,12 +7,27 @@ if [ x$version = x ] ; then
exit
fi
-git clone git at github.com:marbl/canu.git
-
-mv canu canu-$version
+# From the tarball
+if [ ! -e canu-$version.tar.gz ] ; then
+ echo Fetch.
+ curl -L -R -o canu-$version.tar.gz https://github.com/marbl/canu/archive/v$version.tar.gz
+fi
+if [ ! -d canu-$version ] ; then
+ echo Unpack.
+ gzip -dc canu-$version.tar.gz | tar -xf -
+fi
cd canu-$version
-git tag v$version
+
+# From the repo
+
+#git clone git at github.com:marbl/canu.git
+#mv canu canu-$version
+#cd canu-$version
+#git tag v$version
+#git checkout v$version
+
+echo Build MacOS.
cd src
gmake -j 12 > ../Darwin-amd64.out 2>&1
cd ../..
@@ -20,19 +35,21 @@ cd ../..
rm -f canu-$version/linux.sh
echo >> canu-$version/linux.sh \#\!/bin/bash
-echo >> canu-$version/linux.sh yum install -y git
+#echo >> canu-$version/linux.sh yum install -y git
echo >> canu-$version/linux.sh cd /build/canu-$version/src
echo >> canu-$version/linux.sh gmake -j 12 \> ../Linux-amd64.out 2\>\&1
echo >> canu-$version/linux.sh cd ../..
-echo >> canu-$version/linux.sh tar -cf canu-$version/README* canu-$version.Darwin-amd64.tar canu-$version/Darwin-amd64
-echo >> canu-$version/linux.sh tar -cf canu-$version/README* canu-$version.Linux-amd64.tar canu-$version/Linux-amd64
+echo >> canu-$version/linux.sh rm -rf canu-$version/Darwin-amd64/obj
+echo >> canu-$version/linux.sh rm -rf canu-$version/Linux-amd64/obj
+echo >> canu-$version/linux.sh tar -cf canu-$version.Darwin-amd64.tar canu-$version/README* canu-$version/Darwin-amd64
+echo >> canu-$version/linux.sh tar -cf canu-$version.Linux-amd64.tar canu-$version/README* canu-$version/Linux-amd64
chmod 755 canu-$version/linux.sh
+echo Build Linux and make tarballs.
docker run -v `pwd`:/build -t -i --rm phusion/holy-build-box-64:latest /hbb_exe/activate-exec bash /build/canu-$version/linux.sh
-rm -rf canu-$version/*-amd64/obj
-
+echo Compress.
xz -9v canu-$version.Darwin-amd64.tar
xz -9v canu-$version.Linux-amd64.tar
diff --git a/documentation/source/conf.py b/documentation/source/conf.py
index 35dddac..9860a14 100644
--- a/documentation/source/conf.py
+++ b/documentation/source/conf.py
@@ -55,9 +55,9 @@ copyright = u'2015, Adam Phillippy, Sergey Koren, Brian Walenz'
# built documents.
#
# The short X.Y version.
-version = '1.5'
+version = '1.6'
# The full version, including alpha/beta/rc tags.
-release = '1.5'
+release = '1.6'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/documentation/source/faq.rst b/documentation/source/faq.rst
index 50cd03f..2147eba 100644
--- a/documentation/source/faq.rst
+++ b/documentation/source/faq.rst
@@ -74,6 +74,9 @@ What parameters should I use for my reads?
slightly decrease the maximum allowed difference from the default of 4.5% to 4.0% with
``correctedErrorRate=0.040 corMhapSensitivity=normal``. For recent Sequel data, the defaults
are appropriate.
+
+ **Nanopore R9 large genomes**
+ Due to some systematic errors, the identity estimate used by Canu for correction can be an over-estimate of true error, inflating runtime. For recent large genomes (>1gbp) we've used ``'corMhapOptions=--threshold 0.8 --num-hashes 512 --ordered-sketch-size 1000 --ordered-kmer-size 14'``. This can be used with 30x or more of coverage, below that the defaults are OK.
My assembly continuity is not good, how can I improve it?
@@ -104,6 +107,8 @@ My assembly continuity is not good, how can I improve it?
information on tuning unitigging in those instances.
+.. _tweak:
+
What parameters can I tweak?
-------------------------------------
For all stages:
@@ -188,50 +193,30 @@ What parameters can I tweak?
primarily an optimization for speed and generally does not change assembly continuity.
-
My asm.contigs.fasta is empty, why?
-------------------------------------
- Canu will split the final output into three files:
-
- <prefix>.contigs.fasta
- Everything which could be assembled and is part of the primary assembly, including both unique
- and repetitive elements. Each contig has several flags included on the fasta def line.
-
- **This file currently includes alternate paths.**
-
- <prefix>.bubbles.fasta
- Alternate paths in the graph which could not be merged into the primary assembly.
-
- **This file is currently ALWAYS empty.**
-
- <prefix>.unassembled.fasta
- Reads and small contigs that appear to be falsely assembled. These are generally low quality
- reads or assemblies of a few low quality reads.
+ Canu creates three assembled sequence :ref:`output files <outputs>`: ``<prefix>.contigs.fasta``,
+ ``<prefix>.unitigs.fasta``, and ``<prefix>.unassembled.fasta``, where contigs are the primary
+ output, unitigs are the primary output split at alternate paths,
+ and unassembled are the leftover pieces.
- **Small plasmids (unfortunately) tend to end up here.**
-
- The ``contigFilter=<minReads minLength singleReadSpan lowCovFraction lowCovDepth>`` parameter
- sets parameters for several filters that decide which contigs are 'unassembled'. A contig is
- 'unassembled' if it:
- - has fewer than minReads (2) reads, or
- - is shorter than minLength (1000), or
- - has a single read spanning singleReadSpan percent (75%) of the contig, or
- - has less than lowCovDepth (2) coverage over at least lowCovSpan fraction (0.75) of the contig
- The default filtering is ``contigFilter="2 1000 0.75 0.75 2"``.
-
- If you are assembling amplified or viral data, it is possible your assembly will be flagged as
- unassembled. Turn off filtering with the parameters ``contigFilter="2 1000 1.0 1.0 2"``.
+ The :ref:`contigFilter` parameter sets several parameters that control how small or low coverage
+ initial contigs are handled. By default, initial contigs with more than 50% of the length at
+ less than 5X coverage will be classified as 'unassembled' and removed from the assembly, that
+ is, ``contigFilter="2 0 1.0 0.5 5"``. The filtering can be disabled by changing the last number
+ from '5' to '0' (meaning, filter if 50% is less than 0X coverage).
Why is my assembly is missing my favorite short plasmid?
-------------------------------------
Only the longest 40X of data (based on the specified genome size) is used for
- correction. Datasets with uneven coverage or small plasmids can fail to generate enough
- corrected reads to give enough coverage for assembly, resulting in gaps in the genome or zero
- reads for small plasmids. Set ``corOutCoverage=1000`` (any value greater than your total input
+ correction. Datasets with uneven coverage or small plasmids can fail to generate enough
+ corrected reads to give enough coverage for assembly, resulting in gaps in the genome or even no
+ reads for small plasmids. Set ``corOutCoverage=1000`` (or any value greater than your total input
coverage) to correct all input data.
- This option is also recommended for metagenomic datasets where all data is useful for assembly.
+ An alternate approach is to correct all reads (``-correct corOutCoverage=1000``) then assemble
+ 40X of reads picked at random from the ``<prefix>.correctedReads.fasta.gz`` output.
Why do I get less corrected read data than I asked for?
@@ -264,3 +249,13 @@ How can I send data to you?
-------------------------------------
FTP to ftp://ftp.cbcb.umd.edu/incoming/sergek. This is a write-only location that only the Canu
developers can see.
+
+ Here is a quick walk-through using a command-line ftp client (should be available on most Linux and OSX installations). Say we want to transfer a file named ``reads.fastq``. First, run ``ftp ftp.cbcb.umd.edu``, specify ``anonymous`` as the user name and hit return for password (blank). Then:
+
+ .. code-block::
+
+ cd incoming/sergek
+ put reads.fastq
+ quit
+
+ That's it, you won't be able to see the file but we can download it.
diff --git a/documentation/source/index.rst b/documentation/source/index.rst
index 37490b2..1fa66f7 100644
--- a/documentation/source/index.rst
+++ b/documentation/source/index.rst
@@ -47,7 +47,7 @@ the PacBio RSII or Oxford Nanopore MinION).
Publication
===========
-Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. `Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation <http://biorxiv.org/content/early/2016/08/24/071282>`_. bioRxiv. (2016).
+Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. `Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation <http://doi.org/10.1101/gr.215087.116>`_. Genome Research. (2017).
Install
=========
diff --git a/documentation/source/parameter-reference.rst b/documentation/source/parameter-reference.rst
index 8a0e20c..bb6443a 100644
--- a/documentation/source/parameter-reference.rst
+++ b/documentation/source/parameter-reference.rst
@@ -681,30 +681,15 @@ corFilter <string="expensive">
Output Filtering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-By default, canu will split the final output into three files:
-
-asm.contigs.fasta
- Everything which could be assembled and is part of the primary assembly, including both unique and repetitive elements. Each contig has several flags included on the fasta def line::
-
-asm.bubbles.fasta
- alternate paths in the graph which could not be merged into the primary assembly.
-
-asm.unassembled.fasta
- reads/tigs which could not be incorporated into the primary or bubble assemblies.
-
-It is possible for tigs comprised of multiple reads to end up in asm.unassembled.fasta. The default filtering eliminates anything with < 2 reads, shorter than 1000bp, or comprised of mostly a single sequence (>75%). The filtering is controlled by the contigFilter parameter which takes 5 values.
-
-::
-
- contigFilter
- minReads
- minLength
- singleReadSpan
- lowCovSpan
- lowCovDepth
-
-The default filtering is "2 1000 0.75 0.75 2". If you are assembling amplified data or viral data, it is possible your assembly will be flagged as unassembled. In those cases, you can turn off the filtering with the parameters
-
-::
-
- contigFilter="2 1000 1.0 1.0 2"
+.. _contigFilter:
+
+contigFilter <minReads, integer=2> <minLength, integer=0> <singleReadSpan, float=1.0> <lowCovSpan, float=0.5> <lowCovDepth, integer=5>
+ Remove spurious assemblies from consideration. Any contig that meets any of the following
+ conditions is flagged as 'unassembled' and removed from further consideration:
+ - fewer than minReads reads
+ - shorter than minLength bases
+ - a single read covers more than singleReadSpan fraction of the contig
+ - more than lowCovSpan fraction of the contig is at coverage below lowCovDepth
+ This filtering is done immediately after initial contigs are formed, before repeat detection.
+ Initial contigs that span a repeat can be split into multiple contigs; none of these
+ new contigs will be 'unassembled', even if they are a single read.
diff --git a/documentation/source/quick-start.rst b/documentation/source/quick-start.rst
index 4819e93..561002a 100644
--- a/documentation/source/quick-start.rst
+++ b/documentation/source/quick-start.rst
@@ -4,249 +4,164 @@
Canu Quick Start
================
-Canu specializes in assembling PacBio or Oxford Nanopore sequences. Canu will correct the reads,
-trim suspicious regions (such as remaining SMRTbell adapter), and then assemble the corrected and
-cleaned reads into contigs and unitigs.
+Canu specializes in assembling PacBio or Oxford Nanopore sequences. Canu operates in three phases:
+correction, trimming and assembly. The correction phase will improve the accuracy of bases in
+reads. The trimming phase will trim reads to the portion that appears to be high-quality sequence,
+removing suspicious regions such as remaining SMRTbell adapter. The assembly phase will order the
+reads into contigs, generate consensus sequences and create graphs of alternate paths.
-For eukaryotic genomes, coverage more than 20x is enough to outperform current hybrid methods.
-Between 30x and 60x coverage is the recommended minimum. More coverage will let Canu use longer
-reads for assembly, which will result in better assemblies.
+For eukaryotic genomes, coverage more than 20x is enough to outperform current hybrid methods,
+however, between 30x and 60x coverage is the recommended minimum. More coverage will let Canu use
+longer reads for assembly, which will result in better assemblies.
Input sequences can be FASTA or FASTQ format, uncompressed or compressed with gzip (.gz), bzip2
-(.bz2) or xz (.xz). Zip files (.zip) are not supported.
+(.bz2) or xz (.xz). Note that zip files (.zip) are not supported.
-Canu will auto-detect your resources and scale itself to fit, using all of the resources available
-(depending on the size of your assembly). You can limit memory and processors used with parameters
-:ref:`maxMemory` and :ref:`maxThreads`.
+Canu can resume incomplete assemblies, allowing for recovery from system outages or other abnormal
+terminations.
-Canu will take full advantage of any LSF/PBS/PBSPro/Torque/Slrum/SGE grid available, and do so
-automagically, even submitting itself for execution. For details, refer to the section on
-:ref:`execution`.
+Canu will auto-detect computational resources and scale itself to fit, using all of the resources
+that are available and reasonable for the size of your assembly. Memory and processors can be
+explicitly limited with parameters :ref:`maxMemory` and :ref:`maxThreads`. See section
+:ref:`execution` for more details.
+Canu will automatically take full advantage of any LSF/PBS/PBSPro/Torque/Slurm/SGE grid available,
+even submitting itself for execution. Canu makes heavy use of array jobs and requires job
+submission from compute nodes, which are sometimes not available or allowed. Canu option
+``useGrid=false`` will restrict Canu to using only the current machine, while option
+``useGrid=remote`` will configure Canu for grid execution but not submit jobs to the grid.
+See section :ref:`execution` for more details.
-Assembling PacBio data
+The :ref:`tutorial` has more background, and the :ref:`faq` has a wealth of practical advice.
+
+
+Assembling PacBio or Nanopore data
----------------------------------
-Pacific Biosciences released P6-C4 chemistry reads for Escherichia coli K12. You can download them
-`here <https://github.com/PacificBiosciences/DevNet/wiki/E.-coli-Bacterial-Assembly>`_, but note that you must have the `SMRTpipe software <http://www.pacb.com/support/software-downloads/>`_ installed to extract the reads as FASTQ.
+Pacific Biosciences released P6-C4 chemistry reads for Escherichia coli K12. You can `download them
+from their original release
+<https://github.com/PacificBiosciences/DevNet/wiki/E.-coli-Bacterial-Assembly>`_, but note that you
+must have the `SMRTpipe software <http://www.pacb.com/support/software-downloads/>`_ installed to
+extract the reads as FASTQ. Instead, use a `FASTQ format 25X subset
+<http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p6_25x.filtered.fastq>`_ (223MB). Download from the command line
+with::
-We made a 25X subset FASTQ available `here <http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p6_25x.filtered.fastq>`_ (223MB), which can be downloaded with:
+ curl -L -o pacbio.fastq http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p6_25x.filtered.fastq
-::
+There doesn't appear to be any "official" Oxford Nanopore sample data, but the `Loman Lab
+<http://lab.loman.net/>`_ released a `set of runs
+<http://lab.loman.net/2015/09/24/first-sqk-map-006-experiment/>`_, also for Escherichia coli K12.
+This is early data, from September 2015. Any of the four runs will work; we picked `MAP-006-1
+<http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta>`_ (243 MB). Download from the command
+line with::
- curl -L -o p6.25x.fastq http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p6_25x.filtered.fastq
-
-Correct, Trim and Assemble
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ curl -L -o oxford.fasta http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta
-By default, canu will correct the reads, then trim the reads, then assemble the reads to unitigs.
+By default, Canu will correct the reads, then trim the reads, then assemble the reads to unitigs.
+Canu needs to know the approximate genome size (so it can determine coverage in the input reads)
+and the technology used to generate the reads.
-::
+For PacBio::
canu \
- -p ecoli -d ecoli-auto \
+ -p ecoli -d ecoli-pacbio \
genomeSize=4.8m \
- -pacbio-raw p6.25x.fastq
-
-This will use the prefix 'ecoli' to name files, compute the correction task in directory 'ecoli-auto/correction', the trimming task in directory 'ecoli-auto/trimming', and the unitig construction stage in 'ecoli-auto' itself.
-Output files are described in the next section.
-
-Find the Output
-~~~~~~~~~~~~~~~~~~~~~~
-
-The canu progress chatter records statistics such as an input read histogram, corrected read histogram, and overlap types. Outputs from the assembly tasks are in:
+ -pacbio-raw pacbio.fastq
-ecoli*/ecoli.correctedReads.fasta.gz
- The sequences after correction, trimmed and split based on consensus evidence. Typically >99% for PacBio and >98% for Nanopore but it can vary based on your input sequencing quality.
+For Nanopore::
-ecoli*/ecoli.trimmedReads.fasta.gz
- The sequences after correction and final trimming. The corrected sequences above are overlapped again to identify any missed hairpin adapters or bad sequence that could not be detected in the raw sequences.
-
-ecoli*/ecoli.layout
- The layout provides information on where each read ended up in the final assembly, including contig and positions. It also includes the consensus sequence for each contig.
-
-ecoli*/ecoli.gfa
- The `GFA <http://lh3.github.io/2014/07/19/a-proposal-of-the-grapical-fragment-assembly-format/>`_ is the assembly graph generated by Canu. Currently this includes the contigs, associated bubbles, and any overlaps which were not used by the assembly.
-
-The fasta output is split into three types:
-
-ecoli*/asm.contigs.fasta
- Everything which could be assembled and is part of the primary assembly, including both unique and repetitive elements. Each contig has several flags included on the fasta def line::
-
- >tig######## len=<integer> reads=<integer> covStat=<float> gappedBases=<yes|no> class=<contig|bubble|unassm> suggestRepeat=<yes|no> suggestCircular=<yes|no>
-
- len
- Length of the sequence, in bp.
-
- reads
- Number of reads used to form the contig.
-
- covStat
- The log of the ratio of the contig being unique versus being two-copy, based on the read arrival rate. Positive values indicate more likely to be unique, while negative values indicate more likely to be repetitive. See `Footnote 24 <http://science.sciencemag.org/content/287/5461/2196.full#ref-24>`_ in `Myers et al., A Whole-Genome Assembly of Drosophila <http://science.sciencemag.org/content/287/5461/2196.full>`_.
+ canu \
+ -p ecoli -d ecoli-oxford \
+ genomeSize=4.8m \
+ -nanopore-raw oxford.fasta
- gappedBases
- If yes, the sequence includes all gaps in the multialignment.
- class
- Type of sequence. Unassembled sequences are primarily low-coverage sequences spanned by a single read.
+Output and intermediate files will be in directories 'ecoli-pacbio' and 'ecoli-nanopore',
+respectively. Intermediate files are written in directories 'correction', 'trimming' and
+'unitigging' for the respective stages. Output files are named using the '-p' prefix, such as
+'ecoli.contigs.fasta', 'ecoli.contigs.gfa', etc. See section :ref:`outputs` for more details on
+outputs (intermediate files aren't documented).
- suggestRepeat
- If yes, sequence was detected as a repeat based on graph topology or read overlaps to other sequences.
- suggestCircular
- If yes, sequence is likely circular. Not implemented.
+Assembling With Multiple Technologies and Multiple Files
+--------------------------------------------------------
-ecoli*/asm.bubbles.fasta
- alternate paths in the graph which could not be merged into the primary assembly.
+Canu can use reads from any number of input files, which can be a mix of formats and technologies.
+We'll assemble a mix of 10X PacBio reads in two FASTQ files and 10X of Nanopore reads in one FASTA
+file::
-ecoli*/asm.unassembled.fasta
- reads which could not be incorporated into the primary or bubble assemblies.
+ curl -L -o mix.tar.gz http://gembox.cbcb.umd.edu/mhap/raw/ecoliP6Oxford.tar.gz
+ tar xvzf mix.tar.gz
+
+ canu \
+ -p ecoli -d ecoli-mix \
+ genomeSize=4.8m \
+ -pacbio-raw pacbio.part?.fastq.gz \
+ -nanopore-raw oxford.fasta.gz
Correct, Trim and Assemble, Manually
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Sometimes, however, it makes sense to do the three top-level tasks by hand. This would allow trying
-multiple unitig construction parameters on the same set of corrected and trimmed reads.
+multiple unitig construction parameters on the same set of corrected and trimmed reads, or skipping
+trimming and assembly if you only want corrected reads.
-First, correct the raw reads::
+We'll use the PacBio reads from above. First, correct the raw reads::
canu -correct \
-p ecoli -d ecoli \
genomeSize=4.8m \
- -pacbio-raw p6.25x.fastq
+ -pacbio-raw pacbio.fastq
Then, trim the output of the correction::
canu -trim \
-p ecoli -d ecoli \
genomeSize=4.8m \
- -pacbio-corrected ecoli/correction/ecoli.correctedReads.fasta.gz
+ -pacbio-corrected ecoli/ecoli.correctedReads.fasta.gz
-And finally, assemble the output of trimming, twice::
+And finally, assemble the output of trimming, twice, with different stringency on which overlaps to
+use (see :ref:`correctedErrorRate <correctedErrorRate>`)::
canu -assemble \
- -p ecoli -d ecoli-erate-0.013 \
+ -p ecoli -d ecoli-erate-0.039 \
genomeSize=4.8m \
correctedErrorRate=0.039 \
- -pacbio-corrected ecoli/trimming/ecoli.trimmedReads.fasta.gz
+ -pacbio-corrected ecoli/ecoli.trimmedReads.fasta.gz
canu -assemble \
- -p ecoli -d ecoli-erate-0.025 \
+ -p ecoli -d ecoli-erate-0.075 \
genomeSize=4.8m \
correctedErrorRate=0.075 \
- -pacbio-corrected ecoli/trimming/ecoli.trimmedReads.fasta.gz
+ -pacbio-corrected ecoli/ecoli.trimmedReads.fasta.gz
-The directory layout for correction and trimming is exactly the same as when we ran all tasks in the same command.
-Each unitig construction task needs its own private work space, and in there the 'correction' and 'trimming' directories are empty. The error rate always specifies the error in the corrected reads which is typically <1% for PacBio data and <2% for Nanopore data (<1% on newest chemistries).
+Note that the assembly stages use different '-d' directories. It is not possible to run multiple
+copies of canu with the same work directory.
-Assembling Oxford Nanopore data
---------------------------------
-A set of E. coli runs were released by the Loman lab. You can download one
-`directly <http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta>`_
-or any of them from the
-`original page <http://lab.loman.net/2015/09/24/first-sqk-map-006-experiment/>`_.
-
-or use the following curl command:
-
-::
-
- curl -L -o oxford.fasta http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta
-
-Canu assembles any of the four available datasets into a single contig but we picked one dataset to use in this tutorial. Then, assemble the data as before::
-
- canu \
- -p ecoli -d ecoli-oxford \
- genomeSize=4.8m \
- -nanopore-raw oxford.fasta
-
-The assembled identity is >99% before polishing.
-
-Assembling With Multiple Technologies/Files
--------------------------------------------
-
-Canu takes an arbitrary number of input files/formats. We made a mixed dataset of about 10X of a PacBio P6 and 10X of an Oxford Nanopore run available `here <http://gembox.cbcb.umd.edu/mhap/raw/ecoliP6Oxford.tar.gz>`_
-
-or use the following curl command:
-
-::
-
- curl -L -o mix.tar.gz http://gembox.cbcb.umd.edu/mhap/raw/ecoliP6Oxford.tar.gz
- tar xvzf mix.tar.gz
-
-Now you can assemble all the data::
-
- canu \
- -p ecoli -d ecoli-mix \
- genomeSize=4.8m \
- -pacbio-raw pacbio*fastq.gz \
- -nanopore-raw oxford.fasta.gz
-
-.. _quick_low:
Assembling Low Coverage Datasets
----------------------------------
-When you have 30X or less coverage, it helps to adjust the Canu assembly parameters. Typically, assembly 20X of single-molecule data outperforms hybrid methods with higher coverage. You can download a 20X subset of `S. cerevisae <http://gembox.cbcb.umd.edu/mhap/raw/yeast_filtered.20x.fastq.gz>`_
-
-or use the following curl command:
-::
+We claimed Canu works down to 20X coverage, and we will now assemble `a 20X subset of S. cerevisiae
+<http://gembox.cbcb.umd.edu/mhap/raw/yeast_filtered.20x.fastq.gz>`_ (215 MB). When assembling, we
+adjust :ref:`correctedErrorRate <correctedErrorRate>` to accommodate the slightly lower
+quality corrected reads::
curl -L -o yeast.20x.fastq.gz http://gembox.cbcb.umd.edu/mhap/raw/yeast_filtered.20x.fastq.gz
-and run the assembler adding sensitive parameters (**correctedErrorRate=0.105**)::
-
canu \
-p asm -d yeast \
genomeSize=12.1m \
- correctedErrorRate=0.105 \
+ correctedErrorRate=0.075 \
-pacbio-raw yeast.20x.fastq.gz
-
-
-After the run completes, we can check the assembly statistics::
-
- tgStoreDump -sizes -s 12100000 -T yeast/unitigging/asm.ctgStore 2 -G yeast/unitigging/asm.gkpStore
-
-::
-
- lenSuggestRepeat sum 160297 (genomeSize 12100000)
- lenSuggestRepeat num 12
- lenSuggestRepeat ave 13358
- lenUnassembled ng10 13491 bp lg10 77 sum 1214310 bp
- lenUnassembled ng20 11230 bp lg20 176 sum 2424556 bp
- lenUnassembled ng30 9960 bp lg30 290 sum 3632411 bp
- lenUnassembled ng40 8986 bp lg40 418 sum 4841978 bp
- lenUnassembled ng50 8018 bp lg50 561 sum 6054460 bp
- lenUnassembled ng60 7040 bp lg60 723 sum 7266816 bp
- lenUnassembled ng70 6169 bp lg70 906 sum 8474192 bp
- lenUnassembled ng80 5479 bp lg80 1114 sum 9684981 bp
- lenUnassembled ng90 4787 bp lg90 1348 sum 10890099 bp
- lenUnassembled ng100 4043 bp lg100 1624 sum 12103239 bp
- lenUnassembled ng110 3323 bp lg110 1952 sum 13310167 bp
- lenUnassembled ng120 2499 bp lg120 2370 sum 14520362 bp
- lenUnassembled ng130 1435 bp lg130 2997 sum 15731198 bp
- lenUnassembled sum 16139888 (genomeSize 12100000)
- lenUnassembled num 3332
- lenUnassembled ave 4843
- lenContig ng10 770772 bp lg10 2 sum 1566457 bp
- lenContig ng20 710140 bp lg20 4 sum 3000257 bp
- lenContig ng30 669248 bp lg30 5 sum 3669505 bp
- lenContig ng40 604859 bp lg40 7 sum 4884914 bp
- lenContig ng50 552911 bp lg50 10 sum 6571204 bp
- lenContig ng60 390415 bp lg60 12 sum 7407061 bp
- lenContig ng70 236725 bp lg70 16 sum 8521520 bp
- lenContig ng80 142854 bp lg80 23 sum 9768299 bp
- lenContig ng90 94308 bp lg90 33 sum 10927790 bp
- lenContig sum 12059140 (genomeSize 12100000)
- lenContig num 56
- lenContig ave 215341
Consensus Accuracy
-------------------
-While Canu corrects sequences and has 99% identity or greater with PacBio or Nanopore sequences, for the best accuracy we recommend polishing with a sequence-specific tool. We recommend `Quiver <http://github.com/PacificBiosciences/GenomicConsensus>`_ for PacBio and `Nanopolish <http://github.com/jts/nanopolish>`_ for Oxford Nanpore data.
-
-If you have Illumina sequences available, `Pilon <http://www.broadinstitute.org/software/pilon/>`_ can also be used to polish either PacBio or Oxford Nanopore assemblies.
-Futher Reading
--------------------
-See the `FAQ <faq.html>`_ page for commonly-asked questions and the `release <http://github.com/marbl/canu/releases>`_. notes page for information on what's changed and known issues.
+Canu consensus sequences are typically well above 99% identity. Accuracy can be improved by
+polishing the contigs with tools developed specifically for that task. We recommend `Quiver
+<http://github.com/PacificBiosciences/GenomicConsensus>`_ for PacBio and `Nanopolish
+<http://github.com/jts/nanopolish>`_ for Oxford Nanopore data.
+When Illumina reads are available, `Pilon <http://www.broadinstitute.org/software/pilon/>`_
+can be used to polish either PacBio or Oxford Nanopore assemblies.
diff --git a/documentation/source/tutorial.rst b/documentation/source/tutorial.rst
index 2e5ac73..faf6f47 100644
--- a/documentation/source/tutorial.rst
+++ b/documentation/source/tutorial.rst
@@ -9,7 +9,7 @@ Canu Tutorial
Canu assembles reads from PacBio RS II or Oxford Nanopore MinION instruments into
uniquely-assemblable contigs, unitigs. Canu owes lots of it design and code to
-`celera-assembler`_.
+`celera-assembler <Celera Assembler>`_.
Canu can be run using hardware of nearly any shape or size, anywhere from laptops to computational
grids with thousands of nodes. Obviouisly, larger assemblies will take a long time to compute on
@@ -151,7 +151,7 @@ The tags are:
|utgmhap | the mhap overlapper, as used in the assembly phase |
+--------+-------------------------------------------------------------------+
+--------+-------------------------------------------------------------------+
-|mmap | the `minimap <https://github.com/lh3/minimap>`_ overlapper |
+|mmap | the `minimap <https://github.com/lh3/minimap>`_ overlapper |
+--------+-------------------------------------------------------------------+
|cormmap | the minimap overlapper, as used in the correction phase |
+--------+-------------------------------------------------------------------+
@@ -214,7 +214,6 @@ would be waiting for jobs named 'ovl_asm_orange'.
Error Rates
~~~~~~~~~~~~~~~~~~~~~~
-
Canu expects all error rates to be reported as fraction error, not as percent error. We're not sure
exactly why this is so. Previously, it used a mix of fraction error and percent error (or both!),
and was a little confusing. Here's a handy table you can print out that converts between fraction
@@ -261,7 +260,7 @@ tells the consensus algorithm to not trust read alignments above that value.
For convenience, two meta options set the error rates used with uncorrected reads
(:ref:`rawErrorRate <rawErrorRate>`) or used with corrected reads. (:ref:`correctedErrorRate
-<correctedErrorRate>`). The default depends on the type or read being assembled.
+<correctedErrorRate>`). The default depends on the type of read being assembled.
================== ====== ========
Parameter PacBio Nanopore
@@ -270,10 +269,8 @@ rawErrorRate 0.300 0.500
correctedErrorRate 0.045 0.144
================== ====== ========
-In practice, only the :ref:`correctedErrorRate <correctedErrorRate>` is usually changed.
- * For low coverage datasets (less than 30X), we recommend increasing :ref:`correctedErrorRate <correctedErrorRate>` slightly, by 1% or so.
- * For high-coverage datasets (more than 60X), we recommend decreasing :ref:`correctedErrorRate <correctedErrorRate>` slightly, by 1% or so.
-Raising the :ref:`correctedErrorRate <correctedErrorRate>` will increase run time. Likewise, decreasing :ref:`correctedErrorRate <correctedErrorRate>` will decrease run time, at the risk of missing overlaps and fracturing the assembly.
+In practice, only :ref:`correctedErrorRate <correctedErrorRate>` is usually changed. The :ref:`faq`
+has :ref:`specific suggestions <tweak>` on when to change this.
Canu v1.4 and earlier used the :ref:`errorRate <errorRate>` parameter, which set the expected
rate of error in a single corrected read.
@@ -394,3 +391,93 @@ Minimap Overlapper Parameters
Use k-mers of this size for detecting overlaps
Minimap also will ignore high-frequency minimzers, but it's selection of frequent is not exposed.
+
+.. _outputs:
+
+Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As Canu runs, it outputs status messages, execution logs, and some analysis to the console. Most of
+the analysis is captured in ``<prefix>.report`` as well.
+
+LOGGING
+
+<prefix>.report
+ Most of the analysis reported during assembly.
+
+READS
+
+<prefix>.correctedReads.fasta.gz
+ The reads after correction.
+
+<prefix>.trimmedReads.fasta.gz
+ The corrected reads after overlap based trimming.
+
+SEQUENCE
+
+<prefix>.contigs.fasta
+ Everything which could be assembled and is part of the primary assembly, including both unique
+ and repetitive elements.
+
+<prefix>.unitigs.fasta
+ Contigs, split at alternate paths in the graph.
+
+<prefix>.unassembled.fasta
+ Reads and low-coverage contigs which could not be incorporated into the primary assembly.
+
+The header line for each sequence provides some metadata on the sequence::
+
+ >tig######## len=<integer> reads=<integer> covStat=<float> gappedBases=<yes|no> class=<contig|bubble|unassm> suggestRepeat=<yes|no> suggestCircular=<yes|no>
+
+ len
+ Length of the sequence, in bp.
+
+ reads
+ Number of reads used to form the contig.
+
+ covStat
+ The log of the ratio of the contig being unique versus being two-copy, based on the read arrival rate. Positive values indicate more likely to be unique, while negative values indicate more likely to be repetitive. See `Footnote 24 <http://science.sciencemag.org/content/287/5461/2196.full#ref-24>`_ in `Myers et al., A Whole-Genome Assembly of Drosophila <http://science.sciencemag.org/content/287/5461/2196.full>`_.
+
+ gappedBases
+ If yes, the sequence includes all gaps in the multialignment.
+
+ class
+ Type of sequence. Unassembled sequences are primarily low-coverage sequences spanned by a single read.
+
+ suggestRepeat
+ If yes, sequence was detected as a repeat based on graph topology or read overlaps to other sequences.
+
+ suggestCircular
+ If yes, sequence is likely circular. Not implemented.
+
+GRAPHS
+
+<prefix>.contigs.gfa
+ Unused or ambiguous edges between contig sequences. Bubble edges cannot be represented in this format.
+
+<prefix>.unitigs.gfa
+ Contigs split at bubble intersections.
+
+<prefix>.unitigs.bed
+ The position of each unitig in a contig.
+
+METADATA
+
+The layout provides information on where each read ended up in the final assembly, including
+contig and positions. It also includes the consensus sequence for each contig.
+
+<prefix>.contigs.layout, <prefix>.unitigs.layout
+ (undocumented)
+
+<prefix>.contigs.layout.readToTig, <prefix>.unitigs.layout.readToTig
+ The position of each read in a contig (unitig).
+
+<prefix>.contigs.layout.tigInfo, <prefix>.unitigs.layout.tigInfo
+ A list of the contigs (unitigs), lengths, coverage, number of reads and other metadata.
+ Essentially the same information provided in the FASTA header line.
+
+
+
+
+
+
diff --git a/src/AS_UTL/AS_UTL_fileIO.C b/src/AS_UTL/AS_UTL_fileIO.C
index 3c3ad3c..e2f6a3c 100644
--- a/src/AS_UTL/AS_UTL_fileIO.C
+++ b/src/AS_UTL/AS_UTL_fileIO.C
@@ -216,18 +216,18 @@ bool
AS_UTL_readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) {
if ((L == NULL) || (Lmax == 0))
- allocateArray(L, Lmax = 4, resizeArray_clearNew);
+ allocateArray(L, Lmax = 1024, resizeArray_clearNew);
Llen = 0;
int32 ch = getc(F);
- uint32 growth = 4;
+ uint32 growth = 1024;
if (feof(F))
return(false);
while ((feof(F) == false) && (ch != '\n')) {
- if (Llen >= Lmax)
+ if (Llen + 1 >= Lmax)
resizeArray(L, Llen, Lmax, Lmax + growth, resizeArray_copyData | resizeArray_clearNew); // Grow the array.
L[Llen++] = ch;
@@ -272,6 +272,21 @@ AS_UTL_mkdir(const char *dirname) {
+// Remove a directory, or do nothing if the file doesn't exist.
+void
+AS_UTL_rmdir(const char *dirname) {
+
+ if (AS_UTL_fileExists(dirname, FALSE, FALSE) == false)
+ return;
+
+ errno = 0;
+ rmdir(dirname);
+ if (errno)
+ fprintf(stderr, "AS_UTL_rmdir()-- Failed to remove directory '%s': %s\n", dirname, strerror(errno)), exit(1);
+}
+
+
+
void
AS_UTL_symlink(const char *pathToFile, const char *pathToLink) {
@@ -386,7 +401,7 @@ AS_UTL_sizeOfFile(const char *path) {
if (strcasecmp(path+strlen(path)-3, ".gz") == 0) {
char cmd[FILENAME_MAX], *p = cmd;
- snprintf(cmd, FILENAME_MAX, "gzip -l %s", path);
+ snprintf(cmd, FILENAME_MAX, "gzip -l '%s'", path);
FILE *F = popen(cmd, "r");
fgets(cmd, FILENAME_MAX, F); // compressed uncompressed ratio uncompressed_name
@@ -544,19 +559,19 @@ compressedFileReader::compressedFileReader(const char *filename) {
switch (ft) {
case cftGZ:
- snprintf(cmd, FILENAME_MAX, "gzip -dc %s", filename);
+ snprintf(cmd, FILENAME_MAX, "gzip -dc '%s'", filename);
_file = popen(cmd, "r");
_pipe = true;
break;
case cftBZ2:
- snprintf(cmd, FILENAME_MAX, "bzip2 -dc %s", filename);
+ snprintf(cmd, FILENAME_MAX, "bzip2 -dc '%s'", filename);
_file = popen(cmd, "r");
_pipe = true;
break;
case cftXZ:
- snprintf(cmd, FILENAME_MAX, "xz -dc %s", filename);
+ snprintf(cmd, FILENAME_MAX, "xz -dc '%s'", filename);
_file = popen(cmd, "r");
_pipe = true;
@@ -612,19 +627,19 @@ compressedFileWriter::compressedFileWriter(const char *filename, int32 level) {
switch (ft) {
case cftGZ:
- snprintf(cmd, FILENAME_MAX, "gzip -%dc > %s", level, filename);
+ snprintf(cmd, FILENAME_MAX, "gzip -%dc > '%s'", level, filename);
_file = popen(cmd, "w");
_pipe = true;
break;
case cftBZ2:
- snprintf(cmd, FILENAME_MAX, "bzip2 -%dc > %s", level, filename);
+ snprintf(cmd, FILENAME_MAX, "bzip2 -%dc > '%s'", level, filename);
_file = popen(cmd, "w");
_pipe = true;
break;
case cftXZ:
- snprintf(cmd, FILENAME_MAX, "xz -%dc > %s", level, filename);
+ snprintf(cmd, FILENAME_MAX, "xz -%dc > '%s'", level, filename);
_file = popen(cmd, "w");
_pipe = true;
break;
diff --git a/src/AS_UTL/AS_UTL_fileIO.H b/src/AS_UTL/AS_UTL_fileIO.H
index ac91c13..6aa6bdc 100644
--- a/src/AS_UTL/AS_UTL_fileIO.H
+++ b/src/AS_UTL/AS_UTL_fileIO.H
@@ -61,6 +61,7 @@ size_t AS_UTL_safeRead (FILE *file, void *buffer, const char *desc, size_
bool AS_UTL_readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F);
void AS_UTL_mkdir(const char *dirname);
+void AS_UTL_rmdir(const char *dirname);
void AS_UTL_symlink(const char *pathToFile, const char *pathToLink);
diff --git a/src/AS_UTL/AS_UTL_reverseComplement.C b/src/AS_UTL/AS_UTL_reverseComplement.C
index ee9722c..93eecb2 100644
--- a/src/AS_UTL/AS_UTL_reverseComplement.C
+++ b/src/AS_UTL/AS_UTL_reverseComplement.C
@@ -33,27 +33,44 @@
#include "AS_global.H"
-static char inv[256] = {0};
-
static
-void
-initRC(void) {
- if (inv['a'] == 't')
- return;
+char
+inv[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x08 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x18 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - !"#$%&'
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x28 - ()*+,-./
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 01234567
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x38 - 89:;<=>?
+ 0,'T', 0,'G', 0, 0, 0,'C', // 0x40 - @ABCDEFG
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x48 - HIJKLMNO
+ 0, 0, 0, 0,'A', 0, 0, 0, // 0x50 - PQRSTUVW
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x58 - XYZ[\]^_
+ 0,'t', 0,'g', 0, 0, 0,'c', // 0x60 - `abcdefg
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x68 - hijklmno
+ 0, 0, 0, 0,'a', 0, 0, 0, // 0x70 - pqrstuvw
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x78 - xyz{|}~
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x88 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0x98 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xa8 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xb8 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xc8 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xd8 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xe8 -
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 -
+ 0, 0, 0, 0, 0, 0, 0, 0 // 0xf8 -
+};
- inv['a'] = 't';
- inv['c'] = 'g';
- inv['g'] = 'c';
- inv['t'] = 'a';
- inv['n'] = 'n';
- inv['A'] = 'T';
- inv['C'] = 'G';
- inv['G'] = 'C';
- inv['T'] = 'A';
- inv['N'] = 'N';
- inv['-'] = '-';
-}
void
@@ -61,8 +78,6 @@ reverseComplementSequence(char *seq, int len) {
char c=0;
char *s=seq, *S=seq+len-1;
- initRC();
-
if (len == 0) {
len = strlen(seq);
S = seq + len - 1;
@@ -78,30 +93,23 @@ reverseComplementSequence(char *seq, int len) {
*s = inv[*s];
}
-// Inplace reverse-complement an ACGT sequence. A pointer the the
-// string is returned.
-//
-#if 0
-// From kmer
+
+
char *
-reverseComplementSequence(char *seq, uint32 seqlen) {
- char *s = seq;
- char *e = seq + seqlen - 1;
- char t;
- uint32 c = seqlen / 2;
-
- while (c--) {
- t = complementSymbol[*s];
- *(s++) = complementSymbol[*e];
- *(e--) = t;
- }
+reverseComplementCopy(char *seq, int len) {
+ char *rev = new char [len+1];
- if (s == e)
- *s = complementSymbol[*s];
+ assert(len > 0);
- return(seq);
+ for (int32 p=len, q=0; p>0; )
+ rev[q++] = inv[seq[--p]];
+
+ rev[len] = 0;
+
+ return(rev);
}
-#endif
+
+
void
reverseComplement(char *seq, char *qlt, int len) {
@@ -114,8 +122,6 @@ reverseComplement(char *seq, char *qlt, int len) {
return;
}
- initRC();
-
if (len == 0) {
len = strlen(seq);
S = seq + len - 1;
@@ -136,42 +142,3 @@ reverseComplement(char *seq, char *qlt, int len) {
*s = inv[*s];
}
-
-void
-reverse(char *a, char *b, int len) {
- char c=0;
- char *s=a, *S=a+len-1;
- char *q=b, *Q=b+len-1;
-
- while (s < S) {
- c = *s;
- *s++ = *S;
- *S-- = c;
-
- c = *q;
- *q++ = *Q;
- *Q-- = c;
- }
-}
-
-
-// Inplace reverse a string. A pointer the the string is returned.
-//
-#if 0
-// From kmer
-char *
-reverseString(char *seq, uint32 seqlen) {
- char *s = seq;
- char *e = seq + seqlen - 1;
- char t;
- uint32 c = seqlen / 2;
-
- while (c--) {
- t = *s;
- *(s++) = *e;
- *(e--) = t;
- }
-
- return(seq);
-}
-#endif
diff --git a/src/AS_UTL/AS_UTL_reverseComplement.H b/src/AS_UTL/AS_UTL_reverseComplement.H
index 5c9ce32..bd89bbf 100644
--- a/src/AS_UTL/AS_UTL_reverseComplement.H
+++ b/src/AS_UTL/AS_UTL_reverseComplement.H
@@ -32,8 +32,8 @@
#include "AS_global.H"
-void reverseComplementSequence(char *seq, int len);
-void reverseComplement(char *seq, char *qlt, int len);
-void reverse(char *a, char *b, int len);
+void reverseComplementSequence(char *seq, int len);
+char *reverseComplementCopy(char *seq, int len);
+void reverseComplement(char *seq, char *qlt, int len);
#endif
diff --git a/src/AS_UTL/bitPackedFile.C b/src/AS_UTL/bitPackedFile.C
index 7cf9c6f..9f3298e 100644
--- a/src/AS_UTL/bitPackedFile.C
+++ b/src/AS_UTL/bitPackedFile.C
@@ -154,9 +154,16 @@ bitPackedFile::bitPackedFile(char const *name, uint64 offset, bool forceTruncate
nr += read(_file, &ac, sizeof(uint64));
nr += read(_file, &bc, sizeof(uint64));
- if (nr == 0) {
- // Empty file! Write the magic number and our endianess check.
+ // Errors?
+ if (errno)
+ fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to read the header: %s\n", _name, strerror(errno)), exit(1);
+ // Empty file, but expecting data!
+ if ((nr == 0) && (_isReadOnly))
+ fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to read the header: empty file\n", _name), exit(1);
+
+ // Empty file! Write the magic number and our endianess check.
+ if (nr == 0) {
errno = 0;
write(_file, t, sizeof(char) * 16);
write(_file, &at, sizeof(uint64));
diff --git a/src/AS_UTL/timeAndSize.C b/src/AS_UTL/timeAndSize.C
index 003d226..6ad43ca 100644
--- a/src/AS_UTL/timeAndSize.C
+++ b/src/AS_UTL/timeAndSize.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2017-AUG-10
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -31,8 +35,6 @@
-
-
double
getTime(void) {
struct timeval tp;
@@ -41,16 +43,78 @@ getTime(void) {
}
-uint64
-getProcessSizeCurrent(void) {
- struct rusage ru;
- uint64 sz = 0;
+
+static
+bool
+getrusage(struct rusage &ru) {
errno = 0;
+
if (getrusage(RUSAGE_SELF, &ru) == -1) {
- fprintf(stderr, "getProcessSizeCurrent()-- getrusage(RUSAGE_SELF, ...) failed: %s\n",
+ fprintf(stderr, "getrusage(RUSAGE_SELF, ...) failed: %s\n",
+ strerror(errno));
+ return(false);
+ }
+
+ return(true);
+}
+
+
+
+static
+bool
+getrlimit(struct rlimit &rl) {
+
+ errno = 0;
+
+ if (getrlimit(RLIMIT_DATA, &rl) == -1) {
+ fprintf(stderr, "getrlimit(RLIMIT_DATA, ...) failed: %s\n",
strerror(errno));
- } else {
+ return(false);
+ }
+
+ return(true);
+}
+
+
+
+double
+getCPUTime(void) {
+ struct rusage ru;
+ double tm = 0;
+
+ if (getrusage(ru) == true)
+ tm = ((ru.ru_utime.tv_sec + ru.ru_utime.tv_usec / 1000000.0) +
+ (ru.ru_stime.tv_sec + ru.ru_stime.tv_usec / 1000000.0));
+
+ return(tm);
+}
+
+
+
+double
+getProcessTime(void) {
+ struct timeval tp;
+ static double st = 0.0;
+ double tm = 0;
+
+ if (gettimeofday(&tp, NULL) == 0)
+ tm = tp.tv_sec + tp.tv_usec / 100000.0;
+
+ if (st == 0.0)
+ st = tm;
+
+ return(tm - st);
+}
+
+
+
+uint64
+getProcessSize(void) {
+ struct rusage ru;
+ uint64 sz = 0;
+
+ if (getrusage(ru) == true) {
sz = ru.ru_maxrss;
sz *= 1024;
}
@@ -59,18 +123,14 @@ getProcessSizeCurrent(void) {
}
+
uint64
getProcessSizeLimit(void) {
- struct rlimit rlp;
+ struct rlimit rl;
uint64 sz = ~uint64ZERO;
- errno = 0;
- if (getrlimit(RLIMIT_DATA, &rlp) == -1) {
- fprintf(stderr, "getProcessSizeLimit()-- getrlimit(RLIMIT_DATA, ...) failed: %s\n",
- strerror(errno));
- } else {
- sz = rlp.rlim_cur;
- }
+ if (getrlimit(rl) == true)
+ sz = rl.rlim_cur;
return(sz);
}
diff --git a/src/AS_UTL/timeAndSize.H b/src/AS_UTL/timeAndSize.H
index c0ba5ad..b30b5ee 100644
--- a/src/AS_UTL/timeAndSize.H
+++ b/src/AS_UTL/timeAndSize.H
@@ -19,13 +19,20 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2017-AUG-10
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
#include "AS_global.H"
-double getTime(void);
+double getTime(void);
+
+double getCPUTime(void);
+double getProcessTime(void);
-uint64 getProcessSizeCurrent(void);
+uint64 getProcessSize(void);
uint64 getProcessSizeLimit(void);
diff --git a/src/AS_UTL/writeBuffer.H b/src/AS_UTL/writeBuffer.H
index 7795a81..6e187ab 100644
--- a/src/AS_UTL/writeBuffer.H
+++ b/src/AS_UTL/writeBuffer.H
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Sergey Koren beginning on 2017-MAY-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -62,7 +66,7 @@ public:
flush();
if (_bufferMax < length)
- AS_UTL_safeWrite(_file, _buffer, "writeBuffer", 1, length);
+ AS_UTL_safeWrite(_file, data, "writeBuffer", 1, length);
else {
memcpy(_buffer + _bufferLen, data, length);
_bufferLen += length;
diff --git a/src/AS_global.C b/src/AS_global.C
index 0b9304f..189b98b 100644
--- a/src/AS_global.C
+++ b/src/AS_global.C
@@ -39,6 +39,7 @@
#include "canu_version.H"
#include "AS_UTL_stackTrace.H"
+#include "timeAndSize.H"
#ifdef X86_GCC_LINUX
#include <fpu_control.h>
@@ -105,6 +106,11 @@ AS_configure(int argc, char **argv) {
AS_UTL_installCrashCatcher(argv[0]);
+ // Set the start time.
+
+ getProcessTime();
+
+
//
// Et cetera.
//
diff --git a/src/Makefile b/src/Makefile
index 40ee044..c852599 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -575,6 +575,21 @@ endif
# No backtrace support.
#CXXFLAGS += -DNOBACKTRACE
+# But, if we we have an old GCC, stack tracing support isn't there.
+
+GCC_45 := $(shell expr `${CC} -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' -e 's/^[0-9]\{3,4\}$$/&00/'` \>= 40500)
+GCC_VV := $(shell ${CC} -dumpversion)
+GXX_VV := $(shell ${CXX} -dumpversion)
+
+ifeq (${BUILDSTACKTRACE}, 1)
+ifeq (${GCC_45}, 0)
+$(info WARNING:)
+$(info WARNING: GCC ${GCC_VV} detected, disabling stack trace support. Please upgrade to GCC 4.7 or higher.)
+$(info WARNING:)
+BUILDSTACKTRACE = 0
+endif
+endif
+
ifeq (${BUILDSTACKTRACE}, 1)
CXXFLAGS += -DLIBBACKTRACE
else
@@ -748,6 +763,8 @@ ${TARGET_DIR}/lib/canu/Unitig.pm: pipelines/canu/Unitig.pm
# Makefile processed. Report that we're starting the build.
-${info Building for '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${DESTDIR}${PREFIX}/$(OSTYPE)-$(MACHINETYPE)/{bin,obj}'}
-${info CC ${CC} CXX ${CXX}}
+$(info Building for '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${DESTDIR}${PREFIX}/$(OSTYPE)-$(MACHINETYPE)/{bin,obj}')
+$(info CC ${CC} ${GCC_VV})
+$(info CXX ${CXX} ${GXX_VV})
+
#${info Using LD_RUN_PATH '${LD_RUN_PATH}'}
diff --git a/src/bogart/AS_BAT_AssemblyGraph.C b/src/bogart/AS_BAT_AssemblyGraph.C
index 3d29933..0d68573 100644
--- a/src/bogart/AS_BAT_AssemblyGraph.C
+++ b/src/bogart/AS_BAT_AssemblyGraph.C
@@ -131,6 +131,9 @@ AssemblyGraph::buildGraph(const char *UNUSED(prefix),
if (fiTigID == 0) // Unplaced, don't care.
continue;
+ if (tigs[fiTigID]->_isUnassembled == true) // Unassembled, don't care.
+ continue;
+
if (tigEndsOnly == true) {
uint32 f = tigs[fiTigID]->firstRead()->ident;
uint32 l = tigs[fiTigID]->lastRead()->ident;
@@ -820,7 +823,7 @@ AssemblyGraph::reportReadGraph(TigVector &tigs, const char *prefix, const char *
uint64 nEdgeToUnasm = 0;
- writeStatus("AssemblyGraph()-- generating '%s.%s.edges.gfa'.\n", prefix, label);
+ writeStatus("AssemblyGraph()-- generating '%s.%s.assembly.gfa'.\n", prefix, label);
snprintf(N, FILENAME_MAX, "%s.%s.assembly.gfa", prefix, label);
diff --git a/src/bogart/AS_BAT_BestOverlapGraph.C b/src/bogart/AS_BAT_BestOverlapGraph.C
index 6030cd7..e83c4fc 100644
--- a/src/bogart/AS_BAT_BestOverlapGraph.C
+++ b/src/bogart/AS_BAT_BestOverlapGraph.C
@@ -331,23 +331,30 @@ BestOverlapGraph::removeSpurs(const char *prefix) {
// Contained, not a spur.
continue;
- if ((spur5 == false) && (spur3 == false))
+ if ((spur5 == false) &&
+ (spur3 == false))
// Edges off of both ends. Not a spur.
continue;
- if ((spur5 == true) && (spur3 == true))
- // No edges off either end. Not a spur, just garbage.
- continue;
+ // We've now got either a spur or a singleton.
+ //
+ // How do we get an edge to a singleton, which, by definition, has no edges? The one case I
+ // looked at had different error rates for the A->B and B->A overlap, and these straddled the
+ // error rate cutoff. Dmel had 357 edges to singletons; I didn't look at any of them.
- // Exactly one end is missing a best edge. Bad!
+ bool isSingleton = ((spur5 == true) && (spur3 == true));
if (F)
- fprintf(F, F_U32" %c'\n", fi, (spur5) ? '5' : '3');
+ fprintf(F, F_U32" %s\n", fi, (isSingleton) ? "singleton" : ((spur5) ? "5'" : "3'"));
- _spur.insert(fi);
+ if (isSingleton)
+ _singleton.insert(fi);
+ else
+ _spur.insert(fi);
}
- writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " spur reads.\n", _spur.size());
+ writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " spur reads and " F_SIZE_T " singleton reads.\n",
+ _spur.size(), _singleton.size());
if (F)
fclose(F);
@@ -383,7 +390,8 @@ BestOverlapGraph::findEdges(void) {
// they shouldn't because they're spurs).
for (uint32 ii=0; ii<no; ii++)
- if (_spur.count(ovl[ii].b_iid) == 0)
+ if ((_spur.count(ovl[ii].b_iid) == 0) &&
+ (_singleton.count(ovl[ii].b_iid) == 0))
scoreEdge(ovl[ii]);
}
}
@@ -434,6 +442,7 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
_n2EdgeIncompatible = 0;
_suspicious.clear();
+ _singleton.clear();
_bestM.clear();
_scorM.clear();
diff --git a/src/bogart/AS_BAT_BestOverlapGraph.H b/src/bogart/AS_BAT_BestOverlapGraph.H
index 5d14d29..c1fd62e 100644
--- a/src/bogart/AS_BAT_BestOverlapGraph.H
+++ b/src/bogart/AS_BAT_BestOverlapGraph.H
@@ -278,6 +278,7 @@ private:
uint32 _n2EdgeIncompatible;
set<uint32> _suspicious;
+ set<uint32> _singleton;
set<uint32> _spur;
map<uint32, BestOverlaps> _bestM;
diff --git a/src/bogart/AS_BAT_CreateUnitigs.C b/src/bogart/AS_BAT_CreateUnitigs.C
index ac168bb..5d32a7a 100644
--- a/src/bogart/AS_BAT_CreateUnitigs.C
+++ b/src/bogart/AS_BAT_CreateUnitigs.C
@@ -32,6 +32,8 @@
#include "AS_BAT_Unitig.H"
#include "AS_BAT_TigVector.H"
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
+
#include "AS_BAT_CreateUnitigs.H"
@@ -59,6 +61,13 @@ public:
return(a < b);
};
+ bool operator==(breakPointEnd const &that) const {
+ uint64 a = _tigID; a <<= 32; a |= _pos; a <<= 1; a |= _bgn; // Because _tigID is 32-bit
+ uint64 b = that._tigID; b <<= 32; b |= that._pos; b <<= 1; b |= that._bgn;
+
+ return(a == b);
+ };
+
uint32 _tigID;
uint32 _pos;
bool _bgn;
@@ -76,9 +85,7 @@ copyTig(TigVector &tigs,
Unitig *newtig = tigs.newUnitig(false);
newtig->_isUnassembled = oldtig->_isUnassembled;
- newtig->_isBubble = oldtig->_isBubble;
newtig->_isRepeat = oldtig->_isRepeat;
- newtig->_isCircular = oldtig->_isCircular;
for (uint32 fi=0; fi<oldtig->ufpath.size(); fi++)
newtig->addRead(oldtig->ufpath[fi], 0, false);
@@ -98,28 +105,42 @@ splitTig(TigVector &tigs,
vector<breakPointEnd> &BP,
Unitig **newTigs,
int32 *lowCoord,
- uint32 *nMoved,
bool doMove) {
- if (doMove == true) {
- memset(newTigs, 0, sizeof(Unitig *) * BP.size());
- memset(lowCoord, 0, sizeof(int32) * BP.size());
- } else {
- memset(nMoved, 0, sizeof(uint32) * BP.size());
- }
+ writeLog("\n");
+ writeLog("splitTig()-- processing tig %u\n", tig->id());
+
+ // The first call is with doMove = false. This call just figures out how many new tigs are
+ // created. We use nMoved to count if a new tig is made for a break point.
- if (doMove)
+ uint32 *nMoved = NULL;
+
+ if (doMove == false)
+ allocateArray(nMoved, BP.size() + 2);
+
+ // The second call is with doMove = true. This does the actual moving.
+
+ if (doMove == true)
+ for (uint32 tt=0; tt < BP.size() + 2; tt++) {
+ newTigs[tt] = NULL;
+ lowCoord[tt] = INT32_MAX;
+ }
+
+ if (doMove == true)
for (uint32 tt=0; tt < BP.size() - 1; tt++)
writeLog("splitTig()-- piece %2u from %8u %c to %8u %c\n",
tt,
BP[tt ]._pos, BP[tt ]._bgn ? 't' : 'f',
BP[tt+1]._pos, BP[tt+1]._bgn ? 't' : 'f');
+
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode &read = tig->ufpath[fi];
uint32 lo = read.position.min();
uint32 hi = read.position.max();
+ //writeLog("splitTig()-- processing read #%u ident %u pos %u-%u\n", fi, read.ident, lo, hi);
+
// Find the intervals the end points of the read fall into. Suppose we're trying to place
// the long read. It begins in piece 1 and ends in piece 6.
//
@@ -205,12 +226,13 @@ splitTig(TigVector &tigs,
// Now move the read, or account for moving it.
if (doMove) {
- //writeLog("splitTig()-- Move read %8u %8u-%-8u to piece %2u tig %6u\n",
- // read.ident, read.position.bgn, read.position.end, finBP, newTigs[finBP]->id());
+ writeLog("splitTig()-- Move read %8u %8u-%-8u to piece %2u tig %6u\n",
+ read.ident, read.position.bgn, read.position.end, finBP, newTigs[finBP]->id());
newTigs[finBP]->addRead(read, -lowCoord[finBP], false);
}
else {
- //writeLog("splitTig()-- Move read %u %u-%u to piece %u (pos=%u)\n", read.ident, read.position.bgn, read.position.end, bp, BP[finBP]._pos);
+ //writeLog("splitTig()-- Move read %u %u-%u to piece %u (pos=%u)\n",
+ // read.ident, read.position.bgn, read.position.end, finBP, BP[finBP]._pos);
nMoved[finBP]++;
}
}
@@ -219,9 +241,13 @@ splitTig(TigVector &tigs,
uint32 nTigsCreated = 0;
- for (uint32 ii=0; ii<BP.size(); ii++)
- if (nMoved[ii] > 0)
- nTigsCreated++;
+ if (doMove == false) {
+ for (uint32 ii=0; ii<BP.size(); ii++)
+ if (nMoved[ii] > 0)
+ nTigsCreated++;
+
+ delete [] nMoved;
+ }
return(nTigsCreated);
}
@@ -229,84 +255,297 @@ splitTig(TigVector &tigs,
static
+uint32
+checkReadContained(overlapPlacement &op,
+ Unitig *tgB) {
+
+ for (uint32 ii=op.tigFidx; ii<=op.tigLidx; ii++) {
+ if (isContained(op.verified, tgB->ufpath[ii].position))
+ return(ii + 1);
+ }
+
+ return(0);
+}
+
+
+
+// Decide which read, and which end, we're overlapping. We know:
+//
+// verified tells us the positions covered with overlaps and the orietation of the aligned read
+//
+// isFirst and rdAfwd tell if the invading tig is flopping free to the left
+// or right of this location
+//
+// break here
+// v
+// invaded tig ----------------------------------------------
+// ------------>
+// ------->
+// <------------------ (ignore these two container reads)
+// <------------ (in reality, this wouldn't be split)
+// | |
+// (overlap) (verified.isForward() == false)
+// | |
+// <--------
+// -----------
+// -------------->
+//
+// isLow is true if this coordinate is the start of the read placement
+//
void
-checkRead(AssemblyGraph *AG,
- TigVector &contigs,
- vector<breakPointEnd> &breaks,
- Unitig *tgA, ufNode *rdA,
- bool isFirst) {
+findEnd(overlapPlacement &op,
+ bool rdAfwd,
+ bool isFirst,
+ bool &isLow,
+ int32 &coord) {
+
+ if (((isFirst == true) && (rdAfwd == true) && (op.verified.isForward() == true)) ||
+ ((isFirst == true) && (rdAfwd == false) && (op.verified.isForward() == false)) ||
+ ((isFirst == false) && (rdAfwd == false) && (op.verified.isForward() == true)) || // rdAfwd is opposite what reality is,
+ ((isFirst == false) && (rdAfwd == true) && (op.verified.isForward() == false))) { // because we've flipped the tig outside here
+ isLow = false;
+ coord = INT32_MIN;
+ } else {
+ isLow = true;
+ coord = INT32_MAX;
+ }
+}
+
- for (uint32 pp=0; pp<AG->getForward(rdA->ident).size(); pp++) {
- BestPlacement &pf = AG->getForward(rdA->ident)[pp];
- // If a contained edge, we cannot split the other tig; it is correct (this read is contained in the other read).
- if (pf.bestC.b_iid > 0) {
- writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u position %d-%d CONTAINED\n",
- rdA->ident, contigs.inUnitig(pf.bestC.b_iid), pf.bestC.b_iid, pf.placedBgn, pf.placedEnd);
+static
+uint32
+checkRead(Unitig *tgA,
+ ufNode *rdA,
+ vector<overlapPlacement> &rdAplacements,
+ TigVector &contigs,
+ vector<breakPointEnd> &breakpoints,
+ uint32 minOverlap,
+ uint32 maxPlacements,
+ bool isFirst) {
+ bool verbose = true;
+
+ // To support maxPlacements, we first find all the breaks as we've done forever, then simply
+ // ignore them if there are too many.
+
+ vector<breakPointEnd> breaks;
+
+ for (uint32 pp=0; pp<rdAplacements.size(); pp++) {
+ overlapPlacement &op = rdAplacements[pp];
+ Unitig *tgB = contigs[op.tigID];
+
+ bool toUnassembled = false;
+ bool toSelf = false;
+ bool expected5 = false;
+ bool expected3 = false;
+ bool tooSmall = false;
+ bool isContained = false;
+ bool noOverlaps = false;
+ bool notSimilar = false;
+
+ // Silently ignore stuff to unassembled tigs.
+
+ if (tgB->_isUnassembled == true) {
+ toUnassembled = true;
continue;
}
- // Decide which overlap we want to be using, based on the orientation of the read in the tig,
- // and if it is the first or last read.
+ // If we're overlapping with ourself, not a useful edge to be splitting on.
+
+ if ((tgA->id() == tgB->id()) && (isOverlapping(op.verified, rdA->position))) {
+ toSelf = true;
+ if (verbose == false)
+ continue;
+ }
+
+ // If the overlap is on the end that is used in the tig, not a useful edge.
//
- // first == true first == false
- // best5 fwd == true ---------> fwd == false <---------
- // best3 fwd == false <---------- fwd == true --------->
+ // first == true (tig) first == false (tig)
+ // is5 fwd == true ----------> fwd == false <---------
+ // is3 fwd == false <---------- fwd == true --------->
- BAToverlap best = (isFirst == rdA->position.isForward()) ? pf.best5 : pf.best3;
+ bool is5 = (isFirst == rdA->position.isForward()) ? true : false;
- // If there is no overlap on the expected end, well, that's it, nothing we can do but give up.
- // Don't bother logging if it is the internal edge (which it shouldn't ever be, because those shouldn't
- // be in the graph, right?)
+ if ((is5 == true) && (op.covered.bgn != 0)) {
+ expected5 = true;
+ if (verbose == false)
+ continue;
+ }
- if (best.b_iid == 0) {
- uint32 rdC = (isFirst == rdA->position.isForward()) ? pf.best3.b_iid : pf.best5.b_iid; // Grab the other edge
- uint32 tgC = contigs.inUnitig(rdC);
+ if ((is5 == false) && (op.covered.end != RI->readLength(rdA->ident))) {
+ expected3 = true;
+ if (verbose == false)
+ continue;
+ }
- if (tgC != tgA->id())
- writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u position %d-%d WRONG_END\n",
- rdA->ident, tgC, rdC, pf.placedBgn, pf.placedEnd);
- continue;
+ // If too small, bail.
+
+ if (op.verified.max() - op.verified.min() < minOverlap) {
+ tooSmall = true;
+ if (verbose == false)
+ continue;
}
- // Grab the tig and read we overlap to.
+ // Sacn all the reads we supposedly overlap, checking for overlaps. Save the one that is the
+ // lowest (is5 == true) or highest (is5 == false). Also, compute an average erate for the
+ // overlaps to this read.
- Unitig *tgB = contigs[ contigs.inUnitig(best.b_iid) ];
- ufNode *rdB = &tgB->ufpath[ contigs.ufpathIdx(best.b_iid) ];
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(rdA->ident, ovlLen);
- // And find the coordinate of the break based on the orientation of the rdB and the overlap.
- // isLow is true if the read is forward and the overlap is off of its 5' end, or
- // if the read is reverse and the overlap is off of its 3' end
+ double erate = 0.0;
+ uint32 erateN = 0;
- bool isLow = (rdB->position.isForward()) ? best.BEndIs5prime() : best.BEndIs3prime();
- uint32 coord = (isLow == true) ? rdB->position.min() : rdB->position.max();
+ bool isLow = false;
+ int32 coord = 0;
+ ufNode *rdB = NULL;
- // With all that done, throw out the edge if the overlap was used to form the contig itself.
- //
- // We used to also throw out edges to validated repeats (pf.isRepeat == true), but those are
- // indistinguishable from bubbles.
+ // DEBUG: If not to self, try to find the overlap. Otherwise, this just adds useless clutter,
+ // the self edge is disqualifying enough.
- if (pf.isContig == true) {
- writeLog("createUnitigs()-- read %6u edgeTo tig %5u at coordinate %8u via intersection with read %6u IS_%s\n",
- rdA->ident, tgB->id(), coord, rdB->ident, (pf.isContig == true) ? "CONTIG" : "REPEAT");
- continue;
- }
+ if (toSelf == false) {
+ findEnd(op, rdA->position.isForward(), isFirst, isLow, coord); // Simple code, but lots of comments.
- // Also chuck it out if it is to garbage.
+ writeLog("\n");
+ writeLog("Scan reads from #%u to #%u for %s coordinate in verified region %u-%u\n",
+ op.tigFidx, op.tigLidx,
+ (isLow) ? "low" : "high",
+ op.verified.min(), op.verified.max());
- if (tgB->_isUnassembled == true) {
- writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u UNASSEMBLED\n",
- rdA->ident, tgB->id(), rdB->ident);
+ for (uint32 ii=op.tigFidx; ii<=op.tigLidx; ii++) {
+ for (uint32 oo=0; oo<ovlLen; oo++) {
+ ufNode *rdBii = &tgB->ufpath[ii];
+
+ if (ovl[oo].b_iid != rdBii->ident)
+ continue;
+
+ writeLog("Test read #%6u ident %7u %9u-%9u against verified region %9u-%9u",
+ ii,
+ rdBii->ident, rdBii->position.min(), rdBii->position.max(),
+ op.verified.min(), op.verified.max());
+
+ erate += ovl[oo].erate();
+ erateN += 1;
+
+ // Split on the higher coordinate. If this is larger than the current coordinate AND still
+ // within the verified overlap range, reset the coordinate. Allow only dovetail overlaps.
+
+ if ((isLow == false) && (rdBii->position.max() < op.verified.max())) {
+ writeLog(" - CANDIDATE hangs %7d %7d", ovl[oo].a_hang, ovl[oo].b_hang);
+
+ if ((rdBii->position.max() > coord) && (rdBii->position.min() < op.verified.min()) /* && (ovl[oo].a_hang < 0) */) {
+ writeLog(" - SAVED");
+ rdB = rdBii;
+ coord = rdBii->position.max();
+ }
+ }
+
+ // Split on the lower coordinate.
+
+ if ((isLow == true) && (rdBii->position.min() > op.verified.min())) {
+ writeLog(" - CANDIDATE hangs %7d %7d", ovl[oo].a_hang, ovl[oo].b_hang);
+
+ if ((rdBii->position.min() < coord) && (rdBii->position.max() > op.verified.max()) /* && (ovl[oo].b_hang > 0) */) {
+ writeLog(" - SAVED");
+ rdB = rdBii;
+ coord = rdBii->position.min();
+ }
+ }
+
+ writeLog("\n");
+ }
+ }
+
+ if (erateN > 0)
+ erate /= erateN;
+
+ // Huh? If didn't find any overlaps, give up without crashing (this hasn't ever been triggered).
+
+ if (rdB == NULL) {
+ writeLog("\n");
+ writeLog("Failed to find appropriate intersecting read.\n");
+ writeLog("\n");
+ flushLog();
+
+ noOverlaps = true;
+ if (verbose == false)
+ continue;
+ } else {
+ writeLog("Found appropriate intersecting read.\n");
+ }
+ } // End of toSelf DEBUG
+
+ // Finally, ignore it if the overlap isn't similar to everything else in the tig. A
+ // complication here is we don't know what erate we have between tgA and tgB. We approximate
+ // it by averaging all the overlaps from rdA to the reads it overlaps here. Kind of expensive,
+ // too bad.
+
+#define REPEAT_FRACTION 0.5
+
+#warning deviationGraph hard coded
+double deviationGraph = 6;
+
+ double sim = tgB->overlapConsistentWithTig(deviationGraph, op.verified.min(), op.verified.max(), erate);
+
+ if (sim < REPEAT_FRACTION) {
+ notSimilar = true;
+ if (verbose == false)
+ continue;
+ }
+
+ // if not useful, bail. This only occurs here if verbose == true, otherwise, we shortcircuit in the tests above.
+
+ if (toSelf || expected5 || expected3 || tooSmall || isContained || noOverlaps || notSimilar) {
+ if (verbose)
+ writeLog("createUnitigs()-- read %6u place %3d edgeTo tig %5u reads #%5u %9u-%9u verified %9d-%9d position %9d-%9d covered %7d-%7d%s%s%s%s%s%s%s\n",
+ rdA->ident, pp, op.tigID,
+ op.tigFidx, tgB->ufpath[op.tigFidx].ident, tgB->ufpath[op.tigLidx].ident,
+ op.verified.bgn, op.verified.end,
+ op.position.bgn, op.position.end,
+ op.covered.bgn, op.covered.end,
+ (toSelf == true) ? " SELF" : "",
+ (expected5 == true) ? " EXPECTED_5'" : "",
+ (expected3 == true) ? " EXPECTED_3'" : "",
+ (tooSmall == true) ? " TOO_SMALL" : "",
+ (isContained == true) ? " IS_CONTAINED" : "", // Would be nice to report read it's contained in?
+ (noOverlaps == true) ? " NO_OVERLAPS" : "",
+ (notSimilar == true) ? " NOT_SIMILAR" : "");
continue;
}
- // If here, we're all golden!
+ // Otherwise, it's a useful edge.
+
+ if (verbose)
+ writeLog("createUnitigs()-- read %6u place %3d edgeTo tig %5u reads #%5u %9u-%9u verified %9d-%9d position %9d-%9d covered %7d-%7d BREAK at pos %8u read %6u isLow %u sim %.4f\n",
+ rdA->ident, pp, op.tigID,
+ op.tigFidx, tgB->ufpath[op.tigFidx].ident, tgB->ufpath[op.tigLidx].ident,
+ op.verified.bgn, op.verified.end,
+ op.position.bgn, op.position.end,
+ op.covered.bgn, op.covered.end,
+ coord, rdB->ident, isLow, sim);
- writeLog("splitThinEdge()-- read %6u splits tig %5u at coordinate %8u via intersection with read %6u isLow %u\n",
- rdA->ident, pf.tigID, coord, rdB->ident, isLow);
- breaks.push_back(breakPointEnd(pf.tigID, coord, isLow));
+ breaks.push_back(breakPointEnd(op.tigID, coord, isLow));
}
+
+ if (breaks.size() == 0) {
+ // Do nothing.
+ }
+
+ else if (breaks.size() > maxPlacements) {
+ writeLog("createUnitigs()-- discarding %u breakpoints.\n", breaks.size());
+ }
+
+ else if (breaks.size() <= maxPlacements) {
+ writeLog("createUnitigs()-- saving %u breakpoints to master list.\n", breaks.size());
+
+ //breakpoints.isert(breakpoints.end(), breaks.begin(), breaks.end());
+
+ for (uint32 ii=0; ii<breaks.size(); ii++)
+ breakpoints.push_back(breaks[ii]);
+ }
+
+ return(breaks.size());
}
@@ -318,19 +557,16 @@ stripNonBackboneFromStart(TigVector &unitigs, Unitig *tig, bool isFirst) {
while (RI->isBackbone(tig->ufpath[ii].ident) == false) { // Find the first backbone read,
unitigs.registerRead(tig->ufpath[ii].ident);
- writeLog("WARNING: unitig %u %s read %u is not backbone, removing.\n",
+ writeLog("WARNING: unitig %u %s read %8u %9u-%9u is not backbone, removing.\n",
tig->id(),
isFirst ? "first" : "last ",
- tig->ufpath[ii].ident);
+ tig->ufpath[ii].ident,
+ tig->ufpath[ii].position.bgn, tig->ufpath[ii].position.end);
ii++;
}
while (ii < tig->ufpath.size()) { // and copy to a new vector.
ufpath.push_back(tig->ufpath[ii]);
- writeLog("SAVE unitig %u %s read %u IS backbone.\n",
- tig->id(),
- isFirst ? "first" : "last ",
- tig->ufpath[ii].ident);
ii++;
}
@@ -342,65 +578,169 @@ stripNonBackboneFromStart(TigVector &unitigs, Unitig *tig, bool isFirst) {
void
-createUnitigs(AssemblyGraph *AG,
- TigVector &contigs,
- TigVector &unitigs,
- vector<tigLoc> &unitigSource) {
+createUnitigs(TigVector &contigs,
+ TigVector &unitigs,
+ uint32 minIntersectLen,
+ uint32 maxPlacements,
+ vector<confusedEdge> &confusedEdges,
+ vector<tigLoc> &unitigSource) {
vector<breakPointEnd> breaks;
- // Check the reads at the end of every tig for intersections to other tigs. If the read has a
- // compatible overlap to the middle of some other tig, split the other tig into multiple unitigs.
+ uint32 nBreaksSentinel;
+ uint32 nBreaksConfused;
+ uint32 nBreaksIntersection;
+
+
+ // Give each tig a pair of bogus breakpoints at the ends, just to get it in the list. If there
+ // are no break points, it won't be split. These also serve as sentinels during splitting.
writeLog("\n");
writeLog("----------------------------------------\n");
- writeLog("Finding contig-end to contig-middle intersections.\n");
+ writeLog("Adding sentinel breaks at the ends of contigs.\n");
for (uint32 ti=0; ti<contigs.size(); ti++) {
Unitig *tig = contigs[ti];
- if (tig == NULL)
+ if ((tig == NULL) ||
+ (tig->_isUnassembled == true))
continue;
- if (tig->_isUnassembled == true) // Edge is FROM an unassembled thing, ignore it.
+ breaks.push_back(breakPointEnd(ti, 0, true)); // Add one at the start of the tig
+ breaks.push_back(breakPointEnd(ti, tig->getLength(), false)); // And one at the end
+ }
+
+ nBreaksSentinel = breaks.size();
+
+
+ // Add breaks for any confused edges detected during repeat detection. We should, probably,
+ // remove duplicates, but they (should) cause no harm.
+
+ writeLog("\n");
+ writeLog("----------------------------------------\n");
+ writeLog("Adding breaks at confused reads.\n");
+
+ for (uint32 ii=0; ii<confusedEdges.size(); ii++) {
+ uint32 aid = confusedEdges[ii].aid;
+ uint32 a3p = confusedEdges[ii].a3p;
+
+ uint32 tid = contigs.inUnitig(aid);
+ uint32 tpp = contigs.ufpathIdx(aid); // Not the Trans-Pacific Partnership, FYI.
+
+ Unitig *tig = contigs[tid];
+ ufNode *rda = &tig->ufpath[tpp];
+
+ if ((tig == NULL) || // It won't be NULL, but we definitely don't want to
+ (tig->_isUnassembled == true)) // see unassembled crap here. We don't care, and they'll crash.
continue;
- // Give this tig a pair of bogus breakpoints at the ends, just to get it in the list. If there
- // are no break points, it won't be split. These also serve as sentinels during splitting.
+ uint32 coord = 0; // Pick the coordinate and set isLow based on orientation
+ bool isLow = false; // and the end of the read that is confused.
- breaks.push_back(breakPointEnd(ti, 0, true)); // Add one at the start of the tig
- breaks.push_back(breakPointEnd(ti, tig->getLength(), false)); // And one at the end
+ if (((rda->position.isForward() == true) && (a3p == true)) ||
+ ((rda->position.isForward() == false) && (a3p == false))) {
+ coord = rda->position.max();
+ isLow = false;
+ }
+
+ if (((rda->position.isForward() == true) && (a3p == false)) ||
+ ((rda->position.isForward() == false) && (a3p == true))) {
+ coord = rda->position.min();
+ isLow = true;
+ }
+
+ breakPointEnd bp(tid, coord, isLow);
+
+ if (breaks.back() == bp)
+ continue;
+
+ writeLog("createUnitigs()-- add break tig %u pos %u isLow %c\n", tid, coord, (isLow) ? 't' : 'f');
+
+ breaks.push_back(bp);
+ }
+
+ nBreaksConfused = breaks.size();
+
+
+ // Check the reads at the end of every tig for intersections to other tigs. If the read has a
+ // compatible overlap to the middle of some other tig, split the other tig into multiple unitigs.
+
+ writeLog("\n");
+ writeLog("----------------------------------------\n");
+ writeLog("Finding contig-end to contig-middle intersections.\n");
+
+ uint32 *numP = NULL;
+ uint32 lenP = 0;
+ uint32 maxP = 1024;
+
+ allocateArray(numP, maxP);
+
+ for (uint32 ti=0; ti<contigs.size(); ti++) {
+ Unitig *tig = contigs[ti];
+
+ if ((tig == NULL) ||
+ (tig->_isUnassembled == true))
+ continue;
// Find break points in other tigs using the first and last reads.
- ufNode *fi = tig->firstRead();
- ufNode *li = tig->lastRead();
+ ufNode *fi = tig->firstRead();
+ ufNode *li = tig->lastRead();
+ vector<overlapPlacement> fiPlacements;
+ vector<overlapPlacement> liPlacements;
- if (AG->getForward(fi->ident).size() + AG->getForward(li->ident).size() > 0)
- writeLog("\ncreateUnitigs()-- tig %u len %u first read %u with %lu edges - last read %u with %lu edges\n",
+ placeReadUsingOverlaps(contigs, NULL, fi->ident, fiPlacements, placeRead_all);
+ placeReadUsingOverlaps(contigs, NULL, li->ident, liPlacements, placeRead_all);
+
+ if (fiPlacements.size() + liPlacements.size() > 0)
+ writeLog("\ncreateUnitigs()-- tig %u len %u first read %u with %lu placements - last read %u with %lu placements\n",
ti, tig->getLength(),
- fi->ident, AG->getForward(fi->ident).size(),
- li->ident, AG->getForward(li->ident).size());
+ fi->ident, fiPlacements.size(),
+ li->ident, liPlacements.size());
+
+ uint32 npf = checkRead(tig, fi, fiPlacements, contigs, breaks, minIntersectLen, maxPlacements, true);
+ uint32 npr = checkRead(tig, li, liPlacements, contigs, breaks, minIntersectLen, maxPlacements, false);
+
+ lenP = max(lenP, npf);
+ lenP = max(lenP, npr);
+
+ resizeArray(numP, maxP, maxP, lenP+1, resizeArray_copyData | resizeArray_clearNew);
- checkRead(AG, contigs, breaks, tig, fi, true);
- checkRead(AG, contigs, breaks, tig, li, false);
+ numP[npf]++;
+ numP[npr]++;
}
+ nBreaksIntersection = breaks.size();
+
+ writeLog("\n");
+ writeLog("Histogram of number of placements per contig end:\n");
+ writeLog("numPlacements numEnds\n");
+ for (uint32 pp=0; pp<=lenP; pp++)
+ writeLog("%13u %7u\n", pp, numP[pp]);
+ writeLog("\n");
+ writeLog("----------------------------------------\n");
+ writeLog("Found %u breakpoints (including duplicates).\n", breaks.size());
+ writeLog(" %u from sentinels.\n", nBreaksSentinel);
+ writeLog(" %u from confused edges.\n", nBreaksConfused - nBreaksSentinel);
+ writeLog(" %u from intersections.\n", nBreaksIntersection - nBreaksConfused);
+ writeLog("\n");
+ writeLog("Splitting contigs into unitigs.\n");
+ writeLog("\n");
+
+ delete [] numP;
+
// The splitTigs function operates only on a single tig. Sort the break points
// by tig id to find all the break points for each tig.
sort(breaks.begin(), breaks.end());
- writeLog("\n");
- writeLog("createUnitigs()-- Found %u breakpoints.\n", breaks.size());
// Allocate space for breaking tigs. These are _vastly_ too big, but guaranteed.
vector<breakPointEnd> BP;
Unitig **newTigs = new Unitig * [breaks.size() + 2]; // Plus two, because we add an extra
- int32 *lowCoord = new int32 [breaks.size() + 2]; // break at the start and end
- uint32 *nMoved = new uint32 [breaks.size() + 2]; // of each set.
+ int32 *lowCoord = new int32 [breaks.size() + 2]; // break at the start and end of each set.
// Walk through the breaks, making a new vector of breaks for each tig.
@@ -433,16 +773,17 @@ createUnitigs(AssemblyGraph *AG,
// Split the tig. Copy it into the unitigs TigVector too.
- uint32 nTigs = splitTig(contigs, tig, BP, newTigs, lowCoord, nMoved, false);
+ uint32 nTigs = splitTig(contigs, tig, BP, newTigs, lowCoord, false);
if (nTigs > 1) {
- splitTig(unitigs, tig, BP, newTigs, lowCoord, nMoved, true);
+ splitTig(unitigs, tig, BP, newTigs, lowCoord, true);
writeLog("createUnitigs()-- contig %u was split into %u unitigs, %u through %u.\n", // Can't use newTigs, because
tig->id(), nTigs, unitigs.size() - nTigs, unitigs.size() - 1); // there are holes in it
}
else {
- newTigs[0] = copyTig(unitigs, tig);
+ newTigs[0] = copyTig(unitigs, tig); // splitTig populates newTigs and lowCoord, used below.
+ lowCoord[0] = 0;
writeLog("createUnitigs()-- contig %u copied into unitig %u.\n", tig->id(), newTigs[0]->id());
}
@@ -475,10 +816,12 @@ createUnitigs(AssemblyGraph *AG,
// If the last read in the tig is not a backbone read, we can remove it and all reads that come
// after it (because those reads are contained).
+#if 1
for (uint32 ti=0; ti<unitigs.size(); ti++) {
Unitig *tig = unitigs[ti];
- if (tig == NULL)
+ if ((tig == NULL) ||
+ (tig->_isUnassembled == true))
continue;
// First, check if we have any backbone reads. If we have none, leave it as is.
@@ -501,14 +844,14 @@ createUnitigs(AssemblyGraph *AG,
writeLog("unitig %u with %u reads, %u backbone and %u unplaced.\n",
tig->id(), tig->ufpath.size(), bbReads, nbReads);
- stripNonBackboneFromStart(unitigs, tig, true);
+ stripNonBackboneFromStart(unitigs, tig, true); // Does reverse complement at very end
stripNonBackboneFromStart(unitigs, tig, false);
}
+#endif
// Cleanup.
delete [] newTigs;
delete [] lowCoord;
- delete [] nMoved;
}
diff --git a/src/bogart/AS_BAT_CreateUnitigs.H b/src/bogart/AS_BAT_CreateUnitigs.H
index b8a35e1..5dfda8f 100644
--- a/src/bogart/AS_BAT_CreateUnitigs.H
+++ b/src/bogart/AS_BAT_CreateUnitigs.H
@@ -32,6 +32,8 @@
#include "AS_BAT_AssemblyGraph.H"
#include "AS_BAT_Logging.H"
+#include "AS_BAT_MarkRepeatReads.H" // confusedEdge
+
#include "AS_BAT_TigVector.H"
@@ -53,9 +55,11 @@ public:
void
-createUnitigs(AssemblyGraph *AG,
- TigVector &contigs,
- TigVector &unitigs,
- vector<tigLoc> &unitigSource);
+createUnitigs(TigVector &contigs,
+ TigVector &unitigs,
+ uint32 minIntersectLen,
+ uint32 maxPlacements,
+ vector<confusedEdge> &confusedEdges,
+ vector<tigLoc> &unitigSource);
#endif // AS_BAT_CREATEUNITIGS_H
diff --git a/src/bogart/AS_BAT_DropDeadEnds.C b/src/bogart/AS_BAT_DropDeadEnds.C
new file mode 100644
index 0000000..4a1eb3d
--- /dev/null
+++ b/src/bogart/AS_BAT_DropDeadEnds.C
@@ -0,0 +1,297 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2017-MAY-31
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_OverlapCache.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_Unitig.H"
+#include "AS_BAT_TigVector.H"
+
+#include "AS_BAT_CreateUnitigs.H"
+
+
+
+
+// Find the next/previous read in the tig. Skips contained reads if they are isolated.
+
+ufNode *
+findNextRead(Unitig *tig,
+ ufNode *fn) {
+
+ for (uint32 ni = tig->ufpathIdx(fn->ident)+1; ni < tig->ufpath.size(); ni++) {
+ ufNode *nn = &tig->ufpath[ni];
+
+ // If nn is dovetail, return it.
+ // fn -----------
+ // nn ---------
+ //
+ if (fn->position.max() < nn->position.max())
+ return(nn);
+
+ // Otherwise, if it intersects the next-next read, return it.
+ // fn ----------------------
+ // nn ---------
+ // next-next -------
+ //
+ if ((ni + 1 < tig->ufpath.size()) &&
+ (tig->ufpath[ni+1].position.min() < nn->position.max()))
+ return(nn);
+ }
+
+ // Otherwise, ran out of reads.
+
+ return(NULL);
+}
+
+
+
+#if 0
+
+// Not used anymore. Might be incorrect.
+
+ufNode *
+findPrevRead(Unitig *tig,
+ ufNode *li) {
+
+ // A significant complication of working with reads on the 3' end is that they aren't sorted by
+ // their end position. We get around this by saving a copy of the existing reads, reverse
+ // complementing that, and using the same method as in findNextRead().
+ //
+ // Don't be clever and think you can just reverse complement the tig; that can change order of
+ // reads, and we don't want to do that here.
+
+ vector<ufNode> ufcopy;
+
+ ufcopy.resize(tig->ufpath.size());
+
+ for (uint32 ii=0; ii<tig->ufpath.size(); ii++) {
+ ufcopy[ii] = tig->ufpath[ii];
+
+ ufcopy[ii].position.bgn = tig->getLength() - tig->ufpath[ii].position.bgn;
+ ufcopy[ii].position.end = tig->getLength() - tig->ufpath[ii].position.end;
+ }
+
+ std::sort(ufcopy.begin(), ufcopy.end());
+
+ // ufpathIdx() won't work anymore, but li should be the first read.
+
+ uint32 niPos=0;
+
+ while (ufcopy[niPos].ident != li->ident)
+ niPos++;
+
+ // Set 'fn' to that first node, and search for the next node. This is nearly cut-n-paste from
+ // above (just replaced the return value with one that uses the ufpath).
+
+ ufNode *fn = &ufcopy[niPos];
+
+ for (uint32 ni = niPos+1; ni < tig->ufpath.size(); ni++) {
+ ufNode *nn = &ufcopy[ni];
+
+ if (fn->position.max() < nn->position.max())
+ return(&tig->ufpath[ tig->ufpathIdx(nn->ident) ]);
+
+ if ((ni + 1 < tig->ufpath.size()) &&
+ (ufcopy[ni+1].position.min() < nn->position.max()))
+ return(&tig->ufpath[ tig->ufpathIdx(nn->ident) ]);
+ }
+
+ // Otherwise, ran out of reads.
+
+ return(NULL);
+}
+
+#endif
+
+
+
+
+uint32
+dropDeadFirstRead(AssemblyGraph *AG,
+ Unitig *tig) {
+
+ ufNode *fn = tig->firstRead();
+ ufNode *sn = findNextRead(tig, fn);
+
+ // No next read, keep fn in the tig.
+
+ if (sn == NULL) {
+ writeLog("dropDead()- read %8u no sn\n", fn->ident);
+ return(0);
+ }
+
+ // Over all edges from the first read, look for any edge to something else.
+ //
+ // If a contained edge to anything, read fn is good and should be kept.
+ //
+ // Otherwise, decide which overlap we want to be using, based on the orientation of the read in
+ // the tig. We assume that this is always the first read, which is OK, because the function name
+ // says so. Any edge to anywhere means the read is good and should be kept.
+
+ for (uint32 pp=0; pp<AG->getForward(fn->ident).size(); pp++) {
+ BestPlacement &pf = AG->getForward(fn->ident)[pp];
+
+ writeLog("dropDead()-- 1st read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
+ fn->ident,
+ fn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(fn->ident).size(),
+ pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
+
+ if (pf.bestC.b_iid > 0) {
+ return(0);
+ }
+
+ if (((fn->position.isForward() == true) && (pf.best5.b_iid != 0)) ||
+ ((fn->position.isForward() == false) && (pf.best3.b_iid != 0)))
+ return(0);
+ }
+
+ // But no edge means we need to check the second read. If it has an edge, then we infer the
+ // first read is bogus and should be removed. If it also has no edge (except to the first read,
+ // duh) then we know nothing: this could be novel sequence or it could be the same garbage that
+ // is infecting the first read.
+ //
+ // This is basically the same as the previous loop, except we also need to exclude edges to the
+ // first read. Well, and that if the second read has an edge we declare the first read to be
+ // junk. That's also a bit of a difference from the previous loop.
+
+ for (uint32 pp=0; pp<AG->getForward(sn->ident).size(); pp++) {
+ BestPlacement &pf = AG->getForward(sn->ident)[pp];
+
+ writeLog("dropDead()-- 2nd read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
+ sn->ident,
+ sn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(sn->ident).size(),
+ pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
+
+ if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident))
+ return(fn->ident);
+
+ if (((sn->position.isForward() == true) && (pf.best5.b_iid != 0) && (pf.best5.b_iid != fn->ident)) ||
+ ((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident)))
+ return(fn->ident);
+ }
+
+ // Otherwise, the second read had only edges to the first read, and we should keep the first
+ // read.
+
+ return(0);
+}
+
+
+
+void
+dropDeadEnds(AssemblyGraph *AG,
+ TigVector &tigs) {
+
+ uint32 numF = 0; // Number of first-read drops
+ uint32 numL = 0; // Number of last-read drops
+ uint32 numB = 0; // Number of both-first-and-last-read drops
+ uint32 numT = 0; // Number of tigs mucked with
+
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *tig = tigs[ti];
+
+ if ((tig == NULL) || // No tig, or don't care.
+ (tig->ufpath.size() <= 1) ||
+ (tig->_isUnassembled == true))
+ continue;
+
+ uint32 fn = dropDeadFirstRead(AG, tig); // Decide if the first read is junk.
+
+ tig->reverseComplement(); // Flip.
+ uint32 ln = dropDeadFirstRead(AG, tig); // Decide if the last (now first) read is junk.
+ tig->reverseComplement(); // Flip back.
+
+ if ((fn == 0) && (ln == 0)) // Nothing to remove, just get out of here.
+ continue;
+
+ // At least one read needs to be kicked out. Make new tigs for everything.
+
+ char fnMsg[80] = {0}; Unitig *fnTig = NULL;
+ char nnMsg[80] = {0}; Unitig *nnTig = NULL; int32 nnOff = INT32_MAX;
+ char lnMsg[80] = {0}; Unitig *lnTig = NULL;
+
+ if (fn > 0)
+ fnTig = tigs.newUnitig(false);
+
+ if (tig->ufpath.size() > (fn > 0) + (ln > 0))
+ nnTig = tigs.newUnitig(false);
+
+ if (ln > 0)
+ lnTig = tigs.newUnitig(false);
+
+ // Count what we do
+
+ numT++;
+
+ if (fnTig) numF++;
+ if (fnTig && lnTig) numB++;
+ if (lnTig) numL++;
+
+ // Move reads to their new unitig.
+
+ strcpy(fnMsg, " ");
+ strcpy(nnMsg, " ");
+ strcpy(lnMsg, "");
+
+ for (uint32 cc=0, tt=0; tt<tig->ufpath.size(); tt++) {
+ ufNode &read = tig->ufpath[tt];
+
+ if (read.ident == fn) {
+ sprintf(fnMsg, "first read %9u to tig %7u --", read.ident, fnTig->id());
+ fnTig->addRead(read, -read.position.min(), false);
+
+ } else if (read.ident == ln) {
+ sprintf(lnMsg, "-- last read %9u to tig %7u", read.ident, lnTig->id());
+ lnTig->addRead(read, -read.position.min(), false);
+
+ } else {
+ if (nnOff == INT32_MAX) {
+ sprintf(nnMsg, "other reads to tig %7u", nnTig->id());
+ nnOff = read.position.min();
+ }
+ nnTig->addRead(read, -nnOff, false);
+ }
+ }
+
+ writeLog("dropDeadEnds()-- tig %7u --> %s %s %s\n", tig->id(), fnMsg, nnMsg, lnMsg);
+
+ if (fnTig) fnTig->cleanUp(); // Probably not needed, but cheap.
+ if (lnTig) lnTig->cleanUp(); // Probably not needed, but cheap.
+ if (nnTig) nnTig->cleanUp(); // Most likely needed.
+
+ // Old tig is now junk.
+
+ delete tigs[ti];
+ tigs[ti] = NULL;
+ }
+
+ writeStatus("dropDeadEnds()-- Modified %u tigs. Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
+ numT, numF, numL, numB, (numB == 1) ? "" : "s");
+}
diff --git a/src/bogart/AS_BAT_MarkRepeatReads.H b/src/bogart/AS_BAT_DropDeadEnds.H
similarity index 69%
copy from src/bogart/AS_BAT_MarkRepeatReads.H
copy to src/bogart/AS_BAT_DropDeadEnds.H
index 9f88900..3f6a0f9 100644
--- a/src/bogart/AS_BAT_MarkRepeatReads.H
+++ b/src/bogart/AS_BAT_DropDeadEnds.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-MAR-11
+ * Brian P. Walenz beginning on 2017-MAY-31
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,17 +23,20 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_MARKREPEATREADS
-#define INCLUDE_AS_BAT_MARKREPEATREADS
+#ifndef AS_BAT_DROPDEADENDS_H
+#define AS_BAT_DROPDEADENDS_H
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_OverlapCache.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
#include "AS_BAT_TigVector.H"
-void
-markRepeatReads(AssemblyGraph *AG,
- TigVector &tigs,
- double deviationRepeat,
- uint32 confusedAbsolute,
- double confusedPercent);
+void
+dropDeadEnds(AssemblyGraph *AG,
+ TigVector &tigs);
-#endif // INCLUDE_AS_BAT_MARKREPEATREADS
+#endif // AS_BAT_DROPDEADENDS_H
diff --git a/src/bogart/AS_BAT_Instrumentation.C b/src/bogart/AS_BAT_Instrumentation.C
index 00c66dc..8d26a22 100644
--- a/src/bogart/AS_BAT_Instrumentation.C
+++ b/src/bogart/AS_BAT_Instrumentation.C
@@ -99,32 +99,161 @@ checkUnitigMembership(TigVector &tigs) {
}
-// Decides if a unitig is unassembled. The other classifications (isBubble, isCircular, isRepeat)
-// are made when the type is processed (e.g., when bubbles are popped).
-//
-// A unitig is unassembled if:
-// 1) it has fewer than R reads (R=2)
-// 2) it is shorter than S bases (S=1000)
-// 3) a single read spans at least fraction F of the lenth (F=1.0)
-// 4) at least fraction F of the unitig is below read depth D (F=1.0, D=2)
-//
+
+
+
+// Rule S. Singleton.
+bool
+classifyRuleS(Unitig *utg, FILE *UNUSED(F), uint32 &num, uint64 &len) {
+
+ if (utg->ufpath.size() > 1)
+ return(false);
+
+ //fprintf(F, "unitig " F_U32 " (%s) unassembled - singleton\n", utg->id(),
+ // (utg->_isRepeat) ? "repeat" : "normal");
+
+ num += 1;
+ len += utg->getLength();
+
+ return(true);
+}
+
+
+
+// Rule 1. Too few reads.
+bool
+classifyRule1(Unitig *utg, FILE *F, uint32 &num, uint64 &len, uint32 fewReadsNumber) {
+
+ if (utg->ufpath.size() == 1)
+ return(false);
+ if (utg->ufpath.size() >= fewReadsNumber)
+ return(false);
+
+ fprintf(F, "unitig " F_U32 " (%s) unassembled - too few reads (" F_U64 " < " F_U32 ")\n",
+ utg->id(), (utg->_isRepeat) ? "repeat" : "normal",
+ utg->ufpath.size(), fewReadsNumber);
+
+ num += 1;
+ len += utg->getLength();
+
+ return(true);
+}
+
+
+
+// Rule 2. Short.
+bool
+classifyRule2(Unitig *utg, FILE *F, uint32 &num, uint64 &len, uint32 tooShortLength) {
+
+ if (utg->ufpath.size() == 1)
+ return(false);
+ if (utg->getLength() >= tooShortLength)
+ return(false);
+
+ if (utg->ufpath.size() > 1)
+ fprintf(F, "unitig " F_U32 " (%s) unassembled - too short (" F_U32 " < " F_U32 ")\n",
+ utg->id(), (utg->_isRepeat) ? "repeat" : "normal",
+ utg->getLength(), tooShortLength);
+
+ num += 1;
+ len += utg->getLength();
+
+ return(true);
+}
+
+
+
+// Rule 3. Single read spans large fraction of tig.
+bool
+classifyRule3(Unitig *utg, FILE *F, uint32 &num, uint64 &len, double spanFraction) {
+
+ if (utg->ufpath.size() == 1)
+ return(false);
+
+ for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
+ ufNode *frg = &utg->ufpath[oi];
+
+ int frgbgn = MIN(frg->position.bgn, frg->position.end);
+ int frgend = MAX(frg->position.bgn, frg->position.end);
+
+ if (frgend - frgbgn > utg->getLength() * spanFraction) {
+ if (utg->ufpath.size() > 1)
+ fprintf(F, "unitig " F_U32 " (%s) unassembled - single read spans unitig (read " F_U32 " " F_U32 "-" F_U32 " spans fraction %f > %f\n",
+ utg->id(), (utg->_isRepeat) ? "repeat" : "normal",
+ frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction);
+ num += 1;
+ len += utg->getLength();
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+
+
+// Rule 4. Low coverage.
+bool
+classifyRule4(Unitig *utg, FILE *F, uint32 &num, uint64 &len, double lowcovFraction, uint32 lowcovDepth) {
+
+ if (utg->ufpath.size() == 1)
+ return(false);
+
+ intervalList<int32> IL;
+
+ for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
+ ufNode *frg = &utg->ufpath[oi];
+
+ int frgbgn = MIN(frg->position.bgn, frg->position.end);
+ int frgend = MAX(frg->position.bgn, frg->position.end);
+
+ IL.add(frgbgn, frgend - frgbgn);
+ }
+
+ intervalList<int32> ID(IL);
+
+ uint32 basesLow = 0;
+ uint32 basesHigh = 0;
+
+ for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++)
+ if (ID.depth(ii) < lowcovDepth)
+ basesLow += ID.hi(ii) - ID.lo(ii) + 1;
+ else
+ basesHigh += ID.hi(ii) - ID.lo(ii) + 1;
+
+ assert(basesLow + basesHigh > 0);
+
+ double lowcov = (double)basesLow / (basesLow + basesHigh);
+
+ if (lowcov < lowcovFraction)
+ return(false);
+
+ if (utg->ufpath.size() > 1)
+ fprintf(F, "Unitig " F_U32 " (%s) unassembled - low coverage (%.2f%% of unitig at < " F_U32 "x coverage, allowed %.2f%%)\n",
+ utg->id(), (utg->_isRepeat) ? "repeat" : "normal",
+ 100.0 * lowcov, lowcovDepth, 100.0 * lowcovFraction);
+
+ num += 1;
+ len += utg->getLength();
+
+ return(true);
+}
+
+
+
void
classifyTigsAsUnassembled(TigVector &tigs,
uint32 fewReadsNumber,
uint32 tooShortLength,
double spanFraction,
double lowcovFraction, uint32 lowcovDepth) {
- uint32 nTooFew = 0;
- uint32 nShort = 0;
- uint32 nSingle = 0;
- uint32 nCoverage = 0;
- uint32 nContig = 0;
-
- uint64 bTooFew = 0;
- uint64 bShort = 0;
- uint64 bSingle = 0;
- uint64 bCoverage = 0;
- uint64 bContig = 0;
+ uint32 nSingleton = 0; uint64 bSingleton = 0;
+ uint32 nTooFew = 0; uint64 bTooFew = 0;
+ uint32 nShort = 0; uint64 bShort = 0;
+ uint32 nSingleSpan = 0; uint64 bSingleSpan = 0;
+ uint32 nCoverage = 0; uint64 bCoverage = 0;
+ uint32 nContig = 0; uint64 bContig = 0;
char N[FILENAME_MAX];
@@ -135,105 +264,59 @@ classifyTigsAsUnassembled(TigVector &tigs,
if (errno)
F = NULL;
+ if (F) {
+ fprintf(F, "# Contigs flagged as unassembled.\n");
+ fprintf(F, "#\n");
+ fprintf(F, "# fewReadsNumber %u (singletons always removed and not logged)\n", fewReadsNumber);
+ fprintf(F, "# tooShortLength %u\n", tooShortLength);
+ fprintf(F, "# spanFraction %f\n", spanFraction);
+ fprintf(F, "# lowcovFraction %f\n", lowcovFraction);
+ fprintf(F, "# lowcovDepth %u\n", lowcovDepth);
+ fprintf(F, "#\n");
+ }
+
for (uint32 ti=0; ti<tigs.size(); ti++) {
Unitig *utg = tigs[ti];
if (utg == NULL)
continue;
- utg->_isUnassembled = false;
-
- // Rule 1. Too few reads.
-
- if (utg->ufpath.size() < fewReadsNumber) {
- fprintf(F, "unitig " F_U32 " unassembled - too few reads (" F_U64 " < " F_U32 ")\n", ti, utg->ufpath.size(), fewReadsNumber);
- utg->_isUnassembled = true;
- nTooFew += 1;
- bTooFew += utg->getLength();
- continue;
- }
-
- // Rule 2. Short.
+ // Decide that we're junk first.
- if (utg->getLength() < tooShortLength) {
- fprintf(F, "unitig " F_U32 " unassembled - too short (" F_U32 " < " F_U32 ")\n", ti, utg->getLength(), tooShortLength);
- utg->_isUnassembled = true;
- nShort += 1;
- bShort += utg->getLength();
- continue;
- }
+ utg->_isUnassembled = true;
- // Rule 3. Single read spans large fraction of tig.
+ // Check the tig.
- for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
- ufNode *frg = &utg->ufpath[oi];
+ bool rr = (utg->_isRepeat == true);
+ bool rs = classifyRuleS(utg, F, nSingleton, bSingleton);
+ bool r1 = classifyRule1(utg, F, nTooFew, bTooFew, fewReadsNumber);
+ bool r2 = classifyRule2(utg, F, nShort, bShort, tooShortLength);
+ bool r3 = classifyRule3(utg, F, nSingleSpan, bSingleSpan, spanFraction);
+ bool r4 = classifyRule4(utg, F, nCoverage, bCoverage, lowcovFraction, lowcovDepth);
- int frgbgn = MIN(frg->position.bgn, frg->position.end);
- int frgend = MAX(frg->position.bgn, frg->position.end);
+ // If flagged, we're done, just move on.
- if (frgend - frgbgn > utg->getLength() * spanFraction) {
- fprintf(F, "unitig " F_U32 " unassembled - single read spans unitig (read " F_U32 " " F_U32 "-" F_U32 " spans fraction %f > %f\n",
- ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction);
- utg->_isUnassembled = true;
- nSingle += 1;
- bSingle += utg->getLength();
- break;
- }
- }
- if (utg->_isUnassembled)
- continue;
-
- // Rule 4. Low coverage.
-
- intervalList<int32> IL;
-
- for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
- ufNode *frg = &utg->ufpath[oi];
-
- int frgbgn = MIN(frg->position.bgn, frg->position.end);
- int frgend = MAX(frg->position.bgn, frg->position.end);
-
- IL.add(frgbgn, frgend - frgbgn);
- }
-
- intervalList<int32> ID(IL);
-
- uint32 basesLow = 0;
- uint32 basesHigh = 0;
-
- for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++)
- if (ID.depth(ii) < lowcovDepth)
- basesLow += ID.hi(ii) - ID.lo(ii) + 1;
- else
- basesHigh += ID.hi(ii) - ID.lo(ii) + 1;
-
- assert(basesLow + basesHigh > 0);
-
- double lowcov = (double)basesLow / (basesLow + basesHigh);
-
- if (lowcov >= lowcovFraction) {
- fprintf(F, "Unitig " F_U32 " unassembled - low coverage (%.4f > %.4f at < " F_U32 "x coverage)\n",
- ti, lowcov, lowcovFraction, lowcovDepth);
- utg->_isUnassembled = true;
- nCoverage += 1;
- bCoverage += utg->getLength();
+ if ((rr == false) && (rs || r1 || r2 || r3 || r4))
continue;
- }
// Otherwise, unitig is assembled!
nContig += 1;
bContig += utg->getLength();
+
+ utg->_isUnassembled = false;
}
if (F)
fclose(F);
- writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n", nTooFew, bTooFew);
- writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n", nShort, bShort);
- writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle, bSingle);
- writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n", nCoverage, bCoverage);
- writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- singleton\n", nSingleton, bSingleton, fewReadsNumber);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads (< %u reads)\n", nTooFew, bTooFew, fewReadsNumber);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short (< %u bp)\n", nShort, bShort, tooShortLength);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read (> %f tig length)\n", nSingleSpan, bSingleSpan, spanFraction);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage (> %f tig length at < %u coverage)\n", nCoverage, bCoverage, lowcovFraction, lowcovDepth);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig);
+ writeStatus("\n");
}
@@ -281,14 +364,12 @@ reportN50(FILE *F, vector<uint32> &data, char const *label, uint64 genomeSize) {
void
-reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeSize) {
+reportTigs(TigVector &tigs, const char *UNUSED(prefix), const char *UNUSED(name), uint64 genomeSize) {
// Generate n50. Assumes tigs have been 'classified' already.
vector<uint32> unassembledLength;
- vector<uint32> bubbleLength;
vector<uint32> repeatLength;
- vector<uint32> circularLength;
vector<uint32> contigLength;
for (uint32 ti=0; ti<tigs.size(); ti++) {
@@ -301,18 +382,10 @@ reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeS
unassembledLength.push_back(utg->getLength());
}
- else if (utg->_isBubble) {
- bubbleLength.push_back(utg->getLength());
- }
-
else if (utg->_isRepeat) {
repeatLength.push_back(utg->getLength());
}
- else if (utg->_isCircular) {
- circularLength.push_back(utg->getLength());
- }
-
else {
contigLength.push_back(utg->getLength());
}
@@ -326,9 +399,7 @@ reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeS
FILE *F = fopen(N, "w");
if (errno == 0) {
reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize);
- reportN50(F, bubbleLength, "BUBBLE", genomeSize);
reportN50(F, repeatLength, "REPEAT", genomeSize);
- reportN50(F, circularLength, "CIRCULAR", genomeSize);
reportN50(F, contigLength, "CONTIGS", genomeSize);
fclose(F);
@@ -349,27 +420,26 @@ reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeS
#define tCTG 0 // To a read in a normal tig
#define tRPT 1 // To a read in a repeat tig
-#define tBUB 2 // To a read in a bubble tig
-#define tUNA 3 // To a read in an 'unassembled' leftover tig
-#define tUNU 4 // To a read not placed in a tig
-#define tNOP 5 // To no read (for best edges)
+#define tUNA 2 // To a read in an 'unassembled' leftover tig
+#define tUNU 3 // To a read not placed in a tig
+#define tNOP 4 // To no read (for best edges)
struct olapsUsed {
uint64 total;
// By definition, satisfied overlaps are in the same tig.
- uint64 doveSatSame[6];
- uint64 contSatSame[6];
+ uint64 doveSatSame[5];
+ uint64 contSatSame[5];
// Unsatisfied overlaps can be in the same tig...
- uint64 doveUnsatSame[6];
- uint64 contUnsatSame[6];
+ uint64 doveUnsatSame[5];
+ uint64 contUnsatSame[5];
// ...or can be between tigs.
- uint64 doveUnsatDiff[6][6];
- uint64 contUnsatDiff[6][6];
+ uint64 doveUnsatDiff[5][5];
+ uint64 contUnsatDiff[5][5];
};
@@ -378,14 +448,14 @@ uint32
getTigType(Unitig *tg) {
if (tg == NULL) return(tUNU);
if (tg->_isUnassembled) return(tUNA);
- if (tg->_isBubble) return(tBUB);
if (tg->_isRepeat) return(tRPT);
if (1) return(tCTG);
}
bool
-satisfiedOverlap(uint32 rdAlo, uint32 rdAhi, bool rdAfwd, uint32 rdBlo, uint32 rdBhi, bool rdBfwd, bool flipped) {
+satisfiedOverlap(uint32 UNUSED(rdAlo), uint32 rdAhi, bool rdAfwd,
+ uint32 rdBlo, uint32 rdBhi, bool rdBfwd, bool flipped) {
return(((rdAhi < rdBlo) || (rdBhi < rdBlo)) || // Not satisfied, no overlap
((rdAfwd == rdBfwd) && (flipped == true)) || // Not satisfied, same orient, but flipped overlap
((rdAfwd != rdBfwd) && (flipped == false))); // Not satisfied, diff orient, but normal overlap
@@ -395,7 +465,7 @@ satisfiedOverlap(uint32 rdAlo, uint32 rdAhi, bool rdAfwd, uint32 rdBlo, uint32 r
// Iterate over all overlaps (but the only interface we have is by iterating
// over all reads), and count the number of overlaps satisfied in tigs.
void
-reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
+reportOverlaps(TigVector &tigs, const char *UNUSED(prefix), const char *UNUSED(name)) {
olapsUsed *dd = new olapsUsed; // Dovetail overlaps to non-contained reads
olapsUsed *dc = new olapsUsed; // Dovetail overlaps to contained reads
olapsUsed *cc = new olapsUsed; // Containment overlaps
@@ -591,8 +661,8 @@ reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
// Merge the symmetrical counts
- for (uint32 ii=0; ii<6; ii++) {
- for (uint32 jj=ii+1; jj<6; jj++) {
+ for (uint32 ii=0; ii<5; ii++) {
+ for (uint32 jj=ii+1; jj<5; jj++) {
bb->doveUnsatDiff[ii][jj] += bb->doveUnsatDiff[jj][ii]; bb->doveUnsatDiff[jj][ii] = UINT64_MAX;
dd->doveUnsatDiff[ii][jj] += dd->doveUnsatDiff[jj][ii]; dd->doveUnsatDiff[jj][ii] = UINT64_MAX;
dc->doveUnsatDiff[ii][jj] += dc->doveUnsatDiff[jj][ii]; dc->doveUnsatDiff[jj][ii] = UINT64_MAX;
@@ -637,13 +707,11 @@ reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
fprintf(F, "--------- ------------ -------\n");
fprintf(F, "same-contig %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tCTG], B(bb->doveSatSame[tCTG]));
fprintf(F, "same-repeat %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tRPT], B(bb->doveSatSame[tRPT]));
- fprintf(F, "same-bubble %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tBUB], B(bb->doveSatSame[tBUB]));
fprintf(F, "\n");
fprintf(F, "UNSATISFIED best edges DOVETAIL\n");
fprintf(F, "----------- ------------ -------\n");
fprintf(F, "same-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tCTG], B(bb->doveUnsatSame[tCTG]));
fprintf(F, "same-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tRPT], B(bb->doveUnsatSame[tRPT]));
- fprintf(F, "same-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tBUB], B(bb->doveUnsatSame[tBUB]));
fprintf(F, "same-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tUNA], B(bb->doveUnsatSame[tUNA]));
fprintf(F, "same-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tUNU], B(bb->doveUnsatSame[tUNU]));
fprintf(F, "\n");
@@ -651,35 +719,24 @@ reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
fprintf(F, "----------- ------------ -------\n");
fprintf(F, "contig-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tCTG], B(bb->doveUnsatDiff[tCTG][tCTG]));
fprintf(F, "contig-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tRPT], B(bb->doveUnsatDiff[tCTG][tRPT]));
- fprintf(F, "contig-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tBUB], B(bb->doveUnsatDiff[tCTG][tBUB]));
fprintf(F, "contig-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNA], B(bb->doveUnsatDiff[tCTG][tUNA]));
fprintf(F, "contig-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNU], B(bb->doveUnsatDiff[tCTG][tUNU]));
fprintf(F, "contig-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tNOP], B(bb->doveUnsatDiff[tCTG][tNOP]));
fprintf(F, "\n");
//fprintf(F, "repeat-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tCTG], B(bb->doveUnsatDiff[tRPT][tCTG]));
fprintf(F, "repeat-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tRPT], B(bb->doveUnsatDiff[tRPT][tRPT]));
- fprintf(F, "repeat-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tBUB], B(bb->doveUnsatDiff[tRPT][tBUB]));
fprintf(F, "repeat-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNA], B(bb->doveUnsatDiff[tRPT][tUNA]));
fprintf(F, "repeat-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNU], B(bb->doveUnsatDiff[tRPT][tUNU]));
fprintf(F, "repeat-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tNOP], B(bb->doveUnsatDiff[tRPT][tNOP]));
fprintf(F, "\n");
-//fprintf(F, "bubble-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tCTG], B(bb->doveUnsatDiff[tBUB][tCTG]));
-//fprintf(F, "bubble-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tRPT], B(bb->doveUnsatDiff[tBUB][tRPT]));
- fprintf(F, "bubble-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tBUB], B(bb->doveUnsatDiff[tBUB][tBUB]));
- fprintf(F, "bubble-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNA], B(bb->doveUnsatDiff[tBUB][tUNA]));
- fprintf(F, "bubble-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNU], B(bb->doveUnsatDiff[tBUB][tUNU]));
- fprintf(F, "bubble-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tNOP], B(bb->doveUnsatDiff[tBUB][tNOP]));
- fprintf(F, "\n");
//fprintf(F, "unassembled-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tCTG], B(bb->doveUnsatDiff[tUNA][tCTG]));
//fprintf(F, "unassembled-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tRPT], B(bb->doveUnsatDiff[tUNA][tRPT]));
-//fprintf(F, "unassembled-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tBUB], B(bb->doveUnsatDiff[tUNA][tBUB]));
fprintf(F, "unassembled-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNA], B(bb->doveUnsatDiff[tUNA][tUNA]));
fprintf(F, "unassembled-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNU], B(bb->doveUnsatDiff[tUNA][tUNU]));
fprintf(F, "unassembled-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tNOP], B(bb->doveUnsatDiff[tUNA][tNOP]));
fprintf(F, "\n");
//fprintf(F, "unused-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tCTG], B(bb->doveUnsatDiff[tUNU][tCTG]))
//fprintf(F, "unused-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tRPT], B(bb->doveUnsatDiff[tUNU][tRPT]));
-//fprintf(F, "unused-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tBUB], B(bb->doveUnsatDiff[tUNU][tBUB]));
//fprintf(F, "unused-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNA], B(bb->doveUnsatDiff[tUNU][tUNA]));
fprintf(F, "unused-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNU], B(bb->doveUnsatDiff[tUNU][tUNU]));
fprintf(F, "unused-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tNOP], B(bb->doveUnsatDiff[tUNU][tNOP]));
@@ -692,13 +749,11 @@ reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
fprintf(F, "--------- ------------ ------- ------------ ------- ------------ -------\n");
fprintf(F, "same-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tCTG], P(dd->doveSatSame[tCTG]), dc->doveSatSame[tCTG], Q(dc->doveSatSame[tCTG]), cc->contSatSame[tCTG], R(cc->contSatSame[tCTG]));
fprintf(F, "same-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tRPT], P(dd->doveSatSame[tRPT]), dc->doveSatSame[tRPT], Q(dc->doveSatSame[tRPT]), cc->contSatSame[tRPT], R(cc->contSatSame[tRPT]));
- fprintf(F, "same-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tBUB], P(dd->doveSatSame[tBUB]), dc->doveSatSame[tBUB], Q(dc->doveSatSame[tBUB]), cc->contSatSame[tBUB], R(cc->contSatSame[tBUB]));
fprintf(F, "\n");
fprintf(F, "UNSATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
fprintf(F, "----------- ------------ ------- ------------ ------- ------------ -------\n");
fprintf(F, "same-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tCTG], P(dd->doveUnsatSame[tCTG]), dc->doveUnsatSame[tCTG], Q(dc->doveUnsatSame[tCTG]), cc->contUnsatSame[tCTG], R(cc->contUnsatSame[tCTG]));
fprintf(F, "same-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tRPT], P(dd->doveUnsatSame[tRPT]), dc->doveUnsatSame[tRPT], Q(dc->doveUnsatSame[tRPT]), cc->contUnsatSame[tRPT], R(cc->contUnsatSame[tRPT]));
- fprintf(F, "same-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tBUB], P(dd->doveUnsatSame[tBUB]), dc->doveUnsatSame[tBUB], Q(dc->doveUnsatSame[tBUB]), cc->contUnsatSame[tBUB], R(cc->contUnsatSame[tBUB]));
fprintf(F, "same-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tUNA], P(dd->doveUnsatSame[tUNA]), dc->doveUnsatSame[tUNA], Q(dc->doveUnsatSame[tUNA]), cc->contUnsatSame[tUNA], R(cc->contUnsatSame[tUNA]));
fprintf(F, "same-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tUNU], P(dd->doveUnsatSame[tUNU]), dc->doveUnsatSame[tUNU], Q(dc->doveUnsatSame[tUNU]), cc->contUnsatSame[tUNU], R(cc->contUnsatSame[tUNU]));
fprintf(F, "\n");
@@ -706,31 +761,21 @@ reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
fprintf(F, "----------- ------------ ------- ------------ ------- ------------ -------\n");
fprintf(F, "contig-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tCTG], P(dd->doveUnsatDiff[tCTG][tCTG]), dc->doveUnsatDiff[tCTG][tCTG], Q(dc->doveUnsatDiff[tCTG][tCTG]), cc->contUnsatDiff[tCTG][tCTG], R(cc->contUnsatDiff[tCTG][tCTG]));
fprintf(F, "contig-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tRPT], P(dd->doveUnsatDiff[tCTG][tRPT]), dc->doveUnsatDiff[tCTG][tRPT], Q(dc->doveUnsatDiff[tCTG][tRPT]), cc->contUnsatDiff[tCTG][tRPT], R(cc->contUnsatDiff[tCTG][tRPT]));
- fprintf(F, "contig-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tBUB], P(dd->doveUnsatDiff[tCTG][tBUB]), dc->doveUnsatDiff[tCTG][tBUB], Q(dc->doveUnsatDiff[tCTG][tBUB]), cc->contUnsatDiff[tCTG][tBUB], R(cc->contUnsatDiff[tCTG][tBUB]));
fprintf(F, "contig-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNA], P(dd->doveUnsatDiff[tCTG][tUNA]), dc->doveUnsatDiff[tCTG][tUNA], Q(dc->doveUnsatDiff[tCTG][tUNA]), cc->contUnsatDiff[tCTG][tUNA], R(cc->contUnsatDiff[tCTG][tUNA]));
fprintf(F, "contig-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNU], P(dd->doveUnsatDiff[tCTG][tUNU]), dc->doveUnsatDiff[tCTG][tUNU], Q(dc->doveUnsatDiff[tCTG][tUNU]), cc->contUnsatDiff[tCTG][tUNU], R(cc->contUnsatDiff[tCTG][tUNU]));
fprintf(F, "\n");
//fprintf(F, "repeat-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tCTG], P(dd->doveUnsatDiff[tRPT][tCTG]), dc->doveUnsatDiff[tRPT][tCTG], Q(dc->doveUnsatDiff[tRPT][tCTG]), cc->contUnsatDiff[tRPT][tCTG], R(cc->contUnsatDiff[tRPT][tCTG]));
fprintf(F, "repeat-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tRPT], P(dd->doveUnsatDiff[tRPT][tRPT]), dc->doveUnsatDiff[tRPT][tRPT], Q(dc->doveUnsatDiff[tRPT][tRPT]), cc->contUnsatDiff[tRPT][tRPT], R(cc->contUnsatDiff[tRPT][tRPT]));
- fprintf(F, "repeat-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tBUB], P(dd->doveUnsatDiff[tRPT][tBUB]), dc->doveUnsatDiff[tRPT][tBUB], Q(dc->doveUnsatDiff[tRPT][tBUB]), cc->contUnsatDiff[tRPT][tBUB], R(cc->contUnsatDiff[tRPT][tBUB]));
fprintf(F, "repeat-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNA], P(dd->doveUnsatDiff[tRPT][tUNA]), dc->doveUnsatDiff[tRPT][tUNA], Q(dc->doveUnsatDiff[tRPT][tUNA]), cc->contUnsatDiff[tRPT][tUNA], R(cc->contUnsatDiff[tRPT][tUNA]));
fprintf(F, "repeat-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNU], P(dd->doveUnsatDiff[tRPT][tUNU]), dc->doveUnsatDiff[tRPT][tUNU], Q(dc->doveUnsatDiff[tRPT][tUNU]), cc->contUnsatDiff[tRPT][tUNU], R(cc->contUnsatDiff[tRPT][tUNU]));
fprintf(F, "\n");
-//fprintf(F, "bubble-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tCTG], P(dd->doveUnsatDiff[tBUB][tCTG]), dc->doveUnsatDiff[tBUB][tCTG], Q(dc->doveUnsatDiff[tBUB][tCTG]), cc->contUnsatDiff[tBUB][tCTG], R(cc->contUnsatDiff[tBUB][tCTG]));
-//fprintf(F, "bubble-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tRPT], P(dd->doveUnsatDiff[tBUB][tRPT]), dc->doveUnsatDiff[tBUB][tRPT], Q(dc->doveUnsatDiff[tBUB][tRPT]), cc->contUnsatDiff[tBUB][tRPT], R(cc->contUnsatDiff[tBUB][tRPT]));
- fprintf(F, "bubble-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tBUB], P(dd->doveUnsatDiff[tBUB][tBUB]), dc->doveUnsatDiff[tBUB][tBUB], Q(dc->doveUnsatDiff[tBUB][tBUB]), cc->contUnsatDiff[tBUB][tBUB], R(cc->contUnsatDiff[tBUB][tBUB]));
- fprintf(F, "bubble-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNA], P(dd->doveUnsatDiff[tBUB][tUNA]), dc->doveUnsatDiff[tBUB][tUNA], Q(dc->doveUnsatDiff[tBUB][tUNA]), cc->contUnsatDiff[tBUB][tUNA], R(cc->contUnsatDiff[tBUB][tUNA]));
- fprintf(F, "bubble-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNU], P(dd->doveUnsatDiff[tBUB][tUNU]), dc->doveUnsatDiff[tBUB][tUNU], Q(dc->doveUnsatDiff[tBUB][tUNU]), cc->contUnsatDiff[tBUB][tUNU], R(cc->contUnsatDiff[tBUB][tUNU]));
- fprintf(F, "\n");
//fprintf(F, "unassembled-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tCTG], P(dd->doveUnsatDiff[tUNA][tCTG]), dc->doveUnsatDiff[tUNA][tCTG], Q(dc->doveUnsatDiff[tUNA][tCTG]), cc->contUnsatDiff[tUNA][tCTG], R(cc->contUnsatDiff[tUNA][tCTG]));
//fprintf(F, "unassembled-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tRPT], P(dd->doveUnsatDiff[tUNA][tRPT]), dc->doveUnsatDiff[tUNA][tRPT], Q(dc->doveUnsatDiff[tUNA][tRPT]), cc->contUnsatDiff[tUNA][tRPT], R(cc->contUnsatDiff[tUNA][tRPT]));
-//fprintf(F, "unassembled-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tBUB], P(dd->doveUnsatDiff[tUNA][tBUB]), dc->doveUnsatDiff[tUNA][tBUB], Q(dc->doveUnsatDiff[tUNA][tBUB]), cc->contUnsatDiff[tUNA][tBUB], R(cc->contUnsatDiff[tUNA][tBUB]));
fprintf(F, "unassembled-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNA], P(dd->doveUnsatDiff[tUNA][tUNA]), dc->doveUnsatDiff[tUNA][tUNA], Q(dc->doveUnsatDiff[tUNA][tUNA]), cc->contUnsatDiff[tUNA][tUNA], R(cc->contUnsatDiff[tUNA][tUNA]));
fprintf(F, "unassembled-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNU], P(dd->doveUnsatDiff[tUNA][tUNU]), dc->doveUnsatDiff[tUNA][tUNU], Q(dc->doveUnsatDiff[tUNA][tUNU]), cc->contUnsatDiff[tUNA][tUNU], R(cc->contUnsatDiff[tUNA][tUNU]));
fprintf(F, "\n");
//fprintf(F, "unused-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tCTG], P(dd->doveUnsatDiff[tUNU][tCTG]), dc->doveUnsatDiff[tUNU][tCTG], Q(dc->doveUnsatDiff[tUNU][tCTG]), cc->contUnsatDiff[tUNU][tCTG], R(cc->contUnsatDiff[tUNU][tCTG]));
//fprintf(F, "unused-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tRPT], P(dd->doveUnsatDiff[tUNU][tRPT]), dc->doveUnsatDiff[tUNU][tRPT], Q(dc->doveUnsatDiff[tUNU][tRPT]), cc->contUnsatDiff[tUNU][tRPT], R(cc->contUnsatDiff[tUNU][tRPT]));
-//fprintf(F, "unused-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tBUB], P(dd->doveUnsatDiff[tUNU][tBUB]), dc->doveUnsatDiff[tUNU][tBUB], Q(dc->doveUnsatDiff[tUNU][tBUB]), cc->contUnsatDiff[tUNU][tBUB], R(cc->contUnsatDiff[tUNU][tBUB]));
//fprintf(F, "unused-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNA], P(dd->doveUnsatDiff[tUNU][tUNA]), dc->doveUnsatDiff[tUNU][tUNA], Q(dc->doveUnsatDiff[tUNU][tUNA]), cc->contUnsatDiff[tUNU][tUNA], R(cc->contUnsatDiff[tUNU][tUNA]));
fprintf(F, "unused-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNU], P(dd->doveUnsatDiff[tUNU][tUNU]), dc->doveUnsatDiff[tUNU][tUNU], Q(dc->doveUnsatDiff[tUNU][tUNU]), cc->contUnsatDiff[tUNU][tUNU], R(cc->contUnsatDiff[tUNU][tUNU]));
fprintf(F, "\n");
diff --git a/src/bogart/AS_BAT_Logging.C b/src/bogart/AS_BAT_Logging.C
index cfa2acf..443329c 100644
--- a/src/bogart/AS_BAT_Logging.C
+++ b/src/bogart/AS_BAT_Logging.C
@@ -129,7 +129,7 @@ uint64 LOG_ERROR_PROFILES = 0x0000000000000004;
uint64 LOG_CHUNK_GRAPH = 0x0000000000000008; // Report the chunk graph as we build it
uint64 LOG_BUILD_UNITIG = 0x0000000000000010; // Report building of initial tigs (both unitig creation and read placement)
uint64 LOG_PLACE_UNPLACED = 0x0000000000000020; // Report placing of unplaced reads
-uint64 LOG_BUBBLE_DETAIL = 0x0000000000000040;
+uint64 LOG_ORPHAN_DETAIL = 0x0000000000000040;
uint64 LOG_SPLIT_DISCONTINUOUS = 0x0000000000000080; //
uint64 LOG_INTERMEDIATE_TIGS = 0x0000000000000100; // At various spots, dump the current tigs
uint64 LOG_SET_PARENT_AND_HANG = 0x0000000000000200; //
@@ -143,7 +143,7 @@ char const *logFileFlagNames[64] = { "overlapScoring",
"chunkGraph",
"buildUnitig",
"placeUnplaced",
- "bubbles",
+ "orphans",
"splitDiscontinuous", // Update made it to here, need repeats
"intermediateTigs",
"setParentAndHang",
@@ -202,8 +202,6 @@ getLogFilePrefix(void) {
void
writeStatus(char const *fmt, ...) {
va_list ap;
- int32 nt = omp_get_num_threads();
- int32 tn = omp_get_thread_num();
va_start(ap, fmt);
diff --git a/src/bogart/AS_BAT_Logging.H b/src/bogart/AS_BAT_Logging.H
index ef51f35..7ea6656 100644
--- a/src/bogart/AS_BAT_Logging.H
+++ b/src/bogart/AS_BAT_Logging.H
@@ -73,7 +73,7 @@ extern uint64 LOG_ERROR_PROFILES;
extern uint64 LOG_CHUNK_GRAPH;
extern uint64 LOG_BUILD_UNITIG;
extern uint64 LOG_PLACE_UNPLACED;
-extern uint64 LOG_BUBBLE_DETAIL;
+extern uint64 LOG_ORPHAN_DETAIL;
extern uint64 LOG_SPLIT_DISCONTINUOUS;
extern uint64 LOG_INTERMEDIATE_TIGS;
extern uint64 LOG_SET_PARENT_AND_HANG;
diff --git a/src/bogart/AS_BAT_MarkRepeatReads.C b/src/bogart/AS_BAT_MarkRepeatReads.C
index 6c70c03..f92299a 100644
--- a/src/bogart/AS_BAT_MarkRepeatReads.C
+++ b/src/bogart/AS_BAT_MarkRepeatReads.C
@@ -31,6 +31,8 @@
#include "AS_BAT_Unitig.H"
+#include "AS_BAT_MarkRepeatReads.H"
+
#include "intervalList.H"
#include "stddev.H"
@@ -79,6 +81,7 @@ public:
};
+
bool
olapDatByEviRid(const olapDat &A, const olapDat &B) {
if (A.eviRid == B.eviRid)
@@ -93,9 +96,9 @@ olapDatByEviRid(const olapDat &A, const olapDat &B) {
class breakPointCoords {
public:
breakPointCoords(int32 bgn, int32 end, bool rpt=false) {
- _bgn = bgn;
- _end = end;
- _isRepeat = rpt;
+ _bgn = bgn;
+ _end = end;
+ _rpt = rpt;
};
~breakPointCoords() {
};
@@ -106,7 +109,7 @@ public:
int32 _bgn;
int32 _end;
- bool _isRepeat;
+ bool _rpt;
};
@@ -207,7 +210,7 @@ splitTig(TigVector &tigs,
for (uint32 ii=0; ii<BP.size(); ii++) {
int32 rgnbgn = BP[ii]._bgn;
int32 rgnend = BP[ii]._end;
- bool repeat = BP[ii]._isRepeat;
+ bool repeat = BP[ii]._rpt;
// For repeats, the read must be contained fully.
@@ -229,7 +232,7 @@ splitTig(TigVector &tigs,
if (rid == UINT32_MAX) {
fprintf(stderr, "Failed to place read %u at %d-%d\n", frg.ident, frgbgn, frgend);
for (uint32 ii=0; ii<BP.size(); ii++)
- fprintf(stderr, "BP[%3u] at %8u-%8u repeat %u\n", ii, BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat);
+ fprintf(stderr, "BP[%3u] at %8u-%8u repeat %u\n", ii, BP[ii]._bgn, BP[ii]._end, BP[ii]._rpt);
flushLog();
}
assert(rid != UINT32_MAX); // We searched all the BP's, the read had better be placed!
@@ -532,12 +535,19 @@ findConfusedEdges(TigVector &tigs,
Unitig *tig,
intervalList<int32> &tigMarksR,
double confusedAbsolute,
- double confusedPercent) {
+ double confusedPercent,
+ vector<confusedEdge> &confusedEdges) {
uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()];
memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals());
+ // Examine every read in this tig. If the read intersects a marked repeat, find the best edge
+ // that continues the tig in either direction. If those reads are in the repeat region, scan all
+ // the overlaps of this read for any that are of comparable length. If any are found, declare
+ // this repeat to be potentially confused. If none are found - for the whole repeat region -
+ // then we can leave the repeat alone.
+
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *rdA = &tig->ufpath[fi];
uint32 rdAid = rdA->ident;
@@ -547,8 +557,8 @@ findConfusedEdges(TigVector &tigs,
double sc = (rdAhi - rdAlo) / (double)RI->readLength(rdAid);
- if ((OG->isContained(rdAid) == true) ||
- (OG->isSuspicious(rdAid) == true))
+ if ((OG->isContained(rdAid) == true) || // Don't care about contained or suspicious
+ (OG->isSuspicious(rdAid) == true)) // reads; we'll use the container instead.
continue;
for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
@@ -697,11 +707,6 @@ findConfusedEdges(TigVector &tigs,
(tigs[tgBid]->ufpath.size() == 1))
continue;
- // If the read is in an annotated bubble, skip.
- if ((tigs[tgBid]->_isBubble == true) &&
- (tigs[tgBid]->_isRepeat == false))
- continue;
-
// Skip if this overlap is the best we're trying to match.
if ((rdBid == b5->readId()) ||
(rdBid == b3->readId()))
@@ -782,7 +787,7 @@ findConfusedEdges(TigVector &tigs,
// Potential confusion!
- if (ovl5 == true)
+ if (ovl5 == true) {
writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
tig->id(), rdAid, rdAlo, rdAhi,
rdBid,
@@ -790,7 +795,10 @@ findConfusedEdges(TigVector &tigs,
len, ovl[oo].erate(), score,
ad5, pd5);
- if (ovl3 == true)
+ confusedEdges.push_back(confusedEdge(rdAid, false, rdBid));
+ }
+
+ if (ovl3 == true) {
writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
tig->id(), rdAid, rdAlo, rdAhi,
rdBid,
@@ -798,6 +806,9 @@ findConfusedEdges(TigVector &tigs,
len, ovl[oo].erate(), score,
ad3, pd3);
+ confusedEdges.push_back(confusedEdge(rdAid, true, rdBid));
+ }
+
isConfused[ri]++;
}
} // Over all marks (ri)
@@ -813,9 +824,10 @@ discardUnambiguousRepeats(TigVector &tigs,
Unitig *tig,
intervalList<int32> &tigMarksR,
double confusedAbsolute,
- double confusedPercent) {
+ double confusedPercent,
+ vector<confusedEdge> &confusedEdges) {
- uint32 *isConfused = findConfusedEdges(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent);
+ uint32 *isConfused = findConfusedEdges(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent, confusedEdges);
// Scan all the regions, and delete any that have no confusion.
@@ -896,7 +908,7 @@ reportTigsCreated(Unitig *tig,
for (uint32 ii=0; ii<BP.size(); ii++) {
int32 rgnbgn = BP[ii]._bgn;
int32 rgnend = BP[ii]._end;
- bool repeat = BP[ii]._isRepeat;
+ bool repeat = BP[ii]._rpt;
if (nRepeat[ii] + nUnique[ii] == 0)
writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n",
@@ -917,11 +929,12 @@ reportTigsCreated(Unitig *tig,
void
-markRepeatReads(AssemblyGraph *AG,
- TigVector &tigs,
- double deviationRepeat,
- uint32 confusedAbsolute,
- double confusedPercent) {
+markRepeatReads(AssemblyGraph *AG,
+ TigVector &tigs,
+ double deviationRepeat,
+ uint32 confusedAbsolute,
+ double confusedPercent,
+ vector<confusedEdge> &confusedEdges) {
uint32 tiLimit = tigs.size();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
@@ -937,10 +950,9 @@ markRepeatReads(AssemblyGraph *AG,
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = tigs[ti];
- if (tig == NULL)
- continue;
-
- if (tig->ufpath.size() == 1)
+ if ((tig == NULL) || // Deleted, nothing to do.
+ (tig->ufpath.size() == 1) || // Singleton, nothing to do.
+ (tig->_isUnassembled == true)) // Unassembled, don't care.
continue;
writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit);
@@ -999,7 +1011,7 @@ markRepeatReads(AssemblyGraph *AG,
writeLog("search for confused edges:\n");
- discardUnambiguousRepeats(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent);
+ discardUnambiguousRepeats(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent, confusedEdges);
// Merge adjacent repeats.
@@ -1054,7 +1066,7 @@ markRepeatReads(AssemblyGraph *AG,
for (uint32 ii=0; ii<BP.size(); ii++)
writeLog(" %8d %8d %s (length %d)\n",
BP[ii]._bgn, BP[ii]._end,
- BP[ii]._isRepeat ? "repeat" : "unique",
+ BP[ii]._rpt ? "repeat" : "unique",
BP[ii]._end - BP[ii]._bgn);
// Scan the reads, counting the number of reads that would be placed in each new tig. This is done
@@ -1092,4 +1104,17 @@ markRepeatReads(AssemblyGraph *AG,
delete tig;
}
}
+
+#if 0
+ FILE *F = fopen("junk.confusedEdges", "w");
+ for (uint32 ii=0; ii<confusedEdges.size(); ii++) {
+ fprintf(F, "%7u %c' from read %7u\n",
+ confusedEdges[ii].aid,
+ confusedEdges[ii].a3p ? '3' : '5',
+ confusedEdges[ii].bid);
+ }
+ fclose(F);
+#endif
+
+ writeStatus("markRepeatReads()-- Found %u confused edges.\n", confusedEdges.size());
}
diff --git a/src/bogart/AS_BAT_MarkRepeatReads.H b/src/bogart/AS_BAT_MarkRepeatReads.H
index 9f88900..229a82c 100644
--- a/src/bogart/AS_BAT_MarkRepeatReads.H
+++ b/src/bogart/AS_BAT_MarkRepeatReads.H
@@ -28,12 +28,32 @@
#include "AS_BAT_TigVector.H"
+
+
+class confusedEdge {
+public:
+ confusedEdge(uint32 aid_, bool a3p_, uint32 bid_) {
+ aid = aid_;
+ a3p = a3p_;
+ bid = bid_;
+ };
+
+ uint32 aid; // Read that is confused and needs to be split.
+ bool a3p; // End of read that is confused.
+
+ uint32 bid; // Read that causes confusion.
+};
+
+
+
void
-markRepeatReads(AssemblyGraph *AG,
- TigVector &tigs,
- double deviationRepeat,
- uint32 confusedAbsolute,
- double confusedPercent);
+markRepeatReads(AssemblyGraph *AG,
+ TigVector &tigs,
+ double deviationRepeat,
+ uint32 confusedAbsolute,
+ double confusedPercent,
+ vector<confusedEdge> &confusedEdges);
+
#endif // INCLUDE_AS_BAT_MARKREPEATREADS
diff --git a/src/bogart/AS_BAT_MergeOrphans.C b/src/bogart/AS_BAT_MergeOrphans.C
index c337166..798a9d3 100644
--- a/src/bogart/AS_BAT_MergeOrphans.C
+++ b/src/bogart/AS_BAT_MergeOrphans.C
@@ -47,20 +47,18 @@
using namespace std;
-#define BUBBLE_READ_FRACTION 0.5
-
#undef SHOW_MULTIPLE_PLACEMENTS // Reports reads that are placed multiple times in a single target region
class candidatePop {
public:
- candidatePop(Unitig *bubble_, Unitig *target_, uint32 bgn_, uint32 end_) {
- bubble = bubble_;
+ candidatePop(Unitig *orphan_, Unitig *target_, uint32 bgn_, uint32 end_) {
+ orphan = orphan_;
target = target_;
bgn = bgn_;
end = end_;
};
- Unitig *bubble;
+ Unitig *orphan;
Unitig *target;
uint32 bgn;
@@ -70,195 +68,160 @@ public:
};
-// A list of the target tigs that a bubble could be popped into.
+// A list of the target tigs that a orphan could be popped into.
typedef map<uint32, vector<uint32> > BubTargetList;
-// Decide which tigs can be bubbles. The first pass finds tigs that can be potential
-// bubbles. Any unitig where every dovetail read has an overlap to some other unitig is a
-// candidate for bubble popping.
+// Decide which tigs can be orphans. Any unitig where (nearly) every dovetail read has an overlap
+// to some other unitig is a candidate for orphan popping.
void
-findPotentialBubbles(TigVector &tigs,
- BubTargetList &potentialBubbles) {
- uint32 tiLimit = tigs.size();
- uint32 tiNumThreads = omp_get_max_threads();
- uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
+findPotentialOrphans(TigVector &tigs,
+ BubTargetList &potentialOrphans) {
writeStatus("\n");
- writeStatus("bubbleDetect()-- working on " F_U32 " tigs, with " F_U32 " thread%s.\n", tiLimit, tiNumThreads, (tiNumThreads == 1) ? "" : "s");
+ writeStatus("findPotentialOrphans()-- working on " F_U32 " tigs.\n", tigs.size());
- for (uint32 ti=0; ti<tiLimit; ti++) {
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
Unitig *tig = tigs[ti];
if ((tig == NULL) || // Not a tig, ignore it.
(tig->ufpath.size() == 1)) // Singleton, handled elsewhere.
continue;
- uint32 nonContainedReads = 0;
- bool validBubble = true;
+ // Count the number of reads that have an overlap to some other tig. tigOlapsTo[otherTig] = count.
map<uint32,uint32> tigOlapsTo;
+ uint32 nonContainedReads = 0;
+ bool validOrphan = true;
- uint32 fiLimit = tig->ufpath.size();
- uint32 fiNumThreads = omp_get_max_threads();
- uint32 fiBlockSize = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;
-
- for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
- uint32 rid = tig->ufpath[fi].ident;
+ for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
+ uint32 rid = tig->ufpath[fi].ident;
if (OG->isContained(rid) == true) // Don't need to check contained reads. If their container
continue; // passes the tests below, the contained read will too.
nonContainedReads++;
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rid, ovlLen);
+ // Find the list of tigs that we have an overlap to.
set<uint32> readOlapsTo;
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(rid, ovlLen);
+
for (uint32 oi=0; oi<ovlLen; oi++) {
uint32 ovlTigID = tigs.inUnitig(ovl[oi].b_iid);
Unitig *ovlTig = tigs[ovlTigID];
- // Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
- // or to a unitig that is shorter than us. We can not pop this tig as a bubble
- // in any of those cases.
-
- if ((ovlTigID == 0) ||
- (ovlTig == NULL) ||
- (ovlTig->ufpath.size() == 1) ||
- (ovlTig->id() == tig->id()) ||
- (ovlTig->getLength() < tig->getLength()))
+ if ((ovlTigID == 0) || // Skip this overlap if it is to an unplaced read,
+ (ovlTig == NULL) || // to a singleton tig, to ourself, or to a unitig
+ (ovlTig->ufpath.size() == 1) || // that is shorter than us. We can not pop this
+ (ovlTig->id() == tig->id()) || // tig as a orphan in any of those cases.
+ (ovlTig->getLength() < tig->getLength())) //
continue;
- // Otherwise, remember that we had an overlap to ovlTig.
-
- //writeLog("tig %u read %u overlap to tig %u read %u\n",
- // tig->id(), rid, ovlTigID, ovl[oi].b_iid);
-
- readOlapsTo.insert(ovlTigID);
+ readOlapsTo.insert(ovlTigID); // Otherwise, remember that we had an overlap to ovlTig.
}
- //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());
-
- // Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
- // that we have overlaps to.
+ // With the list of tigs that this read has an overlap to, add one to each tig in the list of
+ // tigs that this tig has an overlap to.
for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
tigOlapsTo[*it]++;
- // Decide if we're a valid potential bubble. If tig id (in it->first) has overlaps to every
- // read we've seen so far (nonContainedReads), we're still a valid bubble.
- //
- // To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
- // have overlaps.
+ // Decide if we're a valid potential orphan. If tig id (in it->first) has overlaps to
+ // (nearly) every read we've seen so far (nonContainedReads), we're still a valid orphan.
- validBubble = false;
+ validOrphan = false;
for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
- if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
- validBubble = true;
-
- // If we've not seen that many reads, pretend it's a valid bubble. It'll get screened out later.
+ if (it->second == nonContainedReads) // All reads have an overlap to the tig
+ validOrphan = true; // at *it, so valid orphan.
- if (nonContainedReads < 16)
- validBubble = true;
+ if (validOrphan == false) // If not a valid orphan, bail. There is no other
+ break; // tig that all of our reads have overlaps to.
}
- // If not validBubble, report.
-
-#if 0
- if (validBubble == false) {
- writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);
+ // If not a valid orphan, just move on to the next tig.
- for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
- writeLog(" to tig %8u overlaps %6u\n", it->first, it->second);
- }
-#endif
+ if (validOrphan == false)
+ continue;
- // If validBubble, then there is a tig that every dovetail read has at least one overlap to.
- // Save those tigs in potentialBubbles.
+ // Otherwise, a valid orphan! There is at least one tig that (nearly) every dovetail read has
+ // at least one overlap to. Save those tigs in potentialOrphans.
uint32 nTigs = 0;
- if (validBubble) {
- for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
- if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
- nTigs++;
- }
+ for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
+ if (it->second >= 0.5 * nonContainedReads)
+ nTigs++;
- // ALWAYS log potential bubbles.
+ writeLog("findPotentialOrphans()--\n");
+ writeLog("findPotentialOrphans()-- potential orphan tig %8u length %9u nReads %7u to %3u tigs:\n",
+ tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);
- if (nTigs > 0) {
- writeLog("\n");
- writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
- tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);
+ for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
+ if (it->second >= 0.5 * nonContainedReads) {
+ Unitig *dest = tigs[it->first];
- for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
- if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
- Unitig *dest = tigs[it->first];
+ writeLog("findPotentialOrphans()-- tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());
- writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());
-
- potentialBubbles[ti].push_back(dest->id());
- }
+ potentialOrphans[ti].push_back(dest->id());
}
}
- }
+ } // Over all tigs.
flushLog();
}
-
-// Find filtered placements for all the reads in the potential bubble tigs.
+// Find filtered placements for all the reads in the potential orphan tigs.
vector<overlapPlacement> *
-findBubbleReadPlacements(TigVector &tigs,
- BubTargetList &potentialBubbles,
- double deviationBubble) {
+findOrphanReadPlacements(TigVector &tigs,
+ BubTargetList &potentialOrphans,
+ double deviationOrphan) {
uint32 fiLimit = RI->numReads();
uint32 fiNumThreads = omp_get_max_threads();
uint32 fiBlockSize = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;
+ uint64 nReads = 0;
+ uint64 nPlaces = 0;
+
vector<overlapPlacement> *placed = new vector<overlapPlacement> [fiLimit + 1];
+ writeLog("findOrphanReadPlacement()--\n");
+
#pragma omp parallel for schedule(dynamic, fiBlockSize)
for (uint32 fi=0; fi<fiLimit; fi++) {
uint32 rdAtigID = tigs.inUnitig(fi);
if ((rdAtigID == 0) || // Read not placed in a tig, ignore it.
(OG->isContained(fi)) || // Read is contained, ignore it.
- (potentialBubbles.count(rdAtigID) == 0)) // Read isn't in a potential bubble, ignore it.
+ (potentialOrphans.count(rdAtigID) == 0)) // Read isn't in a potential orphan, ignore it.
continue;
+#pragma omp atomic
+ nReads++;
+
Unitig *rdAtig = tigs[rdAtigID];
ufNode *rdA = &rdAtig->ufpath[ tigs.ufpathIdx(fi) ];
bool rdAfwd = (rdA->position.bgn < rdA->position.end);
int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
- bool isEnd = (fi == 0) || (fi == fiLimit-1);
-
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdA->ident, ovlLen);
-
- set<uint32> intersections;
-
- //if ((fi % 100) == 0)
- // fprintf(stderr, "findBubbleReadPlacements()-- read %8u with %6u overlaps - %6.2f%% finished.\r",
- // rdA->ident, ovlLen, 100.0 * fi / fiLimit);
+ bool isEnd = (rdAlo == 0) || (rdAhi == rdAtig->getLength());
- // Compute all placements for this read.
+ // Compute all placements for this read. We ask for only fully placed reads.
vector<overlapPlacement> placements;
- placeReadUsingOverlaps(tigs, NULL, rdA->ident, placements, placeRead_noExtend);
+ placeReadUsingOverlaps(tigs, NULL, rdA->ident, placements, placeRead_fullMatch);
- // Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality. Or are to ourself!
+ // Weed out placements that aren't for orphans, or that are for orphans but are poor quality. Or are to ourself!
for (uint32 pi=0; pi<placements.size(); pi++) {
uint32 rdBtigID = placements[pi].tigID;
@@ -269,546 +232,450 @@ findBubbleReadPlacements(TigVector &tigs,
double erate = placements[pi].errors / placements[pi].aligned;
- // Ignore the placement if it is to ourself.
-
- if (rdAtigID == rdBtigID)
+ if ((rdAtigID == rdBtigID) || // To ourself.
+ (rdBtigID == 0) || // To a singleton read.
+ (rdBtig == NULL) || // To a singleton read.
+ (rdBtig->ufpath.size() == 1) || // To a singleton tig.
+ (potentialOrphans.count((rdBtigID) > 0))) // To a potential orphan tig
continue;
- // Ignore the placement if it is to a non-tig or a singleton read.
-
- if ((rdBtigID == 0) ||
- (rdBtig == NULL) ||
- (rdBtig->ufpath.size() == 1))
- continue;
-
- // Ignore the placement if it is partial and not a terminal read.
-
- if ((isEnd == false) &&
- (placements[pi].fCoverage < 0.99)) {
- if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
- rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
- continue;
- }
-
- // Ignore the placement if it isn't to one of our bubble-popping candidate tigs.
+ // Ignore the placement if it isn't to one of our orphan-popping candidate tigs.
bool dontcare = true;
- vector<uint32> &pbubbles = potentialBubbles[rdAtigID];
+ vector<uint32> &porphans = potentialOrphans[rdAtigID];
- for (uint32 pb=0; pb<pbubbles.size(); pb++) {
- if (pbubbles[pb] == rdBtigID)
+ for (uint32 pb=0; pb<porphans.size(); pb++)
+ if (porphans[pb] == rdBtigID)
dontcare = false;
- }
if (dontcare) {
- if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
+ if (logFileFlagSet(LOG_ORPHAN_DETAIL))
+ writeLog("findOrphanReadPlacement()-- tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
continue;
}
// Ignore the placement if it is too diverged from the destination tig.
- if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
- if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
+ if (rdBtig->overlapConsistentWithTig(deviationOrphan, lo, hi, erate) < 0.5) {
+ if (logFileFlagSet(LOG_ORPHAN_DETAIL))
+ writeLog("findOrphanReadPlacement()-- tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
continue;
}
// Good placement!
- if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f)\n",
+ if (logFileFlagSet(LOG_ORPHAN_DETAIL))
+ writeLog("findOrphanReadPlacement()-- tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f)\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
+#pragma omp atomic
+ nPlaces++;
+
placed[fi].push_back(placements[pi]);
}
}
+ writeLog("findOrphanReadPlacement()-- placed %u reads into %u locations\n", nReads, nPlaces);
+
return(placed);
}
+static
+bool
+failedToPlaceAnchor(Unitig *orphan,
+ vector<overlapPlacement> *placed) {
+ uint32 nReads = orphan->ufpath.size();
+ char placed0 = ((nReads > 0) && (placed[ orphan->ufpath[ 0 ].ident ].size() > 0)) ? 't' : '-';
+ char placed1 = ((nReads > 1) && (placed[ orphan->ufpath[ 1 ].ident ].size() > 0)) ? 't' : '-';
+ char placedb = ((nReads > 1) && (placed[ orphan->ufpath[ nReads-2 ].ident ].size() > 0)) ? 't' : '-';
+ char placeda = ((nReads > 0) && (placed[ orphan->ufpath[ nReads-1 ].ident ].size() > 0)) ? 't' : '-';
-// Bubble popping cannot be done in parallel -- there is a race condition when both tigs
-// A and B are considering merging in unitig C.
-
-void
-mergeOrphans(TigVector &tigs,
- double deviationBubble) {
-
- BubTargetList potentialBubbles;
+ char placedS[128];
+ uint32 placedN = 0;
- findPotentialBubbles(tigs, potentialBubbles);
+ bool failed = false;
- writeStatus("mergeOrphans()-- Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());
+ if (nReads > 3)
+ for (uint32 fi=2; fi<nReads-2; fi++)
+ if (placed[orphan->ufpath[fi].ident].size() > 0)
+ placedN++;
- //if (potentialBubbles.size() == 0)
- // return;
+ switch (nReads) {
+ case 0:
+ assert(0);
+ break;
- writeLog("\n");
- writeLog("Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());
- writeLog("\n");
+ case 1:
+ snprintf(placedS, 128, "%c", placed0);
+ break;
- vector<overlapPlacement> *placed = findBubbleReadPlacements(tigs, potentialBubbles, deviationBubble);
+ case 2:
+ snprintf(placedS, 128, "%c%c", placed0, placeda);
+ break;
- // We now have, in 'placed', a list of all the places that each read could be placed. Decide if there is a _single_
- // place for each bubble to be popped.
+ case 3:
+ snprintf(placedS, 128, "%c%c%c", placed0, placed1, placeda);
+ break;
- uint32 tiLimit = tigs.size();
- //uint32 tiNumThreads = omp_get_max_threads();
- //uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
+ case 4:
+ snprintf(placedS, 128, "%c%c%c%c", placed0, placed1, placedb, placeda);
+ break;
- // Clear flags.
- for (uint32 ti=0; ti<tiLimit; ti++) {
- if (tigs[ti]) {
- tigs[ti]->_isBubble = false;
- tigs[ti]->_isRepeat = false;
- }
+ default:
+ snprintf(placedS, 128, "%c%c[%u]%c%c",
+ placed0, placed1, placedN, placedb, placeda);
+ break;
}
- uint32 nUniqOrphan = 0;
- uint32 nReptOrphan = 0;
- uint32 nUniqBubble = 0;
- uint32 nReptBubble = 0;
+ failed = ((placed0 != 't') || (placeda != 't'));
- // In parallel, process the placements.
+ writeLog("failedToPlaceAnchor()-- potential orphan tig %8u (reads %5u length %8u) - placed %s%s\n",
+ orphan->id(), nReads, orphan->getLength(), placedS, failed ? " FAILED" : "");
- for (uint32 ti=0; ti<tiLimit; ti++) {
- if (potentialBubbles.count(ti) == 0) // Not a potential bubble
- continue;
+ return(failed);
+}
- writeLog("\n");
- // Save some interesting bits about our bubble.
- Unitig *bubble = tigs[ti];
- uint32 bubbleLen = bubble->getLength();
- uint32 nReads = bubble->ufpath.size();
- ufNode &fRead = bubble->ufpath.front();
- ufNode &lRead = bubble->ufpath.back();
+static
+void
+addInitialIntervals(Unitig *orphan,
+ vector<overlapPlacement> *placed,
+ uint32 fReadID,
+ uint32 lReadID,
+ map<uint32, intervalList<uint32> *> &targetIntervals) {
+ uint32 orphanLen = orphan->getLength();
+
+ // Add extended intervals for the first read.
+ //
+ // target ---------------------------------------------
+ // read -------
+ // orphan -------------------------
+
+ for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
+ uint32 tid = placed[fReadID][pp].tigID;
+ uint32 bgn = placed[fReadID][pp].position.min();
+
+ if (targetIntervals[tid] == NULL)
+ targetIntervals[tid] = new intervalList<uint32>;
+
+ targetIntervals[tid]->add(bgn, orphanLen); // Don't care if it goes off the high end of the tig.
+ }
- uint32 fReadID = fRead.ident; // Ident of the first read
- uint32 lReadID = lRead.ident;
+ // Add extended intervals for the last read.
+ //
+ // target ---------------------------------------------
+ // read -------
+ // orphan -------------------------
- bool bubbleInnie = (fRead.position.isForward() && lRead.position.isReverse());
- bool bubbleOuttie = (fRead.position.isReverse() && lRead.position.isForward());
- bool bubbleFwd = (fRead.position.isForward() && lRead.position.isForward());
- bool bubbleRev = (fRead.position.isReverse() && lRead.position.isReverse());
+ for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
+ uint32 tid = placed[lReadID][pp].tigID;
+ uint32 end = placed[lReadID][pp].position.max();
- // Scan the bubble, decide if there are _ANY_ read placements. Log appropriately.
+ if (targetIntervals[tid] == NULL)
+ targetIntervals[tid] = new intervalList<uint32>;
- bool failedToPlaceAnchor = false;
+ if (end < orphanLen)
+ targetIntervals[tid]->add(0, end); // Careful! Negative will underflow!
+ else
+ targetIntervals[tid]->add(end - orphanLen, orphanLen);
+ }
+}
- {
- char placedS[128];
- char placed0 = ((nReads > 0) && (placed[ bubble->ufpath[ 0 ].ident ].size() > 0)) ? 't' : '-';
- char placed1 = ((nReads > 1) && (placed[ bubble->ufpath[ 1 ].ident ].size() > 0)) ? 't' : '-';
- char placedb = ((nReads > 1) && (placed[ bubble->ufpath[ nReads-2 ].ident ].size() > 0)) ? 't' : '-';
- char placeda = ((nReads > 0) && (placed[ bubble->ufpath[ nReads-1 ].ident ].size() > 0)) ? 't' : '-';
- uint32 placedN = 0;
+static
+void
+saveCorrectlySizedInitialIntervals(Unitig *orphan,
+ Unitig *target,
+ intervalList<uint32> *IL,
+ uint32 fReadID,
+ uint32 lReadID,
+ vector<overlapPlacement> *placed,
+ vector<candidatePop *> &targets) {
- if (nReads > 3)
- for (uint32 fi=2; fi<nReads-2; fi++)
- if (placed[bubble->ufpath[fi].ident].size() > 0)
- placedN++;
+ IL->merge(); // Merge overlapping initial intervals created above.
- switch (nReads) {
- case 0:
- assert(0);
- break;
+ for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
+ bool noFirst = true;
+ bool noLast = true;
- case 1:
- snprintf(placedS, 128, "%c", placed0);
- break;
+ uint32 intBgn = IL->lo(ii);
+ uint32 intEnd = IL->hi(ii);
- case 2:
- snprintf(placedS, 128, "%c%c", placed0, placeda);
- break;
+ SeqInterval fPos;
+ SeqInterval lPos;
- case 3:
- snprintf(placedS, 128, "%c%c%c", placed0, placed1, placeda);
- break;
+ // Find the read placement in this interval, if it exists.
- case 4:
- snprintf(placedS, 128, "%c%c%c%c", placed0, placed1, placedb, placeda);
- break;
+ for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
+ fPos = placed[fReadID][pp].position;
- default:
- snprintf(placedS, 128, "%c%c[%u]%c%c",
- placed0, placed1, placedN, placedb, placeda);
- break;
+ if ((target->id() == placed[fReadID][pp].tigID) &&
+ (intBgn <= fPos.min()) && (fPos.max() <= intEnd)) {
+ noFirst = false;
+ break;
}
+ }
- failedToPlaceAnchor = ((placed0 != 't') || (placeda != 't'));
+ for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
+ lPos = placed[lReadID][pp].position;
- writeLog("potential bubble tig %8u (reads %5u length %8u) - placed %s%s\n",
- bubble->id(), nReads, bubbleLen, placedS, failedToPlaceAnchor ? " FAILED" : "");
+ if ((target->id() == placed[lReadID][pp].tigID) &&
+ (intBgn <= lPos.min()) && (lPos.max() <= intEnd)) {
+ noLast = false;
+ break;
+ }
}
- if (failedToPlaceAnchor)
+ // Ignore if missing either read.
+
+ if ((noFirst == true) ||
+ (noLast == true)) {
+ writeLog("saveCorrectlySizedInitialIntervals()-- potential orphan tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - MISSING %s%s%s READ%s\n",
+ orphan->id(), orphan->getLength(),
+ target->id(), intBgn, intEnd, intEnd - intBgn,
+ (noFirst) ? "FIRST" : "",
+ (noFirst && noLast) ? " and " : "",
+ (noLast) ? "LAST" : "",
+ (noFirst && noLast) ? "S" : "");
continue;
+ }
+ writeLog("saveCorrectlySizedInitialIntervals()-- potential orphan tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - %8u-%-8u %8u-%-8u\n",
+ orphan->id(), orphan->getLength(),
+ target->id(), intBgn, intEnd, intEnd - intBgn,
+ fPos.min(), fPos.max(),
+ lPos.min(), lPos.max());
- // Split the placements into piles for each target and build an interval list for each target.
- // For each read in the tig, convert the vector of placements into interval lists, one list per target tig.
-
- map<uint32, intervalList<uint32> *> targetIntervals;
-
- // Add extended intervals for the first read.
+ // Ignore if the region is too small or too big.
- for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
- uint32 tid = placed[fReadID][pp].tigID;
- uint32 bgn = placed[fReadID][pp].position.min();
+ uint32 regionMin = min(fPos.min(), lPos.min());
+ uint32 regionMax = max(fPos.max(), lPos.max());
- if (targetIntervals[tid] == NULL)
- targetIntervals[tid] = new intervalList<uint32>;
-
- targetIntervals[tid]->add(bgn, bubbleLen); // Don't care if it goes off the high end of the tig.
- }
+ if ((regionMax - regionMin < 0.75 * orphan->getLength()) ||
+ (regionMax - regionMin > 1.25 * orphan->getLength()))
+ continue;
- // Add extended intervals for the last read.
+ // We probably should be checking orientation. Maybe tomorrow.
- for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
- uint32 tid = placed[lReadID][pp].tigID;
- uint32 end = placed[lReadID][pp].position.max();
+ // Both reads placed, and at about the right size. Save the candidate position - we can
+ // possibly place 'orphan' in 'tigs[target->id()' at position regionMin-regionMax.
- if (targetIntervals[tid] == NULL)
- targetIntervals[tid] = new intervalList<uint32>;
+ targets.push_back(new candidatePop(orphan, target, regionMin, regionMax));
+ } // Over all intervals for this target
- if (end < bubbleLen)
- targetIntervals[tid]->add(0, end); // Careful! Negative will underflow!
- else
- targetIntervals[tid]->add(end - bubbleLen, bubbleLen);
- }
+ // We're done with this intervalList, clean it up. This does leave a dangling pointer in the map<> though.
- // For each destination tig:
- // merge the intervals
- // for each interval
- // find which bubble first/last reads map to each interval
- // ignore if the extent of first/last is too big or small
- // save otherwise
-
- vector<candidatePop *> targets;
+ delete IL;
+}
- for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
- uint32 targetID = it->first;
- intervalList<uint32> *IL = it->second;
- // Merge.
- IL->merge();
- // Figure out if each interval has both the first and last read of some bubble, and if those
- // are properly sized.
- for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
- bool noFirst = true;
- bool noLast = true;
+void
+assignReadsToTargets(Unitig *orphan,
+ vector<overlapPlacement> *placed,
+ vector<candidatePop *> targets) {
+
+ for (uint32 fi=0; fi<orphan->ufpath.size(); fi++) {
+ uint32 readID = orphan->ufpath[fi].ident;
+
+ for (uint32 pp=0; pp<placed[readID].size(); pp++) {
+ uint32 tid = placed[readID][pp].tigID;
+ uint32 bgn = placed[readID][pp].position.min();
+ uint32 end = placed[readID][pp].position.max();
+
+ for (uint32 tt=0; tt<targets.size(); tt++) // For a read placed in tig 'tid' at 'bgn-end',
+ if ((targets[tt]->target->id() == tid) && // if the target is the same tig and the read
+ (isContained(bgn, end, targets[tt]->bgn, targets[tt]->end))) // is contained in the target position,
+ targets[tt]->placed.push_back(placed[readID][pp]); // save the position to the target
+ }
+ }
- uint32 intBgn = IL->lo(ii);
- uint32 intEnd = IL->hi(ii);
+ // Remove duplicate placements from each target.
+ //
+ // Detect duplicates, keep the one with lower error.
+ // There are a lot of duplicate placements, logging isn't terribly useful.
- SeqInterval fPos;
- SeqInterval lPos;
+ uint32 nDup = 0;
+ uint32 save;
+ uint32 remo;
- for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
- fPos = placed[fReadID][pp].position;
+ for (uint32 tt=0; tt<targets.size(); tt++) {
+ candidatePop *t = targets[tt];
- if ((targetID == placed[fReadID][pp].tigID) &&
- (intBgn <= fPos.min()) && (fPos.max() <= intEnd)) {
- noFirst = false;
- break;
- }
- }
+ for (uint32 aa=0; aa<t->placed.size(); aa++) {
+ for (uint32 bb=0; bb<t->placed.size(); bb++) {
+ if ((aa == bb) ||
+ (t->placed[aa].frgID != t->placed[bb].frgID) ||
+ (t->placed[aa].frgID == 0) ||
+ (t->placed[bb].frgID == 0))
+ continue;
- for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
- lPos = placed[lReadID][pp].position;
+ nDup++;
- if ((targetID == placed[lReadID][pp].tigID) &&
- (intBgn <= lPos.min()) && (lPos.max() <= intEnd)) {
- noLast = false;
- break;
- }
+ if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
+ save = aa;
+ remo = bb;
+ } else {
+ save = bb;
+ remo = aa;
}
- // Ignore if missing either read.
-
- if ((noFirst == true) ||
- (noLast == true)) {
- writeLog("potential bubble tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - MISSING %s%s%s READ%s\n",
- bubble->id(), bubble->getLength(),
- targetID, intBgn, intEnd, intEnd - intBgn,
- (noFirst) ? "FIRST" : "",
- (noFirst && noLast) ? " and " : "",
- (noLast) ? "LAST" : "",
- (noFirst && noLast) ? "S" : "");
- continue;
- }
+#ifdef SHOW_MULTIPLE_PLACEMENTS
+ writeLog("assignReadsToTargets()-- duplicate read alignment for tig %u read %u - better %u-%-u %.4f - worse %u-%-u %.4f\n",
+ t->placed[save].tigID, t->placed[save].frgID,
+ t->placed[save].position.bgn, t->placed[save].position.end, t->placed[save].errors / t->placed[save].aligned,
+ t->placed[remo].position.bgn, t->placed[remo].position.end, t->placed[remo].errors / t->placed[remo].aligned);
+#endif
- writeLog("potential bubble tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - %8u-%-8u %8u-%-8u\n",
- bubble->id(), bubble->getLength(),
- targetID, intBgn, intEnd, intEnd - intBgn,
- fPos.min(), fPos.max(),
- lPos.min(), lPos.max());
+ t->placed[remo] = overlapPlacement();
+ }
+ }
+ // Get rid of any now-empty entries.
- // Ignore if the reads align in inconsistent orientations.
+ for (uint32 aa=t->placed.size(); aa--; ) {
+ if (t->placed[aa].frgID == 0) {
+ t->placed[aa] = t->placed.back();
+ t->placed.pop_back();
+ }
+ }
+ }
-#if 0
- bool alignFwd = (fPos.min() < lPos.max()) ? true : false;
- bool fPosFwd = fPos.isForward();
- bool lPosFwd = lPos.isForward();
+ writeLog("assignReadsToTargets()-- Removed %u duplicate placements.\n", nDup);
+}
- bool alignInnie = (alignFwd == true) ? ((fPosFwd == true) && (lPosFwd == false)) : ((fPosFwd == false) && (lPosFwd == true));
- bool alignOuttie = false;
- bool alignFwd = false;
- bool alignRev = false;
- bool alignInnie = (alignFwd && fPosFwd && !rPosFwd);
- //if ((bubbleInnie == true) &&
- //if ((bubbleOuttie == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
- //if ((bubbleFwd == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
- //if ((bubbleRev == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
-#endif
- // Ignore if the region is too small or too big.
- uint32 regionMin = min(fPos.min(), lPos.min());
- uint32 regionMax = max(fPos.max(), lPos.max());
+void
+mergeOrphans(TigVector &tigs,
+ double deviationOrphan) {
- if ((regionMax - regionMin < 0.75 * bubbleLen) ||
- (regionMax - regionMin > 1.25 * bubbleLen))
- continue;
+ // Find, for each tig, the list of other tigs that it could potentially be placed into.
- // Both reads placed, and at about the right size. We probably should be checking orientation. Maybe tomorrow.
+ BubTargetList potentialOrphans;
- targets.push_back(new candidatePop(bubble, tigs[targetID], regionMin, regionMax));
- } // Over all intervals for this target
- } // Over all targets
+ findPotentialOrphans(tigs, potentialOrphans);
- // Done with the targetIntervals. Clean up.
+ writeStatus("mergeOrphans()-- Found " F_SIZE_T " potential orphans.\n", potentialOrphans.size());
- for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
- delete it->second;
+ writeLog("\n");
+ writeLog("mergeOrphans()-- Found " F_SIZE_T " potential orphans.\n", potentialOrphans.size());
+ writeLog("\n");
- targetIntervals.clear();
+ // For any tig that is a potential orphan, find all read placements.
- // If no targets, nothing to do.
+ vector<overlapPlacement> *placed = findOrphanReadPlacements(tigs, potentialOrphans, deviationOrphan);
- if (targets.size() == 0) {
- writeLog("potential bubble tig %8u - generated no targets\n", ti);
- continue;
- }
+ // We now have, in 'placed', a list of all the places that each read could be placed. Decide if there is a _single_
+ // place for each orphan to be popped.
- // Run through the placements again, and assign them to the correct target.
- //
- // For each read:
- // For each acceptable placement:
- // For each target location:
- // If the placement is for this target, save it.
+ uint32 nUniqOrphan = 0;
+ uint32 nReptOrphan = 0;
- for (uint32 fi=0; fi<nReads; fi++) {
- uint32 readID = bubble->ufpath[fi].ident;
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *orphan = tigs[ti];
- for (uint32 pp=0; pp<placed[readID].size(); pp++) {
- uint32 tid = placed[readID][pp].tigID;
+ if (potentialOrphans.count(ti) == 0)
+ continue;
- uint32 bgn = placed[readID][pp].position.min();
- uint32 end = placed[readID][pp].position.max();
+ // Scan the orphan, decide if there are _ANY_ read placements. Log appropriately.
- for (uint32 tt=0; tt<targets.size(); tt++)
- if ((targets[tt]->target->id() == tid) &&
- (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
- targets[tt]->placed.push_back(placed[readID][pp]);
- }
- }
+ if (failedToPlaceAnchor(orphan, placed) == true)
+ continue;
- // Count the number of targets that have all the reads (later: in the correct order, etc, etc). Remove those
- // that don't.
+ writeLog("mergeOrphans()-- Processing orphan %u - %u bp %u reads\n", ti, orphan->getLength(), orphan->ufpath.size());
- uint32 nTargets = 0;
+ // Create intervals for each placed read.
+ //
+ // target ---------------------------------------------
+ // read -------
+ // orphan -------------------------
- set<uint32> tigReads; // Reads in the bubble tig.
- set<uint32> tgtReads; // Reads in the bubble that have a placement in the target.
+ uint32 fReadID = orphan->ufpath.front().ident;
+ uint32 lReadID = orphan->ufpath.back().ident;
+ map<uint32, intervalList<uint32> *> targetIntervals;
- // Remove duplicate placements from each target.
+ addInitialIntervals(orphan, placed, fReadID, lReadID, targetIntervals);
- for (uint32 tt=0; tt<targets.size(); tt++) {
- candidatePop *t = targets[tt];
+ // Figure out if each interval has both the first and last read of some orphan, and if those
+ // are properly sized. If so, save a candidatePop.
- // Detect duplicates, keep the one with lower error. There are a lot of duplicate
- // placements, logging isn't terribly useful.
+ vector<candidatePop *> targets;
- for (uint32 aa=0; aa<t->placed.size(); aa++) {
- for (uint32 bb=0; bb<t->placed.size(); bb++) {
- if ((aa == bb) ||
- (t->placed[aa].frgID != t->placed[bb].frgID) ||
- (t->placed[aa].frgID == 0) ||
- (t->placed[bb].frgID == 0))
- continue;
+ for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
+ saveCorrectlySizedInitialIntervals(orphan,
+ tigs[it->first], // The targetID in targetIntervals
+ it->second, // The interval list in targetIntervals
+ fReadID,
+ lReadID,
+ placed,
+ targets);
- if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
-#ifdef SHOW_MULTIPLE_PLACEMENTS
- writeLog("duplicate read alignment for tig %u read %u - better %u-%-u %.4f - worse %u-%-u %.4f\n",
- t->placed[aa].tigID, t->placed[aa].frgID,
- t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
- t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
-#endif
- t->placed[bb] = overlapPlacement();
- } else {
-#ifdef SHOW_MULTIPLE_PLACEMENTS
- writeLog("duplicate read alignment for tig %u read %u - better %u-%-u %.4f - worse %u-%-u %.4f\n",
- t->placed[aa].tigID, t->placed[aa].frgID,
- t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
- t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
-#endif
- t->placed[aa] = overlapPlacement();
- }
- }
- }
+ targetIntervals.clear(); // intervalList already freed.
- // Get rid of any now-empty entries.
+ // If no targets, nothing to do.
- for (uint32 aa=t->placed.size(); aa--; ) {
- if (t->placed[aa].frgID == 0) {
- t->placed[aa] = t->placed.back();
- t->placed.pop_back();
- }
- }
- }
+ writeLog("mergeOrphans()-- Processing orphan %u - found %u target location%s\n", ti, targets.size(), (targets.size() == 1) ? "" : "s");
- // Make a set of the reads in the bubble.
+ if (targets.size() == 0)
+ continue;
- for (uint32 fi=0; fi<nReads; fi++)
- tigReads.insert(bubble->ufpath[fi].ident);
+ // Assign read placements to targets.
- // Compare the bubble against each target.
+ assignReadsToTargets(orphan, placed, targets);
- uint32 nOrphan = 0; // Full coverage; bubble can be popped.
- uint32 orphanTarget = 0;
+ // Compare the orphan against each target.
- uint32 nBubble = 0; // Partial coverage, bubble cannot be popped.
- uint32 bubbleTarget = 0;
+ uint32 nOrphan = 0; // Number of targets that have all the reads.
+ uint32 orphanTarget = 0; // If nOrphan == 1, the target we're popping into.
for (uint32 tt=0; tt<targets.size(); tt++) {
- tgtReads.clear();
+ uint32 orphanSize = orphan->ufpath.size();
+ uint32 targetSize = targets[tt]->placed.size();
+
+ // Report now, before we nuke targets[tt] for being not a orphan!
- for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
- if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - read %7u at %9u-%-9u\n",
- bubble->id(), bubble->getLength(),
+ if (logFileFlagSet(LOG_ORPHAN_DETAIL))
+ for (uint32 op=0; op<targets[tt]->placed.size(); op++)
+ writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - read %7u at %9u-%-9u\n",
+ orphan->id(), orphan->getLength(),
targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
targets[tt]->placed[op].frgID,
targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);
- assert(targets[tt]->placed[op].frgID > 0);
- tgtReads.insert(targets[tt]->placed[op].frgID);
- }
-
- // Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
- // in the target.
- //
- // Also, count the number of reads in the bubble that are placed in the target. Likely the
- // same as n5 + n3.
+ writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads.\n",
+ orphan->id(), orphan->getLength(),
+ targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
+ orphanSize, targetSize);
- uint32 n5 = 0;
- uint32 n3 = 0;
- uint32 nt = 0;
+ // If all reads placed, we can merge this orphan into the target. Preview: if this happens more than once, we just
+ // split the orphan and place reads individually.
- for (uint32 fi=0; fi<nReads; fi++)
- if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
- n5++;
- else
- break;
-
- for (uint32 fi=nReads; fi-->0; )
- if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
- n3++;
- else
- break;
-
-
- for (uint32 fi=0; fi<nReads; fi++)
- if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
- nt++;
-
-
- // Report now, before we nuke targets[tt] for being not a bubble!
-
- if ((nt == nReads) ||
- ((n5 > 0) && (n3 > 0)))
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads. n5=%3u n3=%3u nt=%3u\n",
- bubble->id(), bubble->getLength(),
- targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
- tigReads.size(),
- tgtReads.size(), n5, n3, nt);
-
- // Decide if this is a bubble, orphan from construction, or repeat.
-
- if (nt == nReads) {
+ if (orphanSize == targetSize) {
nOrphan++;
orphanTarget = tt;
}
-
- else if ((n5 > 0) && (n3 > 0)) {
- nBubble++;
- bubbleTarget = tt;
- }
- }
-
- // If no placements, pbbbt, not a whole lot we can do here. Leave it as is. It's not even
- // worth logging (there are many of these).
-
- if (nOrphan + nBubble == 0) {
- }
-
- // If not an orphan, mark it as a bubble. If multiple bubble placements, mark it as a repeat
- // so we can use it in repeat detection.
- //
- // If there are orphan placements also, those placements are superior to the bubble placements,
- // and we'll place the orphan.
-
- else if (nOrphan == 0) {
- if (nBubble == 1) {
- nUniqBubble++;
- writeStatus("mergeOrphans()-- tig %8u BUBBLE -> tig %8u\n",
- bubble->id(),
- targets[bubbleTarget]->target->id());
- } else {
- nReptBubble++;
- writeStatus("mergeOrphans()-- tig %8u BUBBLE -> repeat\n",
- bubble->id());
- }
-
- writeLog("tig %8u length %8u reads %6u - %s.\n",
- bubble->id(), bubble->getLength(), nReads,
- (nBubble == 1) ? "bubble" : "bubble-repeat");
- writeLog("\n");
-
- bubble->_isRepeat = (nBubble > 1);
- bubble->_isBubble = true;
}
// If a unique orphan placement, place it there.
- else if (nOrphan == 1) {
+ if (nOrphan == 1) {
+ writeLog("mergeOrphans()-- tig %8u length %8u reads %6u - orphan\n", orphan->id(), orphan->getLength(), orphan->ufpath.size());
nUniqOrphan++;
- writeStatus("mergeOrphans()-- tig %8u ORPHAN -> tig %8u\n",
- bubble->id(),
- targets[bubbleTarget]->target->id());
-
- writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), nReads);
for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
ufNode frg;
@@ -821,9 +688,9 @@ mergeOrphans(TigVector &tigs,
frg.position.bgn = targets[tt]->placed[op].position.bgn;
frg.position.end = targets[tt]->placed[op].position.end;
- writeLog("move read %u from tig %u to tig %u %u-%-u\n",
+ writeLog("mergeOrphans()-- move read %u from tig %u to tig %u %u-%-u\n",
frg.ident,
- bubble->id(),
+ orphan->id(),
targets[tt]->target->id(), frg.position.bgn, frg.position.end);
targets[tt]->target->addRead(frg, 0, false);
@@ -831,35 +698,38 @@ mergeOrphans(TigVector &tigs,
writeLog("\n");
- tigs[bubble->id()] = NULL;
- delete bubble;
+ tigs[orphan->id()] = NULL;
+ delete orphan;
}
- // Otherwise, there are multiple orphan placements. We can't distinguish between them, and
+ // If multiply placed, we can't distinguish between them, and
// instead just place reads where they individually decide to go.
- else {
- nReptBubble++;
- writeStatus("mergeOrphans()-- tig %8u ORPHAN -> multiple tigs\n",
- bubble->id(),
- targets[bubbleTarget]->target->id());
+ if (nOrphan > 1) {
+ writeLog("tig %8u length %8u reads %6u - orphan with multiple placements\n", orphan->id(), orphan->getLength(), orphan->ufpath.size());
+ nReptOrphan++;
- writeLog("tig %8u length %8u reads %6u - orphan with multiple placements\n", bubble->id(), bubble->getLength(), nReads);
-
- for (uint32 fi=0; fi<nReads; fi++) {
- uint32 rr = bubble->ufpath[fi].ident;
+ for (uint32 fi=0; fi<orphan->ufpath.size(); fi++) {
+ uint32 rr = orphan->ufpath[fi].ident;
double er = 1.00;
uint32 bb = 0;
+ // Over all placements for this read, pick the one with lowest error, as long as it isn't
+ // to the orphan.
+
for (uint32 pp=0; pp<placed[rr].size(); pp++) {
double erate = placed[rr][pp].errors / placed[rr][pp].aligned;
- if (erate < er) {
- er = erate;
- bb = pp;
- }
+ if ((er < erate) || // Worse placement.
+ (placed[rr][pp].tigID == orphan->id())) // Self placement.
+ continue;
+
+ er = erate;
+ bb = pp;
}
+ assert(rr == placed[rr][bb].frgID);
+
ufNode frg;
frg.ident = placed[rr][bb].frgID;
@@ -874,16 +744,18 @@ mergeOrphans(TigVector &tigs,
writeLog("move read %u from tig %u to tig %u %u-%-u\n",
frg.ident,
- bubble->id(),
+ orphan->id(),
target->id(), frg.position.bgn, frg.position.end);
+ assert(target->id() != orphan->id());
+
target->addRead(frg, 0, false);
}
writeLog("\n");
- tigs[bubble->id()] = NULL;
- delete bubble;
+ tigs[orphan->id()] = NULL;
+ delete orphan;
}
// Clean up the targets list.
@@ -895,20 +767,19 @@ mergeOrphans(TigVector &tigs,
targets.clear();
- } // Over all bubbles
+ } // Over all orphans
- writeLog("\n"); // Needed if no bubbles are popped.
+ writeLog("\n"); // Needed if no orphans are popped.
writeStatus("mergeOrphans()-- placed %5u unique orphan tigs\n", nUniqOrphan);
writeStatus("mergeOrphans()-- shattered %5u repeat orphan tigs\n", nReptOrphan);
- writeStatus("mergeOrphans()-- marked %5u unique bubble tigs\n", nUniqBubble);
- writeStatus("mergeOrphans()-- marked %5u repeat bubble tigs\n", nReptBubble);
+ writeStatus("mergeOrphans()--\n");
delete [] placed;
// Sort reads in all the tigs. Overkill, but correct.
- for (uint32 ti=0; ti<tiLimit; ti++) {
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
Unitig *tig = tigs[ti];
if ((tig == NULL) || // Not a tig, ignore it.
diff --git a/src/bogart/AS_BAT_OptimizePositions.C b/src/bogart/AS_BAT_OptimizePositions.C
new file mode 100644
index 0000000..25fdfd5
--- /dev/null
+++ b/src/bogart/AS_BAT_OptimizePositions.C
@@ -0,0 +1,508 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/bogart/AS_BAT_Unitig.C
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2017-JUL-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_global.H"
+#include "AS_BAT_Unitig.H"
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_Logging.H"
+
+
+
+class optPos {
+public:
+ optPos() {
+ };
+ ~optPos() {
+ };
+
+ void set(ufNode &n) {
+ ident = n.ident;
+ min = n.position.min();
+ max = n.position.max();
+ fwd = n.position.isForward();
+ };
+
+ uint32 ident;
+ double min;
+ double max;
+ bool fwd;
+};
+
+
+
+void
+Unitig::optimize_initPlace(uint32 ii,
+ optPos *op,
+ optPos *np,
+ bool firstPass,
+ set<uint32> &failed,
+ bool beVerbose) {
+ uint32 iid = ufpath[ii].ident;
+ double nmin = 0;
+ int32 cnt = 0;
+
+ if ((firstPass == false) && (failed.count(iid) == 0)) // If the second pass and not
+ return; // failed, do nothing.
+
+ if (firstPass == false)
+ writeLog("optimize_initPlace()-- Second pass begins.\n");
+
+ // Then process all overlaps.
+
+ if (ii > 0) {
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(iid, ovlLen);
+
+ for (uint32 oo=0; oo<ovlLen; oo++) {
+ uint32 jid = ovl[oo].b_iid;
+ uint32 uu = inUnitig (jid);
+ uint32 jj = ufpathIdx(jid);
+
+ // Probably overkill, but report ALL overlaps for the troubling reads.
+
+ if ((beVerbose) || (firstPass == false))
+ writeLog("optimize_initPlace()-- olap %u a %u b %u hangs %d %d\n", oo, iid, jid, ovl[oo].a_hang, ovl[oo].b_hang);
+
+ if (uu != id()) // Skip if the overlap is to a different tig.
+ continue; // (the ufpathIdx() call is valid, but using it isn't)
+
+ // Reads are in the same tig. Decide if they overlap in position.
+
+ bool isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
+
+ // Log! beVerbose should be true for the second pass, but just in case it isn't.
+
+ if ((beVerbose) || (firstPass == false))
+ writeLog("optimize_initPlace()-- olap %4u tig %7u read %8u (at %9d %9d) olap to read %8u (at %9d %9d) - hangs %7d %7d - %s %s\n",
+ oo, id(),
+ iid, ufpath[ii].position.bgn, ufpath[ii].position.end,
+ jid, ufpath[jj].position.bgn, ufpath[jj].position.end,
+ ovl[oo].a_hang, ovl[oo].b_hang,
+ (isOvl == true) ? "overlapping" : "not-overlapping",
+ (jj > ii) ? "after" : "before");
+
+ if (isOvl == false) // Skip if the reads
+ continue; // don't overlap
+
+ if ((firstPass) && (jj > ii)) // We're setting initial positions, so overlaps to reads after
+ continue; // us aren't correct, unless we're in the 2nd pass
+
+ // Reads overlap. Compute the position of the read using
+ // the overlap and the other read.
+
+ nmin += (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
+ cnt += 1;
+ } // over all overlaps
+
+ // If no overlaps found, flag this read for a second pass. If in the second pass,
+ // not much we can do.
+
+ if ((firstPass == true) && (cnt == 0)) {
+ writeLog("optimize_initPlace()-- Failed to find overlaps for read %u in tig %u at %d-%d (first pass)\n",
+ iid, id(), ufpath[ii].position.bgn, ufpath[ii].position.end);
+ failed.insert(iid);
+ return;
+ }
+
+ if ((firstPass == false) && (cnt == 0)) {
+ writeLog("optimize_initPlace()-- Failed to find overlaps for read %u in tig %u at %d-%d (second pass)\n",
+ iid, id(), ufpath[ii].position.bgn, ufpath[ii].position.end);
+ flushLog();
+ }
+
+ assert(cnt > 0);
+ }
+
+ // The initialization above does very little to enforce read lengths, and the optimization
+ // doesn't put enough weight in the read length to make it stable. We simply force
+ // the correct read length here.
+
+ op[iid].min = (cnt == 0) ? 0 : (nmin / cnt);
+ op[iid].max = op[iid].min + RI->readLength(ufpath[ii].ident);
+
+ np[iid].min = 0;
+ np[iid].max = 0;
+
+ if (beVerbose)
+ writeLog("optimize_initPlace()-- tig %7u read %9u initialized to position %9.2f %9.2f%s\n",
+ id(), op[iid].ident, op[iid].min, op[iid].max, (firstPass == true) ? "" : " SECONDPASS");
+}
+
+
+
+void
+Unitig::optimize_recompute(uint32 iid,
+ optPos *op,
+ optPos *np,
+ bool beVerbose) {
+ uint32 ii = ufpathIdx(iid);
+
+ int32 readLen = RI->readLength(iid);
+
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(iid, ovlLen);
+
+ double nmin = 0.0;
+ double nmax = 0.0;
+ uint32 cnt = 0;
+
+ if (beVerbose) {
+ writeLog("optimize()-- tig %8u read %8u previous - %9.2f-%-9.2f\n", id(), iid, op[iid].min, op[iid].max);
+ writeLog("optimize()-- tig %8u read %8u length - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
+ }
+
+ // Process all overlaps.
+
+ for (uint32 oo=0; oo<ovlLen; oo++) {
+ uint32 jid = ovl[oo].b_iid;
+ uint32 uu = inUnitig (jid);
+ uint32 jj = ufpathIdx(jid);
+
+ if (uu != id()) // Skip if the overlap is to a different tig.
+ continue; // (the ufpathIdx() call is valid, but using it isn't)
+
+ if (isOverlapping(ufpath[ii].position, ufpath[jj].position) == false) // Skip if the reads
+ continue; // don't overlap
+
+ // Reads overlap. Compute the position of the read using
+ // the overlap and the other read.
+
+ double tmin = (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
+ double tmax = (op[iid].fwd) ? (op[jid].max - ovl[oo].b_hang) : (op[jid].max + ovl[oo].a_hang);
+
+ if (beVerbose)
+ writeLog("optimize()-- tig %8u read %8u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
+
+ nmin += tmin;
+ nmax += tmax;
+ cnt += 1;
+ } // over all overlaps
+
+ // Add in some evidence for the bases in the read. We want higher weight than the overlaps,
+ // but not enough to swamp the hangs.
+
+ nmin += cnt/4 * (op[iid].max - readLen);
+ nmax += cnt/4 * (op[iid].min + readLen);
+ cnt += cnt/4;
+
+ // Find the average and save.
+
+ np[iid].min = nmin / cnt;
+ np[iid].max = nmax / cnt;
+
+ double dmin = 2 * (op[iid].min - np[iid].min) / (op[iid].min + np[iid].min);
+ double dmax = 2 * (op[iid].max - np[iid].max) / (op[iid].max + np[iid].max);
+ double npll = np[iid].max - np[iid].min;
+
+ if (beVerbose)
+ writeLog("optimize()-- tig %8u read %8u - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
+ id(), iid,
+ np[iid].min, np[iid].max,
+ npll, readLen,
+ 200.0 * (npll - readLen) / (npll + readLen),
+ dmin, dmax);
+}
+
+
+
+
+
+void
+Unitig::optimize_expand(optPos *op) {
+
+ for (uint32 ii=0; ii<ufpath.size(); ii++) {
+ uint32 iid = ufpath[ii].ident;
+
+ int32 readLen = RI->readLength(iid);
+
+ double opiimin = op[iid].min; // New start of this read, same as the old start
+ double opiimax = op[iid].min + readLen; // New end of this read
+ double opiilen = op[iid].max - op[iid].min;
+
+ if (readLen <= opiilen) // This read is sufficiently long,
+ continue; // do nothing.
+
+ double scale = readLen / opiilen;
+ double expand = opiimax - op[iid].max; // Amount we changed this read, bases
+
+ // For each read, adjust positions based on how much they overlap with this read.
+
+ for (uint32 jj=0; jj<ufpath.size(); jj++) {
+ uint32 jid = ufpath[jj].ident;
+
+ if (op[jid].min < op[iid].min)
+ ;
+ else if (op[jid].min < op[iid].max)
+ op[jid].min = opiimin + (op[jid].min - op[iid].min) * scale;
+ else
+ op[jid].min += expand;
+
+
+ if (op[jid].max < op[iid].min)
+ ;
+ else if (op[jid].max < op[iid].max)
+ op[jid].max = opiimin + (op[jid].max - op[iid].min) * scale;
+ else
+ op[jid].max += expand;
+ }
+
+ // Finally, actually shift us
+
+ op[iid].min = opiimin;
+ op[iid].max = opiimax;
+ }
+}
+
+
+
+void
+Unitig::optimize_setPositions(optPos *op,
+ bool beVerbose) {
+
+ for (uint32 ii=0; ii<ufpath.size(); ii++) {
+ uint32 iid = ufpath[ii].ident;
+
+ int32 readLen = RI->readLength(iid);
+ int32 opll = (int32)op[iid].max - (int32)op[iid].min;
+ double opdd = 200.0 * (opll - readLen) / (opll + readLen);
+
+ if (op[iid].fwd) {
+ if (beVerbose)
+ writeLog("optimize()-- read %8u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+ iid,
+ ufpath[ii].position.bgn,
+ ufpath[ii].position.end,
+ ufpath[ii].position.end - ufpath[ii].position.bgn,
+ (int32)op[iid].min,
+ (int32)op[iid].max,
+ opll,
+ readLen,
+ opdd);
+
+ ufpath[ii].position.bgn = (int32)op[iid].min;
+ ufpath[ii].position.end = (int32)op[iid].max;
+ } else {
+ if (beVerbose)
+ writeLog("optimize()-- read %8u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+ iid,
+ ufpath[ii].position.bgn,
+ ufpath[ii].position.end,
+ ufpath[ii].position.bgn - ufpath[ii].position.end,
+ (int32)op[iid].max,
+ (int32)op[iid].min,
+ opll,
+ readLen,
+ opdd);
+
+ ufpath[ii].position.bgn = (int32)op[iid].max;
+ ufpath[ii].position.end = (int32)op[iid].min;
+ }
+ }
+}
+
+
+
+void
+TigVector::optimizePositions(const char *prefix, const char *label) {
+ uint32 numThreads = omp_get_max_threads();
+
+ uint32 tiLimit = size();
+ uint32 tiBlockSize = 10; //(tiLimit < 10 * numThreads) ? numThreads : tiLimit / 9;
+
+ uint32 fiLimit = RI->numReads() + 1;
+ uint32 fiBlockSize = 100; //(fiLimit < 1000 * numThreads) ? numThreads : fiLimit / 999;
+
+ bool beVerbose = false;
+
+ writeStatus("optimizePositions()-- Optimizing read positions for %u reads in %u tigs, with %u thread%s.\n",
+ fiLimit, tiLimit, numThreads, (numThreads == 1) ? "" : "s");
+
+ // Create work space and initialize to current read positions.
+
+ writeStatus("optimizePositions()-- Allocating scratch space for %u reads (%u KB).\n", fiLimit, sizeof(optPos) * fiLimit * 2 >> 10);
+
+ optPos *pp = NULL;
+ optPos *op = new optPos [fiLimit];
+ optPos *np = new optPos [fiLimit];
+
+ memset(op, 0, sizeof(optPos) * fiLimit);
+ memset(np, 0, sizeof(optPos) * fiLimit);
+
+ for (uint32 fi=0; fi<fiLimit; fi++) {
+ uint32 ti = inUnitig(fi);
+ uint32 pp = ufpathIdx(fi);
+
+ if (ti == 0)
+ continue;
+
+ op[fi].set(operator[](ti)->ufpath[pp]);
+ np[fi].set(operator[](ti)->ufpath[pp]);
+ }
+
+ // Compute initial positions using previously placed reads and the read length.
+
+
+ //
+ // Initialize positions using only reads before us. If any reads fail to find overlaps, a second
+ // round will init positions using any read (before or after).
+ //
+
+ writeStatus("optimizePositions()-- Initializing positions with %u threads.\n", numThreads);
+
+#pragma omp parallel for schedule(dynamic, tiBlockSize)
+ for (uint32 ti=0; ti<tiLimit; ti++) {
+ Unitig *tig = operator[](ti);
+ set<uint32> failed;
+
+ if (tig == NULL)
+ continue;
+
+ for (uint32 ii=0; ii<tig->ufpath.size(); ii++)
+ tig->optimize_initPlace(ii, op, np, true, failed, beVerbose);
+
+ for (uint32 ii=0; ii<tig->ufpath.size(); ii++)
+ tig->optimize_initPlace(ii, op, np, false, failed, true);
+ }
+
+ //
+ // Recompute positions using all overlaps and reads both before and after. Do this for a handful of iterations
+ // so it somewhat stabilizes.
+ //
+
+ for (uint32 iter=0; iter<5; iter++) {
+
+ // Recompute positions
+
+ writeStatus("optimizePositions()-- Recomputing positions, iteration %u, with %u threads.\n", iter+1, numThreads);
+
+#pragma omp parallel for schedule(dynamic, fiBlockSize)
+ for (uint32 fi=0; fi<fiLimit; fi++) {
+ uint32 ti = inUnitig(fi);
+
+ if (ti == 0)
+ continue;
+
+ operator[](ti)->optimize_recompute(fi, op, np, beVerbose);
+ }
+
+ // Reset zero
+
+ writeStatus("optimizePositions()-- Reset zero.\n");
+
+ for (uint32 ti=0; ti<tiLimit; ti++) {
+ Unitig *tig = operator[](ti);
+
+ if (tig == NULL)
+ continue;
+
+ int32 z = np[ tig->ufpath[0].ident ].min;
+
+ for (uint32 ii=0; ii<tig->ufpath.size(); ii++) {
+ uint32 iid = tig->ufpath[ii].ident;
+
+ np[iid].min -= z;
+ np[iid].max -= z;
+ }
+ }
+
+ // Decide if we've converged. We used to compute percent difference in coordinates, but that is
+ // biased by the position of the read. Just use percent difference from read length.
+
+ writeStatus("optimizePositions()-- Checking convergence.\n");
+
+ uint32 nConverged = 0;
+ uint32 nChanged = 0;
+
+ for (uint32 fi=0; fi<fiLimit; fi++) {
+ double minp = 2 * (op[fi].min - np[fi].min) / (RI->readLength(fi));
+ double maxp = 2 * (op[fi].max - np[fi].max) / (RI->readLength(fi));
+
+ if (minp < 0) minp = -minp;
+ if (maxp < 0) maxp = -maxp;
+
+ if ((minp < 0.005) && (maxp < 0.005))
+ nConverged++;
+ else
+ nChanged++;
+ }
+
+ // All reads processed, swap op and np for the next iteration.
+
+ pp = op;
+ op = np;
+ np = pp;
+
+ writeStatus("optimizePositions()-- converged: %6u reads\n", nConverged);
+ writeStatus("optimizePositions()-- changed: %6u reads\n", nChanged);
+
+ if (nChanged == 0)
+ break;
+ }
+
+ //
+ // Reset small reads. If we've placed a read too small, expand it (and all reads that overlap)
+ // to make the length not smaller.
+ //
+
+ writeStatus("optimizePositions()-- Expanding short reads with %u threads.\n", numThreads);
+
+#pragma omp parallel for schedule(dynamic, tiBlockSize)
+ for (uint32 ti=0; ti<tiLimit; ti++) {
+ Unitig *tig = operator[](ti);
+
+ if (tig == NULL)
+ continue;
+
+ tig->optimize_expand(op);
+ }
+
+ //
+ // Update the tig with new positions. op[] is the result of the last iteration.
+ //
+
+ writeStatus("optimizePositions()-- Updating positions.\n");
+
+ for (uint32 ti=0; ti<tiLimit; ti++) {
+ Unitig *tig = operator[](ti);
+
+ if (tig == NULL)
+ continue;
+
+ tig->optimize_setPositions(op, beVerbose);
+ tig->cleanUp();
+ }
+
+ // Cleanup and finish.
+
+ delete [] op;
+ delete [] np;
+
+ writeStatus("optimizePositions()-- Finished.\n");
+}
diff --git a/src/bogart/AS_BAT_Outputs.C b/src/bogart/AS_BAT_Outputs.C
index f6f8302..6ab0b28 100644
--- a/src/bogart/AS_BAT_Outputs.C
+++ b/src/bogart/AS_BAT_Outputs.C
@@ -80,20 +80,11 @@ writeTigsToStore(TigVector &tigs,
tig->_coverageStat = 1.0; // Default to just barely unique
tig->_microhetProb = 1.0; // Default to 100% probability of unique
- // Set the class.
+ // Set the class and some flags.
- if (utg->_isUnassembled == true)
- tig->_class = tgTig_unassembled;
-
- // Disabled, because bogart is not finding most of the true bubbles.
- //else if (utg->_isBubble == true)
- // tig->_class = tgTig_bubble;
-
- else
- tig->_class = tgTig_contig;
-
- tig->_suggestRepeat = (utg->_isRepeat == true);
- tig->_suggestCircular = (utg->_isCircular == true);
+ tig->_class = (utg->_isUnassembled == true) ? tgTig_unassembled : tgTig_contig;
+ tig->_suggestRepeat = utg->_isRepeat;
+ tig->_suggestCircular = utg->_isCircular;
tig->_layoutLen = utg->getLength();
diff --git a/src/bogart/AS_BAT_OverlapCache.C b/src/bogart/AS_BAT_OverlapCache.C
index 0e612db..cde4201 100644
--- a/src/bogart/AS_BAT_OverlapCache.C
+++ b/src/bogart/AS_BAT_OverlapCache.C
@@ -65,9 +65,7 @@ uint64 ovlCacheMagic = 0x65686361436c766fLLU; //0102030405060708LLU;
#define SALT_MASK (((uint64)1 << SALT_BITS) - 1)
-OverlapCache::OverlapCache(gkStore *gkp,
- ovStore *ovlStoreUniq,
- ovStore *ovlStoreRept,
+OverlapCache::OverlapCache(const char *ovlStorePath,
const char *prefix,
double maxErate,
uint32 minOverlap,
@@ -82,126 +80,126 @@ OverlapCache::OverlapCache(gkStore *gkp,
if (memlimit == UINT64_MAX) {
_memLimit = getPhysicalMemorySize();
writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (total physical memory).\n", _memLimit >> 20);
+ writeStatus("\n");
}
else if (memlimit > 0) {
_memLimit = memlimit;
writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (user supplied).\n", _memLimit >> 20);
+ writeStatus("\n");
}
else {
_memLimit = UINT64_MAX;
writeStatus("OverlapCache()-- using unlimited memory (-M 0).\n");
+ writeStatus("\n");
}
- // Need to initialize thread data before we can account for their size.
- _threadMax = omp_get_max_threads();
- _thread = new OverlapCacheThreadData [_threadMax];
-
- // And this too.
- _ovsMax = 1 * 1024 * 1024; // At 16B each, this is 16MB
-
// Account for memory used by read data, best overlaps, and tigs.
// The chunk graph is temporary, and should be less than the size of the tigs.
+ // Likewise, the buffers used for loading and scoring overlaps aren't accounted for.
+ //
+ // NOTES:
+ //
+ // memFI - read length,
+ //
+ // memUT - worst case, we have one unitig per read. also, maps of read-to-unitig and read-to-vector-position.
+ //
+ // memEP - each read adds two epValue points, the open and close points, and two uint32 pointers
+ // to the data.
+ //
+ // memEO - overlaps for computing error profiles. this is definitely a hack, but I can't think of
+ // any reasonable estimates. just reserve 25% of memory, which then dominates our accounting.
+ //
+ // memOS - make sure we're this much below using all the memory - allows for other stuff to run,
+ // and a little buffer in case we're too big.
uint64 memFI = RI->memoryUsage();
- uint64 memBE = RI->numReads() * sizeof(BestEdgeOverlap);
- uint64 memUL = RI->numReads() * sizeof(ufNode); // For read positions in tigs
- uint64 memUT = RI->numReads() * sizeof(uint32) / 16; // For tigs (assumes 32 read / unitig)
- uint64 memID = RI->numReads() * sizeof(uint32) * 2; // For maps of read id to unitig id
- uint64 memEP = RI->numReads() * Unitig::epValueSize() * 2; // For error profile
+ uint64 memBE = RI->numReads() * sizeof(BestOverlaps);
+ uint64 memUT = RI->numReads() * sizeof(Unitig) + RI->numReads() * sizeof(uint32) * 2;
+ uint64 memUL = RI->numReads() * sizeof(ufNode);
+
+ uint64 memEP = RI->numReads() * sizeof(uint32) * 2 + RI->numReads() * Unitig::epValueSize() * 2;
+ uint64 memEO = (_memLimit == UINT64_MAX) ? (0.0) : (0.25 * _memLimit);
- uint64 memC1 = (RI->numReads() + 1) * (sizeof(BAToverlap *) + sizeof(uint32));
- uint64 memC2 = _ovsMax * (sizeof(ovOverlap) + sizeof(uint64) + sizeof(uint64));
- uint64 memC3 = _threadMax * _thread[0]._batMax * sizeof(BAToverlap);
- uint64 memC4 = (RI->numReads() + 1) * sizeof(uint32);
+ uint64 memOS = (_memLimit < 0.9 * getPhysicalMemorySize()) ? (0.0) : (0.1 * getPhysicalMemorySize());
- uint64 memOS = (_memLimit == getPhysicalMemorySize()) ? (0.1 * getPhysicalMemorySize()) : 0.0;
+ uint64 memST = ((RI->numReads() + 1) * (sizeof(BAToverlap *) + sizeof(uint32)) + // Cache pointers
+ (RI->numReads() + 1) * sizeof(uint32) + // Num olaps stored per read
+ (RI->numReads() + 1) * sizeof(uint32)); // Num olaps allocated per read
- uint64 memTT = memFI + memBE + memUL + memUT + memID + memC1 + memC2 + memC3 + memC4 + memOS;
+
+ _memReserved = memFI + memBE + memUL + memUT + memEP + memEO + memST + memOS;
+ _memStore = memST;
+ _memAvail = (_memReserved + _memStore < _memLimit) ? (_memLimit - _memReserved - _memStore) : 0;
+ _memOlaps = 0;
writeStatus("OverlapCache()-- %7" F_U64P "MB for read data.\n", memFI >> 20);
writeStatus("OverlapCache()-- %7" F_U64P "MB for best edges.\n", memBE >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for unitig layouts.\n", memUL >> 20);
writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs.\n", memUT >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for id maps.\n", memID >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for error profiles.\n", memEP >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache pointers.\n", memC1 >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache initial bucket.\n", memC2 >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache thread data.\n", memC3 >> 20);
- writeStatus("OverlapCache()-- %7" F_U64P "MB for number of overlaps per read.\n", memC4 >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - read layouts.\n", memUL >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profiles.\n", memEP >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs - error profile overlaps.\n", memEO >> 20);
writeStatus("OverlapCache()-- %7" F_U64P "MB for other processes.\n", memOS >> 20);
writeStatus("OverlapCache()-- ---------\n");
- writeStatus("OverlapCache()-- %7" F_U64P "MB for data structures (sum of above).\n", memTT >> 20);
-
- if (_memLimit <= memTT) {
- int64 defecit = (int64)memTT - (int64)_memLimit;
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for data structures (sum of above).\n", _memReserved >> 20);
+ writeStatus("OverlapCache()-- ---------\n");
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap store structure.\n", _memStore >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap data.\n", _memAvail >> 20);
+ writeStatus("OverlapCache()-- ---------\n");
+ writeStatus("OverlapCache()-- %7" F_U64P "MB allowed.\n", _memLimit >> 20);
+ writeStatus("OverlapCache()--\n");
- writeStatus("OverlapCache()-- %7" F_S64P "MB available for overlaps.\n", defecit);
+ if (_memAvail == 0) {
writeStatus("OverlapCache()-- Out of memory before loading overlaps; increase -M.\n");
exit(1);
}
- _memLimit -= memTT;
- _memUsed = 0;
+ _maxEvalue = AS_OVS_encodeEvalue(maxErate);
+ _minOverlap = minOverlap;
- writeStatus("OverlapCache()-- %7" F_U64P "MB available for overlaps.\n", _memLimit >> 20);
- writeStatus("\n");
+ // Allocate space to load overlaps. With a NULL gkpStore we can't call the bgn or end methods.
+
+ _ovsMax = 16;
+ _ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax);
+ _ovsSco = new uint64 [_ovsMax];
+ _ovsTmp = new uint64 [_ovsMax];
+
+ // Allocate pointers to overlaps.
- _overlaps = new BAToverlap * [RI->numReads() + 1];
_overlapLen = new uint32 [RI->numReads() + 1];
_overlapMax = new uint32 [RI->numReads() + 1];
+ _overlaps = new BAToverlap * [RI->numReads() + 1];
- memset(_overlaps, 0, sizeof(BAToverlap *) * (RI->numReads() + 1));
memset(_overlapLen, 0, sizeof(uint32) * (RI->numReads() + 1));
memset(_overlapMax, 0, sizeof(uint32) * (RI->numReads() + 1));
+ memset(_overlaps, 0, sizeof(BAToverlap *) * (RI->numReads() + 1));
- _maxEvalue = AS_OVS_encodeEvalue(maxErate);
- _minOverlap = minOverlap;
-
- _minPer = 0;
- _maxPer = 0;
-
- _checkSymmetry = false;
+ // Open the overlap store.
- _ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax); // So can't call bgn or end.
- _ovsSco = new uint64 [_ovsMax];
- _ovsTmp = new uint64 [_ovsMax];
+ ovStore *ovlStore = new ovStore(ovlStorePath, NULL);
- _genomeSize = genomeSize;
+ // Load overlaps!
- _gkp = gkp;
- _ovlStoreUniq = ovlStoreUniq;
- _ovlStoreRept = ovlStoreRept;
+ computeOverlapLimit(ovlStore, genomeSize);
+ loadOverlaps(ovlStore, doSave);
- assert(_ovlStoreUniq != NULL);
- assert(_ovlStoreRept == NULL);
+ delete [] _ovs; _ovs = NULL; // There is a small cost with these arrays that we'd
+ delete [] _ovsSco; _ovsSco = NULL; // like to not have, and a big cost with ovlStore (in that
+ delete [] _ovsTmp; _ovsTmp = NULL; // it loaded updated erates into memory), so release
+ delete ovlStore; ovlStore = NULL; // these before symmetrizing overlaps.
- if (_memUsed > _memLimit)
- writeStatus("OverlapCache()-- ERROR: not enough memory to load ANY overlaps.\n"), exit(1);
-
- computeOverlapLimit();
- loadOverlaps(doSave);
symmetrizeOverlaps();
-
- delete [] _ovs; _ovs = NULL;
- delete [] _ovsSco; _ovsSco = NULL;
- delete [] _ovsTmp; _ovsTmp = NULL;
}
OverlapCache::~OverlapCache() {
- for (uint32 rr=0; rr<RI->numReads(); rr++)
- delete [] _overlaps[rr];
-
delete [] _overlaps;
delete [] _overlapLen;
delete [] _overlapMax;
- delete [] _ovs;
-
- delete [] _thread;
+ delete _overlapStorage;
}
@@ -227,181 +225,118 @@ OverlapCache::~OverlapCache() {
//
void
-OverlapCache::computeOverlapLimit(void) {
+OverlapCache::computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize) {
- _ovlStoreUniq->resetRange();
-
- // AS_OVS_numOverlapsPerFrag returns an array that starts at firstIIDrequested. This is usually
- // 1, unless the first read has no overlaps. In that case, firstIIDrequested will be the
- // first read with overlaps. This is a terrible interface.
-
- writeStatus("OverlapCache()-- Loading number of overlaps per read.\n");
+ ovlStore->resetRange();
uint32 frstRead = 0;
uint32 lastRead = 0;
- uint32 *numPer = _ovlStoreUniq->numOverlapsPerFrag(frstRead, lastRead);
+ uint32 *numPer = ovlStore->numOverlapsPerFrag(frstRead, lastRead);
uint32 totlRead = lastRead - frstRead + 1;
- uint32 numPerMax = findHighestOverlapCount();
-
- uint64 memAvail = (_memLimit - _memUsed);
-
- // Set the minimum number of overlaps per read to 2-3x coverage.
-
- _minPer = 2 * 3 * RI->numBases() / _genomeSize;
- writeStatus("OverlapCache()-- Retain at least " F_U32 " overlaps/read, based on %.2fx coverage.\n",
- _minPer, (double)RI->numBases() / _genomeSize);
+ // Set the minimum number of overlaps per read to twice coverage. Then set the maximum number of
+ // overlaps per read to a guess of what it will take to fill up memory.
- // Set the maximum number of overlaps per read to a guess of what it will take to fill up memory.
+ _minPer = 2 * RI->numBases() / genomeSize;
+ _maxPer = _memAvail / (RI->numReads() * sizeof(BAToverlap));
- _maxPer = memAvail / (RI->numReads() * sizeof(BAToverlap));
+ writeStatus("OverlapCache()-- Retain at least " F_U32 " overlaps/read, based on %.2fx coverage.\n", _minPer, (double)RI->numBases() / genomeSize);
+ writeStatus("OverlapCache()-- Initial guess at " F_U32 " overlaps/read.\n", _maxPer);
+ writeStatus("OverlapCache()--\n");
- writeStatus("OverlapCache()-- Initial guess at " F_U32 " overlaps/read (maximum " F_U32 " overlaps/read).\n",
- _maxPer, numPerMax);
+ if (_maxPer < _minPer)
+ writeStatus("OverlapCache()-- Not enough memory to load the minimum number of overlaps; increase -M.\n"), exit(1);
- if (_maxPer < 10)
- writeStatus("OverlapCache()-- ERROR: not enough memory to load overlaps!.\n"), exit(1);
+ uint64 totalOlaps = ovlStore->numOverlapsInRange();
- uint64 totalLoad = 0; // Total overlaps we would load at this threshold
- uint64 totalOlaps = _ovlStoreUniq->numOverlapsInRange();
+ uint64 olapLoad = 0; // Total overlaps we would load at this threshold
+ uint64 olapMem = 0;
uint32 numBelow = 0; // Number of reads below the threshold
uint32 numEqual = 0;
uint32 numAbove = 0; // Number of reads above the threshold
- uint32 lastMax = 0;
+ writeStatus("OverlapCache()-- Adjusting for sparse overlaps.\n");
+ writeStatus("OverlapCache()--\n");
+ writeStatus("OverlapCache()-- reads loading olaps olaps memory\n");
+ writeStatus("OverlapCache()-- olaps/read all some loaded free\n");
+ writeStatus("OverlapCache()-- ---------- ------- ------- ----------- ------- --------\n");
- uint32 adjust = 1;
-
- while (adjust > 0) {
- totalLoad = 0;
- numBelow = 0;
- numEqual = 0;
- numAbove = 0;
+ while (true) {
+ olapLoad = 0;
+ numBelow = 0;
+ numEqual = 0;
+ numAbove = 0;
for (uint32 i=0; i<totlRead; i++) {
if (numPer[i] < _maxPer) {
- numBelow++;
- totalLoad += numPer[i];
+ numBelow += 1;
+ olapLoad += numPer[i];
} else if (numPer[i] == _maxPer) {
- numEqual++;
- totalLoad += _maxPer;
+ numEqual += 1;
+ olapLoad += _maxPer;
} else {
- numAbove++;
- totalLoad += _maxPer;
+ numAbove += 1;
+ olapLoad += _maxPer;
}
}
- writeStatus("OverlapCache()-- %7" F_U32P " overlaps/read - load all for %7" F_U32P " reads, some for %7" F_U32P " reads - %12" F_U64P " overlaps to load - %4" F_U64P "MB\n",
- _maxPer,
- numBelow + numEqual,
- numAbove,
- totalLoad,
- totalLoad * sizeof(BAToverlap) >> 20);
+ olapMem = olapLoad * sizeof(BAToverlap);
+ // If we're too high, decrease the threshold and compute again. We shouldn't ever be too high.
- // All done, nothing to do here.
- if ((numAbove == 0) && (totalLoad * sizeof(BAToverlap) < memAvail)) {
- adjust = 0;
- }
-
- // This limit worked, let's try moving it a little higher.
- else if (totalLoad * sizeof(BAToverlap) < memAvail) {
- lastMax = _maxPer;
-
- adjust = (memAvail - totalLoad * sizeof(BAToverlap)) / numAbove / sizeof(BAToverlap);
- _maxPer += adjust;
-
- if (_maxPer > numPerMax)
- _maxPer = numPerMax;
+ if (_memAvail < olapMem) {
+ _maxPer--;
+ continue;
}
- // Whoops! Too high! Revert to the last and recompute statistics.
- else {
- adjust = 0;
- _maxPer = lastMax;
-
- totalLoad = 0;
- numBelow = 0;
- numEqual = 0;
- numAbove = 0;
-
- for (uint32 i=0; i<totlRead; i++) {
- if (numPer[i] < _maxPer) {
- numBelow++;
- totalLoad += numPer[i];
-
- } else if (numPer[i] == _maxPer) {
- numEqual++;
- totalLoad += _maxPer;
-
- } else {
- numAbove++;
- totalLoad += _maxPer;
- }
- }
-
- writeStatus("OverlapCache()-- _maxPer=%7" F_U32P " (overestimated, revert to last good and stop)\n", _maxPer);
- }
- }
+ // Log what we will be loading.
- // Report
+ writeStatus("OverlapCache()-- %7" F_U32P " %7" F_U32P " %7" F_U32P " %12" F_U64P " %6.2f%% %7" F_U64P " MB\n",
+ _maxPer,
+ numBelow + numEqual,
+ numAbove,
+ olapLoad,
+ 100.0 * olapLoad / totalOlaps,
+ (_memAvail - olapMem) >> 20);
- writeStatus("\n");
- writeStatus("OverlapCache()-- minPer = " F_U32 " overlaps/reads\n", _minPer);
- writeStatus("OverlapCache()-- maxPer = " F_U32 " overlaps/reads\n", _maxPer);
- writeStatus("OverlapCache()-- numBelow = " F_U32 " reads (all overlaps loaded)\n", numBelow);
- writeStatus("OverlapCache()-- numEqual = " F_U32 " reads (all overlaps loaded)\n", numEqual);
- writeStatus("OverlapCache()-- numAbove = " F_U32 " reads (some overlaps loaded)\n", numAbove);
- writeStatus("OverlapCache()-- totalLoad = " F_U64 " overlaps (%6.2f%%)\n", totalLoad, (totalOlaps > 0) ? (100.0 * totalLoad / totalOlaps) : 0.0);
- writeStatus("\n");
- writeStatus("OverlapCache()-- availForOverlaps = " F_U64 "MB\n", memAvail >> 20);
- writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB for organization\n", _memUsed >> 20);
- writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB for overlaps\n", (totalLoad * sizeof(BAToverlap)) >> 20);
- writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB used\n", (_memUsed + totalLoad * sizeof(BAToverlap)) >> 20);
- writeStatus("\n");
+ // If there are no more overlaps to load, we're done.
- _checkSymmetry = (numAbove > 0) ? true : false;
+ if (numAbove == 0)
+ break;
- delete [] numPer;
-}
+ // Otherwise, there is still (potentially) space left for more overlaps. Estimate how much
+ // higher we could push the threshold: compute how many more overlaps we could load before
+ // exceeding the memory limit, then assume we'd load that many overlaps for each of the
+ // numAbove reads.
+ int64 olapFree = (_memAvail - olapMem) / sizeof(BAToverlap);
+ int64 increase = olapFree / numAbove;
+ if (increase == 0)
+ break;
-uint32
-OverlapCache::findHighestOverlapCount(void) {
- uint32 fRead = 0;
- uint32 lRead = 0;
- uint32 *numPer = _ovlStoreUniq->numOverlapsPerFrag(fRead, lRead);
- uint32 totlRead = lRead - fRead + 1;
+ _maxPer += increase;
+ }
- uint32 numPerMax = 0;
+ // We used to (pre 6 Jul 2017) do the symmetry check only if we didn't load all overlaps.
+ // However, symmetry can also break if we use an error rate cutoff because - for reasons not
+ // explored - the error rate on symmetric overlaps differs. So, just enable this always.
+ //
+ // On a moderate coverage human nanopore assembly, it does:
+ //
+ // OverlapCache()-- Symmetrizing overlaps -- finding missing twins.
+ // OverlapCache()-- -- found 8609 missing twins in 51413413 overlaps, 8002 are strong.
+ // OverlapCache()-- Symmetrizing overlaps -- dropping weak non-twin overlaps.
+ // OverlapCache()-- -- dropped 454 overlaps.
+ // OverlapCache()-- Symmetrizing overlaps -- adding 8155 missing twin overlaps.
- for (uint32 i=0; i<totlRead; i++)
- if (numPerMax < numPer[i])
- numPerMax = numPer[i];
+ _checkSymmetry = (numAbove > 0) ? true : false;
+ _checkSymmetry = true;
delete [] numPer;
-
- return(numPerMax);
-}
-
-
-
-void
-OverlapCache::allocateLoadingSpace(void) {
-
- _ovsMax = findHighestOverlapCount();
-
- _ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax); // So can't call bgn or end.
- _ovsSco = new uint64 [_ovsMax];
- _ovsTmp = new uint64 [_ovsMax];
-
- _memUsed += (_ovsMax) * sizeof(ovOverlap);
- _memUsed += (_ovsMax) * sizeof(uint64);
- _memUsed += (_ovsMax) * sizeof(uint64);
}
@@ -490,22 +425,39 @@ OverlapCache::filterDuplicates(uint32 &no) {
uint32
OverlapCache::filterOverlaps(uint32 maxEvalue, uint32 minOverlap, uint32 no) {
- uint32 ns = 0;
+ uint32 ns = 0;
+ bool beVerbose = false;
+
+ //beVerbose = (_ovs[0].a_iid == 3514657);
for (uint32 ii=0; ii<no; ii++) {
_ovsSco[ii] = 0; // Overlaps 'continue'd below will be filtered, even if 'no filtering' is needed.
if ((RI->readLength(_ovs[ii].a_iid) == 0) || // At least one read in the overlap is deleted
- (RI->readLength(_ovs[ii].b_iid) == 0))
+ (RI->readLength(_ovs[ii].b_iid) == 0)) {
+ if (beVerbose)
+ fprintf(stderr, "olap %d involves deleted reads - %u %s - %u %s\n",
+ ii,
+ _ovs[ii].a_iid, (RI->readLength(_ovs[ii].a_iid) == 0) ? "deleted" : "active",
+ _ovs[ii].b_iid, (RI->readLength(_ovs[ii].b_iid) == 0) ? "deleted" : "active");
continue;
+ }
- if (_ovs[ii].evalue() > maxEvalue) // Too noisy to care
+ if (_ovs[ii].evalue() > maxEvalue) { // Too noisy to care
+ if (beVerbose)
+ fprintf(stderr, "olap %d too noisy evalue %f > maxEvalue %f\n",
+ ii, AS_OVS_decodeEvalue(_ovs[ii].evalue()), AS_OVS_decodeEvalue(maxEvalue));
continue;
+ }
uint32 olen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
- if (olen < minOverlap) // Too short to care
+ if (olen < minOverlap) { // Too short to care
+ if (beVerbose)
+ fprintf(stderr, "olap %d too short olen %u minOverlap %u\n",
+ ii, olen, minOverlap);
continue;
+ }
// Just right!
@@ -545,44 +497,60 @@ OverlapCache::filterOverlaps(uint32 maxEvalue, uint32 minOverlap, uint32 no) {
void
-OverlapCache::loadOverlaps(bool doSave) {
+OverlapCache::loadOverlaps(ovStore *ovlStore, bool doSave) {
if (load() == true)
return;
- assert(_ovlStoreUniq != NULL);
- assert(_ovlStoreRept == NULL);
+ writeStatus("OverlapCache()--\n");
+ writeStatus("OverlapCache()-- Loading overlaps.\n");
+ writeStatus("OverlapCache()--\n");
+ writeStatus("OverlapCache()-- read from store saved in cache\n");
+ writeStatus("OverlapCache()-- ------------ --------- ------------ ---------\n");
- _ovlStoreUniq->resetRange();
+ ovlStore->resetRange();
uint64 numTotal = 0;
uint64 numLoaded = 0;
uint64 numDups = 0;
uint32 numReads = 0;
- uint64 numStore = _ovlStoreUniq->numOverlapsInRange();
+ uint64 numStore = ovlStore->numOverlapsInRange();
if (numStore == 0)
writeStatus("ERROR: No overlaps in overlap store?\n"), exit(1);
- // Could probably easily extend to multiple stores. Needs to interleave the two store
- // loads, can't do one after the other as we require all overlaps for a single read
- // be in contiguous memory.
+ _overlapStorage = new OverlapStorage(ovlStore->numOverlapsInRange());
while (1) {
- uint32 numOvl = _ovlStoreUniq->numberOfOverlaps(); // Query how many overlaps for the next read.
+ uint32 numOvl = ovlStore->numberOfOverlaps(); // Query how many overlaps for the next read.
if (numOvl == 0) // If no overlaps, we're at the end of the store.
break;
+ if (_ovsMax < numOvl) {
+ delete [] _ovs;
+ delete [] _ovsSco;
+ delete [] _ovsTmp;
+
+ _ovsMax = numOvl + 1024;
+
+ _ovs = ovOverlap::allocateOverlaps(NULL /* gkpStore */, _ovsMax);
+ _ovsSco = new uint64 [_ovsMax];
+ _ovsTmp = new uint64 [_ovsMax];
+ }
+
assert(numOvl <= _ovsMax);
// Actually load the overlaps, then detect and remove overlaps between the same pair, then
// filter short and low quality overlaps.
- uint32 no = _ovlStoreUniq->readOverlaps(_ovs, _ovsMax); // no == total overlaps == numOvl
+ uint32 no = ovlStore->readOverlaps(_ovs, _ovsMax); // no == total overlaps == numOvl
uint32 nd = filterDuplicates(no); // nd == duplicated overlaps (no is decreased by this amount)
uint32 ns = filterOverlaps(_maxEvalue, _minOverlap, no); // ns == acceptable overlaps
+ //if (_ovs[0].a_iid == 3514657)
+ // fprintf(stderr, "Loaded %u overlaps - no %u nd %u ns %u\n", numOvl, no, nd, ns);
+
// Allocate space for the overlaps. Allocate a multiple of 8k, assumed to be the page size.
//
// If we're loading all overlaps (ns == no) we don't need to overallocate. Otherwise, we're
@@ -593,11 +561,11 @@ OverlapCache::loadOverlaps(bool doSave) {
if (ns > 0) {
uint32 id = _ovs[0].a_iid;
- _overlapMax[id] = (ns == no) ? (ns) : ((((sizeof(BAToverlap) * ns / 8192) + 1) * 8192) / sizeof(BAToverlap));
+ _overlapMax[id] = ns;
_overlapLen[id] = ns;
- _overlaps[id] = new BAToverlap [ _overlapMax[id] ];
+ _overlaps[id] = _overlapStorage->get(_overlapMax[id]);
- _memUsed += _overlapMax[id] * sizeof(BAToverlap);
+ _memOlaps += _overlapMax[id] * sizeof(BAToverlap);
uint32 oo=0;
@@ -629,17 +597,19 @@ OverlapCache::loadOverlaps(bool doSave) {
numLoaded += ns;
numDups += nd;
- if ((numReads++ % 100000) == 0)
- writeStatus("OverlapCache()-- Loading: overlaps processed %12" F_U64P " (%06.2f%%) loaded %12" F_U64P " (%06.2f%%) droppeddupe %12" F_U64P " (%06.2f%%)\n",
+ if ((numReads++ % 100000) == 99999)
+ writeStatus("OverlapCache()-- %12" F_U64P " (%06.2f%%) %12" F_U64P " (%06.2f%%)\n",
numTotal, 100.0 * numTotal / numStore,
- numLoaded, 100.0 * numLoaded / numStore,
- numDups, 100.0 * numDups / numStore);
+ numLoaded, 100.0 * numLoaded / numStore);
}
- writeStatus("OverlapCache()-- Loading: overlaps processed %12" F_U64P " (%06.2f%%) loaded %12" F_U64P " (%06.2f%%) droppeddupe %12" F_U64P " (%06.2f%%)\n",
+ writeStatus("OverlapCache()-- ------------ --------- ------------ ---------\n");
+ writeStatus("OverlapCache()-- %12" F_U64P " (%06.2f%%) %12" F_U64P " (%06.2f%%)\n",
numTotal, 100.0 * numTotal / numStore,
- numLoaded, 100.0 * numLoaded / numStore,
- numDups, 100.0 * numDups / numStore);
+ numLoaded, 100.0 * numLoaded / numStore);
+
+ writeStatus("OverlapCache()--\n");
+ writeStatus("OverlapCache()-- Ignored %lu duplicate overlaps.\n", numDups);
if (doSave == true)
save();
@@ -696,6 +666,9 @@ searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
void
OverlapCache::symmetrizeOverlaps(void) {
+ uint32 fiLimit = RI->numReads();
+ uint32 numThreads = omp_get_max_threads();
+ uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
if (_checkSymmetry == false)
return;
@@ -706,15 +679,14 @@ OverlapCache::symmetrizeOverlaps(void) {
// b-read has loaded all overlaps (the overlap we're searching for must exist) but we can't.
// We must still mark the oevrlap as being symmetric.
- writeStatus("OverlapCache()-- Symmetrizing overlaps -- finding missing twins.\n");
+ writeStatus("OverlapCache()--\n");
+ writeStatus("OverlapCache()-- Symmetrizing overlaps.\n");
+ writeStatus("OverlapCache()-- Finding missing twins.\n");
-#pragma omp parallel for schedule(dynamic, RI->numReads() / 1000)
+#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
nonsymPerRead[rr] = 0;
- if ((rr % 100) == 0)
- fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
-
for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
uint32 rb = _overlaps[rr][oo].b_iid;
@@ -746,7 +718,7 @@ OverlapCache::symmetrizeOverlaps(void) {
nCritical += nonsymPerRead[rr];
}
- writeStatus("OverlapCache()-- -- found %llu missing twins in %llu overlaps, %llu are strong.\n", nOnly, nOverlaps, nCritical);
+ writeStatus("OverlapCache()-- Found %llu missing twins in %llu overlaps, %llu are strong.\n", nOnly, nOverlaps, nCritical);
// Score all the overlaps (again) and drop the lower quality ones. We need to drop half of the
// non-twin overlaps, but also want to retain some minimum number.
@@ -755,58 +727,98 @@ OverlapCache::symmetrizeOverlaps(void) {
// need to keep these, only because figuring out which ones are 'saved' above will be a total
// pain in the ass.
- double fractionToDrop = 0.6;
+ // Allocate some scratch space for each thread
- uint64 nDropped = 0;
+ uint64 **ovsScoScratch = new uint64 * [numThreads];
+ uint64 **ovsTmpScratch = new uint64 * [numThreads];
+ uint64 *nDroppedScratch = new uint64 [numThreads];
-#warning this should be parallelized
- writeStatus("OverlapCache()-- Symmetrizing overlaps -- dropping weak non-twin overlaps.\n");
+ for (uint32 tt=0; tt<numThreads; tt++) {
+ ovsScoScratch[tt] = new uint64 [_ovsMax];
+ ovsTmpScratch[tt] = new uint64 [_ovsMax];
+ nDroppedScratch[tt] = 0;
+ }
+
+ writeStatus("OverlapCache()-- Dropping weak non-twin overlaps; allocated " F_U64 " MB scratch space.\n",
+ ((2 * sizeof(uint64 *) + sizeof(uint64)) * numThreads) >> 20);
+
+ // As advertised, score all the overlaps and drop the weak ones.
+
+ double fractionToDrop = 0.6;
+#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
- if (_overlapLen[rr] <= _minPer)
+
+ if (_overlapLen[rr] <= _minPer) // If already too few overlaps, leave them all as is.
continue;
- if ((rr % 100) == 0)
- fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
+ uint64 *ovsSco = ovsScoScratch[omp_get_thread_num()];
+ uint64 *ovsTmp = ovsTmpScratch[omp_get_thread_num()];
+ uint64 &nDropped = nDroppedScratch[omp_get_thread_num()];
for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
- _ovsSco[oo] = RI->overlapLength( _overlaps[rr][oo].a_iid, _overlaps[rr][oo].b_iid, _overlaps[rr][oo].a_hang, _overlaps[rr][oo].b_hang);
- _ovsSco[oo] <<= AS_MAX_EVALUE_BITS;
- _ovsSco[oo] |= (~_ovs[oo].evalue()) & ERR_MASK;
- _ovsSco[oo] <<= SALT_BITS;
- _ovsSco[oo] |= oo & SALT_MASK;
+ ovsSco[oo] = RI->overlapLength( _overlaps[rr][oo].a_iid, _overlaps[rr][oo].b_iid, _overlaps[rr][oo].a_hang, _overlaps[rr][oo].b_hang);
+ ovsSco[oo] <<= AS_MAX_EVALUE_BITS;
+ ovsSco[oo] |= (~_overlaps[rr][oo].evalue) & ERR_MASK;
+ ovsSco[oo] <<= SALT_BITS;
+ ovsSco[oo] |= oo & SALT_MASK;
- _ovsTmp[oo] = _ovsSco[oo];
+ ovsTmp[oo] = ovsSco[oo];
}
- sort(_ovsTmp, _ovsTmp + _overlapLen[rr]);
+ sort(ovsTmp, ovsTmp + _overlapLen[rr]);
uint32 minIdx = (uint32)floor(nonsymPerRead[rr] * fractionToDrop);
if (minIdx < _minPer)
minIdx = _minPer;
- uint64 minScore = _ovsTmp[minIdx];
+ uint64 minScore = ovsTmp[minIdx];
for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
- if ((_ovsSco[oo] < minScore) && (_overlaps[rr][oo].symmetric == false)) {
+ if ((ovsSco[oo] < minScore) && (_overlaps[rr][oo].symmetric == false)) {
nDropped++;
_overlapLen[rr]--;
_overlaps[rr][oo] = _overlaps[rr][_overlapLen[rr]];
- _ovsSco [oo] = _ovsSco [_overlapLen[rr]];
+ ovsSco [oo] = ovsSco [_overlapLen[rr]];
oo--;
}
}
for (uint32 oo=0; oo<_overlapLen[rr]; oo++)
if (_overlaps[rr][oo].symmetric == false)
- assert(minScore <= _ovsSco[oo]);
+ assert(minScore <= ovsSco[oo]);
}
+ // Are we sane?
+
+ for (uint32 rr=RI->numReads()+1; rr-- > 0; )
+ if (_overlapLen[rr] > 0) {
+ assert(_overlaps[rr][0 ].a_iid == rr);
+ assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
+ }
+
+ // Cleanup and log results.
+
+ uint64 nDropped = 0;
+
+ for (uint32 ii=0; ii<numThreads; ii++)
+ nDropped += nDroppedScratch[ii];
+
+
delete [] nonsymPerRead;
nonsymPerRead = NULL;
- writeStatus("OverlapCache()-- -- dropped %llu overlaps.\n", nDropped);
+ for (uint32 tt=0; tt<numThreads; tt++) {
+ delete [] ovsScoScratch[tt];
+ delete [] ovsTmpScratch[tt];
+ }
+
+ delete [] ovsScoScratch;
+ delete [] ovsTmpScratch;
+ delete [] nDroppedScratch;
+
+ writeStatus("OverlapCache()-- Dropped %llu overlaps; scratch space released.\n", nDropped);
// Finally, run through all the saved overlaps and count how many we need to add to each read.
@@ -826,20 +838,77 @@ OverlapCache::symmetrizeOverlaps(void) {
for (uint32 rr=0; rr<RI->numReads()+1; rr++)
nToAdd += toAddPerRead[rr];
- writeStatus("OverlapCache()-- Symmetrizing overlaps -- adding %llu missing twin overlaps.\n", nToAdd);
+ writeStatus("OverlapCache()-- Adding %llu missing twin overlaps.\n", nToAdd);
+ //
// Expand or shrink space for the overlaps.
+ //
- for (uint32 rr=0; rr<RI->numReads()+1; rr++)
- if (_overlapLen[rr] + toAddPerRead[rr] > _overlapMax[rr])
- resizeArray(_overlaps[rr], _overlapLen[rr], _overlapMax[rr], _overlapLen[rr] + toAddPerRead[rr] + 2048);
+ // Allocate new temporary pointers for each read.
+
+ BAToverlap **nPtr = new BAToverlap * [RI->numReads()+1];
+
+ memset(nPtr, 0, sizeof(BAToverlap *) * (RI->numReads()+1));
+
+ // The new storage must start after the old storage. And if it starts after the old storage ends,
+ // we can copy easier. If not, we just grab some empty overlaps to make space.
+
+ // A complication occurs at the end of a single segment. If there isn't enough space in the
+ // current segment for the overlaps, we skip ahead to the next segment without accounting for the
+ // overlaps we skip. It's possible for the new size to fit into this unused space, which would
+ // then put the old overlaps physically after the new ones.
+ //
+ // [ olaps1+unused | olaps2+unused | olaps3+unused | ] [ olaps4+unused | ..... ]
+ // [ olaps1+new | olaps2+new | olaps3+new | olaps4+new | ] [ olaps5+new | .... ]
+ //
+ // So, we need to compare not overlap counts, but raw positions in the OverlapStorage object.
+
+ OverlapStorage *oldS = new OverlapStorage(_overlapStorage); // Recreates the existing layout without allocating anything
+ OverlapStorage *newS = _overlapStorage; // Resets pointers for the new layout, using existing space
+
+ newS->reset();
+
+ for (uint32 rr=1; rr<RI->numReads()+1; rr++) {
+ nPtr[rr] = newS->get(_overlapLen[rr] + toAddPerRead[rr]); // Grab the pointer to the new space
+
+ oldS->get(_overlapMax[rr]); // Move old storages ahead
+
+ newS->advance(oldS); // Ensure newS is not before where oldS is.
+
+ _overlapMax[rr] = _overlapLen[rr] + toAddPerRead[rr];
+ }
+
+ // With new pointers in hand, copy overlap data - backwards - to the new locations.
+ // (Remeber that the reads are 1..numReads(), not 0..numReads()-1)
+
+ for (uint32 rr=RI->numReads()+1; rr-- > 0; ) {
+ if (_overlapLen[rr] == 0)
+ continue;
+
+ assert(_overlaps[rr][0 ].a_iid == rr);
+ assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
+
+ for (uint32 oo=_overlapLen[rr]; oo-- > 0; )
+ nPtr[rr][oo] = _overlaps[rr][oo];
+
+ assert(_overlaps[rr][0 ].a_iid == rr);
+ assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
+ }
+
+ // Swap pointers to the pointers and cleanup.
+
+ delete [] _overlaps;
+ _overlaps = nPtr;
+
+ delete oldS;
+ // newS is the original _overlapStorage, which we could delete, we'd just lose all the overlaps.
// Copy non-twin overlaps to their twin.
+ //
+ // This cannot (easily) be parallelized. We're iterating over overlaps in read rr, but inserting
+ // overlaps into read rb.
for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
- if ((rr % 100) == 0)
- fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
-
for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
if (_overlaps[rr][oo].symmetric == true)
continue;
@@ -865,30 +934,36 @@ OverlapCache::symmetrizeOverlaps(void) {
}
}
- for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ // Check that everything worked.
+
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
assert(toAddPerRead[rr] == 0);
+ if (_overlapLen[rr] == 0)
+ continue;
+
+ assert(_overlaps[rr][0 ].a_iid == rr);
+ assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
+ }
+
+ // Cleanup.
+
delete [] toAddPerRead;
toAddPerRead = NULL;
- for (uint32 rr=0; rr<RI->numReads()+1; rr++)
- if (_overlaps[rr] != NULL) {
- assert(_overlaps[rr][0 ].a_iid == rr);
- assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
- }
-
// Probably should sort again. Not sure if anything depends on this.
for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
}
- writeStatus("OverlapCache()-- Symmetrizing overlaps -- finished.\n");
+ writeStatus("OverlapCache()-- Finished.\n");
}
bool
OverlapCache::load(void) {
+#if 0
char name[FILENAME_MAX];
FILE *file;
size_t numRead;
@@ -916,12 +991,12 @@ OverlapCache::load(void) {
if (magic != ovlCacheMagic)
writeStatus("OverlapCache()-- ERROR: File '%s' isn't a bogart ovlCache.\n", name), exit(1);
- AS_UTL_safeRead(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
-
- _threadMax = omp_get_max_threads();
- _thread = new OverlapCacheThreadData [_threadMax];
+ AS_UTL_safeRead(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_memReserved, "overlapCache_memReserved", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_memAvail, "overlapCache_memAvail", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_memStore, "overlapCache_memStore", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_memOlaps, "overlapCache_memOlaps", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
_overlaps = new BAToverlap * [RI->numReads() + 1];
_overlapLen = new uint32 [RI->numReads() + 1];
@@ -945,12 +1020,15 @@ OverlapCache::load(void) {
fclose(file);
return(true);
+#endif
+ return(false);
}
void
OverlapCache::save(void) {
+#if 0
char name[FILENAME_MAX];
FILE *file;
@@ -968,20 +1046,24 @@ OverlapCache::save(void) {
uint32 ovserrbits = AS_MAX_EVALUE_BITS;
uint32 ovshngbits = AS_MAX_READLEN_BITS + 1;
- AS_UTL_safeWrite(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
+ AS_UTL_safeWrite(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
+ AS_UTL_safeWrite(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
+ AS_UTL_safeWrite(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memReserved, "overlapCache_memReserved", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memAvail, "overlapCache_memAvail", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memStore, "overlapCache_memStore", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memOlaps, "overlapCache_memOlaps", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, _overlapLen, "overlapCache_len", sizeof(uint32), RI->numReads() + 1);
- AS_UTL_safeWrite(file, _overlapMax, "overlapCache_max", sizeof(uint32), RI->numReads() + 1);
+ AS_UTL_safeWrite(file, _overlapLen, "overlapCache_len", sizeof(uint32), RI->numReads() + 1);
+ AS_UTL_safeWrite(file, _overlapMax, "overlapCache_max", sizeof(uint32), RI->numReads() + 1);
for (uint32 rr=0; rr<RI->numReads() + 1; rr++)
AS_UTL_safeWrite(file, _overlaps[rr], "overlapCache_ovl", sizeof(BAToverlap), _overlapLen[rr]);
fclose(file);
+#endif
}
diff --git a/src/bogart/AS_BAT_OverlapCache.H b/src/bogart/AS_BAT_OverlapCache.H
index 7353b57..9c388da 100644
--- a/src/bogart/AS_BAT_OverlapCache.H
+++ b/src/bogart/AS_BAT_OverlapCache.H
@@ -120,6 +120,7 @@ public:
return(AS_OVS_decodeEvalue(evalue));
}
+#if AS_MAX_READLEN_BITS < 24
uint64 evalue : AS_MAX_EVALUE_BITS; // 12
int64 a_hang : AS_MAX_READLEN_BITS+1; // 21+1
int64 b_hang : AS_MAX_READLEN_BITS+1; // 21+1
@@ -130,12 +131,26 @@ public:
uint32 a_iid;
uint32 b_iid;
-};
#if (AS_MAX_EVALUE_BITS + (AS_MAX_READLEN_BITS + 1) + (AS_MAX_READLEN_BITS + 1) + 1 + 1 + 1 > 64)
#error not enough bits to store overlaps. decrease AS_MAX_EVALUE_BITS or AS_MAX_READLEN_BITS.
#endif
+#else
+ int32 a_hang;
+ int32 b_hang;
+
+ uint32 evalue : AS_MAX_EVALUE_BITS; // 12
+ uint32 flipped : 1; // 1
+ uint32 filtered : 1; // 1
+ uint32 symmetric : 1; // 1 - twin overlap exists
+
+ uint32 a_iid;
+ uint32 b_iid;
+#endif
+
+};
+
inline
@@ -146,27 +161,94 @@ BAToverlap_sortByEvalue(BAToverlap const &a, BAToverlap const &b) {
-class OverlapCacheThreadData {
+class OverlapStorage {
public:
- OverlapCacheThreadData() {
- _batMax = 1 * 1024 * 1024; // At 8B each, this is 8MB
- _bat = new BAToverlap [_batMax];
+ OverlapStorage(uint64 nOvl) {
+ _osAllocLen = 1024 * 1024 * 1024 / sizeof(BAToverlap); // 1GB worth of overlaps
+ _osLen = 0; // osMax is cheap and we overallocate it.
+ _osPos = 0; // If allocLen is small, we can end up with
+ _osMax = 2 * nOvl / _osAllocLen + 2; // more blocks than expected, when overlaps
+ _os = new BAToverlap * [_osMax]; // don't fit in the remaining space.
+
+ memset(_os, 0, sizeof(BAToverlap *) * _osMax);
+
+ _os[0] = new BAToverlap [_osAllocLen]; // Alloc first block, keeps getOverlapStorage() simple
+ };
+
+ OverlapStorage(OverlapStorage *original) {
+ _osAllocLen = original->_osAllocLen;
+ _osLen = 0;
+ _osPos = 0;
+ _osMax = original->_osMax;
+ _os = NULL;
+ };
+
+ ~OverlapStorage() {
+ if (_os == NULL)
+ return;
+
+ for (uint32 ii=0; ii<_osMax; ii++)
+ delete [] _os[ii];
+ delete [] _os;
+ }
+
+
+ void reset(void) {
+ _osLen = 0;
+ _osPos = 0;
+ };
+
+
+ BAToverlap *get(void) {
+ if (_os == NULL)
+ return(NULL);
+ return(_os[_osLen] + _osPos);
};
- ~OverlapCacheThreadData() {
- delete [] _bat;
+
+ BAToverlap *get(uint32 nOlaps) {
+ if (_osPos + nOlaps > _osAllocLen) { // If we don't fit in the current allocation,
+ _osPos = 0; // move to the next one.
+ _osLen++;
+ }
+
+ _osPos += nOlaps; // Reserve space for these overlaps.
+
+ assert(_osLen < _osMax);
+
+ if (_os == NULL) // If we're not allowed to allocate,
+ return(NULL); // return nothing.
+
+ if (_os[_osLen] == NULL) // Otherwise, make sure we have space and return
+ _os[_osLen] = new BAToverlap [_osAllocLen]; // that space.
+
+ return(_os[_osLen] + _osPos - nOlaps);
+ };
+
+
+ void advance(OverlapStorage *that) {
+ if (((that->_osLen < _osLen)) || // That segment before mine, or
+ ((that->_osLen == _osLen) && (that->_osPos <= _osPos))) // that segment equal and position before mine
+ return; // So no need to modify
+
+ _osLen = that->_osLen;
+ _osPos = that->_osPos;
};
- uint32 _batMax; // For returning overlaps
- BAToverlap *_bat; //
+
+private:
+ uint32 _osAllocLen; // Size of each allocation
+ uint32 _osLen; // Current allocation being used
+ uint32 _osPos; // Position in current allocation; next free overlap
+ uint32 _osMax; // Number of allocations we can make
+ BAToverlap **_os; // Allocations
};
+
class OverlapCache {
public:
- OverlapCache(gkStore *gkp,
- ovStore *ovlStoreUniq,
- ovStore *ovlStoreRept,
+ OverlapCache(const char *ovlStorePath,
const char *prefix,
double maxErate,
uint32 minOverlap,
@@ -176,14 +258,11 @@ public:
~OverlapCache();
private:
- uint32 findHighestOverlapCount(void);
- void allocateLoadingSpace(void);
-
uint32 filterOverlaps(uint32 maxOVSerate, uint32 minOverlap, uint32 no);
uint32 filterDuplicates(uint32 &no);
- void computeOverlapLimit(void);
- void loadOverlaps(bool doSave);
+ void computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize);
+ void loadOverlaps(ovStore *ovlStore, bool doSave);
void symmetrizeOverlaps(void);
public:
@@ -199,12 +278,22 @@ private:
private:
const char *_prefix;
- uint64 _memLimit;
- uint64 _memUsed;
+ uint64 _memLimit; // Expected max size of bogart
+ uint64 _memReserved; // Memory to reserve for processing
+ uint64 _memAvail; // Memory available for storing overlaps
+ uint64 _memStore; // Memory used to support overlaps
+ uint64 _memOlaps; // Memory used to store overlaps
- BAToverlap **_overlaps;
uint32 *_overlapLen;
uint32 *_overlapMax;
+ BAToverlap **_overlaps;
+
+ // Instead of allocating space for overlaps per read (which has some visible but unknown size
+ // cost with each allocation), or in a single massive allocation (which we can't resize), we
+ // allocate overlaps in large blocks then set pointers into each block where overlaps for each
+ // read start. This is managed by OverlapStorage.
+
+ OverlapStorage *_overlapStorage;
uint32 _maxEvalue; // Don't load overlaps with high error
uint32 _minOverlap; // Don't load overlaps that are short
@@ -219,14 +308,7 @@ private:
uint64 *_ovsSco; // For scoring overlaps during the load
uint64 *_ovsTmp; // For picking out a score threshold
- uint64 _threadMax;
- OverlapCacheThreadData *_thread;
-
uint64 _genomeSize;
-
- gkStore *_gkp;
- ovStore *_ovlStoreUniq; // Pointers to input stores
- ovStore *_ovlStoreRept;
};
diff --git a/src/bogart/AS_BAT_PlaceContains.C b/src/bogart/AS_BAT_PlaceContains.C
index 982ee5b..4d36e5a 100644
--- a/src/bogart/AS_BAT_PlaceContains.C
+++ b/src/bogart/AS_BAT_PlaceContains.C
@@ -78,8 +78,8 @@ breakSingletonTigs(TigVector &tigs) {
void
-placeUnplacedUsingAllOverlaps(TigVector &tigs,
- const char *prefix) {
+placeUnplacedUsingAllOverlaps(TigVector &tigs,
+ const char *UNUSED(prefix)) {
uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
diff --git a/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C
index 1753546..9863952 100644
--- a/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C
+++ b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C
@@ -36,99 +36,176 @@
#include "intervalList.H"
+#include <vector>
+#include <algorithm>
-#undef TEST_ALT
+void
+placeRead_fromOverlaps(TigVector &tigs,
+ Unitig *target,
+ uint32 fid,
+ uint32 flags,
+ uint32 ovlLen,
+ BAToverlap *ovl,
+ uint32 &ovlPlaceLen,
+ overlapPlacement *ovlPlace) {
-overlapPlacement *
-placeRead_fromOverlaps(TigVector &tigs,
- Unitig *target,
- uint32 fid,
- uint32 flags,
- uint32 ovlLen,
- BAToverlap *ovl) {
- overlapPlacement *ovlPlace = new overlapPlacement[ovlLen];
+ if (logFileFlagSet(LOG_PLACE_READ))
+ writeLog("pROU()-- placements for read %u with %u overlaps\n", fid, ovlLen);
- for (uint32 i=0; i<ovlLen; i++) {
- int32 tigID = tigs.inUnitig(ovl[i].b_iid);
- Unitig *tig = tigs[tigID];
+ for (uint32 oo=0; oo<ovlLen; oo++) {
+ bool disallow = false;
+ uint32 btID = tigs.inUnitig(ovl[oo].b_iid);
- assert(ovl[i].a_iid == fid);
+ assert(ovl[oo].a_iid == fid);
- if (tigID == 0) // Skip if overlapping read isn't in a tig yet - unplaced contained, or garbage read.
+ if ((btID == 0) || // Skip if overlapping read isn't in a tig yet - unplaced contained, or garbage read.
+ ((target != NULL) && (target->id() != btID))) // Skip if we requested a specific tig and if this isn't it.
continue;
- if ((target != NULL) && (target != tig)) // Skip if we requested a specific tig and if this isn't it.
+ Unitig *btig = tigs[btID];
+ ufNode &bread = btig->ufpath[ tigs.ufpathIdx(ovl[oo].b_iid) ];
+
+ if (btig->_isUnassembled == true) // Skip if overlapping read is in an unassembled contig.
continue;
- // Place the read relative to the other read.
+ SeqInterval apos; // Position of the read in the btig
+ SeqInterval bver; // Bases covered by the overlap in the B tig
- BestEdgeOverlap edge(ovl[i]);
- ufNode read;
+ // Place the read relative to the other read. The overlap we have is relative to the A read,
+ // so hangs need to be subtracted from the other coordinate.
+ //
+ // Pictures all show positive hangs.
- if (tig->placeRead(read, fid, ovl[i].AEndIs3prime(), &edge) == false) {
- if (logFileFlagSet(LOG_PLACE_READ))
- writeLog("pRUO()-- WARNING: Failed to place with overlap %u %u hangs %u %u flipped %u\n",
- ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang, ovl[i].flipped);
- continue;
+ // A ------------> (b)
+ // B (a) ------------>
+ if ((ovl[oo].flipped == false) && (bread.position.isForward() == true)) {
+ apos.bgn = bread.position.min() - ovl[oo].a_hang;
+ apos.end = bread.position.max() - ovl[oo].b_hang;
+
+ bver.bgn = bread.position.min() - ((ovl[oo].a_hang > 0) ? 0 : ovl[oo].a_hang);
+ bver.end = bread.position.max() - ((ovl[oo].b_hang > 0) ? ovl[oo].b_hang : 0);
}
- // Save the placement in our work space.
+ // A ------------> (b)
+ // B (a) <-------------
+ if ((ovl[oo].flipped == true) && (bread.position.isForward() == false)) {
+ apos.bgn = bread.position.min() - ovl[oo].a_hang;
+ apos.end = bread.position.max() - ovl[oo].b_hang;
+
+ bver.bgn = bread.position.min() - ((ovl[oo].a_hang > 0) ? 0 : ovl[oo].a_hang);
+ bver.end = bread.position.max() - ((ovl[oo].b_hang > 0) ? ovl[oo].b_hang : 0);
+ }
- uint32 olen = RI->overlapLength(ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang);
- uint32 flen = RI->readLength(ovl[i].a_iid);
-
- ovlPlace[i].frgID = fid;
- ovlPlace[i].refID = ovl[i].b_iid;
- ovlPlace[i].tigID = tig->id();
- ovlPlace[i].position = read.position;
- ovlPlace[i].verified.bgn = INT32_MAX;
- ovlPlace[i].verified.end = INT32_MIN;
- ovlPlace[i].covered.bgn = (ovl[i].a_hang < 0) ? 0 : ovl[i].a_hang; // The portion of the read
- ovlPlace[i].covered.end = (ovl[i].b_hang > 0) ? flen : ovl[i].b_hang + flen; // covered by the overlap.
- ovlPlace[i].clusterID = 0;
- ovlPlace[i].fCoverage = 0.0;
- ovlPlace[i].errors = olen * ovl[i].erate();
- ovlPlace[i].aligned = ovlPlace[i].covered.end - ovlPlace[i].covered.bgn;
- ovlPlace[i].tigFidx = UINT32_MAX;
- ovlPlace[i].tigLidx = 0;
-
- assert(ovlPlace[i].covered.bgn >= 0);
- assert(ovlPlace[i].covered.end >= 0);
- assert(ovlPlace[i].covered.bgn <= flen);
- assert(ovlPlace[i].covered.end <= flen);
- assert(ovlPlace[i].covered.bgn < ovlPlace[i].covered.end);
-
- // Disallow any placements that exceed the boundary of the unitig. These cannot be confirmed
- // by overlaps and might be wrong. Sample cases:
- // o sticking a unique/repeat read onto a repeat (leaving the unique uncovered)
- // o sticking a chimeric read onto the end of a unitig (leaving the chimeric join uncovered)
-
- if (((flags & placeRead_fullMatch) ||
- (flags & placeRead_noExtend)) &&
- ((ovlPlace[i].position.min() < 0) ||
- (ovlPlace[i].position.max() > tig->getLength()))) {
- ovlPlace[i] = overlapPlacement();
+ // A (b) <------------
+ // B ------------> (a)
+ if ((ovl[oo].flipped == true) && (bread.position.isForward() == true)) {
+ apos.end = bread.position.min() + ovl[oo].b_hang;
+ apos.bgn = bread.position.max() + ovl[oo].a_hang;
+
+ bver.end = bread.position.min() + ((ovl[oo].b_hang > 0) ? ovl[oo].b_hang : 0);
+ bver.bgn = bread.position.max() + ((ovl[oo].a_hang > 0) ? 0 : ovl[oo].a_hang);
+ }
+
+ // A (b) <------------
+ // B <------------ (a)
+ if ((ovl[oo].flipped == false) && (bread.position.isForward() == false)) {
+ apos.end = bread.position.min() + ovl[oo].b_hang;
+ apos.bgn = bread.position.max() + ovl[oo].a_hang;
+
+ bver.end = bread.position.min() + ((ovl[oo].b_hang > 0) ? ovl[oo].b_hang : 0);
+ bver.bgn = bread.position.max() + ((ovl[oo].a_hang > 0) ? 0 : ovl[oo].a_hang);
}
- // Report the placement.
+ // HOWEVER, the verified position is all goobered up if the overlapping read
+ // was placed too short. Imagine a 20k read with a 500bp overlap, so the hangs
+ // are 19.5k. If we position this read 1k too short, then readLen-hang is negative,
+ // and we end up misorienting the verified coords (not too mention that they're
+ // likely bogus too). So, if that happens, we just ignore the overlap.
+
+ int32 bposlen = (bread.position.max() - bread.position.min());
+
+ if (ovl[oo].a_hang < 0)
+ bposlen += ovl[oo].a_hang;
+
+ if (ovl[oo].b_hang > 0)
+ bposlen -= ovl[oo].b_hang;
+
+ if (bposlen < 0) {
+ writeLog("WARNING: read %u overlap to read %u in tig %u at %d-%d - hangs %d %d to large for placement, ignoring overlap\n",
+ ovl[oo].a_iid,
+ ovl[oo].b_iid,
+ btID,
+ bread.position.bgn, bread.position.end,
+ ovl[oo].a_hang, ovl[oo].b_hang);
+ disallow = true;
+ }
+
+ // Save the placement in our work space.
+
+ uint32 flen = RI->readLength(ovl[oo].a_iid);
+
+ overlapPlacement op;
+
+ op.frgID = fid;
+ op.refID = ovl[oo].b_iid;
+ op.tigID = btig->id();
+ op.position = apos;
+ op.verified = bver;
+ op.covered.bgn = (ovl[oo].a_hang < 0) ? 0 : ovl[oo].a_hang; // The portion of the read
+ op.covered.end = (ovl[oo].b_hang > 0) ? flen : ovl[oo].b_hang + flen; // covered by the overlap.
+ op.clusterID = 0;
+ op.fCoverage = 0.0;
+ op.errors = RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang) * ovl[oo].erate();
+ op.aligned = op.covered.end - op.covered.bgn;
+ op.tigFidx = UINT32_MAX;
+ op.tigLidx = 0;
+
+ // If we're looking for final placments either contained completely in the tig or covering the
+ // whole read, disallow any placements that exceed the boundary of the unitig. This is NOT a
+ // filter on these placements; but any placement here that extends past the end of the tig is
+ // guaranteed to not generate a contained/whole-read placement.
+
+ if ((flags & placeRead_noExtend) || (flags & placeRead_fullMatch))
+ if ((op.position.min() < 0) ||
+ (op.position.max() > btig->getLength()))
+ disallow = true;
if (logFileFlagSet(LOG_PLACE_READ))
- writeLog("pRUO()-- read %7d (%5d,%5d) in unitig %5d at %8d,%-8d via read %7d at %8d:%-8d hang %6d %6d %s%s\n",
- ovlPlace[i].frgID,
- ovlPlace[i].covered.bgn, ovlPlace[i].covered.end,
- ovlPlace[i].tigID,
- ovlPlace[i].position.bgn, ovlPlace[i].position.end,
- ovl[i].b_iid,
- tig->readFromId(ovl[i].b_iid)->position.bgn,
- tig->readFromId(ovl[i].b_iid)->position.end,
- ovl[i].a_hang, ovl[i].b_hang,
- (ovl[i].flipped == true) ? "<--" : "-->",
- (ovlPlace[i].frgID == 0) ? " DISALLOWED" : "");
- } // Over all overlaps.
-
- return(ovlPlace);
+ writeLog("pRUO()-- bases %5d-%-5d to tig %5d %8ubp at %8d-%-8d olap %8d-%-8d via read %7d at %8d-%-8d hang %6d %6d %s%s\n",
+ op.covered.bgn, op.covered.end,
+ btig->id(),
+ btig->getLength(),
+ op.position.bgn, op.position.end,
+ op.verified.bgn, op.verified.end,
+ bread.ident,
+ bread.position.bgn,
+ bread.position.end,
+ ovl[oo].a_hang, ovl[oo].b_hang,
+ (ovl[oo].flipped == true) ? "I" : "N",
+ (disallow) ? " DISALLOW" : "");
+
+ // Ensure everything is hunkey dorey and save the overlap.
+
+ if (disallow == false) {
+ assert(op.covered.bgn >= 0);
+ assert(op.covered.end <= flen);
+ assert(op.covered.isForward() == true);
+
+ assert(op.position.isForward() == op.verified.isForward());
+
+ if (op.position.isForward() == true) {
+ assert(op.position.bgn <= op.verified.bgn);
+ assert(op.verified.end <= op.position.end);
+ } else {
+ assert(op.position.end <= op.verified.end);
+ assert(op.verified.bgn <= op.position.bgn);
+ }
+
+ ovlPlace[ovlPlaceLen++] = op;
+ }
+ }
}
@@ -213,10 +290,6 @@ placeRead_findFirstLastOverlapping(overlapPlacement &op,
op.tigFidx = min(ord, op.tigFidx);
op.tigLidx = max(ord, op.tigLidx);
-
- //if (logFileFlagSet(LOG_PLACE_READ))
- // writeLog("pRUO()-- find range from os=%u to oe=%u tig=%u ord=%u f=%u l=%u\n",
- // os, oe, op.tigID, ord, op.tigFidx, op.tigLidx);
}
if (op.tigFidx > op.tigLidx)
@@ -233,152 +306,93 @@ placeRead_findFirstLastOverlapping(overlapPlacement &op,
void
-placeRead_computeQualityAndCoverage(overlapPlacement &op,
- uint32 os, uint32 oe,
- overlapPlacement *ovlPlace) {
- op.errors = 0;
- op.aligned = 0;
-
- op.covered.bgn = INT32_MAX; // Covered interval is always in
- op.covered.end = INT32_MIN; // forward read coordinates
-
- for (uint32 oo=os; oo<oe; oo++) {
- if ((ovlPlace[oo].position.bgn == 0) &&
- (ovlPlace[oo].position.end == 0)) {
- if (logFileFlagSet(LOG_PLACE_READ))
- writeLog("OLD place=%3d read %8d ref read %8d - covered %5d:%-5d with %6.1f errors - DELETED\n",
- op.frgID, ovlPlace[oo].refID, ovlPlace[oo].covered.bgn, ovlPlace[oo].covered.end, ovlPlace[oo].errors);
- continue;
- }
-
- op.errors += ovlPlace[oo].errors;
- op.aligned += ovlPlace[oo].aligned;
-
- op.covered.bgn = min(op.covered.bgn, ovlPlace[oo].covered.bgn);
- op.covered.end = max(op.covered.end, ovlPlace[oo].covered.end);
-
- //if (logFileFlagSet(LOG_PLACE_READ))
- // writeLog("OLD place=%3d read %8d ref read %8d - covered %5d:%-5d with %6.1f errors\n",
- // oo, op.frgID, ovlPlace[oo].refID, ovlPlace[oo].covered.bgn, ovlPlace[oo].covered.end, ovlPlace[oo].errors);
- }
-
- op.fCoverage = (op.covered.end - op.covered.bgn) / (double)RI->readLength(op.frgID);
-}
-
+placeRead_computePlacement(overlapPlacement &op,
+ uint32 os,
+ uint32 oe,
+ overlapPlacement *ovlPlace,
+ Unitig *tig) {
+ stdDev<double> bgnPos, endPos;
+ bool isFwd = ovlPlace[os].position.isForward();
+ int32 readLen = RI->readLength(op.frgID);
+ int32 tigLen = tig->getLength();
-void
-placeRead_computeQualityAndCoverage(overlapPlacement &op,
- BAToverlap *ovl,
- uint32 ovlLen,
- set<uint32> &reads) {
op.errors = 0;
op.aligned = 0;
- op.covered.bgn = INT32_MAX; // Covered interval is always in
- op.covered.end = INT32_MIN; // forward read coordinates
+ op.verified.bgn = (isFwd) ? INT32_MAX : INT32_MIN;
+ op.verified.end = (isFwd) ? INT32_MIN : INT32_MAX;
- // For reads that have two overlaps to the same other read, we have no way of knowing
- // which is the correct overlap, just that we have an overlap.
- //
- // This happens in dros a whole bunch of times, and does change the fCoverave value.
+ int32 bgnVer2 = (isFwd) ? INT32_MAX : INT32_MIN;;
+ int32 endVer2 = (isFwd) ? INT32_MIN : INT32_MAX;;
- for (uint32 oo=0; oo<ovlLen; oo++) {
- if (reads.count(ovl[oo].b_iid) == 0)
- continue;
+ int32 bgnVer3 = (isFwd) ? INT32_MAX : INT32_MIN;;
+ int32 endVer3 = (isFwd) ? INT32_MIN : INT32_MAX;;
- int32 olen = RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
- int32 flen = RI->readLength(ovl[oo].a_iid);
-
- int32 cbgn = (ovl[oo].a_hang < 0) ? 0 : ovl[oo].a_hang; // The portion of the read
- int32 cend = (ovl[oo].b_hang > 0) ? flen : ovl[oo].b_hang + flen; // covered by the overlap.
-
- //if (logFileFlagSet(LOG_PLACE_READ))
- // writeLog("NEW place=%3d read %8d ref read %8d - covered %5d:%-d with %f errors\n",
- // op.frgID, ovlPlace[oo].refID, cbgn, cend, olen * ovl[oo].erate());
-
- op.errors += olen * ovl[oo].erate();
- op.aligned += cend - cbgn;
-
- op.covered.bgn = min(op.covered.bgn, cbgn);
- op.covered.end = max(op.covered.end, cend);
- }
-
- op.fCoverage = (op.covered.end - op.covered.bgn) / (double)RI->readLength(op.frgID);
-}
-
-
-
-
-// Now that it is placed, estimate the span that is verified by overlaps.
-// Threshold the floating end so it doesn't exceed the placement.
-//
-// Annoyingly, the verified placement can, and does, exceed the bounds of the
-// unitig, and we need to check that threshold too. Indel in the read and all that.
-//
-void
-placeRead_computeVerified(overlapPlacement &op, uint32 tigLen) {
-
- //writeLog("computeVer pos %d-%d cov %d-%d\n", op.position.bgn, op.position.end, op.covered.bgn, op.covered.end);
+ op.covered.bgn = INT32_MAX; // Covered interval is always in
+ op.covered.end = INT32_MIN; // forward read coordinates
- if (op.position.isForward()) {
- op.verified.bgn = op.position.bgn + op.covered.bgn;
- op.verified.end = op.position.bgn + op.covered.end;
+ // Deleted overlaps? From where?
+ for (uint32 oo=os; oo<oe; oo++)
+ assert((ovlPlace[oo].position.bgn != 0) ||
+ (ovlPlace[oo].position.end != 0));
- if (op.verified.end > op.position.end) // verified.bgn is always valid if covered.bgn > 0
- op.verified.end = op.position.end; //
+ if (logFileFlagSet(LOG_PLACE_READ))
+ writeLog("computePlacement() for os=%u oe=%u\n", os, oe);
- if (op.verified.bgn < 0)
- op.verified.bgn = 0;
- if (op.verified.end > tigLen)
- op.verified.end = tigLen;
+ // Over all the placements that support this position:
+ // compute the final position as the mean of the supporting overlaps.
+ // compute the verified position as.....
+ // compute the read bases covered by an overlap as the min/max.
+ //
+ // The verified position is a bit annoying.
+ //
+ // The first attempt used the mean just as for the position. But this occasionally
+ // left the verified outside the placed position. It was thresholded to make it sane.
+ // IT ALSO TOTALLY BREAKS GFA EDGE FINDING. (I think because the verified overlap position
+ // is too small).
+ //
+ // The second attempt set it relative to the position, using the hangs from the
+ // 'covered' position on the read. This failed on an ~8k read placed with a 500bp
+ // overlap. The overlapping read was placed shorter than expected. The sum of the overlap
+ // hangs was larger than this placement. (Largely solved by recomputing positions
+ // after unplaced reads are placed).
+ //
+ // The third attempt mirrors what is done for 'covered' -- just take the min/max
+ // of all the overlaps used when placing the read.
- assert(op.verified.bgn >= op.position.bgn);
- assert(op.verified.end <= op.position.end);
- assert(op.verified.bgn < op.verified.end);
- }
+ for (uint32 oo=os; oo<oe; oo++) {
+ bgnPos.insert(ovlPlace[oo].position.bgn);
+ endPos.insert(ovlPlace[oo].position.end);
- else {
- op.verified.bgn = op.position.bgn - op.covered.bgn; // High coord
- op.verified.end = op.position.bgn - op.covered.end; // Low coord
+ // Third attempt
- if (op.verified.end < op.position.end) // verified.bgn is always valid if covered.bgn > 0
- op.verified.end = op.position.end;
+ if (logFileFlagSet(LOG_PLACE_READ))
+ writeLog("placeRead_computePlacement()-- op %3d ovl ver %12d %12d pos %12d %12d\n",
+ oo,
+ ovlPlace[oo].verified.bgn, ovlPlace[oo].verified.end,
+ ovlPlace[oo].position.bgn, ovlPlace[oo].position.end);
+
+#if 1
+ if (isFwd) {
+ bgnVer3 = min(bgnVer3, ovlPlace[oo].verified.bgn);
+ endVer3 = max(endVer3, ovlPlace[oo].verified.end);
+ } else {
+ bgnVer3 = max(bgnVer3, ovlPlace[oo].verified.bgn);
+ endVer3 = min(endVer3, ovlPlace[oo].verified.end);
+ }
+#endif
- if (op.verified.end < 0)
- op.verified.end = 0;
- if (op.verified.bgn > tigLen)
- op.verified.bgn = tigLen;
+ op.errors += ovlPlace[oo].errors;
+ op.aligned += ovlPlace[oo].aligned;
- assert(op.verified.end >= op.position.end);
- assert(op.verified.bgn <= op.position.bgn);
- assert(op.verified.end < op.verified.bgn);
+ op.covered.bgn = min(op.covered.bgn, ovlPlace[oo].covered.bgn);
+ op.covered.end = max(op.covered.end, ovlPlace[oo].covered.end);
}
- assert(op.position.isForward() == op.verified.isForward());
-}
-
-
+ op.fCoverage = (op.covered.end - op.covered.bgn) / (double)readLen;
-void
-placeRead_computePlacement(overlapPlacement &op,
- uint32 os,
- uint32 oe,
- overlapPlacement *ovlPlace,
- Unitig *tig) {
- stdDev<double> bgnPos;
- stdDev<double> endPos;
-
- for (uint32 oo=os; oo<oe; oo++) {
- if ((ovlPlace[oo].position.bgn == 0) &&
- (ovlPlace[oo].position.end == 0))
- continue;
-
- //writeLog("OLD place %d-%d\n", ovlPlace[oo].position.bgn, ovlPlace[oo].position.end);
-
- bgnPos.insert(ovlPlace[oo].position.bgn);
- endPos.insert(ovlPlace[oo].position.end);
- }
+ // Take the mean of the positions as the final position.
bgnPos.finalize();
endPos.finalize();
@@ -386,63 +400,71 @@ placeRead_computePlacement(overlapPlacement &op,
op.position.bgn = bgnPos.mean();
op.position.end = endPos.mean();
- placeRead_computeVerified(op, tig->getLength());
-}
-
-
-
-
-void
-placeRead_computePlacement(overlapPlacement &op,
- BAToverlap *ovl,
- uint32 ovlLen,
- set<uint32> &reads,
- uint32 flags,
- Unitig *tig) {
- stdDev<double> bgnPos;
- stdDev<double> endPos;
+ // Second attempt.
- // For reads that have two overlaps to the same other read, we have no way of knowing
- // which is the correct overlap, just that we have an overlap.
+#if 1
+ if (isFwd) {
+ bgnVer2 = op.position.bgn + op.covered.bgn;
+ endVer2 = op.position.end - (readLen - op.covered.end);
+ } else {
+ bgnVer2 = op.position.bgn - op.covered.bgn;
+ endVer2 = op.position.end + (readLen - op.covered.end);
+ }
+#endif
- for (uint32 oo=0; oo<ovlLen; oo++) {
- if (reads.count(ovl[oo].b_iid) == 0)
- continue;
+ if (logFileFlagSet(LOG_PLACE_READ))
+ writeLog("placeRead_computePlacement()-- position %d-%d verified %d-%d %d-%d\n",
+ op.position.bgn, op.position.end,
+ bgnVer2, endVer2,
+ bgnVer3, endVer3);
- BestEdgeOverlap edge(ovl[oo]);
- ufNode read;
+ // Results in about 15% fewer contig and 3% fewer unitig edges, compared to v3.
+#if 0
+ op.verified.bgn = bgnVer2;
+ op.verified.end = endVer2;
+#endif
- if (tig->placeRead(read, op.frgID, ovl[oo].AEndIs3prime(), &edge) == false) {
- if (logFileFlagSet(LOG_PLACE_READ))
- writeLog("pRUO()-- WARNING: Failed to place with overlap %u %u hangs %d %d flipped %u\n",
- ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang, ovl[oo].flipped);
- continue;
- }
+ // On dmel, gives more contig edges than v2, mostly small stuff.
+#if 1
+ op.verified.bgn = bgnVer3;
+ op.verified.end = endVer3;
+#endif
- if (((flags & placeRead_fullMatch) ||
- (flags & placeRead_noExtend)) &&
- ((read.position.min() < 0) ||
- (read.position.max() > tig->getLength())))
- continue;
+ // Finally, limit verified to be the extent of the tig, or the extent of the placement.
- //writeLog("NEW place %d-%d\n", read.position.bgn, read.position.end);
+ if (isFwd) {
+ if (op.verified.bgn < 0) op.verified.bgn = 0;
+ if (op.verified.end > tigLen) op.verified.end = tigLen;
+ } else {
+ if (op.verified.bgn > tigLen) op.verified.bgn = tigLen;
+ if (op.verified.end < 0) op.verified.end = 0;
+ }
- bgnPos.insert(read.position.bgn);
- endPos.insert(read.position.end);
+ if (isFwd) {
+ if (op.verified.bgn < op.position.bgn) op.verified.bgn = op.position.bgn;
+ if (op.verified.end > op.position.end) op.verified.end = op.position.end;
+ } else {
+ if (op.verified.bgn > op.position.bgn) op.verified.bgn = op.position.bgn;
+ if (op.verified.end < op.position.end) op.verified.end = op.position.end;
}
- bgnPos.finalize();
- endPos.finalize();
+ // And check that the result is sane.
- op.position.bgn = bgnPos.mean();
- op.position.end = endPos.mean();
+ assert(op.position.isForward() == isFwd);
+ assert(op.position.isForward() == op.verified.isForward());
+ assert(op.covered.isForward() == true);
- placeRead_computeVerified(op, tig->getLength());
+ if (isFwd) {
+ assert(op.position.bgn <= op.verified.bgn);
+ assert(op.verified.end <= op.position.end);
+ } else {
+ assert(op.position.end <= op.verified.end);
+ assert(op.verified.bgn <= op.position.bgn);
+ }
}
-
bool
placeReadUsingOverlaps(TigVector &tigs,
Unitig *target,
@@ -450,14 +472,18 @@ placeReadUsingOverlaps(TigVector &tigs,
vector<overlapPlacement> &placements,
uint32 flags) {
- //if ((fid == 232074) || (fid == 72374) || (fid == 482602))
- // logFileFlags |= LOG_PLACE_READ;
+ set<uint32> verboseEnable;
+
+ //verboseEnable.insert(fid); // enable for all
+
+ if (verboseEnable.count(fid) > 0)
+ logFileFlags |= LOG_PLACE_READ;
if (logFileFlagSet(LOG_PLACE_READ)) // Nope, not ambiguous.
if (target)
- writeLog("\npRUO()-- begin for read %d into target tig %d\n", fid, target->id());
+ writeLog("\npRUO()-- begin for read %u length %u into target tig %d\n", fid, RI->readLength(fid), target->id());
else
- writeLog("\npRUO()-- begin for read %d into all tigs\n", fid);
+ writeLog("\npRUO()-- begin for read %u length %u into all tigs\n", fid, RI->readLength(fid));
assert(fid > 0);
assert(fid <= RI->numReads());
@@ -474,16 +500,17 @@ placeReadUsingOverlaps(TigVector &tigs,
// Compute placements. Anything that doesn't get placed is left as 'nowhere', specifically, in
// unitig 0 (which doesn't exist).
- overlapPlacement *ovlPlace = placeRead_fromOverlaps(tigs, target, fid, flags, ovlLen, ovl);
+ uint32 ovlPlaceLen = 0;
+ overlapPlacement *ovlPlace = new overlapPlacement [ovlLen];
+
+ placeRead_fromOverlaps(tigs, target, fid, flags, ovlLen, ovl, ovlPlaceLen, ovlPlace);
- // We've placed the read in all possible places, or set unitig ID to 0 (an invalid unitig).
// Sort all the placements. Sort order is:
- // unitig ID (so zero is first)
+ // unitig ID
// placed orientation (reverse is first)
// position
- sort(ovlPlace, ovlPlace + ovlLen, overlapPlacement_byLocation);
-
+ sort(ovlPlace, ovlPlace + ovlPlaceLen, overlapPlacement_byLocation);
// Segregate the overlaps by placement in the unitig. We want to construct one
// overlapPlacement for each distinct placement. How this is done:
@@ -508,19 +535,12 @@ placeReadUsingOverlaps(TigVector &tigs,
uint32 bgn = 0; // Range of overlaps with the same unitig/orientation
uint32 end = 1;
- // Skip overlaps that didn't generate a placement
-
- while ((bgn < ovlLen) && (ovlPlace[bgn].tigID == 0))
- bgn++;
-
- // Process all placements.
-
- while (bgn < ovlLen) {
+ while (bgn < ovlPlaceLen) {
// Find the last placement with the same unitig/orientation as the 'bgn' read.
end = bgn + 1;
- while ((end < ovlLen) &&
+ while ((end < ovlPlaceLen) &&
(ovlPlace[bgn].tigID == ovlPlace[end].tigID) &&
(ovlPlace[bgn].position.isReverse() == ovlPlace[end].position.isReverse()))
end++;
@@ -553,6 +573,8 @@ placeReadUsingOverlaps(TigVector &tigs,
// Each cluster generates one placement.
for (uint32 os=bgn, oe=bgn+1; os<end; ) {
+ if (logFileFlagSet(LOG_PLACE_READ))
+ writeLog("pRUO()-- process clusterID %u\n", ovlPlace[os].clusterID);
// Find the end ovlPlace, oe, for this cluster, and do a quick check on orientation.
@@ -561,106 +583,32 @@ placeReadUsingOverlaps(TigVector &tigs,
assert(ovlPlace[os].position.isReverse() == ovlPlace[oe].position.isReverse());
}
- // Build the set of reads we care about.
-
-#ifdef TEST_ALT
- set<uint32> reads;
-
- for (uint32 oo=os; oo<oe; oo++)
- reads.insert(ovlPlace[oo].refID);
-#endif
-
- // Make a new overlapPlacement from the first placement in this cluster.
-
- if (logFileFlagSet(LOG_PLACE_READ))
- writeLog("pRUO()-- process clusterID %u\n", ovlPlace[os].clusterID);
+ // Make a new overlapPlacement from the first placement in this cluster, figure out the first/last tig reads that
+ // have overlaps to it, and figure out final positions.
overlapPlacement op(fid, ovlPlace[os]);
- // Find the first and last read in the unitig that we overlap with.
-
placeRead_findFirstLastOverlapping(op, tigs[op.tigID], os, oe, ovlPlace);
-
- // Sum the errors and bases aligned for each overlap.
- // Find the minimum and maximum coordinates covered in the read, use that to compute the
- // fraction of read coverage.
-
- placeRead_computeQualityAndCoverage(op, os, oe, ovlPlace);
-
-#ifdef TEST_ALT
- // Test the alternate qual and cov compute that uses overlaps directly
- {
- double er = op.errors;
- uint32 al = op.aligned;
- double fC = op.fCoverage;
- int32 bg = op.covered.bgn;
- int32 ed = op.covered.end;
-
- placeRead_computeQualityAndCoverage(op, ovl, ovlLen, reads);
-
- if ((er - op.errors > 0.0001) ||
- ((int32)al - (int32)op.aligned != 0) ||
- (fC - op.fCoverage > 0.0001) ||
- (bg - op.covered.bgn != 0) ||
- (ed - op.covered.end != 0))
- writeLog("COMPARE er %8.3f %8.3f %8.3f al %7u %7u %7d fC %8.4f %8.4f %8.4f bg %8d %8d %8d ed %8d %8d %8d\n",
- er, op.errors, er - op.errors,
- al, op.aligned, (int32)al - (int32)op.aligned,
- fC, op.fCoverage, fC - op.fCoverage,
- bg, op.covered.bgn, bg - op.covered.bgn,
- ed, op.covered.end, ed - op.covered.end);
- }
-#endif
-
- // Compute placement based on the longest overlap on each end, or the best contain.
-
placeRead_computePlacement(op, os, oe, ovlPlace, tigs[op.tigID]);
-#ifdef TEST_ALT
- {
- SeqInterval origpos = op.position;
- SeqInterval origver = op.verified;
-
- placeRead_computePlacement(op, ovl, ovlLen, reads, flags, tigs[op.tigID]);
-
- if ((origpos.bgn - op.position.bgn > 10) || // Placements wobble by a few bases
- (origpos.end - op.position.end > 10) ||
- (origver.bgn - op.verified.bgn > 10) ||
- (origver.end - op.verified.end > 10))
- writeLog("COMPARE pos bgn %d-%d end %d-%d ver bgn %d-%d end %d-%d\n",
- origpos.bgn, op.position.bgn,
- origpos.end, op.position.end,
- origver.bgn, op.verified.bgn,
- origver.end, op.verified.end);
- }
-#endif
-
// Filter out bogus placements. There used to be a few more, but they made no sense for long reads.
// Reject if either end stddev is high. It has to be pretty bad before this triggers.
- bool goodPlacement = true;
-
-#if 0
- double allowableStdDev = max(2.0, 0.075 * RI->readLength(op.frgID));
-
- if ((bgnPos.stddev() > allowableStdDev) ||
- (endPos.stddev() > allowableStdDev))
- goodPlacement = false;
-#endif
+ bool fullMatch = true;
+ bool noExtend = true;
if ((flags & placeRead_fullMatch) &&
- (op.fCoverage < 0.99))
- goodPlacement = false;
+ (op.fCoverage < 1.0))
+ fullMatch = false;
if ((flags & placeRead_noExtend) &&
((op.position.min() < 0) ||
(op.position.max() > tigs[op.tigID]->getLength())))
- goodPlacement = false;
+ noExtend = false;
- if (goodPlacement)
+ if ((fullMatch == true) && (noExtend == true))
placements.push_back(op);
-
if (logFileFlagSet(LOG_PLACE_READ))
writeLog("pRUO()-- placements[%u] - PLACE READ %d in tig %d at %d,%d -- verified %d,%d -- covered %d,%d %4.1f%% -- errors %.2f aligned %d novl %d%s\n",
placements.size() - 1,
@@ -670,7 +618,8 @@ placeReadUsingOverlaps(TigVector &tigs,
op.covered.bgn, op.covered.end,
op.fCoverage * 100.0,
op.errors, op.aligned, oe - os,
- (goodPlacement == false) ? " -- INVALID" : "");
+ (fullMatch == false) ? " -- PARTIAL" : "",
+ (noExtend == false) ? " -- EXTENDS" : "");
os = oe;
} // End of segregating overlaps by placement
@@ -681,8 +630,8 @@ placeReadUsingOverlaps(TigVector &tigs,
delete [] ovlPlace;
- //if ((fid == 232074) || (fid == 72374) || (fid == 482602))
- // logFileFlags &= ~LOG_PLACE_READ;
+ if (verboseEnable.count(fid) > 0)
+ logFileFlags &= ~LOG_PLACE_READ;
return(true);
}
diff --git a/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
index 0c90328..50b63ac 100644
--- a/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
+++ b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
@@ -43,11 +43,13 @@ public:
refID = 0;
tigID = 0;
+
+ clusterID = 0;
+
position = SeqInterval();
verified = SeqInterval();
covered = SeqInterval();
- clusterID = 0;
fCoverage = 0.0;
errors = 0.0;
@@ -62,6 +64,9 @@ public:
refID = UINT32_MAX; // Not valid in the output overlapPlacement.
tigID = op.tigID;
+
+ clusterID = op.clusterID; // Useless to track forward.
+
position.bgn = 0;
position.end = 0;
@@ -71,8 +76,6 @@ public:
covered.bgn = op.covered.bgn;
covered.end = op.covered.end;
- clusterID = op.clusterID; // Useless to track forward.
-
fCoverage = 0.0;
errors = 0.0;
@@ -92,12 +95,12 @@ public:
uint32 refID; // Read ID of the overlapping read were placed with.
uint32 tigID; // Unitig ID of this placement
+ int32 clusterID;
+
SeqInterval position; // Unitig position of this placement
SeqInterval verified; // Unitig position of this placement, verified by overlaps
SeqInterval covered; // Position of the overlap on the read
- int32 clusterID;
-
double fCoverage; // Coverage of the read
double errors; // number of errors in alignments
@@ -108,10 +111,13 @@ public:
};
-// Sort by: tigID, orientation, position
+// Sort by: clusterID, tigID, orientation, position
//
// This sort is used to cluster the reads into overlapping regions. We don't care
// about ties.
+//
+// clusterID is UINT32_MAX if the placement should be ignored.
+//
inline
bool
overlapPlacement_byLocation(const overlapPlacement &A, const overlapPlacement &B) {
diff --git a/src/bogart/AS_BAT_PromoteToSingleton.C b/src/bogart/AS_BAT_PromoteToSingleton.C
index ecf39e5..57e3a03 100644
--- a/src/bogart/AS_BAT_PromoteToSingleton.C
+++ b/src/bogart/AS_BAT_PromoteToSingleton.C
@@ -68,6 +68,8 @@ promoteToSingleton(TigVector &tigs) {
read.position.end = RI->readLength(fi);
utg->addRead(read, 0, false);
+
+ utg->_isUnassembled = true;
}
writeStatus("promoteToSingleton()-- Moved " F_U32 " unplaced read%s to singleton tigs.\n",
diff --git a/src/bogart/AS_BAT_ReadInfo.C b/src/bogart/AS_BAT_ReadInfo.C
index 0f71a81..1e20732 100644
--- a/src/bogart/AS_BAT_ReadInfo.C
+++ b/src/bogart/AS_BAT_ReadInfo.C
@@ -28,13 +28,15 @@
-ReadInfo::ReadInfo(gkStore *gkp,
+ReadInfo::ReadInfo(const char *gkpStorePath,
const char *prefix,
uint32 minReadLen) {
+ gkStore *gkpStore = gkStore::gkStore_open(gkpStorePath);
+
_numBases = 0;
- _numReads = gkp->gkStore_getNumReads();
- _numLibraries = gkp->gkStore_getNumLibraries();
+ _numReads = gkpStore->gkStore_getNumReads();
+ _numLibraries = gkpStore->gkStore_getNumLibraries();
_readStatus = new ReadStatus [_numReads + 1];
@@ -51,7 +53,7 @@ ReadInfo::ReadInfo(gkStore *gkp,
uint32 numLoaded = 0;
for (uint32 fi=1; fi<=_numReads; fi++) {
- gkRead *read = gkp->gkStore_getRead(fi);
+ gkRead *read = gkpStore->gkStore_getRead(fi);
uint32 iid = read->gkRead_readID();
uint32 len = read->gkRead_sequenceLength();
@@ -68,6 +70,8 @@ ReadInfo::ReadInfo(gkStore *gkp,
numLoaded++;
}
+ gkpStore->gkStore_close();
+
if (minReadLen > 0)
writeStatus("ReadInfo()-- Using %d reads, ignoring %u reads less than " F_U32 " bp long.\n",
numLoaded, numSkipped, minReadLen);
diff --git a/src/bogart/AS_BAT_ReadInfo.H b/src/bogart/AS_BAT_ReadInfo.H
index bef0b09..b4cefca 100644
--- a/src/bogart/AS_BAT_ReadInfo.H
+++ b/src/bogart/AS_BAT_ReadInfo.H
@@ -39,25 +39,25 @@
struct ReadStatus {
- uint32 readLength : AS_MAX_READLEN_BITS;
- uint32 libraryID : AS_MAX_LIBRARIES_BITS;
+ uint64 readLength : AS_MAX_READLEN_BITS;
+ uint64 libraryID : AS_MAX_LIBRARIES_BITS;
- uint32 isBackbone : 1; // Used to construct initial contig
- uint32 isUnplaced : 1; // Placed in initial contig using overlaps
- uint32 isLeftover : 1; // Not placed
+ uint64 isBackbone : 1; // Used to construct initial contig
+ uint64 isUnplaced : 1; // Placed in initial contig using overlaps
+ uint64 isLeftover : 1; // Not placed
- uint32 unused : (32 - AS_MAX_READLEN_BITS - AS_MAX_LIBRARIES_BITS - 3);
+ uint64 unused : (64 - AS_MAX_READLEN_BITS - AS_MAX_LIBRARIES_BITS - 3);
};
class ReadInfo {
public:
- ReadInfo(gkStore *gkp, const char *prefix, uint32 minReadLen);
+ ReadInfo(const char *gkpStorePath, const char *prefix, uint32 minReadLen);
~ReadInfo();
uint64 memoryUsage(void) {
- return(sizeof(uint64) + sizeof(uint32) + sizeof(uint32) + sizeof(uint32) * _numReads);
+ return(sizeof(uint64) + sizeof(uint32) + sizeof(uint32) + sizeof(ReadStatus) * (_numReads + 1));
};
uint64 numBases(void) { return(_numBases); };
diff --git a/src/bogart/AS_BAT_SplitDiscontinuous.C b/src/bogart/AS_BAT_SplitDiscontinuous.C
index 25ee78d..4cf0fe2 100644
--- a/src/bogart/AS_BAT_SplitDiscontinuous.C
+++ b/src/bogart/AS_BAT_SplitDiscontinuous.C
@@ -170,7 +170,7 @@ splitDiscontinuous(TigVector &tigs, uint32 minOverlap, vector<tigLoc> &tigSource
if ((tigSource.size() > 0) && (newtig)) {
tigSource.resize(newtig->id() + 1);
- tigSource[newtig->id()].cID = tig->id();
+ tigSource[newtig->id()].cID = tigSource[ tig->id()].cID,
tigSource[newtig->id()].cBgn = tigSource[ tig->id()].cBgn + splitReads[0].position.min();
tigSource[newtig->id()].cEnd = tigSource[newtig->id()].cBgn + newtig->getLength();
tigSource[newtig->id()].uID = newtig->id();
@@ -194,7 +194,7 @@ splitDiscontinuous(TigVector &tigs, uint32 minOverlap, vector<tigLoc> &tigSource
if ((tigSource.size() > 0) && (newtig)) {
tigSource.resize(newtig->id() + 1);
- tigSource[newtig->id()].cID = tig->id();
+ tigSource[newtig->id()].cID = tigSource[ tig->id()].cID,
tigSource[newtig->id()].cBgn = tigSource[ tig->id()].cBgn + splitReads[0].position.min();
tigSource[newtig->id()].cEnd = tigSource[newtig->id()].cBgn + newtig->getLength();
tigSource[newtig->id()].uID = newtig->id();
diff --git a/src/bogart/AS_BAT_TigGraph.C b/src/bogart/AS_BAT_TigGraph.C
index 663cc45..ec1ef85 100644
--- a/src/bogart/AS_BAT_TigGraph.C
+++ b/src/bogart/AS_BAT_TigGraph.C
@@ -130,11 +130,11 @@ emitEdges(TigVector &tigs,
if (((rdA->isForward() == true) && (placements[pp].covered.bgn > 0)) ||
((rdA->isReverse() == true) && (placements[pp].covered.end < rdAlen))) {
#ifdef SHOW_EDGES
- writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u - INCOMPLETELY PLACED outside\n",
+ writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u len %6u placed bases %8u-%-8u in tig %6u %8u-%-8u %9u - INCOMPLETELY PLACED outside\n",
tgA->id(),
- rdA->ident, rdA->position.bgn, rdA->position.end,
+ rdA->ident, rdA->position.bgn, rdA->position.end, rdAlen,
placements[pp].covered.bgn, placements[pp].covered.end,
- tgBid, bgn, end);
+ tgBid, bgn, end, tgBlen);
#endif
continue;
}
@@ -145,9 +145,9 @@ emitEdges(TigVector &tigs,
if (((rdA->isForward() == true) && (placements[pp].covered.end < rdAlen) && (bgn > 100) && (end + 100 < tgBlen)) ||
((rdA->isReverse() == true) && (placements[pp].covered.bgn > 0) && (bgn > 100) && (end + 100 < tgBlen))) {
#ifdef SHOW_EDGES
- writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u - INCOMPLETELY PLACED inside\n",
+ writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u len %6u placed bases %8u-%-8u in tig %6u %8u-%-8u %9u - INCOMPLETELY PLACED inside\n",
tgA->id(),
- rdA->ident, rdA->position.bgn, rdA->position.end,
+ rdA->ident, rdA->position.bgn, rdA->position.end, rdAlen,
placements[pp].covered.bgn, placements[pp].covered.end,
tgBid, bgn, end, tgBlen);
#endif
@@ -155,12 +155,13 @@ emitEdges(TigVector &tigs,
}
#ifdef SHOW_EDGES
- writeLog("emitEdges()-- edge %3u - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u quality %f\n",
+ writeLog("emitEdges()-- edge %3u - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u %s quality %f\n",
edges.size(),
tgA->id(),
rdA->ident, rdA->position.bgn, rdA->position.end,
placements[pp].covered.bgn, placements[pp].covered.end,
tgBid, bgn, end,
+ placements[pp].verified.isForward() ? "->" : "<-",
(double)placements[pp].errors / placements[pp].aligned);
#endif
@@ -174,11 +175,9 @@ emitEdges(TigVector &tigs,
// <--- alignment on second tig - so if not the same, the second tig needs to be
// -------------------> - flipped to make the alignment work
- bool fwd = false;
+ bool fwd = (rdA->isForward() == placements[pp].verified.isForward());
- if (((rdA->isForward() == true) && (placements[pp].verified.isForward() == true)) ||
- ((rdA->isForward() == false) && (placements[pp].verified.isForward() == false)))
- fwd = true;
+ // And save the placement.
edges.push_back(grEdge(tgBid, bgn, end, fwd));
}
@@ -224,8 +223,10 @@ emitEdges(TigVector &tigs,
(bgn > 100) &&
(end + 100 < tgBlen)) {
#ifdef SHOW_EDGES_UNPLACED
- writeLog("emitEdges()-- read %5u incomplete placement covering %5u-%-5u in at %5u-%-5u in tig %4u\n",
- rdA->ident, placements[pp].covered.bgn, placements[pp].covered.end, bgn, end, tgBid);
+ writeLog("emitEdges()-- read %5u incomplete placement covering %5u-%-5u at %5u-%-5u %s in tig %4u\n",
+ rdA->ident,
+ placements[pp].covered.bgn, placements[pp].covered.end,
+ bgn, end, placements[pp].verified.isForward() ? "->" : "<-", tgBid);
#endif
continue;
}
@@ -245,46 +246,58 @@ emitEdges(TigVector &tigs,
// tgA against CAB in the target tig. If not, we'll need to keep count of which direction
// we extend things in.
-
// Fail if most of the extension is to the wrong side. We always move to higher
// coordinates on tgA. If tgB is forward, it should move to higher coordinates too.
-
- int32 nbgn = min(edges[ee].bgn, bgn);
- int32 nend = max(edges[ee].end, end);
-
- if ((edges[ee].fwd == true) &&
- (bgn - nbgn > nend - end)) { // If we decrease bgn more than we increased end, fail
+ //
+ // tgA ---------------------------------------------
+ // rdA -------------->
+ // <-------------------
+ // ---------------------
+ //
+ // tgB -----------------------------------------------------
+ // [----edges----]
+ // [----read-----]
+ //
+ // To make it more complicated, a contained read should do nothing, so we can't just
+ // insist the end coordinate gets bigger. We must make sure that the bgn coordinate
+ // doesn't get (significantly) smaller.
+
+ int32 nbgn = min(edges[ee].bgn, bgn); // edges[] is the current region aligned
+ int32 nend = max(edges[ee].end, end); // bgn,end is where the new read aligned
+
+ // If tgB is forward, fail if the read aligned to the left (lower) of the current region.
+
+ if ((edges[ee].fwd == true) && (bgn < edges[ee].bgn) && (end < edges[ee].end)) {
#ifdef SHOW_EDGES_UNPLACED
- writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u - wrong direction\n",
- ee,
- edges[ee].bgn, edges[ee].end,
- nbgn, nend,
- rdA->ident, bgn, end, tgBid);
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u %s in tig %4u - wrong direction (fwd)\n",
+ ee,
+ edges[ee].bgn, edges[ee].end,
+ nbgn, nend,
+ rdA->ident, bgn, end, placements[pp].verified.isForward() ? "->" : "<-", tgBid);
#endif
continue;
}
- // The reverse case is a bit tricky since we're tracking min/max posiiton on tgB.
- // When we extend on tgA, we expect the bgn to decrease on tgB and the end to stay the same.
+ // If tgB is reverse, fail if the read aligned to the left (higher) of the current region.
- if ((edges[ee].fwd == false) &&
- (nend - end > bgn - nbgn)) { // If we increase end more than we decreased bgn, fail
+ if ((edges[ee].fwd == false) && (end > edges[ee].end) && (bgn > edges[ee].bgn)) {
#ifdef SHOW_EDGES_UNPLACED
- writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u - wrong direction\n",
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u %s in tig %4u - wrong direction (rev)\n",
ee,
edges[ee].bgn, edges[ee].end,
nbgn, nend,
- rdA->ident, bgn, end, tgBid);
+ rdA->ident, bgn, end, placements[pp].verified.isForward() ? "->" : "<-", tgBid);
#endif
continue;
}
#ifdef SHOW_EDGES
- writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u\n",
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u %s in tig %4u\n",
ee,
edges[ee].bgn, edges[ee].end,
nbgn, nend,
- rdA->ident, bgn, end, tgBid);
+ rdA->ident, bgn, end, placements[pp].verified.isForward() ? "->" : "<-",
+ tgBid);
#endif
edges[ee].bgn = nbgn;
@@ -321,6 +334,9 @@ emitEdges(TigVector &tigs,
tgA->id(), tgAflipped ? '-' : '+',
edges[ee].end - edges[ee].bgn,
(sameContig == true) ? "\tcv:A:T" : "\tcv:A:F");
+
+ tgA->_isCircular = (tgA->id() == edges[ee].tigID);
+
edges[ee].deleted = true;
}
@@ -337,6 +353,9 @@ emitEdges(TigVector &tigs,
tgA->id(), tgAflipped ? '-' : '+',
edges[ee].end - edges[ee].bgn,
(sameContig == true) ? "\tcv:A:T" : "\tcv:A:F");
+
+ tgA->_isCircular = (tgA->id() == edges[ee].tigID);
+
edges[ee].deleted = true;
}
}
@@ -413,7 +432,8 @@ reportTigGraph(TigVector &tigs,
vector<tigLoc> &tigSource,
const char *prefix,
const char *label) {
- char N[FILENAME_MAX];
+ char BEGn[FILENAME_MAX];
+ char BEDn[FILENAME_MAX];
writeLog("\n");
writeLog("----------------------------------------\n");
@@ -421,9 +441,11 @@ reportTigGraph(TigVector &tigs,
writeStatus("AssemblyGraph()-- generating '%s.%s.gfa'.\n", prefix, label);
- snprintf(N, FILENAME_MAX, "%s.%s.gfa", prefix, label);
+ snprintf(BEGn, FILENAME_MAX, "%s.%s.gfa", prefix, label);
+ snprintf(BEDn, FILENAME_MAX, "%s.%s.bed", prefix, label);
- FILE *BEG = fopen(N, "w");
+ FILE *BEG = fopen(BEGn, "w");
+ FILE *BED = (tigSource.size() > 0) ? fopen(BEDn, "w") : NULL;
if (BEG == NULL)
return;
@@ -437,7 +459,8 @@ reportTigGraph(TigVector &tigs,
// make a disconnected unitig and need to split it again.
for (uint32 ti=1; ti<tigs.size(); ti++)
- if ((tigs[ti] != NULL) && (tigs[ti]->_isUnassembled == false))
+ if ((tigs[ti] != NULL) &&
+ (tigs[ti]->_isUnassembled == false))
fprintf(BEG, "S\ttig%08u\t*\tLN:i:%u\n", ti, tigs[ti]->getLength());
// Run through all the tigs, emitting edges for the first and last read.
@@ -445,7 +468,8 @@ reportTigGraph(TigVector &tigs,
for (uint32 ti=1; ti<tigs.size(); ti++) {
Unitig *tgA = tigs[ti];
- if ((tgA == NULL) || (tgA->_isUnassembled == true))
+ if ((tgA == NULL) ||
+ (tgA->_isUnassembled == true))
continue;
//if (ti == 4)
@@ -469,10 +493,20 @@ reportTigGraph(TigVector &tigs,
emitEdges(tigs, tgA, true, BEG, tigSource);
tgA->reverseComplement();
+ if ((tigSource.size() > 0) && (tigSource[ti].cID != UINT32_MAX))
+ fprintf(BED, "ctg%08u\t%u\t%u\tutg%08u\t%u\t%c\n",
+ tigSource[ti].cID,
+ tigSource[ti].cBgn,
+ tigSource[ti].cEnd,
+ ti,
+ 0,
+ '+');
+
//logFileFlags &= ~LOG_PLACE_READ;
}
- fclose(BEG);
+ if (BEG) fclose(BEG);
+ if (BED) fclose(BED);
// And report statistics.
diff --git a/src/bogart/AS_BAT_TigVector.C b/src/bogart/AS_BAT_TigVector.C
index ee5121c..9a97b4b 100644
--- a/src/bogart/AS_BAT_TigVector.C
+++ b/src/bogart/AS_BAT_TigVector.C
@@ -151,10 +151,6 @@ Unitig *&operator[](uint32 i) {
-
-
-
-
void
TigVector::computeArrivalRate(const char *prefix, const char *label) {
uint32 tiLimit = size();
@@ -214,6 +210,8 @@ TigVector::computeErrorProfiles(const char *prefix, const char *label) {
tig->computeErrorProfile(prefix, label);
}
+
+ writeStatus("computeErrorProfiles()-- Finished.\n");
}
diff --git a/src/bogart/AS_BAT_TigVector.H b/src/bogart/AS_BAT_TigVector.H
index 1970244..eb62830 100644
--- a/src/bogart/AS_BAT_TigVector.H
+++ b/src/bogart/AS_BAT_TigVector.H
@@ -41,6 +41,8 @@ public:
size_t size(void) { return(_totalTigs); };
Unitig *&operator[](uint32 i) { return(_blocks[i / _blockSize][i % _blockSize]); };
+ void optimizePositions(const char *prefix, const char *label);
+
void computeArrivalRate(const char *prefix, const char *label);
void computeErrorProfiles(const char *prefix, const char *label);
diff --git a/src/bogart/AS_BAT_Unitig.C b/src/bogart/AS_BAT_Unitig.C
index 094e644..74e229e 100644
--- a/src/bogart/AS_BAT_Unitig.C
+++ b/src/bogart/AS_BAT_Unitig.C
@@ -41,8 +41,6 @@
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
-static std::map<uint32,int>* containPartialOrder;
-
#undef SHOW_PROFILE_CONSTRUCTION
#undef SHOW_PROFILE_CONSTRUCTION_DETAILS
@@ -83,8 +81,6 @@ Unitig::reverseComplement(bool doSort) {
-// Ensure that the children are sorted by begin position, and that unitigs start at position zero.
-
void
Unitig::cleanUp(void) {
@@ -93,17 +89,15 @@ Unitig::cleanUp(void) {
int32 minPos = ufpath[0].position.min();
- if (minPos == 0)
- return;
-
- for (uint32 fi=0; fi<ufpath.size(); fi++) {
- ufpath[fi].position.bgn -= minPos;
- ufpath[fi].position.end -= minPos;
- }
+ if (minPos != 0)
+ for (uint32 fi=0; fi<ufpath.size(); fi++) {
+ ufpath[fi].position.bgn -= minPos;
+ ufpath[fi].position.end -= minPos;
+ }
_length = 0;
- for (uint32 fi=0; fi<ufpath.size(); fi++) { // Could use position.max(), but since
+ for (uint32 fi=0; fi<ufpath.size(); fi++) { // Could use position.max(), but since
_length = max(_length, ufpath[fi].position.bgn); // it too calls max(), there's no win
_length = max(_length, ufpath[fi].position.end);
}
@@ -111,25 +105,6 @@ Unitig::cleanUp(void) {
-class epOlapDat {
-public:
- epOlapDat(uint32 p, bool o, float e) {
- pos = p;
- open = o;
- erate = e;
- };
-
- bool operator<(const epOlapDat &that) const { return(pos < that.pos); };
-
- uint32 pos : 31;
- bool open : 1;
- float erate;
-};
-
-
-
-
-
void
Unitig::computeArrivalRate(const char *UNUSED(prefix),
const char *UNUSED(label),
@@ -160,6 +135,29 @@ Unitig::computeArrivalRate(const char *UNUSED(prefix),
+class epOlapDat {
+public:
+ epOlapDat() {
+ pos = 0;
+ open = false;
+ erate = 0.0;
+ };
+
+ epOlapDat(uint32 p, bool o, float e) {
+ pos = p;
+ open = o;
+ erate = e;
+ };
+
+ bool operator<(const epOlapDat &that) const { return(pos < that.pos); };
+
+ uint32 pos : 31;
+ bool open : 1;
+ float erate;
+};
+
+
+
void
Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label)) {
@@ -171,10 +169,58 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
errorProfile.clear();
errorProfileIndex.clear();
- vector<epOlapDat> olaps;
+ // Count the number of overlaps we need to save. We do this, instead of growing the array,
+ // because occasionally these are big, and having two around at the same time can blow our
+ // memory. (Arabidopsis p5 has a tig with 160,246,250 olaps == 1gb memory)
+
+#if 0
+ // A (much) fancier version would merge the overlap detection and errorProfile compute together.
+ // Keep lists of epOlapDat for each read end (some cleverness could probably get rid of the map,
+ // if we just use the index of the read). Before we process a new read, all data for positions
+ // before this reads start position can be processed and freed.
+
+ map<uint32, uint32> baseToIndex;
+
+ uint32 *olapsMax = new uint32 [ufpath.size() * 2];
+ uint32 *olapsLen = new uint32 [ufpath.size() * 2];
+ epOlapDat **olaps = new epOlapDat [ufpath.size() * 2];
+#endif
+
+ uint32 olapsMax = 0;
+ uint32 olapsLen = 0;
+ epOlapDat *olaps = NULL;
+
+ for (uint32 fi=0; fi<ufpath.size(); fi++) {
+ ufNode *rdA = &ufpath[fi];
+ int32 rdAlo = rdA->position.min();
+ int32 rdAhi = rdA->position.max();
+
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(rdA->ident, ovlLen);
+
+ for (uint32 oi=0; oi<ovlLen; oi++) {
+ if (id() != _vector->inUnitig(ovl[oi].b_iid)) // Reads in different tigs?
+ continue; // Don't care about this overlap.
+
+ ufNode *rdB = &ufpath[ _vector->ufpathIdx(ovl[oi].b_iid) ];
+
+ if (rdA->ident < rdB->ident) // Only want to see one overlap
+ continue; // for each pair.
+
+ int32 rdBlo = rdB->position.min();
+ int32 rdBhi = rdB->position.max();
+
+ if ((rdAhi <= rdBlo) || (rdBhi <= rdAlo)) // Reads in same tig but not overlapping?
+ continue; // Don't care about this overlap.
+
+ olapsMax += 2;
+ }
+ }
// Scan overlaps to find those that we care about, and save their endpoints.
+ olaps = new epOlapDat [olapsMax];
+
for (uint32 fi=0; fi<ufpath.size(); fi++) {
ufNode *rdA = &ufpath[fi];
int32 rdAlo = rdA->position.min();
@@ -202,17 +248,19 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
uint32 end = min(rdAhi, rdBhi);
#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("errorProfile()-- olap[%u] %u %u begin %u end %u\n", oi, rdA->ident, rdB->ident, bgn, end);
+ writeLog("errorProfile()-- olap %5u read %7u read %7u at %9u-%9u\n",
+ oi, rdA->ident, rdB->ident, bgn, end);
#endif
- olaps.push_back(epOlapDat(bgn, true, ovl[oi].erate())); // Save an open event,
- olaps.push_back(epOlapDat(end, false, ovl[oi].erate())); // and a close event.
+ olaps[olapsLen++] = epOlapDat(bgn, true, ovl[oi].erate()); // Save an open event,
+ olaps[olapsLen++] = epOlapDat(end, false, ovl[oi].erate()); // and a close event.
+ assert(olapsLen <= olapsMax);
}
}
// Warn if no overlaps.
- if (olaps.size() == 0) {
+ if (olapsLen == 0) {
writeLog("WARNING: tig %u length %u nReads %u has no overlaps.\n", id(), getLength(), ufpath.size());
for (uint32 fi=0; fi<ufpath.size(); fi++)
writeLog("WARNING: read %7u %7u-%-7u\n",
@@ -223,111 +271,86 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
// Sort.
- std::sort(olaps.begin(), olaps.end());
+#ifdef _GLIBCXX_PARALLEL
+ __gnu_sequential::sort(olaps, olaps + olapsLen);
+#else
+ std::sort(olaps, olaps + olapsLen);
+#endif
// Convert coordinates into intervals. Conceptually, squish out the duplicate numbers, then
// create an interval for every adjacent pair. We need to add intervals for the first and last
// region. And one more, for convenience, to hold the final 'close' values on intervals that
// extend to the end of the unitig.
- if (olaps.size() == 0)
- errorProfile.push_back(epValue(0, getLength()));
+ if (olapsLen == 0) // No olaps, so add an interval
+ errorProfile.push_back(epValue(0, getLength())); // covering the whole tig
- if ((olaps.size() > 0) && (olaps[0].pos != 0))
- errorProfile.push_back(epValue(0, olaps[0].pos));
+ if ((olapsLen > 0) && (olaps[0].pos != 0)) // Olaps, but missing the first
+ errorProfile.push_back(epValue(0, olaps[0].pos)); // interval, so add it.
- for (uint32 bb=0, ii=1; ii<olaps.size(); ii++) {
- if (olaps[bb].pos == olaps[ii].pos)
- continue;
- errorProfile.push_back(epValue(olaps[bb].pos, olaps[ii].pos));
+ stdDev<float> curDev;
-#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("errorProfile()-- tig %u make region [%u-%u] @ %u-%u\n", id(), bb, ii, olaps[bb].pos, olaps[ii].pos);
-#endif
+ for (uint32 bb=0, ee=0; ee<olapsLen; ee++) {
+ if (olaps[bb].pos != olaps[ee].pos) { // A different position.
+ errorProfile.push_back(epValue(olaps[bb].pos, // Save the current stats in a new profile entry.
+ olaps[ee].pos,
+ curDev.mean(),
+ curDev.stddev()));
+ bb = ee;
+ }
- bb = ii;
+ if (olaps[ee].open == true) // Add the new overlap to our running
+ curDev.insert(olaps[ee].erate); // std.dev calculation.
+ else
+ curDev.remove(olaps[ee].erate);
+
+ if ((ee == olapsLen - 1) &&
+ (olaps[bb].pos != olaps[ee].pos)) { // If the last olap,
+ errorProfile.push_back(epValue(olaps[bb].pos, // make the final profile entry
+ olaps[ee].pos,
+ curDev.mean(),
+ curDev.stddev()));
+ }
}
- if ((olaps.size() > 0) && (olaps[olaps.size()-1].pos != getLength()))
- errorProfile.push_back(epValue(olaps[olaps.size()-1].pos, getLength()));
-
- errorProfile.push_back(epValue(getLength(), getLength()+1));
+ if ((olapsLen > 0) && (olaps[olapsLen-1].pos != getLength())) // Olaps, but missing the last
+ errorProfile.push_back(epValue(olaps[olapsLen-1].pos, getLength())); // interval, so add it.
+ errorProfile.push_back(epValue(getLength(), getLength()+1)); // And one more to make life easier.
#ifdef SHOW_PROFILE_CONSTRUCTION
- writeLog("errorProfile()-- tig %u generated " F_SIZE_T " profile regions from " F_SIZE_T " overlaps.\n", id(), errorProfile.size(), olaps.size());
+ writeLog("errorProfile()-- tig %u generated " F_SIZE_T " profile regions from " F_SIZE_T " overlaps.\n", id(), errorProfile.size(), olapsLen);
#endif
- // Walk both lists, adding positive erates and removing negative erates.
-
- stdDev<float> curDev;
-
- for (uint32 oo=0, ee=0; oo<olaps.size(); oo++) {
- if (olaps[oo].pos != errorProfile[ee].bgn) // Move to the next profile if the pos is different.
- ee++; // By construction, this single step should be all we need.
-
-#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("errorProfile()-- olap[%u] @ %u ep[%u] @ %u %s %f %f +- %f size %u\n",
- oo, olaps[oo].pos,
- ee, errorProfile[ee].bgn,
- olaps[oo].open ? "I" : "R",
- olaps[oo].erate,
- curDev.mean(), curDev.variance(), curDev.size());
-
- if ((olaps[oo].open == false) && (curDev.size() == 0)) {
- for (uint32 fi=0; fi<ufpath.size(); fi++) {
- ufNode *frg = &ufpath[fi];
- writeLog("read %6u %6u-%6u\n", frg->ident, frg->position.bgn, frg->position.end);
- }
-
- writeLog("errorProfile()-- remove from empty set?\n");
- flushLog();
- }
-#endif
-
- assert(olaps[oo].pos == errorProfile[ee].bgn);
- assert(oo < olaps.size());
- assert(ee < errorProfile.size());
-
- if (olaps[oo].open == true)
- curDev.insert(olaps[oo].erate);
- else
- curDev.remove(olaps[oo].erate);
-
- errorProfile[ee].dev = curDev;
- }
-
- // Finalize the values.
-
- for (uint32 bi=0; bi<errorProfile.size(); bi++)
- errorProfile[bi].dev.finalize();
+ delete [] olaps;
// Adjust regions that have no overlaps (mean == 0) to be the average of the adjacent regions.
// There are always at least two elements in the profile list: one that starts at coordinate 0,
// and the terminating one at coordinate (len, len+1).
for (uint32 bi=0; bi<errorProfile.size(); bi++) {
- if (errorProfile[bi].dev.mean() != 0)
+ if (errorProfile[bi].mean != 0)
continue;
// Set any initial zero coverage area to the next one.
if (bi == 0) {
- errorProfile[bi].dev = errorProfile[bi+1].dev;
+ errorProfile[bi].mean = errorProfile[bi+1].mean;
+ errorProfile[bi].stddev = errorProfile[bi+1].stddev;
}
// Set intermediate ones to the average.
else if (bi < errorProfile.size() - 2) {
//writeLog("errorProfile()-- tig %u no overlap coverage %u-%u\n", id(), errorProfile[bi].bgn, errorProfile[bi].end);
- errorProfile[bi].dev = stdDev<float>((errorProfile[bi-1].dev.mean() + errorProfile[bi+1].dev.mean()) / 2,
- (errorProfile[bi-1].dev.stddev() + errorProfile[bi+1].dev.stddev()) / 2,
- 1);
+ errorProfile[bi].mean = (errorProfile[bi-1].mean + errorProfile[bi+1].mean) / 2;
+ errorProfile[bi].stddev = (errorProfile[bi-1].stddev + errorProfile[bi+1].stddev) / 2;
}
// Set the last two - the last real one and the terminator - to the previous one.
else {
- errorProfile[bi].dev = errorProfile[bi-1].dev;
+ errorProfile[bi].mean = errorProfile[bi-1].mean;
+ errorProfile[bi].stddev = errorProfile[bi-1].stddev;
}
}
@@ -348,8 +371,6 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
}
}
-
-
//writeLog("errorProfile()-- tig %u generated " F_SIZE_T " profile regions with " F_U64 " overlap pieces.\n",
// id(), errorProfile.size(), nPieces);
}
@@ -375,6 +396,12 @@ Unitig::overlapConsistentWithTig(double deviations,
assert(bgn < getLength());
assert(end <= getLength());
+ // If this is a singleton tig - we should only be here when finding graph edges to repeats -
+ // we've got nothing to go on, so default to 'consistent'.
+
+ if (errorProfile.size() == 0)
+ return(1.0);
+
// Coarse search to find the first index that is after our region.
#undef BINARY_SEARCH
@@ -504,10 +531,9 @@ Unitig::reportErrorProfile(const char *prefix, const char *label) {
if (F) {
for (uint32 ii=0; ii<errorProfile.size(); ii++)
- fprintf(F, "%u %u %f +- %f (%u overlaps)\n",
- errorProfile[ii].bgn, errorProfile[ii].end,
- errorProfile[ii].dev.mean(), errorProfile[ii].dev.stddev(),
- errorProfile[ii].dev.size());
+ fprintf(F, "%u %u %.5f +- %.5f\n",
+ errorProfile[ii].bgn, errorProfile[ii].end,
+ errorProfile[ii].mean, errorProfile[ii].stddev);
fclose(F);
}
@@ -522,14 +548,13 @@ Unitig::reportErrorProfile(const char *prefix, const char *label) {
for (uint32 ii=0; ii<errorProfileIndex.size(); ii++) {
uint32 xx = errorProfileIndex[ii];
- fprintf(F, "index[%u] = %u -- errorProfile[] = %u-%u %.6f +- %.6f (%u values)\n",
+ fprintf(F, "index[%u] = %u -- errorProfile[] = %u-%u %.6f +- %.6f\n",
ii,
xx,
errorProfile[xx].bgn,
errorProfile[xx].end,
- errorProfile[xx].dev.mean(),
- errorProfile[xx].dev.stddev(),
- errorProfile[xx].dev.size());
+ errorProfile[xx].mean,
+ errorProfile[xx].stddev);
}
fclose(F);
}
diff --git a/src/bogart/AS_BAT_Unitig.H b/src/bogart/AS_BAT_Unitig.H
index 80b3e2d..0f3341f 100644
--- a/src/bogart/AS_BAT_Unitig.H
+++ b/src/bogart/AS_BAT_Unitig.H
@@ -44,10 +44,14 @@
#include "stddev.H"
#include <vector>
+#include <set>
#include <algorithm>
-class BestEdgeOverlap;
+using namespace std;
+
+class BestEdgeOverlap;
+class optPos;
class SeqInterval {
@@ -88,8 +92,44 @@ public:
+// True if A is contained in B.
+inline
+bool
+isContained(int32 Abgn, int32 Aend,
+ int32 Bbgn, int32 Bend) {
+ assert(Abgn < Aend);
+ assert(Bbgn < Bend);
+ return((Bbgn <= Abgn) &&
+ (Aend <= Bend));
+}
+
+inline
+bool
+isContained(SeqInterval &A, SeqInterval &B) {
+ return((B.min() <= A.min()) &&
+ (A.max() <= B.max()));
+}
+
+// True if the A and B intervals overlap
+inline
+bool
+isOverlapping(int32 Abgn, int32 Aend,
+ int32 Bbgn, int32 Bend) {
+ assert(Abgn < Aend);
+ assert(Bbgn < Bend);
+ return((Abgn < Bend) &&
+ (Bbgn < Aend));
+}
+
+inline
+bool
+isOverlapping(SeqInterval &A, SeqInterval &B) {
+ return((A.min() < B.max()) &&
+ (B.min() < A.max()));
+}
+
// Derived from IntMultiPos, but removes some of the data (48b in IntMultiPos, 32b in struct
@@ -132,8 +172,6 @@ public:
-
-
class Unitig {
private:
Unitig(TigVector *v) {
@@ -142,7 +180,6 @@ private:
_id = 0;
_isUnassembled = false;
- _isBubble = false;
_isRepeat = false;
_isCircular = false;
};
@@ -162,17 +199,31 @@ public:
//void bubbleSortLastRead(void);
void reverseComplement(bool doSort=true);
+ // Ensure that the children are sorted by begin position,
+ // and that unitigs start at position zero.
void cleanUp(void);
- // getNumRandomRead() is a placeholder, random reads should not
- // contain guides, or other reads that are not randomly sampled
- // across the whole genome.
-
- uint32 id(void) { return(_id); }; // ID internal to bogart
+ // Recompute bgn/end positions using all overlaps.
+ void optimize_initPlace(uint32 pp,
+ optPos *op,
+ optPos *np,
+ bool firstPass,
+ set<uint32> &failed,
+ bool beVerbose);
+ void optimize_recompute(uint32 ii,
+ optPos *op,
+ optPos *np,
+ bool beVerbose);
+ void optimize_expand(optPos *op);
+ void optimize_setPositions(optPos *op,
+ bool beVerbose);
+ void optimize(const char *prefix, const char *label);
+
+
+ uint32 id(void) { return(_id); };
int32 getLength(void) { return(_length); };
uint32 getNumReads(void) { return(ufpath.size()); };
- uint32 getNumRandomReads(void) { return(getNumReads()); };
// Place 'read' using an edge to some read in this tig. The edge is from 'read3p' end.
//
@@ -190,21 +241,29 @@ public:
epValue(uint32 b, uint32 e) {
bgn = b;
end = e;
+ mean = 0;
+ stddev = 0;
+ };
+
+ epValue(uint32 b, uint32 e, float m, float s) {
+ bgn = b;
+ end = e;
+ mean = m;
+ stddev = s;
};
double max(double deviations) {
- return(dev.mean() + deviations * dev.stddev());
+ return(mean + deviations * stddev);
};
bool operator<(const epValue &that) const { return(bgn < that.bgn); };
bool operator<(const uint32 &that) const { return(bgn < that); };
-
-
uint32 bgn;
uint32 end;
- stdDev<float> dev;
+ float mean;
+ float stddev;
};
static size_t epValueSize(void) { return(sizeof(epValue)); };
@@ -229,6 +288,8 @@ public:
for (uint32 fi=1; (fi < ufpath.size()) && (rd5->position.min() != 0); fi++)
rd5 = &ufpath[fi];
+ if (rd5->position.min() != 0)
+ fprintf(stderr, "ERROR: firstRead() in tig %u doesn't start at the start\n", id());
assert(rd5->position.min() == 0);
return(rd5);
@@ -242,6 +303,8 @@ public:
for (uint32 fi=ufpath.size()-1; (fi-- > 0) && (rd3->position.max() != getLength()); )
rd3 = &ufpath[fi];
+ if (rd3->position.max() != getLength())
+ fprintf(stderr, "ERROR: lastRead() in tig %u doesn't end at the end\n", id());
assert(rd3->position.max() == getLength());
return(rd3);
@@ -271,21 +334,11 @@ private:
uint32 _id;
public:
- // Classification. The output is in three files: 'unassembled', 'bubbles', 'contigs' (defined as
- // not unassembled and not bubble).
-
- uint32 _isUnassembled; // Is a single read or a pseudo singleton
- uint32 _isBubble; // Annotation: from a failed bubble pop
- uint32 _isRepeat; // Annotation: from an identified repeat region
- uint32 _isCircular; // Annotation: has overlap to self
-
- char type(void) {
- if (_isUnassembled) return('U');
- if (_isBubble) return('B');
- if (_isRepeat) return('R');
- if (_isCircular) return('C');
- return('N');
- }
+ // Classification.
+
+ bool _isUnassembled; // Is a single read or a pseudo singleton.
+ bool _isRepeat; // Is from an identified repeat region.
+ bool _isCircular; // Is (probably) a circular tig.
};
diff --git a/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C b/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C
index 188b2e8..5f3bda8 100644
--- a/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C
+++ b/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C
@@ -70,6 +70,16 @@ placeRead_contained(uint32 readId,
// We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
// Reset the end coordinate using the actual length of the read.
+#if 0
+#warning NOT RESETTING fMax BASED ON READ LENGTH
+ writeLog("placeCont()-- read %u %d-%d with hangs %d %d places read %u at %d-%d reset to %d\n",
+ parent.ident,
+ parent.position.min(), parent.position.max(),
+ ahang, bhang,
+ readId,
+ fMin, fMax,
+ fMin + RI->readLength(readId));
+#endif
fMax = fMin + RI->readLength(readId);
// Orientation is straightforward, based on the orient of the parent, and the flipped flag.
@@ -88,11 +98,12 @@ placeRead_contained(uint32 readId,
read.position.end = (fFwd) ? fMax : fMin;
#ifdef DEBUG_PLACE_READ
- writeLog("placeCont()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d C' -- placed %7d-%7d oriented %s %7d-%7d\n",
+ writeLog("placeCont()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d C' -- placed %7d-%7d oriented %s %7d-%7d %f%% of length\n",
parent.ident, parent.position.bgn, parent.position.end,
edge->readId(), (edge->read3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
readId,
- fMin, fMax, (fFwd) ? "rev" : "fwd", read.position.bgn, read.position.end);
+ fMin, fMax, (fFwd) ? "rev" : "fwd", read.position.bgn, read.position.end,
+ 100.0 * (read.position.max() - read.position.min()) / RI->readLength(readId));
#endif
return(read);
@@ -174,6 +185,16 @@ placeRead_dovetail(uint32 readId,
// We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
// Reset the end coordinate using the actual length of the read.
+#if 0
+#warning NOT RESETTING fMax BASED ON READ LENGTH
+ writeLog("placeDovs()-- read %u %d-%d with hangs %d %d places read %u at %d-%d reset to %d\n",
+ parent.ident,
+ parent.position.min(), parent.position.max(),
+ ahang, bhang,
+ readId,
+ fMin, fMax,
+ fMin + RI->readLength(readId));
+#endif
fMax = fMin + RI->readLength(readId);
@@ -205,11 +226,12 @@ placeRead_dovetail(uint32 readId,
read.position.end = (fFwd) ? fMax : fMin;
#ifdef DEBUG_PLACE_READ
- writeLog("placeDove()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d %c' -- placed %7d-%7d oriented %s %7d-%7d\n",
+ writeLog("placeDove()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d %c' -- placed %7d-%7d oriented %s %7d-%7d %f%% of length\n",
parent.ident, parent.position.bgn, parent.position.end,
edge->readId(), (edge->read3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
readId, (read3p) ? '3' : '5',
- fMin, fMax, (fFwd) ? "rev" : "fwd", read.position.bgn, read.position.end);
+ fMin, fMax, (fFwd) ? "rev" : "fwd", read.position.bgn, read.position.end,
+ 100.0 * (read.position.max() - read.position.min()) / RI->readLength(readId));
#endif
return(read);
diff --git a/src/bogart/bogart.C b/src/bogart/bogart.C
index ec0a35f..cd2f616 100644
--- a/src/bogart/bogart.C
+++ b/src/bogart/bogart.C
@@ -58,6 +58,8 @@
#include "AS_BAT_SplitDiscontinuous.H"
+#include "AS_BAT_DropDeadEnds.H"
+
#include "AS_BAT_PromoteToSingleton.H"
#include "AS_BAT_CreateUnitigs.H"
@@ -76,8 +78,7 @@ ChunkGraph *CG = 0L;
int
main (int argc, char * argv []) {
char *gkpStorePath = NULL;
- char *ovlStoreUniqPath = NULL;
- char *ovlStoreReptPath = NULL;
+ char *ovlStorePath = NULL;
double erateGraph = 0.075;
double erateMax = 0.100;
@@ -86,14 +87,15 @@ main (int argc, char * argv []) {
bool filterHighError = true;
bool filterLopsided = true;
bool filterSpur = true;
+ bool filterDeadEnds = true;
uint64 genomeSize = 0;
uint32 fewReadsNumber = 2; // Parameters for labeling of unassembled; also set in pipelines/canu/Defaults.pm
- uint32 tooShortLength = 1000;
- double spanFraction = 0.75;
- double lowcovFraction = 0.75;
- uint32 lowcovDepth = 2;
+ uint32 tooShortLength = 0;
+ double spanFraction = 1.0;
+ double lowcovFraction = 0.5;
+ uint32 lowcovDepth = 5;
double deviationGraph = 6.0;
double deviationBubble = 6.0;
@@ -111,7 +113,9 @@ main (int argc, char * argv []) {
char *prefix = NULL;
uint32 minReadLen = 0;
- uint32 minOverlap = 500;
+ uint32 minOverlapLen = 500;
+ uint32 minIntersectLen = 500;
+ uint32 maxPlacements = 2;
argc = AS_configure(argc, argv);
@@ -125,37 +129,66 @@ main (int argc, char * argv []) {
gkpStorePath = argv[++arg];
} else if (strcmp(argv[arg], "-O") == 0) {
- if (ovlStoreUniqPath == NULL)
- ovlStoreUniqPath = argv[++arg];
- else if (ovlStoreReptPath == NULL)
- ovlStoreReptPath = argv[++arg];
- else
- err.push_back(NULL);
+ ovlStorePath = argv[++arg];
} else if (strcmp(argv[arg], "-gs") == 0) {
genomeSize = strtoull(argv[++arg], NULL, 10);
} else if (strcmp(argv[arg], "-unassembled") == 0) {
- fewReadsNumber = atoi(argv[++arg]);
- tooShortLength = atoi(argv[++arg]);
- spanFraction = atof(argv[++arg]);
- lowcovFraction = atof(argv[++arg]);
- lowcovDepth = atoi(argv[++arg]);
+ uint32 invalid = 0;
+
+ if ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
+ fewReadsNumber = atoi(argv[++arg]);
+ else
+ invalid++;
+
+ if ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
+ tooShortLength = atoi(argv[++arg]);
+ else
+ invalid++;
+
+ if ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
+ spanFraction = atof(argv[++arg]);
+ else
+ invalid++;
+
+ if ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
+ lowcovFraction = atof(argv[++arg]);
+ else
+ invalid++;
+
+ if ((arg + 1 < argc) && (argv[arg + 1][0] != '-'))
+ lowcovDepth = atoi(argv[++arg]);
+ else
+ invalid++;
+
+ if (invalid) {
+ char *s = new char [1024];
+ snprintf(s, 1024, "Too few parameters to -unassembled option.\n");
+ err.push_back(s);
+ }
- } else if (strcmp(argv[arg], "-RL") == 0) {
+ } else if ((strcmp(argv[arg], "-mr") == 0) ||
+ (strcmp(argv[arg], "-RL") == 0)) { // Deprecated
minReadLen = atoi(argv[++arg]);
+ } else if ((strcmp(argv[arg], "-mo") == 0) ||
+ (strcmp(argv[arg], "-el") == 0)) { // Deprecated
+ minOverlapLen = atoi(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-mi") == 0) {
+ minIntersectLen = atoi(argv[++arg]);
+ } else if (strcmp(argv[arg], "-mp") == 0) {
+ maxPlacements = atoi(argv[++arg]);
} else if (strcmp(argv[arg], "-threads") == 0) {
- numThreads = atoi(argv[++arg]);
+ if ((numThreads = atoi(argv[++arg])) > 0)
+ omp_set_num_threads(numThreads);
} else if (strcmp(argv[arg], "-eg") == 0) {
erateGraph = atof(argv[++arg]);
} else if (strcmp(argv[arg], "-eM") == 0) {
erateMax = atof(argv[++arg]);
- } else if (strcmp(argv[arg], "-el") == 0) {
- minOverlap = atoi(argv[++arg]);
-
} else if (strcmp(argv[arg], "-ca") == 0) { // Edge confused, based on absolute difference
confusedAbsolute = atoi(argv[++arg]);
} else if (strcmp(argv[arg], "-cp") == 0) { // Edge confused, based on percent difference
@@ -174,6 +207,7 @@ main (int argc, char * argv []) {
filterHighError = ((arg >= argc) || (strcasestr(argv[arg], "higherror") == NULL));
filterLopsided = ((arg >= argc) || (strcasestr(argv[arg], "lopsided") == NULL));
filterSpur = ((arg >= argc) || (strcasestr(argv[arg], "spur") == NULL));
+ filterDeadEnds = ((arg >= argc) || (strcasestr(argv[arg], "deadends") == NULL));
} else if (strcmp(argv[arg], "-M") == 0) {
ovlCacheMemory = (uint64)(atof(argv[++arg]) * 1024 * 1024 * 1024);
@@ -238,11 +272,11 @@ main (int argc, char * argv []) {
arg++;
}
- if (erateGraph < 0.0) err.push_back("Invalid overlap error threshold (-eg option); must be at least 0.0.\n");
- if (erateMax < 0.0) err.push_back("Invalid overlap error threshold (-eM option); must be at least 0.0.\n");
- if (prefix == NULL) err.push_back("No output prefix name (-o option) supplied.\n");
- if (gkpStorePath == NULL) err.push_back("No gatekeeper store (-G option) supplied.\n");
- if (ovlStoreUniqPath == NULL) err.push_back("No overlap store (-O option) supplied.\n");
+ if (erateGraph < 0.0) err.push_back("Invalid overlap error threshold (-eg option); must be at least 0.0.\n");
+ if (erateMax < 0.0) err.push_back("Invalid overlap error threshold (-eM option); must be at least 0.0.\n");
+ if (prefix == NULL) err.push_back("No output prefix name (-o option) supplied.\n");
+ if (gkpStorePath == NULL) err.push_back("No gatekeeper store (-G option) supplied.\n");
+ if (ovlStorePath == NULL) err.push_back("No overlap store (-O option) supplied.\n");
if (err.size() > 0) {
fprintf(stderr, "usage: %s -o outputName -O ovlStore -G gkpStore -T tigStore\n", argv[0]);
@@ -256,8 +290,11 @@ main (int argc, char * argv []) {
fprintf(stderr, "\n");
fprintf(stderr, " -gs Genome size in bases.\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -RL len Force reads below 'len' bases to be singletons.\n");
- fprintf(stderr, " This WILL cause CGW to fail; diagnostic only.\n");
+ fprintf(stderr, " -mr len Force reads below 'len' bases to be singletons.\n");
+ fprintf(stderr, " -mo len Ignore overlaps shorter than 'len' bases.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -mi len Create unitigs from contig intersections of at least 'len' bases.\n");
+ fprintf(stderr, " -mp num Create unitigs from contig intersections with at most 'num' placements.\n");
fprintf(stderr, "\n");
fprintf(stderr, " -nofilter [suspicious],[higherror],[lopsided],[spur]\n");
fprintf(stderr, " Disable filtering of:\n");
@@ -284,9 +321,6 @@ main (int argc, char * argv []) {
fprintf(stderr, " -eM 0.05 no more than 0.05 fraction (5.0%%) error in any overlap loaded into bogart\n");
fprintf(stderr, " the maximum used will ALWAYS be at leeast the maximum of the four error rates\n");
fprintf(stderr, "\n");
- fprintf(stderr, " For all, the lower limit on overlap length\n");
- fprintf(stderr, " -el 500 no shorter than 40 bases\n");
- fprintf(stderr, "\n");
fprintf(stderr, "Overlap Storage\n");
fprintf(stderr, "\n");
fprintf(stderr, " -M gb Use at most 'gb' gigabytes of memory for storing overlaps.\n");
@@ -301,9 +335,6 @@ main (int argc, char * argv []) {
fprintf(stderr, " %s\n", logFileFlagNames[l]);
fprintf(stderr, "\n");
- if ((ovlStoreUniqPath != NULL) && (ovlStoreUniqPath == ovlStoreReptPath))
- fprintf(stderr, "Too many overlap stores (-O option) supplied.\n");
-
for (uint32 ii=0; ii<err.size(); ii++)
if (err[ii])
fputs(err[ii], stderr);
@@ -312,28 +343,41 @@ main (int argc, char * argv []) {
}
fprintf(stderr, "\n");
- fprintf(stderr, "Graph error threshold = %.3f (%.3f%%)\n", erateGraph, erateGraph * 100);
- fprintf(stderr, "Max error threshold = %.3f (%.3f%%)\n", erateMax, erateMax * 100);
+ fprintf(stderr, "==> PARAMETERS.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Resources:\n");
+ fprintf(stderr, " Memory " F_U64 " GB\n", ovlCacheMemory >> 30);
+ fprintf(stderr, " Compute Threads %d (%s)\n", omp_get_max_threads(), (numThreads > 0) ? "command line" : "OpenMP default");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Lengths:\n");
+ fprintf(stderr, " Minimum read %u bases\n", minReadLen);
+ fprintf(stderr, " Minimum overlap %u bases\n", minOverlapLen);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Overlap Error Rates:\n");
+ fprintf(stderr, " Graph %.3f (%.3f%%)\n", erateGraph, erateGraph * 100);
+ fprintf(stderr, " Max %.3f (%.3f%%)\n", erateMax, erateMax * 100);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Deviations:\n");
+ fprintf(stderr, " Graph %.3f\n", deviationGraph);
+ fprintf(stderr, " Bubble %.3f\n", deviationBubble);
+ fprintf(stderr, " Repeat %.3f\n", deviationRepeat);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Edge Confusion:\n");
+ fprintf(stderr, " Absolute %d\n", confusedAbsolute);
+ fprintf(stderr, " Percent %.4f\n", confusedPercent);
fprintf(stderr, "\n");
- fprintf(stderr, "Minimum overlap length = %u bases\n", minOverlap);
+ fprintf(stderr, "Unitig Construction:\n");
+ fprintf(stderr, " Minimum intersection %u bases\n", minIntersectLen);
+ fprintf(stderr, " Maxiumum placements %u positions\n", maxPlacements);
fprintf(stderr, "\n");
+ fprintf(stderr, "Debugging Enabled:\n");
- if (numThreads > 0) {
- omp_set_num_threads(numThreads);
- fprintf(stderr, "number of threads = %d (command line)\n", numThreads);
- fprintf(stderr, "\n");
- } else {
- fprintf(stderr, "number of threads = %d (OpenMP default)\n", omp_get_max_threads());
- fprintf(stderr, "\n");
- }
+ if (logFileFlags == 0)
+ fprintf(stderr, " (none)\n");
for (uint64 i=0, j=1; i<64; i++, j<<=1)
if (logFileFlagSet(j))
- fprintf(stderr, "DEBUG = %s\n", logFileFlagNames[i]);
-
- gkStore *gkpStore = gkStore::gkStore_open(gkpStorePath);
- ovStore *ovlStoreUniq = new ovStore(ovlStoreUniqPath, gkpStore);
- ovStore *ovlStoreRept = ovlStoreReptPath ? new ovStore(ovlStoreReptPath, gkpStore) : NULL;
+ fprintf(stderr, " %s\n", logFileFlagNames[i]);
writeStatus("\n");
writeStatus("==> LOADING AND FILTERING OVERLAPS.\n");
@@ -341,17 +385,11 @@ main (int argc, char * argv []) {
setLogFile(prefix, "filterOverlaps");
- RI = new ReadInfo(gkpStore, prefix, minReadLen);
- OC = new OverlapCache(gkpStore, ovlStoreUniq, ovlStoreRept, prefix, MAX(erateMax, erateGraph), minOverlap, ovlCacheMemory, genomeSize, doSave);
+ RI = new ReadInfo(gkpStorePath, prefix, minReadLen);
+ OC = new OverlapCache(ovlStorePath, prefix, MAX(erateMax, erateGraph), minOverlapLen, ovlCacheMemory, genomeSize, doSave);
OG = new BestOverlapGraph(erateGraph, deviationGraph, prefix, filterSuspicious, filterHighError, filterLopsided, filterSpur);
CG = new ChunkGraph(prefix);
- delete ovlStoreUniq; ovlStoreUniq = NULL;
- delete ovlStoreRept; ovlStoreRept = NULL;
-
- gkpStore->gkStore_close();
- gkpStore = NULL;
-
//
// Build the initial unitig path from non-contained reads. The first pass is usually the
// only one needed, but occasionally (maybe) we miss reads, so we make an explicit pass
@@ -375,7 +413,12 @@ main (int argc, char * argv []) {
breakSingletonTigs(contigs);
- reportOverlaps(contigs, prefix, "buildGreedy");
+ // populateUnitig() uses only one hang from one overlap to compute the positions of reads.
+ // Once all reads are (approximately) placed, compute positions using all overlaps.
+
+ contigs.optimizePositions(prefix, "buildGreedy");
+
+ //reportOverlaps(contigs, prefix, "buildGreedy");
reportTigs(contigs, prefix, "buildGreedy", genomeSize);
//
@@ -404,7 +447,15 @@ main (int argc, char * argv []) {
placeUnplacedUsingAllOverlaps(contigs, prefix);
- reportOverlaps(contigs, prefix, "placeContains");
+ // Compute positions again. This fixes issues with contains-in-contains that
+ // tend to excessively shrink reads. The one case debugged placed contains in
+ // a three read nanopore contig, where one of the contained reads shrank by 10%,
+ // which was enough to swap bgn/end coords when they were computed using hangs
+ // (that is, sum of the hangs was bigger than the placed read length).
+
+ contigs.optimizePositions(prefix, "placeContains");
+
+ //reportOverlaps(contigs, prefix, "placeContains");
reportTigs(contigs, prefix, "placeContains", genomeSize);
//
@@ -423,10 +474,20 @@ main (int argc, char * argv []) {
mergeOrphans(contigs, deviationBubble);
//checkUnitigMembership(contigs);
- reportOverlaps(contigs, prefix, "mergeOrphans");
+ //reportOverlaps(contigs, prefix, "mergeOrphans");
reportTigs(contigs, prefix, "mergeOrphans", genomeSize);
//
+ // Initial construction done. Classify what we have as assembled or unassembled.
+ //
+
+ classifyTigsAsUnassembled(contigs,
+ fewReadsNumber,
+ tooShortLength,
+ spanFraction,
+ lowcovFraction, lowcovDepth);
+
+ //
// Generate a new graph using only edges that are compatible with existing tigs.
//
@@ -457,11 +518,14 @@ main (int argc, char * argv []) {
setLogFile(prefix, "breakRepeats");
contigs.computeErrorProfiles(prefix, "repeats");
+ contigs.reportErrorProfiles(prefix, "repeats");
- markRepeatReads(AG, contigs, deviationRepeat, confusedAbsolute, confusedPercent);
+ vector<confusedEdge> confusedEdges;
+
+ markRepeatReads(AG, contigs, deviationRepeat, confusedAbsolute, confusedPercent, confusedEdges);
//checkUnitigMembership(contigs);
- reportOverlaps(contigs, prefix, "markRepeatReads");
+ //reportOverlaps(contigs, prefix, "markRepeatReads");
reportTigs(contigs, prefix, "markRepeatReads", genomeSize);
//
@@ -475,9 +539,15 @@ main (int argc, char * argv []) {
setLogFile(prefix, "cleanupMistakes");
- splitDiscontinuous(contigs, minOverlap);
+ splitDiscontinuous(contigs, minOverlapLen);
promoteToSingleton(contigs);
+ if (filterDeadEnds) {
+ dropDeadEnds(AG, contigs);
+ splitDiscontinuous(contigs, minOverlapLen);
+ promoteToSingleton(contigs);
+ }
+
writeStatus("\n");
writeStatus("==> CLEANUP GRAPH.\n");
writeStatus("\n");
@@ -491,12 +561,6 @@ main (int argc, char * argv []) {
setLogFile(prefix, "generateOutputs");
- classifyTigsAsUnassembled(contigs,
- fewReadsNumber,
- tooShortLength,
- spanFraction,
- lowcovFraction, lowcovDepth);
-
//checkUnitigMembership(contigs);
reportOverlaps(contigs, prefix, "final");
reportTigs(contigs, prefix, "final", genomeSize);
@@ -507,42 +571,7 @@ main (int argc, char * argv []) {
AG = NULL;
//
- // Generate outputs. The graph MUST come after output, because it needs
- // the tigStore tigID.
- //
-
- setParentAndHang(contigs);
- writeTigsToStore(contigs, prefix, "ctg", true);
-
- vector<tigLoc> unitigSource; // Needed only to pass something to reportTigGraph.
-
- setLogFile(prefix, "tigGraph");
-
- reportTigGraph(contigs, unitigSource, prefix, "contigs");
-
- //
- // Generate unitigs
- //
- // We want to split the contigs at any potential bubble, so this needs to be
- // at least the 'bubble' deviation. We don't really want to split at confirmed
- // repeats, but we have no way of telling repeat from bubble yet.
- //
-
- writeStatus("\n");
- writeStatus("==> GENERATE UNITIGS.\n");
- writeStatus("\n");
-
- setLogFile(prefix, "generateUnitigs");
-
- contigs.computeErrorProfiles(prefix, "generateUnitigs");
- contigs.reportErrorProfiles(prefix, "generateUnitigs");
-
- AssemblyGraph *EG = new AssemblyGraph(prefix,
- deviationBubble,
- contigs,
- true);
-
-
+ // unitigSource:
//
// We want some way of tracking unitigs that came from the same contig. Ideally,
// we'd be able to emit only the edges that would join unitigs into the original
@@ -559,30 +588,53 @@ main (int argc, char * argv []) {
// good first attempt.
//
- createUnitigs(EG, contigs, unitigs, unitigSource);
+ vector<tigLoc> unitigSource;
- delete EG;
+ // The graph must come first, to find circular contigs.
- splitDiscontinuous(unitigs, minOverlap, unitigSource);
+ reportTigGraph(contigs, unitigSource, prefix, "contigs");
- setParentAndHang(unitigs);
- writeTigsToStore(unitigs, prefix, "utg", true);
+ setParentAndHang(contigs);
+ writeTigsToStore(contigs, prefix, "ctg", true);
setLogFile(prefix, "tigGraph");
+ writeStatus("\n");
+ writeStatus("==> GENERATE UNITIGS.\n");
+ writeStatus("\n");
+
+ setLogFile(prefix, "generateUnitigs");
+
+ contigs.computeErrorProfiles(prefix, "generateUnitigs");
+ contigs.reportErrorProfiles(prefix, "generateUnitigs");
+
+ createUnitigs(contigs, unitigs, minIntersectLen, maxPlacements, confusedEdges, unitigSource);
+
+ splitDiscontinuous(unitigs, minOverlapLen, unitigSource);
+
reportTigGraph(unitigs, unitigSource, prefix, "unitigs");
+ setParentAndHang(unitigs);
+ writeTigsToStore(unitigs, prefix, "utg", true);
+
//
// Tear down bogart.
//
+ // How bizarre. Human regression of 2017-07-28-2128 deadlocked (apparently) when deleting OC.
+ // It had 31 threads in futex_wait, thread 1 was in delete of the second block of data. CPU
+  // usage was 100% IIRC.  Reproducible, at least twice, possibly three times.  setLogFilePrefix
+ // was moved before the deletes in hope that it'll close down threads. Certainly, it should
+ // close thread output files from createUnitigs.
+
+ setLogFile(prefix, NULL); // Close files.
+ omp_set_num_threads(1); // Hopefully kills off other threads.
+
delete CG;
delete OG;
delete OC;
delete RI;
- setLogFile(prefix, NULL);
-
writeStatus("\n");
writeStatus("Bye.\n");
diff --git a/src/bogart/bogart.mk b/src/bogart/bogart.mk
index 827863e..acf2711 100644
--- a/src/bogart/bogart.mk
+++ b/src/bogart/bogart.mk
@@ -13,10 +13,12 @@ SOURCES := bogart.C \
AS_BAT_BestOverlapGraph.C \
AS_BAT_ChunkGraph.C \
AS_BAT_CreateUnitigs.C \
+ AS_BAT_DropDeadEnds.C \
AS_BAT_Instrumentation.C \
AS_BAT_Logging.C \
AS_BAT_MarkRepeatReads.C \
AS_BAT_MergeOrphans.C \
+ AS_BAT_OptimizePositions.C \
AS_BAT_Outputs.C \
AS_BAT_OverlapCache.C \
AS_BAT_PlaceContains.C \
diff --git a/src/canu_version_update.pl b/src/canu_version_update.pl
index a9ded13..919c141 100755
--- a/src/canu_version_update.pl
+++ b/src/canu_version_update.pl
@@ -32,7 +32,7 @@ my $cwd = getcwd();
my $label = "snapshot"; # Change this to 'release' just before making a release.
my $major = "1"; # ...and this too.
-my $minor = "5"; # ...and this too.
+my $minor = "6"; # ...and this too.
my $commits = "0";
my $hash1 = undef; # This from 'git describe'
@@ -124,7 +124,7 @@ if (defined($dirty)) {
} elsif (defined($hash1)) {
print F "#define CANU_VERSION \"Canu snapshot ($hash1)\\n\"\n";
} else {
- print F "#define CANU_VERSION \"Canu $label v$major.$minor\\n\"\n";
+ print F "#define CANU_VERSION \"Canu $major.$minor\\n\"\n";
}
close(F);
diff --git a/src/falcon_sense/falcon_sense.C b/src/falcon_sense/falcon_sense.C
index cd30680..b4e862e 100644
--- a/src/falcon_sense/falcon_sense.C
+++ b/src/falcon_sense/falcon_sense.C
@@ -74,8 +74,8 @@ main (int argc, char **argv) {
} else if (strcmp(argv[arg], "--max_read_len") == 0) {
max_read_len = atoi(argv[++arg]);
- if (max_read_len <= 0 || max_read_len > AS_MAX_READLEN) {
- max_read_len = AS_MAX_READLEN;
+ if (max_read_len <= 0 || max_read_len > 2*AS_MAX_READLEN) {
+ max_read_len = 2*AS_MAX_READLEN;
}
} else {
diff --git a/src/falcon_sense/libfalcon/falcon.C b/src/falcon_sense/libfalcon/falcon.C
index ef3f4c8..3fd666f 100644
--- a/src/falcon_sense/libfalcon/falcon.C
+++ b/src/falcon_sense/libfalcon/falcon.C
@@ -644,6 +644,10 @@ consensus_data * generate_consensus( vector<string> input_seq,
tags_list = (align_tags_t **)calloc( seq_count, sizeof(align_tags_t*) );
#pragma omp parallel for schedule(dynamic)
for (uint32 j=0; j < seq_count; j++) {
+ // if the current sequence is too long, truncate it to be shorter
+ if (input_seq[j].size() > input_seq[0].size()) {
+ input_seq[j].resize(input_seq[0].size());
+ }
int tolerance = (int)ceil((double)min(input_seq[j].length(), input_seq[0].length())*max_diff*1.1);
EdlibAlignResult align = edlibAlign(input_seq[j].c_str(), input_seq[j].size()-1, input_seq[0].c_str(), input_seq[0].size()-1, edlibNewAlignConfig(tolerance, EDLIB_MODE_HW, EDLIB_TASK_PATH));
if (align.numLocations >= 1 && align.endLocations[0] - align.startLocations[0] > min_len && ((float)align.editDistance / (align.endLocations[0]-align.startLocations[0]) < max_diff)) {
diff --git a/src/fastq-utilities/fastqSample.C b/src/fastq-utilities/fastqSample.C
index 15716a5..a68ac55 100644
--- a/src/fastq-utilities/fastqSample.C
+++ b/src/fastq-utilities/fastqSample.C
@@ -31,6 +31,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Sergey Koren beginning on 2017-JUN-13
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -42,7 +46,7 @@
using namespace std;
-#define MAXLEN 1024*1024
+#define MAXLEN 1024*1024*50
class aRead {
public:
@@ -189,7 +193,7 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-g") == 0) {
- GENOMESIZE = atoi(argv[++arg]);
+ GENOMESIZE = atol(argv[++arg]);
} else if (strcmp(argv[arg], "-c") == 0) {
COVERAGE = atof(argv[++arg]);
diff --git a/src/gfa/alignGFA.C b/src/gfa/alignGFA.C
index f033a3c..5015376 100644
--- a/src/gfa/alignGFA.C
+++ b/src/gfa/alignGFA.C
@@ -33,6 +33,11 @@
#include "AS_UTL_reverseComplement.H"
#include "gfa.H"
+#include "bed.H"
+
+#define IS_GFA 1
+#define IS_BED 2
+
class sequence {
@@ -60,6 +65,55 @@ public:
+class sequences {
+public:
+ sequences(char *tigName, uint32 tigVers) {
+ tgStore *tigStore = new tgStore(tigName, tigVers);
+
+ b = 0;
+ e = tigStore->numTigs();
+ seqs = new sequence [e+1];
+ used = new uint32 [e+1];
+
+ for (uint32 ti=b; ti < e; ti++) {
+ tgTig *tig = tigStore->loadTig(ti);
+
+ used[ti] = 0;
+
+ if (tig == NULL)
+ continue;
+
+ seqs[ti].set(tig);
+
+ tigStore->unloadTig(ti);
+ }
+
+ delete tigStore;
+ };
+
+ ~sequences() {
+ delete [] seqs;
+ delete [] used;
+ };
+
+ sequence &operator[](uint32 xx) {
+ if (xx < e)
+ return(seqs[xx]);
+
+ fprintf(stderr, "ERROR: sequence id %u out of range b=%u e=%u\n", xx, b, e);
+
+ assert(xx < e);
+ return(seqs[0]);
+ };
+
+ uint32 b;
+ uint32 e;
+ sequence *seqs;
+ uint32 *used;
+};
+
+
+
void
dotplot(uint32 Aid, bool Afwd, char *Aseq,
uint32 Bid, bool Bfwd, char *Bseq) {
@@ -99,15 +153,14 @@ dotplot(uint32 Aid, bool Afwd, char *Aseq,
-
bool
-checkLink(gfaLink *link,
- sequence *seqs,
- bool beVerbose,
- bool doPlot) {
+checkLink(gfaLink *link,
+ sequences &seqs,
+ bool beVerbose,
+ bool doPlot) {
- char *Aseq = seqs[link->_Aid].seq;
- char *Bseq = seqs[link->_Bid].seq;
+ char *Aseq = seqs[link->_Aid].seq, *Arev = NULL;
+ char *Bseq = seqs[link->_Bid].seq, *Brev = NULL;
int32 Abgn, Aend, Alen = seqs[link->_Aid].len;
int32 Bbgn, Bend, Blen = seqs[link->_Bid].len;
@@ -132,10 +185,9 @@ checkLink(gfaLink *link,
link->_cigar = NULL;
if (link->_Afwd == false)
- reverseComplementSequence(Aseq, Alen);
+ Aseq = Arev = reverseComplementCopy(Aseq, Alen);
if (link->_Bfwd == false)
- reverseComplementSequence(Bseq, Blen);
-
+ Bseq = Brev = reverseComplementCopy(Bseq, Blen);
// Ty to find the end coordinate on B. Align the last bits of A to B.
//
@@ -253,12 +305,10 @@ checkLink(gfaLink *link,
dotplot(link->_Aid, link->_Afwd, Aseq,
link->_Bid, link->_Bfwd, Bseq);
- // Cleanup for the next link->
+ // Cleanup for the next link.
- if (link->_Afwd == false)
- reverseComplementSequence(Aseq, Alen);
- if (link->_Bfwd == false)
- reverseComplementSequence(Bseq, Blen);
+ delete [] Arev;
+ delete [] Brev;
if (beVerbose)
fprintf(stderr, "\n");
@@ -268,123 +318,254 @@ checkLink(gfaLink *link,
+// Align all of B into A. Extend A as needed to make the whole thing fit.
+// Abgn, Aend and score are updated with the alignment.
+//
+bool
+checkRecord_align(char *label,
+ char *Aname, char *Aseq, int32 Alen, int32 &Abgn, int32 &Aend,
+ char *Bname, char *Bseq, int32 Blen,
+ int32 &score,
+ bool beVerbose) {
-int
-main (int argc, char **argv) {
- char *tigName = NULL;
- uint32 tigVers = UINT32_MAX;
+ EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
- char *inGFA = NULL;
- char *otGFA = NULL;
+ int32 editDist = 0;
+ int32 alignLen = 0;
+ int32 alignScore = 0;
+ int32 maxEdit = (int32)ceil(Blen * 0.03); // Should be the same sequence, but allow for a little difference.
+ int32 step = (int32)ceil(Blen * 0.15);
- uint32 verbosity = 0;
+ Aend = min(Aend + 2 * step, Alen); // Limit Aend to the actual length of the contig (consensus can shrink)
+ Abgn = max(Aend - Blen - 2 * step, 0); // Then push Abgn back to make space for the unitig.
- argc = AS_configure(argc, argv);
+ tryAgain:
+ if (beVerbose)
+ fprintf(stderr, "ALIGN %5s utg %s len=%7d to ctg %s %9d-%9d len=%9d",
+ label,
+ Bname, Blen,
+ Aname, Abgn, Aend, Alen);
- int arg=1;
- int err=0;
- while (arg < argc) {
- if (strcmp(argv[arg], "-T") == 0) {
- tigName = argv[++arg];
- tigVers = atoi(argv[++arg]);
+#if 0
+ char N[FILENAME_MAX];
+ FILE *F;
- if (tigVers == 0)
- fprintf(stderr, "invalid tigStore version (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);
+ char ach = Aseq[Aend]; Aseq[Aend] = 0;
+ char bch = Bseq[Bend]; Bseq[Bend] = 0;
- } else if (strcmp(argv[arg], "-i") == 0) {
- inGFA = argv[++arg];
+ sprintf(N, "compare%04d-%04d-ctg%04d.fasta", record->_Aid, record->_Bid, record->_Aid);
+ F = fopen(N, "w");
+ fprintf(F, ">ctg%04d\n%s\n", record->_Aid, Aseq + Abgn);
+ fclose(F);
- } else if (strcmp(argv[arg], "-o") == 0) {
- otGFA = argv[++arg];
+ sprintf(N, "compare%04d-%04d-utg%04d.fasta", record->_Aid, record->_Bid, record->_Bid);
+ F = fopen(N, "w");
+ fprintf(F, ">utg%04d\n%s\n", record->_Bid, Bseq + Bbgn);
+ fclose(F);
- } else if (strcmp(argv[arg], "-V") == 0) {
- verbosity++;
+ Aseq[Aend] = ach;
+ Bseq[Bend] = bch;
+#endif
- } else if (strcmp(argv[arg], "-t") == 0) {
- omp_set_num_threads(atoi(argv[++arg]));
+ result = edlibAlign(Bseq, Blen, // The 'query' (unitig)
+ Aseq + Abgn, Aend-Abgn, // The 'target' (contig)
+ edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
- } else {
- fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
- err++;
+ // Got an alignment? Process and report, and maybe try again.
+
+ if (result.numLocations > 0) {
+ int32 nAbgn = Abgn + result.startLocations[0];
+ int32 nAend = Abgn + result.endLocations[0] + 1; // 0-based to space-based
+ char *cigar = NULL;
+
+ editDist = result.editDistance;
+ alignLen = ((nAend - nAbgn) + (Blen) + (editDist)) / 2;
+ alignScore = 1000 - (int32)(1000.0 * editDist / alignLen);
+
+ // If there's an alignment, we can get a cigar string and better alignment length.
+ if ((result.alignment != NULL) && (result.alignmentLength > 0)) {
+ cigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_STANDARD);
+ alignLen = result.alignmentLength;
}
- arg++;
+ edlibFreeAlignResult(result);
+
+ if (beVerbose)
+ fprintf(stderr, " - POSITION from %9d-%-9d to %9d-%-9d score %5d/%9d = %4d%s%s\n",
+ Abgn, Aend,
+ nAbgn, nAend,
+ editDist, alignLen, alignScore,
+ (cigar != NULL) ? " align " : "",
+ (cigar != NULL) ? cigar : "");
+
+ delete [] cigar;
+
+ // If it's a full alignment -- if the A region was big enough to have unaligned bases -- then
+ // we're done. Update the result and get out of here.
+
+ if (((Abgn < nAbgn) || (Abgn == 0)) &&
+ ((nAend < Aend) || (Aend == Alen))) {
+
+ Abgn = nAbgn;
+ Aend = nAend;
+ score = alignScore;
+
+ return(true);
+ }
+
+ // Otherwise, we ran out of A sequence to align to before we ran out of stuff to align. Extend
+ // the A region and try again.
+
+ if (Abgn == nAbgn)
+ Abgn = max(Abgn - step, 0);
+
+ if (Aend == nAend)
+ Aend = min(Aend + step, Alen);
+
+ goto tryAgain;
}
- if (tigName == NULL)
- err++;
- if (inGFA == NULL)
- err++;
- if (otGFA == NULL)
- err++;
+ // Didn't get a good alignment.
- if (err) {
- fprintf(stderr, "usage: %s [opts]\n", argv[0]);
- fprintf(stderr, " Validates a GFA by generating alignments.\n");
- fprintf(stderr, " Optionally writes new GFA with updated CIGAR string (NOT IMPLEMENTED).\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " -G g Load reads from gkStore 'g'\n");
- fprintf(stderr, " -T t v Load tigs from tgStore 't', version 'v'.\n");
- fprintf(stderr, " Consensus sequence must exist (usually in v=2)\n");
- fprintf(stderr, " -i input.gfa\n");
- fprintf(stderr, " -o output.gfa\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " -V Increase chatter\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " -t threads Use 'threads' computational threads.\n");
- fprintf(stderr, "\n");
+ // We fail for one of two reasons - either not enough bases in the reference, or too high of
+ // error. Unitigs are supposed to be from the same sequence, but they might be lower coverage
+ // and therefore higher error. It's more likely they are misplaced.
- if (tigName == NULL)
- fprintf(stderr, "ERROR: no tigStore (-T) supplied.\n");
- if (inGFA == NULL)
- fprintf(stderr, "ERROR: no input GFA (-i) supplied.\n");
- if (otGFA == NULL)
- fprintf(stderr, "ERROR: no output GFA (-o) supplied.\n");
- exit(1);
+ if ((Aend - Abgn < 4 * Blen) &&
+ (maxEdit < Blen * 0.25)) {
+ if (beVerbose)
+ fprintf(stderr, " - FAILED, RELAX\n");
+
+ Abgn = max(Abgn - step, 0);
+ Aend = min(Aend + step, Alen);
+
+ maxEdit *= 1.2;
+
+ goto tryAgain;
}
- fprintf(stderr, "-- Opening tigStore '%s' version %u.\n", tigName, tigVers);
- tgStore *tigStore = new tgStore(tigName, tigVers);
+ if (beVerbose)
+ fprintf(stderr, " - ABORT, ABORT, ABORT!\n");
+
+ return(false);
+}
- // Load the GFA file.
- fprintf(stderr, "-- Reading GFA '%s'.\n", inGFA);
- gfaFile *gfa = new gfaFile(inGFA);
- // Load all consensus sequences
+bool
+checkRecord(bedRecord *record,
+ sequences &ctgs,
+ sequences &utgs,
+ bool beVerbose,
+ bool UNUSED(doPlot)) {
- uint32 b = 0;
- uint32 e = tigStore->numTigs();
+ char *Aseq = ctgs[record->_Aid].seq;
+ char *Bseq = utgs[record->_Bid].seq, *Brev = NULL;
- sequence *seqs = new sequence [e+1];
+ int32 Abgn = record->_bgn;
+ int32 Aend = record->_end;
- fprintf(stderr, "-- Loading tigs %u to %u.\n", b, e);
+ int32 Alen = ctgs[record->_Aid].len;
+ int32 Blen = utgs[record->_Bid].len;
- for (uint32 ti=b; ti < e; ti++) {
- tgTig *tig = tigStore->loadTig(ti);
+ bool success = true;
+ int32 alignScore = 0;
- if (tig == NULL)
- continue;
+ if (record->_Bfwd == false)
+ Bseq = Brev = reverseComplementCopy(Bseq, Blen);
- seqs[ti].set(tig);
+ // If Bseq (the unitig) is small, just align the full thing.
- tigStore->unloadTig(ti);
+ if (Blen < 50000) {
+ success &= checkRecord_align("ALL",
+ record->_Aname, Aseq, Alen, Abgn, Aend,
+ record->_Bname, Bseq, Blen,
+ alignScore,
+ beVerbose);
}
- // Set GFA lengths based on the sequences we loaded.
+ // Otherwise, we need to try to align only the ends of the unitig.
+ //
+ // -----------------------[AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA]------------------
+ // BBBBB..............BBBBB
+ //
- fprintf(stderr, "-- Resetting sequence lengths.\n", inGFA);
+ else {
+ int32 AbgnL = Abgn, AendL = Abgn + 50000;
+ int32 AbgnR = Aend - 50000, AendR = Aend;
+
+ char *BseqL = Bseq;
+ char *BseqR = Bseq + Blen - 50000;
+
+#if 0
+ success &= checkRecord_align("ALL",
+ record->_Aname, Aseq, Alen, Abgn, Aend,
+ record->_Bname, Bseq, Blen,
+ alignScore,
+ beVerbose);
+#endif
+
+ success &= checkRecord_align("LEFT",
+ record->_Aname, Aseq, Alen, AbgnL, AendL,
+ record->_Bname, BseqL, 50000,
+ alignScore,
+ beVerbose);
+
+ success &= checkRecord_align("RIGHT",
+ record->_Aname, Aseq, Alen, AbgnR, AendR,
+ record->_Bname, BseqR, 50000,
+ alignScore,
+ beVerbose);
+
+ Abgn = AbgnL;
+ Aend = AendR;
+ }
- for (uint32 ii=0; ii<gfa->_sequences.size(); ii++)
- gfa->_sequences[ii]->_length = seqs[gfa->_sequences[ii]->_id].len;
+ delete [] Brev;
+
+ // If successful, save the coordinates. Because we're usually not aligning the whole
+ // unitig to the contig, we can't save the score.
+
+ if (success) {
+ record->_bgn = Abgn;
+ record->_end = Aend;
+ record->_score = 0; //alignScore;
+ }
+
+ return(success);
+}
+
+
+
+//
+// Try to find an alignment for each link in the GFA file. If found, output a new link
+// with correct CIGAR string. If not found, discard the link.
+//
+void
+processGFA(char *tigName,
+ uint32 tigVers,
+ char *inGFA,
+ char *otGFA,
+ uint32 verbosity) {
+
+ // Load the GFA file.
+
+ fprintf(stderr, "-- Reading GFA '%s'.\n", inGFA);
+
+ gfaFile *gfa = new gfaFile(inGFA);
+
+ fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
- // Done with the stores.
+ sequences *seqsp = new sequences(tigName, tigVers);
+ sequences &seqs = *seqsp;
- fprintf(stderr, "-- Closing tigStore '%s'.\n", tigName);
+ // Set GFA lengths based on the sequences we loaded.
- delete tigStore;
+ fprintf(stderr, "-- Resetting sequence lengths.\n");
+
+ for (uint32 ii=0; ii<gfa->_sequences.size(); ii++)
+ gfa->_sequences[ii]->_length = seqs[gfa->_sequences[ii]->_id].len;
// Align!
@@ -453,12 +634,306 @@ main (int argc, char **argv) {
fprintf(stderr, "-- Cleaning up.\n");
- delete [] seqs;
- delete gfa;
+ delete seqsp;
+ delete gfa;
fprintf(stderr, "-- Aligned %6u ciruclar tigs, failed %6u\n", passCircular, failCircular);
fprintf(stderr, "-- Aligned %6u linear tigs, failed %6u\n", passNormal, failNormal);
- fprintf(stderr, "-- Bye.\n");
+}
+
+
+
+//
+// Find an alignment between the unitig (the feature) and the contig (the 'chromosome').
+// Output updated coordiates.
+//
+void
+processBED(char *tigName,
+ uint32 tigVers,
+ char *seqName,
+ uint32 seqVers,
+ char *inBED,
+ char *otBED,
+ uint32 verbosity) {
+
+ // Load the BED file.
+
+ fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
+
+ bedFile *bed = new bedFile(inBED);
+
+ fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
+
+ sequences *utgsp = new sequences(tigName, tigVers);
+ sequences &utgs = *utgsp;
+
+ fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", seqName, seqVers);
+
+ sequences *ctgsp = new sequences(seqName, seqVers);
+ sequences &ctgs = *ctgsp;
+
+ // Align!
+
+ uint32 pass = 0;
+ uint32 fail = 0;
+
+ uint32 iiLimit = bed->_records.size();
+ uint32 iiNumThreads = omp_get_max_threads();
+ uint32 iiBlockSize = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
+
+ fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
+
+#pragma omp parallel for schedule(dynamic, iiBlockSize)
+ for (uint32 ii=0; ii<iiLimit; ii++) {
+ bedRecord *record = bed->_records[ii];
+
+ if (checkRecord(record, ctgs, utgs, (verbosity > 0), false)) {
+ pass++;
+ } else {
+ delete bed->_records[ii];
+ bed->_records[ii] = NULL;
+ fail++;
+ }
+ }
+
+ fprintf(stderr, "-- Writing BED '%s'.\n", otBED);
+
+ bed->saveFile(otBED);
+
+ fprintf(stderr, "-- Cleaning up.\n");
+
+ delete utgsp;
+ delete ctgsp;
+ delete bed;
+
+ fprintf(stderr, "-- Aligned %6u unitigs to contigs, failed %6u\n", pass, fail);
+}
+
+
+
+//
+// Infer a graph from the positions of unitigs (features) in contigs (chromosomes). Generate a GFA
+// input and toss that up to processGFA.
+//
+void
+processBEDtoGFA(char *tigName,
+ uint32 tigVers,
+ char *inBED,
+ char *otGFA,
+ uint32 verbosity) {
+
+ int32 minOlap = 100;
+
+ // We only really need the sequence lengths here, but eventually, we'll want to generate
+ // alignments for all the overlaps, and so we'll need the sequences too.
+
+ fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
+
+ sequences *seqsp = new sequences(tigName, tigVers);
+ sequences &seqs = *seqsp;
+
+ // Load the BED file and allocate an output GFA.
+
+ fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
+
+ bedFile *bed = new bedFile(inBED);
+ gfaFile *gfa = new gfaFile("H\tVN:Z:bogart/edges");
+
+ // Iterate over sequences, looking for overlaps in contigs. Stupid, O(n^2) but seems fast enough.
+
+ uint32 iiLimit = bed->_records.size();
+ uint32 iiNumThreads = omp_get_max_threads();
+ uint32 iiBlockSize = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
+
+ fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
+
+#pragma omp parallel for schedule(dynamic, iiBlockSize)
+ for (uint64 ii=0; ii<bed->_records.size(); ii++) {
+ for (uint64 jj=ii+1; jj<bed->_records.size(); jj++) {
+
+ if (bed->_records[ii]->_Aid != bed->_records[jj]->_Aid) // Different contigs?
+ continue; // No overlap.
+
+ if ((bed->_records[ii]->_end < bed->_records[jj]->_bgn + minOlap) || // No (thick) intersection?
+ (bed->_records[jj]->_end < bed->_records[ii]->_bgn + minOlap)) //
+ continue; // No overlap.
+
+ // Overlap!
+
+ //fprintf(stderr, "OVERLAP %s %d-%d - %s %d-%d\n",
+ // bed->_records[ii]->_Bname, bed->_records[ii]->_bgn, bed->_records[ii]->_end,
+ // bed->_records[jj]->_Bname, bed->_records[jj]->_bgn, bed->_records[jj]->_end);
+
+ int32 olapLen = 0;
+
+ if (bed->_records[ii]->_bgn < bed->_records[jj]->_end)
+ olapLen = bed->_records[ii]->_end - bed->_records[jj]->_bgn;
+
+ if (bed->_records[jj]->_bgn < bed->_records[ii]->_end)
+ olapLen = bed->_records[jj]->_end - bed->_records[ii]->_bgn;
+
+ assert(olapLen > 0);
+
+ char cigar[81];
+
+ sprintf(cigar, "%dM", olapLen);
+
+ gfaLink *link = new gfaLink(bed->_records[ii]->_Bname, bed->_records[ii]->_Bid, true,
+ bed->_records[jj]->_Bname, bed->_records[jj]->_Bid, true,
+ cigar);
+
+ bool pN = checkLink(link, seqs, (verbosity > 0), false);
+
+#pragma omp critical
+ {
+      if (pN)
+        gfa->_links.push_back(link);
+      else
+        delete link;   //  Discard links that failed validation, per the intent stated above.
+
+ // Remember sequences we've hit.
+
+ seqs.used[bed->_records[ii]->_Bid]++;
+ seqs.used[bed->_records[jj]->_Bid]++;
+ }
+ }
+ }
+
+ // Add sequences. We could have done this as we're running through making edges, but we then
+ // need to figure out if we've seen a sequence already.
+
+ char seqName[80];
+
+ for (uint32 ii=0; ii<seqs.e; ii++)
+ if (seqs.used[ii] > 0) {
+ sprintf(seqName, "utg%08u", ii);
+ gfa->_sequences.push_back(new gfaSequence(seqName, ii, seqs[ii].len));
+ }
+
+ // Write the file, cleanup, done!
+
+ gfa->saveFile(otGFA);
+
+ delete gfa;
+ delete bed;
+}
+
+
+
+int
+main (int argc, char **argv) {
+ char *tigName = NULL; // For GFA and BED, the source of the tigs
+ uint32 tigVers = UINT32_MAX;
+
+ char *seqName = NULL; // For BED, the source of the 'chromosomes'
+ uint32 seqVers = UINT32_MAX; // The -C option (either chromosome or container)
+
+ char *inGraph = NULL;
+ char *otGraph = NULL;
+
+ uint32 graphType = IS_GFA;
+
+ uint32 verbosity = 0;
+
+ argc = AS_configure(argc, argv);
+
+ int arg=1;
+ int err=0;
+ while (arg < argc) {
+ if (strcmp(argv[arg], "-T") == 0) {
+ tigName = argv[++arg];
+ tigVers = atoi(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-C") == 0) {
+ seqName = argv[++arg];
+ seqVers = atoi(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-gfa") == 0) {
+ graphType = IS_GFA;
+ } else if (strcmp(argv[arg], "-bed") == 0) {
+ graphType = IS_BED;
+
+ } else if (strcmp(argv[arg], "-i") == 0) {
+ inGraph = argv[++arg];
+ } else if (strcmp(argv[arg], "-o") == 0) {
+ otGraph = argv[++arg];
+
+ } else if (strcmp(argv[arg], "-V") == 0) {
+ verbosity++;
+
+ } else if (strcmp(argv[arg], "-t") == 0) {
+ omp_set_num_threads(atoi(argv[++arg]));
+
+ } else {
+ fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
+ err++;
+ }
+
+ arg++;
+ }
+
+ if (tigName == NULL)
+ err++;
+ if (inGraph == NULL)
+ err++;
+ if (otGraph == NULL)
+ err++;
+
+ if ((tigName) && (tigVers == 0))
+ err++;
+ if ((seqName) && (seqVers == 0))
+ err++;
+
+ if (err) {
+ fprintf(stderr, "usage: %s [opts]\n", argv[0]);
+ fprintf(stderr, " Validates a GFA by generating alignments.\n");
+ fprintf(stderr, " Optionally writes new GFA with updated CIGAR string (NOT IMPLEMENTED).\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -G g Load reads from gkStore 'g'.\n");
+ fprintf(stderr, " -T t v Load tigs from tgStore 't', version 'v'.\n");
+ fprintf(stderr, " -C t v For BED format, the source of the 'chromosomes'. Similar to -T.\n");
+ fprintf(stderr, " Consensus sequence must exist for -T and -C (usually in v=2)\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -i input Input graph.\n");
+ fprintf(stderr, " -o output Output graph.\n");
+    fprintf(stderr, "               Graphs are either GFA (v1) or BED format.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -gfa The input and output graphs are in GFA (v1) format.\n");
+ fprintf(stderr, " -bed The input graph is in BED format. If -C is supplied, the\n");
+ fprintf(stderr, " output will also be BED, and will have updated positions.\n");
+ fprintf(stderr, " If -C is not supplied, the output will be GFA (v1) of the\n");
+ fprintf(stderr, " overlaps inferred from the BED positions.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -V Increase chatter.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -t threads Use 'threads' computational threads.\n");
+ fprintf(stderr, "\n");
+
+ if (tigName == NULL)
+ fprintf(stderr, "ERROR: no tigStore (-T) supplied.\n");
+ if (inGraph == NULL)
+ fprintf(stderr, "ERROR: no input GFA (-i) supplied.\n");
+ if (otGraph == NULL)
+ fprintf(stderr, "ERROR: no output GFA (-o) supplied.\n");
+
+ if ((tigName) && (tigVers == 0))
+ fprintf(stderr, "ERROR: invalid tigStore version (-T) supplied.\n");
+ if ((seqName) && (seqVers == 0))
+ fprintf(stderr, "ERROR: invalid tigStore version (-C) supplied.\n");
+
+ exit(1);
+ }
+
+ if (graphType == IS_GFA)
+ processGFA(tigName, tigVers, inGraph, otGraph, verbosity);
+
+ if ((graphType == IS_BED) && (seqName != NULL))
+ processBED(tigName, tigVers, seqName, seqVers, inGraph, otGraph, verbosity);
+
+ if ((graphType == IS_BED) && (seqName == NULL))
+ processBEDtoGFA(tigName, tigVers, inGraph, otGraph, verbosity);
+
+ fprintf(stderr, "Bye.\n");
exit(0);
}
diff --git a/src/gfa/bed.C b/src/gfa/bed.C
new file mode 100644
index 0000000..06a18a4
--- /dev/null
+++ b/src/gfa/bed.C
@@ -0,0 +1,173 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2017-MAY-12
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_global.H"
+#include "AS_UTL_fileIO.H"
+
+#include "bed.H"
+
+
+
+// Search for canu-specific names, and convert to tigID's.
+// Allow either 'tig', 'utg' or 'ctg' prefixes; anything else
+// returns the UINT32_MAX sentinel.
+//
+// Kept as a single combined condition to stay consistent with the
+// identically-named function in gfa.C, which this patch refactors
+// into this same form.
+static
+uint32
+nameToCanuID(char *name) {
+  uint32  id = UINT32_MAX;
+
+  if (((name[0] == 't') && (name[1] == 'i') && (name[2] == 'g')) ||
+      ((name[0] == 'u') && (name[1] == 't') && (name[2] == 'g')) ||
+      ((name[0] == 'c') && (name[1] == 't') && (name[2] == 'g')))
+    id = strtoll(name + 3, NULL, 10);
+
+  return(id);
+}
+
+
+
+// Construct an empty record.  Names are NULL and IDs are the
+// UINT32_MAX sentinel until load() fills them in.
+//
+// NOTE(review): _bgn is declared int32 in bed.H, so UINT32_MAX
+// wraps to -1 here; presumably intentional as an "unset" marker,
+// but confirm against users of default-constructed records.
+bedRecord::bedRecord() {
+  _Aname = NULL;
+  _Aid = UINT32_MAX;
+
+  _bgn = UINT32_MAX;
+  _end = 0;
+
+  _Bname = NULL;
+  _Bid = UINT32_MAX;
+
+  _score = 0;
+  _Bfwd = false;
+}
+
+
+// Construct a record directly from one BED-format line.
+bedRecord::bedRecord(char *inLine) {
+  load(inLine);
+}
+
+
+// Release the name strings allocated by load().
+bedRecord::~bedRecord() {
+  delete [] _Aname;
+  delete [] _Bname;
+}
+
+
+// Parse one six-column BED line into this record:
+//   W[0] chrom (A name)   W(1) start   W(2) end
+//   W[3] feature (B name) W(4) score   W[5] strand ('+' or '-')
+//
+// NOTE(review): calling load() on a record that was already loaded
+// leaks the previous _Aname/_Bname allocations; as written it is
+// only invoked from the constructor.
+void
+bedRecord::load(char *inLine) {
+  splitToWords W(inLine);
+
+  _Aname = new char [strlen(W[0]) + 1];
+  _Aid = UINT32_MAX;
+
+  _bgn = W(1);
+  _end = W(2);
+
+  _Bname = new char [strlen(W[3]) + 1];
+  _Bid = UINT32_MAX;
+
+  _score = W(4);
+  _Bfwd = W[5][0] == '+';   // any other character means reverse strand
+
+  strcpy(_Aname, W[0]);
+  strcpy(_Bname, W[3]);
+
+  _Aid = nameToCanuID(_Aname);  // Search for canu-specific names, and convert to tigID's.
+  _Bid = nameToCanuID(_Bname);
+}
+
+
+// Emit this record as one tab-separated six-column BED line to
+// the (already open) output file.
+void
+bedRecord::save(FILE *outFile) {
+  fprintf(outFile, "%s\t%d\t%d\t%s\t%u\t%c\n",
+          _Aname, _bgn, _end, _Bname, _score, (_Bfwd == true) ? '+' : '-');
+}
+
+
+
+// Construct by loading all records from the named BED file.
+// loadFile() exits on failure, so a constructed bedFile is valid.
+bedFile::bedFile(char *inFile) {
+  loadFile(inFile);
+}
+
+
+// Delete every record; the vector itself cleans up its storage.
+bedFile::~bedFile() {
+  for (uint32 ii=0; ii<_records.size(); ii++)
+    delete _records[ii];
+}
+
+
+// Load every line of 'inFile' as a bedRecord, appending to _records.
+// Exits with a message if the file cannot be opened; always returns
+// true otherwise.
+bool
+bedFile::loadFile(char *inFile) {
+  FILE   *F    = NULL;
+  char   *L    = NULL;
+  uint32  Llen = 0;
+  uint32  Lmax = 0;
+
+  errno = 0;
+  F = fopen(inFile, "r");
+  if (errno)
+    fprintf(stderr, "Failed to open '%s' for reading: %s\n", inFile, strerror(errno)), exit(1);
+
+  while (AS_UTL_readLine(L, Llen, Lmax, F)) {
+    _records.push_back(new bedRecord(L));
+  }
+
+  fclose(F);
+
+  delete [] L;
+
+  //  size() is a size_t; F_S64 expects a signed 64-bit value, so cast
+  //  explicitly and use the unsigned format.
+  fprintf(stderr, "bed: Loaded " F_U64 " records.\n", (uint64)_records.size());
+
+  return(true);
+}
+
+
+
+
+// Write every non-NULL record to 'outFile' in BED format.  Exits
+// with a message if the file cannot be opened; always returns true
+// otherwise.
+bool
+bedFile::saveFile(char *outFile) {
+  FILE *F = NULL;
+
+  errno = 0;
+  F = fopen(outFile, "w");
+  if (errno)
+    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outFile, strerror(errno)), exit(1);
+
+  for (uint32 ii=0; ii<_records.size(); ii++)
+    if (_records[ii])
+      _records[ii]->save(F);
+
+  fclose(F);
+
+  return(true);
+}
+
diff --git a/src/gfa/gfa.H b/src/gfa/bed.H
similarity index 58%
copy from src/gfa/gfa.H
copy to src/gfa/bed.H
index a54c518..2f94790 100644
--- a/src/gfa/gfa.H
+++ b/src/gfa/bed.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2017-APR-04
+ * Brian P. Walenz beginning on 2017-MAY-12
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,81 +23,51 @@
* full conditions and disclaimers for each license.
*/
-#ifndef AS_UTL_GFA_H
-#define AS_UTL_GFA_H
+#ifndef AS_UTL_BED_H
+#define AS_UTL_BED_H
#include "AS_global.H"
#include "splitToWords.H"
-// Features assumed to hold only the length, and we don't use it.
-
-class gfaSequence {
+class bedRecord {
public:
- gfaSequence();
- gfaSequence(char *inLine);
- ~gfaSequence();
+ bedRecord();
+ bedRecord(char *inLine);
+ ~bedRecord();
void load(char *inLine);
void save(FILE *outFile);
public:
- char *_name;
- uint32 _id;
- char *_sequence;
- char *_features;
-
- uint32 _length;
-};
-
-
-
-
-class gfaLink {
-public:
- gfaLink();
- gfaLink(char *inLine);
- ~gfaLink();
-
- void load(char *inLine);
- void save(FILE *outFile);
-
- void alignmentLength(int32 &queryLen, int32 &refceLen, int32 &alignLen);
-
-public:
- char *_Aname;
+ char *_Aname; // The 'chromosome'
uint32 _Aid; // Canu specific.
- bool _Afwd;
- char *_Bname;
- uint32 _Bid; // Canu specific.
- bool _Bfwd;
+ int32 _bgn;
+ int32 _end;
- char *_cigar;
+ char *_Bname; // The 'feature'
+ uint32 _Bid; // Canu specific.
- char *_features;
+ uint32 _score;
+ bool _Bfwd;
};
-
-
-class gfaFile {
+class bedFile {
public:
- gfaFile(char *inFile);
- ~gfaFile();
+ bedFile(char *inFile);
+ ~bedFile();
bool loadFile(char *inFile);
bool saveFile(char *outFile);
public:
- char *_header;
-
- vector<gfaSequence *> _sequences;
- vector<gfaLink *> _links;
+ vector<bedRecord *> _records;
};
-#endif // AS_UTL_GFA_H
+#endif // AS_UTL_BED_H
diff --git a/src/gfa/gfa.C b/src/gfa/gfa.C
index ebef08a..a50fcc5 100644
--- a/src/gfa/gfa.C
+++ b/src/gfa/gfa.C
@@ -55,14 +55,15 @@ findGFAtokenI(char *features, char *token, TT &value) {
// Search for canu-specific names, and convert to tigID's.
+// Allow either 'tig', 'utg' or 'ctg'.
static
uint32
nameToCanuID(char *name) {
uint32 id = UINT32_MAX;
- if ((name[0] == 't') &&
- (name[1] == 'i') &&
- (name[2] == 'g'))
+ if (((name[0] == 't') && (name[1] == 'i') && (name[2] == 'g')) ||
+ ((name[0] == 'u') && (name[1] == 't') && (name[2] == 'g')) ||
+ ((name[0] == 'c') && (name[1] == 't') && (name[2] == 'g')))
id = strtoll(name + 3, NULL, 10);
return(id);
@@ -72,6 +73,7 @@ nameToCanuID(char *name) {
gfaSequence::gfaSequence() {
_name = NULL;
+ _id = UINT32_MAX;
_sequence = NULL;
_features = NULL;
_length = 0;
@@ -83,6 +85,17 @@ gfaSequence::gfaSequence(char *inLine) {
}
+gfaSequence::gfaSequence(char *name, uint32 id, uint32 len) {
+ _name = new char [strlen(name) + 1];
+ _id = id;
+ _sequence = NULL;
+ _features = NULL;
+ _length = len;
+
+ strcpy(_name, name);
+}
+
+
gfaSequence::~gfaSequence() {
delete [] _name;
delete [] _sequence;
@@ -117,12 +130,13 @@ gfaSequence::load(char *inLine) {
void
gfaSequence::save(FILE *outFile) {
- fprintf(outFile, "S\t%s\t%s\tLN:i:%u\n", _name, _sequence, _length);
+ fprintf(outFile, "S\t%s\t%s\tLN:i:%u\n",
+ _name,
+ _sequence ? _sequence : "*",
+ _length);
}
-
-
gfaLink::gfaLink() {
_Aname = NULL;
_Aid = UINT32_MAX;
@@ -142,6 +156,28 @@ gfaLink::gfaLink(char *inLine) {
}
+gfaLink::gfaLink(char *Aname, uint32 Aid, bool Afwd,
+ char *Bname, uint32 Bid, bool Bfwd, char *cigar) {
+ _Aname = new char [strlen(Aname) + 1];
+ _Aid = Aid;
+ _Afwd = Afwd;
+
+ _Bname = new char [strlen(Bname) + 1];
+ _Bid = Bid;
+ _Bfwd = Bfwd;
+
+ _cigar = new char [strlen(cigar) + 1];
+ _features = NULL;
+
+ strcpy(_Aname, Aname);
+ strcpy(_Bname, Bname);
+ strcpy(_cigar, cigar);
+
+ _Aid = nameToCanuID(_Aname); // Search for canu-specific names, and convert to tigID's.
+ _Bid = nameToCanuID(_Bname);
+}
+
+
gfaLink::~gfaLink() {
delete [] _Aname;
delete [] _Bname;
@@ -251,10 +287,22 @@ gfaLink::alignmentLength(int32 &queryLen, int32 &refceLen, int32 &alignLen) {
+gfaFile::gfaFile() {
+ _header = NULL;
+}
+
+
gfaFile::gfaFile(char *inFile) {
_header = NULL;
- loadFile(inFile);
+ if ((inFile[0] == 'H') && (inFile[1] == '\t')) {
+ _header = new char [strlen(inFile) + 1];
+ strcpy(_header, inFile);
+ }
+
+ else {
+ loadFile(inFile);
+ }
}
diff --git a/src/gfa/gfa.H b/src/gfa/gfa.H
index a54c518..5b0887b 100644
--- a/src/gfa/gfa.H
+++ b/src/gfa/gfa.H
@@ -37,6 +37,7 @@ class gfaSequence {
public:
gfaSequence();
gfaSequence(char *inLine);
+ gfaSequence(char *name, uint32 id, uint32 len);
~gfaSequence();
void load(char *inLine);
@@ -58,6 +59,8 @@ class gfaLink {
public:
gfaLink();
gfaLink(char *inLine);
+ gfaLink(char *Aname, uint32 Aid, bool Afwd,
+ char *Bname, uint32 Bid, bool Bfwd, char *cigar);
~gfaLink();
void load(char *inLine);
@@ -84,6 +87,7 @@ public:
class gfaFile {
public:
+ gfaFile();
gfaFile(char *inFile);
~gfaFile();
diff --git a/src/main.mk b/src/main.mk
index bf47b08..8f7218e 100644
--- a/src/main.mk
+++ b/src/main.mk
@@ -104,11 +104,10 @@ SOURCES := AS_global.C \
utgcns/libcns/abColumn.C \
utgcns/libcns/abMultiAlign.C \
utgcns/libcns/unitigConsensus.C \
- utgcns/libpbutgcns/Alignment.C \
utgcns/libpbutgcns/AlnGraphBoost.C \
- utgcns/libNDFalcon/dw.C \
\
gfa/gfa.C \
+ gfa/bed.C \
\
meryl/libkmer/existDB-create-from-fasta.C \
meryl/libkmer/existDB-create-from-meryl.C \
diff --git a/src/meryl/libmeryl.C b/src/meryl/libmeryl.C
index 94147e2..6f32f32 100644
--- a/src/meryl/libmeryl.C
+++ b/src/meryl/libmeryl.C
@@ -61,7 +61,9 @@ static char *PmagicV = "merylStreamPv04\n";
static char *PmagicX = "merylStreamPvXX\n";
merylStreamReader::merylStreamReader(const char *fn_, uint32 ms_) {
- char inpath[FILENAME_MAX];
+ char idxname[FILENAME_MAX];
+ char datname[FILENAME_MAX];
+ char posname[FILENAME_MAX];
if (fn_ == 0L) {
fprintf(stderr, "ERROR - no counted database file specified.\n");
@@ -74,17 +76,29 @@ merylStreamReader::merylStreamReader(const char *fn_, uint32 ms_) {
// Open the files
//
- snprintf(inpath, FILENAME_MAX, "%s.mcidx", _filename);
- _IDX = new bitPackedFile(inpath);
+ snprintf(idxname, FILENAME_MAX, "%s.mcidx", _filename);
+ snprintf(datname, FILENAME_MAX, "%s.mcdat", _filename);
+ snprintf(posname, FILENAME_MAX, "%s.mcpos", _filename);
- snprintf(inpath, FILENAME_MAX, "%s.mcdat", _filename);
- _DAT = new bitPackedFile(inpath);
+ // bitPackedFile will create a file if it doesn't exist, so we need to fail ahead
+ // of time.
+
+ bool idxexist = AS_UTL_fileExists(idxname);
+ bool datexist = AS_UTL_fileExists(datname);
+ bool posexist = AS_UTL_fileExists(posname);
+
+ if ((idxexist == false) ||
+ (datexist == false)) {
+ fprintf(stderr, "merylStreamReader()-- ERROR: Didn't find data files for reading mer data.\n");
+ fprintf(stderr, "merylStreamReader()-- ERROR: Expecting to find '%s' and\n", idxname);
+ fprintf(stderr, "merylStreamReader()-- ERROR: '%s'\n", datname);
+ exit(1);
+ }
+
+ _IDX = new bitPackedFile(idxname);
+ _DAT = new bitPackedFile(datname);
+ _POS = (posexist) ? new bitPackedFile(posname) : 0L;
- snprintf(inpath, FILENAME_MAX, "%s.mcpos", _filename);
- if (AS_UTL_fileExists(inpath))
- _POS = new bitPackedFile(inpath);
- else
- _POS = 0L;
// Verify that they are what they should be, and read in the header
//
diff --git a/src/minimap/mmapConvert.C b/src/minimap/mmapConvert.C
index 1dab760..42db066 100644
--- a/src/minimap/mmapConvert.C
+++ b/src/minimap/mmapConvert.C
@@ -35,11 +35,13 @@
using namespace std;
-
int
main(int argc, char **argv) {
char *outName = NULL;
char *gkpName = NULL;
+ bool partialOverlaps = false;
+ uint32 minOverlapLength = 0;
+ uint32 tolerance = 0;
vector<char *> files;
@@ -52,6 +54,15 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-G") == 0) {
gkpName = argv[++arg];
+ } else if (strcmp(argv[arg], "-tolerance") == 0) {
+ tolerance = atoi(argv[++arg]);;
+
+ } else if (strcmp(argv[arg], "-partial") == 0) {
+ partialOverlaps = true;
+
+ } else if (strcmp(argv[arg], "-len") == 0) {
+ minOverlapLength = atoi(argv[++arg]);
+
} else if (AS_UTL_fileExists(argv[arg])) {
files.push_back(argv[arg]);
@@ -79,7 +90,7 @@ main(int argc, char **argv) {
exit(1);
}
- char *ovStr = new char [1024];
+ char *ovStr = new char [1024*1024];
gkStore *gkpStore = gkStore::gkStore_open(gkpName);
ovOverlap ov(gkpStore);
@@ -95,7 +106,7 @@ main(int argc, char **argv) {
// aiid alen bgn end bori biid blen bgn end #match minimizers alnlen ? cm:i:errori
//
- while (fgets(ovStr, 1024, in->file()) != NULL) {
+ while (fgets(ovStr, 1024*1024, in->file()) != NULL) {
splitToWords W(ovStr);
ov.a_iid = W(0);
@@ -104,10 +115,6 @@ main(int argc, char **argv) {
if (ov.a_iid == ov.b_iid)
continue;
- ov.dat.ovl.forUTG = true;
- ov.dat.ovl.forOBT = true;
- ov.dat.ovl.forDUP = true;
-
ov.dat.ovl.ahg5 = W(2);
ov.dat.ovl.ahg3 = W(1) - W(3);
@@ -139,6 +146,50 @@ main(int argc, char **argv) {
exit(1);
}
+ if (!ov.overlapIsDovetail() && partialOverlaps == false) {
+ if (alen <= blen && ov.dat.ovl.ahg5 >= 0 && ov.dat.ovl.ahg3 >= 0 && ov.dat.ovl.bhg5 >= ov.dat.ovl.ahg5 && ov.dat.ovl.bhg3 >= ov.dat.ovl.ahg3 && ((ov.dat.ovl.ahg5 + ov.dat.ovl.ahg3)) < tolerance) {
+ ov.dat.ovl.bhg5 = max(0, ov.dat.ovl.bhg5 - ov.dat.ovl.ahg5); ov.dat.ovl.ahg5 = 0;
+ ov.dat.ovl.bhg3 = max(0, ov.dat.ovl.bhg3 - ov.dat.ovl.ahg3); ov.dat.ovl.ahg3 = 0;
+ }
+ // second is b contained (both b hangs can be extended)
+ //
+ else if (alen >= blen && ov.dat.ovl.bhg5 >= 0 && ov.dat.ovl.bhg3 >= 0 && ov.dat.ovl.ahg5 >= ov.dat.ovl.bhg5 && ov.dat.ovl.ahg3 >= ov.dat.ovl.bhg3 && ((ov.dat.ovl.bhg5 + ov.dat.ovl.bhg3)) < tolerance) {
+ ov.dat.ovl.ahg5 = max(0, ov.dat.ovl.ahg5 - ov.dat.ovl.bhg5); ov.dat.ovl.bhg5 = 0;
+ ov.dat.ovl.ahg3 = max(0, ov.dat.ovl.ahg3 - ov.dat.ovl.bhg3); ov.dat.ovl.bhg3 = 0;
+ }
+ // third is 5' dovetal ---------->
+ // ---------->
+ // or
+ // <---------
+ // bhg5 here is always first overhang on b read
+ //
+ else if (ov.dat.ovl.ahg3 <= ov.dat.ovl.bhg3 && (ov.dat.ovl.ahg3 >= 0 && ((double)(ov.dat.ovl.ahg3)) < tolerance) &&
+ (ov.dat.ovl.bhg5 >= 0 && ((double)(ov.dat.ovl.bhg5)) < tolerance)) {
+ ov.dat.ovl.ahg5 = max(0, ov.dat.ovl.ahg5 - ov.dat.ovl.bhg5); ov.dat.ovl.bhg5 = 0;
+ ov.dat.ovl.bhg3 = max(0, ov.dat.ovl.bhg3 - ov.dat.ovl.ahg3); ov.dat.ovl.ahg3 = 0;
+ }
+ //
+ // fourth is 3' dovetail ---------->
+ // ---------->
+ // or
+ // <----------
+ // bhg5 is always first overhang on b read
+ else if (ov.dat.ovl.ahg5 <= ov.dat.ovl.bhg5 && (ov.dat.ovl.ahg5 >= 0 && ((double)(ov.dat.ovl.ahg5)) < tolerance) &&
+ (ov.dat.ovl.bhg3 >= 0 && ((double)(ov.dat.ovl.bhg3)) < tolerance)) {
+ ov.dat.ovl.bhg5 = max(0, ov.dat.ovl.bhg5 - ov.dat.ovl.ahg5); ov.dat.ovl.ahg5 = 0;
+ ov.dat.ovl.ahg3 = max(0, ov.dat.ovl.ahg3 - ov.dat.ovl.bhg3); ov.dat.ovl.bhg3 = 0;
+ }
+ }
+
+ ov.dat.ovl.forUTG = (partialOverlaps == false) && (ov.overlapIsDovetail() == true);;
+ ov.dat.ovl.forOBT = partialOverlaps;
+ ov.dat.ovl.forDUP = partialOverlaps;
+
+ // check the length is big enough
+ if (ov.a_end() - ov.a_bgn() < minOverlapLength || ov.b_end() - ov.b_bgn() < minOverlapLength) {
+ continue;
+ }
+
// Overlap looks good, write it!
of->writeOverlap(&ov);
diff --git a/src/overlapBasedTrimming/splitReads-trimBad.C b/src/overlapBasedTrimming/splitReads-trimBad.C
index 011d675..9b1d47d 100644
--- a/src/overlapBasedTrimming/splitReads-trimBad.C
+++ b/src/overlapBasedTrimming/splitReads-trimBad.C
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Sergey Koren beginning on 2017-JUN-13
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -55,8 +59,11 @@ trimBadInterval(gkStore *gkp,
// Find the largest good region, save it in the output clear range. If there are no
// regions (the whole read was marked bad?), default to a bougs clear range.
- w->clrBgn = UINT32_MAX;
- w->clrEnd = UINT32_MAX;
+ // Was previously set to UINT32_MAX. However, for a read with no good region, when UINT32_MAX is returned
+ // to calling function, asserts on line 370-371 fail because UINT32_MAX is not in initial clear range
+ // set to 0 instead.
+ w->clrBgn = 0;
+ w->clrEnd = 0;
for (uint32 rr=0; rr<goodRegions.numberOfIntervals(); rr++) {
if ((w->clrEnd - w->clrBgn) < (goodRegions.hi(rr) - goodRegions.lo(rr))) {
diff --git a/src/overlapBasedTrimming/splitReads-workUnit.C b/src/overlapBasedTrimming/splitReads-workUnit.C
index c5d20da..73697dd 100644
--- a/src/overlapBasedTrimming/splitReads-workUnit.C
+++ b/src/overlapBasedTrimming/splitReads-workUnit.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2017-AUG-08
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -47,8 +51,8 @@ workUnit::addAndFilterOverlaps(gkStore *gkp,
ovOverlap *o = ovl + oo;
adjOverlap *a = adj + adjLen;
- int32 idA = o->a_iid;
- int32 idB = o->b_iid;
+ uint32 idA = o->a_iid;
+ uint32 idB = o->b_iid;
if (finClr->isDeleted(idA) ||
finClr->isDeleted(idB))
diff --git a/src/overlapBasedTrimming/splitReads.C b/src/overlapBasedTrimming/splitReads.C
index 956b2d5..f51621c 100644
--- a/src/overlapBasedTrimming/splitReads.C
+++ b/src/overlapBasedTrimming/splitReads.C
@@ -56,7 +56,7 @@ main(int argc, char **argv) {
FILE *reportFile = NULL;
FILE *subreadFile = NULL;
- bool doSubreadLogging = true;
+ bool doSubreadLogging = false;
bool doSubreadLoggingVerbose = false;
// Statistics on the trimming - the second set are from the old logging, and don't really apply anymore.
@@ -201,12 +201,13 @@ main(int argc, char **argv) {
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);
- snprintf(outputName, FILENAME_MAX, "%s.subread.log", outputPrefix);
- errno = 0;
- subreadFile = fopen(outputName, "w");
- if (errno)
- fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);
-
+ if (doSubreadLogging) {
+ snprintf(outputName, FILENAME_MAX, "%s.subread.log", outputPrefix);
+ errno = 0;
+ subreadFile = fopen(outputName, "w");
+ if (errno)
+ fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);
+ }
uint32 ovlLen = 0;
uint32 ovlMax = 64 * 1024;
diff --git a/src/overlapErrorAdjustment/findErrors.H b/src/overlapErrorAdjustment/findErrors.H
index 4f2004b..08085dc 100644
--- a/src/overlapErrorAdjustment/findErrors.H
+++ b/src/overlapErrorAdjustment/findErrors.H
@@ -186,7 +186,7 @@ public:
// It is possible, but unlikely, to have two overlaps to the same pair of reads,
// if we overlap a5'-b3' and a3'-b5'. I think.
- return(innie != that.innie);
+ return(innie < that.innie);
};
};
diff --git a/src/overlapInCore/libedlib/edlib.C b/src/overlapInCore/libedlib/edlib.C
index cc36091..48109eb 100644
--- a/src/overlapInCore/libedlib/edlib.C
+++ b/src/overlapInCore/libedlib/edlib.C
@@ -102,52 +102,55 @@ struct Block {
Block(Word P, Word M, int score) :P(P), M(M), score(score) {}
};
-static int myersCalcEditDistanceSemiGlobal(Word* Peq, int W, int maxNumBlocks,
+static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
const unsigned char* query, int queryLength,
const unsigned char* target, int targetLength,
- int alphabetLength, int k, EdlibAlignMode mode, int* bestScore,
- int** positions, int* numPositions);
+ int alphabetLength, int k, EdlibAlignMode mode,
+ int* bestScore_, int** positions_, int* numPositions_);
-static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
+static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks,
const unsigned char* query, int queryLength,
const unsigned char* target, int targetLength,
- int alphabetLength, int k, int* bestScore, int* position,
- bool findAlignment, AlignmentData** alignData, int targetStopPosition);
+ int alphabetLength, int k, int* bestScore_,
+ int* position_, bool findAlignment,
+ AlignmentData** alignData, int targetStopPosition);
static int obtainAlignment(
- const unsigned char* query, const unsigned char* rQuery, const int queryLength,
- const unsigned char* target, const unsigned char* rTarget, const int targetLength,
- const int alphabetLength, const int bestScore,
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
+ int alphabetLength, int bestScore,
unsigned char** alignment, int* alignmentLength);
static int obtainAlignmentHirschberg(
- const unsigned char* query, const unsigned char* rQuery, const int queryLength,
- const unsigned char* target, const unsigned char* rTarget, const int targetLength,
- const int alphabetLength, const int bestScore,
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
+ int alphabetLength, int bestScore,
unsigned char** alignment, int* alignmentLength);
-static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
- const int bestScore, const AlignmentData* alignData,
+static int obtainAlignmentTraceback(int queryLength, int targetLength,
+ int bestScore, const AlignmentData* alignData,
unsigned char** alignment, int* alignmentLength);
-static int transformSequences(const char* queryOriginal, const int queryLength,
- const char* targetOriginal, const int targetLength,
- unsigned char** queryTransformed, unsigned char** targetTransformed);
+static int transformSequences(const char* queryOriginal, int queryLength,
+ const char* targetOriginal, int targetLength,
+ unsigned char** queryTransformed,
+ unsigned char** targetTransformed);
static inline int ceilDiv(int x, int y);
static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
-static inline Word* buildPeq(int alphabetLength, const unsigned char* query, int queryLength);
+static inline Word* buildPeq(int alphabetLength, const unsigned char* query,
+ int queryLength);
/**
* Main edlib method.
*/
-EdlibAlignResult edlibAlign(const char* queryOriginal, const int queryLength,
- const char* targetOriginal, const int targetLength,
+EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
+ const char* const targetOriginal, const int targetLength,
const EdlibAlignConfig config) {
EdlibAlignResult result;
result.editDistance = -1;
@@ -157,6 +160,8 @@ EdlibAlignResult edlibAlign(const char* queryOriginal, const int queryLength,
result.alignmentLength = 0;
result.alphabetLength = 0;
+ assert(queryLength > 0);
+ assert(targetLength > 0);
/*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
unsigned char* query, * target;
@@ -269,8 +274,8 @@ EdlibAlignResult edlibAlign(const char* queryOriginal, const int queryLength,
}
-char* edlibAlignmentToCigar(unsigned char* alignment, int alignmentLength,
- EdlibCigarFormat cigarFormat) {
+char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength,
+ const EdlibCigarFormat cigarFormat) {
if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) {
return 0;
}
@@ -348,7 +353,8 @@ void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength
* Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
* NOTICE: free returned array with delete[]!
*/
-static inline Word* buildPeq(int alphabetLength, const unsigned char* query, int queryLength) {
+static inline Word* buildPeq(const int alphabetLength, const unsigned char* const query,
+ const int queryLength) {
int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
// table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
@@ -377,7 +383,7 @@ static inline Word* buildPeq(int alphabetLength, const unsigned char* query, int
/**
* Returns new sequence that is reverse of given sequence.
*/
-static inline unsigned char* createReverseCopy(const unsigned char* seq, int length) {
+static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
unsigned char* rSeq = new unsigned char[length];
for (int i = 0; i < length; i++) {
rSeq[i] = seq[length - i - 1];
@@ -441,15 +447,15 @@ static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
* Does ceiling division x / y.
* Note: x and y must be non-negative and x + y must not overflow.
*/
-static inline int ceilDiv(int x, int y) {
+static inline int ceilDiv(const int x, const int y) {
return x % y ? x / y + 1 : x / y;
}
-static inline int min(int x, int y) {
+static inline int min(const int x, const int y) {
return x < y ? x : y;
}
-static inline int max(int x, int y) {
+static inline int max(const int x, const int y) {
return x > y ? x : y;
}
@@ -521,13 +527,30 @@ static inline bool allBlockCellsLarger(const Block block, const int k) {
/**
+ * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
+ * @param [in] Peq Query profile.
+ * @param [in] W Size of padding in last block.
+ * TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
+ * TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] query
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] alphabetLength
+ * @param [in] k
* @param [in] mode EDLIB_MODE_HW or EDLIB_MODE_SHW
+ * @param [out] bestScore_ Edit distance.
+ * @param [out] positions_ Array of 0-indexed positions in target at which best score was found.
+ Make sure to free this array with free().
+ * @param [out] numPositions_ Number of positions in the positions_ array.
+ * @return Status.
*/
-static int myersCalcEditDistanceSemiGlobal(Word* const Peq, const int W, const int maxNumBlocks,
+static int myersCalcEditDistanceSemiGlobal(const Word* const Peq, const int W, const int maxNumBlocks,
const unsigned char* const query, const int queryLength,
const unsigned char* const target, const int targetLength,
const int alphabetLength, int k, const EdlibAlignMode mode,
- int* bestScore_, int** positions_, int* numPositions_) {
+ int* const bestScore_, int** const positions_, int* const numPositions_) {
*positions_ = NULL;
*numPositions_ = 0;
@@ -679,22 +702,37 @@ static int myersCalcEditDistanceSemiGlobal(Word* const Peq, const int W, const i
}
-
-
/**
- * @param alignData Data generated during calculation, that is needed for reconstruction of alignment.
- * I it is allocated with new, so free it with delete.
- * Data is generated only if findAlignment is true.
- * @param targetStopPosition If set to -1, whole calculation is performed.
+ * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method.
+ * @param [in] Peq Query profile.
+ * @param [in] W Size of padding in last block.
+ * TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
+ * TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] query
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] alphabetLength
+ * @param [in] k
+ * @param [out] bestScore_ Edit distance.
+ * @param [out] position_ 0-indexed position in target at which best score was found.
+ * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned.
+ * Quadratic amount of memory is consumed.
+ * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment).
+ * Set only if findAlignment is set to true, otherwise it is NULL.
+ * Make sure to free this array using delete[].
+ * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected.
* If set to p, calculation is performed up to position p in target (inclusive)
* and column p is returned as the only column in alignData.
+ * @return Status.
*/
-static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
- const unsigned char* query, int queryLength,
- const unsigned char* target, int targetLength,
- int alphabetLength, int k, int* bestScore_, int* position_,
- bool findAlignment, AlignmentData** alignData,
- int targetStopPosition) {
+static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks,
+ const unsigned char* const query, const int queryLength,
+ const unsigned char* const target, const int targetLength,
+ const int alphabetLength, int k, int* const bestScore_,
+ int* const position_, const bool findAlignment,
+ AlignmentData** const alignData, const int targetStopPosition) {
if (targetStopPosition > -1 && findAlignment) {
// They can not be both set at the same time!
return EDLIB_STATUS_ERROR;
@@ -738,7 +776,7 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
const unsigned char* targetChar = target;
for (int c = 0; c < targetLength; c++) { // for each column
- Word* Peq_c = Peq + *targetChar * maxNumBlocks;
+ const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
//----------------------- Calculate column -------------------------//
int hout = 1;
@@ -773,12 +811,14 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
hout = newHout;
}
- // While block is out of band, move one block up. - This is optimal now, by my formula.
- // NOTICE: I added + W, and now it works! This has to be added because query is padded with W cells.
+ // While block is out of band, move one block up.
+ // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it.
+ // I could consider adding that max part, for optimal performance.
while (lastBlock >= firstBlock
&& (bl->score >= k + WORD_SIZE
|| ((lastBlock + 1) * WORD_SIZE - 1 >
- k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + W))) {
+ // TODO: Does not work if do not put +1! Why???
+ k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
lastBlock--; bl--;
}
//-------------------------//
@@ -799,11 +839,12 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
while (lastBlock >= firstBlock) {
// If all cells outside of band, remove block
vector<int> scores = getBlockCellValues(*bl);
- int r = (lastBlock + 1) * WORD_SIZE - 1;
+ int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
+ int r = lastBlock * WORD_SIZE + numCells - 1;
bool reduce = true;
- for (int i = 0; i < WORD_SIZE; i++) {
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
// TODO: Does not work if do not put +1! Why???
- if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + W + 1) {
+ if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) {
reduce = false;
break;
}
@@ -816,9 +857,10 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
while (firstBlock <= lastBlock) {
// If all cells outside of band, remove block
vector<int> scores = getBlockCellValues(blocks[firstBlock]);
- int r = (firstBlock + 1) * WORD_SIZE - 1;
+ int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
+ int r = firstBlock * WORD_SIZE + numCells - 1;
bool reduce = true;
- for (int i = 0; i < WORD_SIZE; i++) {
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
reduce = false;
break;
@@ -853,7 +895,6 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
}
}
//----------------------------------------------------------//
-
//---- If this is stop column, save it and finish ----//
if (c == targetStopPosition) {
for (int b = firstBlock; b <= lastBlock; b++) {
@@ -902,8 +943,8 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
* @return Status code.
*/
static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
- const int bestScore, const AlignmentData* alignData,
- unsigned char** alignment, int* alignmentLength) {
+ const int bestScore, const AlignmentData* const alignData,
+ unsigned char** const alignment, int* const alignmentLength) {
const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
const int W = maxNumBlocks * WORD_SIZE - queryLength;
@@ -919,6 +960,9 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block
// True if block to left exists and is in band
bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
+ // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the
+ // calculation as both lP and lM should be initialized at some moment later (but compiler can not
+ // detect it since this initialization is guaranteed by "business" logic).
Word lP = 0, lM = 0;
if (thereIsLeftBlock) {
lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left
@@ -927,12 +971,17 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
currP <<= W;
currM <<= W;
int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos
+
+ // TODO(martin): refactor this whole piece of code. There are too many if-else statements,
+        // it is too easy for a bug to hide and too hard to effectively cover all the edge-cases.
+ // We need better separation of logic and responsibilities.
+ while (true) {
if (c == 0) {
thereIsLeftBlock = true;
lScore = b * WORD_SIZE + blockPos + 1;
ulScore = lScore - 1;
}
- while (true) {
+
// TODO: improvement: calculate only those cells that are needed,
// for example if I calculate upper cell and can move up,
// there is no need to calculate left and upper left cell
@@ -992,6 +1041,8 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
} else {
thereIsLeftBlock = false;
+ // TODO(martin): There may not be left block, but there can be left boundary - do we
+ // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this.
}
}
} else {
@@ -1110,10 +1161,12 @@ static int obtainAlignmentTraceback(const int queryLength, const int targetLengt
* @param [out] alignmentLength Length of alignment.
* @return Status code.
*/
-static int obtainAlignment(const unsigned char* query, const unsigned char* rQuery, const int queryLength,
- const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+static int obtainAlignment(
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
const int alphabetLength, const int bestScore,
- unsigned char** alignment, int* alignmentLength) {
+ unsigned char** const alignment, int* const alignmentLength) {
+
// Handle special case when one of sequences has length of 0.
if (queryLength == 0 || targetLength == 0) {
*alignmentLength = targetLength + queryLength;
@@ -1180,10 +1233,11 @@ static int obtainAlignment(const unsigned char* query, const unsigned char* rQue
* @return Status code.
*/
static int obtainAlignmentHirschberg(
- const unsigned char* query, const unsigned char* rQuery, const int queryLength,
- const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
const int alphabetLength, const int bestScore,
- unsigned char** alignment, int* alignmentLength) {
+ unsigned char** const alignment, int* const alignmentLength) {
+
const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
const int W = maxNumBlocks * WORD_SIZE - queryLength;
@@ -1199,7 +1253,8 @@ static int obtainAlignmentHirschberg(
// Calculate left half.
AlignmentData* alignDataLeftHalf = NULL;
- myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+ int leftHalfCalcStatus = myersCalcEditDistanceNW(
+ Peq, W, maxNumBlocks,
query, queryLength,
target, targetLength,
alphabetLength, bestScore,
@@ -1207,7 +1262,8 @@ static int obtainAlignmentHirschberg(
// Calculate right half.
AlignmentData* alignDataRightHalf = NULL;
- myersCalcEditDistanceNW(rPeq, W, maxNumBlocks,
+ int rightHalfCalcStatus = myersCalcEditDistanceNW(
+ rPeq, W, maxNumBlocks,
rQuery, queryLength,
rTarget, targetLength,
alphabetLength, bestScore,
@@ -1216,6 +1272,12 @@ static int obtainAlignmentHirschberg(
delete[] Peq;
delete[] rPeq;
+ if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) {
+ if (alignDataLeftHalf) delete alignDataLeftHalf;
+ if (alignDataRightHalf) delete alignDataRightHalf;
+ return EDLIB_STATUS_ERROR;
+ }
+
// Unwrap the left half.
int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
@@ -1359,9 +1421,10 @@ static int obtainAlignmentHirschberg(
* @param [out] targetTransformed It will contain values in range [0, alphabet length - 1].
* @return Alphabet length - number of letters in recognized alphabet.
*/
-static int transformSequences(const char* queryOriginal, const int queryLength,
- const char* targetOriginal, const int targetLength,
- unsigned char** queryTransformed, unsigned char** targetTransformed) {
+static int transformSequences(const char* const queryOriginal, const int queryLength,
+ const char* const targetOriginal, const int targetLength,
+ unsigned char** const queryTransformed,
+ unsigned char** const targetTransformed) {
// Alphabet is constructed from letters that are present in sequences.
// Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
// and new query and target are created in which letters are replaced with their ordinal numbers.
@@ -1370,13 +1433,13 @@ static int transformSequences(const char* queryOriginal, const int queryLength,
*targetTransformed = new unsigned char [targetLength];
// Alphabet information, it is constructed on fly while transforming sequences.
- unsigned char letterIdx[128]; //!< letterIdx[c] is index of letter c in alphabet
- bool inAlphabet[128]; // inAlphabet[c] is true if c is in alphabet
- for (int i = 0; i < 128; i++) inAlphabet[i] = false;
+ unsigned char letterIdx[256]; //!< letterIdx[c] is index of letter c in alphabet
+ bool inAlphabet[256]; // inAlphabet[c] is true if c is in alphabet
+ for (int i = 0; i < 256; i++) inAlphabet[i] = false;
int alphabetLength = 0;
for (int i = 0; i < queryLength; i++) {
- char c = queryOriginal[i];
+ unsigned char c = static_cast<unsigned char>(queryOriginal[i]);
if (!inAlphabet[c]) {
inAlphabet[c] = true;
letterIdx[c] = alphabetLength;
@@ -1385,7 +1448,7 @@ static int transformSequences(const char* queryOriginal, const int queryLength,
(*queryTransformed)[i] = letterIdx[c];
}
for (int i = 0; i < targetLength; i++) {
- char c = targetOriginal[i];
+ unsigned char c = static_cast<unsigned char>(targetOriginal[i]);
if (!inAlphabet[c]) {
inAlphabet[c] = true;
letterIdx[c] = alphabetLength;
@@ -1406,7 +1469,7 @@ EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask
return config;
}
-EdlibAlignConfig edlibDefaultAlignConfig() {
+EdlibAlignConfig edlibDefaultAlignConfig(void) {
return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE);
}
diff --git a/src/overlapInCore/libedlib/edlib.H b/src/overlapInCore/libedlib/edlib.H
index c85d895..7fa1e2e 100644
--- a/src/overlapInCore/libedlib/edlib.H
+++ b/src/overlapInCore/libedlib/edlib.H
@@ -155,7 +155,7 @@ EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask
* @return Default configuration object, with following defaults:
* k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE.
*/
-EdlibAlignConfig edlibDefaultAlignConfig();
+EdlibAlignConfig edlibDefaultAlignConfig(void);
/**
@@ -222,9 +222,9 @@ void edlibFreeAlignResult(EdlibAlignResult result);
* It always returns edit distance and end locations of optimal alignment in target.
* It optionally returns start locations of optimal alignment in target and alignment path,
* if you choose appropriate tasks.
- * @param [in] query First sequence. Character codes should be in range [0, 127].
+ * @param [in] query First sequence.
* @param [in] queryLength Number of characters in first sequence.
- * @param [in] target Second sequence. Character codes should be in range [0, 127].
+ * @param [in] target Second sequence.
* @param [in] targetLength Number of characters in second sequence.
* @param [in] config Additional alignment parameters, like alignment method and wanted results.
* @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
@@ -232,7 +232,7 @@ void edlibFreeAlignResult(EdlibAlignResult result);
*/
EdlibAlignResult edlibAlign(const char* query, const int queryLength,
const char* target, const int targetLength,
- EdlibAlignConfig config);
+ const EdlibAlignConfig config);
/**
@@ -254,10 +254,16 @@ EdlibAlignResult edlibAlign(const char* query, const int queryLength,
* Needed memory is allocated and given pointer is set to it.
* Do not forget to free it later using free()!
*/
-char* edlibAlignmentToCigar(unsigned char* alignment, int alignmentLength,
+char* edlibAlignmentToCigar(const unsigned char* alignment, int alignmentLength,
EdlibCigarFormat cigarFormat);
-void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength, int tgtStart, int tgtEnd, int qryStart, int qryEnd, const char *tgt, const char *qry, char *tgt_aln_str, char *qry_aln_str);
+void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength,
+ int tgtStart, int tgtEnd,
+ int qryStart, int qryEnd,
+ const char *tgt,
+ const char *qry,
+ char *tgt_aln_str,
+ char *qry_aln_str);
#endif // EDLIB_H
diff --git a/src/overlapInCore/liboverlap/Binomial_Bound.C b/src/overlapInCore/liboverlap/Binomial_Bound.C
index 0204891..2ed21a6 100644
--- a/src/overlapInCore/liboverlap/Binomial_Bound.C
+++ b/src/overlapInCore/liboverlap/Binomial_Bound.C
@@ -176,6 +176,14 @@ Initialize_Match_Limit(int32 *ml, double maxErate, int32 maxErrors) {
double sl = 0.986446300363063 / maxErate + 0.052358358862826;
#endif
+#if AS_MAX_READLEN_BITS == 23
+ double sl = 0.989923769842693 / maxErate + 0.0395372203695468;
+#endif
+
+#if AS_MAX_READLEN_BITS == 24
+ double sl = 0.992440299290478 / maxErate + 0.036791522317757;
+#endif
+
// And the first value.
double vl = ml[e-1] + sl;
diff --git a/src/overlapInCore/overlapInCorePartition.C b/src/overlapInCore/overlapInCorePartition.C
index b1abad4..fec0b19 100644
--- a/src/overlapInCore/overlapInCorePartition.C
+++ b/src/overlapInCore/overlapInCorePartition.C
@@ -47,6 +47,8 @@
// ovlbat - batch names
// ovljob - job names
// ovlopt - overlapper options
+//
+// From (very) old paper notes, overlapInCore only computes overlaps for referenceID < hashID.
uint32 batchMax = 1000;
diff --git a/src/pipelines/canu.pl b/src/pipelines/canu.pl
index e15d27a..f7c81d9 100644
--- a/src/pipelines/canu.pl
+++ b/src/pipelines/canu.pl
@@ -92,9 +92,11 @@ setDefaults();
# The bin directory is needed for -version, can only be set after setDefaults(), but really should be
# set after checkParameters() so it can know pathMap.
-my $bin = getBinDirectory(); # Path to binaries, reset later.
-my $cmd = undef; # Temporary string passed to system().
-my $asm = undef; # Name of our assembly.
+my $bin = getBinDirectory(); # Path to binaries, reset later.
+my $cmd = undef; # Temporary string passed to system().
+my $asm = undef; # Name of our assembly.
+my $asmAuto = undef; # If set, the name was auto-discovered.
+
# What a mess. We can't set the version string until after we have a bin directory, and
# Defaults.pm can't call stuff in Execution.pm. So, we need to special case setting the version
@@ -132,7 +134,8 @@ foreach my $arg (@ARGV) {
my $rootdir = undef;
my $readdir = undef;
-my $mode = undef;
+my $mode = undef; # "correct", "trim", "trim-assemble" or "assemble"
+my $type = undef; # "pacbio" or "nanopore"
my $step = "run";
my $haveRaw = 0;
my $haveCorrected = 0;
@@ -143,6 +146,11 @@ while (scalar(@ARGV)) {
if (($arg eq "-h") || ($arg eq "-help") || ($arg eq "--help")) {
printHelp(1);
+ } elsif (($arg eq "-citation") || ($arg eq "--citation")) {
+ print STDERR "\n";
+ printCitation(undef);
+ exit(0);
+
} elsif ($arg eq "-d") {
$rootdir = shift @ARGV;
@@ -178,6 +186,11 @@ while (scalar(@ARGV)) {
$readdir = shift @ARGV;
addCommandLineOption("-readdir '$readdir'");
+ } elsif (($arg eq "-pacbio") ||
+ ($arg eq "-nanopore")) {
+ $type = "pacbio" if ($arg eq "-pacbio");
+ $type = "nanopore" if ($arg eq "-nanopore");
+
} elsif (($arg eq "-pacbio-raw") || # File handling is also present in
($arg eq "-pacbio-corrected") || # Defaults.pm around line 438
($arg eq "-nanopore-raw") ||
@@ -204,21 +217,27 @@ while (scalar(@ARGV)) {
addCommandLineOption("'$arg'");
} else {
- print STDERR "INVALID $arg\n";
addCommandLineError("ERROR: Invalid command line option '$arg'. Did you forget quotes around options with spaces?\n");
}
}
-# Fail if some obvious things aren't set.
+# If no $asm or $dir, see if there is an assembly here. If so, set $asm to what was found.
-addCommandLineError("ERROR: Assembly name prefix not supplied with -p.\n") if (!defined($asm));
+if (!defined($asm)) {
+ $asmAuto = 1; # If we don't actually find a prefix, we'll fail right after this, so OK to set blindly.
-# If the mode isn't set - which is allowed only if a gkpStore exists somewhere - be a little smart
-# and figure out which store exists.
+ open(F, "ls -d */*gkpStore |");
+ while (<F>) {
+ $asm = $1 if (m/^correction\/(.*).gkpStore$/);
+ $asm = $1 if (m/^trimming\/(.*).gkpStore$/);
+ $asm = $1 if (m/^unitigging\/(.*).gkpStore$/);
+ }
+ close(F);
+}
-$mode = "run" if (!defined($mode) && (-d "correction/$asm.gkpStore"));
-$mode = "trim-assemble" if (!defined($mode) && (-d "trimming/$asm.gkpStore"));
-$mode = "assemble" if (!defined($mode) && (-d "unitigging/$asm.gkpStore"));
+# Fail if some obvious things aren't set.
+
+addCommandLineError("ERROR: Assembly name prefix not supplied with -p.\n") if (!defined($asm));
# Load paramters from the defaults files
@@ -235,84 +254,7 @@ foreach my $specFile (@specFiles) {
setParametersFromCommandLine(@specOpts);
-# Set parameters based on file types supplied.
-
-my $setUpForPacBio = 0;
-my $setUpForNanopore = 0;
-
-foreach my $typefile (@inputFiles) {
- my ($type, $file) = split '\0', $typefile;
-
- $mode = "trim-assemble" if (!defined($mode) && ($type =~ m/corrected/));
- $mode = "run" if (!defined($mode) && ($type =~ m/raw/));
-
- $haveCorrected = 1 if ($type =~ m/corrected/);
- $haveRaw = 1 if ($type =~ m/raw/);
-
- $setUpForPacBio++ if ($type =~ m/pacbio/);
- $setUpForNanopore++ if ($type =~ m/nanopore/);
-}
-
-# Fail if both raw and corrected are supplied.
-
-addCommandLineError("ERROR: Canu does not currently support mixing raw and corrected sequences.\n") if ($haveRaw && $haveCorrected);
-
-# If anything complained (invalid option, missing file, etc) printHelp() will trigger and exit.
-
-printHelp();
-
-# When resuming a run without input files, set the error rates based on library type in the
-# gkpStore.
-
-if (scalar(@inputFiles) == 0) {
- my $gkpStore = undef;
-
- $gkpStore = "correction/$asm.gkpStore" if (-e "correction/$asm.gkpStore/libraries.txt");
- $gkpStore = "trimming/$asm.gkpStore" if (-e "trimming/$asm.gkpStore/libraries.txt");
- $gkpStore = "unitigging/$asm.gkpStore" if (-e "unitigging/$asm.gkpStore/libraries.txt");
-
- caExit("ERROR: no reads supplied, and can't find any library information in gkpStore", undef) if (!defined($gkpStore));
-
- my $numPacBioRaw = 0;
- my $numPacBioCorrected = 0;
- my $numNanoporeRaw = 0;
- my $numNanoporeCorrected = 0;
-
- open(L, "< $gkpStore/libraries.txt") or caExit("can't open '$gkpStore/libraries.txt' for reading: $!", undef);
- while (<L>) {
- $numPacBioRaw++ if (m/pacbio-raw/);
- $numPacBioCorrected++ if (m/pacbio-corrected/);
- $numNanoporeRaw++ if (m/nanopore-raw/);
- $numNanoporeCorrected++ if (m/nanopore-corrected/);
- }
-
- $setUpForPacBio++ if ($numPacBioRaw + $numPacBioCorrected > 0);
- $setUpForNanopore++ if ($numNanoporeRaw + $numNanoporeCorrected > 0);
-}
-
-# Now set error rates (if not set already) based on the dominant read type.
-
-if ($setUpForNanopore > 0) {
- setGlobalIfUndef("corOvlErrorRate", 0.320);
- setGlobalIfUndef("obtOvlErrorRate", 0.144);
- setGlobalIfUndef("utgOvlErrorRate", 0.144);
- setGlobalIfUndef("corErrorRate", 0.500);
- setGlobalIfUndef("obtErrorRate", 0.144);
- setGlobalIfUndef("utgErrorRate", 0.144);
- setGlobalIfUndef("cnsErrorRate", 0.144);
-} else {
- setGlobalIfUndef("corOvlErrorRate", 0.240);
- setGlobalIfUndef("obtOvlErrorRate", 0.045);
- setGlobalIfUndef("utgOvlErrorRate", 0.045);
- setGlobalIfUndef("corErrorRate", 0.300);
- setGlobalIfUndef("obtErrorRate", 0.045);
- setGlobalIfUndef("utgErrorRate", 0.045);
- setGlobalIfUndef("cnsErrorRate", 0.045);
-}
-
-# Finish setting parameters, then reset the bin directory using pathMap.
-
-checkParameters();
+# Reset $bin, now that all options, specifically the pathMap, are set.
$bin = getBinDirectory();
@@ -323,7 +265,13 @@ printHelp();
# Now that we know the bin directory, print the version so those pesky users
# will (hopefully) include it when they paste in logs.
-print "-- " . getGlobal("version") . "\n";
+print STDERR "-- " . getGlobal("version") . "\n";
+print STDERR "--\n";
+print STDERR "-- CITATIONS\n";
+print STDERR "--\n";
+printCitation("-- ");
+print STDERR "-- CONFIGURE CANU\n";
+print STDERR "--\n";
# Check java and gnuplot.
@@ -390,6 +338,202 @@ setGlobal("onExitNam", $asm);
setGlobalIfUndef("objectStoreNameSpace", $asm); # No good place to put this.
+# Figure out read inputs. From an existing store? From files? Corrected? Etc, etc.
+
+my $haveCorrected = 0;
+my $haveRaw = 0;
+
+my $setUpForPacBio = 0;
+my $setUpForNanopore = 0;
+
+# If we're a cloud run, fetch the store we expect to be working with.
+
+fetchStore("unitigging/$asm.gkpStore") if ((! -e "unitigging/$asm.gkpStore") && (fileExists("unitigging/$asm.gkpStore.tar")));
+fetchStore("trimming/$asm.gkpStore") if ((! -e "trimming/$asm.gkpStore") && (fileExists("trimming/$asm.gkpStore.tar")) && (! -e "unitigging/$asm.gkpStore"));
+fetchStore("correction/$asm.gkpStore") if ((! -e "correction/$asm.gkpStore") && (fileExists("correction/$asm.gkpStore.tar")) && (! -e "trimming/$asm.gkpStore"));
+
+# Scan for an existing gkpStore. If the output from that stage exists, ignore the store there.
+
+my $gkp;
+
+$gkp = "correction/$asm.gkpStore" if ((-e "correction/$asm.gkpStore/libraries.txt") && (sequenceFileExists("$asm.correctedReads") eq undef));
+$gkp = "trimming/$asm.gkpStore" if ((-e "trimming/$asm.gkpStore/libraries.txt") && (sequenceFileExists("$asm.trimmedReads") eq undef));
+$gkp = "unitigging/$asm.gkpStore" if ((-e "unitigging/$asm.gkpStore/libraries.txt"));
+
+# Scan for existing stage outputs. These only get used if there isn't a gkpStore found above.
+
+my $reads;
+
+$reads = sequenceFileExists("$asm.correctedReads") if (!defined($reads));
+$reads = sequenceFileExists("$asm.trimmedReads") if (!defined($reads));
+
+# A handy function for reporting what reads we found.
+
+sub reportReadsFound ($$$$) {
+ my ($setUpForPacBio, $setUpForNanopore, $haveRaw, $haveCorrected) = @_;
+
+ my $rt;
+ my $ct;
+
+ $rt = "both PacBio and Nanopore" if (($setUpForPacBio > 0) && ($setUpForNanopore > 0));
+ $rt = "PacBio" if (($setUpForPacBio > 0) && ($setUpForNanopore == 0));
+ $rt = "Nanopore" if (($setUpForPacBio == 0) && ($setUpForNanopore > 0));
+ $rt = "unknown" if (($setUpForPacBio == 0) && ($setUpForNanopore == 0));
+
+ $ct = "uncorrected" if (($haveRaw > 0) && ($haveCorrected == 0));
+ $ct = "corrected" if (($haveRaw == 0) && ($haveCorrected > 0));
+ $ct = "uncorrected AND corrected" if (($haveRaw > 0) && ($haveCorrected > 0));
+
+ return("$rt $ct");
+}
+
+# If a gkpStore was found, scan the reads in it to decide what we're working with.
+
+if (defined($gkp)) {
+ my $numPacBioRaw = 0;
+ my $numPacBioCorrected = 0;
+ my $numNanoporeRaw = 0;
+ my $numNanoporeCorrected = 0;
+
+ open(L, "< $gkp/libraries.txt") or caExit("can't open '$gkp/libraries.txt' for reading: $!", undef);
+ while (<L>) {
+ $numPacBioRaw++ if (m/pacbio-raw/);
+ $numPacBioCorrected++ if (m/pacbio-corrected/);
+ $numNanoporeRaw++ if (m/nanopore-raw/);
+ $numNanoporeCorrected++ if (m/nanopore-corrected/);
+ }
+ close(L);
+
+ $setUpForPacBio++ if ($numPacBioRaw + $numPacBioCorrected > 0);
+ $setUpForNanopore++ if ($numNanoporeRaw + $numNanoporeCorrected > 0);
+
+ $haveRaw++ if ($numPacBioRaw + $numNanoporeRaw > 0);
+ $haveCorrected++ if ($numPacBioCorrected + $numNanoporeCorrected > 0);
+
+ my $rtct = reportReadsFound($setUpForPacBio, $setUpForNanopore, $haveRaw, $haveCorrected);
+
+ print STDERR "--\n";
+ print STDERR "-- Found $rtct reads in '$gkp'.\n";
+}
+
+# Like above, scan the gkpStore to decide what we're working with. The catch here is that
+# we scan the previous store, and all reads are corrected.
+
+elsif (defined($reads)) {
+
+ $gkp = "correction/$asm.gkpStore" if ((-e "correction/$asm.gkpStore/libraries.txt") && (sequenceFileExists("$asm.correctedReads")));
+ $gkp = "trimming/$asm.gkpStore" if ((-e "trimming/$asm.gkpStore/libraries.txt") && (sequenceFileExists("$asm.trimmedReads")));
+
+ my $numPacBio = 0;
+ my $numNanopore = 0;
+
+ if (defined($gkp)) {
+ open(L, "< $gkp/libraries.txt") or caExit("can't open '$gkp/libraries.txt' for reading: $!", undef);
+ while (<L>) {
+ $numPacBio++ if (m/pacbio/);
+ $numNanopore++ if (m/nanopore/);
+ }
+ close(L);
+
+ $setUpForPacBio++ if ($numPacBio > 0);
+ $setUpForNanopore++ if ($numNanopore > 0);
+
+ $haveCorrected++;
+ } else {
+ #$setUpForPacBio++; # Leaving both setUp's as zero reports 'unknown' and
+ $haveCorrected++; # defaults to Pacbio below (search for setUpForNanopore).
+ }
+
+ # Regardless of what the user gave us, we always want to restart with these reads.
+
+ undef @inputFiles;
+ push @inputFiles, (($setUpForNanopore == 0) ? "-pacbio" : "-nanopore") . "-corrected\0$reads";
+
+ my $rtct = reportReadsFound($setUpForPacBio, $setUpForNanopore, $haveRaw, $haveCorrected);
+
+ print STDERR "--\n";
+ print STDERR "-- Found $rtct reads in '$reads'.\n";
+}
+
+# Scan input files, counting the different types of libraries we have.
+
+elsif (scalar(@inputFiles) > 0) {
+ foreach my $typefile (@inputFiles) {
+ my ($type, $file) = split '\0', $typefile;
+
+ $haveCorrected++ if ($type =~ m/corrected/);
+ $haveRaw++ if ($type =~ m/raw/);
+
+ $setUpForPacBio++ if ($type =~ m/pacbio/);
+ $setUpForNanopore++ if ($type =~ m/nanopore/);
+ }
+
+ my $rtct = reportReadsFound($setUpForPacBio, $setUpForNanopore, $haveRaw, $haveCorrected);
+
+ print STDERR "--\n";
+ print STDERR "-- Found $rtct reads in the input files.\n";
+}
+
+# Set an initial run mode, based on the libraries we have found, or the stores that exist (unless
+# it was set on the command line).
+
+if (!defined($mode)) {
+ $mode = "run" if ($haveRaw > 0);
+ $mode = "trim-assemble" if ($haveCorrected > 0);
+
+ $mode = "run" if (-e "correction/$asm.gkpStore/libraries.txt");
+ $mode = "trim-assemble" if (-e "trimming/$asm.gkpStore/libraries.txt");
+ $mode = "assemble" if (-e "unitigging/$asm.gkpStore/libraries.txt");
+}
+
+# Set the type of the reads. A command line option could force the type, e.g., "-pacbio" or
+# "-nanopore", to let you do cRaZy stuff like "-nanopore -pacbio-raw *fastq".
+
+if (!defined($type)) {
+ $type = "pacbio" if ($setUpForPacBio > 0);
+ $type = "nanopore" if ($setUpForNanopore > 0);
+}
+
+# Now set error rates (if not set already) based on the dominant read type.
+
+if ($type eq"nanopore") {
+ setGlobalIfUndef("corOvlErrorRate", 0.320);
+ setGlobalIfUndef("obtOvlErrorRate", 0.144);
+ setGlobalIfUndef("utgOvlErrorRate", 0.144);
+ setGlobalIfUndef("corErrorRate", 0.500);
+ setGlobalIfUndef("obtErrorRate", 0.144);
+ setGlobalIfUndef("utgErrorRate", 0.144);
+ setGlobalIfUndef("cnsErrorRate", 0.192);
+}
+
+if ($type eq"pacbio") {
+ setGlobalIfUndef("corOvlErrorRate", 0.240);
+ setGlobalIfUndef("obtOvlErrorRate", 0.045);
+ setGlobalIfUndef("utgOvlErrorRate", 0.045);
+ setGlobalIfUndef("corErrorRate", 0.300);
+ setGlobalIfUndef("obtErrorRate", 0.045);
+ setGlobalIfUndef("utgErrorRate", 0.045);
+ setGlobalIfUndef("cnsErrorRate", 0.075);
+}
+
+# Check for a few errors:
+# no mode -> don't have any reads or any store to run from.
+# both raw and corrected -> don't know how to process these
+
+caExit("ERROR: No reads supplied, and can't find any reads in any gkpStore", undef) if (!defined($mode));
+caExit("ERROR: Failed to determine the sequencing technology of the reads", undef) if (!defined($type));
+caExit("ERROR: Can't mix uncorrected and corrected reads", undef) if ($haveRaw && $haveCorrected);
+
+# Do a final check on parameters, cleaning up paths and case, and failing on invalid stuff.
+
+checkParameters();
+
+# And one final last chance to fail - because java and gnuplot both can set an error.
+
+printHelp();
+
+# Go!
+
printf STDERR "--\n";
printf STDERR "-- Generating assembly '$asm' in '" . getcwd() . "'\n";
printf STDERR "--\n";
@@ -408,22 +552,6 @@ printf STDERR "-- obtErrorRate %6.4f (%6.2f%%)\n", getGlobal("obtErrorRate
printf STDERR "-- utgErrorRate %6.4f (%6.2f%%)\n", getGlobal("utgErrorRate"), getGlobal("utgErrorRate") * 100.0;
printf STDERR "-- cnsErrorRate %6.4f (%6.2f%%)\n", getGlobal("cnsErrorRate"), getGlobal("cnsErrorRate") * 100.0;
-if (defined(getGlobal('errorRateUsed'))) {
- print STDERR getGlobal('errorRateUsed');
-}
-
-# Fail immediately if we run the script on the grid, and the gkpStore directory doesn't exist and
-# we have no input files. Without this check we'd fail only after being scheduled on the grid.
-
-my $cor = (-e "correction/$asm.gkpStore") || fileExists("correction/$asm.gkpStore.tar") || sequenceFileExists("$asm.correctedReads") || (-e "$asm.correctedReads.gkp");
-my $obt = (-e "trimming/$asm.gkpStore") || fileExists("trimming/$asm.gkpStore.tar") || sequenceFileExists("$asm.trimmedReads") || (-e "$asm.trimmedReads.gkp");
-my $utg = (-e "unitigging/$asm.gkpStore") || fileExists("unitigging/$asm.gkpStore.tar");
-
-if (($cor + $obt + $utg == 0) &&
- (scalar(@inputFiles) == 0)) {
- caExit("no input files specified, and store not already created", undef);
-}
-
# Check that we were supplied a work directory, and that it exists, or we can create it.
make_path("canu-logs") if (! -d "canu-logs");
@@ -541,8 +669,6 @@ if (setOptions($mode, "correct") eq "correct") {
generateCorrectedReads($asm) foreach (1..getGlobal("canuIterationMax") + 1);
dumpCorrectedReads($asm);
- estimateCorrectedError($asm, "cor");
-
buildHTML($asm, "cor");
}
@@ -551,7 +677,7 @@ if (setOptions($mode, "correct") eq "correct") {
caExit("can't find corrected reads '$asm.correctedReads*' in directory '" . getcwd() . "'", undef) if (!defined($correctedReads));
undef @inputFiles;
- push @inputFiles, "-pacbio-corrected\0$correctedReads";
+ push @inputFiles, "-$type-corrected\0$correctedReads";
}
@@ -583,7 +709,7 @@ if (setOptions($mode, "trim") eq "trim") {
caExit("can't find trimmed reads '$asm.trimmedReads*' in directory '" . getcwd() . "'", undef) if (!defined($trimmedReads));
undef @inputFiles;
- push @inputFiles, "-pacbio-corrected\0$trimmedReads";
+ push @inputFiles, "-$type-corrected\0$trimmedReads";
}
@@ -629,5 +755,7 @@ if (setOptions($mode, "assemble") eq "assemble") {
}
}
+print STDERR "--\n";
+print STDERR "-- Bye.\n";
exit(0);
diff --git a/src/pipelines/canu/Configure.pm b/src/pipelines/canu/Configure.pm
index 45c268e..1f4f962 100644
--- a/src/pipelines/canu/Configure.pm
+++ b/src/pipelines/canu/Configure.pm
@@ -53,9 +53,11 @@ use canu::Execution;
# 1g-2000m:1g only adds 1g.
# 1g-2048m:1g adds 1 and 2g.
-sub expandRange ($$) {
+sub expandRange ($$$$) {
my $var = shift @_;
my $val = shift @_;
+ my $min = shift @_; # limit the minimum to be above this
+ my $max = shift @_; # limit the maximum to be below this
my @v = split ',', $val;
my @r;
@@ -126,10 +128,24 @@ sub expandRange ($$) {
elsif ($def == 3) {
}
+ # Convert the value and unit to gigabytes.
+
my $b = adjustMemoryValue("$bgn$bgnu");
my $e = adjustMemoryValue("$end$endu");
my $s = adjustMemoryValue("$stp$stpu");
+ # Enforce the user supplied minimum and maximum. We cannot 'decrease min to user supplied
+ # maximum' because this effectively ignores the task setting. For, e.g., batMemory=64-128
+ # and maxMemory=32, we want it to fail.
+
+ $b = $min if ((defined($min)) && ($b < $min)); # Increase min to user supplied minimum.
+ $e = $min if ((defined($min)) && ($e < $min)); # Increase max to user supplied minimum.
+
+    #$b = $max if ((defined($max)) && ($b > $max)); # Decrease min to user supplied maximum.
+    $e = $max if ((defined($max)) && ($e > $max)); # Decrease max to user supplied maximum.
+
+ # Iterate over the range, push values to test onto the array.
+
for (my $ii=$b; $ii<=$e; $ii += $s) {
push @r, $ii;
}
@@ -184,6 +200,9 @@ sub getAllowedResources ($$$$@) {
# Figure out limits.
+ my $minMemory = getGlobal("minMemory");
+ my $minThreads = getGlobal("minThreads");
+
my $maxMemory = getGlobal("maxMemory");
my $maxThreads = getGlobal("maxThreads");
@@ -200,9 +219,11 @@ sub getAllowedResources ($$$$@) {
if ($dbg) {
print STDERR "--\n";
- print STDERR "-- DEBUG\n";
- print STDERR "-- DEBUG Limited to $maxMemory GB memory via maxMemory option\n" if (defined($maxMemory));
- print STDERR "-- DEBUG Limited to $maxThreads threads via maxThreads option\n" if (defined($maxThreads));
+ print STDERR "-- ERROR\n";
+ print STDERR "-- ERROR Limited to at least $minMemory GB memory via minMemory option\n" if (defined($minMemory));
+ print STDERR "-- ERROR Limited to at least $minThreads threads via minThreads option\n" if (defined($minThreads));
+ print STDERR "-- ERROR Limited to at most $maxMemory GB memory via maxMemory option\n" if (defined($maxMemory));
+ print STDERR "-- ERROR Limited to at most $maxThreads threads via maxThreads option\n" if (defined($maxThreads));
}
# Figure out the largest memory and threads that could ever be supported. This lets us short-circuit
@@ -239,12 +260,11 @@ sub getAllowedResources ($$$$@) {
}
if ($dbg) {
- print STDERR "-- DEBUG\n";
- print STDERR "-- DEBUG Have ", scalar(@gridCor), " configurations; largest memory size $maxMemory GB; most cores $maxThreads:\n";
+ print STDERR "-- ERROR\n";
+ print STDERR "-- ERROR Found ", scalar(@gridCor), " machine ", ((scalar(@gridCor) == 1) ? "configuration:\n" : "configurations:\n");
for (my $ii=0; $ii<scalar(@gridCor); $ii++) {
- print STDERR "-- DEBUG class$ii - $gridNum[$ii] machines with $gridCor[$ii] cores with $gridMem[$ii]GB memory each.\n";
+ print STDERR "-- ERROR class$ii - $gridNum[$ii] machines with $gridCor[$ii] cores with $gridMem[$ii] GB memory each.\n";
}
- print STDERR "-- DEBUG\n";
}
# The task usually has multiple choices, and we have a little optimization problem to solve. For each
@@ -255,8 +275,8 @@ sub getAllowedResources ($$$$@) {
# We then (typically) want to maximize the number of cores we can get running.
# Other options would be number of cores * amount of memory.
- my @taskMemory = expandRange("${tag}${alg}Memory", $taskMemory);
- my @taskThreads = expandRange("${tag}${alg}Threads", $taskThreads);
+ my @taskMemory = expandRange("${tag}${alg}Memory", $taskMemory, $minMemory, $maxMemory);
+ my @taskThreads = expandRange("${tag}${alg}Threads", $taskThreads, $minThreads, $maxThreads);
# Find task memory/thread settings that will maximize the number of cores running. This used
# to also compute best as 'cores * memory' but that is better handled by ordering the task
@@ -271,7 +291,7 @@ sub getAllowedResources ($$$$@) {
foreach my $m (@taskMemory) {
foreach my $t (@taskThreads) {
#if ($dbg && (($m > $maxMemory) || ($t > $maxThreads))) {
- # print STDERR "-- DEBUG Tested $tag$alg requesting $t cores and ${m}GB memory - rejected: limited to ${maxMemory}GB and $maxThreads cores.\n";
+ # print STDERR "-- ERROR Tested $tag$alg requesting $t cores and ${m}GB memory - rejected: limited to ${maxMemory}GB and $maxThreads cores.\n";
#}
next if ($m > $maxMemory); # Bail if either of the suggest settings are
next if ($t > $maxThreads); # larger than the maximum allowed.
@@ -297,7 +317,7 @@ sub getAllowedResources ($$$$@) {
my $np = ($np_cpu < $np_mem) ? $np_cpu : $np_mem;
if ($dbg) {
- print STDERR "-- DEBUG for $t threads and $m memory - class$ii can support $np_cpu jobs(cores) and $np_mem jobs(memory), so $np jobs.\n";
+ print STDERR "-- ERROR for $t threads and $m memory - class$ii can support $np_cpu jobs(cores) and $np_mem jobs(memory), so $np jobs.\n";
}
$processes += $np;
@@ -306,7 +326,7 @@ sub getAllowedResources ($$$$@) {
}
if ($dbg) {
- print STDERR "-- DEBUG Tested $tag$alg requesting $t cores and ${m}GB memory and found $cores could be used.\n";
+ print STDERR "-- ERROR Tested $tag$alg requesting $t cores and ${m}GB memory and found $cores could be used.\n";
}
# If no cores, then all machines were too small.
@@ -326,11 +346,18 @@ sub getAllowedResources ($$$$@) {
if (!defined($bestCoresM)) {
getAllowedResources($tag, $alg, $err, $all, 1) if (!defined($dbg));
- print STDERR "--\n";
- print STDERR "-- Task $tag$alg can't run on any available machines.\n";
- print STDERR "-- It is requesting ", getGlobal("${tag}${alg}Memory"), " GB memory and ", getGlobal("${tag}${alg}Threads"), " threads.\n";
- print STDERR "-- See above for hardware limits.\n";
- print STDERR "--\n";
+ print STDERR "-- ERROR\n";
+ print STDERR "-- ERROR Task $tag$alg can't run on any available machines.\n";
+ print STDERR "-- ERROR It is requesting:\n";
+ print STDERR "-- ERROR ${tag}${alg}Memory=", getGlobal("${tag}${alg}Memory"), " memory (gigabytes)\n";
+ print STDERR "-- ERROR ${tag}${alg}Threads=", getGlobal("${tag}${alg}Threads"), " threads\n";
+ print STDERR "-- ERROR\n";
+ print STDERR "-- ERROR No available machine configuration can run this task.\n";
+ print STDERR "-- ERROR\n";
+ print STDERR "-- ERROR Possible solutions:\n";
+ print STDERR "-- ERROR Increase maxMemory\n" if (defined(getGlobal("maxMemory")));
+ print STDERR "-- ERROR Change ${tag}${alg}Memory and/or ${tag}${alg}Threads\n";
+ print STDERR "-- ERROR\n";
caExit("task $tag$alg failed to find a configuration to run on", undef);
}
@@ -381,18 +408,18 @@ sub getAllowedResources ($$$$@) {
my $nam;
- if ($alg eq "bat") { $nam = "bogart"; }
- elsif ($alg eq "cns") { $nam = "consensus"; }
- elsif ($alg eq "gfa") { $nam = "GFA alignment and processing"; }
- elsif ($alg eq "cor") { $nam = "falcon_sense"; }
- elsif ($alg eq "meryl") { $nam = "meryl"; }
- elsif ($alg eq "oea") { $nam = "overlap error adjustment"; }
- elsif ($alg eq "ovb") { $nam = "ovStore bucketizer"; }
- elsif ($alg eq "ovs") { $nam = "ovStore sorting"; }
- elsif ($alg eq "red") { $nam = "read error detection"; }
- elsif ($alg eq "mhap") { $nam = "mhap ($tag)"; }
- elsif ($alg eq "mmap") { $nam = "minimap ($tag)"; }
- elsif ($alg eq "ovl") { $nam = "overlapper ($tag)"; }
+ if ($alg eq "meryl") { $nam = "(k-mer counting)"; }
+ elsif ($alg eq "mhap") { $nam = "(overlap detection with mhap)"; }
+ elsif ($alg eq "mmap") { $nam = "(overlap detection with minimap)"; }
+ elsif ($alg eq "ovl") { $nam = "(overlap detection)"; }
+ elsif ($alg eq "cor") { $nam = "(read correction)"; }
+ elsif ($alg eq "ovb") { $nam = "(overlap store bucketizer)"; }
+ elsif ($alg eq "ovs") { $nam = "(overlap store sorting)"; }
+ elsif ($alg eq "red") { $nam = "(read error detection)"; }
+ elsif ($alg eq "oea") { $nam = "(overlap error adjustment)"; }
+ elsif ($alg eq "bat") { $nam = "(contig construction)"; }
+ elsif ($alg eq "cns") { $nam = "(consensus)"; }
+ elsif ($alg eq "gfa") { $nam = "(GFA alignment and processing)"; }
else {
caFailure("unknown task '$alg' in getAllowedResources().", undef);
}
@@ -401,8 +428,27 @@ sub getAllowedResources ($$$$@) {
my $thr = substr(" $taskThreads", -3) . " CPU" . (($taskThreads == 1) ? " " : "s");
my $mem = substr(" $taskMemory", -4) . " GB";
- $all .= "-- Run $job concurrently using $mem and $thr for stage '$nam'.\n" if ( defined($concurrent));
- $all .= "-- Run under grid control using $mem and $thr for stage '$nam'.\n" if (!defined($concurrent));
+ my $t = substr("$tag$alg ", 0, 7);
+
+ if (!defined($all)) {
+ #$all .= "-- Memory, Threads and Concurrency configuration:\n" if ( defined($concurrent));
+ #$all .= "-- Memory and Threads configuration:\n" if (!defined($concurrent));
+
+ if (defined($concurrent)) {
+ $all .= "-- (tag)Concurrency\n";
+ $all .= "-- (tag)Threads |\n";
+ $all .= "-- (tag)Memory | |\n";
+ $all .= "-- (tag) | | | algorithm\n";
+ $all .= "-- ------- ------ -------- -------- -----------------------------\n";
+ } else {
+ $all .= "-- (tag)Threads\n";
+ $all .= "-- (tag)Memory |\n";
+ $all .= "-- (tag) | | algorithm\n";
+ $all .= "-- ------- ------ -------- -----------------------------\n";
+ }
+ }
+ $all .= "-- Local: $t $mem $thr x $job $nam\n" if ( defined($concurrent));
+ $all .= "-- Grid: $t $mem $thr $nam\n" if (!defined($concurrent));
return($err, $all);
}
@@ -662,25 +708,28 @@ sub configureAssembler () {
if (getGlobal("genomeSize") < adjustGenomeSize("40m")) {
setGlobalIfUndef("batMemory", "2-16"); setGlobalIfUndef("batThreads", "1-4");
- setGlobalIfUndef("gfaMemory", "2-4"); setGlobalIfUndef("gfaThreads", "1");
+ setGlobalIfUndef("gfaMemory", "2-8"); setGlobalIfUndef("gfaThreads", "1-4");
} elsif (getGlobal("genomeSize") < adjustGenomeSize("500m")) {
setGlobalIfUndef("batMemory", "16-64"); setGlobalIfUndef("batThreads", "2-8");
- setGlobalIfUndef("gfaMemory", "2-4"); setGlobalIfUndef("gfaThreads", "2-4");
+ setGlobalIfUndef("gfaMemory", "4-8"); setGlobalIfUndef("gfaThreads", "2-8");
} elsif (getGlobal("genomeSize") < adjustGenomeSize("2g")) {
setGlobalIfUndef("batMemory", "32-256"); setGlobalIfUndef("batThreads", "4-16");
- setGlobalIfUndef("gfaMemory", "4-8"); setGlobalIfUndef("gfaThreads", "4-8");
+ setGlobalIfUndef("gfaMemory", "8-16"); setGlobalIfUndef("gfaThreads", "4-16");
} elsif (getGlobal("genomeSize") < adjustGenomeSize("5g")) {
setGlobalIfUndef("batMemory", "128-512"); setGlobalIfUndef("batThreads", "8-32");
- setGlobalIfUndef("gfaMemory", "8-16"); setGlobalIfUndef("gfaThreads", "8-16");
+ setGlobalIfUndef("gfaMemory", "16-32"); setGlobalIfUndef("gfaThreads", "8-32");
} else {
setGlobalIfUndef("batMemory", "256-1024"); setGlobalIfUndef("batThreads", "16-64");
- setGlobalIfUndef("gfaMemory", "16-32"); setGlobalIfUndef("gfaThreads", "16-64");
+ setGlobalIfUndef("gfaMemory", "32-64"); setGlobalIfUndef("gfaThreads", "16-64");
}
+
+
+
# Finally, use all that setup to pick actual values for each component.
#
# ovsMemory needs to be configured here iff the sequential build method is used. This runs in
@@ -714,10 +763,10 @@ sub configureAssembler () {
($err, $all) = getAllowedResources("", "bat", $err, $all);
- ($err, $all) = getAllowedResources("", "gfa", $err, $all);
-
($err, $all) = getAllowedResources("", "cns", $err, $all);
+ ($err, $all) = getAllowedResources("", "gfa", $err, $all);
+
# Check some minimums.
if ((getGlobal("ovsMemory") =~ m/^([0123456789.]+)-*[0123456789.]*$/) &&
diff --git a/src/pipelines/canu/Consensus.pm b/src/pipelines/canu/Consensus.pm
index 21e7869..0ab5910 100644
--- a/src/pipelines/canu/Consensus.pm
+++ b/src/pipelines/canu/Consensus.pm
@@ -107,7 +107,7 @@ sub utgcns ($$$) {
print F " -e " . getGlobal("cnsErrorRate") . " \\\n";
print F " -quick \\\n" if (getGlobal("cnsConsensus") eq "quick");
print F " -pbdagcon \\\n" if (getGlobal("cnsConsensus") eq "pbdagcon");
- print F " -edlib \\\n" if (getGlobal("canuIteration") > 0);
+ print F " -edlib \\\n" if (getGlobal("canuIteration") >= 0);
print F " -utgcns \\\n" if (getGlobal("cnsConsensus") eq "utgcns");
print F " -threads " . getGlobal("cnsThreads") . " \\\n";
print F "&& \\\n";
@@ -117,7 +117,7 @@ sub utgcns ($$$) {
print F "\n";
print F "exit 0\n";
- if (getGlobal("canuIteration") == 0) {
+ if (getGlobal("canuIteration") < 0) {
print STDERR "-- Using fast alignment for consensus (iteration '", getGlobal("canuIteration"), "').\n";
} else {
print STDERR "-- Using slow alignment for consensus (iteration '", getGlobal("canuIteration"), "').\n";
@@ -125,6 +125,7 @@ sub utgcns ($$$) {
close(F);
+ makeExecutable("$path/consensus.sh");
stashFile("$path/consensus.sh");
}
@@ -143,9 +144,7 @@ sub cleanupPartitions ($$) {
print STDERR "-- Partitioned gkpStore is older than tigs, rebuild partitioning (gkpStore $gkpTime days old; ctgStore $tigTime days old).\n";
- if (runCommandSilently("unitigging", "rm -rf ./$asm.${tag}Store/partitionedReads.gkpStore", 1)) {
- caExit("failed to remove old partitions (unitigging/$asm.${tag}Store/partitionedReads.gkpStore/partitions), can't continue until these are removed", undef);
- }
+ remove_tree("unitigging/$asm.${tag}Store/partitionedReads.gkpStore");
}
@@ -327,19 +326,21 @@ sub consensusCheck ($) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " consensus jobs failed:\n";
+ print STDERR "-- Consensus jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to generate consensus. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Consensus jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -374,13 +375,20 @@ sub consensusCheck ($) {
-sub purgeFiles ($$$$$) {
+sub purgeFiles ($$$$$$) {
+ my $asm = shift @_;
my $tag = shift @_;
my $Ncns = shift @_;
my $Nfastq = shift @_;
my $Nlayout = shift @_;
my $Nlog = shift @_;
+ remove_tree("unitigging/$asm.ctgStore/partitionedReads.gkpStore"); # The partitioned gkpStores
+ remove_tree("unitigging/$asm.utgStore/partitionedReads.gkpStore"); # are useless now. Bye bye!
+
+ unlink "unitigging/$asm.ctgStore/partitionedReads.log";
+ unlink "unitigging/$asm.utgStore/partitionedReads.log";
+
my $path = "unitigging/5-consensus";
open(F, "< $path/$tag.files") or caExit("can't open '$path/$tag.files' for reading: $!\n", undef);
@@ -391,25 +399,25 @@ sub purgeFiles ($$$$$) {
my $ID4 = substr("000" . $2, -4);
my $ID0 = $2;
- if (-e "$1/$ID4.cns") {
+ if (-e "unitigging/$1/$ID4.cns") {
$Ncns++;
- unlink "$1/$ID4.cns";
+ unlink "unitigging/$1/$ID4.cns";
}
- if (-e "$1/$ID4.fastq") {
+ if (-e "unitigging/$1/$ID4.fastq") {
$Nfastq++;
- unlink "$1/$ID4.fastq";
+ unlink "unitigging/$1/$ID4.fastq";
}
- if (-e "$1/$ID4.layout") {
+ if (-e "unitigging/$1/$ID4.layout") {
$Nlayout++;
- unlink "$1/$ID4.layout";
+ unlink "unitigging/$1/$ID4.layout";
}
- if (-e "$1/consensus.$ID6.out") {
+ if (-e "unitigging/$1/consensus.$ID6.out") {
$Nlog++;
- unlink "$1/consensus.$ID6.out";
+ unlink "unitigging/$1/consensus.$ID6.out";
}
- if (-e "$1/consensus.$ID0.out") {
+ if (-e "unitigging/$1/consensus.$ID0.out") {
$Nlog++;
- unlink "$1/consensus.$ID0.out";
+ unlink "unitigging/$1/consensus.$ID0.out";
}
} else {
@@ -418,6 +426,9 @@ sub purgeFiles ($$$$$) {
}
close(F);
+ unlink "$path/$tag.files";
+ rmdir "$path/$tag";
+
return($Ncns, $Nfastq, $Nlayout, $Nlog);
}
@@ -463,6 +474,7 @@ sub consensusLoad ($) {
if (runCommand("unitigging", $cmd)) {
caExit("failed to load unitig consensus into ctgStore", "$path/ctgcns.files.ctgStoreLoad.err");
}
+ unlink "$path/ctgcns.files.ctgStoreLoad.err";
stashFile("unitigging/$asm.ctgStore/seqDB.v002.dat");
stashFile("unitigging/$asm.ctgStore/seqDB.v002.tig");
@@ -488,6 +500,7 @@ sub consensusLoad ($) {
if (runCommand("unitigging", $cmd)) {
caExit("failed to load unitig consensus into utgStore", "$path/utgcns.files.utgStoreLoad.err");
}
+ unlink "$path/utgcns.files.utgStoreLoad.err";
stashFile("unitigging/$asm.utgStore/seqDB.v002.dat");
stashFile("unitigging/$asm.utgStore/seqDB.v002.tig");
@@ -504,8 +517,8 @@ sub consensusLoad ($) {
my $Nlayout = 0;
my $Nlog = 0;
- ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles("ctgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
- ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles("utgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
+ ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles($asm, "ctgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
+ ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles($asm, "utgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
print STDERR "-- Purged $Ncns .cns outputs.\n" if ($Ncns > 0);
print STDERR "-- Purged $Nfastq .fastq outputs.\n" if ($Nfastq > 0);
@@ -580,7 +593,11 @@ sub alignGFA ($) {
goto allDone if (skipStage($asm, "alignGFA") == 1);
goto allDone if (fileExists("unitigging/4-unitigger/$asm.contigs.aligned.gfa") &&
- fileExists("unitigging/4-unitigger/$asm.unitigs.aligned.gfa"));
+ fileExists("unitigging/4-unitigger/$asm.unitigs.aligned.gfa") &&
+ fileExists("unitigging/4-unitigger/$asm.unitigs.aligned.bed"));
+
+ # If a large genome, run this on the grid, else, run in the canu process itself.
+ my $runGrid = (getGlobal("genomeSize") >= 40000000);
fetchFile("$path/alignGFA.sh");
@@ -590,7 +607,7 @@ sub alignGFA ($) {
print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
- print F setWorkDirectoryShellCode($path);
+ print F setWorkDirectoryShellCode($path) if ($runGrid); # If not local, need to cd first.
print F "\n";
print F fetchFileShellCode("unitigging/$asm.utgStore", "seqDB.v001.dat", "");
print F fetchFileShellCode("unitigging/$asm.utgStore", "seqDB.v001.tig", "");
@@ -605,6 +622,7 @@ sub alignGFA ($) {
print F fetchFileShellCode("unitigging/$asm.ctgStore", "seqDB.v002.tig", "");
print F "\n";
print F "\n";
+
print F "if [ ! -e ./$asm.unitigs.aligned.gfa ] ; then\n";
print F " \$bin/alignGFA \\\n";
print F " -T ../$asm.utgStore 2 \\\n";
@@ -617,6 +635,7 @@ sub alignGFA ($) {
print F "fi\n";
print F "\n";
print F "\n";
+
print F "if [ ! -e ./$asm.contigs.aligned.gfa ] ; then\n";
print F " \$bin/alignGFA \\\n";
print F " -T ../$asm.ctgStore 2 \\\n";
@@ -629,8 +648,24 @@ sub alignGFA ($) {
print F "fi\n";
print F "\n";
print F "\n";
+
+ print F "if [ ! -e ./$asm.unitigs.aligned.bed ] ; then\n";
+ print F " \$bin/alignGFA -bed \\\n";
+ print F " -T ../$asm.utgStore 2 \\\n";
+ print F " -C ../$asm.ctgStore 2 \\\n";
+ print F " -i ./$asm.unitigs.bed \\\n";
+ print F " -o ./$asm.unitigs.aligned.bed \\\n";
+ print F " -t " . getGlobal("gfaThreads") . " \\\n";
+ print F " > ./$asm.unitigs.aligned.bed.err 2>&1";
+ print F "\n";
+ print F stashFileShellCode("$path", "$asm.unitigs.aligned.bed", " ");
+ print F "fi\n";
+ print F "\n";
+ print F "\n";
+
print F "if [ -e ./$asm.unitigs.aligned.gfa -a \\\n";
- print F " -e ./$asm.contigs.aligned.gfa ] ; then\n";
+ print F " -e ./$asm.contigs.aligned.gfa -a \\\n";
+ print F " -e ./$asm.unitigs.aligned.bed ] ; then\n";
print F " echo GFA alignments updated.\n";
print F " exit 0\n";
print F "else\n";
@@ -639,8 +674,7 @@ sub alignGFA ($) {
print F "fi\n";
close(F);
- system("chmod +x $path/alignGFA.sh");
-
+ makeExecutable("$path/alignGFA.sh");
stashFile("$path/alignGFA.sh");
}
@@ -648,31 +682,31 @@ sub alignGFA ($) {
# shows how to process multiple jobs. This only checks for the existence of the final outputs.
# (meryl and unitig are the same)
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- GFA alignment failed.\n";
+ print STDERR "-- Graph alignment jobs failed, tried $attempt times, giving up.\n";
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to align GFA links. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Graph alignment jobs failed, retry.\n";
+ print STDERR "--\n";
}
- # Otherwise, run some jobs. If the genome is small, just do it here and now, otherwise,
- # run on the grid.
+ # Otherwise, run some jobs.
emitStage($asm, "alignGFA", $attempt);
- if (getGlobal("genomeSize") < 40000000) {
- if (runCommand("$path", "./alignGFA.sh")) {
+ if ($runGrid) {
+ submitOrRunParallelJob($asm, "gfa", $path, "alignGFA", (1));
+ } else {
+ if (runCommand($path, "./alignGFA.sh")) {
caExit("failed to align contigs", "./$asm.contigs.aligned.gfa.err");
}
- } else {
- submitOrRunParallelJob($asm, "gfa", $path, "alignGFA", (1));
}
return;
diff --git a/src/pipelines/canu/CorrectReads.pm b/src/pipelines/canu/CorrectReads.pm
index 5c738f6..ff971d6 100644
--- a/src/pipelines/canu/CorrectReads.pm
+++ b/src/pipelines/canu/CorrectReads.pm
@@ -322,6 +322,7 @@ sub buildCorrectionLayouts_direct ($) {
close(F);
+ makeExecutable("$path/correctReads.sh");
stashFile("$path/correctReads.sh");
finishStage:
@@ -489,6 +490,7 @@ sub buildCorrectionLayouts_piped ($) {
close(F);
+ makeExecutable("$path/correctReads.sh");
stashFile("$path/correctReads.sh");
finishStage:
@@ -1035,18 +1037,21 @@ sub generateCorrectedReads ($) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " read correction jobs failed:\n";
+ print STDERR "-- Read correction jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
+ print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to generate corrected reads. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Read correction jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -1502,12 +1507,14 @@ sub dumpCorrectedReads ($) {
}
}
close(F);
+
+ print STDERR "-- Purged $Nsuccess .dump.success sentinels.\n" if ($Nsuccess > 0);
+ print STDERR "-- Purged $Nfasta .fasta outputs.\n" if ($Nfasta > 0);
+ print STDERR "-- Purged $Nerr .err outputs.\n" if ($Nerr > 0);
+ print STDERR "-- Purged $Nlog .out job log outputs.\n" if ($Nlog > 0);
}
- print STDERR "-- Purged $Nsuccess .dump.success sentinels.\n" if ($Nsuccess > 0);
- print STDERR "-- Purged $Nfasta .fasta outputs.\n" if ($Nfasta > 0);
- print STDERR "-- Purged $Nerr .err outputs.\n" if ($Nerr > 0);
- print STDERR "-- Purged $Nlog .out job log outputs.\n" if ($Nlog > 0);
+ remove_tree("correction/$asm.ovlStore") if (getGlobal("saveOverlaps") eq "0");
finishStage:
emitStage($asm, "cor-dumpCorrectedReads");
diff --git a/src/pipelines/canu/Defaults.pm b/src/pipelines/canu/Defaults.pm
index 6e91966..275beaa 100644
--- a/src/pipelines/canu/Defaults.pm
+++ b/src/pipelines/canu/Defaults.pm
@@ -40,7 +40,7 @@ package canu::Defaults;
require Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(getCommandLineOptions addCommandLineOption addCommandLineError writeLog getNumberOfCPUs getPhysicalMemorySize getAllowedResources diskSpace printOptions printHelp addSequenceFile setParametersFromFile setParametersFromCommandLine checkJava checkGnuplot checkParameters getGlobal setGlobal setGlobalIfUndef setDefaults setVersion);
+@EXPORT = qw(getCommandLineOptions addCommandLineOption addCommandLineError writeLog getNumberOfCPUs getPhysicalMemorySize getAllowedResources diskSpace printOptions printHelp printCitation addSequenceFile setParametersFromFile setParametersFromCommandLine checkJava checkGnuplot checkParameters getGlobal setGlobal setGlobalIfUndef setDefaults setVersion);
use strict;
use Cwd qw(getcwd abs_path);
@@ -106,44 +106,65 @@ sub setGlobal ($$) {
$val = 0 if (($val =~ m/^false$/i) || ($val =~ m/^f$/i));
$val = 1 if (($val =~ m/^true$/i) || ($val =~ m/^t$/i));
- # Translate from generic to specialized var
+ # Grid options
- foreach my $alg ("ovl", "mhap", "mmap") {
- foreach my $opt ("gridoptions") {
- $set += setGlobalSpecialization($val, ("${opt}cor${alg}", "${opt}obt${alg}", "${opt}utg${alg}")) if ($var eq "${opt}${alg}");
- }
+ foreach my $opt ("gridoptions") {
+ $set += setGlobalSpecialization($val, ("${opt}corovl", "${opt}obtovl", "${opt}utgovl")) if ($var eq "${opt}ovl");
+ $set += setGlobalSpecialization($val, ("${opt}cormhap", "${opt}obtmhap", "${opt}utgmhap")) if ($var eq "${opt}mhap");
+ $set += setGlobalSpecialization($val, ("${opt}cormmap", "${opt}obtmmap", "${opt}utgmmap")) if ($var eq "${opt}mmap");
+ }
- foreach my $opt ("memory", "threads", "concurrency") {
- $set += setGlobalSpecialization($val, ("cor${alg}${opt}", "obt${alg}${opt}", "utg${alg}${opt}")) if ($var eq "${alg}${opt}");
- }
+ foreach my $opt ("memory",
+ "threads",
+ "concurrency") {
+ $set += setGlobalSpecialization($val, ( "corovl${opt}", "obtovl${opt}", "utgovl${opt}")) if ($var eq "ovl${opt}");
+ $set += setGlobalSpecialization($val, ("cormhap${opt}", "obtmhap${opt}", "utgmhap${opt}")) if ($var eq "mhap${opt}");
+ $set += setGlobalSpecialization($val, ("cormmap${opt}", "obtmmap${opt}", "utgmmap${opt}")) if ($var eq "mmap${opt}");
}
- foreach my $opt ("overlapper", "realign") {
+ # Overlapping algorithm choice options
+
+ foreach my $opt ("overlapper",
+ "realign") {
$set += setGlobalSpecialization($val, ("cor${opt}", "obt${opt}", "utg${opt}")) if ($var eq "${opt}");
}
- # e.g., corOvlHashBlockLength
- foreach my $opt ("ovlerrorrate", "ovlhashblocklength", "ovlrefblocksize", "ovlrefblocklength", "ovlhashbits", "ovlhashload", "ovlmersize", "ovlmerthreshold", "ovlmerdistinct", "ovlmertotal", "ovlfrequentmers") {
+ # OverlapInCore options
+
+ foreach my $opt ("ovlerrorrate",
+ "ovlhashblocklength",
+ "ovlrefblocksize",
+ "ovlrefblocklength",
+ "ovlhashbits",
+ "ovlhashload",
+ "ovlmersize",
+ "ovlmerthreshold",
+ "ovlmerdistinct",
+ "ovlmertotal",
+ "ovlfrequentmers") {
$set += setGlobalSpecialization($val, ("cor${opt}", "obt${opt}", "utg${opt}")) if ($var eq "${opt}");
}
- # e.g., corMhapBlockSize
- foreach my $opt ("mhapblocksize", "mhapmersize", "mhaprealign", "mhapsensitivity") {
+ # Mhap options
+
+ foreach my $opt ("mhapblocksize",
+ "mhapmersize",
+ "mhapsensitivity",
+ "mhapfilterunique",
+ "mhapfilterthreshold",
+ "mhapnotf") {
$set += setGlobalSpecialization($val, ("cor${opt}", "obt${opt}", "utg${opt}")) if ($var eq "${opt}");
}
- # Handle the two error rate aliases. Note 'errorRateUsed' must be lowercase. setGlobal/getGlobal do that for us.
-
- if ($var eq "errorrate") {
- $var = "correctederrorrate";
- $val = 3 * $val;
+ # MiniMap options
- $global{"errorrateused"} = "--\n";
- $global{"errorrateused"} .= "-- WARNING: Obsolete 'errorRate' used, replace with 'correctedErrorRate', set to three times the value.\n";
- $global{"errorrateused"} .= "-- WARNING: errorRate was the expected error rate in a single corrected read; correctedErrorRate is the\n";
- $global{"errorrateused"} .= "-- WARNING: allowed difference in an alignment of two corrected reads.\n";
+ foreach my $opt ("mmapblocksize",
+ "mmapmersize") {
+ $set += setGlobalSpecialization($val, ("cor${opt}", "obt${opt}", "utg${opt}")) if ($var eq "${opt}");
}
+ # Handle the two error rate aliases.
+
if ($var eq "rawerrorrate") {
setGlobalIfUndef("corOvlErrorRate", $val);
setGlobalIfUndef("corErrorRate", $val);
@@ -282,18 +303,20 @@ sub diskSpace ($) {
my $dir = dirname($_[0]);
my ($total, $used, $free, $avail) = (0, 0, 0, 0);
- open(DF, "df -P -k $dir |");
- while (<DF>) {
- chomp;
+ if (-d $dir) {
+ open(DF, "df -P -k $dir |");
+ while (<DF>) {
+ chomp;
- if (m/^(.*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.*)$/) {
- $total = int($2 / 1048.576) / 1000;
- $used = int($3 / 1048.576) / 1000;
- $free = int($4 / 1048.576) / 1000;
- $avail = int($4 / 1048.576) / 1000; # Possibly limited by quota?
+ if (m/^(.*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.*)$/) {
+ $total = int($2 / 1048.576) / 1000;
+ $used = int($3 / 1048.576) / 1000;
+ $free = int($4 / 1048.576) / 1000;
+ $avail = int($4 / 1048.576) / 1000; # Possibly limited by quota?
+ }
}
+ close(DF);
}
- close(DF);
#print STDERR "Disk space: total $total GB, used $used GB, free $free GB, available $avail GB\n";
@@ -314,7 +337,7 @@ sub printOptions () {
$o = substr("$k ", 0, 40);
} else {
- $Text::Wrap::columns = 100;
+ $Text::Wrap::columns = 60;
$o = "$o\n";
$u = wrap(" ", " ", $u) . "\n";
@@ -331,35 +354,60 @@ sub printHelp (@) {
return if (!defined($force) && !defined($global{"errors"}));
print "\n";
- print "usage: canu [-version] \\\n";
- print " [-correct | -trim | -assemble | -trim-assemble] \\\n";
- print " [-s <assembly-specifications-file>] \\\n";
- print " -p <assembly-prefix> \\\n";
- print " -d <assembly-directory> \\\n";
- print " genomeSize=<number>[g|m|k] \\\n";
- print " [other-options] \\\n";
- print " [-pacbio-raw | -pacbio-corrected | -nanopore-raw | -nanopore-corrected] *fastq\n";
+ print "usage: canu [-version] [-citation] \\\n";
+ print " [-correct | -trim | -assemble | -trim-assemble] \\\n";
+ print " [-s <assembly-specifications-file>] \\\n";
+ print " -p <assembly-prefix> \\\n";
+ print " -d <assembly-directory> \\\n";
+ print " genomeSize=<number>[g|m|k] \\\n";
+ print " [other-options] \\\n";
+ print " [-pacbio-raw |\n";
+ print " -pacbio-corrected |\n";
+ print " -nanopore-raw |\n";
+ print " -nanopore-corrected] file1 file2 ...\n";
print "\n";
- print " By default, all three stages (correct, trim, assemble) are computed.\n";
- print " To compute only a single stage, use:\n";
+ print "example: canu -d run1 -p godzilla genomeSize=1g -nanopore-raw reads/*.fasta.gz \n";
+ print "\n";
+ print "\n";
+ print " To restrict canu to only a specific stage, use:\n";
print " -correct - generate corrected reads\n";
print " -trim - generate trimmed reads\n";
print " -assemble - generate an assembly\n";
print " -trim-assemble - generate trimmed reads and then assemble them\n";
print "\n";
- print " The assembly is computed in the (created) -d <assembly-directory>, with most\n";
- print " files named using the -p <assembly-prefix>.\n";
+ print " The assembly is computed in the -d <assembly-directory>, with output files named\n";
+ print " using the -p <assembly-prefix>. This directory is created if needed. It is not\n";
+ print " possible to run multiple assemblies in the same directory.\n";
print "\n";
- print " The genome size is your best guess of the genome size of what is being assembled.\n";
- print " It is used mostly to compute coverage in reads. Fractional values are allowed: '4.7m'\n";
- print " is the same as '4700k' and '4700000'\n";
+ print " The genome size should be your best guess of the haploid genome size of what is being\n";
+ print " assembled. It is used primarily to estimate coverage in reads, NOT as the desired\n";
+ print " assembly size. Fractional values are allowed: '4.7m' equals '4700k' equals '4700000'\n";
print "\n";
- print " A full list of options can be printed with '-options'. All options\n";
- print " can be supplied in an optional sepc file.\n";
+ print " Some common options:\n";
+ print " useGrid=string\n";
+ print " - Run under grid control (true), locally (false), or set up for grid control\n";
+ print " but don't submit any jobs (remote)\n";
+ print " rawErrorRate=fraction-error\n";
+ print " - The allowed difference in an overlap between two raw uncorrected reads. For lower\n";
+ print " quality reads, use a higher number. The defaults are 0.300 for PacBio reads and\n";
+ print " 0.500 for Nanopore reads.\n";
+ print " correctedErrorRate=fraction-error\n";
+ print " - The allowed difference in an overlap between two corrected reads. Assemblies of\n";
+ print " low coverage or data with biological differences will benefit from a slight increase\n";
+ print " in this. Defaults are 0.045 for PacBio reads and 0.144 for Nanopore reads.\n";
+ print " gridOptions=string\n";
+ print " - Pass string to the command used to submit jobs to the grid. Can be used to set\n";
+ print " maximum run time limits. Should NOT be used to set memory limits; Canu will do\n";
+ print " that for you.\n";
+ print " minReadLength=number\n";
+ print " - Ignore reads shorter than 'number' bases long. Default: 1000.\n";
+ print " minOverlapLength=number\n";
+ print " - Ignore read-to-read overlaps shorter than 'number' bases long. Default: 500.\n";
+ print " A full list of options can be printed with '-options'. All options can be supplied in\n";
+ print " an optional spec file with the -s option.\n";
print "\n";
- print " Reads can be either FASTA or FASTQ format, uncompressed, or compressed\n";
- print " with gz, bz2 or xz. Reads are specified by the technology they were\n";
- print " generated with:\n";
+ print " Reads can be either FASTA or FASTQ format, uncompressed, or compressed with gz, bz2 or xz.\n";
+ print " Reads are specified by the technology they were generated with:\n";
print " -pacbio-raw <files>\n";
print " -pacbio-corrected <files>\n";
print " -nanopore-raw <files>\n";
@@ -377,6 +425,52 @@ sub printHelp (@) {
}
+sub printCitation ($) {
+ my $prefix = shift @_;
+
+ print STDERR "${prefix}Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM.\n";
+ print STDERR "${prefix}Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation.\n";
+ print STDERR "${prefix}Genome Res. 2017 May;27(5):722-736.\n";
+ print STDERR "${prefix}http://doi.org/10.1101/gr.215087.116\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix}Read and contig alignments during correction, consensus and GFA building use:\n";
+ print STDERR "${prefix} Šošić M, Šikić M.\n";
+ print STDERR "${prefix} Edlib: a C/C++ library for fast, exact sequence alignment using edit distance.\n";
+ print STDERR "${prefix} Bioinformatics. 2017 May 1;33(9):1394-1395.\n";
+ print STDERR "${prefix} http://doi.org/10.1093/bioinformatics/btw753\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix}Overlaps are generated using:\n";
+ print STDERR "${prefix} Berlin K, et al.\n";
+ print STDERR "${prefix} Assembling large genomes with single-molecule sequencing and locality-sensitive hashing.\n";
+ print STDERR "${prefix} Nat Biotechnol. 2015 Jun;33(6):623-30.\n";
+ print STDERR "${prefix} http://doi.org/10.1038/nbt.3238\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix} Myers EW, et al.\n";
+ print STDERR "${prefix} A Whole-Genome Assembly of Drosophila.\n";
+ print STDERR "${prefix} Science. 2000 Mar 24;287(5461):2196-204.\n";
+ print STDERR "${prefix} http://doi.org/10.1126/science.287.5461.2196\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix} Li H.\n";
+ print STDERR "${prefix} Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences.\n";
+ print STDERR "${prefix} Bioinformatics. 2016 Jul 15;32(14):2103-10.\n";
+ print STDERR "${prefix} http://doi.org/10.1093/bioinformatics/btw152\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix}Corrected read consensus sequences are generated using an algorithm derived from FALCON-sense:\n";
+ print STDERR "${prefix} Chin CS, et al.\n";
+ print STDERR "${prefix} Phased diploid genome assembly with single-molecule real-time sequencing.\n";
+ print STDERR "${prefix} Nat Methods. 2016 Dec;13(12):1050-1054.\n";
+ print STDERR "${prefix} http://doi.org/10.1038/nmeth.4035\n";
+ print STDERR "${prefix}\n";
+ print STDERR "${prefix}Contig consensus sequences are generated using an algorithm derived from pbdagcon:\n";
+ print STDERR "${prefix} Chin CS, et al.\n";
+ print STDERR "${prefix} Nonhybrid, finished microbial genome assemblies from long-read SMRT sequencing data.\n";
+ print STDERR "${prefix} Nat Methods. 2013 Jun;10(6):563-9\n";
+ print STDERR "${prefix} http://doi.org/10.1038/nmeth.2474\n";
+ print STDERR "${prefix}\n";
+}
+
+
+
sub makeAbsolute ($) {
my $var = shift @_;
@@ -561,6 +655,19 @@ sub setExecDefaults ($$) {
+sub setOverlapDefault ($$$$) {
+ my $tag = shift @_;
+ my $var = shift @_;
+ my $value = shift @_;
+ my $description = shift @_;
+
+ $global{"${tag}${var}"} = $value;
+ $synops{"${tag}${var}"} = $description;
+ $synops{ "${var}"} = $description;
+}
+
+
+
sub setOverlapDefaults ($$$) {
my $tag = shift @_; # If 'cor', some parameters are loosened for raw pacbio reads
my $name = shift @_;
@@ -568,83 +675,50 @@ sub setOverlapDefaults ($$$) {
# Which overlapper to use.
- $global{"${tag}Overlapper"} = $default;
- $synops{"${tag}Overlapper"} = "Which overlap algorithm to use for $name";
+ setOverlapDefault($tag, "Overlapper", $default, "Which overlap algorithm to use for $name");
+ setOverlapDefault($tag, "ReAlign", 0, "Refine overlaps by computing the actual alignment: 'true' or 'false'. Not useful for overlapper=ovl. Uses ${tag}OvlErrorRate");
# OverlapInCore parameters.
- $global{"${tag}OvlHashBlockLength"} = undef;
- $synops{"${tag}OvlHashBlockLength"} = "Amount of sequence (bp) to load into the overlap hash table";
-
- $global{"${tag}OvlRefBlockSize"} = undef;
- $synops{"${tag}OvlRefBlockSize"} = "Number of reads to search against the hash table per batch";
-
- $global{"${tag}OvlRefBlockLength"} = 0;
- $synops{"${tag}OvlRefBlockLength"} = "Amount of sequence (bp) to search against the hash table per batch";
-
- $global{"${tag}OvlHashBits"} = ($tag eq "cor") ? 18 : 23;
- $synops{"${tag}OvlHashBits"} = "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength";
-
- $global{"${tag}OvlHashLoad"} = 0.75;
- $synops{"${tag}OvlHashLoad"} = "Maximum hash table load. If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75";
-
- $global{"${tag}OvlMerSize"} = ($tag eq "cor") ? 19 : 22;
- $synops{"${tag}OvlMerSize"} = "K-mer size for seeds in overlaps";
-
- $global{"${tag}OvlMerThreshold"} = "auto";
- $synops{"${tag}OvlMerThreshold"} = "K-mer frequency threshold; mers more frequent than this count are ignored; default 'auto'";
-
- $global{"${tag}OvlMerDistinct"} = undef;
- $synops{"${tag}OvlMerDistinct"} = "K-mer frequency threshold; the least frequent fraction of distinct mers can seed overlaps";
-
- $global{"${tag}OvlMerTotal"} = undef;
- $synops{"${tag}OvlMerTotal"} = "K-mer frequency threshold; the least frequent fraction of all mers can seed overlaps";
-
- $global{"${tag}OvlFrequentMers"} = undef;
- $synops{"${tag}OvlFrequentMers"} = "Do not seed overlaps with these kmers (fasta format)";
-
- $global{"${tag}OvlFilter"} = undef;
- $synops{"${tag}OvlFilter"} = "Filter overlaps based on expected kmers vs observed kmers";
-
- # Mhap parameters.
-
- $global{"${tag}MhapVersion"} = "2.1.2";
- $synops{"${tag}MhapVersion"} = "Version of the MHAP jar file to use";
-
- $global{"${tag}MhapFilterThreshold"} = "0.000005"; # Needs to be a string, else it is printed as 5e-06.
- $synops{"${tag}MhapFilterThreshold"} = "Value between 0 and 1. kmers which comprise more than this percentage of the input are downweighted";
-
- $global{"${tag}MhapFilterUnique"} = undef;
- $synops{"${tag}MhapFilterUnique"} = "Expert option: True or false, supress the low-frequency k-mer distribution based on them being likely noise and not true overlaps. Threshold auto-computed based on error rate and coverage.";
-
- $global{"${tag}MhapNoTf"} = undef;
- $synops{"${tag}MhapNoTf"} = "Expert option: True or false, do not use tf weighting, only idf of tf-idf.";
-
- $global{"${tag}MhapOptions"} = undef;
- $synops{"${tag}MhapOptions"} = "Expert option: free-form parameters to pass to MHAP.";
-
- $global{"${tag}MhapBlockSize"} = 3000;
- $synops{"${tag}MhapBlockSize"} = "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job";
-
- $global{"${tag}MhapMerSize"} = ($tag eq "cor") ? 16 : 16;
- $synops{"${tag}MhapMerSize"} = "K-mer size for seeds in mhap";
-
- $global{"${tag}MhapOrderedMerSize"} = ($tag eq "cor") ? 12 : 18;
- $synops{"${tag}MhapOrderedMerSize"} = "K-mer size for second-stage filter in mhap";
+ setOverlapDefault($tag, "OvlHashBlockLength", undef, "Amount of sequence (bp) to load into the overlap hash table");
+ setOverlapDefault($tag, "OvlRefBlockSize", undef, "Number of reads to search against the hash table per batch");
+ setOverlapDefault($tag, "OvlRefBlockLength", 0, "Amount of sequence (bp) to search against the hash table per batch");
+ setOverlapDefault($tag, "OvlHashBits", ($tag eq "cor") ? 18 : 23, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength");
+ setOverlapDefault($tag, "OvlHashLoad", 0.75, "Maximum hash table load. If set too high, table lookups are inefficient; if too low, search overhead dominates run time; default 0.75");
+ setOverlapDefault($tag, "OvlMerSize", ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps");
+ setOverlapDefault($tag, "OvlMerThreshold", "auto", "K-mer frequency threshold; mers more frequent than this count are ignored; default 'auto'");
+ setOverlapDefault($tag, "OvlMerDistinct", undef, "K-mer frequency threshold; the least frequent fraction of distinct mers can seed overlaps");
+ setOverlapDefault($tag, "OvlMerTotal", undef, "K-mer frequency threshold; the least frequent fraction of all mers can seed overlaps");
+ setOverlapDefault($tag, "OvlFrequentMers", undef, "Do not seed overlaps with these kmers (fasta format)");
+ setOverlapDefault($tag, "OvlFilter", undef, "Filter overlaps based on expected kmers vs observed kmers");
+
+ # Mhap parameters. FilterThreshold MUST be a string, otherwise it gets printed in scientific notation (5e-06) which java doesn't understand.
+
+ setOverlapDefault($tag, "MhapVersion", "2.1.2", "Version of the MHAP jar file to use");
+ setOverlapDefault($tag, "MhapFilterThreshold", "0.000005", "Value between 0 and 1. kmers which comprise more than this percentage of the input are downweighted");
+ setOverlapDefault($tag, "MhapFilterUnique", undef, "Expert option: True or false, suppress the low-frequency k-mer distribution based on them being likely noise and not true overlaps. Threshold auto-computed based on error rate and coverage.");
+ setOverlapDefault($tag, "MhapNoTf", undef, "Expert option: True or false, do not use tf weighting, only idf of tf-idf.");
+ setOverlapDefault($tag, "MhapOptions", undef, "Expert option: free-form parameters to pass to MHAP.");
+ setOverlapDefault($tag, "MhapBlockSize", 3000, "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job");
+ setOverlapDefault($tag, "MhapMerSize", ($tag eq "cor") ? 16 : 16, "K-mer size for seeds in mhap");
+ setOverlapDefault($tag, "MhapOrderedMerSize", ($tag eq "cor") ? 12 : 18, "K-mer size for second-stage filter in mhap");
+ setOverlapDefault($tag, "MhapSensitivity", undef, "Coarse sensitivity level: 'low', 'normal' or 'high'. Set automatically based on coverage; 'high' <= 30x < 'normal' < 60x <= 'low'");
+
+ # MiniMap parameters.
+
+ setOverlapDefault($tag, "MMapBlockSize", 6000, "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job");
+ setOverlapDefault($tag, "MMapMerSize", ($tag eq "cor") ? 15 : 21, "K-mer size for seeds in minimap");
+}
- $global{"${tag}MhapSensitivity"} = undef;
- $synops{"${tag}MhapSensitivity"} = "Coarse sensitivity level: 'low', 'normal' or 'high'. Set automatically based on coverage; 'high' <= 30x < 'normal' < 60x <= 'low'";
- $global{"${tag}MMapBlockSize"} = 6000;
- $synops{"${tag}MMapBlockSize"} = "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job";
- # minimap parameters.
- $global{"${tag}MMapMerSize"} = ($tag eq "cor") ? 15 : 21;
- $synops{"${tag}MMapMerSize"} = "K-mer size for seeds in minmap";
+sub setDefault ($$$) {
+ my $var = shift @_;
+ my $value = shift @_;
+ my $description = shift @_;
- # shared parameters for alignment-free overlappers
- $global{"${tag}ReAlign"} = 0;
- $synops{"${tag}ReAlign"} = "Refine mhap/minimap overlaps by computing the actual alignment: 'true' or 'false'. Uses ${tag}OvlErrorRate";
+ $global{$var} = $value;
+ $synops{$var} = $description;
}
@@ -654,162 +728,107 @@ sub setDefaults () {
##### Internal stuff
$global{"errors"} = undef; # Command line errors
- $global{"errorRateUsed"} = undef; # A warning if obsolete 'errorRate' parameter is used. This lets us print the error in a useful place, instead of at the very start of the output.
-
$global{"version"} = undef; # Reset at the end of this function, once we know where binaries are.
+ $global{"availablehosts"} = undef; # Internal list of cpus-memory-nodes describing the grid
- ##### General Configuration Options (aka miscellany)
-
- $global{"canuIteration"} = 0; # See documentation in Execution.pm
- $global{"canuIterationMax"} = 2;
+ $global{"canuiteration"} = 0;
+ $global{"canuiterationmax"} = 2;
- $global{"showNext"} = undef;
- $synops{"showNext"} = "Don't run any commands, just report what would run";
+ $global{"onexitdir"} = undef; # Copy of $wrk, for caExit() and caFailure() ONLY.
+ $global{"onexitnam"} = undef; # Copy of $asm, for caExit() and caFailure() ONLY.
- $global{"pathMap"} = undef;
- $synops{"pathMap"} = "File with a hostname to binary directory map; binary directories must be absolute paths";
+ ##### Meta options (no $global for these, only synopsis), more of these, many many more, are defined in setOverlapDefaults().
- $global{"shell"} = "/bin/sh";
- $synops{"shell"} = "Command interpreter to use; sh-compatible (e.g., bash), NOT C-shell (csh or tcsh); default '/bin/sh'";
+ $synops{"rawErrorRate"} = "Expected fraction error in an alignment of two uncorrected reads";
+ $synops{"correctedErrorRate"} = "Expected fraction error in an alignment of two corrected reads";
- $global{"java"} = (exists $ENV{"JAVA_HOME"} && -e "$ENV{'JAVA_HOME'}/bin/java") ? "$ENV{'JAVA_HOME'}/bin/java" : "java";
- $synops{"java"} = "Java interpreter to use; at least version 1.8; default 'java'";
-
- $global{"gnuplot"} = "gnuplot";
- $synops{"gnuplot"} = "Path to the gnuplot executable";
-
- $global{"gnuplotImageFormat"} = undef;
- $synops{"gnuplotImageFormat"} = "Image format that gnuplot will generate, used in HTML reports. Default: based on gnuplot, 'png', 'svg' or 'gif'";
+ ##### General Configuration Options (aka miscellany)
- $global{"gnuplotTested"} = 0;
- $synops{"gnuplotTested"} = "If set, skip the initial testing of gnuplot";
+ my $java = (exists $ENV{"JAVA_HOME"} && -e "$ENV{'JAVA_HOME'}/bin/java") ? "$ENV{'JAVA_HOME'}/bin/java" : "java";
- $global{"stageDirectory"} = undef;
- $synops{"stageDirectory"} = "If set, copy heavily used data to this node-local location";
+ setDefault("showNext", undef, "Don't run any commands, just report what would run");
+ setDefault("pathMap", undef, "File with a hostname to binary directory map; binary directories must be absolute paths");
+ setDefault("shell", "/bin/sh", "Command interpreter to use; sh-compatible (e.g., bash), NOT C-shell (csh or tcsh); default '/bin/sh'");
+ setDefault("java", $java, "Java interpreter to use; at least version 1.8; default 'java'");
+ setDefault("gnuplot", "gnuplot", "Path to the gnuplot executable");
+ setDefault("gnuplotImageFormat", undef, "Image format that gnuplot will generate, used in HTML reports. Default: based on gnuplot, 'png', 'svg' or 'gif'");
+ setDefault("gnuplotTested", 0, "If set, skip the initial testing of gnuplot");
+ setDefault("stageDirectory", undef, "If set, copy heavily used data to this node-local location");
##### Cleanup and Termination options
- $global{"saveOverlaps"} = 0;
- $synops{"saveOverlaps"} = "Save intermediate overlap files, almost never a good idea";
-
- $global{"saveReadCorrections"} = 0;
- $synops{"saveReadCorrections"} = "Save intermediate read correction files, almost never a good idea";
-
- $global{"saveMerCounts"} = 0;
- $synops{"saveMerCounts"} = "Save full mer counting results, sometimes useful";
-
- $global{"onSuccess"} = undef;
- $synops{"onSuccess"} = "Full path to command to run on successful completion";
-
- $global{"onFailure"} = undef;
- $synops{"onFailure"} = "Full path to command to run on failure";
-
- $global{"onExitDir"} = undef; # Copy of $wrk, for caExit() and caFailure() ONLY.
- $global{"onExitNam"} = undef; # Copy of $asm, for caExit() and caFailure() ONLY.
+ setDefault("saveOverlaps", 0, "Save intermediate overlap files, almost never a good idea");
+ setDefault("saveReadCorrections", 0, "Save intermediate read correction files, almost never a good idea");
+ setDefault("saveMerCounts", 0, "Save full mer counting results, sometimes useful");
+ setDefault("onSuccess", undef, "Full path to command to run on successful completion");
+ setDefault("onFailure", undef, "Full path to command to run on failure");
##### Error Rates
- $global{"corOvlErrorRate"} = undef;
- $synops{"corOvlErrorRate"} = "Overlaps above this error rate are not computed";
-
- $global{"obtOvlErrorRate"} = undef;
- $synops{"obtOvlErrorRate"} = "Overlaps at or below this error rate are used to trim reads";
-
- $global{"utgOvlErrorRate"} = undef;
- $synops{"utgOvlErrorRate"} = "Overlaps at or below this error rate are used to trim reads";
-
- $global{"utgErrorRate"} = undef;
- $synops{"utgErrorRate"} = "Overlaps at or below this error rate are used to construct contigs";
-
- $global{"utgGraphDeviation"} = 6;
- $synops{"utgGraphDeviation"} = "Overlaps this much above median will not be used for initial graph construction";
-
- $global{"utgRepeatDeviation"} = 3;
- $synops{"utgRepeatDeviation"} = "Overlaps this much above mean unitig error rate will not be used for repeat splitting";
-
- $global{"utgRepeatConfusedBP"} = 2100;
- $synops{"utgRepeatConfusedBP"} = "Repeats where the next best edge is at least this many bp shorter will not be split";
-
- $global{"corErrorRate"} = undef;
- $synops{"corErrorRate"} = "Only use raw alignments below this error rate to construct corrected reads";
-
- $global{"cnsErrorRate"} = undef;
- $synops{"cnsErrorRate"} = "Consensus expects alignments at about this error rate";
+ setDefault("corOvlErrorRate", undef, "Overlaps above this error rate are not computed");
+ setDefault("obtOvlErrorRate", undef, "Overlaps at or below this error rate are used to trim reads");
+ setDefault("utgOvlErrorRate", undef, "Overlaps at or below this error rate are used to trim reads");
+ setDefault("utgErrorRate", undef, "Overlaps at or below this error rate are used to construct contigs");
+ setDefault("utgGraphDeviation", 6, "Overlaps this much above median will not be used for initial graph construction");
+ setDefault("utgRepeatDeviation", 3, "Overlaps this much above mean unitig error rate will not be used for repeat splitting");
+ setDefault("utgRepeatConfusedBP", 2100, "Repeats where the next best edge is at least this many bp shorter will not be split");
+ setDefault("corErrorRate", undef, "Only use raw alignments below this error rate to construct corrected reads");
+ setDefault("cnsErrorRate", undef, "Consensus expects alignments at about this error rate");
##### Minimums and maximums
- $global{"minReadLength"} = 1000;
- $synops{"minReadLength"} = "Reads shorter than this length are not loaded into the assembler; default 1000";
+ setDefault("minReadLength", 1000, "Reads shorter than this length are not loaded into the assembler; default 1000");
+ setDefault("minOverlapLength", 500, "Overlaps shorter than this length are not computed; default 500");
- $global{"minOverlapLength"} = 500;
- $synops{"minOverlapLength"} = "Overlaps shorter than this length are not computed; default 500";
+ setDefault("minMemory", undef, "Minimum amount of memory needed to compute the assembly (do not set unless prompted!)");
+ setDefault("maxMemory", undef, "Maximum memory to use by any component of the assembler");
- $global{"minMemory"} = undef;
- $synops{"minMemory"} = "Minimum amount of memory needed to compute the assembly (do not set unless prompted!)";
- $global{"minThreads"} = undef;
- $synops{"minThreads"} = "Minimum number of compute threads suggested to compute the assembly";
-
- $global{"maxMemory"} = undef;
- $synops{"maxMemory"} = "Maximum memory to use by any component of the assembler";
- $global{"maxThreads"} = undef;
- $synops{"maxThreads"} = "Maximum number of compute threads to use by any component of the assembler";
+ setDefault("minThreads", undef, "Minimum number of compute threads suggested to compute the assembly");
+ setDefault("maxThreads", undef, "Maximum number of compute threads to use by any component of the assembler");
##### Stopping conditions
- $global{"stopOnReadQuality"} = 1;
- $synops{"stopOnReadQuality"} = "Stop if a significant portion of the input data is too short or has quality value or base composition errors";
-
- $global{"stopAfter"} = undef;
- $synops{"stopAfter"} = "Tell canu when to halt execution";
+ setDefault("stopOnReadQuality", 1, "Stop if a significant portion of the input data is too short or has quality value or base composition errors");
+ setDefault("stopAfter", undef, "Stop after a specific algorithm step is completed");
##### Grid Engine configuration, internal parameters. These are filled out in canu.pl, right after this function returns.
- $global{"availableHosts"} = undef; # Internal list of cpus-memory-nodes describing the grid
-
- $global{"gridEngine"} = undef;
- $global{"gridEngineSubmitCommand"} = undef;
- $global{"gridEngineNameOption"} = undef;
- $global{"gridEngineArrayOption"} = undef;
- $global{"gridEngineArrayName"} = undef;
- $global{"gridEngineArrayMaxJobs"} = undef;
- $global{"gridEngineOutputOption"} = undef;
- $global{"gridEnginePropagateCommand"} = undef;
- $global{"gridEngineThreadsOption"} = undef;
- $global{"gridEngineMemoryOption"} = undef;
- $global{"gridEngineMemoryUnits"} = undef;
- $global{"gridEngineNameToJobIDCommand"} = undef;
- $global{"gridEngineNameToJobIDCommandNoArray"} = undef;
- $global{"gridEngineStageOption"} = undef;
- $global{"gridEngineTaskID"} = undef;
- $global{"gridEngineArraySubmitID"} = undef;
- $global{"gridEngineJobID"} = undef;
+ setDefault("gridEngine", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineSubmitCommand", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineNameOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineArrayOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineArrayName", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineArrayMaxJobs", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineOutputOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEnginePropagateCommand", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineThreadsOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineMemoryOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineMemoryUnits", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineNameToJobIDCommand", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineNameToJobIDCommandNoArray", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineStageOption", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineTaskID", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineArraySubmitID", undef, "Grid engine configuration, not documented");
+ setDefault("gridEngineJobID", undef, "Grid engine configuration, not documented");
##### Grid Engine Pipeline
- $global{"useGrid"} = 1;
- $synops{"useGrid"} = "If 'true', enable grid-based execution; if 'false', run all jobs on the local machine; if 'remote', create jobs for grid execution but do not submit; default 'true'";
+ setDefault("useGrid", 1, "If 'true', enable grid-based execution; if 'false', run all jobs on the local machine; if 'remote', create jobs for grid execution but do not submit; default 'true'");
foreach my $c (qw(BAT GFA CNS COR MERYL CORMHAP CORMMAP COROVL OBTMHAP OBTMMAP OBTOVL OEA OVB OVS RED UTGMHAP UTGMMAP UTGOVL)) {
- $global{"useGrid$c"} = 1;
- $synops{"useGrid$c"} = "If 'true', run module $c under grid control; if 'false' run locally.";
+ setDefault("useGrid$c", 1, "If 'true', run module $c under grid control; if 'false' run locally.");
}
##### Grid Engine configuration, for each step of the pipeline
- $global{"gridOptions"} = undef;
- $synops{"gridOptions"} = "Grid engine options applied to all jobs";
-
- $global{"gridOptionsExecutive"} = undef;
- $synops{"gridOptionsExecutive"} = "Grid engine options applied to the canu executive script";
-
- $global{"gridOptionsJobName"} = undef;
- $synops{"gridOptionsJobName"} = "Grid jobs job-name suffix";
+ setDefault("gridOptions", undef, "Grid engine options applied to all jobs");
+ setDefault("gridOptionsExecutive", undef, "Grid engine options applied to the canu executive script");
+ setDefault("gridOptionsJobName", undef, "Grid jobs job-name suffix");
##### Grid Engine configuration and parameters, for each step of the pipeline (memory, threads)
- setExecDefaults("meryl", "mer counting");
-
- setExecDefaults("cor", "read correction");
+ setExecDefaults("meryl", "mer counting");
+ setExecDefaults("cor", "read correction");
setExecDefaults("corovl", "overlaps for correction");
setExecDefaults("obtovl", "overlaps for trimming");
@@ -823,26 +842,21 @@ sub setDefaults () {
setExecDefaults("obtmmap", "mmap overlaps for trimming");
setExecDefaults("utgmmap", "mmap overlaps for unitig construction");
- setExecDefaults("ovb", "overlap store bucketizing");
- setExecDefaults("ovs", "overlap store sorting");
+ setExecDefaults("ovb", "overlap store bucketizing");
+ setExecDefaults("ovs", "overlap store sorting");
- setExecDefaults("red", "read error detection");
- setExecDefaults("oea", "overlap error adjustment");
+ setExecDefaults("red", "read error detection");
+ setExecDefaults("oea", "overlap error adjustment");
- setExecDefaults("bat", "unitig construction");
- setExecDefaults("cns", "unitig consensus");
- setExecDefaults("gfa", "graph alignment and processing");
+ setExecDefaults("bat", "unitig construction");
+ setExecDefaults("cns", "unitig consensus");
+ setExecDefaults("gfa", "graph alignment and processing");
##### Object Storage
- $global{"objectStore"} = undef;
- $synops{"objectStore"} = "Type of object storage used; not ready for production yet";
-
- $global{"objectStoreClient"} = undef;
- $synops{"objectStoreClient"} = "Path to the command line client used to access the object storage";
-
- $global{"objectStoreNameSpace"} = undef;
- $synops{"objectStoreNameSpace"} = "Object store parameters; specific to the type of objectStore used";
+ setDefault("objectStore", undef, "Type of object storage used; not ready for production yet");
+ setDefault("objectStoreClient", undef, "Path to the command line client used to access the object storage");
+ setDefault("objectStoreNameSpace", undef, "Object store parameters; specific to the type of objectStore used");
##### Overlapper
@@ -852,140 +866,75 @@ sub setDefaults () {
##### Overlap Store
- $global{"ovsMethod"} = undef;
- $synops{"ovsMethod"} = "Use the 'sequential' or 'parallel' algorithm for constructing an overlap store; default 'sequential'";
+ setDefault("ovsMethod", undef, "Use the 'sequential' or 'parallel' algorithm for constructing an overlap store; default 'sequential'");
##### Mers
- $global{"merylMemory"} = undef;
- $synops{"merylMemory"} = "Amount of memory, in gigabytes, to use for mer counting";
-
- $global{"merylThreads"} = undef;
- $synops{"merylThreads"} = "Number of threads to use for mer counting";
-
- $global{"merylConcurrency"} = undef;
- $synops{"merylConcurrency"} = "Unused, there is only one process";
+ setDefault("merylMemory", undef, "Amount of memory, in gigabytes, to use for mer counting");
+ setDefault("merylThreads", undef, "Number of threads to use for mer counting");
+ setDefault("merylConcurrency", undef, "Unused, there is only one process");
##### Overlap Based Trimming
- $global{"obtErrorRate"} = undef;
- $synops{"obtErrorRate"} = "Stringency of overlaps to use for trimming";
-
- $global{"trimReadsOverlap"} = 1;
- $synops{"trimReadsOverlap"} = "Minimum overlap between evidence to make contiguous trim; default '1'";
-
- $global{"trimReadsCoverage"} = 1;
- $synops{"trimReadsCoverage"} = "Minimum depth of evidence to retain bases; default '1'";
+ setDefault("obtErrorRate", undef, "Stringency of overlaps to use for trimming");
+ setDefault("trimReadsOverlap", 1, "Minimum overlap between evidence to make contiguous trim; default '1'");
+ setDefault("trimReadsCoverage", 1, "Minimum depth of evidence to retain bases; default '1'");
#$global{"splitReads..."} = 1;
#$synops{"splitReads..."} = "";
##### Fragment/Overlap Error Correction
- $global{"enableOEA"} = 1;
- $synops{"enableOEA"} = "Do overlap error adjustment - comprises two steps: read error detection (RED) and overlap error adjustment (OEA); default 'true'";
-
- $global{"redBatchSize"} = undef;
- $synops{"redBatchSize"} = "Number of reads per fragment error detection batch";
-
- $global{"redBatchLength"} = undef;
- $synops{"redBatchLength"} = "Number of bases per fragment error detection batch";
-
- $global{"oeaBatchSize"} = undef;
- $synops{"oeaBatchSize"} = "Number of reads per overlap error correction batch";
-
- $global{"oeaBatchLength"} = undef;
- $synops{"oeaBatchLength"} = "Number of bases per overlap error correction batch";
+ setDefault("enableOEA", 1, "Do overlap error adjustment - comprises two steps: read error detection (RED) and overlap error adjustment (OEA); default 'true'");
+ setDefault("redBatchSize", undef, "Number of reads per fragment error detection batch");
+ setDefault("redBatchLength", undef, "Number of bases per fragment error detection batch");
+ setDefault("oeaBatchSize", undef, "Number of reads per overlap error correction batch");
+ setDefault("oeaBatchLength", undef, "Number of bases per overlap error correction batch");
##### Unitigger & BOG & bogart Options
- $global{"unitigger"} = "bogart";
- $synops{"unitigger"} = "Which unitig algorithm to use; only 'bogart' supported; default 'bogart'";
-
- $global{"genomeSize"} = undef;
- $synops{"genomeSize"} = "An estimate of the size of the genome";
-
- $global{"batOptions"} = undef;
- $synops{"batOptions"} = "Advanced options to bogart";
-
- $global{"batMemory"} = undef;
- $synops{"batMemory"} = "Approximate maximum memory usage, in gigabytes, default is the maxMemory limit";
-
- $global{"batThreads"} = undef;
- $synops{"batThreads"} = "Number of threads to use; default is the maxThreads limit";
-
- $global{"batConcurrency"} = undef;
- $synops{"batConcurrency"} = "Unused, only one process supported";
-
- ##### Unitig Filtering Options (also set in bogart/bogart.C)
+ setDefault("unitigger", "bogart", "Which unitig algorithm to use; only 'bogart' supported; default 'bogart'");
+ setDefault("genomeSize", undef, "An estimate of the size of the genome");
+ setDefault("batOptions", undef, "Advanced options to bogart");
+ setDefault("batMemory", undef, "Approximate maximum memory usage, in gigabytes, default is the maxMemory limit");
+ setDefault("batThreads", undef, "Number of threads to use; default is the maxThreads limit");
+ setDefault("batConcurrency", undef, "Unused, only one process supported");
- $global{"contigFilter"} = "2 1000 0.75 0.75 2";
- $synops{"contigFilter"} = "Parameters to filter out 'unassembled' unitigs: minReads; minLength; singleReadSpan; lowCovFraction, lowCovDepth";
+ setDefault("contigFilter", "2 0 1.0 0.5 5", "Parameters to filter out 'unassembled' unitigs. Five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth");
##### Consensus Options
- $global{"cnsPartitions"} = undef;
- $synops{"cnsPartitions"} = "Partition consensus into N jobs";
-
- $global{"cnsPartitionMin"} = undef;
- $synops{"cnsPartitionMin"} = "Don't make a consensus partition with fewer than N reads";
-
- $global{"cnsMaxCoverage"} = 40;
- $synops{"cnsMaxCoverage"} = "Limit unitig consensus to at most this coverage; default '0' = unlimited";
-
- $global{"cnsConsensus"} = "pbdagcon";
- $synops{"cnsConsensus"} = "Which consensus algorithm to use; 'pbdagcon' (fast, reliable); 'utgcns' (multialignment output); 'quick' (single read mosaic); default 'pbdagcon'";
+ setDefault("cnsPartitions", undef, "Partition consensus into N jobs");
+ setDefault("cnsPartitionMin", undef, "Don't make a consensus partition with fewer than N reads");
+ setDefault("cnsMaxCoverage", 40, "Limit unitig consensus to at most this coverage; default '0' = unlimited");
+ setDefault("cnsConsensus", "pbdagcon", "Which consensus algorithm to use; 'pbdagcon' (fast, reliable); 'utgcns' (multialignment output); 'quick' (single read mosaic); default 'pbdagcon'");
##### Correction Options
- $global{"corPartitions"} = undef;
- $synops{"corPartitions"} = "Partition read correction into N jobs";
-
- $global{"corPartitionMin"} = undef;
- $synops{"corPartitionMin"} = "Don't make a read correction partition with fewer than N reads";
-
- $global{"corMinEvidenceLength"} = undef;
- $synops{"corMinEvidenceLength"} = "Limit read correction to only overlaps longer than this; default: unlimited";
-
- $global{"corMaxEvidenceErate"} = undef;
- $synops{"corMaxEvidenceErate"} = "Limit read correction to only overlaps at or below this fraction error; default: unlimited";
-
- $global{"corMaxEvidenceCoverageGlobal"}= "1.0x";
- $synops{"corMaxEvidenceCoverageGlobal"}= "Limit reads used for correction to supporting at most this coverage; default: '1.0x' = 1.0 * estimated coverage";
-
- $global{"corMaxEvidenceCoverageLocal"} = "2.0x";
- $synops{"corMaxEvidenceCoverageLocal"} = "Limit reads being corrected to at most this much evidence coverage; default: '2.0x' = 2.0 * estimated coverage";
-
- $global{"corOutCoverage"} = 40;
- $synops{"corOutCoverage"} = "Only correct the longest reads up to this coverage; default 40";
-
- $global{"corMinCoverage"} = undef;
- $synops{"corMinCoverage"} = "Minimum number of bases supporting each corrected base, if less than this sequences are split; default based on input read coverage: 0 <= 30x < 4 < 60x <= 4";
-
- $global{"corFilter"} = "expensive";
- $synops{"corFilter"} = "Method to filter short reads from correction; 'quick' or 'expensive'; default 'expensive'";
-
- $global{"corConsensus"} = "falconpipe";
- $synops{"corConsensus"} = "Which consensus algorithm to use; only 'falcon' and 'falconpipe' are supported; default 'falconpipe'";
-
- $global{"corLegacyFilter"} = undef;
- $synops{"corLegacyFilter"} = "Expert option: global filter, length * identity (default) or length with broken by identity (if on)";
+ setDefault("corPartitions", undef, "Partition read correction into N jobs");
+ setDefault("corPartitionMin", undef, "Don't make a read correction partition with fewer than N reads");
+ setDefault("corMinEvidenceLength", undef, "Limit read correction to only overlaps longer than this; default: unlimited");
+ setDefault("corMaxEvidenceErate", undef, "Limit read correction to only overlaps at or below this fraction error; default: unlimited");
+ setDefault("corMaxEvidenceCoverageGlobal", "1.0x", "Limit reads used for correction to supporting at most this coverage; default: '1.0x' = 1.0 * estimated coverage");
+ setDefault("corMaxEvidenceCoverageLocal", "2.0x", "Limit reads being corrected to at most this much evidence coverage; default: '2.0x' = 2.0 * estimated coverage");
+ setDefault("corOutCoverage", 40, "Only correct the longest reads up to this coverage; default 40");
+ setDefault("corMinCoverage", undef, "Minimum number of bases supporting each corrected base, if less than this sequences are split; default based on input read coverage: 0 <= 30x < 4 < 60x <= 4");
+ setDefault("corFilter", "expensive", "Method to filter short reads from correction; 'quick' or 'expensive'; default 'expensive'");
+ setDefault("corConsensus", "falconpipe", "Which consensus algorithm to use; only 'falcon' and 'falconpipe' are supported; default 'falconpipe'");
+ setDefault("corLegacyFilter", undef, "Expert option: global filter, length * identity (default) or length with broken by identity (if on)");
# Convert all the keys to lowercase, and remember the case-sensitive version
- foreach my $k (keys %global) {
+ foreach my $k (keys %synops) {
(my $l = $k) =~ tr/A-Z/a-z/;
- if (! exists($synnam{$l})) {
- $synnam{$l} = $k;
+ $synnam{$l} = $k; # Remember that option $l is stylized as $k.
- if (!exists($global{$l})) {
- $global{$l} = $global{$k};
- delete $global{$k};
- }
+ next if (!exists($global{$k})); # If no option for this (it's a meta-option), skip.
+ next if ( exists($global{$l})); # If lowercase already exists, skip.
- #print "$k -> $l\n";
- }
+ $global{$l} = $global{$k}; # Otherwise, set the lowercase option and
+ delete $global{$k}; # delete the uppercase version
}
# If this is set, it breaks the consensus.sh and overlap.sh scripts. Good grief! Why
@@ -1251,7 +1200,10 @@ sub checkParameters () {
foreach my $var ("corOutCoverage") {
if (!defined(getGlobal($var))) {
- addCommandLineError("ERROR: Invalid 'corOutCoverage' specified (" . getGlobal("corOutCoverage") . "); must be at least 1.0\n");
+ addCommandLineError("ERROR: Invalid 'corOutCoverage' specified; must be at least 1.0\n");
+ }
+ elsif (getGlobal($var) =~ m/all/i) {
+ setGlobal($var, 9999);
}
elsif (getGlobal($var) !~ m/^[.-0123456789]/) {
addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
@@ -1312,6 +1264,15 @@ sub checkParameters () {
}
}
+ if ((getGlobal("ovsMethod") ne "sequential") &&
+ (getGlobal("ovsMethod") ne "parallel")) {
+ addCommandLineError("ERROR: Invalid 'ovsMethod' specified (" . getGlobal("ovsMethod") . "); must be 'sequential' or 'parallel'\n");
+ }
+ if ((getGlobal("useGrid") eq "0") &&
+ (getGlobal("ovsMethod") eq "parallel")) {
+ addCommandLineError("ERROR: ovsMethod=parallel requires useGrid=true or useGrid=remote. Set ovsMethod=sequential if no grid is available\n");
+ }
+
if ((getGlobal("unitigger") ne "unitigger") &&
(getGlobal("unitigger") ne "bogart")) {
addCommandLineError("ERROR: Invalid 'unitigger' specified (" . getGlobal("unitigger") . "); must be 'unitigger' or 'bogart'\n");
@@ -1335,12 +1296,11 @@ sub checkParameters () {
addCommandLineError("ERROR: Invalid 'lowCoverageAllowed' and 'lowCoverageDepth' specified; both must be set\n");
}
- #if ((getGlobal("cleanup") ne "none") &&
- # (getGlobal("cleanup") ne "light") &&
- # (getGlobal("cleanup") ne "heavy") &&
- # (getGlobal("cleanup") ne "aggressive")) {
- # addCommandLineError("ERROR: Invalid cleaup specified (" . getGlobal("cleanup") . "); must be 'none', 'light', 'heavy' or 'aggressive'\n");
- #}
+ if ((getGlobal("saveOverlaps") ne "0") &&
+ (getGlobal("saveOverlaps") ne "stores") &&
+ (getGlobal("saveOverlaps") ne "1")) {
+ addCommandLineError("ERROR: Invalid 'saveOverlaps' specified (" . getGlobal("saveOverlaps") . "); must be 'false', 'stores', or 'true'\n");
+ }
if ((getGlobal("corFilter") ne "quick") &&
(getGlobal("corFilter") ne "expensive") &&
@@ -1388,6 +1348,20 @@ sub checkParameters () {
addCommandLineError($failureString) if ($ok == 0);
}
+ {
+ my @v = split '\s+', getGlobal("contigFilter");
+
+ if (scalar(@v) != 5) {
+ addCommandLineError("contigFilter must have five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth\n");
+ }
+
+ addCommandLineError("contigFilter 'minReads' must be a positive integer, currently $v[0]\n") if (($v[0] < 0) || ($v[0] !~ m/^[0-9]+$/));
+ addCommandLineError("contigFilter 'minLength' must be a positive integer, currently $v[1]\n") if (($v[1] < 0) || ($v[1] !~ m/^[0-9]+$/));
+ addCommandLineError("contigFilter 'singleReadSpan' must be between 0.0 and 1.0, currently $v[2]\n") if (($v[2] < 0) || (1 < $v[2]) || ($v[2] !~ m/^[0-9]*\.{0,1}[0-9]*$/));
+ addCommandLineError("contigFilter 'lowCovFraction' must be between 0.0 and 1.0, currently $v[3]\n") if (($v[3] < 0) || (1 < $v[3]) || ($v[3] !~ m/^[0-9]*\.{0,1}[0-9]*$/));
+ addCommandLineError("contigFilter 'lowCovDepth' must be a positive integer, currently $v[4]\n") if (($v[4] < 0) || ($v[4] !~ m/^[0-9]+$/));
+ }
+
#
# Minimap, no valid identities, set legacy
#
diff --git a/src/pipelines/canu/Execution.pm b/src/pipelines/canu/Execution.pm
index be1fe77..97cc0f0 100644
--- a/src/pipelines/canu/Execution.pm
+++ b/src/pipelines/canu/Execution.pm
@@ -59,6 +59,7 @@ require Exporter;
skipStage
emitStage
touch
+ makeExecutable
getInstallDirectory
getJobIDShellCode
getLimitShellCode
@@ -167,8 +168,9 @@ sub schedulerRun () {
}
}
-sub schedulerFinish ($) {
+sub schedulerFinish ($$) {
my $dir = shift @_;
+ my $nam = shift @_;
my $child;
my @newProcesses;
my $remain;
@@ -179,8 +181,8 @@ sub schedulerFinish ($) {
my $diskfree = (defined($dir)) ? (diskSpace($dir)) : (0);
print STDERR "----------------------------------------\n";
- print STDERR "-- Starting concurrent execution on ", scalar(localtime()), " with $diskfree GB free disk space ($remain processes; $numberOfProcesses concurrently)\n" if (defined($dir));
- print STDERR "-- Starting concurrent execution on ", scalar(localtime()), " ($remain processes; $numberOfProcesses concurrently)\n" if (!defined($dir));
+ print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " with $diskfree GB free disk space ($remain processes; $numberOfProcesses concurrently)\n" if (defined($dir));
+ print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " ($remain processes; $numberOfProcesses concurrently)\n" if (!defined($dir));
print STDERR "\n";
print STDERR " cd $dir\n";
@@ -242,6 +244,13 @@ sub touch ($@) {
+sub makeExecutable ($) {
+ my $file = shift @_;
+
+ chmod(0755 & ~umask(), $file);
+}
+
+
#
# State management
#
@@ -346,41 +355,34 @@ sub getJobIDShellCode () {
# Emits a block of shell code to change shell imposed limit on the number of open files and
# processes.
#
-sub getLimitShellCode ($) {
- my $which = shift @_;
+sub getLimitShellCode () {
my $string;
- if ($which eq "processes") {
- $string .= "\n";
- $string .= "max=`ulimit -Hu`\n";
- $string .= "bef=`ulimit -Su`\n";
- $string .= "if [ \$bef -lt \$max ] ; then\n";
- $string .= " ulimit -Su \$max\n";
- $string .= " aft=`ulimit -Su`\n";
- $string .= " echo \"Changed max processes per user from \$bef to \$aft (max \$max).\"\n";
- $string .= " echo \"\"\n";
- $string .= "else\n";
- $string .= " echo \"Max processes per user limited to \$bef, no increase possible.\"\n";
- $string .= " echo \"\"\n";
- $string .= "fi\n";
- $string .= "\n";
- }
-
- if ($which eq "files") {
- $string .= "\n";
- $string .= "max=`ulimit -Hn`\n";
- $string .= "bef=`ulimit -Sn`\n";
- $string .= "if [ \$bef -lt \$max ] ; then\n";
- $string .= " ulimit -Sn \$max\n";
- $string .= " aft=`ulimit -Sn`\n";
- $string .= " echo \"Changed max open files from \$bef to \$aft (max \$max).\"\n";
- $string .= " echo \"\"\n";
- $string .= "else\n";
- $string .= " echo \"Max open files limited to \$bef, no increase possible.\"\n";
- $string .= " echo \"\"\n";
- $string .= "fi\n";
- $string .= "\n";
- }
+ $string .= "echo \"\"\n";
+ $string .= "echo \"Attempting to increase maximum allowed processes and open files.\"";
+ $string .= "\n";
+ $string .= "max=`ulimit -Hu`\n";
+ $string .= "bef=`ulimit -Su`\n";
+ $string .= "if [ \$bef -lt \$max ] ; then\n";
+ $string .= " ulimit -Su \$max\n";
+ $string .= " aft=`ulimit -Su`\n";
+ $string .= " echo \" Changed max processes per user from \$bef to \$aft (max \$max).\"\n";
+ $string .= "else\n";
+ $string .= " echo \" Max processes per user limited to \$bef, no increase possible.\"\n";
+ $string .= "fi\n";
+ $string .= "\n";
+ $string .= "max=`ulimit -Hn`\n";
+ $string .= "bef=`ulimit -Sn`\n";
+ $string .= "if [ \$bef -lt \$max ] ; then\n";
+ $string .= " ulimit -Sn \$max\n";
+ $string .= " aft=`ulimit -Sn`\n";
+ $string .= " echo \" Changed max open files from \$bef to \$aft (max \$max).\"\n";
+ $string .= "else\n";
+ $string .= " echo \" Max open files limited to \$bef, no increase possible.\"\n";
+ $string .= "fi\n";
+ $string .= "\n";
+ $string .= "echo \"\"\n";
+ $string .= "\n";
return($string);
}
@@ -663,7 +665,7 @@ sub submitScript ($$) {
print F "\$bin/canu " . getCommandLineOptions() . " canuIteration=" . getGlobal("canuIteration") . "\n";
close(F);
- system("chmod +x $script");
+ makeExecutable("$script");
# Construct a submission command line.
@@ -844,6 +846,18 @@ sub buildThreadOption ($) {
}
+sub purgeGridJobSubmitScripts ($$) {
+ my $path = shift @_;
+ my $script = shift @_;
+ my $idx = "01";
+
+ while (-e "$path/$script.jobSubmit-$idx.sh") {
+ unlink "$path/$script.jobSubmit-$idx.sh";
+ $idx++;
+ }
+}
+
+
sub buildGridJob ($$$$$$$$$) {
my $asm = shift @_;
my $jobType = shift @_;
@@ -894,12 +908,18 @@ sub buildGridJob ($$$$$$$$$) {
$opts .= "$outputOption " if (defined($outputOption));
$opts =~ s/\s+$//;
+ # Find a unique file name to save the command.
+
+ my $idx = "01";
+
+ while (-e "$path/$script.jobSubmit-$idx.sh") {
+ $idx++;
+ }
+
# Build and save the command line. Return the command PREFIX (we'll be adding .sh and .out as
# appropriate), and the job name it will be submitted with (which isn't expected to be used).
- my $cmd;
-
- open(F, "> $path/$script.jobSubmit.sh") or die;
+ open(F, "> $path/$script.jobSubmit-$idx.sh") or die;
print F "#!/bin/sh\n";
print F "\n";
print F "$submitCommand \\\n";
@@ -907,12 +927,12 @@ sub buildGridJob ($$$$$$$$$) {
print F " $nameOption \"$jobName\" \\\n";
print F " $arrayOpt \\\n";
print F " ./$script.sh $arrayOff \\\n";
- print F "> ./$script.jobSubmit.out 2>&1\n";
+ print F "> ./$script.jobSubmit-$idx.out 2>&1\n";
close(F);
- chmod 0755, "$path/$script.jobSubmit.sh";
+ makeExecutable("$path/$script.jobSubmit-$idx.sh");
- return("$script.jobSubmit", $jobName);
+ return("$script.jobSubmit-$idx", $jobName);
}
@@ -1029,7 +1049,7 @@ sub submitOrRunParallelJob ($$$$@) {
# The script MUST be executable.
- system("chmod +x \"$path/$script.sh\"");
+ makeExecutable("$path/$script.sh");
# Report what we're doing.
@@ -1091,6 +1111,8 @@ sub submitOrRunParallelJob ($$$$@) {
print STDERR "--\n";
+ purgeGridJobSubmitScripts($path, $script);
+
foreach my $j (@jobs) {
my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef);
@@ -1201,6 +1223,8 @@ sub submitOrRunParallelJob ($$$$@) {
print STDERR "Please run the following commands to submit jobs to the grid for execution using $mem gigabytes memory and $thr threads:\n";
print STDERR "\n";
+ purgeGridJobSubmitScripts($path, $script);
+
foreach my $j (@jobs) {
my $cwd = getcwd();
my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef);
@@ -1247,7 +1271,7 @@ sub submitOrRunParallelJob ($$$$@) {
my $nParallel = $nCParallel < $nMParallel ? $nCParallel : $nMParallel;
schedulerSetNumberOfProcesses($nParallel);
- schedulerFinish($path);
+ schedulerFinish($path, $jobType);
}
@@ -1430,33 +1454,39 @@ sub findExecutable ($) {
# Use caExit() for transient errors, like not opening files, processes that die, etc.
sub caExit ($$) {
- my $asm = getGlobal("onExitNam");
- my $msg = shift @_;
- my $log = shift @_;
+ my $asm = getGlobal("onExitNam");
+ my $msg = shift @_;
+ my $log = shift @_;
+ my $version = getGlobal("version");
- print STDERR "================================================================================\n";
- print STDERR "Don't panic, but a mostly harmless error occurred and Canu stopped.\n";
print STDERR "\n";
-
- if (defined($log)) {
+ print STDERR "ABORT:\n";
+ print STDERR "ABORT: $version\n";
+ print STDERR "ABORT: Don't panic, but a mostly harmless error occurred and Canu stopped.\n";
+ print STDERR "ABORT: Try restarting. If that doesn't work, ask for help.\n";
+ print STDERR "ABORT:\n";
+ print STDERR "ABORT: $msg.\n" if (defined($msg));
+ print STDERR "ABORT:\n" if (defined($msg));
+
+ if (defined($log) && -e $log) {
my $df = diskSpace($log);
- print STDERR "Disk space available: $df GB\n";
- print STDERR "\n";
+ print STDERR "ABORT: Disk space available: $df GB\n";
+ print STDERR "ABORT:\n";
}
if (-e $log) {
- print STDERR "Last 50 lines of the relevant log file ($log):\n";
- print STDERR "\n";
- system("tail -n 50 $log");
- print STDERR "\n";
- }
+ print STDERR "ABORT: Last 50 lines of the relevant log file ($log):\n";
+ print STDERR "ABORT:\n";
- my $version = getGlobal("version");
+ open(Z, "tail -n 50 $log");
+ while (<Z>) {
+ print STDERR "ABORT: $_";
+ }
+ close(Z);
- print STDERR "$version failed with:\n";
- print STDERR " $msg\n";
- print STDERR "\n";
+ print STDERR "ABORT:\n";
+ }
my $fail = getGlobal('onFailure');
if (defined($fail)) {
@@ -1473,26 +1503,35 @@ sub caFailure ($$) {
my $msg = shift @_;
my $log = shift @_;
my $version = getGlobal("version");
- my $trace = longmess(undef);
+ my $trace = longmess("Failed");
+
+ $trace =~ s/\n/\nCRASH: /g;
- print STDERR "================================================================================\n";
- print STDERR "Please panic. Canu failed, and it shouldn't have.\n";
- print STDERR "\n";
- print STDERR "Stack trace:\n";
- print STDERR "\n";
- print STDERR "$trace\n";
print STDERR "\n";
+ print STDERR "CRASH:\n";
+ print STDERR "CRASH: $version\n";
+ print STDERR "CRASH: Please panic, this is abnormal.\n";
+ print STDERR "ABORT:\n";
+ print STDERR "CRASH: $msg.\n";
+ print STDERR "CRASH:\n";
+ print STDERR "CRASH: $trace\n";
+ #print STDERR "CRASH:\n"; # $trace has an extra CRASH: at the end
if (-e $log) {
- print STDERR "Last few lines of the relevant log file ($log):\n";
- print STDERR "\n";
- system("tail -n 50 $log");
- }
+ print STDERR "CRASH: Last 50 lines of the relevant log file ($log):\n";
+ print STDERR "CRASH:\n";
- print STDERR "\n";
- print STDERR "$version failed with:\n";
- print STDERR " $msg\n";
- print STDERR "\n";
+ open(Z, "tail -n 50 $log");
+ while (<Z>) {
+ print STDERR "CRASH: $_";
+ }
+ close(Z);
+
+ print STDERR "CRASH:\n";
+ } else {
+ print STDERR "CRASH: No log file supplied.\n";
+ print STDERR "CRASH:\n";
+ }
my $fail = getGlobal('onFailure');
if (defined($fail)) {
diff --git a/src/pipelines/canu/Gatekeeper.pm b/src/pipelines/canu/Gatekeeper.pm
index bf28144..2d8fd04 100644
--- a/src/pipelines/canu/Gatekeeper.pm
+++ b/src/pipelines/canu/Gatekeeper.pm
@@ -168,6 +168,23 @@ sub gatekeeperCreateStore ($$@) {
return;
}
+ # If the store failed to build and the user just reruns canu, this will be triggered. We'll
+ # skip rebuilding the store again, and report the original error message.
+
+ if (-e "$base/$asm.gkpStore.BUILDING") {
+ print STDERR "-- WARNING:\n";
+ print STDERR "-- WARNING: Previously failed gkpStore detected.\n";
+ print STDERR "-- WARNING:\n";
+ }
+
+ # Not sure how this can occur. Possibly the user just deleted gkpStore.BUILDING and restarted?
+
+ if ((! -e "$base/$asm.gkpStore.BUILDING") && (-e "$base/$asm.gkpStore.gkp")) {
+ print STDERR "-- WARNING:\n";
+ print STDERR "-- WARNING: Existing sequence inputs used.\n";
+ print STDERR "-- WARNING:\n";
+ }
+
# Fail if there are no inputs.
caExit("no input files specified, and store not already created, I have nothing to work on!", undef)
@@ -178,87 +195,91 @@ sub gatekeeperCreateStore ($$@) {
# At the same time, check that all files exist.
- my $ff = undef;
+ if (!-e "$base/$asm.gkpStore.gkp") {
+ my $ff = undef;
+
+ foreach my $iii (@inputs) {
+ my ($type, $file) = split '\0', $iii;
- foreach my $iii (@inputs) {
- my ($type, $file) = split '\0', $iii;
+ if (($file =~ m/\.correctedReads\./) ||
+ ($file =~ m/\.trimmedReads\./)) {
+ fetchFile($file);
- if (($file =~ m/\.correctedReads\./) ||
- ($file =~ m/\.trimmedReads\./)) {
- fetchFile($file);
+ chdir($base); # Move to where we run the command
+ $file = "../$file" if (-e "../$file"); # If file exists up one dir, it's our file
+ $iii = "$type\0$file"; # Rewrite the option
+ chdir(".."); # ($file is used below too)
+ }
- chdir($base); # Move to where we run the command
- $file = "../$file" if (-e "../$file"); # If file exists up one dir, it's our file
- $iii = "$type\0$file"; # Rewrite the option
- chdir(".."); # ($file is used below too)
+ chdir($base);
+ $ff .= (defined($ff) ? "\n " : "") . "reads '$file' not found." if (! -e $file);
+ chdir("..");
}
- chdir($base);
- $ff .= (defined($ff) ? "\n " : "") . "reads '$file' not found." if (! -e $file);
- chdir("..");
- }
+ caExit($ff, undef) if defined($ff);
- caExit($ff, undef) if defined($ff);
+ # Build a gkp file for all the raw sequence inputs. For simplicity, we just copy in any gkp
+ # files as is. This documents what gatekeeper was built with, etc.
- # Build a gkp file for all the raw sequence inputs. For simplicity, we just copy in any gkp
- # files as is. This documents what gatekeeper was built with, etc.
+ open(F, "> $base/$asm.gkpStore.gkp") or caExit("cant' open '$base/$asm.gkpStore.gkp' for writing: $0", undef);
- open(F, "> $base/$asm.gkpStore.gkp") or caExit("cant' open '$base/$asm.gkpStore.gkp' for writing: $0", undef);
+ foreach my $iii (@inputs) {
+ if ($iii =~ m/^-(.*)\0(.*)$/) {
+ my $tech = $1;
+ my $file = $2;
+ my @name = split '/', $2;
+ my $name = $name[scalar(@name)-1];
- foreach my $iii (@inputs) {
- if ($iii =~ m/^-(.*)\0(.*)$/) {
- my $tech = $1;
- my $file = $2;
- my @name = split '/', $2;
- my $name = $name[scalar(@name)-1];
+ $name = $1 if ($name =~ m/(.*).[xgb][z]2{0,1}$/i);
+ $name = $1 if ($name =~ m/(.*).fast[aq]$/i);
+ $name = $1 if ($name =~ m/(.*).f[aq]$/i);
- $name = $1 if ($name =~ m/(.*).[xgb][z]2{0,1}$/i);
- $name = $1 if ($name =~ m/(.*).fast[aq]$/i);
- $name = $1 if ($name =~ m/(.*).f[aq]$/i);
+ print F "########################################\n";
+ print F "# $tech: $file\n";
+ print F "#\n";
+ print F "name $name\n";
+ print F "preset $tech\n";
+ print F "$file\n";
+ print F "\n";
- print F "########################################\n";
- print F "# $tech: $file\n";
- print F "#\n";
- print F "name $name\n";
- print F "preset $tech\n";
- print F "$file\n";
- print F "\n";
+ } elsif (-e $iii) {
+ print F "########################################\n";
+ print F "# $iii\n";
+ print F "#\n";
+ open(I, "< $iii") or caExit("can't open gatekeeper input '$iii' for reading: $0", undef);
+ while (<I>) {
+ print F $_;
+ }
+ close(I);
+ print F "\n";
- } elsif (-e $iii) {
- print F "########################################\n";
- print F "# $iii\n";
- print F "#\n";
- open(I, "< $iii") or caExit("can't open gatekeeper input '$iii' for reading: $0", undef);
- while (<I>) {
- print F $_;
+ } else {
+ caExit("unrecognized gatekeeper input file '$iii'", undef);
}
- close(I);
- print F "\n";
-
- } else {
- caExit("unrecognized gatekeeper input file '$iii'", undef);
}
- }
- close(F);
+ close(F);
+ }
# Load the store.
- my $cmd;
- $cmd .= "$bin/gatekeeperCreate \\\n";
- $cmd .= " -minlength " . getGlobal("minReadLength") . " \\\n";
- $cmd .= " -o ./$asm.gkpStore.BUILDING \\\n";
- $cmd .= " ./$asm.gkpStore.gkp \\\n";
- $cmd .= "> ./$asm.gkpStore.BUILDING.err 2>&1";
-
- # A little funny business to make gatekeeper not fail on read quality issues.
- # A return code of 0 is total success.
- # A return code of 1 means it found errors in the inputs, but finished.
- # Anything larger is a crash.
-
- if (runCommand($base, $cmd) > 1) {
- caExit("gatekeeper failed", "$base/$asm.gkpStore.BUILDING.err");
+ if (! -e "$base/$asm.gkpStore.BUILDING") {
+ my $cmd;
+ $cmd .= "$bin/gatekeeperCreate \\\n";
+ $cmd .= " -minlength " . getGlobal("minReadLength") . " \\\n";
+ $cmd .= " -o ./$asm.gkpStore.BUILDING \\\n";
+ $cmd .= " ./$asm.gkpStore.gkp \\\n";
+ $cmd .= "> ./$asm.gkpStore.BUILDING.err 2>&1";
+
+ # A little funny business to make gatekeeper not fail on read quality issues.
+ # A return code of 0 is total success.
+ # A return code of 1 means it found errors in the inputs, but finished.
+ # Anything larger is a crash.
+
+ if (runCommand($base, $cmd) > 1) {
+ caExit("gatekeeper failed", "$base/$asm.gkpStore.BUILDING.err");
+ }
}
# Check for quality issues.
@@ -273,6 +294,7 @@ sub gatekeeperCreateStore ($$@) {
close(F);
if ($nProblems > 0) {
+ print STDERR "\n";
print STDERR "Gatekeeper detected problems in your input reads. Please review the logging in files:\n";
print STDERR " ", getcwd(), "/$base/$asm.gkpStore.BUILDING.err\n";
print STDERR " ", getcwd(), "/$base/$asm.gkpStore.BUILDING/errorLog\n";
@@ -467,7 +489,7 @@ sub gatekeeper ($$@) {
if ((-e "$base/$asm.gkpStore/info") && (getNumberOfReadsInStore($base, $asm) == 0)) {
print STDERR "-- Removing empty or incomplate gkpStore '$base/$asm.gkpStore'\n";
- runCommandSilently($base, "rm -rf ./$asm.gkpStore", 1);
+ remove_tree("$asm.gkpStore");
}
# Store with reads? Yay! Report it, then skip.
@@ -476,7 +498,7 @@ sub gatekeeper ($$@) {
goto allDone if (getNumberOfReadsInStore($base, $asm) > 0);
# Create the store. If all goes well, we get asm.gkpStore. If not, we could end up with
- # asm.BUILDING.gkpStore and ask the user to examine it and rename it to asm.ACCEPTED.gkpStore
+ # asm.gkpStore.BUILDING and ask the user to examine it and rename it to asm.gkpStore.ACCEPTED
# and restart. On the restart, gatekeeperCreateStore() detects the 'ACCPETED' store and
# renames to asm.gkpStore.
diff --git a/src/pipelines/canu/Grid_LSF.pm b/src/pipelines/canu/Grid_LSF.pm
index 22c6f00..2d584de 100644
--- a/src/pipelines/canu/Grid_LSF.pm
+++ b/src/pipelines/canu/Grid_LSF.pm
@@ -63,7 +63,7 @@ sub configureLSF () {
setGlobalIfUndef("gridEngineArrayName", "ARRAY_NAME\[ARRAY_JOBS\]");
setGlobalIfUndef("gridEngineArrayMaxJobs", 65535);
setGlobalIfUndef("gridEngineOutputOption", "-o");
- setGlobalIfUndef("gridEngineThreadsOption", "-n THREADS");
+ setGlobalIfUndef("gridEngineThreadsOption", "-R span[hosts=1] -n THREADS");
setGlobalIfUndef("gridEngineMemoryOption", "-M MEMORY");
setGlobalIfUndef("gridEnginePropagateCommand", "bmodify -w \"done\(\"WAIT_TAG\"\)\"");
setGlobalIfUndef("gridEngineNameToJobIDCommand", "bjobs -A -J \"WAIT_TAG\" | grep -v JOBID");
diff --git a/src/pipelines/canu/Grid_PBSTorque.pm b/src/pipelines/canu/Grid_PBSTorque.pm
index 50c1621..c5705d0 100644
--- a/src/pipelines/canu/Grid_PBSTorque.pm
+++ b/src/pipelines/canu/Grid_PBSTorque.pm
@@ -158,6 +158,26 @@ sub configurePBSTorque () {
my $isPro = (uc(getGlobal("gridEngine")) eq "PBSPRO");
+ # For Torque, see if there is a max array size.
+ # For Pro, set to 1000.
+
+ my $maxArraySize = getGlobal("gridEngineArrayMaxJobs");
+
+ if (!defined($maxArraySize)) {
+ $maxArraySize = 1000;
+
+ open(F, "qmgr -c 'p s' |");
+ while (<F>) {
+ if (m/max_job_array_size\s+=\s+(\d+)/) { # Torque
+ $maxArraySize = $1;
+ }
+ if (m/max_array_size\s+=\s+(\d+)/) { # PBSPro
+ $maxArraySize = $1;
+ }
+ }
+ close(F);
+ }
+
# PBSPro, again, throws a curve ball at us. There is no way to set the output of array jobs
# to someting reasonable like name.TASK_ID.err, even though that is basically the default.
# So, we unset gridEngineArraySubmitID to get the default name, but then need to move the '-j oe'
@@ -169,7 +189,7 @@ sub configurePBSTorque () {
setGlobalIfUndef("gridEngineArrayOption", "-t ARRAY_JOBS") if ($isPro == 0);
setGlobalIfUndef("gridEngineArrayOption", "-J ARRAY_JOBS") if ($isPro == 1);
setGlobalIfUndef("gridEngineArrayName", "ARRAY_NAME");
- setGlobalIfUndef("gridEngineArrayMaxJobs", 268435456); # Effectively unlimited.
+ setGlobalIfUndef("gridEngineArrayMaxJobs", $maxArraySize);
setGlobalIfUndef("gridEngineOutputOption", "-o");
setGlobalIfUndef("gridEngineThreadsOption", "-l nodes=1:ppn=THREADS");
setGlobalIfUndef("gridEngineMemoryOption", "-l mem=MEMORY");
diff --git a/src/pipelines/canu/Meryl.pm b/src/pipelines/canu/Meryl.pm
index c6068ea..57f263d 100644
--- a/src/pipelines/canu/Meryl.pm
+++ b/src/pipelines/canu/Meryl.pm
@@ -478,6 +478,7 @@ sub merylConfigure ($$) {
close(F);
+ makeExecutable("$path/meryl.sh");
stashFile("$path/meryl.sh");
finishStage:
@@ -514,18 +515,19 @@ sub merylCheck ($$) {
# shows how to process multiple jobs. This only checks for the existence of the final outputs.
# (unitigger is the same)
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- meryl failed.\n";
+ print STDERR "-- Meryl failed, tried $attempt times, giving up.\n";
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to generate mer counts. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Meryl failed, retry.\n";
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/pipelines/canu/Output.pm b/src/pipelines/canu/Output.pm
index 8269bb0..bc921e2 100644
--- a/src/pipelines/canu/Output.pm
+++ b/src/pipelines/canu/Output.pm
@@ -174,6 +174,13 @@ sub generateOutputs ($) {
stashFile("$asm.unitigs.gfa");
}
+ if ((! fileExists("$asm.unitigs.bed")) &&
+ ( fileExists("unitigging/4-unitigger/$asm.unitigs.aligned.bed"))) {
+ fetchFile("unitigging/4-unitigger/$asm.unitigs.aligned.bed");
+ copy("unitigging/4-unitigger/$asm.unitigs.aligned.bed", "$asm.unitigs.bed");
+ stashFile("$asm.unitigs.bed");
+ }
+
# User-supplied termination command.
if (defined(getGlobal("onSuccess"))) {
diff --git a/src/pipelines/canu/OverlapBasedTrimming.pm b/src/pipelines/canu/OverlapBasedTrimming.pm
index 4abf45a..67b2b68 100644
--- a/src/pipelines/canu/OverlapBasedTrimming.pm
+++ b/src/pipelines/canu/OverlapBasedTrimming.pm
@@ -230,8 +230,7 @@ sub dumpReads ($) {
stashFile("./$asm.trimmedReads.fasta.gz");
- # Need gatekeeperDumpFASTQ to also write a gkp input file
- #touch("../$asm.trimmedReads.gkp");
+ remove_tree("trimming/$asm.ovlStore") if (getGlobal("saveOverlaps") eq "0");
finishStage:
emitStage($asm, "obt-dumpReads");
diff --git a/src/pipelines/canu/OverlapErrorAdjustment.pm b/src/pipelines/canu/OverlapErrorAdjustment.pm
index 5db2253..0160015 100644
--- a/src/pipelines/canu/OverlapErrorAdjustment.pm
+++ b/src/pipelines/canu/OverlapErrorAdjustment.pm
@@ -184,7 +184,7 @@ sub readErrorDetectionConfigure ($) {
print F "fi\n";
}
- print F "jobid=`printf %04d \$jobid`\n";
+ print F "jobid=`printf %05d \$jobid`\n";
print F "\n";
print F "if [ -e ./\$jobid.red ] ; then\n";
print F " echo Job previously completed successfully.\n";
@@ -206,8 +206,7 @@ sub readErrorDetectionConfigure ($) {
close(F);
- chmod 0755, "$path/red.sh";
-
+ makeExecutable("$path/red.sh");
stashFile("$path/red.sh");
finishStage:
@@ -246,7 +245,7 @@ sub readErrorDetectionCheck ($) {
open(A, "< $path/red.sh") or caExit("can't open '$path/red.sh' for reading: $!", undef);
while (<A>) {
if (m/if.*jobid\s+=\s+(\d+)\s+.*then/) {
- my $ji = substr("0000" . $1, -4);
+ my $ji = substr("00000" . $1, -5);
my $jn = "unitigging/3-overlapErrorAdjustment/$ji.red";
if (! fileExists($jn)) {
@@ -263,19 +262,21 @@ sub readErrorDetectionCheck ($) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " read error detection jobs failed:\n";
+ print STDERR "-- Read error detection jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to detect errors in reads. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Read error detection jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -496,7 +497,7 @@ sub overlapErrorAdjustmentConfigure ($) {
print F "fi\n";
}
- print F "jobid=`printf %04d \$jobid`\n";
+ print F "jobid=`printf %05d \$jobid`\n";
print F "\n";
print F "if [ -e ./\$jobid.oea ] ; then\n";
print F " echo Job previously completed successfully.\n";
@@ -520,8 +521,7 @@ sub overlapErrorAdjustmentConfigure ($) {
close(F);
- chmod 0755, "$path/oea.sh";
-
+ makeExecutable("$path/oea.sh");
stashFile("$path/oea.sh");
finishStage:
@@ -567,7 +567,7 @@ sub overlapErrorAdjustmentCheck ($) {
open(A, "< $path/oea.sh") or caExit("can't open '$path/oea.sh' for reading: $!", undef);
while (<A>) {
if (m/if.*jobid\s+=\s+(\d+)\s+.*then/) {
- my $ji = substr("0000" . $1, -4);
+ my $ji = substr("00000" . $1, -5);
if (! fileExists("unitigging/3-overlapErrorAdjustment/$ji.oea")) {
$failureMessage .= "-- job $ji.oea FAILED.\n";
@@ -583,19 +583,21 @@ sub overlapErrorAdjustmentCheck ($) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " overlap error adjustment jobs failed:\n";
+ print STDERR "-- Overlap error adjustment jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to adjust overlap error rates. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Overlap error adjustment jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/pipelines/canu/OverlapInCore.pm b/src/pipelines/canu/OverlapInCore.pm
index 4997172..45c1798 100644
--- a/src/pipelines/canu/OverlapInCore.pm
+++ b/src/pipelines/canu/OverlapInCore.pm
@@ -232,8 +232,7 @@ sub overlapConfigure ($$$) {
print F "exit 0\n";
close(F);
- system("chmod +x $path/overlap.sh");
-
+ makeExecutable("$path/overlap.sh");
stashFile("$path/overlap.sh");
}
@@ -391,7 +390,7 @@ sub overlapCheck ($$$) {
push @miscJobs, "1-overlapper/$1.counts\n";
} else {
- $failureMessage .= "-- job 1-overlapper/$1.ovb FAILED.\n";
+ $failureMessage .= "-- job $path/$1.ovb FAILED.\n";
push @failedJobs, $currentJobID;
}
@@ -405,19 +404,21 @@ sub overlapCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " overlapper jobs failed:\n";
+ print STDERR "-- Overlap jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to overlap. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Overlap jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/pipelines/canu/OverlapMMap.pm b/src/pipelines/canu/OverlapMMap.pm
index a190189..9cee643 100644
--- a/src/pipelines/canu/OverlapMMap.pm
+++ b/src/pipelines/canu/OverlapMMap.pm
@@ -69,18 +69,42 @@ sub mmapConfigure ($$$) {
goto allDone if (-e "$base/$asm.ovlStore");
goto allDone if (fileExists("$base/$asm.ovlStore.tar"));
+ my $numPacBioRaw = 0;
+ my $numPacBioCorrected = 0;
+ my $numNanoporeRaw = 0;
+ my $numNanoporeCorrected = 0;
+
+ open(L, "< $base/$asm.gkpStore/libraries.txt") or caExit("can't open '$base/$asm.gkpStore/libraries.txt' for reading: $!", undef);
+ while (<L>) {
+ $numPacBioRaw++ if (m/pacbio-raw/);
+ $numPacBioCorrected++ if (m/pacbio-corrected/);
+ $numNanoporeRaw++ if (m/nanopore-raw/);
+ $numNanoporeCorrected++ if (m/nanopore-corrected/);
+ }
+ close(L);
+ my $parameters = "";
+ if ($numPacBioRaw > 0) {
+ $parameters = "-x ava-pb";
+ } elsif ($numNanoporeRaw > 0) {
+ $parameters = "-x ava-ont";
+ } elsif ($numPacBioCorrected > 0) {
+ $parameters = "-x ava-pb -c -Hk21 -w14"; #tuned to find 1000bp 5% error
+ } elsif ($numNanoporeCorrected > 0) {
+ $parameters = "-x ava-ont -c -k17 -w11"; #tuned to find 1000bp 15% error
+ } else {
+ caFailiure("--ERROR: no know read types found in $base/$asm.gkpStore/libraries.txt")
+ }
+
print STDERR "--\n";
- print STDERR "-- OVERLAPPER (mmap) (correction)\n" if ($tag eq "cor");
- print STDERR "-- OVERLAPPER (mmap) (trimming)\n" if ($tag eq "obt");
- print STDERR "-- OVERLAPPER (mmap) (assembly)\n" if ($tag eq "utg");
+ print STDERR "-- OVERLAPPER (mmap) (correction) with $parameters\n" if ($tag eq "cor");
+ print STDERR "-- OVERLAPPER (mmap) (trimming) with $parameters\n" if ($tag eq "obt");
+ print STDERR "-- OVERLAPPER (mmap) (assembly) with $parameters\n" if ($tag eq "utg");
print STDERR "--\n";
make_path($path) if (! -d $path);
# Constants.
- my $merSize = getGlobal("${tag}MMapMerSize");
-
my $numReads = getNumberOfReadsInStore($base, $asm);
my $memorySize = getGlobal("${tag}mmapMemory");
my $blockPerGb = getGlobal("${tag}MMapBlockSize");
@@ -247,8 +271,29 @@ sub mmapConfigure ($$$) {
print F " -o ./blocks/\$job.input \\\n";
print F "&& \\\n";
print F "mv -f ./blocks/\$job.input.fasta ./blocks/\$job.fasta\n";
+ print F "if [ ! -e ./blocks/\$job.fasta ] ; then\n";
+ print F " echo Failed to extract fasta.\n";
+ print F " exit 1\n";
+ print F "fi\n";
+ print F "\n";
+ print F "\n";
+ print F "echo \"\"\n";
+ print F "echo Starting mmap precompute.\n";
+ print F "echo \"\"\n";
+ print F "\n";
+ print F " \$bin/minimap2 \\\n";
+ print F " $parameters -t ", getGlobal("${tag}mmapThreads"), " \\\n";
+ print F " -d ./blocks/\$job.input.mmi\\\n";
+ print F " ./blocks/\$job.fasta \\\n";
+ print F "&& \\\n";
+ print F "mv -f ./blocks/\$job.input.mmi ./blocks/\$job.mmi\n";
+ print F "\n";
+ print F "if [ ! -e ./blocks/\$job.mmi ] ; then\n";
+ print F " echo MMap failed.\n";
+ print F " exit 1\n";
+ print F "fi\n";
print F "\n";
- print F stashFileShellCode("$base/1-overlapper/blocks", "\$job.fasta", "");
+ print F stashFileShellCode("$base/1-overlapper/blocks", "\$job.mmi", "");
print F "\n";
print F "exit 0\n";
@@ -300,7 +345,7 @@ sub mmapConfigure ($$$) {
print F "fi\n";
print F "\n";
- print F fetchFileShellCode("$path", "blocks/\$blk.fasta", "");
+ print F fetchFileShellCode("$path", "blocks/\$blk.mmi", "");
print F "for ii in `ls ./queries/\$qry` ; do\n";
print F " echo Fetch blocks/\$ii\n";
@@ -314,13 +359,9 @@ sub mmapConfigure ($$$) {
print F "if [ x\$slf = x ]; then\n";
print F " > ./results/\$qry.mmap.WORKING\n";
print F "else\n";
- print F " \$bin/minimap \\\n";
- print F " -k $merSize \\\n";
- print F " -Sw5 \\\n";
- print F " -L100 \\\n";
- print F " -m0 \\\n";
- print F " -t ", getGlobal("${tag}mmapThreads"), " \\\n";
- print F " ./blocks/\$blk.fasta \\\n";
+ print F " \$bin/minimap2 \\\n";
+ print F " $parameters -t ", getGlobal("${tag}mmapThreads"), " \\\n";
+ print F " ./blocks/\$blk.mmi \\\n";
print F " ./blocks/\$blk.fasta \\\n";
print F " > ./results/\$qry.mmap.WORKING \n";
print F " \n";
@@ -328,13 +369,9 @@ sub mmapConfigure ($$$) {
print F "\n";
print F "for file in `ls queries/\$qry/*.fasta`; do\n";
- print F " \$bin/minimap \\\n";
- print F " -k $merSize \\\n";
- print F " -Sw5 \\\n";
- print F " -L100 \\\n";
- print F " -m0 \\\n";
- print F " -t ", getGlobal("${tag}mmapThreads"), " \\\n";
- print F " ./blocks/\$blk.fasta \\\n";
+ print F " \$bin/minimap2 \\\n";
+ print F " $parameters -t ", getGlobal("${tag}mmapThreads"), " \\\n";
+ print F " ./blocks/\$blk.mmi \\\n";
print F " \$file \\\n";
print F " >> ./results/\$qry.mmap.WORKING \n";
print F "done\n";
@@ -347,6 +384,9 @@ sub mmapConfigure ($$$) {
print F " \$bin/mmapConvert \\\n";
print F " -G ../$asm.gkpStore \\\n";
print F " -o ./results/\$qry.mmap.ovb.WORKING \\\n";
+ print F " -partial \\\n" if ($typ eq "partial");
+ print F " -tolerance 100 \\\n" if ($typ eq "normal");
+ print F " -len " , getGlobal("minOverlapLength"), " \\\n";
print F " ./results/\$qry.mmap \\\n";
print F " && \\\n";
print F " mv ./results/\$qry.mmap.ovb.WORKING ./results/\$qry.mmap.ovb\n";
@@ -410,6 +450,9 @@ sub mmapConfigure ($$$) {
print STDERR "-- Configured $numJobs mmap overlap jobs.\n";
}
+ makeExecutable("$path/precompute.sh");
+ makeExecutable("$path/mhap.sh");
+
stashFile("$path/precompute.sh");
stashFile("$path/mhap.sh");
@@ -454,10 +497,10 @@ sub mmapPrecomputeCheck ($$$) {
open(F, "< $path/precompute.sh") or caFailure("can't open '$path/precompute.sh' for reading: $!", undef);
while (<F>) {
if (m/^\s+job=\"(\d+)\"$/) {
- if (fileExists("$path/blocks/$1.fasta")) {
- push @successJobs, "1-overlapper/blocks/$1.fasta\n";
+ if (fileExists("$path/blocks/$1.mmi")) {
+ push @successJobs, "1-overlapper/blocks/$1.mmi\n";
} else {
- $failureMessage .= "-- job 1-overlapper/blocks/$1.fasta FAILED.\n";
+ $failureMessage .= "-- job $path/blocks/$1.mmi FAILED.\n";
push @failedJobs, $currentJobID;
}
@@ -470,19 +513,21 @@ sub mmapPrecomputeCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " mmap precompute jobs failed:\n";
+ print STDERR "-- MiniMap precompute jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to precompute mmap indices. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- MiniMap precompute jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -570,7 +615,7 @@ sub mmapCheck ($$$) {
push @miscJobs, "1-overlapper/results/$1.counts\n";
} else {
- $failureMessage .= "-- job 1-overlapper/results/$1.ovb FAILED.\n";
+ $failureMessage .= "-- job $path/results/$1.ovb FAILED.\n";
push @failedJobs, $currentJobID;
}
@@ -582,7 +627,7 @@ sub mmapCheck ($$$) {
# Also find the queries symlinks so we can remove those. And the query directories, because
# the last directory can be empty, and so we'd never see it at all if only finding files.
- open(F, "find $path/queries -print |");
+ open(F, "cd $base && find 1-overlapper/queries -print |");
while (<F>) {
push @mmapJobs, $_;
}
@@ -592,19 +637,21 @@ sub mmapCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " mmap jobs failed:\n";
+ print STDERR "-- MiniMap overlap jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to compute mmap overlaps. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- MiniMap overlap jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/pipelines/canu/OverlapMhap.pm b/src/pipelines/canu/OverlapMhap.pm
index caee4bb..9307775 100644
--- a/src/pipelines/canu/OverlapMhap.pm
+++ b/src/pipelines/canu/OverlapMhap.pm
@@ -388,6 +388,7 @@ sub mhapConfigure ($$$) {
print F " --ordered-kmer-size $ordSketchMer \\\n";
print F " --threshold $threshold \\\n";
print F " --filter-threshold $filterThreshold \\\n";
+ print F " --min-olap-length ", getGlobal("minOverlapLength"), " \\\n";
print F " --num-threads ", getGlobal("${tag}mhapThreads"), " \\\n";
print F " " . getGlobal("${tag}MhapOptions") . " \\\n" if (defined(getGlobal("${tag}MhapOptions")));
print F " -f $cygA ../../0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz $cygB \\\n" if (-e "$base/0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz");
@@ -487,6 +488,7 @@ sub mhapConfigure ($$$) {
print F " --filter-threshold $filterThreshold \\\n";
print F " --ordered-sketch-size $ordSketch \\\n";
print F " --ordered-kmer-size $ordSketchMer \\\n";
+ print F " --min-olap-length ", getGlobal("minOverlapLength"), " \\\n";
print F " --num-threads ", getGlobal("${tag}mhapThreads"), " \\\n";
print F " " . getGlobal("${tag}MhapOptions") . " \\\n" if (defined(getGlobal("${tag}MhapOptions")));
print F " -s $cygA ./blocks/\$blk.dat \$slf $cygB \\\n";
@@ -569,6 +571,9 @@ sub mhapConfigure ($$$) {
print STDERR "-- Configured $numJobs mhap overlap jobs.\n";
}
+ makeExecutable("$path/precompute.sh");
+ makeExecutable("$path/mhap.sh");
+
stashFile("$path/precompute.sh");
stashFile("$path/mhap.sh");
@@ -618,7 +623,7 @@ sub mhapPrecomputeCheck ($$$) {
if (fileExists("$path/blocks/$1.dat")) {
push @successJobs, "1-overlapper/blocks/$1.dat\n";
} else {
- $failureMessage .= "-- job 1-overlapper/blocks/$1.dat FAILED.\n";
+ $failureMessage .= "-- job $path/blocks/$1.dat FAILED.\n";
push @failedJobs, $currentJobID;
}
@@ -631,19 +636,21 @@ sub mhapPrecomputeCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " mhap precompute jobs failed:\n";
+ print STDERR "-- Mhap precompute jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to precompute mhap indices. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Mhap precompute jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -734,7 +741,7 @@ sub mhapCheck ($$$) {
push @miscJobs, "1-overlapper/results/$1.counts\n";
} else {
- $failureMessage .= "-- job 1-overlapper/results/$1.ovb FAILED.\n";
+ $failureMessage .= "-- job $path/results/$1.ovb FAILED.\n";
push @failedJobs, $currentJobID;
}
@@ -746,7 +753,7 @@ sub mhapCheck ($$$) {
# Also find the queries symlinks so we can remove those. And the query directories, because
# the last directory can be empty, and so we'd never see it at all if only finding files.
- open(F, "find $path/queries -print |");
+ open(F, "cd $base && find 1-overlapper/queries -print |");
while (<F>) {
push @mhapJobs, $_;
}
@@ -756,19 +763,21 @@ sub mhapCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " mhap jobs failed:\n";
+ print STDERR "-- Mhap overlap jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to compute mhap overlaps. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Mhap overlap jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/pipelines/canu/OverlapStore.pm b/src/pipelines/canu/OverlapStore.pm
index 6579e9e..e69f40b 100644
--- a/src/pipelines/canu/OverlapStore.pm
+++ b/src/pipelines/canu/OverlapStore.pm
@@ -44,6 +44,7 @@ require Exporter;
use strict;
use File::Basename; # dirname
+use File::Path 2.08 qw(make_path remove_tree);
use POSIX qw(ceil);
use canu::Defaults;
@@ -203,14 +204,13 @@ sub overlapStoreConfigure ($$$) {
open(F, "> $path/scripts/0-config.sh") or die;
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F setWorkDirectoryShellCode($path);
- print F "\n";
+ #print F setWorkDirectoryShellCode($path); # This is never run on grid, so don't need to cd first.
+ #print F "\n";
#print F getJobIDShellCode();
#print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
- print F getLimitShellCode("processes");
- print F getLimitShellCode("files");
+ print F getLimitShellCode();
print F "\n";
print F "\$bin/ovStoreBuild \\\n";
print F " -G ./$asm.gkpStore \\\n";
@@ -220,7 +220,8 @@ sub overlapStoreConfigure ($$$) {
print F " -L ./1-overlapper/ovljob.files \\\n";
close(F);
}
- system("chmod +x $path/scripts/0-config.sh");
+ makeExecutable("$path/scripts/0-config.sh");
+ stashFile("$path/scripts/0-config.sh");
if (! -e "$path/config") {
$cmd = "./$asm.ovlStore.BUILDING/scripts/0-config.sh \\\n";
@@ -285,8 +286,7 @@ sub overlapStoreConfigure ($$$) {
print F " rm -rf \"./create\$bn\"\n";
print F "fi\n";
print F "\n";
- print F getLimitShellCode("processes");
- print F getLimitShellCode("files");
+ print F getLimitShellCode();
print F "\n";
print F "\$bin/ovStoreBucketizer \\\n";
print F " -O . \\\n";
@@ -295,14 +295,6 @@ sub overlapStoreConfigure ($$$) {
#print F " -e " . getGlobal("") . " \\\n" if (defined(getGlobal("")));
print F " -job \$jobid \\\n";
print F " -i \$jn\n";
- print F "\n";
- print F "if [ \$? = 0 ] ; then\n";
- print F " echo Success.\n";
- print F " exit 0\n";
- print F "else\n";
- print F " echo Failure.\n";
- print F " exit 1\n";
- print F "fi\n";
close(F);
}
@@ -320,8 +312,7 @@ sub overlapStoreConfigure ($$$) {
print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
- print F getLimitShellCode("processes");
- print F getLimitShellCode("files");
+ print F getLimitShellCode();
print F "\n";
print F "\$bin/ovStoreSorter \\\n";
print F " -deletelate \\\n"; # Choices -deleteearly -deletelate or nothing
@@ -330,14 +321,6 @@ sub overlapStoreConfigure ($$$) {
print F " -G ../$asm.gkpStore \\\n";
print F " -F $numSlices \\\n";
print F " -job \$jobid $numInputs\n";
- print F "\n";
- print F "if [ \$? = 0 ] ; then\n";
- print F " echo Success.\n";
- print F " exit 0\n";
- print F "else\n";
- print F " echo Failure.\n";
- print F " exit 1\n";
- print F "fi\n";
close(F);
}
@@ -347,8 +330,8 @@ sub overlapStoreConfigure ($$$) {
open(F, "> $path/scripts/3-index.sh") or die;
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F setWorkDirectoryShellCode($path);
- print F "\n";
+ #print F setWorkDirectoryShellCode($path); # This is never run on grid, so don't need to cd first.
+ #print F "\n";
#print F getJobIDShellCode();
#print F "\n";
print F getBinDirectoryShellCode();
@@ -357,20 +340,16 @@ sub overlapStoreConfigure ($$$) {
#print F " -nodelete \\\n"; # Choices -nodelete or nothing
print F " -O . \\\n";
print F " -F $numSlices\n";
- print F "\n";
- print F "if [ \$? = 0 ] ; then\n";
- print F " echo Success.\n";
- print F " exit 0\n";
- print F "else\n";
- print F " echo Failure.\n";
- print F " exit 1\n";
- print F "fi\n";
close(F);
}
- system("chmod +x $path/scripts/1-bucketize.sh");
- system("chmod +x $path/scripts/2-sort.sh");
- system("chmod +x $path/scripts/3-index.sh");
+ makeExecutable("$path/scripts/1-bucketize.sh");
+ makeExecutable("$path/scripts/2-sort.sh");
+ makeExecutable("$path/scripts/3-index.sh");
+
+ stashFile("$path/scripts/1-bucketize.sh");
+ stashFile("$path/scripts/2-sort.sh");
+ stashFile("$path/scripts/3-index.sh");
finishStage:
emitStage($asm, "$tag-overlapStoreConfigure");
@@ -431,19 +410,21 @@ sub overlapStoreBucketizerCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " overlap store bucketizer jobs failed:\n";
+ print STDERR "-- Overlap store bucketizer jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to overlapStoreBucketize. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Overlap store bucketizer jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -528,19 +509,22 @@ sub overlapStoreSorterCheck ($$$) {
if (scalar(@failedJobs) > 0) {
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
+
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- ", scalar(@failedJobs), " overlap store sorter jobs failed:\n";
+ print STDERR "-- Overlap store sorting jobs failed, tried $attempt times, giving up.\n";
print STDERR $failureMessage;
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to overlapStoreSorter. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Overlap store sorting jobs failed, retry.\n";
+ print STDERR $failureMessage;
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
@@ -565,7 +549,6 @@ sub overlapStoreSorterCheck ($$$) {
-
sub createOverlapStoreParallel ($$$) {
my $base = shift @_;
my $asm = shift @_;
@@ -584,6 +567,51 @@ sub createOverlapStoreParallel ($$$) {
}
+
+sub checkOverlapStore ($$) {
+ my $base = shift @_;
+ my $asm = shift @_;
+
+ my $bin = getBinDirectory();
+ my $cmd;
+
+ $cmd = "$bin/ovStoreDump \\\n";
+ $cmd .= " -G ./$asm.gkpStore \\\n";
+ $cmd .= " -O ./$asm.ovlStore \\\n";
+ $cmd .= " -d -counts \\\n";
+ $cmd .= " > ./$asm.ovlStore/counts.dat 2> ./$asm.ovlStore/counts.err";
+
+ print STDERR "-- Checking store.\n";
+
+ if (runCommand($base, $cmd)) {
+ caExit("failed to dump counts of overlaps; invalid store?", "./$asm.ovlStore/counts.err");
+ }
+
+ my $totOvl = 0;
+ my $nulReads = 0;
+ my $ovlReads = 0;
+
+ open(F, "< ./$base/$asm.ovlStore/counts.dat") or die;
+ while (<F>) {
+ my @v = split '\s+', $_;
+
+ $nulReads += 1 if ($v[1] < 1);
+ $ovlReads += 1 if ($v[1] > 0);
+ $totOvl += $v[1];
+ }
+ close(F);
+
+ print STDERR "--\n";
+ print STDERR "-- Overlap store '$base/$asm.ovlStore' successfully constructed.\n";
+ print STDERR "-- Found $totOvl overlaps for $ovlReads reads; $nulReads reads have no overlaps.\n";
+ print STDERR "--\n";
+
+ unlink "./$base/$asm.ovlStore/counts.dat";
+ unlink "./$base/$asm.ovlStore/counts.err";
+}
+
+
+
sub generateOverlapStoreStats ($$) {
my $base = shift @_;
my $asm = shift @_;
@@ -606,16 +634,6 @@ sub generateOverlapStoreStats ($$) {
}
unlink "$base/$asm.ovlStore.summary.err";
-
- my $report;
-
- open(F, "< $base/$asm.ovlStore.summary") or caExit("Failed to open overlap store statistics in '$base/$asm.ovlStore.summary': $!", undef);
- while (<F>) {
- $report .= "-- $_";
- }
- close(F);
-
- addToReport("overlaps", $report);
}
@@ -643,13 +661,24 @@ sub createOverlapStore ($$$) {
createOverlapStoreSequential($base, $asm, $tag) if ($seq eq "sequential");
createOverlapStoreParallel ($base, $asm, $tag) if ($seq eq "parallel");
- print STDERR "--\n";
- print STDERR "-- Overlap store '$base/$asm.ovlStore' successfully constructed.\n";
+ checkOverlapStore($base, $asm);
goto finishStage if (getGlobal("saveOverlaps") eq "1");
- # Delete the inputs and directories. Some contortions are needed to get directory deletes in order.
- # In particular, mhap's queries directory needs to be deleted after it's subdirectories are.
+ # Delete the inputs and directories.
+ #
+ # Directories - Viciously remove the whole thing (after all files are deleted, so we
+ # can get the sizes).
+ # Files - Sum the size, remove the file, and try to remove the directory. In
+ # particular, we don't want to remove_tree() this directory - there could
+ # be other stuff in it - only remove if empty.
+ #
+ # Ideally, every directory we have in our list should be empty after we delete the files in the
+ # list. But they won't be. Usually because there are empty directories in there too. Maybe
+ # some stray files we didn't track. Regardless, just blow them away.
+ #
+ # Previous (to July 2017) versions tried to gently rmdir things, but it was ugly and didn't
+ # quite work.
my %directories;
my $bytes = 0;
@@ -666,51 +695,35 @@ sub createOverlapStore ($$$) {
while (<F>) {
chomp;
- # Decide what to do. Directories - register for later deletion. Files - sum size and
- # delete.
-
if (-d "$base/$_") {
- $directories{"$base/$_"}++;
+ print STDERR "DIRECTORY $base/$_\n";
+ $directories{$_}++;
} elsif (-e "$base/$_") {
+ print STDERR "FILE $base/$_\n";
$bytes += -s "$base/$_";
$files += 1;
unlink "$base/$_";
+ rmdir dirname("$base/$_"); # Try to rmdir the directory the file is in. If empty, yay!
}
-
- # If the path isn't a directory register the directory it is in for deletion.
-
- $directories{dirname("$base/$_")}++ if (! -d "$base/$_");
}
close(F);
-
- unlink $file;
}
- # Ideally, every directory we have in our list should be empty. But they won't be. So, loop until we fail to delete anything.
-
- my $dirs = 0;
- my $deleted = 1;
-
- while ($deleted > 0) {
- $deleted = 0;
-
- foreach my $dir (keys %directories) {
- if (-d $dir) {
- rmdir $dir;
- $dirs++;
- }
-
- if (! -d $dir) { # If really removed, remove it from our list.
- delete $directories{$dir};
- $deleted++;
- }
- }
+ foreach my $dir (keys %directories) {
+ print STDERR "REMOVE TREE $base/$dir\n";
+ remove_tree("$base/$dir");
}
+ unlink "$base/1-overlapper/ovljob.files";
+ unlink "$base/1-overlapper/ovljob.more.files";
+ unlink "$base/1-overlapper/mhap.files";
+ unlink "$base/1-overlapper/mmap.files";
+ unlink "$base/1-overlapper/precompute.files";
+
print STDERR "--\n";
- print STDERR "-- Purged ", int(1000 * $bytes / 1024 / 1024 / 1024) / 1000, " GB in $files overlap output files and $dirs directories.\n";
+ print STDERR "-- Purged ", int(1000 * $bytes / 1024 / 1024 / 1024) / 1000, " GB in $files overlap output files.\n";
# Now all done!
@@ -724,14 +737,19 @@ sub createOverlapStore ($$$) {
print STDERR "-- Overlap store '$base/$asm.ovlStore' contains:\n";
print STDERR "--\n";
+ my $report;
+
open(F, "< $base/$asm.ovlStore.summary") or caExit("Failed to open overlap store statistics in '$base/$asm.ovlStore': $!", undef);
while (<F>) {
- print STDERR "-- $_";
+ $report .= "-- $_";
}
close(F);
+ addToReport("overlaps", $report); # Also shows it.
+
} else {
print STDERR "-- Overlap store '$base/$asm.ovlStore' statistics not available (skipped in correction and trimming stages).\n";
+ print STDERR "--\n";
}
emitStage($asm, "$tag-createOverlapStore");
diff --git a/src/pipelines/canu/Unitig.pm b/src/pipelines/canu/Unitig.pm
index fdc8194..501d633 100644
--- a/src/pipelines/canu/Unitig.pm
+++ b/src/pipelines/canu/Unitig.pm
@@ -191,7 +191,7 @@ sub unitig ($) {
print F "\n";
print F getJobIDShellCode();
print F "\n";
- print F "if [ -e unitigging/$asm.ctgStore/seqDB.v001.tig -a -e unitigging/$asm.utgStore/seqDB.v001.tig ] ; then\n";
+ print F "if [ -e ../$asm.ctgStore/seqDB.v001.tig -a -e ../$asm.utgStore/seqDB.v001.tig ] ; then\n";
print F " exit 0\n";
print F "fi\n";
print F "\n";
@@ -204,7 +204,7 @@ sub unitig ($) {
print F " -gs " . getGlobal("genomeSize") . " \\\n";
print F " -eg " . getGlobal("utgErrorRate") . " \\\n";
print F " -eM " . getGlobal("utgErrorRate") . " \\\n";
- print F " -el " . $overlapLength . " \\\n";
+ print F " -mo " . $overlapLength . " \\\n";
print F " -dg " . getGlobal("utgGraphDeviation") . " \\\n";
print F " -db " . getGlobal("utgGraphDeviation") . " \\\n";
print F " -dr " . getGlobal("utgRepeatDeviation") . " \\\n";
@@ -245,6 +245,7 @@ sub unitig ($) {
close(F);
+ makeExecutable("$path/unitigger.sh");
stashFile("$path/unitigger.sh");
finishStage:
@@ -273,18 +274,19 @@ sub unitigCheck ($) {
# shows how to process multiple jobs. This only checks for the existence of the final outputs.
# (meryl is the same)
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
+ # If too many attempts, give up.
- if ($attempt > 1) {
+ if ($attempt >= getGlobal("canuIterationMax")) {
print STDERR "--\n";
- print STDERR "-- Unitigger failed.\n";
+ print STDERR "-- Bogart failed, tried $attempt times, giving up.\n";
print STDERR "--\n";
+ caExit(undef, undef);
}
- # If too many attempts, give up.
-
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to generate unitigs. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ if ($attempt > 0) {
+ print STDERR "--\n";
+ print STDERR "-- Bogart failed, retry\n";
+ print STDERR "--\n";
}
# Otherwise, run some jobs.
diff --git a/src/stores/gatekeeperPartition.C b/src/stores/gatekeeperPartition.C
index 97c259a..dbfa13f 100644
--- a/src/stores/gatekeeperPartition.C
+++ b/src/stores/gatekeeperPartition.C
@@ -124,6 +124,7 @@ main(int argc, char **argv) {
uint32 tigStoreVers = 0;
uint32 readCountTarget = 2500; // No partition smaller than this
uint32 partCountTarget = 200; // No more than this many partitions
+ bool doDelete = false;
argc = AS_configure(argc, argv);
@@ -143,6 +144,11 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-p") == 0) {
partCountTarget = atoi(argv[++arg]);
+ } else if (strcmp(argv[arg], "-D") == 0) {
+ tigStorePath = argv[++arg];
+ tigStoreVers = 1;
+ doDelete = true;
+
} else {
char *s = new char [1024];
snprintf(s, 1024, "ERROR: unknown option '%s'\n", argv[arg]);
@@ -152,13 +158,16 @@ main(int argc, char **argv) {
arg++;
}
- if (gkpStorePath == NULL) err.push_back("ERROR: no gkpStore (-G) supplied.\n");
- if (tigStorePath == NULL) err.push_back("ERROR: no partition input (-P) supplied.\n");
+ if ((gkpStorePath == NULL) &&
+ (doDelete == false)) err.push_back("ERROR: no gkpStore (-G) supplied.\n");
+ if (tigStorePath == NULL) err.push_back("ERROR: no tigStore (-T) supplied.\n");
if (err.size() > 0) {
- fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v>\n", argv[0]);
+ fprintf(stderr, "usage: %s [-G <gkpStore> -T <tigStore> <v>] ...\n", argv[0]);
+ fprintf(stderr, " %s [-D <tigStore>]\n", argv[0]);
fprintf(stderr, " -G <gkpStore> path to gatekeeper store\n");
fprintf(stderr, " -T <tigStore> <v> path to tig store and version to be partitioned\n");
+ fprintf(stderr, " -D <tigStore> remove a partitioned gkpStore\n");
fprintf(stderr, "\n");
fprintf(stderr, " -b <nReads> minimum number of reads per partition (50000)\n");
fprintf(stderr, " -p <nPartitions> number of partitions (200)\n");
@@ -166,14 +175,14 @@ main(int argc, char **argv) {
fprintf(stderr, "Create a partitioned copy of <gkpStore> and place it in <tigStore>/partitionedReads.gkpStore\n");
fprintf(stderr, "\n");
fprintf(stderr, "NOTE: Path handling in this is probably quite brittle. Due to an implementation\n");
- fprintf(stderr, " detail, the new store must have symlinks back to the original store. Canu \n");
- fprintf(stderr, " wants to use relative paths, and this program tries to adjust <gkpStore> to be\n");
- fprintf(stderr, " relative to <tigStore/partitionedReads.gkpStore. If it fails to do this correctly,\n");
- fprintf(stderr, " one of two (seen so far) errors will occur:\n");
+ fprintf(stderr, " detail, the new store must have symlinks back to the original store. Canu\n");
+ fprintf(stderr, " wants to use relative paths, and this program tries to adjust <gkpStore> to\n");
+ fprintf(stderr, " be relative to <tigStore>/partitionedReads.gkpStore. If it fails to do this correctly,\n");
+ fprintf(stderr, " one of two (seen so far) errors will occur:\n");
fprintf(stderr, " Original file '.../partitionedReads.gkpStore/info' doesn't exist, won't make a link to nothing.\n");
- fprintf(stderr, " Couldn't open '.../partitionedReads.gkpStore/libraries' for mmap: No such file or directoryn");
- fprintf(stderr, " In both cases, try to simplify <tigStore> -- in particular, remove any '..' or '.' components -- or\n");
- fprintf(stderr, " run this from a higher/lower directory.\n");
+ fprintf(stderr, " Couldn't open '.../partitionedReads.gkpStore/libraries' for mmap: No such file or directory\n");
+ fprintf(stderr, " In both cases, try to simplify <tigStore> -- in particular, remove any '..'\n");
+ fprintf(stderr, " or '.' components -- or run this from a higher/lower directory.\n");
for (uint32 ii=0; ii<err.size(); ii++)
if (err[ii])
@@ -190,15 +199,21 @@ main(int argc, char **argv) {
snprintf(gkpClonePath, FILENAME_MAX, "%s/partitionedReads.gkpStore", tigStorePath);
// The path to the gkpStore that we want to use in the link is a wee-bit more complicated.
- // If it's an absolute path, there's nothing we need to do.
+ //
+ // 1. Do nothing if there is no gkpStore.
+ // 2. If it's an absolute path, there's nothing we need to do.
+ // 3. But if it's a relative path, we need to add a bunch of dots. One pair
+ // to account for the directory we added above, and then more dots for
+ // each component in tigStorePath.
+ //
+
+ if (gkpStorePath == NULL) {
+ }
- if (gkpStorePath[0] == '/') {
+ else if (gkpStorePath[0] == '/') {
strcpy(gkpSourcePath, gkpStorePath);
}
- // But if it's a relative path, we need to add a bunch of dots. One pair to account
- // for the directory we added above, and then more dots for each component in tigStorePath.
-
else {
char t[FILENAME_MAX]; // Copy command line tigStorePath to a
char *p = t; // local, and modifiable, space.
@@ -218,30 +233,32 @@ main(int argc, char **argv) {
strcat(gkpSourcePath, gkpStorePath); // start.
}
- // Make the clone.
+ // Now, just....do it.
- gkStore::gkStore_clone(gkpSourcePath, gkpClonePath);
-
- // Open the clone.
+ if (doDelete == true) {
+ gkStore *gkpStore = gkStore::gkStore_open(gkpClonePath, gkStore_readOnly);
+ gkpStore->gkStore_deletePartitions();
+ gkpStore->gkStore_close();
+ }
- gkStore *gkpStore = gkStore::gkStore_open(gkpClonePath, gkStore_readOnly);
- // Scan all the tigs to build a map from read to partition.
+ if (doDelete == false) {
+ gkStore::gkStore_clone(gkpSourcePath, gkpClonePath); // Make the clone.
- uint32 *partition = buildPartition(tigStorePath, tigStoreVers,
- readCountTarget,
- partCountTarget,
- gkpStore->gkStore_getNumReads());
+ gkStore *gkpStore = gkStore::gkStore_open(gkpClonePath, gkStore_readOnly); // Open the clone.
- // Dump the partition data to the store, let it build partitions.
+ uint32 *partition = buildPartition(tigStorePath, tigStoreVers, // Scan all the tigs
+ readCountTarget, // to build a map from
+ partCountTarget, // read to partition.
+ gkpStore->gkStore_getNumReads());
- gkpStore->gkStore_buildPartitions(partition);
+ gkpStore->gkStore_buildPartitions(partition); // Build partitions.
- // That's all folks.
+ delete [] partition;
- delete [] partition;
+ gkpStore->gkStore_close();
+ }
- gkpStore->gkStore_close();
exit(0);
}
diff --git a/src/stores/gkStore.C b/src/stores/gkStore.C
index 814fffd..f0dc0ff 100644
--- a/src/stores/gkStore.C
+++ b/src/stores/gkStore.C
@@ -296,7 +296,7 @@ gkReadData::gkReadData_encodeBlobChunk(char const *tag,
// Or make it bigger
- while (_blobMax + 8 + len < _blobMax) {
+ while (_blobMax <= _blobLen + 8 + len) {
_blobMax *= 2;
uint8 *b = new uint8 [_blobMax];
memcpy(b, _blob, sizeof(uint8) * _blobLen);
@@ -1335,12 +1335,6 @@ void
gkStore::gkStore_delete(void) {
char path[FILENAME_MAX];
- delete [] _libraries;
- delete [] _reads;
-
- _libraries = NULL;
- _reads = NULL;
-
gkStore_deletePartitions();
snprintf(path, FILENAME_MAX, "%s/info", gkStore_path()); AS_UTL_unlink(path);
@@ -1348,7 +1342,7 @@ gkStore::gkStore_delete(void) {
snprintf(path, FILENAME_MAX, "%s/reads", gkStore_path()); AS_UTL_unlink(path);
snprintf(path, FILENAME_MAX, "%s/blobs", gkStore_path()); AS_UTL_unlink(path);
- AS_UTL_unlink(path);
+ AS_UTL_rmdir(gkStore_path());
}
@@ -1380,6 +1374,12 @@ gkStore::gkStore_deletePartitions(void) {
snprintf(path, FILENAME_MAX, "%s/partitions/reads.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
snprintf(path, FILENAME_MAX, "%s/partitions/blobs.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
}
+
+ // And the directory.
+
+ snprintf(path, FILENAME_MAX, "%s/partitions", gkStore_path());
+
+ AS_UTL_rmdir(path);
}
diff --git a/src/stores/ovStoreBucketizer.C b/src/stores/ovStoreBucketizer.C
index a9227b4..d5c3a8f 100644
--- a/src/stores/ovStoreBucketizer.C
+++ b/src/stores/ovStoreBucketizer.C
@@ -305,5 +305,7 @@ main(int argc, char **argv) {
delete [] sliceFile;
delete [] sliceSize;
+ fprintf(stderr, "Success!\n");
+
return(0);
}
diff --git a/src/stores/ovStoreDump.C b/src/stores/ovStoreDump.C
index e6bcdd8..e0e89e4 100644
--- a/src/stores/ovStoreDump.C
+++ b/src/stores/ovStoreDump.C
@@ -410,7 +410,7 @@ dumpPicture(ovOverlap *overlaps,
bogartStatus *bogart) {
char ovl[256] = {0};
- uint32 MHS = 7; // Max Hang Size, amount of padding for "+### "
+ uint32 MHS = 9; // Max Hang Size, amount of padding for "+### "
uint32 Aid = qryID;
gkRead *A = gkpStore->gkStore_getRead(Aid);
@@ -424,9 +424,10 @@ dumpPicture(ovOverlap *overlaps,
ovl[ 99 + MHS] = '>';
ovl[100 + MHS] = 0;
- fprintf(stdout, "%8d A: %5d %5d %s %s%s\n",
- Aid,
+ fprintf(stdout, "A %7d:%-7d A %9d %7d:%-7d %7d %s %s%s\n",
0, frgLenA,
+ Aid,
+ 0, frgLenA, frgLenA,
ovl,
bogart->getContained(Aid) ? "contained" : "",
bogart->getSuspicious(Aid) ? "suspicious" : "");
@@ -583,10 +584,13 @@ dumpPicture(ovOverlap *overlaps,
// Report!
- fprintf(stdout, "%8d A: %5d %5d (%5d) B: %5d %5d (%5d) %5.2f%% %s%s\n",
+ fprintf(stdout, "A %7d:%-7d B %9d %7d:%-7d %7d %5.2f%% %s%s\n",
+ ovlBgnA,
+ ovlEndA,
Bid,
- ovlBgnA, ovlEndA, frgLenA,
- ovlBgnB, ovlEndB, frgLenB,
+ min(ovlBgnB, ovlEndB),
+ max(ovlBgnB, ovlEndB),
+ frgLenB,
overlaps[o].erate() * 100.0,
ovl,
olapClass);
@@ -704,7 +708,7 @@ main(int argc, char **argv) {
char *erateFile = NULL;
- uint32 bgnID = 0;
+ uint32 bgnID = 1;
uint32 endID = UINT32_MAX;
uint32 qryID = 0;
diff --git a/src/stores/ovStoreIndexer.C b/src/stores/ovStoreIndexer.C
index 927a50e..3abc66d 100644
--- a/src/stores/ovStoreIndexer.C
+++ b/src/stores/ovStoreIndexer.C
@@ -159,7 +159,7 @@ main(int argc, char **argv) {
writer->removeAllIntermediateFiles();
- fprintf(stderr, "Finished.\n");
+ fprintf(stderr, "Success!\n");
exit(0);
}
diff --git a/src/stores/ovStoreSorter.C b/src/stores/ovStoreSorter.C
index 5650a89..1764772 100644
--- a/src/stores/ovStoreSorter.C
+++ b/src/stores/ovStoreSorter.C
@@ -61,10 +61,21 @@ void
makeSentinel(char *storePath, uint32 fileID, bool forceRun) {
char name[FILENAME_MAX];
+ // Check if done.
+
+ snprintf(name, FILENAME_MAX, "%s/%04d", storePath, fileID);
+
+ if ((forceRun == false) && (AS_UTL_fileExists(name, FALSE, FALSE)))
+ fprintf(stderr, "Job " F_U32 " is finished (remove '%s' or -force to try again).\n", fileID, name), exit(0);
+
+ // Check if running.
+
snprintf(name, FILENAME_MAX, "%s/%04d.ovs", storePath, fileID);
if ((forceRun == false) && (AS_UTL_fileExists(name, FALSE, FALSE)))
- fprintf(stderr, "Job " F_U32 " is running or finished (remove '%s' or -force to try again).\n", fileID, name), exit(0);
+ fprintf(stderr, "Job " F_U32 " is running (remove '%s' or -force to try again).\n", fileID, name), exit(0);
+
+ // Not done, not running, so create a sentinel to say we're running.
errno = 0;
FILE *F = fopen(name, "w");
@@ -190,6 +201,9 @@ main(int argc, char **argv) {
// Get the number of overlaps in each bucket slice.
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Finding overlaps.\n");
+
uint64 *bucketSizes = new uint64 [jobIdxMax + 1];
uint64 totOvl = writer->loadBucketSizes(bucketSizes);
@@ -204,8 +218,9 @@ main(int argc, char **argv) {
// Or report that we can process.
- fprintf(stderr, "Overlaps need %.2f GB memory, allowed to use up to (via -M) " F_U64 " GB.\n",
- ovOverlapSortSize * totOvl / 1024.0 / 1024.0 / 1024.0, maxMemory >> 30);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Loading %10" F_U64P " overlaps using %.2f GB of requested (-M) " F_U64 " GB memory.\n",
+ totOvl, ovOverlapSortSize * totOvl / 1024.0 / 1024.0 / 1024.0, maxMemory >> 30);
// Load all overlaps - we're guaranteed that either 'name.gz' or 'name' exists (we checked when
// we loaded bucket sizes) or funny business is happening with our files.
@@ -229,6 +244,7 @@ main(int argc, char **argv) {
// Sort the overlaps! Finally! The parallel STL sort is NOT inplace, and blows up our memory.
+ fprintf(stderr, "\n");
fprintf(stderr, "Sorting.\n");
#ifdef _GLIBCXX_PARALLEL
@@ -239,24 +255,31 @@ main(int argc, char **argv) {
// Output to the store.
- fprintf(stderr, "Writing output.\n");
+ fprintf(stderr, "\n"); // Sorting has no output, so this would generate a distracting extra newline
+ fprintf(stderr, "Writing sorted overlaps.\n");
writer->writeOverlaps(ovls, ovlsLen);
// Clean up. Delete inputs, remove the sentinel, release memory, etc.
delete [] ovls;
-
- if (deleteIntermediateLate)
- writer->removeOverlapSlice();
-
delete [] bucketSizes;
removeSentinel(storePath, fileID);
gkp->gkStore_close();
+ if (deleteIntermediateLate) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Removing bucketized overlaps.\n");
+ fprintf(stderr, "\n");
+
+ writer->removeOverlapSlice();
+ }
+
// Success!
+ fprintf(stderr, "Success!\n");
+
return(0);
}
diff --git a/src/stores/ovStoreWriter.C b/src/stores/ovStoreWriter.C
index dcddf3c..a05c3c4 100644
--- a/src/stores/ovStoreWriter.C
+++ b/src/stores/ovStoreWriter.C
@@ -275,8 +275,6 @@ ovStoreWriter::writeOverlaps(ovOverlap *ovls,
// Dump the overlaps
- fprintf(stderr, "Writing " F_U64 " overlaps.\n", ovlsLen);
-
for (uint64 i=0; i<ovlsLen; i++ ) {
bof->writeOverlap(ovls + i);
@@ -348,8 +346,8 @@ ovStoreWriter::writeOverlaps(ovOverlap *ovls,
info.save(_storePath, _fileID, true);
- fprintf(stderr, "Created ovStore segment '%s/%04d' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
- _storePath, _fileID, _info.numOverlaps(), _info.smallestID(), _info.largestID());
+ fprintf(stderr, " created '%s/%04d' with " F_U64 " overlaps for reads " F_U32 " to " F_U32 ".\n",
+ _storePath, _fileID, info.numOverlaps(), info.smallestID(), info.largestID());
}
@@ -634,7 +632,7 @@ ovStoreWriter::loadBucketSizes(uint64 *bucketSizes) {
}
assert(nr == _fileLimit + 1);
- fprintf(stderr, "Found " F_U64 " overlaps from '%s'.\n", sliceSizes[_fileID], name);
+ fprintf(stderr, " found %10" F_U64P " overlaps in '%s'.\n", sliceSizes[_fileID], name);
bucketSizes[i] = sliceSizes[_fileID];
totOvl += sliceSizes[_fileID];
@@ -664,7 +662,7 @@ ovStoreWriter::loadOverlapsFromSlice(uint32 slice, uint64 expectedLen, ovOverlap
expectedLen, name);
}
- fprintf(stderr, "Loading " F_U64 " overlaps from '%s'.\n", expectedLen, name);
+ fprintf(stderr, " loading %10" F_U64P " overlaps from '%s'.\n", expectedLen, name);
ovFile *bof = new ovFile(_gkp, name, ovFileFull);
uint64 num = 0;
diff --git a/src/stores/tgStore.C b/src/stores/tgStore.C
index 7538a01..f67d443 100644
--- a/src/stores/tgStore.C
+++ b/src/stores/tgStore.C
@@ -446,7 +446,9 @@ tgStore::loadTig(uint32 tigID) {
AS_UTL_fseek(FP, _tigEntry[tigID].fileOffset, SEEK_SET);
_tigCache[tigID] = new tgTig;
- _tigCache[tigID]->loadFromStream(FP);
+
+ if (_tigCache[tigID]->loadFromStream(FP) == false)
+ fprintf(stderr, "Failed to load tig %u.\n", tigID), exit(1);
// ALWAYS assume the incore record is more up to date
*_tigCache[tigID] = _tigEntry[tigID].tigRecord;
@@ -509,7 +511,9 @@ tgStore::copyTig(uint32 tigID, tgTig *tigcopy) {
AS_UTL_fseek(FP, _tigEntry[tigID].fileOffset, SEEK_SET);
tigcopy->clear();
- tigcopy->loadFromStream(FP);
+
+ if (tigcopy->loadFromStream(FP) == false)
+ fprintf(stderr, "Failed to load tig %u.\n", tigID), exit(1);
// ALWAYS assume the incore record is more up to date
*tigcopy = _tigEntry[tigID].tigRecord;
diff --git a/src/stores/tgTig.C b/src/stores/tgTig.C
index eae55c0..1351a9e 100644
--- a/src/stores/tgTig.C
+++ b/src/stores/tgTig.C
@@ -400,6 +400,7 @@ tgTig::loadFromStream(FILE *F) {
tgTigRecord tr;
if (4 != AS_UTL_safeRead(F, tag, "tgTig::saveToStream::tigr", sizeof(char), 4)) {
+ fprintf(stderr, "tgTig::loadFromStream()-- failed to read four byte code: %s\n", strerror(errno));
return(false);
}
@@ -407,11 +408,14 @@ tgTig::loadFromStream(FILE *F) {
(tag[1] != 'I') ||
(tag[2] != 'G') ||
(tag[3] != 'R')) {
+ fprintf(stderr, "tgTig::loadFromStream()-- not at a tigRecord, got bytes '%c%c%c%c' (0x%02x%02x%02x%02x).\n",
+ tag[0], tag[1], tag[2], tag[3],
+ tag[0], tag[1], tag[2], tag[3]);
return(false);
}
if (0 == AS_UTL_safeRead(F, &tr, "tgTig::loadFromStream::tr", sizeof(tgTigRecord), 1)) {
- // Nothing loaded, end of file.
+ fprintf(stderr, "tgTig::loadFromStream()-- failed to read tgTigRecord: %s\n", strerror(errno));
return(false);
}
diff --git a/src/utgcns/libNDFalcon/LICENSE b/src/utgcns/libNDFalcon/LICENSE
deleted file mode 100644
index 94e4fd5..0000000
--- a/src/utgcns/libNDFalcon/LICENSE
+++ /dev/null
@@ -1,36 +0,0 @@
-#################################################################################$$
-# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
diff --git a/src/utgcns/libNDFalcon/dw.C b/src/utgcns/libNDFalcon/dw.C
deleted file mode 100644
index ef9e7c2..0000000
--- a/src/utgcns/libNDFalcon/dw.C
+++ /dev/null
@@ -1,359 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Sergey Koren beginning on 2015-DEC-28
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * Brian P. Walenz beginning on 2016-JUL-19
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-/*
- * =====================================================================================
- *
- * Filename: DW.c
- *
- * Description: A banded version for the O(ND) greedy sequence alignment algorithm
- *
- * Version: 0.1
- * Created: 07/20/2013 17:00:00
- * Revision: none
- * Compiler: gcc
- *
- * Author: Jason Chin,
- * Company:
- *
- * =====================================================================================
-
- #################################################################################$$
- # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
- #
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted (subject to the limitations in the
- # disclaimer below) provided that the following conditions are met:
- #
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- #
- # * Redistributions in binary form must reproduce the above
- # copyright notice, this list of conditions and the following
- # disclaimer in the documentation and/or other materials provided
- # with the distribution.
- #
- # * Neither the name of Pacific Biosciences nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
- # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
- # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
- # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- # SUCH DAMAGE.
- #################################################################################$$
-
-
-*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <limits.h>
-#include <stdbool.h>
-#include "dw.H"
-
-namespace NDalignment {
-
-int compare_d_path(const void * a, const void * b)
-{
- const d_path_data2 * arg1 = (d_path_data2 *)a;
- const d_path_data2 * arg2 = (d_path_data2 *)b;
- if (arg1->d - arg2->d == 0) {
- return arg1->k - arg2->k;
- } else {
- return arg1->d - arg2->d;
- }
-}
-
-
-void d_path_sort( d_path_data2 * base, unsigned long max_idx) {
- qsort(base, max_idx, sizeof(d_path_data2), compare_d_path);
-}
-
-d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) {
- d_path_data2 d_tmp;
- d_path_data2 *rtn;
- d_tmp.d = d;
- d_tmp.k = k;
- rtn = (d_path_data2 *) bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path);
- //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k);
-
- return rtn;
-
-}
-
-void print_d_path( d_path_data2 * base, unsigned long max_idx) {
- unsigned long idx;
- for (idx = 0; idx < max_idx; idx++){
- printf("dp %ld %d %d %d %d %d %d %d\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
- }
-}
-
-
-bool align(const char * query_seq, seq_coor_t q_len,
- const char * target_seq, seq_coor_t t_len,
- seq_coor_t band_tolerance,
- bool get_aln_str,
- NDalignResult &align_rtn) {
- seq_coor_t * V;
- seq_coor_t * U; // array of matched bases for each "k"
- seq_coor_t k_offset;
- seq_coor_t d;
- seq_coor_t k, k2;
- seq_coor_t best_m; // the best "matches" for each d
- seq_coor_t min_k, new_min_k;
- seq_coor_t max_k, new_max_k;
- seq_coor_t pre_k;
- seq_coor_t x, y;
- seq_coor_t cd;
- seq_coor_t ck;
- seq_coor_t cx, cy, nx, ny;
- seq_coor_t max_d;
- seq_coor_t band_size;
- unsigned long d_path_idx = 0;
- unsigned long max_idx = 0;
-
- d_path_data2 * d_path;
- d_path_data2 * d_path_aux;
- path_point * aln_path;
- seq_coor_t aln_path_idx;
- seq_coor_t aln_pos;
- seq_coor_t i;
- bool aligned = false;
-
- //printf("debug: %ld %ld\n", q_len, t_len);
- //printf("%s\n", query_seq);
-
- max_d = (int) (0.3*(q_len + t_len));
-
- band_size = band_tolerance * 2;
-
- V = (seq_coor_t *)calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
- U = (seq_coor_t *)calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
-
- k_offset = max_d;
-
- // We should probably use hashmap to store the backtracing information to save memory allocation time
- // This O(MN) block allocation scheme is convient for now but it is slower for very long sequences
- d_path = (d_path_data2 *)calloc( max_d * (band_size + 1 ) * 2 + 1, sizeof(d_path_data2) );
-
- aln_path = (path_point *)calloc( q_len + t_len + 1, sizeof(path_point) );
-
- if (d_path == NULL || aln_path == NULL) {
- fprintf(stderr, "generatePBDAG()-- Failed memory allocation max_d %d band_size %d.\n",
- max_d, band_size);
- free(V);
- free(U);
- free(d_path);
- free(aln_path);
- return aligned;
- }
-
- if (get_aln_str) {
- align_rtn._tgt_aln_str = (char *)calloc( q_len + t_len + 1, sizeof(char));
- align_rtn._qry_aln_str = (char *)calloc( q_len + t_len + 1, sizeof(char));
- } else {
- align_rtn._tgt_aln_str = NULL;
- align_rtn._qry_aln_str = NULL;
- }
- align_rtn._size = 0;
- align_rtn._qry_bgn = 0;
- align_rtn._qry_end = 0;
- align_rtn._tgt_bgn = 0;
- align_rtn._tgt_end = 0;
-
- //printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
- best_m = -1;
- min_k = 0;
- max_k = 0;
- d_path_idx = 0;
- max_idx = 0;
- for (d = 0; d < max_d; d ++ ) {
- if (max_k - min_k > band_size) {
- fprintf(stderr, "generatePBDAG()-- Exceeded band size max_k %d - min_k %d = %d > band_size = %d.\n",
- max_k, min_k, max_k - min_k, band_size);
- break;
- }
-
- for (k = min_k; k <= max_k; k += 2) {
-
- if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
- pre_k = k + 1;
- x = V[ k + 1 + k_offset];
- } else {
- pre_k = k - 1;
- x = V[ k - 1 + k_offset] + 1;
- }
- y = x - k;
- d_path[d_path_idx].d = d;
- d_path[d_path_idx].k = k;
- d_path[d_path_idx].x1 = x;
- d_path[d_path_idx].y1 = y;
-
- while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){
- x++;
- y++;
- }
-
- d_path[d_path_idx].x2 = x;
- d_path[d_path_idx].y2 = y;
- d_path[d_path_idx].pre_k = pre_k;
- d_path_idx ++;
-
- V[ k + k_offset ] = x;
- U[ k + k_offset ] = x + y;
-
- if ( x + y > best_m) {
- best_m = x + y;
- }
-
- if ( x >= q_len || y >= t_len) {
- aligned = true;
- max_idx = d_path_idx;
- break;
- }
- }
-
- // For banding
- new_min_k = max_k;
- new_max_k = min_k;
-
- for (k2 = min_k; k2 <= max_k; k2 += 2) {
- if (U[ k2 + k_offset] >= best_m - band_tolerance ) {
- if ( k2 < new_min_k ) {
- new_min_k = k2;
- }
- if ( k2 > new_max_k ) {
- new_max_k = k2;
- }
- }
- }
-
- max_k = new_max_k + 1;
- min_k = new_min_k - 1;
-
- // For no banding
- // max_k ++;
- // min_k --;
-
- // For debuging
- // printf("min_max_k,d, %ld %ld %ld\n", min_k, max_k, d);
-
- if (aligned == true) {
- align_rtn._qry_end = x;
- align_rtn._tgt_end = y;
- align_rtn._dist = d;
- align_rtn._size = (x + y + d) / 2;
- align_rtn._qry_bgn = 0;
- align_rtn._tgt_bgn = 0;
-
- d_path_sort(d_path, max_idx);
- //print_d_path(d_path, max_idx);
-
- if (get_aln_str) {
- cd = d;
- ck = k;
- aln_path_idx = 0;
- while (cd >= 0 && aln_path_idx < q_len + t_len + 1) {
- d_path_aux = (d_path_data2 *) get_dpath_idx( cd, ck, max_idx, d_path);
- aln_path[aln_path_idx].x = d_path_aux -> x2;
- aln_path[aln_path_idx].y = d_path_aux -> y2;
- aln_path_idx ++;
- aln_path[aln_path_idx].x = d_path_aux -> x1;
- aln_path[aln_path_idx].y = d_path_aux -> y1;
- aln_path_idx ++;
- ck = d_path_aux -> pre_k;
- cd -= 1;
- }
- aln_path_idx --;
- cx = aln_path[aln_path_idx].x;
- cy = aln_path[aln_path_idx].y;
- align_rtn._qry_bgn = cx;
- align_rtn._tgt_bgn = cy;
- aln_pos = 0;
- while ( aln_path_idx > 0 ) {
- aln_path_idx --;
- nx = aln_path[aln_path_idx].x;
- ny = aln_path[aln_path_idx].y;
- if (cx == nx && cy == ny){
- continue;
- }
- if (nx == cx && ny != cy){ //advance in y
- for (i = 0; i < ny - cy; i++) {
- align_rtn._qry_aln_str[aln_pos + i] = '-';
- }
- for (i = 0; i < ny - cy; i++) {
- align_rtn._tgt_aln_str[aln_pos + i] = target_seq[cy + i];
- }
- aln_pos += ny - cy;
- } else if (nx != cx && ny == cy){ //advance in x
- for (i = 0; i < nx - cx; i++) {
- align_rtn._qry_aln_str[aln_pos + i] = query_seq[cx + i];
- }
- for (i = 0; i < nx - cx; i++) {
- align_rtn._tgt_aln_str[aln_pos + i] = '-';
- }
- aln_pos += nx - cx;
- } else {
- for (i = 0; i < nx - cx; i++) {
- align_rtn._qry_aln_str[aln_pos + i] = query_seq[cx + i];
- }
- for (i = 0; i < ny - cy; i++) {
- align_rtn._tgt_aln_str[aln_pos + i] = target_seq[cy + i];
- }
- aln_pos += ny - cy;
- }
- cx = nx;
- cy = ny;
- }
- align_rtn._size = aln_pos;
- }
- break;
- }
- }
-
- free(V);
- free(U);
- free(d_path);
- free(aln_path);
- return aligned;
-}
-}
diff --git a/src/utgcns/libNDFalcon/dw.H b/src/utgcns/libNDFalcon/dw.H
deleted file mode 100644
index 836d9fa..0000000
--- a/src/utgcns/libNDFalcon/dw.H
+++ /dev/null
@@ -1,161 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Sergey Koren beginning on 2015-DEC-28
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * Brian P. Walenz beginning on 2016-JAN-04
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-/*
- * =====================================================================================
- *
- * Filename: dw.h
- *
- * Description: Common delclaration for the code base
- *
- * Version: 0.1
- * Created: 07/16/2013 07:46:23 AM
- * Revision: none
- * Compiler: gcc
- *
- * Author: Jason Chin,
- * Company:
- *
- * =====================================================================================
-
- #################################################################################$$
- # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
- #
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted (subject to the limitations in the
- # disclaimer below) provided that the following conditions are met:
- #
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- #
- # * Redistributions in binary form must reproduce the above
- # copyright notice, this list of conditions and the following
- # disclaimer in the documentation and/or other materials provided
- # with the distribution.
- #
- # * Neither the name of Pacific Biosciences nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
- # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
- # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
- # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- # SUCH DAMAGE.
- #################################################################################$$
- */
-
-#ifndef NDALIGNER_H
-#define NDALIGNER_H
-
-#include <cstdlib>
-#include <stdint.h>
-
-namespace NDalignment {
-typedef int32_t seq_coor_t;
-
-class NDalignResult {
-public:
- NDalignResult() {
- clear();
-
- _qry_aln_str = _tgt_aln_str = 0;
- };
- ~NDalignResult() {
- std::free(_qry_aln_str);
- std::free(_tgt_aln_str);
- };
-
-
- void clear(void) {
- _qry_bgn = 0;
- _qry_end = 0;
-
- _tgt_bgn = 0;
- _tgt_end = 0;
-
- _dist = 0;
- _size = 0;
- };
-
- int32_t _size;
- int32_t _dist;
-
- int32_t _qry_bgn;
- int32_t _qry_end;
-
- int32_t _tgt_bgn;
- int32_t _tgt_end;
-
- char* _qry_aln_str;
- char* _tgt_aln_str;
-};
-
-
-typedef struct {
- seq_coor_t pre_k;
- seq_coor_t x1;
- seq_coor_t y1;
- seq_coor_t x2;
- seq_coor_t y2;
-} d_path_data;
-
-typedef struct {
- seq_coor_t d;
- seq_coor_t k;
- seq_coor_t pre_k;
- seq_coor_t x1;
- seq_coor_t y1;
- seq_coor_t x2;
- seq_coor_t y2;
-} d_path_data2;
-
-typedef struct {
- seq_coor_t x;
- seq_coor_t y;
-} path_point;
-
-bool align(const char *, seq_coor_t,
- const char *, seq_coor_t,
- seq_coor_t,
- bool, NDalignResult &aln);
-}
-
-#endif
diff --git a/src/utgcns/libcns/unitigConsensus.C b/src/utgcns/libcns/unitigConsensus.C
index d8c58e8..4a0a02a 100644
--- a/src/utgcns/libcns/unitigConsensus.C
+++ b/src/utgcns/libcns/unitigConsensus.C
@@ -76,7 +76,6 @@
#include "Alignment.H"
#include "AlnGraphBoost.H"
#include "edlib.H"
-#include "dw.H"
#include "NDalign.H"
@@ -258,223 +257,427 @@ unitigConsensus::generate(tgTig *tig_,
}
-bool
-unitigConsensus::generatePBDAG(char aligner,
- tgTig *tig_,
- map<uint32, gkRead *> *inPackageRead_,
- map<uint32, gkReadData *> *inPackageReadData_) {
- tig = tig_;
- numfrags = tig->numberOfChildren();
- if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
- fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
- return(false);
+char *
+generateTemplateStitch(abAbacus *abacus,
+ tgPosition *utgpos,
+ uint32 numfrags,
+ double errorRate,
+ bool verbose) {
+ int32 minOlap = 500;
+
+ // Initialize, copy the first read.
+
+ uint32 rid = 0;
+
+ abSequence *seq = abacus->getSequence(rid);
+ char *fragment = seq->getBases();
+ uint32 readLen = seq->length();
+
+ uint32 tigmax = AS_MAX_READLEN; // Must be at least AS_MAX_READLEN, else resizeArray() could fail
+ uint32 tiglen = 0;
+ char *tigseq = NULL;
+
+ allocateArray(tigseq, tigmax, resizeArray_clearNew);
+
+ if (verbose) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "generateTemplateStitch()-- COPY READ read #%d %d (len=%d to %d-%d)\n",
+ 0, utgpos[0].ident(), readLen, utgpos[0].min(), utgpos[0].max());
}
- // First we need to load into Unitig data structure the quick cns
+ for (uint32 ii=0; ii<readLen; ii++)
+ tigseq[tiglen++] = fragment[ii];
- Unitig utg;
+ tigseq[tiglen] = 0;
- utg.id = tig->tigID();
- utg.seq = string(tig->_layoutLen, 'N');
+ uint32 ePos = utgpos[0].max(); // Expected end of template, from bogart supplied positions.
- // Build a quick consensus to align to, just smash together sequences.
- for (uint32 i=0; i<numfrags; i++) {
- abSequence *seq = abacus->getSequence(i);
- char *fragment = seq->getBases();
- uint32 readLen = seq->length();
+ // Find the next read that has some minimum overlap and a large extension, copy that into the template.
+
+ // Align read to template. Expecting to find alignment:
+ //
+ // template ---------------
+ // read ---------------
+ // ^
+ //
+ // All we need to find is where the template ends on the read, the ^ above. We know the
+ // expected size of the overlap, and can extract those bases from the template and look for a
+ // full alignment to the read.
+ //
+ // We'll align 80% of the expected overlap to the read, allowing a 20% buffer on either end.
+ //
+ // | +-80% expected overlap size
+ // | | +-fPos
+ // v v v
+ // template ----------(-----)
+ // read (-----------)------
+ //
- uint32 start = utgpos[i].min();
- uint32 end = utgpos[i].max();
+ while (rid < numfrags) {
+ uint32 nr = 0; // Next read
+ uint32 nm = 0; // Next read maximum position
- if (start > utg.seq.length()) {
- fprintf(stderr, "WARNING: reset start from " F_U32 " to " F_U64 "\n", start, utg.seq.length()-1);
- start = utg.seq.length() - 1;
- }
+ // Pick the next read as the one with the longest extension from all with some minimum overlap
+ // to the template
+
+ if (verbose)
+ fprintf(stderr, "\n");
- if (end - start > readLen) {
- fprintf(stderr, "WARNING: reset end from " F_U32 " to " F_U32 "\n", end, start+readLen);
- end = start + readLen;
+ for (uint32 ii=rid+1; ii < numfrags; ii++) {
+
+ // If contained, move to the next read. (Not terribly useful to log, so we don't)
+
+ if (utgpos[ii].max() < ePos)
+ continue;
+
+ // If a bigger end position, save the overlap. One quirk: if we've already saved an overlap, and this
+ // overlap is thin, don't save the thin overlap.
+
+ bool thick = (utgpos[ii].min() + minOlap < ePos);
+ bool first = (nm == 0);
+ bool save = false;
+
+ if ((nm < utgpos[ii].max()) && (thick || first)) {
+ save = true;
+ nr = ii;
+ nm = utgpos[ii].max();
+ }
+
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- read #%d/%d ident %d position %d-%d%s%s%s\n",
+ ii, numfrags, utgpos[ii].ident(), utgpos[ii].min(), utgpos[ii].max(),
+ (save == true) ? " SAVE" : "",
+ (thick == false) ? " THIN" : "",
+ (first == true) ? " FIRST" : "");
+
+
+ // If this read has an overlap smaller than we want, stop searching.
+
+ if (thick == false)
+ break;
}
- if (end > utg.seq.length()) {
- fprintf(stderr, "WARNING: truncate end from " F_U32 " to " F_U64 "\n", end, utg.seq.length()-1);
- end = utg.seq.length() - 1;
+ if (nr == 0) {
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- NO MORE READS TO ALIGN\n");
+ break;
}
- // Read aligns from position start to end. Skip ahead until we find unset bases.
+ assert(nr != 0);
- uint32 cur = start;
- while ((cur < end) && (utg.seq[cur] != 'N'))
- cur++;
+ rid = nr; // We'll place read 'nr' in the template.
- fprintf(stderr, "generatePBDAG()-- template from %7d to %7d comes from read %3d id %6d bases (%5d %5d) nominally %6d %6d)\n",
- cur, end, i, seq->gkpIdent(),
- cur - start,
- end - start,
- utgpos[i].min(),
- utgpos[i].max());
+ seq = abacus->getSequence(rid);
+ fragment = seq->getBases();
+ readLen = seq->length();
- for (uint32 j=cur; j<end; j++) {
- //if (utg.seq[j] != 'N')
- // fprintf(stderr, "WARNING: template %6d already set\n", j);
- utg.seq[j] = fragment[j - start];
- }
- }
+ int32 readBgn;
+ int32 readEnd;
- for (uint32 jj=0; jj<tig->_layoutLen; jj++)
- if (utg.seq[jj] == 'N')
- fprintf(stdout, "generatePBDAG()-- WARNING: template position %u not defined.\n", jj);
+ EdlibAlignResult result;
+ bool aligned = false;
- assert(utg.seq[tig->_layoutLen] == 0);
+ double templateSize = 0.80;
+ double extensionSize = 0.20;
-#if 0
- FILE *F = fopen("template.fasta", "w");
- fprintf(F, ">tig%d template\n%s\n", tig->tigID(), utg.seq.c_str());
- fclose(F);
-#endif
+ int32 olapLen = ePos - utgpos[nr].min(); // The expected size of the overlap
+ int32 templateLen = 0;
+ int32 extensionLen = 0;
- AlnGraphBoost ag(utg.seq);
+ alignAgain:
+ templateLen = (int32)ceil(olapLen * templateSize); // Extract 80% of the expected overlap size
+ extensionLen = (int32)ceil(olapLen * extensionSize); // Extend read by 20% of the expected overlap size
- // Compute alignments of each sequence in parallel
+ readBgn = 0;
+ readEnd = olapLen + extensionLen;
-#pragma omp parallel for schedule(dynamic)
- for (uint32 i=0; i<numfrags; i++) {
- abSequence *seq = abacus->getSequence(i);
- char *fragment = seq->getBases();
+ if (readEnd > readLen)
+ readEnd = readLen;
+
+ if (verbose) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "generateTemplateStitch()-- ALIGN template %d-%d (len=%d) to read #%d %d %d-%d (len=%d actual=%d at %d-%d) expecting olap of %d\n",
+ tiglen - templateLen, tiglen, templateLen,
+ nr, utgpos[nr].ident(), readBgn, readEnd, readEnd - readBgn, readLen,
+ utgpos[nr].min(), utgpos[nr].max(),
+ olapLen);
+ }
+
+ result = edlibAlign(tigseq + tiglen - templateLen, templateLen,
+ fragment, readEnd - readBgn,
+ edlibNewAlignConfig(olapLen * errorRate, EDLIB_MODE_HW, EDLIB_TASK_PATH));
- // computePositionFromLayout() does NOT work here; it needs to have abacus->numberOfColumns() updated.
- // When the reads aren't placed in frankenstein, this function probably also just returns
- // the original utgpos position anyway.
+ // We're expecting the template to align inside the read.
+ //
+ // v- always the end
+ // TEMPLATE --------------------------[---------------]
+ // READ [------------------------------]---------
+ // always the start -^
//
- //computePositionFromLayout();
+ // If we don't find an alignment at all, we move the template start point to the right (making
+ // the template smaller) and also move the read end point to the right (making the read
+ // bigger).
- fprintf(stderr, "\n");
- fprintf(stderr, "generatePBDAG()-- align read %u (%u/%u) at %u-%u\n",
- seq->gkpIdent(), i, numfrags, utgpos[i].min(), utgpos[i].max());
+ bool tryAgain = false;
-#if 0
- char N[FILENAME_MAX];
- sprintf(N, "read-%03d.fasta", i, seq->gkpIdent());
- FILE *F = fopen(N, "w");
- fprintf(F, ">read%d pos %d %d\n%s\n", seq->gkpIdent(), utgpos[i].min(), utgpos[i].max(), fragment);
- fclose(F);
-#endif
+ bool noResult = (result.numLocations == 0);
+ bool gotResult = (result.numLocations > 0);
- dagcon::Alignment aln;
- NDalignment::NDalignResult ndaln;
- EdlibAlignResult align;
- int32 padding = (aligner == 'E' ? (int32)round((double)(utgpos[i].max() - utgpos[i].min()) * errorRate) + 1 : 0);
- aln.start = max((int32)0, (int32)utgpos[i].min() - padding);
- aln.end = min((int32)utg.seq.size(), (int32)utgpos[i].max() + padding);
- aln.frgid = utgpos[i].ident();
- aln.qstr = string(fragment);
-
- aln.tstr = utg.seq.substr(aln.start, aln.end-aln.start);
-
- uint32 aLen = aln.qstr.size();
- uint32 bLen = aln.tstr.size();
-
- uint32 bandTolerance = 150;
- bool aligned = false;
- if (aligner == 'E') {
- align = edlibAlign(aln.qstr.c_str(), aln.qstr.size()-1, aln.tstr.c_str(), aln.tstr.size()-1, edlibNewAlignConfig(bandTolerance, EDLIB_MODE_HW, EDLIB_TASK_PATH));
- aligned = (align.numLocations >= 1);
- } else {
- aligned = NDalignment::align(aln.qstr.c_str(), aln.qstr.size(),
- aln.tstr.c_str(), aln.tstr.size(),
- bandTolerance,
- true,
- ndaln);
+ bool hitTheStart = (gotResult) && (result.startLocations[0] == 0);
+
+ bool hitTheEnd = (gotResult) && (result.endLocations[0] + 1 == readEnd - readBgn);
+ bool moreToExtend = (readEnd < readLen);
+
+ // HOWEVER, if we get a result and it's near perfect, declare success even if we hit the start.
+ // These are simple repeats that will align with any overlap. The one BPW debugged was 99+% A.
+
+ if ((gotResult == true) &&
+ (hitTheStart == true) &&
+ ((double)result.editDistance / result.alignmentLength < 0.1)) {
+ hitTheStart = false;
+ }
+
+ // NOTE that if we hit the end with the same conditions, we should try again, unless there
+ // isn't anything left. In that case, we don't extend the template.
+
+ if ((gotResult == true) &&
+ (hitTheEnd == true) &&
+ (moreToExtend == false) &&
+ ((double)result.editDistance / result.alignmentLength < 0.1)) {
+ hitTheEnd = false;
}
- while ((aligned == false) && (bandTolerance < errorRate * (aLen + bLen))) {
- bandTolerance *= 2;
- if (aligner == 'E')
- edlibFreeAlignResult(align);
-
- fprintf(stderr, "generatePBDAG()-- retry with bandTolerance = %d\n",
- bandTolerance);
-
- if (aligner == 'E') {
- align = edlibAlign(aln.qstr.c_str(), aln.qstr.size()-1, aln.tstr.c_str(), aln.tstr.size()-1, edlibNewAlignConfig(bandTolerance, EDLIB_MODE_HW, EDLIB_TASK_PATH));
- aligned = (align.numLocations >= 1);
- } else {
- aligned = NDalignment::align(aln.qstr.c_str(), aln.qstr.size(),
- aln.tstr.c_str(), aln.tstr.size(),
- bandTolerance,
- true,
- ndaln);
- }
+ // Now, report what happened, and maybe try again.
+
+ if (verbose)
+ if (noResult)
+ fprintf(stderr, "generateTemplateStitch()-- FAILED to align - no result\n");
+ else
+ fprintf(stderr, "generateTemplateStitch()-- FOUND alignment at %d-%d editDist %d alignLen %d %.f%%\n",
+ result.startLocations[0], result.endLocations[0]+1,
+ result.editDistance,
+ result.alignmentLength,
+ (double)result.editDistance / result.alignmentLength);
+
+ if ((noResult) || (hitTheStart)) {
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - decrease template size by 10%%\n",
+ (noResult == true) ? "no result" : "hit the start");
+ tryAgain = true;
+ templateSize -= 0.10;
}
- double errorRateAln = 0;
- if (aligner == 'E')
- errorRateAln = (align.alignmentLength > 0) ? ((double)align.editDistance / align.alignmentLength) : 1.0;
- else
- errorRateAln = (ndaln._size > 0) ? ((double)ndaln._dist / ndaln._size) : 1.0;
+ if ((noResult) || (hitTheEnd && moreToExtend)) {
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- FAILED to align - %s - increase read size by 10%%\n",
+ (noResult == true) ? "no result" : "hit the end");
+ tryAgain = true;
+ extensionSize += 0.10;
+ }
- if ((aligned == true) && (errorRateAln > errorRate)) {
- fprintf(stderr, "generatePBDAG()-- error rate too high distance=%5d size=%5d, %f > %f\n",
- align.editDistance, align.alignmentLength, errorRateAln, errorRate);
- aligned = false;
+ if (tryAgain) {
+ edlibFreeAlignResult(result);
+ goto alignAgain;
}
+ readBgn = result.startLocations[0]; // Expected to be zero
+ readEnd = result.endLocations[0] + 1; // Where we need to start copying the read
- if (aligned == false) {
- aln.start = aln.end = 0;
- aln.qstr = std::string();
- aln.tstr = std::string();
+ edlibFreeAlignResult(result);
- fprintf(stderr, "generatePBDAG()-- failed to align read #%u id %u at position %u-%u.\n",
- i, utgpos[i].ident(), utgpos[i].min(), utgpos[i].max());
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- Aligned template %d-%d to read %u %d-%d; copy read %d-%d to template.\n", tiglen - templateLen, tiglen, nr, readBgn, readEnd, readEnd, readLen);
+
+ increaseArray(tigseq, tiglen, tigmax, tiglen + readLen - readEnd + 1);
+
+ for (uint32 ii=readEnd; ii<readLen; ii++)
+ tigseq[tiglen++] = fragment[ii];
+
+ tigseq[tiglen] = 0;
+
+ assert(tiglen < tigmax);
+
+ ePos = utgpos[rid].max();
+
+ if (verbose)
+ fprintf(stderr, "generateTemplateStitch()-- Template now length %d, expected %d, difference %7.4f%%\n",
+ tiglen, ePos, 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos));
+ }
+
+ // Report the expected and final size. Guard against long tigs getting chopped.
+
+ double pd = 200.0 * ((int32)tiglen - (int32)ePos) / ((int32)tiglen + (int32)ePos);
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "generateTemplateStitch()-- generated template of length %d, expected length %d, %7.4f%% difference.\n",
+ tiglen, ePos, pd);
+
+ if ((tiglen >= 100000) && ((pd < -50.0) || (pd > 50.0)))
+ fprintf(stderr, "generateTemplateStitch()-- significant size difference, stopping.\n");
+ assert((tiglen < 100000) || ((-50.0 <= pd) && (pd <= 50.0)));
+
+ return(tigseq);
+}
- cnspos[i].setMinMax(0, 0);
- if (aligner == 'E')
- edlibFreeAlignResult(align);
- continue;
- }
- if (aligner == 'E') {
- fprintf(stderr, "generatePBDAG()-- aligned distance=%5d size=%5d, %f < %f\n",
- align.editDistance, align.alignmentLength,
- (double) align.editDistance / align.alignmentLength,
- errorRate);
- char *tgt_aln_str = new char[align.alignmentLength+1];
- char *qry_aln_str = new char[align.alignmentLength+1];
- edlibAlignmentToStrings(align.alignment, align.alignmentLength, align.startLocations[0], align.endLocations[0]+1, 0, aln.qstr.length(), aln.tstr.c_str(), aln.qstr.c_str(), tgt_aln_str, qry_aln_str);
+bool
+alignEdLib(dagAlignment &aln,
+ tgPosition &utgpos,
+ char *fragment,
+ uint32 fragmentLength,
+ char *tigseq,
+ uint32 tiglen,
+ double lengthScale,
+ double errorRate,
+ bool normalize,
+ bool verbose) {
+
+ EdlibAlignResult align;
+
+ int32 padding = (int32)ceil(fragmentLength * 0.10);
+ double bandErrRate = errorRate / 2;
+ bool aligned = false;
+ double alignedErrRate = 0.0;
+
+ // Decide on where to align this read.
+
+ // But, the utgpos positions are largely bogus, especially at the end of the tig. utgcns (the
+ // original) used to track positions of previously placed reads, find an overlap beterrn this
+ // read and the last read, and use that info to find the coordinates for the new read. That was
+ // very complicated. Here, we just linearly scale.
+
+ int32 tigbgn = max((int32)0, (int32)floor(lengthScale * utgpos.min() - padding));
+ int32 tigend = min((int32)tiglen, (int32)floor(lengthScale * utgpos.max() + padding));
+
+ if (verbose)
+ fprintf(stderr, "alignEdLib()-- align read %7u eRate %.4f at %9d-%-9d", utgpos.ident(), bandErrRate, tigbgn, tigend);
+
+ // This occurs if we don't lengthScale the positions.
+
+ if (tigend < tigbgn)
+ fprintf(stderr, "alignEdLib()-- ERROR: tigbgn %d > tigend %d - tiglen %d utgpos %d-%d padding %d\n",
+ tigbgn, tigend, tiglen, utgpos.min(), utgpos.max(), padding);
+ assert(tigend > tigbgn);
+
+ // Align! If there is an alignment, compute error rate and declare success if acceptable.
+
+ align = edlibAlign(fragment, fragmentLength,
+ tigseq + tigbgn, tigend - tigbgn,
+ edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
+
+ if (align.alignmentLength > 0) {
+ alignedErrRate = (double)align.editDistance / align.alignmentLength;
+ aligned = (alignedErrRate <= errorRate);
+ if (verbose)
+ fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
+ } else {
+ if (verbose)
+ fprintf(stderr, "\n");
+ }
+
+ for (uint32 ii=0; ((ii < 4) && (aligned == false)); ii++) {
+ tigbgn = max((int32)0, tigbgn - 2 * padding);
+ tigend = min((int32)tiglen, tigend + 2 * padding);
+
+ bandErrRate += errorRate / 2;
+
+ edlibFreeAlignResult(align);
- aln.start += align.startLocations[0];
- aln.end = aln.start + (align.endLocations[0] - align.startLocations[0]) + 1;
- aln.qstr = std::string(qry_aln_str);
- aln.tstr = std::string(tgt_aln_str);
+ if (verbose)
+ fprintf(stderr, "alignEdLib()-- eRate %.4f at %9d-%-9d", bandErrRate, tigbgn, tigend);
- edlibFreeAlignResult(align);
- delete[] tgt_aln_str;
- delete[] qry_aln_str;
+ align = edlibAlign(fragment, strlen(fragment),
+ tigseq + tigbgn, tigend - tigbgn,
+ edlibNewAlignConfig(bandErrRate * fragmentLength, EDLIB_MODE_HW, EDLIB_TASK_PATH));
+
+ if (align.alignmentLength > 0) {
+ alignedErrRate = (double)align.editDistance / align.alignmentLength;
+ aligned = (alignedErrRate <= errorRate);
+ if (verbose)
+ fprintf(stderr, " - ALIGNED %.4f at %9d-%-9d\n", alignedErrRate, tigbgn + align.startLocations[0], tigbgn + align.endLocations[0]+1);
} else {
- aln.start += ndaln._tgt_bgn;
- aln.end = aln.start + (ndaln._tgt_end - ndaln._tgt_bgn) - 1;
- aln.qstr = std::string(ndaln._qry_aln_str);
- aln.tstr = std::string(ndaln._tgt_aln_str);
+ if (verbose)
+ fprintf(stderr, "\n");
}
- aln.start++;
- cnspos[i].setMinMax(aln.start, aln.end);
- assert(aln.qstr.length() == aln.tstr.length());
- assert(aln.end < utg.seq.size());
+ }
+
+ if (aligned == false) {
+ edlibFreeAlignResult(align);
+ return(false);
+ }
- dagcon::Alignment norm = normalizeGaps(aln);
+ char *tgtaln = new char [align.alignmentLength+1];
+ char *qryaln = new char [align.alignmentLength+1];
+
+ memset(tgtaln, 0, sizeof(char) * (align.alignmentLength+1));
+ memset(qryaln, 0, sizeof(char) * (align.alignmentLength+1));
+
+ edlibAlignmentToStrings(align.alignment, // Alignment
+ align.alignmentLength, // and length
+ align.startLocations[0], // tgtStart
+ align.endLocations[0]+1, // tgtEnd
+ 0, // qryStart
+ fragmentLength, // qryEnd
+ tigseq + tigbgn, // tgt sequence
+ fragment, // qry sequence
+ tgtaln, // output tgt alignment string
+ qryaln); // output qry alignment string
+
+ // Populate the output. AlnGraphBoost does not handle mismatch alignments, at all, so convert
+ // them to a pair of indels.
+
+ uint32 nMatch = 0;
+
+ for (uint32 ii=0; ii<align.alignmentLength; ii++) // Edlib guarantees aln[alignmentLength] == 0.
+ if ((tgtaln[ii] != '-') &&
+ (qryaln[ii] != '-') &&
+ (tgtaln[ii] != qryaln[ii]))
+ nMatch++;
+
+ aln.start = tigbgn + align.startLocations[0] + 1; // AlnGraphBoost expects 1-based positions.
+ aln.end = tigbgn + align.endLocations[0] + 1; // EdLib returns 0-based positions.
+
+ aln.qstr = new char [align.alignmentLength + nMatch + 1];
+ aln.tstr = new char [align.alignmentLength + nMatch + 1];
+
+ for (uint32 ii=0, jj=0; ii<align.alignmentLength; ii++) {
+ char tc = tgtaln[ii];
+ char qc = qryaln[ii];
+
+ if ((tc != '-') &&
+ (qc != '-') &&
+ (tc != qc)) {
+ aln.tstr[jj] = '-'; aln.qstr[jj] = qc; jj++;
+ aln.tstr[jj] = tc; aln.qstr[jj] = '-'; jj++;
+ } else {
+ aln.tstr[jj] = tc; aln.qstr[jj] = qc; jj++;
+ }
-#pragma omp critical (graphAdd)
- ag.addAln(norm); // NOT thread safe!
+ aln.length = jj;
}
- // Merge the nodes and call consensus
- ag.mergeNodes();
+ aln.qstr[aln.length] = 0;
+ aln.tstr[aln.length] = 0;
+
+ delete [] tgtaln;
+ delete [] qryaln;
+
+ edlibFreeAlignResult(align);
+
+ if (aln.end > tiglen)
+ fprintf(stderr, "ERROR: alignment from %d to %d, but tiglen is only %d\n", aln.start, aln.end, tiglen);
+ assert(aln.end <= tiglen);
+
+ return(true);
+}
+
- std::string cns = ag.consensus(1);
+
+void
+realignReads() {
#ifdef REALIGN
// update positions, this requires remapping but this time to the final consensus, turned off for now
@@ -493,27 +696,128 @@ unitigConsensus::generatePBDAG(char aligner,
EdlibAlignResult align = edlibAlign(seq->getBases(), seq->length()-1, cns.c_str()+start, end-start+1, edlibNewAlignConfig(bandTolerance, EDLIB_MODE_HW, EDLIB_TASK_LOC));
if (align.numLocations > 0) {
- cnspos[i].setMinMax(align.startLocations[0]+start, align.endLocations[0]+start+1);
- // when we are very close to end extend
- if (cnspos[i].max() < cns.size() && cns.size() - cnspos[i].max() <= maxExtend && (align.editDistance + cns.size() - cnspos[i].max()) < bandTolerance) {
- cnspos[i].setMinMax(cnspos[i].min(), cns.size());
- }
+ cnspos[i].setMinMax(align.startLocations[0]+start, align.endLocations[0]+start+1);
+ // when we are very close to end extend
+ if (cnspos[i].max() < cns.size() && cns.size() - cnspos[i].max() <= maxExtend && (align.editDistance + cns.size() - cnspos[i].max()) < bandTolerance) {
+ cnspos[i].setMinMax(cnspos[i].min(), cns.size());
+ }
#pragma omp critical (trackMin)
- if (cnspos[i].min() < minPos) minPos = cnspos[i].min();
+ if (cnspos[i].min() < minPos) minPos = cnspos[i].min();
#pragma omp critical (trackMax)
- if (cnspos[i].max() > maxPos) maxPos = cnspos[i].max();
+ if (cnspos[i].max() > maxPos) maxPos = cnspos[i].max();
} else {
-}
+ }
edlibFreeAlignResult(align);
}
memcpy(tig->getChild(0), cnspos, sizeof(tgPosition) * numfrags);
// trim consensus if needed
if (maxPos < cns.size())
- cns = cns.substr(0, maxPos);
+ cns = cns.substr(0, maxPos);
+
assert(minPos == 0);
assert(maxPos == cns.size());
#endif
+}
+
+
+
+bool
+unitigConsensus::generatePBDAG(char aligner,
+ bool normalize,
+ tgTig *tig_,
+ map<uint32, gkRead *> *inPackageRead_,
+ map<uint32, gkReadData *> *inPackageReadData_) {
+
+ bool verbose = (tig_->_utgcns_verboseLevel > 1);
+
+ tig = tig_;
+ numfrags = tig->numberOfChildren();
+
+ if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
+ fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
+ return(false);
+ }
+
+ // Build a quick consensus to align to.
+
+ char *tigseq = generateTemplateStitch(abacus, utgpos, numfrags, errorRate, tig->_utgcns_verboseLevel);
+ uint32 tiglen = strlen(tigseq);
+
+ fprintf(stderr, "Generated template of length %d\n", tiglen);
+
+ // Compute alignments of each sequence in parallel
+
+ fprintf(stderr, "Aligning reads.\n");
+
+ dagAlignment *aligns = new dagAlignment [numfrags];
+ uint32 pass = 0;
+ uint32 fail = 0;
+
+#pragma omp parallel for schedule(dynamic)
+ for (uint32 ii=0; ii<numfrags; ii++) {
+ abSequence *seq = abacus->getSequence(ii);
+ bool aligned = false;
+
+ assert(aligner == 'E'); // Maybe later we'll have more than one aligner again.
+
+ aligned = alignEdLib(aligns[ii],
+ utgpos[ii],
+ seq->getBases(), seq->length(),
+ tigseq, tiglen,
+ (double)tiglen / tig->_layoutLen,
+ errorRate,
+ normalize,
+ verbose);
+
+ if (aligned == false) {
+ if (verbose)
+ fprintf(stderr, "generatePBDAG()-- read %7u FAILED\n", utgpos[ii].ident());
+
+ fail++;
+
+ continue;
+ }
+
+ pass++;
+ }
+
+ fprintf(stderr, "Finished aligning reads. %d failed, %d passed.\n", fail, pass);
+
+ // Construct the graph from the alignments. This is not thread safe.
+
+ fprintf(stderr, "Constructing graph\n");
+
+ AlnGraphBoost ag(string(tigseq, tiglen));
+
+ for (uint32 ii=0; ii<numfrags; ii++) {
+ cnspos[ii].setMinMax(aligns[ii].start, aligns[ii].end);
+
+ if ((aligns[ii].start == 0) &&
+ (aligns[ii].end == 0))
+ continue;
+
+ ag.addAln(aligns[ii]);
+
+ aligns[ii].clear();
+ }
+
+ delete [] aligns;
+
+ fprintf(stderr, "Merging graph\n");
+
+ // Merge the nodes and call consensus
+ ag.mergeNodes();
+
+ fprintf(stderr, "Calling consensus\n");
+
+ std::string cns = ag.consensus(1);
+
+ delete [] tigseq;
+
+ // Realign reads to get precise endpoints
+
+ realignReads();
// Save consensus
@@ -525,6 +829,7 @@ unitigConsensus::generatePBDAG(char aligner,
tig->_gappedBases[len] = cns[len];
tig->_gappedQuals[len] = CNS_MIN_QV;
}
+
// Terminate the string.
tig->_gappedBases[len] = 0;
@@ -538,79 +843,86 @@ unitigConsensus::generatePBDAG(char aligner,
}
+
bool
unitigConsensus::generateQuick(tgTig *tig_,
map<uint32, gkRead *> *inPackageRead_,
map<uint32, gkReadData *> *inPackageReadData_) {
-
tig = tig_;
numfrags = tig->numberOfChildren();
if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
- fprintf(stderr, "generateMultiAlignment()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
+ fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
return(false);
}
- // The quick variety doesn't generate alignments, it just pastes read bases into consensus. It
- // still needs to find a placement for the read, and it uses the other placed reads for that.
+ // Quick is just the template sequence, so one and done!
- while (moreFragments()) {
- reportStartingWork();
+ char *tigseq = generateTemplateStitch(abacus, utgpos, numfrags, errorRate, tig->_utgcns_verboseLevel);
+ uint32 tiglen = strlen(tigseq);
- piid = -1;
+ // Save consensus
+
+ resizeArrayPair(tig->_gappedBases, tig->_gappedQuals, 0, tig->_gappedMax, tiglen + 1, resizeArray_doNothing);
+
+ for (uint32 ii=0; ii<tiglen; ii++) {
+ tig->_gappedBases[ii] = tigseq[ii];
+ tig->_gappedQuals[ii] = CNS_MIN_QV;
+ }
- bool placed = computePositionFromLayout();
+ // Terminate the string.
- abSequence *seq = abacus->getSequence(utgpos[tiid].ident());
- char *fragment = seq->getBases();
- uint32 readLen = seq->length();
+ tig->_gappedBases[tiglen] = 0;
+ tig->_gappedQuals[tiglen] = 0;
+ tig->_gappedLen = tiglen;
+ tig->_layoutLen = tiglen;
- uint32 start = cnspos[tiid].min();
- uint32 end = cnspos[tiid].max();
+ delete [] tigseq;
- // if we couldn't place the read, fall back to utg positions
- if (placed == false) {
- start = abacus->numberOfColumns() - 1;
- end = start + readLen;
- }
+ return(true);
+}
- uint32 bHang = end - abacus->numberOfColumns();
- if (bHang <= 0) {
- // this read doesn't add anything, skip it
- continue;
- }
- // check if our positions are wonky, adjust the end to match reality
- if (start > abacus->numberOfColumns()) {
- start = abacus->numberOfColumns();
- }
- if (end - start > readLen) {
- end = start + readLen;
- }
- cnspos[tiid].setMinMax(start, end);
- // appendBases() will append only if new bases are needed. Otherwise, it just
- // sets the first/last bead position.
+bool
+unitigConsensus::generateSingleton(tgTig *tig_,
+ map<uint32, gkRead *> *inPackageRead_,
+ map<uint32, gkReadData *> *inPackageReadData_) {
+ tig = tig_;
+ numfrags = tig->numberOfChildren();
+
+ assert(numfrags == 1);
- abacus->appendBases(tiid,
- cnspos[tiid].min(),
- cnspos[tiid].max());
+ if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
+ fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
+ return(false);
+ }
- // I _think_ we need to rebuild iff bases are added. This also resets positions for each read.
- // Until someone complains this is too slow, it's left in.
+ // Copy the single read to the tig sequence.
- refreshPositions();
+ abSequence *seq = abacus->getSequence(0);
+ char *fragment = seq->getBases();
+ uint32 readLen = seq->length();
+
+ resizeArrayPair(tig->_gappedBases, tig->_gappedQuals, 0, tig->_gappedMax, readLen + 1, resizeArray_doNothing);
+
+ for (uint32 ii=0; ii<readLen; ii++) {
+ tig->_gappedBases[ii] = fragment[ii];
+ tig->_gappedQuals[ii] = CNS_MIN_QV;
}
- generateConsensus(tig);
+ // Terminate the string.
+
+ tig->_gappedBases[readLen] = 0;
+ tig->_gappedQuals[readLen] = 0;
+ tig->_gappedLen = readLen;
+ tig->_layoutLen = readLen;
return(true);
}
-
-
int
unitigConsensus::initialize(map<uint32, gkRead *> *inPackageRead,
map<uint32, gkReadData *> *inPackageReadData) {
diff --git a/src/utgcns/libcns/unitigConsensus.H b/src/utgcns/libcns/unitigConsensus.H
index 5a0a286..6c5e543 100644
--- a/src/utgcns/libcns/unitigConsensus.H
+++ b/src/utgcns/libcns/unitigConsensus.H
@@ -84,7 +84,8 @@ public:
map<uint32, gkRead *> *inPackageRead = NULL,
map<uint32, gkReadData *> *inPackageReadData = NULL);
- bool generatePBDAG(char aligner,
+ bool generatePBDAG(char aligner,
+ bool normalize,
tgTig *tig,
map<uint32, gkRead *> *inPackageRead = NULL,
map<uint32, gkReadData *> *inPackageReadData = NULL);
@@ -93,6 +94,10 @@ public:
map<uint32, gkRead *> *inPackageRead = NULL,
map<uint32, gkReadData *> *inPackageReadData = NULL);
+ bool generateSingleton(tgTig *tig,
+ map<uint32, gkRead *> *inPackageRead = NULL,
+ map<uint32, gkReadData *> *inPackageReadData = NULL);
+
int32 initialize(map<uint32, gkRead *> *inPackageRead,
map<uint32, gkReadData *> *inPackageReadData);
diff --git a/src/utgcns/libpbutgcns/Alignment.C b/src/utgcns/libpbutgcns/Alignment.C
deleted file mode 100644
index df7da23..0000000
--- a/src/utgcns/libpbutgcns/Alignment.C
+++ /dev/null
@@ -1,113 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Sergey Koren beginning on 2015-DEC-28
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <list>
-#include <vector>
-#include <cassert>
-#include "Alignment.H"
-
-using namespace dagcon;
-
-Alignment::Alignment() :
- start(0),
- end(0),
- frgid(""),
- qstr(""),
- tstr("") { }
-
-
-Alignment normalizeGaps(Alignment& aln) {
- size_t qlen = aln.qstr.length(), tlen = aln.tstr.length();
- assert(qlen == tlen);
- std::string qNorm, tNorm;
-
- // convert mismatches to indels
- for (size_t i=0; i < qlen; i++) {
- char qb = aln.qstr[i], tb = aln.tstr[i];
- if (qb != tb && qb != '-' && tb != '-') {
- qNorm += '-';
- qNorm += qb;
- tNorm += tb;
- tNorm += '-';
- } else {
- qNorm += qb;
- tNorm += tb;
- }
- }
-
- // update lengths
- qlen = qNorm.length();
- tlen = tNorm.length();
-
- // push gaps to the right, but not past the end
- for (size_t i=0; i < qlen-1; i++) {
- // pushing target gaps
- if (tNorm[i] == '-') {
- size_t j = i;
- while (true) {
- char c = tNorm[++j];
- if (c != '-' || j > qlen - 1) {
- if (c == qNorm[i]) {
- tNorm[i] = c;
- tNorm[j] = '-';
- }
- break;
- }
- }
- }
-
- // pushing query gaps
- if (qNorm[i] == '-') {
- size_t j = i;
- while (true) {
- char c = qNorm[++j];
- if (c != '-' || j > tlen - 1) {
- if (c == tNorm[i]) {
- qNorm[i] = c;
- qNorm[j] = '-';
- }
- break;
- }
- }
- }
- }
-
- // generate the final, normalized alignment strings
- Alignment finalNorm;
- finalNorm.frgid = aln.frgid;
- finalNorm.start = aln.start;
- for (size_t i=0; i < qlen; i++) {
- if (qNorm[i] != '-' || tNorm[i] != '-') {
- finalNorm.qstr += qNorm[i];
- finalNorm.tstr += tNorm[i];
- }
- }
-
- return finalNorm;
-}
diff --git a/src/utgcns/libpbutgcns/Alignment.H b/src/utgcns/libpbutgcns/Alignment.H
index 633260d..825491a 100644
--- a/src/utgcns/libpbutgcns/Alignment.H
+++ b/src/utgcns/libpbutgcns/Alignment.H
@@ -40,45 +40,46 @@
#include <string>
#include <stdint.h>
-namespace dagcon {
-class Alignment {
+class dagAlignment {
public:
- // conforming offsets are 1-based
- uint32_t start;
+ dagAlignment() {
+ start = 0;
+ end = 0;
- uint32_t end;
+ length = 0;
- // Fragment ID
- std::string frgid;
+ qstr = NULL;
+ tstr = NULL;
+ };
+ ~dagAlignment() {
+ delete [] qstr;
+ delete [] tstr;
+ };
- // query and target strings must be equal length
- std::string qstr;
- std::string tstr;
+ void clear(void) {
- Alignment();
-};
-}
+ delete [] qstr;
+ delete [] tstr;
+
+ start = 0;
+ end = 0;
+
+ length = 0;
+
+ qstr = NULL;
+ tstr = NULL;
+ };
+
+ void normalizeGaps(void);
-struct Unitig {
- uint32_t id;
- std::string seq;
+ uint32_t start; // 1-based!
+ uint32_t end;
+
+ uint32_t length;
+
+ char *qstr;
+ char *tstr;
};
-/// Simplifies the alignment by normalizing gaps. Converts mismatches into
-/// indels ...
-/// query: CAC query: C-AC
-/// | | ---> | |
-/// target: CGC target: CG-C
-///
-/// Shifts equivalent gaps to the right in the reference ...
-/// query: CAACAT query: CAACAT
-/// | | || ---> ||| |
-/// target: C-A-AT target: CAA--T
-///
-/// Shifts equivalent gaps to the right in the read ...
-/// query: -C--CGT query: CCG--T
-/// | | | ---> ||| |
-/// target: CCGAC-T target: CCGACT
-dagcon::Alignment normalizeGaps(dagcon::Alignment& aln);
#endif // __GCON_ALIGNMENT_HPP__
diff --git a/src/utgcns/libpbutgcns/AlnGraphBoost.C b/src/utgcns/libpbutgcns/AlnGraphBoost.C
index e0018be..8b89d1e 100644
--- a/src/utgcns/libpbutgcns/AlnGraphBoost.C
+++ b/src/utgcns/libpbutgcns/AlnGraphBoost.C
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2017-MAY-09
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -120,12 +124,12 @@ AlnGraphBoost::AlnGraphBoost(const size_t blen) {
_g[_exitVtx].backbone = true;
}
-void AlnGraphBoost::addAln(dagcon::Alignment& aln) {
+void AlnGraphBoost::addAln(dagAlignment& aln) {
IndexMap index = boost::get(boost::vertex_index, _g);
// tracks the position on the backbone
uint32_t bbPos = aln.start;
VtxDesc prevVtx = _enterVtx;
- for (size_t i = 0; i < aln.qstr.length(); i++) {
+ for (size_t i = 0; i < aln.length; i++) {
char queryBase = aln.qstr[i], targetBase = aln.tstr[i];
VtxDesc currVtx = index[bbPos];
// match
diff --git a/src/utgcns/libpbutgcns/AlnGraphBoost.H b/src/utgcns/libpbutgcns/AlnGraphBoost.H
index 25e3c6e..c3ea851 100644
--- a/src/utgcns/libpbutgcns/AlnGraphBoost.H
+++ b/src/utgcns/libpbutgcns/AlnGraphBoost.H
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2017-MAY-09
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -149,7 +153,7 @@ public:
/// Add alignment to the graph.
/// \param Alignment an alignment record (see Alignment.hpp)
- void addAln(dagcon::Alignment& aln);
+ void addAln(dagAlignment& aln);
/// Adds a new or increments an existing edge between two aligned bases.
/// \param u the 'from' vertex descriptor
diff --git a/src/utgcns/utgcns.C b/src/utgcns/utgcns.C
index bdf03be..51d6d1c 100644
--- a/src/utgcns/utgcns.C
+++ b/src/utgcns/utgcns.C
@@ -84,7 +84,9 @@ main (int argc, char **argv) {
char *inPackageName = NULL;
char algorithm = 'P';
- char aligner = 'N';
+ char aligner = 'E';
+ bool normalize = false; // Not used, left for future use.
+
uint32 numThreads = 0;
bool forceCompute = false;
@@ -100,6 +102,12 @@ main (int argc, char **argv) {
double maxCov = 0.0;
uint32 maxLen = UINT32_MAX;
+ bool onlyUnassem = false;
+ bool onlyBubble = false;
+ bool onlyContig = false;
+
+ bool noSingleton = false;
+
uint32 verbosity = 0;
argc = AS_configure(argc, argv);
@@ -123,7 +131,8 @@ main (int argc, char **argv) {
if (tigPart == 0)
fprintf(stderr, "invalid tigStore partition (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);
- } else if (strcmp(argv[arg], "-u") == 0) {
+ } else if ((strcmp(argv[arg], "-u") == 0) ||
+ (strcmp(argv[arg], "-tig") == 0)) {
AS_UTL_decodeRange(argv[++arg], utgBgn, utgEnd);
} else if (strcmp(argv[arg], "-t") == 0) {
@@ -147,9 +156,15 @@ main (int argc, char **argv) {
algorithm = 'P';
} else if (strcmp(argv[arg], "-utgcns") == 0) {
algorithm = 'U';
+
} else if (strcmp(argv[arg], "-edlib") == 0) {
aligner = 'E';
+ } else if (strcmp(argv[arg], "-normalize") == 0) {
+ normalize = true;
+ } else if (strcmp(argv[arg], "-nonormalize") == 0) {
+ normalize = false;
+
} else if (strcmp(argv[arg], "-threads") == 0) {
numThreads = atoi(argv[++arg]);
@@ -183,6 +198,18 @@ main (int argc, char **argv) {
} else if (strcmp(argv[arg], "-maxlength") == 0) {
maxLen = atof(argv[++arg]);
+ } else if (strcmp(argv[arg], "-onlyunassem") == 0) {
+ onlyUnassem = true;
+
+ } else if (strcmp(argv[arg], "-onlybubble") == 0) {
+ onlyBubble = true;
+
+ } else if (strcmp(argv[arg], "-onlycontig") == 0) {
+ onlyContig = true;
+
+ } else if (strcmp(argv[arg], "-nosingleton") == 0) {
+ noSingleton = true;
+
} else {
fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
err++;
@@ -202,26 +229,29 @@ main (int argc, char **argv) {
if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL))
err++;
+ if ((algorithm != 'Q') && (algorithm != 'P') && (algorithm != 'U'))
+ err++;
+
if (err) {
fprintf(stderr, "usage: %s [opts]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, " INPUT\n");
fprintf(stderr, " -G g Load reads from gkStore 'g'\n");
- fprintf(stderr, " -T t v p Load unitigs from tgStore 't', version 'v', partition 'p'.\n");
+ fprintf(stderr, " -T t v p Load tig from tgStore 't', version 'v', partition 'p'.\n");
fprintf(stderr, " Expects reads will be in gkStore partition 'p' as well\n");
fprintf(stderr, " Use p='.' to specify no partition\n");
- fprintf(stderr, " -t file Test the computation of the unitig layout in 'file'\n");
+ fprintf(stderr, " -t file Test the computation of the tig layout in 'file'\n");
fprintf(stderr, " 'file' can be from:\n");
fprintf(stderr, " 'tgStoreDump -d layout' (human readable layout format)\n");
fprintf(stderr, " 'utgcns -L' (human readable layout format)\n");
fprintf(stderr, " 'utgcns -O' (binary multialignment format)\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -p package Load unitig and read from 'package' created with -P. This\n");
+ fprintf(stderr, " -p package Load tig and reads from 'package' created with -P. This\n");
fprintf(stderr, " is usually used by developers.\n");
fprintf(stderr, "\n");
fprintf(stderr, "\n");
fprintf(stderr, " ALGORITHM\n");
- fprintf(stderr, " -quick No alignments, just paste read sequence into the unitig positions.\n");
+ fprintf(stderr, " -quick No alignments, just paste read sequence into the tig positions.\n");
fprintf(stderr, " This is very fast, but the consensus sequence is formed from a mosaic\n");
fprintf(stderr, " of read sequences, and there can be large indel. This is useful for\n");
fprintf(stderr, " checking intermediate assembly structure by mapping to reference, or\n");
@@ -235,22 +265,37 @@ main (int argc, char **argv) {
fprintf(stderr, " output.\n");
fprintf(stderr, "\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " ALIGNER\n");
+ fprintf(stderr, " -edlib Myers' O(ND) algorithm from Edlib (https://github.com/Martinsos/edlib).\n");
+ fprintf(stderr, " This is the default (and, yes, there is no non-default aligner).\n");
+ //fprintf(stderr, "\n");
+ //fprintf(stderr, " -normalize Shift gaps to one side. Probably not useful anymore.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "\n");
fprintf(stderr, " OUTPUT\n");
fprintf(stderr, " -O results Write computed tigs to binary output file 'results'\n");
fprintf(stderr, " -L layouts Write computed tigs to layout output file 'layouts'\n");
fprintf(stderr, " -A fasta Write computed tigs to fasta output file 'fasta'\n");
fprintf(stderr, " -Q fastq Write computed tigs to fastq output file 'fastq'\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -P package Create a copy of the inputs needed to compute the unitigs. This\n");
- fprintf(stderr, " file can then be sent to the developers for debugging. The unitig(s)\n");
+ fprintf(stderr, " -P package Create a copy of the inputs needed to compute the tigs. This\n");
+ fprintf(stderr, " file can then be sent to the developers for debugging. The tig(s)\n");
fprintf(stderr, " are not processed and no other outputs are created. Ideally,\n");
- fprintf(stderr, " only one unitig is selected (-u, below).\n");
+ fprintf(stderr, " only one tig is selected (-u, below).\n");
+ fprintf(stderr, "\n");
fprintf(stderr, "\n");
fprintf(stderr, " TIG SELECTION (if -T input is used)\n");
- fprintf(stderr, " -u b Compute only unitig ID 'b' (must be in the correct partition!)\n");
- fprintf(stderr, " -u b-e Compute only unitigs from ID 'b' to ID 'e'\n");
- fprintf(stderr, " -f Recompute unitigs that already have a multialignment\n");
- fprintf(stderr, " -maxlength l Do not compute consensus for unitigs longer than l bases.\n");
+ fprintf(stderr, " -tig b Compute only tig ID 'b' (must be in the correct partition!)\n");
+ fprintf(stderr, " -tig b-e Compute only tigs from ID 'b' to ID 'e'\n");
+ fprintf(stderr, " -u Alias for -tig\n");
+ fprintf(stderr, " -f Recompute tigs that already have a multialignment\n");
+ fprintf(stderr, " -maxlength l Do not compute consensus for tigs longer than l bases.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -onlyunassem Only compute consensus for unassembled tigs.\n");
+ fprintf(stderr, " -onlybubble Only compute consensus for bubble tigs (there are no bubbles).\n");
+ fprintf(stderr, " -onlycontig Only compute consensus for real unitigs/contigs.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -nosingleton Do not compute consensus for singleton (single-read) tigs.\n");
fprintf(stderr, "\n");
fprintf(stderr, " PARAMETERS\n");
fprintf(stderr, " -e e Expect alignments at up to fraction e error\n");
@@ -259,6 +304,7 @@ main (int argc, char **argv) {
fprintf(stderr, " -maxcoverage c Use non-contained reads and the longest contained reads, up to\n");
fprintf(stderr, " C coverage, for consensus generation. The default is 0, and will\n");
fprintf(stderr, " use all reads.\n");
+ fprintf(stderr, " -threads t Use 't' compute threads; default 1.\n");
fprintf(stderr, "\n");
fprintf(stderr, " LOGGING\n");
fprintf(stderr, " -v Show multialigns.\n");
@@ -270,7 +316,10 @@ main (int argc, char **argv) {
fprintf(stderr, "ERROR: No gkpStore (-G) and no package (-p) supplied.\n");
if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL))
- fprintf(stderr, "ERROR: No tigStore (-T) OR no test unitig (-t) OR no package (-p) supplied.\n");
+ fprintf(stderr, "ERROR: No tigStore (-T) OR no test tig (-t) OR no package (-p) supplied.\n");
+
+ if ((algorithm != 'Q') && (algorithm != 'P') && (algorithm != 'U'))
+ fprintf(stderr, "ERROR: Invalid algorithm '%c' specified; must be one of -quick, -pbdagcon, -utgcns.\n", algorithm);
exit(1);
}
@@ -357,7 +406,7 @@ main (int argc, char **argv) {
fprintf(stderr, "sizeof(abAbacus) " F_SIZE_T "\n", sizeof(abAbacus));
fprintf(stderr, "sizeof(abSequence) " F_SIZE_T "\n", sizeof(abSequence));
- // Decide on what to compute. Either all unitigs, or a single unitig, or a special case test.
+ // Decide on what to compute. Either all tigs, or a single tig, or a special case test.
uint32 b = 0;
uint32 e = UINT32_MAX;
@@ -375,12 +424,12 @@ main (int argc, char **argv) {
e = utgEnd;
}
- fprintf(stderr, "-- Computing unitig consensus for b=" F_U32 " to e=" F_U32 " with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
+ fprintf(stderr, "-- Computing consensus for b=" F_U32 " to e=" F_U32 " with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
b, e, errorRate, errorRateMax, minOverlap);
}
else {
- fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
+ fprintf(stderr, "-- Computing consensus with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
errorRate, errorRateMax, minOverlap);
}
@@ -392,27 +441,33 @@ main (int argc, char **argv) {
tgTig *tig = NULL;
// If a tigStore, load the tig. The tig is the owner; it cannot be deleted by us.
- if (tigStore)
+
+ if (tigStore) {
tig = tigStore->loadTig(ti);
+ }
- // If a tigFile or a package, create a new tig and fill it. Obviously, we own it.
- if (tigFile || inPackageFile) {
+ // If a tigFile, create a new tig and load it. Obviously, we own it.
+
+ if (tigFile) {
tig = new tgTig();
- if (tig->loadFromStreamOrLayout((tigFile != NULL) ? tigFile : inPackageFile) == false) {
+ if (tig->loadFromStreamOrLayout(tigFile) == false) {
delete tig;
break;
}
}
- // No tig loaded, keep going.
+ // If a package, create a new tig and load it. Obviously, we own it. If the tig loads,
+ // populate the read and readData maps with data from the package.
- if (tig == NULL)
- continue;
+ if (inPackageFile) {
+ tig = new tgTig();
- // If a package, populate the read and readData maps with data from the package.
+ if (tig->loadFromStreamOrLayout(inPackageFile) == false) {
+ delete tig;
+ break;
+ }
- if (inPackageFile) {
inPackageRead = new map<uint32, gkRead *>;
inPackageReadData = new map<uint32, gkReadData *>;
@@ -430,6 +485,11 @@ main (int argc, char **argv) {
}
}
+ // No tig loaded, keep going.
+
+ if (tig == NULL)
+ continue;
+
// More 'not liking' - set the verbosity level for logging.
tig->_utgcns_verboseLevel = verbosity;
@@ -444,35 +504,44 @@ main (int argc, char **argv) {
missingReads++;
if (missingReads) {
- //fprintf(stderr, "SKIP unitig %u with %u reads found only %u reads in partition, skipped\n",
+ //fprintf(stderr, "SKIP tig %u with %u reads found only %u reads in partition, skipped\n",
// tig->tigID(), tig->numberOfChildren(), tig->numberOfChildren() - missingReads);
continue;
}
}
- if (tig->length(true) > maxLen) {
- fprintf(stderr, "SKIP unitig %d of length %d (%d children) - too long, skipped\n",
- tig->tigID(), tig->length(true), tig->numberOfChildren());
+ // Skip stuff we want to skip.
+
+ if (tig->length(true) > maxLen)
continue;
- }
- if (tig->numberOfChildren() == 0) {
- fprintf(stderr, "SKIP unitig %d of length %d (%d children) - no children, skipped\n",
- tig->tigID(), tig->length(true), tig->numberOfChildren());
+ if ((onlyUnassem == true) && (tig->_class != tgTig_unassembled))
+ continue;
+
+ if ((onlyContig == true) && (tig->_class != tgTig_contig))
+ continue;
+
+ if ((onlyBubble == true) && (tig->_class != tgTig_bubble))
+ continue;
+
+ if ((noSingleton == true) && (tig->numberOfChildren() == 1))
+ continue;
+
+ if (tig->numberOfChildren() == 0)
continue;
- }
+
+
+ // Process the tig. Remove deep coverage, create a consensus object, process it, and report the results.
+ // before we add it to the store.
bool exists = tig->consensusExists();
if (tig->numberOfChildren() > 1)
- fprintf(stderr, "Working on unitig %d of length %d (%d children)%s%s\n",
+ fprintf(stderr, "Working on tig %d of length %d (%d children)%s%s\n",
tig->tigID(), tig->length(true), tig->numberOfChildren(),
((exists == true) && (forceCompute == false)) ? " - already computed" : "",
((exists == true) && (forceCompute == true)) ? " - already computed, recomputing" : "");
- // Process the tig. Remove deep coverage, create a consensus object, process it, and report the results.
- // before we add it to the store.
-
unitigConsensus *utgcns = new unitigConsensus(gkpStore, errorRate, errorRateMax, minOverlap);
savedChildren *origChildren = NULL;
bool success = exists;
@@ -490,7 +559,7 @@ main (int argc, char **argv) {
if (outPackageFile) {
utgcns->savePackage(outPackageFile, tig);
- fprintf(stderr, " Packaged unitig %u into '%s'\n", tig->tigID(), outPackageName);
+ fprintf(stderr, " Packaged tig %u into '%s'\n", tig->tigID(), outPackageName);
}
// Compute consensus if it doesn't exist, or if we're forcing a recompute. But only if we
@@ -500,21 +569,29 @@ main (int argc, char **argv) {
((exists == false) || (forceCompute == true))) {
origChildren = stashContains(tig, maxCov, true);
- switch (algorithm) {
- case 'Q':
- success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData);
- break;
- case 'P':
- default:
- success = utgcns->generatePBDAG(aligner, tig, inPackageRead, inPackageReadData);
- break;
- case 'U':
- success = utgcns->generate(tig, inPackageRead, inPackageReadData);
- break;
+ if (tig->numberOfChildren() == 1) {
+ success = utgcns->generateSingleton(tig, inPackageRead, inPackageReadData);
+ }
+
+ else if (algorithm == 'Q') {
+ success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData);
+ }
+
+ else if (algorithm == 'P') {
+ success = utgcns->generatePBDAG(aligner, normalize, tig, inPackageRead, inPackageReadData);
+ }
+
+ else if (algorithm == 'U') {
+ success = utgcns->generate(tig, inPackageRead, inPackageReadData);
+ }
+
+ else {
+ fprintf(stderr, "Invalid algorithm. How'd you do this?\n");
+ assert(0);
}
}
- // If it was successful (or existed already), output. Success is always false if the unitig
+ // If it was successful (or existed already), output. Success is always false if the tig
// was packaged, regardless of if it existed already.
if (success == true) {
@@ -539,7 +616,7 @@ main (int argc, char **argv) {
// Report failures.
if ((success == false) && (outPackageFile == NULL)) {
- fprintf(stderr, "unitigConsensus()-- unitig %d failed.\n", tig->tigID());
+ fprintf(stderr, "unitigConsensus()-- tig %d failed.\n", tig->tigID());
numFailures++;
}
@@ -567,7 +644,7 @@ main (int argc, char **argv) {
if (inPackageFile) fclose(inPackageFile);
if (numFailures) {
- fprintf(stderr, "WARNING: Total number of unitig failures = %d\n", numFailures);
+ fprintf(stderr, "WARNING: Total number of tig failures = %d\n", numFailures);
fprintf(stderr, "\n");
fprintf(stderr, "Consensus did NOT finish successfully.\n");
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/canu.git
More information about the debian-med-commit
mailing list