[med-svn] [canu] 01/04: Imported Upstream version 1.4+dfsg
Afif Elghraoui
afif at moszumanska.debian.org
Tue Mar 21 02:57:59 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository canu.
commit a8dd66496c4feba3182f770869bd5c98a013597b
Author: Afif Elghraoui <afif at debian.org>
Date: Mon Mar 20 22:37:55 2017 -0400
Imported Upstream version 1.4+dfsg
---
README.citation | 3 +-
README.md | 21 +-
addCopyrights-BuildData.pl | 1 +
addCopyrights.dat | 868 +++++
addCopyrights.pl | 36 +-
documentation/source/canu-overlaps.svg | 2836 ++++++++++++++++
documentation/source/canu-pipeline.svg | 3528 ++++++++++++++++++++
documentation/source/faq.rst | 249 +-
documentation/source/index.rst | 21 +-
documentation/source/overlap_transformations.svg | 500 +++
documentation/source/overlaps.svg | 384 +++
documentation/source/parameter-reference.rst | 17 +
documentation/source/pipeline.rst | 30 +-
documentation/source/quick-start.rst | 2 +-
documentation/source/repeat-spanned.svg | 214 ++
documentation/source/repeat-unspanned.svg | 232 ++
documentation/source/tutorial.rst | 46 +-
src/AS_RUN/fragmentDepth.C | 466 ---
src/AS_RUN/replaceIIDwithName-overlapDump.pl | 70 -
src/AS_RUN/replaceUIDwithName-fastq.pl | 153 -
src/AS_RUN/replaceUIDwithName-posmap.pl | 98 -
src/AS_UTL/AS_UTL_alloc.C | 48 +-
src/AS_UTL/AS_UTL_alloc.H | 34 +-
src/AS_UTL/AS_UTL_fileIO.C | 224 +-
src/AS_UTL/AS_UTL_fileIO.H | 22 +
src/AS_UTL/AS_UTL_stackTrace.C | 2 +-
src/AS_UTL/bitPackedArray.C | 8 +-
src/AS_UTL/bitPackedArray.H | 24 +-
src/AS_UTL/bitPackedFile.C | 21 +-
src/AS_UTL/bitPackedFile.H | 6 +-
src/AS_UTL/hexDump.C | 85 +
.../SimpleAligner.H => AS_UTL/hexDump.H} | 18 +-
src/AS_UTL/intervalList.H | 8 +-
src/AS_UTL/intervalListTest.C | 42 +-
src/AS_UTL/kMer.C | 4 +-
src/AS_UTL/kMerHuge.H | 14 +-
src/AS_UTL/memoryMappedFile.H | 4 +-
src/AS_UTL/mt19937ar.C | 8 +
src/AS_UTL/mt19937ar.H | 1 +
src/AS_UTL/mt19937arTest.C | 19 +
src/AS_UTL/readBuffer.C | 6 +-
src/AS_UTL/stddev.H | 16 +-
src/AS_UTL/sweatShop.C | 18 +-
src/AS_UTL/writeBuffer.H | 93 +
src/AS_global.C | 40 +-
src/AS_global.H | 56 +-
src/Makefile | 8 +-
src/bogart/AS_BAT_AssemblyGraph.C | 971 ++++++
src/bogart/AS_BAT_AssemblyGraph.H | 133 +
src/bogart/AS_BAT_BestOverlapGraph.C | 592 ++--
src/bogart/AS_BAT_BestOverlapGraph.H | 122 +-
src/bogart/AS_BAT_ChunkGraph.C | 144 +-
src/bogart/AS_BAT_ChunkGraph.H | 24 +-
src/bogart/AS_BAT_CreateUnitigs.C | 514 +++
...{AS_BAT_PopBubbles.H => AS_BAT_CreateUnitigs.H} | 41 +-
src/bogart/AS_BAT_FragmentInfo.C | 179 -
src/bogart/AS_BAT_Instrumentation.C | 441 +--
src/bogart/AS_BAT_Instrumentation.H | 16 +-
src/bogart/AS_BAT_Logging.C | 91 +-
src/bogart/AS_BAT_Logging.H | 15 +-
src/bogart/AS_BAT_MarkRepeatReads.C | 1276 ++++---
src/bogart/AS_BAT_MarkRepeatReads.H | 10 +-
.../{AS_BAT_PopBubbles.C => AS_BAT_MergeOrphans.C} | 517 ++-
.../{AS_BAT_PopBubbles.H => AS_BAT_MergeOrphans.H} | 10 +-
src/bogart/AS_BAT_MergeUnitigs.C | 246 --
src/bogart/AS_BAT_Outputs.C | 464 +--
src/bogart/AS_BAT_Outputs.H | 15 +-
src/bogart/AS_BAT_OverlapCache.C | 1104 +++---
src/bogart/AS_BAT_OverlapCache.H | 124 +-
src/bogart/AS_BAT_PlaceContains.C | 105 +-
src/bogart/AS_BAT_PlaceContains.H | 8 +-
src/bogart/AS_BAT_PlaceFragUsingOverlaps.C | 600 ----
src/bogart/AS_BAT_PlaceReadUsingOverlaps.C | 688 ++++
...gOverlaps.H => AS_BAT_PlaceReadUsingOverlaps.H} | 102 +-
src/bogart/AS_BAT_PopBubbles.txt | 2 +-
src/bogart/AS_BAT_PopulateUnitig.C | 117 +-
src/bogart/AS_BAT_PopulateUnitig.H | 4 +-
src/bogart/AS_BAT_PromoteToSingleton.C | 46 +-
src/bogart/AS_BAT_PromoteToSingleton.H | 2 +-
src/bogart/AS_BAT_ReadInfo.C | 83 +
.../{AS_BAT_FragmentInfo.H => AS_BAT_ReadInfo.H} | 73 +-
src/bogart/AS_BAT_ReconstructRepeats.C | 94 -
src/bogart/AS_BAT_SetParentAndHang.C | 28 +-
src/bogart/AS_BAT_SetParentAndHang.H | 4 +-
src/bogart/AS_BAT_SplitDiscontinuous.C | 219 +-
src/bogart/AS_BAT_SplitDiscontinuous.H | 5 +-
src/bogart/AS_BAT_TigGraph.C | 463 +++
.../{AS_BAT_PopBubbles.H => AS_BAT_TigGraph.H} | 19 +-
.../{AS_BAT_UnitigVector.C => AS_BAT_TigVector.C} | 73 +-
.../{AS_BAT_UnitigVector.H => AS_BAT_TigVector.H} | 43 +-
src/bogart/AS_BAT_Unitig.C | 243 +-
src/bogart/AS_BAT_Unitig.H | 111 +-
...AT_Unitig_AddFrag.C => AS_BAT_Unitig_AddRead.C} | 35 +-
src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C | 275 --
src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C | 270 ++
src/bogart/addReadsToUnitigs.C | 31 +-
src/bogart/analyzeBest.C | 14 +-
src/bogart/bogart.C | 360 +-
src/bogart/bogart.mk | 17 +-
src/bogart/buildGraph.C | 403 ---
src/bogart/findOverlappingReads.pl | 120 +
.../plotErrorProfile.pl} | 52 +-
src/bogus/bogus.C | 16 +-
src/bogus/bogusness.C | 4 +-
src/correction/filterCorrectionOverlaps.C | 34 +-
src/correction/generateCorrectionLayouts.C | 20 +-
src/erateEstimate/erateEstimate.C | 54 +-
src/falcon_sense/createFalconSenseInputs.C | 4 +-
src/falcon_sense/falcon_sense.C | 9 +-
src/falcon_sense/outputFalcon.C | 4 +-
src/fastq-utilities/fastqAnalyze.C | 18 +-
src/fastq-utilities/fastqSample.C | 41 +-
src/fastq-utilities/fastqSimulate-sort.C | 4 +-
src/fastq-utilities/fastqSimulate.C | 31 +-
src/main.mk | 23 +-
src/merTrim/merTrim.C | 30 +-
src/merTrim/merTrimResult.H | 4 +-
src/mercy/mercy-regions.C | 30 +-
src/mercy/mercy.C | 44 +-
src/meryl/compare-counts.C | 6 +-
src/meryl/estimate-mer-threshold.C | 26 +-
src/meryl/leaff-blocks.C | 10 +-
src/meryl/leaff-duplicates.C | 14 +-
src/meryl/leaff-gc.C | 6 +-
src/meryl/leaff-partition.C | 22 +-
src/meryl/leaff-statistics.C | 24 +-
src/meryl/leaff.C | 22 +-
src/meryl/libleaff/fastaFile.C | 19 +-
src/meryl/libleaff/fastaStdin.C | 6 +-
src/meryl/libleaff/fastqFile.C | 21 +-
src/meryl/libleaff/fastqStdin.C | 9 +-
src/meryl/libleaff/gkStoreFile.C | 2 +
src/meryl/libleaff/seqCache.C | 8 +-
src/meryl/libleaff/seqFactory.C | 8 +-
src/meryl/libleaff/seqStore.C | 20 +-
src/meryl/libleaff/seqStream.C | 24 +-
src/meryl/libleaff/sffFile.C | 5 +
src/meryl/libmeryl.C | 70 +-
src/meryl/libmeryl.H | 4 +
src/meryl/maskMers.C | 33 +-
src/{bogart/buildGraph.mk => meryl/maskMers.mk} | 12 +-
src/meryl/meryl-args.C | 24 +-
src/meryl/meryl-binaryOp.C | 8 +-
src/meryl/meryl-build-threads.C | 123 -
src/meryl/meryl-build.C | 166 +-
src/meryl/meryl-dump.C | 26 +-
src/meryl/meryl-estimate.C | 6 +-
src/meryl/meryl-merge.C | 6 +-
src/meryl/meryl.H | 2 -
src/meryl/meryl.mk | 1 -
src/meryl/simple.C | 12 +-
src/mhap/mhap.mk | 4 +-
src/mhap/mhapConvert.C | 16 +-
src/minimap/mmapConvert.C | 6 +-
src/overlapBasedTrimming/splitReads-subReads.C | 2 +-
src/overlapBasedTrimming/splitReads.C | 62 +-
src/overlapBasedTrimming/trimReads-bestEdge.C | 6 +-
.../trimReads-largestCovered.C | 8 +-
src/overlapBasedTrimming/trimReads-quality.C | 4 +-
src/overlapBasedTrimming/trimReads.C | 32 +-
src/overlapBasedTrimming/trimStat.H | 6 +-
src/overlapErrorAdjustment/analyzeAlignment.C | 28 +-
.../correctOverlaps-Correct_Frags.C | 12 +-
.../correctOverlaps-Prefix_Edit_Distance.C | 2 +-
.../correctOverlaps-Read_Olaps.C | 6 +-
.../correctOverlaps-Redo_Olaps.C | 20 +-
src/overlapErrorAdjustment/correctOverlaps.C | 7 +-
src/overlapErrorAdjustment/correctOverlaps.H | 13 +-
.../findErrors-Process_Olap.C | 11 +-
src/overlapErrorAdjustment/findErrors-Read_Frags.C | 10 +-
src/overlapErrorAdjustment/findErrors-Read_Olaps.C | 6 +-
src/overlapErrorAdjustment/findErrors.C | 84 +-
src/overlapErrorAdjustment/findErrors.H | 20 +-
src/overlapInCore/libedlib/edlib.C | 1394 ++++++++
src/overlapInCore/libedlib/edlib.H | 270 ++
src/overlapInCore/overlapConvert.C | 13 +-
src/overlapInCore/overlapImport.C | 95 +-
src/overlapInCore/overlapInCore-Build_Hash_Index.C | 36 +-
src/overlapInCore/overlapInCore-Find_Overlaps.C | 16 +-
src/overlapInCore/overlapInCore-Output.C | 28 +-
src/overlapInCore/overlapInCore-Process_Overlaps.C | 52 +-
.../overlapInCore-Process_String_Overlaps.C | 18 +
src/overlapInCore/overlapInCore.C | 129 +-
src/overlapInCore/overlapInCore.H | 18 +-
src/overlapInCore/overlapInCorePartition.C | 92 +-
src/overlapInCore/overlapPair.C | 377 +--
src/overlapInCore/overlapPair.mk | 2 +-
src/overlapInCore/overlapReadCache.C | 6 +-
src/pipelines/canu.pl | 81 +-
src/pipelines/canu/Configure.pm | 14 +-
src/pipelines/canu/Consensus.pm | 354 +-
src/pipelines/canu/CorrectReads.pm | 94 +-
src/pipelines/canu/Defaults.pm | 495 ++-
src/pipelines/canu/ErrorEstimate.pm | 6 +-
src/pipelines/canu/Execution.pm | 534 ++-
src/pipelines/canu/Gatekeeper.pm | 60 +-
src/pipelines/canu/Grid.pm | 11 +-
src/pipelines/canu/Grid_PBSTorque.pm | 141 +-
src/pipelines/canu/Grid_Slurm.pm | 10 +-
src/pipelines/canu/HTML.pm | 255 +-
src/pipelines/canu/Meryl.pm | 15 +-
src/pipelines/canu/Output.pm | 178 +-
src/pipelines/canu/OverlapErrorAdjustment.pm | 82 +-
src/pipelines/canu/OverlapInCore.pm | 35 +-
src/pipelines/canu/OverlapMMap.pm | 41 +-
src/pipelines/canu/OverlapMhap.pm | 63 +-
src/pipelines/canu/OverlapStore.pm | 88 +-
src/pipelines/canu/Unitig.pm | 66 +-
src/pipelines/install-perl-libraries.sh | 110 -
src/pipelines/sanity/build-all-wgs-revisions.pl | 173 -
src/pipelines/sanity/compile-all-wgs-revisions.pl | 201 --
src/pipelines/sanity/sanity-asm-done.pl | 132 -
src/pipelines/sanity/sanity-get-next-date.pl | 66 -
src/pipelines/sanity/sanity-purge-old.pl | 210 --
src/pipelines/sanity/sanity-update-reference.pl | 93 -
src/pipelines/sanity/sanity.pl | 500 ++-
src/pipelines/sanity/sanity.sh | 77 +-
src/stores/gatekeeperCreate.C | 155 +-
src/stores/gatekeeperDumpFASTQ.C | 56 +-
src/stores/gatekeeperDumpMetaData.C | 10 +-
src/stores/gatekeeperPartition.C | 173 +-
src/stores/gkStore.C | 485 +--
src/stores/gkStore.H | 149 +-
src/stores/libsnappy/snappy-internal.h | 237 ++
src/stores/libsnappy/snappy-sinksource.cc | 104 +
src/stores/libsnappy/snappy-sinksource.h | 207 ++
src/stores/libsnappy/snappy-stubs-internal.cc | 42 +
src/stores/libsnappy/snappy-stubs-internal.h | 553 +++
src/stores/libsnappy/snappy-stubs-public.h | 125 +
src/stores/libsnappy/snappy.cc | 1400 ++++++++
src/stores/libsnappy/snappy.h | 228 ++
src/stores/ovOverlap.C | 16 +-
src/stores/{ovStore.H => ovOverlap.H} | 408 +--
src/stores/ovStore.C | 1150 +------
src/stores/ovStore.H | 650 ++--
src/stores/ovStoreBucketizer.C | 109 +-
src/stores/ovStoreBuild.C | 373 ++-
src/stores/ovStoreDump.C | 143 +-
src/stores/ovStoreFile.C | 416 ++-
src/stores/ovStoreFile.H | 120 +
src/stores/ovStoreFilter.C | 293 ++
.../ovStoreFilter.H} | 16 +-
src/stores/ovStoreHistogram.C | 447 +++
src/stores/ovStoreHistogram.H | 115 +
src/stores/ovStoreIndexer.C | 97 +-
src/stores/ovStoreSorter.C | 209 +-
src/stores/ovStoreStats.C | 38 +-
src/stores/ovStoreWriter.C | 757 +++++
src/stores/tgStore.C | 25 +-
src/stores/tgStore.H | 4 +-
src/stores/tgStoreCompress.C | 8 +-
src/stores/tgStoreCoverageStat.C | 8 +-
src/stores/tgStoreDump.C | 59 +-
src/stores/tgStoreFilter.C | 20 +-
src/stores/tgTig.C | 55 +-
src/stores/tgTig.H | 2 +
src/stores/tgTigMultiAlignDisplay.C | 10 +-
src/stores/tgTigSizeAnalysis.C | 9 +-
src/utgcns/libNDFalcon/dw.C | 6 +
src/utgcns/libNDFalcon/dw.H | 2 -
src/utgcns/libNDalign/NDalign.C | 2 +-
src/utgcns/libcns/abAbacus-refine.C | 17 +-
src/utgcns/libcns/unitigConsensus.C | 290 +-
src/utgcns/libpbutgcns/SimpleAligner.C | 59 -
src/utgcns/stashContains.C | 15 +-
src/utgcns/stashContains.H | 4 +-
src/utgcns/utgcns.C | 19 +-
267 files changed, 27367 insertions(+), 13101 deletions(-)
diff --git a/README.citation b/README.citation
index e31960f..4e922ed 100644
--- a/README.citation
+++ b/README.citation
@@ -1,2 +1 @@
-Berlin K, Koren S, Chin CS, Drake PJ, Landolin JM, Phillippy AM Assembling Large Genomes with Single-Molecule Sequencing and Locality Sensitive Hashing. Nature Biotechnology. (2015).
-Stay tuned for a Canu-specific citation
+Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation. bioRxiv 071282; doi: http://dx.doi.org/10.1101/071282
diff --git a/README.md b/README.md
index da8c900..ccb8252 100644
--- a/README.md
+++ b/README.md
@@ -2,35 +2,36 @@
Canu is a fork of the [Celera Assembler](http://wgs-assembler.sourceforge.net/wiki/index.php?title=Main_Page), designed for high-noise single-molecule sequencing (such as the [PacBio](http://www.pacb.com) [RS II](http://www.pacb.com/products-and-services/pacbio-systems/rsii/) or [Oxford Nanopore](https://www.nanoporetech.com/) [MinION](https://www.nanoporetech.com/products-services/minion-mki)).
-Canu is a hierachical assembly pipeline which runs in four steps:
+Canu is a hierarchical assembly pipeline which runs in four steps:
* Detect overlaps in high-noise sequences using [MHAP](https://github.com/marbl/MHAP)
* Generate corrected sequence consensus
* Trim corrected sequences
* Assemble trimmed corrected sequences
-## Build:
+## Install:
+
+The easiest way to get started is to download a [release](http://github.com/marbl/canu/releases).
+
+Alternatively, you can also build the latest unreleased version from GitHub:
git clone https://github.com/marbl/canu.git
cd canu/src
make -j <number of threads>
+## Learn:
+
+The [quick start](http://canu.readthedocs.io/en/stable/quick-start.html) will get you assembling quickly, while the [tutorial](http://canu.readthedocs.io/en/stable/tutorial.html) explains things in more detail.
+
## Run:
Brief command line help:
../<architecture>/bin/canu
-
Full list of parameters:
../<architecture>/bin/canu -options
-
-## Learn:
-
-The [quick start](http://canu.readthedocs.io/en/stable/quick-start.html) will get you assembling quickly, while the [tutorial](http://canu.readthedocs.io/en/stable/tutorial.html) explains things in more detail.
## Citation:
-
- - Berlin K, Koren S, Chin CS, Drake PJ, Landolin JM, Phillippy AM [Assembling Large Genomes with Single-Molecule Sequencing and Locality Sensitive Hashing](http://www.nature.com/nbt/journal/v33/n6/abs/nbt.3238.html). Nature Biotechnology. (2015).
- - Stay tuned for a Canu-specific citation
+ - Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. [Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation](http://dx.doi.org/10.1101/071282). bioRxiv. (2016).
diff --git a/addCopyrights-BuildData.pl b/addCopyrights-BuildData.pl
index 82362c2..fd1a618 100644
--- a/addCopyrights-BuildData.pl
+++ b/addCopyrights-BuildData.pl
@@ -32,6 +32,7 @@ $stoppingCommits{"6950cb74e302a97673a5ba482b3b8992eea72c37"} = 1; # 20 AUG 20
$stoppingCommits{"72c27c95d61cb8f37e859c4039456eb2acc5c55b"} = 1; # 19 NOV 2015 - Second copyright addition.
$stoppingCommits{"b2df5790f77d38cc31fe77a7f65360e02389f92e"} = 1; # 04 MAR 2016
$stoppingCommits{"1ef335952342ef06ad1651a888f09c312f54dab8"} = 1; # 18 MAY 2016
+$stoppingCommits{"bbbdcd063560e5f86006ee6b8b96d2d7b80bb750"} = 1; # 21 NOV 2016
open(F, "< logs") or die "Failed to open 'logs': $!\n";
diff --git a/addCopyrights.dat b/addCopyrights.dat
index dba4a79..cf41bea 100644
--- a/addCopyrights.dat
+++ b/addCopyrights.dat
@@ -10239,3 +10239,871 @@ A src/AS_UTL/memoryMappedFile.H nihh20160310Brian P. Walenz
A documentation/source/quick-start.rst nihh20160309Sergey Koren
A src/pipelines/canu/CorrectReads.pm nihh20160309Sergey Koren
A documentation/source/quick-start.rst nihh20160307Sergey Koren
+D src/bogart/TigVector.C src/bogart/UnitigVector.C
+D src/bogart/TigVector.H src/bogart/UnitigVector.H
+D src/bogart/AS_BAT_PlaceReadUsingOverlaps.C src/bogart/AS_BAT_PlaceFragUsingOverlaps.C
+D src/bogart/AS_BAT_PlaceReadUsingOverlaps.H src/bogart/AS_BAT_PlaceFragUsingOverlaps.H
+D src/bogart/AS_BAT_Unitig_AddRead.C src/bogart/AS_BAT_Unitig_AddFrag.C
+D src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C
+D src/stores/ovOverlap.H src/stores/ovStore.H
+D src/stores/ovStoreFile.H src/stores/ovStore.H
+D src/stores/ovStoreFilter.C src/stores/ovStore.C
+D src/stores/ovStoreFilter.H src/stores/ovStore.H
+D src/stores/ovStoreWriter.C src/stores/ovStore.C
+A src/bogart/AS_BAT_TigGraph.C nihh20161121Brian P. Walenz
+A src/stores/gatekeeperCreate.C nihh20161121Brian P. Walenz
+A src/stores/gkStore.C nihh20161121Brian P. Walenz
+A src/stores/gkStore.H nihh20161121Brian P. Walenz
+A src/stores/gkStore.C nihh20161118Brian P. Walenz
+A src/stores/gkStore.H nihh20161118Brian P. Walenz
+A src/AS_UTL/writeBuffer.H nihh20161118Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161118Brian P. Walenz
+A src/bogart/bogart.C nihh20161118Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20161117Brian P. Walenz
+A src/bogart/bogart.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20161117Brian P. Walenz
+A src/bogart/bogart.C nihh20161117Brian P. Walenz
+A src/bogart/bogart.C nihh20161117Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161116Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161116Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161116Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20161116Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161110Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20161110Brian P. Walenz
+A src/pipelines/sanity/sanity.pl nihh20161108Brian P. Walenz
+A src/pipelines/sanity/sanity.sh nihh20161108Brian P. Walenz
+A src/pipelines/canu.pl nihh20161108Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161108Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20161107Brian P. Walenz
+A src/pipelines/canu.pl nihh20161107Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161107Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20161107Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161107Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20161107Brian P. Walenz
+A src/AS_global.C nihh20161105Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161105Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20161105Brian P. Walenz
+A src/bogart/bogart.C nihh20161105Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161104Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20161104Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.C nihh20161104Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20161104Brian P. Walenz
+A src/stores/ovStoreHistogram.H nihh20161104Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161031Brian P. Walenz
+A src/AS_UTL/AS_UTL_alloc.C nihh20161031Brian P. Walenz
+A src/AS_UTL/AS_UTL_alloc.H nihh20161031Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161031Brian P. Walenz
+A src/meryl/libmeryl.C nihh20161031Brian P. Walenz
+A src/stores/gkStore.C nihh20161031Brian P. Walenz
+A src/AS_UTL/AS_UTL_alloc.H nihh20161031Brian P. Walenz
+A addCopyrights.dat nihh20161028Brian P. Walenz
+A src/erateEstimate/erateEstimate.C nihh20161028Brian P. Walenz
+A src/main.mk nihh20161028Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20161028Brian P. Walenz
+A src/overlapInCore/overlapPair.C nihh20161028Brian P. Walenz
+A src/stores/ovStore.C nihh20161028Brian P. Walenz
+A src/stores/ovStore.H nihh20161028Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreFile.H nihh20161028Brian P. Walenz
+A src/stores/ovStoreFilter.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreFilter.H nihh20161028Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreHistogram.H nihh20161028Brian P. Walenz
+A src/stores/ovStoreIndexer.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20161028Brian P. Walenz
+A src/stores/ovStoreWriter.C nihh20161028Brian P. Walenz
+A src/stores/ovStore.C nihh20161026Brian P. Walenz
+A src/stores/ovStore.H nihh20161026Brian P. Walenz
+A src/main.mk nihh20161025Brian P. Walenz
+A src/stores/ovStore.C nihh20161025Brian P. Walenz
+A src/stores/ovStore.H nihh20161025Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161025Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161025Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20161025Brian P. Walenz
+A src/stores/ovStoreFile.H nihh20161025Brian P. Walenz
+A src/stores/ovStoreHistogram.C nihh20161025Brian P. Walenz
+A src/stores/ovStoreHistogram.H nihh20161025Brian P. Walenz
+A src/stores/ovStoreIndexer.C nihh20161025Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161025Brian P. Walenz
+A src/Makefile nihh20161025Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20161025Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20161025Brian P. Walenz
+A src/AS_UTL/AS_UTL_alloc.H nihh20161025Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161024Brian P. Walenz
+A addCopyrights.dat nihh20161024Brian P. Walenz
+A src/stores/ovOverlap.H nihh20161024Brian P. Walenz
+A src/stores/ovStore.H nihh20161024Brian P. Walenz
+A src/stores/ovStoreFile.H nihh20161024Brian P. Walenz
+A src/mhap/mhapConvert.C nihh20161024Brian P. Walenz
+A src/minimap/mmapConvert.C nihh20161024Brian P. Walenz
+A src/overlapInCore/overlapConvert.C nihh20161024Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20161024Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20161024Brian P. Walenz
+A src/overlapInCore/overlapPair.C nihh20161024Brian P. Walenz
+A src/stores/ovStore.C nihh20161024Brian P. Walenz
+A src/stores/ovStore.H nihh20161024Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161024Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161024Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20161024Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20161024Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161021Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161021Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20161021Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161021Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20161021Brian P. Walenz
+A documentation/source/faq.rst nihh20161018Brian P. Walenz
+A documentation/source/faq.rst nihh20161018Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161018Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20161018Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161018Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161018Brian P. Walenz
+A documentation/source/overlap_transformations.svg nihh20161014Brian P. Walenz
+A documentation/source/overlaps.svg nihh20161014Brian P. Walenz
+A documentation/source/repeat-spanned.svg nihh20161014Brian P. Walenz
+A documentation/source/repeat-unspanned.svg nihh20161014Brian P. Walenz
+A src/AS_RUN/fragmentDepth.C nihh20161017Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20161017Brian P. Walenz
+A src/AS_UTL/AS_UTL_stackTrace.C nihh20161017Brian P. Walenz
+A src/AS_UTL/bitPackedArray.C nihh20161017Brian P. Walenz
+A src/AS_UTL/bitPackedArray.H nihh20161017Brian P. Walenz
+A src/AS_UTL/bitPackedFile.C nihh20161017Brian P. Walenz
+A src/AS_UTL/bitPackedFile.H nihh20161017Brian P. Walenz
+A src/AS_UTL/kMer.C nihh20161017Brian P. Walenz
+A src/AS_UTL/kMerHuge.H nihh20161017Brian P. Walenz
+A src/AS_UTL/memoryMappedFile.H nihh20161017Brian P. Walenz
+A src/AS_UTL/readBuffer.C nihh20161017Brian P. Walenz
+A src/AS_UTL/stddev.H nihh20161017Brian P. Walenz
+A src/AS_UTL/sweatShop.C nihh20161017Brian P. Walenz
+A src/AS_global.C nihh20161017Brian P. Walenz
+A src/AS_global.H nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20161017Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161017Brian P. Walenz
+A src/bogart/addReadsToUnitigs.C nihh20161017Brian P. Walenz
+A src/bogart/analyzeBest.C nihh20161017Brian P. Walenz
+A src/bogart/buildGraph.C nihh20161017Brian P. Walenz
+A src/bogus/bogus.C nihh20161017Brian P. Walenz
+A src/correction/filterCorrectionOverlaps.C nihh20161017Brian P. Walenz
+A src/correction/generateCorrectionLayouts.C nihh20161017Brian P. Walenz
+A src/erateEstimate/erateEstimate.C nihh20161017Brian P. Walenz
+A src/falcon_sense/createFalconSenseInputs.C nihh20161017Brian P. Walenz
+A src/falcon_sense/outputFalcon.C nihh20161017Brian P. Walenz
+A src/fastq-utilities/fastqAnalyze.C nihh20161017Brian P. Walenz
+A src/fastq-utilities/fastqSample.C nihh20161017Brian P. Walenz
+A src/fastq-utilities/fastqSimulate-sort.C nihh20161017Brian P. Walenz
+A src/fastq-utilities/fastqSimulate.C nihh20161017Brian P. Walenz
+A src/merTrim/merTrim.C nihh20161017Brian P. Walenz
+A src/merTrim/merTrimResult.H nihh20161017Brian P. Walenz
+A src/mercy/mercy-regions.C nihh20161017Brian P. Walenz
+A src/mercy/mercy.C nihh20161017Brian P. Walenz
+A src/meryl/estimate-mer-threshold.C nihh20161017Brian P. Walenz
+A src/meryl/leaff-blocks.C nihh20161017Brian P. Walenz
+A src/meryl/leaff-duplicates.C nihh20161017Brian P. Walenz
+A src/meryl/leaff-gc.C nihh20161017Brian P. Walenz
+A src/meryl/leaff-partition.C nihh20161017Brian P. Walenz
+A src/meryl/leaff-statistics.C nihh20161017Brian P. Walenz
+A src/meryl/leaff.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/fastaFile.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/fastaStdin.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/fastqFile.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/fastqStdin.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/seqCache.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/seqStore.C nihh20161017Brian P. Walenz
+A src/meryl/libleaff/seqStream.C nihh20161017Brian P. Walenz
+A src/meryl/libmeryl.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-args.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-binaryOp.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-build.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-dump.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-estimate.C nihh20161017Brian P. Walenz
+A src/meryl/meryl-merge.C nihh20161017Brian P. Walenz
+A src/meryl/simple.C nihh20161017Brian P. Walenz
+A src/overlapBasedTrimming/splitReads-subReads.C nihh20161017Brian P. Walenz
+A src/overlapBasedTrimming/splitReads.C nihh20161017Brian P. Walenz
+A src/overlapBasedTrimming/trimReads-largestCovered.C nihh20161017Brian P. Walenz
+A src/overlapBasedTrimming/trimReads-quality.C nihh20161017Brian P. Walenz
+A src/overlapBasedTrimming/trimReads.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/analyzeAlignment.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps-Correct_Frags.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps-Read_Olaps.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps.H nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Read_Frags.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Read_Olaps.C nihh20161017Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapInCore-Build_Hash_Index.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapInCore-Find_Overlaps.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapInCore-Process_Overlaps.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20161017Brian P. Walenz
+A src/overlapInCore/overlapReadCache.C nihh20161017Brian P. Walenz
+A src/stores/gatekeeperCreate.C nihh20161017Brian P. Walenz
+A src/stores/gatekeeperDumpFASTQ.C nihh20161017Brian P. Walenz
+A src/stores/gatekeeperDumpMetaData.C nihh20161017Brian P. Walenz
+A src/stores/gkStore.C nihh20161017Brian P. Walenz
+A src/stores/gkStore.H nihh20161017Brian P. Walenz
+A src/stores/ovOverlap.C nihh20161017Brian P. Walenz
+A src/stores/ovStore.C nihh20161017Brian P. Walenz
+A src/stores/ovStore.H nihh20161017Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreDump.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreIndexer.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreSorter.C nihh20161017Brian P. Walenz
+A src/stores/ovStoreStats.C nihh20161017Brian P. Walenz
+A src/stores/tgStore.C nihh20161017Brian P. Walenz
+A src/stores/tgStoreCompress.C nihh20161017Brian P. Walenz
+A src/stores/tgStoreCoverageStat.C nihh20161017Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20161017Brian P. Walenz
+A src/stores/tgStoreFilter.C nihh20161017Brian P. Walenz
+A src/stores/tgTig.C nihh20161017Brian P. Walenz
+A src/stores/tgTigSizeAnalysis.C nihh20161017Brian P. Walenz
+A src/utgcns/libNDalign/NDalign.C nihh20161017Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20161017Brian P. Walenz
+A src/utgcns/stashContains.C nihh20161017Brian P. Walenz
+A src/utgcns/stashContains.H nihh20161017Brian P. Walenz
+A src/utgcns/utgcns.C nihh20161017Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161014Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20161014Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20161014Sergey Koren
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161014Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161014Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161014Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161014Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20161014Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20161013Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161013Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20161013Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161012Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161012Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20161011Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20161011Brian P. Walenz
+A src/pipelines/canu.pl nihh20161011Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20161011Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20161011Sergey Koren
+A src/pipelines/canu.pl nihh20161011Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20161011Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20161011Sergey Koren
+A src/pipelines/canu/Unitig.pm nihh20161011Sergey Koren
+A src/mhap/mhap-2.1.2.tar nihh20161011Sergey Koren
+A src/mhap/mhap.mk nihh20161011Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20161011Sergey Koren
+A src/overlapInCore/libedlib/edlib.C nihh20161007Sergey Koren
+A src/overlapInCore/libedlib/edlib.H nihh20161007Sergey Koren
+A src/overlapInCore/overlapPair.C nihh20161007Sergey Koren
+A src/meryl/libleaff/fastaFile.C nihh20161006Brian P. Walenz
+A src/meryl/libleaff/fastqFile.C nihh20161006Brian P. Walenz
+A src/meryl/libleaff/sffFile.C nihh20161006Brian P. Walenz
+A src/meryl/libleaff/seqFactory.C nihh20161006Brian P. Walenz
+A src/meryl/leaff.C nihh20161006Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161006Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161006Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20161006Brian P. Walenz
+A addCopyrights.pl nihh20161006Brian P. Walenz
+A kmer/libutil/qsort_mt.c nihh20161006Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161005Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161005Brian P. Walenz
+A documentation/source/faq.rst nihh20161005Sergey Koren
+A src/overlapInCore/overlapPair.C nihh20161004Sergey Koren
+A src/bogart/AS_BAT_Unitig.C nihh20161004Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20161004Brian P. Walenz
+A src/AS_UTL/stddev.H nihh20161004Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.C nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_CreateUnitigs.H nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.H nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.C nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_TigGraph.H nihh20161003Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20161003Brian P. Walenz
+A src/bogart/bogart.C nihh20161003Brian P. Walenz
+A src/bogart/bogart.mk nihh20161003Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20161003Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20161003Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20161003Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20161003Brian P. Walenz
+A src/stores/gatekeeperPartition.C nihh20161003Brian P. Walenz
+A src/utgcns/utgcns.C nihh20161003Brian P. Walenz
+A src/stores/gkStore.C nihh20161003Brian P. Walenz
+A src/stores/gkStore.C nihh20160930Brian P. Walenz
+A src/stores/gkStore.H nihh20160930Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20160930Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160930Brian P. Walenz
+A src/bogart/bogart.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.H nihh20160930Brian P. Walenz
+A src/bogart/bogart.C nihh20160930Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.H nihh20160929Brian P. Walenz
+A src/bogart/bogart.C nihh20160929Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20160929Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20160929Brian P. Walenz
+A src/pipelines/canu/Output.pm nihh20160929Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160929Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20160929Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20160929Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160929Brian P. Walenz
+A documentation/source/tutorial.rst nihh20160926Brian P. Walenz
+A documentation/source/canu-overlaps.svg nihh20160926Brian P. Walenz
+A documentation/source/canu-pipeline.svg nihh20160926Brian P. Walenz
+A documentation/source/pipeline.rst nihh20160926Brian P. Walenz
+A src/overlapInCore/libedlib/edlib.C nihh20160923Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160923Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160923Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160923Brian P. Walenz
+A addCopyrights.dat nihh20160923Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160921Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.H nihh20160921Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160921Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20160921Brian P. Walenz
+A src/pipelines/canu/Unitig.pm nihh20160921Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.H nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_AddRead.C nihh20160919Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C nihh20160919Brian P. Walenz
+A src/bogart/bogart.C nihh20160919Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20160917Brian P. Walenz
+A src/Makefile nihh20160917Brian P. Walenz
+A src/correction/generateCorrectionLayouts.C nihh20160917Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps-Prefix_Edit_Distance.C nihh20160917Brian P. Walenz
+A src/overlapInCore/overlapInCore-Process_Overlaps.C nihh20160917Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20160917Brian P. Walenz
+A documentation/source/index.rst nihh20160920Sergey Koren
+A documentation/source/index.rst nihh20160920Sergey Koren
+A src/pipelines/canu/Execution.pm nihh20160916Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160916Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160916Brian P. Walenz
+A src/stores/gkStore.C nihh20160915Brian P. Walenz
+A src/stores/gkStore.H nihh20160915Brian P. Walenz
+A src/overlapInCore/overlapInCore-Output.C nihh20160915Brian P. Walenz
+A src/overlapInCore/overlapInCore-Process_Overlaps.C nihh20160915Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20160915Brian P. Walenz
+A src/overlapInCore/overlapInCore.H nihh20160915Brian P. Walenz
+A src/meryl/meryl-args.C nihh20160914Brian P. Walenz
+A src/meryl/meryl-build.C nihh20160914Brian P. Walenz
+A src/meryl/meryl.H nihh20160914Brian P. Walenz
+A src/meryl/meryl.mk nihh20160914Brian P. Walenz
+A src/meryl/libleaff/gkStoreFile.C nihh20160913Brian P. Walenz
+A src/stores/gatekeeperCreate.C nihh20160913Brian P. Walenz
+A src/stores/gkStore.C nihh20160913Brian P. Walenz
+A src/stores/gkStore.H nihh20160913Brian P. Walenz
+A src/stores/gkStore.C nihh20160913Brian P. Walenz
+A src/AS_UTL/hexDump.C nihh20160915Brian P. Walenz
+A src/AS_UTL/hexDump.H nihh20160915Brian P. Walenz
+A src/main.mk nihh20160915Brian P. Walenz
+A src/AS_UTL/mt19937arTest.C nihh20160909Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160909Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20160909Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20160909Brian P. Walenz
+A src/overlapInCore/overlapInCorePartition.C nihh20160909Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20160909Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20160909Brian P. Walenz
+A src/AS_UTL/mt19937ar.C nihh20160909Brian P. Walenz
+A src/AS_UTL/mt19937ar.H nihh20160909Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20160909Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20160906Sergey Koren
+A src/pipelines/canu/OverlapInCore.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160902Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160901Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160901Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20160901Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160901Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20160901Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160831Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160831Brian P. Walenz
+A src/pipelines/canu/Gatekeeper.pm nihh20160831Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20160831Brian P. Walenz
+A src/pipelines/canu/Meryl.pm nihh20160831Brian P. Walenz
+A documentation/source/parameter-reference.rst nihh20160831Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160831Brian P. Walenz
+A src/overlapInCore/overlapConvert.C nihh20160831Brian P. Walenz
+A src/overlapInCore/overlapImport.C nihh20160831Brian P. Walenz
+A src/stores/ovStore.H nihh20160831Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20160831Brian P. Walenz
+A src/main.mk nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-internal.h nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-sinksource.cc nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-sinksource.h nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-stubs-internal.cc nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-stubs-internal.h nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy-stubs-public.h nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy.cc nihh20160830Brian P. Walenz
+A src/stores/libsnappy/snappy.h nihh20160830Brian P. Walenz
+A src/stores/ovStore.H nihh20160830Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20160830Brian P. Walenz
+A src/main.mk nihh20160830Brian P. Walenz
+A src/stores/ovStoreBucketizer.C nihh20160829Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.C nihh20160829Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20160829Brian P. Walenz
+A src/stores/ovStore.H nihh20160829Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20160829Brian P. Walenz
+A src/AS_UTL/AS_UTL_fileIO.H nihh20160829Brian P. Walenz
+A src/stores/ovStore.H nihh20160829Brian P. Walenz
+A src/stores/ovStoreFile.C nihh20160829Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160829Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160829Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160829Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160829Brian P. Walenz
+A src/AS_global.H nihh20160829Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20160829Brian P. Walenz
+A src/bogart/bogart.C nihh20160826Brian P. Walenz
+A src/overlapInCore/overlapPair.mk nihh20160830Sergey Koren
+A src/main.mk nihh20160830Sergey Koren
+A src/overlapInCore/libedlib/edlib.C nihh20160830Sergey Koren
+A src/overlapInCore/libedlib/edlib.H nihh20160830Sergey Koren
+A src/overlapInCore/overlapPair.C nihh20160830Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20160830Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160826Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160825Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160825Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160825Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160825Brian P. Walenz
+A README.md nihh20160824Sergey Koren
+A README.citation nihh20160824Sergey Koren
+A src/bogart/AS_BAT_PopBubbles.C nihh20160822Brian P. Walenz
+A src/bogart/bogart.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160821Brian P. Walenz
+A src/bogart/bogart.C nihh20160821Brian P. Walenz
+A src/bogart/bogart.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160821Brian P. Walenz
+A src/bogart/bogart.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20160821Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160819Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160819Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160819Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160819Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.H nihh20160819Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160818Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20160817Brian P. Walenz
+A src/pipelines/canu/Grid.pm nihh20160817Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20160816Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.H nihh20160816Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160816Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20160816Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_ReadInfo.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_AddRead.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C nihh20160812Brian P. Walenz
+A src/bogart/bogart.C nihh20160812Brian P. Walenz
+A src/bogart/bogart.mk nihh20160812Brian P. Walenz
+A addCopyrights.dat nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceReadUsingOverlaps.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.txt nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_AddRead.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C nihh20160812Brian P. Walenz
+A src/bogart/bogart.mk nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Logging.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_AddFrag.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C nihh20160812Brian P. Walenz
+A src/bogart/addReadsToUnitigs.C nihh20160812Brian P. Walenz
+A src/bogart/analyzeBest.C nihh20160812Brian P. Walenz
+A src/bogart/bogart.C nihh20160812Brian P. Walenz
+A src/bogart/buildGraph.C nihh20160812Brian P. Walenz
+A src/pipelines/canu/Consensus.pm nihh20160812Brian P. Walenz
+A src/stores/gkStore.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160812Brian P. Walenz
+A src/bogart/bogart.C nihh20160812Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160810Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Logging.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.H nihh20160809Brian P. Walenz
+A src/bogart/addReadsToUnitigs.C nihh20160809Brian P. Walenz
+A src/bogart/bogart.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160809Brian P. Walenz
+A src/bogart/bogart.C nihh20160809Brian P. Walenz
+A src/bogart/bogart.mk nihh20160809Brian P. Walenz
+A src/bogart/bogart.C nihh20160809Brian P. Walenz
+A src/bogart/bogart.mk nihh20160809Brian P. Walenz
+A addCopyrights.dat nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MergeUnitigs.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MergeUnitigs.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Outputs.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PopulateUnitig.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_PromoteToSingleton.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_ReconstructRepeats.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SetParentAndHang.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_TigVector.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160809Brian P. Walenz
+A src/bogart/bogart.C nihh20160809Brian P. Walenz
+A src/bogart/bogart.mk nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160809Brian P. Walenz
+A src/mhap/mhapConvert.C nihh20160809Brian P. Walenz
+A src/bogart/bogart.C nihh20160809Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160808Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160808Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160805Brian P. Walenz
+A src/bogart/findOverlappingReads.pl nihh20160805Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20160805Brian P. Walenz
+A src/stores/tgTig.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160805Brian P. Walenz
+A src/bogart/bogart.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160805Brian P. Walenz
+A src/bogart/bogart.C nihh20160805Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_MergeUnitigs.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160801Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_AssemblyGraph.H nihh20160721Brian P. Walenz
+A src/bogart/bogart.C nihh20160721Brian P. Walenz
+A src/bogart/bogart.mk nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_PlaceFragUsingOverlaps.H nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160721Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.H nihh20160721Brian P. Walenz
+A src/pipelines/canu/Grid_PBSTorque.pm nihh20160803Brian P. Walenz
+A src/meryl/libmeryl.C nihh20160724Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160719Brian P. Walenz
+A src/main.mk nihh20160719Brian P. Walenz
+A src/utgcns/libNDFalcon/dw.C nihh20160719Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20160719Brian P. Walenz
+A src/stores/gkStore.H nihh20160707Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20160706Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20160706Brian P. Walenz
+A src/utgcns/libcns/unitigConsensus.C nihh20160706Brian P. Walenz
+A README.md nihh20160718Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160718Brian P. Walenz
+A src/Makefile nihh20160718Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160718Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160711Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160709Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160709Sergey Koren
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160706Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160706Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20160706Brian P. Walenz
+A src/bogart/bogart.C nihh20160706Brian P. Walenz
+A src/pipelines/canu.pl nihh20160706Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160706Brian P. Walenz
+A src/pipelines/canu/Grid_Slurm.pm nihh20160629Brian P. Walenz
+A src/bogart/buildGraph.C nihh20160629Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160629Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160628Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160628Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20160627Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20160627Brian P. Walenz
+A src/overlapErrorAdjustment/correctOverlaps.C nihh20160627Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Process_Olap.C nihh20160627Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20160627Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.H nihh20160627Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160627Brian P. Walenz
+A README.md nihh20160624Sergey Koren
+A src/pipelines/canu/Execution.pm nihh20160621Brian P. Walenz
+A src/pipelines/canu/Grid.pm nihh20160621Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160620Sergey Koren
+A src/pipelines/canu/Grid.pm nihh20160620Sergey Koren
+A src/pipelines/canu/Consensus.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/CorrectReads.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/Grid_Slurm.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/OverlapErrorAdjustment.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/OverlapInCore.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/OverlapMMap.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/OverlapMhap.pm nihh20160618Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20160618Brian P. Walenz
+A src/AS_UTL/intervalList.H nihh20160615Brian P. Walenz
+A src/AS_UTL/intervalListTest.C nihh20160615Brian P. Walenz
+A src/pipelines/canu/Execution.pm nihh20160615Brian P. Walenz
+A src/pipelines/canu/OverlapStore.pm nihh20160615Brian P. Walenz
+A src/pipelines/canu/HTML.pm nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_ChunkGraph.H nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_Logging.H nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_PlaceContains.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_SplitDiscontinuous.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_UnitigVector.C nihh20160614Brian P. Walenz
+A src/bogart/bogart.C nihh20160614Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.C nihh20160613Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.C nihh20160613Brian P. Walenz
+A src/bogart/AS_BAT_FragmentInfo.H nihh20160613Brian P. Walenz
+A src/bogart/AS_BAT_Instrumentation.C nihh20160613Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160613Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160613Brian P. Walenz
+A src/stores/ovStore.C nihh20160610Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160603Brian P. Walenz
+A src/bogart/AS_BAT_BestOverlapGraph.H nihh20160603Brian P. Walenz
+A src/stores/tgStoreDump.C nihh20160603Brian P. Walenz
+A src/stores/tgTig.C nihh20160603Brian P. Walenz
+A src/stores/tgTig.H nihh20160603Brian P. Walenz
+A src/falcon_sense/falcon_sense.C nihh20160610Sergey Koren
+A src/utgcns/utgcns.C nihh20160610Sergey Koren
+A src/bogart/bogart.C nihh20160608Brian P. Walenz
+A src/stores/gatekeeperDumpFASTQ.C nihh20160608Brian P. Walenz
+A src/stores/gkStore.C nihh20160608Brian P. Walenz
+A src/stores/gkStore.H nihh20160608Brian P. Walenz
+A src/overlapInCore/overlapInCore-Find_Overlaps.C nihh20160608Sergey Koren
+A src/overlapInCore/overlapInCore-Process_Overlaps.C nihh20160608Sergey Koren
+A src/overlapInCore/overlapInCore-Process_String_Overlaps.C nihh20160608Sergey Koren
+A src/overlapInCore/overlapInCore.C nihh20160608Sergey Koren
+A src/overlapInCore/overlapInCore.H nihh20160608Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160608Sergey Koren
+A src/pipelines/canu/OverlapInCore.pm nihh20160608Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160608Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20160608Sergey Koren
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160608Brian P. Walenz
+A src/stores/ovStoreBuild.C nihh20160607Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160606Brian P. Walenz
+A src/bogart/AS_BAT_Logging.C nihh20160606Brian P. Walenz
+A src/bogart/AS_BAT_Logging.H nihh20160606Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160606Brian P. Walenz
+A src/bogart/bogart.C nihh20160606Brian P. Walenz
+A src/bogart/plotErrorProfile.pl nihh20160606Brian P. Walenz
+A src/AS_UTL/stddev.H nihh20160606Brian P. Walenz
+A README.md nihh20160603Sergey Koren
+A src/bogart/AS_BAT_Unitig.C nihh20160531Brian P. Walenz
+A src/bogart/AS_BAT_OverlapCache.C nihh20160531Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160531Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.C nihh20160531Brian P. Walenz
+A src/bogart/AS_BAT_Unitig.H nihh20160531Brian P. Walenz
+A src/AS_UTL/stddev.H nihh20160531Brian P. Walenz
+A src/bogart/AS_BAT_MarkRepeatReads.C nihh20160531Brian P. Walenz
+A src/pipelines/canu/Configure.pm nihh20160531Sergey Koren
+A documentation/source/faq.rst nihh20160531Sergey Koren
+A documentation/source/quick-start.rst nihh20160531Sergey Koren
+A documentation/source/quick-start.rst nihh20160531Sergey Koren
+A documentation/source/faq.rst nihh20160527Sergey Koren
+A documentation/source/faq.rst nihh20160527Sergey Koren
+A documentation/source/faq.rst nihh20160527Sergey Koren
+A src/pipelines/canu/HTML.pm nihh20160526Brian P. Walenz
+A src/pipelines/canu/Defaults.pm nihh20160526Brian P. Walenz
+A documentation/source/quick-start.rst nihh20160526Sergey Koren
+A src/stores/gatekeeperCreate.C nihh20160526Brian P. Walenz
+A src/stores/gkStore.C nihh20160526Brian P. Walenz
+A src/stores/gkStoreEncode.C nihh20160526Brian P. Walenz
+A src/overlapInCore/overlapInCore.C nihh20160524Sergey Koren
+A src/pipelines/canu.pl nihh20160524Sergey Koren
+A src/pipelines/canu/ErrorEstimate.pm nihh20160524Sergey Koren
+A src/pipelines/canu/CorrectReads.pm nihh20160524Sergey Koren
+A src/mhap/mhap-2.1.tar nihh20160523Sergey Koren
+A src/pipelines/canu/ErrorEstimate.pm nihh20160523Sergey Koren
+A src/bogart/AS_BAT_BestOverlapGraph.C nihh20160523Brian P. Walenz
+A src/mhap/mhap-2.1.tar nihh20160523Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160523Sergey Koren
+A src/pipelines/canu/ErrorEstimate.pm nihh20160523Sergey Koren
+A src/pipelines/canu/Meryl.pm nihh20160523Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20160523Sergey Koren
+A src/Makefile nihh20160523Brian P. Walenz
+A src/meryl/meryl-build.C nihh20160523Brian P. Walenz
+A src/meryl/libleaff/gkStoreFile.C nihh20160523Brian P. Walenz
+A src/stores/gkStore.H nihh20160523Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Analyze_Alignment.C nihh20160520Brian P. Walenz
+A src/main.mk nihh20160520Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Dump.C nihh20160520Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors-Dump.mk nihh20160520Brian P. Walenz
+A src/overlapErrorAdjustment/findErrors.C nihh20160520Brian P. Walenz
+A src/bogart/AS_BAT_PopBubbles.C nihh20160519Brian P. Walenz
+A src/meryl/libmeryl.C nihh20160519Brian P. Walenz
+A src/meryl/libmeryl.H nihh20160519Brian P. Walenz
+A src/meryl/meryl-dump.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/fastaFile.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/fastaStdin.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/fastqFile.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/fastqStdin.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/seqCache.C nihh20160519Brian P. Walenz
+A src/meryl/libleaff/seqFile.H nihh20160519Brian P. Walenz
+A src/correction/errorEstimate.C nihh20160518Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160518Sergey Koren
+A src/pipelines/canu/ErrorEstimate.pm nihh20160518Sergey Koren
+A src/pipelines/canu.pl nihh20160518Sergey Koren
+A src/pipelines/canu/CorrectReads.pm nihh20160518Sergey Koren
+A src/mhap/mhap.mk nihh20160518Sergey Koren
+A src/pipelines/canu/Meryl.pm nihh20160518Sergey Koren
+A src/pipelines/canu/OverlapMhap.pm nihh20160518Sergey Koren
+A src/pipelines/canu/Defaults.pm nihh20160518Sergey Koren
+A src/correction/errorEstimate.C nihh20160518Sergey Koren
+A src/pipelines/canu/ErrorEstimate.pm nihh20160518Sergey Koren
+A addCopyrights-BuildData.pl nihh20160518Brian P. Walenz
+A addCopyrights.dat nihh20160518Brian P. Walenz
+A addCopyrights.pl nihh20160518Brian P. Walenz
+D src/bogart/AS_BAT_MergeOrphans.C src/bogart/AS_BAT_PopBubbles.C
+D src/bogart/AS_BAT_MergeOrphans.H src/bogart/AS_BAT_PopBubbles.H
diff --git a/addCopyrights.pl b/addCopyrights.pl
index bf6778b..8f5607c 100644
--- a/addCopyrights.pl
+++ b/addCopyrights.pl
@@ -9,7 +9,6 @@ my @dateStrings = ( "???", "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG
#
my $doForReal = 1;
-
#
# The change data 'addCopyrights.dat' contains lines of two types:
#
@@ -215,20 +214,21 @@ my %derived;
# Process each file.
-
-open(FIN, "find kmer src -type f -print |") or die "Failed to launch 'find'\n";
#open(OUT, "> addCopyrights.dat.new") or die "Failed to open 'addCopyrights.dat.new' for writing: $!\n";
-while (<FIN>) {
- chomp;
-
- my $file = $_;
-
- $file = $1 if ($_ =~ m/^\.\/(.*)$/); # Remove leading ./ added by find.
-
- my @lines;
+my @filesToProcess = @ARGV;
+if (scalar(@filesToProcess) == 0) {
+ open(FIN, "find kmer src -type f -print |") or die "Failed to launch 'find'\n";
+ while (<FIN>) {
+ chomp;
+ s/^\.\/(.*)$//; # Remove leading ./ added by find.
+ push @filesToProcess, $_;
+ }
+ close(FIN);
+}
+foreach my $file (@filesToProcess) {
next if ($file =~ m/\.mk$/);
next if ($file =~ m/Makefile/);
@@ -258,6 +258,8 @@ while (<FIN>) {
next if ($file =~ m/libboost/);
+ next if ($file =~ m/qsort_mt.c$/);
+
my $cb = "/";
my $cc = "*";
my $ce = "/";
@@ -340,11 +342,10 @@ while (<FIN>) {
push @DElist, " $cc\n";
}
- if ($file =~ m/\.pl$/) {
- push @lines, "#!/usr/bin/env perl\n";
- }
-
+ my @lines;
+
+ push @lines, "#!/usr/bin/env perl\n" if ($file =~ m/\.pl$/);
push @lines, "\n";
push @lines, "$cb" . $cc x 78 . "\n";
push @lines, " $cc\n";
@@ -411,11 +412,16 @@ while (<FIN>) {
close(F);
if ($doForReal) {
+ my $perms = `stat -f %p $file`; chomp $perms; $perms = substr($perms, -3);
+
rename "$file", "$file.ORIG";
open(F, "> $file") or die "Failed to open '$file' for writing: $!\n";
print F @lines;
close(F);
+
+ system("chmod $perms $file");
+
} else {
open(F, "> $file.MODIFIED") or die "Failed to open '$file.MODIFIED' for writing: $!\n";
print F @lines;
diff --git a/documentation/source/canu-overlaps.svg b/documentation/source/canu-overlaps.svg
new file mode 100644
index 0000000..35415da
--- /dev/null
+++ b/documentation/source/canu-overlaps.svg
@@ -0,0 +1,2836 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="8.5in"
+ height="11in"
+ viewBox="0 0 765.00003 990.00004"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.91 r13725"
+ sodipodi:docname="pipeline-1-simple.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker25050"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path25052" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker23256"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path23258"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker22412"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path22414"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker24623"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path24625"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mstart"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mstart"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path23441"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1;fill:#000000;fill-opacity:1"
+ transform="scale(0.4) translate(10,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path44028"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker30557"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path30559"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker29983"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path29985"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker29415"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path29417"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28853"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path28855"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path28299"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <linearGradient
+ id="linearGradient21530"
+ osb:paint="solid">
+ <stop
+ style="stop-color:#000000;stop-opacity:1;"
+ offset="0"
+ id="stop21532" />
+ </linearGradient>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18520"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18522"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17618"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17620"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17320"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17322"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path14039"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker13817"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path13819"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker13603"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path13605"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12968"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12970"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12796"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12798"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12630"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12632"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path4155"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4167"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-99"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-59"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-5-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-2-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-6-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-9-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-99-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-2-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-3-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-1-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-6-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-4-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker13146-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path13148-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12630-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12632-6" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12796-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12798-7" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12968-6"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12970-6" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-27"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-53"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-26"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-89"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-27"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-65"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-17"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-05"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-0-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-3-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-4-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-4"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-5" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-8-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-1-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0-4-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0-1-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-6-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-7-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18980-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18982-7" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-1"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17320-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17322-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17618-1"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17620-1" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18520-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18522-5" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-45"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0-9"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9-9" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-4-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-45-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-8-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-3-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-9-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-0-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-5-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-2-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-2-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-7-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-3-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-7-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-9-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-0-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0-9-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9-9-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-8"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-5" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-4"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5-04"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6-6" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2-2" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2-3-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2-2-4" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2-6"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2-3-0-4"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2-2-4-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2-3-7"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2-2-9" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-8"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path44028-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-34"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path4155-23"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="1.4142137"
+ inkscape:cx="291.86354"
+ inkscape:cy="673.91662"
+ inkscape:document-units="mm"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ inkscape:window-width="2899"
+ inkscape:window-height="1050"
+ inkscape:window-x="0"
+ inkscape:window-y="0"
+ inkscape:window-maximized="0"
+ inkscape:snap-intersection-paths="false"
+ inkscape:snap-smooth-nodes="true"
+ inkscape:snap-midpoints="false"
+ inkscape:object-nodes="true"
+ inkscape:snap-bbox="true"
+ inkscape:snap-bbox-midpoints="true"
+ inkscape:bbox-nodes="true"
+ units="in"
+ inkscape:snap-nodes="true"
+ inkscape:snap-others="true"
+ inkscape:snap-global="true"
+ inkscape:snap-object-midpoints="true">
+ <inkscape:grid
+ type="xygrid"
+ id="grid3336"
+ units="mm"
+ spacingx="7.0866143"
+ spacingy="7.0866143" />
+ </sodipodi:namedview>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(0,-62.362205)">
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#c1c1c1;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021-0-6-6"
+ width="170.07872"
+ height="226.77167"
+ x="276.37796"
+ y="492.51971" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#c1c1c1;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021-0-6"
+ width="170.07874"
+ height="141.7323"
+ x="475.68896"
+ y="337.50003" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#c1c1c1;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021-0"
+ width="170.07874"
+ height="297.63782"
+ x="63.779526"
+ y="336.6142" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend);color-rendering:a [...]
+ d="m 354.3307,294.09452 0,35.43307"
+ id="path4140"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ id="g5980"
+ transform="translate(-276.378,28.346456)">
+ <rect
+ y="329.52759"
+ x="361.4173"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="322.44098"
+ x="354.33075"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="315.35437"
+ x="347.24408"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text26240"
+ y="333.77698"
+ x="418.3107"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="333.77698"
+ x="418.3107"
+ id="tspan8004"
+ sodipodi:role="line">compute minhash</tspan><tspan
+ y="347.83948"
+ x="418.3107"
+ sodipodi:role="line"
+ id="tspan6041">tables</tspan></text>
+ </g>
+ <g
+ id="g5776"
+ transform="translate(-63.779549,177.16537)">
+ <rect
+ y="159.44884"
+ x="347.24411"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4731"
+ y="177.87146"
+ x="418.0773"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="177.87146"
+ x="418.0773"
+ id="tspan5304"
+ sodipodi:role="line">count k-mers</tspan></text>
+ </g>
+ <g
+ id="g9433"
+ transform="translate(-254.59602,347.60787)">
+ <ellipse
+ ry="20.896082"
+ rx="43.041752"
+ cy="95.305557"
+ cx="609.44879"
+ id="path43977"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4613-0"
+ y="92.468338"
+ x="609.42133"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ id="tspan4806"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;writing-mode:lr-tb;text-anchor:middle"
+ y="92.468338"
+ x="609.42133"
+ sodipodi:role="line">frequent</tspan><tspan
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;writing-mode:lr-tb;text-anchor:middle"
+ y="106.53084"
+ x="609.42133"
+ sodipodi:role="line"
+ id="tspan5822">k-mers</tspan></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2);color-renderin [...]
+ d="m 141.73228,386.22051 0,35.43307"
+ id="path4140-0-3-5-7-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ id="g8680"
+ transform="translate(70.866157,-191.33861)">
+ <rect
+ y="705.11816"
+ x="226.77165"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-9-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="698.03156"
+ x="219.68504"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-8-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="690.94495"
+ x="212.5984"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text26240-4"
+ y="709.36755"
+ x="283.23111"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="709.36755"
+ x="283.23111"
+ id="tspan8086"
+ sodipodi:role="line">place overlaps</tspan><tspan
+ y="723.43005"
+ x="283.23111"
+ sodipodi:role="line"
+ id="tspan8665">in buckets</tspan></text>
+ </g>
+ <g
+ id="g8688"
+ transform="translate(63.779457,-127.56004)">
+ <rect
+ y="797.24414"
+ x="233.85832"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-9-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="790.15753"
+ x="226.77165"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-8-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="783.0719"
+ x="219.6851"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text26240-1"
+ y="808.52478"
+ x="290.52371"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="808.52478"
+ x="290.52371"
+ id="tspan8090"
+ sodipodi:role="line">sort buckets</tspan></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1);color-rende [...]
+ d="m 271.59192,201.96853 35.43308,0"
+ id="path4140-0-3-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ id="g15108"
+ transform="translate(127.55906,-70.866141)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="188.15906"
+ cx="70.866142"
+ id="path43977-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4613"
+ y="192.71684"
+ x="70.591484"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="192.71684"
+ x="70.591484"
+ id="tspan5271"
+ sodipodi:role="line">Raw Reads</tspan></text>
+ <path
+ sodipodi:nodetypes="cc"
+ inkscape:connector-curvature="0"
+ id="path4140-0-3-5-6"
+ d="m 70.86614,209.05515 0,35.43307"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1-9);color-r [...]
+ <g
+ id="g6182">
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:au [...]
+ id="rect4134-1"
+ width="141.73227"
+ height="42.519691"
+ x="-3.5351563e-06"
+ y="251.57483" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="70.849655"
+ y="277.02872"
+ id="text12562"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="70.849655"
+ y="277.02872"
+ id="tspan12568">Create read database</tspan></text>
+ </g>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026);color-rendering:a [...]
+ d="m 311.81102,258.66145 -170.07874,0 0,77.95276"
+ id="path44018"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-0-9);color-renderi [...]
+ d="m 212.59842,605.90555 35.43307,0 0,-85.03937 28.34646,0"
+ id="path40998-3-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-8-0);color-rende [...]
+ d="m 425.19684,676.7717 56.69292,0"
+ id="path4140-0-9-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ transform="translate(595.27557,99.212606)"
+ id="g9143-0">
+ <rect
+ y="95.669312"
+ x="-283.46457"
+ height="85.039368"
+ width="85.039368"
+ id="rect9099-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -198.42519,180.70869 a 42.519684,10.629927 0 0 1 -21.32511,9.21519 42.519684,10.629927 0 0 1 -42.58475,-0.0283 42.519684,10.629927 0 0 1 -21.1289,-9.24339"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629927"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70869"
+ sodipodi:cx="-240.94487"
+ sodipodi:type="arc"
+ id="path4460-2-5-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-6"
+ d="m -198.42519,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-7"
+ d="m -283.46456,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629922"
+ rx="42.519684"
+ cy="95.669312"
+ cx="-240.94487"
+ id="path4460-6-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-0"
+ y="148.37924"
+ x="-240.95587"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="148.37924"
+ x="-240.95587"
+ id="tspan4489-1-0"
+ sodipodi:role="line">gkpStore</tspan></text>
+ </g>
+ <g
+ transform="translate(651.96848,524.40946)"
+ id="g9134-3">
+ <rect
+ y="95.669312"
+ x="-162.99213"
+ height="85.039375"
+ width="85.039368"
+ id="rect9099-9-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -77.952744,180.70871 a 42.519684,10.629928 0 0 1 -21.325112,9.21519 42.519684,10.629928 0 0 1 -42.584754,-0.0283 42.519684,10.629928 0 0 1 -21.1289,-9.2434"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629928"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70871"
+ sodipodi:cx="-120.47243"
+ sodipodi:type="arc"
+ id="path4460-2-5-1-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-1-40"
+ d="m -77.952755,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-8-68"
+ d="m -162.99212,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629923"
+ rx="42.519684"
+ cy="95.669319"
+ cx="-120.47243"
+ id="path4460-6-6-5"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-5-5"
+ y="149.46964"
+ x="-120.48343"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="149.46964"
+ x="-120.48343"
+ id="tspan4489-1-9-8"
+ sodipodi:role="line">ovlStore</tspan></text>
+ </g>
+ <g
+ transform="translate(135.5315,29.232277)"
+ id="g5980-0">
+ <rect
+ y="329.52759"
+ x="361.4173"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-9-43"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="322.44098"
+ x="354.33075"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-8-7"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="315.35437"
+ x="347.24408"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-7"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text26240-40"
+ y="333.77698"
+ x="418.3107"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="333.77698"
+ x="418.3107"
+ id="tspan8004-2"
+ sodipodi:role="line">compute overlaps</tspan><tspan
+ y="347.83948"
+ x="418.3107"
+ sodipodi:role="line"
+ id="tspan6279">with overlapInCore</tspan></text>
+ </g>
+ <g
+ transform="translate(-276.37794,184.25197)"
+ id="g5980-3">
+ <rect
+ y="329.52759"
+ x="361.4173"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-9-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="322.44098"
+ x="354.33075"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-8-10"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <rect
+ y="315.35437"
+ x="347.24408"
+ height="42.519688"
+ width="141.73227"
+ id="rect4134-2-7-5"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text26240-7"
+ y="333.77698"
+ x="418.3107"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="333.77698"
+ x="418.3107"
+ id="tspan8004-1"
+ sodipodi:role="line">compute overlaps</tspan><tspan
+ y="347.83948"
+ x="418.3107"
+ sodipodi:role="line"
+ id="tspan6267">with MHAP</tspan></text>
+ </g>
+ <g
+ id="g6281"
+ transform="translate(14.173235,21.259833)">
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="148.29683"
+ id="path43977-9-3-5"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="134.1236"
+ id="path43977-9-3-6"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="428.7402"
+ cx="120.9945"
+ id="path43977-9-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text5127-9"
+ y="432.93423"
+ x="120.23898"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="432.93423"
+ x="120.23898"
+ id="tspan5290-5"
+ sodipodi:role="line">minHashes</tspan></text>
+ </g>
+ <flowRoot
+ xml:space="preserve"
+ id="flowRoot6269"
+ style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Bitstream Vera Sans;font-style:normal;font-weight:normal;font-size:11.25000000000000000px;line-height:125%;letter-spacing:0px;word-spacing:0px"><flowRegion
+ id="flowRegion6271"><rect
+ id="rect6273"
+ width="99.212593"
+ height="21.259842"
+ x="375.59055"
+ y="288.4252" /></flowRegion><flowPara
+ id="flowPara6275"></flowPara></flowRoot> <g
+ id="g6281-6"
+ transform="translate(433.16929,22.14566)">
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="148.29683"
+ id="path43977-9-3-5-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="134.1236"
+ id="path43977-9-3-6-2"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="428.7402"
+ cx="120.9945"
+ id="path43977-9-3-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text5127-9-5"
+ y="432.93423"
+ x="120.23898"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="432.93423"
+ x="120.23898"
+ id="tspan5290-5-5"
+ sodipodi:role="line">overlaps</tspan></text>
+ </g>
+ <g
+ id="g6281-1"
+ transform="translate(21.259849,177.16535)">
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="148.29683"
+ id="path43977-9-3-5-5"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="134.1236"
+ id="path43977-9-3-6-26"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="428.7402"
+ cx="120.9945"
+ id="path43977-9-3-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text5127-9-8"
+ y="432.93423"
+ x="120.23898"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="432.93423"
+ x="120.23898"
+ id="tspan5290-5-55"
+ sodipodi:role="line">overlaps</tspan></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2-3);color-render [...]
+ d="m 141.73228,471.25988 0,21.25984"
+ id="path4140-0-3-5-7-7-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2-3-0);color-rend [...]
+ d="m 141.73228,542.12602 0,35.43308"
+ id="path4140-0-3-5-7-7-4-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2-6);color-render [...]
+ d="m 553.64172,387.10634 0,35.43307"
+ id="path4140-0-3-5-7-7-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ id="g6281-1-9"
+ transform="translate(233.85827,177.16535)">
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="148.29683"
+ id="path43977-9-3-5-5-6"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896086"
+ rx="43.041752"
+ cy="428.7402"
+ cx="134.1236"
+ id="path43977-9-3-6-26-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="428.7402"
+ cx="120.9945"
+ id="path43977-9-3-3-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text5127-9-8-5"
+ y="425.90298"
+ x="120.50539"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="425.90298"
+ x="120.50539"
+ id="tspan5290-5-55-2"
+ sodipodi:role="line">unsorted</tspan><tspan
+ y="439.96548"
+ x="120.50539"
+ sodipodi:role="line"
+ id="tspan25908">buckets</tspan></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2-3-0-4);color-re [...]
+ d="m 354.3307,542.12602 0,35.43309"
+ id="path4140-0-3-5-7-7-4-4-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2-3-7);color-rend [...]
+ d="m 354.3307,627.1654 0,21.25984"
+ id="path4140-0-3-5-7-7-4-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026-8);color-rendering [...]
+ d="m 396.85039,258.66145 155.90551,0.88582 0,77.95277"
+ id="path44018-42"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-34);color-renderin [...]
+ d="m 354.3307,379.13389 0,35.43307"
+ id="path4140-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165359;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 311.81102,442.9134 -49.6063,0 0,-77.95275 -42.51969,0"
+ id="path22402"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165359;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 396.85039,442.9134 49.60629,0 0,-77.95275 28.34646,0"
+ id="path22404"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165359;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 552.75589,471.25986 0,49.60629 -120.47243,0"
+ id="path25042"
+ inkscape:connector-curvature="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="403.3053"
+ y="251.41553"
+ id="text25900"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan25902"
+ x="403.3053"
+ y="251.41553">CORRECTED READS</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="176.67874"
+ y="251.41553"
+ id="text25904"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan25906"
+ x="176.67874"
+ y="251.41553">UNCORRECTED READS</tspan></text>
+ </g>
+</svg>
diff --git a/documentation/source/canu-pipeline.svg b/documentation/source/canu-pipeline.svg
new file mode 100644
index 0000000..94a27e0
--- /dev/null
+++ b/documentation/source/canu-pipeline.svg
@@ -0,0 +1,3528 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="7.0866141in"
+ height="9.999999in"
+ viewBox="0 0 637.7953 899.99995"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.91 r13725"
+ sodipodi:docname="figure1.svg"
+ inkscape:export-filename="/Users/walenzbp/Downloads/figure1.png"
+ inkscape:export-xdpi="240"
+ inkscape:export-ydpi="240">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7170"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7172" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker6234"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path6236"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker27362"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path27364" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker25419"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path25421"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker24623"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path24625"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mstart"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path23441"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(0.4,0,0,0.4,4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18341"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18343" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker16499"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path16501"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path44028"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker30557"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path30559"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker29983"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path29985"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker29415"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path29417"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28853"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path28855"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path28299"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <linearGradient
+ id="linearGradient21530"
+ osb:paint="solid">
+ <stop
+ style="stop-color:#000000;stop-opacity:1;"
+ offset="0"
+ id="stop21532" />
+ </linearGradient>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18520"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18522"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17618"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17620"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17320"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17322"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path14039"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker13817"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path13819"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker13603"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path13605"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12968"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12970"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12796"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12798"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12630"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12632"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend"
+ style="overflow:visible"
+ inkscape:isstock="true"
+ inkscape:collect="always">
+ <path
+ id="path4155"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4167"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-99"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-59"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-5-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-2-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-6-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-9-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-99-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-2-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-3-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-1-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-6-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-4-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-1-9-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-77-3-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker13146-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path13148-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12630-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12632-6" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12796-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12798-7" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker12968-6"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path12970-6" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-27"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-53"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-26"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-89"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-27"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-65"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker14037-8-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path14039-8-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-17"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-05"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-0-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-3-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-7-0-7-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-8-2-18-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-1-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-4-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-4"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-5" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-2"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-2" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-8-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-1-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-0-4-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-0-1-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-6-6-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-02-7-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18980-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18982-7" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18066-1"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18068-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17320-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17322-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker17618-1"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path17620-1" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker18520-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path18522-5" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-6-1-6-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-7-1-2-5"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-1"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-6"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-9-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-0-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-45"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-3"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-5"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-2"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-9"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-2"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0-9"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9-9" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker44026-4-8"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path44028-45-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-8-0"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-3-4"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-9-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-0-0"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-5-5-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-7-2-2"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-2-2-6"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-8-7-9"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-1-3-4"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-77-7-1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Mend-4-0-51-9-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4155-6-6-87-0-7"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-2-8"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend"
+ inkscape:collect="always">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-3-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker41018-0-9-3"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path41020-9-9-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-8"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-5" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-4"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-3" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5-0"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6-8" />
+ </marker>
+ <marker
+ inkscape:isstock="true"
+ style="overflow:visible"
+ id="marker7927-5-04"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Mend">
+ <path
+ inkscape:connector-curvature="0"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ id="path7929-6-6" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="marker28297-7-3"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ inkscape:connector-curvature="0"
+ id="path28299-68-8"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="1.28"
+ inkscape:cx="260.57204"
+ inkscape:cy="499.03431"
+ inkscape:document-units="mm"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ inkscape:window-width="2899"
+ inkscape:window-height="1050"
+ inkscape:window-x="0"
+ inkscape:window-y="0"
+ inkscape:window-maximized="0"
+ inkscape:snap-intersection-paths="true"
+ inkscape:snap-smooth-nodes="true"
+ inkscape:snap-midpoints="false"
+ inkscape:object-nodes="true"
+ inkscape:snap-bbox="true"
+ inkscape:snap-bbox-midpoints="true"
+ inkscape:bbox-nodes="true"
+ units="in"
+ inkscape:snap-nodes="false"
+ inkscape:snap-others="true"
+ inkscape:snap-global="true"
+ inkscape:snap-object-midpoints="true"
+ fit-margin-top="0"
+ fit-margin-left="0"
+ fit-margin-right="0"
+ fit-margin-bottom="0"
+ inkscape:snap-bbox-edge-midpoints="true">
+ <inkscape:grid
+ type="xygrid"
+ id="grid3336"
+ units="mm"
+ spacingx="7.0866143"
+ spacingy="7.0866143"
+ originx="-7.0866143"
+ originy="-7.0866594" />
+ </sodipodi:namedview>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-7.0866141,-145.27563)">
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#d37279;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021-7"
+ width="637.79529"
+ height="259.9657"
+ x="7.0866141"
+ y="393.39374"
+ ry="0" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#c381d4;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021-3"
+ width="637.79529"
+ height="397.01163"
+ x="7.0866141"
+ y="648.26398" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#9cddbb;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ id="rect9021"
+ width="637.79529"
+ height="248.03148"
+ x="7.0866141"
+ y="145.27563" />
+ <ellipse
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="path43977-5"
+ cx="595.27563"
+ cy="1016.5654"
+ rx="43.041752"
+ ry="20.896084" />
+ <ellipse
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="path43977-90"
+ cx="595.27563"
+ cy="918.08032"
+ rx="43.041752"
+ ry="20.896084" />
+ <ellipse
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="path43977-5-4"
+ cx="595.27563"
+ cy="967.68665"
+ rx="43.041752"
+ ry="20.896086" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-6"
+ width="141.73227"
+ height="42.519688"
+ x="347.24405"
+ y="414.56696" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-0"
+ width="141.73227"
+ height="42.519688"
+ x="347.24405"
+ y="492.51971" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-2"
+ width="141.73227"
+ height="42.519688"
+ x="347.24405"
+ y="577.55908" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-2-5"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="853.93707" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-2-5-5-5"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="995.66937" />
+ <text
+ xml:space="preserve"
+ style="color:#000000;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-indent:0;text-align:center;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;letter-spacing:0px;word-spacing:0px;text-transform:none;direction:ltr;block-progression:tb;writing-mode:lr-tb;baseline-shi [...]
+ x="595.24817"
+ y="1012.6378"
+ id="text37127"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8110"
+ x="595.24817"
+ y="1012.6378">contig</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8112"
+ x="595.24817"
+ y="1026.7003">sequences</tspan></text>
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-7-9"
+ width="99.21257"
+ height="70.866142"
+ x="14.17323"
+ y="492.51971" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-7-9-5"
+ width="99.21257"
+ height="70.866142"
+ x="14.173261"
+ y="768.89764" />
+ <text
+ xml:space="preserve"
+ style="color:#000000;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-indent:0;text-align:center;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;letter-spacing:0px;word-spacing:0px;text-transform:none;direction:ltr;block-progression:tb;writing-mode:lr-tb;baseline-shi [...]
+ x="595.10535"
+ y="914.15271"
+ id="text37131"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8102"
+ x="595.10535"
+ y="914.15271">assembly</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8104"
+ x="595.10535"
+ y="928.21521">graph</tspan></text>
+ <text
+ xml:space="preserve"
+ style="color:#000000;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-indent:0;text-align:center;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;letter-spacing:0px;word-spacing:0px;text-transform:none;direction:ltr;block-progression:tb;writing-mode:lr-tb;baseline-shi [...]
+ x="595.02295"
+ y="963.75903"
+ id="text37135"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8114"
+ x="595.02295"
+ y="963.75903">read</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8116"
+ x="595.02295"
+ y="977.82153">layouts</tspan></text>
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134"
+ width="141.73227"
+ height="42.519688"
+ x="347.24411"
+ y="159.44884" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2"
+ width="141.73227"
+ height="42.519688"
+ x="347.24411"
+ y="237.4016" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-7"
+ width="99.21257"
+ height="70.866142"
+ x="14.173253"
+ y="237.40161" />
+ <ellipse
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="path43977-8"
+ cx="63.779526"
+ cy="187.43156"
+ rx="43.041752"
+ ry="20.896084" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="63.504868"
+ y="191.98935"
+ id="text4613"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan5271"
+ x="63.504868"
+ y="191.98935">Raw Reads</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend);color-rendering:a [...]
+ d="m 488.97638,180.70868 56.6929,2e-5"
+ id="path4140"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-1);color-renderi [...]
+ d="m 488.97638,258.66143 56.6929,2e-5"
+ id="path4140-0-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-9"
+ width="141.73227"
+ height="42.519688"
+ x="361.4173"
+ y="329.52759" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-8"
+ width="141.73227"
+ height="42.519688"
+ x="354.33075"
+ y="322.44098" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="315.35437" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.3107"
+ y="333.77698"
+ id="text26240"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8002"
+ x="418.3107"
+ y="333.77698">generate corrected</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8004"
+ x="418.3107"
+ y="347.83948">read consensus</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-6-1-6);color-rende [...]
+ d="m 488.97637,336.61421 56.69291,0"
+ id="path4140-3-7-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.0773"
+ y="177.87146"
+ id="text4731"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan5302"
+ x="418.0773"
+ y="177.87146">choose overlaps</tspan><tspan
+ sodipodi:role="line"
+ id="tspan5304"
+ x="418.0773"
+ y="191.93396">for correction</tspan></text>
+ <g
+ id="g9433"
+ transform="translate(-14.69528,85.403145)">
+ <ellipse
+ ry="20.896082"
+ rx="43.041752"
+ cy="95.305557"
+ cx="609.44879"
+ id="path43977"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4613-0"
+ y="92.468338"
+ x="609.42133"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;writing-mode:lr-tb;text-anchor:middle"
+ id="tspan4619-4"
+ y="92.468338"
+ x="609.42133"
+ sodipodi:role="line">global</tspan><tspan
+ id="tspan4806"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;writing-mode:lr-tb;text-anchor:middle"
+ y="106.53084"
+ x="609.42133"
+ sodipodi:role="line">scores</tspan></text>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.31076"
+ y="254.73384"
+ id="text4808"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan5298"
+ x="418.31076"
+ y="254.73384">estimate corrected</tspan><tspan
+ sodipodi:role="line"
+ id="tspan5300"
+ x="418.31076"
+ y="268.79633">read lengths</tspan></text>
+ <g
+ id="g9439"
+ transform="translate(-14.69528,85.403145)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="173.25832"
+ cx="609.44879"
+ id="path43977-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text5127"
+ y="170.4211"
+ x="609.43506"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="170.4211"
+ x="609.43506"
+ id="tspan5288"
+ sodipodi:role="line">read IDs</tspan><tspan
+ y="184.4836"
+ x="609.43506"
+ id="tspan5290"
+ sodipodi:role="line">to correct</tspan></text>
+ </g>
+ <g
+ id="g9445"
+ transform="translate(-14.173247,85.03939)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="251.57483"
+ cx="608.92676"
+ id="path43977-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text7349"
+ y="248.73761"
+ x="609.12726"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="248.73761"
+ x="609.12726"
+ id="tspan5273"
+ sodipodi:role="line">corrected</tspan><tspan
+ y="262.80011"
+ x="609.12726"
+ id="tspan5275"
+ sodipodi:role="line">reads</tspan></text>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="63.73835"
+ y="262.96622"
+ id="text12562"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan12564"
+ x="63.73835"
+ y="262.96622">Build read and</tspan><tspan
+ sodipodi:role="line"
+ x="63.73835"
+ y="277.02872"
+ id="tspan12566">overlap</tspan><tspan
+ sodipodi:role="line"
+ x="63.73835"
+ y="291.09122"
+ id="tspan12568">databases</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1-9);color-ren [...]
+ d="m 63.779527,209.05515 0,21.25985"
+ id="path4140-0-3-5-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.07721"
+ y="601.92255"
+ id="text26240-9"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan26242-4"
+ x="418.07721"
+ y="601.92255">output reads</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker7927);color-rendering:a [...]
+ d="m 488.97635,598.81895 56.69293,0"
+ id="path4140-3-7-5-1"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.23651"
+ y="440.02084"
+ id="text4731-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4733-2"
+ x="418.23651"
+ y="440.02084">trim reads</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.0827"
+ y="516.88324"
+ id="text4808-7"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4810-3"
+ x="418.0827"
+ y="516.88324">split reads</tspan></text>
+ <g
+ id="g9426"
+ transform="translate(-15.217375,85.039352)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="513.7796"
+ cx="609.97089"
+ id="path43977-0"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text7349-1"
+ y="510.94238"
+ x="610.33069"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="510.94238"
+ x="610.33069"
+ id="tspan8072"
+ sodipodi:role="line">trimmed</tspan><tspan
+ y="525.00488"
+ x="610.33069"
+ id="tspan8074"
+ sodipodi:role="line">reads</tspan><tspan
+ y="539.06738"
+ x="610.33069"
+ id="tspan8076"
+ sodipodi:role="line" /></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-4);color-renderin [...]
+ d="m 418.1102,457.08666 0,28.34646"
+ id="path4140-0-3-5-7-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker18066-2);color-renderin [...]
+ d="m 418.1102,535.03942 0,35.43307"
+ id="path4140-0-3-5-7-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-9-4"
+ width="141.73227"
+ height="42.519688"
+ x="361.41733"
+ y="690.94489" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-8-3"
+ width="141.73227"
+ height="42.519688"
+ x="354.33072"
+ y="683.85828" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-4"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="676.77167" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.07727"
+ y="702.22559"
+ id="text26240-4"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8086"
+ x="418.07727"
+ y="702.22559">detect errors in reads</tspan></text>
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-9-8"
+ width="141.73227"
+ height="42.519688"
+ x="361.4173"
+ y="783.06995" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-8-1"
+ width="141.73227"
+ height="42.519688"
+ x="354.33063"
+ y="775.98334" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-1"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="768.89771" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="417.90695"
+ y="786.22992"
+ id="text26240-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8088"
+ x="417.90695"
+ y="786.22992">recompute overlap</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8090"
+ x="417.90695"
+ y="800.29242">alignments</tspan></text>
+ <g
+ id="g9420"
+ transform="translate(-15.217375,85.039368)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="612.99219"
+ cx="609.97089"
+ id="path43977-2"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text28285"
+ y="609.03162"
+ x="609.7182"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="609.03162"
+ x="609.7182"
+ id="tspan8078"
+ sodipodi:role="line">errors</tspan><tspan
+ y="623.09412"
+ x="609.7182"
+ id="tspan8080"
+ sodipodi:role="line">in reads</tspan></text>
+ </g>
+ <g
+ id="g9414"
+ transform="translate(-15.217375,85.039379)">
+ <ellipse
+ ry="20.896084"
+ rx="43.041752"
+ cy="705.11816"
+ cx="609.97089"
+ id="path43977-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text32865"
+ y="702.28094"
+ x="609.93793"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="702.28094"
+ x="609.93793"
+ id="tspan8082"
+ sodipodi:role="line">adjusted</tspan><tspan
+ y="716.34344"
+ x="609.93793"
+ id="tspan8084"
+ sodipodi:role="line">error rates</tspan></text>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.07724"
+ y="871.26929"
+ id="text32894"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8092"
+ x="418.07724"
+ y="871.26929">construct contigs</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8094"
+ x="418.07724"
+ y="885.33179">(bogart)</tspan></text>
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-9-8-6"
+ width="141.73227"
+ height="42.519691"
+ x="361.41733"
+ y="938.97638" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-8-1-3"
+ width="141.73227"
+ height="42.519691"
+ x="354.33069"
+ y="931.88977" />
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;s [...]
+ id="rect4134-2-7-2-5-5"
+ width="141.73227"
+ height="42.519688"
+ x="347.24408"
+ y="924.80322" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-start:url(#Arrow1Mstart);marker-end:ur [...]
+ d="m 311.81102,938.97644 28.34646,0"
+ id="path4140-0-3-7-4-6-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker28297-7);color-renderin [...]
+ d="m 283.46457,953.14967 56.69291,0"
+ id="path4140-0-3-7-4-6-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 304.72441,1016.9292 0,-113.38583"
+ id="path11082-9-0-9"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 262.20472,1002.756 42.51969,0"
+ id="path12182-8-7-0"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.31073"
+ y="943.22583"
+ id="text37139"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8096"
+ x="418.31073"
+ y="943.22583">generate contig</tspan><tspan
+ sodipodi:role="line"
+ id="tspan8098"
+ x="418.31073"
+ y="957.28833">consensus</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="418.07727"
+ y="1019.7087"
+ id="text37143"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan8100"
+ x="418.07727"
+ y="1019.7087">generate outputs</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4);color-rendering [...]
+ d="m 283.46457,173.62207 56.69292,0"
+ id="path4140-0"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0);color-renderi [...]
+ d="m 283.46457,251.57482 56.69292,0"
+ id="path4140-0-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-5);color-rende [...]
+ d="m 283.46457,336.61419 56.69292,0"
+ id="path4140-0-3-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-2);color-rende [...]
+ d="m 304.72441,265.74805 35.43308,0"
+ id="path4140-0-3-77"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1);color-rende [...]
+ d="m 304.72441,187.79529 35.43308,0"
+ id="path4140-0-3-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-51);color-rend [...]
+ d="m 304.72441,350.78742 35.43308,0"
+ id="path4140-0-3-9"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 304.72441,357.87404 0,-170.07875"
+ id="path11080"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 283.46457,336.61419 0,-162.99212"
+ id="path11082"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661461;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 262.20473,173.62207 21.25984,0"
+ id="path12182"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 304.72441,357.87404 -41.63386,0.88582"
+ id="path12184"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018);color-rendering:a [...]
+ d="m 113.38583,294.09453 21.25984,0 7.08662,-2e-5 28.34646,0"
+ id="path40998"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-0);color-rendering [...]
+ d="m 113.38583,251.57484 28.34645,0 1e-5,-70.86616 28.34646,0"
+ id="path40998-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026);color-rendering:a [...]
+ d="m 595.27558,201.96854 0,7.08661 -177.16535,0 0,21.25985"
+ id="path44018"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026-9);color-rendering [...]
+ d="m 595.27558,279.9213 0,7.08661 -177.16535,0 0,21.25984"
+ id="path44018-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026-4);color-rendering [...]
+ d="m 595.27558,357.87406 0,35.43307 -531.496053,0 0,92.12599"
+ id="path44018-9"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="63.738327"
+ y="518.08435"
+ id="text12562-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan12564-8"
+ x="63.738327"
+ y="518.08435">Build read and</tspan><tspan
+ sodipodi:role="line"
+ x="63.738327"
+ y="532.14685"
+ id="tspan12566-0">overlap</tspan><tspan
+ sodipodi:role="line"
+ x="63.738327"
+ y="546.20935"
+ id="tspan12568-24">databases</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-8);color-renderi [...]
+ d="m 282.57872,427.8544 56.69292,0"
+ id="path4140-0-9"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-9);color-rende [...]
+ d="m 282.57872,505.80716 56.69292,0"
+ id="path4140-0-3-1"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-5-5);color-ren [...]
+ d="m 282.57872,590.84653 56.69292,0"
+ id="path4140-0-3-7-7"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-2-2);color-ren [...]
+ d="m 303.83856,519.98039 35.43308,0"
+ id="path4140-0-3-77-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1-3);color-ren [...]
+ d="m 303.83856,442.02763 35.43308,0"
+ id="path4140-0-3-5-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-51-9);color-re [...]
+ d="m 303.83856,605.01976 35.43308,0"
+ id="path4140-0-3-9-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 304.72439,612.99218 0,-170.07875"
+ id="path11080-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 282.57873,590.84653 0,-162.99213"
+ id="path11082-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661461;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 261.31888,427.8544 21.25984,0"
+ id="path12182-81"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 304.72439,612.99218 -42.51969,0"
+ id="path12184-47"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-2);color-rendering [...]
+ d="m 112.49998,548.32687 21.25984,0 7.08662,-2e-5 28.34646,0"
+ id="path40998-1"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-0-9);color-renderi [...]
+ d="m 112.49998,505.80718 28.34645,0 1e-5,-70.86617 28.34646,0"
+ id="path40998-3-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker44026-4-8);color-renderi [...]
+ d="m 595.27558,620.07879 0,28.34646 -532.381883,-10e-6 0,106.29922"
+ id="path44018-9-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="63.73835"
+ y="794.46222"
+ id="text12562-3-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan12564-8-5"
+ x="63.73835"
+ y="794.46222">Build read and</tspan><tspan
+ sodipodi:role="line"
+ x="63.73835"
+ y="808.52472"
+ id="tspan12566-0-4">overlap</tspan><tspan
+ sodipodi:role="line"
+ x="63.73835"
+ y="822.58722"
+ id="tspan12568-24-3">databases</tspan></text>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-8-0);color-rende [...]
+ d="m 283.46456,690.94493 56.69292,0"
+ id="path4140-0-9-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-9-6);color-ren [...]
+ d="m 282.57875,782.1851 56.69292,0"
+ id="path4140-0-3-1-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-5-5-3);color-r [...]
+ d="m 282.57875,867.22447 56.69292,0"
+ id="path4140-0-3-7-7-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-2-2-6);color-r [...]
+ d="m 303.83859,796.35833 35.43308,0"
+ id="path4140-0-3-77-2-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-1-3-4);color-r [...]
+ d="m 304.7244,705.11816 35.43308,0"
+ id="path4140-0-3-5-3-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow1Mend-4-0-51-9-3);color- [...]
+ d="m 303.83859,881.3977 35.43308,0"
+ id="path4140-0-3-9-6-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 304.72441,889.37014 0,-184.25198"
+ id="path11080-5-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;sha [...]
+ d="m 283.46456,953.14967 10e-6,-262.20474"
+ id="path11082-5-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 304.72441,889.37014 -42.51968,0"
+ id="path12184-47-8"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-2-8);color-renderi [...]
+ d="m 112.50001,824.70481 21.25984,0 7.08662,-2e-5 28.34646,0"
+ id="path40998-1-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker41018-0-9-3);color-rende [...]
+ d="m 112.50001,782.18512 28.34645,0 1e-5,-70.86617 28.34646,0"
+ id="path40998-3-3-1"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker7927-8);color-rendering [...]
+ d="m 488.97637,698.03155 56.69291,0"
+ id="path4140-3-7-5-1-9"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker7927-4);color-rendering [...]
+ d="m 488.97637,790.15754 56.69291,0"
+ id="path4140-3-7-5-1-90"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker16499)"
+ d="m 595.27558,811.41738 0,28.34646 -283.46456,0"
+ id="path16491"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker24623)"
+ d="m 488.97637,1016.9292 56.69291,0"
+ id="path24605"
+ inkscape:connector-curvature="0" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker18341)"
+ d="m 531.49606,1016.9292 0,-49.6063 14.17322,0"
+ id="path24607"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker25419)"
+ d="m 517.32283,1016.9292 0,-92.12599 28.34645,0"
+ id="path24609"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker28297-7-3);color-render [...]
+ d="m 304.72441,1016.9292 35.43307,0"
+ id="path4140-0-3-7-4-6-3-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker27362)"
+ d="m 488.97637,882.28352 14.17323,0 0,28.34642 -191.33858,0"
+ id="path27354"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker6234);color-rendering:au [...]
+ d="m 488.97637,868.11025 28.34646,5e-5 0,42.51968 28.34645,0"
+ id="path5422"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker7170);color-rendering:au [...]
+ d="m 595.27558,719.29139 0,28.34646 -177.16535,4e-5 0,14.17323"
+ id="path7162"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <g
+ transform="translate(460.62991,70.866162)"
+ id="g9143-0">
+ <rect
+ y="95.669312"
+ x="-283.46457"
+ height="85.039368"
+ width="85.039368"
+ id="rect9099-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -198.42519,180.70869 a 42.519684,10.629927 0 0 1 -21.32511,9.21519 42.519684,10.629927 0 0 1 -42.58475,-0.0283 42.519684,10.629927 0 0 1 -21.1289,-9.24339"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629927"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70869"
+ sodipodi:cx="-240.94487"
+ sodipodi:type="arc"
+ id="path4460-2-5-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-6"
+ d="m -198.42519,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-7"
+ d="m -283.46456,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629922"
+ rx="42.519684"
+ cy="95.669312"
+ cx="-240.94487"
+ id="path4460-6-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-0"
+ y="148.37924"
+ x="-240.95587"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="148.37924"
+ x="-240.95587"
+ id="tspan4489-1-0"
+ sodipodi:role="line">gkpStore</tspan></text>
+ </g>
+ <g
+ transform="translate(340.15747,184.25198)"
+ id="g9134-8">
+ <rect
+ y="95.669312"
+ x="-162.99213"
+ height="85.039375"
+ width="85.039368"
+ id="rect9099-9-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -77.952744,180.70871 a 42.519684,10.629928 0 0 1 -21.325112,9.21519 42.519684,10.629928 0 0 1 -42.584754,-0.0283 42.519684,10.629928 0 0 1 -21.1289,-9.2434"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629928"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70871"
+ sodipodi:cx="-120.47243"
+ sodipodi:type="arc"
+ id="path4460-2-5-1-0"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-1-5"
+ d="m -77.952755,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-8-6"
+ d="m -162.99212,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629923"
+ rx="42.519684"
+ cy="95.669319"
+ cx="-120.47243"
+ id="path4460-6-6-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ddefe5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-5-8"
+ y="149.46964"
+ x="-120.48343"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="149.46964"
+ x="-120.48343"
+ id="tspan4489-1-9-1"
+ sodipodi:role="line">ovlStore</tspan></text>
+ </g>
+ <g
+ transform="translate(340.15745,439.3701)"
+ id="g9134-7">
+ <rect
+ y="95.669312"
+ x="-162.99213"
+ height="85.039375"
+ width="85.039368"
+ id="rect9099-9-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -77.952744,180.70871 a 42.519684,10.629928 0 0 1 -21.325112,9.21519 42.519684,10.629928 0 0 1 -42.584754,-0.0283 42.519684,10.629928 0 0 1 -21.1289,-9.2434"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629928"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70871"
+ sodipodi:cx="-120.47243"
+ sodipodi:type="arc"
+ id="path4460-2-5-1-04"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-1-4"
+ d="m -77.952755,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-8-0"
+ d="m -162.99212,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629923"
+ rx="42.519684"
+ cy="95.669319"
+ cx="-120.47243"
+ id="path4460-6-6-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-5-4"
+ y="149.46964"
+ x="-120.48343"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="149.46964"
+ x="-120.48343"
+ id="tspan4489-1-9-4"
+ sodipodi:role="line">ovlStore</tspan></text>
+ </g>
+ <g
+ transform="translate(340.15747,715.74806)"
+ id="g9134-3">
+ <rect
+ y="95.669312"
+ x="-162.99213"
+ height="85.039375"
+ width="85.039368"
+ id="rect9099-9-4"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -77.952744,180.70871 a 42.519684,10.629928 0 0 1 -21.325112,9.21519 42.519684,10.629928 0 0 1 -42.584754,-0.0283 42.519684,10.629928 0 0 1 -21.1289,-9.2434"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629928"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70871"
+ sodipodi:cx="-120.47243"
+ sodipodi:type="arc"
+ id="path4460-2-5-1-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-1-40"
+ d="m -77.952755,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-8-68"
+ d="m -162.99212,95.669321 0,85.039379"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629923"
+ rx="42.519684"
+ cy="95.669319"
+ cx="-120.47243"
+ id="path4460-6-6-5"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-5-5"
+ y="149.46964"
+ x="-120.48343"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="149.46964"
+ x="-120.48343"
+ id="tspan4489-1-9-8"
+ sodipodi:role="line">ovlStore</tspan></text>
+ </g>
+ <g
+ id="g5493">
+ <rect
+ y="924.80322"
+ x="177.16534"
+ height="85.039375"
+ width="85.039368"
+ id="rect9099-9-9"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m 262.20473,1009.8426 a 42.519684,10.629928 0 0 1 -21.32512,9.2152 42.519684,10.629928 0 0 1 -42.58475,-0.028 42.519684,10.629928 0 0 1 -21.1289,-9.2434"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629928"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="1009.8426"
+ sodipodi:cx="219.68504"
+ sodipodi:type="arc"
+ id="path4460-2-5-1-1"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-1-9"
+ d="m 262.20471,924.80321 0,85.03939"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-8-8"
+ d="m 177.16535,924.80321 0,85.03939"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629923"
+ rx="42.519684"
+ cy="924.80322"
+ cx="219.68504"
+ id="path4460-6-6-34"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-5-0"
+ y="978.60352"
+ x="219.67404"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="978.60352"
+ x="219.67404"
+ id="tspan4489-1-9-2"
+ sodipodi:role="line">tigStore</tspan></text>
+ </g>
+ <g
+ transform="translate(460.62989,325.98428)"
+ id="g9143-4">
+ <rect
+ y="95.669312"
+ x="-283.46457"
+ height="85.039368"
+ width="85.039368"
+ id="rect9099-8"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -198.42519,180.70869 a 42.519684,10.629927 0 0 1 -21.32511,9.21519 42.519684,10.629927 0 0 1 -42.58475,-0.0283 42.519684,10.629927 0 0 1 -21.1289,-9.24339"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629927"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70869"
+ sodipodi:cx="-240.94487"
+ sodipodi:type="arc"
+ id="path4460-2-5-3"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-4"
+ d="m -198.42519,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-3"
+ d="m -283.46456,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629922"
+ rx="42.519684"
+ cy="95.669312"
+ cx="-240.94487"
+ id="path4460-6-39"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#ffd1d5;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-8"
+ y="148.37924"
+ x="-240.95587"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="148.37924"
+ x="-240.95587"
+ id="tspan4489-1-7"
+ sodipodi:role="line">gkpStore</tspan></text>
+ </g>
+ <g
+ transform="translate(460.62991,602.36223)"
+ id="g9143-8">
+ <rect
+ y="95.669312"
+ x="-283.46457"
+ height="85.039368"
+ width="85.039368"
+ id="rect9099-81"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.99999994px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto; [...]
+ <path
+ d="m -198.42519,180.70869 a 42.519684,10.629927 0 0 1 -21.32511,9.21519 42.519684,10.629927 0 0 1 -42.58475,-0.0283 42.519684,10.629927 0 0 1 -21.1289,-9.24339"
+ sodipodi:open="true"
+ sodipodi:end="3.1469076"
+ sodipodi:start="0"
+ sodipodi:ry="10.629927"
+ sodipodi:rx="42.519684"
+ sodipodi:cy="180.70869"
+ sodipodi:cx="-240.94487"
+ sodipodi:type="arc"
+ id="path4460-2-5-6"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4468-6-3"
+ d="m -198.42519,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <path
+ inkscape:connector-curvature="0"
+ id="path4470-4-0"
+ d="m -283.46456,95.669315 0,85.039375"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto [...]
+ <ellipse
+ ry="10.629922"
+ rx="42.519684"
+ cy="95.669312"
+ cx="-240.94487"
+ id="path4460-6-32"
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#efc1fc;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.77165353;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:aut [...]
+ <text
+ sodipodi:linespacing="125%"
+ id="text4487-63-3"
+ y="148.37924"
+ x="-240.95587"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ xml:space="preserve"><tspan
+ y="148.37924"
+ x="-240.95587"
+ id="tspan4489-1-6"
+ sodipodi:role="line">gkpStore</tspan></text>
+ </g>
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:7.08661413;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 283.46457,705.11816 -21.25985,0"
+ id="path12184-47-8-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="-269.46991"
+ y="-0.15930183"
+ id="text4487-63-5-0-7"
+ sodipodi:linespacing="125%"
+ transform="matrix(0,-1,1,0,0,0)"><tspan
+ sodipodi:role="line"
+ id="tspan4489-1-9-2-9"
+ x="-269.46991"
+ y="-0.15930183">Correct</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="-522.87946"
+ y="-6.7621535e-08"
+ id="text4487-63-5-0-3"
+ sodipodi:linespacing="125%"
+ transform="matrix(0,-1,1,0,0,0)"><tspan
+ sodipodi:role="line"
+ id="tspan4489-1-9-2-8"
+ x="-522.87946"
+ y="-6.7621535e-08">Trim</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Bitstream Vera Sans';-inkscape-font-specification:'Bitstream Vera Sans, Normal';text-align:center;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ x="-846.51434"
+ y="-0.15930183"
+ id="text4487-63-5-0-0"
+ sodipodi:linespacing="125%"
+ transform="matrix(0,-1,1,0,0,0)"><tspan
+ sodipodi:role="line"
+ id="tspan4489-1-9-2-2"
+ x="-846.51434"
+ y="-0.15930183">Assemble</tspan></text>
+ </g>
+</svg>
diff --git a/documentation/source/faq.rst b/documentation/source/faq.rst
index c65b23e..3863259 100644
--- a/documentation/source/faq.rst
+++ b/documentation/source/faq.rst
@@ -2,123 +2,206 @@
.. _faq:
Canu FAQ
-========================
+========
-**Q**:
- What resources does Canu require for a bacterial genome assembly? A mammalian assembly?
-**A**:
- Canu is designed to scale resources to the system it runs on. It will report if the a system does not meet the minimum requirements for a given genome size.
-
- Typically, a bacterial genome can be assembled in 1-10 cpu hours, depending on coverage (~20 min on 16-cores) and 4GB of ram (8GB is recommended). A mammalian genome (such as human) can be assembled in 10-25K cpu hours, depending on coverage (a grid environment is recommended) and at least one machine with 64GB of ram (128GB is recommended).
-
-**Q**:
- What parameters should I use for my genome? Sequencing type?
-
-**A**:
- By default, Canu is designed to be universal on a large range of PacBio (C2-P6-C4) and Oxford Nanopore (R6-R9) data. You can adjust parameters to increase efficiency for your datatype. For example, for higher coverage PacBio datasets, especially from inbred samples, you can decrease the error rate (``errorRate=0.013``). For recent Nanopore data (R9) 2D data, you can also decrease the default error rate (``errorRate=0.013``).
-
- With R7 1D sequencing data, multiple rounds of error correction are helpful. This should not be necessary for sequences over 85% identity. You can run just the correction from Canu with the options
-
- ::
-
- -correct corOutCoverage=500 corMinCoverage=0 corMhapSensitivity=high
-
- for 5-10 rounds, supplying the asm.correctedReads.fasta.gz output from round ``i-1`` to round ``i``. Assemble with
-
- ::
-
- -nanopore-corrected <your data> errorRate=0.1 utgGraphDeviation=50
-
-**Q**:
- How do I run Canu on my SLURM/SGE/PBS/LSF/Torque system?
+.. contents::
+ :local:
-**A**:
- Canu will auto-detect and configure itself to submit on most grids. If your grid requires special options (such as a partition on SLURM or an account code on SGE, specify it with ``gridOptions="<your options list>"`` which will passed to the sheduler by Canu. If you have a grid system but prefer to run locally, specify useGrid=false
-
-**Q**:
- My asm.contigs.fasta is empty, why?
-**A**:
- By default, canu will split the final output into three files:
+What resources does Canu require for a bacterial genome assembly? A mammalian assembly?
+-------------------------------------
+ Canu will detect available resources and configure itself to run efficiently using those
+ resources. It will request resources, for example, the number of compute threads to use, based
+ on the ``genomeSize`` being assembled. It will fail to even start if it feels there are
+ insufficient resources available.
+
+ A typical bacterial genome can be assembled with 8GB memory in a few CPU hours - around an hour
+ on 8 cores. It is possible, but not allowed by default, to run with only 4GB memory.
- asm.contigs.fasta
- Everything which could be assembled and is part of the primary assembly, including both unique and repetitive elements. Each contig has several flags included on the fasta def line::
+ A well-behaved large genome, such as human or other mammals, can be assembled in 10,000 to
+ 25,000 CPU hours, depending on coverage. A grid environment is strongly recommended, with at
+ least 16GB available on each compute node, and one node with at least 64GB memory. You should
+ plan on having 3TB free disk space, much more for highly repetitive genomes.
- asm.bubbles.fasta
- alternate paths in the graph which could not be merged into the primary assembly.
+ Our compute nodes have 48 compute threads and 128GB memory, with a few larger nodes with up to
+ 1TB memory. We develop and test (mostly bacteria, yeast and drosophila) on laptops and desktops
+ with 4 to 12 compute threads and 16GB to 64GB memory.
- asm.unassembled.fasta
- reads/tigs which could not be incorporated into the primary or bubble assemblies.
+
+How do I run Canu on my SLURM / SGE / PBS / LSF / Torque system?
+-------------------------------------
+ Canu will detect and configure itself to run on most grids. You can supply your own grid
+ options, such as a partition on SLURM or an account code on SGE, with ``gridOptions="<your
+ options list>"`` which will be passed to every job submitted by Canu. Similar options exist for
+ every stage of Canu, which could be used to, for example, restrict overlapping to a specific
+ partition or queue.
- It is possible for tigs comprised of multiple reads to end up in asm.unassembled.fasta. The default filtering eliminates anything with < 2 reads, shorter than 1000bp, or comprised of mostly a single sequence (>75%). The filtering is controlled by the contigFilter parameter which takes 5 values.
+ To disable grid support and run only on the local machine, specify ``useGrid=false``
- ::
- contigFilter
- minReads
- minLength
- singleReadSpan
- lowCovSpan
- lowCovDepth
+What parameters should I use for my genome? Sequencing type?
+-------------------------------------
+ Canu is designed to be universal on a large range of PacBio (C2-P6-C4) and Oxford Nanopore
+ (R6-R9) data. You can adjust parameters to increase efficiency for your datatype:
+
+ **Nanopore R7 1D** and **Low Identity Reads**
+ With R7 1D sequencing data, and generally for any reads lower than 80% identity, five to ten
+ rounds of error correction are helpful. To run just the correction phase, use options
+ ``-correct corOutCoverage=500 corMinCoverage=0 corMhapSensitivity=high``. Use the output of
+ the previous run (in ``asm.correctedReads.fasta.gz``) as input to the next round.
- The default filtering is ``2 1000 0.75 0.75 2``. If you are assembling amplified data or viral data, it is possible your assembly will be flagged as unassembled. In those cases, you can turn off the filtering with the parameters
+ Once corrected, assemble with ``-nanopore-corrected <your data> errorRate=0.1 utgGraphDeviation=50``
- ::
+ **Nanopore R7 2D** and **Nanopore R9 1D**
+ ``errorRate=0.025``
- contigFilter="2 1000 1.0 1.0 2"
+ **Nanopore R9 2D** and **PacBio P6**
+ ``errorRate=0.013``
-**Q**:
- Why is my assembly is missing my favorite short plasmid X?
+ **PacBio Sequel**
+ Based on exactly one publicly released `*A. thaliana* dataset <http://www.pacb.com/blog/sequel-system-data-release-arabidopsis-dataset-genome-assembly/>`_,
+ ``errorRate=0.013 corMhapSensitivity=normal``
-**A**:
- The first step in Canu is to find high-error overlaps and generate corrected sequences for subsequent assembly. This is currently the fastest step in Canu. By default, only the longest 40X of data (based on the specified genome size) is used for correction. If you have a dataset with uneven coverage or small plasmids, correcting the longest 40X may not give you sufficient coverage of your genome/plasmid. In these cases, you can set
+My assembly continuity is not good, how can I improve it?
+-------------------------------------
+ The most important determinant for assembly quality is sequence length, followed by the repeat
+ complexity/heterozygosity of your sample. The first thing to check is the amount of corrected
+ bases output by the correction step. This is logged in the stdout of Canu or in
+ canu-scripts/canu.*.out if you are running in a grid environment. For example on `a
+ haploid H. sapiens <https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SAMN02744161>`_ sample:
+
::
+
+ -- BEGIN TRIMMING
+ --
+ ...
+ -- In gatekeeper store 'chm1/trimming/asm.gkpStore':
+ -- Found 5459105 reads.
+ -- Found 91697412754 bases (29.57 times coverage).
+ ...
+
+ Canu tries to correct the longest 40X of data. Some loss is normal but having output coverage
+ below 20-25X is a sign that correction did not work well (assuming you have more input coverage
+ than that). If that is the case, re-running with ``corMhapSensitivity=normal`` if you have >50X
+ or ``corMhapSensitivity=high corMinCoverage=0`` otherwise can help. You can also increase the
+ target coverage to correct (``corOutCoverage=100``) to get more corrected sequences for assembly. If
+ there are sufficient corrected reads, the poor assembly is likely due to either repeats in the
+ genome being greater than read lengths or a high heterozygosity in the sample. Stay tuned for more
+ information on tuning unitigging in those instances.
+
+
+What parameters can I tweak?
+-------------------------------------
+ For all stages:
+
+ - ``errorRate`` is the expected error rate in *corrected* reads. It is a meta-parameter that
+ sets other parameters. It has been obsolesced and will eventually be removed.
+
+ - ``minReadLength`` and ``minOverlapLength``. The defaults are to discard reads shorter than
+ 1000bp and to not look for overlaps shorter than 500bp. Increasing ``minReadLength`` can
+ improve run time, and increasing ``minOverlapLength`` can improve assembly quality by removing
+ false overlaps. However, increasing either too much will quickly degrade assemblies by either
+ omitting valuable reads or missing true overlaps.
+
+ For correction:
+
+ - ``corOutCoverage`` controls how much coverage in corrected reads is generated. The default is
+ to target 40X, but, for various reasons, this results in 30X to 35X of reads being generated.
+ - ``corMinCoverage``, loosely, controls the quality of the corrected reads. It is the coverage
+ in evidence reads that is needed before a (portion of a) corrected read is reported.
+ Corrected reads are generated as a consensus of other reads; this is just the minimum coverage
+ needed for the consensus sequence to be reported. The default is based on input read
+ coverage: 0x coverage for less than 30X input coverage, and 4x coverage for more than that.
+
+ For assembly:
+
+ - ``utgOvlErrorRate`` is essentially a speed optimization. Overlaps above this error rate are
+ not computed. Setting it too high generally just wastes compute time, while setting it too
+ low will degrade assemblies by missing true overlaps between lower quality reads.
+
+ - ``utgGraphDeviation`` and ``utgRepeatDeviation`` control what quality of overlaps are used in contig
+ construction or in breaking contigs at false repeat joins, respectively. Both are in terms of
+ a deviation from the mean error rate in the longest overlaps.
+
+ - ``utgRepeatConfusedBP`` controls how similar a true overlap (between two reads in the same
+ contig) and a false overlap (between two reads in different contigs) need to be before the
+ contig is split. When this occurs, it isn't clear which overlap is 'true' - the longer one or
+ the slightly shorter one - and the contig is split to avoid misassemblies.
- corOutCoverage=1000
-
- Or any large value greater than your total input coverage which will correct and assemble all input data, at the expense of runtime. This option is also recommended for metagenomic datasets where all data is useful for assembly.
+
+My asm.contigs.fasta is empty, why?
+-------------------------------------
+ Canu will split the final output into three files:
-**Q**:
- Why do I get only 30X of corrected data?
+ <prefix>.contigs.fasta
+ Everything which could be assembled and is part of the primary assembly, including both unique
+ and repetitive elements. Each contig has several flags included on the fasta def line.
-**A**:
- By default, only the longest 40X of data (based on the specified genome size) is used for correction. Typically, some reads are trimmed during correction due to being chimeric or having erroneous sequence, resulting in a loss of 20-25% (30X output). You can force correction to be non-lossy by setting
+ **This file currently includes alternate paths.**
- ::
+ <prefix>.bubbles.fasta
+ Alternate paths in the graph which could not be merged into the primary assembly.
- corMinCoverage=0
+ **This file is currently ALWAYS empty.**
- In which case the corrected reads output will be the same length as the input data, keeping any high-error unsupported bases. Canu will trim these in downstream steps before assembly.
+ <prefix>.unassembled.fasta
+ Reads and small contigs that appear to be falsely assembled. These are generally low quality
+ reads or assemblies of a few low quality reads.
-**Q**:
- What is the minimum coverage required to run Canu?
+ **Small plasmids (unfortunately) tend to end up here.**
-**A**:
- We have found that on eukaryotic genomes >=20X typically begins to outperform current hybrid methods. For low coverage datasets (<=30X) we recommend the following parameters
+ The ``contigFilter=<minReads minLength singleReadSpan lowCovFraction lowCovDepth>`` parameter
+ sets parameters for several filters that decide which contigs are 'unassembled'. A contig is
+ 'unassembled' if it:
+ - has fewer than minReads (2) reads, or
+ - is shorter than minLength (1000), or
+ - has a single read spanning singleReadSpan percent (75%) of the contig, or
+ - has less than lowCovDepth (2) coverage over at least lowCovSpan fraction (0.75) of the contig
+ The default filtering is ``contigFilter="2 1000 0.75 0.75 2"``.
- ::
+ If you are assembling amplified or viral data, it is possible your assembly will be flagged as
+ unassembled. Turn off filtering with the parameters ``contigFilter="2 1000 1.0 1.0 2"``.
- corMinCoverage=0 errorRate=0.035
- For high-coverage datasets (typically >=60X) you can decrease the error rate since the higher number of reads should allow sufficient assembly from only the best subset
+Why is my assembly missing my favorite short plasmid?
+-------------------------------------
+ Only the longest 40X of data (based on the specified genome size) is used for
+ correction. Datasets with uneven coverage or small plasmids can fail to generate enough
+ corrected reads to give enough coverage for assembly, resulting in gaps in the genome or zero
+ reads for small plasmids. Set ``corOutCoverage=1000`` (any value greater than your total input
+ coverage) to correct all input data.
- ::
+ This option is also recommended for metagenomic datasets where all data is useful for assembly.
- errorRate=0.013
- However, the above is mainly an optimization for speed and will not affect your assembly continuity.
+Why do I get less corrected read data than I asked for?
+-------------------------------------
+ Some reads are trimmed during correction due to being chimeric or because there wasn't enough
+ evidence to generate a quality corrected sequence. Typically, this results in a 25% loss.
+ Setting ``corMinCoverage=0`` will report all bases, even those of low quality. Canu will
+ trim these in its 'trimming' phase before assembly.
-**Q**:
- My genome is AT/GC rich, do I need to adjust parameters?
+What is the minimum coverage required to run Canu?
+-------------------------------------
+ For eukaryotic genomes, coverage more than 20X is enough to outperform current hybrid methods.
+ - For less than 30X coverage, we recommend using ``corMinCoverage=0 errorRate=0.035`` to correct
+ as many reads as possible.
+ - For more than 60X coverage, we recommend using ``errorRate=0.013`` to slightly decrease the
+ error rate to use only the better reads. This is primarily an optimization for speed and
+ generally does not improve (or degrade) assembly continuity.
-**A**:
- On bacterial genomes, typically no. On repetitive genomes with AT<=25 or 75>=AT (or GC) the sequence biases the Jaccard estimate used by MHAP. In those cases setting
- ::
+My genome is AT (or GC) rich, do I need to adjust parameters? What about highly repetitive genomes?
+-------------------------------------
+ On bacterial genomes, no adjustment of parameters is (usually) needed. See the next question.
- corMaxEvidenceErate=0.15
+ On repetitive genomes with a significantly skewed AT/GC ratio, the Jaccard estimate used by
+ MHAP is biased. Setting ``corMaxEvidenceErate=0.15`` is sufficient to correct for the bias in
+ our testing.
- has been sufficient to correct for the bias in our testing. In general, with high coverage repetitive genomes (such as plants) it can be beneficial to set the above parameter as it will eliminate repetitive matches, speed up the assembly, and sometime improve unitigs.
+ In general, with high coverage repetitive genomes (such as plants) it can be beneficial to set
+ the above parameter anyway, as it will eliminate repetitive matches, speed up the assembly, and
+ sometimes improve unitigs.
diff --git a/documentation/source/index.rst b/documentation/source/index.rst
index 4fb5474..37490b2 100644
--- a/documentation/source/index.rst
+++ b/documentation/source/index.rst
@@ -43,9 +43,28 @@ Canu
`Canu <http://github.com/marbl/canu>`_ is a fork of the Celera Assembler designed for high-noise single-molecule sequencing (such as
-the PacBio RSII or Oxford Nanopore MinION). You can `download <http://github.com/marbl/canu/releases>`_ a release. If you encounter
+the PacBio RSII or Oxford Nanopore MinION).
+
+Publication
+===========
+Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. `Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation <http://biorxiv.org/content/early/2016/08/24/071282>`_. bioRxiv. (2016).
+
+Install
+=========
+The easiest way to get started is to download a `release <https://github.com/marbl/canu/releases>`_. If you encounter
any issues, please report them using the `github issues <http://github.com/marbl/canu/issues>`_ page.
+Alternatively, you can also build the latest unreleased version from GitHub:
+
+::
+
+ git clone https://github.com/marbl/canu.git
+ cd canu/src
+ make -j <number of threads>
+
+Learn
+=========
+
* :ref:`Quick Start <quickstart>` - no experience or data required, download and assemble *Escherichia coli* today!
* :ref:`FAQ <faq>` - Frequently asked questions
* :ref:`Canu tutorial <tutorial>` - a gentle introduction to the complexities of canu.
diff --git a/documentation/source/overlap_transformations.svg b/documentation/source/overlap_transformations.svg
new file mode 100644
index 0000000..f6757f5
--- /dev/null
+++ b/documentation/source/overlap_transformations.svg
@@ -0,0 +1,500 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="615"
+ height="145"
+ id="svg2"
+ sodipodi:version="0.32"
+ inkscape:version="0.47 r22583"
+ sodipodi:docname="4-OverlapTransforms.svg"
+ inkscape:output_extension="org.inkscape.output.svg.inkscape"
+ version="1.1"
+ inkscape:export-filename="/work/wgs/wiki/ovl/4-OverlapTransforms.png"
+ inkscape:export-xdpi="90"
+ inkscape:export-ydpi="90">
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ gridtolerance="10000"
+ guidetolerance="10"
+ objecttolerance="10"
+ inkscape:pageopacity="1"
+ inkscape:pageshadow="2"
+ inkscape:zoom="1.4142136"
+ inkscape:cx="316.98265"
+ inkscape:cy="124.60159"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ showguides="false"
+ inkscape:window-width="1272"
+ inkscape:window-height="963"
+ inkscape:window-x="2078"
+ inkscape:window-y="1"
+ inkscape:window-maximized="0">
+ <inkscape:grid
+ id="grid2383"
+ type="xygrid"
+ empspacing="5"
+ visible="true"
+ enabled="true"
+ snapvisiblegridlinesonly="true" />
+ </sodipodi:namedview>
+ <defs
+ id="defs4">
+ <marker
+ style="overflow:visible"
+ id="Arrow1Sstart"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Sstart">
+ <path
+ transform="matrix(0.2,0,0,0.2,1.2,0)"
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+ id="path3269" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Sstart"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Sstart">
+ <path
+ transform="matrix(0.3,0,0,0.3,-0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3287" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Send"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Send">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3290" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Mstart"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Mstart">
+ <path
+ transform="scale(0.6,0.6)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3190" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Lstart"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Lstart">
+ <path
+ transform="matrix(1.1,0,0,1.1,1.1,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3184" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Lend"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Lend">
+ <path
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3187" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow1Lend"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow1Lend">
+ <path
+ transform="matrix(-0.8,0,0,-0.8,-10,0)"
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+ id="path3169" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Mend"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Mend">
+ <path
+ transform="scale(-0.6,-0.6)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ id="path3193" />
+ </marker>
+ <inkscape:perspective
+ id="perspective10"
+ inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
+ inkscape:vp_z="744.09448 : 526.18109 : 1"
+ inkscape:vp_y="0 : 1000 : 0"
+ inkscape:vp_x="0 : 526.18109 : 1"
+ sodipodi:type="inkscape:persp3d" />
+ <marker
+ style="overflow:visible"
+ id="Arrow2Sendn"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Sendn">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#2184db;fill-rule:evenodd;stroke:#2184db;stroke-width:0.625;stroke-linejoin:round"
+ id="path12633" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2SendS"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2SendS">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#f31746;fill-rule:evenodd;stroke:#f31746;stroke-width:0.625;stroke-linejoin:round"
+ id="path12770" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Send8"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Send8">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#f31746;fill-rule:evenodd;stroke:#f31746;stroke-width:0.625;stroke-linejoin:round"
+ id="path12773" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2SendW"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2SendW">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#f31746;fill-rule:evenodd;stroke:#f31746;stroke-width:0.625;stroke-linejoin:round"
+ id="path12776" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2SendJ"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2SendJ">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#2184db;fill-rule:evenodd;stroke:#2184db;stroke-width:0.625;stroke-linejoin:round"
+ id="path12779" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Send6"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Send6">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#2184db;fill-rule:evenodd;stroke:#2184db;stroke-width:0.625;stroke-linejoin:round"
+ id="path12782" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2Sendv"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2Sendv">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#eb1943;fill-rule:evenodd;stroke:#eb1943;stroke-width:0.625;stroke-linejoin:round"
+ id="path12785" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="Arrow2SendC"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="Arrow2SendC">
+ <path
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ style="font-size:12px;fill:#2184db;fill-rule:evenodd;stroke:#2184db;stroke-width:0.625;stroke-linejoin:round"
+ id="path12944" />
+ </marker>
+ </defs>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ id="layer1"
+ inkscape:groupmode="layer"
+ inkscape:label="Layer 1"
+ transform="translate(-78.124996,-428.09552)">
+ <path
+ sodipodi:nodetypes="cc"
+ id="path2391"
+ d="m 80,437.36218 150,0"
+ style="fill:none;stroke:#eb1943;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Sendv)" />
+ <path
+ id="path6035"
+ d="m 330,457.36218 c -140,0 -150,0 -150,0"
+ style="fill:none;stroke:#2184db;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:none;marker-end:url(#Arrow2Sendn)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text8516"
+ y="457.36218"
+ x="115"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="457.36218"
+ x="115"
+ id="tspan8518"
+ sodipodi:role="line">a = x</tspan></text>
+ <text
+ sodipodi:linespacing="125%"
+ id="text8520"
+ y="437.36218"
+ x="270"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="437.36218"
+ x="270"
+ id="tspan8522"
+ sodipodi:role="line">b = y</tspan></text>
+ <path
+ sodipodi:nodetypes="cc"
+ id="path8524"
+ d="m 79.999996,567.36218 150.000004,0"
+ style="fill:none;stroke:#f31746;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2SendS)" />
+ <path
+ id="path8526"
+ d="m 330,547.36218 c -140,0 -150,0 -150,0"
+ style="fill:none;stroke:#2184db;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:none;marker-end:url(#Arrow2SendC)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text8536"
+ y="557.36218"
+ x="115"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="557.36218"
+ x="115"
+ id="tspan8538"
+ sodipodi:role="line">a = -x</tspan></text>
+ <text
+ sodipodi:linespacing="125%"
+ id="text8540"
+ y="567.36218"
+ x="270"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="567.36218"
+ x="270"
+ id="tspan8542"
+ sodipodi:role="line">b = -y</tspan></text>
+ <path
+ id="path8566"
+ d="m 185,477.36218 0,0 0,50"
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <path
+ id="path8568"
+ d="m 205,527.36218 0,-50"
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10249"
+ y="505.4151"
+ x="258.09396"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="505.4151"
+ x="258.09396"
+ id="tspan10251"
+ sodipodi:role="line" /></text>
+ <path
+ id="path10253"
+ d="m 415.42496,452.36269 0,0 -65.10695,0"
+ style="fill:none;stroke:#000000;stroke-width:1.1232022px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <path
+ id="path10255"
+ d="m 349.81478,437.36218 65.18522,0"
+ style="fill:none;stroke:#000000;stroke-width:1.12379217px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10257"
+ y="447.36249"
+ x="355.12106"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="447.36249"
+ x="355.12106"
+ id="tspan10259"
+ sodipodi:role="line">RevComp</tspan></text>
+ <text
+ sodipodi:linespacing="125%"
+ id="text10261"
+ y="507.36218"
+ x="187"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="507.36218"
+ x="187"
+ id="tspan10263"
+ sodipodi:role="line">AB</tspan></text>
+ <path
+ id="path10265"
+ d="m 420.61018,562.36269 0,0 -65.10695,0"
+ style="fill:none;stroke:#000000;stroke-width:1.1232022px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <path
+ id="path10267"
+ d="m 355,547.36218 65.18522,0"
+ style="fill:none;stroke:#000000;stroke-width:1.12379217px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10269"
+ y="557.36249"
+ x="360.30627"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="557.36249"
+ x="360.30627"
+ id="tspan10271"
+ sodipodi:role="line">RevComp</tspan></text>
+ <path
+ sodipodi:nodetypes="cc"
+ id="path10281"
+ d="m 689.69403,438.20601 -150,0"
+ style="fill:none;stroke:#f31746;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send8)" />
+ <path
+ id="path10283"
+ d="m 439.69403,458.20601 c 140,0 150,0 150,0"
+ style="fill:none;stroke:#2184db;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:none;marker-end:url(#Arrow2SendJ)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10293"
+ y="458.20602"
+ x="628.00458"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="458.20602"
+ x="628.00458"
+ id="tspan10295"
+ sodipodi:role="line">b = -x</tspan></text>
+ <text
+ sodipodi:linespacing="125%"
+ id="text10297"
+ y="438.20602"
+ x="472.67059"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="438.20602"
+ x="472.67059"
+ id="tspan10299"
+ sodipodi:role="line">a = -y</tspan></text>
+ <path
+ sodipodi:nodetypes="cc"
+ id="path10301"
+ d="m 689.69403,568.20601 -150,0"
+ style="fill:none;stroke:#f31746;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2SendW)" />
+ <path
+ id="path10303"
+ d="m 439.69403,548.20601 c 140,0 150,0 150,0"
+ style="fill:none;stroke:#2184db;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:none;marker-end:url(#Arrow2Send6)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10313"
+ y="558.20599"
+ x="605.99677"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="558.20599"
+ x="605.99677"
+ id="tspan10315"
+ sodipodi:role="line">b = x</tspan></text>
+ <text
+ sodipodi:linespacing="125%"
+ id="text10317"
+ y="568.20599"
+ x="468.66278"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="568.20599"
+ x="468.66278"
+ id="tspan10319"
+ sodipodi:role="line">a = y</tspan></text>
+ <path
+ id="path10321"
+ d="m 544.69403,478.20601 0,0 0,50"
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <path
+ id="path10323"
+ d="m 564.69403,528.20601 0,-50"
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+ <text
+ sodipodi:linespacing="125%"
+ id="text10325"
+ y="508.20602"
+ x="546.69403"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ xml:space="preserve"><tspan
+ y="508.20602"
+ x="546.69403"
+ id="tspan10327"
+ sodipodi:role="line">AB</tspan></text>
+ </g>
+</svg>
diff --git a/documentation/source/overlaps.svg b/documentation/source/overlaps.svg
new file mode 100644
index 0000000..94806d9
--- /dev/null
+++ b/documentation/source/overlaps.svg
@@ -0,0 +1,384 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="560"
+ height="115"
+ id="svg2"
+ sodipodi:version="0.32"
+ inkscape:version="0.47 r22583"
+ sodipodi:docname="3-OverlapsInTheStore.svg"
+ inkscape:output_extension="org.inkscape.output.svg.inkscape"
+ version="1.1"
+ inkscape:export-filename="/work/wgs/wiki/ovl/3-OverlapsInTheStore.png"
+ inkscape:export-xdpi="90"
+ inkscape:export-ydpi="90">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="StopM"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="StopM"
+ style="overflow:visible">
+ <path
+ id="path3438"
+ d="M 0,5.65 0,-5.65"
+ style="fill:none;stroke:#000000;stroke-width:1pt"
+ transform="scale(0.4,0.4)" />
+ </marker>
+ <marker
+ style="overflow:visible"
+ id="DistanceEnd"
+ refX="0"
+ refY="0"
+ orient="auto"
+ inkscape:stockid="DistanceEnd">
+ <g
+ id="g3321">
+ <path
+ style="fill:none;stroke:#ffffff;stroke-width:1.14999998;stroke-linecap:square"
+ d="M 0,0 -2,0"
+ id="path2316" />
+ <path
+ style="fill:#000000;fill-rule:evenodd;stroke:none"
+ d="M 0,0 -13,4 -9,0 -13,-4 0,0 z"
+ id="path2312" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:square"
+ d="M 0,-4 0,40"
+ id="path2314" />
+ </g>
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Send"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Send"
+ style="overflow:visible">
+ <path
+ id="path4907"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-0.3,0,0,-0.3,0.69,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Mstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Mstart"
+ style="overflow:visible">
+ <path
+ id="path3190"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="scale(0.6,0.6)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lstart"
+ style="overflow:visible">
+ <path
+ id="path3184"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(1.1,0,0,1.1,1.1,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend"
+ style="overflow:visible">
+ <path
+ id="path3187"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Lend"
+ style="overflow:visible">
+ <path
+ id="path3169"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+ transform="matrix(-0.8,0,0,-0.8,-10,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Mend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Mend"
+ style="overflow:visible">
+ <path
+ id="path3193"
+ style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="scale(-0.6,-0.6)" />
+ </marker>
+ <inkscape:perspective
+ sodipodi:type="inkscape:persp3d"
+ inkscape:vp_x="0 : 526.18109 : 1"
+ inkscape:vp_y="0 : 1000 : 0"
+ inkscape:vp_z="744.09448 : 526.18109 : 1"
+ inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
+ id="perspective10" />
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ gridtolerance="10000"
+ guidetolerance="10"
+ objecttolerance="10"
+ inkscape:pageopacity="1"
+ inkscape:pageshadow="2"
+ inkscape:zoom="2.0000001"
+ inkscape:cx="290.88378"
+ inkscape:cy="39.486124"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ showguides="false"
+ inkscape:window-width="1638"
+ inkscape:window-height="963"
+ inkscape:window-x="1712"
+ inkscape:window-y="1"
+ inkscape:snap-global="true"
+ inkscape:window-maximized="0"
+ inkscape:snap-grids="false"
+ inkscape:snap-to-guides="false">
+ <inkscape:grid
+ type="xygrid"
+ id="grid2383"
+ empspacing="5"
+ visible="true"
+ enabled="true"
+ snapvisiblegridlinesonly="true" />
+ </sodipodi:namedview>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-78.092563,-424.85827)">
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 88.092563,439.85827 149.999997,0"
+ id="path2391"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.79664898;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 338.09256,459.85827 c -144.83852,0 -160,0 -160,0"
+ id="path6035"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="92.59256"
+ y="463.85828"
+ id="text6588"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="92.59256"
+ y="463.85828"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan7241">a-hang >= 0</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="251"
+ y="443.86218"
+ id="text6592"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="251"
+ y="443.86218"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan7247">b-hang >= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.20985293;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 178.09256,449.85827 -89.999997,0"
+ id="path6612"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.24113286;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 238.09256,449.85827 100,0"
+ id="path6618"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.72201467;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 88.092563,499.85827 249.999997,0"
+ id="path9958"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.62519932;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 238.09256,519.85827 c -51.12841,0 -60,0 -60,0"
+ id="path9960"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="93.63604"
+ y="523.87689"
+ id="text9962"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="93.63604"
+ y="523.87689"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan9964">a-hang >= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.2385987;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 178.09256,509.85827 -89.999997,0"
+ id="path9986"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="248.98959"
+ y="523.1698"
+ id="text10016"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="248.98959"
+ y="523.1698"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan10018">b-hang &lt;= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.24136651;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 238.09256,509.85827 100,0"
+ id="path10038"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.79601526;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 468.09256,439.85827 160,0"
+ id="path5679"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.6613636;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 528.09256,459.85827 c -110,0 -150,0 -150,0"
+ id="path5681"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="385.75735"
+ y="443.82666"
+ id="text5683"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="385.75735"
+ y="443.82666"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan5685">a-hang &lt;= 0</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="539.29291"
+ y="464.18021"
+ id="text5687"
+ sodipodi:linespacing="125%"
+ inkscape:export-xdpi="90"
+ inkscape:export-ydpi="90"><tspan
+ sodipodi:role="line"
+ x="539.29291"
+ y="464.18021"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan5689">b-hang &lt;= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.17885327;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 468.09256,449.85827 -90,0"
+ id="path5707"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.24106419;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 528.09256,449.85827 100,0"
+ id="path5709"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.77026725;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 628.09256,529.85827 -250,0"
+ id="path5711"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:3.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Send)"
+ d="m 468.09256,509.85827 c 56,0 60,0 60,0"
+ id="path5713" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="386.81802"
+ y="513.77637"
+ id="text5715"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="386.81802"
+ y="513.77637"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan5717">a-hang &lt;= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.2133956;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 468.09256,519.85827 -90,0"
+ id="path5731" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+ x="540"
+ y="513.42285"
+ id="text5733"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="540"
+ y="513.42285"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Arial;-inkscape-font-specification:Arial"
+ id="tspan5735">b-hang >= 0</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1.2736851;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#StopM);marker-mid:none;marker-end:url(#StopM)"
+ d="m 528.09256,519.85827 100,0"
+ id="path5741" />
+ <text
+ xml:space="preserve"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Arial;-inkscape-font-specification:Arial"
+ x="89.802559"
+ y="473.94885"
+ id="text5788"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan5790"
+ x="89.802559"
+ y="473.94885" /></text>
+ </g>
+</svg>
diff --git a/documentation/source/parameter-reference.rst b/documentation/source/parameter-reference.rst
index e9de816..27e6f26 100644
--- a/documentation/source/parameter-reference.rst
+++ b/documentation/source/parameter-reference.rst
@@ -30,6 +30,14 @@ canuIterationMax <integer=2>
Sometimes, jobs fail due to other jobs exhausting resources (memory), or by the node itself failing. In this case, canu will launch the jobs
again. This parameter controls how many times it tries.
+onSuccess <string=unset>
+onFailure <string=unset>
+ On success or failure, execute the command supplied. The command will execute in the <assembly-directory> (the -d option to canu) and will be supplied with the name of the assembly (the -p option to canu) as its first and only parameter.
+
+ The 'onSuccess' command will run when canu finishes an assembly.
+
+ The 'onFailure' command will run when canu terminates abnormally. There are two exceptions: if a 'spec' file cannot be read, and if canu tries to access an invalid parameter. The former will be reported as a command line error, and canu will never start. The latter should never occur (and, in fact, has never occurred) except when developers are developing the software.
+
Process Control
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -101,6 +109,15 @@ shell <string="/bin/sh">
java <string="java">
A path to a Java application launcher of at least version 1.8.
+gnuplot <string="gnuplot">
+ A path to the gnuplot graphing utility.
+
+gnuplotImageFormat <string="png">
+ The type of image to generate in gnuplot. By default, canu will use png, svg or gif, in that order.
+
+gnuplotTested <boolean=false>
+ If set, skip the tests to determine if gnuplot will run, and to decide the image type to generate. This is used when gnuplot fails to run, or isn't even installed, and allows canu to continue execution without generating graphs.
+
Cleanup Options
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/documentation/source/pipeline.rst b/documentation/source/pipeline.rst
index 8a8645a..ef4be29 100644
--- a/documentation/source/pipeline.rst
+++ b/documentation/source/pipeline.rst
@@ -4,31 +4,9 @@
Canu Pipeline
=============
-The canu pipeline is big and complicated and we haven't written it up yet. Sorry.
+The pipeline is described in Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM. `Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation <http://biorxiv.org/content/early/2016/08/24/071282>`_. bioRxiv. (2016).
+Figure 1 of the paper shows the primary pipeline (below, top) and the supplement contains the sub-pipeline for building read and overlap databases (below, bottom).
+.. image:: canu-pipeline.*
-The basic flow is:
-
-- Gatekeeper
-- Meryl
-- Overlapper
-
- + Read Correction
- + Read Trimming
-
- * Trimming
- * Splitting
-
- + Unitig Construction
-
- * Overlap Error Adjustment
- * Overlap Filtering (NOT IMPLEMENTED)
- * Unitig
- * Consensus
- * Labeling
- * Output
-
-Details:
-
-Meryl - counts kmers, generates a histogram txt file, and a histogram plot png
-
+.. image:: canu-overlaps.*
diff --git a/documentation/source/quick-start.rst b/documentation/source/quick-start.rst
index c406970..2149260 100644
--- a/documentation/source/quick-start.rst
+++ b/documentation/source/quick-start.rst
@@ -4,7 +4,7 @@
Canu Quick Start
================
-Canu specializes in assembling PacBio or Oxford Nanopre sequences. Canu will correct the reads, then trim suspicious regions (such as remaining SMRTbell adapter), then
+Canu specializes in assembling PacBio or Oxford Nanopore sequences. Canu will correct the reads, then trim suspicious regions (such as remaining SMRTbell adapter), then
assemble the corrected and cleaned reads into unitigs.
Brief Introduction
diff --git a/documentation/source/repeat-spanned.svg b/documentation/source/repeat-spanned.svg
new file mode 100644
index 0000000..bd56a63
--- /dev/null
+++ b/documentation/source/repeat-spanned.svg
@@ -0,0 +1,214 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="5.933465in"
+ height="3.5792274in"
+ viewBox="0 0 534.01187 322.13048"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.91 r13725"
+ sodipodi:docname="unspanned-repeat-2.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="Arrow1Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Lend"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4399"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.8,0,0,-0.8,-10,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Lstart"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4396"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(0.8,0,0,0.8,10,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="2.8"
+ inkscape:cx="280.24485"
+ inkscape:cy="141.1326"
+ inkscape:document-units="mm"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ inkscape:window-width="1670"
+ inkscape:window-height="1024"
+ inkscape:window-x="0"
+ inkscape:window-y="0"
+ inkscape:window-maximized="0"
+ inkscape:snap-bbox="true"
+ inkscape:bbox-nodes="true"
+ inkscape:snap-bbox-edge-midpoints="false"
+ inkscape:object-nodes="true"
+ units="in"
+ fit-margin-top="1"
+ fit-margin-left="1"
+ fit-margin-right="1"
+ fit-margin-bottom="1">
+ <inkscape:grid
+ type="xygrid"
+ id="grid3336"
+ units="mm"
+ spacingx="3.5433071"
+ spacingy="3.5433071"
+ originx="66.809084"
+ originy="-402.51769" />
+ </sodipodi:namedview>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(66.80908,-327.71409)">
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#9a36cf;fill-opacity:0.27167634;fill-rule:nonzero;stroke:none;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:0.88582695, 0.88582695;stroke-dashoffset:0;stroke-opacity:1;color-rendering:aut [...]
+ id="rect4172"
+ width="95.669281"
+ height="127.55906"
+ x="148.81889"
+ y="432.28354" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 361.41732,549.71224 -124.01575,0"
+ id="path4699"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 262.20472,555.79964 77.95275,0"
+ id="path4701"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#0d00f5;stroke-width:1.77165353;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 124.01574,520.86618 177.16536,0"
+ id="path4691"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 198.42519,541.62641 3.54331,0 113.38582,0"
+ id="path4697"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 177.16536,534.5398 67.32283,0"
+ id="path4695"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 148.81889,527.45318 74.40945,0.49965"
+ id="path4693"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 102.7559,513.77954 113.38582,6e-5"
+ id="path4689"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 81.496062,506.69292 145.275588,0"
+ id="path4687"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 63.779527,499.1067 116.929123,0.49967"
+ id="path4685"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 31.889758,492.02011 113.385832,0"
+ id="path4683"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-start:url(#Arrow1Lstart);marker-end:url [...]
+ d="m 24.803149,485.4331 350.787401,-2e-5"
+ id="path4681"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 216.14173,435.82679 -67.32283,0 -28.34646,-14.17318"
+ id="path4378"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 187.79527,442.9134 -38.97637,0 -21.25985,-10.62987"
+ id="path4380"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 230.31496,450.00001 -81.49606,0 -7.08662,-3.54325"
+ id="path4382"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 155.90551,457.08663 53.1496,5e-5"
+ id="path4384"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 177.16535,471.25986 67.32284,0 14.17321,-7.08656"
+ id="path4388"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 191.33858,478.34647 53.1496,6e-5 28.34645,-14.17323"
+ id="path4390"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape-rende [...]
+ d="m 170.07873,464.1733 53.14961,0 m 0,0 14.17322,-7.08662"
+ id="path4170"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ </g>
+</svg>
diff --git a/documentation/source/repeat-unspanned.svg b/documentation/source/repeat-unspanned.svg
new file mode 100644
index 0000000..efd6d46
--- /dev/null
+++ b/documentation/source/repeat-unspanned.svg
@@ -0,0 +1,232 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="5.933465in"
+ height="3.5792274in"
+ viewBox="0 0 534.01187 322.13048"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.91 r13725"
+ sodipodi:docname="unspanned-repeat-1.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="Arrow1Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Lend"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4399"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(-0.8,0,0,-0.8,-10,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow1Lstart"
+ style="overflow:visible"
+ inkscape:isstock="true">
+ <path
+ id="path4396"
+ d="M 0,0 5,-5 -12.5,0 5,5 0,0 Z"
+ style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1"
+ transform="matrix(0.8,0,0,0.8,10,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="2.8"
+ inkscape:cx="212.21137"
+ inkscape:cy="166.877"
+ inkscape:document-units="mm"
+ inkscape:current-layer="layer1"
+ showgrid="true"
+ inkscape:window-width="1670"
+ inkscape:window-height="1024"
+ inkscape:window-x="0"
+ inkscape:window-y="0"
+ inkscape:window-maximized="0"
+ inkscape:snap-bbox="true"
+ inkscape:bbox-nodes="true"
+ inkscape:snap-bbox-edge-midpoints="false"
+ inkscape:object-nodes="true"
+ units="in"
+ fit-margin-top="1"
+ fit-margin-left="1"
+ fit-margin-right="1"
+ fit-margin-bottom="1">
+ <inkscape:grid
+ type="xygrid"
+ id="grid3336"
+ units="mm"
+ spacingx="3.5433071"
+ spacingy="3.5433071"
+ originx="66.809084"
+ originy="-402.51769" />
+ </sodipodi:namedview>
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(66.80908,-327.71409)">
+ <rect
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:#9a36cf;fill-opacity:0.27167634;fill-rule:nonzero;stroke:none;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:0.88582695, 0.88582695;stroke-dashoffset:0;stroke-opacity:1;color-rendering:aut [...]
+ id="rect4172"
+ width="141.73227"
+ height="148.81891"
+ x="102.7559"
+ y="411.02368" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 361.41731,549.21267 -124.01575,0"
+ id="path4699"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 262.20471,556.29929 77.95275,0"
+ id="path4701"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#0f01ff;stroke-width:1.77165365;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 124.01574,520.4233 177.16536,0"
+ id="path4691"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 198.42519,542.12606 3.54331,0 113.38582,0"
+ id="path4697"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 177.16535,535.03944 67.32283,0"
+ id="path4695"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 148.82186,527.95282 74.40945,0.49965"
+ id="path4693"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#0f01ff;stroke-width:1.77165365;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 81.496062,506.69292 145.275588,0"
+ id="path4687"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.95056921;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 56.692911,499.60637 134.645659,0.49967"
+ id="path4685"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 46.06299,492.96267 113.38583,0"
+ id="path4683"
+ inkscape:connector-curvature="0" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-start:url(#Arrow1Lstart);marker-end:url [...]
+ d="m 24.803149,485.4331 350.787401,-2e-5"
+ id="path4681"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 170.07873,414.1241 -67.32283,0 -28.346464,-14.17318"
+ id="path4378"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 141.73227,421.21071 -38.97637,0 -42.519684,-21.25985"
+ id="path4380"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 184.25196,428.29732 -81.49606,0 -35.43307,-17.71654"
+ id="path4382"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582689;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 145.27558,456.64377 63.77953,0"
+ id="path4384"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#f60059;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape-rende [...]
+ d="m 113.38582,442.91345 113.38582,0 m 0,0 24.80315,-14.17323"
+ id="path4170-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 177.16535,471.25986 67.32284,0 63.77951,-31.88971"
+ id="path4388"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.885827;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape- [...]
+ d="m 191.33858,478.34647 53.1496,6e-5 28.34645,-14.17323"
+ id="path4390"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="ccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape-rende [...]
+ d="m 162.99212,464.1733 60.23622,0 m 0,0 56.69291,-31.88977"
+ id="path4170"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.09690034;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 109.84251,435.38393 113.38583,0"
+ id="path4384-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.16624212;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 120.47244,450.00007 99.21259,0"
+ id="path4384-0"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="color:#000000;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.88582695;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shap [...]
+ d="m 127.55904,513.27995 74.40945,0.49965"
+ id="path4693-1"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ </g>
+</svg>
diff --git a/documentation/source/tutorial.rst b/documentation/source/tutorial.rst
index 89f9cba..be7660e 100644
--- a/documentation/source/tutorial.rst
+++ b/documentation/source/tutorial.rst
@@ -28,7 +28,7 @@ The **canu** command is the 'executive' program that runs all modules of the ass
each of the three top-level tasks (correction, trimming, unitig construction), each of which
consists of many steps. Canu ensures that input files for each step exist, that each step
successfully finished, and that the output for each step exists. It does minor bits of processing,
-such as reformatting files, but generally generally just executes other programs.
+such as reformatting files, but generally just executes other programs.
::
@@ -176,16 +176,19 @@ Execution Configuration
~~~~~~~~~~~~~~~~~~~~~~~~
There are two modes that canu runs in: locally, using just one machine, or grid-enabled, using
-multiple hosts managed by a grid engine. At present, only Sun Grid Engine / Open Grid Engine /
-Univa Grid Engine / SGE / whatever it's called now, PBS, and Slurm support is available. LSF
-support is also included but has limited testing. Section :ref:`grid-engine-config` has a few hints on how to set up a
-new grid engine.
+multiple hosts managed by a grid engine. LSF, PBS/Torque, PBSPro, Sun Grid Engine (and
+derivations), and Slurm are supported, though LSF has had limited testing. Section
+:ref:`grid-engine-config` has a few hints on how to set up a new grid engine.
-To enable execution of the parallel steps on the grid, set ``useGrid=1``. The canu pipeline will immediately submit itself to
-the grid, and run entirely under grid control. This is the default if a grid-engine is detected on your system. If you prefer to run locally, set ``useGrid=0``.
+By default, if a grid is detected the canu pipeline will immediately submit itself to the grid and
+run entirely under grid control. If no grid is detected, or if option ``useGrid=false`` is set,
+canu will run on the local machine.
-In both cases, local or grid, Canu will auto-detect available resources and scale the jobs to run, based on the resources and genome size you're assembling. Thus, most users should be able to run the command without modifying the defaults. Some advanced options are outlined below. Each stage has the same five configuration options, and tags are used to specialize the option to a
-specific stage. The options are:
+In both cases, Canu will auto-detect available resources and configure job sizes based on the
+resources and genome size you're assembling. Thus, most users should be able to run the command
+without modifying the defaults. Some advanced options are outlined below. Each stage has the same
+five configuration options, and tags are used to specialize the option to a specific stage. The
+options are:
useGrid<tag>=boolean
Run this stage on the grid, usually in parallel.
@@ -234,8 +237,13 @@ Fraction Error Percent Error
. .
============== =============
-Eventually, we want to have Canu take a single error rate, the error rate of a single input read,
-and derive all other rates from there. This is the parameter ``errorRate``. Currently, the defaults are 0.025 for PacBio sequences and 0.05 for Oxford Nanpore sequences. Typically, you should not need to modify this setting. However, the error rate does affect runtime and lowering it can significantly speed up your assembly. Thus, for low coverage datasets (<=30X) we recommend increasing the error rate slightly (by 1%, so errorRate=0.035 or PacBio) and for high-coverage (>=60X) datasets [...]
+Eventually, we want to have Canu take a single error rate, the error rate of a single corrected read,
+and derive all other rates from there. This is the parameter ``errorRate``. Currently, the defaults
+are 0.025 for PacBio sequences and 0.05 for Oxford Nanopore sequences. Typically, you should not need
+to modify this setting. However, the error rate does affect runtime and lowering it can
+significantly speed up your assembly. Thus, for low coverage datasets (<=30X) we recommend
+increasing the error rate slightly (by 1%, so errorRate=0.035 for PacBio) and for high-coverage
+(>=60X) datasets, we recommend decreasing it (by 1%, so errorRate=0.015 for PacBio).
The following error rates are defined:
@@ -333,8 +341,8 @@ smaller of the forward and reverse-complemented kmer sequence. Kmer ACTT, with
AAGT, has a canonical kmer AAGT. Kmer CTTA, reverse-complement TAAG, has canonical kmer CTTA.
A 'distinct' kmer is the kmer sequence with no count associated with it. A 'total' kmer (for lack
-of a better term) is the kmer with its count. The sequence CGTTTTTTTCGTCG has (forward) 12 'total' 4-mers
-and 7 'distinct' kmers.
+of a better term) is the kmer with its count. The sequence TCGTTTTTTTCGTCG has 12 'total' 4-mers
+and 8 'distinct' kmers.
::
@@ -346,21 +354,21 @@ and 7 'distinct' kmers.
TTTT 4 copy of distinct-4
TTTT 4 copy of distinct-4
TTTT 4 copy of distinct-4
- TTTC 4 copy of distinct-4
- TTCG 1 distinct-5
+ TTTC 1 distinct-5
+ TTCG 1 distinct-6
TCGT 2 copy of distinct-1
- CGTC 1 distinct-6
- GTCG 1 distinct-7
+ CGTC 1 distinct-7
+ GTCG 1 distinct-8
<tag>MerThreshold
any kmer with count higher than N is not used
<tag>MerDistinct
pick a threshold so as to seed overlaps using this fraction of all distinct kmers in the input. In the example above,
- fraction 0.8572 of the k-mers (6/7) will be at or below threshold 2.
+ fraction 0.875 of the k-mers (7/8) will be at or below threshold 2.
<tag>MerTotal
pick a threshold so as to seed overlaps using this fraction of all kmers in the input. In the example above,
- fraction 0.6364 of the k-mers (7/11) will be at or below threshold 2.
+ fraction 0.667 of the k-mers (8/12) will be at or below threshold 2.
<tag>FrequentMers
don't compute frequent kmers, use those listed in this fasta file
diff --git a/src/AS_RUN/fragmentDepth.C b/src/AS_RUN/fragmentDepth.C
deleted file mode 100644
index 3e0375e..0000000
--- a/src/AS_RUN/fragmentDepth.C
+++ /dev/null
@@ -1,466 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2007-MAR-28 to 2013-AUG-01
- * are Copyright 2007-2008,2010,2012-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Sergey Koren from 2007-SEP-04 to 2009-AUG-14
- * are Copyright 2007-2009 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz on 2014-NOV-21
- * are Copyright 2014 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_global.H"
-#include "AS_UTL_fasta.H"
-
-// A quick hack to compute a histogram of coverage depth using
-// the runCA-OBT posmap files.
-
-#define HISTMAX (8192)
-#define DEPTHSIZE (128 * 1024 * 1024)
-#define FRAGMAX (1024 * 1024)
-
-#define MODE_HISTOGRAM 0
-#define MODE_SCAFFOLD 1
-#define MODE_DEPTH 2
-
-typedef struct {
- uint32 lo;
- uint32 hi;
- uint32 de;
-} intDep;
-
-
-static
-int
-intDep_sort(const void *a, const void *b) {
- intDep *A = (intDep *)a;
- intDep *B = (intDep *)b;
-
- if (A->lo < B->lo) return(-1);
- if (A->lo > B->lo) return(1);
- return(0);
-}
-
-
-void
-computeStuff(uint32 *V, uint32 N,
- uint32 B,
- uint32 E,
- uint32 *mode,
- double *mean,
- uint32 *median) {
-
- uint32 histogramMax = 128 * 1024;
- uint32 *histogram = (uint32 *)safe_calloc(histogramMax, sizeof(uint32));
- uint32 histogramBig = 0;
- uint32 meanCount = 0;
- uint32 i;
-
- if (E > N)
- E = N;
-
- *mean = 0;
- for (i=B; i<E; i++) {
- if (V[i] > 0) {
- *mean += V[i];
- meanCount++;
- }
-
- if (V[i] < histogramMax)
- histogram[V[i]]++;
- else
- histogramBig++;
- }
-
- if (histogramBig) {
- fprintf(stderr, "histogramBig: "F_U32"\n", histogramBig);
- exit(1);
- }
-
- // Find the mode -- except for 0.
- //
- *mode = 1;
- for (i=1; i<histogramMax; i++) {
- if (histogram[*mode] < histogram[i])
- *mode = i;
- }
-
- // Find the mean
- //
- if (meanCount == 0) {
- *mean = 0;
- }
- else {
- *mean = *mean / meanCount;
- }
-
-
- // Find the median
- //
- meanCount /= 2;
- *median = 1;
-
- for (i=1; i<histogramMax; i++)
- if (meanCount >= histogram[i]) {
- meanCount -= histogram[i];
- } else {
- *median = i;
- break;
- }
-
- safe_free(histogram);
-}
-
-void outputResult(AS_IID lastiid,
- intDep *id,
- uint32 idlen,
- int mode,
- uint32 *histogram,
- uint32 *histmax,
- int stepSize) {
- uint32 i = 0;
- uint32 lastpos = 0;
-
- switch (mode) {
- case MODE_HISTOGRAM:
- // Update the histogram
- //
- for (i=0; i<idlen; i++) {
- if (id[i].de < HISTMAX) {
- // if there is a gap between the previous interval and the current one, add to our 0 coverage count
- if ((id[i].lo - lastpos) > 0) {
- histogram[0] += id[i].lo - lastpos;
- }
-
- histogram[id[i].de] += id[i].hi - id[i].lo;
- if ((*histmax) < id[i].de)
- (*histmax) = id[i].de;
- }
- lastpos = id[i].hi;
- }
- break;
-
-
- case MODE_SCAFFOLD:
- // Report mode, mean and median for this scaffold
- //
- {
- uint32 N = id[idlen-1].hi;
- uint32 *V = (uint32 *)safe_calloc(N, sizeof(uint32));
- uint32 mode = 0;
- double mean = 0.0;
- uint32 median = 0;
- uint32 currStep = 0;
-
- for (i=0; i<idlen; i++) {
- uint32 j;
- for (j=id[i].lo; j<id[i].hi; j++) {
- V[j] = id[i].de;
- }
- }
-
- if (stepSize == 0) {
- currStep = N;
- }
- else {
- currStep = stepSize;
- }
-
- for (i = 0; i < N; i+=currStep) {
- uint32 E = i+currStep;
- if (E > N) { E = N; }
-
- computeStuff(V, N, i, E, &mode, &mean, &median);
- fprintf(stdout, "%s\t"F_U32"\t"F_U32"\t"F_U32"\t%f\t"F_U32"\n", lastiid, i, E, mode, mean, median);
- }
- safe_free(V);
- }
- break;
-
- case MODE_DEPTH:
- {
- char *seq = (char *)safe_malloc((id[idlen-1].hi + 1) * sizeof(char));
- uint32 j;
-
- memset(seq, '0', id[idlen-1].hi);
-
- for (i=0; i<idlen; i++) {
- for (j=id[i].lo; j<id[i].hi; j++) {
- if (id[i].de < 10)
- seq[j] = '0' + id[i].de;
- else if (id[i].de < 68)
- seq[j] = 'A' + id[i].de - 10;
- else
- seq[j] = '~';
- }
- }
-
- seq[id[idlen-1].hi] = 0;
-
- AS_UTL_writeFastA(stdout, seq, id[idlen-1].hi, 0, ">%d\n", lastiid);
- }
- break;
- }
-}
-
-void processScaffold(AS_IID lastiid,
- intDep *in,
- uint32 inlen,
- int mode,
- uint32 *histogram,
- uint32 *histmax,
- int stepSize) {
- uint32 i = 0;
- uint32 idlen = 0;
- intDep *id = NULL;
-
- // Convert the list of overlapping intervals into a list
- // of non-intersecting intervals annotated with depth
- uint32 islen = inlen * 2;
- intDep *is = (intDep *)safe_malloc(sizeof(intDep) * islen);
-
- for (i=0; i<inlen; i++) {
- is[2*i ].lo = in[i].lo;
- is[2*i ].hi = 0;
- is[2*i ].de = 1;
- is[2*i+1].lo = in[i].hi;
- is[2*i+1].hi = 0;
- is[2*i+1].de = 0;
- }
-
- qsort(is, islen, sizeof(intDep), intDep_sort);
-
- // Scan the list, counting how many times we change depth.
- //
- idlen = 1;
- for (i=1; i<islen; i++) {
- if (is[i-1].lo != is[i].lo)
- idlen++;
- }
-
- // Allocate the real depth of coverage intervals
- //
- id = (intDep *)safe_malloc(sizeof(intDep) * idlen);
- idlen = 0;
-
- // Build new intervals
- //
- // Initialize the first interval
- //
- id[idlen].lo = is[0].lo;
- id[idlen].hi = is[0].lo;
- id[idlen].de = 1;
-
- for (i=1; i<islen; i++) {
-
- if (id[idlen].de == 0) {
- // Update the start position if the current interval is at zero
- // depth.
- //
- id[idlen].lo = is[i].lo;
- } else {
- // If we are at a position different from the start, we need to
- // close out the current interval and make a new one.
- //
- if (is[i-1].lo != is[i].lo) {
- id[idlen].hi = is[i].lo;
-
- idlen++;
-
- id[idlen].lo = is[i].lo;
- id[idlen].hi = is[i].lo;
- id[idlen].de = id[idlen-1].de;
- }
- }
-
- // Finally, update the depth of the current interval
- //
- if (is[i].de)
- id[idlen].de++;
- else
- id[idlen].de--;
- }
-
- // The way the loop is constructed above, the number of id
- // intervals is idlen+1. The last interval is always zero
- // (thats id[idlen]) and so our later loops are supposed to
- // be i<idlen.
- assert(id[idlen].lo == id[idlen].hi);
-
- safe_free(is);
- outputResult(lastiid, id, idlen, mode, histogram, histmax, stepSize);
- safe_free(id);
-}
-
-
-int
-main(int argc, char **argv) {
- uint32 i = 0;
-
- AS_IID lastiid = NO_IID;
- int lastend = 0;
-
- uint32 histogram[HISTMAX] = { 0 };
- uint32 histmax = 0;
-
- int minSize = 0;
- int maxSize = DEPTHSIZE;
-
- int mode = MODE_HISTOGRAM;
- int stepSize = 0;
-
- argc = AS_configure(argc, argv);
-
- int arg=1;
- int err=0;
- while (arg < argc) {
- if (strcmp(argv[arg], "-min") == 0) {
- minSize = atoi(argv[++arg]);
- } else if (strcmp(argv[arg], "-max") == 0) {
- maxSize = atoi(argv[++arg]);
- } else if (strcmp(argv[arg], "-stepSize") == 0) {
- stepSize = atoi(argv[++arg]);
- } else if (strcmp(argv[arg], "-histogram") == 0) {
- mode = MODE_HISTOGRAM;
- } else if (strcmp(argv[arg], "-scaffold") == 0) {
- mode = MODE_SCAFFOLD;
- } else if (strcmp(argv[arg], "-depth") == 0) {
- mode = MODE_DEPTH;
- } else {
- fprintf(stderr, "unknown option '%s'\n", argv[arg]);
- err++;
- }
- arg++;
- }
- if (err || isatty(fileno(stdin))) {
- fprintf(stderr, "usage: %s MODE [-min N] [-max N] [-stepSize N] < x.posmap.frgscf\n", argv[0]);
- fprintf(stderr, "\n");
- fprintf(stderr, " -min N use scaffolds at least N bases long.\n");
- fprintf(stderr, " -max N use scaffolds at most N bases long.\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "MODES: -histogram, -scaffold or -depth\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "The default mode is to compute a histogram of the number of bases at some\n");
- fprintf(stderr, "depth of coverage.\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "The -scaffold mode reports the mode, mean, median depth per scaffold. The\n");
- fprintf(stderr, "-stepSize option will compute those stats, in blocks of N bases (e.g., for bases\n");
- fprintf(stderr, "0 through N, then N through 2N, then 2N through 3N, etc.)\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "The -depth mode writes a multi-fasta file with the actual depth at each base\n");
- fprintf(stderr, "encoded. The encoding is somewhat complicated to avoid using the '>' letter.\n");
- fprintf(stderr, "Depth 0 through 9 is encoded as '0' through '9'. Depth 10 through 68 is\n");
- fprintf(stderr, "encoded as A-Z[\\]^_`a-z{|}, and depth more than 68 is encoded as ~. Decode as:\n");
- fprintf(stderr, " depth = letter - '0';\n");
- fprintf(stderr, " if (depth > 9)\n");
- fprintf(stderr, " depth -= 7;\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "!!WARNING -- The input frgscf MUST be sorted by scaffold ID -- WARNING!!\n");
- fprintf(stderr, "\n");
- exit(1);
- }
-
-
-
-
-
-
- uint32 inlen = 0;
- uint32 inmax = 4194304; // 4 million fragments per scaffold should be enough (but we'll realloc later if not)
- intDep *in = (intDep *)safe_malloc(sizeof(intDep) * inmax);
-
- char line[1024] = {0};
- char *cont = NULL;
-
- if (mode == MODE_SCAFFOLD)
- fprintf(stdout, "iid\tstart\tend\tmode\tmean\tmedian\n");
-
- while (fgets(line, 1024, stdin) != NULL) {
- AS_IID iidjunk = strtol(cont, &cont, 10); // read id
- AS_IID iid = strtol(cont, &cont, 10); // scaffold id
- int32 beg = strtol(cont, &cont, 10);
- int32 end = strtol(cont, &cont, 10);
-
- if (lastiid == NO_IID)
- lastiid = iid;
-
- // Did we switch to a new scaffold? Process this set of intervals.
- //
- if ((iid != lastiid) && (inlen > 0)) {
-
- // This scaffold is the correct size
- //
- if ((minSize <= lastend) &&
- (lastend <= maxSize)) {
- processScaffold(lastiid, in, inlen, mode, histogram, &histmax, stepSize);
- }
-
- // Setup for the next scaffold
- //
- inlen = 0;
-
- lastiid = iid;
- lastend = 0;
- } // got a new scaffold
-
- // Save this fragment.
- //
- in[inlen].lo = beg;
- in[inlen].hi = end;
- inlen++;
-
- if (inlen >= inmax) {
- inmax *= 2;
- in = (intDep *)safe_realloc(in, inmax * sizeof(intDep));
- }
-
- if (lastend < end)
- lastend = end;
- }
-
- // process last scaffold
- if ((lastiid != NO_IID) && (inlen > 0)) {
-
- // This scaffold is the correct size
- //
- if ((minSize <= lastend) &&
- (lastend <= maxSize)) {
- processScaffold(lastiid, in, inlen, mode, histogram, &histmax, stepSize);
- }
- }
-
-
- if (mode == MODE_HISTOGRAM)
- for (i=0; i<=histmax; i++)
- fprintf(stdout, "%d\t%d\n", i, histogram[i]);
-
- safe_free(in);
-
- exit(0);
-}
diff --git a/src/AS_RUN/replaceIIDwithName-overlapDump.pl b/src/AS_RUN/replaceIIDwithName-overlapDump.pl
deleted file mode 100644
index 1a581ef..0000000
--- a/src/AS_RUN/replaceIIDwithName-overlapDump.pl
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz on 2013-SEP-22
- # are Copyright 2013 J. Craig Venter Institute, and
- # are subject to the GNU General Public License version 2
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-my %IIDtoNAME;
-
-my $fastqUIDmap = shift @ARGV;
-
-print STDERR "Loading UID map from '$fastqUIDmap'.\n";
-
-open (F, "< $fastqUIDmap") or die "Failed to open '$fastqUIDmap'\n";
-while (<F>) {
- my @v = split '\s+', $_;
-
- if (scalar(@v) == 3) {
- $IIDtoNAME{$v[1]} = $v[2];
-
- } elsif (scalar(@v) == 6) {
- $IIDtoNAME{$v[1]} = $v[2];
- $IIDtoNAME{$v[4]} = $v[5];
-
- } else {
- die "unknown format '$_'\n";
- }
-}
-close(F);
-
-
-while (<STDIN>) {
- $_ =~ s/^\s+//;
- $_ =~ s/\s+$//;
-
- my @v = split '\s+', $_;
-
- die "Didn't find IID '$v[0]' in overlap '$_'.\n" if (!exists($IIDtoNAME{$v[0]}));
- die "Didn't find IID '$v[0]' in overlap '$_'.\n" if (!exists($IIDtoNAME{$v[1]}));
-
- $v[0] = $IIDtoNAME{$v[0]};
- $v[1] = $IIDtoNAME{$v[1]};
-
- print join("\t", @v), "\n";
-}
diff --git a/src/AS_RUN/replaceUIDwithName-fastq.pl b/src/AS_RUN/replaceUIDwithName-fastq.pl
deleted file mode 100644
index 1f2843a..0000000
--- a/src/AS_RUN/replaceUIDwithName-fastq.pl
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # This file is derived from:
- #
- # src/AS_RUN/replaceUIDwithName.pl
- #
- # Modifications by:
- #
- # Brian P. Walenz from 2012-DEC-13 to 2013-AUG-23
- # are Copyright 2012-2013 J. Craig Venter Institute, and
- # are subject to the GNU General Public License version 2
- #
- # Brian P. Walenz on 2014-OCT-01
- # are Copyright 2014 Battelle National Biodefense Institute, and
- # are subject to the BSD 3-Clause License
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-# Usage
-# fastqUIDmap fastq fastq fastq
-
-my $fastqUIDmap = shift @ARGV;
-
-my $inFile;
-my $otFile;
-
-my %UIDtoNAME;
-
-my $namesLoaded = 0;
-my $readsRenamed = 0;
-
-sub loadMoreNames {
- my $lim = 100000;
-
- undef %UIDtoNAME;
-
- while (<N>) {
- chomp;
-
- my @v = split '\s+', $_;
-
- if (scalar(@v) == 3) {
- $UIDtoNAME{$v[0]} = $v[2];
- $namesLoaded++;
-
- } elsif (scalar(@v) == 6) {
- $UIDtoNAME{$v[0]} = $v[2];
- $UIDtoNAME{$v[3]} = $v[5];
- $namesLoaded++;
- $namesLoaded++;
-
- } else {
- die "unknown format '$_'\n";
- }
-
- if (--$lim == 0) {
- return;
- }
- }
-}
-
-
-
-while (scalar(@ARGV)) {
- $inFile = shift @ARGV;
- $otFile = $inFile;
-
- if ($inFile =~ m/(.*).fastq/) {
- $otFile = "$1.fastq.RENAMING";
- } else {
- die "Failed to generate output file name.\n";
- }
-
- open(F, "< $inFile") or die "Failed to open '$inFile' for reading\n";
- open(O, "> $otFile") or die "Failed to open '$otFile' for writing\n";
- open(N, "< $fastqUIDmap") or die "Failed to open '$fastqUIDmap'\n";
-
- $namesLoaded = 0;
- $readsRenamed = 0;
-
- print STDERR "Renaming '$inFile' to '$otFile'.\n";
-
- while (!eof(F)) {
- my $a = <F>; chomp $a;
- my $b = <F>;
- my $c = <F>;
- my $d = <F>;
-
- if ($a =~ m/\@(\w+),\w+\s*/) {
- # UID,IID
- $a = $1;
-
- } elsif ($a =~ m/\@(\w+)\s*/) {
- # UID
- $a = $1;
-
- } else {
- die "Nope '$a'\n";
- }
-
- while (!exists($UIDtoNAME{$a})) {
- loadMoreNames();
-
- #if ((!exists($UIDtoNAME{$a})) && ($readsRenamed > 0)) {
- # print STDERR "WARNING: Looping to load more names; out of sync?\n";
- #}
- }
-
- die "Didn't find UID '$a'\n" if (!exists($UIDtoNAME{$a}));
- $a = "\@$UIDtoNAME{$a}\n";
-
- print O "$a$b$c$d";
-
- $readsRenamed++;
-
- if (($readsRenamed % 10000) == 0) {
- print STDERR "Renamed $readsRenamed reads using $namesLoaded names.\r";
- }
- }
-
- print STDERR "Renamed $readsRenamed reads using $namesLoaded names.\n";
-
- close(F);
- close(O);
- close(N);
-
- if ($inFile =~ m/(.*).fastq/) {
- rename "$1.fastq", "$1.CA_UIDs.fastq";
- rename "$1.fastq.RENAMING", "$1.fastq";
- }
-}
diff --git a/src/AS_RUN/replaceUIDwithName-posmap.pl b/src/AS_RUN/replaceUIDwithName-posmap.pl
deleted file mode 100644
index ab54c79..0000000
--- a/src/AS_RUN/replaceUIDwithName-posmap.pl
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # This file is derived from:
- #
- # src/AS_RUN/replaceUIDwithName-simple.pl
- #
- # Modifications by:
- #
- # Brian P. Walenz on 2013-SEP-22
- # are Copyright 2013 J. Craig Venter Institute, and
- # are subject to the GNU General Public License version 2
- #
- # Brian P. Walenz on 2014-OCT-01
- # are Copyright 2014 Battelle National Biodefense Institute, and
- # are subject to the BSD 3-Clause License
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-my %UIDtoNAME;
-
-# Usage
-# fastqUIDmap posmnap posmap posmap
-
-my $fastqUIDmap = shift @ARGV;
-
-print STDERR "Loading UID map from '$fastqUIDmap'.\n";
-
-open (F, "< $fastqUIDmap") or die "Failed to open '$fastqUIDmap'\n";
-while (<F>) {
- chomp;
-
- my @v = split '\s+', $_;
-
- if (scalar(@v) == 3) {
- $UIDtoNAME{$v[0]} = $v[2];
-
- } elsif (scalar(@v) == 6) {
- $UIDtoNAME{$v[0]} = $v[2];
- $UIDtoNAME{$v[3]} = $v[5];
-
- } else {
- die "unknown format '$_'\n";
- }
-}
-close(F);
-
-my $inFile;
-my $otFile;
-
-while (scalar(@ARGV)) {
- $inFile = shift @ARGV;
-
- print STDERR "Renaming '$inFile' to '$inFile.UID'.\n";
-
- rename "$inFile", "$inFile.UID";
-
- open(F, "< $inFile.UID") or die "Failed to open '$inFile.UID' for reading\n";
- open(O, "> $inFile") or die "Failed to open '$inFile' for writing\n";
-
- while (!eof(F)) {
- my $a = <F>;
- my @a = split '\s+', $a;
-
- foreach my $a (@a) {
- if (exists($UIDtoNAME{$a})) {
- $a = $UIDtoNAME{$a};
- }
- }
-
- print O join "\t", @a;
- print O "\n";
- }
-
- close(F);
- close(O);
-}
diff --git a/src/AS_UTL/AS_UTL_alloc.C b/src/AS_UTL/AS_UTL_alloc.C
index 3aa472d..4f39173 100644
--- a/src/AS_UTL/AS_UTL_alloc.C
+++ b/src/AS_UTL/AS_UTL_alloc.C
@@ -31,4 +31,50 @@
* full conditions and disclaimers for each license.
*/
-// Nothing here. Left for future use.
+#include "AS_UTL_alloc.H"
+
+
+
+
+#if !defined(__CYGWIN__) && !defined(_WIN32)
+#include <sys/sysctl.h>
+#endif
+
+#ifdef HW_PHYSMEM
+
+uint64
+getPhysicalMemorySize(void) {
+ uint64 physMemory = 0;
+
+ int mib[2] = { CTL_HW, HW_PHYSMEM };
+ size_t len = sizeof(uint64);
+
+ errno = 0;
+
+ if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0)
+ fprintf(stderr, "getPhysicalMemorySize()-- sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1);
+
+ if (len != sizeof(uint64)) {
+#ifdef HW_MEMSIZE
+ mib[1] = HW_MEMSIZE;
+ len = sizeof(uint64);
+ if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0 || len != sizeof(uint64))
+#endif
+ fprintf(stderr, "getPhysicalMemorySize()-- sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1);
+ }
+
+ return(physMemory);
+}
+
+#else
+
+uint64
+getPhysicalMemorySize(void) {
+ uint64 physPages = sysconf(_SC_PHYS_PAGES);
+ uint64 pageSize = sysconf(_SC_PAGESIZE);
+ uint64 physMemory = physPages * pageSize;
+
+ return(physMemory);
+}
+
+#endif
diff --git a/src/AS_UTL/AS_UTL_alloc.H b/src/AS_UTL/AS_UTL_alloc.H
index dfd1e05..69d62a5 100644
--- a/src/AS_UTL/AS_UTL_alloc.H
+++ b/src/AS_UTL/AS_UTL_alloc.H
@@ -34,6 +34,33 @@
#ifndef AS_UTL_ALLOC_H
#define AS_UTL_ALLOC_H
+#include "AS_global.H"
+
+
+uint64 getPhysicalMemorySize(void);
+
+
+
+const uint32 resizeArray_doNothing = 0x00;
+const uint32 resizeArray_copyData = 0x01;
+const uint32 resizeArray_clearNew = 0x02;
+
+
+
+template<typename TT, typename LL>
+void
+allocateArray(TT*& array, LL arrayMax, uint32 op=resizeArray_clearNew) {
+
+ if (array != NULL)
+ delete [] array;
+
+ array = new TT [arrayMax];
+
+ if (op == resizeArray_clearNew)
+ memset(array, 0, sizeof(TT) * arrayMax);
+}
+
+
template<typename TT, typename LL>
void
@@ -51,9 +78,6 @@ duplicateArray(TT*& to, LL &toLen, LL &toMax, TT *fr, LL frLen, LL UNUSED(frMax)
memcpy(to, fr, sizeof(TT) * toLen);
}
-const uint32 resizeArray_doNothing = 0x00;
-const uint32 resizeArray_copyData = 0x01;
-const uint32 resizeArray_clearNew = 0x02;
template<typename TT, typename LL>
@@ -78,6 +102,7 @@ resizeArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, uint32 op=
}
+
template<typename T1, typename T2, typename LL>
void
resizeArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, LL newMax, uint32 op=resizeArray_copyData) {
@@ -108,6 +133,7 @@ resizeArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, LL newM
}
+
template<typename TT, typename LL>
void
increaseArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 increment) {
@@ -124,6 +150,7 @@ increaseArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 increment) {
}
+
template<typename T1, typename T2, typename LL>
void
increaseArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, uint64 increment) {
@@ -140,4 +167,5 @@ increaseArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, uint6
}
+
#endif // AS_UTL_ALLOC_H
diff --git a/src/AS_UTL/AS_UTL_fileIO.C b/src/AS_UTL/AS_UTL_fileIO.C
index 3aee5b6..71fbaa5 100644
--- a/src/AS_UTL/AS_UTL_fileIO.C
+++ b/src/AS_UTL/AS_UTL_fileIO.C
@@ -44,6 +44,32 @@
// and that we ended up at the expected location.
#undef VERIFY_WRITE_POSITIONS
+
+
+
+// Return the basename of a path -- that is, strip off any and all extensions.
+// Anything after the first dot after the last slash is removed.
+//
+// But if a directory, do nothing.
+
+void
+AS_UTL_findBaseFileName(char *basename, const char *filename) {
+
+ strcpy(basename, filename);
+
+ if (AS_UTL_fileExists(basename, true, false))
+ return;
+
+ char *slash = strrchr(basename, '/');
+ char *dot = strchr((slash == NULL) ? basename : slash, '.');
+
+ if (dot)
+ *dot = 0;
+}
+
+
+
+
// Provides a safe and reliable mechanism for reading / writing
// binary data.
//
@@ -76,7 +102,7 @@ AS_UTL_safeWrite(FILE *file, const void *buffer, const char *desc, size_t size,
if (errno) {
fprintf(stderr, "safeWrite()-- Write failure on %s: %s\n", desc, strerror(errno));
- fprintf(stderr, "safeWrite()-- Wanted to write "F_SIZE_T" objects (size="F_SIZE_T"), wrote "F_SIZE_T".\n",
+ fprintf(stderr, "safeWrite()-- Wanted to write " F_SIZE_T " objects (size=" F_SIZE_T "), wrote " F_SIZE_T ".\n",
towrite, size, written);
assert(errno == 0);
}
@@ -92,7 +118,7 @@ AS_UTL_safeWrite(FILE *file, const void *buffer, const char *desc, size_t size,
#ifdef VERIFY_WRITE_POSITIONS
if ((expectedposition > 0) &&
(AS_UTL_ftell(file) != expectedposition)) {
- fprintf(stderr, "safeWrite()-- EXPECTED "F_OFF_T", ended up at "F_OFF_T"\n",
+ fprintf(stderr, "safeWrite()-- EXPECTED " F_OFF_T ", ended up at " F_OFF_T "\n",
expectedposition, AS_UTL_ftell(file));
assert(AS_UTL_ftell(file) == expectedposition);
}
@@ -121,7 +147,7 @@ AS_UTL_safeRead(FILE *file, void *buffer, const char *desc, size_t size, size_t
if ((errno) && (errno != EINTR)) {
fprintf(stderr, "safeRead()-- Read failure on %s: %s.\n", desc, strerror(errno));
- fprintf(stderr, "safeRead()-- Wanted to read "F_SIZE_T" objects (size="F_SIZE_T"), read "F_SIZE_T".\n",
+ fprintf(stderr, "safeRead()-- Wanted to read " F_SIZE_T " objects (size=" F_SIZE_T "), read " F_SIZE_T ".\n",
toread, size, written);
assert(errno == 0);
}
@@ -130,7 +156,7 @@ AS_UTL_safeRead(FILE *file, void *buffer, const char *desc, size_t size, size_t
finish:
// Just annoys developers. Stop it.
//if (position != nobj)
- // fprintf(stderr, "AS_UTL_safeRead()-- Short read; wanted "F_SIZE_T" objects, read "F_SIZE_T" instead.\n",
+ // fprintf(stderr, "AS_UTL_safeRead()-- Short read; wanted " F_SIZE_T " objects, read " F_SIZE_T " instead.\n",
// nobj, position);
return(position);
}
@@ -175,6 +201,24 @@ AS_UTL_mkdir(const char *dirname) {
+int
+AS_UTL_symlink(const char *pathToFile, const char *pathToLink) {
+
+ if (AS_UTL_fileExists(pathToFile, FALSE, FALSE) == 0)
+ fprintf(stderr, "AS_UTL_symlink()-- Original file '%s' doesn't exist, won't make a link to nothing.\n",
+ pathToFile), exit(1);
+
+ errno = 0;
+ symlink(pathToFile, pathToLink);
+ if (errno)
+ fprintf(stderr, "AS_UTL_symlink()-- Failed to make link '%s' pointing to file '%s': %s\n",
+ pathToLink, pathToFile, strerror(errno)), exit(1);
+
+ return(0);
+}
+
+
+
// Remove a file, or do nothing if the file doesn't exist. Returns true if the file
// was deleted, false if the file never existsed.
int
@@ -267,17 +311,27 @@ AS_UTL_sizeOfFile(const char *path) {
// bzipped files have no contents and we just guess.
if (strcasecmp(path+strlen(path)-3, ".gz") == 0) {
- char cmd[256];
- FILE *F;
+ char cmd[FILENAME_MAX], *p = cmd;
- sprintf(cmd, "gzip -l %s", path);
- F = popen(cmd, "r");
- fscanf(F, " %*s %*s %*s %*s ");
- fscanf(F, " %*d "F_OFF_T" %*s %*s ", &size);
+ snprintf(cmd, FILENAME_MAX, "gzip -l %s", path);
+
+ FILE *F = popen(cmd, "r");
+ fgets(cmd, FILENAME_MAX, F); // compressed uncompressed ratio uncompressed_name
+ fgets(cmd, FILENAME_MAX, F); // 30264891 43640320 30.6% file
pclose(F);
- } else if (strcasecmp(path+strlen(path)-4, ".bz2") == 0) {
+
+ while (isspace(*p) == true) p++; // Skip spaces at the start of the line
+ while (isspace(*p) == false) p++; // Skip the compressed size
+ while (isspace(*p) == true) p++; // Skip spaces
+
+ size = strtoull(p, NULL, 10); // Retain the uncompresssed size
+ }
+
+ else if (strcasecmp(path+strlen(path)-4, ".bz2") == 0) {
size = s.st_size * 14 / 10;
- } else {
+ }
+
+ else {
size = s.st_size;
}
@@ -327,7 +381,7 @@ AS_UTL_fseek(FILE *stream, off_t offset, int whence) {
if ((whence == SEEK_SET) && (beginpos == offset)) {
#ifdef DEBUG_SEEK
// This isn't terribly informative, and adds a lot of clutter.
- //fprintf(stderr, "AS_UTL_fseek()-- seek to "F_OFF_T" (whence=%d); already there\n", offset, whence);
+ //fprintf(stderr, "AS_UTL_fseek()-- seek to " F_OFF_T " (whence=%d); already there\n", offset, whence);
#endif
return;
}
@@ -339,7 +393,7 @@ AS_UTL_fseek(FILE *stream, off_t offset, int whence) {
}
#ifdef DEBUG_SEEK
- fprintf(stderr, "AS_UTL_fseek()-- seek to "F_OFF_T" (requested "F_OFF_T", whence=%d) from "F_OFF_T"\n",
+ fprintf(stderr, "AS_UTL_fseek()-- seek to " F_OFF_T " (requested " F_OFF_T ", whence=%d) from " F_OFF_T "\n",
AS_UTL_ftell(stream), offset, whence, beginpos);
#endif
@@ -377,52 +431,77 @@ AS_UTL_loadFileList(char *fileName, vector<char *> &fileList) {
+cftType
+compressedFileType(char const *filename) {
+
+ if ((filename == NULL) || (filename[0] == 0) || (strcmp(filename, "-") == 0))
+ return(cftSTDIN);
+
+ int32 len = strlen(filename);
+
+ if ((len > 3) && (strcasecmp(filename + len - 3, ".gz") == 0))
+ return(cftGZ);
+
+ else if ((len > 4) && (strcasecmp(filename + len - 4, ".bz2") == 0))
+ return(cftBZ2);
+
+ else if ((len > 3) && (strcasecmp(filename + len - 3, ".xz") == 0))
+ return(cftXZ);
+
+ else
+ return(cftNONE);
+}
compressedFileReader::compressedFileReader(const char *filename) {
- char cmd[FILENAME_MAX * 2];
+ char cmd[FILENAME_MAX];
int32 len = 0;
_file = NULL;
_pipe = false;
_stdi = false;
- if (filename != NULL)
- len = strlen(filename);
+ cftType ft = compressedFileType(filename);
- if ((len > 0) && (strcmp(filename, "-") != 0) && (AS_UTL_fileExists(filename, FALSE, FALSE) == FALSE))
+ if ((ft != cftSTDIN) && (AS_UTL_fileExists(filename, FALSE, FALSE) == FALSE))
fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", filename, strerror(errno)), exit(1);
errno = 0;
- if ((len > 3) && (strcasecmp(filename + len - 3, ".gz") == 0)) {
- sprintf(cmd, "gzip -dc %s", filename);
- _file = popen(cmd, "r");
- _pipe = true;
-
- } else if ((len > 4) && (strcasecmp(filename + len - 4, ".bz2") == 0)) {
- sprintf(cmd, "bzip2 -dc %s", filename);
- _file = popen(cmd, "r");
- _pipe = true;
-
- } else if ((len > 3) && (strcasecmp(filename + len - 3, ".xz") == 0)) {
- sprintf(cmd, "xz -dc %s", filename);
- _file = popen(cmd, "r");
- _pipe = true;
-
- if (_file == NULL) // popen() returns NULL on error. It does not reliably set errno.
- fprintf(stderr, "ERROR: Failed to open input file '%s': popen() returned NULL\n", filename), exit(1);
-
- errno = 0;
-
- } else if ((len == 0) || (strcmp(filename, "-") == 0)) {
- _file = stdin;
- _stdi = 1;
-
- } else {
- _file = fopen(filename, "r");
- _pipe = false;
+ switch (ft) {
+ case cftGZ:
+ snprintf(cmd, FILENAME_MAX, "gzip -dc %s", filename);
+ _file = popen(cmd, "r");
+ _pipe = true;
+ break;
+
+ case cftBZ2:
+ snprintf(cmd, FILENAME_MAX, "bzip2 -dc %s", filename);
+ _file = popen(cmd, "r");
+ _pipe = true;
+ break;
+
+ case cftXZ:
+ snprintf(cmd, FILENAME_MAX, "xz -dc %s", filename);
+ _file = popen(cmd, "r");
+ _pipe = true;
+
+ if (_file == NULL) // popen() returns NULL on error. It does not reliably set errno.
+ fprintf(stderr, "ERROR: Failed to open input file '%s': popen() returned NULL\n", filename), exit(1);
+
+ errno = 0;
+ break;
+
+ case cftSTDIN:
+ _file = stdin;
+ _stdi = 1;
+ break;
+
+ default:
+ _file = fopen(filename, "r");
+ _pipe = false;
+ break;
}
if (errno)
@@ -447,40 +526,45 @@ compressedFileReader::~compressedFileReader() {
compressedFileWriter::compressedFileWriter(const char *filename, int32 level) {
- char cmd[FILENAME_MAX * 2];
+ char cmd[FILENAME_MAX];
int32 len = 0;
_file = NULL;
_pipe = false;
_stdi = false;
- if (filename != NULL)
- len = strlen(filename);
+ cftType ft = compressedFileType(filename);
errno = 0;
- if ((len > 3) && (strcasecmp(filename + len - 3, ".gz") == 0)) {
- sprintf(cmd, "gzip -%dc > %s", level, filename);
- _file = popen(cmd, "w");
- _pipe = true;
-
- } else if ((len > 4) && (strcasecmp(filename + len - 4, ".bz2") == 0)) {
- sprintf(cmd, "bzip2 -%dc > %s", level, filename);
- _file = popen(cmd, "w");
- _pipe = true;
-
- } else if ((len > 3) && (strcasecmp(filename + len - 3, ".xz") == 0)) {
- sprintf(cmd, "xz -%dc > %s", level, filename);
- _file = popen(cmd, "w");
- _pipe = true;
-
- } else if ((len == 0) || (strcmp(filename, "-") == 0)) {
- _file = stdout;
- _stdi = 1;
-
- } else {
- _file = fopen(filename, "w");
- _pipe = false;
+ switch (ft) {
+ case cftGZ:
+ snprintf(cmd, FILENAME_MAX, "gzip -%dc > %s", level, filename);
+ _file = popen(cmd, "w");
+ _pipe = true;
+ break;
+
+ case cftBZ2:
+ snprintf(cmd, FILENAME_MAX, "bzip2 -%dc > %s", level, filename);
+ _file = popen(cmd, "w");
+ _pipe = true;
+ break;
+
+ case cftXZ:
+ snprintf(cmd, FILENAME_MAX, "xz -%dc > %s", level, filename);
+ _file = popen(cmd, "w");
+ _pipe = true;
+ break;
+
+ case cftSTDIN:
+ _file = stdout;
+ _stdi = 1;
+ break;
+
+ default:
+ _file = fopen(filename, "w");
+ _pipe = false;
+ break;
}
if (errno)
diff --git a/src/AS_UTL/AS_UTL_fileIO.H b/src/AS_UTL/AS_UTL_fileIO.H
index de74e4a..7fddfcc 100644
--- a/src/AS_UTL/AS_UTL_fileIO.H
+++ b/src/AS_UTL/AS_UTL_fileIO.H
@@ -53,10 +53,15 @@ using namespace std;
#define O_LARGEFILE 0
#endif
+void AS_UTL_findBaseFileName(char *basename, const char *filename);
+
void AS_UTL_safeWrite(FILE *file, const void *buffer, const char *desc, size_t size, size_t nobj);
size_t AS_UTL_safeRead (FILE *file, void *buffer, const char *desc, size_t size, size_t nobj);
int AS_UTL_mkdir(const char *dirname);
+
+int AS_UTL_symlink(const char *pathToFile, const char *pathToLink);
+
int AS_UTL_unlink(const char *filename);
int AS_UTL_fileExists(const char *path, int directory=false, int readwrite=false);
@@ -65,9 +70,24 @@ off_t AS_UTL_sizeOfFile(const char *path);
off_t AS_UTL_ftell(FILE *stream);
void AS_UTL_fseek(FILE *stream, off_t offset, int whence);
+
// Read a file-of-files into a vector
void AS_UTL_loadFileList(char *fileName, vector<char *> &fileList);
+
+
+enum cftType {      //  How compressedFileReader / compressedFileWriter open a file.
+  cftNONE = 0,      //  Anything else; handled by the default fopen() branch.
+  cftGZ,            //  Piped through gzip.
+  cftBZ2,           //  Piped through bzip2.
+  cftXZ,            //  Piped through xz.
+  cftSTDIN          //  Reader uses stdin, writer uses stdout.
+};
+
+cftType compressedFileType(char const *filename);
+
+
+
class compressedFileReader {
public:
compressedFileReader(char const *filename);
@@ -84,6 +104,8 @@ private:
bool _stdi;
};
+
+
class compressedFileWriter {
public:
compressedFileWriter(char const *filename, int32 level=1);
diff --git a/src/AS_UTL/AS_UTL_stackTrace.C b/src/AS_UTL/AS_UTL_stackTrace.C
index 0c55da4..b764174 100644
--- a/src/AS_UTL/AS_UTL_stackTrace.C
+++ b/src/AS_UTL/AS_UTL_stackTrace.C
@@ -82,7 +82,7 @@ AS_UTL_envokeGDB(void) {
// Child
- sprintf(cmd, "gdb -quiet -silent -p "F_U64" -batch -x commands", pid);
+ snprintf(cmd, 1024, "gdb -quiet -silent -p " F_U64 " -batch -x commands", pid);
system(cmd);
exit(0);
#endif
diff --git a/src/AS_UTL/bitPackedArray.C b/src/AS_UTL/bitPackedArray.C
index 686950b..01acc66 100644
--- a/src/AS_UTL/bitPackedArray.C
+++ b/src/AS_UTL/bitPackedArray.C
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -66,7 +70,7 @@ bitPackedArray::get(uint64 idx) {
uint64 p = _valueWidth * (idx % _valuesPerSegment);
if (idx >= _nextElement) {
- fprintf(stderr, "bitPackedArray::get()-- element index "F_U64" is out of range, only "F_U64" elements.\n",
+ fprintf(stderr, "bitPackedArray::get()-- element index " F_U64 " is out of range, only " F_U64 " elements.\n",
idx, _nextElement-1);
return(0xdeadbeefdeadbeefULL);
}
@@ -80,7 +84,7 @@ bitPackedArray::set(uint64 idx, uint64 val) {
uint64 s = idx / _valuesPerSegment;
uint64 p = _valueWidth * (idx % _valuesPerSegment);
- //fprintf(stderr, "s="F_U64" p="F_U64" segments="F_U64"/"F_U64"\n", s, p, _numSegments, _maxSegments);
+ //fprintf(stderr, "s=" F_U64 " p=" F_U64 " segments=" F_U64 "/" F_U64 "\n", s, p, _numSegments, _maxSegments);
if (idx >= _nextElement)
_nextElement = idx+1;
diff --git a/src/AS_UTL/bitPackedArray.H b/src/AS_UTL/bitPackedArray.H
index d7590c7..b4ddb93 100644
--- a/src/AS_UTL/bitPackedArray.H
+++ b/src/AS_UTL/bitPackedArray.H
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -164,20 +168,20 @@ public:
}
#ifdef DEBUG_BPH_GET
- fprintf(stderr, "test c="F_U64" and p="F_U64" lastVal="F_U64"\n",
+ fprintf(stderr, "test c=" F_U64 " and p=" F_U64 " lastVal=" F_U64 "\n",
cidx, pidx, _lastVal);
- fprintf(stderr, "test c="F_U64"="F_U64"\n",
+ fprintf(stderr, "test c=" F_U64 "=" F_U64 "\n",
cidx, cval);
- fprintf(stderr, "test p="F_U64"="F_U64"\n",
+ fprintf(stderr, "test p=" F_U64 "=" F_U64 "\n",
pidx, pval);
- fprintf(stderr, "test c="F_U64"="F_U64" and p="F_U64"="F_U64"\n",
+ fprintf(stderr, "test c=" F_U64 "=" F_U64 " and p=" F_U64 "=" F_U64 "\n",
cidx, cval, pidx, pval);
#endif
if (cval < pval) {
#ifdef DEBUG_BPH_GET
- fprintf(stderr, "swap c="F_U64"="F_U64" and p="F_U64"="F_U64"\n",
+ fprintf(stderr, "swap c=" F_U64 "=" F_U64 " and p=" F_U64 "=" F_U64 "\n",
cidx, cval, pidx, pval);
#endif
@@ -204,7 +208,7 @@ public:
bool more = false;
#ifdef DEBUG_BPH_ADD
- fprintf(stderr, "add c="F_U64"="F_U64" -- lastVal="F_U64"\n",
+ fprintf(stderr, "add c=" F_U64 "=" F_U64 " -- lastVal=" F_U64 "\n",
cidx, cval, _lastVal);
#endif
@@ -217,20 +221,20 @@ public:
pidx = (cidx-1) / 2;
#ifdef DEBUG_BPH_ADD
- fprintf(stderr, "more c="F_U64" and p="F_U64"\n", cidx, pidx);
+ fprintf(stderr, "more c=" F_U64 " and p=" F_U64 "\n", cidx, pidx);
#endif
pval = _array->get(pidx);
#ifdef DEBUG_BPH_ADD
- fprintf(stderr, "test c="F_U64"="F_U64" and p="F_U64"="F_U64"\n",
+ fprintf(stderr, "test c=" F_U64 "=" F_U64 " and p=" F_U64 "=" F_U64 "\n",
cidx, cval, pidx, pval);
#endif
if (pval > cval) {
#ifdef DEBUG_BPH_ADD
- fprintf(stderr, "swap c="F_U64"="F_U64" and p="F_U64"="F_U64"\n",
+ fprintf(stderr, "swap c=" F_U64 "=" F_U64 " and p=" F_U64 "=" F_U64 "\n",
cidx, cval, pidx, pval);
#endif
@@ -254,7 +258,7 @@ public:
void dump(void) {
for (uint32 i=0; i<_lastVal; i++)
- fprintf(stderr, "HEAP["F_U32"]="F_U64"\n", i, _array->get(i));
+ fprintf(stderr, "HEAP[" F_U32 "]=" F_U64 "\n", i, _array->get(i));
}
void clear(void) {
diff --git a/src/AS_UTL/bitPackedFile.C b/src/AS_UTL/bitPackedFile.C
index cdd85c5..e77a9ea 100644
--- a/src/AS_UTL/bitPackedFile.C
+++ b/src/AS_UTL/bitPackedFile.C
@@ -128,8 +128,12 @@ bitPackedFile::bitPackedFile(char const *name, uint64 offset, bool forceTruncate
// Move to the correct position in the file.
//
file_offset = offset;
- if (file_offset > 0)
+ if (file_offset > 0) {
+ errno = 0;
lseek(_file, file_offset, SEEK_SET);
+ if (errno)
+ fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to seed to position %llu: %s\n", _name, file_offset, strerror(errno)), exit(1);
+ }
// Deal with endianess. We write out some bytes (or read back some bytes) to the start of
// the file, and then hide them from the user.
@@ -213,7 +217,7 @@ bitPackedFile::bitPackedFile(char const *name, uint64 offset, bool forceTruncate
fprintf(stderr, "bitPackedFile::bitPackedFile()-- found ");
for (uint32 i=0; i<16; i++)
fprintf(stderr, "%c", isascii(c[i]) ? c[i] : '.');
- fprintf(stderr, " at position "F_X64"\n", file_offset);
+ fprintf(stderr, " at position " F_X64 "\n", file_offset);
exit(1);
}
@@ -412,7 +416,7 @@ bitPackedFile::seekNormal(uint64 bitpos) {
errno = 0;
lseek(_file, _pos * 8 + endianess_offset, SEEK_SET);
if (errno) {
- fprintf(stderr, "bitPackedFile::seekNormal() '%s' seek to pos="F_U64" failed: %s\n",
+ fprintf(stderr, "bitPackedFile::seekNormal() '%s' seek to pos=" F_U64 " failed: %s\n",
_name,
_pos * 8 + endianess_offset, strerror(errno));
exit(1);
@@ -421,7 +425,7 @@ bitPackedFile::seekNormal(uint64 bitpos) {
errno = 0;
size_t wordsread = read(_file, _bfr, sizeof(uint64) * _bfrmax);
if (errno) {
- fprintf(stderr, "bitPackedFile::seekNormal() '%s' read of "F_U64" bytes failed': %s\n",
+ fprintf(stderr, "bitPackedFile::seekNormal() '%s' read of " F_U64 " bytes failed': %s\n",
_name,
sizeof(uint64) * _bfrmax,
strerror(errno));
@@ -431,7 +435,7 @@ bitPackedFile::seekNormal(uint64 bitpos) {
// Flip all the words we just read, if needed
//
if (endianess_flipped)
- for (uint32 i=0; i<wordsread; i++)
+ for (size_t i=0; i<wordsread; i++)
_bfr[i] = uint64Swap(_bfr[i]);
// Clear any words that we didn't read (supposedly, because we hit
@@ -459,7 +463,7 @@ bitPackedFile::seek(uint64 bitpos) {
if ((_pos <= np) && (np <= _pos + _bfrmax - 32)) {
_bit = bitpos - (_pos << 6);
stat_seekInside++;
- //fprintf(stderr, "SEEK INSIDE to _bit="F_U64"\n", _bit);
+ //fprintf(stderr, "SEEK INSIDE to _bit=" F_U64 "\n", _bit);
return;
}
}
@@ -481,7 +485,7 @@ bitPackedFile::seek(uint64 bitpos) {
_forceFirstLoad = false;
- //fprintf(stderr, "SEEK OUTSIDE to _pos="F_U64" _bit="F_U64"\n", _pos, _bit);
+ //fprintf(stderr, "SEEK OUTSIDE to _pos=" F_U64 " _bit=" F_U64 "\n", _pos, _bit);
}
@@ -495,7 +499,10 @@ bitPackedFile::loadInCore(void) {
flushDirty();
+ errno = 0;
fstat(_file, &sb);
+ if (errno)
+ fprintf(stderr, "bitPackedFile::loadInCore() failed to fstat(): %s\n", strerror(errno)), exit(1);
// The extra 1024 words is to keep seek() from attempting to grab
// the next block (there isn't a next block, we've got it all!)
diff --git a/src/AS_UTL/bitPackedFile.H b/src/AS_UTL/bitPackedFile.H
index ba415b7..8dd36b8 100644
--- a/src/AS_UTL/bitPackedFile.H
+++ b/src/AS_UTL/bitPackedFile.H
@@ -35,6 +35,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -71,7 +75,7 @@ public:
uint64 loadInCore(void);
void showStats(FILE *f) {
- fprintf(f, "inside: "F_U64" outside: "F_U64"\n", stat_seekInside, stat_seekOutside);
+ fprintf(f, "inside: " F_U64 " outside: " F_U64 "\n", stat_seekInside, stat_seekOutside);
fflush(f);
};
private:
diff --git a/src/AS_UTL/hexDump.C b/src/AS_UTL/hexDump.C
new file mode 100644
index 0000000..d3a6f74
--- /dev/null
+++ b/src/AS_UTL/hexDump.C
@@ -0,0 +1,85 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-SEP-15
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_global.H"
+
+
+//  Dump DATAlen bytes from DATA to F in a hex format.
+//  It will print W bytes per line, separated into words of 8 bytes.
+//  The end of the line will have the ASCII representation of the data.
+//  Nothing is printed if DATAlen or W is zero.
+//    00000000  00 01 02 03 04 05 06 07  08 09 0a 0b 0c 0d 0e 0f  '................'
+//
+void
+hexDump(FILE *F,
+        void *DATA, uint32 DATAlen, uint32 W) {
+  static const char  HEX[] = "0123456789abcdef";
+  if ((W == 0) || (DATAlen == 0))                        //  W == 0 would never advance through the data.
+    return;
+
+  char *STR = new char [8 + 2 + W * 3 + W/8 + 2 + 1 + W * 1 + 1];
+
+  for (uint32 Dp=0, Ds=0; Dp < DATAlen; Dp += Ds) {      //  Advance by the bytes emitted, so Dp never passes DATAlen.
+    uint8  *D = (uint8 *)DATA + Dp;
+    Ds = (DATAlen - Dp >= W) ? (W) : (DATAlen - Dp);     //  Bytes on this line, computed without uint32 overflow.
+
+    for (uint32 Z=Dp, ii=8; ii>0; Z>>=4)                 //  Dump the address in hexadecimal
+      STR[--ii] = HEX[Z & 0x0f];
+
+    char *H = STR + 8;                                   //  Data pointer
+    char *A = STR + 8 + 1 + 3 * W + W/8;                 //  ASCII pointer
+
+    *H++ = ' ';                                          //  Another space is added at ii=0 below.
+    *A++ = ' ';                                          //  Space between the last digit and the string
+    *A++ = '\'';                                         //  Bracket at the start of the string.
+
+    for (uint32 ii=0; ii<W; ii++) {
+      if ((ii % 8) == 0)                                 //  An extra space between words
+        *H++ = ' ';
+
+      if (ii < Ds) {                                     //  Emit a digit, or...
+        *H++ = HEX[D[ii] >> 4];
+        *H++ = HEX[D[ii] & 0x0f];
+      }
+      else {
+        *H++ = ' ';                                      //  ...spaces if we fell off the end of the data
+        *H++ = ' ';
+      }
+
+      *H++ = ' ';                                        //  Space between digits
+      if (ii < Ds)                                       //  Printable ASCII or a dot
+        *A++ = ((' ' <= D[ii]) && (D[ii] <= '~')) ? (D[ii]) : ('.');
+    }
+
+    *A++ = '\'';                                         //  Bracket at the end of the string.
+    *A++ = '\n';
+    *A   = 0;                                            //  NUL terminate the string.
+    fputs(STR, F);
+  }
+
+  delete [] STR;
+}
+
+
diff --git a/src/utgcns/libpbutgcns/SimpleAligner.H b/src/AS_UTL/hexDump.H
similarity index 75%
rename from src/utgcns/libpbutgcns/SimpleAligner.H
rename to src/AS_UTL/hexDump.H
index f1792e3..416ad27 100644
--- a/src/utgcns/libpbutgcns/SimpleAligner.H
+++ b/src/AS_UTL/hexDump.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Sergey Koren beginning on 2015-DEC-28
+ * Brian P. Walenz beginning on 2016-SEP-15
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,15 +23,11 @@
* full conditions and disclaimers for each license.
*/
-#ifndef __GCON_SIMPLE_ALIGNER__
-#define __GCON_SIMPLE_ALIGNER__
+#ifndef HEXDUMP_H
+#define HEXDUMP_H
-#include "dw.H"
+void
+hexDump(FILE *F,
+ void *DATA, uint32 DATAlen, uint32 W=32);
-class SimpleAligner {
-public:
- SimpleAligner();
- void align(dagcon::Alignment& aln, double errorRate);
-};
-
-#endif // __GCON_SIMPLE_ALIGNER__
+#endif // HEXDUMP_H
diff --git a/src/AS_UTL/intervalList.H b/src/AS_UTL/intervalList.H
index 929ddf6..168338c 100644
--- a/src/AS_UTL/intervalList.H
+++ b/src/AS_UTL/intervalList.H
@@ -659,10 +659,16 @@ intervalList<iNum, iVal>::computeDepth(intervalDepthRegions<iNum, iVal> *id, uin
_list = new _intervalPair<iNum, iVal> [_listMax];
}
- // Init first interval.
+ // The first thing must be an 'open' event. If not, someone supplied a negative length to the
+ // original intervalList. Or, possibly, two zero-length intervals.
+ if (id[0].open == false)
+ for (uint32 ii=0; ii<idlen; ii++)
+ fprintf(stderr, "id[%d] pos %d open %d\n", ii, id[ii].pos, id[ii].open);
assert(id[0].open == true);
+ // Init first interval.
+
_list[_listLen].lo = id[0].pos;
_list[_listLen].hi = id[0].pos;
_list[_listLen].ct = 1;
diff --git a/src/AS_UTL/intervalListTest.C b/src/AS_UTL/intervalListTest.C
index 9df9124..da64d53 100644
--- a/src/AS_UTL/intervalListTest.C
+++ b/src/AS_UTL/intervalListTest.C
@@ -41,24 +41,44 @@ typedef uint64_t uint64;
#include "intervalList.H"
+// g++ -o intervalListTest -I.. -I. intervalListTest.C
+
int
main(int argc, char **argv) {
- intervalList<int32> t1;
+ if (0) {
+ intervalList<int32> t1;
+
+ t1.add(0, 10);
+ t1.add(11,7);
+ t1.add(20, 8);
+
+ fprintf(stderr, "BEFORE:\n");
+ for (uint32 ii=0; ii<t1.numberOfIntervals(); ii++)
+ fprintf(stderr, "%2d %3d-%3d\n", ii, t1.lo(ii), t1.hi(ii));
+
+ t1.merge(-1);
+
+ fprintf(stderr, "AFTER:\n");
+ for (uint32 ii=0; ii<t1.numberOfIntervals(); ii++)
+ fprintf(stderr, "%2d %3d-%3d\n", ii, t1.lo(ii), t1.hi(ii));
+ }
+
+ if (1) {
+ intervalList<uint32> il;
+
+ il.add(1, -1);
- t1.add(0, 10);
- t1.add(11,7);
- t1.add(20, 8);
+ intervalList<uint32> de(il);
- fprintf(stderr, "BEFORE:\n");
- for (uint32 ii=0; ii<t1.numberOfIntervals(); ii++)
- fprintf(stderr, "%2d %3d-%3d\n", ii, t1.lo(ii), t1.hi(ii));
+ il.merge();
- t1.merge(-1);
+ for (uint32 ii=0; ii<il.numberOfIntervals(); ii++)
+ fprintf(stderr, "il %2u %4u-%4u\n", ii, il.lo(ii), il.hi(ii));
- fprintf(stderr, "AFTER:\n");
- for (uint32 ii=0; ii<t1.numberOfIntervals(); ii++)
- fprintf(stderr, "%2d %3d-%3d\n", ii, t1.lo(ii), t1.hi(ii));
+ for (uint32 ii=0; ii<de.numberOfIntervals(); ii++)
+ fprintf(stderr, "de %2u %4u-%4u %4d\n", ii, de.lo(ii), de.hi(ii), de.depth(ii));
+ }
exit(0);
}
diff --git a/src/AS_UTL/kMer.C b/src/AS_UTL/kMer.C
index d0c8cdf..e8da1f0 100644
--- a/src/AS_UTL/kMer.C
+++ b/src/AS_UTL/kMer.C
@@ -300,7 +300,7 @@ kMerBuilder::addBaseCompressed(uint64 cf, uint64 cr) {
lb = 9; // No valid last base (should probably be ~uint64ZERO, but that screws up diagnostic output)
#ifdef DEBUGCOMP
- fprintf(stderr, "kMerBuilder::addBaseCompressed()-- lb="uint64FMT" cf="uint64FMT" ms="F_U32" ccl="F_U32" lvl="F_U32"\n",
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- lb="uint64FMT" cf="uint64FMT" ms=" F_U32 " ccl=" F_U32 " lvl=" F_U32 "\n",
lb, cf, ms, _compressionCurrentLength, _compression);
#endif
@@ -392,7 +392,7 @@ kMerBuilder::addBaseCompressed(uint64 cf, uint64 cr) {
ms -= _compressionLength[_compressionIndex]; // subtract the count for the letter we just shifted out
#ifdef DEBUGCOMP
- fprintf(stderr, "kMerBuilder::addBaseCompressed()-- ADDNEWBASE shifted out at idx="F_U32" with "F_U32" positions; final span "F_U32"\n",
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- ADDNEWBASE shifted out at idx=" F_U32 " with " F_U32 " positions; final span " F_U32 "\n",
_compressionIndex,
_compressionLength[_compressionIndex],
ms + 1);
diff --git a/src/AS_UTL/kMerHuge.H b/src/AS_UTL/kMerHuge.H
index 0347ccc..ee9936b 100644
--- a/src/AS_UTL/kMerHuge.H
+++ b/src/AS_UTL/kMerHuge.H
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -260,7 +264,7 @@ public:
val &= uint64MASK(numbits);
if (wrd >= KMER_WORDS) {
- fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos="F_U32" numbits="F_U32" larger than KMER_WORDS=%d\n",
+ fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos=" F_U32 " numbits=" F_U32 " larger than KMER_WORDS=%d\n",
pos, numbits, KMER_WORDS), exit(1);
}
@@ -274,7 +278,7 @@ public:
MERWORD(wrd) |= val << bit;
} else {
if (wrd+1 >= KMER_WORDS) {
- fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos="F_U32" numbits="F_U32" larger than KMER_WORDS=%d\n",
+ fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos=" F_U32 " numbits=" F_U32 " larger than KMER_WORDS=%d\n",
pos, numbits, KMER_WORDS), exit(1);
}
@@ -295,7 +299,7 @@ public:
uint32 bit = pos & 0x3f;
if (wrd >= KMER_WORDS) {
- fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos="F_U32" numbits="F_U32" larger than KMER_WORDS=%d\n",
+ fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos=" F_U32 " numbits=" F_U32 " larger than KMER_WORDS=%d\n",
pos, numbits, KMER_WORDS), exit(1);
}
@@ -303,7 +307,7 @@ public:
val = MERWORD(wrd) >> bit;
} else {
if (wrd+1 >= KMER_WORDS) {
- fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos="F_U32" numbits="F_U32" larger than KMER_WORDS=%d\n",
+ fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos=" F_U32 " numbits=" F_U32 " larger than KMER_WORDS=%d\n",
pos, numbits, KMER_WORDS), exit(1);
}
@@ -395,7 +399,7 @@ kMerHuge::setMerSize(uint32 ms) {
}
if (_maskWord >= KMER_WORDS) {
- fprintf(stderr, "kMer::setMerSize()-- ERROR! Desired merSize of "F_U32" larger than\n", _merSize);
+ fprintf(stderr, "kMer::setMerSize()-- ERROR! Desired merSize of " F_U32 " larger than\n", _merSize);
fprintf(stderr, " available storage space (KMER_WORDS=%d, max merSize %d).\n", KMER_WORDS, KMER_WORDS*32);
exit(1);
}
diff --git a/src/AS_UTL/memoryMappedFile.H b/src/AS_UTL/memoryMappedFile.H
index 6edd748..e88b09b 100644
--- a/src/AS_UTL/memoryMappedFile.H
+++ b/src/AS_UTL/memoryMappedFile.H
@@ -109,7 +109,7 @@ public:
: mmap(0L, _length, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0);
if (errno)
- fprintf(stderr, "memoryMappedFile()-- Couldn't mmap '%s' of length "F_SIZE_T": %s\n", _name, _length, strerror(errno)), exit(1);
+ fprintf(stderr, "memoryMappedFile()-- Couldn't mmap '%s' of length " F_SIZE_T ": %s\n", _name, _length, strerror(errno)), exit(1);
close(fd);
@@ -135,7 +135,7 @@ public:
length = _length - offset;
if (offset + length > _length)
- fprintf(stderr, "memoryMappedFile()-- Requested "F_SIZE_T" bytes at position "F_SIZE_T" in file '%s', but only "F_SIZE_T" bytes in file.\n",
+ fprintf(stderr, "memoryMappedFile()-- Requested " F_SIZE_T " bytes at position " F_SIZE_T " in file '%s', but only " F_SIZE_T " bytes in file.\n",
length, offset, _name, _length), exit(1);
_offset = offset + length;
diff --git a/src/AS_UTL/mt19937ar.C b/src/AS_UTL/mt19937ar.C
index 3363e94..ccdbbb7 100644
--- a/src/AS_UTL/mt19937ar.C
+++ b/src/AS_UTL/mt19937ar.C
@@ -168,3 +168,11 @@ mtRandom::mtRandomGaussian(void) {
return(y1);
}
+
+
+// Generate a number from an exponential distribution using Inverse Transform Sampling.
+// The result is 'mode' minus log(mtRandomRealOpen()) scaled by 1/lambda.
+double
+mtRandom::mtRandomExponential(double mode, double lambda) {
+ return(mode - 1/lambda * log(mtRandomRealOpen()));
+}
diff --git a/src/AS_UTL/mt19937ar.H b/src/AS_UTL/mt19937ar.H
index 604f8f4..dc9893b 100644
--- a/src/AS_UTL/mt19937ar.H
+++ b/src/AS_UTL/mt19937ar.H
@@ -54,6 +54,7 @@ public:
//
double mtRandomGaussian(void);
+ double mtRandomExponential(double mode, double lambda=1.0);  //  Names match the definition: result is mode - log(u)/lambda.
private:
uint32 mt[MT_N]; // State vector array
diff --git a/src/AS_UTL/mt19937arTest.C b/src/AS_UTL/mt19937arTest.C
new file mode 100644
index 0000000..cef09f8
--- /dev/null
+++ b/src/AS_UTL/mt19937arTest.C
@@ -0,0 +1,19 @@
+#include "mt19937ar.H"
+
+int
+main(int argc, char **argv) {
+  mtRandom   mt;
+
+  if (argc != 4)   //  Arguments are handed to mtRandomExponential(mode, lambda) in this order.
+    fprintf(stderr, "usage: %s <iterations> <mode> <lambda>\n", argv[0]), exit(1);
+
+  uint32  number = atoi(argv[1]);   //  How many samples to print, one per line.
+  double  mode   = atof(argv[2]);   //  Offset added to every sample.
+  double  lambda = atof(argv[3]);   //  Divisor applied to the log term.
+
+  for (uint32 ii=0; ii<number; ii++)
+    fprintf(stdout, "%f\n", mt.mtRandomExponential(mode, lambda));
+
+  exit(0);
+}
+
diff --git a/src/AS_UTL/readBuffer.C b/src/AS_UTL/readBuffer.C
index 0743921..9532847 100644
--- a/src/AS_UTL/readBuffer.C
+++ b/src/AS_UTL/readBuffer.C
@@ -176,7 +176,7 @@ readBuffer::fillBuffer(void) {
if (errno == EAGAIN)
goto again;
if (errno)
- fprintf(stderr, "readBuffer::fillBuffer()-- only read "F_U64" bytes, couldn't read "F_U64" bytes from '%s': %s\n",
+ fprintf(stderr, "readBuffer::fillBuffer()-- only read " F_U64 " bytes, couldn't read " F_U64 " bytes from '%s': %s\n",
_bufferLen, _bufferMax, _filename, strerror(errno)), exit(1);
if (_bufferLen == 0)
@@ -209,7 +209,7 @@ readBuffer::seek(uint64 pos) {
errno = 0;
lseek(_file, pos, SEEK_SET);
if (errno)
- fprintf(stderr, "readBuffer()-- '%s' couldn't seek to position "F_U64": %s\n",
+ fprintf(stderr, "readBuffer()-- '%s' couldn't seek to position " F_U64 ": %s\n",
_filename, pos, strerror(errno)), exit(1);
_bufferLen = 0;
@@ -272,7 +272,7 @@ readBuffer::read(void *buf, uint64 len) {
errno = 0;
bAct = (uint64)::read(_file, bufchar + bCopied + bRead, len - bCopied - bRead);
if (errno)
- fprintf(stderr, "readBuffer()-- couldn't read "F_U64" bytes from '%s': n%s\n",
+ fprintf(stderr, "readBuffer()-- couldn't read " F_U64 " bytes from '%s': n%s\n",
len, _filename, strerror(errno)), exit(1);
// If we hit EOF, return a short read
diff --git a/src/AS_UTL/stddev.H b/src/AS_UTL/stddev.H
index 28dfe79..d27aca5 100644
--- a/src/AS_UTL/stddev.H
+++ b/src/AS_UTL/stddev.H
@@ -51,10 +51,10 @@ using namespace std;
template<typename TT>
class stdDev {
public:
- stdDev() {
- _mn = 0.0;
- _sn = 0.0;
- _nn = 0;
+ stdDev(double mn=0.0, double sn=0.0, uint32 nn=0) {
+ _mn = mn;
+ _sn = sn;
+ _nn = nn;
};
~stdDev() {
@@ -65,6 +65,9 @@ public:
double s0 = _sn;
uint32 n0 = _nn + 1;
+ if (_nn == 0x7fffffff)
+ fprintf(stderr, "ERROR: stdDev is full; can't insert() new value.\n"), exit(1);
+
if (_nn & 0x80000000)
fprintf(stderr, "ERROR: stdDev has been finalized; can't insert() new value.\n"), exit(1);
@@ -78,6 +81,9 @@ public:
double m0 = (n0 == 0) ? (0) : ((_nn * _mn - val) / n0);
double s0 = _sn - (val - m0) * (val - _mn);
+ if (_nn == 0)
+ fprintf(stderr, "ERROR: stdDev has no data; can't remove() old value.\n"), exit(1);
+
if (_nn & 0x80000000)
fprintf(stderr, "ERROR: stdDev has been finalized; can't remove() old value.\n"), exit(1);
@@ -514,7 +520,7 @@ public:
fprintf(F, "#%s\tquantity\n", label);
for (uint64 ii=0; ii <= _histogramMax; ii++)
- fprintf(F, F_U64"\t"F_U64"\n", ii, _histogram[ii]);
+ fprintf(F, F_U64"\t" F_U64 "\n", ii, _histogram[ii]);
};
diff --git a/src/AS_UTL/sweatShop.C b/src/AS_UTL/sweatShop.C
index 81068f2..3bbf2cd 100644
--- a/src/AS_UTL/sweatShop.C
+++ b/src/AS_UTL/sweatShop.C
@@ -27,6 +27,10 @@
* are Copyright 2014-2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -150,7 +154,7 @@ sweatShop::setThreadData(uint32 t, void *x) {
_workerData = new sweatShopWorker [_numberOfWorkers];
if (t >= _numberOfWorkers)
- fprintf(stderr, "sweatShop::setThreadData()-- worker ID "F_U32" more than number of workers="F_U32"\n", t, _numberOfWorkers), exit(1);
+ fprintf(stderr, "sweatShop::setThreadData()-- worker ID " F_U32 " more than number of workers=" F_U32 "\n", t, _numberOfWorkers), exit(1);
_workerData[t].threadUserData = x;
}
@@ -346,7 +350,7 @@ sweatShop::writer(void) {
naptime.tv_sec = 0;
naptime.tv_nsec = 5000000ULL;
- //fprintf(stderr, "Writer waits for slow thread at "F_U64".\n", _numberOutput);
+ //fprintf(stderr, "Writer waits for slow thread at " F_U64 ".\n", _numberOutput);
nanosleep(&naptime, 0L);
} else if (_writerP->_next == 0L) {
// Wait for the input.
@@ -354,7 +358,7 @@ sweatShop::writer(void) {
naptime.tv_sec = 0;
naptime.tv_nsec = 5000000ULL;
- //fprintf(stderr, "Writer waits for all threads at "F_U64".\n", _numberOutput);
+ //fprintf(stderr, "Writer waits for all threads at " F_U64 ".\n", _numberOutput);
nanosleep(&naptime, 0L);
} else {
(*_userWriter)(_globalUserData, _writerP->_user);
@@ -413,7 +417,7 @@ sweatShop::status(void) {
cpuPerSec = _numberComputed / (thisTime - startTime);
if (_showStatus) {
- fprintf(stderr, " %6.1f/s - %8"F_U64P" loaded; %8"F_U64P" queued for compute; %08"F_U64P" finished; %8"F_U64P" written; %8"F_U64P" queued for output)\r",
+ fprintf(stderr, " %6.1f/s - %8" F_U64P " loaded; %8" F_U64P " queued for compute; %08" F_U64P " finished; %8" F_U64P " written; %8" F_U64P " queued for output)\r",
cpuPerSec, _numberLoaded, deltaCPU, _numberComputed, _numberOutput, deltaOut);
fflush(stderr);
}
@@ -448,7 +452,7 @@ sweatShop::status(void) {
cpuPerSec = _numberComputed / (thisTime - startTime);
- fprintf(stderr, " %6.1f/s - %08"F_U64P" queued for compute; %08"F_U64P" finished; %08"F_U64P" queued for output)\n",
+ fprintf(stderr, " %6.1f/s - %08" F_U64P " queued for compute; %08" F_U64P " finished; %08" F_U64P " queued for output)\n",
cpuPerSec, deltaCPU, _numberComputed, deltaOut);
}
@@ -583,7 +587,7 @@ sweatShop::run(void *user, bool beVerbose) {
for (uint32 i=0; i<_numberOfWorkers; i++) {
err = pthread_create(&_workerData[i].threadID, &threadAttr, _sweatshop_workerThread, _workerData + i);
if (err)
- fprintf(stderr, "sweatShop::run()-- Failed to launch worker thread "F_U32": %s.\n", i, strerror(err)), exit(1);
+ fprintf(stderr, "sweatShop::run()-- Failed to launch worker thread " F_U32 ": %s.\n", i, strerror(err)), exit(1);
}
// Now sit back and relax.
@@ -603,7 +607,7 @@ sweatShop::run(void *user, bool beVerbose) {
for (uint32 i=0; i<_numberOfWorkers; i++) {
err = pthread_join(_workerData[i].threadID, 0L);
if (err)
- fprintf(stderr, "sweatShop::run()-- Failed to join worker thread "F_U32": %s.\n", i, strerror(err)), exit(1);
+ fprintf(stderr, "sweatShop::run()-- Failed to join worker thread " F_U32 ": %s.\n", i, strerror(err)), exit(1);
}
// Cleanup.
diff --git a/src/AS_UTL/writeBuffer.H b/src/AS_UTL/writeBuffer.H
new file mode 100644
index 0000000..7795a81
--- /dev/null
+++ b/src/AS_UTL/writeBuffer.H
@@ -0,0 +1,93 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-NOV-18
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#ifndef WRITE_BUFFER_H
+#define WRITE_BUFFER_H
+
+#include "AS_global.H"
+#include "AS_UTL_fileIO.H"
+
+class writeBuffer {
+  //  Collects small writes in a memory buffer and sends them to the file in larger pieces.
+public:
+  //  Opens 'filename' with fopen() mode 'filemode', exiting on failure.  Both strings are kept by pointer only.
+  writeBuffer(const char *filename, const char *filemode, uint64 bufferMax = 1024 * 1024) {
+    _filename = filename;
+    _filemode = filemode;
+    errno = 0;
+    _file = fopen(filename, filemode);
+    if (_file == NULL)                         //  Test the result; errno is only meaningful after a failure.
+      fprintf(stderr, "writeBuffer()-- Failed to open file '%s' with mode '%s': %s\n",
+              filename, filemode, strerror(errno)), exit(1);
+
+    _filePos   = AS_UTL_ftell(_file);
+    _bufferLen = 0;
+    _bufferMax = bufferMax;
+    _buffer    = new char [_bufferMax];
+  };
+
+  ~writeBuffer() {                             //  Writes out anything still buffered, then closes the file.
+    flush();
+    delete [] _buffer;
+    fclose(_file);
+  };
+
+  uint64  tell(void) { return(_filePos); };    //  Position counting bytes that are still in the buffer.
+
+  //  Appends 'length' bytes from 'data'.  Data larger than the whole buffer bypasses the buffer.
+  void    write(void *data, uint64 length) {
+    if ((_bufferMax < length) ||
+        (_bufferLen + length > _bufferMax))
+      flush();
+
+    if (_bufferMax < length)                   //  Send the caller's data, not the (now empty) buffer.
+      AS_UTL_safeWrite(_file, data, "writeBuffer", 1, length);
+    else {
+      memcpy(_buffer + _bufferLen, data, length);
+      _bufferLen += length;
+    }
+
+    _filePos += length;
+  };
+
+  const char *filename(void) { return(_filename); };
+
+private:
+  void    flush(void) {                        //  Sends the buffered bytes to the file and empties the buffer.
+    AS_UTL_safeWrite(_file, _buffer, "writeBuffer", 1, _bufferLen);
+    _bufferLen = 0;
+  };
+
+  const char  *_filename;                      //  As passed to the constructor; not owned.
+  const char  *_filemode;
+  FILE        *_file;
+  uint64       _filePos;
+
+  uint64       _bufferLen;                     //  Bytes currently held in _buffer.
+  uint64       _bufferMax;                     //  Allocated size of _buffer.
+  char        *_buffer;
+};
+
+#endif // WRITE_BUFFER_H
diff --git a/src/AS_global.C b/src/AS_global.C
index 3e5f0c3..533f428 100644
--- a/src/AS_global.C
+++ b/src/AS_global.C
@@ -56,8 +56,7 @@
//
int
AS_configure(int argc, char **argv) {
- char *p = NULL;
- int i, j;
+
#ifdef X86_GCC_LINUX
// Set the x86 FPU control word to force double precision rounding
@@ -71,6 +70,7 @@ AS_configure(int argc, char **argv) {
_FPU_SETCW( fpu_cw );
#endif
+
#ifdef _GLIBCXX_PARALLEL_SETTINGS_H
__gnu_parallel::_Settings s = __gnu_parallel::_Settings::get();
@@ -92,15 +92,24 @@ AS_configure(int argc, char **argv) {
__gnu_parallel::_Settings::set(s);
#endif
+
+ // Default to one thread. This is mostly to disable the parallel sort,
+ // which seems to have a few bugs left in it. e.g., a crash when using 48
+ // threads, but not when using 47, 49 or 64 threads.
+
+ omp_set_num_threads(1);
+
+
// Install a signal handler to catch seg faults and errors.
AS_UTL_installCrashCatcher();
+
//
// Et cetera.
//
- for (i=0; i<argc; i++) {
+ for (int32 i=0; i<argc; i++) {
if (strcmp(argv[i], "--version") == 0) {
fprintf(stderr, "Canu v%s.%s (+%s commits) r%s %s.\n",
CANU_VERSION_MAJOR,
@@ -112,23 +121,22 @@ AS_configure(int argc, char **argv) {
}
}
+
//
// Logging.
//
- p = getenv("CANU_DIRECTORY");
+ char *p = getenv("CANU_DIRECTORY");
if (p == NULL)
return(argc);
char D[FILENAME_MAX] = {0};
char N[FILENAME_MAX] = {0};
char H[1024] = {0}; // HOST_NAME_MAX? Undefined.
- char *E;
- FILE *F;
// Make a directory for logs. If an error, just return now, there's nothing we can log.
- sprintf(D, "%s/canu-logs", p);
+ snprintf(D, FILENAME_MAX, "%s/canu-logs", p);
errno = 0;
mkdir(D, S_IRWXU | S_IRWXG | S_IRWXO);
@@ -141,7 +149,7 @@ AS_configure(int argc, char **argv) {
// Our executable name is part of our unique filename too.
- E = argv[0] + strlen(argv[0]) - 1;
+ char *E = argv[0] + strlen(argv[0]) - 1;
while ((E != argv[0]) && (*E != '/'))
E--;
if (*E == '/')
@@ -149,15 +157,15 @@ AS_configure(int argc, char **argv) {
// Construct a name for this log, and open it. If we can't open it, just skip the log.
- sprintf(N, "%s/"F_U64"_%s_"F_U64"_%s",
- D,
- (uint64)time(NULL),
- H,
- (uint64)getpid(),
- E);
+ snprintf(N, FILENAME_MAX, "%s/" F_U64 "_%s_" F_U64 "_%s",
+ D,
+ (uint64)time(NULL),
+ H,
+ (uint64)getpid(),
+ E);
errno = 0;
- F = fopen(N, "w");
+ FILE *F = fopen(N, "w");
if ((errno != 0) || (F == NULL))
return(argc);
@@ -174,7 +182,7 @@ AS_configure(int argc, char **argv) {
fprintf(F, "Command:\n");
fprintf(F, "%s", argv[0]);
- for (i=1; i<argc; i++)
+ for (int32 i=1; i<argc; i++)
if (argv[i][0] == '-')
fprintf(F, " \\\n %s", argv[i]);
else
diff --git a/src/AS_global.H b/src/AS_global.H
index 03d2e06..62ad5f7 100644
--- a/src/AS_global.H
+++ b/src/AS_global.H
@@ -90,7 +90,6 @@
#include <sys/types.h>
#include <sys/stat.h>
-#include <sys/types.h>
#ifndef TRUE
#define TRUE true
@@ -166,27 +165,27 @@ typedef uint64_t uint64;
#define F_STRI "%*s"
// Integers
-#define F_S16 "%"PRId16
-#define F_S16P PRId16
-#define F_S16I "%*"PRId16
-#define F_U16 "%"PRIu16
-#define F_U16P PRIu16
-#define F_U16I "%*"PRIu16
-#define F_S32 "%"PRId32
-#define F_S32P PRId32
-#define F_S32I "%*"PRId32
-#define F_U32 "%"PRIu32
-#define F_U32P PRIu32
-#define F_U32I "%*"PRIu32
-#define F_S64 "%"PRId64
-#define F_S64P PRId64
-#define F_S64I "%*"PRId64
-#define F_U64 "%"PRIu64
-#define F_U64P PRIu64
-#define F_U64I "%*"PRIu64
-#define F_X64 "%016"PRIx64
-#define F_X64P PRIx64
-#define F_X64I "%*"PRIx64
+#define F_S16 "%" PRId16
+#define F_S16P PRId16
+#define F_S16I "%*" PRId16
+#define F_U16 "%" PRIu16
+#define F_U16P PRIu16
+#define F_U16I "%*" PRIu16
+#define F_S32 "%" PRId32
+#define F_S32P PRId32
+#define F_S32I "%*" PRId32
+#define F_U32 "%" PRIu32
+#define F_U32P PRIu32
+#define F_U32I "%*" PRIu32
+#define F_S64 "%" PRId64
+#define F_S64P PRId64
+#define F_S64I "%*" PRId64
+#define F_U64 "%" PRIu64
+#define F_U64P PRIu64
+#define F_U64I "%*" PRIu64
+#define F_X64 "%016" PRIx64
+#define F_X64P PRIx64
+#define F_X64I "%*" PRIx64
// Floating points
#define F_F32 "%f"
@@ -205,15 +204,6 @@ typedef uint64_t uint64;
#define F_OFF_TP F_S64P
#define F_OFF_TI F_S64I
-typedef uintptr_t INTPTR;
-
-
-// These are used to pad various structs to specific sizes
-#if ULONG_MAX == 0xffffffff
-#define TRUE32BIT
-#else
-#define TRUE64BIT
-#endif
#if defined(_FILE_OFFSET_BITS) && (_FILE_OFFSET_BITS == 32)
@@ -221,10 +211,6 @@ typedef uintptr_t INTPTR;
#endif
-// Enable troublesome asserts. These typically have work arounds, and occasionally trigger.
-// They're really only useful if the assembly can be debugged.
-#undef AGGRESSIVE_ASSERT
-
// perl's chomp is pretty nice
// Not a great place to put this, but it's getting used all over.
diff --git a/src/Makefile b/src/Makefile
index 19f4511..05b2385 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -442,7 +442,7 @@ ifeq (${MACHINETYPE}, amd64)
CC ?= gcc48
CXX ?= g++48
- CXXFLAGS := -pthread -fPIC -Wall -Wno-write-strings -Wno-unused -Wno-char-subscripts -Wno-sign-compare -Wformat -Wextra
+ CXXFLAGS := -pthread -fPIC -Wall -Wno-write-strings -Wno-unused -Wno-char-subscripts -Wno-sign-compare -Wformat -Wextra -Wno-parentheses
LDFLAGS := -pthread -lm
CXXFLAGS += -fopenmp
@@ -474,7 +474,7 @@ ifeq (${MACHINETYPE}, amd64)
#LDFLAGS += -L/usr/local/lib -lunwind -lunwind-x86_64
ifeq ($(BUILDDEBUG), 1)
- CXXFLAGS += -g
+ CXXFLAGS += -g3
else
ifeq ($(BUILDPROFILE), 1)
CXXFLAGS += -g3 -O4 -funroll-loops -fexpensive-optimizations -finline-functions -fno-omit-frame-pointer
@@ -547,7 +547,6 @@ all: UPDATE_VERSION MAKE_DIRS \
$(addprefix ${TARGET_DIR}/,${ALL_TGTS}) \
${TARGET_DIR}/canu \
${TARGET_DIR}/canu.defaults \
- ${TARGET_DIR}/lib/canu/lib \
${TARGET_DIR}/lib/canu/Consensus.pm \
${TARGET_DIR}/lib/canu/CorrectReads.pm \
${TARGET_DIR}/lib/canu/Configure.pm \
@@ -619,9 +618,6 @@ ${TARGET_DIR}/canu.defaults:
echo > ${TARGET_DIR}/canu.defaults "# Add site specific options (for setting up Grid or limiting memory/threads) here."
chmod -x ${TARGET_DIR}/canu.defaults
-${TARGET_DIR}/lib/canu/lib: pipelines/install-perl-libraries.sh
- cd pipelines && sh install-perl-libraries.sh ${TARGET_DIR}/lib/canu
-
${TARGET_DIR}/lib/canu/Consensus.pm: pipelines/canu/Consensus.pm
cp -pf pipelines/canu/Consensus.pm ${TARGET_DIR}/lib/canu/
diff --git a/src/bogart/AS_BAT_AssemblyGraph.C b/src/bogart/AS_BAT_AssemblyGraph.C
new file mode 100644
index 0000000..3d29933
--- /dev/null
+++ b/src/bogart/AS_BAT_AssemblyGraph.C
@@ -0,0 +1,971 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-JUL-21
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
+
+#include "intervalList.H"
+#include "stddev.H"
+
+#undef FILTER_DENSE_BUBBLES_FROM_GRAPH
+#define FILTER_DENSE_BUBBLES_THRESHOLD 3 // Retain bubbles if they have fewer than this number of edges to other tigs
+
+#undef LOG_GRAPH
+#undef LOG_GRAPH_ALL
+
+
+void
+AssemblyGraph::buildReverseEdges(void) {
+  // Regenerate _pReverse from _pForward: each placement is filed under every read it has an edge to.
+  writeStatus("AssemblyGraph()-- building reverse edges.\n");
+  const uint32 idLimit = RI->numReads() + 1;   //  Read IDs start at 1.
+
+  for (uint32 rid=1; rid<idLimit; rid++)
+    _pReverse[rid].clear();
+
+  for (uint32 rid=1; rid<idLimit; rid++) {
+    vector<BestPlacement> &forward = _pForward[rid];
+    for (uint32 idx=0; idx<forward.size(); idx++) {
+      BestPlacement &place = forward[idx];
+      BestReverse    back(rid, idx);
+      bool           hasC = (place.bestC.b_iid != 0);
+
+      // A placement with a contained edge must not also carry dovetail edges; that
+      // would break the logic used when rebuilding and outputting the graph.
+      if (hasC) {
+        assert(place.best5.b_iid == 0);
+        assert(place.best3.b_iid == 0);
+      }
+
+      // File the back-pointer under each read this placement has an edge to.
+      if (hasC)                    _pReverse[place.bestC.b_iid].push_back(back);
+      if (place.best5.b_iid != 0)  _pReverse[place.best5.b_iid].push_back(back);
+      if (place.best3.b_iid != 0)  _pReverse[place.best3.b_iid].push_back(back);
+
+      // Sanity: the hang signs required of contained, 5' and 3' edges, in that order.
+      assert((place.bestC.a_hang <= 0) && (place.bestC.b_hang >= 0));
+      assert((place.best5.a_hang <= 0) && (place.best5.b_hang <= 0));
+      assert((place.best3.a_hang >= 0) && (place.best3.b_hang >= 0));
+    }
+  }
+}
+
+// For each read that is in a tig (only the first and last read of each tig when tigEndsOnly is set), find its
+// placements with placeReadUsingOverlaps(), save the acceptable ones in _pForward, then build _pReverse.
+void
+AssemblyGraph::buildGraph(const char *UNUSED(prefix),
+ double deviationRepeat,
+ TigVector &tigs,
+ bool tigEndsOnly) {
+ uint32 fiLimit = RI->numReads();
+ uint32 numThreads = omp_get_max_threads();
+ uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99; // Chunk size passed to the dynamic OpenMP schedule below.
+
+ // Just some logging. Count the number of reads we try to place.
+ // NOTE(review): the nPlaced* and nFailed* counters are declared but never updated or reported in this function.
+ uint32 nToPlaceContained = 0;
+ uint32 nToPlace = 0;
+ uint32 nPlacedContained = 0;
+ uint32 nPlaced = 0;
+ uint32 nFailedContained = 0;
+ uint32 nFailed = 0;
+
+ for (uint32 fid=1; fid<RI->numReads()+1; fid++) {
+ if (tigs.inUnitig(fid) == 0) // Unplaced, don't care. These didn't assemble, and aren't contained.
+ continue;
+
+ if (OG->isContained(fid))
+ nToPlaceContained++;
+ else
+ nToPlace++;
+ }
+
+ writeStatus("\n");
+
+ writeStatus("AssemblyGraph()-- allocating vectors for placements, %.3fMB\n", // vector<> is 24 bytes, pretty tiny.
+ (sizeof(vector<BestPlacement>) + sizeof(vector<BestReverse>)) * (fiLimit + 1) / 1048576.0);
+
+ _pForward = new vector<BestPlacement> [fiLimit + 1];
+ _pReverse = new vector<BestReverse> [fiLimit + 1];
+
+ writeStatus("AssemblyGraph()-- finding edges for %u reads (%u contained), ignoring %u unplaced reads, with %d thread%s.\n",
+ nToPlaceContained + nToPlace,
+ nToPlaceContained,
+ RI->numReads() - nToPlaceContained - nToPlace,
+ numThreads, (numThreads == 1) ? "" : "s");
+
+ // Do the placing!
+
+#pragma omp parallel for schedule(dynamic, blockSize)
+ for (uint32 fi=1; fi<RI->numReads()+1; fi++) {
+ bool enableLog = true;
+
+ uint32 fiTigID = tigs.inUnitig(fi);
+
+ if (fiTigID == 0) // Unplaced, don't care.
+ continue;
+ // When tigEndsOnly is set, only the first and the last read of each tig are processed.
+ if (tigEndsOnly == true) {
+ uint32 f = tigs[fiTigID]->firstRead()->ident;
+ uint32 l = tigs[fiTigID]->lastRead()->ident;
+
+ if ((f != fi) && (l != fi)) // Not the first read and not the last read,
+ continue; // Don't care.
+ }
+
+ // Grab a bit about this read.
+
+ uint32 fiLen = RI->readLength(fi);
+ ufNode *fiRead = &tigs[fiTigID]->ufpath[ tigs.ufpathIdx(fi) ];
+ int32 fiMin = fiRead->position.min();
+ int32 fiMax = fiRead->position.max();
+
+ // Find ALL potential placements, regardless of error rate.
+
+ vector<overlapPlacement> placements;
+
+ placeReadUsingOverlaps(tigs, NULL, fi, placements);
+
+#ifdef LOG_GRAPH
+ //writeLog("AG()-- working on read %u with %u placements\n", fi, placements.size());
+#endif
+
+ // For each placement decide if the overlap is compatible with the tig.
+
+ for (uint32 pp=0; pp<placements.size(); pp++) {
+ Unitig *tig = tigs[placements[pp].tigID];
+
+ double erate = placements[pp].errors / placements[pp].aligned; // NOTE(review): presumably a floating-point division - confirm the field types.
+
+ // Ignore placements in singletons.
+ if (tig->ufpath.size() <= 1) {
+#ifdef LOG_GRAPH
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f SINGLETON\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate);
+#endif
+ continue;
+ }
+
+ int32 utgmin = placements[pp].position.min(); // Placement in unitig.
+ int32 utgmax = placements[pp].position.max();
+ bool utgfwd = placements[pp].position.isForward();
+
+ int32 ovlmin = placements[pp].verified.min(); // Placement in unitig, verified by overlaps.
+ int32 ovlmax = placements[pp].verified.max();
+
+ assert(placements[pp].covered.bgn < placements[pp].covered.end); // Coverage is always forward.
+
+ bool is5 = (placements[pp].covered.bgn == 0) ? true : false; // Placement covers the 5' end of the read
+ bool is3 = (placements[pp].covered.end == fiLen) ? true : false; // Placement covers the 3' end of the read
+
+
+ // Ignore placements that aren't overlaps (contained reads placed inside this read will do this).
+ if ((is5 == false) && (is3 == false)) {
+#ifdef LOG_GRAPH_ALL
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f SPANNED_REPEAT\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate);
+#endif
+ continue;
+ }
+
+ // Decide if the overlap is to the left (towards 0) or right (towards infinity) of us on the tig.
+ bool onLeft = (((utgfwd == true) && (is5 == true)) ||
+ ((utgfwd == false) && (is3 == true))) ? true : false;
+
+ bool onRight = (((utgfwd == true) && (is3 == true)) ||
+ ((utgfwd == false) && (is5 == true))) ? true : false;
+
+
+ // Decide if this is already captured in a tig. If so, we'll emit to GFA, but omit from our
+ // internal graph.
+ bool isTig = false;
+
+ if ((placements[pp].tigID == fiTigID) && (utgmin <= fiMax) && (fiMin <= utgmax))
+ isTig = true;
+
+ // Decide if the placement is compatible with the other reads in the tig.
+
+#define REPEAT_FRACTION 0.5
+ // Placements away from the read's own tig location are dropped when overlapConsistentWithTig() returns less than REPEAT_FRACTION.
+ if ((isTig == false) &&
+ (tig->overlapConsistentWithTig(deviationRepeat, ovlmin, ovlmax, erate) < REPEAT_FRACTION)) {
+#ifdef LOG_GRAPH_ALL
+ if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f HIGH_ERROR\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate);
+#endif
+ continue;
+ }
+
+ // A valid placement! Create a BestPlacement for it.
+
+ BestPlacement bp;
+
+#ifdef LOG_GRAPH
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f Fidx %6u Lidx %6u is5 %d is3 %d onLeft %d onRight %d VALID_PLACEMENT\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate,
+ placements[pp].tigFidx, placements[pp].tigLidx,
+ is5, is3, onLeft, onRight);
+#endif
+
+ // Find the reads we have overlaps to. The range of reads here is the first and last read in
+ // the tig layout that overlaps with ourself. We don't need to check that the reads overlap in the
+ // layout: the only false case I can think of involves contained reads.
+ //
+ // READ: -----------------------------------
+ // TIG: Fidx -----------------------------
+ // TIG: (1) ------
+ // TIG: --------------------------------------
+ // TIG: Lidx -----------------------------------------
+ // (2) ------
+ //
+ // The short read is placed at (1), but also has an overlap to us at (2).
+
+ set<uint32> tigReads;
+
+ for (uint32 rr=placements[pp].tigFidx; rr <= placements[pp].tigLidx; rr++)
+ tigReads.insert(tig->ufpath[rr].ident);
+
+ // Scan all overlaps. Decide if the overlap is to the L or R of the _placed_ read, and save
+ // the thickest overlap on the 5' or 3' end of the read.
+
+ uint32 no = 0;
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
+ // thickestC/5/3 hold the index into ovl[] of the overlap chosen for that edge, or UINT32_MAX if none was chosen.
+ uint32 thickestC = UINT32_MAX, thickestCident = 0;
+ uint32 thickest5 = UINT32_MAX, thickest5len = 0;
+ uint32 thickest3 = UINT32_MAX, thickest3len = 0;
+
+ for (uint32 oo=0; oo<no; oo++) {
+ if (tigReads.count(ovl[oo].b_iid) == 0) // Don't care about overlaps to reads not in the set.
+ continue;
+
+ uint32 olapLen = RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
+
+ if (ovl[oo].AisContainer() == true) {
+ continue;
+ }
+ // NOTE(review): this keeps the contained overlap with the LARGEST evalue, and one with evalue 0 can never be chosen because thickestCident starts at 0 - confirm this is intended.
+ else if ((ovl[oo].AisContained() == true) && (is5 == true) && (is3 == true)) {
+ if (thickestCident < ovl[oo].evalue) {
+ thickestC = oo;
+ thickestCident = ovl[oo].evalue;
+ bp.bestC = ovl[oo];
+ }
+ }
+
+ else if ((ovl[oo].AEndIs5prime() == true) && (is5 == true)) {
+ if (thickest5len < olapLen) {
+ thickest5 = oo;
+ thickest5len = olapLen;
+ bp.best5 = ovl[oo];
+ }
+ }
+
+ else if ((ovl[oo].AEndIs3prime() == true) && (is3 == true)) {
+ if (thickest3len < olapLen) {
+ thickest3 = oo;
+ thickest3len = olapLen;
+ bp.best3 = ovl[oo];
+ }
+ }
+ }
+
+ // If we have both 5' and 3' edges, delete the containment edge.
+
+ if ((bp.best5.b_iid != 0) && (bp.best3.b_iid != 0)) {
+ thickestC = UINT32_MAX; thickestCident = 0; bp.bestC = BAToverlap();
+ }
+
+ // If we have a containment edge, delete the 5' and 3' edges.
+
+ if (bp.bestC.b_iid != 0) {
+ thickest5 = UINT32_MAX; thickest5len = 0; bp.best5 = BAToverlap();
+ thickest3 = UINT32_MAX; thickest3len = 0; bp.best3 = BAToverlap();
+ }
+
+
+ // Save the edge.
+
+ bp.tigID = placements[pp].tigID;
+
+ bp.placedBgn = placements[pp].position.bgn;
+ bp.placedEnd = placements[pp].position.end;
+
+ bp.olapBgn = placements[pp].verified.bgn;
+ bp.olapEnd = placements[pp].verified.end;
+
+ bp.isContig = isTig;
+ bp.isUnitig = false;
+ bp.isBubble = false;
+ bp.isRepeat = false;
+
+ // If no contained, 5' or 3' overlap was chosen among the reads under this placement, there is
+ // no edge to record: log it (when LOG_GRAPH is defined) and move on to the next placement.
+
+ if ((thickestC == UINT32_MAX) &&
+ (thickest5 == UINT32_MAX) &&
+ (thickest3 == UINT32_MAX)) {
+#ifdef LOG_GRAPH
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f NO_EDGES Fidx %6u Lidx %6u is5 %d is3 %d onLeft %d onRight %d\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate,
+ placements[pp].tigFidx, placements[pp].tigLidx,
+ is5, is3, onLeft, onRight);
+#endif
+ continue;
+ }
+ assert((thickestC != 0) ||
+ (thickest5 != 0) ||
+ (thickest3 != 0));
+ // NOTE(review): the assert above fails only if all three indices equal 0; it does not verify that an edge was found.
+ // Save the BestPlacement
+ // Only _pForward[fi] is appended to here, and fi is the read of this loop iteration.
+ uint32 ff = _pForward[fi].size(); // NOTE(review): ff is not read again in this function.
+
+ _pForward[fi].push_back(bp);
+
+ // And now just log.
+
+#ifdef LOG_GRAPH
+ if (thickestC != UINT32_MAX) {
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f CONTAINED %8d (%8d %8d)%s\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate,
+ bp.bestC.b_iid, bp.best5.b_iid, bp.best3.b_iid,
+ (isTig == true) ? " IN_UNITIG" : "");
+ } else {
+ writeLog("AG()-- read %8u placement %2u -> tig %7u placed %9d-%9d verified %9d-%9d cov %7.5f erate %6.4f DOVETAIL (%8d) %8d %8d%s\n",
+ fi, pp,
+ placements[pp].tigID,
+ placements[pp].position.bgn, placements[pp].position.end,
+ placements[pp].verified.bgn, placements[pp].verified.end,
+ placements[pp].fCoverage, erate,
+ bp.bestC.b_iid, bp.best5.b_iid, bp.best3.b_iid,
+ (isTig == true) ? " IN_UNITIG" : "");
+ }
+#endif
+ } // Over all placements
+ } // Over all reads
+
+ buildReverseEdges();
+
+ writeStatus("AssemblyGraph()-- build complete.\n");
+}
+
+
+
+
+
+
+void
+placeAsContained(TigVector     &tigs,
+                 uint32         fi,
+                 BestPlacement &bp) {
+  // Recompute bp's tig and position for read fi from its containment overlap (bp.bestC).
+  BestEdgeOverlap  containEdge(bp.bestC);
+  Unitig          *hostTig = tigs[ tigs.inUnitig(containEdge.readId()) ];
+  ufNode           placed;
+  bool             success = hostTig->placeRead(placed, fi, bp.bestC.AEndIs3prime(), &containEdge);
+
+  if (success == false) {
+    fprintf(stderr, "WARNING: placeAsContained failed for fi=%u\n", fi);
+    assert(0);
+  }
+
+  // The overlapping region isn't known (without a lot of work), so it is set to an invalid range.
+  bp.olapBgn   = INT32_MIN;
+  bp.olapEnd   = INT32_MAX;
+  bp.tigID     = hostTig->id();
+  bp.placedBgn = placed.position.bgn;
+  bp.placedEnd = placed.position.end;
+  bp.isContig  = (tigs.inUnitig(fi) == tigs.inUnitig(containEdge.readId()));
+}
+
+
+
+// This test is correct, but it isn't used correctly. When rebuilding the graph, we don't know if
+// a read is fully covered. If it isn't fully covered, it isn't 'inContig' even if the positions
+// overlap.
+bool
+areReadsOverlapping(TigVector &tigs,
+                    uint32     ai,
+                    uint32     bi) {
+  Unitig *tigA = tigs[ tigs.inUnitig(ai) ];
+  Unitig *tigB = tigs[ tigs.inUnitig(bi) ];
+
+  if (tigA == tigB) {   //  Same tig: true only when the two layout intervals strictly intersect.
+    ufNode &nodeA = tigA->ufpath[ tigs.ufpathIdx(ai) ];
+    ufNode &nodeB = tigB->ufpath[ tigs.ufpathIdx(bi) ];
+    bool    aStartsBeforeBEnds = (nodeA.position.min() < nodeB.position.max());
+    return(aStartsBeforeBEnds && (nodeB.position.min() < nodeA.position.max()));
+  }
+
+  return(false);        //  Reads in different tigs are never reported as overlapping.
+}
+
+// Recompute bp's tig and position for read fi from its dovetail overlaps: from both edges when both are set (they
+// must land in the same tig, and the two computed positions are averaged), otherwise from whichever edge is set.
+void
+placeAsDovetail(TigVector &tigs,
+ uint32 fi,
+ BestPlacement &bp) {
+ BestEdgeOverlap edge5(bp.best5), edge3(bp.best3);
+ ufNode read5, read3;
+
+ if ((bp.best5.b_iid > 0) && (bp.best3.b_iid > 0)) {
+ Unitig *tig5 = tigs[ tigs.inUnitig(edge5.readId()) ];
+ Unitig *tig3 = tigs[ tigs.inUnitig(edge3.readId()) ];
+
+ assert(tig5->id() == tig3->id());
+
+ if ((tig5->placeRead(read5, fi, bp.best5.AEndIs3prime(), &edge5) == false) ||
+ (tig3->placeRead(read3, fi, bp.best3.AEndIs3prime(), &edge3) == false)) {
+ fprintf(stderr, "WARNING: placeAsDovetail 5' 3' failed for fi=%u\n", fi);
+ assert(0);
+ }
+
+ bp.tigID = tig5->id();
+ bp.placedBgn = (read5.position.bgn + read3.position.bgn) / 2;
+ bp.placedEnd = (read5.position.end + read3.position.end) / 2;
+ // In each branch the '#if 0' arm is a disabled alternative; only the '#else' arm is compiled.
+#if 0
+ bp.isContig = (areReadsOverlapping(tigs, fi, bp.best5.b_iid) &&
+ areReadsOverlapping(tigs, fi, bp.best3.b_iid));
+#else
+ if ((bp.isContig == true) && // Remove the isContig mark if this read is now
+ (tigs.inUnitig(fi) != bp.tigID)) // in a different tig than the two edges (which is unlikely).
+ bp.isContig = false;
+#endif
+ }
+
+ else if (bp.best5.b_iid > 0) {
+ Unitig *tig5 = tigs[ tigs.inUnitig(edge5.readId()) ];
+
+ if (tig5->placeRead(read5, fi, bp.best5.AEndIs3prime(), &edge5) == false) {
+ fprintf(stderr, "WARNING: placeAsDovetail 5' failed for fi=%u\n", fi);
+ assert(0);
+ }
+
+ bp.tigID = tig5->id();
+ bp.placedBgn = read5.position.bgn;
+ bp.placedEnd = read5.position.end;
+
+#if 0
+ bp.isContig = areReadsOverlapping(tigs, fi, bp.best5.b_iid);
+#else
+ if ((bp.isContig == true) && // Remove the isContig mark if this read is now
+ (tigs.inUnitig(fi) != bp.tigID)) // in a different tig than the edge.
+ bp.isContig = false;
+#endif
+ }
+
+ else if (bp.best3.b_iid > 0) {
+ Unitig *tig3 = tigs[ tigs.inUnitig(edge3.readId()) ];
+
+ if (tig3->placeRead(read3, fi, bp.best3.AEndIs3prime(), &edge3) == false) {
+ fprintf(stderr, "WARNING: placeAsDovetail 3' failed for fi=%u\n", fi);
+ assert(0);
+ }
+
+ bp.tigID = tig3->id();
+ bp.placedBgn = read3.position.bgn;
+ bp.placedEnd = read3.position.end;
+
+#if 0
+ bp.isContig = areReadsOverlapping(tigs, fi, bp.best3.b_iid);
+#else
+ if ((bp.isContig == true) && // Remove the isContig mark if this read is now
+ (tigs.inUnitig(fi) != bp.tigID)) // in a different tig than the edge.
+ bp.isContig = false;
+#endif
+ }
+ // NOTE(review): if neither best5 nor best3 is set, bp.tigID and the placed position are left exactly as passed in.
+ assert(tigs[bp.tigID] != NULL);
+
+ bp.olapBgn = INT32_MIN; // We don't know the overlapping region (without a lot
+ bp.olapEnd = INT32_MAX; // of work) so make it invalid.
+}
+
+
+// Re-place every saved forward placement against the given tigs, splitting a placement in two when its 5' and 3'
+// overlapping reads are in different tigs, then rebuild the reverse edges.
+void
+AssemblyGraph::rebuildGraph(TigVector &tigs) {
+
+ writeStatus("AssemblyGraph()-- rebuilding\n");
+
+ uint64 nContain = 0;
+ uint64 nSame = 0;
+ uint64 nSplit = 0;
+ // NOTE(review): these three counters are incremented below but never reported.
+ for (uint32 fi=1; fi<RI->numReads()+1; fi++) {
+ for (uint32 ff=0; ff<_pForward[fi].size(); ff++) {
+ BestPlacement &bp = _pForward[fi][ff];
+
+ // Figure out which tig each of our three overlaps is in.
+
+ uint32 t5 = (bp.best5.b_iid > 0) ? tigs.inUnitig(bp.best5.b_iid) : UINT32_MAX;
+ uint32 t3 = (bp.best3.b_iid > 0) ? tigs.inUnitig(bp.best3.b_iid) : UINT32_MAX;
+
+ //writeLog("AssemblyGraph()-- rebuilding read %u edge %u with overlaps %u %u %u\n",
+ // fi, ff, bp.bestC.b_iid, bp.best5.b_iid, bp.best3.b_iid);
+
+ // If a containment relationship, place it using the contain and update the placement.
+
+ if (bp.bestC.b_iid > 0) {
+ assert(bp.best5.b_iid == 0);
+ assert(bp.best3.b_iid == 0);
+
+ nContain++;
+ placeAsContained(tigs, fi, bp);
+ }
+
+ // Otherwise, dovetails. If both overlapping reads are in the same tig, place it and update
+ // the placement.
+
+ else if ((t5 == t3) || // Both in the same tig
+ (t5 == UINT32_MAX) || // 5' overlap isn't set
+ (t3 == UINT32_MAX)) { // 3' overlap isn't set
+ nSame++;
+ placeAsDovetail(tigs, fi, bp);
+ }
+
+ // Otherwise, yikes, our overlapping reads are in different tigs! We need to make new
+ // placements and delete the current one.
+
+ else {
+ BestPlacement bp5 = bp;
+ BestPlacement bp3 = bp;
+
+ bp5.best3 = BAToverlap(); // Erase the 3' overlap
+ bp3.best5 = BAToverlap(); // Erase the 5' overlap
+
+ assert(bp5.best5.b_iid != 0); // Overlap must exist!
+ assert(bp3.best3.b_iid != 0); // Overlap must exist!
+
+ nSplit++;
+ placeAsDovetail(tigs, fi, bp5);
+ placeAsDovetail(tigs, fi, bp3);
+
+ // Add the two placements to our list. We let one placement overwrite the current
+ // placement, move the placement after that to the end of the list, and overwrite
+ // that placement with our other new one.
+ // 'bp' refers into _pForward[fi] and is not used past this point; the push_back() below may reallocate the vector.
+ uint32 ll = _pForward[fi].size();
+
+ // There's a nasty case when ff is the last currently on the list; there isn't an ff+1
+ // element to move to the end of the list. So, we add a new element to the list -
+ // guaranteeing there is always an ff+1 element - then move, then replace.
+
+ _pForward[fi].push_back(BestPlacement());
+ // When ff was the last element, ff+1 == ll and this copies the new empty element onto itself.
+ _pForward[fi][ll] = _pForward[fi][ff+1];
+
+ _pForward[fi][ff] = bp5;
+ _pForward[fi][ff+1] = bp3;
+ // The element moved to the end is still visited later: the loop condition re-reads size() every pass.
+ // Skip the edge we just added.
+
+ ff++;
+ }
+ }
+ }
+
+ buildReverseEdges();
+
+ writeStatus("AssemblyGraph()-- rebuild complete.\n");
+}
+
+
+
+
+
+// Filter edges that originate from the middle of a tig.
+// Need to save interior edges as long as they are consistent with a boundary edge.
+
+void
+AssemblyGraph::filterEdges(TigVector &tigs) {
+  // Flags forward placements that leave the interior of a tig as repeats, logs how many tigs each read
+  // still has edges to, then tallies and reports the classifications.  Only isRepeat flags are modified.
+  uint64  nUnitig = 0;
+  uint64  nContig = 0;
+
+  uint64  nMiddleFiltered = 0, nMiddleReads = 0;
+  uint64  nRepeatFiltered = 0, nRepeatReads = 0;
+
+  uint64  nIntersecting = 0;
+
+  uint64  nRepeatEdges = 0;
+  uint64  nBubbleEdges = 0;
+
+  writeStatus("AssemblyGraph()-- filtering edges\n");
+
+  // Mark edges that are from the interior of a tig as 'repeat'.
+
+  for (uint32 fi=1; fi<RI->numReads()+1; fi++) {
+    if (_pForward[fi].size() == 0)
+      continue;
+
+    uint32   tT   = tigs.inUnitig(fi);
+    Unitig  *tig  = tigs[tT];
+    ufNode  &read = tig->ufpath[tigs.ufpathIdx(fi)];
+
+    bool     hadMiddle = false;
+
+    for (uint32 ff=0; ff<_pForward[fi].size(); ff++) {
+      BestPlacement &bp = _pForward[fi][ff];
+
+      // Edges forming the tig are not repeats.
+
+      if (bp.isUnitig == true)  continue;
+      if (bp.isContig == true)  continue;
+
+      // Edges from the end of a tig are not repeats: the read touches a tig end, and the only edge set is the one off that end.
+
+      if (((read.position.min() == 0)                && (read.position.isForward()) && (bp.best5.b_iid >  0) && (bp.best3.b_iid == 0)) ||
+          ((read.position.min() == 0)                && (read.position.isReverse()) && (bp.best5.b_iid == 0) && (bp.best3.b_iid >  0)) ||
+          ((read.position.max() == tig->getLength()) && (read.position.isForward()) && (bp.best5.b_iid == 0) && (bp.best3.b_iid >  0)) ||
+          ((read.position.max() == tig->getLength()) && (read.position.isReverse()) && (bp.best5.b_iid >  0) && (bp.best3.b_iid == 0))) {
+        nIntersecting++;
+        continue;
+      }
+
+      nMiddleFiltered++;
+
+      bp.isRepeat = true;
+      hadMiddle   = true;
+    }
+
+    if (hadMiddle)
+      nMiddleReads++;
+  }
+
+  // Filter edges that hit too many tigs (the filtering itself needs FILTER_DENSE_BUBBLES_FROM_GRAPH).
+
+  for (uint32 fi=1; fi<RI->numReads()+1; fi++) {
+    if (_pForward[fi].size() == 0)
+      continue;
+
+    uint32       tT = tigs.inUnitig(fi);
+    set<uint32>  hits;
+
+    // Collect the distinct destination tigs of the placements that are neither part of a
+    // tig nor already marked as repeat.
+
+    for (uint32 ff=0; ff<_pForward[fi].size(); ff++) {
+      BestPlacement &bp = _pForward[fi][ff];
+
+      assert(bp.isUnitig == false);
+
+      if (bp.isUnitig == true)  { continue; }  //  Skip edges that are in tigs
+      if (bp.isContig == true)  { continue; }  //
+      if (bp.isRepeat == true)  { continue; }  //  Skip edges that are already ignored
+
+      hits.insert(bp.tigID);
+    }
+
+    // set::size() returns a size_t; convert it explicitly so the argument matches the %u conversion.
+
+    if (hits.size() > 0)
+      writeLog("AG()-- read %u in tig %u has edges to %u tigs\n", fi, tT, (uint32)hits.size());
+
+#ifdef FILTER_DENSE_BUBBLES_FROM_GRAPH
+    // If only a few other tigs are involved, keep all.
+    if (hits.size() <= FILTER_DENSE_BUBBLES_THRESHOLD)
+      continue;
+
+    // Otherwise, mark all edges as repeat.
+
+    nRepeatReads++;
+
+    for (uint32 ff=0; ff<_pForward[fi].size(); ff++) {
+      BestPlacement &bp = _pForward[fi][ff];
+
+      assert(bp.isUnitig == false);
+
+      if (bp.isUnitig == true)  { continue; }  //  Skip edges that are in tigs
+      if (bp.isContig == true)  { continue; }  //
+      if (bp.isRepeat == true)  { continue; }  //  Skip edges that are already ignored
+
+      nRepeatFiltered++;
+
+      bp.isRepeat = true;
+    }
+#endif
+  }
+
+  // Generate statistics.  An edge that is not part of a tig is either a repeat edge or a bubble edge.
+
+  for (uint32 fi=1; fi<RI->numReads()+1; fi++) {
+    for (uint32 ff=0; ff<_pForward[fi].size(); ff++) {
+      BestPlacement &bp = _pForward[fi][ff];
+
+      if (bp.isUnitig == true)  { nUnitig++;       continue; }
+      if (bp.isContig == true)  { nContig++;       continue; }
+      if (bp.isRepeat == true)  { nRepeatEdges++;  }
+      else                      { nBubbleEdges++;  }
+    }
+  }
+
+  // Report
+
+  writeStatus("AssemblyGraph()-- " F_U64 " contig edges and " F_U64 " unitig edges.\n", nContig, nUnitig);
+  writeStatus("AssemblyGraph()-- " F_U64 " bubble edges and " F_U64 " repeat edges.\n", nBubbleEdges, nRepeatEdges);
+  writeStatus("AssemblyGraph()-- " F_U64 " middle contig edges filtered from " F_U64 " reads.\n", nMiddleFiltered, nMiddleReads);
+  writeStatus("AssemblyGraph()-- " F_U64 " repeat end edges filtered from " F_U64 " reads.\n", nRepeatFiltered, nRepeatReads);
+  writeStatus("AssemblyGraph()-- " F_U64 " repeat edges (not output).\n", nRepeatEdges);
+  writeStatus("AssemblyGraph()-- " F_U64 " bubble edges.\n", nBubbleEdges);
+  writeStatus("AssemblyGraph()-- " F_U64 " intersecting edges (from the end of a tig to somewhere else).\n", nIntersecting);
+}
+
+
+
+
+
+
+bool
+reportReadGraph_reportEdge(TigVector      &tigs,
+                           BestPlacement  &pf,
+                           bool            skipBubble,
+                           bool            skipRepeat,
+                           bool           &reportC,
+                           bool           &report5,
+                           bool           &report3) {
+  // Decide which of a placement's three edges should be output.  The three report
+  // flags are always set; the return value is true when at least one of them is.
+  reportC = report5 = report3 = false;
+
+  // Placements flagged as bubble or repeat are dropped when the caller asks for it.
+  if (((skipBubble == true) && (pf.isBubble == true)) ||
+      ((skipRepeat == true) && (pf.isRepeat == true)))
+    return(false);
+
+  // If the destination tig is missing or is unassembled, all edges are ignored.
+  Unitig *target = tigs[pf.tigID];
+  if ((target == NULL) || (target->_isUnassembled == true))
+    return(false);
+
+  // An edge is reported when its far read is in a tig, and that tig is not unassembled.
+  uint32  tigC = tigs.inUnitig(pf.bestC.b_iid);
+  uint32  tig5 = tigs.inUnitig(pf.best5.b_iid);
+  uint32  tig3 = tigs.inUnitig(pf.best3.b_iid);
+
+  reportC = (tigC != 0) && (tigs[tigC]->_isUnassembled == false);
+  report5 = (tig5 != 0) && (tigs[tig5]->_isUnassembled == false);
+  report3 = (tig3 != 0) && (tigs[tig3]->_isUnassembled == false);
+
+  return(reportC || report5 || report3);
+}
+
+
+// SWIPED FROM BestOverlapGraph::reportBestEdges
+
+void
+AssemblyGraph::reportReadGraph(TigVector &tigs, const char *prefix, const char *label) {
+  char    N[FILENAME_MAX];
+  FILE   *BEG = NULL;
+
+  bool    skipBubble = true;
+  bool    skipRepeat = true;
+
+  uint64  nEdgeToUnasm = 0;
+  uint64  nBubble      = 0;
+  uint64  nRepeat      = 0;
+
+  // Write the read graph as GFA.  The name is built first so the status line names the file actually written.
+  snprintf(N, FILENAME_MAX, "%s.%s.assembly.gfa", prefix, label);
+  writeStatus("AssemblyGraph()-- generating '%s'.\n", N);
+
+  BEG = fopen(N, "w");
+  if (BEG == NULL) {
+    writeStatus("AssemblyGraph()-- failed to open '%s' for writing; no graph written.\n", N);
+    return;
+  }
+
+  fprintf(BEG, "H\tVN:Z:bogart/edges\n");
+
+  // First, figure out what sequences are used.  A sequence is used if it has forward edges,
+  // or if it is referred to by a forward edge.  Flagged placements are counted here for the report.
+
+  uint32 *used = new uint32 [RI->numReads() + 1];
+  memset(used, 0, sizeof(uint32) * (RI->numReads() + 1));
+
+  for (uint32 fi=1; fi<RI->numReads() + 1; fi++) {
+    for (uint32 pp=0; pp<_pForward[fi].size(); pp++) {
+      BestPlacement  &pf = _pForward[fi][pp];
+      bool            reportC=false, report5=false, report3=false;
+
+      if (pf.isBubble == true)  nBubble++;
+      if (pf.isRepeat == true)  nRepeat++;
+
+      if ((tigs.inUnitig(pf.bestC.b_iid) != 0) && (tigs[ tigs.inUnitig(pf.bestC.b_iid) ]->_isUnassembled == true))
+        nEdgeToUnasm++;
+      if ((tigs.inUnitig(pf.best5.b_iid) != 0) && (tigs[ tigs.inUnitig(pf.best5.b_iid) ]->_isUnassembled == true))
+        nEdgeToUnasm++;
+      if ((tigs.inUnitig(pf.best3.b_iid) != 0) && (tigs[ tigs.inUnitig(pf.best3.b_iid) ]->_isUnassembled == true))
+        nEdgeToUnasm++;
+
+      if (reportReadGraph_reportEdge(tigs, pf, skipBubble, skipRepeat, reportC, report5, report3) == false)
+        continue;
+
+      used[fi] = 1;
+
+      if (reportC)  used[pf.bestC.b_iid] = 1;
+      if (report5)  used[pf.best5.b_iid] = 1;
+      if (report3)  used[pf.best3.b_iid] = 1;
+    }
+  }
+
+  writeStatus("AssemblyGraph()-- Found " F_U64 " edges to unassembled contigs.\n", nEdgeToUnasm);
+
+  // Then write those sequences.
+
+  for (uint32 fi=1; fi<RI->numReads() + 1; fi++)
+    if (used[fi] == 1)
+      fprintf(BEG, "S\tread%08u\t*\tLN:i:%u\n", fi, RI->readLength(fi));
+
+  delete [] used;
+
+  // Now, report edges. GFA wants edges in exactly this format:
+  //
+  // -------------
+  // -------------
+  //
+  // with read orientation given by +/-. Conveniently, this is what we've saved (for the edges).
+
+  uint64 nTig[3] = {0,0,0}; // Number of edges - both contig and unitig
+  uint64 nCtg[3] = {0,0,0}; // Number of edges - contig only
+  uint64 nUtg[3] = {0,0,0}; // Number of edges - unitig only (should be zero)
+  uint64 nAsm[3] = {0,0,0}; // Number of edges - between contigs
+
+  for (uint32 fi=1; fi<RI->numReads() + 1; fi++) {
+    for (uint32 pp=0; pp<_pForward[fi].size(); pp++) {
+      BestPlacement  &pf = _pForward[fi][pp];
+      bool            reportC=false, report5=false, report3=false;
+
+      if (reportReadGraph_reportEdge(tigs, pf, skipBubble, skipRepeat, reportC, report5, report3) == false)
+        continue;
+
+      // Some statistics - number of edges of each type (in a contig, in a unitig, in both (tig), in neither (asm))
+
+      if ((pf.isContig == true) && (pf.isUnitig == true)) {
+        if (reportC == true) nTig[0]++;
+        if (report5 == true) nTig[1]++;
+        if (report3 == true) nTig[2]++;
+      }
+
+      if ((pf.isContig == true) && (pf.isUnitig == false)) {
+        if (reportC == true) nCtg[0]++;
+        if (report5 == true) nCtg[1]++;
+        if (report3 == true) nCtg[2]++;
+      }
+
+      if ((pf.isContig == false) && (pf.isUnitig == true)) {
+        if (reportC == true) nUtg[0]++;
+        if (report5 == true) nUtg[1]++;
+        if (report3 == true) nUtg[2]++;
+      }
+
+      if ((pf.isContig == false) && (pf.isUnitig == false)) {
+        if (reportC == true) nAsm[0]++;
+        if (report5 == true) nAsm[1]++;
+        if (report3 == true) nAsm[2]++;
+      }
+
+      // Finally, output the edge.  The ic/iu/ib/ir tags carry isContig, isUnitig, isBubble and isRepeat.
+
+      if (reportC)
+        fprintf(BEG, "C\tread%08u\t+\tread%08u\t%c\t%u\t%uM\tic:i:%d\tiu:i:%d\tib:i:%d\tir:i:%d\n",
+                fi,
+                pf.bestC.b_iid, pf.bestC.flipped ? '-' : '+',
+                -pf.bestC.a_hang,
+                RI->readLength(fi),
+                pf.isContig,
+                pf.isUnitig,
+                pf.isBubble,
+                pf.isRepeat);
+
+      if (report5)
+        fprintf(BEG, "L\tread%08u\t-\tread%08u\t%c\t%uM\tic:i:%d\tiu:i:%d\tib:i:%d\tir:i:%d\n",
+                fi,
+                pf.best5.b_iid, pf.best5.BEndIs3prime() ? '-' : '+',
+                RI->overlapLength(fi, pf.best5.b_iid, pf.best5.a_hang, pf.best5.b_hang),
+                pf.isContig,
+                pf.isUnitig,
+                pf.isBubble,
+                pf.isRepeat);
+
+      if (report3)
+        fprintf(BEG, "L\tread%08u\t+\tread%08u\t%c\t%uM\tic:i:%d\tiu:i:%d\tib:i:%d\tir:i:%d\n",
+                fi,
+                pf.best3.b_iid, pf.best3.BEndIs3prime() ? '-' : '+',
+                RI->overlapLength(fi, pf.best3.b_iid, pf.best3.a_hang, pf.best3.b_hang),
+                pf.isContig,
+                pf.isUnitig,
+                pf.isBubble,
+                pf.isRepeat);
+    }
+  }
+
+  fclose(BEG);
+
+  // And report statistics.
+
+  writeStatus("AssemblyGraph()-- %8" F_U64P " bubble placements\n", nBubble);
+  writeStatus("AssemblyGraph()-- %8" F_U64P " repeat placements\n", nRepeat);
+  writeStatus("\n");
+  writeStatus("AssemblyGraph()-- Intratig edges: %8" F_U64P " contained %8" F_U64P " 5' %8" F_U64P " 3' (in both contig and unitig)\n", nTig[0], nTig[1], nTig[2]);
+  writeStatus("AssemblyGraph()-- Contig only edges: %8" F_U64P " contained %8" F_U64P " 5' %8" F_U64P " 3'\n", nCtg[0], nCtg[1], nCtg[2]);
+  writeStatus("AssemblyGraph()-- Unitig only edges: %8" F_U64P " contained %8" F_U64P " 5' %8" F_U64P " 3'\n", nUtg[0], nUtg[1], nUtg[2]);
+  writeStatus("AssemblyGraph()-- Intercontig edges: %8" F_U64P " contained %8" F_U64P " 5' %8" F_U64P " 3' (in neither contig nor unitig)\n", nAsm[0], nAsm[1], nAsm[2]);
+}
+
diff --git a/src/bogart/AS_BAT_AssemblyGraph.H b/src/bogart/AS_BAT_AssemblyGraph.H
new file mode 100644
index 0000000..89624c7
--- /dev/null
+++ b/src/bogart/AS_BAT_AssemblyGraph.H
@@ -0,0 +1,133 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-JUL-21
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#ifndef INCLUDE_AS_BAT_ASSEMBLYGRAPH
+#define INCLUDE_AS_BAT_ASSEMBLYGRAPH
+
+#include "AS_global.H"
+#include "AS_BAT_OverlapCache.H"
+#include "AS_BAT_BestOverlapGraph.H" // For ReadEnd
+#include "AS_BAT_Unitig.H" // For SeqInterval
+#include "AS_BAT_TigVector.H"
+
+
+class BestPlacement { // One placement of a read in a tig, together with the overlaps reported for it.
+public:
+ BestPlacement() { // Defaults: tigID is UINT32_MAX, both intervals run INT32_MIN..INT32_MAX, all flags false.
+ tigID = UINT32_MAX;
+
+ placedBgn = INT32_MIN;
+ placedEnd = INT32_MAX;
+
+ olapBgn = INT32_MIN;
+ olapEnd = INT32_MAX;
+
+ isContig = false;
+ isUnitig = false;
+ isBubble = false;
+ isRepeat = false;
+ };
+ ~BestPlacement() {
+ };
+
+ uint32 tigID; // Which tig this is placed in.
+
+ int32 placedBgn; // Position in the tig. Can extend negative.
+ int32 placedEnd; // End of that placed position.
+
+ int32 olapBgn; // Position in the tig covered by overlaps.
+ int32 olapEnd; // End of that covered span.
+
+ bool isContig; // This placement is in a contig
+ bool isUnitig; // This placement is in a unitig
+ bool isBubble; // This placement is to an unambiguous region in a contig
+ bool isRepeat; // This placement is to an ambiguous region in a contig that was split
+
+ BAToverlap bestC; // Overlap reported as the containment ('C') edge -- presumably the best one found; confirm in buildGraph().
+ BAToverlap best5; // Overlap reported as the 5' link ('L') edge.
+ BAToverlap best3; // Overlap reported as the 3' link ('L') edge.
+};
+
+
+
+class BestReverse { // Names one forward placement by (readID, placeID); see the field comments below.
+public:
+ BestReverse() { // Default: both indices are zero.
+ readID = 0;
+ placeID = 0;
+ };
+ BestReverse(uint32 id, uint32 pp) { // Stores id as readID and pp as placeID.
+ readID = id;
+ placeID = pp;
+ };
+ ~BestReverse() {
+ };
+
+ uint32 readID; // readID we have an overlap from; Index into _pForward
+ uint32 placeID; // index into the vector for _pForward[readID]
+};
+
+
+
+class AssemblyGraph { // Holds, per read, its placements in tigs (forward) and the reads overlapping it (reverse).
+public:
+ AssemblyGraph(const char *prefix,
+ double deviationRepeat,
+ TigVector &tigs,
+ bool tigEndsOnly = false) { // All construction work is delegated to buildGraph().
+ buildGraph(prefix, deviationRepeat, tigs, tigEndsOnly);
+ }
+ // NOTE(review): both arrays are freed with delete[] but no copy constructor/assignment is declared; a copy would double-free -- confirm instances are never copied.
+ ~AssemblyGraph() {
+ delete [] _pForward;
+ delete [] _pReverse;
+ };
+
+ // Accessors index the arrays directly with fi and return a mutable reference; no range check is made here.
+public:
+ vector<BestPlacement> &getForward(uint32 fi) { return(_pForward[fi]); };
+ vector<BestReverse> &getReverse(uint32 fi) { return(_pReverse[fi]); };
+
+ // Graph construction, maintenance and reporting; these are only declared here, not defined.
+public:
+ void buildReverseEdges(void);
+ void buildGraph(const char *prefix,
+ double deviationRepeat,
+ TigVector &tigs,
+ bool tigEndsOnly);
+
+ void rebuildGraph(TigVector &tigs);
+ void filterEdges(TigVector &tigs);
+ void reportReadGraph(TigVector &tigs, const char *prefix, const char *label);
+
+private:
+ vector<BestPlacement> *_pForward; // Where each read is placed in other tigs
+ vector<BestReverse> *_pReverse; // What reads overlap to me
+};
+
+
+
+
+#endif // INCLUDE_AS_BAT_ASSEMBLYGRAPH
diff --git a/src/bogart/AS_BAT_BestOverlapGraph.C b/src/bogart/AS_BAT_BestOverlapGraph.C
index d749b9f..6d20b3d 100644
--- a/src/bogart/AS_BAT_BestOverlapGraph.C
+++ b/src/bogart/AS_BAT_BestOverlapGraph.C
@@ -39,7 +39,7 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
@@ -51,22 +51,22 @@
void
-BestOverlapGraph::removeSuspicious(void) {
- uint32 fiLimit = FI->numFragments();
+BestOverlapGraph::removeSuspicious(const char *UNUSED(prefix)) {
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
- writeLog("BestOverlapGraph()-- removing suspicious reads from graph, with %d threads.\n", numThreads);
+ writeStatus("BestOverlapGraph()-- removing suspicious reads from graph, with %d thread%s.\n", numThreads, (numThreads == 1) ? "" : "s");
#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 fi=1; fi <= fiLimit; fi++) {
uint32 no = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_EVALUE, no);
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
bool verified = false;
intervalList<int32> IL;
- uint32 fLen = FI->fragmentLength(fi);
+ uint32 fLen = RI->readLength(fi);
for (uint32 ii=0; (ii<no) && (verified == false); ii++) {
if (isOverlapBadQuality(ovl[ii]))
@@ -101,21 +101,26 @@ BestOverlapGraph::removeSuspicious(void) {
if (verified == false) {
#pragma omp critical (suspInsert)
- _suspicious.insert(fi);
+ {
+ _suspicious.insert(fi);
+ _nSuspicious++; // Count the read; the total is logged as "reads have a suspicious overlap pattern".
+ }
}
}
- writeLog("BestOverlapGraph()-- marked "F_U64" reads as suspicious.\n", _suspicious.size());
+ writeStatus("BestOverlapGraph()-- marked " F_U64 " reads as suspicious.\n", _suspicious.size());
}
void
BestOverlapGraph::removeHighErrorBestEdges(void) {
- uint32 fiLimit = FI->numFragments();
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
+ writeStatus("BestOverlapGraph()-- analyzing best edges to find useful edge error rate\n");
+
stdDev<double> edgeStats;
// Find the overlap for every best edge.
@@ -128,16 +133,13 @@ BestOverlapGraph::removeHighErrorBestEdges(void) {
BestEdgeOverlap *b5 = getBestEdgeOverlap(fi, false);
BestEdgeOverlap *b3 = getBestEdgeOverlap(fi, true);
- if (b5->fragId() != 0) edgeStats.insert(erates[eratesLen++] = b5->erate());
- if (b3->fragId() != 0) edgeStats.insert(erates[eratesLen++] = b3->erate());
+ if (b5->readId() != 0) edgeStats.insert(erates[eratesLen++] = b5->erate());
+ if (b3->readId() != 0) edgeStats.insert(erates[eratesLen++] = b3->erate());
}
_mean = edgeStats.mean();
_stddev = edgeStats.stddev();
- writeLog("removeHighErrorBestEdges()-- with %u points - mean %f stddev %f -- would use overlaps below %f fraction error\n",
- edgeStats.size(), _mean, _stddev, _mean + _deviationGraph * _stddev);
-
// Find the median and absolute deviations.
sort(erates, erates+eratesLen);
@@ -159,54 +161,48 @@ BestOverlapGraph::removeHighErrorBestEdges(void) {
delete [] absdev;
delete [] erates;
- writeLog("removeHighErrorBestEdges()-- with %u points - median %f mad %f - would use overlaps below %f fraction error\n",
- edgeStats.size(), _median, _mad, _median + _deviationGraph * 1.4826 * _mad);
+ // Compute an error limit based on the median or absolute deviation.
- // The real filtering is done on the next pass through findEdges(). Here, we just report statistics.
+ double Tmean = _mean + _deviationGraph * _stddev;
+ double Tmad = _median + _deviationGraph * 1.4826 * _mad;
- uint32 noedge = 0;
- uint32 removed = 0;
- uint32 retained = 0;
+ _errorLimit = (_median > 1e-10) ? Tmad : Tmean;
+
+ // The real filtering is done on the next pass through findEdges(). Here, we're just collecting statistics.
+
+ uint32 oneFiltered = 0;
+ uint32 twoFiltered = 0;
for (uint32 fi=1; fi <= fiLimit; fi++) {
BestEdgeOverlap *b5 = getBestEdgeOverlap(fi, false);
BestEdgeOverlap *b3 = getBestEdgeOverlap(fi, true);
- if (b5->fragId() == 0)
- noedge++;
- else if (b5->erate() > _mean + _deviationGraph * _stddev)
- removed++;
- else
- retained++;
-
- if (b3->fragId() == 0)
- noedge++;
- else if (b3->erate() > _mean + _deviationGraph * _stddev)
- removed++;
- else
- retained++;
+ bool b5filtered = (b5->erate() > _errorLimit);
+ bool b3filtered = (b3->erate() > _errorLimit);
+
+ if (b5filtered && b3filtered)
+ _n2EdgeFiltered++;
+ else if (b5filtered || b3filtered)
+ _n1EdgeFiltered++;
}
- writeLog("removeHighErrorBestEdges()-- %u ends have no best edge; %u ends are suspiciously high error; %u ends are acceptable.\n",
- noedge, removed, retained);
+ writeLog("\n");
+ writeLog("ERROR RATES (%u samples)\n", edgeStats.size());
+ writeLog("-----------\n");
+ writeLog("mean %10.8f stddev %10.8f -> %10.8f fraction error = %10.6f%% error\n", _mean, _stddev, Tmean, 100.0 * Tmean);
+ writeLog("median %10.8f mad %10.8f -> %10.8f fraction error = %10.6f%% error\n", _median, _mad, Tmad, 100.0 * Tmad);
+ writeLog("\n");
}
void
-BestOverlapGraph::removeLopsidedEdges(void) {
- uint32 fiLimit = FI->numFragments();
+BestOverlapGraph::removeLopsidedEdges(const char *UNUSED(prefix)) {
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
- writeLog("BestOverlapGraph()-- removing suspicious edges from graph, with %d threads.\n", numThreads);
-
- uint32 nSuspicious = 0;
- uint32 nContained = 0;
- uint32 nSpur = 0;
- uint32 nMutual = 0;
- uint32 nAccepted = 0;
- uint32 nRejected = 0;
+ writeStatus("BestOverlapGraph()-- removing suspicious edges from graph, with %d thread%s.\n", numThreads, (numThreads == 1) ? "" : "s");
#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 fi=1; fi <= fiLimit; fi++) {
@@ -217,53 +213,36 @@ BestOverlapGraph::removeLopsidedEdges(void) {
// do not have best edges back to them, and it's possible to find reads B where best edge A->B
// exists, yet no best edge from B exists.
- if (isSuspicious(fi) == true) {
-#pragma omp atomic
- nSuspicious++;
- continue;
- }
-
- if (isContained(fi) == true) {
-#pragma omp atomic
- nContained++;
- continue;
- }
-
- if ((this5->fragId() == 0) ||
- (this3->fragId() == 0)) {
-#pragma omp atomic
- nSpur++;
+ if ((isSuspicious(fi) == true) || // Suspicious overlap pattern
+ (isContained(fi) == true) || // Contained read (duh!)
+ ((this5->readId() == 0) || // Spur read
+ (this3->readId() == 0)))
continue;
- }
// Find the overlap for this5 and this3.
- int32 this5ovlLen = FI->overlapLength(fi, this5->fragId(), this5->ahang(), this5->bhang());
- int32 this3ovlLen = FI->overlapLength(fi, this3->fragId(), this3->ahang(), this3->bhang());
+ int32 this5ovlLen = RI->overlapLength(fi, this5->readId(), this5->ahang(), this5->bhang());
+ int32 this3ovlLen = RI->overlapLength(fi, this3->readId(), this3->ahang(), this3->bhang());
// Find the edges for our best overlaps.
- BestEdgeOverlap *that5 = getBestEdgeOverlap(this5->fragId(), this5->frag3p());
- BestEdgeOverlap *that3 = getBestEdgeOverlap(this3->fragId(), this3->frag3p());
+ BestEdgeOverlap *that5 = getBestEdgeOverlap(this5->readId(), this5->read3p());
+ BestEdgeOverlap *that3 = getBestEdgeOverlap(this3->readId(), this3->read3p());
- // If both point back to us, we're done.
+ // If both point back to us, we're done. These must be symmetric, else overlapper is bonkers.
- if ((that5->fragId() == fi) && (that5->frag3p() == false) &&
- (that3->fragId() == fi) && (that3->frag3p() == true)) {
-#pragma omp atomic
- nMutual++;
+ if ((that5->readId() == fi) && (that5->read3p() == false) &&
+ (that3->readId() == fi) && (that3->read3p() == true))
continue;
- }
- // If there is an overlap to something with no overlaps out of it, that's
- // a little suspicious.
+ // If there is an overlap to something with no overlaps out of it, that's a little suspicious.
- if ((that5->fragId() == 0) ||
- (that3->fragId() == 0)) {
- writeLog("WARNING: read %u has overlap to spur - 3' to read %u back to %u - 5' to read %u back to %u\n",
+ if ((that5->readId() == 0) ||
+ (that3->readId() == 0)) {
+ writeLog("WARNING: read %u has overlap to spur! 3' overlap to read %u back to read %u 5' overlap to read %u back to read %u\n",
fi,
- this5->fragId(), that5->fragId(),
- this3->fragId(), that3->fragId());
+ this5->readId(), that5->readId(),
+ this3->readId(), that3->readId());
#pragma omp critical (suspInsert)
_suspicious.insert(fi);
continue;
@@ -271,57 +250,66 @@ BestOverlapGraph::removeLopsidedEdges(void) {
// Something doesn't agree. Find those overlaps...
- int32 that5ovlLen = FI->overlapLength(this5->fragId(), that5->fragId(), that5->ahang(), that5->bhang());
- int32 that3ovlLen = FI->overlapLength(this3->fragId(), that3->fragId(), that3->ahang(), that3->bhang());
+ int32 that5ovlLen = RI->overlapLength(this5->readId(), that5->readId(), that5->ahang(), that5->bhang());
+ int32 that3ovlLen = RI->overlapLength(this3->readId(), that3->readId(), that3->ahang(), that3->bhang());
// ...and compare.
double percDiff5 = 200.0 * abs(this5ovlLen - that5ovlLen) / (this5ovlLen + that5ovlLen);
double percDiff3 = 200.0 * abs(this3ovlLen - that3ovlLen) / (this3ovlLen + that3ovlLen);
- if ((percDiff5 <= 5) &&
- (percDiff3 <= 5)) {
-#if 0
- writeLog("fi %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- ACCEPTED\n",
- fi,
- this5->fragId(), this5->frag3p() ? '3' : '5', this5ovlLen, that5->fragId(), that5->frag3p() ? '3' : '5', that5ovlLen, percDiff5,
- this3->fragId(), this3->frag3p() ? '3' : '5', this3ovlLen, that3->fragId(), that3->frag3p() ? '3' : '5', that3ovlLen, percDiff3);
-#endif
- nAccepted++;
+ if ((percDiff5 <= 5.0) && // Both good, keep 'em as is.
+ (percDiff3 <= 5.0)) {
+ //writeLog("fi %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- ACCEPTED\n",
+ // fi,
+ // this5->readId(), this5->read3p() ? '3' : '5', this5ovlLen, that5->readId(), that5->read3p() ? '3' : '5', that5ovlLen, percDiff5,
+ // this3->readId(), this3->read3p() ? '3' : '5', this3ovlLen, that3->readId(), that3->read3p() ? '3' : '5', that3ovlLen, percDiff3);
+ continue;
+ }
- } else {
-#if 0
- writeLog("fi %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%%\n",
- fi,
- this5->fragId(), this5->frag3p() ? '3' : '5', this5ovlLen, that5->fragId(), that5->frag3p() ? '3' : '5', that5ovlLen, percDiff5,
- this3->fragId(), this3->frag3p() ? '3' : '5', this3ovlLen, that3->fragId(), that3->frag3p() ? '3' : '5', that3ovlLen, percDiff3);
-#endif
- nRejected++;
+ // Nope, one or both of the edges are too different. Flag the read as suspicious.
+
+ //writeLog("fi %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%%\n",
+ // fi,
+ // this5->readId(), this5->read3p() ? '3' : '5', this5ovlLen, that5->readId(), that5->read3p() ? '3' : '5', that5ovlLen, percDiff5,
+ // this3->readId(), this3->read3p() ? '3' : '5', this3ovlLen, that3->readId(), that3->read3p() ? '3' : '5', that3ovlLen, percDiff3);
#pragma omp critical (suspInsert)
+ {
_suspicious.insert(fi);
+
+ if ((percDiff5 > 5.0) && (percDiff3 > 5.0))
+ _n2EdgeIncompatible++;
+ else
+ _n1EdgeIncompatible++;
}
}
-
- writeLog("BestOverlapGraph()-- suspicious %u contained %u spur %u mutual-best %u accepted %u rejected %u\n",
- nSuspicious, nContained, nSpur, nMutual, nAccepted, nRejected);
}
void
-BestOverlapGraph::removeSpurs(void) {
- uint32 fiLimit = FI->numFragments();
+BestOverlapGraph::removeSpurs(const char *prefix) {
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
- writeLog("BestOverlapGraph()-- detecting spur fragments.\n");
+ char N[FILENAME_MAX];
+
+ snprintf(N, FILENAME_MAX, "%s.best.spurs", prefix);
+
+ // Best effort: fopen() returns NULL on failure, and the per-read spur report is then skipped.
+ // Do not test errno here; it may be non-zero after a successful fopen(), which would leak the stream.
+ errno = 0;
+ FILE *F = fopen(N, "w");
+
+ writeStatus("BestOverlapGraph()-- detecting spur reads.\n");
_spur.clear();
for (uint32 fi=1; fi <= fiLimit; fi++) {
- bool spur5 = (getBestEdgeOverlap(fi, false)->fragId() == 0);
- bool spur3 = (getBestEdgeOverlap(fi, true)->fragId() == 0);
+ bool spur5 = (getBestEdgeOverlap(fi, false)->readId() == 0);
+ bool spur3 = (getBestEdgeOverlap(fi, true)->readId() == 0);
if (isContained(fi))
// Contained, not a spur.
@@ -337,40 +325,46 @@ BestOverlapGraph::removeSpurs(void) {
// Exactly one end is missing a best edge. Bad!
- writeLog("BestOverlapGraph()-- frag "F_U32" is a %s spur.\n", fi, (spur5) ? "5'" : "3'");
+ if (F)
+ fprintf(F, F_U32" %c'\n", fi, (spur5) ? '5' : '3');
_spur.insert(fi);
}
+
+ writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " spur reads.\n", _spur.size());
+
+ if (F)
+ fclose(F);
}
void
BestOverlapGraph::findEdges(void) {
- uint32 fiLimit = FI->numFragments();
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
memset(_bestA, 0, sizeof(BestOverlaps) * (fiLimit + 1));
memset(_scorA, 0, sizeof(BestScores) * (fiLimit + 1));
- writeLog("BestOverlapGraph()-- analyzing %d fragments for best contains, with %d threads.\n", fiLimit, numThreads);
+ writeStatus("BestOverlapGraph()-- analyzing %d reads for best contains, with %d thread%s.\n", fiLimit, numThreads, (numThreads == 1) ? "" : "s");
#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 fi=1; fi <= fiLimit; fi++) {
uint32 no = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_EVALUE, no);
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
for (uint32 ii=0; ii<no; ii++)
scoreContainment(ovl[ii]);
}
- writeLog("BestOverlapGraph()-- analyzing %d fragments for best edges, with %d threads.\n", fiLimit, numThreads);
+ writeStatus("BestOverlapGraph()-- analyzing %d reads for best edges, with %d thread%s.\n", fiLimit, numThreads, (numThreads == 1) ? "" : "s");
#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 fi=1; fi <= fiLimit; fi++) {
uint32 no = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_EVALUE, no);
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
// Build edges out of spurs, but don't allow edges into them. This should prevent them from
// being incorporated into a promiscuous unitig, but still let them be popped as bubbles (but
@@ -386,9 +380,9 @@ BestOverlapGraph::findEdges(void) {
void
BestOverlapGraph::removeContainedDovetails(void) {
- uint32 fiLimit = FI->numFragments();
+ uint32 fiLimit = RI->numReads();
- writeLog("BestOverlapGraph()-- removing best edges for contained fragments.\n");
+ writeStatus("BestOverlapGraph()-- removing best edges for contained reads.\n");
for (uint32 fi=1; fi <= fiLimit; fi++) {
if (isContained(fi) == true) {
@@ -404,34 +398,46 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
double deviationGraph,
const char *prefix) {
- setLogFile(prefix, "bestOverlapGraph");
+ writeStatus("\n");
+ writeStatus("BestOverlapGraph()-- allocating best edges (" F_SIZE_T "MB)\n",
+ ((2 * sizeof(BestEdgeOverlap) * (RI->numReads() + 1)) >> 20));
- writeLog("BestOverlapGraph-- allocating best edges ("F_SIZE_T"MB)\n",
- ((2 * sizeof(BestEdgeOverlap) * (FI->numFragments() + 1)) >> 20));
+ _bestA = new BestOverlaps [RI->numReads() + 1]; // Cleared in findEdges()
+ _scorA = new BestScores [RI->numReads() + 1];
- _bestA = new BestOverlaps [FI->numFragments() + 1]; // Cleared in findEdges()
- _scorA = new BestScores [FI->numFragments() + 1];
+ _mean = erateGraph;
+ _stddev = 0.0;
- _mean = erateGraph;
- _stddev = 0.0;
+ _median = erateGraph;
+ _mad = 0.0;
- _median = erateGraph;
- _mad = 0.0;
+ _errorLimit = erateGraph;
+
+ _nSuspicious = 0;
+ _n1EdgeFiltered = 0;
+ _n2EdgeFiltered = 0;
+ _n1EdgeIncompatible = 0;
+ _n2EdgeIncompatible = 0;
_suspicious.clear();
_bestM.clear();
_scorM.clear();
- _restrict = NULL;
- _restrictEnabled = false;
+ _restrict = NULL;
+ _restrictEnabled = false;
+
+ _erateGraph = erateGraph;
+ _deviationGraph = deviationGraph;
- _erateGraph = erateGraph;
- _deviationGraph = deviationGraph;
+ // Find initial edges, only so we can report initial statistics on the graph
+
+ findEdges();
+ reportEdgeStatistics(prefix, "INITIAL");
// Mark reads as suspicious if they are not fully covered by overlaps.
- removeSuspicious();
+ removeSuspicious(prefix);
findEdges();
if (logFileFlagSet(LOG_ALL_BEST_EDGES))
@@ -452,7 +458,7 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
//
// This must come before removeSpurs().
- removeLopsidedEdges();
+ removeLopsidedEdges(prefix);
findEdges();
if (logFileFlagSet(LOG_ALL_BEST_EDGES))
@@ -460,7 +466,7 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
// Mark reads as spurs, so we don't find best edges to them.
- removeSpurs();
+ removeSpurs(prefix);
findEdges();
reportBestEdges(prefix, logFileFlagSet(LOG_ALL_BEST_EDGES) ? "best.3.final" : "best");
@@ -471,6 +477,21 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
removeContainedDovetails();
+ // Report filtering and final statistics.
+
+ writeLog("\n");
+ writeLog("EDGE FILTERING\n");
+ writeLog("-------- ------------------------------------------\n");
+ writeLog("%8u reads have a suspicious overlap pattern\n", _nSuspicious);
+ writeLog("%8u reads had edges filtered\n", _n1EdgeFiltered + _n2EdgeFiltered);
+ writeLog(" %8u had one\n", _n1EdgeFiltered);
+ writeLog(" %8u had two\n", _n2EdgeFiltered);
+ writeLog("%8u reads have length incompatible edges\n", _n1EdgeIncompatible + _n2EdgeIncompatible);
+ writeLog(" %8u have one\n", _n1EdgeIncompatible);
+ writeLog(" %8u have two\n", _n2EdgeIncompatible);
+
+ reportEdgeStatistics(prefix, "FINAL");
+
// Done with scoring data.
delete [] _scorA;
@@ -483,162 +504,155 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
+void
+BestOverlapGraph::reportEdgeStatistics(const char *prefix, const char *label) {
+ uint32 fiLimit = RI->numReads();
+ // Classify every read by its best edges (contained, singleton, spur, or both edges present)
+ // and by how many of those edges are mutual best, then log the table under 'label'.
+
+ uint32 nContained = 0;
+ uint32 nSingleton = 0;
+ uint32 nSpur = 0;
+ uint32 nSpur1Mutual = 0;
+ uint32 nBoth = 0;
+ uint32 nBoth1Mutual = 0;
+ uint32 nBoth2Mutual = 0;
-BestOverlapGraph::BestOverlapGraph(double erateGraph,
- double deviationGraph,
- set<uint32> *restrict) {
-
- _erateGraph = erateGraph;
- _deviationGraph = deviationGraph;
-
- _bestA = NULL;
- _scorA = NULL;
-
- _bestM.clear();
- _scorM.clear();
-
- assert(restrict != NULL);
-
- _restrict = restrict;
- _restrictEnabled = true;
-
- // PASS 0: Load the map (necessary?)
-
-#if 0
- for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fi = *it;
+ for (uint32 fi=1; fi <= fiLimit; fi++) {
+ BestEdgeOverlap *this5 = getBestEdgeOverlap(fi, false);
+ BestEdgeOverlap *this3 = getBestEdgeOverlap(fi, true);
- _bestM[fi].insert();
- _scorM[fi].insert();
- }
-#endif
+ // Count contained reads
- // PASS 1: Find containments.
+ if (isContained(fi)) {
+ nContained++;
+ continue;
+ }
- for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fi = *it;
- uint32 no = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_EVALUE, no);
+ // Count singleton reads
- for (uint32 ii=0; ii<no; ii++)
- scoreContainment(ovl[ii]);
- }
+ if ((this5->readId() == 0) && (this3->readId() == 0)) {
+ nSingleton++;
+ continue;
+ }
- // PASS 2: Find dovetails.
+ // Compute mutual bestedness
- for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fi = *it;
- uint32 no = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_EVALUE, no);
+ bool mutual5 = false;
+ bool mutual3 = false;
- for (uint32 ii=0; ii<no; ii++)
- scoreEdge(ovl[ii]);
- }
+ if (this5->readId() != 0) {
+ BestEdgeOverlap *that5 = getBestEdgeOverlap(this5->readId(), this5->read3p());
- // Remove temporary scoring data
+ mutual5 = ((that5->readId() == fi) && (that5->read3p() == false));
+ }
- _scorM.clear();
+ if (this3->readId() != 0) {
+ BestEdgeOverlap *that3 = getBestEdgeOverlap(this3->readId(), this3->read3p());
- // Remove dovetail overlaps for contained fragments.
+ mutual3 = ((that3->readId() == fi) && (that3->read3p() == true));
+ }
- for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fi = *it;
+ // Compute spur, and mutual best
- if (isContained(fi) == true) {
- getBestEdgeOverlap(fi, false)->clear();
- getBestEdgeOverlap(fi, true) ->clear();
+ if ((this5->readId() == 0) ||
+ (this3->readId() == 0)) {
+ nSpur++;
+ nSpur1Mutual += (mutual5 || mutual3) ? 1 : 0;
+ continue;
}
- }
- // Remove spurs
+ // Otherwise, both edges exist
-#if 0
- for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fi = *it;
-
- if ((getBestEdgeOverlap(fi, false)->fragId() == 0) ||
- (getBestEdgeOverlap(fi, true)->fragId() == 0)) {
- getBestEdgeOverlap(fi, false)->clear();
- getBestEdgeOverlap(fi, true) ->clear();
- }
+ nBoth++;
+ nBoth1Mutual += (mutual5 != mutual3) ? 1 : 0;
+ nBoth2Mutual += ((mutual5 == true) && (mutual3 == true)) ? 1 : 0;
}
-#endif
- _restrict = NULL;
- _restrictEnabled = false;
+ writeLog("\n");
+ writeLog("%s EDGES\n", label);
+ writeLog("-------- ----------------------------------------\n");
+ writeLog("%8u reads are contained\n", nContained);
+ writeLog("%8u reads have no best edges (singleton)\n", nSingleton);
+ writeLog("%8u reads have only one best edge (spur) \n", nSpur);
+ writeLog(" %8u are mutual best\n", nSpur1Mutual);
+ writeLog("%8u reads have two best edges \n", nBoth);
+ writeLog(" %8u have one mutual best edge\n", nBoth1Mutual);
+ writeLog(" %8u have two mutual best edges\n", nBoth2Mutual);
+ writeLog("\n");
}
-
void
BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
char N[FILENAME_MAX];
FILE *BCH = NULL;
- FILE *BE = NULL, *BEH = NULL, *BEG;
+ FILE *BE = NULL, *BEH = NULL, *BEG = NULL;
FILE *BS = NULL;
FILE *SS = NULL;
- sprintf(N, "%s.%s.edges", prefix, label); BE = fopen(N, "w");
- sprintf(N, "%s.%s.singletons", prefix, label); BS = fopen(N, "w");
- sprintf(N, "%s.%s.edges.suspicious", prefix, label); SS = fopen(N, "w");
+ // Open output files.
+
+ snprintf(N, FILENAME_MAX, "%s.%s.edges", prefix, label); BE = fopen(N, "w");
+ snprintf(N, FILENAME_MAX, "%s.%s.singletons", prefix, label); BS = fopen(N, "w");
+ snprintf(N, FILENAME_MAX, "%s.%s.edges.suspicious", prefix, label); SS = fopen(N, "w");
+
+ snprintf(N, FILENAME_MAX, "%s.%s.edges.gfa", prefix, label); BEG = fopen(N, "w");
- sprintf(N, "%s.%s.contains.histogram", prefix, label); BCH = fopen(N, "w");
- sprintf(N, "%s.%s.edges.histogram", prefix, label); BEH = fopen(N, "w");
+ snprintf(N, FILENAME_MAX, "%s.%s.contains.histogram", prefix, label); BCH = fopen(N, "w");
+ snprintf(N, FILENAME_MAX, "%s.%s.edges.histogram", prefix, label); BEH = fopen(N, "w");
- sprintf(N, "%s.%s.edges.gfa", prefix, label); BEG = fopen(N, "w");
+ // Write best edges, singletons and suspicious edges.
- if ((BE) && (BS)) {
- fprintf(BE, "#fragId\tlibId\tbest5iid\tbest5end\tbest3iid\tbest3end\teRate5\teRate3\tbest5len\tbest3len\n");
- fprintf(BS, "#fragId\tlibId\n");
+ if ((BE) && (BS) && (SS)) {
+ fprintf(BE, "#readId\tlibId\tbest5iid\tbest5end\tbest3iid\tbest3end\teRate5\teRate3\tbest5len\tbest3len\n");
+ fprintf(BS, "#readId\tlibId\n");
- for (uint32 id=1; id<FI->numFragments() + 1; id++) {
+ for (uint32 id=1; id<RI->numReads() + 1; id++) {
BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);
- if ((bestedge5->fragId() == 0) && (bestedge3->fragId() == 0) && (isContained(id) == false)) {
- fprintf(BS, "%u\t%u\n", id, FI->libraryIID(id));
+ if ((bestedge5->readId() == 0) && (bestedge3->readId() == 0) && (isContained(id) == false)) {
+ fprintf(BS, "%u\t%u\n", id, RI->libraryIID(id));
}
else if (_suspicious.count(id) > 0) {
- fprintf(SS, "%u\t%u\t%u\t%c'\t%u\t%c'\t%6.4f\t%6.4f\t%u\t%u%s\n", id, FI->libraryIID(id),
- bestedge5->fragId(), bestedge5->frag3p() ? '3' : '5',
- bestedge3->fragId(), bestedge3->frag3p() ? '3' : '5',
+ fprintf(SS, "%u\t%u\t%u\t%c'\t%u\t%c'\t%6.4f\t%6.4f\t%u\t%u%s\n", id, RI->libraryIID(id),
+ bestedge5->readId(), bestedge5->read3p() ? '3' : '5',
+ bestedge3->readId(), bestedge3->read3p() ? '3' : '5',
AS_OVS_decodeEvalue(bestedge5->evalue()),
AS_OVS_decodeEvalue(bestedge3->evalue()),
- (bestedge5->fragId() == 0 ? 0 : FI->overlapLength(id, bestedge5->fragId(), bestedge5->ahang(), bestedge5->bhang())),
- (bestedge3->fragId() == 0 ? 0 : FI->overlapLength(id, bestedge3->fragId(), bestedge3->ahang(), bestedge3->bhang())),
+ (bestedge5->readId() == 0 ? 0 : RI->overlapLength(id, bestedge5->readId(), bestedge5->ahang(), bestedge5->bhang())),
+ (bestedge3->readId() == 0 ? 0 : RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang())),
isContained(id) ? "\tcontained" : "");
}
else {
- fprintf(BE, "%u\t%u\t%u\t%c'\t%u\t%c'\t%6.4f\t%6.4f\t%u\t%u%s\n", id, FI->libraryIID(id),
- bestedge5->fragId(), bestedge5->frag3p() ? '3' : '5',
- bestedge3->fragId(), bestedge3->frag3p() ? '3' : '5',
+ fprintf(BE, "%u\t%u\t%u\t%c'\t%u\t%c'\t%6.4f\t%6.4f\t%u\t%u%s\n", id, RI->libraryIID(id),
+ bestedge5->readId(), bestedge5->read3p() ? '3' : '5',
+ bestedge3->readId(), bestedge3->read3p() ? '3' : '5',
AS_OVS_decodeEvalue(bestedge5->evalue()),
AS_OVS_decodeEvalue(bestedge3->evalue()),
- (bestedge5->fragId() == 0 ? 0 : FI->overlapLength(id, bestedge5->fragId(), bestedge5->ahang(), bestedge5->bhang())),
- (bestedge3->fragId() == 0 ? 0 : FI->overlapLength(id, bestedge3->fragId(), bestedge3->ahang(), bestedge3->bhang())),
+ (bestedge5->readId() == 0 ? 0 : RI->overlapLength(id, bestedge5->readId(), bestedge5->ahang(), bestedge5->bhang())),
+ (bestedge3->readId() == 0 ? 0 : RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang())),
isContained(id) ? "\tcontained" : "");
}
}
-
- fclose(BE);
- fclose(BS);
- fclose(SS);
}
+ // Write best edge graph.
if (BEG) {
fprintf(BEG, "H\tVN:Z:bogart/edges\n");
// First, write the sequences used.
- for (uint32 id=1; id<FI->numFragments() + 1; id++) {
+ for (uint32 id=1; id<RI->numReads() + 1; id++) {
BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);
- if ((bestedge5->fragId() == 0) && (bestedge3->fragId() == 0) && (isContained(id) == false)) {
+ if ((bestedge5->readId() == 0) && (bestedge3->readId() == 0) && (isContained(id) == false)) {
// Do nothing, a singleton.
}
@@ -652,7 +666,7 @@ BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
else {
// Report the read, it has best edges - including contained reads.
- fprintf(BEG, "S\tread%08u\t*\tLN:i:%u\n", id, FI->fragmentLength(id));
+ fprintf(BEG, "S\tread%08u\t*\tLN:i:%u\n", id, RI->readLength(id));
}
}
@@ -663,11 +677,11 @@ BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
//
// with read orientation given by +/-. Conveniently, this is what we've saved (for the edges).
- for (uint32 id=1; id<FI->numFragments() + 1; id++) {
+ for (uint32 id=1; id<RI->numReads() + 1; id++) {
BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);
- if ((bestedge5->fragId() == 0) && (bestedge3->fragId() == 0) && (isContained(id) == false)) {
+ if ((bestedge5->readId() == 0) && (bestedge3->readId() == 0) && (isContained(id) == false)) {
// Do nothing, a singleton.
}
@@ -680,46 +694,45 @@ BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
}
else {
- if (bestedge5->fragId() != 0) {
+ if (bestedge5->readId() != 0) {
int32 ahang = bestedge5->ahang();
int32 bhang = bestedge5->bhang();
- int32 olaplen = FI->overlapLength(id, bestedge5->fragId(), bestedge5->ahang(), bestedge5->bhang());
+ int32 olaplen = RI->overlapLength(id, bestedge5->readId(), bestedge5->ahang(), bestedge5->bhang());
assert((ahang <= 0) && (bhang <= 0)); // ALL 5' edges should be this.
fprintf(BEG, "L\tread%08u\t-\tread%08u\t%c\t%uM\n",
id,
- bestedge5->fragId(), bestedge5->frag3p() ? '-' : '+',
+ bestedge5->readId(), bestedge5->read3p() ? '-' : '+',
olaplen);
}
- if (bestedge3->fragId() != 0) {
+ if (bestedge3->readId() != 0) {
int32 ahang = bestedge3->ahang();
int32 bhang = bestedge3->bhang();
- int32 olaplen = FI->overlapLength(id, bestedge3->fragId(), bestedge3->ahang(), bestedge3->bhang());
+ int32 olaplen = RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang());
assert((ahang >= 0) && (bhang >= 0)); // ALL 3' edges should be this.
fprintf(BEG, "L\tread%08u\t+\tread%08u\t%c\t%uM\n",
id,
- bestedge3->fragId(), bestedge3->frag3p() ? '-' : '+',
- FI->overlapLength(id, bestedge3->fragId(), bestedge3->ahang(), bestedge3->bhang()));
+ bestedge3->readId(), bestedge3->read3p() ? '-' : '+',
+ RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang()));
}
}
}
-
- fclose(BEG);
}
+ // Write error rate histograms of best edges and contains.
if ((BCH) && (BEH)) {
- double *bc = new double [FI->numFragments() + 1 + FI->numFragments() + 1];
- double *be = new double [FI->numFragments() + 1 + FI->numFragments() + 1];
+ double *bc = new double [RI->numReads() + 1 + RI->numReads() + 1];
+ double *be = new double [RI->numReads() + 1 + RI->numReads() + 1];
uint32 bcl = 0;
uint32 bel = 0;
- for (uint32 id=1; id<FI->numFragments() + 1; id++) {
+ for (uint32 id=1; id<RI->numReads() + 1; id++) {
BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);
@@ -730,10 +743,10 @@ BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
bc[bcl++] = bestedge3->erate();
}
else {
- if (bestedge5->fragId() > 0)
+ if (bestedge5->readId() > 0)
be[bel++] = bestedge5->erate();
- if (bestedge3->fragId() > 0)
+ if (bestedge3->readId() > 0)
be[bel++] = bestedge3->erate();
}
}
@@ -747,18 +760,26 @@ BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
for (uint32 ii=0; ii<bel; ii++)
fprintf(BEH, "%f\n", be[ii]);
- fclose(BCH);
- fclose(BEH);
-
delete [] bc;
delete [] be;
}
+
+ // Close all the files.
+
+ if (BE) fclose(BE);
+ if (BS) fclose(BS);
+ if (SS) fclose(SS);
+
+ if (BEG) fclose(BEG);
+
+ if (BCH) fclose(BCH);
+ if (BEH) fclose(BEH);
}
void
-BestOverlapGraph::scoreContainment(const BAToverlap& olap) {
+BestOverlapGraph::scoreContainment(BAToverlap& olap) {
if (isOverlapBadQuality(olap))
// Yuck. Don't want to use this crud.
@@ -776,7 +797,7 @@ BestOverlapGraph::scoreContainment(const BAToverlap& olap) {
if ((olap.a_hang > 0) ||
(olap.b_hang < 0))
- // We only save if A is the contained fragment.
+ // We only save if A is the contained read.
return;
setContained(olap.a_iid);
@@ -785,7 +806,7 @@ BestOverlapGraph::scoreContainment(const BAToverlap& olap) {
void
-BestOverlapGraph::scoreEdge(const BAToverlap& olap) {
+BestOverlapGraph::scoreEdge(BAToverlap& olap) {
bool enableLog = false; // useful for reporting this stuff only for specific reads
//if ((olap.a_iid == 97202) || (olap.a_iid == 30701))
@@ -794,24 +815,24 @@ BestOverlapGraph::scoreEdge(const BAToverlap& olap) {
if (isOverlapBadQuality(olap)) {
// Yuck. Don't want to use this crud.
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP BADQ: %d %d %c hangs "F_S32" "F_S32" err %.3f -- bad quality\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP BADQ: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- bad quality\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
if (isOverlapRestricted(olap)) {
// Whoops, don't want this overlap for this BOG
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP RESTRICT: %d %d %c hangs "F_S32" "F_S32" err %.3f -- restricted\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP RESTRICT: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- restricted\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
if (isSuspicious(olap.b_iid)) {
// Whoops, don't want this overlap for this BOG
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP SUSP: %d %d %c hangs "F_S32" "F_S32" err %.3f -- suspicious\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP SUSP: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- suspicious\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
@@ -819,16 +840,16 @@ BestOverlapGraph::scoreEdge(const BAToverlap& olap) {
((olap.a_hang <= 0) && (olap.b_hang >= 0))) {
// Skip containment overlaps.
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP CONT: %d %d %c hangs "F_S32" "F_S32" err %.3f -- container read\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP CONT: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- container read\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
if (isContained(olap.b_iid) == true) {
// Skip overlaps to contained reads (allow scoring of best edges from contained reads).
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP CONT: %d %d %c hangs "F_S32" "F_S32" err %.3f -- contained read\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP CONT: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- contained read\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
@@ -841,8 +862,8 @@ BestOverlapGraph::scoreEdge(const BAToverlap& olap) {
if (newScr <= score) {
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP GOOD: %d %d %c hangs "F_S32" "F_S32" err %.3f -- no better than best\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP GOOD: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- no better than best\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
return;
}
@@ -851,42 +872,40 @@ BestOverlapGraph::scoreEdge(const BAToverlap& olap) {
score = newScr;
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("scoreEdge()-- OVERLAP BEST: %d %d %c hangs "F_S32" "F_S32" err %.3f -- NOW BEST\n",
- olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate);
+ writeLog("scoreEdge()-- OVERLAP BEST: %d %d %c hangs " F_S32 " " F_S32 " err %.3f -- NOW BEST\n",
+ olap.a_iid, olap.b_iid, olap.flipped ? 'A' : 'N', olap.a_hang, olap.b_hang, olap.erate());
}
-
bool
-BestOverlapGraph::isOverlapBadQuality(const BAToverlap& olap) {
+BestOverlapGraph::isOverlapBadQuality(BAToverlap& olap) {
bool enableLog = false; // useful for reporting this stuff only for specific reads
//if ((olap.a_iid == 97202) || (olap.a_iid == 30701))
// enableLog = true;
- if ((FI->fragmentLength(olap.a_iid) == 0) ||
- (FI->fragmentLength(olap.b_iid) == 0))
- // The overlap is bad if it involves deleted fragments. Shouldn't happen in a normal
- // assembly, but sometimes us users want to delete fragments after overlaps are generated.
+ // The overlap is bad if it involves deleted reads. Shouldn't happen in a normal
+ // assembly, but sometimes us users want to delete reads after overlaps are generated.
+
+ if ((RI->readLength(olap.a_iid) == 0) ||
+ (RI->readLength(olap.b_iid) == 0)) {
+ olap.filtered = true;
return(true);
+ }
// The overlap is GOOD (false == not bad) if the error rate is below the allowed erate.
// Initially, this is just the erate passed in. After the first round of finding edges,
// it is reset to the mean and stddev of selected best edges.
- //
-
- double Tstddev = _mean + _deviationGraph * _stddev;
- double Tmad = _median + _deviationGraph * 1.4826 * _mad;
- if (olap.erate <= Tmad || (Tmad == 0 && olap.erate <= Tstddev)) {
+ if (olap.erate() <= _errorLimit) {
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("isOverlapBadQuality()-- OVERLAP GOOD: %d %d %c hangs "F_S32" "F_S32" err %.3f\n",
+ writeLog("isOverlapBadQuality()-- OVERLAP GOOD: %d %d %c hangs " F_S32 " " F_S32 " err %.3f\n",
olap.a_iid, olap.b_iid,
olap.flipped ? 'A' : 'N',
olap.a_hang,
olap.b_hang,
- olap.erate);
+ olap.erate());
return(false);
}
@@ -896,21 +915,22 @@ BestOverlapGraph::isOverlapBadQuality(const BAToverlap& olap) {
// error rate above a few percent. canu doesn't do short overlaps.
if ((enableLog == true) && (logFileFlagSet(LOG_OVERLAP_SCORING)))
- writeLog("isOverlapBadQuality()-- OVERLAP REJECTED: %d %d %c hangs "F_S32" "F_S32" err %.3f\n",
+ writeLog("isOverlapBadQuality()-- OVERLAP REJECTED: %d %d %c hangs " F_S32 " " F_S32 " err %.3f\n",
olap.a_iid, olap.b_iid,
olap.flipped ? 'A' : 'N',
olap.a_hang,
olap.b_hang,
- olap.erate);
+ olap.erate());
+ olap.filtered = true;
return(true);
}
-// If no restrictions are known, this overlap is useful if both fragments are not in a unitig
-// already. Otherwise, we are restricted to just a specific set of fragments (usually a single
-// unitig and all the mated reads). The overlap is useful if both fragments are in the set.
+// If no restrictions are known, this overlap is useful if both reads are not in a unitig
+// already. Otherwise, we are restricted to just a specific set of reads (usually a single
+// unitig and all the mated reads). The overlap is useful if both reads are in the set.
//
bool
BestOverlapGraph::isOverlapRestricted(const BAToverlap &olap) {
@@ -930,7 +950,7 @@ BestOverlapGraph::isOverlapRestricted(const BAToverlap &olap) {
uint64
-BestOverlapGraph::scoreOverlap(const BAToverlap& olap) {
+BestOverlapGraph::scoreOverlap(BAToverlap& olap) {
uint64 leng = 0;
uint64 rate = AS_MAX_EVALUE - olap.evalue;
@@ -952,19 +972,19 @@ BestOverlapGraph::scoreOverlap(const BAToverlap& olap) {
// takes into account both reads, or as the number of aligned bases on the A read.
#if 0
- leng = FI->overlapLength(olap.a_iid, olap.b_iid, olap.a_hang, olap.b_hang);
+ leng = RI->overlapLength(olap.a_iid, olap.b_iid, olap.a_hang, olap.b_hang);
#endif
if (olap.a_hang > 0)
- leng = FI->fragmentLength(olap.a_iid) - olap.a_hang;
+ leng = RI->readLength(olap.a_iid) - olap.a_hang;
else
- leng = FI->fragmentLength(olap.a_iid) + olap.b_hang;
+ leng = RI->readLength(olap.a_iid) + olap.b_hang;
// Convert the length into an expected number of matches.
#if 0
- assert(olap.erate <= 1.0);
- leng -= leng * olap.erate;
+ assert(olap.erate() <= 1.0);
+ leng -= leng * olap.erate();
#endif
// And finally shift it to the correct place in the word.
diff --git a/src/bogart/AS_BAT_BestOverlapGraph.H b/src/bogart/AS_BAT_BestOverlapGraph.H
index 408e1d8..f1b3730 100644
--- a/src/bogart/AS_BAT_BestOverlapGraph.H
+++ b/src/bogart/AS_BAT_BestOverlapGraph.H
@@ -45,34 +45,34 @@
#include "AS_global.H"
#include "AS_BAT_OverlapCache.H"
-class FragmentEnd {
+class ReadEnd {
public:
- FragmentEnd() {
+ ReadEnd() {
_id = 0;
_e3p = false;
};
- FragmentEnd(uint32 id, bool e3p) {
+ ReadEnd(uint32 id, bool e3p) {
_id = id;
_e3p = e3p;
};
- uint32 fragId(void) const { return(_id); };
- bool frag3p(void) const { return(_e3p == true); };
- bool frag5p(void) const { return(_e3p == false); };
+ uint32 readId(void) const { return(_id); };
+ bool read3p(void) const { return(_e3p == true); };
+ bool read5p(void) const { return(_e3p == false); };
- bool operator==(FragmentEnd const that) const {
- return((fragId() == that.fragId()) && (frag3p() == that.frag3p()));
+ bool operator==(ReadEnd const that) const {
+ return((readId() == that.readId()) && (read3p() == that.read3p()));
};
- bool operator!=(FragmentEnd const that) const {
- return((fragId() != that.fragId()) || (frag3p() != that.frag3p()));
+ bool operator!=(ReadEnd const that) const {
+ return((readId() != that.readId()) || (read3p() != that.read3p()));
};
- bool operator<(FragmentEnd const that) const {
- if (fragId() != that.fragId())
- return fragId() < that.fragId();
+ bool operator<(ReadEnd const that) const {
+ if (readId() != that.readId())
+ return readId() < that.readId();
else
- return frag3p() < that.frag3p();
+ return read3p() < that.read3p();
};
private:
@@ -87,14 +87,9 @@ private:
//
class BestEdgeOverlap {
public:
- BestEdgeOverlap() {
- clear();
- };
- BestEdgeOverlap(BAToverlap const &ovl) {
- set(ovl);
- };
- ~BestEdgeOverlap() {
- };
+ BestEdgeOverlap() { clear(); };
+ BestEdgeOverlap(BAToverlap const &ovl) { set(ovl); };
+ ~BestEdgeOverlap() { };
void clear(void) {
_id = 0;
@@ -109,8 +104,8 @@ public:
if (((olap.a_hang <= 0) && (olap.b_hang >= 0)) || // If contained, _e3p just means
((olap.a_hang >= 0) && (olap.b_hang <= 0))) // the other read is flipped
_e3p = olap.flipped;
- else
- _e3p = olap.BEndIs3prime(); // Otherwise, means olap is to the 3' end
+ else // Otherwise, means the
+ _e3p = olap.BEndIs3prime(); // olap is to the 3' end
_ahang = olap.a_hang;
_bhang = olap.b_hang;
@@ -126,9 +121,9 @@ public:
};
- uint32 fragId(void) const { return(_id); };
- bool frag3p(void) const { return(_e3p == true); };
- bool frag5p(void) const { return(_e3p == false); };
+ uint32 readId(void) const { return(_id); };
+ bool read3p(void) const { return(_e3p == true); };
+ bool read5p(void) const { return(_e3p == false); };
int32 ahang(void) const { return(_ahang); };
int32 bhang(void) const { return(_bhang); };
@@ -138,7 +133,7 @@ public:
private:
uint32 _id;
- uint64 _e3p : 1; // Overlap with the 3' end of that fragment, or flipped contain
+ uint64 _e3p : 1; // Overlap with the 3' end of that read, or flipped contain
int64 _ahang : AS_MAX_READLEN_BITS+1;
int64 _bhang : AS_MAX_READLEN_BITS+1;
uint64 _evalue : AS_MAX_EVALUE_BITS;
@@ -181,9 +176,9 @@ public:
class BestOverlapGraph {
private:
- void removeSuspicious(void);
- void removeSpurs(void);
- void removeLopsidedEdges(void);
+ void removeSuspicious(const char *prefix);
+ void removeSpurs(const char *prefix);
+ void removeLopsidedEdges(const char *prefix);
void findEdges(void);
@@ -196,62 +191,58 @@ public:
double deviationGraph,
const char *prefix);
- BestOverlapGraph(double erateGraph,
- double deviationGraph,
- set<uint32> *restrict);
-
~BestOverlapGraph() {
delete [] _bestA;
delete [] _scorA;
};
- // Given a fragment UINT32 and which end, returns pointer to
+ // Given a read UINT32 and which end, returns pointer to
// BestOverlap node.
- BestEdgeOverlap *getBestEdgeOverlap(uint32 fragid, bool threePrime) {
+ BestEdgeOverlap *getBestEdgeOverlap(uint32 readid, bool threePrime) {
if (_bestA)
- return((threePrime) ? (&_bestA[fragid]._best3) : (&_bestA[fragid]._best5));
- return((threePrime) ? (&_bestM[fragid]._best3) : (&_bestM[fragid]._best5));
+ return((threePrime) ? (&_bestA[readid]._best3) : (&_bestA[readid]._best5));
+ return((threePrime) ? (&_bestM[readid]._best3) : (&_bestM[readid]._best5));
};
- // given a FragmentEnd sets it to the next FragmentEnd after following the
+ // given a ReadEnd sets it to the next ReadEnd after following the
// best edge
- FragmentEnd followOverlap(FragmentEnd end) {
- if (end.fragId() == 0)
- return(FragmentEnd());
+ ReadEnd followOverlap(ReadEnd end) {
+ if (end.readId() == 0)
+ return(ReadEnd());
- BestEdgeOverlap *edge = getBestEdgeOverlap(end.fragId(), end.frag3p());
+ BestEdgeOverlap *edge = getBestEdgeOverlap(end.readId(), end.read3p());
- return(FragmentEnd(edge->fragId(), !edge->frag3p()));
+ return(ReadEnd(edge->readId(), !edge->read3p()));
};
- void setContained(const uint32 fragid) {
+ void setContained(const uint32 readid) {
if (_bestA)
- _bestA[fragid]._isC = true;
+ _bestA[readid]._isC = true;
else
- _bestM[fragid]._isC = true;
+ _bestM[readid]._isC = true;
};
- bool isContained(const uint32 fragid) {
+ bool isContained(const uint32 readid) {
if (_bestA)
- return(_bestA[fragid]._isC);
- return(_bestM[fragid]._isC);
+ return(_bestA[readid]._isC);
+ return(_bestM[readid]._isC);
};
- bool isSuspicious(const uint32 fragid) {
- return(_suspicious.count(fragid) > 0);
+ bool isSuspicious(const uint32 readid) {
+ return(_suspicious.count(readid) > 0);
};
+ void reportEdgeStatistics(const char *prefix, const char *label);
void reportBestEdges(const char *prefix, const char *label);
public:
- bool isOverlapBadQuality(const BAToverlap& olap); // Used in repeat detection
+ bool isOverlapBadQuality(BAToverlap& olap); // Used in repeat detection
private:
- bool isOverlapRestricted(const BAToverlap &olap);
- uint64 scoreOverlap(const BAToverlap& olap);
+ uint64 scoreOverlap(BAToverlap& olap);
private:
- void scoreContainment(const BAToverlap& olap);
- void scoreEdge(const BAToverlap& olap);
+ void scoreContainment(BAToverlap& olap);
+ void scoreEdge(BAToverlap& olap);
private:
uint64 &best5score(uint32 id) {
@@ -276,18 +267,33 @@ private:
double _median;
double _mad;
+ uint32 _nSuspicious; // Stats for output
+ uint32 _n1EdgeFiltered;
+ uint32 _n2EdgeFiltered;
+ uint32 _n1EdgeIncompatible;
+ uint32 _n2EdgeIncompatible;
+
set<uint32> _suspicious;
set<uint32> _spur;
map<uint32, BestOverlaps> _bestM;
map<uint32, BestScores> _scorM;
+ // These restrict the best overlap graph to a set of reads, instead of all reads.
+ // Currently (Aug 2016) unused. There used to be a constructor that would take
+ // a set(uint32) of reads we cared about, but it was quite stale and was removed.
+private:
+ bool isOverlapRestricted(const BAToverlap &olap);
+private:
set<uint32> *_restrict;
bool _restrictEnabled;
+
public:
double _erateGraph;
double _deviationGraph;
+private:
+ double _errorLimit;
}; //BestOverlapGraph
diff --git a/src/bogart/AS_BAT_ChunkGraph.C b/src/bogart/AS_BAT_ChunkGraph.C
index 66c67ac..3a9003a 100644
--- a/src/bogart/AS_BAT_ChunkGraph.C
+++ b/src/bogart/AS_BAT_ChunkGraph.C
@@ -35,115 +35,127 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_ChunkGraph.H"
#include "AS_BAT_Logging.H"
-ChunkGraph::ChunkGraph(const char *output_prefix) {
+ChunkGraph::ChunkGraph(const char *prefix) {
+ char N[FILENAME_MAX];
- setLogFile(output_prefix, "ChunkGraph");
+ snprintf(N, FILENAME_MAX, "%s.chunkGraph.log", prefix);
- _maxFragment = FI->numFragments();
+ errno = 0;
+ _chunkLog = (logFileFlagSet(LOG_CHUNK_GRAPH)) ? fopen(N, "w") : NULL;
+
+ if (errno)
+ _chunkLog = NULL;
+
+ _maxRead = RI->numReads();
_restrict = NULL;
- _pathLen = new uint32 [_maxFragment * 2 + 2];
- _chunkLength = new ChunkLength [_maxFragment];
+ _pathLen = new uint32 [_maxRead * 2 + 2];
+ _chunkLength = new ChunkLength [_maxRead];
_chunkLengthIter = 0;
- memset(_pathLen, 0, sizeof(uint32) * (_maxFragment * 2 + 2));
- memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxFragment));
+ memset(_pathLen, 0, sizeof(uint32) * (_maxRead * 2 + 2));
+ memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxRead));
- for (uint32 fid=1; fid <= _maxFragment; fid++) {
+ for (uint32 fid=1; fid <= _maxRead; fid++) {
if (OG->isContained(fid)) {
- if (logFileFlagSet(LOG_CHUNK_GRAPH))
- writeLog("read %u contained\n", fid);
+ if (_chunkLog)
+ fprintf(_chunkLog, "read %u contained\n", fid);
continue;
}
if (OG->isSuspicious(fid)) {
- if (logFileFlagSet(LOG_CHUNK_GRAPH))
- writeLog("read %u suspicious\n", fid);
+ if (_chunkLog)
+ fprintf(_chunkLog, "read %u suspicious\n", fid);
continue;
}
- uint32 l5 = countFullWidth(FragmentEnd(fid, false));
- uint32 l3 = countFullWidth(FragmentEnd(fid, true));
+ uint32 l5 = countFullWidth(ReadEnd(fid, false));
+ uint32 l3 = countFullWidth(ReadEnd(fid, true));
- _chunkLength[fid-1].fragId = fid;
+ _chunkLength[fid-1].readId = fid;
_chunkLength[fid-1].cnt = l5 + l3;
}
+ if (_chunkLog)
+ fclose(_chunkLog);
+
delete [] _pathLen;
_pathLen = NULL;
- std::sort(_chunkLength, _chunkLength + _maxFragment);
+ std::sort(_chunkLength, _chunkLength + _maxRead);
}
ChunkGraph::ChunkGraph(set<uint32> *restrict) {
- _maxFragment = 0;
+ _chunkLog = NULL;
+
+ _maxRead = 0;
_restrict = restrict;
for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++)
- _idMap[*it] = _maxFragment++;
+ _idMap[*it] = _maxRead++;
- _pathLen = new uint32 [_maxFragment * 2 + 2];
- _chunkLength = new ChunkLength [_maxFragment];
+ _pathLen = new uint32 [_maxRead * 2 + 2];
+ _chunkLength = new ChunkLength [_maxRead];
_chunkLengthIter = 0;
- memset(_pathLen, 0, sizeof(uint32) * (_maxFragment * 2 + 2));
- memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxFragment));
+ memset(_pathLen, 0, sizeof(uint32) * (_maxRead * 2 + 2));
+ memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxRead));
for (set<uint32>::iterator it=_restrict->begin(); it != _restrict->end(); it++) {
- uint32 fid = *it; // Actual fragment ID
+ uint32 fid = *it; // Actual read ID
uint32 fit = _idMap[fid]; // Local array index
if (OG->isContained(fid))
continue;
- _chunkLength[fit].fragId = fid;
- _chunkLength[fit].cnt = (countFullWidth(FragmentEnd(fid, false)) +
- countFullWidth(FragmentEnd(fid, true)));
+ _chunkLength[fit].readId = fid;
+ _chunkLength[fit].cnt = (countFullWidth(ReadEnd(fid, false)) +
+ countFullWidth(ReadEnd(fid, true)));
}
delete [] _pathLen;
_pathLen = NULL;
- std::sort(_chunkLength, _chunkLength + _maxFragment);
+ std::sort(_chunkLength, _chunkLength + _maxRead);
}
uint64
-ChunkGraph::getIndex(FragmentEnd e) {
+ChunkGraph::getIndex(ReadEnd e) {
if (_restrict == NULL)
- return(e.fragId() * 2 + e.frag3p());
+ return(e.readId() * 2 + e.read3p());
- return(_idMap[e.fragId()] * 2 + e.frag3p());
+ return(_idMap[e.readId()] * 2 + e.read3p());
}
uint32
-ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
+ChunkGraph::countFullWidth(ReadEnd firstEnd) {
uint64 firstIdx = getIndex(firstEnd);
- assert(firstIdx < _maxFragment * 2 + 2);
+ assert(firstIdx < _maxRead * 2 + 2);
if (_pathLen[firstIdx] > 0)
return _pathLen[firstIdx];
uint32 length = 0;
- std::set<FragmentEnd> seen;
- FragmentEnd lastEnd = firstEnd;
+ std::set<ReadEnd> seen;
+ ReadEnd lastEnd = firstEnd;
uint64 lastIdx = firstIdx;
- // Until we run off the chain, or we hit a fragment with a known length, compute the length FROM
+ // Until we run off the chain, or we hit a read with a known length, compute the length FROM
// THE START.
//
while ((lastIdx != 0) &&
@@ -161,24 +173,24 @@ ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
// Check why we stopped. Three cases:
//
- // 1) We ran out of best edges to follow -- lastEnd.fragId() == 0
- // 2) We encountered a fragment with known length -- _pathLen[lastEnd.index()] > 0
+ // 1) We ran out of best edges to follow -- lastEnd.readId() == 0
+ // 2) We encountered a read with known length -- _pathLen[lastEnd.index()] > 0
// 3) We encountered a self-loop (same condition as case 2)
//
- // To distinguish case 2 and 3, we keep a set<> of the fragments we've seen in this construction.
+ // To distinguish case 2 and 3, we keep a set<> of the reads we've seen in this construction.
// If 'lastEnd' is in that set, then we're case 3. If so, adjust every node in the cycle to have
// the same length, the length of the cycle itself.
//
- // 'lastEnd' and 'index' are the first fragment in the cycle; we've seen this one before.
+ // 'lastEnd' and 'index' are the first read in the cycle; we've seen this one before.
//
- if (lastEnd.fragId() == 0) {
+ if (lastEnd.readId() == 0) {
// Case 1. Do nothing.
;
} else if (seen.find(lastEnd) != seen.end()) {
// Case 3, a cycle.
uint32 cycleLen = length - _pathLen[lastIdx] + 1;
- FragmentEnd currEnd = lastEnd;
+ ReadEnd currEnd = lastEnd;
uint64 currIdx = lastIdx;
do {
@@ -199,7 +211,7 @@ ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
// cycle has had its length set correctly already, and we stop at either the start of the cycle,
// or at the start of any existing path.
//
- FragmentEnd currEnd = firstEnd;
+ ReadEnd currEnd = firstEnd;
uint64 currIdx = firstIdx;
while (currEnd != lastEnd) {
@@ -209,6 +221,13 @@ ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
}
+ if (lengthMax != _pathLen[firstIdx]) {
+ writeStatus("chunkGraph()-- ERROR: lengthMax %d _pathLen[] %d\n",
+ lengthMax, _pathLen[firstIdx]);
+ flushLog();
+ }
+ assert(lengthMax == _pathLen[firstIdx]);
+
if (logFileFlagSet(LOG_CHUNK_GRAPH)) {
seen.clear();
@@ -216,40 +235,39 @@ ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
currEnd = firstEnd;
currIdx = firstIdx;
- writeLog("path from %d,%d length %d:",
- firstEnd.fragId(),
- (firstEnd.frag3p()) ? 3 : 5,
- _pathLen[firstIdx]);
+ if (_chunkLog)
+ fprintf(_chunkLog, "path from %d,%d'(length=%d):",
+ firstEnd.readId(),
+ (firstEnd.read3p()) ? 3 : 5,
+ _pathLen[firstIdx]);
- while ((currEnd.fragId() != 0) &&
+ while ((currEnd.readId() != 0) &&
(seen.find(currEnd) == seen.end())) {
seen.insert(currEnd);
- if (currEnd == lastEnd)
- writeLog(" LAST");
+ if ((_chunkLog) && (currEnd == lastEnd))
+ fprintf(_chunkLog, " LAST");
- writeLog(" %d,%d(%d)",
- currEnd.fragId(),
- (currEnd.frag3p()) ? 3 : 5,
- _pathLen[currIdx]);
+ if (_chunkLog)
+ fprintf(_chunkLog, " %d,%d'(%d)",
+ currEnd.readId(),
+ (currEnd.read3p()) ? 3 : 5,
+ _pathLen[currIdx]);
currEnd = OG->followOverlap(currEnd);
currIdx = getIndex(currEnd);
}
- if (seen.find(currEnd) != seen.end())
- writeLog(" CYCLE %d,%d(%d)",
- currEnd.fragId(),
- (currEnd.frag3p()) ? 3 : 5,
+ if ((_chunkLog) && (seen.find(currEnd) != seen.end()))
+ fprintf(_chunkLog, " CYCLE %d,%d'(%d)",
+ currEnd.readId(),
+ (currEnd.read3p()) ? 3 : 5,
_pathLen[currIdx]);
- writeLog("\n");
+ if (_chunkLog)
+ fprintf(_chunkLog, "\n");
}
- if (lengthMax != _pathLen[firstIdx])
- writeLog("ERROR: lengthMax %d _pathLen[] %d\n",
- lengthMax, _pathLen[firstIdx]);
- assert(lengthMax == _pathLen[firstIdx]);
return(_pathLen[firstIdx]);
}
diff --git a/src/bogart/AS_BAT_ChunkGraph.H b/src/bogart/AS_BAT_ChunkGraph.H
index f2e7a52..7a6cd5b 100644
--- a/src/bogart/AS_BAT_ChunkGraph.H
+++ b/src/bogart/AS_BAT_ChunkGraph.H
@@ -50,12 +50,12 @@ class BestOverlapGraph;
class ChunkLength {
public:
- uint32 fragId;
+ uint32 readId;
uint32 cnt;
bool operator<(ChunkLength const that) const {
if (cnt == that.cnt)
- return(fragId < that.fragId);
+ return(readId < that.readId);
return(cnt > that.cnt);
};
};
@@ -63,30 +63,32 @@ public:
class ChunkGraph {
public:
- ChunkGraph(const char *output_prefix);
+ ChunkGraph(const char *prefix);
ChunkGraph(set<uint32> *restrict);
~ChunkGraph(void) {
delete [] _chunkLength;
};
- uint32 nextFragByChunkLength(void) {
- if (_chunkLengthIter >= _maxFragment)
+ uint32 nextReadByChunkLength(void) {
+ if (_chunkLengthIter >= _maxRead)
return(0);
- return(_chunkLength[_chunkLengthIter++].fragId);
+ return(_chunkLength[_chunkLengthIter++].readId);
};
private:
- uint64 getIndex(FragmentEnd e);
- uint32 countFullWidth(FragmentEnd firstEnd);
+ uint64 getIndex(ReadEnd e);
+ uint32 countFullWidth(ReadEnd firstEnd);
- uint64 _maxFragment;
+ FILE *_chunkLog;
- // The usual case, for a chunk graph of all fragments.
+ uint64 _maxRead;
+
+ // The usual case, for a chunk graph of all reads.
ChunkLength *_chunkLength;
uint32 _chunkLengthIter;
uint32 *_pathLen;
- // For a chunk graph of a single unitig plus some extra fragments.
+ // For a chunk graph of a single unitig plus some extra reads.
// This maps the uint32 to an index in the arrays above.
map<uint32,uint32> _idMap;
set<uint32> *_restrict;
diff --git a/src/bogart/AS_BAT_CreateUnitigs.C b/src/bogart/AS_BAT_CreateUnitigs.C
new file mode 100644
index 0000000..7b1c58e
--- /dev/null
+++ b/src/bogart/AS_BAT_CreateUnitigs.C
@@ -0,0 +1,514 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-03
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_OverlapCache.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_Unitig.H"
+#include "AS_BAT_TigVector.H"
+
+#include "AS_BAT_CreateUnitigs.H"
+
+
+
+// Break a tig at a specific position.  In converting to unitigs, the position
+// is the end of a read with an intersection.
+//
+//  _bgn == true  -> reads that begin at or after the position are in the region
+//  _bgn == false -> reads that end at or before the position are in the region
+//
+class breakPointEnd {
+public:
+  breakPointEnd(uint32 tigID, uint32 pos, bool bgn) {
+    _tigID = tigID;
+    _pos   = pos;
+    _bgn   = bgn;
+  };
+  ~breakPointEnd() {};
+
+  // Order by tig ID, then position, then _bgn (false before true).  The fields are compared one at
+  // a time; packing all three into one 64-bit word needs 65 bits and drops the top bit of _tigID.
+  bool     operator<(breakPointEnd const &that) const {
+    if (_tigID != that._tigID)  return(_tigID < that._tigID);
+    if (_pos   != that._pos)    return(_pos   < that._pos);
+    return(_bgn < that._bgn);
+  };
+
+  uint32  _tigID;   // Tig ID; the major sort key.
+  uint32  _pos;     // Position of the break.
+  bool    _bgn;     // Which reads the boundary admits; see the comment above the class.
+};
+
+
+
+
+// Duplicate a tig.  A new tig is obtained from tigs.newUnitig(false), the four status flags of
+// 'oldtig' are copied to it, and every read in oldtig->ufpath is added to it, in path order.
+Unitig *
+copyTig(TigVector &tigs, Unitig *oldtig) {
+  Unitig  *dup = tigs.newUnitig(false);
+
+  dup->_isCircular    = oldtig->_isCircular;
+  dup->_isRepeat      = oldtig->_isRepeat;
+  dup->_isBubble      = oldtig->_isBubble;
+  dup->_isUnassembled = oldtig->_isUnassembled;
+
+  for (uint32 rr=0; rr < oldtig->ufpath.size(); rr++) {
+    ufNode &read = oldtig->ufpath[rr];
+    dup->addRead(read, 0, false);
+  }
+
+  return(dup);
+}
+
+
+// Split a tig based on read ends.  Consecutive entries of BP bound a 'piece'; each read in tig->ufpath is assigned to one piece.
+//   doMove == false -- clear nMoved[], then count in nMoved[piece] the reads assigned to each piece.  No tigs are created.
+//   doMove == true  -- clear newTigs[] and lowCoord[], make newTigs[piece] with tigs.newUnitig(false) on first use, and add the reads.
+// Returns the number of ii < BP.size() with nMoved[ii] > 0.  NOTE(review): with doMove == true, nMoved is neither cleared nor
+// updated here, so the result depends on what the caller left in it -- presumably counts from an earlier doMove == false call.
+uint32
+splitTig(TigVector &tigs,
+ Unitig *tig,
+ vector<breakPointEnd> &BP,
+ Unitig **newTigs,
+ int32 *lowCoord,
+ uint32 *nMoved,
+ bool doMove) {
+ // newTigs, lowCoord and nMoved are cleared as arrays of BP.size() elements, so each must be at least that long.
+ if (doMove == true) {
+ memset(newTigs, 0, sizeof(Unitig *) * BP.size());
+ memset(lowCoord, 0, sizeof(int32) * BP.size());
+ } else {
+ memset(nMoved, 0, sizeof(uint32) * BP.size());
+ }
+ // Log the pieces.  NOTE(review): BP.size() is unsigned, so 'BP.size() - 1' wraps if BP is empty -- presumably never the case.
+ if (doMove)
+ for (uint32 tt=0; tt < BP.size() - 1; tt++)
+ writeLog("splitTig()-- piece %2u from %8u %c to %8u %c\n",
+ tt,
+ BP[tt ]._pos, BP[tt ]._bgn ? 't' : 'f',
+ BP[tt+1]._pos, BP[tt+1]._bgn ? 't' : 'f');
+
+ for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
+ ufNode &read = tig->ufpath[fi];
+ uint32 lo = read.position.min();
+ uint32 hi = read.position.max();
+
+ // Find the intervals the end points of the read fall into. Suppose we're trying to place
+ // the long read. It begins in piece 1 and ends in piece 6.
+ //
+ //
+ // [----1---][----3----]---4---[--5---]------6-----] Piece and boundary condition
+ // ------
+ // --------------------------------------
+ // -----
+ // ------
+ // ------
+ // ----
+ // -----
+ // ----------
+ //
+ // The long read can not go in piece 1, as it would span the end boundary. Piece 2 is
+ // of size zero between pieces 1 and 3, and we can place the read there. Or, we can place
+ // it in piece 6 (we prefer piece 6).
+ // bgnBP / endBP: piece holding the low / high end of the read.  finBP: the piece finally chosen.  UINT32_MAX means 'not set'.
+ uint32 bgnBP = UINT32_MAX;
+ uint32 endBP = UINT32_MAX;
+ uint32 finBP = UINT32_MAX;
+
+ // Find the pieces the end points are in.
+ // Piece tt spans BP[tt]._pos to BP[tt+1]._pos.  pb and nb are not used below; only the positions decide.
+ for (uint32 tt=0; tt < BP.size()-1; tt++) {
+ uint32 p = BP[tt ]._pos; bool pb = BP[tt ]._bgn;
+ uint32 n = BP[tt+1]._pos; bool nb = BP[tt+1]._bgn;
+
+ if ((p <= lo) && (lo < n)) // If bgn == true -- p == lo is in this region
+ bgnBP = tt;
+
+ if ((p < hi) && (hi <= n)) // If bgn == false -- hi == n is in this region
+ endBP = tt;
+ }
+ // NOTE(review): if no piece matched, bgnBP / endBP are still UINT32_MAX here and are used as indices below without a check.
+ // If both pieces are the same, we're done.
+
+ if (bgnBP == endBP) {
+ finBP = bgnBP;
+ }
+
+ // If the next BP is a bgn boundary, we can still place the read in this piece. It'll extend
+ // off the end, but we don't care.
+
+ else if (BP[bgnBP+1]._bgn == true) {
+ finBP = bgnBP;
+ }
+
+ // If not, the next boundary is an end point, and we cannot place the read in this piece.
+ // If the endBP piece doesn't have restrictions on the begin, we can place the read there.
+
+ else if (BP[endBP]._bgn == false) {
+ finBP = endBP;
+ }
+
+ // Well, shucks. No place to put the read. Search for an unbounded region between bgnBP and
+ // endBP. There must be one, because bgnBP ends with a bgn=false boundary, and endBP begins
+ // with a bgn=true boundary. If there are no intermediate boundaries, we can place the read in
+ // the middle. If there are intermediate boundaries, we'll still have some piece that is
+ // unbounded.
+
+ else {
+ for (finBP=bgnBP+1; finBP < endBP; finBP++) {
+ if ((BP[finBP ]._bgn == false) &&
+ (BP[finBP+1]._bgn == true))
+ break;
+ }
+ // finBP == endBP means the search above ran to completion without finding such a piece; log it, then fail the assert.
+ if (finBP == endBP)
+ writeLog("splitTig()-- failed to place read %u %u-%u in a region. found bgn=%u and end=%u\n",
+ read.ident, read.position.bgn, read.position.end, bgnBP, endBP);
+ assert(finBP != endBP);
+ }
+
+ // Make a new tig, if needed
+ // lowCoord[] keeps the min coordinate of the first read placed in the piece; its negation is passed to addRead() below.
+ if ((doMove == true) && (newTigs[finBP] == NULL)) {
+ writeLog("splitTig()-- new tig %u (id=%u) at read %u %u-%u\n", tigs.size(), finBP, read.ident, read.position.min(), read.position.max());
+ lowCoord[finBP] = read.position.min();
+ newTigs[finBP] = tigs.newUnitig(false);
+ }
+
+ // Now move the read, or account for moving it.
+ // (With doMove == false the read is only counted in nMoved[]; nothing is added to any tig.)
+ if (doMove) {
+ //writeLog("splitTig()-- Move read %8u %8u-%-8u to piece %2u tig %6u\n",
+ // read.ident, read.position.bgn, read.position.end, finBP, newTigs[finBP]->id());
+ newTigs[finBP]->addRead(read, -lowCoord[finBP], false);
+ }
+ else {
+ //writeLog("splitTig()-- Move read %u %u-%u to piece %u (pos=%u)\n", read.ident, read.position.bgn, read.position.end, bp, BP[finBP]._pos);
+ nMoved[finBP]++;
+ }
+ }
+
+ // Return the number of tigs created.
+ // This counts non-zero nMoved[] entries, which this call only updates when doMove == false (see the note at the top).
+ uint32 nTigsCreated = 0;
+
+ for (uint32 ii=0; ii<BP.size(); ii++)
+ if (nMoved[ii] > 0)
+ nTigsCreated++;
+
+ return(nTigsCreated);
+}
+
+
+
+static
+void
+checkRead(AssemblyGraph *AG, // Source of the forward placements (edges) of rdA.
+ TigVector &contigs, // Used to find the tig, and the index within that tig, of the reads rdA overlaps.
+ vector<breakPointEnd> &breaks, // Output: a breakPointEnd is appended for every edge that survives the filters below.
+ Unitig *tgA, ufNode *rdA, // The tig being examined and the read at one of its ends.
+ bool isFirst) { // True when rdA is the first read of tgA, false when it is the last (see the calls in createUnitigs()).
+ // Examine every forward placement of rdA: log and skip the unusable ones, record a break point for the rest.
+ for (uint32 pp=0; pp<AG->getForward(rdA->ident).size(); pp++) {
+ BestPlacement &pf = AG->getForward(rdA->ident)[pp];
+
+ // If a contained edge, we cannot split the other tig; it is correct (this read is contained in the other read).
+
+ if (pf.bestC.b_iid > 0) {
+ writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u position %d-%d CONTAINED\n",
+ rdA->ident, contigs.inUnitig(pf.bestC.b_iid), pf.bestC.b_iid, pf.placedBgn, pf.placedEnd);
+ continue;
+ }
+
+ // Decide which overlap we want to be using, based on the orientation of the read in the tig,
+ // and if it is the first or last read.
+ //
+ //                 first == true               first == false
+ //   best5    fwd == true  --------->     fwd == false <---------
+ //   best3    fwd == false <---------     fwd == true  --------->
+
+ BAToverlap best = (isFirst == rdA->position.isForward()) ? pf.best5 : pf.best3;
+
+ // If there is no overlap on the expected end, well, that's it, nothing we can do but give up.
+ // Don't bother logging if it is the internal edge (which it shouldn't ever be, because those shouldn't
+ // be in the graph, right?)
+
+ if (best.b_iid == 0) {
+ uint32 rdC = (isFirst == rdA->position.isForward()) ? pf.best3.b_iid : pf.best5.b_iid; // Grab the other edge
+ uint32 tgC = contigs.inUnitig(rdC);
+
+ if (tgC != tgA->id())
+ writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u position %d-%d WRONG_END\n",
+ rdA->ident, tgC, rdC, pf.placedBgn, pf.placedEnd);
+ continue;
+ }
+
+ // Grab the tig and read we overlap to.
+
+ Unitig *tgB = contigs[ contigs.inUnitig(best.b_iid) ];
+ ufNode *rdB = &tgB->ufpath[ contigs.ufpathIdx(best.b_iid) ];
+
+ // And find the coordinate of the break based on the orientation of rdB and the overlap.
+ // isLow is true if the read is forward and the overlap is off of its 5' end, or
+ // if the read is reverse and the overlap is off of its 3' end.  The break is at min() if isLow, else at max().
+
+ bool isLow = (rdB->position.isForward()) ? best.BEndIs5prime() : best.BEndIs3prime();
+ uint32 coord = (isLow == true) ? rdB->position.min() : rdB->position.max();
+
+ // With all that done, throw out the edge if the overlap was used to form the contig itself.
+ //
+ // We used to also throw out edges to validated repeats (pf.isRepeat == true), but those are
+ // indistinguishable from bubbles.
+
+ if (pf.isContig == true) {
+ writeLog("createUnitigs()-- read %6u edgeTo tig %5u at coordinate %8u via intersection with read %6u IS_%s\n",
+ rdA->ident, tgB->id(), coord, rdB->ident, (pf.isContig == true) ? "CONTIG" : "REPEAT"); // Always "CONTIG" here; the "REPEAT" arm is unreachable.
+ continue;
+ }
+
+ // Also chuck it out if it is to garbage.
+
+ if (tgB->_isUnassembled == true) {
+ writeLog("createUnitigs()-- read %6u edgeTo tig %5u read %6u UNASSEMBLED\n",
+ rdA->ident, tgB->id(), rdB->ident);
+ continue;
+ }
+
+ // If here, we're all golden!  Log the edge and record a break point in the tig named by the placement (pf.tigID).
+ // NOTE(review): the message below is tagged "splitThinEdge()" although it is emitted from checkRead().
+ writeLog("splitThinEdge()-- read %6u splits tig %5u at coordinate %8u via intersection with read %6u isLow %u\n",
+ rdA->ident, pf.tigID, coord, rdB->ident, isLow);
+ breaks.push_back(breakPointEnd(pf.tigID, coord, isLow));
+ }
+}
+
+
+
+//  Strip leading non-backbone reads from 'tig', then reverse-complement it so that a second call
+//  trims the other end.  'isFirst' only selects the "first"/"last " label in the log messages.
+void
+stripNonBackboneFromStart(TigVector &unitigs, Unitig *tig, bool isFirst) {
+  vector<ufNode>  ufpath;
+  uint32          nStrip = 0;
+
+  while ((nStrip < tig->ufpath.size()) &&                       //  Find the first backbone read, but
+         (RI->isBackbone(tig->ufpath[nStrip].ident) == false))  //  never walk off the end of the path.
+    nStrip++;
+  if (nStrip == tig->ufpath.size())   //  No backbone read (or no reads at all): leave the tig untouched
+    return;                           //  instead of indexing past the end of ufpath.
+
+  for (uint32 ii=0; ii<nStrip; ii++) {                          //  Hand the leading reads to registerRead(),
+    unitigs.registerRead(tig->ufpath[ii].ident);
+    writeLog("WARNING: unitig %u %s read %u is not backbone, removing.\n",
+             tig->id(), isFirst ? "first" : "last ", tig->ufpath[ii].ident);
+  }
+  for (uint32 ii=nStrip; ii<tig->ufpath.size(); ii++) {         //  and copy everything after them.
+    ufpath.push_back(tig->ufpath[ii]);
+    writeLog("SAVE unitig %u %s read %u IS backbone.\n",
+             tig->id(), isFirst ? "first" : "last ", tig->ufpath[ii].ident);
+  }
+  tig->ufpath.swap(ufpath);           //  Assign the new vector to the tig,
+  tig->cleanUp();                     //  adjust zero, find new length,
+  tig->reverseComplement();           //  rebuild the idx mappings, and reverse for the next phase.
+}
+
+
+
+//  Build 'unitigs' from 'contigs'.  The first and last read of every assembled contig are checked
+//  with checkRead() for edges into other tigs, and every accepted edge becomes a break point.  Each
+//  contig in the break point list is then split into several unitigs (splitTig()) or copied whole
+//  (copyTig()); unitigSource[unitig id] records the contig and coordinates the unitig came from.
+//  Finally, non-backbone reads are stripped from both ends of unitigs that have a backbone read.
+void
+createUnitigs(AssemblyGraph   *AG,
+              TigVector       &contigs,
+              TigVector       &unitigs,
+              vector<tigLoc>  &unitigSource) {
+  vector<breakPointEnd>   breaks;
+
+  //  Check the reads at the end of every tig for intersections to other tigs.  If the read has a
+  //  compatible overlap to the middle of some other tig, split the other tig into multiple unitigs.
+
+  writeLog("\n");
+  writeLog("----------------------------------------\n");
+  writeLog("Finding contig-end to contig-middle intersections.\n");
+
+  for (uint32 ti=0; ti<contigs.size(); ti++) {
+    Unitig  *tig = contigs[ti];
+
+    if ((tig == NULL) || (tig->_isUnassembled == true))   //  Nothing here, or the edge is FROM an
+      continue;                                            //  unassembled thing; ignore it.
+
+    //  Give this tig a pair of bogus breakpoints at the ends, just to get it in the list.  If there
+    //  are no break points, it won't be split.  These also serve as sentinels during splitting.
+
+    breaks.push_back(breakPointEnd(ti, 0,                true));    //  Add one at the start of the tig
+    breaks.push_back(breakPointEnd(ti, tig->getLength(), false));   //  And one at the end
+
+    //  Find break points in other tigs using the first and last reads.
+
+    ufNode  *fi = tig->firstRead();
+    ufNode  *li = tig->lastRead();
+
+    if (AG->getForward(fi->ident).size() + AG->getForward(li->ident).size() > 0)
+      writeLog("\ncreateUnitigs()-- tig %u len %u first read %u with %lu edges - last read %u with %lu edges\n",
+               ti, tig->getLength(),
+               fi->ident, AG->getForward(fi->ident).size(),
+               li->ident, AG->getForward(li->ident).size());
+
+    checkRead(AG, contigs, breaks, tig, fi, true);
+    checkRead(AG, contigs, breaks, tig, li, false);
+  }
+
+  //  The splitTigs function operates only on a single tig.  Sort the break points
+  //  by tig id to find all the break points for each tig.
+
+  sort(breaks.begin(), breaks.end());
+
+  writeLog("\n");
+  writeLog("createUnitigs()-- Found %u breakpoints.\n", (uint32)breaks.size());   //  size_t does not match %u
+
+  //  Allocate space for breaking tigs.  These are _vastly_ too big, but guaranteed.
+
+  vector<breakPointEnd>   BP;
+
+  Unitig **newTigs   = new Unitig * [breaks.size() + 2];  //  Plus two, because we add an extra
+  int32   *lowCoord  = new int32    [breaks.size() + 2];  //  break at the start and end
+  uint32  *nMoved    = new uint32   [breaks.size() + 2];  //  of each set.
+
+  //  Walk through the breaks, making a new vector of breaks for each tig.
+
+  uint32  ss = 0;
+  uint32  ee = 0;
+
+  while (ss < breaks.size()) {
+    Unitig  *tig = contigs[breaks[ss]._tigID];
+
+    //  Find the last break point for this tig.  (Technically, the one after the last, but...)
+
+    while ((ee < breaks.size()) && (breaks[ss]._tigID == breaks[ee]._tigID))
+      ee++;
+
+    //  Make a new vector for those break points, dropping adjacent duplicates (same _pos and _bgn).
+
+    BP.clear();
+
+    for (uint32 bb=ss; bb<ee; bb++)
+      if ((BP.size() == 0) ||
+          (BP.back()._pos != breaks[bb]._pos) ||
+          (BP.back()._bgn != breaks[bb]._bgn))
+        BP.push_back(breaks[bb]);
+
+    writeLog("\n");
+
+    if (BP.size() > 2)
+      writeLog("createUnitigs()-- contig %u found %u breakpoint%s\n",
+               tig->id(), (uint32)(BP.size()-2), (BP.size()-2 != 1) ? "s" : "");
+
+    //  Split the tig.  Copy it into the unitigs TigVector too.  The first splitTig() call only
+    //  counts the non-empty pieces (doMove == false); the second moves reads into new unitigs.
+
+    uint32  nTigs  = splitTig(contigs, tig, BP, newTigs, lowCoord, nMoved, false);
+    uint32  nSlots = nTigs;                    //  Leading newTigs[] entries that are valid below.
+
+    if (nTigs > 1) {
+      splitTig(unitigs, tig, BP, newTigs, lowCoord, nMoved, true);
+      nSlots = (uint32)(BP.size() - 1);        //  Pieces are indexed by break point; empty ones leave holes.
+      writeLog("createUnitigs()-- contig %u was split into %u unitigs, %u through %u.\n",   //  Can't use newTigs, because
+               tig->id(), nTigs, unitigs.size() - nTigs, unitigs.size() - 1);               //  there are holes in it
+    }
+    else {
+      newTigs[0]  = copyTig(unitigs, tig);
+      lowCoord[0] = 0;                         //  The counting pass does not appear to set this; a whole-contig copy starts at zero.
+      writeLog("createUnitigs()-- contig %u copied into unitig %u.\n", tig->id(), newTigs[0]->id());
+    }
+
+    //  Remember where these unitigs came from.  Every piece slot is visited: stopping after nTigs
+    //  entries would skip any piece stored beyond a hole.  (Assumes splitTig() NULLs unused slots.)
+
+    unitigSource.resize(unitigs.size() + 1);
+
+    for (uint32 tt=0; tt<nSlots; tt++) {
+      if (newTigs[tt]) {
+        uint32  id = newTigs[tt]->id();
+
+        writeLog("createUnitigs()-- piece %3u -> tig %u from contig %u %u-%u\n",
+                 tt, id, tig->id(), lowCoord[tt], lowCoord[tt] + newTigs[tt]->getLength());
+
+        unitigSource[id].cID  = tig->id();
+        unitigSource[id].cBgn = lowCoord[tt];
+        unitigSource[id].cEnd = lowCoord[tt] + newTigs[tt]->getLength();
+        unitigSource[id].uID  = id;
+      }
+    }
+
+    ss = ee;   //  Reset for the next iteration.
+  }
+
+  //  Remove non-backbone reads from the ends of unitigs.  These confound graph building because
+  //  they can be missing overlaps.
+  //
+  //  If the last read in the tig is not a backbone read, we can remove it and all reads that come
+  //  after it (because those reads are contained).
+
+  for (uint32 ti=0; ti<unitigs.size(); ti++) {
+    Unitig  *tig = unitigs[ti];
+
+    if (tig == NULL)
+      continue;
+
+    //  First, check if we have any backbone reads.  If we have none, leave it as is.
+    uint32  bbReads = 0;
+    uint32  nbReads = 0;
+
+    for (uint32 li=0; li<tig->ufpath.size(); li++) {
+      if (RI->isBackbone(tig->ufpath[li].ident) == true)
+        bbReads++;
+      else
+        nbReads++;
+    }
+
+    if (bbReads == 0)
+      continue;
+
+    //  Now remove non-backbone reads from the start of the tig, then (after the flip) from the end.
+    writeLog("unitig %u with %u reads, %u backbone and %u unplaced.\n",
+             tig->id(), (uint32)tig->ufpath.size(), bbReads, nbReads);
+
+    stripNonBackboneFromStart(unitigs, tig, true);
+    stripNonBackboneFromStart(unitigs, tig, false);
+  }
+
+  //  Cleanup.
+  delete [] newTigs;
+  delete [] lowCoord;
+  delete [] nMoved;
+}
+
diff --git a/src/bogart/AS_BAT_PopBubbles.H b/src/bogart/AS_BAT_CreateUnitigs.H
similarity index 57%
copy from src/bogart/AS_BAT_PopBubbles.H
copy to src/bogart/AS_BAT_CreateUnitigs.H
index 8ab8db6..b8a35e1 100644
--- a/src/bogart/AS_BAT_PopBubbles.H
+++ b/src/bogart/AS_BAT_CreateUnitigs.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-MAR-11
+ * Brian P. Walenz beginning on 2016-OCT-03
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,16 +23,39 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_BUBBLEPOPPING
-#define INCLUDE_AS_BAT_BUBBLEPOPPING
+#ifndef AS_BAT_CREATEUNITIGS_H
+#define AS_BAT_CREATEUNITIGS_H
-#include "AS_global.H"
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"
-#include "AS_BAT_Unitig.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
-void
-popBubbles(UnitigVector &unitigs,
- double deviationBubble);
+#include "AS_BAT_TigVector.H"
+
+
+class tigLoc {   //  Records where a unitig came from: a contig id and a begin/end span on it.
+public:
+  tigLoc() : cID(UINT32_MAX),   //  Defaults: both ids are UINT32_MAX,
+             cBgn(0),           //  both coordinates are zero.
+             cEnd(0),
+             uID(UINT32_MAX) {
+  }
+
+
+  uint32  cID;    //  id of the source contig
+  uint32  cBgn;   //  begin coordinate on the source contig
+  uint32  cEnd;   //  end coordinate on the source contig
+  uint32  uID;    //  id of the unitig itself.  Debugging.
+};
-#endif // INCLUDE_AS_BAT_BUBBLEPOPPING
+
+void
+createUnitigs(AssemblyGraph *AG, // Supplies the read placements used to find break points.
+ TigVector &contigs, // The tigs to be split or copied.
+ TigVector &unitigs, // Receives the resulting unitigs.
+ vector<tigLoc> &unitigSource); // Indexed by unitig id: the contig and span each unitig came from.
+
+#endif // AS_BAT_CREATEUNITIGS_H
diff --git a/src/bogart/AS_BAT_FragmentInfo.C b/src/bogart/AS_BAT_FragmentInfo.C
deleted file mode 100644
index 58f4347..0000000
--- a/src/bogart/AS_BAT_FragmentInfo.C
+++ /dev/null
@@ -1,179 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * This file is derived from:
- *
- * src/AS_BAT/AS_BAT_FragmentInfo.C
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2010-NOV-23 to 2013-AUG-01
- * are Copyright 2010-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2014-DEC-19 to 2015-JUN-16
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_BAT_FragmentInfo.H"
-#include "AS_BAT_Logging.H"
-
-const uint64 fiMagicNumber = 0x6f666e4967617266llu; // 'fragInfo' until it gets messed up by endianess.
-const uint64 fiVersionNumber = 2;
-
-
-FragmentInfo::FragmentInfo(gkStore *gkp,
- const char *prefix,
- uint32 minReadLen) {
-
- if (load(prefix))
- return;
-
- writeLog("FragmentInfo()-- Loading fragment information\n");
-
- if (minReadLen > 0)
- writeLog("FragmentInfo()-- Reads shorter than "F_U32" bases are forced to be singleton.\n",
- minReadLen);
-
- _numLibraries = gkp->gkStore_getNumLibraries();
- _numFragments = gkp->gkStore_getNumReads();
-
- _fragLength = new uint32 [_numFragments + 1];
- _libIID = new uint32 [_numFragments + 1];
-
- for (uint32 i=0; i<_numFragments + 1; i++) {
- _fragLength[i] = 0;
- _libIID[i] = 0;
- }
-
- uint32 numSkipped = 0;
- uint32 numLoaded = 0;
-
- for (uint32 fi=1; fi<=_numFragments; fi++) {
- gkRead *read = gkp->gkStore_getRead(fi);
-
- if (read->gkRead_sequenceLength() < minReadLen) {
- numSkipped++;
-
- } else {
- uint32 iid = read->gkRead_readID();
- uint32 lib = read->gkRead_libraryID();
-
- _fragLength[iid] = read->gkRead_sequenceLength();
- _libIID[iid] = lib;
-
- numLoaded++;
- }
-
- if (((numSkipped + numLoaded) % 10000000) == 0)
- writeLog("FragmentInfo()-- Loading fragment information: skipped:%9d active:%9d\n",
- numSkipped, numLoaded);
- }
-
- writeLog("FragmentInfo()-- Loaded %d alive reads, skipped %d short reads.\n",
- numLoaded, numSkipped);
-
- save(prefix);
-}
-
-
-
-FragmentInfo::~FragmentInfo() {
- delete [] _fragLength;
- delete [] _libIID;
-}
-
-
-
-void
-FragmentInfo::save(const char *prefix) {
- char name[FILENAME_MAX];
-
- sprintf(name, "%s.fragmentInfo", prefix);
-
- errno = 0;
- FILE *file = fopen(name, "w");
- if (errno) {
- writeLog("FragmentInfo()-- Failed to open '%s' for writing: %s\n", name, strerror(errno));
- writeLog("FragmentInfo()-- Will not save fragment information to cache.\n");
- return;
- }
-
- writeLog("FragmentInfo()-- Saving fragment information to cache '%s'\n", name);
-
- AS_UTL_safeWrite(file, &fiMagicNumber, "fragmentInformationMagicNumber", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &fiVersionNumber, "fragmentInformationMagicNumber", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &_numFragments, "fragmentInformationNumFrgs", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &_numLibraries, "fragmentInformationNumLibs", sizeof(uint32), 1);
-
- AS_UTL_safeWrite(file, _fragLength, "fragmentInformationFragLen", sizeof(uint32), _numFragments + 1);
- AS_UTL_safeWrite(file, _libIID, "fragmentInformationLibIID", sizeof(uint32), _numFragments + 1);
-
- fclose(file);
-}
-
-
-bool
-FragmentInfo::load(const char *prefix) {
- char name[FILENAME_MAX];
-
- sprintf(name, "%s.fragmentInfo", prefix);
-
- errno = 0;
- FILE *file = fopen(name, "r");
- if (errno)
- return(false);
-
- uint64 magicNumber = 0;
- uint64 versionNumber = 0;
-
- AS_UTL_safeRead(file, &magicNumber, "fragmentInformationMagicNumber", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &versionNumber, "fragmentInformationVersionNumber", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &_numFragments, "fragmentInformationNumFrgs", sizeof(uint32), 1);
- AS_UTL_safeRead(file, &_numLibraries, "fragmentInformationNumLibs", sizeof(uint32), 1);
-
- if (magicNumber != fiMagicNumber) {
- writeLog("FragmentInfo()-- File '%s' is not a fragment info; cannot load.\n", name);
- fclose(file);
- return(false);
- }
- if (versionNumber != fiVersionNumber) {
- writeLog("FragmentInfo()-- File '%s' is version "F_U64", I can only read version "F_U64"; cannot load.\n",
- name, versionNumber, fiVersionNumber);
- fclose(file);
- return(false);
- }
-
- writeLog("FragmentInfo()-- Loading fragment information for "F_U32" fragments and "F_U32" libraries from cache '%s'\n",
- _numFragments, _numLibraries, name);
-
- _fragLength = new uint32 [_numFragments + 1];
- _libIID = new uint32 [_numFragments + 1];
-
- AS_UTL_safeRead(file, _fragLength, "fragmentInformationFragLen", sizeof(uint32), _numFragments + 1);
- AS_UTL_safeRead(file, _libIID, "fragmentInformationLibIID", sizeof(uint32), _numFragments + 1);
-
- fclose(file);
-
- return(true);
-}
diff --git a/src/bogart/AS_BAT_Instrumentation.C b/src/bogart/AS_BAT_Instrumentation.C
index 41775ed..00c66dc 100644
--- a/src/bogart/AS_BAT_Instrumentation.C
+++ b/src/bogart/AS_BAT_Instrumentation.C
@@ -35,7 +35,7 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
@@ -49,19 +49,19 @@
// Will fail if a read is in unitig 0, or if a read isn't in a unitig.
void
-checkUnitigMembership(UnitigVector &unitigs) {
- uint32 *inUnitig = new uint32 [FI->numFragments()+1];
+checkUnitigMembership(TigVector &tigs) {
+ uint32 *inUnitig = new uint32 [RI->numReads()+1];
uint32 noUnitig = 0xffffffff;
// All reads start of not placed in a unitig.
- for (uint32 i=0; i<FI->numFragments()+1; i++)
+ for (uint32 i=0; i<RI->numReads()+1; i++)
inUnitig[i] = noUnitig;
- // Over all unitigs, remember where each read is.
+ // Over all tigs, remember where each read is.
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *tig = tigs[ti];
int32 len = 0;
if (tig == NULL)
@@ -70,15 +70,15 @@ checkUnitigMembership(UnitigVector &unitigs) {
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
- if (frg->ident > FI->numFragments())
+ if (frg->ident > RI->numReads())
fprintf(stderr, "tig %u ufpath[%d] ident %u more than number of reads %u\n",
- tig->id(), fi, frg->ident, FI->numFragments());
+ tig->id(), fi, frg->ident, RI->numReads());
if (inUnitig[frg->ident] != noUnitig)
fprintf(stderr, "tig %u ufpath[%d] ident %u placed multiple times\n",
tig->id(), fi, frg->ident);
- assert(frg->ident <= FI->numFragments()); // Can't be out of range.
+ assert(frg->ident <= RI->numReads()); // Can't be out of range.
assert(inUnitig[frg->ident] == noUnitig); // Read must be not placed yet.
inUnitig[frg->ident] = ti;
@@ -87,8 +87,8 @@ checkUnitigMembership(UnitigVector &unitigs) {
// Find any read not placed in a unitig.
- for (uint32 i=0; i<FI->numFragments()+1; i++) {
- if (FI->fragmentLength(i) == 0) // Deleted read.
+ for (uint32 i=0; i<RI->numReads()+1; i++) {
+ if (RI->readLength(i) == 0) // Deleted read.
continue;
assert(inUnitig[i] != 0); // There shouldn't be a unitig 0.
@@ -109,11 +109,11 @@ checkUnitigMembership(UnitigVector &unitigs) {
// 4) at least fraction F of the unitig is below read depth D (F=1.0, D=2)
//
void
-classifyUnitigsAsUnassembled(UnitigVector &unitigs,
- uint32 fewReadsNumber,
- uint32 tooShortLength,
- double spanFraction,
- double lowcovFraction, uint32 lowcovDepth) {
+classifyTigsAsUnassembled(TigVector &tigs,
+ uint32 fewReadsNumber,
+ uint32 tooShortLength,
+ double spanFraction,
+ double lowcovFraction, uint32 lowcovDepth) {
uint32 nTooFew = 0;
uint32 nShort = 0;
uint32 nSingle = 0;
@@ -126,10 +126,17 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
uint64 bCoverage = 0;
uint64 bContig = 0;
- writeLog("==> FILTERING UNASSEMBLED CRUD\n");
+ char N[FILENAME_MAX];
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
+ snprintf(N, FILENAME_MAX, "%s.unassembled", getLogFilePrefix());
+
+ errno = 0;
+ FILE *F = fopen(N, "w");
+ if (errno)
+ F = NULL;
+
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *utg = tigs[ti];
if (utg == NULL)
continue;
@@ -139,7 +146,7 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
// Rule 1. Too few reads.
if (utg->ufpath.size() < fewReadsNumber) {
- writeLog("unitig %u unassembled - too few reads (%u < %u)\n", ti, utg->ufpath.size(), fewReadsNumber);
+ fprintf(F, "unitig " F_U32 " unassembled - too few reads (" F_U64 " < " F_U32 ")\n", ti, utg->ufpath.size(), fewReadsNumber);
utg->_isUnassembled = true;
nTooFew += 1;
bTooFew += utg->getLength();
@@ -149,7 +156,7 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
// Rule 2. Short.
if (utg->getLength() < tooShortLength) {
- writeLog("unitig %u unassembled - too short (%u < %u)\n", ti, utg->getLength(), tooShortLength);
+ fprintf(F, "unitig " F_U32 " unassembled - too short (" F_U32 " < " F_U32 ")\n", ti, utg->getLength(), tooShortLength);
utg->_isUnassembled = true;
nShort += 1;
bShort += utg->getLength();
@@ -165,7 +172,7 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
int frgend = MAX(frg->position.bgn, frg->position.end);
if (frgend - frgbgn > utg->getLength() * spanFraction) {
- writeLog("unitig %u unassembled - single read spans unitig (read %u %u-%u spans fraction %f > %f\n",
+ fprintf(F, "unitig " F_U32 " unassembled - single read spans unitig (read " F_U32 " " F_U32 "-" F_U32 " spans fraction %f > %f\n",
ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction);
utg->_isUnassembled = true;
nSingle += 1;
@@ -200,10 +207,12 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
else
basesHigh += ID.hi(ii) - ID.lo(ii) + 1;
+ assert(basesLow + basesHigh > 0);
+
double lowcov = (double)basesLow / (basesLow + basesHigh);
if (lowcov >= lowcovFraction) {
- writeLog("Unitig %u unassembled - low coverage (%.4f > %.4f at < %ux coverage)\n",
+ fprintf(F, "Unitig " F_U32 " unassembled - low coverage (%.4f > %.4f at < " F_U32 "x coverage)\n",
ti, lowcov, lowcovFraction, lowcovDepth);
utg->_isUnassembled = true;
nCoverage += 1;
@@ -217,16 +226,19 @@ classifyUnitigsAsUnassembled(UnitigVector &unitigs,
bContig += utg->getLength();
}
- writeLog("unassembled filter: %6u unitigs %11lu bases -- too few reads\n", nTooFew, bTooFew);
- writeLog("unassembled filter: %6u unitigs %11lu bases -- too short\n", nShort, bShort);
- writeLog("unassembled filter: %6u unitigs %11lu bases -- single spanning read\n", nSingle, bSingle);
- writeLog("unassembled filter: %6u unitigs %11lu bases -- low coverage\n", nCoverage, bCoverage);
- writeLog("unassembled filter: %6u unitigs %11lu bases -- acceptable contigs\n", nContig, bContig);
+ if (F)
+ fclose(F);
+
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n", nTooFew, bTooFew);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n", nShort, bShort);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle, bSingle);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n", nCoverage, bCoverage);
+ writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig);
}
void
-reportN50(vector<uint32> &data, char const *label, uint64 genomeSize) {
+reportN50(FILE *F, vector<uint32> &data, char const *label, uint64 genomeSize) {
uint64 cnt = data.size();
uint64 sum = 0;
uint64 tot = 0;
@@ -243,8 +255,8 @@ reportN50(vector<uint32> &data, char const *label, uint64 genomeSize) {
for (uint64 i=0; i<cnt; i++)
tot += data[i];
- writeLog("%s (%u tigs) (%u length) (%u average) (%.2fx coverage)\n",
- label, cnt, tot, tot / cnt, (double)tot / genomeSize);
+ fprintf(F, "%s (" F_U64 " tigs) (" F_U64 " length) (" F_U64 " average) (%.2fx coverage)\n",
+ label, cnt, tot, tot / cnt, (double)tot / genomeSize);
if (genomeSize > 0)
siz = genomeSize;
@@ -255,23 +267,23 @@ reportN50(vector<uint32> &data, char const *label, uint64 genomeSize) {
sum += data[i];
while (siz * nnn / 100 < sum) {
- writeLog("ng%03"F_U64P" %9"F_U64P" lg%03"F_U64P" %8"F_U64P" sum %11"F_U64P" (%s)\n",
- nnn, data[i],
- nnn, i+1,
- sum,
- label);
+ fprintf(F, "ng%03" F_U64P " %9" F_U32P " lg%03" F_U64P " %8" F_U64P " sum %11" F_U64P " (%s)\n",
+ nnn, data[i],
+ nnn, i+1,
+ sum,
+ label);
nnn += 10;
}
}
-
}
+
void
-reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) {
+reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeSize) {
- // Generate n50. Assumes unitigs have been 'classified' already.
+ // Generate n50. Assumes tigs have been 'classified' already.
vector<uint32> unassembledLength;
vector<uint32> bubbleLength;
@@ -279,8 +291,8 @@ reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint6
vector<uint32> circularLength;
vector<uint32> contigLength;
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *utg = tigs[ti];
if (utg == NULL)
continue;
@@ -306,52 +318,30 @@ reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint6
}
}
- reportN50(unassembledLength, "UNASSEMBLED", genomeSize);
- reportN50(bubbleLength, "BUBBLE", genomeSize);
- reportN50(repeatLength, "REPEAT", genomeSize);
- reportN50(circularLength, "CIRCULAR", genomeSize);
- reportN50(contigLength, "CONTIGS", genomeSize);
-
- if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
- return;
-
- // Dump to an intermediate store.
-
- char tigStorePath[FILENAME_MAX];
- sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);
-
- fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath);
+ char N[FILENAME_MAX];
- uint32 numFragsT = 0;
- uint32 numFragsP = 0;
- uint64 utgLen = 0;
+ snprintf(N, FILENAME_MAX, "%s.sizes", getLogFilePrefix());
- // Compute average frags per partition.
+ errno = 0;
+ FILE *F = fopen(N, "w");
+ if (errno == 0) {
+ reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize);
+ reportN50(F, bubbleLength, "BUBBLE", genomeSize);
+ reportN50(F, repeatLength, "REPEAT", genomeSize);
+ reportN50(F, circularLength, "CIRCULAR", genomeSize);
+ reportN50(F, contigLength, "CONTIGS", genomeSize);
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
-
- if (utg == NULL)
- continue;
-
- numFragsT += utg->ufpath.size();
-
- if (utg->ufpath.size() > 2)
- utgLen += utg->getLength();
+ fclose(F);
}
- if (utgLen < 16 * 1024 * 1024)
- numFragsP = numFragsT / 7;
- else if (utgLen < 64 * 1024 * 1024)
- numFragsP = numFragsT / 63;
- else
- numFragsP = numFragsT / 127;
+ if (logFileFlagSet(LOG_INTERMEDIATE_TIGS) == 0)
+ return;
- // Dump the unitigs to an intermediate store.
+ // Dump the tigs to an intermediate store.
- setParentAndHang(unitigs);
+ setParentAndHang(tigs);
- writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
+ writeTigsToStore(tigs, getLogFilePrefix(), "tig", false);
}
@@ -403,9 +393,9 @@ satisfiedOverlap(uint32 rdAlo, uint32 rdAhi, bool rdAfwd, uint32 rdBlo, uint32 r
// Iterate over all overlaps (but the only interface we have is by iterating
-// over all reads), and count the number of overlaps satisfied in unitigs.
+// over all reads), and count the number of overlaps satisfied in tigs.
void
-reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
+reportOverlaps(TigVector &tigs, const char *prefix, const char *name) {
olapsUsed *dd = new olapsUsed; // Dovetail overlaps to non-contained reads
olapsUsed *dc = new olapsUsed; // Dovetail overlaps to contained reads
olapsUsed *cc = new olapsUsed; // Containment overlaps
@@ -417,28 +407,28 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
memset(bb, 0, sizeof(olapsUsed));
- for (uint32 fi=0; fi<FI->numFragments()+1; fi++) {
- if (FI->fragmentLength(fi) == 0)
+ for (uint32 fi=0; fi<RI->numReads()+1; fi++) {
+ if (RI->readLength(fi) == 0)
continue;
uint32 rdAid = fi;
- uint32 tgAid = Unitig::fragIn(rdAid);
- Unitig *tgA = unitigs[tgAid];
+ uint32 tgAid = tigs.inUnitig(rdAid);
+ Unitig *tgA = tigs[tgAid];
uint32 tgAtype = getTigType(tgA);
// Best overlaps exist if the read isn't contained.
if (OG->isContained(rdAid) == false) {
BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(fi, false);
- uint32 rd5id = b5->fragId();
- uint32 tg5id = Unitig::fragIn(rd5id);
- Unitig *tg5 = unitigs[tg5id];
+ uint32 rd5id = b5->readId();
+ uint32 tg5id = tigs.inUnitig(rd5id);
+ Unitig *tg5 = tigs[tg5id];
uint32 tg5type = getTigType(tg5);
BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(fi, true);
- uint32 rd3id = b3->fragId();
- uint32 tg3id = Unitig::fragIn(rd3id);
- Unitig *tg3 = unitigs[tg3id];
+ uint32 rd3id = b3->readId();
+ uint32 tg3id = tigs.inUnitig(rd3id);
+ Unitig *tg3 = tigs[tg3id];
uint32 tg3type = getTigType(tg3);
bb->total += 2;
@@ -460,8 +450,8 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
// Otherwise, its in a tig, and we need to compare positions.
else {
- uint32 rdApos = unitigs[tgAid]->pathPosition(rdAid);
- ufNode *rdA = &unitigs[tgAid]->ufpath[rdApos];
+ uint32 rdApos = tigs[tgAid]->ufpathIdx(rdAid);
+ ufNode *rdA = &tigs[tgAid]->ufpath[rdApos];
bool rdAfwd = (rdA->position.bgn < rdA->position.end);
int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
@@ -475,13 +465,13 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
bb->doveUnsatDiff[tgAtype][tNOP]++;
} else {
- uint32 rd5pos = unitigs[tg5id]->pathPosition(rd5id);
- ufNode *rd5 = &unitigs[tg5id]->ufpath[rd5pos];
+ uint32 rd5pos = tigs[tg5id]->ufpathIdx(rd5id);
+ ufNode *rd5 = &tigs[tg5id]->ufpath[rd5pos];
bool rd5fwd = (rd5->position.bgn < rd5->position.end);
int32 rd5lo = (rd5fwd) ? rd5->position.bgn : rd5->position.end;
int32 rd5hi = (rd5fwd) ? rd5->position.end : rd5->position.bgn;
- if (satisfiedOverlap(rdAlo, rdAhi, rdAfwd, rd5lo, rd5hi, rd5fwd, (b5->frag3p() == true))) {
+ if (satisfiedOverlap(rdAlo, rdAhi, rdAfwd, rd5lo, rd5hi, rd5fwd, (b5->read3p() == true))) {
bb->doveSatSame[tgAtype]++;
} else {
bb->doveUnsatSame[tgAtype]++;
@@ -496,13 +486,13 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
bb->doveUnsatDiff[tgAtype][tNOP]++;
} else {
- uint32 rd3pos = unitigs[tg3id]->pathPosition(rd3id);
- ufNode *rd3 = &unitigs[tg3id]->ufpath[rd3pos];
+ uint32 rd3pos = tigs[tg3id]->ufpathIdx(rd3id);
+ ufNode *rd3 = &tigs[tg3id]->ufpath[rd3pos];
bool rd3fwd = (rd3->position.bgn < rd3->position.end);
int32 rd3lo = (rd3fwd) ? rd3->position.bgn : rd3->position.end;
int32 rd3hi = (rd3fwd) ? rd3->position.end : rd3->position.bgn;
- if (satisfiedOverlap(rdAlo, rdAhi, rdAfwd, rd3lo, rd3hi, rd3fwd, (b3->frag3p() == false))) {
+ if (satisfiedOverlap(rdAlo, rdAhi, rdAfwd, rd3lo, rd3hi, rd3fwd, (b3->read3p() == false))) {
bb->doveSatSame[tgAtype]++;
} else {
bb->doveUnsatSame[tgAtype]++;
@@ -515,18 +505,18 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
// For all overlaps.
uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(fi, AS_MAX_ERATE, ovlLen);
+ BAToverlap *ovl = OC->getOverlaps(fi, ovlLen);
for (uint32 oi=0; oi<ovlLen; oi++) {
uint32 rdAid = ovl[oi].a_iid;
- uint32 tgAid = Unitig::fragIn(rdAid);
- Unitig *tgA = unitigs[tgAid];
+ uint32 tgAid = tigs.inUnitig(rdAid);
+ Unitig *tgA = tigs[tgAid];
uint32 tgAtype = getTigType(tgA);
uint32 rdBid = ovl[oi].b_iid;
- uint32 tgBid = Unitig::fragIn(rdBid);
- Unitig *tgB = unitigs[tgBid];
+ uint32 tgBid = tigs.inUnitig(rdBid);
+ Unitig *tgB = tigs[tgBid];
uint32 tgBtype = getTigType(tgB);
bool isDove = ovl[oi].isDovetail();
@@ -568,14 +558,14 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
// Else, possibly satisfied. We need to check positions.
- uint32 rdApos = unitigs[tgAid]->pathPosition(rdAid);
- ufNode *rdA = &unitigs[tgAid]->ufpath[rdApos];
+ uint32 rdApos = tigs[tgAid]->ufpathIdx(rdAid);
+ ufNode *rdA = &tigs[tgAid]->ufpath[rdApos];
bool rdAfwd = (rdA->position.bgn < rdA->position.end);
int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
- uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid);
- ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos];
+ uint32 rdBpos = tigs[tgBid]->ufpathIdx(rdBid);
+ ufNode *rdB = &tigs[tgBid]->ufpath[rdBpos];
bool rdBfwd = (rdB->position.bgn < rdB->position.end);
int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
@@ -623,117 +613,134 @@ reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name) {
#define Q(X) (100.0 * (X) / (dc->total))
#define R(X) (100.0 * (X) / (cc->total))
- writeLog("--------------------------------------------------------------------------------\n");
- writeLog("OVERLAP FATE\n");
- writeLog("\n");
- writeLog("dovetail overlaps (best) "F_U64"\n", bb->total);
- writeLog("dovetail overlaps "F_U64"\n", dd->total);
- writeLog("dovetail overlaps to contained reads "F_U64"\n", dc->total);
- writeLog("containment overlaps "F_U64"\n", cc->total);
- writeLog("\n");
- writeLog("SATISFIED best edges DOVETAIL\n");
- writeLog("--------- ------------ -------\n");
- writeLog("contig %12"F_U64P" %6.2f%%\n", bb->doveSatSame[tCTG], B(bb->doveSatSame[tCTG]));
- writeLog("repeat contig %12"F_U64P" %6.2f%%\n", bb->doveSatSame[tRPT], B(bb->doveSatSame[tRPT]));
- writeLog("bubble %12"F_U64P" %6.2f%%\n", bb->doveSatSame[tBUB], B(bb->doveSatSame[tBUB]));
- writeLog("\n");
- writeLog("UNSATISFIED best edges DOVETAIL\n");
- writeLog("----------- ------------ -------\n");
- writeLog("contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatSame[tCTG], B(bb->doveUnsatSame[tCTG]));
- writeLog("repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatSame[tRPT], B(bb->doveUnsatSame[tRPT]));
- writeLog("bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatSame[tBUB], B(bb->doveUnsatSame[tBUB]));
- writeLog("unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatSame[tUNA], B(bb->doveUnsatSame[tUNA]));
- writeLog("unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatSame[tUNU], B(bb->doveUnsatSame[tUNU]));
- writeLog("\n");
- writeLog("UNSATISFIED best edges DOVETAIL\n");
- writeLog("----------- ------------ -------\n");
- writeLog("contig-contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tCTG], B(bb->doveUnsatDiff[tCTG][tCTG]));
- writeLog("contig-repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tRPT], B(bb->doveUnsatDiff[tCTG][tRPT]));
- writeLog("contig-bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tBUB], B(bb->doveUnsatDiff[tCTG][tBUB]));
- writeLog("contig-unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNA], B(bb->doveUnsatDiff[tCTG][tUNA]));
- writeLog("contig-unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNU], B(bb->doveUnsatDiff[tCTG][tUNU]));
- writeLog("contig-none %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tCTG][tNOP], B(bb->doveUnsatDiff[tCTG][tNOP]));
- writeLog("\n");
-//writeLog("repeat-contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tCTG], B(bb->doveUnsatDiff[tRPT][tCTG]));
- writeLog("repeat-repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tRPT], B(bb->doveUnsatDiff[tRPT][tRPT]));
- writeLog("repeat-bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tBUB], B(bb->doveUnsatDiff[tRPT][tBUB]));
- writeLog("repeat-unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNA], B(bb->doveUnsatDiff[tRPT][tUNA]));
- writeLog("repeat-unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNU], B(bb->doveUnsatDiff[tRPT][tUNU]));
- writeLog("repeat-none %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tRPT][tNOP], B(bb->doveUnsatDiff[tRPT][tNOP]));
- writeLog("\n");
-//writeLog("bubble-contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tCTG], B(bb->doveUnsatDiff[tBUB][tCTG]));
-//writeLog("bubble-repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tRPT], B(bb->doveUnsatDiff[tBUB][tRPT]));
- writeLog("bubble-bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tBUB], B(bb->doveUnsatDiff[tBUB][tBUB]));
- writeLog("bubble-unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNA], B(bb->doveUnsatDiff[tBUB][tUNA]));
- writeLog("bubble-unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNU], B(bb->doveUnsatDiff[tBUB][tUNU]));
- writeLog("bubble-none %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tBUB][tNOP], B(bb->doveUnsatDiff[tBUB][tNOP]));
- writeLog("\n");
-//writeLog("unassembled-contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tCTG], B(bb->doveUnsatDiff[tUNA][tCTG]));
-//writeLog("unassembled-repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tRPT], B(bb->doveUnsatDiff[tUNA][tRPT]));
-//writeLog("unassembled-bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tBUB], B(bb->doveUnsatDiff[tUNA][tBUB]));
- writeLog("unassembled-unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNA], B(bb->doveUnsatDiff[tUNA][tUNA]));
- writeLog("unassembled-unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNU], B(bb->doveUnsatDiff[tUNA][tUNU]));
- writeLog("unassembled-none %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNA][tNOP], B(bb->doveUnsatDiff[tUNA][tNOP]));
- writeLog("\n");
-//writeLog("unused-contig %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tCTG], B(bb->doveUnsatDiff[tUNU][tCTG]))
-//writeLog("unused-repeat %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tRPT], B(bb->doveUnsatDiff[tUNU][tRPT]));
-//writeLog("unused-bubble %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tBUB], B(bb->doveUnsatDiff[tUNU][tBUB]));
-//writeLog("unused-unassembled %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNA], B(bb->doveUnsatDiff[tUNU][tUNA]));
- writeLog("unused-unused %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNU], B(bb->doveUnsatDiff[tUNU][tUNU]));
- writeLog("unused-none %12"F_U64P" %6.2f%%\n", bb->doveUnsatDiff[tUNU][tNOP], B(bb->doveUnsatDiff[tUNU][tNOP]));
- writeLog("\n");
- writeLog("\n");
- writeLog("\n");
- writeLog("SATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
- writeLog("--------- ------------ ------- ------------ ------- ------------ -------\n");
- writeLog("contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveSatSame[tCTG], P(dd->doveSatSame[tCTG]), dc->doveSatSame[tCTG], Q(dc->doveSatSame[tCTG]), cc->contSatSame[tCTG], R(cc->contSatSame[tCTG]));
- writeLog("repeat contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveSatSame[tRPT], P(dd->doveSatSame[tRPT]), dc->doveSatSame[tRPT], Q(dc->doveSatSame[tRPT]), cc->contSatSame[tRPT], R(cc->contSatSame[tRPT]));
- writeLog("bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveSatSame[tBUB], P(dd->doveSatSame[tBUB]), dc->doveSatSame[tBUB], Q(dc->doveSatSame[tBUB]), cc->contSatSame[tBUB], R(cc->contSatSame[tBUB]));
- writeLog("\n");
- writeLog("UNSATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
- writeLog("----------- ------------ ------- ------------ ------- ------------ -------\n");
- writeLog("contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatSame[tCTG], P(dd->doveUnsatSame[tCTG]), dc->doveUnsatSame[tCTG], Q(dc->doveUnsatSame[tCTG]), cc->contUnsatSame[tCTG], R(cc->contUnsatSame[tCTG]));
- writeLog("repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatSame[tRPT], P(dd->doveUnsatSame[tRPT]), dc->doveUnsatSame[tRPT], Q(dc->doveUnsatSame[tRPT]), cc->contUnsatSame[tRPT], R(cc->contUnsatSame[tRPT]));
- writeLog("bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatSame[tBUB], P(dd->doveUnsatSame[tBUB]), dc->doveUnsatSame[tBUB], Q(dc->doveUnsatSame[tBUB]), cc->contUnsatSame[tBUB], R(cc->contUnsatSame[tBUB]));
- writeLog("unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatSame[tUNA], P(dd->doveUnsatSame[tUNA]), dc->doveUnsatSame[tUNA], Q(dc->doveUnsatSame[tUNA]), cc->contUnsatSame[tUNA], R(cc->contUnsatSame[tUNA]));
- writeLog("unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatSame[tUNU], P(dd->doveUnsatSame[tUNU]), dc->doveUnsatSame[tUNU], Q(dc->doveUnsatSame[tUNU]), cc->contUnsatSame[tUNU], R(cc->contUnsatSame[tUNU]));
- writeLog("\n");
- writeLog("UNSATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
- writeLog("----------- ------------ ------- ------------ ------- ------------ -------\n");
- writeLog("contig-contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tCTG][tCTG], P(dd->doveUnsatDiff[tCTG][tCTG]), dc->doveUnsatDiff[tCTG][tCTG], Q(dc->doveUnsatDiff[tCTG][tCTG]), cc->contUnsatDiff[tCTG][tCTG], R(cc->contUnsatDiff[tCTG][tCTG]));
- writeLog("contig-repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tCTG][tRPT], P(dd->doveUnsatDiff[tCTG][tRPT]), dc->doveUnsatDiff[tCTG][tRPT], Q(dc->doveUnsatDiff[tCTG][tRPT]), cc->contUnsatDiff[tCTG][tRPT], R(cc->contUnsatDiff[tCTG][tRPT]));
- writeLog("contig-bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tCTG][tBUB], P(dd->doveUnsatDiff[tCTG][tBUB]), dc->doveUnsatDiff[tCTG][tBUB], Q(dc->doveUnsatDiff[tCTG][tBUB]), cc->contUnsatDiff[tCTG][tBUB], R(cc->contUnsatDiff[tCTG][tBUB]));
- writeLog("contig-unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNA], P(dd->doveUnsatDiff[tCTG][tUNA]), dc->doveUnsatDiff[tCTG][tUNA], Q(dc->doveUnsatDiff[tCTG][tUNA]), cc->contUnsatDiff[tCTG][tUNA], R(cc->contUnsatDiff[tCTG][tUNA]));
- writeLog("contig-unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNU], P(dd->doveUnsatDiff[tCTG][tUNU]), dc->doveUnsatDiff[tCTG][tUNU], Q(dc->doveUnsatDiff[tCTG][tUNU]), cc->contUnsatDiff[tCTG][tUNU], R(cc->contUnsatDiff[tCTG][tUNU]));
- writeLog("\n");
-//writeLog("repeat-contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tRPT][tCTG], P(dd->doveUnsatDiff[tRPT][tCTG]), dc->doveUnsatDiff[tRPT][tCTG], Q(dc->doveUnsatDiff[tRPT][tCTG]), cc->contUnsatDiff[tRPT][tCTG], R(cc->contUnsatDiff[tRPT][tCTG]));
- writeLog("repeat-repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tRPT][tRPT], P(dd->doveUnsatDiff[tRPT][tRPT]), dc->doveUnsatDiff[tRPT][tRPT], Q(dc->doveUnsatDiff[tRPT][tRPT]), cc->contUnsatDiff[tRPT][tRPT], R(cc->contUnsatDiff[tRPT][tRPT]));
- writeLog("repeat-bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tRPT][tBUB], P(dd->doveUnsatDiff[tRPT][tBUB]), dc->doveUnsatDiff[tRPT][tBUB], Q(dc->doveUnsatDiff[tRPT][tBUB]), cc->contUnsatDiff[tRPT][tBUB], R(cc->contUnsatDiff[tRPT][tBUB]));
- writeLog("repeat-unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNA], P(dd->doveUnsatDiff[tRPT][tUNA]), dc->doveUnsatDiff[tRPT][tUNA], Q(dc->doveUnsatDiff[tRPT][tUNA]), cc->contUnsatDiff[tRPT][tUNA], R(cc->contUnsatDiff[tRPT][tUNA]));
- writeLog("repeat-unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNU], P(dd->doveUnsatDiff[tRPT][tUNU]), dc->doveUnsatDiff[tRPT][tUNU], Q(dc->doveUnsatDiff[tRPT][tUNU]), cc->contUnsatDiff[tRPT][tUNU], R(cc->contUnsatDiff[tRPT][tUNU]));
- writeLog("\n");
-//writeLog("bubble-contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tBUB][tCTG], P(dd->doveUnsatDiff[tBUB][tCTG]), dc->doveUnsatDiff[tBUB][tCTG], Q(dc->doveUnsatDiff[tBUB][tCTG]), cc->contUnsatDiff[tBUB][tCTG], R(cc->contUnsatDiff[tBUB][tCTG]));
-//writeLog("bubble-repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tBUB][tRPT], P(dd->doveUnsatDiff[tBUB][tRPT]), dc->doveUnsatDiff[tBUB][tRPT], Q(dc->doveUnsatDiff[tBUB][tRPT]), cc->contUnsatDiff[tBUB][tRPT], R(cc->contUnsatDiff[tBUB][tRPT]));
- writeLog("bubble-bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tBUB][tBUB], P(dd->doveUnsatDiff[tBUB][tBUB]), dc->doveUnsatDiff[tBUB][tBUB], Q(dc->doveUnsatDiff[tBUB][tBUB]), cc->contUnsatDiff[tBUB][tBUB], R(cc->contUnsatDiff[tBUB][tBUB]));
- writeLog("bubble-unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNA], P(dd->doveUnsatDiff[tBUB][tUNA]), dc->doveUnsatDiff[tBUB][tUNA], Q(dc->doveUnsatDiff[tBUB][tUNA]), cc->contUnsatDiff[tBUB][tUNA], R(cc->contUnsatDiff[tBUB][tUNA]));
- writeLog("bubble-unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNU], P(dd->doveUnsatDiff[tBUB][tUNU]), dc->doveUnsatDiff[tBUB][tUNU], Q(dc->doveUnsatDiff[tBUB][tUNU]), cc->contUnsatDiff[tBUB][tUNU], R(cc->contUnsatDiff[tBUB][tUNU]));
- writeLog("\n");
-//writeLog("unassembled-contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNA][tCTG], P(dd->doveUnsatDiff[tUNA][tCTG]), dc->doveUnsatDiff[tUNA][tCTG], Q(dc->doveUnsatDiff[tUNA][tCTG]), cc->contUnsatDiff[tUNA][tCTG], R(cc->contUnsatDiff[tUNA][tCTG]));
-//writeLog("unassembled-repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNA][tRPT], P(dd->doveUnsatDiff[tUNA][tRPT]), dc->doveUnsatDiff[tUNA][tRPT], Q(dc->doveUnsatDiff[tUNA][tRPT]), cc->contUnsatDiff[tUNA][tRPT], R(cc->contUnsatDiff[tUNA][tRPT]));
-//writeLog("unassembled-bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNA][tBUB], P(dd->doveUnsatDiff[tUNA][tBUB]), dc->doveUnsatDiff[tUNA][tBUB], Q(dc->doveUnsatDiff[tUNA][tBUB]), cc->contUnsatDiff[tUNA][tBUB], R(cc->contUnsatDiff[tUNA][tBUB]));
- writeLog("unassembled-unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNA], P(dd->doveUnsatDiff[tUNA][tUNA]), dc->doveUnsatDiff[tUNA][tUNA], Q(dc->doveUnsatDiff[tUNA][tUNA]), cc->contUnsatDiff[tUNA][tUNA], R(cc->contUnsatDiff[tUNA][tUNA]));
- writeLog("unassembled-unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNU], P(dd->doveUnsatDiff[tUNA][tUNU]), dc->doveUnsatDiff[tUNA][tUNU], Q(dc->doveUnsatDiff[tUNA][tUNU]), cc->contUnsatDiff[tUNA][tUNU], R(cc->contUnsatDiff[tUNA][tUNU]));
- writeLog("\n");
-//writeLog("unused-contig %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNU][tCTG], P(dd->doveUnsatDiff[tUNU][tCTG]), dc->doveUnsatDiff[tUNU][tCTG], Q(dc->doveUnsatDiff[tUNU][tCTG]), cc->contUnsatDiff[tUNU][tCTG], R(cc->contUnsatDiff[tUNU][tCTG]));
-//writeLog("unused-repeat %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNU][tRPT], P(dd->doveUnsatDiff[tUNU][tRPT]), dc->doveUnsatDiff[tUNU][tRPT], Q(dc->doveUnsatDiff[tUNU][tRPT]), cc->contUnsatDiff[tUNU][tRPT], R(cc->contUnsatDiff[tUNU][tRPT]));
-//writeLog("unused-bubble %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNU][tBUB], P(dd->doveUnsatDiff[tUNU][tBUB]), dc->doveUnsatDiff[tUNU][tBUB], Q(dc->doveUnsatDiff[tUNU][tBUB]), cc->contUnsatDiff[tUNU][tBUB], R(cc->contUnsatDiff[tUNU][tBUB]));
-//writeLog("unused-unassembled %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNA], P(dd->doveUnsatDiff[tUNU][tUNA]), dc->doveUnsatDiff[tUNU][tUNA], Q(dc->doveUnsatDiff[tUNU][tUNA]), cc->contUnsatDiff[tUNU][tUNA], R(cc->contUnsatDiff[tUNU][tUNA]));
- writeLog("unused-unused %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%% %12"F_U64P" %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNU], P(dd->doveUnsatDiff[tUNU][tUNU]), dc->doveUnsatDiff[tUNU][tUNU], Q(dc->doveUnsatDiff[tUNU][tUNU]), cc->contUnsatDiff[tUNU][tUNU], R(cc->contUnsatDiff[tUNU][tUNU]));
- writeLog("\n");
- writeLog("\n");
+ char N[FILENAME_MAX];
+
+ snprintf(N, FILENAME_MAX, "%s.overlaps", getLogFilePrefix());
+
+ errno = 0;
+ FILE *F = fopen(N, "w");
+ if (errno)
+ return;
+
+ fprintf(F, "=====================================\n");
+ fprintf(F, "OVERLAP COUNTS\n");
+ fprintf(F, "\n");
+ fprintf(F, "dovetail overlaps (best) " F_U64 "\n", bb->total);
+ fprintf(F, "dovetail overlaps " F_U64 "\n", dd->total);
+ fprintf(F, "dovetail overlaps to contained reads " F_U64 "\n", dc->total);
+ fprintf(F, "containment overlaps " F_U64 "\n", cc->total);
+ fprintf(F, "\n");
+ fprintf(F, "=====================================\n");
+ fprintf(F, "BEST EDGE OVERLAP FATE\n");
+ fprintf(F, "\n");
+ fprintf(F, "SATISFIED best edges DOVETAIL\n");
+ fprintf(F, "--------- ------------ -------\n");
+ fprintf(F, "same-contig %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tCTG], B(bb->doveSatSame[tCTG]));
+ fprintf(F, "same-repeat %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tRPT], B(bb->doveSatSame[tRPT]));
+ fprintf(F, "same-bubble %12" F_U64P " %6.2f%%\n", bb->doveSatSame[tBUB], B(bb->doveSatSame[tBUB]));
+ fprintf(F, "\n");
+ fprintf(F, "UNSATISFIED best edges DOVETAIL\n");
+ fprintf(F, "----------- ------------ -------\n");
+ fprintf(F, "same-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tCTG], B(bb->doveUnsatSame[tCTG]));
+ fprintf(F, "same-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tRPT], B(bb->doveUnsatSame[tRPT]));
+ fprintf(F, "same-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tBUB], B(bb->doveUnsatSame[tBUB]));
+ fprintf(F, "same-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tUNA], B(bb->doveUnsatSame[tUNA]));
+ fprintf(F, "same-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatSame[tUNU], B(bb->doveUnsatSame[tUNU]));
+ fprintf(F, "\n");
+ fprintf(F, "UNSATISFIED best edges DOVETAIL\n");
+ fprintf(F, "----------- ------------ -------\n");
+ fprintf(F, "contig-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tCTG], B(bb->doveUnsatDiff[tCTG][tCTG]));
+ fprintf(F, "contig-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tRPT], B(bb->doveUnsatDiff[tCTG][tRPT]));
+ fprintf(F, "contig-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tBUB], B(bb->doveUnsatDiff[tCTG][tBUB]));
+ fprintf(F, "contig-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNA], B(bb->doveUnsatDiff[tCTG][tUNA]));
+ fprintf(F, "contig-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tUNU], B(bb->doveUnsatDiff[tCTG][tUNU]));
+ fprintf(F, "contig-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tCTG][tNOP], B(bb->doveUnsatDiff[tCTG][tNOP]));
+ fprintf(F, "\n");
+//fprintf(F, "repeat-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tCTG], B(bb->doveUnsatDiff[tRPT][tCTG]));
+ fprintf(F, "repeat-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tRPT], B(bb->doveUnsatDiff[tRPT][tRPT]));
+ fprintf(F, "repeat-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tBUB], B(bb->doveUnsatDiff[tRPT][tBUB]));
+ fprintf(F, "repeat-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNA], B(bb->doveUnsatDiff[tRPT][tUNA]));
+ fprintf(F, "repeat-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tUNU], B(bb->doveUnsatDiff[tRPT][tUNU]));
+ fprintf(F, "repeat-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tRPT][tNOP], B(bb->doveUnsatDiff[tRPT][tNOP]));
+ fprintf(F, "\n");
+//fprintf(F, "bubble-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tCTG], B(bb->doveUnsatDiff[tBUB][tCTG]));
+//fprintf(F, "bubble-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tRPT], B(bb->doveUnsatDiff[tBUB][tRPT]));
+ fprintf(F, "bubble-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tBUB], B(bb->doveUnsatDiff[tBUB][tBUB]));
+ fprintf(F, "bubble-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNA], B(bb->doveUnsatDiff[tBUB][tUNA]));
+ fprintf(F, "bubble-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tUNU], B(bb->doveUnsatDiff[tBUB][tUNU]));
+ fprintf(F, "bubble-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tBUB][tNOP], B(bb->doveUnsatDiff[tBUB][tNOP]));
+ fprintf(F, "\n");
+//fprintf(F, "unassembled-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tCTG], B(bb->doveUnsatDiff[tUNA][tCTG]));
+//fprintf(F, "unassembled-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tRPT], B(bb->doveUnsatDiff[tUNA][tRPT]));
+//fprintf(F, "unassembled-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tBUB], B(bb->doveUnsatDiff[tUNA][tBUB]));
+ fprintf(F, "unassembled-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNA], B(bb->doveUnsatDiff[tUNA][tUNA]));
+ fprintf(F, "unassembled-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tUNU], B(bb->doveUnsatDiff[tUNA][tUNU]));
+ fprintf(F, "unassembled-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNA][tNOP], B(bb->doveUnsatDiff[tUNA][tNOP]));
+ fprintf(F, "\n");
+//fprintf(F, "unused-contig %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tCTG], B(bb->doveUnsatDiff[tUNU][tCTG]))
+//fprintf(F, "unused-repeat %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tRPT], B(bb->doveUnsatDiff[tUNU][tRPT]));
+//fprintf(F, "unused-bubble %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tBUB], B(bb->doveUnsatDiff[tUNU][tBUB]));
+//fprintf(F, "unused-unassembled %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNA], B(bb->doveUnsatDiff[tUNU][tUNA]));
+ fprintf(F, "unused-unused %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tUNU], B(bb->doveUnsatDiff[tUNU][tUNU]));
+ fprintf(F, "unused-none %12" F_U64P " %6.2f%%\n", bb->doveUnsatDiff[tUNU][tNOP], B(bb->doveUnsatDiff[tUNU][tNOP]));
+ fprintf(F, "\n");
+ fprintf(F, "\n");
+ fprintf(F, "=====================================\n");
+ fprintf(F, "ALL OVERLAP FATE\n");
+ fprintf(F, "\n");
+ fprintf(F, "SATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
+ fprintf(F, "--------- ------------ ------- ------------ ------- ------------ -------\n");
+ fprintf(F, "same-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tCTG], P(dd->doveSatSame[tCTG]), dc->doveSatSame[tCTG], Q(dc->doveSatSame[tCTG]), cc->contSatSame[tCTG], R(cc->contSatSame[tCTG]));
+ fprintf(F, "same-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tRPT], P(dd->doveSatSame[tRPT]), dc->doveSatSame[tRPT], Q(dc->doveSatSame[tRPT]), cc->contSatSame[tRPT], R(cc->contSatSame[tRPT]));
+ fprintf(F, "same-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveSatSame[tBUB], P(dd->doveSatSame[tBUB]), dc->doveSatSame[tBUB], Q(dc->doveSatSame[tBUB]), cc->contSatSame[tBUB], R(cc->contSatSame[tBUB]));
+ fprintf(F, "\n");
+ fprintf(F, "UNSATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
+ fprintf(F, "----------- ------------ ------- ------------ ------- ------------ -------\n");
+ fprintf(F, "same-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tCTG], P(dd->doveUnsatSame[tCTG]), dc->doveUnsatSame[tCTG], Q(dc->doveUnsatSame[tCTG]), cc->contUnsatSame[tCTG], R(cc->contUnsatSame[tCTG]));
+ fprintf(F, "same-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tRPT], P(dd->doveUnsatSame[tRPT]), dc->doveUnsatSame[tRPT], Q(dc->doveUnsatSame[tRPT]), cc->contUnsatSame[tRPT], R(cc->contUnsatSame[tRPT]));
+ fprintf(F, "same-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tBUB], P(dd->doveUnsatSame[tBUB]), dc->doveUnsatSame[tBUB], Q(dc->doveUnsatSame[tBUB]), cc->contUnsatSame[tBUB], R(cc->contUnsatSame[tBUB]));
+ fprintf(F, "same-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tUNA], P(dd->doveUnsatSame[tUNA]), dc->doveUnsatSame[tUNA], Q(dc->doveUnsatSame[tUNA]), cc->contUnsatSame[tUNA], R(cc->contUnsatSame[tUNA]));
+ fprintf(F, "same-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatSame[tUNU], P(dd->doveUnsatSame[tUNU]), dc->doveUnsatSame[tUNU], Q(dc->doveUnsatSame[tUNU]), cc->contUnsatSame[tUNU], R(cc->contUnsatSame[tUNU]));
+ fprintf(F, "\n");
+ fprintf(F, "UNSATISFIED all overlaps DOVETAIL DOVECONT CONTAINMENT\n");
+ fprintf(F, "----------- ------------ ------- ------------ ------- ------------ -------\n");
+ fprintf(F, "contig-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tCTG], P(dd->doveUnsatDiff[tCTG][tCTG]), dc->doveUnsatDiff[tCTG][tCTG], Q(dc->doveUnsatDiff[tCTG][tCTG]), cc->contUnsatDiff[tCTG][tCTG], R(cc->contUnsatDiff[tCTG][tCTG]));
+ fprintf(F, "contig-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tRPT], P(dd->doveUnsatDiff[tCTG][tRPT]), dc->doveUnsatDiff[tCTG][tRPT], Q(dc->doveUnsatDiff[tCTG][tRPT]), cc->contUnsatDiff[tCTG][tRPT], R(cc->contUnsatDiff[tCTG][tRPT]));
+ fprintf(F, "contig-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tBUB], P(dd->doveUnsatDiff[tCTG][tBUB]), dc->doveUnsatDiff[tCTG][tBUB], Q(dc->doveUnsatDiff[tCTG][tBUB]), cc->contUnsatDiff[tCTG][tBUB], R(cc->contUnsatDiff[tCTG][tBUB]));
+ fprintf(F, "contig-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNA], P(dd->doveUnsatDiff[tCTG][tUNA]), dc->doveUnsatDiff[tCTG][tUNA], Q(dc->doveUnsatDiff[tCTG][tUNA]), cc->contUnsatDiff[tCTG][tUNA], R(cc->contUnsatDiff[tCTG][tUNA]));
+ fprintf(F, "contig-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tCTG][tUNU], P(dd->doveUnsatDiff[tCTG][tUNU]), dc->doveUnsatDiff[tCTG][tUNU], Q(dc->doveUnsatDiff[tCTG][tUNU]), cc->contUnsatDiff[tCTG][tUNU], R(cc->contUnsatDiff[tCTG][tUNU]));
+ fprintf(F, "\n");
+//fprintf(F, "repeat-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tCTG], P(dd->doveUnsatDiff[tRPT][tCTG]), dc->doveUnsatDiff[tRPT][tCTG], Q(dc->doveUnsatDiff[tRPT][tCTG]), cc->contUnsatDiff[tRPT][tCTG], R(cc->contUnsatDiff[tRPT][tCTG]));
+ fprintf(F, "repeat-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tRPT], P(dd->doveUnsatDiff[tRPT][tRPT]), dc->doveUnsatDiff[tRPT][tRPT], Q(dc->doveUnsatDiff[tRPT][tRPT]), cc->contUnsatDiff[tRPT][tRPT], R(cc->contUnsatDiff[tRPT][tRPT]));
+ fprintf(F, "repeat-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tBUB], P(dd->doveUnsatDiff[tRPT][tBUB]), dc->doveUnsatDiff[tRPT][tBUB], Q(dc->doveUnsatDiff[tRPT][tBUB]), cc->contUnsatDiff[tRPT][tBUB], R(cc->contUnsatDiff[tRPT][tBUB]));
+ fprintf(F, "repeat-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNA], P(dd->doveUnsatDiff[tRPT][tUNA]), dc->doveUnsatDiff[tRPT][tUNA], Q(dc->doveUnsatDiff[tRPT][tUNA]), cc->contUnsatDiff[tRPT][tUNA], R(cc->contUnsatDiff[tRPT][tUNA]));
+ fprintf(F, "repeat-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tRPT][tUNU], P(dd->doveUnsatDiff[tRPT][tUNU]), dc->doveUnsatDiff[tRPT][tUNU], Q(dc->doveUnsatDiff[tRPT][tUNU]), cc->contUnsatDiff[tRPT][tUNU], R(cc->contUnsatDiff[tRPT][tUNU]));
+ fprintf(F, "\n");
+//fprintf(F, "bubble-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tCTG], P(dd->doveUnsatDiff[tBUB][tCTG]), dc->doveUnsatDiff[tBUB][tCTG], Q(dc->doveUnsatDiff[tBUB][tCTG]), cc->contUnsatDiff[tBUB][tCTG], R(cc->contUnsatDiff[tBUB][tCTG]));
+//fprintf(F, "bubble-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tRPT], P(dd->doveUnsatDiff[tBUB][tRPT]), dc->doveUnsatDiff[tBUB][tRPT], Q(dc->doveUnsatDiff[tBUB][tRPT]), cc->contUnsatDiff[tBUB][tRPT], R(cc->contUnsatDiff[tBUB][tRPT]));
+ fprintf(F, "bubble-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tBUB], P(dd->doveUnsatDiff[tBUB][tBUB]), dc->doveUnsatDiff[tBUB][tBUB], Q(dc->doveUnsatDiff[tBUB][tBUB]), cc->contUnsatDiff[tBUB][tBUB], R(cc->contUnsatDiff[tBUB][tBUB]));
+ fprintf(F, "bubble-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNA], P(dd->doveUnsatDiff[tBUB][tUNA]), dc->doveUnsatDiff[tBUB][tUNA], Q(dc->doveUnsatDiff[tBUB][tUNA]), cc->contUnsatDiff[tBUB][tUNA], R(cc->contUnsatDiff[tBUB][tUNA]));
+ fprintf(F, "bubble-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tBUB][tUNU], P(dd->doveUnsatDiff[tBUB][tUNU]), dc->doveUnsatDiff[tBUB][tUNU], Q(dc->doveUnsatDiff[tBUB][tUNU]), cc->contUnsatDiff[tBUB][tUNU], R(cc->contUnsatDiff[tBUB][tUNU]));
+ fprintf(F, "\n");
+//fprintf(F, "unassembled-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tCTG], P(dd->doveUnsatDiff[tUNA][tCTG]), dc->doveUnsatDiff[tUNA][tCTG], Q(dc->doveUnsatDiff[tUNA][tCTG]), cc->contUnsatDiff[tUNA][tCTG], R(cc->contUnsatDiff[tUNA][tCTG]));
+//fprintf(F, "unassembled-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tRPT], P(dd->doveUnsatDiff[tUNA][tRPT]), dc->doveUnsatDiff[tUNA][tRPT], Q(dc->doveUnsatDiff[tUNA][tRPT]), cc->contUnsatDiff[tUNA][tRPT], R(cc->contUnsatDiff[tUNA][tRPT]));
+//fprintf(F, "unassembled-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tBUB], P(dd->doveUnsatDiff[tUNA][tBUB]), dc->doveUnsatDiff[tUNA][tBUB], Q(dc->doveUnsatDiff[tUNA][tBUB]), cc->contUnsatDiff[tUNA][tBUB], R(cc->contUnsatDiff[tUNA][tBUB]));
+ fprintf(F, "unassembled-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNA], P(dd->doveUnsatDiff[tUNA][tUNA]), dc->doveUnsatDiff[tUNA][tUNA], Q(dc->doveUnsatDiff[tUNA][tUNA]), cc->contUnsatDiff[tUNA][tUNA], R(cc->contUnsatDiff[tUNA][tUNA]));
+ fprintf(F, "unassembled-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNA][tUNU], P(dd->doveUnsatDiff[tUNA][tUNU]), dc->doveUnsatDiff[tUNA][tUNU], Q(dc->doveUnsatDiff[tUNA][tUNU]), cc->contUnsatDiff[tUNA][tUNU], R(cc->contUnsatDiff[tUNA][tUNU]));
+ fprintf(F, "\n");
+//fprintf(F, "unused-contig %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tCTG], P(dd->doveUnsatDiff[tUNU][tCTG]), dc->doveUnsatDiff[tUNU][tCTG], Q(dc->doveUnsatDiff[tUNU][tCTG]), cc->contUnsatDiff[tUNU][tCTG], R(cc->contUnsatDiff[tUNU][tCTG]));
+//fprintf(F, "unused-repeat %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tRPT], P(dd->doveUnsatDiff[tUNU][tRPT]), dc->doveUnsatDiff[tUNU][tRPT], Q(dc->doveUnsatDiff[tUNU][tRPT]), cc->contUnsatDiff[tUNU][tRPT], R(cc->contUnsatDiff[tUNU][tRPT]));
+//fprintf(F, "unused-bubble %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tBUB], P(dd->doveUnsatDiff[tUNU][tBUB]), dc->doveUnsatDiff[tUNU][tBUB], Q(dc->doveUnsatDiff[tUNU][tBUB]), cc->contUnsatDiff[tUNU][tBUB], R(cc->contUnsatDiff[tUNU][tBUB]));
+//fprintf(F, "unused-unassembled %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNA], P(dd->doveUnsatDiff[tUNU][tUNA]), dc->doveUnsatDiff[tUNU][tUNA], Q(dc->doveUnsatDiff[tUNU][tUNA]), cc->contUnsatDiff[tUNU][tUNA], R(cc->contUnsatDiff[tUNU][tUNA]));
+ fprintf(F, "unused-unused %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%% %12" F_U64P " %6.2f%%\n", dd->doveUnsatDiff[tUNU][tUNU], P(dd->doveUnsatDiff[tUNU][tUNU]), dc->doveUnsatDiff[tUNU][tUNU], Q(dc->doveUnsatDiff[tUNU][tUNU]), cc->contUnsatDiff[tUNU][tUNU], R(cc->contUnsatDiff[tUNU][tUNU]));
+ fprintf(F, "\n");
+ fprintf(F, "\n");
+
+ fclose(F);
delete dd;
delete dc;
delete cc;
+ delete bb;
}
diff --git a/src/bogart/AS_BAT_Instrumentation.H b/src/bogart/AS_BAT_Instrumentation.H
index 0db8578..3487f4c 100644
--- a/src/bogart/AS_BAT_Instrumentation.H
+++ b/src/bogart/AS_BAT_Instrumentation.H
@@ -39,14 +39,14 @@
#define INCLUDE_AS_BAT_INSTRUMENTATION
-void checkUnitigMembership(UnitigVector &unitigs);
-void reportOverlaps(UnitigVector &unitigs, const char *prefix, const char *name);
-void reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize);
+void checkUnitigMembership(TigVector &tigs);
+void reportOverlaps(TigVector &tigs, const char *prefix, const char *name);
+void reportTigs(TigVector &tigs, const char *prefix, const char *name, uint64 genomeSize);
-void classifyUnitigsAsUnassembled(UnitigVector &unitigs,
- uint32 fewReadsNumber,
- uint32 tooShortLength,
- double spanFraction,
- double lowcovFraction, uint32 lowcovDepth);
+void classifyTigsAsUnassembled(TigVector &tigs,
+ uint32 fewReadsNumber,
+ uint32 tooShortLength,
+ double spanFraction,
+ double lowcovFraction, uint32 lowcovDepth);
#endif // INCLUDE_AS_BAT_INSTRUMENTATION
diff --git a/src/bogart/AS_BAT_Logging.C b/src/bogart/AS_BAT_Logging.C
index 08aaff4..cfa2acf 100644
--- a/src/bogart/AS_BAT_Logging.C
+++ b/src/bogart/AS_BAT_Logging.C
@@ -40,10 +40,11 @@
class logFileInstance {
public:
logFileInstance() {
- file = stderr;
- name[0] = 0;
- part = 0;
- length = 0;
+ file = stderr;
+ prefix[0] = 0;
+ name[0] = 0;
+ part = 0;
+ length = 0;
};
~logFileInstance() {
if ((name[0] != 0) && (file)) {
@@ -52,16 +53,18 @@ public:
}
};
- void set(char const *prefix, int32 order, char const *label, int32 tn) {
- if (label == NULL) {
- file = stderr;
- name[0] = 0;
- part = 0;
- length = 0;
+ void set(char const *prefix_, int32 order_, char const *label_, int32 tn_) {
+ if (label_ == NULL) {
+ file = stderr;
+ prefix[0] = 0;
+ name[0] = 0;
+ part = 0;
+ length = 0;
return;
}
- sprintf(name, "%s.%03u.%s.thr%03d", prefix, order, label, tn);
+ snprintf(prefix, FILENAME_MAX, "%s.%03u.%s", prefix_, order_, label_);
+ snprintf(name, FILENAME_MAX, "%s.%03u.%s.thr%03d", prefix_, order_, label_, tn_);
};
void rotate(void) {
@@ -82,13 +85,13 @@ public:
assert(file == NULL);
assert(name[0] != 0);
- sprintf(path, "%s.num%03d.log", name, part);
+ snprintf(path, FILENAME_MAX, "%s.num%03d.log", name, part);
errno = 0;
file = fopen(path, "w");
if (errno) {
- fprintf(stderr, "setLogFile()-- Failed to open logFile '%s': %s.\n", path, strerror(errno));
- fprintf(stderr, "setLogFile()-- Will now log to stderr instead.\n");
+ writeStatus("setLogFile()-- Failed to open logFile '%s': %s.\n", path, strerror(errno));
+ writeStatus("setLogFile()-- Will now log to stderr instead.\n");
file = stderr;
}
};
@@ -97,13 +100,15 @@ public:
if ((file != NULL) && (file != stderr))
fclose(file);
- file = NULL;
- name[0] = 0;
- part = 0;
- length = 0;
+ file = NULL;
+ prefix[0] = 0;
+ name[0] = 0;
+ part = 0;
+ length = 0;
};
FILE *file;
+ char prefix[FILENAME_MAX];
char name[FILENAME_MAX];
uint32 part;
uint64 length;
@@ -120,25 +125,27 @@ uint64 logFileFlags = 0;
uint64 LOG_OVERLAP_SCORING = 0x0000000000000001; // Debug, scoring of overlaps
uint64 LOG_ALL_BEST_EDGES = 0x0000000000000002;
-uint64 LOG_CHUNK_GRAPH = 0x0000000000000004; // Report the chunk graph as we build it
-uint64 LOG_BUILD_UNITIG = 0x0000000000000008; // Report building of initial unitigs (both unitig creation and fragment placement)
-uint64 LOG_PLACE_UNPLACED = 0x0000000000000010; // Report placing of unplaced reads
-uint64 LOG_BUBBLE_DETAIL = 0x0000000000000020;
-uint64 LOG_SPLIT_DISCONTINUOUS = 0x0000000000000040; //
-uint64 LOG_INTERMEDIATE_UNITIGS = 0x0000000000000080; // At various spots, dump the current unitigs
-uint64 LOG_SET_PARENT_AND_HANG = 0x0000000000000100; //
-uint64 LOG_STDERR = 0x0000000000000200; // Write ALL logging to stderr, not the files.
-
-uint64 LOG_PLACE_FRAG = 0x8000000000000000; // Internal use only.
+uint64 LOG_ERROR_PROFILES = 0x0000000000000004;
+uint64 LOG_CHUNK_GRAPH = 0x0000000000000008; // Report the chunk graph as we build it
+uint64 LOG_BUILD_UNITIG = 0x0000000000000010; // Report building of initial tigs (both unitig creation and read placement)
+uint64 LOG_PLACE_UNPLACED = 0x0000000000000020; // Report placing of unplaced reads
+uint64 LOG_BUBBLE_DETAIL = 0x0000000000000040;
+uint64 LOG_SPLIT_DISCONTINUOUS = 0x0000000000000080; //
+uint64 LOG_INTERMEDIATE_TIGS = 0x0000000000000100; // At various spots, dump the current tigs
+uint64 LOG_SET_PARENT_AND_HANG = 0x0000000000000200; //
+uint64 LOG_STDERR = 0x0000000000000400; // Write ALL logging to stderr, not the files.
+
+uint64 LOG_PLACE_READ = 0x8000000000000000; // Internal use only.
char const *logFileFlagNames[64] = { "overlapScoring",
"allBestEdges",
+ "errorProfiles",
"chunkGraph",
"buildUnitig",
"placeUnplaced",
"bubbles",
"splitDiscontinuous", // Update made it to here, need repeats
- "intermediateUnitigs",
+ "intermediateTigs",
"setParentAndHang",
"stderr",
NULL
@@ -181,8 +188,28 @@ setLogFile(char const *prefix, char const *label) {
// File open is delayed until it is used.
- if (label != NULL)
- fprintf(stderr, "setLogFile()-- Now logging to '%s.%03d.%s'\n", prefix, logFileOrder, label);
+}
+
+
+
+char *
+getLogFilePrefix(void) {
+ return(logFileMain.prefix);
+}
+
+
+
+void
+writeStatus(char const *fmt, ...) {
+ va_list ap;
+ int32 nt = omp_get_num_threads();
+ int32 tn = omp_get_thread_num();
+
+ va_start(ap, fmt);
+
+ vfprintf(stderr, fmt, ap);
+
+ va_end(ap);
}
@@ -201,7 +228,7 @@ writeLog(char const *fmt, ...) {
if ((lf->name[0] != 0) &&
(lf->length > maxLength)) {
- fprintf(lf->file, "logFile()-- size "F_U64" exceeds limit of "F_U64"; rotate to new file.\n",
+ fprintf(lf->file, "logFile()-- size " F_U64 " exceeds limit of " F_U64 "; rotate to new file.\n",
lf->length, maxLength);
lf->rotate();
}
diff --git a/src/bogart/AS_BAT_Logging.H b/src/bogart/AS_BAT_Logging.H
index 246e873..ef51f35 100644
--- a/src/bogart/AS_BAT_Logging.H
+++ b/src/bogart/AS_BAT_Logging.H
@@ -54,9 +54,13 @@
#include <omp.h>
#endif
-void setLogFile(char const *prefix, char const *name);
-void writeLog(char const *fmt, ...);
-void flushLog(void);
+void setLogFile(char const *prefix, char const *name);
+char *getLogFilePrefix(void);
+
+void writeStatus(char const *fmt, ...);
+void writeLog(char const *fmt, ...);
+
+void flushLog(void);
#define logFileFlagSet(L) ((logFileFlags & L) == L)
@@ -65,16 +69,17 @@ extern uint32 logFileOrder; // Used debug tigStore dumps, etc
extern uint64 LOG_OVERLAP_SCORING;
extern uint64 LOG_ALL_BEST_EDGES;
+extern uint64 LOG_ERROR_PROFILES;
extern uint64 LOG_CHUNK_GRAPH;
extern uint64 LOG_BUILD_UNITIG;
extern uint64 LOG_PLACE_UNPLACED;
extern uint64 LOG_BUBBLE_DETAIL;
extern uint64 LOG_SPLIT_DISCONTINUOUS;
-extern uint64 LOG_INTERMEDIATE_UNITIGS;
+extern uint64 LOG_INTERMEDIATE_TIGS;
extern uint64 LOG_SET_PARENT_AND_HANG;
extern uint64 LOG_STDERR;
-extern uint64 LOG_PLACE_FRAG;
+extern uint64 LOG_PLACE_READ;
extern char const *logFileFlagNames[64];
diff --git a/src/bogart/AS_BAT_MarkRepeatReads.C b/src/bogart/AS_BAT_MarkRepeatReads.C
index 7dc5abe..6c70c03 100644
--- a/src/bogart/AS_BAT_MarkRepeatReads.C
+++ b/src/bogart/AS_BAT_MarkRepeatReads.C
@@ -23,9 +23,10 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
#include "AS_BAT_Logging.H"
#include "AS_BAT_Unitig.H"
@@ -45,6 +46,8 @@ int32 REPEAT_OVERLAP_MIN = 50;
#define REPEAT_FRACTION 0.5
+#undef OLD_ANNOTATE
+#undef SHOW_ANNOTATE
#undef SHOW_ANNOTATION_RAW // Show all overlaps used to annotate reads
#undef SHOW_ANNOTATION_RAW_FILTERED // Show all overlaps filtered by high error rate
@@ -59,20 +62,20 @@ int32 REPEAT_OVERLAP_MIN = 50;
class olapDat {
public:
- olapDat(uint32 b, uint32 e, uint32 t, uint32 r) {
+ olapDat(uint32 b, uint32 e, uint32 r, uint32 p) {
tigbgn = b;
tigend = e;
- eviTid = t;
eviRid = r;
+ eviPid = p;
};
bool operator<(const olapDat &that) const { return(tigbgn < that.tigbgn); };
- uint32 tigbgn; // Location of the overlap on this tig
- uint32 tigend; //
+ int32 tigbgn; // Location of the overlap on this tig
+ int32 tigend; //
- uint32 eviTid; // tig that the evidence read came from
uint32 eviRid; // evidence read
+ uint32 eviPid; // evidence read placeID
};
@@ -89,8 +92,7 @@ olapDatByEviRid(const olapDat &A, const olapDat &B) {
class breakPointCoords {
public:
- breakPointCoords(uint32 tigID, int32 bgn, int32 end, bool rpt=false) {
- _tigID = tigID;
+ breakPointCoords(int32 bgn, int32 end, bool rpt=false) {
_bgn = bgn;
_end = end;
_isRepeat = rpt;
@@ -102,7 +104,6 @@ public:
return(_bgn < that._bgn);
};
- uint32 _tigID;
int32 _bgn;
int32 _end;
bool _isRepeat;
@@ -124,7 +125,7 @@ olapToReadCoords(ufNode *frg,
int32 &lo, int32 &hi) {
lo = 0;
- hi = FI->fragmentLength(frg->ident);
+ hi = RI->readLength(frg->ident);
if (ahang > 0)
lo += ahang; // Positive hang!
@@ -135,8 +136,8 @@ olapToReadCoords(ufNode *frg,
assert(0 <= lo);
assert(0 <= hi);
assert(lo <= hi);
- assert(lo <= FI->fragmentLength(frg->ident));
- assert(hi <= FI->fragmentLength(frg->ident));
+ assert(lo <= RI->readLength(frg->ident));
+ assert(hi <= RI->readLength(frg->ident));
}
@@ -160,7 +161,7 @@ findUnitigCoverage(Unitig *tig,
#ifdef DUMP_READ_COVERAGE
char fn[FILENAME_MAX];
- sprintf(fn, "%08u.coverage", tig->id());
+ snprintf(fn, FILENAME_MAX, "%08u.coverage", tig->id());
FILE *F = fopen(fn, "w");
for (uint32 ii=0; ii<coverage.numberOfIntervals(); ii++)
@@ -172,21 +173,15 @@ findUnitigCoverage(Unitig *tig,
-
-
-
-
-
uint32
-splitUnitigs(UnitigVector &unitigs,
- Unitig *tig,
- vector<breakPointCoords> &BP,
- Unitig **newTigs,
- int32 *lowCoord,
- uint32 *nRepeat,
- uint32 *nUnique,
- bool doMove) {
- uint32 nTigsCreated = 0;
+splitTig(TigVector &tigs,
+ Unitig *tig,
+ vector<breakPointCoords> &BP,
+ Unitig **newTigs,
+ int32 *lowCoord,
+ uint32 *nRepeat,
+ uint32 *nUnique,
+ bool doMove) {
if (doMove == true) {
memset(newTigs, 0, sizeof(Unitig *) * BP.size());
@@ -207,7 +202,7 @@ splitUnitigs(UnitigVector &unitigs,
uint32 rid = UINT32_MAX;
bool rpt = false;
- //fprintf(stderr, "Searching for placement for read %u at %u-%u\n", frg.ident, frgbgn, frgend);
+ //fprintf(stderr, "Searching for placement for read %u at %d-%d\n", frg.ident, frgbgn, frgend);
for (uint32 ii=0; ii<BP.size(); ii++) {
int32 rgnbgn = BP[ii]._bgn;
@@ -232,9 +227,10 @@ splitUnitigs(UnitigVector &unitigs,
}
if (rid == UINT32_MAX) {
- fprintf(stderr, "Failed to place read %u at %u-%u\n", frg.ident, frgbgn, frgend);
+ fprintf(stderr, "Failed to place read %u at %d-%d\n", frg.ident, frgbgn, frgend);
for (uint32 ii=0; ii<BP.size(); ii++)
- fprintf(stderr, "Breakpoints %2u %8u-%8u repeat %u\n", ii, BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat);
+ fprintf(stderr, "BP[%3u] at %8u-%8u repeat %u\n", ii, BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat);
+ flushLog();
}
assert(rid != UINT32_MAX); // We searched all the BP's, the read had better be placed!
@@ -244,13 +240,13 @@ splitUnitigs(UnitigVector &unitigs,
if (newTigs[rid] == NULL) {
lowCoord[rid] = frgbgn;
- newTigs[rid] = unitigs.newUnitig(true); // LOG_ADDUNITIG_BREAKING
+ newTigs[rid] = tigs.newUnitig(true);
if (nRepeat[rid] > nUnique[rid])
newTigs[rid]->_isRepeat = true;
}
- newTigs[rid]->addFrag(frg, -lowCoord[rid], false); //LOG_ADDFRAG_BREAKING);
+ newTigs[rid]->addRead(frg, -lowCoord[rid], false);
}
// Else, we're not moving, just count how many reads came from repeats or uniques.
@@ -265,6 +261,8 @@ splitUnitigs(UnitigVector &unitigs,
// Return the number of tigs created.
+ uint32 nTigsCreated = 0;
+
for (uint32 ii=0; ii<BP.size(); ii++)
if (nRepeat[ii] + nUnique[ii] > 0)
nTigsCreated++;
@@ -275,802 +273,783 @@ splitUnitigs(UnitigVector &unitigs,
-
-
-// For each overlap, if the b-read is in this tig, ignore it.
-// Otherwise annotate the read with the overlap region.
+// Over all reads in tig, append to 'repeats' an olapDat (tigbgn, tigend, eviRid, eviPid)
+// for all reads that overlap into this tig.
//
-// Later, check if the two reads in this unitig overlap; if not, annotate also.
+// The current AssemblyGraph is backwards to what we need. It has, for each read, the
+// overlaps from that read that are compatible - but we need the overlaps to each
+// read that are compatible, and the two are not symmetric. A can be compatible in tig 1,
+// but the same overlapping read B can be incompatible with tig 2.
//
-void
-annotateRepeatsOnRead(UnitigVector &unitigs,
- Unitig *tgA,
- ufNode *rdA,
- double deviationRepeat,
- vector<olapDat> &repeats) {
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);
+// We can invert the graph at the start of repeat detection, making a list of
+// read B ---> overlaps to tig N position X-Y, with read A
- vector<olapDat> readOlaps; // List of valid repeat overlaps to this read
- uint32 tgAid = tgA->id();
+void
+annotateRepeatsOnRead(AssemblyGraph *AG,
+ TigVector &UNUSED(tigs),
+ Unitig *tig,
+ double UNUSED(deviationRepeat),
+ vector<olapDat> &repeats) {
- bool rdAfwd = (rdA->position.bgn < rdA->position.end);
- int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
- int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
+ // Over all reads in this tig,
+ // Grab pointers to all incoming edges.
+ // Push those locations onto our output list.
- assert(rdAlo < rdAhi);
+ for (uint32 ii=0; ii<tig->ufpath.size(); ii++) {
+ ufNode *read = &tig->ufpath[ii];
+ vector<BestReverse> &rPlace = AG->getReverse(read->ident);
- // Beacuse the read is placed with a lot of fudging in the positions, we need
- // to scale the coordinates we compute here.
- double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdA->ident);
+#if 0
+ writeLog("annotateRepeatsOnRead()-- tig %u read #%u %u at %d-%d reverse %u items\n",
+ tig->id(), ii, read->ident,
+ read->position.bgn,
+ read->position.end,
+ rPlace.size());
+#endif
- // For all overlaps to this read, save the overlap if it is not represented in this tig.
+ for (uint32 rr=0; rr<rPlace.size(); rr++) {
+ uint32 rID = rPlace[rr].readID;
+ uint32 pID = rPlace[rr].placeID;
+ BestPlacement &fPlace = AG->getForward(rID)[pID];
- uint32 nOlaps = ovlLen;
- uint32 nDiff = 0; // Overlap to different tig
- uint32 nSelf = 0; // Overlap to same tig, different location
- uint32 nConf = 0; // Overlap to same tig, confirmed good overlap
+#ifdef SHOW_ANNOTATION_RAW
+ writeLog("annotateRepeatsOnRead()-- tig %u read #%u %u place %u reverse read %u in tig %u placed %d-%d olap %d-%d%s\n",
+ tig->id(), ii, read->ident, rr,
+ rID,
+ tig->inUnitig(rID),
+ fPlace.placedBgn, fPlace.placedEnd,
+ fPlace.olapBgn, fPlace.olapEnd,
+ (fPlace.isUnitig) ? " IN_UNITIG" : "");
+#endif
- for (uint32 oi=0; oi<ovlLen; oi++) {
- uint32 rdBid = ovl[oi].b_iid;
- uint32 tgBid = Unitig::fragIn(rdBid);
+ if ((fPlace.isUnitig == true) ||
+ (fPlace.isContig == true))
+ continue;
- int32 bgn = 0; // Position in the read that
- int32 end = 0; // the overlap covers
+ repeats.push_back(olapDat(fPlace.olapBgn, fPlace.olapEnd, rID, pID));
+ }
+ }
+}
- // If the read is in a singleton, skip. These are unassembled crud.
- if ((tgBid == 0) ||
- (unitigs[tgBid] == NULL) ||
- (unitigs[tgBid]->ufpath.size() == 1))
- continue;
- // If the read is in an annotated bubble, skip.
- if (unitigs[tgBid]->_isBubble)
- continue;
- // If the overlap is to a container read, skip it.
- if ((ovl[oi].a_hang < 0) && (ovl[oi].b_hang > 0))
- continue;
+void
+mergeAnnotations(vector<olapDat> &repeatOlaps) {
+ sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);
- // If the overlap is to a contained read, skip it.
- if ((ovl[oi].a_hang > 0) && (ovl[oi].b_hang < 0))
- continue;
+#ifdef SHOW_ANNOTATE
+ for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
+ if (repeatOlaps[ii].tigbgn < 1000000)
+ writeLog("repeatOlaps[%u] %d-%d from read %u place %u RAW\n",
+ ii,
+ repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
+ repeatOlaps[ii].eviRid, repeatOlaps[ii].eviPid);
- uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid);
- ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos];
+ flushLog();
+#endif
- bool rdBfwd = (rdB->position.bgn < rdB->position.end);
- int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
- int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
+ for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) {
+ assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid);
- assert(rdBlo < rdBhi);
+ // If different evidence reads, close the destination olap, set up
+ // for a new destination.
- // If the overlap is to a read in a different tig, save it.
- if (tgBid != tgAid) {
- nDiff++;
- olapToReadCoords(rdA, ovl[oi].a_hang, ovl[oi].b_hang, bgn, end);
+ if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) {
+ dd = ss;
+ continue;
}
- // If the overlap is to a read in the same tig, but we don't overlap in the tig, save it.
- else if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) {
- nSelf++;
- olapToReadCoords(rdA, ovl[oi].a_hang, ovl[oi].b_hang, bgn, end);
- }
+ // If the destination ends before the source begins, there is no overlap between the
+ // two regions. Close dd, set up for a new dd.
- // Otherwise, the overlap is present in the tig, and can't indicate a repeat.
- else {
- nConf++;
+ if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) {
+ dd = ss;
continue;
}
- // Find the positions of the read that are covered by the overlap.
+ // Otherwise, there must be an overlap. Extend the destination region, erase the source
+ // region.
- int32 tigbgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
- int32 tigend = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
+ repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn);
+ repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend);
- assert(tigbgn < tigend);
+ repeatOlaps[ss].tigbgn = UINT32_MAX;
+ repeatOlaps[ss].tigend = UINT32_MAX;
+ repeatOlaps[ss].eviRid = UINT32_MAX;
+ repeatOlaps[ss].eviPid = UINT32_MAX;
+ }
- if (tigbgn < 0) tigbgn = 0;
- if (tigend > tgA->getLength()) tigend = tgA->getLength();
+ // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which
+ // we can then just pop off.
- // Filter overlaps that are higher error than expected.
+ sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);
- double consistent = tgA->overlapConsistentWithTig(deviationRepeat, tigbgn, tigend, ovl[oi].erate);
+ for (uint32 ii=repeatOlaps.size(); ii--; )
+ if (repeatOlaps[ii].eviRid == UINT32_MAX)
+ repeatOlaps.pop_back();
- if (consistent < REPEAT_FRACTION) {
-#ifdef SHOW_ANNOTATION_RAW_FILTERED
- writeLog("tig %6u read %7u %8u-%8u OVERLAP from tig %6u read %7u %8u-%8u at tigpos %8u-%8u erate %.6f consistent %.4f FILTERED\n",
- tgAid, rdA->ident, rdAlo, rdAhi,
- tgBid, rdBid, rdBlo, rdBhi,
- tigbgn, tigend, ovl[oi].erate, consistent);
-#endif
- continue;
- }
+ // For logging, sort by coordinate
-#ifdef SHOW_ANNOTATION_RAW
- writeLog("tig %6u read %7u %8u-%8u OVERLAP from tig %6u read %7u %8u-%8u at tigpos %8u-%8u erate %.6f consistent %.4f\n",
- tgAid, rdA->ident, rdAlo, rdAhi,
- tgBid, rdBid, rdBlo, rdBhi,
- tigbgn, tigend, ovl[oi].erate, consistent);
+ sort(repeatOlaps.begin(), repeatOlaps.end());
+
+#ifdef SHOW_ANNOTATE
+ for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
+ if (repeatOlaps[ii].tigbgn < 1000000)
+ writeLog("repeatOlaps[%d] %d-%d from tig %u read %u place %u MERGED\n",
+ ii,
+ repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
+ repeatOlaps[ii].eviRid, repeatOlaps[ii].eviPid);
#endif
+}
- readOlaps.push_back(olapDat(tigbgn, tigend, tgBid, rdBid));
- }
- // All overlaps processed. Save to the master list.
-#pragma omp critical (repeatsPushBack)
- for (uint32 rr=0; rr<readOlaps.size(); rr++)
- repeats.push_back(readOlaps[rr]);
-}
+void
+discardSpannedRepeats(Unitig *tig,
+ intervalList<int32> &tigMarksR) {
+
+ for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
+ ufNode *frg = &tig->ufpath[fi];
+ bool frgfwd = (frg->position.bgn < frg->position.end);
+ int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end;
+ int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn;
+ bool discarded = false;
+
+ for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
+ bool spanLo = false;
+ bool spanHi = false;
+
+ // The decision of 'spanned by a read' is broken into two pieces: does the read span the
+ // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned
+ // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat
+ // sequence).
+ //
+ // This is a problem at the start/end of the tig, because no read will extend past the
+ // start/end of the tig. Instead, if the repeat is contained within the first (last) read
+ // with no extension at the respective end, it is spanned.
+
+ if ((frglo == 0) && // Read at start of tig, spans off the high end
+ (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi))
+ spanLo = spanHi = true;
+
+ if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end
+ (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)))
+ spanLo = spanHi = true;
+
+ if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end
+ spanLo = true;
+
+ if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end
+ spanHi = true;
+ if (spanLo && spanHi) {
+ writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi);
+
+ tigMarksR.lo(ri) = 0;
+ tigMarksR.hi(ri) = 0;
+
+ discarded = true;
+ }
+ }
+
+ if (discarded)
+ tigMarksR.filterShort(1);
+ }
+}
void
-markRepeatReads(UnitigVector &unitigs,
- double deviationRepeat,
- uint32 confusedAbsolute,
- double confusedPercent) {
- uint32 tiLimit = unitigs.size();
- uint32 numThreads = omp_get_max_threads();
- uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
+reportThickestEdgesInRepeats(Unitig *tig,
+ intervalList<int32> &tigMarksR) {
- writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads);
+ writeLog("thickest edges to the repeat regions:\n");
- vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords
+ for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
+ uint32 t5 = UINT32_MAX, l5 = 0, t5bgn = 0, t5end = 0;
+ uint32 t3 = UINT32_MAX, l3 = 0, t3bgn = 0, t3end = 0;
- intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads
- intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR
+ for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
+ ufNode *frg = &tig->ufpath[fi];
+ bool frgfwd = (frg->position.bgn < frg->position.end);
+ int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end;
+ int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn;
+ bool discarded = false;
+ // Overlap off the 5' end of the region.
+ if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) {
+ uint32 olap = frghi - tigMarksR.lo(ri);
+ if (l5 < olap) {
+ l5 = olap;
+ t5 = fi;
+ t5bgn = frglo; // Easier than recomputing it later on...
+ t5end = frghi;
+ }
+ }
- for (uint32 ti=0; ti<tiLimit; ti++) {
- Unitig *tig = unitigs[ti];
+ // Overlap off the 3' end of the region.
+ if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) {
+ uint32 olap = tigMarksR.hi(ri) - frglo;
+ if (l3 < olap) {
+ l3 = olap;
+ t3 = fi;
+ t3bgn = frglo;
+ t3end = frghi;
+ }
+ }
- if (tig == NULL)
- continue;
+ if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) {
+ writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri),
+ frg->ident,
+ tigMarksR.lo(ri) - frglo, frglo,
+ frghi, frghi - tigMarksR.hi(ri));
+ }
+ }
- if (tig->ufpath.size() == 1)
- continue;
+ if (t5 != UINT32_MAX)
+ writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri),
+ tig->ufpath[t5].ident,
+ tigMarksR.lo(ri) - t5bgn, t5bgn,
+ t5end, t5end - tigMarksR.hi(ri));
+
+ if (t3 != UINT32_MAX)
+ writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri),
+ tig->ufpath[t3].ident,
+ tigMarksR.lo(ri) - t3bgn, t3bgn,
+ t3end, t3end - tigMarksR.hi(ri));
+ }
+}
- vector<olapDat> repeats;
- writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit);
- // Clear out all the existing marks. They're not for this tig.
+uint32 *
+findConfusedEdges(TigVector &tigs,
+ Unitig *tig,
+ intervalList<int32> &tigMarksR,
+ double confusedAbsolute,
+ double confusedPercent) {
+ uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()];
- // Analyze overlaps for each read. For each overlap to a read not in this tig, or not
- // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps.
+ memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals());
- repeatOlaps.clear();
+ for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
+ ufNode *rdA = &tig->ufpath[fi];
+ uint32 rdAid = rdA->ident;
+ bool rdAfwd = (rdA->position.bgn < rdA->position.end);
+ int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
+ int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
- uint32 fiLimit = tig->ufpath.size();
- uint32 numThreads = omp_get_max_threads();
- uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
+ double sc = (rdAhi - rdAlo) / (double)RI->readLength(rdAid);
-#pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize)
- for (uint32 fi=0; fi<fiLimit; fi++)
- annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps);
+ if ((OG->isContained(rdAid) == true) ||
+ (OG->isSuspicious(rdAid) == true))
+ continue;
- writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size());
+ for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
+ uint32 rMin = tigMarksR.lo(ri);
+ uint32 rMax = tigMarksR.hi(ri);
- // Merge marks for the same read into the largest possible.
+ if ((rdAhi < rMin) || // Read ends before the region
+ (rMax < rdAlo)) // Read starts after the region
+ continue; // -> don't care about this read!
- sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);
+ // Compute the position (in the tig) of the best overlaps.
-#ifdef SHOW_ANNOTATE
- for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
- if (repeatOlaps[ii].tigbgn < 1000000)
- writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n",
- ii,
- repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
- repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid);
-
- flushLog();
-#endif
+ int32 tig5bgn=0, tig5end=0;
+ int32 tig3bgn=0, tig3end=0;
- for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) {
- assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid);
+ // Instead of using the best edge - which might not be the edge used in the unitig -
+ // we need to scan the layout to return the previous/next dovetail
- // If different evidence reads, close the destination olap, set up
- // for a new destination.
+ // Put this in a function - what to return if no best overlap?
- if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) {
- dd = ss;
- continue;
+ BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false);
+ BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true);
+
+ // If the best edge is to a read not in this tig, there is nothing to compare against.
+ // Is this confused by default? Possibly. The unitig was constructed somehow, and that
+ // must then be the edge coming into us. We'll pick it up later.
+
+ bool b5use = true;
+ bool b3use = true;
+
+ if (b5->readId() == 0)
+ b5use = false;
+ if (b3->readId() == 0)
+ b3use = false;
+
+ if ((b5use) && (tig->inUnitig(b5->readId()) != tig->id()))
+ b5use = false;
+ if ((b3use) && (tig->inUnitig(b3->readId()) != tig->id()))
+ b3use = false;
+
+ // The best edge read is in this tig. If they don't overlap, again, nothing to compare
+ // against.
+
+ if (b5use) {
+ ufNode *rdB = &tig->ufpath[tig->ufpathIdx(b5->readId())];
+ uint32 rdBid = rdB->ident;
+ bool rdBfwd = (rdB->position.bgn < rdB->position.end);
+ int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
+ int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
+
+ if ((rdAhi < rdBlo) ||
+ (rdBhi < rdAlo))
+ b5use = false;
}
- // If the destination ends before the source begins, there is no overlap between the
- // two regions. Close dd, set up for a new dd.
+ if (b3use) {
+ ufNode *rdB = &tig->ufpath[tig->ufpathIdx(b3->readId())];
+ uint32 rdBid = rdB->ident;
+ bool rdBfwd = (rdB->position.bgn < rdB->position.end);
+ int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
+ int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
- if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) {
- dd = ss;
- continue;
+ if ((rdAhi < rdBlo) ||
+ (rdBhi < rdAlo))
+ b3use = false;
}
- // Otherwise, there must be an overlap. Extend the destination region, erase the source
- // region.
+ // If we can use this edge, compute the placement of the overlap on the unitig.
- repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn);
- repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend);
+ // Call #1;
- repeatOlaps[ss].tigbgn = UINT32_MAX;
- repeatOlaps[ss].tigend = UINT32_MAX;
- repeatOlaps[ss].eviTid = UINT32_MAX;
- repeatOlaps[ss].eviRid = UINT32_MAX;
- }
+ if (b5use) {
+ int32 bgn=0, end=0;
- // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which
- // we can then just pop off.
+ olapToReadCoords(rdA,
+ b5->ahang(), b5->bhang(),
+ bgn, end);
- sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);
+ tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
+ tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
- for (uint32 ii=repeatOlaps.size(); ii--; )
- if (repeatOlaps[ii].eviTid == UINT32_MAX)
- repeatOlaps.pop_back();
+ assert(tig5bgn < tig5end);
- // For logging, sort by coordinate
+ if (tig5bgn < 0) tig5bgn = 0;
+ if (tig5end > tig->getLength()) tig5end = tig->getLength();
+ }
- sort(repeatOlaps.begin(), repeatOlaps.end());
+ // Call #2
-#ifdef SHOW_ANNOTATE
- for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
- writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n",
- ii,
- repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
- repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid);
-#endif
+ if (b3use) {
+ int32 bgn=0, end=0;
- // Make a new set of intervals based on all the detected repeats.
+ olapToReadCoords(rdA,
+ b3->ahang(), b3->bhang(),
+ bgn, end);
- tigMarksR.clear();
+ tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
+ tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
- for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++)
- tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn);
+ assert(tig3bgn < tig3end);
- // Collapse these markings Collapse all the read markings to intervals on the unitig, merging those that overlap
- // significantly.
+ if (tig3bgn < 0) tig3bgn = 0;
+ if (tig3end > tig->getLength()) tig3end = tig->getLength();
+ }
- writeLog("Merge marks.\n");
+ // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for
+ // close overlaps on that end.
- tigMarksR.merge(REPEAT_OVERLAP_MIN);
+ uint32 len5 = 0;
+ uint32 len3 = 0;
- // Scan reads, discard any mark that is contained in a read
- //
- // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than
- // to track if it is needed.
+ if ((rMin < tig5bgn) &&
+ (tig5end < rMax) &&
+ (b5use))
+ len5 = RI->overlapLength(rdAid, b5->readId(), b5->ahang(), b5->bhang());
+ else
+ b5use = false;
- writeLog("Scan reads to discard spanned repeats.\n");
+ if ((rMin < tig3bgn) &&
+ (tig3end < rMax) &&
+ (b3use))
+ len3 = RI->overlapLength(rdAid, b3->readId(), b3->ahang(), b3->bhang());
+ else
+ b3use = false;
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *frg = &tig->ufpath[fi];
- bool frgfwd = (frg->position.bgn < frg->position.end);
- int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end;
- int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn;
- bool discarded = false;
+ double score5 = len5 * (1 - b5->erate());
+ double score3 = len3 * (1 - b3->erate());
- for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
- bool spanLo = false;
- bool spanHi = false;
+ // Neither of the best edges are in the repeat region; move to the next region and/or read.
+ if (len5 + len3 == 0)
+ continue;
- // The decision of 'spanned by a read' is broken into two pieces: does the read span the
- // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned
- // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat
- // sequence).
- //
- // This is a problem at the start/end of the tig, beacuse no read will extend past the
- // start/end of the tig. Instead, if the repeat is contained within the first (last) read
- // with no extension at the respective end, it is spanned.
+ // At least one of the best edge overlaps is in the repeat region. Scan for other edges
+ // that are of comparable length and quality.
- if ((frglo == 0) && // Read at start of tig, spans off the high end
- (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi))
- spanLo = spanHi = true;
+ uint32 ovlLen = 0;
+ BAToverlap *ovl = OC->getOverlaps(rdAid, ovlLen);
- if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end
- (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)))
- spanLo = spanHi = true;
+ for (uint32 oo=0; oo<ovlLen; oo++) {
+ uint32 rdBid = ovl[oo].b_iid;
+ uint32 tgBid = tigs.inUnitig(rdBid);
- if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end
- spanLo = true;
+ // If the read is in a singleton, skip. These are unassembled crud.
+ if ((tgBid == 0) ||
+ (tigs[tgBid] == NULL) ||
+ (tigs[tgBid]->ufpath.size() == 1))
+ continue;
- if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end
- spanHi = true;
+ // If the read is in an annotated bubble, skip.
+ if ((tigs[tgBid]->_isBubble == true) &&
+ (tigs[tgBid]->_isRepeat == false))
+ continue;
- if (spanLo && spanHi) {
- writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi);
+ // Skip if this overlap is the best we're trying to match.
+ if ((rdBid == b5->readId()) ||
+ (rdBid == b3->readId()))
+ continue;
- tigMarksR.lo(ri) = 0;
- tigMarksR.hi(ri) = 0;
+ // Skip if this overlap is crappy quality
+ if (OG->isOverlapBadQuality(ovl[oo]))
+ continue;
- discarded = true;
- }
- }
+ // Skip if the read is contained or suspicious.
+ if ((OG->isContained(rdBid) == true) ||
+ (OG->isSuspicious(rdBid) == true))
+ continue;
+ // Skip if the overlap isn't dovetail.
+ bool ovl5 = ovl[oo].AEndIs5prime();
+ bool ovl3 = ovl[oo].AEndIs3prime();
- if (discarded)
- tigMarksR.filterShort(1);
- }
+ if ((ovl5 == false) &&
+ (ovl3 == false))
+ continue;
- // Run through again, looking for the thickest overlap(s) to the remaining regions.
- // This isn't caring about the end effect noted above.
+ // Skip if we're not using this overlap
+ if ((ovl5 == true) && (b5use == false))
+ continue;
-#if 1
- writeLog("thickest edges to the repeat regions:\n");
+ if ((ovl3 == true) && (b3use == false))
+ continue;
- for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
- uint32 t5 = UINT32_MAX, l5 = 0, t5bgn = 0, t5end = 0;
- uint32 t3 = UINT32_MAX, l3 = 0, t3bgn = 0, t3end = 0;
-
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *frg = &tig->ufpath[fi];
- bool frgfwd = (frg->position.bgn < frg->position.end);
- int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end;
- int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn;
- bool discarded = false;
-
- // Overlap off the 5' end of the region.
- if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) {
- uint32 olap = frghi - tigMarksR.lo(ri);
- if (l5 < olap) {
- l5 = olap;
- t5 = fi;
- t5bgn = frglo; // Easier than recomputing it later on...
- t5end = frghi;
- }
- }
- // Overlap off the 3' end of the region.
- if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) {
- uint32 olap = tigMarksR.hi(ri) - frglo;
- if (l3 < olap) {
- l3 = olap;
- t3 = fi;
- t3bgn = frglo;
- t3end = frghi;
- }
- }
+ uint32 rdBpos = tigs[tgBid]->ufpathIdx(rdBid);
+ ufNode *rdB = &tigs[tgBid]->ufpath[rdBpos];
- if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) {
- writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri),
- frg->ident,
- tigMarksR.lo(ri) - frglo, frglo,
- frghi, frghi - tigMarksR.hi(ri));
- }
- }
+ bool rdBfwd = (rdB->position.bgn < rdB->position.end);
+ int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
+ int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
- if (t5 != UINT32_MAX)
- writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri),
- tig->ufpath[t5].ident,
- tigMarksR.lo(ri) - t5bgn, t5bgn,
- t5end, t5end - tigMarksR.hi(ri));
+ // If the overlap is to a read in a different tig, or
+ // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths.
+ // Otherwise, the overlap is present in the tig, and can't be confused.
+ if ((tgBid == tig->id()) &&
+ (rdBlo <= rdAhi) &&
+ (rdAlo <= rdBhi))
+ continue;
- if (t3 != UINT32_MAX)
- writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri),
- tig->ufpath[t3].ident,
- tigMarksR.lo(ri) - t3bgn, t3bgn,
- t3end, t3end - tigMarksR.hi(ri));
- }
-#endif
+ uint32 len = RI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
+ double score = len * (1 - ovl[oo].erate());
+ // Compute percent difference.
- // Scan reads. If a read intersects a repeat interval, and the best edge for that read
- // is entirely in the repeat region, decide if there is a near-best edge to something
- // not in this tig.
- //
- // A region with no such near-best edges is _probably_ correct.
+ double ad5 = fabs(score - score5);
+ double ad3 = fabs(score - score3);
- writeLog("search for confused edges:\n");
+ double pd5 = 200 * ad5 / (score + score5);
+ double pd3 = 200 * ad3 / (score + score3);
- uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()];
+ // Skip if this overlap is vastly worse than the best.
- memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals());
+ if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd5 > confusedPercent))) {
+ writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
+ tig->id(), rdAid, rdAlo, rdAhi,
+ rdBid,
+ b5->readId(), len5, b5->erate(), score5,
+ len, ovl[oo].erate(), score,
+ ad5, pd5);
+ continue;
+ }
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *rdA = &tig->ufpath[fi];
- uint32 rdAid = rdA->ident;
- bool rdAfwd = (rdA->position.bgn < rdA->position.end);
- int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
- int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
+ if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) {
+ writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
+ tig->id(), rdAid, rdAlo, rdAhi,
+ rdBid,
+ b3->readId(), len3, b3->erate(), score3,
+ len, ovl[oo].erate(), score,
+ ad3, pd3);
+ continue;
+ }
- double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid);
+ // Potential confusion!
+
+ if (ovl5 == true)
+ writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
+ tig->id(), rdAid, rdAlo, rdAhi,
+ rdBid,
+ b5->readId(), len5, b5->erate(), score5,
+ len, ovl[oo].erate(), score,
+ ad5, pd5);
+
+ if (ovl3 == true)
+ writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
+ tig->id(), rdAid, rdAlo, rdAhi,
+ rdBid,
+ b3->readId(), len3, b3->erate(), score3,
+ len, ovl[oo].erate(), score,
+ ad3, pd3);
+
+ isConfused[ri]++;
+ }
+ } // Over all marks (ri)
+ } // Over all reads (fi)
- if ((OG->isContained(rdAid) == true) ||
- (OG->isSuspicious(rdAid) == true))
- continue;
+ return(isConfused);
+}
- for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
- uint32 rMin = tigMarksR.lo(ri);
- uint32 rMax = tigMarksR.hi(ri);
- if ((rdAhi < rMin) || // Read ends before the region
- (rMax < rdAlo)) // Read starts after the region
- continue; // -> don't care about this read!
- // Compute the position (in the tig) of the best overlaps.
+void
+discardUnambiguousRepeats(TigVector &tigs,
+ Unitig *tig,
+ intervalList<int32> &tigMarksR,
+ double confusedAbsolute,
+ double confusedPercent) {
- int32 tig5bgn=0, tig5end=0;
- int32 tig3bgn=0, tig3end=0;
+ uint32 *isConfused = findConfusedEdges(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent);
- // Instead of using the best edge - which might not be the edge used in the unitig -
- // we need to scan the layout to return the previous/next dovetail
+ // Scan all the regions, and delete any that have no confusion.
- // Put this in a function - what to return if no best overlap?
+ bool discarded = false;
- BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false);
- BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true);
+ for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
+ if (isConfused[ri] == 0) {
+ writeLog("discard region %8d:%-8d - no confusion in best edges\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri));
- // If the best edge is to a read not in this tig, there is nothing to compare against.
- // Is this confused by default? Possibly. The unitig was constructed somehow, and that
- // must then be the edge coming into us. We'll pick it up later.
+ tigMarksR.lo(ri) = 0;
+ tigMarksR.hi(ri) = 0;
- bool b5use = true;
- bool b3use = true;
+ discarded = true;
+ }
- if (b5->fragId() == 0)
- b5use = false;
- if (b3->fragId() == 0)
- b3use = false;
+ else {
+ writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n",
+ tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]);
+ }
+ }
- if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id()))
- b5use = false;
- if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id()))
- b3use = false;
+ if (discarded)
+ tigMarksR.filterShort(1);
- // The best edge read is in this tig. If they don't overlap, again, nothing to compare
- // against.
+ delete [] isConfused;
+}
- if (b5use) {
- ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b5->fragId())];
- uint32 rdBid = rdB->ident;
- bool rdBfwd = (rdB->position.bgn < rdB->position.end);
- int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
- int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
- if ((rdAhi < rdBlo) ||
- (rdBhi < rdAlo))
- b5use = false;
- }
- if (b3use) {
- ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b3->fragId())];
- uint32 rdBid = rdB->ident;
- bool rdBfwd = (rdB->position.bgn < rdB->position.end);
- int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
- int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
+void
+mergeAdjacentRegions(Unitig *tig,
+ intervalList<int32> &tigMarksR) {
- if ((rdAhi < rdBlo) ||
- (rdBhi < rdAlo))
- b3use = false;
- }
+ // Extend, but don't extend past the end of the tig.
- // If we can use this edge, compute the placement of the overlap on the unitig.
+ for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) {
+ tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0);
+ tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength());
+ }
- // Call #1;
+ // Merge.
- if (b5use) {
- int32 bgn=0, end=0;
+ bool merged = false;
- olapToReadCoords(rdA,
- b5->ahang(), b5->bhang(),
- bgn, end);
+ for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) {
+ uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
+ uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
- tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
- tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
+ if (tigMarksR.lo(ri) <= tigMarksR.hi(ri-1)) {
+ writeLog("merge extended regions %8d:%-8d and %8d:%-8d\n",
+ tigMarksR.lo(ri-1), tigMarksR.hi(ri-1),
+ tigMarksR.lo(ri), tigMarksR.hi(ri));
- assert(tig5bgn < tig5end);
+ tigMarksR.lo(ri) = tigMarksR.lo(ri-1);
- if (tig5bgn < 0) tig5bgn = 0;
- if (tig5end > tig->getLength()) tig5end = tig->getLength();
- }
+ tigMarksR.lo(ri-1) = 0; // CRITICAL to delete the ri-1 interval (and not ri) because the next
+ tigMarksR.hi(ri-1) = 0; // iteration will be using ri (as its ri-1). ri-1 here is never seen again.
- // Call #2
+ merged = true;
+ }
+ }
- if (b3use) {
- int32 bgn=0, end=0;
+ if (merged)
+ tigMarksR.filterShort(1);
+}
- olapToReadCoords(rdA,
- b3->ahang(), b3->bhang(),
- bgn, end);
- tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
- tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
- assert(tig3bgn < tig3end);
+void
+reportTigsCreated(Unitig *tig,
+ vector<breakPointCoords> &BP,
+ uint32 nTigs,
+ Unitig **newTigs,
+ uint32 *nRepeat,
+ uint32 *nUnique) {
+
+ for (uint32 ii=0; ii<BP.size(); ii++) {
+ int32 rgnbgn = BP[ii]._bgn;
+ int32 rgnend = BP[ii]._end;
+ bool repeat = BP[ii]._isRepeat;
+
+ if (nRepeat[ii] + nUnique[ii] == 0)
+ writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n",
+ tig->id(), (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]);
+
+ else if (nTigs > 1)
+ writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n",
+ tig->id(), (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id());
- if (tig3bgn < 0) tig3bgn = 0;
- if (tig3end > tig->getLength()) tig3end = tig->getLength();
- }
+ else
+ writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n",
+ tig->id(), (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id());
+ }
+}
- // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for
- // close overlaps on that end.
- uint32 len5 = 0;
- uint32 len3 = 0;
- if ((rMin < tig5bgn) &&
- (tig5end < rMax) &&
- (b5use))
- len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang());
- else
- b5use = false;
- if ((rMin < tig3bgn) &&
- (tig3end < rMax) &&
- (b3use))
- len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang());
- else
- b3use = false;
- double score5 = len5 * (1 - b5->erate());
- double score3 = len3 * (1 - b3->erate());
+void
+markRepeatReads(AssemblyGraph *AG,
+ TigVector &tigs,
+ double deviationRepeat,
+ uint32 confusedAbsolute,
+ double confusedPercent) {
+ uint32 tiLimit = tigs.size();
+ uint32 numThreads = omp_get_max_threads();
+ uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
- // Neither of the best edges are in the repeat region; move to the next region and/or read.
- if (len5 + len3 == 0)
- continue;
+ writeLog("repeatDetect()-- working on " F_U32 " tigs, with " F_U32 " thread%s.\n", tiLimit, numThreads, (numThreads == 1) ? "" : "s");
- // At least one of the best edge overlaps is in the repeat region. Scan for other edges
- // that are of comparable length and quality.
-
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen);
-
- for (uint32 oo=0; oo<ovlLen; oo++) {
- uint32 rdBid = ovl[oo].b_iid;
- uint32 tgBid = Unitig::fragIn(rdBid);
-
- // If the read is in a singleton, skip. These are unassembled crud.
- if ((tgBid == 0) ||
- (unitigs[tgBid] == NULL) ||
- (unitigs[tgBid]->ufpath.size() == 1))
- continue;
-
- // If the read is in an annotated bubble, skip.
- if (unitigs[tgBid]->_isBubble)
- continue;
-
- // Skip if this overlap is the best we're trying to match.
- if ((rdBid == b5->fragId()) ||
- (rdBid == b3->fragId()))
- continue;
-
- // Skip if this overlap is crappy quality
- if (OG->isOverlapBadQuality(ovl[oo]))
- continue;
-
- // Skip if the read is contained or suspicious.
- if ((OG->isContained(rdBid) == true) ||
- (OG->isSuspicious(rdBid) == true))
- continue;
-
- // Skip if the overlap isn't dovetail.
- bool ovl5 = ovl[oo].AEndIs5prime();
- bool ovl3 = ovl[oo].AEndIs3prime();
-
- if ((ovl5 == false) &&
- (ovl3 == false))
- continue;
-
- // Skip if we're not using this overlap
- if ((ovl5 == true) && (b5use == false))
- continue;
-
- if ((ovl3 == true) && (b3use == false))
- continue;
-
-
- uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid);
- ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos];
-
- bool rdBfwd = (rdB->position.bgn < rdB->position.end);
- int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
- int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
-
- // If the overlap is to a read in a different tig, or
- // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths.
- // Otherwise, the overlap is present in the tig, and can't be confused.
- if ((tgBid == tig->id()) &&
- (rdBlo <= rdAhi) &&
- (rdAlo <= rdBhi))
- continue;
-
- uint32 len = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
- double score = len * (1 - ovl[oo].erate);
-
- // Compute percent difference.
-
- double ad5 = fabs(score - score5);
- double ad3 = fabs(score - score3);
-
- double pd5 = 200 * ad5 / (score + score5);
- double pd3 = 200 * ad3 / (score + score3);
-
- // Skip if this overlap is vastly worse than the best.
-
- if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd3 > confusedPercent))) {
- writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
- tig->id(), rdAid, rdAlo, rdAhi,
- rdBid,
- b5->fragId(), len5, b5->erate(), score5,
- len, ovl[oo].erate, score,
- ad5, pd5);
- continue;
- }
-
- if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) {
- writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
- tig->id(), rdAid, rdAlo, rdAhi,
- rdBid,
- b3->fragId(), len3, b3->erate(), score3,
- len, ovl[oo].erate, score,
- ad3, pd3);
- continue;
- }
-
- // Potential confusion!
-
- if (ovl5 == true)
- writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
- tig->id(), rdAid, rdAlo, rdAhi,
- rdBid,
- b5->fragId(), len5, b5->erate(), score5,
- len, ovl[oo].erate, score,
- ad5, pd5);
-
- if (ovl3 == true)
- writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
- tig->id(), rdAid, rdAlo, rdAhi,
- rdBid,
- b3->fragId(), len3, b3->erate(), score3,
- len, ovl[oo].erate, score,
- ad3, pd3);
-
- isConfused[ri]++;
- }
- } // Over all marks (ri)
- } // Over all reads (fi)
+ vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords
+ intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads
+ intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR
- // Scan all the regions, and delete any that have no confusion.
- {
- bool discarded = false;
+ for (uint32 ti=0; ti<tiLimit; ti++) {
+ Unitig *tig = tigs[ti];
- for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
- if (isConfused[ri] == 0) {
- writeLog("discard region %8d:%-8d - no confusion in best edges\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri));
+ if (tig == NULL)
+ continue;
- tigMarksR.lo(ri) = 0;
- tigMarksR.hi(ri) = 0;
+ if (tig->ufpath.size() == 1)
+ continue;
- discarded = true;
- }
+ writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit);
- else {
- writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n",
- tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]);
- }
- }
+ // Clear out all the existing marks. They're not for this tig.
- if (discarded)
- tigMarksR.filterShort(1);
- }
- delete [] isConfused;
+ // Analyze overlaps for each read. For each overlap to a read not in this tig, or not
+ // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps.
+ repeatOlaps.clear();
+ uint32 fiLimit = tig->ufpath.size();
+ uint32 numThreads = omp_get_max_threads();
+ uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
+ annotateRepeatsOnRead(AG, tigs, tig, deviationRepeat, repeatOlaps);
+ writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size());
- // Scan reads, join any marks that have their junctions spanned by a sufficiently large amount.
- //
- // If the read spans this junction be the usual amount, merge the intervals.
- //
- // The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases. For this junction
- // to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri).
- //
- // We DO need to filterShort() after every merge, otherwise, we'd have an empty bogus interval
- // in the middle of our list, which could be preventing some other merge. OK, we could
- //
- // Anything that gets merged is now no longer a true repeat. It's unique, just bordered by repeats.
- // We can't track this through the indices (because we delete things). We track it with a set of
- // begin coordinates.
+ // Merge marks for the same read into the largest possible.
- set<int32> nonRepeatIntervals;
+ mergeAnnotations(repeatOlaps);
- writeLog("Scan reads to merge repeat regions.\n");
+ // Make a new set of intervals based on all the detected repeats.
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *frg = &tig->ufpath[fi];
- bool frgfwd = (frg->position.bgn < frg->position.end);
- int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end;
- int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn;
- bool merged = false;
+ tigMarksR.clear();
- for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) {
- uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
- uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
+ for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++)
+ tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn);
- if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) {
- writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n",
- tigMarksR.lo(ri-1), tigMarksR.hi(ri-1),
- tigMarksR.lo(ri), tigMarksR.hi(ri),
- frg->ident, frglo, frghi);
+ // Collapse all the read markings to intervals on the unitig, merging those that overlap
+ // significantly.
- tigMarksR.lo(ri) = tigMarksR.lo(ri-1);
+ tigMarksR.merge(REPEAT_OVERLAP_MIN);
- tigMarksR.lo(ri-1) = 0; // CRITICAL to delete this interval (and not ri) because the next
- tigMarksR.hi(ri-1) = 0; // iteration will be using ri-1 (== ri here) and ri (== ri+1).
+ // Scan reads, discard any mark that is contained in a read
+ //
+ // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than
+ // to track if it is needed.
- merged = true;
+ writeLog("Scan reads to discard spanned repeats.\n");
- nonRepeatIntervals.insert(tigMarksR.lo(ri));
- }
- }
+ discardSpannedRepeats(tig, tigMarksR);
- if (merged)
- tigMarksR.filterShort(1);
- }
+ // Run through again, looking for the thickest overlap(s) to the remaining regions.
+ // This doesn't account for the end effect noted above.
- // Extend the regions by MIN_ANCHOR_HANG. This makes checking for reads that span and are
- // anchored in the next region easier. It also solved a quirk when the first/last repeat
- // region doesn't extend to the end of the sequence:
- // 0-183 unique (created from inversion below, but useless and incorrect)
- // 183-9942 repeat
+ reportThickestEdgesInRepeats(tig, tigMarksR);
- for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) {
- tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0);
- tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength());
- }
+ // Scan reads. If a read intersects a repeat interval, and the best edge for that read
+ // is entirely in the repeat region, decide if there is a near-best edge to something
+ // not in this tig.
+ //
+ // A region with no such near-best edges is _probably_ correct.
- // Find the non-repeat intervals.
+ writeLog("search for confused edges:\n");
- tigMarksU = tigMarksR;
- tigMarksU.invert(0, tig->getLength());
+ discardUnambiguousRepeats(tigs, tig, tigMarksR, confusedAbsolute, confusedPercent);
- // Create the list of intervals we'll use to make new unitigs.
+
+ // Merge adjacent repeats.
+ //
+ // When we split (later), we require a MIN_ANCHOR_HANG overlap to anchor a read in a unique
+ // region. This is accomplished by extending the repeat regions on both ends. For regions
+ // close together, this could leave a negative length unique region between them:
//
- // The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of
- // these is moved here.
+ // ---[-----]--[-----]--- before
+ // -[--------[]--------]- after extending by MIN_ANCHOR_HANG (== two dashes)
//
- // The non-repeat intervals are shortened by the same amount, and any read that intersects one
- // is moved there.
+ // To solve this, earlier versions merged regions that were linked together by a single read
+ // (with sufficient overlaps to each). However, there was no maximum imposed on the distance
+ // between the repeats, so (in theory) a 150kbp read could attach two repeats to a 149kbp unique
+ // unitig -- and label that as a repeat. After the merges were completed, the regions were extended.
//
- // Does order matter? Not sure. The repeat intervals are first, then the formerly repeat
- // merged intervals, then the unique intervals. Splitting might depend on the repeats being
- // first.
+ // This version will extend regions first, then merge repeats only if they intersect. No need
+ // for a linking read.
+ //
+ // The extension also serves to clean up the edges of tigs, where the repeat doesn't quite
+ // extend to the end of the tig, leaving a few hundred bases of non-repeat.
- writeLog("Make breakpoints.\n");
+ mergeAdjacentRegions(tig, tigMarksR);
- vector<breakPointCoords> BP;
- for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++)
- if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0)
- BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true));
+ // Invert. This finds the non-repeat intervals, which get turned into non-repeat tigs.
+
+ tigMarksU = tigMarksR;
+ tigMarksU.invert(0, tig->getLength());
+
+ // Create the list of intervals we'll use to make new tigs.
+
+ vector<breakPointCoords> BP;
for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++)
- if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0)
- BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true));
+ BP.push_back(breakPointCoords(tigMarksR.lo(ii), tigMarksR.hi(ii), true));
- for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) {
- BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false));
- }
+ for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++)
+ BP.push_back(breakPointCoords(tigMarksU.lo(ii), tigMarksU.hi(ii), false));
- // If only one region, the whole unitig was declared repeat. Nothing to do.
+ // If there is only one BP, the tig is entirely resolved or entirely repeat. In either case,
+ // there is nothing more for us to do.
if (BP.size() == 1)
continue;
- sort(BP.begin(), BP.end());
-
// Report.
+ sort(BP.begin(), BP.end()); // Makes the report nice. Doesn't impact splitting.
+
writeLog("break tig %u into up to %u pieces:\n", ti, BP.size());
for (uint32 ii=0; ii<BP.size(); ii++)
writeLog(" %8d %8d %s (length %d)\n",
@@ -1088,32 +1067,16 @@ markRepeatReads(UnitigVector &unitigs,
// First call, count the number of tigs we would create if we let it create them.
- uint32 nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false);
+ uint32 nTigs = splitTig(tigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false);
// Second call, actually create the tigs, if anything would change.
if (nTigs > 1)
- splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true);
+ splitTig(tigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true);
// Report the tigs created.
- for (uint32 ii=0; ii<BP.size(); ii++) {
- int32 rgnbgn = BP[ii]._bgn;
- int32 rgnend = BP[ii]._end;
- bool repeat = BP[ii]._isRepeat;
-
- if (nRepeat[ii] + nUnique[ii] == 0)
- writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n",
- ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]);
-
- else if (nTigs > 1)
- writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n",
- ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id());
-
- else
- writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n",
- ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id());
- }
+ reportTigsCreated(tig, BP, nTigs, newTigs, nRepeat, nUnique);
// Cleanup.
@@ -1125,9 +1088,8 @@ markRepeatReads(UnitigVector &unitigs,
// Remove the old unitig....if we made new ones.
if (nTigs > 1) {
+ tigs[tig->id()] = NULL;
delete tig;
- unitigs[ti] = NULL;
}
}
}
-
diff --git a/src/bogart/AS_BAT_MarkRepeatReads.H b/src/bogart/AS_BAT_MarkRepeatReads.H
index 4e9e0f1..9f88900 100644
--- a/src/bogart/AS_BAT_MarkRepeatReads.H
+++ b/src/bogart/AS_BAT_MarkRepeatReads.H
@@ -26,12 +26,14 @@
#ifndef INCLUDE_AS_BAT_MARKREPEATREADS
#define INCLUDE_AS_BAT_MARKREPEATREADS
+#include "AS_BAT_TigVector.H"
void
-markRepeatReads(UnitigVector &unitigs,
- double deviationRepeat,
- uint32 confusedAbsolute,
- double confusedPercent);
+markRepeatReads(AssemblyGraph *AG,
+ TigVector &tigs,
+ double deviationRepeat,
+ uint32 confusedAbsolute,
+ double confusedPercent);
#endif // INCLUDE_AS_BAT_MARKREPEATREADS
diff --git a/src/bogart/AS_BAT_PopBubbles.C b/src/bogart/AS_BAT_MergeOrphans.C
similarity index 53%
rename from src/bogart/AS_BAT_PopBubbles.C
rename to src/bogart/AS_BAT_MergeOrphans.C
index 17f0951..c337166 100644
--- a/src/bogart/AS_BAT_PopBubbles.C
+++ b/src/bogart/AS_BAT_MergeOrphans.C
@@ -13,9 +13,13 @@
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
+ * This file is derived from:
+ *
+ * src/bogart/AS_BAT_PopBubbles.C
+ *
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-MAR-11
+ * Brian P. Walenz beginning on 2016-DEC-07
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,16 +27,18 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
#include "AS_BAT_Instrumentation.H"
+#include "AS_BAT_MergeOrphans.H"
+
#include "intervalList.H"
#include <vector>
@@ -47,16 +53,12 @@ using namespace std;
class candidatePop {
public:
- candidatePop() {
- };
candidatePop(Unitig *bubble_, Unitig *target_, uint32 bgn_, uint32 end_) {
bubble = bubble_;
target = target_;
bgn = bgn_;
end = end_;
};
- ~candidatePop() {
- };
Unitig *bubble;
Unitig *target;
@@ -68,26 +70,27 @@ public:
};
-// A list of the target unitigs that a bubble could be popped into.
+// A list of the target tigs that a bubble could be popped into.
typedef map<uint32, vector<uint32> > BubTargetList;
-// Decide which unitigs can be bubbles. The first pass finds unitigs that can be potential
+// Decide which tigs can be bubbles. The first pass finds tigs that can be potential
// bubbles. Any unitig where every dovetail read has an overlap to some other unitig is a
// candidate for bubble popping.
void
-findPotentialBubbles(UnitigVector &unitigs,
+findPotentialBubbles(TigVector &tigs,
BubTargetList &potentialBubbles) {
- uint32 tiLimit = unitigs.size();
+ uint32 tiLimit = tigs.size();
uint32 tiNumThreads = omp_get_max_threads();
uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
- writeLog("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);
+ writeStatus("\n");
+ writeStatus("bubbleDetect()-- working on " F_U32 " tigs, with " F_U32 " thread%s.\n", tiLimit, tiNumThreads, (tiNumThreads == 1) ? "" : "s");
for (uint32 ti=0; ti<tiLimit; ti++) {
- Unitig *tig = unitigs[ti];
+ Unitig *tig = tigs[ti];
if ((tig == NULL) || // Not a tig, ignore it.
(tig->ufpath.size() == 1)) // Singleton, handled elsewhere.
@@ -111,13 +114,13 @@ findPotentialBubbles(UnitigVector &unitigs,
nonContainedReads++;
uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);
+ BAToverlap *ovl = OC->getOverlaps(rid, ovlLen);
set<uint32> readOlapsTo;
for (uint32 oi=0; oi<ovlLen; oi++) {
- uint32 ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
- Unitig *ovlTig = unitigs[ovlTigID];
+ uint32 ovlTigID = tigs.inUnitig(ovl[oi].b_iid);
+ Unitig *ovlTig = tigs[ovlTigID];
// Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
// or to a unitig that is shorter than us. We can not pop this tig as a bubble
@@ -195,7 +198,7 @@ findPotentialBubbles(UnitigVector &unitigs,
for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
- Unitig *dest = unitigs[it->first];
+ Unitig *dest = tigs[it->first];
writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());
@@ -214,10 +217,10 @@ findPotentialBubbles(UnitigVector &unitigs,
// Find filtered placements for all the reads in the potential bubble tigs.
vector<overlapPlacement> *
-findBubbleReadPlacements(UnitigVector &unitigs,
+findBubbleReadPlacements(TigVector &tigs,
BubTargetList &potentialBubbles,
double deviationBubble) {
- uint32 fiLimit = FI->numFragments();
+ uint32 fiLimit = RI->numReads();
uint32 fiNumThreads = omp_get_max_threads();
uint32 fiBlockSize = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;
@@ -225,21 +228,23 @@ findBubbleReadPlacements(UnitigVector &unitigs,
#pragma omp parallel for schedule(dynamic, fiBlockSize)
for (uint32 fi=0; fi<fiLimit; fi++) {
- uint32 rdAtigID = Unitig::fragIn(fi);
+ uint32 rdAtigID = tigs.inUnitig(fi);
if ((rdAtigID == 0) || // Read not placed in a tig, ignore it.
(OG->isContained(fi)) || // Read is contained, ignore it.
(potentialBubbles.count(rdAtigID) == 0)) // Read isn't in a potential bubble, ignore it.
continue;
- Unitig *rdAtig = unitigs[rdAtigID];
- ufNode *rdA = &rdAtig->ufpath[ Unitig::pathPosition(fi) ];
+ Unitig *rdAtig = tigs[rdAtigID];
+ ufNode *rdA = &rdAtig->ufpath[ tigs.ufpathIdx(fi) ];
bool rdAfwd = (rdA->position.bgn < rdA->position.end);
int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
+ // A read is terminal when it is the first or last node in its tig's ufpath. 'fi' is a read
+ // ID, not a position in the tig, so test the read's ufpath index rather than the ID itself.
+ bool isEnd = (tigs.ufpathIdx(fi) == 0) || (tigs.ufpathIdx(fi) + 1 == rdAtig->ufpath.size());
+
uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);
+ BAToverlap *ovl = OC->getOverlaps(rdA->ident, ovlLen);
set<uint32> intersections;
@@ -251,41 +256,42 @@ findBubbleReadPlacements(UnitigVector &unitigs,
vector<overlapPlacement> placements;
- placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rdA->ident, placements);
+ placeReadUsingOverlaps(tigs, NULL, rdA->ident, placements, placeRead_noExtend);
// Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality. Or are to ourself!
for (uint32 pi=0; pi<placements.size(); pi++) {
uint32 rdBtigID = placements[pi].tigID;
- Unitig *rdBtig = unitigs[rdBtigID];
+ Unitig *rdBtig = tigs[rdBtigID];
- uint32 lo = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.bgn : placements[pi].position.end;
- uint32 hi = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.end : placements[pi].position.bgn;
+ uint32 lo = placements[pi].position.min();
+ uint32 hi = placements[pi].position.max();
double erate = placements[pi].errors / placements[pi].aligned;
// Ignore the placement if it is to ourself.
- if (rdAtigID == rdBtigID) {
- //writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - SAME TIG\n",
- // rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
+ if (rdAtigID == rdBtigID)
continue;
- }
- // Ignore the placement if it is to a non-tig / singleton read, or if it didn't place the
- // read fully.
+ // Ignore the placement if it is to a non-tig or a singleton read.
if ((rdBtigID == 0) ||
(rdBtig == NULL) ||
- (rdBtig->ufpath.size() == 1) ||
+ (rdBtig->ufpath.size() == 1))
+ continue;
+
+ // Ignore the placement if it is partial and not a terminal read.
+
+ if ((isEnd == false) &&
(placements[pi].fCoverage < 0.99)) {
if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
+ writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
continue;
}
- // Ignore the placement if it isn't to one of our bubble-popping candidate unitigs.
+ // Ignore the placement if it isn't to one of our bubble-popping candidate tigs.
bool dontcare = true;
vector<uint32> &pbubbles = potentialBubbles[rdAtigID];
@@ -297,7 +303,7 @@ findBubbleReadPlacements(UnitigVector &unitigs,
if (dontcare) {
if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
+ writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
continue;
}
@@ -306,7 +312,7 @@ findBubbleReadPlacements(UnitigVector &unitigs,
if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
+ writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
continue;
}
@@ -314,7 +320,7 @@ findBubbleReadPlacements(UnitigVector &unitigs,
// Good placement!
if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f)\n",
+ writeLog("tig %6u read %8u -> tig %6u %6u reads at %8u-%-8u (cov %7.5f erate %6.4f)\n",
rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
placed[fi].push_back(placements[pi]);
@@ -329,126 +335,281 @@ findBubbleReadPlacements(UnitigVector &unitigs,
-// Bubble popping cannot be done in parallel -- there is a race condition when both unitigs
+// Bubble popping cannot be done in parallel -- there is a race condition when both tigs
// A and B are considering merging in unitig C.
void
-popBubbles(UnitigVector &unitigs,
- double deviationBubble) {
+mergeOrphans(TigVector &tigs,
+ double deviationBubble) {
BubTargetList potentialBubbles;
- findPotentialBubbles(unitigs, potentialBubbles);
+ findPotentialBubbles(tigs, potentialBubbles);
+
+ writeStatus("mergeOrphans()-- Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());
+
+ //if (potentialBubbles.size() == 0)
+ // return;
writeLog("\n");
- writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
+ writeLog("Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());
writeLog("\n");
- vector<overlapPlacement> *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);
+ vector<overlapPlacement> *placed = findBubbleReadPlacements(tigs, potentialBubbles, deviationBubble);
// We now have, in 'placed', a list of all the places that each read could be placed. Decide if there is a _single_
// place for each bubble to be popped.
- uint32 tiLimit = unitigs.size();
+ uint32 tiLimit = tigs.size();
//uint32 tiNumThreads = omp_get_max_threads();
//uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
// Clear flags.
for (uint32 ti=0; ti<tiLimit; ti++) {
- if (unitigs[ti]) {
- unitigs[ti]->_isBubble = false;
- unitigs[ti]->_isRepeat = false;
+ if (tigs[ti]) {
+ tigs[ti]->_isBubble = false;
+ tigs[ti]->_isRepeat = false;
}
}
+ uint32 nUniqOrphan = 0;
+ uint32 nReptOrphan = 0;
+ uint32 nUniqBubble = 0;
+ uint32 nReptBubble = 0;
+
// In parallel, process the placements.
for (uint32 ti=0; ti<tiLimit; ti++) {
if (potentialBubbles.count(ti) == 0) // Not a potential bubble
continue;
+ writeLog("\n");
+
+ // Save some interesting bits about our bubble.
+
+ Unitig *bubble = tigs[ti];
+ uint32 bubbleLen = bubble->getLength();
+ uint32 nReads = bubble->ufpath.size();
+
+ ufNode &fRead = bubble->ufpath.front();
+ ufNode &lRead = bubble->ufpath.back();
+
+ uint32 fReadID = fRead.ident; // Ident of the first read
+ uint32 lReadID = lRead.ident;
+
+ bool bubbleInnie = (fRead.position.isForward() && lRead.position.isReverse());
+ bool bubbleOuttie = (fRead.position.isReverse() && lRead.position.isForward());
+ bool bubbleFwd = (fRead.position.isForward() && lRead.position.isForward());
+ bool bubbleRev = (fRead.position.isReverse() && lRead.position.isReverse());
+
// Scan the bubble, decide if there are _ANY_ read placements. Log appropriately.
- Unitig *bubble = unitigs[ti];
- bool hasPlacements = false;
+ bool failedToPlaceAnchor = false;
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
- uint32 readID = bubble->ufpath[fi].ident;
+ {
+ char placedS[128];
+
+ char placed0 = ((nReads > 0) && (placed[ bubble->ufpath[ 0 ].ident ].size() > 0)) ? 't' : '-';
+ char placed1 = ((nReads > 1) && (placed[ bubble->ufpath[ 1 ].ident ].size() > 0)) ? 't' : '-';
+ char placedb = ((nReads > 1) && (placed[ bubble->ufpath[ nReads-2 ].ident ].size() > 0)) ? 't' : '-';
+ char placeda = ((nReads > 0) && (placed[ bubble->ufpath[ nReads-1 ].ident ].size() > 0)) ? 't' : '-';
- if (placed[readID].size() > 0)
- hasPlacements = true;
+ uint32 placedN = 0;
+
+ if (nReads > 3)
+ for (uint32 fi=2; fi<nReads-2; fi++)
+ if (placed[bubble->ufpath[fi].ident].size() > 0)
+ placedN++;
+
+ switch (nReads) {
+ case 0:
+ assert(0);
+ break;
+
+ case 1:
+ snprintf(placedS, 128, "%c", placed0);
+ break;
+
+ case 2:
+ snprintf(placedS, 128, "%c%c", placed0, placeda);
+ break;
+
+ case 3:
+ snprintf(placedS, 128, "%c%c%c", placed0, placed1, placeda);
+ break;
+
+ case 4:
+ snprintf(placedS, 128, "%c%c%c%c", placed0, placed1, placedb, placeda);
+ break;
+
+ default:
+ snprintf(placedS, 128, "%c%c[%u]%c%c",
+ placed0, placed1, placedN, placedb, placeda);
+ break;
+ }
+
+ failedToPlaceAnchor = ((placed0 != 't') || (placeda != 't'));
+
+ writeLog("potential bubble tig %8u (reads %5u length %8u) - placed %s%s\n",
+ bubble->id(), nReads, bubbleLen, placedS, failedToPlaceAnchor ? " FAILED" : "");
}
- if (hasPlacements == false)
- writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
- else
- writeLog("potential bubble %u\n", ti);
+ if (failedToPlaceAnchor)
+ continue;
+
// Split the placements into piles for each target and build an interval list for each target.
// For each read in the tig, convert the vector of placements into interval lists, one list per target tig.
map<uint32, intervalList<uint32> *> targetIntervals;
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
- uint32 readID = bubble->ufpath[fi].ident;
+ // Add extended intervals for the first read.
- for (uint32 pp=0; pp<placed[readID].size(); pp++) {
- uint32 tid = placed[readID][pp].tigID;
+ for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
+ uint32 tid = placed[fReadID][pp].tigID;
+ uint32 bgn = placed[fReadID][pp].position.min();
- assert(placed[readID][pp].frgID > 0);
+ if (targetIntervals[tid] == NULL)
+ targetIntervals[tid] = new intervalList<uint32>;
- uint32 bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
- uint32 end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;
+ targetIntervals[tid]->add(bgn, bubbleLen); // Don't care if it goes off the high end of the tig.
+ }
- if (targetIntervals[tid] == NULL)
- targetIntervals[tid] = new intervalList<uint32>;
+ // Add extended intervals for the last read.
- //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);
+ for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
+ uint32 tid = placed[lReadID][pp].tigID;
+ uint32 end = placed[lReadID][pp].position.max();
- targetIntervals[tid]->add(bgn, end-bgn);
- }
+ if (targetIntervals[tid] == NULL)
+ targetIntervals[tid] = new intervalList<uint32>;
+
+ if (end < bubbleLen)
+ targetIntervals[tid]->add(0, end); // Careful! Negative will underflow!
+ else
+ targetIntervals[tid]->add(end - bubbleLen, bubbleLen);
}
- vector<candidatePop *> targets;
+ // For each destination tig:
+ // merge the intervals
+ // for each interval
+ // find which bubble first/last reads map to each interval
+ // ignore if the extent of first/last is too big or small
+ // save otherwise
- // Squish the intervals. Create new candidatePops for each interval that isn't too big or
- // small. Assign each overlapPlacements to the correct candidatePop.
+ vector<candidatePop *> targets;
for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
uint32 targetID = it->first;
intervalList<uint32> *IL = it->second;
+ // Merge.
+
IL->merge();
- // Discard intervals that are significantly too small or large. Save the ones that are
- // nicely sized. Logging here isn't terribly useful, it's just repeated (out of order) later
- // when we try to make sense of the read alignments.
+ // Figure out if each interval has both the first and last read of some bubble, and if those
+ // are properly sized.
for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
- if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) || // Too small!
- (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) { // Too big!
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
+ bool noFirst = true;
+ bool noLast = true;
+
+ uint32 intBgn = IL->lo(ii);
+ uint32 intEnd = IL->hi(ii);
+
+ SeqInterval fPos;
+ SeqInterval lPos;
+
+ for (uint32 pp=0; pp<placed[fReadID].size(); pp++) {
+ fPos = placed[fReadID][pp].position;
+
+ if ((targetID == placed[fReadID][pp].tigID) &&
+ (intBgn <= fPos.min()) && (fPos.max() <= intEnd)) {
+ noFirst = false;
+ break;
+ }
+ }
+
+ for (uint32 pp=0; pp<placed[lReadID].size(); pp++) {
+ lPos = placed[lReadID][pp].position;
+
+ if ((targetID == placed[lReadID][pp].tigID) &&
+ (intBgn <= lPos.min()) && (lPos.max() <= intEnd)) {
+ noLast = false;
+ break;
+ }
+ }
+
+ // Ignore if missing either read.
+
+ if ((noFirst == true) ||
+ (noLast == true)) {
+ writeLog("potential bubble tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - MISSING %s%s%s READ%s\n",
bubble->id(), bubble->getLength(),
- targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
+ targetID, intBgn, intEnd, intEnd - intBgn,
+ (noFirst) ? "FIRST" : "",
+ (noFirst && noLast) ? " and " : "",
+ (noLast) ? "LAST" : "",
+ (noFirst && noLast) ? "S" : "");
continue;
}
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
+ writeLog("potential bubble tig %8u (length %8u) - target %8u %8u-%-8u (length %8u) - %8u-%-8u %8u-%-8u\n",
bubble->id(), bubble->getLength(),
- targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
+ targetID, intBgn, intEnd, intEnd - intBgn,
+ fPos.min(), fPos.max(),
+ lPos.min(), lPos.max());
- targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
- }
- delete IL;
- }
+ // Ignore if the reads align in inconsistent orientations.
+
+#if 0
+ bool alignFwd = (fPos.min() < lPos.max()) ? true : false;
+ bool fPosFwd = fPos.isForward();
+ bool lPosFwd = lPos.isForward();
+
+ bool alignInnie = (alignFwd == true) ? ((fPosFwd == true) && (lPosFwd == false)) : ((fPosFwd == false) && (lPosFwd == true));
+ bool alignOuttie = false;
+ bool alignFwd = false;
+ bool alignRev = false;
+
+ bool alignInnie = (alignFwd && fPosFwd && !rPosFwd);
+
+
+ //if ((bubbleInnie == true) &&
+ //if ((bubbleOuttie == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
+ //if ((bubbleFwd == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
+ //if ((bubbleRev == true) && ((alignFwd == true) || (fPosFwd == true) || (rPosFwd == false)));
+#endif
+
+ // Ignore if the region is too small or too big.
+
+ uint32 regionMin = min(fPos.min(), lPos.min());
+ uint32 regionMax = max(fPos.max(), lPos.max());
+
+ if ((regionMax - regionMin < 0.75 * bubbleLen) ||
+ (regionMax - regionMin > 1.25 * bubbleLen))
+ continue;
+
+ // Both reads placed, and at about the right size. We probably should be checking orientation. Maybe tomorrow.
+
+ targets.push_back(new candidatePop(bubble, tigs[targetID], regionMin, regionMax));
+ } // Over all intervals for this target
+ } // Over all targets
+
+ // Done with the targetIntervals. Clean up.
+
+ for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
+ delete it->second;
targetIntervals.clear();
// If no targets, nothing to do.
- if (targets.size() == 0)
+ if (targets.size() == 0) {
+ writeLog("potential bubble tig %8u - generated no targets\n", ti);
continue;
+ }
// Run through the placements again, and assign them to the correct target.
//
@@ -457,14 +618,14 @@ popBubbles(UnitigVector &unitigs,
// For each target location:
// If the placement is for this target, save it.
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
+ for (uint32 fi=0; fi<nReads; fi++) {
uint32 readID = bubble->ufpath[fi].ident;
for (uint32 pp=0; pp<placed[readID].size(); pp++) {
uint32 tid = placed[readID][pp].tigID;
- uint32 bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
- uint32 end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;
+ uint32 bgn = placed[readID][pp].position.min();
+ uint32 end = placed[readID][pp].position.max();
for (uint32 tt=0; tt<targets.size(); tt++)
if ((targets[tt]->target->id() == tid) &&
@@ -499,7 +660,7 @@ popBubbles(UnitigVector &unitigs,
if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
- writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
+ writeLog("duplicate read alignment for tig %u read %u - better %u-%-u %.4f - worse %u-%-u %.4f\n",
t->placed[aa].tigID, t->placed[aa].frgID,
t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
@@ -507,7 +668,7 @@ popBubbles(UnitigVector &unitigs,
t->placed[bb] = overlapPlacement();
} else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
- writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
+ writeLog("duplicate read alignment for tig %u read %u - better %u-%-u %.4f - worse %u-%-u %.4f\n",
t->placed[aa].tigID, t->placed[aa].frgID,
t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
@@ -527,11 +688,13 @@ popBubbles(UnitigVector &unitigs,
}
}
- // Make a set of the reads in the bubble. We'll compare each target against this to decide if all reads are placed.
+ // Make a set of the reads in the bubble.
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
+ for (uint32 fi=0; fi<nReads; fi++)
tigReads.insert(bubble->ufpath[fi].ident);
+ // Compare the bubble against each target.
+
uint32 nOrphan = 0; // Full coverage; bubble can be popped.
uint32 orphanTarget = 0;
@@ -543,7 +706,7 @@ popBubbles(UnitigVector &unitigs,
for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
if (logFileFlagSet(LOG_BUBBLE_DETAIL))
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
+ writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - read %7u at %9u-%-9u\n",
bubble->id(), bubble->getLength(),
targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
targets[tt]->placed[op].frgID,
@@ -563,29 +726,29 @@ popBubbles(UnitigVector &unitigs,
uint32 n3 = 0;
uint32 nt = 0;
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
+ for (uint32 fi=0; fi<nReads; fi++)
if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
n5++;
else
break;
- for (uint32 fi=bubble->ufpath.size(); fi-->0; )
+ for (uint32 fi=nReads; fi-->0; )
if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
n3++;
else
break;
- for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
+ for (uint32 fi=0; fi<nReads; fi++)
if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
nt++;
// Report now, before we nuke targets[tt] for being not a bubble!
- if ((nt == bubble->ufpath.size()) ||
+ if ((nt == nReads) ||
((n5 > 0) && (n3 > 0)))
- writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads. n5=%3u n3=%3u nt=%3u\n",
+ writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads. n5=%3u n3=%3u nt=%3u\n",
bubble->id(), bubble->getLength(),
targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
tigReads.size(),
@@ -593,7 +756,7 @@ popBubbles(UnitigVector &unitigs,
// Decide if this is a bubble, orphan from construction, or repeat.
- if (nt == bubble->ufpath.size()) {
+ if (nt == nReads) {
nOrphan++;
orphanTarget = tt;
}
@@ -604,71 +767,149 @@ popBubbles(UnitigVector &unitigs,
}
}
- // If no placements, pbbbt.
+ // If no placements, pbbbt, not a whole lot we can do here. Leave it as is. It's not even
+ // worth logging (there are many of these).
if (nOrphan + nBubble == 0) {
- //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
- continue;
}
- // If multiple orphan and/or bubble placements, it's a repeat.
+ // If not an orphan, mark it as a bubble. If multiple bubble placements, mark it as a repeat
+ // so we can use it in repeat detection.
+ //
+ // If there are orphan placements also, those placements are superior to the bubble placements,
+ // and we'll place the orphan.
+
+ else if (nOrphan == 0) {
+ if (nBubble == 1) {
+ nUniqBubble++;
+ writeStatus("mergeOrphans()-- tig %8u BUBBLE -> tig %8u\n",
+ bubble->id(),
+ targets[bubbleTarget]->target->id());
+ } else {
+ nReptBubble++;
+ writeStatus("mergeOrphans()-- tig %8u BUBBLE -> repeat\n",
+ bubble->id());
+ }
- if (nOrphan + nBubble > 1) {
- writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
- bubble->id(), bubble->getLength(), bubble->ufpath.size(),
- nOrphan, nBubble);
+ writeLog("tig %8u length %8u reads %6u - %s.\n",
+ bubble->id(), bubble->getLength(), nReads,
+ (nBubble == 1) ? "bubble" : "bubble-repeat");
writeLog("\n");
- bubble->_isRepeat = true;
- continue;
+
+ bubble->_isRepeat = (nBubble > 1);
+ bubble->_isBubble = true;
}
- // If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.
+ // If a unique orphan placement, place it there. The destination is targets[orphanTarget];
+ // bubbleTarget refers to a (possibly unrelated) bubble placement and must not be reported here.
+
+ else if (nOrphan == 1) {
+ nUniqOrphan++;
+ writeStatus("mergeOrphans()-- tig %8u ORPHAN -> tig %8u\n",
+ bubble->id(),
+ targets[orphanTarget]->target->id());
+
+ writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), nReads);
+
+ for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
+ ufNode frg;
+
+ frg.ident = targets[tt]->placed[op].frgID;
+ frg.contained = 0;
+ frg.parent = 0;
+ frg.ahang = 0;
+ frg.bhang = 0;
+ frg.position.bgn = targets[tt]->placed[op].position.bgn;
+ frg.position.end = targets[tt]->placed[op].position.end;
+
+ writeLog("move read %u from tig %u to tig %u %u-%-u\n",
+ frg.ident,
+ bubble->id(),
+ targets[tt]->target->id(), frg.position.bgn, frg.position.end);
+
+ targets[tt]->target->addRead(frg, 0, false);
+ }
- if (nBubble > 0) {
- writeLog("tig %8u length %8u reads %6u - bubble\n",
- bubble->id(), bubble->getLength(), bubble->ufpath.size());
writeLog("\n");
- bubble->_isBubble = true;
- continue;
+
+ tigs[bubble->id()] = NULL;
+ delete bubble;
}
- // Otherwise, it's an orphan, move the reads to the proper place.
+ // Otherwise, there are multiple orphan placements. We can't distinguish between them, and
+ // instead just place reads where they individually decide to go (each read goes to its
+ // lowest error rate placement). These are counted as repeat orphans, not repeat bubbles.
+
+ else {
+ nReptOrphan++;
+ writeStatus("mergeOrphans()-- tig %8u ORPHAN -> multiple tigs\n",
+ bubble->id());
+
+ writeLog("tig %8u length %8u reads %6u - orphan with multiple placements\n", bubble->id(), bubble->getLength(), nReads);
+
+ for (uint32 fi=0; fi<nReads; fi++) {
+ uint32 rr = bubble->ufpath[fi].ident;
+ double er = 1.00;
+ uint32 bb = 0;
- writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
+ for (uint32 pp=0; pp<placed[rr].size(); pp++) {
+ double erate = placed[rr][pp].errors / placed[rr][pp].aligned;
- for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
- ufNode frg;
+ if (erate < er) {
+ er = erate;
+ bb = pp;
+ }
+ }
+
+ ufNode frg;
+
+ frg.ident = placed[rr][bb].frgID;
+ frg.contained = 0;
+ frg.parent = 0;
+ frg.ahang = 0;
+ frg.bhang = 0;
+ frg.position.bgn = placed[rr][bb].position.bgn;
+ frg.position.end = placed[rr][bb].position.end;
+
+ Unitig *target = tigs[placed[rr][bb].tigID];
+
+ writeLog("move read %u from tig %u to tig %u %u-%-u\n",
+ frg.ident,
+ bubble->id(),
+ target->id(), frg.position.bgn, frg.position.end);
+
+ target->addRead(frg, 0, false);
+ }
+
+ writeLog("\n");
- frg.ident = targets[tt]->placed[op].frgID;
- frg.contained = 0;
- frg.parent = 0;
- frg.ahang = 0;
- frg.bhang = 0;
- frg.position.bgn = targets[tt]->placed[op].position.bgn;
- frg.position.end = targets[tt]->placed[op].position.end;
+ tigs[bubble->id()] = NULL;
+ delete bubble;
+ }
- writeLog("move read %u from tig %u to tig %u %u-%u\n",
- frg.ident,
- bubble->id(),
- targets[tt]->target->id(), frg.position.bgn, frg.position.end);
+ // Clean up the targets list.
- targets[tt]->target->addFrag(frg, 0, false);
+ for (uint32 tt=0; tt<targets.size(); tt++) {
+ delete targets[tt];
+ targets[tt] = NULL;
}
- writeLog("\n");
+ targets.clear();
- unitigs[bubble->id()] = NULL;
- delete bubble;
} // Over all bubbles
writeLog("\n"); // Needed if no bubbles are popped.
+ writeStatus("mergeOrphans()-- placed %5u unique orphan tigs\n", nUniqOrphan);
+ writeStatus("mergeOrphans()-- shattered %5u repeat orphan tigs\n", nReptOrphan);
+ writeStatus("mergeOrphans()-- marked %5u unique bubble tigs\n", nUniqBubble);
+ writeStatus("mergeOrphans()-- marked %5u repeat bubble tigs\n", nReptBubble);
+
delete [] placed;
// Sort reads in all the tigs. Overkill, but correct.
for (uint32 ti=0; ti<tiLimit; ti++) {
- Unitig *tig = unitigs[ti];
+ Unitig *tig = tigs[ti];
if ((tig == NULL) || // Not a tig, ignore it.
(tig->ufpath.size() == 1)) // Singleton, already sorted.
diff --git a/src/bogart/AS_BAT_PopBubbles.H b/src/bogart/AS_BAT_MergeOrphans.H
similarity index 84%
copy from src/bogart/AS_BAT_PopBubbles.H
copy to src/bogart/AS_BAT_MergeOrphans.H
index 8ab8db6..da8c37e 100644
--- a/src/bogart/AS_BAT_PopBubbles.H
+++ b/src/bogart/AS_BAT_MergeOrphans.H
@@ -23,16 +23,16 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_BUBBLEPOPPING
-#define INCLUDE_AS_BAT_BUBBLEPOPPING
+#ifndef INCLUDE_AS_BAT_MERGEORPHANS
+#define INCLUDE_AS_BAT_MERGEORPHANS
#include "AS_global.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Unitig.H"
void
-popBubbles(UnitigVector &unitigs,
- double deviationBubble);
+mergeOrphans(TigVector &tigs,
+ double deviationBubble);
-#endif // INCLUDE_AS_BAT_BUBBLEPOPPING
+#endif // INCLUDE_AS_BAT_MERGEORPHANS
diff --git a/src/bogart/AS_BAT_MergeUnitigs.C b/src/bogart/AS_BAT_MergeUnitigs.C
deleted file mode 100644
index e513550..0000000
--- a/src/bogart/AS_BAT_MergeUnitigs.C
+++ /dev/null
@@ -1,246 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Brian P. Walenz beginning on 2016-MAY-17
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_BAT_FragmentInfo.H"
-#include "AS_BAT_OverlapCache.H"
-#include "AS_BAT_BestOverlapGraph.H"
-#include "AS_BAT_Logging.H"
-
-#include "AS_BAT_Unitig.H"
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
-
-#include "intervalList.H"
-#include "stddev.H"
-
-#include <vector>
-
-using namespace std;
-
-
-
-
-void
-mergeUnitigs_findPlacements(UnitigVector &unitigs,
- ufNode *rd,
- double deviation,
- vector<overlapPlacement> &validPlacements) {
- vector<overlapPlacement> placements;
-
- placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rd->ident, placements);
-
- for (uint32 pi=0; pi<placements.size(); pi++) {
- Unitig *tig = unitigs[placements[pi].tigID];
-
- uint32 bgn = placements[pi].position.min();
- uint32 end = placements[pi].position.max();
-
- double erate = placements[pi].errors / placements[pi].aligned;
-
- if ((rd->position.min() < end) && (bgn < rd->position.max())) // Ignore placements to the same place
- continue;
-
- if ((placements[pi].fCoverage < 0.99) || // Ignore partially placed reads.
- (tig->ufpath.size() == 1)) { // Ignore placements in singletons.
- //writeLog("read %8u tig %6u (%8u-%8u) placed -- tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - LOW COV or SINGLETON\n",
- // rd->ident, Unitig::fragIn(rd->ident), rd->position.bgn, rd->position.end,
- // placements[pi].tigID, tig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
- continue;
- }
-
- if (tig->overlapConsistentWithTig(deviation, bgn, end, erate) < 0.5) {
- //if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
- // writeLog("read %8u tig %6u (%8u-%8u) placed -- tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
- // rd->ident, Unitig::fragIn(rd->ident), rd->position.bgn, rd->position.end,
- // placements[pi].tigID, tig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
- continue;
- }
-
- writeLog("read %8u tig %6u (%8u-%8u) placed -- tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
- rd->ident, Unitig::fragIn(rd->ident), rd->position.bgn, rd->position.end,
- placements[pi].tigID, tig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
-
- validPlacements.push_back(placements[pi]);
- }
-}
-
-
-
-
-
-void
-mergeUnitigs(UnitigVector &unitigs,
- double deviation,
- bool findCircularTigs) {
-
-
- // For every tig, decide if it can merge, end-to-end, with some other tig. This operation
- // should occur before bubbles are popped (so that whatever we chop off can be popped as a
- // bubble) and repeats are split (so that whatever we join can be split if it's not supported).
- //
- // The basic idea is that the end read on each tig should align to the middle of the other tig.
- // If the reads between those also align, we can merge. If they do not align, we should split
- // off one end and join. The split off end is either a bubble, or we made bad joins and will
- // end up with four pieces after repeat breaking.
- //
- // -----------------------------------
- // ^^^^ ----
- // |||| ||||
- // ---- vvvv
- // ----------------------------------
- //
- // This is the same basic operation as for finding circular tigs, and those
- // are found too. However, this should occur after bubbles and repeats.
-
-
-
-
- // Step 1: For every end read, place it. Save only placements that are full-length and
- // compatible with the destination tig.
-
- vector<overlapPlacement> validPlacements;
-
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
-
- if ((tig == NULL) ||
- (tig->getNumFrags() < 2) ||
- (tig->_isUnassembled == true))
- continue;
-
- ufNode *f = tig->firstRead();
- ufNode *l = tig->lastRead();
-
- if (f == l)
- continue;
-
- mergeUnitigs_findPlacements(unitigs, f, deviation, validPlacements);
- mergeUnitigs_findPlacements(unitigs, l, deviation, validPlacements);
- }
-
- writeLog("Found "F_SIZE_T" valid placements of end reads.\n", validPlacements.size());
-
- // Step 2: Find pairs of placements between two tigs.
-
- vector<pair<uint32, uint32> > potentialCircles;
- vector<pair<uint32, uint32> > potentialMerges;
-
- for (uint32 pa=0; pa<validPlacements.size(); pa++) {
- uint32 paSrcTigID = Unitig::fragIn(validPlacements[pa].frgID);
- uint32 paDstTigID = validPlacements[pa].tigID;
-
- for (uint32 pb=pa+1; pb<validPlacements.size(); pb++) {
- uint32 pbSrcTigID = Unitig::fragIn(validPlacements[pb].frgID);
- uint32 pbDstTigID = validPlacements[pb].tigID;
-
- if (validPlacements[pa].frgID == validPlacements[pb].frgID) // Whatever we're trying, we can't use the same read twice.
- continue;
-
- if ((paDstTigID == paSrcTigID) && // pa placed in same tig as it came from
- (pbDstTigID == pbSrcTigID) && // pb placed in same tig as it came from
- (paSrcTigID == pbSrcTigID)) { // and both placed in same tig
- potentialCircles.push_back(pair<uint32,uint32>(pa, pb));
- continue;
- }
-
- if ((paDstTigID == pbSrcTigID) && // pa placed in same tig as pb came from
- (pbDstTigID == paSrcTigID)) { // pb placed in same tig as pa came from
- potentialMerges.push_back(pair<uint32,uint32>(pa, pb));
- continue;
- }
- }
- }
-
- writeLog("Found "F_SIZE_T" potential circular tigs.\n", potentialCircles.size());
- writeLog("Found "F_SIZE_T" potential joins.\n", potentialMerges.size());
-
-
- // Step 3: For the potential circles, each read needs to be placed with the same orientation as
- // its source, and the distance between (paSrc,pbDst) and (pbSrc,paDst) needs to be
- // (approximately) the same. Then, we should really check the reads between those two points.
-
- for (uint32 pc=0; pc<potentialCircles.size(); pc++) {
- uint32 pa = potentialCircles[pc].first;
- uint32 pb = potentialCircles[pc].second;
-
- uint32 paReadID = validPlacements[pa].frgID;
- uint32 pbReadID = validPlacements[pb].frgID;
-
- uint32 tigID = validPlacements[pa].tigID; // All reads placed in the same tig, see above.
- Unitig *tig = unitigs[tigID];
-
- ufNode *paRead = &tig->ufpath[Unitig::pathPosition(paReadID)];
- bool paSrcFwd = paRead->position.isForward();
- bool paDstFwd = validPlacements[pa].position.isForward();
-
- ufNode *pbRead = &tig->ufpath[Unitig::pathPosition(pbReadID)];
- bool pbSrcFwd = pbRead->position.isForward();
- bool pbDstFwd = validPlacements[pb].position.isForward();
-
- writeLog("TEST CIRCULAR - tig %u - pa=%u pb=%u - reads %u @ %u-%u -> %u-%u and %u @ %u-%u -> %u-%u\n",
- tigID, pa, pb,
- paReadID, paRead->position.bgn, paRead->position.end, validPlacements[pa].position.bgn, validPlacements[pa].position.end,
- pbReadID, pbRead->position.bgn, pbRead->position.end, validPlacements[pb].position.bgn, validPlacements[pb].position.end);
-
- if ((paSrcFwd != paDstFwd) ||
- (pbSrcFwd != pbDstFwd)) {
- writeLog("not circular - orient mismatch for tig %u pa %u pb %u reads %u and %u\n",
- tigID, pa, pb, paReadID, pbReadID);
- continue;
- }
- }
-
-
-
- for (uint32 pc=0; pc<potentialMerges.size(); pc++) {
- uint32 pa = potentialMerges[pc].first;
- uint32 pb = potentialMerges[pc].second;
-
- uint32 paReadID = validPlacements[pa].frgID;
- uint32 pbReadID = validPlacements[pb].frgID;
-
- uint32 paTigID = validPlacements[pa].tigID; // All reads placed in the same tig, see above.
- uint32 pbTigID = validPlacements[pb].tigID; // All reads placed in the same tig, see above.
-
- Unitig *paTig = unitigs[paTigID];
- Unitig *pbTig = unitigs[pbTigID];
-
- ufNode *paRead = &paTig->ufpath[Unitig::pathPosition(paReadID)];
- bool paSrcFwd = paRead->position.isForward();
- bool paDstFwd = validPlacements[pa].position.isForward();
-
- ufNode *pbRead = &pbTig->ufpath[Unitig::pathPosition(pbReadID)];
- bool pbSrcFwd = pbRead->position.isForward();
- bool pbDstFwd = validPlacements[pb].position.isForward();
-
- writeLog("TEST JOIN - pa tig %u read %u @ %u-%u -> %u-%u -- pb tig %u read %u @ %u-%u -> %u-%u\n",
- paTigID, paReadID, paRead->position.bgn, paRead->position.end, validPlacements[pa].position.bgn, validPlacements[pa].position.end,
- pbTigID, pbReadID, pbRead->position.bgn, pbRead->position.end, validPlacements[pb].position.bgn, validPlacements[pb].position.end);
- }
-
-
-
-
- exit(0);
-}
diff --git a/src/bogart/AS_BAT_Outputs.C b/src/bogart/AS_BAT_Outputs.C
index d763304..f6f8302 100644
--- a/src/bogart/AS_BAT_Outputs.C
+++ b/src/bogart/AS_BAT_Outputs.C
@@ -39,469 +39,81 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
#include "tgStore.H"
-void
-unitigToTig(tgTig *tig,
- uint32 tigid,
- Unitig *utg) {
-
- // Initialize the output tig.
-
- tig->clear();
-
- tig->_tigID = tigid;
- utg->_tigID = tigid;
-
- tig->_coverageStat = 1.0; // Default to just barely unique
- tig->_microhetProb = 1.0; // Default to 100% probability of unique
-
- // Set the class.
-
- if (utg->_isUnassembled == true)
- tig->_class = tgTig_unassembled;
-
- else if (utg->_isBubble == true)
- tig->_class = tgTig_bubble;
-
- else
- tig->_class = tgTig_contig;
-
- tig->_suggestRepeat = (utg->_isRepeat == true);
- tig->_suggestCircular = (utg->_isCircular == true);
-
- tig->_layoutLen = utg->getLength();
-
- // Transfer reads from the bogart tig to the output tig.
-
- resizeArray(tig->_children, tig->_childrenLen, tig->_childrenMax, utg->ufpath.size(), resizeArray_doNothing);
-
- for (uint32 ti=0; ti<utg->ufpath.size(); ti++) {
- ufNode *frg = &utg->ufpath[ti];
-
- tig->addChild()->set(frg->ident,
- frg->parent, frg->ahang, frg->bhang,
- frg->position.bgn, frg->position.end);
- }
-}
-
-
void
-writeUnitigsToStore(UnitigVector &unitigs,
- char *fileprefix,
- char *tigStorePath,
- uint32 frg_count_target,
- bool isFinal) {
- uint32 utg_count = 0;
- uint32 frg_count = 0;
- uint32 prt_count = 1;
+writeTigsToStore(TigVector &tigs,
+ char *filePrefix,
+ char *storeName,
+ bool isFinal) {
char filename[FILENAME_MAX] = {0};
- // Open up the initial output file
-
- sprintf(filename, "%s.iidmap", fileprefix);
- FILE *iidm = fopen(filename, "w");
- assert(NULL != iidm);
-
- sprintf(filename, "%s.partitioning", fileprefix);
- FILE *part = fopen(filename, "w");
- assert(NULL != part);
-
- sprintf(filename, "%s.partitioningInfo", fileprefix);
- FILE *pari = fopen(filename, "w");
- assert(NULL != pari);
-
- // Step through all the unitigs once to build the partition mapping and IID mapping.
-
- tgStore *tigStore = new tgStore(tigStorePath);
+ snprintf(filename, FILENAME_MAX, "%s.%sStore", filePrefix, storeName);
+ tgStore *tigStore = new tgStore(filename);
tgTig *tig = new tgTig;
- for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *utg = tigs[ti];
- if ((utg == NULL) || (utg->getNumFrags() == 0))
+ if ((utg == NULL) || (utg->getNumReads() == 0))
continue;
assert(utg->getLength() > 0);
- // Convert the bogart tig to a tgTig and save to the store.
-
- unitigToTig(tig, (isFinal) ? tigID : ti, utg);
- tigID++;
-
- tigStore->insertTig(tig, false);
-
- // Increment the partition if the current one is too large.
-
- if ((frg_count + utg->getNumFrags() >= frg_count_target) &&
- (frg_count > 0)) {
- fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
- prt_count, utg_count, frg_count);
-
- prt_count++;
- utg_count = 0;
- frg_count = 0;
- }
-
- // Note that the tig is included in this partition.
-
- utg_count += 1;
- frg_count += utg->getNumFrags();
-
- // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map.
-
- fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
- utg->id(),
- utg->tigID(),
- prt_count,
- utg->getNumFrags());
-
- for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++)
- fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident);
- }
-
- fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition!
- prt_count, utg_count, frg_count);
-
- fclose(pari);
- fclose(part);
- fclose(iidm);
-
- delete tig;
- delete tigStore;
-}
-
-
-
-class rawEdge_t {
-public:
- rawEdge_t(uint32 o, uint32 t, int32 ab, int32 ae, int32 bb, int32 be) {
- oi = o;
- tigID = t;
-
- Abgn = ab;
- Aend = ae;
-
- Bbgn = bb;
- Bend = be;
- };
-
- uint32 oi;
- int32 tigID;
-
- int32 Abgn; // Overlapping read placement.
- int32 Aend;
-
- int32 Bbgn; // Parent placement.
- int32 Bend;
-
- bool operator<(rawEdge_t const &that) const {
- if (tigID != that.tigID)
- return(tigID < that.tigID);
-
- return(Abgn < that.Abgn);
- }
-};
-
-
-
-void
-findUnusedEdges(UnitigVector &unitigs,
- ufNode *rdA, // Read we're finding edges for
- bool rdA3p, // Overlaps from the 3' end of the read
- set<uint32> edgeReads,
- FILE *EF) {
-
- uint32 rdAid = rdA->ident;
- uint32 rdAlen = FI->fragmentLength(rdAid);
- bool rdAfwd = rdA->isForward();
- int32 rdAlo = (rdAfwd) ? (rdA->position.bgn) : (rdA->position.end);
- int32 rdAhi = (rdAfwd) ? (rdA->position.end) : (rdA->position.bgn);
- uint32 rdAtigID = Unitig::fragIn(rdAid);
- Unitig *rdAtig = unitigs[rdAtigID];
-
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);
-
- vector<rawEdge_t> rawEdges;
-
- //fprintf(stderr, "WORKING ON read rdA=%u 3p=%d\n", rdA->ident, rdA3p);
-
- // Over all overlaps for this read, find and report edges to 'edgeReads'. Though
- // edgeReads should be just one read per tig end, the code below was originally written
- // to find all edges to all reads, then pick the longest for each cluster.
-
- for (uint32 oi=0; oi<ovlLen; oi++) {
- if ((ovl[oi].AisContainer()) || // Not interested in container overlaps.
- (ovl[oi].AisContained()) || // Allow A-is-contained overlaps? Should be OK, but only really care about dovetails.
- (ovl[oi].AEndIs3prime() != rdA3p)) // Overlap off the wrong end of A.
- continue;
-
- uint32 rdBid = ovl[oi].b_iid;
- uint32 rdBtigID = Unitig::fragIn(rdBid);
- Unitig *rdBtig = unitigs[rdBtigID];
-
- if ((rdBtig == NULL) ||
- (rdBtig->getNumFrags() == 0) || // Not interested in edges to singletons
- (rdBtig->_isUnassembled == true)) // Or other unassembled crap. rdA filtered outside here.
- continue;
-
- if ((rdAtigID != rdBtigID) && // Not to self (circular) and
- (edgeReads.count(rdBid) == 0)) // not a read we can overlap to.
- continue;
-
- ufNode *rdB = &rdBtig->ufpath[ Unitig::pathPosition(rdBid) ];
- bool rdBfwd = rdB->isForward();
- int32 rdBlo = (rdBfwd) ? (rdB->position.bgn) : (rdB->position.end);
- int32 rdBhi = (rdBfwd) ? (rdB->position.end) : (rdB->position.bgn);
-
- // Exclude overlaps satisfied in the same tig.
-
- if ((rdAtigID == rdBtigID) && (rdAlo < rdBhi) && (rdBlo < rdAhi))
- continue;
-
- // Exclude overlaps that are higher than expected error.
-
- ;
-
- // Compute the placement of rdA on rdBtig.
-
- ufNode placed;
- BestEdgeOverlap edge(ovl[oi]);
-
- rdBtig->placeFrag(placed,
- rdAid,
- rdA3p,
- &edge);
-
- //writeLog("placed tig %u rdA %u %d-%d on tig %u %d-%d from rdB %u %d-%d oi %u\n",
- // rdAtigID, rdAid, rdAlo, rdAhi, rdBtigID, placed.position.bgn, placed.position.end, rdBid, rdBlo, rdBhi, oi);
-
- // Save the overlap.
-
- rawEdges.push_back(rawEdge_t(oi, rdBtigID, placed.position.min(), placed.position.max(), rdBlo, rdBhi));
- }
-
- // We've now got a pile of (unsorted) overlaps to reads in other tigs. We need to pick one
- // overlap (the longest?) from each pile and output it.
-
- sort(rawEdges.begin(), rawEdges.end());
-
- // We expect to have a pile of placements that are 'the same', generated by each one of the
- // overlapping reads in the target tig. We need to group these placements together and pick
- // one exemplar overlap to output the edge for.
- //
- // A complication is caused by large tandem repeats. We can get two distinct placements that
- // overlap:
- //
- // [rrrr][rrrr]
- // --------------- (rdA aligning to the first and second repeat)
- // ---------------- (rdA aligning to only the second repeat)
- //
- // These are just overlaps, and we don't know that the rest of rdA fails to align.
- //
- // Overlaps are sorted by the start of rdA on rdBtig. We'll use the simple and largely unvalidated
- // heuristic of any placement that starts within 500bp of the last is for the same placement.
-
- for (uint32 ri=0, rj=0; ri<rawEdges.size(); ri = rj) {
- for (rj=ri+1; ((rj < rawEdges.size()) &&
- (rawEdges[rj].tigID == rawEdges[ri].tigID) &&
- (rawEdges[rj-1].Abgn + 500 >= rawEdges[rj].Abgn)); )
- rj++;
-
- // Scan overlaps from ri to rj, retain the thickest.
-
- //fprintf(stderr, "Scan batch from ri=%u to rj=%u\n", ri, rj);
-
- uint32 rrMax = 0;
- int32 rrIdx = INT32_MAX;
-
- for (uint32 rr=ri; rr<rj; rr++) {
- int32 olapLen = 0;
-
- if (rawEdges[rr].Abgn < rawEdges[rr].Bbgn) {
- assert(rawEdges[rr].Bend >= rawEdges[rr].Abgn);
- olapLen = rawEdges[rr].Bend - rawEdges[rr].Abgn;
- } else {
- assert(rawEdges[rr].Aend >= rawEdges[rr].Bbgn);
- olapLen = rawEdges[rr].Aend - rawEdges[rr].Bbgn;
- }
-
- if (rrMax < olapLen) {
- rrMax = olapLen;
- rrIdx = rr;
- }
- }
-
- // Emit the edge.
-
- uint32 oi = rawEdges[rrIdx].oi;
-
- uint32 rdBid = ovl[oi].b_iid;
- uint32 rdBtigID = Unitig::fragIn(rdBid);
- Unitig *rdBtig = unitigs[rdBtigID];
-
- ufNode *rdB = &rdBtig->ufpath[ Unitig::pathPosition(rdBid) ];
- bool rdBfwd = rdB->isForward();
- int32 rdBlo = (rdBfwd) ? (rdB->position.bgn) : (rdB->position.end);
- int32 rdBhi = (rdBfwd) ? (rdB->position.end) : (rdB->position.bgn);
-
- char rdAEnd, rdBEnd;
-
- if (ovl[oi].isDovetail()) {
- rdAEnd = ovl[oi].AEndIs5prime() ? '5' : '3';
- rdBEnd = ovl[oi].BEndIs5prime() ? '5' : '3';
- } else {
- rdAEnd = ovl[oi].AisContainer() ? 'C' : 'c';
- rdBEnd = ovl[oi].AisContainer() ? 'c' : 'C';
- }
-
- char ori = (ovl[oi].flipped) ? '<' : '>';
-
- fprintf(EF, "tig %7u %c read %8u at %9u %-9u %8d %c %-8d tig %7u %c read %8u at %9u %-9u\n",
- rdAtig->_tigID, rdAtig->type(), rdAid, rdA->position.bgn, rdA->position.end,
- ovl[oi].a_hang, ori, ovl[oi].b_hang,
- rdBtig->_tigID, rdBtig->type(), rdBid, rdB->position.bgn, rdB->position.end);
- }
-}
-
+ // Initialize the output tig.
+ tig->clear();
+ tig->_tigID = utg->id();
-void
-writeUnusedEdges(UnitigVector &unitigs,
- char *fileprefix) {
- char filename[FILENAME_MAX] = {0};
+ tig->_coverageStat = 1.0; // Default to just barely unique
+ tig->_microhetProb = 1.0; // Default to 100% probability of unique
- sprintf(filename, "%s.unused.edges", fileprefix);
- FILE *EF = fopen(filename, "w");
- if (errno)
- fprintf(stderr, "Failed to create unused edge output '%s': %s\n", filename, strerror(errno)), exit(1);
+ // Set the class.
- // Find reads we're allowed to find edges to. We can pick either the outer-most non-contained reads,
- // or just the reads touching the edge of the tig.
+ if (utg->_isUnassembled == true)
+ tig->_class = tgTig_unassembled;
- set<uint32> edgeReads; // Reads at the end of the tig
- set<uint32> nearReads; // Reads close to the end of the tig
+ // Disabled, because bogart is not finding most of the true bubbles.
+ //else if (utg->_isBubble == true)
+ // tig->_class = tgTig_bubble;
+ else
+ tig->_class = tgTig_contig;
- // Find the outer-most non-contained reads in each unitig.
+ tig->_suggestRepeat = (utg->_isRepeat == true);
+ tig->_suggestCircular = (utg->_isCircular == true);
-#if 0
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
+ tig->_layoutLen = utg->getLength();
- if ((tig == NULL) ||
- (tig->getNumFrags() == 0) ||
- (tig->_isUnassembled == true))
- continue;
+ // Transfer reads from the bogart tig to the output tig.
- // Find reads at the start of the tig
+ resizeArray(tig->_children, tig->_childrenLen, tig->_childrenMax, utg->ufpath.size(), resizeArray_doNothing);
- for (uint32 ct=0, fi=0; (ct < 5) && (fi < tig->ufpath.size()); fi++) {
- ufNode *frg = &tig->ufpath[fi];
+ for (uint32 ti=0; ti<utg->ufpath.size(); ti++) {
+ ufNode *frg = &utg->ufpath[ti];
- if (OG->isContained(frg->ident) == false) {
- if (ct == 0)
- edgeReads5e.insert(frg->ident);
- else
- nearReads5m.insert(frg->ident);
- ct++;
- }
+ tig->addChild()->set(frg->ident,
+ frg->parent, frg->ahang, frg->bhang,
+ frg->position.bgn, frg->position.end);
}
- // Find reads at the end of the tig
-
- for (uint32 ct=0, fi=tig->ufpath.size(); (ct < 5) && (fi-- > 0); ) {
- ufNode *frg = &tig->ufpath[fi];
-
- if (OG->isContained(frg->ident) == false) {
- if (ct == 0)
- edgeReads3e.insert(frg->ident);
- else
- nearReads3m.insert(frg->ident);
- ct++;
- }
- }
- }
-#endif
-
- // Find the reads at the ends of the tig.
-
-#if 1
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
-
- if ((tig == NULL) ||
- (tig->getNumFrags() == 0) ||
- (tig->_isUnassembled == true))
- continue;
-
- edgeReads.insert(tig->firstRead()->ident);
- edgeReads.insert(tig->lastRead()->ident);
- }
-#endif
-
-
- // Step through all the unitigs, find all unused overlaps off the ends of the tig.
-
-
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
+ // And write to the store
- if ((tig == NULL) ||
- (tig->getNumFrags() == 0) ||
- (tig->_isUnassembled == true))
- continue;
-
- assert(tig->getLength() > 0);
-
- // Find the first/last non-contained reads in the tig.
-
-#if 0
- ufNode *rd5 = &tig->ufpath.front();
- ufNode *rd3 = &tig->ufpath.back();
-
- for (uint32 fi=1; (fi < tig->ufpath.size()) && (OG->isContained(rd5->ident) == true); fi++)
- rd5 = &tig->ufpath[fi];
-
- for (uint32 fi=tig->ufpath.size()-1; (fi-- > 0) && (OG->isContained(rd3->ident) == true); )
- rd3 = &tig->ufpath[fi];
-
- // What to do if either of those reads are contained? If so (then both will be contained; no
- // dovetail at all) we've swapped the meaning of 5' and 3'.
-
- if ((OG->isContained(rd5) == true) || (OG->isContained(rd3) == true)) {
- rd5 = &tig->ufpath.front();
- rd3 = &tig->ufpath.back();
- }
-#endif
-
- // Find the smallest/largest read position - the two reads that are at the end of the tig.
-
-#if 1
- ufNode *rd5 = tig->firstRead();
- ufNode *rd3 = tig->lastRead();
-#endif
-
- // Finally, we probably should be finding just the reads touching the ends of the unitig, not the
- // first/last non-contained read.
-
- findUnusedEdges(unitigs, rd5, rd5->isReverse(), edgeReads, EF); // First read, if reverse, find edges off 3' end
- findUnusedEdges(unitigs, rd3, rd3->isForward(), edgeReads, EF); // Last read, if forward, find edges off 3' end
+ tigStore->insertTig(tig, false);
}
- fclose(EF);
+ delete tig;
+ delete tigStore;
}
-
diff --git a/src/bogart/AS_BAT_Outputs.H b/src/bogart/AS_BAT_Outputs.H
index dad3784..5783c70 100644
--- a/src/bogart/AS_BAT_Outputs.H
+++ b/src/bogart/AS_BAT_Outputs.H
@@ -39,19 +39,14 @@
#define INCLUDE_AS_BAT_OUTPUTS
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
void
-writeUnitigsToStore(UnitigVector &unitigs,
- char *fileprefix,
- char *tigStorePath,
- uint32 frg_count_target,
- bool isFinal);
-
-void
-writeUnusedEdges(UnitigVector &unitigs,
- char *fileprefix);
+writeTigsToStore(TigVector &tigs,
+ char *filePrefix,
+ char *storeName,
+ bool isFinal);
#endif // INCLUDE_AS_BAT_OUTPUTS
diff --git a/src/bogart/AS_BAT_OverlapCache.C b/src/bogart/AS_BAT_OverlapCache.C
index 6bfcae7..97a9829 100644
--- a/src/bogart/AS_BAT_OverlapCache.C
+++ b/src/bogart/AS_BAT_OverlapCache.C
@@ -43,7 +43,7 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H" // sizeof(BestEdgeOverlap)
#include "AS_BAT_Unitig.H" // sizeof(ufNode)
@@ -55,110 +55,43 @@
uint64 ovlCacheMagic = 0x65686361436c766fLLU; //0102030405060708LLU;
-#ifndef __CYGWIN__
- #ifndef _WIN32
- #include <sys/sysctl.h>
- #endif
-#endif
-
-#ifdef HW_PHYSMEM
-
-uint64
-getMemorySize(void) {
- uint64 physMemory = 0;
-
- int mib[2] = { CTL_HW, HW_PHYSMEM };
- size_t len = sizeof(uint64);
-
- errno = 0;
-
- if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0)
- // failed to get memory size, so what?
- fprintf(stderr, "sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1);
-
- if (len != sizeof(uint64)) {
-#ifdef HW_MEMSIZE
- mib[1] = HW_MEMSIZE;
- len = sizeof(uint64);
- if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0 || len != sizeof(uint64))
-#endif
- // wasn't enough space, so what?
- fprintf(stderr, "sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1);
- }
-
- return(physMemory);
-}
-#else
+#undef TEST_LINEAR_SEARCH
-uint64
-getMemorySize(void) {
- uint64 physPages = sysconf(_SC_PHYS_PAGES);
- uint64 pageSize = sysconf(_SC_PAGESIZE);
- uint64 physMemory = physPages * pageSize;
- fprintf(stderr, "PHYS_PAGES = "F_U64"\n", physPages);
- fprintf(stderr, "PAGE_SIZE = "F_U64"\n", pageSize);
- fprintf(stderr, "MEMORY = "F_U64"\n", physMemory);
-
- return(physMemory);
-}
-
-#endif
+#define ERR_MASK (((uint64)1 << AS_MAX_EVALUE_BITS) - 1)
+#define SALT_BITS (64 - AS_MAX_READLEN_BITS - AS_MAX_EVALUE_BITS)
+#define SALT_MASK (((uint64)1 << SALT_BITS) - 1)
-OverlapCache::OverlapCache(ovStore *ovlStoreUniq,
+OverlapCache::OverlapCache(gkStore *gkp,
+ ovStore *ovlStoreUniq,
ovStore *ovlStoreRept,
const char *prefix,
- double erate,
+ double maxErate,
uint32 minOverlap,
uint64 memlimit,
- uint32 maxOverlaps,
- bool onlySave,
+ uint64 genomeSize,
bool doSave) {
- _memLimit = 0;
- _memUsed = 0;
+ _prefix = prefix;
- _storMax = 0;
- _storLen = 0;
- _stor = NULL;
-
- _heaps.clear();
-
- _cacheMMF = NULL;
-
- _cachePtr = NULL;
- _cacheLen = NULL;
-
- _maxPer = 0;
-
- _ovsMax = 0;
- _ovs = NULL;
- _ovsSco = NULL;
- _ovsTmp = NULL;
-
- _threadMax = 0;
- _thread = NULL;
-
- _ovlStoreUniq = NULL;
- _ovlStoreRept = NULL;
-
- if (load(prefix, erate) == true)
- return;
-
- fprintf(stderr, "\n");
+ writeStatus("\n");
if (memlimit == UINT64_MAX) {
- _memLimit = getMemorySize();
- fprintf(stderr, "OverlapCache()-- limited to "F_U64"MB memory (total physical memory).\n", _memLimit >> 20);
- } else if (memlimit > 0) {
+ _memLimit = getPhysicalMemorySize();
+ writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (total physical memory).\n", _memLimit >> 20);
+ }
+
+ else if (memlimit > 0) {
_memLimit = memlimit;
- fprintf(stderr, "OverlapCache()-- limited to "F_U64"MB memory (user supplied).\n", _memLimit >> 20);
- } else {
- fprintf(stderr, "OverlapCache()-- using unlimited memory (-M 0).\n");
+ writeStatus("OverlapCache()-- limited to " F_U64 "MB memory (user supplied).\n", _memLimit >> 20);
+ }
+
+ else {
_memLimit = UINT64_MAX;
+ writeStatus("OverlapCache()-- using unlimited memory (-M 0).\n");
}
// Need to initialize thread data before we can account for their size.
@@ -168,98 +101,76 @@ OverlapCache::OverlapCache(ovStore *ovlStoreUniq,
// And this too.
_ovsMax = 1 * 1024 * 1024; // At 16B each, this is 16MB
- // Account for memory used by fragment data, best overlaps, and unitigs.
- // The chunk graph is temporary, and should be less than the size of the unitigs.
+ // Account for memory used by read data, best overlaps, and tigs.
+ // The chunk graph is temporary, and should be less than the size of the tigs.
- uint64 memFI = FI->memoryUsage();
- uint64 memBE = FI->numFragments() * sizeof(BestEdgeOverlap);
- uint64 memUL = FI->numFragments() * sizeof(ufNode); // For fragment positions in unitigs
- uint64 memUT = FI->numFragments() * sizeof(uint32) / 16; // For unitigs (assumes 32 frag / unitig)
- uint64 memID = FI->numFragments() * sizeof(uint32) * 2; // For maps of fragment id to unitig id
- uint64 memEP = FI->numFragments() * Unitig::epValueSize() * 2; // For error profile
+ uint64 memFI = RI->memoryUsage();
+ uint64 memBE = RI->numReads() * sizeof(BestEdgeOverlap);
+ uint64 memUL = RI->numReads() * sizeof(ufNode); // For read positions in tigs
+ uint64 memUT = RI->numReads() * sizeof(uint32) / 16; // For tigs (assumes 32 read / unitig)
+ uint64 memID = RI->numReads() * sizeof(uint32) * 2; // For maps of read id to unitig id
+ uint64 memEP = RI->numReads() * Unitig::epValueSize() * 2; // For error profile
- uint64 memC1 = (FI->numFragments() + 1) * (sizeof(BAToverlapInt *) + sizeof(uint32));
+ uint64 memC1 = (RI->numReads() + 1) * (sizeof(BAToverlap *) + sizeof(uint32));
uint64 memC2 = _ovsMax * (sizeof(ovOverlap) + sizeof(uint64) + sizeof(uint64));
uint64 memC3 = _threadMax * _thread[0]._batMax * sizeof(BAToverlap);
- uint64 memC4 = (FI->numFragments() + 1) * sizeof(uint32);
+ uint64 memC4 = (RI->numReads() + 1) * sizeof(uint32);
- uint64 memOS = (_memLimit == getMemorySize()) ? (0.1 * getMemorySize()) : 0.0;
+ uint64 memOS = (_memLimit == getPhysicalMemorySize()) ? (0.1 * getPhysicalMemorySize()) : 0.0;
uint64 memTT = memFI + memBE + memUL + memUT + memID + memC1 + memC2 + memC3 + memC4 + memOS;
- if (onlySave) {
- fprintf(stderr, "OverlapCache()-- Only saving overlaps, not computing unitigs.\n");
- memBE = 0;
- memUL = 0;
- memUT = 0;
- memID = 0;
- memTT = memFI + memBE + memUL + memUT + memID + memOS + memC1 + memC2 + memC3 + memC4;
- }
-
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for fragment data.\n", memFI >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for best edges.\n", memBE >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for unitig layouts.\n", memUL >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for unitigs.\n", memUT >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for id maps.\n", memID >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for error profiles.\n", memEP >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for overlap cache pointers.\n", memC1 >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for overlap cache initial bucket.\n", memC2 >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for overlap cache thread data.\n", memC3 >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for number of overlaps per read.\n", memC4 >> 20);
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for other processes.\n", memOS >> 20);
- fprintf(stderr, "OverlapCache()-- ---------\n");
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB for data structures (sum of above).\n", memTT >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for read data.\n", memFI >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for best edges.\n", memBE >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for unitig layouts.\n", memUL >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for tigs.\n", memUT >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for id maps.\n", memID >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for error profiles.\n", memEP >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache pointers.\n", memC1 >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache initial bucket.\n", memC2 >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for overlap cache thread data.\n", memC3 >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for number of overlaps per read.\n", memC4 >> 20);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for other processes.\n", memOS >> 20);
+ writeStatus("OverlapCache()-- ---------\n");
+ writeStatus("OverlapCache()-- %7" F_U64P "MB for data structures (sum of above).\n", memTT >> 20);
if (_memLimit <= memTT) {
int64 defecit = (int64)memTT - (int64)_memLimit;
- fprintf(stderr, "OverlapCache()-- %7"F_S64P"MB available for overlaps.\n", defecit);
- fprintf(stderr, "OverlapCache()-- Out of memory before loading overlaps; increase -M.\n");
+ writeStatus("OverlapCache()-- %7" F_S64P "MB available for overlaps.\n", defecit);
+ writeStatus("OverlapCache()-- Out of memory before loading overlaps; increase -M.\n");
exit(1);
}
_memLimit -= memTT;
_memUsed = 0;
- fprintf(stderr, "OverlapCache()-- %7"F_U64P"MB available for overlaps.\n", _memLimit >> 20);
- fprintf(stderr, "\n");
-
- // Decide on the default block size. We want to use large blocks (to reduce the number of
- // allocations, and load on the allocator) but not so large that we can't fit nicely.
- //
- // 1gb blocks @ 64 -> 64gb
- // 128mb blocks @ 64 -> 8gb
- //
- // below 8gb we'll use 128mb blocks
- // from 8gb to 64gb, we'll use _memLimit/64
- // from 64gb on, we'll use 1gb block
-
- if (_memLimit <= (uint64)8 * 1024 * 1024 * 1024)
- _storMax = 128 * 1024 * 1024 / sizeof(BAToverlapInt);
-
- else if (_memLimit <= (uint64)64 * 1024 * 1024 * 1024)
- _storMax = _memLimit / 64 / sizeof(BAToverlapInt);
+ writeStatus("OverlapCache()-- %7" F_U64P "MB available for overlaps.\n", _memLimit >> 20);
+ writeStatus("\n");
- else
- _storMax = (uint64)1024 * 1024 * 1024 / sizeof(BAToverlapInt);
+ _overlaps = new BAToverlap * [RI->numReads() + 1];
+ _overlapLen = new uint32 [RI->numReads() + 1];
+ _overlapMax = new uint32 [RI->numReads() + 1];
- _storLen = 0;
- _stor = NULL;
+ memset(_overlaps, 0, sizeof(BAToverlap *) * (RI->numReads() + 1));
+ memset(_overlapLen, 0, sizeof(uint32) * (RI->numReads() + 1));
+ memset(_overlapMax, 0, sizeof(uint32) * (RI->numReads() + 1));
- _cacheMMF = NULL;
+ _maxEvalue = AS_OVS_encodeEvalue(maxErate);
+ _minOverlap = minOverlap;
- _cachePtr = new BAToverlapInt * [FI->numFragments() + 1];
- _cacheLen = new uint32 [FI->numFragments() + 1];
+ _minPer = 0;
+ _maxPer = 0;
- memset(_cachePtr, 0, sizeof(BAToverlapInt *) * (FI->numFragments() + 1));
- memset(_cacheLen, 0, sizeof(uint32) * (FI->numFragments() + 1));
-
- _maxPer = maxOverlaps;
+ _checkSymmetry = false;
_ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax); // So can't call bgn or end.
_ovsSco = new uint64 [_ovsMax];
_ovsTmp = new uint64 [_ovsMax];
+ _genomeSize = genomeSize;
+
+ _gkp = gkp;
_ovlStoreUniq = ovlStoreUniq;
_ovlStoreRept = ovlStoreRept;
@@ -267,105 +178,96 @@ OverlapCache::OverlapCache(ovStore *ovlStoreUniq,
assert(_ovlStoreRept == NULL);
if (_memUsed > _memLimit)
- fprintf(stderr, "OverlapCache()-- ERROR: not enough memory to load ANY overlaps.\n"), exit(1);
+ writeStatus("OverlapCache()-- ERROR: not enough memory to load ANY overlaps.\n"), exit(1);
computeOverlapLimit();
- loadOverlaps(erate, minOverlap, prefix, onlySave, doSave);
+ loadOverlaps(doSave);
+ symmetrizeOverlaps();
delete [] _ovs; _ovs = NULL;
delete [] _ovsSco; _ovsSco = NULL;
delete [] _ovsTmp; _ovsTmp = NULL;
-
- if (doSave == true)
- save(prefix, erate);
-
- if ((doSave == true) && (onlySave == true))
- fprintf(stderr, "Exiting; only requested to build the overlap graph.\n"), exit(0);
}
OverlapCache::~OverlapCache() {
- if (_cacheMMF) {
- _stor = NULL;
- delete _cacheMMF;
- }
+ for (uint32 rr=0; rr<RI->numReads(); rr++)
+ delete [] _overlaps[rr];
+
+ delete [] _overlaps;
+ delete [] _overlapLen;
+ delete [] _overlapMax;
delete [] _ovs;
delete [] _thread;
-
- delete [] _cacheLen;
- delete [] _cachePtr;
-
- for (uint32 i=0; i<_heaps.size(); i++)
- delete [] _heaps[i];
}
-
-// Decide on limits per fragment.
+// Decide on limits per read.
//
-// From the memory limit, we can compute the average allowed per fragment. If this is higher than
-// the expected coverage, we'll not fill memory completely as the fragments in unique sequence will
+// From the memory limit, we can compute the average allowed per read. If this is higher than
+// the expected coverage, we'll not fill memory completely as the reads in unique sequence will
// have fewer than this number of overlaps.
//
-// We'd like to iterate this, but the unused space computation assumes all fragments are assigned
+// We'd like to iterate this, but the unused space computation assumes all reads are assigned
// the same amount of memory. On the next iteration, this isn't true any more. The benefit is
// (hopefully) small, and the algorithm is unknown.
//
// This isn't perfect. It estimates based on whatever is in the store, not only those overlaps
// below the error threshold. Result is that memory usage is far below what it should be. Easy to
-// fix if we assume all fragments have the same properties (same library, same length, same error
+// fix if we assume all reads have the same properties (same library, same length, same error
// rate) but not so easy in reality. We need big architecture changes to make it easy (grouping
// reads by library, collecting statistics from the overlaps, etc).
//
// It also doesn't distinguish between 5' and 3' overlaps - it is possible for all the long
// overlaps to be off of one end.
//
+
void
OverlapCache::computeOverlapLimit(void) {
-
- if (_maxPer < UINT32_MAX) {
- // -N supplied on the command line, use that instead.
- fprintf(stderr, "OverlapCache()-- _maxPer = "F_U32" overlaps/frag (from command line)\n", _maxPer);
- return;
- }
-
_ovlStoreUniq->resetRange();
// AS_OVS_numOverlapsPerFrag returns an array that starts at firstIIDrequested. This is usually
- // 1, unless the first fragment has no overlaps. In that case, firstIIDrequested will be the
- // first fragment with overlaps. This is a terrible interface.
+ // 1, unless the first read has no overlaps. In that case, firstIIDrequested will be the
+ // first read with overlaps. This is a terrible interface.
- fprintf(stderr, "OverlapCache()-- Loading number of overlaps per fragment.\n");
+ writeStatus("OverlapCache()-- Loading number of overlaps per read.\n");
- uint32 frstFrag = 0;
- uint32 lastFrag = 0;
- uint32 *numPer = _ovlStoreUniq->numOverlapsPerFrag(frstFrag, lastFrag);
- uint32 totlFrag = lastFrag - frstFrag + 1;
- uint32 numPerMax = 0;
+ uint32 frstRead = 0;
+ uint32 lastRead = 0;
+ uint32 *numPer = _ovlStoreUniq->numOverlapsPerFrag(frstRead, lastRead);
+ uint32 totlRead = lastRead - frstRead + 1;
+ uint32 numPerMax = findHighestOverlapCount();
- for (uint32 i=0; i<totlFrag; i++)
- if (numPerMax < numPer[i])
- numPerMax = numPer[i];
+ uint64 memAvail = (_memLimit - _memUsed);
+
+ // Set the minimum number of overlaps per read to 6x (2 * 3) the read coverage.
- _maxPer = (_memLimit - _memUsed) / (FI->numFragments() * sizeof(BAToverlapInt));
+ _minPer = 2 * 3 * RI->numBases() / _genomeSize;
- fprintf(stderr, "OverlapCache()-- Initial guess at _maxPer="F_U32" (max of "F_U32") from (memLimit="F_U64" - memUsed="F_U64") / (numFrags="F_U32" * sizeof(OVL)="F_SIZE_T")\n",
- _maxPer, numPerMax, _memLimit, _memUsed, FI->numFragments(), sizeof(BAToverlapInt));
+ writeStatus("OverlapCache()-- Retain at least " F_U32 " overlaps/read, based on %.2fx coverage.\n",
+ _minPer, (double)RI->numBases() / _genomeSize);
+
+ // Set the maximum number of overlaps per read to a guess of what it will take to fill up memory.
+
+ _maxPer = memAvail / (RI->numReads() * sizeof(BAToverlap));
+
+ writeStatus("OverlapCache()-- Initial guess at " F_U32 " overlaps/read (maximum " F_U32 " overlaps/read).\n",
+ _maxPer, numPerMax);
if (_maxPer < 10)
- fprintf(stderr, "OverlapCache()-- ERROR: not enough memory to load overlaps (_maxPer="F_U32" < 10).\n", _maxPer), exit(1);
+ writeStatus("OverlapCache()-- ERROR: not enough memory to load overlaps!.\n"), exit(1);
uint64 totalLoad = 0; // Total overlaps we would load at this threshold
+ uint64 totalOlaps = _ovlStoreUniq->numOverlapsInRange();
- uint32 numBelow = 0; // Number below the threshold
- //uint64 numBelowS = 0; // Amount of space wasted beacuse of this
+ uint32 numBelow = 0; // Number of reads below the threshold
uint32 numEqual = 0;
- uint32 numAbove = 0; // Number of fragments above the threshold
+ uint32 numAbove = 0; // Number of reads above the threshold
uint32 lastMax = 0;
@@ -374,14 +276,12 @@ OverlapCache::computeOverlapLimit(void) {
while (adjust > 0) {
totalLoad = 0;
numBelow = 0;
- //numBelowS = 0;
numEqual = 0;
numAbove = 0;
- for (uint32 i=0; i<totlFrag; i++) {
+ for (uint32 i=0; i<totlRead; i++) {
if (numPer[i] < _maxPer) {
numBelow++;
- //numBelowS += _maxPer - MAX(lastMax, numPer[i]); // Number of extra overlaps we could still load; the unused space for this read
totalLoad += numPer[i];
} else if (numPer[i] == _maxPer) {
@@ -394,34 +294,32 @@ OverlapCache::computeOverlapLimit(void) {
}
}
- fprintf(stderr, "OverlapCache()-- _maxPer=%7"F_U32P" (numBelow="F_U32" numEqual="F_U32" numAbove="F_U32" totalLoad="F_U64" -- "F_U64" + "F_U64" = "F_U64" <? "F_U64"\n",
- _maxPer, numBelow, numEqual, numAbove,
- totalLoad, _memUsed, totalLoad + _memUsed,
- totalLoad * sizeof(BAToverlapInt), _memLimit);
+ writeStatus("OverlapCache()-- %7" F_U32P " overlaps/read - load all for %7" F_U32P " reads, some for %7" F_U32P " reads - %12" F_U64P " overlaps to load - %4" F_U64P "MB\n",
+ _maxPer,
+ numBelow + numEqual,
+ numAbove,
+ totalLoad,
+ totalLoad * sizeof(BAToverlap) >> 20);
- if ((numAbove == 0) && (_memUsed + totalLoad * sizeof(BAToverlapInt) < _memLimit)) {
- // All done, nothing to do here.
+ // All done, nothing to do here.
+ if ((numAbove == 0) && (totalLoad * sizeof(BAToverlap) < memAvail)) {
adjust = 0;
+ }
- } else if (_memUsed + totalLoad * sizeof(BAToverlapInt) < _memLimit) {
- // This limit worked, let's try moving it a little higher.
-
+ // This limit worked, let's try moving it a little higher.
+ else if (totalLoad * sizeof(BAToverlap) < memAvail) {
lastMax = _maxPer;
- adjust = (_memLimit - _memUsed - totalLoad * sizeof(BAToverlapInt)) / numAbove / sizeof(BAToverlapInt);
+ adjust = (memAvail - totalLoad * sizeof(BAToverlap)) / numAbove / sizeof(BAToverlap);
_maxPer += adjust;
- fprintf(stderr, "OverlapCache()-- ("F_U64" MB free, adjust by "F_U32")\n",
- (_memLimit - _memUsed - totalLoad * sizeof(BAToverlapInt)) >> 20,
- adjust);
-
if (_maxPer > numPerMax)
_maxPer = numPerMax;
+ }
- } else {
- // Whoops! Too high! Revert to the last and recompute statistics.
-
+ // Whoops! Too high! Revert to the last and recompute statistics.
+ else {
adjust = 0;
_maxPer = lastMax;
@@ -430,10 +328,9 @@ OverlapCache::computeOverlapLimit(void) {
numEqual = 0;
numAbove = 0;
- for (uint32 i=0; i<totlFrag; i++) {
+ for (uint32 i=0; i<totlRead; i++) {
if (numPer[i] < _maxPer) {
numBelow++;
- //numBelowS += _maxPer - numPer[i];
totalLoad += numPer[i];
} else if (numPer[i] == _maxPer) {
@@ -446,61 +343,156 @@ OverlapCache::computeOverlapLimit(void) {
}
}
- fprintf(stderr, "OverlapCache()-- _maxPer=%7"F_U32P" (overestimated, revert to last good and stop)\n", _maxPer);
+ writeStatus("OverlapCache()-- _maxPer=%7" F_U32P " (overestimated, revert to last good and stop)\n", _maxPer);
}
}
// Report
- fprintf(stderr, "\n");
- fprintf(stderr, "OverlapCache()-- blockSize = "F_U32" ("F_SIZE_T"MB)\n", _storMax, (_storMax * sizeof(BAToverlapInt)) >> 20);
- fprintf(stderr, "\n");
- fprintf(stderr, "OverlapCache()-- _maxPer = "F_U32" overlaps/reads\n", _maxPer);
- fprintf(stderr, "OverlapCache()-- numBelow = "F_U32" reads (all overlaps loaded)\n", numBelow);
- fprintf(stderr, "OverlapCache()-- numEqual = "F_U32" reads (all overlaps loaded)\n", numEqual);
- fprintf(stderr, "OverlapCache()-- numAbove = "F_U32" reads (some overlaps loaded)\n", numAbove);
- fprintf(stderr, "OverlapCache()-- totalLoad = "F_U64" overlaps (%6.2f%%)\n", totalLoad, 100.0 * totalLoad / _ovlStoreUniq->numOverlapsInRange());
- fprintf(stderr, "\n");
- fprintf(stderr, "OverlapCache()-- availForOverlaps = "F_U64"MB\n", _memLimit >> 20);
- fprintf(stderr, "OverlapCache()-- totalMemory = "F_U64"MB for organization\n", _memUsed >> 20);
- fprintf(stderr, "OverlapCache()-- totalMemory = "F_U64"MB for overlaps\n", (totalLoad * sizeof(BAToverlapInt)) >> 20);
- fprintf(stderr, "OverlapCache()-- totalMemory = "F_U64"MB used\n", (_memUsed + totalLoad * sizeof(BAToverlapInt)) >> 20);
- fprintf(stderr, "\n");
+ writeStatus("\n");
+ writeStatus("OverlapCache()-- minPer = " F_U32 " overlaps/reads\n", _minPer);
+ writeStatus("OverlapCache()-- maxPer = " F_U32 " overlaps/reads\n", _maxPer);
+ writeStatus("OverlapCache()-- numBelow = " F_U32 " reads (all overlaps loaded)\n", numBelow);
+ writeStatus("OverlapCache()-- numEqual = " F_U32 " reads (all overlaps loaded)\n", numEqual);
+ writeStatus("OverlapCache()-- numAbove = " F_U32 " reads (some overlaps loaded)\n", numAbove);
+ writeStatus("OverlapCache()-- totalLoad = " F_U64 " overlaps (%6.2f%%)\n", totalLoad, (totalOlaps > 0) ? (100.0 * totalLoad / totalOlaps) : 0.0);
+ writeStatus("\n");
+ writeStatus("OverlapCache()-- availForOverlaps = " F_U64 "MB\n", memAvail >> 20);
+ writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB for organization\n", _memUsed >> 20);
+ writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB for overlaps\n", (totalLoad * sizeof(BAToverlap)) >> 20);
+ writeStatus("OverlapCache()-- totalMemory = " F_U64 "MB used\n", (_memUsed + totalLoad * sizeof(BAToverlap)) >> 20);
+ writeStatus("\n");
+
+ _checkSymmetry = (numAbove > 0) ? true : false;
delete [] numPer;
}
+uint32
+OverlapCache::findHighestOverlapCount(void) {
+ uint32 fRead = 0;
+ uint32 lRead = 0;
+ uint32 *numPer = _ovlStoreUniq->numOverlapsPerFrag(fRead, lRead);
+ uint32 totlRead = lRead - fRead + 1;
+
+ uint32 numPerMax = 0;
+
+ for (uint32 i=0; i<totlRead; i++)
+ if (numPerMax < numPer[i])
+ numPerMax = numPer[i];
+
+ delete [] numPer;
+
+ return(numPerMax);
+}
+
+
+
+void
+OverlapCache::allocateLoadingSpace(void) {
+
+ _ovsMax = findHighestOverlapCount();
+
+ _ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax); // So can't call bgn or end.
+ _ovsSco = new uint64 [_ovsMax];
+ _ovsTmp = new uint64 [_ovsMax];
+
+ _memUsed += (_ovsMax) * sizeof(ovOverlap);
+ _memUsed += (_ovsMax) * sizeof(uint64);
+ _memUsed += (_ovsMax) * sizeof(uint64);
+}
+
uint32
-OverlapCache::filterOverlaps(uint32 maxEvalue, uint32 minOverlap, uint32 no) {
- uint32 ns = 0;
+OverlapCache::filterDuplicates(uint32 &no) {
+ uint32 nFiltered = 0;
+
+ for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
+ if (_ovs[ii].b_iid != _ovs[jj].b_iid)
+ continue;
+
+ // Found duplicate B IDs. Drop one of them.
+
+ nFiltered++;
+
+ // If they're the same length, make the one with the higher evalue be length zero so it'll be
+ // the shortest.
+
+ uint32 iilen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
+ uint32 jjlen = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang());
+
+ if (iilen == jjlen) {
+ if (_ovs[ii].evalue() < _ovs[jj].evalue())
+ jjlen = 0;
+ else
+ iilen = 0;
+ }
+
+ // Drop the shorter overlap by forcing its erate to the maximum.
+
+ if (iilen < jjlen)
+ _ovs[ii].evalue(AS_MAX_EVALUE);
+ else
+ _ovs[jj].evalue(AS_MAX_EVALUE);
+ }
+
+ // Now that all have been filtered, squeeze out the filtered overlaps. We used to just copy the
+ // last element over any deleted ones, leaving the list unsorted, but we're now (Nov 2016) binary
+ // searching on it, so can't do that.
+
+ if (nFiltered > 0) {
+ // Needs to have its own log. Lots of stuff here.
+ //writeLog("OverlapCache()-- read %u filtered %u overlaps to the same read pair\n", _ovs[0].a_iid, nFiltered);
+
+ for (uint32 ii=0, jj=0; jj<no; ) {
+ if (_ovs[jj].evalue() == AS_MAX_EVALUE) {
+ jj++;
+ continue;
+ }
+
+ if (ii != jj) {
+ _ovs[ii] = _ovs[jj];
+ _ovs[jj].clear();
+ }
+
+ ii++;
+ jj++;
+ }
+
+ no -= nFiltered;
+
+ for (uint32 jj=0; jj<no; jj++) {
+ assert(_ovs[jj].a_iid != 0);
+ assert(_ovs[jj].b_iid != 0);
+ assert(_ovs[jj].evalue() != AS_MAX_EVALUE);
+ }
+ }
- // Score the overlaps.
+ return(nFiltered);
+}
- uint64 ERR_MASK = ((uint64)1 << AS_MAX_EVALUE_BITS) - 1;
- uint32 SALT_BITS = (64 - AS_MAX_READLEN_BITS - AS_MAX_EVALUE_BITS);
- uint64 SALT_MASK = (((uint64)1 << SALT_BITS) - 1);
- memset(_ovsSco, 0, sizeof(uint64) * no);
+uint32
+OverlapCache::filterOverlaps(uint32 maxEvalue, uint32 minOverlap, uint32 no) {
+ uint32 ns = 0;
for (uint32 ii=0; ii<no; ii++) {
- if ((FI->fragmentLength(_ovs[ii].a_iid) == 0) ||
- (FI->fragmentLength(_ovs[ii].b_iid) == 0))
- // At least one read deleted in the overlap
+ _ovsSco[ii] = 0; // Overlaps 'continue'd below will be filtered, even if 'no filtering' is needed.
+
+ if ((RI->readLength(_ovs[ii].a_iid) == 0) || // At least one read in the overlap is deleted
+ (RI->readLength(_ovs[ii].b_iid) == 0))
continue;
- if (_ovs[ii].evalue() > maxEvalue)
- // Too noisy.
+ if (_ovs[ii].evalue() > maxEvalue) // Too noisy to care
continue;
- uint32 olen = FI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
+ uint32 olen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
- if (olen < minOverlap)
- // Too short.
+ if (olen < minOverlap) // Too short to care
continue;
// Just right!
@@ -510,480 +502,474 @@ OverlapCache::filterOverlaps(uint32 maxEvalue, uint32 minOverlap, uint32 no) {
_ovsSco[ii] |= (~_ovs[ii].evalue()) & ERR_MASK;
_ovsSco[ii] <<= SALT_BITS;
_ovsSco[ii] |= ii & SALT_MASK;
+
ns++;
}
- // If fewer than the limit, keep them all. Should we reset ovsSco to be 1? Do we really need ovsTmp?
-
- memcpy(_ovsTmp, _ovsSco, sizeof(uint64) * no);
-
- if (ns <= _maxPer)
+ if (ns <= _maxPer) // Fewer overlaps than the limit, no filtering needed.
return(ns);
- // Otherwise, filter out the short and low quality.
+ // Otherwise, filter out the short and low quality overlaps and count how many we saved.
- sort(_ovsTmp, _ovsTmp + no);
-
- uint64 cutoff = _ovsTmp[no - _maxPer];
+ memcpy(_ovsTmp, _ovsSco, sizeof(uint64) * no);
- for (uint32 ii=0; ii<no; ii++)
- if (_ovsSco[ii] < cutoff)
- _ovsSco[ii] = 0;
+ sort(_ovsTmp, _ovsTmp + no);
- // Count how many overlaps we saved.
+ uint64 minScore = _ovsTmp[no - _maxPer];
ns = 0;
for (uint32 ii=0; ii<no; ii++)
- if (_ovsSco[ii] > 0)
+ if (_ovsSco[ii] < minScore)
+ _ovsSco[ii] = 0;
+ else
ns++;
- if (ns > _maxPer)
- fprintf(stderr, "WARNING: fragment "F_U32" loaded "F_U32" overlas (it has "F_U32" in total); over the limit of "F_U32"\n",
- _ovs[0].a_iid, ns, no, _maxPer);
+ assert(ns <= _maxPer);
return(ns);
}
-
void
-OverlapCache::loadOverlaps(double erate, uint32 minOverlap, const char *prefix, bool onlySave, bool doSave) {
- uint64 numTotal = 0;
- uint64 numLoaded = 0;
- uint32 numFrags = 0;
- uint32 numOvl = 0;
- uint32 maxEvalue = AS_OVS_encodeEvalue(erate);
-
- FILE *ovlDat = NULL;
-
- if (doSave == true) {
- char name[FILENAME_MAX];
+OverlapCache::loadOverlaps(bool doSave) {
- sprintf(name, "%s.ovlCacheDat", prefix);
-
- fprintf(stderr, "OverlapCache()-- Saving overlaps to '%s'.\n", name);
-
- errno = 0;
-
- ovlDat = fopen(name, "w");
- if (errno)
- fprintf(stderr, "OverlapCache()-- Failed to open '%s' for write: %s\n", name, strerror(errno)), exit(1);
- }
+ if (load() == true)
+ return;
assert(_ovlStoreUniq != NULL);
assert(_ovlStoreRept == NULL);
_ovlStoreUniq->resetRange();
- uint64 numStore = _ovlStoreUniq->numOverlapsInRange();
+ uint64 numTotal = 0;
+ uint64 numLoaded = 0;
+ uint64 numDups = 0;
+ uint32 numReads = 0;
+ uint64 numStore = _ovlStoreUniq->numOverlapsInRange();
- writeLog("OverlapCache()-- Loading overlap information\n");
+ if (numStore == 0)
+ writeStatus("ERROR: No overlaps in overlap store?\n"), exit(1);
// Could probably easily extend to multiple stores. Needs to interleave the two store
- // loads, can't do one after the other as we require all overlaps for a single fragment
+ // loads, can't do one after the other as we require all overlaps for a single read
// be in contiguous memory.
while (1) {
+ uint32 numOvl = _ovlStoreUniq->numberOfOverlaps(); // Query how many overlaps for the next read.
- // Ask the store how many overlaps exist for this fragment.
- numOvl = _ovlStoreUniq->numberOfOverlaps();
-
- numTotal += numOvl;
-
- if (numOvl == 0)
- // No overlaps? We're at the end of the store.
+ if (numOvl == 0) // If no overlaps, we're at the end of the store.
break;
- // Resize temporary storage space to hold all these overlaps.
- while (_ovsMax <= numOvl) {
- _memUsed -= (_ovsMax) * sizeof(ovOverlap);
- _memUsed -= (_ovsMax) * sizeof(uint64);
- _ovsMax *= 2;
- delete [] _ovs;
- delete [] _ovsSco;
- delete [] _ovsTmp;
- _ovs = ovOverlap::allocateOverlaps(NULL, _ovsMax); // So can't call bgn or end.
- _ovsSco = new uint64 [_ovsMax];
- _ovsTmp = new uint64 [_ovsMax];
- _memUsed += (_ovsMax) * sizeof(ovOverlap);
- _memUsed += (_ovsMax) * sizeof(uint64);
- _memUsed += (_ovsMax) * sizeof(uint64);
- }
+ assert(numOvl <= _ovsMax);
- // Actually load the overlaps.
- uint32 no = _ovlStoreUniq->readOverlaps(_ovs, _ovsMax);
- uint32 ns = filterOverlaps(maxEvalue, minOverlap, no);
+ // Actually load the overlaps, then detect and remove overlaps between the same pair, then
+ // filter short and low quality overlaps.
- // Resize the permament storage space for overlaps.
- if ((_storLen + ns > _storMax) ||
- (_stor == NULL)) {
+ uint32 no = _ovlStoreUniq->readOverlaps(_ovs, _ovsMax); // no == total overlaps == numOvl
+ uint32 nd = filterDuplicates(no); // nd == duplicated overlaps (no is decreased by this amount)
+ uint32 ns = filterOverlaps(_maxEvalue, _minOverlap, no); // ns == acceptable overlaps
- if ((ovlDat) && (_storLen > 0))
- AS_UTL_safeWrite(ovlDat, _stor, "_stor", sizeof(BAToverlapInt), _storLen);
- if (onlySave)
- delete [] _stor;
+ // Allocate space for the overlaps. Allocate a multiple of 8k, assumed to be the page size.
+ //
+ // If we're loading all overlaps (ns == no) we don't need to overallocate. Otherwise, we're
+ // loading only some of them and might have to make a twin later.
+ //
+ // Once allocated copy the good overlaps.
- _storLen = 0;
- _stor = new BAToverlapInt [_storMax];
- _heaps.push_back(_stor);
+ if (ns > 0) {
+ uint32 id = _ovs[0].a_iid;
- _memUsed += _storMax * sizeof(BAToverlapInt);
- }
+ _overlapMax[id] = (ns == no) ? (ns) : ((((sizeof(BAToverlap) * ns / 8192) + 1) * 8192) / sizeof(BAToverlap));
+ _overlapLen[id] = ns;
+ _overlaps[id] = new BAToverlap [ _overlapMax[id] ];
- // Save a pointer to the start of the overlaps for this fragment, and the number of overlaps
- // that exist.
- _cachePtr[_ovs[0].a_iid] = _stor + _storLen;
- _cacheLen[_ovs[0].a_iid] = ns;
+ _memUsed += _overlapMax[id] * sizeof(BAToverlap);
- numLoaded += ns;
+ uint32 oo=0;
- uint32 storEnd = _storLen + ns;
+ for (uint32 ii=0; ii<no; ii++) {
+ if (_ovsSco[ii] == 0)
+ continue;
- // Finally, append the overlaps to the storage.
- for (uint32 ii=0; ii<no; ii++) {
- if (_ovsSco[ii] == 0)
- continue;
+ _overlaps[id][oo].evalue = _ovs[ii].evalue();
+ _overlaps[id][oo].a_hang = _ovs[ii].a_hang();
+ _overlaps[id][oo].b_hang = _ovs[ii].b_hang();
+ _overlaps[id][oo].flipped = _ovs[ii].flipped();
+ _overlaps[id][oo].filtered = false;
+ _overlaps[id][oo].symmetric = false;
+ _overlaps[id][oo].a_iid = _ovs[ii].a_iid;
+ _overlaps[id][oo].b_iid = _ovs[ii].b_iid;
+
+ assert(_overlaps[id][oo].a_iid != 0);
+ assert(_overlaps[id][oo].b_iid != 0);
- _stor[_storLen].evalue = _ovs[ii].evalue();
- _stor[_storLen].a_hang = _ovs[ii].a_hang();
- _stor[_storLen].b_hang = _ovs[ii].b_hang();
- _stor[_storLen].flipped = _ovs[ii].flipped();
- _stor[_storLen].b_iid = _ovs[ii].b_iid;
+ oo++;
+ }
- _storLen++;
+ assert(oo == _overlapLen[id]);
}
- assert(storEnd == _storLen);
+ // Keep track of what we loaded and didn't.
- if ((numFrags++ % 1000000) == 0)
- writeLog("OverlapCache()-- Loading overlap information: overlaps processed %12"F_U64P" (%06.2f%%) loaded %12"F_U64P" (%06.2f%%) (at read iid %d)\n",
- numTotal, 100.0 * numTotal / numStore,
- numLoaded, 100.0 * numLoaded / numStore,
- _ovs[0].a_iid);
- }
+ numTotal += no + nd; // Because no was decremented by nd in filterDuplicates()
+ numLoaded += ns;
+ numDups += nd;
- if ((ovlDat) && (_storLen > 0))
- AS_UTL_safeWrite(ovlDat, _stor, "_stor", sizeof(BAToverlapInt), _storLen);
- if (onlySave)
- delete [] _stor;
+ if ((numReads++ % 100000) == 0)
+ writeStatus("OverlapCache()-- Loading: overlaps processed %12" F_U64P " (%06.2f%%) loaded %12" F_U64P " (%06.2f%%) droppeddupe %12" F_U64P " (%06.2f%%)\n",
+ numTotal, 100.0 * numTotal / numStore,
+ numLoaded, 100.0 * numLoaded / numStore,
+ numDups, 100.0 * numDups / numStore);
+ }
- if (ovlDat)
- fclose(ovlDat);
+ writeStatus("OverlapCache()-- Loading: overlaps processed %12" F_U64P " (%06.2f%%) loaded %12" F_U64P " (%06.2f%%) droppeddupe %12" F_U64P " (%06.2f%%)\n",
+ numTotal, 100.0 * numTotal / numStore,
+ numLoaded, 100.0 * numLoaded / numStore,
+ numDups, 100.0 * numDups / numStore);
- writeLog("OverlapCache()-- Loading overlap information: overlaps processed %12"F_U64P" (%06.2f%%) loaded %12"F_U64P" (%06.2f%%)\n",
- numTotal, 100.0 * numTotal / numStore,
- numLoaded, 100.0 * numLoaded / numStore);
+ if (doSave == true)
+ save();
}
+bool
+searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
-BAToverlap *
-OverlapCache::getOverlaps(uint32 fragIID, double maxErate, uint32 &numOverlaps) {
- uint32 tid = omp_get_thread_num();
-
- while (_thread[tid]._batMax <= _cacheLen[fragIID]) {
- _thread[tid]._batMax *= 2;
- delete [] _thread[tid]._bat;
- _thread[tid]._bat = new BAToverlap [_thread[tid]._batMax];
- }
-
- BAToverlapInt *ptr = _cachePtr[fragIID];
- uint32 maxEvalue = AS_OVS_encodeEvalue(maxErate);
-
- numOverlaps = 0;
+#ifdef TEST_LINEAR_SEARCH
+ bool linearSearchFound = false;
- for (uint32 pos=0; pos < _cacheLen[fragIID]; pos++) {
- if (ptr[pos].evalue > maxEvalue)
- continue;
+ for (uint32 ss=0; ss<ovlLen; ss++)
+ if (ovl[ss].b_iid == bID) {
+ linearSearchFound = true;
+ break;
+ }
+#endif
- _thread[tid]._bat[numOverlaps].a_hang = ptr[pos].a_hang;
- _thread[tid]._bat[numOverlaps].b_hang = ptr[pos].b_hang;
+ // If not, these are repeats and we should binary search everything.
+ // There will be no short lists where we could exhaustively search.
- _thread[tid]._bat[numOverlaps].flipped = ptr[pos].flipped;
+ int32 F = 0;
+ int32 L = ovlLen - 1;
+ int32 M = 0;
- _thread[tid]._bat[numOverlaps].evalue = ptr[pos].evalue;
- _thread[tid]._bat[numOverlaps].erate = AS_OVS_decodeEvalue(ptr[pos].evalue);
+ while (F <= L) {
+ M = (F + L) / 2;
- _thread[tid]._bat[numOverlaps].a_iid = fragIID;
- _thread[tid]._bat[numOverlaps].b_iid = ptr[pos].b_iid;
+ if (ovl[M].b_iid == bID) {
+ ovl[M].symmetric = true;
+#ifdef TEST_LINEAR_SEARCH
+ assert(linearSearchFound == true);
+#endif
+ return(true);
+ }
- numOverlaps++;
+ if (ovl[M].b_iid < bID)
+ F = M+1;
+ else
+ L = M-1;
}
- return(_thread[tid]._bat);
+#ifdef TEST_LINEAR_SEARCH
+ assert(linearSearchFound == false);
+#endif
+
+ return(false);
}
void
-OverlapCache::removeWeakOverlaps(uint32 *minEvalue5p,
- uint32 *minEvalue3p) {
+OverlapCache::symmetrizeOverlaps(void) {
- uint32 fiLimit = FI->numFragments();
+ if (_checkSymmetry == false)
+ return;
- uint64 saved = 0;
- uint64 ignored = 0;
- uint64 removed = 0;
+ uint32 *nonsymPerRead = new uint32 [RI->numReads() + 1]; // Overlap in this read is missing its twin
- for (uint32 fi=1; fi <= fiLimit; fi++) {
- uint32 numOverlaps = _cacheLen[fi];
- BAToverlapInt *ptr = _cachePtr[fi];
+ // For each overlap, see if the twin overlap exists. It is tempting to skip searching if the
+ // b-read has loaded all overlaps (the overlap we're searching for must exist) but we can't.
+ // We must still mark the overlap as being symmetric.
- for (uint32 pos=0; pos < numOverlaps; pos++) {
- uint32 aiid = fi;
- uint32 biid = ptr[pos].b_iid;
- uint32 evalue = ptr[pos].evalue;
+ writeStatus("OverlapCache()-- Symmetrizing overlaps -- finding missing twins.\n");
- // Ignore contained overlaps.
+#pragma omp parallel for schedule(dynamic, RI->numReads() / 1000)
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ nonsymPerRead[rr] = 0;
- if (((ptr[pos].a_hang <= 0) && (ptr[pos].b_hang >= 0)) ||
- ((ptr[pos].a_hang >= 0) && (ptr[pos].b_hang <= 0))) {
- ignored++;
- continue;
- }
+ if ((rr % 100) == 0)
+ fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
- // Decide which end we need to be looking at.
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
+ uint32 rb = _overlaps[rr][oo].b_iid;
- uint32 ta = 0;
- uint32 tb = 0;
-
- if (ptr[pos].a_hang > 0)
- ta = minEvalue3p[aiid];
- else
- ta = minEvalue5p[aiid];
+ if (_overlaps[rr][oo].symmetric == true) // If already marked, we're done.
+ continue;
- if (ptr[pos].flipped == false) {
- if (ptr[pos].b_hang > 0)
- tb = minEvalue5p[biid];
- else
- tb = minEvalue3p[biid];
+ // Search for the twin overlap, and if found, we're done. The twin is marked as symmetric in the function.
- } else {
- if (ptr[pos].b_hang > 0)
- tb = minEvalue3p[biid];
- else
- tb = minEvalue5p[biid];
+ if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr)) {
+ _overlaps[rr][oo].symmetric = true;
+ continue;
}
- // If the erate is more than the threshold, 'remove' the overlap by maxing out the erate.
+ // Didn't find a twin. Count how many overlaps we need to create duplicates of.
- if ((evalue > ta) ||
- (evalue > tb)) {
- //fprintf(stdout, "OverlapCache::removeWeakOverlaps()-- remove %7d %7d at %.3f\n", aiid, biid, AS_OVS_decodeEvalue(evalue));
- removed++;
- ptr[pos].evalue = AS_MAX_EVALUE;
- } else {
- saved++;
- }
+ nonsymPerRead[rr]++;
}
}
- writeLog("OverlapCache::removeWeakOverlaps()-- removed "F_U64" weak overlaps.\n", removed);
- writeLog("OverlapCache::removeWeakOverlaps()-- ignored "F_U64" contained overlaps.\n", ignored);
- writeLog("OverlapCache::removeWeakOverlaps()-- retained "F_U64" strong overlaps.\n", saved);
-}
+ uint64 nOverlaps = 0;
+ uint64 nOnly = 0;
+ uint64 nCritical = 0;
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ nOverlaps += _overlapLen[rr];
+ nOnly += nonsymPerRead[rr];
+ if (_overlapLen[rr] <= _minPer)
+ nCritical += nonsymPerRead[rr];
+ }
+ writeStatus("OverlapCache()-- -- found %llu missing twins in %llu overlaps, %llu are strong.\n", nOnly, nOverlaps, nCritical);
-#if 0
-double
-OverlapCache::findErate(uint32 aIID, uint32 bIID) {
+ // Score all the overlaps (again) and drop the lower quality ones. We need to drop half of the
+ // non-twin overlaps, but also want to retain some minimum number.
- for (uint32 pos=0; pos < _cacheLen[aIID]; pos++)
- if (_cachePtr[aIID][pos].b_iid == bIID)
- return(AS_OVS_decodeEvalue(_cachePtr[aIID][pos].evalue));
+ // But, there are a bunch of overlaps that fall below our score threshold that are symmetric. We
+ // need to keep these, only because figuring out which ones are 'saved' above will be a total
+ // pain in the ass.
- for (uint32 pos=0; pos < _cacheLen[bIID]; pos++)
- if (_cachePtr[bIID][pos].b_iid == aIID)
- return(AS_OVS_decodeEvalue(_cachePtr[bIID][pos].evalue));
+ double fractionToDrop = 0.6;
- return(1.0);
-}
-#endif
+ uint64 nDropped = 0;
+#warning this should be parallelized
+ writeStatus("OverlapCache()-- Symmetrizing overlaps -- dropping weak non-twin overlaps.\n");
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ if (_overlapLen[rr] <= _minPer)
+ continue;
+ if ((rr % 100) == 0)
+ fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
+ _ovsSco[oo] = RI->overlapLength( _overlaps[rr][oo].a_iid, _overlaps[rr][oo].b_iid, _overlaps[rr][oo].a_hang, _overlaps[rr][oo].b_hang);
+ _ovsSco[oo] <<= AS_MAX_EVALUE_BITS;
+ _ovsSco[oo] |= (~_ovs[oo].evalue()) & ERR_MASK;
+ _ovsSco[oo] <<= SALT_BITS;
+ _ovsSco[oo] |= oo & SALT_MASK;
-bool
-OverlapCache::load(const char *prefix, double erate) {
- char name[FILENAME_MAX];
- FILE *file;
- size_t numRead;
+ _ovsTmp[oo] = _ovsSco[oo];
+ }
- sprintf(name, "%s.ovlCache", prefix);
- if (AS_UTL_fileExists(name, FALSE, FALSE) == false)
- return(false);
+ sort(_ovsTmp, _ovsTmp + _overlapLen[rr]);
- fprintf(stderr, "OverlapCache()-- Loading graph from '%s'.\n", name);
+ uint32 minIdx = (uint32)floor(nonsymPerRead[rr] * fractionToDrop);
- errno = 0;
+ if (minIdx < _minPer)
+ minIdx = _minPer;
- file = fopen(name, "r");
- if (errno)
- fprintf(stderr, "OverlapCache()-- Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1);
+ uint64 minScore = _ovsTmp[minIdx];
- uint64 magic = ovlCacheMagic;
- uint32 ovserrbits = AS_MAX_EVALUE_BITS;
- uint32 ovshngbits = AS_MAX_READLEN_BITS + 1;
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
+ if ((_ovsSco[oo] < minScore) && (_overlaps[rr][oo].symmetric == false)) {
+ nDropped++;
+ _overlapLen[rr]--;
+ _overlaps[rr][oo] = _overlaps[rr][_overlapLen[rr]];
+ _ovsSco [oo] = _ovsSco [_overlapLen[rr]];
+ oo--;
+ }
+ }
- AS_UTL_safeRead(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
- AS_UTL_safeRead(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++)
+ if (_overlaps[rr][oo].symmetric == false)
+ assert(minScore <= _ovsSco[oo]);
+ }
- if (magic != ovlCacheMagic)
- fprintf(stderr, "OverlapCache()-- ERROR: File '%s' isn't a bogart ovlCache.\n", name), exit(1);
+ delete [] nonsymPerRead;
+ nonsymPerRead = NULL;
- AS_UTL_safeRead(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
- AS_UTL_safeRead(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
+ writeStatus("OverlapCache()-- -- dropped %llu overlaps.\n", nDropped);
- uint32 unused; // Former _batMax, left in for compatibility with old caches.
+ // Finally, run through all the saved overlaps and count how many we need to add to each read.
- AS_UTL_safeRead(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
- AS_UTL_safeRead(file, &unused, "overlapCache_batMax", sizeof(uint32), 1);
+ uint32 *toAddPerRead = new uint32 [RI->numReads() + 1]; // Overlap needs to be added to this read
- _threadMax = omp_get_max_threads();
- _thread = new OverlapCacheThreadData [_threadMax];
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ toAddPerRead[rr] = 0;
- _cachePtr = new BAToverlapInt * [FI->numFragments() + 1];
- _cacheLen = new uint32 [FI->numFragments() + 1];
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++)
+ if (_overlaps[rr][oo].symmetric == false)
+ toAddPerRead[_overlaps[rr][oo].b_iid]++;
+ }
- numRead = AS_UTL_safeRead(file, _cacheLen, "overlapCache_cacheLen", sizeof(uint32), FI->numFragments() + 1);
+ uint64 nToAdd = 0;
- if (numRead != FI->numFragments() + 1)
- fprintf(stderr, "OverlapCache()-- Short read loading graph '%s'. Fail.\n", name), exit(1);
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ nToAdd += toAddPerRead[rr];
- _ovlStoreUniq = NULL;
- _ovlStoreRept = NULL;
+ writeStatus("OverlapCache()-- Symmetrizing overlaps -- adding %llu missing twin overlaps.\n", nToAdd);
- fclose(file);
+ // Expand or shrink space for the overlaps.
- // Memory map the overlaps
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ if (_overlapLen[rr] + toAddPerRead[rr] > _overlapMax[rr])
+ resizeArray(_overlaps[rr], _overlapLen[rr], _overlapMax[rr], _overlapLen[rr] + toAddPerRead[rr] + 2048);
- sprintf(name, "%s.ovlCacheDat", prefix);
+ // Copy non-twin overlaps to their twin.
- _cacheMMF = new memoryMappedFile(name);
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ if ((rr % 100) == 0)
+ fprintf(stderr, " %6.3f%%\r", 100.0 * rr / RI->numReads());
- _stor = (BAToverlapInt *)_cacheMMF->get(0);
+ for (uint32 oo=0; oo<_overlapLen[rr]; oo++) {
+ if (_overlaps[rr][oo].symmetric == true)
+ continue;
- // Update pointers into the overlaps
+ uint32 rb = _overlaps[rr][oo].b_iid;
+ uint32 nn = _overlapLen[rb]++;
- _cachePtr[0] = _stor;
- for (uint32 fi=1; fi<FI->numFragments() + 1; fi++)
- _cachePtr[fi] = _cachePtr[fi-1] + _cacheLen[fi-1];
+ _overlaps[rb][nn].evalue = _overlaps[rr][oo].evalue;
+ _overlaps[rb][nn].a_hang = (_overlaps[rr][oo].flipped) ? (_overlaps[rr][oo].b_hang) : (-_overlaps[rr][oo].a_hang);
+ _overlaps[rb][nn].b_hang = (_overlaps[rr][oo].flipped) ? (_overlaps[rr][oo].a_hang) : (-_overlaps[rr][oo].b_hang);
+ _overlaps[rb][nn].flipped = _overlaps[rr][oo].flipped;
- bool doCleaning = false;
- uint64 nOvl = 0;
+ _overlaps[rb][nn].filtered = _overlaps[rr][oo].filtered;
+ _overlaps[rb][nn].symmetric = _overlaps[rr][oo].symmetric = true;
- for (uint32 fi=1; fi<FI->numFragments() + 1; fi++) {
- nOvl += _cacheLen[fi];
+ _overlaps[rb][nn].a_iid = _overlaps[rr][oo].b_iid;
+ _overlaps[rb][nn].b_iid = _overlaps[rr][oo].a_iid;
- if ((FI->fragmentLength(fi) == 0) &&
- (_cacheLen[fi] > 0))
- doCleaning = true;
+ assert(_overlapLen[rb] <= _overlapMax[rb]);
- if (_cacheLen[fi] == 0)
- _cachePtr[fi] = NULL;
+ assert(toAddPerRead[rb] > 0);
+ toAddPerRead[rb]--;
+ }
}
- // For each fragment, remove any overlaps to deleted fragments.
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ assert(toAddPerRead[rr] == 0);
- writeLog("OverlapCache()-- Loaded "F_U64" overlaps.\n", nOvl);
+ delete [] toAddPerRead;
+ toAddPerRead = NULL;
- if (doCleaning) {
- uint64 nDel = 0;
- uint64 nMod = 0;
- uint64 nOvl = 0;
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++)
+ if (_overlaps[rr] != NULL) {
+ assert(_overlaps[rr][0 ].a_iid == rr);
+ assert(_overlaps[rr][_overlapLen[rr]-1].a_iid == rr);
+ }
- writeLog("OverlapCache()-- Freshly deleted fragments detected. Cleaning overlaps.\n");
+ // Probably should sort again. Not sure if anything depends on this.
- char N[FILENAME_MAX];
+ for (uint32 rr=0; rr<RI->numReads()+1; rr++) {
+ }
- sprintf(N, "%s.overlapsRemoved.log", prefix);
+ writeStatus("OverlapCache()-- Symmetrizing overlaps -- finished.\n");
+}
- errno = 0;
- FILE *F = fopen(N, "w");
- if (errno)
- fprintf(stderr, "OverlapCache()-- Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);
- for (uint32 fi=1; fi<FI->numFragments() + 1; fi++) {
- if ((FI->fragmentLength(fi) == 0) &&
- (_cacheLen[fi] > 0)) {
- nDel++;
- fprintf(F, "Removing "F_U32" overlaps from deleted deleted fragment "F_U32"\n", _cacheLen[fi], fi);
- _cachePtr[fi] = NULL;
- _cacheLen[fi] = 0;
- }
- uint32 on = 0;
+bool
+OverlapCache::load(void) {
+ char name[FILENAME_MAX];
+ FILE *file;
+ size_t numRead;
+
+ snprintf(name, FILENAME_MAX, "%s.ovlCache", _prefix);
+ if (AS_UTL_fileExists(name, FALSE, FALSE) == false)
+ return(false);
- for (uint32 oi=0; oi<_cacheLen[fi]; oi++) {
- uint32 iid = _cachePtr[fi][oi].b_iid;
- bool del = (FI->fragmentLength(iid) == 0);
+ writeStatus("OverlapCache()-- Loading graph from '%s'.\n", name);
- if ((del == false) &&
- (on < oi))
- _cachePtr[fi][on] = _cachePtr[fi][oi];
+ errno = 0;
- if (del == false)
- on++;
- }
+ file = fopen(name, "r");
+ if (errno)
+ writeStatus("OverlapCache()-- Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1);
- if (_cacheLen[fi] != on) {
- nMod++;
- nOvl += _cacheLen[fi] - on;
- fprintf(F, "Removing "F_U32" overlaps from living fragment "F_U32"\n", _cacheLen[fi] - on, fi);
- memset(_cachePtr[fi] + on, 0xff, (_cacheLen[fi] - on) * (sizeof(BAToverlapInt)));
- }
+ uint64 magic = ovlCacheMagic;
+ uint32 ovserrbits = AS_MAX_EVALUE_BITS;
+ uint32 ovshngbits = AS_MAX_READLEN_BITS + 1;
- _cacheLen[fi] = on;
- }
+ AS_UTL_safeRead(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
+ AS_UTL_safeRead(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
+
+ if (magic != ovlCacheMagic)
+ writeStatus("OverlapCache()-- ERROR: File '%s' isn't a bogart ovlCache.\n", name), exit(1);
+
+ AS_UTL_safeRead(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
+ AS_UTL_safeRead(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
+
+ _threadMax = omp_get_max_threads();
+ _thread = new OverlapCacheThreadData [_threadMax];
- fclose(F);
+ _overlaps = new BAToverlap * [RI->numReads() + 1];
+ _overlapLen = new uint32 [RI->numReads() + 1];
+ _overlapMax = new uint32 [RI->numReads() + 1];
- fprintf(stderr, "OverlapCache()-- Removed all overlaps from "F_U64" deleted fragments. Removed "F_U64" overlaps from "F_U64" alive fragments.\n",
- nDel, nOvl, nMod);
+ AS_UTL_safeRead(file, _overlapLen, "overlapCache_len", sizeof(uint32), RI->numReads() + 1);
+ AS_UTL_safeRead(file, _overlapMax, "overlapCache_max", sizeof(uint32), RI->numReads() + 1);
+
+ for (uint32 rr=0; rr<RI->numReads() + 1; rr++) {
+ if (_overlapLen[rr] == 0)
+ continue;
+
+ _overlaps[rr] = new BAToverlap [ _overlapMax[rr] ];
+ memset(_overlaps[rr], 0xff, sizeof(BAToverlap) * _overlapMax[rr]);
+
+ AS_UTL_safeRead(file, _overlaps[rr], "overlapCache_ovl", sizeof(BAToverlap), _overlapLen[rr]);
+
+ assert(_overlaps[rr][0].a_iid == rr);
}
+ fclose(file);
+
return(true);
}
+
void
-OverlapCache::save(const char *prefix, double erate) {
+OverlapCache::save(void) {
char name[FILENAME_MAX];
FILE *file;
- sprintf(name, "%s.ovlCache", prefix);
+ snprintf(name, FILENAME_MAX, "%s.ovlCache", _prefix);
- fprintf(stderr, "OverlapCache()-- Saving graph to '%s'.\n", name);
+ writeStatus("OverlapCache()-- Saving graph to '%s'.\n", name);
errno = 0;
file = fopen(name, "w");
if (errno)
- fprintf(stderr, "OverlapCache()-- Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+ writeStatus("OverlapCache()-- Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
uint64 magic = ovlCacheMagic;
uint32 ovserrbits = AS_MAX_EVALUE_BITS;
uint32 ovshngbits = AS_MAX_READLEN_BITS + 1;
- AS_UTL_safeWrite(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
+ AS_UTL_safeWrite(file, &magic, "overlapCache_magic", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &ovserrbits, "overlapCache_ovserrbits", sizeof(uint32), 1);
+ AS_UTL_safeWrite(file, &ovshngbits, "overlapCache_ovshngbits", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
- AS_UTL_safeWrite(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memLimit, "overlapCache_memLimit", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_memUsed, "overlapCache_memUsed", sizeof(uint64), 1);
+ AS_UTL_safeWrite(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &_maxPer, "overlapCache_maxPer", sizeof(uint32), 1);
- AS_UTL_safeWrite(file, &_maxPer, "overlapCache_batMax", sizeof(uint32), 1); // COMPATIBILITY, REMOVE
+ AS_UTL_safeWrite(file, _overlapLen, "overlapCache_len", sizeof(uint32), RI->numReads() + 1);
+ AS_UTL_safeWrite(file, _overlapMax, "overlapCache_max", sizeof(uint32), RI->numReads() + 1);
- AS_UTL_safeWrite(file, _cacheLen, "overlapCache_cacheLen", sizeof(uint32), FI->numFragments() + 1);
+ for (uint32 rr=0; rr<RI->numReads() + 1; rr++)
+ AS_UTL_safeWrite(file, _overlaps[rr], "overlapCache_ovl", sizeof(BAToverlap), _overlapLen[rr]);
fclose(file);
}
+
diff --git a/src/bogart/AS_BAT_OverlapCache.H b/src/bogart/AS_BAT_OverlapCache.H
index 814c2f4..7353b57 100644
--- a/src/bogart/AS_BAT_OverlapCache.H
+++ b/src/bogart/AS_BAT_OverlapCache.H
@@ -47,7 +47,6 @@
#include "gkStore.H"
#include "memoryMappedFile.H"
-
// CA8 used to re-encode the error rate into a smaller-precision number. This was
// confusing and broken (it tried to use a log-based encoding to give more precision
// to the smaller values). CA3g gives up and uses all 12 bits of precision.
@@ -56,37 +55,24 @@
// storage.
// For storing overlaps in memory. 12 bytes per overlap.
-struct BAToverlapInt {
- uint64 evalue :AS_MAX_EVALUE_BITS; // 12 by default (same as AS_MAX_EVALUE_BITS)
- int64 a_hang :AS_MAX_READLEN_BITS+1; // 21+1 by default
- int64 b_hang :AS_MAX_READLEN_BITS+1; // 21+1 by default
- uint64 flipped :1;
- uint32 b_iid;
-};
-
-#if (AS_MAX_EVALUE_BITS + AS_MAX_READLEN_BITS + 1 + AS_MAX_READLEN_BITS + 1 + 1 > 64)
-#error not enough bits to store overlaps. decrease AS_MAX_EVALUE_BITS or AS_MAX_READLEN_BITS.
-#endif
-
-
-// For working with overlaps, 32 bytes per overlap. This data is copied
-// from the overlap storage (from a BAToverlapInt) with the erate expanded,
-// and a_iid added.
-
class BAToverlap {
public:
BAToverlap() {
+ evalue = 0;
+ a_hang = 0;
+ b_hang = 0;
+ flipped = false;
+
+ filtered = false;
+ symmetric = false;
+
+ a_iid = 0;
+ b_iid = 0;
};
- ~BAToverlap() {
- };
+ ~BAToverlap() {};
-public:
+ // Nasty bit of code duplication.
- // Return which end of the read the overlap is on. For 'Ais', the orientation
- // of B doesn't matter; likewise for 'Bis'.
- //
- // If the overlap is a containment relationship, both Is5 and Is3 are false.
- //
bool
isDovetail(void) const {
return(((a_hang < 0) && (b_hang < 0)) ||
@@ -129,26 +115,27 @@ public:
(AEndIs3prime() && (flipped == false))); // <=== ---->
};
+ double
+ erate(void) const {
+ return(AS_OVS_decodeEvalue(evalue));
+ }
-public:
- int32 a_hang;
- int32 b_hang;
-
- uint32 flipped;
+ uint64 evalue : AS_MAX_EVALUE_BITS; // 12
+ int64 a_hang : AS_MAX_READLEN_BITS+1; // 21+1
+ int64 b_hang : AS_MAX_READLEN_BITS+1; // 21+1
+ uint64 flipped : 1; // 1
- uint32 evalue; // Encoded fraction error
- double erate; // Decoded fraction error
+ uint64 filtered : 1; // 1
+ uint64 symmetric : 1; // 1 - twin overlap exists
uint32 a_iid;
uint32 b_iid;
};
+#if (AS_MAX_EVALUE_BITS + (AS_MAX_READLEN_BITS + 1) + (AS_MAX_READLEN_BITS + 1) + 1 + 1 + 1 > 64)
+#error not enough bits to store overlaps. decrease AS_MAX_EVALUE_BITS or AS_MAX_READLEN_BITS.
+#endif
-inline
-bool
-BAToverlap_sortByErate(BAToverlap const &a, BAToverlap const &b) {
- return(a.erate > b.erate);
-}
inline
@@ -158,6 +145,7 @@ BAToverlap_sortByEvalue(BAToverlap const &a, BAToverlap const &b) {
}
+
class OverlapCacheThreadData {
public:
OverlapCacheThreadData() {
@@ -176,59 +164,67 @@ public:
class OverlapCache {
public:
- OverlapCache(ovStore *ovlStoreUniq,
+ OverlapCache(gkStore *gkp,
+ ovStore *ovlStoreUniq,
ovStore *ovlStoreRept,
const char *prefix,
double maxErate,
uint32 minOverlap,
uint64 maxMemory,
- uint32 maxOverlaps,
- bool onlysave,
+ uint64 genomeSize,
bool dosave);
~OverlapCache();
- void computeOverlapLimit(void);
+private:
+ uint32 findHighestOverlapCount(void);
+ void allocateLoadingSpace(void);
uint32 filterOverlaps(uint32 maxOVSerate, uint32 minOverlap, uint32 no);
+ uint32 filterDuplicates(uint32 &no);
- void loadOverlaps(double erate, uint32 minOverlap, const char *prefix, bool onlySave, bool doSave);
-
- BAToverlap *getOverlaps(uint32 fragIID, double maxErate, uint32 &numOverlaps);
-
- void removeWeakOverlaps(uint32 *minEvalue5p,
- uint32 *minEvalue3p);
+ void computeOverlapLimit(void);
+ void loadOverlaps(bool doSave);
+ void symmetrizeOverlaps(void);
- //double findErate(uint32 aIID, uint32 bIID);
+public:
+ BAToverlap *getOverlaps(uint32 readIID, uint32 &numOverlaps) {
+ numOverlaps = _overlapLen[readIID];
+ return(_overlaps[readIID]);
+ }
private:
- bool load(const char *prefix, double erate);
- void save(const char *prefix, double erate);
+ bool load(void);
+ void save(void);
private:
+ const char *_prefix;
+
uint64 _memLimit;
uint64 _memUsed;
- uint32 _storMax; // Size of the heap
- uint32 _storLen; // Position we are at in this heap
- BAToverlapInt *_stor; // Pointer to the heap (probably the last thing on _heaps)
-
- vector<BAToverlapInt*> _heaps;
+ BAToverlap **_overlaps;
+ uint32 *_overlapLen;
+ uint32 *_overlapMax;
- memoryMappedFile *_cacheMMF;
+ uint32 _maxEvalue; // Don't load overlaps with high error
+ uint32 _minOverlap; // Don't load overlaps that are short
- BAToverlapInt **_cachePtr; // Mapping of frag iid to overlaps stored in the heap
- uint32 *_cacheLen; // Number of overlaps per frag iid
+ uint32 _minPer; // Minimum number of overlaps to retain for a single read
+ uint32 _maxPer; // Maximum number of overlaps to load for a single read
- uint32 _maxPer; // Maximum number of overlaps to load for a single fragment
+ bool _checkSymmetry;
- uint32 _ovsMax; // For loading overlaps
- ovOverlap *_ovs; //
- uint64 *_ovsSco; // For scoring overlaps during the load
- uint64 *_ovsTmp; // For picking out a score threshold
+ uint32 _ovsMax; // For loading overlaps
+ ovOverlap *_ovs; //
+ uint64 *_ovsSco; // For scoring overlaps during the load
+ uint64 *_ovsTmp; // For picking out a score threshold
uint64 _threadMax;
OverlapCacheThreadData *_thread;
+ uint64 _genomeSize;
+
+ gkStore *_gkp;
ovStore *_ovlStoreUniq; // Pointers to input stores
ovStore *_ovlStoreRept;
};
diff --git a/src/bogart/AS_BAT_PlaceContains.C b/src/bogart/AS_BAT_PlaceContains.C
index f28b931..982ee5b 100644
--- a/src/bogart/AS_BAT_PlaceContains.C
+++ b/src/bogart/AS_BAT_PlaceContains.C
@@ -35,29 +35,29 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
#include "AS_BAT_Unitig.H"
#include "AS_BAT_PlaceContains.H"
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
-#define SHOW_PLACEMENT_DETAIL // Reports evidence (too much) for placing reads.
-#define SHOW_PLACEMENT // Reports where the read was placed.
+#undef SHOW_PLACEMENT_DETAIL // Reports evidence (too much) for placing reads.
+#undef SHOW_PLACEMENT // Reports where the read was placed.
void
-breakSingletonTigs(UnitigVector &unitigs) {
+breakSingletonTigs(TigVector &tigs) {
// For any singleton unitig, eject the read and delete the unitig. Eventually,
- // we will stop making singleton unitigs.
+ // we will stop making singleton tigs.
uint32 removed = 0;
- for (uint32 ti=1; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
+ for (uint32 ti=1; ti<tigs.size(); ti++) {
+ Unitig *utg = tigs[ti];
if (utg == NULL)
continue;
@@ -65,31 +65,30 @@ breakSingletonTigs(UnitigVector &unitigs) {
if (utg->ufpath.size() > 1)
continue;
- unitigs[ti] = NULL; // Remove the unitig from the list
- utg->removeFrag(utg->ufpath[0].ident); // Eject the read
- delete utg; // Reclaim space
- removed++; // Count
+ tigs[ti] = NULL; // Remove the tig from the list
+ tigs.registerRead(utg->ufpath[0].ident); // Eject the read
+ delete utg; // Reclaim space
+ removed++; // Count
}
- writeLog("Removed %u read%s from %u singleton unitig%s.\n",
- removed, (removed != 1) ? "" : "s",
- removed, (removed != 1) ? "" : "s");
+ writeStatus("breakSingletonTigs()-- Removed %u singleton tig%s; reads are now unplaced.\n",
+ removed, (removed == 1) ? "" : "s");
}
void
-placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
+placeUnplacedUsingAllOverlaps(TigVector &tigs,
const char *prefix) {
- uint32 fiLimit = FI->numFragments();
+ uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
- uint32 *placedTig = new uint32 [FI->numFragments() + 1];
- SeqInterval *placedPos = new SeqInterval [FI->numFragments() + 1];
+ uint32 *placedTig = new uint32 [RI->numReads() + 1];
+ SeqInterval *placedPos = new SeqInterval [RI->numReads() + 1];
- memset(placedTig, 0, sizeof(uint32) * (FI->numFragments() + 1));
- memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));
+ memset(placedTig, 0, sizeof(uint32) * (RI->numReads() + 1));
+ memset(placedPos, 0, sizeof(SeqInterval) * (RI->numReads() + 1));
// Just some logging. Count the number of reads we try to place.
@@ -100,37 +99,38 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
uint32 nFailedContained = 0;
uint32 nFailed = 0;
- for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
- if (Unitig::fragIn(fid) == 0)
+ for (uint32 fid=1; fid<RI->numReads()+1; fid++)
+ if (tigs.inUnitig(fid) == 0) // I'm NOT ambiguous!
if (OG->isContained(fid))
nToPlaceContained++;
else
nToPlace++;
- writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
- nToPlaceContained, nToPlace, numThreads);
+ writeStatus("\n");
+ writeStatus("placeContains()-- placing %u contained and %u unplaced reads, with %d thread%s.\n",
+ nToPlaceContained, nToPlace, numThreads, (numThreads == 1) ? "" : "s");
// Do the placing!
#pragma omp parallel for schedule(dynamic, blockSize)
- for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
+ for (uint32 fid=1; fid<RI->numReads()+1; fid++) {
bool enableLog = true;
- if (Unitig::fragIn(fid) > 0)
+ if (tigs.inUnitig(fid) > 0)
continue;
// Place the read.
vector<overlapPlacement> placements;
- placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);
+ placeReadUsingOverlaps(tigs, NULL, fid, placements, placeRead_fullMatch);
// Search the placements for the highest expected identity placement using all overlaps in the unitig.
uint32 b = UINT32_MAX;
for (uint32 i=0; i<placements.size(); i++) {
- Unitig *tig = unitigs[placements[i].tigID];
+ Unitig *tig = tigs[placements[i].tigID];
if (placements[i].fCoverage < 0.99) // Ignore partially placed reads.
continue;
@@ -138,20 +138,20 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
if (tig->ufpath.size() == 1) // Ignore placements in singletons.
continue;
- uint32 bgn = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
- uint32 end = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;
+ uint32 bgn = placements[i].position.min();
+ uint32 end = placements[i].position.max();
double erate = placements[i].errors / placements[i].aligned;
if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
- writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
+ writeLog("read %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
continue;
}
if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
- writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
+ writeLog("read %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
if ((b == UINT32_MAX) ||
@@ -164,15 +164,15 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
if (b == UINT32_MAX) {
if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
- writeLog("frag %8u remains unplaced\n", fid);
+ writeLog("read %8u remains unplaced\n", fid);
placedPos[fid].bgn = 0;
- placedPos[fid].end = FI->fragmentLength(fid);
+ placedPos[fid].end = RI->readLength(fid);
}
else {
if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
- writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
- fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
+ writeLog("read %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
+ fid, placements[b].tigID, tigs[placements[b].tigID]->ufpath.size(),
placements[b].position.bgn, placements[b].position.end,
placements[b].fCoverage,
placements[b].errors / placements[b].aligned);
@@ -183,25 +183,21 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
// All reads placed, now just dump them in their correct tigs.
- for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
+ for (uint32 fid=1; fid<RI->numReads()+1; fid++) {
Unitig *tig = NULL;
ufNode frg;
- if (Unitig::fragIn(fid) > 0)
+ if (tigs.inUnitig(fid) > 0) // Already placed, just skip it.
continue;
- // If not placed, dump it in a new unitig. Well, not anymore. These reads were not placed in
- // any tig initially, were not allowed to seed a tig, and now, could find no place to go.
- // They're garbage. Plus, it screws up the logging above because we don't know the new tig ID
- // until now.
+ // If not placed, it's garbage. These reads were not placed in any tig initially, were not
+ // allowed to seed a tig, and now, could find no place to go. They're garbage.
if (placedTig[fid] == 0) {
if (OG->isContained(fid))
nFailedContained++;
else
nFailed++;
-
- //tig = unitigs.newUnitig(false);
}
// Otherwise, it was placed somewhere, grab the tig.
@@ -212,7 +208,7 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
else
nPlaced++;
- tig = unitigs[placedTig[fid]];
+ tig = tigs[placedTig[fid]];
}
// Regardless, add it to the tig. Logging for this is above.
@@ -225,8 +221,15 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
frg.bhang = 0;
frg.position = placedPos[fid];
- tig->addFrag(frg, 0, false);
+ tig->addRead(frg, 0, false);
}
+
+ // Update status.
+
+ if (tig)
+ RI->setUnplaced(fid);
+ else
+ RI->setLeftover(fid);
}
// Cleanup.
@@ -234,15 +237,15 @@ placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
delete [] placedPos;
delete [] placedTig;
- writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
- writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);
+ writeStatus("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
+ writeStatus("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);
// But wait! All the tigs need to be sorted. Well, not really _all_, but the hard ones to sort
// are big, and those quite likely had reads added to them, so it's really not worth the effort
// of tracking which ones need sorting, since the ones that don't need it are trivial to sort.
- for (uint32 ti=1; ti<unitigs.size(); ti++) {
- Unitig *utg = unitigs[ti];
+ for (uint32 ti=1; ti<tigs.size(); ti++) {
+ Unitig *utg = tigs[ti];
if (utg)
utg->sort();
diff --git a/src/bogart/AS_BAT_PlaceContains.H b/src/bogart/AS_BAT_PlaceContains.H
index c6827f1..c6964ad 100644
--- a/src/bogart/AS_BAT_PlaceContains.H
+++ b/src/bogart/AS_BAT_PlaceContains.H
@@ -38,12 +38,12 @@
#ifndef INCLUDE_AS_BAT_PLACECONTAINS
#define INCLUDE_AS_BAT_PLACECONTAINS
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
-void breakSingletonTigs(UnitigVector &unitigs);
+void breakSingletonTigs(TigVector &tigs);
-void placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
- const char *prefix);
+void placeUnplacedUsingAllOverlaps(TigVector &tigs,
+ const char *prefix);
diff --git a/src/bogart/AS_BAT_PlaceFragUsingOverlaps.C b/src/bogart/AS_BAT_PlaceFragUsingOverlaps.C
deleted file mode 100644
index 118a1da..0000000
--- a/src/bogart/AS_BAT_PlaceFragUsingOverlaps.C
+++ /dev/null
@@ -1,600 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * This file is derived from:
- *
- * src/AS_BAT/AS_BAT_PlaceFragUsingOverlaps.C
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2010-NOV-23 to 2013-SEP-08
- * are Copyright 2010-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2014-OCT-09 to 2015-AUG-05
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_BAT_FragmentInfo.H"
-#include "AS_BAT_BestOverlapGraph.H"
-#include "AS_BAT_Logging.H"
-
-#include "AS_BAT_Unitig.H"
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
-
-#include "intervalList.H"
-
-// Report LOTS of details on placement, including evidence.
-#undef VERBOSE_PLACEMENT
-
-
-
-
-
-bool
-placeFragUsingOverlaps(UnitigVector &unitigs,
- double erate,
- Unitig *target,
- uint32 fid,
- vector<overlapPlacement> &placements) {
-
- //logFileFlags |= LOG_PLACE_FRAG;
-
- if (logFileFlagSet(LOG_PLACE_FRAG))
- writeLog("placeFragUsingOverlaps()-- begin for frag %d into target tig %d\n", fid, target->id());
-
- assert(fid > 0);
- assert(fid <= FI->numFragments());
-
- ufNode frag;
-
- frag.ident = fid;
- frag.contained = 0;
- frag.parent = 0;
- frag.ahang = 0;
- frag.bhang = 0;
- frag.position.bgn = 0;
- frag.position.end = 0;
-
- placements.clear();
-
- uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(frag.ident, erate, ovlLen);
-
- overlapPlacement *ovlPlace = new overlapPlacement[ovlLen];
- uint32 nFragmentsNotPlaced = 0;
-
- // Initialize placements to nowhere.
-
- for (uint32 i=0; i<ovlLen; i++)
- ovlPlace[i] = overlapPlacement();
-
- // Compute placements. Anything that doesn't get placed is left as 'nowhere', specifically, in unitig 0 (which doesn't exist).
-
- for (uint32 i=0; i<ovlLen; i++) {
- int32 tigID = Unitig::fragIn(ovl[i].b_iid);
- Unitig *tig = unitigs[tigID];
-
- assert(ovl[i].a_iid == frag.ident);
-
- if (tigID == 0)
- // Fragment not in a unitig yet -- possibly this is a contained fragment that we haven't
- // placed yet, or have temporarily removed it from a unitig.
- continue;
-
- if ((target != NULL) && (target != tig))
- // Requested placement in a specific unitig, and this isn't it.
- continue;
-
- // Place the fragment relative to the other fragment.
-
- BestEdgeOverlap edge(ovl[i]);
- ufNode frag;
-
- if (tig->placeFrag(frag,
- fid,
- ovl[i].AEndIs3prime(),
- &edge) == false) {
- nFragmentsNotPlaced++;
- continue;
- }
-
- uint32 olen = FI->overlapLength(ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang);
- uint32 flen = FI->fragmentLength(ovl[i].a_iid);
-
- ovlPlace[i].frgID = frag.ident;
- ovlPlace[i].refID = ovl[i].b_iid;
- ovlPlace[i].tigID = tig->id();
- ovlPlace[i].position = frag.position;
- ovlPlace[i].errors = olen * ovl[i].erate;
- ovlPlace[i].covered.bgn = (ovl[i].a_hang < 0) ? 0 : ovl[i].a_hang;
- ovlPlace[i].covered.end = (ovl[i].b_hang > 0) ? flen : ovl[i].b_hang + flen;
- ovlPlace[i].aligned = ovlPlace[i].covered.end - ovlPlace[i].covered.bgn;
-
- assert(ovlPlace[i].covered.bgn < ovlPlace[i].covered.end);
-
- // Compute the portion of the unitig that is actually verified by
- // the overlap.
-
- if (ovlPlace[i].position.bgn < ovlPlace[i].position.end) {
- ovlPlace[i].verified.bgn = ovlPlace[i].position.bgn + ovlPlace[i].covered.bgn;
- ovlPlace[i].verified.end = ovlPlace[i].position.bgn + ovlPlace[i].covered.end;
-
- if (ovlPlace[i].verified.end > ovlPlace[i].position.end)
- ovlPlace[i].verified.end = ovlPlace[i].position.end;
-
- assert(ovlPlace[i].verified.bgn >= ovlPlace[i].position.bgn);
- assert(ovlPlace[i].verified.end <= ovlPlace[i].position.end);
- assert(ovlPlace[i].verified.bgn < ovlPlace[i].verified.end);
- } else {
- ovlPlace[i].verified.bgn = ovlPlace[i].position.bgn - ovlPlace[i].covered.bgn; // pos.bgn is the larger and cov.bgn the smaller, so ver.bgn is the larger
- ovlPlace[i].verified.end = ovlPlace[i].position.bgn - ovlPlace[i].covered.end; // pos.bgn is the larger and cov.bgn the larger, so ver.end the smaller
-
- if (ovlPlace[i].verified.end < ovlPlace[i].position.end)
- ovlPlace[i].verified.end = ovlPlace[i].position.end;
-
- assert(ovlPlace[i].verified.end >= ovlPlace[i].position.end);
- assert(ovlPlace[i].verified.bgn <= ovlPlace[i].position.bgn);
- assert(ovlPlace[i].verified.end < ovlPlace[i].verified.bgn);
- }
-
- // Disallow any placements that exceed the boundary of the unitig. These cannot be confirmed
- // by overlaps and might be wrong. Sample cases:
- // o sticking a unique/repeat fragment onto a repeat (leaving the unique uncovered)
- // o sticking a chimeric fragment onto the end of a unitig (leaving the chimeric join uncovered)
-
- if ((MIN(ovlPlace[i].position.bgn, ovlPlace[i].position.end) < 0) ||
- (MAX(ovlPlace[i].position.bgn, ovlPlace[i].position.end) > tig->getLength())) {
-#ifdef VERBOSE_PLACEMENT
- //if (logFileFlagSet(LOG_PLACE_FRAG))
- writeLog("placeFragUsingOverlaps()-- frag %d in unitig %d at %d,%d (verified %d,%d) from overlap ident %d %d hang %d %d flipped %d covered %d,%d DISALLOWED\n",
- frag.ident, tig->id(), ovlPlace[i].position.bgn, ovlPlace[i].position.end, ovlPlace[i].verified.bgn, ovlPlace[i].verified.end,
- ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang, ovl[i].flipped,
- ovlPlace[i].covered.bgn, ovlPlace[i].covered.end);
-#endif
- ovlPlace[i] = overlapPlacement();
-
- } else {
-#ifdef VERBOSE_PLACEMENT
- //if (logFileFlagSet(LOG_PLACE_FRAG))
- writeLog("placeFragUsingOverlaps()-- frag %d in unitig %d at %d,%d (verified %d,%d) from overlap ident %d %d hang %d %d flipped %d covered %d,%d\n",
- frag.ident, tig->id(), ovlPlace[i].position.bgn, ovlPlace[i].position.end, ovlPlace[i].verified.bgn, ovlPlace[i].verified.end,
- ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang, ovl[i].flipped,
- ovlPlace[i].covered.bgn, ovlPlace[i].covered.end);
-#endif
- }
-
- assert((ovlPlace[i].position.bgn < ovlPlace[i].position.end) == (ovlPlace[i].verified.bgn < ovlPlace[i].verified.end));
- } // Over all overlaps.
-
-
- // Report if any of the placement routines fail. This shouldn't happen, but if it does, it is
- // hardly fatal.
-
-#ifdef VERBOSE_PLACEMENT
- if (nFragmentsNotPlaced > 0)
- //if (logFileFlagSet(LOG_PLACE_FRAG))
- writeLog("placeFragUsingOverlaps()-- WARNING: Failed to place %d fragments\n", nFragmentsNotPlaced);
-#endif
-
- // Sort all the placements. Any overlap we couldn't place is automatically in Unitig 0, the
- // invalid unitig. Sort order is by unitig ID, then by orientation, then by position.
- //
- sort(ovlPlace, ovlPlace + ovlLen, overlapPlacement_byLocation);
-
-
- // Segregate the overlaps by placement in the unitig. We want to construct one
- // overlapPlacement for each distinct placement. How this is done:
- //
- // For all overlapping overlaps for a specific unitig and a specific fragment orientation, the
- // end points (+- a few bases) are added to an interval list. The intervals are combined.
- // Each combined interval now forms the basis of a cluster of overlaps. A list of the pairs of
- // clusters hit by each overlap is built. If there is one clear winner, that is picked. If
- // there is no clear winner, the fragment cannot be placed.
- //
- // unitig ==========================================================================
- // overlaps -------------------- ------------------
- // ------------------- -------------------
- // ------------------- -------------------
- // intervals x1x x2x x3x x4x x5x x6xx
- //
- // This unitig has two sets of "overlapping overlaps". The left set could be from a tandem
- // repeat. We'll get two overlaps to the 2,4 pair, and one overlap to the 1,3 pair. Assuming
- // that is good enough to be a clear winner, we'll ignore the 1,3 overlap and compute position
- // based on the other two overlaps.
-
- uint32 bgn = 0; // Range of overlaps with the same unitig/orientation
- uint32 end = 1;
-
- // Skip overlaps that didn't generate a placement
-
- while ((bgn < ovlLen) && (ovlPlace[bgn].tigID == 0))
- bgn++;
-
- // Process all placements.
-
- while (bgn < ovlLen) {
-
- // Find the last placement with the same unitig/orientation as the 'bgn' fragment.
- // Orientation of 'position' and 'verified' is the same, asserted above.
-
- end = bgn + 1;
- while ((end < ovlLen) &&
- (ovlPlace[bgn].tigID == ovlPlace[end].tigID) &&
- (ovlPlace[bgn].verified.isReverse() == ovlPlace[end].verified.isReverse()))
- end++;
-
- // Over all placements with the same unitig/orientation (that'd be from bgn to end), build
- // interval lists for the begin point and the end point. Remember, this is all fragments to a
- // single unitig (the whole picture above), not just the overlapping fragment sets (left or
- // right blocks).
- //
- // This used to (before MAY-2016) use the 'verified' placement, instead of the 'full' placement.
- // In long pacbio reads, this seems to result in far too many clusters - each placement is
- // derived from one overlap, which will almost never cover the whole read.
-
- intervalList<int32> bgnPoints;
- intervalList<int32> endPoints;
-
- int32 windowSlop = 0.075 * FI->fragmentLength(frag.ident);
-
- if (windowSlop < 5)
- windowSlop = 5;
-
-#ifdef VERBOSE_PLACEMENT
- writeLog("placeFragUsingOverlaps()-- windowSlop = %d\n", windowSlop);
-#endif
-
- for (uint32 oo=bgn; oo<end; oo++) {
- assert(ovlPlace[oo].tigID > 0);
-
- int32 bb = ovlPlace[oo].position.bgn;
- int32 be = ovlPlace[oo].position.bgn + windowSlop;
- int32 eb = ovlPlace[oo].position.end;
- int32 ee = ovlPlace[oo].position.end + windowSlop;
-
- bb = (bb < windowSlop) ? 0 : bb - windowSlop;
- eb = (eb < windowSlop) ? 0 : eb - windowSlop;
-
- bgnPoints.add(bb, be - bb);
- endPoints.add(eb, ee - eb);
- }
-
- bgnPoints.merge();
- endPoints.merge();
-
- // Now, assign each placement to a end-pair cluster based on the interval ID that the end point
- // falls in.
- //
- // count the number of fragments that hit each pair of points. We can do this two ways:
- // 1) With a list of point-pairs that we sort and count -- O(n) size, O(n log n) time
- // 2) With an array of all point-pairs that we increment directly -- O(p*p) size, O(n) time
- // Typically, p is small.
-
- int32 numBgnPoints = bgnPoints.numberOfIntervals();
- int32 numEndPoints = endPoints.numberOfIntervals();
-
- for (uint32 oo=bgn; oo<end; oo++) {
- int32 b = ovlPlace[oo].position.bgn;
- int32 e = ovlPlace[oo].position.end;
- int32 c = 0;
-
- ovlPlace[oo].clusterID = 0;
-
- for (int32 r=0; r<numBgnPoints; r++)
- if ((bgnPoints.lo(r) <= b) && (b <= bgnPoints.hi(r))) {
- assert(ovlPlace[oo].clusterID == 0); // Obvious; we just set it to zero above.
- ovlPlace[oo].clusterID = c = r * numEndPoints + 1;
- }
-
- for (int32 r=0; r<numEndPoints; r++)
- if ((endPoints.lo(r) <= e) && (e <= endPoints.hi(r))) {
- assert(ovlPlace[oo].clusterID == c); // Otherwise, bgn point wasn't placed in a cluster!
- ovlPlace[oo].clusterID += r;
- }
- }
-
- sort(ovlPlace + bgn, ovlPlace + end, overlapPlacement_byCluster);
-
- // Run through each 'cluster' and compute the placement.
-
- for (uint32 os=bgn, oe=bgn; os<end; ) {
- overlapPlacement op;
-
- while ((oe < end) && (ovlPlace[os].clusterID == ovlPlace[oe].clusterID))
- oe++;
-
- // Overlaps from os to oe are all for a single location. Examine them to fill out an
- // overlapPlacement, including scores.
- //
- // position: the MAX extent (which is actually exactly what the intervalList computed). A possibly
- // better solution is to use the mode.
- //
- // errors: sum of the estimated number of errors in all the overlaps
- //
- // fCoverage: coverage of the fragment. Instead of building another interval list, this is approximated
- // by (max-min) overlap position.
-
- op.frgID = frag.ident;
- op.refID = ovlPlace[os].refID;
- op.tigID = ovlPlace[os].tigID;
-
- op.fCoverage = 0.0;
-
- op.errors = 0.0;
- op.aligned = 0;
-
- assert((ovlPlace[os].position.bgn < ovlPlace[os].position.end) == (ovlPlace[os].verified.bgn < ovlPlace[os].verified.end));
-
- // op.position is not set yet.
- //assert((op.position.bgn < op.position.end) == (ovlPlace[os].verified.bgn < ovlPlace[os].verified.end));
-
- op.verified.bgn = ovlPlace[os].verified.bgn;
- op.verified.end = ovlPlace[os].verified.end;
-
- op.covered.bgn = ovlPlace[os].covered.bgn;
- op.covered.end = ovlPlace[os].covered.end;
-
- uint32 nForward = 0;
- uint32 nReverse = 0;
-
- for (uint32 oo=os; oo<oe; oo++) {
- assert(op.tigID == ovlPlace[oo].tigID);
-
- op.errors += ovlPlace[oo].errors;
- op.aligned += ovlPlace[oo].aligned;
-
- op.covered.bgn = MIN(op.covered.bgn, ovlPlace[oo].covered.bgn);
- op.covered.end = MAX(op.covered.end, ovlPlace[oo].covered.end);
-
- if (ovlPlace[oo].position.isReverse())
- nReverse++;
- else
- nForward++;
- }
-
- assert((nReverse == 0) || (nForward == 0));
-
- op.fCoverage = (op.covered.end - op.covered.bgn) / (double)FI->fragmentLength(op.frgID);
-
- // Find the first and last fragment in the unitig that we overlap with.
- //
- // The first fragment is easy. We can run through the list of overlaps, ask the unitig for
- // the ordinal of each fragment, and return the lowest.
- //
- // The last fragment is similar, but we need to return the highest ordinal that is also the
- // longest. (In the first fragment case, we are guaranteed by the construction of the unitig
- // to have the earliest fragment position).
-
- Unitig *destTig = unitigs[op.tigID];
-
- uint32 firstOrdinal = UINT32_MAX;
- uint32 firstPosition = UINT32_MAX;
- uint32 lastOrdinal = 0;
- uint32 lastPosition = 0;
-
- FragmentEnd firstEnd;
- FragmentEnd lastEnd;
-
- for (uint32 oo=os; oo<oe; oo++) {
- uint32 ordinal = destTig->pathPosition(ovlPlace[oo].refID);
- ufNode &ovlFrg = destTig->ufpath[ordinal];
- uint32 minPos = MIN(ovlFrg.position.bgn, ovlFrg.position.end);
- uint32 maxPos = MAX(ovlFrg.position.bgn, ovlFrg.position.end);
-
- //writeLog("placeFragUsingOverlaps()-- PickEnds ordinal %d tigFrg %d pos %d,%d\n",
- // ordinal, ovlFrg.ident, minPos, maxPos);
-
- // For a normal dovetail alignment, this is pretty straight forward. We pick the
- // end we align to.
- //
- // For spur alignments (repeat detection) it is backwards to what we want. More comments
- // in repeatJunctionEvidence::repeatJunctionEvidence().
- //
- // \ /
- // -----alignedfragment----
- // ------.... .....-----
- //
-
- if (((minPos < firstPosition)) ||
- ((minPos <= firstPosition) && (ordinal < firstOrdinal))) {
- firstOrdinal = ordinal;
- firstPosition = minPos;
- firstEnd = FragmentEnd(ovlFrg.ident, (ovlFrg.position.bgn < ovlFrg.position.end));
- }
-
- if (((maxPos > lastPosition)) ||
- ((maxPos >= lastPosition) && (ordinal > lastOrdinal))) {
- lastOrdinal = ordinal;
- lastPosition = minPos;
- lastEnd = FragmentEnd(ovlFrg.ident, (ovlFrg.position.end < ovlFrg.position.bgn));
- }
- }
-
- if (nForward > 0) {
- op.frag5p = firstEnd;
- op.frag3p = lastEnd;
- } else {
- op.frag5p = lastEnd;
- op.frag3p = firstEnd;
- }
-
- // Compute mean and stddev placement.
-
- uint32 numPlace = 0;
- double bgnMean = 0;
- double endMean = 0;
-
- op.bgnStdDev = 0.0;
- op.endStdDev = 0.0;
-
- for (uint32 oo=os; oo<oe; oo++) {
- if ((ovlPlace[oo].position.bgn == 0) &&
- (ovlPlace[oo].position.end == 0))
- continue;
-
- if (ovlPlace[oo].position.bgn < ovlPlace[oo].position.end) {
- if (ovlPlace[oo].verified.bgn >= ovlPlace[oo].verified.end)
- writeLog("placeFragUsingOverlaps()-- frag %d FWD verified placement invalid (bgn,end %d,%d) for position (bgn,end %d,%d)\n",
- ovlPlace[oo].frgID,
- ovlPlace[oo].verified.bgn, ovlPlace[oo].verified.end,
- ovlPlace[oo].position.bgn, ovlPlace[oo].position.end);
- assert(ovlPlace[oo].verified.bgn < ovlPlace[oo].verified.end);
-
- bgnMean += ovlPlace[oo].position.bgn;
- endMean += ovlPlace[oo].position.end;
-
- op.verified.bgn = MIN(op.verified.bgn, ovlPlace[oo].verified.bgn);
- op.verified.end = MAX(op.verified.end, ovlPlace[oo].verified.end);
-
- } else {
- if (ovlPlace[oo].verified.bgn < ovlPlace[oo].verified.end)
- writeLog("placeFragUsingOverlaps()-- frag %d REV verified placement invalid (bgn,end %d,%d) for position (bgn,end %d,%d)\n",
- ovlPlace[oo].frgID,
- ovlPlace[oo].verified.bgn, ovlPlace[oo].verified.end,
- ovlPlace[oo].position.bgn, ovlPlace[oo].position.end);
- assert(ovlPlace[oo].verified.bgn >= ovlPlace[oo].verified.end);
-
- bgnMean += ovlPlace[oo].position.end;
- endMean += ovlPlace[oo].position.bgn;
-
- op.verified.bgn = MAX(op.verified.bgn, ovlPlace[oo].verified.bgn);
- op.verified.end = MIN(op.verified.end, ovlPlace[oo].verified.end);
- }
-
- numPlace++;
- }
-
- bgnMean /= numPlace;
- endMean /= numPlace;
-
- op.position.bgn = (int32)((nReverse == 0) ? bgnMean : endMean);
- op.position.end = (int32)((nReverse == 0) ? endMean : bgnMean);
-
- for (uint32 oo=os; oo<oe; oo++) {
- if ((ovlPlace[oo].position.bgn == 0) &&
- (ovlPlace[oo].position.end == 0))
- continue;
-
- if (ovlPlace[oo].position.bgn < ovlPlace[oo].position.end) {
- op.bgnStdDev += (ovlPlace[oo].position.bgn - bgnMean) * (ovlPlace[oo].position.bgn - bgnMean);
- op.endStdDev += (ovlPlace[oo].position.end - endMean) * (ovlPlace[oo].position.end - endMean);
- } else {
- op.bgnStdDev += (ovlPlace[oo].position.end - bgnMean) * (ovlPlace[oo].position.end - bgnMean);
- op.endStdDev += (ovlPlace[oo].position.bgn - endMean) * (ovlPlace[oo].position.bgn - endMean);
- }
- }
-
- op.bgnStdDev = sqrt(op.bgnStdDev / numPlace);
- op.endStdDev = sqrt(op.endStdDev / numPlace);
-
- // Filter out bogus placements.
- //
- // This placement is invalid if the std.dev is too high on either end. (Was 3% fragment length before 11 Apr 2013)
- // This placement is invalid if both nReverse and nForward are more than zero.
-
- bool weakStdDev = false;
- bool overlappingSpan = false;
- bool spanBad = false;
- bool reject = false;
-
- double allowableStdDev = MAX(2.0, 0.075 * FI->fragmentLength(op.frgID));
-
- if ((op.bgnStdDev > allowableStdDev) ||
- (op.endStdDev > allowableStdDev))
- weakStdDev = true;
-
- if (((op.position.bgn < op.position.end) && (op.position.bgn + 3 * op.bgnStdDev > op.position.end - 3 * op.endStdDev)) ||
- ((op.position.end < op.position.bgn) && (op.position.end + 3 * op.endStdDev > op.position.bgn - 3 * op.bgnStdDev)))
- overlappingSpan = true;
-
- int32 poslen = (op.position.end > op.position.bgn) ? (op.position.end - op.position.bgn) : (op.position.bgn - op.position.end);
- int32 trulen = FI->fragmentLength(op.frgID);
- double scaled = (double)poslen / trulen;
-
- if ((scaled < 0.3333) ||
- (2.0 < scaled))
- spanBad = true;
-
-
- if ((weakStdDev) && (0))
- // Read is not known to have lots of indel, but the stddev is high.
- reject = true;
-
- if (overlappingSpan)
- // Read placements are conflicting and overlapping.
- reject = true;
-
- if (spanBad)
- // Bogus placement, more than twice as large as expected, or less than 1/3 expected.
- reject = true;
-
-
- if (reject) {
-#ifdef VERBOSE_PLACEMENT
- //if (logFileFlagSet(LOG_PLACE_FRAG)) {
- writeLog("placeFragUsingOverlaps()-- frag %d in unitig %d at %d,%d (+- %.2f,%.2f) -- cov %.2f (%d,%d) errors %.2f aligned %d novl %d -- INVALID stddev weak %d overlapping %d bad size %d\n",
- op.frgID, op.tigID, op.position.bgn, op.position.end, op.bgnStdDev, op.endStdDev,
- op.fCoverage, op.covered.bgn, op.covered.end,
- op.errors,
- op.aligned,
- oe - os,
- weakStdDev, overlappingSpan, spanBad);
- for (uint32 oo=os; oo<oe; oo++) {
- if ((ovlPlace[oo].position.bgn == 0) &&
- (ovlPlace[oo].position.end == 0))
- continue;
-
- writeLog("placeFragUsingOverlaps()-- %8u,%8u\n", ovlPlace[oo].position.bgn, ovlPlace[oo].position.end);
- }
- //}
-#endif
-
- } else {
- placements.push_back(op);
-#ifdef VERBOSE_PLACEMENT
- //if (logFileFlagSet(LOG_PLACE_FRAG))
- writeLog("placeFragUsingOverlaps()-- frag %d in unitig %d at %d,%d (+- %.2f,%.2f) -- cov %.2f (%d,%d) errors %.2f aligned %d novl %d\n",
- op.frgID, op.tigID, op.position.bgn, op.position.end, op.bgnStdDev, op.endStdDev,
- op.fCoverage, op.covered.bgn, op.covered.end,
- op.errors,
- op.aligned,
- oe - os);
-#endif
- }
-
- os = oe;
- oe = oe + 1;
- } // End of segregating overlaps by placement
-
- // Move to the next block of overlaps.
- bgn = end;
- end = end + 1;
- }
-
- delete [] ovlPlace;
-
- //logFileFlags &= ~LOG_PLACE_FRAG;
-
- return(true);
-}
diff --git a/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C
new file mode 100644
index 0000000..1753546
--- /dev/null
+++ b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.C
@@ -0,0 +1,688 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/bogart/AS_BAT_PlaceFragUsingOverlaps.C
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-12
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_Unitig.H"
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
+
+#include "intervalList.H"
+
+
+#undef TEST_ALT
+
+// Turn each overlap of read 'fid' into a candidate placement of that read in the tig holding the
+// overlapping read.  Returns a new[] array of ovlLen placements, indexed like 'ovl'.
+overlapPlacement *
+placeRead_fromOverlaps(TigVector &tigs,
+                       Unitig *target,
+                       uint32 fid,
+                       uint32 flags,
+                       uint32 ovlLen,
+                       BAToverlap *ovl) {
+  overlapPlacement *ovlPlace = new overlapPlacement[ovlLen];
+  // Overlaps that are skipped below leave their entry default-constructed.
+  for (uint32 i=0; i<ovlLen; i++) {
+    int32 tigID = tigs.inUnitig(ovl[i].b_iid);
+    Unitig *tig = tigs[tigID];
+
+    assert(ovl[i].a_iid == fid);
+
+    if (tigID == 0) // Skip if overlapping read isn't in a tig yet - unplaced contained, or garbage read.
+      continue;
+
+    if ((target != NULL) && (target != tig)) // Skip if we requested a specific tig and if this isn't it.
+      continue;
+
+    // Place the read relative to the other read.
+
+    BestEdgeOverlap edge(ovl[i]);
+    ufNode read;
+
+    if (tig->placeRead(read, fid, ovl[i].AEndIs3prime(), &edge) == false) {
+      if (logFileFlagSet(LOG_PLACE_READ))  // Hangs are signed, so they are logged with %d.
+        writeLog("pRUO()-- WARNING: Failed to place with overlap %u %u hangs %d %d flipped %u\n",
+                 ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang, ovl[i].flipped);
+      continue;
+    }
+
+    // Save the placement in our work space.
+
+    uint32 olen = RI->overlapLength(ovl[i].a_iid, ovl[i].b_iid, ovl[i].a_hang, ovl[i].b_hang);
+    uint32 flen = RI->readLength(ovl[i].a_iid);
+
+    ovlPlace[i].frgID        = fid;
+    ovlPlace[i].refID        = ovl[i].b_iid;
+    ovlPlace[i].tigID        = tig->id();
+    ovlPlace[i].position     = read.position;
+    ovlPlace[i].verified.bgn = INT32_MAX;
+    ovlPlace[i].verified.end = INT32_MIN;
+    ovlPlace[i].covered.bgn  = (ovl[i].a_hang < 0) ?    0 : ovl[i].a_hang;         // The portion of the read
+    ovlPlace[i].covered.end  = (ovl[i].b_hang > 0) ? flen : ovl[i].b_hang + flen;  // covered by the overlap.
+    ovlPlace[i].clusterID    = 0;
+    ovlPlace[i].fCoverage    = 0.0;
+    ovlPlace[i].errors       = olen * ovl[i].erate();
+    ovlPlace[i].aligned      = ovlPlace[i].covered.end - ovlPlace[i].covered.bgn;
+    ovlPlace[i].tigFidx      = UINT32_MAX;
+    ovlPlace[i].tigLidx      = 0;
+
+    assert(ovlPlace[i].covered.bgn >= 0);
+    assert(ovlPlace[i].covered.end >= 0);
+    assert(ovlPlace[i].covered.bgn <= flen);
+    assert(ovlPlace[i].covered.end <= flen);
+    assert(ovlPlace[i].covered.bgn < ovlPlace[i].covered.end);
+
+    // Disallow any placements that exceed the boundary of the unitig.  These cannot be confirmed
+    // by overlaps and might be wrong.  Sample cases:
+    //   o  sticking a unique/repeat read onto a repeat (leaving the unique uncovered)
+    //   o  sticking a chimeric read onto the end of a unitig (leaving the chimeric join uncovered)
+
+    if (((flags & placeRead_fullMatch) ||
+         (flags & placeRead_noExtend)) &&
+        ((ovlPlace[i].position.min() < 0) ||
+         (ovlPlace[i].position.max() > tig->getLength()))) {
+      ovlPlace[i] = overlapPlacement();
+    }
+
+    // Report the placement.  A disallowed placement was reset just above, so the log shows the
+    // reset values and flags the line when frgID is zero.
+    if (logFileFlagSet(LOG_PLACE_READ))
+      writeLog("pRUO()-- read %7d (%5d,%5d) in unitig %5d at %8d,%-8d via read %7d at %8d:%-8d hang %6d %6d %s%s\n",
+               ovlPlace[i].frgID,
+               ovlPlace[i].covered.bgn, ovlPlace[i].covered.end,
+               ovlPlace[i].tigID,
+               ovlPlace[i].position.bgn, ovlPlace[i].position.end,
+               ovl[i].b_iid,
+               tig->readFromId(ovl[i].b_iid)->position.bgn,
+               tig->readFromId(ovl[i].b_iid)->position.end,
+               ovl[i].a_hang, ovl[i].b_hang,
+               (ovl[i].flipped == true) ? "<--" : "-->",
+               (ovlPlace[i].frgID == 0) ? " DISALLOWED" : "");
+  } // Over all overlaps.
+
+  return(ovlPlace);
+}
+
+// Collect the begin and end coordinates of placements ovlPlace[bgn..end) into bgnPoints and
+// endPoints, each widened by a window scaled to the length of read 'fid', then merge the windows.
+void
+placeRead_assignEndPointsToCluster(uint32 bgn, uint32 end,
+                                   uint32 fid,
+                                   overlapPlacement *ovlPlace,
+                                   intervalList<int32> &bgnPoints,
+                                   intervalList<int32> &endPoints) {
+  int32 slop = 0.075 * RI->readLength(fid);  // Window half-width: 7.5% of the read length,
+  slop = (slop < 5) ? 5 : slop;              // but never less than 5.
+  int32 width = 2 * slop;
+
+  for (uint32 pp=bgn; pp<end; pp++) {
+    bgnPoints.add(ovlPlace[pp].position.bgn - slop, width);
+    endPoints.add(ovlPlace[pp].position.end - slop, width);
+  }
+  bgnPoints.merge();
+  endPoints.merge();
+
+  if (logFileFlagSet(LOG_PLACE_READ) == false)  // The rest is diagnostic output only.
+    return;
+
+  writeLog("pRUO()-- Using windowSlop %d\n", slop);
+  uint32 nBgn = bgnPoints.numberOfIntervals();
+  writeLog("pRUO()-- Found %3u bgn interval%s", nBgn, (nBgn == 1) ? ": " : "s: ");
+  for (uint32 ii=0; ii<nBgn; ii++)
+    writeLog(" %6d:%-6d", bgnPoints.lo(ii), bgnPoints.hi(ii));
+  writeLog("\n");
+  uint32 nEnd = endPoints.numberOfIntervals();
+  writeLog("pRUO()-- Found %3u end interval%s", nEnd, (nEnd == 1) ? ": " : "s: ");
+  for (uint32 ii=0; ii<nEnd; ii++)
+    writeLog(" %6d:%-6d", endPoints.lo(ii), endPoints.hi(ii));
+  writeLog("\n");
+}
+
+// Label each placement in ovlPlace[bgn..end) with a clusterID built from the merged intervals its
+// end points fall in: (bgn interval index * number of end intervals + 1) + end interval index.
+void
+placeRead_assignPlacementsToCluster(uint32 bgn, uint32 end,
+                                    uint32 fid,
+                                    overlapPlacement *ovlPlace,
+                                    intervalList<int32> &bgnPoints,
+                                    intervalList<int32> &endPoints) {
+  int32 nBgn = bgnPoints.numberOfIntervals();
+  int32 nEnd = endPoints.numberOfIntervals();
+
+  for (uint32 pp=bgn; pp<end; pp++) {
+    overlapPlacement &pl = ovlPlace[pp];
+    int32 bpos = pl.position.bgn;
+    int32 epos = pl.position.end;
+    int32 base = 0;                       // clusterID as assigned from the bgn interval.
+    pl.clusterID = 0;
+    for (int32 ii=0; ii<nBgn; ii++) {
+      if ((bpos < bgnPoints.lo(ii)) || (bgnPoints.hi(ii) < bpos))
+        continue;
+      assert(pl.clusterID == 0);          // No earlier bgn interval may have matched.
+      pl.clusterID = base = ii * nEnd + 1;
+    }
+    for (int32 ii=0; ii<nEnd; ii++) {
+      if ((epos < endPoints.lo(ii)) || (endPoints.hi(ii) < epos))
+        continue;
+      assert(pl.clusterID == base);       // Still exactly the value from the bgn interval.
+      pl.clusterID += ii;
+    }
+  }
+}
+
+// Set op.tigFidx / op.tigLidx to the smallest / largest ufpath index in 'tig' among the reference
+// reads of placements ovlPlace[os..oe).  The assert fires if that range ends up inverted.
+void
+placeRead_findFirstLastOverlapping(overlapPlacement &op,
+                                   Unitig *tig,
+                                   uint32 os, uint32 oe,
+                                   overlapPlacement *ovlPlace) {
+  op.tigFidx = UINT32_MAX;  // Start from an inverted range; each index
+  op.tigLidx = 0;           // seen below can only narrow it to a valid one.
+
+  for (uint32 pp=os; pp<oe; pp++) {
+    uint32 idx = tig->ufpathIdx(ovlPlace[pp].refID);
+
+    if (idx < op.tigFidx)
+      op.tigFidx = idx;
+    if (idx > op.tigLidx)
+      op.tigLidx = idx;
+  }
+
+  if (op.tigFidx > op.tigLidx)
+    writeStatus("pRUO()-- Invalid placement indices: tigFidx %u tigLidx %u\n", op.tigFidx, op.tigLidx);
+  assert(op.tigFidx <= op.tigLidx);
+
+  if (logFileFlagSet(LOG_PLACE_READ) == false)
+    return;
+
+  writeLog("pRUO()-- spans reads #%u (%u) to #%u (%u) in tig %u\n",
+           op.tigFidx, tig->ufpath[op.tigFidx].ident,
+           op.tigLidx, tig->ufpath[op.tigLidx].ident,
+           op.tigID);
+}
+
+// Sum the errors and aligned bases, and find the span of the read covered, over placements
+// ovlPlace[os..oe) whose position is not 0,0.  Fills op.errors, op.aligned, op.covered, op.fCoverage.
+void
+placeRead_computeQualityAndCoverage(overlapPlacement &op,
+                                    uint32 os, uint32 oe,
+                                    overlapPlacement *ovlPlace) {
+  op.errors = 0;
+  op.aligned = 0;
+
+  op.covered.bgn = INT32_MAX;  // Covered interval is always in
+  op.covered.end = INT32_MIN;  // forward read coordinates
+
+  for (uint32 oo=os; oo<oe; oo++) {
+    if ((ovlPlace[oo].position.bgn == 0) &&
+        (ovlPlace[oo].position.end == 0)) {
+      if (logFileFlagSet(LOG_PLACE_READ))  // Six conversions below, so 'oo' must lead the arguments.
+        writeLog("OLD place=%3d read %8d ref read %8d - covered %5d:%-5d with %6.1f errors - DELETED\n",
+                 oo, op.frgID, ovlPlace[oo].refID, ovlPlace[oo].covered.bgn, ovlPlace[oo].covered.end, ovlPlace[oo].errors);
+      continue;
+    }
+
+    op.errors += ovlPlace[oo].errors;
+    op.aligned += ovlPlace[oo].aligned;
+
+    op.covered.bgn = min(op.covered.bgn, ovlPlace[oo].covered.bgn);
+    op.covered.end = max(op.covered.end, ovlPlace[oo].covered.end);
+  }
+
+  // Fraction of the read spanned by the covered interval.
+  // NOTE(review): if no placement was accumulated above, covered still holds the INT32_MAX /
+  // INT32_MIN sentinels and this subtraction overflows -- confirm callers never pass such a
+  // range.
+  op.fCoverage = (op.covered.end - op.covered.bgn) / (double)RI->readLength(op.frgID);
+}
+
+// Sum the errors and aligned bases, and find the span of the read covered, over every overlap in
+// ovl[0..ovlLen) whose b read is in 'reads'.  Fills op.errors, op.aligned, op.covered, op.fCoverage.
+void
+placeRead_computeQualityAndCoverage(overlapPlacement &op,
+                                    BAToverlap *ovl,
+                                    uint32 ovlLen,
+                                    set<uint32> &reads) {
+  op.errors = 0;
+  op.aligned = 0;
+
+  op.covered.bgn = INT32_MAX;  // Covered interval is always in
+  op.covered.end = INT32_MIN;  // forward read coordinates
+
+  // A read can have two overlaps to the same other read.  We can't tell which one is correct,
+  // only that an overlap exists, so both are counted here.
+  //
+  // This happens in dros a whole bunch of times, and does change the fCoverage value.
+
+  for (uint32 oo=0; oo<ovlLen; oo++) {
+    BAToverlap &o = ovl[oo];
+
+    if (reads.find(o.b_iid) == reads.end())
+      continue;
+
+    int32 olen = RI->overlapLength(o.a_iid, o.b_iid, o.a_hang, o.b_hang);
+    int32 flen = RI->readLength(o.a_iid);
+
+    int32 cbgn = 0;     // The portion of the read
+    int32 cend = flen;  // covered by the overlap.
+    if (o.a_hang >= 0)  cbgn = o.a_hang;
+    if (o.b_hang <= 0)  cend = o.b_hang + flen;
+
+    op.errors += olen * o.erate();
+    op.aligned += cend - cbgn;
+
+    if (cbgn < op.covered.bgn)  op.covered.bgn = cbgn;
+    if (cend > op.covered.end)  op.covered.end = cend;
+  }
+
+  op.fCoverage = (op.covered.end - op.covered.bgn) / (double)RI->readLength(op.frgID);
+}
+
+
+
+
+// Now that the read is placed, estimate the span of the tig that is verified by overlaps: the
+// covered part of the read (op.covered) is projected onto the tig at op.position and stored in
+// op.verified.  The floating end is thresholded so it doesn't exceed the placement.
+//
+// The projection can, and does, exceed the bounds of the tig (indel in the read and all that),
+// so the low end is also clamped to 0 and the high end to tigLen.
+void
+placeRead_computeVerified(overlapPlacement &op, uint32 tigLen) {
+
+  // Reverse placement: coordinates decrease from bgn to end.
+  if (op.position.isForward() == false) {
+    op.verified.bgn = op.position.bgn - op.covered.bgn;  // High coord
+    op.verified.end = op.position.bgn - op.covered.end;  // Low coord
+
+    if (op.verified.end < op.position.end)  // Keep the floating (low) end inside the placement.
+      op.verified.end = op.position.end;
+
+    if (op.verified.end < 0)
+      op.verified.end = 0;
+    if (op.verified.bgn > tigLen)
+      op.verified.bgn = tigLen;
+
+    assert(op.verified.end >= op.position.end);
+    assert(op.verified.bgn <= op.position.bgn);
+    assert(op.verified.end <  op.verified.bgn);
+  }
+
+  // Forward placement: coordinates increase from bgn to end.
+  else {
+    op.verified.bgn = op.position.bgn + op.covered.bgn;
+    op.verified.end = op.position.bgn + op.covered.end;
+
+    if (op.verified.end > op.position.end)  // Keep the floating (high) end inside the placement.
+      op.verified.end = op.position.end;
+
+    if (op.verified.bgn < 0)
+      op.verified.bgn = 0;
+    if (op.verified.end > tigLen)
+      op.verified.end = tigLen;
+
+    assert(op.verified.bgn >= op.position.bgn);
+    assert(op.verified.end <= op.position.end);
+    assert(op.verified.bgn <  op.verified.end);
+  }
+
+  assert(op.position.isForward() == op.verified.isForward());
+}
+
+
+
+//  Compute the position of a read from one cluster of candidate placements.
+//
+//  The begin and end coordinates of ovlPlace[os] through ovlPlace[oe-1] are accumulated
+//  (candidates sitting at exactly 0,0 are ignored) and their means become op.position.
+//  The verified span is then derived from that position and the length of 'tig'.
+//
+void
+placeRead_computePlacement(overlapPlacement &op,
+                           uint32            os,
+                           uint32            oe,
+                           overlapPlacement *ovlPlace,
+                           Unitig           *tig) {
+  stdDev<double>   bgnStat;
+  stdDev<double>   endStat;
+
+  for (uint32 pp=os; pp<oe; pp++) {
+    const SeqInterval &pos = ovlPlace[pp].position;
+
+    //  Only candidates with at least one non-zero coordinate contribute.
+    if ((pos.bgn != 0) || (pos.end != 0)) {
+      //writeLog("OLD place %d-%d\n", pos.bgn, pos.end);
+
+      bgnStat.insert(pos.bgn);
+      endStat.insert(pos.end);
+    }
+  }
+
+  bgnStat.finalize();
+  endStat.finalize();
+
+  op.position.bgn = bgnStat.mean();
+  op.position.end = endStat.mean();
+
+  placeRead_computeVerified(op, tig->getLength());
+}
+
+
+
+
+//  Compute the position of a read directly from its overlaps, instead of from
+//  precomputed candidate placements.
+//
+//  Only overlaps whose b_iid is in 'reads' are used.  Each is turned into a position with
+//  Unitig::placeRead(); the means of the begin and end coordinates become op.position,
+//  and the verified span is then derived from it.
+//
+//  For reads that have two overlaps to the same other read, we have no way of knowing
+//  which is the correct overlap, just that we have an overlap.
+//
+void
+placeRead_computePlacement(overlapPlacement &op,
+                           BAToverlap       *ovl,
+                           uint32            ovlLen,
+                           set<uint32>      &reads,
+                           uint32            flags,
+                           Unitig           *tig) {
+  stdDev<double>   bgnStat;
+  stdDev<double>   endStat;
+
+  //  Either flag means a position extending past the ends of the tig is not used.
+  bool  insideOnly = ((flags & (placeRead_fullMatch | placeRead_noExtend)) != 0);
+
+  for (uint32 ii=0; ii<ovlLen; ii++) {
+    BAToverlap &olap = ovl[ii];
+
+    if (reads.count(olap.b_iid) == 0)
+      continue;
+
+    BestEdgeOverlap  edge(olap);
+    ufNode           read;
+
+    if (!tig->placeRead(read, op.frgID, olap.AEndIs3prime(), &edge)) {
+      if (logFileFlagSet(LOG_PLACE_READ))
+        writeLog("pRUO()-- WARNING: Failed to place with overlap %u %u hangs %d %d flipped %u\n",
+                 olap.a_iid, olap.b_iid, olap.a_hang, olap.b_hang, olap.flipped);
+      continue;
+    }
+
+    if (insideOnly &&
+        ((read.position.min() < 0) ||
+         (read.position.max() > tig->getLength())))
+      continue;
+
+    //writeLog("NEW place %d-%d\n", read.position.bgn, read.position.end);
+
+    bgnStat.insert(read.position.bgn);
+    endStat.insert(read.position.end);
+  }
+
+  bgnStat.finalize();
+  endStat.finalize();
+
+  op.position.bgn = bgnStat.mean();
+  op.position.end = endStat.mean();
+
+  placeRead_computeVerified(op, tig->getLength());
+}
+
+
+
+
+//  Place read 'fid' using its overlaps.
+//
+//  Every overlap is turned into a candidate placement by placeRead_fromOverlaps().  The
+//  candidates are grouped by tig and orientation, clustered by their end points, and each
+//  cluster is reduced to a single overlapPlacement.
+//
+//    tigs       - all tigs; indexed by the tigID of each candidate placement.
+//    target     - passed through to placeRead_fromOverlaps(); when NULL the log reports
+//                 placement into all tigs.
+//    fid        - read to place; must be in 1 .. RI->numReads().
+//    placements - output; cleared on entry, one entry appended per accepted cluster.
+//    flags      - placeRead_fullMatch rejects clusters with fCoverage below 0.99;
+//                 placeRead_noExtend rejects clusters positioned outside the tig.
+//
+//  Always returns true.
+//
+bool
+placeReadUsingOverlaps(TigVector                &tigs,
+                       Unitig                   *target,
+                       uint32                    fid,
+                       vector<overlapPlacement> &placements,
+                       uint32                    flags) {
+
+  //if ((fid == 232074) || (fid == 72374) || (fid == 482602))
+  //  logFileFlags |= LOG_PLACE_READ;
+
+  if (logFileFlagSet(LOG_PLACE_READ)) {
+    if (target)
+      writeLog("\npRUO()-- begin for read %d into target tig %d\n", fid, target->id());
+    else
+      writeLog("\npRUO()-- begin for read %d into all tigs\n", fid);
+  }
+
+  assert(fid > 0);
+  assert(fid <= RI->numReads());
+
+  //  Grab overlaps we'll use to place this read.
+
+  uint32      ovlLen = 0;
+  BAToverlap *ovl    = OC->getOverlaps(fid, ovlLen);
+
+  //  Grab some work space, and clear the output.
+
+  placements.clear();
+
+  //  Compute placements.  Anything that doesn't get placed is left as 'nowhere', specifically, in
+  //  unitig 0 (which doesn't exist).
+
+  overlapPlacement *ovlPlace = placeRead_fromOverlaps(tigs, target, fid, flags, ovlLen, ovl);
+
+  //  We've placed the read in all possible places, or set unitig ID to 0 (an invalid unitig).
+  //  Sort all the placements.  Sort order is:
+  //    unitig ID (so zero is first)
+  //    placed orientation (reverse is first)
+  //    position
+
+  sort(ovlPlace, ovlPlace + ovlLen, overlapPlacement_byLocation);
+
+
+  //  Segregate the overlaps by placement in the unitig.  We want to construct one
+  //  overlapPlacement for each distinct placement.  How this is done:
+  //
+  //  For all overlapping overlaps for a specific unitig and a specific read orientation, the
+  //  end points (+- a few bases) are added to an interval list.  The intervals are combined.
+  //  Each combined interval now forms the basis of a cluster of overlaps.  A list of the pairs of
+  //  clusters hit by each overlap is built.  If there is one clear winner, that is picked.  If
+  //  there is no clear winner, the read cannot be placed.
+  //
+  //  unitig    ==========================================================================
+  //  overlaps       --------------------                         ------------------
+  //                          -------------------                 -------------------
+  //                          -------------------                 -------------------
+  //  intervals     x1x      x2x       x3x     x4x               x5x              x6xx
+  //
+  //  This unitig has two sets of "overlapping overlaps".  The left set could be from a tandem
+  //  repeat.  We'll get two overlaps to the 2,4 pair, and one overlap to the 1,3 pair.  Assuming
+  //  that is good enough to be a clear winner, we'll ignore the 1,3 overlap and compute position
+  //  based on the other two overlaps.
+
+  uint32         bgn = 0;  //  Range of overlaps with the same unitig/orientation
+  uint32         end = 1;
+
+  //  Skip overlaps that didn't generate a placement
+
+  while ((bgn < ovlLen) && (ovlPlace[bgn].tigID == 0))
+    bgn++;
+
+  //  Process all placements.
+
+  while (bgn < ovlLen) {
+
+    //  Find the last placement with the same unitig/orientation as the 'bgn' read.
+
+    end = bgn + 1;
+    while ((end < ovlLen) &&
+           (ovlPlace[bgn].tigID == ovlPlace[end].tigID) &&
+           (ovlPlace[bgn].position.isReverse() == ovlPlace[end].position.isReverse()))
+      end++;
+
+    if (logFileFlagSet(LOG_PLACE_READ))
+      writeLog("\nplaceReadUsingOverlaps()-- Merging placements %u to %u to place the read.\n", bgn, end);
+
+    //  Build interval lists for the begin point and the end point.  Remember, this is all reads
+    //  to a single unitig (the whole picture above), not just the overlapping read sets (left
+    //  or right blocks).
+
+    intervalList<int32>   bgnPoints;
+    intervalList<int32>   endPoints;
+
+    placeRead_assignEndPointsToCluster(bgn, end, fid, ovlPlace, bgnPoints, endPoints);
+
+    //  Now, assign each placement to an end-pair cluster based on the interval ID that the end point falls in.
+    //
+    //  Count the number of reads that hit each pair of points.  Assign each ovlPlace to an implicit
+    //  numbering of each pair of points.
+
+    placeRead_assignPlacementsToCluster(bgn, end, fid, ovlPlace, bgnPoints, endPoints);
+
+    //  Sort these placements by their clusterID.
+
+    sort(ovlPlace + bgn, ovlPlace + end, overlapPlacement_byCluster);
+
+    //  Run through each 'cluster' and compute a final placement for the read.
+    //    A cluster extends from placements os to oe.
+    //    Each cluster generates one placement.
+
+    for (uint32 os=bgn, oe=bgn+1; os<end; ) {
+
+      //  Find the end ovlPlace, oe, for this cluster, and do a quick check on orientation.
+
+      for (oe=os+1; (oe < end) && (ovlPlace[os].clusterID == ovlPlace[oe].clusterID); oe++) {
+        assert(ovlPlace[os].tigID                == ovlPlace[oe].tigID);
+        assert(ovlPlace[os].position.isReverse() == ovlPlace[oe].position.isReverse());
+      }
+
+      //  Build the set of reads we care about.
+
+#ifdef TEST_ALT
+      set<uint32> reads;
+
+      for (uint32 oo=os; oo<oe; oo++)
+        reads.insert(ovlPlace[oo].refID);
+#endif
+
+      //  Make a new overlapPlacement from the first placement in this cluster.
+
+      if (logFileFlagSet(LOG_PLACE_READ))
+        writeLog("pRUO()-- process clusterID %u\n", ovlPlace[os].clusterID);
+
+      overlapPlacement  op(fid, ovlPlace[os]);
+
+      //  Find the first and last read in the unitig that we overlap with.
+
+      placeRead_findFirstLastOverlapping(op, tigs[op.tigID], os, oe, ovlPlace);
+
+      //  Sum the errors and bases aligned for each overlap.
+      //  Find the minimum and maximum coordinates covered in the read, use that to compute the
+      //  fraction of read coverage.
+
+      placeRead_computeQualityAndCoverage(op, os, oe, ovlPlace);
+
+#ifdef TEST_ALT
+      //  Test the alternate qual and cov compute that uses overlaps directly
+      //  NOTE(review): the differences below are not absolute values, so only a change in
+      //  one direction is reported for 'er' and 'fC'.
+      {
+        double er = op.errors;
+        uint32 al = op.aligned;
+        double fC = op.fCoverage;
+        int32  bg = op.covered.bgn;
+        int32  ed = op.covered.end;
+
+        placeRead_computeQualityAndCoverage(op, ovl, ovlLen, reads);
+
+        if ((er - op.errors > 0.0001) ||
+            ((int32)al - (int32)op.aligned != 0) ||
+            (fC - op.fCoverage > 0.0001) ||
+            (bg - op.covered.bgn != 0) ||
+            (ed - op.covered.end != 0))
+          writeLog("COMPARE er %8.3f %8.3f %8.3f al %7u %7u %7d fC %8.4f %8.4f %8.4f bg %8d %8d %8d ed %8d %8d %8d\n",
+                   er, op.errors, er - op.errors,
+                   al, op.aligned, (int32)al - (int32)op.aligned,
+                   fC, op.fCoverage, fC - op.fCoverage,
+                   bg, op.covered.bgn, bg - op.covered.bgn,
+                   ed, op.covered.end, ed - op.covered.end);
+      }
+#endif
+
+      //  Compute placement based on the longest overlap on each end, or the best contain.
+
+      placeRead_computePlacement(op, os, oe, ovlPlace, tigs[op.tigID]);
+
+#ifdef TEST_ALT
+      //  NOTE(review): as above, only differences in one direction are reported.
+      {
+        SeqInterval  origpos = op.position;
+        SeqInterval  origver = op.verified;
+
+        placeRead_computePlacement(op, ovl, ovlLen, reads, flags, tigs[op.tigID]);
+
+        if ((origpos.bgn - op.position.bgn > 10) ||   //  Placements wobble by a few bases
+            (origpos.end - op.position.end > 10) ||
+            (origver.bgn - op.verified.bgn > 10) ||
+            (origver.end - op.verified.end > 10))
+          writeLog("COMPARE pos bgn %d-%d end %d-%d ver bgn %d-%d end %d-%d\n",
+                   origpos.bgn, op.position.bgn,
+                   origpos.end, op.position.end,
+                   origver.bgn, op.verified.bgn,
+                   origver.end, op.verified.end);
+      }
+#endif
+
+      //  Filter out bogus placements.  There used to be a few more, but they made no sense for long reads.
+      //  Reject if either end stddev is high.  It has to be pretty bad before this triggers.
+
+      bool   goodPlacement  = true;
+
+#if 0
+      double allowableStdDev = max(2.0, 0.075 * RI->readLength(op.frgID));
+
+      if ((bgnPos.stddev() > allowableStdDev) ||
+          (endPos.stddev() > allowableStdDev))
+        goodPlacement = false;
+#endif
+
+      if ((flags & placeRead_fullMatch) &&
+          (op.fCoverage < 0.99))
+        goodPlacement = false;
+
+      if ((flags & placeRead_noExtend) &&
+          ((op.position.min() < 0) ||
+           (op.position.max() > tigs[op.tigID]->getLength())))
+        goodPlacement = false;
+
+      if (goodPlacement)
+        placements.push_back(op);
+
+      //  Index to report in the log.  A rejected placement was not stored, so report the slot
+      //  it would have taken; size()-1 would name the previous placement, or wrap around when
+      //  nothing has been stored yet.  The explicit uint32 also matches the %u conversion.
+
+      uint32  placeIdx = (uint32)placements.size() - ((goodPlacement) ? 1 : 0);
+
+      if (logFileFlagSet(LOG_PLACE_READ))
+        writeLog("pRUO()-- placements[%u] - PLACE READ %d in tig %d at %d,%d -- verified %d,%d -- covered %d,%d %4.1f%% -- errors %.2f aligned %d novl %d%s\n",
+                 placeIdx,
+                 op.frgID, op.tigID,
+                 op.position.bgn, op.position.end,
+                 op.verified.bgn, op.verified.end,
+                 op.covered.bgn, op.covered.end,
+                 op.fCoverage * 100.0,
+                 op.errors, op.aligned, oe - os,
+                 (goodPlacement == false) ? " -- INVALID" : "");
+
+      os = oe;
+    }  //  End of segregating overlaps by placement
+
+    //  Move to the next block of overlaps.
+    bgn = end;
+  }
+
+  delete [] ovlPlace;
+
+  //if ((fid == 232074) || (fid == 72374) || (fid == 482602))
+  //  logFileFlags &= ~LOG_PLACE_READ;
+
+  return(true);
+}
diff --git a/src/bogart/AS_BAT_PlaceFragUsingOverlaps.H b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
similarity index 53%
rename from src/bogart/AS_BAT_PlaceFragUsingOverlaps.H
rename to src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
index 3067503..0c90328 100644
--- a/src/bogart/AS_BAT_PlaceFragUsingOverlaps.H
+++ b/src/bogart/AS_BAT_PlaceReadUsingOverlaps.H
@@ -15,19 +15,11 @@
*
* This file is derived from:
*
- * src/AS_BAT/AS_BAT_PlaceFragUsingOverlaps.H
+ * src/bogart/AS_BAT_PlaceFragUsingOverlaps.H
*
* Modifications by:
*
- * Brian P. Walenz from 2010-DEC-06 to 2013-AUG-01
- * are Copyright 2010-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2014-DEC-19 to 2015-JUN-03
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
+ * Brian P. Walenz beginning on 2016-AUG-12
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -35,68 +27,84 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_PLACEFRAGUSINGOVERLAPS
-#define INCLUDE_AS_BAT_PLACEFRAGUSINGOVERLAPS
+#ifndef INCLUDE_AS_BAT_PLACEREADUSINGOVERLAPS
+#define INCLUDE_AS_BAT_PLACEREADUSINGOVERLAPS
#include "AS_BAT_OverlapCache.H"
-#include "AS_BAT_BestOverlapGraph.H" // For FragmentEnd
+#include "AS_BAT_BestOverlapGraph.H" // For ReadEnd
#include "AS_BAT_Unitig.H" // For SeqInterval
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
class overlapPlacement {
public:
- overlapPlacement() {
- frgID = 0;
- refID = 0;
+ overlapPlacement(uint32 fi=0) {
+ frgID = fi;
+
+ refID = 0;
+ tigID = 0;
+ position = SeqInterval();
+ verified = SeqInterval();
+ covered = SeqInterval();
+
+ clusterID = 0;
+ fCoverage = 0.0;
+
+ errors = 0.0;
+ aligned = 0;
+
+ tigFidx = UINT32_MAX;
+ tigLidx = 0;
+ };
- tigID = 0;
- position.bgn = 0;
- position.end = 0;
+ overlapPlacement(uint32 fid, overlapPlacement &op) {
+ frgID = fid;
- verified.bgn = 0;
- verified.end = 0;
+ refID = UINT32_MAX; // Not valid in the output overlapPlacement.
+ tigID = op.tigID;
+ position.bgn = 0;
+ position.end = 0;
- bgnStdDev = 0.0;
- endStdDev = 0.0;
+ verified.bgn = 0;
+ verified.end = 0;
- clusterID = 0;
+ covered.bgn = op.covered.bgn;
+ covered.end = op.covered.end;
- fCoverage = 0.0;
+ clusterID = op.clusterID; // Useless to track forward.
- errors = 0.0;
- aligned = 0;
- covered.bgn = 0;
- covered.end = 0;
+ fCoverage = 0.0;
- frag5p = FragmentEnd();
- frag3p = FragmentEnd();
+ errors = 0.0;
+ aligned = 0;
+
+ tigFidx = UINT32_MAX;
+ tigLidx = UINT32_MAX;
};
+
~overlapPlacement() {
};
+
public:
- uint32 frgID; // ID of the fragment this position is for.
- uint32 refID; // ID if the fragment we used to place this frag (the reference).
+ uint32 frgID; // Read ID of the read this position is for.
+ uint32 refID; // Read ID of the overlapping read were placed with.
uint32 tigID; // Unitig ID of this placement
SeqInterval position; // Unitig position of this placement
SeqInterval verified; // Unitig position of this placement, verified by overlaps
-
- double bgnStdDev; // Standard deviation of position.bgn
- double endStdDev; // Standard deviation of position.end
+ SeqInterval covered; // Position of the overlap on the read
int32 clusterID;
- double fCoverage; // Coverage of the fragment
+ double fCoverage; // Coverage of the read
double errors; // number of errors in alignments
uint32 aligned; // number of bases in alignments
- SeqInterval covered; // Position of the overlap on the read
- FragmentEnd frag5p; // First unitig fragment that supports this placement
- FragmentEnd frag3p; // Last unitig fragment that supports this placement
+ uint32 tigFidx; // First unitig read that supports this placement
+ uint32 tigLidx; // Last unitig read that supports this placement
};
@@ -127,12 +135,16 @@ overlapPlacement_byCluster(const overlapPlacement &A, const overlapPlacement &B)
}
+const uint32 placeRead_all = 0x00; // Return all alignments
+const uint32 placeRead_fullMatch = 0x01; // Return only alignments for the whole read
+const uint32 placeRead_noExtend = 0x02; // Return only alignments contained in the tig
+
bool
-placeFragUsingOverlaps(UnitigVector &unitigs,
- double erate,
+placeReadUsingOverlaps(TigVector &tigs,
Unitig *target,
uint32 fid,
- vector<overlapPlacement> &placements);
+ vector<overlapPlacement> &placements,
+ uint32 flags = placeRead_all);
-#endif // INCLUDE_AS_BAT_PLACEFRAGUSINGOVERLAPS
+#endif // INCLUDE_AS_BAT_PLACEREADUSINGOVERLAPS
diff --git a/src/bogart/AS_BAT_PopBubbles.txt b/src/bogart/AS_BAT_PopBubbles.txt
index ac90b0e..7826d48 100644
--- a/src/bogart/AS_BAT_PopBubbles.txt
+++ b/src/bogart/AS_BAT_PopBubbles.txt
@@ -6,7 +6,7 @@ findPotentialBubbles()
findBubbleReadPlacements()
- threaded on the reads
- - for reads in potential bubbles, uses placeFragUsingOverlaps() to find high-quality
+ - for reads in potential bubbles, uses placeReadUsingOverlaps() to find high-quality
alignments to unitigs that can pop the bubble.
- returns an array of vector<overlapPlacement> - one vector per read - of the placements
for this read. Placements are high quality and to popper tigs only.
diff --git a/src/bogart/AS_BAT_PopulateUnitig.C b/src/bogart/AS_BAT_PopulateUnitig.C
index 4e0c505..56367b9 100644
--- a/src/bogart/AS_BAT_PopulateUnitig.C
+++ b/src/bogart/AS_BAT_PopulateUnitig.C
@@ -35,7 +35,7 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
@@ -44,71 +44,70 @@
#include "AS_BAT_PopulateUnitig.H"
-
void
populateUnitig(Unitig *unitig,
BestEdgeOverlap *bestnext) {
assert(unitig->getLength() > 0);
- if ((bestnext == NULL) || (bestnext->fragId() == 0))
+ if ((bestnext == NULL) || (bestnext->readId() == 0))
// Nothing to add!
return;
- ufNode frag = unitig->ufpath.back();
+ ufNode read = unitig->ufpath.back();
- // The ID of the last fragment in the unitig, and the end we should walk off of it.
- int32 lastID = frag.ident;
- bool last3p = (frag.position.bgn < frag.position.end);
+ // The ID of the last read in the unitig, and the end we should walk off of it.
+ int32 lastID = read.ident;
+ bool last3p = (read.position.bgn < read.position.end);
uint32 nAdded = 0;
- // While there are fragments to add AND those fragments to add are not already in a unitig,
- // construct a reverse-edge, and add the fragment.
+ // While there are reads to add AND those reads to add are not already in a unitig,
+ // construct a reverse-edge, and add the read.
- while ((bestnext->fragId() != 0) &&
- (Unitig::fragIn(bestnext->fragId()) == 0)) {
+ while ((bestnext->readId() != 0) &&
+ (unitig->inUnitig(bestnext->readId()) == 0)) {
BestEdgeOverlap bestprev;
- // Reverse nextedge (points from the unitig to the next fragment to add) so that it points from
- // the next fragment to add back to something in the unitig. If the fragments are
- // innie/outtie, we need to reverse the overlap to maintain that the A fragment is forward.
+ // Reverse nextedge (points from the unitig to the next read to add) so that it points from
+ // the next read to add back to something in the unitig. If the reads are
+ // innie/outtie, we need to reverse the overlap to maintain that the A read is forward.
- if (last3p == bestnext->frag3p())
+ if (last3p == bestnext->read3p())
bestprev.set(lastID, last3p, bestnext->bhang(), bestnext->ahang(), bestnext->evalue());
else
bestprev.set(lastID, last3p, -bestnext->ahang(), -bestnext->bhang(), bestnext->evalue());
- // We just made 'bestprev' pointing from read 'bestnext->fragId()' end 'bestnext->frag3p()'
+ // We just made 'bestprev' pointing from read 'bestnext->readId()' end 'bestnext->read3p()'
// back to read 'lastID' end 'last3p'. Compute the placement.
- if (unitig->placeFrag(frag, bestnext->fragId(), bestnext->frag3p(), &bestprev)) {
- unitig->addFrag(frag, 0, false);
+ if (unitig->placeRead(read, bestnext->readId(), bestnext->read3p(), &bestprev)) {
+ unitig->addRead(read, 0, false);
nAdded++;
} else {
- writeLog("ERROR: Failed to place frag %d into BOG path.\n", frag.ident);
+ writeLog("ERROR: Failed to place read %d into BOG path.\n", read.ident);
assert(0);
}
- // Set up for the next fragmnet
+ // Set up for the next read
- lastID = frag.ident;
- last3p = (frag.position.bgn < frag.position.end);
+ lastID = read.ident;
+ last3p = (read.position.bgn < read.position.end);
bestnext = OG->getBestEdgeOverlap(lastID, last3p);
}
if (logFileFlagSet(LOG_BUILD_UNITIG))
- if (bestnext->fragId() == 0)
- writeLog("Stopped adding at frag %u/%c' because no next best edge. Added %u reads.\n",
+ if (bestnext->readId() == 0)
+ writeLog("Stopped adding at read %u/%c' because no next best edge. Added %u reads.\n",
lastID, (last3p) ? '3' : '5',
nAdded);
else
- writeLog("Stopped adding at frag %u/%c' beacuse next best frag %u/%c' is in unitig %u. Added %u reads.\n",
+ writeLog("Stopped adding at read %u/%c' beacuse next best read %u/%c' is in unitig %u. Added %u reads.\n",
lastID, (last3p) ? '3' : '5',
- bestnext->fragId(), bestnext->frag3p() ? '3' : '5',
- Unitig::fragIn(bestnext->fragId()),
+ bestnext->readId(), bestnext->read3p() ? '3' : '5',
+ unitig->inUnitig(bestnext->readId()),
nAdded);
}
@@ -116,32 +115,32 @@ populateUnitig(Unitig *unitig,
void
-populateUnitig(UnitigVector &unitigs,
- int32 fi) {
+populateUnitig(TigVector &tigs,
+ int32 fi) {
- if ((FI->fragmentLength(fi) == 0) || // Skip deleted
- (Unitig::fragIn(fi) != 0) || // Skip placed
+ if ((RI->readLength(fi) == 0) || // Skip deleted
+ (tigs.inUnitig(fi) != 0) || // Skip placed
(OG->isContained(fi) == true)) // Skip contained
return;
- Unitig *utg = unitigs.newUnitig(logFileFlagSet(LOG_BUILD_UNITIG));
+ Unitig *utg = tigs.newUnitig(logFileFlagSet(LOG_BUILD_UNITIG));
- // Add a first fragment -- to be 'compatable' with the old code, the first fragment is added
+ // Add a first read -- to be 'compatible' with the old code, the first read is added
// reversed, we walk off of its 5' end, flip it, and add the 3' walk.
- ufNode frag;
+ ufNode read;
- frag.ident = fi;
- frag.contained = 0;
- frag.parent = 0;
- frag.ahang = 0;
- frag.bhang = 0;
- frag.position.bgn = FI->fragmentLength(fi);
- frag.position.end = 0;
+ read.ident = fi;
+ read.contained = 0;
+ read.parent = 0;
+ read.ahang = 0;
+ read.bhang = 0;
+ read.position.bgn = RI->readLength(fi);
+ read.position.end = 0;
- utg->addFrag(frag, 0, logFileFlagSet(LOG_BUILD_UNITIG));
+ utg->addRead(read, 0, logFileFlagSet(LOG_BUILD_UNITIG));
- // Add fragments as long as there is a path to follow...from the 3' end of the first fragment.
+ // Add reads as long as there is a path to follow...from the 3' end of the first read.
BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(fi, false);
BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(fi, true);
@@ -151,48 +150,48 @@ populateUnitig(UnitigVector &unitigs,
assert(bestedge3->ahang() >= 0);
assert(bestedge3->bhang() >= 0);
- // If this fragment is not covered by the two best overlaps we are finished. We will not follow
- // the paths out. This indicates either low coverage, or a chimeric fragment. If it is low
+ // If this read is not covered by the two best overlaps we are finished. We will not follow
+ // the paths out. This indicates either low coverage, or a chimeric read. If it is low
// coverage, then the best overlaps will be mutual and we'll recover the same path. If it is a
- // chimeric fragment the overlaps will not be mutual and we will skip this fragment.
+ // chimeric read the overlaps will not be mutual and we will skip this read.
//
- // The amount of our fragment that is covered by the two best overlaps is
+ // The amount of our read that is covered by the two best overlaps is
//
- // (fragLen + bestedge5->bhang()) + (fragLen - bestedge3->ahang())
+ // (readLen + bestedge5->bhang()) + (readLen - bestedge3->ahang())
//
- // If that is not significantly longer than the fragment length, then we will not use this
- // fragment as a seed for unitig construction.
+ // If that is not significantly longer than the read length, then we will not use this
+ // read as a seed for unitig construction.
//
if (OG->isSuspicious(fi))
return;
#if 0
- uint32 covered = FI->fragmentLength(fi) + bestedge5->bhang() + FI->fragmentLength(fi) - bestedge3->ahang();
+ uint32 covered = RI->readLength(fi) + bestedge5->bhang() + RI->readLength(fi) - bestedge3->ahang();
- // This breaks unitigs at 0x best-coverage regions. There might be a contain that spans (joins)
- // the two best overlaps to verify the fragment, but we can't easily tell right now.
- if (covered < FI->fragmentLength(fi) + AS_OVERLAP_MIN_LEN / 2) {
- writeLog("Stopping unitig construction of suspicious frag %d in unitig %d\n",
+ // This breaks tigs at 0x best-coverage regions. There might be a contain that spans (joins)
+ // the two best overlaps to verify the read, but we can't easily tell right now.
+ if (covered < RI->readLength(fi) + AS_OVERLAP_MIN_LEN / 2) {
+ writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
return;
}
#endif
if (logFileFlagSet(LOG_BUILD_UNITIG))
- writeLog("Adding 5' edges off of frag %d in unitig %d\n",
+ writeLog("Adding 5' edges off of read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
- if (bestedge5->fragId())
+ if (bestedge5->readId())
populateUnitig(utg, bestedge5);
utg->reverseComplement(false);
if (logFileFlagSet(LOG_BUILD_UNITIG))
- writeLog("Adding 3' edges off of frag %d in unitig %d\n",
+ writeLog("Adding 3' edges off of read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
- if (bestedge3->fragId())
+ if (bestedge3->readId())
populateUnitig(utg, bestedge3);
// Enabling this reverse complement is known to degrade the assembly. It is not known WHY it
diff --git a/src/bogart/AS_BAT_PopulateUnitig.H b/src/bogart/AS_BAT_PopulateUnitig.H
index 225f5c2..896a77e 100644
--- a/src/bogart/AS_BAT_PopulateUnitig.H
+++ b/src/bogart/AS_BAT_PopulateUnitig.H
@@ -41,7 +41,7 @@
void populateUnitig(Unitig *unitig,
BestEdgeOverlap *nextedge);
-void populateUnitig(UnitigVector &unitigs,
- int32 fragID);
+void populateUnitig(TigVector &tigs,
+ int32 readID);
#endif // INCLUDE_AS_BAT_POPULATUNITIG
diff --git a/src/bogart/AS_BAT_PromoteToSingleton.C b/src/bogart/AS_BAT_PromoteToSingleton.C
index ec9af9a..ecf39e5 100644
--- a/src/bogart/AS_BAT_PromoteToSingleton.C
+++ b/src/bogart/AS_BAT_PromoteToSingleton.C
@@ -35,37 +35,41 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_global.H"
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_Logging.H"
+
#include "AS_BAT_Unitig.H"
+#include "AS_BAT_TigVector.H"
+
+#include "AS_BAT_ReadInfo.H"
-// If we are not reconstructing repeats, promote all the unplaced fragments to new unitigs.
-// Oodles of possibilities here; promote everything to a singleton unitig, promote only
-// the non-contained, then place contains, then promote what is left over, etc.
void
-promoteToSingleton(UnitigVector &unitigs) {
+promoteToSingleton(TigVector &tigs) {
+ uint32 nPromoted = 0;
- for (uint32 fi=1; fi<=FI->numFragments(); fi++) {
- if (Unitig::fragIn(fi) != 0)
- // Placed already
+ for (uint32 fi=1; fi<=RI->numReads(); fi++) {
+ if (tigs.inUnitig(fi) != 0) // Placed.
continue;
- if (FI->fragmentLength(fi) == 0)
- // Deleted.
+ if (RI->readLength(fi) == 0) // Deleted.
continue;
- Unitig *utg = unitigs.newUnitig(false);
- ufNode frag;
+ nPromoted++;
- frag.ident = fi;
- frag.contained = 0;
- frag.parent = 0;
- frag.ahang = 0;
- frag.bhang = 0;
- frag.position.bgn = 0;
- frag.position.end = FI->fragmentLength(fi);
+ Unitig *utg = tigs.newUnitig(false);
+ ufNode read;
- utg->addFrag(frag, 0, false);
+ read.ident = fi;
+ read.contained = 0;
+ read.parent = 0;
+ read.ahang = 0;
+ read.bhang = 0;
+ read.position.bgn = 0;
+ read.position.end = RI->readLength(fi);
+
+ utg->addRead(read, 0, false);
}
+
+ writeStatus("promoteToSingleton()-- Moved " F_U32 " unplaced read%s to singleton tigs.\n",
+ nPromoted, (nPromoted == 1) ? "" : "s");
}
diff --git a/src/bogart/AS_BAT_PromoteToSingleton.H b/src/bogart/AS_BAT_PromoteToSingleton.H
index 29a2b18..b83f651 100644
--- a/src/bogart/AS_BAT_PromoteToSingleton.H
+++ b/src/bogart/AS_BAT_PromoteToSingleton.H
@@ -30,6 +30,6 @@
#ifndef INCLUDE_AS_BAT_PROMOTE_TO_SINGLETON
void
-promoteToSingleton(UnitigVector &unitigs);
+promoteToSingleton(TigVector &tigs);
#endif // INCLUDE_AS_BAT_PROMOTE_TO_SINGLETON
diff --git a/src/bogart/AS_BAT_ReadInfo.C b/src/bogart/AS_BAT_ReadInfo.C
new file mode 100644
index 0000000..0f71a81
--- /dev/null
+++ b/src/bogart/AS_BAT_ReadInfo.C
@@ -0,0 +1,83 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-12
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_Logging.H"
+
+
+
+//  Load the length and library ID of every read in the store.
+//
+//    gkp        - store to read from.
+//    prefix     - not used by this constructor.
+//    minReadLen - reads shorter than this are skipped; their status entry stays zeroed, so
+//                 readLength() reports 0 for them.
+//
+//  The status array is indexed by read ID and has _numReads + 1 entries; the loading loop
+//  starts at read 1, so entry 0 is left zeroed.
+//
+ReadInfo::ReadInfo(gkStore    *gkp,
+                   const char *prefix,
+                   uint32      minReadLen) {
+
+  _numBases     = 0;
+  _numReads     = gkp->gkStore_getNumReads();
+  _numLibraries = gkp->gkStore_getNumLibraries();
+
+  _readStatus   = new ReadStatus [_numReads + 1];
+
+  for (uint32 i=0; i<_numReads + 1; i++) {
+    _readStatus[i].readLength = 0;
+    _readStatus[i].libraryID  = 0;
+    _readStatus[i].isBackbone = false;
+    _readStatus[i].isUnplaced = false;
+    _readStatus[i].isLeftover = false;
+    _readStatus[i].unused     = 0;
+  }
+
+  uint32 numSkipped = 0;
+  uint32 numLoaded  = 0;
+
+  for (uint32 fi=1; fi<=_numReads; fi++) {
+    gkRead  *read = gkp->gkStore_getRead(fi);
+    uint32   iid  = read->gkRead_readID();
+    uint32   len  = read->gkRead_sequenceLength();
+
+    if (len < minReadLen) {
+      numSkipped++;
+      continue;
+    }
+
+    _numBases += len;
+
+    _readStatus[iid].readLength = len;
+    _readStatus[iid].libraryID  = read->gkRead_libraryID();
+
+    numLoaded++;
+  }
+
+  //  The counters are unsigned; print all of them with F_U32 rather than %d.
+
+  if (minReadLen > 0)
+    writeStatus("ReadInfo()-- Using " F_U32 " reads, ignoring " F_U32 " reads less than " F_U32 " bp long.\n",
+                numLoaded, numSkipped, minReadLen);
+  else
+    writeStatus("ReadInfo()-- Using " F_U32 " reads, no minimum read length used.\n",
+                numLoaded);
+}
+
+
+
+//  Release the per-read status array allocated by the constructor.
+ReadInfo::~ReadInfo() {
+ delete [] _readStatus;
+}
diff --git a/src/bogart/AS_BAT_FragmentInfo.H b/src/bogart/AS_BAT_ReadInfo.H
similarity index 53%
rename from src/bogart/AS_BAT_FragmentInfo.H
rename to src/bogart/AS_BAT_ReadInfo.H
index cba2d08..bef0b09 100644
--- a/src/bogart/AS_BAT_FragmentInfo.H
+++ b/src/bogart/AS_BAT_ReadInfo.H
@@ -13,13 +13,9 @@
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
- * This file is derived from:
- *
- * src/bogart/AS_BAT_Datatypes.H
- *
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-APR-28
+ * Brian P. Walenz beginning on 2016-AUG-12
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -27,8 +23,8 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_FRAGMENT_INFO
-#define INCLUDE_AS_BAT_FRAGMENT_INFO
+#ifndef INCLUDE_AS_BAT_READ_INFO
+#define INCLUDE_AS_BAT_READ_INFO
#include "AS_global.H"
#include "ovStore.H"
@@ -42,26 +38,38 @@
-class FragmentInfo {
+struct ReadStatus {
+ uint32 readLength : AS_MAX_READLEN_BITS;
+ uint32 libraryID : AS_MAX_LIBRARIES_BITS;
+
+ uint32 isBackbone : 1; // Used to construct initial contig
+ uint32 isUnplaced : 1; // Placed in initial contig using overlaps
+ uint32 isLeftover : 1; // Not placed
+
+ uint32 unused : (32 - AS_MAX_READLEN_BITS - AS_MAX_LIBRARIES_BITS - 3);
+};
+
+
+
+class ReadInfo {
public:
- FragmentInfo(gkStore *gkp, const char *prefix, uint32 minReadLen);
- ~FragmentInfo();
+ ReadInfo(gkStore *gkp, const char *prefix, uint32 minReadLen);
+ ~ReadInfo();
uint64 memoryUsage(void) {
- return((3 * sizeof(uint32) * _numFragments) +
- (2 * sizeof(double) * _numLibraries) +
- (2 * sizeof(uint32) * _numLibraries));
+ return(sizeof(uint64) + sizeof(uint32) + sizeof(uint32) + sizeof(uint32) * _numReads);
};
- uint32 numFragments(void) { return(_numFragments); };
+ uint64 numBases(void) { return(_numBases); };
+ uint32 numReads(void) { return(_numReads); };
uint32 numLibraries(void) { return(_numLibraries); };
- uint32 fragmentLength(uint32 iid) { return(_fragLength[iid]); };
- uint32 libraryIID(uint32 iid) { return(_libIID[iid]); };
+ uint32 readLength(uint32 iid) { return(_readStatus[iid].readLength); };
+ uint32 libraryIID(uint32 iid) { return(_readStatus[iid].libraryID); };
uint32 overlapLength(uint32 a_iid, uint32 b_iid, int32 a_hang, int32 b_hang) {
- int32 alen = fragmentLength(a_iid);
- int32 blen = fragmentLength(b_iid);
+ int32 alen = readLength(a_iid);
+ int32 blen = readLength(b_iid);
int32 aovl = 0;
int32 bovl = 0;
@@ -83,9 +91,9 @@ public:
}
if ((aovl <= 0) || (bovl <= 0) || (aovl > alen) || (bovl > blen)) {
- fprintf(stderr, "WARNING: bogus overlap found for A="F_U32" B="F_U32"\n", a_iid, b_iid);
- fprintf(stderr, "WARNING: A len="F_S32" hang="F_S32" ovl="F_S32"\n", alen, a_hang, aovl);
- fprintf(stderr, "WARNING: B len="F_S32" hang="F_S32" ovl="F_S32"\n", blen, b_hang, bovl);
+ fprintf(stderr, "WARNING: bogus overlap found for A=" F_U32 " B=" F_U32 "\n", a_iid, b_iid);
+ fprintf(stderr, "WARNING: A len=" F_S32 " hang=" F_S32 " ovl=" F_S32 "\n", alen, a_hang, aovl);
+ fprintf(stderr, "WARNING: B len=" F_S32 " hang=" F_S32 " ovl=" F_S32 "\n", blen, b_hang, bovl);
}
if (aovl < 0) aovl = 0;
@@ -105,19 +113,24 @@ public:
return(aovl);
};
-private:
- void save(const char *prefix);
- bool load(const char *prefix);
+ void setBackbone(uint32 fi) { _readStatus[fi].isBackbone = true; };
+ void setUnplaced(uint32 fi) { _readStatus[fi].isUnplaced = true; };
+ void setLeftover(uint32 fi) { _readStatus[fi].isLeftover = true; };
- uint32 _numFragments;
- uint32 _numLibraries;
+ bool isBackbone(uint32 fi) { return(_readStatus[fi].isBackbone); };
+ bool isUnplaced(uint32 fi) { return(_readStatus[fi].isUnplaced); };
+ bool isLeftover(uint32 fi) { return(_readStatus[fi].isLeftover); };
+
+private:
+ uint64 _numBases;
+ uint32 _numReads;
+ uint32 _numLibraries;
- uint32 *_fragLength;
- uint32 *_libIID;
+ ReadStatus *_readStatus;
};
-extern FragmentInfo *FI;
+extern ReadInfo *RI;
-#endif // INCLUDE_AS_BAT_FRAGMENT_INFO
+#endif // INCLUDE_AS_BAT_READ_INFO
diff --git a/src/bogart/AS_BAT_ReconstructRepeats.C b/src/bogart/AS_BAT_ReconstructRepeats.C
deleted file mode 100644
index bc010bf..0000000
--- a/src/bogart/AS_BAT_ReconstructRepeats.C
+++ /dev/null
@@ -1,94 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * This file is derived from:
- *
- * src/AS_BAT/AS_BAT_ReconstructRepeats.C
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2012-JAN-05 to 2013-AUG-01
- * are Copyright 2012-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2014-DEC-19 to 2015-APR-24
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_BAT_FragmentInfo.H"
-#include "AS_BAT_BestOverlapGraph.H"
-#include "AS_BAT_ChunkGraph.H"
-#include "AS_BAT_Logging.H"
-
-#include "AS_BAT_Unitig.H"
-
-#include "AS_BAT_PlaceFragUsingOverlaps.H"
-
-#include "AS_BAT_PopulateUnitig.H"
-#include "AS_BAT_PlaceContains.H"
-
-
-
-// estimate read error rate from best overlaps (per library?)
-// use that error rate below when rebuilding repeats
-
-void
-reconstructRepeats(UnitigVector &unitigs,
- double erateGraph,
- double deviationGraph) {
-
- // Build a set<> of all the unplaced fragments, then construct a new BOG and CG from which we
- // construct unitigs.
-
- BestOverlapGraph *OGsave = OG;
- ChunkGraph *CGsave = CG;
-
- set<uint32> unplaced;
-
- for (uint32 fi=1; fi<=FI->numFragments(); fi++)
- if (Unitig::fragIn(fi) == 0)
- unplaced.insert(fi);
-
- OG = new BestOverlapGraph(erateGraph / 2.0, deviationGraph, &unplaced);
- CG = new ChunkGraph(&unplaced);
-
- writeLog("==> BUILDING REPEAT UNITIGS from %d fragments.\n", unplaced.size());
-
- for (uint32 fi=CG->nextFragByChunkLength(); fi>0; fi=CG->nextFragByChunkLength())
- populateUnitig(unitigs, fi);
-
- writeLog("==> BUILDING REPEAT UNITIGS catching missed fragments.\n");
-
- for (uint32 fi=1; fi <= FI->numFragments(); fi++)
- populateUnitig(unitigs, fi);
-
- writeLog("==> BUILDING REPEAT UNITIGS placing contained fragments.\n");
-
- placeUnplacedUsingAllOverlaps(unitigs, "PREFIX");
-
- delete OG;
- delete CG;
-
- OG = OGsave;
- CG = CGsave;
-}
diff --git a/src/bogart/AS_BAT_SetParentAndHang.C b/src/bogart/AS_BAT_SetParentAndHang.C
index 6f336e2..4cf2f01 100644
--- a/src/bogart/AS_BAT_SetParentAndHang.C
+++ b/src/bogart/AS_BAT_SetParentAndHang.C
@@ -35,16 +35,16 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
#include "AS_BAT_SetParentAndHang.H"
void
-setParentAndHang(UnitigVector &unitigs) {
+setParentAndHang(TigVector &tigs) {
return;
@@ -54,8 +54,8 @@ setParentAndHang(UnitigVector &unitigs) {
// Just for stats, build a map fo the reads in the unitig.
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *tig = tigs[ti];
if (tig == NULL)
continue;
@@ -75,7 +75,7 @@ setParentAndHang(UnitigVector &unitigs) {
allreads[frg->ident] = true;
}
- // For each fragment, set parent/hangs using the edges.
+ // For each read, set parent/hangs using the edges.
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
@@ -92,7 +92,7 @@ setParentAndHang(UnitigVector &unitigs) {
// Otherwise, find the thickest overlap to any read already placed in the unitig.
uint32 olapsLen = 0;
- BAToverlap *olaps = OC->getOverlaps(frg->ident, AS_MAX_EVALUE, olapsLen);
+ BAToverlap *olaps = OC->getOverlaps(frg->ident, olapsLen);
uint32 tt = UINT32_MAX;
uint32 ttLen = 0;
@@ -118,7 +118,7 @@ setParentAndHang(UnitigVector &unitigs) {
continue;
}
- uint32 l = FI->overlapLength(olaps[oo].a_iid, olaps[oo].b_iid, olaps[oo].a_hang, olaps[oo].b_hang);
+ uint32 l = RI->overlapLength(olaps[oo].a_iid, olaps[oo].b_iid, olaps[oo].a_hang, olaps[oo].b_hang);
// Compute the hangs, so we can ignore those that would place this read before the parent.
// This is a flaw somewhere in bogart, and should be caught and fixed earlier.
@@ -153,21 +153,21 @@ setParentAndHang(UnitigVector &unitigs) {
// If the overlap is worse than the one we already have, we don't care.
- if ((l < ttLen) || // Too short
- (ttErr < olaps[oo].erate)) { // Too noisy
+ if ((l < ttLen) || // Too short
+ (ttErr < olaps[oo].erate())) { // Too noisy
continue;
}
tt = oo;
ttLen = l;
- ttErr = olaps[oo].erate;
+ ttErr = olaps[oo].erate();
}
// If no thickest overlap, we screwed up somewhere. Complain and eject the read.
if (tt == UINT32_MAX) {
fprintf(stderr, "ERROR: read %u in tig %u has no overlap to any previous read, ejected. %u overlaps total. %u negative hang. %u to read not in tig. %u to read later in tig. %u good overlaps.\n",
- frg->ident, tig->tigID(), olapsLen, negHang, notPresent, notPlaced, goodOlap);
+ frg->ident, tig->id(), olapsLen, negHang, notPresent, notPlaced, goodOlap);
continue;
}
@@ -177,6 +177,6 @@ setParentAndHang(UnitigVector &unitigs) {
- } // Over all fragments
- } // Over all unitigs
+ } // Over all reads
+ } // Over all tigs
}
diff --git a/src/bogart/AS_BAT_SetParentAndHang.H b/src/bogart/AS_BAT_SetParentAndHang.H
index 5fb13ea..9d0d6c0 100644
--- a/src/bogart/AS_BAT_SetParentAndHang.H
+++ b/src/bogart/AS_BAT_SetParentAndHang.H
@@ -38,8 +38,8 @@
#ifndef INCLUDE_AS_BAT_SETPARENTANDHANG
#define INCLUDE_AS_BAT_SETPARENTANDHANG
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
-void setParentAndHang(UnitigVector &unitigs);
+void setParentAndHang(TigVector &tigs);
#endif // INCLUDE_AS_BAT_SETPARENTANDHANG
diff --git a/src/bogart/AS_BAT_SplitDiscontinuous.C b/src/bogart/AS_BAT_SplitDiscontinuous.C
index 210ea69..25ee78d 100644
--- a/src/bogart/AS_BAT_SplitDiscontinuous.C
+++ b/src/bogart/AS_BAT_SplitDiscontinuous.C
@@ -40,167 +40,188 @@
#include "AS_BAT_Unitig.H"
+#include "AS_BAT_SplitDiscontinuous.H"
static
-void
-makeNewUnitig(UnitigVector &unitigs,
- uint32 splitFragsLen,
- ufNode *splitFrags) {
- Unitig *dangler = unitigs.newUnitig(false);
+Unitig *
+makeNewUnitig(TigVector &tigs,
+ uint32 splitReadsLen,
+ ufNode *splitReads) {
+
+ if (splitReadsLen == 0) {
+ writeLog("splitDiscontinuous()-- WARNING: tried to make a new tig with no reads!\n");
+ return(NULL);
+ }
+
+ Unitig *newtig = tigs.newUnitig(false);
if (logFileFlagSet(LOG_SPLIT_DISCONTINUOUS))
- writeLog("splitDiscontinuous()-- new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
- dangler->id(), splitFragsLen, splitFrags[0].ident);
+ writeLog("splitDiscontinuous()-- new tig " F_U32 " with " F_U32 " reads (starting at read " F_U32 ").\n",
+ newtig->id(), splitReadsLen, splitReads[0].ident);
- int splitOffset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);
+ int splitOffset = -splitReads[0].position.min();
// This should already be true, but we force it still
- splitFrags[0].contained = 0;
+ splitReads[0].contained = 0;
- for (uint32 i=0; i<splitFragsLen; i++)
- dangler->addFrag(splitFrags[i], splitOffset, false); //logFileFlagSet(LOG_SPLIT_DISCONTINUOUS));
-}
+ for (uint32 i=0; i<splitReadsLen; i++)
+ newtig->addRead(splitReads[i], splitOffset, false); //logFileFlagSet(LOG_SPLIT_DISCONTINUOUS));
+ return(newtig);
+}
-// After splitting and ejecting some contains, check for discontinuous unitigs.
+// Tests if the tig is contiguous.
//
-void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {
-
- writeLog("==> SPLIT DISCONTINUOUS\n");
+bool
+tigIsContiguous(Unitig *tig, uint32 minOverlap) {
+ int32 maxEnd = tig->ufpath[0].position.max();
- uint32 numTested = 0;
- uint32 numSplit = 0;
- uint32 numCreated = 0;
-
- uint32 splitFragsLen = 0;
- uint32 splitFragsMax = 0;
- ufNode *splitFrags = NULL;
-
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
-
- if ((tig == NULL) || (tig->ufpath.size() < 2))
- continue;
+ for (uint32 fi=1; fi<tig->ufpath.size(); fi++) {
+ ufNode *frg = &tig->ufpath[fi];
- // Unitig must be sorted. Someone upstream os screwing this up.
- tig->sort();
+ if (frg->position.min() > maxEnd - minOverlap)
+ return(false);
- // We'll want to build an array of new fragments to split out. This can be up
- // to the size of the largest unitig.
- splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());
-
- // Check that the unitig starts at position zero. Not critical for the next loop, but
- // needs to be dome sometime.
- int32 minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
-
- if (minPos == 0)
- continue;
+ maxEnd = max(maxEnd, frg->position.max());
+ }
- writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);
+ return(true);
+}
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *frg = &tig->ufpath[fi];
- frg->position.bgn -= minPos;
- frg->position.end -= minPos;
- }
- }
- splitFrags = new ufNode [splitFragsMax];
- // Now, finally, we can check for gaps in unitigs.
+// After splitting and ejecting some contains, check for discontinuous tigs.
+//
+void
+splitDiscontinuous(TigVector &tigs, uint32 minOverlap, vector<tigLoc> &tigSource) {
+ uint32 numTested = 0;
+ uint32 numSplit = 0;
+ uint32 numCreated = 0;
- for (uint32 ti=0; ti<unitigs.size(); ti++) {
- Unitig *tig = unitigs[ti];
+ // Sort and make sure the tigs start at zero. Shouldn't be here.
- if ((tig == NULL) || (tig->ufpath.size() < 2))
- continue;
+ for (uint32 ti=0; ti<tigs.size(); ti++)
+ if (tigs[ti])
+ tigs[ti]->cleanUp();
- // We don't expect many unitigs to be broken, so we'll do a first quick pass to just
- // test if it is.
+ // Allocate space for the largest number of reads.
- int32 maxEnd = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
- bool isBroken = false;
+ uint32 splitReadsMax = 0;
- for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
- ufNode *frg = &tig->ufpath[fi];
+ for (uint32 ti=0; ti<tigs.size(); ti++)
+ if ((tigs[ti]) && (splitReadsMax < tigs[ti]->ufpath.size()))
+ splitReadsMax = tigs[ti]->ufpath.size();
- int32 bgn = MIN(frg->position.bgn, frg->position.end);
- int32 end = MAX(frg->position.bgn, frg->position.end);
+ ufNode *splitReads = new ufNode [splitReadsMax];
- if (bgn > maxEnd - minOverlap) {
- isBroken = true;
- break;
- }
+ // Now, finally, we can check for gaps in tigs.
- maxEnd = MAX(maxEnd, end);
- }
+ for (uint32 ti=0; ti<tigs.size(); ti++) {
+ Unitig *tig = tigs[ti];
+ if ((tig == NULL) || (tig->ufpath.size() < 2)) // No tig, or guaranteed to be contiguous.
+ continue;
numTested++;
- if (isBroken == false)
+ if (tigIsContiguous(tig, minOverlap) == true) // No gaps, nothing to do.
continue;
-
numSplit++;
// Dang, busted unitig. Fix it up.
- splitFragsLen = 0;
- maxEnd = 0;
-
if (logFileFlagSet(LOG_SPLIT_DISCONTINUOUS))
- writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
+ writeLog("splitDiscontinuous()-- discontinuous tig " F_U32 " with " F_SIZE_T " reads broken into:\n",
tig->id(), tig->ufpath.size());
+ int32 maxEnd = 0;
+ uint32 splitReadsLen = 0;
+
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
+ int32 bgn = frg->position.min();
+ int32 end = frg->position.max();
- int32 bgn = MIN(frg->position.bgn, frg->position.end);
- int32 end = MAX(frg->position.bgn, frg->position.end);
+ // Good thick overlap exists to this read, save it.
- // Good thick overlap exists to this fragment, save it.
if (bgn <= maxEnd - minOverlap) {
- assert(splitFragsLen < splitFragsMax);
- splitFrags[splitFragsLen++] = *frg;
- maxEnd = MAX(maxEnd, end);
+ assert(splitReadsLen < splitReadsMax);
+ splitReads[splitReadsLen++] = *frg;
+ maxEnd = max(maxEnd, end);
continue;
}
- // No thick overlap found. We need to break right here before the current fragment. We used
- // to try to place contained reads with their container. For simplicity, we instead just
- // make a new unitig, letting the main() decide what to do with them (e.g., bubble pop or try
- // to place all reads in singleton unitigs as contained reads again).
+ // No thick overlap found. We need to break right here before the current read. We used to
+ // try to place contained reads with their container. For simplicity, we instead just make a
+ // new unitig, letting the main() decide what to do with them (e.g., bubble pop or try to
+ // place all reads in singleton tigs as contained reads again).
numCreated++;
- makeNewUnitig(unitigs, splitFragsLen, splitFrags);
- tig = unitigs[ti];
+ Unitig *newtig = makeNewUnitig(tigs, splitReadsLen, splitReads);
+
+ // 'tigs' can be reallocated, so grab the pointer again.
+
+ tig = tigs[ti];
- // Done with the split, save the current fragment. This resets everything.
+ // Keep tracking tigSource.
- splitFragsLen = 0;
- splitFrags[splitFragsLen++] = *frg;
+ if ((tigSource.size() > 0) && (newtig)) {
+ tigSource.resize(newtig->id() + 1);
+
+ tigSource[newtig->id()].cID = tig->id();
+ tigSource[newtig->id()].cBgn = tigSource[ tig->id()].cBgn + splitReads[0].position.min();
+ tigSource[newtig->id()].cEnd = tigSource[newtig->id()].cBgn + newtig->getLength();
+ tigSource[newtig->id()].uID = newtig->id();
+ }
+
+ // Done with the split, save the current read. This resets everything.
+
+ splitReadsLen = 0;
+ splitReads[splitReadsLen++] = *frg;
maxEnd = end;
}
+ // If we did any splitting, then the length of the reads in splitReads will be less than the
+ // length of the path in the current unitig. Make a final new unitig for the remaining reads.
- // If we did any splitting, then the length of the frags in splitFrags will be less than the length
- // of the path in the current unitig. Make a final new unitig for the remaining fragments.
- //
- if (splitFragsLen != tig->ufpath.size()) {
+ if (splitReadsLen != tig->ufpath.size()) {
numCreated++;
- makeNewUnitig(unitigs, splitFragsLen, splitFrags);
+ Unitig *newtig = makeNewUnitig(tigs, splitReadsLen, splitReads);
+
+ if ((tigSource.size() > 0) && (newtig)) {
+ tigSource.resize(newtig->id() + 1);
+
+ tigSource[newtig->id()].cID = tig->id();
+ tigSource[newtig->id()].cBgn = tigSource[ tig->id()].cBgn + splitReads[0].position.min();
+ tigSource[newtig->id()].cEnd = tigSource[newtig->id()].cBgn + newtig->getLength();
+ tigSource[newtig->id()].uID = newtig->id();
+ }
- delete unitigs[ti];
- unitigs[ti] = NULL;
+ delete tigs[ti];
+ tigs[ti] = NULL;
}
}
- writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
- numTested, numSplit, numCreated);
+ delete [] splitReads;
- delete [] splitFrags;
+ if (numSplit == 0)
+ writeStatus("splitDiscontinuous()-- Tested " F_U32 " tig%s, split none.\n",
+ numTested, (numTested == 1) ? "" : "s");
+ else
+ writeStatus("splitDiscontinuous()-- Tested " F_U32 " tig%s, split " F_U32 " tig%s into " F_U32 " new tig%s.\n",
+ numTested, (numTested == 1) ? "" : "s",
+ numSplit, (numSplit == 1) ? "" : "s",
+ numCreated, (numCreated == 1) ? "" : "s");
}
+
+
+// Convenience overload for callers that do not track tig sources: hands the three-argument
+// version an empty tigSource vector, which that version only updates when it is non-empty.
+void
+splitDiscontinuous(TigVector &tigs, uint32 minOverlap) {
+ vector<tigLoc> untracked; // Left empty on purpose.
+
+ splitDiscontinuous(tigs, minOverlap, untracked);
+}
diff --git a/src/bogart/AS_BAT_SplitDiscontinuous.H b/src/bogart/AS_BAT_SplitDiscontinuous.H
index 1251be6..277bc0b 100644
--- a/src/bogart/AS_BAT_SplitDiscontinuous.H
+++ b/src/bogart/AS_BAT_SplitDiscontinuous.H
@@ -38,6 +38,9 @@
#ifndef INCLUDE_AS_BAT_SPLITDISCONTINUOUS
#define INCLUDE_AS_BAT_SPLITDISCONTINUOUS
-void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap);
+#include "AS_BAT_CreateUnitigs.H"
+
+void splitDiscontinuous(TigVector &tigs, uint32 minOverlap, vector<tigLoc> &tigSource);
+void splitDiscontinuous(TigVector &tigs, uint32 minOverlap);
#endif // INCLUDE_AS_BAT_SPLITDISCONTINUOUS
diff --git a/src/bogart/AS_BAT_TigGraph.C b/src/bogart/AS_BAT_TigGraph.C
new file mode 100644
index 0000000..fee75b0
--- /dev/null
+++ b/src/bogart/AS_BAT_TigGraph.C
@@ -0,0 +1,463 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-03
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_PlaceReadUsingOverlaps.H"
+
+#include "AS_BAT_TigGraph.H"
+
+#undef SHOW_EDGES
+#undef SHOW_EDGES_VERBOSE
+
+
+class grEdge {
+public:
+ // Default edge: zeroed, and already marked deleted.
+ grEdge() :
+ tigID(0),
+ bgn(0),
+ end(0),
+ fwd(false),
+ extended(false),
+ deleted(true) { };
+
+ // Live edge into tig 't' spanning 'b' to 'e', with orientation flag 'f'; not yet extended.
+ grEdge(uint32 t, int32 b, int32 e, bool f) :
+ tigID(t),
+ bgn(b),
+ end(e),
+ fwd(f),
+ extended(false),
+ deleted(false) { };
+
+ uint32 tigID; // Tig the placement landed in (tgB)
+ int32 bgn; // Low coordinate of the interval on that tig
+ int32 end; // High coordinate of the interval on that tig
+ bool fwd; // True if tgB is forward relative to tgA (tgA is defined to be forward)
+
+ bool extended; // Set when the current read extended this edge
+ bool deleted; // Set when the edge is finished or abandoned
+};
+
+// Emit GFA 'L' records to BEG for edges from tgA: seed candidate edges by placing tgA's first read,
+// extend them with each later read of tgA, and write an edge once it is within 100bp of an end of tgB.
+void
+emitEdges(TigVector &tigs, // All tigs; reads are placed into these, and tgB is looked up by id
+ Unitig *tgA, // Tig whose reads are walked, starting from firstRead()
+ bool tgAflipped, // Selects the orientation characters written for tgA (and for tgB if the same tig)
+ FILE *BEG, // Open output stream receiving the 'L' records
+ vector<tigLoc> &tigSource) { // If non-empty, matching cID values tag the edge cv:A:T, otherwise cv:A:F
+ vector<overlapPlacement> placements;
+ vector<grEdge> edges;
+
+ // Place the first read.
+
+ ufNode *rdA = tgA->firstRead();
+ uint32 rdAlen = RI->readLength(rdA->ident);
+
+ placeReadUsingOverlaps(tigs, NULL, rdA->ident, placements, placeRead_all);
+
+ //
+ // Somewhere we need to weed out the high error overlaps - Unitig::overlapConsistentWithTig() won't work
+ // because we're at the end of the tig and can have 1x of coverage.
+ //
+
+ // Convert those placements into potential edges.
+ //
+ // Overview: from this first placement, we'll try to extend the tig-tig alignment to generate the
+ // full edge. In pictures:
+ //
+ // <----------------------------------------------- tgA
+ // rd1 ------------>
+ // rd2 -------------->
+ // rd3 <----------------
+ //
+ // --------------------------------> tgB (we don't care about its reads)
+ //
+ // We'll place rd1 in tgB, then place rd2 and extend the alignment, then rd3 and notice that
+ // we've covered all of tgB, so an edge is emitted. If, say, rd2 failed to align fully, we'd
+ // still extend the alignment, and let the total failure of rd3 kill the edge.
+
+ for (uint32 pp=0; pp<placements.size(); pp++) {
+ uint32 tgBid = placements[pp].tigID;
+ Unitig *tgB = tigs[tgBid];
+ uint32 tgBlen = tigs[tgBid]->getLength();
+
+ int32 bgn = placements[pp].verified.min();
+ int32 end = placements[pp].verified.max();
+
+ if ((tgA->id() == tgBid) && // If placed in the same tig and
+ (bgn <= rdA->position.max()) && // at the same location, skip it.
+ (rdA->position.min() <= end))
+ continue;
+
+ if (tgB->_isUnassembled == true) // Ignore placements to unassembled crud.
+ continue;
+
+ // For this to be a valid starting edge, the read must be placed from its beginning. In the
+ // picture above, rd1 must be placed fully to its 5' end. The 3' end can flop around; if the
+ // tig-tig alignment isn't true, then rd2 will fail to align. Note that if the tig-tig
+ // alignment is fully captured by only rd1, its 3' end will flop around, tgB will be fully covered,
+ // and the edge will be emitted.
+
+ if (((rdA->isForward() == true) && (placements[pp].covered.bgn > 0)) ||
+ ((rdA->isReverse() == true) && (placements[pp].covered.end < rdAlen))) {
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u - INCOMPLETELY PLACED outside\n",
+ tgA->id(),
+ rdA->ident, rdA->position.bgn, rdA->position.end,
+ placements[pp].covered.bgn, placements[pp].covered.end,
+ tgBid, bgn, end);
+#endif
+ continue;
+ }
+
+ // Now, if the placed read didn't get placed to its other end, and it's placed in the middle
+ // of the tig, reject the placement.
+
+ if (((rdA->isForward() == true) && (placements[pp].covered.end < rdAlen) && (bgn > 100) && (end + 100 < tgBlen)) ||
+ ((rdA->isReverse() == true) && (placements[pp].covered.bgn > 0) && (bgn > 100) && (end + 100 < tgBlen))) {
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge --- - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u - INCOMPLETELY PLACED inside\n",
+ tgA->id(),
+ rdA->ident, rdA->position.bgn, rdA->position.end,
+ placements[pp].covered.bgn, placements[pp].covered.end,
+ tgBid, bgn, end);
+#endif
+ continue;
+ }
+
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge %3u - tig %6u read %8u %8u-%-8u placed bases %8u-%-8u in tig %6u %8u-%-8u quality %f\n",
+ (uint32)edges.size(),
+ tgA->id(),
+ rdA->ident, rdA->position.bgn, rdA->position.end,
+ placements[pp].covered.bgn, placements[pp].covered.end,
+ tgBid, bgn, end,
+ (double)placements[pp].errors / placements[pp].aligned);
+#endif
+
+ bool fwd = false;
+
+ if (((rdA->isForward() == true) && (placements[pp].verified.isForward() == true)) ||
+ ((rdA->isForward() == false) && (placements[pp].verified.isForward() == false)))
+ fwd = true;
+
+ edges.push_back(grEdge(tgBid, bgn, end, fwd));
+ }
+
+ // Technically, we should run through the edges and emit those that are already satisfied. But
+ // we can defer this until after the second read is processed. Heck, we could defer until all
+ // reads are processed, but cleaning up the list makes us a little faster, and also lets us short
+ // circuit when we run out of potential edges before we run out of reads in the tig.
+
+ // While there are still placements to process, march down the reads in this tig, adding to the
+ // appropriate placement.
+
+ for (uint32 fi=1; (fi<tgA->ufpath.size()) && (edges.size() > 0); fi++) {
+ ufNode *rdA = &tgA->ufpath[fi];
+ uint32 rdAlen = RI->readLength(rdA->ident);
+
+ placeReadUsingOverlaps(tigs, NULL, rdA->ident, placements, placeRead_all);
+
+ // Mark every edge as being not extended.
+
+ for (uint32 ee=0; ee<edges.size(); ee++)
+ edges[ee].extended = false;
+
+ // Merge the new placements with the saved placements.
+
+ for (uint32 pp=0; pp<placements.size(); pp++) {
+ uint32 tgBid = placements[pp].tigID;
+ Unitig *tgB = tigs[tgBid];
+ uint32 tgBlen = tigs[tgBid]->getLength();
+ int32 bgn = placements[pp].verified.min();
+ int32 end = placements[pp].verified.max();
+
+ // Ignore placements to unassembled crud. Just an optimization. We'd filter these out
+ // when trying to associate it with an existing overlap.
+
+ if (tgB->_isUnassembled == true)
+ continue;
+
+ // Accept the placement only if it is for the whole read, or if it is touching the end of the target tig.
+
+ if (((placements[pp].covered.bgn > 0) ||
+ (placements[pp].covered.end < rdAlen)) &&
+ (bgn > 100) &&
+ (end + 100 < tgBlen)) {
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- read %5u incomplete placement covering %5u-%-5u in at %5u-%-5u in tig %4u\n",
+ rdA->ident, placements[pp].covered.bgn, placements[pp].covered.end, bgn, end, tgBid);
+#endif
+ continue;
+ }
+
+ for (uint32 ee=0; ee<edges.size(); ee++) {
+ if (edges[ee].deleted == true) // Invalid or already finished edge.
+ continue;
+
+ if ((tgBid != edges[ee].tigID) || // Wrong tig, keep looking.
+ (end < edges[ee].bgn) || // No intersection, keep looking.
+ (edges[ee].end < bgn))
+ continue;
+
+ // Otherwise, the right tig, and we intersect. Extend the interval and mark it as extended.
+
+ // We're trusting that we don't find some bizarre repeat that would let us match ABC in
+ // tgA against CAB in the target tig. If not, we'll need to keep count of which direction
+ // we extend things in.
+
+
+ // Fail if most of the extension is to the wrong side. We always move to higher
+ // coordinates on tgA. If tgB is forward, it should move to higher coordinates too.
+
+ int32 nbgn = min(edges[ee].bgn, bgn);
+ int32 nend = max(edges[ee].end, end);
+
+ if ((edges[ee].fwd == true) &&
+ (bgn - nbgn > nend - end)) { // If we decrease bgn more than we increased end, fail
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u - wrong direction\n",
+ ee,
+ edges[ee].bgn, edges[ee].end,
+ nbgn, nend,
+ rdA->ident, bgn, end, tgBid);
+#endif
+ continue;
+ }
+
+ // The reverse case is a bit tricky since we're tracking min/max position on tgB.
+ // When we extend on tgA, we expect the bgn to decrease on tgB and the end to stay the same.
+
+ if ((edges[ee].fwd == false) &&
+ (nend - end > bgn - nbgn)) { // If we increase end more than we decreased bgn, fail
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u - wrong direction\n",
+ ee,
+ edges[ee].bgn, edges[ee].end,
+ nbgn, nend,
+ rdA->ident, bgn, end, tgBid);
+#endif
+ continue;
+ }
+
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- edge %3u - extend from %5u-%-5u to %5u-%-5u -- placed read %5u at %5u-%-5u in tig %4u\n",
+ ee,
+ edges[ee].bgn, edges[ee].end,
+ nbgn, nend,
+ rdA->ident, bgn, end, tgBid);
+#endif
+
+ edges[ee].bgn = nbgn;
+ edges[ee].end = nend;
+ edges[ee].extended = true;
+ }
+ }
+
+ // Emit edges that are complete and mark them as done.
+ //
+ // A better idea is to see if this read is overlapping with the first/last read
+ // in the other tig, and we're close enough to the end, instead of these silly 100bp thresholds.
+
+ for (uint32 ee=0; ee<edges.size(); ee++) {
+ bool tgBflipped = (edges[ee].tigID == tgA->id()) && (tgAflipped);
+
+ bool sameContig = false;
+
+ if ((tigSource.size() > 0) && (tigSource[tgA->id()].cID == tigSource[edges[ee].tigID].cID))
+ sameContig = true;
+
+ if ((edges[ee].fwd == false) && (edges[ee].bgn <= 100)) {
+#ifdef SHOW_EDGES_VERBOSE
+ writeLog("emitEdges()-- edge %3u - tig %6u %s edgeTo tig %6u %s of length %6u (%6u-%6u)\n",
+ ee,
+ tgA->id(), tgAflipped ? "<--" : "-->",
+ edges[ee].tigID, tgBflipped ? "-->" : "<--",
+ edges[ee].end - edges[ee].bgn, edges[ee].bgn, edges[ee].end);
+#endif
+ fprintf(BEG, "L\ttig%08u\t%c\ttig%08u\t%c\t%uM%s\n",
+ tgA->id(), tgAflipped ? '-' : '+',
+ edges[ee].tigID, tgBflipped ? '+' : '-',
+ edges[ee].end - edges[ee].bgn,
+ (sameContig == true) ? "\tcv:A:T" : "\tcv:A:F");
+ edges[ee].deleted = true;
+ }
+
+ if ((edges[ee].fwd == true) && (edges[ee].end + 100 >= tigs[edges[ee].tigID]->getLength())) {
+#ifdef SHOW_EDGES_VERBOSE
+ writeLog("emitEdges()-- edge %3u - tig %6u %s edgeTo tig %6u %s of length %6u (%6u-%6u)\n",
+ ee,
+ tgA->id(), tgAflipped ? "<--" : "-->",
+ edges[ee].tigID, tgBflipped ? "<--" : "-->",
+ edges[ee].end - edges[ee].bgn, edges[ee].bgn, edges[ee].end);
+#endif
+ fprintf(BEG, "L\ttig%08u\t%c\ttig%08u\t%c\t%uM%s\n",
+ tgA->id(), tgAflipped ? '-' : '+',
+ edges[ee].tigID, tgBflipped ? '-' : '+',
+ edges[ee].end - edges[ee].bgn,
+ (sameContig == true) ? "\tcv:A:T" : "\tcv:A:F");
+ edges[ee].deleted = true;
+ }
+ }
+
+ // A bit of cleverness. If we emit edges before dealing with deleted and non-extended edges, the first
+ // time we hit this code we'll emit edges for both the first read and the second read.
+
+ for (uint32 ee=0; ee<edges.size(); ee++) {
+ bool tgBflipped = (edges[ee].tigID == tgA->id()) && (tgAflipped);
+
+ if (edges[ee].fwd == false)
+ tgBflipped = !tgBflipped;
+
+ if (edges[ee].extended == true)
+ continue;
+
+#ifdef SHOW_EDGES
+ writeLog("emitEdges()-- tig %6u %s edgeTo tig %6u %s [0 %u-%u %u] UNSATISFIED at read %u #%u\n",
+ tgA->id(), tgAflipped ? "<--" : "-->",
+ edges[ee].tigID, tgBflipped ? "<--" : "-->",
+ edges[ee].bgn, edges[ee].end, tigs[edges[ee].tigID]->getLength(),
+ rdA->ident, fi);
+#endif
+
+ edges[ee].deleted = true;
+ }
+
+ // Compress the edges list (optional) to remove the deleted edges.
+
+ uint32 oo = 0;
+
+ for (uint32 ee=0; ee<edges.size(); ee++) {
+ if (edges[ee].deleted == false) { // Not deleted, so copy it to the output vector
+ if (ee != oo) // at location oo.
+ edges[oo] = edges[ee];
+ oo++;
+ }
+ }
+
+ edges.resize(oo); // Reset the vector to size we ended up with.
+
+ // And now place the next read in the source tig.
+ }
+
+ // Any edges still on the list aren't edges, so we're all done without needing to check anything.
+
+#ifdef SHOW_EDGES
+ for (uint32 ee=0; ee<edges.size(); ee++) {
+ bool tgBflipped = (edges[ee].tigID == tgA->id()) && (tgAflipped);
+
+ if (edges[ee].fwd == false)
+ tgBflipped = !tgBflipped;
+
+ if (edges[ee].extended == false)
+ writeLog("emitEdges()-- tig %6u %s edgeTo tig %6u %s [0 %u-%u %u] UNSATISFIED after all reads\n",
+ tgA->id(), tgAflipped ? "<--" : "-->",
+ edges[ee].tigID, tgBflipped ? "<--" : "-->",
+ edges[ee].bgn, edges[ee].end, tigs[edges[ee].tigID]->getLength());
+ }
+#endif
+}
+
+
+
+// Unlike placing bubbles and repeats, we don't have enough coverage to do any
+// fancy filtering based on the error profile. We thus fall back to using
+// the filtering for best edges.
+
+// Write the tig graph to '<prefix>.<label>.gfa': a header record, one 'S' record for each assembled
+// tig, then the 'L' records emitEdges() finds from each tig in both orientations. 'tigSource' is
+// passed through to emitEdges(). If the file cannot be opened, a status message is written and
+// nothing is generated.
+void
+reportTigGraph(TigVector &tigs,
+ vector<tigLoc> &tigSource,
+ const char *prefix,
+ const char *label) {
+ char N[FILENAME_MAX];
+
+ writeLog("\n");
+ writeLog("----------------------------------------\n");
+ writeLog("Generating graph\n");
+
+ writeStatus("AssemblyGraph()-- generating '%s.%s.gfa'.\n", prefix, label);
+
+ snprintf(N, FILENAME_MAX, "%s.%s.gfa", prefix, label);
+
+ FILE *BEG = fopen(N, "w");
+
+ if (BEG == NULL) {
+ writeStatus("reportTigGraph()-- failed to open '%s' for writing; no graph generated.\n", N);
+ return;
+ }
+
+ // Write a header. You've gotta start somewhere!
+
+ fprintf(BEG, "H\tVN:Z:bogart/edges\n");
+
+ // Then write the sequences used in the graph. Unlike the read and contig graphs, every sequence
+ // in our set is output. By construction, only valid unitigs are in it. Though we occasionally
+ // make a disconnected unitig and need to split it again.
+
+ for (uint32 ti=1; ti<tigs.size(); ti++)
+ if ((tigs[ti] != NULL) && (tigs[ti]->_isUnassembled == false))
+ fprintf(BEG, "S\ttig%08u\t*\tLN:i:%u\n", ti, tigs[ti]->getLength());
+
+ // Run through all the tigs, emitting edges for the first and last read.
+
+ for (uint32 ti=1; ti<tigs.size(); ti++) {
+ Unitig *tgA = tigs[ti];
+
+ if ((tgA == NULL) || (tgA->_isUnassembled == true))
+ continue;
+
+#ifdef SHOW_EDGES
+ writeLog("\n");
+ writeLog("reportTigGraph()-- tig %u len %u reads %u - firstRead %u\n",
+ ti, tgA->getLength(), (uint32)tgA->ufpath.size(), tgA->firstRead()->ident);
+#endif
+
+ emitEdges(tigs, tgA, false, BEG, tigSource);
+
+#ifdef SHOW_EDGES
+ writeLog("\n");
+ writeLog("reportTigGraph()-- tig %u len %u reads %u - lastRead %u\n",
+ ti, tgA->getLength(), (uint32)tgA->ufpath.size(), tgA->lastRead()->ident);
+#endif
+
+ // Reverse-complement the tig for the second pass, then restore the original orientation.
+ tgA->reverseComplement();
+ emitEdges(tigs, tgA, true, BEG, tigSource);
+ tgA->reverseComplement();
+ }
+
+ if (fclose(BEG) != 0)
+ writeStatus("reportTigGraph()-- failed to close '%s'; the graph may be incomplete.\n", N);
+}
diff --git a/src/bogart/AS_BAT_PopBubbles.H b/src/bogart/AS_BAT_TigGraph.H
similarity index 70%
rename from src/bogart/AS_BAT_PopBubbles.H
rename to src/bogart/AS_BAT_TigGraph.H
index 8ab8db6..6147b33 100644
--- a/src/bogart/AS_BAT_PopBubbles.H
+++ b/src/bogart/AS_BAT_TigGraph.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-MAR-11
+ * Brian P. Walenz beginning on 2016-OCT-03
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,16 +23,21 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_BUBBLEPOPPING
-#define INCLUDE_AS_BAT_BUBBLEPOPPING
+#ifndef INCLUDE_AS_BAT_TIGGRAPH
+#define INCLUDE_AS_BAT_TIGGRAPH
#include "AS_global.H"
-#include "AS_BAT_BestOverlapGraph.H"
+
#include "AS_BAT_Unitig.H"
+#include "AS_BAT_TigVector.H"
+
+#include "AS_BAT_CreateUnitigs.H"
void
-popBubbles(UnitigVector &unitigs,
- double deviationBubble);
+reportTigGraph(TigVector &tigs,
+ vector<tigLoc> &tigSource,
+ const char *prefix,
+ const char *label);
-#endif // INCLUDE_AS_BAT_BUBBLEPOPPING
+#endif // INCLUDE_AS_BAT_TIGGRAPH
diff --git a/src/bogart/AS_BAT_UnitigVector.C b/src/bogart/AS_BAT_TigVector.C
similarity index 68%
rename from src/bogart/AS_BAT_UnitigVector.C
rename to src/bogart/AS_BAT_TigVector.C
index 1fc99d6..ee5121c 100644
--- a/src/bogart/AS_BAT_UnitigVector.C
+++ b/src/bogart/AS_BAT_TigVector.C
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-APR-06
+ * Brian P. Walenz beginning on 2016-AUG-09
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -26,47 +26,70 @@
#include "AS_BAT_Logging.H"
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
-UnitigVector::UnitigVector() {
+TigVector::TigVector(uint32 nReads) {
+
+ // The read-to-tig map
+
+ _inUnitig = new uint32 [nReads + 1];
+ _ufpathIdx = new uint32 [nReads + 1];
+
+ for (uint32 ii=0; ii<nReads+1; ii++) {
+ _inUnitig[ii] = 0;
+ _ufpathIdx[ii] = UINT32_MAX;
+ }
+
+ // The vector
+
_blockSize = 1048576;
+
_numBlocks = 1;
_maxBlocks = 1024;
_blocks = new Unitig ** [_maxBlocks];
_blocks[0] = new Unitig * [_blockSize];
- _blocks[0][0] = NULL; // No first unitig.
+ memset(_blocks[0], 0, sizeof(Unitig **) * _blockSize);
_blockNext = 1;
- _totalUnitigs = 1;
+
+ _totalTigs = 1;
};
-UnitigVector::~UnitigVector() {
+TigVector::~TigVector() {
+
+ // Delete the maps
+
+ delete [] _inUnitig;
+ delete [] _ufpathIdx;
+
+ // Delete the tigs.
- // Delete the unitigs.
for (uint32 ii=0; ii<_numBlocks; ii++)
for (uint32 jj=0; jj<_blockSize; jj++)
delete _blocks[ii][jj];
// Delete the blocks.
+
for (uint32 ii=0; ii<_numBlocks; ii++)
delete [] _blocks[ii];
// And the block pointers.
+
delete [] _blocks;
};
Unitig *
-UnitigVector::newUnitig(bool verbose) {
- Unitig *u = new Unitig();
+TigVector::newUnitig(bool verbose) {
+ Unitig *u = new Unitig(this);
#pragma omp critical
{
- u->_id = _totalUnitigs++;
+ u->_id = _totalTigs++;
if (verbose)
writeLog("Creating Unitig %d\n", u->_id);
@@ -98,7 +121,7 @@ UnitigVector::newUnitig(bool verbose) {
void
-UnitigVector::deleteUnitig(uint32 i) {
+TigVector::deleteUnitig(uint32 i) {
delete _blocks[i / _blockSize][i % _blockSize];
_blocks[i / _blockSize][i % _blockSize] = NULL;
}
@@ -110,15 +133,15 @@ Unitig *&operator[](uint32 i) {
uint32 idx = i / _blockSize;
uint32 pos = i % _blockSize;
- if (((i >= _totalUnitigs)) ||
+ if (((i >= _totalTigs)) ||
((idx >= _numBlocks)) ||
(((pos >= _blockNext) && (idx >= _numBlocks - 1)))) {
- fprintf(stderr, "UnitigVector::operator[]()-- i="F_U32" with totalUnitigs="F_U64"\n", i, _totalUnitigs);
- fprintf(stderr, "UnitigVector::operator[]()-- blockSize="F_U64"\n", _blockSize);
- fprintf(stderr, "UnitigVector::operator[]()-- idx="F_U32" numBlocks="F_U64"\n", idx, _numBlocks);
- fprintf(stderr, "UnitigVector::operator[]()-- pos="F_U32" blockNext="F_U64"\n", pos, _blockNext);
+ writeStatus("TigVector::operator[]()-- i=" F_U32 " with totalTigs=" F_U64 "\n", i, _totalTigs);
+ writeStatus("TigVector::operator[]()-- blockSize=" F_U64 "\n", _blockSize);
+ writeStatus("TigVector::operator[]()-- idx=" F_U32 " numBlocks=" F_U64 "\n", idx, _numBlocks);
+ writeStatus("TigVector::operator[]()-- pos=" F_U32 " blockNext=" F_U64 "\n", pos, _blockNext);
}
- assert(i < _totalUnitigs);
+ assert(i < _totalTigs);
assert((idx < _numBlocks));
assert((pos < _blockNext) || (idx < _numBlocks - 1));
@@ -133,12 +156,12 @@ Unitig *&operator[](uint32 i) {
void
-UnitigVector::computeArrivalRate(const char *prefix, const char *label) {
+TigVector::computeArrivalRate(const char *prefix, const char *label) {
uint32 tiLimit = size();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
- fprintf(stderr, "Computing arrival rates for %u unitigs using %u threads.\n", tiLimit, numThreads);
+ writeStatus("computeArrivalRate()-- Computing arrival rates for %u tigs, with %u thread%s.\n", tiLimit, numThreads, (numThreads == 1) ? "" : "s");
vector<int32> hist[6];
@@ -158,7 +181,7 @@ UnitigVector::computeArrivalRate(const char *prefix, const char *label) {
for (uint32 ii=1; ii<6; ii++) {
char N[FILENAME_MAX];
- sprintf(N, "%s.arrivalRate.%u.dat", prefix, ii);
+ snprintf(N, FILENAME_MAX, "%s.arrivalRate.%u.dat", prefix, ii);
FILE *F = fopen(N, "w");
for (uint32 jj=0; jj<hist[ii].size(); jj++)
fprintf(F, "%d\n", hist[ii][jj]);
@@ -172,14 +195,14 @@ UnitigVector::computeArrivalRate(const char *prefix, const char *label) {
void
-UnitigVector::computeErrorProfiles(const char *prefix, const char *label) {
+TigVector::computeErrorProfiles(const char *prefix, const char *label) {
uint32 tiLimit = size();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
- fprintf(stderr, "Computing error profiles for %u unitigs using %u threads.\n", tiLimit, numThreads);
+ writeStatus("computeErrorProfiles()-- Computing error profiles for %u tigs, with %u thread%s.\n", tiLimit, numThreads, (numThreads == 1) ? "" : "s");
- //#pragma omp parallel for schedule(dynamic, blockSize)
+#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
@@ -191,14 +214,12 @@ UnitigVector::computeErrorProfiles(const char *prefix, const char *label) {
tig->computeErrorProfile(prefix, label);
}
-
- fprintf(stderr, "Computing error profiles - FINISHED.\n");
}
void
-UnitigVector::reportErrorProfiles(const char *prefix, const char *label) {
+TigVector::reportErrorProfiles(const char *prefix, const char *label) {
uint32 tiLimit = size();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;
diff --git a/src/bogart/AS_BAT_UnitigVector.H b/src/bogart/AS_BAT_TigVector.H
similarity index 57%
rename from src/bogart/AS_BAT_UnitigVector.H
rename to src/bogart/AS_BAT_TigVector.H
index 5f12276..1970244 100644
--- a/src/bogart/AS_BAT_UnitigVector.H
+++ b/src/bogart/AS_BAT_TigVector.H
@@ -15,7 +15,7 @@
*
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-APR-06
+ * Brian P. Walenz beginning on 2016-AUG-09
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,22 +23,22 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_UNITIG_VECTOR
-#define INCLUDE_AS_BAT_UNITIG_VECTOR
+#ifndef INCLUDE_AS_BAT_TIGVECTOR
+#define INCLUDE_AS_BAT_TIGVECTOR
#include "AS_global.H"
class Unitig;
-class UnitigVector {
+class TigVector {
public:
- UnitigVector();
- ~UnitigVector();
+ TigVector(uint32 nReads);
+ ~TigVector();
Unitig *newUnitig(bool verbose);
void deleteUnitig(uint32 i);
- size_t size(void) { return(_totalUnitigs); };
+ size_t size(void) { return(_totalTigs); };
Unitig *&operator[](uint32 i) { return(_blocks[i / _blockSize][i % _blockSize]); };
void computeArrivalRate(const char *prefix, const char *label);
@@ -46,16 +46,31 @@ public:
void computeErrorProfiles(const char *prefix, const char *label);
void reportErrorProfiles(const char *prefix, const char *label);
+ // Mapping from read to position in a tig.
+public:
+ void registerRead(uint32 readId, uint32 tigid=0, uint32 ufpathidx=UINT32_MAX) {
+ _inUnitig[readId] = tigid;
+ _ufpathIdx[readId] = ufpathidx;
+ };
+
+ uint32 inUnitig(uint32 readId) { return(_inUnitig[readId]); };
+ uint32 ufpathIdx(uint32 readId) { return(_ufpathIdx[readId]); };
+
+private:
+ uint32 *_inUnitig; // Maps a read iid to a unitig id.
+ uint32 *_ufpathIdx; // Maps a read iid to an index in ufpath
+
+ // The actual vector.
private:
- uint64 _blockSize;
+ uint64 _blockSize;
- uint64 _numBlocks;
- uint64 _maxBlocks;
- Unitig ***_blocks;
- uint64 _blockNext;
+ uint64 _numBlocks;
+ uint64 _maxBlocks;
+ Unitig ***_blocks;
+ uint64 _blockNext;
- uint64 _totalUnitigs;
+ uint64 _totalTigs;
};
-#endif // INCLUDE_AS_BAT_UNITIG_VECTOR
+#endif // INCLUDE_AS_BAT_TIGVECTOR
diff --git a/src/bogart/AS_BAT_Unitig.C b/src/bogart/AS_BAT_Unitig.C
index d345120..094e644 100644
--- a/src/bogart/AS_BAT_Unitig.C
+++ b/src/bogart/AS_BAT_Unitig.C
@@ -37,25 +37,22 @@
#include "AS_global.H"
#include "AS_BAT_Unitig.H"
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_Logging.H"
static std::map<uint32,int>* containPartialOrder;
-uint32* Unitig::_inUnitig = NULL;
-uint32* Unitig::_pathPosition = NULL;
-
-
#undef SHOW_PROFILE_CONSTRUCTION
#undef SHOW_PROFILE_CONSTRUCTION_DETAILS
void
Unitig::reverseComplement(bool doSort) {
- // If there are contained fragments, we need to sort by position to place them correctly after
- // their containers. If there are no contained fragments, sorting can break the initial unitig
- // building. When two frags start at position zero, we'll exchange the order. Initial unitig
- // building depends on having the first fragment added become the last fragment in the unitig
+ // If there are contained reads, we need to sort by position to place them correctly after
+ // their containers. If there are no contained reads, sorting can break the initial unitig
+ // building. When two reads start at position zero, we'll exchange the order. Initial unitig
+ // building depends on having the first read added become the last read in the unitig
// after reversing.
for (uint32 fi=0; fi<ufpath.size(); fi++) {
@@ -72,7 +69,7 @@ Unitig::reverseComplement(bool doSort) {
}
// We've updated the positions of everything. Now, sort or reverse the list, and rebuild the
- // pathPosition map.
+ // ufpathIdx map.
if (doSort) {
sort();
@@ -80,16 +77,43 @@ Unitig::reverseComplement(bool doSort) {
std::reverse(ufpath.begin(), ufpath.end());
for (uint32 fi=0; fi<ufpath.size(); fi++)
- _pathPosition[ufpath[fi].ident] = fi;
+ _vector->registerRead(ufpath[fi].ident, _id, fi);
}
}
+// Ensure that the children are sorted by begin position, and that unitigs start at position zero.
+
+void
+Unitig::cleanUp(void) {
+
+ if (ufpath.size() > 1)
+ sort();
+
+ int32 minPos = ufpath[0].position.min();
+
+ if (minPos == 0)
+ return;
+
+ for (uint32 fi=0; fi<ufpath.size(); fi++) {
+ ufpath[fi].position.bgn -= minPos;
+ ufpath[fi].position.end -= minPos;
+ }
+
+ _length = 0;
+
+ for (uint32 fi=0; fi<ufpath.size(); fi++) { // Could use position.max(), but since
+ _length = max(_length, ufpath[fi].position.bgn); // it too calls max(), there's no win
+ _length = max(_length, ufpath[fi].position.end);
+ }
+}
+
+
class epOlapDat {
public:
- epOlapDat(uint32 p, bool o, double e) {
+ epOlapDat(uint32 p, bool o, float e) {
pos = p;
open = o;
erate = e;
@@ -97,9 +121,9 @@ public:
bool operator<(const epOlapDat &that) const { return(pos < that.pos); };
- uint32 pos;
- bool open;
- double erate;
+ uint32 pos : 31;
+ bool open : 1;
+ float erate;
};
@@ -136,22 +160,11 @@ Unitig::computeArrivalRate(const char *UNUSED(prefix),
-
-
-
-#if 1
-void
-Unitig::computeErrorProfileApproximate(const char *UNUSED(prefix), const char *UNUSED(label)) {
-}
-#endif
-
-
-
void
Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label)) {
#ifdef SHOW_PROFILE_CONSTRUCTION
- writeLog("Find error profile for tig "F_U32" of length "F_U32" with "F_SIZE_T" reads.\n",
+ writeLog("errorProfile()-- Find error profile for tig " F_U32 " of length " F_U32 " with " F_SIZE_T " reads.\n",
id(), getLength(), ufpath.size());
#endif
@@ -160,95 +173,53 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
vector<epOlapDat> olaps;
-
-
- // Pick a set of reads to use. We need full coverage in overlaps.
-
-
-
-
// Scan overlaps to find those that we care about, and save their endpoints.
for (uint32 fi=0; fi<ufpath.size(); fi++) {
ufNode *rdA = &ufpath[fi];
- bool rdAfwd = (rdA->position.bgn < rdA->position.end);
- int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
- int32 rdAhi = (rdAfwd) ? rdA->position.end : rdA->position.bgn;
+ int32 rdAlo = rdA->position.min();
+ int32 rdAhi = rdA->position.max();
uint32 ovlLen = 0;
- BAToverlap *ovl = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);
-
- uint32 nDiffTig = 0;
- uint32 nDiffPos = 0;
- uint32 nIsect = 0;
+ BAToverlap *ovl = OC->getOverlaps(rdA->ident, ovlLen);
for (uint32 oi=0; oi<ovlLen; oi++) {
+ if (id() != _vector->inUnitig(ovl[oi].b_iid)) // Reads in different tigs?
+ continue; // Don't care about this overlap.
- // Reads in different tigs? Don't care about this overlap.
+ ufNode *rdB = &ufpath[ _vector->ufpathIdx(ovl[oi].b_iid) ];
- if (id() != Unitig::fragIn(ovl[oi].b_iid)) {
- nDiffTig++;
- continue;
- }
+ if (rdA->ident < rdB->ident) // Only want to see one overlap
+ continue; // for each pair.
+
+ int32 rdBlo = rdB->position.min();
+ int32 rdBhi = rdB->position.max();
- // Reads in same tig but not overlapping? Don't care about this overlap.
+ if ((rdAhi <= rdBlo) || (rdBhi <= rdAlo)) // Reads in same tig but not overlapping?
+ continue; // Don't care about this overlap.
- ufNode *rdB = &ufpath[ Unitig::pathPosition(ovl[oi].b_iid) ];
- bool rdBfwd = (rdB->position.bgn < rdB->position.end);
- int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
- int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn;
+ uint32 bgn = max(rdAlo, rdBlo);
+ uint32 end = min(rdAhi, rdBhi);
- if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) {
- nDiffPos++;
#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("diffPos rdA %u=%u %u-%u rdB %u=%u %u-%u\n",
- ovl[oi].a_iid, rdA->ident, rdAlo, rdAhi,
- ovl[oi].b_iid, rdB->ident, rdBlo, rdBhi);
+ writeLog("errorProfile()-- olap[%u] %u %u begin %u end %u\n", oi, rdA->ident, rdB->ident, bgn, end);
#endif
- continue;
- }
-
- // Now figure out what region is covered by the overlap.
-
- int32 tiglo = 0;
- int32 tighi = FI->fragmentLength(rdA->ident);
- if (ovl[oi].a_hang > 0)
- tiglo += ovl[oi].a_hang; // Postiive hang!
-
- if (ovl[oi].b_hang < 0)
- tighi += ovl[oi].b_hang; // Negative hang!
-
- assert(0 <= tiglo);
- assert(0 <= tighi);
- assert(tiglo <= tighi);
- assert(tiglo <= FI->fragmentLength(rdA->ident));
- assert(tighi <= FI->fragmentLength(rdA->ident));
-
- // Offset and adjust to tig coordinates
-
- // Beacuse the read is placed with a lot of fudging in the positions, we need
- // to scale the coordinates we compute here.
- double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdA->ident);
-
- uint32 bgn = (uint32)floor(rdAlo + sc * tiglo);
- uint32 end = (uint32)floor(rdAlo + sc * tighi);
-
- nIsect++;
-
- olaps.push_back(epOlapDat(bgn, true, ovl[oi].erate));
- olaps.push_back(epOlapDat(end, false, ovl[oi].erate));
+ olaps.push_back(epOlapDat(bgn, true, ovl[oi].erate())); // Save an open event,
+ olaps.push_back(epOlapDat(end, false, ovl[oi].erate())); // and a close event.
}
-
-#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("tig %u read %u with %u overlaps - diffTig %u diffPos %u intersect %u\n",
- id(), rdA->ident, ovlLen, nDiffTig, nDiffPos, nIsect);
-#endif
}
-#ifdef SHOW_PROFILE_CONSTRUCTION
- writeLog("tig %u generated "F_SIZE_T" olaps.\n", id(), olaps.size());
-#endif
+ // Warn if no overlaps.
+
+ if (olaps.size() == 0) {
+ writeLog("WARNING: tig %u length %u nReads %u has no overlaps.\n", id(), getLength(), ufpath.size());
+ for (uint32 fi=0; fi<ufpath.size(); fi++)
+ writeLog("WARNING: read %7u %7u-%-7u\n",
+ ufpath[fi].ident,
+ ufpath[fi].position.bgn,
+ ufpath[fi].position.end);
+ }
// Sort.
@@ -259,7 +230,10 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
// region. And one more, for convenience, to hold the final 'close' values on intervals that
// extend to the end of the unitig.
- if (olaps[0].pos != 0)
+ if (olaps.size() == 0)
+ errorProfile.push_back(epValue(0, getLength()));
+
+ if ((olaps.size() > 0) && (olaps[0].pos != 0))
errorProfile.push_back(epValue(0, olaps[0].pos));
for (uint32 bb=0, ii=1; ii<olaps.size(); ii++) {
@@ -269,36 +243,47 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
errorProfile.push_back(epValue(olaps[bb].pos, olaps[ii].pos));
#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("tig %u make region bb=%u ii=%i - %u %u\n", id(), bb, ii, olaps[bb].pos, olaps[ii].pos);
+ writeLog("errorProfile()-- tig %u make region [%u-%u] @ %u-%u\n", id(), bb, ii, olaps[bb].pos, olaps[ii].pos);
#endif
bb = ii;
}
- if (olaps[olaps.size()-1].pos != getLength())
+ if ((olaps.size() > 0) && (olaps[olaps.size()-1].pos != getLength()))
errorProfile.push_back(epValue(olaps[olaps.size()-1].pos, getLength()));
errorProfile.push_back(epValue(getLength(), getLength()+1));
#ifdef SHOW_PROFILE_CONSTRUCTION
- writeLog("tig %u generated "F_SIZE_T" profile regions.\n", id(), errorProfile.size());
+ writeLog("errorProfile()-- tig %u generated " F_SIZE_T " profile regions from " F_SIZE_T " overlaps.\n", id(), errorProfile.size(), olaps.size());
#endif
// Walk both lists, adding positive erates and removing negative erates.
- stdDev<double> curDev;
+ stdDev<float> curDev;
for (uint32 oo=0, ee=0; oo<olaps.size(); oo++) {
if (olaps[oo].pos != errorProfile[ee].bgn) // Move to the next profile if the pos is different.
ee++; // By construction, this single step should be all we need.
#ifdef SHOW_PROFILE_CONSTRUCTION_DETAILS
- writeLog("oo=%u bgn=%u -- ee=%u bgn=%u -- olaps.size "F_SIZE_T" errorProfile.size "F_SIZE_T" -- insert %d erate %f\n",
+ writeLog("errorProfile()-- olap[%u] @ %u ep[%u] @ %u %s %f %f +- %f size %u\n",
oo, olaps[oo].pos,
ee, errorProfile[ee].bgn,
- olaps.size(), errorProfile.size(),
- olaps[oo].open, olaps[oo].erate);
+ olaps[oo].open ? "I" : "R",
+ olaps[oo].erate,
+ curDev.mean(), curDev.variance(), curDev.size());
+
+ if ((olaps[oo].open == false) && (curDev.size() == 0)) {
+ for (uint32 fi=0; fi<ufpath.size(); fi++) {
+ ufNode *frg = &ufpath[fi];
+ writeLog("read %6u %6u-%6u\n", frg->ident, frg->position.bgn, frg->position.end);
+ }
+
+ writeLog("errorProfile()-- remove from empty set?\n");
+ flushLog();
+ }
#endif
assert(olaps[oo].pos == errorProfile[ee].bgn);
@@ -313,6 +298,40 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
errorProfile[ee].dev = curDev;
}
+ // Finalize the values.
+
+ for (uint32 bi=0; bi<errorProfile.size(); bi++)
+ errorProfile[bi].dev.finalize();
+
+ // Adjust regions that have no overlaps (mean == 0) to be the average of the adjacent regions.
+ // There are always at least two elements in the profile list: one that starts at coordinate 0,
+ // and the terminating one at coordinate (len, len+1).
+
+ for (uint32 bi=0; bi<errorProfile.size(); bi++) {
+ if (errorProfile[bi].dev.mean() != 0)
+ continue;
+
+ // Set any initial zero coverage area to the next one.
+ if (bi == 0) {
+ errorProfile[bi].dev = errorProfile[bi+1].dev;
+ }
+
+ // Set intermediate ones to the average.
+ else if (bi < errorProfile.size() - 2) {
+ //writeLog("errorProfile()-- tig %u no overlap coverage %u-%u\n", id(), errorProfile[bi].bgn, errorProfile[bi].end);
+
+ errorProfile[bi].dev = stdDev<float>((errorProfile[bi-1].dev.mean() + errorProfile[bi+1].dev.mean()) / 2,
+ (errorProfile[bi-1].dev.stddev() + errorProfile[bi+1].dev.stddev()) / 2,
+ 1);
+ }
+
+ // Set the last two - the last real one and the terminator - to the previous one.
+ else {
+ errorProfile[bi].dev = errorProfile[bi-1].dev;
+ }
+ }
+
+
// Build an index.
// bi - base we are indexing.
// pi - profile
@@ -329,12 +348,9 @@ Unitig::computeErrorProfile(const char *UNUSED(prefix), const char *UNUSED(label
}
}
- // Finalize the values.
- for (uint32 bi=0; bi<errorProfile.size(); bi++)
- errorProfile[bi].dev.finalize();
- //writeLog("tig %u generated "F_SIZE_T" profile regions with "F_U64" overlap pieces.\n",
+ //writeLog("errorProfile()-- tig %u generated " F_SIZE_T " profile regions with " F_U64 " overlap pieces.\n",
// id(), errorProfile.size(), nPieces);
}
@@ -388,7 +404,7 @@ Unitig::overlapConsistentWithTig(double deviations,
uint32 pbi = bgn / 1000;
if (errorProfileIndex.size() <= pbi)
- fprintf(stderr, "errorProfileIndex.size() = "F_SIZE_T"\n", errorProfileIndex.size());
+ fprintf(stderr, "errorProfileIndex.size() = " F_SIZE_T " but pbi = " F_U32 "\n", errorProfileIndex.size(),pbi);
assert(pbi < errorProfileIndex.size());
while ((0 < pbi) && (errorProfile[errorProfileIndex[pbi]].bgn > bgn)) {
@@ -479,7 +495,10 @@ Unitig::reportErrorProfile(const char *prefix, const char *label) {
char N[FILENAME_MAX];
FILE *F;
- sprintf(N, "%s.%s.%08u.profile", prefix, label, id());
+ if (logFileFlagSet(LOG_ERROR_PROFILES) == false)
+ return;
+
+ snprintf(N, FILENAME_MAX, "%s.%s.%08u.profile", prefix, label, id());
F = fopen(N, "w");
@@ -495,7 +514,7 @@ Unitig::reportErrorProfile(const char *prefix, const char *label) {
// Reporting the index isn't generally useful, only for debugging.
#if 0
- sprintf(N, "%s.%s.%08u.profile.index", prefix, label, id());
+ snprintf(N, FILENAME_MAX, "%s.%s.%08u.profile.index", prefix, label, id());
F = fopen(N, "w");
diff --git a/src/bogart/AS_BAT_Unitig.H b/src/bogart/AS_BAT_Unitig.H
index 432ced4..80b3e2d 100644
--- a/src/bogart/AS_BAT_Unitig.H
+++ b/src/bogart/AS_BAT_Unitig.H
@@ -39,7 +39,7 @@
#define INCLUDE_AS_BAT_UNITIG
#include "AS_global.H"
-#include "AS_BAT_UnitigVector.H"
+#include "AS_BAT_TigVector.H"
#include "stddev.H"
@@ -78,15 +78,6 @@ public:
bool operator<(SeqInterval const that) const {
return(min() < that.min());
-#if 0
- if (isReverse()) {
- if (b.isReverse()) return end < that.end;
- else return end < that.bgn;
- } else {
- if (b.isReverse()) return bgn < that.end;
- else return bgn < that.bgn;
- }
-#endif
};
@@ -112,10 +103,10 @@ class ufNode {
public:
uint32 ident;
uint32 contained;
- uint32 parent; // IID of the fragment we align to
+ uint32 parent; // IID of the read we align to
int32 ahang; // If parent defined, these are relative
- int32 bhang; // that fragment
+ int32 bhang; // to that read
SeqInterval position;
@@ -145,10 +136,10 @@ public:
class Unitig {
private:
- Unitig() {
+ Unitig(TigVector *v) {
+ _vector = v;
_length = 0;
_id = 0;
- _tigID = 0;
_isUnassembled = false;
_isBubble = false;
@@ -160,36 +151,37 @@ public:
~Unitig(void) {
};
- friend class UnitigVector;
+ friend class TigVector;
void sort(void) {
std::sort(ufpath.begin(), ufpath.end());
for (uint32 fi=0; fi<ufpath.size(); fi++)
- _pathPosition[ufpath[fi].ident] = fi;
+ _vector->registerRead(ufpath[fi].ident, _id, fi);
};
- //void bubbleSortLastFrag(void);
+ //void bubbleSortLastRead(void);
void reverseComplement(bool doSort=true);
- // getNumRandomFrags() is a placeholder, random frags should not
- // contain guides, or other fragments that are not randomly sampled
+ void cleanUp(void);
+
+ // getNumRandomReads() is a placeholder, random reads should not
+ // contain guides, or other reads that are not randomly sampled
// across the whole genome.
uint32 id(void) { return(_id); }; // ID internal to bogart
- uint32 tigID(void) { return(_tigID); }; // ID in the output store
int32 getLength(void) { return(_length); };
- uint32 getNumFrags(void) { return(ufpath.size()); };
- uint32 getNumRandomFrags(void) { return(getNumFrags()); };
+ uint32 getNumReads(void) { return(ufpath.size()); };
+ uint32 getNumRandomReads(void) { return(getNumReads()); };
- // Place 'frag' using an edge to some read in this tig. The edge is from 'frag3p' end.
+ // Place 'read' using an edge to some read in this tig. The edge is from 'read3p' end.
//
- bool placeFrag(ufNode &frag, // resulting placement
- uint32 fragId, // read we want to place
- bool frag3p, // end that the edge is from
+ bool placeRead(ufNode &read, // resulting placement
+ uint32 readId, // read we want to place
+ bool read3p, // end that the edge is from
BestEdgeOverlap *edge); // edge to something in this tig
- void addFrag(ufNode node, int offset=0, bool report=false);
+ void addRead(ufNode node, int offset=0, bool report=false);
public:
@@ -212,7 +204,7 @@ public:
uint32 bgn;
uint32 end;
- stdDev<double> dev;
+ stdDev<float> dev;
};
static size_t epValueSize(void) { return(sizeof(epValue)); };
@@ -221,7 +213,6 @@ public:
const char *label,
vector<int32> *hist);
- void computeErrorProfileApproximate(const char *prefix, const char *label);
void computeErrorProfile(const char *prefix, const char *label);
void reportErrorProfile(const char *prefix, const char *label);
void clearErrorProfile(void) { errorProfile.clear(); };
@@ -256,59 +247,39 @@ public:
return(rd3);
};
+ // Public Member Variables
+public:
+ vector<ufNode> ufpath;
+ vector<epValue> errorProfile;
+ vector<uint32> errorProfileIndex;
- static void removeFrag(int32 fid) {
- _inUnitig[fid] = 0;
- _pathPosition[fid] = ~0;
- };
-
- static uint32 fragIn(uint32 fragId) {
- if ((_inUnitig == NULL) || (fragId == 0))
- return 0;
- return _inUnitig[fragId];
- };
-
- static uint32 pathPosition(uint32 fragId) {
- if ((_pathPosition == NULL) || (fragId == 0))
- return ~0;
- return _pathPosition[fragId];
- };
-
- static void resetFragUnitigMap(uint32 numFrags) {
- if (_inUnitig == NULL)
- _inUnitig = new uint32[numFrags+1];
- memset(_inUnitig, 0, (numFrags+1) * sizeof(uint32));
-
- if (_pathPosition == NULL)
- _pathPosition = new uint32[numFrags+1];
- memset(_pathPosition, 0, (numFrags+1) * sizeof(uint32));
- };
+public:
+ // r > 0 guards against calling these from Idx's, while r < size guards
+ // against calling with Id's.
+ //
+ uint32 inUnitig(uint32 r) { assert(r > 0); return(_vector->inUnitig(r)); };
+ uint32 ufpathIdx(uint32 r) { assert(r > 0); return(_vector->ufpathIdx(r)); };
- // Public Member Variables
- vector<ufNode> ufpath;
- vector<epValue> errorProfile;
- vector<uint32> errorProfileIndex;
+ ufNode *readFromId(uint32 r) { assert(r > 0); return(&ufpath[ ufpathIdx(r) ]); };
+ ufNode *readFromIdx(uint32 r) { assert(r < ufpath.size()); return(&ufpath[ r ]); };
private:
- int32 _length;
- uint32 _id;
-public:
- uint32 _tigID;
+ TigVector *_vector; // For updating the read map.
private:
- static uint32 *_inUnitig; // Maps a fragment iid to a unitig id.
- static uint32 *_pathPosition; // Maps a fragment iid to an index in the dovetail path
+ int32 _length;
+ uint32 _id;
public:
// Classification. The output is in three files: 'unassembled', 'bubbles', 'contigs' (defined as
// not unassembled and not bubble).
- uint32 _isUnassembled; // Is a single read or a pseudo singleton
- uint32 _isBubble; // Annotation: from a failed bubble pop
- uint32 _isRepeat; // Annotation: from an identified repeat region
- uint32 _isCircular; // Annotation: has overlap to self
+ uint32 _isUnassembled; // Is a single read or a pseudo singleton
+ uint32 _isBubble; // Annotation: from a failed bubble pop
+ uint32 _isRepeat; // Annotation: from an identified repeat region
+ uint32 _isCircular; // Annotation: has overlap to self
- char type(void) {
+ char type(void) {
if (_isUnassembled) return('U');
if (_isBubble) return('B');
if (_isRepeat) return('R');
diff --git a/src/bogart/AS_BAT_Unitig_AddFrag.C b/src/bogart/AS_BAT_Unitig_AddRead.C
similarity index 70%
rename from src/bogart/AS_BAT_Unitig_AddFrag.C
rename to src/bogart/AS_BAT_Unitig_AddRead.C
index 4fc0cc6..e9d87c1 100644
--- a/src/bogart/AS_BAT_Unitig_AddFrag.C
+++ b/src/bogart/AS_BAT_Unitig_AddRead.C
@@ -15,19 +15,11 @@
*
* This file is derived from:
*
- * src/AS_BAT/AS_BAT_Unitig_AddFrag.C
+ * src/bogart/AS_BAT_Unitig_AddFrag.C
*
* Modifications by:
*
- * Brian P. Walenz from 2010-NOV-23 to 2013-AUG-01
- * are Copyright 2010,2012-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz on 2014-DEC-19
- * are Copyright 2014 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
+ * Brian P. Walenz beginning on 2016-AUG-12
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -35,7 +27,7 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"
@@ -45,16 +37,15 @@
void
-Unitig::addFrag(ufNode node, int offset, bool report) {
+Unitig::addRead(ufNode node, int offset, bool report) {
node.position.bgn += offset;
node.position.end += offset;
assert(node.ident > 0);
- // keep track of the unitig a frag is in
- _inUnitig[node.ident] = _id;
- _pathPosition[node.ident] = ufpath.size();
+ // keep track of the unitig a read is in
+ _vector->registerRead(node.ident, _id, ufpath.size());
// keep track of max position in unitig
int32 frgEnd = MAX(node.position.bgn, node.position.end);
@@ -64,17 +55,17 @@ Unitig::addFrag(ufNode node, int offset, bool report) {
ufpath.push_back(node);
if ((report) || (node.position.bgn < 0) || (node.position.end < 0)) {
- int32 trulen = FI->fragmentLength(node.ident);
+ int32 trulen = RI->readLength(node.ident);
int32 poslen = (node.position.end > node.position.bgn) ? (node.position.end - node.position.bgn) : (node.position.bgn - node.position.end);
if (node.contained)
- writeLog("Added frag %d (len %d) to unitig %d at %d,%d (idx %lu) (lendiff %d) (contained in %d)\n",
+ writeLog("Added read %d (len %d) to unitig %d at %d,%d (idx %lu) (lendiff %d) (contained in %d)\n",
node.ident, trulen, _id, node.position.bgn, node.position.end,
ufpath.size() - 1,
poslen - trulen,
node.contained);
else
- writeLog("Added frag %d (len %d) to unitig %d at %d,%d (idx %lu) (lendiff %d)\n",
+ writeLog("Added read %d (len %d) to unitig %d at %d,%d (idx %lu) (lendiff %d)\n",
node.ident, trulen, _id, node.position.bgn, node.position.end,
ufpath.size() - 1,
poslen - trulen);
@@ -89,10 +80,10 @@ Unitig::addFrag(ufNode node, int offset, bool report) {
-// Percolate the last fragment to the correct spot in the list.
+// Percolate the last read to the correct spot in the list.
#if 0
void
-Unitig::bubbleSortLastFrag(void) {
+Unitig::bubbleSortLastRead(void) {
uint32 previd = ufpath.size() - 2;
uint32 lastid = ufpath.size() - 1;
@@ -103,13 +94,13 @@ Unitig::bubbleSortLastFrag(void) {
(lastbgn < MIN(ufpath[previd].position.bgn, ufpath[previd].position.end))) {
ufpath[lastid] = ufpath[previd];
- _pathPosition[ufpath[lastid].ident] = lastid;
+ _ufpathIdx[ufpath[lastid].ident] = lastid;
lastid--;
previd--;
}
- _pathPosition[last.ident] = lastid;
+ _ufpathIdx[last.ident] = lastid;
if (lastid < ufpath.size() - 1)
ufpath[lastid] = last;
diff --git a/src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C b/src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C
deleted file mode 100644
index c08643b..0000000
--- a/src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C
+++ /dev/null
@@ -1,275 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * This file is derived from:
- *
- * src/AS_BAT/AS_BAT_Unitig_PlaceFragUsingEdges.C
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2010-NOV-23 to 2013-AUG-01
- * are Copyright 2010-2013 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2014-DEC-19 to 2015-MAR-06
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * Brian P. Walenz beginning on 2016-JAN-11
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_BAT_FragmentInfo.H"
-#include "AS_BAT_BestOverlapGraph.H"
-#include "AS_BAT_Logging.H"
-
-#include "AS_BAT_Unitig.H"
-
-
-#undef DEBUG_PLACE_FRAG
-
-
-ufNode
-placeFrag_contained(uint32 fragId,
- ufNode &parent,
- BestEdgeOverlap *edge) {
-
- bool pFwd = (parent.position.bgn < parent.position.end) ? true : false;
- int32 pMin = (parent.position.bgn < parent.position.end) ? parent.position.bgn : parent.position.end;
- int32 pMax = (parent.position.bgn < parent.position.end) ? parent.position.end : parent.position.bgn;
-
- assert(pMin < pMax);
-
- // Reverse the overlap. frag3p here means the overlap is flipped.
- int32 ahang = (edge->frag3p() == false) ? -edge->ahang() : edge->bhang();
- int32 bhang = (edge->frag3p() == false) ? -edge->bhang() : edge->ahang();
-
- // Depending on the parent orientation...
- //
- // pMin pMax pMin pMax
- // ----------------> <----------------
- // ahang ----- bhang bhang ----- ahang
- // > 0 < 0 < 0 > 0
-
- int32 fMin = (pFwd == true) ? pMin + ahang : pMin - bhang;
- int32 fMax = (pFwd == true) ? pMax + bhang : pMax - ahang;
-
- //int32 fMin = pMin + ((frag3p == false) ? -edge->ahang() : edge->bhang()); // * intraScale
- //int32 fMax = pMax + ((frag3p == false) ? -edge->bhang() : edge->ahang()); // * interScale
-
- assert(fMin < fMax);
-
- // We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
- // Reset the end coordinate using the actual length of the read.
-
- fMax = fMin + FI->fragmentLength(fragId);
-
- // Orientation is straightforward, based on the orient of the parent, and the flipped flag.
-
- bool fFwd = (((pFwd == true) && (edge->frag3p() == false)) || // parent is fwd, olap is not flipped
- ((pFwd == false) && (edge->frag3p() == true))); // parent is rev, olap is flipped
-
- ufNode frag;
-
- frag.ident = fragId;
- frag.contained = 0;
- frag.parent = edge->fragId(); // == parent->ident
- frag.ahang = 0; // Not used in bogart, set on output
- frag.bhang = 0; // Not used in bogart, set on output
- frag.position.bgn = (fFwd) ? fMin : fMax;
- frag.position.end = (fFwd) ? fMax : fMin;
-
-#ifdef DEBUG_PLACE_FRAG
- writeLog("placeCont()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- frag %7d C' -- placed %7d-%7d oriented %s %7d-%7d\n",
- parent.ident, parent.position.bgn, parent.position.end,
- edge->fragId(), (edge->frag3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
- fragId,
- fMin, fMax, (fFwd) ? "rev" : "fwd", frag.position.bgn, frag.position.end);
-#endif
-
- return(frag);
-}
-
-
-
-
-
-ufNode
-placeFrag_dovetail(uint32 fragId,
- bool frag3p,
- ufNode &parent,
- BestEdgeOverlap *edge) {
-
- // We have an 'edge' from 'fragId' end 'frag3p' back to 'parent'.
- // Use that to compute the placement of 'frag'.
-
- bool pFwd = (parent.position.bgn < parent.position.end) ? true : false;
- int32 pMin = (parent.position.bgn < parent.position.end) ? parent.position.bgn : parent.position.end;
- int32 pMax = (parent.position.bgn < parent.position.end) ? parent.position.end : parent.position.bgn;
-
- assert(pMin < pMax);
-
- // Scale the hangs based on the placed versus actual length of the parent read.
-
- //double intraScale = (double)(pMax - pMin) / FI->fragmentLength(parent.ident); // Within the parent read overlap
- //double interScale = 1.0; // Outside the parent read overlap
-
- // We're given an edge from the read-to-place back to the parent. Reverse the edge so it points
- // from the parent to the read-to-place.
- //
- // The canonical edge is from a forward parent to the child.
- //
- // -P----\--> +b
- // +a ---v--------C-
- //
- // To reverse the edge:
- //
- // If child is forward, swapping the order of the reads results in a canonical overlap. The
- // hangs become negative.
- //
- // -P----\--> +b ----> -a ---/--------C>
- // +a ---v--------C> ----> -P----v--> -b
- //
- // If child is reverse, swapping the order of the reads results in a backwards canonical
- // overlap, and we need to flip end-to-end also. The hangs are swapped.
- //
- // -P----\--> +b ----> -C--------\--> +a
- // +a <--v--------C- ----> +b <--v----P-
- //
- int32 ahang = (frag3p == false) ? -edge->ahang() : edge->bhang();
- int32 bhang = (frag3p == false) ? -edge->bhang() : edge->ahang();
-
- // The read is placed 'to the right' of the parent if
- // pFwd == true and edge points to 3' end
- // pFwd == false and edge points to 5' end
- //
- bool toRight = (pFwd == edge->frag3p());
-
- // If placing 'to the right', we add hangs. Else, subtract the swapped hangs.
-
- int32 fMin = 0;
- int32 fMax = 0;
-
- if (toRight) {
- fMin = pMin + ahang;
- fMax = pMax + bhang;
- } else {
- fMin = pMin - bhang;
- fMax = pMax - ahang;
- }
-
- //int32 fMin = pMin + ((frag3p == false) ? -edge->ahang() : edge->bhang()); // * intraScale
- //int32 fMax = pMax + ((frag3p == false) ? -edge->bhang() : edge->ahang()); // * interScale
-
- assert(fMin < fMax);
-
- // We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
- // Reset the end coordinate using the actual length of the read.
-
- fMax = fMin + FI->fragmentLength(fragId);
-
-
- // Orientation is a bit more complicated, with eight cases (drawing pictures helps).
- //
- // edge from frag3p=true to forward parent 3p -> reverse
- // edge from frag3p=false to reverse parent 3p -> reverse
- // edge from frag3p=false to forward parent 5p -> reverse
- // edge from frag3p=true to reverse parent 5p -> reverse
- //
- // edge from frag3p=true to reverse parent 3p -> forward
- // edge from frag3p=false to forward parent 3p -> forward
- // edge from frag3p=false to reverse parent 5p -> forward
- // edge from frag3p=true to forward parent 5p -> forward
- //
- bool fFwd = (((frag3p == true) && (pFwd == true) && (edge->frag3p() == true)) ||
- ((frag3p == false) && (pFwd == false) && (edge->frag3p() == true)) ||
- ((frag3p == false) && (pFwd == true) && (edge->frag3p() == false)) ||
- ((frag3p == true) && (pFwd == false) && (edge->frag3p() == false))) ? false : true;
-
- ufNode frag;
-
- frag.ident = fragId;
- frag.contained = 0;
- frag.parent = edge->fragId(); // == parent->ident
- frag.ahang = 0; // Not used in bogart, set on output
- frag.bhang = 0; // Not used in bogart, set on output
- frag.position.bgn = (fFwd) ? fMin : fMax;
- frag.position.end = (fFwd) ? fMax : fMin;
-
-#ifdef DEBUG_PLACE_FRAG
- writeLog("placeDove()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- frag %7d %c' -- placed %7d-%7d oriented %s %7d-%7d\n",
- parent.ident, parent.position.bgn, parent.position.end,
- edge->fragId(), (edge->frag3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
- fragId, (frag3p) ? '3' : '5',
- fMin, fMax, (fFwd) ? "rev" : "fwd", frag.position.bgn, frag.position.end);
-#endif
-
- return(frag);
-}
-
-
-
-
-
-// Place a read into this tig using an edge from the read to some read in this tig.
-//
-bool
-Unitig::placeFrag(ufNode &frag, // output placement
- uint32 fragId, // id of read we want to place
- bool frag3p, // end of read 'edge' is from, meaningless if contained
- BestEdgeOverlap *edge) { // edge to read in this tig
-
- assert(fragId > 0);
- assert(fragId <= FI->numFragments());
-
- frag.ident = fragId;
- frag.contained = 0;
- frag.parent = 0;
- frag.ahang = 0;
- frag.bhang = 0;
- frag.position.bgn = 0;
- frag.position.end = 0;
-
- if (edge == NULL)
- // No best edge? Hard to place without one.
- return(false);
-
- if (edge->fragId() == 0)
- // Empty best edge? Still hard to place.
- return(false);
-
- if (fragIn(edge->fragId()) != id())
- // Edge not pointing to a read in this tig?
- return(false);
-
- // Grab the index of the parent read.
-
- uint32 bidx = pathPosition(edge->fragId());
- assert(edge->fragId() == ufpath[bidx].ident);
-
- // Now, just compute the placement and return success!
-
- if (((edge->ahang() >= 0) && (edge->bhang() <= 0)) ||
- ((edge->ahang() <= 0) && (edge->bhang() >= 0)))
- frag = placeFrag_contained(fragId, ufpath[bidx], edge);
- else
- frag = placeFrag_dovetail(fragId, frag3p, ufpath[bidx], edge);
-
- return(true);
-}
diff --git a/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C b/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C
new file mode 100644
index 0000000..188b2e8
--- /dev/null
+++ b/src/bogart/AS_BAT_Unitig_PlaceReadUsingEdges.C
@@ -0,0 +1,270 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/bogart/AS_BAT_Unitig_PlaceFragUsingEdges.C
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-12
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "AS_BAT_ReadInfo.H"
+#include "AS_BAT_BestOverlapGraph.H"
+#include "AS_BAT_Logging.H"
+
+#include "AS_BAT_Unitig.H"
+
+
+#undef DEBUG_PLACE_READ
+
+
+ufNode
+placeRead_contained(uint32 readId,
+ ufNode &parent,
+ BestEdgeOverlap *edge) {
+ // Build and return a ufNode for 'readId', positioned from 'parent' and the hangs stored in 'edge'.
+ bool pFwd = parent.position.isForward();
+ int32 pMin = parent.position.min();
+ int32 pMax = parent.position.max();
+
+ assert(pMin < pMax);
+
+ // Reverse the overlap. read3p here means the overlap is flipped.
+ int32 ahang = (edge->read3p() == false) ? -edge->ahang() : edge->bhang();
+ int32 bhang = (edge->read3p() == false) ? -edge->bhang() : edge->ahang();
+
+ // Depending on the parent orientation...
+ //
+ // pMin pMax pMin pMax
+ // ----------------> <----------------
+ // ahang ----- bhang bhang ----- ahang
+ // > 0 < 0 < 0 > 0
+
+ int32 fMin = (pFwd == true) ? pMin + ahang : pMin - bhang;
+ int32 fMax = (pFwd == true) ? pMax + bhang : pMax - ahang;
+
+ //int32 fMin = pMin + ((read3p == false) ? -edge->ahang() : edge->bhang()); // * intraScale
+ //int32 fMax = pMax + ((read3p == false) ? -edge->bhang() : edge->ahang()); // * interScale
+
+ assert(fMin < fMax);
+
+ // We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
+ // Reset the end coordinate using the actual length of the read.
+
+ fMax = fMin + RI->readLength(readId);
+
+ // Orientation is straightforward, based on the orient of the parent, and the flipped flag.
+
+ bool fFwd = (((pFwd == true) && (edge->read3p() == false)) || // parent is fwd, olap is not flipped
+ ((pFwd == false) && (edge->read3p() == true))); // parent is rev, olap is flipped
+
+ ufNode read;
+
+ read.ident = readId;
+ read.contained = 0;
+ read.parent = edge->readId(); // == parent->ident
+ read.ahang = 0; // Not used in bogart, set on output
+ read.bhang = 0; // Not used in bogart, set on output
+ read.position.bgn = (fFwd) ? fMin : fMax;
+ read.position.end = (fFwd) ? fMax : fMin;
+ // Debug trace, compiled only when DEBUG_PLACE_READ is defined; fFwd == true (bgn < end) is reported as "fwd".
+#ifdef DEBUG_PLACE_READ
+ writeLog("placeCont()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d C' -- placed %7d-%7d oriented %s %7d-%7d\n",
+ parent.ident, parent.position.bgn, parent.position.end,
+ edge->readId(), (edge->read3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
+ readId,
+ fMin, fMax, (fFwd) ? "fwd" : "rev", read.position.bgn, read.position.end);
+#endif
+
+ return(read);
+}
+
+
+
+
+
+ufNode
+placeRead_dovetail(uint32 readId,
+ bool read3p,
+ ufNode &parent,
+ BestEdgeOverlap *edge) {
+ // Build and return a ufNode for 'readId', positioned beside 'parent' from the hangs stored in 'edge'.
+ // We have an 'edge' from 'readId' end 'read3p' back to 'parent'.
+ // Use that to compute the placement of 'read'.
+
+ bool pFwd = parent.position.isForward();
+ int32 pMin = parent.position.min();
+ int32 pMax = parent.position.max();
+
+ assert(pMin < pMax);
+
+ // Scale the hangs based on the placed versus actual length of the parent read.
+
+ //double intraScale = (double)(pMax - pMin) / RI->readLength(parent.ident); // Within the parent read overlap
+ //double interScale = 1.0; // Outside the parent read overlap
+
+ // We're given an edge from the read-to-place back to the parent. Reverse the edge so it points
+ // from the parent to the read-to-place.
+ //
+ // The canonical edge is from a forward parent to the child.
+ //
+ // -P----\--> +b
+ // +a ---v--------C-
+ //
+ // To reverse the edge:
+ //
+ // If child is forward, swapping the order of the reads results in a canonical overlap. The
+ // hangs become negative.
+ //
+ // -P----\--> +b ----> -a ---/--------C>
+ // +a ---v--------C> ----> -P----v--> -b
+ //
+ // If child is reverse, swapping the order of the reads results in a backwards canonical
+ // overlap, and we need to flip end-to-end also. The hangs are swapped.
+ //
+ // -P----\--> +b ----> -C--------\--> +a
+ // +a <--v--------C- ----> +b <--v----P-
+ //
+ int32 ahang = (read3p == false) ? -edge->ahang() : edge->bhang();
+ int32 bhang = (read3p == false) ? -edge->bhang() : edge->ahang();
+
+ // The read is placed 'to the right' of the parent if
+ // pFwd == true and edge points to 3' end
+ // pFwd == false and edge points to 5' end
+ //
+ bool toRight = (pFwd == edge->read3p());
+
+ // If placing 'to the right', we add hangs. Else, subtract the swapped hangs.
+
+ int32 fMin = 0;
+ int32 fMax = 0;
+
+ if (toRight) {
+ fMin = pMin + ahang;
+ fMax = pMax + bhang;
+ } else {
+ fMin = pMin - bhang;
+ fMax = pMax - ahang;
+ }
+
+ //int32 fMin = pMin + ((read3p == false) ? -edge->ahang() : edge->bhang()); // * intraScale
+ //int32 fMax = pMax + ((read3p == false) ? -edge->bhang() : edge->ahang()); // * interScale
+
+ assert(fMin < fMax);
+
+ // We don't know the true length of the overlap, and our hang-based math tends to shrink reads.
+ // Reset the end coordinate using the actual length of the read.
+
+ fMax = fMin + RI->readLength(readId);
+
+
+ // Orientation is a bit more complicated, with eight cases (drawing pictures helps).
+ //
+ // edge from read3p=true to forward parent 3p -> reverse
+ // edge from read3p=false to reverse parent 3p -> reverse
+ // edge from read3p=false to forward parent 5p -> reverse
+ // edge from read3p=true to reverse parent 5p -> reverse
+ //
+ // edge from read3p=true to reverse parent 3p -> forward
+ // edge from read3p=false to forward parent 3p -> forward
+ // edge from read3p=false to reverse parent 5p -> forward
+ // edge from read3p=true to forward parent 5p -> forward
+ //
+ bool fFwd = (((read3p == true) && (pFwd == true) && (edge->read3p() == true)) ||
+ ((read3p == false) && (pFwd == false) && (edge->read3p() == true)) ||
+ ((read3p == false) && (pFwd == true) && (edge->read3p() == false)) ||
+ ((read3p == true) && (pFwd == false) && (edge->read3p() == false))) ? false : true;
+
+ ufNode read;
+
+ read.ident = readId;
+ read.contained = 0;
+ read.parent = edge->readId(); // == parent->ident
+ read.ahang = 0; // Not used in bogart, set on output
+ read.bhang = 0; // Not used in bogart, set on output
+ read.position.bgn = (fFwd) ? fMin : fMax;
+ read.position.end = (fFwd) ? fMax : fMin;
+ // Debug trace, compiled only when DEBUG_PLACE_READ is defined; fFwd == true (bgn < end) is reported as "fwd".
+#ifdef DEBUG_PLACE_READ
+ writeLog("placeDove()-- parent %7d pos %7d,%7d -- edge to %7d %c' hangs %7d %7d -- read %7d %c' -- placed %7d-%7d oriented %s %7d-%7d\n",
+ parent.ident, parent.position.bgn, parent.position.end,
+ edge->readId(), (edge->read3p()) ? '3' : '5', edge->ahang(), edge->bhang(),
+ readId, (read3p) ? '3' : '5',
+ fMin, fMax, (fFwd) ? "fwd" : "rev", read.position.bgn, read.position.end);
+#endif
+
+ return(read);
+}
+
+
+
+
+
+// Place a read into this tig using an edge from the read to some read in this tig.
+// Returns true with 'read' filled in; on the 'return(false)' paths 'read' keeps ident=readId and zeros elsewhere.
+bool
+Unitig::placeRead(ufNode &read, // output placement
+ uint32 readId, // id of read we want to place
+ bool read3p, // end of read 'edge' is from, meaningless if contained
+ BestEdgeOverlap *edge) { // edge to read in this tig
+
+ assert(readId > 0);
+ assert(readId <= RI->numReads());
+ // Start from an empty placement so every early return leaves 'read' in a known state.
+ read.ident = readId;
+ read.contained = 0;
+ read.parent = 0;
+ read.ahang = 0;
+ read.bhang = 0;
+ read.position.bgn = 0;
+ read.position.end = 0;
+
+ // No best edge? Hard to place without one. (Each check below asserts first; the 'if' presumably covers builds with asserts compiled out.)
+ assert(edge != NULL);
+ if (edge == NULL)
+ return(false);
+
+ // Empty best edge? Still hard to place.
+ assert(edge->readId() != 0);
+ if (edge->readId() == 0)
+ return(false);
+
+ // Edge not pointing to a read in this tig?
+ assert(inUnitig(edge->readId()) == id());
+ if (inUnitig(edge->readId()) != id())
+ return(false);
+
+ // Grab the index of the parent read.
+
+ uint32 bidx = ufpathIdx(edge->readId());
+ assert(edge->readId() == ufpath[bidx].ident);
+
+ // Now, just compute the placement and return success!
+ // Hangs with opposite signs (or a zero hang) go to placeRead_contained(); all other edges go to placeRead_dovetail().
+ if (((edge->ahang() >= 0) && (edge->bhang() <= 0)) ||
+ ((edge->ahang() <= 0) && (edge->bhang() >= 0)))
+ read = placeRead_contained(readId, ufpath[bidx], edge);
+ else
+ read = placeRead_dovetail(readId, read3p, ufpath[bidx], edge);
+
+ return(true);
+}
diff --git a/src/bogart/addReadsToUnitigs.C b/src/bogart/addReadsToUnitigs.C
index daacf74..31ac157 100644
--- a/src/bogart/addReadsToUnitigs.C
+++ b/src/bogart/addReadsToUnitigs.C
@@ -111,7 +111,7 @@ main(int argc, char **argv) {
map<string,uint32> lookupIID;
#ifdef UNFINISHED_ADD_TO_SINGLETON
- vector<bool> iidInTig; // true if the read is already in a unitig
+ vector<bool> iidInTig; // true if the read is already in a tig
vector<uint32> iidInTigByLib; // count of reads in tigs, by library
vector<uint32> iidInLib; // count of reads, by library
#endif
@@ -190,7 +190,7 @@ main(int argc, char **argv) {
fprintf(stderr, " -M fastqUIDmap gatekeeper output fastqUIDmap for read name to IID translation\n");
fprintf(stderr, "\n");
#if 0
- fprintf(stderr, "unmapped reads: default is to promote to singleton unitigs\n");
+ fprintf(stderr, "unmapped reads: default is to promote to singleton tigs\n");
fprintf(stderr, " -U leave unmapped reads alone (will crash CGW)\n");
fprintf(stderr, " -D delete unmapped reads from gkpStore\n");
#else
@@ -251,7 +251,7 @@ main(int argc, char **argv) {
fgets(LL, 1024, LF);
}
- fprintf(stderr, "Loaded "F_SIZE_T" name to IIDs\n", lookupIID.size());
+ fprintf(stderr, "Loaded " F_SIZE_T " name to IIDs\n", lookupIID.size());
}
@@ -287,10 +287,10 @@ main(int argc, char **argv) {
//
- // Load the alignment map. The challenge here is to parse the unitig and read names
+ // Load the alignment map. The challenge here is to parse the tig and read names
// into correct IIDs. We assume that:
// Reads were dumped with -dumpfasta and have names ">UID,IID"
- // Unitigs were dumped with tigStore -d consensus and have names "utgIID"
+ // Tigs were dumped with tigStore -d consensus and have names "utgIID"
// Alignments are in the convertToExtent -extended format
//
@@ -337,7 +337,7 @@ main(int argc, char **argv) {
rm.rFWD = (S(4) < S(5));
rm.rCNT = 1;
- assert(S[6][0] == 'u'); // Unitig must be from a tigStore dump, and will be
+ assert(S[6][0] == 'u'); // tig must be from a tigStore dump, and will be
assert(S[6][1] == 't'); // named 'utg#######'.
assert(S[6][2] == 'g');
@@ -466,8 +466,8 @@ main(int argc, char **argv) {
fprintf(stderr, "Will NOT add %u pairs - one read failed to map.\n", unpaired);
fprintf(stderr, "Will NOT add %u pairs - multiple mappings.\n", multiple);
- fprintf(stderr, "Will add %u pairs in the same unitig\n", pairsToSame);
- fprintf(stderr, "Will add %u pairs in different unitigs\n", pairsToDiff);
+ fprintf(stderr, "Will add %u pairs in the same tig\n", pairsToSame);
+ fprintf(stderr, "Will add %u pairs in different tigs\n", pairsToDiff);
//
// Open stores. gkpStore cannot be opened for writing, because then we can't loadall.
@@ -482,14 +482,14 @@ main(int argc, char **argv) {
}
//
- // Rebuild unitigs, stuff them back into the same version.
+ // Rebuild tigs, stuff them back into the same version.
//
// Argh, really should convert this to a vector right now....
VA_TYPE(int32) *unGappedOffsets = CreateVA_int32(1024 * 1024);
for (uint32 bb=0; bb<RM.size(); bb++) {
- if ((RM[bb].tIID == UINT32_MAX) || // not mapped into a unitig
+ if ((RM[bb].tIID == UINT32_MAX) || // not mapped into a tig
(RM[bb].good == false) || // not useful mate pair
(RM[bb].proc == true)) // already added
continue;
@@ -520,11 +520,11 @@ main(int argc, char **argv) {
readsAdded++;
- fprintf(stdout, "bb=%u ee=%u ADD frag %u to unitig %u at %u,%u (from ungapped %u,%u)\n",
+ fprintf(stdout, "bb=%u ee=%u ADD read %u to tig %u at %u,%u (from ungapped %u,%u)\n",
bb, ee,
RM[ee].rIID, RM[ee].tIID, bgn, end, RM[ee].tBGN, RM[ee].tEND);
- // Add a read to the unitig.
+ // Add a read to the tig.
IntMultiPos frg;
@@ -546,7 +546,7 @@ main(int argc, char **argv) {
RM[ee].proc = true;
}
- fprintf(stderr, "Added %u reads to unitig %u (previously %lu reads)\n",
+ fprintf(stderr, "Added %u reads to tig %u (previously %lu reads)\n",
readsAdded,
ma->maID,
GetNumIntMultiPoss(ma->f_list) - readsAdded);
@@ -558,13 +558,13 @@ main(int argc, char **argv) {
if (showResult)
PrintMultiAlignT(stdout, ma, gkpStore, false, false, AS_READ_CLEAR_LATEST);
} else {
- fprintf(stderr, "MultiAlignUnitig()-- unitig %d failed.\n", ma->maID);
+ fprintf(stderr, "MultiAlignUnitig()-- tig %d failed.\n", ma->maID);
numFailures++;
}
}
if (doModify) {
- fprintf(stderr, "Updating unitig %u\n", ma->maID);
+ fprintf(stderr, "Updating tig %u\n", ma->maID);
tigStore->insertMultiAlign(ma, true, false);
}
}
@@ -574,4 +574,3 @@ main(int argc, char **argv) {
exit(0);
}
-
diff --git a/src/bogart/analyzeBest.C b/src/bogart/analyzeBest.C
index 4b85eab..53121b7 100644
--- a/src/bogart/analyzeBest.C
+++ b/src/bogart/analyzeBest.C
@@ -93,7 +93,7 @@ main(int argc, char **argv) {
if (errno)
fprintf(stderr, "Failed to open '%s' for reading: %s\n", bSing, strerror(errno)), exit(1);
- fprintf(stderr, "Loading fragment to library mapping.\n");
+ fprintf(stderr, "Loading read to library mapping.\n");
gkStore *gkp = gkStore::gkStore_open(gkpName, false, false);
gkStream *str = new gkStream(gkp, 0, 0, GKFRAGMENT_INF);
@@ -104,7 +104,7 @@ main(int argc, char **argv) {
uint32 *frgToLib = new uint32 [numFrg + 1]; memset(frgToLib, 0, sizeof(uint32) * (numFrg + 1));
- uint64 *fragPerLib = new uint64 [numLib + 1]; memset(fragPerLib, 0, sizeof(uint64) * (numLib + 1));
+ uint64 *readPerLib = new uint64 [numLib + 1]; memset(readPerLib, 0, sizeof(uint64) * (numLib + 1));
uint64 *deldPerLib = new uint64 [numLib + 1]; memset(deldPerLib, 0, sizeof(uint64) * (numLib + 1));
while (str->next(&fr)) {
@@ -113,7 +113,7 @@ main(int argc, char **argv) {
if (fr.gkFragment_getIsDeleted())
deldPerLib[fr.gkFragment_getLibraryIID()]++;
else
- fragPerLib[fr.gkFragment_getLibraryIID()]++;
+ readPerLib[fr.gkFragment_getLibraryIID()]++;
}
delete str;
@@ -187,12 +187,12 @@ main(int argc, char **argv) {
fprintf(stderr, "libIID libUID #frg #del cnt'd cnt'r sing spur5 spur3 dove\n");
for (uint32 i=0; i<numLib+1; i++) {
- double tot = fragPerLib[i] + deldPerLib[i];
+ double tot = readPerLib[i] + deldPerLib[i];
- fprintf(stderr, "%-8u %-30s %8"F_U64P" %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%) %8"F_U64P" (%4.1f%%)\n",
+ fprintf(stderr, "%-8u %-30s %8" F_U64P " %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%) %8" F_U64P " (%4.1f%%)\n",
i,
gkp->gkStore_getLibrary(i)->libraryName,
- fragPerLib[i],
+ readPerLib[i],
deldPerLib[i], (deldPerLib[i] == 0) ? 0.0 : 100.0 * deldPerLib[i] / tot,
cntdPerLib[i], (cntdPerLib[i] == 0) ? 0.0 : 100.0 * cntdPerLib[i] / tot,
cntrPerLib[i], (cntrPerLib[i] == 0) ? 0.0 : 100.0 * cntrPerLib[i] / tot,
@@ -209,7 +209,7 @@ main(int argc, char **argv) {
delete [] frgToLib;
- delete [] fragPerLib;
+ delete [] readPerLib;
delete [] deldPerLib;
delete [] cntdPerLib;
diff --git a/src/bogart/bogart.C b/src/bogart/bogart.C
index 02af654..d750c06 100644
--- a/src/bogart/bogart.C
+++ b/src/bogart/bogart.C
@@ -39,10 +39,11 @@
* full conditions and disclaimers for each license.
*/
-#include "AS_BAT_FragmentInfo.H"
+#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_OverlapCache.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_ChunkGraph.H"
+#include "AS_BAT_AssemblyGraph.H"
#include "AS_BAT_Logging.H"
@@ -52,19 +53,22 @@
#include "AS_BAT_Instrumentation.H"
#include "AS_BAT_PlaceContains.H"
-#include "AS_BAT_MergeUnitigs.H"
-#include "AS_BAT_PopBubbles.H"
+#include "AS_BAT_MergeOrphans.H"
#include "AS_BAT_MarkRepeatReads.H"
#include "AS_BAT_SplitDiscontinuous.H"
#include "AS_BAT_PromoteToSingleton.H"
+#include "AS_BAT_CreateUnitigs.H"
+
#include "AS_BAT_SetParentAndHang.H"
#include "AS_BAT_Outputs.H"
+#include "AS_BAT_TigGraph.H"
+
-FragmentInfo *FI = 0L;
+ReadInfo *RI = 0L;
OverlapCache *OC = 0L;
BestOverlapGraph *OG = 0L;
ChunkGraph *CG = 0L;
@@ -74,10 +78,9 @@ main (int argc, char * argv []) {
char *gkpStorePath = NULL;
char *ovlStoreUniqPath = NULL;
char *ovlStoreReptPath = NULL;
- char *tigStorePath = NULL;
- double erateGraph = 0.030;
- double erateMax = 0.050;
+ double erateGraph = 0.075;
+ double erateMax = 0.100;
uint64 genomeSize = 0;
@@ -87,29 +90,21 @@ main (int argc, char * argv []) {
double lowcovFraction = 0.75;
uint32 lowcovDepth = 2;
- double deviationGraph = 5.0;
- double deviationBubble = 5.0;
+ double deviationGraph = 6.0;
+ double deviationBubble = 6.0;
double deviationRepeat = 3.0;
- uint32 confusedAbsolute = 250;
- double confusedPercent = 100.0;
+ uint32 confusedAbsolute = 5000;
+ double confusedPercent = 500.0;
int32 numThreads = 0;
uint64 ovlCacheMemory = UINT64_MAX;
- uint32 ovlCacheLimit = UINT32_MAX;
- bool onlySave = false;
bool doSave = false;
- int fragment_count_target = 0;
char *prefix = NULL;
- bool enableJoining = false;
-
- bool enableShatterRepeats = false;
- bool enableReconstructRepeats = false;
-
uint32 minReadLen = 0;
uint32 minOverlap = 500;
@@ -118,10 +113,7 @@ main (int argc, char * argv []) {
vector<char *> err;
int arg = 1;
while (arg < argc) {
- if (strcmp(argv[arg], "-B") == 0) {
- fragment_count_target = atoi(argv[++arg]);
-
- } else if (strcmp(argv[arg], "-o") == 0) {
+ if (strcmp(argv[arg], "-o") == 0) {
prefix = argv[++arg];
} else if (strcmp(argv[arg], "-G") == 0) {
@@ -138,19 +130,6 @@ main (int argc, char * argv []) {
} else if (strcmp(argv[arg], "-gs") == 0) {
genomeSize = strtoull(argv[++arg], NULL, 10);
- } else if (strcmp(argv[arg], "-J") == 0) {
- enableJoining = true;
-
- } else if (strcmp(argv[arg], "-T") == 0) {
- tigStorePath = argv[++arg];
-
- } else if (strcmp(argv[arg], "-SR") == 0) {
- enableShatterRepeats = true;
-
- } else if (strcmp(argv[arg], "-R") == 0) {
- enableShatterRepeats = true;
- enableReconstructRepeats = true;
-
} else if (strcmp(argv[arg], "-unassembled") == 0) {
fewReadsNumber = atoi(argv[++arg]);
tooShortLength = atoi(argv[++arg]);
@@ -187,13 +166,6 @@ main (int argc, char * argv []) {
} else if (strcmp(argv[arg], "-M") == 0) {
ovlCacheMemory = (uint64)(atof(argv[++arg]) * 1024 * 1024 * 1024);
- } else if (strcmp(argv[arg], "-N") == 0) {
- ovlCacheLimit = atoi(argv[++arg]);
-
- } else if (strcmp(argv[arg], "-create") == 0) {
- onlySave = true;
- doSave = true;
-
} else if (strcmp(argv[arg], "-save") == 0) {
doSave = true;
@@ -217,6 +189,7 @@ main (int argc, char * argv []) {
for (flg=1, opt=0; logFileFlagNames[opt]; flg <<= 1, opt++)
if ((strcasecmp(logFileFlagNames[opt], "stderr") != 0) &&
(strcasecmp(logFileFlagNames[opt], "overlapScoring") != 0) &&
+ (strcasecmp(logFileFlagNames[opt], "errorProfiles") != 0) &&
(strcasecmp(logFileFlagNames[opt], "chunkGraph") != 0) &&
(strcasecmp(logFileFlagNames[opt], "setParentAndHang") != 0))
logFileFlags |= flg;
@@ -224,7 +197,7 @@ main (int argc, char * argv []) {
}
if (fnd == false) {
char *s = new char [1024];
- sprintf(s, "Unknown '-D' option '%s'.\n", argv[arg]);
+ snprintf(s, 1024, "Unknown '-D' option '%s'.\n", argv[arg]);
err.push_back(s);
}
@@ -240,32 +213,24 @@ main (int argc, char * argv []) {
}
if (fnd == false) {
char *s = new char [1024];
- sprintf(s, "Unknown '-d' option '%s'.\n", argv[arg]);
+ snprintf(s, 1024, "Unknown '-d' option '%s'.\n", argv[arg]);
err.push_back(s);
}
} else {
char *s = new char [1024];
- sprintf(s, "Unknown option '%s'.\n", argv[arg]);
+ snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]);
err.push_back(s);
}
arg++;
}
- if (erateGraph < 0.0)
- err.push_back(NULL);
- if (erateMax < 0.0)
- err.push_back(NULL);
-
- if (prefix == NULL)
- err.push_back(NULL);
- if (gkpStorePath == NULL)
- err.push_back(NULL);
- if (ovlStoreUniqPath == NULL)
- err.push_back(NULL);
- if (tigStorePath == NULL)
- err.push_back(NULL);
+ if (erateGraph < 0.0) err.push_back("Invalid overlap error threshold (-eg option); must be at least 0.0.\n");
+ if (erateMax < 0.0) err.push_back("Invalid overlap error threshold (-eM option); must be at least 0.0.\n");
+ if (prefix == NULL) err.push_back("No output prefix name (-o option) supplied.\n");
+ if (gkpStorePath == NULL) err.push_back("No gatekeeper store (-G option) supplied.\n");
+ if (ovlStoreUniqPath == NULL) err.push_back("No overlap store (-O option) supplied.\n");
if (err.size() > 0) {
fprintf(stderr, "usage: %s -o outputName -O ovlStore -G gkpStore -T tigStore\n", argv[0]);
@@ -275,16 +240,10 @@ main (int argc, char * argv []) {
fprintf(stderr, " -T Mandatory path to a tigStore (can exist or not).\n");
fprintf(stderr, " -o prefix Mandatory name for the output files\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -B b Target number of fragments per tigStore (consensus) partition\n");
- fprintf(stderr, "\n");
fprintf(stderr, "Algorithm Options\n");
fprintf(stderr, "\n");
fprintf(stderr, " -gs Genome size in bases.\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -J Join promiscuous unitigs using unused best edges.\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " -SR Shatter repeats, don't rebuild.\n");
- fprintf(stderr, " -R Shatter repeats (-SR), then rebuild them\n");
fprintf(stderr, " -RL len Force reads below 'len' bases to be singletons.\n");
fprintf(stderr, " This WILL cause CGW to fail; diagnostic only.\n");
fprintf(stderr, "\n");
@@ -295,7 +254,7 @@ main (int argc, char * argv []) {
fprintf(stderr, "Overlap Selection - an overlap will be considered for use in a unitig under\n");
fprintf(stderr, " the following conditions:\n");
fprintf(stderr, "\n");
- fprintf(stderr, " When constructing the Best Overlap Graph and Promiscuous Unitigs ('g'raph):\n");
+ fprintf(stderr, " When constructing the Best Overlap Graph and Greedy tigs ('g'raph):\n");
fprintf(stderr, " -eg 0.020 no more than 0.020 fraction (2.0%%) error ** DEPRECATED **\n");
fprintf(stderr, "\n");
fprintf(stderr, " When loading overlaps, an inflated maximum (to allow reruns with different error rates):\n");
@@ -308,9 +267,7 @@ main (int argc, char * argv []) {
fprintf(stderr, "Overlap Storage\n");
fprintf(stderr, "\n");
fprintf(stderr, " -M gb Use at most 'gb' gigabytes of memory for storing overlaps.\n");
- fprintf(stderr, " -N num Load at most 'num' overlaps per read.\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -create Only create the overlap graph, save to disk and quit.\n");
fprintf(stderr, " -save Save the overlap graph to disk, and continue.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Debugging and Logging\n");
@@ -321,26 +278,9 @@ main (int argc, char * argv []) {
fprintf(stderr, " %s\n", logFileFlagNames[l]);
fprintf(stderr, "\n");
- if (erateGraph < 0.0)
- fprintf(stderr, "Invalid overlap error threshold (-eg option); must be at least 0.0.\n");
- if (erateMax < 0.0)
- fprintf(stderr, "Invalid overlap error threshold (-eM option); must be at least 0.0.\n");
-
- if (prefix == NULL)
- fprintf(stderr, "No output prefix name (-o option) supplied.\n");
-
- if (gkpStorePath == NULL)
- fprintf(stderr, "No gatekeeper store (-G option) supplied.\n");
-
- if (ovlStoreUniqPath == NULL)
- fprintf(stderr, "No overlap store (-O option) supplied.\n");
-
if ((ovlStoreUniqPath != NULL) && (ovlStoreUniqPath == ovlStoreReptPath))
fprintf(stderr, "Too many overlap stores (-O option) supplied.\n");
- if (tigStorePath == NULL)
- fprintf(stderr, "No output tigStore (-T option) supplied.\n");
-
for (uint32 ii=0; ii<err.size(); ii++)
if (err[ii])
fputs(err[ii], stderr);
@@ -372,16 +312,14 @@ main (int argc, char * argv []) {
ovStore *ovlStoreUniq = new ovStore(ovlStoreUniqPath, gkpStore);
ovStore *ovlStoreRept = ovlStoreReptPath ? new ovStore(ovlStoreReptPath, gkpStore) : NULL;
- UnitigVector unitigs;
-
- setLogFile(prefix, NULL);
-
- FI = new FragmentInfo(gkpStore, prefix, minReadLen);
+ writeStatus("\n");
+ writeStatus("==> LOADING AND FILTERING OVERLAPS.\n");
+ writeStatus("\n");
- // Initialize where we've been to nowhere
- Unitig::resetFragUnitigMap(FI->numFragments());
+ setLogFile(prefix, "filterOverlaps");
- OC = new OverlapCache(ovlStoreUniq, ovlStoreRept, prefix, MAX(erateMax, erateGraph), minOverlap, ovlCacheMemory, ovlCacheLimit, onlySave, doSave);
+ RI = new ReadInfo(gkpStore, prefix, minReadLen);
+ OC = new OverlapCache(gkpStore, ovlStoreUniq, ovlStoreRept, prefix, MAX(erateMax, erateGraph), minOverlap, ovlCacheMemory, genomeSize, doSave);
OG = new BestOverlapGraph(erateGraph, deviationGraph, prefix);
CG = new ChunkGraph(prefix);
@@ -392,151 +330,224 @@ main (int argc, char * argv []) {
gkpStore = NULL;
//
- // Build the initial unitig path from non-contained fragments. The first pass is usually the
- // only one needed, but occasionally (maybe) we miss fragments, so we make an explicit pass
- // through all fragments and place whatever isn't already placed.
+ // Build the initial unitig path from non-contained reads. The first pass is usually the
+ // only one needed, but occasionally (maybe) we miss reads, so we make an explicit pass
+ // through all reads and place whatever isn't already placed.
//
- setLogFile(prefix, "buildUnitigs");
- writeLog("==> BUILDING UNITIGS from %d fragments.\n", FI->numFragments());
+ TigVector contigs(RI->numReads()); // Both initial greedy tigs and final contigs
+ TigVector unitigs(RI->numReads()); // The 'final' contigs, split at every intersection in the graph
+
+ writeStatus("\n");
+ writeStatus("==> BUILDING GREEDY TIGS.\n");
+ writeStatus("\n");
- for (uint32 fi=CG->nextFragByChunkLength(); fi>0; fi=CG->nextFragByChunkLength())
- populateUnitig(unitigs, fi);
+ setLogFile(prefix, "buildGreedy");
+
+ for (uint32 fi=CG->nextReadByChunkLength(); fi>0; fi=CG->nextReadByChunkLength())
+ populateUnitig(contigs, fi);
delete CG;
CG = NULL;
- breakSingletonTigs(unitigs);
+ breakSingletonTigs(contigs);
+
+ reportOverlaps(contigs, prefix, "buildGreedy");
+ reportTigs(contigs, prefix, "buildGreedy", genomeSize);
+
+ //
+ // For future use, remember the reads in contigs. When we make unitigs, we'll
+ // require that every unitig end with one of these reads -- this will let
+ // us reconstruct contigs from the unitigs.
+ //
- reportOverlaps(unitigs, prefix, "buildUnitigs");
- reportUnitigs(unitigs, prefix, "buildUnitigs", genomeSize);
+ for (uint32 fid=1; fid<RI->numReads()+1; fid++) // This really should be incorporated
+ if (contigs.inUnitig(fid) != 0) // into populateUnitig()
+ RI->setBackbone(fid);
//
// Place contained reads.
//
-#if 1
+ writeStatus("\n");
+ writeStatus("==> PLACE CONTAINED READS.\n");
+ writeStatus("\n");
+
setLogFile(prefix, "placeContains");
- unitigs.computeArrivalRate(prefix, "initial");
- unitigs.computeErrorProfiles(prefix, "initial");
- //unitigs.reportErrorProfiles(prefix, "initial");
+ //contigs.computeArrivalRate(prefix, "initial");
+ contigs.computeErrorProfiles(prefix, "initial");
+ contigs.reportErrorProfiles(prefix, "initial");
- placeUnplacedUsingAllOverlaps(unitigs, prefix);
+ placeUnplacedUsingAllOverlaps(contigs, prefix);
- reportOverlaps(unitigs, prefix, "placeContains");
- reportUnitigs(unitigs, prefix, "placeContains", genomeSize);
-#endif
+ reportOverlaps(contigs, prefix, "placeContains");
+ reportTigs(contigs, prefix, "placeContains", genomeSize);
//
- // Merge tigs (and detect ciruclar ones too). Contained reads need to be placed to 'clean up'
- // the error rate. Dovetail alone is too 'clean' for circular to be detected (in ecoli).
+ // Merge orphans.
//
-#if 0
- setLogFile(prefix, "merge");
+ writeStatus("\n");
+ writeStatus("==> MERGE ORPHANS.\n");
+ writeStatus("\n");
- computeErrorProfiles(unitigs, prefix, "merge");
- //reportErrorProfiles(unitigs, prefix, "merge");
+ setLogFile(prefix, "mergeOrphans");
- mergeUnitigs(unitigs, deviationGraph, false);
+ contigs.computeErrorProfiles(prefix, "unplaced");
+ contigs.reportErrorProfiles(prefix, "unplaced");
- reportOverlaps(unitigs, prefix, "merge");
- reportUnitigs(unitigs, prefix, "merge", genomeSize);
-#endif
+ mergeOrphans(contigs, deviationBubble);
+ //checkUnitigMembership(contigs);
+ reportOverlaps(contigs, prefix, "mergeOrphans");
+ reportTigs(contigs, prefix, "mergeOrphans", genomeSize);
//
- // Pop bubbles
+ // Generate a new graph using only edges that are compatible with existing tigs.
//
-#if 1
- setLogFile(prefix, "popBubbles");
+ writeStatus("\n");
+ writeStatus("==> GENERATING ASSEMBLY GRAPH.\n");
+ writeStatus("\n");
+
+ setLogFile(prefix, "assemblyGraph");
- unitigs.computeErrorProfiles(prefix, "unplaced");
- //unitigs.reportErrorProfiles(prefix, "unplaced");
+ contigs.computeErrorProfiles(prefix, "assemblyGraph");
+ contigs.reportErrorProfiles(prefix, "assemblyGraph");
- popBubbles(unitigs,
- deviationBubble);
+ AssemblyGraph *AG = new AssemblyGraph(prefix,
+ deviationRepeat,
+ contigs);
- //checkUnitigMembership(unitigs);
- reportOverlaps(unitigs, prefix, "popBubbles");
- reportUnitigs(unitigs, prefix, "popBubbles", genomeSize);
-#endif
+ AG->reportReadGraph(contigs, prefix, "initial");
//
// Detect and break repeats. Annotate each read with overlaps to reads not overlapping in the tig,
// project these regions back to the tig, and break unless there is a read spanning the region.
//
- setLogFile(prefix, "markRepeatReads");
+ writeStatus("\n");
+ writeStatus("==> BREAK REPEATS.\n");
+ writeStatus("\n");
- unitigs.computeErrorProfiles(prefix, "repeats");
+ setLogFile(prefix, "breakRepeats");
- markRepeatReads(unitigs, deviationRepeat, confusedAbsolute, confusedPercent);
+ contigs.computeErrorProfiles(prefix, "repeats");
- //checkUnitigMembership(unitigs);
- reportOverlaps(unitigs, prefix, "markRepeatReads");
- reportUnitigs(unitigs, prefix, "markRepeatReads", genomeSize);
+ markRepeatReads(AG, contigs, deviationRepeat, confusedAbsolute, confusedPercent);
+
+ //checkUnitigMembership(contigs);
+ reportOverlaps(contigs, prefix, "markRepeatReads");
+ reportTigs(contigs, prefix, "markRepeatReads", genomeSize);
//
- // Try to reassemble just the split repeats.
+ // Cleanup tigs. Break those that have gaps in them. Place contains again. For any read
+ // still unplaced, make it a singleton unitig.
//
-#if 0
- if (enableReconstructRepeats) {
- assert(enableShatterRepeats);
- setLogFile(prefix, "reconstructRepeats");
+ writeStatus("\n");
+ writeStatus("==> CLEANUP MISTAKES.\n");
+ writeStatus("\n");
- reconstructRepeats(unitigs, erateGraph);
+ setLogFile(prefix, "cleanupMistakes");
- //checkUnitigMembership(unitigs);
- reportOverlaps(unitigs, prefix, "reconstructRepeats");
- reportUnitigs(unitigs, prefix, "reconstructRepeats", genomeSize);
- }
-#endif
+ splitDiscontinuous(contigs, minOverlap);
+ promoteToSingleton(contigs);
+
+ writeStatus("\n");
+ writeStatus("==> CLEANUP GRAPH.\n");
+ writeStatus("\n");
+
+ AG->rebuildGraph(contigs);
+ AG->filterEdges(contigs);
+
+ writeStatus("\n");
+ writeStatus("==> GENERATE OUTPUTS.\n");
+ writeStatus("\n");
+
+ setLogFile(prefix, "generateOutputs");
+
+ classifyTigsAsUnassembled(contigs,
+ fewReadsNumber,
+ tooShortLength,
+ spanFraction,
+ lowcovFraction, lowcovDepth);
+
+ //checkUnitigMembership(contigs);
+ reportOverlaps(contigs, prefix, "final");
+ reportTigs(contigs, prefix, "final", genomeSize);
+
+ AG->reportReadGraph(contigs, prefix, "final");
+
+ delete AG;
+ AG = NULL;
//
- // Cleanup unitigs. Break those that have gaps in them. Place contains again. For any read
- // still unplaced, make it a singleton unitig.
+ // Generate outputs. The graph MUST come after output, because it needs
+ // the tigStore tigID.
//
- setLogFile(prefix, "cleanup");
+ setParentAndHang(contigs);
+ writeTigsToStore(contigs, prefix, "ctg", true);
+
+ vector<tigLoc> unitigSource; // Needed only to pass something to reportTigGraph.
+
+ setLogFile(prefix, "tigGraph");
- splitDiscontinuousUnitigs(unitigs, minOverlap);
+ reportTigGraph(contigs, unitigSource, prefix, "contigs");
- breakSingletonTigs(unitigs);
+ //
+ // Generate unitigs
+ //
+ // We want to split the contigs at any potential bubble, so this needs to be
+ // at least the 'bubble' deviation. We don't really want to split at confirmed
+ // repeats, but we have no way of telling repeat from bubble yet.
+ //
- //unitigs.computeErrorProfiles(prefix, "final");
- //unitigs.reportErrorProfiles(prefix, "final");
+ writeStatus("\n");
+ writeStatus("==> GENERATE UNITIGS.\n");
+ writeStatus("\n");
- //placeUnplacedUsingAllOverlaps(unitigs, prefix);
+ setLogFile(prefix, "generateUnitigs");
- promoteToSingleton(unitigs);
+ contigs.computeErrorProfiles(prefix, "generateUnitigs");
+ contigs.reportErrorProfiles(prefix, "generateUnitigs");
- classifyUnitigsAsUnassembled(unitigs,
- fewReadsNumber,
- tooShortLength,
- spanFraction,
- lowcovFraction, lowcovDepth);
+ AssemblyGraph *EG = new AssemblyGraph(prefix,
+ deviationBubble,
+ contigs,
+ true);
- //checkUnitigMembership(unitigs);
- reportOverlaps(unitigs, prefix, "final");
- reportUnitigs(unitigs, prefix, "final", genomeSize);
//
- // Generate outputs. The graph MUST come after output, because it needs
- // the tigStore tigID.
+ // We want some way of tracking unitigs that came from the same contig. Ideally,
+ // we'd be able to emit only the edges that would join unitigs into the original
+ // contig, but it's complicated by containments. For example:
+ //
+ // [----------------------------------] CONTIG
+ // ------------- UNITIG
+ // -------------------------- UNITIG
+ // ------- UNITIG
//
+ // So, instead, we just remember the set of unitigs that were created from each
+ // contig, and assume that any edge between those unitigs represents the contig.
+ // Which it totally doesn't -- any repeat in the contig collapses -- but is a
+ // good first attempt.
+ //
+
+ createUnitigs(EG, contigs, unitigs, unitigSource);
+
+ delete EG;
- setLogFile(prefix, "output");
+ splitDiscontinuous(unitigs, minOverlap, unitigSource);
setParentAndHang(unitigs);
- writeUnitigsToStore(unitigs, prefix, tigStorePath, fragment_count_target, true);
+ writeTigsToStore(unitigs, prefix, "utg", true);
- setLogFile(prefix, "graph");
+ setLogFile(prefix, "tigGraph");
- writeUnusedEdges(unitigs, prefix);
+ reportTigGraph(unitigs, unitigSource, prefix, "unitigs");
//
// Tear down bogart.
@@ -545,11 +556,12 @@ main (int argc, char * argv []) {
delete CG;
delete OG;
delete OC;
- delete FI;
+ delete RI;
setLogFile(prefix, NULL);
- writeLog("Bye.\n");
+ writeStatus("\n");
+ writeStatus("Bye.\n");
return(0);
}
diff --git a/src/bogart/bogart.mk b/src/bogart/bogart.mk
index c9dfba8..827863e 100644
--- a/src/bogart/bogart.mk
+++ b/src/bogart/bogart.mk
@@ -9,27 +9,28 @@ endif
TARGET := bogart
SOURCES := bogart.C \
+ AS_BAT_AssemblyGraph.C \
AS_BAT_BestOverlapGraph.C \
AS_BAT_ChunkGraph.C \
- AS_BAT_FragmentInfo.C \
+ AS_BAT_CreateUnitigs.C \
AS_BAT_Instrumentation.C \
AS_BAT_Logging.C \
AS_BAT_MarkRepeatReads.C \
- AS_BAT_MergeUnitigs.C \
+ AS_BAT_MergeOrphans.C \
AS_BAT_Outputs.C \
AS_BAT_OverlapCache.C \
AS_BAT_PlaceContains.C \
- AS_BAT_PlaceFragUsingOverlaps.C \
- AS_BAT_PopBubbles.C \
+ AS_BAT_PlaceReadUsingOverlaps.C \
AS_BAT_PopulateUnitig.C \
AS_BAT_PromoteToSingleton.C \
- AS_BAT_ReconstructRepeats.C \
+ AS_BAT_ReadInfo.C \
AS_BAT_SetParentAndHang.C \
AS_BAT_SplitDiscontinuous.C \
+ AS_BAT_TigGraph.C \
+ AS_BAT_TigVector.C \
AS_BAT_Unitig.C \
- AS_BAT_UnitigVector.C \
- AS_BAT_Unitig_AddFrag.C \
- AS_BAT_Unitig_PlaceFragUsingEdges.C
+ AS_BAT_Unitig_AddRead.C \
+ AS_BAT_Unitig_PlaceReadUsingEdges.C
SRC_INCDIRS := .. ../AS_UTL ../stores
diff --git a/src/bogart/buildGraph.C b/src/bogart/buildGraph.C
deleted file mode 100644
index b58222d..0000000
--- a/src/bogart/buildGraph.C
+++ /dev/null
@@ -1,403 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Brian P. Walenz beginning on 2016-FEB-12
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "AS_global.H"
-#include "gkStore.H"
-#include "tgStore.H"
-
-#include "splitToWords.H"
-
-#include <vector>
-using namespace std;
-
-
-
-class bestRead {
-public:
- bestRead() {
- };
- ~bestRead() {
- };
-
- uint32 tigID;
- char tigType;
- uint32 readID;
- uint32 readBgn;
- uint32 readEnd;
-};
-
-
-class bestEdge {
-public:
- bestEdge() {
- };
- ~bestEdge() {
- };
-
- bestRead fr;
- int32 ahang;
- int32 bhang;
- bestRead to;
-
- bool flipped;
-};
-
-
-void
-loadEdges(char *edgeName, vector<bestEdge> &edges, set<uint32> &vertices) {
-
- errno = 0;
- FILE *edgeFile = fopen(edgeName, "r");
- if (errno)
- fprintf(stderr, "Failed to open '%s' for reading: %s\n", edgeName, strerror(errno));
-
- char edgeLine[1024];
-
- fgets(edgeLine, 1024, edgeFile);
-
- while (!feof(edgeFile)) {
- splitToWords W(edgeLine);
- bestEdge E;
- uint32 w = 0;
-
- assert(W[w++][0] == 't'); // 'tig'
- E.fr.tigID = W(w++); // tigID
- E.fr.tigType = W[w++][0]; // tig type 'R', 'N', ...
- assert(W[w++][0] == 'r'); // 'read'
- E.fr.readID = W(w++); // readID
- assert(W[w++][0] == 'a'); // 'at'
- E.fr.readBgn = W(w++); // bgn-position
- E.fr.readEnd = W(w++); // end-position
-
- E.ahang = W(w++); // a-hang
- E.flipped = (W[w++][0] == '<'); // '<' if flipped, '>' if normal
- E.bhang = W(w++); // b-hang
-
- assert(W[w++][0] == 't'); // 'tig'
- E.to.tigID = W(w++); // tigID
- E.fr.tigType = W[w++][0]; // tig type
- assert(W[w++][0] == 'r'); // 'read'
- E.to.readID = W(w++); // readID
- assert(W[w++][0] == 'a'); // 'at'
- E.to.readBgn = W(w++); // bgn-position
- E.to.readEnd = W(w++); // end-position
-
- edges.push_back(E);
-
- vertices.insert(E.fr.tigID);
- vertices.insert(E.to.tigID);
-
- fgets(edgeLine, 1024, edgeFile);
- }
-
- fprintf(stderr, "Loaded "F_SIZE_T" edges from '%s'.\n", edges.size(), edgeName);
-}
-
-
-
-
-tgPosition *
-findRead(tgTig *tig, uint32 id) {
- uint32 rr = 0;
- tgPosition *rd = NULL;
-
- do {
- rd = tig->getChild(rr++);
- } while ((rr < tig->numberOfChildren()) && (rd->ident() != id));
-
- if (rd->ident() != id) {
- fprintf(stderr, "WARNING: failed to find read %u in tig %u - ejected?\n", id, tig->tigID());
- rd = NULL;
- }
-
- return(rd);
-}
-
-
-
-
-int
-main(int argc, char **argv) {
- char *gkpName = NULL;
- char *tigName = NULL;
- int32 tigVers = -1;
- char *edgesName = NULL;
- char *graphName = NULL;
-
- argc = AS_configure(argc, argv);
-
- vector<char *> err;
- int arg = 1;
- while (arg < argc) {
- if (strcmp(argv[arg], "-G") == 0) {
- gkpName = argv[++arg];
-
- } else if (strcmp(argv[arg], "-T") == 0) {
- tigName = argv[++arg];
- tigVers = atoi(argv[++arg]);
-
- } else if (strcmp(argv[arg], "-E") == 0) {
- edgesName = argv[++arg];
-
- } else if (strcmp(argv[arg], "-o") == 0) {
- graphName = argv[++arg];
-
- } else {
- char *s = new char [1024];
- sprintf(s, "Unknown option '%s'.\n", argv[arg]);
- err.push_back(s);
- }
-
- arg++;
- }
-
- if (gkpName == NULL)
- err.push_back("No gatekeeper store (-G option) supplied.\n");
- if ((tigName == NULL) || (tigVers == -1))
- err.push_back("No tigStore store (-T option) supplied.\n");
- if (edgesName == NULL)
- err.push_back("No edges file (-E option) supplied.\n");
- if (graphName == NULL)
- err.push_back("No output graph file (-o option) supplied.\n");
-
- if (err.size() > 0) {
- fprintf(stderr, "usage: %s -G gkpStore -T tigStore tigVersion -E edgesFile ...\n", argv[0]);
- fprintf(stderr, " -G gkpStore path to gkpStore\n");
- fprintf(stderr, " -T tigStore version path to tigStore\n");
- fprintf(stderr, " -E edgeFile path to bogart unused-edges file\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " -o graph.gfa write to 'graph.gfa'\n");
- fprintf(stderr, "\n");
-
- for (uint32 ii=0; ii<err.size(); ii++)
- if (err[ii])
- fputs(err[ii], stderr);
-
- exit(1);
- }
-
- // Open output.
-
- errno = 0;
- FILE *graph = fopen(graphName, "w");
- if (errno)
- fprintf(stderr, "Failed to open output graph '%s': %s\n", graphName, strerror(errno)), exit(1);
-
- // Open inputs, load the graph.
-
- gkStore *gkpStore = gkStore::gkStore_open(gkpName);
- tgStore *tigStore = new tgStore(tigName, tigVers);
-
- vector<bestEdge> edges;
- set<uint32> vertices;
-
- loadEdges(edgesName, edges, vertices);
-
- // Dump vertcies.
-
- fprintf(graph, "H\tVN:Z:canu\n");
-
- for (uint32 tt=0; tt<tigStore->numTigs(); tt++) {
- tgTig *tig = tigStore->loadTig(tt);
-
- if (vertices.count(tig->tigID()) > 0)
- fprintf(graph, "S\ttig%08u\t*\tLN:i:%u\tRC:i:%u\n", tig->tigID(), tig->length(), tig->numberOfChildren());
-
- tigStore->unloadTig(tt, true);
- }
-
- // Dump graph.
-
- char *cigar = new char [1024 * 1024];
-
- uint32 nEdgesUnassembled = 0;
-
- for (uint32 ee=0; ee<edges.size(); ee++) {
-
- // Get the tigs for this edge, ignore if either is unassembled.
-
- tgTig *frTig = tigStore->loadTig(edges[ee].fr.tigID);
- uint32 frTigID = frTig->tigID();
-
- tgTig *toTig = tigStore->loadTig(edges[ee].to.tigID);
- uint32 toTigID = toTig->tigID();
-
- assert(frTig->_class != tgTig_noclass);
- assert(toTig->_class != tgTig_noclass);
-
- if ((frTig->_class == tgTig_unassembled) ||
- (toTig->_class == tgTig_unassembled)) {
- nEdgesUnassembled++;
- continue;
- }
-
- // Find the reads we're using to anchor the tigs together.
-
- tgPosition *frRead = findRead(frTig, edges[ee].fr.readID);
- tgPosition *toRead = findRead(toTig, edges[ee].to.readID);
-
- if ((frRead == NULL) ||
- (toRead == NULL))
- continue;
-
- // Map coordinates from gapped to ungapped.
-
- uint32 frReadMin = frTig->mapGappedToUngapped(frRead->min());
- uint32 frReadMax = frTig->mapGappedToUngapped(frRead->max());
- uint32 frLen = frTig->length(false);
-
- uint32 toReadMin = toTig->mapGappedToUngapped(toRead->min());
- uint32 toReadMax = toTig->mapGappedToUngapped(toRead->max());
- uint32 toLen = toTig->length(false);
-
- //
- // Convert from a read-read overlap to a tig-tig overlap.
- //
-
- // Orient tigs based oin the read orientation.
- // For 'fr', we require that the read always be forward.
- // For 'to', the overlap dictates the orientation.
-
- bool frTigFwd = frRead->isForward(); // Tig forward if read forward.
- bool toTigFwd = toRead->isForward(); // Same, unless...
-
- if (edges[ee].flipped == true) // ...edge is flipped, so flip
- toTigFwd = !toTigFwd; // 'to' tig.
-
- // Cleanup. Makes skipping an edge much easier.
-
- tigStore->unloadTig(frTigID, true); frRead = NULL;
- tigStore->unloadTig(toTigID, true); toRead = NULL;
-
- // Based on tig orientation, find the bgn and end lengths from each read.
-
- int32 frBgn = (frTigFwd) ? ( frReadMin) : (frLen - frReadMax);
- int32 frEnd = (frTigFwd) ? (frLen - frReadMax) : ( frReadMin);
-
- int32 toBgn = (toTigFwd) ? ( toReadMin) : (toLen - toReadMax);
- int32 toEnd = (toTigFwd) ? (toLen - toReadMax) : ( toReadMin);
-
- //fprintf(graph, "hangs0- fr %d-%d to %d-%d ahang %d bhang %d\n",
- // frBgn, frEnd, toBgn, toEnd, edges[ee].ahang, edges[ee].bhang);
-
- // Apply the overlap hangs to find the overlapping regions on the tigs.
-
- if (edges[ee].ahang < 0)
- toBgn += -edges[ee].ahang;
- else
- frBgn += edges[ee].ahang;
-
- if (edges[ee].bhang < 0)
- frEnd += -edges[ee].bhang;
- else
- toEnd += edges[ee].bhang;
-
- //fprintf(graph, "hangs1- fr %d-%d to %d-%d\n",
- // frBgn, frEnd, toBgn, toEnd);
-
- // The overlap is now between regions toBgn-toEnd and frBgn-frEnd. Extend this to cover the ends of each tig.
- //
- // ------------------------------------------
- // +++ ||| olap ||| +++
- // -------------------------------
-
- if (toBgn < frBgn) {
- toBgn -= toBgn;
- frBgn -= toBgn;
- } else {
- toBgn -= frBgn;
- frBgn -= frBgn;
- }
-
- if (toEnd < frEnd) {
- toEnd -= toEnd;
- frEnd -= toEnd;
- } else {
- toEnd -= frEnd;
- frEnd -= frEnd;
- }
-
- //fprintf(graph, "hangs2- fr %d-%d to %d-%d\n",
- // frBgn, frEnd, toBgn, toEnd);
-
- // Compute the alignment between the two regions, and convert to a cigar string.
-
- frLen -= (frBgn + frEnd);
- toLen -= (toBgn + toEnd);
-
- sprintf(cigar, "%dM", (frLen + toLen) / 2); // Used to be 'm', Bandage complained about it not being 'M'.
-
- // The overlap should now have one of:
- // frBgn == toEnd == 0 -- to has an overlap to fr
- // frEnd == toBgn == 0 -- fr has an overlap to to
- //
- // If not, the overlap is inconsistent with the tigs; it implies the two tigs overlap in their
- // entirety.
-
- // GFA requires that the overlap be between the end of the first read and the start of the second read.
- // Flip the order if needed.
-
- if (frTigID == toTigID) {
- fprintf(stderr, "L\ttig%08u\t%c\ttig%08d\t%c\t%s circular\n",
- frTigID, (frTigFwd) ? '+' : '-',
- toTigID, (toTigFwd) ? '+' : '-',
- cigar);
- continue;
- }
-
- if ((toBgn == 0) && (frEnd == 0)) {
- fprintf(graph, "L\ttig%08u\t%c\ttig%08d\t%c\t%s\n",
- frTigID, (frTigFwd) ? '+' : '-',
- toTigID, (toTigFwd) ? '+' : '-',
- cigar);
- continue;
- }
-
- if ((frBgn == 0) && (toEnd == 0)) {
- fprintf(graph, "L\ttig%08u\t%c\ttig%08d\t%c\t%s\n",
- toTigID, (toTigFwd) ? '+' : '-',
- frTigID, (frTigFwd) ? '+' : '-',
- cigar);
- continue;
- }
-
- // Inconsistent edge.
-
- fprintf(stderr, "L\ttig%08u\t%c\ttig%08d\t%c\t%s inconsistent\n",
- frTigID, (frTigFwd) ? '+' : '-',
- toTigID, (toTigFwd) ? '+' : '-',
- cigar);
- }
-
- edges.clear(); // Make valgrind slightly happier.
- vertices.clear();
-
- delete [] cigar;
- delete tigStore;
-
- gkpStore->gkStore_close();
-
- exit(0);
-}
diff --git a/src/bogart/findOverlappingReads.pl b/src/bogart/findOverlappingReads.pl
new file mode 100644
index 0000000..bc5955d
--- /dev/null
+++ b/src/bogart/findOverlappingReads.pl
@@ -0,0 +1,120 @@
+#!/usr/bin/env perl
+
+###############################################################################
+ #
+ # This file is part of canu, a software program that assembles whole-genome
+ # sequencing reads into contigs.
+ #
+ # This software is based on:
+ # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ # the 'kmer package' (http://kmer.sourceforge.net)
+ # both originally distributed by Applera Corporation under the GNU General
+ # Public License, version 2.
+ #
+ # Canu branched from Celera Assembler at its revision 4587.
+ # Canu branched from the kmer project at its revision 1994.
+ #
+ # Modifications by:
+ #
+ # Brian P. Walenz beginning on 2016-AUG-05
+ # are a 'United States Government Work', and
+ # are released in the public domain
+ #
+ # File 'README.licenses' in the root directory of this distribution contains
+ # full conditions and disclaimers for each license.
+ ##
+
+use strict;
+
+if (scalar(@ARGV) == 0) {
+ die "usage: $0 assembly-prefix readID [tigStore-bogart-stage]\n";
+}
+
+my $prefix = shift @ARGV;
+my $readID = shift @ARGV;
+my $store = shift @ARGV;
+
+my $gkpStore = "$prefix.gkpStore";
+my $ovlStore = "$prefix.ovlStore";
+my $tigStore = "$prefix.tigStore";
+my $tigVers = 1;
+
+$tigStore = "$prefix.003.buildUnitigs.tigStore" if ($store eq "003");
+$tigStore = "$prefix.004.placeContains.tigStore" if ($store eq "004");
+$tigStore = "$prefix.005.mergeOrphans.tigStore" if ($store eq "005");
+$tigStore = "$prefix.007.breakRepeats.tigStore" if ($store eq "007");
+$tigStore = "$prefix.009.generateOutputs.tigStore" if ($store eq "009");
+
+$gkpStore = "../$gkpStore" if (! -d $gkpStore);
+$gkpStore = "../$gkpStore" if (! -d $gkpStore);
+$gkpStore = "../$gkpStore" if (! -d $gkpStore);
+
+$ovlStore = "../$ovlStore" if (! -d $ovlStore);
+$ovlStore = "../$ovlStore" if (! -d $ovlStore);
+$ovlStore = "../$ovlStore" if (! -d $ovlStore);
+
+$tigStore = "../$tigStore" if (! -d $tigStore);
+$tigStore = "../$tigStore" if (! -d $tigStore);
+$tigStore = "../$tigStore" if (! -d $tigStore);
+
+die "failed to find gkpStore $prefix.gkpStore" if (! -d $gkpStore);
+die "failed to find ovlStore $prefix.ovlStore" if (! -d $ovlStore);
+die "failed to find tigStore $prefix.tigStore" if (! -d $tigStore);
+
+
+my %readOvl;
+
+my $nOvl = 0;
+my $nTig = 0;
+
+open(F, "ovStoreDump -G $gkpStore -O $ovlStore -p $readID |");
+while (<F>) {
+ chomp;
+
+ # For -d dumps
+ if (m/^\s*\d+\s+(\d+)\s+/) {
+ $nOvl++;
+ $readOvl{$1} = $_;
+ }
+
+ # For -p dumps
+ if (m/^\s*(\d+)\s+A:\s+\d+\s+/) {
+ $nOvl++;
+ $readOvl{$1} = $_;
+ }
+}
+close(F);
+
+
+system("ovStoreDump -G $gkpStore -O $ovlStore -p $readID");
+print "\n";
+
+
+my $tig;
+my $len;
+my $num;
+
+open(F, "tgStoreDump -G $gkpStore -T $tigStore $tigVers -layout |");
+while (<F>) {
+ chomp;
+
+ $tig = $1 if (m/^tig\s+(\d+)$/);
+ $len = $1 if (m/^len\s+(\d+)$/);
+ $num = $1 if (m/^numChildren\s+(\d+)$/);
+
+ if (m/^read\s+(\d+)\s+/) {
+ my $r = $1;
+
+ if ($r == $readID) {
+ print "tig $tig len $len -- $_\n";
+ } elsif (exists($readOvl{$1})) {
+ $nTig++;
+ printf "tig %6d len %8d -- %s -- %s\n", $tig, $len, $_, $readOvl{$1};
+ }
+ }
+}
+close(F);
+
+print "\n";
+print STDERR "Found $nOvl overlaps in $ovlStore (with $gkpStore).\n";
+print STDERR "Found $nTig placements in $tigStore.\n";
diff --git a/src/pipelines/sanity/build-all-kmer-revisions.pl b/src/bogart/plotErrorProfile.pl
similarity index 51%
rename from src/pipelines/sanity/build-all-kmer-revisions.pl
rename to src/bogart/plotErrorProfile.pl
index 07e20a1..2c09596 100644
--- a/src/pipelines/sanity/build-all-kmer-revisions.pl
+++ b/src/bogart/plotErrorProfile.pl
@@ -16,7 +16,7 @@
#
# Modifications by:
#
- # Brian P. Walenz beginning on 2015-OCT-12
+ # Brian P. Walenz beginning on 2016-JUN-06
# are a 'United States Government Work', and
# are released in the public domain
#
@@ -26,25 +26,45 @@
use strict;
-# Given a local rsync'd copy of the repository, check out all versions and compile.
+my $nn = shift @ARGV;
+my $pn = shift @ARGV;
-my $kmersvn = "/work/NIGHTLY/kmer-svn"
+die "usage: $0 <prefix> <plot-number>\n" if (!defined($pn));
-open(F, "< $kmersvn/db/current");
-my $latest = <F>;
-chomp $latest;
-close(F);
+my $name = "$nn." . substr("00000000$pn", -8) . ".profile";
+print "Plotting '$nn' '$pn' - '$name'\n";
-for (my $i=1917; $i<=$latest; $i++) {
- if (! -d "kmer$i") {
- print "Check out r$i\n";
- system("mkdir kmer$i");
- system("cd kmer$i && svn co -r $i file://$kmersvn/trunk . > kmer-checkout.err 2>&1");
- }
+my $lastX = 0;
+my $lastY = 0;
+
+open(O, "> $name.dat") or die;
+open(F, "< $name") or die;
+while (<F>) {
+ if (m/^(\d+)\s+(\d+)\s+(\d+.\d+)\s+\+-\s+(\d+.\d+)\s\(\d+\s+overlaps\)/) {
+ print O "$lastX\t$lastY\n";
+ print O "$1\t$3\n";
+ print O "\n";
- if (! -d "kmer$i/FreeBSD-amd64") {
- print "Compile r$i\n";
- system("cd kmer$i && gmake install > kmer-build.err 2>& 1 &");
+ print O "$1\t$3\n";
+ print O "$2\t$3\n";
+ print O "\n";
+
+ $lastX = $2;
+ $lastY = $3;
}
}
+close(F);
+
+print O "$lastX\t$lastY\n";
+print O "$lastX\t0\n";
+print O "\n";
+
+close(O);
+
+open(O, "> $name.gp");
+print O "plot '$name.dat' with lines\n";
+print O "pause -1\n";
+close(O);
+
+system("gnuplot $name.gp");
diff --git a/src/bogus/bogus.C b/src/bogus/bogus.C
index 67afd34..d03f6b9 100644
--- a/src/bogus/bogus.C
+++ b/src/bogus/bogus.C
@@ -537,14 +537,14 @@ main(int argc, char **argv) {
errno = 0;
- sprintf(outputName, "%s.intervals", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.intervals", outputPrefix);
intervalOutput = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n",
outputName, strerror(errno)), exit(1);
- sprintf(outputName, "%s.gff3", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.gff3", outputPrefix);
gffOutput = fopen(outputName, "w");
if (errno)
@@ -645,14 +645,14 @@ main(int argc, char **argv) {
//assert(refcnt != 0);
if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
- fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tREPT\t"F_S64"%s\n",
+ fprintf(intervalOutput, "%s\t%8" F_S64P "\t%8" F_S64P "\tREPT\t" F_S64 "%s\n",
refhdr, refbgn, refend, refcnt, (REPTvalid[ir]) ? "" : " weak");
if (REPTvalid[ir])
- fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=REPT%04d;fragCount="F_S64"\n",
+ fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t" F_S64 "\t" F_S64 "\t.\t.\t.\tID=REPT%04d;fragCount=" F_S64 "\n",
refhdr, refbgn, refend, ir, refcnt);
else
- fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=UNIQ%04d;fragCount="F_S64"\n",
+ fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t" F_S64 "\t" F_S64 "\t.\t.\t.\tParent=UNIQ%04d;fragCount=" F_S64 "\n",
refhdr, refbgn, refend, REPTvalidParent[ir], refcnt);
}
@@ -684,14 +684,14 @@ main(int argc, char **argv) {
//assert(refcnt != 0);
if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
- fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tUNIQ\t"F_S64"%s\n",
+ fprintf(intervalOutput, "%s\t%8" F_S64P "\t%8" F_S64P "\tUNIQ\t" F_S64 "%s\n",
refhdr, refbgn, refend, refcnt, (UNIQvalid[iu]) ? "" : " separation");
if (UNIQvalid[iu])
- fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=UNIQ%04d;fragCount="F_S64"\n",
+ fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t" F_S64 "\t" F_S64 "\t.\t.\t.\tID=UNIQ%04d;fragCount=" F_S64 "\n",
refhdr, refbgn, refend, iu, refcnt);
else
- fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=REPT%04d;fragCount="F_S64"\n",
+ fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t" F_S64 "\t" F_S64 "\t.\t.\t.\tParent=REPT%04d;fragCount=" F_S64 "\n",
refhdr, refbgn, refend, UNIQvalidParent[iu], refcnt);
}
diff --git a/src/bogus/bogusness.C b/src/bogus/bogusness.C
index 9793903..c30e8b8 100644
--- a/src/bogus/bogusness.C
+++ b/src/bogus/bogusness.C
@@ -385,14 +385,14 @@ main(int argc, char **argv) {
errno = 0;
- sprintf(outputName, "%s.bogusness", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.bogusness", outputPrefix);
resultsOutput = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n",
outputName, strerror(errno)), exit(1);
- sprintf(outputName, "%s.gff3", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.gff3", outputPrefix);
gffOutput = fopen(outputName, "w");
if (errno)
diff --git a/src/correction/filterCorrectionOverlaps.C b/src/correction/filterCorrectionOverlaps.C
index dedb270..31bfbc5 100644
--- a/src/correction/filterCorrectionOverlaps.C
+++ b/src/correction/filterCorrectionOverlaps.C
@@ -164,8 +164,8 @@ main(int argc, char **argv) {
uint64 *scores = new uint64 [gkpStore->gkStore_getNumReads() + 1];
- sprintf(logFileName, "%s.log", scoreFileName);
- sprintf(statsFileName, "%s.stats", scoreFileName);
+ snprintf(logFileName, FILENAME_MAX, "%s.log", scoreFileName);
+ snprintf(statsFileName, FILENAME_MAX, "%s.stats", scoreFileName);
errno = 0;
FILE *scoreFile = (scoreFileName == NULL) ? NULL : fopen(scoreFileName, "w");
@@ -360,8 +360,8 @@ main(int argc, char **argv) {
fprintf(statsFile, "PARAMETERS:\n");
fprintf(statsFile, "----------\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%7"F_U32P" (expected coverage)\n", expectedCoverage);
- fprintf(statsFile, "%7"F_U32P" (don't use overlaps shorter than this)\n", minOvlLength);
+ fprintf(statsFile, "%7" F_U32P " (expected coverage)\n", expectedCoverage);
+ fprintf(statsFile, "%7" F_U32P " (don't use overlaps shorter than this)\n", minOvlLength);
fprintf(statsFile, "%7.3f (don't use overlaps with erate less than this)\n", minErate);
fprintf(statsFile, "%7.3f (don't use overlaps with erate more than this)\n", maxErate);
fprintf(statsFile, "\n");
@@ -370,32 +370,32 @@ main(int argc, char **argv) {
fprintf(statsFile, "\n");
fprintf(statsFile, "IGNORED:\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%12"F_U64P" (< %6.4f fraction error)\n", lowErate, AS_OVS_decodeEvalue(minEvalue));
- fprintf(statsFile, "%12"F_U64P" (> %6.4f fraction error)\n", highErate, AS_OVS_decodeEvalue(maxEvalue));
- fprintf(statsFile, "%12"F_U64P" (< %u bases long)\n", tooShort, minOvlLength);
- fprintf(statsFile, "%12"F_U64P" (> %u bases long)\n", tooLong, maxOvlLength);
+ fprintf(statsFile, "%12" F_U64P " (< %6.4f fraction error)\n", lowErate, AS_OVS_decodeEvalue(minEvalue));
+ fprintf(statsFile, "%12" F_U64P " (> %6.4f fraction error)\n", highErate, AS_OVS_decodeEvalue(maxEvalue));
+ fprintf(statsFile, "%12" F_U64P " (< %u bases long)\n", tooShort, minOvlLength);
+ fprintf(statsFile, "%12" F_U64P " (> %u bases long)\n", tooLong, maxOvlLength);
fprintf(statsFile, "\n");
fprintf(statsFile, "FILTERED:\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%12"F_U64P" (too many overlaps, discard these shortest ones)\n", belowCutoff);
+ fprintf(statsFile, "%12" F_U64P " (too many overlaps, discard these shortest ones)\n", belowCutoff);
fprintf(statsFile, "\n");
fprintf(statsFile, "EVIDENCE:\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%12"F_U64P" (longest overlaps)\n", retained);
+ fprintf(statsFile, "%12" F_U64P " (longest overlaps)\n", retained);
fprintf(statsFile, "\n");
fprintf(statsFile, "TOTAL:\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%12"F_U64P" (all overlaps)\n", totalOverlaps);
+ fprintf(statsFile, "%12" F_U64P " (all overlaps)\n", totalOverlaps);
fprintf(statsFile, "\n");
fprintf(statsFile, "READS:\n");
fprintf(statsFile, "-----\n");
fprintf(statsFile, "\n");
- fprintf(statsFile, "%12"F_U64P" (no overlaps)\n", readsNoOlaps);
- fprintf(statsFile, "%12"F_U64P" (no overlaps filtered)\n", reads00OlapsFiltered);
- fprintf(statsFile, "%12"F_U64P" (< 50%% overlaps filtered)\n", reads50OlapsFiltered);
- fprintf(statsFile, "%12"F_U64P" (< 80%% overlaps filtered)\n", reads80OlapsFiltered);
- fprintf(statsFile, "%12"F_U64P" (< 95%% overlaps filtered)\n", reads95OlapsFiltered);
- fprintf(statsFile, "%12"F_U64P" (< 100%% overlaps filtered)\n", reads99OlapsFiltered);
+ fprintf(statsFile, "%12" F_U64P " (no overlaps)\n", readsNoOlaps);
+ fprintf(statsFile, "%12" F_U64P " (no overlaps filtered)\n", reads00OlapsFiltered);
+ fprintf(statsFile, "%12" F_U64P " (< 50%% overlaps filtered)\n", reads50OlapsFiltered);
+ fprintf(statsFile, "%12" F_U64P " (< 80%% overlaps filtered)\n", reads80OlapsFiltered);
+ fprintf(statsFile, "%12" F_U64P " (< 95%% overlaps filtered)\n", reads95OlapsFiltered);
+ fprintf(statsFile, "%12" F_U64P " (< 100%% overlaps filtered)\n", reads99OlapsFiltered);
fprintf(statsFile, "\n");
fclose(statsFile);
diff --git a/src/correction/generateCorrectionLayouts.C b/src/correction/generateCorrectionLayouts.C
index c8e39f4..9067510 100644
--- a/src/correction/generateCorrectionLayouts.C
+++ b/src/correction/generateCorrectionLayouts.C
@@ -83,7 +83,7 @@ generateLayout(gkStore *gkpStore,
resizeArray(layout->_children, layout->_childrenLen, layout->_childrenMax, ovlLen, resizeArray_doNothing);
if (flgFile)
- fprintf(flgFile, "Generate layout for read "F_U32" length "F_U32" using up to "F_U32" overlaps.\n",
+ fprintf(flgFile, "Generate layout for read " F_U32 " length " F_U32 " using up to " F_U32 " overlaps.\n",
layout->_tigID, layout->_layoutLen, ovlLen);
for (uint32 oo=0; oo<ovlLen; oo++) {
@@ -106,14 +106,14 @@ generateLayout(gkStore *gkpStore,
if (ovl[oo].erate() > maxEvidenceErate) {
if (flgFile)
- fprintf(flgFile, " filter read %9u at position %6u,%6u length %5u erate %.3f - low quality (threshold %.2f)\n",
+ fprintf(flgFile, " filter read %9u at position %6u,%6u length %5lu erate %.3f - low quality (threshold %.2f)\n",
ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), maxEvidenceErate);
continue;
}
if (ovl[oo].a_end() - ovl[oo].a_bgn() < minEvidenceLength) {
if (flgFile)
- fprintf(flgFile, " filter read %9u at position %6u,%6u length %5u erate %.3f - too short (threshold %u)\n",
+ fprintf(flgFile, " filter read %9u at position %6u,%6u length %5lu erate %.3f - too short (threshold %u)\n",
ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), minEvidenceLength);
continue;
}
@@ -121,20 +121,20 @@ generateLayout(gkStore *gkpStore,
if ((readScores != NULL) &&
(ovlScore < readScores[ovl[oo].b_iid])) {
if (flgFile)
- fprintf(flgFile, " filter read %9u at position %6u,%6u length %5u erate %.3f - filtered by global filter (threshold "F_U64")\n",
+ fprintf(flgFile, " filter read %9u at position %6u,%6u length %5lu erate %.3f - filtered by global filter (threshold " F_U64 ")\n",
ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate(), readScores[ovl[oo].b_iid]);
continue;
}
if (children.find(ovl[oo].b_iid) != children.end()) {
if (flgFile)
- fprintf(flgFile, " filter read %9u at position %6u,%6u length %5u erate %.3f - duplicate\n",
+ fprintf(flgFile, " filter read %9u at position %6u,%6u length %5lu erate %.3f - duplicate\n",
ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate());
continue;
}
if (flgFile)
- fprintf(flgFile, " allow read %9u at position %6u,%6u length %5u erate %.3f\n",
+ fprintf(flgFile, " allow read %9u at position %6u,%6u length %5lu erate %.3f\n",
ovl[oo].b_iid, ovl[oo].a_bgn(), ovl[oo].a_end(), ovlLength, ovl[oo].erate());
tgPosition *pos = layout->addChild();
@@ -359,7 +359,7 @@ main(int argc, char **argv) {
// Threshold the range of reads to operate on.
if (gkpStore->gkStore_getNumReads() < iidMin) {
- fprintf(stderr, "ERROR: only "F_U32" reads in the store (IDs 0-"F_U32" inclusive); can't process requested range -b "F_U32" -e "F_U32"\n",
+ fprintf(stderr, "ERROR: only " F_U32 " reads in the store (IDs 0-" F_U32 " inclusive); can't process requested range -b " F_U32 " -e " F_U32 "\n",
gkpStore->gkStore_getNumReads(),
gkpStore->gkStore_getNumReads()-1,
iidMin, iidMax);
@@ -401,9 +401,9 @@ main(int argc, char **argv) {
// Open logging and summary files
if (outputPrefix) {
- sprintf(logName, "%s.log", outputPrefix);
- sprintf(sumName, "%s.summary", outputPrefix);
- sprintf(flgName, "%s.filter.log", outputPrefix);
+ snprintf(logName, FILENAME_MAX, "%s.log", outputPrefix);
+ snprintf(sumName, FILENAME_MAX, "%s.summary", outputPrefix);
+ snprintf(flgName, FILENAME_MAX, "%s.filter.log", outputPrefix);
errno = 0;
diff --git a/src/erateEstimate/erateEstimate.C b/src/erateEstimate/erateEstimate.C
index 9c0b447..2be7216 100644
--- a/src/erateEstimate/erateEstimate.C
+++ b/src/erateEstimate/erateEstimate.C
@@ -227,12 +227,12 @@ saveProfile(uint32 iid,
uint32 iter,
readErrorEstimate *readProfile) {
char N[FILENAME_MAX];
- sprintf(N, "erate-%08u-%02u.dat", iid, iter);
+ snprintf(N, FILENAME_MAX, "erate-%08u-%02u.dat", iid, iter);
FILE *F = fopen(N, "w");
for (uint32 pp=0; pp<readProfile[iid].seqLen; pp++)
- fprintf(F, "%u %7.4f\n", pp, AS_OVS_decodeEvalue(readProfile[iid].errorMeanS[pp]));
+ fprintf(F, "" F_U32 " %7.4f\n", pp, AS_OVS_decodeEvalue(readProfile[iid].errorMeanS[pp]));
fclose(F);
@@ -297,7 +297,7 @@ recomputeErrorProfile(gkStore *gkpStore,
uint64 nDiscard = 0;
uint64 nRemain = 0;
- fprintf(stderr, "Processing from IID "F_U32" to "F_U32" out of "F_U32" reads, iteration %u.\n",
+ fprintf(stderr, "Processing from IID " F_U32 " to " F_U32 " out of " F_U32 " reads, iteration " F_U32 ".\n",
iidMin,
iidMin + numIIDs,
gkpStore->gkStore_getNumReads(),
@@ -375,7 +375,7 @@ recomputeErrorProfile(gkStore *gkpStore,
// Keep users entertained.
if ((iid % 1000) == 0)
- fprintf(stderr, "IID %u\r", iid);
+ fprintf(stderr, "IID " F_U32 "\r", iid);
}
// All new estimates are computed. Convert the array of mean error per base into an array of
@@ -394,9 +394,9 @@ recomputeErrorProfile(gkStore *gkpStore,
// Report stats.
fprintf(stderr, "\n");
- fprintf(stderr, "nDiscarded "F_U64" (in previous iterations)\n", nDiscarded);
- fprintf(stderr, "nDiscard "F_U64" (in this iteration)\n", nDiscard);
- fprintf(stderr, "nRemain "F_U64"\n", nRemain);
+ fprintf(stderr, "nDiscarded " F_U64 " (in previous iterations)\n", nDiscarded);
+ fprintf(stderr, "nDiscard " F_U64 " (in this iteration)\n", nDiscard);
+ fprintf(stderr, "nRemain " F_U64 "\n", nRemain);
}
@@ -410,9 +410,9 @@ outputOverlaps(gkStore *gkpStore,
uint32 iidMin,
uint32 numIIDs,
char *ovlStoreName,
- uint64 *overlapIndex,
+ uint64 *UNUSED(overlapIndex),
ESToverlap *overlaps,
- readErrorEstimate *readProfile,
+ readErrorEstimate *UNUSED(readProfile),
char *outputName) {
uint64 nDiscarded = 0;
uint64 nRemain = 0;
@@ -420,12 +420,12 @@ outputOverlaps(gkStore *gkpStore,
// Open the original and output stores. We copy overlaps from the original to the copy, instead
// of recreating overlaps from our cache. The cache doesn't have all the overlap information.
- ovStore *inpStore = new ovStore(ovlStoreName, gkpStore);
- ovStore *outStore = new ovStore(outputName, gkpStore, ovStoreWrite);
+ ovStore *inpStore = new ovStore(ovlStoreName, gkpStore);
+ ovStoreWriter *outStore = new ovStoreWriter(outputName, gkpStore);
uint64 numOvls = inpStore->numOverlapsInRange();
- fprintf(stderr, "Processing from IID "F_U32" to "F_U32" out of "F_U32" reads.\n",
+ fprintf(stderr, "Processing from IID " F_U32 " to " F_U32 " out of " F_U32 " reads.\n",
iidMin,
iidMin + numIIDs,
gkpStore->gkStore_getNumReads());
@@ -459,7 +459,7 @@ outputOverlaps(gkStore *gkpStore,
}
if ((no & 0x000fffff) == 0)
- fprintf(stderr, " overlap %10"F_U64P" %8"F_U32P"-%8"F_U32P"\r", no, a_iid, b_iid);
+ fprintf(stderr, " overlap %10" F_U64P " %8" F_U32P "-%8" F_U32P "\r", no, a_iid, b_iid);
}
}
@@ -489,7 +489,7 @@ outputOverlaps(gkStore *gkpStore,
}
if ((iid % 1000) == 0)
- fprintf(stderr, "IID %u\r", iid);
+ fprintf(stderr, "IID " F_U32 "\r", iid);
}
#endif
@@ -497,8 +497,8 @@ outputOverlaps(gkStore *gkpStore,
delete inpStore;
fprintf(stderr, "\n");
- fprintf(stderr, "nDiscarded "F_U64" (in previous iterations)\n", nDiscarded);
- fprintf(stderr, "nRemain "F_U64"\n", nRemain);
+ fprintf(stderr, "nDiscarded " F_U64 " (in previous iterations)\n", nDiscarded);
+ fprintf(stderr, "nRemain " F_U64 "\n", nRemain);
}
@@ -617,8 +617,8 @@ main(int argc, char **argv) {
fprintf(stderr, " partNum = %9u\n", partNum);
fprintf(stderr, " partMax = %9u\n", partMax);
- //fprintf(stderr, "ovOverlap %lu\n", sizeof(ovOverlap));
- //fprintf(stderr, "ESToverlap %lu\n", sizeof(ESToverlap));
+ //fprintf(stderr, "ovOverlap " F_U64 "\n", sizeof(ovOverlap));
+ //fprintf(stderr, "ESToverlap " F_U64 "\n", sizeof(ESToverlap));
// Load read metadata, clear ranges, read lengths, and deleted status.
@@ -631,11 +631,11 @@ main(int argc, char **argv) {
readProfileSize += readProfile[iid].initialize(gkpStore->gkStore_getRead(iid + iidMin));
if ((iid % 10000) == 0)
- fprintf(stderr, " %u reads\r", iid);
+ fprintf(stderr, " " F_U32 " reads\r", iid);
}
- fprintf(stderr, " %u reads\n", numIIDs);
- fprintf(stderr, " %lu GB\n", readProfileSize >> 30);
+ fprintf(stderr, " " F_U32 " reads\n", numIIDs);
+ fprintf(stderr, " " F_U64 " GB\n", readProfileSize >> 30);
// Open overlap stores
@@ -665,10 +665,10 @@ main(int argc, char **argv) {
// Load overlaps.
fprintf(stderr, "Loading overlaps\n");
- fprintf(stderr, " number %lu overlaps\n", numOvls);
- fprintf(stderr, " index %lu GB\n", (sizeof(uint64) * numIIDs) >> 30);
- fprintf(stderr, " overlaps %lu GB (previous size)\n", (sizeof(ovOverlap) * numOvls) >> 30);
- fprintf(stderr, " overlaps %lu GB\n", (sizeof(ESToverlap) * numOvls) >> 30);
+ fprintf(stderr, " number " F_U64 " overlaps\n", numOvls);
+ fprintf(stderr, " index " F_U64 " GB\n", (sizeof(uint64) * (uint64)numIIDs) >> 30);
+ fprintf(stderr, " overlaps " F_U64 " GB (previous size)\n", (sizeof(ovOverlap) * numOvls) >> 30);
+ fprintf(stderr, " overlaps " F_U64 " GB\n", (sizeof(ESToverlap) * numOvls) >> 30);
ESToverlap *overlaps = NULL;
memoryMappedFile *overlapsMMF = NULL;
@@ -702,7 +702,7 @@ main(int argc, char **argv) {
if (ESTcache)
fwrite(overlaps + no - nLoad, sizeof(ESToverlap), nLoad, ESTcache);
- fprintf(stderr, " loading overlaps: %lu out of %lu (%.4f%%)\r",
+ fprintf(stderr, " loading overlaps: " F_U64 " out of " F_U64 " (%.4f%%)\r",
no, numOvls, 100.0 * no / numOvls);
}
@@ -712,7 +712,7 @@ main(int argc, char **argv) {
fclose(ESTcache);
fprintf(stderr, "\n");
- fprintf(stderr, " loaded and cached %lu overlaps.\n", numOvls);
+ fprintf(stderr, " loaded and cached " F_U64 " overlaps.\n", numOvls);
}
delete ovlStore;
diff --git a/src/falcon_sense/createFalconSenseInputs.C b/src/falcon_sense/createFalconSenseInputs.C
index 7bd36d5..0a864ea 100644
--- a/src/falcon_sense/createFalconSenseInputs.C
+++ b/src/falcon_sense/createFalconSenseInputs.C
@@ -138,7 +138,7 @@ main(int argc, char **argv) {
if (numReadsPer > 0)
numPartitions = nReadsInTigs / numReadsPer + 1;
- fprintf(stderr, "Will partition "F_U64" total child reads into "F_U32" partitions.\n",
+ fprintf(stderr, "Will partition " F_U64 " total child reads into " F_U32 " partitions.\n",
nReadsInTigs, numPartitions);
// Decide on a partitioning, based on total reads per tig.
@@ -202,7 +202,7 @@ main(int argc, char **argv) {
if (partFile[pp] == NULL) {
char name[FILENAME_MAX];
- sprintf(name, "%s%04d", outputPrefix, pp); // Sync'd with canu/CorrectReads.pm
+ snprintf(name, FILENAME_MAX, "%s%04d", outputPrefix, pp); // Sync'd with canu/CorrectReads.pm
errno = 0;
partFile[pp] = fopen(name, "w");
diff --git a/src/falcon_sense/falcon_sense.C b/src/falcon_sense/falcon_sense.C
index ad477dd..f5fa694 100644
--- a/src/falcon_sense/falcon_sense.C
+++ b/src/falcon_sense/falcon_sense.C
@@ -30,7 +30,9 @@
#include "falcon.H"
+#ifndef BROKEN_CLANG_OpenMP
#include <omp.h>
+#endif
#include <vector>
#include <string>
@@ -97,8 +99,9 @@ main (int argc, char **argv) {
string seed;
char *A = new char[AS_MAX_READLEN * 2];
- while (true) {
- fgets(A, AS_MAX_READLEN * 2, stdin);
+ fgets(A, AS_MAX_READLEN * 2, stdin);
+
+ while (!feof(stdin)) {
splitToWords W(A);
if (W[0][0] == '+') {
@@ -126,6 +129,8 @@ main (int argc, char **argv) {
seqs.push_back(string(W[1]));
}
}
+
+ fgets(A, AS_MAX_READLEN * 2, stdin);
}
delete[] A;
diff --git a/src/falcon_sense/outputFalcon.C b/src/falcon_sense/outputFalcon.C
index 97a10b4..8ed6731 100644
--- a/src/falcon_sense/outputFalcon.C
+++ b/src/falcon_sense/outputFalcon.C
@@ -54,7 +54,7 @@ outputFalcon(gkStore *gkpStore,
gkpStore->gkStore_loadReadData(tig->tigID(), readData);
- fprintf(F, "read"F_U32" %s\n", tig->tigID(), readData->gkReadData_getSequence());
+ fprintf(F, "read" F_U32 " %s\n", tig->tigID(), readData->gkReadData_getSequence());
for (uint32 cc=0; cc<tig->numberOfChildren(); cc++) {
tgPosition *child = tig->getChild(cc);
@@ -80,7 +80,7 @@ outputFalcon(gkStore *gkpStore,
seq[ readData->gkReadData_getRead()->gkRead_sequenceLength() - child->_askip - child->_bskip ] = 0;
}
- fprintf(F, "data"F_U32" %s\n", tig->getChild(cc)->ident(), seq);
+ fprintf(F, "data" F_U32 " %s\n", tig->getChild(cc)->ident(), seq);
}
fprintf(F, "+ +\n");
diff --git a/src/fastq-utilities/fastqAnalyze.C b/src/fastq-utilities/fastqAnalyze.C
index 94fecd6..37f3979 100644
--- a/src/fastq-utilities/fastqAnalyze.C
+++ b/src/fastq-utilities/fastqAnalyze.C
@@ -163,18 +163,18 @@ doStats(char *inName,
totBases += ii;
if ((totSeqs % 10000) == 0)
- fprintf(stderr, "Reading "F_U64"\r", totSeqs);
+ fprintf(stderr, "Reading " F_U64 "\r", totSeqs);
}
- fprintf(stderr, "Read "F_U64"\n", totSeqs);
+ fprintf(stderr, "Read " F_U64 "\n", totSeqs);
fprintf(stdout, "%s\n", inName);
fprintf(stdout, "\n");
- fprintf(stdout, "sequences\t"F_U64"\n", totSeqs);
- fprintf(stdout, "bases\t"F_U64"\n", totBases);
+ fprintf(stdout, "sequences\t" F_U64 "\n", totSeqs);
+ fprintf(stdout, "bases\t" F_U64 "\n", totBases);
fprintf(stdout, "\n");
- fprintf(stdout, "average\t"F_U64"\n", totBases / totSeqs);
+ fprintf(stdout, "average\t" F_U64 "\n", totBases / totSeqs);
fprintf(stdout, "\n");
//sort(seqLen.begin(), seqLen.end());
@@ -196,7 +196,7 @@ doStats(char *inName,
histogram[seqLen[ii]]++;
for (uint32 ii=min; ii<=max; ii++)
- fprintf(stdout, F_U32"\t"F_U64"\n", ii, histogram[ii]);
+ fprintf(stdout, F_U32"\t" F_U64 "\n", ii, histogram[ii]);
delete [] histogram;
@@ -212,7 +212,7 @@ doStats(char *inName,
freq->mono[ii] * 100.0 / totBases));
sort(output.begin(), output.end());
for (uint32 ii=0; ii<output.size(); ii++)
- fprintf(stdout, "%s\t"F_U64"\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
+ fprintf(stdout, "%s\t" F_U64 "\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
output.clear();
fprintf(stdout, "\n");
@@ -226,7 +226,7 @@ doStats(char *inName,
freq->di[ii][jj] * 100.0 / totBases));
sort(output.begin(), output.end());
for (uint32 ii=0; ii<output.size(); ii++)
- fprintf(stdout, "%s\t"F_U64"\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
+ fprintf(stdout, "%s\t" F_U64 "\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
output.clear();
fprintf(stdout, "\n");
@@ -241,7 +241,7 @@ doStats(char *inName,
freq->tri[ii][jj][kk] * 100.0 / totBases));
sort(output.begin(), output.end());
for (uint32 ii=0; ii<output.size(); ii++)
- fprintf(stdout, "%s\t"F_U64"\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
+ fprintf(stdout, "%s\t" F_U64 "\t%.4f%%\n", output[ii].label, output[ii].count, output[ii].freq);
output.clear();
//fclose(O);
diff --git a/src/fastq-utilities/fastqSample.C b/src/fastq-utilities/fastqSample.C
index 31ac7ca..15716a5 100644
--- a/src/fastq-utilities/fastqSample.C
+++ b/src/fastq-utilities/fastqSample.C
@@ -277,8 +277,8 @@ main(int argc, char **argv) {
fprintf(stderr, "Counting the number of reads in the input.\n");
- sprintf(path1, "%s.%c.fastq", INPNAME, (isMated == true) ? '1' : 'u');
- sprintf(path2, "%s.%c.fastq", INPNAME, (isMated == true) ? '2' : 'u');
+ snprintf(path1, FILENAME_MAX, "%s.%c.fastq", INPNAME, (isMated == true) ? '1' : 'u');
+ snprintf(path2, FILENAME_MAX, "%s.%c.fastq", INPNAME, (isMated == true) ? '2' : 'u');
errno = 0;
Ai = fopen(path1, "r");
@@ -315,7 +315,7 @@ main(int argc, char **argv) {
if (Ai) fclose(Ai);
if (Bi) fclose(Bi);
- fprintf(stderr, "Found "F_U64" bases and "F_U64" reads in '%s'\n",
+ fprintf(stderr, "Found " F_U64 " bases and " F_U64 " reads in '%s'\n",
totBasesInInput, totPairsInInput, path1);
if (Ac != Bc) {
@@ -367,12 +367,12 @@ main(int argc, char **argv) {
}
if (totBasesInInput < nBasesToOutput)
- fprintf(stderr, "ERROR: not enough reads, "F_U64" bp in input, "F_U64" needed for desired .....\n",
+ fprintf(stderr, "ERROR: not enough reads, " F_U64 " bp in input, " F_U64 " needed for desired .....\n",
totBasesInInput, nBasesToOutput),
exit(1);
if (totPairsInInput < nPairsToOutput)
- fprintf(stderr, "ERROR: not enough reads, "F_U64" %s in input, "F_U64" needed for desired ......\n",
+ fprintf(stderr, "ERROR: not enough reads, " F_U64 " %s in input, " F_U64 " needed for desired ......\n",
totPairsInInput, (isMated) ? "pairs" : "reads", nPairsToOutput),
exit(1);
@@ -450,8 +450,8 @@ main(int argc, char **argv) {
// Do the output
//
- sprintf(path1, "%s.%c.fastq", INPNAME, (isMated == true) ? '1' : 'u');
- sprintf(path2, "%s.%c.fastq", INPNAME, (isMated == true) ? '2' : 'u');
+ snprintf(path1, FILENAME_MAX, "%s.%c.fastq", INPNAME, (isMated == true) ? '1' : 'u');
+ snprintf(path2, FILENAME_MAX, "%s.%c.fastq", INPNAME, (isMated == true) ? '2' : 'u');
errno = 0;
Ai = fopen(path1, "r");
@@ -467,16 +467,16 @@ main(int argc, char **argv) {
if (AUTONAME == false) {
- sprintf(path1, "%s.%c.fastq", OUTNAME, (isMated == true) ? '1' : 'u');
- sprintf(path2, "%s.%c.fastq", OUTNAME, (isMated == true) ? '2' : 'u');
+ snprintf(path1, FILENAME_MAX, "%s.%c.fastq", OUTNAME, (isMated == true) ? '1' : 'u');
+ snprintf(path2, FILENAME_MAX, "%s.%c.fastq", OUTNAME, (isMated == true) ? '2' : 'u');
} else if (GENOMESIZE > 0) {
- sprintf(path1, "%s.x=%07.3f.n=%09"F_U64P".%c.fastq", OUTNAME, (double)nBasesToOutput / GENOMESIZE, nPairsToOutput, (isMated == true) ? '1' : 'u');
- sprintf(path2, "%s.x=%07.3f.n=%09"F_U64P".%c.fastq", OUTNAME, (double)nBasesToOutput / GENOMESIZE, nPairsToOutput, (isMated == true) ? '2' : 'u');
+ snprintf(path1, FILENAME_MAX, "%s.x=%07.3f.n=%09" F_U64P ".%c.fastq", OUTNAME, (double)nBasesToOutput / GENOMESIZE, nPairsToOutput, (isMated == true) ? '1' : 'u');
+ snprintf(path2, FILENAME_MAX, "%s.x=%07.3f.n=%09" F_U64P ".%c.fastq", OUTNAME, (double)nBasesToOutput / GENOMESIZE, nPairsToOutput, (isMated == true) ? '2' : 'u');
} else {
- sprintf(path1, "%s.x=UNKNOWN.n=%09"F_U64P".%c.fastq", OUTNAME, nPairsToOutput, (isMated == true) ? '1' : 'u');
- sprintf(path2, "%s.x=UNKNOWN.n=%09"F_U64P".%c.fastq", OUTNAME, nPairsToOutput, (isMated == true) ? '2' : 'u');
+ snprintf(path1, FILENAME_MAX, "%s.x=UNKNOWN.n=%09" F_U64P ".%c.fastq", OUTNAME, nPairsToOutput, (isMated == true) ? '1' : 'u');
+ snprintf(path2, FILENAME_MAX, "%s.x=UNKNOWN.n=%09" F_U64P ".%c.fastq", OUTNAME, nPairsToOutput, (isMated == true) ? '2' : 'u');
}
errno = 0;
@@ -498,10 +498,10 @@ main(int argc, char **argv) {
if (isMated == true) {
if (nPairsToOutput > 0)
- fprintf(stderr, "Extracting "F_U64" mate pairs into %s and %s\n",
+ fprintf(stderr, "Extracting " F_U64 " mate pairs into %s and %s\n",
nPairsToOutput, path1, path2);
else
- fprintf(stderr, "Extracting "F_U64" bases of mate pairs into %s and %s\n",
+ fprintf(stderr, "Extracting " F_U64 " bases of mate pairs into %s and %s\n",
nBasesToOutput, path1, path2);
for (; Ar->read(Ai) && Br->read(Bi); i++) {
@@ -521,10 +521,10 @@ main(int argc, char **argv) {
} else {
if (nPairsToOutput > 0)
- fprintf(stderr, "Extracting "F_U64" reads into %s\n",
+ fprintf(stderr, "Extracting " F_U64 " reads into %s\n",
nPairsToOutput, path1);
else
- fprintf(stderr, "Extracting "F_U64" bases of reads into %s\n",
+ fprintf(stderr, "Extracting " F_U64 " bases of reads into %s\n",
nBasesToOutput, path1);
for (; Ar->read(Ai); i++) {
@@ -542,17 +542,18 @@ main(int argc, char **argv) {
delete Br;
if (i > totPairsInInput) {
- fprintf(stderr, "WARNING: There are "F_U64" %s in the input; you claimed there are "F_U64" (-t option) %s.\n",
+ fprintf(stderr, "WARNING: There are " F_U64 " %s in the input; you claimed there are " F_U64 " (-t option) %s.\n",
i, (isMated) ? "mates" : "reads",
totPairsInInput, (isMated) ? "mates" : "reads");
fprintf(stderr, "WARNING: Result is not a random sample of the input file.\n");
}
if (i < totPairsInInput) {
- fprintf(stderr, "WARNING: There are "F_U64" %s in the input; you claimed there are "F_U64" (-t option) %s.\n",
+ fprintf(stderr, "WARNING: There are " F_U64 " %s in the input; you claimed there are " F_U64 " (-t option) %s.\n",
i, (isMated) ? "mates" : "reads",
totPairsInInput, (isMated) ? "mates" : "reads");
- fprintf(stderr, "WARNING: Result is only %f X coverage.\n", (double)s * READLENGTH / GENOMESIZE);
+ if (GENOMESIZE > 0)
+ fprintf(stderr, "WARNING: Result is only %f X coverage.\n", (double)s * READLENGTH / GENOMESIZE);
}
return(0);
diff --git a/src/fastq-utilities/fastqSimulate-sort.C b/src/fastq-utilities/fastqSimulate-sort.C
index 13af42a..a4dbc17 100644
--- a/src/fastq-utilities/fastqSimulate-sort.C
+++ b/src/fastq-utilities/fastqSimulate-sort.C
@@ -128,7 +128,7 @@ readRead(FILE *inFile, uint32 &seq, uint32 &bgn, uint32 &end) {
end = strtoul(a+p, NULL, 10);
- //fprintf(stderr, "seq="F_U32" bgn="F_U32" end="F_U32" line %s",
+ //fprintf(stderr, "seq=" F_U32 " bgn=" F_U32 " end=" F_U32 " line %s",
// seq, bgn, end, a);
char *retstr = new char [al + bl + cl + dl + 1];
@@ -249,7 +249,7 @@ main(int argc, char **argv) {
reads.push_back(pr);
}
- fprintf(stderr, "Loaded "F_U64" mated reads.\n", reads.size());
+ fprintf(stderr, "Loaded " F_U64 " mated reads.\n", reads.size());
sort(reads.begin(), reads.end());
diff --git a/src/fastq-utilities/fastqSimulate.C b/src/fastq-utilities/fastqSimulate.C
index e5382a9..50e946e 100644
--- a/src/fastq-utilities/fastqSimulate.C
+++ b/src/fastq-utilities/fastqSimulate.C
@@ -646,6 +646,12 @@ makeMP(char *seq,
//if ((np % 1000) == 0)
// fprintf(stderr, "%9d / %9d - %5.2f%%\r", np, numPairs, 100.0 * np / numPairs);
}
+
+ delete [] s1;
+ delete [] q1;
+ delete [] s2;
+ delete [] q2;
+ delete [] sh;
}
@@ -997,7 +1003,7 @@ main(int argc, char **argv) {
// when the end of genome is hit. This is caught later in makeSequence() and the
// read is aborted.
- fprintf(stderr, "seed = "F_U64"\n", seed);
+ fprintf(stderr, "seed = " F_U64 "\n", seed);
srand48(seed);
memset(revComp, '&', sizeof(char) * 256);
@@ -1039,31 +1045,31 @@ main(int argc, char **argv) {
errno = 0;
if ((seEnable == true) || (ccEnable == true)) {
- sprintf(outputName, "%s.s.fastq", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.s.fastq", outputPrefix);
outputI = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open output file '%s': %s\n", outputName, strerror(errno)), exit(1);
}
if ((seEnable == false) && (ccEnable == false)) {
- sprintf(outputName, "%s.i.fastq", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.i.fastq", outputPrefix);
outputI = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open output file '%s': %s\n", outputName, strerror(errno)), exit(1);
- sprintf(outputName, "%s.c.fastq", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.c.fastq", outputPrefix);
outputC = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open output file '%s': %s\n", outputName, strerror(errno)), exit(1);
}
if (peEnable || mpEnable) {
- sprintf(outputName, "%s.1.fastq", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.1.fastq", outputPrefix);
output1 = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open output file '%s': %s\n", outputName, strerror(errno)), exit(1);
- sprintf(outputName, "%s.2.fastq", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.2.fastq", outputPrefix);
output2 = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open output file '%s': %s\n", outputName, strerror(errno)), exit(1);
@@ -1119,9 +1125,14 @@ main(int argc, char **argv) {
assert(numSeq == seqStartPositions.size());
+
fprintf(stderr, "Loaded %u sequences of length %d, with %u invalid bases fixed.\n",
numSeq, seqLen - numSeq, nInvalid);
+ if ((numSeq == 0) || (seqLen == 0))
+ fprintf(stderr, "ERROR: No sequences or bases loaded, can't simulate reads.\n"), exit(1);
+
+
//
// If requested, compute the number of pairs to get a desired X of coverage
//
@@ -1204,10 +1215,10 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, "Number of reads with:\n");
- fprintf(stderr, " nNoChange = "F_U64"\n", nNoChange);
- fprintf(stderr, " nMismatch = "F_U64"\n", nMismatch);
- fprintf(stderr, " nInsert = "F_U64"\n", nInsert);
- fprintf(stderr, " nDelete = "F_U64"\n", nDelete);
+ fprintf(stderr, " nNoChange = " F_U64 "\n", nNoChange);
+ fprintf(stderr, " nMismatch = " F_U64 "\n", nMismatch);
+ fprintf(stderr, " nInsert = " F_U64 "\n", nInsert);
+ fprintf(stderr, " nDelete = " F_U64 "\n", nDelete);
exit(0);
}
diff --git a/src/main.mk b/src/main.mk
index b272b26..740ab00 100644
--- a/src/main.mk
+++ b/src/main.mk
@@ -38,6 +38,7 @@ SOURCES := AS_global.C \
AS_UTL/bitPackedFile.C \
AS_UTL/bitPackedArray.C \
AS_UTL/dnaAlphabets.C \
+ AS_UTL/hexDump.C \
AS_UTL/md5.C \
AS_UTL/mt19937ar.C \
AS_UTL/readBuffer.C \
@@ -46,18 +47,28 @@ SOURCES := AS_global.C \
AS_UTL/timeAndSize.C \
AS_UTL/kMer.C \
\
+ falcon_sense/libfalcon/falcon.C \
+ falcon_sense/libfalcon/kmer_lookup.C \
+ \
stores/gkStore.C \
stores/gkStoreEncode.C \
\
stores/ovOverlap.C \
stores/ovStore.C \
+ stores/ovStoreWriter.C \
+ stores/ovStoreFilter.C \
stores/ovStoreFile.C \
+ stores/ovStoreHistogram.C \
\
stores/tgStore.C \
stores/tgTig.C \
stores/tgTigSizeAnalysis.C \
stores/tgTigMultiAlignDisplay.C \
\
+ stores/libsnappy/snappy-sinksource.cc \
+ stores/libsnappy/snappy-stubs-internal.cc \
+ stores/libsnappy/snappy.cc \
+ \
meryl/libmeryl.C \
\
overlapInCore/overlapReadCache.C \
@@ -72,6 +83,8 @@ SOURCES := AS_global.C \
overlapInCore/liboverlap/prefixEditDistance-forward.C \
overlapInCore/liboverlap/prefixEditDistance-reverse.C \
\
+ overlapInCore/libedlib/edlib.C \
+ \
utgcns/libNDalign/NDalign.C \
\
utgcns/libNDalign/Binomial_Bound.C \
@@ -94,15 +107,12 @@ SOURCES := AS_global.C \
utgcns/libcns/unitigConsensus.C \
utgcns/libpbutgcns/Alignment.C \
utgcns/libpbutgcns/AlnGraphBoost.C \
- utgcns/libpbutgcns/SimpleAligner.C \
- utgcns/libNDFalcon/dw.C \
- \
- falcon_sense/libfalcon/falcon.C \
- falcon_sense/libfalcon/kmer_lookup.C
+ utgcns/libNDFalcon/dw.C
SRC_INCDIRS := . \
AS_UTL \
stores \
+ stores/libsnappy \
alignment \
utgcns/libNDalign \
utgcns/libcns \
@@ -111,6 +121,7 @@ SRC_INCDIRS := . \
utgcns/libboost \
meryl/libleaff \
overlapInCore \
+ overlapInCore/libedlib \
overlapInCore/liboverlap \
falcon_sense/libfalcon
@@ -134,6 +145,7 @@ SUBMAKEFILES := stores/gatekeeperCreate.mk \
meryl/libleaff.mk \
meryl/leaff.mk \
meryl/meryl.mk \
+ meryl/maskMers.mk \
meryl/simple.mk \
meryl/estimate-mer-threshold.mk \
\
@@ -166,7 +178,6 @@ SUBMAKEFILES := stores/gatekeeperCreate.mk \
overlapErrorAdjustment/correctOverlaps.mk \
\
bogart/bogart.mk \
- bogart/buildGraph.mk \
\
bogus/bogus.mk \
\
diff --git a/src/merTrim/merTrim.C b/src/merTrim/merTrim.C
index 6cc5b47..f005e00 100644
--- a/src/merTrim/merTrim.C
+++ b/src/merTrim/merTrim.C
@@ -180,10 +180,10 @@ public:
gktCur = gktBgn;
if (gktBgn > gktEnd)
- fprintf(stderr, "ERROR: invalid range: -b ("F_U32") >= -e ("F_U32").\n",
+ fprintf(stderr, "ERROR: invalid range: -b (" F_U32 ") >= -e (" F_U32 ").\n",
gktBgn, gktEnd), exit(1);
if (gktEnd > gkRead->gkStore_getNumFragments())
- fprintf(stderr, "ERROR: invalid range: -e ("F_U32") > num frags ("F_U32").\n",
+ fprintf(stderr, "ERROR: invalid range: -e (" F_U32 ") > num frags (" F_U32 ").\n",
gktEnd, gkRead->gkStore_getNumFragments()), exit(1);
errno = 0;
@@ -208,7 +208,7 @@ public:
char fqName[FILENAME_MAX];
- sprintf(fqName, "%s.log", fqOutputPath);
+ snprintf(fqName, FILENAME_MAX, "%s.log", fqOutputPath);
fqLog = new compressedFileWriter(fqName);
};
@@ -223,28 +223,28 @@ public:
uint32 i = 0;
uint32 iX = 0;
- //fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers());
- //fprintf(stderr, "unique: "F_U64"\n", MF->numberOfUniqueMers());
- //fprintf(stderr, "total: "F_U64"\n", MF->numberOfTotalMers());
+ //fprintf(stderr, "distinct: " F_U64 "\n", MF->numberOfDistinctMers());
+ //fprintf(stderr, "unique: " F_U64 "\n", MF->numberOfUniqueMers());
+ //fprintf(stderr, "total: " F_U64 "\n", MF->numberOfTotalMers());
- //fprintf(stderr, "Xcoverage zero 1 0 "F_U64"\n", MF->histogram(1));
+ //fprintf(stderr, "Xcoverage zero 1 0 " F_U64 "\n", MF->histogram(1));
for (i=2; (i < MF->histogramLength()) && (MF->histogram(i-1) > MF->histogram(i)); i++)
- //fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(i-1), MF->histogram(i));
+ //fprintf(stderr, "Xcoverage drop " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(i-1), MF->histogram(i));
;
iX = i - 1;
for (; i < MF->histogramLength(); i++) {
if (MF->histogram(iX) < MF->histogram(i)) {
- //fprintf(stderr, "Xcoverage incr "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));
+ //fprintf(stderr, "Xcoverage incr " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(iX), MF->histogram(i));
iX = i;
} else {
- //fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));
+ //fprintf(stderr, "Xcoverage drop " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(iX), MF->histogram(i));
}
}
- fprintf(stderr, "Guessed X coverage is "F_U32"\n", iX);
+ fprintf(stderr, "Guessed X coverage is " F_U32 "\n", iX);
delete MF;
@@ -257,7 +257,7 @@ public:
if (minVerifiedFraction > 0)
minVerified = (uint32)floor(minVerifiedFraction * actualCoverage);
- fprintf(stderr, "Use minCorrect="F_U32" minVerified="F_U32"\n", minCorrect, minVerified);
+ fprintf(stderr, "Use minCorrect=" F_U32 " minVerified=" F_U32 "\n", minCorrect, minVerified);
if (minCorrect < minVerified) {
fprintf(stderr, "WARNING!\n");
@@ -285,7 +285,7 @@ public:
}
char cacheName[FILENAME_MAX];
- sprintf(cacheName, "%s.merTrimDB", merCountsFile);
+ snprintf(cacheName, FILENAME_MAX, "%s.merTrimDB", merCountsFile);
if (AS_UTL_fileExists(cacheName, FALSE, FALSE)) {
fprintf(stderr, "loading genome mer database from cache '%s'.\n", cacheName);
@@ -1229,7 +1229,7 @@ mertrimComputation::scoreAdapter(void) {
containsAdapterEnd = MAX(containsAdapterEnd, end + 1);
if (VERBOSE > 1)
- log.add("ADAPTER at "F_U32","F_U32" ["F_U32","F_U32"]\n",
+ log.add("ADAPTER at " F_U32 "," F_U32 " [" F_U32 "," F_U32 "]\n",
bgn, end, containsAdapterBgn, containsAdapterEnd);
for (uint32 a=bgn; a<=end; a++)
@@ -2155,7 +2155,7 @@ mertrimWriterFASTQ(mertrimGlobalData *g, mertrimComputation *s) {
if (s->verifySeq) {
}
- fprintf(g->fqLog->file(), F_U32"\t"F_U32"\tchimer\t%c\t"F_U32"\t"F_U32"\tadapter\t%c\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t%s\t%s\n",
+ fprintf(g->fqLog->file(), F_U32"\t" F_U32 "\tchimer\t%c\t" F_U32 "\t" F_U32 "\tadapter\t%c\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t%s\t%s\n",
s->clrBgn,
s->clrEnd,
s->suspectedChimer ? 't' : 'f',
diff --git a/src/merTrim/merTrimResult.H b/src/merTrim/merTrimResult.H
index c72d1a2..651be0d 100644
--- a/src/merTrim/merTrimResult.H
+++ b/src/merTrim/merTrimResult.H
@@ -60,11 +60,11 @@ public:
if (F == NULL)
return;
if (chimer)
- fprintf(F, F_IID"\t"F_U32"\t"F_U32"\tchimer\t"F_U32"\t"F_U32"%s\n",
+ fprintf(F, F_IID"\t" F_U32 "\t" F_U32 "\tchimer\t" F_U32 "\t" F_U32 "%s\n",
readIID, clrBgn, clrEnd, chmBgn, chmEnd,
(deleted) ? "\tdeleted" : "");
else
- fprintf(F, F_IID"\t"F_U32"\t"F_U32"%s\n",
+ fprintf(F, F_IID"\t" F_U32 "\t" F_U32 "%s\n",
readIID, clrBgn, clrEnd,
(deleted) ? "\tdeleted" : "");
};
diff --git a/src/mercy/mercy-regions.C b/src/mercy/mercy-regions.C
index 9ef17a8..50d0699 100644
--- a/src/mercy/mercy-regions.C
+++ b/src/mercy/mercy-regions.C
@@ -78,7 +78,7 @@ readDepth(char *depthname, map<uint64,intervalList<uint32>*> &lowCoverage) {
uint32 end = strtoul(W[3], 0L, 10);
if (beg > end)
- fprintf(stderr, "ERROR: l="F_U32" h="F_U32"\n", beg, end);
+ fprintf(stderr, "ERROR: l=" F_U32 " h=" F_U32 "\n", beg, end);
if (ILs[uid] == 0L)
ILs[uid] = new intervalList<uint32>;
@@ -90,7 +90,7 @@ readDepth(char *depthname, map<uint64,intervalList<uint32>*> &lowCoverage) {
}
fclose(F);
- fprintf(stderr, " "F_U32" lines.\n", i);
+ fprintf(stderr, " " F_U32 " lines.\n", i);
map<uint64,intervalList<uint32>*>::iterator it = ILs.begin();
map<uint64,intervalList<uint32>*>::iterator ed = ILs.end();
@@ -135,7 +135,7 @@ readVariation(char *depthname, map<uint64,intervalList<uint32>*> &variation) {
}
fclose(F);
- fprintf(stderr, " "F_U32" lines.\n", i);
+ fprintf(stderr, " " F_U32 " lines.\n", i);
}
@@ -175,7 +175,7 @@ readBadMers(char *depthname, map<uint64,intervalList<uint32>*> &badMers) {
}
fclose(F);
- fprintf(stderr, " "F_U32" lines.\n", i);
+ fprintf(stderr, " " F_U32 " lines.\n", i);
}
@@ -275,7 +275,7 @@ main(int argc, char **argv) {
// We want to count the number of times a badmer region
// begins/ends in some depth.
- //fprintf(stderr, "testing beg "F_U32" "F_U32" -- "F_U32" "F_U32"\n",
+ //fprintf(stderr, "testing beg " F_U32 " " F_U32 " -- " F_U32 " " F_U32 "\n",
// Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id));
uint32 beg = 0;
@@ -285,7 +285,7 @@ main(int argc, char **argv) {
while ((id < Id->numberOfIntervals()) &&
(Id->hi(id) <= Ii->lo(ii))) {
id++;
- //fprintf(stderr, "testing beg (m) "F_U32" "F_U32" -- "F_U32" "F_U32"\n",
+ //fprintf(stderr, "testing beg (m) " F_U32 " " F_U32 " -- " F_U32 " " F_U32 "\n",
// Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id));
}
if (id < Id->numberOfIntervals()) {
@@ -296,23 +296,23 @@ main(int argc, char **argv) {
if ((lo <= Ii->lo(ii)) && (Ii->lo(ii) < hi)) {
beg = Id->depth(id);
} else {
- fprintf(stderr, "failed to find begin "F_U32" "F_U32" -- "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stderr, "failed to find begin " F_U32 " " F_U32 " -- " F_U32 " " F_U32 " " F_U32 "\n",
Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id), Id->depth(id));
if (id > 0)
- fprintf(stderr, " "F_U32" "F_U32" -- "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stderr, " " F_U32 " " F_U32 " -- " F_U32 " " F_U32 " " F_U32 "\n",
Ii->lo(ii), Ii->hi(ii), Id->lo(id-1), Id->hi(id-1), Id->depth(id-1));
//exit(1);
}
}
- //fprintf(stderr, "testing end "F_U64" "F_U64" -- "F_U64" "F_U64"\n",
+ //fprintf(stderr, "testing end " F_U64 " " F_U64 " -- " F_U64 " " F_U64 "\n",
// Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id));
// High points can be equal.
while ((id < Id->numberOfIntervals()) &&
(Id->hi(id) < Ii->hi(ii))) {
id++;
- //fprintf(stderr, "testing end (m) "F_U64" "F_U64" -- "F_U64" "F_U64"\n",
+ //fprintf(stderr, "testing end (m) " F_U64 " " F_U64 " -- " F_U64 " " F_U64 "\n",
// Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id));
}
if (id < Id->numberOfIntervals()) {
@@ -323,10 +323,10 @@ main(int argc, char **argv) {
if ((lo < Ii->hi(ii)) && (Ii->hi(ii) <= hi)) {
end = Id->depth(id);
} else {
- fprintf(stderr, "failed to find end "F_U32" "F_U32" -- "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stderr, "failed to find end " F_U32 " " F_U32 " -- " F_U32 " " F_U32 " " F_U32 "\n",
Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id), Id->depth(id));
if (id > 0)
- fprintf(stderr, " "F_U32" "F_U32" -- "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stderr, " " F_U32 " " F_U32 " -- " F_U32 " " F_U32 " " F_U32 "\n",
Ii->lo(ii), Ii->hi(ii), Id->lo(id-1), Id->hi(id-1), Id->depth(id-1));
//exit(1);
}
@@ -335,7 +335,7 @@ main(int argc, char **argv) {
badBegDepth[beg]++;
badEndDepth[end]++;
- fprintf(stdout, F_U64"\t"F_U32"\t"F_U32"\tdepth="F_U32","F_U32"\n",
+ fprintf(stdout, F_U64"\t" F_U32 "\t" F_U32 "\tdepth=" F_U32 "," F_U32 "\n",
uid, Ii->lo(ii), Ii->hi(ii), beg, end);
if ((beg < 32) && (end < 32))
@@ -351,11 +351,11 @@ main(int argc, char **argv) {
uint32 bb = 0;
uint32 be = 0;
for (uint32 x=0; x<32; x++) {
- fprintf(stdout, F_U32"\t"F_U32"\t"F_U32"\n", x, badBegDepth[x], badEndDepth[x]);
+ fprintf(stdout, F_U32"\t" F_U32 "\t" F_U32 "\n", x, badBegDepth[x], badEndDepth[x]);
bb += badBegDepth[x];
be += badEndDepth[x];
}
- fprintf(stdout, "total\t"F_U32"\t"F_U32"\n", bb, be);
+ fprintf(stdout, "total\t" F_U32 "\t" F_U32 "\n", bb, be);
for (uint32 i=0; i<30; i++) {
for (uint32 j=0; j<30; j++)
diff --git a/src/mercy/mercy.C b/src/mercy/mercy.C
index 7fab726..ab09cf1 100644
--- a/src/mercy/mercy.C
+++ b/src/mercy/mercy.C
@@ -86,7 +86,7 @@ findMode(char *name) {
if (H[i] > H[mi])
mi = i;
- fprintf(stderr, "Mode of '%s' is "F_U32"\n", name, mi);
+ fprintf(stderr, "Mode of '%s' is " F_U32 "\n", name, mi);
return(mi);
}
@@ -149,9 +149,9 @@ compare(merylStreamReader *F,
if (dumpFlag)
if (Ftype == 0)
if (Ctype == 1)
- fprintf(dumpSCZF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpSCZF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
else
- fprintf(dumpMCZF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpMCZF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
}
return;
}
@@ -168,9 +168,9 @@ compare(merylStreamReader *F,
// Again, save the mer since it's in contigs, but not fragments.
if (dumpFlag)
if (Ctype == 1)
- fprintf(dumpSCZF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpSCZF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
else
- fprintf(dumpMCZF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpMCZF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
}
return;
@@ -187,12 +187,12 @@ compare(merylStreamReader *F,
if (dumpFlag) {
if (Ftype < Ctype)
if (Ctype == 2)
- fprintf(dumpMCSF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpMCSF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
else
- fprintf(dumpMCMF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpMCMF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
if ((Ftype == 0) && (Ctype == 1))
- fprintf(dumpSCZF, ">"F_U32"\n%s\n", Ccnt, Cmer.merToString(merstring));
+ fprintf(dumpSCZF, ">" F_U32 "\n%s\n", Ccnt, Cmer.merToString(merstring));
}
}
@@ -216,7 +216,7 @@ output(char *title,
default: fprintf(stdout, "????????? "); break;
}
for (uint32 j=0; j<5; j++)
- fprintf(stdout, "%12"F_U32P, R[i][j]);
+ fprintf(stdout, "%12" F_U32P, R[i][j]);
fprintf(stdout, "\n");
}
}
@@ -235,10 +235,10 @@ main(int argc, char **argv) {
uint32 AFmode = 0;
uint32 TFmode = 0;
- char dumpSCZFname[1024] = {0}; // single contig, zero frags
- char dumpMCZFname[1024] = {0}; // low contig, zero frags
- char dumpMCSFname[1024] = {0}; // medium contig, low frags
- char dumpMCMFname[1024] = {0}; // everything else, contig > frags
+ char dumpSCZFname[FILENAME_MAX] = {0}; // single contig, zero frags
+ char dumpMCZFname[FILENAME_MAX] = {0}; // low contig, zero frags
+ char dumpMCSFname[FILENAME_MAX] = {0}; // medium contig, low frags
+ char dumpMCMFname[FILENAME_MAX] = {0}; // everything else, contig > frags
bool beVerbose = false;
@@ -268,10 +268,10 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-dump") == 0) {
arg++;
dumpFlag = true;
- sprintf(dumpSCZFname, "%s.0.singlecontig.zerofrag.fasta", argv[arg]);
- sprintf(dumpMCZFname, "%s.1.multiplecontig.zerofrag.fasta", argv[arg]);
- sprintf(dumpMCSFname, "%s.2.multiplecontig.lowfrag.fasta", argv[arg]);
- sprintf(dumpMCMFname, "%s.3.multiplecontig.multiplefrag.fasta", argv[arg]);
+ snprintf(dumpSCZFname, FILENAME_MAX, "%s.0.singlecontig.zerofrag.fasta", argv[arg]);
+ snprintf(dumpMCZFname, FILENAME_MAX, "%s.1.multiplecontig.zerofrag.fasta", argv[arg]);
+ snprintf(dumpMCSFname, FILENAME_MAX, "%s.2.multiplecontig.lowfrag.fasta", argv[arg]);
+ snprintf(dumpMCMFname, FILENAME_MAX, "%s.3.multiplecontig.multiplefrag.fasta", argv[arg]);
} else if (strcmp(argv[arg], "-v") == 0) {
beVerbose = true;
} else {
@@ -319,11 +319,11 @@ main(int argc, char **argv) {
if (differ) {
fprintf(stderr, "error: mer size differ.\n");
- fprintf(stderr, " AF - "F_U32"\n", ms[0]);
- fprintf(stderr, " TF - "F_U32"\n", ms[1]);
- fprintf(stderr, " AC - "F_U32"\n", ms[2]);
- fprintf(stderr, " DC - "F_U32"\n", ms[3]);
- fprintf(stderr, " CO - "F_U32"\n", ms[4]);
+ fprintf(stderr, " AF - " F_U32 "\n", ms[0]);
+ fprintf(stderr, " TF - " F_U32 "\n", ms[1]);
+ fprintf(stderr, " AC - " F_U32 "\n", ms[2]);
+ fprintf(stderr, " DC - " F_U32 "\n", ms[3]);
+ fprintf(stderr, " CO - " F_U32 "\n", ms[4]);
exit(1);
}
diff --git a/src/meryl/compare-counts.C b/src/meryl/compare-counts.C
index 1ed9372..d70cc68 100644
--- a/src/meryl/compare-counts.C
+++ b/src/meryl/compare-counts.C
@@ -238,10 +238,10 @@ main(int argc, char **argv) {
char outputName[FILENAME_MAX];
- sprintf(outputName, "%s.gp", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.gp", outputPrefix);
FILE *outputGP = fopen(outputName, "w");
- sprintf(outputName, "%s.dat", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.dat", outputPrefix);
FILE *outputDAT = fopen(outputName, "w");
fprintf(outputGP, "set terminal png\n");
@@ -259,7 +259,7 @@ main(int argc, char **argv) {
fclose(outputDAT);
- sprintf(outputName, "gnuplot < %s.gp", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "gnuplot < %s.gp", outputPrefix);
system(outputName);
exit(0);
diff --git a/src/meryl/estimate-mer-threshold.C b/src/meryl/estimate-mer-threshold.C
index f788b25..e2b696f 100644
--- a/src/meryl/estimate-mer-threshold.C
+++ b/src/meryl/estimate-mer-threshold.C
@@ -78,9 +78,9 @@ main(int argc, char **argv) {
uint64 total = 0;
uint32 Xcoverage = 8;
- fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers());
- fprintf(stderr, "unique: "F_U64"\n", MF->numberOfUniqueMers());
- fprintf(stderr, "total: "F_U64"\n", MF->numberOfTotalMers());
+ fprintf(stderr, "distinct: " F_U64 "\n", MF->numberOfDistinctMers());
+ fprintf(stderr, "unique: " F_U64 "\n", MF->numberOfUniqueMers());
+ fprintf(stderr, "total: " F_U64 "\n", MF->numberOfTotalMers());
// Pass 0: try to deduce the X coverage we have. The
// pattern we should see in mer counts is an initial spike
@@ -99,27 +99,27 @@ main(int argc, char **argv) {
uint32 i = 0;
uint32 iX = 0;
- fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers());
- fprintf(stderr, "unique: "F_U64"\n", MF->numberOfUniqueMers());
- fprintf(stderr, "total: "F_U64"\n", MF->numberOfTotalMers());
+ fprintf(stderr, "distinct: " F_U64 "\n", MF->numberOfDistinctMers());
+ fprintf(stderr, "unique: " F_U64 "\n", MF->numberOfUniqueMers());
+ fprintf(stderr, "total: " F_U64 "\n", MF->numberOfTotalMers());
- fprintf(stderr, "Xcoverage zero 1 0 "F_U64"\n", MF->histogram(1));
+ fprintf(stderr, "Xcoverage zero 1 0 " F_U64 "\n", MF->histogram(1));
for (i=2; (i < MF->histogramLength()) && (MF->histogram(i-1) > MF->histogram(i)); i++)
- fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(i-1), MF->histogram(i));
+ fprintf(stderr, "Xcoverage drop " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(i-1), MF->histogram(i));
iX = i - 1;
for (; i < MF->histogramLength(); i++) {
if (MF->histogram(iX) < MF->histogram(i)) {
- fprintf(stderr, "Xcoverage incr "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));
+ fprintf(stderr, "Xcoverage incr " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(iX), MF->histogram(i));
iX = i;
} else {
- //fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));
+ //fprintf(stderr, "Xcoverage drop " F_U32 " " F_U64 " " F_U64 "\n", i, MF->histogram(iX), MF->histogram(i));
}
}
- fprintf(stderr, "Guessed X coverage is "F_U32"\n", iX);
+ fprintf(stderr, "Guessed X coverage is " F_U32 "\n", iX);
Xcoverage = iX;
@@ -143,7 +143,7 @@ main(int argc, char **argv) {
maxCount = i;
}
- fprintf(stderr, "Set maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
+ fprintf(stderr, "Set maxCount to " F_U32 ", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
i, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll);
@@ -174,7 +174,7 @@ main(int argc, char **argv) {
maxCount = i;
- fprintf(stderr, "Reset maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
+ fprintf(stderr, "Reset maxCount to " F_U32 ", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
maxCount, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll);
}
diff --git a/src/meryl/leaff-blocks.C b/src/meryl/leaff-blocks.C
index e0261f9..58fba51 100644
--- a/src/meryl/leaff-blocks.C
+++ b/src/meryl/leaff-blocks.C
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -62,7 +66,7 @@ seqCache *F = 0L;
char seq = S->sequence()[pos];
if (nnn != V[seq]) {
- fprintf(stdout, "%c "F_U32" "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stdout, "%c " F_U32 " " F_U32 " " F_U32 " " F_U32 "\n",
begseq, s, begpos, pos, pos - begpos);
nnn = V[seq];
begpos = pos;
@@ -70,9 +74,9 @@ seqCache *F = 0L;
}
}
- fprintf(stdout, "%c "F_U32" "F_U32" "F_U32" "F_U32"\n",
+ fprintf(stdout, "%c " F_U32 " " F_U32 " " F_U32 " " F_U32 "\n",
begseq, s, begpos, pos, pos - begpos);
- fprintf(stdout, ". "F_U32" "F_U32" "F_U32"\n", s, pos, 0);
+ fprintf(stdout, ". " F_U32 " " F_U32 " " F_U32 "\n", s, pos, 0);
delete S;
}
diff --git a/src/meryl/leaff-duplicates.C b/src/meryl/leaff-duplicates.C
index 902d727..2a8efff 100644
--- a/src/meryl/leaff-duplicates.C
+++ b/src/meryl/leaff-duplicates.C
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -57,9 +61,9 @@ mapDuplicates_Print(char *filea, seqInCore *sa,
char *fileb, seqInCore *sb) {
if (strcmp(sa->sequence(), sb->sequence()) == 0)
- fprintf(stdout, F_U32" <-> "F_U32"\n", sa->getIID(), sb->getIID());
+ fprintf(stdout, F_U32" <-> " F_U32 "\n", sa->getIID(), sb->getIID());
else
- fprintf(stderr, "COLLISION DETECTED BETWEEN %s:"F_U32" AND %s:"F_U32"!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
+ fprintf(stderr, "COLLISION DETECTED BETWEEN %s:" F_U32 " AND %s:" F_U32 "!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
filea, sa->getIID(), fileb, sb->getIID());
}
@@ -83,7 +87,7 @@ findDuplicates(char *filename) {
for (uint32 idx=1; idx<numSeqs; idx++) {
if (md5_compare(result+idx-1, result+idx) == 0) {
if (result[idx-1].i == result[idx].i) {
- fprintf(stderr, "Internal error: found two copies of the same sequence iid ("F_U32")!\n", result[idx].i);
+ fprintf(stderr, "Internal error: found two copies of the same sequence iid (" F_U32 ")!\n", result[idx].i);
exit(1);
}
@@ -91,11 +95,11 @@ findDuplicates(char *filename) {
s2 = A->getSequenceInCore(result[idx].i);
if (strcmp(s1->sequence(), s2->sequence()) == 0) {
- fprintf(stdout, F_U32":%s\n"F_U32":%s\n\n",
+ fprintf(stdout, F_U32":%s\n" F_U32 ":%s\n\n",
result[idx-1].i, s1->header(),
result[idx ].i, s2->header());
} else {
- fprintf(stderr, "COLLISION DETECTED BETWEEN IID "F_U32" AND "F_U32"!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
+ fprintf(stderr, "COLLISION DETECTED BETWEEN IID " F_U32 " AND " F_U32 "!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
result[idx-1].i, result[idx].i);
}
diff --git a/src/meryl/leaff-gc.C b/src/meryl/leaff-gc.C
index 5daed02..7ada477 100644
--- a/src/meryl/leaff-gc.C
+++ b/src/meryl/leaff-gc.C
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -98,7 +102,7 @@ computeGCcontent(char *filename) {
ave1001 += g[i+500] - ((i > 500) ? g[i-501] : 0);
ave2001 += g[i+1000] - ((i > 1000) ? g[i-1001] : 0);
- fprintf(stdout, F_U32"\t"F_U32"\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+ fprintf(stdout, F_U32"\t" F_U32 "\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
i,
s[i],
ave3 / (double)((i >= 1) ? 3 - ((i < genomeLength - 1) ? 0 : i + 2 - genomeLength) : i+2),
diff --git a/src/meryl/leaff-partition.C b/src/meryl/leaff-partition.C
index f338736..9621aec 100644
--- a/src/meryl/leaff-partition.C
+++ b/src/meryl/leaff-partition.C
@@ -27,6 +27,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -79,20 +83,20 @@ void
outputPartition(seqCache *F,
char *prefix,
partition_s *p, uint32 openP, uint32 n) {
- char filename[1024];
+ char filename[FILENAME_MAX];
// Check that everything has been partitioned
//
for (uint32 i=0; i<n; i++)
if (p[i].partition == 0)
- fprintf(stderr, "ERROR: Failed to partition "F_U32"\n", i);
+ fprintf(stderr, "ERROR: Failed to partition " F_U32 "\n", i);
if (prefix) {
// This rewrites the source fasta file into partitioned fasta files
//
for (uint32 o=1; o<=openP; o++) {
- sprintf(filename, "%s-%03"F_U32P".fasta", prefix, o);
+ snprintf(filename, FILENAME_MAX, "%s-%03" F_U32P ".fasta", prefix, o);
errno = 0;
FILE *file = fopen(filename, "w");
@@ -107,7 +111,7 @@ outputPartition(seqCache *F,
fprintf(file, "\n");
if (S->sequenceLength() != p[i].length) {
- fprintf(stderr, "Huh? '%s' "F_U32" != "F_U32"\n", S->header(), S->sequenceLength(), p[i].length);
+ fprintf(stderr, "Huh? '%s' " F_U32 " != " F_U32 "\n", S->header(), S->sequenceLength(), p[i].length);
}
delete S;
@@ -126,10 +130,10 @@ outputPartition(seqCache *F,
for (uint32 i=0; i<n; i++)
if (p[i].partition == o)
sizeP += p[i].length;
- fprintf(stdout, F_U32"]("F_U32")", o, sizeP);
+ fprintf(stdout, F_U32"](" F_U32 ")", o, sizeP);
for (uint32 i=0; i<n; i++)
if (p[i].partition == o)
- fprintf(stdout, " "F_U32"("F_U32")", p[i].index, p[i].length);
+ fprintf(stdout, " " F_U32 "(" F_U32 ")", p[i].index, p[i].length);
fprintf(stdout, "\n");
}
@@ -223,6 +227,12 @@ partitionByBucket(char *prefix, uint64 partitionSize, char *filename) {
void
partitionBySegment(char *prefix, uint64 numSegments, char *filename) {
+
+ if (numSegments == 0) {
+ fprintf(stderr, "Number of segments to partition into must be larger than zero.\n");
+ exit(1);
+ }
+
seqCache *F = new seqCache(filename);
uint32 n = F->getNumberOfSequences();
partition_s *p = new partition_s [n];
diff --git a/src/meryl/leaff-statistics.C b/src/meryl/leaff-statistics.C
index 60e4225..3b4b546 100644
--- a/src/meryl/leaff-statistics.C
+++ b/src/meryl/leaff-statistics.C
@@ -27,6 +27,10 @@
* are Copyright 2014-2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -127,7 +131,7 @@ stats(char *filename, uint64 refLen) {
for (uint32 i=0, sum=0, n=1; (i < numSeq) && (n < 11); i++) {
if ((sum < sizeb[n]) && (sizeb[n] <= sum + Lb[i])) {
- n50b[n] = Ls[i];
+ n50b[n] = Lb[i];
l50b[n] = i;
n++;
}
@@ -140,19 +144,19 @@ stats(char *filename, uint64 refLen) {
fprintf(stdout, "%s\n", F->getSourceName());
fprintf(stdout, "\n");
- fprintf(stdout, "numSeqs "F_U32"\n", numSeq);
+ fprintf(stdout, "numSeqs " F_U32 "\n", numSeq);
fprintf(stdout, "\n");
- fprintf(stdout, "SPAN (smallest "F_U32" largest "F_U32")\n", Ls[numSeq-1], Ls[0]);
+ fprintf(stdout, "SPAN (smallest " F_U32 " largest " F_U32 ")\n", Ls[numSeq-1], Ls[0]);
for (uint32 i=1; i<10; i++)
- fprintf(stdout, "n"F_U32" %10"F_U32P" at index "F_U32"\n", 10 * i, n50s[i], l50s[i]);
- fprintf(stdout, "totLen %10"F_U64P"\n", Ss);
- fprintf(stdout, "refLen %10"F_U64P"\n", Rs);
+ fprintf(stdout, "n" F_U32 " %10" F_U32P " at index " F_U32 "\n", 10 * i, n50s[i], l50s[i]);
+ fprintf(stdout, "totLen %10" F_U64P "\n", Ss);
+ fprintf(stdout, "refLen %10" F_U64P "\n", Rs);
fprintf(stdout, "\n");
- fprintf(stdout, "BASES (smallest "F_U32" largest "F_U32")\n", Lb[numSeq-1], Lb[0]);
+ fprintf(stdout, "BASES (smallest " F_U32 " largest " F_U32 ")\n", Lb[numSeq-1], Lb[0]);
for (uint32 i=1; i<10; i++)
- fprintf(stdout, "n"F_U32" %10"F_U32P" at index "F_U32"\n", 10 * i, n50b[i], l50b[i]);
- fprintf(stdout, "totLen %10"F_U64P"\n", Sb);
- fprintf(stdout, "refLen %10"F_U64P"\n", Rb);
+ fprintf(stdout, "n" F_U32 " %10" F_U32P " at index " F_U32 "\n", 10 * i, n50b[i], l50b[i]);
+ fprintf(stdout, "totLen %10" F_U64P "\n", Sb);
+ fprintf(stdout, "refLen %10" F_U64P "\n", Rb);
delete [] Ls;
delete [] Lb;
diff --git a/src/meryl/leaff.C b/src/meryl/leaff.C
index 3fcc2a2..805b366 100644
--- a/src/meryl/leaff.C
+++ b/src/meryl/leaff.C
@@ -278,10 +278,10 @@ printSequence(char *def,
char d[1024];
uint32 l = strlen(seq);
- sprintf(d, "%s_5", def);
+ snprintf(d, 1024, "%s_5", def);
printSequence(d, seq, 0, endExtract);
- sprintf(d, "%s_3", def);
+ snprintf(d, 1024, "%s_3", def);
printSequence(d, seq, l-endExtract, l);
return;
@@ -361,7 +361,7 @@ void
printSequence(uint32 sid) {
seqInCore *sic = fasta->getSequenceInCore(sid);
if (sic == 0L)
- fprintf(stderr, "WARNING: Didn't find sequence with iid '"F_U32"'\n", sid);
+ fprintf(stderr, "WARNING: Didn't find sequence with iid '" F_U32 "'\n", sid);
else
printSequence(sic);
delete sic;
@@ -450,7 +450,7 @@ processArray(int argc, char **argv) {
(argv[arg] == 0L) ? "(nullpointer)" : argv[arg]), exit(1);
for (uint32 s=0; s<fasta->getNumberOfSequences(); s++)
- fprintf(stdout, "G\tseq\t%s:"F_U32"\t"F_U32"\t%s\n",
+ fprintf(stdout, "G\tseq\t%s:" F_U32 "\t" F_U32 "\t%s\n",
argv[arg], s, fasta->getSequenceLength(s), ">unimplemented");
} else if (strcmp(argv[arg], "-d") == 0) {
@@ -519,7 +519,7 @@ processArray(int argc, char **argv) {
seq[p++] = bases[MT.mtRandom32() & 0x3];
seq[p] = 0;
- sprintf(def, "random%06"F_U32P, i);
+ snprintf(def, 1024, "random%06" F_U32P, i);
printSequence(def, seq, 0, j);
}
@@ -607,6 +607,16 @@ processArray(int argc, char **argv) {
begPos = strtouint32(argv[++arg]);
endPos = strtouint32(argv[++arg]);
+ if (endPos < begPos) {
+ uint32 t = begPos;
+
+ begPos = endPos;
+ endPos = t;
+
+ doReverse = true;
+ doComplement = true;
+ }
+
} else if (strcmp(argv[arg], "-ends") == 0) {
endExtract = strtouint32(argv[++arg]);
@@ -777,7 +787,7 @@ processFile(char *filename) {
errno = 0;
len = fread(data+pos, 1, max - pos, F);
if (errno)
- fprintf(stderr, "Couldn't read "F_U64" bytes from '%s': %s\n",
+ fprintf(stderr, "Couldn't read " F_U64 " bytes from '%s': %s\n",
(uint64)(max-pos), filename, strerror(errno)), exit(1);
pos += len;
diff --git a/src/meryl/libleaff/fastaFile.C b/src/meryl/libleaff/fastaFile.C
index 94f250c..05e759b 100644
--- a/src/meryl/libleaff/fastaFile.C
+++ b/src/meryl/libleaff/fastaFile.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-MAY-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -141,7 +145,7 @@ uint32
fastaFile::getSequenceLength(uint32 iid) {
#ifdef DEBUG
- fprintf(stderr, "fastaFile::getSequenceLength()-- "F_U32"\n", iid);
+ fprintf(stderr, "fastaFile::getSequenceLength()-- " F_U32 "\n", iid);
#endif
return((iid < _numberOfSequences) ? _index[iid]._seqLength : 0);
@@ -155,7 +159,7 @@ fastaFile::getSequence(uint32 iid,
char *&s, uint32 &sLen, uint32 &sMax) {
#ifdef DEBUG
- fprintf(stderr, "fastaFile::getSequence(full)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastaFile::getSequence(full)-- " F_U32 "\n", iid);
#endif
// Assume there is no index. Without being horribly complicated
@@ -169,7 +173,7 @@ fastaFile::getSequence(uint32 iid,
// use the seqFile instead.
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "fastaFile::getSequence(full)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "fastaFile::getSequence(full)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
@@ -194,7 +198,7 @@ fastaFile::getSequence(uint32 iid,
sLen = 0;
#ifdef DEBUG
- fprintf(stderr, "fastaFile::getSequence(full)-- seek to iid="F_U32" at pos="F_U32"\n",
+ fprintf(stderr, "fastaFile::getSequence(full)-- seek to iid=" F_U32 " at pos=" F_U32 "\n",
iid, _index[iid]._seqPosition);
#endif
_rb->seek(_index[iid]._seqPosition);
@@ -270,13 +274,13 @@ fastaFile::getSequence(uint32 iid,
uint32 bgn, uint32 end, char *s) {
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "fastaFile::getSequence(part)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "fastaFile::getSequence(part)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
#ifdef DEBUG
- fprintf(stderr, "fastaFile::getSequence(part)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastaFile::getSequence(part)-- " F_U32 "\n", iid);
#endif
// It is impossible to be efficient here; see the big comment in
@@ -341,6 +345,7 @@ fastaFile::clear(void) {
strcpy(_typename, "FastA");
+ _randomAccessSupported = true;
_numberOfSequences = 0;
_rb = 0L;
@@ -565,7 +570,7 @@ fastaFile::constructIndex(void) {
_index[indexLen]._seqLength = seqLen;
#ifdef DEBUG
- fprintf(stderr, "INDEX iid="F_U32" len="F_U32" pos="F_U64"\n",
+ fprintf(stderr, "INDEX iid=" F_U32 " len=" F_U32 " pos=" F_U64 "\n",
indexLen, seqLen, seqStart);
#endif
diff --git a/src/meryl/libleaff/fastaStdin.C b/src/meryl/libleaff/fastaStdin.C
index 94055b3..af41cf6 100644
--- a/src/meryl/libleaff/fastaStdin.C
+++ b/src/meryl/libleaff/fastaStdin.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-MAY-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -137,7 +141,7 @@ fastaStdin::getSequence(uint32 iid,
bool ret = true;
#ifdef DEBUG
- fprintf(stderr, "fastaStdin::getSequence(full)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastaStdin::getSequence(full)-- " F_U32 "\n", iid);
#endif
if (iid == _nextIID)
diff --git a/src/meryl/libleaff/fastqFile.C b/src/meryl/libleaff/fastqFile.C
index 5ee1a9b..2b9b118 100644
--- a/src/meryl/libleaff/fastqFile.C
+++ b/src/meryl/libleaff/fastqFile.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-MAY-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -141,7 +145,7 @@ uint32
fastqFile::getSequenceLength(uint32 iid) {
#ifdef DEBUG
- fprintf(stderr, "fastqFile::getSequenceLength()-- "F_U32"\n", iid);
+ fprintf(stderr, "fastqFile::getSequenceLength()-- " F_U32 "\n", iid);
#endif
return((iid < _numberOfSequences) ? _index[iid]._seqLength : 0);
@@ -155,11 +159,11 @@ fastqFile::getSequence(uint32 iid,
char *&s, uint32 &sLen, uint32 &sMax) {
#ifdef DEBUG
- fprintf(stderr, "fastqFile::getSequence(full)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastqFile::getSequence(full)-- " F_U32 "\n", iid);
#endif
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "fastqFile::getSequence(full)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "fastqFile::getSequence(full)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
@@ -184,7 +188,7 @@ fastqFile::getSequence(uint32 iid,
sLen = 0;
#ifdef DEBUG
- fprintf(stderr, "fastqFile::getSequence(full)-- seek to iid="F_U32" at pos="F_U32"\n",
+ fprintf(stderr, "fastqFile::getSequence(full)-- seek to iid=" F_U32 " at pos=" F_U32 "\n",
iid, _index[iid]._seqPosition);
#endif
_rb->seek(_index[iid]._seqPosition);
@@ -271,13 +275,13 @@ fastqFile::getSequence(uint32 iid,
uint32 bgn, uint32 end, char *s) {
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "fastqFile::getSequence(part)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "fastqFile::getSequence(part)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
#ifdef DEBUG
- fprintf(stderr, "fastqFile::getSequence(part)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastqFile::getSequence(part)-- " F_U32 "\n", iid);
#endif
// Unlike the fasta version of this, we know that all the sequence is on one line. However, we
@@ -338,6 +342,7 @@ fastqFile::clear(void) {
strcpy(_typename, "Fastq");
+ _randomAccessSupported = true;
_numberOfSequences = 0;
_rb = 0L;
@@ -549,7 +554,7 @@ fastqFile::constructIndex(void) {
// Save to the index.
if (indexLen >= indexMax) {
- fprintf(stderr, "REALLOC len="F_U32" from "F_U32" to "F_U32"\n", indexLen, indexMax, indexMax * 2);
+ fprintf(stderr, "REALLOC len=" F_U32 " from " F_U32 " to " F_U32 "\n", indexLen, indexMax, indexMax * 2);
indexMax *= 2;
fastqFileIndex *et = new fastqFileIndex[indexMax];
memcpy(et, _index, sizeof(fastqFileIndex) * indexLen);
@@ -563,7 +568,7 @@ fastqFile::constructIndex(void) {
#if 0
if ((indexLen * sizeof(fastqFileIndex) > 131000) &&
(indexLen * sizeof(fastqFileIndex) < 131200))
- fprintf(stderr, "INDEX pos="F_U64" iid="F_U32" len="F_U32" pos="F_U64"\n",
+ fprintf(stderr, "INDEX pos=" F_U64 " iid=" F_U32 " len=" F_U32 " pos=" F_U64 "\n",
indexLen * sizeof(fastqFileIndex), indexLen, seqLen, seqStart);
#endif
diff --git a/src/meryl/libleaff/fastqStdin.C b/src/meryl/libleaff/fastqStdin.C
index d0e5f49..d288745 100644
--- a/src/meryl/libleaff/fastqStdin.C
+++ b/src/meryl/libleaff/fastqStdin.C
@@ -58,6 +58,7 @@ fastqStdin::~fastqStdin() {
delete _rb;
delete [] _header;
delete [] _sequence;
+ delete [] _quality;
}
@@ -141,7 +142,7 @@ fastqStdin::getSequence(uint32 iid,
bool ret = true;
#ifdef DEBUG
- fprintf(stderr, "fastqStdin::getSequence(full)-- "F_U32"\n", iid);
+ fprintf(stderr, "fastqStdin::getSequence(full)-- " F_U32 "\n", iid);
#endif
if (iid == _nextIID)
@@ -181,7 +182,7 @@ bool
fastqStdin::getSequence(uint32 iid,
uint32 bgn, uint32 end, char *s) {
- fprintf(stderr, "fastqStdin::getSequence(part)-- ERROR! Used for random access on iid "F_U32" from position "F_U32"-"F_U32".\n", iid, bgn, end);
+ fprintf(stderr, "fastqStdin::getSequence(part)-- ERROR! Used for random access on iid " F_U32 " from position " F_U32 "-" F_U32 ".\n", iid, bgn, end);
assert(0);
return(false);
}
@@ -210,6 +211,10 @@ fastqStdin::clear(void) {
_sequence = 0L;
_sequenceLen = 0;
_sequenceMax = 0;
+
+ _quality = 0L;
+ _qualityLen = 0;
+ _qualityMax = 0;
}
diff --git a/src/meryl/libleaff/gkStoreFile.C b/src/meryl/libleaff/gkStoreFile.C
index 5458cd9..ce31cb1 100644
--- a/src/meryl/libleaff/gkStoreFile.C
+++ b/src/meryl/libleaff/gkStoreFile.C
@@ -33,6 +33,7 @@
gkStoreFile::gkStoreFile() {
clear();
+ gkp = NULL;
}
@@ -86,6 +87,7 @@ bool
gkStoreFile::getSequence(uint32 iid,
char *&h, uint32 &hLen, uint32 &hMax,
char *&s, uint32 &sLen, uint32 &sMax) {
+
if (iid > _numberOfSequences) {
fprintf(stderr, "gkStoreFile::getSequence()-- iid %u exceeds number in store %u\n", iid, _numberOfSequences);
return(false);
diff --git a/src/meryl/libleaff/seqCache.C b/src/meryl/libleaff/seqCache.C
index 184e09b..f989e8f 100644
--- a/src/meryl/libleaff/seqCache.C
+++ b/src/meryl/libleaff/seqCache.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-MAY-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -77,7 +81,7 @@ seqCache::getSequenceIID(char *name) {
iid = _fb->find(name);
#ifdef DEBUG
- fprintf(stderr, "seqCache::getSequenceIID()-- '%s' -> "F_U32"\n", name, iid);
+ fprintf(stderr, "seqCache::getSequenceIID()-- '%s' -> " F_U32 "\n", name, iid);
#endif
return(iid);
@@ -190,7 +194,7 @@ seqCache::loadAllSequences(void) {
char *h=0L, *s=0L;
if (_fb->getSequence(iid, h, hLen, hMax, s, sLen, sMax) == false)
- fprintf(stderr, "seqCache::loadAllSequences()-- Failed to load iid "F_U32".\n",
+ fprintf(stderr, "seqCache::loadAllSequences()-- Failed to load iid " F_U32 ".\n",
iid), exit(1);
_cache[iid] = new seqInCore(iid, h, hLen, s, sLen, true);
diff --git a/src/meryl/libleaff/seqFactory.C b/src/meryl/libleaff/seqFactory.C
index 8d1d8af..88afae8 100644
--- a/src/meryl/libleaff/seqFactory.C
+++ b/src/meryl/libleaff/seqFactory.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-06
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -77,10 +81,10 @@ seqFactory::openFile(const char *name) {
return(n);
}
- fprintf(stderr, "seqFactory::registerFile()-- Cannot determine type of file '%s'. Tried:\n", name);
+ fprintf(stderr, "seqFactory::openFile()-- Cannot determine type of file '%s'. Tried:\n", name);
for (uint32 i=0; i<_filesNum; i++)
- fprintf(stderr, "seqFactory::registerFile()-- '%s'\n", _files[i]->getFileTypeName());
+ fprintf(stderr, "seqFactory::openFile()-- '%s'\n", _files[i]->getFileTypeName());
exit(1);
return(n);
diff --git a/src/meryl/libleaff/seqStore.C b/src/meryl/libleaff/seqStore.C
index cff9aa8..fb16afa 100644
--- a/src/meryl/libleaff/seqStore.C
+++ b/src/meryl/libleaff/seqStore.C
@@ -155,7 +155,7 @@ seqStore::getSequence(uint32 iid,
loadIndex();
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "seqStore::getSequence(full)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "seqStore::getSequence(full)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
@@ -223,13 +223,13 @@ seqStore::getSequence(uint32 iid,
loadIndex();
if (iid >= _header._numberOfSequences) {
- fprintf(stderr, "seqStore::getSequence(part)-- iid "F_U32" more than number of sequences "F_U32"\n",
+ fprintf(stderr, "seqStore::getSequence(part)-- iid " F_U32 " more than number of sequences " F_U32 "\n",
iid, _header._numberOfSequences);
return(false);
}
if (bgn >= end) {
- fprintf(stderr, "seqStore::getSequence(part)-- for iid "F_U32"; invalid bgn="F_U32" end="F_U32"; seqLen="F_U32"\n",
+ fprintf(stderr, "seqStore::getSequence(part)-- for iid " F_U32 "; invalid bgn=" F_U32 " end=" F_U32 "; seqLen=" F_U32 "\n",
iid, bgn, end, _index[iid]._seqLength);
return(false);
}
@@ -346,9 +346,9 @@ seqStore::loadIndex(void) {
fread(&_header, sizeof(seqStoreHeader), 1, F);
- //fprintf(stderr, "seqStore::seqStore()-- Allocating space for "F_U32" sequences ("F_U64"MB)\n", _header._numberOfSequences, _header._numberOfSequences * sizeof(seqStoreIndex) / 1024 / 1024);
- //fprintf(stderr, "seqStore::seqStore()-- Allocating space for "F_U32" blocks ("F_U64"MB)\n", _header._numberOfBlocks, _header._numberOfBlocks * sizeof(seqStoreBlock) / 1024 / 1024);
- //fprintf(stderr, "seqStore::seqStore()-- Allocating space for "F_U32" labels ("F_U64"MB)\n", _header._namesLength, _header._namesLength * sizeof(char) / 1024 / 1024);
+ //fprintf(stderr, "seqStore::seqStore()-- Allocating space for " F_U32 " sequences (" F_U64 "MB)\n", _header._numberOfSequences, _header._numberOfSequences * sizeof(seqStoreIndex) / 1024 / 1024);
+ //fprintf(stderr, "seqStore::seqStore()-- Allocating space for " F_U32 " blocks (" F_U64 "MB)\n", _header._numberOfBlocks, _header._numberOfBlocks * sizeof(seqStoreBlock) / 1024 / 1024);
+ //fprintf(stderr, "seqStore::seqStore()-- Allocating space for " F_U32 " labels (" F_U64 "MB)\n", _header._namesLength, _header._namesLength * sizeof(char) / 1024 / 1024);
_index = new seqStoreIndex [_header._numberOfSequences];
_block = new seqStoreBlock [_header._numberOfBlocks];
@@ -436,7 +436,7 @@ constructSeqStore(char *filename, seqCache *inputseq) {
filename, inputseq->getSourceName(), inputseq->getFileTypeName());
seqStoreHeader HEAD;
- memset(&HEAD, sizeof(seqStoreHeader), 0);
+ memset(&HEAD, 0, sizeof(seqStoreHeader));
bitPackedFile *DATA = new bitPackedFile(filename, sizeof(seqStoreHeader), true);
@@ -490,13 +490,13 @@ constructSeqStore(char *filename, seqCache *inputseq) {
#if SEQSTOREBLOCK_MAXPOS < uint64MASK(32)
if (sic->sequenceLength() > SEQSTOREBLOCK_MAXPOS)
- fprintf(stderr, "constructSeqStore()-- sequence %s too long, must be shorter than "F_U64" Gbp.\n",
+ fprintf(stderr, "constructSeqStore()-- sequence %s too long, must be shorter than " F_U64 " Gbp.\n",
sic->header(), SEQSTOREBLOCK_MAXPOS / 1024 / 1024 / 1024), exit(1);
#endif
#if SEQSTOREBLOCK_MAXIID < uint64MASK(32)
if (sic->getIID() > SEQSTOREBLOCK_MAXPOS)
- fprintf(stderr, "constructSeqStore()-- too many sequences, must be fewer than "F_U64".\n",
+ fprintf(stderr, "constructSeqStore()-- too many sequences, must be fewer than " F_U64 ".\n",
SEQSTOREBLOCK_MAXIID), exit(1);
#endif
@@ -650,6 +650,6 @@ constructSeqStore(char *filename, seqCache *inputseq) {
// ESTmapper depends on this output.
- fprintf(stderr, "constructSeqStore()-- seqStore '%s' constructed ("F_U32" sequences, "F_U64" ACGT letters, "F_U32" ACGT blocks, "F_U32" GAP blocks).\n",
+ fprintf(stderr, "constructSeqStore()-- seqStore '%s' constructed (" F_U32 " sequences, " F_U64 " ACGT letters, " F_U32 " ACGT blocks, " F_U32 " GAP blocks).\n",
filename, HEAD._numberOfSequences, HEAD._numberOfACGT, HEAD._numberOfBlocksACGT, HEAD._numberOfBlocksGAP);
}
diff --git a/src/meryl/libleaff/seqStream.C b/src/meryl/libleaff/seqStream.C
index ec38689..277c2c1 100644
--- a/src/meryl/libleaff/seqStream.C
+++ b/src/meryl/libleaff/seqStream.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -44,7 +48,7 @@ seqStream::seqStream(const char *filename) {
_idxLen = _file->getNumberOfSequences();
_idx = new seqStreamIndex [_idxLen + 1];
- //fprintf(stderr, "seqStream::seqStream()-- Allocating "F_U64"MB for seqStreamIndex on "F_U64" sequences.\n",
+ //fprintf(stderr, "seqStream::seqStream()-- Allocating " F_U64 "MB for seqStreamIndex on " F_U64 " sequences.\n",
// _idxLen * sizeof(seqStreamIndex) / 1024 / 1024, _idxLen);
_seqNumOfPos = 0L;
@@ -169,7 +173,7 @@ seqStream::tradeSpaceForTime(void) {
uint32 i = 0;
uint32 s = 0;
- //fprintf(stderr, "Allocating "F_U32" uint32s for seqNumOfPos.\n", _lengthOfSequences);
+ //fprintf(stderr, "Allocating " F_U32 " uint32s for seqNumOfPos.\n", _lengthOfSequences);
_seqNumOfPos = new uint32 [_lengthOfSequences];
@@ -232,12 +236,12 @@ seqStream::rewind(void){
_streamPos = _bgn;
_bufferPos = _bgn;
- //fprintf(stderr, "seqStream::rewind()-- 1 currentIdx="F_U32" currentPos="F_U32" streamPos="F_U32" bufferPos="F_U32"\n",
+ //fprintf(stderr, "seqStream::rewind()-- 1 currentIdx=" F_U32 " currentPos=" F_U32 " streamPos=" F_U32 " bufferPos=" F_U32 "\n",
// _currentIdx, _currentPos, _streamPos, _bufferPos);
fillBuffer();
- //fprintf(stderr, "seqStream::rewind()-- 2 currentIdx="F_U32" currentPos="F_U32" streamPos="F_U32" bufferPos="F_U32"\n",
+ //fprintf(stderr, "seqStream::rewind()-- 2 currentIdx=" F_U32 " currentPos=" F_U32 " streamPos=" F_U32 " bufferPos=" F_U32 "\n",
// _currentIdx, _currentPos, _streamPos, _bufferPos);
}
@@ -258,7 +262,7 @@ seqStream::setRange(uint64 bgn, uint64 end) {
end = l;
if ((bgn > l) || (end > l))
- fprintf(stderr, "seqStream::setRange()-- ERROR: range ("F_U64","F_U64") too big; only "F_U64" positions.\n",
+ fprintf(stderr, "seqStream::setRange()-- ERROR: range (" F_U64 "," F_U64 ") too big; only " F_U64 " positions.\n",
bgn, end, l), exit(1);
_bgn = bgn;
@@ -290,7 +294,7 @@ seqStream::sequenceNumberOfPosition(uint64 p) {
// sequence that p is in.
if (_lengthOfSequences <= p) {
- fprintf(stderr, "seqStream::sequenceNumberOfPosition()-- WARNING: position p="F_U64" too big; only "F_U64" positions.\n",
+ fprintf(stderr, "seqStream::sequenceNumberOfPosition()-- WARNING: position p=" F_U64 " too big; only " F_U64 " positions.\n",
p, _lengthOfSequences);
return(s);
}
@@ -354,7 +358,7 @@ seqStream::fillBuffer(void) {
if (_currentPos < _idx[_currentIdx]._len) {
#ifdef DEBUG
- fprintf(stderr, "seqStream::fillBuffer()-- More Seq currentPos="F_U32" len="F_U32"\n", _currentPos, _idx[_currentIdx]._len);
+ fprintf(stderr, "seqStream::fillBuffer()-- More Seq currentPos=" F_U32 " len=" F_U32 "\n", _currentPos, _idx[_currentIdx]._len);
#endif
_bufferLen = MIN(_idx[_currentIdx]._len - _currentPos, _bufferMax);
@@ -362,7 +366,7 @@ seqStream::fillBuffer(void) {
_currentPos,
_currentPos + _bufferLen,
_buffer) == false)
- fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #1 iid="F_U32" bgn="F_U32" end="F_U32"\n",
+ fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #1 iid=" F_U32 " bgn=" F_U32 " end=" F_U32 "\n",
_idx[_currentIdx]._iid, _currentPos, _currentPos + _bufferLen), exit(1);
return;
@@ -377,7 +381,7 @@ seqStream::fillBuffer(void) {
_currentIdx++;
#ifdef DEBUG
- fprintf(stderr, "seqStream::fillBuffer()-- New Seq currentPos="F_U32" len="F_U32"\n", _currentPos, _idx[_currentIdx]._len);
+ fprintf(stderr, "seqStream::fillBuffer()-- New Seq currentPos=" F_U32 " len=" F_U32 "\n", _currentPos, _idx[_currentIdx]._len);
#endif
// All done if there is no more sequence.
@@ -408,7 +412,7 @@ seqStream::fillBuffer(void) {
_currentPos,
_currentPos + bl,
_buffer + _bufferLen) == false)
- fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #2 iid="F_U32" bgn="F_U32" end="F_U32"\n",
+ fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #2 iid=" F_U32 " bgn=" F_U32 " end=" F_U32 "\n",
_idx[_currentIdx]._iid, _currentPos, _currentPos + bl), exit(1);
_bufferLen += bl;
diff --git a/src/meryl/libleaff/sffFile.C b/src/meryl/libleaff/sffFile.C
index 9df4537..769f9bf 100644
--- a/src/meryl/libleaff/sffFile.C
+++ b/src/meryl/libleaff/sffFile.C
@@ -19,6 +19,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-06
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -220,6 +224,7 @@ sffFile::clear(void) {
strcpy(_typename, "SFF");
+ _randomAccessSupported = true;
_numberOfSequences = 0;
_rb = 0L;
diff --git a/src/meryl/libmeryl.C b/src/meryl/libmeryl.C
index 72ce68c..94147e2 100644
--- a/src/meryl/libmeryl.C
+++ b/src/meryl/libmeryl.C
@@ -43,10 +43,10 @@
* full conditions and disclaimers for each license.
*/
-#include "libmeryl.H"
-
+#include "AS_global.H"
#include "AS_UTL_fileIO.H"
-#include "AS_UTL_alloc.H"
+
+#include "libmeryl.H"
// Version 3 ??
@@ -61,6 +61,7 @@ static char *PmagicV = "merylStreamPv04\n";
static char *PmagicX = "merylStreamPvXX\n";
merylStreamReader::merylStreamReader(const char *fn_, uint32 ms_) {
+ char inpath[FILENAME_MAX];
if (fn_ == 0L) {
fprintf(stderr, "ERROR - no counted database file specified.\n");
@@ -72,22 +73,19 @@ merylStreamReader::merylStreamReader(const char *fn_, uint32 ms_) {
// Open the files
//
- char *inpath = new char [strlen(_filename) + 8];
- sprintf(inpath, "%s.mcidx", _filename);
+ snprintf(inpath, FILENAME_MAX, "%s.mcidx", _filename);
_IDX = new bitPackedFile(inpath);
- sprintf(inpath, "%s.mcdat", _filename);
+ snprintf(inpath, FILENAME_MAX, "%s.mcdat", _filename);
_DAT = new bitPackedFile(inpath);
- sprintf(inpath, "%s.mcpos", _filename);
+ snprintf(inpath, FILENAME_MAX, "%s.mcpos", _filename);
if (AS_UTL_fileExists(inpath))
_POS = new bitPackedFile(inpath);
else
_POS = 0L;
- delete [] inpath;
-
// Verify that they are what they should be, and read in the header
//
char Imagic[16] = {0};
@@ -208,20 +206,20 @@ merylStreamReader::merylStreamReader(const char *fn_, uint32 ms_) {
_validMer = true;
#ifdef SHOW_VARIABLES
- fprintf(stderr, "_merSizeInBits = "F_U32"\n", _merSizeInBits);
- fprintf(stderr, "_merCompression = "F_U32"\n", _merCompression);
- fprintf(stderr, "_prefixSize = "F_U32"\n", _prefixSize);
- fprintf(stderr, "_merDataSize = "F_U32"\n", _merDataSize);
- fprintf(stderr, "_numUnique = "F_U64"\n", _numUnique);
- fprintf(stderr, "_numDistinct = "F_U64"\n", _numDistinct);
- fprintf(stderr, "_numTotal = "F_U64"\n", _numTotal);
- fprintf(stderr, "_thisBucket = "F_U64"\n", _thisBucket);
- fprintf(stderr, "_thisBucketSize = "F_U64"\n", _thisBucketSize);
- fprintf(stderr, "_thisMerCount = "F_U64"\n", _thisMerCount);
+ fprintf(stderr, "_merSizeInBits = " F_U32 "\n", _merSizeInBits);
+ fprintf(stderr, "_merCompression = " F_U32 "\n", _merCompression);
+ fprintf(stderr, "_prefixSize = " F_U32 "\n", _prefixSize);
+ fprintf(stderr, "_merDataSize = " F_U32 "\n", _merDataSize);
+ fprintf(stderr, "_numUnique = " F_U64 "\n", _numUnique);
+ fprintf(stderr, "_numDistinct = " F_U64 "\n", _numDistinct);
+ fprintf(stderr, "_numTotal = " F_U64 "\n", _numTotal);
+ fprintf(stderr, "_thisBucket = " F_U64 "\n", _thisBucket);
+ fprintf(stderr, "_thisBucketSize = " F_U64 "\n", _thisBucketSize);
+ fprintf(stderr, "_thisMerCount = " F_U64 "\n", _thisMerCount);
#endif
if ((ms_ > 0) && (_merSizeInBits >> 1 != ms_)) {
- fprintf(stderr, "merylStreamReader()-- ERROR: User requested mersize "F_U32" but '%s' is mersize "F_U32"\n",
+ fprintf(stderr, "merylStreamReader()-- ERROR: User requested mersize " F_U32 " but '%s' is mersize " F_U32 "\n",
ms_, _filename, _merSizeInBits >> 1);
exit(1);
}
@@ -287,27 +285,24 @@ merylStreamWriter::merylStreamWriter(const char *fn_,
uint32 merComp,
uint32 prefixSize,
bool positionsEnabled) {
+ char outpath[FILENAME_MAX];
memset(_filename, 0, sizeof(char) * FILENAME_MAX);
strcpy(_filename, fn_);
- char *outpath = new char [FILENAME_MAX];
-
- sprintf(outpath, "%s.mcidx.creating", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcidx.creating", _filename);
_IDX = new bitPackedFile(outpath, 0, true);
- sprintf(outpath, "%s.mcdat.creating", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcdat.creating", _filename);
_DAT = new bitPackedFile(outpath, 0, true);
if (positionsEnabled) {
- sprintf(outpath, "%s.mcpos.creating", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcpos.creating", _filename);
_POS = new bitPackedFile(outpath, 0, true);
} else {
_POS = 0L;
}
- delete [] outpath;
-
_idxIsPacked = 1;
_datIsPacked = 1;
_posIsPacked = 0;
@@ -448,25 +443,22 @@ merylStreamWriter::~merylStreamWriter() {
// All done! Rename our temporary outputs to final outputs.
- char *outpath = new char [FILENAME_MAX];
- char *finpath = new char [FILENAME_MAX];
+ char outpath[FILENAME_MAX];
+ char finpath[FILENAME_MAX];
- sprintf(outpath, "%s.mcidx.creating", _filename);
- sprintf(finpath, "%s.mcidx", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcidx.creating", _filename);
+ snprintf(finpath, FILENAME_MAX, "%s.mcidx", _filename);
rename(outpath, finpath);
- sprintf(outpath, "%s.mcdat.creating", _filename);
- sprintf(finpath, "%s.mcdat", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcdat.creating", _filename);
+ snprintf(finpath, FILENAME_MAX, "%s.mcdat", _filename);
rename(outpath, finpath);
if (_POS) {
- sprintf(outpath, "%s.mcpos.creating", _filename);
- sprintf(finpath, "%s.mcpos", _filename);
+ snprintf(outpath, FILENAME_MAX, "%s.mcpos.creating", _filename);
+ snprintf(finpath, FILENAME_MAX, "%s.mcpos", _filename);
rename(outpath, finpath);
}
-
- delete [] finpath;
- delete [] outpath;
}
@@ -480,7 +472,7 @@ merylStreamWriter::writeMer(void) {
_numDistinct++;
if (_thisMerCount >= _histogramLen)
- resizeArray(_histogram, _histogramMaxValue, _histogramLen, _thisMerCount + 16384);
+ resizeArray(_histogram, _histogramMaxValue+1, _histogramLen, _thisMerCount + 16384, resizeArray_copyData | resizeArray_clearNew);
_histogram[_thisMerCount]++;
diff --git a/src/meryl/libmeryl.H b/src/meryl/libmeryl.H
index a69693c..7020821 100644
--- a/src/meryl/libmeryl.H
+++ b/src/meryl/libmeryl.H
@@ -35,6 +35,10 @@
* are Copyright 2014-2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-MAY-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
diff --git a/src/meryl/maskMers.C b/src/meryl/maskMers.C
index 9596712..afca643 100644
--- a/src/meryl/maskMers.C
+++ b/src/meryl/maskMers.C
@@ -31,15 +31,12 @@
* full conditions and disclaimers for each license.
*/
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#include "bio++.H"
+#include "AS_global.H"
#include "seqStream.H"
#include "libmeryl.H"
+#include "speedCounter.H"
+
#include <algorithm>
#define MAX_COVERAGE 51
@@ -110,7 +107,7 @@ public:
strcpy(_maskMersName, _merylName);
strcat(_maskMersName, ".maskMers");
- if (fileExists(_maskMersName))
+ if (AS_UTL_fileExists(_maskMersName))
loadMasking(onlySeqIID_);
else
buildMasking();
@@ -162,7 +159,7 @@ merMaskedSequence::loadMasking(uint32 onlySeqIID_) {
_masking = new char * [_numSeq];
_repeatID = new uint32 * [_numSeq];
- fprintf(stderr, uint32FMT" sequences in '%s'\n", _numSeq, _fastaName);
+ fprintf(stderr, F_U32" sequences in '%s'\n", _numSeq, _fastaName);
fread( _seqLen, sizeof(uint32), _numSeq, maskMersFile);
@@ -171,7 +168,7 @@ merMaskedSequence::loadMasking(uint32 onlySeqIID_) {
_repeatID[i] = 0L;
if ((onlySeqIID_ >= _numSeq) || (onlySeqIID_ == i)) {
- fprintf(stderr, "Loading sequence "uint32FMT" of length "uint32FMT"\n", i, _seqLen[i]);
+ fprintf(stderr, "Loading sequence " F_U32 " of length " F_U32 "\n", i, _seqLen[i]);
_masking[i] = new char [_seqLen[i]];
_repeatID[i] = new uint32 [_seqLen[i]];
@@ -221,7 +218,7 @@ merMaskedSequence::buildMasking(void) {
_merSize = 0;
- fprintf(stderr, uint32FMT" sequences in '%s'\n", _numSeq, _fastaName);
+ fprintf(stderr, F_U32" sequences in '%s'\n", _numSeq, _fastaName);
for (uint32 i=0; i<_numSeq; i++) {
_seqLen[i] = STR->lengthOf(i);
@@ -249,7 +246,7 @@ merMaskedSequence::buildMasking(void) {
_merSize = MS->merSize();
while (MS->nextMer()) {
- //fprintf(stderr, "mer count="uint64FMT" pos="uint32FMT"\n", MS->theCount(), MS->getPosition(0));
+ //fprintf(stderr, "mer count="uint64FMT" pos=" F_U32 "\n", MS->theCount(), MS->getPosition(0));
if (MS->theCount() == 1) {
uint32 p = MS->getPosition(0);
@@ -309,7 +306,7 @@ computeDensity(merMaskedSequence *S, char *outputPrefix) {
if (S->seqLen(s) == 0)
continue;
- sprintf(outputName, "%s.density.seq"uint32FMTW(02), outputPrefix, s);
+ snprintf(outputName, FILENAME_MAX, "%s.density.seq%02" F_U32P, outputPrefix, s);
outputFile = fopen(outputName, "w");
fprintf(stderr, "Starting '%s'\n", outputName);
@@ -336,7 +333,7 @@ computeDensity(merMaskedSequence *S, char *outputPrefix) {
p++;
}
- fprintf(outputFile, uint32FMT"\t%f\t%f\t%f\n",
+ fprintf(outputFile, F_U32"\t%f\t%f\t%f\n",
p - windowSize,
(double)uniqueSum / windowSize,
(double)repeatSum / windowSize,
@@ -385,12 +382,12 @@ computeMateRescue(merMaskedSequence *S, char *outputPrefix, mateRescueData *lib,
if (S->seqLen(s) == 0)
continue;
- fprintf(stderr, "Starting sequence "uint32FMT"\n", s);
+ fprintf(stderr, "Starting sequence " F_U32 "\n", s);
- sprintf(outputName, "%s.mateRescue.seq"uint32FMTW(02)".out", outputPrefix, s);
+ snprintf(outputName, FILENAME_MAX, "%s.mateRescue.seq%02" F_U32P ".out", outputPrefix, s);
outputFile = fopen(outputName, "w");
- sprintf(outputName, "%s.mateRescue.seq"uint32FMTW(02)".dat", outputPrefix, s);
+ snprintf(outputName, FILENAME_MAX, "%s.mateRescue.seq%02" F_U32P ".dat", outputPrefix, s);
outputData = fopen(outputName, "w");
double numRR[MAX_COVERAGE] = {0}; // num repeats rescued (expected) for [] X coverage
@@ -536,7 +533,7 @@ computeMateRescue(merMaskedSequence *S, char *outputPrefix, mateRescueData *lib,
}
} // over all libs
- fprintf(outputData, int32FMT"\t%f\t%f\n", p, pRtot / totalDepth, pFtot / totalDepth);
+ fprintf(outputData, F_S32"\t%f\t%f\n", p, pRtot / totalDepth, pFtot / totalDepth);
} // if masking is r
} // over all positions
@@ -544,7 +541,7 @@ computeMateRescue(merMaskedSequence *S, char *outputPrefix, mateRescueData *lib,
fprintf(outputFile, "seqIID\tmerSize\ttRepeat\teRescue\teFailed\tXcov\tmean\tstddev\n");
for (uint32 x=1, l=0, n=0; l<libLen; x++) {
- fprintf(outputFile, uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%.0f\t%.0f\t"uint32FMT"\t"int32FMT"\t"int32FMT"\n",
+ fprintf(outputFile, F_U32"\t" F_U32 "\t" F_U32 "\t%.0f\t%.0f\t" F_U32 "\t" F_S32 "\t" F_S32 "\n",
s, S->merSize(), numRT, numRR[x], numNR[x], x, lib[l].mean(), lib[l].stddev());
n++;
if (n >= lib[l].coverage()) {
diff --git a/src/bogart/buildGraph.mk b/src/meryl/maskMers.mk
similarity index 69%
rename from src/bogart/buildGraph.mk
rename to src/meryl/maskMers.mk
index af0cf02..203d231 100644
--- a/src/bogart/buildGraph.mk
+++ b/src/meryl/maskMers.mk
@@ -1,3 +1,4 @@
+
# If 'make' isn't run from the root directory, we need to set these to
# point to the upper level build directory.
ifeq "$(strip ${BUILD_DIR})" ""
@@ -7,13 +8,14 @@ ifeq "$(strip ${TARGET_DIR})" ""
TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE)/bin
endif
-TARGET := buildGraph
-SOURCES := buildGraph.C
+TARGET := maskMers
+SOURCES := maskMers.C
-SRC_INCDIRS := .. ../AS_UTL ../stores
+SRC_INCDIRS := .. ../AS_UTL libleaff
TGT_LDFLAGS := -L${TARGET_DIR}
-TGT_LDLIBS := -lcanu
-TGT_PREREQS := libcanu.a
+TGT_LDLIBS := -lleaff -lcanu
+TGT_PREREQS := libleaff.a libcanu.a
SUBMAKEFILES :=
+
diff --git a/src/meryl/meryl-args.C b/src/meryl/meryl-args.C
index 414241c..11c6cb7 100644
--- a/src/meryl/meryl-args.C
+++ b/src/meryl/meryl-args.C
@@ -35,6 +35,10 @@
* are Copyright 2014-2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-SEP-14
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -58,7 +62,7 @@ writeString(const char *str, FILE *F) {
}
if (errno) {
- fprintf(stderr, "writeString()-- Failed to write string of length "F_U32": %s\n", len, strerror(errno));
+ fprintf(stderr, "writeString()-- Failed to write string of length " F_U32 ": %s\n", len, strerror(errno));
fprintf(stderr, "writeString()-- First 80 bytes of string is:\n");
fprintf(stderr, "%80.80s\n", str);
return(false);
@@ -514,9 +518,14 @@ merylArgs::merylArgs(int argc, char **argv) {
fprintf(stderr, "WARNING: -threads has no effect with -countbatch, disabled.\n");
if (mergeBatch)
fprintf(stderr, "WARNING: -threads has no effect with -mergebatch, disabled.\n");
- numThreads = 0;
+ numThreads = 1;
}
+ if (numThreads == 0)
+ numThreads = omp_get_max_threads();
+
+ omp_set_num_threads(numThreads);
+
// SGE is not useful unless we are in batch mode.
//
if (sgeJobName && !configBatch && !countBatch && !mergeBatch) {
@@ -531,11 +540,11 @@ merylArgs::merylArgs(int argc, char **argv) {
merylArgs::merylArgs(const char *prefix) {
+ char filename[FILENAME_MAX];
clear();
- char *filename = new char [strlen(prefix) + 17];
- sprintf(filename, "%s.merylArgs", prefix);
+ snprintf(filename, FILENAME_MAX, "%s.merylArgs", prefix);
errno = 0;
FILE *F = fopen(filename, "rb");
@@ -570,8 +579,6 @@ merylArgs::merylArgs(const char *prefix) {
mergeFiles[i] = readString(F);
fclose(F);
-
- delete [] filename;
}
@@ -592,10 +599,9 @@ merylArgs::~merylArgs() {
bool
merylArgs::writeConfig(void) {
- char *filename;
+ char filename[FILENAME_MAX];
- filename = new char [strlen(outputFile) + 17];
- sprintf(filename, "%s.merylArgs", outputFile);
+ snprintf(filename, FILENAME_MAX, "%s.merylArgs", outputFile);
errno = 0;
FILE *F = fopen(filename, "wb");
diff --git a/src/meryl/meryl-binaryOp.C b/src/meryl/meryl-binaryOp.C
index 31d628c..857ba23 100644
--- a/src/meryl/meryl-binaryOp.C
+++ b/src/meryl/meryl-binaryOp.C
@@ -35,6 +35,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -81,8 +85,8 @@ binaryOperations(merylArgs *args) {
//
if (A->merSize() != B->merSize()) {
fprintf(stderr, "ERROR - mersizes are different!\n");
- fprintf(stderr, "ERROR - mersize of '%s' is "F_U32"\n", args->mergeFiles[0], A->merSize());
- fprintf(stderr, "ERROR - mersize of '%s' is "F_U32"\n", args->mergeFiles[1], B->merSize());
+ fprintf(stderr, "ERROR - mersize of '%s' is " F_U32 "\n", args->mergeFiles[0], A->merSize());
+ fprintf(stderr, "ERROR - mersize of '%s' is " F_U32 "\n", args->mergeFiles[1], B->merSize());
exit(1);
}
diff --git a/src/meryl/meryl-build-threads.C b/src/meryl/meryl-build-threads.C
deleted file mode 100644
index ed8b02a..0000000
--- a/src/meryl/meryl-build-threads.C
+++ /dev/null
@@ -1,123 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * This file is derived from:
- *
- * kmer/meryl/build-threads.C
- *
- * Modifications by:
- *
- * Brian P. Walenz from 2004-APR-13 to 2004-OCT-10
- * are Copyright 2004 Brian P. Walenz, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz from 2006-MAY-14 to 2014-APR-11
- * are Copyright 2006,2014 J. Craig Venter Institute, and
- * are subject to the GNU General Public License version 2
- *
- * Brian P. Walenz on 2014-DEC-05
- * are Copyright 2014 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include "meryl.H"
-
-void
-runSegment(merylArgs *args, uint64 segment);
-
-pthread_mutex_t segmentMutex;
-uint64 segmentNext;
-uint64 segmentMax;
-uint32 *segmentDone;
-
-
-void*
-buildThread(void *U) {
- uint64 segment = uint32ZERO;
- merylArgs *args = (merylArgs *)U;
-
- while (segment < segmentMax) {
- pthread_mutex_lock(&segmentMutex);
- segment = segmentNext++;
- pthread_mutex_unlock(&segmentMutex);
-
- if (segment < segmentMax) {
- runSegment(args, segment);
- segmentDone[segment]++;
- }
- }
-
- if (args->beVerbose)
- fprintf(stderr, "Thread exits.\n");
-
- return(0L);
-}
-
-
-void
-runThreaded(merylArgs *args) {
-
- // Clear stuff
- //
- segmentNext = uint64ZERO;
- segmentMax = args->segmentLimit;
- segmentDone = new uint32 [segmentMax];
- for (uint64 s=0; s<segmentMax; s++)
- segmentDone[s] = uint32ZERO;
-
- // Initialize threads
- //
- pthread_attr_t threadAttr;
- pthread_t threadID;
-
- pthread_mutex_init(&segmentMutex, NULL);
-
- pthread_attr_init(&threadAttr);
- pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
- pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED);
- pthread_attr_setschedpolicy(&threadAttr, SCHED_OTHER);
-
- // Start the threads
- //
- for (uint64 i=0; i<args->numThreads; i++)
- pthread_create(&threadID, &threadAttr, buildThread, (void *)args);
-
- // Wait for the threads to complete
- //
- struct timespec shortNap;
- shortNap.tv_sec = 1;
- shortNap.tv_nsec = 0;
-
- uint64 s=0;
- while (s < segmentMax) {
- if (segmentDone[s] == 0)
- nanosleep(&shortNap, 0L);
- else
- s++;
- }
-
- if (args->beVerbose)
- fprintf(stderr, "Threads all done, cleaning up.\n");
-
- // Cleanup
- //
- pthread_attr_destroy(&threadAttr);
- pthread_mutex_destroy(&segmentMutex);
-
- delete [] segmentDone;
-}
diff --git a/src/meryl/meryl-build.C b/src/meryl/meryl-build.C
index e69a13a..3d67120 100644
--- a/src/meryl/meryl-build.C
+++ b/src/meryl/meryl-build.C
@@ -146,10 +146,10 @@ adjustHeap(sortedList_t *M, int64 i, int64 n) {
void
submitPrepareBatch(merylArgs *args) {
FILE *F;
- char nam[1024];
- char cmd[1024];
+ char nam[FILENAME_MAX];
+ char cmd[FILENAME_MAX];
- sprintf(nam, "%s-prepare.sh", args->outputFile);
+ snprintf(nam, FILENAME_MAX, "%s-prepare.sh", args->outputFile);
errno = 0;
F = fopen(nam, "w");
@@ -162,10 +162,10 @@ submitPrepareBatch(merylArgs *args) {
fclose(F);
if (args->sgeMergeOpt)
- sprintf(cmd, "qsub -cwd -b n -j y -o %s-prepare.err %s -N mp%s %s-prepare.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -cwd -b n -j y -o %s-prepare.err %s -N mp%s %s-prepare.sh",
args->outputFile, args->sgeMergeOpt, args->sgeJobName, args->outputFile);
else
- sprintf(cmd, "qsub -cwd -b n -j y -o %s-prepare.err -N mp%s %s-prepare.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -cwd -b n -j y -o %s-prepare.err -N mp%s %s-prepare.sh",
args->outputFile, args->sgeJobName, args->outputFile);
fprintf(stderr, "%s\n", cmd);
if (system(cmd))
@@ -176,10 +176,10 @@ submitPrepareBatch(merylArgs *args) {
void
submitCountBatches(merylArgs *args) {
FILE *F;
- char nam[1024];
- char cmd[1024];
+ char nam[FILENAME_MAX];
+ char cmd[FILENAME_MAX];
- sprintf(nam, "%s-count.sh", args->outputFile);
+ snprintf(nam, FILENAME_MAX, "%s-count.sh", args->outputFile);
errno = 0;
F = fopen(nam, "w");
@@ -193,10 +193,10 @@ submitCountBatches(merylArgs *args) {
fclose(F);
if (args->sgeBuildOpt)
- sprintf(cmd, "qsub -t 1-"F_U64" -cwd -b n -j y -o %s-count-\\$TASK_ID.err %s -N mc%s %s-count.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -t 1-" F_U64 " -cwd -b n -j y -o %s-count-\\$TASK_ID.err %s -N mc%s %s-count.sh",
args->segmentLimit, args->outputFile, args->sgeBuildOpt, args->sgeJobName, args->outputFile);
else
- sprintf(cmd, "qsub -t 1-"F_U64" -cwd -b n -j y -o %s-count-\\$TASK_ID.err -N mc%s %s-count.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -t 1-" F_U64 " -cwd -b n -j y -o %s-count-\\$TASK_ID.err -N mc%s %s-count.sh",
args->segmentLimit, args->outputFile, args->sgeJobName, args->outputFile);
fprintf(stderr, "%s\n", cmd);
if (system(cmd))
@@ -204,7 +204,7 @@ submitCountBatches(merylArgs *args) {
// submit the merge
- sprintf(nam, "%s-merge.sh", args->outputFile);
+ snprintf(nam, FILENAME_MAX, "%s-merge.sh", args->outputFile);
errno = 0;
F = fopen(nam, "w");
@@ -217,10 +217,10 @@ submitCountBatches(merylArgs *args) {
fclose(F);
if (args->sgeMergeOpt)
- sprintf(cmd, "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err %s -N mm%s %s-merge.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err %s -N mm%s %s-merge.sh",
args->sgeJobName, args->outputFile, args->sgeMergeOpt, args->sgeJobName, args->outputFile);
else
- sprintf(cmd, "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err -N mm%s %s-merge.sh",
+ snprintf(cmd, FILENAME_MAX, "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err -N mm%s %s-merge.sh",
args->sgeJobName, args->outputFile, args->sgeJobName, args->outputFile);
fprintf(stderr, "%s\n", cmd);
if (system(cmd))
@@ -335,28 +335,28 @@ prepareBatch(merylArgs *args) {
args->merDataWidth = args->merSize * 2 - args->numBuckets_log2;
if (args->merDataWidth > SORTED_LIST_WIDTH * 64) {
- fprintf(stderr, " numMersActual = "F_U64"\n", args->numMersActual);
- fprintf(stderr, " mersPerBatch = "F_U64"\n", args->mersPerBatch);
- fprintf(stderr, " basesPerBatch = "F_U64"\n", args->basesPerBatch);
- fprintf(stderr, " numBuckets = "F_U64" ("F_U32" bits)\n", args->numBuckets, args->numBuckets_log2);
- fprintf(stderr, " bucketPointerWidth = "F_U32"\n", args->bucketPointerWidth);
- fprintf(stderr, " merDataWidth = "F_U32"\n", args->merDataWidth);
+ fprintf(stderr, " numMersActual = " F_U64 "\n", args->numMersActual);
+ fprintf(stderr, " mersPerBatch = " F_U64 "\n", args->mersPerBatch);
+ fprintf(stderr, " basesPerBatch = " F_U64 "\n", args->basesPerBatch);
+ fprintf(stderr, " numBuckets = " F_U64 " (" F_U32 " bits)\n", args->numBuckets, args->numBuckets_log2);
+ fprintf(stderr, " bucketPointerWidth = " F_U32 "\n", args->bucketPointerWidth);
+ fprintf(stderr, " merDataWidth = " F_U32 "\n", args->merDataWidth);
fprintf(stderr, "Sorry! merSize too big! Increase KMER_WORDS in libbio.kmer.H\n");
exit(1);
}
if (args->beVerbose) {
- fprintf(stderr, "Computing "F_U64" segments using "F_U32" threads and "F_U64"MB memory ("F_U64"MB if in one batch).\n",
+ fprintf(stderr, "Computing " F_U64 " segments using " F_U32 " threads and " F_U64 "MB memory (" F_U64 "MB if in one batch).\n",
args->segmentLimit, args->numThreads,
estimateMemory(args->merSize, args->mersPerBatch, args->positionsEnabled) * args->numThreads,
estimateMemory(args->merSize, args->numMersActual, args->positionsEnabled));
- fprintf(stderr, " numMersActual = "F_U64"\n", args->numMersActual);
- fprintf(stderr, " mersPerBatch = "F_U64"\n", args->mersPerBatch);
- fprintf(stderr, " basesPerBatch = "F_U64"\n", args->basesPerBatch);
- fprintf(stderr, " numBuckets = "F_U64" ("F_U32" bits)\n", args->numBuckets, args->numBuckets_log2);
- fprintf(stderr, " bucketPointerWidth = "F_U32"\n", args->bucketPointerWidth);
- fprintf(stderr, " merDataWidth = "F_U32"\n", args->merDataWidth);
+ fprintf(stderr, " numMersActual = " F_U64 "\n", args->numMersActual);
+ fprintf(stderr, " mersPerBatch = " F_U64 "\n", args->mersPerBatch);
+ fprintf(stderr, " basesPerBatch = " F_U64 "\n", args->basesPerBatch);
+ fprintf(stderr, " numBuckets = " F_U64 " (" F_U32 " bits)\n", args->numBuckets, args->numBuckets_log2);
+ fprintf(stderr, " bucketPointerWidth = " F_U32 "\n", args->bucketPointerWidth);
+ fprintf(stderr, " merDataWidth = " F_U32 "\n", args->merDataWidth);
}
}
@@ -378,31 +378,30 @@ runSegment(merylArgs *args, uint64 segment) {
// XXX: This should be a command line option.
// XXX: This should check that the files are complete meryl files.
//
- char *filename = new char [strlen(args->outputFile) + 17];
- sprintf(filename, "%s.batch"F_U64".mcdat", args->outputFile, segment);
+ char filename[FILENAME_MAX];
+
+ snprintf(filename, FILENAME_MAX, "%s.batch" F_U64 ".mcdat", args->outputFile, segment);
if (AS_UTL_fileExists(filename)) {
if (args->beVerbose)
- fprintf(stderr, "Found result for batch "F_U64" in %s.\n", segment, filename);
- delete [] filename;
+ fprintf(stderr, "Found result for batch " F_U64 " in %s.\n", segment, filename);
return;
}
if ((args->beVerbose) && (args->segmentLimit > 1))
- fprintf(stderr, "Computing segment "F_U64" of "F_U64".\n", segment+1, args->segmentLimit);
+ fprintf(stderr, "Computing segment " F_U64 " of " F_U64 ".\n", segment+1, args->segmentLimit);
- delete [] filename;
// Allocate space for bucket pointers and (temporary) bucket sizes.
if (args->beVerbose)
- fprintf(stderr, " Allocating "F_U64"MB for bucket pointer table ("F_U32" bits wide).\n",
+ fprintf(stderr, " Allocating " F_U64 "MB for bucket pointer table (" F_U32 " bits wide).\n",
(args->numBuckets * args->bucketPointerWidth + 128) >> 23, args->bucketPointerWidth);
bucketPointers = new uint64 [(args->numBuckets * args->bucketPointerWidth + 128) >> 6];
if (args->beVerbose)
- fprintf(stderr, " Allocating "F_U64"MB for counting the size of each bucket.\n", args->numBuckets >> 18);
+ fprintf(stderr, " Allocating " F_U64 "MB for counting the size of each bucket.\n", args->numBuckets >> 18);
bucketSizes = new uint32 [ args->numBuckets ];
for (uint64 i=args->numBuckets; i--; )
bucketSizes[i] = uint32ZERO;
@@ -483,7 +482,7 @@ runSegment(merylArgs *args, uint64 segment) {
// All done with the counting table, get rid of it.
if (args->beVerbose)
- fprintf(stderr, " Releasing "F_U64"MB from counting the size of each bucket.\n", args->numBuckets >> 18);
+ fprintf(stderr, " Releasing " F_U64 "MB from counting the size of each bucket.\n", args->numBuckets >> 18);
delete [] bucketSizes;
@@ -492,7 +491,7 @@ runSegment(merylArgs *args, uint64 segment) {
// allocate full words.
if (args->beVerbose)
- fprintf(stderr, " Allocating "F_U64"MB for mer storage ("F_U32" bits wide).\n",
+ fprintf(stderr, " Allocating " F_U64 "MB for mer storage (" F_U32 " bits wide).\n",
(args->basesPerBatch * args->merDataWidth + 64) >> 23, args->merDataWidth);
for (uint64 mword=0, width=args->merDataWidth; width > 0; ) {
@@ -510,7 +509,7 @@ runSegment(merylArgs *args, uint64 segment) {
if (args->positionsEnabled) {
if (args->beVerbose)
- fprintf(stderr, " Allocating "F_U64"MB for mer position storage.\n",
+ fprintf(stderr, " Allocating " F_U64 "MB for mer position storage.\n",
(args->basesPerBatch * 32 + 32) >> 23);
merPosnArray = new uint32 [ args->basesPerBatch + 1 ];
}
@@ -567,8 +566,8 @@ runSegment(merylArgs *args, uint64 segment) {
delete C;
delete M;
- char *batchOutputFile = new char [strlen(args->outputFile) + 33];
- sprintf(batchOutputFile, "%s.batch"F_U64, args->outputFile, segment);
+ char batchOutputFile[FILENAME_MAX];
+ snprintf(batchOutputFile, FILENAME_MAX, "%s.batch" F_U64, args->outputFile, segment);
C = new speedCounter(" Writing output: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
W = new merylStreamWriter((args->segmentLimit == 1) ? args->outputFile : batchOutputFile,
@@ -588,20 +587,20 @@ runSegment(merylArgs *args, uint64 segment) {
uint64 ed = getDecodedValue(bucketPointers, bucketPos, args->bucketPointerWidth);
if (ed < st) {
- fprintf(stderr, "ERROR: In segment "F_U64"\n", segment);
- fprintf(stderr, "ERROR: Bucket "F_U64" (out of "F_U64") ends before it starts!\n",
+ fprintf(stderr, "ERROR: In segment " F_U64 "\n", segment);
+ fprintf(stderr, "ERROR: Bucket " F_U64 " (out of " F_U64 ") ends before it starts!\n",
bucket, args->numBuckets);
- fprintf(stderr, "ERROR: start="F_U64"\n", st);
- fprintf(stderr, "ERROR: end ="F_U64"\n", ed);
+ fprintf(stderr, "ERROR: start=" F_U64 "\n", st);
+ fprintf(stderr, "ERROR: end =" F_U64 "\n", ed);
}
assert(ed >= st);
if ((ed - st) > (uint64ONE << 30)) {
- fprintf(stderr, "ERROR: In segment "F_U64"\n", segment);
- fprintf(stderr, "ERROR: Bucket "F_U64" (out of "F_U64") is HUGE!\n",
+ fprintf(stderr, "ERROR: In segment " F_U64 "\n", segment);
+ fprintf(stderr, "ERROR: Bucket " F_U64 " (out of " F_U64 ") is HUGE!\n",
bucket, args->numBuckets);
- fprintf(stderr, "ERROR: start="F_U64"\n", st);
- fprintf(stderr, "ERROR: end ="F_U64"\n", ed);
+ fprintf(stderr, "ERROR: start=" F_U64 "\n", st);
+ fprintf(stderr, "ERROR: end =" F_U64 "\n", ed);
}
// Nothing here? Keep going.
@@ -693,8 +692,6 @@ runSegment(merylArgs *args, uint64 segment) {
delete C;
delete W;
- delete [] batchOutputFile;
-
for (uint32 x=0; x<SORTED_LIST_WIDTH; x++)
delete [] merDataArray[x];
@@ -703,7 +700,7 @@ runSegment(merylArgs *args, uint64 segment) {
delete [] bucketPointers;
if (args->beVerbose)
- fprintf(stderr, "Segment "F_U64" finished.\n", segment);
+ fprintf(stderr, "Segment " F_U64 " finished.\n", segment);
}
@@ -729,10 +726,9 @@ build(merylArgs *args) {
bool doMerge = false;
- if (args->configBatch) {
+ // Write out our configuration and exit if we are -configbatch
- // Write out our configuration and exit if we are -configbatch
- //
+ if (args->configBatch) {
args->writeConfig();
if (args->sgeJobName) {
@@ -741,48 +737,41 @@ build(merylArgs *args) {
} else {
fprintf(stdout, "Batch prepared. Please run:\n");
for (uint64 s=0; s<args->segmentLimit; s++)
- fprintf(stdout, "%s -countbatch "F_U64" -o %s\n", args->execName, s, args->outputFile);
+ fprintf(stdout, "%s -countbatch " F_U64 " -o %s\n", args->execName, s, args->outputFile);
fprintf(stdout, "%s -mergebatch -o %s\n", args->execName, args->outputFile);
}
- } else if (args->countBatch) {
+ }
- // Read back the configuration, run the segment and exit if we
- // are -countbatch
- //
+ // Read back the configuration, run the segment and exit if we are -countbatch
+
+ else if (args->countBatch) {
merylArgs *savedArgs = new merylArgs(args->outputFile);
savedArgs->beVerbose = args->beVerbose;
runSegment(savedArgs, args->batchNumber);
delete savedArgs;
- } else if (args->mergeBatch) {
+ }
- // Check that all the files exist if we are -mergebatch and
- // continue with execution
- //
- // MEMORY LEAK! We should delete this at the end of the
- // function, but it's a pain, and who cares?
- //
+ // Check that all the files exist if we are -mergebatch and continue with execution
+ //
+ // MEMORY LEAK! We should delete this at the end of the function, but it's a pain, and who
+ // cares?
+
+ else if (args->mergeBatch) {
merylArgs *savedArgs = new merylArgs(args->outputFile);
savedArgs->beVerbose = args->beVerbose;
args = savedArgs;
doMerge = true;
- } else {
+ }
- if (args->numThreads > 1)
+ // Otherwise, compute batches.
- // Run, using threads. There is a lot of baloney needed, so it's
- // all in a separate function.
- //
- runThreaded(args);
- else
- // No special options given, do all the work here and now
- //
- for (uint64 s=0; s<args->segmentLimit; s++)
- runSegment(args, s);
+ else {
+#pragma omp parallel for
+ for (uint64 s=0; s<args->segmentLimit; s++)
+ runSegment(args, s);
- // Either case, we want to merge now.
- //
doMerge = true;
}
@@ -818,8 +807,8 @@ build(merylArgs *args) {
arga[argc] = false;
argv[argc++] = "-s";
arga[argc] = true;
- argv[argc] = new char [strlen(args->outputFile) + 33];
- sprintf(argv[argc], "%s.batch"F_U32, args->outputFile, i);
+ argv[argc] = new char [FILENAME_MAX];
+ snprintf(argv[argc], FILENAME_MAX, "%s.batch" F_U32, args->outputFile, i);
argc++;
}
@@ -840,28 +829,25 @@ build(merylArgs *args) {
// Remove temporary files
//
- char *filename = new char [strlen(args->outputFile) + 17];
for (uint32 i=0; i<args->segmentLimit; i++) {
- sprintf(filename, "%s.batch"F_U32".mcidx", args->outputFile, i);
+ char filename[FILENAME_MAX];
+
+ snprintf(filename, FILENAME_MAX, "%s.batch" F_U32 ".mcidx", args->outputFile, i);
unlink(filename);
- sprintf(filename, "%s.batch"F_U32".mcdat", args->outputFile, i);
+ snprintf(filename, FILENAME_MAX, "%s.batch" F_U32 ".mcdat", args->outputFile, i);
unlink(filename);
- sprintf(filename, "%s.batch"F_U32".mcpos", args->outputFile, i);
+ snprintf(filename, FILENAME_MAX, "%s.batch" F_U32 ".mcpos", args->outputFile, i);
unlink(filename);
}
-
- delete [] filename;
}
// If we just merged, delete the merstream file
//
if (doMerge) {
- char *filename = new char [strlen(args->outputFile) + 17];
+ char filename[FILENAME_MAX];
- sprintf(filename, "%s.merStream", args->outputFile);
+ snprintf(filename, FILENAME_MAX, "%s.merStream", args->outputFile);
unlink(filename);
-
- delete [] filename;
}
}
diff --git a/src/meryl/meryl-dump.C b/src/meryl/meryl-dump.C
index b4156e4..f6caed6 100644
--- a/src/meryl/meryl-dump.C
+++ b/src/meryl/meryl-dump.C
@@ -59,7 +59,7 @@ dumpThreshold(merylArgs *args) {
while (M->nextMer()) {
if (M->theCount() >= args->numMersEstimated)
- fprintf(stdout, ">"F_U64"\n%s\n",
+ fprintf(stdout, ">" F_U64 "\n%s\n",
M->theCount(),
M->theFMer().merToString(str));
}
@@ -77,9 +77,9 @@ dumpPositions(merylArgs *args) {
fprintf(stderr, "File '%s' contains no position information.\n", args->inputFile);
} else {
while (M->nextMer()) {
- fprintf(stdout, ">"F_U64, M->theCount());
+ fprintf(stdout, ">" F_U64, M->theCount());
for (uint32 i=0; i<M->theCount(); i++)
- fprintf(stdout, " "F_U32, M->getPosition(i));
+ fprintf(stdout, " " F_U32, M->getPosition(i));
fprintf(stdout, "\n%s\n", M->theFMer().merToString(str));
}
}
@@ -114,9 +114,9 @@ countUnique(merylArgs *args) {
fprintf(stderr, "OK\n");
#endif
- fprintf(stdout, "Found "F_U64" mers.\n", M->numberOfTotalMers());
- fprintf(stdout, "Found "F_U64" distinct mers.\n", M->numberOfDistinctMers());
- fprintf(stdout, "Found "F_U64" unique mers.\n", M->numberOfUniqueMers());
+ fprintf(stdout, "Found " F_U64 " mers.\n", M->numberOfTotalMers());
+ fprintf(stdout, "Found " F_U64 " distinct mers.\n", M->numberOfDistinctMers());
+ fprintf(stdout, "Found " F_U64 " unique mers.\n", M->numberOfUniqueMers());
delete M;
}
@@ -129,11 +129,11 @@ plotHistogram(merylArgs *args) {
merylStreamReader *M = new merylStreamReader(args->inputFile);
- fprintf(stderr, "Found "F_U64" mers.\n", M->numberOfTotalMers());
- fprintf(stderr, "Found "F_U64" distinct mers.\n", M->numberOfDistinctMers());
- fprintf(stderr, "Found "F_U64" unique mers.\n", M->numberOfUniqueMers());
+ fprintf(stderr, "Found " F_U64 " mers.\n", M->numberOfTotalMers());
+ fprintf(stderr, "Found " F_U64 " distinct mers.\n", M->numberOfDistinctMers());
+ fprintf(stderr, "Found " F_U64 " unique mers.\n", M->numberOfUniqueMers());
- fprintf(stderr, "Largest mercount is "F_U64".\n",
+ fprintf(stderr, "Largest mercount is " F_U64 ".\n",
M->histogramMaximumCount());
for (uint32 i=1; i<M->histogramLength(); i++) {
@@ -143,7 +143,7 @@ plotHistogram(merylArgs *args) {
distinct += hist;
total += hist * i;
- fprintf(stdout, F_U32"\t"F_U64"\t%.4f\t%.4f\n",
+ fprintf(stdout, F_U32"\t" F_U64 "\t%.4f\t%.4f\n",
i,
hist,
distinct / (double)M->numberOfDistinctMers(),
@@ -192,10 +192,10 @@ dumpDistanceBetweenMers(merylArgs *args) {
for (uint32 d=0; d<maxd; d++)
if (hist[d])
- fprintf(stderr, F_U32"\t"F_U64"\n", d, hist[d]);
+ fprintf(stderr, F_U32"\t" F_U64 "\n", d, hist[d]);
if (histHuge)
- fprintf(stderr, "huge\t"F_U64"\n", histHuge);
+ fprintf(stderr, "huge\t" F_U64 "\n", histHuge);
}
delete [] hist;
diff --git a/src/meryl/meryl-estimate.C b/src/meryl/meryl-estimate.C
index cd85d8d..a41f12b 100644
--- a/src/meryl/meryl-estimate.C
+++ b/src/meryl/meryl-estimate.C
@@ -97,7 +97,7 @@ estimateNumMersInMemorySize(uint32 merSize,
}
if (beVerbose)
- fprintf(stdout, "Can fit "F_U64" mers into table with prefix of "F_U64" bits, using %.3fMB (%.3fMB for positions)\n",
+ fprintf(stdout, "Can fit " F_U64 " mers into table with prefix of " F_U64 " bits, using %.3fMB (%.3fMB for positions)\n",
maxN * numThreads,
bestT,
(((uint64ONE << bestT) * logBaseTwo64(maxN) + maxN * (2*merSize - bestT + posPerMer)) >> 3) * numThreads / 1048576.0,
@@ -167,7 +167,7 @@ optimalNumberOfBuckets(uint32 merSize,
for (h=2; h<=hmax && h<2*merSize; h++) {
s = (uint64ONE << h) * hwidth + numMers * (2 * merSize - h + posPerMer);
- //fprintf(stderr, "optimalNumberOfBuckets()-- h="F_U64" s="F_U64"\n", h, s);
+ //fprintf(stderr, "optimalNumberOfBuckets()-- h=" F_U64 " s=" F_U64 "\n", h, s);
if (s < opts) {
opth = h;
@@ -205,6 +205,6 @@ estimate(merylArgs *args) {
uint32 opth = optimalNumberOfBuckets(args->merSize, args->numMersEstimated, args->positionsEnabled);
uint64 memu = ((uint64ONE << opth) * logBaseTwo64(args->numMersEstimated+1) + args->numMersEstimated * (2 * args->merSize - opth));
- fprintf(stderr, F_U64" "F_U32"-mers can be computed using "F_U64"MB memory.\n",
+ fprintf(stderr, F_U64" " F_U32 "-mers can be computed using " F_U64 "MB memory.\n",
args->numMersEstimated, args->merSize, memu >> 23);
}
diff --git a/src/meryl/meryl-merge.C b/src/meryl/meryl-merge.C
index a8e3430..c0f52d6 100644
--- a/src/meryl/meryl-merge.C
+++ b/src/meryl/meryl-merge.C
@@ -35,6 +35,10 @@
* are Copyright 2014 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -49,7 +53,7 @@ void
multipleOperations(merylArgs *args) {
if (args->mergeFilesLen < 2) {
- fprintf(stderr, "ERROR - must have at least two databases (you gave "F_U32")!\n", args->mergeFilesLen);
+ fprintf(stderr, "ERROR - must have at least two databases (you gave " F_U32 ")!\n", args->mergeFilesLen);
exit(1);
}
if (args->outputFile == 0L) {
diff --git a/src/meryl/meryl.H b/src/meryl/meryl.H
index c8ba6fa..803fd70 100644
--- a/src/meryl/meryl.H
+++ b/src/meryl/meryl.H
@@ -56,8 +56,6 @@
#include "speedCounter.H"
#include "timeAndSize.H"
-#include <pthread.h>
-
#define PERSONALITY_MERGE 0xff
#define PERSONALITY_MIN 0x01
diff --git a/src/meryl/meryl.mk b/src/meryl/meryl.mk
index 2c9465e..9209427 100644
--- a/src/meryl/meryl.mk
+++ b/src/meryl/meryl.mk
@@ -12,7 +12,6 @@ TARGET := meryl
SOURCES := meryl-args.C \
meryl-binaryOp.C \
meryl-build.C \
- meryl-build-threads.C \
meryl-dump.C \
meryl-estimate.C \
meryl-merge.C \
diff --git a/src/meryl/simple.C b/src/meryl/simple.C
index 7a2b3b1..f3e9746 100644
--- a/src/meryl/simple.C
+++ b/src/meryl/simple.C
@@ -27,6 +27,10 @@
* are Copyright 2014-2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -145,7 +149,7 @@ main(int argc, char **argv) {
numMers = M->approximateNumberOfMers();
delete M;
- fprintf(stderr, "Guessing "F_U64" mers in input '%s'\n", numMers, inName);
+ fprintf(stderr, "Guessing " F_U64 " mers in input '%s'\n", numMers, inName);
M = new merStream(new kMerBuilder(merSize, merCompression),
new seqStream(inName),
@@ -160,7 +164,7 @@ main(int argc, char **argv) {
uint64 theMersMax = 2 * numMers; // for allowing both -f and -r
uint32 *theMers = new uint32 [theMersMax];
- fprintf(stderr, "Allocating "F_U64"MB for mer storage.\n", numMers * sizeof(uint64) >> 20);
+ fprintf(stderr, "Allocating " F_U64 "MB for mer storage.\n", numMers * sizeof(uint64) >> 20);
C = new speedCounter(" Filling mer list: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, 1);
@@ -180,7 +184,7 @@ main(int argc, char **argv) {
delete C;
delete M;
- fprintf(stderr, "Found "F_U64" mers in input '%s'\n", theMersLen, inName);
+ fprintf(stderr, "Found " F_U64 " mers in input '%s'\n", theMersLen, inName);
if (theMersLen > theMersMax)
fprintf(stderr, "ERROR: too many mers in input!\n"), exit(1);
@@ -212,7 +216,7 @@ main(int argc, char **argv) {
uint64 numCounts = ((uint64)1) << (2 * merSize);
uint32 *theCounts = new uint32 [numCounts];
- fprintf(stderr, "Allocating "F_U64"MB for count storage.\n", numCounts * sizeof(uint32) >> 20);
+ fprintf(stderr, "Allocating " F_U64 "MB for count storage.\n", numCounts * sizeof(uint32) >> 20);
memset(theCounts, 0, sizeof(uint32) * numCounts);
diff --git a/src/mhap/mhap.mk b/src/mhap/mhap.mk
index 0655256..1f1278b 100644
--- a/src/mhap/mhap.mk
+++ b/src/mhap/mhap.mk
@@ -7,6 +7,6 @@ ifeq "$(strip ${TARGET_DIR})" ""
TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE)/bin
endif
-TARGET := mhap-2.1.jar
-SOURCES := mhap-2.1.tar
+TARGET := mhap-2.1.2.jar
+SOURCES := mhap-2.1.2.tar
diff --git a/src/mhap/mhapConvert.C b/src/mhap/mhapConvert.C
index dfca508..7f5daae 100644
--- a/src/mhap/mhapConvert.C
+++ b/src/mhap/mhapConvert.C
@@ -23,6 +23,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2016-AUG-09
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -94,7 +98,7 @@ main(int argc, char **argv) {
char *ovStr = new char [1024];
ovOverlap ov(NULL);
- ovFile *of = new ovFile(outName, ovFileFullWrite);
+ ovFile *of = new ovFile(NULL, outName, ovFileFullWrite);
for (uint32 ff=0; ff<files.size(); ff++) {
compressedFileReader *in = new compressedFileReader(files[ff]);
@@ -113,7 +117,13 @@ main(int argc, char **argv) {
if (ov.a_iid == ov.b_iid)
continue;
- assert(W[4][0] == '0');
+ assert(W[4][0] == '0'); // first read is always forward
+
+ assert(W(5) < W(6)); // first read bgn < end
+ assert(W(6) <= W(7)); // first read end <= len
+
+ assert(W(9) < W(10)); // second read bgn < end
+ assert(W(10) <= W(11)); // second read end <= len
ov.dat.ovl.forUTG = true;
ov.dat.ovl.forOBT = true;
@@ -137,6 +147,8 @@ main(int argc, char **argv) {
of->writeOverlap(&ov);
}
+ delete in;
+
arg++;
}
diff --git a/src/minimap/mmapConvert.C b/src/minimap/mmapConvert.C
index b6c1fdf..f817d5d 100644
--- a/src/minimap/mmapConvert.C
+++ b/src/minimap/mmapConvert.C
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2016-OCT-24
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -75,7 +79,7 @@ main(int argc, char **argv) {
char *ovStr = new char [1024];
ovOverlap ov(NULL);
- ovFile *of = new ovFile(outName, ovFileFullWrite);
+ ovFile *of = new ovFile(NULL, outName, ovFileFullWrite);
for (uint32 ff=0; ff<files.size(); ff++) {
compressedFileReader *in = new compressedFileReader(files[ff]);
diff --git a/src/overlapBasedTrimming/splitReads-subReads.C b/src/overlapBasedTrimming/splitReads-subReads.C
index 2994c85..8b2b318 100644
--- a/src/overlapBasedTrimming/splitReads-subReads.C
+++ b/src/overlapBasedTrimming/splitReads-subReads.C
@@ -287,7 +287,7 @@ detectSubReads(gkStore *gkp,
numSpan += (doCheckSubRead(gkp, w->adj[ii].a_iid)) ? 1 : 2;
if (subreadFile)
- fprintf(subreadFile, "AcheckSub region %u ("F_S32"-"F_S32") with %u hits %u bighits - span %u largePalindrome %s\n",
+ fprintf(subreadFile, "AcheckSub region %u (" F_S32 "-" F_S32 ") with %u hits %u bighits - span %u largePalindrome %s\n",
w->adj[0].a_iid, BAD.lo(bb), BAD.hi(bb), BAD.count(bb), allHits,
numSpan, largePalindrome ? "true" : "false");
diff --git a/src/overlapBasedTrimming/splitReads.C b/src/overlapBasedTrimming/splitReads.C
index a90e1a8..956b2d5 100644
--- a/src/overlapBasedTrimming/splitReads.C
+++ b/src/overlapBasedTrimming/splitReads.C
@@ -195,13 +195,13 @@ main(int argc, char **argv) {
outClr->copy(finClr);
- sprintf(outputName, "%s.log", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.log", outputPrefix);
errno = 0;
reportFile = fopen(outputName, "w");
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);
- sprintf(outputName, "%s.subread.log", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.subread.log", outputPrefix);
errno = 0;
subreadFile = fopen(outputName, "w");
if (errno)
@@ -222,7 +222,7 @@ main(int argc, char **argv) {
if (idMax > gkp->gkStore_getNumReads())
idMax = gkp->gkStore_getNumReads();
- fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n",
+ fprintf(stderr, "Processing from ID " F_U32 " to " F_U32 " out of " F_U32 " reads, using errorRate = %.2f\n",
idMin,
idMax,
gkp->gkStore_getNumReads(),
@@ -392,7 +392,7 @@ main(int argc, char **argv) {
// Write the summary
if (outputPrefix) {
- sprintf(outputName, "%s.stats", outputPrefix);
+ snprintf(outputName, FILENAME_MAX, "%s.stats", outputPrefix);
errno = 0;
staFile = fopen(outputName, "w");
@@ -412,55 +412,55 @@ main(int argc, char **argv) {
//fprintf(staFile, "%7u (use only overlaps longer than this)\n", minAlignLength); // NOT SUPPORTED!
fprintf(staFile, "INPUT READS:\n");
fprintf(staFile, "-----------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "PROCESSED:\n");
fprintf(staFile, "--------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n", readsProcChimera.nReads, readsProcChimera.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n", readsProcSpur.nReads, readsProcSpur.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (processed for chimera)\n", readsProcChimera.nReads, readsProcChimera.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (processed for spur)\n", readsProcSpur.nReads, readsProcSpur.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "READS WITH SIGNALS:\n");
fprintf(staFile, "------------------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads, readsBadSpur5.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads, readsBadSpur3.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " signals (number of 5' spur signal)\n", readsBadSpur5.nReads, readsBadSpur5.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " signals (number of 3' spur signal)\n", readsBadSpur3.nReads, readsBadSpur3.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "SIGNALS:\n");
fprintf(staFile, "-------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 5' spur signal)\n", basesBadSpur5.nReads, basesBadSpur5.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads, basesBadSpur3.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (size of 5' spur signal)\n", basesBadSpur5.nReads, basesBadSpur5.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (size of 3' spur signal)\n", basesBadSpur3.nReads, basesBadSpur3.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "TRIMMING:\n");
fprintf(staFile, "--------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases);
#if 0
fprintf(staFile, "DELETED:\n");
fprintf(staFile, "-------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "SPUR TYPES:\n");
fprintf(staFile, "----------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases);
fprintf(staFile, "\n");
fprintf(staFile, "CHIMERA TYPES:\n");
fprintf(staFile, "-------------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases);
#endif
// INPUT READS = ACCEPTED + TRIMMED + DELETED
diff --git a/src/overlapBasedTrimming/trimReads-bestEdge.C b/src/overlapBasedTrimming/trimReads-bestEdge.C
index d4ede1c..1616dc3 100644
--- a/src/overlapBasedTrimming/trimReads-bestEdge.C
+++ b/src/overlapBasedTrimming/trimReads-bestEdge.C
@@ -330,9 +330,9 @@ bestEdge(ovOverlap *ovl,
FILE *F;
- sprintf(D, "trim-%08d.dat", read->gkRead_readID());
- sprintf(G, "trim-%08d.gp", read->gkRead_readID());
- sprintf(S, "gnuplot < trim-%08d.gp", read->gkRead_readID());
+ snprintf(D, FILENAME_MAX, "trim-%08d.dat", read->gkRead_readID());
+ snprintf(G, FILENAME_MAX, "trim-%08d.gp", read->gkRead_readID());
+ snprintf(S, FILENAME_MAX, "gnuplot < trim-%08d.gp", read->gkRead_readID());
F = fopen(D, "w");
for (uint32 i=0; i<MAX(trim5.size(), trim3.size()); i++) {
diff --git a/src/overlapBasedTrimming/trimReads-largestCovered.C b/src/overlapBasedTrimming/trimReads-largestCovered.C
index a217455..18b9895 100644
--- a/src/overlapBasedTrimming/trimReads-largestCovered.C
+++ b/src/overlapBasedTrimming/trimReads-largestCovered.C
@@ -80,10 +80,10 @@ largestCovered(ovOverlap *ovl,
#if 0
for (uint32 it=0; it<IL.numberOfIntervals(); it++)
- fprintf(stderr, "IL - %d - "F_S64" "F_S64" "F_S64"\n", fr.gkFragment_getReadIID(), IL.lo(it), IL.hi(it), IL.ct(it));
+ fprintf(stderr, "IL - %d - " F_S64 " " F_S64 " " F_S64 "\n", fr.gkFragment_getReadIID(), IL.lo(it), IL.hi(it), IL.ct(it));
for (uint32 it=0; it<ID.numberOfIntervals(); it++)
- fprintf(stderr, "ID - %d - "F_S64" "F_S64" "F_S64"\n", fr.gkFragment_getReadIID(), ID.lo(it), ID.hi(it), ID.de(it));
+ fprintf(stderr, "ID - %d - " F_S64 " " F_S64 " " F_S64 "\n", fr.gkFragment_getReadIID(), ID.lo(it), ID.hi(it), ID.de(it));
#endif
// I thought I'd allow low coverage at the end of the read, but not internally, but that is hard,
@@ -98,7 +98,7 @@ largestCovered(ovOverlap *ovl,
uint32 ie = 0;
while (it < DE.numberOfIntervals()) {
- //fprintf(stderr, "DE - %d - "F_S64" "F_S64" "F_U32"\n", fr.gkFragment_getReadIID(), DE.lo(it), DE.hi(it), DE.depth(it));
+ //fprintf(stderr, "DE - %d - " F_S64 " " F_S64 " " F_U32 "\n", fr.gkFragment_getReadIID(), DE.lo(it), DE.hi(it), DE.depth(it));
if (DE.depth(it) < minCoverage) {
// Dropped below good coverage depth. If we have an interval, save it. Reset.
@@ -209,7 +209,7 @@ largestCovered(ovOverlap *ovl,
#if 0
if (IL.numberOfIntervals() > 1)
for (uint32 it=0; it<IL.numberOfIntervals(); it++)
- fprintf(stderr, "IL[%02d] - iid %d - "F_S64" "F_S64"\n", it, read->gkRead_readID(), IL.lo(it), IL.hi(it));
+ fprintf(stderr, "IL[%02d] - iid %d - " F_S64 " " F_S64 "\n", it, read->gkRead_readID(), IL.lo(it), IL.hi(it));
#endif
if (IL.numberOfIntervals() == 0) {
diff --git a/src/overlapBasedTrimming/trimReads-quality.C b/src/overlapBasedTrimming/trimReads-quality.C
index 9877d83..92eb4ef 100644
--- a/src/overlapBasedTrimming/trimReads-quality.C
+++ b/src/overlapBasedTrimming/trimReads-quality.C
@@ -151,7 +151,7 @@ findGoodQuality(double *qltD,
flen = fpos;
rlen = rpos;
- //fprintf(stderr, "qltLen = "F_U32" flen="F_U32" rlen="F_U32"\n", qltLen, flen, rlen);
+ //fprintf(stderr, "qltLen = " F_U32 " flen=" F_U32 " rlen=" F_U32 "\n", qltLen, flen, rlen);
uint32 winningFPos = 0;
uint32 winningRPos = 0;
@@ -230,7 +230,7 @@ findGoodQuality(double *qltD,
}
else {
- fprintf(stderr, "UNMATCHED OVERLAP\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\n",
+ fprintf(stderr, "UNMATCHED OVERLAP\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\n",
f[fpos].start, f[fpos].end, r[rpos].start, r[rpos].end);
}
}
diff --git a/src/overlapBasedTrimming/trimReads.C b/src/overlapBasedTrimming/trimReads.C
index fa41500..dbccd89 100644
--- a/src/overlapBasedTrimming/trimReads.C
+++ b/src/overlapBasedTrimming/trimReads.C
@@ -235,7 +235,7 @@ main(int argc, char **argv) {
if (outputPrefix) {
- sprintf(logName, "%s.log", outputPrefix);
+ snprintf(logName, FILENAME_MAX, "%s.log", outputPrefix);
errno = 0;
logFile = fopen(logName, "w");
@@ -259,7 +259,7 @@ main(int argc, char **argv) {
if (idMax > gkp->gkStore_getNumReads())
idMax = gkp->gkStore_getNumReads();
- fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n",
+ fprintf(stderr, "Processing from ID " F_U32 " to " F_U32 " out of " F_U32 " reads.\n",
idMin,
idMax,
gkp->gkStore_getNumReads());
@@ -379,7 +379,7 @@ main(int argc, char **argv) {
outClr->setend(id) = fend;
outClr->setDeleted(id); // Gah, just obliterates the clear range.
- fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOV%s\n",
+ fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOV%s\n",
id,
ibgn, iend,
fbgn, fend,
@@ -393,7 +393,7 @@ main(int argc, char **argv) {
outClr->setend(id) = fend;
outClr->setDeleted(id); // Gah, just obliterates the clear range.
- fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n",
+ fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tDEL%s\n",
id,
ibgn, iend,
fbgn, fend,
@@ -406,7 +406,7 @@ main(int argc, char **argv) {
(iend == fend)) {
noChangeOut += read->gkRead_sequenceLength();
- fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n",
+ fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOC%s\n",
id,
ibgn, iend,
fbgn, fend,
@@ -428,7 +428,7 @@ main(int argc, char **argv) {
if (fbgn - ibgn > 0) trim5 += fbgn - ibgn;
if (iend - fend > 0) trim3 += iend - fend;
- fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n",
+ fprintf(logFile, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tMOD%s\n",
id,
ibgn, iend,
fbgn, fend,
@@ -455,7 +455,7 @@ main(int argc, char **argv) {
// Dump the statistics and plots
if (outputPrefix) {
- sprintf(sumName, "%s.stats", outputPrefix);
+ snprintf(sumName, FILENAME_MAX, "%s.stats", outputPrefix);
errno = 0;
staFile = fopen(sumName, "w");
@@ -476,9 +476,9 @@ main(int argc, char **argv) {
fprintf(staFile, "INPUT READS:\n");
fprintf(staFile, "-----------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
readsIn .generatePlots(outputPrefix, "inputReads", 250);
deletedIn.generatePlots(outputPrefix, "inputDeletedReads", 250);
@@ -487,10 +487,10 @@ main(int argc, char **argv) {
fprintf(staFile, "\n");
fprintf(staFile, "OUTPUT READS:\n");
fprintf(staFile, "------------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed reads output)\n", readsOut.nReads, readsOut.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads with no change, kept as is)\n", noChangeOut.nReads, noChangeOut.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads with no overlaps, deleted)\n", noOvlOut.nReads, noOvlOut.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads with short trimmed length, deleted)\n", deletedOut.nReads, deletedOut.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (trimmed reads output)\n", readsOut.nReads, readsOut.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no change, kept as is)\n", noChangeOut.nReads, noChangeOut.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with no overlaps, deleted)\n", noOvlOut.nReads, noOvlOut.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (reads with short trimmed length, deleted)\n", deletedOut.nReads, deletedOut.nBases);
readsOut .generatePlots(outputPrefix, "outputTrimmedReads", 250);
noOvlOut .generatePlots(outputPrefix, "outputNoOvlReads", 250);
@@ -500,8 +500,8 @@ main(int argc, char **argv) {
fprintf(staFile, "\n");
fprintf(staFile, "TRIMMING DETAILS:\n");
fprintf(staFile, "----------------\n");
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (bases trimmed from the 5' end of a read)\n", trim5.nReads, trim5.nBases);
- fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (bases trimmed from the 3' end of a read)\n", trim3.nReads, trim3.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 5' end of a read)\n", trim5.nReads, trim5.nBases);
+ fprintf(staFile, "%6" F_U32P " reads %12" F_U64P " bases (bases trimmed from the 3' end of a read)\n", trim3.nReads, trim3.nBases);
trim5.generatePlots(outputPrefix, "trim5", 25);
trim3.generatePlots(outputPrefix, "trim3", 25);
diff --git a/src/overlapBasedTrimming/trimStat.H b/src/overlapBasedTrimming/trimStat.H
index 5e97e69..6327991 100644
--- a/src/overlapBasedTrimming/trimStat.H
+++ b/src/overlapBasedTrimming/trimStat.H
@@ -48,7 +48,7 @@ public:
char N[FILENAME_MAX];
FILE *F;
- sprintf(N, "%s.%s.dat", outputPrefix, outputName);
+ snprintf(N, FILENAME_MAX, "%s.%s.dat", outputPrefix, outputName);
F = fopen(N, "w");
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);
@@ -56,7 +56,7 @@ public:
fprintf(F, F_U32"\n", histo[ii]);
fclose(F);
- sprintf(N, "%s.%s.gp", outputPrefix, outputName);
+ snprintf(N, FILENAME_MAX, "%s.%s.gp", outputPrefix, outputName);
F = fopen(N, "w");
if (errno)
fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);
@@ -77,7 +77,7 @@ public:
fprintf(F, "replot\n");
fclose(F);
- sprintf(N, "gnuplot %s.%s.gp > /dev/null 2>&1", outputPrefix, outputName);
+ snprintf(N, FILENAME_MAX, "gnuplot %s.%s.gp > /dev/null 2>&1", outputPrefix, outputName);
system(N);
};
diff --git a/src/overlapErrorAdjustment/analyzeAlignment.C b/src/overlapErrorAdjustment/analyzeAlignment.C
index e8434b6..6c48cfb 100644
--- a/src/overlapErrorAdjustment/analyzeAlignment.C
+++ b/src/overlapErrorAdjustment/analyzeAlignment.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -292,7 +296,7 @@ analyzeAlignment::analyze(char *aSeq, int32 aLen, int32 aOffset,
void
analyzeAlignment::outputDetails(uint32 j) {
- fprintf(stderr, "%3"F_U32P": %c conf %3"F_U64P" deletes %3"F_U64P" | subst %3"F_U64P" %3"F_U64P" %3"F_U64P" %3"F_U64P" | no_insert %3"F_U64P" insert %3"F_U64P" %3"F_U64P" %3"F_U64P" %3"F_U64P"\n",
+ fprintf(stderr, "%3" F_U32P ": %c conf %3" F_U64P " deletes %3" F_U64P " | subst %3" F_U64P " %3" F_U64P " %3" F_U64P " %3" F_U64P " | no_insert %3" F_U64P " insert %3" F_U64P " %3" F_U64P " %3" F_U64P " %3" F_U64P "\n",
j,
_seq[j],
_vote[j].confirmed,
@@ -403,14 +407,14 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
// (total > 1)
if (total <= 1) {
- fprintf(stderr, "FEW total = "F_U64" <= 1\n", total);
+ fprintf(stderr, "FEW total = " F_U64 " <= 1\n", total);
skippedTooFew++;
continue;
}
// (2 * max > total)
if (2 * max <= total) {
- fprintf(stderr, "WEAK 2*max = "F_U64" <= total = "F_U64"\n", 2*max, total);
+ fprintf(stderr, "WEAK 2*max = " F_U64 " <= total = " F_U64 "\n", 2*max, total);
skippedTooWeak++;
continue;
}
@@ -424,7 +428,7 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
// ((haplo_ct < 2) || (Use_Haplo_Ct == false))
if ((haplo_ct >= 2) && (Use_Haplo_Ct == true)) {
- fprintf(stderr, "HAPLO haplo_ct="F_U64" >= 2 AND Use_Haplo_Ct = %s\n", haplo_ct, (Use_Haplo_Ct) ? "true" : "false");
+ fprintf(stderr, "HAPLO haplo_ct=" F_U64 " >= 2 AND Use_Haplo_Ct = %s\n", haplo_ct, (Use_Haplo_Ct) ? "true" : "false");
skippedHaplo++;
continue;
}
@@ -433,7 +437,7 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
// ((_vote[j].confirmed == 1) && (max > 6)))
if ((_vote[j].confirmed > 0) &&
((_vote[j].confirmed != 1) || (max <= 6))) {
- fprintf(stderr, "INDET confirmed = "F_U64" max = "F_U64"\n", _vote[j].confirmed, max);
+ fprintf(stderr, "INDET confirmed = " F_U64 " max = " F_U64 "\n", _vote[j].confirmed, max);
skippedConfirmed++;
continue;
}
@@ -442,7 +446,7 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
substitutions++;
- fprintf(stderr, "SUBSTITUTE position "F_U32" to %c\n", j, Matching_Char(vval));
+ fprintf(stderr, "SUBSTITUTE position " F_U32 " to %c\n", j, Matching_Char(vval));
_cor[_corLen].type = vval;
_cor[_corLen].pos = j;
@@ -485,26 +489,26 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
_vote[j].t_insert);
if (ins_total <= 1) {
- fprintf(stderr, "FEW ins_total = "F_U64" <= 1\n", ins_total);
+ fprintf(stderr, "FEW ins_total = " F_U64 " <= 1\n", ins_total);
skippedInsTotal++;
continue;
}
if (2 * ins_max >= ins_total) {
- fprintf(stderr, "WEAK 2*ins_max = "F_U64" <= ins_total = "F_U64"\n", 2*ins_max, ins_total);
+ fprintf(stderr, "WEAK 2*ins_max = " F_U64 " <= ins_total = " F_U64 "\n", 2*ins_max, ins_total);
skippedInsMax++;
continue;
}
if ((ins_haplo_ct >= 2) && (Use_Haplo_Ct == true)) {
- fprintf(stderr, "HAPLO ins_haplo_ct="F_U64" >= 2 AND Use_Haplo_Ct = %s\n", ins_haplo_ct, (Use_Haplo_Ct) ? "true" : "false");
+ fprintf(stderr, "HAPLO ins_haplo_ct=" F_U64 " >= 2 AND Use_Haplo_Ct = %s\n", ins_haplo_ct, (Use_Haplo_Ct) ? "true" : "false");
skippedInsHaplo++;
continue;
}
if ((_vote[j].no_insert > 0) &&
((_vote[j].no_insert != 1) || (ins_max <= 6))) {
- fprintf(stderr, "INDET no_insert = "F_U64" ins_max = "F_U64"\n", _vote[j].no_insert, ins_max);
+ fprintf(stderr, "INDET no_insert = " F_U64 " ins_max = " F_U64 "\n", _vote[j].no_insert, ins_max);
skippedInsTooMany++;
continue;
}
@@ -513,7 +517,7 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
insertions++;
- fprintf(stderr, "INSERT position "F_U32" to %c\n", j, Matching_Char(ins_vote));
+ fprintf(stderr, "INSERT position " F_U32 " to %c\n", j, Matching_Char(ins_vote));
_cor[_corLen].type = ins_vote;
_cor[_corLen].pos = j;
@@ -547,7 +551,7 @@ analyzeAlignment::generateCorrections(FILE *corFile) {
#if 0
- fprintf(stderr, "Corrected "F_U64" bases with "F_U64" substitutions, "F_U64" deletions and "F_U64" insertions.\n",
+ fprintf(stderr, "Corrected " F_U64 " bases with " F_U64 " substitutions, " F_U64 " deletions and " F_U64 " insertions.\n",
G->basesLen,
changes[A_SUBST] + changes[C_SUBST] + changes[G_SUBST] + changes[T_SUBST],
changes[DELETE],
diff --git a/src/overlapErrorAdjustment/correctOverlaps-Correct_Frags.C b/src/overlapErrorAdjustment/correctOverlaps-Correct_Frags.C
index d7166fb..b8e0666 100644
--- a/src/overlapErrorAdjustment/correctOverlaps-Correct_Frags.C
+++ b/src/overlapErrorAdjustment/correctOverlaps-Correct_Frags.C
@@ -91,7 +91,7 @@ correctRead(uint32 curID,
if ((i != C[Cpos].pos) &&
(i != C[Cpos].pos + 1))
- fprintf(stderr, "i="F_U32" Cpos="F_U64" C[Cpos].pos="F_U32"\n", i, Cpos, C[Cpos].pos);
+ fprintf(stderr, "i=" F_U32 " Cpos=" F_U64 " C[Cpos].pos=" F_U32 "\n", i, Cpos, C[Cpos].pos);
assert((i == C[Cpos].pos) ||
(i == C[Cpos].pos + 1));
@@ -215,7 +215,7 @@ Correct_Frags(coParameters *G,
uint64 firstRecord = 0;
uint64 currentRecord = 0;
- fprintf(stderr, "Reading "F_U64" corrections from '%s'.\n", Clen, G->correctionsName);
+ fprintf(stderr, "Reading " F_U64 " corrections from '%s'.\n", Clen, G->correctionsName);
// Count the number of bases, so we can do two gigantic allocations for bases and adjustments.
// Adjustments are always less than the number of corrections; we could also count exactly.
@@ -241,9 +241,9 @@ Correct_Frags(coParameters *G,
}
}
- fprintf(stderr, "Correcting "F_U64" bases with "F_U64" indel adjustments.\n", G->basesLen, G->adjustsLen);
+ fprintf(stderr, "Correcting " F_U64 " bases with " F_U64 " indel adjustments.\n", G->basesLen, G->adjustsLen);
- fprintf(stderr, "--Allocate "F_U64" + "F_U64" + "F_U64" MB for bases, adjusts and reads.\n",
+ fprintf(stderr, "--Allocate " F_U64 " + " F_U64 " + " F_U64 " MB for bases, adjusts and reads.\n",
(sizeof(char) * G->basesLen) >> 20,
(sizeof(Adjust_t) * G->adjustsLen) >> 20,
(sizeof(Frag_Info_t) * (G->endID - G->bgnID + 1)) >> 20);
@@ -285,7 +285,7 @@ Correct_Frags(coParameters *G,
// We should be at the IDENT message.
if (C[Cpos].type != IDENT) {
- fprintf(stderr, "ERROR: didn't find IDENT at Cpos="F_U64" for read "F_U32"\n", Cpos, curID);
+ fprintf(stderr, "ERROR: didn't find IDENT at Cpos=" F_U64 " for read " F_U32 "\n", Cpos, curID);
fprintf(stderr, " C[Cpos] = keep_left=%u keep_right=%u type=%u pos=%u readID=%u\n",
C[Cpos].keep_left,
C[Cpos].keep_right,
@@ -324,7 +324,7 @@ Correct_Frags(coParameters *G,
delete readData;
delete Cfile;
- fprintf(stderr, "Corrected "F_U64" bases with "F_U64" substitutions, "F_U64" deletions and "F_U64" insertions.\n",
+ fprintf(stderr, "Corrected " F_U64 " bases with " F_U64 " substitutions, " F_U64 " deletions and " F_U64 " insertions.\n",
G->basesLen,
changes[A_SUBST] + changes[C_SUBST] + changes[G_SUBST] + changes[T_SUBST],
changes[DELETE],
diff --git a/src/overlapErrorAdjustment/correctOverlaps-Prefix_Edit_Distance.C b/src/overlapErrorAdjustment/correctOverlaps-Prefix_Edit_Distance.C
index 2db1a19..36f5f13 100644
--- a/src/overlapErrorAdjustment/correctOverlaps-Prefix_Edit_Distance.C
+++ b/src/overlapErrorAdjustment/correctOverlaps-Prefix_Edit_Distance.C
@@ -142,7 +142,7 @@ Allocate_More_Edit_Space(pedWorkArea_t *WA) {
fprintf(stderr, "Allocate_More_Edit_Space()-- ERROR: couldn't allocate enough space for even one more entry! e=%d\n", e);
assert(e != b);
- fprintf(stderr, "--Allocate "F_U64" MB for edit array work space %u (positions %u-%u)\n", Size >> 20, a, b, e-1);
+ fprintf(stderr, "--Allocate %d MB for edit array work space %d (positions %u-%u)\n", Size >> 20, a, b, e-1);
}
diff --git a/src/overlapErrorAdjustment/correctOverlaps-Read_Olaps.C b/src/overlapErrorAdjustment/correctOverlaps-Read_Olaps.C
index e845f7f..eb88a6c 100644
--- a/src/overlapErrorAdjustment/correctOverlaps-Read_Olaps.C
+++ b/src/overlapErrorAdjustment/correctOverlaps-Read_Olaps.C
@@ -44,10 +44,10 @@ Read_Olaps(coParameters *G, gkStore *gkpStore) {
uint64 numNormal = 0;
uint64 numInnie = 0;
- fprintf(stderr, "Read_Olaps()-- Loading "F_U64" overlaps from '%s' for reads "F_U32" to "F_U32"\n",
+ fprintf(stderr, "Read_Olaps()-- Loading " F_U64 " overlaps from '%s' for reads " F_U32 " to " F_U32 "\n",
numolaps, G->ovlStorePath, G->bgnID, G->endID);
- fprintf(stderr, "--Allocate "F_U64" MB for overlaps.\n",
+ fprintf(stderr, "--Allocate " F_U64 " MB for overlaps.\n",
(sizeof(Olap_Info_t) * numolaps) >> 20);
G->olaps = new Olap_Info_t [numolaps];
@@ -75,7 +75,7 @@ Read_Olaps(coParameters *G, gkStore *gkpStore) {
delete ovs;
- fprintf(stderr, "Read_Olaps()-- Loaded "F_U64" overlaps -- "F_U64" normal and "F_U64" innie.\n",
+ fprintf(stderr, "Read_Olaps()-- Loaded " F_U64 " overlaps -- " F_U64 " normal and " F_U64 " innie.\n",
G->olapsLen, numNormal, numInnie);
}
diff --git a/src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C b/src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C
index 2cede22..ada0c4e 100644
--- a/src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C
+++ b/src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C
@@ -290,19 +290,19 @@ Redo_Olaps(coParameters *G, gkStore *gkpStore) {
// Allocate some temporary work space for the forward and reverse corrected B reads.
- fprintf(stderr, "--Allocate "F_U64" MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20);
+ fprintf(stderr, "--Allocate " F_U64 " MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20);
char *fseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
uint32 fseqLen = 0;
char *rseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
uint32 rseqLen = 0;
- fprintf(stderr, "--Allocate "F_U64" MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20);
+ fprintf(stderr, "--Allocate " F_U64 " MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20);
Adjust_t *fadj = new Adjust_t [AS_MAX_READLEN + 1];
Adjust_t *radj = new Adjust_t [AS_MAX_READLEN + 1];
uint32 fadjLen = 0; // radj is the same length
- fprintf(stderr, "--Allocate "F_U64" MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20);
+ fprintf(stderr, "--Allocate " F_U64 " MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20);
gkReadData *readData = new gkReadData;
pedWorkArea_t *ped = new pedWorkArea_t;
@@ -540,14 +540,14 @@ Redo_Olaps(coParameters *G, gkStore *gkpStore) {
delete [] G->adjusts; G->adjusts = NULL;
delete [] G->reads; G->reads = NULL;
- fprintf(stderr, "Olaps Fwd "F_U64"\n", olapsFwd);
- fprintf(stderr, "Olaps Rev "F_U64"\n", olapsRev);
+ fprintf(stderr, "Olaps Fwd " F_U64 "\n", olapsFwd);
+ fprintf(stderr, "Olaps Rev " F_U64 "\n", olapsRev);
- fprintf(stderr, "Total: "F_U64"\n", Total_Alignments_Ct);
- fprintf(stderr, "Failed: "F_U64" (both)\n", Failed_Alignments_Both_Ct);
- fprintf(stderr, "Failed: "F_U64" (either)\n", Failed_Alignments_Ct);
- fprintf(stderr, "Failed: "F_U64" (match to end)\n", Failed_Alignments_End_Ct);
- fprintf(stderr, "Failed: "F_U64" (negative length)\n", Failed_Alignments_Length_Ct);
+ fprintf(stderr, "Total: " F_U64 "\n", Total_Alignments_Ct);
+ fprintf(stderr, "Failed: " F_U64 " (both)\n", Failed_Alignments_Both_Ct);
+ fprintf(stderr, "Failed: " F_U64 " (either)\n", Failed_Alignments_Ct);
+ fprintf(stderr, "Failed: " F_U64 " (match to end)\n", Failed_Alignments_End_Ct);
+ fprintf(stderr, "Failed: " F_U64 " (negative length)\n", Failed_Alignments_Length_Ct);
fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass);
}
diff --git a/src/overlapErrorAdjustment/correctOverlaps.C b/src/overlapErrorAdjustment/correctOverlaps.C
index 2037b00..778c386 100644
--- a/src/overlapErrorAdjustment/correctOverlaps.C
+++ b/src/overlapErrorAdjustment/correctOverlaps.C
@@ -151,7 +151,7 @@ main(int argc, char **argv) {
// Load the reads for the overlaps we are going to be correcting, and apply corrections to them
- fprintf(stderr, "Correcting reads "F_U32" to "F_U32".\n", G->bgnID, G->endID);
+ fprintf(stderr, "Correcting reads " F_U32 " to " F_U32 ".\n", G->bgnID, G->endID);
Correct_Frags(G, gkpStore);
@@ -204,7 +204,7 @@ main(int argc, char **argv) {
AS_UTL_safeWrite(fp, &G->endID, "hiid", sizeof(int32), 1);
AS_UTL_safeWrite(fp, &G->olapsLen, "num", sizeof(uint64), 1);
- fprintf(stderr, "--Allocate "F_U64" MB for output error rates.\n",
+ fprintf(stderr, "--Allocate " F_U64 " MB for output error rates.\n",
(sizeof(uint16) * G->olapsLen) >> 20);
uint16 *evalue = new uint16 [G->olapsLen];
@@ -227,7 +227,8 @@ main(int argc, char **argv) {
delete G;
- fprintf(stderr, "DONE.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Bye.\n");
exit(0);
}
diff --git a/src/overlapErrorAdjustment/correctOverlaps.H b/src/overlapErrorAdjustment/correctOverlaps.H
index ab5e950..b4a75d5 100644
--- a/src/overlapErrorAdjustment/correctOverlaps.H
+++ b/src/overlapErrorAdjustment/correctOverlaps.H
@@ -138,7 +138,16 @@ public:
class Olap_Info_t {
public:
- Olap_Info_t() {};
+ Olap_Info_t() {
+ a_iid = 0;
+ b_iid = 0;
+ a_hang = 0;
+ b_hang = 0;
+ innie = false;
+ normal = false;
+ order = 0;
+ evalue = 0;
+ };
~Olap_Info_t() {};
uint32 a_iid;
@@ -208,7 +217,7 @@ public:
void initialize(coParameters *G_, double errorRate) {
G = G_;
- fprintf(stderr, "-- Allocate "F_U64" MB for Edit_Array pointers.\n", (sizeof(int32 *) * Edit_Array_Max) >> 20);
+ fprintf(stderr, "-- Allocate " F_U64 " MB for Edit_Array pointers.\n", (sizeof(int32 *) * Edit_Array_Max) >> 20);
Edit_Array_Max = 1 + (uint32)(errorRate * AS_MAX_READLEN);
Edit_Array_Lazy = new int32 * [Edit_Array_Max];
diff --git a/src/overlapErrorAdjustment/findErrors-Process_Olap.C b/src/overlapErrorAdjustment/findErrors-Process_Olap.C
index 5d17564..0ad303b 100644
--- a/src/overlapErrorAdjustment/findErrors-Process_Olap.C
+++ b/src/overlapErrorAdjustment/findErrors-Process_Olap.C
@@ -15,6 +15,10 @@
*
* Modifications by:
*
+ * Brian P. Walenz beginning on 2016-JUN-27
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -147,11 +151,14 @@ Process_Olap(Olap_Info_t *olap,
match_to_end = true;
}
- if ((errors <= wa->G->Error_Bound[olap_len]) && (match_to_end == true))
+
+ if ((errors <= wa->G->Error_Bound[olap_len]) && (match_to_end == true)) {
+ wa->passedOlaps++;
Analyze_Alignment(wa,
a_part, a_end, a_offset,
b_part, b_end,
ri);
- else
+ } else {
wa->failedOlaps++;
+ }
}
diff --git a/src/overlapErrorAdjustment/findErrors-Read_Frags.C b/src/overlapErrorAdjustment/findErrors-Read_Frags.C
index ab80b55..60c21db 100644
--- a/src/overlapErrorAdjustment/findErrors-Read_Frags.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Frags.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -56,7 +60,7 @@ Read_Frags(feParameters *G,
uint64 votesLength = 0;
uint64 readsLoaded = 0;
- fprintf(stderr, "Read_Frags()-- from "F_U32" through "F_U32"\n",
+ fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 "\n",
G->bgnID, G->endID);
for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
@@ -74,7 +78,7 @@ Read_Frags(feParameters *G,
totAlloc >> 20,
G->endID - G->bgnID + 1,
basesLength,
- (double)totAlloc / basesLength);
+ (basesLength > 0) ? ((double)totAlloc / basesLength) : 0.0);
G->readBases = new char [basesLength];
G->readVotes = new Vote_Tally_t [votesLength]; // NO constructor, MUST INIT
@@ -118,6 +122,6 @@ Read_Frags(feParameters *G,
delete readData;
- fprintf(stderr, "Read_Frags()-- from "F_U32" through "F_U32" -- loaded "F_U64" bases in "F_U64" reads.\n",
+ fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 " -- loaded " F_U64 " bases in " F_U64 " reads.\n",
G->bgnID, G->endID-1, basesLength, readsLoaded);
}
diff --git a/src/overlapErrorAdjustment/findErrors-Read_Olaps.C b/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
index 8a82a67..19626d4 100644
--- a/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -37,7 +41,7 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
uint64 numolaps = ovs->numOverlapsInRange();
- fprintf(stderr, "Read_Olaps()-- loading "F_U64" overlaps.\n",
+ fprintf(stderr, "Read_Olaps()-- loading " F_U64 " overlaps.\n",
numolaps);
G->olaps = new Olap_Info_t [numolaps];
diff --git a/src/overlapErrorAdjustment/findErrors.C b/src/overlapErrorAdjustment/findErrors.C
index f39a5f5..1359f0b 100644
--- a/src/overlapErrorAdjustment/findErrors.C
+++ b/src/overlapErrorAdjustment/findErrors.C
@@ -53,7 +53,7 @@ Output_Corrections(feParameters *G);
// From overlapInCore.C
int
-Binomial_Bound (int e, double p, int Start, double Limit);
+Binomial_Bound(int e, double p, int Start, double Limit);
@@ -92,10 +92,8 @@ Extract_Needed_Frags(feParameters *G,
assert(loID <= fi);
- fprintf(stderr, "Extract_Needed_Frags()-- Loading used reads between "F_U32" and "F_U32".\n",
- fi, hiID);
-
- fprintf(stderr, "Extract_Needed_Frags()-- At overlap "F_U64"\n", lastOlap);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Extract_Needed_Frags()-- Loading used reads between " F_U32 " and " F_U32 ", at overlap " F_U64 ".\n", fi, hiID, lastOlap);
while (fi <= hiID) {
gkRead *read = gkpStore->gkStore_getRead(fi);
@@ -111,9 +109,7 @@ Extract_Needed_Frags(feParameters *G,
fi = (lastOlap < G->olapsLen) ? G->olaps[lastOlap].b_iid : hiID + 1;
}
- fprintf(stderr, "Extract_Needed_Frags()-- Loading reads for overlaps "F_U64" to "F_U64"\n",
- nextOlap, lastOlap);
- fprintf(stderr, "Extract_Needed_Frags()-- reads "F_U32" bases "F_U64"\n", fl->readsLen, fl->basesLen);
+ fprintf(stderr, "Extract_Needed_Frags()-- Loading reads for overlaps " F_U64 " to " F_U64 " (reads " F_U32 " bases " F_U64 ")\n", nextOlap, lastOlap, fl->readsLen, fl->basesLen);
// Ensure there is space.
@@ -121,7 +117,7 @@ Extract_Needed_Frags(feParameters *G,
delete [] fl->readIDs;
delete [] fl->readBases;
- fprintf(stderr, "realloc reads from "F_U32" to "F_U32"\n", fl->readsMax, 12 * fl->readsLen / 10);
+ //fprintf(stderr, "Extract_Needed_Frags()-- realloc reads from " F_U32 " to " F_U32 "\n", fl->readsMax, 12 * fl->readsLen / 10);
fl->readIDs = new uint32 [12 * fl->readsLen / 10];
fl->readBases = new char * [12 * fl->readsLen / 10];
@@ -132,7 +128,7 @@ Extract_Needed_Frags(feParameters *G,
if (fl->basesMax < fl->basesLen) {
delete [] fl->bases;
- fprintf(stderr, "realloc bases from "F_U64" to "F_U64"\n", fl->basesMax, 12 * fl->basesLen / 10);
+ //fprintf(stderr, "Extract_Needed_Frags()-- realloc bases from " F_U64 " to " F_U64 "\n", fl->basesMax, 12 * fl->basesLen / 10);
fl->bases = new char [12 * fl->basesLen / 10];
@@ -182,9 +178,13 @@ Extract_Needed_Frags(feParameters *G,
fl->readsLen = ii;
- fprintf(stderr, "Extract_Needed_Frags()-- Loaded "F_U32" reads (%.4f%%). Loaded IDs "F_U32" through "F_U32".\n",
- fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID),
- fl->readIDs[0], fl->readIDs[fl->readsLen-1]);
+ if (fl->readsLen > 0)
+ fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%). Loaded IDs " F_U32 " through " F_U32 ".\n",
+ fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID),
+ fl->readIDs[0], fl->readIDs[fl->readsLen-1]);
+ else
+ fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%).\n",
+ fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID));
}
@@ -230,10 +230,6 @@ Threaded_Process_Stream(void *ptr) {
}
}
- //pthread_mutex_lock(& Print_Mutex);
- //fprintf(stderr, "Thread %d processed %d olaps\n", wa->thread_id, olap_ct);
- //pthread_mutex_unlock(& Print_Mutex);
-
pthread_exit(ptr);
return(NULL);
@@ -251,17 +247,18 @@ Threaded_Process_Stream(void *ptr) {
static
void
-Threaded_Stream_Old_Frags(feParameters *G, gkStore *gkpStore) {
+Threaded_Stream_Old_Frags(feParameters *G,
+ gkStore *gkpStore,
+ uint64 &passedOlaps,
+ uint64 &failedOlaps) {
pthread_attr_t attr;
- pthread_mutex_init(&G->Print_Mutex, NULL);
-
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, THREAD_STACKSIZE);
- pthread_t *thread_id = new pthread_t [G->numThreads];
- Thread_Work_Area_t *thread_wa = new Thread_Work_Area_t[G->numThreads];
+ pthread_t *thread_id = new pthread_t [G->numThreads];
+ Thread_Work_Area_t *thread_wa = new Thread_Work_Area_t [G->numThreads];
for (uint32 i=0; i<G->numThreads; i++) {
thread_wa[i].thread_id = i;
@@ -271,6 +268,7 @@ Threaded_Stream_Old_Frags(feParameters *G, gkStore *gkpStore) {
thread_wa[i].G = G;
thread_wa[i].frag_list = NULL;
thread_wa[i].rev_id = UINT32_MAX;
+ thread_wa[i].passedOlaps = 0;
thread_wa[i].failedOlaps = 0;
memset(thread_wa[i].rev_seq, 0, sizeof(char) * AS_MAX_READLEN);
@@ -349,6 +347,19 @@ Threaded_Stream_Old_Frags(feParameters *G, gkStore *gkpStore) {
next_frag_list = s;
}
}
+
+ // Threads all done, sum up stats.
+
+ passedOlaps = 0;
+ failedOlaps = 0;
+
+ for (uint32 i=0; i<G->numThreads; i++) {
+ passedOlaps += thread_wa[i].passedOlaps;
+ failedOlaps += thread_wa[i].failedOlaps;
+ }
+
+ delete [] thread_id;
+ delete [] thread_wa;
}
@@ -456,10 +467,7 @@ main(int argc, char **argv) {
exit(1);
}
-
- //
// Initialize Globals
- //
double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN);
@@ -468,9 +476,7 @@ main(int argc, char **argv) {
for (uint32 i = 0; i <= AS_MAX_READLEN; i++)
G->Error_Bound[i] = (int)ceil(i * G->errorRate);
- //
- //
- //
+ // Load data.
gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath);
@@ -483,23 +489,35 @@ main(int argc, char **argv) {
Read_Frags(G, gkpStore);
Read_Olaps(G, gkpStore);
- // Now sort them!
+ // Sort overlaps, process each.
sort(G->olaps, G->olaps + G->olapsLen);
- //fprintf (stderr, "Before Stream_Old_Frags Num_Olaps = "F_S64"\n", Num_Olaps);
+ uint64 passedOlaps = 0;
+ uint64 failedOlaps = 0;
- Threaded_Stream_Old_Frags(G, gkpStore);
+ Threaded_Stream_Old_Frags(G, gkpStore, passedOlaps, failedOlaps);
- //fprintf (stderr, " Failed overlaps = %d\n", Failed_Olaps);
+ // All done. Sum up what we did.
- gkpStore->gkStore_close();
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Passed overlaps = %10" F_U64P " %8.4f%%\n", passedOlaps, 100.0 * passedOlaps / (failedOlaps + passedOlaps));
+ fprintf(stderr, "Failed overlaps = %10" F_U64P " %8.4f%%\n", failedOlaps, 100.0 * failedOlaps / (failedOlaps + passedOlaps));
+
+ // Dump output.
//Output_Details(G);
Output_Corrections(G);
+ // Cleanup and exit!
+
+ gkpStore->gkStore_close();
+
delete G;
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Bye.\n");
+
exit(0);
}
diff --git a/src/overlapErrorAdjustment/findErrors.H b/src/overlapErrorAdjustment/findErrors.H
index 775d178..4f2004b 100644
--- a/src/overlapErrorAdjustment/findErrors.H
+++ b/src/overlapErrorAdjustment/findErrors.H
@@ -15,6 +15,10 @@
*
* Modifications by:
*
+ * Brian P. Walenz beginning on 2016-JUN-27
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -154,7 +158,14 @@ public:
class Olap_Info_t {
public:
- Olap_Info_t() {};
+ Olap_Info_t() {
+ a_iid = 0;
+ b_iid = 0;
+ a_hang = 0;
+ b_hang = 0;
+ innie = false;
+ normal = false;
+ };
~Olap_Info_t() {};
uint32 a_iid;
@@ -225,6 +236,7 @@ public:
deltaLen = 0;
Edit_Array_Lazy = NULL;
+ Edit_Array_Max = 0;
};
~pedWorkArea_t() {
@@ -274,7 +286,8 @@ struct Thread_Work_Area_t {
Vote_t globalvote[AS_MAX_READLEN];
- uint32 failedOlaps;
+ uint64 passedOlaps;
+ uint64 failedOlaps;
pedWorkArea_t ped;
};
@@ -375,8 +388,5 @@ public:
// This array [i] is the maximum number of errors allowed in a match between sequences of length
// i , which is i * MAXERROR_RATE .
int Error_Bound [AS_MAX_READLEN + 1];
-
- // To make debugging printout come out together
- pthread_mutex_t Print_Mutex;
};
diff --git a/src/overlapInCore/libedlib/edlib.C b/src/overlapInCore/libedlib/edlib.C
new file mode 100644
index 0000000..5f6d8b8
--- /dev/null
+++ b/src/overlapInCore/libedlib/edlib.C
@@ -0,0 +1,1394 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Sergey Koren beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * Brian P. Walenz beginning on 2016-SEP-23
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2014 Martin Šošić
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "edlib.H"
+
+#include <stdint.h>
+#include <cstdlib>
+#include <algorithm>
+#include <vector>
+#include <cstring>
+#include <cassert>
+
+using namespace std;
+
+typedef uint64_t Word;
+static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits
+static const Word WORD_1 = (Word)1;
+static const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1); // 100..00
+
+// Data needed to find alignment. Owns five heap arrays: allocated with new[] in the constructor, released with delete[] in the destructor.
+struct AlignmentData {
+ Word* Ps; // maxNumBlocks * targetLength entries
+ Word* Ms; // maxNumBlocks * targetLength entries
+ int* scores; // maxNumBlocks * targetLength entries
+ int* firstBlocks; // targetLength entries, one per column
+ int* lastBlocks; // targetLength entries, one per column
+
+ AlignmentData(int maxNumBlocks, int targetLength) {
+ // We build a complete table and mark first and last block for each column
+ // (because algorithm is banded so only part of each columns is used).
+ // TODO: do not build a whole table, but just enough blocks for each column. NOTE(review): the int product maxNumBlocks * targetLength is not checked for overflow here.
+ Ps = new Word[maxNumBlocks * targetLength];
+ Ms = new Word[maxNumBlocks * targetLength];
+ scores = new int[maxNumBlocks * targetLength];
+ firstBlocks = new int[targetLength];
+ lastBlocks = new int[targetLength];
+ }
+
+ ~AlignmentData() { // NOTE(review): no copy constructor/assignment is declared in this struct, so a copy would delete[] the same arrays twice -- confirm instances are never copied.
+ delete[] Ps;
+ delete[] Ms;
+ delete[] scores;
+ delete[] firstBlocks;
+ delete[] lastBlocks;
+ }
+};
+
+struct Block { // One word-sized block of a column; P and M are passed as Pv and Mv to calculateBlock().
+ Word P; // Pvin
+ Word M; // Mvin
+ int score; // score of last cell in block;
+
+ Block() {} // Leaves P, M and score uninitialized; callers must assign them before reading.
+ Block(Word P, Word M, int score) :P(P), M(M), score(score) {} // Initializes all three members from the arguments.
+};
+
+static int myersCalcEditDistanceSemiGlobal(Word* Peq, int W, int maxNumBlocks,
+ const unsigned char* query, int queryLength,
+ const unsigned char* target, int targetLength,
+ int alphabetLength, int k, EdlibAlignMode mode, int* bestScore,
+ int** positions, int* numPositions);
+
+static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
+ const unsigned char* query, int queryLength,
+ const unsigned char* target, int targetLength,
+ int alphabetLength, int k, int* bestScore, int* position,
+ bool findAlignment, AlignmentData** alignData, int targetStopPosition);
+
+
+static int obtainAlignment(
+ const unsigned char* query, const unsigned char* rQuery, const int queryLength,
+ const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+ const int alphabetLength, const int bestScore,
+ unsigned char** alignment, int* alignmentLength);
+
+static int obtainAlignmentHirschberg(
+ const unsigned char* query, const unsigned char* rQuery, const int queryLength,
+ const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+ const int alphabetLength, const int bestScore,
+ unsigned char** alignment, int* alignmentLength);
+
+static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
+ const int bestScore, const AlignmentData* alignData,
+ unsigned char** alignment, int* alignmentLength);
+
+static int transformSequences(const char* queryOriginal, const int queryLength,
+ const char* targetOriginal, const int targetLength,
+ unsigned char** queryTransformed, unsigned char** targetTransformed);
+
+static inline int ceilDiv(int x, int y);
+
+static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
+
+static inline Word* buildPeq(int alphabetLength, const unsigned char* query, int queryLength);
+
+
+
+/**
+ * Main edlib method: computes the edit distance between query and target and,
+ * depending on config.task, the start locations and the alignment path.
+ *
+ * @param [in] queryOriginal   Query sequence; queryLength characters are used.
+ * @param [in] queryLength     Number of characters in queryOriginal.
+ * @param [in] targetOriginal  Target sequence; targetLength characters are used.
+ * @param [in] targetLength    Number of characters in targetOriginal.
+ * @param [in] config          config.k is the maximum edit distance to search for; if it is
+ *                             negative, the search starts at k = WORD_SIZE and k is doubled
+ *                             until a solution is found.
+ *                             config.mode selects the semi-global search (EDLIB_MODE_HW,
+ *                             EDLIB_MODE_SHW) or, for any other value, the NW search.
+ *                             config.task: EDLIB_TASK_LOC and EDLIB_TASK_PATH additionally
+ *                             fill startLocations; EDLIB_TASK_PATH also fills alignment.
+ * @return Result structure. editDistance is -1 if no solution within k was found, in which
+ *         case endLocations, startLocations and alignment stay NULL. endLocations and
+ *         startLocations are allocated with malloc(). The alignment is produced only for
+ *         the first (start, end) location pair.
+ *         NOTE(review): ownership of result.alignment depends on obtainAlignment(), which is
+ *         not visible here -- confirm how callers are expected to release it.
+ */
+EdlibAlignResult edlibAlign(const char* queryOriginal, const int queryLength,
+                            const char* targetOriginal, const int targetLength,
+                            const EdlibAlignConfig config) {
+    EdlibAlignResult result;
+    result.editDistance = -1;
+    result.endLocations = result.startLocations = NULL;
+    result.numLocations = 0;
+    result.alignment = NULL;
+    result.alignmentLength = 0;
+    result.alphabetLength = 0;
+
+
+    /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
+    unsigned char* query, * target;
+    int alphabetLength = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
+                                            &query, &target);
+    result.alphabetLength = alphabetLength;
+    /*-------------------------------------------------------*/
+
+
+    /*--------------------- INITIALIZATION ------------------*/
+    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
+    int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
+
+    Word* Peq = buildPeq(alphabetLength, query, queryLength);
+    /*-------------------------------------------------------*/
+
+
+    /*------------------ MAIN CALCULATION -------------------*/
+    // TODO: Store alignment data only after k is determined? That could make things faster.
+    int positionNW; // Used only when mode is NW.
+    AlignmentData* alignData = NULL;
+    bool dynamicK = false;
+    int k = config.k;
+    if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
+        dynamicK = true;
+        k = WORD_SIZE; // Gives better results than smaller k.
+    }
+
+    do {
+        if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) {
+            myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks,
+                                            query, queryLength, target, targetLength,
+                                            alphabetLength, k, config.mode, &(result.editDistance),
+                                            &(result.endLocations), &(result.numLocations));
+        } else { // mode == EDLIB_MODE_NW
+            myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+                                    query, queryLength, target, targetLength,
+                                    alphabetLength, k, &(result.editDistance), &positionNW,
+                                    false, &alignData, -1);
+        }
+        k *= 2;
+    } while(dynamicK && result.editDistance == -1);
+
+    if (result.editDistance >= 0) { // If there is solution.
+        // If NW mode, set end location explicitly.
+        if (config.mode == EDLIB_MODE_NW) {
+            result.endLocations = (int *) malloc(sizeof(int) * 1);
+            result.endLocations[0] = targetLength - 1;
+            result.numLocations = 1;
+        }
+
+        // Find starting locations.
+        if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
+            result.startLocations = (int*) malloc(result.numLocations * sizeof(int));
+            if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations.
+                const unsigned char* rTarget = createReverseCopy(target, targetLength);
+                const unsigned char* rQuery = createReverseCopy(query, queryLength);
+                Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength); // Peq for reversed query
+                for (int i = 0; i < result.numLocations; i++) {
+                    int endLocation = result.endLocations[i];
+                    int bestScoreSHW, numPositionsSHW;
+                    int* positionsSHW;
+                    myersCalcEditDistanceSemiGlobal(
+                        rPeq, W, maxNumBlocks,
+                        rQuery, queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1,
+                        alphabetLength, result.editDistance, EDLIB_MODE_SHW,
+                        &bestScoreSHW, &positionsSHW, &numPositionsSHW);
+                    // Taking last location as start ensures that alignment will not start with insertions
+                    // if it can start with mismatches instead.
+                    result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1];
+                    // positionsSHW is allocated with malloc() by myersCalcEditDistanceSemiGlobal(),
+                    // so it must be released with free(), not delete[].
+                    free(positionsSHW);
+                }
+                delete[] rTarget;
+                delete[] rQuery;
+                delete[] rPeq;
+            } else { // If mode is SHW or NW
+                for (int i = 0; i < result.numLocations; i++) {
+                    result.startLocations[i] = 0;
+                }
+            }
+        }
+
+        // Find alignment -> all comes down to finding alignment for NW.
+        // Currently we return alignment only for first pair of locations.
+        if (config.task == EDLIB_TASK_PATH) {
+            int alnStartLocation = result.startLocations[0];
+            int alnEndLocation = result.endLocations[0];
+            const unsigned char* alnTarget = target + alnStartLocation;
+            const int alnTargetLength = alnEndLocation - alnStartLocation + 1;
+            const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength);
+            const unsigned char* rQuery = createReverseCopy(query, queryLength);
+            obtainAlignment(query, rQuery, queryLength,
+                            alnTarget, rAlnTarget, alnTargetLength,
+                            alphabetLength, result.editDistance,
+                            &(result.alignment), &(result.alignmentLength));
+            delete[] rAlnTarget;
+            delete[] rQuery;
+        }
+    }
+    /*-------------------------------------------------------*/
+
+    //--- Free memory ---//
+    delete[] Peq;      // Allocated with new[] in buildPeq().
+    free(query);
+    free(target);
+    delete alignData;  // Deleting a NULL pointer is a no-op.
+    //-------------------//
+
+    return result;
+}
+
+
+/**
+ * Converts an alignment, given as a sequence of move codes, into a cigar string.
+ *
+ * Move codes 0, 1, 2, 3 map to '=', 'I', 'D', 'X' for EDLIB_CIGAR_EXTENDED; for
+ * EDLIB_CIGAR_STANDARD codes 0 and 3 both map to 'M'. Consecutive moves with the same
+ * character are merged into a single "<count><char>" run.
+ *
+ * @param [in] alignment        Array of move codes, each in the range 0..3.
+ * @param [in] alignmentLength  Number of entries in alignment.
+ * @param [in] cigarFormat      EDLIB_CIGAR_EXTENDED or EDLIB_CIGAR_STANDARD.
+ * @return Null-terminated cigar string allocated with malloc() (release it with free()),
+ *         or 0 if cigarFormat is not one of the two formats, if alignment contains a
+ *         code larger than 3, or if the allocation fails.
+ */
+char* edlibAlignmentToCigar(unsigned char* alignment, int alignmentLength,
+                            EdlibCigarFormat cigarFormat) {
+    if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) {
+        return 0;
+    }
+
+    // Check every move code before any of them is used as an index into
+    // moveCodeToChar, so invalid input can never cause an out-of-bounds read.
+    for (int i = 0; i < alignmentLength; i++) {
+        if (alignment[i] > 3) {
+            return 0;
+        }
+    }
+
+    // Maps move code from alignment to char in cigar.
+    //                        0    1    2    3
+    char moveCodeToChar[] = {'=', 'I', 'D', 'X'};
+    if (cigarFormat == EDLIB_CIGAR_STANDARD) {
+        moveCodeToChar[0] = moveCodeToChar[3] = 'M';
+    }
+
+    vector<char> cigar;
+    char lastMove = 0;  // Char of last move. 0 if there was no previous move.
+    int numOfSameMoves = 0;
+    for (int i = 0; i <= alignmentLength; i++) {
+        // A run ends at the end of the alignment or when the move character changes.
+        const bool runEnded = (i == alignmentLength) || (moveCodeToChar[alignment[i]] != lastMove);
+        if (runEnded && lastMove != 0) {
+            // Write number of moves to cigar string: digits are pushed least significant
+            // first and then reversed in place. This also resets numOfSameMoves to 0.
+            int numDigits = 0;
+            for (; numOfSameMoves; numOfSameMoves /= 10) {
+                cigar.push_back('0' + numOfSameMoves % 10);
+                numDigits++;
+            }
+            reverse(cigar.end() - numDigits, cigar.end());
+            // Write code of move to cigar string.
+            cigar.push_back(lastMove);
+        }
+        if (i < alignmentLength) {
+            lastMove = moveCodeToChar[alignment[i]];
+            numOfSameMoves++;
+        }
+    }
+    cigar.push_back(0);  // Null character termination.
+
+    char* cigar_ = (char*) malloc(cigar.size() * sizeof(char));
+    if (cigar_ == NULL) {
+        return 0;
+    }
+    memcpy(cigar_, &cigar[0], cigar.size() * sizeof(char));
+
+    return cigar_;
+}
+
+/**
+ * Build Peq table for given query and alphabet.
+ * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
+ * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
+ * Bits for positions at or past queryLength are set to 1 for every symbol (the query is
+ * treated as padded with wildcards), and the extra row alphabetLength is all 1s.
+ * @param [in] alphabetLength Number of symbols; query values are compared against 0..alphabetLength-1.
+ * @param [in] query Query sequence; only indices below queryLength are read.
+ * @param [in] queryLength Length of query; maxNumBlocks is ceilDiv(queryLength, WORD_SIZE).
+ * @return Newly allocated table of (alphabetLength + 1) * maxNumBlocks words.
+ * NOTICE: free returned array with delete[]!
+ */
+static inline Word* buildPeq(int alphabetLength, const unsigned char* query, int queryLength) {
+ int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+ // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
+ Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
+
+ // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
+ for (int symbol = 0; symbol <= alphabetLength; symbol++) {
+ for (int b = 0; b < maxNumBlocks; b++) {
+ if (symbol < alphabetLength) {
+ Peq[symbol * maxNumBlocks + b] = 0;
+ for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { // Walk the block from its last position down, so the first position ends up in bit 0.
+ Peq[symbol * maxNumBlocks + b] <<= 1;
+ // NOTE: We pretend like query is padded at the end with W wildcard symbols
+ if (r >= queryLength || query[r] == symbol)
+ Peq[symbol * maxNumBlocks + b] += 1;
+ }
+ } else { // Last symbol is wildcard, so it is all 1s
+ Peq[symbol * maxNumBlocks + b] = (Word)-1;
+ }
+ }
+ }
+
+ return Peq;
+}
+
+
+/**
+ * Returns new sequence that is reverse of given sequence.
+ * @param [in] seq Sequence to reverse; it is not modified.
+ * @param [in] length Number of elements of seq to copy.
+ * @return Array of length elements allocated with new[]; release it with delete[].
+ */
+static inline unsigned char* createReverseCopy(const unsigned char* seq, int length) {
+ unsigned char* rSeq = new unsigned char[length];
+ for (int i = 0; i < length; i++) {
+ rSeq[i] = seq[length - i - 1];
+ }
+ return rSeq;
+}
+
+
+
+/**
+ * Corresponds to Advance_Block function from Myers.
+ * Calculates one word(block), which is part of a column.
+ * Highest bit of word (one most to the left) is most bottom cell of block from column.
+ * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
+ * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
+ * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
+ * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch.
+ * @param [in] hin Will be +1, 0 or -1.
+ * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
+ * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
+ * @return hout, which will be +1, 0 or -1 (it is the return value, not an out-parameter).
+ */
+static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
+ Word &PvOut, Word &MvOut) {
+ // hin can be 1, -1 or 0.
+ // 1 -> 00...01
+ // 0 -> 00...00
+ // -1 -> 11...11 (2-complement)
+
+ Word hinIsNeg = (Word)(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1. NOTE(review): relies on >> of a negative int being an arithmetic shift -- confirm for the target compilers.
+
+ Word Xv = Eq | Mv;
+ // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
+ Eq |= hinIsNeg;
+ Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
+
+ Word Ph = Mv | ~(Xh | Pv);
+ Word Mh = Pv & Xh;
+
+ int hout = 0;
+ // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
+ hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+ // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
+ hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+
+ Ph <<= 1; // hout was read from the high bits above, before this shift discards them.
+ Mh <<= 1;
+
+ // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
+ Mh |= hinIsNeg;
+ // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
+ Ph |= (Word)((hin + 1) >> 1);
+
+ PvOut = Mh | ~(Xv | Ph);
+ MvOut = Ph & Xv;
+
+ return hout;
+}
+
+/**
+ * Does ceiling division x / y.
+ * Note: x and y must be non-negative and x + y must not overflow.
+ * y must also be non-zero, since it is used as the divisor.
+ * @return x / y, plus 1 when x is not an exact multiple of y.
+ */
+static inline int ceilDiv(int x, int y) {
+ return x % y ? x / y + 1 : x / y;
+}
+
+static inline int min(int x, int y) { // Returns the smaller of x and y (y when they are equal).
+ return x < y ? x : y;
+}
+
+static inline int max(int x, int y) { // Returns the larger of x and y (y when they are equal).
+ return x > y ? x : y;
+}
+
+
+/**
+ * Reconstructs the score of every cell in a block from the score of its last cell
+ * and the P/M delta bitsets.
+ * @param [in] block
+ * @return Values of cells in block, starting with bottom cell in block.
+ * The vector has WORD_SIZE elements; element 0 equals block.score.
+ */
+static inline vector<int> getBlockCellValues(const Block block) {
+ vector<int> scores(WORD_SIZE);
+ int score = block.score;
+ Word mask = HIGH_BIT_MASK; // Start at the highest bit and move towards the lowest.
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
+ scores[i] = score;
+ if (block.P & mask) score--; // P bit set: the next cell's value is one lower.
+ if (block.M & mask) score++; // M bit set: the next cell's value is one higher.
+ mask >>= 1;
+ }
+ scores[WORD_SIZE - 1] = score;
+ return scores;
+}
+
+/**
+ * Writes values of cells in block into given array, starting with first/top cell.
+ * dest[WORD_SIZE - 1] receives block.score and lower indices receive the values
+ * reconstructed by walking the P/M bits from the highest bit down.
+ * @param [in] block
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
+ */
+static inline void readBlock(const Block block, int* const dest) {
+ int score = block.score;
+ Word mask = HIGH_BIT_MASK;
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
+ dest[WORD_SIZE - 1 - i] = score; // Fill dest from its last index towards index 0.
+ if (block.P & mask) score--;
+ if (block.M & mask) score++;
+ mask >>= 1;
+ }
+ dest[0] = score;
+}
+
+/**
+ * Writes values of cells in block into given array, starting with last/bottom cell.
+ * dest[0] receives block.score; this is the same order getBlockCellValues() returns.
+ * @param [in] block
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
+ */
+static inline void readBlockReverse(const Block block, int* const dest) {
+ int score = block.score;
+ Word mask = HIGH_BIT_MASK;
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
+ dest[i] = score; // Fill dest from index 0 upwards.
+ if (block.P & mask) score--;
+ if (block.M & mask) score++;
+ mask >>= 1;
+ }
+ dest[WORD_SIZE - 1] = score;
+}
+
+/**
+ * Checks every cell of the block against k, using getBlockCellValues() to expand the block.
+ * @param [in] block
+ * @param [in] k
+ * @return True if all cells in block have value larger than k, otherwise false.
+ */
+static inline bool allBlockCellsLarger(const Block block, const int k) {
+ vector<int> scores = getBlockCellValues(block); // Allocates a WORD_SIZE-element vector on every call.
+ for (int i = 0; i < WORD_SIZE; i++) {
+ if (scores[i] <= k) return false; // One cell at or below k is enough to answer false.
+ }
+ return true;
+}
+
+
+/**
+ * @param [in] mode EDLIB_MODE_HW or EDLIB_MODE_SHW
+ */
+static int myersCalcEditDistanceSemiGlobal(Word* const Peq, const int W, const int maxNumBlocks,
+ const unsigned char* const query, const int queryLength,
+ const unsigned char* const target, const int targetLength,
+ const int alphabetLength, int k, const EdlibAlignMode mode,
+ int* bestScore_, int** positions_, int* numPositions_) {
+ *positions_ = NULL;
+ *numPositions_ = 0;
+
+ // firstBlock is 0-based index of first block in Ukkonen band.
+ // lastBlock is 0-based index of last block in Ukkonen band.
+ int firstBlock = 0;
+ int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
+ Block *bl; // Current block
+
+ Block* blocks = new Block[maxNumBlocks];
+
+ // For HW, solution will never be larger then queryLength.
+ if (mode == EDLIB_MODE_HW) {
+ k = min(queryLength, k);
+ }
+
+ // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
+ // This gives speed up of about 2 times for small k.
+ const int STRONG_REDUCE_NUM = 2048;
+
+ // Initialize P, M and score
+ bl = blocks;
+ for (int b = 0; b <= lastBlock; b++) {
+ bl->score = (b + 1) * WORD_SIZE;
+ bl->P = (Word)-1; // All 1s
+ bl->M = (Word)0;
+ bl++;
+ }
+
+ int bestScore = -1;
+ vector<int> positions; // TODO: Maybe put this on heap?
+ const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
+ const unsigned char* targetChar = target;
+ for (int c = 0; c < targetLength; c++) { // for each column
+ const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;
+
+ //----------------------- Calculate column -------------------------//
+ int hout = startHout;
+ bl = blocks + firstBlock;
+ Peq_c += firstBlock;
+ for (int b = firstBlock; b <= lastBlock; b++) {
+ hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
+ bl->score += hout;
+ bl++; Peq_c++;
+ }
+ bl--; Peq_c--;
+ //------------------------------------------------------------------//
+
+ //---------- Adjust number of blocks according to Ukkonen ----------//
+ if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
+ && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
+ // If score of left block is not too big, calculate one more block
+ lastBlock++; bl++; Peq_c++;
+ bl->P = (Word)-1; // All 1s
+ bl->M = (Word)0;
+ bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
+ } else {
+ while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
+ lastBlock--; bl--; Peq_c--;
+ }
+ }
+
+ // Every some columns, do some expensive but also more efficient block reducing -> this is important!
+ if (c % STRONG_REDUCE_NUM == 0) {
+ while (lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
+ lastBlock--; bl--; Peq_c--;
+ }
+ }
+
+ if (mode != EDLIB_MODE_HW) {
+ while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
+ firstBlock++;
+ }
+ if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
+ while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
+ firstBlock++;
+ }
+ }
+ }
+
+ // For HW, even if all cells are > k, there still may be solution in next
+ // column because starting conditions at upper boundary are 0.
+ // That means that first block is always candidate for solution,
+ // and we can never end calculation before last column.
+ if (mode == EDLIB_MODE_HW) {
+ lastBlock = max(0, lastBlock);
+ }
+
+ // If band stops to exist finish
+ if (lastBlock < firstBlock) {
+ *bestScore_ = bestScore;
+ if (bestScore != -1) {
+ *positions_ = (int *) malloc(sizeof(int) * positions.size());
+ *numPositions_ = positions.size();
+ copy(positions.begin(), positions.end(), *positions_);
+ }
+ delete[] blocks;
+ return EDLIB_STATUS_OK;
+ }
+ //------------------------------------------------------------------//
+
+ //------------------------- Update best score ----------------------//
+ if (lastBlock == maxNumBlocks - 1) {
+ int colScore = bl->score;
+ if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
+ // NOTE: Score that I find in column c is actually score from column c-W
+ if (bestScore == -1 || colScore <= bestScore) {
+ if (colScore != bestScore) {
+ positions.clear();
+ bestScore = colScore;
+ // Change k so we will look only for equal or better
+ // scores then the best found so far.
+ k = bestScore;
+ }
+ positions.push_back(c - W);
+ }
+ }
+ }
+ //------------------------------------------------------------------//
+
+ targetChar++;
+ }
+
+
+ // Obtain results for last W columns from last column.
+ if (lastBlock == maxNumBlocks - 1) {
+ vector<int> blockScores = getBlockCellValues(*bl);
+ for (int i = 0; i < W; i++) {
+ int colScore = blockScores[i + 1];
+ if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
+ if (colScore != bestScore) {
+ positions.clear();
+ k = bestScore = colScore;
+ }
+ positions.push_back(targetLength - W + i);
+ }
+ }
+ }
+
+ *bestScore_ = bestScore;
+ if (bestScore != -1) {
+ *positions_ = (int *) malloc(sizeof(int) * positions.size());
+ *numPositions_ = positions.size();
+ copy(positions.begin(), positions.end(), *positions_);
+ }
+
+ delete[] blocks;
+ return EDLIB_STATUS_OK;
+}
+
+
+
+
+/**
+ * Calculates global (NW) edit distance of query and target. Target is processed column by column,
+ * and in each column only blocks [firstBlock, lastBlock] that are inside of the band are calculated.
+ * @param Peq Table that is read as Peq[symbol * maxNumBlocks + blockIndex] for each symbol of target.
+ * @param W Number of cells with which query is padded (see NOTICE comments in the function body).
+ * @param maxNumBlocks Number of blocks in one whole column.
+ * @param query Not read by this function.
+ * @param queryLength Length of query, without padding.
+ * @param target Sequence of symbols, each symbol is used as index into Peq.
+ * @param targetLength
+ * @param alphabetLength Not read by this function.
+ * @param k Largest score of interest. It is lowered while columns are being calculated.
+ * @param bestScore_ Set to score found in last column if whole calculation was performed and
+ * that score is <= k. Set to -1 otherwise (also when calculation stopped at targetStopPosition).
+ * @param position_ Set to targetLength - 1 when bestScore_ is valid, to targetStopPosition when
+ * calculation stopped at that column, and to -1 otherwise.
+ * @param findAlignment If true, P, M, score and band limits of each calculated column are stored in alignData.
+ * @param alignData Data generated during calculation, that is needed for reconstruction of alignment.
+ * It is allocated with new, so free it with delete.
+ * It is constructed as AlignmentData(maxNumBlocks, targetLength) if findAlignment is true,
+ * as AlignmentData(maxNumBlocks, 1) if targetStopPosition > -1, and set to NULL otherwise.
+ * It is not assigned at all if function returns before allocation
+ * (on error, or when k < abs(targetLength - queryLength)).
+ * @param targetStopPosition If set to -1, whole calculation is performed.
+ * If set to p, calculation is performed up to position p in target (inclusive)
+ * and column p is returned as the only column in alignData.
+ * @return EDLIB_STATUS_ERROR if findAlignment is true and targetStopPosition > -1 at the same time,
+ * EDLIB_STATUS_OK otherwise (also when no score <= k was found).
+ */
+static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks,
+ const unsigned char* query, int queryLength,
+ const unsigned char* target, int targetLength,
+ int alphabetLength, int k, int* bestScore_, int* position_,
+ bool findAlignment, AlignmentData** alignData,
+ int targetStopPosition) {
+ if (targetStopPosition > -1 && findAlignment) {
+ // They can not be both set at the same time!
+ return EDLIB_STATUS_ERROR;
+ }
+
+ // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
+ const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dynamically (based on query and target lengths?), so it does not affect speed of computation
+
+ if (k < abs(targetLength - queryLength)) { // Returns before blocks and alignData are allocated.
+ *bestScore_ = *position_ = -1;
+ return EDLIB_STATUS_OK;
+ }
+
+ k = min(k, max(queryLength, targetLength)); // Upper bound for k
+
+ // firstBlock is 0-based index of first block in Ukkonen band.
+ // lastBlock is 0-based index of last block in Ukkonen band.
+ int firstBlock = 0;
+ // This is optimal now, by my formula.
+ int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
+ Block* bl; // Current block
+
+ Block* blocks = new Block[maxNumBlocks]; // Released with delete[] before every return below.
+
+ // Initialize P, M and score
+ bl = blocks;
+ for (int b = 0; b <= lastBlock; b++) {
+ bl->score = (b + 1) * WORD_SIZE;
+ bl->P = (Word)-1; // All 1s
+ bl->M = (Word)0;
+ bl++;
+ }
+
+ // If we want to find alignment, we have to store needed data.
+ if (findAlignment)
+ *alignData = new AlignmentData(maxNumBlocks, targetLength);
+ else if (targetStopPosition > -1)
+ *alignData = new AlignmentData(maxNumBlocks, 1);
+ else
+ *alignData = NULL;
+
+ const unsigned char* targetChar = target;
+ for (int c = 0; c < targetLength; c++) { // for each column
+ Word* Peq_c = Peq + *targetChar * maxNumBlocks;
+
+ //----------------------- Calculate column -------------------------//
+ int hout = 1;
+ bl = blocks + firstBlock;
+ for (int b = firstBlock; b <= lastBlock; b++) {
+ hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M);
+ bl->score += hout;
+ bl++;
+ }
+ bl--;
+ //------------------------------------------------------------------//
+ // bl now points to last block
+
+ // Update k. I do it only on end of column because it would slow calculation too much otherwise.
+ // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up.
+ k = min(k, bl->score
+ + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
+ + (lastBlock == maxNumBlocks - 1 ? W : 0));
+
+ //---------- Adjust number of blocks according to Ukkonen ----------//
+ //--- Adjust last block ---//
+ // If block is not beneath band, calculate next block. Only next because others are certainly beneath band.
+ if (lastBlock + 1 < maxNumBlocks
+ && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also!
+ ((lastBlock + 1) * WORD_SIZE - 1
+ > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
+ lastBlock++; bl++;
+ bl->P = (Word)-1; // All 1s
+ bl->M = (Word)0;
+ int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M);
+ bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout;
+ hout = newHout;
+ }
+
+ // While block is out of band, move one block up. - This is optimal now, by my formula.
+ // NOTICE: I added + W, and now it works! This has to be added because query is padded with W cells.
+ while (lastBlock >= firstBlock
+ && (bl->score >= k + WORD_SIZE
+ || ((lastBlock + 1) * WORD_SIZE - 1 >
+ k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + W))) {
+ lastBlock--; bl--;
+ }
+ //-------------------------//
+
+ //--- Adjust first block ---//
+ // While outside of band, advance block
+ while (firstBlock <= lastBlock
+ && (blocks[firstBlock].score >= k + WORD_SIZE
+ || ((firstBlock + 1) * WORD_SIZE - 1 <
+ blocks[firstBlock].score - k - targetLength + queryLength + c))) {
+ firstBlock++;
+ }
+ //--------------------------/
+
+
+ // TODO: consider if this part is useful, it does not seem to help much
+ if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction
+ while (lastBlock >= firstBlock) {
+ // If all cells outside of band, remove block
+ vector<int> scores = getBlockCellValues(*bl);
+ int r = (lastBlock + 1) * WORD_SIZE - 1; // Row of cell scores[0]; decremented together with i.
+ bool reduce = true;
+ for (int i = 0; i < WORD_SIZE; i++) {
+ // TODO: Does not work if do not put +1! Why???
+ if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + W + 1) {
+ reduce = false;
+ break;
+ }
+ r--;
+ }
+ if (!reduce) break;
+ lastBlock--; bl--;
+ }
+
+ while (firstBlock <= lastBlock) {
+ // If all cells outside of band, remove block
+ vector<int> scores = getBlockCellValues(blocks[firstBlock]);
+ int r = (firstBlock + 1) * WORD_SIZE - 1; // Row of cell scores[0]; decremented together with i.
+ bool reduce = true;
+ for (int i = 0; i < WORD_SIZE; i++) {
+ if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
+ reduce = false;
+ break;
+ }
+ r--;
+ }
+ if (!reduce) break;
+ firstBlock++;
+ }
+ }
+
+
+ // If band stops to exist finish
+ if (lastBlock < firstBlock) {
+ *bestScore_ = *position_ = -1;
+ delete[] blocks; // alignData (if allocated) is left for the caller to delete.
+ return EDLIB_STATUS_OK;
+ }
+ //------------------------------------------------------------------//
+
+
+ //---- Save column so it can be used for reconstruction ----//
+ if (findAlignment && c < targetLength) { // c < targetLength always holds inside of this loop.
+ bl = blocks + firstBlock;
+ for (int b = firstBlock; b <= lastBlock; b++) {
+ (*alignData)->Ps[maxNumBlocks * c + b] = bl->P;
+ (*alignData)->Ms[maxNumBlocks * c + b] = bl->M;
+ (*alignData)->scores[maxNumBlocks * c + b] = bl->score;
+ (*alignData)->firstBlocks[c] = firstBlock; // Same value is written on each iteration of b.
+ (*alignData)->lastBlocks[c] = lastBlock; // Same value is written on each iteration of b.
+ bl++;
+ }
+ }
+ //----------------------------------------------------------//
+
+ //---- If this is stop column, save it and finish ----//
+ if (c == targetStopPosition) {
+ for (int b = firstBlock; b <= lastBlock; b++) { // Stop column is stored as column 0 of alignData.
+ (*alignData)->Ps[b] = (blocks + b)->P;
+ (*alignData)->Ms[b] = (blocks + b)->M;
+ (*alignData)->scores[b] = (blocks + b)->score;
+ (*alignData)->firstBlocks[0] = firstBlock;
+ (*alignData)->lastBlocks[0] = lastBlock;
+ }
+ *bestScore_ = -1;
+ *position_ = targetStopPosition;
+ delete[] blocks;
+ return EDLIB_STATUS_OK;
+ }
+ //----------------------------------------------------//
+
+ targetChar++;
+ }
+
+ if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated
+ // Obtain best score from block -> it is complicated because query is padded with W cells
+ int bestScore = getBlockCellValues(blocks[lastBlock])[W];
+ if (bestScore <= k) {
+ *bestScore_ = bestScore;
+ *position_ = targetLength - 1;
+ delete[] blocks;
+ return EDLIB_STATUS_OK;
+ }
+ }
+
+ *bestScore_ = *position_ = -1; // No score <= k was found in last column.
+ delete[] blocks;
+ return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix,
+ * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength).
+ * Traceback starts in column targetLength - 1, in last block, at the cell of last (unpadded) row of query.
+ * Edit operations are appended while moving back and the whole sequence is reversed before returning.
+ * @param [in] queryLength Normal length, without W.
+ * @param [in] targetLength Normal length, without W.
+ * @param [in] bestScore Best score. It is taken as score of the cell where traceback starts.
+ * @param [in] alignData Data obtained during finding best score that is useful for finding alignment.
+ * @param [out] alignment Alignment. It is allocated here with malloc (queryLength + targetLength - 1 elements)
+ * and shrunk with realloc to alignmentLength elements, so free it with free.
+ * @param [out] alignmentLength Length of alignment.
+ * @return Status code. Every path through this function returns EDLIB_STATUS_OK.
+ * NOTE(review): results of malloc and realloc are not checked here - confirm that callers can rely on that.
+ */
+static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
+ const int bestScore, const AlignmentData* alignData,
+ unsigned char** alignment, int* alignmentLength) {
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
+
+ *alignment = (unsigned char*) malloc((queryLength + targetLength - 1) * sizeof(unsigned char));
+ *alignmentLength = 0;
+ int c = targetLength - 1; // index of column
+ int b = maxNumBlocks - 1; // index of block in column
+ int currScore = bestScore; // Score of current cell
+ int lScore = -1; // Score of left cell, -1 means that it is not known (yet)
+ int uScore = -1; // Score of upper cell, -1 means that it is not known (yet)
+ int ulScore = -1; // Score of upper left cell, -1 means that it is not known (yet)
+ Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block
+ Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block
+ // True if block to left exists and is in band
+ bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
+ Word lP = 0, lM = 0;
+ if (thereIsLeftBlock) {
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left
+ }
+ currP <<= W; // Shift out W padding cells, so highest bit belongs to current cell.
+ currM <<= W;
+ int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in block
+ if (c == 0) { // Left neighbours are boundary cells, their scores are known directly.
+ thereIsLeftBlock = true;
+ lScore = b * WORD_SIZE + blockPos + 1;
+ ulScore = lScore - 1;
+ }
+ while (true) {
+ // TODO: improvement: calculate only those cells that are needed,
+ // for example if I calculate upper cell and can move up,
+ // there is no need to calculate left and upper left cell
+ //---------- Calculate scores ---------//
+ if (lScore == -1 && thereIsLeftBlock) {
+ lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left
+ for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) { // Walk from stored block score to row of current cell.
+ if (lP & HIGH_BIT_MASK) lScore--;
+ if (lM & HIGH_BIT_MASK) lScore++;
+ lP <<= 1;
+ lM <<= 1;
+ }
+ }
+ if (ulScore == -1) {
+ if (lScore != -1) {
+ ulScore = lScore;
+ if (lP & HIGH_BIT_MASK) ulScore--;
+ if (lM & HIGH_BIT_MASK) ulScore++;
+ }
+ else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) {
+ // This is the case when upper left cell is last cell in block,
+ // and block to left is not in band so lScore is -1.
+ ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1];
+ }
+ }
+ if (uScore == -1) {
+ uScore = currScore;
+ if (currP & HIGH_BIT_MASK) uScore--;
+ if (currM & HIGH_BIT_MASK) uScore++;
+ currP <<= 1;
+ currM <<= 1;
+ }
+ //-------------------------------------//
+
+ // TODO: should I check if there is upper block?
+
+ //-------------- Move --------------//
+ // Move up - insertion to target - deletion from query
+ if (uScore != -1 && uScore + 1 == currScore) {
+ currScore = uScore;
+ lScore = ulScore;
+ uScore = ulScore = -1;
+ if (blockPos == 0) { // If entering new (upper) block
+ if (b == 0) { // If there are no cells above (only boundary cells)
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up
+ for (int i = 0; i < c + 1; i++) // Move left until end
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+ break;
+ } else {
+ blockPos = WORD_SIZE - 1;
+ b--;
+ currP = alignData->Ps[c * maxNumBlocks + b];
+ currM = alignData->Ms[c * maxNumBlocks + b];
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+ thereIsLeftBlock = true;
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+ } else {
+ thereIsLeftBlock = false;
+ }
+ }
+ } else {
+ blockPos--;
+ lP <<= 1;
+ lM <<= 1;
+ }
+ // Mark move
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+ }
+ // Move left - deletion from target - insertion to query
+ else if (lScore != -1 && lScore + 1 == currScore) {
+ currScore = lScore;
+ uScore = ulScore;
+ lScore = ulScore = -1;
+ c--;
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left
+ int numUp = b * WORD_SIZE + blockPos + 1;
+ for (int i = 0; i < numUp; i++) // Move up until end
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+ break;
+ }
+ currP = lP;
+ currM = lM;
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+ thereIsLeftBlock = true;
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+ } else {
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
+ thereIsLeftBlock = true;
+ lScore = b * WORD_SIZE + blockPos + 1;
+ ulScore = lScore - 1;
+ } else {
+ thereIsLeftBlock = false;
+ }
+ }
+ // Mark move
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+ }
+ // Move up left - (mis)match
+ else if (ulScore != -1) {
+ unsigned char moveCode = ulScore == currScore ? EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH;
+ currScore = ulScore;
+ uScore = lScore = ulScore = -1;
+ c--;
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move left
+ int numUp = b * WORD_SIZE + blockPos;
+ for (int i = 0; i < numUp; i++) // Move up until end
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+ break;
+ }
+ if (blockPos == 0) { // If entering upper left block
+ if (b == 0) { // If there are no more cells above (only boundary cells)
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move up left
+ for (int i = 0; i < c + 1; i++) // Move left until end
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+ break;
+ }
+ blockPos = WORD_SIZE - 1;
+ b--;
+ currP = alignData->Ps[c * maxNumBlocks + b];
+ currM = alignData->Ms[c * maxNumBlocks + b];
+ } else { // If entering left block
+ blockPos--;
+ currP = lP;
+ currM = lM;
+ currP <<= 1;
+ currM <<= 1;
+ }
+ // Set new left block
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+ thereIsLeftBlock = true;
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+ } else {
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
+ thereIsLeftBlock = true;
+ lScore = b * WORD_SIZE + blockPos + 1;
+ ulScore = lScore - 1;
+ } else {
+ thereIsLeftBlock = false;
+ }
+ }
+ // Mark move
+ (*alignment)[(*alignmentLength)++] = moveCode;
+ } else {
+ // Reached end - finished!
+ break;
+ }
+ //----------------------------------//
+ }
+
+ *alignment = (unsigned char*) realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)); // Shrink to used length.
+ reverse(*alignment, *alignment + (*alignmentLength)); // Moves were recorded from end to start.
+ return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Builds one alignment whose score is the given optimal score (bestScore).
+ * Small problems are solved directly with the traceback algorithm; bigger ones are handed to
+ * Hirschberg's algorithm, which splits them and calls back into this function for the pieces.
+ * @param [in] query
+ * @param [in] rQuery  Reversed query.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] rTarget  Reversed target.
+ * @param [in] targetLength
+ * @param [in] alphabetLength
+ * @param [in] bestScore  Best (optimal) score.
+ * @param [out] alignment  Sequence of edit operations that make target equal to query.
+ *                         Allocated with malloc in the empty-sequence case handled here.
+ * @param [out] alignmentLength  Length of alignment.
+ * @return Status code.
+ */
+static int obtainAlignment(const unsigned char* query, const unsigned char* rQuery, const int queryLength,
+                           const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+                           const int alphabetLength, const int bestScore,
+                           unsigned char** alignment, int* alignmentLength) {
+    // Degenerate case: with an empty sequence the alignment is a run of one single operation.
+    if (queryLength == 0 || targetLength == 0) {
+        const unsigned char onlyOperation = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT;
+        *alignmentLength = targetLength + queryLength;
+        *alignment = (unsigned char*) malloc((*alignmentLength) * sizeof(unsigned char));
+        for (int pos = 0; pos < *alignmentLength; pos++) {
+            (*alignment)[pos] = onlyOperation;
+        }
+        return EDLIB_STATUS_OK;
+    }
+
+    const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    const int W = maxNumBlocks * WORD_SIZE - queryLength;
+
+    // TODO: think about reducing number of memory allocations in alignment functions, probably
+    // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg,
+    // and it could also be done for alignments - we could have one big array for alignment that would be
+    // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results.
+
+    // Estimated memory needed by the traceback algorithm. Below 1MB traceback is used,
+    // from 1MB on Hirschberg's algorithm is used (boundary of 1MB was chosen by running a few tests).
+    long long alignmentDataSize = (long long) (2 * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength
+        + (long long) 2 * sizeof(int) * targetLength;
+    if (alignmentDataSize >= 1024 * 1024) {
+        return obtainAlignmentHirschberg(query, rQuery, queryLength,
+                                         target, rTarget, targetLength,
+                                         alphabetLength, bestScore,
+                                         alignment, alignmentLength);
+    }
+
+    // Recompute the whole matrix while storing it, then walk back through it.
+    int foundScore, foundEndLocation;  // Only needed as output arguments (and for the asserts).
+    AlignmentData* storedColumns = NULL;
+    Word* Peq = buildPeq(alphabetLength, query, queryLength);
+    myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+                            query, queryLength,
+                            target, targetLength,
+                            alphabetLength, bestScore,
+                            &foundScore, &foundEndLocation, true, &storedColumns, -1);
+    assert(foundScore == bestScore);
+    assert(foundEndLocation == targetLength - 1);
+
+    const int statusCode = obtainAlignmentTraceback(queryLength, targetLength,
+                                                    bestScore, storedColumns,
+                                                    alignment, alignmentLength);
+    delete storedColumns;
+    delete[] Peq;
+    return statusCode;
+}
+
+
+/**
+ * Finds one possible alignment that gives optimal score (bestScore).
+ * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together.
+ * The column at which the left and right half of target meet is calculated from both sides (the right side
+ * on reversed sequences); the row where the two partial scores sum up to bestScore is where the alignment
+ * crosses, and obtainAlignment() is then called for the upper left and lower right sub-problems.
+ * @param [in] query
+ * @param [in] rQuery  Reversed query.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] rTarget  Reversed target.
+ * @param [in] targetLength
+ * @param [in] alphabetLength
+ * @param [in] bestScore  Best(optimal) score.
+ * @param [out] alignment  Sequence of edit operations that make target equal to query.
+ *                         Allocated with malloc on success, so free it with free. Not assigned on error.
+ * @param [out] alignmentLength  Length of alignment. Not assigned on error.
+ * @return EDLIB_STATUS_OK on success. EDLIB_STATUS_ERROR if one of the halves could not be calculated
+ *         up to its stop column (this includes targetLength < 2, where there is no left half to stop at),
+ *         if no row gives bestScore, or if one of the sub-problems fails.
+ */
+static int obtainAlignmentHirschberg(
+        const unsigned char* query, const unsigned char* rQuery, const int queryLength,
+        const unsigned char* target, const unsigned char* rTarget, const int targetLength,
+        const int alphabetLength, const int bestScore,
+        unsigned char** alignment, int* alignmentLength) {
+    const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    const int W = maxNumBlocks * WORD_SIZE - queryLength;
+
+    Word* Peq = buildPeq(alphabetLength, query, queryLength);
+    Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength);
+
+    // Divide dynamic matrix into two halfs, left and right.
+    const int leftHalfWidth = targetLength / 2;
+    const int rightHalfWidth = targetLength - leftHalfWidth;
+
+    int score_ = -1;  // Used only to call functions.
+    // Column at which each calculation stopped; equals the requested stop column on success.
+    int leftEndLocation = -1;
+    int rightEndLocation = -1;
+
+    // Calculate left half.
+    AlignmentData* alignDataLeftHalf = NULL;
+    const int leftStatusCode = myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+                                                       query, queryLength,
+                                                       target, targetLength,
+                                                       alphabetLength, bestScore,
+                                                       &score_, &leftEndLocation, false,
+                                                       &alignDataLeftHalf, leftHalfWidth - 1);
+
+    // Calculate right half.
+    AlignmentData* alignDataRightHalf = NULL;
+    const int rightStatusCode = myersCalcEditDistanceNW(rPeq, W, maxNumBlocks,
+                                                        rQuery, queryLength,
+                                                        rTarget, targetLength,
+                                                        alphabetLength, bestScore,
+                                                        &score_, &rightEndLocation, false,
+                                                        &alignDataRightHalf, rightHalfWidth - 1);
+
+    delete[] Peq;
+    delete[] rPeq;
+
+    // The stop column is stored in alignData only if the calculation really reached it. If it did not
+    // (band stopped to exist earlier, k was too small, or there was no stop column because
+    // leftHalfWidth is 0), alignData is NULL or its column was never written, so it must not be read.
+    if (leftStatusCode == EDLIB_STATUS_ERROR || rightStatusCode == EDLIB_STATUS_ERROR
+        || alignDataLeftHalf == NULL || alignDataRightHalf == NULL
+        || leftEndLocation != leftHalfWidth - 1 || rightEndLocation != rightHalfWidth - 1) {
+        delete alignDataLeftHalf;   // delete of NULL does nothing.
+        delete alignDataRightHalf;
+        return EDLIB_STATUS_ERROR;
+    }
+
+    // Unwrap the left half.
+    int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
+    int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
+    // TODO: avoid this allocation by using some shared array?
+    // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index)
+    // and ending with scoresLeftEndIdx row (0-indexed).
+    int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE;
+    int* scoresLeft = new int[scoresLeftLength];
+    for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) {
+        Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx],
+                    alignDataLeftHalf->scores[blockIdx]);
+        readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE);
+    }
+    int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE;
+    // If last block contains padding, shorten the length of scores for the length of padding.
+    if (lastBlockIdxLeft == maxNumBlocks - 1) {
+        scoresLeftLength -= W;
+    }
+
+    // Unwrap the right half (I also reverse it while unwraping).
+    int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0];
+    int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0];
+    int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE;
+    int* scoresRight = new int[scoresRightLength];
+    int* scoresRightOriginalStart = scoresRight;
+    for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) {
+        Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx],
+                    alignDataRightHalf->scores[blockIdx]);
+        readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE);
+    }
+    int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE;
+    // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do),
+    // move pointer forward to remove the padding (that is why we remember originalStart).
+    if (scoresRightStartIdx < 0) {
+        assert(scoresRightStartIdx == -1 * W);
+        scoresRight += W;
+        scoresRightStartIdx += W;
+        scoresRightLength -= W;
+    }
+
+    delete alignDataLeftHalf;
+    delete alignDataRightHalf;
+
+    //--------------------- Find the best move ----------------//
+    // Find the query/row index of cell in left column which together with its lower right neighbour
+    // from right column gives the best score (when summed). We also have to consider boundary cells
+    // (those cells at -1 indexes).
+    //  x|
+    //  -+-
+    //   |x
+    int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1);
+    int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1,
+                              scoresRightStartIdx + scoresRightLength - 2);
+    // leftScore and rightScore keep the values of the last examined pair, which is the found one
+    // whenever queryIdxLeftAlignmentFound is true.
+    int leftScore = -1, rightScore = -1;
+    int queryIdxLeftAlignment = -1;  // Query/row index of cell in left column where alignment is passing through.
+    bool queryIdxLeftAlignmentFound = false;
+    for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) {
+        leftScore = scoresLeft[queryIdx - scoresLeftStartIdx];
+        rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx];
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = queryIdx;
+            queryIdxLeftAlignmentFound = true;
+            break;
+        }
+    }
+    // Check boundary cells.
+    if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) {
+        leftScore = leftHalfWidth;
+        rightScore = scoresRight[0];
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = -1;
+            queryIdxLeftAlignmentFound = true;
+        }
+    }
+    if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength
+        && scoresRightStartIdx + scoresRightLength == queryLength) {
+        leftScore = scoresLeft[scoresLeftLength - 1];
+        rightScore = rightHalfWidth;
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = queryLength - 1;
+            queryIdxLeftAlignmentFound = true;
+        }
+    }
+
+    delete[] scoresLeft;
+    delete[] scoresRightOriginalStart;
+
+    if (queryIdxLeftAlignmentFound == false) {
+        // If there was no move that is part of optimal alignment, then there is no such alignment
+        // or given bestScore is not correct!
+        return EDLIB_STATUS_ERROR;
+    }
+    //----------------------------------------------------------//
+
+    // Calculate alignments for upper half of left half (upper left - ul)
+    // and lower half of right half (lower right - lr).
+    const int ulHeight = queryIdxLeftAlignment + 1;
+    const int lrHeight = queryLength - ulHeight;
+    const int ulWidth = leftHalfWidth;
+    const int lrWidth = rightHalfWidth;
+    unsigned char* ulAlignment = NULL; int ulAlignmentLength;
+    int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
+                                       target, rTarget + lrWidth, ulWidth,
+                                       alphabetLength, leftScore, &ulAlignment, &ulAlignmentLength);
+    unsigned char* lrAlignment = NULL; int lrAlignmentLength;
+    int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
+                                       target + ulWidth, rTarget, lrWidth,
+                                       alphabetLength, rightScore, &lrAlignment, &lrAlignmentLength);
+    if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
+        free(ulAlignment);  // free of NULL does nothing.
+        free(lrAlignment);
+        return EDLIB_STATUS_ERROR;
+    }
+
+    // Build alignment by concatenating upper left alignment with lower right alignment.
+    *alignmentLength = ulAlignmentLength + lrAlignmentLength;
+    *alignment = (unsigned char*) malloc((*alignmentLength) * sizeof(unsigned char));
+    memcpy(*alignment, ulAlignment, ulAlignmentLength);
+    memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength);
+
+    free(ulAlignment);
+    free(lrAlignment);
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences
+ * where elements in sequences are not any more letters of alphabet, but their index in alphabet.
+ * Most of internal edlib functions expect such transformed sequences.
+ * This function will allocate queryTransformed and targetTransformed (with malloc),
+ * so make sure to free them when done.
+ * Letters are numbered in order of their first appearance, query first and then target.
+ * Every char value is accepted: letters are looked up by their unsigned char value, so bytes
+ * outside of the 7-bit ASCII range get their own place in alphabet as well.
+ * Example:
+ *   Original sequences: "ACT" and "CGT".
+ *   Alphabet would be recognized as ['A', 'C', 'T', 'G']. Alphabet length = 4.
+ *   Transformed sequences: [0, 1, 2] and [1, 3, 2].
+ * @param [in] queryOriginal
+ * @param [in] queryLength
+ * @param [in] targetOriginal
+ * @param [in] targetLength
+ * @param [out] queryTransformed  It will contain values in range [0, alphabet length - 1].
+ * @param [out] targetTransformed  It will contain values in range [0, alphabet length - 1].
+ * @return Alphabet length - number of letters in recognized alphabet.
+ */
+static int transformSequences(const char* queryOriginal, const int queryLength,
+                              const char* targetOriginal, const int targetLength,
+                              unsigned char** queryTransformed, unsigned char** targetTransformed) {
+    // Alphabet is constructed from letters that are present in sequences.
+    // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
+    // and new query and target are created in which letters are replaced with their ordinal numbers.
+    // This query and target are used in all the calculations later.
+    *queryTransformed = (unsigned char *) malloc(sizeof(unsigned char) * queryLength);
+    *targetTransformed = (unsigned char *) malloc(sizeof(unsigned char) * targetLength);
+
+    // One table entry for each value that an (8-bit) unsigned char can take.
+    enum { NUM_LETTER_VALUES = 256 };
+
+    // Alphabet information, it is constructed on fly while transforming sequences.
+    unsigned char letterIdx[NUM_LETTER_VALUES];  //!< letterIdx[c] is index of letter c in alphabet
+    bool inAlphabet[NUM_LETTER_VALUES];  // inAlphabet[c] is true if c is in alphabet
+    for (int i = 0; i < NUM_LETTER_VALUES; i++) inAlphabet[i] = false;
+    int alphabetLength = 0;
+
+    for (int i = 0; i < queryLength; i++) {
+        // Plain char may be signed; convert so that the table index is never negative.
+        const unsigned char c = (unsigned char) queryOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = (unsigned char) alphabetLength;
+            alphabetLength++;
+        }
+        (*queryTransformed)[i] = letterIdx[c];
+    }
+    for (int i = 0; i < targetLength; i++) {
+        const unsigned char c = (unsigned char) targetOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = (unsigned char) alphabetLength;
+            alphabetLength++;
+        }
+        (*targetTransformed)[i] = letterIdx[c];
+    }
+
+    return alphabetLength;
+}
+
+
+/**
+ * Packs the three given settings into an EdlibAlignConfig.
+ * @param k  Value stored in the k field.
+ * @param mode  Value stored in the mode field.
+ * @param task  Value stored in the task field.
+ * @return Configuration with k, mode and task set; no other field is assigned here.
+ */
+EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task) {
+    EdlibAlignConfig newConfig;
+    newConfig.task = task;
+    newConfig.mode = mode;
+    newConfig.k = k;
+    return newConfig;
+}
+
+/**
+ * Builds the default configuration: k is -1, mode is EDLIB_MODE_NW and task is EDLIB_TASK_DISTANCE.
+ * @return Configuration created by edlibNewAlignConfig() from those three values.
+ */
+EdlibAlignConfig edlibDefaultAlignConfig() {
+    const int defaultK = -1;
+    EdlibAlignConfig defaults = edlibNewAlignConfig(defaultK, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE);
+    return defaults;
+}
+
+/**
+ * Releases the arrays that belong to an alignment result.
+ * Members that are NULL need no special handling, because free(NULL) does nothing.
+ * @param result  Result whose endLocations, startLocations and alignment are passed to free().
+ *                It is taken by value, so the pointers in the caller's copy are not reset here.
+ */
+void edlibFreeAlignResult(EdlibAlignResult result) {
+    free(result.endLocations);
+    free(result.startLocations);
+    free(result.alignment);
+}
diff --git a/src/overlapInCore/libedlib/edlib.H b/src/overlapInCore/libedlib/edlib.H
new file mode 100644
index 0000000..ee9b320
--- /dev/null
+++ b/src/overlapInCore/libedlib/edlib.H
@@ -0,0 +1,270 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Sergey Koren beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2014 Martin Šošić
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef EDLIB_H
+#define EDLIB_H
+
+/**
+ * @file
+ * @author Martin Sosic
+ * @brief Main header file, containing all public functions and structures.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Status codes
+#define EDLIB_STATUS_OK 0
+#define EDLIB_STATUS_ERROR 1
+
+/**
+ * Alignment methods - how should Edlib treat gaps before and after query?
+ */
+typedef enum {
+ /**
+ * Global method. This is the standard method.
+ * Useful when you want to find out how similar the first sequence is to the second sequence.
+ */
+ EDLIB_MODE_NW,
+ /**
+ * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
+ * What that means is that deleting elements from the end of second sequence is "free"!
+ * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
+ * of second sequence is "free" and does not count into total edit distance. This method is appropriate
+ * when you want to find out how well first sequence fits at the beginning of second sequence.
+ */
+ EDLIB_MODE_SHW,
+ /**
+ * Infix method. Similar to the prefix method, but with one more twist - gaps at query end and start are
+ * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
+ * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
+ * and GAC from the end of second sequence is "free" and does not count into total edit distance.
+ * This method is appropriate when you want to find out how well first sequence fits at any part of
+ * second sequence.
+ * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
+ * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
+ * that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
+ */
+ EDLIB_MODE_HW
+} EdlibAlignMode;
+
+/**
+ * Alignment tasks - what do you want Edlib to do?
+ */
+typedef enum {
+ EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
+ EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
+ EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
+} EdlibAlignTask;
+
+/**
+ * Describes cigar format.
+ * @see http://samtools.github.io/hts-specs/SAMv1.pdf
+ * @see http://drive5.com/usearch/manual/cigar.html
+ */
+typedef enum {
+ EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
+ EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
+} EdlibCigarFormat;
+
+// Edit operations.
+#define EDLIB_EDOP_MATCH 0 //!< Match.
+#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
+#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
+#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
+
+
+
+ /**
+ * @brief Configuration object for edlibAlign() function.
+ */
+ typedef struct {
+ /**
+ * Set k to non-negative value to tell edlib that edit distance is not larger than k.
+ * Smaller k can significantly improve speed of computation.
+ * If edit distance is larger than k, edlib will set edit distance to -1.
+ * Set k to negative value and edlib will internally auto-adjust k until score is found.
+ */
+ int k;
+
+ /**
+ * Alignment method.
+ * EDLIB_MODE_NW: global (Needleman-Wunsch)
+ * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
+ * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
+ */
+ EdlibAlignMode mode;
+
+ /**
+ * Alignment task - tells Edlib what to calculate. The less there is to calculate, the faster it is.
+ * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
+ * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
+ * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
+ */
+ EdlibAlignTask task;
+ } EdlibAlignConfig;
+
+ /**
+ * Helper method for easy construction of configuration object.
+ * @param [in] k Value stored in EdlibAlignConfig.k.
+ * @param [in] mode Value stored in EdlibAlignConfig.mode.
+ * @param [in] task Value stored in EdlibAlignConfig.task.
+ * @return Configuration object filled with given parameters.
+ */
+ EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task);
+
+ /**
+ * Takes no arguments.
+ * @return Default configuration object, with following defaults:
+ * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE.
+ */
+ EdlibAlignConfig edlibDefaultAlignConfig(void);
+
+
+ /**
+ * Container for results of alignment done by edlibAlign() function.
+ */
+ typedef struct {
+ /**
+ * Edit distance between query and target.
+ * -1 if k is non-negative and edit distance is larger than k.
+ */
+ int editDistance;
+ /**
+ * Array of zero-based positions in target where optimal alignment paths end.
+ * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
+ * Set to NULL if edit distance is larger than k.
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+ */
+ int* endLocations;
+ /**
+ * Array of zero-based positions in target where optimal alignment paths start,
+ * they correspond to endLocations.
+ * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
+ * Set to NULL if not calculated or if edit distance is larger than k.
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+ */
+ int* startLocations;
+ /**
+ * Number of end (and start) locations.
+ */
+ int numLocations;
+ /**
+ * Alignment is found for first pair of start and end locations.
+ * Set to NULL if not calculated.
+ * Alignment is sequence of numbers: 0, 1, 2, 3.
+ * 0 stands for match.
+ * 1 stands for insertion to target.
+ * 2 stands for insertion to query.
+ * 3 stands for mismatch.
+ * Alignment aligns query to target from beginning of query till end of query.
+ * If gaps are not penalized, they are not in alignment.
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+ */
+ unsigned char* alignment;
+ /**
+ * Length of alignment.
+ */
+ int alignmentLength;
+ /**
+ * Number of different characters in query and target together.
+ */
+ int alphabetLength;
+ } EdlibAlignResult;
+
+ /**
+ * Frees memory in EdlibAlignResult that was allocated by edlib.
+ * If you do not use it, make sure to free needed members manually using free().
+ * The result is passed by value, so the caller's copy still holds the freed
+ * pointers afterwards; they must not be used or freed again.
+ */
+ void edlibFreeAlignResult(EdlibAlignResult result);
+
+
+ /**
+ * Aligns two sequences (query and target) using edit distance (levenshtein distance).
+ * Through config parameter, this function supports different alignment methods (global, prefix, infix),
+ * as well as different modes of search (tasks).
+ * It always returns edit distance and end locations of optimal alignment in target.
+ * It optionally returns start locations of optimal alignment in target and alignment path,
+ * if you choose appropriate tasks.
+ * @param [in] query First sequence. Character codes should be in range [0, 127].
+ * @param [in] queryLength Number of characters in first sequence.
+ * @param [in] target Second sequence. Character codes should be in range [0, 127].
+ * @param [in] targetLength Number of characters in second sequence.
+ * @param [in] config Additional alignment parameters, like alignment method and wanted results.
+ * @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
+ * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
+ */
+ EdlibAlignResult edlibAlign(const char* query, const int queryLength,
+ const char* target, const int targetLength,
+ EdlibAlignConfig config);
+
+
+ /**
+ * Builds cigar string from given alignment sequence.
+ * @param [in] alignment Alignment sequence.
+ * 0 stands for match.
+ * 1 stands for insertion to target.
+ * 2 stands for insertion to query.
+ * 3 stands for mismatch.
+ * @param [in] alignmentLength Length of the alignment sequence.
+ * @param [in] cigarFormat Cigar will be returned in specified format.
+ * @return Cigar string.
+ * I stands for insertion.
+ * D stands for deletion.
+ * X stands for mismatch. (used only in extended format)
+ * = stands for match. (used only in extended format)
+ * M stands for (mis)match. (used only in standard format)
+ * String is null terminated.
+ * Needed memory is allocated and a pointer to it is returned.
+ * Do not forget to free it later using free()!
+ */
+ char* edlibAlignmentToCigar(unsigned char* alignment, int alignmentLength,
+ EdlibCigarFormat cigarFormat);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EDLIB_H
diff --git a/src/overlapInCore/overlapConvert.C b/src/overlapInCore/overlapConvert.C
index 474f5a0..cc92708 100644
--- a/src/overlapInCore/overlapConvert.C
+++ b/src/overlapInCore/overlapConvert.C
@@ -42,7 +42,7 @@ main(int argc, char **argv) {
gkStore *gkpStore = NULL;
ovOverlapDisplayType dt = ovOverlapAsCoords;
-
+ bool native = false;
vector<char *> files;
@@ -61,6 +61,9 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-raw") == 0) {
dt = ovOverlapAsRaw;
+ } else if (strcmp(argv[arg], "-native") == 0) {
+ native = true;
+
} else if (AS_UTL_fileExists(argv[arg])) {
files.push_back(argv[arg]);
@@ -83,6 +86,9 @@ main(int argc, char **argv) {
fprintf(stderr, " -coords output coordiantes on reads\n");
fprintf(stderr, " -hangs output hangs on reads\n");
fprintf(stderr, " -raw output raw hangs on reads\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -native input ovb file is NOT snappy compressed\n");
+ fprintf(stderr, "\n");
if ((gkpStoreName == NULL) && (dt == ovOverlapAsCoords))
fprintf(stderr, "ERROR: -coords mode requires a gkpStore (-G)\n");
@@ -99,9 +105,12 @@ main(int argc, char **argv) {
char *ovStr = new char [1024];
for (uint32 ff=0; ff<files.size(); ff++) {
- ovFile *of = new ovFile(files[ff], ovFileFull);
+ ovFile *of = new ovFile(gkpStore, files[ff], ovFileFull);
ovOverlap ov(gkpStore);
+ if (native == true)
+ of->enableSnappy(false);
+
while (of->readOverlap(&ov))
fputs(ov.toString(ovStr, dt, true), stdout);
diff --git a/src/overlapInCore/overlapImport.C b/src/overlapInCore/overlapImport.C
index 2990996..59630f9 100644
--- a/src/overlapInCore/overlapImport.C
+++ b/src/overlapInCore/overlapImport.C
@@ -36,6 +36,7 @@
#include "ovStore.H"
#include "splitToWords.H"
+#include "mt19937ar.H"
#include <vector>
@@ -46,7 +47,8 @@ using namespace std;
#define TYPE_COORDS 'C'
#define TYPE_HANGS 'H'
#define TYPE_RAW 'R'
-
+#define TYPE_OVB 'O'
+#define TYPE_RANDOM 'r'
int
@@ -59,6 +61,10 @@ main(int argc, char **argv) {
char inType = TYPE_NONE;
+ uint64 numRandom = 0;
+
+ bool native = false;
+
vector<char *> files;
@@ -88,6 +94,18 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-raw") == 0) {
inType = TYPE_RAW;
+ } else if (strcmp(argv[arg], "-ovb") == 0) {
+ fprintf(stderr, "-ovb not implemented.\n"), exit(1);
+ inType = TYPE_OVB;
+
+ } else if (strcmp(argv[arg], "-random") == 0) {
+ inType = TYPE_RANDOM;
+ numRandom = strtoull(argv[++arg], NULL, 10);
+ files.push_back(NULL);
+
+ } else if (strcmp(argv[arg], "-native") == 0) {
+ native = true;
+
} else if ((strcmp(argv[arg], "-") == 0) ||
(AS_UTL_fileExists(argv[arg]))) {
files.push_back(argv[arg]);
@@ -111,15 +129,19 @@ main(int argc, char **argv) {
fprintf(stderr, "Required:\n");
fprintf(stderr, " -G name.gkpStore path to valid gatekeeper store\n");
fprintf(stderr, "\n");
- fprintf(stderr, "Output options:\n");
+ fprintf(stderr, "Output Format:\n");
fprintf(stderr, " -o file.ovb output file name\n");
fprintf(stderr, " -O name.ovlStore output overlap store");
fprintf(stderr, "\n");
- fprintf(stderr, "Format options:\n");
+ fprintf(stderr, "Input Format:\n");
fprintf(stderr, " -legacy 'CA8 overlapStore -d' format\n");
fprintf(stderr, " -coords 'overlapConvert -coords' format (not implemented)\n");
fprintf(stderr, " -hangs 'overlapConvert -hangs' format (not implemented)\n");
fprintf(stderr, " -raw 'overlapConvert -raw' format\n");
+ fprintf(stderr, " -ovb 'overlapInCore' format (not implemented)\n");
+ fprintf(stderr, " -random N create N random overlaps, for store testing\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -native output ovb (-o) files will not be snappy compressed\n");
fprintf(stderr, "\n");
fprintf(stderr, "Input file can be stdin ('-') or a gz/bz2/xz compressed file.\n");
fprintf(stderr, "\n");
@@ -128,6 +150,8 @@ main(int argc, char **argv) {
fprintf(stderr, "ERROR: need to supply a gkpStore (-G).\n");
if (inType == TYPE_NONE)
fprintf(stderr, "ERROR: need to supply a format type (-legacy, -coords, -hangs, -raw).\n");
+ if (files.size() == 0)
+ fprintf(stderr, "ERROR: need to supply input files.\n");
exit(1);
}
@@ -135,12 +159,67 @@ main(int argc, char **argv) {
if (gkpStoreName)
gkpStore = gkStore::gkStore_open(gkpStoreName);
- char *S = new char [1024];
- splitToWords W;
- ovOverlap ov(gkpStore);
+ char *S = new char [1024];
+ splitToWords W;
+ ovOverlap ov(gkpStore);
+
+ ovFile *of = (ovlFileName == NULL) ? NULL : new ovFile(gkpStore, ovlFileName, ovFileFullWrite);
+ ovStoreWriter *os = (ovlStoreName == NULL) ? NULL : new ovStoreWriter(ovlStoreName, gkpStore);
+
+ if ((of) && (native == true))
+ of->enableSnappy(false);
+
+ // Make random inputs first.
+
+ if (inType == TYPE_RANDOM) {
+ mtRandom mt;
+
+ for (uint64 ii=0; ii<numRandom; ii++) {
+ uint32 aID = floor(mt.mtRandomRealOpen() * gkpStore->gkStore_getNumReads()) + 1;
+ uint32 bID = floor(mt.mtRandomRealOpen() * gkpStore->gkStore_getNumReads()) + 1;
+
+#if 0
+ // For testing when reads have no overlaps in store building. Issue #302.
+ aID = aID & 0xfffffff0;
+ bID = bID & 0xfffffff0;
+
+ if (aID == 0) aID = 1;
+ if (bID == 0) bID = 1;
+#endif
+
+ uint32 aLen = gkpStore->gkStore_getRead(aID)->gkRead_sequenceLength();
+ uint32 bLen = gkpStore->gkStore_getRead(bID)->gkRead_sequenceLength();
+
+ bool olapFlip = mt.mtRandom32() % 2;
+
+ // We could be fancy and make actual overlaps that make sense, or punt and make overlaps that
+ // are valid but nonsense.
+
+ ov.a_iid = aID;
+ ov.b_iid = bID;
+
+ ov.flipped(olapFlip);
+
+ ov.a_hang((int32)(mt.mtRandomRealOpen() * 2 * aLen - aLen));
+ ov.b_hang((int32)(mt.mtRandomRealOpen() * 2 * bLen - bLen));
+
+ ov.dat.ovl.forOBT = false;
+ ov.dat.ovl.forDUP = false;
+ ov.dat.ovl.forUTG = true;
+
+ ov.erate(mt.mtRandomRealOpen() * 0.1);
+
+ if (of)
+ of->writeOverlap(&ov);
+
+ if (os)
+ os->writeOverlap(&ov);
+ }
+
+ files.pop_back();
+ }
- ovFile *of = (ovlFileName == NULL) ? NULL : new ovFile(ovlFileName, ovFileFullWrite);
- ovStore *os = (ovlStoreName == NULL) ? NULL : new ovStore(ovlStoreName, gkpStore, ovStoreWrite);
+ // Now process any files.
for (uint32 ff=0; ff<files.size(); ff++) {
compressedFileReader *in = new compressedFileReader(files[ff]);
diff --git a/src/overlapInCore/overlapInCore-Build_Hash_Index.C b/src/overlapInCore/overlapInCore-Build_Hash_Index.C
index e42128a..9d749da 100644
--- a/src/overlapInCore/overlapInCore-Build_Hash_Index.C
+++ b/src/overlapInCore/overlapInCore-Build_Hash_Index.C
@@ -91,7 +91,7 @@ Add_Extra_Hash_String(const char *s) {
if (sub >= String_Start_Size) {
uint64 n = max(sub * 1.1, String_Start_Size * 1.5);
- //fprintf(stderr, "REALLOC String_Start from "F_U64" to "F_U64"\n", String_Start_Size, n);
+ //fprintf(stderr, "REALLOC String_Start from " F_U64 " to " F_U64 "\n", String_Start_Size, n);
resizeArray(String_Start, String_Start_Size, String_Start_Size, n);
}
@@ -105,7 +105,7 @@ Add_Extra_Hash_String(const char *s) {
if (new_len >= Extra_Data_Len) {
uint64 n = max(new_len * 1.1, Extra_Data_Len * 1.5);
- //fprintf(stderr, "REALLOC basesData from "F_U64" to "F_U64"\n", Extra_Data_Len, n);
+ //fprintf(stderr, "REALLOC basesData from " F_U64 " to " F_U64 "\n", Extra_Data_Len, n);
resizeArray(basesData, Extra_Data_Len, Extra_Data_Len, n);
}
@@ -280,7 +280,7 @@ Mark_Skip_Kmers(void) {
Hash_Mark_Empty (key, line);
}
- fprintf (stderr, "String_Ct = "F_U64" Extra_String_Ct = "F_U64" Extra_String_Subcount = "F_U64"\n",
+ fprintf (stderr, "String_Ct = " F_U64 " Extra_String_Ct = " F_U64 " Extra_String_Subcount = " F_U64 "\n",
String_Ct, Extra_String_Ct, Extra_String_Subcount);
fprintf (stderr, "Read %d kmers to mark to skip\n", ct / 2);
}
@@ -357,7 +357,7 @@ Hash_Insert(String_Ref_t Ref, uint64 Key, char * S) {
// global variables basesData, String_Start, String_Info, ....
static
void
-Put_String_In_Hash(uint32 curID, uint32 i) {
+Put_String_In_Hash(uint32 UNUSED(curID), uint32 i) {
String_Ref_t ref = 0;
int skip_ct;
uint64 key;
@@ -445,7 +445,7 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
uint64 total_len;
uint64 hash_entry_limit;
- fprintf(stderr, "Build_Hash_Index from "F_U32" to "F_U32"\n", bgnID, endID);
+ fprintf(stderr, "Build_Hash_Index from " F_U32 " to " F_U32 "\n", bgnID, endID);
Hash_String_Num_Offset = bgnID;
String_Ct = 0;
@@ -474,10 +474,10 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
hash_entry_limit = G.Max_Hash_Load * HASH_TABLE_SIZE * ENTRIES_PER_BUCKET;
#if 0
- fprintf(stderr, "HASH LOADING STARTED: fragID %12"F_U64P"\n", first_frag_id);
- fprintf(stderr, "HASH LOADING STARTED: strings %12"F_U64P" out of %12"F_U64P" max.\n", String_Ct, G.Max_Hash_Strings);
- fprintf(stderr, "HASH LOADING STARTED: length %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
- fprintf(stderr, "HASH LOADING STARTED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
+ fprintf(stderr, "HASH LOADING STARTED: fragID %12" F_U64P "\n", first_frag_id);
+ fprintf(stderr, "HASH LOADING STARTED: strings %12" F_U64P " out of %12" F_U64P " max.\n", String_Ct, G.Max_Hash_Strings);
+ fprintf(stderr, "HASH LOADING STARTED: length %12" F_U64P " out of %12" F_U64P " max.\n", total_len, G.Max_Hash_Data_Len);
+ fprintf(stderr, "HASH LOADING STARTED: entries %12" F_U64P " out of %12" F_U64P " max (load %.2f).\n", Hash_Entries, hash_entry_limit,
(100.0 * Hash_Entries) / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
#endif
@@ -513,13 +513,13 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
maxAlloc += read->gkRead_sequenceLength() + 1;
}
- fprintf(stderr, "Found "F_U32" reads with length "F_U64" to load; "F_U32" skipped by being too short; "F_U32" skipped per library restriction\n",
+ fprintf(stderr, "Found " F_U32 " reads with length " F_U64 " to load; " F_U32 " skipped by being too short; " F_U32 " skipped per library restriction\n",
nLoadable, maxAlloc, nShort, nSkipped);
// This should be less than what the user requested on the command line
if (maxAlloc >= G.Max_Hash_Data_Len + AS_MAX_READLEN)
- fprintf(stderr, "maxAlloc = "F_U64" G.Max_Hash_Data_Len = "F_U64" AS_MAX_READLEN = %u\n", maxAlloc, G.Max_Hash_Data_Len, AS_MAX_READLEN);
+ fprintf(stderr, "maxAlloc = " F_U64 " G.Max_Hash_Data_Len = " F_U64 " AS_MAX_READLEN = %u\n", maxAlloc, G.Max_Hash_Data_Len, AS_MAX_READLEN);
assert(maxAlloc < G.Max_Hash_Data_Len + AS_MAX_READLEN);
// Allocate space, then fill it.
@@ -599,7 +599,7 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
// This was computed ahead of time!
if (total_len > maxAlloc)
- fprintf(stderr, "total_len="F_U64" len="F_U32" maxAlloc="F_U64"\n", total_len, len, maxAlloc);
+ fprintf(stderr, "total_len=" F_U64 " len=" F_U32 " maxAlloc=" F_U64 "\n", total_len, len, maxAlloc);
assert(total_len <= maxAlloc);
// What is Extra_Data_Len? It's set to Data_Len if we would have reallocated here.
@@ -607,7 +607,7 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
Put_String_In_Hash(curID, String_Ct);
if ((String_Ct % 100000) == 0)
- fprintf (stderr, "String_Ct:%12"F_U64P"/%12"F_U32P" totalLen:%12"F_U64P"/%12"F_U64P" Hash_Entries:%12"F_U64P"/%12"F_U64P" Load: %.2f%%\n",
+ fprintf (stderr, "String_Ct:%12" F_U64P "/%12" F_U32P " totalLen:%12" F_U64P "/%12" F_U64P " Hash_Entries:%12" F_U64P "/%12" F_U64P " Load: %.2f%%\n",
String_Ct, G.Max_Hash_Strings,
total_len, G.Max_Hash_Data_Len,
Hash_Entries,
@@ -619,9 +619,9 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
delete readData;
- fprintf(stderr, "HASH LOADING STOPPED: strings %12"F_U64P" out of %12"F_U32P" max.\n", String_Ct, G.Max_Hash_Strings);
- fprintf(stderr, "HASH LOADING STOPPED: length %12"F_U64P" out of %12"F_U64P" max.\n", total_len, G.Max_Hash_Data_Len);
- fprintf(stderr, "HASH LOADING STOPPED: entries %12"F_U64P" out of %12"F_U64P" max (load %.2f).\n", Hash_Entries, hash_entry_limit,
+ fprintf(stderr, "HASH LOADING STOPPED: strings %12" F_U64P " out of %12" F_U32P " max.\n", String_Ct, G.Max_Hash_Strings);
+ fprintf(stderr, "HASH LOADING STOPPED: length %12" F_U64P " out of %12" F_U64P " max.\n", total_len, G.Max_Hash_Data_Len);
+ fprintf(stderr, "HASH LOADING STOPPED: entries %12" F_U64P " out of %12" F_U64P " max (load %.2f).\n", Hash_Entries, hash_entry_limit,
100.0 * Hash_Entries / (HASH_TABLE_SIZE * ENTRIES_PER_BUCKET));
if (String_Ct == 0) {
@@ -631,7 +631,7 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
Used_Data_Len = total_len;
- //fprintf(stderr, "Extra_Ref_Ct = "F_U64" Max_Extra_Ref_Space = "F_U64"\n", Extra_Ref_Ct, Max_Extra_Ref_Space);
+ //fprintf(stderr, "Extra_Ref_Ct = " F_U64 " Max_Extra_Ref_Space = " F_U64 "\n", Extra_Ref_Ct, Max_Extra_Ref_Space);
if (Extra_Ref_Ct > Max_Extra_Ref_Space) {
int32 newSize = (Max_Extra_Ref_Space == 0) ? 16 * 1024 : Max_Extra_Ref_Space * 2;
@@ -656,7 +656,7 @@ Build_Hash_Index(gkStore *gkpStore, uint32 bgnID, uint32 endID) {
// Coalesce reference chain into adjacent entries in Extra_Ref_Space
Extra_Ref_Ct = 0;
- for (int32 i = 0; i < HASH_TABLE_SIZE; i ++)
+ for (uint64 i = 0; i < HASH_TABLE_SIZE; i ++)
for (int32 j = 0; j < Hash_Table[i].Entry_Ct; j ++) {
ref = Hash_Table[i].Entry[j];
if (! getStringRefLast(ref) && ! getStringRefEmpty(ref)) {
diff --git a/src/overlapInCore/overlapInCore-Find_Overlaps.C b/src/overlapInCore/overlapInCore-Find_Overlaps.C
index e70f00c..65e48bb 100644
--- a/src/overlapInCore/overlapInCore-Find_Overlaps.C
+++ b/src/overlapInCore/overlapInCore-Find_Overlaps.C
@@ -60,6 +60,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Sergey Koren beginning on 2016-JUN-08
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -187,6 +191,8 @@ Add_Ref(String_Ref_t Ref, int Offset, Work_Area_t * WA) {
WA->String_Olap_Space [Sub].Match_List = 0;
WA->String_Olap_Space [Sub].diag_sum = 0.0;
WA->String_Olap_Space [Sub].diag_ct = 0;
+ WA->String_Olap_Space [Sub].diag_bgn = AS_MAX_READLEN;
+ WA->String_Olap_Space [Sub].diag_end = 0;
WA->String_Olap_Space [Sub].Next = 0;
WA->String_Olap_Space [Sub].Full = TRUE;
WA->String_Olap_Space [Sub].consistent = TRUE;
@@ -194,8 +200,10 @@ Add_Ref(String_Ref_t Ref, int Offset, Work_Area_t * WA) {
consistent = WA->String_Olap_Space [Sub].consistent;
- WA->String_Olap_Space [Sub].diag_sum += getStringRefOffset(Ref) - Offset;
+ WA->String_Olap_Space [Sub].diag_sum += (double)getStringRefOffset(Ref) - Offset;
WA->String_Olap_Space [Sub].diag_ct ++;
+ if (WA->String_Olap_Space [Sub].diag_bgn > Offset) WA->String_Olap_Space [Sub].diag_bgn = Offset;
+ if (WA->String_Olap_Space [Sub].diag_end < Offset) WA->String_Olap_Space [Sub].diag_end = Offset;
Add_Match (Ref, & (WA->String_Olap_Space [Sub].Match_List), Offset, & consistent, WA);
WA->String_Olap_Space [Sub].consistent = consistent;
@@ -233,15 +241,15 @@ Hash_Find(uint64 Key, int64 Sub, char * S, int64 * Where, int * hi_hits) {
int is_empty;
H_Ref = Hash_Table [Sub].Entry [i];
- //fprintf(stderr, "Href = Hash_Table %u Entry %u = "F_U64"\n", Sub, i, H_Ref);
+ //fprintf(stderr, "Href = Hash_Table %u Entry %u = " F_U64 "\n", Sub, i, H_Ref);
is_empty = getStringRefEmpty(H_Ref);
if (! getStringRefLast(H_Ref) && ! is_empty) {
(* Where) = ((uint64)getStringRefStringNum(H_Ref) << OFFSET_BITS) + getStringRefOffset(H_Ref);
H_Ref = Extra_Ref_Space [(* Where)];
- //fprintf(stderr, "Href = Extra_Ref_Space "F_U64" = "F_U64"\n", *Where, H_Ref);
+ //fprintf(stderr, "Href = Extra_Ref_Space " F_U64 " = " F_U64 "\n", *Where, H_Ref);
}
- //fprintf(stderr, "Href = "F_U64" Get String_Start[ "F_U64" ] + "F_U64"\n", getStringRefStringNum(H_Ref), getStringRefOffset(H_Ref));
+ //fprintf(stderr, "Href = " F_U64 " Get String_Start[ " F_U64 " ] + " F_U64 "\n", getStringRefStringNum(H_Ref), getStringRefOffset(H_Ref));
T = basesData + String_Start [getStringRefStringNum(H_Ref)] + getStringRefOffset(H_Ref);
if (strncmp (S, T, G.Kmer_Len) == 0) {
if (is_empty) {
diff --git a/src/overlapInCore/overlapInCore-Output.C b/src/overlapInCore/overlapInCore-Output.C
index 7cfe941..546f77a 100644
--- a/src/overlapInCore/overlapInCore-Output.C
+++ b/src/overlapInCore/overlapInCore-Output.C
@@ -65,7 +65,6 @@
*/
#include "overlapInCore.H"
-#include <pthread.h>
// Output the overlap between strings S_ID and T_ID which
// have lengths S_Len and T_Len , respectively.
@@ -192,7 +191,7 @@ Output_Overlap(uint32 S_ID, int S_Len, Direction_t S_Dir,
ovs->dat.ovl.flipped = true;
break;
- case 'A':
+ case 'A': // Never reached.
ovs->a_hang(-bhg);
ovs->b_hang(-ahg);
ovs->dat.ovl.flipped = false;
@@ -227,19 +226,17 @@ Output_Overlap(uint32 S_ID, int S_Len, Direction_t S_Dir,
else
WA->Dovetail_Overlap_Ct ++;
+ // Write overlaps if we've saved too many.
+ // They're also written at the end of the thread.
+ if (WA->overlapsLen >= WA->overlapsMax)
+#pragma omp critical
+ {
+ for (int32 zz=0; zz<WA->overlapsLen; zz++)
+ Out_BOF->writeOverlap(WA->overlaps + zz);
- // We also flush the file at the end of a thread
-
- if (WA->overlapsLen >= WA->overlapsMax) {
- pthread_mutex_lock (& Write_Proto_Mutex);
-
- for (int32 zz=0; zz<WA->overlapsLen; zz++)
- Out_BOF->writeOverlap(WA->overlaps + zz);
- WA->overlapsLen = 0;
-
- pthread_mutex_unlock (& Write_Proto_Mutex);
- }
+ WA->overlapsLen = 0;
+ }
}
@@ -307,14 +304,11 @@ Output_Partial_Overlap(uint32 s_id,
// We also flush the file at the end of a thread
if (WA->overlapsLen >= WA->overlapsMax) {
- pthread_mutex_lock(&Write_Proto_Mutex);
-
+#pragma omp critical
for (int32 zz=0; zz<WA->overlapsLen; zz++)
Out_BOF->writeOverlap(WA->overlaps + zz);
WA->overlapsLen = 0;
-
- pthread_mutex_unlock(&Write_Proto_Mutex);
}
}
diff --git a/src/overlapInCore/overlapInCore-Process_Overlaps.C b/src/overlapInCore/overlapInCore-Process_Overlaps.C
index 0d2342a..16a30c3 100644
--- a/src/overlapInCore/overlapInCore-Process_Overlaps.C
+++ b/src/overlapInCore/overlapInCore-Process_Overlaps.C
@@ -60,13 +60,16 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Sergey Koren beginning on 2016-JUN-08
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
#include "overlapInCore.H"
#include "AS_UTL_reverseComplement.H"
-#include <pthread.h>
// Find and output all overlaps between strings in store and those in the global hash table.
// This is the entry point for each compute thread.
@@ -89,9 +92,10 @@ Process_Overlaps(void *ptr){
WA->Kmer_Hits_Without_Olap_Ct = 0;
WA->Kmer_Hits_With_Olap_Ct = 0;
+ WA->Kmer_Hits_Skipped_Ct = 0;
WA->Multi_Overlap_Ct = 0;
- fprintf(stderr, "Thread %02u processes reads "F_U32"-"F_U32"\n",
+ fprintf(stderr, "Thread %02u processes reads " F_U32 "-" F_U32 "\n",
WA->thread_id, WA->bgnID, WA->endID);
for (uint32 fi=WA->bgnID; fi<=WA->endID; fi++) {
@@ -135,39 +139,37 @@ Process_Overlaps(void *ptr){
// Write out this block of overlaps, no need to keep them in core!
// While we have a mutex, also find the next block of things to process.
- fprintf(stderr, "Thread %02u writes reads "F_U32"-"F_U32" (%u overlaps %u/%u kmer hits with/without overlap)\n",
+ fprintf(stderr, "Thread %02u writes reads " F_U32 "-" F_U32 " (" F_U64 " overlaps " F_U64 "/" F_U64 "/" F_U64 " kmer hits with/without overlap/skipped)\n",
WA->thread_id, WA->bgnID, WA->endID,
WA->overlapsLen,
- WA->Kmer_Hits_With_Olap_Ct, WA->Kmer_Hits_Without_Olap_Ct);
-
- pthread_mutex_lock(& Write_Proto_Mutex);
+ WA->Kmer_Hits_With_Olap_Ct, WA->Kmer_Hits_Without_Olap_Ct, WA->Kmer_Hits_Skipped_Ct);
- // Flush any remaining overlaps.
+ // Flush any remaining overlaps and update statistics.
- for (int zz=0; zz<WA->overlapsLen; zz++)
- Out_BOF->writeOverlap(WA->overlaps + zz);
- WA->overlapsLen = 0;
+#pragma omp critical
+ {
+ for (int zz=0; zz<WA->overlapsLen; zz++)
+ Out_BOF->writeOverlap(WA->overlaps + zz);
- // Update stats
+ WA->overlapsLen = 0;
- Total_Overlaps += WA->Total_Overlaps;
- Contained_Overlap_Ct += WA->Contained_Overlap_Ct;
- Dovetail_Overlap_Ct += WA->Dovetail_Overlap_Ct;
+ Total_Overlaps += WA->Total_Overlaps;
+ Contained_Overlap_Ct += WA->Contained_Overlap_Ct;
+ Dovetail_Overlap_Ct += WA->Dovetail_Overlap_Ct;
- Kmer_Hits_Without_Olap_Ct += WA->Kmer_Hits_Without_Olap_Ct;
- Kmer_Hits_With_Olap_Ct += WA->Kmer_Hits_With_Olap_Ct;
- Multi_Overlap_Ct += WA->Multi_Overlap_Ct;
+ Kmer_Hits_Without_Olap_Ct += WA->Kmer_Hits_Without_Olap_Ct;
+ Kmer_Hits_With_Olap_Ct += WA->Kmer_Hits_With_Olap_Ct;
+ Kmer_Hits_Skipped_Ct += WA->Kmer_Hits_Skipped_Ct;
+ Multi_Overlap_Ct += WA->Multi_Overlap_Ct;
- WA->bgnID = G.curRefID;
- WA->endID = G.curRefID + G.perThread - 1;
+ WA->bgnID = G.curRefID;
+ WA->endID = G.curRefID + G.perThread - 1;
- if (WA->endID > G.endRefID)
- WA->endID = G.endRefID;
-
- G.curRefID = WA->endID + 1;
-
- pthread_mutex_unlock(& Write_Proto_Mutex);
+ if (WA->endID > G.endRefID)
+ WA->endID = G.endRefID;
+ G.curRefID = WA->endID + 1;
+ }
}
delete readData;
diff --git a/src/overlapInCore/overlapInCore-Process_String_Overlaps.C b/src/overlapInCore/overlapInCore-Process_String_Overlaps.C
index c6f2a18..50cdef7 100644
--- a/src/overlapInCore/overlapInCore-Process_String_Overlaps.C
+++ b/src/overlapInCore/overlapInCore-Process_String_Overlaps.C
@@ -68,8 +68,22 @@
* full conditions and disclaimers for each license.
*/
+#include <math.h>
#include "overlapInCore.H"
+static
+uint64 computeExpected(uint64 kmerSize, double ovlLen, double erate) {
+ if (ovlLen < kmerSize) return 0;
+ return int(floor(exp(-1.0 * (double)kmerSize * erate) * (ovlLen - kmerSize + 1)));
+}
+
+static
+uint64 computeMinimumKmers(uint64 kmerSize, double ovlLen, double erate) {
+ if (G.Filter_By_Kmer_Count == 0) return G.Filter_By_Kmer_Count;
+
+ ovlLen = (ovlLen < 0 ? ovlLen*-1.0 : ovlLen);
+ return max(G.Filter_By_Kmer_Count, computeExpected(kmerSize, ovlLen, erate));
+}
// Choose the best overlap in olap[0 .. (ct - 1)] .
// Mark all others as deleted (by setting deleted[] true for them)
@@ -707,6 +721,8 @@ Process_String_Olaps (char * S,
if (ct <= G.Frag_Olap_Limit) {
for (i = 0; i < ct; i ++) {
root_num = WA->String_Olap_Space[i].String_Num;
+ //fprintf(stderr, "Processing overlap from %d and global, curr match is %d of %.2f len and %d diag matches min of %d\n", ID, (root_num + Hash_String_Num_Offset), (double)WA->String_Olap_Space[i].diag_end-WA->String_Olap_Space[i].diag_bgn, WA->String_Olap_Space[i].diag_ct, computeMinimumKmers(G.Kmer_Len, WA->String_Olap_Space[i].diag_end-WA->String_Olap_Space[i].diag_bgn, G.maxErate));
+ if (computeMinimumKmers(G.Kmer_Len, WA->String_Olap_Space[i].diag_end-WA->String_Olap_Space[i].diag_bgn, G.maxErate) > WA->String_Olap_Space[i].diag_ct) { WA->Kmer_Hits_Skipped_Ct++; continue; }
Process_Matches(&WA->String_Olap_Space[i].Match_List,
S,
@@ -738,6 +754,7 @@ Process_String_Olaps (char * S,
for (i = start; i < ct && WA->A_Olaps_For_Frag < G.Frag_Olap_Limit ; i ++) {
root_num = WA->String_Olap_Space[i].String_Num;
+ if (computeMinimumKmers(G.Kmer_Len, WA->String_Olap_Space[i].diag_end-WA->String_Olap_Space[i].diag_bgn, G.maxErate) > WA->String_Olap_Space[i].diag_ct) { WA->Kmer_Hits_Skipped_Ct++; continue; }
Process_Matches(&WA->String_Olap_Space[i].Match_List,
S,
@@ -759,6 +776,7 @@ Process_String_Olaps (char * S,
for (i = start - 1; i >= 0 && WA->B_Olaps_For_Frag < G.Frag_Olap_Limit ; i --) {
root_num = WA->String_Olap_Space[i].String_Num;
+ if (computeMinimumKmers(G.Kmer_Len, WA->String_Olap_Space[i].diag_end-WA->String_Olap_Space[i].diag_bgn, G.maxErate) > WA->String_Olap_Space[i].diag_ct) { WA->Kmer_Hits_Skipped_Ct++; continue; }
Process_Matches(&WA->String_Olap_Space[i].Match_List,
S,
diff --git a/src/overlapInCore/overlapInCore.C b/src/overlapInCore/overlapInCore.C
index 74c2336..281abfc 100644
--- a/src/overlapInCore/overlapInCore.C
+++ b/src/overlapInCore/overlapInCore.C
@@ -103,6 +103,7 @@ Hash_Bucket_t * Hash_Table;
uint64 Kmer_Hits_With_Olap_Ct = 0;
uint64 Kmer_Hits_Without_Olap_Ct = 0;
+uint64 Kmer_Hits_Skipped_Ct = 0;
uint64 Multi_Overlap_Ct = 0;
uint64 String_Ct;
@@ -135,8 +136,6 @@ uint64 SV1 = 666;
uint64 SV2 = 666;
uint64 SV3 = 666;
-pthread_mutex_t Write_Proto_Mutex;
-
ovFile *Out_BOF = NULL;
@@ -167,14 +166,10 @@ Initialize_Work_Area(Work_Area_t *WA, int id, gkStore *gkpStore) {
allocated += sizeof(ovOverlap) * WA->overlapsMax;
- fprintf(stderr, "Initialize_Work_Area()-- new prefixEditDistance\n");
-
WA->editDist = new prefixEditDistance(G.Doing_Partial_Overlaps, G.maxErate);
WA->q_diff = new char [AS_MAX_READLEN];
WA->distinct_olap = new Olap_Info_t [MAX_DISTINCT_OLAPS];
-
- fprintf(stderr, "Initialize_Work_Area()-- done\n");
}
@@ -195,27 +190,17 @@ Delete_Work_Area(Work_Area_t *WA) {
int
OverlapDriver(void) {
- fprintf(stderr, "OverlapDriver()--\n");
-
- pthread_t *thread_id = new pthread_t [G.Num_PThreads];
- fprintf(stderr, "OverlapDriver()-- WA\n");
Work_Area_t *thread_wa = new Work_Area_t [G.Num_PThreads];
- fprintf(stderr, "OverlapDriver()-- gkpStore\n");
gkStore *gkpStore = gkStore::gkStore_open(G.Frag_Store_Path);
- pthread_attr_t attr;
+ Out_BOF = new ovFile(gkpStore, G.Outfile_Name, ovFileFullWrite);
- pthread_attr_init(&attr);
- pthread_attr_setstacksize(&attr, THREAD_STACKSIZE);
- pthread_mutex_init(&Write_Proto_Mutex, NULL);
+ fprintf(stderr, "Initializing %u work areas.\n", G.Num_PThreads);
- for (uint32 i=0; i<G.Num_PThreads; i++) {
- fprintf(stderr, "OverlapDriver()-- Initialize_Work_Area %u\n", i);
+#pragma omp parallel for
+ for (uint32 i=0; i<G.Num_PThreads; i++)
Initialize_Work_Area(thread_wa+i, i, gkpStore);
- }
-
- fprintf(stderr, "OverlapDriver()-- Initialized\n");
// Command line options are Lo_Hash_Frag and Hi_Hash_Frag
// Command line options are Lo_Old_Frag and Hi_Old_Frag
@@ -234,8 +219,6 @@ OverlapDriver(void) {
// Iterate over read blocks, build a hash table, then search in threads.
- //fprintf(stderr, "OverlapDriver()-- Loop top\n");
-
while (bgnHashID < G.endHashID) {
if (endHashID > G.endHashID)
endHashID = G.endHashID;
@@ -247,12 +230,8 @@ OverlapDriver(void) {
// Load as much as we can. If we load less than expected, the endHashID is updated to reflect
// the last read loaded.
- //fprintf(stderr, "OverlapDriver()-- Build_Hash_Index\n");
-
endHashID = Build_Hash_Index(gkpStore, bgnHashID, endHashID);
- //fprintf(stderr, "Index built.\n");
-
// Decide the range of reads to process. No more than what is loaded in the table.
if (G.bgnRefID < 1)
@@ -272,38 +251,26 @@ OverlapDriver(void) {
fprintf(stderr, "\n");
fprintf(stderr, "Range: %u-%u. Store has %u reads.\n",
G.bgnRefID, G.endRefID, gkpStore->gkStore_getNumReads());
- fprintf(stderr, "Chunk: "F_U32" reads/thread -- (G.endRefID="F_U32" - G.bgnRefID="F_U32") / G.Num_PThreads="F_U32" / 8\n",
+ fprintf(stderr, "Chunk: " F_U32 " reads/thread -- (G.endRefID=" F_U32 " - G.bgnRefID=" F_U32 ") / G.Num_PThreads=" F_U32 " / 8\n",
G.perThread, G.endRefID, G.bgnRefID, G.Num_PThreads);
fprintf(stderr, "\n");
- fprintf(stderr, "Starting "F_U32"-"F_U32" with "F_U32" per thread\n", G.bgnRefID, G.endRefID, G.perThread);
+ fprintf(stderr, "Starting " F_U32 "-" F_U32 " with " F_U32 " per thread\n", G.bgnRefID, G.endRefID, G.perThread);
fprintf(stderr, "\n");
- for (uint32 i=0; i<G.Num_PThreads; i++) {
-
- // Initialize each thread, reset the current position.
+ // Initialize each thread, reset the current position. curRefID and endRefID are updated, this
+ // cannot be done in the parallel loop!
+ for (uint32 i=0; i<G.Num_PThreads; i++) {
thread_wa[i].bgnID = G.curRefID;
thread_wa[i].endID = thread_wa[i].bgnID + G.perThread - 1;
- G.curRefID = thread_wa[i].endID + 1;
-
- if (G.endRefID > G.endRefID)
- G.endRefID = G.endRefID;
-
- int status = pthread_create(thread_id+i, &attr, Process_Overlaps, thread_wa+i);
-
- if (status != 0)
- fprintf(stderr, "pthread_create error: %s\n", strerror(status)), exit(1);
+ G.curRefID = thread_wa[i].endID + 1; // Global value updated!
}
- // The master thread just sits here and waits.
-
- for (uint32 i=0; i<G.Num_PThreads; i++) {
- int status = pthread_join(thread_id[i], NULL);
- if (status != 0)
- fprintf(stderr, "pthread_join error: %s\n", strerror(status)), exit(1);
- }
+#pragma omp parallel for
+ for (uint32 i=0; i<G.Num_PThreads; i++)
+ Process_Overlaps(thread_wa + i);
// Clear out the hash table. This stuff is allocated in Build_Hash_Index
@@ -320,8 +287,7 @@ OverlapDriver(void) {
endHashID = bgnHashID + G.Max_Hash_Strings - 1; // Inclusive!
}
- pthread_mutex_destroy(&Write_Proto_Mutex);
- pthread_attr_destroy(&attr);
+ delete Out_BOF;
gkpStore->gkStore_close();
@@ -329,7 +295,6 @@ OverlapDriver(void) {
Delete_Work_Area(thread_wa + i);
delete [] thread_wa;
- delete [] thread_id;
return 0;
}
@@ -344,7 +309,7 @@ main(int argc, char **argv) {
argc = AS_configure(argc, argv);
- G.initialize();
+ G.initialize(); // Probably redundant with the call in the constructor, but doesn't hurt.
int err=0;
int arg=1;
@@ -424,6 +389,8 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "--minlength") == 0) {
G.Min_Olap_Len = strtol (argv[++arg], NULL, 10);
+ } else if (strcmp(argv[arg], "--minkmers") == 0) {
+ G.Filter_By_Kmer_Count = int(floor(exp(-1.0 * (double)G.Kmer_Len * G.maxErate) * (G.Min_Olap_Len - G.Kmer_Len + 1)));
} else if (strcmp(argv[arg], "--maxerate") == 0) {
G.maxErate = strtof(argv[++arg], NULL);
@@ -460,7 +427,7 @@ main(int argc, char **argv) {
fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++;
if (G.Max_Hash_Strings > MAX_STRING_NUM)
- fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++;
+ fprintf(stderr, "Too many strings (--hashstrings), must be less than " F_U64 "\n", MAX_STRING_NUM), err++;
if (G.Outfile_Name == NULL)
fprintf (stderr, "ERROR: No output file name specified\n"), err++;
@@ -515,8 +482,6 @@ main(int argc, char **argv) {
exit(1);
}
- Out_BOF = new ovFile(G.Outfile_Name, ovFileFullWrite);
-
// We know enough now to set the hash function variables, and some other random variables.
HSF1 = G.Kmer_Len - (G.Hash_Mask_Bits / 2);
@@ -528,21 +493,24 @@ main(int argc, char **argv) {
// Log parameters.
fprintf(stderr, "\n");
- fprintf(stderr, "STRING_NUM_BITS "F_U32"\n", STRING_NUM_BITS);
- fprintf(stderr, "OFFSET_BITS "F_U32"\n", OFFSET_BITS);
- fprintf(stderr, "STRING_NUM_MASK "F_U64"\n", STRING_NUM_MASK);
- fprintf(stderr, "OFFSET_MASK "F_U64"\n", OFFSET_MASK);
- fprintf(stderr, "MAX_STRING_NUM "F_U64"\n", MAX_STRING_NUM);
+ fprintf(stderr, "STRING_NUM_BITS " F_U32 "\n", STRING_NUM_BITS);
+ fprintf(stderr, "OFFSET_BITS " F_U32 "\n", OFFSET_BITS);
+ fprintf(stderr, "STRING_NUM_MASK " F_U64 "\n", STRING_NUM_MASK);
+ fprintf(stderr, "OFFSET_MASK " F_U64 "\n", OFFSET_MASK);
+ fprintf(stderr, "MAX_STRING_NUM " F_U64 "\n", MAX_STRING_NUM);
fprintf(stderr, "\n");
- fprintf(stderr, "Hash_Mask_Bits "F_U32"\n", G.Hash_Mask_Bits);
- fprintf(stderr, "Max_Hash_Strings "F_U32"\n", G.Max_Hash_Strings);
- fprintf(stderr, "Max_Hash_Data_Len "F_U64"\n", G.Max_Hash_Data_Len);
+ fprintf(stderr, "Hash_Mask_Bits " F_U32 "\n", G.Hash_Mask_Bits);
+ fprintf(stderr, "Max_Hash_Strings " F_U32 "\n", G.Max_Hash_Strings);
+ fprintf(stderr, "Max_Hash_Data_Len " F_U64 "\n", G.Max_Hash_Data_Len);
fprintf(stderr, "Max_Hash_Load %f\n", G.Max_Hash_Load);
- fprintf(stderr, "Kmer Length "F_U64"\n", G.Kmer_Len);
+ fprintf(stderr, "Kmer Length " F_U64 "\n", G.Kmer_Len);
fprintf(stderr, "Min Overlap Length %d\n", G.Min_Olap_Len);
fprintf(stderr, "Max Error Rate %f\n", G.maxErate);
+ fprintf(stderr, "Min Kmer Matches " F_U64 "\n", G.Filter_By_Kmer_Count);
fprintf(stderr, "\n");
- fprintf(stderr, "Num_PThreads "F_U32"\n", G.Num_PThreads);
+ fprintf(stderr, "Num_PThreads " F_U32 "\n", G.Num_PThreads);
+
+ omp_set_num_threads(G.Num_PThreads);
assert (8 * sizeof (uint64) > 2 * G.Kmer_Len);
@@ -561,16 +529,16 @@ main(int argc, char **argv) {
}
fprintf(stderr, "\n");
- fprintf(stderr, "HASH_TABLE_SIZE "F_U32"\n", HASH_TABLE_SIZE);
- fprintf(stderr, "sizeof(Hash_Bucket_t) "F_SIZE_T"\n", sizeof(Hash_Bucket_t));
- fprintf(stderr, "hash table size: "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20);
+ fprintf(stderr, "HASH_TABLE_SIZE " F_U64 "\n", HASH_TABLE_SIZE);
+ fprintf(stderr, "sizeof(Hash_Bucket_t) " F_U64 "\n", (uint64)sizeof(Hash_Bucket_t));
+ fprintf(stderr, "hash table size: " F_U64 " MB\n", (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20);
fprintf(stderr, "\n");
Hash_Table = new Hash_Bucket_t [HASH_TABLE_SIZE];
- fprintf(stderr, "check "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof (Check_Vector_t) >> 20));
- fprintf(stderr, "info "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t) >> 20));
- fprintf(stderr, "start "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (int64) >> 20));
+ fprintf(stderr, "check " F_U64 " MB\n", ((HASH_TABLE_SIZE * sizeof (Check_Vector_t)) >> 20));
+ fprintf(stderr, "info " F_SIZE_T " MB\n", ((G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t)) >> 20));
+ fprintf(stderr, "start " F_SIZE_T " MB\n", ((G.Max_Hash_Strings * sizeof (int64)) >> 20));
fprintf(stderr, "\n");
Hash_Check_Array = new Check_Vector_t [HASH_TABLE_SIZE];
@@ -598,8 +566,6 @@ main(int argc, char **argv) {
delete [] Hash_Check_Array;
delete [] Hash_Table;
- delete Out_BOF;
-
FILE *stats = stderr;
if (G.Outstat_Name != NULL) {
@@ -611,17 +577,20 @@ main(int argc, char **argv) {
}
}
- fprintf(stats, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct);
- fprintf(stats, " Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct);
- fprintf(stats, " Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct);
- fprintf(stats, " Total overlaps produced = "F_S64"\n", Total_Overlaps);
- fprintf(stats, " Contained overlaps = "F_S64"\n", Contained_Overlap_Ct);
- fprintf(stats, " Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct);
- fprintf(stats, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct);
- fprintf(stats, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct);
+ fprintf(stats, " Kmer hits without olaps = " F_S64 "\n", Kmer_Hits_Without_Olap_Ct);
+ fprintf(stats, " Kmer hits with olaps = " F_S64 "\n", Kmer_Hits_With_Olap_Ct);
+ //fprintf(stats, " Kmer hits below %u = " F_S64 "\n", G.Filter_By_Kmer_Count, Kmer_Hits_Skipped_Ct);
+ fprintf(stats, " Multiple overlaps/pair = " F_S64 "\n", Multi_Overlap_Ct);
+ fprintf(stats, " Total overlaps produced = " F_S64 "\n", Total_Overlaps);
+ fprintf(stats, " Contained overlaps = " F_S64 "\n", Contained_Overlap_Ct);
+ fprintf(stats, " Dovetail overlaps = " F_S64 "\n", Dovetail_Overlap_Ct);
+ fprintf(stats, "Rejected by short window = " F_S64 "\n", Bad_Short_Window_Ct);
+ fprintf(stats, " Rejected by long window = " F_S64 "\n", Bad_Long_Window_Ct);
if (stats != stderr)
fclose(stats);
+ fprintf(stderr, "Bye.\n");
+
return(0);
}
diff --git a/src/overlapInCore/overlapInCore.H b/src/overlapInCore/overlapInCore.H
index 9fdd2e9..9ca9b28 100644
--- a/src/overlapInCore/overlapInCore.H
+++ b/src/overlapInCore/overlapInCore.H
@@ -67,8 +67,6 @@
#include "prefixEditDistance.H"
-#include <pthread.h>
-
#ifndef OVERLAPINCORE_H
#define OVERLAPINCORE_H
@@ -112,7 +110,7 @@
#define HASH_EXPANSION_FACTOR 1.4
// Hash table size is >= this times MAX_HASH_STRINGS
-#define HASH_MASK ((1 << G.Hash_Mask_Bits) - 1)
+#define HASH_MASK (((uint64)1 << G.Hash_Mask_Bits) - 1)
// Extract right Hash_Mask_Bits bits of hash key
#define HASH_TABLE_SIZE (1 + HASH_MASK)
@@ -248,7 +246,9 @@ typedef struct String_Olap_Node {
uint32 String_Num; // Of hash-table frag that have exact match with
int32 Match_List; // Subscript of start of list of exact matches
double diag_sum; // Sum of diagonals of all k-mer matches to this frag
- int32 diag_ct; // Count of all k-mer matches to this frag
+ int32 diag_ct; // Count of all k-mer matches to this frag
+ int diag_bgn;
+ int diag_end;
signed int Next : 29; // Next match if this is a collision
unsigned Full : 1;
unsigned consistent : 1;
@@ -313,6 +313,7 @@ typedef struct Work_Area {
uint64 Kmer_Hits_Without_Olap_Ct;
uint64 Kmer_Hits_With_Olap_Ct;
+ uint64 Kmer_Hits_Skipped_Ct;
uint64 Multi_Overlap_Ct;
prefixEditDistance *editDist;
@@ -397,6 +398,7 @@ extern uint64 Hash_String_Num_Offset;
extern Hash_Bucket_t * Hash_Table;
extern uint64 Kmer_Hits_With_Olap_Ct;
extern uint64 Kmer_Hits_Without_Olap_Ct;
+extern uint64 Kmer_Hits_Skipped_Ct;
extern uint64 Multi_Overlap_Ct;
extern uint64 String_Ct;
extern Hash_Frag_Info_t * String_Info;
@@ -415,7 +417,9 @@ extern uint64 Dovetail_Overlap_Ct;
class oicParameters {
public:
- oicParameters() {};
+ oicParameters() {
+ initialize();
+ };
~oicParameters() {};
void initialize(void) {
@@ -434,6 +438,7 @@ public:
Kmer_Len = 0;
Kmer_Skip_File = NULL;
+ Filter_By_Kmer_Count = 0;
Frag_Olap_Limit = UINT64_MAX;
@@ -485,6 +490,7 @@ public:
uint32 perThread; // When processing, how many to do per block
uint64 Kmer_Len; // -k
+ uint64 Filter_By_Kmer_Count;
FILE *Kmer_Skip_File; // -k
// Maximum number of overlaps for end of an old fragment against
@@ -533,8 +539,6 @@ extern uint64 SV2;
extern uint64 SV3;
extern ovFile *Out_BOF;
-extern pthread_mutex_t Frag_Store_Mutex;
-extern pthread_mutex_t Write_Proto_Mutex;
diff --git a/src/overlapInCore/overlapInCorePartition.C b/src/overlapInCore/overlapInCorePartition.C
index 650fb27..b1abad4 100644
--- a/src/overlapInCore/overlapInCorePartition.C
+++ b/src/overlapInCore/overlapInCorePartition.C
@@ -64,16 +64,16 @@ outputJob(FILE *BAT,
uint32 &batchName,
uint32 &jobName) {
- fprintf(BAT, "%03"F_U32P"\n", batchName);
- fprintf(JOB, "%06"F_U32P"\n", jobName);
+ fprintf(BAT, "%03" F_U32P "\n", batchName);
+ fprintf(JOB, "%06" F_U32P "\n", jobName);
if (maxNumReads == 0) {
- fprintf(OPT, "-h "F_U32"-"F_U32" -r "F_U32"-"F_U32"\n",
+ fprintf(OPT, "-h " F_U32 "-" F_U32 " -r " F_U32 "-" F_U32 "\n",
hashBeg, hashEnd, refBeg, refEnd);
fprintf(stderr, "HASH %10d-%10d REFR %10d-%10d JOB %d\n",
hashBeg, hashEnd, refBeg, refEnd, jobName);
} else {
- fprintf(OPT, "-h "F_U32"-"F_U32" -r "F_U32"-"F_U32" --hashstrings "F_U32" --hashdatalen "F_U32"\n",
+ fprintf(OPT, "-h " F_U32 "-" F_U32 " -r " F_U32 "-" F_U32 " --hashstrings " F_U32 " --hashdatalen " F_U32 "\n",
hashBeg, hashEnd, refBeg, refEnd, maxNumReads, maxLength);
fprintf(stderr, "HASH %10d-%10d REFR %10d-%10d STRINGS %10d BASES %10d JOB %d\n",
hashBeg, hashEnd, refBeg, refEnd, maxNumReads, maxLength, jobName);
@@ -94,8 +94,8 @@ outputJob(FILE *BAT,
uint32 *
loadReadLengths(gkStore *gkp,
- set<uint32> &libToHash, uint32 &hashMin, uint32 &hashMax,
- set<uint32> &libToRef, uint32 &refMin, uint32 &refMax) {
+ set<uint32> &libToHash, uint32 &hashMin, uint32 &hashMax,
+ set<uint32> &libToRef, uint32 &refMin, uint32 &refMax) {
uint32 numReads = gkp->gkStore_getNumReads();
uint32 numLibs = gkp->gkStore_getNumLibraries();
uint32 *readLen = new uint32 [numReads + 1];
@@ -123,7 +123,7 @@ loadReadLengths(gkStore *gkp,
doRef[i] = (libToRef.count(i) == 0) ? false : true;
}
- fprintf(stderr, "Loading lengths of "F_U32" fragments ("F_SIZE_T"mb)\n",
+ fprintf(stderr, "Loading lengths of " F_U32 " fragments (" F_SIZE_T "mb)\n",
numReads, (numReads * sizeof(uint32)) >> 20);
memset(readLen, 0, sizeof(uint32) * (numReads + 1));
@@ -153,10 +153,13 @@ loadReadLengths(gkStore *gkp,
}
if ((ii % 1048576) == 0)
- fprintf(stderr, "Loading lengths at "F_U32" out of "F_U32". H: "F_U32","F_U32" R: "F_U32","F_U32"\n",
+ fprintf(stderr, "Loading lengths at " F_U32 " out of " F_U32 ". H: " F_U32 "," F_U32 " R: " F_U32 "," F_U32 "\n",
ii, numReads, hashMin, hashMax, refMin, refMax);
}
+ delete [] doHash;
+ delete [] doRef;
+
return(readLen);
}
@@ -199,7 +202,7 @@ partitionFrags(gkStore *gkp,
if (refMax > numReads)
refMax = numReads;
- fprintf(stderr, "Partitioning for hash: "F_U32"-"F_U32" ref: "F_U32","F_U32"\n",
+ fprintf(stderr, "Partitioning for hash: " F_U32 "-" F_U32 " ref: " F_U32 "," F_U32 "\n",
hashMin, hashMax, refMin, refMax);
hashBeg = hashMin;
@@ -243,6 +246,8 @@ partitionFrags(gkStore *gkp,
hashBeg = hashEnd + 1;
}
+
+ delete [] readLen;
}
@@ -282,7 +287,7 @@ partitionLength(gkStore *gkp,
if (refMax > numReads)
refMax = numReads;
- fprintf(stderr, "Partitioning for hash: "F_U32"-"F_U32" ref: "F_U32","F_U32"\n",
+ fprintf(stderr, "Partitioning for hash: " F_U32 "-" F_U32 " ref: " F_U32 "," F_U32 "\n",
hashMin, hashMax, refMin, refMax);
hashBeg = hashMin;
@@ -340,10 +345,42 @@ partitionLength(gkStore *gkp,
hashBeg = hashEnd + 1;
}
+
+ delete [] readLen;
+}
+
+
+
+FILE *
+openOutput(char *prefix, char *type) {
+ char A[FILENAME_MAX];
+
+ snprintf(A, FILENAME_MAX, "%s.%s.WORKING", prefix, type);
+
+ errno = 0;
+
+ FILE *F = fopen(A, "w");
+
+ if (errno)
+ fprintf(stderr, "Failed to open '%s': %s\n", A, strerror(errno)), exit(1);
+
+ return(F);
}
+void
+renameToFinal(char *prefix, char *type) {
+ char A[FILENAME_MAX];
+ char B[FILENAME_MAX];
+
+ snprintf(A, FILENAME_MAX, "%s.%s.WORKING", prefix, type);
+ snprintf(B, FILENAME_MAX, "%s.%s", prefix, type);
+
+ rename(A, B);
+}
+
+
int
main(int argc, char **argv) {
@@ -418,8 +455,8 @@ main(int argc, char **argv) {
if ((ovlRefBlockLength > 0) && (ovlRefBlockSize > 0))
fprintf(stderr, "ERROR: At most one of -rl and -rs can be non-zero.\n"), exit(1);
- fprintf(stderr, "HASH: "F_U64" reads or "F_U64" length.\n", ovlHashBlockSize, ovlHashBlockLength);
- fprintf(stderr, "REF: "F_U64" reads or "F_U64" length.\n", ovlRefBlockSize, ovlRefBlockLength);
+ fprintf(stderr, "HASH: " F_U64 " reads or " F_U64 " length.\n", ovlHashBlockSize, ovlHashBlockLength);
+ fprintf(stderr, "REF: " F_U64 " reads or " F_U64 " length.\n", ovlRefBlockSize, ovlRefBlockLength);
gkStore *gkp = gkStore::gkStore_open(gkpStoreName);
uint32 numLibs = gkp->gkStore_getNumLibraries();
@@ -427,12 +464,12 @@ main(int argc, char **argv) {
for (set<uint32>::iterator it=libToHash.begin(); it != libToHash.end(); it++)
if (numLibs < *it)
- fprintf(stderr, "ERROR: -H "F_U32" is invalid; only "F_U32" libraries in '%s'\n",
+ fprintf(stderr, "ERROR: -H " F_U32 " is invalid; only " F_U32 " libraries in '%s'\n",
*it, numLibs, gkpStoreName), invalidLibs++;
for (set<uint32>::iterator it=libToRef.begin(); it != libToRef.end(); it++)
if (numLibs < *it)
- fprintf(stderr, "ERROR: -R "F_U32" is invalid; only "F_U32" libraries in '%s'\n",
+ fprintf(stderr, "ERROR: -R " F_U32 " is invalid; only " F_U32 " libraries in '%s'\n",
*it, numLibs, gkpStoreName), invalidLibs++;
if ((libToHash.size() > 0) && (libToRef.size() > 0)) {
@@ -440,31 +477,18 @@ main(int argc, char **argv) {
if ((libToHash.find(lib) == libToHash.end()) &&
(libToRef.find(lib) == libToRef.end())) {
if (checkAllLibUsed == true)
- fprintf(stderr, "ERROR: library "F_U32" is not mentioned in either -H or -R.\n", lib), invalidLibs++;
+ fprintf(stderr, "ERROR: library " F_U32 " is not mentioned in either -H or -R.\n", lib), invalidLibs++;
else
- fprintf(stderr, "Warning: library "F_U32" is not mentioned in either -H or -R.\n", lib);
+ fprintf(stderr, "Warning: library " F_U32 " is not mentioned in either -H or -R.\n", lib);
}
}
}
if (invalidLibs > 0)
fprintf(stderr, "ERROR: one of -H and/or -R are invalid.\n"), exit(1);
- errno = 0;
-
- sprintf(outputName, "%s.ovlbat", outputPrefix);
- FILE *BAT = fopen(outputName, "w");
- if (errno)
- fprintf(stderr, "Failed to open '%s': %s\n", outputName, strerror(errno)), exit(1);
-
- sprintf(outputName, "%s.ovljob", outputPrefix);
- FILE *JOB = fopen(outputName, "w");
- if (errno)
- fprintf(stderr, "Failed to open '%s': %s\n", outputName, strerror(errno)), exit(1);
-
- sprintf(outputName, "%s.ovlopt", outputPrefix);
- FILE *OPT = fopen(outputName, "w");
- if (errno)
- fprintf(stderr, "Failed to open '%s': %s\n", outputName, strerror(errno)), exit(1);
+ FILE *BAT = openOutput(outputPrefix, "ovlbat");
+ FILE *JOB = openOutput(outputPrefix, "ovljob");
+ FILE *OPT = openOutput(outputPrefix, "ovlopt");
if (ovlHashBlockLength == 0)
partitionFrags(gkp, BAT, JOB, OPT, minOverlapLength, ovlHashBlockSize, ovlRefBlockLength, ovlRefBlockSize, libToHash, libToRef);
@@ -475,6 +499,10 @@ main(int argc, char **argv) {
fclose(JOB);
fclose(OPT);
+ renameToFinal(outputPrefix, "ovlbat");
+ renameToFinal(outputPrefix, "ovljob");
+ renameToFinal(outputPrefix, "ovlopt");
+
gkp->gkStore_close();
exit(0);
diff --git a/src/overlapInCore/overlapPair.C b/src/overlapInCore/overlapPair.C
index 0fa091e..bb7ac31 100644
--- a/src/overlapInCore/overlapPair.C
+++ b/src/overlapInCore/overlapPair.C
@@ -38,18 +38,8 @@
#include "gkStore.H"
#include "ovStore.H"
-//#define BUSTED
-#define FALCON
+#include "edlib.H"
-#ifdef FALCON
-#include "dw.H"
-#else
-#include "ssw_cpp.H"
-#endif
-
-#ifdef BUSTED
-#include "NDalign.H"
-#endif
#include "overlapReadCache.H"
#include "AS_UTL_reverseComplement.H"
@@ -70,12 +60,9 @@
#define BATCH_SIZE 1024 * 1024
#define THREAD_SIZE 128
-#ifdef FALCON
-// we don't alow slop because the ND aligner in falcon has no concept of non-global aligns. It always slams reads in so we find the conservative overlap and then we extend if it's close to a dovetail manually
-#define MHAP_SLOP 0
-#else
#define MHAP_SLOP 500
-#endif
+//#define DEBUG 1
+
overlapReadCache *rcache = NULL; // Used to be just 'cache', but that conflicted with -pg: /usr/lib/libc_p.a(msgcat.po):(.bss+0x0): multiple definition of `cache'
uint32 batchPrtID = 0; // When to report progress
@@ -83,7 +70,7 @@ uint32 batchPosID = 0; // The current position of the batch
uint32 batchEndID = 0; // The end of the batch
pthread_mutex_t balanceMutex;
-
+uint32 minOverlapLength = 0;
class workSpace {
@@ -96,25 +83,12 @@ public:
invertOverlaps = false;
gkpStore = NULL;
- align = NULL;
//analyze = NULL;
overlapsLen = 0;
overlaps = NULL;
readSeq = NULL;
};
~workSpace() {
-#ifdef BUSTED
- delete NDaln;
- NDaln = NULL;
-#endif
-#ifndef FALCON
- delete align;
- delete filter;
- align = filter = NULL;
-#else
- delete align;
- align=NULL;
-#endif
delete[] readSeq;
};
@@ -127,17 +101,6 @@ public:
gkStore *gkpStore;
-#ifdef BUSTED
- NDalign *NDaln;
-#endif
-#ifndef FALCON
- StripedSmithWaterman::Aligner *align;
- StripedSmithWaterman::Filter *filter;
-#else
- NDalignment::NDalignResult *align;
-#endif
- //analyzeAlignment *analyze;
-
uint32 overlapsLen; // Not used.
ovOverlap *overlaps;
};
@@ -157,13 +120,6 @@ getRange(uint32 &bgnID, uint32 &endID) {
if (endID > batchEndID)
endID = batchEndID;
-#if 0
- if (batchPosID >= batchPrtID) {
- fprintf(stderr, "getRange()-- at %u out of %u -- %6.2f%%\n", batchPosID, batchEndID, 100.0 * batchPosID / batchEndID);
- batchPrtID += batchEndID / 32;
- }
-#endif
-
pthread_mutex_unlock(&balanceMutex);
// If we're out of overlaps, batchPosID is more than batchEndID (from the last call to this
@@ -187,17 +143,10 @@ recomputeOverlaps(void *ptr) {
// Lazy allocation of the prefixEditDistance structure; it's slow.
-#ifdef BUSTED
- if (WA->NDaln == NULL)
- WA->NDaln = new NDalign(WA->partialOverlaps ? pedLocal : pedOverlap, WA->maxErate, 15);
-#endif
-
//if (WA->analyze == NULL)
// WA->analyze = new analyzeAlignment();
while (getRange(bgnID, endID)) {
- //fprintf(stderr, "Thread %2u computes overlaps %7u - %7u\n", WA->threadID, bgnID, endID);
-
double startTime = getTime();
for (uint32 oo=bgnID; oo<endID; oo++) {
@@ -209,29 +158,23 @@ recomputeOverlaps(void *ptr) {
WA->overlaps[oo].swapIDs(swapped); // Needs to be from a temporary!
}
-#if 0
- fprintf(stderr, "BEGIN overlap A %5u %5u-%5u B %5u %5u-%5u\n",
- ovl->a_iid, ovl->a_bgn(), ovl->b_end(),
- ovl->a_iid, ovl->b_bgn(), ovl->b_end());
-#endif
-
- // This closely follows readConsensus
-
-#if 0
- if (ovl->b_iid != 64)
- continue;
-#endif
-
// Invalidate the overlap.
ovl->evalue(AS_MAX_EVALUE);
+ ovl->dat.ovl.forOBT = false;
+ ovl->dat.ovl.forDUP = false;
+ ovl->dat.ovl.forUTG = false;
+
uint32 aID = ovl->a_iid;
uint32 bID = ovl->b_iid;
// Compute the overlap
+ if (ovl->a_end() - ovl->a_bgn() + 1 < minOverlapLength && ovl->b_end() - ovl->b_bgn() + 1 < minOverlapLength) {
+ continue;
+ }
-#if 0
+#ifdef DEBUG
nTested++;
if (nTested % 1000 == 0) {
double deltaTime = getTime() - startTime;
@@ -239,123 +182,173 @@ if (nTested % 1000 == 0) {
WA->threadID, bgnID, endID, deltaTime, (endID - bgnID) / deltaTime, nFailed, nPassed);
}
#endif
-
-#ifdef BUSTED
- WA->NDaln->initialize(aID, rcache->getRead(aID), rcache->getLength(aID), ovl->a_bgn(), ovl->a_end(),
- bID, rcache->getRead(bID), rcache->getLength(bID), ovl->b_bgn(), ovl->b_end(),
- ovl->flipped());
-
- if (WA->NDaln->findMinMaxDiagonal(40) == false) {
- fprintf(stderr, "A %6u %5d-%5d -> B %6u %5d-%5d %s ALIGN LENGTH TOO SHORT.\n",
- aID, ovl->a_bgn(), ovl->a_end(),
- bID, ovl->b_bgn(), ovl->b_end(),
- ovl->flipped() ? "<-" : "->");
- continue;
- }
-
- if (WA->NDaln->findSeeds(true) == false) {
- fprintf(stderr, "A %6u %5d-%5d -> B %6u %5d-%5d %s NO SEEDS.\n",
- aID, ovl->a_bgn(), ovl->a_end(),
- bID, ovl->b_bgn(), ovl->b_end(),
- ovl->flipped() ? "<-" : "->");
- continue;
- }
-
- if ((WA->NDaln->findHits() == true) &&
- (WA->NDaln->chainHits() == true) &&
- (WA->NDaln->processHits() == true)) {
-
- WA->align->display("MHAP align():", true);
-// fprintf(stderr, "Reads %d to %d, expected overlap %d - %d to %d - %d and found error rate %f from %d - %d and %d - %d\n", aID, bID, ovl->a_bgn(), ovl->a_end(), ovl->b_bgn(), ovl->b_end(), WA->NDaln->erate(), WA->NDaln->abgn(), WA->NDaln->aend(), WA->NDaln->bbgn(), WA->NDaln->bend());
-#else
- char *bRead = WA->readSeq;
- int32 astart = std::max((int32)0, (int32)ovl->a_bgn() - MHAP_SLOP);
- int32 aend = std::min((int32)rcache->getLength(aID), (int32)ovl->a_end() + MHAP_SLOP);
- int32 bstart = std::max((int32)0, (int32)ovl->b_bgn() - MHAP_SLOP);
- int32 bend = std::min((int32)rcache->getLength(bID), (int32)ovl->b_end() + MHAP_SLOP);
+ char *bRead = WA->readSeq;
+ int32 astart = (int32)ovl->a_bgn();
+ int32 aend = (int32)ovl->a_end();
+ int32 astartExtended = max((int32)0, (int32)ovl->a_bgn() - MHAP_SLOP);
+ int32 aendExtended = min((int32)rcache->getLength(aID), (int32)ovl->a_end() + MHAP_SLOP);
+ int32 bstart = (int32)ovl->b_bgn();
+ int32 bend = (int32)ovl->b_end();
+ int32 bstartExtended = max((int32)0, (int32)ovl->b_bgn() - MHAP_SLOP);
+ int32 bendExtended = min((int32)rcache->getLength(bID), (int32)ovl->b_end() + MHAP_SLOP);
strcpy(bRead, rcache->getRead(bID));
if (ovl->flipped()) {
reverseComplementSequence(bRead, rcache->getLength(bID));
- bstart = std::max((int32)0, (int32)rcache->getLength(bID) - (int32)ovl->b_bgn() - MHAP_SLOP);
- bend = std::min((int32)rcache->getLength(bID), (int32)rcache->getLength(bID) - (int32)ovl->b_end() + MHAP_SLOP);
+ bstart = (int32)rcache->getLength(bID) - (int32)ovl->b_bgn();
+ bend = (int32)rcache->getLength(bID) - (int32)ovl->b_end();
+ bstartExtended = max((int32)0, (int32)rcache->getLength(bID) - (int32)ovl->b_bgn() - MHAP_SLOP);
+ bendExtended = min((int32)rcache->getLength(bID), (int32)rcache->getLength(bID) - (int32)ovl->b_end() + MHAP_SLOP);
}
-#ifdef FALCON
- int tolerance = std::min(150, (int)round(0.5 * WA->maxErate * (rcache->getLength(aID) + rcache->getLength(bID))));
- WA->align->clear();
- bool aligned = NDalignment::align(bRead+bstart, bend-bstart+1, rcache->getRead(aID)+astart, aend-astart+1, tolerance, false, *(WA->align));
- NDalignment::NDalignResult& alignResult = *WA->align;
-
- uint32 alignmentLength = WA->align->_tgt_end - WA->align->_tgt_bgn + 1;
-#else
- StripedSmithWaterman::Alignment alignment;
- WA->align->Align(bRead+bstart, rcache->getRead(aID)+astart, aend-astart+1, *WA->filter, &alignment);
- NDalignment::NDalignResult alignResult;
-
- alignResult._dist = alignment.mismatches;
- alignResult._tgt_bgn = alignment.ref_begin;
- alignResult._tgt_end = alignment.ref_end;
- alignResult._qry_bgn = alignment.query_begin;
- alignResult._qry_end = alignment.query_end;
-
- uint32 alignmentLength = alignment.ref_end-alignment.ref_begin+1;
-#endif
+ int tolerance = (int)ceil((double)max(aendExtended-astartExtended, bendExtended-bstartExtended)*WA->maxErate*1.1);
+ EdlibAlignResult bQuery = edlibAlign(rcache->getRead(aID)+astart, aend-astart, bRead+bstartExtended, bendExtended-bstartExtended, edlibNewAlignConfig(tolerance, EDLIB_MODE_HW, EDLIB_TASK_LOC));
+ EdlibAlignResult aQuery = edlibAlign(bRead+bstart, bend-bstart, rcache->getRead(aID)+astartExtended, aendExtended-astartExtended, edlibNewAlignConfig(tolerance, EDLIB_MODE_HW, EDLIB_TASK_LOC));
- //fprintf(stderr, "Reads %d (%d) to %d (%d), expected overlap %d - %d to %d - %d and found error rate %f from %d - %d and %d - %d\n", aID, rcache->getLength(aID), bID, rcache->getLength(bID), ovl->a_bgn(), ovl->a_end(), ovl->b_bgn(), ovl->b_end(), (double)alignResult._dist/(alignmentLength),alignResult._tgt_bgn+astart, alignResult._tgt_end+astart-1, alignResult._qry_bgn+bstart, alignResult._qry_end+bstart-1);
+ uint32 alignmentLength = 0;
+ double dist = 0;
+
+#ifdef DEBUG
+fprintf(stderr, "Overlap between %d and %d at %d found %d %d hits\n", aID, bID, tolerance, bQuery.numLocations, aQuery.numLocations);
+#endif
+ if (aQuery.numLocations >= 1 || bQuery.numLocations >= 1) {
+ // if we couldn't find one of the options, try trimming and re-computing
+ if (bQuery.numLocations == 0) {
+ ovl->dat.ovl.ahg5 = aQuery.startLocations[0] + astartExtended;
+ ovl->dat.ovl.ahg3 = rcache->getLength(aID) - (aQuery.endLocations[0] + astartExtended + 1);
+ alignmentLength = max(alignmentLength, (uint32)(aQuery.endLocations[0] - aQuery.startLocations[0]));
+ dist = min(aQuery.editDistance, (int)dist);
+ edlibFreeAlignResult(bQuery);
+ bQuery = edlibAlign(rcache->getRead(aID)+astart, aend-astart, bRead+bstartExtended, bendExtended-bstartExtended, edlibNewAlignConfig(tolerance, EDLIB_MODE_HW, EDLIB_TASK_LOC));
+ }
+ if (aQuery.numLocations == 0) {
+ ovl->dat.ovl.bhg5 = bQuery.startLocations[0] + bstartExtended;
+ ovl->dat.ovl.bhg3 = rcache->getLength(bID) - (bQuery.endLocations[0] + bstartExtended + 1);
+ alignmentLength = bQuery.endLocations[0] - bQuery.startLocations[0];
+ dist = bQuery.editDistance;
+ edlibFreeAlignResult(aQuery);
+ aQuery = edlibAlign(bRead+bstart, bend-bstart, rcache->getRead(aID)+astartExtended, aendExtended-astartExtended, edlibNewAlignConfig(tolerance, EDLIB_MODE_HW, EDLIB_TASK_LOC));
+ }
+
+ // now update the trim points based on where the overlapping broke
+ // the aligner computes 0-based end positions so for matching ACGTA to ACTGTA positiosn are 0-4 so we need to adjust for that
+ if (bQuery.numLocations >= 1) {
+ ovl->dat.ovl.bhg5 = bQuery.startLocations[0] + bstartExtended;
+ ovl->dat.ovl.bhg3 = rcache->getLength(bID) - (bQuery.endLocations[0] + bstartExtended + 1);
+ alignmentLength = bQuery.endLocations[0] - bQuery.startLocations[0];
+ dist = bQuery.editDistance;
+ }
+ if (aQuery.numLocations >= 1) {
+ ovl->dat.ovl.ahg5 = aQuery.startLocations[0] + astartExtended;
+ ovl->dat.ovl.ahg3 = rcache->getLength(aID) - (aQuery.endLocations[0] + astartExtended + 1);
+ alignmentLength = max(alignmentLength, (uint32)(aQuery.endLocations[0] - aQuery.startLocations[0]));
+ dist = min(aQuery.editDistance, (int)dist);
+ }
+ edlibFreeAlignResult(aQuery);
+ edlibFreeAlignResult(bQuery);
+
+#ifdef DEBUG
+fprintf(stderr, "Expected overlap between %d and %d from %d - %d and %d - %d found overlap from %d - %d and %d - %d length %d dist %f\n", aID, bID, astart, aend, bstart, bend, ovl->a_bgn(), ovl->a_end(), ovl->b_bgn(), ovl->b_end(), alignmentLength, dist);
+#endif
- if (alignmentLength > 40 && ((double)alignResult._dist / (double) (alignmentLength)) < WA->maxErate) {
+ bool changed = true;
+ tolerance = (int)(alignmentLength * WA->maxErate) + 1;
+ EdlibAlignResult result;
+ // extend to the ends if we are not looking for partial and we can, don't extend contains
+ if (changed && WA->partialOverlaps == false && !ovl->overlapIsDovetail()) {
+ bstart = ovl->flipped() ? rcache->getLength(bID) - ovl->b_bgn() : ovl->b_bgn();
+ bend = ovl->flipped() ? rcache->getLength(bID) - ovl->b_end() : ovl->b_end();
+#ifdef DEBUG
+fprintf(stderr, "Overlap %d %d (%d %d) invert %d is %f error and not dovetail with %d %d and %d %d\n", aID, bID, rcache->getLength(aID) , rcache->getLength(bID), ovl->flipped(), dist/*result.editDistance*/, ovl->dat.ovl.ahg5, ovl->dat.ovl.ahg3, ovl->dat.ovl.bhg5, ovl->dat.ovl.bhg3);
#endif
- nPassed++;
+ // dist = result.editDistance;
+ // check these cases one by one and extend both concordantly with each other
+ // first is a contained in b
+ if (rcache->getLength(aID) <= rcache->getLength(bID) && ovl->dat.ovl.ahg5 >= 0 && ovl->dat.ovl.ahg3 >= 0 && ovl->dat.ovl.bhg5 >= ovl->dat.ovl.ahg5 && ovl->dat.ovl.bhg3 >= ovl->dat.ovl.ahg3 && ((double)(ovl->dat.ovl.ahg5 + ovl->dat.ovl.ahg3 + dist) / ((double)(alignmentLength + ovl->dat.ovl.ahg5 + ovl->dat.ovl.ahg3))) <= WA->maxErate) {
+ ovl->dat.ovl.bhg5 = max(0, ovl->dat.ovl.bhg5 - ovl->dat.ovl.ahg5); ovl->dat.ovl.ahg5 = 0;
+ ovl->dat.ovl.bhg3 = max(0, ovl->dat.ovl.bhg3 - ovl->dat.ovl.ahg3); ovl->dat.ovl.ahg3 = 0;
+ changed = true;
+#ifdef DEBUG
+fprintf(stderr, "Overlap %d %d case 1 acontained\n", aID, bID);
+#endif
+ }
+ // second is b contained (both b hangs can be extended)
+ //
+ else if (rcache->getLength(aID) >= rcache->getLength(bID) && ovl->dat.ovl.bhg5 >= 0 && ovl->dat.ovl.bhg3 >= 0 && ovl->dat.ovl.ahg5 >= ovl->dat.ovl.bhg5 && ovl->dat.ovl.ahg3 >= ovl->dat.ovl.bhg3 && ((double)(ovl->dat.ovl.bhg5 + ovl->dat.ovl.bhg3 + dist) / ((double)(alignmentLength + ovl->dat.ovl.bhg5 + ovl->dat.ovl.bhg3))) <= WA->maxErate) {
+ ovl->dat.ovl.ahg5 = max(0, ovl->dat.ovl.ahg5 - ovl->dat.ovl.bhg5); ovl->dat.ovl.bhg5 = 0;
+ ovl->dat.ovl.ahg3 = max(0, ovl->dat.ovl.ahg3 - ovl->dat.ovl.bhg3); ovl->dat.ovl.bhg3 = 0;
+ changed = true;
+#ifdef DEBUG
+fprintf(stderr, "Overlap %d %d case 2 bconatined\n", aID, bID);
+#endif
+ }
+ // third is 5' dovetail ---------->
+ // ---------->
+ // or
+ // <---------
+ // bhg5 here is always first overhang on b read
+ //
+ else if (ovl->dat.ovl.ahg3 <= ovl->dat.ovl.bhg3 && (ovl->dat.ovl.ahg3 >= 0 && ((double)(ovl->dat.ovl.ahg3 + dist) / ((double)(alignmentLength + ovl->dat.ovl.ahg3))) <= WA->maxErate) &&
+ (ovl->dat.ovl.bhg5 >= 0 && ((double)(ovl->dat.ovl.bhg5 + dist) / ((double)(alignmentLength + ovl->dat.ovl.bhg5))) <= WA->maxErate)) {
+ ovl->dat.ovl.ahg5 = max(0, ovl->dat.ovl.ahg5 - ovl->dat.ovl.bhg5); ovl->dat.ovl.bhg5 = 0;
+ ovl->dat.ovl.bhg3 = max(0, ovl->dat.ovl.bhg3 - ovl->dat.ovl.ahg3); ovl->dat.ovl.ahg3 = 0;
+ changed = true;
+#ifdef DEBUG
+fprintf(stderr, "Overlap %d %d case 3 5' dovetail \n", aID, bID);
+#endif
+ }
+ //
+ // fourth is 3' dovetail ---------->
+ // ---------->
+ // or
+ // <----------
+ // bhg5 is always first overhang on b read
+ else if (ovl->dat.ovl.ahg5 <= ovl->dat.ovl.bhg5 && (ovl->dat.ovl.ahg5 >= 0 && ((double)(ovl->dat.ovl.ahg5 + dist) / ((double)(alignmentLength + ovl->dat.ovl.ahg5))) <= WA->maxErate) &&
+ (ovl->dat.ovl.bhg3 >= 0 && ((double)(ovl->dat.ovl.bhg3 + dist) / ((double)(alignmentLength + ovl->dat.ovl.bhg3))) <= WA->maxErate)) {
+ ovl->dat.ovl.bhg5 = max(0, ovl->dat.ovl.bhg5 - ovl->dat.ovl.ahg5); ovl->dat.ovl.ahg5 = 0;
+ ovl->dat.ovl.ahg3 = max(0, ovl->dat.ovl.ahg3 - ovl->dat.ovl.bhg3); ovl->dat.ovl.bhg3 = 0;
+ changed = true;
+#ifdef DEBUG
+fprintf(stderr, "Overlap %d %d case 4 3' dovetail \n", aID, bID);
+#endif
+ }
+ }
- //WA->align->display();
-
-#ifdef BUSTED
- ovl->dat.ovl.bhg5 = WA->NDaln->bhg5();
- ovl->dat.ovl.bhg3 = WA->NDaln->bhg3();
- ovl->dat.ovl.ahg5 = WA->NDaln->ahg5();
- ovl->dat.ovl.ahg3 = WA->NDaln->ahg3();
- ovl->erate(WA->NDaln->erate());
-#else
- ovl->dat.ovl.ahg5 = alignResult._tgt_bgn+astart;
- ovl->dat.ovl.ahg3 = rcache->getLength(aID) - (alignResult._tgt_end+astart - 1);
- ovl->dat.ovl.bhg5 = alignResult._qry_bgn + bstart;
- ovl->dat.ovl.bhg3 = rcache->getLength(bID) - (alignResult._qry_end + bstart - 1);
- // check for almost dovetail if we're not looking for partial and extend
- if (WA->partialOverlaps == false) {
- if ((double)(alignResult._dist + ovl->dat.ovl.ahg5) / (alignmentLength+ovl->dat.ovl.ahg5) < WA->maxErate) {
- alignResult._dist += ovl->dat.ovl.ahg5;
- alignmentLength += ovl->dat.ovl.ahg5;
- ovl->dat.ovl.ahg5 = 0;
- }
- if ((double)(alignResult._dist + ovl->dat.ovl.ahg3) / (alignmentLength+ovl->dat.ovl.ahg3) < WA->maxErate) {
- alignResult._dist += ovl->dat.ovl.ahg3;
- alignmentLength += ovl->dat.ovl.ahg3;
- ovl->dat.ovl.ahg3 = 0;
- }
- if ((double)(alignResult._dist + ovl->dat.ovl.bhg5) / (alignmentLength+ovl->dat.ovl.bhg5) < WA->maxErate) {
- alignResult._dist += ovl->dat.ovl.bhg5;
- alignmentLength += ovl->dat.ovl.bhg5;
- ovl->dat.ovl.bhg5 = 0;
- }
- if ((double)(alignResult._dist + ovl->dat.ovl.bhg3) / (alignmentLength+ovl->dat.ovl.bhg3) < WA->maxErate) {
- alignResult._dist += ovl->dat.ovl.bhg3;
- alignmentLength += ovl->dat.ovl.bhg3;
- ovl->dat.ovl.bhg3 = 0;
- }
+#ifdef DEBUG
+fprintf(stderr, "Recomputed overlap from %d to %d is %d - %d and %d - %d\n", aID, bID, ovl->a_bgn(), ovl->a_end(), (ovl->flipped() ? rcache->getLength(bID) - ovl->b_bgn() : ovl->b_bgn()), (ovl->flipped() ? rcache->getLength(bID) - ovl->b_end() : ovl->b_end()));
+#endif
+ // now compute the final
+ if (changed) {
+ bstart = ovl->flipped() ? rcache->getLength(bID) - ovl->b_bgn() : ovl->b_bgn();
+ bend = ovl->flipped() ? rcache->getLength(bID) - ovl->b_end() : ovl->b_end();
+ result = edlibAlign(rcache->getRead(aID)+ovl->a_bgn(), ovl->a_end()-ovl->a_bgn(), bRead+bstart, bend-bstart, edlibNewAlignConfig(tolerance, EDLIB_MODE_NW, EDLIB_TASK_LOC));
+ if (result.numLocations >= 1) {
+ dist = result.editDistance;
+ alignmentLength = ovl->a_end() - ovl->a_bgn();
+ } else {
+ dist = ovl->a_end() - ovl->a_bgn();
+ alignmentLength = 0;
}
- ovl->erate((double)alignResult._dist/(alignmentLength));
- // fprintf(stderr, "Reads %d (%d) to %d (%d), updated overlap to be %d - %d to %d - %d at error rate %f\n", aID, rcache->getLength(aID), bID, rcache->getLength(bID), ovl->a_bgn(), ovl->a_end(), ovl->b_bgn(), ovl->b_end(), ovl->erate());
- //
+ edlibFreeAlignResult(result);
+ }
+#ifdef DEBUG
+fprintf(stderr, "Done and error rate for this overlap between %d and %d is %d bp and %f errors is dovetail %d\n", aID, bID, alignmentLength, dist, ovl->overlapIsDovetail());
#endif
+ } else {
+ edlibFreeAlignResult(aQuery);
+ edlibFreeAlignResult(bQuery);
+ }
+ if (alignmentLength >= minOverlapLength && (dist / (double) (alignmentLength)) <= WA->maxErate) {
+ nPassed++;
+
+ ovl->erate((double)dist/(alignmentLength));
ovl->dat.ovl.forOBT = (WA->partialOverlaps == true);
ovl->dat.ovl.forDUP = (WA->partialOverlaps == true);
ovl->dat.ovl.forUTG = (WA->partialOverlaps == false) && (ovl->overlapIsDovetail() == true);
} else {
nFailed++;
-
ovl->evalue(AS_MAX_EVALUE);
ovl->dat.ovl.forOBT = false;
@@ -364,7 +357,7 @@ if (nTested % 1000 == 0) {
}
}
-#if 0
+#ifdef DEBUG
double deltaTime = getTime() - startTime;
fprintf(stderr, "Thread %2u computed overlaps %7u - %7u in %7.3f seconds - %6.2f olaps per second (%8u fail %8u pass)\n",
WA->threadID, bgnID, endID, deltaTime, (endID - bgnID) / deltaTime, nFailed, nPassed);
@@ -436,6 +429,9 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-memory") == 0) {
memLimit = atoi(argv[++arg]);
+ } else if (strcmp(argv[arg], "-len") == 0) {
+ minOverlapLength = atoi(argv[++arg]);
+
} else {
err++;
}
@@ -481,14 +477,16 @@ main(int argc, char **argv) {
gkStore *gkpStore = gkStore::gkStore_open(gkpName);
- ovStore *ovlStore = NULL, *ovlStoreOut = NULL;
- ovFile *ovlFile = NULL, *ovlFileOut = NULL;
+ ovStore *ovlStore = NULL;
+ ovStoreWriter *outStore = NULL;
+ ovFile *ovlFile = NULL;
+ ovFile *outFile = NULL;
if (AS_UTL_fileExists(ovlName, true)) {
fprintf(stderr, "Reading overlaps from store '%s' and writing to '%s'\n",
ovlName, outName);
- ovlStore = new ovStore(ovlName, gkpStore);
- ovlStoreOut = new ovStore(outName, gkpStore, ovStoreWrite);
+ ovlStore = new ovStore(ovlName, gkpStore);
+ outStore = new ovStoreWriter(outName, gkpStore);
if (bgnID < 1)
bgnID = 1;
@@ -500,8 +498,8 @@ main(int argc, char **argv) {
} else {
fprintf(stderr, "Reading overlaps from file '%s' and writing to '%s'\n",
ovlName, outName);
- ovlFile = new ovFile(ovlName, ovFileFull);
- ovlFileOut = new ovFile(outName, ovFileFullWrite);
+ ovlFile = new ovFile(gkpStore, ovlName, ovFileFull);
+ outFile = new ovFile(gkpStore, outName, ovFileFullWrite);
}
workSpace *WA = new workSpace [numThreads];
@@ -515,26 +513,17 @@ main(int argc, char **argv) {
// Initialize thread work areas. Mirrored from overlapInCore.C
for (uint32 tt=0; tt<numThreads; tt++) {
- fprintf(stderr, "Initialize thread %u\n", tt);
+ fprintf(stderr, "Initialize thread %u\n", tt);
- WA[tt].threadID = tt;
- WA[tt].maxErate = maxErate;
- WA[tt].partialOverlaps = partialOverlaps;
- WA[tt].invertOverlaps = invertOverlaps;
+ WA[tt].threadID = tt;
+ WA[tt].maxErate = maxErate;
+ WA[tt].partialOverlaps = partialOverlaps;
+ WA[tt].invertOverlaps = invertOverlaps;
- WA[tt].gkpStore = gkpStore;
-#ifndef BUSTED
- WA[tt].align = NULL;
-#endif
- WA[tt].overlaps = NULL;
+ WA[tt].gkpStore = gkpStore;
+ WA[tt].overlaps = NULL;
// preallocate some work thread memory for common tasks to avoid allocation
-#ifndef FALCON
- WA[tt].align = new StripedSmithWaterman::Aligner(1, 3, 3, 1);
- WA[tt].filter = new StripedSmithWaterman::Filter();
-#else
- WA[tt].align = new NDalignment::NDalignResult();
-#endif
WA[tt].readSeq = new char[AS_MAX_READLEN+1];
}
@@ -624,9 +613,9 @@ main(int argc, char **argv) {
if (ovlStore)
for (uint64 oo=0; oo<*overlapsLen; oo++)
- ovlStoreOut->writeOverlap(overlaps + oo);
+ outStore->writeOverlap(overlaps + oo);
if (ovlFile)
- ovlFileOut->writeOverlaps(overlaps, *overlapsLen);
+ outFile->writeOverlaps(overlaps, *overlapsLen);
// Load more overlaps
@@ -662,10 +651,10 @@ main(int argc, char **argv) {
gkpStore->gkStore_close();
delete ovlStore;
- delete ovlStoreOut;
+ delete outStore;
delete ovlFile;
- delete ovlFileOut;
+ delete outFile;
delete [] overlapsA;
delete [] overlapsB;
diff --git a/src/overlapInCore/overlapPair.mk b/src/overlapInCore/overlapPair.mk
index 2724d8c..9efdb95 100644
--- a/src/overlapInCore/overlapPair.mk
+++ b/src/overlapInCore/overlapPair.mk
@@ -10,7 +10,7 @@ endif
TARGET := overlapPair
SOURCES := overlapPair.C
-SRC_INCDIRS := .. ../AS_UTL ../stores ../meryl/libleaff liboverlap ../utgcns/libNDFalcon
+SRC_INCDIRS := .. ../AS_UTL ../stores ../meryl/libleaff libedlib
TGT_LDFLAGS := -L${TARGET_DIR}
TGT_LDLIBS := -lleaff -lcanu
diff --git a/src/overlapInCore/overlapReadCache.C b/src/overlapInCore/overlapReadCache.C
index 1e88a88..1415a1e 100644
--- a/src/overlapInCore/overlapReadCache.C
+++ b/src/overlapInCore/overlapReadCache.C
@@ -19,6 +19,10 @@
* are Copyright 2015 Battelle National Biodefense Institute, and
* are subject to the BSD 3-Clause License
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -191,7 +195,7 @@ overlapReadCache::purgeReads(void) {
while ((memoryLimit < memoryUsed) &&
(maxAge > 1)) {
- fprintf(stderr, "purgeReads()-- used "F_U64"MB limit "F_U64"MB -- purge age "F_U32"\n", memoryUsed >> 20, memoryLimit >> 20, maxAge);
+ fprintf(stderr, "purgeReads()-- used " F_U64 "MB limit " F_U64 "MB -- purge age " F_U32 "\n", memoryUsed >> 20, memoryLimit >> 20, maxAge);
for (uint32 rr=0; rr<=nReads; rr++) {
if (maxAge == readAge[rr]) {
diff --git a/src/pipelines/canu.pl b/src/pipelines/canu.pl
index 17ce844..b0c4e9e 100644
--- a/src/pipelines/canu.pl
+++ b/src/pipelines/canu.pl
@@ -131,10 +131,12 @@ while (scalar(@ARGV)) {
$wrk = shift @ARGV;
$wrk = "$ENV{'PWD'}/$wrk" if ($wrk !~ m!^/!);
addCommandLineOption("-d \"$wrk\"");
+ setGlobal("onExitDir", $wrk);
} elsif ($arg eq "-p") {
$asm = shift @ARGV;
addCommandLineOption("-p \"$asm\"");
+ setGlobal("onExitNam", $asm);
} elsif ($arg eq "-s") {
my $spec = shift @ARGV;
@@ -160,28 +162,10 @@ while (scalar(@ARGV)) {
$mode = $step = "trim-assemble";
addCommandLineOption("-trim-assemble");
- } elsif (($arg eq "-pacbio-raw") ||
- ($arg eq "-pacbio-corrected") ||
+ } elsif (($arg eq "-pacbio-raw") || # File handling is also present in
+ ($arg eq "-pacbio-corrected") || # Defaults.pm around line 438
($arg eq "-nanopore-raw") ||
($arg eq "-nanopore-corrected")) {
- if ($arg =~ m/pacbio/) {
- setErrorRate(0.025);
- setGlobal("corErrorRate", "0.30");
- setGlobal("cnsMaxCoverage", 40);
- setGlobal("utgGraphDeviation", 6);
- } elsif ($arg =~ m/nanopore/) {
- setErrorRate(0.048);
- setGlobal("corErrorRate", "0.50");
- setGlobal("cnsMaxCoverage", 40);
- setGlobal("utgGraphDeviation", 6);
- }
-
- $mode = "trim-assemble" if (!defined($mode) && ($arg =~ m/corrected/));
- $mode = "run" if (!defined($mode) && ($arg =~ m/raw/));
-
- $haveCorrected = 1 if ($arg =~ m/corrected/);
- $haveRaw = 1 if ($arg =~ m/raw/);
-
addCommandLineError("ERROR: File '$ARGV[0]' not found.\n") if (! -e $ARGV[0]);
while (-e $ARGV[0]) {
@@ -205,10 +189,6 @@ while (scalar(@ARGV)) {
}
}
-# Fail if both raw and corrected are supplied.
-
-addCommandLineError("ERROR: Canu does not currently support mixing raw and corrected sequences.\n") if ($haveRaw && $haveCorrected);
-
# Fail if some obvious things aren't set.
addCommandLineError("ERROR: Assembly name prefix not supplied with -p.\n") if (!defined($asm));
@@ -236,8 +216,33 @@ foreach my $specFile (@specFiles) {
setParametersFromCommandLine(@specOpts);
-# When resuming a run without input files, set the error rates based on library type in the gkpStore. If the user set error, do nothing
-# check if we have gkpStores but no input files and reset error rates based on gkpStore
+# Set parameters based on file types supplied.
+
+foreach my $typefile (@inputFiles) {
+ my ($type, $file) = split '\0', $typefile;
+
+ $mode = "trim-assemble" if (!defined($mode) && ($type =~ m/corrected/));
+ $mode = "run" if (!defined($mode) && ($type =~ m/raw/));
+
+ $haveCorrected = 1 if ($type =~ m/corrected/);
+ $haveRaw = 1 if ($type =~ m/raw/);
+
+ setErrorRate(0.015, 0) if ($type =~ m/pacbio/);
+ setGlobal("corErrorRate", "0.30") if ($type =~ m/pacbio/);
+
+ setErrorRate(0.048, 0) if ($type =~ m/nanopore/);
+ setGlobal("corErrorRate", "0.50") if ($type =~ m/nanopore/);
+}
+
+# Fail if both raw and corrected are supplied.
+
+addCommandLineError("ERROR: Canu does not currently support mixing raw and corrected sequences.\n") if ($haveRaw && $haveCorrected);
+
+# When resuming a run without input files, set the error rates based on library type in the
+# gkpStore. If the user set the error rate already, do nothing.
+#
+# Also, check if we have gkpStores but no input files and reset error rates based on gkpStore.
+
if (scalar(@inputFiles) == 0 && ! defined(getGlobal("errorRate"))) {
my $gkpStore = undef;
$gkpStore = "$wrk/correction/$asm.gkpStore" if -e "$wrk/correction/$asm.gkpStore/libraries.txt";
@@ -246,7 +251,7 @@ if (scalar(@inputFiles) == 0 && ! defined(getGlobal("errorRate"))) {
# set to the default if we can't find anything
if (!defined($gkpStore)) {
- setErrorRate(0.01);
+ setErrorRate(0.01, 0);
} else {
my $numPacBioRaw = 0;
my $numPacBioCorrected = 0;
@@ -261,12 +266,12 @@ if (scalar(@inputFiles) == 0 && ! defined(getGlobal("errorRate"))) {
$numNanoporeCorrected++ if (m/nanopore-corrected/);
}
if ($numPacBioRaw > 0 || $numPacBioCorrected > 0) {
- setErrorRate(0.025);
+ setErrorRate(0.015, 0);
setGlobal("corErrorRate", "0.30");
setGlobal("cnsMaxCoverage", 40);
}
if ($numNanoporeRaw > 0 || $numNanoporeCorrected > 0) {
- setErrorRate(0.048);
+ setErrorRate(0.048, 0);
setGlobal("corErrorRate", "0.50");
setGlobal("cnsMaxCoverage", 40);
}
@@ -283,6 +288,19 @@ $bin = getBinDirectory();
printHelp();
+# Now that we know the bin directory, print the version so those pesky users
+# will (hopefully) include it when they paste in logs.
+
+printVersion($bin);
+
+# Check java and gnuplot.
+
+checkJava();
+checkGnuplot();
+
+# And one last chance to fail - because java and gnuplot both can set an error.
+
+printHelp();
# Detect grid support. If 'gridEngine' isn't set, the execution methods submitScript() and
# submitOrRunParallelJob() will return without submitting, or run locally (respectively). This
@@ -534,10 +552,7 @@ if (setOptions($mode, "assemble") eq "assemble") {
consensusLoad($wrk, $asm);
consensusAnalyze($wrk, $asm);
- outputGraph($wrk, $asm);
- outputLayout($wrk, $asm);
- outputSequence($wrk, $asm);
- outputSummary($wrk, $asm);
+ generateOutputs($wrk, $asm);
}
exit(0);
diff --git a/src/pipelines/canu/Configure.pm b/src/pipelines/canu/Configure.pm
index 36b7d2a..c03d4aa 100644
--- a/src/pipelines/canu/Configure.pm
+++ b/src/pipelines/canu/Configure.pm
@@ -195,9 +195,11 @@ sub getAllowedResources ($$$$) {
foreach my $g (@grid) {
my ($cpu, $mem, $num) = split '-', $g;
- push @gridCor, $cpu;
- push @gridMem, $mem;
- push @gridNum, $num;
+ if (($cpu > 0) && ($mem > 0) && ($num > 0)) {
+ push @gridCor, $cpu;
+ push @gridMem, $mem;
+ push @gridNum, $num;
+ }
}
} else {
push @gridCor, $maxThreads;
@@ -290,6 +292,10 @@ sub getAllowedResources ($$$$) {
$memory += $np * $m;
}
+ # If no cores, then all machines were too small.
+
+ next if ($cores == 0);
+
# Save the best one seen so far.
if ($bestCores <= $cores) {
@@ -357,7 +363,7 @@ sub getAllowedResources ($$$$) {
my $nam;
if ($alg eq "bat") { $nam = "bogart (unitigger)"; }
- elsif ($alg eq "cns") { $nam = "utgcns (consensus"; }
+ elsif ($alg eq "cns") { $nam = "utgcns (consensus)"; }
elsif ($alg eq "cor") { $nam = "falcon_sense (read correction)"; }
elsif ($alg eq "meryl") { $nam = "meryl (k-mer counting)"; }
elsif ($alg eq "oea") { $nam = "overlap error adjustment"; }
diff --git a/src/pipelines/canu/Consensus.pm b/src/pipelines/canu/Consensus.pm
index 3103f17..7f03b6a 100644
--- a/src/pipelines/canu/Consensus.pm
+++ b/src/pipelines/canu/Consensus.pm
@@ -53,63 +53,47 @@ use canu::Unitig;
use canu::HTML;
-sub computeNumberOfConsensusJobs ($$) {
- my $wrk = shift @_; # Local work directory
- my $asm = shift @_;
-
- my $jobs = 0;
- open(F, "< $wrk/4-unitigger/$asm.partitioningInfo") or caExit("can't open '$wrk/4-unitigger/$asm.partitioningInfo' for reading: $!", undef);
- while (<F>) {
- if (m/Partition\s+(\d+)\s+has\s+(\d+)\s+unitigs\sand\s+(\d+)\s+fragments./) {
- $jobs = $1;
- }
- }
- close(F);
-
- return($jobs);
-}
-
-
-
-sub utgcns ($$$) {
- my $wrk = shift @_; # Local work directory
- my $asm = shift @_;
- my $jobs = shift @_;
-
- #getAllowedResources("", "cns");
+sub utgcns ($$$$) {
+ my $wrk = shift @_; # Local work directory
+ my $asm = shift @_;
+ my $ctgjobs = shift @_;
+ my $utgjobs = shift @_;
+ my $jobs = $ctgjobs + $utgjobs;
open(F, "> $wrk/5-consensus/consensus.sh") or caExit("can't open '$wrk/5-consensus/consensus.sh' for writing: $!", undef);
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
print F "if [ \$jobid -gt $jobs ]; then\n";
print F " echo Error: Only $jobs partitions, you asked for \$jobid.\n";
print F " exit 1\n";
print F "fi\n";
print F "\n";
+ print F "if [ \$jobid -le $ctgjobs ] ; then\n";
+ print F " tag=\"ctg\"\n";
+ print F "else\n";
+ print F " tag=\"utg\"\n";
+ print F " jobid=`expr \$jobid - $ctgjobs`\n";
+ print F "fi\n";
+ print F "\n";
print F "jobid=`printf %04d \$jobid`\n";
print F "\n";
- print F "if [ -e $wrk/5-consensus/\$jobid.cns ] ; then\n";
+ print F "if [ ! -d $wrk/5-consensus/\${tag}cns ] ; then\n";
+ print F " mkdir -p $wrk/5-consensus/\${tag}cns\n";
+ print F "fi\n";
+ print F "\n";
+ print F "if [ -e $wrk/5-consensus/\${tag}cns/\$jobid.cns ] ; then\n";
print F " exit 0\n";
print F "fi\n";
print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
print F "\$bin/utgcns \\\n";
- print F " -G $wrk/$asm.gkpStore \\\n";
- print F " -T $wrk/$asm.tigStore 1 \$jobid \\\n";
- print F " -O $wrk/5-consensus/\$jobid.cns.WORKING \\\n";
- #print F " -L $wrk/5-consensus/\$jobid.layout.WORKING \\\n";
- #print F " -Q $wrk/5-consensus/\$jobid.fastq.WORKING \\\n";
+ print F " -G $wrk/$asm.\${tag}Store/partitionedReads.gkpStore \\\n"; # Optional; utgcns will default to this
+ print F " -T $wrk/$asm.\${tag}Store 1 \$jobid \\\n";
+ print F " -O $wrk/5-consensus/\${tag}cns/\$jobid.cns.WORKING \\\n";
print F " -maxcoverage " . getGlobal('cnsMaxCoverage') . " \\\n";
print F " -e " . getGlobal("cnsErrorRate") . " \\\n";
print F " -quick \\\n" if (getGlobal("cnsConsensus") eq "quick");
@@ -117,11 +101,7 @@ sub utgcns ($$$) {
print F " -utgcns \\\n" if (getGlobal("cnsConsensus") eq "utgcns");
print F " -threads " . getGlobal("cnsThreads") . " \\\n";
print F "&& \\\n";
- print F "mv $wrk/5-consensus/\$jobid.cns.WORKING $wrk/5-consensus/\$jobid.cns \\\n";
- #print F "&& \\\n";
- #print F "mv $wrk/5-consensus/\$jobid.layout.WORKING $wrk/5-consensus/\$jobid.layout \\\n";
- #print F "&& \\\n";
- #print F "mv $wrk/5-consensus/\$jobid.fastq.WORKING $wrk/5-consensus/\$jobid.fastq\n";
+ print F "mv $wrk/5-consensus/\${tag}cns/\$jobid.cns.WORKING $wrk/5-consensus/\${tag}cns/\$jobid.cns \\\n";
print F "\n";
print F "exit 0\n";
@@ -130,6 +110,72 @@ sub utgcns ($$$) {
+sub cleanupPartitions ($$$) { # Remove the partitioned gkpStore under the ${tag}Store when its partition map is older than the tig output, so it gets rebuilt.
+ my $wrk = shift @_; # Local work directory that holds the stores
+ my $asm = shift @_; # Assembly name prefix
+ my $tag = shift @_; # Store tag; the call sites in this file pass "ctg" and "utg"
+
+ return if (! -e "$wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions/map"); # No partition map yet, nothing to clean up
+
+ my $gkpTime = -M "$wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions/map"; # -M yields file age (in days), so larger means older
+ my $tigTime = -M "$wrk/$asm.ctgStore/seqDB.v001.tig"; # NOTE(review): always read from the ctgStore, even when $tag is "utg" -- confirm this is intended
+
+ return if ($gkpTime <= $tigTime); # Partition map is not older than the tigs; keep it
+
+ print STDERR "-- Partitioned gkpStore is older than tigs, rebuild partitioning (gkpStore $gkpTime days old; ctgStore $tigTime days old).\n";
+
+ if (runCommandSilently($wrk, "rm -rf $wrk/$asm.${tag}Store/partitionedReads.gkpStore", 1)) { # A true return is treated as failure of the rm
+ caExit("failed to remove old partitions ($wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions), can't continue until these are removed", undef);
+ }
+}
+
+
+
+sub partitionReads ($$$) { # Run gatekeeperPartition for the ${tag}Store unless its partition map already exists.
+ my $wrk = shift @_; # Local work directory that holds the stores
+ my $asm = shift @_; # Assembly name prefix
+ my $tag = shift @_; # Store tag; the call sites in this file pass "ctg" and "utg"
+ my $bin = getBinDirectory();
+ my $cmd;
+
+ return if (-e "$wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions/map"); # Already partitioned, nothing to do
+
+ $cmd = "$bin/gatekeeperPartition \\\n";
+ $cmd .= " -G $wrk/$asm.gkpStore \\\n";
+ $cmd .= " -T $wrk/$asm.${tag}Store 1 \\\n";
+ $cmd .= " -b " . getGlobal("cnsPartitionMin") . " \\\n" if (defined(getGlobal("cnsPartitionMin"))); # Option is emitted only when cnsPartitionMin is defined
+ $cmd .= " -p " . getGlobal("cnsPartitions") . " \\\n" if (defined(getGlobal("cnsPartitions"))); # Option is emitted only when cnsPartitions is defined
+ $cmd .= "> $wrk/$asm.${tag}Store/partitionedReads.err 2>&1"; # stdout and stderr both go to the .err file named in the failure report below
+
+ stopBefore("consensusConfigure", $cmd);
+
+ if (runCommand("$wrk", $cmd)) { # A true return is treated as failure of the command
+ caExit("failed to partition the reads", "$wrk/$asm.${tag}Store/partitionedReads.err");
+ }
+}
+
+
+
+sub computeNumberOfConsensusJobs ($$$) { # Derive the consensus job count for the ${tag}Store from the numeric suffixes of its partitions/blobs.* files.
+ my $wrk = shift @_; # Local work directory that holds the stores
+ my $asm = shift @_; # Assembly name prefix
+ my $tag = shift @_; # Store tag; the call sites in this file pass "ctg" and "utg"
+ my $jobs = 0; # Remains 0 when no listed name matches
+ my $bin = getBinDirectory(); # NOTE(review): $bin is not used anywhere in this sub
+
+ open(F, "ls $wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions/blobs.* |") or caExit("failed to find partitioned files in '$wrk/$asm.${tag}Store/partitionedReads.gkpStore/partitions/blobs.*': $!", undef); # File names are read from an 'ls' pipe
+ while (<F>) {
+ if (m/blobs.(\d+)$/) { # Captures the trailing digits; the '.' is unescaped so it matches any character
+ $jobs = int($1); # Last matching line wins; presumably the highest partition number given ls ordering -- verify
+ }
+ }
+ close(F);
+
+ return($jobs);
+}
+
+
+
sub consensusConfigure ($$) {
my $WRK = shift @_; # Root work directory
my $wrk = "$WRK/unitigging"; # Local work directory
@@ -139,51 +185,34 @@ sub consensusConfigure ($$) {
my $path = "$wrk/5-consensus";
goto allDone if (skipStage($WRK, $asm, "consensusConfigure") == 1);
- goto allDone if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");
+ goto allDone if ((-e "$wrk/$asm.ctgStore/seqDB.v002.tig") &&
+ (-e "$wrk/$asm.utgStore/seqDB.v002.tig"));
make_path("$path") if (! -d "$path");
- # If the gkpStore partitions are older than the tigStore unitig output, assume the unitigs have
+ # If the gkpStore partitions are older than the ctgStore unitig output, assume the unitigs have
# changed and remove the gkpStore partition. -M is (annoyingly) 'file age', so we need to
# rebuild if gkp is older (larger) than tig.
- if (-e "$wrk/$asm.gkpStore/partitions/map") {
- my $gkpTime = -M "$wrk/$asm.gkpStore/partitions/map";
- my $tigTime = -M "$wrk/$asm.tigStore/seqDB.v001.tig";
-
- if ($gkpTime > $tigTime) {
- print STDERR "-- Partitioned gkpStore is older than tigs, rebuild partitioning (gkpStore $gkpTime days old; tigStore $tigTime days old).\n";
+ cleanupPartitions($wrk, $asm, "ctg");
+ cleanupPartitions($wrk, $asm, "utg");
- if (runCommandSilently($wrk, "rm -rf $wrk/$asm.gkpStore/partitions", 1)) {
- caExit("failed to remove old partitions ($wrk/$asm.gkpStore/partitions), can't continue until these are removed", undef);
- }
- }
- }
-
- # Partition gkpStore if needed.
-
- if (! -e "$wrk/$asm.gkpStore/partitions/map") {
- $cmd = "$bin/gatekeeperPartition \\\n";
- $cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -P $wrk/4-unitigger/$asm.partitioning \\\n";
- $cmd .= "> $path/$asm.partitioned.err 2>&1";
-
- stopBefore("consensusConfigure", $cmd);
+ # Partition gkpStore if needed. Yeah, we could create both at the same time, with significant
+ # effort in coding it up.
- if (runCommand("$path", $cmd)) {
- caExit("failed to partition the reads", "$path/$asm.partitioned.err");
- }
- }
+ partitionReads($wrk, $asm, "ctg");
+ partitionReads($wrk, $asm, "utg");
# Set up the consensus compute. It's in a useless if chain because there used to be
# different executables; now they're all rolled into utgcns itself.
- my $jobs = computeNumberOfConsensusJobs($wrk, $asm);
+ my $ctgjobs = computeNumberOfConsensusJobs($wrk, $asm, "ctg");
+ my $utgjobs = computeNumberOfConsensusJobs($wrk, $asm, "utg");
if ((getGlobal("cnsConsensus") eq "quick") ||
(getGlobal("cnsConsensus") eq "pbdagcon") ||
(getGlobal("cnsConsensus") eq "utgcns")) {
- utgcns($wrk, $asm, $jobs);
+ utgcns($wrk, $asm, $ctgjobs, $utgjobs);
} else {
caFailure("unknown consensus style '" . getGlobal("cnsConsensus") . "'", undef);
@@ -195,7 +224,7 @@ sub consensusConfigure ($$) {
stopAfter("consensusConfigure");
allDone:
- print STDERR "-- Configured ", computeNumberOfConsensusJobs($wrk, $asm), " consensus jobs.\n";
+ print STDERR "-- Configured $ctgjobs contig and $utgjobs unitig consensus jobs.\n";
}
@@ -212,37 +241,49 @@ sub consensusCheck ($$) {
my $path = "$wrk/5-consensus";
goto allDone if (skipStage($WRK, $asm, "consensusCheck", $attempt) == 1);
- goto allDone if (-e "$path/cnsjob.files");
- goto allDone if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");
+ goto allDone if ((-e "$path/ctgcns.files") && (-e "$path/utgcns.files"));
+ goto allDone if (-e "$wrk/$asm.ctgStore/seqDB.v002.tig");
# Figure out if all the tasks finished correctly.
- my $jobs = computeNumberOfConsensusJobs($wrk, $asm);
+ my $ctgjobs = computeNumberOfConsensusJobs($wrk, $asm, "ctg");
+ my $utgjobs = computeNumberOfConsensusJobs($wrk, $asm, "utg");
+ my $jobs = $ctgjobs + $utgjobs;
my $currentJobID = "0001";
- my @successJobs;
+ my $tag = "ctgcns";
+
+ my @ctgSuccessJobs;
+ my @utgSuccessJobs;
my @failedJobs;
my $failureMessage = "";
for (my $job=1; $job <= $jobs; $job++) {
- if (-e "$path/$currentJobID.cns") {
- push @successJobs, "$path/$currentJobID.cns\n";
+ if (-e "$path/$tag/$currentJobID.cns") {
+ push @ctgSuccessJobs, "$path/$tag/$currentJobID.cns\n" if ($tag eq "ctgcns");
+ push @utgSuccessJobs, "$path/$tag/$currentJobID.cns\n" if ($tag eq "utgcns");
- } elsif (-e "$path/$currentJobID.cns.gz") {
- push @successJobs, "$path/$currentJobID.cns.gz\n";
+ } elsif (-e "$path/$tag/$currentJobID.cns.gz") {
+ push @ctgSuccessJobs, "$path/$tag/$currentJobID.cns.gz\n" if ($tag eq "ctgcns");
+ push @utgSuccessJobs, "$path/$tag/$currentJobID.cns.gz\n" if ($tag eq "utgcns");
- } elsif (-e "$path/$currentJobID.cns.bz2") {
- push @successJobs, "$path/$currentJobID.cns.bz2\n";
+ } elsif (-e "$path/$tag/$currentJobID.cns.bz2") {
+ push @ctgSuccessJobs, "$path/$tag/$currentJobID.cns.bz2\n" if ($tag eq "ctgcns");
+ push @utgSuccessJobs, "$path/$tag/$currentJobID.cns.bz2\n" if ($tag eq "utgcns");
- } elsif (-e "$path/$currentJobID.cns.xz") {
- push @successJobs, "$path/$currentJobID.cns.xz\n";
+ } elsif (-e "$path/$tag/$currentJobID.cns.xz") {
+ push @ctgSuccessJobs, "$path/$tag/$currentJobID.cns.xz\n" if ($tag eq "ctgcns");
+ push @utgSuccessJobs, "$path/$tag/$currentJobID.cns.xz\n" if ($tag eq "utgcns");
} else {
- $failureMessage .= "-- job $path/$currentJobID.cns FAILED.\n";
+ $failureMessage .= "-- job $path/$tag/$currentJobID.cns FAILED.\n";
push @failedJobs, $job;
}
$currentJobID++;
+
+ $currentJobID = "0001" if ($job == $ctgjobs); # Reset for first utg job.
+ $tag = "utgcns" if ($job == $ctgjobs);
}
# Failed jobs, retry.
@@ -266,7 +307,7 @@ sub consensusCheck ($$) {
# Otherwise, run some jobs.
- print STDERR "-- Consensus attempt $attempt begins with ", scalar(@successJobs), " finished, and ", scalar(@failedJobs), " to compute.\n";
+ print STDERR "-- Consensus attempt $attempt begins with ", scalar(@ctgSuccessJobs) + scalar(@utgSuccessJobs), " finished, and ", scalar(@failedJobs), " to compute.\n";
emitStage($WRK, $asm, "consensusCheck", $attempt);
buildHTML($WRK, $asm, "utg");
@@ -276,13 +317,17 @@ sub consensusCheck ($$) {
}
finishStage:
- print STDERR "-- All ", scalar(@successJobs), " consensus jobs finished successfully.\n";
+ print STDERR "-- All ", scalar(@ctgSuccessJobs) + scalar(@utgSuccessJobs), " consensus jobs finished successfully.\n";
+
+ open(L, "> $path/ctgcns.files") or caExit("can't open '$path/ctgcns.files' for writing: $!", undef);
+ print L @ctgSuccessJobs;
+ close(L);
- open(L, "> $path/cnsjob.files") or caExit("can't open '$path/cnsjob.files' for writing: $!", undef);
- print L @successJobs;
+ open(L, "> $path/utgcns.files") or caExit("can't open '$path/utgcns.files' for writing: $!", undef);
+ print L @utgSuccessJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "consensusCheck");
buildHTML($WRK, $asm, "utg");
stopAfter("consensusCheck");
@@ -292,6 +337,53 @@ sub consensusCheck ($$) {
+sub purgeFiles ($$$$$$) { # Delete the consensus outputs named in $path/$tag.files and return the four removal counters, updated.
+ my $path = shift @_; # Directory containing the $tag.files list
+ my $tag = shift @_; # List name prefix; consensusCheck writes ctgcns.files and utgcns.files
+ my $Ncns = shift @_; # Incoming count of removed .cns files
+ my $Nfastq = shift @_; # Incoming count of removed .fastq files
+ my $Nlayout = shift @_; # Incoming count of removed .layout files
+ my $Nlog = shift @_; # Incoming count of removed consensus.*.out files
+
+ open(F, "< $path/$tag.files") or caExit("can't open '$path/$tag.files' for reading: $!\n", undef);
+ while (<F>) {
+ chomp;
+ if (m/^(.*)\/0*(\d+).cns$/) { # $1 = directory, $2 = job number with leading zeros stripped; NOTE(review): entries ending in .cns.gz/.bz2/.xz (which consensusCheck can record) do not match and reach caExit below -- confirm
+ my $ID6 = substr("00000" . $2, -6); # Last six characters of the zero-prefixed job number
+ my $ID4 = substr("000" . $2, -4); # Last four characters of the zero-prefixed job number
+ my $ID0 = $2; # Job number without padding
+
+ if (-e "$1/$ID4.cns") {
+ $Ncns++;
+ unlink "$1/$ID4.cns";
+ }
+ if (-e "$1/$ID4.fastq") {
+ $Nfastq++;
+ unlink "$1/$ID4.fastq";
+ }
+ if (-e "$1/$ID4.layout") {
+ $Nlayout++;
+ unlink "$1/$ID4.layout";
+ }
+ if (-e "$1/consensus.$ID6.out") { # Log name using the six-character ID
+ $Nlog++;
+ unlink "$1/consensus.$ID6.out";
+ }
+ if (-e "$1/consensus.$ID0.out") { # Log name using the unpadded ID
+ $Nlog++;
+ unlink "$1/consensus.$ID0.out";
+ }
+
+ } else {
+ caExit("unknown consensus job name '$_'\n", undef); # Any line not shaped like <dir>/<digits>.cns ends up here
+ }
+ }
+ close(F);
+
+ return($Ncns, $Nfastq, $Nlayout, $Nlog);
+}
+
+
sub consensusLoad ($$) {
my $WRK = shift @_; # Root work directory
@@ -302,68 +394,48 @@ sub consensusLoad ($$) {
my $path = "$wrk/5-consensus";
goto allDone if (skipStage($WRK, $asm, "consensusLoad") == 1);
- goto allDone if (-e "$wrk/$asm.tigStore/seqDB.v002.tig");
+ goto allDone if ((-e "$wrk/$asm.ctgStore/seqDB.v002.tig") && (-e "$wrk/$asm.utgStore/seqDB.v002.tig"));
- # Expects to have a cnsjob.files list of output files from the consensusCheck() function.
+ # Expects to have a list of output files from the consensusCheck() function.
- caExit("can't find '$path/cnsjob.files' for loading tigs into store: $!", undef) if (! -e "$path/cnsjob.files");
+ caExit("can't find '$path/ctgcns.files' for loading tigs into store: $!", undef) if (! -e "$path/ctgcns.files");
+ caExit("can't find '$path/utgcns.files' for loading tigs into store: $!", undef) if (! -e "$path/utgcns.files");
# Now just load them.
$cmd = "$bin/tgStoreLoad \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
- $cmd .= " -L $path/cnsjob.files \\\n";
- $cmd .= "> $path/cnsjobs.files.tigStoreLoad.err 2>&1";
+ $cmd .= " -T $wrk/$asm.ctgStore 2 \\\n";
+ $cmd .= " -L $path/ctgcns.files \\\n";
+ $cmd .= "> $path/ctgcns.files.ctgStoreLoad.err 2>&1";
+
+ if (runCommand($path, $cmd)) {
+ caExit("failed to load unitig consensus into ctgStore", "$path/ctgcns.files.ctgStoreLoad.err");
+ }
+
+ $cmd = "$bin/tgStoreLoad \\\n";
+ $cmd .= " -G $wrk/$asm.gkpStore \\\n";
+ $cmd .= " -T $wrk/$asm.utgStore 2 \\\n";
+ $cmd .= " -L $path/utgcns.files \\\n";
+ $cmd .= "> $path/utgcns.files.utgStoreLoad.err 2>&1";
if (runCommand($path, $cmd)) {
- caExit("failed to load unitig consensus into tigStore", "$path/cnsjobs.files.tigStoreLoad.err");
+ caExit("failed to load unitig consensus into utgStore", "$path/utgcns.files.utgStoreLoad.err");
}
# Remvoe consensus outputs
- if (-e "$path/cnsjob.files") {
- print STDERR "-- Purging consensus output after loading to tigStore.\n";
+ if ((-e "$path/ctgcns.files") ||
+ (-e "$path/utgcns.files")) {
+ print STDERR "-- Purging consensus output after loading to ctgStore and/or utgStore.\n";
my $Ncns = 0;
my $Nfastq = 0;
my $Nlayout = 0;
my $Nlog = 0;
- open(F, "< $path/cnsjob.files") or caExit("can't open '$path/cnsjob.files' for reading: $!\n", undef);
- while (<F>) {
- chomp;
- if (m/^(.*)\/0*(\d+).cns$/) {
- my $ID6 = substr("00000" . $2, -6);
- my $ID4 = substr("000" . $2, -4);
- my $ID0 = $2;
-
- if (-e "$1/$ID4.cns") {
- $Ncns++;
- unlink "$1/$ID4.cns";
- }
- if (-e "$1/$ID4.fastq") {
- $Nfastq++;
- unlink "$1/$ID4.fastq";
- }
- if (-e "$1/$ID4.layout") {
- $Nlayout++;
- unlink "$1/$ID4.layout";
- }
- if (-e "$1/consensus.$ID6.out") {
- $Nlog++;
- unlink "$1/consensus.$ID6.out";
- }
- if (-e "$1/consensus.$ID0.out") {
- $Nlog++;
- unlink "$1/consensus.$ID0.out";
- }
-
- } else {
- caExit("unknown consensus job name '$_'\n", undef);
- }
- }
- close(F);
+ ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles($path, "ctgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
+ ($Ncns, $Nfastq, $Nlayout, $Nlog) = purgeFiles($path, "utgcns", $Ncns, $Nfastq, $Nlayout, $Nlog);
print STDERR "-- Purged $Ncns .cns outputs.\n" if ($Ncns > 0);
print STDERR "-- Purged $Nfastq .fastq outputs.\n" if ($Nfastq > 0);
@@ -391,25 +463,25 @@ sub consensusAnalyze ($$) {
my $path = "$wrk/5-consensus";
goto allDone if (skipStage($WRK, $asm, "consensusAnalyze") == 1);
- goto allDone if (-e "$wrk/$asm.tigStore/status.coverageStat");
+ goto allDone if (-e "$wrk/$asm.ctgStore/status.coverageStat");
$cmd = "$bin/tgStoreCoverageStat \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
+ $cmd .= " -T $wrk/$asm.ctgStore 2 \\\n";
$cmd .= " -s " . getGlobal("genomeSize") . " \\\n";
- $cmd .= " -o $wrk/$asm.tigStore.coverageStat \\\n";
- $cmd .= "> $wrk/$asm.tigStore.coverageStat.err 2>&1";
+ $cmd .= " -o $wrk/$asm.ctgStore.coverageStat \\\n";
+ $cmd .= "> $wrk/$asm.ctgStore.coverageStat.err 2>&1";
if (runCommand($path, $cmd)) {
- caExit("failed to compute coverage statistics", "$wrk/$asm.tigStore.coverageStat.err");
+ caExit("failed to compute coverage statistics", "$wrk/$asm.ctgStore.coverageStat.err");
}
- unlink "$wrk/$asm.tigStore.coverageStat.err";
+ unlink "$wrk/$asm.ctgStore.coverageStat.err";
finishStage:
emitStage($WRK, $asm, "consensusAnalyze");
buildHTML($WRK, $asm, "utg");
- touch("$wrk/$asm.tigStore/status.coverageStat");
+ touch("$wrk/$asm.ctgStore/status.coverageStat");
stopAfter("consensusAnalyze");
allDone:
}
diff --git a/src/pipelines/canu/CorrectReads.pm b/src/pipelines/canu/CorrectReads.pm
index 9a545cf..fee4d3f 100644
--- a/src/pipelines/canu/CorrectReads.pm
+++ b/src/pipelines/canu/CorrectReads.pm
@@ -215,14 +215,7 @@ sub buildCorrectionLayouts_direct ($$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
print F "if [ \$jobid -gt $jobs ]; then\n";
print F " echo Error: Only $jobs partitions, you asked for \$jobid.\n";
@@ -325,14 +318,7 @@ sub buildCorrectionLayouts_piped ($$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
print F "if [ \$jobid -gt $nJobs ]; then\n";
print F " echo Error: Only $nJobs partitions, you asked for \$jobid.\n";
@@ -500,12 +486,15 @@ sub expensiveFilter ($$) {
$cmd .= " -E " . getGlobal("corMaxEvidenceErate") . " \\\n" if (defined(getGlobal("corMaxEvidenceErate")));
$cmd .= " -C $maxCov \\\n" if (defined($maxCov));
$cmd .= " -legacy \\\n" if (defined(getGlobal("corLegacyFilter")));
- $cmd .= " -p $path/$asm.estimate";
+ $cmd .= " -p $path/$asm.estimate.WORKING";
if (runCommand($wrk, $cmd)) {
rename "$path/$asm.estimate.log", "$path/$asm.estimate.log.FAILED";
caExit("failed to generate estimated lengths of corrected reads", "$wrk/$asm.corStore.err");
}
+ rename "$path/$asm.estimate.WORKING.filter.log", "$path/$asm.estimate";
+ rename "$path/$asm.estimate.WORKING.summary", "$path/$asm.estimate.stats";
+ rename "$path/$asm.estimate.WORKING.log", "$path/$asm.estimate.log";
}
if (runCommandSilently($path, "sort -T . -k4nr -k2nr < $path/$asm.estimate.log > $path/$asm.estimate.correctedLength.log", 1)) {
@@ -674,25 +663,28 @@ sub expensiveFilter ($$) {
# Plot a scatter plot of the original vs the expected corrected read lengths. Early versions
# also plotted the sorted length vs the other length, but those were not interesting.
- if (! -e "$path/$asm.estimate.original-x-correctedLength.png") {
+ if (! -e "$path/$asm.estimate.original-x-correctedLength.gp") {
+ my $gnuplot = getGlobal("gnuplot");
+ my $format = getGlobal("gnuplotImageFormat");
+
open(F, "> $path/$asm.estimate.original-x-correctedLength.gp");
print F "set title 'original length (x) vs corrected length (y)'\n";
print F "set xlabel 'original read length'\n";
print F "set ylabel 'corrected read length (expected)'\n";
print F "set pointsize 0.25\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$path/$asm.estimate.original-x-corrected.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$path/$asm.estimate.original-x-corrected.lg.$format'\n";
print F "plot '$path/$asm.estimate.tn.log' using 2:4 title 'tn', \\\n";
print F " '$path/$asm.estimate.fn.log' using 2:4 title 'fn', \\\n";
print F " '$path/$asm.estimate.fp.log' using 2:4 title 'fp', \\\n";
print F " '$path/$asm.estimate.tp.log' using 2:4 title 'tp'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$path/$asm.estimate.original-x-corrected.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$path/$asm.estimate.original-x-corrected.sm.$format'\n";
print F "replot\n";
close(F);
- if (runCommandSilently($path, "gnuplot < $path/$asm.estimate.original-x-correctedLength.gp > /dev/null 2>&1", 0)) {
+ if (runCommandSilently($path, "$gnuplot $path/$asm.estimate.original-x-correctedLength.gp > /dev/null 2>&1", 0)) {
print STDERR "--\n";
print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
print STDERR "--\n";
@@ -741,7 +733,7 @@ sub buildCorrectionLayouts ($$) {
$cmd = "$bin/filterCorrectionOverlaps \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
$cmd .= " -O $wrk/$asm.ovlStore \\\n";
- $cmd .= " -S $path/$asm.globalScores \\\n";
+ $cmd .= " -S $path/$asm.globalScores.WORKING \\\n";
$cmd .= " -c $maxCov \\\n";
$cmd .= " -l $minLen \\\n";
$cmd .= " -e " . getGlobal("corMaxEvidenceErate") . " \\\n" if (defined(getGlobal("corMaxEvidenceErate")));
@@ -752,6 +744,9 @@ sub buildCorrectionLayouts ($$) {
caExit("failed to globally filter overlaps for correction", "$path/$asm.globalScores.err");
}
+ rename "$path/$asm.globalScores.WORKING", "$path/$asm.globalScores";
+ rename "$path/$asm.globalScores.WORKING.stats", "$path/$asm.globalScores.stats";
+ rename "$path/$asm.globalScores.WORKING.log", "$path/$asm.globalScores.log";
unlink "$path/$asm.globalScores.err";
}
@@ -873,7 +868,7 @@ sub generateCorrectedReads ($$) {
print L @successJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "cor-generateCorrectedReads");
buildHTML($WRK, $asm, "cor");
stopAfter("readCorrection");
@@ -959,7 +954,7 @@ sub dumpCorrectedReads ($$) {
# corrected read name as is.
if ($rid eq $nameid) {
- $h = ">$name iid=${rid}_${pid}";
+ $h = ">$name id=${rid}_${pid}";
}
# And write the read to the output as FASTA.
@@ -1132,6 +1127,9 @@ sub dumpCorrectedReads ($$) {
# Scatterplot of lengths.
+ my $gnuplot = getGlobal("gnuplot");
+ my $format = getGlobal("gnuplotImageFormat");
+
open(F, "> $wrk/2-correction/$asm.originalLength-vs-correctedLength.gp") or caExit("", undef);
print F "\n";
print F "set pointsize 0.25\n";
@@ -1140,37 +1138,37 @@ sub dumpCorrectedReads ($$) {
print F "set xlabel 'original read length'\n";
print F "set ylabel 'expected corrected read length'\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/2-correction/$asm.originalLength-vs-expectedLength.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/2-correction/$asm.originalLength-vs-expectedLength.lg.$format'\n";
print F "plot [0:$maxReadLen] [0:$maxReadLen] '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using 2:3 title 'original (x) vs expected (y)'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/2-correction/$asm.originalLength-vs-expectedLength.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/2-correction/$asm.originalLength-vs-expectedLength.sm.$format'\n";
print F "replot\n";
print F "\n";
print F "set title 'original read length vs sum of corrected read lengths'\n";
print F "set xlabel 'original read length'\n";
print F "set ylabel 'sum of corrected read lengths'\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/2-correction/$asm.originalLength-vs-correctedLength.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/2-correction/$asm.originalLength-vs-correctedLength.lg.$format'\n";
print F "plot [0:$maxReadLen] [0:$maxReadLen] '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using 2:4 title 'original (x) vs corrected (y)'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/2-correction/$asm.originalLength-vs-correctedLength.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/2-correction/$asm.originalLength-vs-correctedLength.sm.$format'\n";
print F "replot\n";
print F "\n";
print F "set title 'expected read length vs sum of corrected read lengths'\n";
print F "set xlabel 'expected read length'\n";
print F "set ylabel 'sum of corrected read lengths'\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/2-correction/$asm.expectedLength-vs-correctedLength.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/2-correction/$asm.expectedLength-vs-correctedLength.lg.$format'\n";
print F "plot [0:$maxReadLen] [0:$maxReadLen] '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using 3:4 title 'expected (x) vs corrected (y)'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/2-correction/$asm.expectedLength-vs-correctedLength.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/2-correction/$asm.expectedLength-vs-correctedLength.sm.$format'\n";
print F "replot\n";
close(F);
- if (runCommandSilently("$wrk/2-correction", "gnuplot $wrk/2-correction/$asm.originalLength-vs-correctedLength.gp > /dev/null 2>&1", 0)) {
+ if (runCommandSilently("$wrk/2-correction", "$gnuplot $wrk/2-correction/$asm.originalLength-vs-correctedLength.gp > /dev/null 2>&1", 0)) {
print STDERR "--\n";
print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
print STDERR "--\n";
@@ -1190,28 +1188,28 @@ sub dumpCorrectedReads ($$) {
print F "set boxwidth binwidth\n";
print F "bin(x,width) = width*floor(x/width) + binwidth/2.0\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/2-correction/$asm.length-histograms.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/2-correction/$asm.length-histograms.lg.$format'\n";
print F "plot [1:$maxReadLen] [0:] \\\n";
print F " '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using (bin(\$2,binwidth)):(1.0) smooth freq with boxes title 'original', \\\n";
print F " '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using (bin(\$3,binwidth)):(1.0) smooth freq with boxes title 'expected', \\\n";
print F " '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using (bin(\$4,binwidth)):(1.0) smooth freq with boxes title 'corrected'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/2-correction/$asm.length-histograms.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/2-correction/$asm.length-histograms.sm.$format'\n";
print F "replot\n";
print F "\n";
print F "set xlabel 'difference between expected and corrected read length, bin width = 250, min=$minDiff, max=$maxDiff'\n";
print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/2-correction/$asm.length-difference-histograms.lg.png'\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/2-correction/$asm.length-difference-histograms.lg.$format'\n";
print F "plot [$minDiff:$maxDiff] [0:] \\\n";
print F " '$wrk/2-correction/$asm.original-expected-corrected-length.dat' using (bin(\$7,binwidth)):(1.0) smooth freq with boxes title 'expected - corrected'\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/2-correction/$asm.length-difference-histograms.sm.png'\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/2-correction/$asm.length-difference-histograms.sm.$format'\n";
print F "replot\n";
close(F);
- if (runCommandSilently("$wrk/2-correction", "gnuplot $wrk/2-correction/$asm.length-histograms.gp > /dev/null 2>&1", 0)) {
+ if (runCommandSilently("$wrk/2-correction", "$gnuplot $wrk/2-correction/$asm.length-histograms.gp > /dev/null 2>&1", 0)) {
print STDERR "--\n";
print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
print STDERR "--\n";
diff --git a/src/pipelines/canu/Defaults.pm b/src/pipelines/canu/Defaults.pm
index 467dbeb..0c3d69e 100644
--- a/src/pipelines/canu/Defaults.pm
+++ b/src/pipelines/canu/Defaults.pm
@@ -40,33 +40,38 @@ package canu::Defaults;
require Exporter;
@ISA = qw(Exporter);
- at EXPORT = qw(getCommandLineOptions addCommandLineOption addCommandLineError writeLog caExit caFailure getNumberOfCPUs getPhysicalMemorySize getAllowedResources diskSpace printOptions printHelp setParametersFromFile setParametersFromCommandLine checkParameters getGlobal setGlobal setGlobalIfUndef showErrorRates setErrorRate setDefaults);
+ at EXPORT = qw(getCommandLineOptions addCommandLineOption addCommandLineError writeLog getNumberOfCPUs getPhysicalMemorySize getAllowedResources diskSpace printOptions printVersion printHelp setParametersFromFile setParametersFromCommandLine checkJava checkGnuplot checkParameters getGlobal setGlobal setGlobalIfUndef showErrorRates setErrorRate setDefaults);
use strict;
use Carp qw(cluck);
use Sys::Hostname;
use Text::Wrap;
-use Filesys::Df; # for diskSpace()
-
-my %global;
-my %synops;
-my %synnam;
+my %global; # Parameter value
+my %synops; # Parameter description (for -defaults)
+my %synnam; # Parameter name (because the key is lowercase)
my $cLineOpts = "";
my $specLog = "";
-# Return the second argument, unless the first argument is found in
-# %global, in which case return that.
-#
sub getGlobal ($) {
my $var = shift @_;
$var =~ tr/A-Z/a-z/;
- caFailure("parameter '$var' is not known", undef) if (!exists($global{$var}));
+ # We lost the use of caFailure in Defaults.pm (because it was moved to
+ # Execution.pm so it can run stuff) here, so duplicate the functionality.
+ # This should only trigger on static pipeline errors (i.e., errors not
+ # depending on the input reads) and so should never occur in the wild.
+
+ if (!exists($global{$var})) {
+ print STDERR "================================================================================\n";
+ print STDERR "Unknown parameter '$var' accessed. Stack trace:\n";
+ cluck;
+ exit(1);
+ }
return($global{$var});
}
@@ -126,7 +131,7 @@ sub setGlobal ($$) {
return if ($set > 0);
if ($var eq "errorrate") {
- setErrorRate($val);
+ setErrorRate($val, 1);
return;
}
@@ -190,63 +195,6 @@ sub writeLog ($) {
-# Use caExit() for transient errors, like not opening files, processes that die, etc.
-sub caExit ($$) {
- my $msg = shift @_;
- my $log = shift @_;
-
- print STDERR "================================================================================\n";
- print STDERR "Don't panic, but a mostly harmless error occurred and canu failed.\n";
- print STDERR "\n";
-
- # Really should pass in $wrk
- if (defined($log)) {
- my $df = diskSpace($log);
-
- print STDERR "Disk space available: $df GB\n";
- print STDERR "\n";
- }
-
- if (-e $log) {
- print STDERR "Last 50 lines of the relevant log file ($log):\n";
- print STDERR "\n";
- system("tail -n 50 $log");
- print STDERR "\n";
- }
-
- print STDERR "canu failed with '$msg'.\n";
- print STDERR "\n";
-
- exit(1);
-}
-
-
-# Use caFailure() for errors that definitely will require code changes to fix.
-sub caFailure ($$) {
- my $msg = shift @_;
- my $log = shift @_;
-
- print STDERR "================================================================================\n";
- print STDERR "Please panic. canu failed, and it shouldn't have.\n";
- print STDERR "\n";
- print STDERR "Stack trace:\n";
- print STDERR "\n";
- cluck;
- print STDERR "\n";
-
- if (-e $log) {
- print STDERR "Last few lines of the relevant log file ($log):\n";
- print STDERR "\n";
- system("tail -n 50 $log");
- }
-
- print STDERR "\n";
- print STDERR "canu failed with '$msg'.\n";
-
- exit(1);
-}
-
-
#
# Host management - these really belong in 'Execution.pm' (or 'Utilities.pm') but can't go there
# (Execution.pm) and be used here too.
@@ -317,13 +265,21 @@ sub dirname ($) {
sub diskSpace ($) {
- my $wrk = dirname($_[0]);
- my $df = df($wrk, 1024);
-
- my $total = int(10 * $df->{blocks} / 1048576) / 10;
- my $used = int(10 * $df->{used} / 1048576) / 10;
- my $free = int(10 * $df->{bfree} / 1048576) / 10;
- my $avail = int(10 * $df->{bavail} / 1048576) / 10;
+ my $wrk = dirname($_[0]);
+ my ($total, $used, $free, $avail) = (0, 0, 0, 0);
+
+ open(DF, "df -P -k $wrk |");
+ while (<DF>) {
+ chomp;
+
+ if (m/^(.*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.*)$/) {
+ $total = int($2 / 1048.576) / 1000;
+ $used = int($3 / 1048.576) / 1000;
+ $free = int($4 / 1048.576) / 1000;
+ $avail = int($4 / 1048.576) / 1000; # Possibly limited by quota?
+ }
+ }
+ close(DF);
#print STDERR "Disk space: total $total GB, used $used GB, free $free GB, available $avail GB\n";
@@ -355,12 +311,28 @@ sub printOptions () {
}
+#  Print the version reported by '$bin/gatekeeperCreate --version' to stdout
+#  as a '-- <version>' line.  stdout and stderr of the command are combined
+#  and only the last line of output is kept.  Nothing is printed if the
+#  command could not be started or produced no output.
+#
+sub printVersion ($) {
+    my $bin     = shift @_;
+    my $version = "";    #  Stays empty, not undef, when there is no output.
+
+    #  A lexical handle and loop variable are used so the caller's F
+    #  filehandle and $_ are left alone.
+    if (open(my $cmd, "$bin/gatekeeperCreate --version 2>&1 |")) {
+        while (my $line = <$cmd>) {
+            chomp $line;
+            $version = $line;
+        }
+        close($cmd);
+    }
+
+    if (length($version) > 0) {
+        print "-- $version\n";
+    }
+}
+
+
sub printHelp () {
return if (!exists($global{'errors'}));
print "\n";
- print "usage: canu [-correct | -trim | -assemble] \\\n";
+ print "usage: canu [-correct | -trim | -assemble | -trim-assemble] \\\n";
print " [-s <assembly-specifications-file>] \\\n";
print " -p <assembly-prefix> \\\n";
print " -d <assembly-directory> \\\n";
@@ -371,9 +343,10 @@ sub printHelp () {
print "\n";
print " By default, all three stages (correct, trim, assemble) are computed.\n";
print " To compute only a single stage, use:\n";
- print " -correct - generate corrected reads\n";
- print " -trim - generate trimmed reads\n";
- print " -assemble - generate an assembly\n";
+ print " -correct - generate corrected reads\n";
+ print " -trim - generate trimmed reads\n";
+ print " -assemble - generate an assembly\n";
+ print " -trim-assemble - generate trimmed reads and then assemble them\n";
print "\n";
print " The assembly is computed in the (created) -d <assembly-directory>, with most\n";
print " files named using the -p <assembly-prefix>.\n";
@@ -448,7 +421,10 @@ sub setParametersFromFile ($@) {
$specLog .= "###\n";
$specLog .= "\n";
- open(F, "< $specFile") or caExit("can't open '$specFile' for reading: $!", undef);
+ # We lost the use of caExit() here (moved to Execution.pm) and so can't call it.
+ # Just die.
+
+ open(F, "< $specFile") or die("can't open '$specFile' for reading: $!\n");
while (<F>) {
$specLog .= $_;
@@ -456,24 +432,29 @@ sub setParametersFromFile ($@) {
s/^\s+//;
s/\s+$//;
- next if (m/^\s*\#/);
- next if (m/^\s*$/);
+ next if (m/^#/);
+ next if (length($_) eq 0);
- if (-e $_) {
- my $xx = $_;
- $xx = "$ENV{'PWD'}/$xx" if ($xx !~ m!^/!);
- if (-e $xx) {
- push @fragFiles, $xx;
- } else {
- addCommandLineError("ERROR: File not found '$_' after appending absolute path.\n");
- }
- } elsif (m/\s*(\w*)\s*=([^#]*)#*.*$/) {
+ # File handling is also present in canu.pl around line 165.
+ if (m/^-(pacbio|nanopore)-(corrected|raw)\s+(.*)$/) {
+ my $arg = "-$1-$2";
+ my $file = $3;
+
+ $file = "$ENV{'PWD'}/$file" if ($file !~ m!^/!);
+
+ push @fragFiles, "$arg\0$file";
+ addCommandLineOption("$arg \"$file\"");
+ }
+
+ elsif (m/\s*(\w*)\s*=([^#]*)#*.*$/) {
my ($var, $val) = ($1, $2);
$var =~ s/^\s+//; $var =~ s/\s+$//;
$val =~ s/^\s+//; $val =~ s/\s+$//;
undef $val if ($val eq "undef");
setGlobal($var, $val);
- } else {
+ }
+
+ else {
addCommandLineError("ERROR: File not found or unknown specFile option line '$_'.\n");
}
}
@@ -552,11 +533,16 @@ sub showErrorRates ($) {
# trimming errorRate = 0.009 obtOvlErrorRate = 0.06 obtErrorRate = 0.035
# assembly errorRate = 0.009 utgOvlErrorRate = 0.06 bogart 0.035
#
-sub setErrorRate ($@) {
+sub setErrorRate ($$) {
my $er = shift @_;
- my $verbose = shift @_;
+ my $force = shift @_;
+
+ if (($force == 0) && (defined($global{"errorrate"}))) {
+ #print STDERR "-- Can't change error rate from ", getGlobal('errorRate'), " to $er - not allowed.\n";
+ return;
+ }
- print STDERR "-- Set errorRate to $er (verbose='$verbose')\n" if (defined($verbose));
+ #print STDERR "-- Set errorRate to $er\n";
# Can NOT call setGlobal() for this, because it calls setErrorRate()!.
$global{"errorrate"} = $er;
@@ -570,7 +556,7 @@ sub setErrorRate ($@) {
#setGlobal("corErrorRate", $er * 10); # Erorr rate used for raw sequence alignment/consensus
setGlobal("cnsErrorRate", $er * 3);
- showErrorRates("-- ") if (defined($verbose));
+ #showErrorRates("-- ");
}
@@ -617,9 +603,12 @@ sub setOverlapDefaults ($$$) {
$global{"${tag}OvlFrequentMers"} = undef;
$synops{"${tag}OvlFrequentMers"} = "Do not seed overlaps with these kmers (fasta format)";
+ $global{"${tag}OvlFilter"} = undef;
+ $synops{"${tag}OvlFilter"} = "Filter overlaps based on expected kmers vs observed kmers";
+
# Mhap parameters.
- $global{"${tag}MhapVersion"} = "2.1";
+ $global{"${tag}MhapVersion"} = "2.1.2";
$synops{"${tag}MhapVersion"} = "Version of the MHAP jar file to use";
$global{"${tag}MhapFilterThreshold"} = "0.000005";
@@ -631,13 +620,16 @@ sub setOverlapDefaults ($$$) {
$global{"${tag}MhapNoTf"} = undef;
$synops{"${tag}MhapNoTf"} = "Expert option: True or false, do not use tf weighting, only idf of tf-idf.";
+ $global{"${tag}MhapOptions"} = undef;
+ $synops{"${tag}MhapOptions"} = "Expert option: free-form parameters to pass to MHAP.";
+
$global{"${tag}MhapBlockSize"} = 3000;
$synops{"${tag}MhapBlockSize"} = "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job";
- $global{"${tag}MhapMerSize"} = ($tag eq "cor") ? 16 : 22;
+ $global{"${tag}MhapMerSize"} = ($tag eq "cor") ? 16 : 16;
$synops{"${tag}MhapMerSize"} = "K-mer size for seeds in mhap";
- $global{"${tag}MhapOrderedMerSize"} = ($tag eq "cor") ? 12 : 22;
+ $global{"${tag}MhapOrderedMerSize"} = ($tag eq "cor") ? 12 : 18;
$synops{"${tag}MhapOrderedMerSize"} = "K-mer size for second-stage filter in mhap";
$global{"${tag}MhapSensitivity"} = undef;
@@ -647,7 +639,7 @@ sub setOverlapDefaults ($$$) {
$synops{"${tag}MMapBlockSize"} = "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job";
# minimap parameters.
- $global{"${tag}MMapMerSize"} = ($tag eq "cor") ? 15 : 22;
+ $global{"${tag}MMapMerSize"} = ($tag eq "cor") ? 15 : 21;
$synops{"${tag}MMapMerSize"} = "K-mer size for seeds in minmap";
# shared parameters for alignment-free overlappers
@@ -676,7 +668,16 @@ sub setDefaults () {
$global{"java"} = (exists $ENV{"JAVA_HOME"} && -e "$ENV{'JAVA_HOME'}/bin/java") ? "$ENV{'JAVA_HOME'}/bin/java" : "java";
$synops{"java"} = "Java interpreter to use; at least version 1.8; default 'java'";
- ##### Cleanup options
+ $global{"gnuplot"} = "gnuplot";
+ $synops{"gnuplot"} = "Path to the gnuplot executable";
+
+ $global{"gnuplotImageFormat"} = undef;
+ $synops{"gnuplotImageFormat"} = "Image format that gnuplot will generate, used in HTML reports. Default: based on gnuplot, 'png', 'svg' or 'gif'";
+
+ $global{"gnuplotTested"} = 0;
+ $synops{"gnuplotTested"} = "If set, skip the initial testing of gnuplot";
+
+ ##### Cleanup and Termination options
$global{"saveOverlaps"} = 0;
$synops{"saveOverlaps"} = "Save intermediate overlap files, almost never a good idea";
@@ -687,6 +688,15 @@ sub setDefaults () {
$global{"saveMerCounts"} = 0;
$synops{"saveMerCounts"} = "Save full mer counting results, sometimes useful";
+ $global{"onSuccess"} = undef;
+ $synops{"onSuccess"} = "Full path to command to run on successful completion";
+
+ $global{"onFailure"} = undef;
+ $synops{"onFailure"} = "Full path to command to run on failure";
+
+ $global{"onExitDir"} = undef; # Copy of $wrk, for caExit() and caFailure() ONLY.
+ $global{"onExitNam"} = undef; # Copy of $asm, for caExit() and caFailure() ONLY.
+
##### Error Rates
$global{"errorRate"} = undef;
@@ -704,13 +714,13 @@ sub setDefaults () {
#$global{"utgErrorRate"} = undef;
#$synops{"utgErrorRate"} = "Overlaps at or below this error rate are used to construct unitigs (BOG and UTG)";
- $global{"utgGraphDeviation"} = 5;
+ $global{"utgGraphDeviation"} = 6;
$synops{"utgGraphDeviation"} = "Overlaps this much above median will not be used for initial graph construction (BOGART)";
$global{"utgRepeatDeviation"} = 3;
$synops{"utgRepeatDeviation"} = "Overlaps this much above mean unitig error rate will not be used for repeat splitting (BOGART)";
- $global{"utgRepeatConfusedBP"} = 5000;
+ $global{"utgRepeatConfusedBP"} = 2100;
$synops{"utgRepeatConfusedBP"} = "Repeats where the next best edge is at least this many bp shorter will not be split (BOGART)";
$global{"corErrorRate"} = undef;
@@ -896,7 +906,7 @@ sub setDefaults () {
##### Unitig Filtering Options (also set in bogart/bogart.C)
$global{"contigFilter"} = "2 1000 0.75 0.75 2";
- $synops{"contigFilter"} = "Parameters to filter out 'unassembled' unitigs: minReads; minLength; singleReadSpan; lowCovSpan, lowCovDepth";
+ $synops{"contigFilter"} = "Parameters to filter out 'unassembled' unitigs: minReads; minLength; singleReadSpan; lowCovFraction, lowCovDepth";
##### Consensus Options
@@ -906,7 +916,7 @@ sub setDefaults () {
$global{"cnsPartitionMin"} = undef;
$synops{"cnsPartitionMin"} = "Don't make a consensus partition with fewer than N reads";
- $global{"cnsMaxCoverage"} = 0;
+ $global{"cnsMaxCoverage"} = 40;
$synops{"cnsMaxCoverage"} = "Limit unitig consensus to at most this coverage; default '0' = unlimited";
$global{"cnsConsensus"} = "pbdagcon";
@@ -978,6 +988,168 @@ sub setDefaults () {
+#  Verify that the configured java is at least version 1.8.
+#
+#  Does nothing unless at least one of corOverlapper, obtOverlapper or
+#  utgOverlapper is set to "mhap".  Otherwise runs '<java> -Xmx1g -showversion'
+#  and parses the quoted version string from its output.  If the version is
+#  too old or cannot be determined, the failure and the raw '-showversion'
+#  output are recorded with addCommandLineError(); on success the detected
+#  version is reported on STDERR.
+#
+sub checkJava () {
+    return if ((getGlobal("corOverlapper") ne "mhap") &&
+               (getGlobal("obtOverlapper") ne "mhap") &&
+               (getGlobal("utgOverlapper") ne "mhap"));
+
+    my $java       = getGlobal("java");
+    my $versionStr = "unknown";
+    my $major      = 0;
+    my $minor      = 0;
+
+    #  We can't use runCommand() here, because we're included in Execution.pm,
+    #  and a '-x $java' test fails when $java is just "java" (found via the
+    #  PATH), so run it and look at what it prints.
+
+    if (open(my $jv, "$java -Xmx1g -showversion 2>&1 |")) {
+        while (my $line = <$jv>) {
+            #  First word is either "java" or "openjdk" or ...
+            #
+            #  The minor component is optional so that a single-component
+            #  version string such as "9" is parsed too.  Major and minor are
+            #  kept apart because comparing "major.minor" as a float misorders
+            #  two-digit minor versions.
+            if ($line =~ m/^.*\s+version\s+\"((\d+)(?:\.(\d+))?[^\"]*)\"/) {
+                $versionStr = $1;
+                $major      = $2;
+                $minor      = defined($3) ? $3 : 0;
+            }
+        }
+        close($jv);
+    }
+
+    my $recentEnough = (($major > 1) ||
+                        (($major == 1) && ($minor >= 8)));
+
+    if (!$recentEnough) {
+        addCommandLineError("ERROR: mhap overlapper requires java version at least 1.8.0; you have $versionStr (from '$java').\n");
+        addCommandLineError("ERROR: '$java -showversion' reports:\n");
+
+        #  Run it again to show the user exactly what java printed.
+        if (open(my $jv, "$java -showversion 2>&1 |")) {
+            while (my $line = <$jv>) {
+                chomp $line;
+                addCommandLineError("ERROR: '$line'\n");
+            }
+            close($jv);
+        }
+
+    } else {
+        print STDERR "-- Detected Java(TM) Runtime Environment '$versionStr' (from '$java').\n";
+    }
+}
+
+
+
+#  Verify that gnuplot can be run and can write images.
+#
+#  Steps, each of which records errors with addCommandLineError() and returns
+#  early on failure:
+#    1. Run '<gnuplot> -V' to get a version string.
+#    2. If gnuplotImageFormat is not set, list gnuplot's terminals and pick
+#       png, else svg, else gif; the choice is stored with setGlobal().
+#    3. Run a small test script and check that both output images were created.
+#
+#  Skipped entirely when the gnuplotTested parameter compares equal to 1.
+#  NOTE(review): the error messages tell the user to set 'gnuplotTested=true';
+#  that only skips this test if the value is stored as 1 -- confirm against
+#  setGlobal().
+#
+#  Scratch files are created in /tmp, named with the process id.
+#
+sub checkGnuplot () {
+
+    return if (getGlobal("gnuPlotTested") == 1);
+
+    my $gnuplot = getGlobal("gnuplot");
+    my $format  = getGlobal("gnuplotImageFormat");
+    my $version = undef;
+
+    #  Check for existence of gnuplot.
+
+    open(F, "$gnuplot -V |");
+    while (<F>) {
+        chomp;
+        $version = $_;
+        $version = $1 if ($version =~ m/^gnuplot\s+(.*)$/);
+    }
+    close(F);
+
+    if (!defined($version)) {
+        #  Newline added: every other message in this function ends with one.
+        addCommandLineError("ERROR: Failed to run gnuplot from '$gnuplot'.\n");
+        addCommandLineError("ERROR: Set option gnuplot=<path-to-gnuplot> or gnuplotTested=true to skip this test and not generate plots.\n");
+        return;
+    }
+
+    #  Check for existence of a decent output format.  Need to redirect in /dev/null to make gnuplot
+    #  not use its builtin pager.
+
+    if (!defined($format)) {
+        my $havePNG = 0;
+        my $haveSVG = 0;
+        my $haveGIF = 0;
+
+        open(F, "> /tmp/gnuplot-$$-test.gp");
+        print F "set terminal\n";
+        close(F);
+
+        system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1");
+
+        #  The first word of each output line is compared against the
+        #  terminal names we can use.
+        open(F, "< /tmp/gnuplot-$$-test.err");
+        while (<F>) {
+            s/^\s+//;
+            s/\s+$//;
+
+            my @t = split '\s+', $_;
+
+            $havePNG = 1 if ($t[0] eq 'png');
+            $haveSVG = 1 if ($t[0] eq 'svg');
+            $haveGIF = 1 if ($t[0] eq 'gif');
+        }
+        close(F);
+
+        #  Later assignments win, so the preference is png, then svg, then gif.
+        $format = "gif" if ($haveGIF);
+        $format = "svg" if ($haveSVG);
+        $format = "png" if ($havePNG);
+
+        setGlobal("gnuplotImageFormat", $format);
+
+        unlink "/tmp/gnuplot-$$-test.gp";
+        unlink "/tmp/gnuplot-$$-test.err";
+    }
+
+    if (!defined($format)) {
+        addCommandLineError("ERROR: Failed to detect a suitable output format for gnuplot.\n");
+        addCommandLineError("ERROR: Looked for png, svg and gif, found none of them.\n");
+        addCommandLineError("Set option gnuplotImageFormat=<type>, or gnuplotTested=true to skip this test and not generate plots.\n");
+        return;
+    }
+
+    #  Test if we can actually make images.
+    #
+    #  NOTE(review): the trailing 'bogus line' is not a gnuplot command;
+    #  presumably it is deliberate, with both 'set output' files expected to
+    #  exist before gnuplot reaches it -- confirm before removing.
+
+    open(F, "> /tmp/gnuplot-$$-test.gp");
+    print F "set title 'gnuplot test'\n";
+    print F "set xlabel 'X'\n";
+    print F "set ylabel 'Y'\n";
+    print F "\n";
+    print F "set terminal $format size 1024,1024\n";
+    print F "set output '/tmp/gnuplot-$$-test.1.$format'\n";
+    print F "\n";
+    print F "plot [-30:20] sin(x*20) * atan(x)\n\n";
+    print F "\n";
+    print F "set terminal $format size 256,256\n";
+    print F "set output '/tmp/gnuplot-$$-test.2.$format'\n";
+    print F "\n";
+    print F "bogus line\n";
+    close(F);
+
+    #  Dang, we don't have runCommandSilently here, so have to do it the hard way.
+
+    system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1");
+
+    if ((! -e "/tmp/gnuplot-$$-test.1.$format") ||
+        (! -e "/tmp/gnuplot-$$-test.2.$format")) {
+        addCommandLineError("ERROR: gnuplot failed to generate images.\n");
+
+        open(F, "< /tmp/gnuplot-$$-test.err");
+        while (<F>) {
+            chomp;
+            addCommandLineError("ERROR: gnuplot reports: $_\n");
+        }
+        close(F);
+
+        addCommandLineError("ERROR: Set option gnuplotImageFormat=<type>, or gnuplotTested=true to skip this test and not generate plots.\n");
+
+        #  Remove the scratch files on the failure path too.
+        unlink "/tmp/gnuplot-$$-test.gp";
+        unlink "/tmp/gnuplot-$$-test.err";
+        unlink "/tmp/gnuplot-$$-test.1.$format";
+        unlink "/tmp/gnuplot-$$-test.2.$format";
+
+        return;
+    }
+
+    #  Yay, gnuplot works!
+
+    print STDERR "-- Detected gnuplot version '$version' (from '$gnuplot') and image format '$format'.\n";
+    #addCommandLineOption("gnuplotTested=1");
+
+    unlink "/tmp/gnuplot-$$-test.gp";
+    unlink "/tmp/gnuplot-$$-test.err";
+    unlink "/tmp/gnuplot-$$-test.1.$format";
+    unlink "/tmp/gnuplot-$$-test.2.$format";
+}
+
+
+
sub checkParameters () {
#
@@ -1011,15 +1183,75 @@ sub checkParameters () {
# Check for inconsistent parameters
#
+ # Genome size isn't properly decoded until later, but we want to fail quickly. So, just test if
+ # a unitless number is supplied, and if that number is tiny.
+
+ {
+ my $gs = getGlobal("genomeSize");
+
+ if (($gs =~ m/^(\d+)$/) ||
+ ($gs =~ m/^(\d+\.\d+)$/)) {
+ if ($gs < 1000) {
+ addCommandLineError("ERROR: Implausibly small genome size $gs. Check units!\n");
+ }
+ }
+ }
+
+ foreach my $var ("corOvlErrorRate", "obtOvlErrorRate", "utgOvlErrorRate", "corErrorRate", "cnsErrorRate", "obtErrorRate") {
+ if (!defined(getGlobal($var))) {
+ addCommandLineError("ERROR: Invalid '$var' specified; must be set\n");
+ }
+ elsif (getGlobal($var) !~ m/^[.-0123456789]/) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
+ }
+ elsif ((getGlobal($var) < 0.0) || (getGlobal($var) > 1.0)) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0 and no more than 1.0\n");
+ }
+ }
+
if (getGlobal("minReadLength") < getGlobal("minOverlapLength")) {
my $mr = getGlobal("minReadLength");
my $mo = getGlobal("minOverlapLength");
addCommandLineError("ERROR: minReadLength=$mr must be at least minOverlapLength=$mo.\n");
+ }
+
+ foreach my $var ("corOutCoverage") {
+ if (!defined(getGlobal($var))) {
+ addCommandLineError("ERROR: Invalid 'corOutCoverage' specified (" . getGlobal("corOutCoverage") . "); must be at least 1.0\n");
+ }
+ elsif (getGlobal($var) !~ m/^[.-0123456789]/) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
+ }
+ elsif (getGlobal($var) < 1.0) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n");
+ }
+ }
- # Or we can just reset one or the other....
- #print STDERR "-- WARNING: minReadLength reset from $mr to $mo (limited by minOverlapLength)\n";
- #setGlobal("minOverlapLength", $mo);
+ foreach my $var ("corMaxEvidenceCoverageGlobal", "corMaxEvidenceCoverageLocal") {
+ if (!defined(getGlobal($var))) {
+ # If undef, defaults to corOutCoverage in CorrectReads.pm
+ }
+ elsif (getGlobal($var) =~ m/^(\d*\.*\d*)(x*)$/) {
+ if (($1 < 1.0) && ($2 ne "x")) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n");
+ }
+ }
+ else {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
+ }
+ }
+
+ foreach my $var ("utgGraphDeviation", "utgRepeatDeviation", "utgRepeatConfusedBP", "minReadLength", "minOverlapLength") {
+ if (!defined(getGlobal($var))) {
+ addCommandLineError("ERROR: Invalid '$var' specified; must be set\n");
+ }
+ elsif (getGlobal($var) !~ m/^[.-0123456789]/) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
+ }
+ elsif (getGlobal($var) < 0.0) {
+ addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0\n");
+ }
}
#
@@ -1152,54 +1384,11 @@ sub checkParameters () {
addCommandLineError("ERROR: Required parameter 'genomeSize' is not set\n") if (! defined(getGlobal("genomeSize")));
#
- # Java? Need JRE 1.8.
- #
-
- if ((getGlobal("corOverlapper") eq "mhap") ||
- (getGlobal("obtOverlapper") eq "mhap") ||
- (getGlobal("utgOverlapper") eq "mhap")) {
- my $java = getGlobal("java");
- my $versionStr = "unknown";
- my $version = 0;
-
- # Argh, we can't use runCommand() here, because we're included in Execution.pm. Try to check it with -x.
- # Nope. Fails if $java == "java".
-
- #if (! -x $java) {
- # addCommandLineError("ERROR: java executable '$java' not found or not executable\n");
- #}
-
- open(F, "$java -showversion 2>&1 |");
- while (<F>) {
- # First word is either "java" or "openjdk" or ...
- if (m/^.*\s+version\s+\"(\d+.\d+)(.*)\".*$/) {
- $versionStr = "$1$2";
- $version = $1;
- }
- }
- close(F);
-
- if ($version < 1.8) {
- addCommandLineError("ERROR: mhap overlapper requires java version at least 1.8.0; you have $versionStr (from '$java').\n");
- addCommandLineError("ERROR: '$java -showversion' reports:\n");
-
- open(F, "$java -showversion 2>&1 |");
- while (<F>) {
- chomp;
- addCommandLineError("ERROR: '$_'\n");
- }
- close(F);
-
- } else {
- print STDERR "-- Detected Java(TM) Runtime Environment '$versionStr' (from '$java').\n";
- }
- }
-
- #
# Minimap, no valid identities, set legacy
#
+
if (getGlobal("corOverlapper") eq "minimap") {
- setGlobalIfUndef("corLegacyFilter", 1);
+ setGlobalIfUndef("corLegacyFilter", 1);
}
#
diff --git a/src/pipelines/canu/ErrorEstimate.pm b/src/pipelines/canu/ErrorEstimate.pm
index 3adcd2c..8809838 100644
--- a/src/pipelines/canu/ErrorEstimate.pm
+++ b/src/pipelines/canu/ErrorEstimate.pm
@@ -69,7 +69,7 @@ sub uniqueKmerThreshold($$$$) {
my $threshold = 0;
my $kMer_loss = poisson_pdf($effective_coverage, 0);
- return 1 if($kMer_loss > $loss);
+ return 1 if($kMer_loss > $loss);
my $keepTrying = 1;
while($keepTrying)
@@ -119,7 +119,7 @@ sub runMHAP($$$$$$$$$$$$) {
print STDERR "--\n";
print STDERR "-- PARAMETERS: hashes=$numHashes, minMatches=$minNumMatches, threshold=$threshold\n";
- print STDERR "--\n";
+ print STDERR "--\n";
my $cmd = "$javaPath -d64 -server -Xmx4g -jar $bin/mhap-" . getGlobal("${tag}MhapVersion") . ".jar ";
$cmd .= " --no-self --repeat-weight 0.9 -k $merSize --num-hashes $numHashes --num-min-matches $minNumMatches --ordered-sketch-size $ordSketch --ordered-kmer-size $ordSketchMer --threshold $threshold --filter-threshold $filterThreshold --num-threads " . getGlobal("${tag}mhapThreads");
@@ -252,7 +252,7 @@ sub estimateCorrectedError ($$$) {
} else {
print STDERR "-- Estimated error rate: " . ($errorRate * 100) . "%.\n";
}
- setErrorRate($errorRate);
+ setErrorRate($errorRate, 1);
showErrorRates("-- ");
print STDERR "-- \n";
}
diff --git a/src/pipelines/canu/Execution.pm b/src/pipelines/canu/Execution.pm
index acbb55d..3da3977 100644
--- a/src/pipelines/canu/Execution.pm
+++ b/src/pipelines/canu/Execution.pm
@@ -55,11 +55,12 @@ package canu::Execution;
require Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(stopBefore stopAfter skipStage emitStage touch getInstallDirectory getBinDirectory getBinDirectoryShellCode submitScript submitOrRunParallelJob runCommand runCommandSilently findCommand findExecutable);
+@EXPORT = qw(stopBefore stopAfter skipStage emitStage touch getInstallDirectory getJobIDShellCode getLimitShellCode getBinDirectory getBinDirectoryShellCode submitScript submitOrRunParallelJob runCommand runCommandSilently findCommand findExecutable caExit caFailure);
use strict;
use Config; # for @signame
use Cwd qw(getcwd);
+use Carp qw(cluck);
use POSIX ":sys_wait_h"; # For waitpid(..., &WNOHANG)
use List::Util qw(min max);
@@ -250,304 +251,16 @@ sub stopAfter ($) {
}
-sub lookupStageLabel ($) {
- my $label = shift @_;
- my %ckp;
- my $index;
-
- # For correction
-
- $index = 100;
-
- $ckp{'cor-gatekeeper'} = $index++;
-
- $ckp{'cor-meryl'} = $index++;
-
- $ckp{'cor-mhapConfigure'} = $index++;
- $ckp{'cor-mhapPrecomputeCheck'} = $index++; # + attempt
- $ckp{'cor-mhapCheck'} = $index++; # + attempt
- $ckp{'cor-overlapConfigure'} = $index++;
- $ckp{'cor-overlapCheck'} = $index++; # + attempt
-
- $ckp{'cor-overlapStoreConfigure'} = $index++;
- $ckp{'cor-overlapStoreBucketizerCheck'} = $index++;
- $ckp{'cor-overlapStoreSorterCheck'} = $index++;
- $ckp{'cor-createOverlapStore'} = $index++;
-
- $ckp{'cor-buildCorrectionLayouts'} = $index++;
- $ckp{'cor-generateCorrectedReads'} = $index++; # + attempt
- $ckp{'cor-dumpCorrectedReads'} = $index++;
-
- # For trimming
-
- $index = 200;
-
- $ckp{'obt-gatekeeper'} = $index++;
-
- $ckp{'obt-meryl'} = $index++;
-
- $ckp{'obt-mhapConfigure'} = $index++;
- $ckp{'obt-mhapPrecomputeCheck'} = $index++; # + attempt
- $ckp{'obt-overlapConfigure'} = $index++;
- $ckp{'obt-overlapCheck'} = $index++; # + attempt
-
- $ckp{'obt-overlapStoreConfigure'} = $index++;
- $ckp{'obt-overlapStoreBucketizerCheck'} = $index++;
- $ckp{'obt-overlapStoreSorterCheck'} = $index++;
- $ckp{'obt-createOverlapStore'} = $index++;
-
- $ckp{'obt-trimReads'} = $index++;
- $ckp{'obt-splitReads'} = $index++;
- $ckp{'obt-dumpReads'} = $index++; # rename this
-
- # For assembly
-
- $index = 300;
-
- $ckp{'utg-gatekeeper'} = $index++;
-
- $ckp{'utg-meryl'} = $index++;
-
- $ckp{'utg-mhapConfigure'} = $index++;
- $ckp{'utg-mhapPrecomputeCheck'} = $index++; # + attempt
- $ckp{'utg-overlapConfigure'} = $index++;
- $ckp{'utg-overlapCheck'} = $index++; # + attempt
-
- $ckp{'utg-overlapStoreConfigure'} = $index++;
- $ckp{'utg-overlapStoreBucketizerCheck'} = $index++;
- $ckp{'utg-overlapStoreSorterCheck'} = $index++;
- $ckp{'utg-createOverlapStore'} = $index++;
-
- $ckp{'overlapFilterDetectConfigure'} = $index++;
- $ckp{'overlapFilterDetectCheck'} = $index++; # + attempt
- $ckp{'overlapFilterConfigure'} = $index++;
- $ckp{'overlapFilter'} = $index++; # + attempt
-
- $ckp{'readErrorDetectionConfigure'} = $index++;
- $ckp{'readErrorDetectionCheck'} = $index++; # + attempt
- $ckp{'overlapErrorAdjustmentConfigure'} = $index++;
- $ckp{'overlapErrorAdjustmentCheck'} = $index++; # + attempt
- $ckp{'updateOverlapStore'} = $index++;
-
- $ckp{'unitig'} = $index++;
-
- $ckp{'consensusConfigure'} = $index++;
- $ckp{'consensusCheck'} = $index++; # + attempt
- $ckp{'consensusLoad'} = $index++;
- $ckp{'consensusFilter'} = $index++;
-
- $ckp{'outputLayout'} = $index++;
- $ckp{'outputGraph'} = $index++;
- $ckp{'outputSequence'} = $index++;
-
- caFailure("invalid checkpoint label '$label'", undef) if (!defined($ckp{$label}));
-
- return($ckp{$label});
+sub emitStage ($$$@) {
+ return;
}
-
-# Returns true if we should skip this stage. No used, but left in for possible use in cleaning up things. Signals that we're
-# at the start of some stage, and we could clean up earlier stages.
-#
sub skipStage ($$$@) {
- my $wrk = shift @_;
- my $asm = shift @_;
- my $stage = shift @_;
- my $attempt = shift @_;
-
- my $ckpstage = "";
- my $ckpattempt = undef;
-
- # DISABLED.
return(0);
-
- if (! -e "$wrk/$asm.stage") {
- # No checkpoint file exists, must compute!
- print STDERR "No $wrk/$asm.stage file, compute it all!\n";
- return(0);
- }
-
- open(F, "< $wrk/$asm.stage") or caFailure("failed to open '$wrk/$asm.stage' for reading", undef);
- while (<F>) {
- if (m/canu\s+at\s+stage\s+(\S*)\s+\(#\d+\)\sattempt\s+(\d+)$/) {
- $ckpstage = $1;
- $ckpattempt = $2;
- }
- if (m/canu\s+at\s+stage\s+(\S*)\s+\(#\d+\)$/) {
- $ckpstage = $1;
- }
- }
- close(F);
-
- caFailure("didn't find stage in '$wrk/$asm.stage'", undef) if ($ckpstage eq "");
-
- # Don't skip it. The stage to run is after the stage in the checkpoint.
- if (lookupStageLabel($ckpstage) < lookupStageLabel($stage)) {
- #print STDERR "PURGE at stage $stage (ignored attempt $attempt)\n";
- purgeRecomputable($wrk, $asm, $stage);
- return(0);
- };
-
- # Don't skip it. The stage to run is the stage in the checkpoint, but the attempt we're trying
- # is after.
- if ((lookupStageLabel($stage) == lookupStageLabel($ckpstage) &&
- (defined($attempt)) &&
- (defined($ckpattempt)) &&
- ($ckpattempt < $attempt))) {
- #print STDERR "PURGE at stage $stage attempt $attempt\n";
- purgeRecomputable($wrk, $asm, $stage);
- return(0);
- };
-
- #print STDERR "skipStage()-- target $stage/" . lookupStageLabel($stage);
- #print STDERR " attempt $attempt" if (defined($attempt));
- #print STDERR " -- checkpoint $ckpstage/" . lookupStageLabel($ckpstage);
- #print STDERR " attempt $ckpattempt" if (defined($ckpattempt));
- #print STDERR " -- from '$wrk/$asm.stage'\n";
-
- # Skip it. But first, purge any files we'll never need again.
-
- print STDERR "PURGE extraneous before stage $stage attempt $attempt\n";
- purgeExtraneous($wrk, $asm, $stage);
-
- return(1);
}
-# Same as skipStage(), left in for future use cleaning up. Signals that we're done with a stage.
-#
-sub emitStage ($$$@) {
-
- return;
-
- my $wrk = shift @_;
- my $asm = shift @_;
- my $stage = shift @_;
- my $attempt = shift @_;
- my $time = localtime();
-
- my $label = lookupStageLabel($stage);
- my $label1 = $label - 1;
- my $attempt = (defined($attempt)) ? " attempt $attempt" : "";
- my $ATTEMPT = (defined($attempt)) ? " ATTEMPT $attempt" : "";
-
- open(F, ">> $wrk/$asm.stage") or caFailure("failed to open '$wrk/$asm.stage' for appending\n", undef);
- print F "$time -- canu at stage $stage (#$label)$attempt\n";
- close(F);
-
- #print "----------------------------------------STAGE $stage (#$label)$ATTEMPT FINISHED.\n";
-
- make_path("$wrk/$asm.stage.fileLists") if (! -d "$wrk/$asm.stage.fileLists");
-
- # Find all files created since the last checkpoint, or accessed since the last. Linux claims either -anewer or -newera will work
- #
- # Format is: access-time -- modification-time -- status-change-time -- filename
-
- # Problem - label-1 doesn't always exist because we occasionally reset the label to 200, 300. Fixed by not doing that.
-
- while (($label1 > 100) && (! -e "$wrk/$asm.stage.fileLists/stage.$label1.created")) {
- $label1--;
- }
-
- if (-e "$wrk/$asm.stage.fileLists/stage.$label1.created") {
- runCommandSilently($wrk, "find . -type f -and -newer $wrk/$asm.stage.fileLists/stage.$label1.created -print > $wrk/$asm.stage.fileLists/stage.$label.created", 1);
- runCommandSilently($wrk, "find . -type f -and -anewer $wrk/$asm.stage.fileLists/stage.$label1.created -print > $wrk/$asm.stage.fileLists/stage.$label.accessed", 1);
- } else {
- runCommandSilently($wrk, "find . -type f -print > $wrk/$asm.stage.fileLists/stage.$label.created", 1);
- }
-
- if (! -e "$wrk/$asm.stage.fileLists/stage.$label1.created") {
- caExit("failed to generate list of files created since last checkpoint", undef);
- }
-}
-
-
-#
-#my %firstAccessed;
-#my %lastAccessed;
-#
-#
-#sub readAccessed ($$$$) {
-# my $wrk = shift @_;
-# my $asm = shift @_;
-# my $stage = shift @_;
-# my $file = shift @_;
-#
-# open(G, "< $file") or die "Failed to open '$file' for reading: $!\n";
-# while (<G>) {
-# chomp;
-#
-# s!^\.\/!!;
-# s!$asm\.!PREFIX.!;
-# #s!\d\d\d\d\d\d\.out!TASKID.out!;
-# s!\d\d\d\d\d\d\.!TASKID.!;
-#
-# next if (m/PREFIX.stage.fileLists/);
-# next if (m/PREFIX.stage/);
-# next if (m/canu-logs/);
-#
-# $firstAccessed{$_} = $stage if (!exists($firstAccessed{$_}));
-# $lastAccessed{$_} = $stage;
-# }
-# close(G);
-#}
-#
-#
-## Remove files we won't ever need again.
-#sub purgeExtraneous ($$$$) {
-# my $wrk = shift @_;
-# my $asm = shift @_;
-# my $stage = shift @_;
-#
-# open(F, "ls $wrk/$asm.stage.fileLists/ |");
-# while (<F>) {
-# my $file = $_; chomp $file;
-#
-# if ($file =~ m/stage.(\d+).accessed/) {
-# readAccessed($wrk, $asm, $1, "$wrk/test.stage.fileLists/$file");
-# }
-# }
-# close(F);
-#
-# foreach my $f (keys %lastAccessed) {
-# next if ($lastAccessed{$f} < $stage);
-#
-# #print STDERR "AT stage $stage REMOVE extraneous file $f last needed in stage $lastAccessed{$f}\n";
-# }
-#}
-#
-#
-#
-## Remove files we'll be computing again. This is based on knowing the current stage we're at,
-## then examining the sge.fileLists directory for future stages and removing those files.
-##
-## If any are detected, we emit a new stage (to find files between the last stage and the last stop),
-## then delete those files.
-##
-#sub purgeRecomputable ($$$) {
-# my $wrk = shift @_;
-# my $asm = shift @_;
-# my $stage = shift @_;
-#
-# open(F, "ls $wrk/$asm.stage.fileLists/ |");
-# while (<F>) {
-# my $file = $_; chomp $file;
-#
-# if ($file =~ m/stage.(\d+).accessed/) {
-# readAccessed($wrk, $asm, $1, "$wrk/test.stage.fileLists/$file");
-# }
-# }
-# close(F);
-#
-# foreach my $f (keys %firstAccessed) {
-# next if ($stage < $firstAccessed{$f});
-#
-# print STDERR "AT stage $stage REMOVE premature file $f first needed in stage $firstAccessed{$f}\n";
-# }
-#}
-#
-
# Decide what bin directory to use.
#
@@ -570,6 +283,91 @@ sub getInstallDirectory () {
}
+#  Emits a block of shell code to parse the grid task id and offset.
+#
+#  The emitted code expects zero or one command line argument, which is interpreted
+#  differently in grid and non-grid mode.
+#    Off grid - the job to run
+#    On grid  - an offset to add to the task ID environment variable (the one named by the
+#               'gridEngineTaskID' global, e.g., SGE_TASK_ID or SLURM_ARRAY_TASK_ID) to
+#               compute the job to run
+#
+#  The emitted code leaves the job to run in the shell variable 'jobid'.
+#
+#  PBSPro refuses to run an array job with one element.  They're submitted as a normal job.  Here,
+#  we check if it is running on the grid and if the task ID (aka, array ID) isn't set.  If so, we
+#  assume it is job 1.
+#
+#  Returns the shell code as a single string.
+#
+sub getJobIDShellCode () {
+    my $string   = "";
+    my $taskenv  = getGlobal('gridEngineTaskID');
+    my $engine   = getGlobal("gridEngine");
+
+    #  gridEngine can be unset; treat that as 'not PBSPro'.
+    my $isPBSPro = (defined($engine) && (uc($engine) eq "PBSPRO"));
+
+    $string .= "# Discover the job ID to run, from either a grid environment variable and a\n";
+    $string .= "# command line offset, or directly from the command line.\n";
+    $string .= "#\n";
+
+    #  PBSPro only: a job with PBS_JOBID set but no task ID is treated as task 1.
+    if ($isPBSPro) {
+        $string .= "if [ x\$PBS_JOBID != x -a x\$$taskenv = x ]; then\n";
+        $string .= " $taskenv=1\n";
+        $string .= "fi\n";
+    }
+
+    #  No usable task ID: the command line argument is the job itself.
+    #  Otherwise the task ID is the job, and the argument is an offset.
+    $string .= "if [ x\$$taskenv = x -o x\$$taskenv = xundefined -o x\$$taskenv = x0 ]; then\n";
+    $string .= " baseid=\$1\n";           # Off grid
+    $string .= " offset=0\n";
+    $string .= "else\n";
+    $string .= " baseid=\$$taskenv\n";    # On Grid
+    $string .= " offset=\$1\n";
+    $string .= "fi\n";
+    $string .= "if [ x\$offset = x ]; then\n";
+    $string .= " offset=0\n";
+    $string .= "fi\n";
+    $string .= "if [ x\$baseid = x ]; then\n";
+    $string .= " echo Error: I need $taskenv set, or a job index on the command line.\n";
+    $string .= " exit\n";
+    $string .= "fi\n";
+    $string .= "jobid=`expr \$baseid + \$offset`\n";
+    $string .= "if [ x\$$taskenv = x ]; then\n";
+    $string .= " echo Running job \$jobid based on command line options.\n";
+    $string .= "else\n";
+    $string .= " echo Running job \$jobid based on $taskenv=\$$taskenv and offset=\$offset.\n";
+    $string .= "fi\n";
+
+    #  Return explicitly instead of relying on the value of the last append.
+    return($string);
+}
+
+
+#  Emits a block of shell code that raises a shell imposed soft limit up to the hard limit.
+#
+#  The single argument selects the limit:
+#    "processes" - max processes per user (ulimit -u)
+#    "files"     - max open files         (ulimit -n)
+#
+#  Returns the shell code as a string, or undef if the argument is neither of the above.
+#
+sub getLimitShellCode ($) {
+    my $which = shift @_;
+
+    #  limit name => [ ulimit flag letter, description used in the emitted messages ]
+    my %limits = (
+        "processes" => [ "u", "processes per user" ],
+        "files"     => [ "n", "open files"         ],
+    );
+
+    #  Anything else generates no code at all.
+    return(undef)   if (!exists($limits{$which}));
+
+    my ($flag, $what) = @{$limits{$which}};
+
+    return(join("",
+                "\n",
+                "max=`ulimit -H${flag}`\n",
+                "bef=`ulimit -S${flag}`\n",
+                "if [ \$bef -lt \$max ] ; then\n",
+                " ulimit -S${flag} \$max\n",
+                " aft=`ulimit -S${flag}`\n",
+                " echo \"Changed max ${what} from \$bef to \$aft (max \$max).\"\n",
+                " echo \"\"\n",
+                "else\n",
+                " echo \"Max ${what} limited to \$bef, no increase possible.\"\n",
+                " echo \"\"\n",
+                "fi\n",
+                "\n"));
+}
+
+
# Used inside canu to find where binaries are located. It uses uname to find OS, architecture and
# system name, then uses that to construct a path to binaries. If a "pathMap" is defined, this is
# used to hardcode a path to a system name.
@@ -683,7 +481,7 @@ sub makeUniqueJobName ($$) {
# First, find the list of all jobs that exist.
- if (getGlobal("gridEngine") eq "SGE") {
+ if (uc(getGlobal("gridEngine")) eq "SGE") {
open(F, "qstat -xml |");
while (<F>) {
$jobs{$1}++ if (m/^\s*<JB_name>(.*)<\/JB_name>$/);
@@ -691,10 +489,13 @@ sub makeUniqueJobName ($$) {
close(F);
}
- if (getGlobal("gridEngine") eq "PBS") {
+ if (uc(getGlobal("gridEngine")) eq "PBS") {
}
- if (getGlobal("gridEngine") eq "LSF") {
+ if (uc(getGlobal("gridEngine")) eq "PBSPro") {
+ }
+
+ if (uc(getGlobal("gridEngine")) eq "LSF") {
}
# If the jobName doesn't exist, we can use it.
@@ -796,7 +597,8 @@ sub submitScript ($$$) {
# However, the sequential overlap store is still built from within the canu process.
if (getGlobal("ovsMethod") eq "sequential") {
- $mem = max($mem, getGlobal("ovsMemory"));
+ $mem = getGlobal("ovsMemory");
+ $mem = $2 if ($mem =~ m/^(\d+)-(\d+)$/);
}
$memOption = buildMemoryOption($mem, 1);
@@ -818,7 +620,9 @@ sub submitScript ($$$) {
my $hold = getGlobal("gridEngineHoldOption");
# most grid engines don't understand job names to hold on, only IDs
- if (uc(getGlobal("gridEngine")) eq "PBS" || uc(getGlobal("gridEngine")) eq "SLURM"){
+ if ((uc(getGlobal("gridEngine")) eq "PBS") ||
+ (uc(getGlobal("gridEngine")) eq "PBSPRO") ||
+ (uc(getGlobal("gridEngine")) eq "SLURM")){
my $tcmd = getGlobal("gridEngineNameToJobIDCommand");
$tcmd =~ s/WAIT_TAG/$jobToWaitOn/g;
my $propJobCount = `$tcmd |wc -l`;
@@ -873,12 +677,41 @@ sub submitScript ($$$) {
# global pattern for option
#
sub buildGridArray ($$$$) {
- my $r = $_[3];
+ my ($name, $bgn, $end, $opt) = @_;
+ my $off = 0;
- $r =~ s/ARRAY_NAME/$_[0]/g; # Replace ARRAY_NAME with 'job name'
- $r =~ s/ARRAY_JOBS/$_[1]-$_[2]/g; # Replace ARRAY_JOBS with 'bgn-end'
+ # In some grids (SGE) this is the maximum size of an array job.
+ # In some grids (Slurm) this is the maximum index of an array job.
+ #
+ # So, here, we just don't let any index be above the value. Both types will be happy.
- return($r);
+ if ($end > getGlobal('gridEngineArrayMaxJobs')) {
+ $off = $bgn - 1;
+ $bgn -= $off;
+ $end -= $off;
+ }
+
+ # PBSPro requires array jobs to have bgn < end. When $bgn == $end, we
+ # just remove the array qualifier. But only if this option is setting
+ # the number of jobs, not if it is setting the name.
+
+ if (uc(getGlobal("gridEngine")) eq "PBSPRO") {
+ $opt = "" if (($bgn == $end) && ($opt =~ m/ARRAY_JOBS/));
+ $off = "";
+ }
+
+ # Further, PBS/Torque won't let scripts be passed options unless they
+ # are prefixed with a -F....and PBSPro doesn't need this.
+
+ if (uc(getGlobal("gridEngine")) eq "PBS") {
+ $off = "-F \"$off\"";
+ $off = "";
+ }
+
+ $opt =~ s/ARRAY_NAME/$name/g; # Replace ARRAY_NAME with 'job name'
+ $opt =~ s/ARRAY_JOBS/$bgn-$end/g; # Replace ARRAY_JOBS with 'bgn-end'
+
+ return($opt, $off);
}
@@ -966,19 +799,19 @@ sub buildGridJob ($$$$$$$$) {
# Figure out the command and options needed to run the job.
- my $submitCommand = getGlobal("gridEngineSubmitCommand");
- my $nameOption = getGlobal("gridEngineNameOption");
+ my $submitCommand = getGlobal("gridEngineSubmitCommand");
+ my $nameOption = getGlobal("gridEngineNameOption");
- my $jobNameT = makeUniqueJobName($jobType, $asm);
+ my $jobNameT = makeUniqueJobName($jobType, $asm);
- my $jobName = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayName"));
- my $arrayOpt = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayOption"));
+ my ($jobName, $jobOff) = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayName"));
+ my ($arrayOpt, $arrayOff) = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayOption"));
- my $outputOption = getGlobal("gridEngineOutputOption");
- my $outName = buildOutputName($path, $script, getGlobal("gridEngineArraySubmitID"));
+ my $outputOption = getGlobal("gridEngineOutputOption");
+ my $outName = buildOutputName($path, $script, getGlobal("gridEngineArraySubmitID"));
- my $memOption = buildMemoryOption($mem, $thr);
- my $thrOption = buildThreadOption($thr);
+ my $memOption = buildMemoryOption($mem, $thr);
+ my $thrOption = buildThreadOption($thr);
my $gridOpts;
@@ -998,7 +831,7 @@ sub buildGridJob ($$$$$$$$) {
$cmd .= " $nameOption \"$jobName\" \\\n";
$cmd .= " $arrayOpt \\\n";
$cmd .= " $outputOption $outName \\\n";
- $cmd .= " $path/$script.sh\n";
+ $cmd .= " $path/$script.sh $arrayOff\n";
# Save it, just because.
@@ -1065,7 +898,11 @@ sub convertToJobRange (@) {
push @jobs, ($st == $ed) ? "$st" : "$st-$ed";
- # If any of the ranges are larger than allowed, split into multiple pieces.
+ # In some grids (SGE) this is the maximum size of an array job.
+ # In some grids (Slurm) this is the maximum index of an array job.
+ #
+ # So, here, we make blocks that have at most that many jobs. When we submit the job, we'll
+ # offset the indices to be 1..Max.
my $l = getGlobal("gridEngineArrayMaxJobs") - 1;
@@ -1229,12 +1066,12 @@ sub submitOrRunParallelJob ($$$$$@) {
# compute limit based on # of cpus
my $nCParallel = getGlobal("${jobType}Concurrency");
$nCParallel = int(getGlobal("maxThreads") / $thr) if ((!defined($nCParallel)) || ($nCParallel == 0));
- $nCParallel = 1 if ((!defined($nCParallel)) || ($nCParallel == 0));
+ $nCParallel = 1 if ((!defined($nCParallel)) || ($nCParallel == 0));
# compute limit based on physical memory
my $nMParallel = getGlobal("${jobType}Concurrency");
$nMParallel = int(getGlobal("maxMemory") / getGlobal("${jobType}Memory")) if ((!defined($nMParallel)) || ($nMParallel == 0));
- $nMParallel = 1 if ((!defined($nMParallel)) || ($nMParallel == 0));
+ $nMParallel = 1 if ((!defined($nMParallel)) || ($nMParallel == 0));
# run min of our limits
my $nParallel = $nCParallel < $nMParallel ? $nCParallel : $nMParallel;
@@ -1415,4 +1252,75 @@ sub findExecutable ($) {
}
+#  Use caExit() for transient errors, like not opening files, processes that die, etc.
+#
+#  Arguments:
+#    msg - text reported as the reason canu failed
+#    log - path to a relevant log file, or undef if there is none
+#
+#  Writes a failure report to STDERR (the log tail itself is written by 'tail' to STDOUT), runs
+#  the 'onFailure' command if one is set, then exits with status 1.  Does not return.
+#
+sub caExit ($$) {
+    my $wrk = getGlobal("onExitDir");
+    my $asm = getGlobal("onExitNam");
+    my $msg = shift @_;
+    my $log = shift @_;
+
+    print STDERR "================================================================================\n";
+    print STDERR "Don't panic, but a mostly harmless error occurred and canu failed.\n";
+    print STDERR "\n";
+
+    #  Really should pass in $wrk
+    if (defined($log)) {
+        my $df = diskSpace($log);
+
+        print STDERR "Disk space available:  $df GB\n";
+        print STDERR "\n";
+    }
+
+    #  Callers frequently pass undef for the log; test for that before the file test, which
+    #  would otherwise warn about an uninitialized value.
+    if (defined($log) && (-e $log)) {
+        print STDERR "Last 50 lines of the relevant log file ($log):\n";
+        print STDERR "\n";
+        system("tail", "-n", "50", $log);    #  List form: the path is never parsed by a shell.
+        print STDERR "\n";
+    }
+
+    print STDERR "canu failed with '$msg'.\n";
+    print STDERR "\n";
+
+    my $fail = getGlobal('onFailure');
+    if (defined($fail)) {
+        runCommandSilently($wrk, "$fail $asm", 0);
+    }
+
+    exit(1);
+}
+
+
+#  Use caFailure() for errors that definitely will require code changes to fix.
+#
+#  Arguments:
+#    msg - text reported as the reason canu failed
+#    log - path to a relevant log file, or undef if there is none
+#
+#  Writes a failure report and a stack trace (via Carp's cluck) to STDERR, runs the 'onFailure'
+#  command if one is set, then exits with status 1.  Does not return.
+#
+sub caFailure ($$) {
+    my $wrk = getGlobal("onExitDir");
+    my $asm = getGlobal("onExitNam");
+    my $msg = shift @_;
+    my $log = shift @_;
+
+    print STDERR "================================================================================\n";
+    print STDERR "Please panic.  canu failed, and it shouldn't have.\n";
+    print STDERR "\n";
+    print STDERR "Stack trace:\n";
+    print STDERR "\n";
+    cluck;
+    print STDERR "\n";
+
+    #  The log is optional; check it is defined before the file test to avoid an
+    #  uninitialized value warning in the middle of the report.
+    if (defined($log) && (-e $log)) {
+        print STDERR "Last few lines of the relevant log file ($log):\n";
+        print STDERR "\n";
+        system("tail", "-n", "50", $log);    #  List form: the path is never parsed by a shell.
+    }
+
+    print STDERR "\n";
+    print STDERR "canu failed with '$msg'.\n";
+
+    my $fail = getGlobal('onFailure');
+    if (defined($fail)) {
+        runCommandSilently($wrk, "$fail $asm", 0);
+    }
+
+    exit(1);
+}
+
+
1;
diff --git a/src/pipelines/canu/Gatekeeper.pm b/src/pipelines/canu/Gatekeeper.pm
index 80f627c..4ce67e2 100644
--- a/src/pipelines/canu/Gatekeeper.pm
+++ b/src/pipelines/canu/Gatekeeper.pm
@@ -163,7 +163,7 @@ sub gatekeeperCreateStore ($$$@) {
if (-e "$wrk/$asm.gkpStore.ACCEPTED") {
rename("$wrk/$asm.gkpStore.ACCEPTED", "$wrk/$asm.gkpStore");
- rename "$wrk/$asm.gkpStore.BUILDING.errorLog", "$wrk/$asm.gkpStore.errorLog";
+ rename("$wrk/$asm.gkpStore.BUILDING.err", "$wrk/$asm.gkpStore.err");
return;
}
@@ -266,10 +266,8 @@ sub gatekeeperCreateStore ($$$@) {
if ($nProblems > 0) {
print STDERR "Gatekeeper detected problems in your input reads. Please review the logging in files:\n";
- print STDERR " $wrk/$asm.gkpStore.BUILDING.err\n" if (getGlobal("stopOnReadQuality") == 0);
- print STDERR " $wrk/$asm.gkpStore.BUILDING.errorLog\n" if (getGlobal("stopOnReadQuality") == 0);
- print STDERR " $wrk/$asm.gkpStore.err\n" if (getGlobal("stopOnReadQuality") == 1);
- print STDERR " $wrk/$asm.gkpStore.errorLog\n" if (getGlobal("stopOnReadQuality") == 1);
+ print STDERR " $wrk/$asm.gkpStore.BUILDING.err\n";
+ print STDERR " $wrk/$asm.gkpStore.BUILDING/errorLog\n";
if (getGlobal("stopOnReadQuality")) {
print STDERR "If you wish to proceed, rename the store with the following commands and restart canu.\n";
@@ -288,7 +286,6 @@ sub gatekeeperCreateStore ($$$@) {
rename "$wrk/$asm.gkpStore.BUILDING", "$wrk/$asm.gkpStore";
rename "$wrk/$asm.gkpStore.BUILDING.err", "$wrk/$asm.gkpStore.err";
- rename "$wrk/$asm.gkpStore.BUILDING.errorLog", "$wrk/$asm.gkpStore.errorLog";
}
@@ -365,30 +362,33 @@ sub gatekeeperGenerateReadLengthPlot ($$$) {
my $tag = shift @_;
my $bin = getBinDirectory();
- open(F, "> $wrk/$asm.gkpStore/readlengths.gp") or caExit("can't open '$wrk/$asm.gkpStore/readlengths.gp' for writing: $!", undef);
- print F "set title 'read length'\n";
- print F "set xlabel 'read length, bin width = 250'\n";
- print F "set ylabel 'number of reads'\n";
- print F "\n";
- print F "binwidth=250\n";
- print F "set boxwidth binwidth\n";
- print F "bin(x,width) = width*floor(x/width) + binwidth/2.0\n";
- print F "\n";
- print F "set terminal png size 1024,1024\n";
- print F "set output '$wrk/$asm.gkpStore/readlengths.lg.png'\n";
- print F "plot [] '$wrk/$asm.gkpStore/readlengths.txt' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
- print F "\n";
- print F "set terminal png size 256,256\n";
- print F "set output '$wrk/$asm.gkpStore/readlengths.sm.png'\n";
- print F "plot [] '$wrk/$asm.gkpStore/readlengths.txt' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
- close(F);
+ my $gnuplot = getGlobal("gnuplot");
+ my $format = getGlobal("gnuplotImageFormat");
+
+ open(F, "> $wrk/$asm.gkpStore/readlengths.gp") or caExit("can't open '$wrk/$asm.gkpStore/readlengths.gp' for writing: $!", undef);
+ print F "set title 'read length'\n";
+ print F "set xlabel 'read length, bin width = 250'\n";
+ print F "set ylabel 'number of reads'\n";
+ print F "\n";
+ print F "binwidth=250\n";
+ print F "set boxwidth binwidth\n";
+ print F "bin(x,width) = width*floor(x/width) + binwidth/2.0\n";
+ print F "\n";
+ print F "set terminal $format size 1024,1024\n";
+ print F "set output '$wrk/$asm.gkpStore/readlengths.lg.$format'\n";
+ print F "plot [] '$wrk/$asm.gkpStore/readlengths.txt' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
+ print F "\n";
+ print F "set terminal $format size 256,256\n";
+ print F "set output '$wrk/$asm.gkpStore/readlengths.sm.$format'\n";
+ print F "plot [] '$wrk/$asm.gkpStore/readlengths.txt' using (bin(\$1,binwidth)):(1.0) smooth freq with boxes title ''\n";
+ close(F);
- if (runCommandSilently("$wrk/$asm.gkpStore", "gnuplot $wrk/$asm.gkpStore/readlengths.gp > /dev/null 2>&1", 0)) {
- print STDERR "--\n";
- print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
- print STDERR "--\n";
- print STDERR "----------------------------------------\n";
- }
+ if (runCommandSilently("$wrk/$asm.gkpStore", "$gnuplot $wrk/$asm.gkpStore/readlengths.gp > /dev/null 2>&1", 0)) {
+ print STDERR "--\n";
+ print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
+ print STDERR "--\n";
+ print STDERR "----------------------------------------\n";
+ }
}
@@ -462,7 +462,7 @@ sub gatekeeper ($$$@) {
gatekeeperGenerateReadsList($wrk, $asm, $tag) if (! -e "$wrk/$asm.gkpStore/reads.txt");
gatekeeperGenerateLibrariesList($wrk, $asm, $tag) if (! -e "$wrk/$asm.gkpStore/libraries.txt");
gatekeeperGenerateReadLengths($wrk, $asm, $tag) if (! -e "$wrk/$asm.gkpStore/readlengths.txt");
- gatekeeperGenerateReadLengthPlot($wrk, $asm, $tag) if (! -e "$wrk/$asm.gkpStore/readlengths.png");
+ gatekeeperGenerateReadLengthPlot($wrk, $asm, $tag) if (! -e "$wrk/$asm.gkpStore/readlengths.gp");
finishStage:
gatekeeperReportReadLengthHistogram($wrk, $asm, $tag);
diff --git a/src/pipelines/canu/Grid.pm b/src/pipelines/canu/Grid.pm
index fbedc1d..722a778 100644
--- a/src/pipelines/canu/Grid.pm
+++ b/src/pipelines/canu/Grid.pm
@@ -19,6 +19,10 @@
# are a 'United States Government Work', and
# are released in the public domain
#
+ # Sergey Koren beginning on 2016-JUN-20
+ # are a 'United States Government Work', and
+ # are released in the public domain
+ #
# File 'README.licenses' in the root directory of this distribution contains
# full conditions and disclaimers for each license.
##
@@ -43,7 +47,7 @@ sub formatAllowedResources (\%$) {
my $hosts_ref = shift @_;
my $geName = shift @_;
my %hosts = %$hosts_ref;
- my $hosts = "";
+ my $hosts = undef;
print STDERR "-- \n";
@@ -73,6 +77,7 @@ sub configureRemote () {
return if (uc(getGlobal("gridEngine")) ne "");
- # Set to "1" so that shell scripts will emit "jobid=$1" at the start.
- setGlobalIfUndef("gridEngineTaskID", "1");
+ # If here, gridEngine is not set, and we're running locally.
+ # Set to a variable we don't expect to see in the environment.
+ setGlobalIfUndef("gridEngineTaskID", "CANU_LOCAL_JOB_ID");
}
diff --git a/src/pipelines/canu/Grid_PBSTorque.pm b/src/pipelines/canu/Grid_PBSTorque.pm
index a3dbb9b..186526f 100644
--- a/src/pipelines/canu/Grid_PBSTorque.pm
+++ b/src/pipelines/canu/Grid_PBSTorque.pm
@@ -40,6 +40,27 @@ use canu::Defaults;
use canu::Execution;
use canu::Grid;
+
+#  Runs 'pbsnodes --version' and inspects its output (stdout and stderr merged).
+#
+#  Returns the list (version, isPro):
+#    version - text captured after 'pbs_version =' or 'Version:'; "" if neither line was seen.
+#    isPro   - 1 if a 'pbs_version = ...' line was seen, otherwise 0.  The caller
+#              treats isPro as meaning the scheduler is PBSPro.
+#
+sub detectPBSVersion () {
+    my $isPro   = 0;
+    my $version = "";
+
+    #  Use a lexical handle; the bareword F is also used by other subs in this file,
+    #  and reusing it here would silently close whatever they had open.
+    if (open(my $pbs, "pbsnodes --version 2>&1 |")) {
+        while (<$pbs>) {
+            if (m/pbs_version\s+=\s+(.*)/) {
+                $isPro   = 1;
+                $version = $1;
+            }
+            if (m/Version:\s+(.*)/) {
+                $version = $1;
+            }
+        }
+        close($pbs);
+    }
+
+    #  If the command could not be started, fall through with the defaults ("", 0).
+    return($version, $isPro);
+}
+
+
sub detectPBSTorque () {
return if ( defined(getGlobal("gridEngine")));
@@ -48,43 +69,25 @@ sub detectPBSTorque () {
return if (!defined($pbsnodes));
- print STDERR "-- Detected PBS/Torque with 'pbsnodes' binary in $pbsnodes.\n";
- setGlobal("gridEngine", "PBS");
-}
-
+ my ($version, $isPro) = detectPBSVersion();
-sub configurePBSTorque () {
+ if ($isPro == 0) {
+ print STDERR "-- Detected PBS/Torque '$version' with 'pbsnodes' binary in $pbsnodes.\n";
+ setGlobal("gridEngine", "PBS");
+ } else {
+ print STDERR "-- Detected PBSPro '$version' with 'pbsnodes' binary in $pbsnodes.\n";
+ setGlobal("gridEngine", "PBSPRO");
+ }
+}
- return if (uc(getGlobal("gridEngine")) ne "PBS");
- setGlobalIfUndef("gridEngineSubmitCommand", "qsub");
- setGlobalIfUndef("gridEngineHoldOption", "-W depend=afteranyarray:WAIT_TAG");
- setGlobalIfUndef("gridEngineHoldOptionNoArray", "-W depend=afterany:WAIT_TAG");
- setGlobalIfUndef("gridEngineSyncOption", "");
- setGlobalIfUndef("gridEngineNameOption", "-d `pwd` -N");
- setGlobalIfUndef("gridEngineArrayOption", "-t ARRAY_JOBS");
- setGlobalIfUndef("gridEngineArrayName", "ARRAY_NAME");
- setGlobalIfUndef("gridEngineArrayMaxJobs", 65535);
- setGlobalIfUndef("gridEngineOutputOption", "-j oe -o");
- setGlobalIfUndef("gridEngineThreadsOption", "-l nodes=1:ppn=THREADS");
- setGlobalIfUndef("gridEngineMemoryOption", "-l mem=MEMORY");
- setGlobalIfUndef("gridEnginePropagateCommand", "qalter -W depend=afterany:\"WAIT_TAG\"");
- setGlobalIfUndef("gridEngineNameToJobIDCommand", "qstat -f |grep -F -B 1 WAIT_TAG | grep Id: | grep -F [] |awk '{print \$NF}'");
- setGlobalIfUndef("gridEngineNameToJobIDCommandNoArray", "qstat -f |grep -F -B 1 WAIT_TAG | grep Id: |awk '{print \$NF}'");
- setGlobalIfUndef("gridEngineTaskID", "PBS_ARRAYID");
- setGlobalIfUndef("gridEngineArraySubmitID", "\\\$PBS_ARRAYID");
- setGlobalIfUndef("gridEngineJobID", "PBS_JOBID");
- # Build a list of the resources available in the grid. This will contain a list with keys
- # of "#CPUs-#GBs" and values of the number of nodes With such a config. Later on, we'll use this
- # to figure out what specific settings to use for each algorithm.
- #
- # The list is saved in global{"availableHosts"}
- #
+sub configurePBSTorqueNodes () {
my %hosts;
- open(F, "pbsnodes |");
+ print STDERR "-- Detecting PBS/Torque resources.\n";
+ open(F, "pbsnodes |");
while (<F>) {
my $cpus = 0;
my $mem = 0;
@@ -109,3 +112,81 @@ sub configurePBSTorque () {
setGlobal("availableHosts", formatAllowedResources(%hosts, "PBS/Torque"));
}
+
+
+
+#  Builds the table of node configurations available under PBSPro by parsing
+#  the output of 'pbsnodes -av'.
+#
+#  Each time both a cpu count and a (non-zero, whole-GB) memory size have been
+#  seen, one node is counted under the key "#CPUs-#GBs".  The resulting table is
+#  formatted by formatAllowedResources() and stored in global "availableHosts".
+#
+sub configurePBSProNodes () {
+    my %hosts;
+    my $mem  = 0;
+    my $cpus = 0;
+
+    #  Divisor that converts a value with the given unit suffix into gigabytes.
+    my %perGB = ("kb" => 1024 * 1024,
+                 "mb" => 1024,
+                 "gb" => 1,
+                 "tb" => 1 / 1024);
+
+    print STDERR "-- Detecting PBSPro resources.\n";
+
+    #  Lexical handle, so the bareword F used elsewhere in this file is left alone.
+    if (open(my $nodes, "pbsnodes -av |")) {
+        while (<$nodes>) {
+
+            #  A blank line is assumed to separate node records (NOTE: confirm against
+            #  real 'pbsnodes -av' output).  Forget any half-read node so that its
+            #  values are not paired with the next node's.  If the output has no
+            #  blank lines this is a no-op.
+            if (m/^\s*$/) {
+                $cpus = 0;
+                $mem  = 0;
+                next;
+            }
+
+            if (m/resources_available\.mem\s*=\s*(\d+)(kb|mb|gb|tb)/) {
+                $mem = int($1 / $perGB{$2});
+            }
+
+            if (m/resources_available\.ncpus\s*=\s*(\d+)/) {
+                $cpus = $1;
+            }
+
+            if (($cpus > 0) && ($mem > 0)) {
+                $hosts{"$cpus-$mem"}++;
+                $cpus = 0;
+                $mem  = 0;
+            }
+        }
+        close($nodes);
+    } else {
+        print STDERR "-- WARNING: failed to run 'pbsnodes -av': $!\n";
+    }
+
+    setGlobal("availableHosts", formatAllowedResources(%hosts, "PBSPro"));
+}
+
+
+
+#  Supplies default grid settings when the grid engine is "PBS" (PBS/Torque) or
+#  "PBSPRO", then collects the node resources for that flavor.  Does nothing for
+#  any other grid engine.  Every setting goes through setGlobalIfUndef().
+#
+sub configurePBSTorque () {
+    my $engine = uc(getGlobal("gridEngine"));
+    my $isPro  = ($engine eq "PBSPRO");
+
+    return if (($engine ne "PBS") && (!$isPro));
+
+    #  Options that differ between the two flavors pick their value with $isPro;
+    #  the rest are shared.
+
+    setGlobalIfUndef("gridEngineSubmitCommand", "qsub");
+    setGlobalIfUndef("gridEngineHoldOption", $isPro ? "-W depend=afterany:WAIT_TAG" : "-W depend=afteranyarray:WAIT_TAG");
+    setGlobalIfUndef("gridEngineHoldOptionNoArray", "-W depend=afterany:WAIT_TAG");
+    setGlobalIfUndef("gridEngineSyncOption", "");
+    setGlobalIfUndef("gridEngineNameOption", $isPro ? "-N" : "-d `pwd` -N");
+    setGlobalIfUndef("gridEngineArrayOption", $isPro ? "-J ARRAY_JOBS" : "-t ARRAY_JOBS");
+    setGlobalIfUndef("gridEngineArrayName", "ARRAY_NAME");
+    setGlobalIfUndef("gridEngineArrayMaxJobs", 268435456); # Effectively unlimited.
+    setGlobalIfUndef("gridEngineOutputOption", "-j oe -o");
+    setGlobalIfUndef("gridEngineThreadsOption", "-l nodes=1:ppn=THREADS");
+    setGlobalIfUndef("gridEngineMemoryOption", "-l mem=MEMORY");
+    setGlobalIfUndef("gridEnginePropagateCommand", "qalter -W depend=afterany:\"WAIT_TAG\"");
+    setGlobalIfUndef("gridEngineNameToJobIDCommand", "qstat -f |grep -F -B 1 WAIT_TAG | grep Id: | grep -F [] |awk '{print \$NF}'");
+    setGlobalIfUndef("gridEngineNameToJobIDCommandNoArray", "qstat -f |grep -F -B 1 WAIT_TAG | grep Id: |awk '{print \$NF}'");
+    setGlobalIfUndef("gridEngineTaskID", $isPro ? "PBS_ARRAY_INDEX" : "PBS_ARRAYID");
+    setGlobalIfUndef("gridEngineArraySubmitID", $isPro ? "\\\$PBS_ARRAY_INDEX" : "\\\$PBS_ARRAYID");
+    setGlobalIfUndef("gridEngineJobID", "PBS_JOBID");
+
+    #  Build the list of resources available in the grid: keys are "#CPUs-#GBs",
+    #  values are the number of nodes with that configuration.  Later on, it is
+    #  used to figure out what specific settings to use for each algorithm.
+    #
+    #  The list is saved in global{"availableHosts"}.
+
+    if ($isPro) {
+        configurePBSProNodes();
+    } else {
+        configurePBSTorqueNodes();
+    }
+}
diff --git a/src/pipelines/canu/Grid_Slurm.pm b/src/pipelines/canu/Grid_Slurm.pm
index dedd058..6cdb0c9 100644
--- a/src/pipelines/canu/Grid_Slurm.pm
+++ b/src/pipelines/canu/Grid_Slurm.pm
@@ -59,10 +59,18 @@ sub configureSlurm () {
my $maxArraySize = 65535;
+ # From the docs (http://slurm.schedmd.com/job_array.html):
+ #
+ # Note that the minimum index value is zero and the maximum value is a Slurm configuration
+ # parameter (MaxArraySize minus one).
+ #
+ # Which is a totally stupid name for the parameter, and a totally stupid interpretation.
+
open(F, "scontrol show config |") or caExit("can't run 'scontrol' to get SLURM config", undef);
while (<F>) {
if (m/MaxArraySize\s+=\s+(\d+)/) {
- $maxArraySize = $1;
+ $maxArraySize = $1 - 1;
+ print STDERR "-- Detected Slurm with 'MaxArraySize' limited to $maxArraySize jobs.\n";
}
}
close(F);
diff --git a/src/pipelines/canu/HTML.pm b/src/pipelines/canu/HTML.pm
index b7d4872..4c309fc 100644
--- a/src/pipelines/canu/HTML.pm
+++ b/src/pipelines/canu/HTML.pm
@@ -58,23 +58,24 @@ sub simpleFigure ($$$$) {
my $sImage = shift @_;
my $dImage = shift @_;
my $text = shift @_;
+ my $format = getGlobal("gnuplotImageFormat");
# No image? Note so in the html.
- if ((! -e "$sImage.sm.png") && (! -e "$sImage.lg.png") &&
- (! -e "$dImage.sm.png") && (! -e "$dImage.lg.png")) {
+ if ((! -e "$sImage.sm.$format") && (! -e "$sImage.lg.$format") &&
+ (! -e "$dImage.sm.$format") && (! -e "$dImage.lg.$format")) {
push @$body, "<p>Image '$sImage' not found.</p>\n";
return;
}
# Copy the file to our files location.
- copyFile("$sImage.lg.png", "$dImage.lg.png");
- copyFile("$sImage.sm.png", "$dImage.sm.png");
+ copyFile("$sImage.lg.$format", "$dImage.lg.$format");
+ copyFile("$sImage.sm.$format", "$dImage.sm.$format");
# Empty image? Note so in the html.
- if ((-z "$dImage.sm.png") || (-z "$dImage.lg.png")) {
+ if ((-z "$dImage.sm.$format") || (-z "$dImage.lg.$format")) {
push @$body, "<p>Image '$sImage' is empty. Probably no data to display.</p>\n";
return;
}
@@ -82,7 +83,7 @@ sub simpleFigure ($$$$) {
# Otherwise, show it!
push @$body, "<figure>\n";
- push @$body, "<a href='$dImage.lg.png'><img src='$dImage.sm.png'></a>\n";
+ push @$body, "<a href='$dImage.lg.$format'><img src='$dImage.sm.$format'></a>\n";
push @$body, "<figcaption>\n";
push @$body, "$text\n";
push @$body, "</figcaption>\n";
@@ -251,15 +252,15 @@ sub buildMerylHTML ($$$$$$) {
}
elsif ((-e "$wrk/0-mercounts/$asm.ms$ms.ignore") && (-z "$wrk/0-mercounts/$asm.ms$ms.ignore")) {
- push @$body, "Threshold zdero. No mers reported.\n";
+ push @$body, "Threshold zero. No mers reported.\n";
}
elsif ((-e "$wrk/0-mercounts/$asm.ms$ms.fasta") && (-z "$wrk/0-mercounts/$asm.ms$ms.fasta")) {
- push @$body, "Threshold zdero. No mers reported.\n";
+ push @$body, "Threshold zero. No mers reported.\n";
}
else {
- push @$body, "Using frequent mers in <some path>\n";
+ push @$body, "Using user-supplied frequent mers.\n";
}
}
}
@@ -393,7 +394,7 @@ sub buildCorrectionHTML ($$$$$$) {
simpleFigure($body,
"$wrk/2-correction/$asm.estimate.original-x-corrected",
- "correction.html.files/$asm.estimate.original-x-corrected",
+ "$wrk.html.files/$asm.estimate.original-x-corrected",
"Scatter plot of the original read length (X axis) against the expected corrected read length (Y axis).\n" .
"Colors show a comparison of the simple filter (which doesn't use overlaps) to the expensive filter (which does).\n" .
"A large green triangle (false negatives) hints that there could be abnormally low quality regions in the reads.\n");
@@ -403,24 +404,24 @@ sub buildCorrectionHTML ($$$$$$) {
# Original vs expected shown above.
simpleFigure($body,
"$wrk/2-correction/$asm.originalLength-vs-expectedLength",
- "correction.html.files/$asm.originalLength-vs-expectedLength",
+ "$wrk.html.files/$asm.originalLength-vs-expectedLength",
"Scatter plot of original vs expected read length. Shown in filter plot above.");
simpleFigure($body,
"$wrk/2-correction/$asm.originalLength-vs-correctedLength",
- "correction.html.files/$asm.originalLength-vs-correctedLength",
+ "$wrk.html.files/$asm.originalLength-vs-correctedLength",
"Scatter plot of original vs corrected read length.");
simpleFigure($body,
"$wrk/2-correction/$asm.expectedLength-vs-correctedLength",
- "correction.html.files/$asm.expectedLength-vs-correctedLength",
+ "$wrk.html.files/$asm.expectedLength-vs-correctedLength",
"Scatter plot of expected vs corrected read length.");
# Histogram - expected vs corrected lengths NEEDS TO SHOW NEGATIVES!?
simpleFigure($body,
"$wrk/2-correction/$asm.length-difference-histograms",
- "correction.html.files/$asm.length-difference-histograms",
+ "$wrk.html.files/$asm.length-difference-histograms",
"Histogram of the difference between the expected and corrected read lengths.\n" .
"Note that a negative difference means the corrected read is larger than expected.\n");
@@ -428,7 +429,7 @@ sub buildCorrectionHTML ($$$$$$) {
simpleFigure($body,
"$wrk/2-correction/$asm.length-histograms",
- "correction.html.files/$asm.length-histograms",
+ "$wrk.html.files/$asm.length-histograms",
"Histogram of original (red), expected (green) and actual corrected (blue) read lengths.\n");
}
@@ -498,15 +499,15 @@ sub buildTrimmingHTML ($$$$$$) {
push @$body, "<p>Stage not computed or results file removed ($wrk/3-overlapbasedtrimming/$asm.1.trimReads.stats).</p>\n";
}
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputDeletedReads", "trimming.html.files/$asm.1.trimReads.inputDeletedReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputNoTrimReads", "trimming.html.files/$asm.1.trimReads.inputNoTrimReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputReads", "trimming.html.files/$asm.1.trimReads.inputReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputDeletedReads", "trimming.html.files/$asm.1.trimReads.outputDeletedReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputNoOvlReads", "trimming.html.files/$asm.1.trimReads.outputNoOvlReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputTrimmedReads", "trimming.html.files/$asm.1.trimReads.outputTrimmedReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputUnchangedReads", "trimming.html.files/$asm.1.trimReads.outputUnchangedReads", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.trim3", "trimming.html.files/$asm.1.trimReads.trim3", "");
- simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.trim5", "trimming.html.files/$asm.1.trimReads.trim5", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputDeletedReads", "$wrk.html.files/$asm.1.trimReads.inputDeletedReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputNoTrimReads", "$wrk.html.files/$asm.1.trimReads.inputNoTrimReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.inputReads", "$wrk.html.files/$asm.1.trimReads.inputReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputDeletedReads", "$wrk.html.files/$asm.1.trimReads.outputDeletedReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputNoOvlReads", "$wrk.html.files/$asm.1.trimReads.outputNoOvlReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputTrimmedReads", "$wrk.html.files/$asm.1.trimReads.outputTrimmedReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.outputUnchangedReads", "$wrk.html.files/$asm.1.trimReads.outputUnchangedReads", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.trim3", "$wrk.html.files/$asm.1.trimReads.trim3", "");
+ simpleFigure($body, "$wrk/3-overlapbasedtrimming/$asm.1.trimReads.trim5", "$wrk.html.files/$asm.1.trimReads.trim5", "");
push @$body, "<h2>Splitting</h2>\n";
push @$body, "\n";
@@ -660,6 +661,42 @@ sub buildOverlapErrorCorrectionHTML ($$$$$$) {
}
+
+#  Appends an HTML table of tig size statistics to @$body.
+#
+#  Input is read from the package filehandle F, which the caller must already
+#  have opened (and closes afterwards).  Lines are consumed until the first
+#  empty line or the end of the file.
+#
+#  Parameters:
+#    $css     - array reference (unused here)
+#    $body    - array reference; HTML lines are pushed onto it
+#    $scripts - array reference (unused here)
+#
+sub reportSizeStatistics ($$$) {
+    my $css     = shift @_; # Array reference
+    my $body    = shift @_; # Array reference
+    my $scripts = shift @_; # Array reference
+
+    push @$body, "<table>\n";
+    push @$body, "<tr><th>Fraction</th><th>Length</th><th>Sequences</th><th>Bases</th></tr>\n";
+
+    #  Test the line itself rather than eof(F): eof() becomes true as soon as the
+    #  last line has been read, which would drop that line before it is processed.
+    #  This also copes with an empty file, where the first read returns undef.
+
+    my $line = <F>; # First real line.
+
+    while (defined($line)) {
+        chomp $line;
+
+        last if (length($line) == 0);
+
+        #  Summary line for a class of tigs; shown verbatim across all four columns.
+        if ($line =~ m/^(\w+)\s+\((\d+)\s+tigs\)\s+\((\d+)\s+length\)\s+\((\d+)\s+average\)\s+\((\d+.\d+x)\s+coverage\)$/) {
+            push @$body, "<tr><td colspan='4'>$line</td></tr>\n";
+        }
+
+        #  One row per 'ngNNN <length> lgNNN <count> sum <bases> (<type>)' line.
+        if ($line =~ m/^ng(\d\d\d)\s+(\d+)\s+lg\d\d\d\s+(\d+)\s+sum\s+(\d+)\s+\(\w+\)$/) {
+            my $ng  = $1;
+            my $ngv = $2;
+            my $lgv = $3;
+            my $sum = $4;
+
+            $ng =~ s/^0*//; # Show '010' as '10'.
+
+            push @$body, "<tr><td>$ng</td><td>$ngv</td><td>$lgv</td><td>$sum</td></tr>\n";
+        }
+
+        $line = <F>;
+    }
+
+    push @$body, "</table>\n";
+}
+
+
sub buildUnitiggerHTML ($$$$$$) {
my $wrk = shift @_;
my $asm = shift @_;
@@ -668,11 +705,181 @@ sub buildUnitiggerHTML ($$$$$$) {
my $body = shift @_; # Array reference
my $scripts = shift @_; # Array reference
+ return if (! -d "$wrk/4-unitigger");
+
+ my @logs;
+
+ push @logs, "$wrk/4-unitigger/unitigger.err";
+
+ open(F, "ls $wrk/4-unitigger |");
+ while (<F>) {
+ chomp;
+
+ push @logs, "$wrk/4-unitigger/$_" if (m/log$/);
+ }
+ close(F);
+
push @$body, "<h2>Unitigs</h2>\n";
push @$body, "\n";
+
+ if (-e "$wrk/4-unitigger/unitigger.err") {
+ my $all = 0;
+ my $some = 0;
+ my $someL = 0;
+ my $olaps = 0;
+
+ open(F, "< $wrk/4-unitigger/unitigger.err");
+ while (<F>) {
+ chomp;
+
+ #if (m/maxPer.*numBelow=(\d+)\snumEqual=(\d+)\snumAbove=(\d+)\stotalLoad=(\d+)\s/) {
+ # push @$body, "Loaded $4 overlaps. $3 overlaps were omitted due to memory constraints.\n";
+ #}
+
+ $someL = $1 if (m/_maxPer\s+=\s+(\d+)\s+overlaps/);
+ $all += $1 if (m/numBelow\s+=\s+(\d+)\s+reads/);
+ $all += $1 if (m/numEqual\s+=\s+(\d+)\s+reads/);
+ $some = $1 if (m/numAbove\s+=\s+(\d+)\s+reads/);
+ $olaps = $1 if (m/totalLoad\s+=\s+(\d+)\s+overlaps/);
+
+
+ }
+ close(F);
+
+ push @$body, "<h3>Overlaps</h3>\n";
+ push @$body, "\n";
+ push @$body, "Loaded all overlaps for $all reads.<br>\n";
+ push @$body, "Loaded some overlaps for $some reads (the best $someL for each read).<br>\n" if ($some > 0);
+ push @$body, "Loaded $olaps overlaps in total.<br>\n";
+ }
+
+ if (-e "$wrk/4-unitigger/$asm.001.filterOverlaps.thr000.num000.log") {
+ push @$body, "<h3>Edges</h3>\n";
+ push @$body, "\n";
+
+ my $initContained = 0;
+ my $initSingleton = 0;
+ my $initSpur = 0;
+ my $initSpurMutualBest = 0;
+ my $initBest = 0;
+ my $initBest0Mutual = 0;
+ my $initBest1Mutual = 0;
+ my $initBest2Mutual = 0;
+
+ my $mean = 0; my $stddev = 0; my $ms = 0;
+ my $median = 0; my $mad = 0; my $mm = 0;
+ my $noBest = 0; my $highErr = 0; my $acceptable = 0;
+
+ my $suspicious = 0;
+ my $filtered1 = 0;
+ my $filtered2 = 0;
+ my $lopsided1 = 0;
+ my $lopsided2 = 0;
+
+ my $finalContained = 0;
+ my $finalSingleton = 0;
+ my $finalSpur = 0;
+ my $finalSpurMutualBest = 0;
+ my $finalBest = 0;
+ my $finalBest0Mutual = 0;
+ my $finalBest1Mutual = 0;
+ my $finalBest2Mutual = 0;
+
+ open(F, "$wrk/4-unitigger/$asm.001.filterOverlaps.thr000.num000.log");
+ $_ = <F>; chomp;
+
+ my $block = "none";
+
+ while (!eof(F)) {
+ $block = "init" if (m/^INITIAL\sEDGES/);
+ $block = "error" if (m/^ERROR\sRATES/);
+ $block = "edge" if (m/^EDGE\sFILTERING/);
+ $block = "final" if (m/^FINAL\sEDGES/);
+
+ $initContained = $1 if (($block eq "init") && (m/(\d+)\sreads\sare\scontained/));
+ $initSingleton = $1 if (($block eq "init") && (m/(\d+)\sreads\shave\sno\sbest\sedges/));
+ $initSpur = $1 if (($block eq "init") && (m/(\d+)\sreads\shave\sonly\sone\sbest\sedge.*spur/));
+ $initSpurMutualBest = $1 if (($block eq "init") && (m/(\d+)\sare\smutual\sbest/));
+ $initBest = $1 if (($block eq "init") && (m/(\d+)\sreads\shave\stwo\sbest\sedges/));
+ $initBest1Mutual = $1 if (($block eq "init") && (m/(\d+)\shave\sone\smutual\sbest/));
+ $initBest2Mutual = $1 if (($block eq "init") && (m/(\d+)\shave\stwo\smutual\sbest/));
+
+ if (($block eq "error") && (m/mean\s+(\d+.\d+)\s+stddev\s+(\d+.\d+)\s+.*\s+(\d+.\d+)\s+fraction\serror/)) {
+ $mean = $1;
+ $stddev = $2;
+ $ms = $3;
+ }
+ if (($block eq "error") && (m/median\s+(\d+.\d+)\s+mad\s+(\d+.\d+)\s+.*\s+(\d+.\d+)\s+fraction\serror/)) {
+ $median = $1;
+ $mad = $2;
+ $mm = $3;
+ }
+
+ $suspicious = $1 if (($block eq "edge") && (m/(\d+)\sreads\shave\sa\ssuspicious\soverlap\spattern/));
+ $filtered1 = $1 if (($block eq "edge") && (m/(\d+)\shad\sone/));
+ $filtered2 = $1 if (($block eq "edge") && (m/(\d+)\shad\stwo/));
+ $lopsided1 = $1 if (($block eq "edge") && (m/(\d+)\shave\sone/));
+ $lopsided2 = $1 if (($block eq "edge") && (m/(\d+)\shave\stwo/));
+
+ $finalContained = $1 if (($block eq "final") && (m/(\d+)\sreads\sare\scontained/));
+ $finalSingleton = $1 if (($block eq "final") && (m/(\d+)\sreads\shave\sno\sbest\sedges/));
+ $finalSpur = $1 if (($block eq "final") && (m/(\d+)\sreads\shave\sonly\sone\sbest\sedge.*spur/));
+ $finalSpurMutualBest = $1 if (($block eq "final") && (m/(\d+)\sare\smutual\sbest/));
+ $finalBest = $1 if (($block eq "final") && (m/(\d+)\sreads\shave\stwo\sbest\sedges/));
+ $finalBest1Mutual = $1 if (($block eq "final") && (m/(\d+)\shave\sone\smutual\sbest/));
+ $finalBest2Mutual = $1 if (($block eq "final") && (m/(\d+)\shave\stwo\smutual\sbest/));
+
+ $_ = <F>; chomp;
+ }
+
+ close(F);
+
+ $initBest0Mutual = $initBest - $initBest1Mutual - $initBest2Mutual;
+ $finalBest0Mutual = $finalBest - $finalBest1Mutual - $finalBest2Mutual;
+
+ push @$body, "Constructing unitigs using overlaps of at most this fraction error:<br>\n";
+ push @$body, "$median +- $mad = $mm = ", $mm * 100, "\% (median absolute deviation)<br>\n";
+ push @$body, "$mean +- $stddev = $ms = ", $ms * 100, "\% (standard deviation)<br>\n";
+ push @$body, "<br>\n";
+ push @$body, "INITIAL EDGES<br>\n";
+ push @$body, "$initContained reads are contained.<br>\n";
+ push @$body, "$initSingleton reads are singleton.<br>\n";
+ push @$body, "$initSpur reads are spur ($initSpurMutualBest have a mutual best edge).<br>\n";
+ push @$body, "$initBest reads form the backbone ($initBest0Mutual have no mutual best edges; $initBest1Mutual have one; $initBest2Mutual have both).<br>\n";
+ push @$body, "<br>\n";
+ push @$body, "FILTERING<br>\n";
+ push @$body, "$suspicious reads have a suspicious overlap pattern.<br>\n";
+ push @$body, "$filtered1 had one high error rate edge filtered; $filtered2 had both.<br>\n";
+ push @$body, "$lopsided1 had one size incompatible edge filtered; $lopsided2 had both.<br>\n";
+ push @$body, "<br>\n";
+ push @$body, "FINAL EDGES<br>\n";
+ push @$body, "$finalContained reads are contained.<br>\n";
+ push @$body, "$finalSingleton reads are singleton.<br>\n";
+ push @$body, "$finalSpur reads are spur ($finalSpurMutualBest have a mutual best edge).<br>\n";
+ push @$body, "$finalBest reads form the backbone ($finalBest0Mutual have no mutual best edges; $finalBest1Mutual have one; $finalBest2Mutual have both).<br>\n";
+ }
+
+
+ push @$body, "<h3>Initial Tig Sizes</h3>\n";
+
+ if (-e "$wrk/4-unitigger/$asm.003.buildUnitigs.sizes") {
+ open(F, "< $wrk/4-unitigger/$asm.003.buildUnitigs.sizes");
+ reportSizeStatistics($css, $body, $scripts);
+ close(F);
+ }
+
+ push @$body, "<h3>Final Tig Sizes</h3>\n";
+
+ if (-e "$wrk/4-unitigger/$asm.008.generateOutputs.sizes") {
+ open(F, "< $wrk/4-unitigger/$asm.008.generateOutputs.sizes");
+ reportSizeStatistics($css, $body, $scripts);
+ close(F);
+ }
+
}
+
sub buildConsensusHTML ($$$$$$) {
my $wrk = shift @_;
my $asm = shift @_;
diff --git a/src/pipelines/canu/Meryl.pm b/src/pipelines/canu/Meryl.pm
index 596fa36..6b5c98e 100644
--- a/src/pipelines/canu/Meryl.pm
+++ b/src/pipelines/canu/Meryl.pm
@@ -109,14 +109,17 @@ sub plotHistogram ($$$$) {
my $suffix = shift @_;
my $size = shift @_;
- return if (-e "$ofile.histogram.$suffix.png");
+ return if (-e "$ofile.histogram.$suffix.gp");
+
+ my $gnuplot = getGlobal("gnuplot");
+ my $format = getGlobal("gnuplotImageFormat");
open(F, "> $ofile.histogram.$suffix.gp");
print F "\n";
print F "unset multiplot\n";
print F "\n";
- print F "set terminal png size $size,$size\n";
- print F "set output '$ofile.histogram.$suffix.png'\n";
+ print F "set terminal $format size $size,$size\n";
+ print F "set output '$ofile.histogram.$suffix.$format'\n";
print F "\n";
print F "set multiplot\n";
print F "\n";
@@ -163,7 +166,7 @@ sub plotHistogram ($$$$) {
print F "plot [0:200] '$ofile.histogram' using 1:2 with lines title 'Histogram'\n";
close(F);
- if (runCommandSilently("$wrk/0-mercounts", "gnuplot $ofile.histogram.$suffix.gp > /dev/null 2>&1", 0)) {
+ if (runCommandSilently("$wrk/0-mercounts", "$gnuplot $ofile.histogram.$suffix.gp > /dev/null 2>&1", 0)) {
print STDERR "--\n";
print STDERR "-- WARNING: gnuplot failed; no plots will appear in HTML output.\n";
print STDERR "--\n";
@@ -288,7 +291,7 @@ sub merylConfigure ($$$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "if [ -e $wrk/$asm.tigStore/seqDB.v001.tig ] ; then\n";
+ print F "if [ -e $wrk/$asm.ctgStore/seqDB.v001.tig ] ; then\n";
print F " exit 0\n";
print F "fi\n";
print F "\n";
@@ -377,7 +380,7 @@ sub merylCheck ($$$) {
rename("$ofile.FINISHED.mcdat", "$ofile.mcdat");
rename("$ofile.FINISHED.mcidx", "$ofile.mcidx");
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "merylCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("merylCheck");
diff --git a/src/pipelines/canu/Output.pm b/src/pipelines/canu/Output.pm
index e61911f..4512842 100644
--- a/src/pipelines/canu/Output.pm
+++ b/src/pipelines/canu/Output.pm
@@ -40,11 +40,11 @@ package canu::Output;
require Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(outputLayout outputGraph outputSequence outputSummary);
+@EXPORT = qw(generateOutputs);
use strict;
-use POSIX qw(UINT_MAX);
+use File::Copy;
use canu::Defaults;
use canu::Execution;
@@ -56,130 +56,134 @@ use canu::HTML;
# were written to the 9-terminator directory.
-sub outputLayout ($$) {
+sub generateOutputs ($$) {
my $WRK = shift @_; # Root work directory (the -d option to canu)
my $wrk = "$WRK/unitigging"; # Local work directory
my $asm = shift @_;
my $bin = getBinDirectory();
my $cmd;
- goto allDone if (skipStage($WRK, $asm, "outputLayout") == 1);
- goto allDone if (-e "$WRK/$asm.layout");
+ my $type = "fasta"; # Should probably be an option.
- $cmd = "$bin/tgStoreDump \\\n";
- $cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
- $cmd .= " -o $WRK/$asm \\\n";
- $cmd .= " -layout \\\n";
- $cmd .= "> $WRK/$asm.layout.err 2>&1";
+ goto allDone if (skipStage($WRK, $asm, "generateOutputs") == 1);
- if (runCommand($wrk, $cmd)) {
- caExit("failed to output layouts", "$WRK/$asm.layout.err");
- }
-
- unlink "$WRK/$asm.layout.err";
-
- finishStage:
- emitStage($WRK, $asm, "outputLayout");
- buildHTML($WRK, $asm, "utg");
-
- allDone:
- print STDERR "-- Unitig layouts saved in '$WRK/$asm.layout'.\n";
-}
-
-
-
-
-sub outputGraph ($$) {
- my $WRK = shift @_; # Root work directory (the -d option to canu)
- my $wrk = "$WRK/unitigging"; # Local work directory
- my $asm = shift @_;
- my $bin = getBinDirectory();
- my $cmd;
+ # Layouts
- goto allDone if (skipStage($WRK, $asm, "outputGraph") == 1);
- goto allDone if (-e "$WRK/$asm.gfa");
-
- if (-e "$wrk/4-unitigger/$asm.unused.edges") {
- $cmd = "$bin/buildGraph \\\n";
+ if (! -e "$WRK/$asm.contigs.layout") {
+ $cmd = "$bin/tgStoreDump \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
- $cmd .= " -E $wrk/4-unitigger/$asm.unused.edges \\\n";
- $cmd .= " -o $WRK/$asm.gfa \\\n";
- $cmd .= "2>&1 > $WRK/$asm.gfa.err\n";
+ $cmd .= " -T $wrk/$asm.ctgStore 2 \\\n";
+ $cmd .= " -o $WRK/$asm.contigs \\\n";
+ $cmd .= " -layout \\\n";
+ $cmd .= "> $WRK/$asm.contigs.layout.err 2>&1";
if (runCommand($wrk, $cmd)) {
- caExit("failed to output consensus", "$WRK/$asm.gfa.err");
+ caExit("failed to output contig layouts", "$WRK/$asm.contigs.layout.err");
}
- unlink "$WRK/$asm.gfa.err";
- } else {
- print STDERR "-- Unused best edges file missing, no graph output generated.\n";
+ unlink "$WRK/$asm.contigs.layout.err";
}
- finishStage:
- emitStage($WRK, $asm, "outputGraph");
- buildHTML($WRK, $asm, "utg");
-
- allDone:
- print STDERR "-- Unitig graph saved in '$WRK/$asm.gfa'.\n";
-}
-
-
-
+ if (! -e "$WRK/$asm.unitigs.layout") {
+ $cmd = "$bin/tgStoreDump \\\n";
+ $cmd .= " -G $wrk/$asm.gkpStore \\\n";
+ $cmd .= " -T $wrk/$asm.utgStore 2 \\\n";
+ $cmd .= " -o $WRK/$asm.unitigs \\\n";
+ $cmd .= " -layout \\\n";
+ $cmd .= "> $WRK/$asm.unitigs.layout.err 2>&1";
-sub outputSequence ($$) {
- my $WRK = shift @_; # Root work directory (the -d option to canu)
- my $wrk = "$WRK/unitigging"; # Local work directory
- my $asm = shift @_;
- my $bin = getBinDirectory();
- my $cmd;
+ if (runCommand($wrk, $cmd)) {
+ caExit("failed to output unitig layouts", "$WRK/$asm.unitigs.layout.err");
+ }
- my $type = "fasta"; # Should probably be an option.
+ unlink "$WRK/$asm.unitigs.layout.err";
+ }
- goto allDone if (skipStage($WRK, $asm, "outputSequence") == 1);
- goto allDone if (-e "$WRK/$asm.contigs.$type");
+ # Sequences
foreach my $tt ("unassembled", "bubbles", "contigs") {
+ if (! -e "$WRK/$asm.$tt.$type") {
+ $cmd = "$bin/tgStoreDump \\\n";
+ $cmd .= " -G $wrk/$asm.gkpStore \\\n";
+ $cmd .= " -T $wrk/$asm.ctgStore 2 \\\n";
+ $cmd .= " -consensus -$type \\\n";
+ $cmd .= " -$tt \\\n";
+ $cmd .= "> $WRK/$asm.$tt.$type\n";
+ $cmd .= "2> $WRK/$asm.$tt.err";
+
+ if (runCommand($WRK, $cmd)) {
+ caExit("failed to output $tt consensus sequences", "$WRK/$asm.$tt.err");
+ }
+
+ unlink "$WRK/$asm.$tt.err";
+ }
+ }
+
+ if (! -e "$WRK/$asm.unitigs.$type") {
$cmd = "$bin/tgStoreDump \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore 2 \\\n";
+ $cmd .= " -T $wrk/$asm.utgStore 2 \\\n";
$cmd .= " -consensus -$type \\\n";
- $cmd .= " -$tt \\\n";
- $cmd .= "> $WRK/$asm.$tt.$type\n";
- $cmd .= "2> $WRK/$asm.$tt.err";
+ $cmd .= " -contigs \\\n";
+ $cmd .= "> $WRK/$asm.unitigs.$type\n";
+ $cmd .= "2> $WRK/$asm.unitigs.err";
if (runCommand($WRK, $cmd)) {
- caExit("failed to output consensus", "$WRK/$asm.$tt.err");
+ caExit("failed to output unitig consensus sequences", "$WRK/$asm.unitigs.err");
}
- unlink "$WRK/$asm.$tt.err";
+ unlink "$WRK/$asm.unitigs.err";
}
- finishStage:
- emitStage($WRK, $asm, "outputSequence");
- buildHTML($WRK, $asm, "utg");
+ # Graphs
- allDone:
- print STDERR "-- Unitig sequences saved in '$WRK/$asm.*.$type'.\n";
-}
+ if ((! -e "$WRK/$asm.contigs.gfa") &&
+ ( -e "$wrk/4-unitigger/$asm.contigs.gfa")) {
+ copy("$wrk/4-unitigger/$asm.contigs.gfa", "$WRK/$asm.contigs.gfa");
+ }
+ if ((! -e "$WRK/$asm.unitigs.gfa") &&
+ ( -e "$wrk/4-unitigger/$asm.unitigs.gfa")) {
+ copy("$wrk/4-unitigger/$asm.unitigs.gfa", "$WRK/$asm.unitigs.gfa");
+ }
+ # User-supplied termination command.
-sub outputSummary ($$) {
- my $WRK = shift @_; # Root work directory (the -d option to canu)
- my $wrk = "$WRK/unitigging"; # Local work directory
- my $asm = shift @_;
- my $bin = getBinDirectory();
- my $cmd;
+ if (defined(getGlobal("onSuccess"))) {
+ print STDERR "-- Running user-supplied termination command.\n";
+ runCommand($WRK, getGlobal("onSuccess") . " $asm");
+ }
- goto allDone if (skipStage($WRK, $asm, "outputSummary") == 1);
- goto allDone if (-e "$WRK/unitiggging.html");
finishStage:
- emitStage($WRK, $asm, "outputSummary");
+ emitStage($WRK, $asm, "generateOutputs");
buildHTML($WRK, $asm, "utg");
allDone:
+ print STDERR "--\n";
+ print STDERR "-- Assembly finished.\n";
+ print STDERR "--\n";
print STDERR "-- Summary saved in '$WRK/unitigging.html'.\n";
+ print STDERR "--\n";
+ print STDERR "-- Sequences saved:\n";
+ print STDERR "-- Contigs -> '$WRK/$asm.contigs.$type'\n";
+ print STDERR "-- Bubbles -> '$WRK/$asm.bubbles.$type' (DEPRECATED)\n";
+ print STDERR "-- Unassembled -> '$WRK/$asm.unassembled.$type'\n";
+ print STDERR "-- Unitigs -> '$WRK/$asm.unitigs.$type'\n";
+ print STDERR "--\n";
+ print STDERR "-- Read layouts saved:\n";
+ print STDERR "-- Contigs -> '$WRK/$asm.contigs.layout'.\n";
+ print STDERR "-- Unitigs -> '$WRK/$asm.unitigs.layout'.\n";
+ print STDERR "--\n";
+ print STDERR "-- Graphs saved:\n";
+ print STDERR "-- Contigs -> '$WRK/$asm.contigs.gfa'.\n";
+ print STDERR "-- Unitigs -> '$WRK/$asm.unitigs.gfa'.\n";
+ print STDERR "--\n";
+ print STDERR "-- Bye.\n";
+
+ finishStage:
+ emitStage($WRK, $asm, "outputSequence");
+ buildHTML($WRK, $asm, "utg");
+
+ allDone:
}
diff --git a/src/pipelines/canu/OverlapErrorAdjustment.pm b/src/pipelines/canu/OverlapErrorAdjustment.pm
index 4316102..07c5164 100644
--- a/src/pipelines/canu/OverlapErrorAdjustment.pm
+++ b/src/pipelines/canu/OverlapErrorAdjustment.pm
@@ -101,7 +101,7 @@ sub readErrorDetectionConfigure ($$) {
goto allDone if (-e "$path/red.red");
goto allDone if (-e "$wrk/$asm.ovlStore/adjustedEvalues");
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if (-d "$wrk/$asm.ctgStore");
make_path("$path") if (! -d "$path");
@@ -164,7 +164,7 @@ sub readErrorDetectionConfigure ($$) {
if ((($maxMem > 0) && ($memory >= $maxMem * 0.75)) || # Allow 25% slop (10% is probably sufficient)
(($maxReads > 0) && ($reads >= $maxReads)) ||
(($maxBases > 0) && ($bases >= $maxBases)) ||
- (($id == $maxID - 1))) {
+ (($id == $maxID))) {
push @end, $id;
printf(STDERR "RED job %3u from read %9u to read %9u - %7.3f GB for %7u reads - %7.3f GB for %9u olaps - %7.3f GB for evidence\n",
@@ -195,14 +195,7 @@ sub readErrorDetectionConfigure ($$) {
print F "#!" . getGlobal("shell") . "\n\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $jj=1; $jj <= $nj; $jj++) {
@@ -260,7 +253,7 @@ sub readErrorDetectionCheck ($$) {
goto allDone if (-e "$path/red.red");
goto allDone if (-e "$wrk/$asm.ovlStore/adjustedEvalues");
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if (-d "$wrk/$asm.ctgStore");
# Figure out if all the tasks finished correctly.
@@ -268,7 +261,7 @@ sub readErrorDetectionCheck ($$) {
my @failedJobs;
my $failureMessage = "";
- open(A, "< $path/red.sh") or caExit("can't open '$path/red.sh' for reading: $!\n", undef);
+ open(A, "< $path/red.sh") or caExit("can't open '$path/red.sh' for reading: $!", undef);
while (<A>) {
if (m/if.*jobid\s+=\s+(\d+)\s+.*then/) {
my $ji = substr("0000" . $1, -4);
@@ -324,7 +317,7 @@ sub readErrorDetectionCheck ($$) {
concatOutput("$path/red.red", @successJobs);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "readErrorDetectionCheck");
buildHTML($WRK, $asm, "utg");
stopAfter("red");
@@ -349,7 +342,7 @@ sub overlapErrorAdjustmentConfigure ($$) {
goto allDone if (-e "$path/oea.sh");
goto allDone if (-e "$wrk/$asm.ovlStore/adjustedEvalues");
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if (-d "$wrk/$asm.ctgStore");
# OEA uses 1 byte/base + 8 bytes/adjustment + 28 bytes/overlap. We don't know the number of adjustments, but that's
# basically error rate. No adjustment is output for mismatches.
@@ -375,11 +368,12 @@ sub overlapErrorAdjustmentConfigure ($$) {
# Make an array of partitions, putting as many reads into each as will fit in the desired memory.
- my @bgn;
- my @end;
- my $nj = 0;
+ tryOEAagain:
+ my @bgn; undef @bgn;
+ my @end; undef @end;
+ my @log; undef @log;
- #getAllowedResources("", "oea");
+ my $nj = 0;
my $maxID = getNumberOfReadsInStore($wrk, $asm);
my $maxMem = getGlobal("oeaMemory") * 1024 * 1024 * 1024;
@@ -388,14 +382,16 @@ sub overlapErrorAdjustmentConfigure ($$) {
print STDERR "\n";
print STDERR "Configure OEA for ", getGlobal("oeaMemory"), "gb memory with batches of at most ", ($maxReads > 0) ? $maxReads : "(unlimited)", " reads and ", ($maxBases > 0) ? $maxBases : "(unlimited)", " bases.\n";
- print STDERR "\n";
my $reads = 0;
my $bases = 0;
my $olaps = 0;
- my $coverage = getExpectedCoverage($wrk, $asm);
- my $corrSize = (-s "$path/red.red");
+ my $coverage = getExpectedCoverage($wrk, $asm);
+ my $corrSize = (-s "$path/red.red");
+
+ my $smallJobs = 0;
+ my $smallJobSize = 1024;
push @bgn, 1;
@@ -423,7 +419,9 @@ sub overlapErrorAdjustmentConfigure ($$) {
(($id == $maxID))) {
push @end, $id;
- printf(STDERR "OEA job %3u from read %9u to read %9u - %4.1f bases + %4.1f adjusts + %4.1f reads + %4.1f olaps + %4.1f fseq/rseq + %4.1f fadj/radj + %4.1f work + %4.1f misc = %5.1f MB\n",
+ $smallJobs++ if ($end[$nj] - $bgn[$nj] < $smallJobSize);
+
+ push @log, sprintf("OEA job %3u from read %9u to read %9u - %4.1f bases + %4.1f adjusts + %4.1f reads + %4.1f olaps + %4.1f fseq/rseq + %4.1f fadj/radj + %4.1f work + %4.1f misc = %5.1f MB\n",
$nj + 1, $bgn[$nj], $end[$nj],
$memBases / 1024 / 1024,
$memAdj1 / 1024 / 1024,
@@ -445,20 +443,38 @@ sub overlapErrorAdjustmentConfigure ($$) {
}
}
+ # If too many small jobs, increase memory and try again. We'll allow any size jobs as long as
+ # there are 8 or less, but then demand there are at most 2 small jobs.
+
+ if (($nj > 8) && ($smallJobs >= 2)) {
+ my $curMem = getGlobal("oeaMemory");
+ my $newMem = int(1000 * getGlobal("oeaMemory") * 1.25) / 1000;
+
+ print STDERR " FAILED - configured $nj jobs, but $smallJobs jobs process $smallJobSize reads or less each. Increasing memory from $curMem GB to $newMem GB.\n";
+
+ setGlobal("oeaMemory", $newMem);
+
+ goto tryOEAagain;
+ }
+
+ # Report.
+
+ print STDERR "Configured $nj jobs.\n";
+ print STDERR "\n";
+
+ foreach my $l (@log) {
+ print STDERR $l;
+ }
+
+ print STDERR "\n";
+
# Dump a script
open(F, "> $path/oea.sh") or caExit("can't open '$path/oea.sh' for writing: $!", undef);
print F "#!" . getGlobal("shell") . "\n\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $jj=1; $jj <= $nj; $jj++) {
@@ -529,7 +545,7 @@ sub overlapErrorAdjustmentCheck ($$) {
my @failedJobs;
my $failureMessage = "";
- open(A, "< $path/oea.sh") or caExit("can't open '$path/oea.sh' for reading: $!\n", undef);
+ open(A, "< $path/oea.sh") or caExit("can't open '$path/oea.sh' for reading: $!", undef);
while (<A>) {
if (m/if.*jobid\s+=\s+(\d+)\s+.*then/) {
my $ji = substr("0000" . $1, -4);
@@ -584,7 +600,7 @@ sub overlapErrorAdjustmentCheck ($$) {
}
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "overlapErrorAdjustmentCheck");
buildHTML($WRK, $asm, "utg");
stopAfter("oea");
@@ -609,7 +625,7 @@ sub updateOverlapStore ($$) {
goto allDone if (-e "$wrk/$asm.ovlStore/evalues");
goto allDone if (-e "$wrk/$asm.ovlStore/adjustedEvalues");
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if (-d "$wrk/$asm.ctgStore");
caExit("didn't find '$path/oea.files' to add to store, yet overlapper finished", undef) if (! -e "$path/oea.files");
diff --git a/src/pipelines/canu/OverlapInCore.pm b/src/pipelines/canu/OverlapInCore.pm
index b1609aa..1d2d661 100644
--- a/src/pipelines/canu/OverlapInCore.pm
+++ b/src/pipelines/canu/OverlapInCore.pm
@@ -27,6 +27,10 @@
# are a 'United States Government Work', and
# are released in the public domain
#
+ # Sergey Koren beginning on 2016-JUN-08
+ # are a 'United States Government Work', and
+ # are released in the public domain
+ #
# File 'README.licenses' in the root directory of this distribution contains
# full conditions and disclaimers for each license.
##
@@ -78,11 +82,16 @@ sub overlapConfigure ($$$$) {
make_path("$path") if (! -d "$path");
+ # overlapInCorePartition internally uses 'WORKING' outputs, and renames to the final
+ # version right before it exits. All we need to do here is check for existence of
+ # the output, and exit if the command fails.
+
if (! -e "$path/$asm.partition.ovlopt") {
- # These used to be runCA options, but were removed in canu. They were used mostly for illumina-pacbio correction,
- # but were also used (or could have been used) during the Salmon assembly when overlaps were computed differently
- # depending on the libraries involved (and was run manually). These are left in for documentation.
+ # These used to be runCA options, but were removed in canu. They were used mostly for
+ # illumina-pacbio correction, but were also used (or could have been used) during the
+ # Salmon assembly when overlaps were computed differently depending on the libraries
+ # involved (and was run manually). These are left in for documentation.
#
#my $checkLibrary = getGlobal("${tag}CheckLibrary");
#my $hashLibrary = getGlobal("${tag}HashLibrary");
@@ -128,8 +137,6 @@ sub overlapConfigure ($$$$) {
close(JOB);
close(OPT);
- #getAllowedResources($tag, "ovl");
-
if (! -e "$path/overlap.sh") {
my $merSize = getGlobal("${tag}OvlMerSize");
@@ -147,14 +154,7 @@ sub overlapConfigure ($$$$) {
print F "\n";
print F "perl='/usr/bin/env perl'\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $ii=1; $ii<=scalar(@bat); $ii++) {
@@ -171,7 +171,7 @@ sub overlapConfigure ($$$$) {
print F " mkdir $path/\$bat\n";
print F "fi\n";
print F "\n";
- print F "if [ -e $path/\$job.ovb.gz ]; then\n";
+ print F "if [ -e $path/\$job.ovb ]; then\n";
print F " echo Job previously completed successfully.\n";
print F " exit\n";
print F "fi\n";
@@ -187,14 +187,15 @@ sub overlapConfigure ($$$$) {
print F " --hashload $hashLoad \\\n";
print F " --maxerate ", getGlobal("${tag}OvlErrorRate"), " \\\n";
print F " --minlength ", getGlobal("minOverlapLength"), " \\\n";
+ print F " --minkmers \\\n" if (defined(getGlobal("${tag}OvlFilter")) && getGlobal("${tag}OvlFilter")==1);
print F " \$opt \\\n";
- print F " -o $path/\$job.ovb.WORKING.gz \\\n";
+ print F " -o $path/\$job.ovb.WORKING \\\n";
print F " -s $path/\$job.stats \\\n";
#print F " -H $hashLibrary \\\n" if ($hashLibrary ne "0");
#print F " -R $refLibrary \\\n" if ($refLibrary ne "0");
print F " $wrk/$asm.gkpStore \\\n";
print F "&& \\\n";
- print F "mv $path/\$job.ovb.WORKING.gz $path/\$job.ovb.gz\n";
+ print F "mv $path/\$job.ovb.WORKING $path/\$job.ovb\n";
print F "\n";
print F "exit 0\n";
close(F);
@@ -405,7 +406,7 @@ sub overlapCheck ($$$$) {
reportOverlapStats($wrk, $asm, @statsJobs);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-overlapCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("overlapper");
diff --git a/src/pipelines/canu/OverlapMMap.pm b/src/pipelines/canu/OverlapMMap.pm
index cfde7c3..91aebd7 100644
--- a/src/pipelines/canu/OverlapMMap.pm
+++ b/src/pipelines/canu/OverlapMMap.pm
@@ -196,14 +196,7 @@ sub mmapConfigure ($$$$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $ii=1; $ii < scalar(@blocks); $ii++) {
print F "if [ \$jobid -eq $ii ] ; then\n";
@@ -233,6 +226,7 @@ sub mmapConfigure ($$$$) {
print F " -G $wrk/$asm.gkpStore \\\n";
print F " \$rge \\\n";
print F " -nolibname \\\n";
+ print F " -noreadname \\\n";
print F " -fasta \\\n";
print F " -o $path/blocks/\$job \\\n";
print F "|| \\\n";
@@ -249,14 +243,7 @@ sub mmapConfigure ($$$$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $ii=1; $ii < scalar(@hashes); $ii++) {
print F "if [ \$jobid -eq $ii ] ; then\n";
@@ -273,7 +260,7 @@ sub mmapConfigure ($$$$) {
print F " exit 1\n";
print F "fi\n";
print F "\n";
- print F "if [ -e $path/results/\$qry.ovb.gz ]; then\n";
+ print F "if [ -e $path/results/\$qry.ovb ]; then\n";
print F " echo Job previously completed successfully.\n";
print F " exit\n";
print F "fi\n";
@@ -322,29 +309,29 @@ sub mmapConfigure ($$$$) {
print F "\n";
print F "if [ -e \"$path/results/\$qry.mmap\" -a \\\n";
- print F " ! -e \"$path/results/\$qry.ovb.gz\" ] ; then\n";
+ print F " ! -e \"$path/results/\$qry.ovb\" ] ; then\n";
print F " \$bin/mmapConvert \\\n";
- print F " -o $path/results/\$qry.mmap.ovb.WORKING.gz \\\n";
+ print F " -o $path/results/\$qry.mmap.ovb.WORKING \\\n";
print F " $path/results/\$qry.mmap \\\n";
print F " && \\\n";
- print F " mv $path/results/\$qry.mmap.ovb.WORKING.gz $path/results/\$qry.mmap.ovb.gz\n";
+ print F " mv $path/results/\$qry.mmap.ovb.WORKING $path/results/\$qry.mmap.ovb\n";
print F "fi\n";
print F "\n";
if (getGlobal('saveOverlaps') eq "0") {
print F "if [ -e \"$path/results/\$qry.mmap\" -a \\\n";
- print F " -e \"$path/results/\$qry.mmap.ovb.gz\" ] ; then\n";
+ print F " -e \"$path/results/\$qry.mmap.ovb\" ] ; then\n";
print F " rm -f $path/results/\$qry.mmap\n";
print F "fi\n";
print F "\n";
}
- print F "if [ -e \"$path/results/\$qry.mmap.ovb.gz\" ] ; then\n";
+ print F "if [ -e \"$path/results/\$qry.mmap.ovb\" ] ; then\n";
if (getGlobal("${tag}ReAlign") eq "raw") {
print F " \$bin/overlapPair \\\n";
print F " -G $wrk/$asm.gkpStore \\\n";
- print F " -O $path/results/\$qry.mmap.ovb.gz \\\n";
- print F " -o $path/results/\$qry.ovb.gz \\\n";
+ print F " -O $path/results/\$qry.mmap.ovb \\\n";
+ print F " -o $path/results/\$qry.ovb \\\n";
print F " -partial \\\n" if ($typ eq "partial");
print F " -erate ", getGlobal("corErrorRate"), " \\\n" if ($tag eq "cor");
print F " -erate ", getGlobal("obtOvlErrorRate"), " \\\n" if ($tag eq "obt");
@@ -352,7 +339,7 @@ sub mmapConfigure ($$$$) {
print F " -memory " . getGlobal("${tag}mmapMemory") . " \\\n";
print F " -t " . getGlobal("${tag}mmapThreads") . " \n";
} else {
- print F " mv -f \"$path/results/\$qry.mmap.ovb.gz\" \"$path/results/\$qry.ovb.gz\"\n";
+ print F " mv -f \"$path/results/\$qry.mmap.ovb\" \"$path/results/\$qry.ovb\"\n";
}
print F "fi\n";
@@ -472,7 +459,7 @@ sub mmapPrecomputeCheck ($$$$) {
print L @successJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-mmapPrecomputeCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("mmapPrecompute");
@@ -598,7 +585,7 @@ sub mmapCheck ($$$$) {
print L @miscJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-mmapCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("mmapOverlap");
diff --git a/src/pipelines/canu/OverlapMhap.pm b/src/pipelines/canu/OverlapMhap.pm
index 954457c..7d51772 100644
--- a/src/pipelines/canu/OverlapMhap.pm
+++ b/src/pipelines/canu/OverlapMhap.pm
@@ -143,11 +143,10 @@ sub mhapConfigure ($$$$) {
# quick guess parameter adjustment for corrected reads, hack for now and should better take error rate into account
if (($tag eq "obt") || ($tag eq "utg")) {
- $numHashes /= 4;
- $minNumMatches = floor(1.5 * $minNumMatches);
- $ordSketch = floor($ordSketch / 2);
+ $numHashes = "128";
+ $minNumMatches = 5;
+ $ordSketch = 1000;
$threshold = 1-getGlobal("${tag}OvlErrorRate");
- $blockPerGb *= 2;
}
print STDERR "--\n";
@@ -293,19 +292,13 @@ sub mhapConfigure ($$$$) {
#getAllowedResources($tag, "mhap");
my $javaPath = getGlobal("java");
+ my $javaMemory = int(getGlobal("${tag}mhapMemory") * 1024 + 0.5);
open(F, "> $path/precompute.sh") or caFailure("can't open '$path/precompute.sh' for writing: $!", undef);
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $ii=1; $ii < scalar(@blocks); $ii++) {
print F "if [ \$jobid -eq $ii ] ; then\n";
@@ -356,9 +349,9 @@ sub mhapConfigure ($$$$) {
print F "# So mhap writes its output in the correct spot.\n";
print F "cd $path/blocks\n";
print F "\n";
- print F "$javaPath -d64 -server -Xmx", getGlobal("${tag}mhapMemory"), "g \\\n";
+ print F "$javaPath -d64 -server -Xmx", $javaMemory, "m \\\n";
print F " -jar " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "\$bin/mhap-" . getGlobal("${tag}MhapVersion") . ".jar " . ($^O eq "cygwin" ? ")" : "") . "\\\n";
- print F " --repeat-weight 0.9 -k $merSize \\\n";
+ print F " --repeat-weight 0.9 --repeat-idf-scale 10 -k $merSize \\\n";
print F " --supress-noise 2 \\\n" if (defined(getGlobal("${tag}MhapFilterUnique")) && getGlobal("${tag}MhapFilterUnique") == 1);
print F " --no-tf \\\n" if (defined(getGlobal("${tag}MhapNoTf")) && getGlobal("${tag}MhapNoTf") == 1);
print F " --num-hashes $numHashes \\\n";
@@ -368,6 +361,7 @@ sub mhapConfigure ($$$$) {
print F " --threshold $threshold \\\n";
print F " --filter-threshold $filterThreshold \\\n";
print F " --num-threads ", getGlobal("${tag}mhapThreads"), " \\\n";
+ print F " " . getGlobal("${tag}MhapOptions") . " \\\n" if (defined(getGlobal("${tag}MhapOptions")));
print F " -f " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$wrk/0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz" . ($^O eq "cygwin" ? ") " : "") . "\\\n" if (-e "$wrk/0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz");
print F " -p " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$path/blocks/\$job.fasta" . ($^O eq "cygwin" ? ") " : "") . "\\\n";
print F " -q " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$path/blocks" .($^O eq "cygwin" ? ") " : "") . "\\\n";
@@ -392,14 +386,7 @@ sub mhapConfigure ($$$$) {
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
for (my $ii=1; $ii < scalar(@hashes); $ii++) {
print F "if [ \$jobid -eq $ii ] ; then\n";
@@ -417,7 +404,7 @@ sub mhapConfigure ($$$$) {
print F " exit 1\n";
print F "fi\n";
print F "\n";
- print F "if [ -e $path/results/\$qry.ovb.gz ]; then\n";
+ print F "if [ -e $path/results/\$qry.ovb ]; then\n";
print F " echo Job previously completed successfully.\n";
print F " exit\n";
print F "fi\n";
@@ -433,9 +420,9 @@ sub mhapConfigure ($$$$) {
print F getBinDirectoryShellCode();
print F "\n";
print F "if [ ! -e \"$path/results/\$qry.mhap\" ] ; then\n";
- print F " $javaPath -d64 -server -Xmx", getGlobal("${tag}mhapMemory"), "g \\\n";
+ print F " $javaPath -d64 -server -Xmx", $javaMemory, "m \\\n";
print F " -jar " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "\$bin/mhap-" . getGlobal("${tag}MhapVersion") . ".jar " . ($^O eq "cygwin" ? ")" : "") . "\\\n";
- print F " --repeat-weight 0.9 -k $merSize \\\n";
+ print F " --repeat-weight 0.9 --repeat-idf-scale 10 -k $merSize \\\n";
print F " --supress-noise 2 \\\n" if (defined(getGlobal("${tag}MhapFilterUnique")) && getGlobal("${tag}MhapFilterUnique") == 1);
print F " --no-tf \\\n" if (defined(getGlobal("${tag}MhapNoTf")) && getGlobal("${tag}MhapNoTf") == 1);
print F " --num-hashes $numHashes \\\n";
@@ -445,6 +432,7 @@ sub mhapConfigure ($$$$) {
print F " --ordered-sketch-size $ordSketch \\\n";
print F " --ordered-kmer-size $ordSketchMer \\\n";
print F " --num-threads ", getGlobal("${tag}mhapThreads"), " \\\n";
+ print F " " . getGlobal("${tag}MhapOptions") . " \\\n" if (defined(getGlobal("${tag}MhapOptions")));
print F " -f " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$wrk/0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz" . ($^O eq "cygwin" ? ")" : "") . "\\\n" if (-e "$wrk/0-mercounts/$asm.ms$merSize.frequentMers.ignore.gz");
print F " -s " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$path/blocks/\$blk.dat \$slf" . ($^O eq "cygwin" ? ")" : "") . "\\\n";
print F " -q " . ($^O eq "cygwin" ? "\$(cygpath -w " : "") . "$path/queries/\$qry" . ($^O eq "cygwin" ? ")" : "") . "\\\n";
@@ -455,38 +443,41 @@ sub mhapConfigure ($$$$) {
print F "\n";
print F "if [ -e \"$path/results/\$qry.mhap\" -a \\\n";
- print F " ! -e \"$path/results/\$qry.ovb.gz\" ] ; then\n";
+ print F " ! -e \"$path/results/\$qry.ovb\" ] ; then\n";
print F " \$bin/mhapConvert \\\n";
print F " \$cvt \\\n";
- print F " -o $path/results/\$qry.mhap.ovb.WORKING.gz \\\n";
+ print F " -o $path/results/\$qry.mhap.ovb.WORKING \\\n";
print F " $path/results/\$qry.mhap \\\n";
print F " && \\\n";
- print F " mv $path/results/\$qry.mhap.ovb.WORKING.gz $path/results/\$qry.mhap.ovb.gz\n";
+ print F " mv $path/results/\$qry.mhap.ovb.WORKING $path/results/\$qry.mhap.ovb\n";
print F "fi\n";
print F "\n";
if (getGlobal('saveOverlaps') eq "0") {
print F "if [ -e \"$path/results/\$qry.mhap\" -a \\\n";
- print F " -e \"$path/results/\$qry.mhap.ovb.gz\" ] ; then\n";
+ print F " -e \"$path/results/\$qry.mhap.ovb\" ] ; then\n";
print F " rm -f $path/results/\$qry.mhap\n";
print F "fi\n";
print F "\n";
}
- print F "if [ -e \"$path/results/\$qry.mhap.ovb.gz\" ] ; then\n";
+ print F "if [ -e \"$path/results/\$qry.mhap.ovb\" ] ; then\n";
if (getGlobal("${tag}ReAlign") eq "raw") {
print F " \$bin/overlapPair \\\n";
print F " -G $wrk/$asm.gkpStore \\\n";
- print F " -O $path/results/\$qry.mhap.ovb.gz \\\n";
- print F " -o $path/results/\$qry.ovb.gz \\\n";
+ print F " -O $path/results/\$qry.mhap.ovb \\\n";
+ print F " -o $path/results/\$qry.WORKING.ovb \\\n";
print F " -partial \\\n" if ($typ eq "partial");
+ print F " -len " , getGlobal("minOverlapLength"), " \\\n";
print F " -erate ", getGlobal("corErrorRate"), " \\\n" if ($tag eq "cor");
print F " -erate ", getGlobal("obtOvlErrorRate"), " \\\n" if ($tag eq "obt");
print F " -erate ", getGlobal("utgOvlErrorRate"), " \\\n" if ($tag eq "utg");
print F " -memory " . getGlobal("${tag}mhapMemory") . " \\\n";
- print F " -t " . getGlobal("${tag}mhapThreads") . " \n";
+ print F " -t " . getGlobal("${tag}mhapThreads") . " \\\n";
+ print F " && \\\n";
+ print F " mv -f $path/results/\$qry.WORKING.ovb $path/results/\$qry.ovb\n";
} else {
- print F " mv -f \"$path/results/\$qry.mhap.ovb.gz\" \"$path/results/\$qry.ovb.gz\"\n";
+ print F " mv -f \"$path/results/\$qry.mhap.ovb\" \"$path/results/\$qry.ovb\"\n";
}
print F "fi\n";
@@ -608,7 +599,7 @@ sub mhapPrecomputeCheck ($$$$) {
print L @successJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-mhapPrecomputeCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("mhapPrecompute");
@@ -738,7 +729,7 @@ sub mhapCheck ($$$$) {
print L @miscJobs;
close(L);
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-mhapCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("mhapOverlap");
diff --git a/src/pipelines/canu/OverlapStore.pm b/src/pipelines/canu/OverlapStore.pm
index b8cdefa..807d815 100644
--- a/src/pipelines/canu/OverlapStore.pm
+++ b/src/pipelines/canu/OverlapStore.pm
@@ -170,18 +170,34 @@ sub overlapStoreConfigure ($$$$) {
system("mkdir -p $wrk/$asm.ovlStore.BUILDING/scripts") if (! -d "$wrk/$asm.ovlStore.BUILDING/scripts");
system("mkdir -p $wrk/$asm.ovlStore.BUILDING/logs") if (! -d "$wrk/$asm.ovlStore.BUILDING/logs");
- # Run the normal store build, but just to get the partitioning.
+ # Run the normal store build, but just to get the partitioning. ovStoreBuild internally
+ # writes to config.WORKING, then renames when it is finished. No need for the script
+ # to be overly careful about incomplete files.
+
+ if (! -e "$wrk/$asm.ovlStore.BUILDING/scripts/0-config.sh") {
+ open(F, "> $wrk/$asm.ovlStore.BUILDING/scripts/0-config.sh") or die;
+ print F "#!" . getGlobal("shell") . "\n";
+ print F "\n";
+ print F getLimitShellCode("processes");
+ print F getLimitShellCode("files");
+ print F "\n";
+ print F getBinDirectoryShellCode();
+ print F "\n";
+ print F "\$bin/ovStoreBuild \\\n";
+ print F " -G $wrk/$asm.gkpStore \\\n";
+ print F " -O $wrk/$asm.ovlStore \\\n"; # NOT created!
+ print F " -M " . getGlobal("ovsMemory") . " \\\n";
+ print F " -config $wrk/$asm.ovlStore.BUILDING/config \\\n";
+ print F " -L $files \\\n";
+ close(F);
+ }
+ system("chmod +x $wrk/$asm.ovlStore.BUILDING/scripts/0-config.sh");
if (! -e "$wrk/$asm.ovlStore.BUILDING/config") {
- $cmd = "$bin/ovStoreBuild \\\n";
- $cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -O $wrk/$asm.ovlStore \\\n"; # NOT created!
- $cmd .= " -M " . getGlobal("ovsMemory") . " \\\n";
- $cmd .= " -config $wrk/$asm.ovlStore.BUILDING/config \\\n";
- $cmd .= " -L $files \\\n";
+ $cmd = "$wrk/$asm.ovlStore.BUILDING/scripts/0-config.sh \\\n";
$cmd .= "> $wrk/$asm.ovlStore.BUILDING/config.err 2>&1\n";
- if (runCommand($wrk, $cmd)) {
+ if (runCommand("$wrk/$asm.ovlStore.BUILDING/scripts", $cmd)) {
caExit("failed to generate configuration for building overlap store", "$wrk/$asm.ovlStore.BUILDING/config.err");
}
}
@@ -199,14 +215,7 @@ sub overlapStoreConfigure ($$$$) {
open(F, "> $wrk/$asm.ovlStore.BUILDING/scripts/1-bucketize.sh") or die;
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
print F "bn=`printf %04d \$jobid`\n";
print F "jn=\"undefined\"\n";
@@ -241,17 +250,8 @@ sub overlapStoreConfigure ($$$$) {
print F " rm -rf \"$wrk/$asm.ovlStore.BUILDING/create\$bn\"\n";
print F "fi\n";
print F "\n";
- print F "max=`ulimit -Hu`\n";
- print F "bef=`ulimit -Su`\n";
- print F "if [ \$bef -lt \$max ] ; then\n";
- print F " ulimit -Su \$max\n";
- print F " aft=`ulimit -Su`\n";
- print F " echo \"Changed max processes per user from \$bef to \$aft (max \$max).\"\n";
- print F " echo \"\"\n";
- print F "else\n";
- print F " echo \"Max processes per user limited to \$bef, no increase possible.\"\n";
- print F " echo \"\"\n";
- print F "fi\n";
+ print F getLimitShellCode("processes");
+ print F getLimitShellCode("files");
print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
@@ -273,26 +273,10 @@ sub overlapStoreConfigure ($$$$) {
open(F, "> $wrk/$asm.ovlStore.BUILDING/scripts/2-sort.sh") or die;
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "jobid=\$" . getGlobal("gridEngineTaskID") . "\n";
- print F "if [ x\$jobid = x -o x\$jobid = xundefined -o x\$jobid = x0 ]; then\n";
- print F " jobid=\$1\n";
- print F "fi\n";
- print F "if [ x\$jobid = x ]; then\n";
- print F " echo Error: I need " . getGlobal("gridEngineTaskID") . " set, or a job index on the command line.\n";
- print F " exit 1\n";
- print F "fi\n";
+ print F getJobIDShellCode();
print F "\n";
- print F "max=`ulimit -Hu`\n";
- print F "bef=`ulimit -Su`\n";
- print F "if [ \$bef -lt \$max ] ; then\n";
- print F " ulimit -Su \$max\n";
- print F " aft=`ulimit -Su`\n";
- print F " echo \"Changed max processes per user from \$bef to \$aft (max \$max).\"\n";
- print F " echo \"\"\n";
- print F "else\n";
- print F " echo \"Max processes per user limited to \$bef, no increase possible.\"\n";
- print F " echo \"\"\n";
- print F "fi\n";
+ print F getLimitShellCode("processes");
+ print F getLimitShellCode("files");
print F "\n";
print F getBinDirectoryShellCode();
print F "\n";
@@ -430,7 +414,7 @@ sub overlapStoreBucketizerCheck ($$$$) {
touch("$wrk/$asm.ovlStore.BUILDING/1-bucketize.success");
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-overlapStoreBucketizerCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("overlapBucketizer");
@@ -534,7 +518,7 @@ sub overlapStoreSorterCheck ($$$$) {
touch("$wrk/$asm.ovlStore.BUILDING/2-sorter.success");
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "$tag-overlapStoreSorterCheck");
buildHTML($WRK, $asm, $tag);
stopAfter("overlapSorter");
@@ -606,7 +590,7 @@ sub createOverlapStore ($$$$) {
goto allDone if (skipStage($WRK, $asm, "$tag-createOverlapStore") == 1);
goto allDone if (-d "$wrk/$asm.ovlStore");
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if (-d "$wrk/$asm.ctgStore");
# Did we _really_ complete?
@@ -693,7 +677,9 @@ sub createOverlapStore ($$$$) {
# Now all done!
finishStage:
- generateOverlapStoreStats($wrk, $asm);
+ if ($tag eq "utg") {
+ generateOverlapStoreStats($wrk, $asm);
+ }
if (-e "$wrk/$asm.ovlStore.summary") {
print STDERR "--\n";
@@ -707,7 +693,7 @@ sub createOverlapStore ($$$$) {
close(F);
} else {
- print STDERR "-- Overlap store '$wrk/$asm.ovlStore' statistics not available.\n";
+ print STDERR "-- Overlap store '$wrk/$asm.ovlStore' statistics not available (skipped in correction and trimming stages).\n";
}
emitStage($WRK, $asm, "$tag-createOverlapStore");
diff --git a/src/pipelines/canu/Unitig.pm b/src/pipelines/canu/Unitig.pm
index 59c42cf..13d0245 100644
--- a/src/pipelines/canu/Unitig.pm
+++ b/src/pipelines/canu/Unitig.pm
@@ -82,12 +82,12 @@ sub reportUnitigSizes ($$$$) {
my $gs = getGlobal("genomeSize");
my $V = substr("000000" . $version, -3);
- my $N = "$wrk/$asm.tigStore/seqDB.v$V.sizes.txt";
+ my $N = "$wrk/$asm.ctgStore/seqDB.v$V.sizes.txt";
if (! -e $N) {
$cmd = "$bin/tgStoreDump \\\n";
$cmd .= " -G $wrk/$asm.gkpStore \\\n";
- $cmd .= " -T $wrk/$asm.tigStore $version \\\n";
+ $cmd .= " -T $wrk/$asm.ctgStore $version \\\n";
$cmd .= " -sizes -s " . getGlobal("genomeSize") . " \\\n";
$cmd .= "> $N";
@@ -148,25 +148,21 @@ sub unitig ($$) {
my $asm = shift @_;
goto allDone if (skipStage($wrk, $asm, "unitig") == 1);
- goto allDone if (-d "$wrk/$asm.tigStore");
+ goto allDone if ((-d "$wrk/$asm.ctgStore") && (-d "$wrk/$asm.utgStore"));
make_path("$wrk/4-unitigger") if (! -d "$wrk/4-unitigger");
# How many reads per partition? This will change - it'll move to be after unitigs are constructed.
- my $perPart = int(getNumberOfReadsInStore($wrk, $asm) / getGlobal("cnsPartitions"));
- my $minPart = getGlobal("cnsPartitionMin");
my $overlapLength = getGlobal("minOverlapLength");
- $perPart = ($perPart < $minPart) ? ($perPart) : ($minPart);
-
# Dump a script to run the unitigger.
open(F, "> $wrk/4-unitigger/unitigger.sh") or caExit("can't open '$wrk/4-unitigger/unitigger.sh' for writing: $!\n", undef);
print F "#!" . getGlobal("shell") . "\n";
print F "\n";
- print F "if [ -e $wrk/$asm.tigStore/seqDB.v001.tig ] ; then\n";
+ print F "if [ -e $wrk/$asm.ctgStore/seqDB.v001.tig -a -e $wrk/$asm.utgStore/seqDB.v001.tig ] ; then\n";
print F " exit 0\n";
print F "fi\n";
print F "\n";
@@ -177,9 +173,7 @@ sub unitig ($$) {
print F "\$bin/bogart \\\n";
print F " -G $wrk/$asm.gkpStore \\\n";
print F " -O $wrk/$asm.ovlStore \\\n";
- print F " -T $wrk/$asm.tigStore.WORKING \\\n";
print F " -o $wrk/4-unitigger/$asm \\\n";
- print F " -B $perPart \\\n";
print F " -gs " . getGlobal("genomeSize") . " \\\n";
print F " -eg " . getGlobal("utgOvlErrorRate") . " \\\n";
print F " -eM " . getGlobal("utgOvlErrorRate") . " \\\n";
@@ -188,14 +182,16 @@ sub unitig ($$) {
print F " -db " . getGlobal("utgGraphDeviation") . " \\\n";
print F " -dr " . getGlobal("utgRepeatDeviation") . " \\\n";
print F " -ca " . getGlobal("utgRepeatConfusedBP"). " \\\n";
- print F " -cp " . "500" . " \\\n";
+ print F " -cp " . "200" . " \\\n";
print F " -threads " . getGlobal("batThreads") . " \\\n" if (defined(getGlobal("batThreads")));
print F " -M " . getGlobal("batMemory") . " \\\n" if (defined(getGlobal("batMemory")));
print F " -unassembled " . getGlobal("contigFilter") . " \\\n" if (defined(getGlobal("contigFilter")));
print F " " . getGlobal("batOptions") . " \\\n" if (defined(getGlobal("batOptions")));
print F " > $wrk/4-unitigger/unitigger.err 2>&1 \\\n";
print F "&& \\\n";
- print F "mv $wrk/$asm.tigStore.WORKING $wrk/$asm.tigStore.FINISHED\n";
+ print F "mv $wrk/4-unitigger/$asm.ctgStore $wrk/$asm.ctgStore \\\n";
+ print F "&& \\\n";
+ print F "mv $wrk/4-unitigger/$asm.utgStore $wrk/$asm.utgStore\n";
} else {
caFailure("unknown unitigger '" . getGlobal("unitigger") . "'", undef);
}
@@ -224,40 +220,34 @@ sub unitigCheck ($$) {
my $path = "$wrk/4-unitigger";
goto allDone if (skipStage($WRK, $asm, "unitigCheck", $attempt) == 1);
- goto allDone if (-e "$wrk/$asm.tigStore/seqDB.v001.tig");
+ goto allDone if ((-e "$wrk/$asm.ctgStore/seqDB.v001.tig") && (-e "$wrk/$asm.utgStore/seqDB.v001.tig"));
# Since there is only one job, if we get here, we're not done. Any other 'check' function
- # shows how to process multiple jobs. This only checks for the existence of either *.WORKING
- # (crashed or killed) or *.FINISHED (done).
-
- # If 'FINISHED' exists, the job finished successfully.
+ # shows how to process multiple jobs. This only checks for the existence of the final outputs.
- if (! -e "$wrk/$asm.tigStore.FINISHED/seqDB.v001.tig") {
+ # If not the first attempt, report the jobs that failed, and that we're recomputing.
- # If not the first attempt, report the jobs that failed, and that we're recomputing.
-
- if ($attempt > 1) {
- print STDERR "--\n";
- print STDERR "-- Unitigger failed.\n";
- print STDERR "--\n";
- }
+ if ($attempt > 1) {
+ print STDERR "--\n";
+ print STDERR "-- Unitigger failed.\n";
+ print STDERR "--\n";
+ }
- # If too many attempts, give up.
+ # If too many attempts, give up.
- if ($attempt > getGlobal("canuIterationMax")) {
- caExit("failed to generate unitigs. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
- }
+ if ($attempt > getGlobal("canuIterationMax")) {
+ caExit("failed to generate unitigs. Made " . ($attempt-1) . " attempts, jobs still failed", undef);
+ }
- # Otherwise, run some jobs.
+ # Otherwise, run some jobs.
- print STDERR "-- Unitigger attempt $attempt begins.\n";
+ print STDERR "-- Unitigger attempt $attempt begins.\n";
- emitStage($WRK, $asm, "unitigCheck", $attempt);
- buildHTML($WRK, $asm, "utg");
+ emitStage($WRK, $asm, "unitigCheck", $attempt);
+ buildHTML($WRK, $asm, "utg");
- submitOrRunParallelJob($WRK, $asm, "bat", $path, "unitigger", (1));
- return;
- }
+ submitOrRunParallelJob($WRK, $asm, "bat", $path, "unitigger", (1));
+ return;
# If onGrid, the submitOrRun() has submitted parallel jobs to the grid, and resubmitted the
# executive. # The parallel version never gets here
@@ -265,11 +255,9 @@ sub unitigCheck ($$) {
finishStage:
print STDERR "-- Unitigger finished successfully.\n";
- rename "$wrk/$asm.tigStore.FINISHED", "$wrk/$asm.tigStore";
-
reportUnitigSizes($wrk, $asm, 1, "after unitig construction");
- setGlobal("canuIteration", 0);
+ setGlobal("canuIteration", 1);
emitStage($WRK, $asm, "unitigCheck");
buildHTML($WRK, $asm, "utg");
stopAfter("unitigCheck");
diff --git a/src/pipelines/install-perl-libraries.sh b/src/pipelines/install-perl-libraries.sh
deleted file mode 100755
index 789aecc..0000000
--- a/src/pipelines/install-perl-libraries.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/sh
-
-installdir=$1
-
-# Should detect which modules we really need to install. For now, just install both.
-
-
-# Remove old junk
-
-rm -rf File-Path-2.09
-rm -rf Filesys-Df-0.92
-
-# Clean our environment
-
-unset CC
-unset CXX
-
-# Decide which perl we have.
-
-perlversion=`perl -e '$x = ($] < 5.009) ? "old" : "new"; print $x'`
-
-# Our older perl (5.8.8) uses an older MakeMaker, which doesn't know INSTALL_BASE, and installs the
-# Filesys::Df package in the wrong place.
-#
-# Our newer perl (5.10.1) knows INSTALL_BASE, and has File::Path installed already.
-
-if [ $perlversion = "old" ] ; then
-
- echo "Installing File-Path-2.09"
-
- echo \
- tar xzf File-Path-2.09.tar.gz
- tar xzf File-Path-2.09.tar.gz
-
- cd File-Path-2.09
-
- echo \
- perl Makefile.PL PREFIX=$installdir
- perl Makefile.PL PREFIX=$installdir > make.maker.err 2>&1
-
- echo \
- make install
- make install > make.install.err 2>&1
- cd ..
-
- echo "Installing Filesys-Df-0.92"
-
- echo \
- tar xzf Filesys-Df-0.92.tar.gz
- tar xzf Filesys-Df-0.92.tar.gz
-
- cd Filesys-Df-0.92
-
- echo \
- perl Makefile.PL PREFIX=$installdir
- perl Makefile.PL PREFIX=$installdir > make.maker.err 2>&1
-
- echo \
- make install
- make install > make.install.err 2>&1
-
- cd ..
-
- echo \
- mv $installdir/lib64/perl5/site_perl/5*/*/Filesys $installdir/lib64/perl5/5*/*/
- mv $installdir/lib64/perl5/site_perl/5*/*/Filesys $installdir/lib64/perl5/5*/*/
-
- echo \
- mv $installdir/lib64/perl5/site_perl/5*/*/auto/Filesys $installdir/lib64/perl5/5*/*/auto/
- mv $installdir/lib64/perl5/site_perl/5*/*/auto/Filesys $installdir/lib64/perl5/5*/*/auto/
-
- echo \
- rm -rf $installdir/lib64/perl5/site_perl
- rm -rf $installdir/lib64/perl5/site_perl
-
-else
-
- echo "Installing Filesys-Df-0.92"
-
- echo \
- tar xzf Filesys-Df-0.92.tar.gz
- tar xzf Filesys-Df-0.92.tar.gz
-
- cd Filesys-Df-0.92
-
- echo \
- perl Makefile.PL INSTALL_BASE=$installdir
- perl Makefile.PL INSTALL_BASE=$installdir > make.maker.err 2>& 1
-
- # The toplevel GNU make is setting MAKEFLAGS to be "-j --jobserver-fds=3,4". The BSD make
- # invoked below (on FreeBSD and probably OS X) requires a value for -j. So we just remove it.
- # In GNU make, '-j' says to run as many tasks in parallel as possible; this isn't heavy lifting,
- # so won't matter if it's sequential. The jobserver-fds baloney is to track jobs in parallel,
- # again, we don't care.
-
- export MAKEFLAGS=
-
- echo \
- make install
- make install > make.install.err 2>&1
-
- cd ..
-
-fi
-
-# Remove the junk
-
-rm -rf File-Path-2.09
-rm -rf Filesys-Df-0.92
-
diff --git a/src/pipelines/sanity/build-all-wgs-revisions.pl b/src/pipelines/sanity/build-all-wgs-revisions.pl
deleted file mode 100644
index 595fef0..0000000
--- a/src/pipelines/sanity/build-all-wgs-revisions.pl
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-#--------------------------------------------------------------------------------
-
-use strict;
-
-my $wgssvn = "/work/NIGHTLY/SVN-wgs";
-my $kmersvn = "/work/NIGHTLY/SVN-kmer";
-
-my $wgsbyd = "/work/NIGHTLY/bydate";
-
-system("mkdir $wgsbyd") if (! -d "$wgsbyd");
-
-my %revToDate;
-
-if (-d "$wgsbyd/TIP") {
- print STDERR "Updating to latest. DON'T FORGET TO sanity.pl rsync FIRST!\n";
- system("cd $wgsbyd/TIP/src ; svn update > update.err");
-} else {
- print STDERR "Checking out latest.\n";
- system("mkdir $wgsbyd/TIP");
- system("cd $wgsbyd/TIP ; svn co file://$wgssvn/trunk/src src > checkout.err");
-}
-
-print STDERR "Reading CA commit logs.\n";
-
-open(F, "cd $wgsbyd/TIP/src ; svn log |");
-while (<F>) {
- #print STDERR $_;
-
- # CVS relic
- #if (m/^date:\s+(....)\/(..)\/(..)\s+(..):(..):(..);\s+author:\s+(.*);\s+state/) {
- # my $fildate = "$1-$2-$3-$4$5";
- # my $cvsdate = "$1/$2/$3 $4:$5:$6 GMT";
- # my $svndate = "{$1$2$3T$4$5Z}";
- # $logdate{"$fildate\0$cvsdate\0$svndate"}++;
- #}
-
- if (m/^(r\d+)\s+\|\s+(.+)\s+\|\s+(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)\s(.\d\d\d\d)\s/) {
- my $fildate = "$3-$4-$5-$6$7";
- my $svndate = "{$3$4$5T$6$7Z}";
-
- $revToDate{$1} = "$fildate\0$svndate\0$2";
- }
-}
-close(F);
-
-
-
-my @keys = sort keys %revToDate;
-
-print STDERR "Checking out versions.\n";
-
-foreach my $wgsrev (@keys) {
- my ($dirdate, $svndate, $author) = split '\0', $revToDate{$wgsrev};
-
- #next if (-e "$wgsbyd/$dirdate");
- #next if (-e "$wgsbyd/$dirdate.tar");
- #next if (-e "$wgsbyd/$dirdate.tar.gz");
- #next if (-e "$wgsbyd/$dirdate.tar.bz2");
- #next if (-e "$wgsbyd/$dirdate.tar.xz");
-
- next if ($dirdate eq "2004-04-14-1349"); # These are all one
- next if ($dirdate eq "2004-04-14-1350"); # slow commit, which
- next if ($dirdate eq "2004-04-14-1351"); # we capture in
- next if ($dirdate eq "2004-04-14-1352"); # 2004-04-14-1354
- next if ($dirdate eq "2004-04-14-1353"); #
-
- next if ($dirdate le "2010-00");
-
- next if ($dirdate eq "2011-09-06-1707"); # Doesn't compile
-
- next if ($dirdate eq "2012-02-01-2010"); # Broken GWS overlapStore
- next if ($dirdate eq "2012-02-01-2012"); # Broken GWS overlapStore
- next if ($dirdate eq "2012-02-01-2015"); # Broken GWS overlapStore
-
- next if ($dirdate le "2014-00");
-
- system("mkdir -p $wgsbyd/$dirdate") if (! -e "$wgsbyd/$dirdate");
- system("ln -s $dirdate $wgsbyd/$wgsrev") if (! -e "$wgsbyd/$wgsrev");
-
- # Checkout the assembler
-
- if (! -e "$wgsbyd/$dirdate/src") {
- print "$dirdate -- $wgsrev CHECKOUT\n";
-
- system("cd $wgsbyd/$dirdate ; svn co -r $wgsrev file://$wgssvn/trunk/src src > src.$wgsrev.err");
- } else {
- print "$dirdate -- $wgsrev UPDATE\n";
-
- system("cd $wgsbyd/$dirdate/src ; svn update -r $wgsrev");
- }
-
- # Checkout kmer, or, more probably, just link to one that exists already
-
- my $kmerev = "0000";
-
- open(L, "svn info -r \"$svndate\" file://$kmersvn/trunk |") or die "Failed to get info: $!\n";
- while (<L>) {
- $kmerev = $1 if (m/^Revision:\s+(\d+)/);
- }
- close(L);
-
- die "Didn't find a kmer revision in $wgsbyd/$dirdate/kmer/kmer.err.\n" if ($kmerev eq "0000");
-
- if (! -e "$wgsbyd/kmer$kmerev") {
- print "$dirdate -- kmer$kmerev CHECKOUT\n";
-
- system("cd $wgsbyd/$dirdate && svn co -r \"$svndate\" file://$kmersvn/trunk $wgsbyd/kmer$kmerev > $wgsbyd/kmer$kmerev.err 2>&1");
- }
-
- system("ln -s ../kmer$kmerev $wgsbyd/$dirdate/kmer") if (! -e "$wgsbyd/$dirdate/kmer");
-
- # Compile
-
- if (! -e "$wgsbyd/kmer$kmerev/build.err") {
- print "$dirdate -- kmer$kmerev COMPILE\n";
-
- if (! -e "$wgsbyd/kmer$kmerev/Makefile") {
- system("cd $wgsbyd/kmer$kmerev && sh configure.sh > configure.err 2>&1");
- }
-
- open(F, "> $wgsbyd/kmer$kmerev/build.err");
- print F "Waiting to compile.\n";
- close(F);
-
- system("qsub -N kmer$kmerev -cwd -j y -o $wgsbyd/kmer$kmerev/build.err -q vomit.q -wd $wgsbyd/kmer$kmerev -b y gmake install");
- #system("cd $wgsbyd/kmer$kmerev && gmake install > build.err 2>&1 &");
- }
-
- if (! -e "$wgsbyd/$dirdate/src/build.err") {
- print "$dirdate -- $wgsrev COMPILE\n";
-
- open(F, "> $wgsbyd/$dirdate/src/build.err");
- print F "Waiting to compile.\n";
- close(F);
-
- system("qsub -N wgs$wgsrev -hold_jid kmer$kmerev -cwd -j y -o $wgsbyd/$dirdate/src/build.err -q vomit.q -wd $wgsbyd/$dirdate/src -b y gmake");
- #system("cd $wgsbyd/$dirdate/src && gmake > build.err 2>&1");
- }
-
- # Archive
-
- #if (! -e "$wgsbyd/$dirdate.tar.bz2") {
- # Use sge, hold on both $kmerev and $wgsrev
- # system("cd $wgsbyd && tar -cf - $dirdate | xz -9vc > $dirdate.tar.bz2 && mv $wgsbyd/$dirdate $wgsbyd/$dirdate.DELETE &");
- #}
-}
-
diff --git a/src/pipelines/sanity/compile-all-wgs-revisions.pl b/src/pipelines/sanity/compile-all-wgs-revisions.pl
deleted file mode 100644
index fa4e682..0000000
--- a/src/pipelines/sanity/compile-all-wgs-revisions.pl
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-my $bgn = shift @ARGV;
-my $end = shift @ARGV;
-
-my @dates;
-
-open(F, "ls -r *bz2 |");
-while (<F>) {
- chomp;
-
- next if (defined($bgn) && ($_ lt $bgn));
- next if (defined($end) && ($end lt $_));
-
- push @dates, $_;
-}
-
-$bgn = "START_OF_TIME" if (!defined($bgn));
-$end = "END_OF_TIME" if (!defined($end));
-
-print STDERR "Building ", scalar(@dates), " revisions from $bgn to $end.\n";
-
-while (scalar(@dates) > 0) {
- my $bz2 = shift @dates;
- my $dir;
-
- if ($bz2 =~ m/(\d\d\d\d-\d\d-\d\d-\d\d\d\d).tar.bz2/) {
- $dir = $1;
- } else {
- die "Nope bz2='$bz2'\n";
- }
-
- if (! -e "$dir") {
- print STDERR "Uncompressing $bz2\n";
- system("bzip2 -dc $bz2 | tar -xf -") and die "Failed to uncompress '$bz2'\n";
- next;
- }
-
- #if (! -e "$dir/kmer/FreeBSD-amd64") {
- # print STDERR "Building $dir/kmer\n";
- # if ($dir le "2000-00-00-0000") {
- # die;
- # } else {
- # system("cd $dir/kmer && gmake install > errs 2>&1") and die "Failed to build '$dir/kmer'\n";
- # }
- #}
-
- next if ($dir le "2008-10-10-0304"); # ALL the early assemblers fail to build; no kmer
-
- next if ($dir eq "2008-10-29-1053"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-29-1619"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-29-1649"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-29-1651"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-29-1719"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-29-1720"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-10-30-0449"); # buildRefUnitigs is not committed
- next if ($dir eq "2008-11-02-0628");
- next if ($dir eq "2009-01-16-1639");
- next if ($dir eq "2009-01-16-1643");
- next if ($dir eq "2009-01-16-1646");
- next if ($dir eq "2009-01-16-1647");
- next if ($dir eq "2009-01-16-1657");
- next if ($dir eq "2009-03-06-2012");
- next if ($dir eq "2009-03-31-2004");
- next if ($dir eq "2009-03-31-2032");
- next if ($dir eq "2009-04-08-1725");
- next if ($dir eq "2009-04-09-1401");
- next if ($dir eq "2009-06-10-1805");
- next if ($dir eq "2009-06-24-1205");
- next if ($dir eq "2009-07-06-2003");
- next if ($dir eq "2009-07-27-0806");
- next if ($dir eq "2009-07-28-1230");
- next if ($dir eq "2009-08-04-1103");
- next if ($dir eq "2009-08-04-1105");
- next if ($dir eq "2009-08-06-1136");
- next if ($dir eq "2009-08-14-1103"); # BREAKS WITH rename-to-c++
- next if ($dir eq "2009-08-28-17597");
- next if ($dir eq "2009-09-04-2024");
- next if ($dir eq "2009-09-04-2025");
- next if ($dir eq "2009-09-07-0740");
- next if ($dir eq "2009-09-09-0340");
- next if ($dir eq "2009-09-09-0745");
- next if ($dir eq "2009-10-12-0619");
- next if ($dir eq "2009-10-27-1244");
- next if ($dir eq "2009-11-19-1501");
- next if ($dir eq "2009-12-03-0119");
- next if ($dir eq "2009-12-19-0536");
- next if ($dir eq "2010-01-14-0044");
- next if ($dir eq "2010-01-26-0351"); # AS_BOG_BestOverlapGraph() prototype mismatch
- next if ($dir eq "2010-02-04-2156"); # BOG fails to build
- next if ($dir eq "2010-02-09-1812"); # markUniqueUnique() bad header file
- next if ($dir eq "2010-02-17-0132"); # pointer/object confusion
- next if ($dir eq "2010-03-22-2008"); # AS_UTL_writeFastQ() not defined
- next if ($dir eq "2010-09-28-1055"); # Makefile refers to missing overlapStore_genomeLength.c
- next if ($dir eq "2010-09-28-1720"); # Makefile refers to missing overlapStore_genomeLength.c
- next if ($dir eq "2010-10-01-1343");
- next if ($dir eq "2010-10-04-0852");
- next if ($dir eq "2010-10-15-0246"); # const char * vs char * in AS_BOG
- next if ($dir eq "2010-12-08-1242"); # Makefile refers to deleted AS_CGW/AS_CGW_dump.c
- next if ($dir eq "2010-12-08-1243"); # Makefile refers to deleted AS_CGW/AS_CGW_dump.c
- next if ($dir eq "2010-12-08-1245"); # Makefile refers to deleted AS_CGW/AS_CGW_dump.c
- next if ($dir eq "2010-12-09-0405"); # Makefile refers to deleted AS_CGW/AS_CGW_dump.c
- next if ($dir eq "2010-12-10-1328"); # Makefile refers to deleted AS_CGW/AS_CGW_dump.c
- next if ($dir eq "2011-01-25-1356");
- next if ($dir eq "2011-02-11-0548");
- next if ($dir eq "2011-03-08-2118"); # Undefined constant in AS_BOG
- next if ($dir eq "2011-06-17-1303");
- next if ($dir eq "2011-08-01-1835");
- next if ($dir eq "2011-08-01-2033");
- next if ($dir eq "2011-08-01-2100");
- next if ($dir eq "2011-08-01-2159");
- next if ($dir eq "2011-08-01-2233");
- next if ($dir eq "2011-08-02-0221");
- next if ($dir eq "2011-08-02-0223");
- next if ($dir eq "2011-08-02-0225");
- next if ($dir eq "2011-08-02-0318");
- next if ($dir eq "2011-08-04-1708");
- next if ($dir eq "2011-08-24-2114");
- next if ($dir eq "2011-08-25-0240");
- next if ($dir eq "2011-08-30-1332");
- next if ($dir eq "2011-08-30-1812");
- next if ($dir eq "2011-08-30-1813");
- next if ($dir eq "2011-08-30-2309");
- next if ($dir eq "2011-09-02-1714"); # ouch, broken through 2011-09-06-1707
- next if ($dir eq "2011-09-02-1825"); # ouch.
- next if ($dir eq "2011-09-02-2204"); # ouch.
- next if ($dir eq "2011-09-03-0129"); # ouch.
- next if ($dir eq "2011-09-03-0408"); # ouch.
- next if ($dir eq "2011-09-03-0736"); # ouch.
- next if ($dir eq "2011-09-03-0813"); # ouch.
- next if ($dir eq "2011-09-04-0101"); # ouch.
- next if ($dir eq "2011-09-05-1649"); # ouch.
- next if ($dir eq "2011-09-05-2123"); # ouch.
- next if ($dir eq "2011-09-06-0111"); # ouch.
- next if ($dir eq "2011-09-06-0215"); # ouch.
- next if ($dir eq "2011-09-06-0947"); # ouch.
- next if ($dir eq "2011-09-06-1415"); # ouch.
- next if ($dir eq "2011-09-06-1422"); # ouch.
- next if ($dir eq "2011-09-06-1506"); # ouch.
- next if ($dir eq "2011-09-06-1524"); # ouch.
- next if ($dir eq "2011-09-06-1527"); # ouch.
- next if ($dir eq "2011-09-06-1601"); # ouch.
- next if ($dir eq "2011-09-06-1641"); # ouch.
- next if ($dir eq "2011-09-06-1707"); # ouch.
- next if ($dir eq "2011-12-09-2302"); # Assigning 64-bit allocation to 32-bit pointer in AS_BAT
-
-
- if (! -e "$dir/kmer/FreeBSD-amd64") {
- die "Didn't find kmer in $dir\n";
- }
-
- if (! -e "$dir/FreeBSD-amd64/bin/gatekeeper") {
- print STDERR "Building $dir/src\n";
-
- if ($dir le "2009-02-03-2104") {
- # These old builds REQUIRE that SITE_NAME be set explicitly.
- $ENV{'SITE_NAME'} = "LOCAL";
- system("cd $dir/src && gmake > errs 2>&1") and die "Failed to build '$dir/src'\n";
- undef $ENV{'SITE_NAME'};
-
- } elsif ($dir lt "2009-06-12-1741") {
- # These builds are before the C++ hack
- system("cd $dir/src && gmake > errs 2>&1") and die "Failed to build '$dir/src'\n";
-
- } elsif ($dir le "2009-08-05-2205") {
- # These builds need to rename C to C++ to build.
- # The rename script doesn't go away until (after) 2009-09-30-1832
- system("cd $dir/src && sh rename-to-c++.sh") and die "Failed to rename '$dir/src'\n";
- system("cd $dir/src && gmake > errs 2>&1") and die "Failed to build '$dir/src'\n";
-
- } else {
- system("cd $dir/src && gmake > errs 2>&1") and die "Failed to build '$dir/src'\n";
- }
- }
-}
diff --git a/src/pipelines/sanity/sanity-asm-done.pl b/src/pipelines/sanity/sanity-asm-done.pl
deleted file mode 100755
index 1ba11d7..0000000
--- a/src/pipelines/sanity/sanity-asm-done.pl
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-if (scalar(@ARGV) != 3) {
- die "wrong args.\n";
-}
-
-my $wrkdir = shift @ARGV;
-my $prefix = shift @ARGV;
-my $thisdate = shift @ARGV;
-
-# Special hack to add group rw
-
-system("chmod -R ug+rw $wrkdir/$thisdate/$prefix");
-system("chgrp -R atg $wrkdir/$thisdate/$prefix");
-
-my $lastdate = undef;
-my $gooddate = undef;
-
-my $resultlast = "none";
-my $resultgood = "none";
-
-my $diffs = "$wrkdir/$thisdate/$prefix/9-terminator/$prefix.qc";
-
-# Special case; don't set last/ref to the current date. Useful in
-# testing (and restarting in general).
-
-if (-e "$wrkdir/POINTERS/$prefix.last") {
- open(F, "< $wrkdir/POINTERS/$prefix.last");
- while (<F>) {
- chomp;
- $lastdate = $_ if ($_ lt $thisdate);
- }
- close(F);
-}
-if (-e "$wrkdir/POINTERS/$prefix.reference") {
- open(F, "< $wrkdir/POINTERS/$prefix.reference");
- while (<F>) {
- chomp;
- $gooddate = $_ if ($_ lt $thisdate);
- }
- close(F);
-}
-
-open(RESULT, "> $wrkdir/$thisdate/$prefix/sanity-result.out") or die;
-open(ERROR, "> $wrkdir/$thisdate/$prefix/sanity-error.out") or die;
-
-if (! -e "$wrkdir/$thisdate/$prefix/9-terminator/$prefix.asm") {
- print RESULT "Assembly result for $thisdate/$prefix: FAILURE (no asm file)\n";
-
- open(F, "ls -ltr $wrkdir/$thisdate/$prefix |");
- while (<F>) {
- print ERROR $_;
- }
- close(F);
-
- print ERROR "\n";
- print ERROR "\n";
- print ERROR "\n";
-
- my $logfile;
-
- open(F, "ls $wrkdir/$thisdate/$prefix/runCA.sge.out.[0-9][0-9] |");
- while (<F>) {
- chomp;
- $logfile = $_;
- }
- close(F);
-
- open(F, "< $logfile");
- while (<F>) {
- print ERROR $_;
- }
- close(F);
-
- close(RESULT);
- close(ERROR);
- exit(0);
-} else {
- if (defined($lastdate) && (-e "$wrkdir/$lastdate/$prefix/9-terminator/$prefix.asm")) {
- $diffs = "$wrkdir/$lastdate/$prefix/9-terminator/$prefix.qc $diffs";
- if (system("diff -q $wrkdir/$thisdate/$prefix/9-terminator/$prefix.asm $wrkdir/$lastdate/$prefix/9-terminator/$prefix.asm > /dev/null 2>&1") == 0) {
- $resultlast = "$lastdate same";
- } else {
- $resultlast = "$lastdate differs";
- }
- }
-
- if (defined($gooddate) && (-e "$wrkdir/$gooddate/$prefix/9-terminator/$prefix.asm")) {
- $diffs = "$wrkdir/$gooddate/$prefix/9-terminator/$prefix.qc $diffs";
- if (system("diff -q $wrkdir/$thisdate/$prefix/9-terminator/$prefix.asm $wrkdir/$gooddate/$prefix/9-terminator/$prefix.asm > /dev/null 2>&1") == 0) {
- $resultgood = "$gooddate same";
- } else {
- $resultgood = "$gooddate differs";
- }
- }
-
- print RESULT "Assembly result for $thisdate/$prefix: SUCCESS (last: $resultlast) (reference: $resultgood)\n";
-
- system("perl $wrkdir/sanity-merge-qc.pl $diffs > $wrkdir/$thisdate/$prefix/sanity-qc.out");
-
- open(F, ">> $wrkdir/POINTERS/$prefix.last");
- print F "$thisdate\n";
- close(F);
-}
-
-
-close(RESULT);
-close(ERROR);
diff --git a/src/pipelines/sanity/sanity-get-next-date.pl b/src/pipelines/sanity/sanity-get-next-date.pl
deleted file mode 100644
index c5a144b..0000000
--- a/src/pipelines/sanity/sanity-get-next-date.pl
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use Time::Local;
-
-my @v;
-
-my $date = shift @ARGV;
-my $offt = shift @ARGV;
-my $type = shift @ARGV;
-my $time = time();
-
-if ($date =~ m/(\d\d\d\d)-(\d\d)-(\d\d)-(\d\d)(\d\d)/) {
- $time = timelocal(0, $5, $4, $3, $2 - 1, $1);
-}
-
-$time += $offt;
-
-my @v = localtime($time);
-
-$v[5] += 1900;
-$v[4]++;
-
-$v[5] = substr("0000$v[5]", -4);
-$v[4] = substr("0000$v[4]", -2);
-$v[3] = substr("0000$v[3]", -2);
-$v[2] = substr("0000$v[2]", -2);
-$v[1] = substr("0000$v[1]", -2);
-
-#$v[2] = "00";
-#$v[1] = "01";
-
-if ($type eq "next") {
- $thisdate = "$v[5]-$v[4]-$v[3]-$v[2]$v[1]";
-} elsif ($type eq "hold") {
- $thisdate = "$v[5]$v[4]$v[3]$v[2]$v[1].00";
-} else {
- $thisdate = "$v[4]$v[3]-$v[2]$v[1]";
-}
-
-print "$thisdate\n";
-
-
diff --git a/src/pipelines/sanity/sanity-purge-old.pl b/src/pipelines/sanity/sanity-purge-old.pl
deleted file mode 100644
index da93376..0000000
--- a/src/pipelines/sanity/sanity-purge-old.pl
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-# Purge all but the last reference assembly, and all but the last two
-# assemblies, and all but the last two that failed to complete.
-#
-# Up to five assemblies are saved:
-# The reference
-# Two crashes
-# Two successful finished assemblies
-#
-# No crashes are save that are earlier than the latest successful assembly.
-#
-# OLDEST
-# crash
-# reference (saved)
-# crash
-# finished
-# finished
-# finished (saved)
-# crash
-# crash
-# finished (saved)
-# crash (saved)
-# NEWEST
-
-my @assemblies;
-my @nightlies;
-
-my $doPurge = ($ARGV[0] eq "purge");
-
-# This is automagically updated by sanity-asm-done.pl, whenever an assembly finishes.
-open(F, "ls POINTERS/*last |");
-while (<F>) {
- if ($_ =~ m/^POINTERS\/(.*).last$/) {
- push @assemblies, $1;
- }
-}
-
-open(F, "ls -d ????-??-??-???? |");
-while (<F>) {
- chomp;
- push @nightlies, $_;
-}
-close(F);
-
-
-foreach my $asm (@assemblies) {
- my $reference;
- my $last1;
- my $last2;
- my $crash1;
- my $crash2;
-
- # Read reference pointers, find the latest.
- {
- my %ref;
- my @ref;
-
- $ref{"0000-00-00-0000"}++;
-
- if (! -e "POINTERS/$asm.reference") {
- open(F, "> POINTERS/$asm.reference");
- close(F);
- }
-
- open(F, "< POINTERS/$asm.reference") or die;
- while (<F>) {
- chomp;
- if (-d "$_/$asm") {
- $ref{$_}++;
- }
- }
- close(F);
-
- @ref = sort keys %ref;
- $reference = pop @ref;
- }
-
- # Read success pointers, find the latest two.
- {
- my %ref;
- my @ref;
-
- $ref{"0000-00-00-0000"}++;
- $ref{"0000-00-00-0001"}++;
-
- if (! -e "POINTERS/$asm.last") {
- open(F, "> POINTERS/$asm.last");
- close(F);
- }
-
- open(F, "< POINTERS/$asm.last") or die;
- while (<F>) {
- chomp;
- if (-e "$_/$asm/$asm.qc") {
- $ref{$_}++;
- }
- }
- close(F);
-
- @ref = sort keys %ref;
- $last1 = pop @ref;
- $last2 = pop @ref;
- }
-
- # Find the last two failures.
- {
- my %ref;
- my @ref;
-
- $ref{"0000-00-00-0000"}++;
- $ref{"0000-00-00-0001"}++;
-
- foreach my $n (@nightlies) {
- if (! -e "$n/$asm/$asm.qc") {
- $ref{$n}++;
- }
- }
-
- @ref = sort keys %ref;
- $crash1 = pop @ref;
- $crash2 = pop @ref;
-
- $crash1 = "0000-00-00-0000" if ($crash1 lt $last1);
- $crash2 = "0000-00-00-0000" if ($crash2 lt $last1);
- }
-
- print STDERR "REF\t$reference\tLAST\t$last1\t$last2\tCRASH\t$crash1\t$crash2\tASM\t$asm\n";
-
- # Save the last TWO finished assemblies, or just one? Comment to save two.
- $last2 = "0000-00-00-0000";
-
- foreach my $n (@nightlies) {
- if (-d "$n/$asm") {
- my $finished = (-e "$n/$asm/$asm.qc") ? "FINISHED" : "CRASHED";
-
- $finished = ($n eq $reference) ? "REFERENCE" : $finished;
-
- if (($n ne $reference) &&
- ($n ne $last2) &&
- ($n ne $last1) &&
- ($n ne $crash2) &&
- ($n ne $crash1)) {
- print STDERR "REMOVE\t$n/$asm\n";
-
- if ($doPurge) {
- if (! -d "DEL") { system("mkdir DEL"); }
- if (! -d "DEL/$n") { system("mkdir DEL/$n"); }
-
- # Save some juicy bits
- rename "$n/$asm/9-terminator/$asm.qc", "$n/$asm.qc";
- rename "$n/$asm/4-unitigger/$asm.cga.0", "$n/$asm.cga.0";
-
- rename "$n/$asm", "DEL/$n/$asm";
- }
- } else {
- print STDERR "SAVE\t$n/$asm\n";
- }
- }
- }
-}
-
-
-foreach my $dir (@nightlies) {
- my $asmExist = 0;
-
- next if (! -d "$dir/wgs");
-
- foreach my $asm (@assemblies) {
- $asmExist++ if (-d "$dir/$asm");
- }
-
- if ($asmExist == 0) {
- print STDERR "$dir has $asmExist saved assemblies; purge source code.\n";
- if ($doPurge) {
- if (! -d "DEL") { system("mkdir DEL"); }
- if (! -d "DEL/$dir") { system("mkdir DEL/$dir"); }
-
- rename "$dir/wgs", "DEL/$dir/wgs";
- }
- } else {
- print STDERR "$dir has $asmExist saved assemblies.\n";
- }
-}
diff --git a/src/pipelines/sanity/sanity-update-reference.pl b/src/pipelines/sanity/sanity-update-reference.pl
deleted file mode 100644
index 1f29fb2..0000000
--- a/src/pipelines/sanity/sanity-update-reference.pl
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env perl
-
-###############################################################################
- #
- # This file is part of canu, a software program that assembles whole-genome
- # sequencing reads into contigs.
- #
- # This software is based on:
- # 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- # the 'kmer package' (http://kmer.sourceforge.net)
- # both originally distributed by Applera Corporation under the GNU General
- # Public License, version 2.
- #
- # Canu branched from Celera Assembler at its revision 4587.
- # Canu branched from the kmer project at its revision 1994.
- #
- # Modifications by:
- #
- # Brian P. Walenz beginning on 2015-OCT-12
- # are a 'United States Government Work', and
- # are released in the public domain
- #
- # File 'README.licenses' in the root directory of this distribution contains
- # full conditions and disclaimers for each license.
- ##
-
-use strict;
-
-# Updates the reference POINTER to the latest successful assembly.
-
-my @assemblies;
-
-open(F, "ls POINTERS/*last |");
-while (<F>) {
- if ($_ =~ m/^POINTERS\/(.*).last$/) {
- push @assemblies, $1;
- }
-}
-
-
-foreach my $asm (@assemblies) {
- my $reference = "0000-00-00-0000";
- my $last = "0000-00-00-0000";
-
- if (-e "POINTERS/$asm.reference") {
- my %ref;
- my @ref;
-
- $ref{"0000-00-00-0000"}++;
-
- open(F, "< POINTERS/$asm.reference") or die;
- while (<F>) {
- chomp;
- if (-d "$_/$asm") {
- $ref{$_}++;
- }
- }
- close(F);
-
- @ref = sort keys %ref;
- $reference = pop @ref;
- }
-
-
- if (-e "POINTERS/$asm.last") {
- my %ref;
- my @ref;
-
- $ref{"0000-00-00-0000"}++;
-
- open(F, "< POINTERS/$asm.last") or die;
- while (<F>) {
- chomp;
- if (-e "$_/$asm/$asm.qc") {
- $ref{$_}++;
- }
- }
- close(F);
-
- @ref = sort keys %ref;
- $last = pop @ref;
- }
-
- if ($reference lt $last) {
- print STDERR "REF\t$reference\tLAST\t$last\tSTALE\t$asm\n";
-
- open(F, ">> POINTERS/$asm.reference") or die;
- print F "$last\n";
- close(F);
- } else {
- print STDERR "REF\t$reference\tLAST\t$last\t\t$asm\n";
- }
-}
diff --git a/src/pipelines/sanity/sanity.pl b/src/pipelines/sanity/sanity.pl
index 9a37900..52904b8 100755
--- a/src/pipelines/sanity/sanity.pl
+++ b/src/pipelines/sanity/sanity.pl
@@ -26,39 +26,10 @@
use strict;
use Config; # for @signame
+use Time::Local;
-# The only four globals configurables:
-#
-my $site = undef;
-my $wrkdir = undef;
-my $wgssvn = undef;
-my $kmersvn = undef;
-
-if (-d "/usr/local/projects/BIOINFO/ASSEMBLY/NIGHTLY") {
- $site = "JCVI";
- $wrkdir = "/usr/local/projects/BIOINFO/ASSEMBLY/NIGHTLY";
- $wgssvn = "/usr/local/projects/BIOINFO/ASSEMBLY/NIGHTLY/SVN-wgs";
- $kmersvn = "/usr/local/projects/BIOINFO/ASSEMBLY/NIGHTLY/SVN-kmer";
-} elsif (-d "/work/NIGHTLY/") {
- $site = "BPWI";
- $wrkdir = "/work/NIGHTLY";
- $wgssvn = "/work/NIGHTLY/SVN-wgs";
- $kmersvn = "/work/NIGHTLY/SVN-kmer";
-} else {
- die "Unknown site configuration.\n";
-}
-
-
-# Command line options are
-#
-# 'oper' -- thing to do.
-#
-# 'ddir' -- properly formatted directory name to do it in. op ==
-# checkout creates this directory. If still supplied, it will
-# force a checkout from that date.
-#
-# rsync
-# update the local repositories in 'wgs-assembler-cvs' and 'kmer-svn'.
+# fetch
+# update the local repositories from github.
#
# checkout date
# Date must be of the form yyyy-mm-dd-hhmm. This will checkout
@@ -69,48 +40,101 @@ if (-d "/usr/local/projects/BIOINFO/ASSEMBLY/NIGHTLY") {
# build date
# Builds the assembler checked out into 'date'.
#
-# assemble date label specFile .... email ...
-# Launches runCA on each specFile, running it in directory 'date'.
+# assemble date specFile .... email ...
+# Launches canu on each specFile, running it in directory 'date'.
# Each specFile must have a unique name; the assembly is named
-# after the specFile. The label is used only for reporting via email.
+# after the specFile.
# At the end, a diff is generated to POINTERS/$prefix.last and
# POINTERS/$prefix.reference. If the assembly finished, POINTERS/$prefix.last
# is updated.
#
+# submit date
+# Submit ourself for execution at date.
+#
-{
- my $oper = shift @ARGV;
- my $ddir = shift @ARGV;
+my $oper = shift @ARGV;
- my ($thisdate, $lastdate) = parseDate($ddir);
+my $site = undef;
+my $wrkdir = undef;
+my $gitrepo = undef;
- if ($oper eq "rsync") {
- system("mkdir -p $wgssvn") if (! -d "$wgssvn");
- system("mkdir -p $kmersvn") if (! -d "$kmersvn");
+my $tz = `date +%z`; chomp $tz;
- #system("cd $wgscvs && rsync -av rsync://wgs-assembler.cvs.sourceforge.net/cvsroot/wgs-assembler/\* . > rsync.out 2>&1");
- system("cd $wgssvn && rsync -av svn.code.sf.net::p/wgs-assembler/svn/ . > rsync.out 2>&1");
- system("cd $kmersvn && rsync -av svn.code.sf.net::p/kmer/code/ . > rsync.out 2>&1");
- } elsif ($oper eq "checkout") {
- checkoutAndLog($thisdate, $lastdate, "file://$kmersvn/trunk", "kmer", "kmer");
- checkoutAndLog($thisdate, $lastdate, "file://$wgssvn/trunk/src", "src", "src");
+# Set up for the machine we're on.
- } elsif ($oper eq "build") {
- buildKmer($thisdate);
- buildCA($thisdate);
- } elsif ($oper eq "assemble") {
- assemble($thisdate, @ARGV);
+if (-d "/gryphon") {
+ $site = "gryphon";
+ $wrkdir = "/data/projects/phillippy/scratch/NIGHTLY";
+ $gitrepo = "/data/projects/phillippy/scratch/NIGHTLY/canu";
+}
+
+elsif (-d "/data/walenzbp/NIGHTLY/") {
+ $site = "BPWI";
+ $wrkdir = "/data/walenzbp/NIGHTLY";
+ $gitrepo = "/data/walenzbp/NIGHTLY/canu";
+}
+
+elsif (-d "/assembly/NIGHTLY/") {
+ $site = "BPWI";
+ $wrkdir = "/assembly/NIGHTLY";
+ $gitrepo = "/assembly/NIGHTLY/canu";
+}
+
+elsif (-d "/work/NIGHTLY/") {
+ $site = "BPWI";
+ $wrkdir = "/work/NIGHTLY";
+ $gitrepo = "/work/NIGHTLY/canu";
+}
+
+else {
+ die "Unknown site configuration.\n";
+}
+
+
+# Now do something.
+
+
+if ($oper eq "fetch") {
+ fetch();
+ exit(0);
+}
- } else {
- die "$0: unknown action '$oper'\n";
- }
+if ($oper eq "checkout") {
+ checkout(@ARGV);
exit(0);
}
+if ($oper eq "build") {
+ build(@ARGV);
+ exit(0);
+}
+
+
+if ($oper eq "assemble") {
+ # if canu.merge.out is "Already up-to-date." we can skip running stuff.
+ assemble(@ARGV);
+ exit(0);
+}
+
+
+if ($oper eq "submit") {
+ submit(@ARGV);
+ exit(0);
+}
+
+
+die "$0: unknown action '$oper'\n";
+exit(0);
+
+
+
+
+
+
sub parseDate ($) {
my $pathdate = shift @_;
my $thisdate;
@@ -150,303 +174,209 @@ sub parseDate ($) {
}
close(F);
- print STDERR "Working on '$thisdate' -- last found is '$lastdate'.\n";
+ #print STDERR "Working on '$thisdate' -- last found is '$lastdate'.\n";
return($thisdate, $lastdate);
}
-sub checkoutAndLog ($$) {
- my $thisdate = shift @_;
- my $lastdate = shift @_;
- my $repo = shift @_; # Path to local repository
- my $target = shift @_; # Thing to checkout
- my $path = shift @_; # Where to put it (in wgs/)
- print STDERR "Checking out $wrkdir/$thisdate (repo=$repo path=$path)\n";
+sub gitDate ($) {
+ my $date = shift @_;
- if (-d "$wrkdir/$thisdate/wgs/$path") {
- print STDERR "$wrkdir/$thisdate/wgs/$path already exists. Please remove to rerun.\n";
- return;
+ if ((!defined($date)) || ($date eq "")) {
+ return($date);
}
- system("mkdir -p $wrkdir/$thisdate/wgs") if (! -d "$wrkdir/$thisdate/wgs");
+ if ($date =~ m/(\d\d\d\d)-(\d\d)-(\d\d)-(\d\d)(\d\d)/) {
+ $date = "$1-$2-$3T$4:$5$tz";
+ } else {
+ die "Malformed date '$date' to gitDate().\n";
+ }
- # Convert time-names to dates that svn can use.
+ return($date);
+}
- my $thisdatesvn;
- my $lastdatesvn;
- # NOT tested with SVN. The old format was "-04:00"; new format is "-0400".
- my $tz = `date +%z`; chomp $tz;
- if ($thisdate =~ m/(\d\d\d\d)-(\d\d)-(\d\d)-(\d\d)(\d\d)/) {
- $thisdatesvn = "$1-$2-$3T$4:$5$tz";
- } else {
- }
+sub fetch () {
- if (defined($lastdate)) {
- if ($lastdate =~ m/(\d\d\d\d)-(\d\d)-(\d\d)-(\d\d)(\d\d)/) {
- $lastdatesvn = "$1-$2-$3T$4:$5$tz";
- } else {
- }
+ print STDERR "\n";
+ print STDERR "SANITY FETCH\n";
+ print STDERR "\n";
+
+ if (! -d "$gitrepo") {
+ print STDERR " Cloning new repo '$gitrepo'.\n";
+ system("mkdir -p $gitrepo") if (! -d "$gitrepo");
+ system("cd $gitrepo/.. ; git clone git\@github.com:marbl/canu.git > canu.clone.out 2>&1");
}
+ else {
+ print STDERR " Updating existing repo '$gitrepo'.\n";
+ system("cd $gitrepo ; git fetch > ../canu.fetch.out 2>&1");
+ system("cd $gitrepo ; git merge > ../canu.merge.out 2>&1");
+ }
+}
- system("cd $wrkdir/$thisdate/wgs && svn co -r \"{$thisdatesvn}\" $repo $target > $path.checkout.err 2>&1");
-
- # This is annoying. SVN log will report changes inclusive to revisions. -r 5:9 will report
- # changes made in revisions 5 through 9. When you give it a date, it finds the revision that
- # was active on that date. What we want here, though, is the changes SINCE that date (or,
- # since revision 5, up to revision 9).
- #
- # To get around this, we first get the logs, but scan for the lowest and highest revision
- # numbers, then dump correctly.
- #
- if ($lastdate ne "") {
- my $loRev;
- my $hiRev;
-
- print "svn log -v $repo -r \"{$lastdatesvn}:{$thisdatesvn}\"\n";
- open(F, "cd $wrkdir/$thisdate/wgs && svn log -v $repo -r \"{$lastdatesvn}:{$thisdatesvn}\" |");
- while (<F>) {
- if (m/^r(\d+)\s+\|\s+/) {
- $loRev = $1 if ((!defined($loRev)) || ($1 < $loRev));
- $hiRev = $1 if ((!defined($hiRev)) || ($hiRev < $1));
- }
- }
- close(F);
- $loRev++ if (defined($loRev));
- print STDERR "loRev='$loRev' hiRev='$hiRev'\n";
+sub checkout (@) {
+ my ($thisdate, $lastdate) = parseDate($_[0]);
- if (defined($loRev) && defined($hiRev) && ($loRev < $hiRev)) {
- print "svn log -v $repo -r $loRev:$hiRev\n";
- system("cd $wrkdir/$thisdate/wgs && svn log -v $repo -r $loRev:$hiRev > $path.updates");
- }
+ my $ldate = gitDate($lastdate);
+ my $tdate = gitDate($thisdate);
+
+ print STDERR "\n";
+ print STDERR "SANITY CHECKOUT $wrkdir/$thisdate\n";
+ print STDERR "\n";
+
+ if (-d "$wrkdir/$thisdate/canu") {
+ print STDERR " $wrkdir/$thisdate/canu already exists. Please remove to rerun.\n";
+ return;
}
- print STDERR "$thisdate checked out!\n";
-}
+ system("mkdir -p $wrkdir/$thisdate/canu") if (! -d "$wrkdir/$thisdate/canu");
+ # Clone the clone.
+ system("cd $wrkdir/$thisdate/canu && rsync -a $gitrepo/ .");
-sub buildKmer ($) {
- my $thisdate = shift @_;
+ # Find a git hash to grab the bits we want.
- print STDERR "Building KMER $wrkdir/$thisdate\n";
+ my $hash = `cd $wrkdir/$thisdate/canu && git rev-list -n 1 --first-parent --before=\"$tdate\" master`; chomp $hash;
- if (-e "$wrkdir/$thisdate/wgs/kmer/make.err") {
- print STDERR "$wrkdir/$thisdate/wgs/kmer was already built once (successuflly or not). Please cleanup first.\n";
- } else {
- system("cd $wrkdir/$thisdate/wgs/kmer && sh configure.sh > configure.out 2> configure.err");
- system("cd $wrkdir/$thisdate/wgs/kmer && gmake install > make.out.raw 2> make.err.raw");
-
- my %lines;
-
- open(F, "< $wrkdir/$thisdate/wgs/kmer/make.err.raw");
- open(G, "> $wrkdir/$thisdate/wgs/kmer/make.err");
- while (<F>) {
- chomp;
- next if (m/^ar:\s+creating/);
- if (!exists($lines{$_})) {
- $lines{$_}++;
- print G "$_\n";
- }
- }
- close(F);
- close(G);
- }
-}
+ # Checkout that hash.
+ system("cd $wrkdir/$thisdate/canu && git checkout -b SANITY-$thisdate $hash");
-sub buildCA ($) {
- my $thisdate = shift @_;
+ # git clone $gitrepo $wrkdir/$thisdate/canu
- print STDERR "Building CA $wrkdir/$thisdate\n";
+ if ($ldate ne "") {
+ print STDERR "log\n"; # Progress note goes to STDERR, like every other message in this script.
+ system("cd $wrkdir/$thisdate/canu && git log --after=\"$ldate\" --until=\"$tdate\" > $wrkdir/$thisdate/canu.updates");
+ }
- if (-e "$wrkdir/$thisdate/wgs/src/make.err") {
- print STDERR "$wrkdir/$thisdate/wgs/src was already built once (successuflly or not). Please cleanup first.\n";
- } else {
- # Temporary hack to handle the C to C++ renaming.
- # This was broken on "2009/06/10 17:34:22".
- # This was fixed on "2009/08/06 11:36:54"
- #
- if ((-e "$wrkdir/$thisdate/wgs/src/rename-to-c++.sh") &&
- (-s "$wrkdir/$thisdate/wgs/src/c_make.gen" == 14877)) {
- system("cd $wrkdir/$thisdate/wgs/src && sh rename-to-c++.sh");
- }
+ print STDERR " $thisdate checked out!\n";
+}
- system("cd $wrkdir/$thisdate/wgs/src && gmake > make.out.raw 2> make.err.raw");
- my %lines;
- open(F, "< $wrkdir/$thisdate/wgs/src/make.err.raw");
- open(G, "> $wrkdir/$thisdate/wgs/src/make.err");
- while (<F>) {
- chomp;
- next if (m/^ar:\s+creating/);
- if (!exists($lines{$_})) {
- $lines{$_}++;
- print G "$_\n";
- }
- }
- close(F);
- close(G);
+sub build (@) {
+ my ($thisdate, $lastdate) = parseDate($_[0]);
+
+ print STDERR "\n";
+ print STDERR "SANITY BUILD $wrkdir/$thisdate\n";
+ print STDERR "\n";
+
+ if (-e "$wrkdir/$thisdate/canu/src/make.err") {
+ print STDERR " $wrkdir/$thisdate/canu was already built once (successuflly or not). Please cleanup first.\n";
+ return;
}
+
+ system("cd $wrkdir/$thisdate/canu/src && gmake -j 12 > make.out 2> make.err");
}
-sub assemble ($$@) {
- my $thisdate = shift @_;
- my $label = shift @_;
- my $holds_asms;
- my $names_asms;
- my $holds_done;
+sub assemble (@) {
+ my ($thisdate, $lastdate) = parseDate(shift @_);
- my $arch;
+ print STDERR "\n";
+ print STDERR "ASSEMBLE\n";
+ print STDERR "\n";
- # Ripped from runCA/util.pl
my $syst = `uname -s`; chomp $syst; # OS implementation
my $arch = `uname -m`; chomp $arch; # Hardware platform
my $name = `uname -n`; chomp $name; # Name of the system
$arch = "amd64" if ($arch eq "x86_64");
$arch = "ppc" if ($arch eq "Power Macintosh");
- # end of rip.
-
- # Figure out which are specfiles and which are email addresses.
- #
my @spec;
- my @addresses;
my $addresses;
foreach my $arg (@_) {
if ($arg =~ m/\@/) {
- push @addresses, $arg;
- } else {
+ $addresses .= ",$arg" if ( defined($addresses)); # Append first, so the first address is not added twice.
+ $addresses = $arg if (!defined($addresses));
+ }
+ else {
$arg = "$ENV{PWD}/$arg" if ($arg !~ m/^\//);
push @spec, $arg;
}
}
- $addresses = join ',', @addresses;
+ #open(F, "> $wrkdir/$thisdate/asm-done.sh");
+ #print F "#!/bin/sh\n";
+ #print F "\n";
+ #print F "perl $wrkdir/sanity-asm-done.pl $wrkdir \$1 $thisdate\n";
+ #close(F);
+
+ #open(F, "> $wrkdir/$thisdate/all-done.sh");
+ #print F "#!/bin/sh\n";
+ #print F "\n";
+ #print F "perl $wrkdir/sanity-all-done.pl $wrkdir $thisdate \$1 $addresses \$2 \\\n";
+ #print F "| \\\n";
+ #print F "tee \"$wrkdir/$thisdate/sanity-all-done.\$1\" \\\n";
+ #print F "| \\\n";
+ #print F "/usr/sbin/sendmail -i -t -f thebri\@gmail.com\n" if ($site eq "JCVI");
+ #print F "/usr/local/sbin/ssmtp thebri\@gmail.com\n" if ($site eq "BPWI");
+ #close(F);
- open(F, "> $wrkdir/$thisdate/asm-done.sh");
- print F "#!/bin/sh\n";
- print F "\n";
- print F "perl $wrkdir/sanity-asm-done.pl $wrkdir \$1 $thisdate\n";
- close(F);
+ foreach my $s (@spec) {
+ my @c = split '/', $s;
+ my $n = $c[scalar(@c) - 1];
- open(F, "> $wrkdir/$thisdate/all-done.sh");
- print F "#!/bin/sh\n";
- print F "\n";
- print F "perl $wrkdir/sanity-all-done.pl $wrkdir $thisdate \$1 $addresses \$2 \\\n";
- print F "| \\\n";
- print F "tee \"$wrkdir/$thisdate/sanity-all-done.\$1\" \\\n";
- print F "| \\\n";
- print F "/usr/sbin/sendmail -i -t -f celera_assembler_test\@jcvi.org\n" if ($site eq "JCVI");
- print F "/usr/local/sbin/ssmtp thebri\@gmail.com\n" if ($site eq "BPWI");
- close(F);
+ $n =~ s/\.spec$//; # Escape the dot so only a literal '.spec' suffix is removed.
+ $n =~ s/\.specFile$//;
- foreach my $s (@spec) {
- my $n;
+ print STDERR "Submitting assembly '$n' for spec '$s'.\n";
- {
- my @c = split '/', $s;
- $n = $c[scalar(@c) - 1];
- $n =~ s/.spec$//;
- $n =~ s/.specFile$//;
- }
+ print STDERR "cd $wrkdir/$thisdate \\\n";
+ print STDERR "&& \\\n";
+ print STDERR "$wrkdir/$thisdate/canu/$syst-$arch/bin/canu -p $n -d $n -s $s";
+ system("cd $wrkdir/$thisdate && $wrkdir/$thisdate/canu/$syst-$arch/bin/canu -p $n -d $n -s $s");
+ }
+}
- print STDERR "----------------------------------------\n";
- print STDERR "Submitting assembly '$n'.\n";
-
- system("mkdir $wrkdir/$thisdate/$n") if (! -d "$wrkdir/$thisdate/$n");
-
- my $jl = "CAini_${n}_$$"; # Name of the launcher
- my $jn = "CAtst_${n}_$$"; # Name of the asm-done
-
- open(F, "> $wrkdir/$thisdate/$n/launch-assembly.sh");
- print F "#!/bin/sh\n";
- print F "\n";
- print F "# runCA checks if this is set to decide if it is on the grid or not. We want\n";
- print F "# to pretend we are NOT on the grid, so runCA will resubmit itself immediately.\n";
- print F "#\n";
- print F "unset SGE_TASK_ID\n";
- print F "\n";
- print F "\n";
- print F "# Attempt to (re)configure SGE. For reasons Bri doesn't know,\n";
- print F "# jobs submitted to SGE, and running under SGE, fail to read his\n";
- print F "# .tcshrc (or .bashrc, limited testing), and so they don't setup\n";
- print F "# SGE (or ANY other paths, etc) properly. For the record,\n";
- print F "# interactive SGE logins (qlogin, etc) DO set the environment.\n";
- print F "\n";
- print F ". \$SGE_ROOT/\$SGE_CELL/common/settings.sh\n";
- print F "\n";
- print F "\n";
- print F "# Submit runCA to the grid. Do not set any runCA parameters here; they\n";
- print F "# will override ALL specFile parameters. scriptOnGrid must be set, otherwise\n";
- print F "# the assembly will be performed with this call, instead of just launched\n";
- print F "# to the grid.\n";
- print F "\n";
- print F "perl $wrkdir/$thisdate/wgs/$syst-$arch/bin/runCA \\\n";
- print F " sgePropagateHold=$jn scriptOnGrid=1 \\\n";
- print F " -p $n -d $wrkdir/$thisdate/$n -s $s\n";
- print F "\n";
- print F "# Once that runCA finishes, we've updated the hold on $jn, and so can release\n";
- print F "# our user hold on it.\n";
- print F "#\n";
- print F "qrls -h u $jn\n";
- print F "\n";
- print F "\n";
- close(F);
-
- # A separate script is used to launch runCA, so that we can get a user hold on it. This is messy.
- # 1) Submit the launcher, holding it.
- # 2) Submit the asm-done, holding it too. Make it also hold_jid on the launcher.
- # 3) Release the launcher.
- # 4) The launcher submits runCA, with sgePropagateHold. When that runCA finishes, the assembly
- # is now running, AND it has updated the hold_jid on asm-done.
- # 5) We can now release the asm-done.
- #
- # Steps 1 and 2 are done here, step 3 is at the very end, steps 4 and 5 are done in the launch-assembly.sh above.
-
- if ($site eq "JCVI") {
- system("cd $wrkdir/$thisdate/$n && qsub -P 334007 -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/$n/launch-assembly.err -l fast -h -N $jl $wrkdir/$thisdate/$n/launch-assembly.sh");
- system("cd $wrkdir/$thisdate/$n && qsub -P 334007 -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/$n/asm-done.err -l fast -h -hold_jid $jl -N $jn $wrkdir/$thisdate/asm-done.sh $n");
- } elsif ($site eq "BPWI") {
- system("cd $wrkdir/$thisdate/$n && qsub -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/$n/launch-assembly.err -h -N $jl $wrkdir/$thisdate/$n/launch-assembly.sh");
- system("cd $wrkdir/$thisdate/$n && qsub -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/$n/asm-done.err -h -hold_jid $jl -N $jn $wrkdir/$thisdate/asm-done.sh $n");
- }
- if (defined($holds_done)) {
- $holds_done .= ",$jn";
- } else {
- $holds_done = "$jn";
- }
- if (defined($holds_asms)) {
- $holds_asms .= ",$jl";
- $names_asms .= ",$n";
- } else {
- $holds_asms = "$jl";
- $names_asms .= "$n";
- }
- }
+sub submit (@) {
+ my ($thisdate, $lastdate) = parseDate(shift @_); # Read the date from our own arguments, as assemble() does.
+ my $seconds;
- if ($site eq "JCVI") {
- system("cd $wrkdir/$thisdate && qsub -P 334007 -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/all-done.err -l fast -hold_jid $holds_done -N CAfin_$thisdate $wrkdir/$thisdate/all-done.sh $label $names_asms");
- } elsif ($site eq "BPWI") {
- system("cd $wrkdir/$thisdate && qsub -A CAsanity -b n -cwd -j y -o $wrkdir/$thisdate/all-done.err -hold_jid $holds_done -N CAfin_$thisdate $wrkdir/$thisdate/all-done.sh $label $names_asms");
+ if ($thisdate =~ m/(\d\d\d\d)-(\d\d)-(\d\d)-(\d\d)(\d\d)/) {
+ $seconds = timelocal(0, $5, $4, $3, $2 - 1, $1);
}
- # Now, release everything to run.
+ $seconds += 604800; # one week
+ $seconds += 86400; # one day
+ $seconds += 14400; # four hours
+ $seconds += 7200; # two hours
+ $seconds += 21600; # six hours
+ $seconds += 43200; # twelve hours
+
+ my @v = localtime($seconds);
- system("qrls -h u $holds_asms");
+ $v[5] += 1900;
+ $v[4]++;
+
+ $v[5] = substr("0000$v[5]", -4);
+ $v[4] = substr("0000$v[4]", -2);
+ $v[3] = substr("0000$v[3]", -2);
+ $v[2] = substr("0000$v[2]", -2);
+ $v[1] = substr("0000$v[1]", -2);
+
+ #$v[2] = "00";
+ #$v[1] = "01";
+
+ my $nextdate = "$v[5]-$v[4]-$v[3]-$v[2]$v[1]";
+ my $nexthold = "$v[5]$v[4]$v[3]$v[2]$v[1].00";
+ my $nextname = "$v[4]$v[3]-$v[2]$v[1]";
+
+ print STDERR "Submit next at date='$nextdate' hold='$nexthold' name='$nextname'\n";
+
+ #system("qsub -cwd -j y -o $nextdate.err -A assembly-nightly -N snty$nextname -a $nexthold -b n sanity.sh $nextdate grid");
}
-exit(0);
+
diff --git a/src/pipelines/sanity/sanity.sh b/src/pipelines/sanity/sanity.sh
index 0731a71..cf3d7bd 100644
--- a/src/pipelines/sanity/sanity.sh
+++ b/src/pipelines/sanity/sanity.sh
@@ -3,12 +3,6 @@
# (re)Load the sge config.
. $SGE_ROOT/$SGE_CELL/common/settings.sh
-# Needed for old checkouts. Newer checkouts default to LOCAL.
-#With SITE=JCVI, every EUID is different, so every QC report is different.
-#With default behavior, EUIDs always start at the same number, so QC reports can be identical.
-#We want default bahavior so the summary email reports no differences.
-#export SITE_NAME=JCVI
-
#
# Master controller of nightly sanity checks. Optional date on command line.
#
@@ -22,64 +16,35 @@
#
date=$1
-grid=$2
-
-if [ x$date != x ] ; then
- echo "SANITY BEGINS for $date at `date`"
-fi
-
-
-# Remove old versions
-perl sanity-purge-old.pl purge
-rm -rf DEL
-
-# Update the repository.
-perl sanity.pl rsync
+bins=/work/canu/src/pipelines/sanity
+spec=/work/canu/src/pipelines/sanity
-
-# Checkout the latest version.
-perl sanity.pl checkout $date
-
-
-# Figure out what was checked out.
if [ x$date = x ] ; then
- date=`ls -1d 20??-??-??-???? | tail -n 1`
+ date=`date +%Y-%m-%d-%H%M`
fi
+echo "SANITY BEGINS for $date at `date`"
-# Build it.
-perl sanity.pl build $date
-
-# Let the user pick one to run
-if [ x$grid = x ] ; then
- echo "$date checked out and compiled. Run some of:"
- echo " sh sanity-daily-test.sh $date"
- echo " sh sanity-daily-pging.sh $date"
-
-else
- nextofft=604800 # one week
- nextofft=86400 # one day
- nextofft=14400 # four hours
- nextofft=7200 # two hours
- nextofft=21600 # six hours
- nextofft=43200 # twelve hours
- nextdate=`perl sanity-get-next-date.pl $date $nextofft next`
- nexthold=`perl sanity-get-next-date.pl $date $nextofft hold`
- nextname=`perl sanity-get-next-date.pl $date $nextofft name`
+# Remove old versions
+#perl $bins/sanity-purge-old.pl purge
+#rm -rf DEL
- # Which tests should we run? For now, just the simple p.ging
+perl $bins/sanity.pl fetch # Update the local repo.
+perl $bins/sanity.pl checkout $date # Checkout from that repo.
+perl $bins/sanity.pl build $date # Compile.
+perl $bins/sanity.pl submit $date # Submit the next iteration.
- sh sanity-daily-pging.sh $date
+# Run small stuff daily.
- #if [ `date +%u` = 6] ; then
- # sh sanity-weekly-dros.sh $date
- # sh sanity-weekly-moore.sh $date
- #fi
+perl $bins/sanity.pl assemble $date $spec/small1.spec
+perl $bins/sanity.pl assemble $date $spec/small2.spec
+perl $bins/sanity.pl assemble $date $spec/small3.spec
+perl $bins/sanity.pl assemble $date $spec/small4.spec
- echo "SUBMIT for $nextdate ($nexthold)"
+# Run big stuff weekly.
- echo \
- qsub -cwd -j y -o $nextdate.err -P 334007 -A assembly-nightly -N CAsnty$nextname -a $nexthold -b n sanity.sh $nextdate grid
+#if [ `date +%u` = 6 ] ; then
+# sh sanity-weekly-dros.sh $date
+# sh sanity-weekly-moore.sh $date
+#fi
- qsub -cwd -j y -o $nextdate.err -P 334007 -A assembly-nightly -N CAsnty$nextname -a $nexthold -b n sanity.sh $nextdate grid
-fi
diff --git a/src/stores/gatekeeperCreate.C b/src/stores/gatekeeperCreate.C
index eb418a0..84aadff 100644
--- a/src/stores/gatekeeperCreate.C
+++ b/src/stores/gatekeeperCreate.C
@@ -156,7 +156,7 @@ loadFASTA(char *L,
// Report errors.
if (baseErrors > 0) {
- fprintf(errorLog, "read '%s' has "F_U32" invalid base%s. Converted to 'N'.\n",
+ fprintf(errorLog, "read '%s' has " F_U32 " invalid base%s. Converted to 'N'.\n",
H, baseErrors, (baseErrors > 1) ? "s" : "");
nWARNS++;
}
@@ -261,7 +261,7 @@ loadFASTQ(char *L,
}
if (baseErrors > 0) {
- fprintf(errorLog, "read '%s' has "F_U32" invalid base%s. Converted to 'N'.\n",
+ fprintf(errorLog, "read '%s' has " F_U32 " invalid base%s. Converted to 'N'.\n",
L, baseErrors, (baseErrors > 1) ? "s" : "");
nWARNS++;
}
@@ -307,7 +307,7 @@ loadFASTQ(char *L,
}
if (QVerrors > 0) {
- fprintf(errorLog, "read '%s' has "F_U32" invalid QV%s. Converted to min or max value.\n",
+ fprintf(errorLog, "read '%s' has " F_U32 " invalid QV%s. Converted to min or max value.\n",
L, QVerrors, (QVerrors > 1) ? "s" : "");
nWARNS++;
}
@@ -359,18 +359,7 @@ loadReads(gkStore *gkpStore,
fprintf(stderr, "\n");
fprintf(stderr, " Loading reads from '%s'\n", fileName);
-#if 0
- fprintf(htmlLog, "<tr id='gkpload%u'><td colspan='2'>%s</td></tr>\n", gkpFileID, fileName);
- fprintf(htmlLog, "<tr class='details'><td rowspan='9'>Parameters</td><td>preset=N/A</td></tr>\n");
- fprintf(htmlLog, "<tr class='details'><td>defaultQV=%u</td></tr>\n", gkpLibrary->gkLibrary_defaultQV());
- fprintf(htmlLog, "<tr class='details'><td>isNonRandom=%s</td></tr>\n", gkpLibrary->gkLibrary_isNonRandom() ? "true" : "false");
- fprintf(htmlLog, "<tr class='details'><td>removeDuplicateReads=%s</td></tr>\n", gkpLibrary->gkLibrary_removeDuplicateReads() ? "true" : "false");
- fprintf(htmlLog, "<tr class='details'><td>finalTrim=%s</td></tr>\n", gkpLibrary->gkLibrary_finalTrim() ? "true" : "false");
- fprintf(htmlLog, "<tr class='details'><td>removeSpurReads=%s</td></tr>\n", gkpLibrary->gkLibrary_removeSpurReads() ? "true" : "false");
- fprintf(htmlLog, "<tr class='details'><td>removeChimericReads=%s</td></tr>\n", gkpLibrary->gkLibrary_removeChimericReads() ? "true" : "false");
- fprintf(htmlLog, "<tr class='details'><td>checkForSubReads=%s</td></tr>\n", gkpLibrary->gkLibrary_checkForSubReads() ? "true" : "false");
-#else
- fprintf(htmlLog, "nam "F_U32" %s\n", gkpFileID, fileName);
+ fprintf(htmlLog, "nam " F_U32 " %s\n", gkpFileID, fileName);
fprintf(htmlLog, "lib preset=N/A");
fprintf(htmlLog, " defaultQV=%u", gkpLibrary->gkLibrary_defaultQV());
@@ -380,8 +369,6 @@ loadReads(gkStore *gkpStore,
fprintf(htmlLog, " removeSpurReads=%s", gkpLibrary->gkLibrary_removeSpurReads() ? "true" : "false");
fprintf(htmlLog, " removeChimericReads=%s", gkpLibrary->gkLibrary_removeChimericReads() ? "true" : "false");
fprintf(htmlLog, " checkForSubReads=%s\n", gkpLibrary->gkLibrary_checkForSubReads() ? "true" : "false");
-#endif
-
compressedFileReader *F = new compressedFileReader(fileName);
@@ -421,7 +408,7 @@ loadReads(gkStore *gkpStore,
}
else {
- fprintf(errorLog, "invalid read header '%.40s%s' in file '%s' at line "F_U64", skipping.\n",
+ fprintf(errorLog, "invalid read header '%.40s%s' in file '%s' at line " F_U64 ", skipping.\n",
L, (strlen(L) > 80) ? "..." : "", fileName, lineNumber);
L[0] = 0;
nWARNSlocal++;
@@ -430,7 +417,7 @@ loadReads(gkStore *gkpStore,
// If S[0] isn't nul, we loaded a sequence and need to store it.
if (Slen < minReadLength) {
- fprintf(errorLog, "read '%s' of length "F_U32" in file '%s' at line "F_U64" is too short, skipping.\n",
+ fprintf(errorLog, "read '%s' of length " F_U32 " in file '%s' at line " F_U64 " is too short, skipping.\n",
H, Slen, fileName, lineNumber);
if (isFASTA) {
@@ -488,50 +475,37 @@ loadReads(gkStore *gkpStore,
// Write status to the screen
- fprintf(stderr, " Processed "F_U64" lines.\n", lineNumber);
+ fprintf(stderr, " Processed " F_U64 " lines.\n", lineNumber);
- fprintf(stderr, " Loaded "F_U64" bp from:\n", bLOADEDAlocal + bLOADEDQlocal);
+ fprintf(stderr, " Loaded " F_U64 " bp from:\n", bLOADEDAlocal + bLOADEDQlocal);
if (nFASTAlocal > 0)
- fprintf(stderr, " "F_U32" FASTA format reads ("F_U64" bp).\n", nFASTAlocal, bLOADEDAlocal);
+ fprintf(stderr, " " F_U32 " FASTA format reads (" F_U64 " bp).\n", nFASTAlocal, bLOADEDAlocal);
if (nFASTQlocal > 0)
- fprintf(stderr, " "F_U32" FASTQ format reads ("F_U64" bp).\n", nFASTQlocal, bLOADEDQlocal);
+ fprintf(stderr, " " F_U32 " FASTQ format reads (" F_U64 " bp).\n", nFASTQlocal, bLOADEDQlocal);
if (nWARNSlocal > 0)
- fprintf(stderr, " WARNING: "F_U32" reads issued a warning.\n", nWARNSlocal);
+ fprintf(stderr, " WARNING: " F_U32 " reads issued a warning.\n", nWARNSlocal);
if (nSKIPPEDAlocal > 0)
- fprintf(stderr, " WARNING: "F_U32" reads (%0.4f%%) with "F_U64" bp (%0.4f%%) were too short (< "F_U32"bp) and were ignored.\n",
+ fprintf(stderr, " WARNING: " F_U32 " reads (%0.4f%%) with " F_U64 " bp (%0.4f%%) were too short (< " F_U32 "bp) and were ignored.\n",
nSKIPPEDAlocal, 100.0 * nSKIPPEDAlocal / (nSKIPPEDAlocal + nLOADEDAlocal),
bSKIPPEDAlocal, 100.0 * bSKIPPEDAlocal / (bSKIPPEDAlocal + bLOADEDAlocal),
minReadLength);
if (nSKIPPEDQlocal > 0)
- fprintf(stderr, " WARNING: "F_U32" reads (%0.4f%%) with "F_U64" bp (%0.4f%%) were too short (< "F_U32"bp) and were ignored.\n",
+ fprintf(stderr, " WARNING: " F_U32 " reads (%0.4f%%) with " F_U64 " bp (%0.4f%%) were too short (< " F_U32 "bp) and were ignored.\n",
nSKIPPEDQlocal, 100.0 * nSKIPPEDQlocal / (nSKIPPEDQlocal + nLOADEDQlocal),
bSKIPPEDQlocal, 100.0 * bSKIPPEDQlocal / (bSKIPPEDQlocal + bLOADEDQlocal),
minReadLength);
// Write status to HTML
-#if 0
- fprintf(htmlLog, "<tr class='details'><td rowspan='2'>FASTA</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADEDAlocal, bLOADEDAlocal);
- fprintf(htmlLog, "<tr class='details'><td>"F_U32" reads ("F_U64" bp) were short and not loaded</td></tr>\n", nSKIPPEDAlocal, bSKIPPEDAlocal);
-
- fprintf(htmlLog, "<tr class='details'><td rowspan='2'>FASTQ</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADEDQlocal, bLOADEDQlocal);
- fprintf(htmlLog, "<tr class='details'><td>"F_U32" reads ("F_U64" bp) were short and not loaded</td></tr>\n", nSKIPPEDQlocal, bSKIPPEDQlocal);
-
- fprintf(htmlLog, "<tr><td colspan='2'>"F_U32" reads ("F_U64" bp) loaded, "F_U32" reads ("F_U64" bp) skipped, "F_U32" warnings</td></tr>\n",
- nLOADEDAlocal + nLOADEDQlocal, bLOADEDAlocal + bLOADEDQlocal,
- nSKIPPEDAlocal + nSKIPPEDQlocal, bSKIPPEDAlocal + bSKIPPEDQlocal,
- nWARNSlocal);
-#else
- fprintf(htmlLog, "dat "F_U32" "F_U64" "F_U32" "F_U64" "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n",
+ fprintf(htmlLog, "dat " F_U32 " " F_U64 " " F_U32 " " F_U64 " " F_U32 " " F_U64 " " F_U32 " " F_U64 " " F_U32 "\n",
nLOADEDAlocal, bLOADEDAlocal,
nSKIPPEDAlocal, bSKIPPEDAlocal,
nLOADEDQlocal, bLOADEDQlocal,
nSKIPPEDQlocal, bSKIPPEDQlocal,
nWARNSlocal);
-#endif
// Add the just loaded numbers to the global numbers
@@ -551,6 +525,7 @@ int
main(int argc, char **argv) {
char *gkpStoreName = NULL;
char *outPrefix = NULL;
+ gkStore_mode mode = gkStore_create;
uint32 minReadLength = 0;
@@ -565,8 +540,13 @@ main(int argc, char **argv) {
int arg = 1;
int err = 0;
while (arg < argc) {
- if (strcmp(argv[arg], "-o") == 0) {
- gkpStoreName = argv[++arg];
+ if (strcmp(argv[arg], "-o") == 0) { // This previously used gkStore_append here, but if
+ mode = gkStore_create; // two instances of gatekeeperCreate were run in the
+ gkpStoreName = argv[++arg]; // same directory, they would clobber each other,
+ // generating a blobs file that is a mix of both.
+ } else if (strcmp(argv[arg], "-a") == 0) { // So now the -o will fail if any trace of a store
+ mode = gkStore_extend; // exists in the output location, and -a will
+ gkpStoreName = argv[++arg]; // blindly add reads to an existing set of files
} else if (strcmp(argv[arg], "-minlength") == 0) {
minReadLength = atoi(argv[++arg]);
@@ -594,6 +574,7 @@ main(int argc, char **argv) {
if (err) {
fprintf(stderr, "usage: %s [...] -o gkpStore\n", argv[0]);
fprintf(stderr, " -o gkpStore create this gkpStore\n");
+ fprintf(stderr, " -a gkpStore append to this gkpStore\n");
fprintf(stderr, " \n");
fprintf(stderr, " -minlength L discard reads shorter than L\n");
fprintf(stderr, " \n");
@@ -608,7 +589,7 @@ main(int argc, char **argv) {
}
- gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, gkStore_extend);
+ gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, mode);
gkRead *gkpRead = NULL;
gkLibrary *gkpLibrary = NULL;
uint32 gkpFileID = 0; // Used for HTML output, an ID for each file loaded.
@@ -621,17 +602,17 @@ main(int argc, char **argv) {
errno = 0;
- sprintf(errorLogName, "%s/errorLog", gkpStoreName);
+ snprintf(errorLogName, FILENAME_MAX, "%s/errorLog", gkpStoreName);
FILE *errorLog = fopen(errorLogName, "w");
if (errno)
fprintf(stderr, "ERROR: cannot open error file '%s': %s\n", errorLogName, strerror(errno)), exit(1);
- sprintf(htmlLogName, "%s/load.dat", gkpStoreName);
+ snprintf(htmlLogName, FILENAME_MAX, "%s/load.dat", gkpStoreName);
FILE *htmlLog = fopen(htmlLogName, "w");
if (errno)
fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", htmlLogName, strerror(errno)), exit(1);
- sprintf(nameMapName, "%s/readNames.txt", gkpStoreName);
+ snprintf(nameMapName, FILENAME_MAX, "%s/readNames.txt", gkpStoreName);
FILE *nameMap = fopen(nameMapName, "w");
if (errno)
fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", nameMapName, strerror(errno)), exit(1);
@@ -645,26 +626,6 @@ main(int argc, char **argv) {
uint32 nSKIPPED = 0;
uint64 bSKIPPED = 0; // Bases not loaded, too short
-#if 0
- fprintf(htmlLog, "<!DOCTYPE html>\n");
- fprintf(htmlLog, "<html>\n");
- fprintf(htmlLog, "<head>\n");
- fprintf(htmlLog, "<title>gatekeeper load statistics</title>\n");
- fprintf(htmlLog, "<style type='text/css'>\n");
- fprintf(htmlLog, "body { font-family: Helvetica, Verdana, sans-serif; }\n");
- fprintf(htmlLog, "h1, h2 { color: #ee3e80; }\n");
- fprintf(htmlLog, "p { color: #665544; }\n");
- fprintf(htmlLog, "th, td { border: 1px solid #111111; padding: 2px 2px 2px 2px; }\n");
- fprintf(htmlLog, "td:hover { background-color: #e4e4e4; }\n");
- fprintf(htmlLog, "th:hover { background-color: #d4d4d4; }\n");
- fprintf(htmlLog, "tr.details { visibility: collapse; }\n");
- fprintf(htmlLog, "</style>\n");
- fprintf(htmlLog, "</head>\n");
- fprintf(htmlLog, "<body>\n");
- fprintf(htmlLog, "<h2>Input Files</h2>\n");
- fprintf(htmlLog, "<table>\n");
-#endif
-
for (; firstFileArg < argc; firstFileArg++) {
fprintf(stderr, "\n");
fprintf(stderr, "Starting file '%s'.\n", argv[firstFileArg]);
@@ -745,10 +706,6 @@ main(int argc, char **argv) {
delete [] linekv;
}
-#if 0
- fprintf(htmlLog, "</table>\n");
-#endif
-
gkpStore->gkStore_close();
fclose(nameMap);
@@ -756,67 +713,21 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, "Finished with:\n");
- fprintf(stderr, " "F_U32" warnings (bad base or qv, too short, too long)\n", nWARNS);
- fprintf(stderr, "\n");
-#if 0
- fprintf(stderr, "Read from inputs:\n");
- fprintf(stderr, " "F_U64" bp.\n", bLOADED);
- fprintf(stderr, " "F_U32" reads.\n", nLOADED);
+ fprintf(stderr, " " F_U32 " warnings (bad base or qv, too short, too long)\n", nWARNS);
fprintf(stderr, "\n");
-#endif
fprintf(stderr, "Loaded into store:\n");
- fprintf(stderr, " "F_U64" bp.\n", bLOADED);
- fprintf(stderr, " "F_U32" reads.\n", nLOADED);
+ fprintf(stderr, " " F_U64 " bp.\n", bLOADED);
+ fprintf(stderr, " " F_U32 " reads.\n", nLOADED);
fprintf(stderr, "\n");
fprintf(stderr, "Skipped (too short):\n");
- fprintf(stderr, " "F_U64" bp (%.4f%%).\n", bSKIPPED, 100.0 * bSKIPPED / (bSKIPPED + bLOADED));
- fprintf(stderr, " "F_U32" reads (%.4f%%).\n", nSKIPPED, 100.0 * nSKIPPED / (nSKIPPED + nLOADED));
+ fprintf(stderr, " " F_U64 " bp (%.4f%%).\n", bSKIPPED, (bSKIPPED + bLOADED > 0) ? (100.0 * bSKIPPED / (bSKIPPED + bLOADED)) : 0);
+ fprintf(stderr, " " F_U32 " reads (%.4f%%).\n", nSKIPPED, (nSKIPPED + nLOADED > 0) ? (100.0 * nSKIPPED / (nSKIPPED + nLOADED)) : 0);
fprintf(stderr, "\n");
fprintf(stderr, "\n");
-
-#if 0
- fprintf(htmlLog, "\n");
- fprintf(htmlLog, "<h2>Final Store</h2>\n");
- fprintf(htmlLog, "<table>\n");
- fprintf(htmlLog, "<tr><td colspan='2'>%s</td></tr>\n", gkpStoreName);
- fprintf(htmlLog, "<tr><td>readsLoaded</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADED, bLOADED);
- fprintf(htmlLog, "<tr><td>readsSkipped</td><td>"F_U32" reads ("F_U64" bp) (read was too short)</td></tr>\n", nSKIPPED, bSKIPPED);
- fprintf(htmlLog, "<tr><td>warnings</td><td>"F_U32" warnings (invalid base or quality value)</td></tr>\n", nWARNS);
- fprintf(htmlLog, "</table>\n");
- fprintf(htmlLog, "\n");
-
- fprintf(htmlLog, "<script type='text/javascript'>\n");
- fprintf(htmlLog, "var toggleOne = function() {\n");
- fprintf(htmlLog, " var table = this.closest('table');\n");
- fprintf(htmlLog, " var elts = table.querySelectorAll('.details');\n");
- fprintf(htmlLog, "\n");
- fprintf(htmlLog, " for (var i=0; i<elts.length; i++) {\n");
- fprintf(htmlLog, " if (!elts[i].enabled) {\n");
- fprintf(htmlLog, " elts[i].enabled = true;\n");
- fprintf(htmlLog, " elts[i].style.visibility = 'visible';\n");
- fprintf(htmlLog, " } else {\n");
- fprintf(htmlLog, " elts[i].enabled = false;\n");
- fprintf(htmlLog, " elts[i].style.visibility = 'collapse';\n");
- fprintf(htmlLog, " }\n");
- fprintf(htmlLog, " }\n");
- fprintf(htmlLog, "}\n");
- fprintf(htmlLog, "\n");
- for (uint32 ii=0; ii<gkpFileID; ii++) {
- fprintf(htmlLog, "document.getElementById('gkpload%u').onclick = toggleOne;\n", ii);
- fprintf(htmlLog, "document.getElementById('gkpload%u').style = 'cursor: pointer;';\n", ii);
- }
- fprintf(htmlLog, "</script>\n");
- fprintf(htmlLog, "\n");
- fprintf(htmlLog, "</body>\n");
- fprintf(htmlLog, "</html>\n");
-#else
- fprintf(htmlLog, "sum "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS);
-#endif
+ fprintf(htmlLog, "sum " F_U32 " " F_U64 " " F_U32 " " F_U64 " " F_U32 "\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS);
fclose(htmlLog);
-
-
if (nERROR > 0)
fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many errors.\n");
diff --git a/src/stores/gatekeeperDumpFASTQ.C b/src/stores/gatekeeperDumpFASTQ.C
index fe2f3e2..f2280c0 100644
--- a/src/stores/gatekeeperDumpFASTQ.C
+++ b/src/stores/gatekeeperDumpFASTQ.C
@@ -52,7 +52,7 @@ public:
strcpy(_p, outPrefix);
if (outSuffix[0])
- sprintf(_s, ".%s", outSuffix);
+ snprintf(_s, FILENAME_MAX, ".%s", outSuffix);
else
_s[0] = 0;
@@ -83,12 +83,12 @@ public:
char N[FILENAME_MAX];
if (_n[0])
- sprintf(N, "%s.%s.fastq%s", _p, _n, _s);
+ snprintf(N, FILENAME_MAX, "%s.%s.fastq%s", _p, _n, _s);
else
- sprintf(N, "%s.fastq%s", _p, _s);
+ snprintf(N, FILENAME_MAX, "%s.fastq%s", _p, _s);
if ((_p[0] == '-') && (_p[1] == 0)) {
- sprintf(N, "(stdout)");
+ snprintf(N, FILENAME_MAX, "(stdout)");
_FASTQ = stdout;
}
@@ -113,12 +113,12 @@ public:
char N[FILENAME_MAX];
if (_n[0])
- sprintf(N, "%s.%s.fasta%s", _p, _n, _s);
+ snprintf(N, FILENAME_MAX, "%s.%s.fasta%s", _p, _n, _s);
else
- sprintf(N, "%s.fasta%s", _p, _s);
+ snprintf(N, FILENAME_MAX, "%s.fasta%s", _p, _s);
if ((_p[0] == '-') && (_p[1] == 0)) {
- sprintf(N, "(stdout)");
+ snprintf(N, FILENAME_MAX, "(stdout)");
_FASTA = stdout;
}
@@ -189,6 +189,7 @@ main(int argc, char **argv) {
bool dumpFASTA = false;
bool withLibName = true;
+ bool withReadName = true;
argc = AS_configure(argc, argv);
@@ -242,6 +243,9 @@ main(int argc, char **argv) {
} else if (strcmp(argv[arg], "-nolibname") == 0) {
withLibName = false;
+ } else if (strcmp(argv[arg], "-noreadname") == 0) {
+ withReadName = false;
+
} else {
err++;
@@ -274,6 +278,9 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, " -nolibname don't include the library name in the output file name\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " -noreadname don't include the read name in the sequence header. header will be:\n");
+ fprintf(stderr, " '>original-name id=<gkpID> clr=<bgn>,<end> with names\n");
+ fprintf(stderr, " '><gkpID> clr=<bgn>,<end> without names\n");
if (gkpStoreName == NULL)
fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
@@ -296,7 +303,7 @@ main(int argc, char **argv) {
endID = numReads;
if (endID < bgnID)
- fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);
+ fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn=" F_U32 " end=" F_U32 "??\n", bgnID, endID);
@@ -360,6 +367,8 @@ main(int argc, char **argv) {
gkpStore->gkStore_loadReadData(read, readData);
+ char *name = readData->gkReadData_getName();
+
char *seq = readData->gkReadData_getSequence();
char *qlt = readData->gkReadData_getQualities();
uint32 clen = rclr - lclr;
@@ -373,7 +382,7 @@ main(int argc, char **argv) {
for (uint32 i=lclr; i<rclr; i++)
seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';
- for (uint32 i=rclr; flen; i++)
+ for (uint32 i=rclr; i<flen; i++)
seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;
lclr = 0;
@@ -390,15 +399,26 @@ main(int argc, char **argv) {
// Print the read.
- if (dumpFASTA)
- AS_UTL_writeFastA(out[libID]->getFASTA(), seq, clen, 100,
- ">"F_U32" clr="F_U32","F_U32"\n",
- rid, lclr, rclr);
-
- if (dumpFASTQ)
- AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, clen, qlt, clen,
- "@"F_U32" clr="F_U32","F_U32"\n",
- rid, lclr, rclr);
+ if (dumpFASTA) // Dear GCC: I'm NOT ambiguous
+ if ((withReadName == true) && (name != NULL))
+ AS_UTL_writeFastA(out[libID]->getFASTA(), seq, clen, 100,
+ ">%s id=" F_U32 " clr=" F_U32 "," F_U32 "\n",
+ name, rid, lclr, rclr);
+ else
+ AS_UTL_writeFastA(out[libID]->getFASTA(), seq, clen, 100,
+ ">" F_U32 " clr=" F_U32 "," F_U32 "\n",
+ rid, lclr, rclr);
+
+ if (dumpFASTQ) // Dear GCC: I'm NOT ambiguous
+ if ((withReadName == true) && (name != NULL))
+ AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, clen, qlt, clen,
+ "@%s id=" F_U32 " clr=" F_U32 "," F_U32 "\n",
+ name,
+ rid, lclr, rclr);
+ else
+ AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, clen, qlt, clen,
+ "@" F_U32 " clr=" F_U32 "," F_U32 "\n",
+ rid, lclr, rclr);
}
delete clrRange;
diff --git a/src/stores/gatekeeperDumpMetaData.C b/src/stores/gatekeeperDumpMetaData.C
index 04dd6d9..73bc7b3 100644
--- a/src/stores/gatekeeperDumpMetaData.C
+++ b/src/stores/gatekeeperDumpMetaData.C
@@ -41,7 +41,7 @@ dumpLibs(gkStore *gkp, uint32 bgnID, uint32 endID) {
for (uint32 lid=bgnID; lid<=endID; lid++) {
gkLibrary *library = gkp->gkStore_getLibrary(lid);
- fprintf(stdout, F_U32"\t"F_U32"\t%s\t%s\t%s\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t%s\n",
+ fprintf(stdout, F_U32"\t" F_U32 "\t%s\t%s\t%s\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t%s\n",
library->gkLibrary_libraryID(),
library->gkLibrary_isNonRandom(),
library->gkLibrary_readTypeString(),
@@ -68,12 +68,12 @@ dumpReads(gkStore *gkp, uint32 bgnID, uint32 endID, bool fullDump) {
continue;
if (fullDump == false)
- fprintf(stdout, F_U32"\t"F_U32"\t"F_U32"\n",
+ fprintf(stdout, F_U32"\t" F_U32 "\t" F_U32 "\n",
read->gkRead_readID(),
read->gkRead_libraryID(),
read->gkRead_sequenceLength());
else
- fprintf(stdout, F_U32"\t"F_U32"\t"F_U32"\t"F_U64"\t"F_U64"\n",
+ fprintf(stdout, F_U32"\t" F_U32 "\t" F_U32 "\t" F_U64 "\t" F_U64 "\n",
read->gkRead_readID(),
read->gkRead_libraryID(),
read->gkRead_sequenceLength(),
@@ -147,7 +147,7 @@ dumpStats(gkStore *gkp, uint32 bgnID, uint32 endID) {
// length histogram plot
for (uint32 l=0; l<gkp->gkStore_getNumLibraries() + 1; l++) {
- fprintf(stdout, "library "F_U32" reads "F_U32" bases: total "F_U64" ave "F_U64" min "F_U64" max "F_U64"\n",
+ fprintf(stdout, "library " F_U32 " reads " F_U32 " bases: total " F_U64 " ave " F_U64 " min " F_U64 " max " F_U64 "\n",
l, rs[l].numberOfReads(), rs[l].numberOfBases(), rs[l].numberOfBases() / rs[l].numberOfReads(), rs[l].minBases(), rs[l].maxBases());
}
}
@@ -256,7 +256,7 @@ main(int argc, char **argv) {
if (endID < bgnID)
- fprintf(stderr, "No objects to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);
+ fprintf(stderr, "No objects to dump; reversed ranges make no sense: bgn=" F_U32 " end=" F_U32 "??\n", bgnID, endID);
if (wantLibs)
diff --git a/src/stores/gatekeeperPartition.C b/src/stores/gatekeeperPartition.C
index 81dd653..ac89892 100644
--- a/src/stores/gatekeeperPartition.C
+++ b/src/stores/gatekeeperPartition.C
@@ -28,83 +28,170 @@
*/
#include "AS_global.H"
+
#include "gkStore.H"
-#include "AS_UTL_fileIO.H"
+#include "tgStore.H"
+
+//#include "AS_UTL_fileIO.H"
+
+
+//  Build a map from read ID to partition by walking every tig in a tig store.
+//  All reads of one tig are placed in the same partition.
+//
+//    tigStoreName, tigStoreVers - tig store, and version of it, to scan
+//    readCountTarget            - smallest number of reads wanted in a partition
+//    partCountTarget            - largest number of partitions wanted
+//    numReads                   - number of reads; sizes the returned array
+//
+//  Returns an array of numReads+1 entries, allocated with new [] and not freed
+//  here, indexed by read ID.  Each entry is the partition (counting from 1) the
+//  read was assigned to, or UINT32_MAX if no tig visited here contains the read.
+//
+uint32 *
+buildPartition(char    *tigStoreName,
+               uint32   tigStoreVers,
+               uint32   readCountTarget,
+               uint32   partCountTarget,
+               uint32   numReads) {
+  tgStore *tigStore = new tgStore(tigStoreName, tigStoreVers);
+
+  //  Both targets come straight from atoi() on the command line.  A zero would
+  //  divide by zero below, so treat zero as the smallest sensible value.
+
+  if (partCountTarget == 0)
+    partCountTarget = 1;
+
+  //  Decide on how many reads per partition.  We take two targets, the partCountTarget
+  //  is used to decide how many partitions to make, but if there are too few reads in
+  //  each partition, we'll reset to readCountTarget.
+
+  if (readCountTarget < numReads / partCountTarget)
+    readCountTarget = numReads / partCountTarget;
+
+  if (readCountTarget == 0)
+    readCountTarget = 1;
+
+  //  Figure out how many partitions we'll make, then spread the reads equally through them.
+  //  With no reads at all, ceil() gives zero partitions; use one so the division is defined.
+
+  uint32  numParts = (uint32)ceil((double)numReads / readCountTarget);
+
+  if (numParts == 0)
+    numParts = 1;
+
+  readCountTarget = 1 + numReads / numParts;
+
+  fprintf(stderr, "For %u reads, will make %u partition%s with up to %u reads%s.\n",
+          numReads,
+          (numParts),
+          (numParts == 1) ? "" : "s",
+          readCountTarget,
+          (numParts == 1) ? "" : " in each");
+
+  //  Allocate space for the partitioning.
+
+  uint32  *readToPart = new uint32 [numReads + 1];
+
+  for (uint32 i=0; i<=numReads; i++)    //  All reads are in invalid
+    readToPart[i] = UINT32_MAX;         //  partitions, initially.
+
+  //  Run through all tigs and partition!
+
+  uint32  partCount = 1;
+  uint32  tigsCount = 0;
+  uint32  readCount = 0;
+
+  for (uint32 ti=0; ti<tigStore->numTigs(); ti++) {
+    if (tigStore->isDeleted(ti))
+      continue;
+
+    tgTig  *tig = tigStore->loadTig(ti);
+
+    //  Move to the next partition if adding this tig would reach the target.  The
+    //  (readCount > 0) test keeps a single oversized tig from leaving an empty partition.
+
+    if ((readCount + tig->numberOfChildren() >= readCountTarget) &&
+        (readCount > 0)) {
+      fprintf(stderr, "Partition %u has %u tigs and %u reads.\n",
+              partCount, tigsCount, readCount);
+
+      partCount++;
+      tigsCount = 0;
+      readCount = 0;
+    }
+
+    //  Assign all the reads in this tig to this partition.
+
+    tigsCount += 1;
+    readCount += tig->numberOfChildren();
+
+    for (uint32 ci=0; ci<tig->numberOfChildren(); ci++) {
+      uint32  readID = tig->getChild(ci)->ident();
+
+      assert(readID <= numReads);   //  readToPart[] has only numReads+1 entries.
+
+      readToPart[readID] = partCount;
+    }
+
+    tigStore->unloadTig(ti);
+  }
+
+  if (readCount > 0)
+    fprintf(stderr, "Partition %u has %u tigs and %u reads.\n",
+            partCount, tigsCount, readCount);
+
+  delete tigStore;
+
+  return(readToPart);
+}
+
int
main(int argc, char **argv) {
char *gkpStoreName = NULL;
- char *partitionFile = NULL;
+ char *tigStoreName = NULL;
+ char gkpCloneName[FILENAME_MAX];
+ uint32 tigStoreVers = 0;
+ uint32 readCountTarget = 2500; // No partition smaller than this
+ uint32 partCountTarget = 200; // No more than this many partitions
argc = AS_configure(argc, argv);
- int arg = 1;
- int err = 0;
+ vector<char *> err;
+ int arg = 1;
while (arg < argc) {
if (strcmp(argv[arg], "-G") == 0) {
gkpStoreName = argv[++arg];
- } else if (strcmp(argv[arg], "-P") == 0) {
- partitionFile = argv[++arg];
+ } else if (strcmp(argv[arg], "-T") == 0) {
+ tigStoreName = argv[++arg];
+ tigStoreVers = atoi(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-b") == 0) {
+ readCountTarget = atoi(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-p") == 0) {
+ partCountTarget = atoi(argv[++arg]);
} else {
- err++;
- fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
+ char *s = new char [1024];
+ snprintf(s, 1024, "ERROR: unknown option '%s'\n", argv[arg]);
+ err.push_back(s);
}
+
arg++;
}
- if (gkpStoreName == NULL)
- err++;
- if (partitionFile == NULL)
- err++;
- if (err) {
- fprintf(stderr, "usage: %s -G gkpStore -P partitionMapFile\n", argv[0]);
- fprintf(stderr, " -G gkpStore path to gatekeeper store\n");
- fprintf(stderr, " -P partFile file mapping read ID to partiton\n");
- fprintf(stderr, " format: 'partition readID'\n");
- fprintf(stderr, " \n");
-
- if (gkpStoreName == NULL)
- fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
- if (partitionFile == NULL)
- fprintf(stderr, "ERROR: no partition input (-P) supplied.\n");
+ if (gkpStoreName == NULL) err.push_back("ERROR: no gkpStore (-G) supplied.\n");
+ if (tigStoreName == NULL) err.push_back("ERROR: no partition input (-P) supplied.\n");
+
+ if (err.size() > 0) {
+ fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore v>\n", argv[0]);
+ fprintf(stderr, " -G <gkpStore> path to gatekeeper store\n");
+ fprintf(stderr, " -T <tigStore> <v> path to tig store and version to be partitioned\n");
+ fprintf(stderr, " -b <nReads> minimum number of reads per partition (50000)\n");
+ fprintf(stderr, " -p <nPartitions> number of partitions (200)\n");
+
+ for (uint32 ii=0; ii<err.size(); ii++)
+ if (err[ii])
+ fputs(err[ii], stderr);
+
exit(1);
}
+
// Open a READ ONLY store. This prevents us from mucking with the non-partitioned reads
// (like, by changing the read ID or pointer to the blob). We don't need it opened
// for writing anyway.
- gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, gkStore_readOnly);
- uint32 numReads = gkpStore->gkStore_getNumReads();
+ gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, gkStore_readOnly);
+ uint32 numReads = gkpStore->gkStore_getNumReads();
- uint32 *partition = new uint32 [numReads + 1];
+ // Clone that store into the tigStore directory.
- // Set all partitions to invalid.
+ snprintf(gkpCloneName, FILENAME_MAX, "%s/partitionedReads.gkpStore", tigStoreName);
- for (uint32 i=0; i<=numReads; i++)
- partition[i] = UINT32_MAX;
+ gkpStore->gkStore_clone(gkpCloneName);
- // Read the partition file
+ // Then close the original store and open the clone.
- errno = 0;
- FILE *F = fopen(partitionFile, "r");
- if (errno)
- fprintf(stderr, "GKP Error: Build_Partition()-- failed to open '%s': %s\n", partitionFile, strerror(errno)), exit(1);
+ gkpStore->gkStore_close();
- while (!feof(F)) {
- uint32 i, p;
+ gkpStore = gkStore::gkStore_open(gkpCloneName, gkStore_readOnly);
- if (2 == fscanf(F, " "F_U32" "F_U32" ", &p, &i)) {
- assert(i <= numReads);
- assert(partition[i] == UINT32_MAX);
+ // Scan all the tigs to build a map from read to partition.
- partition[i] = p;
- }
- }
- fclose(F);
+ uint32 *partition = buildPartition(tigStoreName, tigStoreVers,
+ readCountTarget, partCountTarget, numReads);
// Dump the partition data to the store, let it build partitions.
diff --git a/src/stores/gkStore.C b/src/stores/gkStore.C
index f81003f..bdcb322 100644
--- a/src/stores/gkStore.C
+++ b/src/stores/gkStore.C
@@ -34,16 +34,20 @@
#include "gkStore.H"
#include "AS_UTL_fileIO.H"
-#include "AS_UTL_alloc.H"
gkStore *gkStore::_instance = NULL;
uint32 gkStore::_instanceCount = 0;
+// Define this to use the original memory mapped file interface to the blobs data.
+#undef MMAP_BLOBS
-bool
-gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
+
+// Lowest level function to load data into a read.
+//
+void
+gkRead::gkRead_loadData(gkReadData *readData, uint8 *blob) {
readData->_read = this;
@@ -51,10 +55,6 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
resizeArrayPair(readData->_seq, readData->_qlt, readData->_seqAlloc, readData->_seqAlloc, (uint32)_seqLen+1, resizeArray_doNothing);
- // Where, or where!, is the data?
-
- uint64 offset = _mPtr;
-
// One might be tempted to set the readData blob to point to the blob data in the mmap,
// but doing so will cause it to be written out again.
@@ -62,16 +62,16 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
readData->_blobMax = 0;
readData->_blob = NULL;
- // Instead, we'll use someting horribly similar.
+ // Make sure that our blob is actually a blob.
- uint8 *blob = ((uint8 *)blobs) + offset;
char chunk[5];
if ((blob[0] != 'B') && (blob[1] != 'L') && (blob[2] != 'O') && (blob[3] != 'B'))
- fprintf(stderr, "Index error in read "F_U32" %c mPtr "F_U64" pID "F_U64" expected BLOB, got %c%c%c%c\n",
+ fprintf(stderr, "Index error in read " F_U32 " %c mPtr " F_U64 " pID " F_U64 " expected BLOB, got %02x %02x %02x %02x '%c%c%c%c'\n",
gkRead_readID(),
'?', //(_numberOfPartitions == 0) ? 'm' : 'p',
_mPtr, _pID,
+ blob[0], blob[1], blob[2], blob[3],
blob[0], blob[1], blob[2], blob[3]);
assert(blob[0] == 'B');
assert(blob[1] == 'L');
@@ -80,8 +80,6 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
uint32 blobLen = *((uint32 *)blob + 1);
- //fprintf(stderr, "BLOB len %u\n", blobLen);
-
blob += 8;
while ((blob[0] != 'S') ||
@@ -96,20 +94,15 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
uint32 chunkLen = *((uint32 *)blob + 1);
-#if 1
- // Fix for stores built between 9/3/15-9/7/15 when QVs were changed to uniform value but nothing was stored on disk
- // Loading from these stores lead to random uninitialized QVs
- // set all QVs to a default value
- // Should be unnecessary after Dec 8, 2015 and can be removed
- for (uint32 ii=0; ii<_seqLen; ii++)
- readData->_qlt[ii] = 20;
-#endif
-
- //fprintf(stderr, "%s len %u\n", chunk, chunkLen);
-
if (strncmp(chunk, "VERS", 4) == 0) {
}
+ else if (strncmp(chunk, "NAME", 4) == 0) {
+ resizeArray(readData->_name, 0, readData->_nameAlloc, chunkLen + 1, resizeArray_doNothing);
+ memcpy(readData->_name, blob + 8, chunkLen);
+ readData->_name[chunkLen] = 0;
+ }
+
else if (strncmp(chunk, "QSEQ", 4) == 0) {
//fprintf(stderr, "QSEQ not supported.\n");
}
@@ -126,39 +119,6 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
assert(_seqLen <= readData->_seqAlloc);
memcpy(readData->_qlt, blob + 8, _seqLen);
readData->_qlt[_seqLen] = 0;
-
-#if 1
- // Fix for older gkpStore that encoded QV's with offset '0'. Wasn't RIFF format supposed to solve problems like this?
- // Old encoding is ASCII was from '0' = 48 to 'l' = 108.
- // New encoding is integer from 0 to 60.
- // So, if we see:
- // 0-47, we're new format.
- // 48-60 we're either (but STRONGLY likely to be old).
- // 61-108 we're old format.
- // Most reads are well below qv=40, so this will be easy.
-
- bool isOld = false;
-
- for (uint32 ii=0; ii<_seqLen; ii++) {
- if (readData->_qlt[ii] < 48) {
- isOld = false;
- break;
- }
-
- else if (readData->_qlt[ii] < 61) {
- isOld = true;
- }
-
- else {
- isOld = true;
- break;
- }
- }
-
- if (isOld)
- for (uint32 ii=0; ii<_seqLen; ii++)
- readData->_qlt[ii] -= '0';
-#endif
}
else if (strncmp(chunk, "2SEQ", 4) == 0) {
@@ -185,16 +145,58 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
}
else {
- fprintf(stderr, "gkRead::gkRead_loadData()-- unknown chunk type '%s' skipped\n", chunk);
+ fprintf(stderr, "gkRead::gkRead_loadDataFromBlob()-- unknown chunk type %02x %02x %02x %02x '%c%c%c%c' skipped\n",
+ chunk[0], chunk[1], chunk[2], chunk[3],
+ chunk[0], chunk[1], chunk[2], chunk[3]);
+ assert(0);
}
blob += 4 + 4 + chunkLen;
}
+}
- return(true);
-};
+//  Read one blob from the current position of 'file' and decode it into readData.
+//
+void
+gkRead::gkRead_loadDataFromStream(gkReadData *readData, FILE *file) {
+  char    blobTag[5];
+  uint32  blobLen;
+
+  //  The payload length is stored in the blob header, so the header (four tag
+  //  bytes, then a uint32 length) must be read before the payload can be.
+
+  AS_UTL_safeRead(file,  blobTag, "gkStore::gkStore_loadDataFromFile::blob", sizeof(int8),   4);
+  AS_UTL_safeRead(file, &blobLen, "gkStore::gkStore_loadDataFromFile::size", sizeof(uint32), 1);
+
+  //  Reassemble header and payload in one buffer; gkRead_loadData() is handed
+  //  a pointer to the start of the header.
+
+  uint8  *buffer  = new uint8 [8 + blobLen];
+  uint8  *payload = buffer + 8;
+
+  memcpy(buffer,      blobTag, sizeof(uint8)  * 4);
+  memcpy(buffer + 4, &blobLen, sizeof(uint32) * 1);
+
+  AS_UTL_safeRead(file, payload, "gkStore::gkStore_loadDataFromFile::blob", sizeof(char), blobLen);
+
+  gkRead_loadData(readData, buffer);
+
+  delete [] buffer;
+}
+
+
+
+//  Decode this read's blob out of an in-memory copy of the blobs data.  'blobs'
+//  points to the start of that data; the blob for this read begins _mPtr bytes
+//  past it.
+//
+void
+gkRead::gkRead_loadDataFromMMap(gkReadData *readData, void *blobs) {
+  uint8  *blobStart = (uint8 *)blobs + _mPtr;
+
+  gkRead_loadData(readData, blobStart);
+}
+
+
+
+// Decode this read's blob from an open blobs file. The file is positioned
+// to byte offset _mPtr (measured from the start of the file), then the blob
+// found there is read and decoded into readData by gkRead_loadDataFromStream().
+// The position of 'file' is changed by this call.
+//
+void
+gkRead::gkRead_loadDataFromFile(gkReadData *readData, FILE *file) {
+ //fprintf(stderr, "gkRead::gkRead_loadDataFromFile()-- read %lu position %lu\n", _readID, _mPtr);
+ AS_UTL_fseek(file, _mPtr, SEEK_SET);
+ gkRead_loadDataFromStream(readData, file);
+}
+
// Dump a block of encoded data to disk, then update the gkRead to point to it.
@@ -202,18 +204,14 @@ gkRead::gkRead_loadData(gkReadData *readData, void *blobs) {
void
gkStore::gkStore_stashReadData(gkRead *read, gkReadData *data) {
- assert(_blobsFile != NULL);
+ assert(_blobsWriter != NULL);
- read->_mPtr = AS_UTL_ftell(_blobsFile);
+ read->_mPtr = _blobsWriter->tell();
read->_pID = _partitionID; // 0 if not partitioned
- //fprintf(stderr, "STASH read %u at position "F_SIZE_T"\n", read->gkRead_readID(), AS_UTL_ftell(_blobsFile));
+ //fprintf(stderr, "STASH read %u at position " F_U64 " or length " F_U64 "\n", read->gkRead_readID(), read->_mPtr, data->_blobLen);
- AS_UTL_safeWrite(_blobsFile,
- data->_blob,
- "gkStore_stashReadData::blob",
- sizeof(char),
- data->_blobLen);
+ _blobsWriter->write(data->_blob, data->_blobLen);
}
@@ -239,28 +237,13 @@ gkStore::gkStore_loadReadFromStream(FILE *S, gkRead *read, gkReadData *readData)
AS_UTL_safeRead(S, read, "gkStore::gkStore_loadReadFromStream::read", sizeof(gkRead), 1);
- // With some pain, we read the BLOB and its length, then allocate space for the blob
- // and finsh reading it.
+ // Load the read data.
- AS_UTL_safeRead(S, tag, "gkStore::gkStore_loadReadFromStream::blob", sizeof(int8), 4);
- AS_UTL_safeRead(S, &size, "gkStore::gkStore_loadReadFromStream::size", sizeof(uint32), 1);
-
- uint8 *blob = new uint8 [8 + size];
-
- memcpy(blob, tag, sizeof(uint8) * 4);
- memcpy(blob+4, &size, sizeof(uint32) * 1);
-
- AS_UTL_safeRead(S, blob+8, "gkStore::gkStore_loadReadFromStream::blob", sizeof(char), size);
-
- // Unpack the blob into a readData
-
- read->_mPtr = 0;
- read->gkRead_loadData(readData, blob);
-
- // And, that's it! Sweet!
+ read->gkRead_loadDataFromStream(readData, S);
}
+
// Dump the read metadata and read data to a stream.
//
void
@@ -365,6 +348,7 @@ gkRead::gkRead_encodeSeqQlt(char *H, char *S, char *Q, uint32 qv) {
// If there is a QV string, ensure that the lengths are the same. If not, trim or pad the QVs.
// Then, convert the expected Sanger-encoded QV's (base='!') to be just integers.
+ uint32 Hlen = strlen(H);
uint32 Slen = _seqLen = strlen(S);
uint32 Qlen = 0;
@@ -411,8 +395,10 @@ gkRead::gkRead_encodeSeqQlt(char *H, char *S, char *Q, uint32 qv) {
uint32 blobVers = 0x00000001;
- rd->gkReadData_encodeBlobChunk("BLOB", 0, NULL);
- rd->gkReadData_encodeBlobChunk("VERS", 4, &blobVers);
+ rd->gkReadData_encodeBlobChunk("BLOB", 0, NULL);
+ rd->gkReadData_encodeBlobChunk("VERS", 4, &blobVers);
+
+ rd->gkReadData_encodeBlobChunk("NAME", Hlen, H);
if (seq2Len > 0)
rd->gkReadData_encodeBlobChunk("2SEQ", seq2Len, seq); // Two-bit encoded sequence (ACGT only)
@@ -445,26 +431,6 @@ gkRead::gkRead_encodeSeqQlt(char *H, char *S, char *Q, uint32 qv) {
-#if 0
-// Not implemented.
-gkReadData *
-gkRead::gkRead_encodePacBio(char *H, char *S, char *Q) {
- gkReadData *rd = new gkReadData;
-
- return(rd);
-}
-
-// Not implemented.
-gkReadData *
-gkRead::gkRead_encodeMinION(char *H, char *S, char *Q) {
- gkReadData *rd = new gkReadData;
-
- return(rd);
-}
-#endif
-
-
-
////////////////////////////////////////
//
// gkLibrary is lightweight, except for three functions that need to parse strings
@@ -624,7 +590,7 @@ gkLibrary::gkLibrary_finalTrimString(void) {
// 3) No addition, no modification. gkStore(path);
//
gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
- char name[FILENAME_MAX + 5];
+ char name[FILENAME_MAX];
memset(_storePath, 0, sizeof(char) * FILENAME_MAX);
memset(_storeName, 0, sizeof(char) * FILENAME_MAX);
@@ -632,10 +598,10 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
strncpy(_storePath, path, FILENAME_MAX-1);
strncpy(_storeName, path, FILENAME_MAX-1); // Broken.
- sprintf(name, "%s/info", _storePath);
-
// If the info file exists, load it.
+ snprintf(name, FILENAME_MAX, "%s/info", _storePath);
+
if (AS_UTL_fileExists(name, false, false) == true) {
errno = 0;
FILE *I = fopen(name, "r");
@@ -648,27 +614,27 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
uint32 failed = 0;
if (_info.gkLibrarySize != sizeof(gkLibrary))
- failed += fprintf(stderr, "ERROR: gkLibrary size in store = "F_U32", differs from executable = "F_SIZE_T"\n",
+ failed += fprintf(stderr, "ERROR: gkLibrary size in store = " F_U32 ", differs from executable = " F_SIZE_T "\n",
_info.gkLibrarySize, sizeof(gkLibrary));
if (_info.gkReadSize != sizeof(gkRead))
- failed += fprintf(stderr, "ERROR: gkRead size in store = "F_U32", differs from executable = "F_SIZE_T"\n",
+ failed += fprintf(stderr, "ERROR: gkRead size in store = " F_U32 ", differs from executable = " F_SIZE_T "\n",
_info.gkReadSize, sizeof(gkRead));
if (_info.gkMaxLibrariesBits != AS_MAX_LIBRARIES_BITS)
- failed += fprintf(stderr, "ERROR: AS_MAX_LIBRARIES_BITS in store = "F_U32", differs from executable = "F_U32"\n",
+ failed += fprintf(stderr, "ERROR: AS_MAX_LIBRARIES_BITS in store = " F_U32 ", differs from executable = " F_U32 "\n",
_info.gkMaxLibrariesBits, AS_MAX_LIBRARIES_BITS);
if (_info.gkLibraryNameSize != LIBRARY_NAME_SIZE)
- failed += fprintf(stderr, "ERROR: LIBRARY_NAME_SIZE in store = "F_U32", differs from executable = "F_U32"\n",
+ failed += fprintf(stderr, "ERROR: LIBRARY_NAME_SIZE in store = " F_U32 ", differs from executable = " F_U32 "\n",
_info.gkLibraryNameSize, LIBRARY_NAME_SIZE);
if (_info.gkMaxReadBits != AS_MAX_READS_BITS)
- failed += fprintf(stderr, "ERROR: AS_MAX_READS_BITS in store = "F_U32", differs from executable = "F_U32"\n",
+ failed += fprintf(stderr, "ERROR: AS_MAX_READS_BITS in store = " F_U32 ", differs from executable = " F_U32 "\n",
_info.gkMaxReadBits, AS_MAX_READS_BITS);
if (_info.gkMaxReadLenBits != AS_MAX_READLEN_BITS)
- failed += fprintf(stderr, "ERROR: AS_MAX_READLEN_BITS in store = "F_U32", differs from executable = "F_U32"\n",
+ failed += fprintf(stderr, "ERROR: AS_MAX_READLEN_BITS in store = " F_U32 ", differs from executable = " F_U32 "\n",
_info.gkMaxReadLenBits, AS_MAX_READLEN_BITS);
if (failed)
@@ -682,6 +648,21 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
assert(_info.gkMaxReadBits == AS_MAX_READS_BITS);
assert(_info.gkMaxReadLenBits == AS_MAX_READLEN_BITS);
+ // If creating, check that files don't exist already.
+
+ if (mode == gkStore_create) {
+ mode = gkStore_extend;
+
+ snprintf(name, FILENAME_MAX, "%s/blobs", _storePath);
+
+ if (AS_UTL_fileExists(name, false, false) == true) {
+ fprintf(stderr, "ERROR: Can't create store '%s': %s store exists in same location.\n",
+ _storePath,
+ (_info.numReads > 0) ? "complete" : "partial");
+ exit(1);
+ }
+ }
+
// Clear ourself, to make valgrind happier.
_librariesMMap = NULL;
@@ -694,7 +675,8 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
_blobsMMap = NULL;
_blobs = NULL;
- _blobsFile = NULL;
+ _blobsWriter = NULL;
+ _blobsFiles = NULL;
_mode = mode;
@@ -718,17 +700,30 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
exit(1);
}
- sprintf(name, "%s/libraries", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/libraries", _storePath);
_librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_libraries = (gkLibrary *)_librariesMMap->get(0);
- sprintf(name, "%s/reads", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/reads", _storePath);
_readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_reads = (gkRead *)_readsMMap->get(0);
- sprintf(name, "%s/blobs", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/blobs", _storePath);
+#ifdef MMAP_BLOBS
_blobsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_blobs = (void *)_blobsMMap->get(0);
+#else
+ _blobsFiles = new FILE * [omp_get_max_threads()];
+
+ errno = 0;
+
+ for (uint32 ii=0; ii<omp_get_max_threads(); ii++)
+ _blobsFiles[ii] = fopen(name, "r");
+
+ if (errno)
+ fprintf(stderr, "Failed to open %u copies of the blobs file '%s' for reading: %s\n",
+ omp_get_max_threads(), name, strerror(errno)), exit(1);
+#endif
}
//
@@ -744,15 +739,15 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
exit(1);
}
- sprintf(name, "%s/libraries", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/libraries", _storePath);
_librariesMMap = new memoryMappedFile (name, memoryMappedFile_readWrite);
_libraries = (gkLibrary *)_librariesMMap->get(0);
- sprintf(name, "%s/reads", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/reads", _storePath);
_readsMMap = new memoryMappedFile (name, memoryMappedFile_readWrite);
_reads = (gkRead *)_readsMMap->get(0);
- sprintf(name, "%s/blobs", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/blobs", _storePath);
_blobsMMap = new memoryMappedFile (name, memoryMappedFile_readWrite);
_blobs = (void *)_blobsMMap->get(0);
}
@@ -771,7 +766,7 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
_librariesAlloc = MAX(64, 2 * _info.numLibraries);
_libraries = new gkLibrary [_librariesAlloc];
- sprintf(name, "%s/libraries", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/libraries", _storePath);
if (AS_UTL_fileExists(name, false, false) == true) {
_librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
@@ -784,7 +779,7 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
_readsAlloc = MAX(128, 2 * _info.numReads);
_reads = new gkRead [_readsAlloc];
- sprintf(name, "%s/reads", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/reads", _storePath);
if (AS_UTL_fileExists(name, false, false) == true) {
_readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
@@ -794,16 +789,12 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
_readsMMap = NULL;
}
- sprintf(name, "%s/blobs", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/blobs", _storePath);
_blobsMMap = NULL;
_blobs = NULL;
- errno = 0;
- _blobsFile = fopen(name, "a+");
- if (errno)
- fprintf(stderr, "gkStore()-- Failed to open blobs file '%s' for appending: %s\n",
- name, strerror(errno)), exit(1);
+ _blobsWriter = new writeBuffer(name, "a+");
}
//
@@ -821,7 +812,7 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
// bytes for the full meta data. Assuming 100x of 3kb read coverage on human, that's 100
// million reads, so 0.400 GB vs 2.4 GB.
- sprintf(name, "%s/partitions/map", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/partitions/map", _storePath);
errno = 0;
FILE *F = fopen(name, "r");
@@ -842,20 +833,20 @@ gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
fclose(F);
- sprintf(name, "%s/libraries", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/libraries", _storePath);
_librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_libraries = (gkLibrary *)_librariesMMap->get(0);
- //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _libraries);
+ //fprintf(stderr, " -- openend '%s' at " F_X64 "\n", name, _libraries);
- sprintf(name, "%s/partitions/reads.%04"F_U32P"", _storePath, partID);
+ snprintf(name, FILENAME_MAX, "%s/partitions/reads.%04" F_U32P, _storePath, partID);
_readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_reads = (gkRead *)_readsMMap->get(0);
- //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _reads);
+ //fprintf(stderr, " -- openend '%s' at " F_X64 "\n", name, _reads);
- sprintf(name, "%s/partitions/blobs.%04"F_U32P"", _storePath, partID);
+ snprintf(name, FILENAME_MAX, "%s/partitions/blobs.%04" F_U32P, _storePath, partID);
_blobsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
_blobs = (void *)_blobsMMap->get(0);
- //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _blobs);
+ //fprintf(stderr, " -- openend '%s' at " F_X64 "\n", name, _blobs);
}
// Info only, no access to reads or libraries.
@@ -890,7 +881,7 @@ gkStore::~gkStore() {
delete _librariesMMap;
} else if (_libraries) {
- sprintf(N, "%s/libraries", gkStore_path());
+ snprintf(N, FILENAME_MAX, "%s/libraries", gkStore_path());
errno = 0;
F = fopen(N, "w");
if (errno)
@@ -910,7 +901,7 @@ gkStore::~gkStore() {
delete _readsMMap;
} else if (_reads) {
- sprintf(N, "%s/reads", gkStore_path());
+ snprintf(N, FILENAME_MAX, "%s/reads", gkStore_path());
errno = 0;
F = fopen(N, "w");
if (errno)
@@ -927,7 +918,7 @@ gkStore::~gkStore() {
if (needsInfoUpdate) {
- sprintf(N, "%s/info", gkStore_path());
+ snprintf(N, FILENAME_MAX, "%s/info", gkStore_path());
errno = 0;
F = fopen(N, "w");
if (errno)
@@ -938,7 +929,7 @@ gkStore::~gkStore() {
fclose(F);
- sprintf(N, "%s/info.txt", gkStore_path());
+ snprintf(N, FILENAME_MAX, "%s/info.txt", gkStore_path());
errno = 0;
F = fopen(N, "w");
if (errno)
@@ -954,8 +945,14 @@ gkStore::~gkStore() {
if (_blobsMMap)
delete _blobsMMap;
- if (_blobsFile)
- fclose(_blobsFile);
+ if (_blobsWriter)
+ delete _blobsWriter;
+
+ for (uint32 ii=0; ii<omp_get_max_threads(); ii++)
+ if ((_blobsFiles) && (_blobsFiles[ii]))
+ fclose(_blobsFiles[ii]);
+
+ delete [] _blobsFiles;
delete [] _readIDtoPartitionIdx;
delete [] _readIDtoPartitionID;
@@ -976,13 +973,18 @@ gkStore::gkStore_addEmptyLibrary(char const *name) {
if (_info.numLibraries == _librariesAlloc)
increaseArray(_libraries, _info.numLibraries, _librariesAlloc, 128);
+ _libraries[_info.numLibraries] = gkLibrary();
+ _libraries[_info.numLibraries]._libraryID = _info.numLibraries;
+
// Bullet proof the library name - so we can make files with this prefix.
- char libname[LIBRARY_NAME_SIZE];
+ char *libname = _libraries[_info.numLibraries]._libraryName;
uint32 libnamepos = 0;
bool modified = false;
bool truncated = false;
+ memset(libname, 0, sizeof(char) * LIBRARY_NAME_SIZE);
+
for (char const *orig=name; *orig; orig++) {
if (*orig == '/') {
libname[libnamepos++] = '_';
@@ -1005,23 +1007,6 @@ gkStore::gkStore_addEmptyLibrary(char const *name) {
}
}
- libname[libnamepos] = 0;
-
-#if 0
- if (modified || truncated)
- fprintf(stderr, "gkStore_addEmptyLibrary()-- added library '%s' (original name '%s')\n",
- libname, name);
- else
- fprintf(stderr, "gkStore_addEmptyLibrary()-- added library '%s'\n",
- libname);
-#endif
-
- _libraries[_info.numLibraries] = gkLibrary();
-
- strncpy(_libraries[_info.numLibraries]._libraryName, libname, LIBRARY_NAME_SIZE-1);
-
- _libraries[_info.numLibraries]._libraryID = _info.numLibraries;
-
return(_libraries + _info.numLibraries);
}
@@ -1056,11 +1041,13 @@ gkStore::gkStore_addEmptyRead(gkLibrary *lib) {
void
-gkRead::gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfileslen, uint32 partID) {
-
- // Stash away the location of the partitioned data
+gkRead::gkRead_copyDataToPartition(void *blobs,
+ FILE **partfiles,
+ uint64 *partfileslen,
+ uint32 partID) {
- assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));
+ if (partID == UINT32_MAX) // If an invalid partition, don't do anything.
+ return;
// Figure out where the blob actually is, and make sure that it really is a blob
@@ -1072,6 +1059,10 @@ gkRead::gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfi
assert(blob[2] == 'O');
assert(blob[3] == 'B');
+ // The partfile should be at what we think is the end.
+
+ assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));
+
// Write the blob to the partition, update the length of the partition
AS_UTL_safeWrite(partfiles[partID], blob, "gkRead::gkRead_copyDataToPartition::blob", sizeof(char), blobLen);
@@ -1086,10 +1077,70 @@ gkRead::gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfi
partfileslen[partID] += blobLen;
assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));
+}
+
+
+
+void
+gkRead::gkRead_copyDataToPartition(FILE **blobsFiles,
+ FILE **partfiles,
+ uint64 *partfileslen,
+ uint32 partID) {
+
+ // Load the blob from disk.
+
+ char tag[5];
+ uint8 *blob;
+ uint32 blobLen;
+ FILE *file = blobsFiles[omp_get_thread_num()];
+
+ // Ideally, we'd do one read to get the whole blob. Without knowing
+ // the length, we're forced to do two.
+
+ AS_UTL_safeRead(file, tag, "gkStore::gkStore_loadDataFromFile::tag", sizeof(int8), 4);
+ AS_UTL_safeRead(file, &blobLen, "gkStore::gkStore_loadDataFromFile::blobLen", sizeof(uint32), 1);
+
+ blob = new uint8 [8 + blobLen];
+
+ memcpy(blob, tag, sizeof(uint8) * 4);
+ memcpy(blob+4, &blobLen, sizeof(uint32) * 1);
+ AS_UTL_safeRead(file, blob+8, "gkStore::gkStore_loadDataFromFile::blob", sizeof(char), blobLen);
+
+ assert(blob[0] == 'B');
+ assert(blob[1] == 'L');
+ assert(blob[2] == 'O');
+ assert(blob[3] == 'B');
+
+ // If a valid partition, write the data (we always have to read it though, so don't be all clever
+ // and try to move this test backward).
+
+ if (partID != UINT32_MAX) {
+ assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID])); // The partfile should be at what we think is the end.
+
+ // Write the blob to the partition, update the length of the partition
+
+ blobLen += 8;
+
+ AS_UTL_safeWrite(partfiles[partID], blob, "gkRead::gkRead_copyDataToPartition::blob", sizeof(char), blobLen);
+
+ // Update the read to the new location of the blob in the partitioned data.
+
+ _mPtr = partfileslen[partID];
+ _pID = partID;
+
+ // And finalize by remembering the length.
+
+ partfileslen[partID] += blobLen;
+
+ assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));
+ }
+
+ delete [] blob;
}
+
void
gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
char name[FILENAME_MAX];
@@ -1115,7 +1166,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
maxPartition = partitionMap[fi];
}
- fprintf(stderr, "Found "F_U32" unpartitioned reads and maximum partition of "F_U32"\n",
+ fprintf(stderr, "Found " F_U32 " unpartitioned reads and maximum partition of " F_U32 "\n",
unPartitioned, maxPartition);
// Create the partitions by opening N copies of the data stores,
@@ -1129,7 +1180,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
// Be nice and put all the partitions in a subdirectory.
- sprintf(name,"%s/partitions", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/partitions", _storePath);
if (AS_UTL_fileExists(name, true, true) == false)
AS_UTL_mkdir(name);
@@ -1142,7 +1193,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
readfileslen[0] = UINT32_MAX;
for (uint32 i=1; i<=maxPartition; i++) {
- sprintf(name,"%s/partitions/blobs.%04d", _storePath, i);
+ snprintf(name, FILENAME_MAX, "%s/partitions/blobs.%04d", _storePath, i);
errno = 0;
blobfiles[i] = fopen(name, "w");
@@ -1152,7 +1203,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
i, name, strerror(errno)), exit(1);
- sprintf(name,"%s/partitions/reads.%04d", _storePath, i);
+ snprintf(name, FILENAME_MAX, "%s/partitions/reads.%04d", _storePath, i);
errno = 0;
readfiles[i] = fopen(name, "w");
@@ -1165,7 +1216,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
// Open the output partition map file -- we might as well fail early if we can't make it also.
- sprintf(name,"%s/partitions/map", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/partitions/map", _storePath);
errno = 0;
FILE *rIDmF = fopen(name, "w");
@@ -1182,29 +1233,42 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
assert(pi != 0); // No zeroth partition, right?
- if (pi == UINT32_MAX)
- // Deleted reads are not assigned a partition; skip them
- continue;
-
// Make a copy of the read, then modify it for the partition, then write it to the partition.
// Without the copy, we'd need to update the master record too.
- gkRead partRead = _reads[fi]; //*gkStore_getRead(fi);
+ gkRead partRead = _reads[fi];
+
+ if (_blobs)
+ partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);
+ if (_blobsFiles)
+ partRead.gkRead_copyDataToPartition(_blobsFiles, blobfiles, blobfileslen, pi);
- partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);
+ // Because the blobsFiles copyDataToPartition variant is streaming through the file,
+ // we need to let it load (and ignore) deleted reads. After they're loaded (and ignored)
+ // we can then skip it.
-#if 1
- fprintf(stderr, "read "F_U32"="F_U32" len "F_U32" -- blob master "F_U64" -- to part "F_U32" new read id "F_U32" blob "F_U64"/"F_U64" -- at readIdx "F_U32"\n",
- fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
- _reads[fi]._mPtr,
- pi,
- partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
- readfileslen[pi]);
+ if (pi < UINT32_MAX) {
+#if 0
+ fprintf(stderr, "read " F_U32 "=" F_U32 " len " F_U32 " -- blob master " F_U64 " -- to part " F_U32 " new read id " F_U32 " blob " F_U64 "/" F_U64 " -- at readIdx " F_U32 "\n",
+ fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
+ _reads[fi]._mPtr,
+ pi,
+ partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
+ readfileslen[pi]);
#endif
- AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);
+ AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);
+
+ readIDmap[fi] = readfileslen[pi]++;
+ }
- readIDmap[fi] = readfileslen[pi]++;
+ else {
+#if 0
+ fprintf(stderr, "read " F_U32 "=" F_U32 " len " F_U32 " -- blob master " F_U64 " -- DELETED\n",
+ fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
+ _reads[fi]._mPtr);
+#endif
+ }
}
// There isn't a zeroth read.
@@ -1219,7 +1283,7 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
fclose(rIDmF);
for (uint32 i=1; i<=maxPartition; i++) {
- fprintf(stderr, "partition "F_U32" has "F_U32" reads\n", i, readfileslen[i]);
+ fprintf(stderr, "partition " F_U32 " has " F_U32 " reads\n", i, readfileslen[i]);
errno = 0;
@@ -1240,6 +1304,39 @@ gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
void
+gkStore::gkStore_clone(char *clonePath) {
+ char rPath[FILENAME_MAX];
+ char sPath[FILENAME_MAX];
+ char dPath[FILENAME_MAX];
+
+ AS_UTL_mkdir(clonePath);
+
+ errno = 0;
+ realpath(gkStore_path(), rPath);
+ if (errno)
+ fprintf(stderr, "gkStore::gkStore_clone()- failed to find path of '%s': %s\n",
+ gkStore_path(), strerror(errno)), exit(1);
+
+ snprintf(sPath, FILENAME_MAX, "%s/info", rPath);
+ snprintf(dPath, FILENAME_MAX, "%s/info", clonePath);
+ AS_UTL_symlink(sPath, dPath);
+
+ snprintf(sPath, FILENAME_MAX, "%s/libraries", rPath);
+ snprintf(dPath, FILENAME_MAX, "%s/libraries", clonePath);
+ AS_UTL_symlink(sPath, dPath);
+
+ snprintf(sPath, FILENAME_MAX, "%s/reads", rPath);
+ snprintf(dPath, FILENAME_MAX, "%s/reads", clonePath);
+ AS_UTL_symlink(sPath, dPath);
+
+ snprintf(sPath, FILENAME_MAX, "%s/blobs", rPath);
+ snprintf(dPath, FILENAME_MAX, "%s/blobs", clonePath);
+ AS_UTL_symlink(sPath, dPath);
+}
+
+
+
+void
gkStore::gkStore_delete(void) {
char path[FILENAME_MAX];
@@ -1251,10 +1348,10 @@ gkStore::gkStore_delete(void) {
gkStore_deletePartitions();
- sprintf(path, "%s/info", gkStore_path()); AS_UTL_unlink(path);
- sprintf(path, "%s/libraries", gkStore_path()); AS_UTL_unlink(path);
- sprintf(path, "%s/reads", gkStore_path()); AS_UTL_unlink(path);
- sprintf(path, "%s/blobs", gkStore_path()); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/info", gkStore_path()); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/libraries", gkStore_path()); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/reads", gkStore_path()); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/blobs", gkStore_path()); AS_UTL_unlink(path);
AS_UTL_unlink(path);
}
@@ -1265,7 +1362,7 @@ void
gkStore::gkStore_deletePartitions(void) {
char path[FILENAME_MAX];
- sprintf(path, "%s/partitions/map", gkStore_path());
+ snprintf(path, FILENAME_MAX, "%s/partitions/map", gkStore_path());
if (AS_UTL_fileExists(path, false, false) == false)
return;
@@ -1285,8 +1382,8 @@ gkStore::gkStore_deletePartitions(void) {
AS_UTL_unlink(path);
for (uint32 ii=0; ii<_numberOfPartitions; ii++) {
- sprintf(path, "%s/partitions/reads.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
- sprintf(path, "%s/partitions/blobs.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/partitions/reads.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
+ snprintf(path, FILENAME_MAX, "%s/partitions/blobs.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path);
}
}
diff --git a/src/stores/gkStore.H b/src/stores/gkStore.H
index fb7eee0..3dcb424 100644
--- a/src/stores/gkStore.H
+++ b/src/stores/gkStore.H
@@ -32,6 +32,7 @@
#include "AS_global.H"
#include "memoryMappedFile.H"
+#include "writeBuffer.H"
#include <vector>
@@ -105,12 +106,7 @@ public:
memset(_libraryName, 0, sizeof(char) * LIBRARY_NAME_SIZE);
strncpy(_libraryName, "UNDEFINED", LIBRARY_NAME_SIZE-1);
- gkLibrary_clearFeatures();
- };
-
- gkLibrary(char const *name) {
- memset(_libraryName, 0, sizeof(char) * LIBRARY_NAME_SIZE);
- strncpy(_libraryName, name, LIBRARY_NAME_SIZE-1);
+ _libraryID = UINT32_MAX;
gkLibrary_clearFeatures();
};
@@ -209,7 +205,10 @@ class gkRead;
class gkReadData {
public:
gkReadData() {
- _read = NULL;
+ _read = NULL;
+
+ _name = NULL;
+ _nameAlloc = 0;
_seq = NULL;
_qlt = NULL;
@@ -218,33 +217,21 @@ public:
_blobLen = 0;
_blobMax = 0;
_blob = NULL;
-
- _pacbio._deletionQV = NULL;
- _pacbio._deletionTag = NULL;
- _pacbio._insertionQV = NULL;
- _pacbio._substitutionQV = NULL;
- _pacbio._mergeQV = NULL;
-
- _nanopore._someQV = NULL;
};
~gkReadData() {
+ delete [] _name;
+
delete [] _seq;
delete [] _qlt;
delete [] _blob;
-
- delete [] _pacbio._deletionQV;
- delete [] _pacbio._deletionTag;
- delete [] _pacbio._insertionQV;
- delete [] _pacbio._substitutionQV;
- delete [] _pacbio._mergeQV;
-
- delete [] _nanopore._someQV;
};
gkRead *gkReadData_getRead(void) { return(_read); };
+ char *gkReadData_getName(void) { return(_name); };
+
bool gkReadData_hasSequence(void) { return(_seq != NULL); };
bool gkReadData_hasQualities(void) { return(_qlt != NULL); };
@@ -254,6 +241,9 @@ public:
private:
gkRead *_read; // Pointer to the mmap'd read
+ char *_name;
+ uint32 _nameAlloc;
+
char *_seq; // Everyone has sequence
char *_qlt; // and quality
uint32 _seqAlloc;
@@ -266,23 +256,6 @@ private:
void gkReadData_encodeBlobChunk(char const *tag, uint32 len, void *dat);
- // Some reads have more data than others. This used to be a union, but that
- // only complicated things with no clear beneift. Yay, we saved a few bytes on
- // a structure that we have only a few copies in core at a time.
-
- struct pacbio {
- char *_deletionQV;
- char *_deletionTag;
- char *_insertionQV;
- char *_substitutionQV;
- char *_mergeQV;
- } _pacbio;
-
- struct nanopore {
- char *_someQV;
- } _nanopore;
-
-
friend class gkRead;
friend class gkStore;
};
@@ -313,8 +286,20 @@ public:
uint64 gkRead_mPtr(void) { return(_mPtr); }; // For debugging, in gatekeeperDumpMetatData
uint64 gkRead_pID(void) { return(_pID); };
-public:
- bool gkRead_loadData(gkReadData *readData, void *blob);
+ // Functions to load the read data from disk.
+ //
+ // loadData() -- lowest level, called by the other functions to decode the
+ // encoded data into the gkReadData structure.
+ // loadDataFromStream() -- reads data from a FILE, does not position the stream
+ // loadDataFromFile() -- reads data from a FILE, positions the stream first
+ // loadDataFromMMap() -- reads data from a memory mapped file
+ //
+private:
+ void gkRead_loadData (gkReadData *readData, uint8 *blob);
+
+ void gkRead_loadDataFromStream(gkReadData *readData, FILE *file);
+ void gkRead_loadDataFromFile (gkReadData *readData, FILE *file);
+ void gkRead_loadDataFromMMap (gkReadData *readData, void *blob);
private:
uint32 gkRead_encode2bit(uint8 *&chunk, char *seq, uint32 seqLen);
@@ -327,11 +312,9 @@ private:
bool gkRead_decode4bit(uint8 *chunk, uint32 chunkLen, char *qlt, uint32 seqLen);
bool gkRead_decode5bit(uint8 *chunk, uint32 chunkLen, char *qlt, uint32 seqLen);
+ // Called by gatekeeperCreate to add a new read to the store.
public:
- // These are called by gatekeeperCreate.
gkReadData *gkRead_encodeSeqQlt(char *H, char *S, char *Q, uint32 qv);
- //gkReadData *gkRead_encodePacBio(char *H, char *S, char *Q); // Not implemented.
- //gkReadData *gkRead_encodeMinION(char *H, char *S, char *Q);
private:
char *gkRead_encodeSequence(char *sequence, char *encoded);
@@ -342,7 +325,8 @@ private:
private:
// Used by the store to copy data to a partition
- void gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfileslen, uint32 partID);
+ void gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfileslen, uint32 partID);
+ void gkRead_copyDataToPartition(FILE **blobsFiles, FILE **partfiles, uint64 *partfileslen, uint32 partID);
private:
@@ -384,18 +368,18 @@ public:
};
void writeInfoAsText(FILE *F) {
- fprintf(F, "gkMagic = 0x"F_X64"\n", gkMagic);
- fprintf(F, "gkVersion = 0x"F_X64"\n", gkVersion);
+ fprintf(F, "gkMagic = 0x" F_X64 "\n", gkMagic);
+ fprintf(F, "gkVersion = 0x" F_X64 "\n", gkVersion);
fprintf(F, "\n");
- fprintf(F, "gkLibrarySize = "F_U32"\n", gkLibrarySize);
- fprintf(F, "gkReadSize = "F_U32"\n", gkReadSize);
- fprintf(F, "gkMaxLibrariesBits = "F_U32"\n", gkMaxLibrariesBits);
- fprintf(F, "gkLibraryNameSize = "F_U32"\n", gkLibraryNameSize);
- fprintf(F, "gkMaxReadBits = "F_U32"\n", gkMaxReadBits);
- fprintf(F, "gkMaxReadLenBits = "F_U32"\n", gkMaxReadLenBits);
+ fprintf(F, "gkLibrarySize = " F_U32 "\n", gkLibrarySize);
+ fprintf(F, "gkReadSize = " F_U32 "\n", gkReadSize);
+ fprintf(F, "gkMaxLibrariesBits = " F_U32 "\n", gkMaxLibrariesBits);
+ fprintf(F, "gkLibraryNameSize = " F_U32 "\n", gkLibraryNameSize);
+ fprintf(F, "gkMaxReadBits = " F_U32 "\n", gkMaxReadBits);
+ fprintf(F, "gkMaxReadLenBits = " F_U32 "\n", gkMaxReadLenBits);
fprintf(F, "\n");
- fprintf(F, "numLibraries = "F_U32"\n", numLibraries);
- fprintf(F, "numReads = "F_U32"\n", numReads);
+ fprintf(F, "numLibraries = " F_U32 "\n", numLibraries);
+ fprintf(F, "numReads = " F_U32 "\n", numReads);
};
private:
@@ -423,9 +407,10 @@ private:
typedef enum {
gkStore_readOnly = 0x00, // Open read only
- gkStore_modify = 0x01, // Open for modification
- gkStore_extend = 0x02, // Open for modification and appending new reads/libraries
- gkStore_infoOnly = 0x03 // Open read only, but only load the info on the store; no access to reads or libraries
+ gkStore_modify = 0x01, // Open for modification - never used, explicitly uses mmap file
+ gkStore_create = 0x02, // Open for creating, will fail if files exist already
+ gkStore_extend = 0x03, // Open for modification and appending new reads/libraries
+ gkStore_infoOnly = 0x04 // Open read only, but only load the info on the store; no access to reads or libraries
} gkStore_mode;
@@ -436,6 +421,7 @@ toString(gkStore_mode m) {
switch (m) {
case gkStore_readOnly: return("gkStore_readOnly"); break;
case gkStore_modify: return("gkStore_modify"); break;
+ case gkStore_create: return("gkStore_create"); break;
case gkStore_extend: return("gkStore_extend"); break;
case gkStore_infoOnly: return("gkStore_infoOnly"); break;
}
@@ -455,21 +441,17 @@ public:
static
gkStore *gkStore_open(char const *path, gkStore_mode mode=gkStore_readOnly, uint32 partID=UINT32_MAX) {
+ // If an instance exists, return it, otherwise, make a new one.
+
#pragma omp critical
{
- // If an instance exists, return it.
-
if (_instance != NULL) {
_instanceCount++;
- //fprintf(stderr, "gkStore_open(%s), %u instances now\n", path, _instanceCount);
- }
-
- // Otherwise, make a new one.
-
- else {
+ //fprintf(stderr, "gkStore_open(%s) from thread %d, %u instances now\n", path, omp_get_thread_num(), _instanceCount);
+ } else {
_instance = new gkStore(path, mode, partID);
_instanceCount = 1;
- //fprintf(stderr, "gkStore_open(%s), first instance, create store\n", path);
+ //fprintf(stderr, "gkStore_open(%s) form thread %d, first instance, create store\n", path, omp_get_thread_num());
}
}
@@ -486,14 +468,15 @@ public:
if (_instanceCount == 0) {
delete _instance;
_instance = NULL;
- //fprintf(stderr, "gkStore_close(%s), no instances remain, delete store\n", _storeName, _instanceCount);
+ //fprintf(stderr, "gkStore_close(%s) from thread %d, no instances remain, delete store\n",
+ // _storeName, omp_get_thread_num());
}
else {
- //fprintf(stderr, "gkStore_close(%s), %u instances remain\n", _storeName, _instanceCount);
+ //fprintf(stderr, "gkStore_close(%s) from thread %d, %u instances remain\n",
+ // _storeName, omp_get_thread_num(), _instanceCount);
}
}
-
};
@@ -503,6 +486,8 @@ public:
void gkStore_buildPartitions(uint32 *partitionMap);
+ void gkStore_clone(char *clonePath);
+
void gkStore_delete(void); // Deletes the files in the store.
void gkStore_deletePartitions(void); // Deletes the files for a partition.
@@ -548,15 +533,21 @@ public:
gkLibrary *gkStore_addEmptyLibrary(char const *name);
gkRead *gkStore_addEmptyRead(gkLibrary *lib);
- bool gkStore_loadReadData(gkRead *read, gkReadData *readData) {
- return(read->gkRead_loadData(readData, _blobs));
+ void gkStore_loadReadData(gkRead *read, gkReadData *readData) {
+ //fprintf(stderr, "loadReadData()- read " F_U64 " thread " F_S32 " out of " F_S32 "\n",
+ // read->_readID, omp_get_thread_num(), omp_get_max_threads());
+ if (_blobs)
+ read->gkRead_loadDataFromMMap(readData, _blobs);
+ if (_blobsFiles)
+ read->gkRead_loadDataFromFile(readData, _blobsFiles[omp_get_thread_num()]);
};
- bool gkStore_loadReadData(uint32 readID, gkReadData *readData) {
- return(gkStore_getRead(readID)->gkRead_loadData(readData, _blobs));
+ void gkStore_loadReadData(uint32 readID, gkReadData *readData) {
+ gkStore_loadReadData(gkStore_getRead(readID), readData);
};
void gkStore_stashReadData(gkRead *read, gkReadData *data);
+ // Used in utgcns, for the package format.
static
void gkStore_loadReadFromStream(FILE *S, gkRead *read, gkReadData *readData);
void gkStore_saveReadToStream(FILE *S, uint32 id);
@@ -574,6 +565,9 @@ private:
// If these are memory mapped, then multiple processes on the same host can share the
// (read-only) data.
+ //
+ // For blobs, we allow either using the mmap directly, or skipping the mmap and
+ // using a buffer.
memoryMappedFile *_librariesMMap;
uint32 _librariesAlloc; // If zero, the mmap is used.
@@ -584,8 +578,9 @@ private:
gkRead *_reads;
memoryMappedFile *_blobsMMap; // Either the full blobs, or the partitioned blobs.
- void *_blobs;
- FILE *_blobsFile;
+ void *_blobs; // Pointer to the data in the blobsMMap.
+ writeBuffer *_blobsWriter; // For constructing a store, data gets dumped here.
+ FILE **_blobsFiles; // For loading reads directly, one per thread.
// If the store is openend partitioned, this data is loaded from disk
diff --git a/src/stores/libsnappy/snappy-internal.h b/src/stores/libsnappy/snappy-internal.h
new file mode 100644
index 0000000..38846e2
--- /dev/null
+++ b/src/stores/libsnappy/snappy-internal.h
@@ -0,0 +1,237 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+// Copyright 2008 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Internals shared between the Snappy implementation and its unittest.
+
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
+#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
+
+#include "snappy-stubs-internal.h"
+
+namespace snappy {
+namespace internal {
+
+class WorkingMemory {
+ public:
+ WorkingMemory() : large_table_(NULL) { }
+ ~WorkingMemory() { delete[] large_table_; }
+
+ // Allocates and clears a hash table using memory in "*this",
+ // stores the number of buckets in "*table_size" and returns a pointer to
+ // the base of the hash table.
+ uint16* GetHashTable(size_t input_size, int* table_size);
+
+ private:
+ uint16 small_table_[1<<10]; // 2KB
+ uint16* large_table_; // Allocated only when needed
+
+ DISALLOW_COPY_AND_ASSIGN(WorkingMemory);
+};
+
+// Flat array compression that does not emit the "uncompressed length"
+// prefix. Compresses "input" string to the "*op" buffer.
+//
+// REQUIRES: "input_length <= kBlockSize"
+// REQUIRES: "op" points to an array of memory that is at least
+// "MaxCompressedLength(input_length)" in size.
+// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
+// REQUIRES: "table_size" is a power of two
+//
+// Returns an "end" pointer into "op" buffer.
+// "end - op" is the compressed size of "input".
+char* CompressFragment(const char* input,
+ size_t input_length,
+ char* op,
+ uint16* table,
+ const int table_size);
+
+// Return the largest n such that
+//
+// s1[0,n-1] == s2[0,n-1]
+// and n <= (s2_limit - s2).
+//
+// Does not read *s2_limit or beyond.
+// Does not read *(s1 + (s2_limit - s2)) or beyond.
+// Requires that s2_limit >= s2.
+//
+// Separate implementation for x86_64, for speed. Uses the fact that
+// x86_64 is little endian.
+#if defined(ARCH_K8)
+static inline int FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
+ assert(s2_limit >= s2);
+ int matched = 0;  // count of leading bytes already known to be equal
+
+ // Find out how long the match is. We loop over the data 64 bits at a
+ // time until we find a 64-bit block that doesn't match; then we find
+ // the first non-matching bit and use that to calculate the total
+ // length of the match.
+ while (PREDICT_TRUE(s2 <= s2_limit - 8)) {  // a full 8-byte word is still available
+ if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
+ s2 += 8;
+ matched += 8;
+ } else {
+ // On current (mid-2008) Opteron models there is a 3% more
+ // efficient code sequence to find the first non-matching byte.
+ // However, what follows is ~10% better on Intel Core 2 and newer,
+ // and we expect AMD's bsf instruction to improve.
+ uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);  // set bits mark differences
+ int matching_bits = Bits::FindLSBSetNonZero64(x);  // x != 0 here because the words differ
+ matched += matching_bits >> 3;  // convert equal low-order bits to whole equal bytes
+ return matched;
+ }
+ }
+ while (PREDICT_TRUE(s2 < s2_limit)) {  // fewer than 8 bytes remain: compare one byte at a time
+ if (s1[matched] == *s2) {
+ ++s2;
+ ++matched;
+ } else {
+ return matched;
+ }
+ }
+ return matched;  // reached s2_limit without finding a mismatch
+}
+#else
+static inline int FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
+ // Portable variant of the x86-64 version above; compares 32 bits at a time.
+ assert(s2_limit >= s2);
+ int matched = 0;  // count of leading bytes already known to be equal
+
+ while (s2 <= s2_limit - 4 &&
+ UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
+ s2 += 4;
+ matched += 4;
+ }
+ if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {  // loop stopped on a differing word
+ uint32 x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);  // set bits mark differences
+ int matching_bits = Bits::FindLSBSetNonZero(x);
+ matched += matching_bits >> 3;  // convert equal low-order bits to whole equal bytes
+ } else {  // big-endian host, or fewer than 4 bytes left: finish byte by byte
+ while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ }
+ }
+ return matched;
+}
+#endif
+
+// Lookup tables for decompression code. Give --snappy_dump_decompression_table
+// to the unit test to recompute char_table.
+
+enum {  // element kinds; NOTE(review): presumably the low two bits of a tag byte -- confirm in snappy.cc
+ LITERAL = 0,  // presumably a run of literal bytes -- confirm
+ COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
+ COPY_2_BYTE_OFFSET = 2,  // presumably a copy whose offset is the 2 bytes after the opcode -- confirm
+ COPY_4_BYTE_OFFSET = 3  // presumably a copy whose offset is the 4 bytes after the opcode -- confirm
+};
+static const int kMaximumTagLength = 5; // Longest element header: a COPY_4_BYTE_OFFSET opcode byte plus its 4 offset bytes.
+
+// wordmask[i], for i in [0,4], is the mask that keeps the bottom 8*i bits of a 32-bit word.
+static const uint32 wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+// Data stored per entry in lookup table:
+// Range Bits-used Description
+// ------------------------------------
+// 1..64 0..7 Literal/copy length encoded in opcode byte
+// 0..7 8..10 Copy offset encoded in opcode byte / 256
+// 0..4 11..13 Extra bytes after opcode
+//
+// We use eight bits for the length even though 7 would have sufficed
+// because of efficiency reasons:
+// (1) Extracting a byte is faster than a bit-field
+// (2) It properly aligns copy offset so we do not need a <<8
+static const uint16 char_table[256] = {  // one packed entry per possible opcode byte; bit layout is given in the comment above
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
+} // end namespace internal
+} // end namespace snappy
+
+#endif // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
diff --git a/src/stores/libsnappy/snappy-sinksource.cc b/src/stores/libsnappy/snappy-sinksource.cc
new file mode 100644
index 0000000..369a132
--- /dev/null
+++ b/src/stores/libsnappy/snappy-sinksource.cc
@@ -0,0 +1,104 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <string.h>
+
+#include "snappy-sinksource.h"
+
+namespace snappy {
+
+Source::~Source() { }  // Out-of-line definition of the virtual destructor; nothing to release.
+
+Sink::~Sink() { }  // Out-of-line definition of the virtual destructor; nothing to release.
+
+char* Sink::GetAppendBuffer(size_t length, char* scratch) {  // default implementation; "length" is unused
+ return scratch;  // always hand back the caller-supplied scratch buffer
+}
+
+char* Sink::GetAppendBufferVariable(  // default implementation: the scratch buffer is the append buffer
+ size_t min_size, size_t desired_size_hint, char* scratch,  // min_size and desired_size_hint are unused here
+ size_t scratch_size, size_t* allocated_size) {
+ *allocated_size = scratch_size;  // report the scratch buffer's size as the capacity
+ return scratch;
+}
+
+void Sink::AppendAndTakeOwnership(  // default implementation: append via Append(), then release
+ char* bytes, size_t n,
+ void (*deleter)(void*, const char*, size_t),
+ void *deleter_arg) {
+ Append(bytes, n);  // hand the bytes to the subclass's Append()
+ (*deleter)(deleter_arg, bytes, n);  // release "bytes" only after Append() has returned
+}
+
+ByteArraySource::~ByteArraySource() { }  // Empty: the destructor does not free the array behind ptr_.
+
+size_t ByteArraySource::Available() const { return left_; }  // Bytes not yet consumed by Skip().
+
+const char* ByteArraySource::Peek(size_t* len) {  // exposes all remaining bytes as one flat region
+ *len = left_;  // region length equals Available()
+ return ptr_;  // current read position; Peek() itself does not advance it
+}
+
+void ByteArraySource::Skip(size_t n) {  // consume n bytes; n is not range-checked here
+ left_ -= n;  // size_t arithmetic: wraps around if n > left_
+ ptr_ += n;
+}
+
+UncheckedByteArraySink::~UncheckedByteArraySink() { }  // Empty: the destructor does not free the destination array.
+
+void UncheckedByteArraySink::Append(const char* data, size_t n) {  // writes n bytes at dest_ with no bounds check
+ // Do no copying if the caller filled in the result of GetAppendBuffer()
+ if (data != dest_) {  // data lives elsewhere, so it must be copied in
+ memcpy(dest_, data, n);
+ }
+ dest_ += n;  // advance the output position past the appended bytes
+}
+
+char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) {  // "len" and "scratch" are unused
+ return dest_;  // callers write straight into the destination array
+}
+
+void UncheckedByteArraySink::AppendAndTakeOwnership(  // like Append(), plus release of a foreign buffer
+ char* data, size_t n,
+ void (*deleter)(void*, const char*, size_t),
+ void *deleter_arg) {
+ if (data != dest_) {  // foreign buffer: copy it in, then release it
+ memcpy(dest_, data, n);
+ (*deleter)(deleter_arg, data, n);  // note: not invoked when data == dest_
+ }
+ dest_ += n;  // advance the output position in both cases
+}
+
+char* UncheckedByteArraySink::GetAppendBufferVariable(  // scratch and scratch_size are unused
+ size_t min_size, size_t desired_size_hint, char* scratch,
+ size_t scratch_size, size_t* allocated_size) {
+ *allocated_size = desired_size_hint;  // NOTE(review): min_size is never consulted; confirm callers pass a hint >= min_size
+ return dest_;  // callers write straight into the destination array
+}
+
+} // namespace snappy
diff --git a/src/stores/libsnappy/snappy-sinksource.h b/src/stores/libsnappy/snappy-sinksource.h
new file mode 100644
index 0000000..5ff7ff5
--- /dev/null
+++ b/src/stores/libsnappy/snappy-sinksource.h
@@ -0,0 +1,207 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
+#define THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
+
+#include <stddef.h>
+
+namespace snappy {
+
+// A Sink is an abstract interface that consumes a sequence of bytes.
+class Sink {
+ public:
+ Sink() { }
+ virtual ~Sink();
+
+ // Append "bytes[0,n-1]" to this.
+ virtual void Append(const char* bytes, size_t n) = 0;
+
+ // Returns a writable buffer of the specified length for appending.
+ // May return a pointer to the caller-owned scratch buffer which
+ // must have at least the indicated length. The returned buffer is
+ // only valid until the next operation on this Sink.
+ //
+ // After writing at most "length" bytes, call Append() with the
+ // pointer returned from this function and the number of bytes
+ // written. Many Append() implementations will avoid copying
+ // bytes if this function returned an internal buffer.
+ //
+ // If a non-scratch buffer is returned, the caller may only pass a
+ // prefix of it to Append(). That is, it is not correct to pass an
+ // interior pointer of the returned array to Append().
+ //
+ // The default implementation always returns the scratch buffer.
+ virtual char* GetAppendBuffer(size_t length, char* scratch);
+
+ // For higher performance, Sink implementations can provide custom
+ // AppendAndTakeOwnership() and GetAppendBufferVariable() methods.
+ // These methods can reduce the number of copies done during
+ // compression/decompression.
+
+ // Append "bytes[0,n-1]" to the sink. Takes ownership of "bytes"
+ // and calls the deleter function as (*deleter)(deleter_arg, bytes, n)
+ // to free the buffer. The deleter function must be non-NULL.
+ //
+ // The default implementation just calls Append and frees "bytes".
+ // Other implementations may avoid a copy while appending the buffer.
+ virtual void AppendAndTakeOwnership(
+ char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
+ void *deleter_arg);
+
+ // Returns a writable buffer for appending and writes the buffer's capacity to
+ // *allocated_size. Guarantees *allocated_size >= min_size.
+ // May return a pointer to the caller-owned scratch buffer which must have
+ // scratch_size >= min_size.
+ //
+ // The returned buffer is only valid until the next operation
+ // on this Sink.
+ //
+ // After writing at most *allocated_size bytes, call Append() with the
+ // pointer returned from this function and the number of bytes written.
+ // Many Append() implementations will avoid copying bytes if this function
+ // returned an internal buffer.
+ //
+ // If the sink implementation allocates or reallocates an internal buffer,
+ // it should use the desired_size_hint if appropriate. If a caller cannot
+ // provide a reasonable guess at the desired capacity, it should set
+ // desired_size_hint = 0.
+ //
+ // If a non-scratch buffer is returned, the caller may only pass
+ // a prefix of it to Append(). That is, it is not correct to pass an
+ // interior pointer to Append().
+ //
+ // The default implementation always returns the scratch buffer.
+ virtual char* GetAppendBufferVariable(
+ size_t min_size, size_t desired_size_hint, char* scratch,
+ size_t scratch_size, size_t* allocated_size);
+
+ private:
+ // No copying: declared private and not defined in this header.
+ Sink(const Sink&);
+ void operator=(const Sink&);
+};
+
+// A Source is an abstract interface that yields a sequence of bytes.
+class Source {
+ public:
+ Source() { }
+ virtual ~Source();
+
+ // Return the number of bytes left to read from the source
+ virtual size_t Available() const = 0;
+
+ // Peek at the next flat region of the source. Does not reposition
+ // the source. The returned region is empty iff Available()==0.
+ //
+ // Returns a pointer to the beginning of the region and stores its
+ // length in *len.
+ //
+ // The returned region is valid until the next call to Skip() or
+ // until this object is destroyed, whichever occurs first.
+ //
+ // The returned region may be larger than Available() (for example
+ // if this Source is a view on a substring of a larger source).
+ // The caller is responsible for ensuring that it only reads the
+ // Available() bytes.
+ virtual const char* Peek(size_t* len) = 0;
+
+ // Skip the next n bytes. Invalidates any buffer returned by
+ // a previous call to Peek().
+ // REQUIRES: Available() >= n
+ virtual void Skip(size_t n) = 0;
+
+ private:
+ // No copying: declared private and not defined in this header.
+ Source(const Source&);
+ void operator=(const Source&);
+};
+
+// A Source implementation that yields the contents of a caller-provided flat array.
+class ByteArraySource : public Source {
+ public:
+ ByteArraySource(const char* p, size_t n) : ptr_(p), left_(n) { }  // stores the pointer and length; no copy is made
+ virtual ~ByteArraySource();
+ virtual size_t Available() const;
+ virtual const char* Peek(size_t* len);
+ virtual void Skip(size_t n);
+ private:
+ const char* ptr_;  // next byte to be yielded
+ size_t left_;  // number of bytes remaining from ptr_
+};
+
+// A Sink implementation that writes to a flat array without any bounds checks.
+class UncheckedByteArraySink : public Sink {
+ public:
+ explicit UncheckedByteArraySink(char* dest) : dest_(dest) { }  // dest: where the first appended byte goes
+ virtual ~UncheckedByteArraySink();
+ virtual void Append(const char* data, size_t n);
+ virtual char* GetAppendBuffer(size_t len, char* scratch);
+ virtual char* GetAppendBufferVariable(
+ size_t min_size, size_t desired_size_hint, char* scratch,
+ size_t scratch_size, size_t* allocated_size);
+ virtual void AppendAndTakeOwnership(
+ char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
+ void *deleter_arg);
+
+ // Return the current output pointer so that a caller can see how
+ // many bytes were produced.
+ // Note: this is not a Sink method.
+ char* CurrentDestination() const { return dest_; }
+ private:
+ char* dest_;  // next write position; advanced by each append
+};
+
+} // namespace snappy
+
+#endif // THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
diff --git a/src/stores/libsnappy/snappy-stubs-internal.cc b/src/stores/libsnappy/snappy-stubs-internal.cc
new file mode 100644
index 0000000..6ed3343
--- /dev/null
+++ b/src/stores/libsnappy/snappy-stubs-internal.cc
@@ -0,0 +1,42 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <algorithm>
+#include <string>
+
+#include "snappy-stubs-internal.h"
+
+namespace snappy {
+
+void Varint::Append32(string* s, uint32 value) {  // appends the varint32 encoding of "value" to *s
+ char buf[Varint::kMax32];  // kMax32 is the maximum length of an encoded uint32
+ const char* p = Varint::Encode32(buf, value);  // p points just past the last encoded byte
+ s->append(buf, p - buf);  // append only the bytes actually produced
+}
+
+} // namespace snappy
diff --git a/src/stores/libsnappy/snappy-stubs-internal.h b/src/stores/libsnappy/snappy-stubs-internal.h
new file mode 100644
index 0000000..02a07c0
--- /dev/null
+++ b/src/stores/libsnappy/snappy-stubs-internal.h
@@ -0,0 +1,553 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Various stubs for the open-source version of Snappy.
+
+#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
+#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#include "snappy-stubs-public.h"
+
+#if defined(__x86_64__)
+
+// Enable the 64-bit optimized versions of some routines (code guarded by ARCH_K8).
+#define ARCH_K8 1
+
+#endif
+
+// Fall back to MAP_ANON where MAP_ANONYMOUS is missing; needed by OS X, among others.
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+// Pull in std::min, std::ostream, and the likes. This is safe because this
+// header file is never used from any public header files.
+using namespace std;
+
+// The number of elements in an array whose size is known at compile time.
+// Will give unexpected results if used on a pointer.
+// We undefine it first, since some compilers already have a definition.
+#ifdef ARRAYSIZE
+#undef ARRAYSIZE
+#endif
+#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
+
+// Static branch-prediction hints for boolean conditions; every expansion parenthesizes its argument.
+#ifdef HAVE_BUILTIN_EXPECT
+#define PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))  // !! normalizes to 0/1, matching PREDICT_TRUE
+#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#else
+#define PREDICT_FALSE(x) (x)  // parenthesized so PREDICT_FALSE(a || b) && c groups as intended
+#define PREDICT_TRUE(x) (x)
+#endif
+
+// These flag macros are only used for recomputing the tag byte table used during
+// decompression; for simplicity we just remove it from the open-source
+// version (anyone who wants to regenerate it can just do the call
+// themselves within main()).
+#define DEFINE_bool(flag_name, default_value, description) \
+ bool FLAGS_ ## flag_name = default_value  // defines FLAGS_<flag_name>; "description" is accepted but unused
+#define DECLARE_bool(flag_name) \
+ extern bool FLAGS_ ## flag_name  // declares FLAGS_<flag_name> without defining it
+
+namespace snappy {
+
+static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);  // all 32 bits set
+static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);  // every bit set except the top (sign) bit
+
+// Potentially unaligned loads and stores.
+
+// x86 and PowerPC can simply do these loads and stores native.
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
+
+#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))  // plain dereference through a cast pointer
+#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
+
+#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))  // plain store through a cast pointer
+#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
+
+// ARMv7 and newer support native unaligned accesses, but only of 16-bit
+// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
+// do an unaligned read and rotate the words around a bit, or do the reads very
+// slowly (trip through kernel mode). There's no simple #define that says just
+// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
+// sub-architectures.
+//
+// This is a mess, but there's not much we can do about it.
+//
+// To further complicate matters, only LDR instructions (single reads) are
+// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
+// explicitly tell the compiler that these accesses can be unaligned, it can and
+// will combine accesses. On armcc, the way to signal this is done by accessing
+// through the type (uint32 __packed *), but GCC has no such attribute
+// (it ignores __attribute__((packed)) on individual variables). However,
+// we can tell it that a _struct_ is unaligned, which has the same effect,
+// so we do that.
+
+#elif defined(__arm__) && \
+ !defined(__ARM_ARCH_4__) && \
+ !defined(__ARM_ARCH_4T__) && \
+ !defined(__ARM_ARCH_5__) && \
+ !defined(__ARM_ARCH_5T__) && \
+ !defined(__ARM_ARCH_5TE__) && \
+ !defined(__ARM_ARCH_5TEJ__) && \
+ !defined(__ARM_ARCH_6__) && \
+ !defined(__ARM_ARCH_6J__) && \
+ !defined(__ARM_ARCH_6K__) && \
+ !defined(__ARM_ARCH_6Z__) && \
+ !defined(__ARM_ARCH_6ZK__) && \
+ !defined(__ARM_ARCH_6T2__)
+
+#if __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((__packed__))  // GCC-style packed attribute for the structs below
+#else
+#define ATTRIBUTE_PACKED  // expands to nothing on other compilers
+#endif
+
+namespace base {
+namespace internal {
+
+struct Unaligned16Struct {  // wrapper accessed through ->value by UNALIGNED_LOAD16/STORE16
+ uint16 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+struct Unaligned32Struct {  // wrapper accessed through ->value by UNALIGNED_LOAD32/STORE32
+ uint32 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+} // namespace internal
+} // namespace base
+
+#define UNALIGNED_LOAD16(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)  // read via the packed struct
+#define UNALIGNED_LOAD32(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)  // read via the packed struct
+
+#define UNALIGNED_STORE16(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
+ (_val))  // write via the packed struct
+#define UNALIGNED_STORE32(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
+ (_val))  // write via the packed struct
+
+// TODO(user): NEON supports unaligned 64-bit loads and stores.
+// See if that would be more efficient on platforms supporting it,
+// at least for copies.
+
+inline uint64 UNALIGNED_LOAD64(const void *p) {  // 64-bit load done with memcpy on this ARM path
+ uint64 t;
+ memcpy(&t, p, sizeof t);  // byte copy, so p needs no particular alignment
+ return t;
+}
+
+inline void UNALIGNED_STORE64(void *p, uint64 v) {  // 64-bit store done with memcpy on this ARM path
+ memcpy(p, &v, sizeof v);  // byte copy, so p needs no particular alignment
+}
+
+#else
+
+// These functions are provided for architectures that don't support
+// unaligned loads and stores.
+
+inline uint16 UNALIGNED_LOAD16(const void *p) {  // returns the 2 bytes at p in host byte order
+ uint16 t;
+ memcpy(&t, p, sizeof t);  // byte copy, so p needs no particular alignment
+ return t;
+}
+
+inline uint32 UNALIGNED_LOAD32(const void *p) {  // returns the 4 bytes at p in host byte order
+ uint32 t;
+ memcpy(&t, p, sizeof t);  // byte copy, so p needs no particular alignment
+ return t;
+}
+
+inline uint64 UNALIGNED_LOAD64(const void *p) {  // returns the 8 bytes at p in host byte order
+ uint64 t;
+ memcpy(&t, p, sizeof t);  // byte copy, so p needs no particular alignment
+ return t;
+}
+
+inline void UNALIGNED_STORE16(void *p, uint16 v) {  // writes v's 2 bytes at p in host byte order
+ memcpy(p, &v, sizeof v);  // byte copy, so p needs no particular alignment
+}
+
+inline void UNALIGNED_STORE32(void *p, uint32 v) {  // writes v's 4 bytes at p in host byte order
+ memcpy(p, &v, sizeof v);  // byte copy, so p needs no particular alignment
+}
+
+inline void UNALIGNED_STORE64(void *p, uint64 v) {  // writes v's 8 bytes at p in host byte order
+ memcpy(p, &v, sizeof v);  // byte copy, so p needs no particular alignment
+}
+
+#endif
+
+// Copies 8 bytes from src to dst. This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64
+// on some platforms, in particular ARM.
+inline void UnalignedCopy64(const void *src, void *dst) {
+ if (sizeof(void *) == 8) {  // pointers are 8 bytes: move all 8 bytes in one load/store pair
+ UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src));
+ } else {  // otherwise move the 8 bytes as two 4-byte halves
+ const char *src_char = reinterpret_cast<const char *>(src);
+ char *dst_char = reinterpret_cast<char *>(dst);
+
+ UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char));  // bytes 0..3
+ UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4));  // bytes 4..7
+ }
+}
+
+// The following guarantees declaration of the byte swap functions.
+#ifdef WORDS_BIGENDIAN
+
+#ifdef HAVE_SYS_BYTEORDER_H
+#include <sys/byteorder.h>
+#endif
+
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+
+#ifdef _MSC_VER
+#include <stdlib.h>
+#define bswap_16(x) _byteswap_ushort(x)  // _MSC_VER branch: forward bswap_* to the _byteswap_* functions
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+#define bswap_16(x) OSSwapInt16(x)  // __APPLE__ branch: forward bswap_* to OSSwapInt* from <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif defined(HAVE_BYTESWAP_H)
+#include <byteswap.h>
+
+#elif defined(bswap32)
+// FreeBSD defines bswap{16,32,64} in <sys/endian.h> (already #included).
+#define bswap_16(x) bswap16(x)  // FreeBSD branch: forward the underscore names to bswap{16,32,64}
+#define bswap_32(x) bswap32(x)
+#define bswap_64(x) bswap64(x)
+
+#elif defined(BSWAP_64)
+// Solaris 10 defines BSWAP_{16,32,64} in <sys/byteorder.h> (already #included).
+#define bswap_16(x) BSWAP_16(x)  // Solaris branch: forward the lower-case names to BSWAP_{16,32,64}
+#define bswap_32(x) BSWAP_32(x)
+#define bswap_64(x) BSWAP_64(x)
+
+#else
+
+inline uint16 bswap_16(uint16 x) {  // portable fallback: exchange the two bytes of x
+ return (x << 8) | (x >> 8);  // the conversion back to uint16 drops bits above bit 15
+}
+
+inline uint32 bswap_32(uint32 x) {  // portable fallback: reverse the four bytes of x
+ x = ((x & 0xff00ff00UL) >> 8) | ((x & 0x00ff00ffUL) << 8);  // swap the bytes within each 16-bit half
+ return (x >> 16) | (x << 16);  // then swap the two 16-bit halves
+}
+
+inline uint64 bswap_64(uint64 x) {  // portable fallback: reverse the eight bytes of x
+ x = ((x & 0xff00ff00ff00ff00ULL) >> 8) | ((x & 0x00ff00ff00ff00ffULL) << 8);  // swap adjacent bytes
+ x = ((x & 0xffff0000ffff0000ULL) >> 16) | ((x & 0x0000ffff0000ffffULL) << 16);  // swap adjacent 16-bit units
+ return (x >> 32) | (x << 32);  // swap the two 32-bit halves
+}
+
+#endif
+
+#endif // WORDS_BIGENDIAN
+
+// Convert to little-endian storage, opposite of network format.
+// Convert x from host to little endian: x = LittleEndian.FromHost(x);
+// convert x from little endian to host: x = LittleEndian.ToHost(x);
+//
+// Store values into unaligned memory converting to little endian order:
+// LittleEndian.Store16(p, x);
+//
+// Load unaligned values stored in little endian converting to host order:
+// x = LittleEndian.Load16(p);
+class LittleEndian {
+ public:
+ // Conversion functions between host byte order and little-endian order.
+#ifdef WORDS_BIGENDIAN
+
+ static uint16 FromHost16(uint16 x) { return bswap_16(x); }
+ static uint16 ToHost16(uint16 x) { return bswap_16(x); }
+
+ static uint32 FromHost32(uint32 x) { return bswap_32(x); }
+ static uint32 ToHost32(uint32 x) { return bswap_32(x); }
+
+ static bool IsLittleEndian() { return false; }
+
+#else // !defined(WORDS_BIGENDIAN)
+
+ static uint16 FromHost16(uint16 x) { return x; }  // no swap in this branch: value passes through unchanged
+ static uint16 ToHost16(uint16 x) { return x; }
+
+ static uint32 FromHost32(uint32 x) { return x; }
+ static uint32 ToHost32(uint32 x) { return x; }
+
+ static bool IsLittleEndian() { return true; }
+
+#endif // !defined(WORDS_BIGENDIAN)
+
+ // Functions to do unaligned loads and stores in little-endian order.
+ static uint16 Load16(const void *p) {
+ return ToHost16(UNALIGNED_LOAD16(p));  // raw load, then convert to host order
+ }
+
+ static void Store16(void *p, uint16 v) {
+ UNALIGNED_STORE16(p, FromHost16(v));  // convert to little-endian, then raw store
+ }
+
+ static uint32 Load32(const void *p) {
+ return ToHost32(UNALIGNED_LOAD32(p));  // raw load, then convert to host order
+ }
+
+ static void Store32(void *p, uint32 v) {
+ UNALIGNED_STORE32(p, FromHost32(v));  // convert to little-endian, then raw store
+ }
+};
+
+// Some bit-manipulation functions.
+class Bits {
+ public:
+ // Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0.
+ static int Log2Floor(uint32 n);
+
+ // Return the index of the least significant set bit, 0-indexed. Returns an
+ // undefined value if n == 0. FindLSBSetNonZero() is similar to ffs() except
+ // that it's 0-indexed.
+ static int FindLSBSetNonZero(uint32 n);
+ static int FindLSBSetNonZero64(uint64 n);  // same contract for a 64-bit argument
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Bits);  // macro is not defined in this header; presumably from snappy-stubs-public.h
+};
+
+#ifdef HAVE_BUILTIN_CTZ
+
+inline int Bits::Log2Floor(uint32 n) {  // builtin-based version
+ return n == 0 ? -1 : 31 ^ __builtin_clz(n);  // for clz in [0,31], 31 ^ clz equals 31 - clz
+}
+
+inline int Bits::FindLSBSetNonZero(uint32 n) {  // builtin-based version; n must be non-zero
+ return __builtin_ctz(n);  // count of trailing zero bits = index of the lowest set bit
+}
+
+inline int Bits::FindLSBSetNonZero64(uint64 n) {  // builtin-based version; n must be non-zero
+ return __builtin_ctzll(n);  // count of trailing zero bits = index of the lowest set bit
+}
+
+#else // Portable versions.
+
+inline int Bits::Log2Floor(uint32 n) {  // portable version: narrows down the highest set bit
+ if (n == 0)
+ return -1;
+ int log = 0;  // accumulated index of the highest set bit
+ uint32 value = n;
+ for (int i = 4; i >= 0; --i) {  // try shifts of 16, 8, 4, 2, 1
+ int shift = (1 << i);
+ uint32 x = value >> shift;
+ if (x != 0) {  // a set bit exists at or above position "shift"
+ value = x;
+ log += shift;
+ }
+ }
+ assert(value == 1);  // only the highest set bit should remain
+ return log;
+}
+
+inline int Bits::FindLSBSetNonZero(uint32 n) {  // portable version: narrows down the lowest set bit
+ int rc = 31;  // start from the highest possible index and subtract
+ for (int i = 4, shift = 1 << 4; i >= 0; --i) {  // shifts of 16, 8, 4, 2, 1
+ const uint32 x = n << shift;
+ if (x != 0) {  // a set bit survives the shift, so the lowest set bit lies further down
+ n = x;
+ rc -= shift;
+ }
+ shift >>= 1;
+ }
+ return rc;
+}
+
+// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero(), one 32-bit half at a time.
+inline int Bits::FindLSBSetNonZero64(uint64 n) {
+ const uint32 bottombits = static_cast<uint32>(n);  // low 32 bits of n
+ if (bottombits == 0) {
+ // Bottom bits are zero, so scan in top bits
+ return 32 + FindLSBSetNonZero(static_cast<uint32>(n >> 32));
+ } else {
+ return FindLSBSetNonZero(bottombits);
+ }
+}
+
+#endif // End portable versions.
+
+// Variable-length integer encoding.
+class Varint {
+ public:
+ // Maximum length in bytes of the varint encoding of a uint32.
+ static const int kMax32 = 5;
+
+ // Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
+ // Never reads a character at or beyond limit. If a valid/terminated varint32
+ // was found in the range, stores it in *OUTPUT and returns a pointer just
+ // past the last byte of the varint32. Else returns NULL. On success,
+ // "result <= limit".
+ static const char* Parse32WithLimit(const char* ptr, const char* limit,
+ uint32* OUTPUT);
+
+ // REQUIRES "ptr" points to a buffer of length sufficient to hold "v".
+ // EFFECTS Encodes "v" into "ptr" and returns a pointer to the
+ // byte just past the last encoded byte.
+ static char* Encode32(char* ptr, uint32 v);
+
+ // EFFECTS Appends the varint representation of "value" to "*s".
+ static void Append32(string* s, uint32 value);
+};
+
+inline const char* Varint::Parse32WithLimit(const char* p,
+ const char* l,
+ uint32* OUTPUT) {
+ const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);  // work on unsigned bytes
+ const unsigned char* limit = reinterpret_cast<const unsigned char*>(l);
+ uint32 b, result;  // b: current byte; result: value assembled so far
+ if (ptr >= limit) return NULL;  // input exhausted before the varint terminated
+ b = *(ptr++); result = b & 127; if (b < 128) goto done;  // low 7 bits are payload; high bit clear ends the varint
+ if (ptr >= limit) return NULL;
+ b = *(ptr++); result |= (b & 127) << 7; if (b < 128) goto done;
+ if (ptr >= limit) return NULL;
+ b = *(ptr++); result |= (b & 127) << 14; if (b < 128) goto done;
+ if (ptr >= limit) return NULL;
+ b = *(ptr++); result |= (b & 127) << 21; if (b < 128) goto done;
+ if (ptr >= limit) return NULL;
+ b = *(ptr++); result |= (b & 127) << 28; if (b < 16) goto done;  // fifth byte may carry only bits 28..31
+ return NULL; // Value is too long to be a varint32
+ done:
+ *OUTPUT = result;  // written only on success
+ return reinterpret_cast<const char*>(ptr);  // just past the last byte consumed
+}
+
+inline char* Varint::Encode32(char* sptr, uint32 v) {
+ // Operate on characters as unsigneds
+ unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr);
+ static const int B = 128;  // continuation flag: set on every byte except the last
+ if (v < (1<<7)) {  // fits in 7 bits: 1 byte
+ *(ptr++) = v;
+ } else if (v < (1<<14)) {  // fits in 14 bits: 2 bytes
+ *(ptr++) = v | B;  // the store to unsigned char keeps only the low 8 bits
+ *(ptr++) = v>>7;
+ } else if (v < (1<<21)) {  // fits in 21 bits: 3 bytes
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = v>>14;
+ } else if (v < (1<<28)) {  // fits in 28 bits: 4 bytes
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = (v>>14) | B;
+ *(ptr++) = v>>21;
+ } else {  // needs all 32 bits: 5 bytes
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = (v>>14) | B;
+ *(ptr++) = (v>>21) | B;
+ *(ptr++) = v>>28;
+ }
+ return reinterpret_cast<char*>(ptr);  // just past the last byte written
+}
+
+// If you know the internal layout of the std::string in use, you can
+// replace this function with one that resizes the string without
+// filling the new space with zeros (if applicable) --
+// it will be non-portable but faster.
+inline void STLStringResizeUninitialized(string* s, size_t new_size) {  // portable version: a plain resize
+ s->resize(new_size);  // despite the function's name, this does not skip initialization
+}
+
+// Return a mutable char* pointing to a string's internal buffer,
+// which may not be null-terminated. Writing through this pointer will
+// modify the string.
+//
+// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the
+// next call to a string method that invalidates iterators.
+//
+// As of 2006-04, there is no standard-blessed way of getting a
+// mutable reference to a string's internal buffer. However, issue 530
+// (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-defects.html#530)
+// proposes this as the method. It will officially be part of the standard
+// for C++0x. This should already work on all current implementations.
+inline char* string_as_array(string* str) {  // mutable pointer to str's first character
+ return str->empty() ? NULL : &*str->begin();  // NULL for an empty string; begin() is not dereferenced then
+}
+
+} // namespace snappy
+
+#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
diff --git a/src/stores/libsnappy/snappy-stubs-public.h b/src/stores/libsnappy/snappy-stubs-public.h
new file mode 100644
index 0000000..997c3b9
--- /dev/null
+++ b/src/stores/libsnappy/snappy-stubs-public.h
@@ -0,0 +1,125 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: sesse at google.com (Steinar H. Gunderson)
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Various type stubs for the open-source version of Snappy.
+//
+// This file cannot include config.h, as it is included from snappy.h,
+// which is a public header. Instead, snappy-stubs-public.h is generated
+// from snappy-stubs-public.h.in at configure time.
+
+#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+
+#if 1
+#include <stdint.h>
+#endif
+
+#if 1
+#include <stddef.h>
+#endif
+
+#if 0
+#include <sys/uio.h>
+#endif
+
+#define SNAPPY_MAJOR 1
+#define SNAPPY_MINOR 1
+#define SNAPPY_PATCHLEVEL 3
+#define SNAPPY_VERSION \
+ ((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL)
+
+#include <string>
+
+namespace snappy {
+
+#if 1  // NOTE(review): presumably the configure-time "<stdint.h> is available" switch -- confirm
+typedef int8_t int8;
+typedef uint8_t uint8;
+typedef int16_t int16;
+typedef uint16_t uint16;
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int64_t int64;
+typedef uint64_t uint64;
+#else  // same short names spelled with built-in types (widths then depend on the platform)
+typedef signed char int8;
+typedef unsigned char uint8;
+typedef short int16;
+typedef unsigned short uint16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+#endif
+
+typedef std::string string;  // lets code inside this namespace write plain "string"
+
+#ifndef DISALLOW_COPY_AND_ASSIGN  // keep any definition that is already in effect
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName&); \
+ void operator=(const TypeName&)
+#endif  // the macro only declares the copy constructor and operator=; it defines neither
+
+#if !0  // NOTE(review): presumably "no <sys/uio.h>", mirroring the disabled include above -- confirm
+// Windows does not have an iovec type, yet the concept is universally useful.
+// It is simple to define it ourselves, so we put it inside our own namespace.
+struct iovec {
+ void* iov_base;
+ size_t iov_len;
+};
+#endif
+
+} // namespace snappy
+
+#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
diff --git a/src/stores/libsnappy/snappy.cc b/src/stores/libsnappy/snappy.cc
new file mode 100644
index 0000000..8a3668c
--- /dev/null
+++ b/src/stores/libsnappy/snappy.cc
@@ -0,0 +1,1400 @@
+// Copyright 2005 Google Inc. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "snappy.h"
+#include "snappy-internal.h"
+#include "snappy-sinksource.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+namespace snappy {
+
+using internal::COPY_1_BYTE_OFFSET;
+using internal::COPY_2_BYTE_OFFSET;
+using internal::COPY_4_BYTE_OFFSET;
+using internal::LITERAL;
+using internal::char_table;
+using internal::kMaximumTagLength;
+using internal::wordmask;
+
+// The choice of hash never affects the validity of the compressed
+// stream, only its quality: fewer collisions mean better compression
+// on compressible input and more speed on incompressible input.  The
+// hash is also called very often, so it should itself be cheap.
+static inline uint32 HashBytes(uint32 bytes, int shift) {
+  const uint32 multiplier = 0x1e35a7bd;
+  return (multiplier * bytes) >> shift;
+}
+static inline uint32 Hash(const char* p, int shift) {
+  const uint32 leading_word = UNALIGNED_LOAD32(p);
+  return HashBytes(leading_word, shift);
+}
+
+size_t MaxCompressedLength(size_t source_len) {
+  // Compressed data has the shape:
+  //    compressed := item* literal*
+  //    item       := literal* copy
+  //
+  // The trailing literal sequence expands by at most 62/60 (tag byte
+  // plus extra length information on top of the literal bytes).
+  //
+  // Items need more care.  A 4-byte copy is only ever emitted with an
+  // offset < 65536 (a special check in the encoder), so it encodes in
+  // 3 bytes and such items stay within the 62/60 literal bound.
+  //
+  // A 5-byte copy with a large enough offset takes 5 bytes to encode,
+  // so the worst item is a one-byte literal followed by a five-byte
+  // copy: 6 input bytes become 7 "compressed" bytes.
+  //
+  // That 7/6 ratio dominates the blowup, and 32 bytes of fixed slack
+  // are added on top of it.
+
+  const size_t slack = 32;
+  const size_t worst_case_growth = source_len / 6;
+  return slack + source_len + worst_case_growth;
+}
+
+// Copies "len" bytes from "src" to "op" strictly one byte at a time, so
+// that COPY operations whose input and output regions overlap behave as
+// a pattern repeat.  For example, with
+//    src == "ab"
+//    op  == src + 2
+//    len == 20
+// IncrementalCopy(src, op, len) leaves eleven copies of "ab":
+//    ababababababababababab
+// These are the semantics of neither memcpy() nor memmove().
+// REQUIRES: len > 0 (at least one byte is always copied).
+static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
+  assert(len > 0);
+  for (;;) {
+    *op++ = *src++;
+    if (--len <= 0) break;
+  }
+}
+
+// Equivalent to IncrementalCopy except that it can write up to ten extra
+// bytes after the end of the copy, and that it is faster.
+//
+// The main part of this loop is a simple copy of eight bytes at a time until
+// we've copied (at least) the requested amount of bytes. However, if op and
+// src are less than eight bytes apart (indicating a repeating pattern of
+// length < 8), we first need to expand the pattern in order to get the correct
+// results. For instance, if the buffer looks like this, with the eight-byte
+// <src> and <op> patterns marked as intervals:
+//
+// abxxxxxxxxxxxx
+// [------] src
+// [------] op
+//
+// a single eight-byte copy from <src> to <op> will repeat the pattern once,
+// after which we can move <op> two bytes without moving <src>:
+//
+// ababxxxxxxxxxx
+// [------] src
+// [------] op
+//
+// and repeat the exercise until the two no longer overlap.
+//
+// This allows us to do very well in the special case of one single byte
+// repeated many times, without taking a big hit for more general cases.
+//
+// The worst case of extra writing past the end of the match occurs when
+// op - src == 1 and len == 1; the last copy will read from byte positions
+// [0..7] and write to [4..11], whereas it was only supposed to write to
+// position 1. Thus, ten excess bytes.
+
+namespace {
+
+const int kMaxIncrementCopyOverflow = 10;
+
+inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
+  // Pattern shorter than eight bytes: keep doubling the distance between
+  // src and op until the two are at least eight bytes apart.
+  while (PREDICT_FALSE(op - src < 8)) {
+    const ssize_t gap = op - src;
+    UnalignedCopy64(src, op);
+    len -= gap;
+    op += gap;
+  }
+  for (; len > 0; len -= 8, src += 8, op += 8) {
+    UnalignedCopy64(src, op);
+  }
+}
+
+} // namespace
+
+static inline char* EmitLiteral(char* op, // output cursor; the advanced cursor is returned
+ const char* literal, // first byte of the literal data
+ int len, // number of literal bytes; must be at least 1
+ bool allow_fast_path) { // true only when the slack described below is available
+ int n = len - 1; // Zero-length literals are disallowed
+ if (n < 60) {
+ // len - 1 fits in the six bits above the two-bit LITERAL tag
+ *op++ = LITERAL | (n << 2);
+
+ // The vast majority of copies are below 16 bytes, for which a
+ // call to memcpy is overkill. This fast path can sometimes
+ // copy up to 15 bytes too much, but that is okay in the
+ // main loop, since we have a bit to go on for both sides:
+ //
+ // - The input will always have kInputMarginBytes = 15 extra
+ // available bytes, as long as we're in the main loop, and
+ // if not, allow_fast_path = false.
+ // - The output will always have 32 spare bytes (see
+ // MaxCompressedLength).
+ if (allow_fast_path && len <= 16) {
+ UnalignedCopy64(literal, op);
+ UnalignedCopy64(literal + 8, op + 8);
+ return op + len;
+ }
+ } else {
+ // Encode in upcoming bytes
+ char* base = op; // tag byte position; filled in once "count" is known
+ int count = 0;
+ op++;
+ while (n > 0) { // emit len - 1, least significant byte first
+ *op++ = n & 0xff;
+ n >>= 8;
+ count++;
+ }
+ assert(count >= 1);
+ assert(count <= 4);
+ *base = LITERAL | ((59+count) << 2); // upper bits 60..63 mean 1..4 length bytes follow
+ }
+ memcpy(op, literal, len);
+ return op + len;
+}
+
+static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+  assert(len <= 64);
+  assert(len >= 4);
+  assert(offset < 65536);
+
+  const bool fits_one_byte_offset = (len < 12) && (offset < 2048);
+  if (!fits_one_byte_offset) {
+    *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
+    LittleEndian::Store16(op, offset);
+    return op + 2;
+  }
+  size_t len_minus_4 = len - 4;
+  assert(len_minus_4 < 8);  // Must fit in 3 bits
+  *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
+  *op++ = offset & 0xff;
+  return op;
+}
+
+static inline char* EmitCopy(char* op, size_t offset, int len) {
+  // Peel off full 64-byte copies, but only while at least four bytes
+  // would remain afterwards: four is the shortest copy that
+  // EmitCopyLessThan64 accepts.
+  for (; PREDICT_FALSE(len >= 68); len -= 64) {
+    op = EmitCopyLessThan64(op, offset, 64);
+  }
+
+  // 65..67 bytes left: a 60-byte copy leaves 5..7 for the final op.
+  if (len > 64) {
+    op = EmitCopyLessThan64(op, offset, 60);
+    len -= 60;
+  }
+
+  // Whatever remains (at most 64 bytes) goes out as one last copy.
+  return EmitCopyLessThan64(op, offset, len);
+}
+
+
+bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
+  // A NULL from Parse32WithLimit() is failure; *result is then untouched.
+  uint32 v = 0;
+  const char* limit = start + n;
+  if (Varint::Parse32WithLimit(start, limit, &v) == NULL) {
+    return false;
+  }
+  *result = v;
+  return true;
+}
+
+namespace internal {
+uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
+  // The table is zero-filled on every call, which costs O(table size),
+  // and a short input cannot populate many entries anyway.  So start
+  // at 256 entries and double only until the table reaches the input
+  // size or kMaxHashTableSize.
+  assert(kMaxHashTableSize >= 256);
+  size_t entries = 256;
+  while (entries < kMaxHashTableSize && entries < input_size) {
+    entries *= 2;
+  }
+
+  // Sizes that fit use the small_table_ member array; larger ones use
+  // large_table_, allocated on first need with kMaxHashTableSize entries.
+  uint16* chosen = small_table_;
+  if (entries > ARRAYSIZE(small_table_)) {
+    if (large_table_ == NULL) {
+      large_table_ = new uint16[kMaxHashTableSize];
+    }
+    chosen = large_table_;
+  }
+
+  *table_size = entries;
+  memset(chosen, 0, entries * sizeof(*chosen));
+  return chosen;
+}
+} // end namespace internal
+
+// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
+// equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
+// empirically found that overlapping loads such as
+// UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+//
+// We have different versions for 64- and 32-bit; ideally we would avoid the
+// two functions and just inline the UNALIGNED_LOAD64 call into
+// GetUint32AtOffset, but GCC (at least as of 4.6) is seemingly not clever
+// enough to avoid loading the value multiple times then. For 64-bit, the load
+// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
+// done at GetUint32AtOffset() time.
+
+#ifdef ARCH_K8 // 64-bit variant: all eight bytes are loaded once, up front
+
+typedef uint64 EightBytesReference; // holds the loaded bytes themselves
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+ return UNALIGNED_LOAD64(ptr);
+}
+
+static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
+ assert(offset >= 0);
+ assert(offset <= 4); // a four-byte window must fit inside the eight loaded bytes
+ return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
+}
+
+#else // other targets: keep only the pointer and load at extraction time
+
+typedef const char* EightBytesReference; // just remembers where the bytes live
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+ return ptr;
+}
+
+static inline uint32 GetUint32AtOffset(const char* v, int offset) {
+ assert(offset >= 0);
+ assert(offset <= 4); // same contract as the 64-bit variant
+ return UNALIGNED_LOAD32(v + offset);
+}
+
+#endif
+
+// Flat array compression that does not emit the "uncompressed length"
+// prefix. Compresses "input" string to the "*op" buffer.
+//
+// REQUIRES: "input" is at most "kBlockSize" bytes long.
+// REQUIRES: "op" points to an array of memory that is at least
+// "MaxCompressedLength(input.size())" in size.
+// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
+// REQUIRES: "table_size" is a power of two
+//
+// Returns an "end" pointer into "op" buffer.
+// "end - op" is the compressed size of "input".
+namespace internal {
+char* CompressFragment(const char* input,
+ size_t input_size,
+ char* op,
+ uint16* table,
+ const int table_size) {
+ // "ip" is the input pointer, and "op" is the output pointer.
+ const char* ip = input;
+ assert(input_size <= kBlockSize);
+ assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ const int shift = 32 - Bits::Log2Floor(table_size); // hashes keep only their top log2(table_size) bits
+ assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
+ const char* ip_end = input + input_size;
+ const char* base_ip = ip; // table entries are stored as offsets from here
+ // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
+ // [next_emit, ip_end) after the main loop.
+ const char* next_emit = ip;
+
+ const size_t kInputMarginBytes = 15;
+ if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
+ const char* ip_limit = input + input_size - kInputMarginBytes;
+
+ for (uint32 next_hash = Hash(++ip, shift); ; ) { // scanning starts at input + 1
+ assert(next_emit < ip);
+ // The body of this loop calls EmitLiteral once and then EmitCopy one or
+ // more times. (The exception is that when we're close to exhausting
+ // the input we goto emit_remainder.)
+ //
+ // In the first iteration of this loop we're just starting, so
+ // there's nothing to copy, so calling EmitLiteral once is
+ // necessary. And we only start a new iteration when the
+ // current iteration has determined that a call to EmitLiteral will
+ // precede the next call to EmitCopy (if any).
+ //
+ // Step 1: Scan forward in the input looking for a 4-byte-long match.
+ // If we get close to exhausting the input then goto emit_remainder.
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match is
+ // found, immediately go back to looking at every byte. This is a small
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
+ // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
+ // win since the compressor quickly "realizes" the data is incompressible
+ // and doesn't bother looking for matches everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since the
+ // last match; dividing it by 32 (ie. right-shifting by five) gives the
+ // number of bytes to move ahead for each iteration.
+ uint32 skip = 32;
+
+ const char* next_ip = ip;
+ const char* candidate;
+ do {
+ ip = next_ip;
+ uint32 hash = next_hash;
+ assert(hash == Hash(ip, shift));
+ uint32 bytes_between_hash_lookups = skip >> 5;
+ skip += bytes_between_hash_lookups;
+ next_ip = ip + bytes_between_hash_lookups;
+ if (PREDICT_FALSE(next_ip > ip_limit)) {
+ goto emit_remainder;
+ }
+ next_hash = Hash(next_ip, shift);
+ candidate = base_ip + table[hash]; // earlier position recorded for this hash value
+ assert(candidate >= base_ip);
+ assert(candidate < ip);
+
+ table[hash] = ip - base_ip;
+ } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));
+
+ // Step 2: A 4-byte match has been found. We'll later see if more
+ // than 4 bytes match. But, prior to the match, input
+ // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
+ assert(next_emit + 16 <= ip_end);
+ op = EmitLiteral(op, next_emit, ip - next_emit, true);
+
+ // Step 3: Call EmitCopy, and then see if another EmitCopy could
+ // be our next move. Repeat until we find no match for the
+ // input immediately after what was consumed by the last EmitCopy call.
+ //
+ // If we exit this loop normally then we need to call EmitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can exit
+ // this loop via goto if we get close to exhausting the input.
+ EightBytesReference input_bytes;
+ uint32 candidate_bytes = 0;
+
+ do {
+ // We have a 4-byte match at ip, and no need to emit any
+ // "literal bytes" prior to ip.
+ const char* base = ip;
+ int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+ ip += matched;
+ size_t offset = base - candidate;
+ assert(0 == memcmp(base, candidate, matched));
+ op = EmitCopy(op, offset, matched);
+ // We could immediately start working at ip now, but to improve
+ // compression we first update table[Hash(ip - 1, ...)].
+ const char* insert_tail = ip - 1;
+ next_emit = ip;
+ if (PREDICT_FALSE(ip >= ip_limit)) {
+ goto emit_remainder;
+ }
+ input_bytes = GetEightBytesAt(insert_tail); // covers ip - 1 through ip + 6
+ uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
+ table[prev_hash] = ip - base_ip - 1;
+ uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
+ candidate = base_ip + table[cur_hash];
+ candidate_bytes = UNALIGNED_LOAD32(candidate);
+ table[cur_hash] = ip - base_ip;
+ } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
+
+ next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift); // hash of the four bytes at ip + 1
+ ++ip;
+ }
+ }
+
+ emit_remainder:
+ // Emit the remaining bytes as a literal
+ if (next_emit < ip_end) {
+ op = EmitLiteral(op, next_emit, ip_end - next_emit, false); // no input margin left, so no fast path
+ }
+
+ return op;
+}
+} // end namespace internal
+
+// Signature of output types needed by decompression code.
+// The decompression code is templatized on a type that obeys this
+// signature so that we do not pay virtual function call overhead in
+// the middle of a tight decompression loop.
+//
+// class DecompressionWriter {
+// public:
+// // Called before decompression
+// void SetExpectedLength(size_t length);
+//
+// // Called after decompression
+// bool CheckLength() const;
+//
+// // Called repeatedly during decompression
+// bool Append(const char* ip, size_t length);
+// bool AppendFromSelf(uint32 offset, size_t length);
+//
+// // The rules for how TryFastAppend differs from Append are somewhat
+// // convoluted:
+// //
+// // - TryFastAppend is allowed to decline (return false) at any
+// // time, for any reason -- just "return false" would be
+// // a perfectly legal implementation of TryFastAppend.
+// // The intention is for TryFastAppend to allow a fast path
+// // in the common case of a small append.
+// // - TryFastAppend is allowed to read up to <available> bytes
+// // from the input buffer, whereas Append is allowed to read
+// // <length>. However, if it returns true, it must leave
+// // at least five (kMaximumTagLength) bytes in the input buffer
+// // afterwards, so that there is always enough space to read the
+// // next tag without checking for a refill.
+// // - TryFastAppend must always decline (return false)
+// // if <length> is 61 or more, as in this case the literal length is not
+// // decoded fully. In practice, this should not be a big problem,
+// // as it is unlikely that one would implement a fast path accepting
+// // this much data.
+// //
+// bool TryFastAppend(const char* ip, size_t available, size_t length);
+// };
+
+
+// Helper class for decompression
+class SnappyDecompressor {
+ private:
+ Source* reader_; // Underlying source of bytes to decompress
+ const char* ip_; // Points to next buffered byte
+ const char* ip_limit_; // Points just past buffered bytes
+ uint32 peeked_; // Bytes peeked from reader (need to skip)
+ bool eof_; // Hit end of input without an error?
+ char scratch_[kMaximumTagLength]; // See RefillTag().
+
+ // Ensure that all of the tag metadata for the next tag is available
+ // in [ip_..ip_limit_-1]. Also ensures that [ip,ip+4] is readable even
+ // if (ip_limit_ - ip_ < 5).
+ //
+ // Returns true on success, false on error or end of input.
+ bool RefillTag();
+
+ public:
+ explicit SnappyDecompressor(Source* reader)
+ : reader_(reader),
+ ip_(NULL),
+ ip_limit_(NULL),
+ peeked_(0),
+ eof_(false) {
+ }
+
+ ~SnappyDecompressor() {
+ // Advance past any bytes we peeked at from the reader
+ reader_->Skip(peeked_);
+ }
+
+ // Returns true iff we have hit the end of the input without an error.
+ bool eof() const {
+ return eof_;
+ }
+
+ // Read the uncompressed length stored at the start of the compressed data.
+ // On success, stores the length in *result and returns true.
+ // On failure, returns false.
+ bool ReadUncompressedLength(uint32* result) {
+ assert(ip_ == NULL); // Must not have read anything yet
+ // Length is encoded in 1..5 bytes
+ *result = 0;
+ uint32 shift = 0;
+ while (true) {
+ if (shift >= 32) return false; // a sixth length byte would be needed
+ size_t n;
+ const char* ip = reader_->Peek(&n);
+ if (n == 0) return false; // input ended inside the length prefix
+ const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
+ reader_->Skip(1);
+ uint32 val = c & 0x7f;
+ if (((val << shift) >> shift) != val) return false; // payload bits would not fit in 32 bits
+ *result |= val << shift;
+ if (c < 128) { // high bit clear: this was the last length byte
+ break;
+ }
+ shift += 7;
+ }
+ return true;
+ }
+
+ // Decode tags from the input and hand them to *writer until the input ends
+ // or an error occurs. Returns nothing; see eof() for a clean end of input.
+ template <class Writer>
+ void DecompressAllTags(Writer* writer) {
+ const char* ip = ip_;
+
+ // We could have put this refill fragment only at the beginning of the loop.
+ // However, duplicating it at the end of each branch gives the compiler more
+ // scope to optimize the <ip_limit_ - ip> expression based on the local
+ // context, which overall increases speed.
+ #define MAYBE_REFILL() \
+ if (ip_limit_ - ip < kMaximumTagLength) { \
+ ip_ = ip; \
+ if (!RefillTag()) return; \
+ ip = ip_; \
+ }
+
+ MAYBE_REFILL();
+ for ( ;; ) {
+ const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+
+ if ((c & 0x3) == LITERAL) {
+ size_t literal_length = (c >> 2) + 1u;
+ if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+ assert(literal_length < 61);
+ ip += literal_length;
+ // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // will not return true unless there's already at least five spare
+ // bytes in addition to the literal.
+ continue;
+ }
+ if (PREDICT_FALSE(literal_length >= 61)) {
+ // Long literal.
+ const size_t literal_length_length = literal_length - 60;
+ literal_length =
+ (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ip += literal_length_length;
+ }
+
+ size_t avail = ip_limit_ - ip;
+ while (avail < literal_length) { // literal continues beyond the buffered bytes
+ if (!writer->Append(ip, avail)) return;
+ literal_length -= avail;
+ reader_->Skip(peeked_);
+ size_t n;
+ ip = reader_->Peek(&n);
+ avail = n;
+ peeked_ = avail;
+ if (avail == 0) return; // Premature end of input
+ ip_limit_ = ip + avail;
+ }
+ if (!writer->Append(ip, literal_length)) {
+ return;
+ }
+ ip += literal_length;
+ MAYBE_REFILL();
+ } else {
+ const uint32 entry = char_table[c];
+ const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const uint32 length = entry & 0xff;
+ ip += entry >> 11; // entry >> 11 is the number of bytes that follow the tag byte
+
+ // copy_offset/256 is encoded in bits 8..10. By just fetching
+ // those bits, we get copy_offset (since the bit-field starts at
+ // bit 8).
+ const uint32 copy_offset = entry & 0x700;
+ if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+ return;
+ }
+ MAYBE_REFILL();
+ }
+ }
+
+#undef MAYBE_REFILL
+ }
+};
+
+bool SnappyDecompressor::RefillTag() {
+ const char* ip = ip_;
+ if (ip == ip_limit_) {
+ // Fetch a new fragment from the reader
+ reader_->Skip(peeked_); // All peeked bytes are used up
+ size_t n;
+ ip = reader_->Peek(&n);
+ peeked_ = n;
+ if (n == 0) {
+ eof_ = true; // input ran out exactly on a tag boundary
+ return false;
+ }
+ ip_limit_ = ip + n;
+ }
+
+ // Read the tag character
+ assert(ip < ip_limit_);
+ const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
+ const uint32 entry = char_table[c];
+ const uint32 needed = (entry >> 11) + 1; // +1 byte for 'c'
+ assert(needed <= sizeof(scratch_));
+
+ // Read more bytes from reader if needed
+ uint32 nbuf = ip_limit_ - ip;
+ if (nbuf < needed) {
+ // Stitch together bytes from ip and reader to form the word
+ // contents. We store the needed bytes in "scratch_". They
+ // will be consumed immediately by the caller since we do not
+ // read more than we need.
+ memmove(scratch_, ip, nbuf);
+ reader_->Skip(peeked_); // All peeked bytes are used up
+ peeked_ = 0;
+ while (nbuf < needed) {
+ size_t length;
+ const char* src = reader_->Peek(&length);
+ if (length == 0) return false; // input ended mid-tag; eof_ is left unset
+ uint32 to_add = min<uint32>(needed - nbuf, length);
+ memcpy(scratch_ + nbuf, src, to_add);
+ nbuf += to_add;
+ reader_->Skip(to_add);
+ }
+ assert(nbuf == needed);
+ ip_ = scratch_;
+ ip_limit_ = scratch_ + needed;
+ } else if (nbuf < kMaximumTagLength) {
+ // Have enough bytes, but move into scratch_ so that we do not
+ // read past end of input
+ memmove(scratch_, ip, nbuf);
+ reader_->Skip(peeked_); // All peeked bytes are used up
+ peeked_ = 0;
+ ip_ = scratch_;
+ ip_limit_ = scratch_ + nbuf;
+ } else {
+ // Pass pointer to buffer returned by reader_.
+ ip_ = ip;
+ }
+ return true;
+}
+
+template <typename Writer>
+static bool InternalUncompress(Source* r, Writer* writer) {
+  // The length prefix comes first; without it no tag is decoded at all.
+  SnappyDecompressor decompressor(r);
+  uint32 expected = 0;
+  return decompressor.ReadUncompressedLength(&expected) &&
+         InternalUncompressAllTags(&decompressor, writer, expected);
+}
+
+template <typename Writer>
+static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
+                                      Writer* writer,
+                                      uint32 uncompressed_len) {
+  writer->SetExpectedLength(uncompressed_len);
+  decompressor->DecompressAllTags(writer);
+  writer->Flush();
+  // Success means the input ended cleanly AND the length checks out.
+  if (!decompressor->eof()) return false;
+  return writer->CheckLength();
+}
+
+bool GetUncompressedLength(Source* source, uint32* result) {
+  // A short-lived decompressor is used only to parse the length prefix.
+  return SnappyDecompressor(source).ReadUncompressedLength(result);
+}
+
+size_t Compress(Source* reader, Sink* writer) {
+ size_t written = 0; // total bytes appended to writer; this is the return value
+ size_t N = reader->Available(); // input bytes still to be compressed
+ char ulength[Varint::kMax32];
+ char* p = Varint::Encode32(ulength, N); // the output starts with the varint-encoded input length
+ writer->Append(ulength, p-ulength);
+ written += (p - ulength);
+
+ internal::WorkingMemory wmem;
+ char* scratch = NULL; // input staging buffer, allocated on first need
+ char* scratch_output = NULL; // output staging buffer, allocated on first need
+
+ while (N > 0) {
+ // Get next block to compress (without copying if possible)
+ size_t fragment_size;
+ const char* fragment = reader->Peek(&fragment_size);
+ assert(fragment_size != 0); // premature end of input
+ const size_t num_to_read = min(N, kBlockSize);
+ size_t bytes_read = fragment_size;
+
+ size_t pending_advance = 0; // bytes to Skip() after compressing straight from the reader's buffer
+ if (bytes_read >= num_to_read) {
+ // Buffer returned by reader is large enough
+ pending_advance = num_to_read;
+ fragment_size = num_to_read;
+ } else {
+ // Read into scratch buffer
+ if (scratch == NULL) {
+ // If this is the last iteration, we want to allocate N bytes
+ // of space, otherwise the max possible kBlockSize space.
+ // num_to_read contains exactly the correct value
+ scratch = new char[num_to_read];
+ }
+ memcpy(scratch, fragment, bytes_read);
+ reader->Skip(bytes_read);
+
+ while (bytes_read < num_to_read) { // NOTE(review): never exits if Peek() keeps returning 0 bytes -- confirm readers cannot do that
+ fragment = reader->Peek(&fragment_size);
+ size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
+ memcpy(scratch + bytes_read, fragment, n);
+ bytes_read += n;
+ reader->Skip(n);
+ }
+ assert(bytes_read == num_to_read);
+ fragment = scratch;
+ fragment_size = num_to_read;
+ }
+ assert(fragment_size == num_to_read);
+
+ // Get encoding table for compression
+ int table_size;
+ uint16* table = wmem.GetHashTable(num_to_read, &table_size);
+
+ // Compress input_fragment and append to dest
+ const int max_output = MaxCompressedLength(num_to_read);
+
+ // Need a scratch buffer for the output, in case the byte sink doesn't
+ // have room for us directly.
+ if (scratch_output == NULL) {
+ scratch_output = new char[max_output];
+ } else {
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ }
+ char* dest = writer->GetAppendBuffer(max_output, scratch_output);
+ char* end = internal::CompressFragment(fragment, fragment_size,
+ dest, table, table_size);
+ writer->Append(dest, end - dest);
+ written += (end - dest);
+
+ N -= num_to_read;
+ reader->Skip(pending_advance); // zero when the block was staged through scratch
+ }
+
+ delete[] scratch;
+ delete[] scratch_output;
+
+ return written;
+}
+
+// -----------------------------------------------------------------------
+// IOVec interfaces
+// -----------------------------------------------------------------------
+
+// A type that writes to an iovec.
+// Note that this is not a "ByteSink", but a type that matches the
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
+class SnappyIOVecWriter {
+ private:
+ const struct iovec* output_iov_;  // Caller-owned array of output buffers.
+ const size_t output_iov_count_;  // Number of entries in output_iov_.
+
+ // We are currently writing into output_iov_[curr_iov_index_].
+ size_t curr_iov_index_;
+
+ // Bytes written to output_iov_[curr_iov_index_] so far.
+ size_t curr_iov_written_;
+
+ // Total bytes decompressed into output_iov_ so far.
+ size_t total_written_;
+
+ // Maximum number of bytes that will be decompressed into output_iov_.
+ size_t output_limit_;
+
+ inline char* GetIOVecPointer(size_t index, size_t offset) {  // Byte `offset` of output_iov_[index].
+ return reinterpret_cast<char*>(output_iov_[index].iov_base) +
+ offset;
+ }
+
+ public:
+ // Does not take ownership of iov. iov must be valid during the
+ // entire lifetime of the SnappyIOVecWriter.
+ inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
+ : output_iov_(iov),
+ output_iov_count_(iov_count),
+ curr_iov_index_(0),
+ curr_iov_written_(0),
+ total_written_(0),
+ output_limit_(-1) {  // -1 converts to the largest size_t: no limit until SetExpectedLength().
+ }
+
+ inline void SetExpectedLength(size_t len) {  // Caps the total output at len bytes.
+ output_limit_ = len;
+ }
+
+ inline bool CheckLength() const {  // True iff exactly output_limit_ bytes have been written.
+ return total_written_ == output_limit_;
+ }
+
+ inline bool Append(const char* ip, size_t len) {  // Copies len bytes from ip, spilling across iovecs.
+ if (len > output_limit_ - total_written_ || (len > 0 && output_iov_count_ == 0)) {
+ return false;  // Over the limit (subtraction form cannot wrap), or there is no iovec to index.
+ }
+
+ while (len > 0) {
+ assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+ if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ // This iovec is full. Go to the next one.
+ if (curr_iov_index_ + 1 >= output_iov_count_) {
+ return false;
+ }
+ curr_iov_written_ = 0;
+ ++curr_iov_index_;
+ }
+
+ const size_t to_write = std::min(
+ len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
+ memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+ ip,
+ to_write);
+ curr_iov_written_ += to_write;
+ total_written_ += to_write;
+ ip += to_write;
+ len -= to_write;
+ }
+
+ return true;
+ }
+
+ inline bool TryFastAppend(const char* ip, size_t available, size_t len) {  // Writes nothing on false.
+ const size_t space_left = output_limit_ - total_written_;
+ if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
+ output_iov_count_ > 0 && output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ // Fast path, used for the majority (about 95%) of invocations.
+ char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
+ UnalignedCopy64(ip, ptr);  // Always copies 16 bytes; only len of them are counted below.
+ UnalignedCopy64(ip + 8, ptr + 8);
+ curr_iov_written_ += len;
+ total_written_ += len;
+ return true;
+ }
+
+ return false;
+ }
+
+ inline bool AppendFromSelf(size_t offset, size_t len) {  // Copies len bytes from offset bytes back.
+ if (offset > total_written_ || offset == 0) {
+ return false;
+ }
+ const size_t space_left = output_limit_ - total_written_;
+ if (len > space_left) {
+ return false;
+ }
+
+ // Locate the iovec from which we need to start the copy.
+ size_t from_iov_index = curr_iov_index_;
+ size_t from_iov_offset = curr_iov_written_;
+ while (offset > 0) {
+ if (from_iov_offset >= offset) {
+ from_iov_offset -= offset;
+ break;
+ }
+
+ offset -= from_iov_offset;
+ assert(from_iov_index > 0);
+ --from_iov_index;
+ from_iov_offset = output_iov_[from_iov_index].iov_len;
+ }
+
+ // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
+ // the current iovec.
+ while (len > 0) {
+ assert(from_iov_index <= curr_iov_index_);
+ if (from_iov_index != curr_iov_index_) {  // Source lies in an earlier iovec than the write cursor.
+ const size_t to_copy = std::min(
+ output_iov_[from_iov_index].iov_len - from_iov_offset,
+ len);
+ if (!Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy)) return false;
+ len -= to_copy;  // Append succeeded, so exactly to_copy bytes were written.
+ if (len > 0) {
+ ++from_iov_index;
+ from_iov_offset = 0;
+ }
+ } else {
+ assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+ size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
+ curr_iov_written_,
+ len);
+ if (to_copy == 0) {
+ // This iovec is full. Go to the next one.
+ if (curr_iov_index_ + 1 >= output_iov_count_) {
+ return false;
+ }
+ ++curr_iov_index_;
+ curr_iov_written_ = 0;
+ continue;
+ }
+ if (to_copy > len) {
+ to_copy = len;
+ }
+ IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
+ GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+ to_copy);
+ curr_iov_written_ += to_copy;
+ from_iov_offset += to_copy;
+ total_written_ += to_copy;
+ len -= to_copy;
+ }
+ }
+
+ return true;
+ }
+
+ inline void Flush() {}
+};
+
+bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+ const struct iovec* iov, size_t iov_cnt) {
+ ByteArraySource reader(compressed, compressed_length);
+ return RawUncompressToIOVec(&reader, iov, iov_cnt);
+}
+
+bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+ size_t iov_cnt) {
+ SnappyIOVecWriter output(iov, iov_cnt);
+ return InternalUncompress(compressed, &output);
+}
+
+// -----------------------------------------------------------------------
+// Flat array interfaces
+// -----------------------------------------------------------------------
+
+// A type that writes to a flat array.
+// Note that this is not a "ByteSink", but a type that matches the
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
+class SnappyArrayWriter {
+ private:
+ char* base_;
+ char* op_;
+ char* op_limit_;
+
+ public:
+ inline explicit SnappyArrayWriter(char* dst)
+ : base_(dst),
+ op_(dst),
+ op_limit_(dst) {
+ }
+
+ inline void SetExpectedLength(size_t len) {
+ op_limit_ = op_ + len;
+ }
+
+ inline bool CheckLength() const {
+ return op_ == op_limit_;
+ }
+
+ inline bool Append(const char* ip, size_t len) {
+ char* op = op_;
+ const size_t space_left = op_limit_ - op;
+ if (space_left < len) {
+ return false;
+ }
+ memcpy(op, ip, len);
+ op_ = op + len;
+ return true;
+ }
+
+ inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
+ char* op = op_;
+ const size_t space_left = op_limit_ - op;
+ if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
+ // Fast path, used for the majority (about 95%) of invocations.
+ UnalignedCopy64(ip, op);
+ UnalignedCopy64(ip + 8, op + 8);
+ op_ = op + len;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ inline bool AppendFromSelf(size_t offset, size_t len) {
+ char* op = op_;
+ const size_t space_left = op_limit_ - op;
+
+ // Check if we try to append from before the start of the buffer.
+ // Normally this would just be a check for "produced < offset",
+ // but "produced <= offset - 1u" is equivalent for every case
+ // except the one where offset==0, where the right side will wrap around
+ // to a very big number. This is convenient, as offset==0 is another
+ // invalid case that we also want to catch, so that we do not go
+ // into an infinite loop.
+ assert(op >= base_);
+ size_t produced = op - base_;
+ if (produced <= offset - 1u) {
+ return false;
+ }
+ if (len <= 16 && offset >= 8 && space_left >= 16) {
+ // Fast path, used for the majority (70-80%) of dynamic invocations.
+ UnalignedCopy64(op - offset, op);
+ UnalignedCopy64(op - offset + 8, op + 8);
+ } else {
+ if (space_left >= len + kMaxIncrementCopyOverflow) {
+ IncrementalCopyFastPath(op - offset, op, len);
+ } else {
+ if (space_left < len) {
+ return false;
+ }
+ IncrementalCopy(op - offset, op, len);
+ }
+ }
+
+ op_ = op + len;
+ return true;
+ }
+ inline size_t Produced() const {
+ return op_ - base_;
+ }
+ inline void Flush() {}
+};
+
+bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
+ ByteArraySource reader(compressed, n);
+ return RawUncompress(&reader, uncompressed);
+}
+
+bool RawUncompress(Source* compressed, char* uncompressed) {
+ SnappyArrayWriter output(uncompressed);
+ return InternalUncompress(compressed, &output);
+}
+
+bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+ size_t ulength;
+ if (!GetUncompressedLength(compressed, n, &ulength)) {
+ return false;
+ }
+ // On 32-bit builds: max_size() < kuint32max. Check for that instead
+ // of crashing (e.g., consider externally specified compressed data).
+ if (ulength > uncompressed->max_size()) {
+ return false;
+ }
+ STLStringResizeUninitialized(uncompressed, ulength);
+ return RawUncompress(compressed, n, string_as_array(uncompressed));
+}
+
+// A Writer that drops everything on the floor and just does validation
+class SnappyDecompressionValidator {
+ private:
+ size_t expected_;
+ size_t produced_;
+
+ public:
+ inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
+ inline void SetExpectedLength(size_t len) {
+ expected_ = len;
+ }
+ inline bool CheckLength() const {
+ return expected_ == produced_;
+ }
+ inline bool Append(const char* ip, size_t len) {
+ produced_ += len;
+ return produced_ <= expected_;
+ }
+ inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+ return false;
+ }
+ inline bool AppendFromSelf(size_t offset, size_t len) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (produced_ <= offset - 1u) return false;
+ produced_ += len;
+ return produced_ <= expected_;
+ }
+ inline void Flush() {}
+};
+
+bool IsValidCompressedBuffer(const char* compressed, size_t n) {
+ ByteArraySource reader(compressed, n);
+ SnappyDecompressionValidator writer;
+ return InternalUncompress(&reader, &writer);
+}
+
+bool IsValidCompressed(Source* compressed) {
+ SnappyDecompressionValidator writer;
+ return InternalUncompress(compressed, &writer);
+}
+
+void RawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length) {
+ ByteArraySource reader(input, input_length);
+ UncheckedByteArraySink writer(compressed);
+ Compress(&reader, &writer);
+
+ // Compute how many bytes were added
+ *compressed_length = (writer.CurrentDestination() - compressed);
+}
+
+size_t Compress(const char* input, size_t input_length, string* compressed) {
+ // Pre-grow the buffer to the max length of the compressed output
+ compressed->resize(MaxCompressedLength(input_length));
+
+ size_t compressed_length;
+ RawCompress(input, input_length, string_as_array(compressed),
+ &compressed_length);
+ compressed->resize(compressed_length);
+ return compressed_length;
+}
+
+// -----------------------------------------------------------------------
+// Sink interface
+// -----------------------------------------------------------------------
+
+// A type that decompresses into a Sink. The template parameter
+// Allocator must export one method "char* Allocate(int size);", which
+// allocates a buffer of "size" and appends that to the destination.
+template <typename Allocator>
+class SnappyScatteredWriter {
+ Allocator allocator_;
+
+ // We need random access into the data generated so far. Therefore
+ // we keep track of all of the generated data as an array of blocks.
+ // All of the blocks except the last have length kBlockSize.
+ vector<char*> blocks_;
+ size_t expected_;
+
+ // Total size of all fully generated blocks so far
+ size_t full_size_;
+
+ // Pointer into current output block
+ char* op_base_; // Base of output block
+ char* op_ptr_; // Pointer to next unfilled byte in block
+ char* op_limit_; // Pointer just past block
+
+ inline size_t Size() const {
+ return full_size_ + (op_ptr_ - op_base_);
+ }
+
+ bool SlowAppend(const char* ip, size_t len);
+ bool SlowAppendFromSelf(size_t offset, size_t len);
+
+ public:
+ inline explicit SnappyScatteredWriter(const Allocator& allocator)
+ : allocator_(allocator),
+ full_size_(0),
+ op_base_(NULL),
+ op_ptr_(NULL),
+ op_limit_(NULL) {
+ }
+
+ inline void SetExpectedLength(size_t len) {
+ assert(blocks_.empty());
+ expected_ = len;
+ }
+
+ inline bool CheckLength() const {
+ return Size() == expected_;
+ }
+
+ // Return the number of bytes actually uncompressed so far
+ inline size_t Produced() const {
+ return Size();
+ }
+
+ inline bool Append(const char* ip, size_t len) {
+ size_t avail = op_limit_ - op_ptr_;
+ if (len <= avail) {
+ // Fast path
+ memcpy(op_ptr_, ip, len);
+ op_ptr_ += len;
+ return true;
+ } else {
+ return SlowAppend(ip, len);
+ }
+ }
+
+ inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+ char* op = op_ptr_;
+ const int space_left = op_limit_ - op;
+ if (length <= 16 && available >= 16 + kMaximumTagLength &&
+ space_left >= 16) {
+ // Fast path, used for the majority (about 95%) of invocations.
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ op_ptr_ = op + length;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ inline bool AppendFromSelf(size_t offset, size_t len) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u < op_ptr_ - op_base_) {
+ const size_t space_left = op_limit_ - op_ptr_;
+ if (space_left >= len + kMaxIncrementCopyOverflow) {
+ // Fast path: src and dst in current block.
+ IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len);
+ op_ptr_ += len;
+ return true;
+ }
+ }
+ return SlowAppendFromSelf(offset, len);
+ }
+
+ // Called at the end of the decompress. We ask the allocator
+ // to write all blocks to the sink.
+ inline void Flush() { allocator_.Flush(Produced()); }
+};
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
+ size_t avail = op_limit_ - op_ptr_;
+ while (len > avail) {
+ // Completely fill this block
+ memcpy(op_ptr_, ip, avail);
+ op_ptr_ += avail;
+ assert(op_limit_ - op_ptr_ == 0);
+ full_size_ += (op_ptr_ - op_base_);
+ len -= avail;
+ ip += avail;
+
+ // Bounds check
+ if (full_size_ + len > expected_) {
+ return false;
+ }
+
+ // Make new block
+ size_t bsize = min<size_t>(kBlockSize, expected_ - full_size_);
+ op_base_ = allocator_.Allocate(bsize);
+ op_ptr_ = op_base_;
+ op_limit_ = op_base_ + bsize;
+ blocks_.push_back(op_base_);
+ avail = bsize;
+ }
+
+ memcpy(op_ptr_, ip, len);
+ op_ptr_ += len;
+ return true;
+}
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
+ size_t len) {
+ // Overflow check
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ const size_t cur = Size();
+ if (offset - 1u >= cur) return false;
+ if (expected_ - cur < len) return false;
+
+ // Currently we shouldn't ever hit this path because Compress() chops the
+ // input into blocks and does not create cross-block copies. However, it is
+ // nice if we do not rely on that, since we can get better compression if we
+ // allow cross-block copies and thus might want to change the compressor in
+ // the future.
+ size_t src = cur - offset;
+ while (len-- > 0) {
+ char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
+ Append(&c, 1);
+ src++;
+ }
+ return true;
+}
+
+class SnappySinkAllocator {
+ public:
+ explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}  // dest receives the blocks in Flush().
+ ~SnappySinkAllocator() {}
+
+ char* Allocate(int size) {  // Returns a new block of `size` bytes and records it for Flush().
+ Datablock block(new char[size], size);
+ blocks_.push_back(block);
+ return block.data;
+ }
+
+ // We flush only at the end, because the writer wants
+ // random access to the blocks and once we hand the
+ // block over to the sink, we can't access it anymore.
+ // Also we don't write more than has been actually written
+ // to the blocks.
+ void Flush(size_t size) {  // size: total number of valid bytes across all recorded blocks.
+ size_t size_written = 0;
+ size_t block_size;
+ for (size_t i = 0; i < blocks_.size(); ++i) {  // size_t index: blocks_.size() is unsigned.
+ block_size = min<size_t>(blocks_[i].size, size - size_written);
+ dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
+ &SnappySinkAllocator::Deleter, NULL);
+ size_written += block_size;
+ }
+ blocks_.clear();
+ }
+
+ private:
+ struct Datablock {  // One allocated block: its base pointer and its allocated size.
+ char* data;
+ size_t size;
+ Datablock(char* p, size_t s) : data(p), size(s) {}
+ };
+
+ static void Deleter(void* arg, const char* bytes, size_t size) {  // Passed to the sink in Flush().
+ delete[] bytes;
+ }
+
+ Sink* dest_;
+ vector<Datablock> blocks_;
+
+ // Note: copying this object is allowed
+};
+
+size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
+ SnappySinkAllocator allocator(uncompressed);
+ SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+ InternalUncompress(compressed, &writer);
+ return writer.Produced();
+}
+
+bool Uncompress(Source* compressed, Sink* uncompressed) {
+ // Read the uncompressed length from the front of the compressed input
+ SnappyDecompressor decompressor(compressed);
+ uint32 uncompressed_len = 0;
+ if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
+ return false;
+ }
+
+ char c;
+ size_t allocated_size;
+ char* buf = uncompressed->GetAppendBufferVariable(
+ 1, uncompressed_len, &c, 1, &allocated_size);
+
+ // If we can get a flat buffer, then use it, otherwise do block by block
+ // uncompression
+ if (allocated_size >= uncompressed_len) {
+ SnappyArrayWriter writer(buf);
+ bool result = InternalUncompressAllTags(
+ &decompressor, &writer, uncompressed_len);
+ uncompressed->Append(buf, writer.Produced());
+ return result;
+ } else {
+ SnappySinkAllocator allocator(uncompressed);
+ SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+ return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len);
+ }
+}
+
+} // end namespace snappy
diff --git a/src/stores/libsnappy/snappy.h b/src/stores/libsnappy/snappy.h
new file mode 100644
index 0000000..0225560
--- /dev/null
+++ b/src/stores/libsnappy/snappy.h
@@ -0,0 +1,228 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-AUG-30
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+// Copyright 2005 and onwards Google Inc.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// A light-weight compression algorithm. It is designed for speed of
+// compression and decompression, rather than for the utmost in space
+// savings.
+//
+// For getting better compression ratios when you are compressing data
+// with long repeated sequences or compressing data that is similar to
+// other data, while still compressing fast, you might look at first
+// using BMDiff and then compressing the output of BMDiff with
+// Snappy.
+
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
+#define THIRD_PARTY_SNAPPY_SNAPPY_H__
+
+#include <stddef.h>
+#include <string>
+
+#include "snappy-stubs-public.h"
+
+namespace snappy {
+ class Source;
+ class Sink;
+
+ // ------------------------------------------------------------------------
+ // Generic compression/decompression routines.
+ // ------------------------------------------------------------------------
+
+ // Compress the bytes read from "*source" and append to "*sink". Return the
+ // number of bytes written.
+ size_t Compress(Source* source, Sink* sink);
+
+ // Find the uncompressed length of the given stream, as given by the header.
+ // Note that the true length could deviate from this; the stream could e.g.
+ // be truncated.
+ //
+ // Also note that this leaves "*source" in a state that is unsuitable for
+ // further operations, such as RawUncompress(). You will need to rewind
+ // or recreate the source yourself before attempting any further calls.
+ bool GetUncompressedLength(Source* source, uint32* result);
+
+ // ------------------------------------------------------------------------
+ // Higher-level string based routines (should be sufficient for most users)
+ // ------------------------------------------------------------------------
+
+ // Sets "*output" to the compressed version of "input[0,input_length-1]".
+ // Original contents of *output are lost.
+ //
+ // REQUIRES: "input[]" is not an alias of "*output".
+ size_t Compress(const char* input, size_t input_length, string* output);
+
+ // Decompresses "compressed[0,compressed_length-1]" to "*uncompressed".
+ // Original contents of "*uncompressed" are lost.
+ //
+ // REQUIRES: "compressed[]" is not an alias of "*uncompressed".
+ //
+ // returns false if the message is corrupted and could not be decompressed
+ bool Uncompress(const char* compressed, size_t compressed_length,
+ string* uncompressed);
+
+ // Decompresses "compressed" to "*uncompressed".
+ //
+ // returns false if the message is corrupted and could not be decompressed
+ bool Uncompress(Source* compressed, Sink* uncompressed);
+
+ // This routine uncompresses as much of the "compressed" as possible
+ // into sink. It returns the number of valid bytes added to sink
+ // (extra invalid bytes may have been added due to errors; the caller
+ // should ignore those). The emitted data typically has length
+ // GetUncompressedLength(), but may be shorter if an error is
+ // encountered.
+ size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed);
+
+ // ------------------------------------------------------------------------
+ // Lower-level character array based routines. May be useful for
+ // efficiency reasons in certain circumstances.
+ // ------------------------------------------------------------------------
+
+ // REQUIRES: "compressed" must point to an area of memory that is at
+ // least "MaxCompressedLength(input_length)" bytes in length.
+ //
+ // Takes the data stored in "input[0..input_length-1]" and stores
+ // it in the array pointed to by "compressed".
+ //
+ // "*compressed_length" is set to the length of the compressed output.
+ //
+ // Example:
+ // char* output = new char[snappy::MaxCompressedLength(input_length)];
+ // size_t output_length;
+ // RawCompress(input, input_length, output, &output_length);
+ // ... Process(output, output_length) ...
+ // delete [] output;
+ void RawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length);
+
+ // Given data in "compressed[0..compressed_length-1]" generated by
+ // calling the snappy::Compress routine, this routine
+ // stores the uncompressed data to
+ // uncompressed[0..GetUncompressedLength(compressed,compressed_length)-1]
+ // returns false if the message is corrupted and could not be decompressed
+ bool RawUncompress(const char* compressed, size_t compressed_length,
+ char* uncompressed);
+
+ // Given data from the byte source 'compressed' generated by calling
+ // the snappy::Compress routine, this routine stores the uncompressed
+ // data to
+ // uncompressed[0..GetUncompressedLength(compressed)-1]
+ // returns false if the message is corrupted and could not be decompressed
+ bool RawUncompress(Source* compressed, char* uncompressed);
+
+ // Given data in "compressed[0..compressed_length-1]" generated by
+ // calling the snappy::Compress routine, this routine
+ // stores the uncompressed data to the iovec "iov". The number of physical
+ // buffers in "iov" is given by iov_cnt and their cumulative size
+ // must be at least GetUncompressedLength(compressed). The individual buffers
+ // in "iov" must not overlap with each other.
+ //
+ // returns false if the message is corrupted and could not be decompressed
+ bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+ const struct iovec* iov, size_t iov_cnt);
+
+ // Given data from the byte source 'compressed' generated by calling
+ // the snappy::Compress routine, this routine stores the uncompressed
+ // data to the iovec "iov". The number of physical
+ // buffers in "iov" is given by iov_cnt and their cumulative size
+ // must be at least GetUncompressedLength(compressed). The individual buffers
+ // in "iov" must not overlap with each other.
+ //
+ // returns false if the message is corrupted and could not be decompressed
+ bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+ size_t iov_cnt);
+
+ // Returns the maximal size of the compressed representation of
+ // input data that is "source_bytes" bytes in length;
+ size_t MaxCompressedLength(size_t source_bytes);
+
+ // REQUIRES: "compressed[]" was produced by RawCompress() or Compress()
+ // Returns true and stores the length of the uncompressed data in
+ // *result normally. Returns false on parsing error.
+ // This operation takes O(1) time.
+ bool GetUncompressedLength(const char* compressed, size_t compressed_length,
+ size_t* result);
+
+ // Returns true iff the contents of "compressed[]" can be uncompressed
+ // successfully. Does not return the uncompressed data. Takes
+ // time proportional to compressed_length, but is usually at least
+ // a factor of four faster than actual decompression.
+ bool IsValidCompressedBuffer(const char* compressed,
+ size_t compressed_length);
+
+ // Returns true iff the contents of "compressed" can be uncompressed
+ // successfully. Does not return the uncompressed data. Takes
+ // time proportional to *compressed length, but is usually at least
+ // a factor of four faster than actual decompression.
+ // On success, consumes all of *compressed. On failure, consumes an
+ // unspecified prefix of *compressed.
+ bool IsValidCompressed(Source* compressed);
+
+ // The size of a compression block. Note that many parts of the compression
+ // code assume that kBlockSize <= 65536; in particular, the hash table
+ // can only store 16-bit offsets, and EmitCopy() also assumes the offset
+ // is 65535 bytes or less. Note also that if you change this, it will
+ // affect the framing format (see framing_format.txt).
+ //
+ // Note that there might be older data around that is compressed with larger
+ // block sizes, so the decompression code should not rely on the
+ // non-existence of long backreferences.
+ static const int kBlockLog = 16;
+ static const size_t kBlockSize = 1 << kBlockLog;
+
+ static const int kMaxHashTableBits = 14;
+ static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
+} // end namespace snappy
+
+#endif // THIRD_PARTY_SNAPPY_SNAPPY_H__
diff --git a/src/stores/ovOverlap.C b/src/stores/ovOverlap.C
index d4b70de..801246f 100644
--- a/src/stores/ovOverlap.C
+++ b/src/stores/ovOverlap.C
@@ -23,6 +23,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2016-OCT-17
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -40,7 +44,7 @@ ovOverlap::toString(char *str,
switch (type) {
case ovOverlapAsHangs:
- sprintf(str, "%10"F_U32P" %10"F_U32P" %c %6"F_S32P" %6"F_U32P" %6"F_S32P" %7.6f%s%s",
+ sprintf(str, "%10" F_U32P " %10" F_U32P " %c %6" F_S32P " %6" F_U32P " %6" F_S32P " %7.6f%s%s",
a_iid, b_iid,
flipped() ? 'I' : 'N',
a_hang(), span(), b_hang(),
@@ -50,7 +54,7 @@ ovOverlap::toString(char *str,
break;
case ovOverlapAsCoords:
- sprintf(str, "%10"F_U32P" %10"F_U32P" %c %6"F_U32P" %6"F_U32P" %6"F_U32P" %6"F_U32P" %6"F_U32P" %7.6f%s",
+ sprintf(str, "%10" F_U32P " %10" F_U32P " %c %6" F_U32P " %6" F_U32P " %6" F_U32P " %6" F_U32P " %6" F_U32P " %7.6f%s",
a_iid, b_iid,
flipped() ? 'I' : 'N',
span(),
@@ -61,7 +65,7 @@ ovOverlap::toString(char *str,
break;
case ovOverlapAsRaw:
- sprintf(str, "%10"F_U32P" %10"F_U32P" %c %6"F_U32P" %6"F_U64P" %6"F_U64P" %6"F_U64P" %6"F_U64P" %7.6f %s %s %s%s",
+ sprintf(str, "%10" F_U32P " %10" F_U32P " %c %6" F_U32P " %6" F_OVP " %6" F_OVP " %6" F_OVP " %6" F_OVP " %7.6f %s %s %s%s",
a_iid, b_iid,
flipped() ? 'I' : 'N',
span(),
@@ -75,7 +79,7 @@ ovOverlap::toString(char *str,
break;
case ovOverlapAsCompat:
- sprintf(str, "%8"F_U32P" %8"F_U32P" %c %6d %6d %5.2f %5.2f%s",
+ sprintf(str, "%8" F_U32P " %8" F_U32P " %c %6d %6d %5.2f %5.2f%s",
a_iid,
b_iid,
dat.ovl.flipped ? 'I' : 'N',
@@ -87,7 +91,7 @@ ovOverlap::toString(char *str,
case ovOverlapAsPaf:
// miniasm/map expects entries to be separated by tabs
// no padding spaces on names we don't confuse read identifiers
- sprintf(str, "%"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%c\t%"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%6"F_U32P"\t%6"F_U32P" %s",
+ sprintf(str, "%" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%c\t%" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%6" F_U32P "\t%6" F_U32P " %s",
a_iid,
(g->gkStore_getRead(a_iid)->gkRead_sequenceLength()), a_bgn(), a_end(),
flipped() ? '-' : '+',
@@ -143,5 +147,7 @@ ovOverlap::swapIDs(ovOverlap const &orig) {
// Whatever alignment orientation was in the original, it is opposite now.
+#ifndef DO_NOT_STORE_ALIGN_PTR
dat.ovl.alignSwapped = ! orig.dat.ovl.alignSwapped;
+#endif
}
diff --git a/src/stores/ovStore.H b/src/stores/ovOverlap.H
similarity index 51%
copy from src/stores/ovStore.H
copy to src/stores/ovOverlap.H
index c178efa..362495e 100644
--- a/src/stores/ovStore.H
+++ b/src/stores/ovOverlap.H
@@ -13,17 +13,13 @@
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
- * Modifications by:
+ * This file is derived from:
*
- * Brian P. Walenz from 2014-DEC-09 to 2015-JUL-01
- * are Copyright 2014-2015 Battelle National Biodefense Institute, and
- * are subject to the BSD 3-Clause License
+ * src/stores/ovStore.H
*
- * Brian P. Walenz beginning on 2015-OCT-12
- * are a 'United States Government Work', and
- * are released in the public domain
+ * Modifications by:
*
- * Sergey Koren beginning on 2016-MAR-11
+ * Brian P. Walenz beginning on 2016-OCT-24
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -31,11 +27,9 @@
* full conditions and disclaimers for each license.
*/
-#ifndef AS_OVERLAP_H
-#define AS_OVERLAP_H
+#ifndef AS_OVOVERLAP_H
+#define AS_OVOVERLAP_H
-#include "AS_global.H"
-#include "gkStore.H"
// Error rates are encoded as a 12-bit fixed-point value. This gives us up to 40.95% error, with
// 0.01% resolution. Changing the number of bits WILL break the carefully structured
@@ -55,15 +49,17 @@
// The old implementation allowed up to 20-bit reads, and used 3 32-bit words. No alignment was
// stored.
//
-// The new implementation uses either 5 (for EXACTLY 16-bit reads) or 6 32-bit words. It uses 2
-// 32-bit words for storing a pointer to the alignments.
-//
-// Note that the 5-word version must use uint32, and the 6-word version must use uint64.
-// Note that the 5-word version needs to split out the alignPos into two words.
+// The new implementation uses either 3 32-bit words (for EXACTLY 16-bit reads), 2 64-bit words
+// (for up to 21-bit reads) or 6 32-bit words. It can optionally use 64 bits for storing a pointer
+// to the alignments, though this is not actually implemented.
+
+#define DO_NOT_STORE_ALIGN_PTR
+
#if AS_MAX_READLEN_BITS < 17
-#define ovOverlapNWORDS 5
+#define ovOverlapNWORDS 3
+#define ovOverlapWORDSZ 32
typedef uint32 ovOverlapWORD;
#define F_OV F_U32
#define F_OVP F_U32P
@@ -84,15 +80,20 @@ public:
ovOverlapWORD forDUP : 1; // 1
ovOverlapWORD forUTG : 1; // 1
+#ifndef DO_NOT_STORE_ALIGN_PTR
+#undef ovOverlapNWORDS
+#define ovOverlapNWORDS 5
ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
ovOverlapWORD alignFile : 19; // Which file of overlap alignments
ovOverlapWORD alignPosHi : 12; // Position in that file (high-order bits)
ovOverlapWORD alignPosLo : 32; // Position in that file (low-order bits)
+#endif
};
#elif AS_MAX_READLEN_BITS < 22
-#define ovOverlapNWORDS 3
+#define ovOverlapNWORDS 2
+#define ovOverlapWORDSZ 64
typedef uint64 ovOverlapWORD;
#define F_OV F_U64
#define F_OVP F_U64P
@@ -113,14 +114,19 @@ public:
ovOverlapWORD span : AS_MAX_READLEN_BITS; // 17-21
ovOverlapWORD extra2 : 64 - 3 * AS_MAX_READLEN_BITS; // Between 13 and 1
+#ifndef DO_NOT_STORE_ALIGN_PTR
+#undef ovOverlapNWORDS
+#define ovOverlapNWORDS 3
ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
ovOverlapWORD alignFile : 19; // Which file of overlap alignments
ovOverlapWORD alignPos : 44; // Position in that file
+#endif
};
#else
-#define ovOverlapNWORDS 8
+#define ovOverlapNWORDS 6
+#define ovOverlapWORDSZ 32
typedef uint32 ovOverlapWORD;
#define F_OV F_U32
#define F_OVP F_U32P
@@ -140,10 +146,14 @@ public:
ovOverlapWORD forUTG : 1; // 1
ovOverlapWORD extra : 32 - AS_MAX_EVALUE_BITS - 1 - 1 - 1 - 1; // Between 15 and 7
+#ifndef DO_NOT_STORE_ALIGN_PTR
+#undef ovOverlapNWORDS
+#define ovOverlapNWORDS 8
ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
ovOverlapWORD alignFile : 19; // Which file of overlap alignments
ovOverlapWORD alignPosHi : 12; // Position in that file (high-order bits)
ovOverlapWORD alignPosLo : 32; // Position in that file (low-order bits)
+#endif
};
#endif
@@ -164,6 +174,7 @@ enum ovOverlapDisplayType {
class ovOverlap {
private:
ovOverlap() {
+ g = NULL;
clear();
};
@@ -288,13 +299,13 @@ public:
void swapIDs(ovOverlap const &orig);
void clear(void) {
- dat.dat[0] = 0;
- dat.dat[1] = 0;
- dat.dat[2] = 0;
-#if (ovOverlapNWORDS == 5)
- dat.dat[3] = 0;
- dat.dat[4] = 0;
-#endif
+ //g = NULL; // Explicitly DO NOT clear the pointer to gkpStore.
+
+ for (uint32 ii=0; ii<ovOverlapNWORDS; ii++)
+ dat.dat[ii] = 0;
+
+ a_iid = 0;
+ b_iid = 0;
};
bool
@@ -332,345 +343,4 @@ public:
};
-// The default, no flags, is to open for normal overlaps, read only. Normal overlaps mean they
-// have only the B id, i.e., they are in a fully built store.
-//
-// Output of overlapper (input to store building) should be ovFileFullWrite. The specialized
-// ovFileFullWriteNoCounts is used internally by store creation.
-//
-enum ovFileType {
- ovFileNormal = 0, // Reading of b_id overlaps (aka store files)
- ovFileNormalWrite = 1, // Writing of b_id overlaps
- ovFileFull = 2, // Reading of a_id+b_id overlaps (aka dump files)
- ovFileFullWrite = 3, // Writing of a_id+b_id overlaps
- ovFileFullWriteNoCounts = 4 // Writing of a_id+b_id overlaps, omitting the counts of olaps per read
-};
-
-
-
-
-class ovFile {
-public:
- ovFile(const char *name,
- ovFileType type = ovFileNormal,
- uint32 bufferSize = 1 * 1024 * 1024);
- ~ovFile();
-
- void flushOverlaps(void);
-
- void writeOverlap(ovOverlap *overlap);
- void writeOverlaps(ovOverlap *overlaps, uint64 overlapLen);
-
- bool readOverlap(ovOverlap *overlap);
- uint64 readOverlaps(ovOverlap *overlaps, uint64 overlapMax);
-
- void seekOverlap(off_t overlap);
-
- // The size of an overlap record is 1 or 2 IDs + the size of a word times the number of words.
- uint64 recordSize(void) {
- return(sizeof(uint32) * ((_isNormal) ? 1 : 2) + sizeof(ovOverlapWORD) * ovOverlapNWORDS);
- };
-
-private:
- uint32 _bufferLen; // length of valid data in the buffer
- uint32 _bufferPos; // position the read is at in the buffer
- uint32 _bufferMax; // allocated size of the buffer
- uint32 *_buffer;
-
- uint32 _olapsPerReadAlloc;
- uint32 _olapsPerReadLast;
- uint32 *_olapsPerRead;
-
- bool _isOutput; // if true, we can writeOverlap()
- bool _isSeekable; // if true, we can seekOverlap()
- bool _isNormal; // if true, 3 words per overlap, else 4
-
- compressedFileReader *_reader;
- compressedFileWriter *_writer;
-
- char _prefix[FILENAME_MAX];
- FILE *_file;
-};
-
-
-
-
-
-class ovStoreInfo {
-private:
- uint64 _ovsMagic;
- uint64 _ovsVersion;
- uint64 _UNUSED;
- uint64 _smallestIID; // smallest frag iid in the store
- uint64 _largestIID; // largest frag iid in the store
- uint64 _numOverlapsTotal; // number of overlaps in the store
- uint64 _highestFileIndex;
- uint64 _maxReadLenInBits; // length of a fragment
-
- friend class ovStore;
-
- friend
- void writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
- friend
- bool
- testIndex(char *storePath,
- bool doFixes);
-
- friend
- void
- mergeInfoFiles(char *storePath,
- uint32 nPieces);
-};
-
-
-class ovStoreOfft {
-private:
- uint32 _a_iid; // read ID for this block of overlaps.
-
- uint32 _fileno; // the file that contains this a_iid
- uint32 _offset; // offset to the first overlap for this iid
- uint32 _numOlaps; // number of overlaps for this iid
-
- uint64 _overlapID; // overlapID for the first overlap in this block. in memory, this is the id of the next overlap.
-
- void clear(void) {
- _a_iid = 0;
- _fileno = 0;
- _offset = 0;
- _numOlaps = 0;
- _overlapID = 0;
- };
-
- friend class ovStore;
-
- friend
- void
- writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
- friend
- bool
- testIndex(char *storePath,
- bool doFixes);
-
- friend
- void
- mergeInfoFiles(char *storePath,
- uint32 nPieces);
-};
-
-
-// The default here is to open a read only store.
-//
-enum ovStoreType {
- ovStoreReadOnly = 0,
- ovStoreWrite = 1, // Open for write, fail if one exists already
- ovStoreOverwrite = 2, // Open for write, and obliterate an existing store
-};
-
-
-class ovStore {
-private:
- void ovStore_read(void);
- void ovStore_write(void);
-
-public:
- ovStore(const char *name, gkStore *gkp, ovStoreType cType=ovStoreReadOnly);
- ~ovStore();
-
- // Read the next overlap from the store. Return value is the number of overlaps read.
- uint32 readOverlap(ovOverlap *overlap);
-
- // Return the number of overlaps that would be read. Basically the same as the next readOverlaps() call.
- uint32 numberOfOverlaps(void);
-
- // Read ALL remaining overlaps for the current A_iid. Return value is the number of overlaps read.
- uint32 readOverlaps(ovOverlap *&overlaps,
- uint32 &maxOverlaps,
- bool restrictToIID=true);
-
- // Append ALL remaining overlaps for the current A_iid to the overlaps in ovl. Return value is
- // the number of overlaps in ovl that are for A_iid == iid.
- //
- // It is up to the client to verify that ovl[0] is the same as iid (e.g., that the return value
- // is not zero); ovlLen is the number of overlaps in ovl, NOT the number of overlaps in ovl that
- // are the same as iid.
- //
- uint32 readOverlaps(uint32 iid,
- ovOverlap *&ovl,
- uint32 &ovlLen,
- uint32 &ovlMax);
-
- void setRange(uint32 low, uint32 high);
- void resetRange(void);
-
- uint64 numOverlapsInRange(void);
- uint32 * numOverlapsPerFrag(uint32 &firstFrag, uint32 &lastFrag);
-
- // The (mostly) private interface for adding overlaps to a store. Overlaps must be sorted already.
-
- void writeOverlap(ovOverlap *olap);
- void writeOverlap(ovOverlap *overlap, uint32 maxOverlapsThisFile);
-
- // Write a block of sorted overlaps to store file 'fileID', saving the info and index into
- // 'fileID.info' and 'fileID.index'
-
- friend
- void writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
-
- // Add new evalues for reads between bgnID and endID. No checking of IDs is done, but the number
- // of evalues must agree.
-
- void addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen);
-
-private:
- char _storePath[FILENAME_MAX];
-
- bool _isOutput;
-
- ovStoreInfo _info;
-
- uint32 _firstIIDrequested;
- uint32 _lastIIDrequested;
-
- FILE *_offtFile; // For writing overlaps, a place to dump ovStoreOfft's.
- ovStoreOfft _offt; // For writing overlaps, the current ovStoreOfft.
- ovStoreOfft _offm; // For writing overlaps, an empty ovStoreOfft, for reads with no overlaps.
-
- memoryMappedFile *_evaluesMap;
- uint16 *_evalues;
-
- uint64 _overlapsThisFile; // Count of the number of overlaps written so far
- uint32 _currentFileIndex;
- ovFile *_bof;
-
- gkStore *_gkp;
-};
-
-
-// This should be part of ovStore, but when it is used, in ovStoreSorter, we don't
-// have a store opened.
-void
-writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
-bool
-testIndex(char *storePath,
- bool doFixes);
-
-void
-mergeInfoFiles(char *storePath,
- uint32 nPieces);
-
-
-
-
-
-
-
-// For store construction. Probably should be in either ovOverlap or ovStore.
-
-class ovStoreFilter {
-public:
- ovStoreFilter(gkStore *gkp_, double maxErate) {
- gkp = gkp_;
-
- resetCounters();
-
- maxID = gkp->gkStore_getNumReads() + 1;
- maxEvalue = AS_OVS_encodeEvalue(maxErate);
-
- skipReadOBT = new char [maxID];
- skipReadDUP = new char [maxID];
-
- memset(skipReadOBT, 0, sizeof(char) * maxID);
- memset(skipReadDUP, 0, sizeof(char) * maxID);
-
-
- uint32 numSkipOBT = 0;
- uint32 numSkipDUP = 0;
-
- fprintf(stderr, "Marking fragments to skip overlap based trimming.\n");
-
- fprintf(stderr, "LIB 1 - dup=%d trim=%d spur=%d chimera=%d subreads=%d\n",
- gkp->gkStore_getLibrary(1)->gkLibrary_removeDuplicateReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_finalTrim(),
- gkp->gkStore_getLibrary(1)->gkLibrary_removeSpurReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_removeChimericReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_checkForSubReads());
-
- for (uint64 iid=0; iid<maxID; iid++) {
- uint32 Lid = gkp->gkStore_getRead(iid)->gkRead_libraryID();
- gkLibrary *L = gkp->gkStore_getLibrary(Lid);
-
- if ((L->gkLibrary_removeDuplicateReads() == false) &&
- (L->gkLibrary_finalTrim() == GK_FINALTRIM_NONE) &&
- (L->gkLibrary_removeSpurReads() == false) &&
- (L->gkLibrary_removeChimericReads() == false) &&
- (L->gkLibrary_checkForSubReads() == false)) {
- numSkipOBT++;
- skipReadOBT[iid] = true;
- }
-
- if (L->gkLibrary_removeDuplicateReads() == false) {
- numSkipDUP++;
- skipReadDUP[iid] = true;
- }
- }
-
- fprintf(stderr, "Marked "F_U32" reads so skip OBT, "F_U32" reads to skip dedupe.\n", numSkipOBT, numSkipDUP);
- };
-
- ~ovStoreFilter() {
- delete [] skipReadOBT;
- delete [] skipReadDUP;
- };
-
-
- void filterOverlap(ovOverlap &foverlap,
- ovOverlap &roverlap);
-
- void reportFate(void);
- void resetCounters(void);
-
-public:
- gkStore *gkp;
-
- uint32 maxID;
- uint32 maxEvalue;
-
- uint64 saveUTG;
- uint64 saveOBT;
- uint64 saveDUP;
-
- uint64 skipERATE;
-
- uint64 skipOBT; // OBT not requested for the A read
- uint64 skipOBTbad; // Overlap too similiar
- uint64 skipOBTshort; // Overlap is too short
-
- uint64 skipDUP; // DUP not requested for the A read
- uint64 skipDUPdiff; // Overlap isn't remotely similar
- uint64 skipDUPlib;
-
- // Not really stats, but global state for the filter.
-
- char *skipReadOBT;
- char *skipReadDUP;
-};
-
-
-
-#endif
+#endif // AS_OVOVERLAP_H
diff --git a/src/stores/ovStore.C b/src/stores/ovStore.C
index 0dad977..f484a67 100644
--- a/src/stores/ovStore.C
+++ b/src/stores/ovStore.C
@@ -54,132 +54,11 @@
#include "ovStore.H"
-const uint64 ovStoreVersion = 2;
-const uint64 ovStoreMagic = 0x53564f3a756e6163; // == "canu:OVS - store complete
-const uint64 ovStoreMagicIncomplete = 0x50564f3a756e6163; // == "canu:OVP - store under construction
-void
-ovStore::ovStore_write(void) {
- AS_UTL_mkdir(_storePath);
-
- char name[FILENAME_MAX];
-
- sprintf(name, "%s/info", _storePath);
-
- // If the ovs file exists, AND has a valid magic number, then the store is complete and we should
- // abort before the valid store is destroyed.
-
- if (AS_UTL_fileExists(name, false, false)) {
- errno = 0;
- FILE *ovsinfo = fopen(name, "r");
- if (errno) {
- fprintf(stderr, "ERROR: failed to read store metadata from '%s': %s\n", name, strerror(errno));
- exit(1);
- }
-
- AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::testinfo", sizeof(ovStoreInfo), 1);
-
- fclose(ovsinfo);
-
- if (_info._ovsMagic == ovStoreMagic)
- fprintf(stderr, "ERROR: overlapStore '%s' is a valid overlap store, will not overwrite.\n",
- _storePath), exit(1);
- }
-
- // Create a new incomplete info file.
-
- errno = 0;
- FILE *ovsinfo = fopen(name, "w");
-
- if (errno)
- fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno)), exit(1);
-
- AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1);
-
- fclose(ovsinfo);
-
- sprintf(name, "%s/index", _storePath);
-
- errno = 0;
- _offtFile = fopen(name, "w");
- if (errno)
- fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);
-
- _overlapsThisFile = 0;
- _currentFileIndex = 0;
- _bof = NULL;
-}
-
-
-
-void
-ovStore::ovStore_read(void) {
+ovStore::ovStore(const char *path, gkStore *gkp) {
char name[FILENAME_MAX];
- sprintf(name, "%s/info", _storePath);
- errno = 0;
- FILE *ovsinfo = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: directory '%s' is not an ovelrapStore; failed to open info file '%s': %s\n",
- _storePath, name, strerror(errno)), exit(1);
-
- AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::info", sizeof(ovStoreInfo), 1);
-
- fclose(ovsinfo);
-
- if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete))
- fprintf(stderr, "ERROR: directory '%s' is not an overlapStore; magic number 0x%016"F_X64P" incorrect.\n",
- _storePath, _info._ovsMagic), exit(1);
-
- if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete))
- fprintf(stderr, "ERROR: overlapStore '%s' is incomplate; creation crashed?\n",
- _storePath), exit(1);
-
- if (_info._ovsVersion != ovStoreVersion)
- fprintf(stderr, "ERROR: overlapStore '%s' is version "F_U64"; this code supports only version "F_U64".\n",
- _storePath, _info._ovsVersion, ovStoreVersion), exit(1);
-
- if (_info._maxReadLenInBits != AS_MAX_READLEN_BITS)
- fprintf(stderr, "ERROR: overlapStore '%s' is for AS_MAX_READLEN_BITS="F_U64"; this code supports only %d bits.\n",
- _storePath, _info._maxReadLenInBits, AS_MAX_READLEN_BITS), exit(1);
-
- // Load stats
-
-#if 0
- sprintf(name, "%s/statistics", _storePath);
- errno = 0;
- FILE *ost = fopen(name, "r");
- if (errno)
- fprintf(stderr, "failed to open the stats file '%s': %s\n", name, strerror(errno)), exit(1);
- AS_UTL_safeRead(ost, &_stats, "ovStore::ovStore::stats", sizeof(OverlapStoreStats), 1);
- fclose(ost);
-#endif
-
- // Open the index
-
- sprintf(name, "%s/index", _storePath);
-
- errno = 0;
- _offtFile = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);
-
- // Open erates
-
- sprintf(name, "%s/evalues", _storePath);
-
- if (AS_UTL_fileExists(name)) {
- _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readOnly);
- _evalues = (uint16 *)_evaluesMap->get(0);
- }
-}
-
-
-
-
-ovStore::ovStore(const char *path, gkStore *gkp, ovStoreType cType) {
-
if (path == NULL)
fprintf(stderr, "ovStore::ovStore()-- ERROR: no name supplied.\n"), exit(1);
@@ -190,88 +69,67 @@ ovStore::ovStore(const char *path, gkStore *gkp, ovStoreType cType) {
memset(_storePath, 0, FILENAME_MAX);
strncpy(_storePath, path, FILENAME_MAX-1);
- _isOutput = (cType & ovStoreWrite) ? true : false;
-
- _info._ovsMagic = ovStoreMagicIncomplete; // Appropriate for a new store.
- _info._ovsVersion = ovStoreVersion;
- _info._smallestIID = UINT64_MAX;
- _info._largestIID = 0;
- _info._numOverlapsTotal = 0;
- _info._highestFileIndex = 0;
- _info._maxReadLenInBits = AS_MAX_READLEN_BITS;
+ _info.clear();
+ _gkp = gkp;
- _offtFile = NULL;
+ _offtFile = NULL;
_offt.clear();
_offm.clear();
- _evaluesMap = NULL;
- _evalues = NULL;
+ _evaluesMap = NULL;
+ _evalues = NULL;
_overlapsThisFile = 0;
_currentFileIndex = 0;
_bof = NULL;
- // Now open an existing store, or a create a new store.
+ // Now open the store
- if (_isOutput == false)
- ovStore_read();
- else
- ovStore_write();
+ if (_info.load(_storePath) == false)
+ fprintf(stderr, "ERROR: failed to intiialize ovStore '%s'.\n", path), exit(1);
- // AFTER the info is loaded, set the ranges.
+ if (_info.checkIncomplete() == true)
+ fprintf(stderr, "ERROR: directory '%s' is an incomplete ovStore, remove and rebuild.\n", path), exit(1);
- _firstIIDrequested = _info._smallestIID;
- _lastIIDrequested = _info._largestIID;
-
- _gkp = gkp;
-}
+ if (_info.checkMagic() == false)
+ fprintf(stderr, "ERROR: directory '%s' is not an ovStore.\n", path), exit(1);
+ if (_info.checkVersion() == false)
+ fprintf(stderr, "ERROR: directory '%s' is not a supported ovStore version (store version %u; supported version %u.\n",
+ path, _info.getVersion(), _info.getCurrentVersion()), exit(1);
+ if (_info.checkSize() == false)
+ fprintf(stderr, "ERROR: directory '%s' is not a supported read length (store is %u bits, AS_MAX_READLEN_BITS is %u).\n",
+ path, _info.getSize(), AS_MAX_READLEN_BITS), exit(1);
+ // Open the index
-ovStore::~ovStore() {
+ snprintf(name, FILENAME_MAX, "%s/index", _storePath);
- // If output, write the last index element (don't forget to fill in gaps);
- // update the info, using the final magic number
+ errno = 0;
+ _offtFile = fopen(name, "r");
+ if (errno)
+ fprintf(stderr, "ERROR: failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);
- if (_isOutput) {
- if (_offt._numOlaps > 0) {
- for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
- _offm._fileno = _offt._fileno;
- _offm._offset = _offt._offset;
- _offm._numOlaps = 0;
+ // Open and load erates
- AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::~ovStore::offm", sizeof(ovStoreOfft), 1);
- }
+ snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);
- AS_UTL_safeWrite(_offtFile, &_offt, "ovStore::~ovStore::offt", sizeof(ovStoreOfft), 1);
- }
+ if (AS_UTL_fileExists(name)) {
+ _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readOnly);
+ _evalues = (uint16 *)_evaluesMap->get(0);
+ }
- _info._ovsMagic = ovStoreMagic;
- _info._ovsVersion = ovStoreVersion;
- _info._highestFileIndex = _currentFileIndex;
+ // Set the initial range to everything.
- char name[FILENAME_MAX];
+ _firstIIDrequested = _info.smallestID();
+ _lastIIDrequested = _info.largestID();
+}
- sprintf(name, "%s/info", _storePath);
- errno = 0;
- FILE *ovsinfo = fopen(name, "w");
- if (errno)
- fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno)), exit(1);
- AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::~ovStore::info", sizeof(ovStoreInfo), 1);
- fclose(ovsinfo);
- fprintf(stderr, "Closing the new store:\n");
- fprintf(stderr, " info._ovsMagic = 0x%016"F_X64P"\n", _info._ovsMagic);
- fprintf(stderr, " info._ovsVersion = "F_U64"\n", _info._ovsVersion);
- fprintf(stderr, " info._smallestIID = "F_U64"\n", _info._smallestIID);
- fprintf(stderr, " info._largestIID = "F_U64"\n", _info._largestIID);
- fprintf(stderr, " info._numOverlapsTotal = "F_U64"\n", _info._numOverlapsTotal);
- fprintf(stderr, " info._highestFileIndex = "F_U64"\n", _info._highestFileIndex);
- fprintf(stderr, " info._maxReadLenInBits = "F_U64"\n", _info._maxReadLenInBits);
- }
+ovStore::~ovStore() {
if (_evaluesMap) {
delete _evaluesMap;
@@ -280,24 +138,6 @@ ovStore::~ovStore() {
_evalues = NULL;
}
-#if 0
- if (_statsUpdated) {
- fprintf(stderr, "Writing new stats.\n");
-
- char name [FILENAME_MAX];
-
- sprintf(name, "%s/ost", _storePath);
- errno = 0;
- FILE *ost = fopen(name, "w");
- if (errno)
- fprintf(stderr, "failed to write overlap stats '%s': %s\n", name, strerror(errno)), exit(1);
-
- AS_UTL_safeWrite(ost, &_stats, "AS_OVS_closeOverlapStore", sizeof(OverlapStoreStats), 1);
-
- fclose(ost);
- }
-#endif
-
delete _bof;
fclose(_offtFile);
@@ -308,8 +148,6 @@ ovStore::~ovStore() {
uint32
ovStore::readOverlap(ovOverlap *overlap) {
- assert(_isOutput == FALSE);
-
// If we've finished reading overlaps for the current a_iid, get
// another a_iid. If we hit EOF here, we're all done, no more
// overlaps.
@@ -335,8 +173,8 @@ ovStore::readOverlap(ovOverlap *overlap) {
_currentFileIndex++;
- sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
- _bof = new ovFile(name, ovFileNormal);
+ snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
+ _bof = new ovFile(_gkp, name, ovFileNormal);
}
overlap->a_iid = _offt._a_iid;
@@ -367,8 +205,6 @@ uint32
ovStore::readOverlaps(ovOverlap *&overlaps, uint32 &maxOverlaps, bool restrictToIID) {
int numOvl = 0;
- assert(_isOutput == FALSE);
-
// If we've finished reading overlaps for the current a_iid, get
// another a_iid. If we hit EOF here, we're all done, no more
// overlaps.
@@ -416,18 +252,18 @@ ovStore::readOverlaps(ovOverlap *&overlaps, uint32 &maxOverlaps, bool restrictTo
_currentFileIndex++;
- if (_currentFileIndex > _info._highestFileIndex)
+ if (_currentFileIndex > _info.lastFileIndex())
// No more files, stop trying to load an overlap.
break;
- sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
- _bof = new ovFile(name, ovFileNormal);
+ snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
+ _bof = new ovFile(_gkp, name, ovFileNormal);
}
// If the currentFileIndex is invalid, we ran out of overlaps to load. Don't save that
// empty overlap to the list.
- if (_currentFileIndex <= _info._highestFileIndex) {
+ if (_currentFileIndex <= _info.lastFileIndex()) {
overlaps[numOvl].a_iid = _offt._a_iid;
overlaps[numOvl].g = _gkp;
@@ -462,14 +298,6 @@ ovStore::readOverlaps(ovOverlap *&overlaps, uint32 &maxOverlaps, bool restrictTo
-
-
-
-
-
-
-
-
uint32
ovStore::readOverlaps(uint32 iid,
ovOverlap *&ovl,
@@ -549,16 +377,6 @@ ovStore::readOverlaps(uint32 iid,
-
-
-
-
-
-
-
-
-
-
void
ovStore::setRange(uint32 firstIID, uint32 lastIID) {
char name[FILENAME_MAX];
@@ -567,15 +385,15 @@ ovStore::setRange(uint32 firstIID, uint32 lastIID) {
// can quickly grab the correct record, and seek to the start of
// those overlaps
- if (firstIID > _info._largestIID)
- firstIID = _info._largestIID + 1;
- if (lastIID >= _info._largestIID)
- lastIID = _info._largestIID;
+ if (firstIID > _info.largestID())
+ firstIID = _info.largestID() + 1;
+ if (lastIID >= _info.largestID())
+ lastIID = _info.largestID();
// If our range is invalid (firstIID > lastIID) we keep going, and
// let readOverlap() deal with it.
- AS_UTL_fseek(_offtFile, (size_t)firstIID * sizeof(ovStoreOfft), SEEK_SET);
+ AS_UTL_fseek(_offtFile, (off_t)firstIID * sizeof(ovStoreOfft), SEEK_SET);
// Unfortunately, we need to actually read the record to figure out
// where to position the overlap stream. If the read fails, we
@@ -599,8 +417,8 @@ ovStore::setRange(uint32 firstIID, uint32 lastIID) {
delete _bof;
- sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
- _bof = new ovFile(name, ovFileNormal);
+ snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
+ _bof = new ovFile(_gkp, name, ovFileNormal);
_bof->seekOverlap(_offt._offset);
}
@@ -620,164 +438,18 @@ ovStore::resetRange(void) {
delete _bof;
- sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
- _bof = new ovFile(name, ovFileNormal);
+ snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
+ _bof = new ovFile(_gkp, name, ovFileNormal);
- _firstIIDrequested = _info._smallestIID;
- _lastIIDrequested = _info._largestIID;
+ _firstIIDrequested = _info.smallestID();
+ _lastIIDrequested = _info.largestID();
}
-
-
-void
-ovStore::writeOverlap(ovOverlap *overlap) {
- char name[FILENAME_MAX];
-
- assert(_isOutput == TRUE);
-
- if (_offt._a_iid > overlap->a_iid) {
- // Woah! The last overlap we saw is bigger than the one we have now?!
- fprintf(stderr, "LAST: a:"F_U32"\n", _offt._a_iid);
- fprintf(stderr, "THIS: a:"F_U32" b:"F_U32"\n", overlap->a_iid, overlap->b_iid);
- }
- assert(_offt._a_iid <= overlap->a_iid);
-
- if (_info._smallestIID > overlap->a_iid)
- _info._smallestIID = overlap->a_iid;
- if (_info._largestIID < overlap->a_iid)
- _info._largestIID = overlap->a_iid;
-
-
- // If we don't have an output file yet, or the current file is
- // too big, open a new file.
- //
- if ((_bof) && (_overlapsThisFile >= 1024 * 1024 * 1024 / _bof->recordSize())) {
- delete _bof;
-
- _bof = NULL;
- _overlapsThisFile = 0;
- }
-
- if (_bof == NULL) {
- char name[FILENAME_MAX];
-
- _currentFileIndex++;
-
- sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
- _bof = new ovFile(name, ovFileNormalWrite);
- }
-
-
- // Put the index to disk, filling any gaps
- //
- if ((_offt._numOlaps != 0) &&
- (_offt._a_iid != overlap->a_iid)) {
-
- while (_offm._a_iid < _offt._a_iid) {
- _offm._fileno = _offt._fileno;
- _offm._offset = _offt._offset;
- _offm._overlapID = _offt._overlapID; // Not needed, but makes life easier
-
- AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::writeOverlap::offset", sizeof(ovStoreOfft), 1);
-
- _offm._a_iid++;
- }
-
- _offm._a_iid++; // One more, since this iid is not missing -- we write it next!
-
- AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);
-
- _offt._numOlaps = 0; // Reset; this new id has no overlaps yet.
- }
-
-
- // Update the index if this is the first overlap for this a_iid
- //
- if (_offt._numOlaps == 0) {
- _offt._a_iid = overlap->a_iid;
- _offt._fileno = _currentFileIndex;
- _offt._offset = _overlapsThisFile;
- _offt._overlapID = _info._numOverlapsTotal;
- }
-
- //AS_OVS_accumulateStats(ovs, overlap);
- _bof->writeOverlap(overlap);
-
- _offt._numOlaps++;
- _info._numOverlapsTotal++;
- _overlapsThisFile++;
-}
-
-
-
-void
-ovStore::writeOverlap(ovOverlap *overlap, uint32 maxOverlapsThisFile) {
- char name[FILENAME_MAX];
-
- assert(_isOutput == TRUE);
-
- _currentFileIndex++;
- _overlapsThisFile = 0;
-
- for (uint64 i=0; i < maxOverlapsThisFile; i++ ) {
- // All overlaps will be sorted by a_iid
- if (_offt._a_iid > overlap[i].a_iid) {
- fprintf(stderr, "LAST: a:"F_U32"\n", _offt._a_iid);
- fprintf(stderr, "THIS: a:"F_U32" b:"F_U32"\n", overlap[i].a_iid, overlap[i].b_iid);
- }
-
- assert(_offt._a_iid <= overlap[i].a_iid);
-
- if (_info._smallestIID > overlap[i].a_iid)
- _info._smallestIID = overlap[i].a_iid;
- if (_info._largestIID < overlap[i].a_iid)
- _info._largestIID = overlap[i].a_iid;
-
-
- // Put the index to disk, filling any gaps
- if ((_offt._numOlaps != 0) && (_offt._a_iid != overlap[i].a_iid)) {
-
- while (_offm._a_iid < _offt._a_iid) {
- _offm._fileno = _offt._fileno;
- _offm._offset = _offt._offset;
- _offm._overlapID = _offt._overlapID; // Not needed, but makes life easier
-
- AS_UTL_safeWrite(_offtFile, &_offm, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);
-
- _offm._a_iid++;
- }
-
- _offm._a_iid++; // One more, since this iid is not missing -- we write it next!
-
- AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);
-
- _offt._numOlaps = 0; // Reset; this new id has no overlaps yet.
- }
-
- // Update the index if this is the first overlap for this a_iid
- if (_offt._numOlaps == 0) {
- _offt._a_iid = overlap[i].a_iid;
- _offt._fileno = _currentFileIndex;
- _offt._offset = _overlapsThisFile;
- _offt._overlapID = _info._numOverlapsTotal;
- }
-
- _offt._numOlaps++;
- _info._numOverlapsTotal++;
- _overlapsThisFile++;
- }
-
- fprintf(stderr,"Done building index for dumpfile %d.\n",_currentFileIndex);
-}
-
-
-
-
uint64
ovStore::numOverlapsInRange(void) {
- size_t originalposition = 0;
+ off_t originalposition = 0;
uint64 i = 0;
uint64 len = 0;
ovStoreOfft *offsets = NULL;
@@ -788,7 +460,7 @@ ovStore::numOverlapsInRange(void) {
originalposition = AS_UTL_ftell(_offtFile);
- AS_UTL_fseek(_offtFile, (size_t)_firstIIDrequested * sizeof(ovStoreOfft), SEEK_SET);
+ AS_UTL_fseek(_offtFile, (off_t)_firstIIDrequested * sizeof(ovStoreOfft), SEEK_SET);
// Even if we're doing a whole human-size store, this allocation is
// (a) temporary and (b) only 512MB. The only current consumer of
@@ -824,9 +496,9 @@ ovStore::numOverlapsPerFrag(uint32 &firstFrag, uint32 &lastFrag) {
firstFrag = _firstIIDrequested;
lastFrag = _lastIIDrequested;
- size_t originalPosition = AS_UTL_ftell(_offtFile);
+ off_t originalPosition = AS_UTL_ftell(_offtFile);
- AS_UTL_fseek(_offtFile, (size_t)_firstIIDrequested * sizeof(ovStoreOfft), SEEK_SET);
+ AS_UTL_fseek(_offtFile, (off_t)_firstIIDrequested * sizeof(ovStoreOfft), SEEK_SET);
// Even if we're doing a whole human-size store, this allocation is
// (a) temporary and (b) only 512MB. The only current consumer of
@@ -841,7 +513,7 @@ ovStore::numOverlapsPerFrag(uint32 &firstFrag, uint32 &lastFrag) {
uint64 act = AS_UTL_safeRead(_offtFile, offsets, "ovStore::numOverlapsInRange::offsets", sizeof(ovStoreOfft), len);
if (len != act)
- fprintf(stderr, "AS_OVS_numOverlapsPerFrag()-- short read on offsets! Expected len="F_U64" read act="F_U64"\n", len, act), exit(1);
+ fprintf(stderr, "AS_OVS_numOverlapsPerFrag()-- short read on offsets! Expected len=" F_U64 " read act=" F_U64 "\n", len, act), exit(1);
for (uint64 i=0; i<len; i++)
numolap[i] = offsets[i]._numOlaps;
@@ -855,10 +527,37 @@ ovStore::numOverlapsPerFrag(uint32 &firstFrag, uint32 &lastFrag) {
+void
+ovStore::addEvalues(vector<char *> &fileList) {
+ for (uint32 i=0; i<fileList.size(); i++) {
+ errno = 0;
+ FILE *fp = fopen(fileList[i], "r");
+ if (errno)
+ fprintf(stderr, "Failed to open evalues file '%s': %s\n", fileList[i], strerror(errno)), exit(1);
+ uint32 bgnID = 0;
+ uint32 endID = 0;
+ uint64 len = 0;
+ AS_UTL_safeRead(fp, &bgnID, "loid", sizeof(uint32), 1);
+ AS_UTL_safeRead(fp, &endID, "hiid", sizeof(uint32), 1);
+ AS_UTL_safeRead(fp, &len, "len", sizeof(uint64), 1);
+ uint16 *evalues = new uint16 [len];
+
+ AS_UTL_safeRead(fp, evalues, "evalues", sizeof(uint16), len);
+
+ fclose(fp);
+
+ fprintf(stderr, "- Loading evalues from '%s' -- ID range " F_U32 "-" F_U32 " with " F_U64 " overlaps\n",
+ fileList[i], bgnID, endID, len);
+
+ addEvalues(bgnID, endID, evalues, len);
+
+ delete [] evalues;
+ }
+}
@@ -866,7 +565,7 @@ void
ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen) {
char name[FILENAME_MAX];
- sprintf(name, "%s/evalues", _storePath);
+ snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);
// If we have an opened memory mapped file, and it isn't open for writing, close it.
@@ -881,16 +580,16 @@ ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesL
// Remove a bogus evalues file if one exists.
if ((AS_UTL_fileExists(name) == true) &&
- (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info._numOverlapsTotal))) {
- fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be "F_U64" bytes, is "F_U64" bytes. Removing.\n",
- (sizeof(uint16) * _info._numOverlapsTotal), AS_UTL_sizeOfFile(name));
+ (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info.numOverlaps()))) {
+ fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes. Removing.\n",
+ (sizeof(uint16) * _info.numOverlaps()), AS_UTL_sizeOfFile(name));
AS_UTL_unlink(name);
}
// Make a new evalues file if one doesn't exist.
if (AS_UTL_fileExists(name) == false) {
- fprintf(stderr, "Creating evalues file for "F_U64" overlaps.\r", _info._numOverlapsTotal);
+ fprintf(stderr, "Creating evalues file for " F_U64 " overlaps.\r", _info.numOverlaps());
errno = 0;
FILE *F = fopen(name, "w");
@@ -902,21 +601,23 @@ ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesL
memset(Z, 0, sizeof(uint16) * 1048576);
- while (Zn < _info._numOverlapsTotal) {
- uint64 S = (Zn + 1048576 < _info._numOverlapsTotal) ? 1048576 : _info._numOverlapsTotal - Zn;
+ while (Zn < _info.numOverlaps()) {
+ uint64 S = (Zn + 1048576 < _info.numOverlaps()) ? 1048576 : _info.numOverlaps() - Zn;
AS_UTL_safeWrite(F, Z, "zero evalues", sizeof(uint16), S);
Zn += S;
- fprintf(stderr, "Creating evalues file for "F_U64" overlaps....%07.3f%%\r",
- _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
+ fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\r",
+ _info.numOverlaps(), 100.0 * Zn / _info.numOverlaps());
}
- fprintf(stderr, "Creating evalues file for "F_U64" overlaps....%07.3f%%\n",
- _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
+ delete [] Z;
fclose(F);
+
+ fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\n",
+ _info.numOverlaps(), 100.0 * Zn / _info.numOverlaps());
}
// Open the evalues file if it isn't already opened
@@ -938,658 +639,3 @@ ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesL
// That's it. Deleting the ovStore object will close the memoryMappedFile. It's left open
// for more updates.
}
-
-
-
-
-
-
-
-
-
-
-
-// For the parallel sort, write a block of sorted overlaps into a single file, with index and info.
-
-void
-writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID) {
-
- char name[FILENAME_MAX];
-
- uint32 currentFileIndex = fileID;
- uint64 overlapsThisFile = 0;
-
- ovStoreInfo info;
-
- info._ovsMagic = 1;
- info._ovsVersion = ovStoreVersion;
- info._UNUSED = 0;
- info._smallestIID = UINT64_MAX;
- info._largestIID = 0;
- info._numOverlapsTotal = 0;
- info._highestFileIndex = 0;
- info._maxReadLenInBits = AS_MAX_READLEN_BITS;
-
- ovStoreOfft offt;
- ovStoreOfft offm;
-
- offt._a_iid = offm._a_iid = ovls[0].a_iid;
- offt._fileno = offm._fileno = fileID;
- offt._offset = offm._offset = 0;
- offt._numOlaps = offm._numOlaps = 0;
- offt._overlapID = offm._overlapID = 0;
-
- // Create the output file
-
- sprintf(name, "%s/%04d", storePath, fileID);
- ovFile *bof = new ovFile(name, ovFileNormalWrite);
-
- // Create the index file
-
- sprintf(name,"%s/%04d.index", storePath, fileID);
-
- errno = 0;
- FILE *offtFile=fopen(name,"w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
-
- // Dump the overlaps
-
- fprintf(stderr, "Writing "F_U64" overlaps.\n", ovlsLen);
-
- for (uint64 i=0; i<ovlsLen; i++ ) {
- bof->writeOverlap(ovls + i);
-
- if (offt._a_iid > ovls[i].a_iid) {
- fprintf(stderr, "LAST: a:"F_U32"\n", offt._a_iid);
- fprintf(stderr, "THIS: a:"F_U32" b:"F_U32"\n", ovls[i].a_iid, ovls[i].b_iid);
- }
- assert(offt._a_iid <= ovls[i].a_iid);
-
- info._smallestIID = MIN(info._smallestIID, ovls[i].a_iid);
- info._largestIID = MAX(info._largestIID, ovls[i].a_iid);
-
- // Put the index to disk, filling any gaps
-
- if ((offt._numOlaps != 0) && (offt._a_iid != ovls[i].a_iid)) {
- while (offm._a_iid < offt._a_iid) {
- offm._fileno = offt._fileno;
- offm._offset = offt._offset;
- offm._overlapID = offt._overlapID; // Not needed, but makes life easier
- offm._numOlaps = 0;
-
- AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
- offm._a_iid++;
- }
-
- // One more, since this iid is not offm -- we write it next!
- offm._a_iid++;
-
- AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
-
- offt._overlapID += offt._numOlaps; // The next block of overlaps starts with this ID
- offt._numOlaps = 0; // The next block has no overlaps yet.
- }
-
- // Update the index if this is the first overlap for this a_iid
-
- if (offt._numOlaps == 0) {
- offt._a_iid = ovls[i].a_iid;
- offt._fileno = currentFileIndex;
- offt._offset = overlapsThisFile;
- }
-
- offt._numOlaps++;
-
- info._numOverlapsTotal++;
-
- overlapsThisFile++;
- }
-
- // Close the output file.
-
- delete bof;
-
- // Write the final (empty) index entries.
-
- while (offm._a_iid < offt._a_iid) {
- offm._fileno = offt._fileno;
- offm._offset = offt._offset;
- offm._overlapID = offt._overlapID; // Not needed, but makes life easier
- offm._numOlaps = 0;
-
- AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
- offm._a_iid++;
- }
-
- // And the final (real) index entry. We could, but don't need to, update overlapID with the
- // number of overlaps in this block.
-
- AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
-
- fclose(offtFile);
-
- // In the nasty case that there were no overlaps in this slice, set meaningful smallest and
- // largest. Well, at least, set non-nonsense smallest and largest.
-
- if (overlapsThisFile == 0) {
- info._smallestIID = 0;
- info._largestIID = 0;
- }
-
- // Write the info, and some stats for the user.
-
- sprintf(name,"%s/%04d.info", storePath, fileID);
-
- errno = 0;
- FILE *F = fopen(name, "w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
-
- AS_UTL_safeWrite(F, &info, "Partition ovs file", sizeof(ovStoreInfo), 1);
-
- fclose(F);
-
- fprintf(stderr, "Wrote "F_U64" overlaps into '%s'\n", info._numOverlapsTotal, name);
- fprintf(stderr, " Smallest "F_U64"\n", info._smallestIID);
- fprintf(stderr, " Largest "F_U64"\n", info._largestIID);
-}
-
-
-
-
-// For the parallel sort, but also generally applicable, test that the index is sane.
-
-bool
-testIndex(char *ovlName,
- bool doFixes) {
- char name[FILENAME_MAX];
- FILE *I = NULL;
- FILE *F = NULL;
-
- sprintf(name, "%s/index", ovlName);
-
- errno = 0;
- I = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1);
-
- //fprintf(stderr, "TESTING '%s'\n", name);
-
- if (doFixes) {
- sprintf(name, "%s/index.fixed", ovlName);
-
- errno = 0;
- F = fopen(name, "w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
-
- //fprintf(stderr, "WITH FIXES TO '%s'\n", name);
- }
-
- ovStoreOfft O;
-
- uint32 curIID = 0;
- uint32 minIID = UINT32_MAX;
- uint32 maxIID = 0;
-
- uint32 nErrs = 0;
-
- while (1 == AS_UTL_safeRead(I, &O, "offset", sizeof(ovStoreOfft), 1)) {
- bool maxIncreases = (maxIID < O._a_iid);
- bool errorDecreased = ((O._a_iid < curIID));
- bool errorGap = ((O._a_iid > 0) && (curIID + 1 != O._a_iid));
-
- if (O._a_iid < minIID)
- minIID = O._a_iid;
-
- if (maxIncreases)
- maxIID = O._a_iid;
-
- if (errorDecreased)
- fprintf(stderr, "ERROR: index decreased from "F_U32" to "F_U32"\n", curIID, O._a_iid), nErrs++;
- else if (errorGap)
- fprintf(stderr, "ERROR: gap between "F_U32" and "F_U32"\n", curIID, O._a_iid), nErrs++;
-
- if ((maxIncreases == true) && (errorGap == false)) {
- if (doFixes)
- AS_UTL_safeWrite(F, &O, "offset", sizeof(ovStoreOfft), 1);
-
- } else if (O._numOlaps > 0) {
- fprintf(stderr, "ERROR: lost overlaps a_iid "F_U32" fileno "F_U32" offset "F_U32" numOlaps "F_U32"\n",
- O._a_iid, O._fileno, O._offset, O._numOlaps);
- }
-
- curIID = O._a_iid;
- }
-
- fclose(I);
-
- if (F)
- fclose(F);
-
- return(nErrs == 0);
-}
-
-
-
-
-
-// For the parallel sort, merge index and info files into one, clean up the intermediates.
-
-void
-mergeInfoFiles(char *storePath,
- uint32 nPieces) {
- ovStoreInfo infopiece;
- ovStoreInfo info;
-
- info._ovsMagic = ovStoreMagic;
- info._ovsVersion = ovStoreVersion;
- info._smallestIID = UINT64_MAX;
- info._largestIID = 0;
- info._numOverlapsTotal = 0;
- info._highestFileIndex = nPieces;
- info._maxReadLenInBits = AS_MAX_READLEN_BITS;
-
- ovStoreOfft offm;
-
- offm._a_iid = 0;
- offm._fileno = 1;
- offm._offset = 0;
- offm._numOlaps = 0;
- offm._overlapID = 0;
-
- // Open the new master index output file
-
- char name[FILENAME_MAX];
-
- sprintf(name, "%s/index", storePath);
-
- errno = 0;
- FILE *idx = fopen(name, "w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
-
- // Special case, we need an empty index for the zeroth fragment.
-
- AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1);
-
- // Sanity checking, compare the number of overlaps processed against the overlapID
- // of each ovStoreOfft.
-
- uint64 totalOverlaps = 0;
-
- // Process each
-
- for (uint32 i=1; i<=nPieces; i++) {
- sprintf(name, "%s/%04d.info", storePath, i);
-
- fprintf(stderr, "Processing '%s'\n", name);
-
- if (AS_UTL_fileExists(name, FALSE, FALSE) == false) {
- fprintf(stderr, "ERROR: file '%s' not found.\n", name);
- exit(1);
- }
-
- {
- errno = 0;
- FILE *F = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
- AS_UTL_safeRead(F, &infopiece, "ovStore::mergeInfoFiles::infopiece", sizeof(ovStoreInfo), 1);
- fclose(F);
- }
-
- // Add empty index elements for missing overlaps
-
- if (infopiece._numOverlapsTotal == 0) {
- fprintf(stderr, " No overlaps found.\n");
- continue;
- }
-
- assert(infopiece._smallestIID <= infopiece._largestIID);
-
- if (info._largestIID + 1 < infopiece._smallestIID)
- fprintf(stderr, " Adding empty records for fragments "F_U64" to "F_U64"\n",
- info._largestIID + 1, infopiece._smallestIID - 1);
-
- while (info._largestIID + 1 < infopiece._smallestIID) {
- offm._a_iid = info._largestIID + 1;
- //offm._fileno = set below, where the recs are written to the master file
- //offm._offset = set below, where the recs are written to the master file
-
- AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1);
-
- info._largestIID++;
- }
-
- // Copy index elements for existing overlaps. While copying, update the supposed position
- // of any fragments with no overlaps. Without doing this, accessing the store beginning
- // or ending at such a fragment will fail.
-
- {
- sprintf(name, "%s/%04d.index", storePath, i);
-
- errno = 0;
- FILE *F = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
-
- uint32 recsLen = 0;
- uint32 recsMax = 1024 * 1024;
- ovStoreOfft *recs = new ovStoreOfft [recsMax];
-
- recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax);
-
- if (recsLen > 0) {
- if (info._largestIID + 1 != recs[0]._a_iid)
- fprintf(stderr, "ERROR: '%s' starts with iid "F_U32", but store only up to "F_U64"\n",
- name, recs[0]._a_iid, info._largestIID);
- assert(info._largestIID + 1 == recs[0]._a_iid);
- }
-
- while (recsLen > 0) {
-
- // Update location of missing reads.
-
- offm._fileno = recs[recsLen-1]._fileno;
- offm._offset = recs[recsLen-1]._offset;
-
- // Update overlapID for each record.
-
- for (uint32 rr=0; rr<recsLen; rr++) {
- recs[rr]._overlapID += info._numOverlapsTotal;
-
- if (recs[rr]._numOlaps > 0)
- assert(recs[rr]._overlapID == totalOverlaps);
-
- totalOverlaps += recs[rr]._numOlaps;
- }
-
- // Write the records, read next batch
-
- AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen);
-
- recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax);
- }
-
- delete [] recs;
-
- fclose(F);
- }
-
- // Update the info block to include the overlaps we just added
-
- info._smallestIID = MIN(info._smallestIID, infopiece._smallestIID);
- info._largestIID = MAX(info._largestIID, infopiece._largestIID);
-
- info._numOverlapsTotal += infopiece._numOverlapsTotal;
-
- fprintf(stderr, " Now finished with fragments "F_U64" to "F_U64" -- "F_U64" overlaps.\n",
- info._smallestIID, info._largestIID, info._numOverlapsTotal);
- }
-
- fclose(idx);
-
-
- // Dump the new store info file
-
- {
- sprintf(name, "%s/info", storePath);
-
- errno = 0;
- FILE *F = fopen(name, "w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
-
- AS_UTL_safeWrite(F, &info, "ovStore::mergeInfoFiles::finalInfo", sizeof(ovStoreInfo), 1);
-
- fclose(F);
- }
-
- fprintf(stderr, "\n");
- fprintf(stderr, "Index finalized for reads "F_U64" to "F_U64" with "F_U64" overlaps.\n",
- info._smallestIID,
- info._largestIID,
- info._numOverlapsTotal);
-}
-
-
-
-
-
-
-
-//
-//
-// For overlap store building, both sequential and parallel. Overlap filtering.
-//
-//
-
-
-
-#define OBT_FAR5PRIME (29)
-#define OBT_MIN_LENGTH (75)
-
-
-
-// Are the 5' end points very different? If the overlap is flipped, then, yes, they are.
-static
-bool
-isOverlapDifferent(ovOverlap &ol) {
- bool isDiff = true;
-
- if (ol.flipped() == false) {
- if (ol.a_bgn() > ol.b_bgn())
- isDiff = ((ol.a_bgn() - ol.b_bgn()) > OBT_FAR5PRIME) ? (true) : (false);
- else
- isDiff = ((ol.b_bgn() - ol.a_bgn()) > OBT_FAR5PRIME) ? (true) : (false);
- }
-
- return(isDiff);
-}
-
-
-// Is the overlap long?
-static
-bool
-isOverlapLong(ovOverlap &ol) {
- int32 ab = ol.a_bgn();
- int32 ae = ol.a_end();
- int32 bb = ol.b_bgn();
- int32 be = ol.b_end();
-
- int32 Alength = ae - ab;
- int32 Blength = be - bb;
-
- if (be < bb)
- Blength = bb - be;
-
- return(((Alength > OBT_MIN_LENGTH) && (Blength > OBT_MIN_LENGTH)) ? (true) : (false));
-}
-
-
-
-
-void
-ovStoreFilter::filterOverlap(ovOverlap &foverlap,
- ovOverlap &roverlap) {
-
- // Quick sanity check on IIDs.
-
- if ((foverlap.a_iid == 0) ||
- (foverlap.b_iid == 0) ||
- (foverlap.a_iid >= maxID) ||
- (foverlap.b_iid >= maxID)) {
- char ovlstr[256];
-
- fprintf(stderr, "Overlap has IDs out of range (maxID "F_U32"), possibly corrupt input data.\n", maxID);
- fprintf(stderr, " coords -- %s\n", foverlap.toString(ovlstr, ovOverlapAsCoords, false));
- fprintf(stderr, " hangs -- %s\n", foverlap.toString(ovlstr, ovOverlapAsHangs, false));
- exit(1);
- }
-
- // Make the reverse overlap (important, AFTER resetting the erate-based 'for' flags).
-
- roverlap.swapIDs(foverlap);
-
-
- // Ignore high error overlaps
-
- if ((foverlap.evalue() > maxEvalue)) {
- foverlap.dat.ovl.forUTG = false;
- foverlap.dat.ovl.forOBT = false;
- foverlap.dat.ovl.forDUP = false;
-
- roverlap.dat.ovl.forUTG = false;
- roverlap.dat.ovl.forOBT = false;
- roverlap.dat.ovl.forDUP = false;
-
- skipERATE++;
- skipERATE++;
- }
-
-
-
-
- // Don't OBT if not requested.
-
- if ((foverlap.dat.ovl.forOBT == false) && (skipReadOBT[foverlap.a_iid] == true)) {
- foverlap.dat.ovl.forOBT = false;
- skipOBT++;
- }
-
- if ((roverlap.dat.ovl.forOBT == false) && (skipReadOBT[roverlap.a_iid] == true)) {
- roverlap.dat.ovl.forOBT = false;
- skipOBT++;
- }
-
- // If either overlap is good for either obt or dup, compute if it is different and long. These
- // are the same for both foverlap and roverlap.
-
- bool isDiff = isOverlapDifferent(foverlap);
- bool isLong = isOverlapLong(foverlap);
-
- // Remove the bad-for-OBT overlaps.
-
- if ((isDiff == false) && (foverlap.dat.ovl.forOBT == true)) {
- foverlap.dat.ovl.forOBT = false;
- skipOBTbad++;
- }
-
- if ((isDiff == false) && (roverlap.dat.ovl.forOBT == true)) {
- roverlap.dat.ovl.forOBT = false;
- skipOBTbad++;
- }
-
- // Remove the too-short-for-OBT overlaps.
-
- if ((isLong == false) && (foverlap.dat.ovl.forOBT == true)) {
- foverlap.dat.ovl.forOBT = false;
- skipOBTshort++;
- }
-
- if ((isLong == false) && (roverlap.dat.ovl.forOBT == true)) {
- roverlap.dat.ovl.forOBT = false;
- skipOBTshort++;
- }
-
-
-
-
- // Don't dedupe if not requested.
-
- if ((foverlap.dat.ovl.forDUP == true) && (skipReadDUP[foverlap.a_iid] == true)) {
- foverlap.dat.ovl.forDUP = false;
- skipDUP++;
- }
-
- if ((roverlap.dat.ovl.forDUP == true) && (skipReadDUP[roverlap.b_iid] == true)) {
- roverlap.dat.ovl.forDUP = false;
- skipDUP++;
- }
-
- // Remove the bad-for-DUP overlaps.
-
-#if 0
- // Nah, do this in dedupe, since parameters can change.
- if ((isDiff == true) && (foverlap.dat.ovl.forDUP == true)) {
- foverlap.dat.ovl.forDUP = false;
- skipDUPdiff++;
- }
-
- if ((isDiff == true) && (roverlap.dat.ovl.forDUP == true)) {
- roverlap.dat.ovl.forDUP = false;
- skipDUPdiff++;
- }
-#endif
-
- // Can't have duplicates between libraries.
-
- if (((foverlap.dat.ovl.forDUP == true) ||
- (roverlap.dat.ovl.forDUP == true)) &&
- (gkp->gkStore_getRead(foverlap.a_iid)->gkRead_libraryID() != gkp->gkStore_getRead(foverlap.b_iid)->gkRead_libraryID())) {
-
- if ((foverlap.dat.ovl.forDUP == true)) {
- foverlap.dat.ovl.forDUP = false;
- skipDUPlib++;
- }
-
- if ((roverlap.dat.ovl.forDUP == true)) {
- roverlap.dat.ovl.forDUP = false;
- skipDUPlib++;
- }
- }
-
- // All done with the filtering, record some counts.
-
- if (foverlap.dat.ovl.forUTG == true) saveUTG++;
- if (foverlap.dat.ovl.forOBT == true) saveOBT++;
- if (foverlap.dat.ovl.forDUP == true) saveDUP++;
-
- if (roverlap.dat.ovl.forUTG == true) saveUTG++;
- if (roverlap.dat.ovl.forOBT == true) saveOBT++;
- if (roverlap.dat.ovl.forDUP == true) saveDUP++;
-}
-
-
-
-
-void
-ovStoreFilter::reportFate(void) {
- fprintf(stderr, "overlap fate:\n");
- fprintf(stderr, "%16"F_U64P" SAVE - overlaps output (for unitigging)\n", saveUTG);
- fprintf(stderr, "%16"F_U64P" SAVE - overlaps output (for OBT)\n", saveOBT);
- fprintf(stderr, "%16"F_U64P" SAVE - overlaps output (for dedupe)\n", saveDUP);
- fprintf(stderr, "\n");
- fprintf(stderr, "%16"F_U64P" ERATE - low quality, more than %.3f fraction error\n", skipERATE, AS_OVS_decodeEvalue(maxEvalue));
- fprintf(stderr, "\n");
- fprintf(stderr, "%16"F_U64P" OBT - not requested\n", skipOBT);
- fprintf(stderr, "%16"F_U64P" OBT - too similar\n", skipOBTbad);
- fprintf(stderr, "%16"F_U64P" OBT - too short\n", skipOBTshort);
- fprintf(stderr, "\n");
- fprintf(stderr, "%16"F_U64P" DUP - dedupe not requested\n", skipDUP);
- fprintf(stderr, "%16"F_U64P" DUP - different library\n", skipDUPlib);
- fprintf(stderr, "%16"F_U64P" DUP - obviously not duplicates\n", skipDUPdiff);
-}
-
-
-void
-ovStoreFilter::resetCounters(void) {
- saveUTG = 0;
- saveOBT = 0;
- saveDUP = 0;
-
- skipERATE = 0;
-
- skipOBT = 0;
- skipOBTbad = 0;
- skipOBTshort = 0;
-
- skipDUP = 0;
- skipDUPdiff = 0;
- skipDUPlib = 0;
-}
diff --git a/src/stores/ovStore.H b/src/stores/ovStore.H
index c178efa..d86a6b1 100644
--- a/src/stores/ovStore.H
+++ b/src/stores/ovStore.H
@@ -31,403 +31,166 @@
* full conditions and disclaimers for each license.
*/
-#ifndef AS_OVERLAP_H
-#define AS_OVERLAP_H
+#ifndef AS_OVSTORE_H
+#define AS_OVSTORE_H
#include "AS_global.H"
#include "gkStore.H"
-// Error rates are encoded as a 12-bit fixed-point value. This gives us up to 40.95% error, with
-// 0.01% resolution. Changing the number of bits WILL break the carefully structured
-// ovOverlapDAT.
-//
-// The decoded value is a double representing fraction error -- between 0.0000 and 1.0000.
-// The encoded value is an integer type (see the ovsOvelrapDAT below).
+#define SNAPPY
-#define AS_MAX_EVALUE_BITS 12
-#define AS_MAX_EVALUE ((1 << AS_MAX_EVALUE_BITS) - 1)
+#include "ovOverlap.H"
+#include "ovStoreFile.H"
+#include "ovStoreHistogram.H"
-#define AS_OVS_decodeEvalue(E) ((E) / 10000.0)
-#define AS_OVS_encodeEvalue(Q) (((Q) < AS_OVS_decodeEvalue(AS_MAX_EVALUE)) ? (int)(10000.0 * (Q) + 0.5) : AS_MAX_EVALUE)
-#define AS_MAX_ERATE AS_OVS_decodeEvalue(AS_MAX_EVALUE)
-// The old implementation allowed up to 20-bit reads, and used 3 32-bit words. No alignment was
-// stored.
-//
-// The new implementation uses either 5 (for EXACTLY 16-bit reads) or 6 32-bit words. It uses 2
-// 32-bit words for storing a pointer to the alignments.
-//
-// Note that the 5-word version must use uint32, and the 6-word version must use uint64.
-// Note that the 5-word version needs to split out the alignPos into two words.
+// Each magic number spells the quoted string when its bytes are read least-significant first.
+const uint64 ovStoreVersion = 2;
+const uint64 ovStoreMagic = 0x53564f3a756e6163; // == "canu:OVS" - store complete
+const uint64 ovStoreMagicIncomplete = 0x50564f3a756e6163; // == "canu:OVP" - store under construction
-#if AS_MAX_READLEN_BITS < 17
-
-#define ovOverlapNWORDS 5
-typedef uint32 ovOverlapWORD;
-#define F_OV F_U32
-#define F_OVP F_U32P
-
-class ovOverlapDAT {
-public:
- ovOverlapWORD ahg5 : AS_MAX_READLEN_BITS; // 16
- ovOverlapWORD ahg3 : AS_MAX_READLEN_BITS; // 16
-
- ovOverlapWORD bhg5 : AS_MAX_READLEN_BITS; // 16
- ovOverlapWORD bhg3 : AS_MAX_READLEN_BITS; // 16
-
- ovOverlapWORD span : AS_MAX_READLEN_BITS; // 16
- ovOverlapWORD evalue : AS_MAX_EVALUE_BITS; // 12
- ovOverlapWORD flipped : 1; // 1
-
- ovOverlapWORD forOBT : 1; // 1
- ovOverlapWORD forDUP : 1; // 1
- ovOverlapWORD forUTG : 1; // 1
-
- ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
- ovOverlapWORD alignFile : 19; // Which file of overlap alignments
- ovOverlapWORD alignPosHi : 12; // Position in that file (high-order bits)
- ovOverlapWORD alignPosLo : 32; // Position in that file (low-order bits)
-};
-
-#elif AS_MAX_READLEN_BITS < 22
-
-#define ovOverlapNWORDS 3
-typedef uint64 ovOverlapWORD;
-#define F_OV F_U64
-#define F_OVP F_U64P
-
-class ovOverlapDAT {
-public:
- ovOverlapWORD ahg5 : AS_MAX_READLEN_BITS; // 17-21
- ovOverlapWORD ahg3 : AS_MAX_READLEN_BITS; // 17-21
- ovOverlapWORD evalue : AS_MAX_EVALUE_BITS; // 12
- ovOverlapWORD flipped : 1; // 1
- ovOverlapWORD forOBT : 1; // 1
- ovOverlapWORD forDUP : 1; // 1
- ovOverlapWORD forUTG : 1; // 1
- ovOverlapWORD extra1 : 64 - 2 * AS_MAX_READLEN_BITS - AS_MAX_EVALUE_BITS - 1 - 1 - 1 - 1; // Between 15 and 7
-
- ovOverlapWORD bhg5 : AS_MAX_READLEN_BITS; // 17-21
- ovOverlapWORD bhg3 : AS_MAX_READLEN_BITS; // 17-21
- ovOverlapWORD span : AS_MAX_READLEN_BITS; // 17-21
- ovOverlapWORD extra2 : 64 - 3 * AS_MAX_READLEN_BITS; // Between 13 and 1
-
- ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
- ovOverlapWORD alignFile : 19; // Which file of overlap alignments
- ovOverlapWORD alignPos : 44; // Position in that file
-};
-
-#else
-
-#define ovOverlapNWORDS 8
-typedef uint32 ovOverlapWORD;
-#define F_OV F_U32
-#define F_OVP F_U32P
-
-class ovOverlapDAT {
-public:
- ovOverlapWORD ahg5;
- ovOverlapWORD ahg3;
- ovOverlapWORD bhg5;
- ovOverlapWORD bhg3;
- ovOverlapWORD span;
-
- ovOverlapWORD evalue : AS_MAX_EVALUE_BITS; // 12
- ovOverlapWORD flipped : 1; // 1
- ovOverlapWORD forOBT : 1; // 1
- ovOverlapWORD forDUP : 1; // 1
- ovOverlapWORD forUTG : 1; // 1
- ovOverlapWORD extra : 32 - AS_MAX_EVALUE_BITS - 1 - 1 - 1 - 1; // Between 15 and 7
-
- ovOverlapWORD alignSwapped : 1; // Our IDs are opposite those in the alignment
- ovOverlapWORD alignFile : 19; // Which file of overlap alignments
- ovOverlapWORD alignPosHi : 12; // Position in that file (high-order bits)
- ovOverlapWORD alignPosLo : 32; // Position in that file (low-order bits)
-};
-
-#endif
-
-
-
-enum ovOverlapDisplayType {
- ovOverlapAsHangs = 0, // Show a and b hang
- ovOverlapAsCoords = 1, // Show bgn,end for each read
- ovOverlapAsRaw = 2, // Show all four hangs
- ovOverlapAsCompat = 3, // Show in a format more-or-less compatible with CA 8.3
- ovOverlapAsPaf = 4, // Show in a format compatible with miniasm
-};
-
-
-
-
-class ovOverlap {
-private:
- ovOverlap() {
- clear();
- };
+class ovStoreInfo {
public:
- ovOverlap(gkStore *gkp) {
- g = gkp;
+ ovStoreInfo() {
clear();
};
- ~ovOverlap() {
+ void clear(void) {
+ _ovsMagic = ovStoreMagicIncomplete; // Appropriate for a new store.
+ _ovsVersion = ovStoreVersion;
+ _UNUSED = 0;
+ _smallestIID = UINT64_MAX;
+ _largestIID = 0;
+ _numOverlapsTotal = 0;
+ _highestFileIndex = 0;
+ _maxReadLenInBits = AS_MAX_READLEN_BITS;
};
- static
- ovOverlap *allocateOverlaps(gkStore *gkp, uint64 num) {
- ovOverlap *r = new ovOverlap [num];
-
- for (uint32 ii=0; ii<num; ii++)
- r[ii].g = gkp;
-
- return(r);
- };
+ bool load(const char *path, uint32 index=UINT32_MAX, bool temporary=false) {
+ char name[FILENAME_MAX];
+ if (temporary == false)
+ snprintf(name, FILENAME_MAX, "%s/info", path);
+ else
+ snprintf(name, FILENAME_MAX, "%s/%04u.info", path, index);
- // Dovetail if any of the following are true:
- // ahg3 == 0 && ahg5 == 0 (a is contained)
- // ahg3 == 0 && bhg5 == 0 (a3' dovetail b5')
- //
- // bhg3 == 0 && bhg5 == 0 (b is contained)
- // bhg3 == 0 && ahg5 == 0 (a5' dovetail b3')
- //
- // In general, if the 3' hang of either A or B is zero, AND the 5' hang of either A or B is zero.
- //
- uint32 overlapIsDovetail(void) const {
- return(((dat.ovl.ahg5 == 0) || (dat.ovl.bhg5 == 0)) &&
- ((dat.ovl.ahg3 == 0) || (dat.ovl.bhg3 == 0)));
- };
-
-
- // These assume that at most one of ahg5 and bhg5 (or 3') is positive. If two are positive, then the overlap is partial.
- //
- // The conversion from a_hang is trivial:
- // a_hang > 0 ---> ahg5 > 0 (and bhg5 == 0)
- // a_hang < 0 ---> bhg5 > 0 (and ahg5 == 0)
- //
- // b_hang > 0 ---> bhg3 > 0 (and ahg3 == 0)
- // b_hang < 0 ---> ahg3 > 0 (and bhg3 == 0)
- //
-
- // Convenience functions.
- int32 a_hang(void) const { return((int32)dat.ovl.ahg5 - (int32)dat.ovl.bhg5); };
- int32 b_hang(void) const { return((int32)dat.ovl.bhg3 - (int32)dat.ovl.ahg3); };
-
- void a_hang(int32 a) { dat.ovl.ahg5 = (a < 0) ? 0 : a; dat.ovl.bhg5 = (a < 0) ? -a : 0; };
- void b_hang(int32 b) { dat.ovl.bhg3 = (b < 0) ? 0 : b; dat.ovl.ahg3 = (b < 0) ? -b : 0; };
-
- // These return the actual coordinates on the read. For reverse B reads, the coordinates are in the reverse-complemented
- // sequence, and are returned as bgn > end to show this.
- uint32 a_bgn(void) const { return(dat.ovl.ahg5); };
- uint32 a_end(void) const { return(g->gkStore_getRead(a_iid)->gkRead_sequenceLength() - dat.ovl.ahg3); };
-
- uint32 b_bgn(void) const { return((dat.ovl.flipped) ? (g->gkStore_getRead(b_iid)->gkRead_sequenceLength() - dat.ovl.bhg5) : (dat.ovl.bhg5)); };
- uint32 b_end(void) const { return((dat.ovl.flipped) ? (dat.ovl.bhg3) : (g->gkStore_getRead(b_iid)->gkRead_sequenceLength() - dat.ovl.bhg3)); };
-
- uint32 span(void) const { return(dat.ovl.span); };
- void span(uint32 s) { dat.ovl.span = s; };
-
-#if 0
- // Return an approximate span as the average of the read span aligned.
- uint32 span(void) const {
- if (dat.ovl.span > 0)
- return(dat.ovl.span);
- else {
- uint32 ab = a_bgn(), ae = a_end();
- uint32 bb = b_bgn(), be = b_end();
-
- if (bb < be)
- return(((ae - ab) + (be - bb)) / 2);
- else
- return(((ae - ab) + (bb - be)) / 2);
+ if (AS_UTL_fileExists(name, false, false) == false) {
+ fprintf(stderr, "ERROR: directory '%s' is not an overlapStore; didn't find file '%s': %s\n",
+ path, name, strerror(errno));
+ return(false);
}
- }
-#endif
-
- void flipped(uint32 f) { dat.ovl.flipped = f; };
- uint32 flipped(void) const { return(dat.ovl.flipped == true); };
-
- void erate(double e) { dat.ovl.evalue = AS_OVS_encodeEvalue(e); };
- double erate(void) const { return(AS_OVS_decodeEvalue(dat.ovl.evalue)); };
-
- void evalue(uint64 e) { dat.ovl.evalue = e; };
- uint64 evalue(void) const { return(dat.ovl.evalue); };
- bool forOBT(void) { return(dat.ovl.forOBT); };
- bool forDUP(void) { return(dat.ovl.forDUP); };
- bool forUTG(void) { return(dat.ovl.forUTG); };
-
- // These are true only if the overlap is dovetail, which is the usual case, and isn't checked.
-
- uint32 overlapAEndIs5prime(void) const { return((dat.ovl.bhg5 > 0) && (dat.ovl.ahg3 > 0)); };
- uint32 overlapAEndIs3prime(void) const { return((dat.ovl.ahg5 > 0) && (dat.ovl.bhg3 > 0)); };
-
- uint32 overlapBEndIs5prime(void) const { return((overlapAEndIs5prime() && (dat.ovl.flipped == true)) ||
- (overlapAEndIs3prime() && (dat.ovl.flipped == false))); };
- uint32 overlapBEndIs3prime(void) const { return((overlapAEndIs5prime() && (dat.ovl.flipped == false)) ||
- (overlapAEndIs3prime() && (dat.ovl.flipped == true))); };
-
- uint32 overlapAIsContained(void) const { return((dat.ovl.ahg5 == 0) && (dat.ovl.ahg3 == 0)); };
- uint32 overlapBIsContainer(void) const { return((dat.ovl.ahg5 == 0) && (dat.ovl.ahg3 == 0)); };
-
- uint32 overlapAIsContainer(void) const { return((dat.ovl.bhg5 == 0) && (dat.ovl.bhg3 == 0)); };
- uint32 overlapBIsContained(void) const { return((dat.ovl.bhg5 == 0) && (dat.ovl.bhg3 == 0)); };
-
- // Test if the overlap is dovetail or partial.
-
- uint32 overlap5primeIsPartial(void) const { return((dat.ovl.ahg5 > 0) && (dat.ovl.bhg5 > 0)); };
- uint32 overlap3primeIsPartial(void) const { return((dat.ovl.ahg3 > 0) && (dat.ovl.bhg3 > 0)); };
-
- uint32 overlapIsPartial(void) const { return(overlap5primeIsPartial() || overlap3primeIsPartial()); };
-
- char *toString(char *str, ovOverlapDisplayType type, bool newLine);
+ errno = 0;
+ FILE *ovsinfo = fopen(name, "r");
+ if (errno) {
+ fprintf(stderr, "ERROR: directory '%s' is not an overlapStore; failed to open '%s': %s\n",
+ path, name, strerror(errno));
+ return(false);
+ }
- void swapIDs(ovOverlap const &orig);
+ AS_UTL_safeRead(ovsinfo, this, "ovStore::ovStore::info", sizeof(ovStoreInfo), 1);
- void clear(void) {
- dat.dat[0] = 0;
- dat.dat[1] = 0;
- dat.dat[2] = 0;
-#if (ovOverlapNWORDS == 5)
- dat.dat[3] = 0;
- dat.dat[4] = 0;
-#endif
- };
+ fclose(ovsinfo);
- bool
- operator<(const ovOverlap &that) const {
- if (a_iid < that.a_iid) return(true);
- if (a_iid > that.a_iid) return(false);
- if (b_iid < that.b_iid) return(true);
- if (b_iid > that.b_iid) return(false);
- if (dat.dat[0] < that.dat.dat[0]) return(true);
- if (dat.dat[0] > that.dat.dat[0]) return(false);
- if (dat.dat[1] < that.dat.dat[1]) return(true);
- if (dat.dat[1] > that.dat.dat[1]) return(false);
- if (dat.dat[2] < that.dat.dat[2]) return(true);
- if (dat.dat[2] > that.dat.dat[2]) return(false);
-#if (ovOverlapNWORDS == 5)
- if (dat.dat[3] < that.dat.dat[3]) return(true);
- if (dat.dat[3] > that.dat.dat[3]) return(false);
- if (dat.dat[4] < that.dat.dat[4]) return(true);
- if (dat.dat[4] > that.dat.dat[4]) return(false);
-#endif
- return(false);
+ return(true);
};
-public:
- gkStore *g;
-
-public:
- uint32 a_iid;
- uint32 b_iid;
-
- union {
- ovOverlapWORD dat[ovOverlapNWORDS];
- ovOverlapDAT ovl;
- } dat;
-};
+  //  Return true if 'path' holds a completed overlap store: 'path/info' is read into this
+  //  object (replacing its contents) and must carry ovStoreMagic.  Returns false when
+  //  AS_UTL_fileExists() fails for 'path/info'; exits if that file cannot be opened.
+  //
+  bool      test(const char *path) {
+    char  name[FILENAME_MAX];
+    snprintf(name, FILENAME_MAX, "%s/info", path);
-// The default, no flags, is to open for normal overlaps, read only. Normal overlaps mean they
-// have only the B id, i.e., they are in a fully built store.
-//
-// Output of overlapper (input to store building) should be ovFileFullWrite. The specialized
-// ovFileFullWriteNoCounts is used internally by store creation.
-//
-enum ovFileType {
- ovFileNormal = 0, // Reading of b_id overlaps (aka store files)
- ovFileNormalWrite = 1, // Writing of b_id overlaps
- ovFileFull = 2, // Reading of a_id+b_id overlaps (aka dump files)
- ovFileFullWrite = 3, // Writing of a_id+b_id overlaps
- ovFileFullWriteNoCounts = 4 // Writing of a_id+b_id overlaps, omitting the counts of olaps per read
-};
+    if (AS_UTL_fileExists(name, false, false) == false)
+      return(false);
+
+    //  Test the returned stream rather than errno: a successful fopen() is allowed to leave
+    //  errno non-zero.
+
+    FILE *ovsinfo = fopen(name, "r");
+    if (ovsinfo == NULL)
+      fprintf(stderr, "ERROR: failed to load '%s'; can't check if this is a valid ovStore: %s\n",
+              name, strerror(errno)), exit(1);
+
+    AS_UTL_safeRead(ovsinfo, this, "ovStore::ovStore::info", sizeof(ovStoreInfo), 1);
+
+    //  Close the stream; it was previously left open, leaking one FILE per call.
+
+    fclose(ovsinfo);
+
+    return(checkMagic());
+  };
-class ovFile {
-public:
- ovFile(const char *name,
- ovFileType type = ovFileNormal,
- uint32 bufferSize = 1 * 1024 * 1024);
- ~ovFile();
+ void save(const char *path, uint32 index=UINT32_MAX, bool temporary=false) {
+ char name[FILENAME_MAX];
- void flushOverlaps(void);
+ if (temporary == false)
+ snprintf(name, FILENAME_MAX, "%s/info", path);
+ else
+ snprintf(name, FILENAME_MAX, "%s/%04u.info", path, index);
- void writeOverlap(ovOverlap *overlap);
- void writeOverlaps(ovOverlap *overlaps, uint64 overlapLen);
+ if (temporary == false) {
+ _ovsMagic = ovStoreMagic;
+ _ovsVersion = ovStoreVersion;
+ _highestFileIndex = index;
+ } else {
+ }
- bool readOverlap(ovOverlap *overlap);
- uint64 readOverlaps(ovOverlap *overlaps, uint64 overlapMax);
+ errno = 0;
+ FILE *ovsinfo = fopen(name, "w");
+ if (errno)
+ fprintf(stderr, "ERROR: failed to save '%s': %s\n", name, strerror(errno)), exit(1);
- void seekOverlap(off_t overlap);
+ AS_UTL_safeWrite(ovsinfo, this, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1);
- // The size of an overlap record is 1 or 2 IDs + the size of a word times the number of words.
- uint64 recordSize(void) {
- return(sizeof(uint32) * ((_isNormal) ? 1 : 2) + sizeof(ovOverlapWORD) * ovOverlapNWORDS);
+ fclose(ovsinfo);
};
-private:
- uint32 _bufferLen; // length of valid data in the buffer
- uint32 _bufferPos; // position the read is at in the buffer
- uint32 _bufferMax; // allocated size of the buffer
- uint32 *_buffer;
-
- uint32 _olapsPerReadAlloc;
- uint32 _olapsPerReadLast;
- uint32 *_olapsPerRead;
+ void addOverlap(uint32 id, uint32 nOverlaps=1) {
+ if (_smallestIID > id) _smallestIID = id;
+ if (_largestIID < id) _largestIID = id;
- bool _isOutput; // if true, we can writeOverlap()
- bool _isSeekable; // if true, we can seekOverlap()
- bool _isNormal; // if true, 3 words per overlap, else 4
-
- compressedFileReader *_reader;
- compressedFileWriter *_writer;
-
- char _prefix[FILENAME_MAX];
- FILE *_file;
-};
+ _numOverlapsTotal += nOverlaps;
+ };
+ bool checkIncomplete(void) { return(_ovsMagic == ovStoreMagicIncomplete); };
+ bool checkMagic(void) { return(_ovsMagic == ovStoreMagic); };
+ bool checkVersion(void) { return(_ovsVersion == ovStoreVersion); };
+ bool checkSize(void) { return(_maxReadLenInBits == AS_MAX_READLEN_BITS); };
+ uint32 getVersion(void) { return((uint32)_ovsVersion); };
+ uint32 getCurrentVersion(void) { return((uint32)ovStoreVersion); };
+ uint32 getSize(void) { return((uint32)_maxReadLenInBits); };
+ uint64 numOverlaps(void) { return(_numOverlapsTotal); };
+ uint32 smallestID(void) { return(_smallestIID); };
+ uint32 largestID(void) { return(_largestIID); };
+ uint32 lastFileIndex(void) { return(_highestFileIndex); };
-class ovStoreInfo {
private:
uint64 _ovsMagic;
uint64 _ovsVersion;
- uint64 _UNUSED;
+ uint64 _UNUSED; // needed to keep the file layout the same
uint64 _smallestIID; // smallest frag iid in the store
uint64 _largestIID; // largest frag iid in the store
uint64 _numOverlapsTotal; // number of overlaps in the store
uint64 _highestFileIndex;
uint64 _maxReadLenInBits; // length of a fragment
+};
- friend class ovStore;
- friend
- void writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
- friend
- bool
- testIndex(char *storePath,
- bool doFixes);
-
- friend
- void
- mergeInfoFiles(char *storePath,
- uint32 nPieces);
-};
+class ovStoreOfft {
+public:
+ ovStoreOfft() {
+ clear();
+ };
+ ~ovStoreOfft() {
+ };
+ void clear(void) {
+ _a_iid = 0;
+ _fileno = 0;
+ _offset = 0;
+ _numOlaps = 0;
+ _overlapID = 0;
+ };
-class ovStoreOfft {
private:
uint32 _a_iid; // read ID for this block of overlaps.
@@ -437,20 +200,14 @@ private:
uint64 _overlapID; // overlapID for the first overlap in this block. in memory, this is the id of the next overlap.
- void clear(void) {
- _a_iid = 0;
- _fileno = 0;
- _offset = 0;
- _numOlaps = 0;
- _overlapID = 0;
- };
-
friend class ovStore;
+ friend class ovStoreWriter;
friend
void
- writeOverlaps(char *storePath,
- ovOverlap *ovls,
+ writeOverlaps(gkStore *gkp,
+ char *storePath,
+ ovOverlap *ovls,
uint64 ovlsLen,
uint32 fileID);
@@ -466,22 +223,70 @@ private:
};
-// The default here is to open a read only store.
-//
-enum ovStoreType {
- ovStoreReadOnly = 0,
- ovStoreWrite = 1, // Open for write, fail if one exists already
- ovStoreOverwrite = 2, // Open for write, and obliterate an existing store
-};
+// Writer side of an overlap store. Two construction modes are declared:
+// sequential (a single writer fed sorted overlaps one at a time) and
+// parallel (one writer per sorted file, with per-file metadata merged afterwards).
+class ovStoreWriter {
+public:
+ ~ovStoreWriter();
+
+ // For sequential construction, there is only a constructor, destructor and writeOverlap().
+ // Overlaps must be sorted by a_iid (then b_iid) already.
+
+ ovStoreWriter(const char *path, gkStore *gkp);
+
+ void writeOverlap(ovOverlap *olap);
+
+ // For parallel construction, usage is much more complicated. The constructor
+ // will write a single file of sorted overlaps, and each file has its own metadata.
+ // After all files are written, the metadata is merged into one file.
+
+ ovStoreWriter(const char *path, gkStore *gkp, uint32 fileLimit, uint32 fileID, uint32 jobIdxMax);
+
+ void writeOverlaps(ovOverlap *ovls, uint64 ovlsLen);
+
+ uint64 loadBucketSizes(uint64 *bucketSizes);
+ void loadOverlapsFromSlice(uint32 slice, uint64 expectedLen, ovOverlap *ovls, uint64& ovlsLen);
+ void removeOverlapSlice(void);
+
+ void mergeInfoFiles(void);
+ void mergeHistogram(void);
+
+ bool testIndex(bool doFixes);
+
+ void checkSortingIsComplete(void);
+ void removeAllIntermediateFiles(void);
-class ovStore {
private:
- void ovStore_read(void);
- void ovStore_write(void);
+ char _storePath[FILENAME_MAX];
+
+ ovStoreInfo _info;
+ gkStore *_gkp;
+
+ FILE *_offtFile; // For writing overlaps, a place to dump ovStoreOfft's.
+ ovStoreOfft _offt; // For writing overlaps, the current ovStoreOfft.
+ ovStoreOfft _offm; // For writing overlaps, an empty ovStoreOfft, for reads with no overlaps.
+
+ memoryMappedFile *_evaluesMap;
+ uint16 *_evalues;
+
+ uint64 _overlapsThisFile; // Count of the number of overlaps written so far
+ uint64 _overlapsThisFileMax; // NOTE(review): presumably the per-file cap compared against _overlapsThisFile -- confirm in ovStoreWriter.C
+ uint32 _currentFileIndex;
+ ovFile *_bof;
+
+ ovStoreHistogram *_histogram; // When constructing a sequential store, collects all the stats from each file
+
+ // Parallel store support
+ uint32 _fileLimit; // number of slices used in bucketizing/sorting
+ uint32 _fileID; // index of the overlap file we're processing
+ uint32 _jobIdxMax; // total number of overlap files
+};
+
+
+
+class ovStore {
public:
- ovStore(const char *name, gkStore *gkp, ovStoreType cType=ovStoreReadOnly);
+ ovStore(const char *name, gkStore *gkp);
~ovStore();
// Read the next overlap from the store. Return value is the number of overlaps read.
@@ -513,32 +318,23 @@ public:
uint64 numOverlapsInRange(void);
uint32 * numOverlapsPerFrag(uint32 &firstFrag, uint32 &lastFrag);
- // The (mostly) private interface for adding overlaps to a store. Overlaps must be sorted already.
-
- void writeOverlap(ovOverlap *olap);
- void writeOverlap(ovOverlap *overlap, uint32 maxOverlapsThisFile);
-
- // Write a block of sorted overlaps to store file 'fileID', saving the info and index into
- // 'fileID.info' and 'fileID.index'
-
- friend
- void writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
-
// Add new evalues for reads between bgnID and endID. No checking of IDs is done, but the number
// of evalues must agree.
+ void addEvalues(vector<char *> &fileList);
void addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen);
+ // Return the statistics associated with this store
+
+ ovStoreHistogram *getHistogram(void) {
+ return(new ovStoreHistogram(_gkp, _storePath));
+ };
+
private:
char _storePath[FILENAME_MAX];
- bool _isOutput;
-
ovStoreInfo _info;
+ gkStore *_gkp;
uint32 _firstIIDrequested;
uint32 _lastIIDrequested;
@@ -553,30 +349,9 @@ private:
uint64 _overlapsThisFile; // Count of the number of overlaps written so far
uint32 _currentFileIndex;
ovFile *_bof;
-
- gkStore *_gkp;
};
-// This should be part of ovStore, but when it is used, in ovStoreSorter, we don't
-// have a store opened.
-void
-writeOverlaps(char *storePath,
- ovOverlap *ovls,
- uint64 ovlsLen,
- uint32 fileID);
-
-bool
-testIndex(char *storePath,
- bool doFixes);
-
-void
-mergeInfoFiles(char *storePath,
- uint32 nPieces);
-
-
-
-
@@ -584,66 +359,28 @@ mergeInfoFiles(char *storePath,
class ovStoreFilter {
public:
- ovStoreFilter(gkStore *gkp_, double maxErate) {
- gkp = gkp_;
-
- resetCounters();
-
- maxID = gkp->gkStore_getNumReads() + 1;
- maxEvalue = AS_OVS_encodeEvalue(maxErate);
-
- skipReadOBT = new char [maxID];
- skipReadDUP = new char [maxID];
-
- memset(skipReadOBT, 0, sizeof(char) * maxID);
- memset(skipReadDUP, 0, sizeof(char) * maxID);
-
-
- uint32 numSkipOBT = 0;
- uint32 numSkipDUP = 0;
-
- fprintf(stderr, "Marking fragments to skip overlap based trimming.\n");
-
- fprintf(stderr, "LIB 1 - dup=%d trim=%d spur=%d chimera=%d subreads=%d\n",
- gkp->gkStore_getLibrary(1)->gkLibrary_removeDuplicateReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_finalTrim(),
- gkp->gkStore_getLibrary(1)->gkLibrary_removeSpurReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_removeChimericReads(),
- gkp->gkStore_getLibrary(1)->gkLibrary_checkForSubReads());
-
- for (uint64 iid=0; iid<maxID; iid++) {
- uint32 Lid = gkp->gkStore_getRead(iid)->gkRead_libraryID();
- gkLibrary *L = gkp->gkStore_getLibrary(Lid);
-
- if ((L->gkLibrary_removeDuplicateReads() == false) &&
- (L->gkLibrary_finalTrim() == GK_FINALTRIM_NONE) &&
- (L->gkLibrary_removeSpurReads() == false) &&
- (L->gkLibrary_removeChimericReads() == false) &&
- (L->gkLibrary_checkForSubReads() == false)) {
- numSkipOBT++;
- skipReadOBT[iid] = true;
- }
+ ovStoreFilter(gkStore *gkp_, double maxErate);
+ ~ovStoreFilter();
- if (L->gkLibrary_removeDuplicateReads() == false) {
- numSkipDUP++;
- skipReadDUP[iid] = true;
- }
- }
+ void filterOverlap(ovOverlap &foverlap,
+ ovOverlap &roverlap);
- fprintf(stderr, "Marked "F_U32" reads so skip OBT, "F_U32" reads to skip dedupe.\n", numSkipOBT, numSkipDUP);
- };
+ //void reportFate(void);
+ void resetCounters(void);
- ~ovStoreFilter() {
- delete [] skipReadOBT;
- delete [] skipReadDUP;
- };
+ uint64 savedUnitigging(void) { return(saveUTG); };
+ uint64 savedTrimming(void) { return(saveOBT); };
+ uint64 savedDedupe(void) { return(saveDUP); };
+ uint64 filteredErate(void) { return(skipERATE); };
- void filterOverlap(ovOverlap &foverlap,
- ovOverlap &roverlap);
+ uint64 filteredNoTrim(void) { return(skipOBT); };
+ uint64 filteredBadTrim(void) { return(skipOBTbad); };
+ uint64 filteredShortTrim(void) { return(skipOBTshort); };
- void reportFate(void);
- void resetCounters(void);
+ uint64 filteredNoDedupe(void) { return(skipDUP); };
+ uint64 filteredNotDupe(void) { return(skipDUPdiff); };
+ uint64 filteredDiffLib(void) { return(skipDUPlib); };
public:
gkStore *gkp;
@@ -665,12 +402,9 @@ public:
uint64 skipDUPdiff; // Overlap isn't remotely similar
uint64 skipDUPlib;
- // Not really stats, but global state for the filter.
-
- char *skipReadOBT;
+ char *skipReadOBT; // State of the filter.
char *skipReadDUP;
};
-
-#endif
+#endif // AS_OVSTORE_H
diff --git a/src/stores/ovStoreBucketizer.C b/src/stores/ovStoreBucketizer.C
index 6dd4ea8..afa4e66 100644
--- a/src/stores/ovStoreBucketizer.C
+++ b/src/stores/ovStoreBucketizer.C
@@ -47,7 +47,8 @@
static
void
-writeToFile(ovOverlap *overlap,
+writeToFile(gkStore *gkp,
+ ovOverlap *overlap,
ovFile **sliceFile,
uint32 sliceFileMax,
uint64 *sliceSize,
@@ -61,8 +62,8 @@ writeToFile(ovOverlap *overlap,
if (sliceFile[df] == NULL) {
char name[FILENAME_MAX];
- sprintf(name, "%s/create%04d/slice%03d%s", ovlName, jobIndex, df, (useGzip) ? ".gz" : "");
- sliceFile[df] = new ovFile(name, ovFileFullWriteNoCounts);
+ snprintf(name, FILENAME_MAX, "%s/create%04d/slice%03d%s", ovlName, jobIndex, df, (useGzip) ? ".gz" : "");
+ sliceFile[df] = new ovFile(gkp, name, ovFileFullWriteNoCounts);
sliceSize[df] = 0;
}
@@ -72,68 +73,6 @@ writeToFile(ovOverlap *overlap,
-// These are duplicated between ovStoreBucketizer and ovStoreBuild
-
-static
-void
-markOBT(gkStore *gkp, uint32 maxIID, char *skipRead) {
- uint64 numMarked = 0;
-
- if (skipRead == NULL)
- return;
-
- fprintf(stderr, "Marking fragments to skip overlap based trimming.\n");
-
- for (uint64 iid=0; iid<maxIID; iid++) {
- uint32 Lid = gkp->gkStore_getRead(iid)->gkRead_libraryID();
- gkLibrary *L = gkp->gkStore_getLibrary(Lid);
-
- if (L == NULL)
- continue;
-
- if ((L->gkLibrary_removeDuplicateReads() == false) &&
- (L->gkLibrary_finalTrim() != GK_FINALTRIM_LARGEST_COVERED) &&
- (L->gkLibrary_removeSpurReads() == false) &&
- (L->gkLibrary_removeChimericReads() == false)) {
- numMarked++;
- skipRead[iid] = true;
- }
- }
-
- fprintf(stderr, "Marked "F_U64" fragments.\n", numMarked);
-}
-
-
-static
-void
-markDUP(gkStore *gkp, uint32 maxIID, char *skipRead) {
- uint64 numMarked = 0;
-
- if (skipRead == NULL)
- return;
-
- fprintf(stderr, "Marking fragments to skip deduplication.\n");
-
- for (uint64 iid=0; iid<maxIID; iid++) {
- uint32 Lid = gkp->gkStore_getRead(iid)->gkRead_libraryID();
- gkLibrary *L = gkp->gkStore_getLibrary(Lid);
-
- if (L == NULL)
- continue;
-
- if (L->gkLibrary_removeDuplicateReads() == false) {
- numMarked++;
- skipRead[iid] = true;
- }
- }
-
- fprintf(stderr, "Marked "F_U64" fragments.\n", numMarked);
-}
-
-
-
-
-
int
main(int argc, char **argv) {
char *ovlName = NULL;
@@ -149,7 +88,7 @@ main(int argc, char **argv) {
char *ovlInput = NULL;
- bool useGzip = true;
+ bool useGzip = false;
argc = AS_configure(argc, argv);
@@ -178,11 +117,12 @@ main(int argc, char **argv) {
maxErrorRate = atof(argv[++arg]);
maxError = AS_OVS_encodeEvalue(maxErrorRate);
- } else if (strcmp(argv[arg], "-raw") == 0) {
- useGzip = false;
+ } else if (strcmp(argv[arg], "-gzip") == 0) {
+ useGzip = true;
} else {
fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
+ err++;
}
arg++;
@@ -199,13 +139,13 @@ main(int argc, char **argv) {
err++;
if (err) {
- fprintf(stderr, "usage: %s -O asm.ovlStore -G asm.gkpStore -i file.ovb.gz -job j [opts]\n", argv[0]);
+ fprintf(stderr, "usage: %s -O asm.ovlStore -G asm.gkpStore -i file.ovb -job j [opts]\n", argv[0]);
fprintf(stderr, " -O asm.ovlStore path to store to create\n");
fprintf(stderr, " -G asm.gkpStore path to gkpStore for this assembly\n");
fprintf(stderr, "\n");
fprintf(stderr, " -C config path to previously created ovStoreBuild config data file\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -i file.ovb.gz input overlaps\n");
+ fprintf(stderr, " -i file.ovb[.gz] input overlaps\n");
fprintf(stderr, " -job j index of this overlap input file\n");
fprintf(stderr, "\n");
fprintf(stderr, " -F f use up to 'f' files for store creation\n");
@@ -215,7 +155,7 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, " -e e filter overlaps above e fraction error\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -raw write uncompressed buckets\n");
+ fprintf(stderr, " -gzip compress buckets even more\n");
fprintf(stderr, "\n");
fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n");
fprintf(stderr, " DANGER DANGER\n");
@@ -234,7 +174,7 @@ main(int argc, char **argv) {
if (jobIndex == 0)
fprintf(stderr, "ERROR: No job index (-job) supplied.\n");
if (fileLimit > maxFiles)
- fprintf(stderr, "ERROR: Too many jobs (-F); only "F_SIZE_T" supported on this architecture.\n", maxFiles);
+ fprintf(stderr, "ERROR: Too many jobs (-F); only " F_U32 " supported on this architecture.\n", maxFiles);
exit(1);
}
@@ -249,7 +189,7 @@ main(int argc, char **argv) {
{
char name[FILENAME_MAX];
- sprintf(name, "%s/create%04d", ovlName, jobIndex);
+ snprintf(name, FILENAME_MAX, "%s/create%04d", ovlName, jobIndex);
if (AS_UTL_fileExists(name, TRUE, FALSE) == false)
AS_UTL_mkdir(name);
@@ -261,7 +201,7 @@ main(int argc, char **argv) {
{
char name[FILENAME_MAX];
- sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, jobIndex);
+ snprintf(name, FILENAME_MAX, "%s/bucket%04d/sliceSizes", ovlName, jobIndex);
if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
fprintf(stderr, "Job finished; file '%s' exists.\n", name), exit(0);
@@ -285,7 +225,7 @@ main(int argc, char **argv) {
AS_UTL_safeRead(C, iidToBucket, "iidToBucket", sizeof(uint32), maxIID);
if (maxIIDtest != maxIID)
- fprintf(stderr, "ERROR: maxIID in store ("F_U32") differs from maxIID in config file ("F_U32").\n",
+ fprintf(stderr, "ERROR: maxIID in store (" F_U32 ") differs from maxIID in config file (" F_U32 ").\n",
maxIID, maxIIDtest), exit(1);
}
@@ -296,7 +236,7 @@ main(int argc, char **argv) {
memset(sliceFile, 0, sizeof(ovFile *) * (fileLimit + 1));
memset(sliceSize, 0, sizeof(uint64) * (fileLimit + 1));
- fprintf(stderr, "maxError fraction: %.3f percent: %.3f encoded: "F_U64"\n",
+ fprintf(stderr, "maxError fraction: %.3f percent: %.3f encoded: " F_U64 "\n",
maxErrorRate, maxErrorRate * 100, maxError);
fprintf(stderr, "Bucketizing %s\n", ovlInput);
@@ -304,7 +244,7 @@ main(int argc, char **argv) {
ovStoreFilter *filter = new ovStoreFilter(gkp, maxError);
ovOverlap foverlap(gkp);
ovOverlap roverlap(gkp);
- ovFile *inputFile = new ovFile(ovlInput, ovFileFull);
+ ovFile *inputFile = new ovFile(gkp, ovlInput, ovFileFull);
// Do bigger buffers increase performance? Do small ones hurt?
//AS_OVS_setBinaryOverlapFileBufferSize(2 * 1024 * 1024);
@@ -317,18 +257,19 @@ main(int argc, char **argv) {
if ((foverlap.dat.ovl.forUTG == true) ||
(foverlap.dat.ovl.forOBT == true) ||
(foverlap.dat.ovl.forDUP == true))
- writeToFile(&foverlap, sliceFile, fileLimit, sliceSize, iidToBucket, ovlName, jobIndex, useGzip);
+ writeToFile(gkp, &foverlap, sliceFile, fileLimit, sliceSize, iidToBucket, ovlName, jobIndex, useGzip);
if ((roverlap.dat.ovl.forUTG == true) ||
(roverlap.dat.ovl.forOBT == true) ||
(roverlap.dat.ovl.forDUP == true))
- writeToFile(&roverlap, sliceFile, fileLimit, sliceSize, iidToBucket, ovlName, jobIndex, useGzip);
+ writeToFile(gkp, &roverlap, sliceFile, fileLimit, sliceSize, iidToBucket, ovlName, jobIndex, useGzip);
}
delete inputFile;
- filter->reportFate();
- filter->resetCounters();
+#warning not reporting fate
+ //filter->reportFate();
+ //filter->resetCounters();
delete filter;
@@ -341,7 +282,7 @@ main(int argc, char **argv) {
char name[FILENAME_MAX];
char finl[FILENAME_MAX];
- sprintf(name, "%s/create%04d/sliceSizes", ovlName, jobIndex);
+ snprintf(name, FILENAME_MAX, "%s/create%04d/sliceSizes", ovlName, jobIndex);
FILE *F = fopen(name, "w");
if (errno)
@@ -351,8 +292,8 @@ main(int argc, char **argv) {
fclose(F);
- sprintf(name, "%s/create%04d", ovlName, jobIndex);
- sprintf(finl, "%s/bucket%04d", ovlName, jobIndex);
+ snprintf(name, FILENAME_MAX, "%s/create%04d", ovlName, jobIndex);
+ snprintf(finl, FILENAME_MAX, "%s/bucket%04d", ovlName, jobIndex);
errno = 0;
rename(name, finl);
diff --git a/src/stores/ovStoreBuild.C b/src/stores/ovStoreBuild.C
index 0c66ce7..a6f1a08 100644
--- a/src/stores/ovStoreBuild.C
+++ b/src/stores/ovStoreBuild.C
@@ -61,6 +61,47 @@ using namespace std;
//
#define ovOverlapSortSize (sizeof(ovOverlap))
+
+
+// Apply evalue updates to an overlap store: open the store at 'ovlName' (no
+// gkStore is supplied), hand 'fileList' to ovStore::addEvalues(), destroy the
+// store object, then log completion to stderr.
+static
+void
+addEvalues(char *ovlName, vector<char *> &fileList) {
+ ovStore *ovs = new ovStore(ovlName, NULL);
+
+ ovs->addEvalues(fileList);
+
+ delete ovs;
+
+ fprintf(stderr, "- Evalues updated.\n");
+}
+
+
+
+// Save the read-to-bucket partitioning to 'configOut' (the binary partitioning
+// file that the usage text says is consumed by ovStoreBucketizer).
+//
+// The file holds 'maxIID' followed by the 'maxIID' entries of 'iidToBucket'.
+// Data is written to '<configOut>.WORKING' and renamed to 'configOut' only after
+// the write has finished, so a partially written file never appears under the
+// final name.
+//
+// Releases 'iidToBucket' with delete[] before returning; the caller must not
+// use it afterwards. Exits the process if the file cannot be opened or renamed.
+void
+reportConfiguration(char *configOut, uint32 maxIID, uint32 *iidToBucket) {
+ char F[FILENAME_MAX];
+
+ snprintf(F, FILENAME_MAX, "%s.WORKING", configOut);
+
+ // Open the temporary name, not 'configOut': the rename below needs 'F' to exist.
+ errno = 0;
+ FILE *C = fopen(F, "w");
+ if (C == NULL)
+ fprintf(stderr, "Failed to open config output file '%s': %s\n", F, strerror(errno)), exit(1);
+
+ AS_UTL_safeWrite(C, &maxIID, "maxIID", sizeof(uint32), 1);
+ AS_UTL_safeWrite(C, iidToBucket, "iidToBucket", sizeof(uint32), maxIID);
+
+ fclose(C);
+
+ // Only now expose the finished file under its final name.
+ errno = 0;
+ if (rename(F, configOut) != 0)
+ fprintf(stderr, "Failed to rename '%s' to '%s': %s\n", F, configOut, strerror(errno)), exit(1);
+
+ delete [] iidToBucket;
+
+ fprintf(stderr, "- Saved configuration to '%s'.\n", configOut);
+}
+
+
+
static
uint32 *
computeIIDperBucket(uint32 fileLimit,
@@ -68,8 +109,23 @@ computeIIDperBucket(uint32 fileLimit,
uint64 maxMemory,
uint32 maxIID,
vector<char *> &fileList) {
- uint32 *iidToBucket = new uint32 [maxIID];
- uint32 maxFiles = MIN(floor(sysconf(_SC_CHILD_MAX) / 2), sysconf(_SC_OPEN_MAX) - 16);
+ uint32 *iidToBucket = new uint32 [maxIID];
+ int64 procMax = sysconf(_SC_CHILD_MAX);
+ int64 openMax = sysconf(_SC_OPEN_MAX);
+ int64 maxFiles = 0;
+
+ // As of late August 2016, the ovb files are not gzip compressed, and do not need to use an
+ // external process to decompress. The support for limiting by number of processes is left in -
+ // but disabled - because it's really just these three lines here, and because the code still
+ // supports gzip inputs.
+
+ if (openMax > 16)
+ openMax -= 16;
+
+ if (procMax > 8192) // Once saw a case where procMax was 18,446,744,073,709,551,615 (2^64-1)
+ procMax = 8192; // and openMax was 262,144. It didn't end well.
+
+ maxFiles = openMax; // MIN(procMax, openMax); ENABLE THIS TO LIMIT PROCESSES TOO.
// If we're reading from stdin, not much we can do but divide the IIDs equally per file. Note
// that the IIDs must be consecutive; the obvious, simple and clean division of 'mod' won't work.
@@ -102,63 +158,24 @@ computeIIDperBucket(uint32 fileLimit,
return(iidToBucket);
}
- // Otherwise, we have files, and should have counts.
-
- uint32 *overlapsPerRead = new uint32 [maxIID]; // Sum over all files.
-
- memset(overlapsPerRead, 0, sizeof(uint32) * maxIID);
-
- // For each overlap file, find the counts file and merge into overlapsPerRead.
-
- for (uint32 i=0; i<fileList.size(); i++) {
- char countsName[FILENAME_MAX];
-
- strcpy(countsName, fileList[i]);
-
- char *slash = strrchr(countsName, '/');
- char *dot = strchr((slash == NULL) ? countsName : slash, '.');
-
- if (dot)
- *dot = 0;
-
- strcat(countsName, ".counts");
-
- errno = 0;
- FILE *C = fopen(countsName, "r");
- if (errno)
- fprintf(stderr, "failed to open counts file '%s' for reading: %s\n", countsName, strerror(errno)), exit(1);
-
- uint32 perLen = 0;
- uint32 *per = NULL;
-
- AS_UTL_safeRead(C, &perLen, "perLen", sizeof(uint32), 1);
- AS_UTL_safeRead(C, per = new uint32 [perLen], "per", sizeof(uint32), perLen);
+ // Otherwise, we have files, and should have counts. Load them!
- fclose(C);
+ ovStoreHistogram *hist = new ovStoreHistogram();
+ uint32 *oPR = NULL;
- //fprintf(stderr, "Summing overlap counts for %u reads from '%s'.\n", perLen, countsName);
+ allocateArray(oPR, maxIID);
- assert(perLen <= maxIID);
+ for (uint32 i=0; i<fileList.size(); i++)
+ hist->loadData(fileList[i]);
- for (uint32 ii=0; ii<perLen; ii++)
- overlapsPerRead[ii] += per[ii];
+ uint64 numOverlaps = hist->getOverlapsPerRead(oPR, maxIID);
- delete [] per;
- }
-
- // How many overlaps?
+ delete hist; hist = NULL;
- uint64 numOverlaps = 0;
+ if (numOverlaps == 0)
+ fprintf(stderr, "Found no overlaps to sort.\n"), exit(1);
- for (uint32 ii=0; ii<maxIID; ii++)
- numOverlaps += overlapsPerRead[ii];
-
- if (numOverlaps == 0) {
- fprintf(stderr, "Found no overlaps to sort.\n");
- exit(1);
- }
-
- fprintf(stderr, "Found "F_U64" (%.2f million) overlaps.\n", numOverlaps, numOverlaps / 1000000.0);
+ fprintf(stderr, "Found " F_U64 " (%.2f million) overlaps.\n", numOverlaps, numOverlaps / 1000000.0);
// Partition the overlaps into buckets.
@@ -168,7 +185,7 @@ computeIIDperBucket(uint32 fileLimit,
// If a file limit, distribute the overlaps to equal sized files.
if (fileLimit > 0) {
olapsPerBucketMax = (uint64)ceil((double)numOverlaps / (double)fileLimit);
- fprintf(stderr, "Will sort using "F_U32" files; "F_U64" (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
+ fprintf(stderr, "Will sort using " F_U32 " files; " F_U64 " (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
fileLimit, olapsPerBucketMax, olapsPerBucketMax / 1000000.0, olapsPerBucketMax * GBperOlap);
}
@@ -178,42 +195,67 @@ computeIIDperBucket(uint32 fileLimit,
// values can break this - either too low memory or too high allowed open files (an OS limit).
if (maxMemory > 0) {
- fprintf(stderr, "Configuring for %.2f GB to %.2f GB memory.\n",
+ fprintf(stderr, "Configuring for %.2f GB to %.2f GB memory and " F_S64 " open files.\n",
minMemory / 1024.0 / 1024.0 / 1024.0,
- maxMemory / 1024.0 / 1024.0 / 1024.0);
+ maxMemory / 1024.0 / 1024.0 / 1024.0,
+ maxFiles);
- if (minMemory < MEMORY_OVERHEAD + ovOverlapSortSize)
+ if (minMemory < MEMORY_OVERHEAD + ovOverlapSortSize) {
+ fprintf(stderr, "Reset minMemory from " F_U64 " to " F_SIZE_T "\n", minMemory, MEMORY_OVERHEAD + ovOverlapSortSize);
minMemory = MEMORY_OVERHEAD + ovOverlapSortSize;
+ }
- uint64 incr = (maxMemory - minMemory) / 1000;
- if (incr < 1)
- incr = 1;
+ uint64 incr = (maxMemory - minMemory) / 128;
+ if (incr < 1024 * 1024)
+ incr = 1024 * 1024;
- // iterate until we can fit the files into file system limits.
+ uint64 useMemory = minMemory;
- do {
- olapsPerBucketMax = (minMemory - MEMORY_OVERHEAD) / ovOverlapSortSize;
- minMemory += incr;
- } while ((minMemory <= maxMemory) &&
- (numOverlaps / olapsPerBucketMax + 1 > 0.50 * maxFiles));
+ // Compute the initial number of overlaps per bucket, based on the smallest memory allowed.
- // Should we prefer finding 0.50 * maxFiles/2 (as above) but allow up to, say, 0.75 * maxFiles if 0.50 can't be satisfied?
- // Is the 0.5 scaling because we open two files per bucket? Seems very tight if so.
+ olapsPerBucketMax = (useMemory - MEMORY_OVERHEAD) / ovOverlapSortSize;
+
+ // Find the smallest memory size that uses fewer files than the OS allows.
+
+ for (;
+ ((useMemory <= maxMemory) && (numOverlaps / olapsPerBucketMax + 1 > maxFiles));
+ useMemory += incr) {
+ olapsPerBucketMax = (useMemory - MEMORY_OVERHEAD) / ovOverlapSortSize;
+ fprintf(stderr, "At memory %.3fGB, " F_U64 " olaps per bucket, " F_U64 " buckets (pass 1).\n",
+ useMemory / 1024.0 / 1024.0 / 1024.0, olapsPerBucketMax, numOverlaps / olapsPerBucketMax + 1);
+ }
+
+ // If we're at less than half the max, make buckets a little bit bigger to reduce the open file
+ // count. This helps when multiple bucketizer jobs get scheduled to the same node.
+
+ if (useMemory < minMemory + (maxMemory - minMemory) / 2) {
+ for (;
+ ((useMemory <= maxMemory) && (numOverlaps / olapsPerBucketMax + 1 > maxFiles / 2));
+ useMemory += incr) {
+ olapsPerBucketMax = (useMemory - MEMORY_OVERHEAD) / ovOverlapSortSize;
+ fprintf(stderr, "At memory %.3fGB, " F_U64 " olaps per bucket, " F_U64 " buckets (pass 2).\n",
+ useMemory / 1024.0 / 1024.0 / 1024.0, olapsPerBucketMax, numOverlaps / olapsPerBucketMax + 1);
+ }
+ }
// Give up if we hit our max limit.
if ((minMemory > maxMemory) ||
- (numOverlaps / olapsPerBucketMax + 1) > 0.50 * maxFiles) {
+ (numOverlaps / olapsPerBucketMax + 1) > maxFiles) {
fprintf(stderr, "ERROR: Cannot sort %.2f million overlaps using %.2f GB memory; too few file handles available.\n",
numOverlaps / 1000000.0,
maxMemory / 1024.0 / 1024.0 / 1024.0);
- fprintf(stderr, "ERROR: olapsPerBucket "F_U64"\n", olapsPerBucketMax);
- fprintf(stderr, "ERROR: buckets "F_U64"\n", numOverlaps / olapsPerBucketMax + 1);
+ fprintf(stderr, "ERROR: minMemory " F_U64 "\n", minMemory);
+ fprintf(stderr, "ERROR: maxMemory " F_U64 "\n", maxMemory);
+ fprintf(stderr, "ERROR: olapsPerBucket " F_U64 "\n", olapsPerBucketMax);
+ fprintf(stderr, "ERROR: buckets " F_U64 "\n", numOverlaps / olapsPerBucketMax + 1);
+ fprintf(stderr, "ERROR: SC_CHILD_MAX " F_S64 "\n", (int64)sysconf(_SC_CHILD_MAX));
+ fprintf(stderr, "ERROR: SC_OPEN_MAX " F_S64 "\n", (int64)sysconf(_SC_OPEN_MAX));
fprintf(stderr, "ERROR: Increase memory size (in canu, ovsMemory; in ovStoreBuild, -M)\n");
exit(1);
}
- fprintf(stderr, "Will sort using "F_U64" files; "F_U64" (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
+ fprintf(stderr, "Will sort using " F_U64 " files; " F_U64 " (%.2f million) overlaps per bucket; %.2f GB memory per bucket\n",
numOverlaps / olapsPerBucketMax + 1,
olapsPerBucketMax,
olapsPerBucketMax / 1000000.0,
@@ -228,7 +270,7 @@ computeIIDperBucket(uint32 fileLimit,
uint32 bucket = 1;
for (uint32 ii=0; ii<maxIID; ii++) {
- olaps += overlapsPerRead[ii];
+ olaps += oPR[ii];
iidToBucket[ii] = bucket;
if (olaps >= olapsPerBucketMax) {
@@ -247,25 +289,26 @@ computeIIDperBucket(uint32 fileLimit,
uint32 bucket = 1;
for (uint32 ii=0; ii<maxIID; ii++) {
- olaps += overlapsPerRead[ii];
+ olaps += oPR[ii];
iidToBucket[ii] = bucket;
if (olaps >= olapsPerBucketMax) {
- fprintf(stderr, " bucket %3d has "F_U64" olaps.\n", bucket, olaps);
+ fprintf(stderr, " bucket %3d has " F_U64 " olaps.\n", bucket, olaps);
olaps = 0;
bucket++;
}
}
- fprintf(stderr, " bucket %3d has "F_U64" olaps.\n", bucket, olaps);
+ fprintf(stderr, " bucket %3d has " F_U64 " olaps.\n", bucket, olaps);
}
fprintf(stderr, "Will sort %.3f million overlaps per bucket, using %u buckets %.2f GB per bucket.\n",
olapsPerBucketMax / 1000000.0,
iidToBucket[maxIID-1],
olapsPerBucketMax * GBperOlap + MEMORY_OVERHEAD / 1024.0 / 1024.0 / 1024.0);
+ fprintf(stderr, "\n");
- delete [] overlapsPerRead;
+ delete hist;
return(iidToBucket);
}
@@ -274,7 +317,8 @@ computeIIDperBucket(uint32 fileLimit,
static
void
-writeToDumpFile(ovOverlap *overlap,
+writeToDumpFile(gkStore *gkp,
+ ovOverlap *overlap,
ovFile **dumpFile,
uint64 *dumpLength,
uint32 *iidToBucket,
@@ -285,9 +329,9 @@ writeToDumpFile(ovOverlap *overlap,
if (dumpFile[df] == NULL) {
char name[FILENAME_MAX];
- sprintf(name, "%s/tmp.sort.%03d", ovlName, df);
- fprintf(stderr, "CREATE bucket '%s'\n", name);
- dumpFile[df] = new ovFile(name, ovFileFullWriteNoCounts);
+ snprintf(name, FILENAME_MAX, "%s/tmp.sort.%03d", ovlName, df);
+ fprintf(stderr, "-- Create bucket '%s'\n", name);
+ dumpFile[df] = new ovFile(gkp, name, ovFileFullWriteNoCounts);
dumpLength[df] = 0;
}
@@ -397,118 +441,70 @@ main(int argc, char **argv) {
fprintf(stderr, " -evalues input files are evalue updates from overlap error adjustment\n");
fprintf(stderr, " -config out.dat don't build a store, just dump a binary partitioning file for ovStoreBucketizer\n");
fprintf(stderr, "\n");
+ fprintf(stderr, "Sizes and Limits:\n");
+ fprintf(stderr, " ovOverlapSortSize " F_S32 " bytes\n", (int32)ovOverlapSortSize);
+ fprintf(stderr, " SC_CHILD_MAX " F_S32 " processes\n", (int32)sysconf(_SC_CHILD_MAX));
+ fprintf(stderr, " SC_OPEN_MAX " F_S32 " files\n", (int32)sysconf(_SC_OPEN_MAX));
+ fprintf(stderr, "\n");
if (ovlName == NULL)
- fprintf(stderr, "ERROR: No overlap store (-o) supplied.\n");
+ fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n");
if (gkpName == NULL)
- fprintf(stderr, "ERROR: No gatekeeper store (-g) supplied.\n");
+ fprintf(stderr, "ERROR: No gatekeeper store (-G) supplied.\n");
if (fileList.size() == 0)
fprintf(stderr, "ERROR: No input overlap files (-L or last on the command line) supplied.\n");
if (fileLimit > sysconf(_SC_OPEN_MAX) - 16)
- fprintf(stderr, "ERROR: Too many jobs (-F); only "F_SIZE_T" supported on this architecture.\n", sysconf(_SC_OPEN_MAX) - 16);
+ fprintf(stderr, "ERROR: Too many jobs (-F); only " F_SIZE_T " supported on this architecture.\n", sysconf(_SC_OPEN_MAX) - 16);
if (maxMemory < MEMORY_OVERHEAD)
fprintf(stderr, "ERROR: Memory (-M) must be at least %.3f GB to account for overhead.\n", MEMORY_OVERHEAD / 1024.0 / 1024.0 / 1024.0);
exit(1);
}
+ // If only updating evalues, do it and quit.
-
-
- if (eValues) {
- ovStore *ovs = new ovStore(ovlName, NULL);
-
- for (uint32 i=0; i<fileList.size(); i++) {
- errno = 0;
- FILE *fp = fopen(fileList[i], "r");
- if (errno)
- fprintf(stderr, "Failed to open evalues file '%s': %s\n", fileList[i], strerror(errno));
-
- uint32 bgnID = 0;
- uint32 endID = 0;
- uint64 len = 0;
-
- fprintf(stderr, "loading evalues from '%s'\n", fileList[i]);
-
- AS_UTL_safeRead(fp, &bgnID, "loid", sizeof(uint32), 1);
- AS_UTL_safeRead(fp, &endID, "hiid", sizeof(uint32), 1);
- AS_UTL_safeRead(fp, &len, "len", sizeof(uint64), 1);
-
- uint16 *evalues = new uint16 [len];
-
- AS_UTL_safeRead(fp, evalues, "evalues", sizeof(uint16), len);
-
- fclose(fp);
-
- fprintf(stderr, "loading evalues from '%s' -- ID range "F_U32"-"F_U32" with "F_U64" overlaps\n",
- fileList[i], bgnID, endID, len);
-
- ovs->addEvalues(bgnID, endID, evalues, len);
-
- delete [] evalues;
- }
-
- delete ovs;
-
- exit(0);
- }
-
-
+ if (eValues)
+ addEvalues(ovlName, fileList), exit(0);
// Open reads, figure out a partitioning scheme.
gkStore *gkp = gkStore::gkStore_open(gkpName);
- uint64 maxIID = gkp->gkStore_getNumReads() + 1;
+ uint32 maxIID = gkp->gkStore_getNumReads() + 1;
uint32 *iidToBucket = computeIIDperBucket(fileLimit, minMemory, maxMemory, maxIID, fileList);
uint32 maxFiles = sysconf(_SC_OPEN_MAX);
if (iidToBucket[maxIID-1] > maxFiles - 8) {
fprintf(stderr, "ERROR:\n");
- fprintf(stderr, "ERROR: Operating system limit of "F_U32" open files. The current -F/-M settings\n", maxFiles);
- fprintf(stderr, "ERROR: will need to create "F_U32" files to construct the store.\n", iidToBucket[maxIID-1]);
+ fprintf(stderr, "ERROR: Operating system limit of " F_U32 " open files. The current -F/-M settings\n", maxFiles);
+ fprintf(stderr, "ERROR: will need to create " F_U32 " files to construct the store.\n", iidToBucket[maxIID-1]);
fprintf(stderr, "ERROR:\n");
exit(1);
}
+ // But if only asked to report the configuration, do it and quit.
-
- // Dump the configuration if told to.
-
- if (configOut) {
- errno = 0;
- FILE *C = fopen(configOut, "w");
- if (errno)
- fprintf(stderr, "Failed to open config output file '%s': %s\n", configOut, strerror(errno)), exit(1);
-
- AS_UTL_safeWrite(C, &maxIID, "maxIID", sizeof(uint32), 1);
- AS_UTL_safeWrite(C, iidToBucket, "iidToBucket", sizeof(uint32), maxIID);
-
- fclose(C);
-
- delete [] iidToBucket;
-
- gkp->gkStore_close();
-
- fprintf(stderr, "saved configuration to '%s'.\n", configOut);
-
- exit(0);
- }
-
-
+ if (configOut)
+ reportConfiguration(configOut, maxIID, iidToBucket), gkp->gkStore_close(), exit(0);
// Read the gkStore to determine which fragments we care about.
ovStoreFilter *filter = new ovStoreFilter(gkp, maxError);
+ //
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "-- BUCKETIZING --\n");
+ fprintf(stderr, "\n");
+
// And load reads into the store! We used to create the store before filtering, so it could fail
// quicker, but the filter should be much faster with the mmap()'d gkpStore in canu.
- ovStore *storeFile = new ovStore(ovlName, gkp, ovStoreWrite);
+ ovStoreWriter *store = new ovStoreWriter(ovlName, gkp);
- uint32 dumpFileMax = iidToBucket[maxIID-1] + 1;
- ovFile **dumpFile = new ovFile * [dumpFileMax];
- uint64 *dumpLength = new uint64 [dumpFileMax];
+ uint32 dumpFileMax = iidToBucket[maxIID-1] + 1;
+ ovFile **dumpFile = new ovFile * [dumpFileMax];
+ uint64 *dumpLength = new uint64 [dumpFileMax];
memset(dumpFile, 0, sizeof(ovFile *) * dumpFileMax);
memset(dumpLength, 0, sizeof(uint64) * dumpFileMax);
@@ -517,9 +513,9 @@ main(int argc, char **argv) {
ovOverlap foverlap(gkp);
ovOverlap roverlap(gkp);
- fprintf(stderr, "bucketizing %s\n", fileList[i]);
+ fprintf(stderr, "- Bucketizing '%s'\n", fileList[i]);
- ovFile *inputFile = new ovFile(fileList[i], ovFileFull);
+ ovFile *inputFile = new ovFile(gkp, fileList[i], ovFileFull);
while (inputFile->readOverlap(&foverlap)) {
filter->filterOverlap(foverlap, roverlap); // The filter copies f into r
@@ -532,43 +528,60 @@ main(int argc, char **argv) {
if ((foverlap.dat.ovl.forUTG == true) ||
(foverlap.dat.ovl.forOBT == true) ||
(foverlap.dat.ovl.forDUP == true))
- writeToDumpFile(&foverlap, dumpFile, dumpLength, iidToBucket, ovlName);
+ writeToDumpFile(gkp, &foverlap, dumpFile, dumpLength, iidToBucket, ovlName);
if ((roverlap.dat.ovl.forUTG == true) ||
(roverlap.dat.ovl.forOBT == true) ||
(roverlap.dat.ovl.forDUP == true))
- writeToDumpFile(&roverlap, dumpFile, dumpLength, iidToBucket, ovlName);
+ writeToDumpFile(gkp, &roverlap, dumpFile, dumpLength, iidToBucket, ovlName);
}
delete inputFile;
+ }
+
+ for (uint32 i=0; i<dumpFileMax; i++)
+ delete dumpFile[i];
+
+ // Report the fate of filtering
- // AFTER EVERY FILE
+ fprintf(stderr, "- Bucketizing finished:\n");
- filter->reportFate();
- filter->resetCounters();
+ if (filter->savedDedupe() > 0) {
+ fprintf(stderr, "-- Saved " F_U64 " dedupe overlaps\n", filter->savedDedupe());
+ fprintf(stderr, "-- Discarded " F_U64 " don't care " F_U64 " different library " F_U64 " obviously not duplicates\n", filter->filteredNoDedupe(), filter->filteredNotDupe(), filter->filteredDiffLib());
}
- delete filter;
+ if (filter->savedTrimming() > 0) {
+ fprintf(stderr, "-- Saved " F_U64 " trimming overlaps\n", filter->savedTrimming());
+ fprintf(stderr, "-- Discarded " F_U64 " don't care " F_U64 " too similar " F_U64 " too short\n", filter->filteredNoTrim(), filter->filteredBadTrim(), filter->filteredShortTrim());
+ }
- for (uint32 i=0; i<dumpFileMax; i++)
- delete dumpFile[i];
+ if (filter->savedUnitigging() > 0) {
+ fprintf(stderr, "-- Saved " F_U64 " unitigging overlaps\n", filter->savedUnitigging());
+ }
+
+ if (filter->filteredErate() > 0)
+ fprintf(stderr, "-- Discarded " F_U64 " low quality, more than %.4f fraction error\n", filter->filteredErate(), maxError);
- fprintf(stderr, "bucketizing DONE!\n");
+ delete filter;
//
// Read each bucket, sort it, and dump it to the store
//
+ fprintf(stderr, "\n");
+ fprintf(stderr, "-- SORTING --\n");
+ fprintf(stderr, "\n");
+
uint64 dumpLengthMax = 0;
for (uint32 i=0; i<dumpFileMax; i++)
if (dumpLengthMax < dumpLength[i])
dumpLengthMax = dumpLength[i];
+ ovStoreHistogram *histogram = new ovStoreHistogram;
ovOverlap *overlapsort = ovOverlap::allocateOverlaps(gkp, dumpLengthMax);
- time_t beginTime = time(NULL);
-
for (uint32 i=0; i<dumpFileMax; i++) {
char name[FILENAME_MAX];
ovFile *bof = NULL;
@@ -580,10 +593,10 @@ main(int argc, char **argv) {
// directly....BUT....we can't do that because the AS_OVS interface is rearranging the data to
// make sure the store is cross-platform compatible.
- sprintf(name, "%s/tmp.sort.%03d", ovlName, i);
- fprintf(stderr, "reading %s (%ld)\n", name, time(NULL) - beginTime);
+ snprintf(name, FILENAME_MAX, "%s/tmp.sort.%03d", ovlName, i);
+ fprintf(stderr, "- Loading '%s'\n", name);
- bof = new ovFile(name, ovFileFull);
+ bof = new ovFile(gkp, name, ovFileFull);
uint64 numOvl = 0;
while (bof->readOverlap(overlapsort + numOvl)) {
@@ -596,8 +609,8 @@ main(int argc, char **argv) {
(overlapsort[numOvl].b_iid >= maxIID)) {
char ovlstr[256];
- fprintf(stderr, "Overlap has IDs out of range (maxIID "F_U64"), possibly corrupt input data.\n", maxIID);
- fprintf(stderr, " Aid "F_U32" Bid "F_U32"\n", overlapsort[numOvl].a_iid, overlapsort[numOvl].b_iid);
+ fprintf(stderr, "Overlap has IDs out of range (maxIID " F_U32 "), possibly corrupt input data.\n", maxIID);
+ fprintf(stderr, " Aid " F_U32 " Bid " F_U32 "\n", overlapsort[numOvl].a_iid, overlapsort[numOvl].b_iid);
exit(1);
}
@@ -606,7 +619,6 @@ main(int argc, char **argv) {
delete bof;
-
assert(numOvl == dumpLength[i]);
assert(numOvl <= dumpLengthMax);
@@ -616,7 +628,7 @@ main(int argc, char **argv) {
unlink(name);
- fprintf(stderr, "sorting %s (%ld)\n", name, time(NULL) - beginTime);
+ fprintf(stderr, "- Sorting\n");
#ifdef _GLIBCXX_PARALLEL
// If we have the parallel STL, don't use it! Sort is not inplace!
@@ -625,12 +637,17 @@ main(int argc, char **argv) {
sort(overlapsort, overlapsort + dumpLength[i]);
#endif
- fprintf(stderr, "writing %s (%ld)\n", name, time(NULL) - beginTime);
+ fprintf(stderr, "- Writing\n");
+
for (uint64 x=0; x<dumpLength[i]; x++)
- storeFile->writeOverlap(overlapsort + x);
+ store->writeOverlap(overlapsort + x);
}
- delete storeFile;
+ fprintf(stderr, "\n");
+ fprintf(stderr, "-- FINISHING --\n");
+ fprintf(stderr, "\n");
+
+ delete store;
delete [] overlapsort;
gkp->gkStore_close();
diff --git a/src/stores/ovStoreDump.C b/src/stores/ovStoreDump.C
index 853d7ec..e6bcdd8 100644
--- a/src/stores/ovStoreDump.C
+++ b/src/stores/ovStoreDump.C
@@ -57,20 +57,23 @@
enum dumpOp {
- OP_NONE = 1,
- OP_DUMP = 2,
- OP_DUMP_PICTURE = 3
+ OP_NONE = 1,
+ OP_DUMP = 2,
+ OP_DUMP_PICTURE = 3
};
enum dumpFlags {
- NO_5p = 1,
- NO_3p = 2,
- NO_CONTAINED = 4,
- NO_CONTAINS = 8,
+ NO_5p = 1,
+ NO_3p = 2,
+ NO_CONTAINED = 4,
+ NO_CONTAINS = 8,
NO_CONTAINED_READS = 16,
NO_SUSPICIOUS_READS = 32,
- NO_SINGLETON_READS = 64
+ NO_SINGLETON_READS = 64,
+ WITH_ERATE = 128,
+ WITH_LENGTH = 256,
+ ONE_SIDED = 512
};
@@ -122,17 +125,17 @@ bogartStatus::bogartStatus(const char *prefix, uint32 nReads) {
errno = 0;
- sprintf(N, "%s.edges", prefix);
+ snprintf(N, FILENAME_MAX, "%s.edges", prefix);
FILE *E = fopen(N, "r");
if (errno)
fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1);
- sprintf(N, "%s.edges.suspicious", prefix);
+ snprintf(N, FILENAME_MAX, "%s.edges.suspicious", prefix);
FILE *S = fopen(N, "r");
if (errno)
fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1);
- sprintf(N, "%s.singletons", prefix);
+ snprintf(N, FILENAME_MAX, "%s.singletons", prefix);
FILE *G = fopen(N, "r");
if (errno)
fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1);
@@ -221,20 +224,20 @@ bogartStatus::bogartStatus(const char *prefix, uint32 nReads) {
//
void
-dumpStore(ovStore *ovlStore,
- gkStore *gkpStore,
- bool asBinary,
- bool asCounts,
- double dumpERate,
- uint32 dumpType,
- uint32 dumpLength,
- uint32 bgnID,
- uint32 endID,
- uint32 qryID,
+dumpStore(ovStore *ovlStore,
+ gkStore *gkpStore,
+ bool asBinary,
+ bool asCounts,
+ bool asErateLen,
+ double dumpERate,
+ uint32 dumpLength,
+ uint32 dumpType,
+ uint32 bgnID,
+ uint32 endID,
+ uint32 qryID,
ovOverlapDisplayType type,
- bool beVerbose,
- bool oneSided,
- char *bestPrefix) {
+ bool beVerbose,
+ char *bestPrefix) {
ovOverlap overlap(gkpStore);
uint64 evalue = AS_OVS_encodeEvalue(dumpERate);
@@ -251,24 +254,52 @@ dumpStore(ovStore *ovlStore,
uint32 obtDumped = 0;
uint32 merDumped = 0;
- uint32 *counts = (asCounts) ? new uint32 [endID - bgnID + 1] : NULL;
+ uint32 *counts = NULL;
+ ovStoreHistogram *hist = NULL;
+
+ // Set the range of the reads to dump early so that we can reset it later.
+
+ ovlStore->setRange(bgnID, endID);
+
+ // If we're dumping counts, and there are modifiers, we need to scan all overlaps
+
+ if ((asCounts) && (dumpType != 0)) {
+ counts = new uint32 [endID - bgnID + 1];
- if (asCounts)
for (uint32 ii=bgnID; ii<=endID; ii++)
counts[ii - bgnID] = 0;
+ }
- ovlStore->setRange(bgnID, endID);
+ // If we're dumping counts, and no modifiers, we can just ask the store for the counts
+ // and set the range to null.
+
+ if ((asCounts) && (dumpType == 0)) {
+ counts = ovlStore->numOverlapsPerFrag(bgnID, endID);
+ ovlStore->setRange(1, 0);
+ }
+
+ // If we're dumping the erate-vs-length histogram, and no modifiers, grab it from the store and
+ // set the range to null. Otherwise, allocate a new one.
+
+ if ((asErateLen) && (dumpType == 0)) {
+ hist = ovlStore->getHistogram();
+ ovlStore->setRange(1, 0);
+ }
+
+ if ((asErateLen) && (dumpType > 0)) {
+ hist = new ovStoreHistogram(gkpStore, ovFileNormalWrite);
+ }
// Length filtering is expensive to compute, need to load both reads to get their length.
//
- //if ((dumpLength > 0) && (dumpLength < overlapLength(overlap)))
+ //if ((dumpType & WITH_LENGTH) && (dumpLength < overlapLength(overlap)))
// continue;
while (ovlStore->readOverlap(&overlap) == TRUE) {
if ((qryID != 0) && (qryID != overlap.b_iid))
continue;
- if (overlap.evalue() > evalue) {
+ if ((dumpType & WITH_ERATE) && (overlap.evalue() > evalue)) {
ovlTooHighError++;
continue;
}
@@ -296,21 +327,24 @@ dumpStore(ovStore *ovlStore,
continue;
}
- if (oneSided == true && overlap.a_iid >= overlap.b_iid) {
+ if ((dumpType & ONE_SIDED) && (overlap.a_iid >= overlap.b_iid)) {
ovlNotUnique++;
continue;
}
ovlDumped++;
- // The toString() method is quite slow, all from sprintf().
+ // The toString() method is quite slow, all from snprintf().
// Without both the puts() and AtoString(), a dump ran in 3 seconds.
// With both, 138 seconds.
// Without the puts(), 127 seconds.
- if (asCounts)
+ if (asCounts)
counts[overlap.a_iid - bgnID]++;
+ else if (asErateLen)
+ hist->addOverlap(&overlap);
+
else if (asBinary)
AS_UTL_safeWrite(stdout, &overlap, "dumpStore", sizeof(ovOverlap), 1);
@@ -318,11 +352,17 @@ dumpStore(ovStore *ovlStore,
fputs(overlap.toString(ovlString, type, true), stdout);
}
- if (asCounts)
+ if (asCounts) {
for (uint32 ii=bgnID; ii<=endID; ii++)
fprintf(stdout, "%u\t%u\n", ii, counts[ii - bgnID]);
+ }
+
+ if (asErateLen) {
+ hist->dumpEvalueLength(stdout);
+ }
delete [] counts;
+ delete hist;
if (beVerbose) {
fprintf(stderr, "ovlTooHighError %u\n", ovlTooHighError);
@@ -418,8 +458,8 @@ dumpPicture(ovOverlap *overlaps,
// For the A read, find the points in our string representation where the overlap ends.
- uint32 ovlStrBgn = ovlBgnA * 100 / frgLenA + MHS;
- uint32 ovlStrEnd = ovlEndA * 100 / frgLenA + MHS;
+ uint32 ovlStrBgn = (int32)floor(ovlBgnA * 100.0 / frgLenA + MHS);
+ uint32 ovlStrEnd = (int32)ceil (ovlEndA * 100.0 / frgLenA + MHS);
// Fill the string representation with spaces, then fill the string with dashes where the read
// is, add an arrow, and terminate the string.
@@ -489,7 +529,7 @@ dumpPicture(ovOverlap *overlaps,
char str[256];
int32 len;
- sprintf(str, "+%d", ovlBgnHang);
+ snprintf(str, 256, "+%d", ovlBgnHang);
len = strlen(str);
for (int32 i=0; i<len; i++)
@@ -498,7 +538,7 @@ dumpPicture(ovOverlap *overlaps,
// Append the end hang.
if (ovlEndHang > 0) {
- sprintf(ovl + ovlStrEnd, " +%d", ovlEndHang);
+ snprintf(ovl + ovlStrEnd, 256 - ovlStrEnd, " +%d", ovlEndHang);
}
// Set flags for best edge and singleton/contained/suspicious. Left in for when I get annoyed with the different lines.
@@ -565,7 +605,7 @@ dumpPicture(ovStore *ovlStore,
uint32 qryID,
char *bestPrefix) {
- //fprintf(stderr, "DUMPING PICTURE for ID "F_U32" in store %s (gkp %s)\n",
+ //fprintf(stderr, "DUMPING PICTURE for ID " F_U32 " in store %s (gkp %s)\n",
// qryID, ovlName, gkpName);
uint32 Aid = qryID;
@@ -656,6 +696,7 @@ main(int argc, char **argv) {
bool asBinary = false;
bool asCounts = false;
+ bool asErateLen = false;
double dumpERate = 1.0;
uint32 dumpLength = 0;
@@ -668,7 +709,6 @@ main(int argc, char **argv) {
uint32 qryID = 0;
bool beVerbose = false;
- bool oneSided = false;
char *bestPrefix = NULL;
@@ -730,12 +770,19 @@ main(int argc, char **argv) {
else if (strcmp(argv[arg], "-counts") == 0)
asCounts = true;
+ else if (strcmp(argv[arg], "-eratelen") == 0)
+ asErateLen = true;
+
// standard bulk dump options
- else if (strcmp(argv[arg], "-E") == 0)
+ else if (strcmp(argv[arg], "-E") == 0) {
dumpERate = atof(argv[++arg]);
+ dumpType |= WITH_ERATE;
+ }
- else if (strcmp(argv[arg], "-L") == 0)
+ else if (strcmp(argv[arg], "-L") == 0) {
dumpLength = atoi(argv[++arg]);
+ dumpType |= WITH_LENGTH;
+ }
else if (strcmp(argv[arg], "-d5") == 0)
dumpType |= NO_5p;
@@ -753,7 +800,7 @@ main(int argc, char **argv) {
beVerbose = true;
else if (strcmp(argv[arg], "-unique") == 0)
- oneSided = true;
+ dumpType |= ONE_SIDED;
else if (strcmp(argv[arg], "-best") == 0)
bestPrefix = argv[++arg];
@@ -799,6 +846,7 @@ main(int argc, char **argv) {
fprintf(stderr, " -paf dump overlaps in miniasm/minimap format\n");
fprintf(stderr, " -binary dump overlap as raw binary data\n");
fprintf(stderr, " -counts dump the number of overlaps per read\n");
+ fprintf(stderr, " -eratelen dump a heatmap of error-rate vs overlap-length\n");
fprintf(stderr, "\n");
fprintf(stderr, " MODIFIERS (for -d and -p)\n");
fprintf(stderr, "\n");
@@ -837,7 +885,16 @@ main(int argc, char **argv) {
switch (operation) {
case OP_DUMP:
- dumpStore(ovlStore, gkpStore, asBinary, asCounts, dumpERate, dumpLength, dumpType, bgnID, endID, qryID, type, beVerbose, oneSided, bestPrefix);
+ dumpStore(ovlStore,
+ gkpStore,
+ asBinary, asCounts, asErateLen,
+ dumpERate,
+ dumpLength,
+ dumpType,
+ bgnID, endID, qryID,
+ type,
+ beVerbose,
+ bestPrefix);
break;
case OP_DUMP_PICTURE:
for (qryID=bgnID; qryID <= endID; qryID++)
diff --git a/src/stores/ovStoreFile.C b/src/stores/ovStoreFile.C
index e822566..46a3fe0 100644
--- a/src/stores/ovStoreFile.C
+++ b/src/stores/ovStoreFile.C
@@ -46,11 +46,20 @@
#include "ovStore.H"
+#ifdef SNAPPY
+#include "snappy.h"
+#endif
+
+// The histogram associated with this is written to files with any suffices stripped off.
-ovFile::ovFile(const char *name,
+ovFile::ovFile(gkStore *gkp,
+ const char *name,
ovFileType type,
uint32 bufferSize) {
+ _gkp = gkp;
+ _histogram = new ovStoreHistogram(_gkp, type);
+
// We write two sizes of overlaps. The 'normal' format doesn't contain the a_iid, while the
// 'full' format does. The buffer size must hold an integer number of overlaps, otherwise the
// reader will read partial overlaps and fail. Choose a buffer size that can handle both.
@@ -66,158 +75,157 @@ ovFile::ovFile(const char *name,
_bufferMax = (bufferSize / (lcm * sizeof(uint32))) * lcm;
_buffer = new uint32 [_bufferMax];
+#ifdef SNAPPY
+ _snappyLen = 0;
+ _snappyBuffer = NULL;
+#endif
+
assert(_bufferMax % ((sizeof(uint32) * 1) + (sizeof(ovOverlapDAT))) == 0);
assert(_bufferMax % ((sizeof(uint32) * 2) + (sizeof(ovOverlapDAT))) == 0);
- // When writing full overlaps, we also write the number of overlaps per read. This is used to
- // build the store. Overlaps in the store, normal format, don't need this extra data, as the
- // store itself knows how many overlaps per read.
-
- _olapsPerReadAlloc = 0;
- _olapsPerReadLast = 0;
- _olapsPerRead = NULL;
-
- if (type == ovFileFullWrite) {
- _olapsPerReadAlloc = 128 * 1024;
- _olapsPerReadLast = 0;
- _olapsPerRead = new uint32 [_olapsPerReadAlloc];
-
- memset(_olapsPerRead, 0, sizeof(uint32) * _olapsPerReadAlloc);
- }
-
// Create the input/output buffers and files.
_isOutput = false;
_isSeekable = false;
_isNormal = (type == ovFileNormal) || (type == ovFileNormalWrite);
+#ifdef SNAPPY
+ _useSnappy = false;
+#endif
_reader = NULL;
_writer = NULL;
- // Open a file for reading?
- if ((type == ovFileNormal) || (type == ovFileFull)) {
+ // Open store files for reading. These generally cannot be compressed, but we pretend they can be.
+ if (type == ovFileNormal) {
_reader = new compressedFileReader(name);
_file = _reader->file();
_isSeekable = (_reader->isCompressed() == false);
}
- // Open a file for writing?
- else {
+ // Open dump files for reading. These certainly can be compressed.
+ else if (type == ovFileFull) {
+ _reader = new compressedFileReader(name);
+ _file = _reader->file();
+ _isSeekable = (_reader->isCompressed() == false);
+#ifdef SNAPPY
+ _useSnappy = true;
+#endif
+ }
+
+ // Open a store file for writing?
+ else if (type == ovFileNormalWrite) {
_writer = new compressedFileWriter(name);
_file = _writer->file();
_isOutput = true;
}
- // Make a copy of the output name, and clean it up. This is used as the base for
- // the counts output. We just strip off all the dotted extensions in the filename.
-
- strcpy(_prefix, name);
-
- char *slash = strrchr(_prefix, '/');
- char *dot = strchr((slash == NULL) ? _prefix : slash, '.');
+ // Else, open a dump file for writing. This catches two cases, one with counts and one without counts.
+ else {
+ _writer = new compressedFileWriter(name);
+ _file = _writer->file();
+ _isOutput = true;
+#ifdef SNAPPY
+ _useSnappy = true;
+#endif
+ }
- if (dot)
- *dot = 0;
+ AS_UTL_findBaseFileName(_prefix, name);
}
ovFile::~ovFile() {
- flushOverlaps();
+ writeBuffer(true);
delete _reader;
delete _writer;
delete [] _buffer;
- if (_olapsPerRead) {
- char name[FILENAME_MAX];
-
- sprintf(name, "%s.counts", _prefix);
-
- errno = 0;
- _file = fopen(name, "w");
- if (errno)
- fprintf(stderr, "failed to open counts file '%s' for writing: %s\n", name, strerror(errno)), exit(1);
-
- _olapsPerReadLast++;
-
- AS_UTL_safeWrite(_file, &_olapsPerReadLast, "ovFile::olapsPerReadLast", sizeof(uint32), 1);
- AS_UTL_safeWrite(_file, _olapsPerRead, "ovFile::olapsPerRead", sizeof(uint32), _olapsPerReadLast);
-
- fclose(_file);
+#ifdef SNAPPY
+ delete [] _snappyBuffer;
+#endif
- delete [] _olapsPerRead;
+ _histogram->saveData(_prefix);
- //fprintf(stderr, "Wrote counts file '%s' for reads up to iid "F_U32"\n", name, _olapsPerReadLast);
- }
+ delete _histogram;
}
void
-ovFile::flushOverlaps(void) {
+ovFile::writeBuffer(bool force) {
- if (_isOutput == false)
+ if (_isOutput == false) // Needed because it's called in the destructor.
return;
+ if ((force == false) && (_bufferLen < _bufferMax))
+ return;
if (_bufferLen == 0)
return;
- AS_UTL_safeWrite(_file, _buffer, "ovFile::flushOverlaps", sizeof(uint32), _bufferLen);
+ // If compressing, compress the block then write compressed length and the block.
- _bufferLen = 0;
-}
+#ifdef SNAPPY
+ if (_useSnappy == true) {
+ size_t bl = snappy::MaxCompressedLength(_bufferLen * sizeof(uint32));
+ if (_snappyLen < bl) {
+ delete [] _snappyBuffer;
+ _snappyLen = bl;
+ _snappyBuffer = new char [_snappyLen];
+ }
+ snappy::RawCompress((const char *)_buffer, _bufferLen * sizeof(uint32), _snappyBuffer, &bl);
-void
-ovFile::writeOverlap(ovOverlap *overlap) {
+ AS_UTL_safeWrite(_file, &bl, "ovFile::writeBuffer::bl", sizeof(size_t), 1);
+ AS_UTL_safeWrite(_file, _snappyBuffer, "ovFile::writeBuffer::sb", sizeof(char), bl);
+ }
- assert(_isOutput == true);
+ // Otherwise, just dump the block
- if (_bufferLen >= _bufferMax) {
- AS_UTL_safeWrite(_file, _buffer, "ovFile::writeOverlap", sizeof(uint32), _bufferLen);
- _bufferLen = 0;
- }
+ else
+#endif
+ AS_UTL_safeWrite(_file, _buffer, "ovFile::writeBuffer", sizeof(uint32), _bufferLen);
- if (_olapsPerRead) {
- uint32 newmax = _olapsPerReadAlloc;
- uint32 newlast = _olapsPerReadLast;
+ // Buffer written. Clear it.
+ _bufferLen = 0;
+}
- newlast = max(newlast, overlap->a_iid);
- newlast = max(newlast, overlap->b_iid);
- while (newmax <= newlast)
- newmax += newmax / 4;
- resizeArray(_olapsPerRead, _olapsPerReadLast+1, _olapsPerReadAlloc, newmax, resizeArray_copyData | resizeArray_clearNew);
+void
+ovFile::writeOverlap(ovOverlap *overlap) {
- _olapsPerRead[overlap->a_iid]++;
- _olapsPerRead[overlap->b_iid]++;
+ assert(_isOutput == true);
- _olapsPerReadLast = newlast;
- }
+ writeBuffer();
+
+ _histogram->addOverlap(overlap);
if (_isNormal == false)
_buffer[_bufferLen++] = overlap->a_iid;
_buffer[_bufferLen++] = overlap->b_iid;
-#if (ovOverlapNWORDS == 5)
+#if (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 3)
+ _buffer[_bufferLen++] = overlap->dat.dat[0];
+ _buffer[_bufferLen++] = overlap->dat.dat[1];
+ _buffer[_bufferLen++] = overlap->dat.dat[2];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 5)
_buffer[_bufferLen++] = overlap->dat.dat[0];
_buffer[_bufferLen++] = overlap->dat.dat[1];
_buffer[_bufferLen++] = overlap->dat.dat[2];
_buffer[_bufferLen++] = overlap->dat.dat[3];
_buffer[_bufferLen++] = overlap->dat.dat[4];
-#elif (ovOverlapNWORDS == 3)
- _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 0) & 0xffffffff;
- _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 0) & 0xffffffff;
- _buffer[_bufferLen++] = (overlap->dat.dat[2] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlap->dat.dat[2] >> 0) & 0xffffffff;
-#elif (ovOverlapNWORDS == 8)
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 6)
+ _buffer[_bufferLen++] = overlap->dat.dat[0];
+ _buffer[_bufferLen++] = overlap->dat.dat[1];
+ _buffer[_bufferLen++] = overlap->dat.dat[2];
+ _buffer[_bufferLen++] = overlap->dat.dat[3];
+ _buffer[_bufferLen++] = overlap->dat.dat[4];
+ _buffer[_bufferLen++] = overlap->dat.dat[5];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 8)
_buffer[_bufferLen++] = overlap->dat.dat[0];
_buffer[_bufferLen++] = overlap->dat.dat[1];
_buffer[_bufferLen++] = overlap->dat.dat[2];
@@ -226,6 +234,18 @@ ovFile::writeOverlap(ovOverlap *overlap) {
_buffer[_bufferLen++] = overlap->dat.dat[5];
_buffer[_bufferLen++] = overlap->dat.dat[6];
_buffer[_bufferLen++] = overlap->dat.dat[7];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 2)
+ _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 0) & 0xffffffff;
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 3)
+ _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[0] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[1] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[2] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlap->dat.dat[2] >> 0) & 0xffffffff;
#else
#error unknown ovOverlapNWORDS
#endif
@@ -241,65 +261,56 @@ ovFile::writeOverlaps(ovOverlap *overlaps, uint64 overlapsLen) {
assert(_isOutput == true);
- // Resize the olapsPerRead array once per batch.
-
- if (_olapsPerRead) {
- uint32 newmax = _olapsPerReadAlloc;
- uint32 newlast = _olapsPerReadLast;
-
- for (uint32 oo=0; oo<overlapsLen; oo++) {
- newlast = max(newlast, overlaps[oo].a_iid);
- newlast = max(newlast, overlaps[oo].b_iid);
-
- while (newmax <= newlast)
- newmax += newmax / 4;
- }
-
- resizeArray(_olapsPerRead, _olapsPerReadLast+1, _olapsPerReadAlloc, newmax, resizeArray_copyData | resizeArray_clearNew);
-
- _olapsPerReadLast = newlast;
- }
-
// Add all overlaps to the buffer.
while (nWritten < overlapsLen) {
- if (_bufferLen >= _bufferMax) {
- AS_UTL_safeWrite(_file, _buffer, "ovFile::writeOverlap", sizeof(uint32), _bufferLen);
- _bufferLen = 0;
- }
+ writeBuffer();
- if (_olapsPerRead) {
- _olapsPerRead[overlaps[nWritten].a_iid]++;
- _olapsPerRead[overlaps[nWritten].b_iid]++;
- }
+ _histogram->addOverlap(overlaps + nWritten);
if (_isNormal == false)
_buffer[_bufferLen++] = overlaps[nWritten].a_iid;
_buffer[_bufferLen++] = overlaps[nWritten].b_iid;
-#if (ovOverlapNWORDS == 5)
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[3];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[4];
-#elif (ovOverlapNWORDS == 3)
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 0) & 0xffffffff;
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 0) & 0xffffffff;
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[2] >> 32) & 0xffffffff;
- _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[2] >> 0) & 0xffffffff;
-#elif (ovOverlapNWORDS == 8)
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[3];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[4];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[5];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[6];
- _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[7];
+#if (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 3)
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 5)
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[3];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[4];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 6)
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[3];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[4];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[5];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 8)
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[0];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[1];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[2];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[3];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[4];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[5];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[6];
+ _buffer[_bufferLen++] = overlaps[nWritten].dat.dat[7];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 2)
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 0) & 0xffffffff;
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 3)
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[0] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[1] >> 0) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[2] >> 32) & 0xffffffff;
+ _buffer[_bufferLen++] = (overlaps[nWritten].dat.dat[2] >> 0) & 0xffffffff;
#else
#error unknown ovOverlapNWORDS
#endif
@@ -312,15 +323,58 @@ ovFile::writeOverlaps(ovOverlap *overlaps, uint64 overlapsLen) {
+void
+ovFile::readBuffer(void) {
+
+ if (_bufferPos < _bufferLen)
+ return;
+
+ // Need to load a new buffer. Everyone resets bufferPos to the start.
+
+ _bufferPos = 0;
+
+ // If compressed, we need to decode the block.
+
+#ifdef SNAPPY
+ if (_useSnappy == true) {
+ size_t cl = 0;
+ size_t clc = AS_UTL_safeRead(_file, &cl, "ovFile::readBuffer::cl", sizeof(size_t), 1);
+
+ if (_snappyLen < cl) {
+ delete [] _snappyBuffer;
+ _snappyLen = cl;
+ _snappyBuffer = new char [cl];
+ }
+
+ size_t sbc = AS_UTL_safeRead(_file, _snappyBuffer, "ovFile::readBuffer::sb", sizeof(char), cl);
+
+ if (sbc != cl)
+ fprintf(stderr, "ERROR: short read on file '%s': read " F_SIZE_T " bytes, expected " F_SIZE_T ".\n",
+ _prefix, sbc, cl), exit(1);
+
+ size_t ol = 0;
+
+ snappy::GetUncompressedLength(_snappyBuffer, cl, &ol);
+ snappy::RawUncompress(_snappyBuffer, cl, (char *)_buffer);
+
+ _bufferLen = ol / sizeof(uint32);
+ }
+
+ // But if loading from 'normal' files, just load. Easy peasy.
+
+ else
+#endif
+ _bufferLen = AS_UTL_safeRead(_file, _buffer, "ovFile::readBuffer", sizeof(uint32), _bufferMax);
+}
+
+
+
bool
ovFile::readOverlap(ovOverlap *overlap) {
assert(_isOutput == false);
- if (_bufferPos >= _bufferLen) {
- _bufferLen = AS_UTL_safeRead(_file, _buffer, "ovFile::readOverlap", sizeof(uint32), _bufferMax);
- _bufferPos = 0;
- }
+ readBuffer();
if (_bufferLen == 0)
return(false);
@@ -332,20 +386,24 @@ ovFile::readOverlap(ovOverlap *overlap) {
overlap->b_iid = _buffer[_bufferPos++];
-#if (ovOverlapNWORDS == 5)
+#if (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 3)
+ overlap->dat.dat[0] = _buffer[_bufferPos++];
+ overlap->dat.dat[1] = _buffer[_bufferPos++];
+ overlap->dat.dat[2] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 5)
overlap->dat.dat[0] = _buffer[_bufferPos++];
overlap->dat.dat[1] = _buffer[_bufferPos++];
overlap->dat.dat[2] = _buffer[_bufferPos++];
overlap->dat.dat[3] = _buffer[_bufferPos++];
overlap->dat.dat[4] = _buffer[_bufferPos++];
-#elif (ovOverlapNWORDS == 3)
- overlap->dat.dat[0] = _buffer[_bufferPos++]; overlap->dat.dat[0] <<= 32;
- overlap->dat.dat[0] |= _buffer[_bufferPos++];
- overlap->dat.dat[1] = _buffer[_bufferPos++]; overlap->dat.dat[1] <<= 32;
- overlap->dat.dat[1] |= _buffer[_bufferPos++];
- overlap->dat.dat[2] = _buffer[_bufferPos++]; overlap->dat.dat[2] <<= 32;
- overlap->dat.dat[2] |= _buffer[_bufferPos++];
-#elif (ovOverlapNWORDS == 8)
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 6)
+ overlap->dat.dat[0] = _buffer[_bufferPos++];
+ overlap->dat.dat[1] = _buffer[_bufferPos++];
+ overlap->dat.dat[2] = _buffer[_bufferPos++];
+ overlap->dat.dat[3] = _buffer[_bufferPos++];
+ overlap->dat.dat[4] = _buffer[_bufferPos++];
+ overlap->dat.dat[5] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 8)
overlap->dat.dat[0] = _buffer[_bufferPos++];
overlap->dat.dat[1] = _buffer[_bufferPos++];
overlap->dat.dat[2] = _buffer[_bufferPos++];
@@ -354,6 +412,18 @@ ovFile::readOverlap(ovOverlap *overlap) {
overlap->dat.dat[5] = _buffer[_bufferPos++];
overlap->dat.dat[6] = _buffer[_bufferPos++];
overlap->dat.dat[7] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 2)
+ overlap->dat.dat[0] = _buffer[_bufferPos++]; overlap->dat.dat[0] <<= 32;
+ overlap->dat.dat[0] |= _buffer[_bufferPos++];
+ overlap->dat.dat[1] = _buffer[_bufferPos++]; overlap->dat.dat[1] <<= 32;
+ overlap->dat.dat[1] |= _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 3)
+ overlap->dat.dat[0] = _buffer[_bufferPos++]; overlap->dat.dat[0] <<= 32;
+ overlap->dat.dat[0] |= _buffer[_bufferPos++];
+ overlap->dat.dat[1] = _buffer[_bufferPos++]; overlap->dat.dat[1] <<= 32;
+ overlap->dat.dat[1] |= _buffer[_bufferPos++];
+ overlap->dat.dat[2] = _buffer[_bufferPos++]; overlap->dat.dat[2] <<= 32;
+ overlap->dat.dat[2] |= _buffer[_bufferPos++];
#else
#error unknown ovOverlapNWORDS
#endif
@@ -372,10 +442,7 @@ ovFile::readOverlaps(ovOverlap *overlaps, uint64 overlapsLen) {
assert(_isOutput == false);
while (nLoaded < overlapsLen) {
- if (_bufferPos >= _bufferLen) {
- _bufferLen = AS_UTL_safeRead(_file, _buffer, "ovFile::readOverlaps", sizeof(uint32), _bufferMax);
- _bufferPos = 0;
- }
+ readBuffer();
if (_bufferLen == 0)
return(nLoaded);
@@ -387,20 +454,24 @@ ovFile::readOverlaps(ovOverlap *overlaps, uint64 overlapsLen) {
overlaps[nLoaded].b_iid = _buffer[_bufferPos++];
-#if (ovOverlapNWORDS == 5)
+#if (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 3)
+ overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 5)
overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[3] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[4] = _buffer[_bufferPos++];
-#elif (ovOverlapNWORDS == 3)
- overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[0] <<= 32;
- overlaps[nLoaded].dat.dat[0] |= _buffer[_bufferPos++];
- overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[1] <<= 32;
- overlaps[nLoaded].dat.dat[1] |= _buffer[_bufferPos++];
- overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[2] <<= 32;
- overlaps[nLoaded].dat.dat[2] |= _buffer[_bufferPos++];
-#elif (ovOverlapNWORDS == 8)
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 6)
+ overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[3] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[4] = _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[5] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 32) && (ovOverlapNWORDS == 8)
overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++];
@@ -409,6 +480,18 @@ ovFile::readOverlaps(ovOverlap *overlaps, uint64 overlapsLen) {
overlaps[nLoaded].dat.dat[5] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[6] = _buffer[_bufferPos++];
overlaps[nLoaded].dat.dat[7] = _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 2)
+ overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[0] <<= 32;
+ overlaps[nLoaded].dat.dat[0] |= _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[1] <<= 32;
+ overlaps[nLoaded].dat.dat[1] |= _buffer[_bufferPos++];
+#elif (ovOverlapWORDSZ == 64) && (ovOverlapNWORDS == 3)
+ overlaps[nLoaded].dat.dat[0] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[0] <<= 32;
+ overlaps[nLoaded].dat.dat[0] |= _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[1] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[1] <<= 32;
+ overlaps[nLoaded].dat.dat[1] |= _buffer[_bufferPos++];
+ overlaps[nLoaded].dat.dat[2] = _buffer[_bufferPos++]; overlaps[nLoaded].dat.dat[2] <<= 32;
+ overlaps[nLoaded].dat.dat[2] |= _buffer[_bufferPos++];
#else
#error unknown ovOverlapNWORDS
#endif
@@ -435,3 +518,20 @@ ovFile::seekOverlap(off_t overlap) {
_bufferPos = _bufferLen; // We probably need to reload the buffer.
}
+
+
+
+
+void
+ovFile::transferHistogram(ovStoreHistogram *copy) {
+ // Adds this file's histogram into 'copy', then replaces ours with an empty one.
+ if (copy == NULL)
+ return; // No destination supplied; our histogram is left untouched.
+
+ copy->add(_histogram);
+
+ delete _histogram;
+ // The replacement is default-constructed, not rebuilt with a gkStore or file type.
+ _histogram = new ovStoreHistogram;
+}
+
diff --git a/src/stores/ovStoreFile.H b/src/stores/ovStoreFile.H
new file mode 100644
index 0000000..cdbc5e7
--- /dev/null
+++ b/src/stores/ovStoreFile.H
@@ -0,0 +1,120 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/stores/ovStore.H
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-24
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#ifndef AS_OVSTOREFILE_H
+#define AS_OVSTOREFILE_H
+
+#include "AS_global.H"
+#include "gkStore.H"
+
+#include "ovOverlap.H"
+
+
+class ovStoreHistogram;
+
+
+// The default, no flags, is to open for normal overlaps, read only. Normal overlaps mean they
+// have only the B id, i.e., they are in a fully built store.
+//
+// Output of overlapper (input to store building) should be ovFileFullWrite. The specialized
+// ovFileFullWriteNoCounts is used internally by store creation.
+//
+enum ovFileType {
+ ovFileNormal = 0, // Reading of b_id overlaps (aka store files)
+ ovFileNormalWrite = 1, // Writing of b_id overlaps
+ ovFileFull = 2, // Reading of a_id+b_id overlaps (aka dump files)
+ ovFileFullWrite = 3, // Writing of a_id+b_id overlaps
+ ovFileFullWriteNoCounts = 4 // Writing of a_id+b_id overlaps, omitting the counts of overlaps per read
+};
+
+
+class ovFile { // Buffered reader/writer for a single file of overlap records.
+public:
+ ovFile(gkStore *gkpName,
+ const char *name,
+ ovFileType type = ovFileNormal,
+ uint32 bufferSize = 1 * 1024 * 1024);
+ ~ovFile();
+ // Writing.
+ void writeBuffer(bool force=false);
+ void writeOverlap(ovOverlap *overlap);
+ void writeOverlaps(ovOverlap *overlaps, uint64 overlapLen);
+ // Reading.
+ void readBuffer(void);
+ bool readOverlap(ovOverlap *overlap);
+ uint64 readOverlaps(ovOverlap *overlaps, uint64 overlapMax);
+ // Positioning; see _isSeekable.
+ void seekOverlap(off_t overlap);
+
+ // The size of an overlap record is 1 or 2 IDs + the size of a word times the number of words.
+ uint64 recordSize(void) {
+ return(sizeof(uint32) * ((_isNormal) ? 1 : 2) + sizeof(ovOverlapWORD) * ovOverlapNWORDS);
+ };
+
+ // For use in conversion, force snappy compression. By default, it is ENABLED, and we cannot
+ // read older ovb files.
+#ifdef SNAPPY
+ void enableSnappy(bool enabled) {
+ _useSnappy = enabled;
+ };
+#endif
+
+ // Move the stats in our histogram to the one supplied, and remove our data
+ void transferHistogram(ovStoreHistogram *copy);
+
+private:
+ gkStore *_gkp;
+ ovStoreHistogram *_histogram;
+
+ uint32 _bufferLen; // length of valid data in the buffer
+ uint32 _bufferPos; // position the read is at in the buffer
+ uint32 _bufferMax; // allocated size of the buffer
+ uint32 *_buffer;
+
+#ifdef SNAPPY
+ size_t _snappyLen;
+ char *_snappyBuffer;
+#endif
+
+ bool _isOutput; // if true, we can writeOverlap()
+ bool _isSeekable; // if true, we can seekOverlap()
+ bool _isNormal; // if true, each record carries one ID, else two (see recordSize())
+#ifdef SNAPPY
+ bool _useSnappy; // if true, compress with snappy before writing
+#endif
+
+ compressedFileReader *_reader;
+ compressedFileWriter *_writer;
+
+ char _prefix[FILENAME_MAX];
+ FILE *_file;
+};
+
+
+#endif // AS_OVSTOREFILE_H
diff --git a/src/stores/ovStoreFilter.C b/src/stores/ovStoreFilter.C
new file mode 100644
index 0000000..89bc2fe
--- /dev/null
+++ b/src/stores/ovStoreFilter.C
@@ -0,0 +1,293 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/stores/ovStore.C
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-28
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "ovStore.H"
+
+
+#define OBT_FAR5PRIME (29)
+#define OBT_MIN_LENGTH (75)
+
+
+
+
+ovStoreFilter::ovStoreFilter(gkStore *gkp_, double maxErate) {
+ gkp = gkp_;
+ // Zero the statistics, then build per-read flags saying which reads skip OBT and dedupe.
+ resetCounters();
+ // The flag arrays are indexed by read ID; filterOverlap() rejects IDs at or above maxID.
+ maxID = gkp->gkStore_getNumReads() + 1;
+ maxEvalue = AS_OVS_encodeEvalue(maxErate);
+
+ skipReadOBT = new char [maxID];
+ skipReadDUP = new char [maxID];
+
+ memset(skipReadOBT, 0, sizeof(char) * maxID);
+ memset(skipReadDUP, 0, sizeof(char) * maxID);
+
+ // Counts of flagged reads, reported on stderr at the end of the constructor.
+ uint32 numSkipOBT = 0;
+ uint32 numSkipDUP = 0;
+
+#if 0
+ fprintf(stderr, "Marking fragments to skip overlap based trimming.\n");
+
+ fprintf(stderr, "LIB 1 - dup=%d trim=%d spur=%d chimera=%d subreads=%d\n",
+ gkp->gkStore_getLibrary(1)->gkLibrary_removeDuplicateReads(),
+ gkp->gkStore_getLibrary(1)->gkLibrary_finalTrim(),
+ gkp->gkStore_getLibrary(1)->gkLibrary_removeSpurReads(),
+ gkp->gkStore_getLibrary(1)->gkLibrary_removeChimericReads(),
+ gkp->gkStore_getLibrary(1)->gkLibrary_checkForSubReads());
+#endif
+ // A read skips OBT when its library asks for none of: dedupe, final trim, spur/chimera removal, subread check.
+ for (uint64 iid=0; iid<maxID; iid++) { // NOTE(review): includes iid 0, which filterOverlap() treats as invalid -- confirm gkStore_getRead(0) is safe.
+ uint32 Lid = gkp->gkStore_getRead(iid)->gkRead_libraryID();
+ gkLibrary *L = gkp->gkStore_getLibrary(Lid);
+
+ if ((L->gkLibrary_removeDuplicateReads() == false) &&
+ (L->gkLibrary_finalTrim() == GK_FINALTRIM_NONE) &&
+ (L->gkLibrary_removeSpurReads() == false) &&
+ (L->gkLibrary_removeChimericReads() == false) &&
+ (L->gkLibrary_checkForSubReads() == false)) {
+ numSkipOBT++;
+ skipReadOBT[iid] = true;
+ }
+ // A read skips dedupe when its library does not ask for duplicate removal.
+ if (L->gkLibrary_removeDuplicateReads() == false) {
+ numSkipDUP++;
+ skipReadDUP[iid] = true;
+ }
+ }
+
+ if (numSkipOBT > 0)
+ fprintf(stderr, "- Marked " F_U32 " reads to skip trimming.\n", numSkipOBT);
+
+ if (numSkipDUP > 0)
+ fprintf(stderr, "- Marked " F_U32 " reads to skip deduplication.\n", numSkipDUP);
+}
+
+
+
+ovStoreFilter::~ovStoreFilter() {
+ delete [] skipReadOBT; // Per-read flag arrays allocated in the constructor.
+ delete [] skipReadDUP;
+}
+
+
+
+
+
+// Are the 5' end points very different? If the overlap is flipped, then, yes, they are.
+static
+bool
+isOverlapDifferent(ovOverlap &ol) {
+ // A flipped overlap is always reported as different.
+ if (ol.flipped() != false)
+ return(true);
+
+ // Same orientation: different only if the begin coordinates are more than
+ // OBT_FAR5PRIME apart. The smaller is subtracted from the larger.
+ if (ol.a_bgn() > ol.b_bgn())
+ return((ol.a_bgn() - ol.b_bgn()) > OBT_FAR5PRIME);
+
+ return((ol.b_bgn() - ol.a_bgn()) > OBT_FAR5PRIME);
+}
+
+
+// Is the overlap long?
+static
+bool
+isOverlapLong(ovOverlap &ol) {
+ // Coordinates are copied into signed values so the subtractions below are signed.
+ int32 aLo = ol.a_bgn();
+ int32 aHi = ol.a_end();
+ int32 bLo = ol.b_bgn();
+ int32 bHi = ol.b_end();
+
+ // The B span is measured in whichever direction makes it non-negative.
+ int32 aSpan = aHi - aLo;
+ int32 bSpan = (bHi < bLo) ? (bLo - bHi) : (bHi - bLo);
+
+ // Both spans must be strictly longer than OBT_MIN_LENGTH.
+ return((aSpan > OBT_MIN_LENGTH) && (bSpan > OBT_MIN_LENGTH));
+}
+
+
+
+void
+ovStoreFilter::filterOverlap(ovOverlap &foverlap,
+ ovOverlap &roverlap) {
+ // Fills 'roverlap' from 'foverlap' via swapIDs(), clears each for* flag that fails a check, and counts the results.
+ // Quick sanity check on IIDs.
+
+ if ((foverlap.a_iid == 0) ||
+ (foverlap.b_iid == 0) ||
+ (foverlap.a_iid >= maxID) ||
+ (foverlap.b_iid >= maxID)) {
+ char ovlstr[256];
+
+ fprintf(stderr, "Overlap has IDs out of range (maxID " F_U32 "), possibly corrupt input data.\n", maxID);
+ fprintf(stderr, " coords -- %s\n", foverlap.toString(ovlstr, ovOverlapAsCoords, false));
+ fprintf(stderr, " hangs -- %s\n", foverlap.toString(ovlstr, ovOverlapAsHangs, false));
+ exit(1);
+ }
+
+ // Make the reverse overlap. The erate-based 'for' flags are cleared on BOTH copies just below.
+
+ roverlap.swapIDs(foverlap);
+
+ // Ignore high error overlaps
+
+ if ((foverlap.evalue() > maxEvalue)) {
+ foverlap.dat.ovl.forUTG = false;
+ foverlap.dat.ovl.forOBT = false;
+ foverlap.dat.ovl.forDUP = false;
+
+ roverlap.dat.ovl.forUTG = false;
+ roverlap.dat.ovl.forOBT = false;
+ roverlap.dat.ovl.forDUP = false;
+
+ skipERATE++;
+ skipERATE++;
+ }
+
+ // Don't OBT if not requested.
+ // Only a flag that is still set can be cleared; testing for 'false' here made the check a no-op.
+ if ((foverlap.dat.ovl.forOBT == true) && (skipReadOBT[foverlap.a_iid] == true)) {
+ foverlap.dat.ovl.forOBT = false;
+ skipOBT++;
+ }
+
+ if ((roverlap.dat.ovl.forOBT == true) && (skipReadOBT[roverlap.a_iid] == true)) {
+ roverlap.dat.ovl.forOBT = false;
+ skipOBT++;
+ }
+
+ // If either overlap is good for either obt or dup, compute if it is different and long. These
+ // are the same for both foverlap and roverlap.
+
+ bool isDiff = isOverlapDifferent(foverlap);
+ bool isLong = isOverlapLong(foverlap);
+
+ // Remove the bad-for-OBT overlaps.
+
+ if ((isDiff == false) && (foverlap.dat.ovl.forOBT == true)) {
+ foverlap.dat.ovl.forOBT = false;
+ skipOBTbad++;
+ }
+
+ if ((isDiff == false) && (roverlap.dat.ovl.forOBT == true)) {
+ roverlap.dat.ovl.forOBT = false;
+ skipOBTbad++;
+ }
+
+ // Remove the too-short-for-OBT overlaps.
+
+ if ((isLong == false) && (foverlap.dat.ovl.forOBT == true)) {
+ foverlap.dat.ovl.forOBT = false;
+ skipOBTshort++;
+ }
+
+ if ((isLong == false) && (roverlap.dat.ovl.forOBT == true)) {
+ roverlap.dat.ovl.forOBT = false;
+ skipOBTshort++;
+ }
+
+ // Don't dedupe if not requested.
+
+ if ((foverlap.dat.ovl.forDUP == true) && (skipReadDUP[foverlap.a_iid] == true)) {
+ foverlap.dat.ovl.forDUP = false;
+ skipDUP++;
+ }
+ // Each overlap is tested against its own A read, matching the OBT checks above.
+ if ((roverlap.dat.ovl.forDUP == true) && (skipReadDUP[roverlap.a_iid] == true)) {
+ roverlap.dat.ovl.forDUP = false;
+ skipDUP++;
+ }
+
+ // Remove the bad-for-DUP overlaps.
+
+#if 0
+ // Nah, do this in dedupe, since parameters can change.
+ if ((isDiff == true) && (foverlap.dat.ovl.forDUP == true)) {
+ foverlap.dat.ovl.forDUP = false;
+ skipDUPdiff++;
+ }
+
+ if ((isDiff == true) && (roverlap.dat.ovl.forDUP == true)) {
+ roverlap.dat.ovl.forDUP = false;
+ skipDUPdiff++;
+ }
+#endif
+
+ // Can't have duplicates between libraries.
+
+ if (((foverlap.dat.ovl.forDUP == true) ||
+ (roverlap.dat.ovl.forDUP == true)) &&
+ (gkp->gkStore_getRead(foverlap.a_iid)->gkRead_libraryID() != gkp->gkStore_getRead(foverlap.b_iid)->gkRead_libraryID())) {
+
+ if ((foverlap.dat.ovl.forDUP == true)) {
+ foverlap.dat.ovl.forDUP = false;
+ skipDUPlib++;
+ }
+
+ if ((roverlap.dat.ovl.forDUP == true)) {
+ roverlap.dat.ovl.forDUP = false;
+ skipDUPlib++;
+ }
+ }
+
+ // All done with the filtering, record some counts.
+
+ if (foverlap.dat.ovl.forUTG == true) saveUTG++;
+ if (foverlap.dat.ovl.forOBT == true) saveOBT++;
+ if (foverlap.dat.ovl.forDUP == true) saveDUP++;
+
+ if (roverlap.dat.ovl.forUTG == true) saveUTG++;
+ if (roverlap.dat.ovl.forOBT == true) saveOBT++;
+ if (roverlap.dat.ovl.forDUP == true) saveDUP++;
+}
+
+
+
+void
+ovStoreFilter::resetCounters(void) {
+ saveUTG = 0; // Overlaps still flagged for each use once filterOverlap() is done.
+ saveOBT = 0;
+ saveDUP = 0;
+ // Bumped twice per overlap whose evalue exceeds maxEvalue (forward and reverse copies).
+ skipERATE = 0;
+ // Counters for the three forOBT checks in filterOverlap(): not requested, bad, too short.
+ skipOBT = 0;
+ skipOBTbad = 0;
+ skipOBTshort = 0;
+ // Counters for the forDUP checks; skipDUPdiff is only touched by code that is #if 0'd out.
+ skipDUP = 0;
+ skipDUPdiff = 0;
+ skipDUPlib = 0;
+}
diff --git a/src/bogart/AS_BAT_MergeUnitigs.H b/src/stores/ovStoreFilter.H
similarity index 74%
rename from src/bogart/AS_BAT_MergeUnitigs.H
rename to src/stores/ovStoreFilter.H
index a876c76..0e6267b 100644
--- a/src/bogart/AS_BAT_MergeUnitigs.H
+++ b/src/stores/ovStoreFilter.H
@@ -13,9 +13,13 @@
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
+ * This file is derived from:
+ *
+ * src/stores/ovStore.H
+ *
* Modifications by:
*
- * Brian P. Walenz beginning on 2016-MAY-17
+ * Brian P. Walenz beginning on 2016-OCT-28
* are a 'United States Government Work', and
* are released in the public domain
*
@@ -23,13 +27,3 @@
* full conditions and disclaimers for each license.
*/
-#ifndef INCLUDE_AS_BAT_MERGE_UNITIGS
-#define INCLUDE_AS_BAT_MERGE_UNITIGS
-
-
-void
-mergeUnitigs(UnitigVector &unitigs,
- double deviation,
- bool findCircularTigs);
-
-#endif // INCLUDE_AS_BAT_MERGE_UNITIGS
diff --git a/src/stores/ovStoreHistogram.C b/src/stores/ovStoreHistogram.C
new file mode 100644
index 0000000..1a1f2f0
--- /dev/null
+++ b/src/stores/ovStoreHistogram.C
@@ -0,0 +1,447 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-25
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "ovStoreHistogram.H"
+
+
+
+ovStoreHistogram::ovStoreHistogram() {
+ // Empty histogram: no gkStore and no arrays. addOverlap() does nothing while _opr and _opel are NULL.
+ _gkp = NULL;
+
+ _maxOlength = 0;
+ _maxEvalue = 0;
+
+ _epb = 0;
+ _bpb = 0;
+
+ _opelLen = 0;
+ _opel = NULL;
+
+ _oprLen = 0;
+ _oprMax = 0;
+ _opr = NULL;
+}
+
+
+
+ovStoreHistogram::ovStoreHistogram(gkStore *gkp, char *path) {
+ // Starts empty, then loads whatever histogram files exist for 'path'. The store is kept because addOverlap() needs it once _opel is loaded.
+ _gkp = gkp;
+
+ _maxOlength = 0;
+ _maxEvalue = 0;
+
+ _epb = 0;
+ _bpb = 0;
+
+ _opelLen = 0;
+ _opel = NULL;
+
+ _oprLen = 0;
+ _oprMax = 0;
+ _opr = NULL;
+
+ loadData(path);
+}
+
+
+ovStoreHistogram::ovStoreHistogram(gkStore *gkp, ovFileType type) {
+ // Sets up the histogram an ovFile of the given type needs: per-read counts, evalue-length counts, or neither.
+ _gkp = gkp;
+
+ _maxOlength = 0;
+ _maxEvalue = 0;
+
+ _epb = 1; // Evalues per bucket
+ _bpb = 250; // Bases per bucket
+
+ _opelLen = 0;
+ _opel = NULL;
+
+ _oprLen = 0;
+ _oprMax = 0;
+ _opr = NULL;
+
+ // When writing full overlaps out of an overlapper (ovFileFullWrite) we want
+ // to keep track of the number of overlaps per read. We could pre-allocate
+ // the array based on the size of gkpStore, but if we don't have that, it's
+ // easy enough to grow the array.
+ //
+ // _opr is notably skipped if ovFileFullWriteNoCounts is used. That symbol
+ // isn't actually used anywhere except in this comment (and when some ovFile
+ // is created) so we mention it here for grep.
+
+ if (type == ovFileFullWrite) {
+ _oprLen = 0;
+ _oprMax = (_gkp == NULL) ? (256 * 1024) : (_gkp->gkStore_getNumReads() + 1);
+ _opr = new uint32 [_oprMax];
+
+ memset(_opr, 0, sizeof(uint32) * _oprMax);
+ }
+
+ // When writing store overlaps (ovFileNormalWrite) we want to keep track of
+ // how many overlaps for each evalue X length.
+ //
+ // A gkpStore is required here so we can allocate the correct amount of
+ // space and compute the length of an overlap.
+ //
+ // The histogram always allocates one pointer for each eValue (there's only 4096 of them),
+ // but defers allocating the vector until needed.
+
+ if (type == ovFileNormalWrite) {
+ if (_gkp == NULL)
+ fprintf(stderr, "ovStoreHistogram()-- ERROR: I need a valid gkpStore.\n"), exit(1);
+ // Read IDs run from 1 to gkStore_getNumReads() inclusive, so the last read must be scanned too.
+ for (uint32 ii=1; ii<=_gkp->gkStore_getNumReads(); ii++)
+ if (_opelLen < _gkp->gkStore_getRead(ii)->gkRead_sequenceLength())
+ _opelLen = _gkp->gkStore_getRead(ii)->gkRead_sequenceLength();
+ _opelLen = _opelLen * 1.40 / _bpb + 1; // the overlap could have 40% insertions.
+
+ _opel = new uint32 * [AS_MAX_EVALUE + 1];
+
+ memset(_opel, 0, sizeof(uint32 *) * (AS_MAX_EVALUE + 1));
+ }
+}
+
+
+
+ovStoreHistogram::~ovStoreHistogram() {
+ // Free each per-evalue vector (entries never allocated are NULL), then the two top-level arrays.
+ if (_opel)
+ for (uint32 ii=0; ii<AS_MAX_EVALUE + 1; ii++)
+ delete [] _opel[ii];
+
+ delete [] _opel;
+ delete [] _opr;
+}
+
+
+
+void
+ovStoreHistogram::addOverlap(ovOverlap *overlap) {
+ // Counts 'overlap' in whichever histograms this object maintains (per-read and/or evalue-by-length).
+ if (_opr) {
+ uint32 maxID = max(overlap->a_iid, overlap->b_iid);
+ // _opr[maxID] is written below, so grow whenever maxID >= _oprMax; the +1 keeps maxID a valid index.
+ if (_oprMax <= maxID)
+ resizeArray(_opr, _oprLen, _oprMax, maxID + maxID/2 + 1, resizeArray_copyData | resizeArray_clearNew);
+
+ if (_oprLen < maxID + 1)
+ _oprLen = maxID + 1;
+
+ _opr[overlap->a_iid]++;
+ _opr[overlap->b_iid]++;
+ }
+
+ if (_opel) {
+ uint32 ev = overlap->evalue();
+ uint32 len = (_gkp->gkStore_getRead(overlap->a_iid)->gkRead_sequenceLength() - overlap->dat.ovl.ahg5 - overlap->dat.ovl.ahg3 +
+ _gkp->gkStore_getRead(overlap->b_iid)->gkRead_sequenceLength() - overlap->dat.ovl.bhg5 - overlap->dat.ovl.bhg3) / 2;
+ // 'len' is the mean of the two reads' lengths minus their hangs; both values are reduced to bucket indices.
+ ev /= _epb;
+ len /= _bpb;
+ // Per-evalue vectors are allocated lazily, on first use.
+ if (_opel[ev] == NULL) {
+ _opel[ev] = new uint32 [_opelLen];
+ memset(_opel[ev], 0, sizeof(uint32) * _opelLen);
+ }
+
+ _opel[ev][len]++;
+ }
+}
+
+
+
+
+// Build an output file name from a prefix and a suffix based
+// on if the prefix is a directory or a file. If a directory,
+// the new name will be a file in the directory, otherwise,
+// it will be an extension to the original name.
+void
+createDataName(char *name, char *prefix, char *suffix) {
+ // 'name' receives the result; callers in this file pass a FILENAME_MAX buffer.
+ if (AS_UTL_fileExists(prefix, true, false)) {
+ snprintf(name, FILENAME_MAX, "%s/%s", prefix, suffix);
+ }
+
+ else {
+ AS_UTL_findBaseFileName(name, prefix);
+ strcat(name, "."); // NOTE(review): strcat() is unbounded here, unlike the snprintf() branch above.
+ strcat(name, suffix);
+ }
+}
+
+
+
+void
+ovStoreHistogram::saveData(char *prefix) {
+ char name[FILENAME_MAX];
+ // Writes up to two files named from 'prefix' by createDataName(): "counts" and "evalueLen".
+ // If we have overlaps-per-read data, dump it. Just a simple array.
+
+ if (_opr) {
+ createDataName(name, prefix, "counts");
+
+ errno = 0;
+ FILE *F = fopen(name, "w");
+ if (errno)
+ fprintf(stderr, "failed to open counts file '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+ // Layout: one uint32 count, then that many uint32 values.
+ AS_UTL_safeWrite(F, &_oprLen, "ovStoreHistogram::nr", sizeof(uint32), 1);
+ AS_UTL_safeWrite(F, _opr, "ovStoreHistogram::opr", sizeof(uint32), _oprLen);
+
+ fclose(F);
+ }
+
+ // If we have overlaps-per-evalue-length, dump it. This is a bit more complicated, as it has
+ // holes in the array.
+
+ if (_opel) {
+ createDataName(name, prefix, "evalueLen");
+
+ errno = 0;
+ FILE *F = fopen(name, "w");
+ if (errno)
+ fprintf(stderr, "failed to open evalueLen file '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+
+ uint32 nArr = 0;
+ // Count the evalue vectors that were actually allocated; only those are written.
+ for (uint32 ii=0; ii<AS_MAX_EVALUE + 1; ii++)
+ if (_opel[ii])
+ nArr++;
+ // Header: six uint32 values, in the same order loadData() reads them.
+ AS_UTL_safeWrite(F, &_opelLen, "ovStoreHistogram::opelLen", sizeof(uint32), 1);
+ AS_UTL_safeWrite(F, &_maxOlength, "ovStoreHistogram::maxOlength", sizeof(uint32), 1);
+ AS_UTL_safeWrite(F, &_maxEvalue, "ovStoreHistogram::maxEvalue", sizeof(uint32), 1);
+
+ AS_UTL_safeWrite(F, &_epb, "ovStoreHistogram::epb", sizeof(uint32), 1);
+ AS_UTL_safeWrite(F, &_bpb, "ovStoreHistogram::bpb", sizeof(uint32), 1);
+
+ AS_UTL_safeWrite(F, &nArr, "ovStoreHistogram::nArr", sizeof(uint32), 1);
+ // Body: for each allocated vector, its evalue index followed by _opelLen counts.
+ for (uint32 ii=0; ii<AS_MAX_EVALUE + 1; ii++) {
+ if (_opel[ii] == NULL)
+ continue;
+
+ AS_UTL_safeWrite(F, &ii, "ovStoreHistogram::evalue", sizeof(uint32), 1);
+ AS_UTL_safeWrite(F, _opel[ii], "ovStoreHistogram::evalueLen", sizeof(uint32), _opelLen);
+ }
+
+ fclose(F);
+ }
+}
+
+
+
+void
+ovStoreHistogram::loadData(char *prefix) {
+ char name[FILENAME_MAX];
+ // Adds the contents of the "counts" and "evalueLen" files for 'prefix' to this histogram; a missing file is skipped.
+ // Add in any overlaps-per-read data.
+
+ createDataName(name, prefix, "counts");
+
+ if (AS_UTL_fileExists(name, false, false) == true) {
+ errno = 0;
+ FILE *F = fopen(name, "r");
+ if (errno)
+ fprintf(stderr, "failed to open counts file '%s' for reading: %s\n", name, strerror(errno)), exit(1);
+
+ uint32 inLen = 0;
+
+ AS_UTL_safeRead(F, &inLen, "ovStoreHistogram::opr", sizeof(uint32), 1); // How many values on disk?
+
+ if (_oprMax < inLen) // Resize to fit those values
+ resizeArray(_opr, _oprLen, _oprMax, inLen + inLen/2, resizeArray_copyData | resizeArray_clearNew);
+
+ if (_oprLen < inLen) // Remember the new length
+ _oprLen = inLen;
+
+ uint32 *in = new uint32 [inLen]; // Allocate temp space for new values
+
+ AS_UTL_safeRead(F, in, "ovStoreHistogram::opr", sizeof(uint32), inLen); // Load new values
+
+ for (uint32 ii=0; ii<inLen; ii++) // Add in new values
+ _opr[ii] += in[ii];
+
+ delete [] in;
+
+ fclose(F);
+ }
+
+ // Add in any overlaps-per-evalue-length data.
+
+ createDataName(name, prefix, "evalueLen");
+
+ if (AS_UTL_fileExists(name, false, false) == true) {
+ errno = 0;
+ FILE *F = fopen(name, "r");
+ if (errno)
+ fprintf(stderr, "failed to open evalueLen file '%s' for reading: %s\n", name, strerror(errno)), exit(1);
+
+ uint32 nArr = 0;
+ // NOTE(review): these reads overwrite _opelLen, _maxOlength, _maxEvalue, _epb and _bpb even when data is already loaded -- confirm all inputs share the same parameters.
+ AS_UTL_safeRead(F, &_opelLen, "ovStoreHistogram::opelLen", sizeof(uint32), 1); // Load parameters of the data
+ AS_UTL_safeRead(F, &_maxOlength, "ovStoreHistogram::maxOlength", sizeof(uint32), 1);
+ AS_UTL_safeRead(F, &_maxEvalue, "ovStoreHistogram::maxEvalue", sizeof(uint32), 1);
+ AS_UTL_safeRead(F, &_epb, "ovStoreHistogram::epb", sizeof(uint32), 1);
+ AS_UTL_safeRead(F, &_bpb, "ovStoreHistogram::bpb", sizeof(uint32), 1);
+ AS_UTL_safeRead(F, &nArr, "ovStoreHistogram::nArr", sizeof(uint32), 1);
+
+ if (_opel == NULL)
+ allocateArray(_opel, AS_MAX_EVALUE+1, resizeArray_clearNew); // Abuse resizeArray() to alloc new
+
+ uint32 *in = new uint32 [_opelLen]; // Allocate space for a single vector
+ // NOTE(review): 'ev' comes straight from the file and indexes _opel without a range check.
+ for (uint32 ev=0; nArr-- > 0; ) { // For each saved vector:
+ AS_UTL_safeRead(F, &ev, "ovStoreHistogram::evalue", sizeof(uint32), 1); // Load the evalue it is for
+ AS_UTL_safeRead(F, in, "ovStoreHistogram::evalueLen", sizeof(uint32), _opelLen); // Load the data.
+
+ if (_opel[ev] == NULL) // More abuse, if needed
+ allocateArray(_opel[ev], _opelLen, resizeArray_clearNew);
+
+ for (uint32 kk=0; kk<_opelLen; kk++) // Add new data to old data
+ _opel[ev][kk] += in[kk];
+ }
+
+ delete [] in;
+
+ fclose(F);
+ }
+}
+
+
+
+void
+ovStoreHistogram::removeData(char *prefix) {
+ char name[FILENAME_MAX];
+ // Unlink both file names that saveData() uses for this prefix.
+ createDataName(name, prefix, "counts"); AS_UTL_unlink(name);
+ createDataName(name, prefix, "evalueLen"); AS_UTL_unlink(name);
+}
+
+
+
+void
+ovStoreHistogram::add(ovStoreHistogram *input) {
+ // Accumulates the counts held by 'input' into this histogram; 'input' itself is not modified.
+ if (input->_opr) {
+ resizeArray(_opr, _oprLen, _oprMax, input->_oprMax, resizeArray_copyData | resizeArray_clearNew);
+
+ for (uint32 ii=0; ii<input->_oprMax; ii++)
+ _opr[ii] += input->_opr[ii];
+
+ _oprLen = max(_oprLen, input->_oprLen);
+ }
+
+ if (input->_opel) {
+ if (_opel == NULL) {
+ allocateArray(_opel, AS_MAX_EVALUE+1, resizeArray_clearNew);
+ _opelLen = input->_opelLen;
+ _maxOlength = input->_maxOlength;
+ _maxEvalue = input->_maxEvalue;
+ _epb = input->_epb;
+ _bpb = input->_bpb;
+ }
+ // Bucket-for-bucket addition only makes sense when both sides use the same vector length and bucket sizes.
+ if ((_opelLen != input->_opelLen) ||
+ (_epb != input->_epb) ||
+ (_bpb != input->_bpb)) {
+ fprintf(stderr, "ERROR: can't merge histogram; parameters differ.\n");
+ fprintf(stderr, "ERROR: opelLen = %7u vs %7u\n", _opelLen, input->_opelLen);
+ fprintf(stderr, "ERROR: epb = %7u vs %7u\n", _epb, input->_epb);
+ fprintf(stderr, "ERROR: bpb = %7u vs %7u\n", _bpb, input->_bpb);
+ exit(1);
+ }
+
+ _maxOlength = max(_maxOlength, input->_maxOlength);
+ _maxEvalue = max(_maxEvalue, input->_maxEvalue);
+
+ for (uint32 ev=0; ev<AS_MAX_EVALUE+1; ev++) {
+ if (input->_opel[ev] == NULL)
+ continue;
+
+ if (_opel[ev] == NULL)
+ allocateArray(_opel[ev], _opelLen, resizeArray_clearNew);
+
+ for (uint32 kk=0; kk<_opelLen; kk++)
+ _opel[ev][kk] += input->_opel[ev][kk];
+ }
+ }
+}
+
+
+
+uint64
+ovStoreHistogram::getOverlapsPerRead(uint32 *oprOut, uint32 oprOutLen) {
+ uint64 tot = 0; // Sum of this histogram's per-read counts; this is the return value.
+ // Exits if oprOut (oprOutLen entries) cannot hold every read this histogram has counts for.
+ if (oprOutLen < _oprLen)
+ fprintf(stderr, "ERROR: more reads in histogram than available for output? oprOutLen=%u _oprLen=%u\n", oprOutLen, _oprLen), exit(1);
+ // Counts are ADDED to oprOut, not assigned, so entries already there are kept.
+ for (uint32 ii=0; ii<_oprLen; ii++) {
+ oprOut[ii] += _opr[ii];
+ tot += _opr[ii];
+ }
+
+ return(tot);
+}
+
+
+
+void
+ovStoreHistogram::dumpEvalueLength(FILE *out) {
+ uint32 maxEvalue = 0;
+ uint32 maxLength = 0;
+ if (_opel == NULL) return; // No evalue-length data was collected or loaded, so there is nothing to dump.
+ // Find the largest Evalue and length with values in the histogram
+
+ for (uint32 ee=0; ee<AS_MAX_EVALUE + 1; ee++) {
+ if (_opel[ee] == NULL)
+ continue;
+
+ maxEvalue = ee;
+ // Start at the current maximum: only a longer non-empty bucket can raise it.
+ for (uint32 ll=maxLength; ll<_opelLen; ll++)
+ if (_opel[ee][ll] > 0)
+ maxLength = ll;
+ }
+
+ // Dump those values
+
+ for (uint32 ee=0; ee<=maxEvalue; ee++) {
+ for (uint32 ll=0; ll<=maxLength; ll++)
+ fprintf(out, "%u\t%.4f\t%u\n",
+ ll * _bpb,
+ AS_OVS_decodeEvalue(ee),
+ (_opel[ee] == NULL) ? 0 : _opel[ee][ll]);
+ // One blank line after each evalue's rows.
+ fprintf(out, "\n");
+ }
+
+ fprintf(stderr, "MAX Evalue %.4f\n", AS_OVS_decodeEvalue(maxEvalue));
+ fprintf(stderr, "MAX Length %u\n", maxLength * _bpb);
+}
diff --git a/src/stores/ovStoreHistogram.H b/src/stores/ovStoreHistogram.H
new file mode 100644
index 0000000..917d07d
--- /dev/null
+++ b/src/stores/ovStoreHistogram.H
@@ -0,0 +1,115 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-25
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#ifndef AS_OVSTOREHISTOGRAM_H
+#define AS_OVSTOREHISTOGRAM_H
+
+// Automagically gathers statistics on overlaps as they're written:
+// from overlappers, the number of overlaps per read.
+// in the store, the number of overlaps per (evalue,overlapLength)
+
+#include "AS_global.H"
+#include "gkStore.H"
+
+#include "ovStoreFile.H"
+
+class ovStoreHistogram {
+public:
+ ovStoreHistogram(); // Used when loading data, user must loadData() later
+ ovStoreHistogram(gkStore *gkp, char *path); // Used when loading data, calls loadData() for you
+ ovStoreHistogram(gkStore *gkp, ovFileType type); // Used when writing ovFile
+ ~ovStoreHistogram();
+
+ double minErate(void) { return(AS_OVS_decodeEvalue(0)); };
+ double maxErate(void) { return(AS_OVS_decodeEvalue(_maxEvalue)); };
+
+ uint32 minEvalue(void) { return(0); };
+ uint32 maxEvalue(void) { return(_maxEvalue); };
+
+ uint32 numEvalueBuckets(void) { return(AS_MAX_EVALUE + 1); };
+ uint32 numLengthBuckets(void) { return(_opelLen); };
+
+ //uint32 minOverlapLength(void) { return(0); };
+ //uint32 maxOverlapLength(void) { return(_maxLength * _bpb); };
+
+ uint32 evaluePerBucket(void) { return(_epb); };
+ uint32 basesPerBucket(void) { return(_bpb); };
+ // Count stored for evalue bucket 'eb' and length bucket 'lb'; 0 if that evalue has no vector allocated.
+ uint32 numOverlaps(uint32 eb, uint32 lb) {
+ assert(eb < numEvalueBuckets());
+ assert(lb < numLengthBuckets());
+
+ return((_opel[eb] == NULL) ? 0 : _opel[eb][lb]);
+ };
+
+ //uint32 numOverlaps(uint32 id);
+ //uint32 numOverlaps(uint32 evalue, uint32 length);
+
+ // In an ovFile, add a single value to the histogram
+
+ void addOverlap(ovOverlap *overlap);
+
+ // In an ovStore, load the histogram saved in a file, and add it to our current data.
+
+ void saveData(char *prefix);
+ void loadData(char *prefix);
+
+ // Remove data associated with some prefix.
+
+ static
+ void removeData(char *prefix);
+
+ // Add in the data from histogram 'input' to this histogram
+
+ void add(ovStoreHistogram *input);
+
+ // Adds the number of overlaps per read into oprOut. This array is assumed to be sized using
+ // gkpStore to get the number of reads. The data in the histogram can be shorter, but shouldn't
+ // be longer. If so, it will fail and exit.
+
+ uint64 getOverlapsPerRead(uint32 *oprOut, uint32 oprOutLen); // Returns total overlaps in this histogram
+
+ // Dump a gnuplot-friendly data file of the evalues-length.
+
+ void dumpEvalueLength(FILE *out);
+
+private:
+ gkStore *_gkp;
+
+ uint32 _maxOlength; // Max overlap length seen
+ uint32 _maxEvalue; // Max evalue seen
+
+ uint32 _epb; // Evalues per bucket
+ uint32 _bpb; // Bases per bucket
+
+ uint32 _opelLen; // Length of the data vector for one evalue
+ uint32 **_opel; // Overlaps per evalue-length
+
+ uint32 _oprLen; // Length of opr valid data
+ uint32 _oprMax; // Allocated size of opr
+ uint32 *_opr; // Overlaps per read
+};
+
+#endif // AS_OVSTOREHISTOGRAM_H
diff --git a/src/stores/ovStoreIndexer.C b/src/stores/ovStoreIndexer.C
index 328969c..927a50e 100644
--- a/src/stores/ovStoreIndexer.C
+++ b/src/stores/ovStoreIndexer.C
@@ -44,8 +44,8 @@
int
main(int argc, char **argv) {
- char *ovlName = NULL;
- uint32 maxJob = 0;
+ char *storePath = NULL;
+ uint32 fileLimit = 0; // Number of 'slices' from bucketizer
bool deleteIntermediates = true;
@@ -60,30 +60,31 @@ main(int argc, char **argv) {
int arg=1;
while (arg < argc) {
if (strcmp(argv[arg], "-O") == 0) {
- ovlName = argv[++arg];
+ storePath = argv[++arg];
} else if (strcmp(argv[arg], "-F") == 0) {
- maxJob = atoi(argv[++arg]);
+ fileLimit = atoi(argv[++arg]);
} else if (strcmp(argv[arg], "-f") == 0) {
doFixes = true;
} else if (strcmp(argv[arg], "-t") == 0) {
doExplicitTest = true;
- ovlName = argv[++arg];
+ storePath = argv[++arg];
} else if (strcmp(argv[arg], "-nodelete") == 0) {
deleteIntermediates = false;
} else {
fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
+ err++;
}
arg++;
}
- if (ovlName == NULL)
+ if (storePath == NULL)
err++;
- if ((maxJob == 0) && (doExplicitTest == false))
+ if ((fileLimit == 0) && (doExplicitTest == false))
err++;
if (err) {
@@ -106,9 +107,9 @@ main(int argc, char **argv) {
fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n");
fprintf(stderr, "\n");
- if (ovlName == NULL)
+ if (storePath == NULL)
fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n");
- if ((maxJob == 0) && (doExplicitTest == false))
+ if ((fileLimit == 0) && (doExplicitTest == false))
fprintf(stderr, "ERROR: One of -F (number of slices) or -t (test a store) must be supplied.\n");
exit(1);
@@ -116,54 +117,31 @@ main(int argc, char **argv) {
// Do the test, and maybe fix things up.
- if (doExplicitTest == true) {
- bool passed = testIndex(ovlName, doFixes);
+ //gkStore *gkp = gkStore::gkStore_open(gkpName);
+ ovStoreWriter *writer = new ovStoreWriter(storePath, NULL, fileLimit, 0, 0);
- exit((passed == true) ? 0 : 1);
+ if (doExplicitTest == true) {
+ bool passed = writer->testIndex(doFixes);
+ if (passed == true)
+ fprintf(stderr, "Index looks correct.\n");
+ delete writer;
+ exit(passed == false);
}
// Check that all segments are present. Every segment should have an info file.
- uint32 cntJob = 0;
-
- for (uint32 i=1; i<=maxJob; i++) {
- uint32 complete = 0;
-
- sprintf(name, "%s/%04d", ovlName, i);
- if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
- complete++;
- else
- fprintf(stderr, "ERROR: Segment "F_U32" data not present (%s)\n", i, name);
-
- sprintf(name, "%s/%04d.info", ovlName, i);
- if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
- complete++;
- else
- fprintf(stderr, "ERROR: Segment "F_U32" info not present (%s)\n", i, name);
-
- sprintf(name, "%s/%04d.index", ovlName, i);
- if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
- complete++;
- else
- fprintf(stderr, "ERROR: Segment "F_U32" index not present (%s)\n", i, name);
-
- if (complete == 3)
- cntJob++;
- }
-
- if (cntJob != maxJob) {
- fprintf(stderr, "ERROR: Expected "F_U32" segments, only found "F_U32".\n", maxJob, cntJob);
- exit(1);
- }
+ writer->checkSortingIsComplete();
- // Merge the stuff.
+ // Merge the indices and histogram data.
- mergeInfoFiles(ovlName, maxJob);
+ writer->mergeInfoFiles();
+ writer->mergeHistogram();
// Diagnostics.
- if (testIndex(ovlName, false) == false) {
+ if (writer->testIndex(false) == false) {
fprintf(stderr, "ERROR: index failed tests.\n");
+ delete writer;
exit(1);
}
@@ -179,32 +157,7 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, "Removing intermediate files.\n");
- // Removing indices is easy, beacuse we know how many there are.
-
- for (uint32 i=1; i<=maxJob; i++) {
- sprintf(name, "%s/%04u.index", ovlName, i); AS_UTL_unlink(name);
- sprintf(name, "%s/%04u.info", ovlName, i); AS_UTL_unlink(name);
- }
-
- // We don't know how many buckets there are, so we remove until we fail to find ten
- // buckets in a row.
-
- for (uint32 missing=0, i=1; missing<10; i++) {
- sprintf(name, "%s/bucket%04d", ovlName, i);
-
- if (AS_UTL_fileExists(name, TRUE, FALSE) == FALSE) {
- missing++;
- continue;
- }
-
- missing = 0;
-
- sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, i);
- AS_UTL_unlink(name);
-
- sprintf(name, "%s/bucket%04d", ovlName, i);
- rmdir(name);
- }
+ writer->removeAllIntermediateFiles();
fprintf(stderr, "Finished.\n");
diff --git a/src/stores/ovStoreSorter.C b/src/stores/ovStoreSorter.C
index a911c13..5650a89 100644
--- a/src/stores/ovStoreSorter.C
+++ b/src/stores/ovStoreSorter.C
@@ -56,6 +56,34 @@ using namespace std;
#define ovOverlapSortSize (sizeof(ovOverlap))
+
+// Create the sentinel file '<storePath>/<fileID>.ovs' that marks this sort job as running.
+//
+//   storePath - directory of the overlap store being built
+//   fileID    - index of the slice this job sorts; zero padded to form the file name
+//   forceRun  - when false and the sentinel already exists, report that and exit(0);
+//               when true, (re)create the sentinel regardless
+//
+// Exits with status 1 if the sentinel cannot be created.
+//
+void
+makeSentinel(char *storePath, uint32 fileID, bool forceRun) {
+  char name[FILENAME_MAX];
+
+  snprintf(name, FILENAME_MAX, "%s/%04d.ovs", storePath, fileID);
+
+  if ((forceRun == false) && (AS_UTL_fileExists(name, FALSE, FALSE)))
+    fprintf(stderr, "Job " F_U32 " is running or finished (remove '%s' or -force to try again).\n", fileID, name), exit(0);
+
+  // Decide success from the returned stream, not from errno: errno is only meaningful
+  // after a reported failure, and a NULL stream must never reach fclose().
+
+  errno = 0;
+  FILE *F = fopen(name, "w");
+  if (F == NULL)
+    fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+
+  fclose(F);
+}
+
+
+
+// Delete the sentinel file '<storePath>/<fileID>.ovs'.
+//
+//   storePath - directory of the overlap store being built
+//   fileID    - index of the slice whose sentinel is removed
+//
+// The result of unlink() is not checked.
+//
+void
+removeSentinel(char *storePath, uint32 fileID) {
+  char sentinel[FILENAME_MAX];
+
+  snprintf(sentinel, sizeof(sentinel), "%s/%04d.ovs", storePath, fileID);
+
+  unlink(sentinel);
+}
+
+
+
int
main(int argc, char **argv) {
char *storePath = NULL;
@@ -72,6 +100,8 @@ main(int argc, char **argv) {
bool forceRun = false;
+ char name[FILENAME_MAX];
+
argc = AS_configure(argc, argv);
int err=0;
@@ -104,6 +134,7 @@ main(int argc, char **argv) {
} else {
fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
+ err++;
}
arg++;
@@ -117,8 +148,8 @@ main(int argc, char **argv) {
if (err) {
fprintf(stderr, "usage: %s ...\n", argv[0]);
- fprintf(stderr, " -O x.ovlStore path to overlap store to build the final index for\n");
fprintf(stderr, " -G asm.gkpStore path to gkpStore for this assembly\n");
+ fprintf(stderr, " -O x.ovlStore path to overlap store to build the final index for\n");
fprintf(stderr, "\n");
fprintf(stderr, " -F s number of slices used in bucketizing/sorting\n");
fprintf(stderr, " -job j m index of this overlap input file, and max number of files\n");
@@ -150,190 +181,80 @@ main(int argc, char **argv) {
// Check if we're running or done (or crashed), then note that we're running.
- {
- char name[FILENAME_MAX];
- sprintf(name,"%s/%04d.ovs", storePath, fileID);
-
- if ((forceRun == false) && (AS_UTL_fileExists(name, FALSE, FALSE)))
- fprintf(stderr, "Job "F_U32" is running or finished (remove '%s' or -force to try again).\n", fileID, name), exit(0);
-
- errno = 0;
- FILE *F = fopen(name, "w");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
-
- fclose(F);
- }
-
- // Get sizes of each bucket, and the final merge
-
- uint64 *sliceSizes = new uint64 [fileLimit + 1]; // For each overlap job, number of overlaps per bucket
- uint64 *bucketSizes = new uint64 [jobIdxMax + 1]; // For each bucket we care about, number of overlaps
+ makeSentinel(storePath, fileID, forceRun);
- uint64 totOvl = 0;
- uint64 ovlsLen = 0;
+ // Not done. Let's go!
- for (uint32 i=0; i<=jobIdxMax; i++) {
- bucketSizes[i] = 0;
+ gkStore *gkp = gkStore::gkStore_open(gkpName);
+ ovStoreWriter *writer = new ovStoreWriter(storePath, gkp, fileLimit, fileID, jobIdxMax);
- char namz[FILENAME_MAX];
- char name[FILENAME_MAX];
+ // Get the number of overlaps in each bucket slice.
- sprintf(namz, "%s/bucket%04d/slice%03d.gz", storePath, i, fileID);
- sprintf(name, "%s/bucket%04d/slice%03d", storePath, i, fileID);
-
- if ((AS_UTL_fileExists(namz, FALSE, FALSE) == false) &&
- (AS_UTL_fileExists(name, FALSE, FALSE) == false))
- // If no file, there are no overlaps. Skip loading the bucketSizes file.
- // We expect the gz version to exist (that's the default in bucketizer) more frequently, so
- // be sure to test for existence of that one first.
- continue;
-
- sprintf(name, "%s/bucket%04d/sliceSizes", storePath, i);
-
- FILE *F = fopen(name, "r");
- if (errno)
- fprintf(stderr, "ERROR: Failed to open %s: %s\n", name, strerror(errno)), exit(1);
-
- uint64 nr = AS_UTL_safeRead(F, sliceSizes, "sliceSizes", sizeof(uint64), fileLimit + 1);
-
- fclose(F);
-
- if (nr != fileLimit + 1) {
- fprintf(stderr, "ERROR: short read on '%s'.\n", name);
- fprintf(stderr, "ERROR: read "F_U64" sizes insteadof "F_U32".\n", nr, fileLimit + 1);
- }
- assert(nr == fileLimit + 1);
+ uint64 *bucketSizes = new uint64 [jobIdxMax + 1];
+ uint64 totOvl = writer->loadBucketSizes(bucketSizes);
- fprintf(stderr, "Found "F_U64" overlaps from '%s'.\n", sliceSizes[fileID], name);
-
- bucketSizes[i] = sliceSizes[fileID];
- totOvl += sliceSizes[fileID];
- }
-
- delete [] sliceSizes;
- sliceSizes = NULL;
+ // Fail if we don't have enough memory to process.
if (ovOverlapSortSize * totOvl > maxMemory) {
- fprintf(stderr, "ERROR: Overlaps need %.2f GB memory, but process limited (via -M) to "F_U64" GB.\n",
+ fprintf(stderr, "ERROR: Overlaps need %.2f GB memory, but process limited (via -M) to " F_U64 " GB.\n",
ovOverlapSortSize * totOvl / 1024.0 / 1024.0 / 1024.0, maxMemory >> 30);
-
- char name[FILENAME_MAX];
- sprintf(name,"%s/%04d.ovs", storePath, fileID);
-
- unlink(name);
-
+ removeSentinel(storePath, fileID);
exit(1);
}
- fprintf(stderr, "Overlaps need %.2f GB memory, allowed to use up to (via -M) "F_U64" GB.\n",
- ovOverlapSortSize * totOvl / 1024.0 / 1024.0 / 1024.0, maxMemory >> 30);
-
- ovOverlap *ovls = ovOverlap::allocateOverlaps(NULL, totOvl);
-
- // Load all overlaps - we're guaranteed that either 'name.gz' or 'name' exists (we checked above)
- // or funny business is happening with our files.
-
- for (uint32 i=0; i<=jobIdxMax; i++) {
- if (bucketSizes[i] == 0)
- continue;
-
- char name[FILENAME_MAX];
+ // Or report that we can process.
- sprintf(name, "%s/bucket%04d/slice%03d.gz", storePath, i, fileID);
- if (AS_UTL_fileExists(name, FALSE, FALSE) == false)
- sprintf(name, "%s/bucket%04d/slice%03d", storePath, i, fileID);
-
- if (AS_UTL_fileExists(name, FALSE, FALSE) == false)
- fprintf(stderr, "ERROR: "F_U64" overlaps claim to exist in bucket '%s', but file not found.\n",
- bucketSizes[i], name);
+ fprintf(stderr, "Overlaps need %.2f GB memory, allowed to use up to (via -M) " F_U64 " GB.\n",
+ ovOverlapSortSize * totOvl / 1024.0 / 1024.0 / 1024.0, maxMemory >> 30);
- fprintf(stderr, "Loading "F_U64" overlaps from '%s'.\n", bucketSizes[i], name);
+ // Load all overlaps - we're guaranteed that either 'name.gz' or 'name' exists (we checked when
+ // we loaded bucket sizes) or funny business is happening with our files.
- ovFile *bof = new ovFile(name, ovFileFull);
- uint64 num = 0;
+ ovOverlap *ovls = ovOverlap::allocateOverlaps(gkp, totOvl);
+ uint64 ovlsLen = 0;
- while (bof->readOverlap(ovls + ovlsLen)) {
- ovlsLen++;
- num++;
- }
+ for (uint32 i=0; i<=jobIdxMax; i++)
+ writer->loadOverlapsFromSlice(i, bucketSizes[i], ovls, ovlsLen);
- if (num != bucketSizes[i])
- fprintf(stderr, "ERROR: expected "F_U64" overlaps, found "F_U64" overlaps.\n", bucketSizes[i], num);
- assert(num == bucketSizes[i]);
-
- delete bof;
- }
+ // Check that we found all the overlaps we were expecting.
if (ovlsLen != totOvl)
- fprintf(stderr, "ERROR: read "F_U64" overlaps, expected "F_U64"\n", ovlsLen, totOvl);
+ fprintf(stderr, "ERROR: read " F_U64 " overlaps, expected " F_U64 "\n", ovlsLen, totOvl);
assert(ovlsLen == totOvl);
- if (deleteIntermediateEarly) {
- char name[FILENAME_MAX];
+ // Clean up space if told to.
- fprintf(stderr, "Removing inputs.\n");
- for (uint32 i=0; i<=jobIdxMax; i++) {
- if (bucketSizes[i] == 0)
- continue;
+ if (deleteIntermediateEarly)
+ writer->removeOverlapSlice();
- sprintf(name, "%s/bucket%04d/slice%03d.gz", storePath, i, fileID);
- AS_UTL_unlink(name);
+ // Sort the overlaps! Finally! The parallel STL sort is NOT inplace, and blows up our memory.
- sprintf(name, "%s/bucket%04d/slice%03d", storePath, i, fileID);
- AS_UTL_unlink(name);
- }
- }
-
- // Sort the overlaps - at least on FreeBSD 8.2 with gcc46, the parallel STL sort
- // algorithms are NOT inplace. Restrict to sequential sorting.
- //
- // This sort takes at most 2 minutes on 7gb of overlaps.
- //
fprintf(stderr, "Sorting.\n");
#ifdef _GLIBCXX_PARALLEL
- // If we have the parallel STL, don't use it! Sort is not inplace!
__gnu_sequential::sort(ovls, ovls + ovlsLen);
#else
sort(ovls, ovls + ovlsLen);
#endif
- // Output to store format
+ // Output to the store.
fprintf(stderr, "Writing output.\n");
- writeOverlaps(storePath, ovls, ovlsLen, fileID);
-
- // Clean up.
- delete [] ovls;
+ writer->writeOverlaps(ovls, ovlsLen);
- if (deleteIntermediateLate) {
- char name[FILENAME_MAX];
+ // Clean up. Delete inputs, remove the sentinel, release memory, etc.
- fprintf(stderr, "Removing inputs.\n");
- for (uint32 i=0; i<=jobIdxMax; i++) {
- if (bucketSizes[i] == 0)
- continue;
-
- sprintf(name, "%s/bucket%04d/slice%03d.gz", storePath, i, fileID);
- AS_UTL_unlink(name);
+ delete [] ovls;
- sprintf(name, "%s/bucket%04d/slice%03d", storePath, i, fileID);
- AS_UTL_unlink(name);
- }
- }
+ if (deleteIntermediateLate)
+ writer->removeOverlapSlice();
delete [] bucketSizes;
- // Remove the sentinel to show we're done. The output is in "%s/%04d".
-
- {
- char name[FILENAME_MAX];
- sprintf(name,"%s/%04d.ovs", storePath, fileID);
+ removeSentinel(storePath, fileID);
- unlink(name);
- }
+ gkp->gkStore_close();
// Success!
diff --git a/src/stores/ovStoreStats.C b/src/stores/ovStoreStats.C
index eccb440..3360c34 100644
--- a/src/stores/ovStoreStats.C
+++ b/src/stores/ovStoreStats.C
@@ -34,6 +34,7 @@
#include "stddev.H"
#include "intervalList.H"
+#include "speedCounter.H"
#define OVL_5 0x01
@@ -65,6 +66,7 @@ main(int argc, char **argv) {
double expectedStdDev = 7.0;
bool toFile = true;
+ bool beVerbose = false;
argc = AS_configure(argc, argv);
@@ -91,6 +93,9 @@ main(int argc, char **argv) {
else if (strcmp(argv[arg], "-c") == 0)
toFile = false;
+ else if (strcmp(argv[arg], "-v") == 0)
+ beVerbose = true;
+
else if (strcmp(argv[arg], "-b") == 0)
bgnID = atoi(argv[++arg]);
@@ -153,6 +158,7 @@ main(int argc, char **argv) {
fprintf(stderr, "\n");
fprintf(stderr, " -C mean stddev Expect coverage at mean +- stddev\n");
fprintf(stderr, " -c Write stats to stdout, not to a file\n");
+ fprintf(stderr, " -v Report processing speed to stderr\n");
fprintf(stderr, "\n");
fprintf(stderr, "Outputs:\n");
fprintf(stderr, "\n");
@@ -247,7 +253,7 @@ main(int argc, char **argv) {
// Open outputs.
char N[FILENAME_MAX];
- sprintf(N, "%s.per-read.log", outPrefix);
+ snprintf(N, FILENAME_MAX, "%s.per-read.log", outPrefix);
FILE *LOG = fopen(N, "w");
if (errno)
@@ -260,6 +266,8 @@ main(int argc, char **argv) {
uint32 overlapsLen = 0;
ovOverlap *overlaps = ovOverlap::allocateOverlaps(gkpStore, overlapsMax);
+ speedCounter C(" %9.0f reads (%6.1f reads/sec)\r", 1, 100, beVerbose);
+
overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
while (overlapsLen > 0) {
@@ -529,6 +537,8 @@ main(int argc, char **argv) {
// Done. Read more data.
+ C.tick();
+
overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
}
@@ -580,7 +590,7 @@ main(int argc, char **argv) {
LOG = stdout;
if (toFile == true) {
- sprintf(N, "%s.summary", outPrefix);
+ snprintf(N, FILENAME_MAX, "%s.summary", outPrefix);
LOG = fopen(N, "w");
if (errno)
@@ -589,20 +599,20 @@ main(int argc, char **argv) {
fprintf(LOG, "category reads %% read length feature size or coverage analysis\n");
fprintf(LOG, "---------------- ------- ------- ---------------------- ------------------------ --------------------\n");
- fprintf(LOG, "middle-missing %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHole->numberOfObjects(), (float)readHole->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev());
- fprintf(LOG, "middle-hump %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHump->numberOfObjects(), (float)readHump->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev());
- fprintf(LOG, "no-5-prime %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo5->numberOfObjects(), (float)readNo5->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev());
- fprintf(LOG, "no-3-prime %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo3->numberOfObjects(), (float)readNo3->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev());
+ fprintf(LOG, "middle-missing %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHole->numberOfObjects(), (float)readHole->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev());
+ fprintf(LOG, "middle-hump %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHump->numberOfObjects(), (float)readHump->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev());
+ fprintf(LOG, "no-5-prime %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo5->numberOfObjects(), (float)readNo5->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev());
+ fprintf(LOG, "no-3-prime %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo3->numberOfObjects(), (float)readNo3->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev());
fprintf(LOG, "\n");
- fprintf(LOG, "low-coverage %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), (float)readLowCov->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev());
- fprintf(LOG, "unique %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), (float)readUnique->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev());
- fprintf(LOG, "repeat-cont %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), (float)readRepeatCont->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev());
- fprintf(LOG, "repeat-dove %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), (float)readRepeatDove->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev());
+ fprintf(LOG, "low-coverage %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), (float)readLowCov->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev());
+ fprintf(LOG, "unique %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), (float)readUnique->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev());
+ fprintf(LOG, "repeat-cont %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), (float)readRepeatCont->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev());
+ fprintf(LOG, "repeat-dove %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), (float)readRepeatDove->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev());
fprintf(LOG, "\n");
- fprintf(LOG, "span-repeat %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), (float)readSpanRepeat->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev());
- fprintf(LOG, "uniq-repeat-cont %7"F_U64P" %6.2f %10.2f +- %-8.2f (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), (float)readUniqRepeatCont->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqRepeatCont->mean(), readUniqRepeatCont->stddev());
- fprintf(LOG, "uniq-repeat-dove %7"F_U64P" %6.2f %10.2f +- %-8.2f (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), (float)readUniqRepeatDove->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqRepeatDove->mean(), readUniqRepeatDove->stddev());
- fprintf(LOG, "uniq-anchor %7"F_U64P" %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), (float)readUniqAnchor->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqAnchor->mean(), readUniqAnchor->stddev(), olapUniqAnchor->mean(), olapUniqAnchor->stddev());
+ fprintf(LOG, "span-repeat %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), (float)readSpanRepeat->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev());
+ fprintf(LOG, "uniq-repeat-cont %7" F_U64P " %6.2f %10.2f +- %-8.2f (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), (float)readUniqRepeatCont->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqRepeatCont->mean(), readUniqRepeatCont->stddev());
+ fprintf(LOG, "uniq-repeat-dove %7" F_U64P " %6.2f %10.2f +- %-8.2f (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), (float)readUniqRepeatDove->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqRepeatDove->mean(), readUniqRepeatDove->stddev());
+ fprintf(LOG, "uniq-anchor %7" F_U64P " %6.2f %10.2f +- %-8.2f %10.2f +- %-8.2f (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), (float)readUniqAnchor->numberOfObjects()/gkpStore->gkStore_getNumReads()*100, readUniqAnchor->mean(), readUniqAnchor->stddev(), olapUniqAnchor->mean(), olapUniqAnchor->stddev());
if (toFile == true)
fclose(LOG);
diff --git a/src/stores/ovStoreWriter.C b/src/stores/ovStoreWriter.C
new file mode 100644
index 0000000..cc6a5ca
--- /dev/null
+++ b/src/stores/ovStoreWriter.C
@@ -0,0 +1,757 @@
+
+/******************************************************************************
+ *
+ * This file is part of canu, a software program that assembles whole-genome
+ * sequencing reads into contigs.
+ *
+ * This software is based on:
+ * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ * the 'kmer package' (http://kmer.sourceforge.net)
+ * both originally distributed by Applera Corporation under the GNU General
+ * Public License, version 2.
+ *
+ * Canu branched from Celera Assembler at its revision 4587.
+ * Canu branched from the kmer project at its revision 1994.
+ *
+ * This file is derived from:
+ *
+ * src/stores/ovStore.C
+ *
+ * Modifications by:
+ *
+ * Brian P. Walenz beginning on 2016-OCT-28
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
+ * File 'README.licenses' in the root directory of this distribution contains
+ * full conditions and disclaimers for each license.
+ */
+
+#include "ovStore.H"
+
+
+// Validate a store name and copy it into storePath.
+//
+//   storePath - destination buffer; must hold at least FILENAME_MAX bytes
+//   path      - name to validate and copy
+//
+// Exits with status 1 if path is NULL or is exactly "-". Otherwise storePath is zero
+// filled and then receives at most FILENAME_MAX-1 bytes of path, so it is always NUL
+// terminated; longer names are silently truncated.
+//
+void
+checkAndSaveName(char *storePath, const char *path) {
+
+  if (path == NULL) {
+    fprintf(stderr, "ovStoreWriter::ovStoreWriter()-- ERROR: no name supplied.\n");
+    exit(1);
+  }
+
+  if (strcmp(path, "-") == 0) {
+    fprintf(stderr, "ovStoreWriter::ovStoreWriter()-- ERROR: name cannot be '-' (stdin).\n");
+    exit(1);
+  }
+
+  // Clear the whole buffer first; the copy below then never needs to write a terminator.
+
+  memset(storePath, 0, FILENAME_MAX);
+
+  for (int i = 0; (i < FILENAME_MAX - 1) && (path[i] != 0); i++)
+    storePath[i] = path[i];
+}
+
+
+
+// Create a new, empty overlap store at 'path' for the sequential store build.
+//
+//   path - directory to create the store in; must not be NULL or "-"
+//   gkp  - gkStore, saved and handed to the histogram and to the output files
+//
+// Exits with status 1 if 'path' already tests as a valid ovStore, or if the index file
+// cannot be opened for writing.
+//
+ovStoreWriter::ovStoreWriter(const char *path, gkStore *gkp) {
+  char name[FILENAME_MAX];
+
+  checkAndSaveName(_storePath, path);
+
+  // Fail if this is a valid ovStore.
+
+  if (_info.test(_storePath) == true)
+    fprintf(stderr, "ERROR: '%s' is a valid ovStore; cannot create a new one.\n", _storePath), exit(1);
+
+  // Create the new store
+
+  AS_UTL_mkdir(_storePath);
+
+  _info.clear();
+  _info.save(_storePath);
+
+  _gkp = gkp;
+
+  _offt.clear();
+  _offm.clear();
+
+  _evaluesMap = NULL;
+  _evalues = NULL;
+
+  // No output file is open yet. writeOverlap() opens the first one and only then sets
+  // _overlapsThisFileMax, because the limit needs the record size of a valid _bof.
+
+  _overlapsThisFile = 0;
+  _overlapsThisFileMax = 0;
+  _currentFileIndex = 0;
+  _bof = NULL;
+
+  // This is used by the sequential store build, so we want to collect stats.
+
+  _histogram = new ovStoreHistogram(_gkp, ovFileNormalWrite);
+
+  // Open the index file. Success is decided by the returned stream; errno is only used
+  // to describe a failure.
+
+  snprintf(name, FILENAME_MAX, "%s/index", _storePath);
+
+  errno = 0;
+  _offtFile = fopen(name, "w");
+  if (_offtFile == NULL)
+    fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);
+
+  _fileLimit = 0; // Used in the parallel store, not here.
+  _fileID = 0;
+  _jobIdxMax = 0;
+}
+
+
+
+// Set up a writer for the parallel store build. Nothing is created or opened here; the
+// object only remembers where the store is and which piece of it this job handles.
+//
+//   path      - directory of the store; must not be NULL or "-"
+//   gkp       - gkStore, saved for later use; may be NULL
+//   fileLimit - saved in _fileLimit (mergeInfoFiles() iterates pieces 1.._fileLimit)
+//   fileID    - saved in _fileID (names the piece written by writeOverlaps())
+//   jobIdxMax - saved in _jobIdxMax
+//
+ovStoreWriter::ovStoreWriter(const char *path, gkStore *gkp, uint32 fileLimit, uint32 fileID, uint32 jobIdxMax) {
+
+  checkAndSaveName(_storePath, path);
+
+  _gkp = gkp;
+
+  // No master index is open in this mode. Clear the index records anyway, as the
+  // sequential constructor does, because the destructor inspects _offt._numOlaps.
+
+  _offtFile = NULL;
+  _offt.clear();
+  _offm.clear();
+
+  _evaluesMap = NULL;
+  _evalues = NULL;
+
+  _overlapsThisFile = 0;
+  _overlapsThisFileMax = 0;
+  _currentFileIndex = 0;
+  _bof = NULL;
+
+  _histogram = NULL;
+
+  _fileLimit = fileLimit;
+  _fileID = fileID;
+  _jobIdxMax = jobIdxMax;
+}
+
+
+
+// Finish the store: flush the pending index record (with any gap records before it),
+// save the info, hand the last file's histogram over and save it, then close the index.
+//
+// A writer built with the parallel constructor has no index file (_offtFile is NULL);
+// every use of _offtFile below is therefore guarded.
+//
+// NOTE(review): _info.save() and the "Created ovStore" message also run for writers
+// built with the parallel constructor -- confirm that is intended.
+//
+ovStoreWriter::~ovStoreWriter() {
+
+  // Write the last index element (don't forget to fill in gaps);
+  // update the info, using the final magic number
+
+  if ((_offtFile != NULL) && (_offt._numOlaps > 0)) {
+    for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
+      _offm._fileno = _offt._fileno;
+      _offm._offset = _offt._offset;
+      _offm._numOlaps = 0;
+
+      AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::~ovStore::offm", sizeof(ovStoreOfft), 1);
+    }
+
+    AS_UTL_safeWrite(_offtFile, &_offt, "ovStore::~ovStore::offt", sizeof(ovStoreOfft), 1);
+  }
+
+  _info.save(_storePath, _currentFileIndex);
+
+  if (_bof)
+    _bof->transferHistogram(_histogram);
+  delete _bof;
+
+  if (_histogram)
+    _histogram->saveData(_storePath);
+  delete _histogram;
+
+  fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
+          _storePath, _info.numOverlaps(), _info.smallestID(), _info.largestID());
+
+  // fclose(NULL) is undefined, and _offtFile is NULL in parallel mode.
+
+  if (_offtFile != NULL)
+    fclose(_offtFile);
+}
+
+
+
+
+// Append one overlap to the store being built sequentially.
+//
+//   overlap - the overlap to write; its a_iid must not be smaller than the a_iid of the
+//             previously written overlap (asserted)
+//
+// Opens a new numbered output file when none is open or the current one has reached its
+// per-file limit, and emits index records (including empty records for skipped a_iid
+// values) whenever the a_iid changes.
+//
+void
+ovStoreWriter::writeOverlap(ovOverlap *overlap) {
+
+  // Make sure overlaps are sorted, failing if not.
+
+  bool isSorted = (_offt._a_iid <= overlap->a_iid);
+
+  if (isSorted == false) {
+    fprintf(stderr, "LAST: a:" F_U32 "\n", _offt._a_iid);
+    fprintf(stderr, "THIS: a:" F_U32 " b:" F_U32 "\n", overlap->a_iid, overlap->b_iid);
+  }
+  assert(_offt._a_iid <= overlap->a_iid);
+
+  // Retire the current output file once it is full, keeping its histogram data.
+
+  bool fileIsFull = (_bof != NULL) && (_overlapsThisFile >= _overlapsThisFileMax);
+
+  if (fileIsFull) {
+    _bof->transferHistogram(_histogram);
+
+    delete _bof;
+
+    _bof = NULL;
+    _overlapsThisFile = 0;
+    _overlapsThisFileMax = 0;
+  }
+
+  // Open the next numbered output file if there is none.
+
+  if (_bof == NULL) {
+    char fileName[FILENAME_MAX];
+
+    _currentFileIndex++;
+
+    snprintf(fileName, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
+
+    _bof = new ovFile(_gkp, fileName, ovFileNormalWrite);
+    _overlapsThisFile = 0;
+    _overlapsThisFileMax = 1024 * 1024 * 1024 / _bof->recordSize();
+  }
+
+  // When the a_iid changes, put the finished index record to disk, filling any gaps.
+
+  bool startsNewRead = (_offt._numOlaps != 0) && (_offt._a_iid != overlap->a_iid);
+
+  if (startsNewRead) {
+    for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
+      _offm._fileno = _offt._fileno;
+      _offm._offset = _offt._offset;
+      _offm._overlapID = _offt._overlapID; // Not needed, but makes life easier
+
+      AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::writeOverlap::offset", sizeof(ovStoreOfft), 1);
+    }
+
+    _offm._a_iid++; // One more, since this iid is not missing -- we write it next!
+
+    AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);
+
+    _offt._numOlaps = 0; // Reset; this new id has no overlaps yet.
+  }
+
+  // First overlap for this a_iid: remember where its block starts.
+
+  if (_offt._numOlaps == 0) {
+    _offt._a_iid = overlap->a_iid;
+    _offt._fileno = _currentFileIndex;
+    _offt._offset = _overlapsThisFile;
+    _offt._overlapID = _info.numOverlaps();
+  }
+
+  // Write the overlap, then account for it.
+
+  _bof->writeOverlap(overlap);
+
+  _offt._numOlaps++;
+  _info.addOverlap(overlap->a_iid);
+  _overlapsThisFile++;
+}
+
+
+
+
+
+// For the parallel sort, write a block of sorted overlaps into a single file, with index and info.
+//
+//   ovls    - overlaps, sorted by a_iid (asserted while writing)
+//   ovlsLen - number of overlaps in ovls; may be zero
+//
+// Writes the overlaps to '<storePath>/<fileID>' and the index records to
+// '<storePath>/<fileID>.index', then saves the info for this piece. Exits with status 1
+// if the index file cannot be opened.
+//
+void
+ovStoreWriter::writeOverlaps(ovOverlap *ovls,
+                             uint64 ovlsLen) {
+  char name[FILENAME_MAX];
+
+  ovStoreInfo info;
+
+  info.clear();
+
+  ovStoreOfft offt;
+  ovStoreOfft offm;
+
+  // Start the index at the first overlap's a_iid -- but only read ovls[0] if it exists.
+
+  offt._a_iid = offm._a_iid = 0;
+
+  if (ovlsLen > 0)
+    offt._a_iid = offm._a_iid = ovls[0].a_iid;
+
+  offt._fileno = offm._fileno = _fileID;
+  offt._offset = offm._offset = 0;
+  offt._numOlaps = offm._numOlaps = 0;
+  offt._overlapID = offm._overlapID = 0;
+
+  // Create the output file
+
+  snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _fileID);
+  ovFile *bof = new ovFile(_gkp, name, ovFileNormalWrite);
+
+  // Create the index file. Success is decided by the returned stream; errno is only
+  // used to describe a failure.
+
+  snprintf(name, FILENAME_MAX, "%s/%04d.index", _storePath, _fileID);
+
+  errno = 0;
+  FILE *offtFile = fopen(name, "w");
+  if (offtFile == NULL)
+    fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+
+  // Dump the overlaps
+
+  fprintf(stderr, "Writing " F_U64 " overlaps.\n", ovlsLen);
+
+  for (uint64 i=0; i<ovlsLen; i++ ) {
+    bof->writeOverlap(ovls + i);
+
+    if (offt._a_iid > ovls[i].a_iid) {
+      fprintf(stderr, "LAST: a:" F_U32 "\n", offt._a_iid);
+      fprintf(stderr, "THIS: a:" F_U32 " b:" F_U32 "\n", ovls[i].a_iid, ovls[i].b_iid);
+    }
+    assert(offt._a_iid <= ovls[i].a_iid);
+
+    // Put the index to disk, filling any gaps
+
+    if ((offt._numOlaps != 0) && (offt._a_iid != ovls[i].a_iid)) {
+      while (offm._a_iid < offt._a_iid) {
+        offm._fileno = offt._fileno;
+        offm._offset = offt._offset;
+        offm._overlapID = offt._overlapID; // Not needed, but makes life easier
+        offm._numOlaps = 0;
+
+        AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
+        offm._a_iid++;
+      }
+
+      // One more, since this iid is not offm -- we write it next!
+      offm._a_iid++;
+
+      AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
+
+      offt._overlapID += offt._numOlaps; // The next block of overlaps starts with this ID
+      offt._numOlaps = 0; // The next block has no overlaps yet.
+    }
+
+    // Update the index if this is the first overlap for this a_iid
+
+    if (offt._numOlaps == 0) {
+      offt._a_iid = ovls[i].a_iid;
+      offt._fileno = _fileID;
+      offt._offset = info.numOverlaps();
+    }
+
+    offt._numOlaps++;
+
+    info.addOverlap(ovls[i].a_iid);
+  }
+
+  // Close the output file.
+
+  delete bof;
+
+  // Write the final (empty) index entries.
+
+  while (offm._a_iid < offt._a_iid) {
+    offm._fileno = offt._fileno;
+    offm._offset = offt._offset;
+    offm._overlapID = offt._overlapID; // Not needed, but makes life easier
+    offm._numOlaps = 0;
+
+    AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
+    offm._a_iid++;
+  }
+
+  // And the final (real) index entry. We could, but don't need to, update overlapID with the
+  // number of overlaps in this block.
+
+  AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
+
+  fclose(offtFile);
+
+  // Write the info, and some stats for the user. The stats come from the local 'info'
+  // filled in above; the member _info is not touched by this function.
+
+  info.save(_storePath, _fileID, true);
+
+  fprintf(stderr, "Created ovStore segment '%s/%04d' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
+          _storePath, _fileID, info.numOverlaps(), info.smallestID(), info.largestID());
+}
+
+
+
+
+// For the parallel sort, but also generally applicable, test that the index is sane.
+//
+// Reads '<storePath>/index' one ovStoreOfft record at a time and checks that the a_iid
+// values never decrease and have no gaps.
+//
+//   doFixes - when true, also write '<storePath>/index.fixed', receiving only the
+//             records that raise the largest a_iid seen so far and are not flagged
+//             as following a gap
+//
+// Returns true if no 'decreased' or 'gap' error was counted. Exits with status 1 if
+// either file cannot be opened.
+//
+bool
+ovStoreWriter::testIndex(bool doFixes) {
+ char name[FILENAME_MAX];
+ FILE *I = NULL;
+ FILE *F = NULL;
+
+ // Open the input index.
+
+ snprintf(name, FILENAME_MAX, "%s/index", _storePath);
+
+ errno = 0;
+ I = fopen(name, "r");
+ if (errno)
+ fprintf(stderr, "ERROR: Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1);
+
+ // If we're fixing, open the output index.
+
+ if (doFixes) {
+ snprintf(name, FILENAME_MAX, "%s/index.fixed", _storePath);
+
+ errno = 0;
+ F = fopen(name, "w");
+ if (errno)
+ fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);
+ }
+
+ ovStoreOfft O;
+
+ // curIID is the a_iid of the previous record; minIID and maxIID track the extremes
+ // seen so far. minIID is maintained but not otherwise used in this function.
+
+ uint32 curIID = 0;
+ uint32 minIID = UINT32_MAX;
+ uint32 maxIID = 0;
+
+ uint32 nErrs = 0;
+
+ while (1 == AS_UTL_safeRead(I, &O, "offset", sizeof(ovStoreOfft), 1)) {
+ // All three flags are computed against the state left by the previous record,
+ // before maxIID and curIID are updated below.
+ bool maxIncreases = (maxIID < O._a_iid);
+ bool errorDecreased = ((O._a_iid < curIID));
+ bool errorGap = ((O._a_iid > 0) && (curIID + 1 != O._a_iid));
+
+ if (O._a_iid < minIID)
+ minIID = O._a_iid;
+
+ if (maxIncreases)
+ maxIID = O._a_iid;
+
+ // A decrease takes precedence over a gap; each record counts at most one error.
+
+ if (errorDecreased)
+ fprintf(stderr, "ERROR: index decreased from " F_U32 " to " F_U32 "\n", curIID, O._a_iid), nErrs++;
+ else if (errorGap)
+ fprintf(stderr, "ERROR: gap between " F_U32 " and " F_U32 "\n", curIID, O._a_iid), nErrs++;
+
+ // Records that advance the maximum without a gap are kept (copied when fixing).
+ // Any other record that still claims overlaps is reported as lost; that report
+ // does not itself add to nErrs.
+ // NOTE(review): a record with a_iid 0 never satisfies maxIncreases, so it is not
+ // copied to index.fixed -- confirm that is intended.
+
+ if ((maxIncreases == true) && (errorGap == false)) {
+ if (doFixes)
+ AS_UTL_safeWrite(F, &O, "offset", sizeof(ovStoreOfft), 1);
+
+ } else if (O._numOlaps > 0) {
+ fprintf(stderr, "ERROR: lost overlaps a_iid " F_U32 " fileno " F_U32 " offset " F_U32 " numOlaps " F_U32 "\n",
+ O._a_iid, O._fileno, O._offset, O._numOlaps);
+ }
+
+ curIID = O._a_iid;
+ }
+
+ fclose(I);
+
+ if (F)
+ fclose(F);
+
+ return(nErrs == 0);
+}
+
+
+
+
+
+// For the parallel sort, merge index and info files into one, clean up the intermediates.
+
+void
+ovStoreWriter::mergeInfoFiles(void) {
+  ovStoreInfo    infopiece;   //  Info block of the segment currently being merged.
+  ovStoreInfo    info;        //  Accumulated info block for the whole store.
+
+  //  Merge the per-segment '<storePath>/NNNN.index' files (segments 1.._fileLimit)
+  //  into a single '<storePath>/index', filling in empty records for reads that
+  //  fall between segments, then save the combined info block.
+
+  info.clear();
+
+  //  Template for empty index records (reads with no overlaps).
+
+  ovStoreOfft offm;
+
+  offm._a_iid     = 0;
+  offm._fileno    = 1;
+  offm._offset    = 0;
+  offm._numOlaps  = 0;
+  offm._overlapID = 0;
+
+  //  Open the new master index output file
+
+  char            name[FILENAME_MAX];
+
+  snprintf(name, FILENAME_MAX, "%s/index", _storePath);
+
+  errno = 0;
+  FILE  *idx = fopen(name, "w");
+  if (errno)
+    fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
+
+  //  Special case, we need an empty index for the zeroth fragment.
+
+  AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1);
+
+  //  Sanity checking, compare the number of overlaps processed against the overlapID
+  //  of each ovStoreOfft.
+
+  uint64  totalOverlaps = 0;
+
+  //  Process each segment.
+
+  for (uint32 i=1; i<=_fileLimit; i++) {
+
+    //  Form the segment's index name before reporting it, so the message names the
+    //  segment actually being processed rather than whatever 'name' held last.
+
+    snprintf(name, FILENAME_MAX, "%s/%04d.index", _storePath, i);
+
+    fprintf(stderr, "Processing '%s'\n", name);
+
+    infopiece.load(_storePath, i, true);
+
+    if (infopiece.numOverlaps() == 0) {
+      fprintf(stderr, "  No overlaps found.\n");
+      continue;
+    }
+
+    //  Add empty index elements for missing overlaps
+
+    if (info.largestID() + 1 < infopiece.smallestID())
+      fprintf(stderr, "  Adding empty records for fragments " F_U32 " to " F_U32 "\n",
+              info.largestID() + 1, infopiece.smallestID() - 1);
+
+    while (info.largestID() + 1 < infopiece.smallestID()) {
+      offm._a_iid     = info.largestID() + 1;
+      //offm._fileno    = set below, where the recs are written to the master file
+      //offm._offset    = set below, where the recs are written to the master file
+
+      AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1);
+
+      //  Adding a zero-overlap entry advances info.largestID(), which ends this loop.
+
+      info.addOverlap(offm._a_iid, 0);
+    }
+
+    //  Copy index elements for existing overlaps.  While copying, update the supposed position
+    //  of any fragments with no overlaps.  Without doing this, accessing the store beginning
+    //  or ending at such a fragment will fail.
+
+    {
+      errno = 0;
+      FILE  *F = fopen(name, "r");
+      if (errno)
+        fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
+
+      //  Records are copied in batches of at most recsMax.
+
+      uint32          recsLen = 0;
+      uint32          recsMax = 1024 * 1024;
+      ovStoreOfft    *recs    = new ovStoreOfft [recsMax];
+
+      recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax);
+
+      //  The first record of the segment must continue directly from what is already merged.
+
+      if (recsLen > 0) {
+        if (info.largestID() + 1 != recs[0]._a_iid)
+          fprintf(stderr, "ERROR: '%s' starts with iid " F_U32 ", but store only up to " F_U32 "\n",
+                  name, recs[0]._a_iid, info.largestID());
+        assert(info.largestID() + 1 == recs[0]._a_iid);
+      }
+
+      while (recsLen > 0) {
+
+        //  Update location of missing reads.
+
+        offm._fileno = recs[recsLen-1]._fileno;
+        offm._offset = recs[recsLen-1]._offset;
+
+        //  Update overlapID for each record.  'info' is not updated until the whole
+        //  segment is copied, so info.numOverlaps() is the count before this segment.
+
+        for (uint32 rr=0; rr<recsLen; rr++) {
+          recs[rr]._overlapID += info.numOverlaps();
+
+          if (recs[rr]._numOlaps > 0)
+            assert(recs[rr]._overlapID == totalOverlaps);
+
+          totalOverlaps += recs[rr]._numOlaps;
+        }
+
+        //  Write the records, read next batch
+
+        AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen);
+
+        recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax);
+      }
+
+      delete [] recs;
+
+      fclose(F);
+    }
+
+    //  Update the info block to include the overlaps we just added
+
+    info.addOverlap(infopiece.smallestID(), 0);
+    info.addOverlap(infopiece.largestID(), infopiece.numOverlaps());
+
+    fprintf(stderr, "  Now finished with fragments " F_U32 " to " F_U32 " -- " F_U64 " overlaps.\n",
+            info.smallestID(), info.largestID(), info.numOverlaps());
+  }
+
+  fclose(idx);
+
+
+  //  Dump the new store info file
+
+  info.save(_storePath, _fileLimit);
+
+  //  Report the merged info block that was just saved, not the writer's own _info member.
+
+  fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
+          _storePath, info.numOverlaps(), info.smallestID(), info.largestID());
+}
+
+
+
+
+void
+ovStoreWriter::mergeHistogram(void) {
+  ovStoreHistogram  *merged = new ovStoreHistogram;
+  char               piece[FILENAME_MAX];
+  uint32             seg = 1;
+
+  //  Load the histogram data of every segment (1.._fileLimit) into one
+  //  histogram object, then save the combined data under the store path.
+
+  while (seg <= _fileLimit) {
+    snprintf(piece, FILENAME_MAX, "%s/%04d", _storePath, seg);
+    merged->loadData(piece);
+
+    seg++;
+  }
+
+  merged->saveData(_storePath);
+
+  delete merged;
+}
+
+
+
+
+
+
+
+uint64
+ovStoreWriter::loadBucketSizes(uint64 *bucketSizes) {
+  char      namz[FILENAME_MAX];
+  char      name[FILENAME_MAX];
+
+  //  For every bucket 0.._jobIdxMax, record in bucketSizes[i] the number of overlaps
+  //  that bucket holds for this writer's slice (_fileID), as read from the bucket's
+  //  'sliceSizes' file.  Buckets with no slice file for us are left at zero.
+  //
+  //  bucketSizes must have room for _jobIdxMax + 1 entries.
+  //  Returns the total number of overlaps over all buckets.
+
+  uint64   *sliceSizes = new uint64 [_fileLimit + 1];  //  For each overlap job, number of overlaps per bucket
+  uint64    totOvl     = 0;
+
+  for (uint32 i=0; i<=_jobIdxMax; i++) {
+    bucketSizes[i] = 0;
+
+    snprintf(name, FILENAME_MAX, "%s/bucket%04d/slice%03d",    _storePath, i, _fileID);
+    snprintf(namz, FILENAME_MAX, "%s/bucket%04d/slice%03d.gz", _storePath, i, _fileID);
+
+    //  If no file, there are no overlaps.  Skip loading the bucketSizes file.
+    //  With snappy compression, we expect the file to be not gzip compressed, but will happily
+    //  accept a gzipped file.
+
+    if ((AS_UTL_fileExists(name, FALSE, FALSE) == false) &&
+        (AS_UTL_fileExists(namz, FALSE, FALSE) == false))
+      continue;
+
+    snprintf(name, FILENAME_MAX, "%s/bucket%04d/sliceSizes", _storePath, i);
+
+    //  Decide success from the returned pointer.  errno may still hold a stale value
+    //  from the existence probes above (e.g. when only the .gz slice exists), so it
+    //  is cleared first and only consulted for the message when fopen() fails.
+
+    errno = 0;
+    FILE *F = fopen(name, "r");
+    if (F == NULL)
+      fprintf(stderr, "ERROR:  Failed to open %s: %s\n", name, strerror(errno)), exit(1);
+
+    uint64 nr = AS_UTL_safeRead(F, sliceSizes, "sliceSizes", sizeof(uint64), _fileLimit + 1);
+
+    fclose(F);
+
+    //  The file must hold exactly one size per slice, indices 0.._fileLimit.
+
+    if (nr != _fileLimit + 1) {
+      fprintf(stderr, "ERROR: short read on '%s'.\n", name);
+      fprintf(stderr, "ERROR: read " F_U64 " sizes insteadof " F_U32 ".\n", nr, _fileLimit + 1);
+    }
+    assert(nr == _fileLimit + 1);
+
+    fprintf(stderr, "Found " F_U64 " overlaps from '%s'.\n", sliceSizes[_fileID], name);
+
+    bucketSizes[i] = sliceSizes[_fileID];
+    totOvl        += sliceSizes[_fileID];
+  }
+
+  delete [] sliceSizes;
+
+  return(totOvl);
+}
+
+
+
+void
+ovStoreWriter::loadOverlapsFromSlice(uint32 slice, uint64 expectedLen, ovOverlap *ovls, uint64& ovlsLen) {
+
+  //  Append the overlaps stored for this writer's slice (_fileID) in bucket 'slice'
+  //  to ovls[], starting at index ovlsLen and advancing ovlsLen past each overlap
+  //  read.  The number read must equal expectedLen.
+
+  if (expectedLen == 0)
+    return;
+
+  char  path[FILENAME_MAX];
+
+  //  Prefer the uncompressed slice; fall back to the gzipped name.
+
+  snprintf(path, FILENAME_MAX, "%s/bucket%04d/slice%03d", _storePath, slice, _fileID);
+
+  bool  plainFound = AS_UTL_fileExists(path, FALSE, FALSE);
+
+  if (plainFound == false) {
+    snprintf(path, FILENAME_MAX, "%s/bucket%04d/slice%03d.gz", _storePath, slice, _fileID);
+
+    if (AS_UTL_fileExists(path, FALSE, FALSE) == false)
+      fprintf(stderr, "ERROR: " F_U64 " overlaps claim to exist in bucket '%s', but file not found.\n",
+              expectedLen, path);
+  }
+
+  fprintf(stderr, "Loading " F_U64 " overlaps from '%s'.\n", expectedLen, path);
+
+  ovFile  *input  = new ovFile(_gkp, path, ovFileFull);
+  uint64   loaded = 0;
+
+  for (; input->readOverlap(ovls + ovlsLen); ovlsLen++)
+    loaded++;
+
+  if (loaded != expectedLen)
+    fprintf(stderr, "ERROR: expected " F_U64 " overlaps, found " F_U64 " overlaps.\n", expectedLen, loaded);
+  assert(loaded == expectedLen);
+
+  delete input;
+}
+
+
+
+void
+ovStoreWriter::removeOverlapSlice(void) {
+  char    path[FILENAME_MAX];
+  uint32  bucket = 0;
+
+  //  Remove this writer's slice (_fileID) from every bucket 0.._jobIdxMax,
+  //  trying both the gzipped and the plain file name.
+
+  while (bucket <= _jobIdxMax) {
+    snprintf(path, FILENAME_MAX, "%s/bucket%04d/slice%03d.gz", _storePath, bucket, _fileID);
+    AS_UTL_unlink(path);
+
+    snprintf(path, FILENAME_MAX, "%s/bucket%04d/slice%03d", _storePath, bucket, _fileID);
+    AS_UTL_unlink(path);
+
+    bucket++;
+  }
+}
+
+
+
+void
+ovStoreWriter::checkSortingIsComplete(void) {
+  char    nameD[FILENAME_MAX];
+  char    nameF[FILENAME_MAX];
+  char    nameI[FILENAME_MAX];
+
+  //  Verify that every segment 1.._fileLimit has its data, info and index files
+  //  present in the store directory.  Each missing file is reported; if any
+  //  segment is incomplete the process exits with status 1.
+
+  uint32  failedJobs = 0;
+
+  for (uint32 i=1; i<=_fileLimit; i++) {
+    snprintf(nameD, FILENAME_MAX, "%s/%04d",       _storePath, i);
+    snprintf(nameF, FILENAME_MAX, "%s/%04d.info",  _storePath, i);
+    snprintf(nameI, FILENAME_MAX, "%s/%04d.index", _storePath, i);
+
+    bool existD = AS_UTL_fileExists(nameD, FALSE, FALSE);
+    bool existF = AS_UTL_fileExists(nameF, FALSE, FALSE);
+    bool existI = AS_UTL_fileExists(nameI, FALSE, FALSE);
+
+    if (existD && existF && existI)
+      continue;
+
+    failedJobs++;
+
+    if (existD == false)    fprintf(stderr, "ERROR: Segment " F_U32 " data  not present (%s)\n", i, nameD);
+    if (existF == false)    fprintf(stderr, "ERROR: Segment " F_U32 " info  not present (%s)\n", i, nameF);
+    if (existI == false)    fprintf(stderr, "ERROR: Segment " F_U32 " index not present (%s)\n", i, nameI);
+  }
+
+  //  The message reads "<failed> segments, out of <total>, failed", so the failure
+  //  count must be the first argument and the segment total the second.
+
+  if (failedJobs > 0)
+    fprintf(stderr, "ERROR: " F_U32 " segments, out of " F_U32 ", failed.\n", failedJobs, _fileLimit), exit(1);
+}
+
+
+
+void
+ovStoreWriter::removeAllIntermediateFiles(void) {
+  char  path[FILENAME_MAX];
+
+  //  Delete the per-segment index, info and histogram files, then the bucket
+  //  directories left in the store.
+
+  //  Removing indices and histogram data is easy, because we know how many there are.
+
+  for (uint32 seg=1; seg<=_fileLimit; seg++) {
+    snprintf(path, FILENAME_MAX, "%s/%04u.index", _storePath, seg);
+    AS_UTL_unlink(path);
+
+    snprintf(path, FILENAME_MAX, "%s/%04u.info", _storePath, seg);
+    AS_UTL_unlink(path);
+
+    snprintf(path, FILENAME_MAX, "%s/%04d", _storePath, seg);
+    ovStoreHistogram::removeData(path);
+  }
+
+  //  We don't know how many buckets there are, so we keep probing bucket numbers,
+  //  starting at 1, until the miss counter reaches ten.  The counter is reset
+  //  whenever a bucket is found, but is still incremented at the end of that pass.
+
+  uint32  misses = 0;
+  uint32  bucket = 1;
+
+  while (misses < 10) {
+    snprintf(path, FILENAME_MAX, "%s/bucket%04d", _storePath, bucket);
+
+    if (AS_UTL_fileExists(path, false, false) != false) {
+      snprintf(path, FILENAME_MAX, "%s/bucket%04d/sliceSizes", _storePath, bucket);
+      AS_UTL_unlink(path);
+
+      snprintf(path, FILENAME_MAX, "%s/bucket%04d", _storePath, bucket);
+      rmdir(path);
+
+      misses = 0;
+    }
+
+    bucket++;
+    misses++;
+  }
+}
diff --git a/src/stores/tgStore.C b/src/stores/tgStore.C
index 1de32bf..55b326f 100644
--- a/src/stores/tgStore.C
+++ b/src/stores/tgStore.C
@@ -69,7 +69,8 @@ tgStore::tgStore(const char *path_,
_type = type_;
- strcpy(_path, path_);
+ _path[FILENAME_MAX] = 0;
+ strncpy(_path, path_, FILENAME_MAX-1);
_newTigs = false;
@@ -191,9 +192,9 @@ tgStore::~tgStore() {
void
tgStore::purgeVersion(uint32 version) {
- sprintf(_name, "%s/seqDB.v%03d.dat", _path, version); AS_UTL_unlink(_name);
- sprintf(_name, "%s/seqDB.v%03d.ctg", _path, version); AS_UTL_unlink(_name);
- sprintf(_name, "%s/seqDB.v%03d.utg", _path, version); AS_UTL_unlink(_name);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.dat", _path, version); AS_UTL_unlink(_name);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.ctg", _path, version); AS_UTL_unlink(_name);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.utg", _path, version); AS_UTL_unlink(_name);
}
@@ -269,7 +270,7 @@ tgStore::writeTigToDisk(tgTig *tig, tgStoreEntry *te) {
te->flushNeeded = 0;
te->fileOffset = AS_UTL_ftell(FP);
- //fprintf(stderr, "tgStore::writeTigToDisk()-- write tig "F_S32" in store version "F_U64" at file position "F_U64"\n",
+ //fprintf(stderr, "tgStore::writeTigToDisk()-- write tig " F_S32 " in store version " F_U64 " at file position " F_U64 "\n",
// tig->_tigID, te->svID, te->fileOffset);
tig->saveToStream(FP);
@@ -409,7 +410,7 @@ tgStore::loadTig(uint32 tigID) {
bool cantLoad = true;
if (_tigLen <= tigID)
- fprintf(stderr, "tgStore::loadTig()-- WARNING: invalid out-of-range tigID "F_S32", only "F_S32" ma in store; return NULL.\n",
+ fprintf(stderr, "tgStore::loadTig()-- WARNING: invalid out-of-range tigID " F_S32 ", only " F_S32 " ma in store; return NULL.\n",
tigID, _tigLen);
assert(tigID < _tigLen);
@@ -596,7 +597,7 @@ tgStore::numTigsInMASRfile(char *name) {
void
tgStore::dumpMASR(tgStoreEntry* &R, uint32& L, uint32 V) {
- sprintf(_name, "%s/seqDB.v%03d.tig", _path, V);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.tig", _path, V);
errno = 0;
FILE *F = fopen(_name, "w");
@@ -631,7 +632,7 @@ tgStore::loadMASR(tgStoreEntry* &R, uint32& L, uint32& M, uint32 V) {
//
if (R == NULL) {
for (int32 i=V; i>0; i--) {
- sprintf(_name, "%s/seqDB.v%03d.tig", _path, i);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.tig", _path, i);
L = numTigsInMASRfile(_name);
if (L > 0)
break;
@@ -642,11 +643,11 @@ tgStore::loadMASR(tgStoreEntry* &R, uint32& L, uint32& M, uint32 V) {
memset(R, 0, sizeof(tgStoreEntry) * M);
}
- sprintf(_name, "%s/seqDB.v%03d.tig", _path, V);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.tig", _path, V);
while ((AS_UTL_fileExists(_name) == false) && (V > 0)) {
V--;
- sprintf(_name, "%s/seqDB.v%03d.tig", _path, V);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.tig", _path, V);
}
if (V == 0)
@@ -684,7 +685,7 @@ tgStore::loadMASR(tgStoreEntry* &R, uint32& L, uint32& M, uint32 V) {
// Check we're consistent.
if (L < MASRtotalInFile)
- fprintf(stderr, "tgStore::loadMASR()-- '%s' has more tigs ("F_U32") than expected ("F_U32").\n",
+ fprintf(stderr, "tgStore::loadMASR()-- '%s' has more tigs (" F_U32 ") than expected (" F_U32 ").\n",
_name, MASRtotalInFile, L), exit(1);
AS_UTL_safeRead(F, R, "MASR", sizeof(tgStoreEntry), masrLen);
@@ -703,7 +704,7 @@ tgStore::openDB(uint32 version) {
// Load the data
- sprintf(_name, "%s/seqDB.v%03d.dat", _path, version);
+ snprintf(_name, FILENAME_MAX, "%s/seqDB.v%03d.dat", _path, version);
// If version is the _currentVersion, open for writing if allowed.
//
diff --git a/src/stores/tgStore.H b/src/stores/tgStore.H
index 4ed3ead..bd98e0a 100644
--- a/src/stores/tgStore.H
+++ b/src/stores/tgStore.H
@@ -148,8 +148,8 @@ private:
FILE *openDB(uint32 V);
- char _path[FILENAME_MAX]; // Path to the store.
- char _name[FILENAME_MAX]; // Name of the currently opened file, and other uses.
+ char _path[FILENAME_MAX+1]; // Path to the store.
+ char _name[FILENAME_MAX+1]; // Name of the currently opened file, and other uses.
tgStoreType _type;
diff --git a/src/stores/tgStoreCompress.C b/src/stores/tgStoreCompress.C
index 32fe427..4a47f9c 100644
--- a/src/stores/tgStoreCompress.C
+++ b/src/stores/tgStoreCompress.C
@@ -51,7 +51,7 @@ operationCompress(char *tigName, int tigVers) {
continue;
if (tigStore->getVersion(ti) > tigVers) {
- fprintf(stderr, "WARNING: Attempt to move future unitig "F_U32" from version "F_U32" to previous version %d.\n",
+ fprintf(stderr, "WARNING: Attempt to move future unitig " F_U32 " from version " F_U32 " to previous version %d.\n",
ti, tigStore->getVersion(ti), tigVers);
nErrors++;
} else if (tigStore->getVersion(ti) < tigVers) {
@@ -61,7 +61,7 @@ operationCompress(char *tigName, int tigVers) {
if (nErrors > 0) {
fprintf(stderr, "Store can't be compressed; probably trying to compress to something that isn't the latest version.\n");
- fprintf(stderr, " "F_U32" tigs failed; "F_U32" compressable\n", nErrors, nCompress);
+ fprintf(stderr, " " F_U32 " tigs failed; " F_U32 " compressable\n", nErrors, nCompress);
delete tigStore;
exit(1);
}
@@ -75,7 +75,7 @@ operationCompress(char *tigName, int tigVers) {
}
if (nCompress > 0) {
- fprintf(stderr, "Compressing "F_U32" tigs into version %d\n", nCompress, tigVers);
+ fprintf(stderr, "Compressing " F_U32 " tigs into version %d\n", nCompress, tigVers);
for (uint32 ti=0; ti<tigStore->numTigs(); ti++) {
if ((ti % 1000000) == 0)
@@ -102,7 +102,7 @@ operationCompress(char *tigName, int tigVers) {
if (nCompress > 0) {
for (uint32 version=1; version<tigVers; version++) {
- fprintf(stderr, "Purge version "F_U32".\n", version);
+ fprintf(stderr, "Purge version " F_U32 ".\n", version);
tigStore->purgeVersion(version);
}
}
diff --git a/src/stores/tgStoreCoverageStat.C b/src/stores/tgStoreCoverageStat.C
index 9c4caa4..ededf2f 100644
--- a/src/stores/tgStoreCoverageStat.C
+++ b/src/stores/tgStoreCoverageStat.C
@@ -213,8 +213,8 @@ getGlobalArrivalRate(tgStore *tigStore,
fprintf(outSTA, "BASED ON ALL UNITIGS:\n");
fprintf(outSTA, "sumRho: %.0f\n", sumRho);
- fprintf(outSTA, "totalRandomFrags: "F_U64"\n", totalRandom);
- fprintf(outSTA, "Supplied genome size "F_U64"\n", genomeSize);
+ fprintf(outSTA, "totalRandomFrags: " F_U64 "\n", totalRandom);
+ fprintf(outSTA, "Supplied genome size " F_U64 "\n", genomeSize);
fprintf(outSTA, "Computed genome size: %.2f\n", totalRandom / globalRate);
fprintf(outSTA, "Calculated Global Arrival rate: %f\n", globalRate);
@@ -474,13 +474,13 @@ main(int argc, char **argv) {
errno = 0;
- sprintf(outName, "%s.log", outPrefix);
+ snprintf(outName, FILENAME_MAX, "%s.log", outPrefix);
outLOG = fopen(outName, "w");
if (errno)
fprintf(stderr, "Failed to open '%s': %s\n", outName, strerror(errno)), exit(1);
- sprintf(outName, "%s.stats", outPrefix);
+ snprintf(outName, FILENAME_MAX, "%s.stats", outPrefix);
outSTA = fopen(outName, "w");
if (errno)
diff --git a/src/stores/tgStoreDump.C b/src/stores/tgStoreDump.C
index c862abf..ff45a7c 100644
--- a/src/stores/tgStoreDump.C
+++ b/src/stores/tgStoreDump.C
@@ -200,7 +200,7 @@ public:
else
badCov += ID->hi(ii) - ID->lo(ii);
- double fracGood = (goodCov) / (goodCov + badCov);
+ double fracGood = (double)(goodCov) / (goodCov + badCov);
return((fracGood < minGoodCov) ||
(maxGoodCov < fracGood));
@@ -250,11 +250,12 @@ dumpStatus(gkStore *UNUSED(gkpStore), tgStore *tigStore) {
void
dumpTig(FILE *out, tgTig *tig, bool useGapped) {
- fprintf(out, F_U32"\t"F_U32"\t%s\t%.2f\t%s\t%s\t%s\t"F_U32"\n",
+ fprintf(out, F_U32"\t" F_U32 "\t%s\t%.2f\t%.2f\t%s\t%s\t%s\t" F_U32 "\n",
tig->tigID(),
tig->length(useGapped),
tig->coordinateType(useGapped),
tig->_coverageStat,
+ tig->computeCoverage(useGapped),
toString(tig->_class),
tig->_suggestRepeat ? "yes" : "no",
tig->_suggestCircular ? "yes" : "no",
@@ -265,7 +266,7 @@ dumpTig(FILE *out, tgTig *tig, bool useGapped) {
void
dumpRead(FILE *out, tgTig *tig, tgPosition *read, bool useGapped) {
- fprintf(out, F_U32"\t"F_U32"\t%s\t"F_U32"\t"F_U32"\n",
+ fprintf(out, F_U32"\t" F_U32 "\t%s\t" F_U32 "\t" F_U32 "\n",
read->ident(),
tig->tigID(),
tig->coordinateType(useGapped),
@@ -278,7 +279,7 @@ dumpRead(FILE *out, tgTig *tig, tgPosition *read, bool useGapped) {
void
dumpTigs(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, bool useGapped) {
- fprintf(stdout, "tigID\ttigLen\ttype\tcovStat\tsr\tsu\tsc\tsh\tnumChildren\n");
+ fprintf(stdout, "#tigID\ttigLen\tcoordType\tcovStat\tcoverage\ttigClass\tsugRept\tsugCirc\tnumChildren\n");
for (uint32 ti=0; ti<tigStore->numTigs(); ti++) {
if (tigStore->isDeleted(ti))
@@ -353,9 +354,9 @@ dumpLayout(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, bool
char R[FILENAME_MAX]; int32 Rerr = 0;
char L[FILENAME_MAX]; int32 Lerr = 0;
- sprintf(T, "%s.layout.tigInfo", outPrefix);
- sprintf(R, "%s.layout.readToTig", outPrefix);
- sprintf(L, "%s.layout", outPrefix);
+ snprintf(T, FILENAME_MAX, "%s.layout.tigInfo", outPrefix);
+ snprintf(R, FILENAME_MAX, "%s.layout.readToTig", outPrefix);
+ snprintf(L, FILENAME_MAX, "%s.layout", outPrefix);
errno = 0;
@@ -372,6 +373,9 @@ dumpLayout(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, bool
if (Terr + Rerr + Lerr > 0)
exit(1);
+
+ fprintf(tigs, "#tigID\ttigLen\tcoordType\tcovStat\tcoverage\ttigClass\tsugRept\tsugCirc\tnumChildren\n");
+ fprintf(reads, "#readID\ttigID\tcoordType\tbgn\tend\n");
}
for (uint32 ti=0; ti<tigStore->numTigs(); ti++) {
@@ -545,7 +549,7 @@ plotDepthHistogram(char *N, uint64 *cov, uint32 covMax) {
fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno));
for (uint32 ii=minii; ii<=maxii; ii++)
- fprintf(F, F_U32"\t"F_U64"\n", ii, cov[ii]);
+ fprintf(F, F_U32"\t" F_U64 "\n", ii, cov[ii]);
fclose(F);
}
@@ -600,7 +604,7 @@ dumpDepthHistogram(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filte
// Maybe plot the histogram (and if so, clear it for the next tig).
if (single == true) {
- sprintf(N, "%s.tig%06d.depthHistogram", outPrefix, tig->tigID());
+ snprintf(N, FILENAME_MAX, "%s.tig%06d.depthHistogram", outPrefix, tig->tigID());
plotDepthHistogram(N, cov, covMax);
memset(cov, 0, sizeof(uint64) * covMax); // Slight optimization if we do this in plotDepthHistogram of just the set values.
@@ -612,11 +616,11 @@ dumpDepthHistogram(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filte
}
if (single == false) {
- sprintf(N, "%s.depthHistogram", outPrefix);
+ snprintf(N, FILENAME_MAX, "%s.depthHistogram", outPrefix);
plotDepthHistogram(N, cov, covMax);
}
- delete cov;
+ delete [] cov;
}
@@ -641,6 +645,11 @@ dumpCoverage(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, boo
continue;
}
+ if (tigLen == 0) {
+ tigStore->unloadTig(ti);
+ continue;
+ }
+
// Do something.
intervalList<int32> allL;
@@ -738,7 +747,7 @@ dumpCoverage(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, boo
if (outPrefix) {
char outName[FILENAME_MAX];
- sprintf(outName, "%s.tig%08u.depth", outPrefix, tig->tigID());
+ snprintf(outName, FILENAME_MAX, "%s.tig%08u.depth", outPrefix, tig->tigID());
FILE *outFile = fopen(outName, "w");
if (errno)
@@ -767,7 +776,7 @@ dumpCoverage(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, boo
fprintf(gnuPlot, " %f title '' lt 0 lc 2, \\\n", aveDepth - sdeDepth);
fprintf(gnuPlot, " %f title '' lt 0 lc 2\n", aveDepth + sdeDepth);
- fclose(gnuPlot);
+ pclose(gnuPlot);
}
}
@@ -775,6 +784,8 @@ dumpCoverage(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &filter, boo
tigStore->unloadTig(ti);
}
+
+ delete [] cov;
}
@@ -972,7 +983,7 @@ dumpOverlapHistogram(gkStore *UNUSED(gkpStore), tgStore *tigStore, tgFilter &fil
char N[FILENAME_MAX];
- sprintf(N, "%s.thickestOverlapHistogram", outPrefix);
+ snprintf(N, FILENAME_MAX, "%s.thickestOverlapHistogram", outPrefix);
plotDepthHistogram(N, hist, histMax);
@@ -1022,7 +1033,7 @@ main (int argc, char **argv) {
int arg=1;
int err=0;
while (arg < argc) {
- if (strcmp(argv[arg], "-G") == 0) {
+ if (strcmp(argv[arg], "-G") == 0) {
gkpName = argv[++arg];
}
@@ -1031,7 +1042,9 @@ main (int argc, char **argv) {
tigVers = atoi(argv[++arg]);
}
- else if (strcmp(argv[arg], "-tig") == 0) {
+ else if ((strcmp(argv[arg], "-tig") == 0) ||
+ (strcmp(argv[arg], "-t") == 0) || // Deprecated!
+ (strcmp(argv[arg], "-u") == 0)) { // Deprecated too!
AS_UTL_decodeRange(argv[++arg], filter.tigIDbgn, filter.tigIDend);
}
@@ -1178,10 +1191,12 @@ main (int argc, char **argv) {
fprintf(stderr, " -fasta report sequences in FASTA format (the default)\n");
fprintf(stderr, " -fastq report sequences in FASTQ format\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -layout [opts] the layout of reads in each tig\n");
- fprintf(stderr, " if '-o' is supplied, three files are created, otherwise just the layout is printed to stdout\n");
+ fprintf(stderr, " -layout [opts] the layout of reads in each tig. if '-o' is supplied, three files are created.\n");
fprintf(stderr, " -gapped report the gapped (multialignment) positions\n");
- fprintf(stderr, " -o outputPrefix write plots to 'outputPrefix.*' in the current directory\n");
+ fprintf(stderr, " -o name write data to 'name.*' files in the current directory\n");
+ fprintf(stderr, " name.layout - layout of reads\n");
+ fprintf(stderr, " name.layout.readToTig - read to tig position\n");
+ fprintf(stderr, " name.layout.tigInfo - metadata for each tig\n");
fprintf(stderr, "\n");
fprintf(stderr, " -multialign [opts] the full multialignment, output is to stdout\n");
fprintf(stderr, " -w width width of the page\n");
@@ -1235,13 +1250,13 @@ main (int argc, char **argv) {
filter.tigIDend = nTigs-1;
if (nTigs <= filter.tigIDend) {
- fprintf(stderr, "WARNING: adjusting tig ID range from "F_U32"-"F_U32" to "F_U32"-"F_U32" as there are only "F_U32" tigs in the store.\n",
+ fprintf(stderr, "WARNING: adjusting tig ID range from " F_U32 "-" F_U32 " to " F_U32 "-" F_U32 " as there are only " F_U32 " tigs in the store.\n",
filter.tigIDbgn, filter.tigIDend, filter.tigIDbgn, nTigs-1, nTigs);
filter.tigIDend = nTigs - 1;
}
if (filter.tigIDend < filter.tigIDbgn) {
- fprintf(stderr, "WARNING: adjusting inverted tig ID range -t "F_U32"-"F_U32"\n",
+ fprintf(stderr, "WARNING: adjusting inverted tig ID range -t " F_U32 "-" F_U32 "\n",
filter.tigIDbgn, filter.tigIDend);
uint32 x = filter.tigIDend;
filter.tigIDend = filter.tigIDbgn;
@@ -1249,7 +1264,7 @@ main (int argc, char **argv) {
}
if (nTigs <= filter.tigIDbgn)
- fprintf(stderr, "ERROR: only "F_U32" tigs in the store (IDs 0-"F_U32" inclusive); can't dump requested range -t "F_U32"-"F_U32"\n",
+ fprintf(stderr, "ERROR: only " F_U32 " tigs in the store (IDs 0-" F_U32 " inclusive); can't dump requested range -t " F_U32 "-" F_U32 "\n",
nTigs,
nTigs-1,
filter.tigIDbgn, filter.tigIDend), exit(1);
diff --git a/src/stores/tgStoreFilter.C b/src/stores/tgStoreFilter.C
index 1d2fa61..56db02e 100644
--- a/src/stores/tgStoreFilter.C
+++ b/src/stores/tgStoreFilter.C
@@ -256,7 +256,7 @@ main(int argc, char **argv) {
errno = 0;
- sprintf(outName, "%s.log", outPrefix);
+ snprintf(outName, FILENAME_MAX, "%s.log", outPrefix);
outLOG = fopen(outName, "w");
if (errno)
@@ -264,7 +264,7 @@ main(int argc, char **argv) {
fprintf(outLOG, "tigID\trho\tcovStat\tarrDist\n");
- sprintf(outName, "%s.stats", outPrefix);
+ snprintf(outName, FILENAME_MAX, "%s.stats", outPrefix);
outSTA = fopen(outName, "w");
if (errno)
@@ -557,14 +557,14 @@ main(int argc, char **argv) {
}
fprintf(outSTA, "classification number of unitigs total length\n");
- fprintf(outSTA, " unique: %17"F_U32P" %14"F_U64P"\n", repeat_IsUnique.num, repeat_IsUnique.len);
- fprintf(outSTA, " singleton: %17"F_U32P" %14"F_U64P"\n", repeat_IsSingleton.num, repeat_IsSingleton.len);
- fprintf(outSTA, " repeat: %17"F_U32P" %14"F_U64P"\n", repeat_IsRepeat.num, repeat_IsRepeat.len);
- fprintf(outSTA, " too few reads: %17"F_U32P" %14"F_U64P"\n", repeat_LowReads.num, repeat_LowReads.len);
- fprintf(outSTA, " low cov stat: %17"F_U32P" %14"F_U64P"\n", repeat_LowCovStat.num, repeat_LowCovStat.len);
- fprintf(outSTA, " too short: %17"F_U32P" %14"F_U64P"\n", repeat_Short.num, repeat_Short.len);
- fprintf(outSTA, " spanning read: %17"F_U32P" %14"F_U64P"\n", repeat_SingleSpan.num, repeat_SingleSpan.len);
- fprintf(outSTA, " low coverage: %17"F_U32P" %14"F_U64P"\n", repeat_LowCov.num, repeat_LowCov.len);
+ fprintf(outSTA, " unique: %17" F_U32P " %14" F_U64P "\n", repeat_IsUnique.num, repeat_IsUnique.len);
+ fprintf(outSTA, " singleton: %17" F_U32P " %14" F_U64P "\n", repeat_IsSingleton.num, repeat_IsSingleton.len);
+ fprintf(outSTA, " repeat: %17" F_U32P " %14" F_U64P "\n", repeat_IsRepeat.num, repeat_IsRepeat.len);
+ fprintf(outSTA, " too few reads: %17" F_U32P " %14" F_U64P "\n", repeat_LowReads.num, repeat_LowReads.len);
+ fprintf(outSTA, " low cov stat: %17" F_U32P " %14" F_U64P "\n", repeat_LowCovStat.num, repeat_LowCovStat.len);
+ fprintf(outSTA, " too short: %17" F_U32P " %14" F_U64P "\n", repeat_Short.num, repeat_Short.len);
+ fprintf(outSTA, " spanning read: %17" F_U32P " %14" F_U64P "\n", repeat_SingleSpan.num, repeat_SingleSpan.len);
+ fprintf(outSTA, " low coverage: %17" F_U32P " %14" F_U64P "\n", repeat_LowCov.num, repeat_LowCov.len);
fclose(outLOG);
fclose(outSTA);
diff --git a/src/stores/tgTig.C b/src/stores/tgTig.C
index 7a58946..3f7643e 100644
--- a/src/stores/tgTig.C
+++ b/src/stores/tgTig.C
@@ -37,6 +37,7 @@
#include "AS_UTL_fasta.C"
#include "splitToWords.H"
+#include "intervalList.H"
tgPosition::tgPosition() {
@@ -230,6 +231,32 @@ tgTig::operator=(tgTig & tg) {
+double
+tgTig::computeCoverage(bool useGapped) {
+  intervalList<int32>   readSpans;
+
+  //  Return the average depth of the children over this tig: the sum over all
+  //  depth intervals of (interval size * depth), divided by the tig length in
+  //  the requested (gapped or ungapped) coordinates.  A zero-length tig yields 0.
+
+  //  Collect the span of every child, mapped to ungapped coordinates if requested.
+
+  for (uint32 ci=0; ci<numberOfChildren(); ci++) {
+    tgPosition  *child = getChild(ci);
+    uint32       lo;
+    uint32       hi;
+
+    if (useGapped) {
+      lo = child->min();
+      hi = child->max();
+    } else {
+      lo = mapGappedToUngapped(child->min());
+      hi = mapGappedToUngapped(child->max());
+    }
+
+    readSpans.add(lo, hi - lo);
+  }
+
+  //  NOTE(review): constructing one intervalList from another presumably builds
+  //  the depth-of-coverage intervals queried below -- confirm in intervalList.H.
+
+  intervalList<int32>   depths(readSpans);
+
+  double  depthSum = 0;
+
+  for (uint32 ii=0; ii<depths.numberOfIntervals(); ii++)
+    depthSum += (depths.hi(ii) - depths.lo(ii) + 1) * depths.depth(ii);
+
+  if (length(useGapped) == 0)
+    return(0);
+
+  return(depthSum / length(useGapped));
+}
+
+
void
tgTig::buildUngapped(void) {
@@ -318,7 +345,7 @@ tgTig::loadFromStreamOrLayout(FILE *F) {
// Decide if the file contains an ASCII layout or a binary stream. It's probably rather fragile,
// testing if the first byte is 't' (from 'tig') or 'T' (from 'TIGR').
- char ch = getc(F);
+ int ch = getc(F);
ungetc(ch, F);
@@ -430,7 +457,7 @@ tgTig::dumpLayout(FILE *F) {
if (_gappedLen > 0)
assert(_gappedLen == _layoutLen);
- fprintf(F, "tig "F_U32"\n", _tigID);
+ fprintf(F, "tig " F_U32 "\n", _tigID);
fprintf(F, "len %d\n", _layoutLen);
// Adjust QV's to Sanger encoding
@@ -461,7 +488,7 @@ tgTig::dumpLayout(FILE *F) {
fprintf(F, "class %s\n", toString(_class));
fprintf(F, "suggestRepeat %c\n", _suggestRepeat ? 'T' : 'F');
fprintf(F, "suggestCircular %c\n", _suggestCircular ? 'T' : 'F');
- fprintf(F, "numChildren "F_U32"\n", _childrenLen);
+ fprintf(F, "numChildren " F_U32 "\n", _childrenLen);
// And the reads.
@@ -472,22 +499,22 @@ tgTig::dumpLayout(FILE *F) {
deltaString[0] = 0;
if (imp->_askip + imp->_bskip > 0)
- sprintf(trimString, " trim %6u %6u", imp->_askip, imp->_bskip);
+ snprintf(trimString, 128, " trim %6u %6u", imp->_askip, imp->_bskip);
if (imp->_deltaLen > 0)
- sprintf(deltaString, " delta %5u at %u", imp->_deltaLen, imp->_deltaOffset);
+ snprintf(deltaString, 128, " delta %5u at %u", imp->_deltaLen, imp->_deltaOffset);
if (imp->_isRead)
- fprintf(F, "read %9"F_U32P" anchor %9"F_U32P" hang %6"F_S32P" %6"F_S32P" position %6"F_U32P" %6"F_U32P"%s%s\n",
+ fprintf(F, "read %9" F_U32P " anchor %9" F_U32P " hang %7" F_S32P " %7" F_S32P " position %9" F_U32P " %9" F_U32P "%s%s\n",
imp->ident(), imp->anchor(), imp->aHang(), imp->bHang(), imp->bgn(), imp->end(), trimString, deltaString);
if (imp->_isUnitig)
- fprintf(F, "unitig %9"F_U32P" anchor %9"F_U32P" hang %6"F_S32P" %6"F_S32P" position %6"F_U32P" %6"F_U32P"%s%s\n",
+ fprintf(F, "unitig %9" F_U32P " anchor %9" F_U32P " hang %7" F_S32P " %7" F_S32P " position %9" F_U32P " %9" F_U32P "%s%s\n",
imp->ident(), imp->anchor(), imp->aHang(), imp->bHang(), imp->bgn(), imp->end(), trimString, deltaString);
if (imp->_isContig)
- fprintf(F, "contig %9"F_U32P" anchor %9"F_U32P" hang %6"F_S32P" %6"F_S32P" position %6"F_U32P" %6"F_U32P"%s%s\n",
+ fprintf(F, "contig %9" F_U32P " anchor %9" F_U32P " hang %7" F_S32P " %7" F_S32P " position %9" F_U32P " %9" F_U32P "%s%s\n",
imp->ident(), imp->anchor(), imp->aHang(), imp->bHang(), imp->bgn(), imp->end(), trimString, deltaString);
}
@@ -508,8 +535,10 @@ tgTig::loadLayout(FILE *F) {
fgets(LINE, LINEmax, F); LINEnum++;
- if (feof(F))
+ if (feof(F)) {
+ delete [] LINE;
return(false);
+ }
while (!feof(F)) {
splitToWords W(LINE);
@@ -554,7 +583,7 @@ tgTig::loadLayout(FILE *F) {
else if (strcmp(W[1], "contig") == 0)
_class = tgTig_contig;
else
- fprintf(stderr, "tgTig::loadLayout()-- '%s' line "F_U64" invalid: '%s'\n", W[0], LINEnum, LINE), exit(1);
+ fprintf(stderr, "tgTig::loadLayout()-- '%s' line " F_U64 " invalid: '%s'\n", W[0], LINEnum, LINE), exit(1);
} else if (strcmp(W[0], "suggestRepeat") == 0) {
_suggestRepeat = strtouint32(W[1]);
@@ -571,7 +600,7 @@ tgTig::loadLayout(FILE *F) {
(strcmp(W[0], "contig") == 0)) {
if (W.numWords() < 10)
- fprintf(stderr, "tgTig::loadLayout()-- '%s' line "F_U64" invalid: '%s'\n", W[0], LINEnum, LINE), exit(1);
+ fprintf(stderr, "tgTig::loadLayout()-- '%s' line " F_U64 " invalid: '%s'\n", W[0], LINEnum, LINE), exit(1);
if (nChildren >= _childrenLen) {
resizeArray(_children, _childrenLen, _childrenMax, _childrenLen + 1, resizeArray_copyData);
@@ -640,7 +669,7 @@ void
tgTig::dumpFASTA(FILE *F, bool useGapped) {
AS_UTL_writeFastA(F,
bases(useGapped), length(useGapped), 100,
- ">tig%08u len="F_U32" reads="F_U32" covStat=%.2f gappedBases=%s class=%s suggestRepeat=%s suggestCircular=%s\n",
+ ">tig%08u len=" F_U32 " reads=" F_U32 " covStat=%.2f gappedBases=%s class=%s suggestRepeat=%s suggestCircular=%s\n",
tigID(),
length(useGapped),
numberOfChildren(),
@@ -657,7 +686,7 @@ tgTig::dumpFASTQ(FILE *F, bool useGapped) {
AS_UTL_writeFastQ(F,
bases(useGapped), length(useGapped),
quals(useGapped), length(useGapped),
- "@tig%08u len="F_U32" reads="F_U32" covStat=%.2f gappedBases=%s class=%s suggestRepeat=%s suggestCircular=%s\n",
+ "@tig%08u len=" F_U32 " reads=" F_U32 " covStat=%.2f gappedBases=%s class=%s suggestRepeat=%s suggestCircular=%s\n",
tigID(),
length(useGapped),
numberOfChildren(),
diff --git a/src/stores/tgTig.H b/src/stores/tgTig.H
index a6dc113..9c2fd9e 100644
--- a/src/stores/tgTig.H
+++ b/src/stores/tgTig.H
@@ -263,6 +263,8 @@ public:
char *bases(bool useGapped=true) { return( (useGapped == true) ? gappedBases() : ungappedBases() ); };
char *quals(bool useGapped=true) { return( (useGapped == true) ? gappedQuals() : ungappedQuals() ); };
+ double computeCoverage(bool useGapped=true);
+
private:
uint32 layoutLength(void) { return(_layoutLen); };
diff --git a/src/stores/tgTigMultiAlignDisplay.C b/src/stores/tgTigMultiAlignDisplay.C
index 5cb299b..708c0d3 100644
--- a/src/stores/tgTigMultiAlignDisplay.C
+++ b/src/stores/tgTigMultiAlignDisplay.C
@@ -53,6 +53,7 @@ public:
readLen = 0;
bases = NULL;
quals = NULL;
+ delta = NULL;
next = NULL;
};
@@ -302,8 +303,9 @@ tgTig::display(FILE *F,
fprintf(F, "<<< begin Contig %d >>>", tigID());
- char *gruler = new char [displayWidth + 200];
- char *uruler = new char [displayWidth + 200];
+ uint32 lruler = displayWidth + 200;
+ char *gruler = new char [lruler];
+ char *uruler = new char [lruler];
int32 ungapped = 1;
int32 tick = 1;
@@ -325,10 +327,10 @@ tgTig::display(FILE *F,
for (uint32 rowind=0; rowind<rowlen; rowind++) {
if (((window + 1 + rowind) % 25) == 0)
- sprintf(gruler + rowind, "| GAP=%d", window + 1 + rowind);
+ snprintf(gruler + rowind, lruler, "| GAP=%d", window + 1 + rowind);
if ((ungapped % 25) == 0)
- sprintf(uruler + rowind, "| UNG=%d", ungapped);
+ snprintf(uruler + rowind, lruler, "| UNG=%d", ungapped);
if (_gappedBases[window + rowind] != '-')
ungapped++;
diff --git a/src/stores/tgTigSizeAnalysis.C b/src/stores/tgTigSizeAnalysis.C
index 2a97414..60733c0 100644
--- a/src/stores/tgTigSizeAnalysis.C
+++ b/src/stores/tgTigSizeAnalysis.C
@@ -73,6 +73,7 @@ tgTigSizeAnalysis::evaluateTig(tgTig *tig, bool useGapped) {
case tgTig_unassembled: lenUnassembled.push_back(length); break;
case tgTig_bubble: lenBubble.push_back(length); break;
case tgTig_contig: lenContig.push_back(length); break;
+ default: break;
}
}
@@ -112,7 +113,7 @@ tgTigSizeAnalysis::printSummary(FILE *out, char *description, vector<uint32> &da
sum += data[i];
while (siz * nnn / 100 < sum) {
- fprintf(out, "%s ng%-3"F_U64P" %10"F_U32P" bp lg%-3"F_U64P" %6"F_U64P" sum %10"F_U64P" bp\n",
+ fprintf(out, "%s ng%-3" F_U64P " %10" F_U32P " bp lg%-3" F_U64P " %6" F_U64P " sum %10" F_U64P " bp\n",
description,
nnn, data[i],
nnn, i+1,
@@ -122,9 +123,9 @@ tgTigSizeAnalysis::printSummary(FILE *out, char *description, vector<uint32> &da
}
}
- fprintf(out, "%s sum %10"F_U64P" (genomeSize "F_U64")\n", description, tot, genomeSize);
- fprintf(out, "%s num %10"F_U64P"\n", description, cnt);
- fprintf(out, "%s ave %10"F_U64P"\n", description, tot / cnt);
+ fprintf(out, "%s sum %10" F_U64P " (genomeSize " F_U64 ")\n", description, tot, genomeSize);
+ fprintf(out, "%s num %10" F_U64P "\n", description, cnt);
+ fprintf(out, "%s ave %10" F_U64P "\n", description, tot / cnt);
}
diff --git a/src/utgcns/libNDFalcon/dw.C b/src/utgcns/libNDFalcon/dw.C
index d710d6e..e12e89a 100644
--- a/src/utgcns/libNDFalcon/dw.C
+++ b/src/utgcns/libNDFalcon/dw.C
@@ -19,6 +19,10 @@
* are a 'United States Government Work', and
* are released in the public domain
*
+ * Brian P. Walenz beginning on 2016-JUL-19
+ * are a 'United States Government Work', and
+ * are released in the public domain
+ *
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
@@ -194,6 +198,8 @@ bool align(const char * query_seq, seq_coor_t q_len,
max_idx = 0;
for (d = 0; d < max_d; d ++ ) {
if (max_k - min_k > band_size) {
+ fprintf(stderr, "generatePBDAG()-- Exceeded band size max_k %d - min_k %d = %d > band_size = %d.\n",
+ max_k, min_k, max_k - min_k, band_size);
break;
}
diff --git a/src/utgcns/libNDFalcon/dw.H b/src/utgcns/libNDFalcon/dw.H
index 16f9dd8..836d9fa 100644
--- a/src/utgcns/libNDFalcon/dw.H
+++ b/src/utgcns/libNDFalcon/dw.H
@@ -124,8 +124,6 @@ public:
int32_t _tgt_bgn;
int32_t _tgt_end;
- int32_t _olapLen;
-
char* _qry_aln_str;
char* _tgt_aln_str;
};
diff --git a/src/utgcns/libNDalign/NDalign.C b/src/utgcns/libNDalign/NDalign.C
index 215308e..267f918 100644
--- a/src/utgcns/libNDalign/NDalign.C
+++ b/src/utgcns/libNDalign/NDalign.C
@@ -473,7 +473,7 @@ NDalign::findHits(void) {
if ((apos - bpos < _minDiag) ||
(apos - bpos > _maxDiag))
- fprintf(stderr, "NDalign::findHits()-- kmer "F_X64" apos - bpos = %d _minDiag = %d _maxDiag = %d\n",
+ fprintf(stderr, "NDalign::findHits()-- kmer " F_X64 " apos - bpos = %d _minDiag = %d _maxDiag = %d\n",
kmer, apos-bpos, _minDiag, _maxDiag);
assert(apos - bpos >= _minDiag); // ...these too.
assert(apos - bpos <= _maxDiag);
diff --git a/src/utgcns/libcns/abAbacus-refine.C b/src/utgcns/libcns/abAbacus-refine.C
index bf51c8a..c096615 100644
--- a/src/utgcns/libcns/abAbacus-refine.C
+++ b/src/utgcns/libcns/abAbacus-refine.C
@@ -82,14 +82,15 @@ enum ShiftStatus {
class abAbacusWork {
public:
abAbacusWork() {
- start_column = NULL;
- end_column = NULL;
- rows = 0;
- columns = 0;
- window_width = 0;
- shift = UNSHIFTED;
- beads = NULL;
- calls = NULL;
+ abacus_indices = NULL;
+ start_column = NULL;
+ end_column = NULL;
+ rows = 0;
+ columns = 0;
+ window_width = 0;
+ shift = UNSHIFTED;
+ beads = NULL;
+ calls = NULL;
};
abAbacusWork(abAbacus *abacus,
diff --git a/src/utgcns/libcns/unitigConsensus.C b/src/utgcns/libcns/unitigConsensus.C
index 62ea888..3762ed2 100644
--- a/src/utgcns/libcns/unitigConsensus.C
+++ b/src/utgcns/libcns/unitigConsensus.C
@@ -75,7 +75,7 @@
// for pbdagcon
#include "Alignment.H"
#include "AlnGraphBoost.H"
-#include "SimpleAligner.H"
+#include "dw.H"
#include "NDalign.H"
@@ -216,7 +216,7 @@ unitigConsensus::generate(tgTig *tig_,
// Second attempt, default parameters after recomputing consensus sequence.
if (showAlgorithm())
- fprintf(stderr, "generateMultiAlignment()-- recompute full consensus\n");
+ fprintf(stderr, "generate()-- recompute full consensus\n");
recomputeConsensus(showMultiAlignments());
@@ -249,7 +249,7 @@ unitigConsensus::generate(tgTig *tig_,
return(true);
returnFailure:
- fprintf(stderr, "generateMultiAlignment()-- unitig %d FAILED.\n", tig->tigID());
+ fprintf(stderr, "generate()-- unitig %d FAILED.\n", tig->tigID());
// tgTig should have no changes.
@@ -261,98 +261,208 @@ bool
unitigConsensus::generatePBDAG(tgTig *tig_,
map<uint32, gkRead *> *inPackageRead_,
map<uint32, gkReadData *> *inPackageReadData_) {
- tig = tig_;
- numfrags = tig->numberOfChildren();
+ tig = tig_;
+ numfrags = tig->numberOfChildren();
+
+ if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
+ fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
+ return(false);
+ }
+
+ // First we need to load into Unitig data structure the quick cns
+
+ Unitig utg;
+
+ utg.id = tig->tigID();
+ utg.seq = string(tig->_layoutLen, 'N');
+
+ // Build a quick consensus to align to, just smash together sequences.
+
+ for (uint32 i=0; i<numfrags; i++) {
+ abSequence *seq = abacus->getSequence(i);
+ char *fragment = seq->getBases();
+ uint32 readLen = seq->length();
+
+ uint32 start = utgpos[i].min();
+ uint32 end = utgpos[i].max();
+
+ if (start > utg.seq.length()) {
+ fprintf(stderr, "WARNING: reset start from " F_U32 " to " F_U64 "\n", start, utg.seq.length()-1);
+ start = utg.seq.length() - 1;
+ }
+
+ if (end - start > readLen) {
+ fprintf(stderr, "WARNING: reset end from " F_U32 " to " F_U32 "\n", end, start+readLen);
+ end = start + readLen;
+ }
- if (initialize(inPackageRead_, inPackageReadData_) == FALSE) {
- fprintf(stderr, "generatePBDAG()-- Failed to initialize for tig %u with %u children\n", tig->tigID(), tig->numberOfChildren());
- return(false);
+ if (end > utg.seq.length()) {
+ fprintf(stderr, "WARNING: truncate end from " F_U32 " to " F_U64 "\n", end, utg.seq.length()-1);
+ end = utg.seq.length() - 1;
}
- // first we need to load into Unitig data structure the quick cns
- Unitig utg;
- utg.id = tig->tigID();
-
- utg.seq = string(tig->_layoutLen, 'N');
-
- // build a quick consensus to align to, just smash together sequences.
- for (int i = 0; i < numfrags; i++) {
- gkRead *read = gkpStore->gkStore_getRead(utgpos[i].ident());
- uint32 readLen = read->gkRead_sequenceLength();
-
- uint32 start = utgpos[i].min();
- uint32 end = utgpos[i].max();
-
- if (start > utg.seq.length()) {
- start = utg.seq.length() - 1;
- }
- if (end - start > readLen) {
- end = start + readLen;
- }
- if (end > utg.seq.length()) {
- end = utg.seq.length() - 1;
- }
-
- abSequence *seq = abacus->getSequence(i);
- char *fragment = seq->getBases();
-
- for (int j = start; j < end; j++) {
- if (utg.seq[j] == 'N') {
- utg.seq[j] = fragment[j - start];
- }
- }
+ // Read aligns from position start to end. Skip ahead until we find unset bases.
+
+ uint32 cur = start;
+ while ((cur < end) && (utg.seq[cur] != 'N'))
+ cur++;
+
+ fprintf(stderr, "generatePBDAG()-- template from %7d to %7d comes from read %3d id %6d bases (%5d %5d) nominally %6d %6d)\n",
+ cur, end, i, seq->gkpIdent(),
+ cur - start,
+ end - start,
+ utgpos[i].min(),
+ utgpos[i].max());
+
+ for (uint32 j=cur; j<end; j++) {
+ //if (utg.seq[j] != 'N')
+ // fprintf(stderr, "WARNING: template %6d already set\n", j);
+ utg.seq[j] = fragment[j - start];
}
- AlnGraphBoost ag(utg.seq);
+ }
+
+ for (uint32 jj=0; jj<tig->_layoutLen; jj++)
+ if (utg.seq[jj] == 'N')
+ fprintf(stdout, "generatePBDAG()-- WARNING: template position %u not defined.\n", jj);
+
+ assert(utg.seq[tig->_layoutLen] == 0);
+
+#if 0
+ FILE *F = fopen("template.fasta", "w");
+ fprintf(F, ">tig%d template\n%s\n", tig->tigID(), utg.seq.c_str());
+ fclose(F);
+#endif
+
+ AlnGraphBoost ag(utg.seq);
+
+ // Compute alignments of each sequence in parallel
- // compute alignments of each sequence in parallel
#pragma omp parallel for schedule(dynamic)
- for (int i = 0; i < numfrags; i++) {
- bool placed = computePositionFromLayout();
- dagcon::Alignment aln;
- SimpleAligner align;
-
- // for each fragment align it
- abSequence *seq = abacus->getSequence(i);
- char *fragment = seq->getBases();
-
- aln.start = utgpos[i].min();
- aln.end = utgpos[i].max();
- aln.frgid = utgpos[i].ident();
- aln.qstr = string(fragment);
- aln.tstr = utg.seq.substr(aln.start, aln.end-aln.start);
-
- align.align(aln, errorRate);
- if (aln.qstr.size() == 0) {
- cnspos[i].setMinMax(0, 0);
- continue;
- }
- cnspos[i].setMinMax(aln.start, aln.end);
- dagcon::Alignment norm = normalizeGaps(aln);
-
- // not thread safe to add to graph concurrently, so lock while adding
-#pragma omp critical (graphAdd)
- ag.addAln(norm);
+ for (uint32 i=0; i<numfrags; i++) {
+ abSequence *seq = abacus->getSequence(i);
+ char *fragment = seq->getBases();
+
+ // computePositionFromLayout() does NOT work here; it needs to have abacus->numberOfColumns() updated.
+ // When the reads aren't placed in frankenstein, this function probably also just returns
+ // the original utgpos position anyway.
+ //
+ //computePositionFromLayout();
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "generatePBDAG()-- align read %u (%u/%u) at %u-%u\n",
+ seq->gkpIdent(), i, numfrags, utgpos[i].min(), utgpos[i].max());
+
+#if 0
+ char N[FILENAME_MAX];
+ sprintf(N, "read-%03d.fasta", i, seq->gkpIdent());
+ FILE *F = fopen(N, "w");
+ fprintf(F, ">read%d pos %d %d\n%s\n", seq->gkpIdent(), utgpos[i].min(), utgpos[i].max(), fragment);
+ fclose(F);
+#endif
+
+ dagcon::Alignment aln;
+
+ aln.start = utgpos[i].min();
+ aln.end = utgpos[i].max();
+ aln.frgid = utgpos[i].ident();
+ aln.qstr = string(fragment);
+ aln.tstr = utg.seq.substr(aln.start, aln.end-aln.start);
+
+ NDalignment::NDalignResult ndaln;
+
+ uint32 aLen = aln.qstr.size();
+ uint32 bLen = aln.tstr.size();
+
+ uint32 bandTolerance = 150;
+ bool aligned = NDalignment::align(aln.qstr.c_str(), aln.qstr.size(),
+ aln.tstr.c_str(), aln.tstr.size(),
+ bandTolerance,
+ true,
+ ndaln);
+
+ while ((aligned == false) && (bandTolerance < errorRate * (aLen + bLen))) {
+ bandTolerance *= 4;
+ fprintf(stderr, "generatePBDAG()-- retry with bandTolerance = %d\n",
+ bandTolerance);
+ aligned = NDalignment::align(aln.qstr.c_str(), aln.qstr.size(),
+ aln.tstr.c_str(), aln.tstr.size(),
+ bandTolerance,
+ true,
+ ndaln);
+
+ }
+
+ double errorRateAln = (ndaln._size > 0) ? ((double)ndaln._dist / ndaln._size) : 1.0;
+
+ if ((aligned == true) && (errorRateAln > errorRate)) {
+ fprintf(stderr, "generatePBDAG()-- error rate too high distance=%5d size=%5d, %f > %f\n",
+ ndaln._dist, ndaln._size, errorRateAln, errorRate);
+ aligned = false;
}
- // merge the nodes and call consensus
- ag.mergeNodes();
- std::string cns = ag.consensus(1);
- // save consensus
- resizeArrayPair(tig->_gappedBases, tig->_gappedQuals, 0, tig->_gappedMax, (uint32) cns.length() + 1, resizeArray_doNothing);
- std::string::size_type len = 0;
- for(len = 0; len < cns.size(); len++) {
- tig->_gappedBases[len] = cns[len];
- tig->_gappedQuals[len] = CNS_MIN_QV;
+ if (aligned == false) {
+ aln.start = aln.end = 0;
+ aln.qstr = std::string();
+ aln.tstr = std::string();
+
+ fprintf(stderr, "generatePBDAG()-- failed to align read #%u id %u at position %u-%u.\n",
+ i, utgpos[i].ident(), utgpos[i].min(), utgpos[i].max());
+
+ cnspos[i].setMinMax(0, 0);
+
+ continue;
}
- // Terminate the string.
- tig->_gappedBases[len] = 0;
- tig->_gappedQuals[len] = 0;
- tig->_gappedLen = len;
- tig->_layoutLen = len;
-
- assert(len < tig->_gappedMax);
- return true;
+
+
+ fprintf(stderr, "generatePBDAG()-- aligned distance=%5d size=%5d, %f < %f\n",
+ ndaln._dist, ndaln._size,
+ (double) ndaln._dist / ndaln._size,
+ errorRate);
+
+ aln.start += ndaln._tgt_bgn;
+ aln.end = aln.start + ndaln._tgt_end;
+ aln.start++;
+ aln.qstr = std::string(ndaln._qry_aln_str);
+ aln.tstr = std::string(ndaln._tgt_aln_str);
+
+ assert(aln.qstr.length() == aln.tstr.length());
+
+ cnspos[i].setMinMax(aln.start, aln.end);
+
+ dagcon::Alignment norm = normalizeGaps(aln);
+
+#pragma omp critical (graphAdd)
+ ag.addAln(norm); // NOT thread safe!
+ }
+
+ // Merge the nodes and call consensus
+
+ ag.mergeNodes();
+
+ std::string cns = ag.consensus(1);
+
+ // Save consensus
+
+ resizeArrayPair(tig->_gappedBases, tig->_gappedQuals, 0, tig->_gappedMax, (uint32) cns.length() + 1, resizeArray_doNothing);
+
+ std::string::size_type len = 0;
+
+ for (len=0; len<cns.size(); len++) {
+ tig->_gappedBases[len] = cns[len];
+ tig->_gappedQuals[len] = CNS_MIN_QV;
+ }
+
+ // Terminate the string.
+
+ tig->_gappedBases[len] = 0;
+ tig->_gappedQuals[len] = 0;
+ tig->_gappedLen = len;
+ tig->_layoutLen = len;
+
+ assert(len < tig->_gappedMax);
+
+ return(true);
}
@@ -376,13 +486,15 @@ unitigConsensus::generateQuick(tgTig *tig_,
reportStartingWork();
piid = -1;
+
bool placed = computePositionFromLayout();
- gkRead *read = gkpStore->gkStore_getRead(utgpos[tiid].ident());
- uint32 readLen = read->gkRead_sequenceLength();
+ abSequence *seq = abacus->getSequence(utgpos[tiid].ident());
+ char *fragment = seq->getBases();
+ uint32 readLen = seq->length();
- uint32 start = cnspos[tiid].min();
- uint32 end = cnspos[tiid].max();
+ uint32 start = cnspos[tiid].min();
+ uint32 end = cnspos[tiid].max();
// if we couldn't place the read, fall back to utg positions
if (placed == false) {
diff --git a/src/utgcns/libpbutgcns/SimpleAligner.C b/src/utgcns/libpbutgcns/SimpleAligner.C
deleted file mode 100644
index 5b73724..0000000
--- a/src/utgcns/libpbutgcns/SimpleAligner.C
+++ /dev/null
@@ -1,59 +0,0 @@
-
-/******************************************************************************
- *
- * This file is part of canu, a software program that assembles whole-genome
- * sequencing reads into contigs.
- *
- * This software is based on:
- * 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
- * the 'kmer package' (http://kmer.sourceforge.net)
- * both originally distributed by Applera Corporation under the GNU General
- * Public License, version 2.
- *
- * Canu branched from Celera Assembler at its revision 4587.
- * Canu branched from the kmer project at its revision 1994.
- *
- * Modifications by:
- *
- * Sergey Koren beginning on 2015-DEC-28
- * are a 'United States Government Work', and
- * are released in the public domain
- *
- * File 'README.licenses' in the root directory of this distribution contains
- * full conditions and disclaimers for each license.
- */
-
-#include <vector>
-#include <stdint.h>
-#include <cstring>
-#include <string>
-#include <algorithm>
-#include "Alignment.H"
-#include "SimpleAligner.H"
-#include "assert.h"
-
-SimpleAligner::SimpleAligner() {
-}
-
-void SimpleAligner::align(dagcon::Alignment &aln, double errorRate) {
- NDalignment::NDalignResult ndaln;
- bool aligned = NDalignment::align(aln.qstr.c_str(), aln.qstr.size(), aln.tstr.c_str(), aln.tstr.size(), 150, true, ndaln);
-
- if (((double) ndaln._dist / (double) ndaln._size) > errorRate) {
- aligned = false;
- }
-
- if (aligned) {
- aln.start += ndaln._tgt_bgn;
- aln.end = aln.start + ndaln._tgt_end;
- aln.start++;
- aln.qstr = std::string(ndaln._qry_aln_str);
- aln.tstr = std::string(ndaln._tgt_aln_str);
- } else {
- aln.start = aln.end = 0;
- aln.qstr = std::string();
- aln.tstr = std::string();
- }
-
- assert(aln.qstr.length() == aln.tstr.length());
-}
diff --git a/src/utgcns/stashContains.C b/src/utgcns/stashContains.C
index 8d0fd6c..7aa0fd0 100644
--- a/src/utgcns/stashContains.C
+++ b/src/utgcns/stashContains.C
@@ -184,27 +184,26 @@ unstashContains(tgTig *tig,
if (saved == NULL)
return;
- uint32 oldMax = 0;
- uint32 newMax = 0;
-
// For fragments not involved in the consensus computation, we'll scale their position linearly
// from the old max to the new max.
//
// We probably should do an alignment to the consensus sequence to find the true location, but
// that's (a) expensive and (b) likely overkill for these unitigs.
- // Find the oldMax
+ uint32 oldMax = 0;
+ uint32 newMax = 0;
+ double sf = 1.0;
+
for (uint32 fi=0, ci=0; fi<saved->childrenLen; fi++)
if (oldMax < saved->children[fi].max())
oldMax = saved->children[fi].max();
- // Find the newMax
- // We could have just done: newMax = tig->gappedLength();
for (uint32 fi=0, ci=0; fi<tig->numberOfChildren(); fi++)
if (newMax < tig->getChild(fi)->max())
newMax = tig->getChild(fi)->max();
- double sf = (double)newMax / oldMax;
+ if (oldMax > 0)
+ sf = (double)newMax / oldMax;
// First, we need a map from the child id to the location in the current tig
@@ -238,7 +237,7 @@ unstashContains(tgTig *tig,
}
if (idmap.empty() == false)
- fprintf(stderr, "Failed to unstash the contained reads. Still have "F_SIZE_T" reads unplaced.\n",
+ fprintf(stderr, "Failed to unstash the contained reads. Still have " F_SIZE_T " reads unplaced.\n",
idmap.size());
assert(idmap.empty() == true);
diff --git a/src/utgcns/stashContains.H b/src/utgcns/stashContains.H
index 8ae97be..d40a11b 100644
--- a/src/utgcns/stashContains.H
+++ b/src/utgcns/stashContains.H
@@ -76,14 +76,14 @@ public:
};
void reportDetected(FILE *out, uint32 id) {
- fprintf(out, " unitig %d detected "F_S32" contains (%.2fx, %.2f%%) "F_S32" dovetail (%.2fx, %.2f%%)\n",
+ fprintf(out, " unitig %d detected " F_S32 " contains (%.2fx, %.2f%%) " F_S32 " dovetail (%.2fx, %.2f%%)\n",
id,
numContains, covContain, percContain,
numDovetails, covDovetail, percDovetail);
};
void reportRemoved(FILE *out, uint32 id) {
- fprintf(out, " unitig %d removing "F_S32" (%.2fx) contained reads; processing only "F_S32" contained (%.2fx) and "F_S32" dovetail (%.2fx) reads\n",
+ fprintf(out, " unitig %d removing " F_S32 " (%.2fx) contained reads; processing only " F_S32 " contained (%.2fx) and " F_S32 " dovetail (%.2fx) reads\n",
id,
numContainsRemoved, covContainsRemoved,
numContainsSaved, covContainsSaved,
diff --git a/src/utgcns/utgcns.C b/src/utgcns/utgcns.C
index b20a371..7c37214 100644
--- a/src/utgcns/utgcns.C
+++ b/src/utgcns/utgcns.C
@@ -49,7 +49,9 @@
#include "unitigConsensus.H"
+#ifndef BROKEN_CLANG_OpenMP
#include <omp.h>
+#endif
#include <map>
#include <algorithm>
@@ -186,6 +188,11 @@ main (int argc, char **argv) {
arg++;
}
+ if ((gkpName == NULL) && (tigName != NULL)) {
+ gkpName = new char [FILENAME_MAX];
+ snprintf(gkpName, FILENAME_MAX, "%s/partitionedReads.gkpStore", tigName);
+ }
+
if ((gkpName == NULL) && (inPackageName == NULL))
err++;
@@ -342,10 +349,10 @@ main (int argc, char **argv) {
// Report some sizes.
- fprintf(stderr, "sizeof(abBead) "F_SIZE_T"\n", sizeof(abBead));
- fprintf(stderr, "sizeof(abColumn) "F_SIZE_T"\n", sizeof(abColumn));
- fprintf(stderr, "sizeof(abAbacus) "F_SIZE_T"\n", sizeof(abAbacus));
- fprintf(stderr, "sizeof(abSequence) "F_SIZE_T"\n", sizeof(abSequence));
+ fprintf(stderr, "sizeof(abBead) " F_SIZE_T "\n", sizeof(abBead));
+ fprintf(stderr, "sizeof(abColumn) " F_SIZE_T "\n", sizeof(abColumn));
+ fprintf(stderr, "sizeof(abAbacus) " F_SIZE_T "\n", sizeof(abAbacus));
+ fprintf(stderr, "sizeof(abSequence) " F_SIZE_T "\n", sizeof(abSequence));
// Decide on what to compute. Either all unitigs, or a single unitig, or a special case test.
@@ -365,12 +372,12 @@ main (int argc, char **argv) {
e = utgEnd;
}
- fprintf(stderr, "-- Computing unitig consensus for b="F_U32" to e="F_U32" with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
+ fprintf(stderr, "-- Computing unitig consensus for b=" F_U32 " to e=" F_U32 " with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
b, e, errorRate, errorRateMax, minOverlap);
}
else {
- fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
+ fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap " F_U32 "\n",
errorRate, errorRateMax, minOverlap);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/canu.git
More information about the debian-med-commit
mailing list