[med-svn] [Git][med-team/canu][master] 7 commits: New upstream version 1.7.1+dfsg

Andreas Tille gitlab at salsa.debian.org
Tue Jul 17 11:42:22 BST 2018


Andreas Tille pushed to branch master at Debian Med / canu


Commits:
608e890a by Andreas Tille at 2018-07-17T12:31:54+02:00
New upstream version 1.7.1+dfsg
- - - - -
4c2ad7a8 by Andreas Tille at 2018-07-17T12:32:01+02:00
Update upstream source from tag 'upstream/1.7.1+dfsg'

Update to upstream version '1.7.1+dfsg'
with Debian dir 2ab1c0abd9cb9dd4a0aed98f8f528d7620dcf3d1
- - - - -
9af8864d by Andreas Tille at 2018-07-17T12:32:01+02:00
New upstream version

- - - - -
5ca91b4a by Andreas Tille at 2018-07-17T12:32:01+02:00
debhelper 11

- - - - -
558946eb by Andreas Tille at 2018-07-17T12:34:49+02:00
Point Vcs fields to salsa.debian.org

- - - - -
23f8e224 by Andreas Tille at 2018-07-17T12:34:49+02:00
Standards-Version: 4.1.5

- - - - -
b0bd43c0 by Andreas Tille at 2018-07-17T12:39:47+02:00
Upload to unstable

- - - - -


22 changed files:

- debian/changelog
- debian/compat
- debian/control
- src/AS_UTL/stddev.H
- src/bogart/AS_BAT_BestOverlapGraph.C
- src/bogart/AS_BAT_BestOverlapGraph.H
- src/bogart/AS_BAT_DropDeadEnds.C
- src/bogart/AS_BAT_MarkRepeatReads.C
- src/bogart/AS_BAT_MergeOrphans.C
- src/bogart/AS_BAT_OptimizePositions.C
- src/bogart/AS_BAT_OverlapCache.C
- src/bogart/AS_BAT_PlaceContains.C
- src/bogart/AS_BAT_PopulateUnitig.C
- src/bogart/AS_BAT_Unitig.H
- src/canu_version_update.pl
- src/overlapErrorAdjustment/findErrors-Read_Frags.C
- src/overlapErrorAdjustment/findErrors-Read_Olaps.C
- src/overlapErrorAdjustment/findErrors.C
- src/overlapErrorAdjustment/findErrors.H
- src/pipelines/canu/Defaults.pm
- src/pipelines/canu/OverlapErrorAdjustment.pm
- src/stores/ovStore.C


Changes:

=====================================
debian/changelog
=====================================
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,13 @@
+canu (1.7.1+dfsg-1) unstable; urgency=medium
+
+  * Team upload.
+  * New upstream version
+  * debhelper 11
+  * Point Vcs fields to salsa.debian.org
+  * Standards-Version: 4.1.5
+
+ -- Andreas Tille <tille at debian.org>  Tue, 17 Jul 2018 12:34:50 +0200
+
 canu (1.7+dfsg-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/compat
=====================================
--- a/debian/compat
+++ b/debian/compat
@@ -1 +1 @@
-10
+11


=====================================
debian/control
=====================================
--- a/debian/control
+++ b/debian/control
@@ -1,32 +1,29 @@
 Source: canu
-Section: science
-Priority: optional
 Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
 Uploaders: Afif Elghraoui <afif at debian.org>
-Build-Depends:
-	debhelper (>= 10),
-	libboost-dev,
-	libmeryl-dev,
+Section: science
+Priority: optional
+Build-Depends: debhelper (>= 11~),
+               libboost-dev,
+               libmeryl-dev,
 # For File::Path
-	libfilesys-df-perl,
-	mhap (>= 2.1.3)
-Standards-Version: 4.1.3
+               libfilesys-df-perl,
+               mhap (>= 2.1.3)
+Standards-Version: 4.1.5
+Vcs-Browser: https://salsa.debian.org/med-team/canu
+Vcs-Git: https://salsa.debian.org/med-team/canu.git
 Homepage: http://canu.readthedocs.org/en/latest/
-Vcs-Git: https://anonscm.debian.org/git/debian-med/canu.git
-Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/canu.git
 
 Package: canu
 Architecture: any
-Depends:
-	${shlibs:Depends},
-	${misc:Depends},
-	${perl:Depends},
-	libfilesys-df-perl,
-	mhap (>= 2.1.3),
-	gnuplot,
-Suggests:
-	pbgenomicconsensus,
-	nanopolish,
+Depends: ${shlibs:Depends},
+         ${misc:Depends},
+         ${perl:Depends},
+         libfilesys-df-perl,
+         mhap (>= 2.1.3),
+         gnuplot
+Suggests: pbgenomicconsensus,
+          nanopolish
 Description: single molecule sequence assembler for genomes
  Canu is a fork of the Celera Assembler, designed for high-noise
  single-molecule sequencing (such as the PacBio RS II or Oxford


=====================================
src/AS_UTL/stddev.H
=====================================
--- a/src/AS_UTL/stddev.H
+++ b/src/AS_UTL/stddev.H
@@ -286,7 +286,7 @@ computeExponentialMovingAverage(TT alpha, TT ema, TT value) {
 
 
 
-
+#if 0
 
 template<typename TT>
 class genericStatistics {
@@ -338,12 +338,12 @@ public:
 
   vector<uint64>    &histogram(void) {    //  Returns pointer to private histogram data
     finalizeData();
-    return(&_histogram);
+    return(_histogram);
   };
 
   vector<uint64>    &Nstatistics(void) {  //  Returns pointer to private N data
     finalizeData();
-    return(&_Nstatistics);
+    return(_Nstatistics);
   };
 
   void               finalizeData(void) {
@@ -375,7 +375,7 @@ private:
   vector<uint64>  _Nstatistics;
 };
 
-
+#endif
 
 
 
@@ -422,12 +422,12 @@ public:
 #if 0
   vector<uint64>    &histogram(void) {    //  Returns pointer to private histogram data
     finalizeData();
-    return(&_histogram);
+    return(_histogram);
   };
 
   vector<uint64>    &Nstatistics(void) {  //  Returns pointer to private N data
     finalizeData();
-    return(&_Nstatistics);
+    return(_Nstatistics);
   };
 #endif
 


=====================================
src/bogart/AS_BAT_BestOverlapGraph.C
=====================================
--- a/src/bogart/AS_BAT_BestOverlapGraph.C
+++ b/src/bogart/AS_BAT_BestOverlapGraph.C
@@ -130,6 +130,27 @@ BestOverlapGraph::removeHighErrorBestEdges(void) {
 
     if (b5->readId() != 0)   edgeStats.insert(erates[eratesLen++] = b5->erate());
     if (b3->readId() != 0)   edgeStats.insert(erates[eratesLen++] = b3->erate());
+
+    //  If there are NO best edges, find the overlap with the most matches and use that.
+
+    if ((b5->readId() == 0) &&
+        (b3->readId() == 0)) {
+      uint32      no    = 0;
+      BAToverlap *ovl   = OC->getOverlaps(fi, no);
+      uint32      bestM = 0;
+      double      bestE = 0.0;
+
+      for (uint32 oo=0; oo<no; oo++) {
+        double  matches = (1 - ovl[oo].erate()) * RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
+        if (bestM < matches) {
+          bestM = matches;
+          bestE = ovl[oo].erate();
+        }
+      }
+
+      if (no > 0)
+        edgeStats.insert(erates[eratesLen++] = bestE);
+    }
   }
 
   _mean   = edgeStats.mean();
@@ -357,6 +378,53 @@ BestOverlapGraph::removeSpurs(const char *prefix) {
 
 
 
+//  Mark zombie masters.  Any read that has only contained overlaps (it is the container) and is the
+//  smallest ID of those with no hangs, is a master.  These get promoted to unitigs.
+//
+void
+BestOverlapGraph::findZombies(const char *prefix) {
+  uint32  fiLimit    = RI->numReads();
+  uint32  numThreads = omp_get_max_threads();
+  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
+
+#pragma omp parallel for schedule(dynamic, blockSize)
+  for (uint32 fi=1; fi <= fiLimit; fi++) {
+    uint32      no  = 0;
+    BAToverlap *ovl = OC->getOverlaps(fi, no);
+    uint32      nc = 0;
+
+    if (no == 0)
+      continue;
+
+    for (uint32 ii=0; ii<no; ii++, nc++)       //  If any overlap makes A not
+      if (ovl[ii].AisContainer() == false)     //  a container, it's not a zombie
+        break;
+
+    if (nc < no)
+      continue;
+
+    nc = UINT32_MAX;
+
+    for (uint32 ii=0; ii<no; ii++)             //  Find the smallest ID
+      if ((ovl[ii].a_hang == 0) &&             //  with no hangs.
+          (ovl[ii].b_hang == 0) &&
+          (ovl[ii].b_iid < nc))
+        nc = ovl[ii].b_iid;
+
+    if (fi < nc) {                             //  If we're smaller, we're a
+#pragma omp critical (suspInsert)              //  Zombie Master!
+      {
+        writeLog("read %u is a zombie.\n", fi);
+        _zombie.insert(fi);
+      }
+    }
+  }
+
+  writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " zombie reads.\n", _zombie.size());
+}
+
+
+
 void
 BestOverlapGraph::findEdges(void) {
   uint32  fiLimit    = RI->numReads();
@@ -513,6 +581,8 @@ BestOverlapGraph::BestOverlapGraph(double        erateGraph,
     findEdges();
   }
 
+  findZombies(prefix);
+
   reportBestEdges(prefix, logFileFlagSet(LOG_ALL_BEST_EDGES) ? "best.3.final" : "best");
 
   //  One more pass, to find any ambiguous best edges.


=====================================
src/bogart/AS_BAT_BestOverlapGraph.H
=====================================
--- a/src/bogart/AS_BAT_BestOverlapGraph.H
+++ b/src/bogart/AS_BAT_BestOverlapGraph.H
@@ -183,6 +183,7 @@ private:
   void   removeSuspicious(const char *prefix);
   void   removeSpurs(const char *prefix);
   void   removeLopsidedEdges(const char *prefix);
+  void   findZombies(const char *prefix);
 
   void   findEdges(void);
 
@@ -240,6 +241,10 @@ public:
     return(_suspicious.count(readid) > 0);
   };
 
+  bool isZombie(const uint32 readid) {
+    return(_zombie.count(readid) > 0);
+  };
+
   void      reportEdgeStatistics(const char *prefix, const char *label);
   void      reportBestEdges(const char *prefix, const char *label);
 
@@ -283,6 +288,7 @@ private:
   set<uint32>                _suspicious;
   set<uint32>                _singleton;
   set<uint32>                _spur;
+  set<uint32>                _zombie;
 
   map<uint32, BestOverlaps>  _bestM;
   map<uint32, BestScores>    _scorM;


=====================================
src/bogart/AS_BAT_DropDeadEnds.C
=====================================
--- a/src/bogart/AS_BAT_DropDeadEnds.C
+++ b/src/bogart/AS_BAT_DropDeadEnds.C
@@ -133,17 +133,16 @@ findPrevRead(Unitig *tig,
 
 uint32
 dropDeadFirstRead(AssemblyGraph *AG,
-                  Unitig        *tig) {
+                  Unitig        *tig,
+                  bool           isForward) {
 
   ufNode *fn = tig->firstRead();
   ufNode *sn = findNextRead(tig, fn);
 
    //  No next read, keep fn in the tig.
 
-  if (sn == NULL) {
-    writeLog("dropDead()- read %8u no sn\n", fn->ident);
+  if (sn == NULL)
     return(0);
-  }
 
   //  Over all edges from the first read, look for any edge to something else.
   //
@@ -153,24 +152,36 @@ dropDeadFirstRead(AssemblyGraph *AG,
   //  the tig.  We assume that this is always the first read, which is OK, because the function name
   //  says so.  Any edge to anywhere means the read is good and should be kept.
 
+  if (AG->getForward(fn->ident).size() == 0)
+    writeLog("dropDead()-- (%s) 1st read %8u has no edges\n", (isForward) ? "fwd" : "rev", fn->ident);
+
   for (uint32 pp=0; pp<AG->getForward(fn->ident).size(); pp++) {
     BestPlacement  &pf = AG->getForward(fn->ident)[pp];
 
-    writeLog("dropDead()-- 1st read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
-             fn->ident,
-             fn->position.isForward() ? "->" : "<-",
-             pp, AG->getForward(fn->ident).size(),
-             pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
 
     if (pf.bestC.b_iid > 0) {
+      writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - contained in %u - confirmed\n",
+               (isForward) ? "fwd" : "rev",
+               fn->ident,
+               fn->position.isForward() ? "->" : "<-",
+               pp, AG->getForward(fn->ident).size(),
+               pf.bestC.b_iid);
       return(0);
     }
 
     if (((fn->position.isForward() == true)  && (pf.best5.b_iid != 0)) ||
-        ((fn->position.isForward() == false) && (pf.best3.b_iid != 0)))
+        ((fn->position.isForward() == false) && (pf.best3.b_iid != 0))) {
+      writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - 5:%u 3:%u - confirmed\n",
+               (isForward) ? "fwd" : "rev",
+               fn->ident,
+               fn->position.isForward() ? "->" : "<-",
+               pp, AG->getForward(fn->ident).size(),
+               pf.best5.b_iid, pf.best3.b_iid);
       return(0);
+    }
   }
 
+
   //  But no edge means we need to check the second read.  If it has an edge, then we infer the
   //  first read is bogus and should be removed.  If it also has no edge (except to the first read,
   //  duh) then we know nothing: this could be novel sequence or it could be the same garbage that
@@ -180,26 +191,41 @@ dropDeadFirstRead(AssemblyGraph *AG,
   //  first read.  Well, and that if the second read has an edge we declare the first read to be
   //  junk.  That's also a bit of a difference from the previous loop.
 
+  if (AG->getForward(sn->ident).size() == 0) {
+    writeLog("dropDead()-- (%s) 2nd read %8u has no edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
+    return(0);
+  }
+
   for (uint32 pp=0; pp<AG->getForward(sn->ident).size(); pp++) {
     BestPlacement  &pf = AG->getForward(sn->ident)[pp];
 
-    writeLog("dropDead()-- 2nd read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
-             sn->ident,
-             sn->position.isForward() ? "->" : "<-",
-             pp, AG->getForward(sn->ident).size(),
-             pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
-
-    if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident))
+    if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident)) {
+      writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - contained in %u - drop first\n",
+               (isForward) ? "fwd" : "rev",
+               sn->ident,
+               sn->position.isForward() ? "->" : "<-",
+               pp, AG->getForward(sn->ident).size(),
+               pf.bestC.b_iid);
       return(fn->ident);
+    }
 
     if (((sn->position.isForward() == true)  && (pf.best5.b_iid != 0) && (pf.best5.b_iid != fn->ident)) ||
-        ((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident)))
+        ((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident))) {
+      writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - 5:%u 3:8u - drop first\n",
+               (isForward) ? "fwd" : "rev",
+               sn->ident,
+               sn->position.isForward() ? "->" : "<-",
+               pp, AG->getForward(sn->ident).size(),
+               pf.best5.b_iid, pf.best3.b_iid);
       return(fn->ident);
+    }
   }
 
   //  Otherwise, the second read had only edges to the first read, and we should keep the first
   //  read.
 
+  writeLog("dropDead()-- (%s) 2nd read %8u has no useful external edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
+
   return(0);
 }
 
@@ -222,20 +248,32 @@ dropDeadEnds(AssemblyGraph  *AG,
         (tig->_isUnassembled == true))
       continue;
 
-    uint32  fn = dropDeadFirstRead(AG, tig);        //  Decide if the first read is junk.
+    writeLog("\n");
+    writeLog("dropDead()-- testing tig %u length %u\n", ti, tig->getLength());
+
+    uint32  fn = dropDeadFirstRead(AG, tig, true);  //  Decide if the first read is junk.
+
+    //fprintf(stderr, "drop dead for tig %u (flipped)\n", ti);
 
     tig->reverseComplement();                       //  Flip.
-    uint32  ln = dropDeadFirstRead(AG, tig);        //  Decide if the last (now first) read is junk.
+    uint32  ln = dropDeadFirstRead(AG, tig, false); //  Decide if the last (now first) read is junk.
     tig->reverseComplement();                       //  Flip back.
 
-    if ((fn == 0) && (ln == 0))                     //  Nothing to remove, just get out of here.
+    if ((fn == 0) && (ln == 0)) {                   //  Nothing to remove, nothing to do.
+      writeLog("dropDead()-- both ends confirmed\n");
       continue;
+    }
+
+    if (fn == ln) {                                 //  The same thing to remove, ignore it.
+      writeLog("dropDead()-- retaining spanning read %u\n", fn);
+      continue;
+    }
 
     //  At least one read needs to be kicked out.  Make new tigs for everything.
 
-    char   fnMsg[80] = {0};   Unitig  *fnTig = NULL;
-    char   nnMsg[80] = {0};   Unitig  *nnTig = NULL;  int32  nnOff = INT32_MAX;
-    char   lnMsg[80] = {0};   Unitig  *lnTig = NULL;
+    Unitig  *fnTig = NULL;
+    Unitig  *nnTig = NULL;  int32  nnOff = INT32_MAX;
+    Unitig  *lnTig = NULL;
 
     if (fn > 0)
       fnTig = tigs.newUnitig(false);
@@ -256,32 +294,26 @@ dropDeadEnds(AssemblyGraph  *AG,
 
     //  Move reads to their new unitig.
 
-    strcpy(fnMsg, "                                      ");
-    strcpy(nnMsg, "                          ");
-    strcpy(lnMsg, "");
-
     for (uint32 cc=0, tt=0; tt<tig->ufpath.size(); tt++) {
       ufNode  &read = tig->ufpath[tt];
 
       if        (read.ident == fn) {
-        sprintf(fnMsg, "first read %9u to tig %7u --", read.ident, fnTig->id());
+        writeLog("dropDead()-- tig %u gets first read %u\n", fnTig->id(), read.ident);
         fnTig->addRead(read, -read.position.min(), false);
 
       } else if (read.ident == ln) {
-        sprintf(lnMsg, "-- last read %9u to tig %7u", read.ident, lnTig->id());
+        writeLog("dropDead()-- tig %u gets last read %u\n", lnTig->id(), read.ident);
         lnTig->addRead(read, -read.position.min(), false);
 
       } else {
         if (nnOff == INT32_MAX) {
-          sprintf(nnMsg, "other reads to tig %7u", nnTig->id());
+          writeLog("dropDead()-- tig %u gets all other reads\n", nnTig->id());
           nnOff = read.position.min();
         }
         nnTig->addRead(read, -nnOff, false);
       }
     }
 
-    writeLog("dropDeadEnds()-- tig %7u --> %s %s %s\n", tig->id(), fnMsg, nnMsg, lnMsg);
-
     if (fnTig)  fnTig->cleanUp();   //  Probably not neeeded, but cheap.
     if (lnTig)  lnTig->cleanUp();   //  Probably not neeeded, but cheap.
     if (nnTig)  nnTig->cleanUp();   //  Most likely needed.
@@ -292,6 +324,6 @@ dropDeadEnds(AssemblyGraph  *AG,
     tigs[ti] = NULL;
   }
 
-  writeStatus("dropDeadEnds()-- Modified %u tigs.  Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
+  writeStatus("dropDead()-- Modified %u tigs.  Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
               numT, numF, numL, numB, (numB == 1) ? "" : "s");
 }


=====================================
src/bogart/AS_BAT_MarkRepeatReads.C
=====================================
--- a/src/bogart/AS_BAT_MarkRepeatReads.C
+++ b/src/bogart/AS_BAT_MarkRepeatReads.C
@@ -702,12 +702,15 @@ findConfusedEdges(TigVector            &tigs,
         uint32   tgBid    = tigs.inUnitig(rdBid);
 
         //  If the read is in a singleton, skip.  These are unassembled crud.
-        if ((tgBid                         == 0) ||
+        if ((tgBid                      == 0) ||
             (tigs[tgBid]                == NULL) ||
             (tigs[tgBid]->ufpath.size() == 1))
           continue;
 
         //  Skip if this overlap is the best we're trying to match.
+        //
+        //  NOTE.  This doesn't care about potential duplicate overlaps between a pair of reads,
+        //  as we're looking for thicker overlaps off only one end of the read.
         if ((rdBid == b5->readId()) ||
             (rdBid == b3->readId()))
           continue;


=====================================
src/bogart/AS_BAT_MergeOrphans.C
=====================================
--- a/src/bogart/AS_BAT_MergeOrphans.C
+++ b/src/bogart/AS_BAT_MergeOrphans.C
@@ -73,8 +73,15 @@ typedef  map<uint32, vector<uint32> >  BubTargetList;
 
 
 
-//  Decide which tigs can be orphans.  Any unitig where (nearly) every dovetail read has an overlap
-//  to some other unitig is a candidate for orphan popping.
+//  Decide which tigs can be orphans.  Any unitig where (nearly) every dovetail
+//  read has an overlap to some other unitig is a candidate for orphan popping.
+//
+//  Counts the number of reads that have an overlap to some other tig
+//  (tigOlapsTo).  if more than half the reads in the tig have an overlap to
+//  some other tig, it is a potential place to pop the bubble.
+//
+//  Returns BubTargetList, a map of uint32 to vector<uint32>, of the potential
+//  places that some tig could be popped into.
 
 void
 findPotentialOrphans(TigVector       &tigs,
@@ -617,13 +624,16 @@ mergeOrphans(TigVector &tigs,
     vector<candidatePop *>    targets;
 
     for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
-      saveCorrectlySizedInitialIntervals(orphan,
-                                         tigs[it->first],     //  The targetID      in targetIntervals
-                                         it->second,          //  The interval list in targetIntervals
-                                         fReadID,
-                                         lReadID,
-                                         placed,
-                                         targets);
+      if (tigs[it->first] == NULL)
+        writeLog("mergeOrphans()-- orphan %u wants to go into nonexistent tig %u!\n", ti, it->first);
+      else
+        saveCorrectlySizedInitialIntervals(orphan,
+                                           tigs[it->first],     //  The targetID      in targetIntervals
+                                           it->second,          //  The interval list in targetIntervals
+                                           fReadID,
+                                           lReadID,
+                                           placed,
+                                           targets);
 
     targetIntervals.clear();   //  intervalList already freed.
 
@@ -657,7 +667,7 @@ mergeOrphans(TigVector &tigs,
                    targets[tt]->placed[op].frgID,
                    targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);
 
-      writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads.\n",
+      writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_U32 " reads, had %3" F_U32 " reads.\n",
                orphan->id(), orphan->getLength(),
                targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                orphanSize, targetSize);


=====================================
src/bogart/AS_BAT_OptimizePositions.C
=====================================
--- a/src/bogart/AS_BAT_OptimizePositions.C
+++ b/src/bogart/AS_BAT_OptimizePositions.C
@@ -59,6 +59,77 @@ public:
 
 
 
+//  Decide if two reads in a tig are compatible with an overlap.
+//
+//  Returns true if the reads are overlapping in the tig, and are oriented consistent with the overlap.
+//
+bool
+Unitig::optimize_isCompatible(uint32       ii,
+                              uint32       jj,
+                              BAToverlap  &olap,
+                              bool         inInit,
+                              bool         secondPass,
+                              bool         beVerbose) {
+
+  SeqInterval  &ip = ufpath[ii].position;
+  SeqInterval  &jp = ufpath[jj].position;
+
+
+  bool  isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
+
+  //  Decide if the overlap preserves the ordering in the tig.
+  //
+  //  This is of questionable value, and might be incorrect.  We can have two overlaps between
+  //  a pair of reads iff the orientation differs - the flipped flag.  That gets tested elsewhere.
+
+  bool  isOrdered = true;
+
+#if 1
+  bool  isOvlLo = ((jp.min() <= ip.min()) && (ip.min() <= jp.max()) && (jp.max() <= ip.max()));
+  bool  isOvlHi = ((ip.min() <= jp.min()) && (jp.min() <= ip.max()) && (ip.max() <= jp.max()));
+
+  //bool  isOvlLo = jp.min() < ip.min();   //  Is the above too rigorous?
+  //bool  isOvlHi = ip.min() < jp.min();
+
+  //  isOvl tells the end of the read that should have the overlap (according to the tig layout),
+  //  so check if the overlap actually is on that end.  If not, flag this as 'mis ordered'.
+
+  if (((isOvlLo == true) && (ip.isForward()) && (olap.AEndIs5prime() == false)) ||
+      ((isOvlLo == true) && (ip.isReverse()) && (olap.AEndIs3prime() == false)) ||
+      ((isOvlHi == true) && (ip.isForward()) && (olap.AEndIs3prime() == false)) ||
+      ((isOvlHi == true) && (ip.isReverse()) && (olap.AEndIs5prime() == false)))
+    isOrdered = false;
+
+  //  But if in a containment relationship, it cannot be mis ordered.
+  if ((olap.AisContained() == true) ||
+      (olap.AisContainer() == true))
+    isOrdered = true;
+#endif
+
+  bool  isOriented = true;
+
+  if (((ip.isForward() == jp.isForward()) && (olap.flipped == true)) ||
+      ((ip.isForward() != jp.isForward()) && (olap.flipped == false)))
+    isOriented = false;
+
+  if ((beVerbose) || (secondPass))
+    writeLog("optimize_%s()-- tig %7u read %9u (at %9d %9d) olap to read %9u (at %9d %9d) - hangs %7d %7d - %s %s ovlLo %d ovlHi %d contained %d container %d\n",
+             (inInit) ? "initPlace" : "recompute",
+             id(),
+             ufpath[ii].ident, ip.bgn, ip.end,
+             ufpath[jj].ident, jp.bgn, jp.end,
+             olap.a_hang, olap.b_hang,
+             (isOvl     == true) ? "overlapping" : "not-overlapping",
+             (isOrdered == true) ? "ordered"     : "mis-ordered",
+             isOvlLo, isOvlHi, olap.AisContained(), olap.AisContainer());
+
+  return((isOvl      == true) &&
+         (isOrdered  == true) &&
+         (isOriented == true));
+}
+
+
+
 void
 Unitig::optimize_initPlace(uint32        ii,
                            optPos       *op,
@@ -87,37 +158,18 @@ Unitig::optimize_initPlace(uint32        ii,
       uint32  uu  = inUnitig (jid);
       uint32  jj  = ufpathIdx(jid);
 
-      //  Probably overkill, but report ALL overlaps for the troubling reads.
-
-      if ((beVerbose) || (firstPass == false))
-        writeLog("optimize_initPlace()-- olap %u a %u b %u hangs %d %d\n", oo, iid, jid, ovl[oo].a_hang, ovl[oo].b_hang);
-
-      if (uu != id())   //  Skip if the overlap is to a different tig.
-        continue;       //  (the ufpathIdx() call is valid, but using it isn't)
-
-      //  Reads are in the same tig.  Decide if they overlap in position.
-
-      bool  isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
-
-      //  Log!  beVerbose should be true for the second pass, but just in case it isn't.
-
-      if ((beVerbose) || (firstPass == false))
-        writeLog("optimize_initPlace()-- olap %4u tig %7u read %8u (at %9d %9d) olap to read %8u (at %9d %9d) - hangs %7d %7d - %s %s\n",
-                 oo, id(),
-                 iid, ufpath[ii].position.bgn, ufpath[ii].position.end,
-                 jid, ufpath[jj].position.bgn, ufpath[jj].position.end,
-                 ovl[oo].a_hang, ovl[oo].b_hang,
-                 (isOvl == true) ? "overlapping" : "not-overlapping",
-                 (jj > ii)       ? "after"       : "before");
+      if (uu != id())   //  Skip if to a different tig.
+        continue;       //  (otherwise, ufpath[jj] is invalid below)
 
-      if (isOvl == false)            //  Skip if the reads
-        continue;                    //  don't overlap
+      //  Skip if:
+      //    this is the first pass, but the overlap is to a read after us.
+      //    the overlap isn't compatible with the layout.
 
-      if ((firstPass) && (jj > ii))  //  We're setting initial positions, so overlaps to reads after
-        continue;                    //  us aren't correct, unless we're in the 2nd pass
+      if (((firstPass) && (ii < jj)) ||
+          (optimize_isCompatible(ii, jj, ovl[oo], true, !firstPass, beVerbose) == false))
+        continue;
 
-      //  Reads overlap.  Compute the position of the read using
-      //  the overlap and the other read.
+      //  Compute the position of the read using the overlap and the other read.
 
       nmin += (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
       cnt  += 1;
@@ -176,8 +228,9 @@ Unitig::optimize_recompute(uint32        iid,
   uint32       cnt  = 0;
 
   if (beVerbose) {
-    writeLog("optimize()-- tig %8u read %8u previous  - %9.2f-%-9.2f\n", id(), iid, op[iid].min,           op[iid].max);
-    writeLog("optimize()-- tig %8u read %8u length    - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
+    writeLog("optimize()--\n");
+    writeLog("optimize()-- tig %8u read %9u previous  - %9.2f-%-9.2f\n", id(), iid, op[iid].min,           op[iid].max);
+    writeLog("optimize()-- tig %8u read %9u length    - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
   }
 
   //  Process all overlaps.
@@ -187,26 +240,34 @@ Unitig::optimize_recompute(uint32        iid,
     uint32  uu  = inUnitig (jid);
     uint32  jj  = ufpathIdx(jid);
 
-    if (uu != id())   //  Skip if the overlap is to a different tig.
-      continue;       //  (the ufpathIdx() call is valid, but using it isn't)
-
-    if (isOverlapping(ufpath[ii].position, ufpath[jj].position) == false)  //  Skip if the reads
-      continue;                                                            //  don't overlap
+    if (uu != id())   //  Skip if to a different tig.
+      continue;       //  (otherwise, ufpath[jj] is invalid below)
 
-    //  Reads overlap.  Compute the position of the read using
-    //  the overlap and the other read.
+    //  Compute the position of the read using the overlap and the other read.
 
     double tmin = (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
     double tmax = (op[iid].fwd) ? (op[jid].max - ovl[oo].b_hang) : (op[jid].max + ovl[oo].a_hang);
 
     if (beVerbose)
-      writeLog("optimize()-- tig %8u read %8u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
+      writeLog("optimize()-- tig %8u read %9u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
+
+    //  Skip if the overlap isn't compatible with the layout.
+
+    if (optimize_isCompatible(ii, jj, ovl[oo], false, false, beVerbose) == false)
+      continue;
+
+    //  Update estimate.
 
     nmin += tmin;
     nmax += tmax;
     cnt  += 1;
-  }  //  over all overlaps
+  }
 
+  if (cnt == 0) {
+    writeLog("Failed to optimize read %u in tig %u\n", iid, id());
+    fprintf(stderr, "Failed to optimize read %u in tig %u\n", iid, id());
+    flushLog();
+  }
   assert(cnt > 0);
 
   //  Add in some evidence for the bases in the read.  We want higher weight than the overlaps,
@@ -226,7 +287,7 @@ Unitig::optimize_recompute(uint32        iid,
     double dmax = 2 * (op[iid].max - np[iid].max) / (op[iid].max + np[iid].max);
     double npll = np[iid].max - np[iid].min;
 
-    writeLog("optimize()-- tig %8u read %8u           - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
+    writeLog("optimize()-- tig %8u read %9u           - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
              id(), iid,
              np[iid].min, np[iid].max,
              npll, readLen,
@@ -237,8 +298,6 @@ Unitig::optimize_recompute(uint32        iid,
 
 
 
-
-
 void
 Unitig::optimize_expand(optPos  *op) {
 
@@ -300,7 +359,7 @@ Unitig::optimize_setPositions(optPos  *op,
 
     if (op[iid].fwd) {
       if (beVerbose)
-        writeLog("optimize()-- read %8u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+        writeLog("optimize()-- read %9u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
                  iid,
                  ufpath[ii].position.bgn,
                  ufpath[ii].position.end,
@@ -315,7 +374,7 @@ Unitig::optimize_setPositions(optPos  *op,
       ufpath[ii].position.end = (int32)op[iid].max;
     } else {
       if (beVerbose)
-        writeLog("optimize()-- read %8u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+        writeLog("optimize()-- read %9u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
                  iid,
                  ufpath[ii].position.bgn,
                  ufpath[ii].position.end,
@@ -358,14 +417,15 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
   optPos *np = new optPos [fiLimit];
 
   for (uint32 fi=0; fi<fiLimit; fi++) {
-    uint32 ti = inUnitig(fi);
-    uint32 pp = ufpathIdx(fi);
+    uint32    ti = inUnitig(fi);
+    uint32    pp = ufpathIdx(fi);
+    Unitig   *tig = operator[](ti);
 
-    if (ti == 0)
+    if (tig == NULL)
       continue;
 
-    op[fi].set(operator[](ti)->ufpath[pp]);
-    np[fi].set(operator[](ti)->ufpath[pp]);
+    op[fi].set(tig->ufpath[pp]);
+    np[fi].set(tig->ufpath[pp]);
   }
 
   //  Compute initial positions using previously placed reads and the read length.
@@ -383,7 +443,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
     Unitig       *tig = operator[](ti);
     set<uint32>   failed;
 
-    if (tig == NULL)
+    if ((tig == NULL) || (tig->ufpath.size() == 1))
       continue;
 
     for (uint32 ii=0; ii<tig->ufpath.size(); ii++)
@@ -406,12 +466,13 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
 
 #pragma omp parallel for schedule(dynamic, fiBlockSize)
     for (uint32 fi=0; fi<fiLimit; fi++) {
-      uint32 ti = inUnitig(fi);
+      uint32        ti = inUnitig(fi);
+      Unitig       *tig = operator[](ti);
 
-      if (ti == 0)
+      if ((tig == NULL) || (tig->ufpath.size() == 1))
         continue;
 
-      operator[](ti)->optimize_recompute(fi, op, np, beVerbose);
+      tig->optimize_recompute(fi, op, np, beVerbose);
     }
 
     //  Reset zero
@@ -421,7 +482,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
     for (uint32 ti=0; ti<tiLimit; ti++) {
       Unitig       *tig = operator[](ti);
 
-      if (tig == NULL)
+      if ((tig == NULL) || (tig->ufpath.size() == 1))
         continue;
 
       int32  z = np[ tig->ufpath[0].ident ].min;
@@ -479,7 +540,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
   for (uint32 ti=0; ti<tiLimit; ti++) {
     Unitig       *tig = operator[](ti);
 
-    if (tig == NULL)
+    if ((tig == NULL) || (tig->ufpath.size() == 1))
       continue;
 
     tig->optimize_expand(op);
@@ -494,7 +555,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
   for (uint32 ti=0; ti<tiLimit; ti++) {
     Unitig       *tig = operator[](ti);
 
-    if (tig == NULL)
+    if ((tig == NULL) || (tig->ufpath.size() == 1))
       continue;
 
     tig->optimize_setPositions(op, beVerbose);


=====================================
src/bogart/AS_BAT_OverlapCache.C
=====================================
--- a/src/bogart/AS_BAT_OverlapCache.C
+++ b/src/bogart/AS_BAT_OverlapCache.C
@@ -344,7 +344,7 @@ uint32
 OverlapCache::filterDuplicates(uint32 &no) {
   uint32   nFiltered = 0;
 
-  for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
+  for (uint32 ii=0, jj=1, dd=0; jj<no; ii++, jj++) {
     if (_ovs[ii].b_iid != _ovs[jj].b_iid)
       continue;
 
@@ -352,22 +352,35 @@ OverlapCache::filterDuplicates(uint32 &no) {
 
     nFiltered++;
 
-    //  Drop the shorter overlap, or the one with the higher erate.
+    //  Drop the weaker overlap.  If a tie, drop the flipped one.
 
-    uint32  iilen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
-    uint32  jjlen = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang());
+    double iiSco = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang()) * _ovs[ii].erate();
+    double jjSco = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang()) * _ovs[jj].erate();
 
-    if (iilen == jjlen) {
-      if (_ovs[ii].evalue() < _ovs[jj].evalue())
-        jjlen = 0;
-      else
-        iilen = 0;
+    if (iiSco == jjSco) {             //  Hey gcc!  See how nice I was by putting brackets
+      if (_ovs[ii].flipped())         //  around this so you don't get confused by the
+        iiSco = 0;                    //  non-ambiguous ambiguous else clause?
+      else                            //
+        jjSco = 0;                    //  You're welcome.
     }
 
-    if (iilen < jjlen)
-      _ovs[ii].a_iid = _ovs[ii].b_iid = 0;
+    if (iiSco < jjSco)
+      dd = ii;
     else
-      _ovs[jj].a_iid = _ovs[jj].b_iid = 0;
+      dd = jj;
+
+#if 0
+    writeLog("OverlapCache::filterDuplicates()-- Dropping overlap A: %9" F_U64P " B: %9" F_U64P " - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
+             _ovs[dd].a_iid,
+             _ovs[dd].b_iid,
+             _ovs[dd].a_hang(),
+             _ovs[dd].b_hang(),
+             _ovs[dd].erate(),
+             _ovs[dd].flipped() ? "flipped" : "");
+#endif
+
+    _ovs[dd].a_iid = 0;
+    _ovs[dd].b_iid = 0;
   }
 
   //  If nothing was filtered, return.
@@ -616,30 +629,29 @@ OverlapCache::loadOverlaps(ovStore *ovlStore, bool doSave) {
 
 
 
+//  Binary search a list of overlaps for one matching bID and flipped.
 bool
-searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
+searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID, bool flipped) {
+  int32  F = 0;
+  int32  L = ovlLen - 1;
+  int32  M = 0;
 
 #ifdef TEST_LINEAR_SEARCH
   bool linearSearchFound = false;
 
   for (uint32 ss=0; ss<ovlLen; ss++)
-    if (ovl[ss].b_iid == bID) {
+    if ((ovl[ss].b_iid   == bID) &&
+        (ovl[ss].flipped == flipped)) {
       linearSearchFound = true;
       break;
     }
 #endif
 
-  //  If not, these are repeats and we should binary search everything.
-  //  There will be no short lists where we could exhaustively search.
-
-  int32  F = 0;
-  int32  L = ovlLen - 1;
-  int32  M = 0;
-
   while (F <= L) {
     M = (F + L) / 2;
 
-    if (ovl[M].b_iid == bID) {
+    if ((ovl[M].b_iid   == bID) &&
+        (ovl[M].flipped == flipped)) {
       ovl[M].symmetric = true;
 #ifdef TEST_LINEAR_SEARCH
       assert(linearSearchFound == true);
@@ -647,7 +659,8 @@ searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
       return(true);
     }
 
-    if (ovl[M].b_iid < bID)
+    if (((ovl[M].b_iid  < bID)) ||
+        ((ovl[M].b_iid == bID) && (ovl[M].flipped < flipped)))
       F = M+1;
     else
       L = M-1;
@@ -694,7 +707,7 @@ OverlapCache::symmetrizeOverlaps(void) {
 
       //  Search for the twin overlap, and if found, we're done.  The twin is marked as symmetric in the function.
 
-      if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr)) {
+      if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr, _overlaps[rr][oo].flipped)) {
         _overlaps[rr][oo].symmetric = true;
         continue;
       }


=====================================
src/bogart/AS_BAT_PlaceContains.C
=====================================
--- a/src/bogart/AS_BAT_PlaceContains.C
+++ b/src/bogart/AS_BAT_PlaceContains.C
@@ -65,6 +65,11 @@ breakSingletonTigs(TigVector &tigs) {
     if (utg->ufpath.size() > 1)
       continue;
 
+    if (OG->isZombie(utg->ufpath[0].ident) == true) {
+      writeLog("Not breaking sinleton zombie tig %u with read %u\n", ti, utg->ufpath[0].ident);
+      continue;
+    }
+
     tigs[ti] = NULL;                           //  Remove the tig from the list
     tigs.registerRead(utg->ufpath[0].ident);   //  Eject the read
     delete utg;                                //  Reclaim space
@@ -125,6 +130,16 @@ placeUnplacedUsingAllOverlaps(TigVector           &tigs,
 
     placeReadUsingOverlaps(tigs, NULL, fid, placements, placeRead_fullMatch);
 
+    //  If all placements are in singletons, allow them.  If any placement is to a 'real' tig,
+    //  ignore singleton placements.
+
+    bool  ignoreSingleton = false;
+
+    for (uint32 i=0; i<placements.size(); i++)
+      if ((placements[i].fCoverage >= 0.99) &&
+          (tigs[placements[i].tigID]->ufpath.size() > 1))
+        ignoreSingleton = true;
+
     //  Search the placements for the highest expected identity placement using all overlaps in the unitig.
 
     uint32   b = UINT32_MAX;
@@ -132,10 +147,11 @@ placeUnplacedUsingAllOverlaps(TigVector           &tigs,
     for (uint32 i=0; i<placements.size(); i++) {
       Unitig *tig = tigs[placements[i].tigID];
 
-      if (placements[i].fCoverage < 0.99)                   //  Ignore partially placed reads.
+      if (placements[i].fCoverage < 0.99)     //  Ignore partially placed reads.
         continue;
 
-      if (tig->ufpath.size() == 1)  //  Ignore placements in singletons.
+      if ((ignoreSingleton == true) &&
+          (tig->ufpath.size() == 1))          //  Ignore placements in singletons.
         continue;
 
       uint32  bgn   = placements[i].position.min();


=====================================
src/bogart/AS_BAT_PopulateUnitig.C
=====================================
--- a/src/bogart/AS_BAT_PopulateUnitig.C
+++ b/src/bogart/AS_BAT_PopulateUnitig.C
@@ -119,8 +119,11 @@ populateUnitig(TigVector &tigs,
                int32      fi) {
 
   if ((RI->readLength(fi) == 0) ||      //  Skip deleted
-      (tigs.inUnitig(fi) != 0) ||         //  Skip placed
-      (OG->isContained(fi) == true))    //  Skip contained
+      (tigs.inUnitig(fi) != 0))         //  Skip placed
+    return;
+
+  if ((OG->isContained(fi) == true) &&  //  Skip contained...
+      (OG->isZombie(fi) == false))      //  that aren't zombies.
     return;
 
   Unitig *utg = tigs.newUnitig(logFileFlagSet(LOG_BUILD_UNITIG));
@@ -140,6 +143,22 @@ populateUnitig(TigVector &tigs,
 
   utg->addRead(read, 0, logFileFlagSet(LOG_BUILD_UNITIG));
 
+  //  If suspicious or a zombie, don't bother trying to extend.  In the former
+  //  case, we don't want to extend, and in the latter case, there isn't anything
+  //  to extend.
+
+  if (OG->isSuspicious(fi)) {
+    writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
+            utg->ufpath.back().ident, utg->id());
+    return;
+  }
+
+  if (OG->isZombie(fi)) {
+    writeLog("Stopping unitig construction of zombie read %d in unitig %d\n",
+            utg->ufpath.back().ident, utg->id());
+    return;
+  }
+
   //  Add reads as long as there is a path to follow...from the 3' end of the first read.
 
   BestEdgeOverlap  *bestedge5 = OG->getBestEdgeOverlap(fi, false);
@@ -150,34 +169,6 @@ populateUnitig(TigVector &tigs,
   assert(bestedge3->ahang() >= 0);
   assert(bestedge3->bhang() >= 0);
 
-  //  If this read is not covered by the two best overlaps we are finished.  We will not follow
-  //  the paths out.  This indicates either low coverage, or a chimeric read.  If it is low
-  //  coverage, then the best overlaps will be mutual and we'll recover the same path.  If it is a
-  //  chimeric read the overlaps will not be mutual and we will skip this read.
-  //
-  //  The amount of our read that is covered by the two best overlaps is
-  //
-  //    (readLen + bestedge5->bhang()) + (readLen - bestedge3->ahang())
-  //
-  //  If that is not significantly longer than the read length, then we will not use this
-  //  read as a seed for unitig construction.
-  //
-
-  if (OG->isSuspicious(fi))
-    return;
-
-#if 0
-  uint32  covered = RI->readLength(fi) + bestedge5->bhang() + RI->readLength(fi) - bestedge3->ahang();
-
-  //  This breaks tigs at 0x best-coverage regions.  There might be a contain that spans (joins)
-  //  the two best overlaps to verify the read, but we can't easily tell right now.
-  if (covered < RI->readLength(fi) + AS_OVERLAP_MIN_LEN / 2) {
-    writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
-            utg->ufpath.back().ident, utg->id());
-    return;
-  }
-#endif
-
   if (logFileFlagSet(LOG_BUILD_UNITIG))
     writeLog("Adding 5' edges off of read %d in unitig %d\n",
             utg->ufpath.back().ident, utg->id());


=====================================
src/bogart/AS_BAT_Unitig.H
=====================================
--- a/src/bogart/AS_BAT_Unitig.H
+++ b/src/bogart/AS_BAT_Unitig.H
@@ -40,6 +40,7 @@
 
 #include "AS_global.H"
 #include "AS_BAT_TigVector.H"
+#include "AS_BAT_OverlapCache.H"
 
 #include "stddev.H"
 
@@ -204,6 +205,13 @@ public:
   void cleanUp(void);
 
   //  Recompute bgn/end positions using all overlaps.
+  bool optimize_isCompatible(uint32       ii,
+                             uint32       jj,
+                             BAToverlap  &olap,
+                             bool         inInit,
+                             bool         secondPass,
+                             bool         beVerbose);
+
   void optimize_initPlace(uint32        pp,
                           optPos       *op,
                           optPos       *np,


=====================================
src/canu_version_update.pl
=====================================
--- a/src/canu_version_update.pl
+++ b/src/canu_version_update.pl
@@ -32,7 +32,7 @@ my $cwd = getcwd();
 
 my $label    = "snapshot";     #  Automagically set to 'release' for releases.
 my $major    = "1";            #  Bump before release.
-my $minor    = "7";            #  Bump before release.
+my $minor    = "7.1";          #  Bump before release.
 
 my $commits  = "0";
 my $hash1    = undef;          #  This from 'git describe'


=====================================
src/overlapErrorAdjustment/findErrors-Read_Frags.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors-Read_Frags.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Frags.C
@@ -60,9 +60,6 @@ Read_Frags(feParameters   *G,
   uint64  votesLength = 0;
   uint64  readsLoaded = 0;
 
-  fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 "\n",
-          G->bgnID, G->endID);
-
   for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
     gkRead *read = gkpStore->gkStore_getRead(curID);
 
@@ -74,11 +71,7 @@ Read_Frags(feParameters   *G,
                       sizeof(Vote_Tally_t) * votesLength +
                       sizeof(Frag_Info_t)  * G->readsLen);
 
-  fprintf(stderr, "Read_Frags()-- allocate " F_U64 " MB for bases, votes and info, for %u reads of total length " F_U64 " (%.2f MB)\n",
-          totAlloc >> 20,
-          G->endID - G->bgnID + 1,
-          basesLength,
-          totAlloc / 1024.0 / 1024.0);
+  fprintf(stderr, "Read_Frags()-- Loading target reads " F_U32 " through " F_U32 " with " F_U64 " bases.\n", G->bgnID, G->endID, basesLength);
 
   G->readBases = new char          [basesLength];
   G->readVotes = new Vote_Tally_t  [votesLength];             //  NO constructor, MUST INIT
@@ -122,6 +115,6 @@ Read_Frags(feParameters   *G,
 
   delete readData;
 
-  fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 " -- loaded " F_U64 " bases in " F_U64 " reads.\n",
-          G->bgnID, G->endID-1, basesLength, readsLoaded);
+  fprintf(stderr, "Read_Frags()-- %.3f GB for bases, votes and info.\n", totAlloc / 1024.0 / 1024.0 / 1024.0);
+  fprintf(stderr, "\n");
 }


=====================================
src/overlapErrorAdjustment/findErrors-Read_Olaps.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
@@ -41,8 +41,7 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
 
   uint64 numolaps = ovs->numOverlapsInRange();
 
-  fprintf(stderr, "Read_Olaps()-- loading " F_U64 " overlaps (%.2f MB).\n",
-          numolaps, sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0);
+  fprintf(stderr, "Read_Olaps()-- Loading " F_U64 " overlaps.\n", numolaps);
 
   G->olaps    = new Olap_Info_t [numolaps];
   G->olapsLen = 0;
@@ -68,6 +67,9 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
   }
 
   delete ovs;
+
+  fprintf(stderr, "Read_Olaps()-- %.3f GB for overlaps..\n", sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0 / 1024.0);
+  fprintf(stderr, "\n");
 }
 
 


=====================================
src/overlapErrorAdjustment/findErrors.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors.C
+++ b/src/overlapErrorAdjustment/findErrors.C
@@ -51,23 +51,20 @@ Output_Corrections(feParameters *G);
 
 
 
-//  From overlapInCore.C
-int
-Binomial_Bound(int e, double p, int Start, double Limit);
-
-
-
 //  Read fragments lo_frag..hi_frag (INCLUSIVE) from store and save the ids and sequences of those
 //  with overlaps to fragments in global Frag .
 
 static
 void
-Extract_Needed_Frags(feParameters *G,
-                     gkStore      *gkpStore,
-                     uint32        loID,
-                     uint32        hiID,
-                     Frag_List_t  *fl,
-                     uint64       &nextOlap) {
+extractReads(feParameters *G,
+             gkStore      *gkpStore,
+             Frag_List_t  *fl,
+             uint64       &nextOlap) {
+
+  //  Clear the buffer.
+
+  fl->readsLen = 0;
+  fl->basesLen = 0;
 
   //  The original converted to lowercase, and made non-acgt be 'a'.
 
@@ -81,35 +78,44 @@ Extract_Needed_Frags(feParameters *G,
   filter['G'] = filter['g'] = 'g';
   filter['T'] = filter['t'] = 't';
 
-  //  Count the amount of stuff we're loading.
+ //  Return if we've exhausted the overlaps.
 
-  fl->readsLen = 0;
-  fl->basesLen = 0;
+  if (nextOlap >= G->olapsLen)
+    return;
+
+  //  Count the amount of stuff we're loading.
 
   uint64 lastOlap = nextOlap;
-  uint32 ii       = 0;                        //  Index into reads arrays
-  uint32 fi       = G->olaps[lastOlap].b_iid;  //  Actual ID we're extracting
+  uint32 loID     = G->olaps[lastOlap].b_iid;  //  Actual ID we're extracting
+  uint32 hiID     = loID;
+  uint64 maxBases = 512 * 1024 * 1024;
 
-  assert(loID <= fi);
+  //  Find the highest read ID that we can load without exceeding maxBases.
 
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Extract_Needed_Frags()--  Loading used reads between " F_U32 " and " F_U32 ", at overlap " F_U64 ".\n", fi, hiID, lastOlap);
+  while ((fl->basesLen < maxBases) &&
+         (lastOlap     < G->olapsLen)) {
+    hiID = G->olaps[lastOlap].b_iid;                        //  Grab the ID of the overlap we're at.
 
-  while (fi <= hiID) {
-    gkRead *read = gkpStore->gkStore_getRead(fi);
+    gkRead *read = gkpStore->gkStore_getRead(hiID);         //  Grab that read.
 
-    fl->readsLen += 1;
+    fl->readsLen += 1;                                      //  Add the read to our set.
     fl->basesLen += read->gkRead_sequenceLength() + 1;
 
-    //  Advance to the next overlap
-
-    lastOlap++;
-    while ((lastOlap < G->olapsLen) && (G->olaps[lastOlap].b_iid == fi))
-      lastOlap++;
-    fi = (lastOlap < G->olapsLen) ? G->olaps[lastOlap].b_iid : hiID + 1;
+   lastOlap++;                                             //  Advance to the next overlap
+    while ((lastOlap < G->olapsLen) &&                      //
+           (G->olaps[lastOlap].b_iid == hiID))              //  If we've exceeded the max size or hit the last overlap,
+      lastOlap++;                                           //  the loop will stop on the next iteration.
   }
 
-  fprintf(stderr, "Extract_Needed_Frags()--  Loading reads for overlaps " F_U64 " to " F_U64 " (reads " F_U32 " bases " F_U64 ")\n", nextOlap, lastOlap, fl->readsLen, fl->basesLen);
+  //  If nothing to load, just return.
+
+  if (fl->readsLen == 0)
+    return;
+
+  //  Report what we're going to do.
+
+  fprintf(stderr, "extractReads()-- Loading reads " F_U32 " to " F_U32 " (" F_U32 " reads with " F_U64 " bases) overlaps " F_U64 " through " F_U64 ".\n", 
+          loID, hiID, fl->readsLen, fl->basesLen, nextOlap, lastOlap);
 
   //  Ensure there is space.
 
@@ -117,42 +123,31 @@ Extract_Needed_Frags(feParameters *G,
     delete [] fl->readIDs;
     delete [] fl->readBases;
 
-    //fprintf(stderr, "Extract_Needed_Frags()--  realloc reads from " F_U32 " to " F_U32 "\n", fl->readsMax, 12 * fl->readsLen / 10);
-
-    fl->readIDs   = new uint32 [12 * fl->readsLen / 10];
-    fl->readBases = new char * [12 * fl->readsLen / 10];
-
     fl->readsMax  = 12 * fl->readsLen / 10;
+    fl->readIDs   = new uint32 [fl->readsMax];
+    fl->readBases = new char * [fl->readsMax];
   }
 
   if (fl->basesMax < fl->basesLen) {
     delete [] fl->bases;
 
-    //fprintf(stderr, "Extract_Needed_Frags()--  realloc bases from " F_U64 " to " F_U64 "\n", fl->basesMax, 12 * fl->basesLen / 10);
-
-    fl->bases       = new char [12 * fl->basesLen / 10];
-
     fl->basesMax    = 12 * fl->basesLen / 10;
+    fl->bases       = new char [fl->basesMax];
   }
 
-  //  Load.  This is complicated by loading only the reads that have overlaps we care about.
-
-  fl->readsLen = 0;
-  fl->basesLen = 0;
+  //  Load the sequence data for reads loID to hiID, as long as the read has an overlap.
 
   gkReadData *readData = new gkReadData;
 
-  ii = 0;
-  fi = G->olaps[nextOlap].b_iid;
-
-  assert(loID <= fi);
+  fl->readsLen = 0;
+  fl->basesLen = 0;
 
-  while (fi <= hiID) {
-    gkRead *read       = gkpStore->gkStore_getRead(fi);
+  while ((loID <= hiID) &&
+         (nextOlap < G->olapsLen)) {
+    gkRead *read       = gkpStore->gkStore_getRead(loID);
 
-    fl->readIDs[ii]     = fi;
-    fl->readBases[ii]   = fl->bases + fl->basesLen;
-    fl->basesLen       += read->gkRead_sequenceLength() + 1;
+    fl->readIDs[fl->readsLen]   = loID;                          //  Save the ID of _this_ read.
+    fl->readBases[fl->readsLen] = fl->bases + fl->basesLen;      //  Set the data pointer to where this read should start.
 
     gkpStore->gkStore_loadReadData(read, readData);
 
@@ -160,31 +155,25 @@ Extract_Needed_Frags(feParameters *G,
     char   *readBases  = readData->gkReadData_getSequence();
 
     for (uint32 bb=0; bb<readLen; bb++)
-      fl->readBases[ii][bb] = filter[readBases[bb]];
+      fl->readBases[fl->readsLen][bb] = filter[readBases[bb]];
 
-    fl->readBases[ii][readLen] = 0;  //  All good reads end.
+    fl->readBases[fl->readsLen][readLen] = 0;                    //  All good reads end.
 
-    ii++;
+    fl->basesLen += read->gkRead_sequenceLength() + 1;           //  Update basesLen to account for this read.
+    fl->readsLen += 1;                                           //  And note that we loaded a read.
 
-    //  Advance to the next overlap.
-
-    nextOlap++;
-    while ((nextOlap < G->olapsLen) && (G->olaps[nextOlap].b_iid == fi))
+    nextOlap++;                                                  //  Advance past all the overlaps for this read.
+    while ((nextOlap < G->olapsLen) &&
+           (G->olaps[nextOlap].b_iid == loID))
       nextOlap++;
-    fi = (nextOlap < G->olapsLen) ? G->olaps[nextOlap].b_iid : hiID + 1;
+
+    if (nextOlap < G->olapsLen)                                  //  If we have valid overlap, grab the read ID.
+      loID = G->olaps[nextOlap].b_iid;                           //  If we don't have a valid overlap, the loop will stop.
   }
 
   delete readData;
 
-  fl->readsLen = ii;
-
-  if (fl->readsLen > 0)
-    fprintf(stderr, "Extract_Needed_Frags()--  Loaded " F_U32 " reads (%.4f%%).  Loaded IDs " F_U32 " through " F_U32 ".\n",
-            fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID),
-            fl->readIDs[0], fl->readIDs[fl->readsLen-1]);
-  else
-    fprintf(stderr, "Extract_Needed_Frags()--  Loaded " F_U32 " reads (%.4f%%).\n",
-            fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID));
+  fprintf(stderr, "extractReads()-- Loaded.\n");
 }
 
 
@@ -262,8 +251,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
 
   for (uint32 i=0; i<G->numThreads; i++) {
     thread_wa[i].thread_id    = i;
-    thread_wa[i].loID         = 0;
-    thread_wa[i].hiID         = 0;
     thread_wa[i].nextOlap     = 0;
     thread_wa[i].G            = G;
     thread_wa[i].frag_list    = NULL;
@@ -278,14 +265,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
     thread_wa[i].ped.initialize(G, G->errorRate);
   }
 
-  uint32 loID  = G->olaps[0].b_iid;
-  uint32 hiID  = loID + FRAGS_PER_BATCH - 1;
-
-  uint32 endID = G->olaps[G->olapsLen - 1].b_iid;
-
-  if (hiID > endID)
-    hiID = endID;
-
   uint64 frstOlap = 0;
   uint64 nextOlap = 0;
 
@@ -295,15 +274,13 @@ Threaded_Stream_Old_Frags(feParameters *G,
   Frag_List_t  *curr_frag_list = &frag_list_1;
   Frag_List_t  *next_frag_list = &frag_list_2;
 
-  Extract_Needed_Frags(G, gkpStore, loID, hiID, curr_frag_list, nextOlap);
+  extractReads(G, gkpStore, curr_frag_list, nextOlap);
 
-  while (loID <= endID) {
+  while (curr_frag_list->readsLen > 0) {
 
     // Process fragments in curr_frag_list in background
 
     for (uint32 i=0; i<G->numThreads; i++) {
-      thread_wa[i].loID      = loID;
-      thread_wa[i].hiID      = hiID;
       thread_wa[i].nextOlap  = frstOlap;
       thread_wa[i].frag_list = curr_frag_list;
 
@@ -315,21 +292,14 @@ Threaded_Stream_Old_Frags(feParameters *G,
 
     // Read next batch of fragments
 
-    loID = hiID + 1;
-
-    if (loID <= endID) {
-      hiID = loID + FRAGS_PER_BATCH - 1;
+    frstOlap = nextOlap;
 
-      if (hiID > endID)
-        hiID = endID;
-
-      frstOlap = nextOlap;
-
-      Extract_Needed_Frags(G, gkpStore, loID, hiID, next_frag_list, nextOlap);
-    }
+    extractReads(G, gkpStore, next_frag_list, nextOlap);
 
     // Wait for background processing to finish
 
+    fprintf(stderr, "processReads()-- Waiting for compute.\n");
+
     for (uint32 i=0; i<G->numThreads; i++) {
       void  *ptr;
 


=====================================
src/overlapErrorAdjustment/findErrors.H
=====================================
--- a/src/overlapErrorAdjustment/findErrors.H
+++ b/src/overlapErrorAdjustment/findErrors.H
@@ -72,10 +72,6 @@ using namespace std;
 //  Factor by which to grow memory in olap array when reading it
 #define  EXPANSION_FACTOR            1.4
 
-//  Number of old fragments to read into memory-based fragment
-//  store at a time for processing
-#define  FRAGS_PER_BATCH             100000
-
 //  Longest name allowed for a file in the overlap store
 #define  MAX_FILENAME_LEN            1000
 
@@ -273,8 +269,6 @@ public:
 
 struct Thread_Work_Area_t {
   int32         thread_id;
-  uint32        loID;
-  uint32        hiID;
   uint64        nextOlap;
 
   feParameters *G;


=====================================
src/pipelines/canu/Defaults.pm
=====================================
--- a/src/pipelines/canu/Defaults.pm
+++ b/src/pipelines/canu/Defaults.pm
@@ -682,7 +682,7 @@ sub setOverlapDefaults ($$$) {
 
     setOverlapDefault($tag, "OvlHashBlockLength",  undef,                     "Amount of sequence (bp) to load into the overlap hash table");
     setOverlapDefault($tag, "OvlRefBlockLength",   undef,                     "Amount of sequence (bp) to search against the hash table per batch");
-    setOverlapDefault($tag, "OvlHashBits",         ($tag eq "cor") ? 18 : 23, "Width of the kmer hash.  Width 22=1gb, 23=2gb, 24=4gb, 25=8gb.  Plus 10b per ${tag}OvlHashBlockLength");
+    setOverlapDefault($tag, "OvlHashBits",         undef,                     "Width of the kmer hash.  Width 22=1gb, 23=2gb, 24=4gb, 25=8gb.  Plus 10b per ${tag}OvlHashBlockLength");
     setOverlapDefault($tag, "OvlHashLoad",         0.75,                      "Maximum hash table load.  If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75");
     setOverlapDefault($tag, "OvlMerSize",          ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps");
     setOverlapDefault($tag, "OvlMerThreshold",     "auto",                    "K-mer frequency threshold; mers more frequent than this count are ignored; default 'auto'");


=====================================
src/pipelines/canu/OverlapErrorAdjustment.pm
=====================================
--- a/src/pipelines/canu/OverlapErrorAdjustment.pm
+++ b/src/pipelines/canu/OverlapErrorAdjustment.pm
@@ -58,9 +58,13 @@ use canu::Grid_Cloud;
 sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
     my $asm     = shift @_;
     my $maxID   = shift @_;
+
     my $rlVec   = shift @_;
+    my $rlCnt   = 0;
     my $rlSum   = 0;
+
     my $noVec   = shift @_;
+    my $noCnt   = 0;
     my $noSum   = 0;
 
     my $bin     = getBinDirectory();
@@ -75,13 +79,19 @@ sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
     while (<F>) {
         s/^\s+//;
         s/\s+$//;
+ 
+        next if (m/^readID/);   #  Header line
+        next if (m/^------/);   #  ------ ----
+
         my @v = split '\s+', $_;
+
         vec($$rlVec, $v[0], 32) = $v[2];
+        $rlCnt                 += 1;
         $rlSum                 += $v[2];
     }
     close(F);
 
-    caExit("Failed to load read lengths from '$asm.gkpStore'", undef)   if ($rlSum == 0);
+    caExit("Failed to load read lengths from '$asm.gkpStore'", undef)   if ($rlCnt == 0);
 
 
     fetchStore("unitigging/$asm.ovlStore");
@@ -92,13 +102,16 @@ sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
     while (<F>) {
         s/^\s+//;
         s/\s+$//;
+
         my @v = split '\s+', $_;
+
         vec($$noVec, $v[0], 32) = $v[1];
+        $noCnt                 += 1;
         $noSum                 += $v[1];
     }
     close(F);
 
-    caExit("Failed to load number of overlaps per read from '$asm.ovlStore'", undef)   if ($noSum == 0);
+    caExit("Failed to load number of overlaps per read from '$asm.ovlStore'", undef)   if ($noCnt == 0);
 
     return($rlSum, $noSum);
 }
@@ -127,25 +140,24 @@ sub readErrorDetectionConfigure ($) {
     my ($rlVec, $noVec);
     my ($rlSum, $noSum) = loadReadLengthsAndNumberOfOverlaps($asm, $maxID, \$rlVec, \$noVec);
 
+    if ($noSum == 0) {
+        print STDERR "--\n";
+        print STDERR "-- WARNING:\n";
+        print STDERR "-- WARNING: Found no overlaps.  Disabling Overlap Error Adjustment.\n";
+        print STDERR "-- WARNING:\n";
+        print STDERR "--\n";
+        setGlobal("enableOEA", 0);
+        return;
+    }
+
     #  Find the maximum size of each block of 100,000 reads.  findErrors reads up to 100,000 reads
     #  to process at one time.  It uses 1 * length + 4 * 100,000 bytes of memory for bases and ID storage,
     #  and has two buffers of this size.
 
-    my $maxBlockSize = 0;
-
-    for (my $id = 1; $id <= $maxID; $id += 100000) {
-        my $sum = 0;
-
-        for (my $ii=$id; ($ii < $id + 100000) && ($ii < $maxID); $ii++) {
-            $sum += vec($rlVec, $ii, 32);
-        }
-
-        $maxBlockSize = $sum   if ($maxBlockSize < $sum);
-    }
-
-    my $maxMem   = getGlobal("redMemory") * 1024 * 1024 * 1024;
-    my $maxReads = getGlobal("redBatchSize");
-    my $maxBases = getGlobal("redBatchLength");
+    my $maxBlockSize = 512 * 1024 * 1024;  #  This is hardcoded in findErrors.C
+    my $maxMem       = getGlobal("redMemory") * 1024 * 1024 * 1024;
+    my $maxReads     = getGlobal("redBatchSize");
+    my $maxBases     = getGlobal("redBatchLength");
 
     print STDERR "--\n";
     print STDERR "-- Configure RED for ", getGlobal("redMemory"), "gb memory.\n";
@@ -426,9 +438,6 @@ sub overlapErrorAdjustmentConfigure ($) {
 
     my $nj = 0;
 
-    # get earliest count of reads in store
-    my $maxID    = getNumberOfReadsEarliestVersion($asm);
-
     my $maxMem   = getGlobal("oeaMemory") * 1024 * 1024 * 1024;
     my $maxReads = getGlobal("oeaBatchSize");
     my $maxBases = getGlobal("oeaBatchLength");


=====================================
src/stores/ovStore.C
=====================================
--- a/src/stores/ovStore.C
+++ b/src/stores/ovStore.C
@@ -54,6 +54,11 @@
 
 #include "ovStore.H"
 
+#include <vector>
+#include <algorithm>
+
+using namespace std;
+
 
 
 ovStore::ovStore(const char *path, gkStore *gkp) {
@@ -525,12 +530,42 @@ ovStore::numOverlapsPerRead(uint32 numReads) {
 
 
 
+class evalueFileMap {
+public:
+  evalueFileMap() {
+    _name  = NULL;
+    _bgnID = UINT32_MAX;
+    _endID = 0;
+    _Nolap = 0;
+  }
+
+  char     *_name;
+
+  uint32    _bgnID;
+  uint32    _endID;
+  uint64    _Nolap;
+};
+
+bool
+operator<(evalueFileMap const &a, evalueFileMap const &b) {
+  return(a._bgnID < b._bgnID);
+};
+
+
+
 void
 ovStore::addEvalues(vector<char *> &fileList) {
-  char  name[FILENAME_MAX];
-  snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);
+  char  evalueName[FILENAME_MAX+1];
+  char  evalueTemp[FILENAME_MAX+1];
 
-  //  If we have an opened memory mapped file, close it.
+  //  Handy to have the name of the file we're working with.
+
+  snprintf(evalueTemp, FILENAME_MAX, "%s/evalues.WORKING", _storePath);
+  snprintf(evalueName, FILENAME_MAX, "%s/evalues",         _storePath);
+
+  //  If we have an opened memory mapped file, close it.  There _shouldn't_ be one,
+  //  as it would exist only if evalues were already added, but it might.  And if it
+  //  does exist, nuke it from disk too (well, not quite yet).
 
   if (_evaluesMap) {
     delete _evaluesMap;
@@ -539,76 +574,78 @@ ovStore::addEvalues(vector<char *> &fileList) {
     _evalues    = NULL;
   }
 
-  //  Allocate space for the evalues.
+  if (AS_UTL_fileExists(evalueName) == true) {
+    fprintf(stderr, "WARNING:\n");
+    fprintf(stderr, "WARNING: existing evalue file will be overwritten!\n");
+    fprintf(stderr, "WARNING:\n");
+  }
 
-  _evalues     = new uint16 [_info.numOverlaps()];
+  //  Scan each file, finding the first overlap in it.
 
-  //  Remove a bogus evalues file if one exists.
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Scanning.\n");
 
-  if ((AS_UTL_fileExists(name) == true) &&
-      (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info.numOverlaps()))) {
-    fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes.  Removing.\n",
-            (sizeof(uint16) * _info.numOverlaps()), AS_UTL_sizeOfFile(name));
-    AS_UTL_unlink(name);
-  }
+  evalueFileMap   *emap = new evalueFileMap [fileList.size()];
 
-  //  Clear the evalues.
+  for (uint32 ii=0; ii<fileList.size(); ii++) {
+    emap[ii]._name  = fileList[ii];
+    emap[ii]._bgnID = 0;
+    emap[ii]._endID = 0;
+    emap[ii]._Nolap = 0;
 
-  for (uint64 ii=0; ii<_info.numOverlaps(); ii++)
-    _evalues[ii] = UINT16_MAX;
+    FILE *E = AS_UTL_openInputFile(fileList[ii]);
 
-  //  For each file in the fileList, open it, read the header (bgnID, endID and
-  //  number of values), load the evalues, then copy this data to the actual
-  //  evalues file.
+    AS_UTL_safeRead(E, &emap[ii]._bgnID, "bgnID", sizeof(uint32), 1);
+    AS_UTL_safeRead(E, &emap[ii]._endID, "endID", sizeof(uint32), 1);
+    AS_UTL_safeRead(E, &emap[ii]._Nolap, "Nolap", sizeof(uint64), 1);
 
-  for (uint32 i=0; i<fileList.size(); i++) {
-    uint32        bgnID = 0;
-    uint32        endID = 0;
-    uint64        len   = 0;
+    AS_UTL_closeFile(E);
 
-    errno = 0;
-    FILE  *fp = fopen(fileList[i], "r");
-    if (errno)
-      fprintf(stderr, "Failed to open evalues file '%s': %s\n", fileList[i], strerror(errno)), exit(1);
+    fprintf(stderr, "  '%s' covers reads %7" F_U32P "-%-7" F_U32P " with %10" F_U64P " overlaps.\n",
+            emap[ii]._name, emap[ii]._bgnID, emap[ii]._endID, emap[ii]._Nolap);
+  }
 
-    AS_UTL_safeRead(fp, &bgnID, "loid",   sizeof(uint32), 1);
-    AS_UTL_safeRead(fp, &endID, "hiid",   sizeof(uint32), 1);
-    AS_UTL_safeRead(fp, &len,   "len",    sizeof(uint64), 1);
+  //  Sort the emap by starting read.
 
-    //  Figure out the overlap ID for the first overlap associated with bgnID
+  sort(emap, emap + fileList.size());
 
-    setRange(bgnID, endID);
+  //  Check that all IDs are present.
 
-    //  Load data directly into the evalue array
+  for (uint32 ii=1; ii<fileList.size(); ii++) {
+    if (emap[ii-1]._endID < emap[ii]._bgnID)
+      fprintf(stderr, "Discontinuity between files '%s' and '%s'.\n", emap[ii-1]._name, emap[ii]._name);
+  }
 
-    fprintf(stderr, "-  Loading evalues from '%s' -- ID range " F_U32 "-" F_U32 " with " F_U64 " overlaps\n",
-            fileList[i], bgnID, endID, len);
+  //  Now just copy the new evalues to the real evalues file.  The inputs two 32-bit words and
+  //  a 64-bit word at the start we need to ignore.  That's 8 16-bit words.
 
-    AS_UTL_safeRead(fp, _evalues + _offt._overlapID, "evalues", sizeof(uint16), len);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Merging.\n");
 
-    AS_UTL_closeFile(fp, fileList[i]);
-  }
+  FILE *EO = AS_UTL_openOutputFile(evalueTemp);
 
-  //  Write the evalues to disk.
+  for (uint32 ii=0; ii<fileList.size(); ii++) {
+    uint16 *ev = new uint16 [emap[ii]._Nolap + 8];
 
-  fprintf(stderr, "Saving evalues file for " F_U64 " overlaps.\n", _info.numOverlaps());
+    AS_UTL_loadFile(emap[ii]._name, ev, emap[ii]._Nolap + 8);
 
-  errno = 0;
-  FILE *F = fopen(name, "w");
-  if (errno)
-    fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno)), exit(1);
+    AS_UTL_safeWrite(EO, ev + 8, "evalues", sizeof(uint16), emap[ii]._Nolap);
 
-  AS_UTL_safeWrite(F, _evalues, "evalues", sizeof(uint16), _info.numOverlaps());
+    delete [] ev;
 
-  AS_UTL_closeFile(F, name);
+    fprintf(stderr, "  '%s' covers reads %7" F_U32P "-%-7" F_U32P "; %10" F_U64P " with overlaps.\n",
+            emap[ii]._name, emap[ii]._bgnID, emap[ii]._endID, emap[ii]._Nolap);
+  }
 
-  //  Clean up, and reopen the file.  Usually, we just delete the store after
-  //  values are loaded, so this is pointless.
+  AS_UTL_closeFile(EO, evalueTemp);
 
-  delete [] _evalues;
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Renaming.\n");
 
-  //  Open the evalues file if it isn't already opened
+  AS_UTL_unlink(evalueName);
+  AS_UTL_rename(evalueTemp, evalueName);
 
-  _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readOnly);
-  _evalues    = (uint16 *)_evaluesMap->get(0);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Success!\n");
+  fprintf(stderr, "\n");
 }



View it on GitLab: https://salsa.debian.org/med-team/canu/compare/af4642f176e567ccc5afe860c278c0b4d401bac2...b0bd43c00403591d5484a22a0d23e960204e8fad

-- 
View it on GitLab: https://salsa.debian.org/med-team/canu/compare/af4642f176e567ccc5afe860c278c0b4d401bac2...b0bd43c00403591d5484a22a0d23e960204e8fad
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180717/e02abed6/attachment-0001.html>


More information about the debian-med-commit mailing list