[med-svn] [Git][med-team/canu][upstream] New upstream version 1.7.1+dfsg
Andreas Tille
gitlab at salsa.debian.org
Tue Jul 17 11:42:32 BST 2018
Andreas Tille pushed to branch upstream at Debian Med / canu
Commits:
608e890a by Andreas Tille at 2018-07-17T12:31:54+02:00
New upstream version 1.7.1+dfsg
- - - - -
19 changed files:
- src/AS_UTL/stddev.H
- src/bogart/AS_BAT_BestOverlapGraph.C
- src/bogart/AS_BAT_BestOverlapGraph.H
- src/bogart/AS_BAT_DropDeadEnds.C
- src/bogart/AS_BAT_MarkRepeatReads.C
- src/bogart/AS_BAT_MergeOrphans.C
- src/bogart/AS_BAT_OptimizePositions.C
- src/bogart/AS_BAT_OverlapCache.C
- src/bogart/AS_BAT_PlaceContains.C
- src/bogart/AS_BAT_PopulateUnitig.C
- src/bogart/AS_BAT_Unitig.H
- src/canu_version_update.pl
- src/overlapErrorAdjustment/findErrors-Read_Frags.C
- src/overlapErrorAdjustment/findErrors-Read_Olaps.C
- src/overlapErrorAdjustment/findErrors.C
- src/overlapErrorAdjustment/findErrors.H
- src/pipelines/canu/Defaults.pm
- src/pipelines/canu/OverlapErrorAdjustment.pm
- src/stores/ovStore.C
Changes:
=====================================
src/AS_UTL/stddev.H
=====================================
--- a/src/AS_UTL/stddev.H
+++ b/src/AS_UTL/stddev.H
@@ -286,7 +286,7 @@ computeExponentialMovingAverage(TT alpha, TT ema, TT value) {
-
+#if 0
template<typename TT>
class genericStatistics {
@@ -338,12 +338,12 @@ public:
vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
- return(&_histogram);
+ return(_histogram);
};
vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
- return(&_Nstatistics);
+ return(_Nstatistics);
};
void finalizeData(void) {
@@ -375,7 +375,7 @@ private:
vector<uint64> _Nstatistics;
};
-
+#endif
@@ -422,12 +422,12 @@ public:
#if 0
vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
- return(&_histogram);
+ return(_histogram);
};
vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
- return(&_Nstatistics);
+ return(_Nstatistics);
};
#endif
=====================================
src/bogart/AS_BAT_BestOverlapGraph.C
=====================================
--- a/src/bogart/AS_BAT_BestOverlapGraph.C
+++ b/src/bogart/AS_BAT_BestOverlapGraph.C
@@ -130,6 +130,27 @@ BestOverlapGraph::removeHighErrorBestEdges(void) {
if (b5->readId() != 0) edgeStats.insert(erates[eratesLen++] = b5->erate());
if (b3->readId() != 0) edgeStats.insert(erates[eratesLen++] = b3->erate());
+
+ // If there are NO best edges, find the overlap with the most matches and use that.
+
+ if ((b5->readId() == 0) &&
+ (b3->readId() == 0)) {
+ uint32 no = 0;
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
+ uint32 bestM = 0;
+ double bestE = 0.0;
+
+ for (uint32 oo=0; oo<no; oo++) {
+ double matches = (1 - ovl[oo].erate()) * RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
+ if (bestM < matches) {
+ bestM = matches;
+ bestE = ovl[oo].erate();
+ }
+ }
+
+ if (no > 0)
+ edgeStats.insert(erates[eratesLen++] = bestE);
+ }
}
_mean = edgeStats.mean();
@@ -357,6 +378,53 @@ BestOverlapGraph::removeSpurs(const char *prefix) {
+// Mark zombie masters. Any read that has only contained overlaps (it is the container) and is the
+// smallest ID of those with no hangs, is a master. These get promoted to unitigs.
+//
+void
+BestOverlapGraph::findZombies(const char *prefix) {
+ uint32 fiLimit = RI->numReads();
+ uint32 numThreads = omp_get_max_threads();
+ uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
+
+#pragma omp parallel for schedule(dynamic, blockSize)
+ for (uint32 fi=1; fi <= fiLimit; fi++) {
+ uint32 no = 0;
+ BAToverlap *ovl = OC->getOverlaps(fi, no);
+ uint32 nc = 0;
+
+ if (no == 0)
+ continue;
+
+ for (uint32 ii=0; ii<no; ii++, nc++) // If any overlap makes A not
+ if (ovl[ii].AisContainer() == false) // a container, it's not a zombie
+ break;
+
+ if (nc < no)
+ continue;
+
+ nc = UINT32_MAX;
+
+ for (uint32 ii=0; ii<no; ii++) // Find the smallest ID
+ if ((ovl[ii].a_hang == 0) && // with no hangs.
+ (ovl[ii].b_hang == 0) &&
+ (ovl[ii].b_iid < nc))
+ nc = ovl[ii].b_iid;
+
+ if (fi < nc) { // If we're smaller, we're a
+#pragma omp critical (suspInsert) // Zombie Master!
+ {
+ writeLog("read %u is a zombie.\n", fi);
+ _zombie.insert(fi);
+ }
+ }
+ }
+
+ writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " zombie reads.\n", _zombie.size());
+}
+
+
+
void
BestOverlapGraph::findEdges(void) {
uint32 fiLimit = RI->numReads();
@@ -513,6 +581,8 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
findEdges();
}
+ findZombies(prefix);
+
reportBestEdges(prefix, logFileFlagSet(LOG_ALL_BEST_EDGES) ? "best.3.final" : "best");
// One more pass, to find any ambiguous best edges.
=====================================
src/bogart/AS_BAT_BestOverlapGraph.H
=====================================
--- a/src/bogart/AS_BAT_BestOverlapGraph.H
+++ b/src/bogart/AS_BAT_BestOverlapGraph.H
@@ -183,6 +183,7 @@ private:
void removeSuspicious(const char *prefix);
void removeSpurs(const char *prefix);
void removeLopsidedEdges(const char *prefix);
+ void findZombies(const char *prefix);
void findEdges(void);
@@ -240,6 +241,10 @@ public:
return(_suspicious.count(readid) > 0);
};
+ bool isZombie(const uint32 readid) {
+ return(_zombie.count(readid) > 0);
+ };
+
void reportEdgeStatistics(const char *prefix, const char *label);
void reportBestEdges(const char *prefix, const char *label);
@@ -283,6 +288,7 @@ private:
set<uint32> _suspicious;
set<uint32> _singleton;
set<uint32> _spur;
+ set<uint32> _zombie;
map<uint32, BestOverlaps> _bestM;
map<uint32, BestScores> _scorM;
=====================================
src/bogart/AS_BAT_DropDeadEnds.C
=====================================
--- a/src/bogart/AS_BAT_DropDeadEnds.C
+++ b/src/bogart/AS_BAT_DropDeadEnds.C
@@ -133,17 +133,16 @@ findPrevRead(Unitig *tig,
uint32
dropDeadFirstRead(AssemblyGraph *AG,
- Unitig *tig) {
+ Unitig *tig,
+ bool isForward) {
ufNode *fn = tig->firstRead();
ufNode *sn = findNextRead(tig, fn);
// No next read, keep fn in the tig.
- if (sn == NULL) {
- writeLog("dropDead()- read %8u no sn\n", fn->ident);
+ if (sn == NULL)
return(0);
- }
// Over all edges from the first read, look for any edge to something else.
//
@@ -153,24 +152,36 @@ dropDeadFirstRead(AssemblyGraph *AG,
// the tig. We assume that this is always the first read, which is OK, because the function name
// says so. Any edge to anywhere means the read is good and should be kept.
+ if (AG->getForward(fn->ident).size() == 0)
+ writeLog("dropDead()-- (%s) 1st read %8u has no edges\n", (isForward) ? "fwd" : "rev", fn->ident);
+
for (uint32 pp=0; pp<AG->getForward(fn->ident).size(); pp++) {
BestPlacement &pf = AG->getForward(fn->ident)[pp];
- writeLog("dropDead()-- 1st read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
- fn->ident,
- fn->position.isForward() ? "->" : "<-",
- pp, AG->getForward(fn->ident).size(),
- pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
if (pf.bestC.b_iid > 0) {
+ writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - contained in %u - confirmed\n",
+ (isForward) ? "fwd" : "rev",
+ fn->ident,
+ fn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(fn->ident).size(),
+ pf.bestC.b_iid);
return(0);
}
if (((fn->position.isForward() == true) && (pf.best5.b_iid != 0)) ||
- ((fn->position.isForward() == false) && (pf.best3.b_iid != 0)))
+ ((fn->position.isForward() == false) && (pf.best3.b_iid != 0))) {
+ writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - 5:%u 3:%u - confirmed\n",
+ (isForward) ? "fwd" : "rev",
+ fn->ident,
+ fn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(fn->ident).size(),
+ pf.best5.b_iid, pf.best3.b_iid);
return(0);
+ }
}
+
// But no edge means we need to check the second read. If it has an edge, then we infer the
// first read is bogus and should be removed. If it also has no edge (except to the first read,
// duh) then we know nothing: this could be novel sequence or it could be the same garbage that
@@ -180,26 +191,41 @@ dropDeadFirstRead(AssemblyGraph *AG,
// first read. Well, and that if the second read has an edge we declare the first read to be
// junk. That's also a bit of a difference from the previous loop.
+ if (AG->getForward(sn->ident).size() == 0) {
+ writeLog("dropDead()-- (%s) 2nd read %8u has no edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
+ return(0);
+ }
+
for (uint32 pp=0; pp<AG->getForward(sn->ident).size(); pp++) {
BestPlacement &pf = AG->getForward(sn->ident)[pp];
- writeLog("dropDead()-- 2nd read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
- sn->ident,
- sn->position.isForward() ? "->" : "<-",
- pp, AG->getForward(sn->ident).size(),
- pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
-
- if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident))
+ if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident)) {
+ writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - contained in %u - drop first\n",
+ (isForward) ? "fwd" : "rev",
+ sn->ident,
+ sn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(sn->ident).size(),
+ pf.bestC.b_iid);
return(fn->ident);
+ }
if (((sn->position.isForward() == true) && (pf.best5.b_iid != 0) && (pf.best5.b_iid != fn->ident)) ||
- ((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident)))
+ ((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident))) {
+ writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - 5:%u 3:8u - drop first\n",
+ (isForward) ? "fwd" : "rev",
+ sn->ident,
+ sn->position.isForward() ? "->" : "<-",
+ pp, AG->getForward(sn->ident).size(),
+ pf.best5.b_iid, pf.best3.b_iid);
return(fn->ident);
+ }
}
// Otherwise, the second read had only edges to the first read, and we should keep the first
// read.
+ writeLog("dropDead()-- (%s) 2nd read %8u has no useful external edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
+
return(0);
}
@@ -222,20 +248,32 @@ dropDeadEnds(AssemblyGraph *AG,
(tig->_isUnassembled == true))
continue;
- uint32 fn = dropDeadFirstRead(AG, tig); // Decide if the first read is junk.
+ writeLog("\n");
+ writeLog("dropDead()-- testing tig %u length %u\n", ti, tig->getLength());
+
+ uint32 fn = dropDeadFirstRead(AG, tig, true); // Decide if the first read is junk.
+
+ //fprintf(stderr, "drop dead for tig %u (flipped)\n", ti);
tig->reverseComplement(); // Flip.
- uint32 ln = dropDeadFirstRead(AG, tig); // Decide if the last (now first) read is junk.
+ uint32 ln = dropDeadFirstRead(AG, tig, false); // Decide if the last (now first) read is junk.
tig->reverseComplement(); // Flip back.
- if ((fn == 0) && (ln == 0)) // Nothing to remove, just get out of here.
+ if ((fn == 0) && (ln == 0)) { // Nothing to remove, nothing to do.
+ writeLog("dropDead()-- both ends confirmed\n");
continue;
+ }
+
+ if (fn == ln) { // The same thing to remove, ignore it.
+ writeLog("dropDead()-- retaining spanning read %u\n", fn);
+ continue;
+ }
// At least one read needs to be kicked out. Make new tigs for everything.
- char fnMsg[80] = {0}; Unitig *fnTig = NULL;
- char nnMsg[80] = {0}; Unitig *nnTig = NULL; int32 nnOff = INT32_MAX;
- char lnMsg[80] = {0}; Unitig *lnTig = NULL;
+ Unitig *fnTig = NULL;
+ Unitig *nnTig = NULL; int32 nnOff = INT32_MAX;
+ Unitig *lnTig = NULL;
if (fn > 0)
fnTig = tigs.newUnitig(false);
@@ -256,32 +294,26 @@ dropDeadEnds(AssemblyGraph *AG,
// Move reads to their new unitig.
- strcpy(fnMsg, " ");
- strcpy(nnMsg, " ");
- strcpy(lnMsg, "");
-
for (uint32 cc=0, tt=0; tt<tig->ufpath.size(); tt++) {
ufNode &read = tig->ufpath[tt];
if (read.ident == fn) {
- sprintf(fnMsg, "first read %9u to tig %7u --", read.ident, fnTig->id());
+ writeLog("dropDead()-- tig %u gets first read %u\n", fnTig->id(), read.ident);
fnTig->addRead(read, -read.position.min(), false);
} else if (read.ident == ln) {
- sprintf(lnMsg, "-- last read %9u to tig %7u", read.ident, lnTig->id());
+ writeLog("dropDead()-- tig %u gets last read %u\n", lnTig->id(), read.ident);
lnTig->addRead(read, -read.position.min(), false);
} else {
if (nnOff == INT32_MAX) {
- sprintf(nnMsg, "other reads to tig %7u", nnTig->id());
+ writeLog("dropDead()-- tig %u gets all other reads\n", nnTig->id());
nnOff = read.position.min();
}
nnTig->addRead(read, -nnOff, false);
}
}
- writeLog("dropDeadEnds()-- tig %7u --> %s %s %s\n", tig->id(), fnMsg, nnMsg, lnMsg);
-
if (fnTig) fnTig->cleanUp(); // Probably not neeeded, but cheap.
if (lnTig) lnTig->cleanUp(); // Probably not neeeded, but cheap.
if (nnTig) nnTig->cleanUp(); // Most likely needed.
@@ -292,6 +324,6 @@ dropDeadEnds(AssemblyGraph *AG,
tigs[ti] = NULL;
}
- writeStatus("dropDeadEnds()-- Modified %u tigs. Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
+ writeStatus("dropDead()-- Modified %u tigs. Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
numT, numF, numL, numB, (numB == 1) ? "" : "s");
}
=====================================
src/bogart/AS_BAT_MarkRepeatReads.C
=====================================
--- a/src/bogart/AS_BAT_MarkRepeatReads.C
+++ b/src/bogart/AS_BAT_MarkRepeatReads.C
@@ -702,12 +702,15 @@ findConfusedEdges(TigVector &tigs,
uint32 tgBid = tigs.inUnitig(rdBid);
// If the read is in a singleton, skip. These are unassembled crud.
- if ((tgBid == 0) ||
+ if ((tgBid == 0) ||
(tigs[tgBid] == NULL) ||
(tigs[tgBid]->ufpath.size() == 1))
continue;
// Skip if this overlap is the best we're trying to match.
+ //
+ // NOTE. This doesn't care about potential duplicate overlaps between a pair of reads,
+ // as we're looking for thicker overlaps off only one end of the read.
if ((rdBid == b5->readId()) ||
(rdBid == b3->readId()))
continue;
=====================================
src/bogart/AS_BAT_MergeOrphans.C
=====================================
--- a/src/bogart/AS_BAT_MergeOrphans.C
+++ b/src/bogart/AS_BAT_MergeOrphans.C
@@ -73,8 +73,15 @@ typedef map<uint32, vector<uint32> > BubTargetList;
-// Decide which tigs can be orphans. Any unitig where (nearly) every dovetail read has an overlap
-// to some other unitig is a candidate for orphan popping.
+// Decide which tigs can be orphans. Any unitig where (nearly) every dovetail
+// read has an overlap to some other unitig is a candidate for orphan popping.
+//
+// Counts the number of reads that have an overlap to some other tig
+// (tigOlapsTo). if more than half the reads in the tig have an overlap to
+// some other tig, it is a potential place to pop the bubble.
+//
+// Returns BubTargetList, a map of uint32 to vector<uint32>, of the potential
+// places that some tig could be popped into.
void
findPotentialOrphans(TigVector &tigs,
@@ -617,13 +624,16 @@ mergeOrphans(TigVector &tigs,
vector<candidatePop *> targets;
for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
- saveCorrectlySizedInitialIntervals(orphan,
- tigs[it->first], // The targetID in targetIntervals
- it->second, // The interval list in targetIntervals
- fReadID,
- lReadID,
- placed,
- targets);
+ if (tigs[it->first] == NULL)
+ writeLog("mergeOrphans()-- orphan %u wants to go into nonexistent tig %u!\n", ti, it->first);
+ else
+ saveCorrectlySizedInitialIntervals(orphan,
+ tigs[it->first], // The targetID in targetIntervals
+ it->second, // The interval list in targetIntervals
+ fReadID,
+ lReadID,
+ placed,
+ targets);
targetIntervals.clear(); // intervalList already freed.
@@ -657,7 +667,7 @@ mergeOrphans(TigVector &tigs,
targets[tt]->placed[op].frgID,
targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);
- writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads.\n",
+ writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_U32 " reads, had %3" F_U32 " reads.\n",
orphan->id(), orphan->getLength(),
targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
orphanSize, targetSize);
=====================================
src/bogart/AS_BAT_OptimizePositions.C
=====================================
--- a/src/bogart/AS_BAT_OptimizePositions.C
+++ b/src/bogart/AS_BAT_OptimizePositions.C
@@ -59,6 +59,77 @@ public:
+// Decide if two reads in a tig are compatible with an overlap.
+//
+// Returns true if the reads are overlapping in the tig, and are oriented consistent with the overlap.
+//
+bool
+Unitig::optimize_isCompatible(uint32 ii,
+ uint32 jj,
+ BAToverlap &olap,
+ bool inInit,
+ bool secondPass,
+ bool beVerbose) {
+
+ SeqInterval &ip = ufpath[ii].position;
+ SeqInterval &jp = ufpath[jj].position;
+
+
+ bool isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
+
+ // Decide if the overlap preserves the ordering in the tig.
+ //
+ // This is of questionable value, and might be incorrect. We can have two overlaps between
+ // a pair of reads iff the orientation differs - the flipped flag. That gets tested elsewhere.
+
+ bool isOrdered = true;
+
+#if 1
+ bool isOvlLo = ((jp.min() <= ip.min()) && (ip.min() <= jp.max()) && (jp.max() <= ip.max()));
+ bool isOvlHi = ((ip.min() <= jp.min()) && (jp.min() <= ip.max()) && (ip.max() <= jp.max()));
+
+ //bool isOvlLo = jp.min() < ip.min(); // Is the above too rigorous?
+ //bool isOvlHi = ip.min() < jp.min();
+
+ // isOvl tells the end of the read that should have the overlap (according to the tig layout),
+ // so check if the overlap actually is on that end. If not, flag this as 'mis ordered'.
+
+ if (((isOvlLo == true) && (ip.isForward()) && (olap.AEndIs5prime() == false)) ||
+ ((isOvlLo == true) && (ip.isReverse()) && (olap.AEndIs3prime() == false)) ||
+ ((isOvlHi == true) && (ip.isForward()) && (olap.AEndIs3prime() == false)) ||
+ ((isOvlHi == true) && (ip.isReverse()) && (olap.AEndIs5prime() == false)))
+ isOrdered = false;
+
+ // But if in a containment relationship, it cannot be mis ordered.
+ if ((olap.AisContained() == true) ||
+ (olap.AisContainer() == true))
+ isOrdered = true;
+#endif
+
+ bool isOriented = true;
+
+ if (((ip.isForward() == jp.isForward()) && (olap.flipped == true)) ||
+ ((ip.isForward() != jp.isForward()) && (olap.flipped == false)))
+ isOriented = false;
+
+ if ((beVerbose) || (secondPass))
+ writeLog("optimize_%s()-- tig %7u read %9u (at %9d %9d) olap to read %9u (at %9d %9d) - hangs %7d %7d - %s %s ovlLo %d ovlHi %d contained %d container %d\n",
+ (inInit) ? "initPlace" : "recompute",
+ id(),
+ ufpath[ii].ident, ip.bgn, ip.end,
+ ufpath[jj].ident, jp.bgn, jp.end,
+ olap.a_hang, olap.b_hang,
+ (isOvl == true) ? "overlapping" : "not-overlapping",
+ (isOrdered == true) ? "ordered" : "mis-ordered",
+ isOvlLo, isOvlHi, olap.AisContained(), olap.AisContainer());
+
+ return((isOvl == true) &&
+ (isOrdered == true) &&
+ (isOriented == true));
+}
+
+
+
void
Unitig::optimize_initPlace(uint32 ii,
optPos *op,
@@ -87,37 +158,18 @@ Unitig::optimize_initPlace(uint32 ii,
uint32 uu = inUnitig (jid);
uint32 jj = ufpathIdx(jid);
- // Probably overkill, but report ALL overlaps for the troubling reads.
-
- if ((beVerbose) || (firstPass == false))
- writeLog("optimize_initPlace()-- olap %u a %u b %u hangs %d %d\n", oo, iid, jid, ovl[oo].a_hang, ovl[oo].b_hang);
-
- if (uu != id()) // Skip if the overlap is to a different tig.
- continue; // (the ufpathIdx() call is valid, but using it isn't)
-
- // Reads are in the same tig. Decide if they overlap in position.
-
- bool isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
-
- // Log! beVerbose should be true for the second pass, but just in case it isn't.
-
- if ((beVerbose) || (firstPass == false))
- writeLog("optimize_initPlace()-- olap %4u tig %7u read %8u (at %9d %9d) olap to read %8u (at %9d %9d) - hangs %7d %7d - %s %s\n",
- oo, id(),
- iid, ufpath[ii].position.bgn, ufpath[ii].position.end,
- jid, ufpath[jj].position.bgn, ufpath[jj].position.end,
- ovl[oo].a_hang, ovl[oo].b_hang,
- (isOvl == true) ? "overlapping" : "not-overlapping",
- (jj > ii) ? "after" : "before");
+ if (uu != id()) // Skip if to a different tig.
+ continue; // (otherwise, ufpath[jj] is invalid below)
- if (isOvl == false) // Skip if the reads
- continue; // don't overlap
+ // Skip if:
+ // this is the first pass, but the overlap is to a read after us.
+ // the overlap isn't compatible with the layout.
- if ((firstPass) && (jj > ii)) // We're setting initial positions, so overlaps to reads after
- continue; // us aren't correct, unless we're in the 2nd pass
+ if (((firstPass) && (ii < jj)) ||
+ (optimize_isCompatible(ii, jj, ovl[oo], true, !firstPass, beVerbose) == false))
+ continue;
- // Reads overlap. Compute the position of the read using
- // the overlap and the other read.
+ // Compute the position of the read using the overlap and the other read.
nmin += (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
cnt += 1;
@@ -176,8 +228,9 @@ Unitig::optimize_recompute(uint32 iid,
uint32 cnt = 0;
if (beVerbose) {
- writeLog("optimize()-- tig %8u read %8u previous - %9.2f-%-9.2f\n", id(), iid, op[iid].min, op[iid].max);
- writeLog("optimize()-- tig %8u read %8u length - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
+ writeLog("optimize()--\n");
+ writeLog("optimize()-- tig %8u read %9u previous - %9.2f-%-9.2f\n", id(), iid, op[iid].min, op[iid].max);
+ writeLog("optimize()-- tig %8u read %9u length - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
}
// Process all overlaps.
@@ -187,26 +240,34 @@ Unitig::optimize_recompute(uint32 iid,
uint32 uu = inUnitig (jid);
uint32 jj = ufpathIdx(jid);
- if (uu != id()) // Skip if the overlap is to a different tig.
- continue; // (the ufpathIdx() call is valid, but using it isn't)
-
- if (isOverlapping(ufpath[ii].position, ufpath[jj].position) == false) // Skip if the reads
- continue; // don't overlap
+ if (uu != id()) // Skip if to a different tig.
+ continue; // (otherwise, ufpath[jj] is invalid below)
- // Reads overlap. Compute the position of the read using
- // the overlap and the other read.
+ // Compute the position of the read using the overlap and the other read.
double tmin = (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
double tmax = (op[iid].fwd) ? (op[jid].max - ovl[oo].b_hang) : (op[jid].max + ovl[oo].a_hang);
if (beVerbose)
- writeLog("optimize()-- tig %8u read %8u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
+ writeLog("optimize()-- tig %8u read %9u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
+
+ // Skip if the overlap isn't compatible with the layout.
+
+ if (optimize_isCompatible(ii, jj, ovl[oo], false, false, beVerbose) == false)
+ continue;
+
+ // Update estimate.
nmin += tmin;
nmax += tmax;
cnt += 1;
- } // over all overlaps
+ }
+ if (cnt == 0) {
+ writeLog("Failed to optimize read %u in tig %u\n", iid, id());
+ fprintf(stderr, "Failed to optimize read %u in tig %u\n", iid, id());
+ flushLog();
+ }
assert(cnt > 0);
// Add in some evidence for the bases in the read. We want higher weight than the overlaps,
@@ -226,7 +287,7 @@ Unitig::optimize_recompute(uint32 iid,
double dmax = 2 * (op[iid].max - np[iid].max) / (op[iid].max + np[iid].max);
double npll = np[iid].max - np[iid].min;
- writeLog("optimize()-- tig %8u read %8u - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
+ writeLog("optimize()-- tig %8u read %9u - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
id(), iid,
np[iid].min, np[iid].max,
npll, readLen,
@@ -237,8 +298,6 @@ Unitig::optimize_recompute(uint32 iid,
-
-
void
Unitig::optimize_expand(optPos *op) {
@@ -300,7 +359,7 @@ Unitig::optimize_setPositions(optPos *op,
if (op[iid].fwd) {
if (beVerbose)
- writeLog("optimize()-- read %8u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+ writeLog("optimize()-- read %9u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
iid,
ufpath[ii].position.bgn,
ufpath[ii].position.end,
@@ -315,7 +374,7 @@ Unitig::optimize_setPositions(optPos *op,
ufpath[ii].position.end = (int32)op[iid].max;
} else {
if (beVerbose)
- writeLog("optimize()-- read %8u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
+ writeLog("optimize()-- read %9u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
iid,
ufpath[ii].position.bgn,
ufpath[ii].position.end,
@@ -358,14 +417,15 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
optPos *np = new optPos [fiLimit];
for (uint32 fi=0; fi<fiLimit; fi++) {
- uint32 ti = inUnitig(fi);
- uint32 pp = ufpathIdx(fi);
+ uint32 ti = inUnitig(fi);
+ uint32 pp = ufpathIdx(fi);
+ Unitig *tig = operator[](ti);
- if (ti == 0)
+ if (tig == NULL)
continue;
- op[fi].set(operator[](ti)->ufpath[pp]);
- np[fi].set(operator[](ti)->ufpath[pp]);
+ op[fi].set(tig->ufpath[pp]);
+ np[fi].set(tig->ufpath[pp]);
}
// Compute initial positions using previously placed reads and the read length.
@@ -383,7 +443,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
Unitig *tig = operator[](ti);
set<uint32> failed;
- if (tig == NULL)
+ if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
for (uint32 ii=0; ii<tig->ufpath.size(); ii++)
@@ -406,12 +466,13 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
#pragma omp parallel for schedule(dynamic, fiBlockSize)
for (uint32 fi=0; fi<fiLimit; fi++) {
- uint32 ti = inUnitig(fi);
+ uint32 ti = inUnitig(fi);
+ Unitig *tig = operator[](ti);
- if (ti == 0)
+ if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
- operator[](ti)->optimize_recompute(fi, op, np, beVerbose);
+ tig->optimize_recompute(fi, op, np, beVerbose);
}
// Reset zero
@@ -421,7 +482,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
- if (tig == NULL)
+ if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
int32 z = np[ tig->ufpath[0].ident ].min;
@@ -479,7 +540,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
- if (tig == NULL)
+ if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
tig->optimize_expand(op);
@@ -494,7 +555,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
- if (tig == NULL)
+ if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
tig->optimize_setPositions(op, beVerbose);
=====================================
src/bogart/AS_BAT_OverlapCache.C
=====================================
--- a/src/bogart/AS_BAT_OverlapCache.C
+++ b/src/bogart/AS_BAT_OverlapCache.C
@@ -344,7 +344,7 @@ uint32
OverlapCache::filterDuplicates(uint32 &no) {
uint32 nFiltered = 0;
- for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
+ for (uint32 ii=0, jj=1, dd=0; jj<no; ii++, jj++) {
if (_ovs[ii].b_iid != _ovs[jj].b_iid)
continue;
@@ -352,22 +352,35 @@ OverlapCache::filterDuplicates(uint32 &no) {
nFiltered++;
- // Drop the shorter overlap, or the one with the higher erate.
+ // Drop the weaker overlap. If a tie, drop the flipped one.
- uint32 iilen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
- uint32 jjlen = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang());
+ double iiSco = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang()) * _ovs[ii].erate();
+ double jjSco = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang()) * _ovs[jj].erate();
- if (iilen == jjlen) {
- if (_ovs[ii].evalue() < _ovs[jj].evalue())
- jjlen = 0;
- else
- iilen = 0;
+ if (iiSco == jjSco) { // Hey gcc! See how nice I was by putting brackets
+ if (_ovs[ii].flipped()) // around this so you don't get confused by the
+ iiSco = 0; // non-ambiguous ambiguous else clause?
+ else //
+ jjSco = 0; // You're welcome.
}
- if (iilen < jjlen)
- _ovs[ii].a_iid = _ovs[ii].b_iid = 0;
+ if (iiSco < jjSco)
+ dd = ii;
else
- _ovs[jj].a_iid = _ovs[jj].b_iid = 0;
+ dd = jj;
+
+#if 0
+ writeLog("OverlapCache::filterDuplicates()-- Dropping overlap A: %9" F_U64P " B: %9" F_U64P " - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
+ _ovs[dd].a_iid,
+ _ovs[dd].b_iid,
+ _ovs[dd].a_hang(),
+ _ovs[dd].b_hang(),
+ _ovs[dd].erate(),
+ _ovs[dd].flipped() ? "flipped" : "");
+#endif
+
+ _ovs[dd].a_iid = 0;
+ _ovs[dd].b_iid = 0;
}
// If nothing was filtered, return.
@@ -616,30 +629,29 @@ OverlapCache::loadOverlaps(ovStore *ovlStore, bool doSave) {
+// Binary search a list of overlaps for one matching bID and flipped.
bool
-searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
+searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID, bool flipped) {
+ int32 F = 0;
+ int32 L = ovlLen - 1;
+ int32 M = 0;
#ifdef TEST_LINEAR_SEARCH
bool linearSearchFound = false;
for (uint32 ss=0; ss<ovlLen; ss++)
- if (ovl[ss].b_iid == bID) {
+ if ((ovl[ss].b_iid == bID) &&
+ (ovl[ss].flipped == flipped)) {
linearSearchFound = true;
break;
}
#endif
- // If not, these are repeats and we should binary search everything.
- // There will be no short lists where we could exhaustively search.
-
- int32 F = 0;
- int32 L = ovlLen - 1;
- int32 M = 0;
-
while (F <= L) {
M = (F + L) / 2;
- if (ovl[M].b_iid == bID) {
+ if ((ovl[M].b_iid == bID) &&
+ (ovl[M].flipped == flipped)) {
ovl[M].symmetric = true;
#ifdef TEST_LINEAR_SEARCH
assert(linearSearchFound == true);
@@ -647,7 +659,8 @@ searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
return(true);
}
- if (ovl[M].b_iid < bID)
+ if (((ovl[M].b_iid < bID)) ||
+ ((ovl[M].b_iid == bID) && (ovl[M].flipped < flipped)))
F = M+1;
else
L = M-1;
@@ -694,7 +707,7 @@ OverlapCache::symmetrizeOverlaps(void) {
// Search for the twin overlap, and if found, we're done. The twin is marked as symmetric in the function.
- if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr)) {
+ if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr, _overlaps[rr][oo].flipped)) {
_overlaps[rr][oo].symmetric = true;
continue;
}
=====================================
src/bogart/AS_BAT_PlaceContains.C
=====================================
--- a/src/bogart/AS_BAT_PlaceContains.C
+++ b/src/bogart/AS_BAT_PlaceContains.C
@@ -65,6 +65,11 @@ breakSingletonTigs(TigVector &tigs) {
if (utg->ufpath.size() > 1)
continue;
+ if (OG->isZombie(utg->ufpath[0].ident) == true) {
+ writeLog("Not breaking sinleton zombie tig %u with read %u\n", ti, utg->ufpath[0].ident);
+ continue;
+ }
+
tigs[ti] = NULL; // Remove the tig from the list
tigs.registerRead(utg->ufpath[0].ident); // Eject the read
delete utg; // Reclaim space
@@ -125,6 +130,16 @@ placeUnplacedUsingAllOverlaps(TigVector &tigs,
placeReadUsingOverlaps(tigs, NULL, fid, placements, placeRead_fullMatch);
+ // If all placements are in singletons, allow them. If any placement is to a 'real' tig,
+ // ignore singleton placements.
+
+ bool ignoreSingleton = false;
+
+ for (uint32 i=0; i<placements.size(); i++)
+ if ((placements[i].fCoverage >= 0.99) &&
+ (tigs[placements[i].tigID]->ufpath.size() > 1))
+ ignoreSingleton = true;
+
// Search the placements for the highest expected identity placement using all overlaps in the unitig.
uint32 b = UINT32_MAX;
@@ -132,10 +147,11 @@ placeUnplacedUsingAllOverlaps(TigVector &tigs,
for (uint32 i=0; i<placements.size(); i++) {
Unitig *tig = tigs[placements[i].tigID];
- if (placements[i].fCoverage < 0.99) // Ignore partially placed reads.
+ if (placements[i].fCoverage < 0.99) // Ignore partially placed reads.
continue;
- if (tig->ufpath.size() == 1) // Ignore placements in singletons.
+ if ((ignoreSingleton == true) &&
+ (tig->ufpath.size() == 1)) // Ignore placements in singletons.
continue;
uint32 bgn = placements[i].position.min();
=====================================
src/bogart/AS_BAT_PopulateUnitig.C
=====================================
--- a/src/bogart/AS_BAT_PopulateUnitig.C
+++ b/src/bogart/AS_BAT_PopulateUnitig.C
@@ -119,8 +119,11 @@ populateUnitig(TigVector &tigs,
int32 fi) {
if ((RI->readLength(fi) == 0) || // Skip deleted
- (tigs.inUnitig(fi) != 0) || // Skip placed
- (OG->isContained(fi) == true)) // Skip contained
+ (tigs.inUnitig(fi) != 0)) // Skip placed
+ return;
+
+ if ((OG->isContained(fi) == true) && // Skip contained...
+ (OG->isZombie(fi) == false)) // that aren't zombies.
return;
Unitig *utg = tigs.newUnitig(logFileFlagSet(LOG_BUILD_UNITIG));
@@ -140,6 +143,22 @@ populateUnitig(TigVector &tigs,
utg->addRead(read, 0, logFileFlagSet(LOG_BUILD_UNITIG));
+ // If suspicious or a zombie, don't bother trying to extend. In the former
+ // case, we don't want to extend, and in the latter case, there isn't anything
+ // to extend.
+
+ if (OG->isSuspicious(fi)) {
+ writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
+ utg->ufpath.back().ident, utg->id());
+ return;
+ }
+
+ if (OG->isZombie(fi)) {
+ writeLog("Stopping unitig construction of zombie read %d in unitig %d\n",
+ utg->ufpath.back().ident, utg->id());
+ return;
+ }
+
// Add reads as long as there is a path to follow...from the 3' end of the first read.
BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(fi, false);
@@ -150,34 +169,6 @@ populateUnitig(TigVector &tigs,
assert(bestedge3->ahang() >= 0);
assert(bestedge3->bhang() >= 0);
- // If this read is not covered by the two best overlaps we are finished. We will not follow
- // the paths out. This indicates either low coverage, or a chimeric read. If it is low
- // coverage, then the best overlaps will be mutual and we'll recover the same path. If it is a
- // chimeric read the overlaps will not be mutual and we will skip this read.
- //
- // The amount of our read that is covered by the two best overlaps is
- //
- // (readLen + bestedge5->bhang()) + (readLen - bestedge3->ahang())
- //
- // If that is not significantly longer than the read length, then we will not use this
- // read as a seed for unitig construction.
- //
-
- if (OG->isSuspicious(fi))
- return;
-
-#if 0
- uint32 covered = RI->readLength(fi) + bestedge5->bhang() + RI->readLength(fi) - bestedge3->ahang();
-
- // This breaks tigs at 0x best-coverage regions. There might be a contain that spans (joins)
- // the two best overlaps to verify the read, but we can't easily tell right now.
- if (covered < RI->readLength(fi) + AS_OVERLAP_MIN_LEN / 2) {
- writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
- utg->ufpath.back().ident, utg->id());
- return;
- }
-#endif
-
if (logFileFlagSet(LOG_BUILD_UNITIG))
writeLog("Adding 5' edges off of read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
=====================================
src/bogart/AS_BAT_Unitig.H
=====================================
--- a/src/bogart/AS_BAT_Unitig.H
+++ b/src/bogart/AS_BAT_Unitig.H
@@ -40,6 +40,7 @@
#include "AS_global.H"
#include "AS_BAT_TigVector.H"
+#include "AS_BAT_OverlapCache.H"
#include "stddev.H"
@@ -204,6 +205,13 @@ public:
void cleanUp(void);
// Recompute bgn/end positions using all overlaps.
+ bool optimize_isCompatible(uint32 ii,
+ uint32 jj,
+ BAToverlap &olap,
+ bool inInit,
+ bool secondPass,
+ bool beVerbose);
+
void optimize_initPlace(uint32 pp,
optPos *op,
optPos *np,
=====================================
src/canu_version_update.pl
=====================================
--- a/src/canu_version_update.pl
+++ b/src/canu_version_update.pl
@@ -32,7 +32,7 @@ my $cwd = getcwd();
my $label = "snapshot"; # Automagically set to 'release' for releases.
my $major = "1"; # Bump before release.
-my $minor = "7"; # Bump before release.
+my $minor = "7.1"; # Bump before release.
my $commits = "0";
my $hash1 = undef; # This from 'git describe'
=====================================
src/overlapErrorAdjustment/findErrors-Read_Frags.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors-Read_Frags.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Frags.C
@@ -60,9 +60,6 @@ Read_Frags(feParameters *G,
uint64 votesLength = 0;
uint64 readsLoaded = 0;
- fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 "\n",
- G->bgnID, G->endID);
-
for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
gkRead *read = gkpStore->gkStore_getRead(curID);
@@ -74,11 +71,7 @@ Read_Frags(feParameters *G,
sizeof(Vote_Tally_t) * votesLength +
sizeof(Frag_Info_t) * G->readsLen);
- fprintf(stderr, "Read_Frags()-- allocate " F_U64 " MB for bases, votes and info, for %u reads of total length " F_U64 " (%.2f MB)\n",
- totAlloc >> 20,
- G->endID - G->bgnID + 1,
- basesLength,
- totAlloc / 1024.0 / 1024.0);
+ fprintf(stderr, "Read_Frags()-- Loading target reads " F_U32 " through " F_U32 " with " F_U64 " bases.\n", G->bgnID, G->endID, basesLength);
G->readBases = new char [basesLength];
G->readVotes = new Vote_Tally_t [votesLength]; // NO constructor, MUST INIT
@@ -122,6 +115,6 @@ Read_Frags(feParameters *G,
delete readData;
- fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 " -- loaded " F_U64 " bases in " F_U64 " reads.\n",
- G->bgnID, G->endID-1, basesLength, readsLoaded);
+ fprintf(stderr, "Read_Frags()-- %.3f GB for bases, votes and info.\n", totAlloc / 1024.0 / 1024.0 / 1024.0);
+ fprintf(stderr, "\n");
}
=====================================
src/overlapErrorAdjustment/findErrors-Read_Olaps.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
+++ b/src/overlapErrorAdjustment/findErrors-Read_Olaps.C
@@ -41,8 +41,7 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
uint64 numolaps = ovs->numOverlapsInRange();
- fprintf(stderr, "Read_Olaps()-- loading " F_U64 " overlaps (%.2f MB).\n",
- numolaps, sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0);
+ fprintf(stderr, "Read_Olaps()-- Loading " F_U64 " overlaps.\n", numolaps);
G->olaps = new Olap_Info_t [numolaps];
G->olapsLen = 0;
@@ -68,6 +67,9 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
}
delete ovs;
+
+ fprintf(stderr, "Read_Olaps()-- %.3f GB for overlaps..\n", sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0 / 1024.0);
+ fprintf(stderr, "\n");
}
=====================================
src/overlapErrorAdjustment/findErrors.C
=====================================
--- a/src/overlapErrorAdjustment/findErrors.C
+++ b/src/overlapErrorAdjustment/findErrors.C
@@ -51,23 +51,20 @@ Output_Corrections(feParameters *G);
-// From overlapInCore.C
-int
-Binomial_Bound(int e, double p, int Start, double Limit);
-
-
-
// Read fragments lo_frag..hi_frag (INCLUSIVE) from store and save the ids and sequences of those
// with overlaps to fragments in global Frag .
static
void
-Extract_Needed_Frags(feParameters *G,
- gkStore *gkpStore,
- uint32 loID,
- uint32 hiID,
- Frag_List_t *fl,
- uint64 &nextOlap) {
+extractReads(feParameters *G,
+ gkStore *gkpStore,
+ Frag_List_t *fl,
+ uint64 &nextOlap) {
+
+ // Clear the buffer.
+
+ fl->readsLen = 0;
+ fl->basesLen = 0;
// The original converted to lowercase, and made non-acgt be 'a'.
@@ -81,35 +78,44 @@ Extract_Needed_Frags(feParameters *G,
filter['G'] = filter['g'] = 'g';
filter['T'] = filter['t'] = 't';
- // Count the amount of stuff we're loading.
+ // Return if we've exhausted the overlaps.
- fl->readsLen = 0;
- fl->basesLen = 0;
+ if (nextOlap >= G->olapsLen)
+ return;
+
+ // Count the amount of stuff we're loading.
uint64 lastOlap = nextOlap;
- uint32 ii = 0; // Index into reads arrays
- uint32 fi = G->olaps[lastOlap].b_iid; // Actual ID we're extracting
+ uint32 loID = G->olaps[lastOlap].b_iid; // Actual ID we're extracting
+ uint32 hiID = loID;
+ uint64 maxBases = 512 * 1024 * 1024;
- assert(loID <= fi);
+ // Find the highest read ID that we can load without exceeding maxBases.
- fprintf(stderr, "\n");
- fprintf(stderr, "Extract_Needed_Frags()-- Loading used reads between " F_U32 " and " F_U32 ", at overlap " F_U64 ".\n", fi, hiID, lastOlap);
+ while ((fl->basesLen < maxBases) &&
+ (lastOlap < G->olapsLen)) {
+ hiID = G->olaps[lastOlap].b_iid; // Grab the ID of the overlap we're at.
- while (fi <= hiID) {
- gkRead *read = gkpStore->gkStore_getRead(fi);
+ gkRead *read = gkpStore->gkStore_getRead(hiID); // Grab that read.
- fl->readsLen += 1;
+ fl->readsLen += 1; // Add the read to our set.
fl->basesLen += read->gkRead_sequenceLength() + 1;
- // Advance to the next overlap
-
- lastOlap++;
- while ((lastOlap < G->olapsLen) && (G->olaps[lastOlap].b_iid == fi))
- lastOlap++;
- fi = (lastOlap < G->olapsLen) ? G->olaps[lastOlap].b_iid : hiID + 1;
+ lastOlap++; // Advance to the next overlap
+ while ((lastOlap < G->olapsLen) && //
+ (G->olaps[lastOlap].b_iid == hiID)) // If we've exceeded the max size or hit the last overlap,
+ lastOlap++; // the loop will stop on the next iteration.
}
- fprintf(stderr, "Extract_Needed_Frags()-- Loading reads for overlaps " F_U64 " to " F_U64 " (reads " F_U32 " bases " F_U64 ")\n", nextOlap, lastOlap, fl->readsLen, fl->basesLen);
+ // If nothing to load, just return.
+
+ if (fl->readsLen == 0)
+ return;
+
+ // Report what we're going to do.
+
+ fprintf(stderr, "extractReads()-- Loading reads " F_U32 " to " F_U32 " (" F_U32 " reads with " F_U64 " bases) overlaps " F_U64 " through " F_U64 ".\n",
+ loID, hiID, fl->readsLen, fl->basesLen, nextOlap, lastOlap);
// Ensure there is space.
@@ -117,42 +123,31 @@ Extract_Needed_Frags(feParameters *G,
delete [] fl->readIDs;
delete [] fl->readBases;
- //fprintf(stderr, "Extract_Needed_Frags()-- realloc reads from " F_U32 " to " F_U32 "\n", fl->readsMax, 12 * fl->readsLen / 10);
-
- fl->readIDs = new uint32 [12 * fl->readsLen / 10];
- fl->readBases = new char * [12 * fl->readsLen / 10];
-
fl->readsMax = 12 * fl->readsLen / 10;
+ fl->readIDs = new uint32 [fl->readsMax];
+ fl->readBases = new char * [fl->readsMax];
}
if (fl->basesMax < fl->basesLen) {
delete [] fl->bases;
- //fprintf(stderr, "Extract_Needed_Frags()-- realloc bases from " F_U64 " to " F_U64 "\n", fl->basesMax, 12 * fl->basesLen / 10);
-
- fl->bases = new char [12 * fl->basesLen / 10];
-
fl->basesMax = 12 * fl->basesLen / 10;
+ fl->bases = new char [fl->basesMax];
}
- // Load. This is complicated by loading only the reads that have overlaps we care about.
-
- fl->readsLen = 0;
- fl->basesLen = 0;
+ // Load the sequence data for reads loID to hiID, as long as the read has an overlap.
gkReadData *readData = new gkReadData;
- ii = 0;
- fi = G->olaps[nextOlap].b_iid;
-
- assert(loID <= fi);
+ fl->readsLen = 0;
+ fl->basesLen = 0;
- while (fi <= hiID) {
- gkRead *read = gkpStore->gkStore_getRead(fi);
+ while ((loID <= hiID) &&
+ (nextOlap < G->olapsLen)) {
+ gkRead *read = gkpStore->gkStore_getRead(loID);
- fl->readIDs[ii] = fi;
- fl->readBases[ii] = fl->bases + fl->basesLen;
- fl->basesLen += read->gkRead_sequenceLength() + 1;
+ fl->readIDs[fl->readsLen] = loID; // Save the ID of _this_ read.
+ fl->readBases[fl->readsLen] = fl->bases + fl->basesLen; // Set the data pointer to where this read should start.
gkpStore->gkStore_loadReadData(read, readData);
@@ -160,31 +155,25 @@ Extract_Needed_Frags(feParameters *G,
char *readBases = readData->gkReadData_getSequence();
for (uint32 bb=0; bb<readLen; bb++)
- fl->readBases[ii][bb] = filter[readBases[bb]];
+ fl->readBases[fl->readsLen][bb] = filter[readBases[bb]];
- fl->readBases[ii][readLen] = 0; // All good reads end.
+ fl->readBases[fl->readsLen][readLen] = 0; // All good reads end.
- ii++;
+ fl->basesLen += read->gkRead_sequenceLength() + 1; // Update basesLen to account for this read.
+ fl->readsLen += 1; // And note that we loaded a read.
- // Advance to the next overlap.
-
- nextOlap++;
- while ((nextOlap < G->olapsLen) && (G->olaps[nextOlap].b_iid == fi))
+ nextOlap++; // Advance past all the overlaps for this read.
+ while ((nextOlap < G->olapsLen) &&
+ (G->olaps[nextOlap].b_iid == loID))
nextOlap++;
- fi = (nextOlap < G->olapsLen) ? G->olaps[nextOlap].b_iid : hiID + 1;
+
+ if (nextOlap < G->olapsLen) // If we have valid overlap, grab the read ID.
+ loID = G->olaps[nextOlap].b_iid; // If we don't have a valid overlap, the loop will stop.
}
delete readData;
- fl->readsLen = ii;
-
- if (fl->readsLen > 0)
- fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%). Loaded IDs " F_U32 " through " F_U32 ".\n",
- fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID),
- fl->readIDs[0], fl->readIDs[fl->readsLen-1]);
- else
- fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%).\n",
- fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID));
+ fprintf(stderr, "extractReads()-- Loaded.\n");
}
@@ -262,8 +251,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
for (uint32 i=0; i<G->numThreads; i++) {
thread_wa[i].thread_id = i;
- thread_wa[i].loID = 0;
- thread_wa[i].hiID = 0;
thread_wa[i].nextOlap = 0;
thread_wa[i].G = G;
thread_wa[i].frag_list = NULL;
@@ -278,14 +265,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
thread_wa[i].ped.initialize(G, G->errorRate);
}
- uint32 loID = G->olaps[0].b_iid;
- uint32 hiID = loID + FRAGS_PER_BATCH - 1;
-
- uint32 endID = G->olaps[G->olapsLen - 1].b_iid;
-
- if (hiID > endID)
- hiID = endID;
-
uint64 frstOlap = 0;
uint64 nextOlap = 0;
@@ -295,15 +274,13 @@ Threaded_Stream_Old_Frags(feParameters *G,
Frag_List_t *curr_frag_list = &frag_list_1;
Frag_List_t *next_frag_list = &frag_list_2;
- Extract_Needed_Frags(G, gkpStore, loID, hiID, curr_frag_list, nextOlap);
+ extractReads(G, gkpStore, curr_frag_list, nextOlap);
- while (loID <= endID) {
+ while (curr_frag_list->readsLen > 0) {
// Process fragments in curr_frag_list in background
for (uint32 i=0; i<G->numThreads; i++) {
- thread_wa[i].loID = loID;
- thread_wa[i].hiID = hiID;
thread_wa[i].nextOlap = frstOlap;
thread_wa[i].frag_list = curr_frag_list;
@@ -315,21 +292,14 @@ Threaded_Stream_Old_Frags(feParameters *G,
// Read next batch of fragments
- loID = hiID + 1;
-
- if (loID <= endID) {
- hiID = loID + FRAGS_PER_BATCH - 1;
+ frstOlap = nextOlap;
- if (hiID > endID)
- hiID = endID;
-
- frstOlap = nextOlap;
-
- Extract_Needed_Frags(G, gkpStore, loID, hiID, next_frag_list, nextOlap);
- }
+ extractReads(G, gkpStore, next_frag_list, nextOlap);
// Wait for background processing to finish
+ fprintf(stderr, "processReads()-- Waiting for compute.\n");
+
for (uint32 i=0; i<G->numThreads; i++) {
void *ptr;
=====================================
src/overlapErrorAdjustment/findErrors.H
=====================================
--- a/src/overlapErrorAdjustment/findErrors.H
+++ b/src/overlapErrorAdjustment/findErrors.H
@@ -72,10 +72,6 @@ using namespace std;
// Factor by which to grow memory in olap array when reading it
#define EXPANSION_FACTOR 1.4
-// Number of old fragments to read into memory-based fragment
-// store at a time for processing
-#define FRAGS_PER_BATCH 100000
-
// Longest name allowed for a file in the overlap store
#define MAX_FILENAME_LEN 1000
@@ -273,8 +269,6 @@ public:
struct Thread_Work_Area_t {
int32 thread_id;
- uint32 loID;
- uint32 hiID;
uint64 nextOlap;
feParameters *G;
=====================================
src/pipelines/canu/Defaults.pm
=====================================
--- a/src/pipelines/canu/Defaults.pm
+++ b/src/pipelines/canu/Defaults.pm
@@ -682,7 +682,7 @@ sub setOverlapDefaults ($$$) {
setOverlapDefault($tag, "OvlHashBlockLength", undef, "Amount of sequence (bp) to load into the overlap hash table");
setOverlapDefault($tag, "OvlRefBlockLength", undef, "Amount of sequence (bp) to search against the hash table per batch");
- setOverlapDefault($tag, "OvlHashBits", ($tag eq "cor") ? 18 : 23, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength");
+ setOverlapDefault($tag, "OvlHashBits", undef, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength");
setOverlapDefault($tag, "OvlHashLoad", 0.75, "Maximum hash table load. If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75");
setOverlapDefault($tag, "OvlMerSize", ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps");
setOverlapDefault($tag, "OvlMerThreshold", "auto", "K-mer frequency threshold; mers more frequent than this count are ignored; default 'auto'");
=====================================
src/pipelines/canu/OverlapErrorAdjustment.pm
=====================================
--- a/src/pipelines/canu/OverlapErrorAdjustment.pm
+++ b/src/pipelines/canu/OverlapErrorAdjustment.pm
@@ -58,9 +58,13 @@ use canu::Grid_Cloud;
sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
my $asm = shift @_;
my $maxID = shift @_;
+
my $rlVec = shift @_;
+ my $rlCnt = 0;
my $rlSum = 0;
+
my $noVec = shift @_;
+ my $noCnt = 0;
my $noSum = 0;
my $bin = getBinDirectory();
@@ -75,13 +79,19 @@ sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
while (<F>) {
s/^\s+//;
s/\s+$//;
+
+ next if (m/^readID/); # Header line
+ next if (m/^------/); # ------ ----
+
my @v = split '\s+', $_;
+
vec($$rlVec, $v[0], 32) = $v[2];
+ $rlCnt += 1;
$rlSum += $v[2];
}
close(F);
- caExit("Failed to load read lengths from '$asm.gkpStore'", undef) if ($rlSum == 0);
+ caExit("Failed to load read lengths from '$asm.gkpStore'", undef) if ($rlCnt == 0);
fetchStore("unitigging/$asm.ovlStore");
@@ -92,13 +102,16 @@ sub loadReadLengthsAndNumberOfOverlaps ($$$$) {
while (<F>) {
s/^\s+//;
s/\s+$//;
+
my @v = split '\s+', $_;
+
vec($$noVec, $v[0], 32) = $v[1];
+ $noCnt += 1;
$noSum += $v[1];
}
close(F);
- caExit("Failed to load number of overlaps per read from '$asm.ovlStore'", undef) if ($noSum == 0);
+ caExit("Failed to load number of overlaps per read from '$asm.ovlStore'", undef) if ($noCnt == 0);
return($rlSum, $noSum);
}
@@ -127,25 +140,24 @@ sub readErrorDetectionConfigure ($) {
my ($rlVec, $noVec);
my ($rlSum, $noSum) = loadReadLengthsAndNumberOfOverlaps($asm, $maxID, \$rlVec, \$noVec);
+ if ($noSum == 0) {
+ print STDERR "--\n";
+ print STDERR "-- WARNING:\n";
+ print STDERR "-- WARNING: Found no overlaps. Disabling Overlap Error Adjustment.\n";
+ print STDERR "-- WARNING:\n";
+ print STDERR "--\n";
+ setGlobal("enableOEA", 0);
+ return;
+ }
+
# Find the maximum size of each block of 100,000 reads. findErrors reads up to 100,000 reads
# to process at one time. It uses 1 * length + 4 * 100,000 bytes of memory for bases and ID storage,
# and has two buffers of this size.
- my $maxBlockSize = 0;
-
- for (my $id = 1; $id <= $maxID; $id += 100000) {
- my $sum = 0;
-
- for (my $ii=$id; ($ii < $id + 100000) && ($ii < $maxID); $ii++) {
- $sum += vec($rlVec, $ii, 32);
- }
-
- $maxBlockSize = $sum if ($maxBlockSize < $sum);
- }
-
- my $maxMem = getGlobal("redMemory") * 1024 * 1024 * 1024;
- my $maxReads = getGlobal("redBatchSize");
- my $maxBases = getGlobal("redBatchLength");
+ my $maxBlockSize = 512 * 1024 * 1024; # This is hardcoded in findErrors.C
+ my $maxMem = getGlobal("redMemory") * 1024 * 1024 * 1024;
+ my $maxReads = getGlobal("redBatchSize");
+ my $maxBases = getGlobal("redBatchLength");
print STDERR "--\n";
print STDERR "-- Configure RED for ", getGlobal("redMemory"), "gb memory.\n";
@@ -426,9 +438,6 @@ sub overlapErrorAdjustmentConfigure ($) {
my $nj = 0;
- # get earliest count of reads in store
- my $maxID = getNumberOfReadsEarliestVersion($asm);
-
my $maxMem = getGlobal("oeaMemory") * 1024 * 1024 * 1024;
my $maxReads = getGlobal("oeaBatchSize");
my $maxBases = getGlobal("oeaBatchLength");
=====================================
src/stores/ovStore.C
=====================================
--- a/src/stores/ovStore.C
+++ b/src/stores/ovStore.C
@@ -54,6 +54,11 @@
#include "ovStore.H"
+#include <vector>
+#include <algorithm>
+
+using namespace std;
+
ovStore::ovStore(const char *path, gkStore *gkp) {
@@ -525,12 +530,42 @@ ovStore::numOverlapsPerRead(uint32 numReads) {
+class evalueFileMap {
+public:
+ evalueFileMap() {
+ _name = NULL;
+ _bgnID = UINT32_MAX;
+ _endID = 0;
+ _Nolap = 0;
+ }
+
+ char *_name;
+
+ uint32 _bgnID;
+ uint32 _endID;
+ uint64 _Nolap;
+};
+
+bool
+operator<(evalueFileMap const &a, evalueFileMap const &b) {
+ return(a._bgnID < b._bgnID);
+};
+
+
+
void
ovStore::addEvalues(vector<char *> &fileList) {
- char name[FILENAME_MAX];
- snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);
+ char evalueName[FILENAME_MAX+1];
+ char evalueTemp[FILENAME_MAX+1];
- // If we have an opened memory mapped file, close it.
+ // Handy to have the name of the file we're working with.
+
+ snprintf(evalueTemp, FILENAME_MAX, "%s/evalues.WORKING", _storePath);
+ snprintf(evalueName, FILENAME_MAX, "%s/evalues", _storePath);
+
+ // If we have an opened memory mapped file, close it. There _shouldn't_ be one,
+ // as it would exist only if evalues were already added, but it might. And if it
+ // does exist, nuke it from disk too (well, not quite yet).
if (_evaluesMap) {
delete _evaluesMap;
@@ -539,76 +574,78 @@ ovStore::addEvalues(vector<char *> &fileList) {
_evalues = NULL;
}
- // Allocate space for the evalues.
+ if (AS_UTL_fileExists(evalueName) == true) {
+ fprintf(stderr, "WARNING:\n");
+ fprintf(stderr, "WARNING: existing evalue file will be overwritten!\n");
+ fprintf(stderr, "WARNING:\n");
+ }
- _evalues = new uint16 [_info.numOverlaps()];
+ // Scan each file, finding the first overlap in it.
- // Remove a bogus evalues file if one exists.
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Scanning.\n");
- if ((AS_UTL_fileExists(name) == true) &&
- (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info.numOverlaps()))) {
- fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes. Removing.\n",
- (sizeof(uint16) * _info.numOverlaps()), AS_UTL_sizeOfFile(name));
- AS_UTL_unlink(name);
- }
+ evalueFileMap *emap = new evalueFileMap [fileList.size()];
- // Clear the evalues.
+ for (uint32 ii=0; ii<fileList.size(); ii++) {
+ emap[ii]._name = fileList[ii];
+ emap[ii]._bgnID = 0;
+ emap[ii]._endID = 0;
+ emap[ii]._Nolap = 0;
- for (uint64 ii=0; ii<_info.numOverlaps(); ii++)
- _evalues[ii] = UINT16_MAX;
+ FILE *E = AS_UTL_openInputFile(fileList[ii]);
- // For each file in the fileList, open it, read the header (bgnID, endID and
- // number of values), load the evalues, then copy this data to the actual
- // evalues file.
+ AS_UTL_safeRead(E, &emap[ii]._bgnID, "bgnID", sizeof(uint32), 1);
+ AS_UTL_safeRead(E, &emap[ii]._endID, "endID", sizeof(uint32), 1);
+ AS_UTL_safeRead(E, &emap[ii]._Nolap, "Nolap", sizeof(uint64), 1);
- for (uint32 i=0; i<fileList.size(); i++) {
- uint32 bgnID = 0;
- uint32 endID = 0;
- uint64 len = 0;
+ AS_UTL_closeFile(E);
- errno = 0;
- FILE *fp = fopen(fileList[i], "r");
- if (errno)
- fprintf(stderr, "Failed to open evalues file '%s': %s\n", fileList[i], strerror(errno)), exit(1);
+ fprintf(stderr, " '%s' covers reads %7" F_U32P "-%-7" F_U32P " with %10" F_U64P " overlaps.\n",
+ emap[ii]._name, emap[ii]._bgnID, emap[ii]._endID, emap[ii]._Nolap);
+ }
- AS_UTL_safeRead(fp, &bgnID, "loid", sizeof(uint32), 1);
- AS_UTL_safeRead(fp, &endID, "hiid", sizeof(uint32), 1);
- AS_UTL_safeRead(fp, &len, "len", sizeof(uint64), 1);
+ // Sort the emap by starting read.
- // Figure out the overlap ID for the first overlap associated with bgnID
+ sort(emap, emap + fileList.size());
- setRange(bgnID, endID);
+ // Check that all IDs are present.
- // Load data directly into the evalue array
+ for (uint32 ii=1; ii<fileList.size(); ii++) {
+ if (emap[ii-1]._endID < emap[ii]._bgnID)
+ fprintf(stderr, "Discontinuity between files '%s' and '%s'.\n", emap[ii-1]._name, emap[ii]._name);
+ }
- fprintf(stderr, "- Loading evalues from '%s' -- ID range " F_U32 "-" F_U32 " with " F_U64 " overlaps\n",
- fileList[i], bgnID, endID, len);
+ // Now just copy the new evalues to the real evalues file. The inputs two 32-bit words and
+ // a 64-bit word at the start we need to ignore. That's 8 16-bit words.
- AS_UTL_safeRead(fp, _evalues + _offt._overlapID, "evalues", sizeof(uint16), len);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Merging.\n");
- AS_UTL_closeFile(fp, fileList[i]);
- }
+ FILE *EO = AS_UTL_openOutputFile(evalueTemp);
- // Write the evalues to disk.
+ for (uint32 ii=0; ii<fileList.size(); ii++) {
+ uint16 *ev = new uint16 [emap[ii]._Nolap + 8];
- fprintf(stderr, "Saving evalues file for " F_U64 " overlaps.\n", _info.numOverlaps());
+ AS_UTL_loadFile(emap[ii]._name, ev, emap[ii]._Nolap + 8);
- errno = 0;
- FILE *F = fopen(name, "w");
- if (errno)
- fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno)), exit(1);
+ AS_UTL_safeWrite(EO, ev + 8, "evalues", sizeof(uint16), emap[ii]._Nolap);
- AS_UTL_safeWrite(F, _evalues, "evalues", sizeof(uint16), _info.numOverlaps());
+ delete [] ev;
- AS_UTL_closeFile(F, name);
+ fprintf(stderr, " '%s' covers reads %7" F_U32P "-%-7" F_U32P "; %10" F_U64P " with overlaps.\n",
+ emap[ii]._name, emap[ii]._bgnID, emap[ii]._endID, emap[ii]._Nolap);
+ }
- // Clean up, and reopen the file. Usually, we just delete the store after
- // values are loaded, so this is pointless.
+ AS_UTL_closeFile(EO, evalueTemp);
- delete [] _evalues;
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Renaming.\n");
- // Open the evalues file if it isn't already opened
+ AS_UTL_unlink(evalueName);
+ AS_UTL_rename(evalueTemp, evalueName);
- _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readOnly);
- _evalues = (uint16 *)_evaluesMap->get(0);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Success!\n");
+ fprintf(stderr, "\n");
}
View it on GitLab: https://salsa.debian.org/med-team/canu/commit/608e890aea0715b35fe55c634be918438223168e
--
View it on GitLab: https://salsa.debian.org/med-team/canu/commit/608e890aea0715b35fe55c634be918438223168e
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180717/c6963c51/attachment-0001.html>
More information about the debian-med-commit
mailing list