[med-svn] [Git][med-team/last-align][upstream] New upstream version 1256

Sun Aug 22 20:38:44 BST 2021


Nilesh Patra pushed to branch upstream at Debian Med / last-align


Commits:
b3ca634d by Nilesh Patra at 2021-08-23T00:56:26+05:30
New upstream version 1256
- - - - -


4 changed files:

- bin/last-dotplot
- doc/last-dotplot.rst
- src/makefile
- src/mcf_frameshift_xdrop_aligner.cc


Changes:

=====================================
bin/last-dotplot
=====================================
@@ -589,112 +589,121 @@ def expandedSeqDict(seqDict):
             newDict[base] = x
     return newDict
 
-def readBed(fileName, rangeDict):
-    for line in myOpen(fileName):
-        w = line.split()
-        if not w: continue
-        if not w[1].isdigit():
-            w = w[1:]
-        seqName = w[0]
-        if seqName not in rangeDict: continue
-        seqRanges = rangeDict[seqName]
-        beg = int(w[1])
-        end = int(w[2])
-        if all(beg >= i[2] or end <= i[1] for i in seqRanges):
-            continue
-        itemName = w[3] if len(w) > 3 and w[3] != "." else ""
+def commaSeparatedInts(text):
+    return map(int, text.rstrip(",").split(","))
+
+def annotsFromBedOrAgp(opts, rangeDict, fields):
+    seqName = fields[0]
+    if seqName not in rangeDict: return
+    end = int(fields[2])
+    if len(fields) > 7 and fields[4].isalpha():  # agp format, or gap.txt
+        if fields[4] in "NU" and fields[5].isdigit():
+            beg = end - int(fields[5])  # zero-based coordinate
+            if fields[7] == "yes":
+                yield 30000, opts.bridged_color, seqName, beg, end, ""
+            else:
+                yield 20000, opts.unbridged_color, seqName, beg, end, ""
+    else:  # BED format
+        beg = int(fields[1])
+        itemName = fields[3] if len(fields) > 3 and fields[3] != "." else ""
         layer = 900
         color = "#fbf"
-        if len(w) > 4:
-            if w[4] != ".":
-                layer = float(w[4])
-            if len(w) > 5:
-                if len(w) > 8 and w[8].count(",") == 2:
-                    color = "rgb(" + w[8] + ")"
+        if len(fields) > 4:
+            if fields[4] != ".":
+                layer = float(fields[4])
+            if len(fields) > 5:
+                if len(fields) > 8 and fields[8].count(",") == 2:
+                    color = "rgb(" + fields[8] + ")"
                 else:
-                    strand = w[5]
-                    isRev = (seqRanges[0][3] > 1)
+                    strand = fields[5]
+                    isRev = (rangeDict[seqName][0][3] > 1)
                     if strand == "+" and not isRev or strand == "-" and isRev:
                         color = "#ffe8e8"
                     if strand == "-" and not isRev or strand == "+" and isRev:
                         color = "#e8e8ff"
         yield layer, color, seqName, beg, end, itemName
 
-def commaSeparatedInts(text):
-    return map(int, text.rstrip(",").split(","))
-
-def readGenePred(opts, fileName, rangeDict):
-    for line in myOpen(fileName):
-        fields = line.split()  # xxx tab ???
-        if not fields: continue
-        geneName = fields[12 if len(fields) > 12 else 0]  # XXX ???
-        if fields[2] not in "+-":
-            fields = fields[1:]
-        seqName = fields[1]
-        if seqName not in rangeDict: continue
-        seqRanges = rangeDict[seqName]
-        #strand = fields[2]
+def annotsFromGpd(opts, rangeDict, fields, geneName):  # xxx split on tabs?
+    seqName = fields[1]
+    if seqName in rangeDict:
         cdsBeg = int(fields[5])
         cdsEnd = int(fields[6])
         exonBegs = commaSeparatedInts(fields[8])
         exonEnds = commaSeparatedInts(fields[9])
         for beg, end in zip(exonBegs, exonEnds):
-            if all(beg >= i[2] or end <= i[1] for i in seqRanges):
-                continue
             yield 300, opts.exon_color, seqName, beg, end, geneName
             b = max(beg, cdsBeg)
             e = min(end, cdsEnd)
             if b < e: yield 400, opts.cds_color, seqName, b, e, ""
 
-def readRmsk(fileName, rangeDict):
-    for line in myOpen(fileName):
-        fields = line.split()
-        if len(fields) == 17:  # rmsk.txt
-            seqName = fields[5]
-            if seqName not in rangeDict: continue  # do this ASAP for speed
-            beg = int(fields[6])
-            end = int(fields[7])
-            strand = fields[9]
-            repeatName = fields[10]
-            repeatClass = fields[11]
-        elif len(fields) == 15:  # .out
-            seqName = fields[4]
-            if seqName not in rangeDict: continue
-            beg = int(fields[5]) - 1
-            end = int(fields[6])
-            strand = fields[8]
-            repeatName = fields[9]
-            repeatClass = fields[10].split("/")[0]
-        else:
-            continue
-        seqRanges = rangeDict[seqName]
-        if all(beg >= i[2] or end <= i[1] for i in seqRanges):
-            continue
-        if repeatClass in ("Low_complexity", "Simple_repeat", "Satellite"):
-            yield 200, "#fbf", seqName, beg, end, repeatName
-        elif (strand == "+") != (seqRanges[0][3] > 1):
-            yield 100, "#ffe8e8", seqName, beg, end, repeatName
-        else:
-            yield 100, "#e8e8ff", seqName, beg, end, repeatName
-
-def isExtraFirstGapField(fields):
-    return fields[4].isdigit()
-
-def readGaps(opts, fileName, rangeDict):
-    '''Read locations of unsequenced gaps, from an agp or gap file.'''
-    for line in myOpen(fileName):
-        w = line.split()
-        if not w or w[0][0] == "#": continue
-        if isExtraFirstGapField(w): w = w[1:]
-        if w[4] not in "NU": continue
-        seqName = w[0]
-        if seqName not in rangeDict: continue
-        end = int(w[2])
-        beg = end - int(w[5])  # zero-based coordinate
-        if w[7] == "yes":
-            yield 3000, opts.bridged_color, seqName, beg, end, ""
-        else:
-            yield 2000, opts.unbridged_color, seqName, beg, end, ""
+def annotsFromGff(opts, line, seqName):
+    fields = line.rstrip().split("\t")
+    feature = fields[2]
+    beg = int(fields[3]) - 1
+    end = int(fields[4])
+    if feature == "exon":
+        geneName = fields[8]
+        if ";" in geneName or "=" in geneName:
+            parts = geneName.rstrip(";").split(";")
+            attrs = dict(re.split('[= ]', i.strip(), 1) for i in parts)
+            if "gene" in attrs:
+                geneName = attrs["gene"]  # seems good for NCBI gff
+            elif "Name" in attrs:
+                geneName = attrs["Name"]
+            else:
+                geneName = ""
+        yield 300, opts.exon_color, seqName, beg, end, geneName
+    elif feature == "CDS":
+        yield 400, opts.cds_color, seqName, beg, end, ""
+
+def annotsFromRep(rangeDict, seqName, beg, end, strand, repName, repClass):
+    simple = "Low_complexity", "Simple_repeat", "Satellite"
+    if repClass.startswith(simple):
+        yield 200, "#fbf", seqName, beg, end, repName
+    elif (strand == "+") != (rangeDict[seqName][0][3] > 1):
+        yield 100, "#ffe8e8", seqName, beg, end, repName
+    else:
+        yield 100, "#e8e8ff", seqName, beg, end, repName
+
+def annotsFromFiles(opts, fileNames, rangeDict):
+    isDig = str.isdigit
+    for fileName in fileNames:
+        for line in myOpen(fileName):
+            w = line.split()
+            n = len(w)
+            if n > 10 and w[8] in "+C-" and isDig(w[5]) and isDig(w[6]):
+                seq = w[4]                         # RepeatMasker .out
+                if seq not in rangeDict: continue  # do this ASAP for speed
+                beg = int(w[5]) - 1
+                end = int(w[6])
+                g = annotsFromRep(rangeDict, seq, beg, end, w[8], w[9], w[10])
+            elif n > 11 and w[9] in "+-" and isDig(w[6]) and isDig(w[7]):
+                seq = w[5]                         # rmsk.txt
+                if seq not in rangeDict: continue
+                beg = int(w[6])
+                end = int(w[7])
+                g = annotsFromRep(rangeDict, seq, beg, end, w[9], w[10], w[11])
+            elif n > 8 and w[6] in "+-." and isDig(w[3]) and isDig(w[4]):
+                seqName = w[0]
+                if seqName not in rangeDict: continue
+                g = annotsFromGff(opts, line, seqName)
+            elif n > 9 and w[2] in "+-" and isDig(w[4] + w[5] + w[6]):
+                geneName = w[12 if n > 12 else 0]  # XXX ???
+                g = annotsFromGpd(opts, rangeDict, w, geneName)
+            elif n > 10 and w[3] in "+-" and isDig(w[5] + w[6] + w[7]):
+                geneName = w[12 if n > 12 else 0]  # XXX ???
+                g = annotsFromGpd(opts, rangeDict, w[1:], geneName)
+            elif n > 2 and isDig(w[1]) and isDig(w[2]):
+                g = annotsFromBedOrAgp(opts, rangeDict, w)
+            elif n > 3 and isDig(w[2]) and isDig(w[3]):
+                g = annotsFromBedOrAgp(opts, rangeDict, w[1:])
+            else:
+                continue
+            if line[0] == "#": continue
+            for i in g:
+                layer, color, seqName, beg, end, name = i
+                if any(beg < r[2] and end > r[1] for r in rangeDict[seqName]):
+                    yield i
 
 def bedBoxes(annots, rangeDict, limit, isTop, bpPerPix):
     beds, textSizes, margin = annots
@@ -707,7 +716,7 @@ def bedBoxes(annots, rangeDict, limit, isTop, bpPerPix):
             if beg >= end: continue
             if isReverseRange:
                 beg, end = -end, -beg
-            if layer <= 1000:
+            if layer <= 10000:
                 # include partly-covered pixels
                 pixBeg = (origin + beg) // bpPerPix
                 pixEnd = div_ceil(origin + end, bpPerPix)
@@ -855,13 +864,9 @@ def remainingSequenceRanges(seqRanges, alignments, seqIndex):
     remainingSequences = set(i[seqIndex] for i in alignments)
     return [i for i in seqRanges if i[0] in remainingSequences]
 
-def readAnnotations(opts, font, textDraw, sortedRanges, totalLength,
-                    bedFile, repFile, geneFile, gapFile):
+def readAnnots(opts, font, textDraw, sortedRanges, totalLength, fileNames):
     rangeDict = expandedSeqDict(dict(rangesPerSeq(sortedRanges)))
-    annots = sorted(itertools.chain(readBed(bedFile, rangeDict),
-                                    readRmsk(repFile, rangeDict),
-                                    readGenePred(opts, geneFile, rangeDict),
-                                    readGaps(opts, gapFile, rangeDict)))
+    annots = sorted(annotsFromFiles(opts, fileNames, rangeDict))
     names = set(i[5] for i in annots)
     textSizes = dict(sizesPerText(names, font, textDraw))
     maxTextLength = totalLength // 2
@@ -937,12 +942,12 @@ def lastDotplot(opts, args):
 
     logging.info("reading annotations...")
 
-    annots1 = readAnnotations(opts, font, textDraw, sortedRanges1, opts.height,
-                              opts.bed1, opts.rmsk1, opts.genePred1, opts.gap1)
+    annots1 = readAnnots(opts, font, textDraw, sortedRanges1, opts.height,
+                         opts.bed1)
     bMargin = annots1[-1]
 
-    annots2 = readAnnotations(opts, font, textDraw, sortedRanges2, opts.width,
-                              opts.bed2, opts.rmsk2, opts.genePred2, opts.gap2)
+    annots2 = readAnnots(opts, font, textDraw, sortedRanges2, opts.width,
+                         opts.bed2)
     rMargin = annots2[-1]
 
     maxPixels1 = opts.width  - lMargin - rMargin
@@ -1057,10 +1062,6 @@ if __name__ == "__main__":
     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
                   default=[],
                   help="which sequences to show from the 2nd genome")
-    op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
-                  help="color for forward alignments (default: %default)")
-    op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
-                  help="color for reverse alignments (default: %default)")
     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
     op.add_option("--sort1", default="1", metavar="N",
                   help="genome1 sequence order: 0=input order, 1=name order, "
@@ -1088,11 +1089,12 @@ if __name__ == "__main__":
                   "2=alignments adjacent in genome2 (default=%default)")
     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
                   help="number of pixels between sequences (default=%default)")
-    op.add_option("--border-color", metavar="COLOR", default="black",
-                  help="color for pixels between sequences (default=%default)")
-    # --break-color and/or --break-pixels for intra-sequence breaks?
-    op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
-                  help="margin color")
+    op.add_option("-a", "--bed1", "--rmsk1", "--genePred1", "--gap1",
+                  action="append", default=[], metavar="FILE",
+                  help="read genome1 annotations")
+    op.add_option("-b", "--bed2", "--rmsk2", "--genePred2", "--gap2",
+                  action="append", default=[], metavar="FILE",
+                  help="read genome2 annotations")
 
     og = optparse.OptionGroup(op, "Text options")
     og.add_option("-f", "--fontfile", metavar="FILE",
@@ -1111,33 +1113,20 @@ if __name__ == "__main__":
                   help="text rotation for the 2nd genome (default=%default)")
     op.add_option_group(og)
 
-    og = optparse.OptionGroup(op, "Annotation options")
-    og.add_option("--bed1", metavar="FILE",
-                  help="read genome1 annotations from BED file")
-    og.add_option("--bed2", metavar="FILE",
-                  help="read genome2 annotations from BED file")
-    og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
-                  "RepeatMasker .out or rmsk.txt file")
-    og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
-                  "RepeatMasker .out or rmsk.txt file")
-    op.add_option_group(og)
-
-    og = optparse.OptionGroup(op, "Gene options")
-    og.add_option("--genePred1", metavar="FILE",
-                  help="read genome1 genes from genePred file")
-    og.add_option("--genePred2", metavar="FILE",
-                  help="read genome2 genes from genePred file")
+    og = optparse.OptionGroup(op, "Color options")
+    og.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
+                  help="color for forward alignments (default: %default)")
+    og.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
+                  help="color for reverse alignments (default: %default)")
+    og.add_option("--border-color", metavar="COLOR", default="black",
+                  help="color for pixels between sequences (default=%default)")
+    # --break-color and/or --break-pixels for intra-sequence breaks?
+    og.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
+                  help="margin color")
     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
                   help="color for exons (default=%default)")
     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
                   help="color for protein-coding regions (default=%default)")
-    op.add_option_group(og)
-
-    og = optparse.OptionGroup(op, "Unsequenced gap options")
-    og.add_option("--gap1", metavar="FILE",
-                  help="read genome1 unsequenced gaps from agp or gap file")
-    og.add_option("--gap2", metavar="FILE",
-                  help="read genome2 unsequenced gaps from agp or gap file")
     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
                   help="color for bridged gaps (default: %default)")
     og.add_option("--unbridged-color", metavar="COLOR", default="orange",


=====================================
doc/last-dotplot.rst
=====================================
@@ -2,7 +2,7 @@ last-dotplot
 ============
 
 This script makes a dotplot, a.k.a. Oxford Grid, of pair-wise sequence
-alignments in MAF or LAST tabular format.  It requires the `Python
+alignments in MAF_ or LAST tabular format.  It requires the `Python
 Imaging Library`_ to be installed.  It can be used like this::
 
   last-dotplot my-alignments my-plot.png
@@ -11,6 +11,10 @@ The output can be in any format supported by the Imaging Library::
 
   last-dotplot alns alns.gif
 
+It can read alignments from a pipe like this::
+
+  ... | last-dotplot - my-plot.png
+
 Terminology
 -----------
 
@@ -37,10 +41,14 @@ Options
     Which sequences to show from the 1st (horizontal) genome.
 -2 PATTERN, --seq2=PATTERN
     Which sequences to show from the 2nd (vertical) genome.
--c COLOR, --forwardcolor=COLOR
-    Color for forward alignments.
--r COLOR, --reversecolor=COLOR
-    Color for reverse alignments.
+-a FILE
+    Read annotations for the 1st (horizontal) genome, and draw them as
+    vertical stripes.  For backwards compatibility, these options have
+    the same meaning: ``--bed1 --rmsk1 --genePred1 --gap1``.
+-b FILE
+    Read annotations for the 2nd (vertical) genome, and draw them as
+    horizontal stripes.  For backwards compatibility, these options
+    have the same meaning: ``--bed2 --rmsk2 --genePred2 --gap2``.
 --alignments=FILE
     Read secondary alignments.  For example: we could use primary
     alignment data with one human DNA read aligned to the human
@@ -91,10 +99,6 @@ Options
     in the 2nd (vertical) genome.
 --border-pixels=INT
     Number of pixels between sequences.
---border-color=COLOR
-    Color for pixels between sequences.
---margin-color=COLOR
-    Color for the margins.
 
 Text options
 ~~~~~~~~~~~~
@@ -116,58 +120,49 @@ Text options
 --rot2=ROT
     Text rotation for the 2nd genome: h(orizontal) or v(ertical).
 
-Annotation options
-~~~~~~~~~~~~~~~~~~
-
-These options read annotations of sequence segments, and draw them as
-colored horizontal or vertical stripes.  This looks good only if the
-annotations are reasonably sparse: e.g. you can't sensibly view 20000
-gene annotations in one small dotplot.
-
---bed1=FILE
-    Read BED-format_ annotations for the 1st genome.  They are drawn
-    as stripes, with coordinates given by the first three BED fields.
-    The color is specified by the RGB field if present, else pale red
-    if the strand is "+", pale blue if "-", or pale purple.
---bed2=FILE
-    Read BED-format annotations for the 2nd genome.
---rmsk1=FILE
-    Read repeat annotations for the 1st genome, in RepeatMasker .out
-    or rmsk.txt format.  The color is pale purple for "low
-    complexity", "simple repeats", and "satellites", else pale red
-    for "+" strand and pale blue for "-" strand.
---rmsk2=FILE
-    Read repeat annotations for the 2nd genome.
-
-Gene options
-~~~~~~~~~~~~
+Color options
+~~~~~~~~~~~~~
 
---genePred1=FILE
-    Read gene annotations for the 1st genome in `genePred format`_.
---genePred2=FILE
-    Read gene annotations for the 2nd genome.
+-c COLOR, --forwardcolor=COLOR
+    Color for forward alignments.
+-r COLOR, --reversecolor=COLOR
+    Color for reverse alignments.
+--border-color=COLOR
+    Color for pixels between sequences.
+--margin-color=COLOR
+    Color for the margins.
 --exon-color=COLOR
     Color for exons.
 --cds-color=COLOR
     Color for protein-coding regions.
+--bridged-color=COLOR
+    Color for unsequenced gaps with "yes" evidence of linkage.
+--unbridged-color=COLOR
+    Color for unsequenced gaps with "no" evidence of linkage.
 
-Unsequenced gap options
-~~~~~~~~~~~~~~~~~~~~~~~
+Annotations
+-----------
 
-Note: these "gaps" are *not* alignment gaps (indels): they are regions
-of unknown sequence.
+Options ``-a`` and ``-b`` can read annotations in these formats:
 
---gap1=FILE
-    Read unsequenced gaps in the 1st genome from an agp or gap file.
---gap2=FILE
-    Read unsequenced gaps in the 2nd genome from an agp or gap file.
---bridged-color=COLOR
-    Color for bridged gaps.
---unbridged-color=COLOR
-    Color for unbridged gaps.
+* BED_: The color is specified by the RGB field if present, else pale
+  red if the strand is "+", pale blue if "-", or pale purple.  BED
+  lines with higher score are drawn on top of ones with lower score.
+
+* Repeatmasker_ .out, rmsk.txt: The color is pale purple for "low
+  complexity", "simple repeats", and "satellites", else pale red for
+  "+" strand and pale blue for "-" strand.
+
+* genePred_, GFF/GTF: Exons are shown in green, with a darker shade
+  for protein-coding regions.
+
+* AGP_, gap.txt: Unsequenced gaps are shown, but only if the gap
+  covers at least one whole pixel.
 
-An unsequenced gap will be shown only if it covers at least one whole
-pixel.
+You can use these options multiple times, e.g. ``-a stuff.bed -a
+more.bed -a rmsk.txt``.  Annotations look good only if reasonably
+sparse, e.g. you can't sensibly view 20000 gene annotations in one
+small dotplot.
 
 Choosing sequences
 ------------------
@@ -224,5 +219,8 @@ Colors can be specified in `various ways described here
 <https://pillow.readthedocs.io/en/stable/reference/ImageColor.html>`_.
 
 .. _Python Imaging Library: https://pillow.readthedocs.io/
-.. _BED-format: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
-.. _genePred format: https://genome.ucsc.edu/FAQ/FAQformat.html#format9
+.. _MAF: https://genome.ucsc.edu/FAQ/FAQformat.html#format5
+.. _BED: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
+.. _genePred: https://genome.ucsc.edu/FAQ/FAQformat.html#format9
+.. _RepeatMasker: http://www.repeatmasker.org/
+.. _AGP: https://www.ncbi.nlm.nih.gov/assembly/agp/


=====================================
src/makefile
=====================================
@@ -143,7 +143,7 @@ ScoreMatrixData.hh: ../data/*.mat
 	../build/mat-inc.sh ../data/*.mat > $@
 
 VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1254) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1256) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
 
 VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
 


=====================================
src/mcf_frameshift_xdrop_aligner.cc
=====================================
@@ -19,46 +19,59 @@ double FrameshiftXdropAligner::forward(const uchar *protein,
 				       const const_dbl_ptr *substitutionProbs,
 				       const GapCosts &gapCosts,
 				       double probDropLimit) {
+  const int seqIncrement = isRightwardExtension ? 1 : -1;
   if (isRightwardExtension) {
     --protein; --frame0; --frame1; --frame2;
   }
   proteinPtr = protein;
-  frames[0] = frame0;
+  frames[0] = frame0 + seqIncrement;
   frames[1] = frame1;
   frames[2] = frame2;
-  const int seqIncrement = isRightwardExtension ? 1 : -1;
-
-  const double delOpenProb = gapCosts.delProbPieces[0].openProb;
-  const double insOpenProb = gapCosts.insProbPieces[0].openProb;
-  const double delProb1 = gapCosts.delProb1;
-  const double delProb2 = gapCosts.delProb2;
-  const double delProb3 = gapCosts.delProb3;
-  const double insProb1 = gapCosts.insProb1;
-  const double insProb2 = gapCosts.insProb2;
-  const double insProb3 = gapCosts.insProb3;
 
-  int runOfDrops = 2;
-  int runOfEdges = 0;
-  int numCells = 1;
+  int numCells = 1;  // number of DynProg cells in this antidiagonal
   size_t proteinEnd = 1;
-  size_t diagPos6 = padSize - 1;  // xxx or 0 ?
+  size_t diagPos6 = 2 * padSize - 1;
   size_t horiPos5 = diagPos6 + padSize;
   size_t horiPos4 = horiPos5 + padSize;
   size_t horiPos3 = horiPos4 + padSize;
   size_t vertPos2 = horiPos3 + padSize + 1;
   size_t vertPos1 = vertPos2 + padSize;
-  size_t thisPos  = vertPos1;  // xxx or + padSize ?
-  size_t antidiagonal = 0;
+  size_t thisPos  = vertPos1 + numCells;
+  size_t antidiagonal = 1;
+
+  if (xdropShape.empty()) {
+    xdropShape.resize(3);
+    xdropShape[0] = vertPos2;
+    xdropShape[1] = proteinEnd;
+    xdropShape[2] = thisPos;
+    xFwdProbs.resize(thisPos);
+    yFwdProbs.resize(thisPos);
+    zFwdProbs.resize(thisPos);
+    xFwdProbs[vertPos1] = 1;
+    yFwdProbs[vertPos1] = 1;
+    zFwdProbs[vertPos1] = 1;
+  }
 
-  double sumOfProbRatios = 0;
+  double sumOfProbRatios = 1;
   double logSumOfProbRatios = 0;
 
-  if (xdropShape.empty()) xdropShape.assign(1, thisPos);
+  if (substitutionProbs[0][*(frames[0])] <= 0) {  // at end of DNA sequence
+    numOfAntidiagonals = 1;
+    rescaledSumOfProbRatios = 1;
+    return 0;
+  }
+
+  const double delOpenProb = gapCosts.delProbPieces[0].openProb;
+  const double insOpenProb = gapCosts.insProbPieces[0].openProb;
+  const double delProb1 = gapCosts.delProb1;
+  const double delProb2 = gapCosts.delProb2;
+  const double delProb3 = gapCosts.delProb3;
+  const double insProb1 = gapCosts.insProb1;
+  const double insProb2 = gapCosts.insProb2;
+  const double insProb3 = gapCosts.insProb3;
 
-  resizeFwdProbsIfSmaller(thisPos);
-  double behindProb = substitutionProbs[*protein][*frame0];
-  assert(behindProb > 0);  // XXX might fail?
-  xFwdProbs[padSize - 1] = 1 / behindProb;  // xxx or padSize ?
+  int runOfDrops = 0;
+  int runOfEdges = (substitutionProbs[*(protein + seqIncrement)][0] > 0);
 
   while (1) {
     const uchar *s1 = proteinPtr;
@@ -147,7 +160,6 @@ double FrameshiftXdropAligner::forward(const uchar *protein,
     }
 
     if (antidiagonal % rescaleStep == 0) {
-      assert(sumOfProbRatios > 0);
       double scale = 1 / sumOfProbRatios;
       size_t numOfRescales = antidiagonal / rescaleStep;
       if (rescales.size() < numOfRescales) rescales.resize(numOfRescales);



View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b3ca634d131e8470dfda044c9aadb1cfbe22c4f9

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b3ca634d131e8470dfda044c9aadb1cfbe22c4f9
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210822/43097e5c/attachment-0001.htm>