[med-svn] [Git][med-team/dnarrange][upstream] New upstream version 1.6.3

Sat Nov 1 16:37:01 GMT 2025


Nilesh Patra pushed to branch upstream at Debian Med / dnarrange


Commits:
5299fb6b by Nilesh Patra at 2025-11-01T21:59:56+05:30
New upstream version 1.6.3
- - - - -


7 changed files:

- README.md
- dnarrange
- dnarrange-genes
- dnarrange-link
- dnarrange-merge
- last-multiplot
- setup.py


Changes:

=====================================
README.md
=====================================
@@ -2,9 +2,10 @@
 
 This is a method to find rearrangements in "long" DNA reads relative
 to a genome sequence.  It can characterize changes such as chromosome
-shattering, gene conversion, and processed pseudogene insertion.  For
-more details, please see: [A pipeline for complete characterization of
-complex germline rearrangements from long DNA reads][paper].
+shattering, gene conversion, virus insertion, and processed pseudogene
+insertion.  For more details, please see: [A pipeline for complete
+characterization of complex germline rearrangements from long DNA
+reads][paper].
 
 You can install dnarrange (together with all other software that it
 uses) from [Bioconda][] or [Debian Med][].
@@ -65,6 +66,17 @@ whose "strongest" rearrangement type is shared with a control read,
 where "strength" is defined by: inter-chromosome > inter-strand >
 non-colinear > big gap.
 
+#### Insertions of foreign DNA
+
+You can find insertions of (e.g.) a virus genome into a host genome.
+First, align the DNA reads to a combined "genome" with host and virus
+chromosomes.  Then do:
+
+    dnarrange --insert=VirusSeqName alignments.maf > groups.maf
+
+This will get DNA reads that align partly to the sequence called
+`VirusSeqName` and partly to other sequences.
+
 ### Step 3: Draw pictures of the groups
 
 Draw a picture of each group, showing the read-to-genome alignments,
@@ -81,6 +93,7 @@ This in turn requires the [Python Imaging
 Library](https://pillow.readthedocs.io/) to be installed.
 
 * A useful option is `-a`, to display files of genes, repeats, etc.
+* It shows at most 10 reads per group: you can change that with option `-m`.
 
 Tips for viewing the pictures on a Mac: open the folder in Finder, and
 
@@ -254,11 +267,16 @@ You can also get groups near genes, with option `-o1`:
 - `-m PROB`, `--max-mismap=PROB`: discard any alignment with mismap
   probability > PROB (default=1).
 
+- `--insert=NAME`: find insertions of the sequence with this name.
+
 - `--shrink`: write the output in a compact format.  This format can
   be read by `dnarrange`.
 
 - `-v`, `--verbose`: show progress messages.
 
+- `-w W`, `--width=W`: line-wrap width of group summary lines
+  (default=79).  0 means no wrapping.
+
 ## `dnarrange-merge` options
 
 You can get the rearranged reads, without merging them, like this:


=====================================
dnarrange
=====================================
@@ -1,9 +1,10 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 # Author: Martin C. Frith 2018
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 from __future__ import print_function
 
+import collections
 import functools
 import gzip
 import heapq
@@ -221,6 +222,9 @@ def isBigGap(sortedAlignmentsOfOneQuery, opts):
     return False
 
 def rearrangementType(opts, alignmentsOfOneQuery):
+    if opts.insert:
+        seqNames = set(i[3] for i in alignmentsOfOneQuery)
+        return "I" if len(seqNames) > 1 and opts.insert in seqNames else None
     if "C" in opts.types and isInterChromosome(alignmentsOfOneQuery):
         return "C"
     if "S" in opts.types and isInterStrand(alignmentsOfOneQuery):
@@ -331,6 +335,8 @@ def isAdjacent(alnX, alnY):
 
 def isNonlinearPair(opts, types, alnX, alnY):
     if alnX[3] != alnY[3]:
+        if "I" in types and opts.insert in (alnX[3], alnY[3]):
+            return True
         return "C" in types and isDifferentChromosomes(alnX[3], alnY[3])
     if alnX[8] is not alnY[8]:
         return "S" in types
@@ -428,11 +434,12 @@ def isNoSharedRearrangement(opts, okAlignments, ngAlignments,
 
 def linksBetweenQueries(opts, alignments, alignmentsPerQuery):
     logging.info("linking...")
+    types = "I" if opts.insert else opts.types
     for alignmentsOfOneQuery in alignmentsPerQuery:
         qryNumA = alignmentsOfOneQuery[0][0]
         overlaps = sorted(overlapsOfOneQuery(alignments, alignmentsOfOneQuery))
         for qryNumB, g in groupby(overlaps, itemgetter(0)):
-            strand = sharedRearrangement(opts, opts.types,
+            strand = sharedRearrangement(opts, types,
                                          alignments, alignments, g)
             if strand:
                 yield qryNumA, qryNumB, strand == "-"
@@ -452,52 +459,34 @@ def linksBetweenClumps(alignments, alignmentsPerQuery, clumps):
                     if b < len(clumps) and b != a:
                         yield min(a, b), max(a, b), False
 
-def addJumpsIfShared(opts, alignments, alnA, alnB):
-    if alnA[6] == 0: return
-    alnAX = alignments[alnA[6] - 1]
-    if alnA[8] is alnB[8]:
-        if alnB[6] == 0: return
-        alnBX = alignments[alnB[6] - 1]
-        if (alnAX[8] is alnBX[8] and alnAX[3] == alnBX[3]
-            and alnAX[4] < alnBX[5] and alnAX[5] > alnBX[4]
-            and alnAX[0] == alnA[0] and alnBX[0] == alnB[0]
-            and isSharedRearrangement(opts, alnAX, alnA, alnBX, alnB)):
-            alnA[7].append(alnB[0])
-            alnB[7].append(alnA[0])
-    else:
-        if alnB[6] + 1 == len(alignments): return
-        alnBX = alignments[alnB[6] + 1]
-        if (alnAX[8] is not alnBX[8] and alnAX[3] == alnBX[3]
-            and alnAX[4] < alnBX[5] and alnAX[5] > alnBX[4]
-            and alnAX[0] == alnA[0] and alnBX[0] == alnB[0]
-            and isSharedRearrangement(opts, alnAX, alnA, alnBX, alnB)):
-            alnA[7].append(alnB[0])
-            alnBX[7].append(alnAX[0])
-
-def addSharedJumps(opts, alignments, myAlignmentsInGenomeOrder):
-    logging.info("finding shared jumps...")
+def addSharedJumps(opts, jumpsInGenomeOrder):
     stash = []
-    refName = ""
-    for alnB in myAlignmentsInGenomeOrder:
-        if alnB[3] != refName:
-            n = 0
-            refName = alnB[3]
-        qryNum = alnB[0]
-        refBeg = alnB[4]
+    n = 0
+    for jumpB in jumpsInGenomeOrder:
+        alnBX, alnBY, overlapsB = jumpB
+        qryNumB = alnBX[0]
+        refBegBX = alnBX[4]
+        refBegBY = alnBY[4]
+        refEndBY = alnBY[5]
+        isRevB = (alnBX[6] > alnBY[6])
+        if isRevB: alnBX, alnBY = alnBY, alnBX
         j = 0
-        for alnA in islice(stash, n):
-            if alnA[5] > refBeg:  # overlap in ref
-                if alnA[0] < qryNum:
-                    addJumpsIfShared(opts, alignments, alnA, alnB)
-                elif alnA[0] > qryNum:
-                    addJumpsIfShared(opts, alignments, alnB, alnA)
-                stash[j] = alnA
+        for jumpA in islice(stash, n):
+            alnAX, alnAY, overlapsA = jumpA
+            if alnAX[5] > refBegBX:
+                qryNumA = alnAX[0]
+                if (alnAY[4] < refEndBY and alnAY[5] > refBegBY and
+                    qryNumA != qryNumB):
+                    if isRevB: alnAX, alnAY = alnAY, alnAX
+                    if isSharedRearrangement(opts, alnBX, alnBY, alnAX, alnAY):
+                        overlapsA.append(qryNumB)
+                        overlapsB.append(qryNumA)
+                stash[j] = jumpA
                 j += 1
             else:
-                overlapsA = alnA[7]
                 overlapsA[:] = list(set(overlapsA))  # try to save memory
         if len(stash) == j: stash.append(None)
-        stash[j] = alnB
+        stash[j] = jumpB
         n = j + 1
 
 def isAllJumpsSupported(opts, goodQryNums, alignmentsOfOneQuery):
@@ -516,6 +505,37 @@ def clumpSortKey(alignmentsPerQuery, clump):
     k = min(querySortKey(alignmentsPerQuery[i]) for i, isFlipped in clump)
     return -len(clump), k
 
+def isInsertInRevStrand(insertSeqName, alignmentsOfOneQuery):
+    for j, y in enumerate(alignmentsOfOneQuery):
+        if j:
+            x = alignmentsOfOneQuery[j - 1]
+            if x[3] == insertSeqName and y[3] != insertSeqName:
+                return y[8]
+            if x[3] != insertSeqName and y[3] == insertSeqName:
+                return x[8]
+
+def isFlipQryStrand(opts, alignmentsOfOneQuery):
+    if opts.insert:
+        return isInsertInRevStrand(opts.insert, alignmentsOfOneQuery)
+    return alignmentsOfOneQuery[0][8] and alignmentsOfOneQuery[-1][8]
+
+def insertSortKey(opts, alignmentsPerQuery, clump):
+    shift = 50
+    seqName = opts.insert
+    qryNum, isFlipped = clump[0]
+    alns = alignmentsPerQuery[qryNum]
+    for j, y in enumerate(alns):
+        if j:
+            x = alns[j - 1]
+            xSeq, xBeg, xEnd = x[3:6]
+            ySeq, yBeg, yEnd = y[3:6]
+            if xSeq == seqName and ySeq != seqName:
+                pos = yEnd - shift if y[8] else yBeg + shift
+                return ySeq, pos
+            if xSeq != seqName and ySeq == seqName:
+                pos = xBeg + shift if x[8] else xEnd - shift
+                return xSeq, pos
+
 def isMergedGroupName(qryName):
     return re.match(r"(group|merged?)\d+-", qryName)
 
@@ -533,9 +553,33 @@ def alignmentsInGenomeOrder(alignmentsPerQuery):
     alignments = chain.from_iterable(alignmentsPerQuery)
     return sorted(alignments, key=itemgetter(3, 4))
 
-def alignmentsOfCoveredQueries(opts, alignments, alignmentsPerQuery):
-    myAlignmentsInGenomeOrder = alignmentsInGenomeOrder(alignmentsPerQuery)
-    addSharedJumps(opts, alignments, myAlignmentsInGenomeOrder)
+def jumpsPerChromosomeStrands(alignmentsPerQuery):
+    jumpsDict = collections.defaultdict(list)
+    for alignmentsOfOneQuery in alignmentsPerQuery:
+        x = None
+        for y in alignmentsOfOneQuery:
+            yRefName = y[3]
+            yStrand = y[8]
+            if x:
+                emptyListOfOverlaps = y[7]
+                fwdKey = xStrand, xRefName, yStrand, yRefName
+                revKey = not yStrand, yRefName, not xStrand, xRefName
+                if fwdKey <= revKey:
+                    jumpsDict[fwdKey].append((x, y, emptyListOfOverlaps))
+                if revKey <= fwdKey:
+                    jumpsDict[revKey].append((y, x, emptyListOfOverlaps))
+            x, xRefName, xStrand = y, yRefName, yStrand
+    return jumpsDict
+
+def jumpSortKey(jump):
+    return jump[0][4]
+
+def alignmentsOfCoveredQueries(opts, alignmentsPerQuery):
+    jumpsDict = jumpsPerChromosomeStrands(alignmentsPerQuery)
+    logging.info("finding shared jumps...")
+    for jumpsList in jumpsDict.values():
+        jumpsList.sort(key=jumpSortKey)
+        addSharedJumps(opts, jumpsList)
     while True:
         logging.info("excluding...")
         qryNums = set(i[0][0] for i in alignmentsPerQuery)
@@ -544,7 +588,7 @@ def alignmentsOfCoveredQueries(opts, alignments, alignmentsPerQuery):
         if len(alignmentsPerQuery) == len(qryNums):
             break
         logging.info("queries: " + str(len(alignmentsPerQuery)))
-    delOverlaps(myAlignmentsInGenomeOrder)
+    delOverlaps(chain.from_iterable(alignmentsPerQuery))
     return alignmentsPerQuery
 
 def newStrand(strand, isFlipped):
@@ -558,7 +602,7 @@ def qryNameWithStrand(qryName, isFlipped):
     return newQryName, isAddChar
 
 def printMaf(lines, isFlipped):
-    lines = [re.split(r"(\s+)", i, 5) for i in lines]
+    lines = [re.split(r"(\s+)", i, maxsplit=5) for i in lines]
     sLines = [i for i in lines if i[0] == "s"]
     qryLine = sLines[-1]
     qryName = qryLine[2]
@@ -732,8 +776,7 @@ def main(opts, args):
                                               args[colonEnd:])
 
     if opts.min_cov:
-        alnsPerKeptQuery = alignmentsOfCoveredQueries(opts, alignments,
-                                                      alnsPerKeptQuery)
+        alnsPerKeptQuery = alignmentsOfCoveredQueries(opts, alnsPerKeptQuery)
 
     print("#", os.path.basename(sys.argv[0]), *sys.argv[1:])
     print()
@@ -746,7 +789,7 @@ def main(opts, args):
     adjacencyList = adjacencyListFromLinks(len(alignmentsPerQuery), links)
 
     logging.info("grouping...")
-    isRevStrand = [i[0][8] and i[-1][8] for i in alignmentsPerQuery]
+    isRevStrand = [isFlipQryStrand(opts, i) for i in alignmentsPerQuery]
     qryPriorities = [(-len(i), -alignedQueryLength(j))
                      for i, j in zip(adjacencyList, alignmentsPerQuery)]
     allClumps = connectedComponents(adjacencyList, qryPriorities, isRevStrand)
@@ -755,6 +798,8 @@ def main(opts, args):
     clumps = [i for i in clumps if i[0][0] in keptQueryNums]
     isEachQueryOneMergedGroup = all(map(isMergedGroupName, queryNames))
     sortKey = functools.partial(clumpSortKey, alignmentsPerQuery)
+    if opts.insert:
+        sortKey = functools.partial(insertSortKey, opts, alignmentsPerQuery)
     if isEachQueryOneMergedGroup:
         sortKey = functools.partial(groupSortKey, queryNames)
     clumps.sort(key=sortKey)
@@ -772,7 +817,8 @@ def main(opts, args):
             for newQryName, ranges in s:
                 paddedName = format(newQryName, str(width))
                 para = " ".join([paddedName] + ranges)
-                print(textwrap.fill(para, 79, break_long_words=False,
+                wrapWidth = opts.width if opts.width else len(para) + 2
+                print(textwrap.fill(para, wrapWidth, break_long_words=False,
                                     break_on_hyphens=False,
                                     initial_indent="# ",
                                     subsequent_indent="#  "))
@@ -827,9 +873,13 @@ if __name__ == "__main__":
     op.add_option("-m", "--max-mismap", type="float", default=1.0,
                   metavar="PROB", help="discard any alignment with "
                   "mismap probability > PROB (default=%default)")
+    op.add_option("--insert", metavar="NAME",
+                  help="find insertions of the sequence with this name")
     op.add_option("--shrink", action="count", help="shrink the output")
     op.add_option("-v", "--verbose", action="count",
                   help="show progress messages")
+    op.add_option("-w", "--width", type="int", default=79, metavar="W", help=
+                  "line-wrap width of group summary lines (default=%default)")
     opts, args = op.parse_args()
     if opts.min_cov is None:
         opts.min_cov = 1 if opts.min_seqs > 1 else 0


=====================================
dnarrange-genes
=====================================
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 # Author: Martin C. Frith 2020
 # SPDX-License-Identifier: GPL-3.0-or-later
 


=====================================
dnarrange-link
=====================================
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 # Author: Martin C. Frith 2019
 # SPDX-License-Identifier: GPL-3.0-or-later
 


=====================================
dnarrange-merge
=====================================
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 # Author: Martin C. Frith 2019
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -72,7 +72,10 @@ def main(opts, args):
         if opts.verbose:
             cmd.append("-" + "v" * opts.verbose)
         cmd.append("-P" + str(opts.P))
-        cmd.append("-W" + str(opts.W))
+        if opts.u:
+            cmd.append("-u" + str(opts.u))
+        else:
+            cmd.append("-W" + str(opts.W))
         cmd.append("-m" + str(opts.m))
         cmd.append("-z" + str(opts.z))
         cmd += [args[1], "-"]
@@ -105,6 +108,8 @@ if __name__ == "__main__":
     og = optparse.OptionGroup(op, "LAST options")
     og.add_option("-P", type="int", default=1,
                   help="number of parallel threads (default=%default)")
+    og.add_option("-u", metavar="RY", type="int", help=
+                  "use ~1 per this many initial matches")
     og.add_option("-W", type="int", default=19, help="use minimum positions "
                   "in length-W windows (default=%default)")
     og.add_option("-m", type="int", default=5, help=


=====================================
last-multiplot
=====================================
@@ -30,7 +30,7 @@ for i in "$arg2"/*
 do
     test -e "$i" || continue
 
-    last-dotplot -j2 --sort2=0 --rot1=v --rot2=h --labels1=2 "$@" "$i" "$i".png
+    last-dotplot -m10 -j2 --sort2=0 --rot1=v --rot2=h --labels1=2 "$@" "$i" "$i".png
 
     rm "$i"
 done


=====================================
setup.py
=====================================
@@ -1,6 +1,6 @@
 import setuptools
 
-commitInfo = " (HEAD -> master, tag: 1.5.3)".strip("( )").split()
+commitInfo = " (HEAD -> master, tag: 1.6.3)".strip("( )").split()
 version = commitInfo[commitInfo.index("tag:") + 1].rstrip(",")
 
 setuptools.setup(



View it on GitLab: https://salsa.debian.org/med-team/dnarrange/-/commit/5299fb6b4020bc5475ba27bd6509e84e43baaee0

-- 
View it on GitLab: https://salsa.debian.org/med-team/dnarrange/-/commit/5299fb6b4020bc5475ba27bd6509e84e43baaee0
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251101/73053cca/attachment-0001.htm>