[med-svn] [Git][med-team/parsnp][master] 7 commits: New upstream version 2.0.2+dfsg

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Mon Jan 22 19:34:22 GMT 2024



Étienne Mollier pushed to branch master at Debian Med / parsnp


Commits:
fda2ced3 by Étienne Mollier at 2024-01-22T20:13:22+01:00
New upstream version 2.0.2+dfsg
- - - - -
0cdc13b2 by Étienne Mollier at 2024-01-22T20:13:22+01:00
routine-update: New upstream version

- - - - -
34647124 by Étienne Mollier at 2024-01-22T20:13:23+01:00
Update upstream source from tag 'upstream/2.0.2+dfsg'

Update to upstream version '2.0.2+dfsg'
with Debian dir ef868c683e76e5e9ee6141b8c474e2eb834d0c62
- - - - -
7b94cddc by Étienne Mollier at 2024-01-22T20:23:45+01:00
proper_calls_to_tools.patch: refresh.

- - - - -
a228ab8e by Étienne Mollier at 2024-01-22T20:25:38+01:00
py3-parsnp-libs.patch: refresh.

- - - - -
8412979e by Étienne Mollier at 2024-01-22T20:31:37+01:00
d/control: parsnp depends on python3-pyspoa.

- - - - -
3ea7771b by Étienne Mollier at 2024-01-22T20:33:26+01:00
ready to upload to unstable.

- - - - -


9 changed files:

- debian/changelog
- debian/control
- debian/patches/proper_calls_to_tools.patch
- debian/patches/py3-parsnp-libs.patch
- extend.py
- logger.py
- parsnp
- partition.py
- src/parsnp.cpp


Changes:

=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+parsnp (2.0.2+dfsg-1) unstable; urgency=medium
+
+  * New upstream version 2.0.2+dfsg
+  * proper_calls_to_tools.patch: refresh.
+  * py3-parsnp-libs.patch: refresh.
+  * d/control: parsnp depends on python3-pyspoa.
+
+ -- Étienne Mollier <emollier at debian.org>  Mon, 22 Jan 2024 20:31:56 +0100
+
 parsnp (2.0.1+dfsg-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/control
=====================================
@@ -23,6 +23,7 @@ Depends: ${shlibs:Depends},
          ${python3:Depends},
          python3-biopython,
          python3-numpy,
+         python3-pyspoa,
          python3-tqdm,
          fasttree,
          harvest-tools,


=====================================
debian/patches/proper_calls_to_tools.patch
=====================================
@@ -5,16 +5,16 @@ Forwarded: not-needed
 
 --- parsnp.orig/parsnp
 +++ parsnp/parsnp
-@@ -148,7 +148,7 @@
+@@ -145,7 +145,7 @@
  def run_phipack(query,seqlen,workingdir):
      currdir = os.getcwd()
      os.chdir(workingdir)
 -    command = "Profile -o -v -n %d -w 100 -m 100 -f %s > %s.out"%(seqlen,query,query)
 +    command = "phipack-profile -o -v -n %d -w 100 -m 100 -f %s > %s.out"%(seqlen,query,query)
-     run_command(command,1, prepend_time=True)
+     run_command(command, 1)
      os.chdir(currdir)
  
-@@ -646,7 +646,7 @@
+@@ -643,7 +643,7 @@
          missing = True
          logger.critical("{} not in system path!".format(exe))
      if use_phipack:
@@ -23,7 +23,7 @@ Forwarded: not-needed
          if shutil.which(exe) is None:
              missing = True
              logger.critical("{} not in system path!".format(exe))
-@@ -661,7 +661,7 @@
+@@ -658,7 +658,7 @@
              logger.critical("No fasttree executable found in system path!".format(exe))
          missing = missing or (not has_fasttree)
      else:
@@ -32,7 +32,7 @@ Forwarded: not-needed
          if shutil.which(exe) is None:
              missing = True
              logger.critical("{} not in system path!".format(exe))
-@@ -973,7 +973,7 @@
+@@ -975,7 +975,7 @@
      logger.debug("Writing .ini file")
      if xtrafast or 1:
          args.extend = False
@@ -41,7 +41,7 @@ Forwarded: not-needed
      inifiled = inifiled.replace("$REF", ref)
      inifiled = inifiled.replace("$EXTEND", "%d" % (args.extend))
      inifiled = inifiled.replace("$ANCHORS", str(args.min_anchor_length))
-@@ -1072,7 +1072,7 @@
+@@ -1074,7 +1074,7 @@
      if not os.path.exists(inifile):
          logger.error("ini file %s does not exist!\n"%(inifile))
          sys.exit(1)
@@ -50,7 +50,7 @@ Forwarded: not-needed
      # with open(f"{outputDir}/parsnpAligner.out", 'w') as stdout_f, open(f"{outputDir}/parsnpAligner.err", 'w') as stderr_f:
          # rc = run_command(command, ignorerc=1, stdout=stdout_f, stderr=stderr_f, prepend_time=True)
      rc = run_logged_command(command=command, ignorerc=1, label="parsnp-aligner", outputDir=outputDir)
-@@ -1293,10 +1293,10 @@
+@@ -1295,10 +1295,10 @@
          logger.info("Recruiting genomes...")
          if use_parsnp_mumi:
              if not inifile_exists:
@@ -63,7 +63,7 @@ Forwarded: not-needed
              run_logged_command(command=command, outputDir=outputDir, label="parsnp-mumi")
              # Takes each sequence and computes its mumi distance to the reference
              try:
-@@ -1748,7 +1748,7 @@
+@@ -1736,7 +1736,7 @@
                  break
          if not use_fasttree:
              with TemporaryDirectory() as raxml_output_dir:

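The refresh above keeps parsnp invoking PhiPack under its Debian binary name.
A minimal standalone sketch of the call the patch rewrites, assuming only
that phipack-profile is Debian's name for upstream's Profile executable (the
command line itself is taken verbatim from the patch):

    import shutil
    import subprocess

    def run_phipack_profile(query: str, seqlen: int, workingdir: str) -> None:
        exe = "phipack-profile"  # upstream calls this binary "Profile"
        if shutil.which(exe) is None:
            raise SystemExit(f"{exe} not in system path!")
        command = "%s -o -v -n %d -w 100 -m 100 -f %s > %s.out" % (
            exe, seqlen, query, query)
        subprocess.run(command, shell=True, check=True, cwd=workingdir)
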

=====================================
debian/patches/py3-parsnp-libs.patch
=====================================
@@ -8,27 +8,27 @@ Last-Update: 2022-04-14
 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
 --- parsnp.orig/extend.py
 +++ parsnp/extend.py
-@@ -14,7 +14,7 @@
+@@ -9,7 +9,7 @@
+ import bisect
  import numpy as np
- from Bio.AlignIO.MafIO import MafWriter, MafIterator
- from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator
--from logger import logger
-+from parsnp.logger import logger
- import time
+ from tqdm import tqdm
+-from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
++from parsnp.logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
+ import spoa
  #%%
  
 --- parsnp.orig/parsnp
 +++ parsnp/parsnp
-@@ -13,7 +13,7 @@
+@@ -12,7 +12,7 @@
  from tempfile import TemporaryDirectory
  import re
  import logging
--from logger import logger
-+from parsnp.logger import logger
- import multiprocessing
+-from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
++from parsnp.logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
  import argparse
  import signal
-@@ -24,7 +24,7 @@
+ from multiprocessing import Pool
+@@ -21,7 +21,7 @@
  from pathlib import Path
  
  
@@ -36,4 +36,4 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
 +import parsnp.extend as ext
  from tqdm import tqdm
  
- __version__ = "2.0.1"
+ __version__ = "2.0.2"

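The import rewrites in this patch reflect that Debian installs parsnp's
helper modules inside a parsnp Python package rather than upstream's flat
source tree. A sketch of the resulting imports, which assumes a system with
the Debian parsnp package installed (the dist-packages path below is an
assumption for illustration):

    # e.g. /usr/lib/python3/dist-packages/parsnp/{__init__,logger,extend}.py
    from parsnp.logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
    import parsnp.extend as ext
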

=====================================
extend.py
=====================================
@@ -2,20 +2,15 @@ from Bio import AlignIO, SeqIO
 from Bio.Seq import Seq
 from Bio.SeqIO import SeqRecord
 from Bio.Align import MultipleSeqAlignment
-from glob import glob
-import tempfile
+import logging
 from pathlib import Path
 import re
-import subprocess
 from collections import namedtuple, defaultdict, Counter
-import os
-from Bio.Align import substitution_matrices
-from itertools import product, combinations
+import bisect
 import numpy as np
-from Bio.AlignIO.MafIO import MafWriter, MafIterator
-from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator
-from logger import logger
-import time
+from tqdm import tqdm
+from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
+import spoa
 #%%
 
 
@@ -46,29 +41,38 @@ def parse_xmfa_header(xmfa_file):
     return index_to_gid, gid_to_index
                     
 
-def index_input_sequences(xmfa_file, input_dir):
+def index_input_sequences(xmfa_file, file_list):
+    basename_to_path = {}
+    for f in file_list:
+        basename = str(Path(f).stem)
+        basename_to_path[basename] = f
     gid_to_records = {}
     gid_to_cid_to_index = {}
+    gid_to_index_to_cid = {}
     with open(xmfa_file) as parsnp_fd:
         for line in (line.strip() for line in parsnp_fd):
             if line[:2] == "##":
                 if line.startswith("##SequenceFile"):
-                    p = Path(os.path.join(input_dir + line.split(' ')[1]))
-                    gid_to_records[p.stem] = {record.id: record for record in SeqIO.parse(str(p), "fasta")}
-                    gid_to_cid_to_index[p.stem] = {idx+1: rec.id for (idx, rec) in enumerate(SeqIO.parse(str(p), "fasta"))}
-    return gid_to_records, gid_to_cid_to_index
+                    basename = Path(line.split(' ')[1]).stem
+                    p = Path(basename_to_path[basename])
+                    gid_to_records[p.stem] = {}
+                    gid_to_cid_to_index[p.stem] = {}
+                    gid_to_index_to_cid[p.stem] = {}
+                    for idx, rec in enumerate(SeqIO.parse(str(p), "fasta")):
+                        gid_to_records[p.stem][rec.id] = rec
+                        gid_to_cid_to_index[p.stem][rec.id] = idx + 1
+                        gid_to_index_to_cid[p.stem][idx + 1] = rec.id
+    return gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid
 
 
-
-def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index):
+def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_index_to_cid):
     seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
     idpair_to_segments = defaultdict(list)
-    idpair_to_tree = defaultdict(IntervalTree)
     cluster_to_named_segments = defaultdict(list)
-    for aln in tqdm(AlignIO.parse(xmfa_file, "mauve")):
+    for aln in AlignIO.parse(xmfa_file, "mauve"):
         for seq in aln:
             # Skip reference for now...
-            aln_len = seq.annotations["end"] - seq.annotations["start"] + 1
+            aln_len = seq.annotations["end"] - seq.annotations["start"]
             cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()]
             
             gid = index_to_gid[seq.name]
@@ -78,29 +82,29 @@ def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index):
             else:
                 endpos = startpos + aln_len
             
-            idp = IdPair(gid, gid_to_cid_to_index[gid][contig_idx])
+            idp = IdPair(gid, gid_to_index_to_cid[gid][contig_idx])
             seg = Segment(idp, startpos, startpos + aln_len, seq.annotations["strand"])
             idpair_to_segments[idp].append(seg)
-            idpair_to_tree[idp].addi(seg.start, seg.stop)
             cluster_to_named_segments[cluster_idx].append(seg)
     
     for idp in idpair_to_segments:
         idpair_to_segments[idp] = sorted(idpair_to_segments[idp])
-        idpair_to_tree[idp].merge_overlaps()
-    return idpair_to_segments, idpair_to_tree, cluster_to_named_segments
+    return idpair_to_segments, cluster_to_named_segments
 
 
 def run_msa(downstream_segs_to_align, gid_to_records):
     keep_extending = True
     iteration = 0
-    seq_len_desc = stats.describe([seg.stop - seg.start for seg in downstream_segs_to_align])
-    longest_seq = seq_len_desc.minmax[1]
-    if sum(
-        seq_len_desc.mean*(1 - length_window) <= (seg.stop - seg.start) <= seq_len_desc.mean*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop:
-        base_length = int(seq_len_desc.mean*(1 + length_window))
-    else:
-        base_length = BASE_LENGTH
+    seq_lens = [seg.stop - seg.start for seg in downstream_segs_to_align]
+    longest_seq = max(seq_lens)
+    mean_seq_len = np.mean(seq_lens)
+    # if sum(
+        # mean_seq_len*(1 - length_window) <= (seg.stop - seg.start) <= mean_seq_len*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop:
+        # base_length = int(mean_seq_len*(1 + length_window))
+    # else:
+        # base_length = BASE_LENGTH
     
+    base_length = BASE_LENGTH
     while keep_extending:
         seqs_to_align = ["A" + (str(
                 gid_to_records[seg.idp.gid][seg.idp.cid].seq[seg.start:seg.stop] if seg.strand == 1
@@ -131,11 +135,15 @@ def run_msa(downstream_segs_to_align, gid_to_records):
     return aligned_msa_seqs
 
 
-def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segments, idpair_to_tree, cluster_to_named_segments, gid_to_records):
+def extend_clusters(xmfa_file, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records):
     ret_lcbs = []
     seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
     
-    for aln_idx, aln in enumerate(tqdm(AlignIO.parse(xmfa_file, "mauve"), total=len(cluster_to_named_segments))):
+    for aln_idx, aln in enumerate(tqdm(
+            AlignIO.parse(xmfa_file, "mauve"), 
+            total=len(cluster_to_named_segments),
+            file=TqdmToLogger(logger, level=logging.INFO),
+            mininterval=MIN_TQDM_INTERVAL)):
         # validate_lcb(aln, gid_to_records, parsnp_header=True)
         seq = aln[0]
         cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()]
@@ -167,29 +175,36 @@ def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segm
         new_lcb = MultipleSeqAlignment([])
         # Assumes alignments are always in the same order
         new_bp = []
-        for seq_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)):
+        for seg_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)):
             # Update segment in idpair_to_segments
+            if len(aln_str) < MIN_LEN:
+                continue
             new_bp_covered = len(aln_str) - aln_str.count("-")
             # print(f"Extending {covered_seg} by {new_bp_covered}")
             new_bp.append(new_bp_covered)
             new_seq = aln_str
             if covered_seg.strand == 1:
                 new_seg = Segment(covered_seg.idp, uncovered_seg.start, uncovered_seg.start + new_bp_covered, covered_seg.strand)
+                if new_bp_covered > 0:
+                    segs[seg_idx] = Segment(covered_seg.idp, covered_seg.start, new_seg.stop, covered_seg.strand)
             else:
                 aln_str = Seq(aln_str).reverse_complement()
                 new_seg = Segment(covered_seg.idp, covered_seg.start - new_bp_covered, covered_seg.start, covered_seg.strand)
+                if new_bp_covered > 0:
+                    segs[seg_idx] = Segment(covered_seg.idp, new_seg.start, covered_seg.stop, covered_seg.strand)
             
             new_record = SeqRecord(
                 seq=new_seq,
-                id=f"{covered_seg.idp.gid}#{covered_seg.idp.cid}",
+                id=f"cluster{cluster_idx} s{gid_to_cid_to_index[covered_seg.idp.gid][covered_seg.idp.cid]}:p{new_seg.start if new_seg.strand == 1 else new_seg.stop}",
+                name=gid_to_index[covered_seg.idp.gid],
                 annotations={"start": new_seg.start, "end": new_seg.stop, "strand": new_seg.strand}
             )
+
             # if covered_seg.strand == 1:
             new_lcb.append(new_record)
-            if new_bp_covered > 0:
-                idpair_to_tree[covered_seg.idp].addi(new_seg.start, new_seg.stop)
 
-        ret_lcbs.append(new_lcb)
+        if len(new_lcb) > 0:
+            ret_lcbs.append(new_lcb)
     return ret_lcbs
 
 

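The new "import spoa" above is what motivates the python3-pyspoa dependency
added to d/control in this push. The hunks stop short of the alignment call
itself, so here is a minimal sketch of the pyspoa API that run_msa() builds
on, using toy sequences (the scoring parameters parsnp actually passes are
not visible in this diff):

    import spoa

    # Partial-order alignment: spoa.poa() returns a consensus string plus
    # the row-aligned MSA, one aligned string per input sequence.
    seqs = ["ACGTGCGT", "ACGTACGT", "ACGAACGT"]
    consensus, msa = spoa.poa(seqs)
    for row in msa:
        print(row)
    print("consensus:", consensus)
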

=====================================
logger.py
=====================================
@@ -1,4 +1,5 @@
 import logging
+import io
 ############################################# Logging ##############################################
 BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
 #These are the sequences needed to get colored output
@@ -14,6 +15,28 @@ RESET_SEQ = "\033[0m"
 COLOR_SEQ = "\033[1;%dm"
 BOLD_SEQ = "\033[1m"
 
+MIN_TQDM_INTERVAL=30
+
+
+# Logging redirect copied from https://stackoverflow.com/questions/14897756/python-progress-bar-through-logging-module
+class TqdmToLogger(io.StringIO):
+    """
+        Output stream for TQDM which will output to logger module instead of
+        the StdOut.
+    """
+    logger = None
+    level = None
+    buf = ''
+    def __init__(self,logger,level=None):
+        super(TqdmToLogger, self).__init__()
+        self.logger = logger
+        self.level = level or logging.INFO
+    def write(self,buf):
+        self.buf = buf.strip('\r\n\t ')
+    def flush(self):
+        self.logger.log(self.level, self.buf)
+
+
 def formatter_message(message, use_color = True):
     if use_color:
         message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)

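The TqdmToLogger stream is wired up later in this push (see the parsnp and
partition.py hunks below). A minimal usage sketch, assuming upstream's flat
layout where the module above is importable as "logger":

    import logging
    import time
    from tqdm import tqdm
    from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL

    logging.basicConfig(level=logging.INFO)
    # Progress lines go through the logger instead of raw stderr, at most
    # once every MIN_TQDM_INTERVAL (30) seconds.
    for _ in tqdm(range(1000),
                  file=TqdmToLogger(logger, level=logging.INFO),
                  mininterval=MIN_TQDM_INTERVAL):
        time.sleep(0.01)
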

=====================================
parsnp
=====================================
@@ -5,19 +5,16 @@
 
 '''
 
-import os, sys, string, getopt, random,subprocess, time,operator, math, datetime,numpy #pysam
+import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam
 from collections import defaultdict
-import csv
 import shutil
 import shlex
 from tempfile import TemporaryDirectory
 import re
 import logging
-from logger import logger
-import multiprocessing
+from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
 import argparse
 import signal
-import inspect
 from multiprocessing import Pool
 from Bio import SeqIO
 from glob import glob
@@ -27,7 +24,7 @@ from pathlib import Path
 import extend as ext
 from tqdm import tqdm
 
-__version__ = "2.0.1"
+__version__ = "2.0.2"
 reroot_tree = True #use --midpoint-reroot
 random_seeded = random.Random(42)
 
@@ -149,7 +146,7 @@ def run_phipack(query,seqlen,workingdir):
     currdir = os.getcwd()
     os.chdir(workingdir)
     command = "Profile -o -v -n %d -w 100 -m 100 -f %s > %s.out"%(seqlen,query,query)
-    run_command(command,1, prepend_time=True)
+    run_command(command, 1)
     os.chdir(currdir)
 
 def run_fasttree(query,workingdir,recombination_sites):
@@ -685,15 +682,20 @@ def create_output_directory(output_dir):
 
     if os.path.exists(output_dir):
         logger.warning(f"Output directory {output_dir} exists, all results will be overwritten")
-        shutil.rmtree(output_dir)
+        if os.path.exists(output_dir + "/partition"):
+            shutil.rmtree(output_dir + "/partition/")
+        if os.path.exists(output_dir + "/config/"):
+            shutil.rmtree(output_dir + "/config/")
+        if os.path.exists(output_dir + "/log/"):
+            shutil.rmtree(output_dir + "/log/")
     elif output_dir == "[P_CURRDATE_CURRTIME]":
         today = datetime.datetime.now()
         timestamp = "P_" + today.isoformat().replace("-", "_").replace(".", "").replace(":", "").replace("T", "_")
         output_dir = os.getcwd() + os.sep + timestamp
-    os.makedirs(output_dir)
-    os.makedirs(os.path.join(output_dir, "tmp"))
-    os.makedirs(os.path.join(output_dir, "log"))
-    os.makedirs(os.path.join(output_dir, "config"))
+    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(os.path.join(output_dir, "tmp"), exist_ok=True)
+    os.makedirs(os.path.join(output_dir, "log"), exist_ok=True)
+    os.makedirs(os.path.join(output_dir, "config"), exist_ok=True)
     return output_dir
 
 
@@ -1645,7 +1647,11 @@ SETTINGS:
         logger.info("Running partitions...")
         good_chunks = set(chunk_labels)
         with Pool(args.threads) as pool:
-            return_codes = tqdm(pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), total=len(chunk_output_dirs))
+            return_codes = tqdm(
+                pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), 
+                total=len(chunk_output_dirs), 
+                file=TqdmToLogger(logger,level=logging.INFO),
+                mininterval=MIN_TQDM_INTERVAL)
             for cl, rc in zip(chunk_labels, return_codes):
                 if rc != 0:
                     logger.error(f"Partition {cl} failed...")
@@ -1666,51 +1672,33 @@ SETTINGS:
         partition.merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads)
 
 
-
-    run_lcb_trees = 0
+    parsnp_output = f"{outputDir}/parsnp.xmfa"
 
     # This is the stuff for LCB extension:
-    annotation_dict = {}
-    #TODO always using xtrafast?
-    parsnp_output = f"{outputDir}/parsnp.xmfa"
     if args.extend_lcbs:
-        xmfa_file = f"{outputDir}/parsnp.xmfa"
-        with TemporaryDirectory() as temp_directory:
-            original_maf_file = f"{outputDir}/parsnp-original.maf"
-            extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa"
-            fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = ext.get_sequence_data(
-                    ref,
-                    finalfiles,
-                    index_files=False)
-            fname_to_contigid_to_coords, fname_header_to_gcontigidx = ext.xmfa_to_maf(
-                    xmfa_file,
-                    original_maf_file,
-                    fname_contigidx_to_header,
-                    fname_contigid_to_length)
-            packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
-            fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
-            cluster_files = glob(f"{temp_directory}/*.fasta")
-            clusterdir_expand, clusterdir_len = ext.get_new_extensions(
-                    cluster_files,
-                    args.match_score,
-                    args.mismatch_penalty,
-                    args.gap_penalty)
-            ext.write_extended_xmfa(
-                    original_maf_file,
-                    extended_xmfa_file,
-                    temp_directory,
-                    clusterdir_expand,
-                    clusterdir_len,
-                    fname_contigid_to_cluster_dir_to_length,
-                    fname_contigid_to_cluster_dir_to_adjacent_cluster,
-                    fname_header_to_gcontigidx,
-                    fname_contigid_to_length,
-                    args.extend_ani_cutoff,
-                    args.extend_indel_cutoff,
-                    threads)
-            parsnp_output = extended_xmfa_file
-            os.remove(original_maf_file)
+        logger.warning("The LCB extension module is experimental. Runtime may be significantly increased and extended alignments may not be as high quality as the original core-genome. Extensions off of existing LCBs are in a separate xmfa file.")
+        import partition 
+        import extend as ext
+
+        orig_parsnp_xmfa = parsnp_output
+        extended_parsnp_xmfa = orig_parsnp_xmfa + ".extended"
+
+        # Index input fasta files and original xmfa 
+        index_to_gid, gid_to_index = ext.parse_xmfa_header(orig_parsnp_xmfa)
+        gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid = ext.index_input_sequences(orig_parsnp_xmfa, finalfiles + [ref])
+
+        # Get covered regions of xmfa file
+        idpair_to_segments, cluster_to_named_segments = ext.xmfa_to_covered(orig_parsnp_xmfa, index_to_gid, gid_to_index_to_cid)
+
+        # Extend clusters
+        logger.info(f"Extending LCBs with SPOA...")
+        new_lcbs = ext.extend_clusters(orig_parsnp_xmfa, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records)
 
+        # Write output
+        partition.copy_header(orig_parsnp_xmfa, extended_parsnp_xmfa)
+        with open(extended_parsnp_xmfa, 'a') as out_f:
+            for lcb in new_lcbs:
+                partition.write_aln_to_xmfa(lcb, out_f)
 
     #add genbank here, if present
     if len(genbank_ref) != 0:


=====================================
partition.py
=====================================
@@ -7,6 +7,7 @@ import tempfile
 import os
 import copy
 import math
+import logging
 from multiprocessing import Pool
 from functools import partial
 from collections import namedtuple, defaultdict, Counter
@@ -18,7 +19,7 @@ from Bio import AlignIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.Align import MultipleSeqAlignment
-from logger import logger
+from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
 from tqdm import tqdm
 
 
@@ -69,7 +70,7 @@ def get_interval(aln: MultipleSeqAlignment) -> Tuple[int, IntervalType]:
     """
     seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
     seq = aln[0]
-    aln_len = seq.annotations["end"] - seq.annotations["start"] + 1
+    aln_len = seq.annotations["end"] - seq.annotations["start"] 
     cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()]
 
     if seq.annotations["strand"] == -1:
@@ -122,7 +123,7 @@ def trim(aln: MultipleSeqAlignment,
     # Look for the record in the LCB that represents the reference genome
     for rec in aln:
         if int(rec.name) == seqidx:
-            aln_len = rec.annotations["end"] - rec.annotations["start"] + 1
+            aln_len = rec.annotations["end"] - rec.annotations["start"]
             cluster_idx, contig_idx, super_startpos = [int(x) for x in seqid_parser.match(rec.id).groups()]
 
             if rec.annotations["strand"] == -1:
@@ -185,7 +186,7 @@ def trim(aln: MultipleSeqAlignment,
             # orig_seq = copy.deepcopy(seq)
             new_rec = aln_seqs[seq_idx]
             
-            aln_len = rec.annotations["end"] - rec.annotations["start"] + 1
+            aln_len = rec.annotations["end"] - rec.annotations["start"]
             cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(rec.id).groups()]
             left_bases_trim = 0
             right_bases_trim = 0
@@ -634,7 +635,11 @@ def trim_xmfas(
     trim_partial = partial(trim_single_xmfa, intersected_interval_dict=intersected_interval_dict)
     orig_xmfa_files = [f"{partition_output_dir}/{CHUNK_PREFIX}-{cl}-out/parsnp.xmfa" for cl in chunk_labels]
     with Pool(threads) as pool:
-        num_clusters_per_xmfa = list(tqdm(pool.imap_unordered(trim_partial, orig_xmfa_files), total=len(orig_xmfa_files)))
+        num_clusters_per_xmfa = list(tqdm(
+            pool.imap_unordered(trim_partial, orig_xmfa_files), 
+            total=len(orig_xmfa_files), 
+            file=TqdmToLogger(logger,level=logging.INFO),
+            mininterval=MIN_TQDM_INTERVAL))
         #TODO clean up
         if not all(num_clusters_per_xmfa[0][1] == nc for xmfa, nc in num_clusters_per_xmfa):
             logger.critical("One of the partitions has a different number of clusters after trimming...")
@@ -712,7 +717,9 @@ def merge_xmfas(
         with Pool(threads) as pool:
             tmp_xmfas = list(tqdm(
                     pool.imap_unordered(merge_single_LCB_star_partial, enumerate(pairs_list)), 
-                    total=num_clusters)
+                    total=num_clusters,
+                    file=TqdmToLogger(logger,level=logging.INFO),
+                    mininterval=MIN_TQDM_INTERVAL)
             )
     
         with open(xmfa_out_f, 'a') as xmfa_out_handle:
@@ -722,69 +729,3 @@ def merge_xmfas(
                     xmfa_out_handle.write(tx.read())
 
 
-if __name__ == "__main__":
-    args = parse_args()
-    os.makedirs(args.output_dir, exist_ok=True)
-    
-    query_files = list()
-    for arg_path in args.sequences:
-        if arg_path.endswith(".txt"):
-            with open(arg_path) as input_list_handle:
-                for line in input_list_handle:
-                    query_files.append(line.strip())
-        elif any(arg_path.endswith(suff) for suff in FASTA_SUFFIX_LIST):
-            query_files.append(arg_path)
-        elif os.path.isdir(arg_path):
-            for f in os.listdir(arg_path): 
-                if any(f.endswith(suff) for suff in FASTA_SUFFIX_LIST):
-                    query_files.append(f"{arg_path}/{f}")
-
-    full_query_list_path = f"{args.output_dir}/input-list.txt"
-    with open(full_query_list_path, 'w') as input_list_handle:
-        for qf in query_files:
-            input_list_handle.write(qf + "\n")
-
-    partition_output_dir = f"{args.output_dir}/partition"
-    partition_list_dir = f"{partition_output_dir}/input-lists"
-    os.makedirs(partition_list_dir, exist_ok=True)
-    run_command(f"split -l {args.partition_size} -a 5 --additional-suffix '.txt' {full_query_list_path} {partition_list_dir}/{CHUNK_PREFIX}-")
-
-    chunk_label_parser = re.compile(f'{CHUNK_PREFIX}-(.*).txt')
-    chunk_labels = []
-    for partition_list_file in os.listdir(partition_list_dir):
-        chunk_labels.append(chunk_label_parser.search(partition_list_file).groups()[0])
-
-    parsnp_commands = []
-    for cl in chunk_labels:
-        chunk_output_dir = f"{partition_output_dir}/{CHUNK_PREFIX}-{cl}-out"
-        os.makedirs(chunk_output_dir, exist_ok=True)
-        chunk_command = f"./parsnp -d {partition_list_dir}/{CHUNK_PREFIX}-{cl}.txt -r {args.reference} -o {chunk_output_dir} "
-        chunk_command += args.parsnp_flags
-        chunk_command += f" > {chunk_output_dir}/parsnp.stdout 2> {chunk_output_dir}/parsnp.stderr"
-        parsnp_commands.append(chunk_command)
-
-    good_chunks = set(chunk_labels)
-    run_command_nocheck = partial(run_command, check=False)
-    with Pool(args.threads) as pool:
-        return_codes = tqdm(pool.imap(run_command_nocheck, parsnp_commands, chunksize=1), total=len(parsnp_commands))
-        for cl, rc in zip(chunk_labels, return_codes):
-            if rc != 0:
-                logger.error("Partition {cl} failed...")
-                good_chunks.remove(cl)
-        
-    chunk_labels = list(good_chunks)
-
-    logger.info("Computing intersection of all partition LCBs...")
-    chunk_to_intvervaldict = get_chunked_intervals(partition_output_dir, chunk_labels)
-    intersected_interval_dict = get_intersected_intervals(chunk_to_intvervaldict)
-    
-    logger.info("Trimming partitioned XMFAs back to intersected intervals...")
-    num_clusters = trim_xmfas(partition_output_dir, chunk_labels, intersected_interval_dict, args.threads)
-
-    os.makedirs(f"{args.output_dir}/merged-out/", exist_ok=True)
-    xmfa_out_f =  f"{args.output_dir}/merged-out/parsnp.xmfa"
-
-    logger.info(f"Merging trimmed XMFA files into a single XMFA at {xmfa_out_f}")
-    merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads)
-
-


=====================================
src/parsnp.cpp
=====================================
@@ -976,18 +976,18 @@ void Aligner::writeOutput(string psnp,vector<float>& coveragerow)
                         
                         if ( ct.mums.at(0).isforward.at(i) )
                         {
-                            xmfafile << "> " << i+1 << ":" << ct.start.at(i)+1 <<  "-" << ct.end.at(i)-1 << " ";
+                            xmfafile << "> " << i+1 << ":" << ct.start.at(i)+1 <<  "-" << ct.end.at(i) << " ";
                             if (recomb_filter)
                             {
-                                clcbfile << "> " << i+1 << ":" << ct.start.at(i)+1 <<  "-" << ct.end.at(i)-1 << " ";
+                                clcbfile << "> " << i+1 << ":" << ct.start.at(i)+1 <<  "-" << ct.end.at(i) << " ";
                             }
                         }
                         else
                         {
-                            xmfafile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 <<  "-" << ct.mums.front().end.at(i)-1 << " ";
+                            xmfafile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 <<  "-" << ct.mums.front().end.at(i) << " ";
                             if (recomb_filter)
                             {
-                                clcbfile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 <<  "-" << ct.mums.front().end.at(i)-1 << " ";
+                                clcbfile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 <<  "-" << ct.mums.front().end.at(i) << " ";
                             }
                         }
                         bool hit1 = false;

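These end-coordinate hunks pair with the Python-side edits earlier in this
push, where "end - start + 1" became "end - start" in partition.py and
extend.py. Read together (this is an inference, not stated in the commit
messages), upstream 2.0.2 appears to treat interval ends as exclusive, so
lengths no longer need an off-by-one correction:

    # Assumed half-open coordinates, [start, end):
    start, end = 100, 250
    aln_len = end - start  # 150 columns; formerly end - start + 1
    assert aln_len == len(range(start, end))
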


View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/compare/2640df35ceb3a87b7ebcec04c1eaed165b61912c...3ea7771b43f49527e8dde63b94492984dfc05865
