[med-svn] [Git][med-team/parsnp][upstream] New upstream version 2.1.3+dfsg

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Mar 1 18:24:41 GMT 2025



Étienne Mollier pushed to branch upstream at Debian Med / parsnp


Commits:
ae0cd21b by Étienne Mollier at 2025-03-01T19:15:37+01:00
New upstream version 2.1.3+dfsg
- - - - -


4 changed files:

- README.md
- parsnp
- partition.py
- src/parsnp.cpp


Changes:

=====================================
README.md
=====================================
@@ -4,9 +4,18 @@ Parsnp is a command-line-tool for efficient microbial core genome alignment and
 
 # Installation
 ## From conda
-Parsnp is available on the [Bioconda](https://bioconda.github.io/user/install.html#set-up-channels) channel. This is the recommended method of installation. Once you have [added the Bioconda channel](https://bioconda.github.io/user/install.html#set-up-channels) to your conda environment, `parsnp` can be installed via
+Parsnp is available on [Bioconda](https://bioconda.github.io/recipes/parsnp/README.html). This is the recommended method of installation. 
+Once you have [added the Bioconda channel](https://bioconda.github.io/#usage) to your conda environment, `parsnp` can be installed via
 ```
-conda install parsnp
+conda install -c bioconda parsnp
+```
+
+If you run into conflicts, even in a fresh environment, they may be caused by the versions of the
+default dependencies in the environment. You can get around this by letting conda figure out
+which versions of the dependencies are needed, by specifying the packages you want at
+creation:
+```
+conda create -n parsnp-env parsnp==2.*
 ```
 
 Instructions for building Parsnp from source are available towards the end of this README.


=====================================
parsnp
=====================================
@@ -6,7 +6,7 @@
 '''
 
 import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam
-from collections import defaultdict, namedtuple
+from collections import defaultdict, namedtuple, Counter
 import shutil
 import multiprocessing
 import shlex
@@ -24,7 +24,7 @@ from pathlib import Path
 
 from tqdm import tqdm
 
-__version__ = "2.0.6"
+__version__ = "2.1.3"
 reroot_tree = True #use --midpoint-reroot
 random_seeded = random.Random(42)
 
@@ -170,6 +170,7 @@ def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
 
     seqid_pattern = re.compile(r'^cluster(\d+) s(\d+):p(\d+)')
     with open(maf_path, 'w') as maf_out:
+        maf_out.write("##maf version=1 program=parsnp\n")
         for aln in AlignIO.parse(xmfa_path, "mauve"):
             for rec_idx, rec in enumerate(aln):
                 aln_len = rec.annotations["end"] - rec.annotations["start"]
@@ -180,6 +181,8 @@ def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
                 elif fname == ref_fname:
                     continue
                 rec_info = fname_to_info[fname][contig_idx-1]
+                if rec.annotations["strand"] == -1:
+                    startpos = rec_info.seq_length - startpos
                 maf_out.write(f"s\t{'.'.join(fname.split('.')[:-1])}{sample_delim}{rec_info.cid}")
                 maf_out.write(f"\t{startpos}")
                 maf_out.write(f"\t{aln_len}")
@@ -308,17 +311,15 @@ def parse_args():
 
     2) With reference but without genbank file:
     python Parsnp.py -r <reference_genome> -d <seq_file1 seq_file2 ...> -p <threads>
-
-    3) Autorecruit reference to a draft assembly:
-    python Parsnp.py -q <draft_assembly> -d <seq_file1 seq_file2 ...> -p <threads>
     """, formatter_class=argparse.RawTextHelpFormatter)
     #TODO Use lambda to check files and directories
     input_output_args = parser.add_argument_group(title="Input/Output")
     input_output_args.add_argument(
-        "-c",
-        "--curated",
-        action = "store_true",
-        help = "(c)urated genome directory, use all genomes in dir and ignore MUMi?")
+        "-r",
+        "--reference",
+        type = lambda fname: is_valid_file_path(parser, fname),
+        default = "",
+        help = "(r)eference genome (set to ! to pick random one from sequence dir)")
     input_output_args.add_argument(
         "-d",
         "--sequences",
@@ -326,13 +327,6 @@ def parse_args():
         nargs = '+',
         required = True,
         help = "A list of files containing genomes/contigs/scaffolds. If the file ends in .txt, each line in the file corresponds to the path to an input file.")
-    input_output_args.add_argument(
-        "-r",
-        "--reference",
-        type = lambda fname: is_valid_file_path(parser, fname),
-        default = "",
-        help = "(r)eference genome (set to ! to pick random one from sequence dir)")
-    #TODO Accept as space-separated input and parse automatically w/ argparse
     input_output_args.add_argument(
         "-g",
         "--genbank",
@@ -349,7 +343,17 @@ def parse_args():
         type = str,
         help = "Specify (assembled) query genome to use, in addition to genomes found in genome dir")
 
-    MUMi_args = parser.add_argument_group(title="MUMi")
+    MUMi_args = parser.add_argument_group(title="Filtering")
+    MUMi_args.add_argument(
+        "-c",
+        "--curated",
+        action = "store_true",
+        help = "(c)urated genome directory, use all genomes in dir and ignore MUMi.")
+    MUMi_args.add_argument(
+        "--skip-ani-filter",
+        action = "store_true",
+        help="Skip the filtering step which discards inputs based on the ANI/MUMi distance to the reference." 
+            +"\nUnlike --curated, this will still filter inputs based on their length compared to the reference")
     MUMi_mutex_args = MUMi_args.add_mutually_exclusive_group()
     #TODO whats the default?
     MUMi_mutex_args.add_argument(
@@ -385,12 +389,12 @@ def parse_args():
     MUMi_rec_prog.add_argument(
         "--use-ani",
         action = "store_true",
-        help = "Use ani for genome recruitment")
+        help = "Use ANI for genome filtering")
     MUMi_args.add_argument(
         "--min-ani",
         type = float,
         default = 95,
-        help = "Min ANI value to allow for genome recruitment.")
+        help = "Min ANI value required to include genome")
     MUMi_args.add_argument(
         "--min-ref-cov",
         type = float,
@@ -399,7 +403,7 @@ def parse_args():
     MUMi_rec_prog.add_argument(
         "--use-mash",
         action = "store_true",
-        help = "Use mash for genome recruitment")
+        help = "Use mash for genome filtering")
     MUMi_args.add_argument(
         "--max-mash-dist",
         type = float,
@@ -467,14 +471,6 @@ def parse_args():
         action = "store_true",
         help = "Output unaligned regions")
 
-    # recombination_args = parser.add_argument_group("Recombination filtration")
-    #TODO -x was a duplicate flag but no longer parsed as filter-phipack-snps in the current parsnp version
-    # recombination_args.add_argument(
-            # "-x",
-            # "--filter-phipack-snps",
-            # action = "store_true",
-            # help = "Enable filtering of SNPs located in PhiPack identified regions of recombination")
-
     partition_args = parser.add_argument_group("Sequence Partitioning")
     partition_args.add_argument(
         "--no-partition",
@@ -544,6 +540,11 @@ def parse_args():
         type = int,
         default = 1,
         help = "Number of threads to use")
+    misc_args.add_argument(
+        "--force-overwrite",
+        "--fo",
+        action="store_true",
+        help="Overwrites any results in the output directory if it already exists")
     misc_args.add_argument(
         "-P",
         "--max-partition-size",
@@ -557,8 +558,10 @@ def parse_args():
         help = "Verbose output")
     misc_args.add_argument(
         "-x",
+        "--recomb-filter",
         "--xtrafast",
-        action = "store_true")
+        action = "store_true",
+        help="Run recombination filter (phipack)")
     misc_args.add_argument(
         "-i",
         "--inifile",
@@ -568,9 +571,6 @@ def parse_args():
         "-e",
         "--extend",
         action = "store_true")
-    misc_args.add_argument(
-        "--no-recruit",
-        action = "store_true")
     misc_args.add_argument(
         "-V",
         "--version",
@@ -685,7 +685,7 @@ def check_for_dependencies(use_phipack, use_fasttree):
     Routine to check for the main dependencies for parsnp and exits the program if one is missing.
 
     That includes 
-    1) Profile (if -x or --xtrafast), 
+    1) Profile (if -x or --recomb-filter), 
     2) Raxml (if !use_fasttree) else one of the main FastTree executables 
     3) harvesttools
     '''
@@ -718,7 +718,7 @@ def check_for_dependencies(use_phipack, use_fasttree):
         sys.exit(1)
 
 
-def create_output_directory(output_dir):
+def create_output_directory(output_dir, overwrite=False):
     '''
     Function to make the output directory and the 'tmp' subdirectory based on the command-line arguments.
 
@@ -733,13 +733,13 @@ def create_output_directory(output_dir):
         sys.exit(1)
 
     if os.path.exists(output_dir):
-        logger.warning(f"Output directory {output_dir} exists, all results will be overwritten")
-        if os.path.exists(output_dir + "/partition"):
-            shutil.rmtree(output_dir + "/partition/")
-        if os.path.exists(output_dir + "/config/"):
-            shutil.rmtree(output_dir + "/config/")
-        if os.path.exists(output_dir + "/log/"):
-            shutil.rmtree(output_dir + "/log/")
+        if overwrite:
+            logger.warning(f"Output directory {output_dir} exists, all results will be overwritten")
+            shutil.rmtree(output_dir)
+        else:
+            logger.critical(f"Output directory {output_dir} exists. Use --fo/--force-overwrite to automatically overwrite it")
+            sys.exit(1)
+
     elif output_dir == "[P_CURRDATE_CURRTIME]":
         today = datetime.datetime.now()
         timestamp = "P_" + today.isoformat().replace("-", "_").replace(".", "").replace(":", "").replace("T", "_")
@@ -1001,12 +1001,12 @@ def parse_input_files(input_files, curated, validate_input):
             if sizediff <= 0.8:
                 log_f("File {} is {:.2f}x longer than reference! {}".format(
                     input_file, 1 / sizediff, msg))
-                if not curated or args.no_recruit:
+                if not curated:
                     continue
             elif sizediff >= 1.2:
                 log_f("File {} is {:.2f}x shorter than reference genome! {}".format(
                     input_file, sizediff, msg))
-                if not curated or args.no_recruit:
+                if not curated:
                     continue
             fnafiles.append(input_file)
             fnaf_sizes[input_file] = seqlen
@@ -1014,7 +1014,7 @@ def parse_input_files(input_files, curated, validate_input):
     return input_files, fnafiles, fnaf_sizes
 
 
-def write_inifile_1(args, ref, aligner, xtrafast, output_dir, fastmum, reflen, fnafiles, threads):
+def write_inifile_1(args, ref, aligner, recomb_filter, output_dir, fastmum, reflen, fnafiles, threads):
     '''
     Writes the first .ini file. Starts with 'template.ini' in the parsnp folder and replaces various
     values in it, then writes the output to 'all_mumi.ini' in the output folder (which is eventually
@@ -1027,7 +1027,7 @@ def write_inifile_1(args, ref, aligner, xtrafast, output_dir, fastmum, reflen, f
         args (namespace):   should be a copy of the command-line 'args' variable from parse-args.
         ref:                (assigned directly from args in the main loop)
         aligner:            (assigned directly from args in the main loop)
-        xtrafast:           (assigned directly from args in the main loop)
+        recomb_filter:      (assigned directly from args in the main loop)
         output_dir:         (assigned directly from args in the main loop)
         fastmum:            (assigned directly from args in the main loop)
         reflen:             (assigned directly from args in the main loop)
@@ -1039,8 +1039,6 @@ def write_inifile_1(args, ref, aligner, xtrafast, output_dir, fastmum, reflen, f
     '''
     # global extend, inifiled, file_string, cnt
     logger.debug("Writing .ini file")
-    if xtrafast or 1:
-        args.extend = False
     inifiled = open("%s/template.ini" % (PARSNP_DIR), 'r').read()
     inifiled = inifiled.replace("$REF", ref)
     inifiled = inifiled.replace("$EXTEND", "%d" % (args.extend))
@@ -1051,7 +1049,7 @@ def write_inifile_1(args, ref, aligner, xtrafast, output_dir, fastmum, reflen, f
     inifiled = inifiled.replace("$THREADS", str(threads))
     inifiled = inifiled.replace("$ALIGNER", str(aligner))
     inifiled = inifiled.replace("$DIAGDIFF", str(args.max_diagonal_difference))
-    inifiled = inifiled.replace("$RECOMBFILT", "%d" % (xtrafast))
+    inifiled = inifiled.replace("$RECOMBFILT", "%d" % (recomb_filter))
     inifiled = inifiled.replace("$OUTDIR", output_dir)
     if fastmum:
         inifiled = inifiled.replace("$PARTPOS", "%d" % (0.2 * reflen))
@@ -1146,7 +1144,7 @@ def run_parsnp_aligner(outputDir):
     rc = run_logged_command(command=command, ignorerc=1, label="parsnp-aligner", outputDir=outputDir)
 
     if not os.path.exists(os.path.join(outputDir, "parsnpAligner.xmfa")) or rc != 0:
-        logger.critical("Set of recruited genomes are too divergent for parsnp, please reduce MUMi (%f) and relaunch\n"%(float(mumidistance)))
+        logger.critical("Set of filtered genomes are too divergent for parsnp, please use stricter ANI thresholds for filtering")
     else:
         shutil.move(
                 os.path.join(outputDir, "parsnpAligner.xmfa"),
@@ -1199,7 +1197,7 @@ if __name__ == "__main__":
     # splitseq = args.split
     # extend = args.extend  # (using args directly)
     # layout = args.layout
-    xtrafast = args.xtrafast
+    recomb_filter = args.recomb_filter
     inifile = args.inifile
     inifile_exists = args.inifile is not None
     mumi_only = args.mumi_only
@@ -1224,14 +1222,28 @@ if __name__ == "__main__":
         handler.setLevel(logging_level)
 
     # 2) Check for dependencies
-    check_for_dependencies(xtrafast, use_fasttree)
+    check_for_dependencies(recomb_filter, use_fasttree)
 
     # 3) Create output dir
-    outputDir = create_output_directory(outputDir)
+    outputDir = create_output_directory(outputDir, args.force_overwrite)
 
     # 4) Process all input files:
     #   -- 'input_files' here is args.sequences
     input_files = process_input_files(args, input_files)
+    input_file_counter = Counter(Path(p).stem for p in input_files)
+    if len(input_file_counter) < len(input_files):
+        logger.critical("Multiple files have the same file stem, i.e. path1/genomic.fna and path2/genomic.fna. Please ensure unique filenames.")
+        for p in sorted(input_files, key=lambda p: Path(p).stem):
+            if input_file_counter[Path(p).stem] > 1:
+                logger.critical(p)
+        logger.critical("""If the directory names are unique, but not the filenames (for example if they all come from NCBI Datasets, you can use the following to rename them:
+for f in path/to/files/*/*.fna; do
+	DIR=$(dirname $f)
+	DIR_ONLY="${DIR##*/}"
+	mv $f "${DIR}/${DIR_ONLY}.fna"
+done;
+""")
+        exit(1)
 
     # Parse reference if necessary
     multifasta, ref_seqs = parse_reference_file(ref)
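The duplicate-stem guard above rejects runs where two inputs share a basename stem (e.g. path1/genomic.fna and path2/genomic.fna). A self-contained sketch of the same Counter-based check, with an illustrative helper name:

```python
from collections import Counter
from pathlib import Path

def find_duplicate_stems(paths):
    # Count how often each filename stem occurs, then return the offenders.
    stem_counts = Counter(Path(p).stem for p in paths)
    return [p for p in paths if stem_counts[Path(p).stem] > 1]

inputs = ["path1/genomic.fna", "path2/genomic.fna", "path3/other.fna"]
print(find_duplicate_stems(inputs))  # ['path1/genomic.fna', 'path2/genomic.fna']
```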
@@ -1324,7 +1336,7 @@ SETTINGS:
     allfiles = [os.path.basename(ref)]
     #write INI file to pass to the first stage of parsnp:
     if not inifile_exists:
-        inifiled = write_inifile_1(args, ref, aligner, xtrafast, outputDir, fastmum, reflen, fnafiles, threads)
+        inifiled = write_inifile_1(args, ref, aligner, recomb_filter, outputDir, fastmum, reflen, fnafiles, threads)
 
     #2)get near neighbors (mumi distance)
     if os.path.exists(os.path.join(outputDir, "alltogether.fasta")):
@@ -1344,8 +1356,8 @@ SETTINGS:
     mumi_dict = {}
     finalfiles = []
     auto_ref = ""
-    if not curated:
-        logger.info("Recruiting genomes...")
+    if not curated and not args.skip_ani_filter:
+        logger.info("Filtering genomes...")
         if use_parsnp_mumi:
             if not inifile_exists:
                 command = f"{PARSNP_DIR}/bin/parsnp_core {outputDir}/config/all_mumi.ini"
@@ -1396,7 +1408,7 @@ SETTINGS:
                 hpv = minv + (3*stdv)
 
             for idx in mumi_dict.keys():
-                if mumi_dict[idx] < (float(mumidistance)) or curated:
+                if mumi_dict[idx] < (float(mumidistance)):
                     if fastmum and mumi_dict[idx] > hpv:
                         continue
                     #TODO if 1, why is this?
@@ -1417,7 +1429,7 @@ SETTINGS:
                     all_genomes_f.writelines((line + '\n' for line in fnafiles))
                 if use_mash:
                     if randomly_selected_ref:
-                        logger.warning("You are using a randomly selected genome to recruit genomes from your input with Mash. If input genomes vary greatly in size, results could be suboptimal. It is advised to select a reference genome for recruitment...")
+                        logger.warning("You are using a randomly selected genome to filter genomes from your input with Mash. If input genomes vary greatly in size, results could be suboptimal. It is advised to select a reference genome for filter...")
                     command = " ".join([
                             "mash", "dist", "-t",
                             "-d", str(max_mash_dist),
@@ -1434,7 +1446,7 @@ SETTINGS:
                         3000, 
                         max(500, reflen // 100))
                     if randomly_selected_ref:
-                        logger.warning("You have not selected a reference and are using ANI recruitment. All-to-all FastANI will be performed in order to obtain the best reference. As this is O(N^2), it is advised to select a reference genome if you have many query sequences.")
+                        logger.warning("You have not selected a reference and are using ANI filter. All-to-all FastANI will be performed in order to obtain the best reference. As this is O(N^2), it is advised to select a reference genome if you have many query sequences.")
                     command = " ".join([
                             "fastANI",
                             ("--rl" if randomly_selected_ref else "-r"), 
@@ -1460,12 +1472,12 @@ SETTINGS:
                 # shutil.rmtree(tmp_dir)
             except subprocess.CalledProcessError as e:
                 logger.critical(
-                    "Recruitment failed with exception {}. More details may be found in the *.err output log".format(str(e)))
+                    "Filtering failed with exception {}. More details may be found in the *.err output log".format(str(e)))
                 # shutil.rmtree(tmp_dir)
             allfiles.extend(finalfiles)
 
     # If "curateD" then anything that was removed gets added back.
-    if curated:
+    if curated or args.skip_ani_filter:
         for f in fnafiles:
             if f not in finalfiles:
                 finalfiles.append(f)
@@ -1493,8 +1505,6 @@ SETTINGS:
     totseqs = len(finalfiles) + 1 # Add one to include reference
 
     #initiate parallelPhiPack tasks
-    run_recomb_filter = 0
-
     #3)run parsnp (cores, grid?)
     if args.no_partition or len(finalfiles) < 2*args.min_partition_size:
         if len(finalfiles) < 2*args.min_partition_size:
@@ -1507,10 +1517,6 @@ SETTINGS:
         run_parsnp_aligner(outputDir)
         shutil.move(f"{outputDir}/parsnpAligner.log", f"{outputDir}/log/parsnpAligner.log")
 
-        blocks_dir = os.path.join(outputDir, "blocks")
-        if not os.path.exists(blocks_dir):
-            os.mkdir(blocks_dir)
-
         #get coverage
         coverage = 0
         totlength = 0
@@ -1534,10 +1540,79 @@ SETTINGS:
                 sys.exit(1)
             else:
                 logger.warning("""Aligned regions cover less than 10% of reference genome!
-    Please verify recruited genomes are all strain of interest""")
+    Please verify filtered genomes are all strains of interest""")
         else:
             pass
+
+    else:
+        import partition 
+        full_partitions = len(finalfiles) // args.min_partition_size
+        effective_partition_size = len(finalfiles) // full_partitions
+        logger.info(f"Setting the partition size to {effective_partition_size}")
+        partition_output_dir = f"{outputDir}/partition"
+        partition_list_dir = f"{partition_output_dir}/input-lists"
+        os.makedirs(partition_list_dir, exist_ok=True)
+        for partition_idx in range(math.ceil(len(finalfiles) / effective_partition_size)):
+            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{partition_idx:010}.txt", 'w') as part_out:
+                for qf in finalfiles[partition_idx*effective_partition_size : (partition_idx+1)*effective_partition_size]:
+                    part_out.write(f"{qf}\n")
+
+        chunk_label_parser = re.compile(f'{partition.CHUNK_PREFIX}-(.*).txt')
+        chunk_labels = []
+        for partition_list_file in os.listdir(partition_list_dir):
+            chunk_labels.append(chunk_label_parser.search(partition_list_file).groups()[0])
+
+        chunk_output_dirs = []
+        for cl in chunk_labels:
+            chunk_output_dir = f"{partition_output_dir}/{partition.CHUNK_PREFIX}-{cl}-out"
+            # os.makedirs(chunk_output_dir, exist_ok=True)
+            create_output_directory(chunk_output_dir)
+            chunk_output_dirs.append(chunk_output_dir)
+            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{cl}.txt", 'r') as partition_file:
+                partition_seq_files = [f for f in partition_file.read().splitlines()]
+
+            write_inifile_2(inifiled, chunk_output_dir, unaligned, args, auto_ref, query, partition_seq_files, ref, max(1, args.threads // len(chunk_labels)))
+
+        logger.info("Running partitions...")
+        good_chunks = set(chunk_labels)
+        with Pool(args.threads) as pool:
+            return_codes = tqdm(
+                pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), 
+                total=len(chunk_output_dirs), 
+                file=TqdmToLogger(logger,level=logging.INFO),
+                mininterval=MIN_TQDM_INTERVAL)
+            for cl, rc in zip(chunk_labels, return_codes):
+                if rc != 0:
+                    logger.error(f"Partition {cl} failed...")
+                    good_chunks.remove(cl)
+            
+        chunk_labels = sorted(list(good_chunks))
+
+        logger.info("Computing intersection of all partition LCBs...")
+        chunk_to_intvervaldict = partition.get_chunked_intervals(partition_output_dir, chunk_labels)
+        intersected_interval_dict = partition.get_intersected_intervals(chunk_to_intvervaldict)
+        
+        logger.info("Trimming partitioned XMFAs back to intersected intervals...")
+        num_clusters = partition.trim_xmfas(partition_output_dir, chunk_labels, intersected_interval_dict, args.threads)
+
+        logger.info(f"Merging trimmed XMFA files...")
+        partition.merge_xmfas(
+            outputDir, 
+            partition_output_dir, 
+            chunk_labels, 
+            num_clusters, 
+            args.threads, 
+            recomb_filter)
+
+
+    parsnp_output = f"{outputDir}/parsnp.xmfa"
+
+    if recomb_filter:
         #print("-->Getting list of LCBs.."
+        blocks_dir = os.path.join(outputDir, "blocks")
+        if not os.path.exists(blocks_dir):
+            os.mkdir(blocks_dir)
+
         allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
         blockfiles = []
         block_startpos = []
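For the partition sizing introduced above (full_partitions / effective_partition_size), a worked example with purely illustrative numbers — 25 retained genomes and a minimum partition size of 10 — yields chunks of 12, 12 and 1:

```python
import math

finalfiles = [f"genome_{i}.fna" for i in range(25)]  # hypothetical inputs
min_partition_size = 10                              # illustrative value

full_partitions = len(finalfiles) // min_partition_size        # 2
effective_partition_size = len(finalfiles) // full_partitions  # 12

chunks = [
    finalfiles[i * effective_partition_size:(i + 1) * effective_partition_size]
    for i in range(math.ceil(len(finalfiles) / effective_partition_size))
]
print([len(chunk) for chunk in chunks])  # [12, 12, 1]
```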
@@ -1567,15 +1642,10 @@ SETTINGS:
                     lf.close()
 
 
-        if xtrafast:
-            run_recomb_filter = 1
-        else:
-            run_recomb_filter = 0
-
         recombination_sites = {}
         bedfile = ""
         bedfile_dict = {}
-        if run_recomb_filter and len(blockfiles) > 0:
+        if recomb_filter and len(blockfiles) > 0:
             logger.info("Running PhiPack on LCBs to detect recombination...")
             bedfile = open(os.path.join(outputDir, "parsnp.rec"), 'w')
             tasks = []
@@ -1657,65 +1727,6 @@ SETTINGS:
                 bedfile.write(bedfile_dict[key])
             bedfile.close()
 
-    else:
-        import partition 
-        full_partitions = len(finalfiles) // args.min_partition_size
-        effective_partition_size = len(finalfiles) // full_partitions
-        logger.info(f"Setting the partition size to {effective_partition_size}")
-        partition_output_dir = f"{outputDir}/partition"
-        partition_list_dir = f"{partition_output_dir}/input-lists"
-        os.makedirs(partition_list_dir, exist_ok=True)
-        for partition_idx in range(math.ceil(len(finalfiles) / effective_partition_size)):
-            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{partition_idx:010}.txt", 'w') as part_out:
-                for qf in finalfiles[partition_idx*effective_partition_size : (partition_idx+1)*effective_partition_size]:
-                    part_out.write(f"{qf}\n")
-
-        chunk_label_parser = re.compile(f'{partition.CHUNK_PREFIX}-(.*).txt')
-        chunk_labels = []
-        for partition_list_file in os.listdir(partition_list_dir):
-            chunk_labels.append(chunk_label_parser.search(partition_list_file).groups()[0])
-
-        chunk_output_dirs = []
-        for cl in chunk_labels:
-            chunk_output_dir = f"{partition_output_dir}/{partition.CHUNK_PREFIX}-{cl}-out"
-            # os.makedirs(chunk_output_dir, exist_ok=True)
-            create_output_directory(chunk_output_dir)
-            chunk_output_dirs.append(chunk_output_dir)
-            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{cl}.txt", 'r') as partition_file:
-                partition_seq_files = [f for f in partition_file.read().splitlines()]
-
-            write_inifile_2(inifiled, chunk_output_dir, unaligned, args, auto_ref, query, partition_seq_files, ref, max(1, args.threads // len(chunk_labels)))
-
-        logger.info("Running partitions...")
-        good_chunks = set(chunk_labels)
-        with Pool(args.threads) as pool:
-            return_codes = tqdm(
-                pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), 
-                total=len(chunk_output_dirs), 
-                file=TqdmToLogger(logger,level=logging.INFO),
-                mininterval=MIN_TQDM_INTERVAL)
-            for cl, rc in zip(chunk_labels, return_codes):
-                if rc != 0:
-                    logger.error(f"Partition {cl} failed...")
-                    good_chunks.remove(cl)
-            
-        chunk_labels = sorted(list(good_chunks))
-
-        logger.info("Computing intersection of all partition LCBs...")
-        chunk_to_intvervaldict = partition.get_chunked_intervals(partition_output_dir, chunk_labels)
-        intersected_interval_dict = partition.get_intersected_intervals(chunk_to_intvervaldict)
-        
-        logger.info("Trimming partitioned XMFAs back to intersected intervals...")
-        num_clusters = partition.trim_xmfas(partition_output_dir, chunk_labels, intersected_interval_dict, args.threads)
-
-        xmfa_out_f =  f"{outputDir}/parsnp.xmfa"
-
-        logger.info(f"Merging trimmed XMFA files into a single XMFA at {xmfa_out_f}")
-        partition.merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads)
-
-
-    parsnp_output = f"{outputDir}/parsnp.xmfa"
-
     # This is the stuff for LCB extension:
     if args.extend_lcbs:
         logger.warning("The LCB extension module is experimental. Runtime may be significantly increased and extended alignments may not be as high quality as the original core-genome. Extensions off of existing LCBs are in a separate xmfa file.")
@@ -1740,7 +1751,8 @@ SETTINGS:
         partition.copy_header(orig_parsnp_xmfa, extended_parsnp_xmfa)
         with open(extended_parsnp_xmfa, 'a') as out_f:
             for lcb in new_lcbs:
-                partition.write_aln_to_xmfa(lcb, out_f)
+                partition.write_aln_to_fna(lcb, out_f)
+                out_f.write("=\n")
 
 
     # Generate MAF file
@@ -1760,7 +1772,7 @@ SETTINGS:
     # Harvest seems to fail sometimes when piping to stderr/stdout...
     run_command(command)
 
-    if run_recomb_filter:
+    if recomb_filter:
         command = "harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir)
         run_command(command)
 
@@ -1771,7 +1783,7 @@ SETTINGS:
 
     if generate_vcf:
         run_logged_command(
-            f"harvesttools -q -x {parsnp_output} -V {outputDir+os.sep}parsnp.vcf", 
+            f"harvesttools -q -i {outputDir}/parsnp.ggr -V {outputDir+os.sep}parsnp.vcf", 
             outputDir, label="harvest-vcf")
 
     if not args.skip_phylogeny:
@@ -1823,7 +1835,7 @@ SETTINGS:
 
         if len(use_gingr) > 0:
             logger.info("Creating Gingr input file..")
-            if xtrafast or 1:
+            if recomb_filter or 1:
                 #if newick available, add
                 #new flag to update branch lengths
                 run_command("harvesttools --midpoint-reroot -u -q -i "+outputDir+os.sep+"parsnp.ggr -o "+outputDir+os.sep+"parsnp.ggr -n %s"%(outputDir+os.sep+"parsnp.tree "))
@@ -1841,11 +1853,11 @@ SETTINGS:
     #cleanup
     rmfiles = glob(os.path.join(outputDir, "*.aln"))
     #rmfiles2 = glob.glob(outputDir+os.sep+"blocks/b*/*")
-    rmfiles3 = glob(os.path.join(outputDir, "blocks/b*"))
-    for f in rmfiles:
-        os.remove(f)
-    for f in rmfiles3:
-        shutil.rmtree(f)
+    # rmfiles3 = glob(os.path.join(outputDir, "blocks/b*"))
+    # for f in rmfiles:
+        # os.remove(f)
+    # for f in rmfiles3:
+        # shutil.rmtree(f)
 
     filepres = 0
     logger.info("Parsnp finished! All output available in %s"%(outputDir))
@@ -1865,8 +1877,8 @@ SETTINGS:
     # if filepres != 3:
         # logger.critical("Output files missing, something went wrong. Check logs and relaunch or contact developers for assistance")
 
-    if os.path.exists("%sblocks"%(outputDir+os.sep)):
-        os.rmdir("%sblocks"%(outputDir+os.sep))
+    # if os.path.exists("%sblocks"%(outputDir+os.sep)):
+        # os.rmdir("%sblocks"%(outputDir+os.sep))
     if os.path.exists("allmums.out"):
         os.remove("allmums.out")
 


=====================================
partition.py
=====================================
@@ -5,9 +5,11 @@ import re
 import bisect
 import tempfile
 import os
+import shutil
 import copy
 import math
 import logging
+from pathlib import Path
 from multiprocessing import Pool
 from functools import partial
 from collections import namedtuple, defaultdict, Counter
@@ -230,9 +232,9 @@ def copy_header(orig_xmfa: str, new_xmfa: str) -> None:
                 break
 
 
-def write_aln_to_xmfa(aln: MultipleSeqAlignment, out_handle: TextIO) -> None:
+def write_aln_to_fna(aln: MultipleSeqAlignment, out_handle: TextIO) -> None:
     """
-    Write the MultipleSeqAlignment to the output handle in xmfa format.
+    Write the MultipleSeqAlignment to the output handle in fna format.
     
     Args:
         aln :        MultipleSeqAlignment
@@ -244,8 +246,6 @@ def write_aln_to_xmfa(aln: MultipleSeqAlignment, out_handle: TextIO) -> None:
         out_handle.write(header)
         for i in range(math.ceil(len(rec.seq) / LINESIZE)):
             out_handle.write(str(rec.seq[i*LINESIZE:(i+1)*LINESIZE]) + "\n")
-    out_handle.write("=\n")
-
 
 
 def combine_header_info(xmfa_list: List[str]) \
@@ -609,7 +609,8 @@ def trim_single_xmfa(
             new_alns = trim(aln, intersected_interval_dict, seqidx=1, cluster_start=cluster_start)
             cluster_start += len(new_alns)
             for new_aln in new_alns:
-                write_aln_to_xmfa(new_aln, trimmed_out)
+                write_aln_to_fna(new_aln, trimmed_out)
+                trimmed_out.write("=\n")
     
     num_clusters = cluster_start - 1
     return (xmfa_file, num_clusters)
@@ -668,7 +669,7 @@ def merge_single_LCB(
     tmp_xmfa = f"{tmp_directory}/cluster-{cluster_idx}.temp" 
     with open(tmp_xmfa, 'w') as xmfa_out_handle:
         new_aln = merge_blocks(aln_xmfa_pairs, fidx_to_new_idx)
-        write_aln_to_xmfa(new_aln, xmfa_out_handle)
+        write_aln_to_fna(new_aln, xmfa_out_handle)
     return tmp_xmfa
 
 
@@ -680,11 +681,12 @@ def merge_single_LCB_star(idx_pairs_tuple, tmp_directory, fidx_to_new_idx):
 
 
 def merge_xmfas(
+    output_dir: str,
     partition_output_dir: str, 
     chunk_labels: List[str], 
-    xmfa_out_f: str, 
     num_clusters: int, 
-    threads: int=1) -> None:
+    threads: int=1,
+    write_blocks: bool=False) -> None:
     """
     Take all of the trimmed XMFA files and compile them into a single XMFA
 
@@ -696,6 +698,7 @@ def merge_xmfas(
         threads:                Number of threads to use.
     """
 
+    xmfa_out_f = f"{output_dir}/parsnp.xmfa" 
     xmfa_files = [f"{partition_output_dir}/{CHUNK_PREFIX}-{cl}-out/parsnp.xmfa.trimmed"
                   for cl in chunk_labels]
     seq_to_idx, fidx_to_new_idx = combine_header_info(xmfa_files)
@@ -727,5 +730,9 @@ def merge_xmfas(
                 tmp_xmfa = f"{tmp_directory}/cluster-{cluster_idx}.temp" 
                 with open(tmp_xmfa) as tx:
                     xmfa_out_handle.write(tx.read())
+                    xmfa_out_handle.write("=\n")
+                if write_blocks:
+                    Path(f"{output_dir}/blocks/b{cluster_idx}").mkdir(parents=True)
+                    shutil.move(tmp_xmfa, f"{output_dir}/blocks/b{cluster_idx}/seq.fna")
 
 


=====================================
src/parsnp.cpp
=====================================
@@ -582,7 +582,7 @@ void Aligner::writeOutput(string psnp,vector<float>& coveragerow)
         if(ct.type==1)
             total_clusters++;
     }
-    xmfafile << "#FormatVersion Parsnp v1.1" << endl;
+    xmfafile << "#FormatVersion Mauve" << endl;
     xmfafile << "#SequenceCount " << nnum << endl;
     for (int zz = 0; zz < nnum; zz++)
     {



View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/commit/ae0cd21ba3ad694a56c764d4a3393aa0ba5f93de
