[med-svn] [Git][med-team/parsnp][upstream] New upstream version 1.7.2+dfsg

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Thu Apr 14 16:24:22 BST 2022



Étienne Mollier pushed to branch upstream at Debian Med / parsnp


Commits:
3a7f0984 by Étienne Mollier at 2022-04-14T16:06:35+02:00
New upstream version 1.7.2+dfsg
- - - - -


5 changed files:

- configure.ac
- + extend.py
- + logger.py
- parsnp
- src/parsnp.cpp


Changes:

=====================================
configure.ac
=====================================
@@ -1,5 +1,5 @@
-AC_INIT(parsnp,1.6.1)
-AM_INIT_AUTOMAKE(parsnp,1.6.1)
+AC_INIT([parsnp],[1.6.1])
+AM_INIT_AUTOMAKE([-Wall])
 AC_PROG_CC(gcc)
 
 AC_ARG_WITH(libmuscle, [  --with-libmuscle=<path/to/libmuscle>     libMUSCLE install dir (default: `pwd`/muscle)])


=====================================
extend.py
=====================================
@@ -0,0 +1,391 @@
+from Bio import AlignIO, SeqIO
+from Bio.Seq import Seq
+from Bio.SeqIO import SeqRecord
+from Bio.Align import MultipleSeqAlignment
+from glob import glob
+import tempfile
+from pathlib import Path
+import re
+from collections import namedtuple, defaultdict, Counter
+import os
+from Bio.Align import substitution_matrices
+from itertools import product, combinations
+import numpy as np
+from Bio.AlignIO.MafIO import MafWriter, MafIterator
+from Bio.AlignIO.MauveIO import MauveWriter
+from logger import logger
+
+
+#%%
+def get_sequence_data(reference, sequence_files, index_files=False):
+    fname_to_seqrecord = {}
+    fname_contigid_to_length = {}
+    fname_contigidx_to_header = {}
+    fname_header_to_gcontigidx = {}
+    idx = 0
+    for fasta_f in [reference] + sequence_files:
+        fname = Path(fasta_f).name
+        if index_files:
+            fname_to_seqrecord[fname] = SeqIO.index(fasta_f, 'fasta')
+            for contig_id, record in fname_to_seqrecord[fname].items():
+                fname_contigid_to_length[fname, contig_id] = len(record)
+        for ctg_idx, record in enumerate(SeqIO.parse(fasta_f, 'fasta')):
+            fname_header_to_gcontigidx[fname, record.id] = idx
+            idx += 1
+            fname_contigidx_to_header[fname, ctg_idx] = record.id
+            fname_contigid_to_length[fname, record.id] = len(record)
+    return fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx
+
+#%%
+# Writes the parsnp output to a MAF file. While there are methods in
+# HarvestTools that do this, they don't account for multiple contigs,
+# which is support we would like to add in the future.
+def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_contigid_to_length):
+    index_to_fname = {}
+    # Avoid loading the whole alignment in memory
+    with open(xmfa_file) as parsnp_fd, tempfile.SpooledTemporaryFile(mode='w') as fasta_out:
+        for line in (line.strip() for line in parsnp_fd):
+            if line[:2] == "##":
+                if line.startswith("##SequenceIndex"):
+                    index = int(line.split(' ')[1])
+                elif line.startswith("##SequenceFile"):
+                    filename = Path(line.split(' ')[1]).name
+                    index_to_fname[index] = filename
+            elif line[0] != '=':
+                fasta_out.write(line + "\n")
+        fasta_out.seek(0)
+        xmfa_seqs = SeqIO.parse(fasta_out, "fasta")
+
+
+        curr_cluster_seqs = []
+        curr_cluster_idx = 0
+        with open(output_maf_file, 'w') as maf_out:
+            writer = MafWriter(maf_out)
+            writer.write_header()
+        header_parser = re.compile(r"^\s?(\d+):(\d+)-(\d+)\s+([+,-])\s+cluster(\d+)\s+s?(\d+)?:p(\d+)")
+
+        # Stores coordinates of aligned regions on the FORWARD strand
+        fname_to_contigid_to_coords = defaultdict(lambda: defaultdict(list))
+        for seqrecord in xmfa_seqs:
+            full_header = seqrecord.description
+            groups = header_parser.search(full_header).groups()
+            file_idx, gstart, gend, strand, cluster_idx, contig_idx, position_in_contig = groups
+            cluster_idx = int(cluster_idx)
+            if cluster_idx != curr_cluster_idx:
+                if len(curr_cluster_seqs) != 0:
+                    with open(output_maf_file, 'a') as maf_out:
+                        writer = MafWriter(maf_out)
+                        msa_record = MultipleSeqAlignment(curr_cluster_seqs)
+                        msa_record._annotations={"pass": curr_cluster_idx}
+                        writer.write_alignment(msa_record)
+                    curr_cluster_seqs = []
+            curr_cluster_idx = cluster_idx
+
+            # There are some Parsnp indexing situations that need to be corrected.
+            if not contig_idx:
+                contig_idx = 0
+                start_coord = int(position_in_contig) - 1
+            else:
+                start_coord = int(position_in_contig)
+                contig_idx = int(contig_idx) - 1
+            if int(file_idx) == 1 and contig_idx != 0:
+                start_coord -= 1
+
+            # Add annotations to the seqrecord in order to pass to mafwriter
+            fname = index_to_fname[int(file_idx)]
+            contig_id = fname_contigidx_to_header[fname, contig_idx]
+            src_size = fname_contigid_to_length[fname, contig_id]
+            size = len(str(seqrecord.seq).replace("-", ''))
+            seqrecord.id = f"{fname}:<-file:contig->:{contig_id}"
+            seqrecord.annotations["strand"] = 1 if strand == "+" else -1
+            seqrecord.annotations["srcSize"] = src_size
+            seqrecord.annotations["size"] = size
+            # Parsnp coordinates are always on the fwd strand, even for
+            # revcomp hits; we adjust for that here.
+            seqrecord.annotations["start"] = start_coord if strand == "+" else src_size - start_coord
+            curr_cluster_seqs.append(seqrecord)
+
+            # In order to keep track of inter-cluster regions, we also would like
+            # to have the coordinates of all aligned clusters on the fwd strand
+            if strand == "-":
+                end_coord = start_coord
+                start_coord = start_coord - size
+            else:
+                end_coord = start_coord + size
+            fname_to_contigid_to_coords[fname][contig_id].append((start_coord, end_coord, strand, cluster_idx))
+
+        # Write remaining sequences
+        if len(curr_cluster_seqs) != 0:
+            with open(output_maf_file, 'a') as maf_out:
+                writer = MafWriter(maf_out)
+                msa_record = MultipleSeqAlignment(curr_cluster_seqs)
+                msa_record._annotations={"pass": curr_cluster_idx}
+                writer.write_alignment(msa_record)
+    return fname_to_contigid_to_coords
+#%%
+def write_intercluster_regions(input_sequences, cluster_directory, fname_to_contigid_to_coords):
+    clusterdir_to_adjacent_clusters = defaultdict(set)
+    fname_contigid_to_cluster_dir_to_length = defaultdict(lambda: defaultdict(int))
+    fname_contigid_to_cluster_dir_to_adjacent_cluster = defaultdict(dict)
+    for f in glob(f"{cluster_directory}/*.fa*"):
+        os.remove(f)
+    for fasta_f in input_sequences:
+        records = SeqIO.parse(fasta_f, 'fasta')
+        fname = Path(fasta_f).name
+        for record in records:
+            coords = [(0, 0, "+", "START CAP")] + sorted(fname_to_contigid_to_coords[fname][record.id]) + [(len(record), len(record), "+", "END CAP")]
+            for idx, (start, end, strand, cluster_idx) in enumerate(coords[1:-1]):
+                idx += 1
+                prev_end = coords[idx-1][1]
+                next_start = coords[idx+1][0]
+                fwd, rev = "right", "left"
+                seq = record.seq
+                if strand == "-":
+                    pass
+                    fwd, rev = rev, fwd
+                    # seq = seq.reverse_complement()
+                fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, rev)] = start - prev_end
+                fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, fwd)] = next_start - end
+                fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, rev)] = (coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left")
+                fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, fwd)] = (coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right")
+                clusterdir_to_adjacent_clusters[(cluster_idx, fwd)].add((coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right"))
+                clusterdir_to_adjacent_clusters[(cluster_idx, rev)].add((coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left"))
+                seq_to_write = SeqRecord(seq[prev_end:start].reverse_complement(), id=f"{fname}:<-file:contig->:{record.id}", description="")
+                with open(f"{cluster_directory}/cluster{cluster_idx}-{rev}.fasta", 'a') as out_f:
+                    SeqIO.write([seq_to_write], out_f, "fasta")
+                seq_to_write = SeqRecord(seq[end:next_start], id=f"{fname}:<-file:contig->:{record.id}", description="")
+                with open(f"{cluster_directory}/cluster{cluster_idx}-{fwd}.fasta", 'a') as out_f:
+                    SeqIO.write([seq_to_write], out_f, "fasta")
+    return fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster
+#%%
+def get_new_extensions(cluster_files, match_score, mismatch_score, gap_penalty):
+    # mat = substitution_matrices.load("NUC.4.4")
+    # mismatch_multiplier = 1
+    # gap_match = -1
+    fname_parser = re.compile(r"^.*/cluster(\d+)-(.*).fasta")
+    clusterdir_len = defaultdict(lambda: defaultdict(int))
+    clusterdir_expand = {}
+    for cluster_f in cluster_files:
+        cluster_idx, direction = fname_parser.match(cluster_f).groups()
+        cluster_idx = int(cluster_idx)
+        sequences = list(SeqIO.parse(cluster_f, 'fasta'))
+        lengths = Counter(len(s) for s in sequences)
+        idx = 0
+        score_list = [0]
+        clusterdir_len[(cluster_idx, direction)] = max(lengths)
+        while idx < max(lengths):
+            bases = Counter(seq[idx].upper() if len(seq) > idx else "-" for seq in sequences if seq)
+            score = 0
+            for n1, n2 in combinations(bases.keys(), r=2):
+                if n1 == "-" or n2 == "-":
+                    score += gap_penalty * bases[n1] * bases[n2]
+                else:
+                    score += mismatch_score * bases[n1] * bases[n2]
+            for nuc in bases.keys():
+                if nuc == "-":
+                    continue
+                else:
+                    score += match_score * bases[nuc] * (bases[nuc] - 1) / 2
+            score_list.append(score_list[-1] + score)
+            idx += 1
+
+        extend_by = np.argmax(score_list)
+        # extend_by = lengths.most_common()[0][0] if len(lengths) == 1 else (lengths.most_common()[0][0] if lengths.most_common()[0][0] != 0 else lengths.most_common()[1][0])
+        # print(lengths)
+        # extend_by = min(extend_by, 100)
+        clusterdir_expand[(cluster_idx, direction)] = extend_by
+    return clusterdir_expand, clusterdir_len
+
+#%%
+
+def write_xmfa_header(extended_xmfa_file, fname_header_to_gcontigidx, fname_contigid_to_length, num_clusters):
+
+    with open(extended_xmfa_file, 'w') as xmfa_out:
+        ### Write header
+        xmfa_out.write("#FormatVersion Parsnp v1.1\n")
+        xmfa_out.write(f"#SequenceCount {len(fname_header_to_gcontigidx)}\n")
+        for (fname, header), global_idx in fname_header_to_gcontigidx.items():
+            xmfa_out.write(f"##SequenceIndex {global_idx + 1}\n")
+            xmfa_out.write(f"##SequenceFile {fname}\n")
+            xmfa_out.write(f"##SequenceHeader >{header}\n")
+            xmfa_out.write(f"##SequenceLength {fname_contigid_to_length[(fname, header)]}\n")
+        xmfa_out.write(f"##IntervalCount {num_clusters}\n")
+
+
+def write_xmfa_cluster(extended_xmfa_file, msa_records, fname_header_to_gcontigidx):
+    with open(extended_xmfa_file, 'a') as xmfa_out:
+        ### Write MSA record
+        header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+        for msa_record in msa_records:
+            cluster = msa_record.annotations['cluster']
+            for seq_record in msa_record:
+                fname, contig_id = header_parser.match(seq_record.id).groups()
+
+                strand = "-" if seq_record.annotations["strand"] == -1 else "+"
+                start = seq_record.annotations['start']
+                size = seq_record.annotations["size"]
+                if strand == "+":
+                    seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start}-{start + size}"
+                else:
+                    seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start - size}-{start}"
+                seq_record.description = f"{strand} cluster{cluster} s1:p{start}"
+            SeqIO.write(msa_record, xmfa_out, "fasta")
+        xmfa_out.write("=\n")
+
+
+
+# fname_to_contigid_to_extended = defaultdict(lambda: defaultdict(list))
+# fname_to_contigid_to_coords[fname][contig_id].append((seqrecord.annotations["start"], seqrecord.annotations["start"] + seqrecord.annotations["size"], strand, cluster_idx))
+def write_extended_xmfa(
+        original_maf_file,
+        extended_maf_file,
+        cluster_directory,
+        clusterdir_expand,
+        clusterdir_len,
+        fname_contigid_to_cluster_dir_to_length,
+        fname_contigid_to_cluster_dir_to_adjacent_cluster,
+        fname_header_to_gcontigidx,
+        fname_contigid_to_length):
+    header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+    clusters = set()
+    for fname, contigid in fname_contigid_to_cluster_dir_to_length.keys():
+        for cluster, _ in fname_contigid_to_cluster_dir_to_length[(fname, contigid)].keys():
+            clusters.add(cluster)
+    write_xmfa_header(extended_maf_file, fname_header_to_gcontigidx, fname_contigid_to_length, len(clusters))
+    with open(original_maf_file, 'r') as maf_in:
+        maf_iterator = MafIterator(maf_in)
+        # maf_writer = MafWriter(maf_out)
+        # maf_writer = MauveWriter(maf_out)
+        # maf_writer.write_header()
+        for idx, msa_record in enumerate(maf_iterator):
+            fname, contig_id = header_parser.match(msa_record[0].id).groups()
+            cluster_idx = int(msa_record._annotations["pass"])
+            for direction in ("right", "left"):
+                expand_by = clusterdir_expand[(cluster_idx, direction)]
+                expand_by = min(
+                    expand_by, 
+                    max(cldir_to_len[(cluster_idx, direction)] for cldir_to_len in fname_contigid_to_cluster_dir_to_length.values())
+                ) 
+                # expand_by = min(expand_by, 7)
+                # expand_by = 10
+                # for seq_record in msa_record:
+                #     fname, contig_id, cluster_idx = header_parser.match(seq_record.id).groups()
+                #     cluster_idx = int(cluster_idx)
+                #     expand_by = min(expand_by, fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)])
+                if expand_by <= 0:
+                    continue
+                logger.debug(f"Expanding cluster {cluster_idx} to the {direction} by {expand_by}")
+                inter_cluster_sequences = SeqIO.index(f"{cluster_directory}/cluster{cluster_idx}-{direction}.fasta", 'fasta')
+                for seq_record in msa_record:
+                    fname, contig_id = header_parser.match(seq_record.id).groups()
+                    cluster_idx = int(cluster_idx)
+                    flanking_seq = inter_cluster_sequences[f"{fname}:<-file:contig->:{contig_id}"]
+                    flanking_seq = flanking_seq[:fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)]]
+                    if len(flanking_seq) >= expand_by:
+                        flanking_seq.seq = flanking_seq.seq[:expand_by]
+                    if direction == "left":
+                        flanking_seq.seq = flanking_seq.seq.reverse_complement()
+                        flanking_seq.seq = "-"*(expand_by - len(flanking_seq)) + flanking_seq.seq
+                    else:
+                        flanking_seq.seq = flanking_seq.seq + "-"*(expand_by - len(flanking_seq))
+                    seq_record.annotations["size"] += len(str(flanking_seq.seq).replace("-", ''))
+                    if direction == "right":
+                        seq_record.seq = seq_record.seq + flanking_seq.seq
+                    else:
+                        seq_record.seq = flanking_seq.seq + seq_record.seq
+                        seq_record.annotations["start"] -= (expand_by - flanking_seq.seq.count("-"))
+                    adj_cluster_idx, adj_cluster_dir = fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, contig_id)][(cluster_idx, direction)]
+                    fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(adj_cluster_idx, adj_cluster_dir)] -= len(str(flanking_seq.seq).replace("-", ''))
+
+
+
+            # for seq_record in msa_record:
+            #     fname, contig_id = header_parser.match(seq_record.id).groups()
+                # seq_record.id = contig_id
+            msa_record.annotations["cluster"] = cluster_idx
+            write_xmfa_cluster(extended_maf_file, [msa_record], fname_header_to_gcontigidx)
+            # maf_writer.write_alignment(msa_record)
+
+#%%
+def check_maf(maf_file, fname_to_seqrecords):
+    contig_to_coords_test = defaultdict(list)
+    header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+    with open(maf_file, 'r') as maf_in:
+        maf_iterator = MafIterator(maf_in)
+        for msa_record in maf_iterator:
+            for seq_record in msa_record:
+                strand = seq_record.annotations["strand"]
+                coord_start = seq_record.annotations["start"]  if seq_record.annotations["strand"] == 1 else  seq_record.annotations["srcSize"] - seq_record.annotations["start"]
+                start = seq_record.annotations["start"]
+                end = start + seq_record.annotations["size"]
+                if strand == -1:
+                    coord_end = coord_start - seq_record.annotations["size"]
+                    coord_start, coord_end = coord_end, coord_start
+                else:
+                    coord_end = end
+                fname, contig_id = header_parser.match(seq_record.id).groups()
+                contig_to_coords_test[(fname, contig_id)].append((coord_start, coord_end, seq_record.annotations["strand"]))
+                # if len(seq_record) > 20:
+                #     print(seq_record.id.split(":")[0], seq[:15],"...", seq[-15:])
+                # else:
+                #     print(seq_record.id.split(":")[0], seq)
+                block_seq = str(seq_record.seq).lower().replace("-", '').replace("n", ".")
+                original_seq = fname_to_seqrecords[fname][contig_id].seq
+                if strand == 1:
+                    match_pos = re.search(block_seq, str(original_seq), re.IGNORECASE)
+                else:
+                    original_seq = str(original_seq.reverse_complement())
+                    match_pos = re.search(block_seq, original_seq, re.IGNORECASE)
+                match_pos = match_pos.start() if match_pos else -1
+                if start != match_pos:
+                    print(start, "!=", match_pos)
+                    print(seq_record.id)
+                    print(seq_record.annotations)
+                    print(block_seq)
+                    return
+    for key in contig_to_coords_test:
+        contig_to_coords_test[key] = sorted(contig_to_coords_test[key])
+    return contig_to_coords_test
+
+def validate_coords(contig_to_coords):
+    for contig, coords in contig_to_coords.items():
+        prev_end = -1
+        for start, end, strand in coords:
+            if end <= prev_end:
+                print(f"{contig} has invalid coordinates!")
+                print(coords)
+                break
+            prev_end = end
+
+# coords_extended = check_maf("parsnp-extended.maf")
+# validate_coords(coords_extended)
+# coords_original = check_maf("parsnp.maf")
+#%%
+# validate = True
+# genome_dir = "sars-100-rev/"
+# sequence_files = glob(f"{genome_dir}/*.fa*")
+# xmfa_file = f"{genome_dir}/parsnp.xmfa"
+# original_maf_file = f"{genome_dir}/parsnp-original.maf"
+# extended_maf_file = f"{genome_dir}/parsnp-extended.maf"
+# cluster_directory = "clusters/"
+
+# fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = get_sequence_data(sequence_files, index_files=validate)
+# fname_to_contigid_to_coords = xmfa_to_maf(xmfa_file, original_maf_file, fname_contigidx_to_header, fname_contigid_to_length)
+# packed_write_result = write_intercluster_regions(cluster_directory, fname_to_contigid_to_coords)
+# fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
+# cluster_files = glob(f"{cluster_directory}/*.fasta")
+# clusterdir_expand, clusterdir_len = get_new_extensions(cluster_files)
+# write_extended_maf(
+        # original_maf_file,
+        # extended_maf_file,
+        # cluster_directory,
+        # clusterdir_expand,
+        # clusterdir_len,
+        # fname_contigid_to_cluster_dir_to_length,
+        # fname_contigid_to_cluster_dir_to_adjacent_cluster)
+
+# if validate:
+    # coords_extended = check_maf(extended_maf_file, fname_to_seqrecord)
+    # validate_coords(coords_extended)
+    # coords_original = check_maf(original_maf_file, fname_to_seqrecord)
+    # validate_coords(coords_original)
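
For readers following the new extension code: get_new_extensions() above scores each candidate extension column with a sum-of-pairs scheme and extends by the prefix length whose cumulative score is highest. A minimal standalone sketch of that scoring (not part of the commit; the helper name and toy sequences are invented for illustration):

    from collections import Counter
    from itertools import combinations

    def best_extension(seqs, match=5, mismatch=-4, gap=-2):
        # Treat positions past the end of a shorter sequence as gaps, score
        # each column as a sum of pairs, and extend by the prefix with the
        # highest cumulative score.
        max_len = max(len(s) for s in seqs)
        cumulative = [0]
        for idx in range(max_len):
            bases = Counter(s[idx].upper() if idx < len(s) else "-" for s in seqs)
            score = 0
            for n1, n2 in combinations(bases, r=2):
                score += (gap if "-" in (n1, n2) else mismatch) * bases[n1] * bases[n2]
            for nuc, count in bases.items():
                if nuc != "-":
                    score += match * count * (count - 1) / 2
            cumulative.append(cumulative[-1] + score)
        # First index of the best cumulative score = number of columns to
        # extend by, mirroring np.argmax(score_list) above.
        return max(range(len(cumulative)), key=cumulative.__getitem__)

    print(best_extension(["ACGTAA", "ACGT", "ACGTTTTT"]))  # -> 4, stops where columns diverge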


=====================================
logger.py
=====================================
@@ -0,0 +1,55 @@
+import logging
+############################################# Logging ##############################################
+BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
+# These are the escape sequences needed to get colored output
+CSI=""#"\x1B["
+BOLDME = ""#CSI+'\033[1m'
+STATUS_BLUE = ""#CSI+'\033[94m'
+OK_GREEN = ""#CSI+'\033[92m'#'32m'
+SKIP_GRAY = ""#CSI+'\033[37m'
+WARNING_YELLOW = ""#CSI+'\033[93m'
+ERROR_RED = ""#CSI+'\033[91m'
+ENDC = ""#CSI+'0m'
+RESET_SEQ = "\033[0m"
+COLOR_SEQ = "\033[1;%dm"
+BOLD_SEQ = "\033[1m"
+
+def formatter_message(message, use_color = True):
+    if use_color:
+        message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
+    else:
+        message = message.replace("$RESET", "").replace("$BOLD", "")
+    return message
+
+COLORS = {
+    'DEBUG': BLUE,
+    'INFO': WHITE,
+    'WARNING': YELLOW,
+    'ERROR': RED,
+    'CRITICAL': RED
+}
+
+class ColoredFormatter(logging.Formatter):
+    def __init__(self, msg, datefmt = None, use_color = True):
+        logging.Formatter.__init__(self, fmt=msg, datefmt=datefmt)
+        self.use_color = use_color
+
+    def format(self, record):
+        levelname = record.levelname
+        if self.use_color and levelname in COLORS:
+            levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
+            record.levelname = levelname_color
+        return logging.Formatter.format(self, record)
+
+# Instantiate logger
+logger = logging.getLogger("Parsnp")
+logger.setLevel(logging.INFO)
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+# create formatter and add it to the handlers
+formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s',
+                             datefmt="%H:%M:%S")
+ch.setFormatter(formatter)
+# add the handlers to the logger
+logger.addHandler(ch)
+####################################################################################################
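
A minimal usage sketch (not part of the commit): the point of splitting this module out of the parsnp script is that parsnp and extend.py can share the single preconfigured "Parsnp" logger instead of each wiring up its own handlers.

    from logger import logger

    logger.info("aligning genomes...")   # shown: handler level is INFO
    logger.debug("per-cluster details")  # suppressed at the default level
    logger.warning("Not enough SNPs to use RaxML.")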


=====================================
parsnp
=====================================
@@ -1,20 +1,25 @@
 #!/usr/bin/env python
 # See the LICENSE file included with this software for license information.
 
-import os, sys, string, getopt, random,subprocess, time, glob,operator, math, datetime,numpy #pysam
+import os, sys, string, getopt, random,subprocess, time,operator, math, datetime,numpy #pysam
 from collections import defaultdict
 import csv
-import tempfile
 import shutil
+from tempfile import TemporaryDirectory
 import re
 import logging
+from logger import logger
 import multiprocessing
 import argparse
 import signal
 import inspect
 from multiprocessing import *
+from Bio import SeqIO
+from glob import glob
 
-__version__ = "1.6.2"
+import extend as ext
+
+__version__ = "1.7.2"
 reroot_tree = True #use --midpoint-reroot
 
 try:
@@ -37,60 +42,6 @@ TOTSEQS=0
 PARSNP_DIR = sys.path[0]
 
 
-############################################# Logging ##############################################
-BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
-#These are the sequences need to get colored ouput
-CSI=""#"\x1B["
-BOLDME = ""#CSI+'\033[1m'
-STATUS_BLUE = ""#CSI+'\033[94m'
-OK_GREEN = ""#CSI+'\033[92m'#'32m'
-SKIP_GRAY = ""#CSI+'\033[37m'
-WARNING_YELLOW = ""#CSI+'\033[93m'
-ERROR_RED = ""#CSI+'\033[91m'
-ENDC = ""#CSI+'0m'
-RESET_SEQ = "\033[0m"
-COLOR_SEQ = "\033[1;%dm"
-BOLD_SEQ = "\033[1m"
-
-def formatter_message(message, use_color = True):
-    if use_color:
-        message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
-    else:
-        message = message.replace("$RESET", "").replace("$BOLD", "")
-    return message
-
-COLORS = {
-    'DEBUG': BLUE,
-    'INFO': WHITE,
-    'WARNING': YELLOW,
-    'ERROR': RED,
-    'CRITICAL': RED
-}
-
-class ColoredFormatter(logging.Formatter):
-    def __init__(self, msg, datefmt = None, use_color = True):
-        logging.Formatter.__init__(self, fmt=msg, datefmt=datefmt)
-        self.use_color = use_color
-
-    def format(self, record):
-        levelname = record.levelname
-        if self.use_color and levelname in COLORS:
-            levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
-            record.levelname = levelname_color
-        return logging.Formatter.format(self, record)
-
-# Instantiate logger
-logger = logging.getLogger("Parsnp")
-logger.setLevel(logging.INFO)
-ch = logging.StreamHandler()
-ch.setLevel(logging.INFO)
-# create formatter and add it to the handlers
-formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s',
-                             datefmt="%H:%M:%S")
-ch.setFormatter(formatter)
-# add the handlers to the logger
-logger.addHandler(ch)
-####################################################################################################
 
 
 ########################################### Environment ############################################
@@ -296,7 +247,7 @@ def is_valid_file_path(parser, arg):
 def is_valid_dir(parser, arg):
     if not os.path.exists(arg):
         parser.error( "The directory %s does not exist\n" % (arg))
-    if len(glob.glob("%s/*"%(arg))) == 0:
+    if len(glob("%s/*"%(arg))) == 0:
         parser.error("The director %s is empty"%(arg))
 
 
@@ -469,6 +420,26 @@ def parse_args():
             # "--filter-phipack-snps",
             # action = "store_true",
             # help = "Enable filtering of SNPs located in PhiPack identified regions of recombination")
+    extend_args = parser.add_argument_group("LCB Extension")
+    extend_args.add_argument(
+        "--extend-lcbs",
+        action="store_true",
+        help="Extend the boundaries of LCBs with an ungapped alignment")
+    extend_args.add_argument(
+        "--match-score",
+        type=float,
+        default=5,
+        help="Value of match score for extension")
+    extend_args.add_argument(
+        "--mismatch-penalty",
+        type=float,
+        default=-4,
+        help="Value of mismatch score for extension (should be negative)")
+    extend_args.add_argument(
+        "--gap-penalty",
+        type=float,
+        default=-2,
+        help="Value of gap penalty for extension (should be negative)")
 
     misc_args = parser.add_argument_group("Misc")
     misc_args.add_argument(
@@ -540,8 +511,7 @@ def parse_args():
 
 if __name__ == "__main__":
     t1 = time.time()
-    sys.stderr.write( BOLDME+"|--Parsnp %s--|\n"%(VERSION))
-    sys.stderr.write( BOLDME+"For detailed documentation please see --> http://harvest.readthedocs.org/en/latest\n")
+    logger.info(f"|--Parsnp {VERSION}--|\n")
 
 
     parsnp_dir= sys.path[0]
@@ -650,17 +620,18 @@ if __name__ == "__main__":
     if len(input_files) < 2:
         logger.critical("Less than 2 input sequences provided...")
         sys.exit(1)
-    if args.validate_input:
-        from Bio import SeqIO
-        for f in input_files + ([ref] if ref and ref != "!" else []):
-            try:
-                records = SeqIO.parse(f, "fasta")
-            except:
-                logger.error("{} is an invalid sequence file!".format(f))
-            for record in records:
-                if any(c not in "GATCRYWSMKHBVDN" + "GATCRYWSMKHBVDN".lower() for c in record.seq):
-                    logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(record.seq)) - set("AGCTNagctn")))
-                    continue
+    for f in input_files + ([ref] if ref and ref != "!" else []):
+        try:
+            records = list(SeqIO.parse(f, "fasta"))
+        except:
+            logger.error("{} is an invalid sequence file!".format(f))
+        if args.extend_lcbs and len(records) > 1:
+            logger.error("Extending LCBs does not currently work with multi-contig inputs yet")
+            sys.exit(1)
+        for record in records:
+            if any(c not in "GATCRYWSMKHBVDN" + "GATCRYWSMKHBVDN".lower() for c in record.seq):
+                logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(record.seq)) - set("AGCTNagctn")))
+                continue
 
     # Parse reference if necessary
     if ref and ref != "!":
@@ -884,28 +855,34 @@ SETTINGS:
         # fnafiles.remove(ref)
 
     #sort reference by largest replicon to smallest
-    if sortem and os.path.exists(ref) and not autopick_ref:
-        logger.debug("Sorting reference replicons")
-        ff = open(ref, 'r')
-        seqs = ff.read().split(">")[1:]
-        seq_dict = {}
-        seq_len = {}
-        for seq in seqs:
-            try:
-                hdr, seq = seq.split("\n",1)
-            except ValueError:
-                # TODO Why do we ignore when theres a header but no sequence?
-                continue
-            seq_dict[hdr] = seq
-            seq_len[hdr] = len(seq) - seq.count('\n')
-        seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
-        ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
-        ffo = open(ref, 'w')
-        for hdr, seq in seq_len_sort:
-            ffo.write(">%s\n"%(hdr))
-            ffo.write("%s"%(seq_dict[hdr]))
-        ff.close()
-        ffo.close()
+    if ref in fnafiles:
+        fnafiles = [f for f in fnafiles if f != ref]
+    elif sortem and os.path.exists(ref) and not autopick_ref:
+        sequences = SeqIO.parse(ref, "fasta")
+        new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
+        SeqIO.write(sequences, new_ref, "fasta")
+        ref = new_ref
+        # logger.debug("Sorting reference replicons")
+        # ff = open(ref, 'r')
+        # seqs = ff.read().split(">")[1:]
+        # seq_dict = {}
+        # seq_len = {}
+        # for seq in seqs:
+            # try:
+                # hdr, seq = seq.split("\n",1)
+            # except ValueError:
+                # # TODO Why do we ignore when theres a header but no sequence?
+                # continue
+            # seq_dict[hdr] = seq
+            # seq_len[hdr] = len(seq) - seq.count('\n')
+        # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
+        # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
+        # ffo = open(ref, 'w')
+        # for hdr, seq in seq_len_sort:
+            # ffo.write(">%s\n"%(hdr))
+            # ffo.write("%s"%(seq_dict[hdr]))
+        # ff.close()
+        # ffo.close()
     else:
         ref = genbank_ref
 
@@ -948,7 +925,7 @@ SETTINGS:
     if os.path.exists(os.path.join(outputDir, "alltogether.fasta")):
         os.remove(os.path.join(outputDir, "alltogether.fasta"))
     if os.path.exists(os.path.join(outputDir, "blocks/b1")):
-        ftrm = glob.glob(os.path.join(outputDir, "blocks/b*"))
+        ftrm = glob(os.path.join(outputDir, "blocks/b*"))
         for f in ftrm:
             shutil.rmtree(f)
 
@@ -1240,7 +1217,7 @@ Please verify recruited genomes are all strain of interest""")
     t2 = time.time()
     elapsed = float(t2)-float(t1)
     #print("-->Getting list of LCBs.."
-    allbfiles = glob.glob(os.path.join(blocks_dir, "b*/*"))
+    allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
     blockfiles = []
     block_startpos = []
     block_dict = {}
@@ -1357,7 +1334,7 @@ Please verify recruited genomes are all strain of interest""")
         pool.close()
         pool.join()
         brkeys = list(bedfile_dict.keys())
-        brkeys.sort()
+        brkeys.sort() 
         for key in brkeys:
             bedfile.write(bedfile_dict[key])
         bedfile.close()
@@ -1366,15 +1343,52 @@ Please verify recruited genomes are all strain of interest""")
 
     annotation_dict = {}
     #TODO always using xtrafast?
+    parsnp_output = f"{outputDir}/parsnp.xmfa"
+    if args.extend_lcbs:
+        xmfa_file = f"{outputDir}/parsnp.xmfa"
+        with TemporaryDirectory() as temp_directory:
+            original_maf_file = f"{temp_directory}/parsnp-original.maf"
+            extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa"
+            fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx = ext.get_sequence_data(
+                    ref,
+                    finalfiles, 
+                    index_files=False)
+            fname_to_contigid_to_coords = ext.xmfa_to_maf(
+                    xmfa_file, 
+                    original_maf_file, 
+                    fname_contigidx_to_header, 
+                    fname_contigid_to_length)
+            packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
+            fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
+            cluster_files = glob(f"{temp_directory}/*.fasta")
+            clusterdir_expand, clusterdir_len = ext.get_new_extensions(
+                    cluster_files,
+                    args.match_score,
+                    args.mismatch_penalty,
+                    args.gap_penalty)
+            ext.write_extended_xmfa(
+                    original_maf_file,
+                    extended_xmfa_file,
+                    temp_directory,
+                    clusterdir_expand,
+                    clusterdir_len,
+                    fname_contigid_to_cluster_dir_to_length,
+                    fname_contigid_to_cluster_dir_to_adjacent_cluster,
+                    fname_header_to_gcontigidx,
+                    fname_contigid_to_length)
+            parsnp_output = extended_xmfa_file
+            os.remove(original_maf_file)
+
+
     if xtrafast or 1:
         #add genbank here, if present
         if len(genbank_ref) != 0:
-            rnc = "harvesttools -q -o %s/parsnp.ggr -x "%(outputDir)+outputDir+os.sep+"parsnp.xmfa"
+            rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"
             for file in genbank_files:
                 rnc += " -g %s " %(file)
             run_command(rnc)
         else:
-            run_command("harvesttools -q -o %s/parsnp.ggr -f %s -x "%(outputDir,ref)+outputDir+os.sep+"parsnp.xmfa")
+            run_command(f"harvesttools -q -o {outputDir}/parsnp.ggr -f {ref} -x {parsnp_output}")
 
         if run_recomb_filter:
             run_command("harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir))
@@ -1387,14 +1401,14 @@ Please verify recruited genomes are all strain of interest""")
 
     if not args.skip_phylogeny:
         logger.info("Reconstructing core genome phylogeny...")
-        with open(os.path.join(outputDir, "parsnp.snps.mblocks")) as mblocks:
-            for line in mblocks:
-                if line[0] != ">" and len(line.rstrip()) < 6:
-                    logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
-                    use_fasttree = True
-                    break
+        mblocks_seqs = SeqIO.parse(os.path.join(outputDir, "parsnp.snps.mblocks"), "fasta")
+        for seq in mblocks_seqs:
+            if len(seq) < 6:
+                logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
+                use_fasttree = True
+                break
         if not use_fasttree:
-            with tempfile.TemporaryDirectory() as raxml_output_dir:
+            with TemporaryDirectory() as raxml_output_dir:
                 command = "raxmlHPC-PTHREADS -m GTRCAT -p 12345 -T %d -s %s -w %s -n OUTPUT"%(threads,outputDir+os.sep+"parsnp.snps.mblocks", raxml_output_dir)
                 run_command(command)
                 os.system("mv {}/RAxML_bestTree.OUTPUT {}".format(raxml_output_dir, outputDir+os.sep+"parsnp.tree"))
@@ -1450,9 +1464,9 @@ Please verify recruited genomes are all strain of interest""")
     else:
         logger.info("Aligned %d genomes in %.2f seconds"%(totseqs,float(elapsed)))
     #cleanup
-    rmfiles = glob.glob(os.path.join(outputDir, "*.aln"))
+    rmfiles = glob(os.path.join(outputDir, "*.aln"))
     #rmfiles2 = glob.glob(outputDir+os.sep+"blocks/b*/*")
-    rmfiles3 = glob.glob(os.path.join(outputDir, "blocks/b*"))
+    rmfiles3 = glob(os.path.join(outputDir, "blocks/b*"))
     for f in rmfiles:
         os.remove(f)
     for f in rmfiles3:
@@ -1491,13 +1505,13 @@ Please verify recruited genomes are all strain of interest""")
     if not VERBOSE and os.path.exists("%s.delta"%(prefix)):
         os.remove("%s.delta"%(prefix))
 
-    for f in glob.glob(os.path.join(outputDir,"*.reps")):
+    for f in glob(os.path.join(outputDir,"*.reps")):
         if not VERBOSE and os.path.exists(f):
             os.remove(f)
 
-    for f in glob.glob(os.path.join(outputDir, "*.ref")):
-        if not VERBOSE and os.path.exists(f):
-            os.remove(f)
+    # for f in glob(os.path.join(outputDir, "*.ref")):
+        # if not VERBOSE and os.path.exists(f):
+            # os.remove(f)
 
     if not VERBOSE and os.path.exists("%s/psnn.ini"%(outputDir)):
         os.remove("%s/psnn.ini"%(outputDir))


=====================================
src/parsnp.cpp
=====================================
@@ -995,20 +995,39 @@ void Aligner::writeOutput(string psnp,vector<float>& coveragerow)
                         hdr1 =  lasthdr1;
                         seqstart = laststart;
                     }
-                    if ( !ct.mums.at(0).isforward.at(i) )
-                    {
-                        xmfafile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
-                        if(recomb_filter)
+                    if (i == 0) {
+                        if ( !ct.mums.at(0).isforward.at(i) )
                         {
-                            clcbfile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            xmfafile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            if(recomb_filter)
+                            {
+                                clcbfile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            }
                         }
-                    }
-                    else
-                    {
-                        xmfafile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
-                        if(recomb_filter)
+                        else
                         {
-                            clcbfile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            xmfafile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            if(recomb_filter)
+                            {
+                                clcbfile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            }
+                        }    
+                    } else {
+                        if ( !ct.mums.at(0).isforward.at(i) )
+                        {
+                            xmfafile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 - ((hdr1 == "s1" || hdr1=="") ? 0 : this->d + 10 + 1) + ct.mums.at(0).length << endl;
+                            if(recomb_filter)
+                            {
+                                clcbfile << "- cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            }
+                        }
+                        else
+                        {
+                            xmfafile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 - ((hdr1 == "s1" || hdr1=="") ? 0 : this->d + 10 + 1)  << endl;
+                            if(recomb_filter)
+                            {
+                                clcbfile << "+ cluster" << b << " "  << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+                            }
                         }
                     }
                     for( k = 0; k+width < s1s.size();)
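
The tweak above adjusts the ":p" position field in the XMFA sequence headers, which extend.py then consumes. A small sketch of how those headers are pulled apart (the regex is copied verbatim from xmfa_to_maf() in extend.py; the example header line is fabricated):

    import re

    header_parser = re.compile(
        r"^\s?(\d+):(\d+)-(\d+)\s+([+,-])\s+cluster(\d+)\s+s?(\d+)?:p(\d+)")

    header = "2:1523-2871 + cluster7 s1:p1523"
    file_idx, gstart, gend, strand, cluster_idx, contig_idx, pos = \
        header_parser.search(header).groups()
    print(file_idx, strand, cluster_idx, pos)  # -> 2 + 7 1523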



View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/commit/3a7f098447a5cd743ca06604609246d2a5620ca4
