[med-svn] [Git][med-team/parsnp][upstream] New upstream version 1.7.2+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Thu Apr 14 16:24:22 BST 2022
Étienne Mollier pushed to branch upstream at Debian Med / parsnp
Commits:
3a7f0984 by Étienne Mollier at 2022-04-14T16:06:35+02:00
New upstream version 1.7.2+dfsg
- - - - -
5 changed files:
- configure.ac
- + extend.py
- + logger.py
- parsnp
- src/parsnp.cpp
Changes:
=====================================
configure.ac
=====================================
@@ -1,5 +1,5 @@
-AC_INIT(parsnp,1.6.1)
-AM_INIT_AUTOMAKE(parsnp,1.6.1)
+AC_INIT([parsnp],[1.6.1])
+AM_INIT_AUTOMAKE([-Wall])
AC_PROG_CC(gcc)
AC_ARG_WITH(libmuscle, [ --with-libmuscle=<path/to/libmuscle> libMUSCLE install dir (default: `pwd`/muscle)])
=====================================
extend.py
=====================================
@@ -0,0 +1,391 @@
+from Bio import AlignIO, SeqIO
+from Bio.Seq import Seq
+from Bio.SeqIO import SeqRecord
+from Bio.Align import MultipleSeqAlignment
+from glob import glob
+import tempfile
+from pathlib import Path
+import re
+from collections import namedtuple, defaultdict, Counter
+import os
+from Bio.Align import substitution_matrices
+from itertools import product, combinations
+import numpy as np
+from Bio.AlignIO.MafIO import MafWriter, MafIterator
+from Bio.AlignIO.MauveIO import MauveWriter
+from logger import logger
+
+
+#%%
+def get_sequence_data(reference, sequence_files, index_files=False):
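+ # Builds lookup tables keyed by (file name, contig): contig lengths,
+ # contig index -> FASTA header, optional SeqIO indexes, and a global
+ # contig index later used for XMFA sequence numbering.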
+ fname_to_seqrecord = {}
+ fname_contigid_to_length = {}
+ fname_contigidx_to_header = {}
+ fname_header_to_gcontigidx = {}
+ idx = 0
+ for fasta_f in [reference] + sequence_files:
+ fname = Path(fasta_f).name
+ if index_files:
+ fname_to_seqrecord[fname] = SeqIO.index(fasta_f, 'fasta')
+ for contig_id, record in fname_to_seqrecord[fname].items():
+ fname_contigid_to_length[fname, contig_id] = len(record)
+ for ctg_idx, record in enumerate(SeqIO.parse(fasta_f, 'fasta')):
+ fname_header_to_gcontigidx[fname, record.id] = idx
+ idx += 1
+ fname_contigidx_to_header[fname, ctg_idx] = record.id
+ fname_contigid_to_length[fname, record.id] = len(record)
+ return fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx
+
+#%%
+# Writes the parsnp output to a MAF file. While there are methods in
+# HarvestTools that do this, they don't account for multiple contigs,
+# support for which we would like to add in the future.
+def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_contigid_to_length):
+ index_to_fname = {}
+ # Avoid loading the whole alignment in memory
+ with open(xmfa_file) as parsnp_fd, tempfile.SpooledTemporaryFile(mode='w') as fasta_out:
+ for line in (line.strip() for line in parsnp_fd):
+ if line[:2] == "##":
+ if line.startswith("##SequenceIndex"):
+ index = int(line.split(' ')[1])
+ elif line.startswith("##SequenceFile"):
+ filename = Path(line.split(' ')[1]).name
+ index_to_fname[index] = filename
+ elif line and line[0] != '=':
+ fasta_out.write(line + "\n")
+ fasta_out.seek(0)
+ xmfa_seqs = SeqIO.parse(fasta_out, "fasta")
+
+
+ curr_cluster_seqs = []
+ curr_cluster_idx = 0
+ with open(output_maf_file, 'w') as maf_out:
+ writer = MafWriter(maf_out)
+ writer.write_header()
+ header_parser = re.compile(r"^\s?(\d+):(\d+)-(\d+)\s+([+-])\s+cluster(\d+)\s+s?(\d+)?:p(\d+)")
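+ # e.g. matches an XMFA header such as "1:104-1193 + cluster3 s1:p104",
+ # capturing file index, global start/end, strand, cluster, contig and position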
+
+ # Stores coordinates of aligned regions on the FORWARD strand
+ fname_to_contigid_to_coords = defaultdict(lambda: defaultdict(list))
+ for seqrecord in xmfa_seqs:
+ full_header = seqrecord.description
+ groups = header_parser.search(full_header).groups()
+ file_idx, gstart, gend, strand, cluster_idx, contig_idx, position_in_contig = groups
+ cluster_idx = int(cluster_idx)
+ if cluster_idx != curr_cluster_idx:
+ if len(curr_cluster_seqs) != 0:
+ with open(output_maf_file, 'a') as maf_out:
+ writer = MafWriter(maf_out)
+ msa_record = MultipleSeqAlignment(curr_cluster_seqs)
+ msa_record._annotations={"pass": curr_cluster_idx}
+ writer.write_alignment(msa_record)
+ curr_cluster_seqs = []
+ curr_cluster_idx = cluster_idx
+
+ # There are some Parsnp indexing situations that need to be corrected.
+ if not contig_idx:
+ contig_idx = 0
+ start_coord = int(position_in_contig) - 1
+ else:
+ start_coord = int(position_in_contig)
+ contig_idx = int(contig_idx) - 1
+ if int(file_idx) == 1 and contig_idx != 0:
+ start_coord -= 1
+
+ # Add annotations to the seqrecord in order to pass to mafwriter
+ fname = index_to_fname[int(file_idx)]
+ contig_id = fname_contigidx_to_header[fname, contig_idx]
+ src_size = fname_contigid_to_length[fname, contig_id]
+ size = len(str(seqrecord.seq).replace("-", ''))
+ seqrecord.id = f"{fname}:<-file:contig->:{contig_id}"
+ seqrecord.annotations["strand"] = 1 if strand == "+" else -1
+ seqrecord.annotations["srcSize"] = src_size
+ seqrecord.annotations["size"] = size
+ # Parsnp coordinates are always on the fwd strand, even for revcomp hits;
+ # we adjust for that here.
+ seqrecord.annotations["start"] = start_coord if strand == "+" else src_size - start_coord
+ curr_cluster_seqs.append(seqrecord)
+
+ # In order to keep track of inter-cluster regions, we also would like
+ # to have the coordinates of all aligned clusters on the fwd strand
+ if strand == "-":
+ end_coord = start_coord
+ start_coord = start_coord - size
+ else:
+ end_coord = start_coord + size
+ fname_to_contigid_to_coords[fname][contig_id].append((start_coord, end_coord, strand, cluster_idx))
+
+ # Write remaining sequences
+ if len(curr_cluster_seqs) != 0:
+ with open(output_maf_file, 'a') as maf_out:
+ writer = MafWriter(maf_out)
+ msa_record = MultipleSeqAlignment(curr_cluster_seqs)
+ msa_record._annotations={"pass": curr_cluster_idx}
+ writer.write_alignment(msa_record)
+ return fname_to_contigid_to_coords
+#%%
+def write_intercluster_regions(input_sequences, cluster_directory, fname_to_contigid_to_coords):
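+ # For every aligned cluster, write the unaligned sequence flanking it on
+ # each side (per genome and contig) to cluster<idx>-left/right FASTA files,
+ # recording flank lengths and which cluster sits on the far side of each flank.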
+ clusterdir_to_adjacent_clusters = defaultdict(set)
+ fname_contigid_to_cluster_dir_to_length = defaultdict(lambda: defaultdict(int))
+ fname_contigid_to_cluster_dir_to_adjacent_cluster = defaultdict(dict)
+ for f in glob(f"{cluster_directory}/*.fa*"):
+ os.remove(f)
+ for fasta_f in input_sequences:
+ records = SeqIO.parse(fasta_f, 'fasta')
+ fname = Path(fasta_f).name
+ for record in records:
+ coords = [(0, 0, "+", "START CAP")] + sorted(fname_to_contigid_to_coords[fname][record.id]) + [(len(record), len(record), "+", "END CAP")]
+ for idx, (start, end, strand, cluster_idx) in enumerate(coords[1:-1]):
+ idx += 1
+ prev_end = coords[idx-1][1]
+ next_start = coords[idx+1][0]
+ fwd, rev = "right", "left"
+ seq = record.seq
+ if strand == "-":
+ fwd, rev = rev, fwd
+ # seq = seq.reverse_complement()
+ fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, rev)] = start - prev_end
+ fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, fwd)] = next_start - end
+ fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, rev)] = (coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left")
+ fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, fwd)] = (coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right")
+ clusterdir_to_adjacent_clusters[(cluster_idx, fwd)].add((coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right"))
+ clusterdir_to_adjacent_clusters[(cluster_idx, rev)].add((coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left"))
+ seq_to_write = SeqRecord(seq[prev_end:start].reverse_complement(), id=f"{fname}:<-file:contig->:{record.id}", description="")
+ with open(f"{cluster_directory}/cluster{cluster_idx}-{rev}.fasta", 'a') as out_f:
+ SeqIO.write([seq_to_write], out_f, "fasta")
+ seq_to_write = SeqRecord(seq[end:next_start], id=f"{fname}:<-file:contig->:{record.id}", description="")
+ with open(f"{cluster_directory}/cluster{cluster_idx}-{fwd}.fasta", 'a') as out_f:
+ SeqIO.write([seq_to_write], out_f, "fasta")
+ return fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster
+#%%
+def get_new_extensions(cluster_files, match_score, mismatch_score, gap_penalty):
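+ # Score each flanking column as a sum of pairwise match/mismatch/gap
+ # contributions and extend each cluster flank to the column where the
+ # cumulative score peaks.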
+ # mat = substitution_matrices.load("NUC.4.4")
+ # mismatch_multiplier = 1
+ # gap_match = -1
+ fname_parser = re.compile(r"^.*/cluster(\d+)-(.*).fasta")
+ clusterdir_len = defaultdict(lambda: defaultdict(int))
+ clusterdir_expand = {}
+ for cluster_f in cluster_files:
+ cluster_idx, direction = fname_parser.match(cluster_f).groups()
+ cluster_idx = int(cluster_idx)
+ sequences = list(SeqIO.parse(cluster_f, 'fasta'))
+ lengths = Counter(len(s) for s in sequences)
+ idx = 0
+ score_list = [0]
+ clusterdir_len[(cluster_idx, direction)] = max(lengths)
+ while idx < max(lengths):
+ bases = Counter(seq[idx].upper() if len(seq) > idx else "-" for seq in sequences if seq)
+ score = 0
+ for n1, n2 in combinations(bases.keys(), r=2):
+ if n1 == "-" or n2 == "-":
+ score += gap_penalty * bases[n1] * bases[n2]
+ else:
+ score += mismatch_score * bases[n1] * bases[n2]
+ for nuc in bases.keys():
+ if nuc == "-":
+ continue
+ else:
+ score += match_score * bases[nuc] * (bases[nuc] - 1) / 2
+ score_list.append(score_list[-1] + score)
+ idx += 1
+
+ extend_by = np.argmax(score_list)
+ # extend_by = lengths.most_common()[0][0] if len(lengths) == 1 else (lengths.most_common()[0][0] if lengths.most_common()[0][0] != 0 else lengths.most_common()[1][0])
+ # print(lengths)
+ # extend_by = min(extend_by, 100)
+ clusterdir_expand[(cluster_idx, direction)] = extend_by
+ return clusterdir_expand, clusterdir_len
+
+#%%
+
+def write_xmfa_header(extended_xmfa_file, fname_header_to_gcontigidx, fname_contigid_to_length, num_clusters):
+
+ with open(extended_xmfa_file, 'w') as xmfa_out:
+ ### Write header
+ xmfa_out.write("#FormatVersion Parsnp v1.1\n")
+ xmfa_out.write(f"#SequenceCount {len(fname_header_to_gcontigidx)}\n")
+ for (fname, header), global_idx in fname_header_to_gcontigidx.items():
+ xmfa_out.write(f"##SequenceIndex {global_idx + 1}\n")
+ xmfa_out.write(f"##SequenceFile {fname}\n")
+ xmfa_out.write(f"##SequenceHeader >{header}\n")
+ xmfa_out.write(f"##SequenceLength {fname_contigid_to_length[(fname, header)]}\n")
+ xmfa_out.write(f"##IntervalCount {num_clusters}\n")
+
+
+def write_xmfa_cluster(extended_xmfa_file, msa_records, fname_header_to_gcontigidx):
+ with open(extended_xmfa_file, 'a') as xmfa_out:
+ ### Write MSA record
+ header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+ for msa_record in msa_records:
+ cluster = msa_record.annotations['cluster']
+ for seq_record in msa_record:
+ fname, contig_id = header_parser.match(seq_record.id).groups()
+
+ strand = "-" if seq_record.annotations["strand"] == -1 else "+"
+ start = seq_record.annotations['start']
+ size = seq_record.annotations["size"]
+ if strand == "+":
+ seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start}-{start + size}"
+ else:
+ seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start - size}-{start}"
+ seq_record.description = f"{strand} cluster{cluster} s1:p{start}"
+ SeqIO.write(msa_record, xmfa_out, "fasta")
+ xmfa_out.write("=\n")
+
+
+
+# fname_to_contigid_to_extended = defaultdict(lambda: defaultdict(list))
+# fname_to_contigid_to_coords[fname][contig_id].append((seqrecord.annotations["start"], seqrecord.annotations["start"] + seqrecord.annotations["size"], strand, cluster_idx))
+def write_extended_xmfa(
+ original_maf_file,
+ extended_maf_file,
+ cluster_directory,
+ clusterdir_expand,
+ clusterdir_len,
+ fname_contigid_to_cluster_dir_to_length,
+ fname_contigid_to_cluster_dir_to_adjacent_cluster,
+ fname_header_to_gcontigidx,
+ fname_contigid_to_length):
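+ # Replay each MAF block, graft up to the chosen number of flanking bases
+ # onto each side (padding with gaps where a genome's flank runs short),
+ # and re-emit the blocks as XMFA clusters.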
+ header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+ clusters = set()
+ for fname, contigid in fname_contigid_to_cluster_dir_to_length.keys():
+ for cluster, _ in fname_contigid_to_cluster_dir_to_length[(fname, contigid)].keys():
+ clusters.add(cluster)
+ write_xmfa_header(extended_maf_file, fname_header_to_gcontigidx, fname_contigid_to_length, len(clusters))
+ with open(original_maf_file, 'r') as maf_in:
+ maf_iterator = MafIterator(maf_in)
+ # maf_writer = MafWriter(maf_out)
+ # maf_writer = MauveWriter(maf_out)
+ # maf_writer.write_header()
+ for idx, msa_record in enumerate(maf_iterator):
+ fname, contig_id = header_parser.match(msa_record[0].id).groups()
+ cluster_idx = int(msa_record._annotations["pass"])
+ for direction in ("right", "left"):
+ expand_by = clusterdir_expand[(cluster_idx, direction)]
+ expand_by = min(
+ expand_by,
+ max(cldir_to_len[(cluster_idx, direction)] for cldir_to_len in fname_contigid_to_cluster_dir_to_length.values())
+ )
+ # expand_by = min(expand_by, 7)
+ # expand_by = 10
+ # for seq_record in msa_record:
+ # fname, contig_id, cluster_idx = header_parser.match(seq_record.id).groups()
+ # cluster_idx = int(cluster_idx)
+ # expand_by = min(expand_by, fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)])
+ if expand_by <= 0:
+ continue
+ logger.debug(f"Expanding cluster {cluster_idx} to the {direction} by {expand_by}")
+ inter_cluster_sequences = SeqIO.index(f"{cluster_directory}/cluster{cluster_idx}-{direction}.fasta", 'fasta')
+ for seq_record in msa_record:
+ fname, contig_id = header_parser.match(seq_record.id).groups()
+ cluster_idx = int(cluster_idx)
+ flanking_seq = inter_cluster_sequences[f"{fname}:<-file:contig->:{contig_id}"]
+ flanking_seq = flanking_seq[:fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)]]
+ if len(flanking_seq) >= expand_by:
+ flanking_seq.seq = flanking_seq.seq[:expand_by]
+ if direction == "left":
+ flanking_seq.seq = flanking_seq.seq.reverse_complement()
+ flanking_seq.seq = "-"*(expand_by - len(flanking_seq)) + flanking_seq.seq
+ else:
+ flanking_seq.seq = flanking_seq.seq + "-"*(expand_by - len(flanking_seq))
+ seq_record.annotations["size"] += len(str(flanking_seq.seq).replace("-", ''))
+ if direction == "right":
+ seq_record.seq = seq_record.seq + flanking_seq.seq
+ else:
+ seq_record.seq = flanking_seq.seq + seq_record.seq
+ seq_record.annotations["start"] -= (expand_by - flanking_seq.seq.count("-"))
+ adj_cluster_idx, adj_cluster_dir = fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, contig_id)][(cluster_idx, direction)]
+ fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(adj_cluster_idx, adj_cluster_dir)] -= len(str(flanking_seq.seq).replace("-", ''))
+
+
+
+ # for seq_record in msa_record:
+ # fname, contig_id = header_parser.match(seq_record.id).groups()
+ # seq_record.id = contig_id
+ msa_record.annotations["cluster"] = cluster_idx
+ write_xmfa_cluster(extended_maf_file, [msa_record], fname_header_to_gcontigidx)
+ # maf_writer.write_alignment(msa_record)
+
+#%%
+def check_maf(maf_file, fname_to_seqrecords):
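+ # Sanity check: every block's gap-stripped sequence must occur at its
+ # recorded start coordinate in the original genome (on the proper strand).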
+ contig_to_coords_test = defaultdict(list)
+ header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
+ with open(maf_file, 'r') as maf_in:
+ maf_iterator = MafIterator(maf_in)
+ for msa_record in maf_iterator:
+ for seq_record in msa_record:
+ strand = seq_record.annotations["strand"]
+ coord_start = seq_record.annotations["start"] if seq_record.annotations["strand"] == 1 else seq_record.annotations["srcSize"] - seq_record.annotations["start"]
+ start = seq_record.annotations["start"]
+ end = start + seq_record.annotations["size"]
+ if strand == -1:
+ coord_end = coord_start - seq_record.annotations["size"]
+ coord_start, coord_end = coord_end, coord_start
+ else:
+ coord_end = end
+ fname, contig_id = header_parser.match(seq_record.id).groups()
+ contig_to_coords_test[(fname, contig_id)].append((coord_start, coord_end, seq_record.annotations["strand"]))
+ # if len(seq_record) > 20:
+ # print(seq_record.id.split(":")[0], seq[:15],"...", seq[-15:])
+ # else:
+ # print(seq_record.id.split(":")[0], seq)
+ block_seq = str(seq_record.seq).lower().replace("-", '').replace("n", ".")
+ original_seq = fname_to_seqrecords[fname][contig_id].seq
+ if strand == 1:
+ match_pos = re.search(block_seq, str(original_seq), re.IGNORECASE)
+ else:
+ original_seq = str(original_seq.reverse_complement())
+ match_pos = re.search(block_seq, original_seq, re.IGNORECASE)
+ match_pos = match_pos.start() if match_pos else -1
+ if start != match_pos:
+ print(start, "!=", match_pos)
+ print(seq_record.id)
+ print(seq_record.annotations)
+ print(block_seq)
+ return
+ for key in contig_to_coords_test:
+ contig_to_coords_test[key] = sorted(contig_to_coords_test[key])
+ return contig_to_coords_test
+
+def validate_coords(contig_to_coords):
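+ # Block end coordinates must strictly increase along each contig.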
+ for contig, coords in contig_to_coords.items():
+ prev_end = -1
+ for start, end, strand in coords:
+ if end <= prev_end:
+ print(f"{contig} has invalid coordinates!")
+ print(coords)
+ break
+ prev_end = end
+
+# coords_extended = check_maf("parsnp-extended.maf")
+# validate_coords(coords_extended)
+# coords_original = check_maf("parsnp.maf")
+#%%
+# validate = True
+# genome_dir = "sars-100-rev/"
+# sequence_files = glob(f"{genome_dir}/*.fa*")
+# xmfa_file = f"{genome_dir}/parsnp.xmfa"
+# original_maf_file = f"{genome_dir}/parsnp-original.maf"
+# extended_maf_file = f"{genome_dir}/parsnp-extended.maf"
+# cluster_directory = "clusters/"
+
+# fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = get_sequence_data(sequence_files, index_files=validate)
+# fname_to_contigid_to_coords = xmfa_to_maf(xmfa_file, original_maf_file, fname_contigidx_to_header, fname_contigid_to_length)
+# packed_write_result = write_intercluster_regions(cluster_directory, fname_to_contigid_to_coords)
+# fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
+# cluster_files = glob(f"{cluster_directory}/*.fasta")
+# clusterdir_expand, clusterdir_len = get_new_extensions(cluster_files)
+# write_extended_maf(
+ # original_maf_file,
+ # extended_maf_file,
+ # cluster_directory,
+ # clusterdir_expand,
+ # clusterdir_len,
+ # fname_contigid_to_cluster_dir_to_length,
+ # fname_contigid_to_cluster_dir_to_adjacent_cluster)
+
+# if validate:
+ # coords_extended = check_maf(extended_maf_file, fname_to_seqrecord)
+ # validate_coords(coords_extended)
+ # coords_original = check_maf(original_maf_file, fname_to_seqrecord)
+ # validate_coords(coords_original)
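
A note on the core of the new module: get_new_extensions() scores each
flanking column as a sum of pairwise match/mismatch/gap contributions and
extends a cluster flank to the column where the cumulative score peaks
(np.argmax over the running totals). A minimal self-contained sketch of the
same idea, with illustrative data that is not part of parsnp:

    from collections import Counter
    from itertools import combinations

    def best_extension(seqs, match=5, mismatch=-4, gap=-2):
        # Treat absent positions as gaps, score each column, and keep a
        # running total; the best extension length is where it peaks.
        running = [0]
        for idx in range(max(len(s) for s in seqs)):
            bases = Counter(s[idx].upper() if idx < len(s) else "-" for s in seqs)
            col = 0
            for a, b in combinations(bases, r=2):
                col += (gap if "-" in (a, b) else mismatch) * bases[a] * bases[b]
            for nuc, n in bases.items():
                if nuc != "-":
                    col += match * n * (n - 1) // 2  # matching pairs in the column
            running.append(running[-1] + col)
        return max(range(len(running)), key=running.__getitem__)

    # The flanks agree for four columns, then diverge:
    print(best_extension(["ACGTAA", "ACGT", "ACGTTT"]))  # -> 4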
=====================================
logger.py
=====================================
@@ -0,0 +1,55 @@
+import logging
+############################################# Logging ##############################################
+BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
+#These are the sequences needed to get colored output
+CSI=""#"\x1B["
+BOLDME = ""#CSI+'\033[1m'
+STATUS_BLUE = ""#CSI+'\033[94m'
+OK_GREEN = ""#CSI+'\033[92m'#'32m'
+SKIP_GRAY = ""#CSI+'\033[37m'
+WARNING_YELLOW = ""#CSI+'\033[93m'
+ERROR_RED = ""#CSI+'\033[91m'
+ENDC = ""#CSI+'0m'
+RESET_SEQ = "\033[0m"
+COLOR_SEQ = "\033[1;%dm"
+BOLD_SEQ = "\033[1m"
+
+def formatter_message(message, use_color = True):
+ if use_color:
+ message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
+ else:
+ message = message.replace("$RESET", "").replace("$BOLD", "")
+ return message
+
+COLORS = {
+ 'DEBUG': BLUE,
+ 'INFO': WHITE,
+ 'WARNING': YELLOW,
+ 'ERROR': RED,
+ 'CRITICAL': RED
+}
+
+class ColoredFormatter(logging.Formatter):
+ def __init__(self, msg, datefmt = None, use_color = True):
+ logging.Formatter.__init__(self, fmt=msg, datefmt=datefmt)
+ self.use_color = use_color
+
+ def format(self, record):
+ levelname = record.levelname
+ if self.use_color and levelname in COLORS:
+ levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
+ record.levelname = levelname_color
+ return logging.Formatter.format(self, record)
+
+# Instantiate logger
+logger = logging.getLogger("Parsnp")
+logger.setLevel(logging.INFO)
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+# create formatter and add it to the handlers
+formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s',
+ datefmt="%H:%M:%S")
+ch.setFormatter(formatter)
+# add the handlers to the logger
+logger.addHandler(ch)
+####################################################################################################
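
logger.py factors the logging setup that previously lived inline in the
parsnp script (see the removed block below) out into a module, so that
parsnp and extend.py share one pre-configured logger. A minimal usage
sketch:

    from logger import logger

    logger.info("Aligning genomes...")  # e.g. "16:24:22 - INFO - Aligning genomes..."
    logger.debug("hidden")              # suppressed: logger and handler are set to INFO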
=====================================
parsnp
=====================================
@@ -1,20 +1,25 @@
#!/usr/bin/env python
# See the LICENSE file included with this software for license information.
-import os, sys, string, getopt, random,subprocess, time, glob,operator, math, datetime,numpy #pysam
+import os, sys, string, getopt, random,subprocess, time,operator, math, datetime,numpy #pysam
from collections import defaultdict
import csv
-import tempfile
import shutil
+from tempfile import TemporaryDirectory
import re
import logging
+from logger import logger
import multiprocessing
import argparse
import signal
import inspect
from multiprocessing import *
+from Bio import SeqIO
+from glob import glob
-__version__ = "1.6.2"
+import extend as ext
+
+__version__ = "1.7.2"
reroot_tree = True #use --midpoint-reroot
try:
@@ -37,60 +42,6 @@ TOTSEQS=0
PARSNP_DIR = sys.path[0]
-############################################# Logging ##############################################
-BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
-#These are the sequences need to get colored ouput
-CSI=""#"\x1B["
-BOLDME = ""#CSI+'\033[1m'
-STATUS_BLUE = ""#CSI+'\033[94m'
-OK_GREEN = ""#CSI+'\033[92m'#'32m'
-SKIP_GRAY = ""#CSI+'\033[37m'
-WARNING_YELLOW = ""#CSI+'\033[93m'
-ERROR_RED = ""#CSI+'\033[91m'
-ENDC = ""#CSI+'0m'
-RESET_SEQ = "\033[0m"
-COLOR_SEQ = "\033[1;%dm"
-BOLD_SEQ = "\033[1m"
-
-def formatter_message(message, use_color = True):
- if use_color:
- message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
- else:
- message = message.replace("$RESET", "").replace("$BOLD", "")
- return message
-
-COLORS = {
- 'DEBUG': BLUE,
- 'INFO': WHITE,
- 'WARNING': YELLOW,
- 'ERROR': RED,
- 'CRITICAL': RED
-}
-
-class ColoredFormatter(logging.Formatter):
- def __init__(self, msg, datefmt = None, use_color = True):
- logging.Formatter.__init__(self, fmt=msg, datefmt=datefmt)
- self.use_color = use_color
-
- def format(self, record):
- levelname = record.levelname
- if self.use_color and levelname in COLORS:
- levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
- record.levelname = levelname_color
- return logging.Formatter.format(self, record)
-
-# Instantiate logger
-logger = logging.getLogger("Parsnp")
-logger.setLevel(logging.INFO)
-ch = logging.StreamHandler()
-ch.setLevel(logging.INFO)
-# create formatter and add it to the handlers
-formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s',
- datefmt="%H:%M:%S")
-ch.setFormatter(formatter)
-# add the handlers to the logger
-logger.addHandler(ch)
-####################################################################################################
########################################### Environment ############################################
@@ -296,7 +247,7 @@ def is_valid_file_path(parser, arg):
def is_valid_dir(parser, arg):
if not os.path.exists(arg):
parser.error( "The directory %s does not exist\n" % (arg))
- if len(glob.glob("%s/*"%(arg))) == 0:
+ if len(glob("%s/*"%(arg))) == 0:
parser.error("The director %s is empty"%(arg))
@@ -469,6 +420,26 @@ def parse_args():
# "--filter-phipack-snps",
# action = "store_true",
# help = "Enable filtering of SNPs located in PhiPack identified regions of recombination")
+ extend_args = parser.add_argument_group("LCB Extension")
+ extend_args.add_argument(
+ "--extend-lcbs",
+ action="store_true",
+ help="Extend the boundaries of LCBs with an ungapped alignment")
+ extend_args.add_argument(
+ "--match-score",
+ type=float,
+ default=5,
+ help="Value of match score for extension")
+ extend_args.add_argument(
+ "--mismatch-penalty",
+ type=float,
+ default=-4,
+ help="Value of mismatch score for extension (should be negative)")
+ extend_args.add_argument(
+ "--gap-penalty",
+ type=float,
+ default=-2,
+ help="Value of gap penalty for extension (should be negative)")
misc_args = parser.add_argument_group("Misc")
misc_args.add_argument(
@@ -540,8 +511,7 @@ def parse_args():
if __name__ == "__main__":
t1 = time.time()
- sys.stderr.write( BOLDME+"|--Parsnp %s--|\n"%(VERSION))
- sys.stderr.write( BOLDME+"For detailed documentation please see --> http://harvest.readthedocs.org/en/latest\n")
+ logger.info(f"|--Parsnp {VERSION}--|\n")
parsnp_dir= sys.path[0]
@@ -650,17 +620,18 @@ if __name__ == "__main__":
if len(input_files) < 2:
logger.critical("Less than 2 input sequences provided...")
sys.exit(1)
- if args.validate_input:
- from Bio import SeqIO
- for f in input_files + ([ref] if ref and ref != "!" else []):
- try:
- records = SeqIO.parse(f, "fasta")
- except:
- logger.error("{} is an invalid sequence file!".format(f))
- for record in records:
- if any(c not in "GATCRYWSMKHBVDN" + "GATCRYWSMKHBVDN".lower() for c in record.seq):
- logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(record.seq)) - set("AGCTNagctn")))
- continue
+ for f in input_files + ([ref] if ref and ref != "!" else []):
+ try:
+ records = list(SeqIO.parse(f, "fasta"))
+ except:
+ logger.error("{} is an invalid sequence file!".format(f))
+ if args.extend_lcbs and len(records) > 1:
+ logger.error("Extending LCBs does not currently work with multi-contig inputs yet")
+ sys.exit(1)
+ for record in records:
+ if any(c not in "GATCRYWSMKHBVDN" + "GATCRYWSMKHBVDN".lower() for c in record.seq):
+ logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(record.seq)) - set("AGCTNagctn")))
+ continue
# Parse reference if necessary
if ref and ref != "!":
@@ -884,28 +855,34 @@ SETTINGS:
# fnafiles.remove(ref)
#sort reference by largest replicon to smallest
- if sortem and os.path.exists(ref) and not autopick_ref:
- logger.debug("Sorting reference replicons")
- ff = open(ref, 'r')
- seqs = ff.read().split(">")[1:]
- seq_dict = {}
- seq_len = {}
- for seq in seqs:
- try:
- hdr, seq = seq.split("\n",1)
- except ValueError:
- # TODO Why do we ignore when theres a header but no sequence?
- continue
- seq_dict[hdr] = seq
- seq_len[hdr] = len(seq) - seq.count('\n')
- seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
- ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
- ffo = open(ref, 'w')
- for hdr, seq in seq_len_sort:
- ffo.write(">%s\n"%(hdr))
- ffo.write("%s"%(seq_dict[hdr]))
- ff.close()
- ffo.close()
+ if ref in fnafiles:
+ fnafiles = [f for f in fnafiles if f != ref]
+ elif sortem and os.path.exists(ref) and not autopick_ref:
+ sequences = SeqIO.parse(ref, "fasta")
+ new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
+ SeqIO.write(sequences, new_ref, "fasta")
+ ref = new_ref
+ # logger.debug("Sorting reference replicons")
+ # ff = open(ref, 'r')
+ # seqs = ff.read().split(">")[1:]
+ # seq_dict = {}
+ # seq_len = {}
+ # for seq in seqs:
+ # try:
+ # hdr, seq = seq.split("\n",1)
+ # except ValueError:
+ # # TODO Why do we ignore when there's a header but no sequence?
+ # continue
+ # seq_dict[hdr] = seq
+ # seq_len[hdr] = len(seq) - seq.count('\n')
+ # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
+ # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
+ # ffo = open(ref, 'w')
+ # for hdr, seq in seq_len_sort:
+ # ffo.write(">%s\n"%(hdr))
+ # ffo.write("%s"%(seq_dict[hdr]))
+ # ff.close()
+ # ffo.close()
else:
ref = genbank_ref
@@ -948,7 +925,7 @@ SETTINGS:
if os.path.exists(os.path.join(outputDir, "alltogether.fasta")):
os.remove(os.path.join(outputDir, "alltogether.fasta"))
if os.path.exists(os.path.join(outputDir, "blocks/b1")):
- ftrm = glob.glob(os.path.join(outputDir, "blocks/b*"))
+ ftrm = glob(os.path.join(outputDir, "blocks/b*"))
for f in ftrm:
shutil.rmtree(f)
@@ -1240,7 +1217,7 @@ Please verify recruited genomes are all strain of interest""")
t2 = time.time()
elapsed = float(t2)-float(t1)
#print("-->Getting list of LCBs.."
- allbfiles = glob.glob(os.path.join(blocks_dir, "b*/*"))
+ allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
blockfiles = []
block_startpos = []
block_dict = {}
@@ -1357,7 +1334,7 @@ Please verify recruited genomes are all strain of interest""")
pool.close()
pool.join()
brkeys = list(bedfile_dict.keys())
- brkeys.sort()
+ brkeys.sort()
for key in brkeys:
bedfile.write(bedfile_dict[key])
bedfile.close()
@@ -1366,15 +1343,52 @@ Please verify recruited genomes are all strain of interest""")
annotation_dict = {}
#TODO always using xtrafast?
+ parsnp_output = f"{outputDir}/parsnp.xmfa"
+ if args.extend_lcbs:
+ xmfa_file = f"{outputDir}/parsnp.xmfa"
+ with TemporaryDirectory() as temp_directory:
+ original_maf_file = f"{temp_directory}/parsnp-original.maf"
+ extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa"
+ fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx = ext.get_sequence_data(
+ ref,
+ finalfiles,
+ index_files=False)
+ fname_to_contigid_to_coords = ext.xmfa_to_maf(
+ xmfa_file,
+ original_maf_file,
+ fname_contigidx_to_header,
+ fname_contigid_to_length)
+ packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
+ fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
+ cluster_files = glob(f"{temp_directory}/*.fasta")
+ clusterdir_expand, clusterdir_len = ext.get_new_extensions(
+ cluster_files,
+ args.match_score,
+ args.mismatch_penalty,
+ args.gap_penalty)
+ ext.write_extended_xmfa(
+ original_maf_file,
+ extended_xmfa_file,
+ temp_directory,
+ clusterdir_expand,
+ clusterdir_len,
+ fname_contigid_to_cluster_dir_to_length,
+ fname_contigid_to_cluster_dir_to_adjacent_cluster,
+ fname_header_to_gcontigidx,
+ fname_contigid_to_length)
+ parsnp_output = extended_xmfa_file
+ os.remove(original_maf_file)
+
+
if xtrafast or 1:
#add genbank here, if present
if len(genbank_ref) != 0:
- rnc = "harvesttools -q -o %s/parsnp.ggr -x "%(outputDir)+outputDir+os.sep+"parsnp.xmfa"
+ rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"
for file in genbank_files:
rnc += " -g %s " %(file)
run_command(rnc)
else:
- run_command("harvesttools -q -o %s/parsnp.ggr -f %s -x "%(outputDir,ref)+outputDir+os.sep+"parsnp.xmfa")
+ run_command(f"harvesttools -q -o {outputDir}/parsnp.ggr -f {ref} -x {parsnp_output}")
if run_recomb_filter:
run_command("harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir))
@@ -1387,14 +1401,14 @@ Please verify recruited genomes are all strain of interest""")
if not args.skip_phylogeny:
logger.info("Reconstructing core genome phylogeny...")
- with open(os.path.join(outputDir, "parsnp.snps.mblocks")) as mblocks:
- for line in mblocks:
- if line[0] != ">" and len(line.rstrip()) < 6:
- logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
- use_fasttree = True
- break
+ mblocks_seqs = SeqIO.parse(os.path.join(outputDir, "parsnp.snps.mblocks"), "fasta")
+ for seq in mblocks_seqs:
+ if len(seq) < 6:
+ logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
+ use_fasttree = True
+ break
if not use_fasttree:
- with tempfile.TemporaryDirectory() as raxml_output_dir:
+ with TemporaryDirectory() as raxml_output_dir:
command = "raxmlHPC-PTHREADS -m GTRCAT -p 12345 -T %d -s %s -w %s -n OUTPUT"%(threads,outputDir+os.sep+"parsnp.snps.mblocks", raxml_output_dir)
run_command(command)
os.system("mv {}/RAxML_bestTree.OUTPUT {}".format(raxml_output_dir, outputDir+os.sep+"parsnp.tree"))
@@ -1450,9 +1464,9 @@ Please verify recruited genomes are all strain of interest""")
else:
logger.info("Aligned %d genomes in %.2f seconds"%(totseqs,float(elapsed)))
#cleanup
- rmfiles = glob.glob(os.path.join(outputDir, "*.aln"))
+ rmfiles = glob(os.path.join(outputDir, "*.aln"))
#rmfiles2 = glob.glob(outputDir+os.sep+"blocks/b*/*")
- rmfiles3 = glob.glob(os.path.join(outputDir, "blocks/b*"))
+ rmfiles3 = glob(os.path.join(outputDir, "blocks/b*"))
for f in rmfiles:
os.remove(f)
for f in rmfiles3:
@@ -1491,13 +1505,13 @@ Please verify recruited genomes are all strain of interest""")
if not VERBOSE and os.path.exists("%s.delta"%(prefix)):
os.remove("%s.delta"%(prefix))
- for f in glob.glob(os.path.join(outputDir,"*.reps")):
+ for f in glob(os.path.join(outputDir,"*.reps")):
if not VERBOSE and os.path.exists(f):
os.remove(f)
- for f in glob.glob(os.path.join(outputDir, "*.ref")):
- if not VERBOSE and os.path.exists(f):
- os.remove(f)
+ # for f in glob(os.path.join(outputDir, "*.ref")):
+ # if not VERBOSE and os.path.exists(f):
+ # os.remove(f)
if not VERBOSE and os.path.exists("%s/psnn.ini"%(outputDir)):
os.remove("%s/psnn.ini"%(outputDir))
=====================================
src/parsnp.cpp
=====================================
@@ -995,20 +995,39 @@ void Aligner::writeOutput(string psnp,vector<float>& coveragerow)
hdr1 = lasthdr1;
seqstart = laststart;
}
- if ( !ct.mums.at(0).isforward.at(i) )
- {
- xmfafile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
- if(recomb_filter)
+ if (i == 0) {
+ if ( !ct.mums.at(0).isforward.at(i) )
{
- clcbfile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ xmfafile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ if(recomb_filter)
+ {
+ clcbfile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ }
}
- }
- else
- {
- xmfafile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
- if(recomb_filter)
+ else
{
- clcbfile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ xmfafile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ if(recomb_filter)
+ {
+ clcbfile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ }
+ }
+ } else {
+ if ( !ct.mums.at(0).isforward.at(i) )
+ {
+ xmfafile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 - ((hdr1 == "s1" || hdr1=="") ? 0 : this->d + 10 + 1) + ct.mums.at(0).length << endl;
+ if(recomb_filter)
+ {
+ clcbfile << "- cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ }
+ }
+ else
+ {
+ xmfafile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 - ((hdr1 == "s1" || hdr1=="") ? 0 : this->d + 10 + 1) << endl;
+ if(recomb_filter)
+ {
+ clcbfile << "+ cluster" << b << " " << hdr1 << ":p" << (ct.start.at(i)-seqstart)+1 << endl;
+ }
}
}
for( k = 0; k+width < s1s.size();)
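
The writeOutput() change above adjusts the position recorded in XMFA
sequence headers for blocks after the first, subtracting the concatenation
offset (this->d + 10 + 1) for non-reference sequences. For orientation, the
headers this writer emits, and which extend.py's header_parser consumes,
look like the following (values illustrative):

    >1:104-1193 + cluster3 s1:p104
    >2:88-1177 - cluster3 s2:p1177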
View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/commit/3a7f098447a5cd743ca06604609246d2a5620ca4