[med-svn] [Git][med-team/parsnp][master] 10 commits: initialize changelog
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Thu Jul 7 22:19:14 BST 2022
Étienne Mollier pushed to branch master at Debian Med / parsnp
Commits:
3973c7d1 by Étienne Mollier at 2022-07-07T22:17:25+02:00
initialize changelog
- - - - -
fb7f2034 by Étienne Mollier at 2022-07-07T22:17:46+02:00
d/control: add myself to uploaders.
- - - - -
5ff19d22 by Étienne Mollier at 2022-07-07T22:18:34+02:00
update changelog
- - - - -
0a759e39 by Étienne Mollier at 2022-07-07T22:18:56+02:00
New upstream version 1.7.3+dfsg
- - - - -
476e74d2 by Étienne Mollier at 2022-07-07T22:18:56+02:00
routine-update: New upstream version
- - - - -
271700f0 by Étienne Mollier at 2022-07-07T22:18:57+02:00
Update upstream source from tag 'upstream/1.7.3+dfsg'
Update to upstream version '1.7.3+dfsg'
with Debian dir b4d618bcd021d7a593163bdbecbd08c1e88d4f2e
- - - - -
8622908a by Étienne Mollier at 2022-07-07T22:18:57+02:00
routine-update: Standards-Version: 4.6.1
- - - - -
fec8f980 by Étienne Mollier at 2022-07-07T22:23:56+02:00
Delete typos.patch: applied upstream.
- - - - -
ceddd087 by Étienne Mollier at 2022-07-07T22:24:21+02:00
Refresh proper_calls_to_tools.patch and py3-parsnp-libs.patch.
- - - - -
81b42191 by Étienne Mollier at 2022-07-07T23:18:06+02:00
update changelog and add todo item.
- - - - -
9 changed files:
- debian/changelog
- debian/control
- debian/patches/proper_calls_to_tools.patch
- debian/patches/py3-parsnp-libs.patch
- debian/patches/series
- − debian/patches/typos.patch
- extend.py
- parsnp
- src/parsnp.cpp
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,22 @@
+parsnp (1.7.3+dfsg-1) UNRELEASED; urgency=medium
+
+TODO: upload missing pyabpoa, otherwise new parsnp version fails to run with:
+Test 1: With GenBank Annotations
+Traceback (most recent call last):
+ File "/usr/bin/parsnp", line 20, in <module>
+ import parsnp.extend as ext
+ File "/usr/lib/python3/dist-packages/parsnp/extend.py", line 18, in <module>
+ import pyabpoa as pa
+ModuleNotFoundError: No module named 'pyabpoa'
+Missing dependency seems to be: https://github.com/yangao07/abPOA
+ * d/control: add myself to uploaders.
+ * New upstream version
+ * Standards-Version: 4.6.1 (routine-update)
+ * Delete typos.patch: applied upstream.
+ * Refresh proper_calls_to_tools.patch and py3-parsnp-libs.patch.
+
+ -- Étienne Mollier <emollier at debian.org> Thu, 07 Jul 2022 22:24:45 +0200
+
parsnp (1.7.2+dfsg-1) unstable; urgency=medium
* Team upload.
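
Note on the TODO above: extend.py now imports pyabpoa (the Python binding of abPOA, published on PyPI) at module load, so parsnp aborts before doing any work when the module is absent. A minimal sketch of the call pattern the new extension code relies on, with hypothetical input sequences:

    import pyabpoa as pa  # Python binding of abPOA; the dependency missing from Debian

    aligner = pa.msa_aligner()  # default partial-order alignment parameters
    res = aligner.msa(["ACGTGTACAGT", "ACGTCTACAGT"],  # hypothetical flank sequences
                      out_cons=False,  # no consensus sequence needed
                      out_msa=True)    # return the aligned rows
    for row in res.msa_seq:  # rows come back gap-padded with '-'
        print(row)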
=====================================
debian/control
=====================================
@@ -1,6 +1,7 @@
Source: parsnp
Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
+Uploaders: Andreas Tille <tille at debian.org>,
+ Étienne Mollier <emollier at debian.org>
Section: science
Priority: optional
Build-Depends: debhelper-compat (= 13),
@@ -9,7 +10,7 @@ Build-Depends: debhelper-compat (= 13),
python3-setuptools,
cython3,
libmuscle-dev
-Standards-Version: 4.6.0
+Standards-Version: 4.6.1
Vcs-Browser: https://salsa.debian.org/med-team/parsnp
Vcs-Git: https://salsa.debian.org/med-team/parsnp.git
Homepage: https://harvest.readthedocs.org/en/latest/content/parsnp.html
=====================================
debian/patches/proper_calls_to_tools.patch
=====================================
@@ -13,7 +13,7 @@ Description: Fix name of phipack executable
run_command(command,1)
os.chdir(currdir)
-@@ -578,7 +578,7 @@
+@@ -588,7 +588,7 @@
# Check for dependencies
missing = False
@@ -22,7 +22,7 @@ Description: Fix name of phipack executable
if shutil.which(exe) is None:
missing = True
logger.critical("{} not in system path!".format(exe))
-@@ -894,7 +894,7 @@
+@@ -916,7 +916,7 @@
if xtrafast or 1:
extend = False
@@ -31,7 +31,7 @@ Description: Fix name of phipack executable
inifiled = inifiled.replace("$REF", ref)
inifiled = inifiled.replace("$EXTEND", "%d"%(extend))
inifiled = inifiled.replace("$ANCHORS", str(anchor))
-@@ -951,10 +951,10 @@
+@@ -973,10 +973,10 @@
logger.info("Recruiting genomes...")
if use_parsnp_mumi:
if not inifile_exists:
@@ -44,7 +44,7 @@ Description: Fix name of phipack executable
run_command(command)
try:
mumif = open(os.path.join(outputDir, "all.mumi"),'r')
-@@ -1157,14 +1157,14 @@
+@@ -1179,14 +1179,14 @@
if command == "" and xtrafast and 0:
command = "%s/parsnpA_fast %sparsnpAligner.ini"%(PARSNP_DIR,outputDir+os.sep)
elif command == "":
@@ -62,7 +62,7 @@ Description: Fix name of phipack executable
run_command(command)
if not os.path.exists(os.path.join(outputDir, "parsnpAligner.xmfa")):
-@@ -1409,7 +1409,7 @@
+@@ -1432,7 +1432,7 @@
break
if not use_fasttree:
with TemporaryDirectory() as raxml_output_dir:
=====================================
debian/patches/py3-parsnp-libs.patch
=====================================
@@ -8,14 +8,14 @@ Last-Update: 2022-04-14
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
--- parsnp.orig/extend.py
+++ parsnp/extend.py
-@@ -13,7 +13,7 @@
+@@ -14,7 +14,7 @@
import numpy as np
from Bio.AlignIO.MafIO import MafWriter, MafIterator
- from Bio.AlignIO.MauveIO import MauveWriter
+ from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator
-from logger import logger
+from parsnp.logger import logger
-
-
+ import pyabpoa as pa
+ import time
#%%
--- parsnp.orig/parsnp
+++ parsnp/parsnp
@@ -35,5 +35,5 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
-import extend as ext
+import parsnp.extend as ext
- __version__ = "1.7.2"
+ __version__ = "1.7.3"
reroot_tree = True #use --midpoint-reroot
=====================================
debian/patches/series
=====================================
@@ -5,4 +5,3 @@ fix_build_with_as-needed.patch
non-versioned-libs.patch
blhc-fix.patch
py3-parsnp-libs.patch
-typos.patch
=====================================
debian/patches/typos.patch deleted
=====================================
@@ -1,28 +0,0 @@
-Description: fix typos caught by lintian
-Author: Étienne Mollier <emollier at debian.org>
-Forwarded: https://github.com/marbl/parsnp/pull/110
-Last-Update: 2022-04-14
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- parsnp.orig/parsnp
-+++ parsnp/parsnp
-@@ -411,7 +411,7 @@
- "-u",
- "--unaligned",
- action = "store_true",
-- help = "Ouput unaligned regions")
-+ help = "Output unaligned regions")
-
- recombination_args = parser.add_argument_group("Recombination filtration")
- #TODO -x was a duplicate flag but no longer parsed as filter-phipack-snps in the current parsnp version
---- parsnp.orig/src/parsnp.cpp
-+++ parsnp/src/parsnp.cpp
-@@ -3253,7 +3253,7 @@
- dif = difftime (tend,tstart);
-
- cerr << "Parsnp: Finished core genome alignment" << endl;
-- printf(" See log file for futher details. Total processing time: %.0lf seconds \n\n", dif );
-+ printf(" See log file for further details. Total processing time: %.0lf seconds \n\n", dif );
- exit(0);
- }
-
=====================================
extend.py
=====================================
@@ -6,14 +6,37 @@ from glob import glob
import tempfile
from pathlib import Path
import re
+import subprocess
from collections import namedtuple, defaultdict, Counter
import os
from Bio.Align import substitution_matrices
from itertools import product, combinations
import numpy as np
from Bio.AlignIO.MafIO import MafWriter, MafIterator
-from Bio.AlignIO.MauveIO import MauveWriter
+from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator
from logger import logger
+import pyabpoa as pa
+import time
+#%%
+
+def get_ani_cutoff(sequences, cutoff=0, grace_period=5):
+ n = max(len(s) for s in sequences)
+ matches = 0
+ max_matches = 0
+ below_cutoff = 0
+ for i in range(n):
+ pileup = Counter(s[i] for s in sequences)
+ nongaps = sum(pileup[c] for c in pileup if c not in "-")
+ matches += sum(pileup[c]*(pileup[c]-1) for c in pileup if c in "ATGCatgc")
+ matches += sum(pileup[ambig] * (sum(pileup[c] for c in pileup if c in "ATGCatgcNn") - 1) for ambig in "Nn")
+ max_matches += nongaps * (len(sequences) - 1)
+ if matches/max_matches < cutoff:
+ below_cutoff += 1
+ if below_cutoff > grace_period:
+ return (matches, max_matches), i - below_cutoff
+ else:
+ below_cutoff = 0
+ return (matches, max_matches), n - below_cutoff
#%%
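
Note on the new get_ani_cutoff helper above: it walks the alignment column by column, accumulating pairwise nucleotide matches against the maximum possible non-gap pairs, and truncates once the running ANI stays below the cutoff for more than grace_period consecutive columns. A small worked example with a hypothetical three-row alignment (assumes the function and collections.Counter are in scope):

    rows = ["ACGTAA",
            "ACGTAA",
            "ACGT-T"]
    # Columns 0-3 are all matches (24/24); column 4 (A,A,-) adds 2/4 and
    # column 5 (A,A,T) adds 2/6, so the running ANI 28/34 ~ 0.82 stays below
    # 0.95 for two consecutive columns, exceeding grace_period=1.
    (matches, max_matches), keep = get_ani_cutoff(rows, cutoff=0.95, grace_period=1)
    print(matches, max_matches, keep)  # 28 34 3 -> keep only the first 3 columns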
@@ -21,7 +44,6 @@ def get_sequence_data(reference, sequence_files, index_files=False):
fname_to_seqrecord = {}
fname_contigid_to_length = {}
fname_contigidx_to_header = {}
- fname_header_to_gcontigidx = {}
idx = 0
for fasta_f in [reference] + sequence_files:
fname = Path(fasta_f).name
@@ -30,11 +52,11 @@ def get_sequence_data(reference, sequence_files, index_files=False):
for contig_id, record in fname_to_seqrecord[fname].items():
fname_contigid_to_length[fname, contig_id] = len(record)
for ctg_idx, record in enumerate(SeqIO.parse(fasta_f, 'fasta')):
- fname_header_to_gcontigidx[fname, record.id] = idx
idx += 1
- fname_contigidx_to_header[fname, ctg_idx] = record.id
- fname_contigid_to_length[fname, record.id] = len(record)
- return fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx
+ record.description = record.description.replace(" ", "^")
+ fname_contigidx_to_header[fname, ctg_idx] = record.description
+ fname_contigid_to_length[fname, record.description] = len(record)
+ return fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord
#%%
# Writes the parsnp output to a MAF file containing. While there are methods
@@ -42,6 +64,7 @@ def get_sequence_data(reference, sequence_files, index_files=False):
# which is a future addition we would like to have.
def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_contigid_to_length):
index_to_fname = {}
+ fname_header_to_gcontigidx = {}
# Avoid loading the whole alignment in memory
with open(xmfa_file) as parsnp_fd, tempfile.SpooledTemporaryFile(mode='w') as fasta_out:
for line in (line.strip() for line in parsnp_fd):
@@ -51,6 +74,9 @@ def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_con
elif line.startswith("##SequenceFile"):
filename = Path(line.split(' ')[1]).name
index_to_fname[index] = filename
+ elif line.startswith("##SequenceHeader"):
+ header = line.split('>')[1].replace(" ", "^")
+ fname_header_to_gcontigidx[filename, header] = index
elif line[0] != '=':
fasta_out.write(line + "\n")
fasta_out.seek(0)
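
Note on the new ##SequenceHeader branch above: it records a global contig index per (file, header) pair while streaming the XMFA, and maps spaces in headers to '^' so full descriptions can round-trip through FASTA IDs. A small sketch of that parse on a hypothetical header line:

    line = "##SequenceHeader >chr1 some description"  # hypothetical XMFA header line
    header = line.split('>')[1].replace(" ", "^")     # -> 'chr1^some^description'
    # write_xmfa_header later restores it with header.replace('^', ' ')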
@@ -68,6 +94,7 @@ def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_con
fname_to_contigid_to_coords = defaultdict(lambda: defaultdict(list))
for seqrecord in xmfa_seqs:
full_header = seqrecord.description
+ # print(full_header)
groups = header_parser.search(full_header).groups()
file_idx, gstart, gend, strand, cluster_idx, contig_idx, position_in_contig = groups
cluster_idx = int(cluster_idx)
@@ -94,6 +121,7 @@ def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_con
# Add annotations to the seqrecord in order to pass to mafwriter
fname = index_to_fname[int(file_idx)]
contig_id = fname_contigidx_to_header[fname, contig_idx]
+ # print(contig_id)
src_size = fname_contigid_to_length[fname, contig_id]
size = len(str(seqrecord.seq).replace("-", ''))
seqrecord.id = f"{fname}:<-file:contig->:{contig_id}"
@@ -121,7 +149,7 @@ def xmfa_to_maf(xmfa_file, output_maf_file, fname_contigidx_to_header, fname_con
msa_record = MultipleSeqAlignment(curr_cluster_seqs)
msa_record._annotations={"pass": curr_cluster_idx}
writer.write_alignment(msa_record)
- return fname_to_contigid_to_coords
+ return fname_to_contigid_to_coords, fname_header_to_gcontigidx
#%%
def write_intercluster_regions(input_sequences, cluster_directory, fname_to_contigid_to_coords):
clusterdir_to_adjacent_clusters = defaultdict(set)
@@ -133,7 +161,8 @@ def write_intercluster_regions(input_sequences, cluster_directory, fname_to_cont
records = SeqIO.parse(fasta_f, 'fasta')
fname = Path(fasta_f).name
for record in records:
- coords = [(0, 0, "+", "START CAP")] + sorted(fname_to_contigid_to_coords[fname][record.id]) + [(len(record), len(record), "+", "END CAP")]
+ record.description = record.description.replace(" ", "^")
+ coords = [(0, 0, "+", "START CAP")] + sorted(fname_to_contigid_to_coords[fname][record.description]) + [(len(record), len(record), "+", "END CAP")]
for idx, (start, end, strand, cluster_idx) in enumerate(coords[1:-1]):
idx += 1
prev_end = coords[idx-1][1]
@@ -141,27 +170,23 @@ def write_intercluster_regions(input_sequences, cluster_directory, fname_to_cont
fwd, rev = "right", "left"
seq = record.seq
if strand == "-":
- pass
fwd, rev = rev, fwd
# seq = seq.reverse_complement()
- fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, rev)] = start - prev_end
- fname_contigid_to_cluster_dir_to_length[(fname, record.id)][(cluster_idx, fwd)] = next_start - end
- fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, rev)] = (coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left")
- fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.id)][(cluster_idx, fwd)] = (coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right")
+ fname_contigid_to_cluster_dir_to_length[(fname, record.description)][(cluster_idx, rev)] = start - prev_end
+ fname_contigid_to_cluster_dir_to_length[(fname, record.description)][(cluster_idx, fwd)] = next_start - end
+ fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.description)][(cluster_idx, rev)] = (coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left")
+ fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, record.description)][(cluster_idx, fwd)] = (coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right")
clusterdir_to_adjacent_clusters[(cluster_idx, fwd)].add((coords[idx+1][3], "left" if coords[idx+1][2] == "+" else "right"))
clusterdir_to_adjacent_clusters[(cluster_idx, rev)].add((coords[idx-1][3], "right" if coords[idx-1][2] == "+" else "left"))
- seq_to_write = SeqRecord(seq[prev_end:start].reverse_complement(), id=f"{fname}:<-file:contig->:{record.id}", description="")
+ seq_to_write = SeqRecord(seq[prev_end:start].reverse_complement(), id=f"{fname}:<-file:contig->:{record.description}", description="")
with open(f"{cluster_directory}/cluster{cluster_idx}-{rev}.fasta", 'a') as out_f:
SeqIO.write([seq_to_write], out_f, "fasta")
- seq_to_write = SeqRecord(seq[end:next_start], id=f"{fname}:<-file:contig->:{record.id}", description="")
+ seq_to_write = SeqRecord(seq[end:next_start], id=f"{fname}:<-file:contig->:{record.description}", description="")
with open(f"{cluster_directory}/cluster{cluster_idx}-{fwd}.fasta", 'a') as out_f:
SeqIO.write([seq_to_write], out_f, "fasta")
return fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster
#%%
def get_new_extensions(cluster_files, match_score, mismatch_score, gap_penalty):
- # mat = substitution_matrices.load("NUC.4.4")
- # mismatch_multiplier = 1
- # gap_match = -1
fname_parser = re.compile(r"^.*/cluster(\d+)-(.*).fasta")
clusterdir_len = defaultdict(lambda: defaultdict(int))
clusterdir_expand = {}
@@ -170,29 +195,36 @@ def get_new_extensions(cluster_files, match_score, mismatch_score, gap_penalty):
cluster_idx = int(cluster_idx)
sequences = list(SeqIO.parse(cluster_f, 'fasta'))
lengths = Counter(len(s) for s in sequences)
- idx = 0
- score_list = [0]
- clusterdir_len[(cluster_idx, direction)] = max(lengths)
- while idx < max(lengths):
- bases = Counter(seq[idx].upper() if len(seq) > idx else "-" for seq in sequences if seq)
- score = 0
- for n1, n2 in combinations(bases.keys(), r=2):
- if n1 == "-" or n2 == "-":
- score += gap_penalty * bases[n1] * bases[n2]
- else:
- score += mismatch_score * bases[n1] * bases[n2]
- for nuc in bases.keys():
- if nuc == "-":
- continue
- else:
- score += match_score * bases[nuc] * (bases[nuc] - 1) / 2
- score_list.append(score_list[-1] + score)
- idx += 1
+ maxlen = max(lengths)
+ align_all = True
+ if maxlen != 0:
+ bins = np.histogram([len(s) for s in sequences], bins=20)[0]
+ if max(bins)/len(sequences) < 0.99 or lengths.most_common(1)[0][0] == 0:
+ align_all = False
+
+ if not align_all:
+ idx = 0
+ score_list = [0]
+ clusterdir_len[(cluster_idx, direction)] = max(lengths)
+ while idx < max(lengths):
+ bases = Counter(seq[idx].upper() if len(seq) > idx else "-" for seq in sequences if seq)
+ score = 0
+ for n1, n2 in combinations(bases.keys(), r=2):
+ if n1 == "-" or n2 == "-":
+ score += gap_penalty * bases[n1] * bases[n2]
+ else:
+ score += mismatch_score * bases[n1] * bases[n2]
+ for nuc in bases.keys():
+ if nuc == "-":
+ continue
+ else:
+ score += match_score * bases[nuc] * (bases[nuc] - 1) / 2
+ score_list.append(score_list[-1] + score)
+ idx += 1
- extend_by = np.argmax(score_list)
- # extend_by = lengths.most_common()[0][0] if len(lengths) == 1 else (lengths.most_common()[0][0] if lengths.most_common()[0][0] != 0 else lengths.most_common()[1][0])
- # print(lengths)
- # extend_by = min(extend_by, 100)
+ extend_by = np.argmax(score_list)
+ else:
+ extend_by = lengths.most_common(1)[0][0]
clusterdir_expand[(cluster_idx, direction)] = extend_by
return clusterdir_expand, clusterdir_len
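
Note on the reworked loop above: a 20-bin histogram of flank lengths chooses between the two extension strategies; when nearly all lengths fall into one bin (and the modal length is nonzero), the modal length is used directly, otherwise the cumulative per-column scoring pass picks the extension. A minimal sketch of that consistency check with hypothetical lengths:

    import numpy as np
    from collections import Counter

    flank_lengths = [100, 100, 100, 98]             # hypothetical flank lengths
    bins = np.histogram(flank_lengths, bins=20)[0]  # occupancy of 20 equal-width bins
    lengths = Counter(flank_lengths)
    uniform = max(bins) / len(flank_lengths) >= 0.99 and lengths.most_common(1)[0][0] != 0
    print(uniform)  # False: 3/4 < 0.99, so the per-column scoring pass would run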
@@ -200,40 +232,43 @@ def get_new_extensions(cluster_files, match_score, mismatch_score, gap_penalty):
def write_xmfa_header(extended_xmfa_file, fname_header_to_gcontigidx, fname_contigid_to_length, num_clusters):
- with open(extended_xmfa_file, 'w') as xmfa_out:
+ with open(extended_xmfa_file, 'w') as xmfa_out:#, open(extended_xmfa_file + ".maf", 'w') as maf_out:
+ # maf_writer = MafWriter(maf_out)
+ # maf_writer.write_header()
### Write header
xmfa_out.write("#FormatVersion Parsnp v1.1\n")
xmfa_out.write(f"#SequenceCount {len(fname_header_to_gcontigidx)}\n")
for (fname, header), global_idx in fname_header_to_gcontigidx.items():
- xmfa_out.write(f"##SequenceIndex {global_idx + 1}\n")
+ xmfa_out.write(f"##SequenceIndex {global_idx}\n")
xmfa_out.write(f"##SequenceFile {fname}\n")
- xmfa_out.write(f"##SequenceHeader >{header}\n")
+ xmfa_out.write(f"##SequenceHeader >{header.replace('^', ' ')}\n")
xmfa_out.write(f"##SequenceLength {fname_contigid_to_length[(fname, header)]}\n")
xmfa_out.write(f"##IntervalCount {num_clusters}\n")
def write_xmfa_cluster(extended_xmfa_file, msa_records, fname_header_to_gcontigidx):
- with open(extended_xmfa_file, 'a') as xmfa_out:
+ with open(extended_xmfa_file, 'a') as xmfa_out:#, open(extended_xmfa_file + ".maf", 'a') as maf_out:
+ # maf_writer = MafWriter(maf_out)
### Write MSA record
header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
for msa_record in msa_records:
cluster = msa_record.annotations['cluster']
+ # maf_writer.write_alignment(msa_record)
for seq_record in msa_record:
fname, contig_id = header_parser.match(seq_record.id).groups()
strand = "-" if seq_record.annotations["strand"] == -1 else "+"
start = seq_record.annotations['start']
size = seq_record.annotations["size"]
- if strand == "+":
- seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start}-{start + size}"
+ if strand == "+" or True:
+ seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)]}:{start}-{start + size - 1}"
else:
- seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)] + 1}:{start - size}-{start}"
+ seq_record.id = f"{fname_header_to_gcontigidx[(fname, contig_id)]}:{start - size}-{start}"
seq_record.description = f"{strand} cluster{cluster} s1:p{start}"
SeqIO.write(msa_record, xmfa_out, "fasta")
xmfa_out.write("=\n")
-
# fname_to_contigid_to_extended = defaultdict(lambda: defaultdict(list))
# fname_to_contigid_to_coords[fname][contig_id].append((seqrecord.annotations["start"], seqrecord.annotations["start"] + seqrecord.annotations["size"], strand, cluster_idx))
def write_extended_xmfa(
@@ -245,27 +280,53 @@ def write_extended_xmfa(
fname_contigid_to_cluster_dir_to_length,
fname_contigid_to_cluster_dir_to_adjacent_cluster,
fname_header_to_gcontigidx,
- fname_contigid_to_length):
+ fname_contigid_to_length,
+ cutoff,
+ indel_cutoff,
+ cpu_count=1): # cutoff is the min ANI
header_parser = re.compile(r"(.+):<-file:contig->:(.+)")
clusters = set()
for fname, contigid in fname_contigid_to_cluster_dir_to_length.keys():
for cluster, _ in fname_contigid_to_cluster_dir_to_length[(fname, contigid)].keys():
clusters.add(cluster)
- write_xmfa_header(extended_maf_file, fname_header_to_gcontigidx, fname_contigid_to_length, cluster)
+ write_xmfa_header(extended_maf_file, fname_header_to_gcontigidx, fname_contigid_to_length, len(clusters))
with open(original_maf_file, 'r') as maf_in:
maf_iterator = MafIterator(maf_in)
# maf_writer = MafWriter(maf_out)
# maf_writer = MauveWriter(maf_out)
# maf_writer.write_header()
- for idx, msa_record in enumerate(maf_iterator):
+ old_matches = 0
+ old_max_matches = 0
+ new_matches = 0
+ new_max_matches = 0
+ old_nucs_aligned = 0
+ new_nucs_aligned = 0
+ for idx, msa_record in (enumerate(maf_iterator)):
fname, contig_id = header_parser.match(msa_record[0].id).groups()
cluster_idx = int(msa_record._annotations["pass"])
+ (record_matches, record_maxmatches), _ = get_ani_cutoff([record.seq for record in msa_record])
+ old_matches, old_max_matches = record_matches + old_matches, record_maxmatches + old_max_matches
+ old_nucs_aligned += sum(len(record.seq) for record in msa_record)
for direction in ("right", "left"):
expand_by = clusterdir_expand[(cluster_idx, direction)]
+ flanks = []
+ inter_cluster_sequences = SeqIO.index(f"{cluster_directory}/cluster{cluster_idx}-{direction}.fasta", 'fasta')
+ for seq_record in msa_record:
+ fname, contig_id = header_parser.match(seq_record.id).groups()
+ cluster_idx = int(cluster_idx)
+ flanking_seq = inter_cluster_sequences[f"{fname}:<-file:contig->:{contig_id}"]
+ flanks.append(flanking_seq[:fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)]])
+ lengths = Counter(len(s) for s in flanks)
+ maxlen = max(lengths)
+ if maxlen != 0 and expand_by != 0:
+ bins = np.histogram([len(s) for s in flanks], bins=20)[0]
+ if max(bins)/len(flanks) < 0.99 or lengths.most_common(1)[0][0] == 0:
+ expand_by = lengths.most_common(1)[0][0]
+ cluster_space_left = max(cldir_to_len[(cluster_idx, direction)] for cldir_to_len in fname_contigid_to_cluster_dir_to_length.values())
expand_by = min(
- expand_by,
- max(cldir_to_len[(cluster_idx, direction)] for cldir_to_len in fname_contigid_to_cluster_dir_to_length.values())
- )
+ expand_by,
+ cluster_space_left
+ )
# expand_by = min(expand_by, 7)
# expand_by = 10
# for seq_record in msa_record:
@@ -274,28 +335,85 @@ def write_extended_xmfa(
# expand_by = min(expand_by, fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)])
if expand_by <= 0:
continue
- logger.debug(f"Expanding cluster {cluster_idx} to the {direction} by {expand_by}")
- inter_cluster_sequences = SeqIO.index(f"{cluster_directory}/cluster{cluster_idx}-{direction}.fasta", 'fasta')
+
+ seqs_to_align = {}
+ # pprint({key: value.seq[:10] for key, value in inter_cluster_sequences.items()})
for seq_record in msa_record:
fname, contig_id = header_parser.match(seq_record.id).groups()
cluster_idx = int(cluster_idx)
+ # print(contig_id)
flanking_seq = inter_cluster_sequences[f"{fname}:<-file:contig->:{contig_id}"]
+ # print(flanking_seq.seq)
flanking_seq = flanking_seq[:fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(cluster_idx, direction)]]
if len(flanking_seq) >= expand_by:
flanking_seq.seq = flanking_seq.seq[:expand_by]
+ seqs_to_align[contig_id] = str(flanking_seq.seq)
+ minlen = min(len(s) for s in seqs_to_align.values())
+ for k in seqs_to_align:
+ seqs_to_align[k] = seqs_to_align[k][:minlen+indel_cutoff]
+ seqlist = list(seqs_to_align.values())
+ empty_seqs = [i for i in range(len(seqlist)) if seqlist[i] == ""]
+ nonempty_seqs = [s for s in seqlist if s != ""]
+ msa_result_temp = []
+ if minlen < 50:
+ aligner = pa.msa_aligner()
+ # time.sleep(.25)
+ res = aligner.msa(nonempty_seqs, out_cons=False, out_msa=True)
+ msa_result_temp = res.msa_seq
+ else:
+ nonempty_seq_file = f"{cluster_directory}/cluster{cluster_idx}_{direction}_nonempty.fa"
+ SeqIO.write(
+ (SeqIO.SeqRecord(Seq(sequence), id=str(seq_idx)) for seq_idx, sequence in enumerate(nonempty_seqs)),
+ nonempty_seq_file,
+ "fasta")
+ subprocess.check_call(
+ f"muscle -super5 {nonempty_seq_file} -output {nonempty_seq_file}_aligned.fa -threads {cpu_count}",
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.STDOUT,
+ shell=True)
+ msa_result_temp = [str(record.seq) for record in SeqIO.parse(f"{nonempty_seq_file}_aligned.fa", "fasta")]
+ msa_result = []
+ idx = 0
+ for aligned_seq in msa_result_temp:
+ while idx in empty_seqs:
+ msa_result.append("-"*len(msa_result_temp[0]))
+ idx += 1
+ msa_result.append(aligned_seq)
+ idx += 1
+ while idx in empty_seqs:
+ msa_result.append("-"*len(msa_result_temp[0]))
+ idx += 1
+ (_, _), ani_cutoff = get_ani_cutoff(msa_result, cutoff)
+ msa_seqs = [seq[:ani_cutoff] for seq in msa_result]
+ # for i, seq in enumerate(msa_seqs):
+ # print(f">Seq{i}")
+ # print(seq)
+ expansion_nucs = max(len(s.replace("-", "")) for s in msa_seqs)
+ logger.debug(f"Expanding cluster {cluster_idx} to the {direction} by {expansion_nucs}/{cluster_space_left}")
+
+ # continue
+ flanking_seqs = {}
+ for idx, seq in enumerate(msa_seqs):
+ contig_id = list(seqs_to_align.keys())[idx]
+ flanking_seqs[contig_id] = Seq(seq)
+ for seq_record in msa_record:
+ fname, contig_id = header_parser.match(seq_record.id).groups()
+ cluster_idx = int(cluster_idx)
+ flanking_seq = flanking_seqs[contig_id]
if direction == "left":
- flanking_seq.seq = flanking_seq.seq.reverse_complement()
- flanking_seq.seq = "-"*(expand_by - len(flanking_seq)) + flanking_seq.seq
- else:
- flanking_seq.seq = flanking_seq.seq + "-"*(expand_by - len(flanking_seq))
- seq_record.annotations["size"] += len(str(flanking_seq.seq).replace("-", ''))
+ flanking_seq = flanking_seq.reverse_complement()
+ # flanking_seq = "-"*(expand_by - len(flanking_seq)) + flanking_seq
+ # else:
+ # flanking_seq = flanking_seq + "-"*(expand_by - len(flanking_seq))
+ seq_record.annotations["size"] += len(str(flanking_seq).replace("-", ''))
if direction == "right":
- seq_record.seq = seq_record.seq + flanking_seq.seq
+ seq_record.seq = seq_record.seq + flanking_seq
else:
- seq_record.seq = flanking_seq.seq + seq_record.seq
- seq_record.annotations["start"] -= (expand_by - flanking_seq.seq.count("-"))
+ seq_record.seq = flanking_seq + seq_record.seq
+ seq_record.annotations["start"] -= (len(flanking_seq) - flanking_seq.count("-"))
+ # CHECK IF START > LEN OF SEQ OR SOMETHING
adj_cluster_idx, adj_cluster_dir = fname_contigid_to_cluster_dir_to_adjacent_cluster[(fname, contig_id)][(cluster_idx, direction)]
- fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(adj_cluster_idx, adj_cluster_dir)] -= len(str(flanking_seq.seq).replace("-", ''))
+ fname_contigid_to_cluster_dir_to_length[(fname, contig_id)][(adj_cluster_idx, adj_cluster_dir)] -= len(str(flanking_seq).replace("-", ''))
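
Note on the aligner dispatch above: flank sets whose shortest member is under 50 bp are aligned in-process with abPOA, while longer sets are written to disk and handed to MUSCLE v5 in super5 mode. A condensed sketch of that dispatch, assuming a muscle v5 binary on PATH and hypothetical file names:

    import subprocess
    import pyabpoa as pa
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    def align_flanks(seqs, minlen, prefix="cluster0_right", threads=1):
        """POA for short flanks, MUSCLE v5 (-super5) for long ones; returns aligned rows."""
        if minlen < 50:
            res = pa.msa_aligner().msa(seqs, out_cons=False, out_msa=True)
            return res.msa_seq
        infile = f"{prefix}_nonempty.fa"
        SeqIO.write((SeqRecord(Seq(s), id=str(i)) for i, s in enumerate(seqs)),
                    infile, "fasta")
        subprocess.check_call(
            f"muscle -super5 {infile} -output {infile}_aligned.fa -threads {threads}",
            stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, shell=True)
        return [str(r.seq) for r in SeqIO.parse(f"{infile}_aligned.fa", "fasta")]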
@@ -303,8 +421,14 @@ def write_extended_xmfa(
# fname, contig_id = header_parser.match(seq_record.id).groups()
# seq_record.id = contig_id
msa_record.annotations["cluster"] = cluster_idx
+ (record_matches, record_maxmatches), _ = get_ani_cutoff([record.seq for record in msa_record])
+ new_matches, new_max_matches = record_matches + new_matches, record_maxmatches + new_max_matches
+ new_nucs_aligned += sum(len(record.seq) for record in msa_record)
write_xmfa_cluster(extended_maf_file, [msa_record], fname_header_to_gcontigidx)
# maf_writer.write_alignment(msa_record)
+ total_length = sum(l for l in fname_contigid_to_length.values())
+ logger.debug(f"COVERAGE {(old_nucs_aligned / total_length) * 100:.3f}% --> {(new_nucs_aligned / total_length)*100:.3f}%")
+ logger.debug(f"ANI {100*(old_matches/old_max_matches):.2f}% --> {100*(new_matches/new_max_matches):.2f}%")
#%%
def check_maf(maf_file, fname_to_seqrecords):
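
Note on the two debug lines above: coverage is the total of aligned nucleotides over the summed contig lengths, and ANI is the pooled matches/max_matches ratio, each reported before and after extension. A worked arithmetic example with hypothetical totals:

    total_length = 4_000_000                                 # hypothetical summed contig lengths
    old_nucs_aligned, new_nucs_aligned = 3_200_000, 3_500_000
    print(f"COVERAGE {old_nucs_aligned / total_length * 100:.3f}% --> "
          f"{new_nucs_aligned / total_length * 100:.3f}%")   # 80.000% --> 87.500%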
@@ -361,13 +485,56 @@ def validate_coords(contig_to_coords):
# validate_coords(coords_extended)
# coords_original = check_maf("parsnp.maf")
#%%
+# match_score = 4
+# mismatch_penalty = -8
+# gap_penalty = -10
# validate = True
-# genome_dir = "sars-100-rev/"
-# sequence_files = glob(f"{genome_dir}/*.fa*")
+# genome_dir = "sars"
+# sequence_files = glob(f"{genome_dir}/*.fa")
# xmfa_file = f"{genome_dir}/parsnp.xmfa"
-# original_maf_file = f"{genome_dir}/parsnp-original.maf"
-# extended_maf_file = f"{genome_dir}/parsnp-extended.maf"
# cluster_directory = "clusters/"
+# # ref = f"{genome_dir}/ST-11_RT-078_M120.fa"
+# ref = f"{genome_dir}/GISAID_9381.fa.ref"
+# finalfiles = list(set(sequence_files) - set([ref]))
+# temp_directory = "temp/"
+# original_maf_file = f"{genome_dir}/parsnp.maf"
+# extended_xmfa_file = f"{genome_dir}/parsnp-extended.xmfa"
+# fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = get_sequence_data(
+# ref,
+# finalfiles,
+# index_files=True)
+# fname_to_contigid_to_coords, fname_header_to_gcontigidx = xmfa_to_maf(
+# xmfa_file,
+# original_maf_file,
+# fname_contigidx_to_header,
+# fname_contigid_to_length)
+# packed_write_result = write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
+# fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
+# cluster_files = glob(f"{temp_directory}/*.fasta")
+# clusterdir_expand, clusterdir_len = get_new_extensions(
+# cluster_files,
+# match_score,
+# mismatch_penalty,
+# gap_penalty)
+# write_extended_xmfa(
+# original_maf_file,
+# extended_xmfa_file,
+# temp_directory,
+# clusterdir_expand,
+# clusterdir_len,
+# fname_contigid_to_cluster_dir_to_length,
+# fname_contigid_to_cluster_dir_to_adjacent_cluster,
+# fname_header_to_gcontigidx,
+# fname_contigid_to_length)
+# parsnp_output = extended_xmfa_file
+
+
+
+
+
+
+
+
# fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = get_sequence_data(sequence_files, index_files=validate)
# fname_to_contigid_to_coords = xmfa_to_maf(xmfa_file, original_maf_file, fname_contigidx_to_header, fname_contigid_to_length)
@@ -376,16 +543,21 @@ def validate_coords(contig_to_coords):
# cluster_files = glob(f"{cluster_directory}/*.fasta")
# clusterdir_expand, clusterdir_len = get_new_extensions(cluster_files)
# write_extended_maf(
- # original_maf_file,
- # extended_maf_file,
- # cluster_directory,
- # clusterdir_expand,
- # clusterdir_len,
- # fname_contigid_to_cluster_dir_to_length,
- # fname_contigid_to_cluster_dir_to_adjacent_cluster)
+# original_maf_file,
+# extended_maf_file,
+# cluster_directory,
+# clusterdir_expand,
+# clusterdir_len,
+# fname_contigid_to_cluster_dir_to_length,
+# fname_contigid_to_cluster_dir_to_adjacent_cluster)
# if validate:
- # coords_extended = check_maf(extended_maf_file, fname_to_seqrecord)
+ # # fname_to_contigid_to_coords = xmfa_to_maf(
+ # # extended_xmfa_file,
+ # # extended_xmfa_file + ".maf",
+ # # fname_contigidx_to_header,
+ # # fname_contigid_to_length)
+ # coords_extended = check_maf(extended_xmfa_file + ".maf", fname_to_seqrecord)
# validate_coords(coords_extended)
- # coords_original = check_maf(original_maf_file, fname_to_seqrecord)
- # validate_coords(coords_original)
+ # # coords_original = check_maf(original_maf_file, fname_to_seqrecord)
+ # # validate_coords(coords_original)
=====================================
parsnp
=====================================
@@ -19,7 +19,7 @@ from glob import glob
import extend as ext
-__version__ = "1.7.2"
+__version__ = "1.7.3"
reroot_tree = True #use --midpoint-reroot
try:
@@ -224,13 +224,13 @@ def run_command(command,ignorerc=0):
>>$ {}
Please veryify input data and restart Parsnp.
If the problem persists please contact the Parsnp development team..
-
+
STDOUT:
{}
-
+
STDERR:
{}""".format(command, fstdout, fstderr))
-
+
sys.exit(rc)
else:
logger.debug(fstdout)
@@ -411,7 +411,7 @@ def parse_args():
"-u",
"--unaligned",
action = "store_true",
- help = "Ouput unaligned regions")
+ help = "Output unaligned regions")
recombination_args = parser.add_argument_group("Recombination filtration")
#TODO -x was a duplicate flag but no longer parsed as filter-phipack-snps in the current parsnp version
@@ -425,6 +425,16 @@ def parse_args():
"--extend-lcbs",
action="store_true",
help="Extend the boundaries of LCBs with an ungapped alignment")
+ extend_args.add_argument(
+ "--extend-ani-cutoff",
+ type=float,
+ default=0.95,
+ help="Cutoff ANI for lcb extension")
+ extend_args.add_argument(
+ "--extend-indel-cutoff",
+ type=int,
+ default=50,
+ help="Cutoff for indels in LCB extension region. LCB extension will be at most min(seqs) + cutoff bases")
extend_args.add_argument(
"--match-score",
type=float,
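
Note on the two new options above: --extend-ani-cutoff (default 0.95) is the minimum running ANI enforced by get_ani_cutoff, and --extend-indel-cutoff (default 50) caps each flank at min(len(seqs)) + cutoff bases before alignment. A self-contained sketch of the equivalent argparse wiring (group title hypothetical), showing how the dashed flags map to attribute names:

    import argparse

    parser = argparse.ArgumentParser()
    extend_args = parser.add_argument_group("LCB extension")  # hypothetical title
    extend_args.add_argument("--extend-ani-cutoff", type=float, default=0.95,
                             help="Cutoff ANI for LCB extension")
    extend_args.add_argument("--extend-indel-cutoff", type=int, default=50,
                             help="Max extra bases past the shortest flank")
    args = parser.parse_args(["--extend-ani-cutoff", "0.9"])
    print(args.extend_ani_cutoff, args.extend_indel_cutoff)  # 0.9 50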
@@ -592,7 +602,7 @@ if __name__ == "__main__":
missing = missing or (not has_fasttree)
if missing:
sys.exit(1)
-
+
# Create output dir
if outputDir == "." or outputDir == "./" or outputDir == "/":
logger.critical("Specified output dir is current working dir or root dir! will clobber any parsnp.* results")
@@ -626,6 +636,7 @@ if __name__ == "__main__":
except:
logger.error("{} is an invalid sequence file!".format(f))
if args.extend_lcbs and len(records) > 1:
+ print(f)
logger.error("Extending LCBs does not currently work with multi-contig inputs yet")
sys.exit(1)
for record in records:
@@ -792,7 +803,7 @@ SETTINGS:
allfiles = []
fnaf_sizes = {}
- allfile_dict = {}
+ allfile_dict = {}
reflen = 0
fnafiles = []
if ref == "!":
@@ -810,7 +821,18 @@ SETTINGS:
logger.warning("Reference genome sequence %s has '-' in the sequence!"%((ref)))
reflen = len(seq) - seq.count('\n')
- for input_file in input_files:
+ for input_file in input_files[:]:
+ try:
+ record = list(SeqIO.parse(input_file, "fasta"))
+ if len(record) == 0:
+ input_files.remove(input_file)
+ logger.error(f"{input_file} is an empty file!")
+ continue
+ except:
+ input_files.remove(input_file)
+ logger.error(f"Could not parse {input_file}!")
+ continue
+
ff = open(input_file, 'r')
hdr = ff.readline()
seq = ff.read()
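
Note on the new validation loop above: it iterates over input_files[:] (a shallow copy) so that empty or unparseable FASTA entries can be removed from the live list without skipping the element that follows each removal. A minimal illustration with a hypothetical file list:

    files = ["ok1.fa", "bad.fa", "ok2.fa"]
    # Removing while iterating the list itself shifts indices and skips items;
    # iterating a shallow copy keeps the walk stable.
    for f in files[:]:
        if f.startswith("bad"):  # stand-in for the SeqIO.parse() check
            files.remove(f)
    print(files)  # ['ok1.fa', 'ok2.fa']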
@@ -1017,27 +1039,27 @@ SETTINGS:
if randomly_selected_ref:
logger.warning("You are using a randomly selected genome to recruit genomes from your input...")
mash_out = subprocess.check_output([
- "mash", "dist", "-t",
- "-d", str(max_mash_dist),
- "-p", str(threads),
- ref,
+ "mash", "dist", "-t",
+ "-d", str(max_mash_dist),
+ "-p", str(threads),
+ ref,
"-l", all_genomes_fname],
stderr=open(os.path.join(outputDir, "mash.err"), 'w')).decode('utf-8')
finalfiles = [line.split('\t')[0] for line in mash_out.split('\n')[1:] if line != '' and len(line.split('\t')) > 1 and line.split('\t')[1].strip() != '']
elif use_ani:
if randomly_selected_ref:
subprocess.check_call([
- "fastANI",
- "--ql", all_genomes_fname,
- "--rl", all_genomes_fname,
+ "fastANI",
+ "--ql", all_genomes_fname,
+ "--rl", all_genomes_fname,
"-t", str(threads),
"-o", os.path.join(outputDir, "fastANI.tsv")],
stderr=open(os.path.join(outputDir, "fastANI.err"), 'w'))
else:
subprocess.check_call([
- "fastANI",
- "-q", ref,
- "--rl", all_genomes_fname,
+ "fastANI",
+ "-q", ref,
+ "--rl", all_genomes_fname,
"-t", str(threads),
"-o", os.path.join(outputDir, "fastANI.tsv")],
stderr=open(os.path.join(outputDir, "fastANI.err"), 'w'))
@@ -1048,18 +1070,18 @@ SETTINGS:
line = line.split('\t')
if float(line[2]) >= min_ani_cutoff:
genome_to_genomes[line[0]].add(line[1])
-
+
# for g in genome_to_genomes:
# print(len(g))
ani_ref = max(genome_to_genomes, key=(lambda key: len(genome_to_genomes[key])))
if autopick_ref:
auto_ref = ani_ref
finalfiles = list(genome_to_genomes[ani_ref])
-
+
# shutil.rmtree(tmp_dir)
except subprocess.CalledProcessError as e:
logger.critical(
- "Recruitment failed with exception {}. More details may be found in the *.err output log".format(str(e)))
+ "Recruitment failed with exception {}. More details may be found in the *.err output log".format(str(e)))
# shutil.rmtree(tmp_dir)
allfiles.extend(finalfiles)
@@ -1214,8 +1236,6 @@ Adjust params and rerun. If issue persists please submit a GitHub issue""")
Please verify recruited genomes are all strain of interest""")
else:
pass
- t2 = time.time()
- elapsed = float(t2)-float(t1)
#print("-->Getting list of LCBs.."
allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
blockfiles = []
@@ -1334,7 +1354,7 @@ Please verify recruited genomes are all strain of interest""")
pool.close()
pool.join()
brkeys = list(bedfile_dict.keys())
- brkeys.sort()
+ brkeys.sort()
for key in brkeys:
bedfile.write(bedfile_dict[key])
bedfile.close()
@@ -1347,16 +1367,16 @@ Please verify recruited genomes are all strain of interest""")
if args.extend_lcbs:
xmfa_file = f"{outputDir}/parsnp.xmfa"
with TemporaryDirectory() as temp_directory:
- original_maf_file = f"{temp_directory}/parsnp-original.maf"
+ original_maf_file = f"{outputDir}/parsnp-original.maf"
extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa"
- fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord, fname_header_to_gcontigidx = ext.get_sequence_data(
+ fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = ext.get_sequence_data(
ref,
- finalfiles,
+ finalfiles,
index_files=False)
- fname_to_contigid_to_coords = ext.xmfa_to_maf(
- xmfa_file,
- original_maf_file,
- fname_contigidx_to_header,
+ fname_to_contigid_to_coords, fname_header_to_gcontigidx = ext.xmfa_to_maf(
+ xmfa_file,
+ original_maf_file,
+ fname_contigidx_to_header,
fname_contigid_to_length)
packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
@@ -1375,7 +1395,10 @@ Please verify recruited genomes are all strain of interest""")
fname_contigid_to_cluster_dir_to_length,
fname_contigid_to_cluster_dir_to_adjacent_cluster,
fname_header_to_gcontigidx,
- fname_contigid_to_length)
+ fname_contigid_to_length,
+ args.extend_ani_cutoff,
+ args.extend_indel_cutoff,
+ threads)
parsnp_output = extended_xmfa_file
os.remove(original_maf_file)
@@ -1424,7 +1447,7 @@ Please verify recruited genomes are all strain of interest""")
logger.info("FastTreeMP failed. Trying fasttree...")
command = "fasttree -nt -quote -gamma -slow -boot 100 "+outputDir+os.sep+"parsnp.snps.mblocks > "+outputDir+os.sep+"parsnp.tree"
run_command(command)
-
+
#7)reroot to midpoint
@@ -1456,6 +1479,8 @@ Please verify recruited genomes are all strain of interest""")
run_command("harvesttools --midpoint-reroot -u -q -i "+outputDir+os.sep+"parsnp.ggr -o "+outputDir+os.sep+"parsnp.ggr -n %s"%(outputDir+os.sep+"parsnp.tree "))
+ t2 = time.time()
+ elapsed = float(t2)-float(t1)
if float(elapsed)/float(60.0) > 60:
logger.info("Aligned %d genomes in %.2f hours"%(totseqs,float(elapsed)/float(3600.0)))
elif float(elapsed) > 60:
=====================================
src/parsnp.cpp
=====================================
@@ -3253,7 +3253,7 @@ int main ( int argc, char* argv[] )
dif = difftime (tend,tstart);
cerr << "Parsnp: Finished core genome alignment" << endl;
- printf(" See log file for futher details. Total processing time: %.0lf seconds \n\n", dif );
+ printf(" See log file for further details. Total processing time: %.0lf seconds \n\n", dif );
exit(0);
}
View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/compare/ca0f9296e126cd1de30793cc0a35d70f7f6bb834...81b421918ee73c4178e9fbe503c69293f10e8bfe