[med-svn] [Git][med-team/parsnp][master] 6 commits: New upstream version 2.0.6+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sun Nov 3 15:39:52 GMT 2024
Étienne Mollier pushed to branch master at Debian Med / parsnp
Commits:
81f6b2b6 by Étienne Mollier at 2024-11-03T16:30:48+01:00
New upstream version 2.0.6+dfsg
- - - - -
c89249d9 by Étienne Mollier at 2024-11-03T16:30:48+01:00
Update upstream source from tag 'upstream/2.0.6+dfsg'
Update to upstream version '2.0.6+dfsg'
with Debian dir e289d8e985e45961a788c40dd165fac3c3b9ad8d
- - - - -
2c9a5130 by Étienne Mollier at 2024-11-03T16:33:54+01:00
proper_calls_to_tools.patch: unfuzz.
- - - - -
9ce71511 by Étienne Mollier at 2024-11-03T16:34:06+01:00
py3-parsnp-libs.patch: adjust to new import of "extend".
- - - - -
df42997b by Étienne Mollier at 2024-11-03T16:38:27+01:00
d/patches/*.patch: normalize last update dates.
- - - - -
f995192c by Étienne Mollier at 2024-11-03T16:39:25+01:00
d/changelog: ready for upload to unstable.
- - - - -
7 changed files:
- README.md
- debian/changelog
- debian/patches/add_missing_interpreter_line.patch
- debian/patches/non-versioned-libs.patch
- debian/patches/proper_calls_to_tools.patch
- debian/patches/py3-parsnp-libs.patch
- parsnp
Changes:
=====================================
README.md
=====================================
@@ -48,7 +48,7 @@ The `concat_start` and `concat_end` values are internal to parsnp. The sequence
## Building from source
-To build Parsnp from source, users must have automake 1.15, autoconf, and libtool installed. Parsnp also requires RaxML (or FastTree), Harvest-tools, and numpy. Some additional features require pySPOA, Mash, FastANI, and Phipack. All of these packages are available via Conda (many on the Bioconda channel).
+To build Parsnp from source, users must have automake 1.15, autoconf, and libtool installed. Parsnp also requires RaxML (or FastTree), Harvest-tools, biopython, tqdm, and numpy. Some additional features require pySPOA, Mash, FastANI, and Phipack. All of these packages are available via Conda (many on the Bioconda channel).
### Build instructions
First, you must build the Muscle library
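
Since the dependency list above now also names biopython and tqdm alongside
numpy, a quick environment sanity check before building can save a failed
run. The short Python sketch below is illustrative only; the module and
executable names (e.g. "raxmlHPC", "fasttree", "harvesttools") are
assumptions about a typical install, not taken from the upstream build
system.

    #!/usr/bin/env python3
    """Rough pre-build dependency check for Parsnp (illustrative sketch)."""
    import importlib.util
    import shutil

    # Python modules from the README list (biopython imports as "Bio").
    python_modules = ["Bio", "tqdm", "numpy"]
    # Build tools and external programs; binary names vary between
    # installations and are assumed here.
    executables = ["autoconf", "automake", "libtool",
                   "harvesttools", "raxmlHPC", "fasttree"]

    missing = [m for m in python_modules
               if importlib.util.find_spec(m) is None]
    missing += [e for e in executables if shutil.which(e) is None]

    if missing:
        print("Missing dependencies:", ", ".join(missing))
    else:
        print("All checked dependencies found.")
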
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+parsnp (2.0.6+dfsg-1) unstable; urgency=medium
+
+ * New upstream version 2.0.6+dfsg
+ * proper_calls_to_tools.patch: unfuzz.
+ * py3-parsnp-libs.patch: adjust to new import of "extend".
+ * d/patches/*.patch: normalize last update dates.
+
+ -- Étienne Mollier <emollier@debian.org>  Sun, 03 Nov 2024 16:39:17 +0100
+
parsnp (2.0.5+dfsg-1) unstable; urgency=medium
* New upstream version 2.0.5+dfsg
=====================================
debian/patches/add_missing_interpreter_line.patch
=====================================
@@ -1,5 +1,5 @@
Author: Nilesh Patra
-Last-Update: 2020-09-21 15:59:19 +0000
+Last-Update: 2020-09-21
Description: Force Python3 interpreter
Forwarded: not-needed
=====================================
debian/patches/non-versioned-libs.patch
=====================================
@@ -1,6 +1,6 @@
Description: libmuscle-3.7-dev is now libmuscle-dev
Author: Andreas Tille <tille@debian.org>
-Last-Update: Wed, 18 Jul 2018 13:17:32 +0200
+Last-Update: 2018-07-18
Forwarded: not-needed
--- parsnp.orig/src/Makefile.am
=====================================
debian/patches/proper_calls_to_tools.patch
=====================================
@@ -1,11 +1,11 @@
Author: Nilesh Patra
-Last-Update: 2020-09-21 15:59:19 +0000
+Last-Update: 2020-09-21
Description: Fix name of phipack executable
Forwarded: not-needed
--- parsnp.orig/parsnp
+++ parsnp/parsnp
-@@ -194,7 +194,7 @@
+@@ -193,7 +193,7 @@
def run_phipack(query,seqlen,workingdir):
currdir = os.getcwd()
os.chdir(workingdir)
@@ -14,7 +14,7 @@ Forwarded: not-needed
run_command(command, 1)
os.chdir(currdir)
-@@ -696,7 +696,7 @@
+@@ -695,7 +695,7 @@
missing = True
logger.critical("{} not in system path!".format(exe))
if use_phipack:
@@ -23,7 +23,7 @@ Forwarded: not-needed
if shutil.which(exe) is None:
missing = True
logger.critical("{} not in system path!".format(exe))
-@@ -711,7 +711,7 @@
+@@ -710,7 +710,7 @@
logger.critical("No fasttree executable found in system path!".format(exe))
missing = missing or (not has_fasttree)
else:
@@ -32,7 +32,7 @@ Forwarded: not-needed
if shutil.which(exe) is None:
missing = True
logger.critical("{} not in system path!".format(exe))
-@@ -1029,7 +1029,7 @@
+@@ -1041,7 +1041,7 @@
logger.debug("Writing .ini file")
if xtrafast or 1:
args.extend = False
@@ -41,7 +41,7 @@ Forwarded: not-needed
inifiled = inifiled.replace("$REF", ref)
inifiled = inifiled.replace("$EXTEND", "%d" % (args.extend))
inifiled = inifiled.replace("$ANCHORS", str(args.min_anchor_length))
-@@ -1128,7 +1128,7 @@
+@@ -1140,7 +1140,7 @@
if not os.path.exists(inifile):
logger.error("ini file %s does not exist!\n"%(inifile))
sys.exit(1)
@@ -50,7 +50,7 @@ Forwarded: not-needed
# with open(f"{outputDir}/parsnpAligner.out", 'w') as stdout_f, open(f"{outputDir}/parsnpAligner.err", 'w') as stderr_f:
# rc = run_command(command, ignorerc=1, stdout=stdout_f, stderr=stderr_f, prepend_time=True)
rc = run_logged_command(command=command, ignorerc=1, label="parsnp-aligner", outputDir=outputDir)
-@@ -1352,10 +1352,10 @@
+@@ -1348,10 +1348,10 @@
logger.info("Recruiting genomes...")
if use_parsnp_mumi:
if not inifile_exists:
@@ -63,7 +63,7 @@ Forwarded: not-needed
run_logged_command(command=command, outputDir=outputDir, label="parsnp-mumi")
# Takes eeach sequence and computes its mumi distance to the reference
try:
-@@ -1798,7 +1798,7 @@
+@@ -1784,7 +1784,7 @@
break
if not use_fasttree:
with TemporaryDirectory() as raxml_output_dir:
=====================================
debian/patches/py3-parsnp-libs.patch
=====================================
@@ -28,12 +28,12 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
import argparse
import signal
from multiprocessing import Pool
-@@ -22,7 +22,7 @@
- from pathlib import Path
+@@ -1720,7 +1720,7 @@
+ if args.extend_lcbs:
+ logger.warning("The LCB extension module is experimental. Runtime may be significantly increased and extended alignments may not be as high quality as the original core-genome. Extensions off of existing LCBs are in a separate xmfa file.")
+ import partition
+- import extend as ext
++ import parsnp.extend as ext
-
--import extend as ext
-+import parsnp.extend as ext
- from tqdm import tqdm
-
- __version__ = "2.0.5"
+ orig_parsnp_xmfa = parsnp_output
+ extended_parsnp_xmfa = orig_parsnp_xmfa + ".extended"
=====================================
parsnp
=====================================
@@ -22,10 +22,9 @@ from glob import glob
from pathlib import Path
-import extend as ext
from tqdm import tqdm
-__version__ = "2.0.5"
+__version__ = "2.0.6"
reroot_tree = True #use --midpoint-reroot
random_seeded = random.Random(42)
@@ -144,13 +143,13 @@ signal.signal(signal.SIGINT, handler)
def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
sample_delim = '#'
SeqInfo = namedtuple("SeqInfo", "cid seq_length")
- hdr_block_pattern = re.compile("##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
+ hdr_block_pattern = re.compile(r"##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
idx_to_fname = {}
ref_fname = ""
with open(xmfa_path) as xmfa_in:
next(xmfa_in) # skip version
line = next(xmfa_in)
- seq_count = int(re.match("#SequenceCount (\d+)\n", line).groups()[0])
+ seq_count = int(re.match(r"#SequenceCount (\d+)\n", line).groups()[0])
for i in range(seq_count):
info_block = ""
for _ in range(4):
@@ -913,18 +912,35 @@ def make_genome_and_reference_output_strings(ref, genbank_files):
return sortem, ref_string, genome_string, ref
+def readfa(fp):
+ """
+ Fasta parser taken from readfq
+ """
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] == '>': # fasta header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+
+ yield name, ''.join(seqs) # yield a fasta record
+ if not last: break
+
def check_ref_genome_aligned(ref):
- # global ff, hdr, seq, line, reflen
+ reflen = 0
with open(ref, 'r') as ff:
- hdr = ff.readline()
- seq = ff.read()
- if hdr[0] != ">":
- logger.critical("Reference {} has improperly formatted header.".format(ref))
- sys.exit(1)
- for line in seq.split('\n'):
- if '-' in line and line[0] != ">":
- logger.warning("Reference genome sequence %s has '-' in the sequence!" % ((ref)))
- reflen = len(seq) - seq.count('\n')
+ for hdr, seq in readfa(ff):
+ if '-' in seq:
+ logger.warning(f"Reference genome sequence {hdr} in {ref} has '-' in the sequence!")
+ reflen += len(seq)
return reflen
@@ -962,22 +978,18 @@ def parse_input_files(input_files, curated, validate_input):
# Old version of the parser:
with open(input_file, 'r') as ff:
- hdr = ff.readline()
- seq = ff.read()
- name_flag = True
- seqlen = len(seq) - seq.count('\n')
- if hdr[0] != ">":
- logger.error("{} has improperly formatted header. Skip!".format(input_file))
+ concat_seq = ""
+ for hdr, seq in readfa(ff):
+ concat_seq += seq
+
+ seqlen = len(concat_seq)
+ if '-' in concat_seq:
+ logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
continue
- elif '-' in seq:
- seq = seq.split('\n')
- if any('-' in l and ('>' not in l) for l in seq):
- logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
- continue
elif seqlen <= 20:
logger.error("File %s is less than or equal to 20bp in length. Skip!" % (input_file))
continue
- sizediff = float(reflen) / float(seqlen)
+ sizediff = float(reflen) / seqlen
# Argument for ignoring any issues with the input/references:
if curated:
@@ -1296,31 +1308,15 @@ SETTINGS:
#sort reference by largest replicon to smallest
if sortem and os.path.exists(ref) and not autopick_ref:
- sequences = SeqIO.parse(ref, "fasta")
+ with open(ref, 'r') as ff:
+ seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
+ seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
- SeqIO.write(sequences, new_ref, "fasta")
+ with open(new_ref, 'w') as ffo:
+ for hdr, seq in seqs_sorted_by_len:
+ ffo.write(f">{hdr}\n")
+ ffo.write(f"{seq}\n")
ref = new_ref
- # logger.debug("Sorting reference replicons")
- # ff = open(ref, 'r')
- # seqs = ff.read().split(">")[1:]
- # seq_dict = {}
- # seq_len = {}
- # for seq in seqs:
- # try:
- # hdr, seq = seq.split("\n",1)
- # except ValueError:
- # # TODO Why do we ignore when theres a header but no sequence?
- # continue
- # seq_dict[hdr] = seq
- # seq_len[hdr] = len(seq) - seq.count('\n')
- # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
- # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
- # ffo = open(ref, 'w')
- # for hdr, seq in seq_len_sort:
- # ffo.write(">%s\n"%(hdr))
- # ffo.write("%s"%(seq_dict[hdr]))
- # ff.close()
- # ffo.close()
else:
ref = genbank_ref
@@ -1479,27 +1475,17 @@ SETTINGS:
# More stuff to autopick the reference if needed:
orig_auto_ref = auto_ref
if os.path.exists(auto_ref) and autopick_ref:
- #TODO This code block is duplicated
- ff = open(auto_ref, 'r')
- seqs = ff.read().split(">")[1:]
seq_dict = {}
seq_len = {}
- for seq in seqs:
- try:
- hdr, seq = seq.split("\n",1)
- except ValueError:
- continue
- seq_dict[hdr] = seq
- seq_len[hdr] = len(seq) - seq.count('\n')
- seq_len_sort = sorted(seq_len.iteritems(), key=operator.itemgetter(1))
- seq_len_sort.reverse()
+ with open(auto_ref, 'r') as ff:
+ seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
+
+ seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
auto_ref = os.path.join(outputDir, os.path.basename(auto_ref)+".ref")
- ffo = open(ref, 'w')
- for item in seq_len_sort:
- ffo.write(">%s\n"%(item[0]))
- ffo.write(seq_dict[item[0]])
- ff.close()
- ffo.close()
+ with open(ref, 'w') as ffo:
+ for hdr, seq in seqs_sorted_by_len:
+ ffo.write(f">{hdr}\n")
+ ffo.write(f"{seq}\n")
ref = auto_ref
finalfiles = sorted(finalfiles)
@@ -1774,9 +1760,9 @@ SETTINGS:
# Harvest seems to fail sometimes when piping to stderr/stdout...
run_command(command)
- if run_recomb_filter and not args.partition:
+ if run_recomb_filter:
command = "harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir)
- run_logged_command(command, outputDir, label="recomb-filter")
+ run_command(command)
run_logged_command(
f"harvesttools -i {outputDir}/parsnp.ggr -S {outputDir}/parsnp.snps.mblocks",
View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/compare/42bd696c44651ea7745dd0162963789923dcc14a...f995192c9bae50b5dfbeabd09e15432cddd8c864
You're receiving this email because of your account on salsa.debian.org.