[med-svn] [Git][med-team/parsnp][upstream] New upstream version 2.0.6+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sun Nov 3 15:40:07 GMT 2024
Étienne Mollier pushed to branch upstream at Debian Med / parsnp
Commits:
81f6b2b6 by Étienne Mollier at 2024-11-03T16:30:48+01:00
New upstream version 2.0.6+dfsg
- - - - -
2 changed files:
- README.md
- parsnp
Changes:
=====================================
README.md
=====================================
@@ -48,7 +48,7 @@ The `concat_start` and `concat_end` values are internal to parsnp. The sequence
## Building from source
-To build Parsnp from source, users must have automake 1.15, autoconf, and libtool installed. Parsnp also requires RaxML (or FastTree), Harvest-tools, and numpy. Some additional features require pySPOA, Mash, FastANI, and Phipack. All of these packages are available via Conda (many on the Bioconda channel).
+To build Parsnp from source, users must have automake 1.15, autoconf, and libtool installed. Parsnp also requires RaxML (or FastTree), Harvest-tools, biopython, tqdm, and numpy. Some additional features require pySPOA, Mash, FastANI, and Phipack. All of these packages are available via Conda (many on the Bioconda channel).
### Build instructions
First, you must build the Muscle library
=====================================
parsnp
=====================================
@@ -22,10 +22,9 @@ from glob import glob
from pathlib import Path
-import extend as ext
from tqdm import tqdm
-__version__ = "2.0.5"
+__version__ = "2.0.6"
reroot_tree = True #use --midpoint-reroot
random_seeded = random.Random(42)
@@ -144,13 +143,13 @@ signal.signal(signal.SIGINT, handler)
def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
sample_delim = '#'
SeqInfo = namedtuple("SeqInfo", "cid seq_length")
- hdr_block_pattern = re.compile("##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
+ hdr_block_pattern = re.compile(r"##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
idx_to_fname = {}
ref_fname = ""
with open(xmfa_path) as xmfa_in:
next(xmfa_in) # skip version
line = next(xmfa_in)
- seq_count = int(re.match("#SequenceCount (\d+)\n", line).groups()[0])
+ seq_count = int(re.match(r"#SequenceCount (\d+)\n", line).groups()[0])
for i in range(seq_count):
info_block = ""
for _ in range(4):
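The only change in xmfa_to_maf is switching the two regular expressions to raw string literals: on recent Python releases, escape sequences such as "\d" inside an ordinary string literal trigger a DeprecationWarning (a SyntaxWarning since Python 3.12), while a raw string hands the backslash to the regex engine unchanged. A minimal illustration of the difference, independent of the commit (the length value is made up):

    import re

    # ordinary literal: "\d" is an invalid string escape and warns on newer Pythons
    # raw literal: the backslash reaches re.compile untouched, which is what we want
    pattern = re.compile(r"##SequenceLength (\d+)bp")
    match = pattern.match("##SequenceLength 4641652bp")
    print(match.group(1))  # -> 4641652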
@@ -913,18 +912,35 @@ def make_genome_and_reference_output_strings(ref, genbank_files):
return sortem, ref_string, genome_string, ref
+def readfa(fp):
+ """
+ Fasta parser taken from readfq
+ """
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] == '>': # fasta header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+
+ yield name, ''.join(seqs) # yield a fasta record
+ if not last: break
+
def check_ref_genome_aligned(ref):
- # global ff, hdr, seq, line, reflen
+ reflen = 0
with open(ref, 'r') as ff:
- hdr = ff.readline()
- seq = ff.read()
- if hdr[0] != ">":
- logger.critical("Reference {} has improperly formatted header.".format(ref))
- sys.exit(1)
- for line in seq.split('\n'):
- if '-' in line and line[0] != ">":
- logger.warning("Reference genome sequence %s has '-' in the sequence!" % ((ref)))
- reflen = len(seq) - seq.count('\n')
+ for hdr, seq in readfa(ff):
+ if '-' in seq:
+ logger.warning(f"Reference genome sequence {hdr} in {ref} has '-' in the sequence!")
+ reflen += len(seq)
return reflen
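The new readfa generator is a FASTA-only variant of the readfq parser its docstring credits: it streams (name, sequence) tuples from an open handle instead of slurping the whole file, and check_ref_genome_aligned now sums record lengths over it while warning about gap characters per record. A small usage sketch with made-up records, assuming the readfa definition above is in scope (not part of the commit):

    from io import StringIO

    # two toy records; real input would be an open FASTA file handle
    handle = StringIO(">chr1 some description\nACGT\nACGT\n>chr2\nGGCC\n")
    for name, seq in readfa(handle):
        print(name, len(seq))
    # -> chr1 8
    # -> chr2 4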
@@ -962,22 +978,18 @@ def parse_input_files(input_files, curated, validate_input):
# Old version of the parser:
with open(input_file, 'r') as ff:
- hdr = ff.readline()
- seq = ff.read()
- name_flag = True
- seqlen = len(seq) - seq.count('\n')
- if hdr[0] != ">":
- logger.error("{} has improperly formatted header. Skip!".format(input_file))
+ concat_seq = ""
+ for hdr, seq in readfa(ff):
+ concat_seq += seq
+
+ seqlen = len(concat_seq)
+ if '-' in concat_seq:
+ logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
continue
- elif '-' in seq:
- seq = seq.split('\n')
- if any('-' in l and ('>' not in l) for l in seq):
- logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
- continue
elif seqlen <= 20:
logger.error("File %s is less than or equal to 20bp in length. Skip!" % (input_file))
continue
- sizediff = float(reflen) / float(seqlen)
+ sizediff = float(reflen) / seqlen
# Argument for ignoring any issues with the input/references:
if curated:
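Input validation follows the same pattern: each genome is concatenated through readfa, then skipped if it contains gap characters or is at most 20 bp long, and sizediff becomes a plain Python 3 true division. Condensed to its core, with a hypothetical input path and reference length, and the readfa generator above assumed in scope:

    reflen = 4641652                           # hypothetical reference length
    with open("sample.fa") as ff:              # hypothetical input file
        concat_seq = "".join(seq for _, seq in readfa(ff))
    if "-" in concat_seq:
        print("seems aligned, skip")
    elif len(concat_seq) <= 20:
        print("too short, skip")
    else:
        sizediff = float(reflen) / len(concat_seq)   # '/' is true division in Python 3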
@@ -1296,31 +1308,15 @@ SETTINGS:
#sort reference by largest replicon to smallest
if sortem and os.path.exists(ref) and not autopick_ref:
- sequences = SeqIO.parse(ref, "fasta")
+ with open(ref, 'r') as ff:
+ seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
+ seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
- SeqIO.write(sequences, new_ref, "fasta")
+ with open(new_ref, 'w') as ffo:
+ for hdr, seq in seqs_sorted_by_len:
+ ffo.write(f">{hdr}\n")
+ ffo.write(f"{seq}\n")
ref = new_ref
- # logger.debug("Sorting reference replicons")
- # ff = open(ref, 'r')
- # seqs = ff.read().split(">")[1:]
- # seq_dict = {}
- # seq_len = {}
- # for seq in seqs:
- # try:
- # hdr, seq = seq.split("\n",1)
- # except ValueError:
- # # TODO Why do we ignore when theres a header but no sequence?
- # continue
- # seq_dict[hdr] = seq
- # seq_len[hdr] = len(seq) - seq.count('\n')
- # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
- # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
- # ffo = open(ref, 'w')
- # for hdr, seq in seq_len_sort:
- # ffo.write(">%s\n"%(hdr))
- # ffo.write("%s"%(seq_dict[hdr]))
- # ff.close()
- # ffo.close()
else:
ref = genbank_ref
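The replicon-sorting block now reads the reference with readfa and writes the records back out from largest to smallest, replacing both the SeqIO round-trip (which copied the file without actually sorting it) and the long commented-out legacy code. The ordering step in isolation, with toy data rather than a real reference:

    # hypothetical replicons; real input comes from readfa over the reference FASTA
    seq_dict = {"plasmid_p1": "ACGT", "chromosome": "ACGTACGTACGT"}
    seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
    for hdr, seq in seqs_sorted_by_len:
        print(f">{hdr}")        # chromosome first, plasmid_p1 second
        print(seq)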
@@ -1479,27 +1475,17 @@ SETTINGS:
# More stuff to autopick the reference if needed:
orig_auto_ref = auto_ref
if os.path.exists(auto_ref) and autopick_ref:
- #TODO This code block is duplicated
- ff = open(auto_ref, 'r')
- seqs = ff.read().split(">")[1:]
seq_dict = {}
seq_len = {}
- for seq in seqs:
- try:
- hdr, seq = seq.split("\n",1)
- except ValueError:
- continue
- seq_dict[hdr] = seq
- seq_len[hdr] = len(seq) - seq.count('\n')
- seq_len_sort = sorted(seq_len.iteritems(), key=operator.itemgetter(1))
- seq_len_sort.reverse()
+ with open(auto_ref, 'r') as ff:
+ seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
+
+ seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
auto_ref = os.path.join(outputDir, os.path.basename(auto_ref)+".ref")
- ffo = open(ref, 'w')
- for item in seq_len_sort:
- ffo.write(">%s\n"%(item[0]))
- ffo.write(seq_dict[item[0]])
- ff.close()
- ffo.close()
+ with open(ref, 'w') as ffo:
+ for hdr, seq in seqs_sorted_by_len:
+ ffo.write(f">{hdr}\n")
+ ffo.write(f"{seq}\n")
ref = auto_ref
finalfiles = sorted(finalfiles)
@@ -1774,9 +1760,9 @@ SETTINGS:
# Harvest seems to fail sometimes when piping to stderr/stdout...
run_command(command)
- if run_recomb_filter and not args.partition:
+ if run_recomb_filter:
command = "harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir)
- run_logged_command(command, outputDir, label="recomb-filter")
+ run_command(command)
run_logged_command(
f"harvesttools -i {outputDir}/parsnp.ggr -S {outputDir}/parsnp.snps.mblocks",
View it on GitLab: https://salsa.debian.org/med-team/parsnp/-/commit/81f6b2b661597133307ad20928da078fdb996e69