[med-svn] [Git][med-team/augur][upstream] New upstream version 12.0.0
Nilesh Patra
gitlab at salsa.debian.org
Fri Apr 23 17:58:26 BST 2021
Nilesh Patra pushed to branch upstream at Debian Med / augur
Commits:
d19d29e5 by Nilesh Patra at 2021-04-23T22:21:53+05:30
New upstream version 12.0.0
- - - - -
8 changed files:
- CHANGES.md
- augur/__version__.py
- augur/distance.py
- augur/filter.py
- augur/refine.py
- augur/tree.py
- tests/functional/filter.t
- tests/test_filter.py
Changes:
=====================================
CHANGES.md
=====================================
@@ -3,6 +3,30 @@
## __NEXT__
+## 12.0.0 (13 April 2021)
+
+### Major Changes
+
+* filter: Date bounds (`--min-date` and `--max-date`) are now inclusive instead of exclusive such that records matching the given dates will pass date filters [#708][] (@benjaminotter)
+
+### Bug Fixes
+
+* refine: Recommend an alternate action when skyline optimization fails [#712][] (@huddlej)
+
+### Features
+
+* distance: Count insertion/deletion events once in pairwise distances [#698][] (@huddlej, @benjaminotter)
+* distance: Optionally ignore a specific list of characters defined in a distance map's top-level `ignored_characters` list [#707][] (@benjaminotter)
+* filter: Allow `--subsample-max-sequences` without `--group-by` [#710][] (@benjaminotter)
+* tree: Prefer `iqtree2` binary over `iqtree` when possible [#711][] (@benjaminotter)
+
+[#698]: https://github.com/nextstrain/augur/pull/698
+[#707]: https://github.com/nextstrain/augur/pull/707
+[#708]: https://github.com/nextstrain/augur/pull/708
+[#710]: https://github.com/nextstrain/augur/pull/710
+[#711]: https://github.com/nextstrain/augur/pull/711
+[#712]: https://github.com/nextstrain/augur/pull/712
+
## 11.3.0 (19 March 2021)
### Bug Fixes
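
The practical effect of the inclusive date bounds: a record dated exactly on a cutoff now passes both filters. A minimal sketch of the new semantics (the helper name is illustrative, not augur's code):

    # Sketch of inclusive date bounds; `passes_date_filters` is hypothetical.
    def passes_date_filters(date, min_date=None, max_date=None):
        # Bounds are now inclusive: reject only strictly-outside dates.
        if min_date is not None and date < min_date:   # previously rejected date <= min_date
            return False
        if max_date is not None and date > max_date:   # previously rejected date >= max_date
            return False
        return True

    assert passes_date_filters(2020.5, min_date=2020.5, max_date=2020.5)  # boundary date now kept
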
=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '11.3.0'
+__version__ = '12.0.0'
def is_augur_version_compatible(version):
=====================================
augur/distance.py
=====================================
@@ -135,6 +135,7 @@ import Bio
import Bio.Phylo
from collections import defaultdict
import copy
+from itertools import chain
import json
import numpy as np
import pandas as pd
@@ -198,7 +199,7 @@ def read_distance_map(map_file):
return distance_map
-def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map):
+def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, aggregate_function=max):
"""Calculate distance between the two given nodes using the given distance map.
In cases where the distance map between sequences is asymmetric, the first
@@ -245,15 +246,137 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
>>> distance_map = {"default": 0.0, "map": {"gene": {2: {('T', 'G'): 0.5}}}}
>>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map)
0.0
+
+ Treat a single indel as one event.
+
+ >>> node_a_sequences = {"gene": "ACTG"}
+ >>> node_b_sequences = {"gene": "A--G"}
+ >>> distance_map = {"default": 1, "map": {}}
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 1
+
+ Use the maximum weight of all sites affected by an indel with a site-specific distance map.
+
+ >>> distance_map = {"default": 0, "map": {"gene": {1: 1, 2: 2}}}
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 2
+
+ Use the maximum weight of all mutations at all sites affected by an indel with a mutation-specific distance map.
+
+ >>> distance_map = {
+ ... "default": 0,
+ ... "map": {
+ ... "gene": {
+ ... 1: {
+ ... ('C', 'G'): 1,
+ ... ('C', 'A'): 2
+ ... },
+ ... 2: {
+ ... ('T', 'G'): 3,
+ ... ('T', 'A'): 2
+ ... }
+ ... }
+ ... }
+ ... }
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 3
+
+ Use the maximum weight of gaps at all sites affected by an indel with a mutation-specific distance map.
+
+ >>> distance_map = {
+ ... "default": 0,
+ ... "map": {
+ ... "gene": {
+ ... 1: {
+ ... ('C', '-'): 1,
+ ... ('C', 'A'): 2
+ ... },
+ ... 2: {
+ ... ('T', 'G'): 3,
+ ... ('T', '-'): 2
+ ... }
+ ... }
+ ... }
+ ... }
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 2
+
+ If the default value is greater than any of the site-specific mismatches and
+ the specific mismatch does not have a weight defined, use the default
+ weight.
+
+ >>> distance_map = {
+ ... "default": 4,
+ ... "map": {
+ ... "gene": {
+ ... 1: {
+ ... ('C', 'G'): 1,
+ ... ('C', 'A'): 2
+ ... },
+ ... 2: {
+ ... ('T', 'G'): 3,
+ ... ('T', 'A'): 2
+ ... }
+ ... }
+ ... }
+ ... }
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 4
+
+ Count mismatches adjacent to indel events.
+
+ >>> node_a_sequences = {"gene": "ACTGTA"}
+ >>> node_b_sequences = {"gene": "A--CCA"}
+ >>> distance_map = {"default": 1, "map": {}}
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 3
+
+ Ignore specific characters defined in the distance map.
+
+ >>> node_a_sequences = {"gene": "ACTGG"}
+ >>> node_b_sequences = {"gene": "A--GN"}
+ >>> distance_map = {"default": 1, "ignored_characters":["-"], "map": {}}
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 1
+ >>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
+ >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+ 0
+
"""
distance_type = type(distance_map["default"])
distance = distance_type(0)
+ ignored_characters = distance_map.get("ignored_characters", [])
for gene in node_a_sequences:
gene_length = len(node_a_sequences[gene])
+ # In a first pass, find all mismatches between the given sequences. Use
+ # this pass to identify all sites in insertion/deletion (indel) events,
+ # so we can calculate an aggregate weight per event. Track each event by
+ # its start site.
+ mismatches_by_site = defaultdict(set)
+ in_gap = False
for site in range(gene_length):
- if node_a_sequences[gene][site] != node_b_sequences[gene][site]:
+ if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
+ and node_a_sequences[gene][site] not in ignored_characters
+ and node_b_sequences[gene][site] not in ignored_characters):
+ if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
+ if not in_gap:
+ gap_start = site
+ in_gap = True
+
+ mismatches_by_site[gap_start].add(site)
+ else:
+ in_gap = False
+ mismatches_by_site[site].add(site)
+ else:
+ in_gap = False
+
+ # Sum distances across mismatched sites, aggregating indel events by a
+ # summary function (e.g., max, mean, etc.).
+ for sites in mismatches_by_site.values():
+ mismatch_distances = []
+ for site in sites:
if gene in distance_map["map"] and site in distance_map["map"][gene]:
# Distances can be provided as either site- and
# sequence-specific dictionaries of sequence pairs to
@@ -262,14 +385,36 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
if isinstance(distance_map["map"][gene][site], dict):
seq_ancestral = node_a_sequences[gene][site]
seq_derived = node_b_sequences[gene][site]
- distance += distance_map["map"][gene][site].get(
- (seq_ancestral, seq_derived),
- distance_map["default"]
- )
+
+ # Check first for a user-defined weight for the
+ # mismatched bases. This supports mismatch weights
+ # between specific characters including gaps.
+ if (seq_ancestral, seq_derived) in distance_map["map"][gene][site]:
+ mismatch_distances.append(
+ distance_map["map"][gene][site][(seq_ancestral, seq_derived)]
+ )
+ # Next, check whether the mismatch is a gap. We want to
+ # take the aggregate of all weights at this site.
+ elif seq_ancestral == "-" or seq_derived == "-":
+ mismatch_distances.append(
+ aggregate_function(
+ chain(
+ (distance_map["default"],),
+ distance_map["map"][gene][site].values()
+ )
+ )
+ )
+ # Finally, use the default weight, if no
+ # sequence-specific weights are defined.
+ else:
+ mismatch_distances.append(distance_map["default"])
else:
- distance += distance_map["map"][gene][site]
+ mismatch_distances.append(distance_map["map"][gene][site])
else:
- distance += distance_map["default"]
+ mismatch_distances.append(distance_map["default"])
+
+ # Aggregate the distances for all sites in the current mismatch.
+ distance += aggregate_function(mismatch_distances)
return distance_type(np.round(distance, 2))
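
The indel handling above groups contiguous gap sites under the event's start site and then summarizes their weights with aggregate_function, instead of summing one weight per gapped site. A standalone sketch of that aggregation step (simplified to a site-weight map only; the function name is illustrative, not augur's API):

    from collections import defaultdict

    # Simplified sketch of event-based indel weighting: contiguous gap sites
    # are grouped by the event's start site, then one aggregated weight is
    # added per event rather than one weight per gapped site.
    def indel_event_distance(ancestral, derived, site_weights, default=1, aggregate=max):
        mismatches_by_site = defaultdict(set)
        in_gap = False
        for site, (a, b) in enumerate(zip(ancestral, derived)):
            if a == b:
                in_gap = False
            elif a == "-" or b == "-":
                if not in_gap:
                    gap_start = site
                    in_gap = True
                mismatches_by_site[gap_start].add(site)
            else:
                in_gap = False
                mismatches_by_site[site].add(site)
        return sum(
            aggregate(site_weights.get(site, default) for site in sites)
            for sites in mismatches_by_site.values()
        )

    # Matches the doctests above: "ACTG" vs. "A--G" is one event, and with
    # site weights {1: 1, 2: 2} its aggregated (max) weight is 2.
    assert indel_event_distance("ACTG", "A--G", {1: 1, 2: 2}, default=0) == 2
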
=====================================
augur/filter.py
=====================================
@@ -105,8 +105,8 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
- metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
- metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+ metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date (inclusive); may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+ metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date (inclusive); may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
@@ -125,7 +125,7 @@ def register_arguments(parser):
subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
subsample_limits_group = subsample_group.add_mutually_exclusive_group()
subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
- subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
+ subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the --group-by argument")
probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
@@ -348,9 +348,9 @@ def run(args):
dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
tmp = {s for s in seq_keep if dates[s] is not None}
if args.min_date:
- tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date}
+ tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>=args.min_date}
if args.max_date:
- tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date}
+ tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<=args.max_date}
num_excluded_by_date = len(seq_keep) - len(tmp)
seq_keep = tmp
@@ -376,7 +376,15 @@ def run(args):
if args.subsample_seed:
random.seed(args.subsample_seed)
num_excluded_subsamp = 0
- if args.group_by and (args.sequences_per_group or args.subsample_max_sequences):
+ if args.subsample_max_sequences or (args.group_by and args.sequences_per_group):
+
+ # Set groups to the group_by values.
+ if args.group_by:
+ groups = args.group_by
+ # If group_by is not specified, use a dummy category.
+ else:
+ groups = ["_dummy"]
+
spg = args.sequences_per_group
seq_names_by_group = defaultdict(list)
@@ -384,8 +392,10 @@ def run(args):
group = []
m = meta_dict[seq_name]
# collect group specifiers
- for c in args.group_by:
- if c in m:
+ for c in groups:
+ if c == "_dummy":
+ group.append(c)
+ elif c in m:
group.append(m[c])
elif c in ['month', 'year'] and 'date' in m:
try:
@@ -407,16 +417,16 @@ def run(args):
#If we didn't find any of the specified categories, all seqs will be in 'unknown' - but don't sample this!
if len(seq_names_by_group)==1 and ('unknown' in seq_names_by_group or ('unknown',) in seq_names_by_group):
- print("WARNING: The specified group-by categories (%s) were not found."%args.group_by,
+ print("WARNING: The specified group-by categories (%s) were not found."%groups,
"No sequences-per-group sampling will be done.")
- if any([x in args.group_by for x in ['year','month']]):
+ if any([x in groups for x in ['year','month']]):
print("Note that using 'year' or 'year month' requires a column called 'date'.")
print("\n")
else:
# Check to see if some categories are missing to warn the user
group_by = set(['date' if cat in ['year','month'] else cat
- for cat in args.group_by])
- missing_cats = [cat for cat in group_by if cat not in meta_columns]
+ for cat in groups])
+ missing_cats = [cat for cat in group_by if cat not in meta_columns and cat != "_dummy"]
if missing_cats:
print("WARNING:")
if any([cat != 'date' for cat in missing_cats]):
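
The `_dummy` group above is what lets `--subsample-max-sequences` work without `--group-by`: every sequence lands in a single group, so the existing per-group sampler applies unchanged. A minimal sketch of the grouping idea (the function and metadata layout are hypothetical):

    from collections import defaultdict

    # Sketch of the dummy-group approach: without group-by columns, all
    # sequences share the single key ("_dummy",) and are sampled as one group.
    def group_sequences(metadata, group_by=None):
        groups = group_by if group_by else ["_dummy"]
        seq_names_by_group = defaultdict(list)
        for name, record in metadata.items():
            key = tuple("_dummy" if c == "_dummy" else record.get(c, "unknown")
                        for c in groups)
            seq_names_by_group[key].append(name)
        return seq_names_by_group

    metadata = {"SEQ_1": {"date": "2020-01-01"}, "SEQ_2": {"date": "2020-02-01"}}
    assert list(group_sequences(metadata)) == [("_dummy",)]
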
=====================================
augur/refine.py
=====================================
@@ -222,7 +222,11 @@ def run(args):
node_data['skyline'] = [[float(x) for x in skyline.x], [float(y) for y in conf[0]],
[float(y) for y in skyline.y], [float(y) for y in conf[1]]]
except:
- print("ERROR: skyline optimization by TreeTime has failed.", file=sys.stderr)
+ print(
+ "ERROR: skyline optimization by TreeTime has failed.",
+ "To avoid this error, try running without coalescent optimization or with `--coalescent opt` instead of `--coalescent skyline`.",
+ file=sys.stderr
+ )
return 1
attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
=====================================
augur/tree.py
=====================================
@@ -133,6 +133,10 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
aln_file file name of input alignment
out_file file name to write tree to
'''
+ iqtree = find_executable([
+ "iqtree2",
+ "iqtree"
+ ])
# create a dictionary for characters that IQ-tree changes.
# we remove those prior to tree-building and reinstantiate later
def random_string(n):
@@ -188,10 +192,10 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
)
if substitution_model.lower() != "none":
- call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file),
+ call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file),
"-m", substitution_model, tree_builder_args, ">", log_file]
else:
- call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)]
+ call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)]
cmd = " ".join(call)
=====================================
tests/functional/filter.t
=====================================
@@ -20,6 +20,22 @@ With 10 groups to subsample from, this should produce one sequence per group.
\s*10 (re)
$ rm -f "$TMP/filtered.fasta"
+Filter with subsampling where no more than 5 sequences are requested and no groups are specified.
+This generates a dummy category and subsamples from it. With --no-probabilistic-sampling we expect exactly 5 sequences.
+
+ $ ${AUGUR} filter \
+ > --sequences filter/sequences.fasta \
+ > --sequence-index filter/sequence_index.tsv \
+ > --metadata filter/metadata.tsv \
+ > --min-date 2012 \
+ > --subsample-max-sequences 5 \
+ > --subsample-seed 314159 \
+ > --no-probabilistic-sampling \
+ > --output "$TMP/filtered.fasta" > /dev/null
+ $ grep ">" "$TMP/filtered.fasta" | wc -l
+ \s*5 (re)
+ $ rm -f "$TMP/filtered.fasta"
+
Try to filter with subsampling when there are more available groups than requested sequences.
This should fail, as probabilistic sampling is explicitly disabled.
=====================================
tests/test_filter.py
=====================================
@@ -215,3 +215,31 @@ class TestFilter:
augur.filter.run(args)
output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
assert list(output.keys()) == ["SEQ_1", "SEQ_3"]
+
+ def test_filter_run_min_date(self, tmpdir, fasta_fn, argparser):
+ """Test that filter --min-date is inclusive"""
+ out_fn = str(tmpdir / "out.fasta")
+ min_date = "2020-02-26"
+ meta_fn = write_metadata(tmpdir, (("strain","date"),
+ ("SEQ_1","2020-02-XX"),
+ ("SEQ_2","2020-02-26"),
+ ("SEQ_3","2020-02-25")))
+ args = argparser('-s %s --metadata %s -o %s --min-date %s'
+ % (fasta_fn, meta_fn, out_fn, min_date))
+ augur.filter.run(args)
+ output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
+ assert list(output.keys()) == ["SEQ_1", "SEQ_2"]
+
+ def test_filter_run_max_date(self, tmpdir, fasta_fn, argparser):
+ """Test that filter --max-date is inclusive"""
+ out_fn = str(tmpdir / "out.fasta")
+ max_date = "2020-03-01"
+ meta_fn = write_metadata(tmpdir, (("strain","date"),
+ ("SEQ_1","2020-03-XX"),
+ ("SEQ_2","2020-03-01"),
+ ("SEQ_3","2020-03-02")))
+ args = argparser('-s %s --metadata %s -o %s --max-date %s'
+ % (fasta_fn, meta_fn, out_fn, max_date))
+ augur.filter.run(args)
+ output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
+ assert list(output.keys()) == ["SEQ_1", "SEQ_2"]
\ No newline at end of file
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19d29e58828e426502c2411a61b05818fd67285