[med-svn] [Git][med-team/augur][upstream] New upstream version 12.0.0

Nilesh Patra gitlab at salsa.debian.org
Fri Apr 23 17:58:26 BST 2021



Nilesh Patra pushed to branch upstream at Debian Med / augur


Commits:
d19d29e5 by Nilesh Patra at 2021-04-23T22:21:53+05:30
New upstream version 12.0.0
- - - - -


8 changed files:

- CHANGES.md
- augur/__version__.py
- augur/distance.py
- augur/filter.py
- augur/refine.py
- augur/tree.py
- tests/functional/filter.t
- tests/test_filter.py


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,30 @@
 ## __NEXT__
 
 
+## 12.0.0 (13 April 2021)
+
+### Major Changes
+
+* filter: Date bounds (`--min-date` and `--max-date`) are now inclusive instead of exclusive such that records matching the given dates will pass date filters [#708][] (@benjaminotter)
+
+### Bug Fixes
+
+* refine: Recommend an alternate action when skyline optimization fails [#712][] (@huddlej)
+
+### Features
+
+* distance: Count insertion/deletion events once in pairwise distances [#698][] (@huddlej, @benjaminotter)
+* distance: Optionally ignore specific list of characters defined in a distance map's top-level `ignored_characters` list [#707][] (@benjaminotter)
+* filter: Allow `--subsample-max-sequences` without `--group-by` [#710][] (@benjaminotter)
+* tree: Prefer `iqtree2` binary over `iqtree` when possible [#711][] (@benjaminotter)
+
+[#698]: https://github.com/nextstrain/augur/pull/698
+[#707]: https://github.com/nextstrain/augur/pull/707
+[#708]: https://github.com/nextstrain/augur/pull/708
+[#710]: https://github.com/nextstrain/augur/pull/710
+[#711]: https://github.com/nextstrain/augur/pull/711
+[#712]: https://github.com/nextstrain/augur/pull/712
+
 ## 11.3.0 (19 March 2021)
 
 ### Bug Fixes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '11.3.0'
+__version__ = '12.0.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/distance.py
=====================================
@@ -135,6 +135,7 @@ import Bio
 import Bio.Phylo
 from collections import defaultdict
 import copy
+from itertools import chain
 import json
 import numpy as np
 import pandas as pd
@@ -198,7 +199,7 @@ def read_distance_map(map_file):
     return distance_map
 
 
-def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map):
+def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, aggregate_function=max):
     """Calculate distance between the two given nodes using the given distance map.
 
     In cases where the distance map between sequences is asymmetric, the first
@@ -245,15 +246,137 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     >>> distance_map = {"default": 0.0, "map": {"gene": {2: {('T', 'G'): 0.5}}}}
     >>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map)
     0.0
+
+    Treat a single indel as one event.
+
+    >>> node_a_sequences = {"gene": "ACTG"}
+    >>> node_b_sequences = {"gene": "A--G"}
+    >>> distance_map = {"default": 1, "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    1
+
+    Use the maximum weight of all sites affected by an indel with a site-specific distance map.
+
+    >>> distance_map = {"default": 0, "map": {"gene": {1: 1, 2: 2}}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    2
+
+    Use the maximum weight of all mutations at all sites affected by an indel with a mutation-specific distance map.
+
+    >>> distance_map = {
+    ...     "default": 0,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', 'G'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', 'A'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    3
+
+    Use the maximum weight of gaps at all sites affected by an indel with a mutation-specific distance map.
+
+    >>> distance_map = {
+    ...     "default": 0,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', '-'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', '-'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    2
+
+    If the default value is greater than any of the site-specific mismatches and
+    the specific mismatch does not have a weight defined, use the default
+    weight.
+
+    >>> distance_map = {
+    ...     "default": 4,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', 'G'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', 'A'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    4
+
+    Count mismatches adjacent to indel events.
+
+    >>> node_a_sequences = {"gene": "ACTGTA"}
+    >>> node_b_sequences = {"gene": "A--CCA"}
+    >>> distance_map = {"default": 1, "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    3
+
+    Ignore specific characters defined in the distance map.
+
+    >>> node_a_sequences = {"gene": "ACTGG"}
+    >>> node_b_sequences = {"gene": "A--GN"}
+    >>> distance_map = {"default": 1, "ignored_characters":["-"], "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    1
+    >>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    0
+
     """
     distance_type = type(distance_map["default"])
     distance = distance_type(0)
+    ignored_characters = distance_map.get("ignored_characters", [])
 
     for gene in node_a_sequences:
         gene_length = len(node_a_sequences[gene])
 
+        # In a first pass, find all mismatches between the given sequences. Use
+        # this pass to identify all sites in insertion/deletion (indel) events,
+        # so we can calculate an aggregate weight per event. Track each event by
+        # its start site.
+        mismatches_by_site = defaultdict(set)
+        in_gap = False
         for site in range(gene_length):
-            if node_a_sequences[gene][site] != node_b_sequences[gene][site]:
+            if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
+                and node_a_sequences[gene][site] not in ignored_characters
+                and node_b_sequences[gene][site] not in ignored_characters):
+                if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
+                    if not in_gap:
+                        gap_start = site
+                        in_gap = True
+
+                    mismatches_by_site[gap_start].add(site)
+                else:
+                    in_gap = False
+                    mismatches_by_site[site].add(site)
+            else:
+                in_gap = False
+
+        # Sum distances across mismatched sites, aggregating indel events by a
+        # summary function (e.g., max, mean, etc.).
+        for sites in mismatches_by_site.values():
+            mismatch_distances = []
+            for site in sites:
                 if gene in distance_map["map"] and site in distance_map["map"][gene]:
                     # Distances can be provided as either site- and
                     # sequence-specific dictionaries of sequence pairs to
@@ -262,14 +385,36 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
                     if isinstance(distance_map["map"][gene][site], dict):
                         seq_ancestral = node_a_sequences[gene][site]
                         seq_derived = node_b_sequences[gene][site]
-                        distance += distance_map["map"][gene][site].get(
-                            (seq_ancestral, seq_derived),
-                            distance_map["default"]
-                        )
+
+                        # Check first for a user-defined weight for the
+                        # mismatched bases. This supports mismatch weights
+                        # between specific characters including gaps.
+                        if (seq_ancestral, seq_derived) in distance_map["map"][gene][site]:
+                            mismatch_distances.append(
+                                distance_map["map"][gene][site][(seq_ancestral, seq_derived)]
+                            )
+                        # Next, check whether the mismatch is a gap. We want to
+                        # take the aggregate of all weights at this site.
+                        elif seq_ancestral == "-" or seq_derived == "-":
+                            mismatch_distances.append(
+                                aggregate_function(
+                                    chain(
+                                        (distance_map["default"],),
+                                        distance_map["map"][gene][site].values()
+                                    )
+                                )
+                            )
+                        # Finally, use the default weight, if no
+                        # sequence-specific weights are defined.
+                        else:
+                            mismatch_distances.append(distance_map["default"])
                     else:
-                        distance += distance_map["map"][gene][site]
+                        mismatch_distances.append(distance_map["map"][gene][site])
                 else:
-                    distance += distance_map["default"]
+                    mismatch_distances.append(distance_map["default"])
+
+            # Aggregate the distances for all sites in the current mismatch.
+            distance += aggregate_function(mismatch_distances)
 
     return distance_type(np.round(distance, 2))
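
For readers tracing the indel change: the new code takes a two-pass
approach, first grouping mismatched sites into events keyed by the start
site of each gap run, then aggregating weights per event. Here is a
minimal standalone sketch of that first pass; the function name and the
assertion data are illustrative, not part of augur's API:

    from collections import defaultdict

    def group_mismatches(seq_a, seq_b, ignored_characters=()):
        """Group mismatched sites by event: contiguous gap runs share
        the start site of the run, so a multi-base indel counts once."""
        mismatches_by_site = defaultdict(set)
        in_gap = False
        gap_start = None
        for site, (a, b) in enumerate(zip(seq_a, seq_b)):
            if a != b and a not in ignored_characters and b not in ignored_characters:
                if a == "-" or b == "-":
                    if not in_gap:
                        gap_start = site
                        in_gap = True
                    mismatches_by_site[gap_start].add(site)
                else:
                    in_gap = False
                    mismatches_by_site[site].add(site)
            else:
                in_gap = False
        return dict(mismatches_by_site)

    # The two-base deletion at sites 1-2 is keyed by its start site (1);
    # the isolated mismatch at site 4 stands alone.
    assert group_mismatches("ACTGTA", "A--GCA") == {1: {1, 2}, 4: {4}}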
 


=====================================
augur/filter.py
=====================================
@@ -105,8 +105,8 @@ def register_arguments(parser):
         Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
         (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
     )
-    metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
-    metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
     metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
                                 help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
     metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
@@ -125,7 +125,7 @@ def register_arguments(parser):
     subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
     subsample_limits_group = subsample_group.add_mutually_exclusive_group()
     subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
-    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the --group-by argument")
     probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
     probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
     probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
@@ -348,9 +348,9 @@ def run(args):
         dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
         tmp = {s for s in seq_keep if dates[s] is not None}
         if args.min_date:
-            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date}
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>=args.min_date}
         if args.max_date:
-            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date}
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<=args.max_date}
         num_excluded_by_date = len(seq_keep) - len(tmp)
         seq_keep = tmp
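
The inclusivity change above only swaps strict comparisons for
non-strict ones. Ambiguous dates resolve to a range of numeric dates, so
a record passes --min-date when its latest possible date reaches the
cutoff and --max-date when its earliest possible date does. A small
sketch of the comparison, with illustrative approximate values:

    import numpy as np

    # "2020-02-XX" resolves to a [min, max] numeric date range; the
    # values here are rough approximations for illustration.
    date_range = [2020.085, 2020.160]   # ~2020-02-01 .. ~2020-02-29
    min_date, max_date = 2020.153, 2020.160

    # Inclusive bounds: keep the record if its latest possible date is
    # on or after --min-date and its earliest is on or before --max-date.
    assert np.max(date_range) >= min_date
    assert np.min(date_range) <= max_date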
 
@@ -376,7 +376,15 @@ def run(args):
     if args.subsample_seed:
         random.seed(args.subsample_seed)
     num_excluded_subsamp = 0
-    if args.group_by and (args.sequences_per_group or args.subsample_max_sequences):
+    if args.subsample_max_sequences or (args.group_by and args.sequences_per_group):
+
+        # set groups to the group_by values
+        if args.group_by:
+            groups = args.group_by
+        # if group_by is not specified, use a dummy category
+        else:
+            groups = ["_dummy"]
+
         spg = args.sequences_per_group
         seq_names_by_group = defaultdict(list)
 
@@ -384,8 +392,10 @@ def run(args):
             group = []
             m = meta_dict[seq_name]
             # collect group specifiers
-            for c in args.group_by:
-                if c in m:
+            for c in groups:
+                if c == "_dummy":
+                    group.append(c)
+                elif c in m:
                     group.append(m[c])
                 elif c in ['month', 'year'] and 'date' in m:
                     try:
@@ -407,16 +417,16 @@ def run(args):
 
         #If we didn't find any of the specified categories, all seqs will be in 'unknown' - but don't sample this!
         if len(seq_names_by_group)==1 and ('unknown' in seq_names_by_group or ('unknown',) in seq_names_by_group):
-            print("WARNING: The specified group-by categories (%s) were not found."%args.group_by,
+            print("WARNING: The specified group-by categories (%s) were not found."%groups,
                   "No sequences-per-group sampling will be done.")
-            if any([x in args.group_by for x in ['year','month']]):
+            if any([x in groups for x in ['year','month']]):
                 print("Note that using 'year' or 'year month' requires a column called 'date'.")
             print("\n")
         else:
             # Check to see if some categories are missing to warn the user
             group_by = set(['date' if cat in ['year','month'] else cat
-                            for cat in args.group_by])
-            missing_cats = [cat for cat in group_by if cat not in meta_columns]
+                            for cat in groups])
+            missing_cats = [cat for cat in group_by if cat not in meta_columns and cat != "_dummy"]
             if missing_cats:
                 print("WARNING:")
                 if any([cat != 'date' for cat in missing_cats]):
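
With the "_dummy" group in place, --subsample-max-sequences works
without --group-by: every record falls into a single group, so
subsampling reduces to one global cap. A self-contained sketch of the
grouping step, with made-up metadata for illustration:

    import random
    from collections import defaultdict

    # Hypothetical metadata: ten strains, no --group-by categories.
    metadata = {"SEQ_%02d" % i: {} for i in range(1, 11)}
    groups = ["_dummy"]  # stands in for the absent --group-by

    seq_names_by_group = defaultdict(list)
    for seq_name, m in metadata.items():
        group = tuple("_dummy" if c == "_dummy" else m.get(c, "unknown")
                      for c in groups)
        seq_names_by_group[group].append(seq_name)

    # All ten strains share the one dummy group, so capping at five
    # sequences is a single random draw of size five.
    names = seq_names_by_group[("_dummy",)]
    assert len(names) == 10
    assert len(random.sample(names, 5)) == 5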


=====================================
augur/refine.py
=====================================
@@ -222,7 +222,11 @@ def run(args):
                 node_data['skyline'] = [[float(x) for x in skyline.x], [float(y) for y in conf[0]],
                                         [float(y) for y in skyline.y], [float(y) for y in conf[1]]]
             except:
-                print("ERROR: skyline optimization by TreeTime has failed.", file=sys.stderr)
+                print(
+                    "ERROR: skyline optimization by TreeTime has failed.",
+                    "To avoid this error, try running without coalescent optimization or with `--coalescent opt` instead of `--coalescent skyline`.",
+                    file=sys.stderr
+                )
                 return 1
 
         attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])


=====================================
augur/tree.py
=====================================
@@ -133,6 +133,10 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
         aln_file    file name of input alignment
         out_file    file name to write tree to
     '''
+    iqtree = find_executable([
+        "iqtree2",
+        "iqtree"
+    ])
     # create a dictionary for characters that IQ-tree changes.
     # we remove those prior to tree-building and reinstantiate later
     def random_string(n):
@@ -188,10 +192,10 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
         )
 
     if substitution_model.lower() != "none":
-        call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file),
+        call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file),
                 "-m", substitution_model, tree_builder_args, ">", log_file]
     else:
-        call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)]
+        call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)]
 
     cmd = " ".join(call)
 


=====================================
tests/functional/filter.t
=====================================
@@ -20,6 +20,22 @@ With 10 groups to subsample from, this should produce one sequence per group.
   \s*10 (re)
   $ rm -f "$TMP/filtered.fasta"
 
+Filter with subsampling where no more than 5 sequences are requested and no groups are specified.
+This generates a dummy category and subsamples from it. With --no-probabilistic-sampling we expect exactly 5 sequences.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --subsample-max-sequences 5 \
+  >  --subsample-seed 314159 \
+  >  --no-probabilistic-sampling \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep ">" "$TMP/filtered.fasta" | wc -l
+  \s*5 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
 Try to filter with subsampling when there are more available groups than requested sequences.
 This should fail, as probabilistic sampling is explicitly disabled.
 


=====================================
tests/test_filter.py
=====================================
@@ -215,3 +215,31 @@ class TestFilter:
         augur.filter.run(args)
         output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
         assert list(output.keys()) == ["SEQ_1", "SEQ_3"]
+
+    def test_filter_run_min_date(self, tmpdir, fasta_fn, argparser):
+        """Test that filter --min-date is inclusive"""
+        out_fn = str(tmpdir / "out.fasta")
+        min_date = "2020-02-26"
+        meta_fn = write_metadata(tmpdir, (("strain","date"),
+                                          ("SEQ_1","2020-02-XX"),
+                                          ("SEQ_2","2020-02-26"),
+                                          ("SEQ_3","2020-02-25")))
+        args = argparser('-s %s --metadata %s -o %s --min-date %s'
+                         % (fasta_fn, meta_fn, out_fn, min_date))
+        augur.filter.run(args)
+        output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
+        assert list(output.keys()) == ["SEQ_1", "SEQ_2"]
+
+    def test_filter_run_max_date(self, tmpdir, fasta_fn, argparser):
+        """Test that filter --max-date is inclusive"""
+        out_fn = str(tmpdir / "out.fasta")
+        max_date = "2020-03-01"
+        meta_fn = write_metadata(tmpdir, (("strain","date"),
+                                          ("SEQ_1","2020-03-XX"),
+                                          ("SEQ_2","2020-03-01"),
+                                          ("SEQ_3","2020-03-02")))
+        args = argparser('-s %s --metadata %s -o %s --max-date %s'
+                         % (fasta_fn, meta_fn, out_fn, max_date))
+        augur.filter.run(args)
+        output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
+        assert list(output.keys()) == ["SEQ_1", "SEQ_2"]
\ No newline at end of file



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19d29e58828e426502c2411a61b05818fd67285
