[med-svn] [Git][med-team/augur][upstream] New upstream version 10.3.0

Nilesh Patra gitlab at salsa.debian.org
Mon Jan 18 14:14:52 GMT 2021



Nilesh Patra pushed to branch upstream at Debian Med / augur


Commits:
884cbf1f by Nilesh Patra at 2021-01-18T19:34:20+05:30
New upstream version 10.3.0
- - - - -


12 changed files:

- .travis.yml
- CHANGES.md
- README.md
- augur/__version__.py
- augur/frequencies.py
- augur/frequency_estimators.py
- + codecov.yml
- paper/paper.bib
- paper/paper.md
- scripts/identify_emerging_clades.py
- scripts/verify_meta_json.py
- tests/test_frequencies.py


Changes:

=====================================
.travis.yml
=====================================
@@ -24,7 +24,7 @@ jobs:
         - conda config --set always_yes yes --set changeps1 no
         - conda update -q conda
         - conda info -a
-        - conda env create -f environment.yml
+        - conda create -n augur -c bioconda python=$TRAVIS_PYTHON_VERSION mafft raxml fasttree iqtree vcftools pip
         - source activate augur
       install:
         - pip install -e .[dev]


=====================================
CHANGES.md
=====================================
@@ -3,6 +3,24 @@
 ## __NEXT__
 
 
+## 10.3.0 (14 January 2021)
+
+### Bug Fixes
+
+* scripts: Fix typo in `verify_meta_json.py` [#656][] (@felixonmars)
+* CI: Use the expected Python version in conda environments [#658][]
+* CI: Minimize codecov feedback [#661][]
+
+### Features
+
+* frequencies: Add `--pivot-interval-units` argument with support for weekly pivots [#660][]
+* frequencies: Add support for ISO dates for `--min-date` and `--max-date` arguments [#660][]
+
+[#656]: https://github.com/nextstrain/augur/pull/656
+[#658]: https://github.com/nextstrain/augur/pull/658
+[#660]: https://github.com/nextstrain/augur/pull/660
+[#661]: https://github.com/nextstrain/augur/pull/661
+
 ## 10.2.0 (1 January 2021)
 
 ### Features


=====================================
README.md
=====================================
@@ -3,6 +3,7 @@
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/augur/README.html)
 [![Documentation Status](https://readthedocs.org/projects/nextstrain-augur/badge/?version=latest)](https://nextstrain-augur.readthedocs.io/en/stable/?badge=latest)
 [![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
+[![DOI](https://joss.theoj.org/papers/10.21105/joss.02906/status.svg)](https://doi.org/10.21105/joss.02906)
 
 ## About Nextstrain
 


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '10.2.0'
+__version__ = '10.3.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/frequencies.py
=====================================
@@ -2,10 +2,12 @@
 infer frequencies of mutations or clades
 """
 import json, os, sys
+import datetime
 import numpy as np
 from collections import defaultdict
 from Bio import Phylo, AlignIO
 from Bio.Align import MultipleSeqAlignment
+import treetime.utils
 
 from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
 from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
@@ -21,11 +23,13 @@ def register_arguments(parser):
     parser.add_argument('--regions', type=str, nargs='+', default=['global'],
                         help="region to subsample to")
     parser.add_argument("--pivot-interval", type=int, default=3,
-                        help="number of months between pivots")
-    parser.add_argument('--min-date', type=float,
-                        help="minimal pivot value")
-    parser.add_argument('--max-date', type=float,
-                        help="maximal pivot value")
+                        help="number of units between pivots")
+    parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'],
+                        help="space pivots by months (default) or by weeks")
+    parser.add_argument('--min-date', type=numeric_date,
+                        help="date to begin frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    parser.add_argument('--max-date', type=numeric_date,
+                        help="date to end frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
 
     # Tree-specific arguments
     parser.add_argument('--tree', '-t', type=str,
@@ -104,7 +108,7 @@ def run(args):
 
         if args.method == "diffusion":
             # estimate tree frequencies
-            pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
+            pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
             frequency_dict = {"pivots":format_frequencies(pivots)}
             frequency_dict["counts"] = {}
 
@@ -157,6 +161,7 @@ def run(args):
                 pivot_frequency=args.pivot_interval,
                 start_date=args.min_date,
                 end_date=args.max_date,
+                pivot_interval_units=args.pivot_interval_units,
                 weights=weights,
                 weights_attribute=weights_attribute,
                 include_internal_nodes=args.include_internal_nodes,
@@ -190,7 +195,7 @@ def run(args):
             tps = np.array([np.mean(dates[seq.name]) for seq in aln])
 
             if frequencies is None:
-                pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
+                pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
                 frequencies = {"pivots":format_frequencies(pivots)}
 
             if args.method == "kde":
@@ -201,6 +206,7 @@ def run(args):
                     pivot_frequency=args.pivot_interval,
                     start_date=args.min_date,
                     end_date=args.max_date,
+                    pivot_interval_units=args.pivot_interval_units,
                     weights=weights,
                     weights_attribute=weights_attribute,
                     include_internal_nodes=args.include_internal_nodes,
@@ -224,3 +230,19 @@ def run(args):
 
         write_json(frequencies, args.output)
         print("mutation frequencies written to", args.output, file=sys.stdout)
+
+
+def numeric_date(date):
+    """
+    Converts the given *date* string to a :py:class:`float`.
+    *date* may be given as a number (a float) with year as the integer part, or
+    in the YYYY-MM-DD (ISO 8601) syntax.
+    >>> numeric_date("2020.42")
+    2020.42
+    >>> numeric_date("2020-06-04")
+    2020.42486...
+    """
+    try:
+        return float(date)
+    except ValueError:
+        return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))


=====================================
augur/frequency_estimators.py
=====================================
@@ -1,6 +1,7 @@
 # estimates clade frequencies
 from __future__ import division, print_function
 from collections import defaultdict
+import datetime
 import numpy as np
 import pandas as pd
 from scipy.interpolate import interp1d
@@ -18,7 +19,7 @@ class TreeKdeFrequenciesError(Exception):
     pass
 
 
-def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
+def get_pivots(observations, pivot_interval, start_date=None, end_date=None, pivot_interval_units="months"):
     """Calculate pivots for a given list of floating point observation dates and
     interval between pivots.
 
@@ -30,11 +31,13 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
     observations : list
         a list of observed floating point dates per sample
     pivot_interval : int
-        number of months between pivots
+        number of months (or weeks) between pivots
     start_date : float
         optional start of the pivots interval
     end_date : float
         optional end of the pivots interval
+    pivot_interval_units : str
+        whether pivots are measured in "months" or in "weeks"
 
     Returns
     -------
@@ -44,18 +47,27 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
     """
     # Convert months between pivots to pivot frequency.
     pivot_frequency = pivot_interval / 12.0
+    if pivot_interval_units == "weeks":
+        pivot_frequency = pivot_interval / 52.1429
 
     pivot_start = start_date if start_date else np.floor(np.min(observations) / pivot_frequency) * pivot_frequency
     pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency
 
+    if pivot_interval_units == "months":
+        offset = "%sMS" % pivot_interval
+    elif pivot_interval_units == "weeks":
+        offset = "%sW" % pivot_interval
+    else:
+        raise ValueError(f"The given interval unit '{pivot_interval_units}' is not supported.")
+
     datetime_pivots = pd.date_range(
         float_to_datestring(pivot_start),
         float_to_datestring(pivot_end),
-        freq="%sMS" % pivot_interval
+        freq = offset
     )
     pivots = np.array([timestamp_to_float(pivot) for pivot in datetime_pivots])
 
-    return np.around(pivots, 2)
+    return np.around(pivots, 4)
 
 
 def make_pivots(pivots, tps):
@@ -796,45 +808,33 @@ def test_nested_estimator():
 
     return nested_freq
 
-
-def float_to_datestring(time):
-    """Convert a floating point date to a date string
-
-    >>> float_to_datestring(2010.75)
-    '2010-10-01'
-    >>> float_to_datestring(2011.25)
-    '2011-04-01'
-    >>> float_to_datestring(2011.0)
-    '2011-01-01'
-    >>> float_to_datestring(2011.0 + 11.0 / 12)
-    '2011-12-01'
-
-    In some cases, the given float value can be truncated leading to unexpected
-    conversion between floating point and integer values. This function should
-    account for these errors by rounding months to the nearest integer.
-
-    >>> float_to_datestring(2011.9166666666665)
-    '2011-12-01'
-    >>> float_to_datestring(2016.9609856262834)
-    '2016-12-01'
+def float_to_datestring(numdate):
+    """Convert a numeric decimal date to a YYYY-MM-DD date string.
+    Note that this only works for AD dates since the range of datetime objects
+    is restricted to year>1.
+    Copied from treetime.utils
+    Parameters
+    ----------
+    numdate : float
+        numeric date as in 2018.23
+    Returns
+    -------
+    str
+        date formatted as YYYY-MM-DD
     """
-    year = int(time)
-
-    # After accounting for the current year, extract the remainder and convert
-    # it to a month using the inverse of the logic used to create the floating
-    # point date. If the float date is sufficiently close to the end of the
-    # year, rounding can produce a 13th month.
-    month = min(int(np.rint(((time - year) * 12) + 1)), 12)
-
-    # Floating point dates do not encode day information, so we always assume
-    # they refer to the start of a given month.
-    day = 1
-
-    return "%s-%02d-%02d" % (year, month, day)
+    from calendar import isleap
+    days_in_year = 366 if isleap(int(numdate)) else 365
+    # add a small number of the time elapsed in a year to avoid
+    # unexpected behavior for values 1/365, 2/365, etc
+    days_elapsed = int(((numdate%1)+1e-10)*days_in_year)
+    date = datetime.datetime(int(numdate),1,1) + datetime.timedelta(days=days_elapsed)
 
+    return "%s-%02d-%02d" % (date.year, date.month, date.day)
 
 def timestamp_to_float(time):
     """Convert a pandas timestamp to a floating point date.
+    This is not entirely accurate as it doesn't account for months with different
+    numbers of days, but should be close enough to be accurate for weekly pivots.
 
     >>> import datetime
     >>> time = datetime.date(2010, 10, 1)
@@ -848,7 +848,7 @@ def timestamp_to_float(time):
     >>> timestamp_to_float(datetime.date(2011, 12, 1)) == (2011.0 + 11.0 / 12)
     True
     """
-    return time.year + ((time.month - 1) / 12.0)
+    return time.year + ((time.month - 1) / 12.0) + ((time.day - 1) / 365.25)
 
 
 class KdeFrequencies(object):
@@ -858,8 +858,10 @@ class KdeFrequencies(object):
     each clade in the tree.
     """
     def __init__(self, sigma_narrow=1 / 12.0, sigma_wide=3 / 12.0, proportion_wide=0.2,
-                 pivot_frequency=1, start_date=None, end_date=None, weights=None, weights_attribute=None,
-                 node_filters=None, max_date=None, include_internal_nodes=False, censored=False):
+                 pivot_frequency=1, start_date=None, end_date=None,
+                 pivot_interval_units="months", weights=None, weights_attribute=None,
+                 node_filters=None, max_date=None, include_internal_nodes=False,
+                 censored=False):
         """Define parameters for KDE-based frequency estimation.
 
         Args:
@@ -869,6 +871,7 @@ class KdeFrequencies(object):
             pivot_frequency (int): Number of months between pivots
             start_date (float): start of the pivots interval
             end_date (float): end of the pivots interval
+            pivot_interval_units (str): Whether pivot intervals are measured in "months" or "weeks"
             weights (dict): Numerical weights indexed by attribute values and applied to individual tips
             weights_attribute (str): Attribute annotated on tips of a tree to use for weighting
             node_filters (dict): Mapping of node attribute names (keys) to a list of valid values to keep
@@ -886,6 +889,7 @@ class KdeFrequencies(object):
         self.pivot_frequency = pivot_frequency
         self.start_date = start_date
         self.end_date = end_date
+        self.pivot_interval_units = pivot_interval_units
         self.weights = weights
         self.weights_attribute = weights_attribute
         self.node_filters = node_filters
@@ -907,6 +911,7 @@ class KdeFrequencies(object):
             "pivot_frequency": self.pivot_frequency,
             "start_date": self.start_date,
             "end_date": self.end_date,
+            "pivot_interval_units": self.pivot_interval_units,
             "weights": self.weights,
             "weights_attribute": self.weights_attribute,
             "max_date": self.max_date,
@@ -1121,7 +1126,8 @@ class TreeKdeFrequencies(KdeFrequencies):
             observations,
             self.pivot_frequency,
             start_date=self.start_date,
-            end_date=self.end_date
+            end_date=self.end_date,
+            pivot_interval_units=self.pivot_interval_units
         )
 
         # If weights are defined, calculate frequencies for all tips by their
@@ -1208,7 +1214,8 @@ class AlignmentKdeFrequencies(KdeFrequencies):
             observations,
             self.pivot_frequency,
             start_date=self.start_date,
-            end_date=self.end_date
+            end_date=self.end_date,
+            pivot_interval_units=self.pivot_interval_units
         )
 
         # Pair alignment sequence indices with observation dates.


=====================================
codecov.yml
=====================================
@@ -0,0 +1,13 @@
+comment:
+  # Only post a codecov comment if coverage changes.
+  require_changes: true
+
+coverage:
+  status:
+    # Disable GitHub checks for code coverage that make otherwise valid builds fail.
+    project: off
+    patch: off
+
+github_checks:
+  # Disable inline annotations of code coverage in pull requests.
+  annotations: false


=====================================
paper/paper.bib
=====================================
@@ -40,7 +40,7 @@
     number = {32},
     urldate = {2020-08-26},
     journal = {Eurosurveillance},
-    author = {Alm, Erik and Broberg, Eeva K. and Connor, Thomas and Hodcroft, Emma B. and Komissarov, Andrey B. and Maurer-Stroh, Sebastian and Melidou, Angeliki and Neher, Richard A. and O’Toole, Áine and Pereyaslov, Dmitriy and Group, The WHO European Region sequencing laboratories {and} GISAID EpiCoV},
+    author = {Alm, Erik and Broberg, Eeva K. and Connor, Thomas and Hodcroft, Emma B. and Komissarov, Andrey B. and Maurer-Stroh, Sebastian and Melidou, Angeliki and Neher, Richard A. and O’Toole, Áine and Pereyaslov, Dmitriy and The WHO European Region Sequencing Laboratories {and} GISAID EpiCoV Group,},
     month = aug,
     year = {2020},
     note = {Publisher: European Centre for Disease Prevention and Control},


=====================================
paper/paper.md
=====================================
@@ -17,34 +17,39 @@ authors:
   - name: Thomas R. Sibley
     affiliation: 2
     orcid: 0000-0001-5269-2297
+  - name: Jover Lee
+    affiliation: 2
+    orcid: 0000-0002-2391-0512
+  - name: Kairsten Fay
+    affiliation: 2
+    orcid: 0000-0002-6529-9205
+  - name: Misja Ilcisin
+    affiliation: 2
+    orcid: 0000-0002-1468-7697
   - name: Elias Harkins
     affiliation: 2
     orcid: 0000-0001-6525-9134
-  - name: Augur Contributors
-    affiliation: 3
   - name: Trevor Bedford
     affiliation: "1, 2"
     orcid: 0000-0002-4039-5794
   - name: Richard A. Neher
-    affiliation: "4, 5"
+    affiliation: "3, 4"
     orcid: 0000-0003-2525-1407
   - name: Emma B. Hodcroft
-    affiliation: "4, 5, 6"
+    affiliation: "3, 4, 5"
     orcid: 0000-0002-0078-2212
 affiliations:
-  - name: Molecular and Cell Biology Program, University of Washington, Seattle, WA, USA
+  - name: Molecular and Cellular Biology Program, University of Washington, Seattle, WA, USA
     index: 1
   - name: Vaccine and Infectious Disease Division, Fred Hutchinson Cancer Research Center, Seattle, WA, USA
     index: 2
-  - name: Open Source Software community
-    index: 3
   - name: Biozentrum, University of Basel, Basel, Switzerland
-    index: 4
+    index: 3
   - name: Swiss Institute of Bioinformatics, Basel, Switzerland
-    index: 5
+    index: 4
   - name: Institute of Social and Preventive Medicine, University of Bern, Bern, Switzerland
-    index: 6
-date: 5 November 2020
+    index: 5
+date: 5 January 2021
 bibliography: paper.bib
 ---
 
@@ -102,6 +107,7 @@ Augur can be installed from PyPI ([nextstrain-augur](https://pypi.org/project/ne
 # Acknowledgments
 
 Thank you to all of [the open source community members who have contributed to Augur](https://github.com/nextstrain/augur/graphs/contributors).
+Specifically, we thank Eric Danielson, Eddie Lebow, Barney Potter, Ryan Grout, Sai Kiran Kollapudi, Mingye Wang, Carol Willing, Louise Moncla, Thomas Caswell, Sidney Bell, Terry Jones, Christian Clauss, Julien Bordellier, Gytis Dudas, Cameron Devine, Samuel Zhang, Akshay Subramanian, Christopher Tomkins-Tinch, Danielle Kain, Pierre Barrat-Charlaix, Rhys Kidd, Chris Woszczak, Tony Tung, Mathias Walter, and Zachary Sailer.
 Thank you to Dan Fornika from BCCDC Public Health Laboratory for creating the first conda recipe for Augur in Bioconda.
 JHu is a Graduate Research Fellow and is supported by the NIH grant NIAID F31AI140714.
 TB is a Pew Biomedical Scholar.


=====================================
scripts/identify_emerging_clades.py
=====================================
@@ -230,9 +230,15 @@ if __name__ == '__main__':
             for index, row_df in distinct_subclades.iterrows():
                 parent_clade = row_df["parent_clade"]
                 mutations = row_df["mutations"]
-                print(
-                    f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&p=grid'>{parent_clade}: {mutations}</a></li>",
-                    file=oh
-                )
+                if filter_attribute:
+                    print(
+                        f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&f_{filter_attribute}={filter_value}&transmissions=hide&p=grid'>{parent_clade}: {mutations}</a></li>",
+                        file=oh
+                    )
+                else:
+                    print(
+                        f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&transmissions=hide&p=grid'>{parent_clade}: {mutations}</a></li>",
+                        file=oh
+                    )
 
             print("</ul>", file=oh)


=====================================
scripts/verify_meta_json.py
=====================================
@@ -39,7 +39,7 @@ if __name__=="__main__":
     if "author_info" in data:
         verify_author_info(data["author_info"])
     else:
-        print("ERROR: author_info does not exist. This build needs to be updated and will soon be incompatable with auspice.")
+        print("ERROR: author_info does not exist. This build needs to be updated and will soon be incompatible with auspice.")
 
 
     # set_trace()


=====================================
tests/test_frequencies.py
=====================================
@@ -66,19 +66,42 @@ def test_get_pivots_from_tree_only(tree):
 
 def test_get_pivots_from_start_and_end_date():
     """
-    Test pivot calculation from a given start and end date instead of a given tree.
+    Test pivot calculation from a given start and end date instead of a given tree
+    First pivot is the first day of the month immediately following start_date
+    Last pivot is the first day of the month immediately preceding end_date
+    Current logic converts numeric date 2015.5 to 2015-07-02, hence using 2015.49
     """
-    pivot_frequency = 3
-    start_date = 2015.5
+    pivot_frequency = 1
+    start_date = 2015.49
     end_date = 2018.5
     observations = []
     pivots = get_pivots(observations, pivot_frequency, start_date=start_date, end_date=end_date)
     assert isinstance(pivots, np.ndarray)
-    assert pivots[1] - pivots[0] == pivot_frequency / 12.0
-    assert pivots[0] == start_date
-    assert pivots[-1] == end_date
+    assert np.round( 12 * (pivots[1] - pivots[0]) ) == pivot_frequency
+    assert pivots[0] == 2015.5
+    assert pivots[-1] == 2018.5
     assert pivots[-1] >= end_date - pivot_frequency
 
+def test_get_pivots_by_months():
+    """Get pivots where intervals are defined by months.
+    """
+    pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="months")
+    # Pivots should include all 12 months of the year plus the month represented
+    # by the end date, since the pandas month interval uses "month starts". See
+    # pandas date offsets documentation for more details:
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
+    assert len(pivots) == 13
+
+def test_get_pivots_by_weeks():
+    """Get pivots where intervals are defined as weeks instead of months.
+    """
+    pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="weeks")
+    assert len(pivots) == 52
+
+def test_get_pivots_by_invalid_unit():
+    with pytest.raises(ValueError, match=r".*invalid_unit.*is not supported.*"):
+        pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="invalid_unit")
+
 #
 # Test KDE frequency estimation for trees
 #
@@ -110,7 +133,7 @@ class TestTreeKdeFrequencies(object):
         )
         frequencies = kde_frequencies.estimate(tree)
         assert hasattr(kde_frequencies, "pivots")
-        assert kde_frequencies.pivots[0] == start_date
+        assert kde_frequencies.pivots[0] == 2015.5833
         assert hasattr(kde_frequencies, "frequencies")
         assert list(frequencies.values())[0].shape == kde_frequencies.pivots.shape
 



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/884cbf1f30c78e9114e8c256d94584002411c4e6

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/884cbf1f30c78e9114e8c256d94584002411c4e6
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210118/66def03d/attachment-0001.html>


More information about the debian-med-commit mailing list