[med-svn] [Git][med-team/augur][master] 4 commits: routine-update: New upstream version
Nilesh Patra
gitlab at salsa.debian.org
Mon Jan 18 14:14:43 GMT 2021
Nilesh Patra pushed to branch master at Debian Med / augur
Commits:
ef1986b8 by Nilesh Patra at 2021-01-18T19:34:19+05:30
routine-update: New upstream version
- - - - -
884cbf1f by Nilesh Patra at 2021-01-18T19:34:20+05:30
New upstream version 10.3.0
- - - - -
e24f8c1d by Nilesh Patra at 2021-01-18T19:35:06+05:30
Update upstream source from tag 'upstream/10.3.0'
Update to upstream version '10.3.0'
with Debian dir fce2fb4db65e3697b2ffa611af592babb6febbc5
- - - - -
d80b50c3 by Nilesh Patra at 2021-01-18T14:10:50+00:00
Update changelog
- - - - -
13 changed files:
- .travis.yml
- CHANGES.md
- README.md
- augur/__version__.py
- augur/frequencies.py
- augur/frequency_estimators.py
- + codecov.yml
- debian/changelog
- paper/paper.bib
- paper/paper.md
- scripts/identify_emerging_clades.py
- scripts/verify_meta_json.py
- tests/test_frequencies.py
Changes:
=====================================
.travis.yml
=====================================
@@ -24,7 +24,7 @@ jobs:
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- - conda env create -f environment.yml
+ - conda create -n augur -c bioconda python=$TRAVIS_PYTHON_VERSION mafft raxml fasttree iqtree vcftools pip
- source activate augur
install:
- pip install -e .[dev]
=====================================
CHANGES.md
=====================================
@@ -3,6 +3,24 @@
## __NEXT__
+## 10.3.0 (14 January 2021)
+
+### Bug Fixes
+
+* scripts: Fix typo in `verify_meta_json.py` [#656][] (@felixonmars)
+* CI: Use the expected Python version in conda environments [#658][]
+* CI: Minimize codecov feedback [#661][]
+
+### Features
+
+* frequencies: Add `--pivot-interval-units` argument with support for weekly pivots [#660][]
+* frequencies: Add support for ISO dates for `--min-date` and `--max-date` arguments [#660][]
+
+[#656]: https://github.com/nextstrain/augur/pull/656
+[#658]: https://github.com/nextstrain/augur/pull/658
+[#660]: https://github.com/nextstrain/augur/pull/660
+[#661]: https://github.com/nextstrain/augur/pull/661
+
## 10.2.0 (1 January 2021)
### Features
=====================================
README.md
=====================================
@@ -3,6 +3,7 @@
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/augur/README.html)
[![Documentation Status](https://readthedocs.org/projects/nextstrain-augur/badge/?version=latest)](https://nextstrain-augur.readthedocs.io/en/stable/?badge=latest)
[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
+[![DOI](https://joss.theoj.org/papers/10.21105/joss.02906/status.svg)](https://doi.org/10.21105/joss.02906)
## About Nextstrain
=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '10.2.0'
+__version__ = '10.3.0'
def is_augur_version_compatible(version):
=====================================
augur/frequencies.py
=====================================
@@ -2,10 +2,12 @@
infer frequencies of mutations or clades
"""
import json, os, sys
+import datetime
import numpy as np
from collections import defaultdict
from Bio import Phylo, AlignIO
from Bio.Align import MultipleSeqAlignment
+import treetime.utils
from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
@@ -21,11 +23,13 @@ def register_arguments(parser):
parser.add_argument('--regions', type=str, nargs='+', default=['global'],
help="region to subsample to")
parser.add_argument("--pivot-interval", type=int, default=3,
- help="number of months between pivots")
- parser.add_argument('--min-date', type=float,
- help="minimal pivot value")
- parser.add_argument('--max-date', type=float,
- help="maximal pivot value")
+ help="number of units between pivots")
+ parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'],
+ help="space pivots by months (default) or by weeks")
+ parser.add_argument('--min-date', type=numeric_date,
+ help="date to begin frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+ parser.add_argument('--max-date', type=numeric_date,
+ help="date to end frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
# Tree-specific arguments
parser.add_argument('--tree', '-t', type=str,
@@ -104,7 +108,7 @@ def run(args):
if args.method == "diffusion":
# estimate tree frequencies
- pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
+ pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
frequency_dict = {"pivots":format_frequencies(pivots)}
frequency_dict["counts"] = {}
@@ -157,6 +161,7 @@ def run(args):
pivot_frequency=args.pivot_interval,
start_date=args.min_date,
end_date=args.max_date,
+ pivot_interval_units=args.pivot_interval_units,
weights=weights,
weights_attribute=weights_attribute,
include_internal_nodes=args.include_internal_nodes,
@@ -190,7 +195,7 @@ def run(args):
tps = np.array([np.mean(dates[seq.name]) for seq in aln])
if frequencies is None:
- pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
+ pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
frequencies = {"pivots":format_frequencies(pivots)}
if args.method == "kde":
@@ -201,6 +206,7 @@ def run(args):
pivot_frequency=args.pivot_interval,
start_date=args.min_date,
end_date=args.max_date,
+ pivot_interval_units=args.pivot_interval_units,
weights=weights,
weights_attribute=weights_attribute,
include_internal_nodes=args.include_internal_nodes,
@@ -224,3 +230,19 @@ def run(args):
write_json(frequencies, args.output)
print("mutation frequencies written to", args.output, file=sys.stdout)
+
+
+def numeric_date(date):
+ """
+ Converts the given *date* string to a :py:class:`float`.
+ *date* may be given as a number (a float) with year as the integer part, or
+ in the YYYY-MM-DD (ISO 8601) syntax.
+ >>> numeric_date("2020.42")
+ 2020.42
+ >>> numeric_date("2020-06-04")
+ 2020.42486...
+ """
+ try:
+ return float(date)
+ except ValueError:
+ return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
=====================================
augur/frequency_estimators.py
=====================================
@@ -1,6 +1,7 @@
# estimates clade frequencies
from __future__ import division, print_function
from collections import defaultdict
+import datetime
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
@@ -18,7 +19,7 @@ class TreeKdeFrequenciesError(Exception):
pass
-def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
+def get_pivots(observations, pivot_interval, start_date=None, end_date=None, pivot_interval_units="months"):
"""Calculate pivots for a given list of floating point observation dates and
interval between pivots.
@@ -30,11 +31,13 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
observations : list
a list of observed floating point dates per sample
pivot_interval : int
- number of months between pivots
+ number of months (or weeks) between pivots
start_date : float
optional start of the pivots interval
end_date : float
optional end of the pivots interval
+ pivot_interval_units : str
+ whether pivots are measured in "months" or in "weeks"
Returns
-------
@@ -44,18 +47,27 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None):
"""
# Convert months between pivots to pivot frequency.
pivot_frequency = pivot_interval / 12.0
+ if pivot_interval_units == "weeks":
+ pivot_frequency = pivot_interval / 52.1429
pivot_start = start_date if start_date else np.floor(np.min(observations) / pivot_frequency) * pivot_frequency
pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency
+ if pivot_interval_units == "months":
+ offset = "%sMS" % pivot_interval
+ elif pivot_interval_units == "weeks":
+ offset = "%sW" % pivot_interval
+ else:
+ raise ValueError(f"The given interval unit '{pivot_interval_units}' is not supported.")
+
datetime_pivots = pd.date_range(
float_to_datestring(pivot_start),
float_to_datestring(pivot_end),
- freq="%sMS" % pivot_interval
+ freq = offset
)
pivots = np.array([timestamp_to_float(pivot) for pivot in datetime_pivots])
- return np.around(pivots, 2)
+ return np.around(pivots, 4)
def make_pivots(pivots, tps):
@@ -796,45 +808,33 @@ def test_nested_estimator():
return nested_freq
-
-def float_to_datestring(time):
- """Convert a floating point date to a date string
-
- >>> float_to_datestring(2010.75)
- '2010-10-01'
- >>> float_to_datestring(2011.25)
- '2011-04-01'
- >>> float_to_datestring(2011.0)
- '2011-01-01'
- >>> float_to_datestring(2011.0 + 11.0 / 12)
- '2011-12-01'
-
- In some cases, the given float value can be truncated leading to unexpected
- conversion between floating point and integer values. This function should
- account for these errors by rounding months to the nearest integer.
-
- >>> float_to_datestring(2011.9166666666665)
- '2011-12-01'
- >>> float_to_datestring(2016.9609856262834)
- '2016-12-01'
+def float_to_datestring(numdate):
+ """convert a numeric decimal date to a YYYY-MM-DD date string
+ Note that this only works for AD dates since the range of datetime objects
+ is restricted to year>=1.
+ Copied from treetime.utils
+ Parameters
+ ----------
+ numdate : float
+ numeric date as in 2018.23
+ Returns
+ -------
+ str
+ date string formatted as YYYY-MM-DD
"""
- year = int(time)
-
- # After accounting for the current year, extract the remainder and convert
- # it to a month using the inverse of the logic used to create the floating
- # point date. If the float date is sufficiently close to the end of the
- # year, rounding can produce a 13th month.
- month = min(int(np.rint(((time - year) * 12) + 1)), 12)
-
- # Floating point dates do not encode day information, so we always assume
- # they refer to the start of a given month.
- day = 1
-
- return "%s-%02d-%02d" % (year, month, day)
+ from calendar import isleap
+ days_in_year = 366 if isleap(int(numdate)) else 365
+ # add a small number of the time elapsed in a year to avoid
+ # unexpected behavior for values 1/365, 2/365, etc
+ days_elapsed = int(((numdate%1)+1e-10)*days_in_year)
+ date = datetime.datetime(int(numdate),1,1) + datetime.timedelta(days=days_elapsed)
+ return "%s-%02d-%02d" % (date.year, date.month, date.day)
def timestamp_to_float(time):
"""Convert a pandas timestamp to a floating point date.
+ This is not entirely accurate as it doesn't account for months with different
+ numbers of days, but should be close enough to be accurate for weekly pivots.
>>> import datetime
>>> time = datetime.date(2010, 10, 1)
@@ -848,7 +848,7 @@ def timestamp_to_float(time):
>>> timestamp_to_float(datetime.date(2011, 12, 1)) == (2011.0 + 11.0 / 12)
True
"""
- return time.year + ((time.month - 1) / 12.0)
+ return time.year + ((time.month - 1) / 12.0) + ((time.day - 1) / 365.25)
class KdeFrequencies(object):
@@ -858,8 +858,10 @@ class KdeFrequencies(object):
each clade in the tree.
"""
def __init__(self, sigma_narrow=1 / 12.0, sigma_wide=3 / 12.0, proportion_wide=0.2,
- pivot_frequency=1, start_date=None, end_date=None, weights=None, weights_attribute=None,
- node_filters=None, max_date=None, include_internal_nodes=False, censored=False):
+ pivot_frequency=1, start_date=None, end_date=None,
+ pivot_interval_units="months", weights=None, weights_attribute=None,
+ node_filters=None, max_date=None, include_internal_nodes=False,
+ censored=False):
"""Define parameters for KDE-based frequency estimation.
Args:
@@ -869,6 +871,7 @@ class KdeFrequencies(object):
pivot_frequency (int): Number of months between pivots
start_date (float): start of the pivots interval
end_date (float): end of the pivots interval
+ pivot_interval_units (str): Whether pivot intervals are measured in "months" or "weeks"
weights (dict): Numerical weights indexed by attribute values and applied to individual tips
weights_attribute (str): Attribute annotated on tips of a tree to use for weighting
node_filters (dict): Mapping of node attribute names (keys) to a list of valid values to keep
@@ -886,6 +889,7 @@ class KdeFrequencies(object):
self.pivot_frequency = pivot_frequency
self.start_date = start_date
self.end_date = end_date
+ self.pivot_interval_units = pivot_interval_units
self.weights = weights
self.weights_attribute = weights_attribute
self.node_filters = node_filters
@@ -907,6 +911,7 @@ class KdeFrequencies(object):
"pivot_frequency": self.pivot_frequency,
"start_date": self.start_date,
"end_date": self.end_date,
+ "pivot_interval_units": self.pivot_interval_units,
"weights": self.weights,
"weights_attribute": self.weights_attribute,
"max_date": self.max_date,
@@ -1121,7 +1126,8 @@ class TreeKdeFrequencies(KdeFrequencies):
observations,
self.pivot_frequency,
start_date=self.start_date,
- end_date=self.end_date
+ end_date=self.end_date,
+ pivot_interval_units=self.pivot_interval_units
)
# If weights are defined, calculate frequencies for all tips by their
@@ -1208,7 +1214,8 @@ class AlignmentKdeFrequencies(KdeFrequencies):
observations,
self.pivot_frequency,
start_date=self.start_date,
- end_date=self.end_date
+ end_date=self.end_date,
+ pivot_interval_units=self.pivot_interval_units
)
# Pair alignment sequence indices with observation dates.
=====================================
codecov.yml
=====================================
@@ -0,0 +1,13 @@
+comment:
+ # Only post a codecov comment if coverage changes.
+ require_changes: true
+
+coverage:
+ status:
+ # Disable GitHub checks for code coverage that make otherwise valid builds fail.
+ project: off
+ patch: off
+
+github_checks:
+ # Disable inline annotations of code coverage in pull requests.
+ annotations: false
=====================================
debian/changelog
=====================================
@@ -1,3 +1,9 @@
+augur (10.3.0-1) unstable; urgency=medium
+
+ * New upstream version
+
+ -- Nilesh Patra <npatra974 at gmail.com> Mon, 18 Jan 2021 19:35:13 +0530
+
augur (10.2.0-1) unstable; urgency=medium
* New upstream version
=====================================
paper/paper.bib
=====================================
@@ -40,7 +40,7 @@
number = {32},
urldate = {2020-08-26},
journal = {Eurosurveillance},
- author = {Alm, Erik and Broberg, Eeva K. and Connor, Thomas and Hodcroft, Emma B. and Komissarov, Andrey B. and Maurer-Stroh, Sebastian and Melidou, Angeliki and Neher, Richard A. and O’Toole, Áine and Pereyaslov, Dmitriy and Group, The WHO European Region sequencing laboratories {and} GISAID EpiCoV},
+ author = {Alm, Erik and Broberg, Eeva K. and Connor, Thomas and Hodcroft, Emma B. and Komissarov, Andrey B. and Maurer-Stroh, Sebastian and Melidou, Angeliki and Neher, Richard A. and O’Toole, Áine and Pereyaslov, Dmitriy and The WHO European Region Sequencing Laboratories {and} GISAID EpiCoV Group,},
month = aug,
year = {2020},
note = {Publisher: European Centre for Disease Prevention and Control},
=====================================
paper/paper.md
=====================================
@@ -17,34 +17,39 @@ authors:
- name: Thomas R. Sibley
affiliation: 2
orcid: 0000-0001-5269-2297
+ - name: Jover Lee
+ affiliation: 2
+ orcid: 0000-0002-2391-0512
+ - name: Kairsten Fay
+ affiliation: 2
+ orcid: 0000-0002-6529-9205
+ - name: Misja Ilcisin
+ affiliation: 2
+ orcid: 0000-0002-1468-7697
- name: Elias Harkins
affiliation: 2
orcid: 0000-0001-6525-9134
- - name: Augur Contributors
- affiliation: 3
- name: Trevor Bedford
affiliation: "1, 2"
orcid: 0000-0002-4039-5794
- name: Richard A. Neher
- affiliation: "4, 5"
+ affiliation: "3, 4"
orcid: 0000-0003-2525-1407
- name: Emma B. Hodcroft
- affiliation: "4, 5, 6"
+ affiliation: "3, 4, 5"
orcid: 0000-0002-0078-2212
affiliations:
- - name: Molecular and Cell Biology Program, University of Washington, Seattle, WA, USA
+ - name: Molecular and Cellular Biology Program, University of Washington, Seattle, WA, USA
index: 1
- name: Vaccine and Infectious Disease Division, Fred Hutchinson Cancer Research Center, Seattle, WA, USA
index: 2
- - name: Open Source Software community
- index: 3
- name: Biozentrum, University of Basel, Basel, Switzerland
- index: 4
+ index: 3
- name: Swiss Institute of Bioinformatics, Basel, Switzerland
- index: 5
+ index: 4
- name: Institute of Social and Preventive Medicine, University of Bern, Bern, Switzerland
- index: 6
-date: 5 November 2020
+ index: 5
+date: 5 January 2021
bibliography: paper.bib
---
@@ -102,6 +107,7 @@ Augur can be installed from PyPI ([nextstrain-augur](https://pypi.org/project/ne
# Acknowledgments
Thank you to all of [the open source community members who have contributed to Augur](https://github.com/nextstrain/augur/graphs/contributors).
+Specifically, we thank Eric Danielson, Eddie Lebow, Barney Potter, Ryan Grout, Sai Kiran Kollapudi, Mingye Wang, Carol Willing, Louise Moncla, Thomas Caswell, Sidney Bell, Terry Jones, Christian Clauss, Julien Bordellier, Gytis Dudas, Cameron Devine, Samuel Zhang, Akshay Subramanian, Christopher Tomkins-Tinch, Danielle Kain, Pierre Barrat-Charlaix, Rhys Kidd, Chris Woszczak, Tony Tung, Mathias Walter, and Zachary Sailer.
Thank you to Dan Fornika from BCCDC Public Health Laboratory for creating the first conda recipe for Augur in Bioconda.
JHu is a Graduate Research Fellow and is supported by the NIH grant NIAID F31AI140714.
TB is a Pew Biomedical Scholar.
=====================================
scripts/identify_emerging_clades.py
=====================================
@@ -230,9 +230,15 @@ if __name__ == '__main__':
for index, row_df in distinct_subclades.iterrows():
parent_clade = row_df["parent_clade"]
mutations = row_df["mutations"]
- print(
- f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&p=grid'>{parent_clade}: {mutations}</a></li>",
- file=oh
- )
+ if filter_attribute:
+ print(
+ f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&f_{filter_attribute}={filter_value}&transmissions=hide&p=grid'>{parent_clade}: {mutations}</a></li>",
+ file=oh
+ )
+ else:
+ print(
+ f"<li><a target='_new' href='{nextstrain_url}?c=gt-{mutation_region}_{mutations}&transmissions=hide&p=grid'>{parent_clade}: {mutations}</a></li>",
+ file=oh
+ )
print("</ul>", file=oh)
=====================================
scripts/verify_meta_json.py
=====================================
@@ -39,7 +39,7 @@ if __name__=="__main__":
if "author_info" in data:
verify_author_info(data["author_info"])
else:
- print("ERROR: author_info does not exist. This build needs to be updated and will soon be incompatable with auspice.")
+ print("ERROR: author_info does not exist. This build needs to be updated and will soon be incompatible with auspice.")
# set_trace()
=====================================
tests/test_frequencies.py
=====================================
@@ -66,19 +66,42 @@ def test_get_pivots_from_tree_only(tree):
def test_get_pivots_from_start_and_end_date():
"""
- Test pivot calculation from a given start and end date instead of a given tree.
+ Test pivot calculation from a given start and end date instead of a given tree
+ First pivot is the first day of the month immediately following start_date
+ Last pivot is the first day of the month immediately preceding end_date
+ Current logic converts numeric date 2015.5 to 2015-07-02, hence using 2015.49
"""
- pivot_frequency = 3
- start_date = 2015.5
+ pivot_frequency = 1
+ start_date = 2015.49
end_date = 2018.5
observations = []
pivots = get_pivots(observations, pivot_frequency, start_date=start_date, end_date=end_date)
assert isinstance(pivots, np.ndarray)
- assert pivots[1] - pivots[0] == pivot_frequency / 12.0
- assert pivots[0] == start_date
- assert pivots[-1] == end_date
+ assert np.round( 12 * (pivots[1] - pivots[0]) ) == pivot_frequency
+ assert pivots[0] == 2015.5
+ assert pivots[-1] == 2018.5
assert pivots[-1] >= end_date - pivot_frequency
+def test_get_pivots_by_months():
+ """Get pivots where intervals are defined by months.
+ """
+ pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="months")
+ # Pivots should include all 12 months of the year plus the month represented
+ # by the end date, since the pandas month interval uses "month starts". See
+ # pandas date offsets documentation for more details:
+ # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
+ assert len(pivots) == 13
+
+def test_get_pivots_by_weeks():
+ """Get pivots where intervals are defined as weeks instead of months.
+ """
+ pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="weeks")
+ assert len(pivots) == 52
+
+def test_get_pivots_by_invalid_unit():
+ with pytest.raises(ValueError, match=r".*invalid_unit.*is not supported.*"):
+ pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="invalid_unit")
+
#
# Test KDE frequency estimation for trees
#
@@ -110,7 +133,7 @@ class TestTreeKdeFrequencies(object):
)
frequencies = kde_frequencies.estimate(tree)
assert hasattr(kde_frequencies, "pivots")
- assert kde_frequencies.pivots[0] == start_date
+ assert kde_frequencies.pivots[0] == 2015.5833
assert hasattr(kde_frequencies, "frequencies")
assert list(frequencies.values())[0].shape == kde_frequencies.pivots.shape
View it on GitLab: https://salsa.debian.org/med-team/augur/-/compare/bb58b81e4f3a2a198c5ddd2fa4709917a654e7c6...d80b50c3679d73a836b9dcd304426a3edf8c0238
--
View it on GitLab: https://salsa.debian.org/med-team/augur/-/compare/bb58b81e4f3a2a198c5ddd2fa4709917a654e7c6...d80b50c3679d73a836b9dcd304426a3edf8c0238
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210118/f6991836/attachment-0001.html>
More information about the debian-med-commit
mailing list