[med-svn] [Git][med-team/augur][upstream] New upstream version 24.2.3

Sun Mar 10 09:01:16 GMT 2024


Étienne Mollier pushed to branch upstream at Debian Med / augur


Commits:
15e45c24 by Étienne Mollier at 2024-03-10T09:35:00+01:00
New upstream version 24.2.3
- - - - -


6 changed files:

- CHANGES.md
- augur/__version__.py
- augur/filter/__init__.py
- augur/filter/_run.py
- augur/frequencies.py
- + tests/functional/frequencies/cram/diffusion-region.t


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,16 @@
 ## __NEXT__
 
 
+## 24.2.3 (23 February 2024)
+
+### Bug Fixes
+
+* filter: Updated the help and report text of `--min-length` to explicitly state that the minimum length filter only counts standard nucleotide characters A, C, G, or T (case-insensitive). This has been the behavior since version 3.0.3.dev1, but has never been explicitly documented. [#1422][] (@joverlee521)
+* frequencies: Fixed a bug introduced in 24.2.0 and 24.1.0 that prevented `--regions` from working with regions other than the default "global" region. [#1424][]
+
+[#1422]: https://github.com/nextstrain/augur/pull/1422
+[#1424]: https://github.com/nextstrain/augur/pull/1424
+
 ## 24.2.2 (16 February 2024)
 
 ### Bug Fixes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '24.2.2'
+__version__ = '24.2.3'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/filter/__init__.py
=====================================
@@ -51,7 +51,7 @@ def register_arguments(parser):
         of an entry in --sequences.""")
 
     sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
-    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
+    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
     sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
 
     subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")


=====================================
augur/filter/_run.py
=====================================
@@ -428,7 +428,7 @@ def run(args):
         include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
         include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
         include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
-        include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp",
+        include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
         include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
         include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
         include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",


=====================================
augur/frequencies.py
=====================================
@@ -14,6 +14,8 @@ from .io.file import open_file
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata
 from .utils import write_json
 
+REGION_COLUMN = 'region'
+DEFAULT_REGION = 'global'
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("frequencies", help=__doc__)
@@ -26,8 +28,10 @@ def register_parser(parent_subparsers):
                         help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
     parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
                         help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    parser.add_argument('--regions', type=str, nargs='+', default=['global'],
-                        help="region to subsample to")
+    parser.add_argument('--regions', type=str, nargs='+', default=[DEFAULT_REGION],
+                        help="region to filter to. " \
+                            f"Regions should match values in the {REGION_COLUMN!r} column of the metadata file " \
+                            f"if specifying values other than the default {DEFAULT_REGION!r} region.")
     parser.add_argument("--pivot-interval", type=int, default=3,
                         help="number of units between pivots")
     parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'],
@@ -97,6 +101,11 @@ def run(args):
     columns_to_load = [metadata_object.id_column, METADATA_DATE_COLUMN]
     if args.weights_attribute:
         columns_to_load.append(args.weights_attribute)
+
+    filter_to_region = any(region != DEFAULT_REGION for region in args.regions)
+    if filter_to_region:
+        columns_to_load.append(REGION_COLUMN)
+
     metadata = read_metadata(
         args.metadata,
         delimiters=[metadata_object.delimiter],
@@ -130,6 +139,9 @@ def run(args):
                 # Annotate tip with weight attribute.
                 tip.attr[weights_attribute] = metadata.loc[tip.name, weights_attribute]
 
+            if filter_to_region:
+                tip.attr[REGION_COLUMN] = metadata.loc[tip.name, REGION_COLUMN]
+
         if args.method == "diffusion":
             # estimate tree frequencies
             pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
@@ -139,10 +151,10 @@ def run(args):
             for region in args.regions:
                 # Omit strains sampled prior to the first pivot from frequency calculations.
                 # (these tend to be reference strains included for phylogenetic context)
-                if region=='global':
+                if region==DEFAULT_REGION:
                     node_filter_func = lambda node: node.attr["num_date"] >= pivots[0]
                 else:
-                    node_filter_func = lambda node: (node.attr["region"] == region
+                    node_filter_func = lambda node: (node.attr[REGION_COLUMN] == region
                                                     and node.attr["num_date"] >= pivots[0])
 
                 tree_freqs = tree_frequencies(tree, pivots, method='SLSQP',


=====================================
tests/functional/frequencies/cram/diffusion-region.t
=====================================
@@ -0,0 +1,247 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Calculate diffusion-based tip frequencies from a refined tree with `--regions`.
+
+  $ ${AUGUR} frequencies \
+  >  --method diffusion \
+  >  --tree "$TESTDIR/../data/tree.nwk" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --regions "global" "North America" "South America" \
+  >  --pivot-interval 3 \
+  >  --output tip-frequencies.json > /dev/null
+
+  $ cat tip-frequencies.json
+  {
+    "BRA/2016/FC_6706": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "COL/FLR_00008/2015": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "Colombia/2016/ZC204Se": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "DOM/2016/BB_0183": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "EcEs062_16": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "HND/2016/HU_ME59": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "PAN/CDC_259359_V1_V3/2015": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "PRVABC59": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "VEN/UF_1/2016": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "ZKC2/2016": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "counts": {
+      "North America": [
+        0,
+        2,
+        2,
+        0
+      ],
+      "South America": [
+        0,
+        2,
+        3,
+        0
+      ],
+      "global": [
+        0,
+        5,
+        5,
+        0
+      ]
+    },
+    "generated_by": {
+      "program": "augur",
+      "version": ".*" (re)
+    },
+    "pivots": [
+      2015.7521,
+      2016.0041,
+      2016.2527,
+      2016.5014
+    ]
+  } (no-eol)



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/15e45c24afcd7bbb96751883a7a5755c029d51bf

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/15e45c24afcd7bbb96751883a7a5755c029d51bf
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240310/3c203618/attachment-0001.htm>