[med-svn] [Git][med-team/augur][upstream] New upstream version 16.0.3

Sat Jul 9 13:02:56 BST 2022


Étienne Mollier pushed to branch upstream at Debian Med / augur


Commits:
015e2767 by Étienne Mollier at 2022-07-09T13:56:11+02:00
New upstream version 16.0.3
- - - - -


4 changed files:

- CHANGES.md
- augur/__version__.py
- augur/filter.py
- + tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,16 @@
 ## __NEXT__
 
 
+## 16.0.3 (6 July 2022)
+
+### Bug Fixes
+
+* filter: Move `register_arguments` to the top of the module for better readability [#995][]. (@victorlin)
+* filter: Fix a regression [introduced in 16.0.2](https://github.com/nextstrain/augur/commit/4859b5d70e77cc9a0bb99e741fefb29952058b71) that caused grouping with subsampled max sequences and force-included strains to fail in a data-specific way [#1000][]. (@huddlej)
+
+[#995]: https://github.com/nextstrain/augur/pull/995
+[#1000]: https://github.com/nextstrain/augur/pull/1000
+
 ## 16.0.2 (30 June 2022)
 
 ### Bug Fixes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '16.0.2'
+__version__ = '16.0.3'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/filter.py
=====================================
@@ -31,6 +31,63 @@ SEQUENCE_ONLY_FILTERS = (
 )
 
 
+def register_arguments(parser):
+    input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
+    input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
+    input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
+    input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
+    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
+    input_group.add_argument('--metadata-id-columns', default=["strain", "name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
+
+    metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
+    metadata_filter_group.add_argument(
+        '--query',
+        help="""Filter samples by attribute.
+        Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
+        (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
+    )
+    metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
+    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
+                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
+    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
+    metadata_filter_group.add_argument('--exclude-where', nargs='+',
+                                help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
+    metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
+    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities or subsampling")
+    metadata_filter_group.add_argument('--include-where', nargs='+',
+                                help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
+
+    sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
+    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
+    sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
+
+    subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
+    subsample_group.add_argument('--group-by', nargs='+', help="""
+        categories with respect to subsample.
+        Grouping by 'year' and/or 'month' is only supported when there is a 'date' column in the metadata.
+        Custom 'year' and 'month' columns in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
+    subsample_limits_group = subsample_group.add_mutually_exclusive_group()
+    subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
+    probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
+    subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
+    When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
+    Higher numbers indicate higher priority.
+    Since priorities represent relative values between strains, these values can be arbitrary.""")
+    subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
+
+    output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
+    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
+    output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
+    output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
+    output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
+
+    parser.set_defaults(probabilistic_sampling=True)
+
+
 class FilterException(AugurError):
     """Representation of an error that occurred during filtering.
     """
@@ -1125,63 +1182,6 @@ def create_queues_by_group(groups, max_size, max_attempts=100, random_seed=None)
     return queues_by_group
 
 
-def register_arguments(parser):
-    input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
-    input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
-    input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
-    input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
-    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
-    input_group.add_argument('--metadata-id-columns', default=["strain", "name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
-
-    metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
-    metadata_filter_group.add_argument(
-        '--query',
-        help="""Filter samples by attribute.
-        Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
-        (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
-    )
-    metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
-    metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
-    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
-                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
-    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
-    metadata_filter_group.add_argument('--exclude-where', nargs='+',
-                                help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
-    metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
-    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities or subsampling")
-    metadata_filter_group.add_argument('--include-where', nargs='+',
-                                help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
-
-    sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
-    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
-    sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
-
-    subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
-    subsample_group.add_argument('--group-by', nargs='+', help="""
-        categories with respect to subsample.
-        Grouping by 'year' and/or 'month' is only supported when there is a 'date' column in the metadata.
-        Custom 'year' and 'month' columns in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
-    subsample_limits_group = subsample_group.add_mutually_exclusive_group()
-    subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
-    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
-    probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
-    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
-    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
-    subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
-    When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
-    Higher numbers indicate higher priority.
-    Since priorities represent relative values between strains, these values can be arbitrary.""")
-    subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
-
-    output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
-    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
-    output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
-    output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
-    output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
-
-    parser.set_defaults(probabilistic_sampling=True)
-
-
 def validate_arguments(args):
     """Validate arguments and return a boolean representing whether all validation
     rules succeeded.
@@ -1547,6 +1547,11 @@ def run(args):
             # during the first pass, but we want to minimize overall memory
             # usage at the moment.
             seq_keep = set(metadata.index.values) & valid_strains
+
+            # Prevent force-included strains from being considered in this
+            # second pass, as in the first pass.
+            seq_keep = seq_keep - all_sequences_to_include
+
             group_by_strain, skipped_strains = get_groups_for_subsampling(
                 seq_keep,
                 metadata,


=====================================
tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t
=====================================
@@ -0,0 +1,21 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Do not consider force-included strains for subsampling.
+In this test, we force-include two old strains that are the only representatives of their month/year date group (December 2015).
+We don't filter these strains, so they could be considered for subsampling, but Augur removes them from consideration because they have been force-included.
+
+  $ cat >$TMP/include_old_strains.txt <<~~
+  > PRVABC59
+  > COL/FLR_00008/2015
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata filter/data/metadata.tsv \
+  >   --include $TMP/include_old_strains.txt \
+  >   --group-by month year \
+  >   --subsample-max-sequences 10 \
+  >   --output-metadata $TMP/metadata-filtered.tsv > /dev/null
+  $ rm -f $TMP/metadata-filtered.tsv



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/015e2767309c7ce0a68d9bbde566846fa88b3a6f

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/015e2767309c7ce0a68d9bbde566846fa88b3a6f
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220709/3b32cac2/attachment-0001.htm>