[med-svn] [Git][med-team/augur][master] 4 commits: routine-update: New upstream version
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Oct 28 08:37:25 BST 2022
Andreas Tille pushed to branch master at Debian Med / augur
ac972c38 by Andreas Tille at 2022-10-28T09:29:38+02:00
routine-update: New upstream version
- - - - -
797cec5e by Andreas Tille at 2022-10-28T09:29:39+02:00
New upstream version 18.1.0
- - - - -
26546f97 by Andreas Tille at 2022-10-28T09:31:20+02:00
Update upstream source from tag 'upstream/18.1.0'
Update to upstream version '18.1.0'
with Debian dir 6927fbb5f7be4e727aca63cda69fa09c92c0af1b
- - - - -
10e63a1e by Andreas Tille at 2022-10-28T09:33:37+02:00
routine-update: Ready to upload to unstable
- - - - -
11 changed files:
- .gitignore
- CHANGES.md
- augur/__version__.py
- augur/dates.py
- augur/filter.py
- debian/changelog
- tests/functional/filter/cram/subsample-group-by-missing-error.t
- + tests/functional/filter/cram/subsample-group-by-week.t
- tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t
- tests/functional/filter/cram/subsample-skip-ambiguous-dates.t
- tests/test_filter.py
@@ -16,6 +16,13 @@ dist/
+# Output files from running tests
# For Python #
@@ -3,6 +3,21 @@
## __NEXT__
+## 18.1.0 (26 October 2022)
+### Features
+* filter: Add support to group by ISO week (`--group-by week`) during subsampling. [#1067][] (@victorlin)
+### Bug Fixes
+* filter: Fixed unintended behavior in which grouping by `day` would "work" when used with `month` and/or `year`. Updated so it will be ignored. [#1070][] (@victorlin)
+* filter: Fixed unintended behavior in which grouping by `month` with ambiguous years would "work". Updated so date ambiguity is checked properly for all generated columns. [#1072][] (@victorlin)
+[#1067]: https://github.com/nextstrain/augur/pull/1067
+[#1070]: https://github.com/nextstrain/augur/pull/1070
+[#1072]: https://github.com/nextstrain/augur/pull/1072
## 18.0.0 (21 September 2022)
### Major Changes
@@ -1,4 +1,4 @@
-__version__ = '18.0.0'
+__version__ = '18.1.0'
def is_augur_version_compatible(version):
@@ -137,3 +137,6 @@ def get_numerical_dates(metadata:pd.DataFrame, name_col = None, date_col='date',
strains = metadata.index.values
dates = metadata[date_col].astype(float)
return dict(zip(strains, dates))
+def get_iso_year_week(year, month, day):
+ return datetime.date(year, month, day).isocalendar()[:2]
@@ -14,10 +14,11 @@ import pandas as pd
import random
import re
import sys
+import uuid
from tempfile import NamedTemporaryFile
from typing import Collection
-from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates
+from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates, get_iso_year_week
from .errors import AugurError
from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf
@@ -30,6 +31,9 @@ SEQUENCE_ONLY_FILTERS = (
+# Use sorted() for reproducible output
+GROUP_BY_GENERATED_COLUMNS = {'year', 'month', 'week'}
def register_arguments(parser):
Add arguments to parser.
@@ -67,10 +71,13 @@ def register_arguments(parser):
sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
- subsample_group.add_argument('--group-by', nargs='+', help="""
+ subsample_group.add_argument('--group-by', nargs='+', help=f"""
categories with respect to subsample.
- Grouping by 'year' and/or 'month' is only supported when there is a 'date' column in the metadata.
- Custom 'year' and 'month' columns in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
+ Notes:
+ (1) Grouping by {sorted(GROUP_BY_GENERATED_COLUMNS)} is only supported when there is a 'date' column in the metadata.
+ (2) 'week' uses the ISO week numbering system, where a week starts on a Monday and ends on a Sunday.
+ (3) 'month' and 'week' grouping cannot be used together.
+ (4) Custom columns {sorted(GROUP_BY_GENERATED_COLUMNS)} in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
subsample_limits_group = subsample_group.add_mutually_exclusive_group()
subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
@@ -994,63 +1001,85 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
return group_by_strain, skipped_strains
group_by_set = set(group_by)
+ generated_columns_requested = GROUP_BY_GENERATED_COLUMNS & group_by_set
# If we could not find any requested categories, we cannot complete subsampling.
- if 'date' not in metadata and group_by_set <= {'year', 'month'}:
- raise FilterException(f"The specified group-by categories ({group_by}) were not found. Note that using 'year' or 'year month' requires a column called 'date'.")
- if not group_by_set & (set(metadata.columns) | {'year', 'month'}):
+ if 'date' not in metadata and group_by_set <= GROUP_BY_GENERATED_COLUMNS:
+ raise FilterException(f"The specified group-by categories ({group_by}) were not found. Note that using any of {sorted(GROUP_BY_GENERATED_COLUMNS)} requires a column called 'date'.")
+ if not group_by_set & (set(metadata.columns) | GROUP_BY_GENERATED_COLUMNS):
raise FilterException(f"The specified group-by categories ({group_by}) were not found.")
- # date requested
- if 'year' in group_by_set or 'month' in group_by_set:
+ # Warn/error based on other columns grouped with 'week'.
+ if 'week' in group_by_set:
+ if 'year' in group_by_set:
+ print(f"WARNING: 'year' grouping will be ignored since 'week' includes ISO year.", file=sys.stderr)
+ group_by.remove('year')
+ group_by_set.remove('year')
+ generated_columns_requested.remove('year')
+ if 'month' in group_by_set:
+ raise AugurError("'month' and 'week' grouping cannot be used together.")
+ if generated_columns_requested:
- if 'year' in metadata.columns and 'year' in group_by_set:
- print(f"WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.", file=sys.stderr)
- metadata.drop('year', axis=1, inplace=True)
- if 'month' in metadata.columns and 'month' in group_by_set:
- print(f"WARNING: `--group-by month` uses the generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes.", file=sys.stderr)
- metadata.drop('month', axis=1, inplace=True)
+ for col in sorted(generated_columns_requested):
+ if col in metadata.columns:
+ print(f"WARNING: `--group-by {col}` uses a generated {col} value from the 'date' column. The custom '{col}' column in the metadata is ignored for grouping purposes.", file=sys.stderr)
+ metadata.drop(col, axis=1, inplace=True)
if 'date' not in metadata:
- # set year/month/day = unknown
- print(f"WARNING: A 'date' column could not be found to group-by year or month.", file=sys.stderr)
+ # Set generated columns to 'unknown'.
+ print(f"WARNING: A 'date' column could not be found to group-by {sorted(generated_columns_requested)}.", file=sys.stderr)
print(f"Filtering by group may behave differently than expected!", file=sys.stderr)
- df_dates = pd.DataFrame({'year': 'unknown', 'month': 'unknown'}, index=metadata.index)
+ df_dates = pd.DataFrame({col: 'unknown' for col in GROUP_BY_GENERATED_COLUMNS}, index=metadata.index)
metadata = pd.concat([metadata, df_dates], axis=1)
- # replace date with year/month/day as nullable ints
- date_cols = ['year', 'month', 'day']
+ # Create a DataFrame with year/month/day columns as nullable ints.
+ # These columns are prefixed to note temporary usage. They are used
+ # to generate other columns, and will be discarded at the end.
+ temp_prefix = str(uuid.uuid4())
+ temp_date_cols = [f'{temp_prefix}year', f'{temp_prefix}month', f'{temp_prefix}day']
df_dates = metadata['date'].str.split('-', n=2, expand=True)
- df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
- missing_date_cols = set(date_cols) - set(df_dates.columns)
+ df_dates = df_dates.set_axis(temp_date_cols[:len(df_dates.columns)], axis=1)
+ missing_date_cols = set(temp_date_cols) - set(df_dates.columns)
for col in missing_date_cols:
df_dates[col] = pd.NA
- for col in date_cols:
+ for col in temp_date_cols:
df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())
+ # Extend metadata with generated date columns
+ # Drop the 'date' column since it should not be used for grouping.
metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
- if 'year' in group_by_set:
- # skip ambiguous years
- df_skip = metadata[metadata['year'].isnull()]
- metadata.dropna(subset=['year'], inplace=True)
- for strain in df_skip.index:
- skipped_strains.append({
- "strain": strain,
- "filter": "skip_group_by_with_ambiguous_year",
- "kwargs": "",
- })
- if 'month' in group_by_set:
- # skip ambiguous months
- df_skip = metadata[metadata['month'].isnull()]
- metadata.dropna(subset=['month'], inplace=True)
- for strain in df_skip.index:
- skipped_strains.append({
- "strain": strain,
- "filter": "skip_group_by_with_ambiguous_month",
- "kwargs": "",
- })
- # month = (year, month)
- metadata['month'] = list(zip(metadata['year'], metadata['month']))
- # TODO: support group by day
+ ambiguous_date_strains = list(_get_ambiguous_date_skipped_strains(
+ metadata,
+ temp_prefix,
+ generated_columns_requested
+ ))
+ metadata.drop([record['strain'] for record in ambiguous_date_strains], inplace=True)
+ skipped_strains.extend(ambiguous_date_strains)
+ # Generate columns.
+ if 'year' in generated_columns_requested:
+ metadata['year'] = metadata[f'{temp_prefix}year']
+ if 'month' in generated_columns_requested:
+ metadata['month'] = list(zip(
+ metadata[f'{temp_prefix}year'],
+ metadata[f'{temp_prefix}month']
+ ))
+ if 'week' in generated_columns_requested:
+ # Note that week = (year, week) from the date.isocalendar().
+ # Do not combine the raw year with the ISO week number alone,
+ # since raw year ≠ ISO year.
+ metadata['week'] = metadata.apply(lambda row: get_iso_year_week(
+ row[f'{temp_prefix}year'],
+ row[f'{temp_prefix}month'],
+ row[f'{temp_prefix}day']
+ ), axis=1
+ )
+ # Drop the internally used columns.
+ for col in temp_date_cols:
+ metadata.drop(col, axis=1, inplace=True)
unknown_groups = group_by_set - set(metadata.columns)
if unknown_groups:
@@ -1059,10 +1088,59 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
for group in unknown_groups:
metadata[group] = 'unknown'
+ # Finally, determine groups.
group_by_strain = dict(zip(metadata.index, metadata[group_by].apply(tuple, axis=1)))
return group_by_strain, skipped_strains
+def _get_ambiguous_date_skipped_strains(
+ metadata, temp_prefix, generated_columns_requested):
+ """Get strains skipped due to date ambiguity.
+ Each value is a dictionary with keys:
+ - `strain`: strain name
+ - `filter`: filter reason. Used for the final report output.
+ - `kwargs`: Empty string since filter reason does not represent a function.
+ """
+ # Don't yield the same strain twice.
+ already_skipped_strains = set()
+ if generated_columns_requested:
+ # Skip ambiguous years.
+ df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
+ for strain in df_skip.index:
+ if strain not in already_skipped_strains:
+ yield {
+ "strain": strain,
+ "filter": "skip_group_by_with_ambiguous_year",
+ "kwargs": "",
+ }
+ already_skipped_strains.update(df_skip.index)
+ if 'month' in generated_columns_requested or 'week' in generated_columns_requested:
+ # Skip ambiguous months.
+ df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
+ for strain in df_skip.index:
+ if strain not in already_skipped_strains:
+ yield {
+ "strain": strain,
+ "filter": "skip_group_by_with_ambiguous_month",
+ "kwargs": "",
+ }
+ already_skipped_strains.update(df_skip.index)
+ if 'week' in generated_columns_requested:
+ # Skip ambiguous days.
+ df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
+ for strain in df_skip.index:
+ if strain not in already_skipped_strains:
+ yield {
+ "strain": strain,
+ "filter": "skip_group_by_with_ambiguous_day",
+ "kwargs": "",
+ }
+ # TODO: uncomment if another filter reason is ever added.
+ # already_skipped_strains.update(df_skip.index)
class PriorityQueue:
"""A priority queue implementation that automatically replaces lower priority
items in the heap with incoming higher priority items.
@@ -1713,6 +1791,7 @@ def run(args):
"filter_by_non_nucleotide": "{count} of these were dropped because they had non-nucleotide characters",
"skip_group_by_with_ambiguous_year": "{count} were dropped during grouping due to ambiguous year information",
"skip_group_by_with_ambiguous_month": "{count} were dropped during grouping due to ambiguous month information",
+ "skip_group_by_with_ambiguous_day": "{count} were dropped during grouping due to ambiguous day information",
"force_include_strains": "{count} strains were added back because they were in {include_file}",
"force_include_where": "{count} sequences were added back because of '{include_where}'",
@@ -1,3 +1,9 @@
+augur (18.1.0-1) unstable; urgency=medium
+ * New upstream version
+ -- Andreas Tille <tille at debian.org> Fri, 28 Oct 2022 09:31:41 +0200
augur (18.0.0-1) unstable; urgency=medium
* New upstream version
@@ -16,7 +16,7 @@ Error on missing group-by columns.
> --group-by year \
> --sequences-per-group 1 \
> --output-metadata $TMP/metadata-filtered.tsv > /dev/null
- ERROR: The specified group-by categories (['year']) were not found. Note that using 'year' or 'year month' requires a column called 'date'.
+ ERROR: The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'.
$ cat $TMP/metadata-filtered.tsv
cat: .*: No such file or directory (re)
@@ -0,0 +1,54 @@
+ $ pushd "$TESTDIR" > /dev/null
+ $ source _setup.sh
+SEQ1 and SEQ2 translate to week=(2003, 1).
+SEQ3 and SEQ4 translate to week=(2004, 1).
+These should be in separate groups.
+ $ cat >$TMP/metadata.tsv <<~~
+ > strain date
+ > SEQ1 2003-01-01
+ > SEQ2 2003-01-02
+ > SEQ3 2003-12-30
+ > SEQ4 2003-12-31
+ > ~~
+ $ ${AUGUR} filter \
+ > --metadata $TMP/metadata.tsv \
+ > --group-by week \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-metadata $TMP/metadata-filtered.tsv
+ 2 strains were dropped during filtering
+ \t2 of these were dropped because of subsampling criteria (esc)
+ 2 strains passed all filters
+ $ cat $TMP/metadata-filtered.tsv
+ strain date
+ SEQ1 2003-01-01
+ SEQ3 2003-12-30
+ISO year from 'week' takes precedence over 'year'.
+ $ cat >$TMP/metadata.tsv <<~~
+ > strain date
+ > SEQ1 2003-12-30
+ > SEQ2 2003-12-31
+ > SEQ3 2004-01-01
+ > SEQ4 2004-01-02
+ > ~~
+ $ ${AUGUR} filter \
+ > --metadata $TMP/metadata.tsv \
+ > --group-by year week \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-metadata $TMP/metadata-filtered.tsv
+ WARNING: 'year' grouping will be ignored since 'week' includes ISO year.
+ 3 strains were dropped during filtering
+ \t3 of these were dropped because of subsampling criteria (esc)
+ 1 strains passed all filters
+ $ cat $TMP/metadata-filtered.tsv
+ strain date
+ SEQ1 2003-12-30
@@ -22,7 +22,7 @@ Group by generated year column, and ensure all original columns are still in the
> --sequences-per-group 1 \
> --subsample-seed 0 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
- WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
+ WARNING: `--group-by year` uses a generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
$ cat "$TMP/filtered_metadata.tsv"
strain\tdate\tyear\tmonth (esc)
SEQ1\t2021-01-01\todd\tJanuary (esc)
@@ -36,8 +36,8 @@ Group by generated year and month columns, and ensure all original columns are s
> --sequences-per-group 1 \
> --subsample-seed 0 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
- WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
- WARNING: `--group-by month` uses the generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes.
+ WARNING: `--group-by month` uses a generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes.
+ WARNING: `--group-by year` uses a generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
$ cat "$TMP/filtered_metadata.tsv"
strain\tdate\tyear\tmonth (esc)
SEQ1\t2021-01-01\todd\tJanuary (esc)
@@ -17,3 +17,30 @@ Strains with ambiguous years or months should be dropped and logged.
SG_018\tskip_group_by_with_ambiguous_month (esc)
$ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2
COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc)
+Group by 'year month week'. Using 'week' has some restrictions - 'year' should warn and 'month' should error.
+ $ ${AUGUR} filter \
+ > --metadata filter/data/metadata.tsv \
+ > --group-by year month week \
+ > --sequences-per-group 1 \
+ > --output-strains "$TMP/filtered_strains.txt" > /dev/null
+ WARNING: 'year' grouping will be ignored since 'week' includes ISO year.
+ ERROR: 'month' and 'week' grouping cannot be used together.
+ [2]
+Group by 'week'. Check the number of strains that have been dropped due to ambiguous day.
+ $ ${AUGUR} filter \
+ > --metadata filter/data/metadata.tsv \
+ > --group-by week \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-strains "$TMP/filtered_strains.txt" \
+ > --output-log "$TMP/filtered_log.tsv" > /dev/null
+ $ grep "skip_group_by_with_ambiguous_year" "$TMP/filtered_log.tsv" | wc -l
+ \s*1 (re)
+ $ grep "skip_group_by_with_ambiguous_month" "$TMP/filtered_log.tsv" | wc -l
+ \s*1 (re)
+ $ grep "skip_group_by_with_ambiguous_day" "$TMP/filtered_log.tsv" | wc -l
+ \s*3 (re)
@@ -473,7 +473,7 @@ class TestFilterGroupBy:
strains = metadata.index.tolist()
with pytest.raises(FilterException) as e_info:
augur.filter.get_groups_for_subsampling(strains, metadata, group_by=groups)
- assert str(e_info.value) == "The specified group-by categories (['year']) were not found. Note that using 'year' or 'year month' requires a column called 'date'."
+ assert str(e_info.value) == "The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
def test_filter_groupby_missing_month_error(self, valid_metadata: pd.DataFrame):
groups = ['month']
@@ -482,7 +482,7 @@ class TestFilterGroupBy:
strains = metadata.index.tolist()
with pytest.raises(FilterException) as e_info:
augur.filter.get_groups_for_subsampling(strains, metadata, group_by=groups)
- assert str(e_info.value) == "The specified group-by categories (['month']) were not found. Note that using 'year' or 'year month' requires a column called 'date'."
+ assert str(e_info.value) == "The specified group-by categories (['month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
def test_filter_groupby_missing_year_and_month_error(self, valid_metadata: pd.DataFrame):
groups = ['year', 'month']
@@ -491,7 +491,7 @@ class TestFilterGroupBy:
strains = metadata.index.tolist()
with pytest.raises(FilterException) as e_info:
augur.filter.get_groups_for_subsampling(strains, metadata, group_by=groups)
- assert str(e_info.value) == "The specified group-by categories (['year', 'month']) were not found. Note that using 'year' or 'year month' requires a column called 'date'."
+ assert str(e_info.value) == "The specified group-by categories (['year', 'month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
def test_filter_groupby_missing_date_warn(self, valid_metadata: pd.DataFrame, capsys):
groups = ['country', 'year', 'month']
@@ -507,7 +507,7 @@ class TestFilterGroupBy:
'SEQ_5': ('B', 'unknown', 'unknown')
captured = capsys.readouterr()
- assert captured.err == "WARNING: A 'date' column could not be found to group-by year or month.\nFiltering by group may behave differently than expected!\n"
+ assert captured.err == "WARNING: A 'date' column could not be found to group-by ['month', 'year'].\nFiltering by group may behave differently than expected!\n"
assert skipped_strains == []
def test_filter_groupby_no_strains(self, valid_metadata: pd.DataFrame):
View it on GitLab: https://salsa.debian.org/med-team/augur/-/compare/2cd127596ed35723c2dc28adaa548f20d33b4d13...10e63a1e62046d9b221fefdba42ff58fe1a80157
View it on GitLab: https://salsa.debian.org/med-team/augur/-/compare/2cd127596ed35723c2dc28adaa548f20d33b4d13...10e63a1e62046d9b221fefdba42ff58fe1a80157
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221028/0c97fd40/attachment-0001.htm>
More information about the debian-med-commit mailing list