[med-svn] [Git][med-team/augur][upstream] New upstream version 24.3.0

Wed Apr 24 10:35:11 BST 2024


Étienne Mollier pushed to branch upstream at Debian Med / augur


Commits:
7cc0c257 by Étienne Mollier at 2024-04-24T11:19:27+02:00
New upstream version 24.3.0
- - - - -


19 changed files:

- CHANGES.md
- augur/__version__.py
- augur/ancestral.py
- augur/data/schema-annotations.json
- augur/data/schema-export-v2.json
- augur/filter/__init__.py
- augur/filter/_run.py
- augur/filter/include_exclude_rules.py
- augur/filter/validate_arguments.py
- augur/import_/beast.py
- augur/parse.py
- augur/translate.py
- augur/utils.py
- docs/usage/cli/filter.rst
- + docs/usage/cli/snippets/filtering-and-subsampling.rst
- − tests/functional/filter/cram/filter-min-length-output-metadata.t
- − tests/functional/filter/cram/filter-min-length-output-strains.t
- tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t → tests/functional/filter/cram/filter-no-sequence-index-error.t
- + tests/functional/filter/cram/filter-sequence-length.t


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,28 @@
 ## __NEXT__
 
 
+## 24.3.0 (18 March 2024)
+
+### Features
+
+* filter: Added a new option `--max-length` to filter out sequences that are longer than a certain amount of base pairs. [#1429][] (@victorlin)
+* parse: Added support for environments that use pandas 2.x. [#1436][] (@emollier, @victorlin)
+
+### Bug Fixes
+
+* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
+* export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer)
+* import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc)
+* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice.  [#1438][] (@jameshadfield)
+
+[#1425]: https://github.com/nextstrain/augur/pull/1425
+[#1429]: https://github.com/nextstrain/augur/pull/1429
+[#1433]: https://github.com/nextstrain/augur/issues/1433
+[#1434]: https://github.com/nextstrain/augur/pull/1434
+[#1436]: https://github.com/nextstrain/augur/pull/1436
+[#1438]: https://github.com/nextstrain/augur/pull/1438
+[#1439]: https://github.com/nextstrain/augur/pull/1439
+
 ## 24.2.3 (23 February 2024)
 
 ### Bug Fixes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '24.2.3'
+__version__ = '24.3.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/ancestral.py
=====================================
@@ -28,7 +28,8 @@ import numpy as np
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
-from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
+from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
+    genome_features_to_auspice_annotation
 from .io.file import open_file
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
@@ -455,14 +456,7 @@ def run(args):
                     node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
 
             anc_seqs['reference'][gene] = aa_result['root_seq']
-            # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
-            # this is equivalent to the annotation for single segment CDS, but not for cds
-            # with splicing and slippage. But auspice can't handle the latter at the moment.
-            anc_seqs['annotations'][gene] = {'seqid':args.annotation,
-                                             'type':feat.type,
-                                             'start': int(feat.location.start)+1,
-                                             'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
-                                             'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+            anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))
 
             # Save ancestral amino acid sequences to FASTA.
             if args.output_translations:


=====================================
augur/data/schema-annotations.json
=====================================
@@ -24,8 +24,9 @@
         }
     },
     "required": ["nuc"],
+    "additionalProperties": false,
     "patternProperties": {
-        "^(?!nuc)[a-zA-Z0-9*_-]+$": {
+        "^(?!nuc$)[a-zA-Z0-9*_-]+$": {
             "$comment": "Each object here defines a single CDS",
             "type": "object",
             "oneOf": [{ "$ref": "#/$defs/startend" }, { "$ref": "#/$defs/segments" }],


=====================================
augur/data/schema-export-v2.json
=====================================
@@ -310,7 +310,7 @@
                                 }
                             },
                             "patternProperties": {
-                                "^(?!nuc)[a-zA-Z0-9*_-]+$": {
+                                "^(?!nuc$)[a-zA-Z0-9*_-]+$": {
                                     "description": "Amino acid mutations for this CDS",
                                     "$comment": "properties must exist in the meta.JSON -> annotation object",
                                     "type": "array",


=====================================
augur/filter/__init__.py
=====================================
@@ -52,6 +52,7 @@ def register_arguments(parser):
 
     sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
     sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
+    sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
     sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
 
     subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")


=====================================
augur/filter/_run.py
=====================================
@@ -428,7 +428,8 @@ def run(args):
         include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
         include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
         include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
-        include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
+        include_exclude_rules.filter_by_min_length.__name__: "{count} {were} dropped because {they} {were} shorter than the minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
+        include_exclude_rules.filter_by_max_length.__name__: "{count} {were} dropped because {they} {were} longer than the maximum length of {max_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
         include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
         include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
         include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",


=====================================
augur/filter/include_exclude_rules.py
=====================================
@@ -424,7 +424,7 @@ def filter_by_sequence_index(metadata, sequence_index) -> FilterFunctionReturn:
     return metadata_strains & sequence_index_strains
 
 
-def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
+def filter_by_min_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
     """Filter metadata by sequence length from a given sequence index.
 
     Parameters
@@ -440,13 +440,13 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
     --------
     >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
     >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
-    >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
+    >>> filter_by_min_length(metadata, sequence_index, min_length=27000)
     {'strain1'}
 
     It is possible for the sequence index to be missing strains present in the metadata.
 
     >>> sequence_index = pd.DataFrame([{"strain": "strain3", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
-    >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
+    >>> filter_by_min_length(metadata, sequence_index, min_length=27000)
     set()
 
     """
@@ -459,6 +459,34 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
     return set(filtered_sequence_index[filtered_sequence_index["ACGT"] >= min_length].index.values)
 
 
+def filter_by_max_length(metadata, sequence_index, max_length) -> FilterFunctionReturn:
+    """Filter metadata by sequence length from a given sequence index.
+
+    Parameters
+    ----------
+    metadata : pandas.DataFrame
+        Metadata indexed by strain name
+    sequence_index : pandas.DataFrame
+        Sequence index
+    max_length : int
+        Maximum number of standard nucleotide characters (A, C, G, or T) in each sequence
+
+    Examples
+    --------
+    >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
+    >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
+    >>> filter_by_max_length(metadata, sequence_index, max_length=27000)
+    {'strain2'}
+    """
+    strains = set(metadata.index.values)
+    filtered_sequence_index = sequence_index.loc[
+        sequence_index.index.intersection(strains)
+    ]
+    filtered_sequence_index["ACGT"] = filtered_sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)
+
+    return set(filtered_sequence_index[filtered_sequence_index["ACGT"] <= max_length].index.values)
+
+
 def filter_by_non_nucleotide(metadata, sequence_index) -> FilterFunctionReturn:
     """Filter metadata for strains with invalid nucleotide content.
 
@@ -667,21 +695,31 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi
         ))
 
     # Filter by sequence length.
+    # Skip VCF files and warn the user that length filters do not
+    # make sense for VCFs.
+    is_vcf = filename_is_vcf(args.sequences)
     if args.min_length:
-        # Skip VCF files and warn the user that the min length filter does not
-        # make sense for VCFs.
-        is_vcf = filename_is_vcf(args.sequences)
-
-        if is_vcf: #doesn't make sense for VCF, ignore.
+        if is_vcf:
             print_err("WARNING: Cannot use min_length for VCF files. Ignoring...")
         else:
             exclude_by.append((
-                filter_by_sequence_length,
+                filter_by_min_length,
                 {
                     "sequence_index": sequence_index,
                     "min_length": args.min_length,
                 }
             ))
+    if args.max_length:
+        if is_vcf:
+            print_err("WARNING: Cannot use max_length for VCF files. Ignoring...")
+        else:
+            exclude_by.append((
+                filter_by_max_length,
+                {
+                    "sequence_index": sequence_index,
+                    "max_length": args.max_length,
+                }
+            ))
 
     # Exclude sequences with non-nucleotide characters.
     if args.non_nucleotide:
@@ -755,13 +793,13 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
     annotated in a sequence index.
 
     >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain")
-    >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
+    >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
     >>> include_by = [(force_include_where, {"include_where": "region=Europe"})]
     >>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by)
     >>> strains_to_keep
     {'strain1'}
     >>> sorted(strains_to_exclude, key=lambda record: record["strain"])
-    [{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}]
+    [{'strain': 'strain2', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}]
     >>> strains_to_include
     [{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}]
 
@@ -847,9 +885,9 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
     Examples
     --------
     >>> from augur.dates import numeric_date
-    >>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_min_date
+    >>> from augur.filter.include_exclude_rules import filter_by_min_length, filter_by_min_date
     >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain")
-    >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
+    >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
     >>> _filter_kwargs_to_str(exclude_by[0][1])
     '[["min_length", 27000]]'
     >>> exclude_by = [(filter_by_min_date, {"date_column": "date", "min_date": numeric_date("2020-03-01")})]


=====================================
augur/filter/validate_arguments.py
=====================================
@@ -4,6 +4,7 @@ from augur.io.vcf import is_vcf as filename_is_vcf
 
 SEQUENCE_ONLY_FILTERS = (
     "min_length",
+    "max_length",
     "non_nucleotide",
 )
 


=====================================
augur/import_/beast.py
=====================================
@@ -233,45 +233,37 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose
     tipNum=0
     tree=None
 
-    if isinstance(tree_path,str): ## determine if path or handle was provided to function
-        try:
-            handle=open_file(tree_path,'r')
-        except FileNotFoundError:
-            print("FATAL: No such file {}".format(tree_path))
-            sys.exit(2)
-    else:
-        handle=tree_path
-
-    for line in handle: ## iterate over lines
-        l=line.strip('\n')
-
-        nTaxa=re.search(r'dimensions ntax=([0-9]+);',l.lower()) ## get number of tips that should be in tree
-        if nTaxa is not None:
-            tipNum=int(nTaxa.group(1))
-            if verbose:
-                print('File should contain %d taxa'%(tipNum))
-
-        treeString=re.search(treestring_regex,l) ## search for line with the tree
-        if treeString is not None:
-            treeString_start=l.index('(') ## find index of where tree string starts
-            tree=parse_beast_tree(l[treeString_start:], tipMap=tips, verbose=verbose) ## parse tree string
-
-            if verbose:
-                print('Identified tree string')
-
-        if tipFlag==True: ## going through tip encoding block
-            tipEncoding=re.search(r'([0-9]+) ([A-Za-z\-\_\/\.\'0-9 \|?]+)',l) ## search for key:value pairs
-            if tipEncoding is not None:
-                tips[tipEncoding.group(1)]=tipEncoding.group(2).strip('"').strip("'") ## add to tips dict
-                if verbose==True:
-                    print('Identified tip translation %s: %s'%(tipEncoding.group(1),tips[tipEncoding.group(1)]))
-            elif ';' not in l:
-                print('tip not captured by regex:',l.replace('\t',''))
-
-        if 'translate' in l.lower(): ## tip encoding starts on next line
-            tipFlag=True
-        if ';' in l:
-            tipFlag=False
+    with open_file(tree_path,'r') as handle: ## open tree_path as file, or consume directly if already a file handle
+        for line in handle: ## iterate over lines
+            l=line.strip('\n')
+
+            nTaxa=re.search(r'dimensions ntax=([0-9]+);',l.lower()) ## get number of tips that should be in tree
+            if nTaxa is not None:
+                tipNum=int(nTaxa.group(1))
+                if verbose:
+                    print('File should contain %d taxa'%(tipNum))
+
+            treeString=re.search(treestring_regex,l) ## search for line with the tree
+            if treeString is not None:
+                treeString_start=l.index('(') ## find index of where tree string starts
+                tree=parse_beast_tree(l[treeString_start:], tipMap=tips, verbose=verbose) ## parse tree string
+
+                if verbose:
+                    print('Identified tree string')
+
+            if tipFlag==True: ## going through tip encoding block
+                tipEncoding=re.search(r'([0-9]+) ([A-Za-z\-\_\/\.\'0-9 \|?]+)',l) ## search for key:value pairs
+                if tipEncoding is not None:
+                    tips[tipEncoding.group(1)]=tipEncoding.group(2).strip('"').strip("'") ## add to tips dict
+                    if verbose==True:
+                        print('Identified tip translation %s: %s'%(tipEncoding.group(1),tips[tipEncoding.group(1)]))
+                elif ';' not in l:
+                    print('tip not captured by regex:',l.replace('\t',''))
+
+            if 'translate' in l.lower(): ## tip encoding starts on next line
+                tipFlag=True
+            if ';' in l:
+                tipFlag=False
 
     assert tree,'Tree not captured by regex'
     assert tree.count_terminals()==tipNum,'Not all tips have been parsed.'


=====================================
augur/parse.py
=====================================
@@ -32,7 +32,12 @@ def fix_dates(d, dayfirst=True):
     '''
     try:
         from pandas.core.tools.datetimes import parsing
-        results = parsing.parse_time_string(d, dayfirst=dayfirst)
+        try:
+            # pandas 2.x
+            results = parsing.parse_datetime_string_with_reso(d, dayfirst=dayfirst)
+        except AttributeError:
+            # pandas 1.x
+            results = parsing.parse_time_string(d, dayfirst=dayfirst)
         if len(results) == 2:
             dto, res = results
         else:


=====================================
augur/translate.py
=====================================
@@ -17,7 +17,8 @@ import sys
 import numpy as np
 from Bio import SeqIO, Seq, SeqRecord, Phylo
 from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
-from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
+from .utils import parse_genes_argument, read_node_data, load_features, \
+    write_json, get_json_name, genome_features_to_auspice_annotation
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
 from textwrap import dedent
@@ -446,24 +447,7 @@ def run(args):
                 continue
             reference_translations[fname] = safe_translate(str(feat.extract(reference)))
 
-    ## glob the annotations for later auspice export
-    #
-    # Note that BioPython FeatureLocations use
-    # "Pythonic" coordinates: [zero-origin, half-open)
-    # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
-    annotations = {
-        'nuc': {'start': features['nuc'].location.start+1,
-                'end':   features['nuc'].location.end,
-                'strand': '+',
-                'type':  features['nuc'].type,     # (unused by auspice)
-                'seqid': args.reference_sequence}  # (unused by auspice)
-    }
-    for fname, feat in features.items():
-        annotations[fname] = {'seqid':args.reference_sequence,
-                              'type':feat.type,
-                              'start':int(feat.location.start)+1,
-                              'end':int(feat.location.end),
-                              'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+    annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)
 
     ## determine amino acid mutations for each node
     try:


=====================================
augur/utils.py
=====================================
@@ -832,3 +832,57 @@ def _get_genes_from_file(fname):
     print("Read in {} specified genes to translate.".format(len(unique_genes)))
 
     return unique_genes
+
+
+
+def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
+    """
+    Parameters
+    ----------
+    features : dict
+        keys: feature names, values: Bio.SeqFeature.SeqFeature objects
+    ref_seq_name : str (optional)
+        Exported as the `seqid` for each feature. Note this is unused by Auspice
+    assert_nuc : bool (optional)
+        If true, one of the feature key names must be "nuc"
+
+    Returns
+    -------
+    annotations: dict
+        See schema-annotations.json for the schema this conforms to
+
+    """
+    from Bio.SeqFeature import SimpleLocation, CompoundLocation
+
+    if assert_nuc and 'nuc' not in features:
+        raise AugurError("Genome features must include a feature for 'nuc'")
+
+    def _parse(feat):
+        a = {}
+        # Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
+        # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
+        if type(feat.location)==SimpleLocation:
+            a['start'] = int(feat.location.start)+1
+            a['end'] = int(feat.location.end)
+        elif type(feat.location)==CompoundLocation:
+            a['segments'] = [
+                {'start':int(segment.start)+1, 'end':int(segment.end)}
+                for segment in feat.location.parts # segment: SimpleLocation
+            ]
+        else:
+            raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
+        a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
+        a['type'] = feat.type  # (unused by auspice)
+        if ref_seq_name:
+            a['seqid'] = ref_seq_name # (unused by auspice)
+        return a
+
+    annotations = {}
+    for fname, feat in features.items():
+        annotations[fname] = _parse(feat)
+        if fname=='nuc':
+            assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
+        elif annotations[fname]['strand'] not in ['+', '-']:
+            print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
+
+    return annotations


=====================================
docs/usage/cli/filter.rst
=====================================
@@ -13,53 +13,9 @@ augur filter
     :prog: augur
     :path: filter
 
-How we subsample sequences in the zika-tutoral
-==============================================
+Guides
+======
 
-As an example, we'll look that the ``filter`` command in greater detail using material from the :doc:`Zika tutorial <docs.nextstrain.org:tutorials/creating-a-workflow>`.
-The filter command allows you to selected various subsets of your input data for different types of analysis.
-A simple example use of this command would be
+Below are some examples of using ``augur filter`` to sample data.
 
-.. code-block:: bash
-
-	augur filter --sequences data/sequences.fasta --metadata data/metadata.tsv --min-date 2012 --output filtered.fasta
-
-This command will select all sequences with collection date in 2012 or later.
-The filter command has a large number of options that allow flexible filtering for many common situations.
-One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
-These can be specified in a separate file:
-
-.. code-block:: bash
-
-    BRA/2016/FC_DQ75D1
-    COL/FLR_00034/2015
-    ...
-
-To drop such strains, you can pass the name of this file to the augur filter command:
-
-.. code-block:: bash
-
-  augur filter --sequences data/sequences.fasta \
-             --metadata data/metadata.tsv \
-             --min-date 2012 \
-             --exclude config/dropped_strains.txt \
-             --output filtered.fasta
-
-(To improve legibility, we have wrapped the command across multiple lines.)
-If you run this command (you should be able to copy-paste this into your terminal) on the data provided in the :doc:`Zika tutorial <docs.nextstrain.org:tutorials/creating-a-workflow>`, you should see that one of the sequences in the data set was dropped since its name was in the ``dropped_strains.txt`` file.
-
-Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
-The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
-
-.. code-block:: bash
-
-  augur filter \
-    --sequences data/sequences.fasta \
-    --metadata data/metadata.tsv \
-    --min-date 2012 \
-    --exclude config/dropped_strains.txt \
-    --group-by country year month \
-    --sequences-per-group 1 \
-    --output filtered.fasta
-
-This subsampling and filtering will reduce the number of sequences in the tutorial data set from 34 to 24.
+.. include:: snippets/filtering-and-subsampling.rst


=====================================
docs/usage/cli/snippets/filtering-and-subsampling.rst
=====================================
@@ -0,0 +1,207 @@
+Filtering
+---------
+
+The filter command allows you to select various subsets of your input data for different types of analysis.
+A simple example use of this command would be
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --output-sequences filtered_sequences.fasta \
+    --output-metadata filtered_metadata.tsv
+
+This command will select all sequences with collection date in 2012 or later.
+The filter command has a large number of options that allow flexible filtering for many common situations.
+One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
+These can be specified in a separate text file (e.g. ``exclude.txt``):
+
+.. code-block::
+
+  BRA/2016/FC_DQ75D1
+  COL/FLR_00034/2015
+  ...
+
+To drop such strains, you can pass the filename to ``--exclude``:
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --exclude exclude.txt \
+    --output-sequences filtered_sequences.fasta \
+    --output-metadata filtered_metadata.tsv
+
+Subsampling within ``augur filter``
+-----------------------------------
+
+Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
+The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --exclude exclude.txt \
+    --group-by country year month \
+    --sequences-per-group 1 \
+    --output-sequences subsampled_sequences.fasta \
+    --output-metadata subsampled_metadata.tsv
+
+Subsampling using multiple ``augur filter`` commands
+----------------------------------------------------
+
+There are some subsampling strategies in which a single call to ``augur filter``
+does not suffice. One such strategy is "tiered subsampling". In this strategy,
+mutually exclusive sets of filters, each representing a "tier", are sampled with
+different subsampling rules. This is commonly used to create geographic tiers.
+Consider this subsampling scheme:
+
+    Sample 100 sequences from Washington state and 50 sequences from the rest of the United States.
+
+This cannot be done in a single call to ``augur filter``. Instead, it can be
+decomposed into multiple schemes, each handled by a single call to ``augur
+filter``. Additionally, there is an extra step to combine the intermediate
+samples.
+
+    1. Sample 100 sequences from Washington state.
+    2. Sample 50 sequences from the rest of the United States.
+    3. Combine the samples.
+
+Calling ``augur filter`` multiple times
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A basic approach is to run the ``augur filter`` commands directly. This works
+well for ad-hoc analyses.
+
+.. code-block:: bash
+
+  # 1. Sample 100 sequences from Washington state
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --query "state == 'WA'" \
+    --subsample-max-sequences 100 \
+    --output-strains sample_strains_state.txt
+
+  # 2. Sample 50 sequences from the rest of the United States
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --query "state != 'WA' & country == 'USA'" \
+    --subsample-max-sequences 50 \
+    --output-strains sample_strains_country.txt
+
+  # 3. Combine using augur filter
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --exclude-all \
+    --include sample_strains_state.txt \
+              sample_strains_country.txt \
+    --output-sequences subsampled_sequences.fasta \
+    --output-metadata subsampled_metadata.tsv
+
+Each intermediate sample is represented by a strain list file obtained from
+``--output-strains``. The final step uses ``augur filter`` with ``--exclude-all``
+and ``--include`` to sample the data based on the intermediate strain list
+files. If the same strain appears in both files, ``augur filter`` will only
+write it once in each of the final outputs.
+
+Generalizing subsampling in a workflow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The approach above can be cumbersome with more intermediate samples. To
+generalize this process and allow for more flexibility, a workflow management
+system can be used. The following examples use `Snakemake`_.
+
+1. Add a section in the `config file`_.
+
+  .. code-block:: yaml
+
+    subsampling:
+      state: --query "state == 'WA'" --subsample-max-sequences 100
+      country: --query "state != 'WA' & country == 'USA'" --subsample-max-sequences 50
+
+2. Add two rules in a `Snakefile`_. If you are building a standard Nextstrain
+   workflow, the output files should be used as input to sequence alignment. See
+   :doc:`docs.nextstrain.org:learn/parts` to learn more about the placement of
+   this step within a workflow.
+
+  .. code-block:: python
+
+    # 1. Sample 100 sequences from Washington state
+    # 2. Sample 50 sequences from the rest of the United States
+    rule intermediate_sample:
+        input:
+            metadata = "data/metadata.tsv",
+        output:
+            strains = "results/sample_strains_{sample_name}.txt",
+        params:
+            augur_filter_args = lambda wildcards: config.get("subsampling", {}).get(wildcards.sample_name, "")
+        shell:
+            """
+            augur filter \
+                --metadata {input.metadata} \
+                {params.augur_filter_args} \
+                --output-strains {output.strains}
+            """
+
+    # 3. Combine using augur filter
+    rule combine_intermediate_samples:
+        input:
+            sequences = "data/sequences.fasta",
+            metadata = "data/metadata.tsv",
+            intermediate_sample_strains = expand("results/sample_strains_{sample_name}.txt", sample_name=list(config.get("subsampling", {}).keys()))
+        output:
+            sequences = "results/subsampled_sequences.fasta",
+            metadata = "results/subsampled_metadata.tsv",
+        shell:
+            """
+            augur filter \
+                --sequences {input.sequences} \
+                --metadata {input.metadata} \
+                --exclude-all \
+                --include {input.intermediate_sample_strains} \
+                --output-sequences {output.sequences} \
+                --output-metadata {output.metadata}
+            """
+
+3. Run Snakemake targeting the second rule.
+
+  .. code-block:: bash
+
+    snakemake combine_intermediate_samples
+
+Explanation:
+
+- The configuration section consists of one entry per intermediate sample in the
+  format ``sample_name: <augur filter arguments>``.
+- The first rule is run once per intermediate sample using `wildcards`_ and an
+  `input function`_. The output of each run is the sampled strain list.
+- The second rule uses `expand()`_ to define input as all the intermediate
+  sampled strain lists, which are passed directly to ``--include`` as done in
+  the previous example.
+
+It is easy to add or remove intermediate samples. The configuration above can be
+updated to add another tier in between state and country:
+
+  .. code-block:: yaml
+
+    subsampling:
+      state: --query "state == 'WA'" --subsample-max-sequences 100
+      neighboring_states: --query "state in {'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 75
+      country: --query "country == 'USA' & state not in {'WA', 'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 50
+
+.. _Snakemake: https://snakemake.readthedocs.io/en/stable/index.html
+.. _config file: https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#snakefiles-standard-configuration
+.. _Snakefile: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html
+.. _wildcards: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards
+.. _input function: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#snakefiles-input-functions
+.. _expand(): https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-expand-function


=====================================
tests/functional/filter/cram/filter-min-length-output-metadata.t deleted
=====================================
@@ -1,17 +0,0 @@
-Setup
-
-  $ source "$TESTDIR"/_setup.sh
-
-Filter using only metadata without sequence input or output and save results as filtered metadata.
-
-  $ ${AUGUR} filter \
-  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
-  >  --metadata "$TESTDIR/../data/metadata.tsv" \
-  >  --min-date 2012 \
-  >  --min-length 10500 \
-  >  --output-metadata filtered_metadata.tsv > /dev/null
-
-Output should include the 8 sequences matching the filters and a header line.
-
-  $ wc -l filtered_metadata.tsv
-  \s*9 .* (re)


=====================================
tests/functional/filter/cram/filter-min-length-output-strains.t deleted
=====================================
@@ -1,17 +0,0 @@
-Setup
-
-  $ source "$TESTDIR"/_setup.sh
-
-Filter using only metadata and save results as a list of filtered strains.
-
-  $ ${AUGUR} filter \
-  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
-  >  --metadata "$TESTDIR/../data/metadata.tsv" \
-  >  --min-date 2012 \
-  >  --min-length 10500 \
-  >  --output-strains filtered_strains.txt > /dev/null
-
-Output should include only the 8 sequences matching the filters (without a header line).
-
-  $ wc -l filtered_strains.txt
-  \s*8 .* (re)


=====================================
tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t → tests/functional/filter/cram/filter-no-sequence-index-error.t
=====================================
@@ -3,7 +3,8 @@ Setup
   $ source "$TESTDIR"/_setup.sh
 
 Try to filter using only metadata without a sequence index.
-This should fail because the requested filters rely on sequence information.
+
+These should fail because the requested filters rely on sequence information.
 
   $ ${AUGUR} filter \
   >  --metadata "$TESTDIR/../data/metadata.tsv" \
@@ -11,3 +12,17 @@ This should fail because the requested filters rely on sequence information.
   >  --output-strains filtered_strains.txt > /dev/null
   ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
   [2]
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --max-length 10000 \
+  >  --output-strains filtered_strains.txt > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [2]
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --non-nucleotide \
+  >  --output-strains filtered_strains.txt > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [2]


=====================================
tests/functional/filter/cram/filter-sequence-length.t
=====================================
@@ -0,0 +1,18 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Filter using --min-length and --max-length.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --min-length 10500 \
+  >  --max-length 10700 \
+  >  --output-strains filtered_strains.txt
+  7 strains were dropped during filtering
+  	1 had no metadata
+  	1 had no sequence data
+  	2 were dropped because they were shorter than the minimum length of 10500bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
+  	3 were dropped because they were longer than the maximum length of 10700bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
+  6 strains passed all filters



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/7cc0c25772b116b2e3dafd7d9bab36c04fe3a7a5

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/7cc0c25772b116b2e3dafd7d9bab36c04fe3a7a5
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240424/83a0cd06/attachment-0001.htm>