[med-svn] [Git][med-team/augur][upstream] New upstream version 24.3.0
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Wed Apr 24 10:35:11 BST 2024
Étienne Mollier pushed to branch upstream at Debian Med / augur
Commits:
7cc0c257 by Étienne Mollier at 2024-04-24T11:19:27+02:00
New upstream version 24.3.0
- - - - -
19 changed files:
- CHANGES.md
- augur/__version__.py
- augur/ancestral.py
- augur/data/schema-annotations.json
- augur/data/schema-export-v2.json
- augur/filter/__init__.py
- augur/filter/_run.py
- augur/filter/include_exclude_rules.py
- augur/filter/validate_arguments.py
- augur/import_/beast.py
- augur/parse.py
- augur/translate.py
- augur/utils.py
- docs/usage/cli/filter.rst
- + docs/usage/cli/snippets/filtering-and-subsampling.rst
- − tests/functional/filter/cram/filter-min-length-output-metadata.t
- − tests/functional/filter/cram/filter-min-length-output-strains.t
- tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t → tests/functional/filter/cram/filter-no-sequence-index-error.t
- + tests/functional/filter/cram/filter-sequence-length.t
Changes:
=====================================
CHANGES.md
=====================================
@@ -3,6 +3,28 @@
## __NEXT__
+## 24.3.0 (18 March 2024)
+
+### Features
+
+* filter: Added a new option `--max-length` to filter out sequences that are longer than a certain amount of base pairs. [#1429][] (@victorlin)
+* parse: Added support for environments that use pandas 2.x. [#1436][] (@emollier, @victorlin)
+
+### Bug Fixes
+
+* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
+* export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer)
+* import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc)
+* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice. [#1438][] (@jameshadfield)
+
+[#1425]: https://github.com/nextstrain/augur/pull/1425
+[#1429]: https://github.com/nextstrain/augur/pull/1429
+[#1433]: https://github.com/nextstrain/augur/issues/1433
+[#1434]: https://github.com/nextstrain/augur/pull/1434
+[#1436]: https://github.com/nextstrain/augur/pull/1436
+[#1438]: https://github.com/nextstrain/augur/pull/1438
+[#1439]: https://github.com/nextstrain/augur/pull/1439
+
## 24.2.3 (23 February 2024)
### Bug Fixes
=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '24.2.3'
+__version__ = '24.3.0'
def is_augur_version_compatible(version):
=====================================
augur/ancestral.py
=====================================
@@ -28,7 +28,8 @@ import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
-from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
+from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
+ genome_features_to_auspice_annotation
from .io.file import open_file
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
@@ -455,14 +456,7 @@ def run(args):
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
anc_seqs['reference'][gene] = aa_result['root_seq']
- # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
- # this is equivalent to the annotation for single segment CDS, but not for cds
- # with splicing and slippage. But auspice can't handle the latter at the moment.
- anc_seqs['annotations'][gene] = {'seqid':args.annotation,
- 'type':feat.type,
- 'start': int(feat.location.start)+1,
- 'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
- 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+ anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))
# Save ancestral amino acid sequences to FASTA.
if args.output_translations:
=====================================
augur/data/schema-annotations.json
=====================================
@@ -24,8 +24,9 @@
}
},
"required": ["nuc"],
+ "additionalProperties": false,
"patternProperties": {
- "^(?!nuc)[a-zA-Z0-9*_-]+$": {
+ "^(?!nuc$)[a-zA-Z0-9*_-]+$": {
"$comment": "Each object here defines a single CDS",
"type": "object",
"oneOf": [{ "$ref": "#/$defs/startend" }, { "$ref": "#/$defs/segments" }],
=====================================
augur/data/schema-export-v2.json
=====================================
@@ -310,7 +310,7 @@
}
},
"patternProperties": {
- "^(?!nuc)[a-zA-Z0-9*_-]+$": {
+ "^(?!nuc$)[a-zA-Z0-9*_-]+$": {
"description": "Amino acid mutations for this CDS",
"$comment": "properties must exist in the meta.JSON -> annotation object",
"type": "array",
=====================================
augur/filter/__init__.py
=====================================
@@ -52,6 +52,7 @@ def register_arguments(parser):
sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
+ sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
=====================================
augur/filter/_run.py
=====================================
@@ -428,7 +428,8 @@ def run(args):
include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
- include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
+ include_exclude_rules.filter_by_min_length.__name__: "{count} {were} dropped because {they} {were} shorter than the minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
+ include_exclude_rules.filter_by_max_length.__name__: "{count} {were} dropped because {they} {were} longer than the maximum length of {max_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",
=====================================
augur/filter/include_exclude_rules.py
=====================================
@@ -424,7 +424,7 @@ def filter_by_sequence_index(metadata, sequence_index) -> FilterFunctionReturn:
return metadata_strains & sequence_index_strains
-def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
+def filter_by_min_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
"""Filter metadata by sequence length from a given sequence index.
Parameters
@@ -440,13 +440,13 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
--------
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
- >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
+ >>> filter_by_min_length(metadata, sequence_index, min_length=27000)
{'strain1'}
It is possible for the sequence index to be missing strains present in the metadata.
>>> sequence_index = pd.DataFrame([{"strain": "strain3", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
- >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
+ >>> filter_by_min_length(metadata, sequence_index, min_length=27000)
set()
"""
@@ -459,6 +459,34 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
return set(filtered_sequence_index[filtered_sequence_index["ACGT"] >= min_length].index.values)
+def filter_by_max_length(metadata, sequence_index, max_length) -> FilterFunctionReturn:
+ """Filter metadata by sequence length from a given sequence index.
+
+ Parameters
+ ----------
+ metadata : pandas.DataFrame
+ Metadata indexed by strain name
+ sequence_index : pandas.DataFrame
+ Sequence index
+ max_length : int
+ Maximum number of standard nucleotide characters (A, C, G, or T) in each sequence
+
+ Examples
+ --------
+ >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
+ >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
+ >>> filter_by_max_length(metadata, sequence_index, max_length=27000)
+ {'strain2'}
+ """
+ strains = set(metadata.index.values)
+ filtered_sequence_index = sequence_index.loc[
+ sequence_index.index.intersection(strains)
+ ]
+ filtered_sequence_index["ACGT"] = filtered_sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)
+
+ return set(filtered_sequence_index[filtered_sequence_index["ACGT"] <= max_length].index.values)
+
+
def filter_by_non_nucleotide(metadata, sequence_index) -> FilterFunctionReturn:
"""Filter metadata for strains with invalid nucleotide content.
@@ -667,21 +695,31 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi
))
# Filter by sequence length.
+ # Skip VCF files and warn the user that length filters do not
+ # make sense for VCFs.
+ is_vcf = filename_is_vcf(args.sequences)
if args.min_length:
- # Skip VCF files and warn the user that the min length filter does not
- # make sense for VCFs.
- is_vcf = filename_is_vcf(args.sequences)
-
- if is_vcf: #doesn't make sense for VCF, ignore.
+ if is_vcf:
print_err("WARNING: Cannot use min_length for VCF files. Ignoring...")
else:
exclude_by.append((
- filter_by_sequence_length,
+ filter_by_min_length,
{
"sequence_index": sequence_index,
"min_length": args.min_length,
}
))
+ if args.max_length:
+ if is_vcf:
+ print_err("WARNING: Cannot use max_length for VCF files. Ignoring...")
+ else:
+ exclude_by.append((
+ filter_by_max_length,
+ {
+ "sequence_index": sequence_index,
+ "max_length": args.max_length,
+ }
+ ))
# Exclude sequences with non-nucleotide characters.
if args.non_nucleotide:
@@ -755,13 +793,13 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
annotated in a sequence index.
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain")
- >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
+ >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> include_by = [(force_include_where, {"include_where": "region=Europe"})]
>>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by)
>>> strains_to_keep
{'strain1'}
>>> sorted(strains_to_exclude, key=lambda record: record["strain"])
- [{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}]
+ [{'strain': 'strain2', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}]
>>> strains_to_include
[{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}]
@@ -847,9 +885,9 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
Examples
--------
>>> from augur.dates import numeric_date
- >>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_min_date
+ >>> from augur.filter.include_exclude_rules import filter_by_min_length, filter_by_min_date
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain")
- >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
+ >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> _filter_kwargs_to_str(exclude_by[0][1])
'[["min_length", 27000]]'
>>> exclude_by = [(filter_by_min_date, {"date_column": "date", "min_date": numeric_date("2020-03-01")})]
=====================================
augur/filter/validate_arguments.py
=====================================
@@ -4,6 +4,7 @@ from augur.io.vcf import is_vcf as filename_is_vcf
SEQUENCE_ONLY_FILTERS = (
"min_length",
+ "max_length",
"non_nucleotide",
)
=====================================
augur/import_/beast.py
=====================================
@@ -233,45 +233,37 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose
tipNum=0
tree=None
- if isinstance(tree_path,str): ## determine if path or handle was provided to function
- try:
- handle=open_file(tree_path,'r')
- except FileNotFoundError:
- print("FATAL: No such file {}".format(tree_path))
- sys.exit(2)
- else:
- handle=tree_path
-
- for line in handle: ## iterate over lines
- l=line.strip('\n')
-
- nTaxa=re.search(r'dimensions ntax=([0-9]+);',l.lower()) ## get number of tips that should be in tree
- if nTaxa is not None:
- tipNum=int(nTaxa.group(1))
- if verbose:
- print('File should contain %d taxa'%(tipNum))
-
- treeString=re.search(treestring_regex,l) ## search for line with the tree
- if treeString is not None:
- treeString_start=l.index('(') ## find index of where tree string starts
- tree=parse_beast_tree(l[treeString_start:], tipMap=tips, verbose=verbose) ## parse tree string
-
- if verbose:
- print('Identified tree string')
-
- if tipFlag==True: ## going through tip encoding block
- tipEncoding=re.search(r'([0-9]+) ([A-Za-z\-\_\/\.\'0-9 \|?]+)',l) ## search for key:value pairs
- if tipEncoding is not None:
- tips[tipEncoding.group(1)]=tipEncoding.group(2).strip('"').strip("'") ## add to tips dict
- if verbose==True:
- print('Identified tip translation %s: %s'%(tipEncoding.group(1),tips[tipEncoding.group(1)]))
- elif ';' not in l:
- print('tip not captured by regex:',l.replace('\t',''))
-
- if 'translate' in l.lower(): ## tip encoding starts on next line
- tipFlag=True
- if ';' in l:
- tipFlag=False
+ with open_file(tree_path,'r') as handle: ## open tree_path as file, or consume directly if already a file handle
+ for line in handle: ## iterate over lines
+ l=line.strip('\n')
+
+ nTaxa=re.search(r'dimensions ntax=([0-9]+);',l.lower()) ## get number of tips that should be in tree
+ if nTaxa is not None:
+ tipNum=int(nTaxa.group(1))
+ if verbose:
+ print('File should contain %d taxa'%(tipNum))
+
+ treeString=re.search(treestring_regex,l) ## search for line with the tree
+ if treeString is not None:
+ treeString_start=l.index('(') ## find index of where tree string starts
+ tree=parse_beast_tree(l[treeString_start:], tipMap=tips, verbose=verbose) ## parse tree string
+
+ if verbose:
+ print('Identified tree string')
+
+ if tipFlag==True: ## going through tip encoding block
+ tipEncoding=re.search(r'([0-9]+) ([A-Za-z\-\_\/\.\'0-9 \|?]+)',l) ## search for key:value pairs
+ if tipEncoding is not None:
+ tips[tipEncoding.group(1)]=tipEncoding.group(2).strip('"').strip("'") ## add to tips dict
+ if verbose==True:
+ print('Identified tip translation %s: %s'%(tipEncoding.group(1),tips[tipEncoding.group(1)]))
+ elif ';' not in l:
+ print('tip not captured by regex:',l.replace('\t',''))
+
+ if 'translate' in l.lower(): ## tip encoding starts on next line
+ tipFlag=True
+ if ';' in l:
+ tipFlag=False
assert tree,'Tree not captured by regex'
assert tree.count_terminals()==tipNum,'Not all tips have been parsed.'
=====================================
augur/parse.py
=====================================
@@ -32,7 +32,12 @@ def fix_dates(d, dayfirst=True):
'''
try:
from pandas.core.tools.datetimes import parsing
- results = parsing.parse_time_string(d, dayfirst=dayfirst)
+ try:
+ # pandas 2.x
+ results = parsing.parse_datetime_string_with_reso(d, dayfirst=dayfirst)
+ except AttributeError:
+ # pandas 1.x
+ results = parsing.parse_time_string(d, dayfirst=dayfirst)
if len(results) == 2:
dto, res = results
else:
=====================================
augur/translate.py
=====================================
@@ -17,7 +17,8 @@ import sys
import numpy as np
from Bio import SeqIO, Seq, SeqRecord, Phylo
from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
-from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
+from .utils import parse_genes_argument, read_node_data, load_features, \
+ write_json, get_json_name, genome_features_to_auspice_annotation
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
@@ -446,24 +447,7 @@ def run(args):
continue
reference_translations[fname] = safe_translate(str(feat.extract(reference)))
- ## glob the annotations for later auspice export
- #
- # Note that BioPython FeatureLocations use
- # "Pythonic" coordinates: [zero-origin, half-open)
- # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
- annotations = {
- 'nuc': {'start': features['nuc'].location.start+1,
- 'end': features['nuc'].location.end,
- 'strand': '+',
- 'type': features['nuc'].type, # (unused by auspice)
- 'seqid': args.reference_sequence} # (unused by auspice)
- }
- for fname, feat in features.items():
- annotations[fname] = {'seqid':args.reference_sequence,
- 'type':feat.type,
- 'start':int(feat.location.start)+1,
- 'end':int(feat.location.end),
- 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+ annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)
## determine amino acid mutations for each node
try:
=====================================
augur/utils.py
=====================================
@@ -832,3 +832,57 @@ def _get_genes_from_file(fname):
print("Read in {} specified genes to translate.".format(len(unique_genes)))
return unique_genes
+
+
+
+def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
+ """
+ Parameters
+ ----------
+ features : dict
+ keys: feature names, values: Bio.SeqFeature.SeqFeature objects
+ ref_seq_name : str (optional)
+ Exported as the `seqid` for each feature. Note this is unused by Auspice
+ assert_nuc : bool (optional)
+ If true, one of the feature key names must be "nuc"
+
+ Returns
+ -------
+ annotations: dict
+ See schema-annotations.json for the schema this conforms to
+
+ """
+ from Bio.SeqFeature import SimpleLocation, CompoundLocation
+
+ if assert_nuc and 'nuc' not in features:
+ raise AugurError("Genome features must include a feature for 'nuc'")
+
+ def _parse(feat):
+ a = {}
+ # Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
+ # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
+ if type(feat.location)==SimpleLocation:
+ a['start'] = int(feat.location.start)+1
+ a['end'] = int(feat.location.end)
+ elif type(feat.location)==CompoundLocation:
+ a['segments'] = [
+ {'start':int(segment.start)+1, 'end':int(segment.end)}
+ for segment in feat.location.parts # segment: SimpleLocation
+ ]
+ else:
+ raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
+ a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
+ a['type'] = feat.type # (unused by auspice)
+ if ref_seq_name:
+ a['seqid'] = ref_seq_name # (unused by auspice)
+ return a
+
+ annotations = {}
+ for fname, feat in features.items():
+ annotations[fname] = _parse(feat)
+ if fname=='nuc':
+ assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
+ elif annotations[fname]['strand'] not in ['+', '-']:
+ print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
+
+ return annotations
=====================================
docs/usage/cli/filter.rst
=====================================
@@ -13,53 +13,9 @@ augur filter
:prog: augur
:path: filter
-How we subsample sequences in the zika-tutoral
-==============================================
+Guides
+======
-As an example, we'll look that the ``filter`` command in greater detail using material from the :doc:`Zika tutorial <docs.nextstrain.org:tutorials/creating-a-workflow>`.
-The filter command allows you to selected various subsets of your input data for different types of analysis.
-A simple example use of this command would be
+Below are some examples of using ``augur filter`` to sample data.
-.. code-block:: bash
-
- augur filter --sequences data/sequences.fasta --metadata data/metadata.tsv --min-date 2012 --output filtered.fasta
-
-This command will select all sequences with collection date in 2012 or later.
-The filter command has a large number of options that allow flexible filtering for many common situations.
-One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
-These can be specified in a separate file:
-
-.. code-block:: bash
-
- BRA/2016/FC_DQ75D1
- COL/FLR_00034/2015
- ...
-
-To drop such strains, you can pass the name of this file to the augur filter command:
-
-.. code-block:: bash
-
- augur filter --sequences data/sequences.fasta \
- --metadata data/metadata.tsv \
- --min-date 2012 \
- --exclude config/dropped_strains.txt \
- --output filtered.fasta
-
-(To improve legibility, we have wrapped the command across multiple lines.)
-If you run this command (you should be able to copy-paste this into your terminal) on the data provided in the :doc:`Zika tutorial <docs.nextstrain.org:tutorials/creating-a-workflow>`, you should see that one of the sequences in the data set was dropped since its name was in the ``dropped_strains.txt`` file.
-
-Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
-The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
-
-.. code-block:: bash
-
- augur filter \
- --sequences data/sequences.fasta \
- --metadata data/metadata.tsv \
- --min-date 2012 \
- --exclude config/dropped_strains.txt \
- --group-by country year month \
- --sequences-per-group 1 \
- --output filtered.fasta
-
-This subsampling and filtering will reduce the number of sequences in the tutorial data set from 34 to 24.
+.. include:: snippets/filtering-and-subsampling.rst
=====================================
docs/usage/cli/snippets/filtering-and-subsampling.rst
=====================================
@@ -0,0 +1,207 @@
+Filtering
+---------
+
+The filter command allows you to select various subsets of your input data for different types of analysis.
+A simple example use of this command would be
+
+.. code-block:: bash
+
+ augur filter \
+ --sequences data/sequences.fasta \
+ --metadata data/metadata.tsv \
+ --min-date 2012 \
+ --output-sequences filtered_sequences.fasta \
+ --output-metadata filtered_metadata.tsv
+
+This command will select all sequences with collection date in 2012 or later.
+The filter command has a large number of options that allow flexible filtering for many common situations.
+One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
+These can be specified in a separate text file (e.g. ``exclude.txt``):
+
+.. code-block::
+
+ BRA/2016/FC_DQ75D1
+ COL/FLR_00034/2015
+ ...
+
+To drop such strains, you can pass the filename to ``--exclude``:
+
+.. code-block:: bash
+
+ augur filter \
+ --sequences data/sequences.fasta \
+ --metadata data/metadata.tsv \
+ --min-date 2012 \
+ --exclude exclude.txt \
+ --output-sequences filtered_sequences.fasta \
+ --output-metadata filtered_metadata.tsv
+
+Subsampling within ``augur filter``
+-----------------------------------
+
+Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
+The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
+
+.. code-block:: bash
+
+ augur filter \
+ --sequences data/sequences.fasta \
+ --metadata data/metadata.tsv \
+ --min-date 2012 \
+ --exclude exclude.txt \
+ --group-by country year month \
+ --sequences-per-group 1 \
+ --output-sequences subsampled_sequences.fasta \
+ --output-metadata subsampled_metadata.tsv
+
+Subsampling using multiple ``augur filter`` commands
+----------------------------------------------------
+
+There are some subsampling strategies in which a single call to ``augur filter``
+does not suffice. One such strategy is "tiered subsampling". In this strategy,
+mutually exclusive sets of filters, each representing a "tier", are sampled with
+different subsampling rules. This is commonly used to create geographic tiers.
+Consider this subsampling scheme:
+
+ Sample 100 sequences from Washington state and 50 sequences from the rest of the United States.
+
+This cannot be done in a single call to ``augur filter``. Instead, it can be
+decomposed into multiple schemes, each handled by a single call to ``augur
+filter``. Additionally, there is an extra step to combine the intermediate
+samples.
+
+ 1. Sample 100 sequences from Washington state.
+ 2. Sample 50 sequences from the rest of the United States.
+ 3. Combine the samples.
+
+Calling ``augur filter`` multiple times
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A basic approach is to run the ``augur filter`` commands directly. This works
+well for ad-hoc analyses.
+
+.. code-block:: bash
+
+ # 1. Sample 100 sequences from Washington state
+ augur filter \
+ --sequences sequences.fasta \
+ --metadata metadata.tsv \
+ --query "state == 'WA'" \
+ --subsample-max-sequences 100 \
+ --output-strains sample_strains_state.txt
+
+ # 2. Sample 50 sequences from the rest of the United States
+ augur filter \
+ --sequences sequences.fasta \
+ --metadata metadata.tsv \
+ --query "state != 'WA' & country == 'USA'" \
+ --subsample-max-sequences 50 \
+ --output-strains sample_strains_country.txt
+
+ # 3. Combine using augur filter
+ augur filter \
+ --sequences sequences.fasta \
+ --metadata metadata.tsv \
+ --exclude-all \
+ --include sample_strains_state.txt \
+ sample_strains_country.txt \
+ --output-sequences subsampled_sequences.fasta \
+ --output-metadata subsampled_metadata.tsv
+
+Each intermediate sample is represented by a strain list file obtained from
+``--output-strains``. The final step uses ``augur filter`` with ``--exclude-all``
+and ``--include`` to sample the data based on the intermediate strain list
+files. If the same strain appears in both files, ``augur filter`` will only
+write it once in each of the final outputs.
+
+Generalizing subsampling in a workflow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The approach above can be cumbersome with more intermediate samples. To
+generalize this process and allow for more flexibility, a workflow management
+system can be used. The following examples use `Snakemake`_.
+
+1. Add a section in the `config file`_.
+
+ .. code-block:: yaml
+
+ subsampling:
+ state: --query "state == 'WA'" --subsample-max-sequences 100
+ country: --query "state != 'WA' & country == 'USA'" --subsample-max-sequences 50
+
+2. Add two rules in a `Snakefile`_. If you are building a standard Nextstrain
+ workflow, the output files should be used as input to sequence alignment. See
+ :doc:`docs.nextstrain.org:learn/parts` to learn more about the placement of
+ this step within a workflow.
+
+ .. code-block:: python
+
+ # 1. Sample 100 sequences from Washington state
+ # 2. Sample 50 sequences from the rest of the United States
+ rule intermediate_sample:
+ input:
+ metadata = "data/metadata.tsv",
+ output:
+ strains = "results/sample_strains_{sample_name}.txt",
+ params:
+ augur_filter_args = lambda wildcards: config.get("subsampling", {}).get(wildcards.sample_name, "")
+ shell:
+ """
+ augur filter \
+ --metadata {input.metadata} \
+ {params.augur_filter_args} \
+ --output-strains {output.strains}
+ """
+
+ # 3. Combine using augur filter
+ rule combine_intermediate_samples:
+ input:
+ sequences = "data/sequences.fasta",
+ metadata = "data/metadata.tsv",
+ intermediate_sample_strains = expand("results/sample_strains_{sample_name}.txt", sample_name=list(config.get("subsampling", {}).keys()))
+ output:
+ sequences = "results/subsampled_sequences.fasta",
+ metadata = "results/subsampled_metadata.tsv",
+ shell:
+ """
+ augur filter \
+ --sequences {input.sequences} \
+ --metadata {input.metadata} \
+ --exclude-all \
+ --include {input.intermediate_sample_strains} \
+ --output-sequences {output.sequences} \
+ --output-metadata {output.metadata}
+ """
+
+3. Run Snakemake targeting the second rule.
+
+ .. code-block:: bash
+
+ snakemake combine_intermediate_samples
+
+Explanation:
+
+- The configuration section consists of one entry per intermediate sample in the
+ format ``sample_name: <augur filter arguments>``.
+- The first rule is run once per intermediate sample using `wildcards`_ and an
+ `input function`_. The output of each run is the sampled strain list.
+- The second rule uses `expand()`_ to define input as all the intermediate
+ sampled strain lists, which are passed directly to ``--include`` as done in
+ the previous example.
+
+It is easy to add or remove intermediate samples. The configuration above can be
+updated to add another tier in between state and country:
+
+ .. code-block:: yaml
+
+ subsampling:
+ state: --query "state == 'WA'" --subsample-max-sequences 100
+ neighboring_states: --query "state in {'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 75
+ country: --query "country == 'USA' & state not in {'WA', 'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 50
+
+.. _Snakemake: https://snakemake.readthedocs.io/en/stable/index.html
+.. _config file: https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#snakefiles-standard-configuration
+.. _Snakefile: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html
+.. _wildcards: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards
+.. _input function: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#snakefiles-input-functions
+.. _expand(): https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-expand-function
=====================================
tests/functional/filter/cram/filter-min-length-output-metadata.t deleted
=====================================
@@ -1,17 +0,0 @@
-Setup
-
- $ source "$TESTDIR"/_setup.sh
-
-Filter using only metadata without sequence input or output and save results as filtered metadata.
-
- $ ${AUGUR} filter \
- > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
- > --metadata "$TESTDIR/../data/metadata.tsv" \
- > --min-date 2012 \
- > --min-length 10500 \
- > --output-metadata filtered_metadata.tsv > /dev/null
-
-Output should include the 8 sequences matching the filters and a header line.
-
- $ wc -l filtered_metadata.tsv
- \s*9 .* (re)
=====================================
tests/functional/filter/cram/filter-min-length-output-strains.t deleted
=====================================
@@ -1,17 +0,0 @@
-Setup
-
- $ source "$TESTDIR"/_setup.sh
-
-Filter using only metadata and save results as a list of filtered strains.
-
- $ ${AUGUR} filter \
- > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
- > --metadata "$TESTDIR/../data/metadata.tsv" \
- > --min-date 2012 \
- > --min-length 10500 \
- > --output-strains filtered_strains.txt > /dev/null
-
-Output should include only the 8 sequences matching the filters (without a header line).
-
- $ wc -l filtered_strains.txt
- \s*8 .* (re)
=====================================
tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t → tests/functional/filter/cram/filter-no-sequence-index-error.t
=====================================
@@ -3,7 +3,8 @@ Setup
$ source "$TESTDIR"/_setup.sh
Try to filter using only metadata without a sequence index.
-This should fail because the requested filters rely on sequence information.
+
+These should fail because the requested filters rely on sequence information.
$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
@@ -11,3 +12,17 @@ This should fail because the requested filters rely on sequence information.
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]
+
+ $ ${AUGUR} filter \
+ > --metadata "$TESTDIR/../data/metadata.tsv" \
+ > --max-length 10000 \
+ > --output-strains filtered_strains.txt > /dev/null
+ ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+ [2]
+
+ $ ${AUGUR} filter \
+ > --metadata "$TESTDIR/../data/metadata.tsv" \
+ > --non-nucleotide \
+ > --output-strains filtered_strains.txt > /dev/null
+ ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+ [2]
=====================================
tests/functional/filter/cram/filter-sequence-length.t
=====================================
@@ -0,0 +1,18 @@
+Setup
+
+ $ source "$TESTDIR"/_setup.sh
+
+Filter using --min-length and --max-length.
+
+ $ ${AUGUR} filter \
+ > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
+ > --metadata "$TESTDIR/../data/metadata.tsv" \
+ > --min-length 10500 \
+ > --max-length 10700 \
+ > --output-strains filtered_strains.txt
+ 7 strains were dropped during filtering
+ 1 had no metadata
+ 1 had no sequence data
+ 2 were dropped because they were shorter than the minimum length of 10500bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
+ 3 were dropped because they were longer than the maximum length of 10700bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
+ 6 strains passed all filters
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/7cc0c25772b116b2e3dafd7d9bab36c04fe3a7a5
--
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/7cc0c25772b116b2e3dafd7d9bab36c04fe3a7a5
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240424/83a0cd06/attachment-0001.htm>
More information about the debian-med-commit
mailing list