[med-svn] [Git][med-team/augur][upstream] New upstream version 18.2.0

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Nov 19 17:23:46 GMT 2022



Étienne Mollier pushed to branch upstream at Debian Med / augur


Commits:
949a6709 by Étienne Mollier at 2022-11-19T17:53:35+01:00
New upstream version 18.2.0
- - - - -


27 changed files:

- .github/pull_request_template.md
- CHANGES.md
- augur/__init__.py
- augur/__version__.py
- augur/argparse_.py
- + augur/curate/__init__.py
- + augur/curate/normalize_strings.py
- + augur/curate/passthru.py
- augur/export.py
- + augur/io/json.py
- augur/io/metadata.py
- augur/io/sequences.py
- + augur/types.py
- docs/usage/cli/cli.rst
- + docs/usage/cli/curate/index.rst
- + docs/usage/cli/curate/normalize-strings.rst
- + docs/usage/cli/curate/passthru.rst
- setup.py
- + tests/functional/curate/cram/metadata-and-fasta-input.t
- + tests/functional/curate/cram/metadata-and-fasta-output.t
- + tests/functional/curate/cram/metadata-input.t
- + tests/functional/curate/cram/metadata-output.t
- + tests/functional/curate/cram/normalize_strings.t
- + tests/functional/curate/cram/passthru.t
- tests/io/test_file.py
- + tests/io/test_metadata.py
- tests/io/test_sequences.py


Changes:

=====================================
.github/pull_request_template.md
=====================================
@@ -14,4 +14,4 @@ If you added or changed behavior in the codebase, did you update the tests, or d
 
 ### Checklist
 
-- [ ] Add a message in [CHANGES.md](https://github.com/nextstrain/augur/blob/HEAD/CHANGES.md) summarizing the changes in this PR. Keep headers and formatting consistent with the rest of the file.
+- [ ] Add a message in [CHANGES.md](https://github.com/nextstrain/augur/blob/HEAD/CHANGES.md) summarizing the changes in this PR that are end user focused. Keep headers and formatting consistent with the rest of the file.


=====================================
CHANGES.md
=====================================
@@ -3,6 +3,14 @@
 ## __NEXT__
 
 
+## 18.2.0 (15 November 2022)
+
+### Features
+
+* Add the curate subcommand with two sub-subcommands, passthru and normalize-strings. The curate subcommand is intended to be a suite of commands to help users with data curation prior to running Nextstrain analyses. We will continue to add more subcommands as we identify other common data curation tasks. Please see the [usage docs](https://docs.nextstrain.org/projects/augur/en/stable/usage/cli/curate) for details. [#1039][] (@joverlee521)
+
+[#1039]: https://github.com/nextstrain/augur/pull/1039
+
 ## 18.1.2 (1 November 2022)
 
 ### Bug Fixes


=====================================
augur/__init__.py
=====================================
@@ -21,6 +21,7 @@ if recursion_limit:
 
 command_strings = [
     "parse",
+    "curate",
     "index",
     "filter",
     "mask",


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '18.1.2'
+__version__ = '18.2.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/argparse_.py
=====================================
@@ -16,7 +16,7 @@ def add_default_command(parser):
     parser.set_defaults(__command__ = default_command)
 
 
-def add_command_subparsers(subparsers, commands):
+def add_command_subparsers(subparsers, commands, command_attribute='__command__'):
     """
     Add subparsers for each command module.
 
@@ -30,13 +30,18 @@ def add_command_subparsers(subparsers, commands):
         A list of modules that are commands that require their own subparser.
         Each module is required to have a `register_parser` function to add its own
         subparser and arguments.
+
+    command_attribute: str, optional
+        Optional attribute name for the commands. The default is `__command__`,
+        which allows top-level augur to run commands directly via `args.__command__.run()`.
     """
     for command in commands:
         # Allow each command to register its own subparser
         subparser = command.register_parser(subparsers)
 
-        # Allows us to run commands directly with `args.__command__.run()`
-        subparser.set_defaults(__command__ = command)
+        # Add default attribute for command module
+        if command_attribute:
+            subparser.set_defaults(**{command_attribute: command})
 
         # Use the same formatting class for every command for consistency.
         # Set here to avoid repeating it in every command's register_parser().

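For illustration, a minimal sketch of the contract this expects from a
command module (the `example_command` object here is a hypothetical
stand-in; real callers pass command modules, and augur must be importable):

    import argparse
    from types import SimpleNamespace

    from augur.argparse_ import add_command_subparsers

    # Hypothetical stand-in for a command module: it only needs a
    # register_parser() function, mirroring the docstring above.
    def _register_parser(subparsers):
        return subparsers.add_parser("example", help="An example command.")

    example_command = SimpleNamespace(
        register_parser=_register_parser,
        run=lambda args: print("ran example"),
    )

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    # With the default command_attribute, top-level code dispatches via
    # args.__command__.run(args); `augur curate` instead passes
    # command_attribute='_curate_subcommand' so its subcommands are not
    # run directly by top-level augur.
    add_command_subparsers(subparsers, [example_command])

    args = parser.parse_args(["example"])
    getattr(args, "__command__").run(args)   # prints: ran example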

=====================================
augur/curate/__init__.py
=====================================
@@ -0,0 +1,175 @@
+"""
+A suite of commands to help with data curation.
+"""
+import argparse
+import sys
+from collections import deque
+from textwrap import dedent
+
+from augur.argparse_ import add_command_subparsers
+from augur.errors import AugurError
+from augur.io.json import dump_ndjson, load_ndjson
+from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
+from augur.io.sequences import write_records_to_fasta
+from augur.types import DataErrorMethod
+from . import normalize_strings, passthru
+
+
+SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
+SUBCOMMANDS = [
+    passthru,
+    normalize_strings,
+]
+
+
+def create_shared_parser():
+    """
+    Creates an argparse.ArgumentParser that is intended to be used as a parent
+    parser¹ for all `augur curate` subcommands. This should include all options
+    that are intended to be shared across the subcommands.
+
+    Note that any option strings used here cannot be used in individual subcommand
+    subparsers unless the subparser specifically sets `conflict_handler='resolve'`²,
+    in which case the subparser option will override the option defined here.
+
+    Based on https://stackoverflow.com/questions/23296695/permit-argparse-global-flags-after-subcommand/23296874#23296874
+
+    ¹ https://docs.python.org/3/library/argparse.html#parents
+    ² https://docs.python.org/3/library/argparse.html#conflict-handler
+    """
+    shared_parser = argparse.ArgumentParser(add_help=False)
+
+    shared_inputs = shared_parser.add_argument_group(
+        title="INPUTS",
+        description="""
+            Input options shared by all `augur curate` commands.
+            If no input options are provided, commands will try to read NDJSON records from stdin.
+        """)
+    shared_inputs.add_argument("--metadata",
+        help="Input metadata file, as CSV or TSV. Accepts '-' to read metadata from stdin.")
+    shared_inputs.add_argument("--id-column",
+        help="Name of the metadata column that contains the record identifier for reporting duplicate records. "
+             "Uses the first column of the metadata file if not provided. "
+             "Ignored if also providing a FASTA file input.")
+
+    shared_inputs.add_argument("--fasta",
+        help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. " +
+             "Note that an index file will be generated for the FASTA file as <filename>.fasta.fxi")
+    shared_inputs.add_argument("--seq-id-column",
+        help="Name of metadata column that contains the sequence id to match sequences in the FASTA file.")
+    shared_inputs.add_argument("--seq-field",
+        help="The name to use for the sequence field when joining sequences from a FASTA file.")
+
+    shared_inputs.add_argument("--unmatched-reporting",
+        choices=[ method.value for method in DataErrorMethod ],
+        default=DataErrorMethod.ERROR_FIRST.value,
+        help="How unmatched records from combined metadata/FASTA input should be reported.")
+    shared_inputs.add_argument("--duplicate-reporting",
+        choices=[ method.value for method in DataErrorMethod ],
+        default=DataErrorMethod.ERROR_FIRST.value,
+        help="How should duplicate records be reported.")
+
+    shared_outputs = shared_parser.add_argument_group(
+        title="OUTPUTS",
+        description="""
+            Output options shared by all `augur curate` commands.
+            If no output options are provided, commands will output NDJSON records to stdout.
+        """)
+    shared_outputs.add_argument("--output-metadata",
+        help="Output metadata TSV file. Accepts '-' to output TSV to stdout.")
+
+    shared_outputs.add_argument("--output-fasta",
+        help="Output FASTA file.")
+    shared_outputs.add_argument("--output-id-field",
+        help="The record field to use as the sequence identifier in the FASTA output.")
+    shared_outputs.add_argument("--output-seq-field",
+        help="The record field that contains the sequence for the FASTA output. "
+             "This field will be deleted from the metadata output.")
+
+    return shared_parser
+
+
+def register_parser(parent_subparsers):
+    shared_parser = create_shared_parser()
+    parser = parent_subparsers.add_parser("curate", help=__doc__)
+
+    # Add print_help so we can run it when no subcommands are called
+    parser.set_defaults(print_help = parser.print_help)
+
+    # Add subparsers for subcommands
+    subparsers = parser.add_subparsers(dest="subcommand", required=False)
+    # Add the shared_parser to make it available for subcommands
+    # to include in their own parser
+    subparsers.shared_parser = shared_parser
+    # Using a subcommand attribute so subcommands are not directly
+    # run by top-level Augur. I/O is handled in `curate` so individual
+    # subcommands do not have to worry about it.
+    add_command_subparsers(subparsers, SUBCOMMANDS, SUBCOMMAND_ATTRIBUTE)
+
+    return parser
+
+
+def run(args):
+    # Print help if no subcommands are used
+    if not getattr(args, SUBCOMMAND_ATTRIBUTE, None):
+        return args.print_help()
+
+    # Check provided args are valid and required combination of args are provided
+    if not args.fasta and (args.seq_id_column or args.seq_field):
+        raise AugurError("The --seq-id-column and --seq-field options should only be used when providing a FASTA file.")
+
+    if args.fasta and (not args.seq_id_column or not args.seq_field):
+        raise AugurError("The --seq-id-column and --seq-field options are required for a FASTA file input.")
+
+    if not args.output_fasta and (args.output_id_field or args.output_seq_field):
+        raise AugurError("The --output-id-field and --output-seq-field options should only be used when requesting a FASTA output.")
+
+    if args.output_fasta and (not args.output_id_field or not args.output_seq_field):
+        raise AugurError("The --output-id-field and --output-seq-field options are required for a FASTA output.")
+
+    # Read inputs
+    # Special case single hyphen as stdin
+    if args.metadata == '-':
+        args.metadata = sys.stdin
+
+    if args.metadata and args.fasta:
+        records = read_metadata_with_sequences(
+            args.metadata,
+            args.fasta,
+            args.seq_id_column,
+            args.seq_field,
+            DataErrorMethod(args.unmatched_reporting),
+            DataErrorMethod(args.duplicate_reporting))
+    elif args.metadata:
+        records = read_table_to_dict(args.metadata, DataErrorMethod(args.duplicate_reporting), args.id_column)
+    elif not sys.stdin.isatty():
+        records = load_ndjson(sys.stdin)
+    else:
+        raise AugurError(dedent("""\
+            No valid inputs were provided.
+            NDJSON records can be streamed from stdin or
+            input files can be provided via the command line options `--metadata` and `--fasta`.
+            See the command's help message for more details."""))
+
+    # Run subcommand to get modified records
+    modified_records = getattr(args, SUBCOMMAND_ATTRIBUTE).run(args, records)
+
+    # Output modified records
+    # First output FASTA, since write_records_to_fasta yields the records again
+    # with the sequence field removed
+    if args.output_fasta:
+        modified_records = write_records_to_fasta(
+            modified_records,
+            args.output_fasta,
+            args.output_id_field,
+            args.output_seq_field)
+
+    if args.output_metadata:
+        write_records_to_tsv(modified_records, args.output_metadata)
+
+    if not (args.output_fasta or args.output_metadata):
+        dump_ndjson(modified_records)
+    else:
+        # Exhaust generator to ensure we run through all records
+        # when only a FASTA output is requested but not a metadata output
+        deque(modified_records, maxlen=0)

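The parent-parser pattern that create_shared_parser() relies on can be
sketched stand-alone as follows (option and command names here are
illustrative only, not the real curate options):

    import argparse

    # Shared options live on a parser created with add_help=False so
    # they can be inherited by every subcommand via parents=[...].
    shared = argparse.ArgumentParser(add_help=False)
    shared.add_argument("--metadata")

    parser = argparse.ArgumentParser(prog="curate-sketch")
    subparsers = parser.add_subparsers(dest="subcommand")

    # A typical subcommand simply inherits --metadata...
    subparsers.add_parser("passthru", parents=[shared])

    # ...while a subcommand that needs to redefine a shared option must
    # set conflict_handler='resolve', as noted in the docstring above.
    special = subparsers.add_parser("special", parents=[shared],
                                    conflict_handler="resolve")
    special.add_argument("--metadata", help="overrides the shared option")

    args = parser.parse_args(["passthru", "--metadata", "in.tsv"])
    print(args.subcommand, args.metadata)   # passthru in.tsv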

=====================================
augur/curate/normalize_strings.py
=====================================
@@ -0,0 +1,49 @@
+"""
+Normalize strings to a Unicode normalization form and strip leading and trailing whitespaces.
+
+Strings need to be normalized for predictable string comparisons, especially
+in cases where strings contain diacritics (see https://unicode.org/faq/normalization.html).
+"""
+import unicodedata
+
+from augur.utils import first_line
+
+
+def register_parser(parent_subparsers):
+    parser = parent_subparsers.add_parser("normalize-strings",
+        parents=[parent_subparsers.shared_parser],
+        help=first_line(__doc__))
+
+    optional = parser.add_argument_group(title="OPTIONAL")
+    optional.add_argument("--form", default="NFC", choices=["NFC", "NFKC", "NFD", "NFKD"],
+        help="Unicode normalization form to use for normalization.")
+    return parser
+
+
+def normalize_strings(record, form='NFC'):
+    """
+    Normalizes string values in *record* to a Unicode normalization *form*
+    and strips leading and trailing whitespace from strings.
+    Uses the `NFC` normalization form by default.
+
+    Parameters
+    ----------
+    record: dict
+        An input record to be normalized
+    form: str, optional
+        An optional Unicode normalization form
+
+    Returns
+    -------
+    record: dict
+        The modified record, a shallow copy of the original record
+    """
+    return {
+        key: (unicodedata.normalize(form, value).strip() if isinstance(value, str) else value)
+        for key, value in record.items()
+    }
+
+
+def run(args, records):
+    for record in records:
+        yield normalize_strings(record, args.form)

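A short worked example of why the normalization matters: the same
accented string compares unequal across Unicode forms until both sides
are normalized to a common form.

    import unicodedata

    precomposed = "Côte d'Ivoire"                           # 'ô' as U+00F4
    decomposed = unicodedata.normalize("NFD", precomposed)  # 'o' + U+0302
    assert precomposed != decomposed
    assert unicodedata.normalize("NFC", decomposed) == precomposed

    # Applied to a record, mirroring normalize_strings() above;
    # non-string values pass through untouched.
    record = {"country": "  Côte d'Ivoire ", "count": 3}
    print({
        key: (unicodedata.normalize("NFC", value).strip()
              if isinstance(value, str) else value)
        for key, value in record.items()
    })  # {'country': "Côte d'Ivoire", 'count': 3}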

=====================================
augur/curate/passthru.py
=====================================
@@ -0,0 +1,15 @@
+"""
+Pass through records without doing any data transformations.
+Useful for testing, troubleshooting, or just converting file formats.
+"""
+
+
+def register_parser(parent_subparsers):
+    return parent_subparsers.add_parser("passthru",
+        parents=[parent_subparsers.shared_parser],
+        help=__doc__)
+
+
+def run(args, records):
+    yield from records
+


=====================================
augur/export.py
=====================================
@@ -5,8 +5,8 @@ from .argparse_ import add_command_subparsers
 from . import export_v1, export_v2
 
 SUBCOMMANDS = [
-    export_v1,
     export_v2,
+    export_v1,
 ]
 
 


=====================================
augur/io/json.py
=====================================
@@ -0,0 +1,243 @@
+"""
+A copy of id3c/lib/id3c/json.py and the additional util functions from
+id3c/lib/id3c/utils.py of the seattleflu/id3c repo as of commit 911e7d7 and
+licensed under the MIT License. The License file included in the repo is
+copied below verbatim.
+
+MIT License
+
+Copyright (c) 2018 Brotman Baty Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import json
+from datetime import datetime
+from typing import Iterable
+from uuid import UUID
+
+
+def as_json(value):
+    """
+    Converts *value* to a JSON string using our custom :class:`JsonEncoder`.
+    """
+    return json.dumps(value, allow_nan = False, cls = JsonEncoder)
+
+
+def load_json(value):
+    """
+    Converts *value* from a JSON string with better error messages.
+    Raises an :exc:`augur.io.json.JSONDecodeError` which provides improved error
+    messaging, compared to :exc:`json.JSONDecodeError`, when stringified.
+    """
+    try:
+        return json.loads(value)
+    except json.JSONDecodeError as e:
+        raise JSONDecodeError(e) from e
+
+
+def dump_ndjson(iterable: Iterable) -> None:
+    """
+    :func:`print` *iterable* as a set of newline-delimited JSON records.
+    """
+    for item in iterable:
+        print(as_json(item))
+
+
+def load_ndjson(file: Iterable[str], ignore_empty_lines = True) -> Iterable:
+    """
+    Load newline-delimited JSON records from *file*. Ignore empty lines
+    in the file by default.
+    """
+    for line in file:
+        # Skip empty lines when requested
+        if ignore_empty_lines and not line.strip():
+            continue
+
+        yield load_json(line)
+
+
+class JsonEncoder(json.JSONEncoder):
+    """
+    Encodes Python values into JSON for non-standard objects.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Disallows the floating-point values NaN, Infinity, and -Infinity.
+        Python's :class:`json` allows them by default because they work with
+        JSON-as-JavaScript, but they don't work with spec-compliant JSON
+        parsers.
+        """
+        kwargs["allow_nan"] = False
+        super().__init__(*args, **kwargs)
+
+    def default(self, value):
+        """
+        Returns *value* as JSON or raises a TypeError.
+        Serializes:
+        * :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()`
+        * :class:`~uuid.UUID` using ``str()``
+        """
+        if isinstance(value, datetime):
+            return value.isoformat()
+
+        elif isinstance(value, UUID):
+            return str(value)
+
+        else:
+            # Let the base class raise the TypeError
+            return super().default(value)
+
+
+class JSONDecodeError(json.JSONDecodeError):
+    """
+    Subclass of :class:`json.JSONDecodeError` which contextualizes the
+    stringified error message by including a snippet of the JSON source input.
+    Typically you won't need to ever reference this class directly.  It will be
+    raised by :func:`load_json` and be caught by except blocks which catch the
+    standard :class:`json.JSONDecodeError`.
+    >>> load_json('{foo: "bar"}')
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1): '{▸▸▸f◂◂◂oo: "bar"}'
+    >>> load_json('not json')
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting value: line 1 column 1 (char 0): 'not json'
+    >>> load_json("[0, 1, 2, 3, 4, 5")
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting ',' delimiter: line 1 column 18 (char 17): unexpected end of document: '…, 3, 4, 5'
+    >>> load_json("[\\n")
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting value: line 2 column 1 (char 2): unexpected end of document: '[\\n'
+    >>> load_json("\\n")
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting value: line 2 column 1 (char 1): unexpected end of document: '\\n'
+    >>> load_json('')
+    Traceback (most recent call last):
+        ...
+    augur.io.json.JSONDecodeError: Expecting value: line 1 column 1 (char 0): (empty source document)
+    """
+    CONTEXT_LENGTH = 10
+
+    def __init__(self, exc: json.JSONDecodeError):
+        super().__init__(exc.msg, exc.doc, exc.pos)
+
+    def __str__(self):
+        error = super().__str__()
+
+        if self.doc:
+            if self.pos == 0 and self.msg == "Expecting value":
+                # Most likely not a JSON document at all, so show the whole thing.
+                context = repr(self.doc)
+            elif self.pos > 0 and self.pos == len(self.doc):
+                context = "unexpected end of document: " + repr(shorten_left(self.doc, self.CONTEXT_LENGTH, "…"))
+            else:
+                context = repr(contextualize_char(self.doc, self.pos, self.CONTEXT_LENGTH))
+        else:
+            context = "(empty source document)"
+
+        return f"{error}: {context}"
+
+
+def shorten_left(text, length, placeholder):
+    """
+    A variant of :py:func:`shorten` which shortens from the left end of *text*
+    instead of the right.
+
+    >>> shorten_left("foobar", 6, "...")
+    'foobar'
+    >>> shorten_left("foobarbaz", 6, "...")
+    '...baz'
+    >>> shorten_left("foobar", 3, "...")
+    Traceback (most recent call last):
+        ...
+    ValueError: maximum length (3) must be greater than length of placeholder (3)
+    """
+    if length <= len(placeholder):
+        raise ValueError(f"maximum length ({length}) must be greater than length of placeholder ({len(placeholder)})")
+
+    if len(text) > length:
+        return placeholder + text[-(length - len(placeholder)):]
+    else:
+        return text
+
+
+def contextualize_char(text, idx, context = 10):
+    """
+    Marks the *idx* char in *text* and snips out a surrounding amount of
+    *context*.
+
+    Avoids making a copy of *text* before snipping, in case *text* is very
+    large.
+
+    >>> contextualize_char('hello world', 0, context = 4)
+    '▸▸▸h◂◂◂ello…'
+    >>> contextualize_char('hello world', 5, context = 3)
+    '…llo▸▸▸ ◂◂◂wor…'
+    >>> contextualize_char('hello world', 5, context = 100)
+    'hello▸▸▸ ◂◂◂world'
+    >>> contextualize_char('hello world', 10)
+    'hello worl▸▸▸d◂◂◂'
+    >>> contextualize_char('hello world', 2, context = 0)
+    '…▸▸▸l◂◂◂…'
+
+    >>> contextualize_char('hello world', 11)
+    Traceback (most recent call last):
+        ...
+    IndexError: string index out of range
+    """
+    if context < 0:
+        raise ValueError("context must be positive")
+
+    start = max(0, idx - context)
+    end   = min(len(text), idx + context + 1)
+    idx   = min(idx, context)
+
+    start_placeholder = "…" if start > 0         else ""
+    end_placeholder   = "…" if end   < len(text) else ""
+
+    return start_placeholder + mark_char(text[start:end], idx) + end_placeholder
+
+
+def mark_char(text, idx):
+    """
+    Prominently marks the *idx* char in *text*.
+
+    >>> mark_char('hello world', 0)
+    '▸▸▸h◂◂◂ello world'
+    >>> mark_char('hello world', 2)
+    'he▸▸▸l◂◂◂lo world'
+    >>> mark_char('hello world', 10)
+    'hello worl▸▸▸d◂◂◂'
+
+    >>> mark_char('hello world', 11)
+    Traceback (most recent call last):
+        ...
+    IndexError: string index out of range
+
+    >>> mark_char('', 0)
+    Traceback (most recent call last):
+        ...
+    IndexError: string index out of range
+    """
+    return text[0:idx] + '▸▸▸' + text[idx] + '◂◂◂' + text[idx+1:]

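A round-trip sketch of the NDJSON helpers above (assuming augur is
importable):

    import io
    from augur.io.json import dump_ndjson, load_ndjson

    # dump_ndjson() prints one JSON document per line...
    dump_ndjson([{"strain": "A"}, {"strain": "B"}])
    # {"strain": "A"}
    # {"strain": "B"}

    # ...and load_ndjson() lazily parses lines back into records,
    # skipping empty lines by default.
    ndjson = io.StringIO('{"strain": "A"}\n\n{"strain": "B"}\n')
    print(list(load_ndjson(ndjson)))
    # [{'strain': 'A'}, {'strain': 'B'}]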

=====================================
augur/io/metadata.py
=====================================
@@ -1,4 +1,15 @@
+import csv
+import os
 import pandas as pd
+import pyfastx
+import sys
+from io import StringIO
+from itertools import chain
+
+from augur.errors import AugurError
+from augur.io.print import print_err
+from augur.types import DataErrorMethod
+from .file import open_file
 
 
 def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
@@ -88,3 +99,328 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
         metadata_file,
         **kwargs
     )
+
+
+def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, id_column=None):
+    """
+    Read rows from *table* file and yield each row as a single dict.
+
+    Will report duplicate records based on the *id_column* if requested via
+    *duplicate_reporting* after the generator has been exhausted.
+
+    Parameters
+    ----------
+    table: str
+        Path to a CSV or TSV file or IO buffer
+
+    duplicate_reporting: DataErrorMethod, optional
+        How should duplicate records be reported
+
+    id_column: str, optional
+        Name of the column that contains the record identifier used for reporting duplicates.
+        Uses the first column of the metadata if not provided.
+
+    Yields
+    ------
+    dict:
+        The parsed row as a single record
+
+    Raises
+    ------
+    AugurError:
+        Raised for any of the following reasons:
+        1. There are parsing errors from the csv standard library
+        2. The provided *id_column* does not exist in the *metadata*
+        3. The *duplicate_reporting* method is set to ERROR_FIRST or ERROR_ALL and duplicate(s) are found
+    """
+    valid_delimiters = [',', '\t']
+    seen_ids = set()
+    duplicate_ids = set()
+    with open_file(table) as handle:
+        # Get sample to determine delimiter
+        table_sample = handle.read(1024)
+
+        if handle.seekable():
+            handle.seek(0)
+        else:
+            table_sample_file = StringIO(table_sample)
+            handle = chain(table_sample_file, handle)
+
+        try:
+            dialect = csv.Sniffer().sniff(table_sample, valid_delimiters)
+        except csv.Error as err:
+            raise AugurError(
+                f"Could not determine the delimiter of {table!r}. "
+                "File must be a CSV or TSV."
+            ) from err
+
+        metadata_reader = csv.DictReader(handle, dialect=dialect)
+        if duplicate_reporting is DataErrorMethod.SILENT:
+            # Directly yield from metadata reader since we do not need to check for duplicate ids
+            yield from metadata_reader
+        else:
+            if id_column is None:
+                id_column = metadata_reader.fieldnames[0]
+
+            for record in metadata_reader:
+                record_id = record.get(id_column)
+                if record_id is None:
+                    raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.")
+
+                if record_id in seen_ids:
+                    # Immediately raise an error if requested to error on the first duplicate
+                    if duplicate_reporting is DataErrorMethod.ERROR_FIRST:
+                        raise AugurError(f"Encountered record with duplicate id {record_id!r} in {table!r}")
+
+                    # Give immediate feedback on duplicates if requested to warn on duplicates
+                    # We'll also print a full summary of duplicates once the generator is exhausted
+                    if duplicate_reporting is DataErrorMethod.WARN:
+                        print_err(f"WARNING: Encountered record with duplicate id {record_id!r} in {table!r}")
+
+                    duplicate_ids.add(record_id)
+                else:
+                    seen_ids.add(record_id)
+
+                yield record
+
+    if duplicate_reporting is not DataErrorMethod.SILENT and duplicate_ids:
+        duplicates_message = f"The following records are duplicated in {table!r}:\n" + "\n".join(map(repr, sorted(duplicate_ids)))
+
+        if duplicate_reporting is DataErrorMethod.WARN:
+            print_err(f"WARNING: {duplicates_message}")
+        elif duplicate_reporting is DataErrorMethod.ERROR_ALL:
+            raise AugurError(duplicates_message)
+        else:
+            raise ValueError(f"Encountered unhandled duplicate reporting method: {duplicate_reporting!r}")
+
+
+def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequence',
+    unmatched_reporting=DataErrorMethod.ERROR_FIRST, duplicate_reporting=DataErrorMethod.ERROR_FIRST):
+    """
+    Read rows from the *metadata* file and yield each row as a single dict that has
+    been updated with its corresponding sequence from the *fasta* file.
+    Matches the metadata record with sequences using the sequence id provided
+    in the *seq_id_column*. To ensure that the sequences can be matched with
+    the metadata, the FASTA headers must contain the matching sequence id. The
+    FASTA headers may include additional description parts after the id, but
+    they will not be used to match the metadata.
+
+    Will report unmatched records if requested via *unmatched_reporting*.
+    Note the ERROR_FIRST method will raise an error at the first unmatched metadata record
+    but not for an unmatched sequence record because we can only check for unmatched sequences
+    after exhausting the metadata generator.
+
+    Will report duplicate records if requested via *duplicate_reporting*.
+
+    Reads the *fasta* file with `pyfastx.Fasta`, which creates an index for
+    the file to allow random access of sequences via the sequence id.
+    Will remove any existing index file named `<fasta>.fxi` to force the
+    rebuilding of the index so that there's no chance of using stale cached indexes.
+    See pyfastx docs for more details:
+    https://pyfastx.readthedocs.io/en/latest/usage.html#fasta
+
+    Parameters
+    ----------
+    metadata: str
+        Path to a CSV or TSV metadata file
+
+    fasta: str
+        Path to a plain or gzipped FASTA file
+
+    seq_id_column: str
+        The column in the metadata file that contains the sequence id for
+        matching sequences
+
+    seq_field: str, optional
+        The field name to use for the sequence in the updated record
+
+    unmatched_reporting: DataErrorMethod, optional
+        How should unmatched records be reported
+
+    duplicate_reporting: DataErrorMethod, optional
+        How should duplicate records be reported
+
+    Yields
+    ------
+    dict
+        The parsed metadata record with the sequence
+    """
+    # Remove the old Pyfastx index to force rebuild of index
+    # so we don't have to worry about a stale cached index
+    try:
+        os.remove(f"{fasta}.fxi")
+    except FileNotFoundError:
+        pass
+
+    sequences = pyfastx.Fasta(fasta)
+    sequence_ids = set(sequences.keys())
+
+    # Used for determining unmatched records
+    processed_sequence_ids = set()
+    unmatched_metadata_ids = set()
+
+    # Used for determining duplicate records
+    processed_metadata_ids = set()
+    duplicate_metadata_ids = set()
+    duplicate_sequence_ids = set()
+
+    # Check for duplicates in the FASTA file first since pyfastx will only return
+    # the first sequence of duplicates, which may lead to unexpected results.
+    # Look for duplicate sequence ids if the number of sequences does not match the number of unique ids
+    if duplicate_reporting is not DataErrorMethod.SILENT and len(sequences) != len(sequence_ids):
+        seen_sequence_ids = set()
+        for seq_id in sequences.keys():
+            if seq_id in seen_sequence_ids:
+                # Immediately raise an error if requested to error on the first duplicate
+                if duplicate_reporting is DataErrorMethod.ERROR_FIRST:
+                    raise AugurError(f"Encountered sequence record with duplicate id {seq_id!r}.")
+
+                # Give immediate feedback on duplicates if requested to warn on duplicates
+                # We'll also print a full summary of duplicates at the end of the command
+                if duplicate_reporting is DataErrorMethod.WARN:
+                    print_err(f"WARNING: Encountered sequence record with duplicate id {seq_id!r}.")
+
+                duplicate_sequence_ids.add(seq_id)
+            else:
+                seen_sequence_ids.add(seq_id)
+
+    # Silencing duplicate reporting here because we will need to handle duplicates
+    # in both the metadata and FASTA files after processing all the records here.
+    for record in read_table_to_dict(metadata, duplicate_reporting=DataErrorMethod.SILENT):
+        seq_id = record.get(seq_id_column)
+
+        if seq_id is None:
+            raise AugurError(f"The provided sequence id column {seq_id_column!r} does not exist in the metadata.")
+
+        # Keep track of duplicate ids to report duplicate records if requested
+        if seq_id in processed_metadata_ids:
+            # Immediately raise an error if requested to error on the first duplicate
+            if duplicate_reporting is DataErrorMethod.ERROR_FIRST:
+                raise AugurError(f"Encountered metadata record with duplicate id {seq_id!r}.")
+
+            # Give immediate feedback on duplicates if requested to warn on duplicates
+            # We'll also print a full summary of duplicates at the end of the command
+            if duplicate_reporting is DataErrorMethod.WARN:
+                print_err(f"WARNING: Encountered metadata record with duplicate id {seq_id!r}.")
+
+            duplicate_metadata_ids.add(seq_id)
+        else:
+            processed_metadata_ids.add(seq_id)
+
+        # Skip records that do not have a matching sequence
+        # TODO: change this to try/except to fetch sequences and catch
+        # KeyError for non-existing sequences when https://github.com/lmdu/pyfastx/issues/50 is resolved
+        if seq_id not in sequence_ids:
+            # Immediately raise an error if requested to error on the first unmatched record
+            if unmatched_reporting is DataErrorMethod.ERROR_FIRST:
+                raise AugurError(f"Encountered metadata record {seq_id!r} without a matching sequence.")
+
+            # Give immediate feedback on unmatched records if requested to warn on unmatched
+            # We'll also print a full summary of unmatched records at the end of the command
+            if unmatched_reporting is DataErrorMethod.WARN:
+                print_err(f"WARNING: Encountered metadata record {seq_id!r} without a matching sequence.")
+
+            # Save unmatched metadata ids to report unmatched records if requested
+            unmatched_metadata_ids.add(seq_id)
+            continue
+
+        sequence_record = sequences[seq_id]
+        record[seq_field] = str(sequence_record.seq).upper()
+        # Save processed sequence ids to be able to determine if sequences were unmatched
+        processed_sequence_ids.add(seq_id)
+
+        yield record
+
+    # Create summary of duplicate records if requested
+    duplicates_message = None
+    if duplicate_reporting is not DataErrorMethod.SILENT and (duplicate_metadata_ids or duplicate_sequence_ids):
+        duplicates_message = "The output may not match expectations because there were records with duplicate sequence ids."
+
+        if duplicate_metadata_ids:
+            duplicates_message += f"\nThe following sequence ids were duplicated in {metadata!r}:\n"
+            duplicates_message += "\n".join(map(repr, sorted(duplicate_metadata_ids)))
+
+        if duplicate_sequence_ids:
+            duplicates_message += f"\nThe following sequence ids were duplicated in {fasta!r}:\n"
+            duplicates_message += "\n".join(map(repr, sorted(duplicate_sequence_ids)))
+
+    # Create summary for unmatched records if requested
+    # Note this is where we find unmatched sequences because we can only do so after looping through all of the metadata
+    unmatched_message = None
+    unmatched_sequence_ids = sequence_ids - processed_sequence_ids
+    if unmatched_reporting is not DataErrorMethod.SILENT and (unmatched_metadata_ids or unmatched_sequence_ids):
+        unmatched_message = "The output may be incomplete because there were unmatched records."
+
+        if unmatched_metadata_ids:
+            unmatched_message += "\nThe following metadata records did not have a matching sequence:\n"
+            unmatched_message += "\n".join(map(repr, sorted(unmatched_metadata_ids)))
+
+        if unmatched_sequence_ids:
+            unmatched_message += "\nThe following sequence records did not have a matching metadata record:\n"
+            unmatched_message += "\n".join(map(repr, sorted(unmatched_sequence_ids)))
+
+
+    # Handle all the different combinations for warnings and errors for unmatched and duplicate records
+    # Make sure we output warnings before raising any errors
+    if duplicate_reporting is DataErrorMethod.WARN and duplicates_message is not None:
+        print_err(f"WARNING: {duplicates_message}")
+
+    if unmatched_reporting is DataErrorMethod.WARN and unmatched_message is not None:
+        print_err(f"WARNING: {unmatched_message}")
+
+    # Combine error messages so both messages can be included in the final error
+    error_message = ""
+    if duplicate_reporting is DataErrorMethod.ERROR_ALL and duplicates_message is not None:
+        error_message += "\n" + duplicates_message
+
+    # We need to check ERROR_FIRST here for unmatched sequences since we
+    # need to process all metadata records to know which sequences are unmatched
+    if unmatched_reporting in {DataErrorMethod.ERROR_FIRST, DataErrorMethod.ERROR_ALL} and unmatched_message is not None:
+        error_message += "\n" + unmatched_message
+
+    if error_message:
+        raise AugurError(f"Encountered the following error(s) when parsing metadata with sequences:{error_message}")
+
+
+def write_records_to_tsv(records, output_file):
+    """
+    Write each record from *records* as a single row to a TSV *output_file*.
+    Uses the keys of the first record as output column names.
+    Ignores extra keys in other records.
+    If records are missing keys, they will have an empty string as the value.
+
+    Parameters
+    ----------
+    records: iterator[dict]
+        Iterator that yields dicts of record data
+
+    output_file: str
+        Path to the output TSV file.
+        Accepts '-' to output TSV to stdout.
+    """
+    # Use the keys of the first record as output fields
+    try:
+        first_record = next(records)
+    except StopIteration:
+        raise AugurError(f"Unable to write records to {output_file} because provided records were empty.")
+
+    # Use the record keys as output columns since as of python 3.7 dicts retain insertion order
+    output_columns = list(first_record.keys())
+
+    # Special case single hyphen as stdout
+    if output_file == '-':
+        output_file = sys.stdout
+
+    with open_file(output_file, 'w', newline='') as output_metadata:
+        tsv_writer = csv.DictWriter(
+            output_metadata,
+            output_columns,
+            extrasaction='ignore',
+            delimiter='\t',
+            lineterminator='\n'
+        )
+        tsv_writer.writeheader()
+        tsv_writer.writerow(first_record)
+
+        for record in records:
+            tsv_writer.writerow(record)

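A usage sketch of the reader and writer added above (file paths are
hypothetical). Since read_table_to_dict() is a generator, the duplicate
summary for the WARN and ERROR_ALL methods is only produced once it has
been exhausted:

    from augur.io.metadata import read_table_to_dict, write_records_to_tsv
    from augur.types import DataErrorMethod

    # The delimiter is sniffed from the first 1024 characters, so the
    # same call handles both CSV and TSV inputs.
    records = read_table_to_dict(
        "metadata.tsv",
        duplicate_reporting=DataErrorMethod.WARN,  # warn instead of erroring
        id_column="strain",
    )

    # Records stream straight through to the TSV writer; the output
    # columns are taken from the keys of the first record.
    write_records_to_tsv(records, "output.tsv")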

=====================================
augur/io/sequences.py
=====================================
@@ -1,5 +1,6 @@
 import Bio.SeqIO
 
+from augur.errors import AugurError
 from .file import open_file
 
 
@@ -72,3 +73,51 @@ def write_sequences(sequences, path_or_buffer, format="fasta"):
         )
 
     return sequences_written
+
+
+def write_records_to_fasta(records, fasta, seq_id_field='strain', seq_field='sequence'):
+    """
+    Write sequences from dict *records* to a *fasta* file.
+    Yields the records with the *seq_field* dropped so that they can be consumed downstream.
+
+    Parameters
+    ----------
+    records: iterator[dict]
+        Iterator that yields dicts that contain sequences
+
+    fasta: str
+        Path to FASTA file
+
+    seq_id_field: str, optional
+        Field name for the sequence identifier
+
+    seq_field: str, optional
+        Field name for the genomic sequence
+
+    Yields
+    ------
+    dict:
+        A copy of the record with *seq_field* dropped
+
+    Raises
+    ------
+    AugurError:
+        When the sequence id field or sequence field does not exist in a record
+    """
+    with open_file(fasta, "w") as output_fasta:
+        for record in records:
+            if seq_id_field not in record:
+                raise AugurError(f"Provided sequence identifier field {seq_id_field!r} does not exist.")
+            if seq_field not in record:
+                raise AugurError(f"Provided sequence field {seq_field!r} does not exist.")
+
+            output_fasta.writelines([
+                f">{record[seq_id_field]}\n",
+                f"{record[seq_field]}\n"
+            ])
+
+            yield {
+                key: value
+                for key, value in record.items()
+                if key != seq_field
+            }

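Because write_records_to_fasta() is itself a generator, a caller must
consume its output for any sequences to be written. A minimal sketch
(hypothetical file name):

    from augur.io.sequences import write_records_to_fasta

    records = iter([
        {"strain": "sequence_A", "country": "USA", "sequence": "ATCG"},
        {"strain": "sequence_B", "country": "USA", "sequence": "TCGA"},
    ])

    # Sequences are written out as records flow through, and each
    # yielded record has the sequence field dropped.
    remaining = write_records_to_fasta(records, "sequences.fasta",
                                       seq_id_field="strain",
                                       seq_field="sequence")
    print(list(remaining))
    # [{'strain': 'sequence_A', 'country': 'USA'},
    #  {'strain': 'sequence_B', 'country': 'USA'}]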

=====================================
augur/types.py
=====================================
@@ -0,0 +1,13 @@
+import enum
+
+
+@enum.unique
+class DataErrorMethod(enum.Enum):
+    """
+    Enum representation of string values that represent how a data error should
+    be handled.
+    """
+    ERROR_FIRST     = 'error_first'
+    ERROR_ALL       = 'error_all'
+    WARN            = 'warn'
+    SILENT          = 'silent'

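The enum round-trips through argparse as in the shared curate parser
above: the CLI exposes the string values as choices, and run() converts
the chosen string back with the DataErrorMethod() constructor. A sketch:

    from augur.types import DataErrorMethod

    choices = [method.value for method in DataErrorMethod]
    print(choices)   # ['error_first', 'error_all', 'warn', 'silent']

    # Converting a CLI-provided string back to the enum member:
    assert DataErrorMethod("warn") is DataErrorMethod.WARN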

=====================================
docs/usage/cli/cli.rst
=====================================
@@ -10,6 +10,7 @@ We're in the process of adding examples and more extensive documentation for eac
 	:maxdepth: 1
 
 	parse
+	curate/index
 	index
 	filter
 	mask


=====================================
docs/usage/cli/curate/index.rst
=====================================
@@ -0,0 +1,21 @@
+==================
+augur curate
+==================
+
+This suite of commands is intended to help users curate their metadata for Nextstrain analyses.
+Each subcommand is designed to be tightly scoped to a single type of data transformation so curation pipelines can be easily customized.
+All subcommands share the same input and output options so a curation pipeline can begin with any subcommand and the output can be directly piped to any other curate subcommand.
+
+.. note::
+    If you need to parse metadata from FASTA headers, please continue to use :doc:`augur parse </usage/cli/parse>`.
+    The output metadata TSV and FASTA files can then be used as inputs for any augur curate subcommand.
+
+You'll find documentation for all augur curate subcommands below.
+We will continue to add more subcommands as we identify other common data curation tasks.
+
+.. toctree::
+    :maxdepth: 1
+
+    normalize-strings
+    passthru
+


=====================================
docs/usage/cli/curate/normalize-strings.rst
=====================================
@@ -0,0 +1,9 @@
+=================
+normalize-strings
+=================
+
+.. argparse::
+    :module: augur
+    :func: make_parser
+    :prog: augur
+    :path: curate normalize-strings


=====================================
docs/usage/cli/curate/passthru.rst
=====================================
@@ -0,0 +1,9 @@
+========
+passthru
+========
+
+.. argparse::
+    :module: augur
+    :func: make_parser
+    :prog: augur
+    :path: curate passthru


=====================================
setup.py
=====================================
@@ -60,7 +60,8 @@ setuptools.setup(
         "packaging >=19.2",
         "pandas >=1.0.0, ==1.*",
         "phylo-treetime >=0.9.3, ==0.9.*",
-        "xopen >=1.0.1, ==1.*"
+        "pyfastx >=0.8.4, ==0.8.*",
+        "xopen[zstd] >=1.7.0, ==1.*"
     ],
     extras_require = {
         'dev': [


=====================================
tests/functional/curate/cram/metadata-and-fasta-input.t
=====================================
@@ -0,0 +1,163 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Testing combined metadata and FASTA inputs for the curate command.
+Running the `passthru` subcommand since it does not do any data transformations.
+
+Create FASTA file for testing.
+
+  $ cat >$TMP/sequences.fasta <<~~
+  > >sequence_A
+  > ATCG
+  > >sequence_B
+  > TCGA
+  > >sequence_C
+  > CGAT
+  > ~~
+
+Create metadata TSV file for testing.
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	country	date
+  > sequence_A	USA	2020-10-01
+  > sequence_B	USA	2020-10-02
+  > sequence_C	USA	2020-10-03
+  > ~~
+
+Test metadata input with extra FASTA input options without a FASTA file.
+This is expected to fail with an error.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --seq-id-column name \
+  > --seq-field sequences
+  ERROR: The --seq-id-column and --seq-field options should only be used when providing a FASTA file.
+  [2]
+
+
+Test metadata and FASTA inputs without required FASTA input options.
+This is expected to fail with an error.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --fasta $TMP/sequences.fasta
+  ERROR: The --seq-id-column and --seq-field options are required for a FASTA file input.
+  [2]
+
+Test metadata and FASTA inputs with required FASTA input options.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --fasta $TMP/sequences.fasta \
+  > --seq-id-column strain \
+  > --seq-field seq
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "seq": "ATCG"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "seq": "TCGA"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "seq": "CGAT"}
+
+Create new metadata file with duplicate and extra metadata records.
+
+  $ cp $TMP/metadata.tsv $TMP/metadata-with-duplicate-and-unmatched-records.tsv
+  $ cat >>$TMP/metadata-with-duplicate-and-unmatched-records.tsv <<~~
+  > sequence_A	USA	2020-10-XX
+  > extra_metadata_A	USA	2020-10-01
+  > extra_metadata_B	USA	2020-10-02
+  > ~~
+
+Create new FASTA file with duplicate and extra sequence records.
+
+  $ cp $TMP/sequences.fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta
+  $ cat >>$TMP/sequences-with-duplicate-and-unmatched-records.fasta <<~~
+  > >sequence_A
+  > NNNN
+  > >extra_sequence_A
+  > ATCG
+  > >extra_sequence_B
+  > TCGA
+  > ~~
+
+Test metadata and FASTA inputs with duplicate and extra records and default `ERROR_FIRST` reporting.
+This is expected to fail with an error, so redirecting stdout since we don't care about the output.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \
+  > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \
+  > --seq-id-column strain \
+  > --seq-field seq 1> /dev/null
+  ERROR: Encountered sequence record with duplicate id 'sequence_A'.
+  [2]
+
+Test metadata and FASTA inputs with duplicate and extra records with `ERROR_ALL` reporting.
+This is expected to fail with an error, so redirecting stdout since we don't care about the output.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \
+  > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \
+  > --seq-id-column strain \
+  > --seq-field seq \
+  > --unmatched-reporting error_all \
+  > --duplicate-reporting error_all 1> /dev/null
+  ERROR: Encountered the following error(s) when parsing metadata with sequences:
+  The output may not match expectations because there were records with duplicate sequence ids.
+  The following sequence ids were duplicated in .*metadata-with-duplicate-and-unmatched-records.* (re)
+  'sequence_A'
+  The following sequence ids were duplicated in .*sequences-with-duplicate-and-unmatched-records.* (re)
+  'sequence_A'
+  The output may be incomplete because there were unmatched records.
+  The following metadata records did not have a matching sequence:
+  'extra_metadata_A'
+  'extra_metadata_B'
+  The following sequence records did not have a matching metadata record:
+  'extra_sequence_A'
+  'extra_sequence_B'
+  [2]
+
+Test metadata and FASTA inputs with unmatched records, but ask to only warn on unmatched and duplicates.
+This is expected to run without error and only print warnings.
+Notice the duplicate sequence "sequence_A" will always use the first sequence in the FASTA file because of pyfastx.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \
+  > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \
+  > --seq-id-column strain \
+  > --seq-field seq \
+  > --unmatched-reporting warn \
+  > --duplicate-reporting warn
+  WARNING: Encountered sequence record with duplicate id 'sequence_A'.
+  WARNING: Encountered metadata record with duplicate id 'sequence_A'.
+  WARNING: Encountered metadata record 'extra_metadata_A' without a matching sequence.
+  WARNING: Encountered metadata record 'extra_metadata_B' without a matching sequence.
+  WARNING: The output may not match expectations because there were records with duplicate sequence ids.
+  The following sequence ids were duplicated in .*metadata-with-duplicate-and-unmatched-records.* (re)
+  'sequence_A'
+  The following sequence ids were duplicated in .*sequences-with-duplicate-and-unmatched-records.* (re)
+  'sequence_A'
+  WARNING: The output may be incomplete because there were unmatched records.
+  The following metadata records did not have a matching sequence:
+  'extra_metadata_A'
+  'extra_metadata_B'
+  The following sequence records did not have a matching metadata record:
+  'extra_sequence_A'
+  'extra_sequence_B'
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "seq": "ATCG"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "seq": "TCGA"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "seq": "CGAT"}
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-XX", "seq": "ATCG"}
+
+Test metadata and FASTA inputs with unmatched records in both, but ask to silence unmatched and duplicate reporting.
+Notice the duplicate sequence "sequence_A" will always use the first sequence in the FASTA file because of pyfastx.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \
+  > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \
+  > --seq-id-column strain \
+  > --seq-field seq \
+  > --unmatched-reporting silent \
+  > --duplicate-reporting silent
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "seq": "ATCG"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "seq": "TCGA"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "seq": "CGAT"}
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-XX", "seq": "ATCG"}
+


=====================================
tests/functional/curate/cram/metadata-and-fasta-output.t
=====================================
@@ -0,0 +1,92 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Testing combined metadata and FASTA output for the curate command.
+Running the `passthru` subcommand since it does not do any data transformations.
+
+Create NDJSON file for testing.
+
+  $ cat >$TMP/records.ndjson <<~~
+  > {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "sequence": "AAAA"}
+  > {"strain": "sequence_T", "country": "USA", "date": "2020-10-02", "sequence": "TTTT"}
+  > {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "sequence": "CCCC"}
+  > ~~
+
+Test metadata output with extra FASTA output options.
+This is expected to fail immediately with an error.
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-metadata $TMP/metadata.tsv \
+  >     --output-id-field strain \
+  >     --output-seq-field sequence
+  ERROR: The --output-id-field and --output-seq-field options should only be used when requesting a FASTA output.
+  [2]
+
+Test metadata and FASTA outputs without required FASTA output options.
+This is expected to fail immediately with an error.
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-metadata $TMP/metadata.tsv \
+  >     --output-fasta $TMP/sequences.fasta
+  ERROR: The --output-id-field and --output-seq-field options are required for a FASTA output.
+  [2]
+
+Test metadata and FASTA outputs
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-metadata $TMP/metadata.tsv \
+  >     --output-fasta $TMP/sequences.fasta \
+  >     --output-id-field strain \
+  >     --output-seq-field sequence
+  $ cat $TMP/metadata.tsv
+  strain\tcountry\tdate (esc)
+  sequence_A\tUSA\t2020-10-01 (esc)
+  sequence_T\tUSA\t2020-10-02 (esc)
+  sequence_C\tUSA\t2020-10-03 (esc)
+  $ cat $TMP/sequences.fasta
+  >sequence_A (esc)
+  AAAA (esc)
+  >sequence_T (esc)
+  TTTT (esc)
+  >sequence_C (esc)
+  CCCC
+
+Test FASTA output without metadata output.
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-fasta $TMP/sequences.fasta \
+  >     --output-id-field strain \
+  >     --output-seq-field sequence
+  $ cat $TMP/sequences.fasta
+  >sequence_A (esc)
+  AAAA (esc)
+  >sequence_T (esc)
+  TTTT (esc)
+  >sequence_C (esc)
+  CCCC
+
+Test FASTA output with bad output id field.
+This is expected to fail with an error.
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-fasta $TMP/sequences.fasta \
+  >     --output-id-field bogus_id \
+  >     --output-seq-field sequence
+  ERROR: Provided sequence identifier field 'bogus_id' does not exist.
+  [2]
+
+Test FASTA output with bad output sequence field.
+This is expected to fail with an error.
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-fasta $TMP/sequences.fasta \
+  >     --output-id-field strain \
+  >     --output-seq-field bogus_sequence
+  ERROR: Provided sequence field 'bogus_sequence' does not exist.
+  [2]


=====================================
tests/functional/curate/cram/metadata-input.t
=====================================
@@ -0,0 +1,139 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Testing metadata inputs for the curate command.
+Running the `passthru` subcommand since it does not do any data transformations.
+
+Create metadata TSV file for testing.
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	country	date
+  > sequence_A	USA	2020-10-01
+  > sequence_B	USA	2020-10-02
+  > sequence_C	USA	2020-10-03
+  > ~~
+
+Test TSV metadata input
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+Test TSV metadata input from stdin
+
+  $ cat $TMP/metadata.tsv \
+  >   | ${AUGUR} curate normalize-strings \
+  >     --metadata -
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+Create metadata CSV file for testing.
+
+  $ cat >$TMP/metadata.csv <<~~
+  > strain,country,date
+  > sequence_A,USA,2020-10-01
+  > sequence_B,USA,2020-10-02
+  > sequence_C,USA,2020-10-03
+  > ~~
+
+Test CSV metadata input
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.csv
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+Test CSV metadata input from stdin
+
+  $ cat $TMP/metadata.csv \
+  >   | ${AUGUR} curate normalize-strings \
+  >     --metadata -
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+
+Create a metadata TSV file with duplicate records
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	country	date
+  > sequence_A	USA	2020-10-01
+  > sequence_B	USA	2020-10-02
+  > sequence_C	USA	2020-10-03
+  > sequence_A	USA	2020-10-01
+  > sequence_B	USA	2020-10-02
+  > sequence_C	USA	2020-10-03
+  > ~~
+
+Test default options for duplicate records, which is expected to exit with an error on the first duplicate.
+There will still be output due to the nature of the chained generators in augur curate.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv
+  ERROR: Encountered record with duplicate id 'sequence_A' in .* (re)
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  [2]
+
+Test error_all on duplicate records.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --duplicate-reporting error_all
+  ERROR: The following records are duplicated in .* (re)
+  'sequence_A'
+  'sequence_B'
+  'sequence_C'
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  [2]
+
+Test warning on duplicate records.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --duplicate-reporting warn
+  WARNING: Encountered record with duplicate id 'sequence_A' in .* (re)
+  WARNING: Encountered record with duplicate id 'sequence_B' in .* (re)
+  WARNING: Encountered record with duplicate id 'sequence_C' in .* (re)
+  WARNING: The following records are duplicated in .* (re)
+  'sequence_A'
+  'sequence_B'
+  'sequence_C'
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+Test silent on duplicate records.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --duplicate-reporting silent
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+
+Test duplicate records with a bogus id column, which is expected to fail with an error.
+
+  $ ${AUGUR} curate passthru \
+  > --metadata $TMP/metadata.tsv \
+  > --id-column "bogus_id"
+  ERROR: The provided id column 'bogus_id' does not exist in .* (re)
+  [2]

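The ordering above (records still emitted alongside the duplicate error or warnings)
follows from the chained-generator design the test mentions: the reader yields each
record downstream as soon as it is parsed, so unique records reach the output before
a later duplicate raises. A minimal sketch of such a streaming duplicate check,
using a hypothetical check_duplicates helper (not augur's actual implementation):

  from augur.errors import AugurError

  def check_duplicates(records, id_column="strain"):
      # Yield records as they stream past; raise only when a duplicate id
      # is actually encountered, so earlier records have already reached
      # the downstream writer.
      seen = set()
      for record in records:
          record_id = record[id_column]
          if record_id in seen:
              raise AugurError(f"Encountered record with duplicate id {record_id!r}")
          seen.add(record_id)
          yield record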

=====================================
tests/functional/curate/cram/metadata-output.t
=====================================
@@ -0,0 +1,32 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Testing metadata output for the curate command.
+Running the `passthru` subcommand since it does not do any data transformations.
+
+Create NDJSON file for testing.
+
+  $ cat >$TMP/records.ndjson <<~~
+  > {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"}
+  > {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
+  > {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}
+  > ~~
+Test metadata output TSV
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-metadata $TMP/metadata.tsv
+  $ cat $TMP/metadata.tsv
+  strain\tcountry\tdate (esc)
+  sequence_A\tUSA\t2020-10-01 (esc)
+  sequence_B\tUSA\t2020-10-02 (esc)
+  sequence_C\tUSA\t2020-10-03 (esc)
+Test metadata output TSV to stdout
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >     --output-metadata -
+  strain\tcountry\tdate (esc)
+  sequence_A\tUSA\t2020-10-01 (esc)
+  sequence_B\tUSA\t2020-10-02 (esc)
+  sequence_C\tUSA\t2020-10-03 (esc)

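The second test above relies on "-" meaning stdout for --output-metadata. A common
way to support that, shown here as a sketch rather than augur's actual
implementation:

  import sys
  from contextlib import contextmanager

  @contextmanager
  def open_output(path):
      # Treat "-" as stdout; otherwise open a regular file for writing.
      if path == "-":
          yield sys.stdout
      else:
          with open(path, "w", newline="") as handle:
              yield handle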

=====================================
tests/functional/curate/cram/normalize_strings.t
=====================================
@@ -0,0 +1,42 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Test two versions of C-cedilla that look the same visually but
+have different code points and are therefore considered "Not equal".
+
+  $ export DIACRITIC_1="Ç"
+  $ export DIACRITIC_2="Ç"
+  $ [[ "${DIACRITIC_1}" == "${DIACRITIC_2}" ]] && echo "Equal" || echo "Not equal"
+  Not equal
+
+Create NDJSON file for testing normalize-strings with different forms
+
+  $ cat >$TMP/records.ndjson <<~~
+  > {"record": 1, "diacritic_1": "${DIACRITIC_1}", "diacritic_2": "${DIACRITIC_2}"}
+  > ~~
+
+Test output with default Unicode normalization form "NFC".
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate normalize-strings
+  {"record": 1, "diacritic_1": "\u00c7", "diacritic_2": "\u00c7"}
+
+Test output with Unicode normalization form "NFKC".
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate normalize-strings --form NFKC
+  {"record": 1, "diacritic_1": "\u00c7", "diacritic_2": "\u00c7"}
+
+Test output with Unicode normalization form "NFD".
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate normalize-strings --form NFD
+  {"record": 1, "diacritic_1": "C\u0327", "diacritic_2": "C\u0327"}
+
+Test output with Unicode normalization form "NFKD".
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate normalize-strings --form NFKD
+  {"record": 1, "diacritic_1": "C\u0327", "diacritic_2": "C\u0327"}

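The four --form values map onto Python's unicodedata normalization forms: NFC and
NFKC compose the C-cedilla into the single code point U+00C7, while NFD and NFKD
decompose it into "C" plus the combining cedilla U+0327, exactly as the escaped
JSON output above shows. The same effect in plain Python:

  import unicodedata

  composed = "\u00c7"     # Ç as one precomposed code point
  decomposed = "C\u0327"  # C followed by a combining cedilla

  assert composed != decomposed  # the "Not equal" case above
  assert unicodedata.normalize("NFC", decomposed) == composed
  assert unicodedata.normalize("NFD", composed) == decomposed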

=====================================
tests/functional/curate/cram/passthru.t
=====================================
@@ -0,0 +1,15 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+Create NDJSON file for testing all valid JSON data types.
+
+  $ cat >$TMP/records.ndjson <<~~
+  > {"string": "string", "number": 123, "object": {"string": "string"}, "array": ["string0", "string1", "string2"], "boolean1": true, "boolean2": false, "null": null}
+  > ~~
+
+Output should be exactly the same as the input.
+
+  $ cat $TMP/records.ndjson | ${AUGUR} curate passthru
+  {"string": "string", "number": 123, "object": {"string": "string"}, "array": ["string0", "string1", "string2"], "boolean1": true, "boolean2": false, "null": null}

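The round-trip works because every value in the record is a native JSON type, so
parsing and re-serialising each NDJSON line is lossless. Illustrated with the
standard library (key order survives because dicts preserve insertion order):

  import json

  line = '{"number": 123, "boolean": true, "null": null}'
  record = json.loads(line)           # int, bool, and None survive parsing
  assert json.dumps(record) == line   # and serialise back identically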

=====================================
tests/io/test_file.py
=====================================
@@ -45,7 +45,7 @@ class TestFile:
         with augur.io.file.open_file(path) as f_read:
             assert f_read.read() == 'foo\nbar\n'
 
-    def test_open_file_read_lzma(self, tmpdir):
+    def test_open_file_write_lzma(self, tmpdir):
         """Write a text file compressed with LZMA."""
         import lzma
         path = str(tmpdir / 'test.txt.xz')
@@ -53,3 +53,21 @@ class TestFile:
             f_write.write('foo\nbar\n')
         with lzma.open(path, 'rt') as f_read:
             assert f_read.read() == 'foo\nbar\n'
+
+    def test_open_file_read_zstd(self, tmpdir):
+        """Read a text file compressed with zstd."""
+        import zstandard as zstd
+        path = str(tmpdir / 'test.txt.zst')
+        with zstd.open(path, 'wt') as f_write:
+            f_write.write('foo\nbar\n')
+        with augur.io.file.open_file(path) as f_read:
+            assert f_read.read() == 'foo\nbar\n'
+
+    def test_open_file_write_zstd(self, tmpdir):
+        """Write a text file compressed with zstd."""
+        import zstandard as zstd
+        path = str(tmpdir / 'test.txt.zst')
+        with augur.io.file.open_file(path, 'w') as f_write:
+            f_write.write('foo\nbar\n')
+        with zstd.open(path, 'rt') as f_read:
+            assert f_read.read() == 'foo\nbar\n'

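These tests pin down that open_file transparently handles zstd alongside the
existing LZMA support, with zstandard's zstd.open doing the compression in the test
itself. A sketch of the general idea, dispatching on the file suffix (augur's real
open_file is more general than this):

  import gzip
  import lzma

  import zstandard as zstd

  def open_compressed(path, mode="rt"):
      # Pick a compression codec from the extension; fall back to plain open.
      if path.endswith(".xz"):
          return lzma.open(path, mode)
      if path.endswith(".zst"):
          return zstd.open(path, mode)
      if path.endswith(".gz"):
          return gzip.open(path, mode)
      return open(path, mode)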

=====================================
tests/io/test_metadata.py
=====================================
@@ -0,0 +1,507 @@
+import pytest
+import shutil
+import sys
+from io import StringIO
+
+from augur.errors import AugurError
+from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
+from augur.types import DataErrorMethod
+
+
+@pytest.fixture
+def expected_record():
+    return {
+        'strain': 'SEQ_A',
+        'date': '2020-10-03',
+        'country': 'USA'
+    }
+
+@pytest.fixture
+def metadata_with_duplicate(tmpdir):
+    path = str(tmpdir / 'metadata.tsv')
+    with open(path, 'w') as fh:
+        fh.write('strain\tdate\tcountry\n')
+        fh.write('SEQ_A\t2020-10-03\tUSA\n')
+        fh.write('SEQ_A\t2020-10-03\tUSA\n')
+        fh.write('SEQ_B\t2020-10-03\tUSA\n')
+        fh.write('SEQ_B\t2020-10-03\tUSA\n')
+    return path
+
+@pytest.fixture
+def mp_context(monkeypatch):
+    with monkeypatch.context() as mp:
+        yield mp
+
+class TestReadMetadataToDict:
+    def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
+        path = str(tmpdir / 'metadata.csv')
+        with open(path, 'w') as fh:
+            fh.write('strain,date,country\n')
+            fh.write('SEQ_A,2020-10-03,USA\n')
+
+        record = next(read_table_to_dict(path))
+        assert record == expected_record
+
+    def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record):
+        stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n')
+        mp_context.setattr('sys.stdin', stdin)
+        record = next(read_table_to_dict(sys.stdin))
+        assert record == expected_record
+
+    def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
+        path = str(tmpdir / 'metadata.tsv')
+        with open(path, 'w') as fh:
+            fh.write('strain\tdate\tcountry\n')
+            fh.write('SEQ_A\t2020-10-03\tUSA\n')
+
+        record = next(read_table_to_dict(path))
+        assert record == expected_record
+
+    def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record):
+        stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n')
+        mp_context.setattr('sys.stdin', stdin)
+        record = next(read_table_to_dict(sys.stdin))
+        assert record == expected_record
+
+    def test_read_table_to_dict_with_bad_delimiter(self, tmpdir):
+        path = str(tmpdir / 'metadata.txt')
+        with open(path, 'w') as fh:
+            fh.write('strain date country\n')
+            fh.write('SEQ_A 2020-10-03 USA\n')
+
+        with pytest.raises(AugurError) as e_info:
+            next(read_table_to_dict(path))
+
+        assert str(e_info.value) == f"Could not determine the delimiter of {path!r}. File must be a CSV or TSV."
+
+    @pytest.mark.parametrize('id_column', ['strain', None])
+    def test_read_table_to_dict_with_duplicates(self, metadata_with_duplicate, id_column):
+        with pytest.raises(AugurError) as e_info:
+            list(read_table_to_dict(metadata_with_duplicate, id_column=id_column))
+        assert str(e_info.value) == f"Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}"
+
+    @pytest.mark.parametrize('id_column', ['strain', None])
+    def test_read_table_to_dict_with_duplicates_error_all(self, metadata_with_duplicate, id_column):
+        with pytest.raises(AugurError) as e_info:
+            list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod("error_all"), id_column=id_column))
+        assert str(e_info.value) == f"The following records are duplicated in {metadata_with_duplicate!r}:\n'SEQ_A'\n'SEQ_B'"
+
+    @pytest.mark.parametrize('id_column', ['strain', None])
+    def test_read_table_to_dict_with_duplicates_warning(self, capsys, metadata_with_duplicate, id_column):
+        list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('warn'), id_column=id_column))
+        captured = capsys.readouterr()
+        assert captured.err == (
+            f"WARNING: Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}\n"
+            f"WARNING: Encountered record with duplicate id 'SEQ_B' in {metadata_with_duplicate!r}\n"
+            f"WARNING: The following records are duplicated in {metadata_with_duplicate!r}:\n'SEQ_A'\n'SEQ_B'\n"
+        )
+
+    def test_read_table_to_dict_with_duplicates_silent(self, capsys, metadata_with_duplicate):
+        list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('silent')))
+        assert "WARNING" not in capsys.readouterr().err
+
+    def test_read_table_to_dict_with_duplicate_and_bad_id(self, metadata_with_duplicate):
+        id_column = "bad_id"
+        with pytest.raises(AugurError) as e_info:
+            list(read_table_to_dict(metadata_with_duplicate, id_column=id_column))
+        assert str(e_info.value) == f"The provided id column {id_column!r} does not exist in {metadata_with_duplicate!r}."
+
+
+@pytest.fixture
+def fasta_file(tmpdir):
+    path = str(tmpdir / 'sequences.fasta')
+    with open(path, 'w') as fh:
+        fh.writelines([
+            '>SEQ_A\nAAAA\n',
+            '>SEQ_T\nTTTT\n',
+            '>SEQ_C\nCCCC\n',
+            '>SEQ_G\nGGGG\n'
+        ])
+    return path
+
+@pytest.fixture
+def metadata_file(tmpdir):
+    path = str(tmpdir / 'metadata.tsv')
+    with open(path, 'w') as fh:
+        fh.writelines([
+            'strain\tcountry\tdate\n',
+            'SEQ_A\tUSA\t2020-10-01\n',
+            'SEQ_T\tUSA\t2020-10-02\n',
+            'SEQ_C\tUSA\t2020-10-03\n',
+            'SEQ_G\tUSA\t2020-10-04\n'
+        ])
+    return path
+
+def unmatched_sequences():
+    return [
+        '>EXTRA_SEQ_A\nAAAAA\n',
+        '>EXTRA_SEQ_T\nTTTTT\n'
+    ]
+
+def unmatched_metadata():
+    return [
+        'EXTRA_METADATA_A\tUSA\t2020-10-01\n',
+        'EXTRA_METADATA_T\tUSA\t2020-10-02\n',
+    ]
+
+def dup_sequences():
+    return [
+        '>SEQ_A\nNNNN\n',
+        '>SEQ_T\nNNNN\n',
+    ]
+
+def dup_metadata():
+    return [
+        'SEQ_C\tUSA\t2020-10-XX\n',
+        'SEQ_G\tUSA\t2020-10-XX\n',
+    ]
+
+def copy_and_append_to_file(src, dst, appended_content):
+    shutil.copy(src, dst)
+    with open(dst, 'a') as fh:
+        fh.writelines(appended_content)
+    return dst
+
+@pytest.fixture
+def fasta_with_unmatched(tmpdir, fasta_file):
+    path = str(tmpdir / 'extra-sequences.fasta')
+    return copy_and_append_to_file(fasta_file, path, unmatched_sequences())
+
+@pytest.fixture
+def metadata_with_unmatched(tmpdir, metadata_file):
+    path = str(tmpdir / 'extra-metadata.tsv')
+    return copy_and_append_to_file(metadata_file, path, unmatched_metadata())
+
+@pytest.fixture
+def fasta_with_dup(tmpdir, fasta_file):
+    path = str(tmpdir / 'dup-sequences.fasta')
+    return copy_and_append_to_file(fasta_file, path, dup_sequences())
+
+@pytest.fixture
+def metadata_with_dup(tmpdir, metadata_file):
+    path = str(tmpdir / 'dup-metadata.tsv')
+    return copy_and_append_to_file(metadata_file, path, dup_metadata())
+
+@pytest.fixture
+def fasta_with_unmatched_and_dup(tmpdir, fasta_file):
+    path = str(tmpdir / 'extra-and-dup-sequences.fasta')
+    return copy_and_append_to_file(fasta_file, path, unmatched_sequences() + dup_sequences())
+
+@pytest.fixture
+def metadata_with_unmatched_and_dup(tmpdir, metadata_file):
+    path = str(tmpdir / 'extra-and-dup-metadata.tsv')
+    return copy_and_append_to_file(metadata_file, path, dup_metadata() + unmatched_metadata()) #TODO: CHANGE ORDER HERE
+
+class TestReadMetadataWithSequence:
+    def test_read_metadata_with_sequence(self, metadata_file, fasta_file):
+        records = list(read_metadata_with_sequences(metadata_file, fasta_file, 'strain'))
+        assert len(records) == 4
+        for record in records:
+            seq_base = record['strain'].split("_")[-1].upper()
+            expected_sequence = seq_base * 4
+            assert record['sequence'] == expected_sequence
+
+    def test_read_metadata_with_sequences_with_bad_id(self, metadata_file, fasta_file):
+        id_field = "bad_id"
+        with pytest.raises(AugurError) as e_info:
+            next(read_metadata_with_sequences(metadata_file, fasta_file, id_field))
+        assert str(e_info.value) == f"The provided sequence id column {id_field!r} does not exist in the metadata."
+
+    def test_read_metadata_with_sequences_with_unmatched(self, metadata_with_unmatched, fasta_with_unmatched):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(metadata_with_unmatched, fasta_with_unmatched, 'strain'))
+        assert str(e_info.value) == "Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence."
+
+    def test_read_metadata_with_sequences_with_unmatched_error_all(self, metadata_with_unmatched, fasta_with_unmatched):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(
+                metadata_with_unmatched,
+                fasta_with_unmatched,
+                'strain',
+                unmatched_reporting=DataErrorMethod.ERROR_ALL))
+        assert str(e_info.value) == (
+            "Encountered the following error(s) when parsing metadata with sequences:\n"
+            "The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'"
+        )
+
+    def test_read_metadata_with_sequences_with_unmatched_warning(self, capsys, metadata_with_unmatched, fasta_with_unmatched):
+        records = list(read_metadata_with_sequences(
+            metadata_with_unmatched,
+            fasta_with_unmatched,
+            'strain',
+            unmatched_reporting=DataErrorMethod.WARN))
+        assert len(records) == 4
+        assert [record['strain'] for record in records] == ['SEQ_A', 'SEQ_T', 'SEQ_C', 'SEQ_G']
+
+        captured = capsys.readouterr()
+        assert captured.err == (
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence.\n"
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_T' without a matching sequence.\n"
+            "WARNING: The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'\n"
+        )
+
+    def test_read_metadata_with_sequences_with_unmatched_silent(self, capsys, metadata_with_unmatched, fasta_with_unmatched):
+        records = list(read_metadata_with_sequences(
+            metadata_with_unmatched,
+            fasta_with_unmatched,
+            'strain',
+            unmatched_reporting=DataErrorMethod.SILENT))
+        assert len(records) == 4
+        assert [record['strain'] for record in records] == ['SEQ_A', 'SEQ_T', 'SEQ_C', 'SEQ_G']
+        assert "WARNING" not in capsys.readouterr().err
+
+    def test_read_metadata_with_sequences_with_dup_metadata(self, metadata_with_dup, fasta_file):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(metadata_with_dup, fasta_file, 'strain'))
+        assert str(e_info.value) == "Encountered metadata record with duplicate id 'SEQ_C'."
+
+    def test_read_metadata_with_sequences_with_dup_fasta(self, metadata_file, fasta_with_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(metadata_file, fasta_with_dup, 'strain'))
+        assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'."
+
+    def test_read_metadata_with_sequences_with_dup_both(self, metadata_with_dup, fasta_with_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(metadata_with_dup, fasta_with_dup, 'strain'))
+        # Expected to error on first duplicate sequence since we check sequences first
+        assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'."
+
+    def test_read_metadata_with_sequences_with_dup_error_all(self, metadata_with_dup, fasta_with_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(
+                metadata_with_dup,
+                fasta_with_dup,
+                'strain',
+                duplicate_reporting=DataErrorMethod.ERROR_ALL
+            ))
+        assert str(e_info.value) == (
+            "Encountered the following error(s) when parsing metadata with sequences:\n"
+            "The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'"
+        )
+
+    def test_read_metadata_with_sequences_with_dup_warn(self, capsys, metadata_with_dup, fasta_with_dup):
+        records = list(read_metadata_with_sequences(
+            metadata_with_dup,
+            fasta_with_dup,
+            'strain',
+            duplicate_reporting=DataErrorMethod.WARN
+        ))
+        assert len(records) == 6
+        assert [record['strain'] for record in records] == ['SEQ_A', 'SEQ_T', 'SEQ_C', 'SEQ_G', 'SEQ_C', 'SEQ_G']
+
+        captured = capsys.readouterr()
+        assert captured.err == (
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_A'.\n"
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_T'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_C'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_G'.\n"
+            "WARNING: The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'\n"
+        )
+
+    def test_read_metadata_with_sequences_with_dup_silent(self, capsys, metadata_with_dup, fasta_with_dup):
+        records = list(read_metadata_with_sequences(
+            metadata_with_dup,
+            fasta_with_dup,
+            'strain',
+            duplicate_reporting=DataErrorMethod.SILENT
+        ))
+        assert len(records) == 6
+        assert [record['strain'] for record in records] == ['SEQ_A', 'SEQ_T', 'SEQ_C', 'SEQ_G', 'SEQ_C', 'SEQ_G']
+        assert "WARNING" not in capsys.readouterr().err
+
+    def test_read_metadata_with_sequences_with_extra_and_dup(self, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup, 'strain'))
+        # Expected to error on first duplicate sequence since we check duplicate sequences first
+        assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'."
+
+    def test_read_metadata_with_sequences_with_extra_and_dup_error_all(self, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(
+                metadata_with_unmatched_and_dup,
+                fasta_with_unmatched_and_dup,
+                'strain',
+                unmatched_reporting=DataErrorMethod.ERROR_ALL,
+                duplicate_reporting=DataErrorMethod.ERROR_ALL
+            ))
+        assert str(e_info.value) == (
+            "Encountered the following error(s) when parsing metadata with sequences:\n"
+            "The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_unmatched_and_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_unmatched_and_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'\n"
+            "The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'"
+        )
+
+    def test_read_metadata_with_sequences_with_extra_and_dup_warn_unmatched(self, capsys, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(
+                metadata_with_unmatched_and_dup,
+                fasta_with_unmatched_and_dup,
+                'strain',
+                unmatched_reporting=DataErrorMethod.WARN,
+                duplicate_reporting=DataErrorMethod.ERROR_ALL
+            ))
+        # We should see warnings for the unmatched records before the error is raised
+        captured = capsys.readouterr()
+        assert captured.err == (
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence.\n"
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_T' without a matching sequence.\n"
+            "WARNING: The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'\n"
+        )
+        assert str(e_info.value) == (
+            "Encountered the following error(s) when parsing metadata with sequences:\n"
+            "The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_unmatched_and_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_unmatched_and_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'"
+        )
+
+    def test_read_metadata_with_sequences_with_extra_and_dup_warn_dups(self, capsys, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup):
+        with pytest.raises(AugurError) as e_info:
+            list(read_metadata_with_sequences(
+                metadata_with_unmatched_and_dup,
+                fasta_with_unmatched_and_dup,
+                'strain',
+                unmatched_reporting=DataErrorMethod.ERROR_ALL,
+                duplicate_reporting=DataErrorMethod.WARN
+            ))
+        # We should see warnings for the duplicate records before the error is raised
+        captured = capsys.readouterr()
+        assert captured.err == (
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_A'.\n"
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_T'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_C'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_G'.\n"
+            "WARNING: The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_unmatched_and_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_unmatched_and_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'\n"
+        )
+
+        assert str(e_info.value) == (
+            "Encountered the following error(s) when parsing metadata with sequences:\n"
+            "The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'"
+        )
+
+    def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup):
+        records = list(read_metadata_with_sequences(
+            metadata_with_unmatched_and_dup,
+            fasta_with_unmatched_and_dup,
+            'strain',
+            unmatched_reporting=DataErrorMethod.WARN,
+            duplicate_reporting=DataErrorMethod.WARN
+        ))
+        assert len(records) == 6
+        assert [record['strain'] for record in records] == ['SEQ_A', 'SEQ_T', 'SEQ_C', 'SEQ_G', 'SEQ_C', 'SEQ_G']
+
+        captured = capsys.readouterr()
+        assert captured.err == (
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_A'.\n"
+            "WARNING: Encountered sequence record with duplicate id 'SEQ_T'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_C'.\n"
+            "WARNING: Encountered metadata record with duplicate id 'SEQ_G'.\n"
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence.\n"
+            "WARNING: Encountered metadata record 'EXTRA_METADATA_T' without a matching sequence.\n"
+            "WARNING: The output may not match expectations because there were records with duplicate sequence ids.\n"
+            f"The following sequence ids were duplicated in {metadata_with_unmatched_and_dup!r}:\n"
+            "'SEQ_C'\n'SEQ_G'\n"
+            f"The following sequence ids were duplicated in {fasta_with_unmatched_and_dup!r}:\n"
+            "'SEQ_A'\n'SEQ_T'\n"
+            "WARNING: The output may be incomplete because there were unmatched records.\n"
+            "The following metadata records did not have a matching sequence:\n"
+            "'EXTRA_METADATA_A'\n'EXTRA_METADATA_T'\n"
+            "The following sequence records did not have a matching metadata record:\n"
+            "'EXTRA_SEQ_A'\n'EXTRA_SEQ_T'\n"
+        )
+
+@pytest.fixture
+def output_records():
+    return iter([
+        {"strain": "SEQ_A", "country": "USA", "date": "2020-10-01"},
+        {"strain": "SEQ_T", "country": "USA", "date": "2020-10-02"}
+    ])
+
+@pytest.fixture
+def expected_output_tsv():
+    return (
+        "strain\tcountry\tdate\n"
+        "SEQ_A\tUSA\t2020-10-01\n"
+        "SEQ_T\tUSA\t2020-10-02\n"
+    )
+
+class TestWriteRecordsToTsv:
+    def test_write_records_to_tsv(self, tmpdir, output_records, expected_output_tsv):
+        output_tsv = tmpdir / "output.tsv"
+        write_records_to_tsv(output_records, output_tsv)
+        with open(output_tsv, 'r') as output_fh:
+            assert output_fh.read() == expected_output_tsv
+
+    def test_write_records_to_tsv_stdout(self, capsys, output_records, expected_output_tsv):
+        write_records_to_tsv(output_records, '-')
+        captured = capsys.readouterr()
+        assert captured.out == expected_output_tsv
+
+    def test_write_records_to_tsv_with_extra_keys(self, capsys):
+        records_with_extra_keys = iter([
+            {"key_1": "value_1", "key_2": "value_2"},
+            {"key_1": "value_1", "key_2": "value_2", "key_3": "value_3"}
+        ])
+        write_records_to_tsv(records_with_extra_keys, '-')
+        captured = capsys.readouterr()
+        assert captured.out == (
+            "key_1\tkey_2\n"
+            "value_1\tvalue_2\n"
+            "value_1\tvalue_2\n"
+        )
+
+    def test_write_records_to_tsv_with_missing_keys(self, capsys):
+        records_with_missing_keys = iter([
+            {"key_1": "value_1", "key_2": "value_2"},
+            {"key_2": "value_2"}
+        ])
+        write_records_to_tsv(records_with_missing_keys, '-')
+        captured = capsys.readouterr()
+        assert captured.out == (
+            "key_1\tkey_2\n"
+            "value_1\tvalue_2\n"
+            "\tvalue_2\n"
+        )
+
+    def test_write_records_to_tsv_with_empty_records(self, tmpdir):
+        output_file = tmpdir / "output.tsv"
+        with pytest.raises(AugurError) as e_info:
+            write_records_to_tsv(iter([]), output_file)
+
+        assert str(e_info.value) == f"Unable to write records to {output_file} because provided records were empty."

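The extra- and missing-key tests pin down the TSV header semantics: the first record
fixes the columns, extra keys in later records are dropped, and missing keys become
empty fields. That is exactly the behaviour of csv.DictWriter with
extrasaction="ignore" and restval=""; a sketch along those lines
(write_records_to_tsv_sketch is a hypothetical name, not augur's function):

  import csv
  import sys

  def write_records_to_tsv_sketch(records, handle=sys.stdout):
      records = iter(records)
      first = next(records)  # an empty iterator would raise here; the tests
                             # show augur raises AugurError instead
      writer = csv.DictWriter(
          handle,
          fieldnames=first.keys(),  # first record defines the header
          delimiter="\t",
          lineterminator="\n",
          extrasaction="ignore",    # drop keys absent from the header
          restval="",               # write missing keys as empty fields
      )
      writer.writeheader()
      writer.writerow(first)
      writer.writerows(records)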

=====================================
tests/io/test_sequences.py
=====================================
@@ -8,6 +8,7 @@ import lzma
 from pathlib import Path
 import pytest
 import random
+from augur.errors import AugurError
 
 import augur.io.sequences
 
@@ -176,3 +177,37 @@ class TestWriteSequences:
 
         with open(output_filename, "r") as handle:
             assert total_sequences_written == len([line for line in handle if line.startswith(">")])
+
+
+@pytest.fixture()
+def sequence_records():
+    return [
+        {"strain": "SEQ_A", "sequence": "AAAA"},
+        {"strain": "SEQ_T", "sequence": "TTTT"},
+        {"strain": "SEQ_C", "sequence": "CCCC"},
+        {"strain": "SEQ_G", "sequence": "GGGG"},
+    ]
+
+class TestWriteFastaFromRecords:
+    def test_write_records_to_fasta(self, tmpdir, sequence_records):
+        output_fasta = str(tmpdir / "sequences.fasta")
+        records = list(augur.io.sequences.write_records_to_fasta(sequence_records, output_fasta))
+
+        assert all("sequence" not in record for record in records)
+
+        with open(output_fasta, 'r') as handle:
+            assert len(records) == len([line for line in handle if line.startswith(">")])
+
+    def test_write_records_to_fasta_with_bad_id_field(self, tmpdir, sequence_records):
+        output_fasta = str(tmpdir / "sequences.fasta")
+        seq_id_field = "bogus_id"
+        with pytest.raises(AugurError) as e_info:
+            list(augur.io.sequences.write_records_to_fasta(sequence_records, output_fasta, seq_id_field=seq_id_field))
+        assert str(e_info.value) == f"Provided sequence identifier field {seq_id_field!r} does not exist."
+
+    def test_write_records_to_fasta_with_bad_seq_field(self, tmpdir, sequence_records):
+        output_fasta = str(tmpdir / "sequences.fasta")
+        seq_field = "bogus_sequence"
+        with pytest.raises(AugurError) as e_info:
+            list(augur.io.sequences.write_records_to_fasta(sequence_records, output_fasta, seq_field=seq_field))
+        assert str(e_info.value) == f"Provided sequence field {seq_field!r} does not exist."

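The assertions above imply write_records_to_fasta is a generator with a side effect:
each record is written out as FASTA and then yielded without its "sequence" field so
a downstream metadata writer never sees it, and unknown field names raise AugurError
with the messages checked here. A sketch of that shape (the helper name and
signature are illustrative, not augur's exact API):

  from augur.errors import AugurError

  def records_to_fasta(records, handle, seq_id_field="strain", seq_field="sequence"):
      for record in records:
          if seq_id_field not in record:
              raise AugurError(f"Provided sequence identifier field {seq_id_field!r} does not exist.")
          if seq_field not in record:
              raise AugurError(f"Provided sequence field {seq_field!r} does not exist.")
          handle.write(f">{record[seq_id_field]}\n{record[seq_field]}\n")
          # Yield the record minus its sequence for downstream writers.
          yield {key: value for key, value in record.items() if key != seq_field}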


View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/949a6709d1426d74ace27cd9fc49e521d9e95c31
