[med-svn] [Git][med-team/augur][upstream] New upstream version 14.1.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Apr 8 06:04:07 BST 2022
Andreas Tille pushed to branch upstream at Debian Med / augur
Commits:
d19a7749 by Andreas Tille at 2022-04-08T06:57:52+02:00
New upstream version 14.1.0
- - - - -
19 changed files:
- CHANGES.md
- augur/__version__.py
- augur/data/schema-auspice-config-v2.json
- + augur/data/schema-export-root-sequence.json
- augur/data/schema-export-v2.json
- + augur/data/schema-frequencies.json
- + augur/data/schema-measurements.json
- + augur/data/schema-tip-frequencies.json
- augur/export_v2.py
- augur/filter.py
- augur/parse.py
- augur/utils.py
- augur/validate.py
- tests/functional/export_v2.t
- + tests/functional/export_v2/auspice_config2.json
- + tests/functional/export_v2/auspice_config3.json
- + tests/functional/export_v2/dataset2.json
- tests/functional/filter.t
- + tests/test_schemas.py
Changes:
=====================================
CHANGES.md
=====================================
@@ -3,6 +3,30 @@
## __NEXT__
+## 14.1.0 (31 March 2022)
+
+### Features
+
+* schemas: Extend export v2 schema to support an array of trees [#851][] (@tsibley)
+* schemas: Add JSON schemas for our root-sequence and tip-frequencies sidecars [#852][] (@tsibley)
+* schemas: Add JSON schema for measurements sidecar [#859][] (@joverlee521)
+* filter: Send warnings to stderr to be consistent with other warnings [#862][] (@victorlin)
+* export: Allow an extensions block in auspice config & dataset JSONs [#865][] (@jameshadfield)
+* export: Allow skipping of input/output schema validation [#865][] (@jameshadfield)
+* export: Order keys in dataset for easier reading [#868][] (@jameshadfield)
+
+### Bug Fixes
+
+* parse: Fix typo in internal variable name [#850][] (@emmahodcroft)
+
+[#850]: https://github.com/nextstrain/augur/pull/850
+[#851]: https://github.com/nextstrain/augur/pull/851
+[#852]: https://github.com/nextstrain/augur/pull/852
+[#859]: https://github.com/nextstrain/augur/pull/859
+[#862]: https://github.com/nextstrain/augur/pull/862
+[#865]: https://github.com/nextstrain/augur/pull/865
+[#868]: https://github.com/nextstrain/augur/pull/868
+
## 14.0.0 (8 February 2022)
### Major Changes
=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '14.0.0'
+__version__ = '14.1.0'
def is_augur_version_compatible(version):
=====================================
augur/data/schema-auspice-config-v2.json
=====================================
@@ -230,6 +230,10 @@
}
}
}
+ },
+ "extensions": {
+ "description": "Data to be passed through to the resulting dataset JSON",
+ "$comment": "Any type is accepted"
}
}
}
=====================================
augur/data/schema-export-root-sequence.json
=====================================
@@ -0,0 +1,33 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "$id": "https://nextstrain.org/schemas/dataset/root-sequence",
+ "title": "Nextstrain root-sequence sidecar for datasets",
+ "description": "Typically produced by Augur and consumed by Auspice. Applicable to the `--root-sequence` output of `augur export v2` as well as the `--output-sequence` option of `augur export v1`.",
+ "oneOf": [
+ {
+ "$comment": "This is sort of weird, but `augur export v1` can explicitly produce an empty object.",
+ "description": "An empty object",
+ "type": "object",
+ "properties": {},
+ "additionalProperties": false
+ },
+ {
+ "description": "An object containing at least a \"nuc\" key and optionally additional keys for genome annotations (e.g. genes)",
+ "type": "object",
+ "required": ["nuc"],
+ "properties": {
+ "nuc": {
+ "description": "Nucleotide sequence of whole genome (from the output of `augur ancestral`)",
+ "type": "string"
+ }
+ },
+ "patternProperties": {
+ "^[a-zA-Z0-9*_-]+$": {
+ "$comment": "This pattern is the same pattern used in the corresponding parts of schema-export-v2.json.",
+ "description": "Amino acid sequence of genome annotation (e.g. gene) identified by this key (from the output of `augur translate`)",
+ "type": "string"
+ }
+ }
+ }
+ ]
+}
=====================================
augur/data/schema-export-v2.json
=====================================
@@ -2,14 +2,14 @@
"$schema": "http://json-schema.org/draft-06/schema#",
"$id": "https://nextstrain.org/schemas/dataset/v2",
"type": "object",
- "title": "Nextstrain metadata JSON schema proposal (meta + tree together)",
+ "title": "Nextstrain dataset v2",
+ "description": "Typically produced by Augur (`augur export v2`) and consumed by Auspice. Combines dataset v1 meta and tree files together into one file, with additional changes.",
"additionalProperties": false,
"required": ["version", "meta", "tree"],
"properties": {
"version" : {
- "description": "JSON schema version",
- "type" : "string",
- "pattern": "^v[0-9]+$"
+ "description": "Major schema version",
+ "const": "v2"
},
"meta": {
"type": "object",
@@ -298,6 +298,23 @@
}
}
},
+ "tree": {
+ "description": "One or more phylogenies using a nested JSON structure",
+ "oneOf": [
+ {"$ref": "#/$defs/tree"},
+ {
+ "type": "array",
+ "minItems": 1,
+ "items": {"$ref": "#/$defs/tree"}
+ }
+ ]
+ },
+ "extensions": {
+ "description": "Data for use by applications other than auspice",
+ "$comment": "Any type is accepted"
+ }
+ },
+ "$defs": {
"tree": {
"type" : "object",
"$comment": "The phylogeny in a nested JSON structure",
@@ -500,7 +517,7 @@
"$comment": "Polytomies (more than 2 items) allowed, as are nodes with a single child.",
"type": "array",
"minItems": 1,
- "items": {"$ref": "#/properties/tree"}
+ "items": {"$ref": "#/$defs/tree"}
}
}
}
=====================================
augur/data/schema-frequencies.json
=====================================
@@ -0,0 +1,112 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "$id": "https://nextstrain.org/schemas/augur/frequencies",
+ "title": "`augur frequencies` output",
+ "description": "This schema describes the various forms of `augur frequencies` output when using the default `--output-format=auspice`. The specific form used depends on the input parameters, as noted for each possible form below. One of the forms is compatible with the Nextstrain tip-frequencies sidecar described by <https://nextstrain.org/schemas/dataset/tip-frequencies>.",
+ "$comment": "For historical context (some, not complete) on the development of this format, see <https://github.com/nextstrain/augur/pull/83> and <https://github.com/nextstrain/augur/issues/84>.",
+ "oneOf": [
+ {
+ "description": "`augur frequencies` with an input `--tree` and `--method=diffusion`",
+ "type": "object",
+ "required": ["pivots", "counts"],
+ "properties": {
+ "counts": {
+ "description": "Counts by region",
+ "type": "object",
+ "additionalProperties": {
+ "description": "Counts for region identified by this key",
+ "type": "array",
+ "items": {"type": "integer"}
+ }
+ },
+ "pivots": {"$ref": "#/$defs/pivots"},
+ "generated_by": {"$ref": "#/$defs/generated_by"}
+ },
+ "additionalProperties": {
+ "description": "Estimated frequencies by region for tip (or node) identified by this key",
+ "type": "object",
+ "additionalProperties": {
+ "description": "Estimated frequencies for region identified by this key",
+ "type": "array",
+ "items": {"type": "number"}
+ }
+ }
+ },
+ {
+ "description": "`augur frequencies` with an input `--tree` and `--method=kde`, compatible with <https://nextstrain.org/schemas/dataset/tip-frequencies>",
+ "type": "object",
+ "required": ["pivots"],
+ "properties": {
+ "pivots": {"$ref": "#/$defs/pivots"},
+ "generated_by": {"$ref": "#/$defs/generated_by"}
+ },
+ "additionalProperties": {
+ "description": "Estimated frequencies for tip (or node) identified by this key",
+ "type": "object",
+ "properties": {
+ "frequencies": {
+ "type": "array",
+ "items": {"type": "number"}
+ }
+ },
+ "additionalProperties": false
+ }
+ },
+ {
+ "description": "`augur frequencies` with input gene `--alignments` and `--method=diffusion`",
+ "type": "object",
+ "required": ["pivots"],
+ "properties": {
+ "pivots": {"$ref": "#/$defs/pivots"},
+ "generated_by": {"$ref": "#/$defs/generated_by"}
+ },
+ "patternProperties": {
+ "^(.+):counts$": {
+ "description": "Counts for gene alignment position+state identified by this key (<GENE>:counts)",
+ "type": "array",
+ "items": {"type": "integer"}
+ },
+ "^(.+):([0-9]+)(.+)$": {
+ "description": "Estimated frequencies for gene alignment position+state identified by this key (<GENE>:<POSITION><STATE>)",
+ "type": "array",
+ "items": {"type": "number"}
+ }
+ },
+ "additionalProperties": false
+ },
+ {
+ "description": "`augur frequencies` with input gene `--alignments` and `--method=kde`",
+ "type": "object",
+ "required": ["pivots"],
+ "properties": {
+ "pivots": {"$ref": "#/$defs/pivots"},
+ "generated_by": {"$ref": "#/$defs/generated_by"}
+ },
+ "patternProperties": {
+ "^(.+):([0-9]+)(.+)$": {
+ "description": "Estimated frequencies for gene alignment position+state identified by this key (<GENE>:<POSITION><STATE>)",
+ "type": "array",
+ "items": {"type": "number"}
+ }
+ },
+ "additionalProperties": false
+ }
+ ],
+ "$defs": {
+ "pivots": {
+ "description": "Pivot dates as floating point numbers (YYYY.nnnnnn)",
+ "type": "array",
+ "items": {"type": "number"},
+ "minItems": 1,
+ "uniqueItems": true
+ },
+ "generated_by": {
+ "description": "Information about the software which produced the file",
+ "type": "object",
+ "properties": {
+ "program": {"type": "string"},
+ "version": {"type": "string"}
+ }
+ }
+ }
+}
=====================================
augur/data/schema-measurements.json
=====================================
@@ -0,0 +1,150 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "$id": "https://nextstrain.org/schemas/dataset/measurements",
+ "title": "Nextstrain measurements sidecar for datasets",
+ "description": "Typically produced by Augur and consumed by Auspice.",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["collections"],
+ "properties": {
+ "default_collection": {
+ "description": "Default collection to display. The value must be a `key` in one of the objects of the collections array. Optional -- if not provided, first collection will be displayed",
+ "type": "string"
+ },
+ "collections": {
+ "description": "Collections of measurements and their configurations for display in Auspice. Order of the collections determines the order they are shown in the collections dropdown.",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "A single collection of related measurements and the collection's display config",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["key", "groupings", "x_axis_label", "measurements"],
+ "properties": {
+ "key": {
+ "description": "The short name of the collection that is only used internally within Auspice. Each collection is expected to have a unique key.",
+ "type": "string"
+ },
+ "title": {
+ "description": "The title to display in the collections dropdown and panel title. Optional -- if not provided, then `key` will be used",
+ "type": "string"
+ },
+ "fields": {
+ "description": "Custom field order and display titles. Order of the fields determines the order they are shown in the measurement hover display. Optional -- if not provided, then the key of the field will be displayed in alphabetical order.",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "A single field of the measurements",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["key"],
+ "properties": {
+ "key": {
+ "description": "The property name of the field within the measurement object",
+ "type": "string"
+ },
+ "title": {
+ "description": "The display title for the field. Optional -- if not provided, then `key` will be used",
+ "type": "string"
+ }
+ }
+ }
+ },
+ "groupings": {
+ "description": "The available group by fields for measurements. Order of the group by fields determines the order they are shown in the group by dropdown.",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "A single group by field for measurements",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["key"],
+ "properties": {
+ "key": {
+ "description": "The property name of the group by field within the measurement object",
+ "type": "string"
+ },
+ "order": {
+ "description": "A custom order of group by values to customize the display order of the subplots when using group by field",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "A single value of the group by field present in measurements",
+ "type": ["string", "number", "boolean"]
+ }
+ }
+ }
+ }
+ },
+ "filters": {
+ "description": "The available filter options for measurements. Order of the filter options determines the order they are shown in the filter dropdown. Optional -- if not provided, then all fields will be available as filters.",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "The property name of the filter field within the measurement object",
+ "type": "string"
+ }
+ },
+ "x_axis_label": {
+ "description": "The short label to display for the x-axis that describes the `value` of the measurements in a collection",
+ "type": "string"
+ },
+ "threshold": {
+ "description": "A numeric measurement threshold to be displayed as a single grey line shared across subplots. Optional -- if not provided, no threshold will be displayed",
+ "type": "number"
+ },
+ "display_defaults": {
+ "description": "Default display options of the collection",
+ "type": "object",
+ "additionalProperties": false,
+ "minProperty": 1,
+ "properties": {
+ "group_by": {
+ "description": "Default group by field name that must be included as a group by option in the groupings array. Optional -- if not provided, first group by option will be used",
+ "type": "string"
+ },
+ "measurements_display": {
+ "description": "Dictates how the measurements are displayed, either as the raw data points or as the means of values grouped by the tree color-by attribute",
+ "type": "string",
+ "enum": ["raw", "mean"]
+ },
+ "show_overall_mean": {
+ "description": "Should the overall mean per group be displayed by default?",
+ "type": "boolean"
+ },
+ "show_threshold": {
+ "description": "Should the threshold line be displayed by default? Ignored if no threshold has been provided for collection",
+ "type": "boolean"
+ }
+ }
+ },
+ "measurements": {
+ "description": "All measurements for a single collection",
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "description": "A single measurement for a sample and its associated metadata. At least one of the metadata properties should be a group by option",
+ "type": "object",
+ "required": ["strain", "value"],
+ "minProperties": 3,
+ "properties": {
+ "strain": {
+ "description": "The name of the sample that matches the name of the sample within the tree",
+ "type": "string"
+ },
+ "value": {
+ "description": "The numeric value of the measurement",
+ "type": "number"
+ }
+ },
+ "additionalProperties": {
+ "description": "Metadata associated with the measurement. Only metadata properties included in the groupings config will be included in the group by dropdown, but all metadata properties will be available as a filter",
+ "type": ["string", "number", "boolean"]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
=====================================
augur/data/schema-tip-frequencies.json
=====================================
@@ -0,0 +1,42 @@
+
+{
+ "$schema": "http://json-schema.org/draft-06/schema#",
+ "$id": "https://nextstrain.org/schemas/dataset/tip-frequencies",
+ "title": "Nextstrain tip-frequencies sidecar for datasets",
+ "description": "Typically produced by Augur (with `augur frequencies --method kde --tree …`) and consumed by Auspice. Note that the full range of output forms from `augur frequencies` is broader than this, see <https://nextstrain.org/schemas/augur/frequencies>.",
+ "$comment": "For historical context (some, not complete) on the development of this format, see <https://github.com/nextstrain/augur/pull/83> and <https://github.com/nextstrain/augur/issues/84>.",
+ "type": "object",
+ "required": ["pivots"],
+ "properties": {
+ "pivots": {
+ "description": "Pivot dates as floating point numbers (YYYY.nnnnnn)",
+ "type": "array",
+ "items": {"type": "number"},
+ "minItems": 1,
+ "uniqueItems": true
+ },
+ "projection_pivot": {
+ "description": "Pivot at which estimates are projected into the future. This property is understood by Auspice but only produced by custom generators; it is not produced by `augur frequencies`.",
+ "type": "number"
+ },
+ "generated_by": {
+ "description": "Information about the software which produced the file",
+ "type": "object",
+ "properties": {
+ "program": {"type": "string"},
+ "version": {"type": "string"}
+ }
+ }
+ },
+ "additionalProperties": {
+ "description": "Estimated frequencies for tip (or node) identified by this key",
+ "type": "object",
+ "properties": {
+ "frequencies": {
+ "type": "array",
+ "items": {"type": "number"}
+ }
+ },
+ "additionalProperties": false
+ }
+}
=====================================
augur/export_v2.py
=====================================
@@ -4,7 +4,7 @@ Export JSON files suitable for visualization with auspice..
from pathlib import Path
import os, sys
import time
-from collections import defaultdict, deque
+from collections import defaultdict, deque, OrderedDict
import warnings
import numbers
import re
@@ -45,6 +45,67 @@ def configure_warnings():
class InvalidOption(Exception):
pass
+class CustomOrderedDict(OrderedDict):
+ """
+ Similar to OrderedDict but will convert dictionaries (and dictionaries of dictionaries)
+ into (nested) CustomOrderedDicts.
+ Encountered lists of dicts will be converted to lists of CustomOrderedDict but we will not
+ recursively explore nested lists.
+ Tuples and other iterators are not explored.
+ """
+ def __init__(self, *args):
+ super().__init__(*args)
+ for key in self:
+ if isinstance(self[key], dict) and not isinstance(self[key], OrderedDict):
+ self[key] = CustomOrderedDict(self[key])
+ elif isinstance(self[key], list):
+ self[key] = [
+ (CustomOrderedDict(el) if (isinstance(el, dict) and not isinstance(el, OrderedDict)) else el)
+ for el in self[key]
+ ]
+ def set_order(self, *order):
+ """
+ changes the order of keys to match those specified in `order` as much
+ as possible. Missing keys are ignored. Extra keys will come after those
+ specified in `order`.
+ """
+ for key in reversed(order):
+ self.move_to_end_if_present(key, last=False)
+ def move_to_end_if_present(self, key, **kwargs):
+ try:
+ self.move_to_end(key, **kwargs)
+ except KeyError:
+ pass
+
+
+def orderKeys(data):
+ """
+ converts the data dict (where keys are inherently unordered) into an
+ OrderedDict where keys are nicely ordered for human eyes to scan the
+ data when written to JSON. The ordering (mostly) mirrors the schema.
+ """
+ od = CustomOrderedDict(data)
+ od.set_order("version", "meta", "tree")
+ if "meta" in od:
+ od["meta"].set_order("title", "updated", "build_url", "data_provenance", "maintainers")
+ for coloring in od['meta'].get('colorings', []):
+ coloring.set_order("key", "title", "type", "scale", "legend")
+ def order_nodes(node):
+ """recursive function to order nodes in a (sub)tree"""
+ node.set_order("name", "node_attrs", "branch_attrs")
+ # children often a _large_ object and it improves readability if this comes last in the node
+ node.move_to_end_if_present("children")
+ if "node_attrs" in node:
+ node["node_attrs"].set_order("div", "num_date")
+ for child in node.get("children", []):
+ order_nodes(child)
+ if isinstance(od.get("tree"), list):
+ for subtree in od['tree']:
+ order_nodes(subtree)
+ elif isinstance(od.get("tree"), dict):
+ order_nodes(od['tree'])
+ return od
+
def convert_tree_to_json_structure(node, metadata, div=0):
"""
converts the Biopython tree structure to a dictionary that can
@@ -754,39 +815,6 @@ def node_data_prop_is_normal_trait(name):
return True
-def get_root_sequence(root_node, ref=None, translations=None):
- '''
- create a json structure that contains the sequence of the root, both as
- nucleotide and as translations. This allows look-up of the sequence for
- all states, including those that are not variable.
-
- Parameters
- ----------
- root_node : dict
- data associated with the node
- ref : str, optional
- filename of the root sequence
- translations : str, optional
- file name of translations
-
- Returns
- -------
- dict
- dict of nucleotide sequence and translations
- '''
- root_sequence = {}
- if ref and translations:
- from Bio import SeqIO
- refseq = SeqIO.read(ref, 'fasta')
- root_sequence['nuc']=str(refseq.seq)
- for gene in SeqIO.parse(translations, 'fasta'):
- root_sequence[gene.id] = str(gene.seq)
- else:
- root_sequence["nuc"] = root_node["sequence"]
- root_sequence.update(root_node["aa_sequences"])
-
- return root_sequence
-
def register_arguments_v2(subparsers):
v2 = subparsers.add_parser("v2", help="Export version 2 JSON schema")
@@ -825,6 +853,7 @@ def register_arguments_v2(subparsers):
)
optional_settings.add_argument('--minify-json', action="store_true", help="export JSONs without indentation or line returns")
optional_settings.add_argument('--include-root-sequence', action="store_true", help="Export an additional JSON containing the root sequence (reference sequence for vcf) used to identify mutations. The filename will follow the pattern of <OUTPUT>_root-sequence.json for a main auspice JSON of <OUTPUT>.json")
+ optional_settings.add_argument('--skip-validation', action="store_true", help="skip validation of input/output files. Use at your own risk!")
return v2
@@ -936,12 +965,13 @@ def get_config(args):
if not args.auspice_config:
return {}
config = read_config(args.auspice_config)
- try:
- print("Validating config file {} against the JSON schema".format(args.auspice_config))
- validate_auspice_config_v2(args.auspice_config)
- except ValidateError:
- print("Validation of {} failed. Please check the formatting of this file & refer to the augur documentation for further help. ".format(args.auspice_config))
- sys.exit(2)
+ if not args.skip_validation:
+ try:
+ print("Validating config file {} against the JSON schema".format(args.auspice_config))
+ validate_auspice_config_v2(args.auspice_config)
+ except ValidateError:
+ print("Validation of {} failed. Please check the formatting of this file & refer to the augur documentation for further help. ".format(args.auspice_config))
+ sys.exit(2)
# Print a warning about the inclusion of "vaccine_choices" which are _unused_ by `export v2`
# (They are in the schema as this allows v1-compat configs to be used)
if config.get("vaccine_choices"):
@@ -1005,9 +1035,13 @@ def run_v2(args):
set_panels(data_json, config, args.panels)
set_data_provenance(data_json, config)
+ # pass through any extensions block in the auspice config JSON without any changes / checking
+ if config.get("extensions"):
+ data_json["extensions"] = config["extensions"]
+
# Write outputs - the (unified) dataset JSON intended for auspice & perhaps the ref root-sequence JSON
indent = {"indent": None} if args.minify_json else {}
- write_json(data=data_json, file_name=args.output, include_version=False, **indent)
+ write_json(data=orderKeys(data_json), file_name=args.output, include_version=False, **indent)
if args.include_root_sequence:
if 'reference' in node_data:
@@ -1022,7 +1056,8 @@ def run_v2(args):
fatal("Root sequence output was requested, but the node data provided is missing a 'reference' key.")
# validate outputs
- validate_data_json(args.output)
+ if not args.skip_validation:
+ validate_data_json(args.output)
if deprecationWarningsEmitted:
print("\n------------------------")
=====================================
augur/filter.py
=====================================
@@ -623,7 +623,7 @@ def construct_filters(args, sequence_index):
is_vcf = filename_is_vcf(args.sequences)
if is_vcf: #doesn't make sense for VCF, ignore.
- print("WARNING: Cannot use min_length for VCF files. Ignoring...")
+ print("WARNING: Cannot use min_length for VCF files. Ignoring...", file=sys.stderr)
else:
exclude_by.append((
filter_by_sequence_length,
@@ -1744,7 +1744,7 @@ def calculate_sequences_per_group(target_max_value, counts_per_group, allow_prob
)
except TooManyGroupsError as error:
if allow_probabilistic:
- print(f"WARNING: {error}")
+ print(f"WARNING: {error}", file=sys.stderr)
sequences_per_group = _calculate_fractional_sequences_per_group(
target_max_value,
counts_per_group,
=====================================
augur/parse.py
=====================================
@@ -5,7 +5,7 @@ import pandas as pd
from .io import open_file, read_sequences, write_sequences
-forbidden_chactacters = str.maketrans(
+forbidden_characters = str.maketrans(
{' ': None,
'(': '_',
')': '_',
@@ -103,7 +103,7 @@ def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettif
sequence_fields = map(str.strip, sequence.description.split(separator))
metadata = dict(zip(fields, sequence_fields))
- tmp_name = metadata[strain_key].translate(forbidden_chactacters)
+ tmp_name = metadata[strain_key].translate(forbidden_characters)
sequence.name = sequence.id = tmp_name
sequence.description = ''
=====================================
augur/utils.py
=====================================
@@ -10,7 +10,7 @@ import subprocess
import shlex
from contextlib import contextmanager
from treetime.utils import numeric_date
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
from pkg_resources import resource_stream
from io import TextIOWrapper
from .__version__ import __version__
@@ -256,9 +256,9 @@ def write_json(data, file_name, indent=(None if os.environ.get("AUGUR_MINIFY_JSO
if include_version:
data["generated_by"] = {"program": "augur", "version": get_augur_version()}
-
with open(file_name, 'w', encoding='utf-8') as handle:
- json.dump(data, handle, indent=indent, sort_keys=True)
+ sort_keys = False if isinstance(data, OrderedDict) else True
+ json.dump(data, handle, indent=indent, sort_keys=sort_keys)
def load_features(reference, feature_names=None):
=====================================
augur/validate.py
=====================================
@@ -43,11 +43,12 @@ def load_json_schema(path):
except json.JSONDecodeError as err:
raise ValidateError("Schema {} is not a valid JSON file. Error: {}".format(path, err))
# check loaded schema is itself valid -- see http://python-jsonschema.readthedocs.io/en/latest/errors/
+ Validator = jsonschema.validators.validator_for(schema)
try:
- jsonschema.Draft6Validator.check_schema(schema)
+ Validator.check_schema(schema)
except jsonschema.exceptions.SchemaError as err:
- raise ValidateError("Schema {} is not a valid JSON file. Error: {}".format(path, err))
- return jsonschema.Draft6Validator(schema)
+ raise ValidateError(f"Schema {path} is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}")
+ return Validator(schema)
def load_json(path):
with open(path, 'rb') as fh:
@@ -118,6 +119,13 @@ def export_v1(meta_json, tree_json, **kwargs):
print("Validation of {!r} and {!r} succeeded, but there were warnings you may want to resolve.".format(meta_json, tree_json))
+def measurements(measurements_json, **kwargs):
+ schema = load_json_schema("schema-measurements.json")
+ measurements = load_json(measurements_json)
+ validate_json(measurements, schema, measurements_json)
+ return measurements
+
+
def register_arguments(parser):
subparsers = parser.add_subparsers(dest="subcommand", help="Which file(s) do you want to validate?")
@@ -131,6 +139,8 @@ def register_arguments(parser):
subparsers.add_parser("auspice-config-v2", help="validate auspice config intended for `augur export v2`") \
.add_argument('config_json', metavar='JSON', help="auspice config JSON")
+ subparsers.add_parser("measurements", help="validate measurements JSON intended for auspice measurements panel") \
+ .add_argument("measurements_json", metavar="JSON", help="exported measurements JSON")
def run(args):
try:
=====================================
tests/functional/export_v2.t
=====================================
@@ -30,4 +30,38 @@ Export with auspice config JSON which defines scale & legend settings
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset1.json "$TMP/dataset1.json" \
> --exclude-paths "root['meta']['updated']"
- {}
\ No newline at end of file
+ {}
+
+
+Export with auspice config JSON with an extensions block
+ $ ${AUGUR} export v2 \
+ > --tree export_v2/tree.nwk \
+ > --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+ > --auspice-config export_v2/auspice_config2.json \
+ > --output "$TMP/dataset2.json" &>/dev/null
+
+ $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset2.json "$TMP/dataset2.json" \
+ > --exclude-paths "root['meta']['updated']"
+ {}
+
+# auspice_config3.json is the same as auspice_config2.json but with an extra key which the schema does not allow.
+# Running without --skip-validation should result in an error
+# Message printed: "Validation of export_v2/auspice_config3.json failed."
+ $ ${AUGUR} export v2 \
+ > --tree export_v2/tree.nwk \
+ > --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+ > --auspice-config export_v2/auspice_config3.json \
+ > --output "$TMP/dataset2.json" &>/dev/null
+ [2]
+
+# Skipping validation gives us the same results as `auspice_config2.json`
+ $ ${AUGUR} export v2 \
+ > --tree export_v2/tree.nwk \
+ > --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+ > --auspice-config export_v2/auspice_config3.json \
+ > --output "$TMP/dataset3.json" \
+ > --skip-validation &>/dev/null
+
+ $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset2.json "$TMP/dataset3.json" \
+ > --exclude-paths "root['meta']['updated']"
+ {}
=====================================
tests/functional/export_v2/auspice_config2.json
=====================================
@@ -0,0 +1,36 @@
+{
+ "title": "Minimal config with an extensions block",
+ "colorings": [
+ {
+ "key": "location",
+ "title": "Location",
+ "type": "categorical",
+ "legend": [
+ {"value": "alpha", "display": "α"},
+ {"value": "beta"}
+ ],
+ "scale": [
+ ["beta", "#bd0026"],
+ ["gamma", "#6a51a3"]
+ ]
+ },
+ {
+ "key": "mutation_length",
+ "title": "Mutations per branch",
+ "type": "continuous",
+ "legend": [
+ {"value": 1, "display": "0-2", "bounds": [-1,2]},
+ {"value": 3, "display": "3-5", "bounds": [2,5]},
+ {"value": 5, "display": ">5", "bounds": [5, 10]}
+ ],
+ "scale": [
+ [1, "#081d58"],
+ [3, "#1d91c0"],
+ [5, "#c7e9b4"]
+ ]
+ }
+ ],
+ "extensions": {
+ "some_key": "some_value"
+ }
+}
\ No newline at end of file
=====================================
tests/functional/export_v2/auspice_config3.json
=====================================
@@ -0,0 +1,37 @@
+{
+ "title": "Minimal config with an extensions block",
+ "colorings": [
+ {
+ "key": "location",
+ "title": "Location",
+ "type": "categorical",
+ "legend": [
+ {"value": "alpha", "display": "α"},
+ {"value": "beta"}
+ ],
+ "scale": [
+ ["beta", "#bd0026"],
+ ["gamma", "#6a51a3"]
+ ]
+ },
+ {
+ "key": "mutation_length",
+ "title": "Mutations per branch",
+ "type": "continuous",
+ "legend": [
+ {"value": 1, "display": "0-2", "bounds": [-1,2]},
+ {"value": 3, "display": "3-5", "bounds": [2,5]},
+ {"value": 5, "display": ">5", "bounds": [5, 10]}
+ ],
+ "scale": [
+ [1, "#081d58"],
+ [3, "#1d91c0"],
+ [5, "#c7e9b4"]
+ ]
+ }
+ ],
+ "extensions": {
+ "some_key": "some_value"
+ },
+ "this_is_not_allowed_by_the_schema": "but is used to test --skip-validation"
+}
\ No newline at end of file
=====================================
tests/functional/export_v2/dataset2.json
=====================================
@@ -0,0 +1,228 @@
+{
+ "meta": {
+ "colorings": [
+ {
+ "key": "location",
+ "legend": [
+ {
+ "display": "\u03b1",
+ "value": "alpha"
+ },
+ {
+ "value": "beta"
+ }
+ ],
+ "scale": [
+ [
+ "beta",
+ "#bd0026"
+ ],
+ [
+ "gamma",
+ "#6a51a3"
+ ]
+ ],
+ "title": "Location",
+ "type": "categorical"
+ },
+ {
+ "key": "mutation_length",
+ "legend": [
+ {
+ "bounds": [
+ -1,
+ 2
+ ],
+ "display": "0-2",
+ "value": 1
+ },
+ {
+ "bounds": [
+ 2,
+ 5
+ ],
+ "display": "3-5",
+ "value": 3
+ },
+ {
+ "bounds": [
+ 5,
+ 10
+ ],
+ "display": ">5",
+ "value": 5
+ }
+ ],
+ "scale": [
+ [
+ 1,
+ "#081d58"
+ ],
+ [
+ 3,
+ "#1d91c0"
+ ],
+ [
+ 5,
+ "#c7e9b4"
+ ]
+ ],
+ "title": "Mutations per branch",
+ "type": "continuous"
+ },
+ {
+ "key": "location",
+ "legend": [
+ {
+ "display": "\u03b1",
+ "value": "alpha"
+ },
+ {
+ "value": "beta"
+ }
+ ],
+ "scale": [
+ [
+ "beta",
+ "#bd0026"
+ ],
+ [
+ "gamma",
+ "#6a51a3"
+ ]
+ ],
+ "title": "Location",
+ "type": "categorical"
+ }
+ ],
+ "filters": [
+ "location"
+ ],
+ "panels": [
+ "tree"
+ ],
+ "title": "Minimal config with an extensions block",
+ "updated": "2021-06-09"
+ },
+ "tree": {
+ "branch_attrs": {},
+ "children": [
+ {
+ "branch_attrs": {},
+ "name": "tipA",
+ "node_attrs": {
+ "div": 1,
+ "location": {
+ "value": "delta"
+ },
+ "mutation_length": {
+ "value": 1
+ }
+ }
+ },
+ {
+ "branch_attrs": {},
+ "children": [
+ {
+ "branch_attrs": {},
+ "name": "tipB",
+ "node_attrs": {
+ "div": 3,
+ "location": {
+ "value": "gamma"
+ },
+ "mutation_length": {
+ "value": 1
+ }
+ }
+ },
+ {
+ "branch_attrs": {},
+ "name": "tipC",
+ "node_attrs": {
+ "div": 3,
+ "location": {
+ "value": "gamma"
+ },
+ "mutation_length": {
+ "value": 1
+ }
+ }
+ }
+ ],
+ "name": "internalBC",
+ "node_attrs": {
+ "div": 2,
+ "mutation_length": {
+ "value": 2
+ }
+ }
+ },
+ {
+ "branch_attrs": {},
+ "children": [
+ {
+ "branch_attrs": {},
+ "name": "tipD",
+ "node_attrs": {
+ "div": 8,
+ "location": {
+ "value": "alpha"
+ },
+ "mutation_length": {
+ "value": 3
+ }
+ }
+ },
+ {
+ "branch_attrs": {},
+ "name": "tipE",
+ "node_attrs": {
+ "div": 9,
+ "location": {
+ "value": "alpha"
+ },
+ "mutation_length": {
+ "value": 4
+ }
+ }
+ },
+ {
+ "branch_attrs": {},
+ "name": "tipF",
+ "node_attrs": {
+ "div": 6,
+ "location": {
+ "value": "beta"
+ },
+ "mutation_length": {
+ "value": 1
+ }
+ }
+ }
+ ],
+ "name": "internalDEF",
+ "node_attrs": {
+ "div": 5,
+ "location": {
+ "value": "alpha"
+ },
+ "mutation_length": {
+ "value": 5
+ }
+ }
+ }
+ ],
+ "name": "ROOT",
+ "node_attrs": {
+ "div": 0,
+ "mutation_length": {
+ "value": 0
+ }
+ }
+ },
+ "version": "v2",
+ "extensions": {
+ "some_key": "some_value"
+ }
+}
\ No newline at end of file
=====================================
tests/functional/filter.t
=====================================
@@ -101,6 +101,7 @@ Explicitly use probabilistic subsampling to handle the case when there are more
> --subsample-seed 314159 \
> --probabilistic-sampling \
> --output-strains "$TMP/filtered_strains_probabilistic.txt" > /dev/null
+ WARNING: Asked to provide at most 5 sequences, but there are 8 groups.
Using the default probabilistic subsampling, should work the same as the previous case.
@@ -113,6 +114,7 @@ Using the default probabilistic subsampling, should work the same as the previou
> --subsample-max-sequences 5 \
> --subsample-seed 314159 \
> --output-strains "$TMP/filtered_strains_default.txt" > /dev/null
+ WARNING: Asked to provide at most 5 sequences, but there are 8 groups.
By setting the subsample seed above, we should get the same results for both runs.
@@ -394,6 +396,7 @@ Strains with ambiguous years or months should be dropped and logged.
> --subsample-max-sequences 5 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv" > /dev/null
+ WARNING: Asked to provide at most 5 sequences, but there are 6 groups.
$ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2
SG_018\tskip_group_by_with_ambiguous_month (esc)
$ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2
=====================================
tests/test_schemas.py
=====================================
@@ -0,0 +1,14 @@
+import json
+import jsonschema.validators
+import pytest
+from pathlib import Path
+
+schemas = list(Path("augur/data/").glob("schema-*.json"))
+
+@pytest.mark.parametrize("schema_path", schemas, ids = lambda schema_path: str(schema_path))
+def test_schema_is_valid(schema_path):
+ with schema_path.open("rb") as schema_fh:
+ schema = json.load(schema_fh)
+
+ Validator = jsonschema.validators.validator_for(schema)
+ Validator.check_schema(schema)
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19a7749f42d916ea13e82046c21ad868e5c37d1
--
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19a7749f42d916ea13e82046c21ad868e5c37d1
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220408/caaf069b/attachment-0001.htm>
More information about the debian-med-commit
mailing list