[med-svn] [Git][med-team/augur][upstream] New upstream version 14.1.0

Fri Apr 8 06:04:07 BST 2022


Andreas Tille pushed to branch upstream at Debian Med / augur


Commits:
d19a7749 by Andreas Tille at 2022-04-08T06:57:52+02:00
New upstream version 14.1.0
- - - - -


19 changed files:

- CHANGES.md
- augur/__version__.py
- augur/data/schema-auspice-config-v2.json
- + augur/data/schema-export-root-sequence.json
- augur/data/schema-export-v2.json
- + augur/data/schema-frequencies.json
- + augur/data/schema-measurements.json
- + augur/data/schema-tip-frequencies.json
- augur/export_v2.py
- augur/filter.py
- augur/parse.py
- augur/utils.py
- augur/validate.py
- tests/functional/export_v2.t
- + tests/functional/export_v2/auspice_config2.json
- + tests/functional/export_v2/auspice_config3.json
- + tests/functional/export_v2/dataset2.json
- tests/functional/filter.t
- + tests/test_schemas.py


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,30 @@
 ## __NEXT__
 
 
+## 14.1.0 (31 March 2022)
+
+### Features
+
+* schemas: Extend export v2 schema to support an array of trees [#851][] (@tsibley)
+* schemas: Add JSON schemas for our root-sequence and tip-frequencies sidecars [#852][] (@tsibley)
+* schemas: Add JSON schema for measurements sidecar [#859][] (@joverlee521)
+* filter: Send warnings to stderr to be consistent with other warnings [#862][] (@victorlin)
+* export: Allow an extensions block in auspice config & dataset JSONs [#865][] (@jameshadfield)
+* export: Allow skipping of input/output schema validation [#865][] (@jameshadfield)
+* export:  Order keys in dataset for easier reading [#868][] (@jameshadfield)
+
+### Bug Fixes
+
+* parse: Fix typo in internal variable name [#850][] (@emmahodcroft)
+
+[#850]: https://github.com/nextstrain/augur/pull/850
+[#851]: https://github.com/nextstrain/augur/pull/851
+[#852]: https://github.com/nextstrain/augur/pull/852
+[#859]: https://github.com/nextstrain/augur/pull/859
+[#862]: https://github.com/nextstrain/augur/pull/862
+[#865]: https://github.com/nextstrain/augur/pull/865
+[#868]: https://github.com/nextstrain/augur/pull/868
+
 ## 14.0.0 (8 February 2022)
 
 ### Major Changes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '14.0.0'
+__version__ = '14.1.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/data/schema-auspice-config-v2.json
=====================================
@@ -230,6 +230,10 @@
                     }
                 }
             }
+        },
+        "extensions": {
+            "description": "Data to be passed through to the resulting dataset JSON",
+            "$comment": "Any type is accepted"
         }
     }
 }


=====================================
augur/data/schema-export-root-sequence.json
=====================================
@@ -0,0 +1,33 @@
+{
+  "$schema": "http://json-schema.org/draft-06/schema#",
+  "$id": "https://nextstrain.org/schemas/dataset/root-sequence",
+  "title": "Nextstrain root-sequence sidecar for datasets",
+  "description": "Typically produced by Augur and consumed by Auspice.  Applicable to the `--root-sequence` output of `augur export v2` as well as the `--output-sequence` option of `augur export v1`.",
+  "oneOf": [
+    {
+      "$comment": "This is sort of weird, but `augur export v1` can explicitly produce an empty object.",
+      "description": "An empty object",
+      "type": "object",
+      "properties": {},
+      "additionalProperties": false
+    },
+    {
+      "description": "An object containing at least a \"nuc\" key and optionally additional keys for genome annotations (e.g. genes)",
+      "type": "object",
+      "required": ["nuc"],
+      "properties": {
+        "nuc": {
+          "description": "Nucleotide sequence of whole genome (from the output of `augur ancestral`)",
+          "type": "string"
+        }
+      },
+      "patternProperties": {
+        "^[a-zA-Z0-9*_-]+$": {
+          "$comment": "This pattern is the same pattern used in the corresponding parts of schema-export-v2.json.",
+          "description": "Amino acid sequence of genome annotation (e.g. gene) identified by this key (from the output of `augur translate`)",
+          "type": "string"
+        }
+      }
+    }
+  ]
+}


=====================================
augur/data/schema-export-v2.json
=====================================
@@ -2,14 +2,14 @@
     "$schema": "http://json-schema.org/draft-06/schema#",
     "$id": "https://nextstrain.org/schemas/dataset/v2",
     "type": "object",
-    "title": "Nextstrain metadata JSON schema proposal (meta + tree together)",
+    "title": "Nextstrain dataset v2",
+    "description": "Typically produced by Augur (`augur export v2`) and consumed by Auspice.  Combines dataset v1 meta and tree files together into one file, with additional changes.",
     "additionalProperties": false,
     "required": ["version", "meta", "tree"],
     "properties": {
         "version" : {
-            "description": "JSON schema version",
-            "type" : "string",
-            "pattern": "^v[0-9]+$"
+            "description": "Major schema version",
+            "const": "v2"
         },
         "meta": {
             "type": "object",
@@ -298,6 +298,23 @@
                 }
             }
         },
+        "tree": {
+            "description": "One or more phylogenies using a nested JSON structure",
+            "oneOf": [
+                {"$ref": "#/$defs/tree"},
+                {
+                    "type": "array",
+                    "minItems": 1,
+                    "items": {"$ref": "#/$defs/tree"}
+                }
+            ]
+        },
+        "extensions": {
+            "description": "Data for use by applications other than auspice",
+            "$comment": "Any type is accepted"
+        }
+    },
+    "$defs": {
         "tree": {
             "type" : "object",
             "$comment": "The phylogeny in a nested JSON structure",
@@ -500,7 +517,7 @@
                     "$comment": "Polytomies (more than 2 items) allowed, as are nodes with a single child.",
                     "type": "array",
                     "minItems": 1,
-                    "items": {"$ref": "#/properties/tree"}
+                    "items": {"$ref": "#/$defs/tree"}
                 }
             }
         }


=====================================
augur/data/schema-frequencies.json
=====================================
@@ -0,0 +1,112 @@
+{
+  "$schema": "http://json-schema.org/draft-06/schema#",
+  "$id": "https://nextstrain.org/schemas/augur/frequencies",
+  "title": "`augur frequencies` output",
+  "description": "This schema describes the various forms of `augur frequencies` output when using the default `--output-format=auspice`.  The specific form used depends on the input parameters, as noted for each possible form below.  One of the forms is compatible with the Nextstrain tip-frequencies sidecar described by <https://nextstrain.org/schemas/dataset/tip-frequencies>.",
+  "$comment": "For historical context (some, not complete) on the development of this format, see <https://github.com/nextstrain/augur/pull/83> and <https://github.com/nextstrain/augur/issues/84>.",
+  "oneOf": [
+    {
+      "description": "`augur frequencies` with an input `--tree` and `--method=diffusion`",
+      "type": "object",
+      "required": ["pivots", "counts"],
+      "properties": {
+        "counts": {
+          "description": "Counts by region",
+          "type": "object",
+          "additionalProperties": {
+            "description": "Counts for region identified by this key",
+            "type": "array",
+            "items": {"type": "integer"}
+          }
+        },
+        "pivots": {"$ref": "#/$defs/pivots"},
+        "generated_by": {"$ref": "#/$defs/generated_by"}
+      },
+      "additionalProperties": {
+        "description": "Estimated frequencies by region for tip (or node) identified by this key",
+        "type": "object",
+        "additionalProperties": {
+          "description": "Estimated frequencies for region identified by this key",
+          "type": "array",
+          "items": {"type": "number"}
+        }
+      }
+    },
+    {
+      "description": "`augur frequencies` with an input `--tree` and `--method=kde`, compatible with <https://nextstrain.org/schemas/dataset/tip-frequencies>",
+      "type": "object",
+      "required": ["pivots"],
+      "properties": {
+        "pivots": {"$ref": "#/$defs/pivots"},
+        "generated_by": {"$ref": "#/$defs/generated_by"}
+      },
+      "additionalProperties": {
+        "description": "Estimated frequencies for tip (or node) identified by this key",
+        "type": "object",
+        "properties": {
+          "frequencies": {
+            "type": "array",
+            "items": {"type": "number"}
+          }
+        },
+        "additionalProperties": false
+      }
+    },
+    {
+      "description": "`augur frequencies` with input gene `--alignments` and `--method=diffusion`",
+      "type": "object",
+      "required": ["pivots"],
+      "properties": {
+        "pivots": {"$ref": "#/$defs/pivots"},
+        "generated_by": {"$ref": "#/$defs/generated_by"}
+      },
+      "patternProperties": {
+        "^(.+):counts$": {
+          "description": "Counts for gene alignment position+state identified by this key (<GENE>:counts)",
+          "type": "array",
+          "items": {"type": "integer"}
+        },
+        "^(.+):([0-9]+)(.+)$": {
+          "description": "Estimated frequencies for gene alignment position+state identified by this key (<GENE>:<POSITION><STATE>)",
+          "type": "array",
+          "items": {"type": "number"}
+        }
+      },
+      "additionalProperties": false
+    },
+    {
+      "description": "`augur frequencies` with input gene `--alignments` and `--method=kde`",
+      "type": "object",
+      "required": ["pivots"],
+      "properties": {
+        "pivots": {"$ref": "#/$defs/pivots"},
+        "generated_by": {"$ref": "#/$defs/generated_by"}
+      },
+      "patternProperties": {
+        "^(.+):([0-9]+)(.+)$": {
+          "description": "Estimated frequencies for gene alignment position+state identified by this key (<GENE>:<POSITION><STATE>)",
+          "type": "array",
+          "items": {"type": "number"}
+        }
+      },
+      "additionalProperties": false
+    }
+  ],
+  "$defs": {
+    "pivots": {
+      "description": "Pivot dates as floating point numbers (YYYY.nnnnnn)",
+      "type": "array",
+      "items": {"type": "number"},
+      "minItems": 1,
+      "uniqueItems": true
+    },
+    "generated_by": {
+      "description": "Information about the software which produced the file",
+      "type": "object",
+      "properties": {
+        "program": {"type": "string"},
+        "version": {"type": "string"}
+      }
+    }
+  }
+}


=====================================
augur/data/schema-measurements.json
=====================================
@@ -0,0 +1,150 @@
+{
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "$id": "https://nextstrain.org/schemas/dataset/measurements",
+    "title": "Nextstrain measurements sidecar for datasets",
+    "description": "Typically produced by Augur and consumed by Auspice.",
+    "type": "object",
+    "additionalProperties": false,
+    "required": ["collections"],
+    "properties": {
+        "default_collection": {
+            "description": "Default collection to display. The value must be a `key` in one of the objects of the collections array. Optional -- if not provided, first collection will be displayed",
+            "type": "string"
+        },
+        "collections": {
+            "description": "Collections of measurements and their configurations for display in Auspice. Order of the collections determines the order they are shown in the collections dropdown.",
+            "type": "array",
+            "minItems": 1,
+            "items": {
+                "description": "A single collection of related measurements and the collection's display config",
+                "type": "object",
+                "additionalProperties": false,
+                "required": ["key", "groupings", "x_axis_label", "measurements"],
+                "properties": {
+                    "key": {
+                        "description": "The short name of the collection that is only used internally within Auspice. Each collection is expected to have a unique key.",
+                        "type": "string"
+                    },
+                    "title": {
+                        "description": "The title to display in the collections dropdown and panel title. Optional -- if not provided, then `key` will be used",
+                        "type": "string"
+                    },
+                    "fields": {
+                        "description": "Custom field order and display titles. Order of the fields determines the order they are shown in the measurement hover display. Optional -- if not provided, then the key of the field will be displayed in alphabetical order.",
+                        "type": "array",
+                        "minItems": 1,
+                        "items": {
+                            "description": "A single field of the measurements",
+                            "type": "object",
+                            "additionalProperties": false,
+                            "required": ["key"],
+                            "properties": {
+                                "key": {
+                                    "description": "The property name of the field within the measurement object",
+                                    "type": "string"
+                                },
+                                "title": {
+                                    "description": "The display title for the field. Optional -- if not provided, then `key` will be used",
+                                    "type": "string"
+                                }
+                            }
+                        }
+                    },
+                    "groupings": {
+                        "description": "The available group by fields for measurements. Order of the group by fields determines the order they are shown in the group by dropdown.",
+                        "type": "array",
+                        "minItems": 1,
+                        "items": {
+                            "description": "A single group by field for measurements",
+                            "type": "object",
+                            "additionalProperties": false,
+                            "required": ["key"],
+                            "properties": {
+                                "key": {
+                                    "description": "The property name of the group by field within the measurement object",
+                                    "type": "string"
+                                },
+                                "order": {
+                                    "description": "A custom order of group by values to customize the display order of the subplots when using group by field",
+                                    "type": "array",
+                                    "minItems": 1,
+                                    "items": {
+                                        "description": "A single value of the group by field present in measurements",
+                                        "type": ["string", "number", "boolean"]
+                                    }
+                                }
+                            }
+                        }
+                    },
+                    "filters": {
+                        "description": "The available filter options for measurements. Order of the filter options determines the order they are shown in the filter dropdown. Optional -- if not provided, then all fields will be available as filters.",
+                        "type": "array",
+                        "minItems": 1,
+                        "items": {
+                            "description": "The property name of the filter field within the measurement object",
+                            "type": "string"
+                        }
+                    },
+                    "x_axis_label": {
+                        "description": "The short label to display for the x-axis that describes the `value` of the measurements in a collection",
+                        "type": "string"
+                    },
+                    "threshold": {
+                        "description": "A numeric measurement threshold to be displayed as a single grey line shared across subplots. Optional -- if not provided, no threshold will be displayed",
+                        "type": "number"
+                    },
+                    "display_defaults": {
+                        "description": "Default display options of the collection",
+                        "type": "object",
+                        "additionalProperties": false,
+                        "minProperty": 1,
+                        "properties": {
+                            "group_by": {
+                                "description": "Default group by field name that must be included as a group by option in the groupings array. Optional -- if not provided, first group by option will be used",
+                                "type": "string"
+                            },
+                            "measurements_display": {
+                                "description": "Dictates how the measurements are displayed, either as the raw data points or as the means of values grouped by the tree color-by attribute",
+                                "type": "string",
+                                "enum": ["raw", "mean"]
+                            },
+                            "show_overall_mean": {
+                                "description": "Should the overall mean per group be displayed by default?",
+                                "type": "boolean"
+                            },
+                            "show_threshold": {
+                                "description": "Should the threshold line be displayed by default? Ignored if no threshold has been provided for collection",
+                                "type": "boolean"
+                            }
+                        }
+                    },
+                    "measurements": {
+                        "description": "All measurements for a single collection",
+                        "type": "array",
+                        "minItems": 1,
+                        "items": {
+                            "description": "A single measurement for a sample and its associated metadata. At least one of the metadata properties should be a group by option",
+                            "type": "object",
+                            "required": ["strain", "value"],
+                            "minProperties": 3,
+                            "properties": {
+                                "strain": {
+                                    "description": "The name of the sample that matches the name of the sample within the tree",
+                                    "type": "string"
+                                },
+                                "value": {
+                                    "description": "The numeric value of the measurement",
+                                    "type": "number"
+                                }
+                            },
+                            "additionalProperties": {
+                                "description": "Metadata associated with the measurement. Only metadata properties included in the groupings config will be included in the group by dropdown, but all metadata properties will be available as a filter",
+                                "type": ["string", "number", "boolean"]
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}


=====================================
augur/data/schema-tip-frequencies.json
=====================================
@@ -0,0 +1,42 @@
+
+{
+  "$schema": "http://json-schema.org/draft-06/schema#",
+  "$id": "https://nextstrain.org/schemas/dataset/tip-frequencies",
+  "title": "Nextstrain tip-frequencies sidecar for datasets",
+  "description": "Typically produced by Augur (with `augur frequencies --method kde --tree …`) and consumed by Auspice.  Note that the full range of output forms from `augur frequencies` is broader than this, see <https://nextstrain.org/schemas/augur/frequencies>.",
+  "$comment": "For historical context (some, not complete) on the development of this format, see <https://github.com/nextstrain/augur/pull/83> and <https://github.com/nextstrain/augur/issues/84>.",
+  "type": "object",
+  "required": ["pivots"],
+  "properties": {
+    "pivots": {
+      "description": "Pivot dates as floating point numbers (YYYY.nnnnnn)",
+      "type": "array",
+      "items": {"type": "number"},
+      "minItems": 1,
+      "uniqueItems": true
+    },
+    "projection_pivot": {
+      "description": "Pivot at which estimates are projected into the future.  This property is understood by Auspice but only produced by custom generators; it is not produced by `augur frequencies`.",
+      "type": "number"
+    },
+    "generated_by": {
+      "description": "Information about the software which produced the file",
+      "type": "object",
+      "properties": {
+        "program": {"type": "string"},
+        "version": {"type": "string"}
+      }
+    }
+  },
+  "additionalProperties": {
+    "description": "Estimated frequencies for tip (or node) identified by this key",
+    "type": "object",
+    "properties": {
+      "frequencies": {
+        "type": "array",
+        "items": {"type": "number"}
+      }
+    },
+    "additionalProperties": false
+  }
+}


=====================================
augur/export_v2.py
=====================================
@@ -4,7 +4,7 @@ Export JSON files suitable for visualization with auspice..
 from pathlib import Path
 import os, sys
 import time
-from collections import defaultdict, deque
+from collections import defaultdict, deque, OrderedDict
 import warnings
 import numbers
 import re
@@ -45,6 +45,67 @@ def configure_warnings():
 class InvalidOption(Exception):
     pass
 
+class CustomOrderedDict(OrderedDict):
+    """
+    Similar to OrderedDict but will convert dictionaries (and dictionaries of dictionaries)
+    into (nested) CustomOrderedDicts.
+    Encountered lists of dicts will be converted to lists of CustomOrderedDict but we will not
+    recursively explore nested lists.
+    Tuples and other iterators are not explored.
+    """
+    def __init__(self, *args):
+        super().__init__(*args)
+        for key in self:
+            if isinstance(self[key], dict) and not isinstance(self[key], OrderedDict):
+                self[key] = CustomOrderedDict(self[key])
+            elif isinstance(self[key], list):
+                self[key] = [
+                    (CustomOrderedDict(el) if (isinstance(el, dict) and not isinstance(el, OrderedDict)) else el)
+                    for el in self[key]
+                ]
+    def set_order(self, *order):
+        """
+        changes the order of keys to match those specified in `order` as much
+        as possible. Missing keys are ignored. Extra keys will come after those
+        specified in `order`.
+        """
+        for key in reversed(order):
+            self.move_to_end_if_present(key, last=False)
+    def move_to_end_if_present(self, key, **kwargs):
+        try:
+            self.move_to_end(key, **kwargs)
+        except KeyError:
+            pass
+
+
+def orderKeys(data):
+    """
+    converts the data dict (where keys are inherently unordered) into an
+    OrderedDict where keys are nicely ordered for human eyes to scan the
+    data when written to JSON. The ordering (mostly) mirrors the schema.
+    """
+    od = CustomOrderedDict(data)
+    od.set_order("version", "meta", "tree")
+    if "meta" in od:
+        od["meta"].set_order("title", "updated", "build_url", "data_provenance", "maintainers")
+        for coloring in od['meta'].get('colorings', []):
+            coloring.set_order("key", "title", "type", "scale", "legend")
+    def order_nodes(node):
+        """recursive function to order nodes in a (sub)tree"""
+        node.set_order("name", "node_attrs", "branch_attrs")
+        # children often a _large_ object and it improves readability if this comes last in the node
+        node.move_to_end_if_present("children")
+        if "node_attrs" in node:
+            node["node_attrs"].set_order("div", "num_date")
+        for child in node.get("children", []):
+            order_nodes(child)
+    if isinstance(od.get("tree"), list):
+        for subtree in od['tree']:
+            order_nodes(subtree)
+    elif isinstance(od.get("tree"), dict):
+        order_nodes(od['tree'])
+    return od
+
 def convert_tree_to_json_structure(node, metadata, div=0):
     """
     converts the Biopython tree structure to a dictionary that can
@@ -754,39 +815,6 @@ def node_data_prop_is_normal_trait(name):
 
     return True
 
-def get_root_sequence(root_node, ref=None, translations=None):
-    '''
-    create a json structure that contains the sequence of the root, both as
-    nucleotide and as translations. This allows look-up of the sequence for
-    all states, including those that are not variable.
-
-    Parameters
-    ----------
-    root_node : dict
-    	data associated with the node
-    ref : str, optional
-        filename of the root sequence
-    translations : str, optional
-        file name of translations
-
-    Returns
-    -------
-    dict
-        dict of nucleotide sequence and translations
-    '''
-    root_sequence = {}
-    if ref and translations:
-        from Bio import SeqIO
-        refseq = SeqIO.read(ref, 'fasta')
-        root_sequence['nuc']=str(refseq.seq)
-        for gene in SeqIO.parse(translations, 'fasta'):
-            root_sequence[gene.id] = str(gene.seq)
-    else:
-        root_sequence["nuc"] = root_node["sequence"]
-        root_sequence.update(root_node["aa_sequences"])
-
-    return root_sequence
-
 
 def register_arguments_v2(subparsers):
     v2 = subparsers.add_parser("v2", help="Export version 2 JSON schema")
@@ -825,6 +853,7 @@ def register_arguments_v2(subparsers):
     )
     optional_settings.add_argument('--minify-json', action="store_true", help="export JSONs without indentation or line returns")
     optional_settings.add_argument('--include-root-sequence', action="store_true", help="Export an additional JSON containing the root sequence (reference sequence for vcf) used to identify mutations. The filename will follow the pattern of <OUTPUT>_root-sequence.json for a main auspice JSON of <OUTPUT>.json")
+    optional_settings.add_argument('--skip-validation', action="store_true", help="skip validation of input/output files. Use at your own risk!")
 
     return v2
 
@@ -936,12 +965,13 @@ def get_config(args):
     if not args.auspice_config:
         return {}
     config = read_config(args.auspice_config)
-    try:
-        print("Validating config file {} against the JSON schema".format(args.auspice_config))
-        validate_auspice_config_v2(args.auspice_config)
-    except ValidateError:
-        print("Validation of {} failed. Please check the formatting of this file & refer to the augur documentation for further help. ".format(args.auspice_config))
-        sys.exit(2)
+    if not args.skip_validation:
+        try:
+            print("Validating config file {} against the JSON schema".format(args.auspice_config))
+            validate_auspice_config_v2(args.auspice_config)
+        except ValidateError:
+            print("Validation of {} failed. Please check the formatting of this file & refer to the augur documentation for further help. ".format(args.auspice_config))
+            sys.exit(2)
     # Print a warning about the inclusion of "vaccine_choices" which are _unused_ by `export v2`
     # (They are in the schema as this allows v1-compat configs to be used)
     if config.get("vaccine_choices"):
@@ -1005,9 +1035,13 @@ def run_v2(args):
     set_panels(data_json, config, args.panels)
     set_data_provenance(data_json, config)
 
+    # pass through any extensions block in the auspice config JSON without any changes / checking
+    if config.get("extensions"):
+        data_json["extensions"] = config["extensions"]
+
     # Write outputs - the (unified) dataset JSON intended for auspice & perhaps the ref root-sequence JSON
     indent = {"indent": None} if args.minify_json else {}
-    write_json(data=data_json, file_name=args.output, include_version=False, **indent)
+    write_json(data=orderKeys(data_json), file_name=args.output, include_version=False, **indent)
 
     if args.include_root_sequence:
         if 'reference' in node_data:
@@ -1022,7 +1056,8 @@ def run_v2(args):
             fatal("Root sequence output was requested, but the node data provided is missing a 'reference' key.")
 
     # validate outputs
-    validate_data_json(args.output)
+    if not args.skip_validation:
+        validate_data_json(args.output)
 
     if deprecationWarningsEmitted:
         print("\n------------------------")


=====================================
augur/filter.py
=====================================
@@ -623,7 +623,7 @@ def construct_filters(args, sequence_index):
         is_vcf = filename_is_vcf(args.sequences)
 
         if is_vcf: #doesn't make sense for VCF, ignore.
-            print("WARNING: Cannot use min_length for VCF files. Ignoring...")
+            print("WARNING: Cannot use min_length for VCF files. Ignoring...", file=sys.stderr)
         else:
             exclude_by.append((
                 filter_by_sequence_length,
@@ -1744,7 +1744,7 @@ def calculate_sequences_per_group(target_max_value, counts_per_group, allow_prob
         )
     except TooManyGroupsError as error:
         if allow_probabilistic:
-            print(f"WARNING: {error}")
+            print(f"WARNING: {error}", file=sys.stderr)
             sequences_per_group = _calculate_fractional_sequences_per_group(
                 target_max_value,
                 counts_per_group,


=====================================
augur/parse.py
=====================================
@@ -5,7 +5,7 @@ import pandas as pd
 
 from .io import open_file, read_sequences, write_sequences
 
-forbidden_chactacters = str.maketrans(
+forbidden_characters = str.maketrans(
     {' ': None,
      '(': '_',
      ')': '_',
@@ -103,7 +103,7 @@ def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettif
     sequence_fields = map(str.strip, sequence.description.split(separator))
     metadata = dict(zip(fields, sequence_fields))
 
-    tmp_name = metadata[strain_key].translate(forbidden_chactacters)
+    tmp_name = metadata[strain_key].translate(forbidden_characters)
     sequence.name = sequence.id = tmp_name
     sequence.description = ''
 


=====================================
augur/utils.py
=====================================
@@ -10,7 +10,7 @@ import subprocess
 import shlex
 from contextlib import contextmanager
 from treetime.utils import numeric_date
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from pkg_resources import resource_stream
 from io import TextIOWrapper
 from .__version__ import __version__
@@ -256,9 +256,9 @@ def write_json(data, file_name, indent=(None if os.environ.get("AUGUR_MINIFY_JSO
 
     if include_version:
         data["generated_by"] = {"program": "augur", "version": get_augur_version()}
-
     with open(file_name, 'w', encoding='utf-8') as handle:
-        json.dump(data, handle, indent=indent, sort_keys=True)
+        sort_keys = False if isinstance(data, OrderedDict) else True
+        json.dump(data, handle, indent=indent, sort_keys=sort_keys)
 
 
 def load_features(reference, feature_names=None):


=====================================
augur/validate.py
=====================================
@@ -43,11 +43,12 @@ def load_json_schema(path):
     except json.JSONDecodeError as err:
         raise ValidateError("Schema {} is not a valid JSON file. Error: {}".format(path, err))
     # check loaded schema is itself valid -- see http://python-jsonschema.readthedocs.io/en/latest/errors/
+    Validator = jsonschema.validators.validator_for(schema)
     try:
-        jsonschema.Draft6Validator.check_schema(schema)
+        Validator.check_schema(schema)
     except jsonschema.exceptions.SchemaError as err:
-        raise ValidateError("Schema {} is not a valid JSON file. Error: {}".format(path, err))
-    return jsonschema.Draft6Validator(schema)
+        raise ValidateError(f"Schema {path} is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}")
+    return Validator(schema)
 
 def load_json(path):
     with open(path, 'rb') as fh:
@@ -118,6 +119,13 @@ def export_v1(meta_json, tree_json, **kwargs):
         print("Validation of {!r} and {!r} succeeded, but there were warnings you may want to resolve.".format(meta_json, tree_json))
 
 
+def measurements(measurements_json, **kwargs):
+    schema = load_json_schema("schema-measurements.json")
+    measurements = load_json(measurements_json)
+    validate_json(measurements, schema, measurements_json)
+    return measurements
+
+
 def register_arguments(parser):
     subparsers = parser.add_subparsers(dest="subcommand", help="Which file(s) do you want to validate?")
 
@@ -131,6 +139,8 @@ def register_arguments(parser):
     subparsers.add_parser("auspice-config-v2", help="validate auspice config intended for `augur export v2`") \
         .add_argument('config_json', metavar='JSON', help="auspice config JSON")
 
+    subparsers.add_parser("measurements", help="validate measurements JSON intended for auspice measurements panel") \
+        .add_argument("measurements_json", metavar="JSON", help="exported measurements JSON")
 
 def run(args):
     try:


=====================================
tests/functional/export_v2.t
=====================================
@@ -30,4 +30,38 @@ Export with auspice config JSON which defines scale & legend settings
 
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset1.json "$TMP/dataset1.json" \
   >   --exclude-paths "root['meta']['updated']"
-  {}
\ No newline at end of file
+  {}
+
+
+Export with auspice config JSON with an extensions block
+  $ ${AUGUR} export v2 \
+  >   --tree export_v2/tree.nwk \
+  >   --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >   --auspice-config export_v2/auspice_config2.json \
+  >   --output "$TMP/dataset2.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset2.json "$TMP/dataset2.json" \
+  >   --exclude-paths "root['meta']['updated']"
+  {}
+
+# auspice_config3.json is the same as auspice_config2.json but with an extra key which the schema does not allow.
+# Running without --skip-validation should result in an error
+# Message printed: "Validation of export_v2/auspice_config3.json failed."
+  $ ${AUGUR} export v2 \
+  >   --tree export_v2/tree.nwk \
+  >   --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >   --auspice-config export_v2/auspice_config3.json \
+  >   --output "$TMP/dataset2.json" &>/dev/null
+  [2]
+
+# Skipping validation gives us the same results as `auspice_config2.json`
+  $ ${AUGUR} export v2 \
+  >   --tree export_v2/tree.nwk \
+  >   --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >   --auspice-config export_v2/auspice_config3.json \
+  >   --output "$TMP/dataset3.json" \
+  >   --skip-validation &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset2.json "$TMP/dataset3.json" \
+  >   --exclude-paths "root['meta']['updated']"
+  {}


=====================================
tests/functional/export_v2/auspice_config2.json
=====================================
@@ -0,0 +1,36 @@
+{
+  "title": "Minimal config with an extensions block",
+  "colorings": [
+    {
+      "key": "location",
+      "title": "Location",
+      "type": "categorical",
+      "legend": [
+        {"value": "alpha", "display": "α"},
+        {"value": "beta"}
+      ],
+      "scale": [
+        ["beta", "#bd0026"],
+        ["gamma", "#6a51a3"]
+      ]
+    },
+    {
+      "key": "mutation_length",
+      "title": "Mutations per branch",
+      "type": "continuous",
+      "legend": [
+        {"value": 1, "display": "0-2", "bounds": [-1,2]},
+        {"value": 3, "display": "3-5", "bounds": [2,5]},
+        {"value": 5, "display": ">5", "bounds": [5, 10]}
+      ],
+      "scale": [
+        [1, "#081d58"],
+        [3, "#1d91c0"],
+        [5, "#c7e9b4"]
+      ]
+    }
+  ],
+  "extensions": {
+    "some_key": "some_value"
+  }
+}
\ No newline at end of file


=====================================
tests/functional/export_v2/auspice_config3.json
=====================================
@@ -0,0 +1,37 @@
+{
+  "title": "Minimal config with an extensions block",
+  "colorings": [
+    {
+      "key": "location",
+      "title": "Location",
+      "type": "categorical",
+      "legend": [
+        {"value": "alpha", "display": "α"},
+        {"value": "beta"}
+      ],
+      "scale": [
+        ["beta", "#bd0026"],
+        ["gamma", "#6a51a3"]
+      ]
+    },
+    {
+      "key": "mutation_length",
+      "title": "Mutations per branch",
+      "type": "continuous",
+      "legend": [
+        {"value": 1, "display": "0-2", "bounds": [-1,2]},
+        {"value": 3, "display": "3-5", "bounds": [2,5]},
+        {"value": 5, "display": ">5", "bounds": [5, 10]}
+      ],
+      "scale": [
+        [1, "#081d58"],
+        [3, "#1d91c0"],
+        [5, "#c7e9b4"]
+      ]
+    }
+  ],
+  "extensions": {
+    "some_key": "some_value"
+  },
+  "this_is_not_allowed_by_the_schema": "but is used to test --skip-validation"
+}
\ No newline at end of file


=====================================
tests/functional/export_v2/dataset2.json
=====================================
@@ -0,0 +1,228 @@
+{
+  "meta": {
+    "colorings": [
+      {
+        "key": "location",
+        "legend": [
+          {
+            "display": "\u03b1",
+            "value": "alpha"
+          },
+          {
+            "value": "beta"
+          }
+        ],
+        "scale": [
+          [
+            "beta",
+            "#bd0026"
+          ],
+          [
+            "gamma",
+            "#6a51a3"
+          ]
+        ],
+        "title": "Location",
+        "type": "categorical"
+      },
+      {
+        "key": "mutation_length",
+        "legend": [
+          {
+            "bounds": [
+              -1,
+              2
+            ],
+            "display": "0-2",
+            "value": 1
+          },
+          {
+            "bounds": [
+              2,
+              5
+            ],
+            "display": "3-5",
+            "value": 3
+          },
+          {
+            "bounds": [
+              5,
+              10
+            ],
+            "display": ">5",
+            "value": 5
+          }
+        ],
+        "scale": [
+          [
+            1,
+            "#081d58"
+          ],
+          [
+            3,
+            "#1d91c0"
+          ],
+          [
+            5,
+            "#c7e9b4"
+          ]
+        ],
+        "title": "Mutations per branch",
+        "type": "continuous"
+      },
+      {
+        "key": "location",
+        "legend": [
+          {
+            "display": "\u03b1",
+            "value": "alpha"
+          },
+          {
+            "value": "beta"
+          }
+        ],
+        "scale": [
+          [
+            "beta",
+            "#bd0026"
+          ],
+          [
+            "gamma",
+            "#6a51a3"
+          ]
+        ],
+        "title": "Location",
+        "type": "categorical"
+      }
+    ],
+    "filters": [
+      "location"
+    ],
+    "panels": [
+      "tree"
+    ],
+    "title": "Minimal config with an extensions block",
+    "updated": "2021-06-09"
+  },
+  "tree": {
+    "branch_attrs": {},
+    "children": [
+      {
+        "branch_attrs": {},
+        "name": "tipA",
+        "node_attrs": {
+          "div": 1,
+          "location": {
+            "value": "delta"
+          },
+          "mutation_length": {
+            "value": 1
+          }
+        }
+      },
+      {
+        "branch_attrs": {},
+        "children": [
+          {
+            "branch_attrs": {},
+            "name": "tipB",
+            "node_attrs": {
+              "div": 3,
+              "location": {
+                "value": "gamma"
+              },
+              "mutation_length": {
+                "value": 1
+              }
+            }
+          },
+          {
+            "branch_attrs": {},
+            "name": "tipC",
+            "node_attrs": {
+              "div": 3,
+              "location": {
+                "value": "gamma"
+              },
+              "mutation_length": {
+                "value": 1
+              }
+            }
+          }
+        ],
+        "name": "internalBC",
+        "node_attrs": {
+          "div": 2,
+          "mutation_length": {
+            "value": 2
+          }
+        }
+      },
+      {
+        "branch_attrs": {},
+        "children": [
+          {
+            "branch_attrs": {},
+            "name": "tipD",
+            "node_attrs": {
+              "div": 8,
+              "location": {
+                "value": "alpha"
+              },
+              "mutation_length": {
+                "value": 3
+              }
+            }
+          },
+          {
+            "branch_attrs": {},
+            "name": "tipE",
+            "node_attrs": {
+              "div": 9,
+              "location": {
+                "value": "alpha"
+              },
+              "mutation_length": {
+                "value": 4
+              }
+            }
+          },
+          {
+            "branch_attrs": {},
+            "name": "tipF",
+            "node_attrs": {
+              "div": 6,
+              "location": {
+                "value": "beta"
+              },
+              "mutation_length": {
+                "value": 1
+              }
+            }
+          }
+        ],
+        "name": "internalDEF",
+        "node_attrs": {
+          "div": 5,
+          "location": {
+            "value": "alpha"
+          },
+          "mutation_length": {
+            "value": 5
+          }
+        }
+      }
+    ],
+    "name": "ROOT",
+    "node_attrs": {
+      "div": 0,
+      "mutation_length": {
+        "value": 0
+      }
+    }
+  },
+  "version": "v2",
+  "extensions": {
+    "some_key": "some_value"
+  }
+}
\ No newline at end of file


=====================================
tests/functional/filter.t
=====================================
@@ -101,6 +101,7 @@ Explicitly use probabilistic subsampling to handle the case when there are more
   >  --subsample-seed 314159 \
   >  --probabilistic-sampling \
   >  --output-strains "$TMP/filtered_strains_probabilistic.txt" > /dev/null
+  WARNING: Asked to provide at most 5 sequences, but there are 8 groups.
 
 Using the default probabilistic subsampling, should work the same as the previous case.
 
@@ -113,6 +114,7 @@ Using the default probabilistic subsampling, should work the same as the previou
   >  --subsample-max-sequences 5 \
   >  --subsample-seed 314159 \
   >  --output-strains "$TMP/filtered_strains_default.txt" > /dev/null
+  WARNING: Asked to provide at most 5 sequences, but there are 8 groups.
 
 By setting the subsample seed above, we should get the same results for both runs.
 
@@ -394,6 +396,7 @@ Strains with ambiguous years or months should be dropped and logged.
   >  --subsample-max-sequences 5 \
   >  --output-strains "$TMP/filtered_strains.txt" \
   >  --output-log "$TMP/filtered_log.tsv" > /dev/null
+  WARNING: Asked to provide at most 5 sequences, but there are 6 groups.
   $ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2
   SG_018\tskip_group_by_with_ambiguous_month (esc)
   $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2


=====================================
tests/test_schemas.py
=====================================
@@ -0,0 +1,14 @@
+import json
+import jsonschema.validators
+import pytest
+from pathlib import Path
+
+schemas = list(Path("augur/data/").glob("schema-*.json"))
+
+@pytest.mark.parametrize("schema_path", schemas, ids = lambda schema_path: str(schema_path))
+def test_schema_is_valid(schema_path):
+    with schema_path.open("rb") as schema_fh:
+        schema = json.load(schema_fh)
+
+    Validator = jsonschema.validators.validator_for(schema)
+    Validator.check_schema(schema)



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19a7749f42d916ea13e82046c21ad868e5c37d1

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/d19a7749f42d916ea13e82046c21ad868e5c37d1
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220408/caaf069b/attachment-0001.htm>