[med-svn] [Git][med-team/augur][upstream] New upstream version 11.0.0

Tue Jan 26 05:55:28 GMT 2021


Nilesh Patra pushed to branch upstream at Debian Med / augur


Commits:
ca24b0bd by Nilesh Patra at 2021-01-26T11:13:39+05:30
New upstream version 11.0.0
- - - - -


8 changed files:

- CHANGES.md
- augur/__version__.py
- augur/filter.py
- tests/builds/tb/Snakefile
- tests/builds/various_export_settings/base.snakefile
- tests/builds/zika.t
- − tests/builds/zika/Snakefile
- tests/functional/filter.t


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,14 @@
 ## __NEXT__
 
 
+## 11.0.0 (22 January 2021)
+
+### Major Changes
+
+* filter: Use probabilistic sampling by default when requesting a maximum number of sequences to subsample with `--subsample-max-sequences`. Adds `--no-probabilistic-sampling` flag to disable this default behavior and prevent users from requesting fewer maximum sequences than there are subsampling groups. [#659][]
+
+[#659]: https://github.com/nextstrain/augur/pull/659
+
 ## 10.3.0 (14 January 2021)
 
 ### Bug Fixes


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '10.3.0'
+__version__ = '11.0.0'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/filter.py
=====================================
@@ -100,7 +100,9 @@ def register_arguments(parser):
     subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
     subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
     parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
-    parser.add_argument('--probabilistic-sampling', action='store_true', help="Sample probabilitically from groups -- useful when there are more groups than requested sequences")
+    probabilistic_sampling_group = parser.add_mutually_exclusive_group()
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
     parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
     parser.add_argument('--exclude-where', nargs='+',
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
@@ -111,6 +113,7 @@ def register_arguments(parser):
     parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
     parser.add_argument('--output', '-o', help="output file", required=True)
 
+    parser.set_defaults(probabilistic_sampling=True)
 
 def run(args):
     '''


=====================================
tests/builds/tb/Snakefile
=====================================
@@ -49,6 +49,7 @@ rule filter:
             --exclude {input.exclude} \
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
+            --no-probabilistic-sampling
         """
 
 rule mask:


=====================================
tests/builds/various_export_settings/base.snakefile
=====================================
@@ -50,6 +50,7 @@ rule filter:
             --output {output.sequences} \
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
+            --no-probabilistic-sampling \
             --min-date {params.min_date}
         """
 
@@ -112,4 +113,3 @@ rule refine:
             --date-inference {params.date_inference} \
             --clock-filter-iqd {params.clock_filter_iqd}
         """
-


=====================================
tests/builds/zika.t
=====================================
@@ -30,6 +30,7 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
   >   --group-by country year month \
   >   --sequences-per-group 1 \
   >   --subsample-seed 314159 \
+  >   --no-probabilistic-sampling \
   >   --min-date 2012 > /dev/null
 
   $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
@@ -178,4 +179,4 @@ Export JSON files as v2 auspice outputs.
 
 Switch back to the original directory where testing started.
 
-  $ popd > /dev/null
\ No newline at end of file
+  $ popd > /dev/null


=====================================
tests/builds/zika/Snakefile deleted
=====================================
@@ -1,279 +0,0 @@
-rule all:
-    input:
-        auspice_tip_frequencies = "auspice/zika_tip-frequencies.json",
-        auspice_v1_tree = "auspice/v1_zika_tree.json",
-        auspice_v1_meta = "auspice/v1_zika_meta.json",
-        auspice_v1_seq = "auspice/v1_zika_seq.json",
-        auspice_v1_tip_frequencies = "auspice/v1_zika_tip-frequencies.json",
-        auspice_v2 = "auspice/v2_zika.json",
-        auspice_v2_tip_frequencies = "auspice/v2_zika_tip-frequencies.json"
-rule files:
-    params:
-        input_fasta = "data/zika.fasta",
-        dropped_strains = "config/dropped_strains.txt",
-        reference = "config/zika_outgroup.gb",
-        colors = "config/colors.tsv",
-        auspice_config_v1 = "config/auspice_config_v1.json",
-        auspice_config_v2 = "config/auspice_config_v2.json"
-
-files = rules.files.params
-
-rule parse:
-    input:
-        sequences = files.input_fasta
-    output:
-        sequences = "results/sequences.fasta",
-        metadata = "results/metadata.tsv"
-    params:
-        fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url",
-        prettify_fields = "region country division city"
-    shell:
-        """
-        augur parse \
-            --sequences {input.sequences} \
-            --output-sequences {output.sequences} \
-            --output-metadata {output.metadata} \
-            --fields {params.fasta_fields} \
-            --prettify-fields {params.prettify_fields}
-        """
-
-rule filter:
-    message: "Filtering sequences"
-    input:
-        sequences = rules.parse.output.sequences,
-        metadata = rules.parse.output.metadata,
-        exclude = files.dropped_strains
-    output:
-        sequences = "results/filtered.fasta"
-    params:
-        group_by = "country year month",
-        sequences_per_group = 1,
-        min_date = 2012
-    shell:
-        """
-        augur filter \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --exclude {input.exclude} \
-            --output {output.sequences} \
-            --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
-            --subsample-seed 314159 \
-            --min-date {params.min_date}
-        """
-
-rule align:
-    message: "Aligning sequences"
-    input:
-        sequences = rules.filter.output.sequences,
-        reference = files.reference
-    output:
-        alignment = "results/aligned.fasta"
-    shell:
-        """
-        augur align \
-            --sequences {input.sequences} \
-            --reference-sequence {input.reference} \
-            --output {output.alignment} \
-            --fill-gaps
-        """
-
-rule tree:
-    message: "Building tree"
-    input:
-        alignment = rules.align.output.alignment
-    output:
-        tree = "results/tree_raw.nwk"
-    params:
-        method = "iqtree"
-    shell:
-        """
-        augur tree \
-            --alignment {input.alignment} \
-            --output {output.tree} \
-            --method {params.method} \
-            --tree-builder-args "-seed 314159"
-        """
-
-rule refine:
-    message: "Refining tree"
-    input:
-        tree = rules.tree.output.tree,
-        alignment = rules.align.output,
-        metadata = rules.parse.output.metadata
-    output:
-        tree = "results/tree.nwk",
-        node_data = "results/branch_lengths.json"
-    params:
-        coalescent = "opt",
-        date_inference = "marginal",
-        clock_filter_iqd = 4
-    shell:
-        """
-        augur refine \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --metadata {input.metadata} \
-            --output-tree {output.tree} \
-            --output-node-data {output.node_data} \
-            --timetree \
-            --coalescent {params.coalescent} \
-            --date-confidence \
-            --date-inference {params.date_inference} \
-            --clock-filter-iqd {params.clock_filter_iqd} \
-            --seed 314159
-        """
-
-rule ancestral:
-    message: "Reconstructing ancestral sequences and mutations"
-    input:
-        tree = rules.refine.output.tree,
-        alignment = rules.align.output
-    output:
-        node_data = "results/nt_muts.json"
-    params:
-        inference = "joint"
-    shell:
-        """
-        augur ancestral \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --infer-ambiguous \
-            --output-node-data {output.node_data} \
-            --inference {params.inference}
-        """
-
-rule translate:
-    message: "Translating amino acid sequences"
-    input:
-        tree = rules.refine.output.tree,
-        node_data = rules.ancestral.output.node_data,
-        reference = files.reference
-    output:
-        node_data = "results/aa_muts.json"
-    shell:
-        """
-        augur translate \
-            --tree {input.tree} \
-            --ancestral-sequences {input.node_data} \
-            --reference-sequence {input.reference} \
-            --output-node-data {output.node_data} \
-        """
-
-rule traits:
-    message: "Inferring ancestral traits"
-    input:
-        tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata
-    output:
-        node_data = "results/traits.json",
-    params:
-        columns = "country region",
-        sampling_bias_correction = 3,
-        weights = "config/trait_weights.csv"
-    shell:
-        """
-        augur traits \
-            --tree {input.tree} \
-            --weights {params.weights} \
-            --metadata {input.metadata} \
-            --output-node-data {output.node_data} \
-            --columns {params.columns} \
-            --sampling-bias-correction {params.sampling_bias_correction} \
-            --confidence
-        """
-
-rule export_v1:
-    message: "Exporting data files for for auspice using nextflu compatible schema"
-    input:
-        tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata,
-        branch_lengths = rules.refine.output.node_data,
-        traits = rules.traits.output.node_data,
-        nt_muts = rules.ancestral.output.node_data,
-        aa_muts = rules.translate.output.node_data,
-        colors = files.colors,
-        auspice_config = files.auspice_config_v1
-    output:
-        auspice_tree = rules.all.input.auspice_v1_tree,
-        auspice_meta = rules.all.input.auspice_v1_meta,
-        auspice_seq = rules.all.input.auspice_v1_seq
-    shell:
-        """
-        augur export v1 \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
-            --colors {input.colors} \
-            --auspice-config {input.auspice_config} \
-            --output-tree {output.auspice_tree} \
-            --output-meta {output.auspice_meta} \
-            --output-sequence {output.auspice_seq}
-        augur validate export-v1 {output.auspice_meta} {output.auspice_tree}
-        """
-
-rule export_v2:
-    message: "Exporting data files for for auspice using nextstrain schema v2"
-    input:
-        tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata,
-        branch_lengths = rules.refine.output.node_data,
-        traits = rules.traits.output.node_data,
-        nt_muts = rules.ancestral.output.node_data,
-        aa_muts = rules.translate.output.node_data,
-        auspice_config = files.auspice_config_v2,
-        colors = files.colors
-    params:
-        title = "Real-time tracking of Zika virus evolution -- v2 JSON"
-    output:
-        auspice_main = rules.all.input.auspice_v2
-    shell:
-        """
-        augur export v2 \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
-            --colors {input.colors} \
-            --auspice-config {input.auspice_config} \
-            --output {output.auspice_main} \
-            --title {params.title:q} \
-            --panels tree map entropy frequencies
-        """
-
-rule tip_frequencies:
-    input:
-        tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata
-    params:
-        pivot_interval = 3
-    output:
-        tip_freq = rules.all.input.auspice_tip_frequencies
-    shell:
-        """
-        augur frequencies \
-            --method kde \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --pivot-interval {params.pivot_interval} \
-            --output {output}
-        """
-
-rule copy_tip_frequencies:
-    input:
-        frequencies = rules.tip_frequencies.output.tip_freq
-    output:
-        v1 = rules.all.input.auspice_v1_tip_frequencies,
-        v2 = rules.all.input.auspice_v2_tip_frequencies
-    shell:
-        """
-        cp -f {input} {output.v1};
-        cp -f {input} {output.v2};
-        """
-
-rule clean:
-    message: "Removing directories: {params}"
-    params:
-        "results ",
-        "auspice"
-    shell:
-        "rm -rfv {params}"


=====================================
tests/functional/filter.t
=====================================
@@ -13,13 +13,14 @@ With 10 groups to subsample from, this should produce one sequence per group.
   >  --group-by country year month \
   >  --subsample-max-sequences 10 \
   >  --subsample-seed 314159 \
+  >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ grep ">" "$TMP/filtered.fasta" | wc -l
-  10
+  \s*10 (re)
   $ rm -f "$TMP/filtered.fasta"
 
 Try to filter with subsampling when there are more available groups than requested sequences.
-This should fail.
+This should fail, as probabilistic sampling is explicitly disabled.
 
   $ ${AUGUR} filter \
   >  --sequences filter/sequences.fasta \
@@ -28,12 +29,13 @@ This should fail.
   >  --group-by country year month \
   >  --subsample-max-sequences 5 \
   >  --subsample-seed 314159 \
+  >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta"
   ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
   [1]
   $ rm -f "$TMP/filtered.fasta"
 
-Use probabilistic subsampling to handle the case when there are more available groups than requested sequences.
+Explicitly use probabilistic subsampling to handle the case when there are more available groups than requested sequences.
 
   $ ${AUGUR} filter \
   >  --sequences filter/sequences.fasta \
@@ -45,3 +47,15 @@ Use probabilistic subsampling to handle the case when there are more available g
   >  --probabilistic-sampling \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ rm -f "$TMP/filtered.fasta"
+
+Using the default probabilistic subsampling, should work the same as the previous case.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --group-by country year month \
+  >  --subsample-max-sequences 5 \
+  >  --subsample-seed 314159 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ rm -f "$TMP/filtered.fasta"



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/ca24b0bda423927b330a95d986722a704914e5d4

-- 
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/ca24b0bda423927b330a95d986722a704914e5d4
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210126/bba8d2e0/attachment-0001.html>