[med-svn] [Git][med-team/augur][upstream] New upstream version 18.1.2

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sun Nov 6 13:02:03 GMT 2022



Étienne Mollier pushed to branch upstream at Debian Med / augur


Commits:
87a22b0a by Étienne Mollier at 2022-11-06T10:47:38+01:00
New upstream version 18.1.2
- - - - -


8 changed files:

- CHANGES.md
- augur/__version__.py
- augur/filter.py
- augur/traits.py
- + tests/functional/filter/cram/subsample-ambiguous-dates-error.t
- tests/functional/traits.t
- + tests/functional/traits/metadata_with_missing_region.tsv
- + tests/functional/traits/traits_with_missing_region.json


Changes:

=====================================
CHANGES.md
=====================================
@@ -3,6 +3,22 @@
 ## __NEXT__
 
 
+## 18.1.2 (1 November 2022)
+
+### Bug Fixes
+
+* traits: Fix trait inference when tips have missing values. [#1081][] (@huddlej)
+
+[#1081]: https://github.com/nextstrain/augur/pull/1081
+
+## 18.1.1 (1 November 2022)
+
+### Bug Fixes
+
+* filter: Fixed a bug where `--group-by week` would fail when all samples in a chunk have been dropped due to ambiguous dates. [#1080][] (@victorlin)
+
+[#1080]: https://github.com/nextstrain/augur/pull/1080
+
 ## 18.1.0 (26 October 2022)
 
 ### Features


=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '18.1.0'
+__version__ = '18.1.2'
 
 
 def is_augur_version_compatible(version):


=====================================
augur/filter.py
=====================================
@@ -1058,6 +1058,10 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
             metadata.drop([record['strain'] for record in ambiguous_date_strains], inplace=True)
             skipped_strains.extend(ambiguous_date_strains)
 
+            # Check again if metadata is empty after dropping ambiguous dates.
+            if metadata.empty:
+                return group_by_strain, skipped_strains
+
             # Generate columns.
             if 'year' in generated_columns_requested:
                 metadata['year'] = metadata[f'{temp_prefix}year']


=====================================
augur/traits.py
=====================================
@@ -51,7 +51,7 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr
     traits = {}
     nodes = {n.name:n for n in T.get_terminals()}
     for name, meta in seq_meta.iterrows():
-        if field in meta and name in nodes:
+        if field in meta and name in nodes and meta[field] != missing:
             traits[name] = meta[field]
     unique_states = list(set(traits.values()))
 


=====================================
tests/functional/filter/cram/subsample-ambiguous-dates-error.t
=====================================
@@ -0,0 +1,88 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Metadata with ambiguous days on all strains should error when grouping by week.
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	date
+  > SEQ1	2000-01-XX
+  > SEQ2	2000-02-XX
+  > SEQ3	2000-03-XX
+  > SEQ4	2000-04-XX
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata $TMP/metadata.tsv \
+  >   --group-by week \
+  >   --sequences-per-group 1 \
+  >   --subsample-seed 0 \
+  >   --output-metadata $TMP/metadata-filtered.tsv \
+  >   --output-log $TMP/filtered_log.tsv
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  4 strains were dropped during filtering
+  \t4 were dropped during grouping due to ambiguous day information (esc)
+  \t0 of these were dropped because of subsampling criteria (esc)
+  [1]
+  $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_day" | wc -l
+  \s*4 (re)
+  $ cat $TMP/metadata-filtered.tsv
+  strain	date
+  $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv
+
+Metadata with ambiguous months on all strains should error when grouping by month.
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	date
+  > SEQ1	2000-XX-XX
+  > SEQ2	2000-XX-XX
+  > SEQ3	2000-XX-XX
+  > SEQ4	2000-XX-XX
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata $TMP/metadata.tsv \
+  >   --group-by month \
+  >   --sequences-per-group 1 \
+  >   --subsample-seed 0 \
+  >   --output-metadata $TMP/metadata-filtered.tsv \
+  >   --output-log $TMP/filtered_log.tsv
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  4 strains were dropped during filtering
+  \t4 were dropped during grouping due to ambiguous month information (esc)
+  \t0 of these were dropped because of subsampling criteria (esc)
+  [1]
+  $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_month" | wc -l
+  \s*4 (re)
+  $ cat $TMP/metadata-filtered.tsv
+  strain	date
+  $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv
+
+Metadata with ambiguous years on all strains should error when grouping by year.
+
+  $ cat >$TMP/metadata.tsv <<~~
+  > strain	date
+  > SEQ1	XXXX-XX-XX
+  > SEQ2	XXXX-XX-XX
+  > SEQ3	XXXX-XX-XX
+  > SEQ4	XXXX-XX-XX
+  > ~~
+
+  $ ${AUGUR} filter \
+  >   --metadata $TMP/metadata.tsv \
+  >   --group-by year \
+  >   --sequences-per-group 1 \
+  >   --subsample-seed 0 \
+  >   --output-metadata $TMP/metadata-filtered.tsv \
+  >   --output-log $TMP/filtered_log.tsv
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  4 strains were dropped during filtering
+  \t4 were dropped during grouping due to ambiguous year information (esc)
+  \t0 of these were dropped because of subsampling criteria (esc)
+  [1]
+  $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_year" | wc -l
+  \s*4 (re)
+  $ cat $TMP/metadata-filtered.tsv
+  strain	date
+  $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv


=====================================
tests/functional/traits.t
=====================================
@@ -15,6 +15,20 @@ Infer the ancestral region for a given tree and metadata.
   {}
   $ rm -f "$TMP/traits.json"
 
+Infer the ancestral region for a tree and metadata where one or more records have a missing value ("?") in the region field.
+Tips with missing values should get their values inferred, too.
+In this case, a sample from Panama (North America) has its region inferred as "South America".
+
+  $ ${AUGUR} traits \
+  >  --metadata "traits/metadata_with_missing_region.tsv" \
+  >  --tree "traits/tree.nwk" \
+  >  --columns region \
+  >  --output-node-data "$TMP/traits.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "traits/traits_with_missing_region.json" "$TMP/traits.json" --significant-digits 2
+  {}
+  $ rm -f "$TMP/traits.json"
+
 Infer the ancestral "virus" value from the same metadata.
 Since there is only a single virus in the data, Augur warns the user through stderr.
 
@@ -44,6 +58,19 @@ This should similarly warn the user through stderr, but it should produce an err
   {}
   $ rm -f "$TMP/traits.json"
 
+Infer the ancestral "virus" value from the metadata after replacing the "zika" values with missing data values ("?").
+Augur should warn that there were no discrete states found for reconstruction, since "?" is not a valid state on its own.
+
+  $ sed 's/zika/?/' traits/metadata.tsv > "$TMP/metadata_with_missing_virus.tsv"
+  $ ${AUGUR} traits \
+  >  --metadata "$TMP/metadata_with_missing_virus.tsv" \
+  >  --tree "traits/tree.nwk" \
+  >  --columns virus \
+  >  --output-node-data "$TMP/traits.json" > /dev/null
+  WARNING: no states found for discrete state reconstruction.
+
+  $ rm -f "$TMP/traits.json"
+
 Switch back to the original directory where testing started.
 
   $ popd > /dev/null


=====================================
tests/functional/traits/metadata_with_missing_region.tsv
=====================================
@@ -0,0 +1,13 @@
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+PAN/CDC_259359_V1_V3/2015	zika	KX156774	2015-12-18	?	Panama	Panama	Panama	genbank	genome	Shabman et al	https://www.ncbi.nlm.nih.gov/nuccore/KX156774	Direct Submission	Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+COL/FLR_00024/2015	zika	MF574569	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+PRVABC59	zika	KU501215	2015-12-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	genbank	genome	Lanciotti et al	https://www.ncbi.nlm.nih.gov/nuccore/KU501215	Phylogeny of Zika Virus in Western Hemisphere, 2015	Emerging Infect. Dis. 22 (5), 933-935 (2016)	https://www.ncbi.nlm.nih.gov/pubmed/27088323
+COL/FLR_00008/2015	zika	MF574562	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574562	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+Colombia/2016/ZC204Se	zika	KY317939	2016-01-06	South America	Colombia	Colombia	Colombia	genbank	genome	Quick et al	https://www.ncbi.nlm.nih.gov/nuccore/KY317939	Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples	Nat Protoc 12 (6), 1261-1276 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538739
+ZKC2/2016	zika	KX253996	2016-02-16	Oceania	American Samoa	American Samoa	American Samoa	genbank	genome	Wu et al	https://www.ncbi.nlm.nih.gov/nuccore/KX253996	Direct Submission	Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China	https://www.ncbi.nlm.nih.gov/pubmed/
+VEN/UF_1/2016	zika	KX702400	2016-03-25	South America	Venezuela	Venezuela	Venezuela	genbank	genome	Blohm et al	https://www.ncbi.nlm.nih.gov/nuccore/KX702400	Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant	Genome Announc 5 (17), e00231-17 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28450510
+DOM/2016/BB_0059	zika	KY785425	2016-04-04	North America	Dominican Republic	Dominican Republic	Dominican Republic	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785425	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+BRA/2016/FC_6706	zika	KY785433	2016-04-08	South America	Brazil	Brazil	Brazil	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785433	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+DOM/2016/BB_0183	zika	KY785420	2016-04-18	North America	Dominican Republic	Dominican Republic	Dominican Republic	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785420	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+EcEs062_16	zika	KX879603	2016-04-XX	South America	Ecuador	Ecuador	Ecuador	genbank	genome	Marquez et al	https://www.ncbi.nlm.nih.gov/nuccore/KX879603	First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador	Genome Announc 5 (8), e01673-16 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28232448
+HND/2016/HU_ME59	zika	KY785418	2016-05-13	North America	Honduras	Honduras	Honduras	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785418	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734


=====================================
tests/functional/traits/traits_with_missing_region.json
=====================================
@@ -0,0 +1,95 @@
+{
+  "generated_by": {
+    "program": "augur",
+    "version": "18.1.0"
+  },
+  "models": {
+    "region": {
+      "alphabet": [
+        "North America",
+        "Oceania",
+        "South America",
+        "?"
+      ],
+      "equilibrium_probabilities": [
+        0.2937135406086192,
+        0.2682094003586489,
+        0.4380770590327319
+      ],
+      "rate": 256.91544578481756,
+      "transition_matrix": [
+        [
+          0.0,
+          1.0936742264236954,
+          2.2244614853809708
+        ],
+        [
+          1.0936742264236954,
+          0.0,
+          1.086197824238873
+        ],
+        [
+          2.2244614853809708,
+          1.086197824238873,
+          0.0
+        ]
+      ]
+    }
+  },
+  "nodes": {
+    "BRA/2016/FC_6706": {
+      "region": "South America"
+    },
+    "COL/FLR_00008/2015": {
+      "region": "South America"
+    },
+    "Colombia/2016/ZC204Se": {
+      "region": "South America"
+    },
+    "DOM/2016/BB_0183": {
+      "region": "North America"
+    },
+    "EcEs062_16": {
+      "region": "South America"
+    },
+    "HND/2016/HU_ME59": {
+      "region": "North America"
+    },
+    "NODE_0000001": {
+      "region": "South America"
+    },
+    "NODE_0000002": {
+      "region": "North America"
+    },
+    "NODE_0000003": {
+      "region": "North America"
+    },
+    "NODE_0000004": {
+      "region": "North America"
+    },
+    "NODE_0000005": {
+      "region": "South America"
+    },
+    "NODE_0000006": {
+      "region": "South America"
+    },
+    "NODE_0000007": {
+      "region": "South America"
+    },
+    "NODE_0000008": {
+      "region": "South America"
+    },
+    "PAN/CDC_259359_V1_V3/2015": {
+      "region": "South America"
+    },
+    "PRVABC59": {
+      "region": "North America"
+    },
+    "VEN/UF_1/2016": {
+      "region": "South America"
+    },
+    "ZKC2/2016": {
+      "region": "Oceania"
+    }
+  }
+}
\ No newline at end of file



View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/87a22b0a594348e304a318648737770a421cafea

-- 
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221106/ac2a725f/attachment-0001.htm>


More information about the debian-med-commit mailing list