[med-svn] [Git][med-team/augur][upstream] New upstream version 18.1.2
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sun Nov 6 13:02:03 GMT 2022
Étienne Mollier pushed to branch upstream at Debian Med / augur
Commits:
87a22b0a by Étienne Mollier at 2022-11-06T10:47:38+01:00
New upstream version 18.1.2
- - - - -
8 changed files:
- CHANGES.md
- augur/__version__.py
- augur/filter.py
- augur/traits.py
- + tests/functional/filter/cram/subsample-ambiguous-dates-error.t
- tests/functional/traits.t
- + tests/functional/traits/metadata_with_missing_region.tsv
- + tests/functional/traits/traits_with_missing_region.json
Changes:
=====================================
CHANGES.md
=====================================
@@ -3,6 +3,22 @@
## __NEXT__
+## 18.1.2 (1 November 2022)
+
+### Bug Fixes
+
+* traits: Fix trait inference when tips have missing values. [#1081][] (@huddlej)
+
+[#1081]: https://github.com/nextstrain/augur/pull/1081
+
+## 18.1.1 (1 November 2022)
+
+### Bug Fixes
+
+* filter: Fixed a bug where `--group-by week` would fail when all samples in a chunk have been dropped due to ambiguous dates. [#1080][] (@victorlin)
+
+[#1080]: https://github.com/nextstrain/augur/pull/1080
+
## 18.1.0 (26 October 2022)
### Features
=====================================
augur/__version__.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = '18.1.0'
+__version__ = '18.1.2'
def is_augur_version_compatible(version):
=====================================
augur/filter.py
=====================================
@@ -1058,6 +1058,10 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
metadata.drop([record['strain'] for record in ambiguous_date_strains], inplace=True)
skipped_strains.extend(ambiguous_date_strains)
+ # Check again if metadata is empty after dropping ambiguous dates.
+ if metadata.empty:
+ return group_by_strain, skipped_strains
+
# Generate columns.
if 'year' in generated_columns_requested:
metadata['year'] = metadata[f'{temp_prefix}year']
=====================================
augur/traits.py
=====================================
@@ -51,7 +51,7 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr
traits = {}
nodes = {n.name:n for n in T.get_terminals()}
for name, meta in seq_meta.iterrows():
- if field in meta and name in nodes:
+ if field in meta and name in nodes and meta[field] != missing:
traits[name] = meta[field]
unique_states = list(set(traits.values()))
=====================================
tests/functional/filter/cram/subsample-ambiguous-dates-error.t
=====================================
@@ -0,0 +1,88 @@
+Setup
+
+ $ pushd "$TESTDIR" > /dev/null
+ $ source _setup.sh
+
+Metadata with ambiguous days on all strains should error when grouping by week.
+
+ $ cat >$TMP/metadata.tsv <<~~
+ > strain date
+ > SEQ1 2000-01-XX
+ > SEQ2 2000-02-XX
+ > SEQ3 2000-03-XX
+ > SEQ4 2000-04-XX
+ > ~~
+
+ $ ${AUGUR} filter \
+ > --metadata $TMP/metadata.tsv \
+ > --group-by week \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-metadata $TMP/metadata-filtered.tsv \
+ > --output-log $TMP/filtered_log.tsv
+ ERROR: All samples have been dropped! Check filter rules and metadata file format.
+ 4 strains were dropped during filtering
+ \t4 were dropped during grouping due to ambiguous day information (esc)
+ \t0 of these were dropped because of subsampling criteria (esc)
+ [1]
+ $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_day" | wc -l
+ \s*4 (re)
+ $ cat $TMP/metadata-filtered.tsv
+ strain date
+ $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv
+
+Metadata with ambiguous months on all strains should error when grouping by month.
+
+ $ cat >$TMP/metadata.tsv <<~~
+ > strain date
+ > SEQ1 2000-XX-XX
+ > SEQ2 2000-XX-XX
+ > SEQ3 2000-XX-XX
+ > SEQ4 2000-XX-XX
+ > ~~
+
+ $ ${AUGUR} filter \
+ > --metadata $TMP/metadata.tsv \
+ > --group-by month \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-metadata $TMP/metadata-filtered.tsv \
+ > --output-log $TMP/filtered_log.tsv
+ ERROR: All samples have been dropped! Check filter rules and metadata file format.
+ 4 strains were dropped during filtering
+ \t4 were dropped during grouping due to ambiguous month information (esc)
+ \t0 of these were dropped because of subsampling criteria (esc)
+ [1]
+ $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_month" | wc -l
+ \s*4 (re)
+ $ cat $TMP/metadata-filtered.tsv
+ strain date
+ $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv
+
+Metadata with ambiguous years on all strains should error when grouping by year.
+
+ $ cat >$TMP/metadata.tsv <<~~
+ > strain date
+ > SEQ1 XXXX-XX-XX
+ > SEQ2 XXXX-XX-XX
+ > SEQ3 XXXX-XX-XX
+ > SEQ4 XXXX-XX-XX
+ > ~~
+
+ $ ${AUGUR} filter \
+ > --metadata $TMP/metadata.tsv \
+ > --group-by year \
+ > --sequences-per-group 1 \
+ > --subsample-seed 0 \
+ > --output-metadata $TMP/metadata-filtered.tsv \
+ > --output-log $TMP/filtered_log.tsv
+ ERROR: All samples have been dropped! Check filter rules and metadata file format.
+ 4 strains were dropped during filtering
+ \t4 were dropped during grouping due to ambiguous year information (esc)
+ \t0 of these were dropped because of subsampling criteria (esc)
+ [1]
+ $ cat $TMP/filtered_log.tsv | grep "skip_group_by_with_ambiguous_year" | wc -l
+ \s*4 (re)
+ $ cat $TMP/metadata-filtered.tsv
+ strain date
+ $ rm -f $TMP/filtered_log.tsv $TMP/metadata-filtered.tsv
=====================================
tests/functional/traits.t
=====================================
@@ -15,6 +15,20 @@ Infer the ancestral region for a given tree and metadata.
{}
$ rm -f "$TMP/traits.json"
+Infer the ancestral region for a tree and metadata where one or more records have a missing value ("?") in the region field.
+Tips with missing values should get their values inferred, too.
+In this case, a sample from Panama (North America) has its region inferred as "South America".
+
+ $ ${AUGUR} traits \
+ > --metadata "traits/metadata_with_missing_region.tsv" \
+ > --tree "traits/tree.nwk" \
+ > --columns region \
+ > --output-node-data "$TMP/traits.json" > /dev/null
+
+ $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "traits/traits_with_missing_region.json" "$TMP/traits.json" --significant-digits 2
+ {}
+ $ rm -f "$TMP/traits.json"
+
Infer the ancestral "virus" value from the same metadata.
Since there is only a single virus in the data, Augur warns the user through stderr.
@@ -44,6 +58,19 @@ This should similarly warn the user through stderr, but it should produce an err
{}
$ rm -f "$TMP/traits.json"
+Infer the ancestral "virus" value from the metadata after replacing the "zika" values with missing data values ("?").
+Augur should warn that there were no discrete states found for reconstruction, since "?" is not a valid state on its own.
+
+ $ sed 's/zika/?/' traits/metadata.tsv > "$TMP/metadata_with_missing_virus.tsv"
+ $ ${AUGUR} traits \
+ > --metadata "$TMP/metadata_with_missing_virus.tsv" \
+ > --tree "traits/tree.nwk" \
+ > --columns virus \
+ > --output-node-data "$TMP/traits.json" > /dev/null
+ WARNING: no states found for discrete state reconstruction.
+
+ $ rm -f "$TMP/traits.json"
+
Switch back to the original directory where testing started.
$ popd > /dev/null
=====================================
tests/functional/traits/metadata_with_missing_region.tsv
=====================================
@@ -0,0 +1,13 @@
+strain virus accession date region country division city db segment authors url title journal paper_url
+PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 ? Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
+COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
+PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323
+COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
+Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739
+ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 Direct Submission Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China https://www.ncbi.nlm.nih.gov/pubmed/
+VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant Genome Announc 5 (17), e00231-17 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28450510
+DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
+BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
+DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
+EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448
+HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
=====================================
tests/functional/traits/traits_with_missing_region.json
=====================================
@@ -0,0 +1,95 @@
+{
+ "generated_by": {
+ "program": "augur",
+ "version": "18.1.0"
+ },
+ "models": {
+ "region": {
+ "alphabet": [
+ "North America",
+ "Oceania",
+ "South America",
+ "?"
+ ],
+ "equilibrium_probabilities": [
+ 0.2937135406086192,
+ 0.2682094003586489,
+ 0.4380770590327319
+ ],
+ "rate": 256.91544578481756,
+ "transition_matrix": [
+ [
+ 0.0,
+ 1.0936742264236954,
+ 2.2244614853809708
+ ],
+ [
+ 1.0936742264236954,
+ 0.0,
+ 1.086197824238873
+ ],
+ [
+ 2.2244614853809708,
+ 1.086197824238873,
+ 0.0
+ ]
+ ]
+ }
+ },
+ "nodes": {
+ "BRA/2016/FC_6706": {
+ "region": "South America"
+ },
+ "COL/FLR_00008/2015": {
+ "region": "South America"
+ },
+ "Colombia/2016/ZC204Se": {
+ "region": "South America"
+ },
+ "DOM/2016/BB_0183": {
+ "region": "North America"
+ },
+ "EcEs062_16": {
+ "region": "South America"
+ },
+ "HND/2016/HU_ME59": {
+ "region": "North America"
+ },
+ "NODE_0000001": {
+ "region": "South America"
+ },
+ "NODE_0000002": {
+ "region": "North America"
+ },
+ "NODE_0000003": {
+ "region": "North America"
+ },
+ "NODE_0000004": {
+ "region": "North America"
+ },
+ "NODE_0000005": {
+ "region": "South America"
+ },
+ "NODE_0000006": {
+ "region": "South America"
+ },
+ "NODE_0000007": {
+ "region": "South America"
+ },
+ "NODE_0000008": {
+ "region": "South America"
+ },
+ "PAN/CDC_259359_V1_V3/2015": {
+ "region": "South America"
+ },
+ "PRVABC59": {
+ "region": "North America"
+ },
+ "VEN/UF_1/2016": {
+ "region": "South America"
+ },
+ "ZKC2/2016": {
+ "region": "Oceania"
+ }
+ }
+}
\ No newline at end of file
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/87a22b0a594348e304a318648737770a421cafea
--
View it on GitLab: https://salsa.debian.org/med-team/augur/-/commit/87a22b0a594348e304a318648737770a421cafea
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221106/ac2a725f/attachment-0001.htm>
More information about the debian-med-commit mailing list