[med-svn] [Git][med-team/python-treetime][upstream] New upstream version 0.11.4

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Nov 9 20:49:19 GMT 2024



Étienne Mollier pushed to branch upstream at Debian Med / python-treetime


Commits:
e6064017 by Étienne Mollier at 2024-11-09T21:16:32+01:00
New upstream version 0.11.4
- - - - -


12 changed files:

- README.md
- changelog.md
- test.sh
- + test/test_vcf.py
- treetime/CLI_io.py
- treetime/__init__.py
- treetime/argument_parser.py
- treetime/gtr.py
- treetime/treeanc.py
- treetime/treetime.py
- treetime/vcf_utils.py
- treetime/wrappers.py


Changes:

=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
 [![CI](https://github.com/neherlab/treetime/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/neherlab/treetime/actions/workflows/ci.yml)
-[![anaconda](https://anaconda.org/bioconda/treetime/badges/installer/conda.svg)](https://anaconda.org/bioconda/treetime)
+[![anaconda](https://anaconda.org/bioconda/treetime/badges/version.svg)](https://anaconda.org/bioconda/treetime)
 [![readthedocs](https://readthedocs.org/projects/treetime/badge/)](https://treetime.readthedocs.io/en/latest/)
 
 ## TreeTime: maximum likelihood dating and ancestral sequence inference


=====================================
changelog.md
=====================================
@@ -1,3 +1,17 @@
+# 0.11.4: Bug fixed
+- fix output of mutations into the `branch_mutation.txt` file which was masked by a conditional
+- adjust CLI help
+- adjust output precision
+
+# 0.11.3: Bug fixed
+- calls to some random number generator errorred after the recent switch to treetime owned RNGs.
+- default argument for clock-filter method had a spelling mistake
+- a branch length optimization function didn't handle profiles==True correctly.
+
+# 0.11.2: improvements in VCF parsing/writing + clock-filter functionality
+- rewrite of the vcf reading and writing code including many more tests, closer alignment with spec, and performance improvements by @jameshadfield [PR #263](https://github.com/neherlab/treetime/pull/263)
+- clock-filter command can now remove detected outliers with the additional flag `--prune-outliers`.
+
 # 0.11.1: bug fixes and tweaks to plotting
 - fix division by zero error during GTR inference
 - improve doc strings in parse dates


=====================================
test.sh
=====================================
@@ -19,5 +19,10 @@ if [ "$OUT" != 0 ]; then
   exit 1
 fi
 
+pytest test_vcf.py
+if [ "$OUT" != 0 ]; then
+  exit 1
+fi
+
 # Clean up, the 202* is to remove auto-generated output dirs
 rm -rf treetime_examples __pycache__ 202*


=====================================
test/test_vcf.py
=====================================
@@ -0,0 +1,548 @@
+import pytest
+from textwrap import dedent
+from treetime.vcf_utils import read_vcf, write_vcf
+from treetime import TreeTimeError
+
+def create_vcf(dir, content):
+    d = dir / 'data'
+    d.mkdir()
+    fname = d / "no-header.vcf"
+    with open(fname, 'w') as fh:
+        print(dedent(content), file=fh)
+    return str(fname)
+
+class TestMalformedVcf:
+
+    def test_no_header(self, tmp_path):
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                ##fileformat=VCFv4.3
+                1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+            """))
+
+    def test_meta_info_after_header(self, tmp_path):
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+                ##fileformat=VCFv4.3
+                1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+            """))
+    
+    def test_inconsistent_sample_numbers(self, tmp_path):
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                ##fileformat=VCFv4.3
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+                1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+                1\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1
+            """))
+
+    def test_no_data_lines(self, tmp_path):
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                ##fileformat=VCFv4.3
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+            """))
+
+    def test_consistent_ploidy(self, tmp_path):
+        """
+        The spec heavily implies that ALT alleles must be supplied for each copy
+        of the chromosome, even for no-calls: "If a call cannot be made for a
+        sample at a given locus, '.' must be specified for each missing allele
+        in the GT field (for example './.' for a diploid genotype and '.' for
+        haploid genotype)."
+        """
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                ##fileformat=VCFv4.3
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B
+                1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1/1
+            """))
+
+class TestMultipleChromosomes:
+    """
+    These are valid VCFs but TreeTime cannot yet parse them
+    """
+    def test_multiple_chromosomes(self, tmp_path):
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                ##fileformat=VCFv4.3
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+                1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+                2\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+            """))
+
+def zero_based(idx):
+    """Useful as a form of code commentary. Convert idx from 1-based to 0-based"""
+    return idx-1
+
+def write_data(tmp_path_factory, sample_names, data_lines, reference_seq, meta_lines=None, reference_name='reference_name'):
+    """helper function to create and write a VCF and FASTA file. Returns a tuple of filenames"""
+    if meta_lines:
+        vcf = "\n".join(meta_lines)+"\n"
+    else:
+        vcf = dedent("""\
+            ##fileformat=VCFv4.3
+            ##contig=<ID=1,length=50>
+            ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+        """)
+    vcf += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(sample_names) + "\n"
+    for line in data_lines:
+        vcf += "\t".join(line) + "\n"
+    reference=dedent(f"""\
+        >{reference_name}
+        {reference_seq}
+    """)
+    dir = tmp_path_factory.mktemp("data")
+    vcf_fn = dir / "snps.vcf"
+    ref_fn = dir / "reference.fasta"
+    with open(vcf_fn, 'w') as fh:
+        print(vcf, file=fh)
+    with open(ref_fn, 'w') as fh:
+        print(reference, file=fh)
+    return (str(vcf_fn), str(ref_fn))
+
+
+class TestSimpleVcf:
+    @pytest.fixture(scope="class")
+    def data(self, tmp_path_factory):
+        """
+        Creates the VCF and reference files (in a temporary location) to be used by
+        the tests in this class.
+        Returns their filenames, as well as information about their contents which
+        we can use as comparisons in tests.
+        NOTE: this function is only run once - _not_ once per test.
+        """
+        sample_names = ["sample_A", "sample_B", "sample_C"]
+        data_lines = [
+            ["1", "5",  ".", "T", "C",   ".", ".", ".", "GT", "1", "1", "1"],
+            ["1", "7",  ".", "T", "G",   ".", ".", ".", "GT", "1", "1", "0"],
+            ["1", "14", ".", "C", "T",   ".", ".", ".", "GT", "1", "1", "0"],
+            ["1", "18", ".", "C", "T",   ".", ".", ".", "GT", "0", "0", "1"],
+            ["1", "28", ".", "A", "N",   ".", ".", ".", "GT", "0", "0", "1"],
+            ["1", "29", ".", "A", "N",   ".", ".", ".", "GT", "0", "0", "1"],
+            ["1", "30", ".", "A", "N",   ".", ".", ".", "GT", "0", "0", "1"],
+            ["1", "33", ".", "A", "C,G", ".", ".", ".", "GT", "1", "2", "0"],
+            ["1", "39", ".", "C", "T",   ".", ".", ".", "GT", "1", "0", "0"],
+            ["1", "42", ".", "G", "A",   ".", ".", ".", "GT", "0", "1", "0"],
+            ["1", "43", ".", "A", "T",   ".", ".", ".", "GT", "1", "1", "0"],
+        ]
+        positions = [int(line[1]) for line in data_lines]
+        reference_seq = 'AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA'
+        filenames = write_data(tmp_path_factory, sample_names, data_lines, reference_seq)
+        return {
+            "filenames": filenames,
+            "positions": positions,
+            "data_lines": data_lines,
+            "sample_names": sample_names,
+            "reference_seq": reference_seq,
+        }
+
+    def test_mutation_structure(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        # The structure of insertions & sequences is a dict a key for each sample, irregardless of whether
+        # any insertions/mutations are defined for that sample.
+        # samples = {'sample_A', 'sample_B', 'sample_C'}
+        assert(set(data['sample_names'])==set(vcf_data['sequences'].keys()))
+        assert(set(data['sample_names'])==set(vcf_data['insertions'].keys()))
+
+    def test_mutation_parsing(self, data):
+        """
+        Test that the correct SNPs (ALTs) are extracted from the VCF for the three samples defined in the VCF
+        """
+        vcf_data = read_vcf(*data['filenames'])
+        def summarise(sample):
+            # uses 1-based position so we can easily compare with the VCF file
+            return ",".join([f"{pos+1}{alt}" for pos,alt in vcf_data['sequences'][sample].items()])
+
+        assert("5C,7G,14T,33C,39T,43T"==summarise('sample_A'))
+        assert("5C,7G,14T,33G,42A,43T"==summarise('sample_B'))
+        assert("5C,18T,28N,29N,30N"==summarise('sample_C'))
+
+    def test_no_insertions_parsed(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        for sample, ins in vcf_data['insertions'].items():
+            assert(ins=={})
+
+    def test_reference_sequence(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        assert(data['reference_seq']==vcf_data['reference'])
+
+    def test_positions_with_mutations(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        assert(data['positions'] == [pos+1 for pos in vcf_data['positions']]) 
+
+
+class TestNoCallsOrMissing:
+    """
+    Tests a few overlapping concepts:
+    - The genotype field (i.e. within the column of a sample name) may be "." which results in a
+      no call allele. So we call this as SNP(s) changing the REF to N(s). The 4.2 spec refers
+      to this as "a call cannot be made for a sample at a given locus". Spec 4.3 defines this
+      as "If a call cannot be made for a sample at a given locus, '.' must be specified for each
+      missing allele in the GT field (for example ./. for a diploid genotype and . for haploid
+      genotype)"
+    - The ALT allele may be "*". v4.2. defines this as "missing due to a upstream deletion (sic)"
+      and 4.3 defines this as "allele missing due to overlapping deletion". Either way we encode
+      this similarly to the no-call above, i.e. changing the REF to N(s). Note that the allele
+      must be the one-character "*" - this can't appear within other bases.
+    - The ALT allele may be ".". This was introduced in version 4.3 as "a MISSING value (no variant)"
+      So we treat this the same as "*" above. Again, this must be the entire allele, '.' can't appear
+      as part of other bases. Note that while this doesn't exist in v4.2, it is commonly found in
+      v4.2 VCF files, e.g. those produced by `snp_sites`.
+    """
+
+    @pytest.fixture(scope="class")
+    def data(self, tmp_path_factory):
+        reference="ATCGA"
+        sample_names = ["sample_A", "sample_B", "sample_C", "sample_D"]
+        data_lines = [
+            ## No call alleles
+            ["1", "2",  ".", "T", "A",    ".", ".", ".", "GT", ".", "1", "0", "0"], # sample A has T2N
+            ["1", "4",  ".", "GA", "C",   ".", ".", ".", "GT", ".", "1", "0", "0"], # sample A has GA -> NN
+            ## star alleles & dot alleles indicate missing
+            ["1", "3",  ".", "C", "*,G",  ".", ".", ".", "GT", "1", "2", "0", "0"], # sample A has C->N
+            ["1", "3",  ".", "C", "T,.",  ".", ".", ".", "GT", "0", "0", "1", "2"], # sample D has C->N
+            ["1", "4",  ".", "GA", ".,*", ".", ".", ".", "GT", "0", "0", "1", "2"], # both samples C & D have G->N and A->N
+        ]
+        filenames = write_data(tmp_path_factory, sample_names, data_lines, reference)
+        return {"filenames": filenames}
+
+    def test_no_call_allele(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        assert(vcf_data['sequences']['sample_A'][zero_based(2)]=='N')
+        assert(vcf_data['sequences']['sample_A'][zero_based(4)]=='N')
+        assert(vcf_data['sequences']['sample_A'][zero_based(5)]=='N')
+        assert(vcf_data['sequences']['sample_B'][zero_based(4)]=='C') 
+        assert(vcf_data['sequences']['sample_B'][zero_based(5)]=='-')
+
+    def test_star_allele(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        assert(vcf_data['sequences']['sample_A'][zero_based(3)]=='N')
+        assert(vcf_data['sequences']['sample_B'][zero_based(3)]=='G')
+
+        assert(vcf_data['sequences']['sample_D'][zero_based(4)]=='N')
+        assert(vcf_data['sequences']['sample_D'][zero_based(5)]=='N')
+
+    def test_dot_allele(self, data):
+        vcf_data = read_vcf(*data['filenames'])
+        assert(vcf_data['sequences']['sample_C'][zero_based(3)]=='T')
+        assert(vcf_data['sequences']['sample_D'][zero_based(3)]=='N')
+
+        assert(vcf_data['sequences']['sample_C'][zero_based(4)]=='N')
+        assert(vcf_data['sequences']['sample_C'][zero_based(5)]=='N')
+
+
+    def test_malformed_star_allele(self, tmp_path):
+        """* must be an entire allele, not together with other bases"""
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+                ##fileformat=VCFv4.3
+                1\t5\t.\tT\tC*\t.\t.\t.\tGT\t1\t1\t1
+            """))
+
+    def test_malformed_dot_allele(self, tmp_path):
+        """. must be an entire allele, not together with other bases"""
+        with pytest.raises(TreeTimeError):
+            read_vcf(create_vcf(tmp_path, """\
+                #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+                ##fileformat=VCFv4.3
+                1\t5\t.\tT\tC.,G\t.\t.\t.\tGT\t1\t1\t1
+            """))
+
+
+class TestVcfSpecExamples:
+    """
+    Test the examples provided in the VCF specs. Note that the examples
+    aren't provided as per-sample VCF files, so there's a bit of interpretation
+    required to create the example data. Note that the actual `test_<name>`
+    methods are added dynamically after the class definition.
+    """
+    def create(self, tmp_path, input):
+        lines = ["##fileformat=VCFv4.2", # Actual version may differ, but we don't parse this so it doesn't matter (yet)
+            "##contig=<ID=1,length=50>",
+            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
+            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(input['samples']),
+        ]
+        for d in input['data']:
+            lines.append(f"{d['chrom']}\t{d['pos']}\t.\t{d['ref']}\t{d['alt']}\t.\t.\t.\tGT\t" + "\t".join([str(x) for x in d['values']]))
+        return create_vcf(tmp_path, "\n".join(lines))
+
+    @staticmethod
+    def add_test(example_data):
+        def test(self, tmp_path):
+            vcf = read_vcf(self.create(tmp_path, example_data))
+            for sample_name, sample_data in example_data['expect_sequences'].items():
+                assert(sample_name in vcf['sequences'])
+                for snp in sample_data:
+                    assert(len(snp)==2)
+                    assert(vcf['sequences'][sample_name][zero_based(int(snp[0]))] == snp[1])
+                assert(len(vcf['sequences'][sample_name].keys()) == len(sample_data))
+            for sample_name, sample_data in example_data['expect_insertions'].items():
+                assert(sample_name in vcf['insertions'])
+                for ins in sample_data:
+                    assert(vcf['insertions'][sample_name][zero_based(int(ins[0]))] == ins[1:])
+                assert(len(vcf['insertions'][sample_name].keys()) == len(sample_data))
+        return test
+
+"""
+-------- comment re: TreeTime's encoding of insertions --------
+
+Give a reference base of 'C' at 1-based position 2, and an
+insertion _after_ this of 'A', TreeTime encodes this as an insertion
+of 'CA' at (0-based) position 1. This was unexpected to me,
+I would have expected an insertion of just 'A'. However I've written
+the tests to conform to TreeTime's existing behaviour (version 0.11.1)
+
+---------------------------------------------------------------
+"""
+
+vcf_spec_data = {
+    'version_4_2': {
+        '5_1_1': {
+            'samples': ['A', 'B', 'C'],
+            'data': [
+                {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'G', 'values': [1, 0, 0]},
+                {'chrom': 20, 'pos': 2, 'ref': 'TC', 'alt': 'T,TCA', 'values': [0, 1, 2]}
+            ],
+            # Note that with TC->TCA, there are no sequence changes (T->T, C->C) but one insertion ("A")
+            'expect_sequences': {'A': ['3G'], 'B': ['3-']},
+            'expect_insertions': {'C': ['3CA']} # see comment above. Actually only a single base "A" insertion
+        },
+        '5_1_2': {
+            'samples': ['A', 'B'],
+            'data': [
+                {'chrom': 20, 'pos': 2, 'ref': 'TC', 'alt': 'TG,T', 'values': [1,2]}
+            ],
+            'expect_sequences': {'A': ['3G'], 'B': ['3-']},
+            'expect_insertions': {}
+        },
+        '5_1_3': {
+            'samples': ['A', 'B', 'C'],
+            ## This example is quite hard to understand for sample A. The alignment provided indicates that
+            ## TCG -> T-G is encoded as "ref: TCG, alt: TG". But without aligning the alt to the ref, the only
+            ## sane interpretation of this is 2 changes: C->G and G->-. This is what the spec means by
+            ## "the molecular equivalence explicitly listed above in the per-base alignment is discarded so the
+            ## actual placement of equivalent g isn't retained" (also explained in example 5.2.4)
+            ## Similarly, for sample C, the actual event is described as "A base is inserted wrt the reference
+            ## sequence" but the way the VCF file reads we are going to have a SNP (G->A) + a insertion (G)
+            'data': [
+                {'chrom': 20, 'pos': 2, 'ref': 'TCG', 'alt': 'TG,T,TCAG', 'values': [1,2,3]}
+            ],
+            'expect_sequences': {'A': ['3G', '4-'], 'B': ['3-', '4-'], 'C': ['4A']},
+            'expect_insertions': {'C': ['4AG']} # See comment above
+        },
+        '5_2_1': {
+            'samples': ['A'],
+            'data': [
+                {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'T', 'values': [1]}
+            ],
+            'expect_sequences': {'A': ['3T']},
+            'expect_insertions': {}
+        },
+        '5_2_2': {
+            'samples': ['A'],
+            'data': [
+                {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'CTAG', 'values': [1]}
+            ],
+            'expect_sequences': {},
+            'expect_insertions': {'A': ['3CTAG']} # See comment above
+        },
+        '5_2_3': {
+            'samples': ['A'],
+            'data': [
+                {'chrom': 20, 'pos': 2, 'ref': 'TCG', 'alt': 'T', 'values': [1]}
+            ],
+            'expect_sequences': {'A': ['3-', '4-']},
+            'expect_insertions': {}
+        },
+    }
+}
+# dynamically create a test for each example in the above data (by adding methods to the class)
+for spec_version, spec_data in vcf_spec_data.items():
+    for example_key, example_data in spec_data.items():
+        setattr(TestVcfSpecExamples, f"test_{spec_version}_example_{example_key}", TestVcfSpecExamples.add_test(example_data))
+
+
+class TestMutationAndInsertion:
+    """
+    Tests the situation where a reference base is mutated _and_ there's an insertion
+    """
+
+    @pytest.fixture(scope="class")
+    def data(self, tmp_path_factory):
+        reference="ATCGA"
+        sample_names = ["sample_A", "sample_B"]
+        data_lines = [
+            ["1", "2",  ".", "T", "GA",    ".", ".", ".", "GT", "1", "0"], # sample A has both a C->G mutation _and_ a subsequent "A" insertion
+            ["1", "3",  ".", "CGA", "NTTTA,CCGT",   ".", ".", ".", "GT", "1", "2"], # complex! Both samples have multiple mutations + an insertion
+        ]
+        filenames = write_data(tmp_path_factory, sample_names, data_lines, reference)
+        return {"filenames": filenames}
+
+    def test_single_ref_base_mutation_and_insertion(self, data):
+        # This case was missed in treetime 0.11.1
+        vcf_data = read_vcf(*data['filenames'])
+        assert(vcf_data['sequences']['sample_A'][zero_based(2)]=='G')
+        assert(vcf_data['insertions']['sample_A'][zero_based(2)]=='GA') # see comment above re: insertion encoding
+
+    def test_multi_ref_base_mutations_and_insertion(self, data):
+        # This case was missed in treetime 0.11.1
+        vcf_data = read_vcf(*data['filenames'])
+        assert(vcf_data['sequences']['sample_A'][zero_based(3)]=='N')
+        assert(vcf_data['sequences']['sample_A'][zero_based(4)]=='T')
+        assert(vcf_data['sequences']['sample_A'][zero_based(5)]=='T')
+        assert(vcf_data['insertions']['sample_A'][zero_based(5)]=='TTA') # see comment above re: insertion encoding
+
+        assert(vcf_data['sequences']['sample_B'][zero_based(4)]=='C')
+        assert(vcf_data['sequences']['sample_B'][zero_based(5)]=='G')
+        assert(vcf_data['insertions']['sample_B'][zero_based(5)]=='GT') # see comment above re: insertion encoding
+
+class TestMetadataParsing:
+        
+    def test_simple_haploid(self, tmp_path):
+        data = read_vcf(create_vcf(tmp_path, """\
+            ##fileformat=VCFv4.3
+            #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+            1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+            1\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+        """))
+        assert(data['metadata']['chrom']=='1')
+        assert(data['metadata']['ploidy']==1)
+        assert(data['metadata']['meta_lines']==['##fileformat=VCFv4.3'])
+
+    def test_simple_diploid(self, tmp_path):
+        data = read_vcf(create_vcf(tmp_path, """\
+            ##fileformat=VCFv4.3
+            ##contig=<ID=1,length=50>
+            ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+            #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A
+            foo\t5\t.\tT\tC\t.\t.\t.\tGT\t1/1
+            foo\t6\t.\tT\tC\t.\t.\t.\tGT\t.|.
+        """))
+        assert(data['metadata']['chrom']=='foo')
+        assert(data['metadata']['ploidy']==2)
+        assert(data['metadata']['meta_lines']==['##fileformat=VCFv4.3', '##contig=<ID=1,length=50>', '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'])
+
+
+
+    
+def roundtrip(tmp_path, sample_names, data_lines, reference, meta_lines, pass_metadata=True):
+    """
+    Write the provided data to a (temporary) VCF file.
+    Then read it via `read_vcf` and store this as `vcf_a`.
+    Then write this data via `write_vcf` and read it back, storing as `vcf_b`.
+    Returns a tuple of (vcf_a, vcf_b)
+    """
+
+    dir = tmp_path / 'data'
+    dir.mkdir()
+    vcf_filename_a = str(dir / "a.vcf")
+    vcf_filename_b = str(dir / "b.vcf")
+    reference_filename = str(dir / 'reference.fasta')
+
+    with open(str(dir / 'reference.fasta'), 'w') as fh:
+        print(dedent(f"""\
+            >reference_name
+            {reference}
+        """), file=fh)
+
+    with open(vcf_filename_a, 'w') as fh:
+        vcf_lines = meta_lines[:] + \
+            ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(sample_names)] + \
+            ["\t".join(data) for data in data_lines]
+        print("\n".join(vcf_lines), file=fh)
+
+    vcf_a = read_vcf(vcf_filename_a, reference_filename)
+
+    ## take this data and write it out as a VCF
+    tree_dict = {'sequences': vcf_a['sequences'], 'reference': vcf_a['reference'], 'positions': vcf_a['positions']}
+    if pass_metadata:
+        tree_dict['metadata'] = vcf_a['metadata']
+    write_vcf(tree_dict, vcf_filename_b)
+
+    ## then read in this (just-created) VCF
+    vcf_b = read_vcf(vcf_filename_b, reference_filename)
+
+    ## uncomment the following & run pytest with "-s" to see the contents of the VCF file being written by TreeTime
+    # print("_________________________________")
+    # with open(vcf_filename_b) as fh:
+    #     for line in fh:
+    #         print(line, end='')
+    # print("\n_________________________________")
+
+    return (vcf_a, vcf_b)
+
+
+class TestWriting:
+    """
+    Write a simple VCF file out (created by hand), parse it with `read_vcf` (called "vcf_a")
+    then write that data out and read it back in (called "vcf_b"). vcf_a should equal vcf_b
+    for the data we care about.
+    """
+
+    def test_basic_roundtripping_vcf(self, tmp_path):
+        [vcf_a, vcf_b] = roundtrip(tmp_path,
+            reference = "ATCGACC",
+            meta_lines = [
+                "##fileformat=VCFv4.3",
+                "##contig=<ID=1,length=7>",
+                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
+            ],
+            sample_names = ["sample_A", "sample_B"],
+            data_lines = [
+                ["foo", "2",  ".", "T", "G",    ".", ".", ".", "GT", "1", "0"],
+                ["foo", "3",  ".", "C", "GA,*", ".", ".", ".", "GT", "1", "2"], # "A" insertion will be lost on roundtrip!
+                ["foo", "5",  ".", "A", "TC",   ".", ".", ".", "GT", ".", "1"],
+                ["foo", "6",  ".", "CC", "C",   ".", ".", ".", "GT", "1", "0"], # del of (1-based) pos 7 in sample A
+            ],
+            pass_metadata=False
+        )
+
+        ## reference is certainly the same, the same FASTA file is being used for both read_vcf commands, but check anyway
+        assert(vcf_a['reference']==vcf_a['reference'])
+        ## insertions, if there are any, _won't_ be the same because `write_vcf` doesn't read insertions even if they're provided
+        ## as input. Note that vcf_a does have an insertion!
+
+        ## The meta-lines are different (as expected) and since we are not passing the metadata information to `write_vcf`
+        ## we get back the defaults, which differ from the input
+        assert(vcf_a['metadata']['chrom'] == "foo" and vcf_b['metadata']['chrom'] == "1")
+        assert(vcf_a['metadata']['ploidy'] == 1 and vcf_b['metadata']['ploidy'] == 2)
+
+        ## positions should be the same (positions are only reflective of data in `sequences`, so the ignoring of insertions is ok)
+        assert(vcf_a['positions']==vcf_a['positions'])
+        ## check sequences are the same
+        assert(vcf_a['sequences']==vcf_b['sequences'])
+
+    def test_basic_roundtripping_vcf_with_metadata(self, tmp_path):
+        [vcf_a, vcf_b] = roundtrip(tmp_path,
+            reference = "ATCGACC",
+            meta_lines = [
+                "##fileformat=VCFv4.3",
+                "##contig=<ID=1,length=7>",
+                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
+            ],
+            sample_names = ["sample_A", "sample_B"],
+            data_lines = [
+                ["1", "2",  ".", "T", "G",    ".", ".", ".", "GT", "1", "0"],
+                ["1", "3",  ".", "C", "GA,*", ".", ".", ".", "GT", "1", "2"], # "A" insertion will be lost on roundtrip!
+                ["1", "5",  ".", "A", "TC",   ".", ".", ".", "GT", ".", "1"],
+                ["1", "6",  ".", "CC", "C",   ".", ".", ".", "GT", "1", "0"], # del of (1-based) pos 7 in sample A
+            ],
+            pass_metadata=True
+        )
+
+        ## reference is certainly the same, the same FASTA file is being used for both read_vcf commands, but check anyway
+        assert(vcf_a['reference']==vcf_a['reference'])
+        ## insertions, if there are any, _won't_ be the same because `write_vcf` doesn't read insertions even if they're provided
+        ## as input. Note that vcf_a does have an insertion!
+
+        ## The meta-lines are different (as expected) but the chrom + ploidy should be the same
+        assert(vcf_a['metadata']['chrom'] == vcf_b['metadata']['chrom'])
+        assert(vcf_a['metadata']['ploidy'] == vcf_b['metadata']['ploidy'])
+
+        ## positions should be the same (positions are only reflective of data in `sequences`, so the ignoring of insertions is ok)
+        assert(vcf_a['positions']==vcf_a['positions'])
+        ## check sequences are the same
+        assert(vcf_a['sequences']==vcf_b['sequences'])


=====================================
treetime/CLI_io.py
=====================================
@@ -164,15 +164,15 @@ def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False,
                     n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations
                                                         if tt.gtr.ambiguous not in [a,d] and n.mask[pos]>0])+f'",mcc="{n.mcc}"'
 
-                for (a, pos, d) in n.mutations:
-                    if tt.gtr.ambiguous not in [a,d] or report_ambiguous:
-                        mutations_out.write("%s\t%s\t%s\t%s\n" %(n.name, a, pos + 1, d))
+            for (a, pos, d) in n.mutations:
+                if tt.gtr.ambiguous not in [a,d] or report_ambiguous:
+                    mutations_out.write("%s\t%s\t%s\t%s\n" %(n.name, a, pos + 1, d))
         if timetree:
             n.comment+=(',' if n.comment else '&') + 'date=%1.2f'%n.numdate
     mutations_out.close()
 
     # write tree to file
-    fmt_bl = "%1.6f" if tt.data.full_length<1e6 else "%1.8e"
+    fmt_bl = "%1.7f" if tt.data.full_length<1e6 else "%1.9e"
     if timetree:
         outtree_name = basename + f'timetree{tree_suffix}.nexus'
         print("--- saved divergence times in \n\t %s\n"%dates_fname)


=====================================
treetime/__init__.py
=====================================
@@ -1,4 +1,4 @@
-version="0.11.1"
+version="0.11.4"
 ## Here we define an error class for TreeTime errors, MissingData, UnknownMethod and NotReady errors
 ## are all due to incorrect calling of TreeTime functions or input data that does not fit our base assumptions.
 ## Errors marked as TreeTimeUnknownErrors might be due to data not fulfilling base assumptions or due


=====================================
treetime/argument_parser.py
=====================================
@@ -81,7 +81,8 @@ tree_description = "Name of file containing the tree in newick, nexus, or phylip
 
 aln_description = "alignment file (fasta)"
 
-dates_description = "csv file with dates for nodes with 'node_name, date' where date is float (as in 2012.15)"
+dates_description = "csv file with dates for nodes with 'node_name, date' where date is float (as in 2012.15) or in ISO-format (YYYY-MM-DD). "\
+                    "Imprecisely known dates can be specified as '2023-XX-XX' or [2013.2:2013.7]"
 
 coalescent_description = \
     "coalescent time scale -- sensible values are on the order of the average "\
@@ -303,6 +304,8 @@ def make_parser():
     add_seq_len_aln_group(c_parser)
 
     add_reroot_group(c_parser)
+    c_parser.add_argument('--prune-outliers', action='store_true', default=False,
+                        help="remove detected outliers from the output tree")
     c_parser.add_argument('--allow-negative-rate', required = False, action="store_true", default=False,
                           help="By default, rates are forced to be positive. For trees with little temporal "
                                "signal it is advisable to remove this restriction to achieve essentially mid-point rooting.")


=====================================
treetime/gtr.py
=====================================
@@ -476,8 +476,8 @@ class GTR(object):
         alphabet=alphabets[alphabet]
         gtr = cls(alphabet)
         n = gtr.alphabet.shape[0]
-        pi = 1.0*rng.randint(0,100,size=(n))
-        W = 1.0*rng.randint(0,100,size=(n,n)) # with gaps
+        pi = 1.0*rng.random(size=n)
+        W = 1.0*rng.random(size=(n,n)) # with gaps
 
         gtr.assign_rates(mu=mu, pi=pi, W=W)
         return gtr
@@ -851,7 +851,11 @@ class GTR(object):
             else:
                 return -1.0*self.prob_t_compressed(seq_pair, multiplicity,t**2, return_log=True)
 
-        hamming_distance = np.sum(multiplicity[seq_pair[:,1]!=seq_pair[:,0]])/np.sum(multiplicity)
+        if profiles:
+            hamming_distance = 1-np.sum(multiplicity*np.sum(seq_pair[0]*seq_pair[1], axis=1))/np.sum(multiplicity)
+        else:
+            hamming_distance = np.sum(multiplicity[seq_pair[:,1]!=seq_pair[:,0]])/np.sum(multiplicity)
+
         try:
             from scipy.optimize import minimize_scalar
             opt = minimize_scalar(_neg_prob,


=====================================
treetime/treeanc.py
=====================================
@@ -572,7 +572,7 @@ class TreeAnc(object):
                                     "in the position %d: %s, "
                                     "choosing %s" % (amb, str(self.tree.root.state[amb]),
                                                      self.tree.root.state[amb][0]), 4)
-        self.tree.root._cseq = np.array([k[self.rng.randint(len(k)) if len(k)>1 else 0]
+        self.tree.root._cseq = np.array([k[self.rng.integers(len(k)) if len(k)>1 else 0]
                                            for k in self.tree.root.state])
 
 
@@ -1670,7 +1670,7 @@ class TreeAnc(object):
 
 
     def get_tree_dict(self, keep_var_ambigs=False):
-        return self.get_reconstructed_alignment()
+        return self.get_reconstructed_alignment(reconstruct_tip_states=not keep_var_ambigs)
 
 
     def recover_var_ambigs(self):


=====================================
treetime/treetime.py
=====================================
@@ -73,7 +73,7 @@ class TreeTime(ClockTree):
                 sys.exit(2)
 
 
-    def _run(self, root=None, infer_gtr=True, relaxed_clock=None, clock_filter_method='residuals',
+    def _run(self, root=None, infer_gtr=True, relaxed_clock=None, clock_filter_method='residual',
              n_iqd = None, resolve_polytomies=True, max_iter=0, Tc=None, fixed_clock_rate=None,
             time_marginal='never', sequence_marginal=False, branch_length_mode='auto',
             vary_rate=False, use_covariation=False, tracelog_file=None,
@@ -423,6 +423,8 @@ class TreeTime(ClockTree):
             bad_branch_count = residual_filter(self, n_iqd)
         elif method=='local':
             bad_branch_count = local_filter(self, n_iqd)
+        else:
+            raise ValueError(f"TreeTime.clock_filter: unknown clock-filter method '{method}'. Implemented methods are: 'residual', 'local'")
 
         if bad_branch_count>0.34*self.tree.count_terminals():
             self.logger("TreeTime.clock_filter: More than a third of leaves have been excluded by the clock filter. Please check your input data.", 0, warn=True)


=====================================
treetime/vcf_utils.py
=====================================
@@ -2,6 +2,7 @@ import gzip
 import numpy as np
 from collections import defaultdict
 from textwrap import fill
+from . import TreeTimeError
 
 ## Functions to read in and print out VCF files
 
@@ -54,6 +55,8 @@ def read_vcf(vcf_file, ref_file=None):
             Python list of all positions with a mutation, insertion, or deletion.
 
     """
+    import re
+    ALT_CHARS = re.compile(r"^([ACGTNacgtn]+|\*|\.)$") # straight from the VCF 4.3 spec
 
     #Programming Note:
     # Note on VCF Format
@@ -73,7 +76,7 @@ def read_vcf(vcf_file, ref_file=None):
     #     Ex:
     #       REF     ALT
     #       A       ATT
-    #     First base always matches Ref.
+    #     First base _does not_ always match REF, so there may be a SNP as well
     # 'No indel'
     #     Ex:
     #       REF     ALT
@@ -82,6 +85,11 @@ def read_vcf(vcf_file, ref_file=None):
     #define here, so that all sub-functions can access them
     sequences = defaultdict(dict)
     insertions = defaultdict(dict) #Currently not used, but kept in case of future use.
+    metadata = {
+        'meta_lines': [], # all the VCF meta_lines, i.e. those starting with '##' (they are left unparsed)
+        'chrom': None,    # chromosome name (we only allow one)
+        'ploidy': None,   # ploidy count -- encoded in how the GT calls are formatted
+    }
 
     #TreeTime handles 2-3 base ambig codes, this will allow that.
     def getAmbigCode(bp1, bp2, bp3=""):
@@ -104,13 +112,23 @@ def read_vcf(vcf_file, ref_file=None):
 
     #Parses a 'normal' (not hetero or no-call) call depending if insertion+deletion, insertion,
     #deletion, or single bp subsitution
-    def parseCall(snps, ins, pos, ref, alt):
+    def parse_homozygous_call(snps, ins, pos, ref, alt):
+
+        # Replace missing allele with N(s). See commentary in test_vcf.py::TestNoCallsOrMissing for more details
+        if alt=='*' or alt=='.':
+            alt = "N" * len(ref)
+
         #Insertion where there are also deletions (special handling)
         if len(ref) > 1 and len(alt)>len(ref):
+            ## NOTE: the loop below contains a potential double-counting bug. For example,
+            ## REF='TCG' ALT='TCAG' (Example 5.1.3 in the VCF 4.2 spec), then when i=2
+            ## we'll add both snps[pos+2] = 'A' as well as ins[pos+2] = 'AG'. This has been 
+            ## detailed within `test_vcf.py`, as it may also be the expected way to encode
+            ## insertions within TreeTime
             for i in range(len(ref)):
                 #if the pos doesn't match, store in sequences
                 if ref[i] != alt[i]:
-                    snps[pos+i] = (alt[i] if alt[i] != '.' else 'N') #'.' = no-call
+                    snps[pos+i] = alt[i]
                 #if about to run out of ref, store rest:
                 if (i+1) >= len(ref):
                     ins[pos+i] = alt[i:]
@@ -123,17 +141,21 @@ def read_vcf(vcf_file, ref_file=None):
                 #if not, there may be mutations
                 else:
                     if ref[i] != alt[i]:
-                        snps[pos+i] = (alt[i] if alt[i] != '.' else 'N') #'.' = no-call
-        #Insertion
+                        snps[pos+i] = alt[i]
+        #Insertion + single-base ref
         elif len(alt) > 1:
             ins[pos] = alt
+            # If the first base of the allele doesn't match the ref then we _also_ have a mutation
+            if ref[0]!=alt[0]:
+                snps[pos] = alt[0]
         #No indel
         else:
             snps[pos] = alt
 
 
     #Parses a 'bad' (hetero or no-call) call depending on what it is
-    def parseBadCall(gen, snps, ins, pos, ref, ALT):
+    #TODO - consider the situation where the alternate allele base(s) is '*' (done for the homozygous case)
+    def parse_heterozygous_call(gen, snps, ins, pos, ref, ALT):
         #Deletion
         #   REF     ALT     Seq1    Seq2    Seq3
         #   GCC     G       1/1     0/1     ./.
@@ -173,90 +195,135 @@ def read_vcf(vcf_file, ref_file=None):
             snps[pos] = alt
         #else a no-call insertion, so ignore.
 
+    def validate_alt(alt):
+        """
+        from the VCF 4.3 spec:
+        > the ALT field must be a symbolic allele, or a breakend replacement string,
+        > or match the regular expression [see ALT_CHARS regex, above]
+        Since we only consider GT variation, the symbolic alleles and breakends aren't
+        relevant for us.
+
+        The spec also states:
+        > Tools processing VCF files are not required to preserve case in the allele String
+        
+        Return the uppercase allele bases (string) _or_ None if it fails validation
+        """
+        if ALT_CHARS.match(alt):
+            return alt.upper()
+        else:
+            print(f"WARNING: Encountered invalid allele base(s) {alt!r}. Skipping...")
+            return None
 
     #House code is *much* faster than pyvcf because we don't care about all info
     #about coverage, quality, counts, etc, which pyvcf goes to effort to parse
     #(and it's not easy as there's no standard ordering). Custom code can completely
     #ignore all of this.
-    import gzip
     from Bio import SeqIO
-    import numpy as np
-
-    nsamp = 0
-    posLoc = 0
-    refLoc = 0
-    altLoc = 0
-    sampLoc = 9
 
     #Use different openers depending on whether compressed
     opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open
 
     with opn(vcf_file, mode='rt') as f:
-        for line in f:
-            if line[0] != '#':
-                #actual data - most common so first in 'if-list'!
-                dat = line.strip().split('\t')
-                POS = int(dat[posLoc])
-                REF = dat[refLoc]
-                ALT = dat[altLoc].split(',')
-                calls = np.array(dat[sampLoc:])
-
-                #get samples that differ from Ref at this site
-                recCalls = {}
-                for sname, sa in zip(samps, calls):
-                    if ':' in sa: #if proper VCF file (followed by quality/coverage info)
-                        gt = sa.split(':')[0]
-                    else: #if 'pseudo' VCF file (nextstrain output, or otherwise stripped)
-                        gt = sa
-
-                    # convert haploid calls to pseudo diploid
-                    if gt == '0':
-                        gt = '0/0'
-                    elif gt == '1':
-                        gt = '1/1'
-                    elif gt == '.':
-                        gt = './.'
-
-                    #ignore if ref call: '.' or '0/0', depending on VCF
-                    if ('/' in gt and gt != '0/0') or ('|' in gt and gt != '0|0'):
-                        recCalls[sname] = gt
-
-                #store the position and the alt
-                for seq, gen in recCalls.items():
-                    ref = REF
-                    pos = POS-1     #VCF numbering starts from 1, but Reference seq numbering
-                                    #will be from 0 because it's python!
-                    #Accepts only calls that are 1/1, 2/2 etc. Rejects hets and no-calls
-                    if gen[0] != '0' and gen[2] != '0' and gen[0] != '.' and gen[2] != '.':
-                        alt = str(ALT[int(gen[0])-1])   #get the index of the alternate
-                        if seq not in sequences.keys():
-                            sequences[seq] = {}
-
-                        parseCall(sequences[seq],insertions[seq], pos, ref, alt)
-
-                    #If is heterozygote call (0/1) or no call (./.)
-                    else:
-                        #alt will differ here depending on het or no-call, must pass original
-                        parseBadCall(gen, sequences[seq],insertions[seq], pos, ref, ALT)
+        current_block = "meta-information" # The start of VCF files is the meta-information (lines starting with ##)
+        header,samps,nsamp=None,None,None
 
-            elif line[0] == '#' and line[1] == 'C':
-                #header line, get all the information
+        for line in f:
+            if line.startswith("##"):
+                if current_block!='meta-information':
+                    raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - all the meta-information (lines starting with ##) must appear at the top of the file.")
+                metadata['meta_lines'].append(line.strip())
+            elif line[0]=='#':
+                mandatory_fields = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
+                # Note that FORMAT isn't mandatory unless genotype data is present. But every VCF we deal with has genotype data - that's the point!
+                header_start = "#"+"\t".join(mandatory_fields)
+                if not line.startswith(header_start):
+                    raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header line must start with {header_start}")
+                if current_block!='meta-information':
+                    raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header must immediately follow the meta-information lines")
+                current_block = 'header'
                 header = line.strip().split('\t')
-                posLoc = header.index("POS")
-                refLoc = header.index('REF')
-                altLoc = header.index('ALT')
-                sampLoc = header.index('FORMAT')+1
-                samps = header[sampLoc:]
-                samps = [ x.strip() for x in samps ] #ensure no leading/trailing spaces
+                samps = [ x.strip() for x in header[9:] ] #ensure no leading/trailing spaces
                 nsamp = len(samps)
+                for sample_name in samps:
+                    sequences[sample_name] = {}
+            else:
+                if current_block!='header' and current_block!='data-lines':
+                    raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines must follow the header line")
+                current_block='data-lines'
+                l = line.strip()
+                if not l: # empty line
+                    continue
+                dat = l.split('\t')
+                chrom = str(dat[0])
+                if metadata['chrom'] is None:
+                    metadata['chrom'] = chrom
+                elif metadata['chrom']!=chrom:
+                    raise TreeTimeError(f"The VCF file {vcf_file!r} contains multiple chromosomes. TreeTime can not yet handle this.")
+                pos = int(dat[1])-1 # Convert VCF 1-based to python 0-based
+                REF = dat[3]
+                ALT = dat[4].split(',') # List of alternate alleles (strings)
+                calls = np.array(dat[9:])
+                if len(calls)!=nsamp:
+                    raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines have different number of calls than the number of samples defined in the header")
+
+                # Treetime only parses GT FORMAT calls. Moreover, "the first sub-field must always be the genotype (GT) if it is present"
+                # according to the VCF 4.2 spec. Note that if the VCF is for one sample only then the format's a bit different, but this'll
+                # raise an error above because the header differs from what we assert. Other variation (CN, BND etc) is not parsed by
+                # TreeTime, only GT.
+                FORMAT = dat[8]
+                if not FORMAT.startswith('GT'):
+                    continue
 
-            #else you are a comment line, ignore.
+                #get samples that differ from Ref at this site
+                for sname, sa in zip(samps, calls):
+                    gt = sa.split(':')[0] # May be multiple colon-separated subfields, depending on the line's FORMAT
+
+                    # `gt` is the "index" of the alternate allele for this sample.
+                    # The format of `gt` is quite varied:
+                    # For haploid genomes, it's simply a single int _or_ "." (meaning a call cannot be made at the locus)
+                    # For polyploid genomes, the format is a list of {int, '.'} separated by "/" or "|"
+                    # ('/' means unphased, '|' means phased). 
+                    # If the index is an int, it's the 1-based (!) lookup index for ALT
+
+                    # Split on the valid separators - if haploid, then we'll get a list len=1
+                    gts = gt.split('|') if '|' in gt else gt.split('/')
+                    ploidy = len(gts)
+                    if metadata['ploidy'] is None:
+                        metadata['ploidy'] = ploidy
+                    elif metadata['ploidy']!=ploidy:
+                        raise TreeTimeError(f"The VCF file {vcf_file!r} had genotype calls of ploidy {metadata['ploidy']} but sample {sname!r} has a genotype of ploidy {ploidy}")
+
+                    # if polyploid, but homozygous, then treat as if haploid
+                    if ploidy>1 and len(set(gts))==1:
+                        gt = gts[0]
+
+                    if gt.isdigit(): # haploid, and a call has been made (i.e. it's not gt='.')
+                        gt = int(gt)
+                        if gt==0:
+                            continue # reference allele!
+                        alt = validate_alt(ALT[gt-1]) # gt is the 1-based lookup, but ALT is 0-indexed
+                        if alt:
+                            parse_homozygous_call(sequences[sname],insertions[sname], pos, REF, alt)
+                        continue
+
+                    if gt=='.': # haploid "call cannot be made" identifier - replace REF with N(s)
+                        for i in range(len(REF)):
+                            sequences[sname][pos+i] = 'N'
+                        continue
+
+                    # ---- heterozygous polyploid call  ----
+                    parse_heterozygous_call(gt, sequences[sname],insertions[sname], pos, REF, ALT)
 
     #Gather all variable positions
+    #NOTE: this does not consider positions of insertions
     positions = set()
     for seq, muts in sequences.items():
         positions.update(muts.keys())
 
+    num_insertions = sum([len(list(ins.keys())) for ins in insertions.values()])
+    if len(positions)==0 and num_insertions==0:
+        raise TreeTimeError(f"VCF file {vcf_file!r} has no data-lines which we could extract genotype information from!")
+
     #One or more seqs are same as ref! (No non-ref calls) So haven't been 'seen' yet
     if nsamp > len(sequences):
         missings = set(samps).difference(sequences.keys())
@@ -273,12 +340,13 @@ def read_vcf(vcf_file, ref_file=None):
     compress_seq = {'reference':refSeqStr,
                     'sequences': sequences,
                     'insertions': insertions,
-                    'positions': sorted(positions)}
+                    'positions': sorted(positions),
+                    'metadata': metadata}
 
     return compress_seq
 
 
-def write_vcf(tree_dict, file_name):#, compress=False):
+def write_vcf(tree_dict, file_name, mask=None):#, compress=False):
     """
     Writes out a VCF-style file (which seems to be minimally handleable
     by vcftools and pyvcf) of the alignment. This is created from a dict
@@ -290,13 +358,25 @@ def write_vcf(tree_dict, file_name):#, compress=False):
     Parameters
     ----------
      tree_dict: nested dict
-        A nested dict with keys 'sequence' 'reference' and 'positions',
-        as is created by :py:meth:`treetime.TreeAnc.get_tree_dict`
+        A nested dict with required keys of:
+        'sequences': maps sampleName -> pos (0-based) -> new base (SNP)
+        'reference': string of reference nuc sequence
+        'positions': sorted list of 0-based positions with variation in sequences
+        And optional keys:
+        'inferred_const_sites': list or set, 0-based positions to skip output for.
+        This input is often created by :py:meth:`treetime.TreeAnc.get_tree_dict`
+        'metadata': dict of information to influence VCF formatting. Only
+        the following keys are used:
+        'metadata.ploidy': int. Influences how genotype calls are formatted. (default
+        of 2 (diploid) used if not provided)
+        'metadata.chrom': str. The chromosome name (default of '1' used if not provided)
 
      file_name: str
         File to which the new VCF should be written out. File names ending with
         '.gz' will result in the VCF automatically being gzipped.
 
+    mask : optional, str of 0 or 1
+        Calls at these positions will be skipped
     """
 
 #   Programming Logic Note:
@@ -329,6 +409,23 @@ def write_vcf(tree_dict, file_name):#, compress=False):
     sequences = tree_dict['sequences']
     ref = tree_dict['reference']
     positions = tree_dict['positions']
+    ploidy = tree_dict.get('metadata', {}).get('ploidy', 2)
+    chrom_name = tree_dict.get('metadata', {}).get('chrom', '1')
+    sample_names = list(sequences.keys())
+    inferred_const_sites = set(tree_dict.get('inferred_const_sites', []))
+
+    # For every variable site in sequences, flip the format around so
+    # we can have fast lookups later on.
+    alleles = {}
+    num_samples = len(sample_names)
+    for idx, name in enumerate(sample_names):
+        for posn, allele in sequences[name].items():
+            if posn not in alleles:
+                alleles[posn] = np.zeros(num_samples, dtype='U')
+            alleles[posn][idx] = allele
+    # fill in reference
+    for posn,bases in alleles.items():
+        bases[bases==''] = ref[posn]
 
     def handleDeletions(i, pi, pos, ref, delete, pattern):
         refb = ref[pi]
@@ -371,7 +468,7 @@ def write_vcf(tree_dict, file_name):#, compress=False):
         #Rotate them into 'calls'
         align = np.asarray(sites).T
 
-        #Get rid of '-', and put '.' for calls that match ref
+        #Get rid of '-', and put '0' for calls that match ref
         #Only removes trailing '-'. This breaks VCF convension, but the standard
         #VCF way of handling this* is really complicated, and the situation is rare.
         #(*deletions and mutations at the same locations)
@@ -383,7 +480,7 @@ def write_vcf(tree_dict, file_name):#, compress=False):
                 gp-=1
             pat = "".join(pt)
             if pat == refb:
-                fullpat.append('.')
+                fullpat.append('0')
             else:
                 fullpat.append(pat)
 
@@ -393,19 +490,21 @@ def write_vcf(tree_dict, file_name):#, compress=False):
 
 
     #prepare the header of the VCF & write out
-    header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+list(sequences.keys())
+    header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+sample_names
 
     opn = gzip.open if file_name.endswith(('.gz', '.GZ')) else open
     out_file = opn(file_name, 'w')
 
     out_file.write( "##fileformat=VCFv4.2\n"+
                         "##source=NextStrain\n"+
-                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
+                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"+
+                        f"##contig=<ID={chrom_name}>\n")
     out_file.write("\t".join(header)+"\n")
 
     vcfWrite = []
     errorPositions = []
     explainedErrors = 0
+    mask_skip_count = 0
 
     #Why so basic? Because we sometimes have to back up a position!
     i=0
@@ -420,28 +519,14 @@ def write_vcf(tree_dict, file_name):#, compress=False):
         delete = False #deletion at this position - need to grab previous base (invariable)
         deleteGroup = False #deletion at next position (mutation at this pos) - do not need to get prev base
 
-        #try/except is much more efficient than 'if' statements for constructing patterns,
-        #as on average a 'variable' location will not be variable for any given sequence
-        pattern = []
-        #pattern2 gets the pattern at next position to check for upcoming deletions
-        #it's more efficient to get both here rather than loop through sequences twice!
-        pattern2 = []
-        for k,v in sequences.items():
-            try:
-                pattern.append(sequences[k][pi])
-            except KeyError:
-                pattern.append(ref[pi])
-
-            try:
-                pattern2.append(sequences[k][pi+1])
-            except KeyError:
-                try:
-                    pattern2.append(ref[pi+1])
-                except IndexError:
-                    pass
+        if mask and mask[pi] == '1':
+            mask_skip_count+=1
+            i+=1
+            continue
 
-        pattern = np.array(pattern).astype('U')
-        pattern2 = np.array(pattern2).astype('U')
+        # patterns will be empty if there was no variation in the sequences
+        pattern = alleles[pi] if pi in alleles else np.array([]).astype('U')
+        pattern2 = alleles[pi+1] if pi+1 in alleles else np.array([]).astype('U')
 
         #If a deletion here, need to gather up all bases, and position before
         if any(pattern == '-'):
@@ -462,39 +547,40 @@ def write_vcf(tree_dict, file_name):#, compress=False):
 
         #If deletion, treat affected bases as 1 'call':
         if delete or deleteGroup:
+            if pattern.size==0: # no variation in sequences
+                pattern = np.full(num_samples, refb, dtype='U')
             i, pi, pos, refb, pattern = handleDeletions(i, pi, pos, ref, delete, pattern)
-        #If no deletion, replace ref with '.', as in VCF format
+        #If no deletion, replace ref with '0' which means the reference base is unchanged
         else:
-            pattern[pattern==refb] = '.'
+            pattern[pattern==refb] = '0'
 
-        #Get the list of ALTs - minus any '.'!
+        #Get the list of ALTs - minus any '0' which are unchanged reference sequences!
         uniques = np.unique(pattern)
-        uniques = uniques[np.where(uniques!='.')]
+        uniques = uniques[np.where(uniques!='0')]
 
         #Convert bases to the number that matches the ALT
         j=1
         for u in uniques:
             pattern[np.where(pattern==u)[0]] = str(j)
             j+=1
-        #Now convert these calls to #/# (VCF format)
-        calls = [ j+"/"+j if j!='.' else '.' for j in pattern ]
 
         #What if there's no variation at a variable site??
         #This can happen when sites are modified by TreeTime - see below.
-        printPos = True
-        if len(uniques)==0:
+        #We don't print to VCF (because no variation!)
+        any_variation = len(uniques)!=0
+        if not any_variation:
             #If we expect it (it was made constant by TreeTime), it's fine.
-            if 'inferred_const_sites' in tree_dict and pi in tree_dict['inferred_const_sites']:
+            if pi in inferred_const_sites:
                 explainedErrors += 1
-                printPos = False #and don't output position to the VCF
             else:
                 #If we don't expect, raise an error
                 errorPositions.append(str(pi))
 
         #Write it out - Increment positions by 1 so it's in VCF numbering
         #If no longer variable, and explained, don't write it out
-        if printPos:
-            output = ["MTB_anc", str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
+        if any_variation:
+            calls = [ "/".join([j]*ploidy) for j in pattern ]
+            output = [chrom_name, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
             vcfWrite.append("\t".join(output))
 
         i+=1
@@ -507,6 +593,9 @@ def write_vcf(tree_dict, file_name):#, compress=False):
     #purposes, because the site is 'variant' against the ref, it is variant, as expected, and so
     #won't be counted in the below list, which is only sites removed from the VCF.
 
+    if mask_skip_count:
+        print(f"{mask_skip_count} positions were skipped due to the provided mask")
+
     if 'inferred_const_sites' in tree_dict and explainedErrors != 0:
         print(fill("Sites that were constant except for ambiguous bases were made" +
                    " constant by TreeTime. This happened {} times. These sites are".format(explainedErrors) +


=====================================
treetime/wrappers.py
=====================================
@@ -521,7 +521,7 @@ def ancestral_reconstruction(params):
     ### OUTPUT and saving of results
     ###########################################################################
     if params.gtr=='infer':
-        fname = outdir+'/sequence_evolution_model.txt'
+        fname = outdir+'sequence_evolution_model.txt'
         with open(fname, 'w', encoding='utf-8') as ofile:
             ofile.write(str(treeanc.gtr)+'\n')
         print('\nInferred sequence evolution model (saved as %s):'%fname)
@@ -854,11 +854,25 @@ def estimate_clock_model(params):
     else:
         print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))
 
+    if hasattr(myTree, 'outliers') and myTree.outliers is not None:
+        print("--- saved detected outliers as " + basename + 'outliers.tsv')
+        myTree.outliers.to_csv(basename + 'outliers.tsv', sep='\t')
+
+    if hasattr(myTree, 'outliers') and myTree.outliers is not None and params.prune_outliers:
+        for outlier in myTree.outliers.index:
+            print("removing ", outlier)
+            myTree.tree.prune(outlier)
+
     if not params.keep_root:
         # write rerooted tree to file
         outtree_name = basename+'rerooted.newick'
-        Phylo.write(myTree.tree, outtree_name, 'newick')
-        print("--- re-rooted tree written to \n\t%s\n"%outtree_name)
+    elif params.prune_outliers:
+        outtree_name = basename+'pruned.newick'
+    else:
+        outtree_name = basename+'.output.newick'
+
+    Phylo.write(myTree.tree, outtree_name, 'newick')
+    print("--- new tree written to \n\t%s\n"%outtree_name)
 
     table_fname = basename+'rtt.csv'
     with open(table_fname, 'w', encoding='utf-8') as ofile:



View it on GitLab: https://salsa.debian.org/med-team/python-treetime/-/commit/e6064017dcfbf63072b9d8a06cc5775d5eff2f6e

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-treetime/-/commit/e6064017dcfbf63072b9d8a06cc5775d5eff2f6e
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241109/6d7475c4/attachment-0001.htm>


More information about the debian-med-commit mailing list