[med-svn] [Git][med-team/python-treetime][upstream] New upstream version 0.11.4
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sat Nov 9 20:49:19 GMT 2024
Étienne Mollier pushed to branch upstream at Debian Med / python-treetime
Commits:
e6064017 by Étienne Mollier at 2024-11-09T21:16:32+01:00
New upstream version 0.11.4
- - - - -
12 changed files:
- README.md
- changelog.md
- test.sh
- + test/test_vcf.py
- treetime/CLI_io.py
- treetime/__init__.py
- treetime/argument_parser.py
- treetime/gtr.py
- treetime/treeanc.py
- treetime/treetime.py
- treetime/vcf_utils.py
- treetime/wrappers.py
Changes:
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
[![CI](https://github.com/neherlab/treetime/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/neherlab/treetime/actions/workflows/ci.yml)
-[![anaconda](https://anaconda.org/bioconda/treetime/badges/installer/conda.svg)](https://anaconda.org/bioconda/treetime)
+[![anaconda](https://anaconda.org/bioconda/treetime/badges/version.svg)](https://anaconda.org/bioconda/treetime)
[![readthedocs](https://readthedocs.org/projects/treetime/badge/)](https://treetime.readthedocs.io/en/latest/)
## TreeTime: maximum likelihood dating and ancestral sequence inference
=====================================
changelog.md
=====================================
@@ -1,3 +1,17 @@
+# 0.11.4: Bug fixes
+- fix output of mutations into the `branch_mutation.txt` file, which was masked by a conditional
+- adjust CLI help
+- adjust output precision
+
+# 0.11.3: Bug fixes
+- calls to some random number generators errored after the recent switch to treetime-owned RNGs.
+- default argument for clock-filter method had a spelling mistake
+- a branch length optimization function didn't handle profiles==True correctly.
+
+# 0.11.2: improvements in VCF parsing/writing + clock-filter functionality
+- rewrite of the vcf reading and writing code including many more tests, closer alignment with spec, and performance improvements by @jameshadfield [PR #263](https://github.com/neherlab/treetime/pull/263)
+- clock-filter command can now remove detected outliers with the additional flag `--prune-outliers`.
+
# 0.11.1: bug fixes and tweaks to plotting
- fix division by zero error during GTR inference
- improve doc strings in parse dates
=====================================
test.sh
=====================================
@@ -19,5 +19,10 @@ if [ "$OUT" != 0 ]; then
exit 1
fi
+pytest test_vcf.py
+if [ "$?" != 0 ]; then
+ exit 1
+fi
+
# Clean up, the 202* is to remove auto-generated output dirs
rm -rf treetime_examples __pycache__ 202*
=====================================
test/test_vcf.py
=====================================
@@ -0,0 +1,548 @@
+import pytest
+from textwrap import dedent
+from treetime.vcf_utils import read_vcf, write_vcf
+from treetime import TreeTimeError
+
+def create_vcf(dir, content):
+ d = dir / 'data'
+ d.mkdir()
+ fname = d / "no-header.vcf"
+ with open(fname, 'w') as fh:
+ print(dedent(content), file=fh)
+ return str(fname)
+
+class TestMalformedVcf:
+
+ def test_no_header(self, tmp_path):
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ """))
+
+ def test_meta_info_after_header(self, tmp_path):
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ ##fileformat=VCFv4.3
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ """))
+
+ def test_inconsistent_sample_numbers(self, tmp_path):
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ 1\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1
+ """))
+
+ def test_no_data_lines(self, tmp_path):
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ """))
+
+ def test_consistent_ploidy(self, tmp_path):
+ """
+ The spec heavily implies that ALT alleles must be supplied for each copy
+ of the chromosome, even for no-calls: "If a call cannot be made for a
+ sample at a given locus, '.' must be specified for each missing allele
+ in the GT field (for example './.' for a diploid genotype and '.' for
+ haploid genotype)."
+ """
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1/1
+ """))
+
+class TestMultipleChromosomes:
+ """
+ These are valid VCFs but TreeTime cannot yet parse them
+ """
+ def test_multiple_chromosomes(self, tmp_path):
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ 2\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ """))
+
+def zero_based(idx):
+ """Useful as a form of code commentary. Convert idx from 1-based to 0-based"""
+ return idx-1
+
+def write_data(tmp_path_factory, sample_names, data_lines, reference_seq, meta_lines=None, reference_name='reference_name'):
+ """helper function to create and write a VCF and FASTA file. Returns a tuple of filenames"""
+ if meta_lines:
+ vcf = "\n".join(meta_lines)+"\n"
+ else:
+ vcf = dedent("""\
+ ##fileformat=VCFv4.3
+ ##contig=<ID=1,length=50>
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+ """)
+ vcf += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(sample_names) + "\n"
+ for line in data_lines:
+ vcf += "\t".join(line) + "\n"
+ reference=dedent(f"""\
+ >{reference_name}
+ {reference_seq}
+ """)
+ dir = tmp_path_factory.mktemp("data")
+ vcf_fn = dir / "snps.vcf"
+ ref_fn = dir / "reference.fasta"
+ with open(vcf_fn, 'w') as fh:
+ print(vcf, file=fh)
+ with open(ref_fn, 'w') as fh:
+ print(reference, file=fh)
+ return (str(vcf_fn), str(ref_fn))
+
+
+class TestSimpleVcf:
+ @pytest.fixture(scope="class")
+ def data(self, tmp_path_factory):
+ """
+ Creates the VCF and reference files (in a temporary location) to be used by
+ the tests in this class.
+ Returns their filenames, as well as information about their contents which
+ we can use as comparisons in tests.
+ NOTE: this function is only run once - _not_ once per test.
+ """
+ sample_names = ["sample_A", "sample_B", "sample_C"]
+ data_lines = [
+ ["1", "5", ".", "T", "C", ".", ".", ".", "GT", "1", "1", "1"],
+ ["1", "7", ".", "T", "G", ".", ".", ".", "GT", "1", "1", "0"],
+ ["1", "14", ".", "C", "T", ".", ".", ".", "GT", "1", "1", "0"],
+ ["1", "18", ".", "C", "T", ".", ".", ".", "GT", "0", "0", "1"],
+ ["1", "28", ".", "A", "N", ".", ".", ".", "GT", "0", "0", "1"],
+ ["1", "29", ".", "A", "N", ".", ".", ".", "GT", "0", "0", "1"],
+ ["1", "30", ".", "A", "N", ".", ".", ".", "GT", "0", "0", "1"],
+ ["1", "33", ".", "A", "C,G", ".", ".", ".", "GT", "1", "2", "0"],
+ ["1", "39", ".", "C", "T", ".", ".", ".", "GT", "1", "0", "0"],
+ ["1", "42", ".", "G", "A", ".", ".", ".", "GT", "0", "1", "0"],
+ ["1", "43", ".", "A", "T", ".", ".", ".", "GT", "1", "1", "0"],
+ ]
+ positions = [int(line[1]) for line in data_lines]
+ reference_seq = 'AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA'
+ filenames = write_data(tmp_path_factory, sample_names, data_lines, reference_seq)
+ return {
+ "filenames": filenames,
+ "positions": positions,
+ "data_lines": data_lines,
+ "sample_names": sample_names,
+ "reference_seq": reference_seq,
+ }
+
+ def test_mutation_structure(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+        # The structure of insertions & sequences is a dict with a key for each sample, regardless of whether
+ # any insertions/mutations are defined for that sample.
+ # samples = {'sample_A', 'sample_B', 'sample_C'}
+ assert(set(data['sample_names'])==set(vcf_data['sequences'].keys()))
+ assert(set(data['sample_names'])==set(vcf_data['insertions'].keys()))
+
+ def test_mutation_parsing(self, data):
+ """
+ Test that the correct SNPs (ALTs) are extracted from the VCF for the three samples defined in the VCF
+ """
+ vcf_data = read_vcf(*data['filenames'])
+ def summarise(sample):
+ # uses 1-based position so we can easily compare with the VCF file
+ return ",".join([f"{pos+1}{alt}" for pos,alt in vcf_data['sequences'][sample].items()])
+
+ assert("5C,7G,14T,33C,39T,43T"==summarise('sample_A'))
+ assert("5C,7G,14T,33G,42A,43T"==summarise('sample_B'))
+ assert("5C,18T,28N,29N,30N"==summarise('sample_C'))
+
+ def test_no_insertions_parsed(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ for sample, ins in vcf_data['insertions'].items():
+ assert(ins=={})
+
+ def test_reference_sequence(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ assert(data['reference_seq']==vcf_data['reference'])
+
+ def test_positions_with_mutations(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ assert(data['positions'] == [pos+1 for pos in vcf_data['positions']])
+
+
+class TestNoCallsOrMissing:
+ """
+ Tests a few overlapping concepts:
+ - The genotype field (i.e. within the column of a sample name) may be "." which results in a
+ no call allele. So we call this as SNP(s) changing the REF to N(s). The 4.2 spec refers
+ to this as "a call cannot be made for a sample at a given locus". Spec 4.3 defines this
+ as "If a call cannot be made for a sample at a given locus, '.' must be specified for each
+ missing allele in the GT field (for example ./. for a diploid genotype and . for haploid
+ genotype)"
+ - The ALT allele may be "*". v4.2. defines this as "missing due to a upstream deletion (sic)"
+ and 4.3 defines this as "allele missing due to overlapping deletion". Either way we encode
+ this similarly to the no-call above, i.e. changing the REF to N(s). Note that the allele
+ must be the one-character "*" - this can't appear within other bases.
+ - The ALT allele may be ".". This was introduced in version 4.3 as "a MISSING value (no variant)"
+ So we treat this the same as "*" above. Again, this must be the entire allele, '.' can't appear
+ as part of other bases. Note that while this doesn't exist in v4.2, it is commonly found in
+ v4.2 VCF files, e.g. those produced by `snp_sites`.
+ """
+
+ @pytest.fixture(scope="class")
+ def data(self, tmp_path_factory):
+ reference="ATCGA"
+ sample_names = ["sample_A", "sample_B", "sample_C", "sample_D"]
+ data_lines = [
+ ## No call alleles
+ ["1", "2", ".", "T", "A", ".", ".", ".", "GT", ".", "1", "0", "0"], # sample A has T2N
+ ["1", "4", ".", "GA", "C", ".", ".", ".", "GT", ".", "1", "0", "0"], # sample A has GA -> NN
+ ## star alleles & dot alleles indicate missing
+ ["1", "3", ".", "C", "*,G", ".", ".", ".", "GT", "1", "2", "0", "0"], # sample A has C->N
+ ["1", "3", ".", "C", "T,.", ".", ".", ".", "GT", "0", "0", "1", "2"], # sample D has C->N
+ ["1", "4", ".", "GA", ".,*", ".", ".", ".", "GT", "0", "0", "1", "2"], # both samples C & D have G->N and A->N
+ ]
+ filenames = write_data(tmp_path_factory, sample_names, data_lines, reference)
+ return {"filenames": filenames}
+
+ def test_no_call_allele(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ assert(vcf_data['sequences']['sample_A'][zero_based(2)]=='N')
+ assert(vcf_data['sequences']['sample_A'][zero_based(4)]=='N')
+ assert(vcf_data['sequences']['sample_A'][zero_based(5)]=='N')
+ assert(vcf_data['sequences']['sample_B'][zero_based(4)]=='C')
+ assert(vcf_data['sequences']['sample_B'][zero_based(5)]=='-')
+
+ def test_star_allele(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ assert(vcf_data['sequences']['sample_A'][zero_based(3)]=='N')
+ assert(vcf_data['sequences']['sample_B'][zero_based(3)]=='G')
+
+ assert(vcf_data['sequences']['sample_D'][zero_based(4)]=='N')
+ assert(vcf_data['sequences']['sample_D'][zero_based(5)]=='N')
+
+ def test_dot_allele(self, data):
+ vcf_data = read_vcf(*data['filenames'])
+ assert(vcf_data['sequences']['sample_C'][zero_based(3)]=='T')
+ assert(vcf_data['sequences']['sample_D'][zero_based(3)]=='N')
+
+ assert(vcf_data['sequences']['sample_C'][zero_based(4)]=='N')
+ assert(vcf_data['sequences']['sample_C'][zero_based(5)]=='N')
+
+
+ def test_malformed_star_allele(self, tmp_path):
+ """* must be an entire allele, not together with other bases"""
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ ##fileformat=VCFv4.3
+ 1\t5\t.\tT\tC*\t.\t.\t.\tGT\t1\t1\t1
+ """))
+
+ def test_malformed_dot_allele(self, tmp_path):
+ """. must be an entire allele, not together with other bases"""
+ with pytest.raises(TreeTimeError):
+ read_vcf(create_vcf(tmp_path, """\
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ ##fileformat=VCFv4.3
+ 1\t5\t.\tT\tC.,G\t.\t.\t.\tGT\t1\t1\t1
+ """))
+
+
+class TestVcfSpecExamples:
+ """
+ Test the examples provided in the VCF specs. Note that the examples
+ aren't provided as per-sample VCF files, so there's a bit of interpretation
+ required to create the example data. Note that the actual `test_<name>`
+ methods are added dynamically after the class definition.
+ """
+ def create(self, tmp_path, input):
+ lines = ["##fileformat=VCFv4.2", # Actual version may differ, but we don't parse this so it doesn't matter (yet)
+ "##contig=<ID=1,length=50>",
+ "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
+ "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(input['samples']),
+ ]
+ for d in input['data']:
+ lines.append(f"{d['chrom']}\t{d['pos']}\t.\t{d['ref']}\t{d['alt']}\t.\t.\t.\tGT\t" + "\t".join([str(x) for x in d['values']]))
+ return create_vcf(tmp_path, "\n".join(lines))
+
+ @staticmethod
+ def add_test(example_data):
+ def test(self, tmp_path):
+ vcf = read_vcf(self.create(tmp_path, example_data))
+ for sample_name, sample_data in example_data['expect_sequences'].items():
+ assert(sample_name in vcf['sequences'])
+ for snp in sample_data:
+ assert(len(snp)==2)
+ assert(vcf['sequences'][sample_name][zero_based(int(snp[0]))] == snp[1])
+ assert(len(vcf['sequences'][sample_name].keys()) == len(sample_data))
+ for sample_name, sample_data in example_data['expect_insertions'].items():
+ assert(sample_name in vcf['insertions'])
+ for ins in sample_data:
+ assert(vcf['insertions'][sample_name][zero_based(int(ins[0]))] == ins[1:])
+ assert(len(vcf['insertions'][sample_name].keys()) == len(sample_data))
+ return test
+
+"""
+-------- comment re: TreeTime's encoding of insertions --------
+
+Given a reference base of 'C' at 1-based position 2, and an
+insertion _after_ this base of 'A', TreeTime encodes this as an insertion
+of 'CA' at (0-based) position 1. This was unexpected to me;
+I would have expected an insertion of just 'A'. However, I've written
+the tests to conform to TreeTime's existing behaviour (version 0.11.1).
+
+---------------------------------------------------------------
+"""
+
+vcf_spec_data = {
+ 'version_4_2': {
+ '5_1_1': {
+ 'samples': ['A', 'B', 'C'],
+ 'data': [
+ {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'G', 'values': [1, 0, 0]},
+ {'chrom': 20, 'pos': 2, 'ref': 'TC', 'alt': 'T,TCA', 'values': [0, 1, 2]}
+ ],
+ # Note that with TC->TCA, there are no sequence changes (T->T, C->C) but one insertion ("A")
+ 'expect_sequences': {'A': ['3G'], 'B': ['3-']},
+ 'expect_insertions': {'C': ['3CA']} # see comment above. Actually only a single base "A" insertion
+ },
+ '5_1_2': {
+ 'samples': ['A', 'B'],
+ 'data': [
+ {'chrom': 20, 'pos': 2, 'ref': 'TC', 'alt': 'TG,T', 'values': [1,2]}
+ ],
+ 'expect_sequences': {'A': ['3G'], 'B': ['3-']},
+ 'expect_insertions': {}
+ },
+ '5_1_3': {
+ 'samples': ['A', 'B', 'C'],
+ ## This example is quite hard to understand for sample A. The alignment provided indicates that
+ ## TCG -> T-G is encoded as "ref: TCG, alt: TG". But without aligning the alt to the ref, the only
+ ## sane interpretation of this is 2 changes: C->G and G->-. This is what the spec means by
+ ## "the molecular equivalence explicitly listed above in the per-base alignment is discarded so the
+ ## actual placement of equivalent g isn't retained" (also explained in example 5.2.4)
+ ## Similarly, for sample C, the actual event is described as "A base is inserted wrt the reference
+        ## sequence" but the way the VCF file reads, we are going to have a SNP (G->A) + an insertion (G)
+ 'data': [
+ {'chrom': 20, 'pos': 2, 'ref': 'TCG', 'alt': 'TG,T,TCAG', 'values': [1,2,3]}
+ ],
+ 'expect_sequences': {'A': ['3G', '4-'], 'B': ['3-', '4-'], 'C': ['4A']},
+ 'expect_insertions': {'C': ['4AG']} # See comment above
+ },
+ '5_2_1': {
+ 'samples': ['A'],
+ 'data': [
+ {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'T', 'values': [1]}
+ ],
+ 'expect_sequences': {'A': ['3T']},
+ 'expect_insertions': {}
+ },
+ '5_2_2': {
+ 'samples': ['A'],
+ 'data': [
+ {'chrom': 20, 'pos': 3, 'ref': 'C', 'alt': 'CTAG', 'values': [1]}
+ ],
+ 'expect_sequences': {},
+ 'expect_insertions': {'A': ['3CTAG']} # See comment above
+ },
+ '5_2_3': {
+ 'samples': ['A'],
+ 'data': [
+ {'chrom': 20, 'pos': 2, 'ref': 'TCG', 'alt': 'T', 'values': [1]}
+ ],
+ 'expect_sequences': {'A': ['3-', '4-']},
+ 'expect_insertions': {}
+ },
+ }
+}
+# dynamically create a test for each example in the above data (by adding methods to the class)
+for spec_version, spec_data in vcf_spec_data.items():
+ for example_key, example_data in spec_data.items():
+ setattr(TestVcfSpecExamples, f"test_{spec_version}_example_{example_key}", TestVcfSpecExamples.add_test(example_data))
+
+
+class TestMutationAndInsertion:
+ """
+ Tests the situation where a reference base is mutated _and_ there's an insertion
+ """
+
+ @pytest.fixture(scope="class")
+ def data(self, tmp_path_factory):
+ reference="ATCGA"
+ sample_names = ["sample_A", "sample_B"]
+ data_lines = [
+            ["1", "2", ".", "T", "GA", ".", ".", ".", "GT", "1", "0"], # sample A has both a T->G mutation _and_ a subsequent "A" insertion
+ ["1", "3", ".", "CGA", "NTTTA,CCGT", ".", ".", ".", "GT", "1", "2"], # complex! Both samples have multiple mutations + an insertion
+ ]
+ filenames = write_data(tmp_path_factory, sample_names, data_lines, reference)
+ return {"filenames": filenames}
+
+ def test_single_ref_base_mutation_and_insertion(self, data):
+ # This case was missed in treetime 0.11.1
+ vcf_data = read_vcf(*data['filenames'])
+ assert(vcf_data['sequences']['sample_A'][zero_based(2)]=='G')
+ assert(vcf_data['insertions']['sample_A'][zero_based(2)]=='GA') # see comment above re: insertion encoding
+
+ def test_multi_ref_base_mutations_and_insertion(self, data):
+ # This case was missed in treetime 0.11.1
+ vcf_data = read_vcf(*data['filenames'])
+ assert(vcf_data['sequences']['sample_A'][zero_based(3)]=='N')
+ assert(vcf_data['sequences']['sample_A'][zero_based(4)]=='T')
+ assert(vcf_data['sequences']['sample_A'][zero_based(5)]=='T')
+ assert(vcf_data['insertions']['sample_A'][zero_based(5)]=='TTA') # see comment above re: insertion encoding
+
+ assert(vcf_data['sequences']['sample_B'][zero_based(4)]=='C')
+ assert(vcf_data['sequences']['sample_B'][zero_based(5)]=='G')
+ assert(vcf_data['insertions']['sample_B'][zero_based(5)]=='GT') # see comment above re: insertion encoding
+
+class TestMetadataParsing:
+
+ def test_simple_haploid(self, tmp_path):
+ data = read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B\tsample_C
+ 1\t5\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ 1\t6\t.\tT\tC\t.\t.\t.\tGT\t1\t1\t1
+ """))
+ assert(data['metadata']['chrom']=='1')
+ assert(data['metadata']['ploidy']==1)
+ assert(data['metadata']['meta_lines']==['##fileformat=VCFv4.3'])
+
+ def test_simple_diploid(self, tmp_path):
+ data = read_vcf(create_vcf(tmp_path, """\
+ ##fileformat=VCFv4.3
+ ##contig=<ID=1,length=50>
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A
+ foo\t5\t.\tT\tC\t.\t.\t.\tGT\t1/1
+ foo\t6\t.\tT\tC\t.\t.\t.\tGT\t.|.
+ """))
+ assert(data['metadata']['chrom']=='foo')
+ assert(data['metadata']['ploidy']==2)
+ assert(data['metadata']['meta_lines']==['##fileformat=VCFv4.3', '##contig=<ID=1,length=50>', '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'])
+
+
+
+
+def roundtrip(tmp_path, sample_names, data_lines, reference, meta_lines, pass_metadata=True):
+ """
+ Write the provided data to a (temporary) VCF file.
+ Then read it via `read_vcf` and store this as `vcf_a`.
+ Then write this data via `write_vcf` and read it back, storing as `vcf_b`.
+ Returns a tuple of (vcf_a, vcf_b)
+ """
+
+ dir = tmp_path / 'data'
+ dir.mkdir()
+ vcf_filename_a = str(dir / "a.vcf")
+ vcf_filename_b = str(dir / "b.vcf")
+ reference_filename = str(dir / 'reference.fasta')
+
+ with open(str(dir / 'reference.fasta'), 'w') as fh:
+ print(dedent(f"""\
+ >reference_name
+ {reference}
+ """), file=fh)
+
+ with open(vcf_filename_a, 'w') as fh:
+ vcf_lines = meta_lines[:] + \
+ ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(sample_names)] + \
+ ["\t".join(data) for data in data_lines]
+ print("\n".join(vcf_lines), file=fh)
+
+ vcf_a = read_vcf(vcf_filename_a, reference_filename)
+
+ ## take this data and write it out as a VCF
+ tree_dict = {'sequences': vcf_a['sequences'], 'reference': vcf_a['reference'], 'positions': vcf_a['positions']}
+ if pass_metadata:
+ tree_dict['metadata'] = vcf_a['metadata']
+ write_vcf(tree_dict, vcf_filename_b)
+
+ ## then read in this (just-created) VCF
+ vcf_b = read_vcf(vcf_filename_b, reference_filename)
+
+ ## uncomment the following & run pytest with "-s" to see the contents of the VCF file being written by TreeTime
+ # print("_________________________________")
+ # with open(vcf_filename_b) as fh:
+ # for line in fh:
+ # print(line, end='')
+ # print("\n_________________________________")
+
+ return (vcf_a, vcf_b)
+
+
+class TestWriting:
+ """
+ Write a simple VCF file out (created by hand), parse it with `read_vcf` (called "vcf_a")
+ then write that data out and read it back in (called "vcf_b"). vcf_a should equal vcf_b
+ for the data we care about.
+ """
+
+ def test_basic_roundtripping_vcf(self, tmp_path):
+ [vcf_a, vcf_b] = roundtrip(tmp_path,
+ reference = "ATCGACC",
+ meta_lines = [
+ "##fileformat=VCFv4.3",
+ "##contig=<ID=1,length=7>",
+ "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
+ ],
+ sample_names = ["sample_A", "sample_B"],
+ data_lines = [
+ ["foo", "2", ".", "T", "G", ".", ".", ".", "GT", "1", "0"],
+ ["foo", "3", ".", "C", "GA,*", ".", ".", ".", "GT", "1", "2"], # "A" insertion will be lost on roundtrip!
+ ["foo", "5", ".", "A", "TC", ".", ".", ".", "GT", ".", "1"],
+ ["foo", "6", ".", "CC", "C", ".", ".", ".", "GT", "1", "0"], # del of (1-based) pos 7 in sample A
+ ],
+ pass_metadata=False
+ )
+
+        ## reference is certainly the same, since the same FASTA file is used for both read_vcf calls, but check anyway
+        assert(vcf_a['reference']==vcf_b['reference'])
+ ## insertions, if there are any, _won't_ be the same because `write_vcf` doesn't read insertions even if they're provided
+ ## as input. Note that vcf_a does have an insertion!
+
+ ## The meta-lines are different (as expected) and since we are not passing the metadata information to `write_vcf`
+ ## we get back the defaults, which differ from the input
+ assert(vcf_a['metadata']['chrom'] == "foo" and vcf_b['metadata']['chrom'] == "1")
+ assert(vcf_a['metadata']['ploidy'] == 1 and vcf_b['metadata']['ploidy'] == 2)
+
+ ## positions should be the same (positions are only reflective of data in `sequences`, so the ignoring of insertions is ok)
+        assert(vcf_a['positions']==vcf_b['positions'])
+ ## check sequences are the same
+ assert(vcf_a['sequences']==vcf_b['sequences'])
+
+ def test_basic_roundtripping_vcf_with_metadata(self, tmp_path):
+ [vcf_a, vcf_b] = roundtrip(tmp_path,
+ reference = "ATCGACC",
+ meta_lines = [
+ "##fileformat=VCFv4.3",
+ "##contig=<ID=1,length=7>",
+ "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
+ ],
+ sample_names = ["sample_A", "sample_B"],
+ data_lines = [
+ ["1", "2", ".", "T", "G", ".", ".", ".", "GT", "1", "0"],
+ ["1", "3", ".", "C", "GA,*", ".", ".", ".", "GT", "1", "2"], # "A" insertion will be lost on roundtrip!
+ ["1", "5", ".", "A", "TC", ".", ".", ".", "GT", ".", "1"],
+ ["1", "6", ".", "CC", "C", ".", ".", ".", "GT", "1", "0"], # del of (1-based) pos 7 in sample A
+ ],
+ pass_metadata=True
+ )
+
+        ## reference is certainly the same, since the same FASTA file is used for both read_vcf calls, but check anyway
+        assert(vcf_a['reference']==vcf_b['reference'])
+ ## insertions, if there are any, _won't_ be the same because `write_vcf` doesn't read insertions even if they're provided
+ ## as input. Note that vcf_a does have an insertion!
+
+ ## The meta-lines are different (as expected) but the chrom + ploidy should be the same
+ assert(vcf_a['metadata']['chrom'] == vcf_b['metadata']['chrom'])
+ assert(vcf_a['metadata']['ploidy'] == vcf_b['metadata']['ploidy'])
+
+ ## positions should be the same (positions are only reflective of data in `sequences`, so the ignoring of insertions is ok)
+        assert(vcf_a['positions']==vcf_b['positions'])
+ ## check sequences are the same
+ assert(vcf_a['sequences']==vcf_b['sequences'])
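For orientation, a minimal sketch of the dictionary shape that the tests above assert on, using a tiny hand-written haploid VCF (file names, sample names and variants here are illustrative, not taken from the test data):

    from textwrap import dedent
    from treetime.vcf_utils import read_vcf

    # write a two-sample VCF plus a matching reference FASTA
    with open("snps.vcf", "w") as fh:
        fh.write(dedent("""\
            ##fileformat=VCFv4.3
            #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_A\tsample_B
            1\t2\t.\tT\tC\t.\t.\t.\tGT\t1\t0
            1\t4\t.\tG\tA\t.\t.\t.\tGT\t0\t1
            """))
    with open("ref.fasta", "w") as fh:
        fh.write(">reference_name\nATCGA\n")

    vcf = read_vcf("snps.vcf", "ref.fasta")
    vcf['reference']               # 'ATCGA' -- the FASTA sequence as a string
    vcf['sequences']['sample_A']   # {1: 'C'} -- 0-based position -> alternate base
    vcf['insertions']['sample_B']  # {} -- one (possibly empty) dict per sample
    vcf['positions']               # [1, 3] -- sorted 0-based positions with variation
    vcf['metadata']                # {'chrom': '1', 'ploidy': 1, 'meta_lines': [...]}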
=====================================
treetime/CLI_io.py
=====================================
@@ -164,15 +164,15 @@ def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False,
n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations
if tt.gtr.ambiguous not in [a,d] and n.mask[pos]>0])+f'",mcc="{n.mcc}"'
- for (a, pos, d) in n.mutations:
- if tt.gtr.ambiguous not in [a,d] or report_ambiguous:
- mutations_out.write("%s\t%s\t%s\t%s\n" %(n.name, a, pos + 1, d))
+ for (a, pos, d) in n.mutations:
+ if tt.gtr.ambiguous not in [a,d] or report_ambiguous:
+ mutations_out.write("%s\t%s\t%s\t%s\n" %(n.name, a, pos + 1, d))
if timetree:
n.comment+=(',' if n.comment else '&') + 'date=%1.2f'%n.numdate
mutations_out.close()
# write tree to file
- fmt_bl = "%1.6f" if tt.data.full_length<1e6 else "%1.8e"
+ fmt_bl = "%1.7f" if tt.data.full_length<1e6 else "%1.9e"
if timetree:
outtree_name = basename + f'timetree{tree_suffix}.nexus'
print("--- saved divergence times in \n\t %s\n"%dates_fname)
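The "adjust output precision" changelog item corresponds to the fmt_bl change above: one extra significant digit for the branch lengths written to the output trees. A quick illustration with a made-up branch length:

    bl = 3.141592653e-05     # an arbitrary short branch length
    "%1.6f" % bl             # '0.000031'        (old format, alignments shorter than 1e6)
    "%1.7f" % bl             # '0.0000314'       (new format)
    "%1.8e" % bl             # '3.14159265e-05'  (old format for very long alignments)
    "%1.9e" % bl             # '3.141592653e-05' (new format)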
=====================================
treetime/__init__.py
=====================================
@@ -1,4 +1,4 @@
-version="0.11.1"
+version="0.11.4"
## Here we define an error class for TreeTime errors, MissingData, UnknownMethod and NotReady errors
## are all due to incorrect calling of TreeTime functions or input data that does not fit our base assumptions.
## Errors marked as TreeTimeUnknownErrors might be due to data not fulfilling base assumptions or due
=====================================
treetime/argument_parser.py
=====================================
@@ -81,7 +81,8 @@ tree_description = "Name of file containing the tree in newick, nexus, or phylip
aln_description = "alignment file (fasta)"
-dates_description = "csv file with dates for nodes with 'node_name, date' where date is float (as in 2012.15)"
+dates_description = "csv file with dates for nodes with 'node_name, date' where date is float (as in 2012.15) or in ISO-format (YYYY-MM-DD). "\
+ "Imprecisely known dates can be specified as '2023-XX-XX' or [2013.2:2013.7]"
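The expanded help text above lists the accepted date formats; an illustrative dates file mixing them might look like the following (sample names and values are made up, and the exact header handling is not part of this diff):

    node_name,date
    sample_A,2012.15
    sample_B,2016-03-17
    sample_C,2023-XX-XX
    sample_D,[2013.2:2013.7]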
coalescent_description = \
"coalescent time scale -- sensible values are on the order of the average "\
@@ -303,6 +304,8 @@ def make_parser():
add_seq_len_aln_group(c_parser)
add_reroot_group(c_parser)
+ c_parser.add_argument('--prune-outliers', action='store_true', default=False,
+ help="remove detected outliers from the output tree")
c_parser.add_argument('--allow-negative-rate', required = False, action="store_true", default=False,
help="By default, rates are forced to be positive. For trees with little temporal "
"signal it is advisable to remove this restriction to achieve essentially mid-point rooting.")
=====================================
treetime/gtr.py
=====================================
@@ -476,8 +476,8 @@ class GTR(object):
alphabet=alphabets[alphabet]
gtr = cls(alphabet)
n = gtr.alphabet.shape[0]
- pi = 1.0*rng.randint(0,100,size=(n))
- W = 1.0*rng.randint(0,100,size=(n,n)) # with gaps
+ pi = 1.0*rng.random(size=n)
+ W = 1.0*rng.random(size=(n,n)) # with gaps
gtr.assign_rates(mu=mu, pi=pi, W=W)
return gtr
@@ -851,7 +851,11 @@ class GTR(object):
else:
return -1.0*self.prob_t_compressed(seq_pair, multiplicity,t**2, return_log=True)
- hamming_distance = np.sum(multiplicity[seq_pair[:,1]!=seq_pair[:,0]])/np.sum(multiplicity)
+ if profiles:
+ hamming_distance = 1-np.sum(multiplicity*np.sum(seq_pair[0]*seq_pair[1], axis=1))/np.sum(multiplicity)
+ else:
+ hamming_distance = np.sum(multiplicity[seq_pair[:,1]!=seq_pair[:,0]])/np.sum(multiplicity)
+
try:
from scipy.optimize import minimize_scalar
opt = minimize_scalar(_neg_prob,
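The profiles branch added above seeds the branch-length optimisation with an expected rather than a discrete Hamming distance: per site, the product of the two profiles summed over states is the probability that the states agree. A small numpy sketch of the same arithmetic (the assumption that seq_pair holds two site-by-state profile matrices in this mode is taken from the hunk, not from the full function):

    import numpy as np

    # two "sequences" given as per-site probability profiles over, say, (A, C, G, T)
    child  = np.array([[1.0, 0.0, 0.0, 0.0],    # site 1: certainly A
                       [0.5, 0.5, 0.0, 0.0]])   # site 2: ambiguous A/C
    parent = np.array([[1.0, 0.0, 0.0, 0.0],    # site 1: certainly A
                       [0.0, 1.0, 0.0, 0.0]])   # site 2: certainly C
    multiplicity = np.array([3.0, 1.0])         # site 1 stands for 3 alignment columns

    agreement = np.sum(child * parent, axis=1)  # per-site match probability -> [1.0, 0.5]
    hamming = 1 - np.sum(multiplicity * agreement) / np.sum(multiplicity)
    # hamming == 0.125, the expected fraction of mismatching columns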
=====================================
treetime/treeanc.py
=====================================
@@ -572,7 +572,7 @@ class TreeAnc(object):
"in the position %d: %s, "
"choosing %s" % (amb, str(self.tree.root.state[amb]),
self.tree.root.state[amb][0]), 4)
- self.tree.root._cseq = np.array([k[self.rng.randint(len(k)) if len(k)>1 else 0]
+ self.tree.root._cseq = np.array([k[self.rng.integers(len(k)) if len(k)>1 else 0]
for k in self.tree.root.state])
@@ -1670,7 +1670,7 @@ class TreeAnc(object):
def get_tree_dict(self, keep_var_ambigs=False):
- return self.get_reconstructed_alignment()
+ return self.get_reconstructed_alignment(reconstruct_tip_states=not keep_var_ambigs)
def recover_var_ambigs(self):
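This hunk and the gtr.py change above are the 0.11.3 fix for RNG calls that errored after the switch to TreeTime-owned numpy Generator objects: a Generator exposes .integers() and .random(), not the legacy RandomState method .randint(). A minimal sketch of the API difference (the seed is arbitrary):

    import numpy as np

    rng = np.random.default_rng(0)      # numpy Generator, the kind of RNG TreeTime now owns
    rng.integers(5)                     # one integer in [0, 5); Generator has no .randint()
    rng.random(size=(4, 4))             # uniform floats in [0, 1), as used for the random GTR above

    legacy = np.random.RandomState(0)   # the older API the previous code assumed
    legacy.randint(0, 100, size=(4, 4))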
=====================================
treetime/treetime.py
=====================================
@@ -73,7 +73,7 @@ class TreeTime(ClockTree):
sys.exit(2)
- def _run(self, root=None, infer_gtr=True, relaxed_clock=None, clock_filter_method='residuals',
+ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, clock_filter_method='residual',
n_iqd = None, resolve_polytomies=True, max_iter=0, Tc=None, fixed_clock_rate=None,
time_marginal='never', sequence_marginal=False, branch_length_mode='auto',
vary_rate=False, use_covariation=False, tracelog_file=None,
@@ -423,6 +423,8 @@ class TreeTime(ClockTree):
bad_branch_count = residual_filter(self, n_iqd)
elif method=='local':
bad_branch_count = local_filter(self, n_iqd)
+ else:
+ raise ValueError(f"TreeTime.clock_filter: unknown clock-filter method '{method}'. Implemented methods are: 'residual', 'local'")
if bad_branch_count>0.34*self.tree.count_terminals():
self.logger("TreeTime.clock_filter: More than a third of leaves have been excluded by the clock filter. Please check your input data.", 0, warn=True)
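With the new else branch, a mistyped clock-filter method fails loudly instead of silently skipping the filter (the 0.11.3 changelog notes the old default was itself misspelled as 'residuals'). A rough sketch of the behaviour, assuming an already-configured TreeTime instance tt; the keyword names method and n_iqd are inferred from the surrounding code, not from the full signature:

    # tt is assumed to be a fully initialised treetime.TreeTime object
    try:
        tt.clock_filter(method='residuals', n_iqd=3)  # typo: implemented methods are 'residual' and 'local'
    except ValueError as err:
        print(err)  # TreeTime.clock_filter: unknown clock-filter method 'residuals'. ...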
=====================================
treetime/vcf_utils.py
=====================================
@@ -2,6 +2,7 @@ import gzip
import numpy as np
from collections import defaultdict
from textwrap import fill
+from . import TreeTimeError
## Functions to read in and print out VCF files
@@ -54,6 +55,8 @@ def read_vcf(vcf_file, ref_file=None):
Python list of all positions with a mutation, insertion, or deletion.
"""
+ import re
+ ALT_CHARS = re.compile(r"^([ACGTNacgtn]+|\*|\.)$") # straight from the VCF 4.3 spec
#Programming Note:
# Note on VCF Format
@@ -73,7 +76,7 @@ def read_vcf(vcf_file, ref_file=None):
# Ex:
# REF ALT
# A ATT
- # First base always matches Ref.
+ # First base _does not_ always match REF, so there may be a SNP as well
# 'No indel'
# Ex:
# REF ALT
@@ -82,6 +85,11 @@ def read_vcf(vcf_file, ref_file=None):
#define here, so that all sub-functions can access them
sequences = defaultdict(dict)
insertions = defaultdict(dict) #Currently not used, but kept in case of future use.
+ metadata = {
+ 'meta_lines': [], # all the VCF meta_lines, i.e. those starting with '##' (they are left unparsed)
+ 'chrom': None, # chromosome name (we only allow one)
+ 'ploidy': None, # ploidy count -- encoded in how the GT calls are formatted
+ }
#TreeTime handles 2-3 base ambig codes, this will allow that.
def getAmbigCode(bp1, bp2, bp3=""):
@@ -104,13 +112,23 @@ def read_vcf(vcf_file, ref_file=None):
#Parses a 'normal' (not hetero or no-call) call depending if insertion+deletion, insertion,
#deletion, or single bp subsitution
- def parseCall(snps, ins, pos, ref, alt):
+ def parse_homozygous_call(snps, ins, pos, ref, alt):
+
+ # Replace missing allele with N(s). See commentary in test_vcf.py::TestNoCallsOrMissing for more details
+ if alt=='*' or alt=='.':
+ alt = "N" * len(ref)
+
#Insertion where there are also deletions (special handling)
if len(ref) > 1 and len(alt)>len(ref):
+ ## NOTE: the loop below contains a potential double-counting bug. For example,
+ ## REF='TCG' ALT='TCAG' (Example 5.1.3 in the VCF 4.2 spec), then when i=2
+ ## we'll add both snps[pos+2] = 'A' as well as ins[pos+2] = 'AG'. This has been
+ ## detailed within `test_vcf.py`, as it may also be the expected way to encode
+ ## insertions within TreeTime
for i in range(len(ref)):
#if the pos doesn't match, store in sequences
if ref[i] != alt[i]:
- snps[pos+i] = (alt[i] if alt[i] != '.' else 'N') #'.' = no-call
+ snps[pos+i] = alt[i]
#if about to run out of ref, store rest:
if (i+1) >= len(ref):
ins[pos+i] = alt[i:]
@@ -123,17 +141,21 @@ def read_vcf(vcf_file, ref_file=None):
#if not, there may be mutations
else:
if ref[i] != alt[i]:
- snps[pos+i] = (alt[i] if alt[i] != '.' else 'N') #'.' = no-call
- #Insertion
+ snps[pos+i] = alt[i]
+ #Insertion + single-base ref
elif len(alt) > 1:
ins[pos] = alt
+ # If the first base of the allele doesn't match the ref then we _also_ have a mutation
+ if ref[0]!=alt[0]:
+ snps[pos] = alt[0]
#No indel
else:
snps[pos] = alt
#Parses a 'bad' (hetero or no-call) call depending on what it is
- def parseBadCall(gen, snps, ins, pos, ref, ALT):
+ #TODO - consider the situation where the alternate allele base(s) is '*' (done for the homozygous case)
+ def parse_heterozygous_call(gen, snps, ins, pos, ref, ALT):
#Deletion
# REF ALT Seq1 Seq2 Seq3
# GCC G 1/1 0/1 ./.
@@ -173,90 +195,135 @@ def read_vcf(vcf_file, ref_file=None):
snps[pos] = alt
#else a no-call insertion, so ignore.
+ def validate_alt(alt):
+ """
+ from the VCF 4.3 spec:
+ > the ALT field must be a symbolic allele, or a breakend replacement string,
+ > or match the regular expression [see ALT_CHARS regex, above]
+ Since we only consider GT variation, the symbolic alleles and breakends aren't
+ relevant for us.
+
+ The spec also states:
+ > Tools processing VCF files are not required to preserve case in the allele String
+
+ Return the uppercase allele bases (string) _or_ None if it fails validation
+ """
+ if ALT_CHARS.match(alt):
+ return alt.upper()
+ else:
+ print(f"WARNING: Encountered invalid allele base(s) {alt!r}. Skipping...")
+ return None
#House code is *much* faster than pyvcf because we don't care about all info
#about coverage, quality, counts, etc, which pyvcf goes to effort to parse
#(and it's not easy as there's no standard ordering). Custom code can completely
#ignore all of this.
- import gzip
from Bio import SeqIO
- import numpy as np
-
- nsamp = 0
- posLoc = 0
- refLoc = 0
- altLoc = 0
- sampLoc = 9
#Use different openers depending on whether compressed
opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open
with opn(vcf_file, mode='rt') as f:
- for line in f:
- if line[0] != '#':
- #actual data - most common so first in 'if-list'!
- dat = line.strip().split('\t')
- POS = int(dat[posLoc])
- REF = dat[refLoc]
- ALT = dat[altLoc].split(',')
- calls = np.array(dat[sampLoc:])
-
- #get samples that differ from Ref at this site
- recCalls = {}
- for sname, sa in zip(samps, calls):
- if ':' in sa: #if proper VCF file (followed by quality/coverage info)
- gt = sa.split(':')[0]
- else: #if 'pseudo' VCF file (nextstrain output, or otherwise stripped)
- gt = sa
-
- # convert haploid calls to pseudo diploid
- if gt == '0':
- gt = '0/0'
- elif gt == '1':
- gt = '1/1'
- elif gt == '.':
- gt = './.'
-
- #ignore if ref call: '.' or '0/0', depending on VCF
- if ('/' in gt and gt != '0/0') or ('|' in gt and gt != '0|0'):
- recCalls[sname] = gt
-
- #store the position and the alt
- for seq, gen in recCalls.items():
- ref = REF
- pos = POS-1 #VCF numbering starts from 1, but Reference seq numbering
- #will be from 0 because it's python!
- #Accepts only calls that are 1/1, 2/2 etc. Rejects hets and no-calls
- if gen[0] != '0' and gen[2] != '0' and gen[0] != '.' and gen[2] != '.':
- alt = str(ALT[int(gen[0])-1]) #get the index of the alternate
- if seq not in sequences.keys():
- sequences[seq] = {}
-
- parseCall(sequences[seq],insertions[seq], pos, ref, alt)
-
- #If is heterozygote call (0/1) or no call (./.)
- else:
- #alt will differ here depending on het or no-call, must pass original
- parseBadCall(gen, sequences[seq],insertions[seq], pos, ref, ALT)
+ current_block = "meta-information" # The start of VCF files is the meta-information (lines starting with ##)
+ header,samps,nsamp=None,None,None
- elif line[0] == '#' and line[1] == 'C':
- #header line, get all the information
+ for line in f:
+ if line.startswith("##"):
+ if current_block!='meta-information':
+ raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - all the meta-information (lines starting with ##) must appear at the top of the file.")
+ metadata['meta_lines'].append(line.strip())
+ elif line[0]=='#':
+ mandatory_fields = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
+ # Note that FORMAT isn't mandatory unless genotype data is present. But every VCF we deal with has genotype data - that's the point!
+ header_start = "#"+"\t".join(mandatory_fields)
+ if not line.startswith(header_start):
+ raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header line must start with {header_start}")
+ if current_block!='meta-information':
+ raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header must immediately follow the meta-information lines")
+ current_block = 'header'
header = line.strip().split('\t')
- posLoc = header.index("POS")
- refLoc = header.index('REF')
- altLoc = header.index('ALT')
- sampLoc = header.index('FORMAT')+1
- samps = header[sampLoc:]
- samps = [ x.strip() for x in samps ] #ensure no leading/trailing spaces
+ samps = [ x.strip() for x in header[9:] ] #ensure no leading/trailing spaces
nsamp = len(samps)
+ for sample_name in samps:
+ sequences[sample_name] = {}
+ else:
+ if current_block!='header' and current_block!='data-lines':
+ raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines must follow the header line")
+ current_block='data-lines'
+ l = line.strip()
+ if not l: # empty line
+ continue
+ dat = l.split('\t')
+ chrom = str(dat[0])
+ if metadata['chrom'] is None:
+ metadata['chrom'] = chrom
+ elif metadata['chrom']!=chrom:
+ raise TreeTimeError(f"The VCF file {vcf_file!r} contains multiple chromosomes. TreeTime can not yet handle this.")
+ pos = int(dat[1])-1 # Convert VCF 1-based to python 0-based
+ REF = dat[3]
+ ALT = dat[4].split(',') # List of alternate alleles (strings)
+ calls = np.array(dat[9:])
+ if len(calls)!=nsamp:
+ raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines have different number of calls than the number of samples defined in the header")
+
+ # Treetime only parses GT FORMAT calls. Moreover, "the first sub-field must always be the genotype (GT) if it is present"
+ # according to the VCF 4.2 spec. Note that if the VCF is for one sample only then the format's a bit different, but this'll
+ # raise an error above because the header differs from what we assert. Other variation (CN, BND etc) is not parsed by
+ # TreeTime, only GT.
+ FORMAT = dat[8]
+ if not FORMAT.startswith('GT'):
+ continue
- #else you are a comment line, ignore.
+ #get samples that differ from Ref at this site
+ for sname, sa in zip(samps, calls):
+ gt = sa.split(':')[0] # May be multiple colon-separated subfields, depending on the line's FORMAT
+
+ # `gt` is the "index" of the alternate allele for this sample.
+ # The format of `gt` is quite varied:
+ # For haploid genomes, it's simply a single int _or_ "." (meaning a call cannot be made at the locus)
+ # For polyploid genomes, the format is a list of {int, '.'} separated by "/" or "|"
+ # ('/' means unphased, '|' means phased).
+ # If the index is an int, it's the 1-based (!) lookup index for ALT
+
+ # Split on the valid separators - if haploid, then we'll get a list len=1
+ gts = gt.split('|') if '|' in gt else gt.split('/')
+ ploidy = len(gts)
+ if metadata['ploidy'] is None:
+ metadata['ploidy'] = ploidy
+ elif metadata['ploidy']!=ploidy:
+ raise TreeTimeError(f"The VCF file {vcf_file!r} had genotype calls of ploidy {metadata['ploidy']} but sample {sname!r} has a genotype of ploidy {ploidy}")
+
+ # if polyploid, but homozygous, then treat as if haploid
+ if ploidy>1 and len(set(gts))==1:
+ gt = gts[0]
+
+ if gt.isdigit(): # haploid, and a call has been made (i.e. it's not gt='.')
+ gt = int(gt)
+ if gt==0:
+ continue # reference allele!
+ alt = validate_alt(ALT[gt-1]) # gt is the 1-based lookup, but ALT is 0-indexed
+ if alt:
+ parse_homozygous_call(sequences[sname],insertions[sname], pos, REF, alt)
+ continue
+
+ if gt=='.': # haploid "call cannot be made" identifier - replace REF with N(s)
+ for i in range(len(REF)):
+ sequences[sname][pos+i] = 'N'
+ continue
+
+ # ---- heterozygous polyploid call ----
+ parse_heterozygous_call(gt, sequences[sname],insertions[sname], pos, REF, ALT)
#Gather all variable positions
+ #NOTE: this does not consider positions of insertions
positions = set()
for seq, muts in sequences.items():
positions.update(muts.keys())
+ num_insertions = sum([len(list(ins.keys())) for ins in insertions.values()])
+ if len(positions)==0 and num_insertions==0:
+ raise TreeTimeError(f"VCF file {vcf_file!r} has no data-lines which we could extract genotype information from!")
+
#One or more seqs are same as ref! (No non-ref calls) So haven't been 'seen' yet
if nsamp > len(sequences):
missings = set(samps).difference(sequences.keys())
@@ -273,12 +340,13 @@ def read_vcf(vcf_file, ref_file=None):
compress_seq = {'reference':refSeqStr,
'sequences': sequences,
'insertions': insertions,
- 'positions': sorted(positions)}
+ 'positions': sorted(positions),
+ 'metadata': metadata}
return compress_seq
-def write_vcf(tree_dict, file_name):#, compress=False):
+def write_vcf(tree_dict, file_name, mask=None):#, compress=False):
"""
Writes out a VCF-style file (which seems to be minimally handleable
by vcftools and pyvcf) of the alignment. This is created from a dict
@@ -290,13 +358,25 @@ def write_vcf(tree_dict, file_name):#, compress=False):
Parameters
----------
tree_dict: nested dict
- A nested dict with keys 'sequence' 'reference' and 'positions',
- as is created by :py:meth:`treetime.TreeAnc.get_tree_dict`
+ A nested dict with required keys of:
+ 'sequences': maps sampleName -> pos (0-based) -> new base (SNP)
+ 'reference': string of reference nuc sequence
+ 'positions': sorted list of 0-based positions with variation in sequences
+ And optional keys:
+ 'inferred_const_sites': list or set, 0-based positions to skip output for.
+ This input is often created by :py:meth:`treetime.TreeAnc.get_tree_dict`
+ 'metadata': dict of information to influence VCF formatting. Only
+ the following keys are used:
+ 'metadata.ploidy': int. Influences how genotype calls are formatted. (default
+ of 2 (diploid) used if not provided)
+ 'metadata.chrom': str. The chromosome name (default of '1' used if not provided)
file_name: str
File to which the new VCF should be written out. File names ending with
'.gz' will result in the VCF automatically being gzipped.
+ mask : optional, str of 0 or 1
+ Calls at these positions will be skipped
"""
# Programming Logic Note:
@@ -329,6 +409,23 @@ def write_vcf(tree_dict, file_name):#, compress=False):
sequences = tree_dict['sequences']
ref = tree_dict['reference']
positions = tree_dict['positions']
+ ploidy = tree_dict.get('metadata', {}).get('ploidy', 2)
+ chrom_name = tree_dict.get('metadata', {}).get('chrom', '1')
+ sample_names = list(sequences.keys())
+ inferred_const_sites = set(tree_dict.get('inferred_const_sites', []))
+
+ # For every variable site in sequences, flip the format around so
+ # we can have fast lookups later on.
+ alleles = {}
+ num_samples = len(sample_names)
+ for idx, name in enumerate(sample_names):
+ for posn, allele in sequences[name].items():
+ if posn not in alleles:
+ alleles[posn] = np.zeros(num_samples, dtype='U')
+ alleles[posn][idx] = allele
+ # fill in reference
+ for posn,bases in alleles.items():
+ bases[bases==''] = ref[posn]
def handleDeletions(i, pi, pos, ref, delete, pattern):
refb = ref[pi]
@@ -371,7 +468,7 @@ def write_vcf(tree_dict, file_name):#, compress=False):
#Rotate them into 'calls'
align = np.asarray(sites).T
- #Get rid of '-', and put '.' for calls that match ref
+ #Get rid of '-', and put '0' for calls that match ref
#Only removes trailing '-'. This breaks VCF convension, but the standard
#VCF way of handling this* is really complicated, and the situation is rare.
#(*deletions and mutations at the same locations)
@@ -383,7 +480,7 @@ def write_vcf(tree_dict, file_name):#, compress=False):
gp-=1
pat = "".join(pt)
if pat == refb:
- fullpat.append('.')
+ fullpat.append('0')
else:
fullpat.append(pat)
@@ -393,19 +490,21 @@ def write_vcf(tree_dict, file_name):#, compress=False):
#prepare the header of the VCF & write out
- header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+list(sequences.keys())
+ header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+sample_names
opn = gzip.open if file_name.endswith(('.gz', '.GZ')) else open
out_file = opn(file_name, 'w')
out_file.write( "##fileformat=VCFv4.2\n"+
"##source=NextStrain\n"+
- "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
+ "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"+
+ f"##contig=<ID={chrom_name}>\n")
out_file.write("\t".join(header)+"\n")
vcfWrite = []
errorPositions = []
explainedErrors = 0
+ mask_skip_count = 0
#Why so basic? Because we sometimes have to back up a position!
i=0
@@ -420,28 +519,14 @@ def write_vcf(tree_dict, file_name):#, compress=False):
delete = False #deletion at this position - need to grab previous base (invariable)
deleteGroup = False #deletion at next position (mutation at this pos) - do not need to get prev base
- #try/except is much more efficient than 'if' statements for constructing patterns,
- #as on average a 'variable' location will not be variable for any given sequence
- pattern = []
- #pattern2 gets the pattern at next position to check for upcoming deletions
- #it's more efficient to get both here rather than loop through sequences twice!
- pattern2 = []
- for k,v in sequences.items():
- try:
- pattern.append(sequences[k][pi])
- except KeyError:
- pattern.append(ref[pi])
-
- try:
- pattern2.append(sequences[k][pi+1])
- except KeyError:
- try:
- pattern2.append(ref[pi+1])
- except IndexError:
- pass
+ if mask and mask[pi] == '1':
+ mask_skip_count+=1
+ i+=1
+ continue
- pattern = np.array(pattern).astype('U')
- pattern2 = np.array(pattern2).astype('U')
+ # patterns will be empty if there was no variation in the sequences
+ pattern = alleles[pi] if pi in alleles else np.array([]).astype('U')
+ pattern2 = alleles[pi+1] if pi+1 in alleles else np.array([]).astype('U')
#If a deletion here, need to gather up all bases, and position before
if any(pattern == '-'):
@@ -462,39 +547,40 @@ def write_vcf(tree_dict, file_name):#, compress=False):
#If deletion, treat affected bases as 1 'call':
if delete or deleteGroup:
+ if pattern.size==0: # no variation in sequences
+ pattern = np.full(num_samples, refb, dtype='U')
i, pi, pos, refb, pattern = handleDeletions(i, pi, pos, ref, delete, pattern)
- #If no deletion, replace ref with '.', as in VCF format
+ #If no deletion, replace ref with '0' which means the reference base is unchanged
else:
- pattern[pattern==refb] = '.'
+ pattern[pattern==refb] = '0'
- #Get the list of ALTs - minus any '.'!
+ #Get the list of ALTs - minus any '0' which are unchanged reference sequences!
uniques = np.unique(pattern)
- uniques = uniques[np.where(uniques!='.')]
+ uniques = uniques[np.where(uniques!='0')]
#Convert bases to the number that matches the ALT
j=1
for u in uniques:
pattern[np.where(pattern==u)[0]] = str(j)
j+=1
- #Now convert these calls to #/# (VCF format)
- calls = [ j+"/"+j if j!='.' else '.' for j in pattern ]
#What if there's no variation at a variable site??
#This can happen when sites are modified by TreeTime - see below.
- printPos = True
- if len(uniques)==0:
+ #We don't print to VCF (because no variation!)
+ any_variation = len(uniques)!=0
+ if not any_variation:
#If we expect it (it was made constant by TreeTime), it's fine.
- if 'inferred_const_sites' in tree_dict and pi in tree_dict['inferred_const_sites']:
+ if pi in inferred_const_sites:
explainedErrors += 1
- printPos = False #and don't output position to the VCF
else:
#If we don't expect, raise an error
errorPositions.append(str(pi))
#Write it out - Increment positions by 1 so it's in VCF numbering
#If no longer variable, and explained, don't write it out
- if printPos:
- output = ["MTB_anc", str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
+ if any_variation:
+ calls = [ "/".join([j]*ploidy) for j in pattern ]
+ output = [chrom_name, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
vcfWrite.append("\t".join(output))
i+=1
@@ -507,6 +593,9 @@ def write_vcf(tree_dict, file_name):#, compress=False):
#purposes, because the site is 'variant' against the ref, it is variant, as expected, and so
#won't be counted in the below list, which is only sites removed from the VCF.
+ if mask_skip_count:
+ print(f"{mask_skip_count} positions were skipped due to the provided mask")
+
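Taken together with the expanded docstring above, a short sketch of a direct write_vcf call built on read_vcf output; file names are illustrative, the metadata key is optional (defaults of chrom '1' and ploidy 2 apply without it), and per the docstring a file name ending in '.gz' would be gzipped:

    from treetime.vcf_utils import read_vcf, write_vcf

    vcf = read_vcf("snps.vcf", "reference.fasta")
    tree_dict = {
        'sequences': vcf['sequences'],   # sample -> {0-based position -> base}
        'reference': vcf['reference'],
        'positions': vcf['positions'],
        'metadata':  vcf['metadata'],    # optional; preserves chrom name and ploidy on the round trip
    }
    # mask is a string of '0'/'1' flags, one per reference position; calls at '1' positions are skipped
    mask = "0" * len(vcf['reference'])
    write_vcf(tree_dict, "rewritten.vcf", mask=mask)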
if 'inferred_const_sites' in tree_dict and explainedErrors != 0:
print(fill("Sites that were constant except for ambiguous bases were made" +
" constant by TreeTime. This happened {} times. These sites are".format(explainedErrors) +
=====================================
treetime/wrappers.py
=====================================
@@ -521,7 +521,7 @@ def ancestral_reconstruction(params):
### OUTPUT and saving of results
###########################################################################
if params.gtr=='infer':
- fname = outdir+'/sequence_evolution_model.txt'
+ fname = outdir+'sequence_evolution_model.txt'
with open(fname, 'w', encoding='utf-8') as ofile:
ofile.write(str(treeanc.gtr)+'\n')
print('\nInferred sequence evolution model (saved as %s):'%fname)
@@ -854,11 +854,25 @@ def estimate_clock_model(params):
else:
print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))
+ if hasattr(myTree, 'outliers') and myTree.outliers is not None:
+ print("--- saved detected outliers as " + basename + 'outliers.tsv')
+ myTree.outliers.to_csv(basename + 'outliers.tsv', sep='\t')
+
+ if hasattr(myTree, 'outliers') and myTree.outliers is not None and params.prune_outliers:
+ for outlier in myTree.outliers.index:
+ print("removing ", outlier)
+ myTree.tree.prune(outlier)
+
if not params.keep_root:
# write rerooted tree to file
outtree_name = basename+'rerooted.newick'
- Phylo.write(myTree.tree, outtree_name, 'newick')
- print("--- re-rooted tree written to \n\t%s\n"%outtree_name)
+ elif params.prune_outliers:
+ outtree_name = basename+'pruned.newick'
+ else:
+ outtree_name = basename+'.output.newick'
+
+ Phylo.write(myTree.tree, outtree_name, 'newick')
+ print("--- new tree written to \n\t%s\n"%outtree_name)
table_fname = basename+'rtt.csv'
with open(table_fname, 'w', encoding='utf-8') as ofile:
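As a side note on the outlier handling added above, the saved table can be inspected directly; a hedged sketch (the path is illustrative — the wrapper prints the actual '<basename>outliers.tsv' location — and the tab-separated, index-in-first-column layout follows from the to_csv call above):

    import pandas as pd

    outliers = pd.read_csv("2024-11-09_treetime/outliers.tsv", sep="\t", index_col=0)
    outliers.index.tolist()   # tip names flagged by the clock filter (pruned when --prune-outliers is given)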
View it on GitLab: https://salsa.debian.org/med-team/python-treetime/-/commit/e6064017dcfbf63072b9d8a06cc5775d5eff2f6e