[med-svn] [Git][med-team/pyensembl][upstream] New upstream version 2.6.9

Karsten Schöke (@karso) gitlab at salsa.debian.org
Thu Apr 30 14:34:13 BST 2026



Karsten Schöke pushed to branch upstream at Debian Med / pyensembl


Commits:
88d09667 by Karsten Schöke at 2026-04-30T15:11:50+02:00
New upstream version 2.6.9
- - - - -


7 changed files:

- pyensembl/genome.py
- pyensembl/transcript.py
- pyensembl/version.py
- + tests/data/arabidopsis.tair10.partial.cdna.fa
- + tests/data/arabidopsis.tair10.partial.gtf
- + tests/test_tair10_complete.py
- tests/test_transcript_support_level.py


Changes:

=====================================
pyensembl/genome.py
=====================================
@@ -29,6 +29,25 @@ from .sequence_data import SequenceData
 from .transcript import Transcript
 
 
+def _parse_transcript_support_level(value):
+    """
+    Coerce a raw ``transcript_support_level`` attribute into an int or None.
+
+    Recent Ensembl releases append text such as
+    ``"1 (assigned to previous version 5)"`` after the numeric TSL, and
+    older or missing entries can be ``None`` or the literal string ``"NA"``.
+    Keep only the leading whitespace-separated token and return it as an
+    int when it is a digit, otherwise None.
+    """
+    if not value:
+        return None
+    tokens = value.split()
+    leading = tokens[0] if tokens else None
+    if leading and leading.isdigit():
+        return int(leading)
+    return None
+
+
 class Genome(Serializable):
     """
     Bundles together the genomic annotation and sequence data associated with
@@ -896,11 +915,9 @@ class Genome(Serializable):
                 extra_data = dict(zip(extra_field_names, result[5:]))
                 transcript_name = extra_data.get("transcript_name")
                 transcript_biotype = extra_data.get("transcript_biotype")
-                tsl = extra_data.get("transcript_support_level")
-                if not tsl or tsl == "NA":
-                    tsl = None
-                else:
-                    tsl = int(tsl)
+                tsl = _parse_transcript_support_level(
+                    extra_data.get("transcript_support_level")
+                )
 
             self._transcripts[transcript_id] = Transcript(
                 transcript_id=transcript_id,


=====================================
pyensembl/transcript.py
=====================================
@@ -489,6 +489,13 @@ class Transcript(LocusWithGenome):
         if self.sequence is None:
             return None
 
+        # Some GTF annotations (e.g. fragments in Ensembl Plants) leave a
+        # protein-coding transcript without a start_codon or stop_codon
+        # feature. Return None rather than crashing when either endpoint of
+        # the CDS cannot be located.
+        if not self.contains_start_codon or not self.contains_stop_codon:
+            return None
+
         start = self.first_start_codon_spliced_offset
         end = self.last_stop_codon_spliced_offset
 
@@ -508,6 +515,8 @@ class Transcript(LocusWithGenome):
         cDNA sequence of 5' UTR
         (untranslated region at the beginning of the transcript)
         """
+        if self.sequence is None or not self.contains_start_codon:
+            return None
         # pylint: disable=invalid-slice-index
         # TODO(tavi) Figure out pylint is not happy with this slice
         return self.sequence[: self.first_start_codon_spliced_offset]
@@ -518,6 +527,8 @@ class Transcript(LocusWithGenome):
         cDNA sequence of 3' UTR
         (untranslated region at the end of the transcript)
         """
+        if self.sequence is None or not self.contains_stop_codon:
+            return None
         return self.sequence[self.last_stop_codon_spliced_offset + 1 :]
 
     @memoized_property


=====================================
pyensembl/version.py
=====================================
@@ -1,4 +1,4 @@
-__version__ = "2.6.7"
+__version__ = "2.6.9"
 
 def print_version():
     print(f"v{__version__}")


=====================================
tests/data/arabidopsis.tair10.partial.cdna.fa
=====================================
@@ -0,0 +1,46 @@
+>AT1G01010.1
+AAATTATTAGATATACCAAACCAGAGAAAACAAATACATAATCGGAGAAATACAGATTAC
+AGAGAGCGAGAGAGATCGACGGCGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGT
+TTAGTGAAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTT
+GGTCACTATCTCCGTAACAAAATCGAAGGAAACACTAGCCGCGACGTTGAAGTAGCCATC
+AGCGAGGTCAACATCTGTAGCTACGATCCTTGGAACTTGCGCTTCCAGTCAAAGTACAAA
+TCGAGAGATGCTATGTGGTACTTCTTCTCTCGTAGAGAAAACAACAAAGGGAATCGACAG
+AGCAGGACAACGGTTTCTGGTAAATGGAAGCTTACCGGAGAATCTGTTGAGGTCAAGGAC
+CAGTGGGGATTTTGTAGTGAGGGCTTTCGTGGTAAGATTGGTCATAAAAGGGTTTTGGTG
+TTCCTCGATGGAAGATACCCTGACAAAACCAAATCTGATTGGGTTATCCACGAGTTCCAC
+TACGACCTCTTACCAGAACATCAGAGGACATATGTCATCTGCAGACTTGAGTACAAGGGT
+GATGATGCGGACATTCTATCTGCTTATGCAATAGATCCCACTCCCGCTTTTGTCCCCAAT
+ATGACTAGTAGTGCAGGTTCTGTGGTCAACCAATCACGTCAACGAAATTCAGGATCTTAC
+AACACTTACTCTGAGTATGATTCAGCAAATCATGGCCAGCAGTTTAATGAAAACTCTAAC
+ATTATGCAGCAGCAACCACTTCAAGGATCATTCAACCCTCTCCTTGAGTATGATTTTGCA
+AATCACGGCGGTCAGTGGCTGAGTGACTATATCGACCTGCAACAGCAAGTTCCTTACTTG
+GCACCTTATGAAAATGAGTCGGAGATGATTTGGAAGCATGTGATTGAAGAAAATTTTGAG
+TTTTTGGTAGATGAAAGGACATCTATGCAACAGCATTACAGTGATCACCGGCCCAAAAAA
+CCTGTGTCTGGGGTTTTGCCTGATGATAGCAGTGATACTGAAACTGGATCAATGATTTTC
+GAAGACACTTCGAGCTCCACTGATAGTGTTGGTAGTTCAGATGAACCGGGCCATACTCGT
+ATAGATGATATTCCATCATTGAACATTATTGAGCCTTTGCACAATTATAAGGCACAAGAG
+CAACCAAAGCAGCAGAGCAAAGAAAAGGTGATAAGTTCGCAGAAAAGCGAATGCGAGTGG
+AAAATGGCTGAAGACTCGATCAAGATACCTCCATCCACCAACACGGTGAAGCAGAGCTGG
+ATTGTTTTGGAGAATGCACAGTGGAACTATCTCAAGAACATGATCATTGGTGTCTTGTTG
+TTCATCTCCGTCATTAGTTGGATCATTCTTGTTGGTTAAGAGGTCAAATCGGATTCTTGC
+TCAAAATTTGTATTTCTTAGAATGTGTGTTTTTTTTTGTTTTTTTTTCTTTGCTCTGTTT
+TCTCGCTCCGGAAAAGTTTGAAGTTATATTTTATTAGTATGTAAAGAAGAGAAAAAGGGG
+GAAAGAAGAGAGAAGAAAAATGCAGAAAATCATATATATGAATTGGAAAAAAGTATATGT
+AATAATAATTAGTGCATCGTTTTGTGGTGTAGTTTATATAAATAAAGTGATATATAGTCT
+TGTATAAG
+>AT1G03325.1
+TCATCTGTGCATCATAAAGGCAAAAACTTTAAGATTTGTGAAGATAAAAGTAAGAAATCT
+TCAAATAGGTTAAACAACTCACCTTCGTTTTTCTTCGATTTCTTCTTCTTCGTTGCTTTA
+ACCTGA
+>AT1G24475.1
+GCCCAATGGGCCATATATTTATCCACGAAAGGTGGAGGAGCAAATACAAACTTGAAAATA
+TG
+>AT1G42615.1
+TTTGATAAAGAGCCAACAAATCCTGTGGATTTTGGAGCAGAGACTTTGAGTTATGAGGAT
+TATTACGATGAAACAAGAGATAGATATGATAAAGCTTTTCTGATGATGATTACTTATCAG
+TGTGATGCTTTGGTTGACAAGTTTAATGTCACTCCTTTGATTATTGGTGAAGTAAAAGAT
+ACTAAGAGGCCTAAAACACACAAAGCTGAGCCGTGTAACTTAGATGGTAAAAGAGCAGTG
+ACGATATGGTTTAGTATGCTTAAAATGACTATGCCTTTATCGAGTTCTTTGGTGATATCT
+TTCCTTGCTTGTCACCAAGCGGGAGCACCGGTACTTCATCCATCAGTCGGAACTTCATCT
+ATCTCTACTGTACATGGAATAGAGCAGGAGGGGAACATACATATTCAGGATGACCTTCCT
+AAACCAGAG


=====================================
tests/data/arabidopsis.tair10.partial.gtf
=====================================
@@ -0,0 +1,44 @@
+#!genome-build TAIR10
+#!genome-version TAIR10
+#!genome-date 2008-04
+#!genome-build-accession GCA_000001735.1
+#!genebuild-last-updated 2010-09
+1	araport11	gene	817053	817178	.	-	.	gene_id "AT1G03325"; gene_name "AT1G03325"; gene_source "araport11"; gene_biotype "protein_coding";
+1	araport11	transcript	817053	817178	.	-	.	gene_id "AT1G03325"; transcript_id "AT1G03325.1"; gene_name "AT1G03325"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G03325-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	exon	817053	817178	.	-	.	gene_id "AT1G03325"; transcript_id "AT1G03325.1"; exon_number "1"; gene_name "AT1G03325"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G03325-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G03325.1.exon1"; tag "Ensembl_canonical";
+1	araport11	CDS	817056	817178	.	-	0	gene_id "AT1G03325"; transcript_id "AT1G03325.1"; exon_number "1"; gene_name "AT1G03325"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G03325-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G03325.1"; tag "Ensembl_canonical";
+1	araport11	stop_codon	817053	817055	.	-	0	gene_id "AT1G03325"; transcript_id "AT1G03325.1"; exon_number "1"; gene_name "AT1G03325"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G03325-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	gene	3631	5899	.	+	.	gene_id "AT1G01010"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding";
+1	araport11	transcript	3631	5899	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	exon	3631	3913	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon1"; tag "Ensembl_canonical";
+1	araport11	CDS	3760	3913	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	start_codon	3760	3762	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	exon	3996	4276	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "2"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon2"; tag "Ensembl_canonical";
+1	araport11	CDS	3996	4276	.	+	2	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "2"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	exon	4486	4605	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "3"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon3"; tag "Ensembl_canonical";
+1	araport11	CDS	4486	4605	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "3"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	exon	4706	5095	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "4"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon4"; tag "Ensembl_canonical";
+1	araport11	CDS	4706	5095	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "4"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	exon	5174	5326	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "5"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon5"; tag "Ensembl_canonical";
+1	araport11	CDS	5174	5326	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "5"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	exon	5439	5899	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "6"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon6"; tag "Ensembl_canonical";
+1	araport11	CDS	5439	5627	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "6"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; tag "Ensembl_canonical";
+1	araport11	stop_codon	5628	5630	.	+	0	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "6"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	five_prime_utr	3631	3759	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	three_prime_utr	5631	5899	.	+	.	gene_id "AT1G01010"; transcript_id "AT1G01010.1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "NAC001-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	gene	8676282	8676343	.	-	.	gene_id "AT1G24475"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding";
+1	araport11	transcript	8676282	8676343	.	-	.	gene_id "AT1G24475"; transcript_id "AT1G24475.1"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G24475-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	exon	8676282	8676343	.	-	.	gene_id "AT1G24475"; transcript_id "AT1G24475.1"; exon_number "1"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G24475-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G24475.1.exon1"; tag "Ensembl_canonical";
+1	araport11	CDS	8676282	8676338	.	-	0	gene_id "AT1G24475"; transcript_id "AT1G24475.1"; exon_number "1"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G24475-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G24475.1"; tag "Ensembl_canonical";
+1	araport11	start_codon	8676336	8676338	.	-	0	gene_id "AT1G24475"; transcript_id "AT1G24475.1"; exon_number "1"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G24475-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	five_prime_utr	8676339	8676343	.	-	.	gene_id "AT1G24475"; transcript_id "AT1G24475.1"; gene_name "AT1G24475"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G24475-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	gene	16037606	16038455	.	+	.	gene_id "AT1G42615"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding";
+1	araport11	transcript	16037606	16038455	.	+	.	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; tag "Ensembl_canonical";
+1	araport11	exon	16037606	16037771	.	+	.	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "1"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G42615.1.exon1"; tag "Ensembl_canonical";
+1	araport11	CDS	16037606	16037771	.	+	0	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "1"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G42615.1"; tag "Ensembl_canonical";
+1	araport11	exon	16037964	16038127	.	+	.	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "2"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G42615.1.exon2"; tag "Ensembl_canonical";
+1	araport11	CDS	16037964	16038127	.	+	2	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "2"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G42615.1"; tag "Ensembl_canonical";
+1	araport11	exon	16038329	16038385	.	+	.	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "3"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G42615.1.exon3"; tag "Ensembl_canonical";
+1	araport11	CDS	16038329	16038385	.	+	0	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "3"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G42615.1"; tag "Ensembl_canonical";
+1	araport11	exon	16038414	16038455	.	+	.	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "4"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G42615.1.exon4"; tag "Ensembl_canonical";
+1	araport11	CDS	16038414	16038455	.	+	0	gene_id "AT1G42615"; transcript_id "AT1G42615.1"; exon_number "4"; gene_name "AT1G42615"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_name "AT1G42615-201"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G42615.1"; tag "Ensembl_canonical";


=====================================
tests/test_tair10_complete.py
=====================================
@@ -0,0 +1,105 @@
+"""
+Regression tests for Transcript.complete and Transcript.coding_sequence on
+Ensembl Plants / TAIR10-style data.
+
+Covers issue #252 (Transcript.complete returning the wrong result for TAIR
+transcripts — fixed by #268 which stopped stripping the `.N` isoform suffix
+from non-ENSEMBL FASTA headers) and a related defect where coding_sequence
+and the 5'/3' UTR accessors raised ValueError for transcripts missing a
+start_codon or stop_codon feature.
+
+GTF and cDNA FASTA fragments were taken from Ensembl Plants release 57:
+    http://ftp.ensemblgenomes.org/pub/plants/release-57/gtf/arabidopsis_thaliana/
+        Arabidopsis_thaliana.TAIR10.57.gtf.gz
+    http://ftp.ensemblgenomes.org/pub/plants/release-57/fasta/arabidopsis_thaliana/cdna/
+        Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz
+
+Four transcripts were selected:
+  * AT1G01010.1 (NAC001) — fully annotated: 6 exons, start_codon + stop_codon.
+  * AT1G03325.1 — protein-coding fragment: no start_codon feature.
+  * AT1G24475.1 — protein-coding fragment: no stop_codon feature.
+  * AT1G42615.1 — protein-coding fragment: neither start_codon nor stop_codon.
+"""
+from __future__ import absolute_import
+
+from pyensembl import Genome
+
+from .common import eq_, ok_
+from .data import data_path
+
+
+TAIR10_GTF_PATH = data_path("arabidopsis.tair10.partial.gtf")
+TAIR10_CDNA_FASTA_PATH = data_path("arabidopsis.tair10.partial.cdna.fa")
+
+
+custom_tair10_genome_subset = Genome(
+    reference_name="TAIR10",
+    annotation_name="_test_arabidopsis_tair10_subset",
+    gtf_path_or_url=TAIR10_GTF_PATH,
+    transcript_fasta_paths_or_urls=[TAIR10_CDNA_FASTA_PATH],
+)
+
+
+def setup_module(module):
+    custom_tair10_genome_subset.clear_cache()
+    custom_tair10_genome_subset.index()
+
+
+def test_complete_transcript_with_start_and_stop():
+    # AT1G01010.1 has both start_codon and stop_codon features and a cDNA
+    # in the FASTA — .complete must be True and the coding sequence must
+    # start with ATG and end with a stop codon.
+    transcript = custom_tair10_genome_subset.transcript_by_id("AT1G01010.1")
+    ok_(transcript.contains_start_codon)
+    ok_(transcript.contains_stop_codon)
+    ok_(transcript.sequence is not None)
+    ok_(transcript.complete)
+    cds = transcript.coding_sequence
+    ok_(cds is not None)
+    eq_(len(cds) % 3, 0)
+    eq_(cds[:3], "ATG")
+    ok_(cds[-3:] in ("TAA", "TAG", "TGA"))
+    # UTRs flank the CDS and concatenate back to the full cDNA.
+    five_prime = transcript.five_prime_utr_sequence
+    three_prime = transcript.three_prime_utr_sequence
+    ok_(five_prime is not None)
+    ok_(three_prime is not None)
+    eq_(five_prime + cds + three_prime, transcript.sequence)
+
+
+def test_transcript_missing_start_codon_is_not_complete():
+    # AT1G03325.1 has stop_codon but no start_codon. .complete must be
+    # False and .coding_sequence / .five_prime_utr_sequence must return
+    # None rather than raise. The 3' UTR is still resolvable.
+    transcript = custom_tair10_genome_subset.transcript_by_id("AT1G03325.1")
+    eq_(transcript.contains_start_codon, False)
+    eq_(transcript.contains_stop_codon, True)
+    eq_(transcript.complete, False)
+    eq_(transcript.coding_sequence, None)
+    eq_(transcript.five_prime_utr_sequence, None)
+    ok_(transcript.three_prime_utr_sequence is not None)
+
+
+def test_transcript_missing_stop_codon_is_not_complete():
+    # AT1G24475.1 has start_codon but no stop_codon. .complete must be
+    # False and .coding_sequence / .three_prime_utr_sequence must return
+    # None rather than raise. The 5' UTR is still resolvable.
+    transcript = custom_tair10_genome_subset.transcript_by_id("AT1G24475.1")
+    eq_(transcript.contains_start_codon, True)
+    eq_(transcript.contains_stop_codon, False)
+    eq_(transcript.complete, False)
+    eq_(transcript.coding_sequence, None)
+    ok_(transcript.five_prime_utr_sequence is not None)
+    eq_(transcript.three_prime_utr_sequence, None)
+
+
+def test_transcript_missing_both_codons_is_not_complete():
+    # AT1G42615.1 has neither start_codon nor stop_codon. All CDS/UTR
+    # accessors must return None rather than raise.
+    transcript = custom_tair10_genome_subset.transcript_by_id("AT1G42615.1")
+    eq_(transcript.contains_start_codon, False)
+    eq_(transcript.contains_stop_codon, False)
+    eq_(transcript.complete, False)
+    eq_(transcript.coding_sequence, None)
+    eq_(transcript.five_prime_utr_sequence, None)
+    eq_(transcript.three_prime_utr_sequence, None)


=====================================
tests/test_transcript_support_level.py
=====================================
@@ -7,6 +7,23 @@ from __future__ import absolute_import
 from .common import eq_
 
 from pyensembl import cached_release
+from pyensembl.genome import _parse_transcript_support_level
+
+
+def test_parse_transcript_support_level_values():
+    # Recent Ensembl releases append parenthetical text to the TSL; we should
+    # still recover the leading integer. See openvax/pyensembl#297.
+    eq_(_parse_transcript_support_level("1 (assigned to previous version 5)"), 1)
+    eq_(_parse_transcript_support_level("3 (assigned to previous version 12)"), 3)
+    # Plain numeric strings still work.
+    eq_(_parse_transcript_support_level("1"), 1)
+    eq_(_parse_transcript_support_level("5"), 5)
+    # Missing / NA / junk values collapse to None.
+    eq_(_parse_transcript_support_level(None), None)
+    eq_(_parse_transcript_support_level(""), None)
+    eq_(_parse_transcript_support_level("NA"), None)
+    eq_(_parse_transcript_support_level("NA (something)"), None)
+    eq_(_parse_transcript_support_level("   "), None)
 
 
 def test_transcript_support_level():



View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/88d09667a351d7e5a49dc993df31419ed3adef50

-- 
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/88d09667a351d7e5a49dc993df31419ed3adef50
You're receiving this email because of your account on salsa.debian.org. Manage all notifications: https://salsa.debian.org/-/profile/notifications | Help: https://salsa.debian.org/help


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20260430/5440f729/attachment-0001.htm>


More information about the debian-med-commit mailing list