[med-svn] [Git][med-team/gffread][master] 5 commits: New upstream version 0.12.1
Steffen Möller
gitlab at salsa.debian.org
Mon Aug 10 11:49:02 BST 2020
Steffen Möller pushed to branch master at Debian Med / gffread
Commits:
e5cb5692 by Steffen Moeller at 2020-08-08T22:51:40+02:00
New upstream version 0.12.1
- - - - -
1e2579c4 by Steffen Moeller at 2020-08-08T22:51:40+02:00
routine-update: New upstream version
- - - - -
9dbff771 by Steffen Moeller at 2020-08-08T22:51:43+02:00
Update upstream source from tag 'upstream/0.12.1'
Update to upstream version '0.12.1'
with Debian dir 6ced4dedde8d40954843cc472a057b848a356ed9
- - - - -
b8893762 by Steffen Moeller at 2020-08-08T22:51:43+02:00
routine-update: debhelper-compat 13
- - - - -
d719fba1 by Steffen Moeller at 2020-08-10T12:45:36+02:00
Preparing for new upstream version.
- - - - -
17 changed files:
- Makefile
- README.md
- debian/changelog
- debian/control
- debian/patches/gclib.patch
- debian/patches/hardening
- + examples/README.md
- + examples/annotation.gff
- + examples/genome.fa
- + examples/output/ann_simple.gff
- + examples/output/annotation.gtf
- + examples/output/transcripts.fa
- + examples/transcripts.gtf
- gff_utils.cpp
- gff_utils.h
- gffread.cpp
- tag_git.sh
Changes:
=====================================
Makefile
=====================================
@@ -13,7 +13,7 @@ BASEFLAGS := -Wall -Wextra ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \
-D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \
-std=c++0x -fno-exceptions -fno-rtti
-GCCV8 := $(shell expr `g++ -dumpversion | cut -f1 -d.` \>= 8)
+GCCV8 := $(shell expr `${CXX} -dumpversion | cut -f1 -d.` \>= 8)
ifeq "$(GCCV8)" "1"
BASEFLAGS += -Wno-class-memaccess
endif
@@ -34,12 +34,12 @@ else
ifneq (,$(filter %memcheck %memdebug, $(MAKECMDGOALS)))
#use sanitizer in gcc 4.9+
MEMCHECK_BUILD := 1
- GCCVER49 := $(shell expr `g++ -dumpversion | cut -f1,2 -d.` \>= 4.9)
+ GCCVER49 := $(shell expr `${CXX} -dumpversion | cut -f1,2 -d.` \>= 4.9)
ifeq "$(GCCVER49)" "0"
$(error gcc version 4.9 or greater is required for this build target)
endif
CXXFLAGS += -fno-omit-frame-pointer -fsanitize=undefined -fsanitize=address
- GCCVER5 := $(shell expr `g++ -dumpversion | cut -f1 -d.` \>= 5)
+ GCCVER5 := $(shell expr `${CXX} -dumpversion | cut -f1 -d.` \>= 5)
ifeq "$(GCCVER5)" "1"
CXXFLAGS += -fsanitize=bounds -fsanitize=float-divide-by-zero -fsanitize=vptr
CXXFLAGS += -fsanitize=float-cast-overflow -fsanitize=object-size
@@ -75,8 +75,10 @@ OBJS := ${GCLDIR}/GBase.o ${GCLDIR}/GArgs.o ${GCLDIR}/GFaSeqGet.o \
.PHONY : all
-nodebug: release
-all release debug memcheck memdebug profile gprof prof: gffread
+all release debug memcheck memdebug profile gprof prof: ../gclib gffread
+
+../gclib:
+ git clone https://github.com/gpertea/gclib.git ../gclib
$(OBJS) : $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
gffread.o : gff_utils.h $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
=====================================
README.md
=====================================
@@ -1,18 +1,25 @@
-# gffread
-GFF/GTF parsing utility providing format conversions, region filtering,
-FASTA sequence extraction and more.
+## GffRead
-Use gffread -h to check the usage options.
+GFF/GTF utility providing format conversions, filtering, FASTA sequence
+extraction and more.
-Compiling this program from source requires the [GCLib](../../../gclib) code
-library. Building the program can be done like this:
+More details and usage examples can be found in the paper [DOI: 10.12688/f1000research.23297.1](http://dx.doi.org/10.12688/f1000research.23297.1) which can be also used to cite this software.
+
+The official webpage with download packages for this utility can be found online here:
+ http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread
+
+Use `gffread -h` to see the command line usage options.
+
+## Installation
+Building this program from source requires the [GCLib](../../../gclib) source code
+library. The `make` command should automatically fetch the latest gclib version from the repository if no `../gclib` directory is found.
```
cd /some/build/dir
- git clone https://github.com/gpertea/gclib
git clone https://github.com/gpertea/gffread
cd gffread
make release
```
+This should create the **gffread** binary in the current directory.
+
-This should build the **gffread** binary in the current directory.
=====================================
debian/changelog
=====================================
@@ -1,3 +1,14 @@
+gffread (0.12.1-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * debhelper-compat 13 (routine-update)
+ * Build-depends on newer version of libgclib.
+ * Don't depend on creating a checkout of gclib during build
+ * Added myself to uploaders.
+
+ -- Steffen Moeller <moeller at debian.org> Sat, 08 Aug 2020 22:51:47 +0200
+
gffread (0.11.8-1) unstable; urgency=medium
* Adjust the autopkgtests to exclude the spades dependency on !amd64
=====================================
debian/control
=====================================
@@ -1,11 +1,12 @@
Source: gffread
Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
Uploaders: Andreas Tille <tille at debian.org>,
- Michael R. Crusoe <crusoe at debian.org>
+ Michael R. Crusoe <crusoe at debian.org>,
+ Steffen Moeller <moeller at debian.org>
Section: science
Priority: optional
-Build-Depends: debhelper-compat (= 12),
- libgclib-dev
+Build-Depends: debhelper-compat (= 13),
+ libgclib-dev (>= 0.11.10)
Standards-Version: 4.5.0
Vcs-Browser: https://salsa.debian.org/med-team/gffread
Vcs-Git: https://salsa.debian.org/med-team/gffread.git
=====================================
debian/patches/gclib.patch
=====================================
@@ -2,9 +2,20 @@ Author: Andreas Tille <tille at debian.org>
Last-Update: Thu, 18 Apr 2019 12:40:25 +0200
Description: Fix build against libgclib
+Index: gffread/Makefile
+===================================================================
--- gffread.orig/Makefile
+++ gffread/Makefile
-@@ -83,8 +83,8 @@
+@@ -75,7 +75,7 @@ OBJS := ${GCLDIR}/GBase.o ${GCLDIR}/GArg
+
+ .PHONY : all
+
+-all release debug memcheck memdebug profile gprof prof: ../gclib gffread
++all release debug memcheck memdebug profile gprof prof: gffread
+
+ ../gclib:
+ git clone https://github.com/gpertea/gclib.git ../gclib
+@@ -85,8 +85,8 @@ gffread.o : gff_utils.h $(GCLDIR)/GBase.
gff_utils.o : gff_utils.h $(GCLDIR)/gff.h
${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
=====================================
debian/patches/hardening
=====================================
@@ -2,9 +2,11 @@ From: Michael R. Crusoe <michael.crusoe at gmail.com>
Subject: Use CPPFLAGS
Allows Debian to harden the binary with CPPFLAGS=-Wdate-time -D_FORTIFY_SOURCE=2
+Index: gffread/Makefile
+===================================================================
--- gffread.orig/Makefile
+++ gffread/Makefile
-@@ -65,7 +65,7 @@
+@@ -65,7 +65,7 @@ endif
#endif
%.o : %.cpp
@@ -13,7 +15,7 @@ Allows Debian to harden the binary with CPPFLAGS=-Wdate-time -D_FORTIFY_SOURCE=2
# C/C++ linker
-@@ -84,7 +84,7 @@
+@@ -86,7 +86,7 @@ gff_utils.o : gff_utils.h $(GCLDIR)/gff.
${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
gffread: gffread.o gff_utils.o
=====================================
examples/README.md
=====================================
@@ -0,0 +1,29 @@
+## GffRead usage examples
+
+GffRead can be used to simply read an annotation file in a GFF format, and print it in either GFF3 (default) or
+GTF2 format (with the -T option), while discarding any non-trasncript features and optional attributes.
+It can also report some potential issues found in the input GFF records. The command line for such a quick GFF/GTF
+file cleanup would be:
+```
+gffread -E annotation.gff -o ann_simple.gff
+```
+
+This will create a minimalist GFF3 re-formatting of the transcript records found in the input file (`annotation.gff` in this example).
+The -E option directs GffRead to "expose" (display warnings about) any potential formatting issues
+encountered while parsing the input file.
+
+In order to obtain the GTF2 version of the same transcript records, the `-T` option should be added:
+```
+gffread annotation.gff -T -o annotation.gtf
+```
+
+GffRead can be used to generate a FASTA file with the DNA sequences for all transcripts in a GFF file. For this operation
+a fasta file with the genomic sequences has to be provided as well. This can be accomplished with a command line like this:
+```
+gffread -w transcripts.fa -g genome.fa annotation.gff
+```
+The file `genome.fa` in this example would be a multi-fasta file with the chromosome/contig sequences of the target genome.
+This also requires that every contig or chromosome name found in the 1st column of the input GFF file
+(`annotation.gff` in this example) must have a corresponding sequence entry in the `genome.fa` file.
+
+The `output` directory contains all the output files that should be generated by the above examples.
=====================================
examples/annotation.gff
=====================================
@@ -0,0 +1,86 @@
+NT_187562.1 BestRefSeq gene 411 68627 . + . ID=gene55473;Dbxref=GeneID:8972,HGNC:HGNC:7043,MIM:154360;Name=MGAM;description=maltase-glucoamylase;gbkey=Gene;gene=MGAM;gene_biotype=protein_coding;gene_synonym=MG,MGA;partial=true;start_range=.,411
+NT_187562.1 BestRefSeq mRNA 411 68627 . + . ID=rna157470;Parent=gene55473;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Name=NM_004668.2;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;start_range=.,411;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 411 495 . + . ID=id1808900;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;start_range=.,411;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 1995 2051 . + . ID=id1808901;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 2602 2726 . + . ID=id1808902;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 9665 9753 . + . ID=id1808903;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 12115 12164 . + . ID=id1808904;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 12577 12744 . + . ID=id1808905;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 14174 14326 . + . ID=id1808906;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 14664 14864 . + . ID=id1808907;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 16634 16788 . + . ID=id1808908;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 17438 17606 . + . ID=id1808909;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 17880 17976 . + . ID=id1808910;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 18710 18822 . + . ID=id1808911;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 20083 20208 . + . ID=id1808912;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 21352 21480 . + . ID=id1808913;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 21736 21846 . + . ID=id1808914;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 22191 22253 . + . ID=id1808915;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 24448 24582 . + . ID=id1808916;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 25379 25466 . + . ID=id1808917;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 26264 26402 . + . ID=id1808918;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 27215 27348 . + . ID=id1808919;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 56500 56534 . + . ID=id1808920;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 56627 56743 . + . ID=id1808921;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 57445 57593 . + . ID=id1808922;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 58211 58295 . + . ID=id1808923;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 59473 59529 . + . ID=id1808924;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 61493 61617 . + . ID=id1808925;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 62682 62770 . + . ID=id1808926;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 64510 64559 . + . ID=id1808927;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 65149 65319 . + . ID=id1808928;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq exon 67694 68627 . + . ID=id1808929;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NM_004668.2,HGNC:HGNC:7043,MIM:154360;Note=The RefSeq transcript aligns at 64%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=MGAM;partial=true;product=maltase-glucoamylase;transcript_id=NM_004668.2
+NT_187562.1 BestRefSeq CDS 411 495 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2;start_range=.,411
+NT_187562.1 BestRefSeq CDS 1995 2051 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 2602 2726 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 9665 9753 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 12115 12164 . + 2 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 12577 12744 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 14174 14326 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 14664 14864 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 16634 16788 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 17438 17606 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 17880 17976 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 18710 18822 . + 2 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 20083 20208 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 21352 21480 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 21736 21846 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 22191 22253 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 24448 24582 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 25379 25466 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 26264 26402 . + 2 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 27215 27348 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 56500 56534 . + 2 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 56627 56743 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 57445 57593 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 58211 58295 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 59473 59529 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 61493 61617 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 62682 62770 . + 1 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 64510 64559 . + 2 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 65149 65319 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 BestRefSeq CDS 67694 67771 . + 0 ID=cds110496;Parent=rna157470;Dbxref=GeneID:8972,Genbank:NP_004659.2,HGNC:HGNC:7043,MIM:154360;Name=NP_004659.2;Note=The RefSeq protein aligns at 59%25 coverage compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=CDS;gene=MGAM;partial=true;product=maltase-glucoamylase%2C intestinal;protein_id=NP_004659.2
+NT_187562.1 Curated Genomic gene 214038 219958 . - . ID=gene55476;Dbxref=GeneID:136541,HGNC:HGNC:39125;Name=PRSS58;description=protease%2C serine 58;gbkey=Gene;gene=PRSS58;gene_biotype=protein_coding;gene_synonym=PRSS1,TRY1,TRYX3,UNQ2540
+NT_187562.1 Curated Genomic mRNA 214038 219958 . - . ID=rna157473;Parent=gene55476;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;Name=NM_001001317.4;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 219891 219958 . - . ID=id1808989;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 219568 219648 . - . ID=id1808990;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 217435 217573 . - . ID=id1808991;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 216955 217211 . - . ID=id1808992;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 214372 214511 . - . ID=id1808993;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic exon 214038 214270 . - . ID=id1808994;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NM_001001317.4,HGNC:HGNC:39125;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=PRSS58;product=protease%2C serine 58;transcript_id=NM_001001317.4
+NT_187562.1 Curated Genomic CDS 219568 219607 . - 0 ID=cds110498;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NP_001001317.1,HGNC:HGNC:39125;Name=NP_001001317.1;gbkey=CDS;gene=PRSS58;product=serine protease 58 precursor;protein_id=NP_001001317.1
+NT_187562.1 Curated Genomic CDS 217435 217573 . - 2 ID=cds110498;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NP_001001317.1,HGNC:HGNC:39125;Name=NP_001001317.1;gbkey=CDS;gene=PRSS58;product=serine protease 58 precursor;protein_id=NP_001001317.1
+NT_187562.1 Curated Genomic CDS 216955 217211 . - 1 ID=cds110498;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NP_001001317.1,HGNC:HGNC:39125;Name=NP_001001317.1;gbkey=CDS;gene=PRSS58;product=serine protease 58 precursor;protein_id=NP_001001317.1
+NT_187562.1 Curated Genomic CDS 214372 214511 . - 2 ID=cds110498;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NP_001001317.1,HGNC:HGNC:39125;Name=NP_001001317.1;gbkey=CDS;gene=PRSS58;product=serine protease 58 precursor;protein_id=NP_001001317.1
+NT_187562.1 Curated Genomic CDS 214121 214270 . - 0 ID=cds110498;Parent=rna157473;Dbxref=GeneID:136541,Genbank:NP_001001317.1,HGNC:HGNC:39125;Name=NP_001001317.1;gbkey=CDS;gene=PRSS58;product=serine protease 58 precursor;protein_id=NP_001001317.1
+NT_187562.1 BestRefSeq gene 962573 963937 . + . ID=gene55581;Dbxref=GeneID:135927,HGNC:HGNC:21750;Name=C7orf34;description=chromosome 7 open reading frame 34;gbkey=Gene;gene=C7orf34;gene_biotype=protein_coding;gene_synonym=ctm-1
+NT_187562.1 BestRefSeq mRNA 962573 963937 . + . ID=rna157497;Parent=gene55581;Dbxref=GeneID:135927,Genbank:NM_178829.4,HGNC:HGNC:21750;Name=NM_178829.4;gbkey=mRNA;gene=C7orf34;product=chromosome 7 open reading frame 34;transcript_id=NM_178829.4
+NT_187562.1 BestRefSeq exon 962573 962821 . + . ID=id1809397;Parent=rna157497;Dbxref=GeneID:135927,Genbank:NM_178829.4,HGNC:HGNC:21750;gbkey=mRNA;gene=C7orf34;product=chromosome 7 open reading frame 34;transcript_id=NM_178829.4
+NT_187562.1 BestRefSeq exon 963419 963937 . + . ID=id1809398;Parent=rna157497;Dbxref=GeneID:135927,Genbank:NM_178829.4,HGNC:HGNC:21750;gbkey=mRNA;gene=C7orf34;product=chromosome 7 open reading frame 34;transcript_id=NM_178829.4
+NT_187562.1 BestRefSeq CDS 962614 962821 . + 0 ID=cds110593;Parent=rna157497;Dbxref=GeneID:135927,Genbank:NP_849151.2,HGNC:HGNC:21750;Name=NP_849151.2;gbkey=CDS;gene=C7orf34;product=uncharacterized protein C7orf34;protein_id=NP_849151.2
+NT_187562.1 BestRefSeq CDS 963419 963654 . + 2 ID=cds110593;Parent=rna157497;Dbxref=GeneID:135927,Genbank:NP_849151.2,HGNC:HGNC:21750;Name=NP_849151.2;gbkey=CDS;gene=C7orf34;product=uncharacterized protein C7orf34;protein_id=NP_849151.2
+NT_187562.1 Curated Genomic gene 230181 234148 . - . ID=gene55477;Dbxref=GeneID:207147;Name=TRY2P;description=trypsinogen-like pseudogene;gbkey=Gene;gene=TRY2P;gene_biotype=misc_RNA;pseudo=true
+NT_187562.1 Curated Genomic transcript 230181 234148 . - . ID=rna157474;Parent=gene55477;Dbxref=GeneID:207147,Genbank:NR_036483.2;Name=NR_036483.2;exception=annotated by transcript or proteomic data;gbkey=misc_RNA;gene=TRY2P;product=trypsinogen-like pseudogene;transcript_id=NR_036483.2
+NT_187562.1 Curated Genomic exon 233852 234148 . - . ID=id1808995;Parent=rna157474;Dbxref=GeneID:207147,Genbank:NR_036483.2;exception=annotated by transcript or proteomic data;gbkey=misc_RNA;gene=TRY2P;product=trypsinogen-like pseudogene;transcript_id=NR_036483.2
+NT_187562.1 Curated Genomic exon 233047 233200 . - . ID=id1808996;Parent=rna157474;Dbxref=GeneID:207147,Genbank:NR_036483.2;exception=annotated by transcript or proteomic data;gbkey=misc_RNA;gene=TRY2P;product=trypsinogen-like pseudogene;transcript_id=NR_036483.2
+NT_187562.1 Curated Genomic exon 230181 231784 . - . ID=id1808997;Parent=rna157474;Dbxref=GeneID:207147,Genbank:NR_036483.2;exception=annotated by transcript or proteomic data;gbkey=misc_RNA;gene=TRY2P;product=trypsinogen-like pseudogene;transcript_id=NR_036483.2
=====================================
examples/genome.fa
=====================================
The diff for this file was not included because it is too large.
=====================================
examples/output/ann_simple.gff
=====================================
@@ -0,0 +1,85 @@
+# gffread -E annotation.gff -o ann_simple.gff
+# gffread v0.11.8
+##gff-version 3
+NT_187562.1 BestRefSeq mRNA 411 68627 . + . ID=rna157470;geneID=gene55473;gene_name=MGAM
+NT_187562.1 BestRefSeq exon 411 495 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 1995 2051 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 2602 2726 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 9665 9753 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 12115 12164 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 12577 12744 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 14174 14326 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 14664 14864 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 16634 16788 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 17438 17606 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 17880 17976 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 18710 18822 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 20083 20208 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 21352 21480 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 21736 21846 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 22191 22253 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 24448 24582 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 25379 25466 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 26264 26402 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 27215 27348 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 56500 56534 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 56627 56743 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 57445 57593 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 58211 58295 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 59473 59529 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 61493 61617 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 62682 62770 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 64510 64559 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 65149 65319 . + . Parent=rna157470
+NT_187562.1 BestRefSeq exon 67694 68627 . + . Parent=rna157470
+NT_187562.1 BestRefSeq CDS 411 495 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 1995 2051 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 2602 2726 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 9665 9753 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 12115 12164 . + 2 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 12577 12744 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 14174 14326 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 14664 14864 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 16634 16788 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 17438 17606 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 17880 17976 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 18710 18822 . + 2 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 20083 20208 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 21352 21480 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 21736 21846 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 22191 22253 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 24448 24582 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 25379 25466 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 26264 26402 . + 2 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 27215 27348 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 56500 56534 . + 2 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 56627 56743 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 57445 57593 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 58211 58295 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 59473 59529 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 61493 61617 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 62682 62770 . + 1 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 64510 64559 . + 2 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 65149 65319 . + 0 Parent=rna157470
+NT_187562.1 BestRefSeq CDS 67694 67771 . + 0 Parent=rna157470
+NT_187562.1 Curated Genomic mRNA 214038 219958 . - . ID=rna157473;geneID=gene55476;gene_name=PRSS58
+NT_187562.1 Curated Genomic exon 214038 214270 . - . Parent=rna157473
+NT_187562.1 Curated Genomic exon 214372 214511 . - . Parent=rna157473
+NT_187562.1 Curated Genomic exon 216955 217211 . - . Parent=rna157473
+NT_187562.1 Curated Genomic exon 217435 217573 . - . Parent=rna157473
+NT_187562.1 Curated Genomic exon 219568 219648 . - . Parent=rna157473
+NT_187562.1 Curated Genomic exon 219891 219958 . - . Parent=rna157473
+NT_187562.1 Curated Genomic CDS 214121 214270 . - 0 Parent=rna157473
+NT_187562.1 Curated Genomic CDS 214372 214511 . - 2 Parent=rna157473
+NT_187562.1 Curated Genomic CDS 216955 217211 . - 1 Parent=rna157473
+NT_187562.1 Curated Genomic CDS 217435 217573 . - 2 Parent=rna157473
+NT_187562.1 Curated Genomic CDS 219568 219607 . - 0 Parent=rna157473
+NT_187562.1 Curated Genomic transcript 230181 234148 . - . ID=rna157474;geneID=gene55477;gene_name=TRY2P
+NT_187562.1 Curated Genomic exon 230181 231784 . - . Parent=rna157474
+NT_187562.1 Curated Genomic exon 233047 233200 . - . Parent=rna157474
+NT_187562.1 Curated Genomic exon 233852 234148 . - . Parent=rna157474
+NT_187562.1 BestRefSeq mRNA 962573 963937 . + . ID=rna157497;geneID=gene55581;gene_name=C7orf34
+NT_187562.1 BestRefSeq exon 962573 962821 . + . Parent=rna157497
+NT_187562.1 BestRefSeq exon 963419 963937 . + . Parent=rna157497
+NT_187562.1 BestRefSeq CDS 962614 962821 . + 0 Parent=rna157497
+NT_187562.1 BestRefSeq CDS 963419 963654 . + 2 Parent=rna157497
=====================================
examples/output/annotation.gtf
=====================================
@@ -0,0 +1,82 @@
+NT_187562.1 BestRefSeq transcript 411 68627 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 411 495 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 1995 2051 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 2602 2726 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 9665 9753 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 12115 12164 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 12577 12744 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 14174 14326 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 14664 14864 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 16634 16788 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 17438 17606 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 17880 17976 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 18710 18822 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 20083 20208 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 21352 21480 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 21736 21846 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 22191 22253 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 24448 24582 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 25379 25466 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 26264 26402 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 27215 27348 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 56500 56534 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 56627 56743 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 57445 57593 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 58211 58295 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 59473 59529 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 61493 61617 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 62682 62770 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 64510 64559 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 65149 65319 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq exon 67694 68627 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 411 495 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 1995 2051 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 2602 2726 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 9665 9753 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 12115 12164 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 12577 12744 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 14174 14326 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 14664 14864 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 16634 16788 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 17438 17606 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 17880 17976 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 18710 18822 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 20083 20208 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 21352 21480 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 21736 21846 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 22191 22253 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 24448 24582 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 25379 25466 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 26264 26402 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 27215 27348 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 56500 56534 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 56627 56743 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 57445 57593 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 58211 58295 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 59473 59529 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 61493 61617 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 62682 62770 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 64510 64559 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 65149 65319 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 BestRefSeq CDS 67694 67771 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM";
+NT_187562.1 Curated Genomic transcript 214038 219958 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 214038 214270 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 214372 214511 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 216955 217211 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 217435 217573 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 219568 219648 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic exon 219891 219958 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic CDS 214121 214270 . - 0 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic CDS 214372 214511 . - 2 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic CDS 216955 217211 . - 1 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic CDS 217435 217573 . - 2 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic CDS 219568 219607 . - 0 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58";
+NT_187562.1 Curated Genomic transcript 230181 234148 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P";
+NT_187562.1 Curated Genomic exon 230181 231784 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P";
+NT_187562.1 Curated Genomic exon 233047 233200 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P";
+NT_187562.1 Curated Genomic exon 233852 234148 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P";
+NT_187562.1 BestRefSeq transcript 962573 963937 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34";
+NT_187562.1 BestRefSeq exon 962573 962821 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34";
+NT_187562.1 BestRefSeq exon 963419 963937 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34";
+NT_187562.1 BestRefSeq CDS 962614 962821 . + 0 transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34";
+NT_187562.1 BestRefSeq CDS 963419 963654 . + 2 transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34";
=====================================
examples/output/transcripts.fa
=====================================
@@ -0,0 +1,119 @@
+>rna157470 CDS=2-3343
+GTTCTACGAGGACAACAGCACTTGGGATGTGCACCAACAGTTCTTATGGGGGCCCGGCCTCCTCATCACT
+CCAGTTCTGGATGAAGGTGCAGAGAAAGTGATGGCATATGTGCCTGATGCTGTCTGGTATGACTACGAGA
+CTGGGAGCCAAGTGAGATGGAGGAAGCAAAAAGTCGAGATGGAACTTCCTGGAGACAAAATTGGACTTCA
+CCTTCGAGGAGGCTACATCTTCCCCACACAGCAGCCAAATACAACCACTCTGGCCAGTCGAAAGAACCCT
+CTTGGTCTTATCATTGCCCTAGATGAGAACAAAGAAGCAAAAGGAGAACTTTTCTGGGATAATGGGGAAA
+CGAAGGATACTGTGGCCAATAAAGTGTATCTTTTATGTGAGTTTTCTGTCACTCAAAACCGCTTGGAGGT
+GAATATTTCACAATCAACCTACAAGGACCCCAATAATTTAGCATTTAATGAGATTAAAATTCTTGGGACG
+GAGGAACCTAGCAATGTTACAGTGAAACACAATGGTGTCCCAAGTCAGACTTCTCCTACAGTCACTTATG
+ATTCTAACCTGAAGGTTGCCATTATCACAGATATTGATCTTCTCCTGGGAGAAGCATACACAGTGGAATG
+GAGCATAAAGATAAGGGATGAAGAAAAAATAGACTGTTACCCTGATGAGAATGGTGCTTCTGCCGAAAAC
+TGCACTGCCCGTGGCTGTATCTGGGAGGCATCCAATTCTTCTGGAGTCCCTTTTTGCTATTTTGTCAACG
+ACCTATACTCTGTCAGTGATGTTCAGTATAATTCCCATGGGGCCACAGCTGACATCTCCTTAAAGTCTTC
+CGTTTATGCCAATGCCTTCCCCTCCACACCCGTGAACCCCCTTCGCCTGGATGTCACTTACCATAAGAAT
+GAAATGCTGCAGTTCAAGATTTATGATCCCAACAAGAATCGGTATGAAGTTCCAGTCCCTCTGAACATAC
+CCAGCATGCCATCCAGCACCCCTGAGGGTCAACTCTATGATGTGCTCATTAAGAAGAATCCATTTGGGAT
+TGAAATTCGCCGGAAGAGTACAGGCACTATAATTTGGGACTCTCAGCTCCTTGGCTTTACCTTCAGTGAC
+ATGTTTATCCGCATCTCCACCCGCCTTCCCTCCAAGTACCTCTATGGCTTTGGGGAAACTGAGCACAGGT
+CCTATAGGAGAGACTTGGAGTGGCACACTTGGGGGATGTTCTCCCGAGACCAGCCCCCAGGGTACAAGAA
+GAATTCCTATGGTGTCCACCCCTACTACATGGGGCTGGAGGAGGACGGCAGTGCCCATGGAGTGCTCCTG
+CTGAACAGCAATGCCATGGATGTGACGTTCCAGCCCCTGCCTGCCTTGACATACCGCACCACAGGGGGAG
+TTCTGGACTTTTATGTGTTCTTGGGGCCGACTCCAGAGCTTGTCACCCAGCAGTACACTGAGTTGATTGG
+CCGGCCTGTGATGGTACCTTACTGGTCTTTGGGGTTCCAGCTGTGTCGCTATGGCTACCAGAATGACTCT
+GAGATCGCCAGCTTGTATGATGAGATGGTGGCTGCCCAGATCCCTTATGATGTGCAGTACTCAGACATCG
+ACTACATGGAGCGGCAGCTGGACTTCACCCTCAGCCCCAAGTTTGCTGGGTTTCCAGCTCTGATCAATCG
+CATGAAGGCTGATGGGATGCGGGTCATCCTCATTCTGGATCCAGCCATTTCTGGCAATGAGACACAGCCT
+TATCCTGCCTTCACTCGGGGCGTGGAGGATGACGTCTTCATCAAATACCCAAATGATGGAGACATTGTCT
+GGGGAAAGGTCTGGCCTGATTTTCCTGATGTTGTTGTGAATGGGTCTCTAGACTGGGACAGCCAAGTGGA
+GCTATATCGAGCTTATGTGGCCTTCCCAGACTTTTTCCGTAATTCAACTGCCAAGTGGTGGAAGAGGGAA
+ATAGAAGAACTATACAACAATCCACAGAATCCAGAGAGGAGCTTGAAGTTTGATGGCATGTGGATTGATA
+TGAATGAACCATCAAGCTTCGTGAATGGGGCAGTTTCTCCAGGCTGCAGGGACGCCTCTCTGAACCACCC
+TCCCTACATGCCACATTTGGAGTCCAGGGACAGGGGCCTGAGCAGCAAGACCCTTTGTATGGAGAGTCAG
+CAGATCCTCCCAGACGGCTCCCTGGTGCAGCACTACAACGTGCACAACCTGTATGGGTGGTCCCAGACCA
+GACCCACATACGAAGCCGTGCAGGAGGTGACGGGACAGCGAGGGGTCGTCATCACCCGCTCCACATTTCC
+CTCTTCTGGCCGCTGGGCAGGACATTGGCTGGGAGACAACACGGCCGCATGGGATCAGCTGAAGAAGTCT
+ATCATTGGCATGATGGAGTTCAGCCTCTTCGGCATATCCTATACGGGAGCAGATATCTGTGGGTTCTTTC
+AAGATGCTGAATATGAGATGTGTGTTCGCTGGATGCAGCTGGGGGCCTTTTACCCCTTCTCAAGAAACCA
+CAACACCATTGGGACCAGGAGACAAGACCCTGTGTCCTGGGATGTTGCTTTTGTGAATATTTCCAGAACT
+GTCCTGCAGACCAGATACACCCTGTTGCCATATCTGTATACCTTGATGCATAAGGCCCACACGGAGGGCG
+TCACTGTTGTGCGGCCTCTGCTCCATGAGTTTGTGTCAGACCAGGTGACATGGGACATAGACAGTCAGTT
+CCTGCTGGGCCCAGCCTTCCTGGTCAGCCCTGTCCTGGAGCGTAATGCCAGAAATGTCACTGCATATTTC
+CCTAGAGCCCGCTGGTATGATTACTACACGGGTGTGGATATTAATGCAAGAGGAGAGTGGAAGACCTTGC
+CAGCCCCTCTTGACCACATTAATCTTCATGTCCGTGGGGGCTACATCCTGCCCTGGCAAGAGCCTGCACT
+GAACACCCACTTAAGCCGCCAGAAATTCATGGGCTTCAAAATTGCCTTGGATGATGAAGGAACTGCTGGG
+GGCTGGCTCTTCTGGGATGATGGGCAAAGCATTGATACCTATGGGAAAGGACTCTATTACTTGGCCAGCT
+TTTCTGCCAGCCAGAATACGATGCAAAGCCATATAATTTTCAACAATTACATCACTGGTACAAATCCTTT
+GAAACTGGGCTACATTGAAATCTGGGGAGTGGGCAGTGTCCCCGTTACCAGTGTCAGCATCTCTGTGAGT
+GGCATGGTCATAACACCCTCCTTCAACAATGACCCCACGACACAGGTATTAAGCATCGATGTGACTGACA
+GAAACATCAGCCTACATAATTTTACTTCATTGACGTGGATAAGCACTCTGTGAATTTTTACAGCAAGATT
+CTAACTAACTATGAATGACTTTGAAACTACTTATACTTCATACTCATAAAAATTATTGTGTGTTGCTAAT
+TTGTTCATACCCACTATTGGTGAAATATTTCTGTTAATTTTGTTATATGTTTTTTGTGTGAACCCTAAAG
+GTTAAACCTTAGCCCTGTGGGATAGGCAGTTAGGGAGGTGTGGAAAATCTATGCATTACCTTAATGTCTC
+TGTGTGGTTAGTATGGTAGTGACTGTTCATCATATGACATTTACTGAAGATGAACTGGGTCCATGATGAA
+GTGTGTGTATGTCCACGTTTGTAATCATAGAATGGACCCCATTCTTTTGTTAAATACACAAGAGAAAGCT
+TTCTGTGACAGTTCCAGGTCTTGAAGCTAATCAGCATCTCAAGAAAGTATCCAGAAAGAACATCTGCTAG
+TTGGTTATAGGCGGTGGGAGGAATAATATACCTAATTGGTTATAGGTGGGGGGAGCATGATAAGCAAAGA
+AAAGGCAAACACAAGGAAAGATCAGATGAAACAGAAGATGATAGTAAAAGTGATCCTAAGTAAGAACATA
+ATGTAAAATTGTCAGCAGCCTCATGGGGAGGAAAAAGGAAGAGTCAACTCACTTGAAGAAGAGGGTCTTG
+AGAAATCCTTAGCATAAAGGGCTACTGGTGAGATTGAGATCTGAGCAGGCAAAGCTCAAAAGAGAGTTTG
+GAGGTTAAAAATAATTTATTTTTGCAGTAGTGTGCTTTGAAATGTGTAAATCTTATTTCTAATGTATACA
+ACCACATTTCACATAAAAATATGCAATTTATATGCCAGATAAAAATAAAACAAGTGAATTTGCAAGTGA
+>rna157473 CDS=110-835
+AAGGCTGGCAAAAAGGAGACCAGACAGGAGGCGTCTGTAGAGATATCATGAACTTCAACTTAGCTTTGTT
+TTCCAGAGACTGGAGCTAAACTGGGCTTTCAACATCATCATGAAGTTTATCCTCCTCTGGGCTCTCTTGA
+ATCTGACTGTTGCTTTGGCCTTTAATCCAGATTACACAGTCAGCTCCACTCCCCCTTACTTGGTCTATTT
+GAAATCTGACTACTTGCCCTGCGCTGGAGTCCTGATCCACCCGCTTTGGGTGATCACAGCTGCACACTGC
+AATTTACCAAAGCTTCGGGTGATATTGGGGGTTACAATCCCAGCAGACTCTAATGAAAAGCATCTGCAAG
+TGATTGGCTATGAGAAGATGATTCATCATCCACACTTCTCAGTCACTTCTATTGATCATGACATCATGCT
+AATCAAGCTGAAAACAGAGGCTGAACTCAATGACTATGTGAAATTAGCCAACCTGCCCTACCAAACTATC
+TCTGAAAATACCATGTGCTCTGTCTCTACCTGGAGCTACAATGTGTGTGATATCTACAAAGAGCCCGATT
+CACTGCAAACTGTGAACATCTCTGTAATCTCCAAGCCTCAGTGTCGCGATGCCTATAAAACCTACAACAT
+CACGGAAAATATGCTGTGTGTGGGCATTGTGCCAGGAAGGAGGCAGCCCTGCAAGGAAGTTTCTGCTGCC
+CCGGCAATCTGCAATGGGATGCTTCAAGGAATCCTGTCTTTTGCGGATGGATGTGTTTTGAGAGCCGATG
+TTGGCATCTATGCCAAAATTTTTTACTATATACCCTGGATTGAAAATGTAATCCAAAATAACTGAGCTGT
+GGCAGTTGTGGACCATATGACACAGCTTGTCCCCATCGTTCACCTTTAGAATTAAATATAAATTAACTCC
+TCACATTG
+>rna157474
+AAGATTTGTAGGATAGGAAAGGAGGCACCAGGACCTCGGCTTGGTCTGTGCCACTCTCTTTCACCCACAT
+CAGAGACGGCTCCCAGCAATGCCGTATATATTTGGGTCTTGATAAGGACTCATGTGAAGACTGCTCTCCT
+CTCATCTGTCTTCTGCCCATAATCTCCACAGCTGTTCTTGGCAAGGACACTTGGACTAAAGATTGGGACA
+GTTTACAGCTTGATCAACTTGTCTGTTCCTGGCGAGATCTGTCTGCCACGAAGGGTTTTCTCATCGTCGC
+TCTCTTGAGTGCAGCTGGAGCTGCTCTGGCTAAAGACTCTAAAGATAATCAGGATCTGACATATAATTTT
+ACTATTCCTTACATGGTCTATCTTCAGTCCTTCCCAGAACCCTGCGTGGGGTCTCTCATTCACCTTGACT
+GGGTATTGACAGCTGCCCACTGCCCCTTACCTGTTGAAATTCGACTGGGAGTTTCTCAACCTAGCATCAC
+AAACAAGAAACAACAGATATGAAACTATTCATCGATTGTGACCTACCCTGACTTTGATGCAAAATCCTTG
+AACAATGACCTAATGATGATCAAACTATCAACACCTGCTTCACTCAACCCCCATGTGGGGACTATAGCCA
+TAGCCTTGGAACCCATTCCATTTAATGAATCCTGCTTTATTCCAACCTGGACCTGGAATGAATATAAAAA
+CCGTATGTGTTGGATGCTGTGTTCTTTATGCTGAGGAGGTTGGAGCTGGGAGAAAATGTAGAGGGATGTC
+CCTAATGGAAATCGCTGCTATGAGACAATGTTGTCCACCATAGAGGGATTCCTGCCAGGGACCCAAACCA
+CCATCACTGGAGGTCAAAGATAAAATTCAAAGAGACTGATAAAGGGCAGCCACTCTGACACCAACAAATG
+GAGCAGACAGTAATTTTCATATCTCTAGAATATTGAAGACAAGAGTCCAGAGAACTGGATTGAAATACAG
+GTGGGAGAAATAGGAGTCCTCGTGCACAAAGACAGTATGAAGACAGCCTGCGGGAGGGTGACACCCATTG
+ATAGTTGCTGAAAGCCCATCACACAATGGAAGAACAGTGGGGGAAATGAACATAAAATATATGTGATGCG
+GTGAAGCAGCAGAAGATGCTTCCTACACCTTCCGCAGAGAACCCAGTTATGTGCAGAGAACCCTGGGAGA
+GAAGAGAATGGTTGAGGGAGTACCTGTCCTGTCACCAGAGCTGGTTCCCTGCACAGAGTGCTACTCATAC
+CCACTGCTTCTTCCCAGAGTTCCCATTTAGGGGTGGAGACTTTGGTGGTTCGTAGGGAAATTGATTATTA
+TGACATAATCAGTTGTTATCTCCCTGGTCTGACATGCCTCACAAGAAAAACAGTGGCAGCCCATGTTGTA
+AGCACACACCCATCTCCCCTGTGAGCTGAAATTGGGATGAGCAGAAAGATCCGGTCTCAGCAGGAGTCTG
+TCTAATAAAGAACACCTGTGAAGAAAACTACTTTCTAATAAGTTTGTTGGTCCTTGTTGGCTGTTACTCC
+ATTATTCTCTCCTTAGGTATTGTTGAGGATGAGGATACCACCACTTGATATCTGAAATGAAATGACAAAT
+GTAAATCTATTGCTTATTACAGCAAGTCAGAGCAATGCTGACCAGGCACCATCCCACTAAGCAGAATGGA
+GTGCTGATTAGCATAAGGGGTTTGTGGGAAGCTTGGAGTTGAGGGATTGGAGGACTTTCAGAGGTATGAA
+TGGGTTCCATCATCTGTTGAGGCCTGGCTACTTCAGGGGGATTGCTGTTGATTGATTGGTGCTCTGAGGC
+GTGTTTTTCAGAGGACCTCTGATCCTGATCAGCTGGTTTTAAGAAGTGAAGGCTGACATTGATTATACTG
+CAAAAGTTATGTTCACTCAAGCAAGTTGTCCTGGATCTATTAATCAGGTTTAAATCTCTCTCTGGTGGCT
+ACAGAACAATGCACGTGCTCTGTAAAAGCAAATAAAAGTGGAATTTTAGAAGTCAGTCTTTGGTAAAAAA
+CAGAAATAAAAAGCAGAAACTATTA
+>rna157497 CDS=42-485
+GGAGAAAATGCCCCTGGAAAGGGTTAAGGGCCAGGACAGGAATGGGGCAGGAGGTGCACGGATCCTGCTG
+GGCACTGGGAGCAGGGGGCGGCCAAAGGCAGTGGGTGGGCAGGTCCATGCCTCCCCTGGCCCCCCAGCTC
+TGCAGGGCAGTGTTCCTGGTTCCTATCTTGCTGCTGCTGCAGGTGAAGCCTCTGAACGGGAGCCCAGGCC
+CCAAAGATGGGAGCCAGACAGAGAAAACGCCCTCTGCAGACCAGAATCAAGAACAGTTCGAAGAGCACTT
+TGTGGCCTCCTCAGTGGGTGAGATGTGGCAGGTGGTGGACATGGCCCAGCAGGAAGAAGACCAGTCGTCC
+AAGACGGCAGCTGTTCACAAGCACTCTTTCCACCTCAGCTTCTGCTTTAGTCTGGCCAGTGTCATGGTTT
+TCTCAGGAGGGCCATTGAGGCGGACATTCCCAAATATCCAACTCTGCTTCATGCTCACTCACTGACCCTC
+CCTCCCTCCTGGGCTCCAGGTCACAACTCCCAAAGGAGATGCAGGCATGGCTCTCTGCCTCTGATCACCA
+TCACTGTATCTCAAGGTTCAGCAGCAGAGATACCAGTTGCCATCAGTGCTAACTGACTGCCTCTCCAGGT
+TCGGAGTTTCATCTCCCAGGGCCAGAGACAGCAGACCCACATCCTTCTCTCCCACACCTCTCCTGGTTTT
+GTTCAGGACAGCAGATTAGAGGCAGGAGGCAATGACAATAAAATAACGATAAAATCCTGAGAACAATT
=====================================
examples/transcripts.gtf
=====================================
@@ -0,0 +1,17 @@
+NT_187562.1 StringTie transcript 403 10275 . + . transcript_id "STRG.1.1"; gene_id "STRG.1";
+NT_187562.1 StringTie exon 403 495 . + . transcript_id "STRG.1.1"; gene_id "STRG.1";
+NT_187562.1 StringTie exon 1995 2726 . + . transcript_id "STRG.1.1"; gene_id "STRG.1";
+NT_187562.1 StringTie exon 9665 10275 . + . transcript_id "STRG.1.1"; gene_id "STRG.1";
+NT_187562.1 StringTie transcript 214032 219967 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 214032 214270 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 214372 214511 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 216955 217211 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 217435 217573 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 219568 219648 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie exon 219891 219967 . - . transcript_id "STRG.3.1"; gene_id "STRG.3";
+NT_187562.1 StringTie transcript 230172 233179 . - . transcript_id "STRG.4.2"; gene_id "STRG.4";
+NT_187562.1 StringTie exon 230172 231784 . - . transcript_id "STRG.4.2"; gene_id "STRG.4";
+NT_187562.1 StringTie exon 233047 233179 . - . transcript_id "STRG.4.2"; gene_id "STRG.4";
+NT_187562.1 BestRefSeq transcript 962573 963937 . + . transcript_id "STRG.6.2"; gene_id "STRG.6";
+NT_187562.1 BestRefSeq exon 962573 962829 . + . transcript_id "STRG.6.2"; gene_id "STRG.6";
+NT_187562.1 BestRefSeq exon 963419 963937 . + . transcript_id "STRG.6.2"; gene_id "STRG.6";
=====================================
gff_utils.cpp
=====================================
@@ -1,6 +1,67 @@
#include "gff_utils.h"
+GHash<GeneInfo> gene_ids;
+
bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose
+bool debugMode=false;
+bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
+
+FILE* ffasta=NULL;
+FILE* f_in=NULL;
+FILE* f_out=NULL;
+FILE* f_w=NULL; //writing fasta with spliced exons (transcripts)
+int wPadding = 0; //padding for -w option
+FILE* f_x=NULL; //writing fasta with spliced CDS
+FILE* f_y=NULL; //wrting fasta with translated CDS
+
+
+int maxintron=999000000;
+
+bool wCDSonly=false;
+bool wNConly=false;
+int minLen=0; //minimum transcript length
+bool validCDSonly=false; // translation with no in-frame STOP
+bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
+bool altPhases=false; //if original phase fails translation validation,
+ //try the other 2 phases until one makes it
+bool addCDSattrs=false;
+bool add_hasCDS=false;
+//bool streamIn=false; // --stream option
+bool adjustStop=false; //automatic adjust the CDS stop coordinate
+bool covInfo=false; // --cov-info : only report genome coverage
+GStr tableFormat; //list of "attributes" to print in tab delimited format
+bool spliceCheck=false; //only known splice-sites
+bool decodeChars=false; //decode url-encoded chars in attrs (-D)
+bool StarStop=false; //use * instead of . for stop codon translation
+bool fullCDSonly=false; // starts with START, ends with STOP codon
+
+bool multiExon=false;
+bool writeExonSegs=false;
+char* tracklabel=NULL;
+char* rfltGSeq=NULL;
+char rfltStrand=0;
+uint rfltStart=0;
+uint rfltEnd=MAX_UINT;
+bool rfltWithin=false; //check for full containment within given range
+bool addDescr=false;
+
+
+bool fmtGFF3=true; //default output: GFF3
+//other formats only make sense in transcriptOnly mode
+bool fmtGTF=false;
+bool fmtBED=false;
+bool fmtTLF=false;
+bool fmtTable=false;
+
+GffPrintMode exonPrinting=pgffAny;
+
+GFastaDb gfasta;
+
+GHash<SeqInfo> seqinfo;
+GVec<CTableField> tableCols;
+GHash<RefTran> reftbl;
+
+GHash<int> isoCounter; //counts the valid isoforms
void printFasta(FILE* f, GStr* defline, char* seq, int seqlen, bool useStar) {
if (seq==NULL) return;
@@ -135,6 +196,561 @@ bool tMatch(GffObj& a, GffObj& b) {
}
+char* getSeqDescr(char* seqid) {
+ static char charbuf[128];
+ if (seqinfo.Count()==0) return NULL;
+ char* suf=rstrchr(seqid, '.');
+ if (suf!=NULL) *suf=0;
+ SeqInfo* seqd=seqinfo.Find(seqid);
+ if (suf!=NULL) *suf='.';
+ if (seqd!=NULL) {
+ GStr s(seqd->descr);
+ //cleanup some Uniref gunk
+ if (s[0]=='[') {
+ int r=s.index(']');
+ if (r>=0 && r<8 && isdigit(s[1]))
+ s.remove(0,r+1);
+ }
+ if (s.length()>80) {
+ int r=s.index(';');
+ if (r>5) s.cut(r);
+ }
+ if (s.length()>127) {
+ s.cut(127);
+ int r=s.rindex(' ');
+ if (r>0) s.cut(r);
+ }
+ strcpy(charbuf, s.chars());
+ return charbuf;
+ }
+ else return NULL;
+}
+
+char* getSeqName(char* seqid) {
+ static char charbuf[128];
+ char* suf=rstrchr(seqid, '.');
+ if (suf!=NULL) *suf=0;
+ strcpy(charbuf, seqid);
+ if (suf!=NULL) *suf='.';
+ return charbuf;
+}
+
+
+int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst) {
+ //adj>0, extend CDS to include a potential stop codon
+ //when CDS is expanded, the terminal exon might have to be adjusted too
+ int realadj=0;
+ if (gffrec.strand=='-') {
+ if ((int)gffrec.CDstart>adj) {
+ gffrec.CDstart-=adj;
+ realadj=adj;
+ if (gffrec.exons.First()->start>gffrec.CDstart) {
+ gffrec.covlen+=gffrec.exons.First()->start - gffrec.CDstart;
+ gffrec.exons.First()->start=gffrec.CDstart;
+ gffrec.start=gffrec.CDstart;
+ }
+ }
+ }
+ else { // forward strand
+ //expand beyond
+ realadj=adj;
+ gffrec.CDend+=adj;
+ if (adj<0) {//restore
+ if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
+ gffrec.exons.Last()->end+=adj;
+ gffrec.end=gffrec.exons.Last()->end;
+ gffrec.covlen+=adj;
+ }
+ }
+ else if (gffrec.exons.Last()->end<gffrec.CDend) {
+ gffrec.covlen+=gffrec.CDend-gffrec.exons.Last()->end;
+ gffrec.exons.Last()->end=gffrec.CDend;
+ gffrec.end=gffrec.CDend;
+ }
+ }
+ if (seglst!=NULL) seglst->Last()->end+=realadj;
+ return realadj;
+ }
+
+void printTableData(FILE* f, GffObj& g, bool inFasta) {
+ //using attribute list in tableCols
+ char* av=NULL;
+ for(int i=0;i<tableCols.Count();i++) {
+ if (i>0 || inFasta) {
+ if (!inFasta || tableCols[i].type!=ctfGFF_ID)
+ fprintf(f,"\t");
+ }
+ switch(tableCols[i].type) {
+ case ctfGFF_Attr:
+ av=g.getAttr(tableCols[i].name.chars());
+ fprintf(f,"%s",av!=NULL? av : ".");
+ break;
+ case ctfGFF_chr:
+ fprintf(f,"%s",g.getGSeqName());
+ break;
+ case ctfGFF_ID:
+ if (!inFasta)
+ fprintf(f,"%s",g.getID());
+ break;
+ case ctfGFF_geneID:
+ fprintf(f,"%s",g.getGeneID()!=NULL ? g.getGeneID() : ".");
+ break;
+ case ctfGFF_geneName:
+ fprintf(f,"%s",g.getGeneName()!=NULL ? g.getGeneName() : ".");
+ break;
+ case ctfGFF_Parent:
+ fprintf(f,"%s",g.parent!=NULL ? g.parent->getID() : ".");
+ break;
+ case ctfGFF_feature:
+ fprintf(f,"%s",g.getFeatureName());
+ break;
+ case ctfGFF_start:
+ fprintf(f,"%d",g.start);
+ break;
+ case ctfGFF_end:
+ fprintf(f,"%d",g.end);
+ break;
+ case ctfGFF_strand:
+ fprintf(f,"%c",g.strand);
+ break;
+ case ctfGFF_numexons:
+ fprintf(f,"%d",g.exons.Count());
+ break;
+ case ctfGFF_exons:
+ if (g.exons.Count()>0) {
+ for (int x=0;x<g.exons.Count();x++) {
+ if (x>0) fprintf(f,",");
+ fprintf(f,"%d-%d",g.exons[x]->start, g.exons[x]->end);
+ }
+ } else fprintf(f,".");
+ break;
+ case ctfGFF_cds:
+ if (g.hasCDS()) {
+ GVec<GffExon> cds;
+ g.getCDSegs(cds);
+ for (int x=0;x<cds.Count();x++) {
+ if (x>0) fprintf(f,",");
+ fprintf(f,"%d-%d",cds[x].start, cds[x].end);
+ }
+ }
+ else fprintf(f,".");
+ break;
+ case ctfGFF_covlen:
+ fprintf(f, "%d", g.covlen);
+ break;
+ case ctfGFF_cdslen:
+ if (g.hasCDS()) {
+ GVec<GffExon> cds;
+ g.getCDSegs(cds);
+ int clen=0;
+ for (int x=0;x<cds.Count();x++)
+ clen+=cds[x].end-cds[x].start+1;
+ fprintf(f, "%d", clen);
+ }
+ else fprintf(f, "0");
+ break;
+ } //switch
+ }
+ fprintf(f,"\n");
+}
+bool GffLoader::validateGffRec(GffObj* gffrec) {
+ if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
+ GStr refname(gffrec->getRefName());
+ RefTran* rt=reftbl.Find(refname.chars());
+ if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
+ //try removing the version suffix
+ refname.cut(-2);
+ //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
+ rt=reftbl.Find(refname.chars());
+ }
+ if (rt) {
+ gffrec->setRefName(rt->new_name);
+ }
+ /* //no, do not discard non-matching entries, let them pass through!
+ else {
+ if (verbose)
+ GMessage("Info: %s discarded due to reference %s not being mapped\n",
+ gffrec->getID(), refname.chars());
+ return false; //discard, ref seq not in the given translation table
+ }*/
+ }
+ if (transcriptsOnly && gffrec->isDiscarded()) {
+ //discard generic "locus" features with no other detailed subfeatures
+ //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
+ return false;
+ }
+ if (minLen>0 && gffrec->covlen<minLen) {
+ if (verbose)
+ GMessage("Info: %s discarded due to minimum length threshold %d\n",
+ gffrec->getID(), minLen);
+ return false;
+ }
+ if (rfltGSeq!=NULL) { //filter by gseqName
+ if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
+ return false;
+ }
+ }
+ if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
+ return false;
+ }
+ //check coordinates
+ if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
+ if (rfltWithin) {
+ if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
+ return false; //not within query range
+ }
+ }
+ else {
+ if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
+ return false;
+ }
+ }
+ }
+ if (multiExon && gffrec->exons.Count()<=1) {
+ return false;
+ }
+ if (wCDSonly && gffrec->CDstart==0) {
+ return false;
+ }
+ if (wNConly && gffrec->hasCDS()) return false;
+ return true;
+}
+
+bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
+ if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript
+ //returns true if the transcript passed the filter
+ char* gname=gffrec.getGeneName();
+ if (gname==NULL) gname=gffrec.getGeneID();
+ if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
+ const char* biotype=gffrec.getAttr("gene_biotype");
+ if (biotype) {
+ gffrec.addAttr("type", biotype);
+ gffrec.removeAttr("gene_biotype");
+ }
+ else { //old Ensembl files lacking gene_biotype
+ gffrec.addAttr("type", gffrec.getTrackName());
+ }
+
+ //bool is_gene=false;
+ bool is_pseudo=false;
+ if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
+ gffrec.setFeatureName("mRNA");
+ else {
+ if (strcmp(biotype, "processed_transcript")==0)
+ gffrec.setFeatureName("proc_RNA");
+ else {
+ //is_gene=endsWith(biotype, "gene");
+ is_pseudo=strifind(biotype, "pseudo");
+ if (is_pseudo) {
+ gffrec.setFeatureName("pseudo_RNA");
+ }
+ else if (endsWith(biotype, "RNA")) {
+ gffrec.setFeatureName(biotype);
+ } else gffrec.setFeatureName("misc_RNA");
+ }
+ }
+ }
+ if (gname && strcmp(gname, gffrec.getID())!=0) {
+ int* isonum=isoCounter.Find(gname);
+ if (isonum==NULL) {
+ isonum=new int(1);
+ isoCounter.Add(gname,isonum);
+ }
+ else (*isonum)++;
+ //defline.appendfmt(" gene=%s", gname);
+ }
+ int seqlen=0;
+
+ const char* tlabel=tracklabel;
+ if (tlabel==NULL) tlabel=gffrec.getTrackName();
+ //defline.appendfmt(" track:%s",tlabel);
+ char* cdsnt = NULL;
+ char* cdsaa = NULL;
+ int aalen=0;
+ for (int i=1;i<gffrec.exons.Count();i++) {
+ int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
+ if (verbose && ilen>4000000)
+ GMessage("Warning: very large intron (%d) for transcript %s\n",
+ ilen, gffrec.getID());
+ if (ilen>maxintron) {
+ return false;
+ }
+ }
+ GMapSegments seglst(gffrec.strand);
+ GFaSeqGet* faseq=NULL;
+ if (f_x!=NULL || f_y!=NULL || f_w!=NULL || spliceCheck || validCDSonly || addCDSattrs) {
+ faseq=fastaSeqGet(gfasta, gffrec.getGSeqName());
+ if (faseq==NULL)
+ GError("Error: no genomic sequence available (check -g option!).\n");
+ }
+ if (spliceCheck && gffrec.exons.Count()>1) {
+ //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
+ int glen=gffrec.end-gffrec.start+1;
+ const char* gseq=faseq->subseq(gffrec.start, glen);
+ if (gseq==NULL) {
+ GMessage("Error at GFF ID %s : could not retrieve subsequence %s:%d-%d !\n",
+ gffrec.getID(), gffrec.getRefName(), gffrec.start, gffrec.end);
+ return false;
+ }
+ bool revcompl=(gffrec.strand=='-');
+ bool ssValid=true;
+ for (int e=1;e<gffrec.exons.Count();e++) {
+ const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
+ int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
+ GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
+ GSpliceSite donorSite(intron,intronlen, false, revcompl);
+ //GMessage("%c intron %d-%d : %s .. %s\n",
+ // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
+ if (acceptorSite=="AG") { // GT-AG or GC-AG
+ if (!donorSite.canonicalDonor()) {
+ ssValid=false;break;
+ }
+ }
+ else if (acceptorSite=="AC") { //AT-AC also accepted
+ if (donorSite!="AT") { ssValid=false; break; }
+ }
+ else { ssValid=false; break; }
+ }
+ if (!ssValid) {
+ if (verbose)
+ GMessage("Unrecognized splice sites found for '%s'\n",gffrec.getID());
+ return false; //don't print this one!
+ }
+ }
+ bool trprint=true;
+ bool inframeStop=false;
+ //int stopCodonAdjust=0;
+ int mCDphase=0;
+ bool fullCDS=false;
+ bool endStop=false;
+ bool stopAdjusted=false;
+ if (add_hasCDS && gffrec.hasCDS()) gffrec.addAttr("hasCDS", "true");
+ if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
+ mCDphase = gffrec.CDphase-'0';
+ //CDS partialness only added when -y -x -V options are given
+ if (gffrec.hasCDS() && (f_y!=NULL || f_x!=NULL || validCDSonly || addCDSattrs)) {
+ int strandNum=0;
+ int phaseNum=0;
+ CDS_CHECK:
+ uint cds_olen=0;
+ cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
+ //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
+ if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
+ cdsaa=translateDNA(cdsnt, aalen, seqlen);
+ char* p=strchr(cdsaa,'.');
+ int cds_aalen=aalen;
+ if (adjustStop)
+ cds_aalen=cds_olen/3; //originally stated CDS length
+ endStop=false;
+ if (p!=NULL) { //stop codon found
+ if (p-cdsaa==cds_aalen-1) { //stop found as the stated last CDS codon
+ *p='\0';//remove it
+ endStop=true;
+ if (adjustStop) {
+ seqlen=cds_aalen*3;
+ aalen=cds_aalen;
+ }
+ cds_aalen--;
+ aalen--;
+ //no need to adjust stop codon
+ }
+ else {//stop found in a different position than the last codon
+ if (p-cdsaa<cds_aalen-1 && !adjustStop) {
+ inframeStop=true;
+ }
+ if (adjustStop) {
+ *p='\0';
+ cds_aalen=p-cdsaa+1; //adjusted CDS length
+ seqlen=cds_aalen*3;
+ aalen=cds_aalen;
+ uint gc=seglst.gmap(seqlen);
+ if (gffrec.strand=='-') gffrec.CDstart=gc;
+ else gffrec.CDend=gc;
+ endStop=true;
+ stopAdjusted=true;
+ }
+ }
+ }//stop codon found
+ //if (trprint==false) { //failed CDS validity check
+ if (inframeStop) {
+ //in-frame stop codon found
+ if (altPhases && phaseNum<3) {
+ phaseNum++; //try a different phase
+ gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
+ GFREE(cdsaa);
+ goto CDS_CHECK;
+ }
+ if (gffrec.exons.Count()==1 && bothStrands) {
+ strandNum++;
+ phaseNum=0;
+ if (strandNum<2) {
+ GFREE(cdsaa);
+ gffrec.strand = (gffrec.strand=='-') ? '+':'-';
+ goto CDS_CHECK; //repeat the CDS check for a different frame
+ }
+ }
+ if (verbose) GMessage("Warning: In-frame STOP found for '%s'\n",gffrec.getID());
+ if (addCDSattrs) gffrec.addAttr("InFrameStop", "true");
+ } //has in-frame STOP
+ if (stopAdjusted) {
+ if (addCDSattrs) gffrec.addAttr("CDStopAdjusted", "true");
+ inframeStop=false; //pretend it's OK now that we've adjusted it
+ }
+ if (!inframeStop) {
+ bool hasStart=(cdsaa[0]=='M'); //for the regular eukaryotic translation table
+ fullCDS=(endStop && hasStart);
+ if (!fullCDS) {
+ const char* partialness=NULL;
+ if (hasStart) partialness="3";
+ else {
+ partialness = endStop ? "5" : "5_3";
+ }
+ if (addCDSattrs) gffrec.addAttr("partialness", partialness);
+ }
+ }
+ if (trprint && ((fullCDSonly && !fullCDS) || (validCDSonly && inframeStop)) )
+ trprint=false;
+ //} // Valid CDS only requested?
+ } //has CDS
+ } //translation or codon check was requested
+ if (!trprint) {
+ GFREE(cdsnt);
+ GFREE(cdsaa);
+ //if (adjstop!=NULL) delete adjstop;
+ return false;
+ }
+ /*
+ if (validCDSonly) {
+ int stopCodonAdjust=adjstop->restore();
+ if (stopCodonAdjust!=0 && !endStop) {
+ //restore stop codon location
+ //adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
+ if (seglst.Count()>0) seglst.Last()->end-=stopCodonAdjust;
+ if (cdsnt!=NULL && seqlen>0) {
+ seqlen-=stopCodonAdjust;
+ cdsnt[seqlen]=0;
+ }
+ if (cdsaa!=NULL) aalen--;
+ }
+ }
+ if (adjstop!=NULL) delete adjstop;
+ */
+ if (cdsnt!=NULL) { // && !inframeStop) {
+ if (f_y!=NULL) { //CDS translation fasta output requested
+ if (cdsaa==NULL) { //translate now if not done before
+ cdsaa=translateDNA(cdsnt, aalen, seqlen);
+ }
+ if (aalen>0) {
+ if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon
+ fprintf(f_y, ">%s", gffrec.getID());
+ if (fmtTable) printTableData(f_y, gffrec, true);
+ else fprintf(f_y, "\n");
+ printFasta(f_y, NULL, cdsaa, aalen, StarStop);
+ }
+ }
+ if (f_x!=NULL) { //CDS only
+ GStr defline(gffrec.getID(), 94);
+ if (writeExonSegs) {
+ defline.append(" loc:");
+ defline.append(gffrec.getGSeqName());
+ defline.appendfmt("(%c)",gffrec.strand);
+ //warning: not CDS coordinates are written here, but the exon ones
+ defline+=(int)gffrec.start;
+ defline+=(char)'-';
+ defline+=(int)gffrec.end;
+ // -- here these are CDS substring coordinates on the spliced sequence:
+ defline.append(" segs:");
+ for (int i=0;i<seglst.Count();i++) {
+ if (i>0) defline.append(",");
+ defline+=(int)seglst[i].start;
+ defline.append("-");
+ defline+=(int)seglst[i].end;
+ }
+ }
+ fprintf(f_x, ">%s", defline.chars());
+ if (fmtTable) printTableData(f_x, gffrec, true);
+ else fprintf(f_x, "\n");
+ printFasta(f_x, NULL, cdsnt, seqlen);
+ }
+ GFREE(cdsnt);
+ GFREE(cdsaa);
+ } //writing CDS or its translation
+ if (f_w!=NULL) { //write spliced exons
+ uint cds_start=0;
+ uint cds_end=0;
+ seglst.Clear();
+ int padLeft=0;
+ int padRight=0;
+ if (wPadding>0) {
+ padLeft= (gffrec.start>(uint)wPadding) ? wPadding : gffrec.start - 1;
+ int ediff=faseq->getseqlen()-gffrec.end;
+ padRight=(wPadding>ediff) ? ediff : wPadding;
+ gffrec.addPadding(padLeft, padRight);
+ }
+ char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
+ //restore exons to normal (remove padding)
+ if (wPadding>0)
+ gffrec.removePadding(padLeft, padRight);
+
+ GStr defline(gffrec.getID());
+ if (exont!=NULL) {
+ if (gffrec.CDstart>0) {
+ defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
+ }
+ if (writeExonSegs) {
+ defline.append(" loc:");
+ defline.append(gffrec.getGSeqName());
+ defline+=(char)'|';
+ defline+=(int)gffrec.start;
+ defline+=(char)'-';
+ defline+=(int)gffrec.end;
+ defline+=(char)'|';
+ defline+=(char)gffrec.strand;
+ defline.append(" exons:");
+ for (int i=0;i<gffrec.exons.Count();i++) {
+ if (i>0) defline.append(",");
+ defline+=(int)gffrec.exons[i]->start;
+ defline.append("-");
+ defline+=(int)gffrec.exons[i]->end;
+ }
+ if (wPadding>0) {
+ defline.append(" padding:");
+ defline.append(padLeft);
+ defline+=(char)'|';
+ defline.append(padRight);
+ }
+
+ defline.append(" segs:");
+ for (int i=0;i<seglst.Count();i++) {
+ if (i>0) defline.append(",");
+ defline+=(int)seglst[i].start;
+ defline.append("-");
+ defline+=(int)seglst[i].end;
+ }
+ }
+
+ fprintf(f_w, ">%s", defline.chars());
+ if (fmtTable) printTableData(f_w, gffrec, true);
+ else fprintf(f_w, "\n");
+ printFasta(f_w, NULL, exont, seqlen);
+ GFREE(exont);
+ }
+ } //writing f_w (spliced exons)
+ return true;
+}
+
+
+GTData::GTData(GffObj* t, GenomicSeqData* gd):rna(t),gdata(gd), locus(NULL), replaced_by(NULL), geneinfo(NULL) {
+ if (rna!=NULL) {
+ //geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
+ rna->uptr=this;
+ }
+ if (gdata!=NULL)
+ gdata->tdata.Add(this);
+}
+
+
+
bool GffLoader::unsplContained(GffObj& ti, GffObj& tj) {
//returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
//but it does not cross any intron-exon boundary of tj
@@ -375,8 +991,7 @@ bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
g.children.Add(t);
keep=true;
if (tdata==NULL) {
- tdata=new GTData(t); //additional transcript data
- gdata->tdata.Add(tdata);
+ tdata=new GTData(t, gdata); //additional transcript data
}
t->parent=&g;
//disable printing of gene if transcriptsOnly and --keep-genes wasn't given
@@ -400,8 +1015,8 @@ bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
if (t->exons.Count()>0) { //treating this entry as a transcript
gdata->rnas.Add(t); //added it in sorted order
if (tdata==NULL) {
- tdata=new GTData(t); //additional transcript data
- gdata->tdata.Add(tdata);
+ tdata=new GTData(t, gdata); //additional transcript data
+ //gdata->tdata.Add(tdata);
}
keep=true;
}
@@ -409,10 +1024,9 @@ bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
if (t->isGene() || !this->transcriptsOnly) {
gdata->gfs.Add(t);
keep=true;
- //GTData* tdata=new GTData(t); //additional transcript data
if (tdata==NULL) {
- tdata=new GTData(t); //additional transcript data
- gdata->tdata.Add(tdata);
+ tdata=new GTData(t, gdata); //additional transcript data
+ //gdata->tdata.Add(tdata);
}
noexon_gfs=true; //gene-like record, no exons defined
keep=true;
@@ -420,6 +1034,27 @@ bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
return false; //nothing to do with these non-transcript objects
}
}
+ //keeping track of genes in special cases
+ char* geneid=t->getGeneID();
+ bool trackGenes=!t->isGene() && ( (keepGenes && t->parent==NULL) ||
+ (ensembl_convert && startsWith(t->getID(), "ENS") ) ) ;
+ if (trackGenes) {
+ GTData* tdata=(GTData*)(t->uptr);
+ //keep track of chr|gene_id data and coordinate range
+ if (geneid!=NULL) {
+ GeneInfo* ginfo=gene_ids.Find(geneid);
+ if (ginfo==NULL) {//first time seeing this gene ID
+ GeneInfo* geneinfo=new GeneInfo(t, tdata->gdata, ensembl_convert);
+ gene_ids.Add(geneid, geneinfo);
+ //if (gfnew!=NULL) //new gene features
+ // gfnew->Add(geneinfo->gf);
+ }
+ else ginfo->update(t);
+ }
+ }
+
+
+
if (!doCluster) return keep;
if (!keep) return false;
@@ -677,7 +1312,7 @@ void warnPseudo(GffObj& m) {
GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID());
}
-void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, GFFCommentParser* gf_parsecomment) {
+void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsecomment) {
if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n");
GffReader* gffr=new GffReader(f, this->transcriptsOnly, true); //not only mRNA features, sorted
clearHeaderLines();
@@ -692,6 +1327,30 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
gffr->setIgnoreLocus(ignoreLocus);
gffr->setRefAlphaSorted(this->sortRefsAlpha);
if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment);
+ int outcounter=0;
+ if (streamIn) { //this will ignore any clustering options
+ GffObj* t=NULL;
+ while ((t=gffr->readNext())!=NULL) {
+ if (!validateGffRec(t)) {
+ delete t;
+ continue;
+ }
+ if (process_transcript(gfasta, *t)) {
+ outcounter++;
+ if (f_out) {
+ if (fmtTable)
+ printTableData(f_out, *t);
+ else //GFF3, GTF, BED, TLF
+ t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
+ }
+ }
+ delete t;
+ }
+ delete gffr;
+ return;
+ }
+
+
gffr->readAll();
GVec<int> pseudoFeatureIds; //feature type: pseudo*
GVec<int> pseudoAttrIds; // attribute: [is]pseudo*=true/yes/1
@@ -720,9 +1379,7 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
}
}
- //int redundant=0; //redundant annotation discarded
if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
- //int rna_deleted=0;
//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
for (int k=0;k<gffr->gflst.Count();k++) {
GffObj* m=gffr->gflst[k];
@@ -781,7 +1438,7 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate
m->subftype_id=gff_fid_exon;
}
//GList<GffObj> gfadd(false,false); -- for gf_validate()?
- if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) {
+ if (!validateGffRec(m)) {
continue;
}
m->isUsed(true); //so the gffreader won't destroy it
=====================================
gff_utils.h
=====================================
@@ -2,12 +2,64 @@
#define GFF_UTILS_H
#include "gff.h"
#include "GStr.h"
+#include "GVec.hh"
#include "GFaSeqGet.h"
extern bool verbose;
extern bool debugMode;
-
-typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
+extern bool ensembl_convert;
+
+extern FILE* ffasta;
+extern FILE* f_in;
+extern FILE* f_out;
+extern FILE* f_w; //writing fasta with spliced exons (transcripts)
+extern int wPadding; //padding for -w option
+extern FILE* f_x; //writing fasta with spliced CDS
+extern FILE* f_y; //wrting fasta with translated CDS
+
+extern int maxintron;
+
+extern bool wCDSonly;
+extern bool wNConly;
+extern int minLen; //minimum transcript length
+extern bool validCDSonly; // translation with no in-frame STOP
+extern bool bothStrands; //for single-exon mRNA validation, check the other strand too
+extern bool altPhases; //if original phase fails translation validation,
+ //try the other 2 phases until one makes it
+extern bool addCDSattrs;
+extern bool add_hasCDS;
+
+extern bool adjustStop; //automatic adjust the CDS stop coordinate
+extern bool covInfo; // --cov-info : only report genome coverage
+extern GStr tableFormat; //list of "attributes" to print in tab delimited format
+extern bool spliceCheck; //only known splice-sites
+extern bool decodeChars; //decode url-encoded chars in attrs (-D)
+extern bool StarStop; //use * instead of . for stop codon translation
+extern bool fullCDSonly; // starts with START, ends with STOP codon
+
+extern bool multiExon;
+extern bool writeExonSegs;
+extern char* tracklabel;
+extern char* rfltGSeq;
+extern char rfltStrand;
+extern uint rfltStart;
+extern uint rfltEnd;
+extern bool rfltWithin; //check for full containment within given range
+extern bool addDescr;
+
+
+extern bool fmtGFF3; //output: GFF3
+//other formats only make sense in transcriptOnly mode
+extern bool fmtGTF;
+extern bool fmtBED;
+extern bool fmtTLF;
+extern bool fmtTable;
+
+
+extern GffPrintMode exonPrinting;
+
+//typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
+typedef bool GFValidateFunc(GffObj* gf);
//test if a transcript should be printed (and not printed yet)
#define T_PRINTABLE(d) (((d) & 0x100)==0)
@@ -26,7 +78,90 @@ typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
//keep/set original/old strand
#define T_SET_OSTRAND(d, s) d |= s
-class GeneInfo { //for Ensembl GTF conversion
+class SeqInfo { //populated from the -s option of gffread
+ public:
+ int len;
+ char* descr;
+ SeqInfo( int l, char* s): len(l), descr(NULL) {
+ if (s!=NULL)
+ descr=Gstrdup(s);
+ }
+ ~SeqInfo() {
+ GFREE(descr);
+ }
+};
+
+class RefTran {
+ public:
+ char* new_name;
+ RefTran(char *ns) {
+ new_name=NULL;
+ if (ns!=NULL)
+ new_name=Gstrdup(ns);
+ }
+ ~RefTran() {
+ GFREE(new_name);
+ }
+};
+
+extern GFastaDb gfasta;
+extern GHash<SeqInfo> seqinfo;
+extern GHash<int> isoCounter; //counts the valid isoforms
+extern GHash<RefTran> reftbl;
+
+char* getSeqDescr(char* seqid);
+char* getSeqName(char* seqid);
+int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL);
+void printTableData(FILE* f, GffObj& g, bool inFasta=false);
+bool validateGffRec(GffObj* gffrec);
+bool process_transcript(GFastaDb& gfasta, GffObj& gffrec);
+
+enum ETableFieldType {
+ ctfGFF_Attr=0, // attribute name as is
+ ctfGFF_ID, //ID or @id or transcript_id
+ ctfGFF_geneID, //geneID or @gene_id or @geneid
+ ctfGFF_geneName, //geneName or @gene_name or @genename
+ ctfGFF_Parent, //Parent or @parent
+ ctfGFF_chr, //@chr
+ ctfGFF_feature, //@feature
+ ctfGFF_start, //@start
+ ctfGFF_end, //@end
+ ctfGFF_strand, //@strand
+ ctfGFF_numexons, //@numexons
+ ctfGFF_exons, //@exons
+ ctfGFF_cds, //@cds
+ ctfGFF_covlen, //@covlen
+ ctfGFF_cdslen//@cdslen
+};
+
+class CTableField {
+ public:
+ ETableFieldType type;
+ GStr name; //only for type ctfGFF_Attr
+ CTableField(ETableFieldType atype=ctfGFF_Attr):type(atype) { }
+ CTableField(GStr& attrname):type(ctfGFF_Attr),name(attrname) { }
+};
+
+
+extern GVec<CTableField> tableCols; //table output format fields
+
+class GffLocus;
+class GenomicSeqData;
+class GeneInfo;
+
+class GTData { // transcript associated data
+ public:
+ GffObj* rna;
+ GenomicSeqData* gdata;
+ GffLocus* locus;
+ GffObj* replaced_by;
+ GeneInfo* geneinfo;
+ GTData(GffObj* t=NULL, GenomicSeqData* gd=NULL);
+ bool operator<(GTData& b) { return (rna < b.rna); }
+ bool operator==(GTData& b) { return (rna==b.rna); }
+};
+
+class GeneInfo {
public:
int flag;
GffObj* gf;
@@ -36,17 +171,23 @@ class GeneInfo { //for Ensembl GTF conversion
gf=NULL;
flag=0;
}
- GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
+
+ GeneInfo(GffObj* gfrec, GenomicSeqData* gdata, bool ensembl_convert=false):flag(0), gf(NULL), gene_names(true, true, true),
transcripts(true,true,true) {
- flag=0;
if (gfrec->getGeneName())
gene_names.Add(new GStr(gfrec->getGeneName()));
transcripts.Add(new GStr(gfrec->getID()));
- create_gf(gfrec, ensembl_convert);
- }
+ create_gf(gfrec, gdata ,ensembl_convert);
+ }
+
+ ~GeneInfo() {
+ delete gf;
+ }
- void create_gf(GffObj* gfrec, bool ensembl_convert) {
+ void create_gf(GffObj* gfrec, GenomicSeqData* gdata, bool ensembl_convert) {
gf=new GffObj(gfrec->getGeneID());
+ GTData* gfdata=new GTData(gf, gdata);
+ gfdata->geneinfo=this;
gf->gseq_id=gfrec->gseq_id;
gf->track_id=gfrec->track_id;
gf->start=gfrec->start;
@@ -55,28 +196,30 @@ class GeneInfo { //for Ensembl GTF conversion
gf->setFeatureName("gene");
gf->isGene(true);
gf->isUsed(true);
- gf->uptr=this;
+ //gf->uptr=gfdata; //for these new gene objects
gfrec->incLevel();
gfrec->parent=gf;
gf->children.Add(gfrec);
+ const char* s=NULL;
+ if ((s=gfrec->getGeneName())) {
+ gf->addAttr("Name", s);
+ gf->copyAttrs(gfrec);
+ }
if (ensembl_convert) {
//gf->addAttr("type", gf->getTrackName());
const char* biotype=gfrec->getAttr("type");
if (biotype) gf->addAttr("type", biotype);
}
- //gf->children.Add(gfrec);
- }
- //~GeneInfo() {
- // }
+ // gf->children.Add(gfrec);
+ }
+
void update(GffObj* gfrec) {
if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
return;
gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
if (gf==NULL) {
GError("GeneInfo::update() called on uninitialized gf!\n");
- //create_gf(gfrec);
- //return;
- }
+ }
gfrec->parent=gf;
gf->children.Add(gfrec);
gfrec->incLevel();
@@ -85,53 +228,49 @@ class GeneInfo { //for Ensembl GTF conversion
if (gf->end<gfrec->end)
gf->end=gfrec->end;
}
- void finalize() {
+
+ void finalize() {
//prepare attributes for printing
//must be called right before printing
if (gf==NULL || transcripts.Count()==0) return;
if (gene_names.Count()>0) {
gf->addAttr("Name", gene_names[0]->chars());
- /*
- GStr s(gene_names[0]->chars());
- for (int i=1;i<gene_names.Count();i++) {
- s.append(",");
- s.append(gene_names[i]->chars());
- }
- gf->addAttr("genes", s.chars());
- */
- } //has gene names
- GStr t(transcripts[0]->chars());
- for (int i=1;i<transcripts.Count();i++) {
+ } //has gene names
+ GStr t(transcripts[0]->chars());
+ for (int i=1;i<transcripts.Count();i++) {
t.append(",");
t.append(transcripts[i]->chars());
- }
- gf->addAttr("transcripts", t.chars());
}
+ gf->addAttr("transcripts", t.chars());
+ }
};
-
-
-class GffLocus;
-
-class GTData { //transcript associated data
+class GenomicSeqData {
+ int gseq_id;
public:
- GffObj* rna;
- GffLocus* locus;
- GffObj* replaced_by;
- GeneInfo* geneinfo;
- //int flag;
- GTData(GffObj* t=NULL) {
- rna=t;
- //flag=0;
- locus=NULL;
- replaced_by=NULL;
- geneinfo=NULL;
- if (rna!=NULL) {
- geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
- rna->uptr=this;
- }
- }
- bool operator<(GTData& b) { return (rna < b.rna); }
- bool operator==(GTData& b) { return (rna==b.rna); }
+ const char* gseq_name;
+ int seqreg_start; //if given by ##sequence-region comment
+ int seqreg_end;
+ GList<GffObj> gfs; //all non-transcript features -> usually gene features
+ GList<GffObj> rnas; //all transcripts on this genomic sequence
+ GList<GffLocus> loci; //all loci clusters
+ GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
+ uint64 f_bases;//base coverage on forward strand
+ uint64 r_bases;//base coverage on reverse strand
+ uint64 u_bases;//base coverage on undetermined strand
+ //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
+ GenomicSeqData(int gid=-1):gseq_id(gid), gseq_name(NULL), seqreg_start(0), seqreg_end(0),
+ gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
+ tdata(false,true,false), f_bases(0), r_bases(0), u_bases(0) {
+ if (gseq_id>=0)
+ gseq_name=GffObj::names->gseqs.getName(gseq_id);
+ }
+
+ bool operator==(GenomicSeqData& d){
+ return gseq_id==d.gseq_id;
+ }
+ bool operator<(GenomicSeqData& d){
+ return (gseq_id<d.gseq_id);
+ }
};
class CGeneSym {
@@ -450,34 +589,6 @@ public:
}
};
-class GenomicSeqData {
- int gseq_id;
- public:
- const char* gseq_name;
- int seqreg_start; //if given by ##sequence-region comment
- int seqreg_end;
- GList<GffObj> gfs; //all non-transcript features -> usually gene features
- GList<GffObj> rnas; //all transcripts on this genomic sequence
- GList<GffLocus> loci; //all loci clusters
- GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
- uint64 f_bases;//base coverage on forward strand
- uint64 r_bases;//base coverage on reverse strand
- uint64 u_bases;//base coverage on undetermined strand
- //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
- GenomicSeqData(int gid=-1):gseq_id(gid), gseq_name(NULL), seqreg_start(0), seqreg_end(0),
- gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
- tdata(false,true,false), f_bases(0), r_bases(0), u_bases(0) {
- if (gseq_id>=0)
- gseq_name=GffObj::names->gseqs.getName(gseq_id);
- }
-
- bool operator==(GenomicSeqData& d){
- return gseq_id==d.gseq_id;
- }
- bool operator<(GenomicSeqData& d){
- return (gseq_id<d.gseq_id);
- }
-};
int gseqCmpName(const pointer p1, const pointer p2);
@@ -574,6 +685,7 @@ class GffLoader {
bool fuzzSpan:1; //matching/contained redundancy relaxed to disregard full boundary containment
bool dOvlSET:1; //discard overlapping Single Exon Transcripts on any strand
bool forceExons:1;
+ bool streamIn:1;
};
};
@@ -599,7 +711,9 @@ class GffLoader {
}
}
- void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL, GFFCommentParser* gf_parsecomment=NULL);
+ bool validateGffRec(GffObj* gffrec);
+
+ void load(GList<GenomicSeqData>&seqdata, GFFCommentParser* gf_parsecomment=NULL);
bool placeGf(GffObj* t, GenomicSeqData* gdata);
=====================================
gffread.cpp
=====================================
@@ -4,13 +4,13 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
-#define VERSION "0.11.8"
+#define VERSION "0.12.1"
#define USAGE "gffread v" VERSION ". Usage:\n\
gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
[-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
[-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
- [-i <maxintron>] [--bed] [--table <attrlist>] [--sort-by <refseq_list.txt>]\n\
+ [-i <maxintron>] [--stream] [--bed] [--table <attrlist>] [--sort-by <ref.lst>]\n\
\n\
Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n\
transcripts (exon or CDS) and more.\n\
@@ -74,6 +74,8 @@ Misc options: \n\
--in-tlf: input GFF-like one-line-per-transcript format without exon/CDS\n\
features (see --tlf option below); automatic if the input\n\
filename ends with .tlf)\n\
+ --stream: fast processing of input GFF/BED transcripts as they are received\n\
+ ((no sorting, exons must be grouped by transcript in the input data)\n\
Clustering:\n\
-M/--merge : cluster the input transcripts into loci, discarding\n\
\"duplicated\" transcripts (those with the same exact introns\n\
@@ -134,58 +136,7 @@ Output options:\n\
problems with the given GFF/GTF records\n\
"
-class SeqInfo { //populated from the -s option of gffread
- public:
- int len;
- char* descr;
- SeqInfo( int l, char* s): len(l), descr(NULL) {
- if (s!=NULL)
- descr=Gstrdup(s);
- }
- ~SeqInfo() {
- GFREE(descr);
- }
-};
-
-class RefTran {
- public:
- char* new_name;
- RefTran(char *ns) {
- new_name=NULL;
- if (ns!=NULL)
- new_name=Gstrdup(ns);
- }
- ~RefTran() {
- GFREE(new_name);
- }
-};
-
-enum ETableFieldType {
- ctfGFF_Attr=0, // attribute name as is
- ctfGFF_ID, //ID or @id or transcript_id
- ctfGFF_geneID, //geneID or @gene_id or @geneid
- ctfGFF_geneName, //geneName or @gene_name or @genename
- ctfGFF_Parent, //Parent or @parent
- ctfGFF_chr, //@chr
- ctfGFF_feature, //@feature
- ctfGFF_start, //@start
- ctfGFF_end, //@end
- ctfGFF_strand, //@strand
- ctfGFF_numexons, //@numexons
- ctfGFF_exons, //@exons
- ctfGFF_cds, //@cds
- ctfGFF_covlen, //@covlen
- ctfGFF_cdslen//@cdslen
-};
-
-class CTableField {
- public:
- ETableFieldType type;
- GStr name; //only for type ctfGFF_Attr
- CTableField(ETableFieldType atype=ctfGFF_Attr):type(atype) { }
- CTableField(GStr& attrname):type(ctfGFF_Attr),name(attrname) { }
-};
-
+/*
FILE* ffasta=NULL;
FILE* f_in=NULL;
FILE* f_out=NULL;
@@ -203,65 +154,51 @@ bool altPhases=false; //if original phase fails translation validation,
//try the other 2 phases until one makes it
bool addCDSattrs=false;
bool add_hasCDS=false;
+//bool streamIn=false; // --stream option
bool adjustStop=false; //automatic adjust the CDS stop coordinate
bool covInfo=false; // --cov-info : only report genome coverage
-//bool transcriptsOnly=true;
-//bool keepGenes=false; //for transcriptsOnly
-
-//bool sortAlpha=false;
-GStr sortBy; //file name with chromosomes listed in the desired order
-//bool keepRefOrder=false; //sort within chromosomes, but follow the input chromosome order -- default!
GStr tableFormat; //list of "attributes" to print in tab delimited format
-//bool NoPseudo=false;
bool spliceCheck=false; //only known splice-sites
bool decodeChars=false; //decode url-encoded chars in attrs (-D)
bool StarStop=false; //use * instead of . for stop codon translation
-
bool fullCDSonly=false; // starts with START, ends with STOP codon
-//bool fullattr=false; //-F
-//bool gatherExonAttrs=false; //-G
+*/
+
+GStr sortBy; //file name with chromosomes listed in the desired order
-//bool sortByLoc=false; // if the GFF output should be sorted by location
-bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
bool BEDinput=false;
bool TLFinput=false;
-bool fmtGFF3=true; //default output: GFF3
-//other formats only make sens in transcriptOnly mode
-bool fmtGTF=false;
-bool fmtBED=false;
-bool fmtTLF=false;
-bool fmtTable=false;
-bool addDescr=false;
-//bool protmap=false;
+//--- output options
+/*
+extern bool fmtGFF3=true; //default output: GFF3
+//other formats only make sense in transcriptOnly mode
+extern bool fmtGTF=false;
+extern bool fmtBED=false;
+extern bool fmtTLF=false;
+extern bool fmtTable=false;
+
+
bool multiExon=false;
bool writeExonSegs=false;
char* tracklabel=NULL;
-int maxintron=999000000;
-//bool mergeCloseExons=false;
-//range filter:
char* rfltGSeq=NULL;
char rfltStrand=0;
uint rfltStart=0;
uint rfltEnd=MAX_UINT;
bool rfltWithin=false; //check for full containment within given range
+bool addDescr=false;
-GffLoader gffloader;
-
-GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
-
-//hash with sequence info
-GHash<SeqInfo> seqinfo;
-GHash<int> isoCounter; //counts the valid isoforms
-GHash<RefTran> reftbl;
-GHash<GeneInfo> gene_ids;
- //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
+*/
-bool debugMode=false;
-//bool verbose=false;
+//bool protmap=false;
+//int maxintron=999000000;
+//bool mergeCloseExons=false;
+//range filter:
+GffLoader gffloader;
-GVec<CTableField> tableCols; //table output format fields
+GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
GLineReader fr(f);
@@ -365,484 +302,6 @@ void loadRefTable(FILE* f, GHash<RefTran>& rt) {
} //while lines
}
-char* getSeqDescr(char* seqid) {
- static char charbuf[128];
- if (seqinfo.Count()==0) return NULL;
- char* suf=rstrchr(seqid, '.');
- if (suf!=NULL) *suf=0;
- SeqInfo* seqd=seqinfo.Find(seqid);
- if (suf!=NULL) *suf='.';
- if (seqd!=NULL) {
- GStr s(seqd->descr);
- //cleanup some Uniref gunk
- if (s[0]=='[') {
- int r=s.index(']');
- if (r>=0 && r<8 && isdigit(s[1]))
- s.remove(0,r+1);
- }
- if (s.length()>80) {
- int r=s.index(';');
- if (r>5) s.cut(r);
- }
- if (s.length()>127) {
- s.cut(127);
- int r=s.rindex(' ');
- if (r>0) s.cut(r);
- }
- strcpy(charbuf, s.chars());
- return charbuf;
- }
- else return NULL;
-}
-
-char* getSeqName(char* seqid) {
- static char charbuf[128];
- char* suf=rstrchr(seqid, '.');
- if (suf!=NULL) *suf=0;
- strcpy(charbuf, seqid);
- if (suf!=NULL) *suf='.';
- return charbuf;
-}
-
-
-int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
- //adj>0, extend CDS to include a potential stop codon
- //when CDS is expanded, the terminal exon might have to be adjusted too
- int realadj=0;
- if (gffrec.strand=='-') {
- if ((int)gffrec.CDstart>adj) {
- gffrec.CDstart-=adj;
- realadj=adj;
- if (gffrec.exons.First()->start>gffrec.CDstart) {
- gffrec.covlen+=gffrec.exons.First()->start - gffrec.CDstart;
- gffrec.exons.First()->start=gffrec.CDstart;
- gffrec.start=gffrec.CDstart;
- }
- }
- }
- else { // forward strand
- //expand beyond
- realadj=adj;
- gffrec.CDend+=adj;
- if (adj<0) {//restore
- if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
- gffrec.exons.Last()->end+=adj;
- gffrec.end=gffrec.exons.Last()->end;
- gffrec.covlen+=adj;
- }
- }
- else if (gffrec.exons.Last()->end<gffrec.CDend) {
- gffrec.covlen+=gffrec.CDend-gffrec.exons.Last()->end;
- gffrec.exons.Last()->end=gffrec.CDend;
- gffrec.end=gffrec.CDend;
- }
- }
- if (seglst!=NULL) seglst->Last()->end+=realadj;
- return realadj;
- }
-
-void printTableData(FILE* f, GffObj& g, bool inFasta=false) {
- //using attribute list in tableCols
- char* av=NULL;
- for(int i=0;i<tableCols.Count();i++) {
- if (i>0 || inFasta) {
- if (!inFasta || tableCols[i].type!=ctfGFF_ID)
- fprintf(f,"\t");
- }
- switch(tableCols[i].type) {
- case ctfGFF_Attr:
- av=g.getAttr(tableCols[i].name.chars());
- fprintf(f,"%s",av!=NULL? av : ".");
- break;
- case ctfGFF_chr:
- fprintf(f,"%s",g.getGSeqName());
- break;
- case ctfGFF_ID:
- if (!inFasta)
- fprintf(f,"%s",g.getID());
- break;
- case ctfGFF_geneID:
- fprintf(f,"%s",g.getGeneID()!=NULL ? g.getGeneID() : ".");
- break;
- case ctfGFF_geneName:
- fprintf(f,"%s",g.getGeneName()!=NULL ? g.getGeneName() : ".");
- break;
- case ctfGFF_Parent:
- fprintf(f,"%s",g.parent!=NULL ? g.parent->getID() : ".");
- break;
- case ctfGFF_feature:
- fprintf(f,"%s",g.getFeatureName());
- break;
- case ctfGFF_start:
- fprintf(f,"%d",g.start);
- break;
- case ctfGFF_end:
- fprintf(f,"%d",g.end);
- break;
- case ctfGFF_strand:
- fprintf(f,"%c",g.strand);
- break;
- case ctfGFF_numexons:
- fprintf(f,"%d",g.exons.Count());
- break;
- case ctfGFF_exons:
- if (g.exons.Count()>0) {
- for (int x=0;x<g.exons.Count();x++) {
- if (x>0) fprintf(f,",");
- fprintf(f,"%d-%d",g.exons[x]->start, g.exons[x]->end);
- }
- } else fprintf(f,".");
- break;
- case ctfGFF_cds:
- if (g.hasCDS()) {
- GVec<GffExon> cds;
- g.getCDSegs(cds);
- for (int x=0;x<cds.Count();x++) {
- if (x>0) fprintf(f,",");
- fprintf(f,"%d-%d",cds[x].start, cds[x].end);
- }
- }
- else fprintf(f,".");
- break;
- case ctfGFF_covlen:
- fprintf(f, "%d", g.covlen);
- break;
- case ctfGFF_cdslen:
- if (g.hasCDS()) {
- GVec<GffExon> cds;
- g.getCDSegs(cds);
- int clen=0;
- for (int x=0;x<cds.Count();x++)
- clen+=cds[x].end-cds[x].start+1;
- fprintf(f, "%d", clen);
- }
- else fprintf(f, "0");
- break;
- } //switch
- }
- fprintf(f,"\n");
-}
-
-bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
- if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript
- //returns true if the transcript passed the filter
- char* gname=gffrec.getGeneName();
- if (gname==NULL) gname=gffrec.getGeneID();
- if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
- const char* biotype=gffrec.getAttr("gene_biotype");
- if (biotype) {
- gffrec.addAttr("type", biotype);
- gffrec.removeAttr("gene_biotype");
- }
- else { //old Ensembl files lacking gene_biotype
- gffrec.addAttr("type", gffrec.getTrackName());
- }
-
- //bool is_gene=false;
- bool is_pseudo=false;
- if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
- gffrec.setFeatureName("mRNA");
- else {
- if (strcmp(biotype, "processed_transcript")==0)
- gffrec.setFeatureName("proc_RNA");
- else {
- //is_gene=endsWith(biotype, "gene");
- is_pseudo=strifind(biotype, "pseudo");
- if (is_pseudo) {
- gffrec.setFeatureName("pseudo_RNA");
- }
- else if (endsWith(biotype, "RNA")) {
- gffrec.setFeatureName(biotype);
- } else gffrec.setFeatureName("misc_RNA");
- }
- }
- }
- if (gname && strcmp(gname, gffrec.getID())!=0) {
- int* isonum=isoCounter.Find(gname);
- if (isonum==NULL) {
- isonum=new int(1);
- isoCounter.Add(gname,isonum);
- }
- else (*isonum)++;
- //defline.appendfmt(" gene=%s", gname);
- }
- int seqlen=0;
-
- const char* tlabel=tracklabel;
- if (tlabel==NULL) tlabel=gffrec.getTrackName();
- //defline.appendfmt(" track:%s",tlabel);
- char* cdsnt = NULL;
- char* cdsaa = NULL;
- int aalen=0;
- for (int i=1;i<gffrec.exons.Count();i++) {
- int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
- if (verbose && ilen>4000000)
- GMessage("Warning: very large intron (%d) for transcript %s\n",
- ilen, gffrec.getID());
- if (ilen>maxintron) {
- return false;
- }
- }
- GMapSegments seglst(gffrec.strand);
- GFaSeqGet* faseq=NULL;
- if (f_x!=NULL || f_y!=NULL || f_w!=NULL || spliceCheck || validCDSonly || addCDSattrs) {
- faseq=fastaSeqGet(gfasta, gffrec.getGSeqName());
- if (faseq==NULL)
- GError("Error: no genomic sequence available (check -g option!).\n");
- }
- if (spliceCheck && gffrec.exons.Count()>1) {
- //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
- int glen=gffrec.end-gffrec.start+1;
- const char* gseq=faseq->subseq(gffrec.start, glen);
- bool revcompl=(gffrec.strand=='-');
- bool ssValid=true;
- for (int e=1;e<gffrec.exons.Count();e++) {
- const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
- int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
- GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
- GSpliceSite donorSite(intron,intronlen, false, revcompl);
- //GMessage("%c intron %d-%d : %s .. %s\n",
- // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
- if (acceptorSite=="AG") { // GT-AG or GC-AG
- if (!donorSite.canonicalDonor()) {
- ssValid=false;break;
- }
- }
- else if (acceptorSite=="AC") { //AT-AC also accepted
- if (donorSite!="AT") { ssValid=false; break; }
- }
- else { ssValid=false; break; }
- }
- if (!ssValid) {
- if (verbose)
- GMessage("Unrecognized splice sites found for '%s'\n",gffrec.getID());
- return false; //don't print this one!
- }
- }
- bool trprint=true;
- bool inframeStop=false;
- //int stopCodonAdjust=0;
- int mCDphase=0;
- bool fullCDS=false;
- bool endStop=false;
- bool stopAdjusted=false;
- if (add_hasCDS && gffrec.hasCDS()) gffrec.addAttr("hasCDS", "true");
- if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
- mCDphase = gffrec.CDphase-'0';
- //CDS partialness only added when -y -x -V options are given
- if (gffrec.hasCDS() && (f_y!=NULL || f_x!=NULL || validCDSonly || addCDSattrs)) {
- int strandNum=0;
- int phaseNum=0;
- CDS_CHECK:
- uint cds_olen=0;
- cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
- //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
- if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
- cdsaa=translateDNA(cdsnt, aalen, seqlen);
- char* p=strchr(cdsaa,'.');
- int cds_aalen=aalen;
- if (adjustStop)
- cds_aalen=cds_olen/3; //originally stated CDS length
- endStop=false;
- if (p!=NULL) { //stop codon found
- if (p-cdsaa==cds_aalen-1) { //stop found as the stated last CDS codon
- *p='\0';//remove it
- endStop=true;
- if (adjustStop) {
- seqlen=cds_aalen*3;
- aalen=cds_aalen;
- }
- cds_aalen--;
- aalen--;
- //no need to adjust stop codon
- }
- else {//stop found in a different position than the last codon
- if (p-cdsaa<cds_aalen-1 && !adjustStop) {
- inframeStop=true;
- }
- if (adjustStop) {
- *p='\0';
- cds_aalen=p-cdsaa+1; //adjusted CDS length
- seqlen=cds_aalen*3;
- aalen=cds_aalen;
- uint gc=seglst.gmap(seqlen);
- if (gffrec.strand=='-') gffrec.CDstart=gc;
- else gffrec.CDend=gc;
- endStop=true;
- stopAdjusted=true;
- }
- }
- }//stop codon found
- //if (trprint==false) { //failed CDS validity check
- if (inframeStop) {
- //in-frame stop codon found
- if (altPhases && phaseNum<3) {
- phaseNum++; //try a different phase
- gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
- GFREE(cdsaa);
- goto CDS_CHECK;
- }
- if (gffrec.exons.Count()==1 && bothStrands) {
- strandNum++;
- phaseNum=0;
- if (strandNum<2) {
- GFREE(cdsaa);
- gffrec.strand = (gffrec.strand=='-') ? '+':'-';
- goto CDS_CHECK; //repeat the CDS check for a different frame
- }
- }
- if (verbose) GMessage("Warning: In-frame STOP found for '%s'\n",gffrec.getID());
- if (addCDSattrs) gffrec.addAttr("InFrameStop", "true");
- } //has in-frame STOP
- if (stopAdjusted) {
- if (addCDSattrs) gffrec.addAttr("CDStopAdjusted", "true");
- inframeStop=false; //pretend it's OK now that we've adjusted it
- }
- if (!inframeStop) {
- bool hasStart=(cdsaa[0]=='M'); //for the regular eukaryotic translation table
- fullCDS=(endStop && hasStart);
- if (!fullCDS) {
- const char* partialness=NULL;
- if (hasStart) partialness="3";
- else {
- partialness = endStop ? "5" : "5_3";
- }
- if (addCDSattrs) gffrec.addAttr("partialness", partialness);
- }
- }
- if (trprint && ((fullCDSonly && !fullCDS) || (validCDSonly && inframeStop)) )
- trprint=false;
- //} // Valid CDS only requested?
- } //has CDS
- } //translation or codon check was requested
- if (!trprint) {
- GFREE(cdsnt);
- GFREE(cdsaa);
- //if (adjstop!=NULL) delete adjstop;
- return false;
- }
- /*
- if (validCDSonly) {
- int stopCodonAdjust=adjstop->restore();
- if (stopCodonAdjust!=0 && !endStop) {
- //restore stop codon location
- //adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
- if (seglst.Count()>0) seglst.Last()->end-=stopCodonAdjust;
- if (cdsnt!=NULL && seqlen>0) {
- seqlen-=stopCodonAdjust;
- cdsnt[seqlen]=0;
- }
- if (cdsaa!=NULL) aalen--;
- }
- }
- if (adjstop!=NULL) delete adjstop;
- */
- if (cdsnt!=NULL) { // && !inframeStop) {
- if (f_y!=NULL) { //CDS translation fasta output requested
- if (cdsaa==NULL) { //translate now if not done before
- cdsaa=translateDNA(cdsnt, aalen, seqlen);
- }
- if (aalen>0) {
- if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon
- fprintf(f_y, ">%s", gffrec.getID());
- if (fmtTable) printTableData(f_y, gffrec, true);
- else fprintf(f_y, "\n");
- printFasta(f_y, NULL, cdsaa, aalen, StarStop);
- }
- }
- if (f_x!=NULL) { //CDS only
- GStr defline(gffrec.getID(), 94);
- if (writeExonSegs) {
- defline.append(" loc:");
- defline.append(gffrec.getGSeqName());
- defline.appendfmt("(%c)",gffrec.strand);
- //warning: not CDS coordinates are written here, but the exon ones
- defline+=(int)gffrec.start;
- defline+=(char)'-';
- defline+=(int)gffrec.end;
- // -- here these are CDS substring coordinates on the spliced sequence:
- defline.append(" segs:");
- for (int i=0;i<seglst.Count();i++) {
- if (i>0) defline.append(",");
- defline+=(int)seglst[i].start;
- defline.append("-");
- defline+=(int)seglst[i].end;
- }
- }
- fprintf(f_x, ">%s", defline.chars());
- if (fmtTable) printTableData(f_x, gffrec, true);
- else fprintf(f_x, "\n");
- printFasta(f_x, NULL, cdsnt, seqlen);
- }
- GFREE(cdsnt);
- GFREE(cdsaa);
- } //writing CDS or its translation
- if (f_w!=NULL) { //write spliced exons
- uint cds_start=0;
- uint cds_end=0;
- seglst.Clear();
- //TODO: ? if wPadding is set, *temporarily* change first and last exon coordinates ?!?
- // or perhaps getSpliced() should take an additional padding parameter ?!?
- int padLeft=0;
- int padRight=0;
- if (wPadding>0) {
- padLeft= (gffrec.start>(uint)wPadding) ? wPadding : gffrec.start - 1;
- int ediff=faseq->getseqlen()-gffrec.end;
- padRight=(wPadding>ediff) ? ediff : wPadding;
- gffrec.addPadding(padLeft, padRight);
- }
- char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
- //restore exons to normal (remove padding)
- if (wPadding>0)
- gffrec.removePadding(padLeft, padRight);
-
- GStr defline(gffrec.getID());
- if (exont!=NULL) {
- if (gffrec.CDstart>0) {
- defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
- }
- if (writeExonSegs) {
- defline.append(" loc:");
- defline.append(gffrec.getGSeqName());
- defline+=(char)'|';
- defline+=(int)gffrec.start;
- defline+=(char)'-';
- defline+=(int)gffrec.end;
- defline+=(char)'|';
- defline+=(char)gffrec.strand;
- defline.append(" exons:");
- for (int i=0;i<gffrec.exons.Count();i++) {
- if (i>0) defline.append(",");
- defline+=(int)gffrec.exons[i]->start;
- defline.append("-");
- defline+=(int)gffrec.exons[i]->end;
- }
- if (wPadding>0) {
- defline.append(" padding:");
- defline.append(padLeft);
- defline+=(char)'|';
- defline.append(padRight);
- }
-
- defline.append(" segs:");
- for (int i=0;i<seglst.Count();i++) {
- if (i>0) defline.append(",");
- defline+=(int)seglst[i].start;
- defline.append("-");
- defline+=(int)seglst[i].end;
- }
- }
-
- fprintf(f_w, ">%s", defline.chars());
- if (fmtTable) printTableData(f_w, gffrec, true);
- else fprintf(f_w, "\n");
- printFasta(f_w, NULL, exont, seqlen);
- GFREE(exont);
- }
- } //writing f_w (spliced exons)
- return true;
-}
-
void openfw(FILE* &f, GArgs& args, char opt) {
GStr s=args.getOpt(opt);
if (!s.is_empty()) {
@@ -863,10 +322,9 @@ void printGff3Header(FILE* f, GArgs& args) {
fprintf(f, "%s\n", gffloader.headerLines[i]);
}
} else {
- fprintf(f, "# ");
- args.printCmdLine(f);
- fprintf(f, "# gffread v" VERSION "\n");
fprintf(f, "##gff-version 3\n");
+ fprintf(f, "# gffread v" VERSION "\n");
+ fprintf(f, "# ");args.printCmdLine(f);
}
}
@@ -906,83 +364,6 @@ void processGffComment(const char* cmline, GfList* gflst) {
}
}
-bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
- if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
- GStr refname(gffrec->getRefName());
- RefTran* rt=reftbl.Find(refname.chars());
- if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
- //try removing the version suffix
- refname.cut(-2);
- //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
- rt=reftbl.Find(refname.chars());
- }
- if (rt) {
- gffrec->setRefName(rt->new_name);
- }
- /* //no, do not discard non-matching entries, let them pass through!
- else {
- if (verbose)
- GMessage("Info: %s discarded due to reference %s not being mapped\n",
- gffrec->getID(), refname.chars());
- return false; //discard, ref seq not in the given translation table
- }*/
- }
- if (gffloader.transcriptsOnly && gffrec->isDiscarded()) {
- //discard generic "locus" features with no other detailed subfeatures
- //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
- return false;
- }
- if (minLen>0 && gffrec->covlen<minLen) {
- if (verbose)
- GMessage("Info: %s discarded due to minimum length threshold %d\n",
- gffrec->getID(), minLen);
- return false;
- }
- if (rfltGSeq!=NULL) { //filter by gseqName
- if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
- return false;
- }
- }
- if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
- return false;
- }
- //check coordinates
- if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
- if (rfltWithin) {
- if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
- return false; //not within query range
- }
- }
- else {
- if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
- return false;
- }
- }
- }
- if (multiExon && gffrec->exons.Count()<=1) {
- return false;
- }
- if (wCDSonly && gffrec->CDstart==0) {
- return false;
- }
- if (wNConly && gffrec->hasCDS()) return false;
- if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
- //keep track of chr|gene_id data -- coordinate range
- char* geneid=gffrec->getGeneID();
- if (geneid!=NULL) {
- GeneInfo* ginfo=gene_ids.Find(geneid);
- if (ginfo==NULL) {//first time seeing this gene ID
- GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
- gene_ids.Add(geneid, geneinfo);
- if (gfnew!=NULL)
- gfnew->Add(geneinfo->gf); //do we really need this?
- }
- else ginfo->update(gffrec);
- }
- }
- return true;
-}
-
void printGffObj(FILE* f, GffObj* gfo, GStr& locname, GffPrintMode exonPrinting, int& out_counter) {
GffObj& t=*gfo;
GTData* tdata=(GTData*)(t.uptr);
@@ -1027,9 +408,20 @@ void printAsTable(FILE* f, GffObj* gfo, int* out_counter=NULL) {
printTableData(f, *gfo);
}
+void shutDown() {
+ seqinfo.Clear();
+ //if (faseq!=NULL) delete faseq;
+ //if (gcdb!=NULL) delete gcdb;
+ GFREE(rfltGSeq);
+ FWCLOSE(f_out);
+ FWCLOSE(f_w);
+ FWCLOSE(f_x);
+ FWCLOSE(f_y);
+}
+
int main(int argc, char* argv[]) {
GArgs args(argc, argv,
- "version;debug;merge;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;"
+ "version;debug;merge;stream;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;"
"sort-alpha;keep-genes;w-add=;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
"ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:d:");
args.printError(USAGE, true);
@@ -1041,6 +433,7 @@ int main(int argc, char* argv[]) {
debugMode=(args.getOpt("debug")!=NULL);
decodeChars=(args.getOpt('D')!=NULL);
gffloader.forceExons=(args.getOpt("force-exons")!=NULL);
+ gffloader.streamIn=(args.getOpt("stream")!=NULL);
gffloader.noPseudo=(args.getOpt("no-pseudo")!=NULL);
gffloader.ignoreLocus=(args.getOpt("ignore-locus")!=NULL);
gffloader.transcriptsOnly=(args.getOpt('O')==NULL);
@@ -1152,10 +545,13 @@ int main(int argc, char* argv[]) {
}
gffloader.mergeCloseExons=(args.getOpt('Z')!=NULL);
+
multiExon=(args.getOpt('U')!=NULL);
writeExonSegs=(args.getOpt('W')!=NULL);
tracklabel=args.getOpt('t');
- GFastaDb gfasta(args.getOpt('g'));
+
+ if (args.getOpt('g'))
+ gfasta.init(args.getOpt('g'));
//if (gfasta.fastaPath!=NULL)
// sortByLoc=true; //enforce sorting by chromosome/contig
GStr s=args.getOpt('i');
@@ -1163,7 +559,7 @@ int main(int argc, char* argv[]) {
s=args.getOpt('l');
if (!s.is_empty()) minLen=s.asInt();
- FILE* f_repl=NULL;
+ FILE* f_repl=NULL; //duplicate/collapsing info output file
s=args.getOpt('d');
if (!s.is_empty()) {
if (s=="-") f_repl=stdout;
@@ -1249,6 +645,17 @@ int main(int argc, char* argv[]) {
int numfiles = args.startNonOpt();
//GList<GffObj> gfkept(false,true); //unsorted, free items on delete
int out_counter=0; //number of records printed
+
+ if (fmtGTF)
+ exonPrinting = gffloader.forceExons ? pgtfBoth : pgtfAny;
+ else if (fmtBED)
+ exonPrinting=pgffBED;
+ else if (fmtTLF)
+ exonPrinting=pgffTLF;
+ else { //printing regular GFF3
+ exonPrinting = gffloader.forceExons ? pgffBoth : pgffAny;
+ }
+
while (true) {
GStr infile;
if (numfiles) {
@@ -1268,7 +675,20 @@ int main(int argc, char* argv[]) {
if (TLFinput || (Gstricmp(fext, "tlf")==0))
gffloader.TLFinput=true;
gffloader.openFile(infile);
- gffloader.load(g_data, &validateGffRec, &processGffComment);
+ if (gffloader.streamIn) { //streaming in - disable all bulk load features
+ gffloader.transcriptsOnly=true;
+ gffloader.doCluster=false;
+ covInfo=false;
+ }
+
+ gffloader.load(g_data, &processGffComment);
+ if (gffloader.streamIn) {
+ //we're done, GffLoader::load() took care of everything
+ shutDown();
+ return 0;
+ }
+
+
// will also place the transcripts in loci, if doCluster is enabled
if (gffloader.doCluster)
collectLocusData(g_data, covInfo);
@@ -1293,16 +713,6 @@ int main(int argc, char* argv[]) {
if (tracklabel) loctrack=tracklabel;
if (gffloader.sortRefsAlpha)
g_data.setSorted(&gseqCmpName);
- GffPrintMode exonPrinting;
- if (fmtGTF)
- exonPrinting = pgtfAny;
- else if (fmtBED)
- exonPrinting=pgffBED;
- else if (fmtTLF)
- exonPrinting=pgffTLF;
- else { //printing regular GFF3
- exonPrinting = gffloader.forceExons ? pgffBoth : pgffAny;
- }
bool firstGff3Print=fmtGFF3;
if (gffloader.doCluster) {
//grouped in loci
@@ -1433,13 +843,14 @@ int main(int argc, char* argv[]) {
}
//for GFF3 && table output, print the parent first, if any
if ((fmtGFF3 || fmtTable) && t.parent!=NULL && T_PRINTABLE(t.parent->udata)) {
- GTData* pdata=(GTData*)(t.parent->uptr);
- if (pdata && pdata->geneinfo!=NULL)
- pdata->geneinfo->finalize();
+ //GTData* pdata=(GTData*)(t.parent->uptr);
+ //if (pdata && pdata->geneinfo!=NULL)
+ // pdata->geneinfo->finalize();
if (fmtTable)
printTableData(f_out, *(t.parent));
- else
+ else { //GFF3 output
t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
+ }
T_NO_PRINT(t.parent->udata);
}
if (fmtTable)
@@ -1469,14 +880,7 @@ int main(int argc, char* argv[]) {
} //for each genomic seq
} //no clustering
if (f_repl && f_repl!=stdout) fclose(f_repl);
- seqinfo.Clear();
- //if (faseq!=NULL) delete faseq;
- //if (gcdb!=NULL) delete gcdb;
- GFREE(rfltGSeq);
- FWCLOSE(f_out);
- FWCLOSE(f_w);
- FWCLOSE(f_x);
- FWCLOSE(f_y);
+ shutDown();
}
=====================================
tag_git.sh
=====================================
@@ -3,5 +3,12 @@ git checkout master
ver=$(fgrep '#define VERSION ' gffread.cpp)
ver=${ver#*\"}
ver=${ver%%\"*}
+git fetch --tags
+if [[ "$1" == "delete" || "$1" == "del" ]]; then
+ echo "Deleting tag v$ver .."
+ git tag -d v$ver
+ git push origin :refs/tags/v$ver
+ exit
+fi
git tag -a "v$ver" -m "release $ver"
git push --tags
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/compare/0cdbae03c6fe3d778d864dbcf4c738a8c8eecdbb...d719fba162be014938c92a2cad7558260fe37731
--
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/compare/0cdbae03c6fe3d778d864dbcf4c738a8c8eecdbb...d719fba162be014938c92a2cad7558260fe37731
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200810/f27bb1ac/attachment-0001.html>
More information about the debian-med-commit
mailing list