[med-svn] [Git][med-team/gffread][master] 5 commits: routine-update: New upstream version
Andreas Tille
gitlab at salsa.debian.org
Sun Jan 17 17:11:42 GMT 2021
Andreas Tille pushed to branch master at Debian Med / gffread
Commits:
a0c98e3f by Andreas Tille at 2021-01-17T18:02:17+01:00
routine-update: New upstream version
- - - - -
8f84d6a4 by Andreas Tille at 2021-01-17T18:02:18+01:00
New upstream version 0.12.4
- - - - -
a0003af2 by Andreas Tille at 2021-01-17T18:02:20+01:00
Update upstream source from tag 'upstream/0.12.4'
Update to upstream version '0.12.4'
with Debian dir 230d6e8d68d4f3150dfd99686a649426a6e749de
- - - - -
bd084893 by Andreas Tille at 2021-01-17T18:03:23+01:00
Refresh patches
- - - - -
ef25b5a2 by Andreas Tille at 2021-01-17T18:11:22+01:00
Seems to require a new version of gclib which would mean a transition which is to late now
- - - - -
10 changed files:
- Makefile
- debian/changelog
- debian/control
- debian/patches/gclib.patch
- debian/patches/hardening
- gff_utils.cpp
- gff_utils.h
- gffread.cpp
- prep_source.sh
- tag_git.sh
Changes:
=====================================
Makefile
=====================================
@@ -11,7 +11,7 @@ LDFLAGS := $(if $(LDFLAGS),$(LDFLAGS),-g)
BASEFLAGS := -Wall -Wextra ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \
-D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \
- -std=c++0x -fno-exceptions -fno-rtti
+ -std=c++11 -fno-exceptions -fno-rtti
GCCV8 := $(shell expr `${CXX} -dumpversion | cut -f1 -d.` \>= 8)
ifeq "$(GCCV8)" "1"
@@ -83,7 +83,7 @@ all release debug memcheck memdebug profile gprof prof: ../gclib gffread
$(OBJS) : $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
gffread.o : gff_utils.h $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
gff_utils.o : gff_utils.h $(GCLDIR)/gff.h
-${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
+${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh
${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
gffread: $(OBJS) gffread.o
${LINKER} ${LDFLAGS} -o $@ ${filter-out %.a %.so, $^} ${LIBS}
=====================================
debian/changelog
=====================================
@@ -1,3 +1,11 @@
+gffread (0.12.4-1) UNRELEASED; urgency=medium
+
+ * New upstream version
+ Seems to require a new version of gclib which would mean a transition
+ which is to late now
+
+ -- Andreas Tille <tille at debian.org> Sun, 17 Jan 2021 18:02:17 +0100
+
gffread (0.12.1-4) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -6,7 +6,7 @@ Uploaders: Andreas Tille <tille at debian.org>,
Section: science
Priority: optional
Build-Depends: debhelper-compat (= 13),
- libgclib-dev (>= 0.11.10)
+ libgclib-dev
Standards-Version: 4.5.1
Vcs-Browser: https://salsa.debian.org/med-team/gffread
Vcs-Git: https://salsa.debian.org/med-team/gffread.git
=====================================
debian/patches/gclib.patch
=====================================
@@ -2,10 +2,8 @@ Author: Andreas Tille <tille at debian.org>
Last-Update: Thu, 18 Apr 2019 12:40:25 +0200
Description: Fix build against libgclib
-Index: gffread/Makefile
-===================================================================
---- gffread.orig/Makefile
-+++ gffread/Makefile
+--- a/Makefile
++++ b/Makefile
@@ -75,7 +75,7 @@ OBJS := ${GCLDIR}/GBase.o ${GCLDIR}/GArg
.PHONY : all
@@ -17,7 +15,7 @@ Index: gffread/Makefile
git clone https://github.com/gpertea/gclib.git ../gclib
@@ -85,8 +85,8 @@ gffread.o : gff_utils.h $(GCLDIR)/GBase.
gff_utils.o : gff_utils.h $(GCLDIR)/gff.h
- ${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
+ ${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh
${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
-gffread: $(OBJS) gffread.o
- ${LINKER} ${LDFLAGS} -o $@ ${filter-out %.a %.so, $^} ${LIBS}
=====================================
debian/patches/hardening
=====================================
@@ -2,10 +2,8 @@ From: Michael R. Crusoe <michael.crusoe at gmail.com>
Subject: Use CPPFLAGS
Allows Debian to harden the binary with CPPFLAGS=-Wdate-time -D_FORTIFY_SOURCE=2
-Index: gffread/Makefile
-===================================================================
---- gffread.orig/Makefile
-+++ gffread/Makefile
+--- a/Makefile
++++ b/Makefile
@@ -65,7 +65,7 @@ endif
#endif
@@ -16,7 +14,7 @@ Index: gffread/Makefile
# C/C++ linker
@@ -86,7 +86,7 @@ gff_utils.o : gff_utils.h $(GCLDIR)/gff.
- ${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
+ ${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh
${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
gffread: gffread.o gff_utils.o
- ${LINKER} ${LDFLAGS} -o $@ ${filter-out %.a %.so, $^} -lgclib ${LIBS} -pthread
=====================================
gff_utils.cpp
=====================================
@@ -1,6 +1,6 @@
#include "gff_utils.h"
-GHash<GeneInfo> gene_ids;
+GHash<GeneInfo*> gene_ids;
bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose
bool debugMode=false;
@@ -14,9 +14,13 @@ int wPadding = 0; //padding for -w option
FILE* f_x=NULL; //writing fasta with spliced CDS
FILE* f_y=NULL; //wrting fasta with translated CDS
+FILE* f_j=NULL; //wrting junctions (intron coordinates)
int maxintron=999000000;
+ID_Flt_Type IDflt=idFlt_None;
+
+bool TFilters=false;
bool wCDSonly=false;
bool wNConly=false;
int minLen=0; //minimum transcript length
@@ -45,6 +49,7 @@ uint rfltEnd=MAX_UINT;
bool rfltWithin=false; //check for full containment within given range
bool addDescr=false;
+bool wfaNoCDS=false;
bool fmtGFF3=true; //default output: GFF3
//other formats only make sense in transcriptOnly mode
@@ -57,9 +62,12 @@ GffPrintMode exonPrinting=pgffAny;
GFastaDb gfasta;
-GHash<SeqInfo> seqinfo;
+GHash<SeqInfo*> seqinfo;
GVec<CTableField> tableCols;
-GHash<RefTran> reftbl;
+GHash<RefTran*> reftbl;
+GStrSet<> fltIDs;
+
+GStrSet<> attrList;
GHash<int> isoCounter; //counts the valid isoforms
@@ -353,7 +361,27 @@ void printTableData(FILE* f, GffObj& g, bool inFasta) {
}
fprintf(f,"\n");
}
+
bool GffLoader::validateGffRec(GffObj* gffrec) {
+ if (!checkFilters(gffrec)) {
+ if (gffrec->isTranscript()) {
+ TFilters=true;
+ if (gffrec->parent!=NULL && keepGenes) {
+ GPVec<GffObj>& pchildren=gffrec->parent->children;
+ for (int c=0;c<pchildren.Count();c++) {
+ if (pchildren[c]==gffrec) {
+ pchildren.Delete(c);
+ break;
+ }
+ }
+ }
+ }
+ return false;
+ } //transcript rejected
+ return true;
+}
+
+bool GffLoader::checkFilters(GffObj* gffrec) {
if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
GStr refname(gffrec->getRefName());
RefTran* rt=reftbl.Find(refname.chars());
@@ -379,6 +407,12 @@ bool GffLoader::validateGffRec(GffObj* gffrec) {
//GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
return false;
}
+ if (IDflt) {
+ if (fltIDs.hasKey(gffrec->getID())) {
+ if (IDflt==idFlt_Exclude) return false;
+ }
+ else if (IDflt==idFlt_Only) return false;
+ }
if (minLen>0 && gffrec->covlen<minLen) {
if (verbose)
GMessage("Info: %s discarded due to minimum length threshold %d\n",
@@ -406,13 +440,22 @@ bool GffLoader::validateGffRec(GffObj* gffrec) {
}
}
}
- if (multiExon && gffrec->exons.Count()<=1) {
- return false;
- }
- if (wCDSonly && gffrec->CDstart==0) {
- return false;
+ if (this->attrsFilter) { //mostly relevant for transcripts and gene records
+ //remove attributes that are not in attrList
+ gffrec->removeAttrs(attrList);
}
- if (wNConly && gffrec->hasCDS()) return false;
+
+ if (gffrec->isTranscript()) { // && TFilters) ?
+ //these filters only apply to transcripts
+ if (multiExon && gffrec->exons.Count()<=1) {
+ return false;
+ }
+ if (wCDSonly && gffrec->CDstart==0) {
+ return false;
+ }
+ if (wNConly && gffrec->hasCDS()) return false;
+ return process_transcript(gfasta, *gffrec);
+ } //transcript filters check
return true;
}
@@ -451,13 +494,13 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
}
}
if (gname && strcmp(gname, gffrec.getID())!=0) {
- int* isonum=isoCounter.Find(gname);
- if (isonum==NULL) {
- isonum=new int(1);
- isoCounter.Add(gname,isonum);
- }
- else (*isonum)++;
- //defline.appendfmt(" gene=%s", gname);
+ int* isonum=isoCounter.Find(gname);
+ if (isonum==NULL) {
+ //isonum=new int(1);
+ isoCounter.Add(gname,1);
+ }
+ else (*isonum)++;
+ //defline.appendfmt(" gene=%s", gname);
}
int seqlen=0;
@@ -533,6 +576,7 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
int phaseNum=0;
CDS_CHECK:
uint cds_olen=0;
+ inframeStop=false;
cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
//if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
@@ -694,7 +738,7 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
GStr defline(gffrec.getID());
if (exont!=NULL) {
- if (gffrec.CDstart>0) {
+ if (!wfaNoCDS && gffrec.CDstart>0) {
defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
}
if (writeExonSegs) {
@@ -1169,9 +1213,9 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
GenomicSeqData* gdata=ref_data[g];
for (int l=0;l<gdata->loci.Count();l++) {
GffLocus& loc=*(gdata->loci[l]);
- GHash<int> gnames(true); //gene names in this locus
+ GHash<int> gnames; //gene names in this locus
//GHash<int> geneids(true); //Entrez GeneID: numbers
- GHash<int> geneids(true);
+ GHash<int> geneids;
int fstrand=0,rstrand=0,ustrand=0;
for (int i=0;i<loc.rnas.Count();i++) {
GffObj& t=*(loc.rnas[i]);
@@ -1185,11 +1229,14 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
gname.upper();
int* prevg=gnames.Find(gname.chars());
if (prevg!=NULL) (*prevg)++;
- else gnames.Add(gname, new int(1));
+ else gnames.Add(gname.chars(), 1);
}
GStr geneid(t.getGeneID());
- if (!geneid.is_empty())
- geneids.Add(geneid.chars());
+ if (!geneid.is_empty()) {
+ int* prevg=gnames.Find(geneid.chars());
+ if (prevg!=NULL) (*prevg)++;
+ geneids.Add(geneid.chars(), 1);
+ }
//parse GeneID xrefs, if any (RefSeq):
/*
GStr xrefs(t.getAttr("xrefs"));
@@ -1220,11 +1267,13 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
gname.upper();
int* prevg=gnames.Find(gname.chars());
if (prevg!=NULL) (*prevg)++;
- else gnames.Add(gname, new int(1));
+ else gnames.Add(gname, 1);
}
GStr geneid(nt.getID());
if (!geneid.is_empty()) {
- geneids.Add(geneid.chars(), new int(1));
+ int* prevg=gnames.Find(geneid.chars());
+ if (prevg!=NULL) (*prevg)++;
+ geneids.Add(geneid.chars(),1);
}
}
//parse GeneID xrefs, if any (RefSeq):
@@ -1258,18 +1307,18 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
loc.locus_num=locus_num;
if (gnames.Count()>0) { //collect all gene names associated to this locus
gnames.startIterate();
- int* gfreq=NULL;
- char* key=NULL;
- while ((gfreq=gnames.NextData(key))!=NULL) {
- loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
+ int gfreq=0;
+ const char* key=NULL;
+ while ((key=gnames.Next(gfreq))!=NULL) {
+ loc.gene_names.AddIfNew(new CGeneSym(key, gfreq));
}
} //added collected gene_names
if (geneids.Count()>0) { //collect all GeneIDs names associated to this locus
geneids.startIterate();
- int* gfreq=NULL;
- char* key=NULL;
- while ((gfreq=geneids.NextData(key))!=NULL) {
- loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
+ int gfreq=0;
+ const char* key=NULL;
+ while ((key=geneids.Next(gfreq))!=NULL) {
+ loc.gene_ids.AddIfNew(new CGeneSym(key, gfreq));
}
}
} //for each locus
@@ -1311,6 +1360,21 @@ GenomicSeqData* getGSeqData(GList<GenomicSeqData>& seqdata, int gseq_id) {
void warnPseudo(GffObj& m) {
GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID());
}
+void GffLoader::collectIntrons(GffObj& t) {
+ // assume t are coming grouped by chromosome and sorted by start coordinate!
+ if (intronList.jlst.Count()>0) {
+ if (t.gseq_id!=intronList.gseq_id ||
+ t.start>=intronList.jlst.Last()->start) {
+ intronList.print(f_j);
+ intronList.clear();
+ }
+ else if (t.start<intronList.last_t_start)
+ GError("Error collectIntrons(%s) called when last_t_start was %d\n",
+ t.getID(), intronList.last_t_start );
+ //add this transcript's introns
+ }
+ intronList.add(t);
+}
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsecomment) {
if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n");
@@ -1326,6 +1390,7 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
gffr->keepGenes(keepGenes);
gffr->setIgnoreLocus(ignoreLocus);
gffr->setRefAlphaSorted(this->sortRefsAlpha);
+ gffr->procEnsemblID(this->ensemblProc);
if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment);
int outcounter=0;
if (streamIn) { //this will ignore any clustering options
@@ -1335,22 +1400,27 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
delete t;
continue;
}
- if (process_transcript(gfasta, *t)) {
- outcounter++;
- if (f_out) {
- if (fmtTable)
- printTableData(f_out, *t);
- else //GFF3, GTF, BED, TLF
- t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
- }
+
+ if (f_j!=NULL && t->isTranscript() && t->exons.Count()>1)
+ collectIntrons(*t);
+
+ outcounter++;
+ if (f_out) {
+ if (fmtTable)
+ printTableData(f_out, *t);
+ else //GFF3, GTF, BED, TLF
+ t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
delete t;
}
+ if (f_j && intronList.jlst.Count()>0) {
+ intronList.print(f_j);
+ intronList.clear();
+ }
delete gffr;
return;
}
-
gffr->readAll();
GVec<int> pseudoFeatureIds; //feature type: pseudo*
GVec<int> pseudoAttrIds; // attribute: [is]pseudo*=true/yes/1
@@ -1438,9 +1508,12 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
m->subftype_id=gff_fid_exon;
}
//GList<GffObj> gfadd(false,false); -- for gf_validate()?
- if (!validateGffRec(m)) {
+ if (!validateGffRec(m)) { //this will also apply process_transcript() CDS filters etc.
continue;
}
+ if (f_j!=NULL && m->isTranscript() && m->exons.Count()>1)
+ collectIntrons(*m);
+
m->isUsed(true); //so the gffreader won't destroy it
GenomicSeqData* gdata=getGSeqData(seqdata, m->gseq_id);
bool keep=placeGf(m, gdata);
@@ -1450,6 +1523,10 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
}
} //for each read gffObj
+ if (f_j && intronList.jlst.Count()>0) {
+ intronList.print(f_j);
+ intronList.clear();
+ }
//if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
//if (f && f!=stdin) { fclose(f); f=NULL; }
delete gffr;
=====================================
gff_utils.h
=====================================
@@ -17,10 +17,26 @@ extern int wPadding; //padding for -w option
extern FILE* f_x; //writing fasta with spliced CDS
extern FILE* f_y; //wrting fasta with translated CDS
+extern FILE* f_j; //wrting junctions (introns)
+
+
+extern bool TFilters;
+
+extern bool wfaNoCDS;
+
extern int maxintron;
extern bool wCDSonly;
extern bool wNConly;
+
+enum ID_Flt_Type {
+ idFlt_None=0,
+ idFlt_Only,
+ idFlt_Exclude
+};
+
+extern ID_Flt_Type IDflt;
+
extern int minLen; //minimum transcript length
extern bool validCDSonly; // translation with no in-frame STOP
extern bool bothStrands; //for single-exon mRNA validation, check the other strand too
@@ -105,15 +121,14 @@ class RefTran {
};
extern GFastaDb gfasta;
-extern GHash<SeqInfo> seqinfo;
+extern GHash<SeqInfo*> seqinfo;
extern GHash<int> isoCounter; //counts the valid isoforms
-extern GHash<RefTran> reftbl;
+extern GHash<RefTran*> reftbl;
char* getSeqDescr(char* seqid);
char* getSeqName(char* seqid);
int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL);
void printTableData(FILE* f, GffObj& g, bool inFasta=false);
-bool validateGffRec(GffObj* gffrec);
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec);
enum ETableFieldType {
@@ -144,11 +159,84 @@ class CTableField {
extern GVec<CTableField> tableCols; //table output format fields
+extern GStrSet<> attrList;
+extern GStrSet<> fltIDs;
class GffLocus;
class GenomicSeqData;
class GeneInfo;
+struct CIntronData:public GSeg {
+ char strand;//'.' < '-' < '+' (reverse ASCII order)
+ GVec<GStr> ts; //list of transcript IDs sharing this intron
+ CIntronData(uint istart, uint iend, char tstrand, const char* t_id=NULL):GSeg(istart, iend),
+ strand(tstrand) {
+ if (t_id!=NULL) {
+ GStr tid(t_id);
+ ts.Add(tid);
+ }
+ }
+ void add(const char* t_id) {
+ GStr tid(t_id);
+ ts.Add(tid);
+ }
+ bool operator==(CIntronData& d){
+ return (start==d.start && end==d.end && strand==d.strand);
+ }
+ bool operator<(CIntronData& d){
+ if (start==d.start) {
+ if (end==d.end) return strand>d.strand;
+ else return (end<d.end);
+ }
+ else return (start<d.start);
+ }
+
+};
+
+struct CIntronList {
+ int gseq_id;
+ uint last_t_start; //just to check if input is sorted properly!
+ GList<CIntronData> jlst;
+ CIntronList():gseq_id(-1),last_t_start(0), jlst(true, true) {}
+ void add(GffObj& t) { //add all introns of t to jlst
+ if (t.exons.Count()<2) return; //nothing to do
+ if (gseq_id>=0 && gseq_id!=t.gseq_id)
+ GError("Error: CIntronList::add(%s) on different ref seq!\n", t.getID());
+ gseq_id=t.gseq_id;
+ for (int i=1;i<t.exons.Count();++i) {
+ CIntronData* nintr = new CIntronData(t.exons[i-1]->end+1,
+ t.exons[i]->start-1, t.strand, t.getID());
+ int fidx=-1;
+ CIntronData* xintr=jlst.AddIfNew(nintr, true, &fidx);
+ if (xintr!=nintr) {
+ //nintr already exists,it was deallocated
+ xintr->add(t.getID());
+ }
+ last_t_start=t.start;
+ } //for each intron
+ }
+ void clear() {
+ gseq_id=-1;
+ last_t_start=0;
+ jlst.Clear();
+ }
+ void print(FILE* f) {
+ //simple tab delimited format: chr, start, end, strand, transcriptIDs comma delimited
+ const char* gseqname=GffObj::names->gseqs.getName(gseq_id);
+ for (int i=0;i<jlst.Count();++i) {
+ CIntronData& idata=*(jlst[i]);
+ fprintf(f,"%s\t%d\t%d\t%c\t",gseqname, idata.start, idata.end, idata.strand);
+ if (idata.ts.Count()>1)
+ idata.ts.Sort();
+ for (int t=0;t<idata.ts.Count();t++) {
+ if (t) fprintf(f, ",%s", idata.ts[t].chars());
+ else fprintf(f, "%s", idata.ts[t].chars());
+ }
+ fprintf(f, "\n");
+ }
+ }
+};
+
class GTData { // transcript associated data
public:
GffObj* rna;
@@ -662,6 +750,7 @@ class GffLoader {
GStr fname;
FILE* f;
GffNames* names;
+ CIntronList intronList; // collect introns for -j output
union {
unsigned int options;
struct {
@@ -686,10 +775,12 @@ class GffLoader {
bool dOvlSET:1; //discard overlapping Single Exon Transcripts on any strand
bool forceExons:1;
bool streamIn:1;
+ bool ensemblProc:1;
+ bool attrsFilter:1;
};
};
- GffLoader():fname(),f(NULL), names(NULL), options(0) {
+ GffLoader():fname(),f(NULL), names(NULL), intronList(), options(0) {
transcriptsOnly=true;
gffnames_ref(GffObj::names);
names=GffObj::names;
@@ -712,11 +803,15 @@ class GffLoader {
}
bool validateGffRec(GffObj* gffrec);
+ bool checkFilters(GffObj* gffrec);
+
+ void collectIntrons(GffObj& t); //for -j output
void load(GList<GenomicSeqData>&seqdata, GFFCommentParser* gf_parsecomment=NULL);
bool placeGf(GffObj* t, GenomicSeqData* gdata);
+
bool unsplContained(GffObj& ti, GffObj& tj);
GffObj* redundantTranscripts(GffObj& ti, GffObj& tj);
=====================================
gffread.cpp
=====================================
@@ -4,24 +4,25 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
-#define VERSION "0.12.1"
+#define VERSION "0.12.4"
#define USAGE "gffread v" VERSION ". Usage:\n\
-gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
+gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] \n\
[-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
[-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
- [-i <maxintron>] [--stream] [--bed] [--table <attrlist>] [--sort-by <ref.lst>]\n\
- \n\
+ [--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>]\n\
+ [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>]\n\
+ [<input_gff>] \n\n\
Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n\
transcripts (exon or CDS) and more.\n\
By default (i.e. without -O) only transcripts are processed, discarding any\n\
other non-transcript features. Default output is a simplified GFF3 with only\n\
the basic attributes.\n\
\n\
- <input_gff> is a GFF file, use '-' for stdin\n\
- \n\
Options:\n\
-i discard transcripts having an intron larger than <maxintron>\n\
+ --ids discard records/transcripts if their IDs are not listed in <IDs.lst>\n\
+ --nids discard records/transcripts if their IDs are listed in <IDs.lst>\n\
-l discard transcripts shorter than <minlen> bases\n\
-r only show transcripts overlapping coordinate range <start>..<end>\n\
(on chromosome/contig <chr>, strand <strand> if provided)\n\
@@ -42,11 +43,13 @@ Sorting: (by default, chromosomes are kept in the order they were found)\n\
--sort-by : sort the reference sequences by the order in which their\n\
names are given in the <refseq.lst> file\n\
Misc options: \n\
- -F preserve all GFF attributes (for non-exon features)\n\
+ -F keep all GFF attributes (for non-exon features)\n\
--keep-exon-attrs : for -F option, do not attempt to reduce redundant\n\
exon/CDS attributes\n\
-G do not keep exon attributes, move them to the transcript feature\n\
(for GFF3 output)\n\
+ --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list>\n\
+ which is a comma delimited list of attribute names to\n\
--keep-genes : in transcript-only mode (default), also preserve gene records\n\
--keep-comments: for GFF3 input/output, try to preserve comments\n\
-O process other non-transcript GFF records (by default non-transcript\n\
@@ -78,7 +81,7 @@ Misc options: \n\
((no sorting, exons must be grouped by transcript in the input data)\n\
Clustering:\n\
-M/--merge : cluster the input transcripts into loci, discarding\n\
- \"duplicated\" transcripts (those with the same exact introns\n\
+ \"redundant\" transcripts (those with the same exact introns\n\
and fully contained or equal boundaries)\n\
-d <dupinfo> : for -M option, write duplication info to file <dupinfo>\n\
--cluster-only: same as -M/--merge but without discarding any of the\n\
@@ -102,24 +105,26 @@ Output options:\n\
-g full path to a multi-fasta file with the genomic sequences\n\
for all input mappings, OR a directory with single-fasta files\n\
(one per genomic sequence, with file names matching sequence names)\n\
+ -j write a tab delimited file with all the junctions (intron coordinates)\n\
-w write a fasta file with spliced exons for each transcript\n\
--w-add <N> for the -w option, extract additional <N> bases\n\
both upstream and downstream of the transcript boundaries\n\
+ --w-nocds for -w, disable the output of CDS info in the FASTA file\n\
-x write a fasta file with spliced CDS for each GFF transcript\n\
-y write a protein fasta file with the translation of CDS for each record\n\
-W for -w and -x options, write in the FASTA defline all the exon\n\
coordinates projected onto the spliced sequence;\n\
-S for -y option, use '*' instead of '.' as stop codon translation\n\
- -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
+ -L Ensembl GTF to GFF3 conversion, adds version to IDs\n\
-m <chr_replace> is a name mapping table for converting reference \n\
sequence names, having this 2-column format:\n\
<original_ref_ID> <new_ref_ID>\n\
-t use <trackname> in the 2nd column of each GFF/GTF output line\n\
- -o write the records into <outfile> instead of stdout\n\
+ -o write the output records into <outfile> instead of stdout\n\
-T main output will be GTF instead of GFF3\n\
--bed output records in BED format instead of default GFF3\n\
--tlf output \"transcript line format\" which is like GFF\n\
- but exons, CDS features and related data are stored as GFF \n\
+ but with exons and CDS related features stored as GFF \n\
attributes in the transcript feature line, like this:\n\
exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> \n\
<exons> is a comma-delimited list of exon_start-exon_end coordinates;\n\
@@ -136,61 +141,11 @@ Output options:\n\
problems with the given GFF/GTF records\n\
"
-/*
-FILE* ffasta=NULL;
-FILE* f_in=NULL;
-FILE* f_out=NULL;
-FILE* f_w=NULL; //writing fasta with spliced exons (transcripts)
-int wPadding = 0; //padding for -w option
-FILE* f_x=NULL; //writing fasta with spliced CDS
-FILE* f_y=NULL; //wrting fasta with translated CDS
-
-bool wCDSonly=false;
-bool wNConly=false;
-int minLen=0; //minimum transcript length
-bool validCDSonly=false; // translation with no in-frame STOP
-bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
-bool altPhases=false; //if original phase fails translation validation,
- //try the other 2 phases until one makes it
-bool addCDSattrs=false;
-bool add_hasCDS=false;
-//bool streamIn=false; // --stream option
-bool adjustStop=false; //automatic adjust the CDS stop coordinate
-bool covInfo=false; // --cov-info : only report genome coverage
-GStr tableFormat; //list of "attributes" to print in tab delimited format
-bool spliceCheck=false; //only known splice-sites
-bool decodeChars=false; //decode url-encoded chars in attrs (-D)
-bool StarStop=false; //use * instead of . for stop codon translation
-bool fullCDSonly=false; // starts with START, ends with STOP codon
-*/
-
GStr sortBy; //file name with chromosomes listed in the desired order
bool BEDinput=false;
bool TLFinput=false;
-//--- output options
-/*
-extern bool fmtGFF3=true; //default output: GFF3
-//other formats only make sense in transcriptOnly mode
-extern bool fmtGTF=false;
-extern bool fmtBED=false;
-extern bool fmtTLF=false;
-extern bool fmtTable=false;
-
-
-bool multiExon=false;
-bool writeExonSegs=false;
-char* tracklabel=NULL;
-char* rfltGSeq=NULL;
-char rfltStrand=0;
-uint rfltStart=0;
-uint rfltEnd=MAX_UINT;
-bool rfltWithin=false; //check for full containment within given range
-bool addDescr=false;
-
-*/
-
//bool protmap=false;
//int maxintron=999000000;
//bool mergeCloseExons=false;
@@ -200,7 +155,22 @@ GffLoader gffloader;
GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
-void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
+void loadIDlist(FILE* f, GStrSet<> & idhash) {
+ GLineReader fr(f);
+ while (!fr.isEof()) {
+ char* line=fr.getLine();
+ if (line==NULL) break;
+ if (line[0]=='#') continue; //skip comments
+ GDynArray<char*> ids;
+ strsplit(line, ids);
+ for (uint i=0;i<ids.Count();i++) {
+ if (strlen(ids[i])>0)
+ idhash.Add(ids[i]);
+ }
+ }
+}
+
+void loadSeqInfo(FILE* f, GHash<SeqInfo*> &si) {
GLineReader fr(f);
while (!fr.isEof()) {
char* line=fr.getLine();
@@ -230,23 +200,33 @@ void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
} //while lines
}
+void getAttrList(GStr& s) {
+ if (s.is_empty()) return;
+ s.startTokenize(",;:", tkCharSet);
+ GStr w;
+ while (s.nextToken(w)) {
+ if (w.length()>0)
+ attrList.Add(w.chars());
+ }
+}
+
void setTableFormat(GStr& s) {
if (s.is_empty()) return;
GHash<ETableFieldType> specialFields;
- specialFields.Add("chr", new ETableFieldType(ctfGFF_chr));
- specialFields.Add("id", new ETableFieldType(ctfGFF_ID));
- specialFields.Add("geneid", new ETableFieldType(ctfGFF_geneID));
- specialFields.Add("genename", new ETableFieldType(ctfGFF_geneName));
- specialFields.Add("parent", new ETableFieldType(ctfGFF_Parent));
- specialFields.Add("feature", new ETableFieldType(ctfGFF_feature));
- specialFields.Add("start", new ETableFieldType(ctfGFF_start));
- specialFields.Add("end", new ETableFieldType(ctfGFF_end));
- specialFields.Add("strand", new ETableFieldType(ctfGFF_strand));
- specialFields.Add("numexons", new ETableFieldType(ctfGFF_numexons));
- specialFields.Add("exons", new ETableFieldType(ctfGFF_exons));
- specialFields.Add("cds", new ETableFieldType(ctfGFF_cds));
- specialFields.Add("covlen", new ETableFieldType(ctfGFF_covlen));
- specialFields.Add("cdslen", new ETableFieldType(ctfGFF_cdslen));
+ specialFields.Add("chr", ctfGFF_chr);
+ specialFields.Add("id", ctfGFF_ID);
+ specialFields.Add("geneid", ctfGFF_geneID);
+ specialFields.Add("genename", ctfGFF_geneName);
+ specialFields.Add("parent", ctfGFF_Parent);
+ specialFields.Add("feature", ctfGFF_feature);
+ specialFields.Add("start", ctfGFF_start);
+ specialFields.Add("end", ctfGFF_end);
+ specialFields.Add("strand", ctfGFF_strand);
+ specialFields.Add("numexons", ctfGFF_numexons);
+ specialFields.Add("exons", ctfGFF_exons);
+ specialFields.Add("cds", ctfGFF_cds);
+ specialFields.Add("covlen", ctfGFF_covlen);
+ specialFields.Add("cdslen", ctfGFF_cdslen);
s.startTokenize(" ,;.:", tkCharSet);
GStr w;
@@ -287,7 +267,7 @@ void setTableFormat(GStr& s) {
}
}
-void loadRefTable(FILE* f, GHash<RefTran>& rt) {
+void loadRefTable(FILE* f, GHash<RefTran*>& rt) {
GLineReader fr(f);
char* line=NULL;
while ((line=fr.getLine())) {
@@ -417,15 +397,17 @@ void shutDown() {
FWCLOSE(f_w);
FWCLOSE(f_x);
FWCLOSE(f_y);
+ FWCLOSE(f_j);
}
int main(int argc, char* argv[]) {
GArgs args(argc, argv,
"version;debug;merge;stream;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;"
- "sort-alpha;keep-genes;w-add=;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
- "ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:d:");
+ "sort-alpha;keep-genes;w-nocds;attrs=;w-add=;ids=;nids=0;gtf;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
+ "ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:j:d:");
args.printError(USAGE, true);
- if (args.getOpt('h') || args.getOpt("help")) {
+ int numfiles = args.startNonOpt();
+ if (args.getOpt('h') || args.getOpt("help") || ( numfiles==0 && !hasStdInput())) {
GMessage("%s",USAGE);
exit(1);
}
@@ -448,9 +430,9 @@ int main(int argc, char* argv[]) {
if (adjustStop) addCDSattrs=true;
validCDSonly=(args.getOpt('V')!=NULL);
altPhases=(args.getOpt('H')!=NULL);
- fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
- fmtBED=(args.getOpt("bed")!=NULL);
- fmtTLF=(args.getOpt("tlf")!=NULL);
+ fmtGTF=(args.getOpt('T')!=NULL || args.getOpt("gtf")!=NULL); //switch output format to GTF
+ fmtBED=(args.getOpt("bed")!=NULL); //BED output
+ fmtTLF=(args.getOpt("tlf")!=NULL); //TLF output
if (fmtGTF || fmtBED || fmtTLF) {
if (!gffloader.transcriptsOnly) {
GMessage("Error: option -O is only supported with GFF3 output");
@@ -510,7 +492,7 @@ int main(int argc, char* argv[]) {
args.printCmdLine(stderr);
}
if (args.getOpt("version")) {
- GMessage(VERSION"\n");
+ printf(VERSION"\n");
exit(0);
}
gffloader.fullAttributes=(args.getOpt('F')!=NULL);
@@ -529,8 +511,8 @@ int main(int argc, char* argv[]) {
gffloader.gatherExonAttrs=true;
gffloader.fullAttributes=true;
}
- ensembl_convert=(args.getOpt('L')!=NULL);
- if (ensembl_convert) {
+ gffloader.ensemblProc=(args.getOpt('L')!=NULL);
+ if (gffloader.ensemblProc) {
gffloader.fullAttributes=true;
gffloader.gatherExonAttrs=false;
//sortByLoc=true;
@@ -558,7 +540,7 @@ int main(int argc, char* argv[]) {
if (!s.is_empty()) maxintron=s.asInt();
s=args.getOpt('l');
if (!s.is_empty()) minLen=s.asInt();
-
+ TFilters=(multiExon || wCDSonly || wNConly); //TODO: all transcript filters should be included here through validateGffRec()
FILE* f_repl=NULL; //duplicate/collapsing info output file
s=args.getOpt('d');
if (!s.is_empty()) {
@@ -569,6 +551,12 @@ int main(int argc, char* argv[]) {
}
}
+ s=args.getOpt("attrs");
+ if (!s.is_empty()) {
+ getAttrList(s);
+ gffloader.attrsFilter=(attrList.Count()>1);
+ gffloader.fullAttributes=true;
+ }
rfltWithin=(args.getOpt('R')!=NULL);
s=args.getOpt('r');
if (!s.is_empty()) {
@@ -621,8 +609,25 @@ int main(int argc, char* argv[]) {
if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
loadSeqInfo(fsize, seqinfo);
fclose(fsize);
+ }
+ s=args.getOpt("ids");
+ if (s.is_empty()) {
+ s=args.getOpt("nids");
+ if (!s.is_empty())
+ IDflt=idFlt_Exclude;
+ } else {
+ IDflt=idFlt_Only;
+ }
+ if (!s.is_empty()) {
+ FILE* f=fopen(s,"r");
+ if (f==NULL) GError("Error opening ID list file: %s\n",s.chars());
+ loadIDlist(f, fltIDs);
+ if (fltIDs.Count()==0) {
+ GMessage("Warning: no IDs were loaded from file %s\n", s.chars());
+ IDflt=idFlt_None;
}
-
+ fclose(f);
+ }
openfw(f_out, args, 'o');
//if (f_out==NULL) f_out=stdout;
if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
@@ -630,19 +635,22 @@ int main(int argc, char* argv[]) {
openfw(f_w, args, 'w');
openfw(f_x, args, 'x');
openfw(f_y, args, 'y');
+ openfw(f_j, args, 'j');
s=args.getOpt("w-add");
if (!s.is_empty()) {
if (f_w==NULL) GError("Error: --w-add option requires -w option!\n");
wPadding=s.asInt();
}
+ if (f_w!=NULL && args.getOpt("w-nocds"))
+ wfaNoCDS=true;
+
if (f_out==NULL && f_w==NULL && f_x==NULL && f_y==NULL && !covInfo)
f_out=stdout;
//if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
//useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
- int numfiles = args.startNonOpt();
//GList<GffObj> gfkept(false,true); //unsorted, free items on delete
int out_counter=0; //number of records printed
@@ -726,8 +734,8 @@ int main(int argc, char* argv[]) {
bool firstLocusPrint=true;
GffLocus& loc=*(gdata->loci[l]);
//check all non-replaced transcripts in this locus:
- int numvalid=0;
- int idxfirstvalid=-1;
+ //int numvalid=0;
+ //int idxfirstvalid=-1;
for (int i=0;i<loc.rnas.Count();i++) {
GffObj& t=*(loc.rnas[i]);
GTData* tdata=(GTData*)(t.uptr);
@@ -752,15 +760,16 @@ int main(int argc, char* argv[]) {
//restore strand for dOvlSET
char orig_strand=T_OSTRAND(t.udata);
if (orig_strand!=0) t.strand=orig_strand;
-
+ /* -- transcripts are filtered upon loading
if (process_transcript(gfasta, t)) {
numvalid++;
if (idxfirstvalid<0) idxfirstvalid=i;
}
+ */
} //for each transcript
int rnas_i=0;
- if (idxfirstvalid>=0) rnas_i=idxfirstvalid;
+ //if (idxfirstvalid>=0) rnas_i=idxfirstvalid;
int gfs_i=0;
if (f_out) {
GStr locname("RLOC_");
@@ -775,7 +784,8 @@ int main(int argc, char* argv[]) {
if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; }
if (firstLocusPrint) {
- loc.print(f_out, idxfirstvalid, locname, loctrack);
+ //loc.print(f_out, idxfirstvalid, locname, loctrack);
+ loc.print(f_out, 0, locname, loctrack);
firstLocusPrint=false;
}
printGffObj(f_out, loc.gfs[gfs_i], locname, exonPrinting, out_counter);
@@ -789,7 +799,8 @@ int main(int argc, char* argv[]) {
if (firstGff3Print) { printGff3Header(f_out, args); firstGff3Print=false; }
if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; }
if (firstLocusPrint) {
- loc.print(f_out, idxfirstvalid, locname, loctrack);
+ //loc.print(f_out, idxfirstvalid, locname, loctrack);
+ loc.print(f_out, 0, locname, loctrack);
firstLocusPrint=false;
}
}
@@ -804,7 +815,7 @@ int main(int argc, char* argv[]) {
} //if Clustering enabled
else { //no clustering
//not grouped into loci, print the rnas with their parents, if any
- int numvalid=0;
+ //int numvalid=0;
for (int g=0;g<g_data.Count();g++) {
GenomicSeqData* gdata=g_data[g];
bool firstGSeqHeader=fmtGFF3;
@@ -815,6 +826,8 @@ int main(int argc, char* argv[]) {
//print other non-transcript (gene?) feature that might be there before t
while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if (TFilters && gfst.isGene() && gfst.children.Count()==0) // gene with no children left, skip it if filters were applied
+ { ++gfs_i; continue; }
if T_PRINTABLE(gfst.udata) { //never printed
T_NO_PRINT(gfst.udata);
if (fmtGFF3) {
@@ -829,8 +842,8 @@ int main(int argc, char* argv[]) {
}
GTData* tdata=(GTData*)(t.uptr);
if (tdata->replaced_by!=NULL) continue;
- if (process_transcript(gfasta, t)) {
- numvalid++;
+ //if (process_transcript(gfasta, t)) {
+ // numvalid++;
if (f_out && T_PRINTABLE(t.udata) ) {
T_NO_PRINT(t.udata);
if (fmtGFF3 || fmtTable || t.isTranscript()) {
@@ -859,12 +872,14 @@ int main(int argc, char* argv[]) {
t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
}
}//GFF/GTF output requested
- } //valid transcript
+ //} //valid transcript
} //for each rna
//print the rest of the isolated pseudo/gene/region features not printed yet
if (f_out && (fmtGFF3 || fmtTable)) {
while (gfs_i<gdata->gfs.Count()) {
GffObj& gfst=*(gdata->gfs[gfs_i]);
+ if (TFilters && gfst.isGene() && gfst.children.Count()==0) // gene with no children left, skip it if filters were applied
+ { ++gfs_i; continue; }
if T_PRINTABLE(gfst.udata) { //never printed
T_NO_PRINT(gfst.udata);
if (fmtGFF3) {
=====================================
prep_source.sh
=====================================
@@ -14,7 +14,7 @@ libdir=$pack/gclib/
cp LICENSE README.md gffread.cpp gff_utils.{h,cpp} $pack/
sed 's|\.\./gclib|./gclib|' Makefile > $pack/Makefile
-cp ../gclib/{GVec,GList,GHash}.hh $libdir
+cp ../gclib/{GVec,GList,GHashMap,khashl}.hh ../gclib/xxhash.h $libdir
cp ../gclib/{GArgs,GBase,gdna,GStr,gff,codons,GFaSeqGet,GFastaIndex}.{h,cpp} $libdir
tar cvfz $pack.tar.gz $pack
ls -l $pack.tar.gz
=====================================
tag_git.sh
=====================================
@@ -3,12 +3,13 @@ git checkout master
ver=$(fgrep '#define VERSION ' gffread.cpp)
ver=${ver#*\"}
ver=${ver%%\"*}
-git fetch --tags
+#git fetch --tags
if [[ "$1" == "delete" || "$1" == "del" ]]; then
echo "Deleting tag v$ver .."
git tag -d v$ver
git push origin :refs/tags/v$ver
exit
fi
+echo "Tagging with v$ver"
git tag -a "v$ver" -m "release $ver"
git push --tags
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/compare/393dcda1b32d83199ad144655608aaf038ce6144...ef25b5a211f12f0b059da6834968a0b601c3128a
--
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/compare/393dcda1b32d83199ad144655608aaf038ce6144...ef25b5a211f12f0b059da6834968a0b601c3128a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210117/7d22013a/attachment-0001.html>
More information about the debian-med-commit
mailing list