[med-svn] [Git][med-team/gffread][upstream] New upstream version 0.12.4

Andreas Tille gitlab at salsa.debian.org
Sun Jan 17 17:11:55 GMT 2021



Andreas Tille pushed to branch upstream at Debian Med / gffread


Commits:
8f84d6a4 by Andreas Tille at 2021-01-17T18:02:18+01:00
New upstream version 0.12.4
- - - - -


6 changed files:

- Makefile
- gff_utils.cpp
- gff_utils.h
- gffread.cpp
- prep_source.sh
- tag_git.sh


Changes:

=====================================
Makefile
=====================================
@@ -11,7 +11,7 @@ LDFLAGS := $(if $(LDFLAGS),$(LDFLAGS),-g)
 
 BASEFLAGS  := -Wall -Wextra ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \
 -D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \
- -std=c++0x -fno-exceptions -fno-rtti
+ -std=c++11 -fno-exceptions -fno-rtti
 
 GCCV8 := $(shell expr `${CXX} -dumpversion | cut -f1 -d.` \>= 8)
 ifeq "$(GCCV8)" "1"
@@ -83,7 +83,7 @@ all release debug memcheck memdebug profile gprof prof: ../gclib gffread
 $(OBJS) : $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
 gffread.o : gff_utils.h $(GCLDIR)/GBase.h $(GCLDIR)/gff.h
 gff_utils.o : gff_utils.h $(GCLDIR)/gff.h
-${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh ${GCLDIR}/GHash.hh
+${GCLDIR}/gff.o : ${GCLDIR}/gff.h ${GCLDIR}/GFaSeqGet.h ${GCLDIR}/GList.hh
 ${GCLDIR}/GFaSeqGet.o : ${GCLDIR}/GFaSeqGet.h
 gffread: $(OBJS) gffread.o
 	${LINKER} ${LDFLAGS} -o $@ ${filter-out %.a %.so, $^} ${LIBS}


=====================================
gff_utils.cpp
=====================================
@@ -1,6 +1,6 @@
 #include "gff_utils.h"
 
-GHash<GeneInfo> gene_ids;
+GHash<GeneInfo*> gene_ids;
 
 bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose
 bool debugMode=false;
@@ -14,9 +14,13 @@ int wPadding = 0; //padding for -w option
 FILE* f_x=NULL; //writing fasta with spliced CDS
 FILE* f_y=NULL; //wrting fasta with translated CDS
 
+FILE* f_j=NULL; //wrting junctions (intron coordinates)
 
 int maxintron=999000000;
 
+ID_Flt_Type IDflt=idFlt_None;
+
+bool TFilters=false;
 bool wCDSonly=false;
 bool wNConly=false;
 int minLen=0; //minimum transcript length
@@ -45,6 +49,7 @@ uint rfltEnd=MAX_UINT;
 bool rfltWithin=false; //check for full containment within given range
 bool addDescr=false;
 
+bool wfaNoCDS=false;
 
 bool fmtGFF3=true; //default output: GFF3
 //other formats only make sense in transcriptOnly mode
@@ -57,9 +62,12 @@ GffPrintMode exonPrinting=pgffAny;
 
 GFastaDb gfasta;
 
-GHash<SeqInfo> seqinfo;
+GHash<SeqInfo*> seqinfo;
 GVec<CTableField> tableCols;
-GHash<RefTran> reftbl;
+GHash<RefTran*> reftbl;
+GStrSet<> fltIDs;
+
+GStrSet<> attrList;
 
 GHash<int> isoCounter; //counts the valid isoforms
 
@@ -353,7 +361,27 @@ void printTableData(FILE* f, GffObj& g, bool inFasta) {
 	}
 	fprintf(f,"\n");
 }
+
 bool GffLoader::validateGffRec(GffObj* gffrec) {
+	if (!checkFilters(gffrec)) {
+		if (gffrec->isTranscript()) {
+			TFilters=true;
+			if (gffrec->parent!=NULL && keepGenes) {
+			   GPVec<GffObj>& pchildren=gffrec->parent->children;
+			   for (int c=0;c<pchildren.Count();c++) {
+				   if (pchildren[c]==gffrec) {
+					   pchildren.Delete(c);
+					   break;
+				   }
+			   }
+			}
+		}
+		return false;
+	} //transcript rejected
+	return true;
+}
+
+bool GffLoader::checkFilters(GffObj* gffrec) {
 	if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
 		GStr refname(gffrec->getRefName());
 		RefTran* rt=reftbl.Find(refname.chars());
@@ -379,6 +407,12 @@ bool GffLoader::validateGffRec(GffObj* gffrec) {
 		//GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
 		return false;
 	}
+	if (IDflt) {
+		if (fltIDs.hasKey(gffrec->getID())) {
+			if (IDflt==idFlt_Exclude) return false;
+		}
+		else if (IDflt==idFlt_Only) return false;
+	}
 	if (minLen>0 && gffrec->covlen<minLen) {
 		if (verbose)
 			GMessage("Info: %s discarded due to minimum length threshold %d\n",
@@ -406,13 +440,22 @@ bool GffLoader::validateGffRec(GffObj* gffrec) {
 			}
 		}
 	}
-	if (multiExon && gffrec->exons.Count()<=1) {
-		return false;
-	}
-	if (wCDSonly && gffrec->CDstart==0) {
-		return false;
+	if (this->attrsFilter) { //mostly relevant for transcripts and gene records
+		//remove attributes that are not in attrList
+		gffrec->removeAttrs(attrList);
 	}
-	if (wNConly && gffrec->hasCDS()) return false;
+
+    if (gffrec->isTranscript()) {    // && TFilters) ?
+    	//these filters only apply to transcripts
+		if (multiExon && gffrec->exons.Count()<=1) {
+			return false;
+		}
+		if (wCDSonly && gffrec->CDstart==0) {
+			return false;
+		}
+		if (wNConly && gffrec->hasCDS()) return false;
+		return process_transcript(gfasta, *gffrec);
+    } //transcript filters check
 	return true;
 }
 
@@ -451,13 +494,13 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
           }
       }
  if (gname && strcmp(gname, gffrec.getID())!=0) {
-   int* isonum=isoCounter.Find(gname);
-   if  (isonum==NULL) {
-       isonum=new int(1);
-       isoCounter.Add(gname,isonum);
-       }
-      else (*isonum)++;
-   //defline.appendfmt(" gene=%s", gname);
+	   int* isonum=isoCounter.Find(gname);
+	   if  (isonum==NULL) {
+		   //isonum=new int(1);
+		   isoCounter.Add(gname,1);
+	   }
+		else (*isonum)++;
+	   //defline.appendfmt(" gene=%s", gname);
    }
   int seqlen=0;
 
@@ -533,6 +576,7 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
     int phaseNum=0;
   CDS_CHECK:
     uint cds_olen=0;
+    inframeStop=false;
     cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
     //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
     if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
@@ -694,7 +738,7 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
 
 	  GStr defline(gffrec.getID());
 	  if (exont!=NULL) {
-		  if (gffrec.CDstart>0) {
+		  if (!wfaNoCDS && gffrec.CDstart>0) {
 			  defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
 		  }
 		  if (writeExonSegs) {
@@ -1169,9 +1213,9 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
 		GenomicSeqData* gdata=ref_data[g];
 		for (int l=0;l<gdata->loci.Count();l++) {
 			GffLocus& loc=*(gdata->loci[l]);
-			GHash<int> gnames(true); //gene names in this locus
+			GHash<int> gnames; //gene names in this locus
 			//GHash<int> geneids(true); //Entrez GeneID: numbers
-			GHash<int> geneids(true);
+			GHash<int> geneids;
 			int fstrand=0,rstrand=0,ustrand=0;
 			for (int i=0;i<loc.rnas.Count();i++) {
 				GffObj& t=*(loc.rnas[i]);
@@ -1185,11 +1229,14 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
 					gname.upper();
 					int* prevg=gnames.Find(gname.chars());
 					if (prevg!=NULL) (*prevg)++;
-					else gnames.Add(gname, new int(1));
+					else gnames.Add(gname.chars(), 1);
 				}
 				GStr geneid(t.getGeneID());
-				if (!geneid.is_empty())
-					geneids.Add(geneid.chars());
+				if (!geneid.is_empty()) {
+					int* prevg=gnames.Find(geneid.chars());
+					if (prevg!=NULL) (*prevg)++;
+					geneids.Add(geneid.chars(), 1);
+				}
 				//parse GeneID xrefs, if any (RefSeq):
 				/*
 				GStr xrefs(t.getAttr("xrefs"));
@@ -1220,11 +1267,13 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
 						gname.upper();
 						int* prevg=gnames.Find(gname.chars());
 						if (prevg!=NULL) (*prevg)++;
-						else gnames.Add(gname, new int(1));
+						else gnames.Add(gname, 1);
 					}
 					GStr geneid(nt.getID());
 					if (!geneid.is_empty()) {
-						geneids.Add(geneid.chars(), new int(1));
+						int* prevg=gnames.Find(geneid.chars());
+						if (prevg!=NULL) (*prevg)++;
+						geneids.Add(geneid.chars(),1);
 					}
 				}
 				//parse GeneID xrefs, if any (RefSeq):
@@ -1258,18 +1307,18 @@ void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
 			loc.locus_num=locus_num;
 			if (gnames.Count()>0) { //collect all gene names associated to this locus
 				gnames.startIterate();
-				int* gfreq=NULL;
-				char* key=NULL;
-				while ((gfreq=gnames.NextData(key))!=NULL) {
-					loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
+				int gfreq=0;
+				const char* key=NULL;
+				while ((key=gnames.Next(gfreq))!=NULL) {
+					loc.gene_names.AddIfNew(new CGeneSym(key, gfreq));
 				}
 			} //added collected gene_names
 			if (geneids.Count()>0) { //collect all GeneIDs names associated to this locus
 				geneids.startIterate();
-				int* gfreq=NULL;
-				char* key=NULL;
-				while ((gfreq=geneids.NextData(key))!=NULL) {
-					loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
+				int gfreq=0;
+				const char* key=NULL;
+				while ((key=geneids.Next(gfreq))!=NULL) {
+					loc.gene_ids.AddIfNew(new CGeneSym(key, gfreq));
 				}
 			}
 		} //for each locus
@@ -1311,6 +1360,21 @@ GenomicSeqData* getGSeqData(GList<GenomicSeqData>& seqdata, int gseq_id) {
 void warnPseudo(GffObj& m) {
 	GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID());
 }
+void GffLoader::collectIntrons(GffObj& t) {
+	// assume t are coming grouped by chromosome and sorted by start coordinate!
+	if (intronList.jlst.Count()>0) {
+         if (t.gseq_id!=intronList.gseq_id ||
+        		 t.start>=intronList.jlst.Last()->start) {
+        	intronList.print(f_j);
+        	intronList.clear();
+         }
+         else if (t.start<intronList.last_t_start)
+        	 GError("Error collectIntrons(%s) called when last_t_start was %d\n",
+        			 t.getID(), intronList.last_t_start );
+         //add this transcript's introns
+	}
+    intronList.add(t);
+}
 
 void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsecomment) {
 	if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n");
@@ -1326,6 +1390,7 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
 	gffr->keepGenes(keepGenes);
 	gffr->setIgnoreLocus(ignoreLocus);
 	gffr->setRefAlphaSorted(this->sortRefsAlpha);
+	gffr->procEnsemblID(this->ensemblProc);
 	if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment);
     int outcounter=0;
 	if (streamIn) { //this will ignore any clustering options
@@ -1335,22 +1400,27 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
 				delete t;
 				continue;
 			}
-			if (process_transcript(gfasta, *t)) {
-				outcounter++;
-				if (f_out) {
-				  if (fmtTable)
-						printTableData(f_out, *t);
-				  else //GFF3, GTF, BED, TLF
-					t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
-				}
+
+			if (f_j!=NULL && t->isTranscript() && t->exons.Count()>1)
+				collectIntrons(*t);
+
+			outcounter++;
+			if (f_out) {
+			  if (fmtTable)
+					printTableData(f_out, *t);
+			  else //GFF3, GTF, BED, TLF
+				t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
 			}
 			delete t;
 		}
+		if (f_j && intronList.jlst.Count()>0) {
+        	intronList.print(f_j);
+        	intronList.clear();
+		}
 		delete gffr;
 		return;
 	}
 
-
 	gffr->readAll();
 	GVec<int> pseudoFeatureIds; //feature type: pseudo*
 	GVec<int> pseudoAttrIds;  // attribute: [is]pseudo*=true/yes/1
@@ -1438,9 +1508,12 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
 			m->subftype_id=gff_fid_exon;
 		}
 		//GList<GffObj> gfadd(false,false); -- for gf_validate()?
-		if (!validateGffRec(m)) {
+		if (!validateGffRec(m)) { //this will also apply process_transcript() CDS filters etc.
 			continue;
 		}
+		if (f_j!=NULL && m->isTranscript() && m->exons.Count()>1)
+			collectIntrons(*m);
+
 		m->isUsed(true); //so the gffreader won't destroy it
 		GenomicSeqData* gdata=getGSeqData(seqdata, m->gseq_id);
 		bool keep=placeGf(m, gdata);
@@ -1450,6 +1523,10 @@ void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsec
 			//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
 		}
 	} //for each read gffObj
+	if (f_j && intronList.jlst.Count()>0) {
+    	intronList.print(f_j);
+    	intronList.clear();
+	}
 	//if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
 	//if (f && f!=stdin) { fclose(f); f=NULL; }
 	delete gffr;


=====================================
gff_utils.h
=====================================
@@ -17,10 +17,26 @@ extern int wPadding; //padding for -w option
 extern FILE* f_x; //writing fasta with spliced CDS
 extern FILE* f_y; //wrting fasta with translated CDS
 
+extern FILE* f_j; //wrting junctions (introns)
+
+
+extern bool TFilters;
+
+extern bool wfaNoCDS;
+
 extern int maxintron;
 
 extern bool wCDSonly;
 extern bool wNConly;
+
+enum ID_Flt_Type {
+	idFlt_None=0,
+	idFlt_Only,
+	idFlt_Exclude
+};
+
+extern ID_Flt_Type IDflt;
+
 extern int minLen; //minimum transcript length
 extern bool validCDSonly; // translation with no in-frame STOP
 extern bool bothStrands; //for single-exon mRNA validation, check the other strand too
@@ -105,15 +121,14 @@ class RefTran {
 };
 
 extern GFastaDb gfasta;
-extern GHash<SeqInfo> seqinfo;
+extern GHash<SeqInfo*> seqinfo;
 extern GHash<int> isoCounter; //counts the valid isoforms
-extern GHash<RefTran> reftbl;
+extern GHash<RefTran*> reftbl;
 
 char* getSeqDescr(char* seqid);
 char* getSeqName(char* seqid);
 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL);
 void printTableData(FILE* f, GffObj& g, bool inFasta=false);
-bool validateGffRec(GffObj* gffrec);
 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec);
 
 enum ETableFieldType {
@@ -144,11 +159,84 @@ class CTableField {
 
 
 extern GVec<CTableField> tableCols; //table output format fields
+extern GStrSet<> attrList;
+extern GStrSet<> fltIDs;
 
 class GffLocus;
 class GenomicSeqData;
 class GeneInfo;
 
+struct CIntronData:public GSeg {
+	char strand;//'.' < '-' < '+' (reverse ASCII order)
+	GVec<GStr> ts; //list of transcript IDs sharing this intron
+	CIntronData(uint istart, uint iend, char tstrand, const char* t_id=NULL):GSeg(istart, iend),
+			strand(tstrand) {
+		if (t_id!=NULL) {
+			GStr tid(t_id);
+		    ts.Add(tid);
+		}
+	}
+	void add(const char* t_id) {
+		 GStr tid(t_id);
+		 ts.Add(tid);
+	}
+	bool operator==(CIntronData& d){
+	  return (start==d.start && end==d.end && strand==d.strand);
+	}
+	bool operator<(CIntronData& d){
+	 if (start==d.start) {
+		 if (end==d.end) return strand>d.strand;
+			else return (end<d.end);
+	  }
+	  else return (start<d.start);
+	}
+
+};
+
+struct CIntronList {
+	int gseq_id;
+	uint last_t_start; //just to check if input is sorted properly!
+	GList<CIntronData> jlst;
+	CIntronList():gseq_id(-1),last_t_start(0), jlst(true, true) {}
+	void add(GffObj& t) { //add all introns of t to jlst
+		if (t.exons.Count()<2) return; //nothing to do
+		if (gseq_id>=0 && gseq_id!=t.gseq_id)
+			GError("Error: CIntronList::add(%s) on different ref seq!\n", t.getID());
+		gseq_id=t.gseq_id;
+		for (int i=1;i<t.exons.Count();++i) {
+		  CIntronData* nintr = new CIntronData(t.exons[i-1]->end+1,
+				  t.exons[i]->start-1, t.strand, t.getID());
+		  int fidx=-1;
+		  CIntronData* xintr=jlst.AddIfNew(nintr, true, &fidx);
+		  if (xintr!=nintr) {
+			  //nintr already exists,it was deallocated
+			  xintr->add(t.getID());
+		  }
+		  last_t_start=t.start;
+		} //for each intron
+	}
+	void clear() {
+		gseq_id=-1;
+		last_t_start=0;
+		jlst.Clear();
+	}
+	void print(FILE* f) {
+		//simple tab delimited format: chr, start, end, strand, transcriptIDs comma delimited
+		const char* gseqname=GffObj::names->gseqs.getName(gseq_id);
+		for (int i=0;i<jlst.Count();++i) {
+			CIntronData& idata=*(jlst[i]);
+			fprintf(f,"%s\t%d\t%d\t%c\t",gseqname, idata.start, idata.end, idata.strand);
+			if (idata.ts.Count()>1)
+				idata.ts.Sort();
+			for (int t=0;t<idata.ts.Count();t++) {
+				if (t) fprintf(f, ",%s", idata.ts[t].chars());
+				  else fprintf(f,  "%s", idata.ts[t].chars());
+			}
+			fprintf(f, "\n");
+		}
+	}
+};
+
 class GTData { // transcript associated data
  public:
    GffObj* rna;
@@ -662,6 +750,7 @@ class GffLoader {
   GStr fname;
   FILE* f;
   GffNames* names;
+  CIntronList intronList; // collect introns for -j output
   union {
 	  unsigned int options;
 	  struct {
@@ -686,10 +775,12 @@ class GffLoader {
 		bool dOvlSET:1; //discard overlapping Single Exon Transcripts on any strand
 		bool forceExons:1;
 		bool streamIn:1;
+		bool ensemblProc:1;
+		bool attrsFilter:1;
 	  };
   };
 
-  GffLoader():fname(),f(NULL), names(NULL), options(0) {
+  GffLoader():fname(),f(NULL), names(NULL), intronList(), options(0) {
       transcriptsOnly=true;
       gffnames_ref(GffObj::names);
       names=GffObj::names;
@@ -712,11 +803,15 @@ class GffLoader {
   }
 
   bool validateGffRec(GffObj* gffrec);
+  bool checkFilters(GffObj* gffrec);
+
+  void collectIntrons(GffObj& t); //for -j output
 
   void load(GList<GenomicSeqData>&seqdata, GFFCommentParser* gf_parsecomment=NULL);
 
   bool placeGf(GffObj* t, GenomicSeqData* gdata);
 
+
   bool unsplContained(GffObj& ti, GffObj&  tj);
   GffObj* redundantTranscripts(GffObj& ti, GffObj&  tj);
 


=====================================
gffread.cpp
=====================================
@@ -4,24 +4,25 @@
 #define __STDC_FORMAT_MACROS
 #include <inttypes.h>
 
-#define VERSION "0.12.1"
+#define VERSION "0.12.4"
 
 #define USAGE "gffread v" VERSION ". Usage:\n\
-gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
+gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] \n\
  [-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
  [-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
- [-i <maxintron>] [--stream] [--bed] [--table <attrlist>] [--sort-by <ref.lst>]\n\
- \n\
+ [--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>]\n\
+ [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>]\n\
+ [<input_gff>] \n\n\
  Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n\
  transcripts (exon or CDS) and more.\n\
  By default (i.e. without -O) only transcripts are processed, discarding any\n\
  other non-transcript features. Default output is a simplified GFF3 with only\n\
  the basic attributes.\n\
  \n\
- <input_gff> is a GFF file, use '-' for stdin\n\
- \n\
 Options:\n\
  -i   discard transcripts having an intron larger than <maxintron>\n\
+ --ids discard records/transcripts if their IDs are not listed in <IDs.lst>\n\
+ --nids discard records/transcripts if their IDs are listed in <IDs.lst>\n\
  -l   discard transcripts shorter than <minlen> bases\n\
  -r   only show transcripts overlapping coordinate range <start>..<end>\n\
       (on chromosome/contig <chr>, strand <strand> if provided)\n\
@@ -42,11 +43,13 @@ Sorting: (by default, chromosomes are kept in the order they were found)\n\
  --sort-by : sort the reference sequences by the order in which their\n\
       names are given in the <refseq.lst> file\n\
 Misc options: \n\
- -F   preserve all GFF attributes (for non-exon features)\n\
+ -F   keep all GFF attributes (for non-exon features)\n\
  --keep-exon-attrs : for -F option, do not attempt to reduce redundant\n\
       exon/CDS attributes\n\
  -G   do not keep exon attributes, move them to the transcript feature\n\
       (for GFF3 output)\n\
+ --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list>\n\
+    which is a comma delimited list of attribute names to\n\
  --keep-genes : in transcript-only mode (default), also preserve gene records\n\
  --keep-comments: for GFF3 input/output, try to preserve comments\n\
  -O   process other non-transcript GFF records (by default non-transcript\n\
@@ -78,7 +81,7 @@ Misc options: \n\
            ((no sorting, exons must be grouped by transcript in the input data)\n\
 Clustering:\n\
  -M/--merge : cluster the input transcripts into loci, discarding\n\
-      \"duplicated\" transcripts (those with the same exact introns\n\
+      \"redundant\" transcripts (those with the same exact introns\n\
       and fully contained or equal boundaries)\n\
  -d <dupinfo> : for -M option, write duplication info to file <dupinfo>\n\
  --cluster-only: same as -M/--merge but without discarding any of the\n\
@@ -102,24 +105,26 @@ Output options:\n\
  -g   full path to a multi-fasta file with the genomic sequences\n\
       for all input mappings, OR a directory with single-fasta files\n\
       (one per genomic sequence, with file names matching sequence names)\n\
+ -j    write a tab delimited file with all the junctions (intron coordinates)\n\
  -w    write a fasta file with spliced exons for each transcript\n\
  --w-add <N> for the -w option, extract additional <N> bases\n\
        both upstream and downstream of the transcript boundaries\n\
+ --w-nocds for -w, disable the output of CDS info in the FASTA file\n\
  -x    write a fasta file with spliced CDS for each GFF transcript\n\
  -y    write a protein fasta file with the translation of CDS for each record\n\
  -W    for -w and -x options, write in the FASTA defline all the exon\n\
        coordinates projected onto the spliced sequence;\n\
  -S    for -y option, use '*' instead of '.' as stop codon translation\n\
- -L    Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
+ -L    Ensembl GTF to GFF3 conversion, adds version to IDs\n\
  -m    <chr_replace> is a name mapping table for converting reference \n\
        sequence names, having this 2-column format:\n\
        <original_ref_ID> <new_ref_ID>\n\
  -t    use <trackname> in the 2nd column of each GFF/GTF output line\n\
- -o    write the records into <outfile> instead of stdout\n\
+ -o    write the output records into <outfile> instead of stdout\n\
  -T    main output will be GTF instead of GFF3\n\
  --bed output records in BED format instead of default GFF3\n\
  --tlf output \"transcript line format\" which is like GFF\n\
-       but exons, CDS features and related data are stored as GFF \n\
+       but with exons and CDS related features stored as GFF \n\
        attributes in the transcript feature line, like this:\n\
          exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> \n\
        <exons> is a comma-delimited list of exon_start-exon_end coordinates;\n\
@@ -136,61 +141,11 @@ Output options:\n\
        problems with the given GFF/GTF records\n\
 "
 
-/*
-FILE* ffasta=NULL;
-FILE* f_in=NULL;
-FILE* f_out=NULL;
-FILE* f_w=NULL; //writing fasta with spliced exons (transcripts)
-int wPadding = 0; //padding for -w option
-FILE* f_x=NULL; //writing fasta with spliced CDS
-FILE* f_y=NULL; //wrting fasta with translated CDS
-
-bool wCDSonly=false;
-bool wNConly=false;
-int minLen=0; //minimum transcript length
-bool validCDSonly=false; // translation with no in-frame STOP
-bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
-bool altPhases=false; //if original phase fails translation validation,
-                     //try the other 2 phases until one makes it
-bool addCDSattrs=false;
-bool add_hasCDS=false;
-//bool streamIn=false; // --stream option
-bool adjustStop=false; //automatic adjust the CDS stop coordinate
-bool covInfo=false; // --cov-info : only report genome coverage
-GStr tableFormat; //list of "attributes" to print in tab delimited format
-bool spliceCheck=false; //only known splice-sites
-bool decodeChars=false; //decode url-encoded chars in attrs (-D)
-bool StarStop=false; //use * instead of . for stop codon translation
-bool fullCDSonly=false; // starts with START, ends with STOP codon
-*/
-
 GStr sortBy; //file name with chromosomes listed in the desired order
 
 bool BEDinput=false;
 bool TLFinput=false;
 
-//--- output options
-/*
-extern bool fmtGFF3=true; //default output: GFF3
-//other formats only make sense in transcriptOnly mode
-extern bool fmtGTF=false;
-extern bool fmtBED=false;
-extern bool fmtTLF=false;
-extern bool fmtTable=false;
-
-
-bool multiExon=false;
-bool writeExonSegs=false;
-char* tracklabel=NULL;
-char* rfltGSeq=NULL;
-char rfltStrand=0;
-uint rfltStart=0;
-uint rfltEnd=MAX_UINT;
-bool rfltWithin=false; //check for full containment within given range
-bool addDescr=false;
-
-*/
-
 //bool protmap=false;
 //int maxintron=999000000;
 //bool mergeCloseExons=false;
@@ -200,7 +155,22 @@ GffLoader gffloader;
 
 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
 
-void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
+void loadIDlist(FILE* f, GStrSet<> & idhash) {
+  GLineReader fr(f);
+  while (!fr.isEof()) {
+      char* line=fr.getLine();
+      if (line==NULL) break;
+      if (line[0]=='#') continue; //skip comments
+      GDynArray<char*> ids;
+      strsplit(line, ids);
+      for (uint i=0;i<ids.Count();i++) {
+    	  if (strlen(ids[i])>0)
+    		  idhash.Add(ids[i]);
+      }
+  }
+}
+
+void loadSeqInfo(FILE* f, GHash<SeqInfo*> &si) {
   GLineReader fr(f);
   while (!fr.isEof()) {
       char* line=fr.getLine();
@@ -230,23 +200,33 @@ void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
       } //while lines
 }
 
+void getAttrList(GStr& s) {
+	 if (s.is_empty()) return;
+	 s.startTokenize(",;:", tkCharSet);
+	 GStr w;
+	 while (s.nextToken(w)) {
+		  if (w.length()>0)
+    	    attrList.Add(w.chars());
+	 }
+}
+
 void setTableFormat(GStr& s) {
 	 if (s.is_empty()) return;
 	 GHash<ETableFieldType> specialFields;
-	 specialFields.Add("chr", new ETableFieldType(ctfGFF_chr));
-	 specialFields.Add("id", new ETableFieldType(ctfGFF_ID));
-	 specialFields.Add("geneid", new ETableFieldType(ctfGFF_geneID));
-	 specialFields.Add("genename", new ETableFieldType(ctfGFF_geneName));
-	 specialFields.Add("parent", new ETableFieldType(ctfGFF_Parent));
-	 specialFields.Add("feature", new ETableFieldType(ctfGFF_feature));
-	 specialFields.Add("start", new ETableFieldType(ctfGFF_start));
-	 specialFields.Add("end", new ETableFieldType(ctfGFF_end));
-	 specialFields.Add("strand", new ETableFieldType(ctfGFF_strand));
-	 specialFields.Add("numexons", new ETableFieldType(ctfGFF_numexons));
-	 specialFields.Add("exons", new ETableFieldType(ctfGFF_exons));
-	 specialFields.Add("cds", new ETableFieldType(ctfGFF_cds));
-	 specialFields.Add("covlen", new ETableFieldType(ctfGFF_covlen));
-	 specialFields.Add("cdslen", new ETableFieldType(ctfGFF_cdslen));
+	 specialFields.Add("chr", ctfGFF_chr);
+	 specialFields.Add("id", ctfGFF_ID);
+	 specialFields.Add("geneid", ctfGFF_geneID);
+	 specialFields.Add("genename", ctfGFF_geneName);
+	 specialFields.Add("parent", ctfGFF_Parent);
+	 specialFields.Add("feature", ctfGFF_feature);
+	 specialFields.Add("start", ctfGFF_start);
+	 specialFields.Add("end", ctfGFF_end);
+	 specialFields.Add("strand", ctfGFF_strand);
+	 specialFields.Add("numexons", ctfGFF_numexons);
+	 specialFields.Add("exons", ctfGFF_exons);
+	 specialFields.Add("cds", ctfGFF_cds);
+	 specialFields.Add("covlen", ctfGFF_covlen);
+	 specialFields.Add("cdslen", ctfGFF_cdslen);
 
 	 s.startTokenize(" ,;.:", tkCharSet);
 	 GStr w;
@@ -287,7 +267,7 @@ void setTableFormat(GStr& s) {
 	 }
 }
 
-void loadRefTable(FILE* f, GHash<RefTran>& rt) {
+void loadRefTable(FILE* f, GHash<RefTran*>& rt) {
   GLineReader fr(f);
   char* line=NULL;
   while ((line=fr.getLine())) {
@@ -417,15 +397,17 @@ void shutDown() {
 	FWCLOSE(f_w);
 	FWCLOSE(f_x);
 	FWCLOSE(f_y);
+	FWCLOSE(f_j);
 }
 
 int main(int argc, char* argv[]) {
  GArgs args(argc, argv,
    "version;debug;merge;stream;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;"
-    "sort-alpha;keep-genes;w-add=;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
-    "ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:d:");
+    "sort-alpha;keep-genes;w-nocds;attrs=;w-add=;ids=;nids=0;gtf;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
+    "ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:j:d:");
  args.printError(USAGE, true);
- if (args.getOpt('h') || args.getOpt("help")) {
+ int numfiles = args.startNonOpt();
+ if (args.getOpt('h') || args.getOpt("help") || ( numfiles==0 && !hasStdInput())) {
     GMessage("%s",USAGE);
     exit(1);
  }
@@ -448,9 +430,9 @@ int main(int argc, char* argv[]) {
  if (adjustStop) addCDSattrs=true;
  validCDSonly=(args.getOpt('V')!=NULL);
  altPhases=(args.getOpt('H')!=NULL);
- fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
- fmtBED=(args.getOpt("bed")!=NULL);
- fmtTLF=(args.getOpt("tlf")!=NULL);
+ fmtGTF=(args.getOpt('T')!=NULL || args.getOpt("gtf")!=NULL); //switch output format to GTF
+ fmtBED=(args.getOpt("bed")!=NULL); //BED output
+ fmtTLF=(args.getOpt("tlf")!=NULL); //TLF output
  if (fmtGTF || fmtBED || fmtTLF) {
 	 if (!gffloader.transcriptsOnly) {
 		 GMessage("Error: option -O is only supported with GFF3 output");
@@ -510,7 +492,7 @@ int main(int argc, char* argv[]) {
      args.printCmdLine(stderr);
      }
  if (args.getOpt("version")) {
-  GMessage(VERSION"\n");
+  printf(VERSION"\n");
   exit(0);
  }
  gffloader.fullAttributes=(args.getOpt('F')!=NULL);
@@ -529,8 +511,8 @@ int main(int argc, char* argv[]) {
 	 gffloader.gatherExonAttrs=true;
 	 gffloader.fullAttributes=true;
  }
- ensembl_convert=(args.getOpt('L')!=NULL);
- if (ensembl_convert) {
+ gffloader.ensemblProc=(args.getOpt('L')!=NULL);
+ if (gffloader.ensemblProc) {
     gffloader.fullAttributes=true;
     gffloader.gatherExonAttrs=false;
     //sortByLoc=true;
@@ -558,7 +540,7 @@ int main(int argc, char* argv[]) {
  if (!s.is_empty()) maxintron=s.asInt();
  s=args.getOpt('l');
  if (!s.is_empty()) minLen=s.asInt();
-
+ TFilters=(multiExon || wCDSonly || wNConly); //TODO: all transcript filters should be included here through validateGffRec()
  FILE* f_repl=NULL; //duplicate/collapsing info output file
  s=args.getOpt('d');
  if (!s.is_empty()) {
@@ -569,6 +551,12 @@ int main(int argc, char* argv[]) {
    }
  }
 
+ s=args.getOpt("attrs");
+ if (!s.is_empty()) {
+	 getAttrList(s);
+	 gffloader.attrsFilter=(attrList.Count()>1);
+	 gffloader.fullAttributes=true;
+ }
  rfltWithin=(args.getOpt('R')!=NULL);
  s=args.getOpt('r');
  if (!s.is_empty()) {
@@ -621,8 +609,25 @@ int main(int argc, char* argv[]) {
    if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
    loadSeqInfo(fsize, seqinfo);
    fclose(fsize);
+ }
+ s=args.getOpt("ids");
+ if (s.is_empty()) {
+	 s=args.getOpt("nids");
+	 if (!s.is_empty())
+		 IDflt=idFlt_Exclude;
+ } else {
+	 IDflt=idFlt_Only;
+ }
+ if (!s.is_empty()) {
+   FILE* f=fopen(s,"r");
+   if (f==NULL) GError("Error opening ID list file: %s\n",s.chars());
+   loadIDlist(f, fltIDs);
+   if (fltIDs.Count()==0) {
+	   GMessage("Warning: no IDs were loaded from file %s\n", s.chars());
+	   IDflt=idFlt_None;
    }
-
+   fclose(f);
+ }
  openfw(f_out, args, 'o');
  //if (f_out==NULL) f_out=stdout;
  if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
@@ -630,19 +635,22 @@ int main(int argc, char* argv[]) {
  openfw(f_w, args, 'w');
  openfw(f_x, args, 'x');
  openfw(f_y, args, 'y');
+ openfw(f_j, args, 'j');
  s=args.getOpt("w-add");
  if (!s.is_empty()) {
 	 if (f_w==NULL) GError("Error: --w-add option requires -w option!\n");
 	 wPadding=s.asInt();
  }
 
+ if (f_w!=NULL && args.getOpt("w-nocds"))
+	 wfaNoCDS=true;
+
  if (f_out==NULL && f_w==NULL && f_x==NULL && f_y==NULL && !covInfo)
 	 f_out=stdout;
 
  //if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
  //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
 
- int numfiles = args.startNonOpt();
  //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
  int out_counter=0; //number of records printed
 
@@ -726,8 +734,8 @@ int main(int argc, char* argv[]) {
        bool firstLocusPrint=true;
        GffLocus& loc=*(gdata->loci[l]);
        //check all non-replaced transcripts in this locus:
-       int numvalid=0;
-       int idxfirstvalid=-1;
+       //int numvalid=0;
+       //int idxfirstvalid=-1;
        for (int i=0;i<loc.rnas.Count();i++) {
          GffObj& t=*(loc.rnas[i]);
          GTData* tdata=(GTData*)(t.uptr);
@@ -752,15 +760,16 @@ int main(int argc, char* argv[]) {
          //restore strand for dOvlSET
          char orig_strand=T_OSTRAND(t.udata);
          if (orig_strand!=0) t.strand=orig_strand;
-
+         /* -- transcripts are filtered upon loading
          if (process_transcript(gfasta, t)) {
              numvalid++;
              if (idxfirstvalid<0) idxfirstvalid=i;
          }
+         */
        } //for each transcript
 
        int rnas_i=0;
-       if (idxfirstvalid>=0) rnas_i=idxfirstvalid;
+       //if (idxfirstvalid>=0) rnas_i=idxfirstvalid;
        int gfs_i=0;
        if (f_out) {
            GStr locname("RLOC_");
@@ -775,7 +784,8 @@ int main(int argc, char* argv[]) {
 					   if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
 					   if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; }
 					   if (firstLocusPrint) {
-						   loc.print(f_out, idxfirstvalid, locname, loctrack);
+						   //loc.print(f_out, idxfirstvalid, locname, loctrack);
+						   loc.print(f_out, 0, locname, loctrack);
 						   firstLocusPrint=false;
 					   }
 					   printGffObj(f_out, loc.gfs[gfs_i], locname, exonPrinting, out_counter);
@@ -789,7 +799,8 @@ int main(int argc, char* argv[]) {
 					     if (firstGff3Print) { printGff3Header(f_out, args); firstGff3Print=false; }
 					     if (firstGSeqHeader) { printGSeqHeader(f_out, gdata); firstGSeqHeader=false; }
 					     if (firstLocusPrint) {
-					    	 loc.print(f_out, idxfirstvalid, locname, loctrack);
+					    	 //loc.print(f_out, idxfirstvalid, locname, loctrack);
+					    	 loc.print(f_out, 0, locname, loctrack);
 					    	 firstLocusPrint=false;
 					     }
 				       }
@@ -804,7 +815,7 @@ int main(int argc, char* argv[]) {
    } //if Clustering enabled
   else { //no clustering
    //not grouped into loci, print the rnas with their parents, if any
-   int numvalid=0;
+   //int numvalid=0;
    for (int g=0;g<g_data.Count();g++) {
      GenomicSeqData* gdata=g_data[g];
      bool firstGSeqHeader=fmtGFF3;
@@ -815,6 +826,8 @@ int main(int argc, char* argv[]) {
          //print other non-transcript (gene?) feature that might be there before t
            while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
              GffObj& gfst=*(gdata->gfs[gfs_i]);
+             if (TFilters && gfst.isGene() && gfst.children.Count()==0) // gene with no children left, skip it if filters were applied
+            	 { ++gfs_i; continue; }
              if T_PRINTABLE(gfst.udata) { //never printed
                T_NO_PRINT(gfst.udata);
                if (fmtGFF3) {
@@ -829,8 +842,8 @@ int main(int argc, char* argv[]) {
         }
         GTData* tdata=(GTData*)(t.uptr);
         if (tdata->replaced_by!=NULL) continue;
-        if (process_transcript(gfasta, t)) {
-           numvalid++;
+        //if (process_transcript(gfasta, t)) {
+        //   numvalid++;
            if (f_out && T_PRINTABLE(t.udata) ) {
              T_NO_PRINT(t.udata);
              if (fmtGFF3 || fmtTable || t.isTranscript()) {
@@ -859,12 +872,14 @@ int main(int argc, char* argv[]) {
 					 t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
              }
            }//GFF/GTF output requested
-        } //valid transcript
+        //} //valid transcript
      } //for each rna
      //print the rest of the isolated pseudo/gene/region features not printed yet
      if (f_out && (fmtGFF3 || fmtTable)) {
       while (gfs_i<gdata->gfs.Count()) {
          GffObj& gfst=*(gdata->gfs[gfs_i]);
+         if (TFilters && gfst.isGene() && gfst.children.Count()==0) // gene with no children left, skip it if filters were applied
+        	 { ++gfs_i; continue; }
          if T_PRINTABLE(gfst.udata) { //never printed
            T_NO_PRINT(gfst.udata);
            if (fmtGFF3) {


=====================================
prep_source.sh
=====================================
@@ -14,7 +14,7 @@ libdir=$pack/gclib/
 cp LICENSE README.md gffread.cpp gff_utils.{h,cpp} $pack/
 sed 's|\.\./gclib|./gclib|' Makefile > $pack/Makefile
 
-cp ../gclib/{GVec,GList,GHash}.hh $libdir
+cp ../gclib/{GVec,GList,GHashMap,khashl}.hh ../gclib/xxhash.h $libdir
 cp ../gclib/{GArgs,GBase,gdna,GStr,gff,codons,GFaSeqGet,GFastaIndex}.{h,cpp} $libdir
 tar cvfz $pack.tar.gz $pack
 ls -l $pack.tar.gz


=====================================
tag_git.sh
=====================================
@@ -3,12 +3,13 @@ git checkout master
 ver=$(fgrep '#define VERSION ' gffread.cpp)
 ver=${ver#*\"}
 ver=${ver%%\"*}
-git fetch --tags
+#git fetch --tags
 if [[ "$1" == "delete" || "$1" == "del" ]]; then
   echo "Deleting tag v$ver .."
   git tag -d v$ver
   git push origin :refs/tags/v$ver
   exit
 fi
+echo "Tagging with v$ver"
 git tag -a "v$ver" -m "release $ver"
 git push --tags



View it on GitLab: https://salsa.debian.org/med-team/gffread/-/commit/8f84d6a4137aa1f105b971864eb5ea314c617754

-- 
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/commit/8f84d6a4137aa1f105b971864eb5ea314c617754
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210117/490cf1a0/attachment-0001.html>


More information about the debian-med-commit mailing list