[med-svn] [Git][med-team/gffread][upstream] New upstream version 0.12.7
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Oct 13 16:16:14 BST 2021
Andreas Tille pushed to branch upstream at Debian Med / gffread
Commits:
3f7ba6a2 by Andreas Tille at 2021-10-13T15:53:52+02:00
New upstream version 0.12.7
- - - - -
5 changed files:
- Makefile
- gff_utils.cpp
- gff_utils.h
- gffread.cpp
- prep_source.sh
Changes:
=====================================
Makefile
=====================================
@@ -9,9 +9,9 @@ LINKER := $(if $(LINKER),$(LINKER),g++)
LDFLAGS := $(if $(LDFLAGS),$(LDFLAGS),-g)
-BASEFLAGS := -Wall -Wextra ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \
--D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \
- -std=c++11 -fno-exceptions -fno-rtti
+BASEFLAGS := -Wall -Wextra -std=c++11 ${SEARCHDIRS} -D_FILE_OFFSET_BITS=64 \
+ -D_LARGEFILE_SOURCE -D_REENTRANT -fno-strict-aliasing \
+ -fno-exceptions -fno-rtti
GCCV8 := $(shell expr `${CXX} -dumpversion | cut -f1 -d.` \>= 8)
ifeq "$(GCCV8)" "1"
@@ -22,8 +22,12 @@ CXXFLAGS := $(if $(CXXFLAGS),$(BASEFLAGS) $(CXXFLAGS),$(BASEFLAGS))
ifneq (,$(filter %release %static, $(MAKECMDGOALS)))
# -- release build
- CXXFLAGS := -g -O3 -DNDEBUG $(CXXFLAGS)
-else
+ LIBS :=
+ ifneq (,$(findstring static,$(MAKECMDGOALS)))
+ LDFLAGS += -static-libstdc++ -static-libgcc
+ endif
+ CXXFLAGS := -O3 -DNDEBUG $(CXXFLAGS)
+else #debug builds
ifneq (,$(filter %profile %gprof %prof, $(MAKECMDGOALS)))
CXXFLAGS += -pg -O0 -DNDEBUG
LDFLAGS += -pg
@@ -75,7 +79,7 @@ OBJS := ${GCLDIR}/GBase.o ${GCLDIR}/GArgs.o ${GCLDIR}/GFaSeqGet.o \
.PHONY : all
-all release debug memcheck memdebug profile gprof prof: ../gclib gffread
+all static release debug memcheck memdebug profile gprof prof: ../gclib gffread
../gclib:
git clone https://github.com/gpertea/gclib.git ../gclib
=====================================
gff_utils.cpp
=====================================
@@ -42,10 +42,15 @@ bool fullCDSonly=false; // starts with START, ends with STOP codon
bool multiExon=false;
bool writeExonSegs=false;
char* tracklabel=NULL;
+/*
char* rfltGSeq=NULL;
char rfltStrand=0;
uint rfltStart=0;
-uint rfltEnd=MAX_UINT;
+uint rfltEnd=MAX_UINT;*/
+GRangeParser* fltRange=NULL;
+
+GRangeParser* fltJunction=NULL;
+
bool rfltWithin=false; //check for full containment within given range
bool addDescr=false;
@@ -166,7 +171,6 @@ int cmpRedundant(GffObj& a, GffObj& b) {
else return (a.exons.Count()>b.exons.Count())? 1: -1;
}
-
bool tMatch(GffObj& a, GffObj& b) {
//strict intron chain match, or single-exon perfect match
int imax=a.exons.Count()-1;
@@ -282,6 +286,8 @@ int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst) {
void printTableData(FILE* f, GffObj& g, bool inFasta) {
//using attribute list in tableCols
+ const int DBUF_LEN=1024; //there should not be attribute values larger than 1K!
+ char dbuf[DBUF_LEN];
char* av=NULL;
for(int i=0;i<tableCols.Count();i++) {
if (i>0 || inFasta) {
@@ -291,7 +297,12 @@ void printTableData(FILE* f, GffObj& g, bool inFasta) {
switch(tableCols[i].type) {
case ctfGFF_Attr:
av=g.getAttr(tableCols[i].name.chars());
- fprintf(f,"%s",av!=NULL? av : ".");
+ if (av) {
+ if (decodeChars) {
+ GffObj::decodeHexChars(dbuf, av, DBUF_LEN-1);
+ fprintf(f,"%s", dbuf);
+ } else fprintf(f,"%s",av);
+ } else fprintf(f,".");
break;
case ctfGFF_chr:
fprintf(f,"%s",g.getGSeqName());
@@ -375,7 +386,9 @@ bool GffLoader::validateGffRec(GffObj* gffrec) {
}
}
}
+ return false;
}
+ if (gffrec->isGene() && keepGenes) return true;
return false;
} //transcript rejected
return true;
@@ -419,24 +432,24 @@ bool GffLoader::checkFilters(GffObj* gffrec) {
gffrec->getID(), minLen);
return false;
}
- if (rfltGSeq!=NULL) { //filter by gseqName
- if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
+ if (fltRange!=NULL) { //filter by gseqName
+ if (fltRange->refName!=NULL && strcmp(gffrec->getGSeqName(),fltRange->refName)!=0) {
return false;
}
- }
- if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
- return false;
- }
- //check coordinates
- if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
- if (rfltWithin) {
- if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
- return false; //not within query range
- }
+ if (fltRange->strand>0 && gffrec->strand!=fltRange->strand) {
+ return false;
}
- else {
- if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
- return false;
+ //check coordinates
+ if (fltRange->start || fltRange->end<UINT_MAX) {
+ if (rfltWithin) {
+ if (gffrec->start<fltRange->start || gffrec->end>fltRange->end) {
+ return false; //not within query range
+ }
+ }
+ else {
+ if (gffrec->start>fltRange->end || gffrec->end<fltRange->start) {
+ return false;
+ }
}
}
}
@@ -444,7 +457,6 @@ bool GffLoader::checkFilters(GffObj* gffrec) {
//remove attributes that are not in attrList
gffrec->removeAttrs(attrList);
}
-
if (gffrec->isTranscript()) { // && TFilters) ?
//these filters only apply to transcripts
if (multiExon && gffrec->exons.Count()<=1) {
@@ -454,12 +466,46 @@ bool GffLoader::checkFilters(GffObj* gffrec) {
return false;
}
if (wNConly && gffrec->hasCDS()) return false;
+ if (fltJunction!=NULL) {
+ if (gffrec->exons.Count()<=1) return false;
+ if (fltJunction->refName!=NULL && strcmp(gffrec->getGSeqName(),fltJunction->refName)!=0) {
+ return false;
+ }
+ if (fltJunction->strand && gffrec->strand!=fltJunction->strand) {
+ return false;
+ }
+ //check coordinates
+ uint jstart=fltJunction->start;
+ uint jend=fltJunction->end;
+ if (jstart==0) jstart=jend;
+ if (jend==0) jend=jstart;
+ if (gffrec->start>=jstart || gffrec->end<=jend) {
+ return false;
+ }
+
+ bool noJMatch=true;
+ for (int i=0;i<gffrec->exons.Count()-1;++i) {
+ if (fltJunction->start && fltJunction->end) {
+ if (gffrec->exons[i]->end+1==fltJunction->start &&
+ gffrec->exons[i+1]->start-1==fltJunction->end)
+ { noJMatch=false; break; }
+ } else if (fltJunction->start) { //end match not required
+ if (gffrec->exons[i]->end+1==fltJunction->start)
+ { noJMatch=false; break; }
+ } else { //only end match required:
+ if (gffrec->exons[i+1]->start-1==fltJunction->end)
+ { noJMatch=false; break; }
+ }
+ }
+ if (noJMatch) return false;
+ }
+
return process_transcript(gfasta, *gffrec);
} //transcript filters check
return true;
}
-bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
+bool GffLoader::process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript
//returns true if the transcript passed the filter
char* gname=gffrec.getGeneName();
@@ -680,40 +726,48 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
if (adjstop!=NULL) delete adjstop;
*/
if (cdsnt!=NULL) { // && !inframeStop) {
+ GStr defline(gffrec.getID(), 94);
+ if (writeExonSegs) {
+ defline.append(" loc:");
+ defline.append(gffrec.getGSeqName());
+ defline.appendfmt("(%c)",gffrec.strand);
+ //warning: not CDS coordinates are written here, but the exon ones
+ defline+=(int)gffrec.start;
+ defline+=(char)'-';
+ defline+=(int)gffrec.end;
+ // -- here these are CDS substring coordinates on the spliced sequence:
+ defline.append(" segs:");
+ for (int i=0;i<seglst.Count();i++) {
+ if (i>0) defline.append(",");
+ defline+=(int)seglst[i].start;
+ defline.append("-");
+ defline+=(int)seglst[i].end;
+ }
+ }
if (f_y!=NULL) { //CDS translation fasta output requested
if (cdsaa==NULL) { //translate now if not done before
cdsaa=translateDNA(cdsnt, aalen, seqlen);
}
if (aalen>0) {
if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon
- fprintf(f_y, ">%s", gffrec.getID());
+ fprintf(f_y, ">%s", defline.chars());
if (fmtTable) printTableData(f_y, gffrec, true);
- else fprintf(f_y, "\n");
+ else {
+ if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_y," ");
+ gffrec.printAttrs(f_y, ";", false, decodeChars, false);
+ fprintf(f_y, "\n");
+ }
printFasta(f_y, NULL, cdsaa, aalen, StarStop);
}
}
if (f_x!=NULL) { //CDS only
- GStr defline(gffrec.getID(), 94);
- if (writeExonSegs) {
- defline.append(" loc:");
- defline.append(gffrec.getGSeqName());
- defline.appendfmt("(%c)",gffrec.strand);
- //warning: not CDS coordinates are written here, but the exon ones
- defline+=(int)gffrec.start;
- defline+=(char)'-';
- defline+=(int)gffrec.end;
- // -- here these are CDS substring coordinates on the spliced sequence:
- defline.append(" segs:");
- for (int i=0;i<seglst.Count();i++) {
- if (i>0) defline.append(",");
- defline+=(int)seglst[i].start;
- defline.append("-");
- defline+=(int)seglst[i].end;
- }
- }
fprintf(f_x, ">%s", defline.chars());
if (fmtTable) printTableData(f_x, gffrec, true);
- else fprintf(f_x, "\n");
+ else {
+ if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_x," ");
+ gffrec.printAttrs(f_x, ";", false, decodeChars, false);
+ fprintf(f_x, "\n");
+ }
printFasta(f_x, NULL, cdsnt, seqlen);
}
GFREE(cdsnt);
@@ -775,7 +829,11 @@ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
fprintf(f_w, ">%s", defline.chars());
if (fmtTable) printTableData(f_w, gffrec, true);
- else fprintf(f_w, "\n");
+ else {
+ if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_w," ");
+ gffrec.printAttrs(f_w, ";", false, decodeChars, false);
+ fprintf(f_w, "\n");
+ }
printFasta(f_w, NULL, exont, seqlen);
GFREE(exont);
}
@@ -793,8 +851,6 @@ GTData::GTData(GffObj* t, GenomicSeqData* gd):rna(t),gdata(gd), locus(NULL), rep
gdata->tdata.Add(this);
}
-
-
bool GffLoader::unsplContained(GffObj& ti, GffObj& tj) {
//returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
//but it does not cross any intron-exon boundary of tj
=====================================
gff_utils.h
=====================================
@@ -19,7 +19,6 @@ extern FILE* f_y; //wrting fasta with translated CDS
extern FILE* f_j; //wrting junctions (introns)
-
extern bool TFilters;
extern bool wfaNoCDS;
@@ -56,10 +55,6 @@ extern bool fullCDSonly; // starts with START, ends with STOP codon
extern bool multiExon;
extern bool writeExonSegs;
extern char* tracklabel;
-extern char* rfltGSeq;
-extern char rfltStrand;
-extern uint rfltStart;
-extern uint rfltEnd;
extern bool rfltWithin; //check for full containment within given range
extern bool addDescr;
@@ -94,6 +89,10 @@ typedef bool GFValidateFunc(GffObj* gf);
//keep/set original/old strand
#define T_SET_OSTRAND(d, s) d |= s
+extern GRangeParser* fltRange;
+
+extern GRangeParser* fltJunction;
+
class SeqInfo { //populated from the -s option of gffread
public:
int len;
@@ -129,7 +128,6 @@ char* getSeqDescr(char* seqid);
char* getSeqName(char* seqid);
int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL);
void printTableData(FILE* f, GffObj& g, bool inFasta=false);
-bool process_transcript(GFastaDb& gfasta, GffObj& gffrec);
enum ETableFieldType {
ctfGFF_Attr=0, // attribute name as is
@@ -803,6 +801,8 @@ class GffLoader {
}
bool validateGffRec(GffObj* gffrec);
+ bool process_transcript(GFastaDb& gfasta, GffObj& gffrec);
+
bool checkFilters(GffObj* gffrec);
void collectIntrons(GffObj& t); //for -j output
=====================================
gffread.cpp
=====================================
@@ -4,13 +4,14 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
-#define VERSION "0.12.4"
+#define VERSION "0.12.7"
#define USAGE "gffread v" VERSION ". Usage:\n\
gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] \n\
- [-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
+ [-o <outfile>] [-t <trackname>] [-r [<strand>]<chr>:<start>-<end> [-R]]\n\
+ [--jmatch <chr>:<start>-<end>] [--no-pseudo] \n\
[-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
- [--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>]\n\
+ [-j ][--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>]\n\
[--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>]\n\
[<input_gff>] \n\n\
Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n\
@@ -20,14 +21,15 @@ gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] \n\
the basic attributes.\n\
\n\
Options:\n\
- -i discard transcripts having an intron larger than <maxintron>\n\
--ids discard records/transcripts if their IDs are not listed in <IDs.lst>\n\
--nids discard records/transcripts if their IDs are listed in <IDs.lst>\n\
+ -i discard transcripts having an intron larger than <maxintron>\n\
-l discard transcripts shorter than <minlen> bases\n\
-r only show transcripts overlapping coordinate range <start>..<end>\n\
(on chromosome/contig <chr>, strand <strand> if provided)\n\
-R for -r option, discard all transcripts that are not fully \n\
contained within the given range\n\
+ --jmatch only output transcripts matching the given junction\n\
-U discard single-exon transcripts\n\
-C coding only: discard mRNAs that have no CDS features\n\
--nc non-coding only: discard mRNAs that have CDS features\n\
@@ -105,14 +107,14 @@ Output options:\n\
-g full path to a multi-fasta file with the genomic sequences\n\
for all input mappings, OR a directory with single-fasta files\n\
(one per genomic sequence, with file names matching sequence names)\n\
- -j write a tab delimited file with all the junctions (intron coordinates)\n\
+ -j output the junctions and the corresponding transcripts\n\
-w write a fasta file with spliced exons for each transcript\n\
--w-add <N> for the -w option, extract additional <N> bases\n\
both upstream and downstream of the transcript boundaries\n\
--w-nocds for -w, disable the output of CDS info in the FASTA file\n\
-x write a fasta file with spliced CDS for each GFF transcript\n\
-y write a protein fasta file with the translation of CDS for each record\n\
- -W for -w and -x options, write in the FASTA defline all the exon\n\
+ -W for -w, -x and -y options, write in the FASTA defline all the exon\n\
coordinates projected onto the spliced sequence;\n\
-S for -y option, use '*' instead of '.' as stop codon translation\n\
-L Ensembl GTF to GFF3 conversion, adds version to IDs\n\
@@ -252,11 +254,6 @@ void setTableFormat(GStr& s) {
tableCols.Add(tcol);
continue;
}
- if (w=="geneID" || w=="gene_id") {
- CTableField tcol(ctfGFF_geneID);
- tableCols.Add(tcol);
- continue;
- }
if (w=="Parent") {
CTableField tcol(ctfGFF_Parent);
tableCols.Add(tcol);
@@ -392,7 +389,8 @@ void shutDown() {
seqinfo.Clear();
//if (faseq!=NULL) delete faseq;
//if (gcdb!=NULL) delete gcdb;
- GFREE(rfltGSeq);
+ delete fltRange;
+ delete fltJunction;
FWCLOSE(f_out);
FWCLOSE(f_w);
FWCLOSE(f_x);
@@ -403,11 +401,15 @@ void shutDown() {
int main(int argc, char* argv[]) {
GArgs args(argc, argv,
"version;debug;merge;stream;adj-stop;bed;in-bed;tlf;in-tlf;cluster-only;nc;cov-info;help;"
- "sort-alpha;keep-genes;w-nocds;attrs=;w-add=;ids=;nids=0;gtf;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
+ "sort-alpha;keep-genes;w-nocds;attrs=;w-add=;ids=;nids=;jmatch=;gtf;keep-comments;keep-exon-attrs;force-exons;t-adopt;gene2exon;"
"ignore-locus;no-pseudo;table=sort-by=hvOUNHPWCVJMKQYTDARSZFGLEBm:g:i:r:s:l:t:o:w:x:y:j:d:");
args.printError(USAGE, true);
int numfiles = args.startNonOpt();
- if (args.getOpt('h') || args.getOpt("help") || ( numfiles==0 && !hasStdInput())) {
+ if (args.getOpt("version")) {
+ printf(VERSION"\n");
+ exit(0);
+ }
+ if (args.getOpt('h') || args.getOpt("help") || ( numfiles==0 && !haveStdInput())) {
GMessage("%s",USAGE);
exit(1);
}
@@ -491,10 +493,6 @@ int main(int argc, char* argv[]) {
fprintf(stderr, "Command line was:\n");
args.printCmdLine(stderr);
}
- if (args.getOpt("version")) {
- printf(VERSION"\n");
- exit(0);
- }
gffloader.fullAttributes=(args.getOpt('F')!=NULL);
gffloader.keep_AllExonAttrs=(args.getOpt("keep-exon-attrs")!=NULL);
if (gffloader.keep_AllExonAttrs && !gffloader.fullAttributes) {
@@ -558,44 +556,22 @@ int main(int argc, char* argv[]) {
gffloader.fullAttributes=true;
}
rfltWithin=(args.getOpt('R')!=NULL);
- s=args.getOpt('r');
- if (!s.is_empty()) {
- s.trim();
- if (s[0]=='+' || s[0]=='-') {
- rfltStrand=s[0];
- s.cut(0,1);
- }
- int isep=s.index(':');
- if (isep>0) { //gseq name given
- if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
- isep--;
- rfltStrand=s[isep];
- s.cut(isep,1);
- }
- if (isep>0)
- rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
- s.cut(0,isep+1);
- }
- GStr gsend;
- char slast=s[s.length()-1];
- if (rfltStrand==0 && (slast=='+' || slast=='-')) {
- s.chomp(slast);
- rfltStrand=slast;
- }
- if (s.index("..")>=0) gsend=s.split("..");
- else gsend=s.split('-');
- if (!s.is_empty()) rfltStart=(uint)s.asInt();
- if (!gsend.is_empty()) {
- rfltEnd=(uint)gsend.asInt();
- if (rfltEnd==0) rfltEnd=MAX_UINT;
- }
- } //gseq/range filtering
- else {
+ char* sz=args.getOpt('r');
+ if (sz) {
+ fltRange=new GRangeParser(sz);
+ if (fltRange->end==0) //end coordinate not given
+ fltRange->end=UINT_MAX;
+ } else {
if (rfltWithin)
GError("Error: option -R requires -r!\n");
- //if (rfltWholeTranscript)
- // GError("Error: option -P requires -r!\n");
- }
+ }
+ sz=args.getOpt("jmatch");
+ if (sz) {
+ //TODO: check if this is a file?
+ fltJunction=new GRangeParser(sz);
+ if (fltJunction->strand=='.') fltJunction->strand=0;
+ } //gseq/range filtering
+
s=args.getOpt('m');
if (!s.is_empty()) {
FILE* ft=fopen(s,"r");
@@ -645,7 +621,7 @@ int main(int argc, char* argv[]) {
if (f_w!=NULL && args.getOpt("w-nocds"))
wfaNoCDS=true;
- if (f_out==NULL && f_w==NULL && f_x==NULL && f_y==NULL && !covInfo)
+ if (f_out==NULL && f_w==NULL && f_x==NULL && f_y==NULL && f_j==NULL && !covInfo)
f_out=stdout;
//if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
=====================================
prep_source.sh
=====================================
@@ -14,7 +14,7 @@ libdir=$pack/gclib/
cp LICENSE README.md gffread.cpp gff_utils.{h,cpp} $pack/
sed 's|\.\./gclib|./gclib|' Makefile > $pack/Makefile
-cp ../gclib/{GVec,GList,GHashMap,khashl}.hh ../gclib/xxhash.h $libdir
+cp ../gclib/{GVec,GList,GHashMap,khashl}.hh ../gclib/xxhash.h ../gclib/wyhash.h ../gclib/GBitVec.h $libdir
cp ../gclib/{GArgs,GBase,gdna,GStr,gff,codons,GFaSeqGet,GFastaIndex}.{h,cpp} $libdir
tar cvfz $pack.tar.gz $pack
ls -l $pack.tar.gz
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/commit/3f7ba6a22a42846c56f89697438f164f37ae33d9
--
View it on GitLab: https://salsa.debian.org/med-team/gffread/-/commit/3f7ba6a22a42846c56f89697438f164f37ae33d9
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211013/c6f1ad85/attachment-0001.htm>
More information about the debian-med-commit mailing list