[med-svn] [libgff] 01/02: Imported Upstream version 1.0
Michael Crusoe
misterc-guest at moszumanska.debian.org
Sat Sep 19 06:30:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository libgff.
commit aa3684a37b5064bb9efe2ecf57e96f5db14b5b9e
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Fri Sep 18 20:29:29 2015 -0700
Imported Upstream version 1.0
BoostLicense.txt | 23 +
CMakeLists.txt | 27 +
Readme.md | 9 +
include/GArgs.h | 98 +++
include/GBase.h | 458 +++++++++++
include/GFaSeqGet.h | 112 +++
include/GFastaIndex.h | 79 ++
include/GHash.hh | 561 +++++++++++++
include/GList.hh | 638 +++++++++++++++
include/GStr.h | 213 +++++
include/GVec.hh | 907 +++++++++++++++++++++
include/codons.h | 54 ++
include/gdna.h | 15 +
include/gff.h | 1088 +++++++++++++++++++++++++
include/gff_utils.h | 610 ++++++++++++++
src/GArgs.cpp | 376 +++++++++
src/GBase.cpp | 780 ++++++++++++++++++
src/GFaSeqGet.cpp | 319 ++++++++
src/GFastaIndex.cpp | 170 ++++
src/GStr.cpp | 1345 +++++++++++++++++++++++++++++++
src/TestGFFParse.cpp | 34 +
src/codons.cpp | 90 +++
src/gdna.cpp | 90 +++
src/gff.cpp | 2125 +++++++++++++++++++++++++++++++++++++++++++++++++
src/gff_utils.cpp | 664 +++++++++++++++
25 files changed, 10885 insertions(+)
diff --git a/BoostLicense.txt b/BoostLicense.txt
new file mode 100644
index 0000000..3998b97
--- /dev/null
+++ b/BoostLicense.txt
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17, 2003
+ Permission is hereby granted, free of charge, to any person or organization
+ obtaining a copy of the software and accompanying documentation covered by
+ this license (the "Software") to use, reproduce, display, distribute,
+ execute, and transmit the Software, and to prepare derivative works of the
+ Software, and to permit third-parties to whom the Software is furnished to
+ do so, all subject to the following:
+ The copyright notices in the Software and this entire statement, including
+ the above license grant, this restriction and the following disclaimer,
+ must be included in all copies of the Software, in whole or in part, and
+ all derivative works of the Software, unless such copies or derivative
+ works are solely in the form of machine-executable object code generated by
+ a source language processor.
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..de5d58e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,27 @@
+ ${PROJECT_SOURCE_DIR}/codons.cpp
+ ${PROJECT_SOURCE_DIR}/gdna.cpp
+ ${PROJECT_SOURCE_DIR}/GFastaIndex.cpp
+ ${PROJECT_SOURCE_DIR}/gff_utils.cpp
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..a177580
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,9 @@
+This is an attempt to perform a simple "libraryfication" of the GFF/GTF parsing
+code that is used in the [Cufflinks](http://cufflinks.cbcb.umd.edu/index.html)
+codebase. There are not many (any?) relatively lightweight GTF/GFF parsers
+exposing a C++ interface, and the goal of this library is to provide this
+functionality without the necessity of drawing in a heavy-weight dependency
+like SeqAn.
diff --git a/include/GArgs.h b/include/GArgs.h
new file mode 100644
index 0000000..92f32fb
--- /dev/null
+++ b/include/GArgs.h
@@ -0,0 +1,98 @@
+GArgs is a quick'n'dirty object oriented replacement for the standard
+ getopts library call available on many unix platforms;
+ it accepts the regular single dash style options
+ -<letter>[ ][<value>]
+ but also attr=value style options:
+ <optname>=<value>
+#include <config.h>
+#include <stdio.h>
+struct GArgsDef {
+ const char* longopt;
+ char opt; //equivalent one-char option, if any
+ bool req_value; //true if the string that follows must be a value
+ int code; //an enum code to be associated with this option
+class GArgs {
+ //structure for parsing arguments format definition
+ struct fmtdef {
+ char* longopt;
+ char opt; //equivalent one-char option, if any
+ bool req_value; //true if the string that follows must be a value
+ int code; //an enum code to be associated with this option
+ };
+ int fmtcount;
+ fmtdef* fmt; //this will store format definition after parsing it
+ struct argdata {
+ char* opt; // this is NULL for non-dashed arguments
+ // a single character for single dash style arguments
+ // a string for ARG=VALUE or --long_option style arguments
+ char* value; // is NULL for switches (dashed flags)
+ int fmti; //index in fmt table
+ //int code; // if GArgsDef[] constructor was used, for getOpt
+ };
+ int _argc;
+ char* const *_argv; //the original main() values
+ argdata* args; //arguments table after parsing it
+ int count; //total count of elements in 'args' array
+ int nonOptCount; //count of non-dashed, non= arguments
+ int nonOptPos; //current position for nonOpt arguments iterator
+ int optPos; //current position for options iterator
+ int errarg; //argv error position after parsing
+ bool err_valmissing; //if the error is strictly about missing value for errarg option
+ int parseArgs(bool nodigitopts=false);
+ //parsing helper functions
+ int validOpt(int c);
+ int validShortOpt(char o);
+ int validLongOpt(char* o, char* to);
+ public:
+ GArgs(int argc, char* const argv[], const char* format, bool nodigitopts=false);
+ /* format can be:
+ <string>{;|=} e.g. disable-test;PID=S= for --disable-test PID=50 (or --PID 50) S=3.5 etc.
+ <letter>[:] e.g. p:hT for -p testing (or -ptesting) -h -T
+ This means that the long options, if present, should be given at the beginning
+ of the format string, before the single-dash, single-char options
+ */
+ GArgs(int argc, char* const argv[], const GArgsDef fmtrecs[], bool nodigitopts=false);
+ ~GArgs();
+ int isError(); // returns the offending argv position or 0 if no error
+ int getCount() { return count; } //total number of arguments given
+ int getFmtCount() { return fmtcount; } //total number of option definitions
+ int getNonOptCount() { return nonOptCount; } //total number of non-option arguments
+ char* getOpt(const char* o); /* retrieve the value for option o
+ returns
+ NULL if option not given at all
+ !=NULL if boolean option was given
+ opt's value if value option was given
+ */
+ char* getOpt(const char o);
+ char* getOpt(int c); //retrieve value by enum code
+ char* getOptName(int c); //retrieve the option name by enum code
+ int startOpt(); //init iteration through option arguments
+ // returns number of option args
+ char* nextOpt(); //get next option argument's string
+ int nextCode(); //get next option argument's code
+ int startNonOpt(void); //init iteration through non-option arguments
+ // returns the number of non-option arguments
+ void printError(FILE* fout, const char* usage=NULL,
+ bool exitProgram=false);
+ void printError(const char* usage=NULL, bool exitProgram=false);
+ void printCmdLine(FILE* fout);
+ char* nextNonOpt(); //get the next non-option argument
diff --git a/include/GBase.h b/include/GBase.h
new file mode 100644
index 0000000..fc3c5ba
--- /dev/null
+++ b/include/GBase.h
@@ -0,0 +1,458 @@
+#ifndef _POSIX_SOURCE
+//mostly for MinGW
+#define _POSIX_SOURCE
+#include "config.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#if defined __WIN32__ || defined WIN32 || defined _WIN32 || defined _WIN32_
+ #ifndef __WIN32__
+ #define __WIN32__
+ #endif
+ #include <windows.h>
+ #include <io.h>
+ #define CHPATHSEP '\\'
+ #undef off_t
+ #define off_t int64_t
+ #ifndef popen
+ #define popen _popen
+ #endif
+ #ifndef fseeko
+ #ifdef _fseeki64
+ #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin)
+ #else
+ /*
+ #define _DEFINE_WIN32_FSEEKO
+ int fseeko(FILE *stream, off_t offset, int whence);
+ */
+ #define fseeko fseek
+ #endif
+ #endif
+ #ifndef ftello
+ #ifdef _ftelli64
+ #define ftello(stream) _ftelli64(stream)
+ #else
+ /*
+ #define _DEFINE_WIN32_FTELLO
+ off_t ftello(FILE *stream);
+ */
+ #define ftello ftell
+ #endif
+ #endif
+ #else
+ #define CHPATHSEP '/'
+ #include <unistd.h>
+#ifndef fseeko
+ #define fseeko fseek
+#ifndef ftello
+ #define ftello ftell
+#ifdef DEBUG
+#undef NDEBUG
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int16_t int16;
+typedef uint16_t uint16;
+typedef unsigned char uchar;
+typedef unsigned char byte;
+#ifndef MAXUINT
+#define MAXUINT ((unsigned int)-1)
+#ifndef MAXINT
+#ifndef MAX_UINT
+#define MAX_UINT ((unsigned int)-1)
+#ifndef MAX_INT
+#define MAX_INT INT_MAX
+typedef int64_t int64;
+typedef uint64_t uint64;
+#define EXIT_FAILURE 1
+#define EXIT_SUCCESS 0
+#define ERR_ALLOC "Error allocating memory.\n"
+// Debug helpers
+#ifndef NDEBUG
+ #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__))
+ #ifdef TRACE
+ #define GTRACE(exp) (GMessage exp)
+ #else
+ #define GTRACE(exp) ((void)0)
+ #endif
+ #define GASSERT(exp) ((void)0)
+ #define GTRACE(exp) ((void)0)
+#define GERROR(exp) (GError exp)
+/********************************** Macros ***********************************/
+// Absolute value
+#define GABS(val) (((val)>=0)?(val):-(val))
+// Min and Max
+#define GMAX(a,b) (((a)>(b))?(a):(b))
+#define GMIN(a,b) (((a)>(b))?(b):(a))
+// Min of three
+#define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z))
+// Max of three
+#define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z))
+// Return minimum and maximum of a, b
+#define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a)))
+// Clamp value x to range [lo..hi]
+#define GCLAMP(lo,x,hi) ((x)<(lo)?(lo):((x)>(hi)?(hi):(x)))
+typedef void* pointer;
+typedef unsigned int uint;
+typedef int GCompareProc(const pointer item1, const pointer item2);
+typedef long GFStoreProc(const pointer item1, FILE* fstorage); //for serialization
+typedef pointer GFLoadProc(FILE* fstorage); //for deserialization
+typedef void GFreeProc(pointer item); //usually just delete,
+ //but may also support structures with embedded dynamic members
+#define GMALLOC(ptr,size) if (!GMalloc((pointer*)(&ptr),size)) \
+#define GCALLOC(ptr,size) if (!GCalloc((pointer*)(&ptr),size)) \
+#define GREALLOC(ptr,size) if (!GRealloc((pointer*)(&ptr),size)) \
+#define GFREE(ptr) GFree((pointer*)(&ptr))
+inline char* strMin(char *arg1, char *arg2) {
+ return (strcmp(arg1, arg2) < 0)? arg1 : arg2;
+inline char* strMax(char *arg1, char *arg2) {
+ return (strcmp(arg2, arg1) < 0)? arg1 : arg2;
+inline int iround(double x) {
+ return (int)floor(x + 0.5);
+inline int Gintcmp(int a, int b) {
+ //return (a>b)? 1 : ((a==b)?0:-1);
+ return a-b; //NOTE(review): a-b is signed arithmetic and can overflow (undefined behavior) when a and b have opposite signs and large magnitude, e.g. INT_MIN vs 1; the commented-out form above does not have this problem
+int Gstrcmp(const char* a, const char* b, int n=-1);
+//same as strcmp but doesn't crash on NULL pointers
+int Gstricmp(const char* a, const char* b, int n=-1);
+//basic swap template function
+template<class T> void Gswap(T& lhs, T& rhs) {
+ //register T tmp=lhs;
+ T tmp=lhs; //requires copy operator
+ lhs=rhs;
+ rhs=tmp;
+/**************** Memory management ***************************/
+bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
+bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
+bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
+void GFree(pointer* ptr); // Free memory, resets ptr to NULL
+//int saprintf(char **retp, const char *fmt, ...);
+void GError(const char* format,...); // Error routine (aborts program)
+void GMessage(const char* format,...);// Log message to stderr
+// Assert failed routine:- usually not called directly but through GASSERT
+void GAssert(const char* expression, const char* filename, unsigned int lineno);
+// ****************** string manipulation *************************
+char *Gstrdup(const char* str);
+//duplicate a string by allocating a copy for it and returning it
+char* Gstrdup(const char* sfrom, const char* sto);
+//same as Gstrdup, but with an early termination (e.g. on delimiter)
+char* Gsubstr(const char* str, char* from, char* to=NULL);
+//extracts a substring, allocating it, including boundaries (from/to)
+int strsplit(char* str, char** fields, int maxfields, const char* delim);
+int strsplit(char* str, char** fields, int maxfields, const char delim);
+int strsplit(char* str, char** fields, int maxfields); //splits by tab or space
+char* replaceStr(char* &str, char* newvalue);
+//conversion: to Lower/Upper case
+// creating a new string:
+char* upCase(const char* str);
+char* loCase(const char* str);
+// changing string in place:
+char* strlower(char * str);
+char* strupper(char * str);
+//strstr but for memory zones: scans a memory region
+//for a substring:
+void* Gmemscan(void *mem, unsigned int len,
+ void *part, unsigned int partlen);
+// test if a char is in a string:
+bool chrInStr(char c, const char* str);
+char* rstrchr(char* str, char ch);
+/* returns a pointer to the rightmost
+ occurrence of ch in str - like rindex for platforms missing it*/
+char* strchrs(const char* s, const char* chrs);
+//strchr but with a set of chars instead of only one
+char* rstrfind(const char* str, const char *substr);
+// like rindex() but for strings; right side version of strstr()
+char* reverseChars(char* str, int slen=0); //in place reversal of string
+char* rstrstr(const char* rstart, const char *lend, const char* substr);
+/*the reversed, rightside equivalent of strstr: starts searching
+ from right end (rstart), going back to left end (lend) and returns
+ a pointer to the last (right) matching character in str */
+char* strifind(const char* str, const char* substr);
+// the case insensitive version of strstr -- finding a string within a string
+//Determines if a string begins with a given prefix
+//(returns false when any of the params is NULL,
+// but true when prefix is '' (empty string)!)
+bool startsWith(const char* s, const char* prefix);
+bool endsWith(const char* s, const char* suffix);
+//Note: returns true if suffix is empty string, but false if it's NULL
+// ELF hash function for strings
+int strhash(const char* str);
+//---- generic base GSeg : genomic segment (interval) --
+// coordinates are considered 1-based (so 0 is invalid)
+class GSeg {
+ public:
+ uint start; //start<=end is established by the constructor (bounds are swapped when s>e)
+ uint end;
+ GSeg(uint s=0,uint e=0) {
+ if (s>e) { start=e;end=s; }
+ else { start=s;end=e; }
+ }
+ //check for overlap with other segment
+ uint len() { return end-start+1; }
+ bool overlap(GSeg* d) {
+ //return start<d->start ? (d->start<=end) : (start<=d->end);
+ return (start<=d->end && end>=d->start);
+ }
+ bool overlap(GSeg& d) {
+ //return start<d.start ? (d.start<=end) : (start<=d.end);
+ return (start<=d.end && end>=d.start);
+ }
+ bool overlap(GSeg& d, int fuzz) {
+ //return start<d.start ? (d.start<=end+fuzz) : (start<=d.end+fuzz);
+ return (start<=d.end+fuzz && end+fuzz>=d.start); //NOTE(review): fuzz is int but start/end are uint, so both sums are computed as unsigned; a negative fuzz would wrap around
+ }
+ bool overlap(uint s, uint e) {
+ if (s>e) { Gswap(s,e); }
+ //return start<s ? (s<=end) : (start<=e);
+ return (start<=e && end>=s);
+ }
+ //return the length of overlap between two segments
+ int overlapLen(GSeg* r) {
+ if (start<r->start) {
+ if (r->start>end) return 0;
+ return (r->end>end) ? end-r->start+1 : r->end-r->start+1;
+ }
+ else { //r->start<=start
+ if (start>r->end) return 0;
+ return (r->end<end)? r->end-start+1 : end-start+1;
+ }
+ }
+ int overlapLen(uint rstart, uint rend) {
+ if (rstart>rend) { Gswap(rstart,rend); }
+ if (start<rstart) {
+ if (rstart>end) return 0;
+ return (rend>end) ? end-rstart+1 : rend-rstart+1;
+ }
+ else { //rstart<=start
+ if (start>rend) return 0;
+ return (rend<end)? rend-start+1 : end-start+1;
+ }
+ }
+ //fuzzy coordinate matching:
+ bool coordMatch(GSeg* s, uint fuzz=0) {
+ if (fuzz==0) return (start==s->start && end==s->end);
+ uint sd = (start>s->start) ? start-s->start : s->start-start;
+ uint ed = (end>s->end) ? end-s->end : s->end-end;
+ return (sd<=fuzz && ed<=fuzz);
+ }
+ //comparison operators required for sorting
+ bool operator==(GSeg& d){
+ return (start==d.start && end==d.end);
+ }
+ bool operator<(GSeg& d){
+ return (start==d.start)?(end<d.end):(start<d.start);
+ }
+// ************** simple line reading class for text files
+//GLineReader -- text line reading/buffering class
+class GLineReader {
+ bool closeFile;
+ int len;
+ int allocated;
+ char* buf;
+ bool isEOF;
+ FILE* file;
+ off_t filepos; //current position
+ bool pushed; //pushed back
+ int lcount; //line counter (read lines)
+ public:
+ char* chars() { return buf; }
+ char* line() { return buf; }
+ int readcount() { return lcount; } //number of lines read
+ void setFile(FILE* stream) { file=stream; }
+ int length() { return len; }
+ int size() { return len; } //same as length()
+ bool isEof() {return isEOF; }
+ bool eof() { return isEOF; }
+ off_t getfpos() { return filepos; }
+ off_t getFpos() { return filepos; }
+ char* nextLine() { return getLine(); }
+ char* getLine() { if (pushed) { pushed=false; return buf; }
+ else return getLine(file); }
+ char* getLine(FILE* stream) {
+ if (pushed) { pushed=false; return buf; }
+ else return getLine(stream, filepos); }
+ char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update
+ // the given file position
+ void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request
+ // so the next call will in fact return the same line
+ GLineReader(const char* fname) {
+ FILE* f=fopen(fname, "rb");
+ if (f==NULL) GError("Error opening file '%s'!\n",fname);
+ closeFile=true;
+ init(f);
+ }
+ GLineReader(FILE* stream=NULL, off_t fpos=0) {
+ closeFile=false;
+ init(stream,fpos);
+ }
+ void init(FILE* stream, off_t fpos=0) {
+ len=0;
+ isEOF=false;
+ allocated=1024;
+ GMALLOC(buf,allocated);
+ lcount=0;
+ buf[0]=0;
+ file=stream;
+ filepos=fpos;
+ pushed=false;
+ }
+ ~GLineReader() {
+ GFREE(buf);
+ if (closeFile) fclose(file);
+ }
+/* extended fgets() - to read one full line from a file and
+ update the file position correctly !
+ buf will be reallocated as necessary, to fit the whole line
+ */
+char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL);
+//print int/values nicely formatted in 3-digit groups
+char* commaprint(uint64 n);
+/*********************** File management functions *********************/
+// removes the last part (file or directory name) of a full path
+// WARNING: this is a destructive operation for the given string!
+void delFileName(char* filepath);
+// returns a pointer to the last file or directory name in a full path
+const char* getFileName(const char* filepath);
+// returns a pointer to the file "extension" part in a filename
+const char* getFileExt(const char* filepath);
+int fileExists(const char* fname);
+//returns 0 if file entry doesn't exist
+// 1 if it's a directory
+// 2 if it's a regular file
+// 3 otherwise (?)
+int64 fileSize(const char* fpath);
+//write a FASTA-formatted record
+void writeFasta(FILE *fw, const char* seqid, const char* descr,
+ const char* seq, int linelen=60, int seqlen=0);
+//parses the next number found in a string at the current position
+//until a non-digit (and not a '.', 'e','E','-','+') is encountered;
+//updates the char* pointer to be after the last digit parsed
+bool parseNumber(char* &p, double& v);
+bool parseDouble(char* &p, double& v); //just an alias for parseNumber
+bool parseInt(char* &p, int& i);
+bool parseUInt(char* &p, uint& i);
+bool parseHex(char* &p, uint& i);
+#endif /* G_BASE_DEFINED */
diff --git a/include/GFaSeqGet.h b/include/GFaSeqGet.h
new file mode 100644
index 0000000..d655a86
--- /dev/null
+++ b/include/GFaSeqGet.h
@@ -0,0 +1,112 @@
+#ifndef GFASEQGET_H
+#define GFASEQGET_H
+#include "GList.hh"
+#define MAX_FASUBSEQ 0x20000000
+//max 512MB sequence data held in memory at a time
+class GSubSeq {
+ public:
+ uint sqstart; //1-based coord of subseq start on sequence
+ uint sqlen; //length of subseq loaded
+ char* sq; //actual subsequence data will be stored here
+ // (with end-of-line characters removed)
+ /*char* xseq; //the exposed pointer to the last requested subsequence start
+ off_t xstart; //the coordinate start for the last requested subseq
+ off_t xlen; //the last requested subseq len*/
+ GSubSeq() {
+ sqstart=0;
+ sqlen=0;
+ sq=NULL;
+ /* xseq=NULL;
+ xstart=0;
+ xlen=0;*/
+ }
+ ~GSubSeq() {
+ GFREE(sq);
+ }
+ // genomic, 1-based coordinates:
+ void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
+ //check for overlap with previous window and realloc/extend appropriately
+ //returns offset from seq that corresponds to sstart
+ // the window will keep extending until MAX_FASUBSEQ is reached
+class GFaSeqGet {
+ char* fname;
+ FILE* fh;
+ //raw offset in the file where the sequence actually starts:
+ off_t fseqstart;
+ uint seq_len; //total sequence length, if known (when created from GFastaIndex)
+ int line_len; //length of each line of text
+ int line_blen; //binary length of each line
+ // = line_len + number of EOL character(s)
+ GSubSeq* lastsub;
+ void initialParse(off_t fofs=0, bool checkall=true);
+ const char* loadsubseq(uint cstart, int& clen);
+ void finit(const char* fn, off_t fofs, bool validate);
+ public:
+ GFaSeqGet() {
+ fh=NULL;
+ fseqstart=0;
+ seq_len=0;
+ line_len=0;
+ line_blen=0;
+ fname=NULL;
+ lastsub=NULL;
+ }
+ GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
+ seq_len=0;
+ finit(fn,fofs,validate);
+ }
+ GFaSeqGet(const char* fn, bool validate=false) {
+ seq_len=0;
+ finit(fn,0,validate);
+ }
+ GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
+ //constructor from GFastaIndex record
+ GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
+ ~GFaSeqGet() {
+ if (fname!=NULL) {
+ GFREE(fname);
+ fclose(fh);
+ }
+ delete lastsub;
+ }
+ const char* subseq(uint cstart, int& clen);
+ const char* getRange(uint cstart=1, uint cend=0) {
+ if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
+ if (cstart>cend) { Gswap(cstart, cend); }
+ int clen=cend-cstart+1;
+ //int rdlen=clen;
+ return subseq(cstart, clen);
+ }
+ char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
+ //caller is responsible for deallocating the return string
+ void loadall(uint32 max_len=0) {
+ //TODO: better read the whole sequence differently here - line by line
+ //so when EOF or another '>' line is found, the reading stops!
+ int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
+ subseq(1, clen);
+ }
+ void load(uint cstart, uint cend) {
+ //cache as much as possible
+ if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
+ int clen=cend-cstart+1; //NOTE(review): unlike getRange(), cstart>cend is not swapped here, so this unsigned subtraction can wrap - confirm callers always pass cstart<=cend
+ subseq(cstart, clen);
+ }
+ int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
+ off_t getseqofs() { return fseqstart; }
+ int getLineLen() { return line_len; }
+ int getLineBLen() { return line_blen; }
+ //reads a subsequence starting at genomic coordinate cstart (1-based)
+ };
diff --git a/include/GFastaIndex.h b/include/GFastaIndex.h
new file mode 100644
index 0000000..1509f3a
--- /dev/null
+++ b/include/GFastaIndex.h
@@ -0,0 +1,79 @@
+ * GFaIdx.h
+ *
+ * Created on: Aug 25, 2010
+ * Author: gpertea
+ */
+#ifndef GFAIDX_H_
+#define GFAIDX_H_
+#include "GHash.hh"
+#include "GList.hh"
+class GFastaRec {
+ public:
+ char* seqname;
+ uint seqlen;
+ off_t fpos;
+ int line_len; //effective line length (without EoL)
+ int line_blen; //length of line including EoL characters
+ GFastaRec(uint slen=0, off_t fp=0, int llen=0, int llenb=0) {
+ seqname=NULL; //only a pointer copy
+ seqlen=slen;
+ fpos=fp;
+ line_len=llen;
+ line_blen=llenb;
+ }
+ bool operator==(GFastaRec& d){
+ return (fpos==d.fpos);
+ }
+ bool operator>(GFastaRec& d){
+ return (fpos>d.fpos);
+ }
+ bool operator<(GFastaRec& d){
+ return (fpos<d.fpos);
+ }
+class GFastaIndex {
+ char* fa_name;
+ char* fai_name;
+ bool haveFai;
+ public:
+ GHash<GFastaRec> records;
+ void addRecord(const char* seqname, uint seqlen,
+ off_t foffs, int llen, int llen_full);
+ GFastaRec* getRecord(const char* seqname) {
+ return records.Find(seqname);
+ }
+ bool hasIndex() { return haveFai; }
+ int loadIndex(const char* finame);
+ int buildIndex(); //build index in memory by parsing the whole fasta file
+ int storeIndex(const char* finame);
+ int storeIndex(FILE* fai);
+ int getCount() { return records.Count(); }
+ GFastaIndex(const char* fname, const char* finame=NULL):records() {
+ if (fileExists(fname)!=2) GError("Error: fasta file %s not found!\n",fname); //fileExists() returns 2 for a regular file
+ if (fileSize(fname)<=0) GError("Error: invalid fasta file %s !\n",fname);
+ fa_name=Gstrdup(fname);
+ fai_name=finame!=NULL ? Gstrdup(finame) : NULL;
+ if (fileSize(fa_name)==0) { //NOTE(review): looks redundant - the size of the same file was already rejected (<=0) above; confirm GError never returns
+ GError("Error creating GFastaIndex(%s): invalid fasta file!\n",fa_name);
+ }
+ haveFai=false;
+ if (fai_name!=NULL && fileSize(fai_name)>0) {
+ //try to load the index file if it exists
+ loadIndex(fai_name);
+ haveFai=(records.Count()>0); //the index counts as present only if at least one record was loaded
+ }
+ }
+ ~GFastaIndex() {
+ GFREE(fa_name);
+ GFREE(fai_name);
+ }
+#endif /* GFAIDX_H_ */
diff --git a/include/GHash.hh b/include/GHash.hh
new file mode 100644
index 0000000..5122e1d
--- /dev/null
+++ b/include/GHash.hh
@@ -0,0 +1,561 @@
+// Hash table class template (char* based)
+#ifndef GHash_HH
+#define GHash_HH
+#include "GBase.h"
+// This class maintains a fast-access hash table of entities
+// indexed by a character string (essentially, maps strings to pointers)
+template <class OBJ> class GHash {
+ protected:
+  // One slot of the table
+  struct GHashEntry {
+    char* key;      // Key string
+    bool keyalloc;  // true: the key chars belong to the table and are freed with the entry;
+                    // false: shared key (do not free the key chars)
+    int hash;       // Hash value of key; a negative value flags a slot holding no valid entry
+    pointer data;   // Data
+    bool mark;      // Entry is marked
+  };
+  GHashEntry* hash;   // Hash
+  int fCapacity;      // table size
+  int fCount;         // number of valid entries
+  int fCurrentEntry;  // iteration cursor used by NextKey()/NextData()/NextEntry()
+  char* lastkeyptr;   // pointer to last key string added
+  //---------- Raw data retrieval (including empty entries)
+  // Return key at position pos.
+  const char* Key(uint pos) const { return hash[pos].key; }
+  // return data OBJ* at given position
+  OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; }
+  // Return mark flag of entry at position pos.
+  bool Mark(uint pos) const { return hash[pos].mark; }
+  // Return position of first filled slot, or >= fCapacity
+  int First() const;
+  // Return position of last filled slot or -1
+  int Last() const;
+  // Return position of next filled slot in hash table
+  // or a value greater than or equal to fCapacity if no filled
+  // slot was found
+  int Next(int pos) const;
+  //Return position of previous filled slot in hash table
+  //or a -1 if no filled slot was found
+  int Prev(int pos) const;
+ private:
+  // copying is disabled: declared, never defined
+  GHash(const GHash&);
+  GHash &operator=(const GHash&);
+  GFreeProc* fFreeProc; //procedure to free item data
+ public:
+  static void DefaultFreeProc(pointer item) {
+    delete (OBJ*)item;
+  }
+  GHash(GFreeProc* freeProc); // constructs of an empty hash
+  GHash(bool doFree=true); // constructs of an empty hash (free the item objects)
+  void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; }
+  void setFreeItem(bool doFree) { fFreeProc=(doFree)? &DefaultFreeProc : NULL; }
+  int Capacity() const { return fCapacity; } // table's size, including the empty slots.
+  void Resize(int m); // Resize the table to the given size.
+  int Count() const { return fCount; }// the total number of entries in the table.
+  // Insert a new entry into the table given key and mark.
+  // If there is already an entry with that key, its data pointer is
+  // replaced by ptr (the previous data is not freed, the mark is kept).
+  const OBJ* Add(const char* ky, const OBJ* ptr=NULL, bool mrk=false);
+  //same as Add, but the key pointer is stored directly, no string duplicate
+  //is made (shared-key-Add)
+  const OBJ* shkAdd(const char* ky, const OBJ* ptr, bool mrk=false);
+  // Replace data at key, if the entry's mark is less than
+  // or equal to the given mark. If there was no existing entry,
+  // a new entry is inserted with the given mark.
+  OBJ* Replace(const char* ky, const OBJ* ptr, bool mrk=false);
+  // Remove a given key and its data
+  OBJ* Remove(const char* ky);
+  // Find data OBJ* given key.
+  OBJ* Find(const char* ky, char** keyptr=NULL);
+  bool hasKey(const char* ky);
+  char* getLastKey() { return lastkeyptr; }
+  OBJ* operator[](const char* ky) { return Find(ky); }
+  void startIterate(); //iterator-like initialization
+  char* NextKey(); //returns next valid key in the table (NULL if no more)
+  OBJ* NextData(); //returns next valid hash[].data
+  OBJ* NextData(char*& nextkey); //returns next valid hash[].data
+                                 //or NULL if no more
+                                 //nextkey is SET to the corresponding key
+  GHashEntry* NextEntry() { //returns a pointer to a GHashEntry
+    int pos=fCurrentEntry;
+    //the constructors leave the cursor at -1: start from slot 0 rather
+    //than reading hash[-1] when startIterate() was not called
+    if (pos<0) pos=0;
+    while (pos<fCapacity && hash[pos].hash<0) pos++;
+    //'>=' also stops a stale cursor lying beyond the end of a table
+    //that was shrunk in the meantime
+    if (pos>=fCapacity) {
+      fCurrentEntry=fCapacity;
+      return NULL;
+    }
+    fCurrentEntry=pos+1;
+    return &hash[pos];
+  }
+  /// Clear all entries
+  void Clear();
+  /// Destructor
+  virtual ~GHash();
+ };
+//======================== method definitions ========================
+// Notes:
+// - The hash algorithm should yield a number in the range [0...GHash::EMPTY)
+//   GHash::EMPTY and GHash::UNUSED are needed for flag purposes.
+// - Since the algorithm doubles the table size when exceeding MAX_LOAD,
+//   it would be prudent to keep MIN_LOAD less than 1/2 MAX_LOAD;
+//   otherwise, the algorithm might hip-hop between halving and doubling,
+//   which would be quite expensive!!
+// - Not many people seem to know that hash table sizes don't have to be prime
+//   numbers; in fact, a table size of 2**n and odd probe distance are very
+//   easy to arrange, and this works just as well!
+// - We store the hash key, so that 99.999% of the time we can compare hash numbers;
+//   only when hash numbers match do we need to compare keys.
+//   Thus, with a good hash function, the number of calls to strcmp() should be
+//   roughly the same as the number of successful lookups.
+// - The hash table should NEVER get full, or stuff will loop forever!!
+// Initial table size (MUST be power of 2)
+#define DEF_HASH_SIZE 32
+// Maximum hash table load factor (%)
+#define MAX_LOAD 80
+// Minimum hash table load factor (%)
+#define MIN_LOAD 10
+// Probe Position [0..n-1]
+#define HASH1(x,n) (((unsigned int)(x)*13)%(n))
+// Probe Distance [1..n-1]
+#define HASH2(x,n) (1|(((unsigned int)(x)*17)%((n)-1)))
+#define FREEDATA (fFreeProc!=NULL)
+// Construct an empty hash; item data will be released through freeProc
+// (nothing is released when freeProc is NULL)
+template <class OBJ> GHash<OBJ>::GHash(GFreeProc* freeProc) {
+  GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE);
+  fFreeProc=freeProc;
+  lastkeyptr=NULL;
+  fCurrentEntry=-1;
+  //a hash value of -1 is the indicator for 'empty' entries
+  for (uint slot=0; slot<DEF_HASH_SIZE; ++slot) {
+    hash[slot].hash=-1;
+  }
+  fCount=0;
+  fCapacity=DEF_HASH_SIZE;
+ }
+// Construct an empty hash; with doFree the items are deleted through
+// DefaultFreeProc, otherwise item data is never released by the table
+template <class OBJ> GHash<OBJ>::GHash(bool doFree) {
+  GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE);
+  if (doFree) fFreeProc=&DefaultFreeProc;
+  else fFreeProc=NULL;
+  lastkeyptr=NULL;
+  fCurrentEntry=-1;
+  //a hash value of -1 is the indicator for 'empty' entries
+  for (uint slot=0; slot<DEF_HASH_SIZE; ++slot) {
+    hash[slot].hash=-1;
+  }
+  fCount=0;
+  fCapacity=DEF_HASH_SIZE;
+ }
+// Resize table: pick a capacity (halving/doubling the current one) such
+// that m <= capacity/2, then re-insert every valid entry in a new table
+template <class OBJ> void GHash<OBJ>::Resize(int m){
+  GASSERT(fCount<=fCapacity);
+  int newcap=fCapacity;
+  while ((newcap>>2)>m) newcap>>=1; // Shrink until newcap/4 <= m
+  while ((newcap>>1)<m) newcap<<=1; // Grow until m <= newcap/2
+  GASSERT(m<=(newcap>>1));
+  if (newcap==fCapacity) return; //capacity unchanged, nothing to do
+  GASSERT(m<=newcap);
+  GHashEntry* table;
+  GMALLOC(table, sizeof(GHashEntry)*newcap);
+  for (int s=0; s<newcap; s++) table[s].hash=-1;
+  for (int s=0; s<fCapacity; s++) {
+    const int h=hash[s].hash;
+    if (h<0) continue; //slot holds no valid entry: not carried over
+    int p=HASH1(h,newcap);
+    GASSERT(0<=p && p<newcap);
+    const int step=HASH2(h,newcap);
+    GASSERT(1<=step && step<newcap);
+    //probe until a free slot of the new table is reached
+    while (table[p].hash!=-1) p=(p+step)%newcap;
+    GASSERT(table[p].hash<0);
+    table[p]=hash[s];
+  }
+  GFREE(hash);
+  hash=table;
+  fCapacity=newcap;
+ }
+// add a new entry, or update its data pointer if the key already exists;
+// for a new entry the key string is duplicated and owned by the table
+template <class OBJ> const OBJ* GHash<OBJ>::Add(const char* ky,
+                      const OBJ* pdata,bool mrk){
+  if (!ky) GError("GHash::insert: NULL key argument.\n");
+  GASSERT(fCount<fCapacity);
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int step=HASH2(h,fCapacity);
+  GASSERT(1<=step && step<fCapacity);
+  int slot=-1; //first slot flagged -2 (removed entry) met while probing
+  for (int left=fCapacity; left && hash[p].hash!=-1; --left) {
+    if (slot==-1 && hash[p].hash==-2) slot=p;
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
+      //replace hash data for this key!
+      lastkeyptr=hash[p].key;
+      hash[p].data = (void*) pdata;
+      return (OBJ*)hash[p].data;
+    }
+    p=(p+step)%fCapacity;
+  }
+  if (slot==-1) slot=p; //no removed slot to reuse: take the slot probing stopped at
+  GTRACE(("GHash::insert: key=\"%s\"\n",ky));
+  //GMessage("GHash::insert: key=\"%s\"\n",ky);
+  GASSERT(0<=slot && slot<fCapacity);
+  GASSERT(hash[slot].hash<0);
+  GHashEntry& entry=hash[slot];
+  entry.hash=h;
+  entry.mark=mrk;
+  entry.key=Gstrdup(ky);
+  entry.keyalloc=true;
+  lastkeyptr=entry.key;
+  entry.data= (void*) pdata;
+  fCount++;
+  if ((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
+  GASSERT(fCount<fCapacity);
+  return pdata;
+ }
+// shared-key Add: like Add(), but for a new entry the ky pointer itself is
+// stored (no duplicate is made) and the entry is flagged as not owning it
+template <class OBJ> const OBJ* GHash<OBJ>::shkAdd(const char* ky,
+                      const OBJ* pdata,bool mrk){
+  if (!ky) GError("GHash::insert: NULL key argument.\n");
+  GASSERT(fCount<fCapacity);
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int step=HASH2(h,fCapacity);
+  GASSERT(1<=step && step<fCapacity);
+  int slot=-1; //first slot flagged -2 (removed entry) met while probing
+  for (int left=fCapacity; left && hash[p].hash!=-1; --left) {
+    if (slot==-1 && hash[p].hash==-2) slot=p;
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
+      //replace hash data for this key!
+      lastkeyptr=hash[p].key;
+      hash[p].data = (void*) pdata;
+      return (OBJ*)hash[p].data;
+    }
+    p=(p+step)%fCapacity;
+  }
+  if (slot==-1) slot=p; //no removed slot to reuse: take the slot probing stopped at
+  GTRACE(("GHash::insert: key=\"%s\"\n",ky));
+  //GMessage("GHash::insert: key=\"%s\"\n",ky);
+  GASSERT(0<=slot && slot<fCapacity);
+  GASSERT(hash[slot].hash<0);
+  GHashEntry& entry=hash[slot];
+  entry.hash=h;
+  entry.mark=mrk;
+  entry.key=(char *)ky;
+  lastkeyptr=entry.key;
+  entry.keyalloc=false;
+  entry.data= (void*) pdata;
+  fCount++;
+  if ((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
+  GASSERT(fCount<fCapacity);
+  return pdata;
+ }
+// Add or replace entry.
+// If ky is already present and its mark is <= mrk, the stored data is
+// released through the free procedure (when one is set) and replaced by
+// pdata; with a greater mark the entry is left untouched. If ky is not
+// present a new entry owning a duplicate of the key is inserted.
+// Returns the data pointer stored for ky after the call.
+template <class OBJ> OBJ* GHash<OBJ>::Replace(const char* ky,const OBJ* pdata, bool mrk){
+  if (!ky) { GError("GHash::replace: NULL key argument.\n"); }
+  GASSERT(fCount<fCapacity);
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int x=HASH2(h,fCapacity);
+  GASSERT(1<=x && x<fCapacity);
+  int i=-1; //first slot flagged -2 (removed entry) met while probing
+  int n=fCapacity;
+  while (n && hash[p].hash!=-1) {
+    if ((i==-1)&&(hash[p].hash==-2)) i=p;
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
+      if (hash[p].mark<=mrk) {
+        GTRACE(("GHash::replace: %08x: replacing: \"%s\"\n",this,ky));
+        //storing the same pointer again must not release the object
+        if (FREEDATA && hash[p].data!=(void*)pdata) (*fFreeProc)(hash[p].data);
+        hash[p].mark=mrk;
+        hash[p].data=(void*)pdata; //explicit cast, as in Add(): data is a non-const pointer
+      }
+      lastkeyptr=hash[p].key;
+      return (OBJ*)hash[p].data;
+    }
+    p=(p+x)%fCapacity;
+    n--;
+  }
+  if (i==-1) i=p;
+  GTRACE(("GHash::replace: %08x: inserting: \"%s\"\n",this,ky));
+  GASSERT(0<=i && i<fCapacity);
+  GASSERT(hash[i].hash<0);
+  hash[i].hash=h;
+  hash[i].mark=mrk;
+  hash[i].key=Gstrdup(ky);
+  //the key was duplicated above, so the entry owns it; leaving keyalloc
+  //unset would make Remove()/Clear()/~GHash() test an uninitialized flag
+  hash[i].keyalloc=true;
+  lastkeyptr=hash[i].key;
+  hash[i].data=(void*)pdata;
+  fCount++;
+  if ((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
+  GASSERT(fCount<fCapacity);
+  return (OBJ*)pdata;
+ }
+// Remove entry: the slot is flagged -2 (removed), a key owned by the table
+// is freed and the data is released when a free procedure is set.
+// Always returns NULL.
+template <class OBJ> OBJ* GHash<OBJ>::Remove(const char* ky){
+  if (!ky) { GError("GHash::remove: NULL key argument.\n"); }
+  if (fCount<=0) return NULL;
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int step=HASH2(h,fCapacity);
+  GASSERT(1<=step && step<fCapacity);
+  GASSERT(fCount<fCapacity);
+  for (int left=fCapacity; left && hash[p].hash!=-1; --left) {
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
+      GTRACE(("GHash::remove: %08x removing: \"%s\"\n",this,ky));
+      GHashEntry& entry=hash[p];
+      entry.hash=-2;
+      entry.mark=false;
+      if (entry.keyalloc) GFREE((entry.key));
+      if (FREEDATA) (*fFreeProc)(entry.data);
+      entry.key=NULL;
+      entry.data=NULL;
+      fCount--;
+      //shrink the table when the load drops to MIN_LOAD percent or less
+      if ((100*fCount)<=(MIN_LOAD*fCapacity)) Resize(fCount);
+      GASSERT(fCount<fCapacity);
+      return NULL;
+    }
+    p=(p+step)%fCapacity;
+  }
+  return NULL;
+ }
+// Find entry: returns true if the table holds an entry for key ky
+template <class OBJ> bool GHash<OBJ>::hasKey(const char* ky) {
+  if (!ky) { GError("GHash::find: NULL key argument.\n"); }
+  if (fCount<=0) return false;
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int step=HASH2(h,fCapacity);
+  GASSERT(1<=step && step<fCapacity);
+  GASSERT(fCount<fCapacity);
+  //probing ends at the first never-used slot (-1) or after fCapacity steps
+  for (int left=fCapacity; left && hash[p].hash!=-1; --left) {
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) return true;
+    p=(p+step)%fCapacity;
+  }
+  return false;
+}
+// Return the data stored for key ky, or NULL if the key is not present;
+// when keyptr is not NULL and the key is found, *keyptr receives the
+// table's own key string
+template <class OBJ> OBJ* GHash<OBJ>::Find(const char* ky, char** keyptr){
+  if (!ky) { GError("GHash::find: NULL key argument.\n"); }
+  if (fCount<=0) return NULL;
+  const int h=strhash(ky);
+  GASSERT(0<=h);
+  int p=HASH1(h,fCapacity);
+  GASSERT(0<=p && p<fCapacity);
+  const int step=HASH2(h,fCapacity);
+  GASSERT(1<=step && step<fCapacity);
+  GASSERT(fCount<fCapacity);
+  //probing ends at the first never-used slot (-1) or after fCapacity steps
+  for (int left=fCapacity; left && hash[p].hash!=-1; --left) {
+    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
+      if (keyptr!=NULL) *keyptr = hash[p].key;
+      return (OBJ*)hash[p].data;
+    }
+    p=(p+step)%fCapacity;
+  }
+  return NULL;
+ }
+// initialize a key iterator: the next NextKey()/NextData()/NextEntry()
+// call starts scanning from the first slot of the table
+template <class OBJ> void GHash<OBJ>::startIterate() {
+  fCurrentEntry=0;
+}
+// Return the key of the next valid entry of the current iteration and
+// advance the cursor; NULL when there are no more entries
+template <class OBJ> char* GHash<OBJ>::NextKey() {
+  int pos=fCurrentEntry;
+  //the constructors leave the cursor at -1: start from slot 0 rather
+  //than reading hash[-1] when startIterate() was not called
+  if (pos<0) pos=0;
+  while (pos<fCapacity && hash[pos].hash<0) pos++;
+  //'>=' also stops a stale cursor lying beyond the table end (Resize()
+  //can shrink the table without adjusting the cursor)
+  if (pos>=fCapacity) {
+    fCurrentEntry=fCapacity;
+    return NULL;
+  }
+  fCurrentEntry=pos+1;
+  return hash[pos].key;
+}
+// Return the data of the next valid entry of the current iteration and
+// advance the cursor; NULL when there are no more entries
+template <class OBJ> OBJ* GHash<OBJ>::NextData() {
+  int pos=fCurrentEntry;
+  //the constructors leave the cursor at -1: start from slot 0 rather
+  //than reading hash[-1] when startIterate() was not called
+  if (pos<0) pos=0;
+  while (pos<fCapacity && hash[pos].hash<0) pos++;
+  //'>=' also stops a stale cursor lying beyond the table end (Resize()
+  //can shrink the table without adjusting the cursor)
+  if (pos>=fCapacity) {
+    fCurrentEntry=fCapacity;
+    return NULL;
+  }
+  fCurrentEntry=pos+1;
+  return (OBJ*)hash[pos].data;
+}
+// Return the data of the next valid entry of the current iteration and
+// set nextkey to its key; returns NULL (and sets nextkey to NULL) when
+// there are no more entries
+template <class OBJ> OBJ* GHash<OBJ>::NextData(char* &nextkey) {
+  int pos=fCurrentEntry;
+  //the constructors leave the cursor at -1: start from slot 0 rather
+  //than reading hash[-1] when startIterate() was not called
+  if (pos<0) pos=0;
+  while (pos<fCapacity && hash[pos].hash<0) pos++;
+  //'>=' also stops a stale cursor lying beyond the table end (Resize()
+  //can shrink the table without adjusting the cursor)
+  if (pos>=fCapacity) {
+    fCurrentEntry=fCapacity;
+    nextkey=NULL;
+    return NULL;
+  }
+  fCurrentEntry=pos+1;
+  nextkey=hash[pos].key;
+  return (OBJ*)hash[pos].data;
+}
+// Get first non-empty entry: position of the first valid slot, or
+// fCapacity when the table holds none
+template <class OBJ> int GHash<OBJ>::First() const {
+  int pos=0;
+  while (pos<fCapacity && hash[pos].hash<0) pos++;
+  GASSERT(fCapacity<=pos || 0<=hash[pos].hash);
+  return pos;
+ }
+// Get last non-empty entry: position of the last valid slot, or -1 when
+// the table holds none
+template <class OBJ> int GHash<OBJ>::Last() const {
+  int pos=fCapacity-1;
+  while (pos>=0 && hash[pos].hash<0) pos--;
+  GASSERT(pos<0 || 0<=hash[pos].hash);
+  return pos;
+ }
+// Find next valid entry after pos; returns fCapacity when there is none
+template <class OBJ> int GHash<OBJ>::Next(int pos) const {
+  GASSERT(0<=pos && pos<fCapacity);
+  for (++pos; pos<fCapacity && hash[pos].hash<0; ++pos) { }
+  GASSERT(fCapacity<=pos || 0<=hash[pos].hash);
+  return pos;
+ }
+// Find previous valid entry before pos; returns -1 when there is none
+template <class OBJ> int GHash<OBJ>::Prev(int pos) const {
+  GASSERT(0<=pos && pos<fCapacity);
+  for (--pos; pos>=0 && hash[pos].hash<0; --pos) { }
+  GASSERT(pos<0 || 0<=hash[pos].hash);
+  return pos;
+ }
+// Remove all entries and shrink the table back to DEF_HASH_SIZE slots.
+// Keys owned by the table are freed; item data is released only when a
+// free procedure is set.
+template <class OBJ> void GHash<OBJ>::Clear(){
+  int i;
+  for (i=0; i<fCapacity; i++) {
+    if (hash[i].hash>=0) {
+      if (hash[i].keyalloc) GFREE((hash[i].key));
+      //fFreeProc is NULL when the items are not owned by the table
+      //(setFreeItem(false) / GHash(false)); calling through it
+      //unconditionally would dereference a NULL function pointer
+      if (FREEDATA) (*fFreeProc)(hash[i].data);
+    }
+  }
+  GFREE(hash);
+  GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE);
+  //reinitialize it
+  for (i=0; i<DEF_HASH_SIZE; i++)
+    hash[i].hash=-1; //this will be an indicator for 'empty' entries
+  fCapacity=DEF_HASH_SIZE;
+  fCount=0;
+  lastkeyptr=NULL;  //the key it pointed to may have just been freed
+  fCurrentEntry=0;  //the old cursor may lie beyond the new, smaller table
+ }
+// Save data -- NOTE(review): not a template member and uses Stream/Object, which are not declared in this header; presumably disabled legacy code whose comment markers were lost -- confirm before relying on it
+void GHash::Save(Stream& store) const {
+ Object::save(store);
+ store << fCapacity;
+ store << fCount;
+ for(int i=0; i<fCapacity; i++){
+ store << hash[i].hash;
+ if(hash[i].hash>=0){ //only valid entries carry a key and mark
+ uint len=strlen(hash[i].key);
+ store << len;
+ store << hash[i].mark;
+ store.save(hash[i].key,len);
+ }
+ }
+ }
+// Load data -- NOTE(review): counterpart of Save above; presumably disabled legacy code (Stream/Object are not declared in this header) -- confirm
+void GHash::Load(Stream& store){
+ Object::load(store);
+ store >> fCapacity;
+ store >> fCount;
+ for(int i=0; i<fCapacity; i++){
+ store >> hash[i].hash;
+ if(hash[i].hash>=0){ //only valid entries carry a key and mark
+ uint len;
+ store >> len;
+ store >> hash[i].mark;
+ GMALLOC(hash[i].key,len+1);
+ store.load(hash[i].key,len);
+ hash[i].key[len]='\0'; //terminate the key read from the stream
+ }
+ }
+ }
+// Destroy table: free the keys owned by the table, release item data when
+// a free procedure is set, then free the slot array
+template <class OBJ> GHash<OBJ>::~GHash(){
+  for (int slot=0; slot<fCapacity; slot++) {
+    if (hash[slot].hash<0) continue; //no valid entry in this slot
+    if (hash[slot].keyalloc) GFREE((hash[slot].key));
+    if (FREEDATA) (*fFreeProc)(hash[slot].data);
+  }
+  GFREE(hash);
+ }
diff --git a/include/GList.hh b/include/GList.hh
new file mode 100644
index 0000000..13e0729
--- /dev/null
+++ b/include/GList.hh
@@ -0,0 +1,638 @@
+// Sortable collections of objects and object pointers
+#ifndef _GList_HH
+#define _GList_HH
+#include "GVec.hh"
+#define GLIST_SORTED_ERR "Operation not allowed on a sorted list!\n"
+#define GLIST_UNSORTED_ERR "Operation not allowed on an unsorted list!\n"
+//------ useful macros:
+#define BE_UNSORTED if (fCompareProc!=NULL) { GError(GLIST_SORTED_ERR); return; }
+#define BE_SORTED if (fCompareProc==NULL) { GError(GLIST_UNSORTED_ERR); return; }
+#define SORTED (fCompareProc!=NULL)
+#define UNSORTED (fCompareProc==NULL)
+// GArray is the sortable array type (derived from GVec<OBJ>); requires the comparison operator < to be defined for OBJ
+template <class OBJ> class GArray:public GVec<OBJ> {
+ protected:
+ bool fUnique; //when set, Add(OBJ*) returns -1 for an item that Found() reports as already present
+ static int DefaultCompareProc(const pointer item1, const pointer item2) {
+ //operator< MUST be defined for OBJ class!
+ if (*((OBJ*)item2) < *((OBJ*)item1)) return 1;
+ else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1;
+ else return 0;
+ }
+ GCompareProc* fCompareProc; //NULL means the array is unsorted (see the SORTED/UNSORTED macros)
+ public:
+ GArray(GCompareProc* cmpFunc=NULL);
+ GArray(bool sorted, bool unique=false);
+ GArray(int init_capacity, bool sorted, bool unique=false);
+ GArray(GArray<OBJ>& array); //copy constructor
+ const GArray<OBJ>& operator=(GArray<OBJ>& array);
+ //~GArray();
+ //assignment operator
+ void setSorted(GCompareProc* cmpFunc);
+ void setSorted(bool sorted) { //true: sort with DefaultCompareProc (unless already in use); false: mark as unsorted
+ if (sorted) {
+ if (fCompareProc!=&DefaultCompareProc) {
+ fCompareProc=&DefaultCompareProc;
+ Sort();
+ }
+ }
+ else fCompareProc=NULL;
+ }
+ //sort the array if cmpFunc not NULL or changes
+ int Add(OBJ* item); // specific implementation if sorted
+ int Add(OBJ& item) { return Add(&item); } //both will CREATE a new OBJ and COPY to it
+ // using OBJ new operator=
+ int cAdd(OBJ item) { return Add(&item); } //forwards a by-value copy to Add(OBJ*)
+ int cPush(OBJ item) { return Add(&item); } //same as cAdd
+ int Push(OBJ& item) { return Add(&item); } //same as Add(OBJ&)
+ void Add(GArray<OBJ>& list); //add copies of all items from another list
+ //this will reject identical items in sorted lists only!
+ void setUnique(bool beUnique) { fUnique = beUnique; };
+ void Sort(); //explicit sort may be requested
+ bool Sorted() { return fCompareProc!=NULL; }
+ void Replace(int idx, OBJ& item); //Put, use operator= to copy
+ int Unique() { return fUnique; } //returns the fUnique flag
+ int IndexOf(OBJ& item);
+ //this needs the == operator to have been defined for OBJ
+ bool Found(OBJ& item, int& idx); // sorted: binary search with fCompareProc; unsorted: linear search with ==
+ //search by content; if found, returns true and idx will be the index
+ //of the first item found matching for which fCompareProc returns 0
+ bool Exists(OBJ& item); //same as above without existing index info
+ //unsorted only, place item at position idx:
+ void Move(int curidx, int newidx);
+ void Insert(int idx, OBJ* item);
+ void Insert(int idx, OBJ item) { Insert(idx,&item); }
+//GList is a sortable collection of pointers to objects (derived from GPVec<OBJ>); requires operator< to be defined, or a custom compare function
+template <class OBJ> class GList:public GPVec<OBJ> {
+ protected:
+ bool fUnique; //when set, Add(OBJ*) returns -1 for an item that Found() reports as already present
+ GCompareProc* fCompareProc; //a pointer to a Compare function; NULL means the list is unsorted
+ static int DefaultCompareProc(const pointer item1, const pointer item2) {
+ //operator< MUST be defined for OBJ class!
+ if (*((OBJ*)item2) < *((OBJ*)item1)) return 1;
+ else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1;
+ else return 0;
+ }
+ public:
+ void sortInsert(int idx, OBJ* item); //insert item at idx (0..fCount), shifting the following pointers
+ GList(GCompareProc* compareProc=NULL); //free by default
+ GList(GCompareProc* compareProc, //unsorted by default
+ GFreeProc *freeProc,
+ bool beUnique=false);
+ GList(bool sorted, bool free_elements=true, bool beUnique=false);
+ GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false);
+ GList(GList<OBJ>& list); //copy constructor?
+ GList(GList<OBJ>* list); //kind of a copy constructor
+ const GList<OBJ>& operator=(GList<OBJ>& list); //copies the pointers only, the objects are not duplicated
+ //void Clear();
+ //~GList();
+ void setSorted(GCompareProc* compareProc);
+ //sorted if compareProc not NULL; sort the list if compareProc changes !
+ bool Sorted() { return fCompareProc!=NULL; }
+ void setSorted(bool sorted) { //true: sort with DefaultCompareProc (unless already in use); false: mark as unsorted
+ if (sorted) {
+ if (fCompareProc!=&DefaultCompareProc) {
+ fCompareProc=&DefaultCompareProc;
+ Sort();
+ }
+ }
+ else fCompareProc=NULL;
+ }
+ int Add(OBJ* item); //-- specific implementation if sorted
+ void Add(GList<OBJ>& list); //add all pointers from another list
+ OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL);
+ // default: delete item if Found() (and pointers are not equal)!
+ //returns the equal (==) object if it's in the list already
+ //or the item itself if it is unique and actually added
+ int AddedIfNew(OBJ* item);
+ // if Found(item) (and pointers are not equal) delete item and returns -1
+ // if added, returns the new item index
+ int Unique() { return fUnique; } //returns the fUnique flag
+ //this will reject identical items in sorted lists only!
+ void setUnique(bool beUnique) { fUnique = beUnique; };
+ GCompareProc* GetCompareProc() {return fCompareProc;}
+ int IndexOf(OBJ* item); //this has a specific implementation for sorted lists
+ //if list is sorted, item data is located by binary search
+ //based on the Compare function
+ //if not, a linear search is performed, but
+ //this needs the == operator to have been defined for OBJ
+ void Put(int idx, OBJ* item, bool re_sort=false); //overwrite the pointer at idx; the replaced item is not freed
+ bool Found(OBJ* item, int & idx); // sorted: binary search with fCompareProc; unsorted: linear search with ==
+ //search by content; if found, returns true and idx will be the index
+ //of the first item found matching for which GTCompareProc returns 0
+ bool Exists(OBJ* item); //same as above without existing index info
+ bool Exists(OBJ& item); //same as above without existing index info
+ void Sort(); //explicit sort may be requested using this function
+ int Remove(OBJ* item); //search for pointer, using binary search if sorted
+ void Insert(int idx, OBJ* item); //unsorted only, place item at position idx
+ void Move(int curidx, int newidx); //unsorted only
+}; //GList
+//-------------------- TEMPLATE IMPLEMENTATION-------------------------------
+//copy constructor: allocates the same capacity as the source and copies
+//its items one by one with OBJ operator=
+template <class OBJ> GArray<OBJ>::GArray(GArray<OBJ>& array):GVec<OBJ>(0) {
+  fUnique=array.fUnique;
+  fCompareProc=array.fCompareProc;
+  this->fCapacity=array.fCapacity;
+  this->fCount=array.fCount;
+  this->fArray=NULL;
+  if (this->fCapacity>0) {
+    //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ));
+    this->fArray=new OBJ[this->fCapacity];
+  }
+  for (int k=0; k<this->fCount; k++) {
+    this->fArray[k]=array[k]; // uses OBJ operator=
+  }
+ }
+//assignment: clears this array, then takes the capacity, flags and a copy
+//of every item (OBJ operator=) of the source; self-assignment is a no-op
+template <class OBJ> const GArray<OBJ>& GArray<OBJ>::operator=(GArray<OBJ>& array) {
+  if (&array==this) return *this;
+  GVec<OBJ>::Clear();
+  this->fUnique=array.fUnique;
+  this->fCompareProc=array.fCompareProc;
+  this->fCapacity=array.fCapacity;
+  if (this->fCapacity>0) {
+    //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ));
+    this->fArray=new OBJ[this->fCapacity];
+  }
+  this->fCount=array.fCount;
+  for (int k=0; k<this->fCount; k++) {
+    this->fArray[k]=array[k]; // uses OBJ operator=
+  }
+  return *this;
+}
+//empty array using cmpFunc as compare function (unsorted when it is NULL)
+template <class OBJ> GArray<OBJ>::GArray(GCompareProc* cmpFunc):GVec<OBJ>(0) {
+  fUnique = false; //only affects sorted lists
+  fCompareProc = cmpFunc;
+}
+//empty array, sorted with DefaultCompareProc when requested
+template <class OBJ> GArray<OBJ>::GArray(bool sorted, bool unique):GVec<OBJ>(0) {
+  fCompareProc = sorted ? DefaultCompareProc : NULL;
+  fUnique=unique;
+}
+//empty array with the given initial capacity, sorted with
+//DefaultCompareProc when requested
+template <class OBJ> GArray<OBJ>::GArray(int init_capacity,
+                        bool sorted, bool unique):GVec<OBJ>(init_capacity) {
+  fCompareProc=sorted ? DefaultCompareProc : NULL;
+  fUnique=unique;
+}
+//install cmpFunc as the compare function; the array is re-sorted when the
+//function changed and is not NULL (NULL simply marks the array unsorted)
+template <class OBJ> void GArray<OBJ>::setSorted(GCompareProc* cmpFunc) {
+  const bool changed=(cmpFunc!=fCompareProc);
+  fCompareProc=cmpFunc;
+  if (changed && fCompareProc!=NULL) {
+    Sort(); //new compare method
+  }
+}
+//index of item as reported by Found(), or -1 when it is not found
+template <class OBJ> int GArray<OBJ>::IndexOf(OBJ& item) {
+  int pos=0;
+  return Found(item, pos) ? pos : -1;
+ }
+//true when Found() locates item; the index is discarded
+template <class OBJ> bool GArray<OBJ>::Exists(OBJ& item) {
+  int pos=0;
+  return Found(item, pos);
+ }
+//add a copy of *item; returns the index where it was stored, or -1 when
+//item is NULL or when fUnique is set and an equal item is already present
+template <class OBJ> int GArray<OBJ>::Add(OBJ* item) {
+  if (item==NULL) return -1;
+  int pos;
+  if (SORTED) {
+    //Found sets pos to the position where the item should be!
+    if (Found(*item, pos) && fUnique) return -1; //cannot add a duplicate!
+    GVec<OBJ>::Insert(pos, *item);
+    return pos;
+  }
+  if (fUnique && Found(*item,pos)) return -1; //set behaviour
+  pos = this->fCount;
+  if (pos==this->fCapacity) GVec<OBJ>::Grow();
+  this->fArray[pos] = *item; //operator=, copies the item
+  this->fCount++;
+  return pos;
+}
+//add copies of all items of list: one by one through Add(OBJ*) when this
+//array is sorted, appended in order otherwise
+template <class OBJ> void GArray<OBJ>::Add(GArray<OBJ>& list) {
+  if (list.Count()==0) return;
+  if (SORTED) {
+    for (int k=0; k<list.fCount; k++) Add(&list[k]);
+    return;
+  }
+  //simply copy
+  this->setCapacity(this->fCapacity+list.fCount);
+  const int base=this->fCount;
+  for (int k=0; k<list.fCount; k++) {
+    this->fArray[base+k]=list.fArray[k];
+  }
+  this->fCount+=list.fCount;
+}
+//search the array by content.
+//Sorted array: binary search with fCompareProc; even when the result is
+//false idx is set to the position where the item would belong.
+//Unsorted array: linear search with operator==; idx is -1 when not found.
+//An empty array always yields false with idx set to 0.
+template <class OBJ> bool GArray<OBJ>::Found(OBJ& item, int& idx) {
+  idx=-1;
+  if (this->fCount==0) { idx=0; return false; }
+  if (UNSORTED) {
+    // needs == operator to compare user defined objects !
+    for (int k=0; k<this->fCount; k++) {
+      if (this->fArray[k]==item) { //requires operator==
+        idx=k;
+        return true;
+      }
+    }
+    return false;
+  }
+  //do the simplest tests first: item before the first / after the last one
+  if ((*fCompareProc)(&(this->fArray[0]),&item)>0) {
+    idx=0;
+    return false;
+  }
+  if ((*fCompareProc)(&item, &(this->fArray[this->fCount-1]))>0) {
+    idx=this->fCount;
+    return false;
+  }
+  int lo=0;
+  int hi=this->fCount-1;
+  while (lo<=hi) {
+    const int mid=(lo+hi)>>1;
+    const int c=(*fCompareProc)(&(this->fArray[mid]), &item);
+    if (c==0) { //found!
+      idx=mid;
+      return true;
+    }
+    if (c<0) lo=mid+1;
+    else hi=mid-1;
+  }
+  idx=lo;
+  return false;
+}
+//insert item at position idx; only allowed on unsorted arrays
+//(BE_UNSORTED reports an error and returns otherwise)
+template <class OBJ> void GArray<OBJ>::Insert(int idx, OBJ* item) {
+  //idx can be [0..fCount] so an item can be actually added
+  BE_UNSORTED; //forbid this operation on sorted data
+  GVec<OBJ>::Insert(idx, item);
+}
+//exchange the items at positions curidx and newidx (unsorted arrays only).
+//Both positions must address existing items; equal positions are a no-op.
+//NOTE(review): this swaps the two items instead of shifting the ones in
+//between -- kept as the original body did; confirm that is intended.
+template <class OBJ> void GArray<OBJ>::Move(int curidx, int newidx) {
+  BE_UNSORTED; //cannot do this in a sorted list!
+  //the original test (curidx!=newidx || ...) reported an error for every
+  //real move and never validated curidx
+  if (curidx<0 || curidx>=this->fCount) GError(GVEC_INDEX_ERR, curidx);
+  if (newidx<0 || newidx>=this->fCount) GError(GVEC_INDEX_ERR, newidx);
+  if (curidx==newidx) return;
+  OBJ tmp=this->fArray[curidx]; //copy constructor here
+  this->fArray[curidx]=this->fArray[newidx];
+  this->fArray[newidx]=tmp;
+}
+//overwrite the item at idx with a copy of item (OBJ operator=);
+//a sorted array is re-sorted afterwards
+template <class OBJ> void GArray<OBJ>::Replace(int idx, OBJ& item) {
+  //TEST_INDEX(idx);
+  //GVEC_INDEX_ERR is given a single index argument, as everywhere else in
+  //this file (the former __FILE__/__LINE__ arguments did not match that)
+  if (idx<0 || idx>=this->fCount) GError(GVEC_INDEX_ERR, idx);
+  this->fArray[idx]=item;
+  if ( SORTED ) Sort(); //re-sort ! this could be very expensive, don't do it
+}
+//sort the array with the current compare function; an unsorted array
+//first gets DefaultCompareProc installed (and so becomes a sorted one)
+template <class OBJ> void GArray<OBJ>::Sort() {
+  if (UNSORTED) fCompareProc=DefaultCompareProc;
+  if (this->fArray==NULL || this->fCount<=0) return; //nothing to sort
+  this->qSort(0, this->fCount-1, fCompareProc);
+}
+//*=> GList implementation -- sortable array of pointers to OBJ
+//copy constructor: the GPVec base copies the pointer data, the sort and
+//uniqueness settings are taken over from list
+template <class OBJ> GList<OBJ>::GList(GList<OBJ>& list):GPVec<OBJ>(list) {
+  fCompareProc=list.fCompareProc;
+  fUnique=list.fUnique;
+}
+//another copy constructor: shallow copy of *plist -- the item pointers and
+//the free procedure are shared with the source list, the objects are not
+//duplicated
+template <class OBJ> GList<OBJ>::GList(GList<OBJ>* plist):GPVec<OBJ>(0) {
+  this->fCapacity=plist->fCapacity;
+  this->fList=NULL;
+  if (this->fCapacity>0) {
+    GMALLOC(this->fList, this->fCapacity*sizeof(OBJ*));
+  }
+  fUnique=plist->fUnique;
+  fCompareProc=plist->fCompareProc;
+  this->fFreeProc=plist->fFreeProc;
+  this->fCount=plist->fCount;
+  //skip the copy for an empty source: fList may be NULL then, and memcpy
+  //must not be given a NULL pointer even for a zero length
+  if (this->fCount>0) {
+    memcpy(this->fList, plist->fList, this->fCount*sizeof(OBJ*));
+  }
+  //for (int i=0;i<list->fCount;i++) Add(plist->Get(i));
+}
+//add all pointers from another list: one by one through Add(OBJ*) when
+//this list is sorted, appended in a single block otherwise
+template <class OBJ> void GList<OBJ>::Add(GList<OBJ>& list) {
+  if (list.Count()==0) return;
+  if (SORTED) {
+    for (int k=0; k<list.Count(); k++) Add(list[k]);
+    return;
+  }
+  //simply copy
+  this->setCapacity(this->fCapacity+list.fCount);
+  OBJ** tail=&(this->fList[this->fCount]);
+  memcpy(tail, list.fList, list.fCount*sizeof(OBJ*));
+  this->fCount+=list.fCount;
+}
+//empty list with explicit compare and free procedures
+template <class OBJ> GList<OBJ>::GList(GCompareProc* compareProc,
+                        GFreeProc* freeProc, bool beUnique) {
+  this->fFreeProc = freeProc;
+  fCompareProc = compareProc;
+  fUnique = beUnique; //only affects sorted lists
+}
+//empty list using compareProc (unsorted when NULL); the items are
+//released with GPVec's DefaultFreeProc
+template <class OBJ> GList<OBJ>::GList(GCompareProc* compareProc) {
+  this->fFreeProc = GPVec<OBJ>::DefaultFreeProc;
+  fCompareProc = compareProc;
+  fUnique = false; //only affects sorted lists
+}
+//empty list: sorted selects DefaultCompareProc (else unsorted),
+//free_elements selects GPVec's DefaultFreeProc (else items are not freed)
+template <class OBJ> GList<OBJ>::GList(bool sorted,
+                        bool free_elements, bool beUnique) {
+  if (sorted) fCompareProc=&DefaultCompareProc;
+  else fCompareProc=NULL;
+  if (free_elements) this->fFreeProc=GPVec<OBJ>::DefaultFreeProc;
+  else this->fFreeProc=NULL;
+  fUnique=beUnique;
+}
+//empty list with an initial capacity; init_capacity and free_elements are
+//handled by the GPVec base, sorted selects DefaultCompareProc
+template <class OBJ> GList<OBJ>::GList(int init_capacity, bool sorted,
+      bool free_elements, bool beUnique):GPVec<OBJ>(init_capacity, free_elements) {
+  if (sorted) fCompareProc=&DefaultCompareProc;
+  else fCompareProc=NULL;
+  fUnique=beUnique;
+}
+//assignment: clears this list, takes over the compare/free procedures and
+//the uniqueness flag of the source, then adds its pointers one by one.
+//Self-assignment is a no-op.
+template <class OBJ> const GList<OBJ>& GList<OBJ>::operator=(GList& list) {
+  if (&list!=this) {
+    GPVec<OBJ>::Clear();
+    fCompareProc=list.fCompareProc;
+    this->fFreeProc=list.fFreeProc;
+    //take the uniqueness setting too (as the copy constructors do), so a
+    //leftover fUnique of the destination cannot drop items while copying
+    fUnique=list.fUnique;
+    //Attention: the object pointers are copied directly,
+    //but the actual objects are NOT duplicated
+    for (int i=0;i<list.Count();i++) Add(list[i]);
+  }
+  return *this;
+}
+//install compareProc as the compare function; the list is re-sorted when
+//the function changed and is not NULL (NULL simply marks the list unsorted)
+template <class OBJ> void GList<OBJ>::setSorted(GCompareProc* compareProc) {
+  const bool changed=(compareProc!=fCompareProc);
+  fCompareProc=compareProc;
+  if (changed && fCompareProc!=NULL) {
+    Sort(); //new compare method
+  }
+}
+//index of item as reported by Found(), or -1 when it is not found
+template <class OBJ> int GList<OBJ>::IndexOf(OBJ* item) {
+  int pos=0;
+  return Found(item, pos) ? pos : -1;
+ }
+//true when Found() locates an item matching the given object
+template <class OBJ> bool GList<OBJ>::Exists(OBJ& item) {
+  int pos=0;
+  return Found(&item, pos);
+ }
+//true when Found() locates an item matching *item
+template <class OBJ> bool GList<OBJ>::Exists(OBJ* item) {
+  int pos=0;
+  return Found(item, pos);
+ }
+//add the item pointer; returns the index where it was stored, or -1 when
+//item is NULL or when fUnique is set and an equal item is already present
+template <class OBJ> int GList<OBJ>::Add(OBJ* item) {
+  if (item==NULL) return -1;
+  int pos;
+  if (SORTED) {
+    //Found sets pos to the position where the item should be!
+    if (Found(item, pos) && fUnique) return -1; //duplicates forbidden
+    sortInsert(pos, item);
+    return pos;
+  }
+  if (fUnique && Found(item,pos)) return -1; //set behaviour
+  pos = this->fCount;
+  if (pos==this->fCapacity) GPVec<OBJ>::Grow();
+  this->fList[pos]=item;
+  this->fCount++;
+  return pos;
+}
+//by default, it deletes the item if it has an equal in the list!
+//returns the existing equal (==) object if it's in the list already
+//or returns the item itself if it's unique (and adds it);
+//when fidx is not NULL it receives the index of the returned object
+template <class OBJ> OBJ* GList<OBJ>::AddIfNew(OBJ* item,
+                             bool deleteIfFound, int* fidx) {
+  int pos;
+  if (Found(item, pos)) {
+    OBJ* existing=this->fList[pos];
+    //never deallocate the object that is already stored in the list
+    if (deleteIfFound && (pointer)item != (pointer)existing) {
+      this->deallocate_item(item);
+    }
+    if (fidx!=NULL) *fidx=pos;
+    return this->fList[pos]; //found
+  }
+  //not found:
+  if (SORTED) {
+    //Found() set pos to the position where the item should be inserted:
+    sortInsert(pos, item);
+  }
+  else {
+    pos = this->fCount;
+    if (pos==this->fCapacity) GPVec<OBJ>::Grow();
+    this->fList[pos]=item;
+    this->fCount++;
+  }
+  if (fidx!=NULL) *fidx=pos;
+  return item;
+}
+//if item is found already in the list DELETE it and return -1
+//(the item is not deallocated when it is the very pointer stored);
+//otherwise the item is added and its index is returned
+template <class OBJ> int GList<OBJ>::AddedIfNew(OBJ* item) {
+  int pos;
+  if (Found(item, pos)) {
+    if ((pointer)item != (pointer)(this->fList[pos])) {
+      this->deallocate_item(item);
+    }
+    return -1;
+  }
+  //not found:
+  if (SORTED) {
+    //Found() set pos to the position where the item should be inserted:
+    sortInsert(pos, item);
+    return pos;
+  }
+  pos = this->fCount;
+  if (pos==this->fCapacity) GPVec<OBJ>::Grow();
+  this->fList[pos]=item;
+  this->fCount++;
+  return pos;
+}
+//search the list by content.
+//Sorted list: binary search with fCompareProc; even when the result is
+//false idx is set to the position where the item would belong.
+//Unsorted list: linear search comparing the objects with operator==;
+//idx is -1 when not found.
+//An empty list always yields false with idx set to 0.
+template <class OBJ> bool GList<OBJ>::Found(OBJ* item, int& idx) {
+  idx=-1;
+  if (this->fCount==0) { idx=0; return false; }
+  if (UNSORTED) {
+    // needs == operator to compare user defined objects !
+    for (int k=0; k<this->fCount; k++) {
+      if (*this->fList[k]==*item) {
+        idx=k;
+        return true;
+      }
+    }
+    return false;
+  }
+  //do the simple tests first: item before the first / after the last one
+  if ((*fCompareProc)(this->fList[0],item)>0) {
+    idx=0;
+    return false;
+  }
+  if ((*fCompareProc)(item, this->fList[this->fCount-1])>0) {
+    idx=this->fCount;
+    return false;
+  }
+  int lo=0;
+  int hi=this->fCount-1;
+  while (lo<=hi) {
+    const int mid=(lo+hi)>>1;
+    const int c=(*fCompareProc)(this->fList[mid], item);
+    if (c==0) {
+      idx=mid;
+      return true;
+    }
+    if (c<0) lo=mid+1;
+    else hi=mid-1;
+  }
+  idx=lo;
+  return false;
+}
+//insert item at position idx (allowed range [0..fCount]); the pointer
+//formerly at idx and all those after it are shifted up by one
+template <class OBJ> void GList<OBJ>::sortInsert(int idx, OBJ* item) {
+  if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx);
+  if (this->fCount==this->fCapacity) {
+    //expand and also copy/move data and insert the new item
+    GPVec<OBJ>::Grow(idx, item);
+    return;
+  }
+  //room still left, just move data around and insert the new one
+  const int tail=this->fCount-idx; //number of pointers to shift
+  if (tail>0) { //copy/move pointers only!
+    memmove(&(this->fList[idx+1]), &(this->fList[idx]), tail*sizeof(OBJ*));
+  }
+  this->fList[idx]=item;
+  this->fCount++;
+}
+//insert item at position idx; only allowed on unsorted lists
+//(BE_UNSORTED reports an error and returns otherwise)
+template <class OBJ> void GList<OBJ>::Insert(int idx, OBJ* item) {
+  //idx can be [0..fCount] so an item can be actually added
+  BE_UNSORTED; //cannot do that with a sorted list!
+  GPVec<OBJ>::Insert(idx,item);
+}
+//forward to GPVec<OBJ>::Move(); only allowed on unsorted lists
+//(BE_UNSORTED reports an error and returns otherwise)
+template <class OBJ> void GList<OBJ>::Move(int curidx, int newidx) {
+  BE_UNSORTED; //cannot do this in a sorted list!
+  GPVec<OBJ>::Move(curidx,newidx);
+}
+//overwrite the pointer stored at idx, which must address an existing item
+//(0 <= idx < fCount)
+template <class OBJ> void GList<OBJ>::Put(int idx, OBJ* item, bool re_sort) {
+  //WARNING: this will never free the replaced item!
+  // this may BREAK the sort order unless the "re_sort" parameter is given
+  //idx==fCount is rejected too: that slot is not part of the list (the
+  //count is not increased here) and may lie beyond the allocated storage
+  if (idx<0 || idx>=this->fCount) GError(GVEC_INDEX_ERR, idx);
+  this->fList[idx]=item;
+  if (SORTED && item!=NULL && re_sort) Sort(); //re-sort
+}
+//removes an item if it's in our list; returns the index it was found at
+//(as given by IndexOf) or -1 when it is not in the list
+template <class OBJ> int GList<OBJ>::Remove(OBJ* item) {
+  const int pos=IndexOf(item);
+  if (pos<0) return pos; //not in the list
+  GPVec<OBJ>::Delete(pos);
+  return pos;
+}
+//sort the list with the current compare function; an unsorted list first
+//gets DefaultCompareProc installed (and so becomes a sorted one)
+template <class OBJ> void GList<OBJ>::Sort() {
+  if (UNSORTED) fCompareProc = DefaultCompareProc;
+  if (this->fList==NULL || this->fCount<=0) return; //nothing to sort
+  this->qSort(0, this->fCount-1, fCompareProc);
+}
diff --git a/include/GStr.h b/include/GStr.h
new file mode 100644
index 0000000..e2a89e7
--- /dev/null
+++ b/include/GStr.h
@@ -0,0 +1,213 @@
+#ifndef GSTR_H
+#define GSTR_H
+#include "GBase.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+// This class uses reference counting and copy-on-write semantics
+// All indexes are zero-based. For all functions that accept an index, a
+// negative index specifies an index from the right of the string. Also,
+// for all functions that accept a length, a length of -1 specifies the rest
+// of the string.
+// Tokenizing mode passed to GStr::startTokenize() (which defaults to tkCharSet).
+// NOTE(review): presumably tkFullString matches the delimiter as one whole string
+// while tkCharSet treats it as a set of delimiter characters -- confirm in GStr.cpp.
+enum enTokenizeMode {
+ tkFullString,
+ tkCharSet
+ };
+// String class whose character data lives in a separately allocated Data
+// record (see struct Data below) that carries a reference count.
+class GStr {
+ friend GStr operator+(const char* s1, const GStr& s2);
+ friend bool operator==(const char* s1, const GStr& s2);
+ friend bool operator<(const char* s1, const GStr& s2);
+ friend bool operator<=(const char* s1, const GStr& s2);
+ friend bool operator>(const char* s1, const GStr& s2);
+ friend bool operator>=(const char* s1, const GStr& s2);
+ friend bool operator!=(const char* s1, const GStr& s2);
+ friend void Gswap(GStr& s1, GStr& s2);
+ public:
+ GStr();
+ GStr(const GStr& s);
+ GStr(const char* s);
+ GStr(const int i);
+ GStr(const double f);
+ GStr(char c, int n = 1);
+ ~GStr();
+ operator const char* () const { return my_data->chars;} //inline here
+ char& operator[](int index);
+ char operator[](int index) const;
+ GStr& operator=(const GStr& s);
+ GStr& operator=(const char* s);
+ GStr& operator=(const int i);
+ GStr& operator=(const double f);
+ GStr operator+(const GStr& s) const;
+ GStr operator+(const char* s) const;
+ GStr operator+(const char c) const;
+ GStr operator+(const int i) const;
+ GStr operator+(const double f) const;
+ bool operator==(const GStr& s) const;
+ bool operator==(const char* s) const;
+ bool operator<(const GStr& s) const;
+ bool operator<(const char* s) const;
+ bool operator<=(const GStr& s) const;
+ bool operator<=(const char* s) const;
+ bool operator>(const GStr& s) const;
+ bool operator>(const char* s) const;
+ bool operator>=(const GStr& s) const;
+ bool operator>=(const char* s) const;
+ bool operator!=(const GStr& s) const;
+ bool operator!=(const char* s) const;
+ GStr& operator+=(const GStr& s);
+ GStr& operator+=(const char* s);
+ GStr& operator+=(const char c);
+ GStr& operator+=(const int i);
+ GStr& operator+=(const double f);
+ //interface:
+ public:
+ int length() const;
+ bool is_empty() const;
+ bool is_space() const;
+ GStr substr(int index = 0, int len = -1) const;
+ GStr to(char c); //return the first part up to first occurrence of c
+ //or whole string if c not found
+ GStr from(char c); //same as to, but starting from the right side
+ GStr copy() const;
+ GStr& format(const char *fmt,...);
+ GStr& reverse();
+ GStr& appendfmt(const char *fmt,...);
+ GStr& cut(int index = 0, int len = -1); //delete a specified length
+ //remove the inclusive index range [from..to]; forwards to cut()
+ GStr& remove(int from, int to) {
+ return cut(from, to-from+1);
+ }
+ //paste a string at the specified position
+ GStr& paste(const GStr& s, int index = 0, int len=-1);
+ GStr& paste(const char* s, int index = 0, int len = -1);
+ GStr& replace(const char* from, const char* to=NULL);
+ GStr& insert(const GStr& s, int index = 0);
+ GStr& insert(const char* s, int index = 0);
+ GStr& append(const char* s);
+ GStr& append(const GStr& s);
+ GStr& upper();
+ GStr& lower();
+ GStr& clear();//make empty
+ //character translation or removal:
+ GStr& tr(const char* from, const char* to=NULL);
+ //number of occurrences of a char in the string:
+ int count(char c);
+ void startTokenize(const char* delimiter=" \t\n", enTokenizeMode tokenizemode=tkCharSet);
+ bool nextToken(GStr& token);
+ int asInt(int base=10);
+ double asReal();
+ double asDouble() { return asReal(); } //alias of asReal()
+ bool asReal(double& r);
+ bool asDouble(double& r) { return asReal(r); } //alias of asReal(double&)
+ bool asInt(int& r, int base=10);
+ int index(const GStr& s, int start_index = 0) const;
+ int index(const char* s, int start_index = 0) const;
+ int index(char c, int start_index = 0) const;
+ int rindex(char c, int end_index = -1) const;
+ int rindex(const char* str, int end_index = -1) const;
+ bool contains(const GStr& s) const;
+ bool contains(const char* s) const;
+ bool contains(char c) const;
+ bool startsWith(const char* s) const;
+ bool startsWith(const GStr& s) const;
+ bool endsWith(const char* s) const;
+ bool endsWith(const GStr& s) const;
+ GStr split(const char* delim);
+ GStr split(char c);
+ /* splits "this" in two parts, at the first (leftmost)
+ encounter of delim:
+ 1st would stay in "this"
+ (which this way is truncated)
+ 2nd will go to the returned string
+ */
+ GStr splitr(const char* delim);
+ GStr splitr(char c);
+ /* splits "this" in two parts, at the last (rightmost)
+ encounter of delim:
+ 1st would stay in "this"
+ 2nd will be returned
+ */
+ int peelInt() const; //extract an integer, (left to right), from a
+ //mixed alphanumeric string, e.g. 'T24HC1234b'=> 2
+ //NOTE(review): this example looks inconsistent with the one below
+ //('T2HC1234b'); confirm the intended input/result against GStr.cpp
+ int peelIntR() const; //same as above, but starts from the right side
+ //e.g. 'T2HC1234b'=> 1234
+ GStr& trim(char c);
+ GStr& trim(const char* c=" \t\n\r"); //trim both ends of characters in given set
+ GStr& trimR(const char* c=" \t\n\r"); //trim only right end
+ GStr& trimR(char c=' ');
+ GStr& chomp(char c='\n') { return trimR(c); } //forwards to trimR(char)
+ GStr& chomp(const char* cstr); //like trimR, but given string is taken as a whole
+ GStr& trimL(const char* c=" \t\n\r"); //trim only left end
+ GStr& trimL(char c=' ');
+ GStr& padR(int len, char c=' '); //align it in len spaces to the right
+ GStr& padL(int len, char c=' '); //align it in len spaces to the left
+ GStr& padC(int len, char c=' '); //center it
+ size_t read(FILE* stream, const char* delimiter="\n", size_t bufsize=4096);
+ //read next token from stream, using the given string as
+ //a marker where the block should stop
+ const char* chars() const;
+ const char* text() const;
+ protected:
+ char* fTokenDelimiter;
+ int fLastTokenStart;
+ enTokenizeMode fTokenizeMode;
+ void* readbuf; //file read buffer for the read() function
+ size_t readbufsize; //last setting for the readbuf
+ static void invalid_args_error(const char* fname);
+ static void invalid_index_error(const char* fname);
+ struct Data {//structure holding actual
+ //string data and reference count information
+ Data() { ref_count=0; length=0; chars[0] = '\0'; }
+ unsigned int ref_count;
+ int length;
+ //declared with one element; NOTE(review): presumably new_data() allocates
+ //extra room past the struct for longer strings -- confirm in GStr.cpp
+ char chars[1];
+ };
+ static Data* new_data(int length); //alloc a specified length string's Data
+ static Data* new_data(const char* str); //alloc a copy of a specified string
+ void replace_data(int length);
+ void replace_data(Data* data);
+ void make_unique();
+ char* chrs(); // this is dangerous, length should not be affected
+ static Data null_data; //a null (empty) string Data is available here
+ Data* my_data; //pointer to a Data object holding actual string data
+inline int GStr::length() const {
+  // character count as recorded in the Data record
+  return (*my_data).length;
+ }
+inline const char *GStr::chars() const {
+  // read-only access to the character buffer of the Data record
+  const char* buf = my_data->chars;
+  return buf;
+ }
+inline char *GStr::chrs() {
+  // protected, writable view of the same buffer chars() exposes read-only
+  char* buf = my_data->chars;
+  return buf;
+ }
+inline const char *GStr::text() const {
+  // returns the same buffer as chars()
+  const char* buf = my_data->chars;
+  return buf;
+ }
+inline bool operator>=(const char *s1, const GStr& s2) {
+  // true when strcmp() does not order s1 before the contents of s2
+  const int order = strcmp(s1, s2.chars());
+  return order >= 0;
+ }
+inline bool operator!=(const char *s1, const GStr& s2) {
+  // true when strcmp() reports any difference between s1 and the contents of s2
+  const int order = strcmp(s1, s2.chars());
+  return order != 0;
+ }
+inline void Gswap(GStr& s1, GStr& s2) {
+  // exchanges only the my_data pointers of the two strings
+  GStr::Data* const held = s2.my_data;
+  s2.my_data = s1.my_data;
+  s1.my_data = held;
+ }
diff --git a/include/GVec.hh b/include/GVec.hh
new file mode 100644
index 0000000..25b095c
--- /dev/null
+++ b/include/GVec.hh
@@ -0,0 +1,907 @@
+Sortable collection of pointers to objects
+#ifndef _GVec_HH
+#define _GVec_HH
+#include "GBase.h"
+#define GVEC_INDEX_ERR "GVec error: invalid index: %d\n"
+// TEST_INDEX(x): reports GVEC_INDEX_ERR through GError() when x is outside
+// [0..fCount-1]; it must be used where a member named fCount is in scope.
+// The check is compiled out when any of the "no debug" macros below is defined.
+// The #else/#endif pair is required: without it the checking definition
+// unconditionally redefines the empty one and the #if is never closed.
+#if defined(NDEBUG) || defined(NODEBUG) || defined(_NDEBUG) || defined(NO_DEBUG)
+ #define TEST_INDEX(x)
+#else
+ // do/while(0) keeps the macro a single statement (safe before an else);
+ // the argument is parenthesized so expressions can be passed safely
+ #define TEST_INDEX(x) \
+ do { if ((x)<0 || (x)>=fCount) GError(GVEC_INDEX_ERR, (x)); } while (0)
+#endif
+#define GVEC_CAPACITY_ERR "GVec error: invalid capacity: %d\n"
+#define GVEC_COUNT_ERR "GVec error: invalid count: %d\n"
+#define FREEDATA (fFreeProc!=NULL)
+// Compile-time trait: VAL is 1 for the fundamental types listed below and 0
+// for everything else. GVec uses it to choose between raw memory handling
+// (GMALLOC/memcpy/memmove) and new[]/operator= element handling.
+template<class T> struct IsPrimitiveType {
+  enum { VAL = 0 };
+};
+template<> struct IsPrimitiveType<bool> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<void*> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<float> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<double> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<int> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<unsigned int> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<char> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<signed char> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<unsigned char> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<short> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<unsigned short> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<long> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<unsigned long> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<long long> { enum { VAL = 1 }; };
+template<> struct IsPrimitiveType<unsigned long long> { enum { VAL = 1 }; };
+// No separate specializations for int8_t..uint64_t: those names are typedefs
+// of the fundamental types above (int8_t is normally signed char), so
+// specializing them again is a redefinition error. They are already covered.
+template <class OBJ> int DefLTCompareProc(const pointer p1, const pointer p2) {
+  // Default three-way comparator built only from OBJ::operator<.
+  // Both arguments are treated as pointers to OBJ; returns -1, 1 or 0.
+  const OBJ& lhs = *((OBJ*) p1);
+  const OBJ& rhs = *((OBJ*) p2);
+  if (lhs < rhs) return -1;
+  if (rhs < lhs) return 1;
+  return 0;
+}
+//basic template for array of objects;
+//so it doesn't require comparison operators to be defined
+//Elements are stored by value in fArray; the item-taking methods copy the
+//given object into the array (OBJ::operator= is used by the implementation).
+template <class OBJ> class GVec {
+ protected:
+ OBJ* fArray; //element storage
+ int fCount; //number of elements in use
+ int fCapacity; //number of allocated element slots
+ void qSort(int L, int R, GCompareProc* cmpFunc);
+ public:
+ GVec(int init_capacity=2);
+ GVec(int init_count, const OBJ init_val);
+ GVec(GVec<OBJ>& array); //copy constructor
+ const GVec<OBJ>& operator=(GVec<OBJ>& array); //copy operator
+ virtual ~GVec();
+ void Insert(int idx, OBJ item) { Insert(idx, &item); }
+ void Insert(int idx, OBJ* item);
+ void idxInsert(int idx, OBJ& item) { Insert(idx, &item); }
+ void Grow();
+ void Grow(int idx, OBJ& item); //grow and add/insert item copy
+ void Reverse(); //WARNING: will break the sort order if SORTED!
+ int Add(OBJ* item); // simply append to the end of fArray, reallocating as needed
+ int Add(OBJ& item) { return Add(&item); }
+ int cAdd(OBJ item) { return Add(&item); } //all these will CREATE a new OBJ and COPY to it
+ // // using OBJ copy operator=
+ // -- stack/queue usage:
+ //int Push(OBJ& item) { return Add(&item); }
+ int Push(OBJ& item) { return Add(&item); }
+ int cPush(OBJ item) { return Add(&item); }
+ OBJ Pop();// Stack use; removes and returns a copy of the last item
+ OBJ Shift(); //Queue use: removes and returns a copy of the first item
+ void Add(GVec<OBJ>& list); //append copies of all items from another list
+ //bounds-checked access (the check is TEST_INDEX, which may be compiled out)
+ OBJ& Get(int idx) {
+ TEST_INDEX(idx);
+ return fArray[idx];
+ }
+ //unchecked access: i is not validated here
+ inline OBJ& operator[](int i) {
+ return fArray[i];
+ }
+ OBJ& Last() {
+ TEST_INDEX(fCount-1);
+ return fArray[fCount-1];
+ }
+ //unchecked: no test for an empty array is made here
+ OBJ& First() {
+ return fArray[0];
+ }
+ void Clear();
+ void Delete(int index);
+ void Replace(int idx, OBJ& item); //Put, use operator= to copy
+ void Exchange(int idx1, int idx2);
+ void Swap(int idx1, int idx2) { Exchange(idx1, idx2); }
+ int Capacity() { return fCapacity; }
+ //this will reject identical items in sorted lists only!
+ //NOTE(review): the comment above does not describe setCapacity(); it looks
+ //like a leftover from another declaration -- confirm and remove
+ void setCapacity(int NewCapacity);
+ int Count() { return fCount; }
+ void setCount(int NewCount); // will trim or expand the array as needed
+ void setCount(int NewCount, OBJ* v); //same as setCount() but new objects are set to v
+ void setCount(int NewCount, OBJ v);
+ void Resize(int NewCount) { setCount(NewCount); }
+ //void Resize(int NewCount, OBJ* v) { setCount(NewCount, v); }
+ void Resize(int NewCount, OBJ v) { setCount(NewCount, &v); }
+ //void Move(int curidx, int newidx);
+ bool isEmpty() { return fCount==0; }
+ bool notEmpty() { return fCount>0; }
+ void Sort(GCompareProc* cmpFunc);
+ void Sort();
+//---- template for dynamic array of object pointers
+//---- it's faster than GVec<OBJ*> and has item deallocation awareness
+//---- When fFreeProc is set, Clear() and Delete() call it on the stored
+//---- pointers; with fFreeProc==NULL the pointed-to objects are left alone.
+template <class OBJ> class GPVec {
+ protected:
+ OBJ** fList; //pointer to an array of pointers to objects
+ int fCount; //total number of entries in list
+ int fCapacity; //current allocated size
+ GFreeProc* fFreeProc; //useful for deleting objects
+ //---
+ void Expand();
+ void Grow();
+ void Grow(int idx, OBJ* newitem);
+ void qSort(int L, int R, GCompareProc* cmpFunc);
+ public:
+ //default deallocator: deletes item as an OBJ*
+ static void DefaultFreeProc(pointer item) {
+ delete (OBJ*)item;
+ }
+ virtual ~GPVec();
+ GPVec(int init_capacity=2, bool free_elements=true); //also the default constructor
+ GPVec(bool free_elements);
+ GPVec(GPVec<OBJ>& list); //copy constructor?
+ GPVec(GPVec<OBJ>* list); //kind of a copy constructor
+ const GPVec<OBJ>& operator=(GPVec<OBJ>& list);
+ OBJ* Get(int i);
+ OBJ* operator[](int i) { return this->Get(i); }
+ void Reverse(); //reverse pointer array; WARNING: will break the sort order if sorted!
+ void freeItem(int idx); //calls fFreeProc (or DefaultFreeProc) on fList[idx] and sets NULL there, doesn't pack!
+ //it will free even if fFreeProc is NULL!
+ void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; }
+ //true installs DefaultFreeProc, false disables item deallocation
+ void setFreeItem(bool doFree) {
+ if (doFree) fFreeProc=DefaultFreeProc;
+ else fFreeProc=NULL;
+ }
+ // -- stack usage:
+ int Push(OBJ* item) { return Add(item); }
+ OBJ* Pop();// Stack use; removes and returns last item,but does NOT FREE it
+ OBJ* Shift(); //Queue use: removes and returns first item, but does NOT FREE it
+ void deallocate_item(OBJ*& item); //forcefully call fFreeProc or delete on item
+ void Clear();
+ void Exchange(int idx1, int idx2);
+ void Swap(int idx1, int idx2) { Exchange(idx1, idx2); }
+ OBJ* First() { return (fCount>0)?fList[0]:NULL; } //NULL when empty
+ OBJ* Last() { return (fCount>0)?fList[fCount-1]:NULL;} //NULL when empty
+ bool isEmpty() { return fCount==0; }
+ bool notEmpty() { return fCount>0; }
+ int Capacity() { return fCapacity; }
+ int Count() { return fCount; }
+ void setCapacity(int NewCapacity);
+ void setCount(int NewCount); //the same as setCapacity() but the new item range is filled with NULLs
+ int Add(OBJ* item); //simply append the pointer copy
+ void Add(GPVec<OBJ>& list); //add all pointers from another list
+ void Insert(int idx, OBJ* item);
+ void Move(int curidx, int newidx);
+ void Put(int idx, OBJ* item);
+ void Pack();
+ void Delete(int index); //also frees the item if fFreeProc!=NULL, and shifts the successor items
+ void Forget(int idx); //simply places a NULL at fList[idx], nothing else
+ int RemovePtr(pointer item); //always use linear search to find the pointer! calls Delete() if found
+ int IndexOf(pointer item); //a linear search for pointer address!
+ void Sort(GCompareProc* cmpFunc);
+ void Sort();
+ };
+//-------------------- TEMPLATE IMPLEMENTATION-------------------------------
+template <class OBJ> GVec<OBJ>::GVec(int init_capacity)
+    : fArray(NULL), fCount(0), fCapacity(0) {
+  // start out empty, then let setCapacity() allocate the requested room
+  setCapacity(init_capacity);
+}
+template <class OBJ> GVec<OBJ>::GVec(int init_count, const OBJ init_val)
+    : fArray(NULL), fCount(0), fCapacity(0) {
+  // allocate exactly init_count slots and fill each with a copy of init_val
+  setCapacity(init_count);
+  fCount = init_count;
+  int k=0;
+  while (k<fCount) {
+    fArray[k]=init_val;
+    k++;
+  }
+}
+template <class OBJ> GVec<OBJ>::GVec(GVec<OBJ>& array) { //copy constructor
+  // Deep copy: same capacity and count as the source, elements duplicated.
+  fArray=NULL;
+  fCapacity=array.fCapacity;
+  fCount=array.fCount;
+  if (fCapacity<=0) return; //nothing to allocate
+  if (IsPrimitiveType<OBJ>::VAL) {
+    // primitive element type: raw buffer plus a block copy
+    GMALLOC(fArray, fCapacity*sizeof(OBJ));
+    memcpy(fArray, array.fArray, fCount*sizeof(OBJ));
+    return;
+  }
+  // object element type: default-construct, then assign with OBJ operator=
+  fArray=new OBJ[fCapacity];
+  for (int k=0;k<fCount;k++) fArray[k]=array[k];
+ }
+template <class OBJ> const GVec<OBJ>& GVec<OBJ>::operator=(GVec<OBJ>& array) {
+  // Replaces the current content with a deep copy of array; self-assignment is a no-op.
+  if (this!=&array) {
+    Clear();
+    fCount=array.fCount;
+    fCapacity=array.fCapacity;
+    if (fCapacity>0) {
+      if (!IsPrimitiveType<OBJ>::VAL) {
+        // object element type: element-wise OBJ operator=
+        fArray=new OBJ[fCapacity];
+        for (int k=0;k<fCount;k++) fArray[k]=array.fArray[k];
+      }
+      else {
+        // primitive element type: raw buffer plus a block copy
+        GMALLOC(fArray, fCapacity*sizeof(OBJ));
+        memcpy(fArray, array.fArray, fCount*sizeof(OBJ));
+      }
+    }
+  }
+  return *this;
+}
+template <class OBJ> GVec<OBJ>::~GVec() {
+  // all storage is released by Clear()
+  Clear();
+}
+template <class OBJ> void GVec<OBJ>::setCapacity(int NewCapacity) {
+  // Reallocates the storage to hold exactly NewCapacity elements.
+  // NewCapacity must not be below fCount nor above MAXLISTSIZE;
+  // to shrink the number of elements use Resize() or setCount().
+  if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE)
+    GError(GVEC_CAPACITY_ERR, NewCapacity);
+  if (NewCapacity==fCapacity) return; //nothing to do
+  const bool rawStorage = IsPrimitiveType<OBJ>::VAL;
+  if (NewCapacity==0) {
+    // release everything
+    if (rawStorage) {
+      GFREE(fArray);
+    }
+    else {
+      delete[] fArray;
+      fArray=NULL;
+    }
+  }
+  else if (rawStorage) {
+    GREALLOC(fArray, NewCapacity*sizeof(OBJ));
+  }
+  else {
+    // object element type: new block, copy with OBJ operator=, drop the old one
+    OBJ* previous=fArray;
+    fArray=new OBJ[NewCapacity];
+    for (int k=0;k<this->fCount;k++) fArray[k]=previous[k];
+    delete[] previous; //harmless when previous is NULL
+  }
+  fCapacity=NewCapacity;
+}
+template <class OBJ> void GVec<OBJ>::Clear() {
+  // Empties the vector and releases its storage (count and capacity become 0).
+  fCount=0;
+  if (!IsPrimitiveType<OBJ>::VAL) {
+    delete[] fArray;
+    fArray=NULL;
+  }
+  else {
+    GFREE(fArray);
+  }
+  fCapacity=0;
+}
+template <class OBJ> void GVec<OBJ>::Grow() {
+  // Growth policy: one slot at a time up to capacity 8, then by a quarter.
+  const int step = (fCapacity<=8) ? 1 : (fCapacity>>2);
+  setCapacity(fCapacity + step);
+}
+template <class OBJ> void GVec<OBJ>::Reverse() {
+  // In-place reversal: swap elements pairwise from both ends toward the middle.
+  OBJ held;
+  for (int lo=0, hi=fCount-1; lo<hi; lo++, hi--) {
+    held=fArray[lo];
+    fArray[lo]=fArray[hi];
+    fArray[hi]=held;
+  }
+}
+// Enlarges the storage (same growth step as Grow()) and places a copy of item
+// at position idx in the same pass; fCount is incremented on exit.
+// idx==fCount appends; any other idx shifts the elements from idx upward by one.
+// NOTE(review): the upper bound test here is ">= MAXLISTSIZE" while
+// setCapacity() uses "> MAXLISTSIZE" -- confirm which one is intended.
+template <class OBJ> void GVec<OBJ>::Grow(int idx, OBJ& item) {
+ int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
+ int NewCapacity=fCapacity+delta;
+ if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE)
+ GError(GVEC_CAPACITY_ERR, NewCapacity);
+ //error: capacity not within range
+ //if (NewCapacity!=fCapacity) {
+ if (idx==fCount) { //append item
+ //GREALLOC(fArray, NewCapacity*sizeof(OBJ));
+ //setCapacity() also updates fCapacity
+ setCapacity(NewCapacity);
+ fArray[idx]=item;
+ }
+ else { //insert item at idx
+ OBJ* newList;
+ if (IsPrimitiveType<OBJ>::VAL) {
+ GMALLOC(newList, NewCapacity*sizeof(OBJ));
+ //copy data before idx
+ memcpy(&newList[0],&fArray[0], idx*sizeof(OBJ));
+ newList[idx]=item;
+ //copy data after idx
+ memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
+ //..shouldn't do this:
+ //(zero-fills the unused slots past the copied elements)
+ memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ));
+ //data copied:
+ GFREE(fArray);
+ } else {
+ newList=new OBJ[NewCapacity]; //]()
+ // operator= required!
+ for (int i=0;i<idx;i++) {
+ newList[i]=fArray[i];
+ }
+ newList[idx]=item;
+ //copy data after idx
+ //memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
+ for (int i=idx+1;i<=fCount;i++) {
+ newList[i]=fArray[i-1];
+ }
+ delete[] fArray;
+ }
+ fArray=newList;
+ fCapacity=NewCapacity;
+ }
+ fCount++;
+template <class OBJ> int GVec<OBJ>::Add(OBJ* item) {
+  // Appends a copy of *item (OBJ::operator= must copy OBJ properly) and
+  // returns its index; a NULL item is rejected with -1.
+  // Grow() may reallocate fArray before *item is read.
+  if (item==NULL) return -1;
+  if (fCount==fCapacity) Grow();
+  const int slot=fCount;
+  fArray[slot] = *item;
+  fCount=slot+1;
+  return slot;
+}
+template <class OBJ> void GVec<OBJ>::Add(GVec<OBJ>& list) {
+  // Appends copies of every element of list; capacity is raised by list.fCount first.
+  if (list.Count()==0) return;
+  setCapacity(fCapacity+list.fCount);
+  if (!IsPrimitiveType<OBJ>::VAL) {
+    // object element type: element-wise OBJ operator=
+    for (int k=0;k<list.fCount;k++)
+      fArray[fCount+k]=list.fArray[k];
+  }
+  else {
+    memcpy( &fArray[fCount], list.fArray, list.fCount*sizeof(OBJ));
+  }
+  fCount+=list.fCount;
+}
+//Stack usage:
+template <class OBJ> OBJ GVec<OBJ>::Pop() {
+  // Stack use: drops the last element from the count and returns a copy of it.
+  // Calling it on an empty vector is reported through GError().
+  if (fCount<=0) GError("Error: invalid GVec::Pop() operation!\n");
+  const int last=fCount-1;
+  fCount=last;
+  return fArray[last]; //copy constructor called
+}
+//Queue usage:
+template <class OBJ> OBJ GVec<OBJ>::Shift() {
+  // Queue use: removes the first element and returns a copy of it; the
+  // remaining elements move down by one position.
+  // Calling it on an empty vector is reported through GError().
+  if (fCount<=0) GError("Error: invalid GVec::Shift() operation!\n");
+  fCount--;
+  OBJ o(fArray[0]); //copy constructor
+  if (fCount>0) {
+    if (IsPrimitiveType<OBJ>::VAL) {
+      memmove(&fArray[0], &fArray[1], (fCount)*sizeof(OBJ));
+    }
+    else {
+      // Object elements must be shifted with operator=, as Delete() does:
+      // a raw memmove bypasses operator= and leaves a bitwise duplicate of
+      // the last element in the vacated slot, so two slots share one state.
+      for (int i=0;i<fCount;i++) fArray[i]=fArray[i+1];
+    }
+  }
+  return o;
+}
+template <class OBJ> void GVec<OBJ>::Insert(int idx, OBJ* item) {
+  // Inserts a copy of *item so that it ends up at position idx.
+  // Valid positions are [0..fCount]; the element previously at idx and all
+  // elements after it move up by one.
+  if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx);
+  if (fCount==fCapacity) {
+    // full: Grow() reallocates, shifts and inserts in one step
+    Grow(idx, *item);
+    return;
+  }
+  // spare room: open a gap at idx (nothing to shift when idx==fCount)
+  if (!IsPrimitiveType<OBJ>::VAL) {
+    for (int k=fCount; k>idx; k--) fArray[k]=fArray[k-1];
+  }
+  else if (idx<fCount) {
+    memmove(&fArray[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
+  }
+  fArray[idx]=*item;
+  fCount++;
+}
+/* Disabled implementation of GVec::Move (its declaration in the class is commented out as well):
+template <class OBJ> void GVec<OBJ>::Move(int curidx, int newidx) { //swap
+ if (curidx!=newidx || newidx>=fCount)
+ GError(GVEC_INDEX_ERR, newidx);
+ OBJ tmp=fArray[curidx]; //copy constructor here
+ fArray[curidx]=fArray[newidx];
+ fArray[newidx]=tmp;
+}
+*/
+template <class OBJ> void GVec<OBJ>::Replace(int idx, OBJ& item) {
+  // Overwrites the element at idx with a copy of item (OBJ operator=).
+  TEST_INDEX(idx);
+  OBJ& slot=fArray[idx];
+  slot=item;
+}
+template <class OBJ> void GVec<OBJ>::Exchange(int idx1, int idx2) {
+  // Swaps the elements stored at the two given positions.
+  TEST_INDEX(idx1);
+  TEST_INDEX(idx2);
+  OBJ held=fArray[idx1];
+  fArray[idx1]=fArray[idx2];
+  fArray[idx2]=held;
+}
+template <class OBJ> void GVec<OBJ>::Delete(int index) {
+  // Removes the element at index; the elements after it shift down by one.
+  TEST_INDEX(index);
+  fCount--;
+  if (!IsPrimitiveType<OBJ>::VAL) {
+    // object element type: shift down with OBJ operator=
+    for (int k=index;k<fCount;k++) fArray[k]=fArray[k+1];
+  }
+  else if (index<fCount) {
+    // primitive element type: one block move for the higher elements, if any
+    memmove(&fArray[index], &fArray[index+1], (fCount-index)*sizeof(OBJ));
+  }
+}
+template <class OBJ> void GVec<OBJ>::setCount(int NewCount) {
+  // Sets the element count, growing the storage with Grow() until it fits.
+  // Slots exposed by a larger count are not assigned here; per the original
+  // note they hold whatever the default object constructor produced.
+  if (NewCount<0 || NewCount > MAXLISTSIZE)
+    GError(GVEC_COUNT_ERR, NewCount);
+  while (fCapacity < NewCount) Grow();
+  fCount = NewCount;
+}
+template <class OBJ> void GVec<OBJ>::setCount(int NewCount, OBJ* v) {
+  // Like setCount(NewCount), but every newly exposed slot receives a copy of *v.
+  if (NewCount<0 || NewCount > MAXLISTSIZE)
+    GError(GVEC_COUNT_ERR, NewCount);
+  while (fCapacity < NewCount) Grow();
+  // the loop body never runs when the vector shrinks or keeps its size
+  for (int k=fCount;k<NewCount;k++) fArray[k]=*v;
+  fCount = NewCount;
+}
+template <class OBJ> void GVec<OBJ>::setCount(int NewCount, OBJ v) {
+  // Like setCount(NewCount), but every newly exposed slot receives a copy of v.
+  if (NewCount<0 || NewCount > MAXLISTSIZE)
+    GError(GVEC_COUNT_ERR, NewCount);
+  while (fCapacity < NewCount) Grow();
+  // the loop body never runs when the vector shrinks or keeps its size
+  for (int k=fCount;k<NewCount;k++) fArray[k]=v;
+  fCount = NewCount;
+}
+// Quicksort of fArray[l..r] (both bounds inclusive) ordered by cmpFunc, which
+// is called with the addresses of two elements and whose result is tested
+// for <0 and >0. The pivot p is a COPY of the middle element, so it stays
+// valid while elements are swapped around it.
+// The left partition is handled by recursion, the right one by the outer loop.
+template <class OBJ> void GVec<OBJ>::qSort(int l, int r, GCompareProc* cmpFunc) {
+ int i, j;
+ OBJ p,t;
+ do {
+ i = l; j = r;
+ p = this->fArray[(l + r) >> 1];
+ do {
+ //skip elements already on the correct side of the pivot
+ while (cmpFunc(&(this->fArray[i]), &p) < 0) i++;
+ while (cmpFunc(&(this->fArray[j]), &p) > 0) j--;
+ if (i <= j) {
+ t = this->fArray[i];
+ this->fArray[i] = this->fArray[j];
+ this->fArray[j] = t;
+ i++; j--;
+ }
+ } while (i <= j);
+ if (l < j) qSort(l, j, cmpFunc);
+ //continue with the right partition [i..r]
+ l = i;
+ } while (i < r);
+template <class OBJ> void GVec<OBJ>::Sort(GCompareProc* cmpFunc) {
+  // Sorts all elements with the given comparison function.
+  // A NULL function only produces a warning; an empty vector is left alone.
+  if (cmpFunc==NULL) {
+    GMessage("Warning: NULL compare function given, useless Sort() call.\n");
+    return;
+  }
+  if (this->fArray==NULL || this->fCount<=0) return;
+  qSort(0, this->fCount-1, cmpFunc);
+}
+template <class OBJ> void GVec<OBJ>::Sort() {
+  // Sorts with the default comparator, which only needs OBJ::operator<.
+  GCompareProc* const byLess = DefLTCompareProc<OBJ>;
+  Sort(byLess);
+}
+//*=> GPVec implementation
+template <class OBJ> GPVec<OBJ>::GPVec(GPVec& list) { //copy constructor
+  // Shallow copy: the object POINTERS are copied, the objects themselves are
+  // shared with list. fFreeProc is copied too.
+  // NOTE(review): with a non-NULL fFreeProc both lists will call it on the
+  // same objects in Clear() -- confirm callers disable freeing on one of them.
+  fCount=list.fCount;
+  fCapacity=list.fCapacity;
+  fList=NULL;
+  fFreeProc=list.fFreeProc;
+  if (fCapacity>0) {
+    GMALLOC(fList, fCapacity*sizeof(OBJ*));
+    // only copy when there is something to copy: the source may have no
+    // storage at all, and memcpy() must not be handed NULL pointers
+    if (fCount>0) memcpy(fList, list.fList, fCount*sizeof(OBJ*));
+  }
+  //for (int i=0;i<list.Count();i++) Add(list[i]);
+}
+template <class OBJ> GPVec<OBJ>::GPVec(GPVec* plist) { //another copy constructor
+  // Shallow copy from a pointer to a list: the object POINTERS are copied,
+  // the objects themselves are shared with *plist. fFreeProc is copied too.
+  // plist is dereferenced unconditionally and must not be NULL.
+  fCount=plist->fCount;
+  fCapacity=plist->fCapacity;
+  fList=NULL;
+  fFreeProc=plist->fFreeProc;
+  if (fCapacity>0) {
+    GMALLOC(fList, fCapacity*sizeof(OBJ*));
+    // only copy when there is something to copy: the source may have no
+    // storage at all, and memcpy() must not be handed NULL pointers
+    if (fCount>0) memcpy(fList, plist->fList, fCount*sizeof(OBJ*));
+  }
+  //for (int i=0;i<list->fCount;i++) Add(plist->Get(i));
+}
+template <class OBJ> const GPVec<OBJ>& GPVec<OBJ>::operator=(GPVec& list) {
+  // Replaces the current content with a shallow copy of list.
+  // Attention: the object *POINTERS* are copied,
+  // but the actual object content is NOT duplicated
+  if (&list!=this) {
+    Clear(); //resets fCount and fCapacity to 0
+    fFreeProc=list.fFreeProc;
+    // The capacity must be taken from the source BEFORE allocating: it was
+    // left at 0 here, so a zero-sized buffer was allocated and fCount
+    // pointers were then copied into it.
+    fCapacity=list.fCapacity;
+    fCount=list.fCount;
+    if (fCapacity>0) {
+      GMALLOC(fList, fCapacity*sizeof(OBJ*));
+      if (fCount>0) memcpy(fList, list.fList, fCount*sizeof(OBJ*));
+    }
+    //for (int i=0;i<list.Count();i++) Add(list[i]);
+  }
+  return *this;
+}
+template <class OBJ> void GPVec<OBJ>::Add(GPVec<OBJ>& list) {
+  // Appends all pointers of list; only the pointers are copied, so the
+  // objects end up shared between both lists.
+  const int extra=list.fCount;
+  if (list.Count()==0) return;
+  setCapacity(fCapacity+extra);
+  memcpy( & (fList[fCount]), list.fList, extra*sizeof(OBJ*));
+  fCount+=extra;
+}
+template <class OBJ> void GPVec<OBJ>::Reverse() {
+  // In-place reversal of the pointer array, swapping from both ends inward.
+  for (int lo=0, hi=fCount-1; lo<hi; lo++, hi--) {
+    OBJ* held=fList[lo];
+    fList[lo]=fList[hi];
+    fList[hi]=held;
+  }
+}
+template <class OBJ> GPVec<OBJ>::GPVec(int init_capacity, bool free_elements)
+    : fList(NULL), fCount(0), fCapacity(0),
+      fFreeProc((free_elements) ? DefaultFreeProc : NULL) {
+  // free_elements selects DefaultFreeProc as the deallocator (NULL otherwise);
+  // storage is only allocated for a positive initial capacity
+  if (init_capacity>0) setCapacity(init_capacity);
+}
+template <class OBJ> GPVec<OBJ>::GPVec(bool free_elements)
+    : fList(NULL), fCount(0), fCapacity(0),
+      fFreeProc((free_elements) ? DefaultFreeProc : NULL) {
+  // starts with no storage at all; free_elements selects DefaultFreeProc
+}
+template <class OBJ> GPVec<OBJ>::~GPVec() {
+  // Clear() frees the items when fFreeProc is set, then the pointer array
+  Clear();
+}
+template <class OBJ> void GPVec<OBJ>::setCapacity(int NewCapacity) {
+  // Resizes the pointer array to NewCapacity slots; the value must lie
+  // within [fCount..MAXLISTSIZE], otherwise GError() is called.
+  if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE)
+    GError(GVEC_CAPACITY_ERR, NewCapacity);
+  if (NewCapacity==fCapacity) return; //nothing to change
+  if (NewCapacity!=0) {
+    GREALLOC(fList, NewCapacity*sizeof(OBJ*));
+  }
+  else {
+    GFREE(fList);
+  }
+  fCapacity=NewCapacity;
+}
+template <class OBJ> void GPVec<OBJ>::deallocate_item(OBJ* &item) {
+  // Frees item through fFreeProc when one is set, with plain delete otherwise,
+  // and resets the caller's pointer to NULL. A NULL item is ignored.
+  if (item==NULL) return;
+  if (FREEDATA) (*fFreeProc)(item);
+  else delete item;
+  item=NULL;
+}
+template <class OBJ> void GPVec<OBJ>::Clear() {
+  // Calls fFreeProc on every entry when one is set, then releases the
+  // pointer array and resets count and capacity to 0.
+  if (FREEDATA) {
+    int k=0;
+    while (k<fCount) {
+      (*fFreeProc)(fList[k]);
+      k++;
+    }
+  }
+  GFREE(fList);
+  fCapacity=0;
+  fCount=0;
+}
+template <class OBJ> void GPVec<OBJ>::Exchange(int idx1, int idx2) {
+  // Swaps the pointers stored at the two given positions.
+  TEST_INDEX(idx1);
+  TEST_INDEX(idx2);
+  OBJ* held=fList[idx2];
+  fList[idx2]=fList[idx1];
+  fList[idx1]=held;
+}
+template <class OBJ> void GPVec<OBJ>::Expand() {
+  // Grows the pointer array only when it is completely full.
+  if (fCount!=fCapacity) return;
+  Grow();
+}
+template <class OBJ> OBJ* GPVec<OBJ>::Get(int idx) {
+  // Returns the pointer stored at idx (index checked by TEST_INDEX).
+  TEST_INDEX(idx);
+  OBJ* const entry=fList[idx];
+  return entry;
+}
+template <class OBJ> void GPVec<OBJ>::Grow() {
+  // Growth policy: one slot at a time up to capacity 8, then by a quarter.
+  // (The original author left a slower-growing policy for large capacities
+  // disabled: fCapacity>>4 above 64 slots, a fixed 0x100 above 0xFFF.)
+  const int step = (fCapacity<=8) ? 1 : (fCapacity>>2);
+  setCapacity(fCapacity + step);
+}
+// Enlarges the pointer array (same growth step as Grow()) and stores newitem
+// at position idx in the same pass; fCount and fCapacity are updated on exit.
+// idx==fCount appends; any other idx shifts the pointers from idx upward by one.
+template <class OBJ> void GPVec<OBJ>::Grow(int idx, OBJ* newitem) {
+ /*
+ int delta;
+ if (fCapacity > 64 ) {
+ delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4);
+ }
+ else {
+ delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
+ }
+ */
+ int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
+ int NewCapacity=fCapacity+delta;
+ if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE)
+ GError(GVEC_CAPACITY_ERR, NewCapacity);
+ //error: capacity not within range
+ //if (NewCapacity!=fCapacity) {
+ /*if (NewCapacity==0) {
+ GFREE(fList);
+ }
+ else {//add the new item
+ */
+ if (idx==fCount) {
+ //append: resize in place and store at the end
+ GREALLOC(fList, NewCapacity*sizeof(OBJ*));
+ fList[idx]=newitem;
+ }
+ else {
+ //insert: build the new array around the gap at idx
+ OBJ** newList;
+ GMALLOC(newList, NewCapacity*sizeof(OBJ*));
+ //copy data before idx
+ memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*));
+ newList[idx]=newitem;
+ //copy data after idx
+ memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*));
+ //zero-fill the unused slots past the copied pointers
+ memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*));
+ //data copied:
+ GFREE(fList);
+ fList=newList;
+ }
+ fCount++;
+ fCapacity=NewCapacity;
+template <class OBJ> int GPVec<OBJ>::IndexOf(pointer item) {
+  // Linear search by pointer ADDRESS (the objects are not compared).
+  // Returns the position of the first matching entry, or -1 when absent.
+  // (The unused local "result" of the previous version was removed.)
+  for (int i=0;i<fCount;i++) {
+    if (item==(pointer)fList[i]) return i;
+  }
+  return -1;
+ }
+template <class OBJ> int GPVec<OBJ>::Add(OBJ* item) {
+  // Appends the pointer (no object copy) and returns its index;
+  // a NULL item is rejected with -1.
+  if (item==NULL) return -1;
+  if (fCount==fCapacity) this->Grow();
+  const int slot=fCount;
+  fList[slot]=item;
+  fCount=slot+1;
+  return slot;
+}
+template <class OBJ> void GPVec<OBJ>::Insert(int idx, OBJ* item) {
+  // Inserts the pointer at position idx; idx may be [0..fCount], so
+  // idx==fCount appends. Later entries move up by one.
+  if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx);
+  if (fCount!=fCapacity) {
+    // spare room: open a gap at idx (nothing to shift when idx==fCount)
+    const int tailLen=fCount-idx;
+    if (tailLen>0)
+      memmove(&fList[idx+1], &fList[idx], tailLen*sizeof(OBJ*));
+    fList[idx]=item;
+    fCount++;
+  }
+  else {
+    // full: Grow() reallocates and inserts in one step
+    Grow(idx, item);
+  }
+}
+template <class OBJ> void GPVec<OBJ>::Move(int curidx, int newidx) {
+  // Moves the pointer stored at curidx so that it ends up at position newidx;
+  // the entries in between shift by one. The item is neither freed nor copied.
+  // Both indexes must address existing entries, i.e. [0..fCount-1].
+  //
+  // The previous guard (curidx!=newidx || newidx>=fCount) reported an error
+  // for every real move, so only the no-op case could get through.
+  //BE_UNSORTED; //cannot do that in a sorted list!
+  if (curidx<0 || curidx>=fCount) GError(GVEC_INDEX_ERR, curidx);
+  if (newidx<0 || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx);
+  if (curidx==newidx) return; //already in place
+  OBJ* p=Get(curidx);
+  //take the entry out without freeing it (this is instead of Delete()):
+  fCount--;
+  if (curidx<fCount)
+    memmove(&fList[curidx], &fList[curidx+1], (fCount-curidx)*sizeof(OBJ*));
+  //there is now one spare slot, so Insert() cannot reallocate
+  Insert(newidx, p);
+}
+template <class OBJ> void GPVec<OBJ>::Put(int idx, OBJ* item) {
+  // Overwrites the pointer at idx.
+  // WARNING: the pointer that was stored there is never freed here!
+  TEST_INDEX(idx);
+  OBJ*& slot=fList[idx];
+  slot=item;
+}
+template <class OBJ> void GPVec<OBJ>::Forget(int idx) {
+  // Drops the list's reference by storing NULL at idx; nothing is freed,
+  // so the caller has to release the object somewhere else.
+  TEST_INDEX(idx);
+  OBJ*& slot=fList[idx];
+  slot=NULL;
+}
+template <class OBJ> void GPVec<OBJ>::freeItem(int idx) {
+  // Frees the object at idx and leaves NULL in its slot (the list is not packed).
+  // Unlike Delete(), this frees even when fFreeProc is NULL, by falling back
+  // to DefaultFreeProc.
+  TEST_INDEX(idx);
+  GFreeProc* disposer=fFreeProc;
+  if (disposer==NULL) disposer=DefaultFreeProc;
+  (*disposer)(fList[idx]);
+  fList[idx]=NULL;
+}
+template <class OBJ> void GPVec<OBJ>::Delete(int index) {
+  // Removes the entry at index: the object is freed only when fFreeProc is
+  // set and the pointer is non-NULL; later entries shift down by one.
+  TEST_INDEX(index);
+  OBJ* victim=fList[index];
+  if (fFreeProc!=NULL && victim!=NULL) (*fFreeProc)(victim);
+  fList[index]=NULL;
+  fCount--;
+  const int tailLen=fCount-index;
+  if (tailLen>0) //move higher elements if any
+    memmove(&fList[index], &fList[index+1], tailLen*sizeof(OBJ*));
+}
+//Stack usage:
+template <class OBJ> OBJ* GPVec<OBJ>::Pop() {
+  // Stack use: detaches and returns the last pointer without freeing it;
+  // returns NULL when the list is empty.
+  if (fCount<=0) return NULL;
+  const int last=fCount-1;
+  OBJ* top=fList[last];
+  fList[last]=NULL;
+  fCount=last;
+  return top;
+}
+//Queue usage:
+template <class OBJ> OBJ* GPVec<OBJ>::Shift() {
+  // Queue use: detaches and returns the first pointer without freeing it;
+  // the remaining entries move down by one. Returns NULL when empty.
+  if (fCount<=0) return NULL;
+  OBJ* head=fList[0];
+  fCount--;
+  if (fCount>0)
+    memmove(&fList[0], &fList[1], (fCount)*sizeof(OBJ*));
+  fList[fCount]=NULL; //clear the vacated last slot
+  return head;
+}
+//linear search for the pointer address
+template <class OBJ> int GPVec<OBJ>::RemovePtr(pointer item) {
+  // Linear search for the pointer address; the first match is removed with
+  // Delete() and its former index returned. Returns -1 for NULL or when absent.
+  if (item==NULL) return -1;
+  int k=0;
+  while (k<fCount && (pointer)fList[k]!=item) k++;
+  if (k>=fCount) return -1; //not found
+  Delete(k);
+  return k;
+}
+template <class OBJ> void GPVec<OBJ>::Pack() {
+  // Removes all NULL entries, keeping the remaining pointers in order.
+  // Done as a single compaction pass: the previous version called Delete()
+  // for each NULL entry, i.e. one memmove of the tail per removed entry.
+  // Nothing is freed (only NULL entries are dropped); capacity is unchanged.
+  int kept=0;
+  for (int i=0; i<fCount; i++) {
+    if (fList[i]!=NULL) fList[kept++]=fList[i];
+  }
+  fCount=kept;
+}
+template <class OBJ> void GPVec<OBJ>::setCount(int NewCount) {
+  // Sets the entry count, enlarging the capacity when needed; newly exposed
+  // slots are filled with NULL pointers. Entries cut off by a smaller count
+  // are not freed here.
+  if (NewCount<0 || NewCount > MAXLISTSIZE)
+    GError(GVEC_COUNT_ERR, NewCount);
+  if (fCapacity < NewCount) setCapacity(NewCount);
+  const int added = NewCount - fCount;
+  if (added > 0) //pad with NULL pointers
+    memset(& fList[fCount], 0, added * sizeof(OBJ*));
+  fCount = NewCount;
+}
+// Quicksort of the pointer array fList[L..R] (both bounds inclusive).
+// cmpFunc is called with two stored pointers and its result is tested for
+// <0 and >0. The pivot P is the pointer found at the middle position.
+// The left partition is handled by recursion, the right one by the outer loop.
+template <class OBJ> void GPVec<OBJ>::qSort(int L, int R, GCompareProc* cmpFunc) {
+ int I, J;
+ OBJ* P;
+ OBJ* T;
+ do {
+ I = L;
+ J = R;
+ P = this->fList[(L + R) >> 1];
+ do {
+ //skip entries already on the correct side of the pivot
+ while (cmpFunc(this->fList[I], P) < 0) I++;
+ while (cmpFunc(this->fList[J], P) > 0) J--;
+ if (I <= J) {
+ T = this->fList[I];
+ this->fList[I] = this->fList[J];
+ this->fList[J] = T;
+ I++;
+ J--;
+ }
+ }
+ while (I <= J);
+ if (L < J) qSort(L, J, cmpFunc);
+ //continue with the right partition [I..R]
+ L = I;
+ }
+ while (I < R);
+template <class OBJ> void GPVec<OBJ>::Sort(GCompareProc* cmpFunc) {
+  // Sorts the pointer array with the given comparison function.
+  // A NULL function only produces a warning; an empty list is left alone.
+  if (cmpFunc==NULL) {
+    GMessage("Warning: NULL compare function given, useless Sort() call.\n");
+    return;
+  }
+  if (this->fList==NULL || this->fCount<=0) return;
+  qSort(0, this->fCount-1, cmpFunc);
+}
+template <class OBJ> void GPVec<OBJ>::Sort() {
+  // Sorts with the default comparator, which only needs OBJ::operator<.
+  GCompareProc* const byLess = DefLTCompareProc<OBJ>;
+  Sort(byLess);
+}
diff --git a/include/codons.h b/include/codons.h
new file mode 100644
index 0000000..1925e9f
--- /dev/null
+++ b/include/codons.h
@@ -0,0 +1,54 @@
+#ifndef CODONS_H
+#define CODONS_H
+#include "GBase.h"
+#include <ctype.h>
+unsigned short packCodon(char n1, char n2, char n3);
+//assumes n1,n2,n3 are UPPERCASE!
+//A nucleotide triplet, always stored in uppercase.
+struct Codon {
+  char nuc[3];
+  //uppercases one base; toupper() is only defined for unsigned char values (or EOF),
+  //so the cast keeps a negative plain char from triggering undefined behavior
+  static char upcase(char c) { return (char)toupper((unsigned char)c); }
+  //builds a codon from the first three characters of str, or "NNN" when str is NULL
+  //(no length check is made: str must hold at least 3 characters)
+  Codon(char* str=NULL) {
+    if (str==NULL) {
+      nuc[0]='N';
+      nuc[1]='N';
+      nuc[2]='N';
+    }
+    else {
+      nuc[0]=upcase(str[0]);
+      nuc[1]=upcase(str[1]);
+      nuc[2]=upcase(str[2]);
+    }
+  }
+  //builds a codon from three individual bases
+  Codon(char s1, char s2, char s3) {
+    nuc[0]=upcase(s1);
+    nuc[1]=upcase(s2);
+    nuc[2]=upcase(s3);
+  }
+  //indexed access to a base; an index outside 0..2 is reported through GError()
+  // NOTE(review): presumably GError() does not return -- otherwise nuc[idx] below is out of bounds
+  char& operator[](int idx) {
+    if (idx<0 || idx>2)
+      GError("Error: Codon index out of bounds!\n");
+    return nuc[idx];
+  }
+  char operator[](int idx) const {
+    if (idx<0 || idx>2)
+      GError("Error: Codon index out of bounds!\n");
+    return nuc[idx];
+  }
+  char translate();
+ };
+//simple 1st frame forward translation of a given DNA string
+//will allocate memory for the translation -- the caller is
+// responsible for freeing the returned string!
+char* translateDNA(const char* dnastr, int& aalen, int dnalen=0);
+bool codonTableInit();
diff --git a/include/gdna.h b/include/gdna.h
new file mode 100644
index 0000000..1f923ed
--- /dev/null
+++ b/include/gdna.h
@@ -0,0 +1,15 @@
+#ifndef GDNA_H
+#define GDNA_H
+#include "GBase.h"
+char ntComplement(char c);
+//in-place reverse complement of a nucleotide (sub)sequence
+char* reverseComplement(char* seq, int slen=0);
+bool gDnaInit();
+byte gdna2bit(char* &nt, int n=4); //pack n bases into a byte (n can be 1..4)
+char g2bit2base(byte v2bit); //convert the 2-bit value into 'A', 'C', 'G' or 'T'
diff --git a/include/gff.h b/include/gff.h
new file mode 100644
index 0000000..d29da03
--- /dev/null
+++ b/include/gff.h
@@ -0,0 +1,1088 @@
+#ifndef GFF_H
+#define GFF_H
+#include "GBase.h"
+#include "gdna.h"
+#include "codons.h"
+#include "GFaSeqGet.h"
+#include "GList.hh"
+#include "GHash.hh"
+//#include <boost/crc.hpp> // for boost::crc_32_type
+const byte exMskMajSpliceL = 0x01;
+const byte exMskMajSpliceR = 0x02;
+const byte exMskMinSpliceL = 0x04;
+const byte exMskMinSpliceR = 0x08;
+const byte exMskTag = 0x80;
+//reserved Gffnames::feats entries -- basic feature types
+extern const int gff_fid_mRNA; // "mRNA" feature name
+extern const int gff_fid_transcript; // *RNA, *transcript feature name
+extern const int gff_fid_exon;
+extern const uint GFF_MAX_LOCUS;
+extern const uint GFF_MAX_EXON;
+extern const uint GFF_MAX_INTRON;
+extern const uint gfo_flag_CHILDREN_PROMOTED;
+extern const uint gfo_flag_HAS_ERRORS;
+extern const uint gfo_flag_IS_GENE;
+extern const uint gfo_flag_HAS_GFF_ID; //found a GFF3 formatted main feature with its own ID
+extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly
+ //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
+extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
+extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
+extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
+ //when GffReader is destroyed
+extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
+extern const byte gfo_flagShift_LEVEL;
+extern bool gff_show_warnings;
+#define GFF_LINELEN 2048
+#define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
+//Kind of sub-feature a segment was built from; the enumerators after exgffNone
+//take consecutive increasing values in the order listed
+enum GffExonType {
+ exgffIntron=-1, // useless "intron" feature
+ exgffNone=0, //not a recognizable exon or CDS segment
+ exgffStart, //from "start_codon" feature (within CDS)
+ exgffStop, //from "stop_codon" feature (may be outside CDS)
+ exgffCDS, //from "CDS" feature
+ exgffUTR, //from "UTR" feature
+ exgffCDSUTR, //from a merge of UTR and CDS feature
+ exgffExon, //from "exon" feature
+const char* strExonType(char xtype);
+class GffReader;
+//One parsed line of a GFF/GTF file.
+//gseqname, track, ftype and info are pointers INTO the line buffer (the copy constructor
+//rebuilds them as offsets within line[]), so they must never be freed on their own.
+class GffLine {
+ char* _parents; //stores a copy of the Parent attribute value,
+ //with commas replaced by \0
+ int _parents_len;
+ public:
+ char* dupline; //duplicate of original line
+ char* line; //this will have tabs replaced by \0
+ int llen;
+ char* gseqname;
+ char* track;
+ char* ftype; //feature name: mRNA/gene/exon/CDS
+ char* info; //the last, attributes' field, unparsed
+ uint fstart;
+ uint fend;
+ uint qstart; //overlap coords on query, if available
+ uint qend;
+ uint qlen; //query len, if given
+ double score;
+ char strand;
+ bool skip;
+ bool is_gff3; //if the line appears to be in GFF3 format
+ bool is_cds; //"cds" and "stop_codon" features
+ bool is_exon; //"exon" and "utr" features
+ char exontype; // gffExonType
+ bool is_transcript; //if current feature is *RNA or *transcript
+ bool is_gene; //if current feature is *gene
+ char phase; // '.' , '0', '1' or '2'
+ // -- allocated strings:
+ char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
+ char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
+ //
+ char** parents; //for GTF only parents[0] is used
+ //each parents[i] points inside the _parents buffer (see the copy constructor)
+ int num_parents;
+ char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
+ GffLine(GffReader* reader, const char* l); //parse the line accordingly
+ //Forgets all parent references of this line.
+ //Both the name buffer (_parents) and the pointer table indexing into it (parents) are
+ //released; previously the table was only set to NULL, which leaked it.
+ void discardParent() {
+   GFREE(_parents);
+   GFREE(parents);
+   parents=NULL; //explicit reset, in case GFREE does not clear its argument
+   _parents_len=0;
+   num_parents=0;
+ }
+ char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
+ //Deep copy of another GffLine (l and l->line must be non-NULL, otherwise GError() is raised).
+ //All scalar fields are copied bitwise first; every owned buffer is then re-created so the
+ //two objects never share memory that both destructors would release.
+ GffLine(GffLine* l):_parents(NULL), _parents_len(0),
+   dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
+   ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0),
+   score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false),
+   exontype(0), is_transcript(false), is_gene(false), phase(0),
+   gene_name(NULL), gene_id(NULL),
+   parents(NULL), num_parents(0), ID(NULL) { //a copy constructor
+   if (l==NULL || l->line==NULL)
+     GError("Error: invalid GffLine(l)\n");
+   memcpy((void*)this, (void*)l, sizeof(GffLine));
+   GMALLOC(line, llen+1);
+   memcpy(line, l->line, llen+1);
+   //the source may have no duplicate line: never copy from a NULL buffer
+   dupline=NULL;
+   if (l->dupline!=NULL) {
+     GMALLOC(dupline, llen+1);
+     memcpy(dupline, l->dupline, llen+1);
+   }
+   //--offsets within line[]
+   gseqname=line+(l->gseqname-l->line);
+   track=line+(l->track-l->line);
+   ftype=line+(l->ftype-l->line);
+   info=line+(l->info-l->line);
+   if (num_parents>0 && l->parents!=NULL && l->_parents!=NULL) {
+     parents=NULL; //re-init, just copied earlier
+     GMALLOC(parents, num_parents*sizeof(char*));
+     //_parents_len=l->_parents_len; copied above
+     _parents=NULL; //re-init, forget pointer copy
+     GMALLOC(_parents, _parents_len);
+     memcpy(_parents, l->_parents, _parents_len);
+     for (int i=0;i<num_parents;i++) {
+       parents[i]=_parents+(l->parents[i] - l->_parents);
+     }
+   }
+   else {
+     //nothing to copy: drop the pointers aliased by the memcpy above, so this object's
+     //destructor cannot free buffers that still belong to l
+     parents=NULL;
+     _parents=NULL;
+     _parents_len=0;
+     num_parents=0;
+   }
+   //-- allocated string copies:
+   ID=Gstrdup(l->ID);
+   gene_name=(l->gene_name!=NULL) ? Gstrdup(l->gene_name) : NULL;
+   gene_id=(l->gene_id!=NULL) ? Gstrdup(l->gene_id) : NULL;
+ }
+ //Default constructor: an empty line -- all pointers NULL, all numeric fields 0,
+ //all boolean flags false except skip, which starts out true
+ GffLine():_parents(NULL), _parents_len(0),
+ dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
+ ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0),
+ score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false),
+ exontype(0), is_transcript(false), is_gene(false), phase(0),
+ gene_name(NULL), gene_id(NULL),
+ parents(NULL), num_parents(0), ID(NULL) {
+ }
+ //Releases the buffers owned by this line. gseqname/track/ftype/info point into line[]
+ //and are therefore not freed separately.
+ // NOTE(review): ID is not released here although the copy constructor allocates it with
+ // Gstrdup(); looks like a leak unless ownership is handed over elsewhere -- confirm
+ ~GffLine() {
+ GFREE(dupline);
+ GFREE(line);
+ GFREE(_parents);
+ GFREE(parents);
+ GFREE(gene_name);
+ GFREE(gene_id);
+ }
+//One attribute of a GFF record: the id of the attribute name plus an owned copy of its value.
+class GffAttr {
+ public:
+  int attr_id;
+  char* attr_val;
+  GffAttr(int an_id, const char* av=NULL) {
+    attr_id=an_id;
+    attr_val=NULL;
+    setValue(av);
+  }
+  ~GffAttr() {
+    GFREE(attr_val);
+  }
+  //Replaces the stored value with a trimmed copy of av.
+  //A NULL, empty or all-blank av leaves attr_val as set by the GFREE() of the old value.
+  //Surrounding double quotes are stripped unless the value contains an internal space
+  //followed by a non-space, or a ';'.
+  void setValue(const char* av) {
+    if (attr_val!=NULL) {
+      GFREE(attr_val);
+    }
+    if (av==NULL || av[0]==0) return;
+    //trim spaces
+    const char* vstart=av;
+    //advance vstart itself (the previous code advanced av, looping forever on a leading space)
+    while (*vstart==' ') vstart++;
+    if (*vstart==0) return; //nothing but spaces: the scan below would run past the terminator
+    const char* vend=vstart;
+    bool keep_dq=false;
+    while (vend[1]!=0) {
+      if (*vend==' ' && vend[1]!=' ') keep_dq=true;
+      else if (*vend==';') keep_dq=true;
+      vend++;
+    }
+    //remove spaces at the end:
+    while (*vend==' ' && vend!=vstart) vend--;
+    //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
+    //(vend>vstart: a lone '"' is not a quoted pair and must not invert the range)
+    if (!keep_dq && vend>vstart && *vstart=='"' && *vend=='"') {
+      vend--;
+      vstart++;
+    }
+    // NOTE(review): assumes Gstrdup(from,to) copies the inclusive range from..to -- confirm
+    attr_val=Gstrdup(vstart, vend);
+  }
+  //the comparison operators below compare object ADDRESSES, not attribute ids or values
+  bool operator==(GffAttr& d){
+    return (this==&d);
+  }
+  bool operator>(GffAttr& d){
+    return (this>&d);
+  }
+  bool operator<(GffAttr& d){
+    return (this<&d);
+  }
+ };
+class GffNameList;
+class GffNames;
+//A dictionary entry: an owned copy of a name plus its index (idx starts at -1)
+class GffNameInfo {
+  friend class GffNameList;
+ public:
+  int idx;
+  char* name;
+  GffNameInfo(const char* n=NULL):idx(-1),name(NULL) {
+    if (n!=NULL) name=Gstrdup(n);
+  }
+  ~GffNameInfo() {
+    GFREE(name);
+  }
+  //entries compare by name text (both names are handed to strcmp() as they are)
+  bool operator==(GffNameInfo& d){
+    return !strcmp(name, d.name);
+  }
+  bool operator<(GffNameInfo& d){
+    return (0>strcmp(name, d.name));
+  }
+//A list of names in which every name gets a stable integer id (its list index);
+//byName maps the name text back to its entry
+class GffNameList:public GList<GffNameInfo> {
+  friend class GffNameInfo;
+  friend class GffNames;
+  GHash<GffNameInfo> byName;//hash with shared keys
+  int idlast; //fList index of last added/reused name
+  void addStatic(const char* tname) {// fast add
+    //same steps as addNewName(), additionally remembered as the last used name
+    idlast=addNewName(tname);
+  }
+  GffNameList(int init_capacity=6):GList<GffNameInfo>(init_capacity, false,true,true),
+        byName(false), idlast(-1) {
+    setCapacity(init_capacity);
+  }
+  char* lastNameUsed() {
+    if (idlast<0) return NULL;
+    return Get(idlast)->name;
+  }
+  int lastNameId() { return idlast; }
+  char* getName(int nid) { //retrieve name by its ID
+    const bool valid=(nid>=0 && nid<fCount);
+    if (!valid)
+      GError("GffNameList Error: invalid index (%d)\n",nid);
+    return fList[nid]->name;
+  }
+  int addName(const char* tname) {//returns or create an id for the given name
+    //(a shortcut comparing against the idlast entry first is disabled in the original code)
+    GffNameInfo* known=byName.Find(tname);
+    idlast=(known!=NULL) ? known->idx : addNewName(tname);
+    return idlast;
+  }
+  //appends tname without looking it up first; returns the new id
+  int addNewName(const char* tname) {
+    GffNameInfo* entry=new GffNameInfo(tname);
+    const int newid=this->Add(entry);
+    entry->idx=newid;
+    byName.shkAdd(entry->name,entry);
+    return newid;
+  }
+  int getId(const char* tname) { //only returns a name id# if found
+    GffNameInfo* known=byName.Find(tname);
+    return (known!=NULL) ? known->idx : -1;
+  }
+  int removeName() {
+    GError("Error: removing names from GffNameList not allowed!\n");
+    return -1;
+  }
+//The name dictionaries (tracks, sequences, attributes, feature types) referenced by GffObj;
+//numrefs starts at 0 -- presumably maintained by gffnames_ref()/gffnames_unref(), confirm there
+class GffNames {
+ public:
+ int numrefs;
+ GffNameList tracks;
+ GffNameList gseqs;
+ GffNameList attrs;
+ GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
+ GffNames():tracks(),gseqs(),attrs(), feats() {
+ numrefs=0;
+ //the order below is critical!
+ //has to match: gff_fid_mRNA, gff_fid_transcript, gff_fid_exon
+ feats.addStatic("mRNA");//index 0=gff_fid_mRNA
+ feats.addStatic("transcript");//index 1=gff_fid_transcript
+ feats.addStatic("exon");//third entry, so presumably index 2=gff_fid_exon -- confirm against its definition
+ //feats.addStatic("CDS"); //disabled: would become the next index (gff_fid_CDS)
+ }
+void gffnames_ref(GffNames* &n);
+void gffnames_unref(GffNames* &n);
+//Output selector passed to GffObj::printGxf()
+// (presumably the pgtf* values request GTF output and the pgff* values GFF3 -- confirm in printGxf)
+enum GffPrintMode {
+ pgtfAny, //print record as read
+ pgtfExon,
+ pgtfCDS,
+ pgffAny, //print record as read
+ pgffExon,
+ pgffCDS,
+ pgffBoth,
+//The attribute list of one feature; attributes are identified by their id in names->attrs
+class GffAttrs:public GList<GffAttr> {
+ public:
+  GffAttrs():GList<GffAttr>(false,true,false) { }
+  //sets attrname to val: an existing entry is updated in place, otherwise a new one is appended
+  //(an attribute name unknown to the dictionary is registered there first)
+  void add_or_update(GffNames* names, const char* attrname, const char* val) {
+    int aid=names->attrs.getId(attrname);
+    if (aid<0) {
+      aid=names->attrs.addNewName(attrname);
+    }
+    else {
+      //attribute found in the dictionary -- do we have it?
+      for (int i=0;i<Count();i++) {
+        if (Get(i)->attr_id!=aid) continue;
+        //update the value
+        Get(i)->setValue(val);
+        return;
+      }
+    }
+    this->Add(new GffAttr(aid, val));
+  }
+  //value of the named attribute, or NULL when the name or the attribute is not present
+  char* getAttr(GffNames* names, const char* attrname) {
+    return getAttr(names->attrs.getId(attrname));
+  }
+  //value of the attribute with the given id, or NULL (a negative id never matches)
+  char* getAttr(int aid) {
+    if (aid<0) return NULL;
+    int i=0;
+    while (i<Count()) {
+      if (Get(i)->attr_id==aid) return Get(i)->attr_val;
+      i++;
+    }
+    return NULL;
+  }
+//One exon-like segment of a GffObj, with its optional query mapping and attributes
+class GffExon : public GSeg {
+ public:
+  void* uptr; //for later extensions
+  GffAttrs* attrs; //other attributes kept for this exon
+  double score; // gff score column
+  char phase; //GFF phase column - for CDS segments only
+        // '.' = undefined (UTR), '0','1','2' for CDS exons
+  char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
+  int qstart; // for mRNA/protein exon mappings: coordinates on query
+  int qend;
+  //both coordinate pairs are stored in increasing order, whatever order they are given in
+  GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
+    const bool segOrdered=(s<e);
+    start=segOrdered ? s : e;
+    end=segOrdered ? e : s;
+    const bool qryOrdered=(qs<qe);
+    qstart=qryOrdered ? qs : qe;
+    qend=qryOrdered ? qe : qs;
+    exontype=et;
+    phase=fr;
+    score=sc;
+    attrs=NULL;
+    uptr=NULL;
+  } //constructor
+  //attribute lookup by name: NULL when there are no attributes or an argument is NULL
+  char* getAttr(GffNames* names, const char* atrname) {
+    const bool usable=(attrs!=NULL && names!=NULL && atrname!=NULL);
+    return usable ? attrs->getAttr(names, atrname) : NULL;
+  }
+  //attribute lookup by id
+  char* getAttr(int aid) {
+    return (attrs!=NULL) ? attrs->getAttr(aid) : NULL;
+  }
+  ~GffExon() { //destructor
+    if (attrs!=NULL) delete attrs;
+  }
+//A CDS segment: a GSeg interval plus its phase and an exon index
+// (presumably the index of the containing exon in GffObj::exons -- confirm in getCDSegs())
+class GffCDSeg:public GSeg {
+ public:
+ char phase;
+ int exonidx;
+//one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
+//The object's own interval (start/end) is inherited from GSeg; names of tracks, sequences
+//and feature types are stored as integer ids into the shared static dictionary `names`.
+class GffObj:public GSeg {
+ //utility segment-merging function for addExon()
+ void expandExon(int xovl, uint segstart, uint segend,
+ char exontype, double sc, char fr, int qs, int qe);
+ protected:
+ //coordinate transformation data:
+ uint xstart; //absolute genomic coordinates of reference region
+ uint xend;
+ char xstatus; //coordinate transform status:
+ //0 : (start,end) coordinates are absolute
+ //'+' : (start,end) coords are relative to xstart..xend region
+ //'-' : (start,end) are relative to the reverse complement of xstart..xend region
+ //--
+ char* gffID; // ID name for mRNA (parent) feature
+ char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
+ char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
+ unsigned int flags; //bit set of the gfo_flag_* values; the bits under gfo_flag_LEVEL_MSK hold the level
+ //-- friends:
+ friend class GffReader;
+ friend class GffExon;
+ static GffNames* names; // dictionary storage that holds the various attribute names etc.
+ int track_id; // index of track name in names->tracks
+ int gseq_id; // index of genomic sequence name in names->gseqs
+ int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
+ int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
+ //if ftype_id==gff_fid_mRNA then this value is ignored
+ GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type exon_ftype_id
+ GPVec<GffObj> children;
+ GffObj* parent;
+ int udata; //user data, flags etc.
+ void* uptr; //user pointer (to a parent object, cluster, locus etc.)
+ GffObj* ulink; //link to another GffObj (user controlled field)
+ // mRNA specific fields:
+ bool isCDS; //just a CDS, no UTRs
+ bool partial; //partial CDS
+ uint CDstart; //CDS start coord
+ uint CDend; //CDS end coord
+ char CDphase; //initial phase for CDS start
+ //--- boolean flag accessors: every getter tests one gfo_flag_* bit of `flags`,
+ //--- every setter raises that bit when v is true and clears it otherwise
+ bool hasErrors() { return (flags & gfo_flag_HAS_ERRORS) ? true : false; }
+ void hasErrors(bool v) {
+   flags = v ? (flags | gfo_flag_HAS_ERRORS) : (flags & ~gfo_flag_HAS_ERRORS);
+ }
+ bool hasGffID() { return (flags & gfo_flag_HAS_GFF_ID) ? true : false; }
+ void hasGffID(bool v) {
+   flags = v ? (flags | gfo_flag_HAS_GFF_ID) : (flags & ~gfo_flag_HAS_GFF_ID);
+ }
+ bool createdByExon() { return (flags & gfo_flag_BY_EXON) ? true : false; }
+ void createdByExon(bool v) {
+   flags = v ? (flags | gfo_flag_BY_EXON) : (flags & ~gfo_flag_BY_EXON);
+ }
+ bool isGene() { return (flags & gfo_flag_IS_GENE) ? true : false; }
+ void isGene(bool v) {
+   flags = v ? (flags | gfo_flag_IS_GENE) : (flags & ~gfo_flag_IS_GENE);
+ }
+ bool isDiscarded() { return (flags & gfo_flag_DISCARDED) ? true : false; }
+ void isDiscarded(bool v) {
+   flags = v ? (flags | gfo_flag_DISCARDED) : (flags & ~gfo_flag_DISCARDED);
+ }
+ bool isUsed() { return (flags & gfo_flag_LST_KEEP) ? true : false; }
+ void isUsed(bool v) {
+   flags = v ? (flags | gfo_flag_LST_KEEP) : (flags & ~gfo_flag_LST_KEEP);
+ }
+ bool isTranscript() { return (flags & gfo_flag_IS_TRANSCRIPT) ? true : false; }
+ void isTranscript(bool v) {
+   flags = v ? (flags | gfo_flag_IS_TRANSCRIPT) : (flags & ~gfo_flag_IS_TRANSCRIPT);
+ }
+ bool promotedChildren() { return (flags & gfo_flag_CHILDREN_PROMOTED) ? true : false; }
+ void promotedChildren(bool v) {
+   flags = v ? (flags | gfo_flag_CHILDREN_PROMOTED) : (flags & ~gfo_flag_CHILDREN_PROMOTED);
+ }
+ //--- hierarchical level, kept in the bits of `flags` selected by gfo_flag_LEVEL_MSK
+ //--- (shifted by gfo_flagShift_LEVEL); level 0 = no parent.
+ //stores v as the level. The previous code did `flags &= ~(v<<shift)`, which only cleared
+ //bits and never stored the value, so getLevel() could not reflect it.
+ void setLevel(byte v) {
+   flags = (flags & ~gfo_flag_LEVEL_MSK) |
+           ((((uint)v) << gfo_flagShift_LEVEL) & gfo_flag_LEVEL_MSK);
+ }
+ //raises the level by one and returns the level now stored
+ byte incLevel() {
+   uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
+   v++;
+   //replace the old level bits; the mask keeps an overflowing value out of the other flags
+   flags = (flags & ~gfo_flag_LEVEL_MSK) |
+           ((v << gfo_flagShift_LEVEL) & gfo_flag_LEVEL_MSK);
+   return getLevel();
+ }
+ byte getLevel() {
+   return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
+ }
+ //true when this record is flagged as a transcript and holds at least one exon
+ bool isValidTranscript() {
+ //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
+ return (isTranscript() && exons.Count()>0);
+ }
+ int addExon(uint segstart, uint segend, double sc=0, char fr='.',
+ int qs=0, int qe=0, bool iscds=false, char exontype=0);
+ int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
+ void removeExon(int idx);
+ void removeExon(GffExon* p);
+ char strand; //strand character; the default constructor sets '.' (presumably '+' or '-' once known -- confirm)
+ double gscore;
+ double uscore; //custom, user-computed score, if needed
+ int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
+ //--------- optional data:
+ int qlen; //query length, start, end - if available
+ int qstart;
+ int qend;
+ int qcov; //query coverage - percent
+ GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
+ //constructor by gff line parsing:
+ GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
+ //if gfline->Parent!=NULL then this will also add the first sub-feature
+ // otherwise, only the main feature is created
+ void copyAttrs(GffObj* from);
+ //deletes the attribute list of this record; when the first exon points at that very
+ //same list, the exon's pointer is cleared too so it is not left dangling
+ void clearAttrs() {
+   if (attrs==NULL) return;
+   const bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
+   delete attrs;
+   attrs=NULL;
+   if (!sharedattrs) return;
+   exons[0]->attrs=NULL;
+ }
+ //Creates an empty record, optionally with a copy of anid as its ID.
+ //Takes one reference on the shared names dictionary via gffnames_ref().
+ GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
+   //exons: sorted, free, non-unique
+   //-- identity
+   gffID=(anid!=NULL) ? Gstrdup(anid) : NULL;
+   gene_name=NULL;
+   geneID=NULL;
+   ftype_id=-1;
+   exon_ftype_id=-1;
+   gseq_id=-1;
+   track_id=-1;
+   gffnames_ref(names);
+   //-- hierarchy, flags and user fields
+   parent=NULL;
+   flags=0;
+   udata=0;
+   uptr=NULL;
+   ulink=NULL;
+   attrs=NULL;
+   //-- query mapping
+   qlen=0;
+   qstart=0;
+   qend=0;
+   qcov=0;
+   //-- CDS data
+   partial=true;
+   isCDS=false;
+   CDstart=0; // hasCDS <=> CDstart>0
+   CDend=0;
+   CDphase=0;
+   //-- coordinate transform state: none applied
+   xstart=0;
+   xend=0;
+   xstatus=0;
+   //-- strand and scores
+   strand='.';
+   gscore=0;
+   uscore=0;
+   covlen=0;
+ }
+ //Frees the owned strings and the attribute list, then drops this object's reference
+ //on the shared names dictionary (the counterpart of gffnames_ref() in the constructor)
+ ~GffObj() {
+ GFREE(gffID);
+ GFREE(gene_name);
+ GFREE(geneID);
+ clearAttrs();
+ gffnames_unref(names);
+ }
+ //--------------
+ GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
+ bool keepAttrs=false, bool noExonAttr=true);
+ //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
+ void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
+ const char* getSubfName() { //returns the generic feature type of the entries in exons array
+   //(a disabled variant mapped CDS-only records to gff_fid_CDS here)
+   const int sid=exon_ftype_id;
+   return names->feats.getName(sid);
+ }
+ void addCDS(uint cd_start, uint cd_end, char phase=0);
+ //true when there are no sub-features, or just one that spans exactly this record's interval
+ bool monoFeature() {
+   if (exons.Count()==0) return true;
+   if (exons.Count()!=1) return false; //exon_ftype_id==ftype_id is deliberately not required
+   return (exons[0]->end==this->end && exons[0]->start==this->start);
+ }
+ bool hasCDS() { return (0<CDstart); }
+ const char* getFeatureName() {
+   const int fid=ftype_id;
+   return names->feats.getName(fid);
+ }
+ void setFeatureName(const char* feature);
+ void addAttr(const char* attrname, const char* attrvalue);
+ int removeAttr(const char* attrname, const char* attrval=NULL);
+ int removeAttr(int aid, const char* attrval=NULL);
+ int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
+ int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
+ //name of the i-th attribute of this record (NULL when it has no attribute list)
+ const char* getAttrName(int i) {
+   return (attrs!=NULL) ? names->attrs.getName(attrs->Get(i)->attr_id) : NULL;
+ }
+ //value of the named attribute of this record; when it is not found here and
+ //checkFirstExon is set, the attributes of the first exon are consulted as well
+ char* getAttr(const char* attrname, bool checkFirstExon=false) {
+   if (names==NULL || attrname==NULL) return NULL;
+   if (attrs!=NULL) {
+     char* own=attrs->getAttr(names, attrname);
+     if (own!=NULL) return own;
+   }
+   if (checkFirstExon && exons.Count()>0)
+     return exons[0]->getAttr(names, attrname);
+   return NULL;
+ }
+ char* getExonAttr(GffExon* exon, const char* attrname) {
+   const bool usable=(exon!=NULL && attrname!=NULL);
+   return usable ? exon->getAttr(names, attrname) : NULL;
+ }
+ char* getExonAttr(int exonidx, const char* attrname) {
+   const bool inRange=(exonidx>=0 && exonidx<exons.Count());
+   if (!inRange || attrname==NULL) return NULL;
+   return exons[exonidx]->getAttr(names, attrname);
+ }
+ //value of the i-th attribute of this record (NULL when it has no attribute list)
+ char* getAttrValue(int i) {
+   return (attrs!=NULL) ? attrs->Get(i)->attr_val : NULL;
+ }
+ //name of the genomic sequence this record lies on, looked up by gseq_id in names->gseqs
+ const char* getGSeqName() {
+ return names->gseqs.getName(gseq_id);
+ }
+ //same lookup as getGSeqName()
+ const char* getRefName() {
+ return names->gseqs.getName(gseq_id);
+ }
+ void setRefName(const char* newname);
+ //name of the track of this record, looked up by track_id in names->tracks
+ const char* getTrackName() {
+ return names->tracks.getName(track_id);
+ }
+ bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
+   //ignores strand!
+   if (e<s) Gswap(s,e); //accept the bounds in either order
+   int i=0;
+   while (i<exons.Count()) {
+     if (exons[i]->overlap(s,e)) return true;
+     i++;
+   }
+   return false;
+ }
+ //Pairwise test of this record's exons against the exons of m.
+ // NOTE(review): the `break` below assumes m.exons is ordered by increasing start
+ // (the default constructor creates exons as a sorted list) -- confirm it also holds
+ // after xcoord('-'), which reverses the list
+ bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
+ //if (gseq_id!=m.gseq_id) return false;
+ // ignores strand and gseq_id, must check in advance
+ for (int i=0;i<exons.Count();i++) {
+ for (int j=0;j<m.exons.Count();j++) {
+ if (exons[i]->start>m.exons[j]->end) continue; //m's exon ends before ours starts
+ if (m.exons[j]->start>exons[i]->end) break; //m's exon starts after ours ends
+ //-- overlap if we are here:
+ return true;
+ }
+ }
+ return false;
+ }
+ //Returns the index of the first exon that overlaps OR IS ADJACENT TO the segment s..e
+ //(bounds accepted in either order), or -1 when there is none.
+ //ovlen, if given, receives the overlap length (0 for a merely adjacent exon or no hit).
+ int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
+   if (s>e) Gswap(s,e);
+   //widened copy of the query, to also catch adjacent exons; the guards keep the
+   //unsigned bounds from wrapping around at 0 and at MAXUINT
+   const uint ws=(s>0) ? s-1 : s;
+   const uint we=(e<MAXUINT) ? e+1 : e;
+   for (int i=0;i<exons.Count();i++) {
+     if (exons[i]->start>we) break;
+     if (ws>exons[i]->end) continue;
+     //-- overlap if we are here:
+     if (ovlen!=NULL) {
+       //measured against the original (not widened) segment
+       int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
+       *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
+     }
+     return i;
+   } //for each exon
+   //ovlen is optional: the previous code wrote through it unconditionally here,
+   //dereferencing NULL whenever the default argument was used and nothing matched
+   if (ovlen!=NULL) *ovlen=0;
+   return -1;
+ }
+ //Total number of bases shared between the exons of this record and the exons of m.
+ //Both exon lists are walked in parallel with two cursors.
+ // NOTE(review): this walk presumes both lists are ordered by increasing coordinate -- confirm
+ int exonOverlapLen(GffObj& m) {
+ if (start>m.end || m.start>end) return 0; //the two records do not overlap at all
+ int i=0;
+ int j=0;
+ int ovlen=0;
+ while (i<exons.Count() && j<m.exons.Count()) {
+ uint istart=exons[i]->start;
+ uint iend=exons[i]->end;
+ uint jstart=m.exons[j]->start;
+ uint jend=m.exons[j]->end;
+ if (istart>jend) { j++; continue; }
+ if (jstart>iend) { i++; continue; }
+ //exon overlap
+ uint ovstart=GMAX(istart,jstart);
+ //add up to the nearer end, then advance the list whose exon ends first
+ if (iend<jend) {
+ ovlen+=iend-ovstart+1;
+ i++;
+ }
+ else {
+ ovlen+=jend-ovstart+1;
+ j++;
+ }
+ }//while comparing exons
+ return ovlen;
+ }
+ //pointer overload of exonOverlap(GffObj&); m is dereferenced without a NULL check
+ bool exonOverlap(GffObj* m) {
+ return exonOverlap(*m);
+ }
+ //---------- coordinate transformation
+ //Switches the record (its own interval, the CDS bounds and every exon) to coordinates
+ //relative to the region grstart..grend. A transform that is already active with other
+ //parameters is undone first.
+ void xcoord(uint grstart, uint grend, char xstrand='+') {
+ //relative coordinate transform, and reverse-complement transform if xstrand is '-'
+ //does nothing if xstatus is the same already
+ if (xstatus) {
+ if (xstatus==xstrand && grstart==xstart && grend==xend) return;
+ unxcoord();//restore original coordinates
+ }
+ //the transform state must be set before xcoordseg() is called, since it reads it
+ xstatus=xstrand;
+ xstart=grstart;
+ xend=grend;
+ if (CDstart>0) xcoordseg(CDstart, CDend);
+ for (int i=0;i<exons.Count();i++) {
+ xcoordseg(exons[i]->start, exons[i]->end);
+ }
+ if (xstatus=='-') {
+ exons.Reverse(); //mirrored coordinates also invert the order of the exons
+ int flen=end-start;
+ start=xend-end+1;
+ end=start+flen;
+ }
+ else {
+ start=start-xstart+1;
+ end=end-xstart+1;
+ }
+ }
+ //transform an arbitrary segment based on current xstatus/xstart-xend
+ //(the segment is left untouched while no transform is active)
+ void xcoordseg(uint& segstart, uint &segend) {
+   switch (xstatus) {
+     case 0:
+       return;
+     case '-': {
+       //mirror within the region, keeping the segment length
+       int seglen=segend-segstart;
+       segstart=xend-segend+1;
+       segend=segstart+seglen;
+       return;
+     }
+     default:
+       //plain shift to region-relative, 1-based coordinates
+       segstart=segstart-xstart+1;
+       segend=segend-xstart+1;
+   }
+ }
+ //Undoes the transform applied by xcoord(): the CDS bounds, every exon and the record's
+ //own interval go back to absolute coordinates, and the transform state is cleared.
+ void unxcoord() { //revert back to absolute genomic/gff coordinates if a transform is active (xstatus!=0)
+ if (xstatus==0) return; //nothing to do, no transformation applied
+ if (CDstart>0) unxcoordseg(CDstart, CDend);
+ //restore all GffExon intervals too
+ for (int i=0;i<exons.Count();i++) {
+ unxcoordseg(exons[i]->start, exons[i]->end);
+ }
+ if (xstatus=='-') {
+ exons.Reverse(); //put the exons back in their original order
+ int flen=end-start;
+ start=xend-end+1;
+ end=start+flen;
+ }
+ else {
+ start=start+xstart-1;
+ end=end+xstart-1;
+ }
+ xstatus=0; //cleared last: unxcoordseg() above depends on it
+ }
+ void unxcoordseg(uint& astart, uint &aend) {
+   //restore an arbitrary interval -- does NOT change the transform state!
+   if (xstatus==0) return;
+   if (xstatus!='-') {
+     //undo the plain shift
+     astart=astart+xstart-1;
+     aend=aend+xstart-1;
+     return;
+   }
+   //undo the mirroring (the same formula that applied it), keeping the interval length
+   int seglen=aend-astart;
+   astart=xend-aend+1;
+   aend=astart+seglen;
+ }
+ //---------------------
+ //Ordering of records: by sequence id, then start, then level, then end, then ID text.
+ //gffID of both operands is handed to strcmp() as it is.
+ bool operator==(GffObj& d){
+   if (gseq_id!=d.gseq_id) return false;
+   if (start!=d.start || end!=d.end) return false;
+   return (strcmp(gffID, d.gffID)==0);
+ }
+ bool operator>(GffObj& d){
+   if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
+   if (start!=d.start) return (start>d.start);
+   if (getLevel()!=d.getLevel()) return (getLevel()>d.getLevel());
+   if (end!=d.end) return (end>d.end);
+   return (strcmp(gffID, d.gffID)>0);
+ }
+ bool operator<(GffObj& d){
+   if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
+   if (start!=d.start) return (start<d.start);
+   if (getLevel()!=d.getLevel()) return (getLevel()<d.getLevel());
+   if (end!=d.end) return (end<d.end);
+   return (strcmp(gffID, d.gffID)<0);
+ }
+ //the getters return the internal strings (no copy is made); they may be NULL
+ char* getID() { return gffID; }
+ char* getGeneID() { return geneID; }
+ char* getGeneName() { return gene_name; }
+ //replaces the gene name with a copy of gname (a NULL gname just discards the old one)
+ void setGeneName(const char* gname) {
+ GFREE(gene_name);
+ if (gname) gene_name=Gstrdup(gname);
+ }
+ //replaces the gene ID with a copy of gene_id (a NULL gene_id just discards the old one)
+ void setGeneID(const char* gene_id) {
+ GFREE(geneID);
+ if (gene_id) geneID=Gstrdup(gene_id);
+ }
+ int addSeg(GffLine* gfline);
+ int addSeg(int fnid, GffLine* gfline);
+ void getCDSegs(GArray<GffCDSeg>& cds);
+ void updateExonPhase(); //for CDS-only features, updates GExon::phase
+ void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
+ bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars=false);
+ void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
+ const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false);
+ //convenience wrapper: printGxf() in pgtfAny mode, without a parent
+ void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) {
+ printGxf(fout, pgtfAny, tlabel, NULL, cvtChars);
+ }
+ //convenience wrapper: printGxf() in pgffAny mode
+ void printGff(FILE* fout, const char* tlabel=NULL,
+ const char* gfparent=NULL, bool cvtChars=false) {
+ printGxf(fout, pgffAny, tlabel, gfparent, cvtChars);
+ }
+ //prints only when isValidTranscript(); mode is pgffBoth with showCDS, pgffExon without
+ void printTranscriptGff(FILE* fout, char* tlabel=NULL,
+ bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) {
+ if (isValidTranscript())
+ printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars);
+ }
+ void printSummary(FILE* fout=NULL);
+ void getCDS_ends(uint& cds_start, uint& cds_end);
+ void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
+ char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
+ uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
+ char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
+ char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
+ //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
+ bool empty() { return (start==0); }
+typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
+//user callback after parsing a mapping object:
+// Returns: "done with it" status:
+// TRUE if gobj is no longer needed so it's FREEd upon return
+// FALSE if the user needs the gobj pointer and is responsible for
+// collecting and freeing all GffObj objects
+//GSeqStat: collect basic stats about a common underlying genomic sequence
+// for multiple GffObj
+//Per-sequence statistics record; instances are ordered and compared by gseqid only
+class GSeqStat {
+ public:
+  int gseqid; //gseq id in the global static pool of gseqs
+  char* gseqname; //just a pointer to the name of gseq
+  int fcount;//number of features on this gseq
+  uint mincoord;
+  uint maxcoord;
+  uint maxfeat_len; //maximum feature length on this genomic sequence
+  GffObj* maxfeat;
+  //starts with an empty coordinate range: mincoord at MAXUINT, maxcoord at 0
+  GSeqStat(int id=-1, char* name=NULL):gseqid(id), gseqname(name), fcount(0),
+        mincoord(MAXUINT), maxcoord(0), maxfeat_len(0), maxfeat(NULL) {
+  }
+  bool operator>(GSeqStat& g) {
+    return (g.gseqid<gseqid);
+  }
+  bool operator<(GSeqStat& g) {
+    return (g.gseqid>gseqid);
+  }
+  bool operator==(GSeqStat& g) {
+    return !(gseqid!=g.gseqid);
+  }
+int gfo_cmpByLoc(const pointer p1, const pointer p2);
+//A list of GffObj pointers that can optionally be kept sorted by location
+class GfList: public GList<GffObj> {
+  //just adding the option to sort by genomic sequence and coordinate
+  bool mustSort;
+ public:
+  GfList(bool sortbyloc=false):GList<GffObj>(false,false,false), mustSort(sortbyloc) {
+    //GffObjs in this list are NOT deleted when the list is cleared
+    //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
+  }
+  //turns location sorting on or off; a non-empty list is re-sorted with gfo_cmpByLoc
+  //only at the moment sorting gets switched on
+  void sortedByLoc(bool v=true) {
+    const bool wasSorting=mustSort;
+    mustSort=v;
+    if (v && !wasSorting && fCount>0) {
+      this->setSorted((GCompareProc*)gfo_cmpByLoc);
+    }
+  }
+  void finalize(GffReader* gfr, bool mergeCloseExons,
+      bool keepAttrs=false, bool noExonAttr=true);
+  //deletes every object in the list, then empties the list
+  void freeAll() {
+    int i=0;
+    while (i<fCount) {
+      delete fList[i];
+      fList[i]=NULL;
+      i++;
+    }
+    Clear();
+  }
+  //deletes the objects not marked with isUsed(), then empties the list;
+  //objects marked as used are left alive for whoever holds them
+  void freeUnused() {
+    for (int i=0;i<fCount;i++) {
+      if (!fList[i]->isUsed()) {
+        //inform the children
+        int c=0;
+        while (c<fList[i]->children.Count()) {
+          fList[i]->children[c]->parent=NULL;
+          c++;
+        }
+        delete fList[i];
+        fList[i]=NULL;
+      }
+    }
+    Clear();
+  }
+//Thin wrapper around a GffObj pointer; the destructor is not defined here, so the
+//pointed-to object is not deleted by this struct
+struct GfoHolder {
+ //int idx; //position in GffReader::gflst array
+ GffObj* gffobj;
+ GfoHolder(GffObj* gfo=NULL) { //, int i=0) {
+ //idx=i;
+ gffobj=gfo;
+ }
+class CNonExon { //utility class used in subfeature promotion
+ public:
+  //int idx;
+  GffObj* parent; //not owned
+  GffExon* exon; //not owned
+  GffLine* gffline; //private deep copy of the line given to the constructor, deleted with this object
+  //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
+  CNonExon(GffObj* p, GffExon* e, GffLine* gl):parent(p), exon(e),
+        gffline(new GffLine(gl)) {
+    //idx=i;
+  }
+  ~CNonExon() {
+    delete gffline;
+  }
+ };
+//Reads a GFF/GTF stream line by line and accumulates the parsed records in gflst.
+class GffReader {
+  friend class GffObj;
+  friend class GffLine;
+  char* linebuf; //line buffer, allocated by the constructors and freed by the destructor
+  off_t fpos;
+  int buflen;
+ protected:
+  bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
+  FILE* fh;
+  char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
+  GffLine* gffline;
+  bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
+  GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
+                            // of discarded parent IDs
+  GHash< GPVec<GffObj> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
+  //GHash<int> tids; //just for transcript_id uniqueness
+  char* gfoBuildId(const char* id, const char* ctg);
+  //void gfoRemove(const char* id, const char* ctg);
+  GffObj* gfoAdd(GffObj* gfo);
+  GffObj* gfoAdd(GPVec<GffObj>& glst, GffObj* gfo);
+  // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
+  GffObj* gfoFind(const char* id, const char* ctg=NULL, GPVec<GffObj>** glst=NULL,
+      char strand=0, uint start=0, uint end=0);
+  CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
+  void subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo);
+  GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
+      bool keepAttr, bool noExonAttr);
+  GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
+  //boost::crc_32_type _crc_result;
+ public:
+  GffNames* names; //just a pointer to the global static Gff names repository in GffObj
+  GfList gflst; //accumulate GffObjs being read
+  GffObj* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
+      GffObj* parent=NULL, GffExon* pexon=NULL, GPVec<GffObj>* glst=NULL);
+  //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
+  GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline,
+      bool keepAttr);
+  GffObj* updateParent(GffObj* newgfh, GffObj* parent);
+  bool addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
+  GPVec<GSeqStat> gseqStats; //only populated after finalize()
+  //Reader over an already opened stream (which may be NULL and supplied later through init()).
+  GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
+      phash(true), gseqstats(true,true,true), gflst(sortbyloc), gseqStats(1, false) {
+    gff_warns=gff_show_warnings;
+    names=NULL;
+    gffline=NULL;
+    transcriptsOnly=t_only;
+    fpos=0;
+    fname=NULL;
+    fh=f;
+    //linebuf was left uninitialized although the destructor frees it; allocate it here,
+    //sized to agree with buflen below
+    linebuf=NULL;
+    GMALLOC(linebuf, GFF_LINELEN);
+    buflen=GFF_LINELEN-1;
+  }
+  //Re-targets the reader at another stream, which is rewound when non-NULL.
+  // NOTE(review): a stream opened by the file-name constructor is not closed here -- confirm
+  // who owns fh
+  void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
+    GFREE(fname); //release a name kept by the file-name constructor instead of dropping it
+    fname=NULL;
+    fh=f;
+    if (fh!=NULL) rewind(fh);
+    fpos=0;
+    transcriptsOnly=t_only;
+    gflst.sortedByLoc(sortbyloc);
+  }
+  //Reader over the file named fn, opened in binary read mode.
+  //The result of fopen() is not checked here: fh is NULL when the file cannot be opened.
+  GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
+      gseqstats(true,true,true), gflst(sort), gseqStats(1,false) {
+    gff_warns=gff_show_warnings;
+    names=NULL;
+    fname=Gstrdup(fn);
+    transcriptsOnly=t_only;
+    fh=fopen(fname, "rb");
+    fpos=0;
+    gffline=NULL;
+    //see the other constructor: linebuf must be valid before the destructor frees it
+    linebuf=NULL;
+    GMALLOC(linebuf, GFF_LINELEN);
+    buflen=GFF_LINELEN-1;
+  }
+  //Frees the current line, the records not marked as used, the lookup tables and the buffers.
+  // NOTE(review): fh is never closed here, not even when this reader opened it itself -- confirm
+  ~GffReader() {
+    delete gffline;
+    gffline=NULL;
+    fpos=0;
+    gflst.freeUnused();
+    gflst.Clear();
+    discarded_ids.Clear();
+    phash.Clear();
+    gseqstats.Clear();
+    GFREE(fname);
+    GFREE(linebuf);
+  }
+  //sets the warning switch of this reader AND the global gff_show_warnings default
+  void showWarnings(bool v=true) {
+    gff_warns=v;
+    gff_show_warnings=v;
+  }
+  GffLine* nextGffLine();
+  // load all subfeatures, re-group them:
+  void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
+  //boost::crc_32_type current_crc_result() const { return _crc_result; }
+}; // end of GffReader
diff --git a/include/gff_utils.h b/include/gff_utils.h
new file mode 100644
index 0000000..b15b677
--- /dev/null
+++ b/include/gff_utils.h
@@ -0,0 +1,610 @@
+#ifndef GFF_UTILS_H
+#define GFF_UTILS_H
+#include "gff.h"
+#include "GStr.h"
+#include "GFastaIndex.h"
+#include "GFaSeqGet.h"
+typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
+// Collects the transcripts that share a gene under a synthesized "gene" record.
+class GeneInfo { //for Ensembl GTF conversion
+ public:
+  int flag;
+  GffObj* gf; //synthesized gene record; NULL until create_gf() has run
+  GList<GStr> gene_names;
+  GList<GStr> transcripts; //list of transcript IDs
+  GeneInfo():gene_names(true, true, true), transcripts(true,true,true) {
+    gf=NULL;
+    flag=0;
+  }
+  // Starts a gene from its first transcript: records the transcript ID (and
+  // gene name, when present) and builds the gene record around gfrec.
+  GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
+      transcripts(true,true,true) {
+    flag=0;
+    if (gfrec->getGeneName())
+      gene_names.Add(new GStr(gfrec->getGeneName()));
+    transcripts.Add(new GStr(gfrec->getID()));
+    create_gf(gfrec, ensembl_convert);
+  }
+  // Allocates gf with gfrec's gene ID, copies gfrec's sequence, track,
+  // coordinates and strand into it, and links gfrec below it as a child.
+  // With ensembl_convert, gfrec's "type" attribute (if any) is copied too.
+  void create_gf(GffObj* gfrec, bool ensembl_convert) {
+    gf=new GffObj(gfrec->getGeneID());
+    gf->gseq_id=gfrec->gseq_id;
+    gf->track_id=gfrec->track_id;
+    gf->start=gfrec->start;
+    gf->end=gfrec->end;
+    gf->strand=gfrec->strand;
+    gf->setFeatureName("gene");
+    gf->isGene(true);
+    gf->isUsed(true);
+    gf->uptr=this; //lets the gene record lead back to this GeneInfo
+    gfrec->incLevel();
+    gfrec->parent=gf;
+    gf->children.Add(gfrec);
+    if (ensembl_convert) {
+      //gf->addAttr("type", gf->getTrackName());
+      const char* biotype=gfrec->getAttr("type");
+      if (biotype) gf->addAttr("type", biotype);
+    }
+    //gf->children.Add(gfrec);
+  }
+  //~GeneInfo() {
+  // }
+  // Adds one more transcript to an existing gene and widens the gene span to
+  // cover it. Does nothing when AddedIfNew() reports the ID as already known.
+  void update(GffObj* gfrec) {
+    if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
+      return;
+    // same guard as the constructor: a transcript without a gene name must not
+    // contribute an empty entry, because finalize() publishes gene_names[0]
+    // as the gene's "Name" attribute
+    if (gfrec->getGeneName())
+      gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
+    if (gf==NULL) {
+      GError("GeneInfo::update() called on uninitialized gf!\n");
+      //create_gf(gfrec);
+      //return;
+    }
+    gfrec->parent=gf;
+    gf->children.Add(gfrec);
+    gfrec->incLevel();
+    if (gf->start>gfrec->start)
+      gf->start=gfrec->start;
+    if (gf->end<gfrec->end)
+      gf->end=gfrec->end;
+  }
+  void finalize() {
+    //prepare attributes for printing
+    //must be called right before printing
+    if (gf==NULL || transcripts.Count()==0) return;
+    if (gene_names.Count()>0) {
+      gf->addAttr("Name", gene_names[0]->chars());
+      /*
+      GStr s(gene_names[0]->chars());
+      for (int i=1;i<gene_names.Count();i++) {
+        s.append(",");
+        s.append(gene_names[i]->chars());
+      }
+      gf->addAttr("genes", s.chars());
+      */
+    } //has gene names
+    // "transcripts" attribute: comma-separated list of all transcript IDs
+    GStr t(transcripts[0]->chars());
+    for (int i=1;i<transcripts.Count();i++) {
+      t.append(",");
+      t.append(transcripts[i]->chars());
+    }
+    gf->addAttr("transcripts", t.chars());
+  }
+};
+//genomic fasta sequence handling
+// fastaPath is either a multi-fasta file (served through a GFastaIndex) or a
+// directory holding one fasta file per genomic sequence.
+class GFastaDb {
+ public:
+  char* fastaPath;    //owned copy of the path given to init(); NULL when unset
+  GFastaIndex* faIdx; //could be a cdb .cidx file
+  int last_fetchid;   //gseq_id served by the cached faseq, -1 when none
+  GFaSeqGet* faseq;   //sequence cached by the last successful fetch()
+  //GCdbYank* gcdb;
+  // Returns a newly allocated path (caller frees) to the per-sequence fasta
+  // file <fastaPath>/<name>, <name>.fa or <name>.fasta, or NULL (with a
+  // warning) when none of them exists.
+  char* getFastaFile(int gseq_id) {
+    if (fastaPath==NULL) return NULL;
+    GStr s(fastaPath);
+    s.trimR('/');
+    s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id));
+    GStr sbase(s);
+    if (!fileExists(s.chars())) s.append(".fa");
+    if (!fileExists(s.chars())) s.append("sta"); //".fa" + "sta" -> ".fasta"
+    if (fileExists(s.chars())) return Gstrdup(s.chars());
+    else {
+      GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
+      return NULL;
+    }
+  }
+  GFastaDb(const char* fpath=NULL) {
+    //gcdb=NULL;
+    fastaPath=NULL;
+    faseq=NULL;
+    faIdx=NULL;
+    // init() returns before touching last_fetchid when no path is given, so
+    // give this public member a defined value here
+    last_fetchid=-1;
+    init(fpath);
+  }
+  // Records fpath; when it names a file (not a directory) it also loads, or
+  // rebuilds and stores, the fasta index for it. A ".fai" path is accepted in
+  // place of the fasta file name.
+  void init(const char* fpath) {
+    if (fpath==NULL || fpath[0]==0) return;
+    last_fetchid=-1;
+    if (!fileExists(fpath))
+      GError("Error: file/directory %s does not exist!\n",fpath);
+    fastaPath=Gstrdup(fpath);
+    GStr gseqpath(fpath);
+    if (fileExists(fastaPath)>1) { //exists and it's not a directory
+      GStr fainame(fastaPath);
+      if (fainame.rindex(".fai")==fainame.length()-4) {
+        //.fai index file given directly
+        fastaPath[fainame.length()-4]=0; //strip ".fai" to get the fasta name
+        if (!fileExists(fastaPath))
+          GError("Error: cannot find fasta file for index %s !\n", fastaPath);
+      }
+      else fainame.append(".fai");
+      //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
+      faIdx=new GFastaIndex(fastaPath,fainame.chars());
+      // fainamecwd: the index file name without its directory part
+      GStr fainamecwd(fainame);
+      int ip=-1;
+      if ((ip=fainamecwd.rindex(CHPATHSEP))>=0)
+        fainamecwd.cut(0,ip+1);
+      if (!faIdx->hasIndex()) { //could not load index
+        //try current directory
+        if (fainame!=fainamecwd) {
+          if (fileExists(fainamecwd.chars())>1) {
+            faIdx->loadIndex(fainamecwd.chars());
+          }
+        }
+      } //tried to load index
+      if (!faIdx->hasIndex()) {
+        GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
+        faIdx->buildIndex();
+        if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
+        GMessage("Fasta index rebuilt.\n");
+        FILE* fcreate=fopen(fainame.chars(), "w");
+        if (fcreate==NULL) {
+          GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
+          if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
+          if (fcreate==NULL)
+            GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
+        }
+        // NOTE(review): fcreate is not closed in this function - presumably
+        // storeIndex() closes it; confirm.
+        if (faIdx->storeIndex(fcreate)<faIdx->getCount())
+          GMessage("Warning: error writing the index file!\n");
+      } //index created and attempted to store it
+    } //multi-fasta
+  }
+  // Returns the sequence for gseq_id, loading and caching it when it is not
+  // the one already cached; the returned object stays owned by this GFastaDb.
+  // Returns NULL when no path is set or the sequence cannot be found.
+  GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
+    if (fastaPath==NULL) return NULL;
+    if (gseq_id==last_fetchid && faseq!=NULL) return faseq;
+    delete faseq;
+    faseq=NULL;
+    last_fetchid=-1;
+    char* gseqname=GffObj::names->gseqs.getName(gseq_id);
+    if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
+      GFastaRec* farec=faIdx->getRecord(gseqname);
+      if (farec!=NULL) {
+        faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
+            farec->line_len, farec->line_blen);
+        faseq->loadall(); //just cache the whole sequence, it's faster
+        last_fetchid=gseq_id;
+      }
+      else {
+        GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
+        return NULL;
+      }
+    }
+    else {
+      char* sfile=getFastaFile(gseq_id);
+      if (sfile!=NULL) {
+        faseq=new GFaSeqGet(sfile,checkFasta);
+        faseq->loadall();
+        last_fetchid=gseq_id;
+        GFREE(sfile);
+      }
+    } //one fasta file per contig
+    return faseq;
+  }
+  ~GFastaDb() {
+    GFREE(fastaPath);
+    //delete gcdb;
+    delete faIdx;
+    delete faseq;
+  }
+};
+class GffLocus;
+// Per-transcript bookkeeping record; a transcript points back to its record
+// through GffObj::uptr.
+class GTData { //transcript associated data
+ public:
+  GffObj* rna;
+  GffLocus* locus;
+  GffObj* replaced_by;
+  GeneInfo* geneinfo;
+  int flag;
+  // Attaches the record to transcript t (if given): whatever t->uptr held
+  // before is kept as geneinfo, then t->uptr is pointed at this record.
+  GTData(GffObj* t=NULL):rna(t), locus(NULL), replaced_by(NULL), geneinfo(NULL), flag(0) {
+    if (t==NULL) return;
+    geneinfo=(GeneInfo*)t->uptr; //take over geneinfo, if there
+    t->uptr=this;
+  }
+  // Records are ordered and compared by the address of their transcript.
+  bool operator<(GTData& b) { return rna < b.rna; }
+  bool operator==(GTData& b) { return rna == b.rna; }
+};
+// A gene symbol together with a frequency counter.
+class CGeneSym {
+ public:
+  GStr name;
+  int freq;
+  CGeneSym(const char* n=NULL, int f=0):name(n), freq(f) { }
+  // Ordering: higher freq first; ties go to the shorter name, and names of
+  // equal length are compared with GStr's operator<.
+  bool operator<(CGeneSym& b) {
+    if (freq!=b.freq) return (freq>b.freq);
+    if (name.length()!=b.name.length()) return (name.length()<b.name.length());
+    return (name<b.name);
+  }
+  // Equality looks at the name only; freq is ignored.
+  bool operator==(CGeneSym& b) { return name==b.name; }
+};
+const char* getGeneDescr(const char* gsym);
+void printLocus(GffLocus* loc, const char* pre=NULL);
+// A cluster of exon-overlapping transcripts on one strand of one genomic
+// sequence. The inherited start/end hold the span of the whole cluster and
+// mexons holds the union of the exons of all member transcripts.
+class GffLocus:public GSeg {
+ public: //without an access specifier every member, the constructor included, would be private
+  int gseq_id; //id of underlying genomic sequence
+  int locus_num;
+  bool is_mrna;
+  char strand;
+  GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript)
+  GList<GffObj> rnas; //list of transcripts (isoforms) for this locus
+  GArray<GSeg> mexons; //list of merged exons in this region
+  GList<CGeneSym> gene_names;
+  GList<CGeneSym> gene_ids;
+  int v; //user flag/data
+  /*
+  bool operator==(GffLocus& d){
+    return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end);
+  }
+  bool operator<(GffLocus& d){
+    if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
+    if (start==d.start) {
+      if (end==d.end) return strand<d.strand;
+      else return end<d.end;
+    } else return (start<d.start);
+  }
+  */
+  // First entry of gene_names, or NULL when the list is empty.
+  const char* getGeneName() {
+    if (gene_names.Count()==0) return NULL;
+    return gene_names.First()->name.chars();
+  }
+  // ID of the maximum-coverage transcript; NULL for a locus without transcripts.
+  const char* get_tmax_id() {
+    if (t_maxcov==NULL) return NULL;
+    return t_maxcov->getID();
+  }
+  // Best available description: the first gene name for which getGeneDescr()
+  // knows one, otherwise the first of the product/descr/description/info
+  // attributes present on the maximum-coverage transcript; NULL if none.
+  const char* get_descr() {
+    if (gene_names.Count()>0) {
+      for (int i=0;i<gene_names.Count();i++) {
+        // query the i-th name: asking First() on every pass would test the
+        // same symbol Count() times and never reach the others
+        const char* gn=getGeneDescr(gene_names[i]->name.chars());
+        if (gn!=NULL) return gn;
+      }
+    }
+    if (t_maxcov==NULL) return NULL; //locus without transcripts
+    char* s=t_maxcov->getAttr("product");
+    if (s!=NULL) return s;
+    s=t_maxcov->getAttr("descr");
+    if (s!=NULL) return s;
+    s=t_maxcov->getAttr("description");
+    if (s!=NULL) return s;
+    s=t_maxcov->getAttr("info");
+    if (s!=NULL) return s;
+    return NULL;
+  }
+  // Creates an empty locus, or one seeded with transcript t: span and mexons
+  // are taken from t's exons and t's GTData (t->uptr) is pointed at this locus.
+  GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true),
+      gene_names(true,true,false), gene_ids(true,true,false) {
+    //this will NOT free rnas!
+    t_maxcov=NULL;
+    gseq_id=-1;
+    v=0;
+    locus_num=0;
+    start=0;
+    end=0;
+    strand=0;
+    is_mrna=false;
+    if (t!=NULL) {
+      start=t->exons.First()->start;
+      end=t->exons.Last()->end;
+      gseq_id=t->gseq_id;
+      GSeg seg;
+      for (int i=0;i<t->exons.Count();i++) {
+        seg.start=t->exons[i]->start;
+        seg.end=t->exons[i]->end;
+        mexons.Add(seg);
+      }
+      rnas.Add(t);
+      ((GTData*)(t->uptr))->locus=this;
+      t_maxcov=t;
+      strand=t->strand;
+      if (t->ftype_id==gff_fid_mRNA) {
+        is_mrna=true;
+      }
+    }
+  }
+  // Absorbs another locus: merges its mexons into ours, re-homes its
+  // transcripts (all except lnkrna are appended to rnas) and widens the span.
+  void addMerge(GffLocus& locus, GffObj* lnkrna) {
+    //add all the elements of the other locus (merging)
+    //-- merge mexons
+    GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
+    int i=0; //index of first mexons with a merge
+    int j=0; //index current mrna exon
+    while (i<mexons.Count() && j<locus.mexons.Count()) {
+      uint istart=mexons[i].start;
+      uint iend=mexons[i].end;
+      uint jstart=locus.mexons[j].start;
+      uint jend=locus.mexons[j].end;
+      if (iend<jstart) { i++; continue; }
+      if (jend<istart) { j++; continue; }
+      ovlexons.Add(j);
+      //extend mexons[i] as needed
+      if (jstart<istart) mexons[i].start=jstart;
+      if (jend>iend) { //mexons[i] end extend
+        mexons[i].end=jend;
+        //now this could overlap the next mexon(s), so we have to merge them all
+        while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
+          uint nextend=mexons[i+1].end;
+          mexons.Delete(i+1);
+          if (nextend>mexons[i].end) {
+            mexons[i].end=nextend;
+            break; //no need to check next mexons
+          }
+        } //while next mexons merge
+      } // mexons[i] end extend
+      j++; //check the next locus.mexon
+    }
+    //-- add the rest of the non-overlapping mexons:
+    GSeg seg;
+    for (int i=0;i<locus.mexons.Count();i++) {
+      seg.start=locus.mexons[i].start;
+      seg.end=locus.mexons[i].end;
+      if (!ovlexons.Exists(i)) mexons.Add(seg);
+    }
+    // -- add locus.rnas
+    for (int i=0;i<locus.rnas.Count();i++) {
+      ((GTData*)(locus.rnas[i]->uptr))->locus=this;
+      if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]);
+    }
+    // -- adjust start/end as needed
+    if (start>locus.start) start=locus.start;
+    if (end<locus.end) end=locus.end;
+    if (locus.is_mrna) is_mrna=true;
+    if (t_maxcov->covlen<locus.t_maxcov->covlen)
+      t_maxcov=locus.t_maxcov;
+  }
+  // True when the two loci are on the same strand and at least one pair of
+  // merged exons overlaps (both mexons lists are walked in parallel).
+  bool exonOverlap(GffLocus& loc) {
+    //check if any mexons overlap!
+    if (strand!=loc.strand || loc.start>end || start>loc.end) return false;
+    int i=0;
+    int j=0;
+    while (i<mexons.Count() && j<loc.mexons.Count()) {
+      uint istart=mexons[i].start;
+      uint iend=mexons[i].end;
+      uint jstart=loc.mexons[j].start;
+      uint jend=loc.mexons[j].end;
+      if (iend<jstart) { i++; continue; }
+      if (jend<istart) { j++; continue; }
+      //exon overlap found if we're here:
+      return true;
+    }
+    return false;
+  }
+  // Adds t to the locus only if it is on the same sequence and strand and one
+  // of its exons overlaps a merged exon; mexons is updated while checking.
+  // Returns whether t was added.
+  bool add_RNA(GffObj* t) {
+    //if (rnas.Count()==0) return true; //? should never be called on an empty locus
+    if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end)
+      return false; //rna must be on the same genomic seq
+    //check for exon overlap with existing mexons
+    //also update mexons accordingly if t is to be added
+    bool hasovl=false;
+    int i=0; //index of first mexons with a merge
+    int j=0; //index current t exon
+    GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
+    while (i<mexons.Count() && j<t->exons.Count()) {
+      uint istart=mexons[i].start;
+      uint iend=mexons[i].end;
+      uint jstart=t->exons[j]->start;
+      uint jend=t->exons[j]->end;
+      if (iend<jstart) { i++; continue; }
+      if (jend<istart) { j++; continue; }
+      //exon overlap found if we're here:
+      ovlexons.Add(j);
+      hasovl=true;
+      //extend mexons[i] as needed
+      if (jstart<istart) mexons[i].start=jstart;
+      if (jend>iend) { //mexon stretch up
+        mexons[i].end=jend;
+        //now this could overlap the next mexon(s), so we have to merge them all
+        while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
+          uint nextend=mexons[i+1].end;
+          mexons.Delete(i+1);
+          if (nextend>mexons[i].end) {
+            mexons[i].end=nextend;
+            break; //no need to check next mexons
+          }
+        } //while next mexons merge
+      } //possible mexons merge
+      j++; //check the next t exon
+    }//all vs all exon check loop
+    if (hasovl) {
+      GSeg seg;
+      //add the rest of the non-overlapping exons
+      for (int i=0;i<t->exons.Count();i++) {
+        seg.start=t->exons[i]->start;
+        seg.end=t->exons[i]->end;
+        if (!ovlexons.Exists(i)) mexons.Add(seg);
+      }
+      rnas_add(t);
+      // add to rnas
+      ((GTData*)t->uptr)->locus=this;
+      gseq_id=t->gseq_id;
+    }
+    return hasovl;
+  }
+  //simpler,basic adding of a mrna
+  // Appends t without any overlap test and updates span, strand, is_mrna and
+  // the maximum-coverage transcript.
+  void rnas_add(GffObj* t) {
+    rnas.Add(t);
+    // adjust start/end
+    //if (start==0 || start>t->start) start=t->start;
+    if (start==0) start=t->start;
+    else if (start>t->start) {
+      start=t->start;
+    }
+    if (end<t->end) end=t->end;
+    // a locus built without a transcript has t_maxcov==NULL; the start==0 and
+    // strand==0 cases here show that such a locus is an expected caller
+    if (t_maxcov==NULL || t_maxcov->covlen<t->covlen) t_maxcov=t;
+    if (strand==0) strand=t->strand;
+    if (t->ftype_id==gff_fid_mRNA) is_mrna=true;
+  }
+};
+// Everything loaded for one genomic sequence: gene-level features,
+// transcripts, locus clusters and the per-transcript GTData records.
+class GenomicSeqData {
+  int gseq_id;
+ public:
+  const char* gseq_name; //name registered for gseq_id; NULL when gseq_id<0
+  GList<GffObj> gfs; //all non-transcript features -> usually gene features
+  GList<GffObj> rnas; //all transcripts on this genomic sequence
+  GList<GffLocus> loci; //all loci clusters
+  GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
+  //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
+  GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
+      tdata(false,true,false) {
+    gseq_id=gid;
+    // the default gid (-1) skips the lookup below, so give the public pointer
+    // a defined value first
+    gseq_name=NULL;
+    if (gseq_id>=0)
+      gseq_name=GffObj::names->gseqs.getName(gseq_id);
+  }
+  // Objects are compared and ordered by sequence id only.
+  bool operator==(GenomicSeqData& d){
+    return gseq_id==d.gseq_id;
+  }
+  bool operator<(GenomicSeqData& d){
+    return (gseq_id<d.gseq_id);
+  }
+};
+int gseqCmpName(const pointer p1, const pointer p2);
+// The two nucleotides of a splice site, stored upper-case and NUL-terminated.
+class GSpliceSite {
+ public:
+  char nt[3];
+  // Takes the two characters at c; with revc they are complemented and
+  // swapped. A NULL c yields an empty site.
+  GSpliceSite(const char* c, bool revc=false) {
+    nt[2]=0;
+    if (c==NULL) {
+      nt[0]=0;
+      nt[1]=0;
+      return;
+    }
+    if (revc) {
+      nt[0]=toupper(ntComplement(c[1]));
+      nt[1]=toupper(ntComplement(c[0]));
+    }
+    else {
+      nt[0]=toupper(c[0]);
+      nt[1]=toupper(c[1]);
+    }
+  }
+  // Extracts the site from an intron sequence of intronlen characters: the
+  // first two bases for a forward-strand donor or reverse-strand acceptor,
+  // the last two otherwise (reverse-complemented when revc is set).
+  GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
+    nt[2]=0;
+    // two bases are always read, and intronlen-2 is used as an offset: any
+    // length below 2 would address memory outside the intron
+    if (intron==NULL || intronlen<2)
+      GError("Error: invalid intron or intron len for GSpliceSite()!\n");
+    const char* c=intron;
+    if (revc) {
+      if (!getAcceptor) c+=intronlen-2;
+      nt[0]=toupper(ntComplement(c[1]));
+      nt[1]=toupper(ntComplement(c[0]));
+    }
+    else { //on forward strand
+      if (getAcceptor) c+=intronlen-2;
+      nt[0]=toupper(c[0]);
+      nt[1]=toupper(c[1]);
+    }//forward strand
+  }
+  GSpliceSite(const char n1, const char n2) {
+    nt[2]=0;
+    nt[0]=toupper(n1);
+    nt[1]=toupper(n2);
+  }
+  // True for GC and GT.
+  bool canonicalDonor() {
+    return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
+  }
+  bool operator==(GSpliceSite& c) {
+    return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
+  }
+  bool operator==(GSpliceSite* c) {
+    return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
+  }
+  bool operator==(const char* c) {
+    //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
+    //assumes given const nucleotides are uppercase already!
+    return (nt[0]==c[0] && nt[1]==c[1]);
+  }
+  bool operator!=(const char* c) {
+    //assumes given const nucleotides are uppercase already!
+    return (nt[0]!=c[0] || nt[1]!=c[1]);
+  }
+};
+// Holds the input stream and the loading options for one GFF/GTF file.
+struct GffLoader {
+  GStr fname;
+  FILE* f;
+  bool transcriptsOnly;
+  bool fullAttributes;
+  bool noExonAttrs;
+  bool mergeCloseExons;
+  bool showWarnings;
+  bool noPseudo;
+  void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
+      bool matchAllIntrons=true, bool fuzzSpan=false);
+  void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
+      bool doCluster=true, bool doCollapseRedundant=true,
+      bool matchAllIntrons=true, bool fuzzSpan=false, bool forceExons=false);
+  // Opens filename for reading; "-" and "stdin" both select standard input
+  // (fname then reads "stdin"). Any other name that cannot be opened is
+  // reported through GError().
+  GffLoader(const char* filename):fname(filename), f(NULL), transcriptsOnly(true),
+      fullAttributes(false), noExonAttrs(false), mergeCloseExons(false),
+      showWarnings(false), noPseudo(false) {
+    const bool fromStdin=(fname=="-" || fname=="stdin");
+    if (!fromStdin) {
+      f=fopen(fname.chars(), "r");
+      if (f==NULL) {
+        GError("Error: cannot open gff file %s!\n",fname.chars());
+      }
+      return;
+    }
+    f=stdin;
+    fname="stdin";
+  }
+  // Closes the stream unless it is standard input.
+  ~GffLoader() {
+    if (f==NULL || f==stdin) return;
+    fclose(f);
+  }
+};
+void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1);
+//"position" a given coordinate x within a list of transcripts sorted by their start (lowest)
+//coordinate, using quick-search; the returned int is the list index of the closest *higher*
+//GffObj - i.e. starting right *ABOVE* the given coordinate
+//Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x)
+int qsearch_rnas(uint x, GList<GffObj>& rnas);
+int qsearch_gloci(uint x, GList<GffLocus>& loci);
+GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
+//void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
+void collectLocusData(GList<GenomicSeqData>& ref_data);
diff --git a/src/GArgs.cpp b/src/GArgs.cpp
new file mode 100644
index 0000000..f3b72b9
--- /dev/null
+++ b/src/GArgs.cpp
@@ -0,0 +1,376 @@
+#include "GBase.h"
+#include "GArgs.h"
+#include <ctype.h>
+// Builds the option table from a compact format string, then parses argv.
+GArgs::GArgs(int argc, char* const argv[], const char* format, bool nodigitopts) {
+  /* format can be:
+     <string>{;|=} e.g. disable-test;PID=S= for --disable-test PID=50 (or --PID 50) S=3.5 etc.
+     <letter>[:] e.g. p:hT for -p testing (or -ptesting) -h -T
+  */
+  // same clean start as the GArgsDef-based constructor: the code below grows
+  // fmt from fmtcount, and parseArgs() reads _argc/_argv and the counters
+  fmtcount=0;
+  count=0;
+  nonOptCount=0;
+  nonOptPos=0;
+  optPos=0;
+  errarg=0;
+  err_valmissing=false;
+  args=NULL;
+  fmt=NULL;
+  _argc=argc;
+  _argv=argv;
+  if (format==NULL) return; //no definitions, as with a NULL GArgsDef table
+  const char* fstr=format;
+  int fmtlen=strlen(format);
+  //---- first parse the format string
+  while (fstr-format < fmtlen ) {
+    int l=strcspn(fstr, ";=:");
+    if (fstr[l]==0) { //end of string reached
+      //all previous chars are just switches:
+      GREALLOC(fmt, (fmtcount+l)*sizeof(fmtdef));
+      //store each switch
+      for (int i=0; i<l;i++) {
+        fmt[fmtcount+i].longopt=NULL;
+        fmt[fmtcount+i].opt=fstr[i];
+        fmt[fmtcount+i].req_value = false;
+        fmt[fmtcount+i].code=fmtcount+i+1;
+      }
+      fmtcount+=l;
+      break;
+    }
+    else {
+      if (fstr[l]==':') {
+        //fstr[l-1] is an argument, but all the previous are just switches
+        GREALLOC(fmt, (fmtcount+l)*sizeof(fmtdef));
+        //store each switch AND the option
+        for (int i=0; i<l;i++) {
+          fmt[fmtcount+i].longopt=NULL; //one char length
+          fmt[fmtcount+i].opt=fstr[i];
+          fmt[fmtcount+i].req_value = (i==l-1);
+          fmt[fmtcount+i].code=fmtcount+i+1;
+        }
+        fmtcount+=l;
+      }
+      else { // fstr[l]=='=' or ';'
+        GREALLOC(fmt, (fmtcount+1)*sizeof(fmtdef));
+        fmt[fmtcount].longopt=Gstrdup(fstr, fstr+l-1);
+        fmt[fmtcount].opt=0;
+        fmt[fmtcount].req_value=(fstr[l]=='=');
+        fmt[fmtcount].code=fmtcount+1;
+        fmtcount++;
+      }
+      fstr+=l+1; //continue after the terminator character
+    }
+  }
+  //-- now parse the arguments based on the given format specification
+  parseArgs(nodigitopts);
+  }
+// Walks argv[1..] and fills args[] according to the fmt[] table.
+// Returns 0 on success, otherwise the argv index of the offending argument
+// (also kept in errarg; err_valmissing tells a missing value from an
+// unknown option).
+int GArgs::parseArgs(bool nodigitopts) {
+  int p=1; //skip program name
+  int f=0;
+  while (p<_argc) {
+    if (_argv[p][0]=='-' && (_argv[p][1]==0 || _argv[p][1]!='-')) {
+      //single-dash argument
+      int cpos=1;
+      char c=_argv[p][cpos];
+      if (c==0 || (nodigitopts && isdigit(c)) ||
+          (c=='.' && isdigit(_argv[p][cpos+1]))) {
+        //special case: plain argument '-' or just a negative number
+        GREALLOC(args, (count+1)*sizeof(argdata));
+        args[count].opt=NULL;
+        args[count].fmti=-1;
+        if (c==0) {
+          GCALLOC(args[count].value, 2);
+          args[count].value[0]='-';
+        }
+        else { //negative number given
+          args[count].value=Gstrdup(_argv[p]);
+        }
+        count++;
+        nonOptCount++;
+      }
+      else { //single-dash argument or switch
+        // a switch may be followed by more switches (or a final value option)
+        // in the same argument, e.g. -hT: keep consuming letters until the
+        // argument ends or a value option takes the remainder
+        bool grouped=true;
+        while (grouped) {
+          grouped=false;
+          if ((f=validShortOpt(c))>=0) {
+            GREALLOC(args, (count+1)*sizeof(argdata));
+            GCALLOC(args[count].opt, 2);
+            args[count].opt[0]=c;
+            args[count].fmti=f;
+            if (!fmt[f].req_value) {//switch type
+              GCALLOC(args[count].value,1);//so getOpt() functions would not return NULL
+              count++;
+              // only switches can be grouped with some other switches or options
+              if (_argv[p][cpos+1]!='\0') {
+                cpos++;
+                c=_argv[p][cpos];
+                grouped=true;
+              }
+            }
+            else {
+              //single-dash argument followed by a value
+              if (_argv[p][cpos+1]=='\0') {
+                if (p+1<_argc && _argv[p+1][0]!=0) { //value is the whole next argument
+                  p++;
+                  args[count].value=Gstrdup(_argv[p]);
+                }
+                else {
+                  errarg=p;
+                  err_valmissing=true;
+                  return errarg;
+                }
+              }
+              else { //value immediately follows the dash-option
+                args[count].value=Gstrdup(_argv[p]+cpos+1);
+              }
+              count++;
+            }
+          } //was validShortOpt
+          else { //option not found in format definition!
+            errarg=p;
+            return errarg;
+          }
+        } //grouped short options
+      }
+    } //-single-dash
+    else {//not a single-dash argument
+      char* ap=_argv[p];
+      bool is_longopt=false;
+      if (*ap=='-' && ap[1]=='-') {
+        is_longopt=true;
+        ap+=2;
+      }
+      // an empty ap ("" or a bare "--") has nothing after its terminator to scan
+      char* e=(*ap!=0) ? strchr(ap+1,'=') : NULL;
+      // skip '=' characters escaped with a backslash; the search has to resume
+      // AFTER the current match or it would find the same '=' forever
+      while (e!=NULL && *(e-1)=='\\') e=strchr(e+1,'=');
+      if (e==NULL && is_longopt) {
+        e=ap;
+        while (*e!=0 && *e!=' ') e++;
+        //e will be on eos or next space
+      }
+      if (e!=NULL && e>ap) {
+        //this must be a long option
+        //e is on eos, space or '='
+        if ((f=validLongOpt(ap,e-1))>=0) {
+          GREALLOC(args, (count+1)*sizeof(argdata));
+          args[count].opt=Gstrdup(ap,e-1);
+          args[count].fmti=f;
+          if (fmt[f].req_value) {
+            if (*e==0) {
+              //value is the next argument
+              if (p+1<_argc && _argv[p+1][0]!=0) {
+                p++;
+                args[count].value=Gstrdup(_argv[p]);
+              }
+              else {
+                errarg=p;
+                err_valmissing=true;
+                return errarg;
+              }
+            }
+            else { //value is in the same argument
+              //while (*e!=0 && (*e==' ' || *e=='=')) e++;
+              if (*e=='=') e++;
+              if (*e==0) {
+                errarg=p;
+                err_valmissing=true;
+                return errarg;
+              }
+              args[count].value=Gstrdup(e);
+            }
+          } //value required
+          else { //no value expected
+            GCALLOC(args[count].value,1); //do not return NULL
+          }
+          count++;
+        }
+        else { //error - this long argument not recognized
+          errarg=p;
+          return errarg;
+        }
+      }
+      else { //just a plain non-option argument
+        if (e==ap) { //i.e. just "--"
+          errarg=p;
+          return errarg;
+        }
+        GREALLOC(args, (count+1)*sizeof(argdata));
+        args[count].opt=NULL; //it's not an option
+        args[count].value=Gstrdup(_argv[p]);
+        args[count].fmti=-1;
+        count++;
+        nonOptCount++;
+      }
+    }
+    p++;//check next arg string
+  } //while arguments
+  return errarg;
+}
+// Writes the optional usage text and the message for the recorded argument
+// error to fout; silent when no error was recorded. Ends the program with
+// status 1 when exitProgram is set.
+void GArgs::printError(FILE* fout, const char* usage, bool exitProgram) {
+  if (errarg==0) return;
+  if (usage!=NULL) fprintf(fout, "%s\n", usage);
+  const char* badarg=_argv[errarg];
+  if (!err_valmissing) {
+    fprintf(fout, "Error: invalid argument '%s'\n", badarg);
+  }
+  else {
+    fprintf(fout, "Error: value required for option '%s'\n", badarg);
+  }
+  if (exitProgram) {
+    exit(1);
+  }
+}
+// Same as the FILE* overload, with stderr as the destination.
+void GArgs::printError(const char* usage, bool exitProgram) {
+  FILE* const dest=stderr;
+  printError(dest, usage, exitProgram);
+}
+// Echoes the stored command line on one line: arguments separated by single
+// spaces and closed by a newline.
+void GArgs::printCmdLine(FILE* fout) {
+  if (_argv==NULL) return;
+  const int last=_argc-1;
+  for (int i=0;i<=last;i++) {
+    const char sep=(i==last) ? '\n' : ' ';
+    fprintf(fout, "%s%c", _argv[i], sep);
+  }
+}
+// Builds the option table from an array of GArgsDef records, then parses
+// argv. The array ends at the first record with neither a long nor a short
+// name; at most 255 records are used. A NULL array leaves the object empty
+// and unparsed.
+GArgs::GArgs(int argc, char* const argv[], const GArgsDef fmtrecs[], bool nodigitopts) {
+  _argc=argc;
+  _argv=argv;
+  args=NULL;
+  fmt=NULL;
+  fmtcount=0;
+  count=0;
+  nonOptCount=0;
+  nonOptPos=0;
+  optPos=0;
+  errarg=0;
+  err_valmissing=false;
+  if (fmtrecs==NULL) return;
+  //count the definitions up to the terminating empty record
+  for (const GArgsDef* rec=fmtrecs; (rec->longopt || rec->opt) && fmtcount<255; ) {
+    fmtcount++;
+    rec=&(fmtrecs[fmtcount]);
+  }
+  GCALLOC(fmt, fmtcount*sizeof(fmtdef));
+  for (int i=0;i<fmtcount;i++) {
+    const GArgsDef& src=fmtrecs[i];
+    fmt[i].longopt=Gstrdup(src.longopt); //private copy, freed by the destructor
+    fmt[i].opt=src.opt;
+    fmt[i].req_value=src.req_value;
+    fmt[i].code=src.code;
+  }
+  parseArgs(nodigitopts);
+}
+// Frees the long option names, every parsed option name/value and the two
+// tables themselves.
+GArgs::~GArgs() {
+  for (int fi=0; fi<fmtcount; fi++) {
+    GFREE(fmt[fi].longopt);
+  }
+  GFREE(fmt);
+  for (int ai=0; ai<count; ai++) {
+    GFREE(args[ai].opt);
+    GFREE(args[ai].value);
+  }
+  GFREE(args);
+}
+// Index in fmt[] of the definition whose short option letter is o, or -1.
+int GArgs::validShortOpt(char o) {
+  int i=0;
+  while (i<fmtcount) {
+    if (fmt[i].opt==o) return i;
+    i++;
+  }
+  return -1;
+}
+// Index in fmt[] of the definition whose long name equals the characters
+// o..to (both ends included), or -1 when there is none.
+int GArgs::validLongOpt(char* o, char* to) {
+  char* name=Gstrdup(o,to); //NUL-terminated copy for strcmp()
+  int found=-1;
+  for (int i=0; i<fmtcount && found<0; i++) {
+    if (fmt[i].longopt==NULL) continue;
+    if (strcmp(fmt[i].longopt, name)==0) found=i;
+  }
+  GFREE(name);
+  return found;
+}
+// Index in fmt[] of the definition carrying the given code, or -1.
+int GArgs::validOpt(int code) {
+  int i=0;
+  while (i<fmtcount) {
+    if (fmt[i].code==code) return i;
+    i++;
+  }
+  return -1;
+}
+// Reports the parse status recorded in errarg (set by parseArgs()).
+int GArgs::isError() { // returns the offending argv position or 0 if no error
+ return errarg;
+ }
+/* Looks up option o by name among the parsed arguments.
+   Returns
+     NULL       if the option was not given at all
+     ""         (non-NULL) if it was given as a boolean switch
+     the value  if it was given as a value option
+*/
+char* GArgs::getOpt(const char* o) {
+  for (int i=0; i<count; i++) {
+    if (args[i].opt==NULL) continue; //plain argument, not an option
+    if (strcmp(args[i].opt, o)==0) return args[i].value;
+  }
+  return NULL;
+}
+// Value of the single-letter option o, or NULL when it was not given; only
+// one-character option names can match.
+char* GArgs::getOpt(const char o) {
+  for (int i=0; i<count; i++) {
+    const char* name=args[i].opt;
+    if (name==NULL) continue;
+    if (name[0]==o && name[1]=='\0') return args[i].value;
+  }
+  return NULL;
+}
+// Value of the first parsed option whose definition carries code c, or NULL.
+char* GArgs::getOpt(int c) {
+  for (int i=0; i<count; i++) {
+    const int def=args[i].fmti;
+    if (def<0) continue; //plain argument, no definition attached
+    if (fmt[def].code==c) return args[i].value;
+  }
+  return NULL;
+}
+// Name (as typed, without dashes) of the first parsed option whose definition
+// carries code c, or NULL.
+char* GArgs::getOptName(int c) {
+  for (int i=0; i<count; i++) {
+    const int def=args[i].fmti;
+    if (def<0) continue; //plain argument, no definition attached
+    if (fmt[def].code==c) return args[i].opt;
+  }
+  return NULL;
+}
+int GArgs::startNonOpt(){ //reset iteration through non-option arguments
+  //returns the number of non-option arguments
+  // nextNonOpt() resumes from nonOptPos, so rewinding it is what makes the
+  // next nextNonOpt() call start from the first argument again
+  nonOptPos=0;
+  return nonOptCount;
+}
+// Returns the next plain (non-option) argument at or after the iteration
+// cursor nonOptPos and moves the cursor past it; NULL when none is left.
+char* GArgs::nextNonOpt() {
+  int i=nonOptPos;
+  while (i<count && args[i].opt!=NULL) i++; //skip over options
+  if (i>=count) return NULL;
+  nonOptPos=i+1;
+  return args[i].value;
+}
+int GArgs::startOpt(){ //reset iteration through option arguments
+  //returns the number of option arguments
+  // nextOpt()/nextCode() resume from optPos, so rewinding it is what makes
+  // the next call start from the first option again
+  optPos=0;
+  return count-nonOptCount;
+}
+// Returns the name of the next option at or after the iteration cursor optPos
+// and moves the cursor past it; NULL when no option is left.
+char* GArgs::nextOpt() {
+  int i=optPos;
+  while (i<count && args[i].opt==NULL) i++; //skip over plain arguments
+  if (i>=count) return NULL;
+  optPos=i+1;
+  return args[i].opt;
+}
+// Returns the code of the next option at or after the iteration cursor optPos
+// and moves the cursor past it; 0 when no option is left.
+int GArgs::nextCode() {
+  int i=optPos;
+  while (i<count && (args[i].opt==NULL || args[i].fmti<0)) i++;
+  if (i>=count) return 0; //must make sure that codes are > 0 for this to work properly
+  optPos=i+1;
+  return fmt[args[i].fmti].code;
+}
diff --git a/src/GBase.cpp b/src/GBase.cpp
new file mode 100644
index 0000000..ed117f5
--- /dev/null
+++ b/src/GBase.cpp
@@ -0,0 +1,780 @@
+#include "GBase.h"
+#include <stdarg.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#ifndef S_ISDIR
+#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
+#ifndef S_ISREG
+#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
+ // NOTE(review): the body is empty in this copy of the patch, so this non-void
+ // function returns no value; the surrounding preprocessor guards are not
+ // visible here - check against the upstream file before relying on it.
+ int fseeko(FILE *stream, off_t offset, int whence) {
+ }
+ // NOTE(review): the body is empty in this copy of the patch, so this non-void
+ // function returns no value; the surrounding preprocessor guards are not
+ // visible here - check against the upstream file before relying on it.
+ off_t ftello(FILE *stream) {
+ }
+// printf-style formatting into a newly allocated string.
+//   retp - receives the buffer (caller frees) or NULL on failure
+// Returns the length of the formatted text (terminator excluded), or -1 when
+// formatting or allocation fails.
+int saprintf(char **retp, const char *fmt, ...) {
+  va_list argp;
+  //first pass: only measure the output
+  va_start(argp, fmt);
+  int len = vsnprintf(NULL, 0, fmt, argp);
+  va_end(argp);
+  if (len < 0) { //formatting error: there is no valid length to allocate
+    *retp = NULL;
+    return -1;
+  }
+  char *buf = NULL;
+  GMALLOC(buf, (len + 1));
+  if(buf == NULL)
+  {
+    *retp = NULL;
+    return -1;
+  }
+  //second pass: the va_list has to be restarted before it is reused
+  va_start(argp, fmt);
+  vsnprintf(buf, len+1, fmt, argp);
+  va_end(argp);
+  *retp = buf;
+  return len;
+}
+//************************* Debug helpers **************************
+// Assert failed routine
+// Prints "<file>(<line>): ASSERT(<expression>) failed." on stderr; it does
+// not stop the program.
+void GAssert(const char* expression, const char* filename, unsigned int lineno){
+  // written straight to stderr: formatting into a fixed 4096-byte buffer
+  // first would overflow on a long expression or path; %u matches lineno
+  fprintf(stderr,"%s(%u): ASSERT(%s) failed.\n",filename,lineno,expression);
+  //abort();
+  }
+// Error routine (prints error message and exits!)
+// printf-style message on stderr (plus the debugger output on Windows), then
+// exit(1); DEBUG builds on non-Windows platforms abort() instead.
+void GError(const char* format,...){
+  #ifdef __WIN32__
+  char msg[4096];
+  va_list arguments;
+  va_start(arguments,format);
+  //bounded: an over-long message is truncated instead of overrunning msg
+  vsnprintf(msg,sizeof(msg),format,arguments);
+  va_end(arguments);
+  OutputDebugString(msg);
+  fprintf(stderr,"%s",msg); // if a console is available
+  #else
+  va_list arguments;
+  va_start(arguments,format);
+  vfprintf(stderr,format,arguments);
+  va_end(arguments);
+  #ifdef DEBUG
+  // modify here if you want a core dump
+  abort();
+  #endif
+  #endif
+  exit(1);
+  }
+// Warning routine (just print message without exiting)
+// printf-style message written to stderr and flushed immediately; text past
+// 4095 characters is cut off.
+void GMessage(const char* format,...){
+  char msg[4096];
+  va_list arguments;
+  va_start(arguments,format);
+  //bounded: an over-long message is truncated instead of overrunning msg
+  vsnprintf(msg,sizeof(msg),format,arguments);
+  va_end(arguments);
+  #ifdef __WIN32__
+  OutputDebugString(msg);
+  #endif
+  fprintf(stderr,"%s",msg);fflush(stderr);
+  }
+/*************** Memory management routines *****************/
+// Allocate memory
+// Stores a malloc(size) block in *ptr and reports whether *ptr is non-NULL.
+// A zero size allocates nothing: *ptr keeps the value the caller passed in
+// and the result reflects that value.
+bool GMalloc(pointer* ptr,unsigned long size){
+  //GASSERT(ptr);
+  if (size==0) return (*ptr!=NULL);
+  *ptr=malloc(size);
+  return (*ptr!=NULL);
+  }
+// Allocate cleaned memory (0 filled)
+// Stores a zero-initialised block of size bytes in *ptr; false when the
+// allocation returned NULL.
+bool GCalloc(pointer* ptr,unsigned long size){
+  GASSERT(ptr);
+  void* mem=calloc(size,1);
+  *ptr=mem;
+  return (mem!=NULL);
+  }
+// Resize memory
+// size==0 frees *ptr (and reports success); a NULL *ptr gets a fresh block;
+// otherwise the block is resized. On failure *ptr is left untouched and
+// false is returned.
+bool GRealloc(pointer* ptr,unsigned long size){
+  //GASSERT(ptr);
+  if (size==0) {
+    GFree(ptr);
+    return true;
+  }
+  void* mem=(*ptr==NULL) ? malloc(size) : realloc(*ptr,size);
+  if (mem==NULL) return false;
+  *ptr=mem;
+  return true;
+  }
+// Free memory, resets ptr to NULL afterward
+// Safe to call on a pointer that is already NULL.
+void GFree(pointer* ptr){
+  GASSERT(ptr);
+  if (*ptr!=NULL) {
+    free(*ptr);
+    *ptr=NULL;
+  }
+  }
+// Returns a GMALLOC'ed copy of str (caller frees), or NULL for a NULL input.
+char* Gstrdup(const char* str) {
+  if (str==NULL) return NULL;
+  const size_t nbytes=strlen(str)+1; //terminator included
+  char* dup=NULL;
+  GMALLOC(dup, nbytes);
+  memcpy(dup, str, nbytes);
+  return dup;
+  }
+// Returns a newly allocated empty string (a single NUL byte; caller frees).
+char* newEmptyStr() {
+  char* empty=NULL;
+  GMALLOC(empty,1);
+  *empty='\0';
+  return empty;
+}
+// Returns a newly allocated copy of the characters sfrom..sto, both ends
+// included (caller frees). NULL when either pointer is NULL; an empty string
+// when sfrom points at a terminator.
+char* Gstrdup(const char* sfrom, const char* sto) {
+  if (sfrom==NULL || sto==NULL) return NULL;
+  if (sfrom[0]==0) return newEmptyStr();
+  const long span=sto-sfrom+1; //number of characters to copy
+  char* dup=NULL;
+  GMALLOC(dup, span+1);
+  strncpy(dup, sfrom, span);
+  dup[span]=0;
+  return dup;
+  }
+// NULL-tolerant strcmp/strncmp: a negative n compares whole strings,
+// otherwise at most n characters. If a is NULL the result is -1 (also when
+// b is NULL too); if only b is NULL the result is 1.
+int Gstrcmp(const char* a, const char* b, int n) {
+  if (a!=NULL && b!=NULL) {
+    return (n<0) ? strcmp(a,b) : strncmp(a,b,n);
+  }
+  return (a==NULL) ? -1 : 1;
+}
+// Case-insensitive, NULL-tolerant comparison. A negative n compares whole
+// strings, otherwise at most n characters. Returns -1, 0 or 1; NULL
+// arguments are handled as in Gstrcmp() (a==NULL gives -1, else 1).
+int Gstricmp(const char* a, const char* b, int n) {
+  if (a==NULL || b==NULL) return a==NULL ? -1 : 1;
+  int ua, ub; //plain locals: the 'register' specifier is not valid C++17
+  if (n<0) {
+    while ((*a!=0) && (*b!=0)) {
+      ua=tolower((unsigned char)*a);
+      ub=tolower((unsigned char)*b);
+      a++;b++;
+      if (ua!=ub) return ua < ub ? -1 : 1;
+    }
+    //at least one string ended: the shorter one sorts first
+    return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ;
+  }
+  else {
+    while (n && (*a!=0) && (*b!=0)) {
+      ua=tolower((unsigned char)*a);
+      ub=tolower((unsigned char)*b);
+      a++;b++;n--;
+      if (ua!=ub) return ua < ub ? -1 : 1;
+    }
+    //return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ;
+    if (n==0) return 0; //all n characters matched
+    else { return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ; }
+  }
+}
+// Splits str in place on any character of delim: the first delimiter of each
+// run is overwritten with NUL, the rest of the run is skipped, and fields[]
+// receives the start of every field. Parsing stops after maxfields fields.
+// Returns the number of fields stored.
+int strsplit(char* str, char** fields, int maxfields, const char* delim) {
+  int nfields=0;
+  bool newfield=true; //the character under the cursor starts a field
+  char* p=str;
+  while (*p!=0 && nfields<maxfields) {
+    if (newfield) {
+      fields[nfields]=p;
+      nfields++;
+    }
+    if (chrInStr(*p,(char*)delim)) {
+      *p=0; //terminate the field that just ended
+      p++;
+      while (*p!=0 && chrInStr(*p, (char*)delim)) p++; //rest of the delimiter run
+      newfield=true;
+    }
+    else {
+      newfield=false;
+      p++;
+    }
+  }
+  return nfields;
+}
+// Splits str in place on the character delim: the first delimiter of each run
+// is overwritten with NUL, the rest of the run is skipped, and fields[]
+// receives the start of every field. Parsing stops after maxfields fields.
+// Returns the number of fields stored.
+int strsplit(char* str, char** fields, int maxfields, const char delim) {
+  int nfields=0;
+  bool newfield=true; //the character under the cursor starts a field
+  char* p=str;
+  while (*p!=0 && nfields<maxfields) {
+    if (newfield) {
+      fields[nfields]=p;
+      nfields++;
+    }
+    if (*p==delim) {
+      *p=0; //terminate the field that just ended
+      p++;
+      while (*p!=0 && *p==delim) p++; //rest of the delimiter run
+      newfield=true;
+    }
+    else {
+      newfield=false;
+      p++;
+    }
+  }
+  return nfields;
+}
+// Splits str in place on spaces and tabs: the first blank of each run is
+// overwritten with NUL, the rest of the run is skipped, and fields[] receives
+// the start of every field. Parsing stops after maxfields fields.
+// Returns the number of fields stored.
+int strsplit(char* str, char** fields, int maxfields) {
+  int nfields=0;
+  bool newfield=true; //the character under the cursor starts a field
+  char* p=str;
+  while (*p!=0 && nfields<maxfields) {
+    if (newfield) {
+      fields[nfields]=p;
+      nfields++;
+    }
+    if (*p==' ' || *p=='\t') {
+      *p=0; //terminate the field that just ended
+      p++;
+      while (*p=='\t' || *p==' ') p++; //rest of the blank run (never matches NUL)
+      newfield=true;
+    }
+    else {
+      newfield=false;
+      p++;
+    }
+  }
+  return nfields;
+}
+char* Gsubstr(const char* str, char* from, char* to) {
+  // Extracts (and allocates with GMALLOC) the substring delimited by the
+  // pointers from and to, both boundaries included.
+  //   str  - the string from/to are compared against; from must not precede it
+  //   from - first character to copy
+  //   to   - last character to copy; NULL means "up to the end of from"
+  // Returns NULL if str or from is NULL or from<str; a new empty string if
+  // str/from is empty or to<from; otherwise a 0-terminated copy of [from..to].
+  if (str==NULL || from==NULL) return NULL;
+  if (from[0]==0 || str[0]==0) return newEmptyStr();
+  if (from<str) return NULL;
+  if (to==NULL) {
+    to=from;
+    while (to[1]) to++;
+  }
+  if (to<from) return newEmptyStr();
+  int newlen=to-from+1;
+  char* subs;
+  GMALLOC(subs, newlen+1); //room for the terminating 0 as well
+  memcpy(subs, from, newlen); //copy starts at from (not at str) and covers all newlen chars
+  subs[newlen]='\0';
+  return subs;
+}
+char* replaceStr(char* &str, char* newvalue) {
+  // Releases whatever str currently points to (via GFREE) and, when newvalue
+  // is not NULL, makes str a freshly GMALLOC'ed copy of it.
+  // Returns the new str, or NULL when newvalue is NULL.
+  if (str!=NULL) { GFREE(str); }
+  if (newvalue!=NULL) {
+    GMALLOC(str, strlen(newvalue)+1);
+    strcpy(str,newvalue);
+    return str;
+  }
+  return NULL;
+}
+void* Gmemscan(void *mem, unsigned int len,
+               void *part, unsigned int partlen) {
+  // Finds the first occurrence of the partlen bytes at part inside the len
+  // bytes at mem (a memmem-like search built on memchr + memcmp).
+  // Returns a pointer to the match inside mem, or NULL when there is none.
+  // An empty or NULL pattern, a NULL buffer, or a pattern longer than the
+  // buffer never matches; checking this first also keeps the unsigned
+  // subtraction below from wrapping around.
+  if (mem==NULL || part==NULL || partlen==0 || partlen>len) return NULL;
+  const char* needle=(const char*)part;
+  char* scan=(char*)mem;
+  unsigned int restlen=len-partlen+1; //number of positions where a match could still start
+  char* hit;
+  while (restlen>0 && (hit=(char*)memchr(scan, needle[0], restlen))!=NULL) {
+    //located first char, try to match the rest:
+    if (memcmp(hit+1, needle+1, partlen-1)==0) return hit;
+    //no match here, resume right after the located char
+    restlen-=(unsigned int)(hit+1-scan);
+    scan=hit+1;
+  }
+  return NULL;
+}
+//rindex function is missing on some platforms ?
+char* rstrchr(char* str, char ch) {
+  // Returns a pointer to the rightmost occurrence of ch in str, or NULL if
+  // str is NULL or ch does not occur. The terminating 0 is never examined,
+  // so searching for '\0' yields NULL.
+  if (str==NULL) return NULL;
+  for (size_t i=strlen(str); i>0; i--) {
+    if (str[i-1]==ch) return str+(i-1);
+  }
+  return NULL;
+}
+/* DOS/UNIX safer fgets : reads a text line from a (binary) file and
+ updates the tracked file position and the buffer capacity accordingly.
+ The given buf is resized (in 1024-byte steps, via GREALLOC) to read the
+ entire line in memory -- even when it's abnormally long.
+ Line endings \n, \r and \r\n are consumed but not stored in buf.
+ Parameters:
+   buf, buf_cap - caller's buffer and its capacity; both may be updated
+   stream       - input stream, read one char at a time with getc()
+   f_pos        - optional; if not NULL, *f_pos is advanced by one for every
+                  character consumed (line ending characters included)
+   linelen      - optional; if not NULL, receives the number of chars stored
+ Returns buf (0-terminated), or NULL when the last getc() result was EOF and
+ no character had been stored.
+ */
+char* fgetline(char* & buf, int& buf_cap, FILE *stream, off_t* f_pos, int* linelen) {
+ //reads a char at a time until \n and/or \r are encountered
+ int i=0; //number of chars stored in buf so far
+ int c=0; //last value returned by getc()
+ off_t fpos=(f_pos!=NULL) ? *f_pos : 0; //local running position, written back at the end
+ while ((c=getc(stream))!=EOF) {
+ if (i>=buf_cap-1) { //keep room for this char plus the terminating 0
+ buf_cap+=1024;
+ GREALLOC(buf, buf_cap);
+ }
+ if (c=='\n' || c=='\r') {
+ if (c=='\r') { //possible DOS line ending: look at the next char
+ if ((c=getc(stream))!='\n') ungetc(c,stream); //not a \r\n pair: push the char back
+ else fpos++; //count the \n of a \r\n pair
+ }
+ fpos++; //count the first line ending char
+ break;
+ }
+ fpos++;
+ buf[i]=(char)c;
+ i++;
+ } //while getc() does not return EOF
+ if (linelen!=NULL) *linelen=i;
+ if (f_pos!=NULL) *f_pos=fpos;
+ if (c==EOF && i==0) return NULL; //nothing stored and the last getc() gave EOF
+ buf[i]='\0';
+ return buf;
+ }
+char* GLineReader::getLine(FILE* stream, off_t& f_pos) { //reads the next line from stream into buf; returns buf, or NULL if EOF is hit before any char is stored
+ if (pushed) { pushed=false; return buf; } //a pushed-back line is pending: hand out the current buf again
+ //reads a char at a time until \n and/or \r are encountered
+ len=0; //length of the line being assembled (line ending not included)
+ int c=0;
+ while ((c=getc(stream))!=EOF) {
+ if (len>=allocated-1) { //keep room for this char plus the terminating 0
+ allocated+=1024;
+ GREALLOC(buf, allocated);
+ }
+ if (c=='\n' || c=='\r') {
+ buf[len]='\0';
+ if (c=='\r') { //DOS file -- special case
+ if ((c=getc(stream))!='\n') ungetc(c,stream); //not a \r\n pair: push the char back
+ else f_pos++; //count the \n of a \r\n pair
+ }
+ f_pos++; //count the first line ending char
+ lcount++; //one more line read
+ return buf;
+ }
+ f_pos++;
+ buf[len]=(char)c;
+ len++;
+ } //while getc() does not return EOF
+ if (c==EOF) {
+ isEOF=true;
+ if (len==0) return NULL; //nothing was read before EOF
+ }
+ buf[len]='\0'; //last line of the stream, not terminated by a line ending
+ lcount++;
+ return buf;
+//strchr but with a set of chars instead of only one
+char* strchrs(const char* s, const char* chrs) {
+  // Returns a pointer to the first character of s that belongs to the set
+  // chrs, or NULL when there is none or when either argument is NULL/empty.
+  const bool usable=(s!=NULL && chrs!=NULL && *chrs!='\0' && *s!='\0');
+  return usable ? (char*)strpbrk(s, chrs) : NULL;
+}
+char* upCase(const char* str) {
+  // Returns a newly GMALLOC'ed upper-case copy of str (NULL for NULL input).
+  // The caller owns the returned buffer.
+  if (str==NULL) return NULL;
+  int len=strlen(str);
+  char* upstr;
+  GMALLOC(upstr, len+1);
+  upstr[len]='\0';
+  //toupper() is only defined for EOF and unsigned char values, so a plain
+  //(possibly negative) char must be converted first
+  for (int i=0;i<len;i++) upstr[i]=(char)toupper((unsigned char)str[i]);
+  return upstr;
+}
+char* loCase(const char* str) {
+  // Returns a newly GMALLOC'ed lower-case copy of str (NULL for NULL input).
+  // The caller owns the returned buffer.
+  if (str==NULL) return NULL;
+  int len=strlen(str);
+  char* lostr;
+  GMALLOC(lostr, len+1);
+  lostr[len]='\0';
+  //tolower() is only defined for EOF and unsigned char values, so a plain
+  //(possibly negative) char must be converted first
+  for (int i=0;i<len;i++) lostr[i]=(char)tolower((unsigned char)str[i]);
+  return lostr;
+}
+char* strlower(char * str) {//changes string in place
+  // Lower-cases every character of str and returns str (NULL for NULL input).
+  if (str==NULL) return NULL;
+  //go through unsigned char: tolower() is undefined for negative char values
+  for (char* c=str; *c!=0; c++) *c=(char)tolower((unsigned char)*c);
+  return str;
+}
+char* strupper(char * str) {//changes string in place
+  // Upper-cases every character of str and returns str (NULL for NULL input).
+  if (str==NULL) return NULL;
+  //go through unsigned char: toupper() is undefined for negative char values
+  for (char* c=str; *c!=0; c++) *c=(char)toupper((unsigned char)*c);
+  return str;
+}
+//test if a char is in a given string (set)
+bool chrInStr(char c, const char* str) {
+  // true when c equals one of the characters of str before its terminating 0;
+  // false for a NULL or empty str (so '\0' is never reported as found)
+  if (str==NULL) return false;
+  while (*str!='\0') {
+    if (*str++==c) return true;
+  }
+  return false;
+}
+char* rstrfind(const char* str, const char* substr) {
+  /* like rindex() for a string: returns a pointer to the start of the
+     rightmost occurrence of substr in str, or NULL if there is none or if
+     either argument is NULL or empty */
+  if (str==NULL || *str=='\0' || substr==NULL || *substr=='\0') return NULL;
+  const int sublen=strlen(substr);
+  //start at the rightmost offset where substr could still fit and walk left
+  for (int ofs=(int)strlen(str)-sublen; ofs>=0; ofs--) {
+    if (memcmp(str+ofs, substr, sublen)==0) return (char*)str+ofs; //found!
+  }
+  return NULL;
+}
+char* strifind(const char* str, const char* substr) {
+  // the case insensitive version of strstr -- finding a string within a string.
+  // Returns a pointer to the first match inside str, or NULL if there is
+  // none or if either argument is NULL or empty.
+  int l,i;
+  if (str==NULL || *str==0) return NULL;
+  if (substr==NULL || *substr==0) return NULL;
+  l=strlen(substr);
+  char* smax=(char*)str+strlen(str)-l;
+  //rightmost position that could match
+  char* p=(char*)str;
+  while (p<=smax) {
+    //tolower() is undefined for negative char values: compare as unsigned char
+    for (i=0; i<l && tolower((unsigned char)p[i])==tolower((unsigned char)substr[i]); i++) ;
+    if (i==l) return p; //found!
+    p++;
+  }
+  return NULL;
+}
+// tests if string s has the given prefix
+bool startsWith(const char* s, const char* prefix) {
+  // false when either argument is NULL; an empty prefix always matches
+  if (s==NULL || prefix==NULL) return false;
+  for (; *prefix!='\0'; ++s, ++prefix) {
+    if (*s!=*prefix) return false; //also covers s ending before prefix does
+  }
+  return true;
+}
+// tests if string s ends with given suffix
+bool endsWith(const char* s, const char* suffix) {
+  // false when either argument is NULL; an empty suffix always matches
+  if (suffix==NULL || s==NULL) return false;
+  const int suflen=strlen(suffix);
+  if (suflen==0) return true; //special case: empty suffix
+  const int slen=strlen(s);
+  if (slen<suflen) return false;
+  //compare the last suflen characters of s against the whole suffix
+  return memcmp(s+(slen-suflen), suffix, suflen)==0;
+}
+char* reverseChars(char* str, int slen) {
+  // Reverses the first slen characters of str in place and returns str.
+  // slen==0 means "use strlen(str)".
+  if (slen==0) slen=strlen(str);
+  for (int lo=0, hi=slen-1; lo<hi; ++lo, --hi) {
+    const char tmp=str[hi];
+    str[hi]=str[lo];
+    str[lo]=tmp;
+  }
+  return str;
+}
+char* rstrstr(const char* rstart, const char *lend, const char* substr) {
+  /* like strstr, but searches from the right: candidate matches end at
+     rstart first and move left while they still start at or after lend.
+     Returns a pointer to the LAST (rightmost) character of the match,
+     or NULL if substr is not found */
+  const int sublen=strlen(substr);
+  for (const char* cand=rstart-sublen+1; cand>=lend; --cand) {
+    if (memcmp(cand, substr, sublen)==0) return (char*)cand+sublen-1;
+  }
+  return NULL;
+}
+//hash function used for strings in GHash
+int strhash(const char* str){
+  // Folds the characters of str into a value kept within 28 bits
+  // (0..0x0fffffff). The arithmetic is intentionally left exactly as it was,
+  // so hash values are unchanged; only the 'register' storage class
+  // specifier was dropped, since it is no longer valid as of C++17.
+  int h=0;
+  int g;
+  while (*str) {
+    h=(h<<4)+*str++;
+    g=h&0xF0000000;
+    if(g) h^=g>>24;
+    h&=0x0fffffff;
+  }
+  GASSERT(h<=0x0fffffff);
+  return h;
+}
+// removes the last part (file or directory name) of a full path
+// this is a destructive operation for the given string!!!
+// whatever remains keeps its trailing '/' or '\'; a path without any
+// separator becomes the empty string
+void delFileName(char* filepath) {
+  if (filepath==NULL) return;
+  char* cut=filepath; //position right after the last separator seen
+  for (char* scan=filepath; *scan!='\0'; ++scan) {
+    if (*scan=='/' || *scan=='\\') cut=scan+1;
+  }
+  *cut='\0'; // truncate filepath
+}
+// returns a pointer to the last file or directory name in a full path
+// (i.e. to the character following the last '/' or '\'); NULL for NULL input
+const char* getFileName(const char* filepath) {
+  if (filepath==NULL) return NULL;
+  const char* name=filepath;
+  for (const char* scan=filepath; *scan!='\0'; ++scan) {
+    if (*scan=='/' || *scan=='\\') name=scan+1;
+  }
+  return name;
+}
+// returns a pointer to the file "extension" part in a filename: the text
+// after the last '.', provided that dot comes after the last path separator;
+// NULL otherwise (and for NULL input)
+const char* getFileExt(const char* filepath) {
+  if (filepath==NULL) return NULL;
+  const char* ext=filepath;  //just past the last '.' seen
+  const char* base=filepath; //just past the last '/' or '\' seen
+  for (const char* scan=filepath; *scan!='\0'; ++scan) {
+    switch (*scan) {
+      case '.':  ext=scan+1; break;
+      case '/':
+      case '\\': base=scan+1; break;
+      default:   break;
+    }
+  }
+  if (ext>base) return ext;
+  return NULL;
+}
+int fileExists(const char* fname) {
+  // Classifies fname using stat():
+  //   0 - stat() failed (the attributes could not be read)
+  //   1 - it is a directory
+  //   2 - it is a regular file
+  //   3 - it exists but is neither of the above
+  struct stat info;
+  if (stat(fname,&info)!=0) return 0;
+  if (S_ISDIR(info.st_mode)) return 1;
+  if (S_ISREG(info.st_mode)) return 2;
+  return 3;
+}
+/*bool fileExists(const char* filepath) {
+ if (filepath==NULL) return false;
+ FILE* ft=fopen(filepath, "rb");
+ if (ft==NULL) return false;
+ fclose(ft);
+ return true;
+int64 fileSize(const char* fpath) {
+  // Size of the file in bytes as reported by stat(); 0 when stat() fails
+  // (no message is printed in that case).
+  struct stat results;
+  const bool statOk=(stat(fpath, &results)==0);
+  return statOk ? (int64)results.st_size : 0;
+}
+bool parseNumber(char* &p, double& v) {
+  // Parses a floating point number at p (after skipping spaces/tabs) with
+  // strtod(). On return p points just past the longest run of characters
+  // from "0123456789eE.-+", and v holds what strtod() produced for that run.
+  // Returns false for a lone '-' or when strtod() did not consume the whole
+  // run; true otherwise. The buffer is modified only temporarily (a 0 is
+  // placed after the run while converting, then the original char is restored).
+  while (*p==' ' || *p=='\t') p++;
+  char* const numStart=p;
+  p=numStart+strspn(numStart, "0123456789eE.-+");
+  //now p is on a char that cannot be part of a number
+  if (*numStart=='-' && p==numStart+1) return false;
+  const char saved=*p;
+  *p='\0';
+  char* parsedEnd=p;
+  v=strtod(numStart,&parsedEnd);
+  *p=saved;
+  return (parsedEnd==p);
+}
+bool parseDouble(char* &p, double& v) { //alias of parseNumber(): same arguments, same result
+ return parseNumber(p,v);
+bool parseInt(char* &p, int& i) {
+  // Parses an optionally signed decimal integer at p (after skipping
+  // spaces/tabs) with strtol(). On return p points at the first char after
+  // the digits and i holds the value converted to int.
+  // Returns false for a lone '-', when strtol() did not consume all the
+  // digits, or when the value does not survive the conversion to int.
+  while (*p==' ' || *p=='\t') p++;
+  char* numStart=p;
+  if (*p=='-') p++;
+  else if (*p=='+') { p++; numStart++; } //a leading '+' is not handed to strtol()
+  while (*p>='0' && *p<='9') p++;
+  //now p is on a non-digit
+  if (*numStart=='-' && p==numStart+1) return false;
+  const char saved=*p;
+  *p='\0'; //temporarily terminate the digit run
+  char* parsedEnd=p;
+  const long l=strtol(numStart,&parsedEnd,10);
+  i=(int)l;
+  *p=saved;
+  return (parsedEnd==p && i==l);
+}
+bool parseUInt(char* &p, uint& i) {
+  // Parses an unsigned decimal integer at p (after skipping spaces/tabs)
+  // with strtoul(); a leading '-' is rejected, a leading '+' is skipped.
+  // On return p points at the first char after the digits and i holds the
+  // value converted to uint. Returns false when strtoul() did not consume
+  // all the digits or when the value does not fit in a uint.
+  while (*p==' ' || *p=='\t') p++;
+  if (*p=='-') return false;
+  char* numStart=p;
+  if (*p=='+') { p++; numStart++; }
+  while (*p>='0' && *p<='9') p++;
+  //now p is on a non-digit
+  if (*numStart=='-' && p==numStart+1) return false;
+  const char saved=*p;
+  *p='\0'; //temporarily terminate the digit run
+  char* parsedEnd=p;
+  const unsigned long l=strtoul(numStart,&parsedEnd,10);
+  i=(uint) l;
+  *p=saved;
+  return (parsedEnd==p && i==l);
+}
+bool parseHex(char* &p, uint& i) {
+  // Parses an unsigned hexadecimal number at p with strtoul(..., 16).
+  // Leading spaces, tabs and prefix characters ('0', 'x', 'X') are skipped,
+  // a leading '-' is rejected and a leading '+' is skipped.
+  // On return p points at the first char after the hex digits and i holds
+  // the value. Returns false when there is no digit at all (not even a
+  // skipped '0'), when strtoul() did not consume every digit, or when the
+  // value does not fit in a uint.
+  // Unlike before, one-digit values such as "0xA" are accepted: the old
+  // 'p==start+1' test rejected them.
+  bool sawZero=false; //a skipped '0' still counts as a digit (e.g. "0" or "0x0")
+  //skip initial spaces/prefix
+  while (*p==' ' || *p=='\t' || *p=='0' || *p=='x' || *p=='X') {
+    if (*p=='0') sawZero=true;
+    p++;
+  }
+  char* start=p;
+  if (*p=='-') return false;
+  else if (*p=='+') { p++;start++; }
+  //isxdigit() is undefined for negative char values: go through unsigned char
+  while (isxdigit((unsigned char)*p)) p++;
+  //now p is on a non-hexdigit;
+  if (p==start && !sawZero) return false; //nothing to convert
+  char saved=*p;
+  *p='\0'; //temporarily terminate the digit run
+  char* endptr=p;
+  unsigned long l=strtoul(start,&endptr,16);
+  i=(uint) l;
+  *p=saved;
+  if (endptr!=p || i!=l) return false;
+  return true;
+}
+//write a formatted fasta record
+//  seqid, descr - header line ">seqid descr"; no header is written when
+//                 seqid is NULL, and descr is omitted when NULL or empty
+//  seq          - sequence text; nothing more is written if NULL or empty
+//  linelen      - characters per output line; 0 writes the sequence on one line
+//  seqlen       - number of chars of seq to write; if not positive, seq is
+//                 written up to its terminating 0
+void writeFasta(FILE *fw, const char* seqid, const char* descr,
+                const char* seq, int linelen, int seqlen) {
+  fflush(fw);
+  // write header line only if given!
+  if (seqid!=NULL) {
+    const bool hasDescr=(descr!=NULL && descr[0]!=0);
+    if (hasDescr) fprintf(fw,">%s %s\n",seqid, descr);
+    else fprintf(fw,">%s\n",seqid);
+  }
+  fflush(fw);
+  if (seq==NULL || *seq==0) return; //nothing to print
+  if (linelen==0) { //unlimited line length: write the whole sequence on a line
+    if (seqlen>0) fwrite((const void*)seq, 1, seqlen,fw);
+    else fprintf(fw,"%s",seq);
+    fprintf(fw,"\n");
+  }
+  else {
+    const bool lenGiven=(seqlen>0); //otherwise stop at the terminating 0
+    int col=0; //characters already written on the current line
+    for (int k=0; lenGiven ? (k<seqlen) : (seq[k]!=0); k++) {
+      if (col==linelen) { //line is full: start a new one
+        fputc('\n', fw);
+        col=0;
+      }
+      fputc(seq[k], fw);
+      col++;
+    }
+    fputc('\n', fw);
+  }
+  fflush(fw);
+}
+char* commaprint(uint64 n) {
+  // Formats n in decimal with ',' between groups of three digits
+  // (e.g. 1234567 -> "1,234,567").
+  // The text is built right-to-left in a static buffer and a pointer into
+  // that buffer is returned: the result stays valid after the call, but it
+  // is overwritten by the next call (the function is not reentrant).
+  // The buffer has to be static: returning a pointer into an automatic
+  // array would leave the caller with a dangling pointer.
+  static char retbuf[48];
+  const char comma=',';
+  char *p = &retbuf[sizeof(retbuf)-1];
+  int i = 0; //digits emitted so far
+  *p = '\0';
+  do {
+    if(i%3 == 0 && i != 0)
+      *--p = comma;
+    *--p = (char)('0' + n % 10);
+    n /= 10;
+    i++;
+  } while(n != 0);
+  return p;
+}
diff --git a/src/GFaSeqGet.cpp b/src/GFaSeqGet.cpp
new file mode 100644
index 0000000..ca722ca
--- /dev/null
+++ b/src/GFaSeqGet.cpp
@@ -0,0 +1,319 @@
+#include "GFaSeqGet.h"
+#include "gdna.h"
+#include <ctype.h>
+void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
+  // (Re)allocates the sq buffer for a region starting at sstart.
+  //  sovl==0: the old buffer is dropped and a new one of slen bytes is
+  //           allocated (maxseqlen, or MAX_FASUBSEQ if that is 0, when slen is 0)
+  //  sovl!=0: a new buffer of slen bytes is allocated and sovl bytes are
+  //           carried over from offset qfrom of the old buffer to offset qto
+  //           of the new one before the old buffer is freed
+  if (sovl!=0) {
+    //overlap -- copy the overlapping region
+    char* merged=NULL;
+    GMALLOC(merged, slen);
+    memcpy((void*)(merged+qto), (void*)(sq+qfrom), sovl);
+    GFREE(sq);
+    sq=merged;
+    sqstart=sstart;
+    sqlen=slen;
+    return;
+  }
+  GFREE(sq);
+  sqstart=sstart;
+  uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
+  sqlen = (slen==0 ? max_len : slen);
+  GMALLOC(sq, sqlen);
+}
+void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
+  // Opens the FASTA file fn for binary reading, remembers a copy of its
+  // name, parses the record found at file offset fofs (see initialParse)
+  // and creates the (empty) subsequence buffer.
+  FILE* opened=fopen(fn,"rb");
+  fh=opened;
+  if (opened==NULL) GError("Error (GFaSeqGet) opening file '%s'\n",fn);
+  fname=Gstrdup(fn);
+  initialParse(fofs, validate);
+  lastsub=new GSubSeq();
+}
+GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
+  //for GFastaIndex use mostly -- the important difference is that
+  //the file offset is to the sequence, not to the defline.
+  //No parsing is done here: the sequence length, the line length (l_len)
+  //and the line length including the line ending (l_blen) are taken as given.
+  fh=fopen(faname,"rb");
+  if (fh==NULL) GError("Error (GFaSeqGet) opening file '%s'\n",faname);
+  fname=Gstrdup(faname);
+  seq_len=seqlen;
+  fseqstart=fseqofs;
+  line_len=l_len;
+  line_blen=l_blen;
+  if (line_len>line_blen) { //a line cannot be longer than itself plus its line ending
+    GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
+           line_len, line_blen);
+  }
+  lastsub=new GSubSeq();
+}
+GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
+  // Builds the reader on an already opened stream f (no file name is kept)
+  // and parses the FASTA record found at file offset fofs.
+  if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
+  fh=f;
+  fname=NULL;
+  fseqstart=0;
+  seq_len=0;
+  initialParse(fofs, validate);
+  lastsub=new GSubSeq();
+}
+void GFaSeqGet::initialParse(off_t fofs, bool checkall) { //measures the FASTA record at offset fofs: sets fseqstart, line_len and line_blen; with checkall the rest of the record is validated too
+ static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
+ if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
+ //read the first two lines to determine fasta parameters
+ fseqstart=fofs; //advanced below, char by char, to the first sequence character
+ int c=getc(fh);
+ fseqstart++;
+ if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
+ while ((c=getc(fh))!=EOF) { //skip the rest of the defline
+ fseqstart++;
+ if (c=='\n' || c=='\r') { break; } //end of defline
+ }
+ if (c==EOF) GError(gfa_ERRPARSE); //no line ending after the defline
+ line_len=0; //number of sequence chars on the first sequence line
+ int lendlen=0; //number of line ending chars after the first sequence line
+ while ((c=getc(fh))!=EOF) {
+ if (c=='\n' || c=='\r') { //end of line encountered
+ if (line_len>0) { //end of the first "sequence" line
+ lendlen++;
+ break;
+ }
+ else {// another EoL char at the end of defline
+ fseqstart++;
+ continue;
+ }
+ }// end-of-line characters
+ line_len++;
+ }
+ //we are at the end of first sequence line
+ while ((c=getc(fh))!=EOF) { //count any further line ending chars
+ if (c=='\n' || c=='\r') lendlen++;
+ else {
+ ungetc(c,fh); //first char of the next line: put it back
+ break;
+ }
+ }
+ line_blen=line_len+lendlen; //full length of a sequence line in the file, line ending included
+ if (c==EOF) return; //NOTE(review): this early return skips the final fseeko() back to fseqstart
+ // -- you don't need to check it all if you're sure it's safe
+ if (checkall) { //validate the rest of the FASTA record
+ int llen=0; //last line length
+ int elen=0; //length of last line ending
+ bool waseol=true;
+ while ((c=getc(fh))!=EOF) {
+ if (c=='>' && waseol) { ungetc(c,fh); break; } //start of the next record: stop here
+ if (c=='\n' || c=='\r') {
+ // eol char
+ elen++;
+ if (waseol) continue; //2nd eol char
+ waseol=true;
+ elen=1;
+ continue;
+ }
+ if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
+ //--- on a seq char here:
+ if (waseol) {//beginning of a seq line
+ if (elen && (llen!=line_len || elen!=lendlen))
+ //GError(gfa_ERRPARSE);
+ GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
+ the sequence lines have the same length (except for the last line)");
+ waseol=false;
+ llen=0;
+ elen=0;
+ }
+ llen++;
+ } //while reading chars
+ }// FASTA checking was requested
+ fseeko(fh,fseqstart,SEEK_SET); //leave the stream positioned on the first sequence character
+const char* GFaSeqGet::subseq(uint cstart, int& clen) { //returns a pointer (into the lastsub buffer) to clen bases starting at cstart, reading from file only what the buffer lacks; clen may be reduced if fewer chars could be read
+ //cstart is 1-based genomic coordinate within current fasta sequence
+ int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
+ //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
+ if (clen>maxlen) {
+ GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
+ return NULL;
+ }
+ if (seq_len>0 && clen+cstart-1>seq_len) { //NOTE(review): only a message is printed here; execution continues with the out-of-range request
+ GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
+ }
+ if (lastsub->sq==NULL || lastsub->sqlen==0) { //nothing buffered yet: allocate and load the requested region
+ lastsub->setup(cstart, clen, 0,0,0,seq_len);
+ loadsubseq(cstart, clen); //clen is updated to the number of chars actually loaded
+ lastsub->sqlen=clen;
+ return (const char*)lastsub->sq;
+ }
+ //allow extension up to MAX_FASUBSEQ
+ uint bstart=lastsub->sqstart; //first coordinate held in the buffer
+ uint bend=lastsub->sqstart+lastsub->sqlen-1; //last coordinate held in the buffer
+ uint cend=cstart+clen-1; //last coordinate requested
+ int qlen=0; //only the extra len to be allocated/appended/prepended
+ uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
+ int newlen=0; //the new total length of the buffered sequence lastsub->sq
+ int kovl=0; //number of already buffered chars to carry over into the new buffer
+ int czfrom=0;//0-based offsets for copying a previously read sequence chunk
+ int czto=0;
+ uint newstart=cstart;
+ if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
+ return (const char*) &(lastsub->sq[cstart-bstart]) ;
+ }
+ //extend downward
+ uint newend=GMAX(cend, bend);
+ if (cstart<bstart) { //requested start < old buffer start
+ newstart=cstart;
+ newlen=(newend-newstart+1);
+ if (newlen>MAX_FASUBSEQ) {
+ newlen=MAX_FASUBSEQ;
+ newend=cstart+newlen-1; //keep newstart, set newend
+ }
+ qlen=bstart-cstart;
+ if (newend>bstart) { //overlap
+ if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
+ kovl=bend-bstart+1;
+ czfrom=0;
+ czto=bstart-cstart;
+ lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
+ qlen=bstart-cstart;
+ loadsubseq(newstart, qlen); //read the part that precedes the old buffer
+ qlen=newend-bend;
+ int toread=qlen;
+ loadsubseq(bend+1, qlen); //read the part that follows the old buffer
+ clen-=(toread-qlen); //shrink clen by whatever could not be read
+ lastsub->sqlen=clen;
+ return (const char*)lastsub->sq;
+ }
+ //newend<=bend
+ kovl=newend-bstart+1;
+ }
+ else { //no overlap with previous buffer
+ if (newend>bend) kovl=bend-bstart+1;
+ else kovl=newend-bstart+1;
+ }
+ qlen=bstart-cstart;
+ czfrom=0;
+ czto=qlen;
+ } //cstart<bstart
+ else { //cstart>=bstart, possibly extend upwards
+ newstart=bstart;
+ newlen=(newend-newstart+1);
+ if (newlen>MAX_FASUBSEQ) {
+ newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
+ newlen=MAX_FASUBSEQ;
+ if (newstart<=bend) { //overlap with old buffer
+ kovl=bend-newstart+1;
+ czfrom=newstart-bstart;
+ czto=0;
+ }
+ else { //not overlapping old buffer
+ kovl=0;
+ }
+ } //newstart reassigned
+ else { //we can extend the buffer to include the old one
+ qlen=newend-bend; //how much to read from file
+ qstart=bend+1;
+ kovl=bend-bstart+1;
+ czfrom=0;
+ czto=0;
+ }
+ }
+ lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
+ lastsub->sqlen-=qlen; //appending may result in a premature eof
+ int toread=qlen;
+ loadsubseq(qstart, qlen); //read the missing chunk, if any
+ clen-=(toread-qlen); //shrink clen by whatever could not be read
+ lastsub->sqlen+=qlen;
+ return (const char*)(lastsub->sq+(cstart-newstart));
+char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
+  // Returns a newly GMALLOC'ed, 0-terminated copy of the sequence between
+  // coordinates cstart and cend (swapped if given in reverse order),
+  // optionally reverse-complemented and/or upper-cased.
+  // Returns NULL when subseq() fails. The caller owns the returned buffer.
+  if (cstart>cend) { Gswap(cstart, cend); }
+  int clen=cend-cstart+1;
+  const char* gs=subseq(cstart, clen); //clen may come back smaller than requested
+  if (gs==NULL) return NULL;
+  char* r=NULL;
+  GMALLOC(r,clen+1);
+  r[clen]=0;
+  memcpy((void*)r,(void*)gs, clen);
+  if (revCmpl) reverseComplement(r,clen);
+  if (upCase) {
+    //toupper() is undefined for negative char values: go through unsigned char
+    for (int i=0;i<clen;i++)
+      r[i]=(char)toupper((unsigned char)r[i]);
+  }
+  return r;
+}
+const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
+  // Reads clen sequence characters starting at 1-based coordinate cstart
+  // from the file into lastsub->sq, skipping the line endings in between.
+  //assumes enough lastsub->sq space allocated previously
+  //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
+  // clen==0 means "read as much as allowed" (to the end of the sequence when
+  // seq_len is known, MAX_FASUBSEQ otherwise). On return clen holds the
+  // number of characters actually stored. Returns the address they were
+  // stored at.
+  int sofs=cstart-lastsub->sqstart;
+  int lendlen=line_blen-line_len; //length of the line ending
+  char* seqp=lastsub->sq+sofs;
+  if (line_len<=0) { //no sequence line was measured: avoid dividing by zero below
+    clen=0;
+    return (const char*)seqp;
+  }
+  //find the proper file offset and read the appropriate lines
+  uint seqofs=cstart-1;
+  uint startlno = seqofs/line_len;
+  int lineofs = seqofs % line_len;
+  //do the multiplication in off_t so that large offsets are not truncated
+  off_t fstart=fseqstart + ((off_t)startlno*line_blen);
+  fstart+=lineofs;
+  fseeko(fh, fstart, SEEK_SET);
+  int toread=clen;
+  int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
+  if (toread==0) toread=maxlen; //read max allowed, or to the end of file
+  int actualrlen=0;
+  int sublen=0; //chars stored so far
+  if (lineofs>0) { //read the partial first line
+    int reqrlen=line_len-lineofs;
+    if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
+    actualrlen=fread((void*)seqp, 1, reqrlen, fh);
+    if (actualrlen<reqrlen) { //eof reached prematurely
+      //drop trailing line ending chars; actualrlen>0 keeps seqp[-1] from being read
+      while (actualrlen>0 && (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r')) actualrlen--;
+      clen=actualrlen;
+      sublen+=actualrlen;
+      return (const char*)seqp;
+    }
+    toread-=reqrlen;
+    sublen+=reqrlen;
+    fseeko(fh, lendlen, SEEK_CUR); //skip the line ending
+  }
+  //read the rest of the lines
+  while (toread>=line_len) {
+    char* rseqp=&(seqp[sublen]);
+    actualrlen=fread((void*)rseqp, 1, line_len, fh);
+    if (actualrlen<line_len) { //short read: end of file
+      while (actualrlen>0 && (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')) actualrlen--;
+      sublen+=actualrlen;
+      clen=sublen;
+      return (const char*)seqp;
+    }
+    toread-=actualrlen;
+    sublen+=actualrlen;
+    fseeko(fh, lendlen, SEEK_CUR); //skip the line ending
+  }
+  // read the last partial line, if any
+  if (toread>0) {
+    char* rseqp=&(seqp[sublen]);
+    actualrlen=fread((void*)rseqp, 1, toread, fh);
+    if (actualrlen<toread) {
+      while (actualrlen>0 && (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r'))
+        actualrlen--;
+    }
+    sublen+=actualrlen;
+  }
+  //lastsub->sqlen+=sublen;
+  clen=sublen;
+  return (const char*)seqp;
+}
diff --git a/src/GFastaIndex.cpp b/src/GFastaIndex.cpp
new file mode 100644
index 0000000..bc79b66
--- /dev/null
+++ b/src/GFastaIndex.cpp
@@ -0,0 +1,170 @@
+ * GFastaIndex.cpp
+ *
+ * Created on: Aug 25, 2010
+ * Author: gpertea
+ */
+#include "GFastaIndex.h"
+#define ERR_FAIDXLINE "Error parsing fasta index line: \n%s\n"
+#define ERR_FALINELEN "Error: sequence lines in a FASTA record must have the same length!\n"
+void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) {
+  // Stores the index data of one FASTA record under the key seqname:
+  //   seqlen    - sequence length
+  //   foffs     - file offset of the record's sequence data
+  //   llen      - sequence line length
+  //   llen_full - sequence line length including the line ending
+  // If seqname is already present a warning is printed and the existing
+  // entry is overwritten with the new values.
+  GFastaRec* farec=records.Find(seqname);
+  if (farec!=NULL) {
+    //the format has a %s conversion, so the sequence name must be passed along
+    GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! Only last entry data will be kept.\n",
+             seqname);
+    farec->seqlen=seqlen;
+    farec->fpos=foffs;
+    farec->line_len=llen;
+    farec->line_blen=llen_full;
+  }
+  else {
+    farec=new GFastaRec(seqlen,foffs,llen,llen_full);
+    records.Add(seqname,farec);
+    farec->seqname=records.getLastKey(); //point at the key string kept by the hash
+  }
+}
+int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index
+  // Reads a fasta index file (finame, or the stored fai_name when finame is
+  // NULL) and rebuilds the records table from it. Each non-comment line is
+  // expected to hold: name, sequence length, file offset, line length and
+  // line length including the line ending, separated by tabs/spaces.
+  // Returns the number of records loaded; 0 (after a warning) when the file
+  // cannot be opened. Malformed lines are reported with GError.
+  if (finame==NULL) finame=fai_name;
+  if (finame!=fai_name) {
+    fai_name=Gstrdup(finame);
+  }
+  if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n");
+  records.Clear();
+  haveFai=false;
+  FILE* fi=fopen(fai_name,"rb");
+  if (fi==NULL) {
+    GMessage("Warning: cannot open fasta index file: %s!\n",fai_name);
+    return 0;
+  }
+  GLineReader fl(fi);
+  char* s=NULL;
+  while ((s=fl.nextLine())!=NULL) {
+    if (*s=='#') continue; //comment line
+    char* p=strchrs(s,"\t ");
+    if (p==NULL) GError(ERR_FAIDXLINE,s);
+    *p=0; //s now holds the genomic sequence name
+    p++;
+    uint len=0;
+    int line_len=0, line_blen=0;
+    //the offset is parsed with a platform dependent integer type
+#ifdef __WIN32__
+    long offset=-1;
+    sscanf(p, "%u%ld%d%d", &len, &offset, &line_len, &line_blen);
+#else
+    long long offset=-1;
+    sscanf(p, "%u%lld%d%d", &len, &offset, &line_len, &line_blen);
+#endif
+    //any field left at its initial value, or inconsistent line lengths,
+    //means the line could not be parsed
+    if (len==0 || offset<0 || line_len==0 || line_blen==0 || line_blen<line_len)
+      GError(ERR_FAIDXLINE,p);
+    addRecord(s,len,offset,line_len, line_blen);
+  }
+  fclose(fi);
+  haveFai=(records.Count()>0);
+  return records.Count();
+}
+int GFastaIndex::buildIndex() {
+  // Builds the records table by scanning the whole FASTA file fa_name:
+  // for every record it stores the name (defline text up to the first
+  // whitespace/control char), the sequence length, the file offset of the
+  // first sequence line, and the line length without/with line ending.
+  // All sequence lines of a record must have the same length, except that
+  // the last one may be shorter; anything else is reported with GError.
+  // Returns the number of records; 0 (after a warning) if the file cannot
+  // be opened.
+  //this parses the whole fasta file, so it could be slow
+  if (fa_name==NULL)
+    GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n");
+  FILE* fa=fopen(fa_name,"rb");
+  if (fa==NULL) {
+    GMessage("Warning: cannot open fasta index file: %s!\n",fa_name);
+    return 0;
+  }
+  records.Clear();
+  GLineReader fl(fa);
+  char* s=NULL;
+  uint seqlen=0;
+  int line_len=0,line_blen=0;
+  bool newSeq=false; //set to true after defline
+  off_t newSeqOffset=0; //file position right after the current defline
+  off_t prevOffset=0;   //file position before the current line (off_t: offsets may not fit an int)
+  char* seqname=NULL;
+  int last_len=0;
+  bool mustbeLastLine=false; //true if the line length decreases
+  while ((s=fl.nextLine())!=NULL) {
+    if (s[0]=='>') {
+      if (seqname!=NULL) { //flush the previous record
+        if (seqlen==0)
+          //NOTE(review): the text says "Warning ... skipped" but it is sent
+          //through GError -- confirm whether aborting is intended here
+          GError("Warning: empty FASTA record skipped (%s)!\n",seqname);
+        else { //seqlen!=0
+          addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen);
+        }
+      }
+      char *p=s;
+      while (*p > 32) p++; //the name ends at the first space/control char
+      *p=0;
+      GFREE(seqname);
+      seqname=Gstrdup(&s[1]);
+      newSeq=true;
+      newSeqOffset=fl.getfpos();
+      last_len=0;
+      line_len=0;
+      line_blen=0;
+      seqlen=0;
+      mustbeLastLine=false;
+    } //defline parsing
+    else { //sequence line
+      int llen=fl.length();
+      int lblen=fl.getFpos()-prevOffset; //bytes consumed by this line, line ending included
+      if (newSeq) { //first sequence line after defline
+        line_len=llen;
+        line_blen=lblen;
+      }
+      else {//next seq lines after first
+        //a line after a shorter one, or a line longer than the previous
+        //one, breaks the fixed line length the index relies on
+        if (mustbeLastLine || llen>last_len)
+          GError(ERR_FALINELEN);
+        if (llen<last_len) mustbeLastLine=true;
+      }
+      seqlen+=llen;
+      last_len=llen;
+      newSeq=false;
+    } //sequence line
+    prevOffset=fl.getfpos();
+  }//for each line of the fasta file
+  if (seqlen>0)
+    addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen);
+  GFREE(seqname);
+  fclose(fa);
+  return records.Count();
+}
+int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file
+  // Creates/overwrites the index file finame, writes all records to it
+  // through storeIndex(FILE*) and remembers finame as the current fai_name.
+  // Returns the number of records written.
+  if (records.Count()==0) {
+    GError("Error at GFastaIndex:storeIndex(): no records found!\n");
+  }
+  FILE* out=fopen(finame, "w");
+  if (out==NULL) { GError("Error creating fasta index file: %s\n",finame); }
+  const int nwritten=storeIndex(out);
+  GFREE(fai_name);
+  fai_name=Gstrdup(finame);
+  return nwritten;
+}
+int GFastaIndex::storeIndex(FILE* fai) {
+  // Writes one tab-separated line per record to fai (name, sequence length,
+  // file offset, line length, line length with line ending), stopping at
+  // the first failed write, then closes fai.
+  // Returns the number of records written.
+  int rcount=0;
+  GList<GFastaRec> reclist(true,false,true); //sorted, don't free members, unique
+  records.startIterate();
+  GFastaRec* rec=NULL;
+  while ((rec=records.NextData())!=NULL) {
+    reclist.Add(rec);
+  }
+  //reclist has records sorted by file offset
+  for (int i=0;i<reclist.Count();i++) {
+    //the offset is printed with a platform dependent integer type
+#ifdef __WIN32__
+    int written=fprintf(fai, "%s\t%d\t%ld\t%d\t%d\n",
+        reclist[i]->seqname,reclist[i]->seqlen,(long)reclist[i]->fpos,
+        reclist[i]->line_len, reclist[i]->line_blen);
+#else
+    int written=fprintf(fai, "%s\t%d\t%lld\t%d\t%d\n",
+        reclist[i]->seqname, reclist[i]->seqlen, (long long)(reclist[i]->fpos),
+        reclist[i]->line_len, reclist[i]->line_blen);
+#endif
+    if (written>0) rcount++;
+    else break; //couldn't write anymore
+  }
+  fclose(fai);
+  haveFai=(rcount>0);
+  return rcount;
+}
diff --git a/src/GStr.cpp b/src/GStr.cpp
new file mode 100644
index 0000000..4613fa2
--- /dev/null
+++ b/src/GStr.cpp
@@ -0,0 +1,1345 @@
+#include "GStr.h"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include "GBase.h"
+#include <stdarg.h>
+#include <errno.h>
+GStr::Data GStr::null_data;
+GStr::Data * GStr::new_data(int length) {
+  // Static factory: allocates a Data record with room for `length` chars.
+  // Content is undefined apart from the terminating '\0'; ref_count starts
+  // at 0. A non-positive length yields the shared null_data.
+  if (length <= 0)
+    return &null_data;
+  Data* fresh;
+  GMALLOC(fresh, sizeof(Data)+length);
+  fresh->length = length;
+  fresh->ref_count = 0;
+  fresh->chars[length] = '\0';
+  return fresh;
+}
+GStr::Data* GStr::new_data(const char* str) {
+  // Static factory: allocates a Data record holding a copy of str.
+  // NULL or empty input yields the shared null_data; ref_count starts at 0.
+  const int n = (str==NULL) ? 0 : (int)strlen(str);
+  if (n <= 0)
+    return &null_data;
+  Data* fresh;
+  GMALLOC(fresh, sizeof(Data)+n);
+  fresh->ref_count = 0;
+  fresh->length = n;
+  strcpy(fresh->chars, str);
+  fresh->chars[n] = '\0';
+  return fresh;
+}
+void GStr::replace_data(int len) {
+  // Make my_data an unshared buffer of exactly `len` chars (content
+  // undefined, '\0'-terminated), releasing the current one if needed.
+  const bool not_shared = (my_data->ref_count <= 1);
+  if (not_shared && my_data->length == len)
+    return; // current buffer can be reused as it is
+  if (my_data != &null_data) {
+    my_data->ref_count--;
+    if (my_data->ref_count == 0)
+      GFREE(my_data);
+  }
+  if (len <= 0) {
+    my_data = &null_data;
+    return;
+  }
+  GMALLOC(my_data, sizeof(Data) + len);
+  my_data->length = len;
+  my_data->ref_count = 1;
+  my_data->chars[len] = '\0';
+void GStr::replace_data(Data *data) {
+  // Drop our reference to the current buffer and adopt `data` instead,
+  // bumping its reference count (null_data is never counted or freed).
+  if (data == my_data)
+    return; // same buffer: releasing first could free it before re-use
+  if (my_data != &null_data && --my_data->ref_count == 0)
+    GFREE(my_data);
+  if (data != &null_data)
+    data->ref_count++;
+  my_data = data;
+}
+void GStr::make_unique() {
+  // Ensure this string does not share its buffer with another GStr:
+  // when the buffer is referenced more than once, switch to a private copy.
+  if (my_data->ref_count <= 1)
+    return;
+  Data *own = new_data(length());
+  ::memcpy(own->chars, chars(), length());
+  --my_data->ref_count;
+  own->ref_count++;
+  my_data = own;
+bool operator==(const char *s1, const GStr& s2){
+  // A NULL C string is equal only to an empty GStr.
+  return (s1==NULL) ? s2.is_empty()
+                    : (strcmp(s1, s2.chars()) == 0);
+}
+bool operator<(const char *s1, const GStr& s2) {
+  // A NULL C string sorts before every non-empty GStr.
+  return (s1==NULL) ? !s2.is_empty()
+                    : (strcmp(s1, s2.chars()) < 0);
+}
+bool operator<=(const char *s1, const GStr& s2){
+  // A NULL C string is <= any GStr.
+  return (s1==NULL) || (strcmp(s1, s2.chars()) <= 0);
+}
+bool operator>(const char *s1, const GStr& s2) {
+  // A NULL C string is never greater than a GStr.
+  return (s1!=NULL) && (strcmp(s1, s2.chars()) > 0);
+}
+GStr::GStr():my_data(&null_data) {
+  // Empty string: uses the shared null_data, no tokenizer or read state.
+  readbufsize=0;
+  readbuf=NULL;
+  fLastTokenStart=0;
+  fTokenizeMode=tkCharSet;
+  fTokenDelimiter=NULL;
+}
+GStr::GStr(const GStr& s): my_data(&null_data){
+  // Copy constructor: shares s's buffer through replace_data(Data*);
+  // tokenizer and read-buffer state are not copied.
+  readbufsize=0;
+  readbuf=NULL;
+  fLastTokenStart=0;
+  fTokenizeMode=tkCharSet;
+  fTokenDelimiter=NULL;
+  replace_data(s.my_data);
+}
+GStr::GStr(const char *s): my_data(&null_data) {
+  // Builds the string as a private copy of the C string s.
+  readbufsize=0;
+  readbuf=NULL;
+  fLastTokenStart=0;
+  fTokenizeMode=tkCharSet;
+  fTokenDelimiter=NULL;
+  Data* copied=new_data(s);
+  copied->ref_count = 1;
+  my_data=copied;
+}
+GStr::GStr(const int i): my_data(&null_data) {
+  // Builds the string from the decimal ("%d") rendering of i.
+  readbufsize=0;
+  readbuf=NULL;
+  fLastTokenStart=0;
+  fTokenizeMode=tkCharSet;
+  fTokenDelimiter=NULL;
+  char digits[20];
+  sprintf(digits,"%d",i);
+  const int ndigits = ::strlen(digits);
+  replace_data(ndigits);
+  ::memcpy(chrs(), digits, ndigits);
+}
+GStr::GStr(const double f): my_data(&null_data) {
+  // Builds the string from the "%f" rendering of f.
+  fTokenDelimiter=NULL;
+  fTokenizeMode=tkCharSet;
+  fLastTokenStart=0;
+  readbuf=NULL;
+  readbufsize=0;
+  // "%f" prints every integral digit, so a large double needs over 300
+  // characters; the write is bounded and the buffer sized accordingly.
+  char buf[352];
+  int len = snprintf(buf, sizeof(buf), "%f", f);
+  if (len < 0) len = 0;
+  else if (len >= (int)sizeof(buf)) len = (int)sizeof(buf)-1;
+  replace_data(len);
+  ::memcpy(chrs(), buf, len);
+}
+GStr::GStr(char c, int n): my_data(&null_data) {
+  // Builds a string made of n copies of c; n <= 0 gives an empty string.
+  fTokenDelimiter=NULL;
+  fTokenizeMode=tkCharSet;
+  fLastTokenStart=0;
+  readbuf=NULL;
+  readbufsize=0;
+  if (n > 0) { // a negative count must not reach memset as a huge size_t
+    replace_data(n);
+    ::memset(chrs(), c, n);
+  }
+}
+GStr::~GStr() {
+  // Release the tokenizer delimiter and read buffer, then drop our
+  // reference to the character data (freed when we were the last owner).
+  GFREE(readbuf);
+  GFREE(fTokenDelimiter);
+  if (my_data != &null_data) {
+    my_data->ref_count--;
+    if (my_data->ref_count == 0)
+      GFREE(my_data);
+  }
+}
+char& GStr::operator[](int idx){
+  // Returns a writable reference to the char at idx; a negative idx counts
+  // from the end. The buffer is made private first because the caller may
+  // write through the reference.
+  const int pos = (idx < 0) ? idx + length() : idx;
+  if (pos < 0 || pos >= length())
+    invalid_index_error("operator[]");
+  make_unique();
+  return chrs()[pos];
+}
+char GStr::operator[](int idx) const {
+  // Returns a copy of the char at idx (not assignable); a negative idx
+  // counts from the end.
+  const int pos = (idx < 0) ? idx + length() : idx;
+  if (pos < 0 || pos >= length())
+    invalid_index_error("operator[]");
+  return chars()[pos];
+}
+GStr& GStr::operator=(const GStr& s) {
+  // Share s's reference-counted buffer.
+  // When both sides already use the same buffer (e.g. self-assignment)
+  // nothing must happen: replace_data() would release the buffer, possibly
+  // freeing it, before taking the new reference. No private copy is made
+  // first because the old buffer is dropped anyway.
+  if (my_data != s.my_data)
+    replace_data(s.my_data);
+  return *this;
+}
+GStr& GStr::operator=(const char *s) {
+  // Assign a copy of the C string s; NULL clears the string.
+  make_unique(); //edit operation ahead
+  const int n = (s==NULL) ? 0 : (int)::strlen(s);
+  replace_data(n);
+  if (s!=NULL)
+    ::memcpy(chrs(), s, n);
+  return *this;
+}
+GStr& GStr::operator=(const double f) {
+  // Assign the "%f" rendering of f.
+  make_unique(); //edit operation ahead
+  // "%f" prints every integral digit, so a large double needs over 300
+  // characters; the write is bounded and the buffer sized accordingly.
+  char buf[352];
+  int len = snprintf(buf, sizeof(buf), "%f", f);
+  if (len < 0) len = 0;
+  else if (len >= (int)sizeof(buf)) len = (int)sizeof(buf)-1;
+  replace_data(len);
+  ::memcpy(chrs(), buf, len);
+  return *this;
+}
+GStr& GStr::operator=(const int i) {
+  // Assign the decimal ("%d") rendering of i.
+  make_unique(); //edit operation ahead
+  char digits[20];
+  sprintf(digits,"%d",i);
+  const int ndigits = ::strlen(digits);
+  replace_data(ndigits);
+  ::memcpy(chrs(), digits, ndigits);
+  return *this;
+bool GStr::operator==(const GStr& s) const {
+  // Equal when both are empty, or when length and bytes match.
+  if (s.is_empty()) return is_empty();
+  if (length() != s.length()) return false;
+  return memcmp(chars(), s.chars(), length()) == 0;
+}
+bool GStr::operator==(const char *s) const {
+  // NULL compares equal to an empty string.
+  return (s==NULL) ? is_empty() : (strcmp(chars(), s) == 0);
+}
+bool GStr::operator<(const GStr& s) const {
+  // Nothing is less than an empty string.
+  return !s.is_empty() && (strcmp(chars(), s.chars()) < 0);
+}
+bool GStr::operator<(const char *s) const {
+  // Nothing is less than NULL.
+  return (s!=NULL) && (strcmp(chars(), s) < 0);
+}
+bool GStr::operator<=(const GStr& s) const {
+  // Only an empty string is <= an empty string.
+  return s.is_empty() ? is_empty()
+                      : (strcmp(chars(), s.chars()) <= 0);
+}
+bool GStr::operator<=(const char *s) const {
+  // Only an empty string is <= NULL.
+  return (s==NULL) ? is_empty() : (strcmp(chars(), s) <= 0);
+}
+bool GStr::operator>(const GStr& s) const {
+  // Any non-empty string is greater than an empty one.
+  return s.is_empty() ? !is_empty()
+                      : (strcmp(chars(), s.chars()) > 0);
+}
+bool GStr::operator>(const char *s) const {
+  // Any non-empty string is greater than NULL.
+  return (s==NULL) ? !is_empty() : (strcmp(chars(), s) > 0);
+}
+bool GStr::operator>=(const GStr& s) const {
+  // Every string is >= an empty string.
+  return s.is_empty() || (strcmp(chars(), s.chars()) >= 0);
+}
+bool GStr::operator>=(const char *s) const {
+  // Every string is >= NULL.
+  return (s==NULL) || (strcmp(chars(), s) >= 0);
+}
+bool GStr::operator!=(const GStr& s) const {
+  // Different when exactly one side is empty, or length or bytes differ.
+  if (s.is_empty()) return !is_empty();
+  if (length() != s.length()) return true;
+  return memcmp(chars(), s.chars(), length()) != 0;
+}
+bool GStr::operator!=(const char *s) const {
+  // NULL differs from every non-empty string.
+  return (s==NULL) ? !is_empty() : (strcmp(chars(), s) != 0);
+}
+GStr& GStr::operator+=(const GStr& s) {
+  // Append s's characters (via its const char* conversion).
+  const char* tail = (const char *)s;
+  return append(tail);
+}
+GStr& GStr::operator+=(const char* s) {
+  // Append the C string s; see append(const char*).
+  GStr& self = append(s);
+  return self;
+}
+GStr& GStr::operator+=(const char c) {
+  // Append one character, passed to append() as a one-char C string.
+  char one[4];
+  one[0]=c;
+  one[1]='\0';
+  return append(one);
+}
+GStr& GStr::operator+=(const int i) {
+  // Append the decimal ("%d") rendering of i.
+  char digits[20];
+  sprintf(digits,"%d",i);
+  GStr& self = append(digits);
+  return self;
+}
+GStr& GStr::operator+=(const double f) {
+  // Append the "%f" rendering of f.
+  // "%f" prints every integral digit, so a large double needs over 300
+  // characters; snprintf bounds the write and always terminates buf.
+  char buf[352];
+  if (snprintf(buf, sizeof(buf), "%f", f) < 0)
+    buf[0]='\0';
+  return append(buf);
+}
+bool GStr::is_empty() const {
+  // True when the string holds no characters. This tests the length, not
+  // whether the buffer is the shared null_data.
+  const int n = length();
+  return n==0;
+}
+GStr GStr::copy() const {
+  // Returns a new GStr built with the copy constructor.
+  return GStr(*this);
+}
+GStr& GStr::clear() {
+  // Reset to the empty string.
+  make_unique(); //edit operation ahead
+  const int no_chars = 0;
+  replace_data(no_chars);
+  return *this;
+}
+int GStr::index(const GStr& s, int start_index) const {
+  // Position of the first occurrence of s at or after start_index, or -1;
+  // forwards to the const char* overload.
+  const char* needle = s.chars();
+  return index(needle, start_index);
+}
+bool GStr::contains(const GStr& s) const {
+  // True when s occurs somewhere in this string.
+  const int pos = index(s, 0);
+  return pos >= 0;
+}
+bool GStr::contains(const char *s) const {
+  // True when the C string s occurs somewhere in this string.
+  const int pos = index(s, 0);
+  return pos >= 0;
+}
+bool GStr::startsWith(const char *s) const {
+  // Delegates the prefix test to the global ::startsWith helper.
+  const char* mine = this->chars();
+  return ::startsWith(mine, s);
+}
+bool GStr::startsWith(const GStr& s) const {
+  // Delegates the prefix test to the global ::startsWith helper.
+  const char* mine = this->chars();
+  const char* prefix = s.chars();
+  return ::startsWith(mine, prefix);
+}
+bool GStr::endsWith(const char *s) const {
+  // Delegates the suffix test to the global ::endsWith helper.
+  const char* mine = this->chars();
+  return ::endsWith(mine, s);
+}
+bool GStr::endsWith(const GStr& s) const {
+  // Delegates the suffix test to the global ::endsWith helper.
+  const char* mine = this->chars();
+  const char* suffix = s.chars();
+  return ::endsWith(mine, suffix);
+}
+bool GStr::contains(char c) const {
+  // True when character c occurs somewhere in this string.
+  const int pos = index(c, 0);
+  return pos >= 0;
+}
+GStr& GStr::format(const char *fmt,...) {
+  // Format as in sprintf, replacing the current content.
+  // The output is measured first so the temporary buffer always fits; a
+  // fixed strlen(fmt)+1024 buffer is overrun by long expansions.
+  // replace_data() releases a shared buffer itself, so no private copy of
+  // the old content is made beforehand.
+  va_list arguments;
+  va_start(arguments,fmt);
+  va_list measure;
+  va_copy(measure, arguments);
+  int len=vsnprintf(NULL, 0, fmt, measure);
+  va_end(measure);
+  if (len<=0) { //formatting error or empty result
+    va_end(arguments);
+    replace_data(0);
+    return *this;
+  }
+  char* buf;
+  GMALLOC(buf, len+1);
+  vsnprintf(buf, len+1, fmt, arguments);
+  va_end(arguments);
+  replace_data(len); //this also adds the '\0' at the end!
+                     //and sets the right len
+  ::memcpy(chrs(), buf, len);
+  GFREE(buf);
+  return *this;
+}
+GStr& GStr::appendfmt(const char *fmt,...) {
+  // Format as in sprintf and append the result to the current content.
+  // The output is measured first so the temporary buffer always fits; a
+  // fixed strlen(fmt)+1024 buffer is overrun by long expansions.
+  va_list arguments;
+  va_start(arguments,fmt);
+  va_list measure;
+  va_copy(measure, arguments);
+  int len=vsnprintf(NULL, 0, fmt, measure);
+  va_end(measure);
+  if (len<=0) { //formatting error or nothing to append
+    va_end(arguments);
+    return *this;
+  }
+  char* buf;
+  GMALLOC(buf, len+1);
+  vsnprintf(buf, len+1, fmt, arguments);
+  va_end(arguments);
+  append(buf); //append() makes the buffer private itself
+  GFREE(buf);
+  return *this;
+}
+GStr& GStr::trim(char c) {
+  // Remove every leading and trailing occurrence of c.
+  // (`register` dropped: the specifier was removed in C++17.)
+  int istart;
+  int iend;
+  for (istart=0; istart<length() && chars()[istart]==c;istart++) ;
+  if (istart==length()) {
+    make_unique(); //edit operation ahead
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  for (iend=length()-1; iend>istart && chars()[iend]==c;iend--) ;
+  int newlen=iend-istart+1;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, &chars()[istart], newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::trim(const char* c) {
+  // Remove leading and trailing characters that belong to the set c.
+  // A NULL set trims nothing. (`register` dropped: removed in C++17.)
+  if (c==NULL) return *this;
+  int istart;
+  int iend;
+  for (istart=0; istart<length() && strchr(c, chars()[istart])!=NULL ;istart++) ;
+  if (istart==length()) {
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  for (iend=length()-1; iend>istart && strchr(c, chars()[iend])!=NULL;iend--) ;
+  int newlen=iend-istart+1;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, &chars()[istart], newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::trimR(char c) {
+  // Remove every trailing occurrence of c (right end only).
+  // (`register` dropped: the specifier was removed in C++17.)
+  int iend;
+  for (iend=length()-1; iend>=0 && chars()[iend]==c;iend--) ;
+  if (iend==-1) {
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  int newlen=iend+1;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, chars(), newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::trimR(const char* c) {
+  // Remove trailing characters that belong to the set c.
+  // A NULL set trims nothing. (`register` dropped: removed in C++17.)
+  if (c==NULL) return *this;
+  int iend;
+  for (iend=length()-1; iend>=0 && strchr(c,chars()[iend])!=NULL;iend--) ;
+  if (iend==-1) {
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  int newlen=iend+1;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, chars(), newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::chomp(const char* cstr) {
+  // Remove cstr from the end of the string when the string ends with it;
+  // otherwise (or when cstr is NULL or empty) leave the string unchanged.
+  if (cstr==NULL || *cstr==0) return *this;
+  const int clen=(int)strlen(cstr);
+  const int keep=my_data->length-clen; //chars remaining after removal
+  // A string shorter than the suffix cannot end with it, even when it
+  // matches the suffix's tail.
+  if (keep<0) return *this;
+  if (memcmp(my_data->chars+keep, cstr, clen)!=0) return *this;
+  if (keep==0) {
+    replace_data(0); //string will be entirely trimmed
+    return *this;
+  }
+  make_unique(); //edit operation ahead
+  Data *data = new_data(keep);
+  ::memcpy(data->chars, chars(), keep);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::trimL(char c) {
+  // Remove every leading occurrence of c (left end only).
+  // (`register` dropped: the specifier was removed in C++17.)
+  int istart;
+  for (istart=0; istart<length() && chars()[istart]==c;istart++) ;
+  if (istart==length()) {
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  int newlen=length()-istart;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, &chars()[istart], newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::trimL(const char* c) {
+  // Remove leading characters that belong to the set c.
+  // A NULL set trims nothing. (`register` dropped: removed in C++17.)
+  if (c==NULL) return *this;
+  int istart;
+  for (istart=0; istart<length() && strchr(c,chars()[istart])!=NULL;istart++) ;
+  if (istart==length()) {
+    replace_data(0); //string was entirely trimmed
+    return *this;
+  }
+  int newlen=length()-istart;
+  if (newlen==length()) //nothing to trim
+    return *this;
+  make_unique(); //edit operation ahead
+  Data *data = new_data(newlen);
+  ::memcpy(data->chars, &chars()[istart], newlen);
+  replace_data(data);
+  return *this;
+}
+GStr& GStr::padR(int len, char c) {
+  // Right-align the string in a field of len chars by putting the fill
+  // character c in front; no change when the string is already that long.
+  const int fill = len-length();
+  if (fill<=0) return *this; //no room for padding
+  make_unique(); //edit operation ahead
+  Data *padded = new_data(len);
+  ::memset(padded->chars, c, fill);
+  ::memcpy(padded->chars+fill, chars(), length());
+  replace_data(padded);
+  return *this;
+}
+GStr& GStr::padL(int len, char c) {
+  // Left-align the string in a field of len chars by appending the fill
+  // character c; no change when the string is already that long.
+  const int fill = len-length();
+  if (fill<=0) return *this; //no room for padding
+  make_unique(); //edit operation ahead
+  Data *padded = new_data(len);
+  ::memcpy(padded->chars, chars(), length());
+  ::memset(padded->chars+length(), c, fill);
+  replace_data(padded);
+  return *this;
+}
+GStr& GStr::padC(int len, char c) {
+  // Center the string in a field of len chars using fill character c; an
+  // odd amount of fill puts the extra char on the right.
+  if (len<=length()) return *this; //no room for padding
+  make_unique(); //edit operation ahead
+  const int left=(len-length())/2;
+  const int right_from=left+length();
+  Data *padded = new_data(len);
+  if (left>0)
+    ::memset(padded->chars, c, left);
+  ::memcpy(padded->chars+left, chars(), length());
+  if (right_from<len)
+    ::memset(padded->chars+right_from, c, len-right_from);
+  replace_data(padded);
+  return *this;
+}
+GStr operator+(const char *s1, const GStr& s2) {
+  // Concatenation with a C string on the left. A NULL or empty s1 yields
+  // a copy of s2 (NULL is treated as empty, as in the other entry points).
+  const int s1_length = (s1==NULL) ? 0 : (int)::strlen(s1);
+  if (s1_length == 0)
+    return s2;
+  else {
+    GStr newstring;
+    newstring.replace_data(s1_length + s2.length());
+    ::memcpy(newstring.chrs(), s1, s1_length);
+    ::memcpy(&(newstring.chrs())[s1_length], s2.chars(), s2.length());
+    return newstring;
+  }
+}
+GStr GStr::operator+(const GStr& s) const {
+  // Concatenation; when either side is empty the other is returned.
+  const int mine = length();
+  const int theirs = s.length();
+  if (mine == 0) return s;
+  if (theirs == 0) return *this;
+  GStr joined;
+  joined.replace_data(mine + theirs);
+  ::memcpy(joined.chrs(), chars(), mine);
+  ::memcpy(joined.chrs()+mine, s.chars(), theirs);
+  return joined;
+GStr GStr::operator+(const char *s) const {
+  // Concatenation with a C string on the right. A NULL or empty s yields
+  // a copy of this string (NULL is treated as empty).
+  const int s_length = (s==NULL) ? 0 : (int)::strlen(s);
+  if (s_length == 0)
+    return *this;
+  else {
+    GStr newstring;
+    newstring.replace_data(length() + s_length);
+    ::memcpy(newstring.chrs(), chars(), length());
+    ::memcpy(&(newstring.chrs())[length()], s, s_length);
+    return newstring;
+  }
+}
+GStr GStr::operator+(const int i) const {
+  // Returns this string followed by the decimal ("%d") rendering of i.
+  char digits[20];
+  sprintf(digits, "%d", i);
+  const int ndigits = ::strlen(digits);
+  const int mine = length();
+  GStr joined;
+  joined.replace_data(mine + ndigits);
+  ::memcpy(joined.chrs(), chars(), mine);
+  ::memcpy(joined.chrs()+mine, digits, ndigits);
+  return joined;
+GStr GStr::operator+(const char c) const {
+  // Returns this string followed by the character c (nothing is added
+  // when c is '\0', since the addition is measured with strlen).
+  char one[4];
+  sprintf(one, "%c", c);
+  const int extra = ::strlen(one);
+  const int mine = length();
+  GStr joined;
+  joined.replace_data(mine + extra);
+  ::memcpy(joined.chrs(), chars(), mine);
+  ::memcpy(joined.chrs()+mine, one, extra);
+  return joined;
+GStr GStr::operator+(const double f) const {
+  // Returns this string followed by the "%f" rendering of f.
+  // "%f" prints every integral digit, so a large double needs over 300
+  // characters; the write is bounded and the buffer sized accordingly.
+  char buf[352];
+  int s_length = snprintf(buf, sizeof(buf), "%f", f);
+  if (s_length < 0) s_length = 0;
+  else if (s_length >= (int)sizeof(buf)) s_length = (int)sizeof(buf)-1;
+  GStr newstring;
+  newstring.replace_data(length() + s_length);
+  ::memcpy(newstring.chrs(), chars(), length());
+  ::memcpy(&(newstring.chrs())[length()], buf, s_length);
+  return newstring;
+}
+bool GStr::is_space() const {
+  // True when every character is whitespace; false for a string that
+  // uses the shared null_data.
+  if (my_data == &null_data)
+    return false;
+  // isspace() requires a value representable as unsigned char; a plain
+  // (possibly negative) char is undefined behaviour. `register` dropped:
+  // the specifier was removed in C++17.
+  for (const char *p = chars(); *p; p++)
+    if (!isspace((unsigned char)*p))
+      return false;
+  return true;
+}
+GStr GStr::substr(int idx, int len) const {
+  // Returns a copy of len chars starting at idx.
+  // A negative idx specifies an idx from the right of the string; a
+  // negative or too large len means "up to the end of the string".
+  if (idx < 0)
+    idx += length();
+  const int available = length() - idx;
+  if (len < 0 || len > available)
+    len = available;
+  const bool bad_args = (idx<0 || idx>=length() || len<0);
+  if (bad_args)
+    invalid_args_error("substr()");
+  GStr piece;
+  piece.replace_data(len);
+  ::memcpy(piece.chrs(), chars()+idx, len);
+  return piece;
+GStr& GStr::reverse() {
+  // Reverse the characters in place (on a private buffer).
+  make_unique();
+  char* buf=my_data->chars;
+  for (int lo=0, hi=my_data->length-1; lo<hi; lo++, hi--) {
+    const char tmp=buf[lo];
+    buf[lo]=buf[hi];
+    buf[hi]=tmp;
+  }
+  return *this;
+//transform: any character from 'from' is replaced with a coresponding
+//char from 'to'
+GStr& GStr::tr(const char *rfrom, const char* rto) {
+  // Transliterate: each character found in rfrom is replaced by the
+  // character at the same position in rto; when rto is NULL the
+  // characters found in rfrom are deleted instead.
+  // rto, when given, must have the same length as rfrom.
+  if (length() == 0 || rfrom==NULL || strlen(rfrom)==0)
+    return *this;
+  unsigned int l=strlen(rfrom);
+  if (rto!=NULL && strlen(rto)!=l)
+    invalid_args_error("tr()");
+  make_unique(); //edit operation ahead
+  if (rto!=NULL) { //char substitution case - easier!
+    // The length does not change, so the private buffer is edited in
+    // place; no replacement buffer may be installed afterwards.
+    for (int i=0; i<length(); i++) {
+      const char* p=strchr(rfrom, my_data->chars[i]);
+      if (p!=NULL)
+        my_data->chars[i]=rto[p-rfrom];
+    }
+    return *this;
+  }
+  //deletion case: copy the surviving characters into a new buffer
+  Data *data = new_data(length());
+  char* dest = data->chars;
+  for (const char* s = my_data->chars; *s; s++) {
+    if (strchr(rfrom, *s)==NULL)
+      *dest++ = *s;
+  }
+  (*dest)='\0';
+  const int newlen=(int)(dest-data->chars);
+  if (newlen==0) { //everything was deleted
+    GFREE(data);
+    replace_data(0);
+    return *this;
+  }
+  data->length=newlen;
+  replace_data(data);
+  return *this;
+}
+// search and replace all the occurences of a string with another string
+// or just remove the given string (if replacement is NULL)
+GStr& GStr::replace(const char *rfrom, const char* rto) {
+ // Replaces every occurrence of rfrom with rto; a NULL or empty rto
+ // deletes the occurrences. Nothing happens when this string is empty
+ // or rfrom is NULL/empty. Returns *this.
+ if (length() == 0 || rfrom==NULL || strlen(rfrom)==0)
+ return *this;
+ unsigned int l=strlen(rfrom);
+ unsigned int tl= (rto==NULL)?0:strlen(rto);
+ make_unique(); //edit operation ahead
+ char* p;
+ char* dest;
+ char* newdest=NULL;
+ char* s = my_data->chars;
+ if (tl!=l) { //reallocation
+ if (tl>l) { //possible enlargement
+ // each occurrence adds at most (tl-l) chars
+ GMALLOC(newdest, length()*(tl-l+1)+1);
+ }
+ else {//delete or replace with a shorter string
+ GMALLOC(newdest, length() + 1);
+ }
+ dest=newdest;
+ if (tl==0) {//deletion
+ while ((p=strstr(s,rfrom))!=NULL) {
+ //rfrom found at position p
+ memcpy(dest,s,p-s);
+ dest+=p-s;
+ s+=p-s+l; //s positioned in string after rfrom
+ }
+ //no more occurrences, copy the remaining string
+ strcpy(dest, s);
+ }
+ else { //replace with another string
+ while ((p=strstr(s,rfrom))!=NULL) {
+ memcpy(dest,s,p-s); //copy up to the match
+ dest+=p-s;
+ memcpy(dest,rto,tl); //put the replacement string
+ dest+=tl;
+ s+=p-s+l;
+ }
+ //not found any more, copy to end of string
+ strcpy(dest, s);
+ }
+ // the result is copied once more into a Data record of exact size
+ Data* data=new_data(newdest);
+ replace_data(data);
+ GFREE(newdest);
+ }
+ else { //inplace editing: no need to reallocate
+ while ((p=strstr(s,rfrom))!=NULL) {
+ memcpy(p,rto,l);
+ s+=p-s+l;
+ }
+ }
+ return *this;
+GStr& GStr::cut(int idx, int len) {
+  // Remove len chars starting at idx.
+  // A negative idx specifies an idx from the right of the string;
+  // a length of -1 specifies the rest of the string.
+  if (len == 0)
+    return *this;
+  make_unique(); //edit operation ahead
+  if (idx < 0)
+    idx += length();
+  if (len == -1)
+    len = length() - idx;
+  const bool bad_args = (idx<0 || idx>=length() || len<0 || len>length()-idx);
+  if (bad_args)
+    invalid_args_error("cut()");
+  Data *shorter = new_data(length() - len);
+  if (idx > 0)
+    ::memcpy(shorter->chars, chars(), idx);
+  //copy the part after the removed range, terminator included
+  ::strcpy(shorter->chars+idx, chars()+idx+len);
+  replace_data(shorter);
+  return *this;
+GStr& GStr::paste(const GStr& s, int idx, int len) {
+  // Overwrite the len chars starting at idx with the content of s (the
+  // string grows or shrinks when s has a different length).
+  // A negative idx specifies an idx from the right of the string.
+  if (idx < 0)
+    idx += length();
+  make_unique(); //edit operation ahead
+  // A length of -1 specifies the rest of the string.
+  if (len == -1)
+    len = length() - idx;
+  if (idx<0 || idx>=length() || len<0 || len>length()-idx)
+    invalid_args_error("paste()"); //report this function's own name
+  if (len == s.length() && my_data->ref_count == 1)
+    ::memcpy(&chrs()[idx], s.chars(), len);
+  else {
+    Data *data = new_data(length() - len + s.length());
+    if (idx > 0)
+      ::memcpy(data->chars, chars(), idx);
+    if (s.length() > 0)
+      ::memcpy(&data->chars[idx], s.chars(), s.length());
+    ::strcpy(&data->chars[idx+s.length()], &chars()[idx+len]);
+    replace_data(data);
+  }
+  return *this;
+}
+GStr& GStr::paste(const char *s, int idx, int len) {
+  // Overwrite the len chars starting at idx with the C string s (the
+  // string grows or shrinks when s has a different length).
+  make_unique(); //edit operation ahead
+  // A negative idx specifies an idx from the right of the string.
+  if (idx < 0)
+    idx += length();
+  // A length of -1 specifies the rest of the string.
+  if (len == -1)
+    len = length() - idx;
+  if (idx<0 || idx>=length() || len<0 || len>length()-idx)
+    invalid_args_error("paste()"); //report this function's own name
+  const int s_length = ::strlen(s);
+  if (len == s_length && my_data->ref_count == 1)
+    ::memcpy(&chrs()[idx], s, len);
+  else {
+    Data *data = new_data(length() - len + s_length);
+    if (idx > 0)
+      ::memcpy(data->chars, chars(), idx);
+    if (s_length > 0)
+      ::memcpy(&data->chars[idx], s, s_length);
+    ::strcpy(&data->chars[idx+s_length], &chars()[idx+len]);
+    replace_data(data);
+  }
+  return *this;
+}
+GStr& GStr::insert(const GStr& s, int idx) {
+  // Insert s before position idx.
+  // A negative idx specifies an idx from the right of the string.
+  make_unique(); //edit operation ahead
+  if (idx < 0)
+    idx += length();
+  if (idx < 0 || idx >= length())
+    invalid_index_error("insert()");
+  const int added = s.length();
+  if (added <= 0)
+    return *this;
+  Data *grown = new_data(length() + added);
+  if (idx > 0)
+    ::memcpy(grown->chars, chars(), idx);
+  ::memcpy(grown->chars+idx, s.chars(), added);
+  ::strcpy(grown->chars+idx+added, chars()+idx);
+  replace_data(grown);
+  return *this;
+GStr& GStr::insert(const char *s, int idx) {
+  // Insert the C string s before position idx.
+  // A negative idx specifies an idx from the right of the string.
+  make_unique(); //edit operation ahead
+  if (idx < 0)
+    idx += length();
+  if (idx < 0 || idx >= length())
+    invalid_index_error("insert()");
+  const int added = ::strlen(s);
+  if (added <= 0)
+    return *this;
+  Data *grown = new_data(length() + added);
+  if (idx > 0)
+    ::memcpy(grown->chars, chars(), idx);
+  ::memcpy(grown->chars+idx, s, added);
+  ::strcpy(grown->chars+idx+added, chars()+idx);
+  replace_data(grown);
+  return *this;
+GStr& GStr::append(const char* s) {
+  // Append the C string s; a NULL or empty s leaves the string unchanged
+  // (NULL is treated as empty, as in the other entry points).
+  if (s==NULL) return *this;
+  make_unique(); //edit operation ahead
+  int len=::strlen(s);
+  int newlength=len+my_data->length;
+  if (newlength<=my_data->length) return *this; //nothing to add
+  if (my_data->length==0) {
+    replace_data(len);
+    ::memcpy(my_data->chars, s, len);
+    return *this;
+  }
+  //faster solution with realloc
+  GREALLOC(my_data, sizeof(Data)+newlength);
+  ::strcpy(&my_data->chars[my_data->length], s);
+  my_data->length=newlength;
+  my_data->chars[newlength]='\0';
+  return *this;
+}
+GStr& GStr::append(const GStr& s) {
+  // Append s's characters (via its const char* conversion).
+  const char* tail = (const char *)s;
+  return append(tail);
+GStr& GStr::upper() {
+  // Convert every character to upper case, in place.
+  make_unique(); //edit operation ahead
+  // toupper() requires a value representable as unsigned char; a plain
+  // (possibly negative) char is undefined behaviour. `register` dropped:
+  // the specifier was removed in C++17.
+  for (char *p = chrs(); *p; p++)
+    *p = (char) toupper((unsigned char)*p);
+  return *this;
+}
+GStr& GStr::lower() {
+  // Convert every character to lower case, in place.
+  make_unique(); //edit operation ahead
+  // tolower() requires a value representable as unsigned char; a plain
+  // (possibly negative) char is undefined behaviour. `register` dropped:
+  // the specifier was removed in C++17.
+  for (char *p = chrs(); *p; p++)
+    *p = (char) tolower((unsigned char)*p);
+  return *this;
+}
+int GStr::index(const char *s, int start_index) const {
+  // Position of the first occurrence of s at or after start_index, or -1.
+  // A negative index specifies an index from the right of the string.
+  const size_t needle_len = strlen(s);
+  if (needle_len > (size_t)length()) return -1; //cannot fit
+  if (start_index < 0)
+    start_index += length();
+  if (start_index < 0 || start_index >= length())
+    invalid_index_error("index()");
+  const char* hit = strstr(chars()+start_index, s);
+  if (hit)
+    return hit - chars();
+  return -1;
+int GStr::index(char c, int start_index) const {
+  // Position of the first occurrence of c at or after start_index, or -1.
+  // A negative index specifies an index from the right of the string.
+  if (length()==0) return -1;
+  if (start_index < 0)
+    start_index += length();
+  if (start_index < 0 || start_index >= length())
+    invalid_index_error("index()");
+  if (c == '\0')
+    return -1;
+  const char* from = chars()+start_index;
+  const char *hit=(char *) ::memchr(from, c, length()-start_index);
+  if (hit!=NULL)
+    return hit - chars();
+  return -1;
+int GStr::rindex(char c, int end_index) const {
+  // Position of the last occurrence of c at or before end_index, or -1.
+  // A negative end_index means "search from the last character".
+  if (c == 0 || length()==0 || end_index>=length()) return -1;
+  int pos = (end_index<0) ? my_data->length-1 : end_index;
+  while (pos>=0) {
+    if (my_data->chars[pos]==c) return pos;
+    pos--;
+  }
+  return -1;
+int GStr::rindex(const char* str, int end_index) const {
+  // Start position of the last occurrence of str that ends at or before
+  // end_index, or -1. A negative end_index means "up to the last char".
+  if (str==NULL || *str == '\0' || length()==0 || end_index>=length())
+    return -1;
+  const int slen=strlen(str);
+  if (end_index<0) end_index=my_data->length-1;
+  //end_index is the index of the right-side boundary
+  //the scanning starts at the end
+  if (end_index>=0 && end_index<slen-1) return -1;
+  int pos=end_index-slen+1;
+  while (pos>=0) {
+    if (memcmp((void*)(my_data->chars+pos),(void*)str, slen)==0)
+      return pos;
+    pos--;
+  }
+  return -1;
+GStr GStr::split(const char* delim) {
+  /* splits "this" in two parts, at the first (left)
+     encounter of delim:
+      1st would stay in "this",
+      2nd part will be returned
+      as a new string!
+     When delim is not found "this" is unchanged and an empty
+     string is returned.
+  */
+  GStr result;
+  int i=index(delim);
+  if (i>=0){
+    // substr() rejects a start index equal to length(), so the tail is
+    // only extracted when something follows the delimiter.
+    int tail=i+(int)strlen(delim);
+    if (tail<length())
+      result=substr(tail);
+    cut(i);
+  }
+  return result;
+}
+GStr GStr::split(char c) {
+  /* splits "this" in two parts, at the first (left)
+     encounter of c:
+      1st would stay in "this",
+      2nd part will be returned
+      as a new string!
+     When c is not found "this" is unchanged and an empty
+     string is returned.
+  */
+  GStr result;
+  int i=index(c);
+  if (i>=0){
+    // substr() rejects a start index equal to length(), so the tail is
+    // only extracted when something follows the delimiter.
+    if (i+1<length())
+      result=substr(i+1);
+    cut(i);
+  }
+  return result;
+}
+GStr GStr::splitr(const char* delim) {
+  // Like split(), but splits at the last (rightmost) occurrence of delim:
+  // the left part stays in "this", the right part is returned.
+  GStr result;
+  int i=rindex(delim);
+  if (i>=0){
+    // substr() rejects a start index equal to length(), so the tail is
+    // only extracted when something follows the delimiter.
+    int tail=i+(int)strlen(delim);
+    if (tail<length())
+      result=substr(tail);
+    cut(i);
+  }
+  return result;
+}
+GStr GStr::splitr(char c) {
+  // Like split(), but splits at the last (rightmost) occurrence of c:
+  // the left part stays in "this", the right part is returned.
+  GStr result;
+  int i=rindex(c);
+  if (i>=0){
+    // substr() rejects a start index equal to length(), so the tail is
+    // only extracted when something follows the delimiter.
+    if (i+1<length())
+      result=substr(i+1);
+    cut(i);
+  }
+  return result;
+}
+void GStr::startTokenize(const char* delimiter, enTokenizeMode tokenizemode) {
+  // Begin a new tokenizing pass: store a private copy of the delimiter,
+  // rewind to the start of the string and record the mode.
+  fTokenizeMode=tokenizemode;
+  fLastTokenStart=0;
+  GFREE(fTokenDelimiter);
+  if (delimiter) {
+    const size_t bytes=strlen(delimiter)+1;
+    GMALLOC(fTokenDelimiter,bytes);
+    strcpy(fTokenDelimiter, delimiter);
+  }
+bool GStr::nextToken(GStr& token) {
+ // Stores the next token of the current tokenizing pass in `token` and
+ // returns true; returns false when there are no more tokens (the
+ // delimiter is then released and the position reset).
+ // startTokenize() must have been called first.
+ if (fTokenDelimiter==NULL) {
+ GError("GStr:: no token delimiter; use StartTokenize first\n");
+ }
+ if (fLastTokenStart>=length()) {//no more
+ GFREE(fTokenDelimiter);
+ fLastTokenStart=0;
+ return false;
+ }
+ int dlen=strlen(fTokenDelimiter);
+ char* delpos=NULL; //delimiter position
+ int tlen=0;
+ if (fTokenizeMode==tkFullString) { //exact string as a delimiter
+ delpos=(char*)strstr(chars()+fLastTokenStart,fTokenDelimiter);
+ // no further delimiter: the token runs to the end of the string
+ if (delpos==NULL) delpos=(char*)(chars()+length());
+ //empty records may be returned
+ if (chars()+fLastTokenStart == delpos) { //empty token
+ fLastTokenStart=(delpos-chars())+dlen;
+ token="";
+ return true;
+ }
+ else {
+ tlen=delpos-(chars()+fLastTokenStart);
+ token.replace_data(tlen);
+ ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen);
+ // the next token starts right after this delimiter
+ fLastTokenStart=(delpos-chars())+dlen;
+ return true;
+ }
+ }
+ else { //tkCharSet - any character is a delimiter
+ //empty records are never returned !
+ if (fLastTokenStart==0) {//skip any starting delimiters
+ delpos=(char*)chars();
+ while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
+ delpos++;
+ if (*delpos!='\0')
+ fLastTokenStart = delpos-chars();
+ else { //only delimiters here,no tokens
+ GFREE(fTokenDelimiter);
+ fLastTokenStart=0;
+ return false;
+ }
+ }
+ //now fLastTokenStart is on a non-delimiter char
+ //GMessage("String at fLastTokenStart=%d is %s\n", fLastTokenStart, delpos);
+ char* token_end=NULL;
+ delpos=(char*)strpbrk(chars()+fLastTokenStart,fTokenDelimiter);
+ if (delpos==NULL) delpos=(char*)(chars()+length());
+ // token_end is the last character that belongs to the token
+ token_end=delpos-1;
+ while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
+ delpos++; //skip any other delimiters in the set!
+ //now we know that delpos is on the beginning of next token
+ tlen=(token_end-chars())-fLastTokenStart+1;
+ if (tlen==0) {
+ GFREE(fTokenDelimiter);
+ fLastTokenStart=0;
+ return false;
+ }
+ token.replace_data(tlen);
+ ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen);
+ fLastTokenStart=delpos-chars();
+ return true;
+ }
+ //return true;
+size_t GStr::read(FILE* stream, const char* delimiter, size_t bufsize) {
+  // Read from stream up to (and including) the given delimiter string,
+  // or to end of file, and make that the content of this string.
+  // Data is read in chunks of bufsize bytes; the stream is repositioned
+  // right after the delimiter. Returns the number of bytes stored.
+  // bufsize==0 clears the string and frees the read buffer.
+  // NOTE(review): bufsize presumably has to be larger than the delimiter
+  // for the chunk loop to make progress - confirm with callers.
+  if (readbuf==NULL) {
+    GMALLOC(readbuf, bufsize);
+    readbufsize=bufsize;
+  }
+  else if (bufsize!=readbufsize) {
+    GFREE(readbuf);
+    if (bufsize>0) {
+      GMALLOC(readbuf, bufsize);
+    }
+    readbufsize=bufsize;
+  }
+  if (bufsize==0) {
+    replace_data(0);
+    return 0; //clear the string and free the buffer
+  }
+  size_t numread;
+  size_t acc_len=0; //accumulated length
+  int seplen=strlen(delimiter);
+  void* p=NULL;
+  Data *data = new_data(0);
+  do {
+    numread=fread(readbuf, 1, bufsize, stream);
+    if (numread) {
+      // Only the bytes just read are valid: scanning the whole buffer
+      // could match stale data left over from a previous chunk.
+      p=Gmemscan(readbuf, numread, (void*) delimiter, seplen);
+      if (p!=NULL) {//found the delimiter
+        //position the stream after it
+        int l = (char*)p-(char*)readbuf;
+        // signed arithmetic: the offset is zero or negative
+        fseek(stream, (long)(l+seplen)-(long)numread, SEEK_CUR);
+        numread=l+seplen;
+      }
+      else {//not found, go back if not eof
+        if (numread==bufsize) {
+          // step back so a delimiter straddling two chunks is not missed
+          fseek(stream, -seplen, SEEK_CUR); //check if this works!
+          numread-=seplen;
+        }
+      }
+      if (data==&null_data) {
+        data=new_data(numread);
+        ::memcpy(data->chars, readbuf, numread);
+        acc_len+=numread;
+      }
+      else {
+        GREALLOC(data, sizeof(Data)+acc_len+numread);
+        memcpy(&data->chars[acc_len], readbuf, numread);
+        acc_len+=numread;
+        data->length=acc_len;
+        data->chars[acc_len]='\0';
+      }
+    } //if something read
+  } while (p==NULL && numread!=0);
+  replace_data(data);
+  return acc_len;
+}
+int GStr::asInt(int base /*=10 */) {
+  // Integer value of the string as parsed by strtol() in the given base.
+  const long parsed = strtol(text(), NULL, base);
+  return parsed;
+bool GStr::asInt(int& r, int base) {
+  // Parse the string with strtol(); on success store the value in r and
+  // return true. Returns false (r untouched) when strtol() sets errno or
+  // no characters could be parsed.
+  char* stop;
+  errno=0;
+  const long parsed=strtol(text(), &stop, base);
+  if (errno!=0 || stop == text())
+    return false;
+  /* If we got here, strtol() successfully parsed a number */
+  r=parsed;
+  return true;
+double GStr::asReal() {
+  // Floating point value of the string as parsed by strtod().
+  const double parsed = strtod(text(), NULL);
+  return parsed;
+bool GStr::asReal(double& r) {
+  // Parse the string with strtod(); on success store the value in r and
+  // return true. Returns false (r untouched) when strtod() sets errno or
+  // no characters could be parsed.
+  char* stop;
+  errno=0;
+  const double parsed=strtod(text(), &stop);
+  if (errno!=0 || stop == text()) //error, or no digits to parse
+    return false;
+  r=parsed;
+  return true;
+int GStr::peelInt() const {
+  // Returns the value of the first (leftmost) run of decimal digits in
+  // the string, or 0 when the string is empty or has no digit.
+  if (is_empty()) return 0;
+  const char* p = my_data->chars;
+  const char* end = p + length();
+  // isdigit() needs an unsigned char value
+  while (p<end && !isdigit((unsigned char)*p)) p++;
+  if (p==end) return 0;
+  // strtol() stops at the first non-digit (the buffer is '\0'-terminated),
+  // so the run is parsed in place whatever its length; no fixed-size
+  // scratch copy that a long run could overflow.
+  return strtol(p, NULL, 10);
+}
+int GStr::peelIntR() const {
+  // Returns the value of the last (rightmost) run of decimal digits in
+  // the string, or 0 when the string is empty or has no digit.
+  if (is_empty()) return 0;
+  const char* buf = my_data->chars;
+  int last = length()-1;
+  // isdigit() needs an unsigned char value
+  while (last>=0 && !isdigit((unsigned char)buf[last])) last--;
+  if (last<0) return 0;
+  int first = last;
+  while (first>0 && isdigit((unsigned char)buf[first-1])) first--;
+  // The run is followed by a non-digit or the terminating '\0', so
+  // strtol() parses exactly the run, in place; no fixed-size scratch copy
+  // that a long run could overflow.
+  return strtol(buf+first, NULL, 10);
+}
+GStr GStr::to(char c) {
+  // Returns the part before the first occurrence of c,
+  // or the whole string when c is not found.
+  const int pos=index(c);
+  if (pos<0)
+    return (*this);
+  return substr(0,pos);
+GStr GStr::from(char c) {
+  // Returns the part after the last occurrence of c,
+  // or the whole string when c is not found.
+  const int pos=rindex(c);
+  if (pos<0)
+    return (*this);
+  return substr(pos+1);
+int GStr::count(char c){
+  // Returns the number of occurrences of char c within the string.
+  int hits=0;
+  const char* buf=my_data->chars;
+  for (int pos=length()-1; pos>=0; pos--) {
+    if (buf[pos]==c)
+      hits++;
+  }
+  return hits;
+}
+void GStr::invalid_args_error(const char *fname) {
+  // Report through GError() that fname was called with invalid arguments.
+  const char* where = fname;
+  GError("GStr:: %s - invalid arguments\n", where);
+void GStr::invalid_index_error(const char *fname) {
+  // Report through GError() that fname was called with an invalid index.
+  const char* where = fname;
+  GError("GStr:: %s - invalid index\n", where);
diff --git a/src/TestGFFParse.cpp b/src/TestGFFParse.cpp
new file mode 100644
index 0000000..3354ca6
--- /dev/null
+++ b/src/TestGFFParse.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include "gff.h"
+// Test driver: reads the GFF/GTF file named by the single command line
+// argument and, for every transcript, writes one tab-separated line to stdout
+// with its ID, its gene ID and all of its attribute name/value pairs.
+// The number of loaded records is reported on stderr.
+int main(int argc, char* argv[]) {
+  const bool bad_usage = (argc > 2) || (argc == 1);
+  if (bad_usage) {
+    std::cerr << "Usage: TestGFFParse input\n";
+    std::exit(1);
+  }
+  GffReader reader(argv[1]);
+  reader.readAll(true);
+  std::cerr << "had count of " << reader.gflst.Count() << "\n";
+  const size_t nfeat = reader.gflst.Count();
+  for (size_t i=0; i < nfeat; ++i) {
+    GffObj* f = reader.gflst[i];
+    if (!f->isTranscript()) continue; //only transcripts are reported
+    std::cout << f->getID() << '\t' << f->getGeneID() << '\t';
+    if (f->attrs) {
+      size_t j=0;
+      while (j < f->attrs->Count()) {
+        std::cout << f->getAttrName(j) << "\t" << f->getAttrValue(j) << "\t";
+        ++j;
+      }
+    }
+    std::cout << "\n";
+  }
+  std::exit(0);
+}
diff --git a/src/codons.cpp b/src/codons.cpp
new file mode 100644
index 0000000..a459250
--- /dev/null
+++ b/src/codons.cpp
@@ -0,0 +1,90 @@
+#include "codons.h"
+// Lookup table filled by codonTableInit() and indexed by the value returned
+// from packCodon(); each entry is the one-letter translation of a codon.
+static char codonTable[32768]; //32K table for fasta codon decoding
+ // codons are encoded as triplets of 5-bit-encoded nucleotides
+ // (so any codon can be encoded/decoded as a unique 15-bit value)
+// Source records for codonTable: consecutive groups of 4 characters, the
+// first three being the codon letters and the fourth its translation.
+static char codonData[]={ //long list of 3+1 characters (codon+translation)
+'A','A','A','K', 'A','A','C','N', 'A','A','G','K', 'A','A','R','K', 'A','A','T','N',
+'A','A','Y','N', 'A','C','A','T', 'A','C','B','T', 'A','C','C','T', 'A','C','D','T',
+'A','C','G','T', 'A','C','H','T', 'A','C','K','T', 'A','C','M','T', 'A','C','N','T',
+'A','C','R','T', 'A','C','S','T', 'A','C','T','T', 'A','C','V','T', 'A','C','W','T',
+'A','C','Y','T', 'A','G','A','R', 'A','G','C','S', 'A','G','G','R', 'A','G','R','R',
+'A','G','T','S', 'A','G','Y','S', 'A','T','A','I', 'A','T','C','I', 'A','T','G','M',
+'A','T','H','I', 'A','T','M','I', 'A','T','T','I', 'A','T','W','I', 'A','T','Y','I',
+'C','A','A','Q', 'C','A','C','H', 'C','A','G','Q', 'C','A','R','Q', 'C','A','T','H',
+'C','A','Y','H', 'C','C','A','P', 'C','C','B','P', 'C','C','C','P', 'C','C','D','P',
+'C','C','G','P', 'C','C','H','P', 'C','C','K','P', 'C','C','M','P', 'C','C','N','P',
+'C','C','R','P', 'C','C','S','P', 'C','C','T','P', 'C','C','V','P', 'C','C','W','P',
+'C','C','Y','P', 'C','G','A','R', 'C','G','B','R', 'C','G','C','R', 'C','G','D','R',
+'C','G','G','R', 'C','G','H','R', 'C','G','K','R', 'C','G','M','R', 'C','G','N','R',
+'C','G','R','R', 'C','G','S','R', 'C','G','T','R', 'C','G','V','R', 'C','G','W','R',
+'C','G','Y','R', 'C','T','A','L', 'C','T','B','L', 'C','T','C','L', 'C','T','D','L',
+'C','T','G','L', 'C','T','H','L', 'C','T','K','L', 'C','T','M','L', 'C','T','N','L',
+'C','T','R','L', 'C','T','S','L', 'C','T','T','L', 'C','T','V','L', 'C','T','W','L',
+'C','T','Y','L', 'G','A','A','E', 'G','A','C','D', 'G','A','G','E', 'G','A','R','E',
+'G','A','T','D', 'G','A','Y','D', 'G','C','A','A', 'G','C','B','A', 'G','C','C','A',
+'G','C','D','A', 'G','C','G','A', 'G','C','H','A', 'G','C','K','A', 'G','C','M','A',
+'G','C','N','A', 'G','C','R','A', 'G','C','S','A', 'G','C','T','A', 'G','C','V','A',
+'G','C','W','A', 'G','C','Y','A', 'G','G','A','G', 'G','G','B','G', 'G','G','C','G',
+'G','G','D','G', 'G','G','G','G', 'G','G','H','G', 'G','G','K','G', 'G','G','M','G',
+'G','G','N','G', 'G','G','R','G', 'G','G','S','G', 'G','G','T','G', 'G','G','V','G',
+'G','G','W','G', 'G','G','Y','G', 'G','T','A','V', 'G','T','B','V', 'G','T','C','V',
+'G','T','D','V', 'G','T','G','V', 'G','T','H','V', 'G','T','K','V', 'G','T','M','V',
+'G','T','N','V', 'G','T','R','V', 'G','T','S','V', 'G','T','T','V', 'G','T','V','V',
+'G','T','W','V', 'G','T','Y','V', 'M','G','A','R', 'M','G','G','R', 'M','G','R','R',
+'N','N','N','X', 'R','A','Y','B', 'S','A','R','Z', 'T','A','A','.', 'T','A','C','Y',
+'T','A','G','.', 'T','A','R','.', 'T','A','T','Y', 'T','A','Y','Y', 'T','C','A','S',
+'T','C','B','S', 'T','C','C','S', 'T','C','D','S', 'T','C','G','S', 'T','C','H','S',
+'T','C','K','S', 'T','C','M','S', 'T','C','N','S', 'T','C','R','S', 'T','C','S','S',
+'T','C','T','S', 'T','C','V','S', 'T','C','W','S', 'T','C','Y','S', 'T','G','A','.',
+'T','G','C','C', 'T','G','G','W', 'T','G','T','C', 'T','G','Y','C', 'T','R','A','.',
+'T','T','A','L', 'T','T','C','F', 'T','T','G','L', 'T','T','R','L', 'T','T','T','F',
+'T','T','Y','F', 'X','X','X','X', 'Y','T','A','L', 'Y','T','G','L', 'Y','T','R','L'
+// Initializing this flag calls codonTableInit(), so codonTable is populated
+// during static initialization of this translation unit.
+static bool isCodonTableReady=codonTableInit();
+// Packs three nucleotide letters into the 15-bit value used to index
+// codonTable: each letter contributes (letter-'A') as a 5-bit field, with n1
+// in the lowest bits, n2 in the middle and n3 in the highest.
+// The letters are expected to be uppercase already. Any character outside
+// 'A'..'Z' does not fit in 5 bits; it used to spill into bit 15 and produce a
+// value beyond the 32768-entry table. Such input now yields 0x7FFF, a slot no
+// letter triplet can address (31 is not a letter offset), so it keeps the
+// 'X' fill value set by codonTableInit().
+unsigned short packCodon(char n1, char n2, char n3) {
+  const unsigned int b1=(unsigned char)(n1-'A');
+  const unsigned int b2=(unsigned char)(n2-'A');
+  const unsigned int b3=(unsigned char)(n3-'A');
+  if (b1>25 || b2>25 || b3>25) return 0x7FFF; //not an uppercase letter
+  return (unsigned short)(b1 | (b2 << 5) | (b3 << 10));
+  }
+// Builds codonTable: every slot is first set to 'X', then each 4-character
+// record of codonData (3 codon letters + translation) stores its translation
+// in the slot addressed by packCodon(). Always returns true.
+bool codonTableInit() {
+  memset((void*)codonTable, 'X', 32768);
+  const int total=sizeof(codonData);
+  int pos=0;
+  while (pos<total) {
+    const char* rec=codonData+pos;
+    const unsigned short slot=packCodon(rec[0], rec[1], rec[2]);
+    codonTable[slot]=rec[3];
+    pos+=4;
+  }
+  return true;
+  }
+// Uppercases the three stored nucleotides in place and returns their
+// one-letter translation from codonTable. A codon whose packed code falls
+// outside the table (possible when a nucleotide is not a letter) is reported
+// as 'X' instead of reading past the end of the table.
+char Codon::translate() {
+  //toupper() requires a value representable as unsigned char
+  for (byte i=0;i<3;i++) nuc[i]=toupper((unsigned char)nuc[i]);
+  unsigned short aacode=packCodon(nuc[0], nuc[1], nuc[2]);
+  if (aacode>=sizeof(codonTable)) return 'X';
+  return codonTable[aacode];
+  }
+//Simple 1st frame forward translation of a given DNA string.
+// dnastr: nucleotide sequence; NULL or empty yields NULL
+// aalen : receives the number of translated codons (dnalen/3)
+// dnalen: number of characters to translate, 0 meaning strlen(dnastr)
+//Returns a 0-terminated buffer obtained with GMALLOC holding one character
+//per complete codon; trailing bases that do not form a codon are ignored.
+//Codons whose packed code falls outside codonTable (possible when a base is
+//not a letter, e.g. '-') are translated as 'X' rather than read out of bounds.
+char* translateDNA(const char* dnastr, int& aalen, int dnalen) {
+  if (dnastr==NULL || *dnastr==0) return NULL;
+  if (dnalen==0) dnalen=strlen(dnastr);
+  aalen=dnalen/3;
+  char* r=NULL;
+  GMALLOC(r, aalen+1);
+  r[aalen]=0;
+  int ai=0;
+  for (int i=0;i+2<dnalen;i+=3,ai++) {
+    //toupper() requires a value representable as unsigned char
+    const unsigned short aacode=packCodon(toupper((unsigned char)dnastr[i]),
+                                          toupper((unsigned char)dnastr[i+1]),
+                                          toupper((unsigned char)dnastr[i+2]));
+    r[ai]=(aacode<sizeof(codonTable)) ? codonTable[aacode] : 'X';
+  }
+  return r;
+}
diff --git a/src/gdna.cpp b/src/gdna.cpp
new file mode 100644
index 0000000..4d8a4b0
--- /dev/null
+++ b/src/gdna.cpp
@@ -0,0 +1,90 @@
+#include "gdna.h"
+#include <string.h>
+// The three strings below are parallel: position i of IUPAC_DEFS is a
+// recognized nucleotide character, position i of IUPAC_COMP is the character
+// stored as its complement and position i of IUPAC_2BITN is the digit stored
+// as its 2-bit code (see gDnaInit()).
+const char* IUPAC_2BITN ="001133223300000011112200000011000000";
+const char* IUPAC_DEFS ="AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbNnXx-*";
+const char* IUPAC_COMP ="TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvNnXx-*";
+// 2-bit codes of the four bases
+#define A_2BIT 0 // 00
+#define C_2BIT 1 // 01
+#define G_2BIT 2 // 10
+#define T_2BIT 3 // 11
+// Per-character lookup tables filled by gDnaInit()
+static byte ntCompTable[256];
+static byte nt2bit[256]; //maps any character to a 2bit base value (with N = A)
+// Inverse mapping, from a 2-bit code back to its base letter
+static char v_2bit2nt[4] = {'A','C','G','T'};
+// Initializing this flag calls gDnaInit(), which fills the tables above
+static bool gdna_Ready=gDnaInit();
+// Pack n bases into a byte (n can be 1..4), 2 bits per base, each new base
+// being shifted in below the previous ones. nt is advanced past every
+// character consumed; packing stops early at the end of the string.
+byte gdna2bit(char* &nt, int n) {
+byte out = 0;
+while (n && *nt) {
+  n--;
+  out <<= 2;
+  //index as unsigned char: a plain char may be negative for bytes >= 0x80
+  out += nt2bit[(unsigned char)*nt];
+  nt++;
+  }
+#ifdef GDEBUG
+if (n) {
+  GError("Error: attempt to read 6-mer beyond the end of the string!\n");
+  }
+#endif
+return out;
+}
+// Returns the complement of nucleotide character c as stored in ntCompTable.
+// The lookup is done through unsigned char because a plain char may be
+// negative for bytes >= 0x80, which would index before the table.
+char ntComplement(char c) {
+  return ntCompTable[(unsigned char)c];
+  }
+// Returns the base letter for a 2-bit code; only the two lowest bits of
+// v2bit are used.
+char g2bit2base(byte v2bit) {
+  const int code=v2bit & 0x03;
+  return v_2bit2nt[code];
+}
+//In place reverse complement of a nucleotide (sub)sequence.
+// seq : buffer to modify
+// slen: number of characters to process, 0 meaning strlen(seq)
+//Returns seq.
+char* reverseComplement(char* seq, int slen) {
+  if (slen==0) slen=strlen(seq);
+  //first reverse the characters by swapping from both ends
+  int l=0;
+  int r=slen-1;
+  char c; //plain local: the 'register' specifier no longer exists in C++17
+  while (l<r) {
+    c=seq[l];seq[l]=seq[r];
+    seq[r]=c;
+    l++;r--;
+  }
+  //then complement every base
+  for (int i=0;i<slen;i++) seq[i]=ntComplement(seq[i]);
+  return seq;
+  }
+// Fills ntCompTable and nt2bit for every character code. Characters listed in
+// IUPAC_DEFS get the complement and 2-bit code found at the same position of
+// IUPAC_COMP and IUPAC_2BITN; any other non-zero character gets complement
+// 'N' and code 0. Entry 0 of both tables is 0. Does nothing when the tables
+// were already built. Always returns true.
+bool gDnaInit() {
+  if (gdna_Ready) return true;
+  ntCompTable[0]=0;
+  nt2bit[0]=0;
+  for (int ch=1;ch<256;ch++) {
+    const char* hit=strchr(IUPAC_DEFS, ch); //first matching position, if any
+    if (hit!=NULL) {
+      const int k=(int)(hit-IUPAC_DEFS);
+      ntCompTable[ch]=IUPAC_COMP[k];
+      nt2bit[ch]=IUPAC_2BITN[k]-'0';
+    }
+    else { //not a recognized nucleotide character
+      ntCompTable[ch]='N';
+      nt2bit[ch]=0;
+    }
+  }
+  gdna_Ready=true;
+  return true;
+  }
diff --git a/src/gff.cpp b/src/gff.cpp
new file mode 100644
index 0000000..cd57de6
--- /dev/null
+++ b/src/gff.cpp
@@ -0,0 +1,2125 @@
+#include "gff.h"
+//GffNames* GffReader::names=NULL;
+// Shared name registry; created by gffnames_ref() on first use and deleted by
+// gffnames_unref() when its reference count drops to 0.
+GffNames* GffObj::names=NULL;
+//global set of feature names, attribute names etc.
+// -- common for all GffObjs in current application!
+// Size limits for loci, exons and introns (in bases)
+const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
+const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
+const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
+bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
+// Predefined feature type ids
+const int gff_fid_mRNA=0;
+const int gff_fid_transcript=1;
+const int gff_fid_exon=2;
+// Bit masks for GffObj flags
+const uint gfo_flag_HAS_ERRORS = 0x00000001;
+const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
+const uint gfo_flag_IS_GENE = 0x00000004;
+const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
+const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID
+const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
+const uint gfo_flag_DISCARDED = 0x00000100;
+const uint gfo_flag_LST_KEEP = 0x00000200;
+// NOTE(review): mask and shift suggest the level is kept in bits 16..23 of
+// the flags word -- confirm against the accessors in gff.h
+const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
+const byte gfo_flagShift_LEVEL = 16;
+// Takes a reference on the shared GffNames object, creating it first when n
+// is still NULL.
+void gffnames_ref(GffNames* &n) {
+  if (n==NULL) {
+    n=new GffNames();
+  }
+  ++(n->numrefs);
+}
+// Drops one reference to the shared GffNames object; when the count reaches
+// 0 the object is deleted and n is reset to NULL. Calling this with a NULL n
+// is reported through GError().
+void gffnames_unref(GffNames* &n) {
+  if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
+  --(n->numrefs);
+  if (n->numrefs!=0) return; //still in use
+  delete n;
+  n=NULL;
+}
+// Returns the printable name of an exon type code in the range 1..6;
+// any other code (including 0) gives "NULL".
+const char* strExonType(char xtype) {
+  static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"};
+  const bool known=(xtype>0 && xtype<7);
+  return known ? extbl[(int)xtype] : "NULL";
+}
+// Ordering function for GffObj pointers: compares by genomic sequence id,
+// then start coordinate, then level, then end coordinate and finally by ID
+// string. Returns a negative, zero or positive value like strcmp().
+int gfo_cmpByLoc(const pointer p1, const pointer p2) {
+  GffObj& a=*((GffObj*)p1);
+  GffObj& b=*((GffObj*)p2);
+  if (a.gseq_id!=b.gseq_id) return (int)(a.gseq_id-b.gseq_id);
+  if (a.start!=b.start) return (int)(a.start-b.start);
+  if (a.getLevel()!=b.getLevel()) return (int)(a.getLevel()-b.getLevel());
+  if (a.end!=b.end) return (int)(a.end-b.end);
+  return strcmp(a.getID(), b.getID());
+}
+// Looks for attribute name attr in the info string, ignoring anything inside
+// double quotes, and removes the attribute together with its value from info.
+//  attr        : attribute name, optionally ending in '=' or ' '
+//  caseStrict  : selects Gstrcmp (true) or Gstricmp (false) for name matching
+//  enforce_GTF2: when true a value that is not enclosed in double quotes is
+//                reported through GError()
+// Returns a copy of the value made with Gstrdup(), or NULL when the attribute
+// is not found. A found attribute with no value is reported through GError().
+char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
+ //parse a key attribute and remove it from the info string
+ //(only works for attributes that have values following them after ' ' or '=')
+ static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
+ int attrlen=strlen(attr);
+ char cend=attr[attrlen-1];
+ //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
+ //must make sure attr is not found in quoted text
+ char* pos=info;
+ char prevch=0;
+ bool in_str=false;
+ bool notfound=true;
+ int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
+ while (notfound && *pos) {
+ char ch=*pos;
+ if (ch=='"') {
+ in_str=!in_str;
+ pos++;
+ prevch=ch;
+ continue;
+ }
+ // a candidate must start the string or follow a space or ';'
+ if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
+ && strcmpfn(attr, pos, attrlen)==0) {
+ //attr match found
+ //check for word boundary on right
+ char* epos=pos+attrlen;
+ if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
+ notfound=false;
+ break;
+ }
+ //not a perfect match, move on
+ pos=epos;
+ prevch=*(pos-1);
+ continue;
+ }
+ //not a match or in_str
+ prevch=ch;
+ pos++;
+ }
+ if (notfound) return NULL;
+ // vp: start of the value, after the name and any spaces
+ char* vp=pos+attrlen;
+ while (*vp==' ') vp++;
+ if (*vp==';' || *vp==0)
+ GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
+ bool dq_enclosed=false; //value string enclosed by double quotes
+ if (*vp=='"') {
+ dq_enclosed=true;
+ vp++;
+ }
+ if (enforce_GTF2 && !dq_enclosed)
+ GError(GTF2_ERR,attr, dupline);
+ // vend: first character past the value
+ char* vend=vp;
+ if (dq_enclosed) {
+ while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
+ }
+ else {
+ while (*vend!=';' && *vend!=0) vend++;
+ }
+ if (enforce_GTF2 && *vend!='"')
+ GError(GTF2_ERR, attr, dupline);
+ // NOTE(review): presumably Gstrdup(from,to) copies the inclusive range
+ // from..to -- confirm in GBase
+ char *r=Gstrdup(vp, vend-1);
+ //-- now remove this attribute from the info string
+ while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
+ if (*vend==0) vend--;
+ // shift the remainder of info (terminator included) over the removed text
+ for (char *src=vend, *dest=pos;;src++,dest++) {
+ *dest=*src;
+ if (*src==0) break;
+ }
+ return r;
+// Scratch buffer holding the lower-cased feature type of the line being
+// parsed by the GffLine constructor.
+static char fnamelc[128];
+// Parses one GFF3/GTF text line.
+//  reader: supplies the transcriptsOnly and gff_warns settings and collects
+//          the IDs of discarded features in discarded_ids
+//  l     : the 0-terminated text of the line
+// Two private copies of the line are kept: `line` is split in place into its
+// tab separated columns, `dupline` stays intact for error messages.
+// `skip` is left true unless the line is fully recognized, i.e. it has all 9
+// columns, valid coordinates and an ID or at least one parent.
+// Malformed score, strand or target coordinates are reported with GError().
+GffLine::GffLine(GffReader* reader, const char* l) {
+ llen=strlen(l);
+ GMALLOC(line,llen+1);
+ memcpy(line, l, llen+1);
+ GMALLOC(dupline, llen+1);
+ memcpy(dupline, l, llen+1);
+ skip=true;
+ gseqname=NULL;
+ track=NULL;
+ ftype=NULL;
+ info=NULL;
+ _parents=NULL;
+ _parents_len=0;
+ num_parents=0;
+ parents=NULL;
+ is_gff3=false;
+ is_cds=false;
+ is_transcript=false;
+ is_exon=false;
+ is_gene=false;
+ exontype=0;
+ gene_id=NULL;
+ gene_name=NULL;
+ qstart=0;
+ qend=0;
+ qlen=0;
+ //split the line in place on tabs; t[k] points to column k
+ char* t[9];
+ int i=0;
+ int tidx=1;
+ t[0]=line;
+ while (line[i]!=0) {
+ if (line[i]=='\t') {
+ line[i]=0;
+ t[tidx]=line+i+1;
+ tidx++;
+ if (tidx>8) break;
+ }
+ i++;
+ }
+ //tidx is the number of columns found; all 9 are needed because t[8]
+ //(the attributes column) is read below and would otherwise be uninitialized
+ if (tidx<9) { // ignore non-GFF lines
+ // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
+ return;
+ }
+ gseqname=t[0];
+ track=t[1];
+ ftype=t[2];
+ info=t[8];
+ char* p=t[3];
+ if (!parseUInt(p,fstart)) {
+ //chromosome_band entries in Flybase
+ GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
+ return;
+ }
+ p=t[4];
+ if (!parseUInt(p,fend)) {
+ GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
+ return;
+ }
+ if (fend<fstart) Gswap(fend,fstart); //make sure fstart<=fend, always
+ p=t[5];
+ if (p[0]=='.' && p[1]==0) {
+ score=0;
+ }
+ else {
+ if (!parseDouble(p,score))
+ GError("Error parsing feature score from GFF line:\n%s\n",l);
+ }
+ strand=*t[6];
+ if (strand!='+' && strand!='-' && strand!='.')
+ GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
+ phase=*t[7]; // must be '.', '0', '1' or '2'
+ // exon/CDS/mrna filter
+ //classify the feature by its lower-cased type name
+ strncpy(fnamelc, ftype, 127);
+ fnamelc[127]=0;
+ strlower(fnamelc); //convert to lower case
+ bool is_t_data=false;
+ if (strstr(fnamelc, "utr")!=NULL) {
+ exontype=exgffUTR;
+ is_exon=true;
+ is_t_data=true;
+ }
+ else if (endsWith(fnamelc, "exon")) {
+ exontype=exgffExon;
+ is_exon=true;
+ is_t_data=true;
+ }
+ else if (strstr(fnamelc, "stop") &&
+ (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
+ exontype=exgffStop;
+ is_cds=true; //though some place it outside the last CDS segment
+ is_t_data=true;
+ }
+ else if (strstr(fnamelc, "start") &&
+ ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
+ exontype=exgffStart;
+ is_cds=true;
+ is_t_data=true;
+ }
+ else if (strcmp(fnamelc, "cds")==0) {
+ exontype=exgffCDS;
+ is_cds=true;
+ is_t_data=true;
+ }
+ else if (startsWith(fnamelc, "intron") || endsWith(fnamelc, "intron")) {
+ exontype=exgffIntron;
+ }
+ else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
+ is_gene=true;
+ is_t_data=true; //because its name will be attached to parented transcripts
+ }
+ else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
+ is_transcript=true;
+ is_t_data=true;
+ }
+if (reader->transcriptsOnly && !is_t_data) {
+ //remember the ID of the unwanted feature, then skip the line
+ char* id=extractAttr("ID=");
+ if (id==NULL) id=extractAttr("transcript_id");
+ //GMessage("Discarding non-transcript line:\n%s\n",l);
+ if (id!=NULL) {
+ reader->discarded_ids.Add(id, new int(1));
+ GFREE(id);
+ }
+ return; //skip this line, unwanted feature name
+ }
+ ID=extractAttr("ID=",true);
+ char* Parent=extractAttr("Parent=",true);
+ //the presence of ID= or Parent= is what marks the line as GFF3
+ is_gff3=(ID!=NULL || Parent!=NULL);
+ if (is_gff3) {
+ //parse as GFF3
+ if (ID!=NULL) {
+ //has ID attr so it's likely to be a parent feature
+ //look for explicit gene name
+ gene_name=extractAttr("gene_name=");
+ if (gene_name==NULL) {
+ gene_name=extractAttr("geneName=");
+ if (gene_name==NULL) {
+ gene_name=extractAttr("gene_sym=");
+ if (gene_name==NULL) {
+ gene_name=extractAttr("gene=");
+ }
+ }
+ }
+ gene_id=extractAttr("geneID=");
+ if (gene_id==NULL) {
+ gene_id=extractAttr("gene_id=");
+ }
+ if (is_gene) {
+ //special case: keep the Name and ID attributes of the gene feature
+ if (gene_name==NULL)
+ gene_name=extractAttr("Name=");
+ if (gene_id==NULL) //the ID is also gene_id in this case
+ gene_id=Gstrdup(ID);
+ //skip=false;
+ //return;
+ GFREE(Parent); //TMI, we really don't care about gene Parents?
+ } //gene feature
+ }// has GFF3 ID
+ if (Parent!=NULL) {
+ //keep Parent attr
+ //parse multiple parents
+ //first pass: count the comma separated parent IDs
+ num_parents=1;
+ p=Parent;
+ int last_delim_pos=-1;
+ while (*p!=';' && *p!=0) {
+ if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
+ num_parents++;
+ last_delim_pos=(p-Parent);
+ }
+ p++;
+ }
+ _parents_len=p-Parent+1;
+ _parents=Parent;
+ GMALLOC(parents, num_parents*sizeof(char*));
+ parents[0]=_parents;
+ int i=1;
+ //second pass: cut the buffer at each comma, parents[] points into it
+ if (last_delim_pos>0) {
+ for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
+ if (*p==',') {
+ char* ep=p-1;
+ while (*ep==' ' && ep>_parents) ep--;
+ *(ep+1)=0; //end the string there
+ parents[i]=p+1;
+ i++;
+ }
+ }
+ }
+ } //has Parent field
+ } //GFF3
+ else { // GTF-like expected
+ Parent=extractAttr("transcript_id",true);
+ if (Parent!=NULL) { //GTF2 format detected
+ if (is_transcript) {
+ // atypical GTF with a parent transcript line declared
+ ID=Parent;
+ Parent=NULL;
+ }
+ gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
+ if (gene_id==NULL)
+ gene_id=extractAttr("geneid");
+ gene_name=extractAttr("gene_name");
+ if (gene_name==NULL) {
+ gene_name=extractAttr("gene_sym");
+ if (gene_name==NULL) {
+ gene_name=extractAttr("gene");
+ if (gene_name==NULL)
+ gene_name=extractAttr("genesymbol");
+ }
+ }
+ //prepare for parseAttr by adding '=' character instead of spaces for all attributes
+ //after the attribute name
+ p=info;
+ bool noed=true; //not edited after the last delim
+ bool nsp=false; //non-space found after last delim
+ while (*p!=0) {
+ if (*p==' ') {
+ if (nsp && noed) {
+ *p='=';
+ noed=false;
+ p++;
+ continue;
+ }
+ }
+ else nsp=true; //non-space
+ if (*p==';') { noed=true; nsp=false; }
+ p++;
+ }
+ } //GTF2 detected (no parent line)
+ else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
+ //char* fexon=strstr(fnamelc, "exon");
+ //if (fexon!=NULL) {
+ if (exontype==exgffExon) {
+ if (startsWith(track,"jigsaw")) {
+ is_cds=true;
+ strcpy(track,"jigsaw");
+ p=strchr(info,';');
+ if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
+ else { Parent=Gstrdup(info,p-1);
+ info=p+1;
+ }
+ }
+ } //exon feature?
+ if (Parent==NULL && exontype>=exgffCDS &&
+ (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
+ //one word ID ? really desperate attempt to parse it here
+ Parent=Gstrdup(info,info+i-1);
+ info=NULL; //discard anything else on the line
+ }
+ }
+ if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
+ _parents=Parent;
+ GMALLOC(parents,sizeof(char*));
+ num_parents=1;
+ parents[0]=_parents;
+ }
+ } //GTF-like
+ //parse other potentially useful features
+ if (is_gff3) {
+ if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
+ p+=7;
+ //skip the target name; a space must separate it from the coordinates
+ while (*p!=';' && *p!=0 && *p!=' ') p++;
+ if (*p!=' ') {
+ GError("Error parsing target coordinates from GFF line:\n%s\n",l);
+ }
+ if (!parseUInt(p,qstart))
+ GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
+ if (*p!=' ') {
+ GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
+ }
+ p++;
+ if (!parseUInt(p,qend))
+ GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
+ }
+ if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
+ p+=5;
+ if (!parseUInt(p,qstart))
+ GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
+ if (*p!='-') {
+ GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
+ }
+ p++;
+ if (!parseUInt(p,qend))
+ GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
+ if (*p=='|' || *p==':') {
+ p++;
+ if (!parseUInt(p,qlen))
+ GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
+ }
+ }//has Qreg attr
+ if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
+ p+=5;
+ if (!parseUInt(p,qlen))
+ GError("Error parsing target length from GFF Qlen:\n%s\n",l);
+ }
+ }//parsing some useful attributes in GFF3 records
+ if (ID==NULL && parents==NULL) {
+ if (reader->gff_warns)
+ GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
+ return; //skip
+ }
+ skip=false;
+}
+// Sets the coding region of this feature to cd_start..cd_end, clipped to the
+// feature's own start..end. The phase is recorded from whichever boundary
+// lies inside the feature on the side matching the strand (start for '+',
+// end for '-'). The object is flagged as a transcript with exon subfeatures;
+// a mono-feature object gets a single exon covering the whole feature, or has
+// its existing first segment retyped as an exon.
+void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
+  const bool startInside=(cd_start>=start);
+  const bool endInside=(cd_end<=end);
+  CDstart=startInside ? cd_start : start;
+  if (startInside && strand=='+') CDphase=phase;
+  CDend=endInside ? cd_end : end;
+  if (endInside && strand=='-') CDphase=phase;
+  isTranscript(true);
+  exon_ftype_id=gff_fid_exon;
+  if (!monoFeature()) return;
+  if (exons.Count()!=0) {
+    exons[0]->exontype=exgffExon;
+  }
+  else {
+    addExon(start, end,0,'.',0,0,false,exgffExon);
+  }
+}
+// Adds the subfeature described by a parsed GFF line to this object.
+//  reader    : consulted for the gff_warns setting
+//  gl        : the parsed line (coordinates, score, phase, type, attributes)
+//  keepAttr  : when true the attributes of the line are parsed and stored
+//  noExonAttr: with keepAttr, store the attributes at the level of this
+//              object (only if it has none yet) instead of on the exon
+// Returns the index of the exon that received the segment, or -1 when the
+// subfeature was discarded.
+int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
+ //this will make sure we have the right subftype_id!
+ //int subf_id=-1;
+ // a CDS-type line turns this object into a transcript
+ if (!isTranscript() && gl->is_cds) {
+ isTranscript(true);
+ exon_ftype_id=gff_fid_exon;
+ if (exons.Count()==1) exons[0]->exontype=exgffExon;
+ }
+ if (isTranscript()) {
+ if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
+ if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
+ else exon_ftype_id=names->feats.addName(gl->ftype);
+ }
+ //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
+ if (gl->exontype==0 && !gl->is_transcript) {
+ //extraneous mRNA feature, discard
+ if (reader->gff_warns)
+ GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n",
+ gl->ftype, gffID);
+ return -1;
+ }
+ }
+ else { //non-mRNA parent feature, check this subf type
+ int subf_id=names->feats.addName(gl->ftype);
+ if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
+ exon_ftype_id=subf_id;
+ else {
+ if (exon_ftype_id!=subf_id) {
+ //
+ if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
+ //the existing exon was just a dummy one created by default, discard it
+ exons.Clear();
+ covlen=0;
+ exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
+ }
+ else { //multiple subfeatures, prefer those with
+ if (reader->gff_warns)
+ GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
+ names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
+ if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
+ if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
+ exon_ftype_id=subf_id;
+ exons.Clear();
+ covlen=0;
+ }
+ else { //discard new feature
+ if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
+ return -1; //skip this 2nd subfeature type for this parent!
+ }
+ }
+ } //incoming subfeature is of different type
+ } //new subfeature type
+ } //non-mRNA parent
+ // the coordinate-based overload does the actual insertion or merge
+ int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
+ gl->qstart,gl->qend, gl->is_cds, gl->exontype);
+ if (eidx<0) return eidx; //this should never happen
+ if (keepAttr) {
+ if (noExonAttr) {
+ if (attrs==NULL) //place the parsed attributes directly at transcript level
+ parseAttrs(attrs, gl->info);
+ }
+ else { //need all exon-level attributes
+ parseAttrs(exons[eidx]->attrs, gl->info, true);
+ }
+ }
+ return eidx;
+// Adds the segment segstart..segend to this object, merging it into an
+// existing exon when the two overlap or are adjacent.
+//  sc      : score of the segment
+//  fr      : phase character of the segment
+//  qs, qe  : query coordinates (0 when absent); swapped if given reversed
+//  iscds   : the segment is coding, so the CDS boundaries are updated
+//  exontype: exon type code of the segment (exgff* values)
+// Returns the index of the exon holding the segment, or -1 when it could not
+// be added (the object is then flagged as having errors).
+int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
+ if (exons.Count()==0) {
+ if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
+ if (exon_ftype_id<0) {
+ exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
+ }
+ }
+ //special treatment of start/stop codon features, they might be broken/split between exons
+ //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
+ //so we should not trust the end coordinate for such features
+ if (exontype==exgffStart || exontype==exgffStop) {
+ // collapse the codon to a single position on the side given by the strand
+ if (strand=='-') segstart=segend;
+ else segend=segstart;
+ if (exontype==exgffStart) {
+ if (CDstart==0 || segstart<CDstart) CDstart=segstart;
+ }
+ else {
+ if (segstart>CDend) CDend=segstart;
+ }
+ }
+ else if (iscds) { //update CDS anchors:
+ if (CDstart==0 || segstart<CDstart) {
+ CDstart=segstart;
+ if (exontype==exgffCDS && strand=='+') CDphase=fr;
+ }
+ if (segend>CDend) {
+ if (exontype==exgffCDS && strand=='-') CDphase=fr;
+ CDend=segend;
+ }
+ }
+ else { // not a CDS/start/stop
+ isCDS=false;
+ }
+ if (qs || qe) {
+ if (qs>qe) Gswap(qs,qe);
+ if (qs==0) qs=1;
+ }
+ int ovlen=0;
+ if (exontype>0) { //check for overlaps between exon-type segments
+ // oi: index of an existing exon overlapping or adjacent to the new segment
+ // (negative when there is none); ovlen receives the overlap length
+ int oi=exonOverlapIdx(segstart, segend, &ovlen);
+ if (oi>=0) { //overlap existing segment
+ if (ovlen==0) {
+ //adjacent segments will be merged
+ //e.g. CDS to (UTR|exon)
+ if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
+ (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
+ expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
+ return oi;
+ }
+ //CDS adjacent to stop_codon: UCSC does (did?) this
+ if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
+ (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
+ expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
+ return oi;
+ }
+ }
+ //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
+ // start_codon within (CDS|exon)
+ if (exons[oi]->start<=segstart && exons[oi]->end>=segend) {
+ //larger segment given first, now the smaller included one is redundant
+ if (exons[oi]->exontype>exontype &&
+ !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
+ return oi; //only used to store attributes from current GffLine
+ }
+ else {
+ if (gff_show_warnings && (exons[oi]->start<segstart || exons[oi]->end>segend)) {
+ GMessage("GFF Warning: unusual segment inclusion: %s(%d-%d) within %s(%d-%d) (ID=%s)\n",
+ strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype),
+ exons[oi]->start, exons[oi]->end, this->gffID);
+ }
+ return oi;
+ }
+ }
+ if (exontype>exons[oi]->exontype &&
+ segstart<=exons[oi]->start && segend>=exons[oi]->end &&
+ !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
+ //smaller segment given first, so we have to enlarge it
+ expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
+ //this should also check for overlapping next exon (oi+1) ?
+ return oi;
+ }
+ //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
+ //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
+ //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
+ // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
+ /*
+ if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
+ if (gff_show_warnings)
+ GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
+ segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
+ hasErrors(true);
+ return -1; //segment NOT added
+ }
+ */
+ // merge unless this is a 1-2 base overlap between two CDS segments
+ if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
+ if (gff_show_warnings)
+ GMessage("GFF Warning: merging overlapping/adjacent feature segment %s (%d-%d) with %s (%d-%d) for GFF ID %s on %s\n",
+ strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype), exons[oi]->start, exons[oi]->end, gffID, getGSeqName());
+ expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
+ return oi;
+ }
+ // else add the segment if the overlap is small and between two CDS segments
+ //TODO: we might want to add an attribute here with the slippage coordinate and size?
+ // the overlapping bases are counted once only in the covered length
+ covlen-=ovlen;
+ }//overlap or adjacent to existing segment
+ } //check for overlap
+ // --- no overlap, or accepted micro-overlap (ribosomal slippage)
+ // create & add the new segment
+ /*
+ if (start>0 && exontype==exgffCDS && exons.Count()==0) {
+ //adding a CDS directly as the first subfeature of a declared parent
+ segstart=start;
+ segend=end;
+ }
+ */
+ GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
+ int eidx=exons.Add(enew);
+ if (eidx<0) {
+ //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
+ if (gff_show_warnings)
+ GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
+ segstart, segend, gffID);
+ delete enew;
+ hasErrors(true);
+ return -1;
+ }
+ covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
+ //adjust parent feature coordinates to contain this exon
+ if (start==0 || start>exons.First()->start) {
+ start=exons.First()->start;
+ }
+ if (end<exons.Last()->end) end=exons.Last()->end;
+ return eidx;
+// Enlarges existing exon oi so that it also covers segstart..segend and
+// retypes it as exontype.
+//  sc    : new score, applied only when non-zero
+//  fr    : phase, applied only when exontype is exgffCDS
+//  qs, qe: query coordinates used to widen the exon's query range (0 = none)
+// Following exons that end up fully covered and have a lower exon type are
+// absorbed and deleted. The feature boundaries, the covered length and the
+// per-sequence statistics reachable through uptr are updated accordingly.
+void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
+ //oi is the index of the *first* overlapping segment found that must be enlarged
+ covlen-=exons[oi]->len();
+ if (segstart<exons[oi]->start)
+ exons[oi]->start=segstart;
+ if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
+ if (segend>exons[oi]->end)
+ exons[oi]->end=segend;
+ if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
+ //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
+ if (sc!=0) exons[oi]->score=sc;
+ covlen+=exons[oi]->len();
+ //if (exons[oi]->exontype< exontype) -- always true
+ exons[oi]->exontype = exontype;
+ if (exontype==exgffCDS) exons[oi]->phase=fr;
+ //we must check if any more exons are also overlapping this
+ int ni=oi+1; //next exon index after oi
+ while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
+ //only allow this if next segment is fully included, and a subordinate
+ if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
+/* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
+chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
+chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
+chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
+*/
+ //absorb the included segment: keep the widest query range, then drop it
+ //(ni is not advanced: after the deletion it designates the next exon)
+ if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
+ if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
+ exons.Delete(ni);
+ }
+ else {
+ if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
+ exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
+ //hasErrors(true);
+ break;
+ }
+ }
+ // -- make sure any other related boundaries are updated:
+ start=exons.First()->start;
+ end=exons.Last()->end;
+ if (uptr!=NULL) { //collect stats about the underlying genomic sequence
+ GSeqStat* gsd=(GSeqStat*)uptr;
+ if (start<gsd->mincoord) gsd->mincoord=start;
+ if (end>gsd->maxcoord) gsd->maxcoord=end;
+ if (this->len()>gsd->maxfeat_len) {
+ gsd->maxfeat_len=this->len();
+ gsd->maxfeat=this;
+ }
+ }
+}
+// Removes the exon at index idx (out of range indexes are ignored), subtracts
+// its length from the covered length and, while exons remain, resets the
+// feature boundaries (and the CDS boundaries of a CDS-only feature) to the
+// first and last remaining exon. When the last exon is removed the
+// boundaries are left unchanged, as in removeExon(GffExon*).
+void GffObj::removeExon(int idx) {
+  if (idx<0 || idx>=exons.Count()) return;
+  int segstart=exons[idx]->start;
+  int segend=exons[idx]->end;
+  exons.Delete(idx);
+  covlen -= (int)(segend-segstart)+1;
+  if (exons.Count() > 0) { //First()/Last() must not be used on an empty list
+    start=exons.First()->start;
+    end=exons.Last()->end;
+    if (isCDS) { CDstart=start; CDend=end; }
+  }
+}
+// Removes the exon object p, if it belongs to this feature: its length is
+// subtracted from the covered length and, while exons remain, the feature
+// boundaries (and the CDS boundaries of a CDS-only feature) are reset to the
+// first and last remaining exon.
+void GffObj::removeExon(GffExon* p) {
+  int idx=0;
+  while (idx<exons.Count() && exons[idx]!=p) idx++;
+  if (idx>=exons.Count()) return; //p is not one of our exons
+  const int segstart=exons[idx]->start;
+  const int segend=exons[idx]->end;
+  exons.Delete(idx);
+  covlen -= (int)(segend-segstart)+1;
+  if (exons.Count() == 0) return; //nothing left to derive boundaries from
+  start=exons.First()->start;
+  end=exons.Last()->end;
+  if (isCDS) {
+    CDstart=start;
+    CDend=end;
+  }
+}
+GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
+ GSeg(0,0), exons(true,true,false), children(1,false) { /*
+  Builds a GffObj from one parsed input line.
+    gfrd       - owning reader; must not be NULL (GError is called otherwise).
+                 Its names table is set to this object's table when still unset.
+    gffline    - parsed line supplying coordinates, strand, names and flags.
+    keepAttr   - when true, gffline->info is parsed into attrs for records that
+                 keep their own ID; it is also forwarded to addExon().
+    noExonAttr - only forwarded to addExon().
+  Two cases are handled:
+   * a parented line that is not a transcript: the object stands in for the
+     parent (gffID = parents[0], createdByExon set, the line added as first
+     exon), unless it carries an ID and is not a GFF3 exon-like feature, in
+     which case it keeps its own ID;
+   * any other line: it must carry a non-empty ID (GError otherwise) and
+     becomes a feature in its own right.
+  Finally gene_name/geneID are copied and a GSeqStat for the sequence is
+  added to gfrd->gseqstats (AddIfNew) and remembered in uptr.
+ */
+ xstart=0;
+ xend=0;
+ xstatus=0;
+ partial=false;
+ isCDS=false;
+ uptr=NULL;
+ ulink=NULL;
+ parent=NULL;
+ udata=0;
+ flags=0;
+ CDstart=0;
+ CDend=0;
+ CDphase=0;
+ geneID=NULL;
+ gene_name=NULL;
+ attrs=NULL;
+ gffID=NULL;
+ track_id=-1;
+ gseq_id=-1;
+ ftype_id=-1;
+ exon_ftype_id=-1;
+ strand='.';
+ if (gfrd==NULL)
+ GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
+ gffnames_ref(names);
+ if (gfrd->names==NULL) gfrd->names=names;
+ //qlen=0;qstart=0;qend=0;
+ gscore=0;
+ uscore=0;
+ covlen=0;
+ qcov=0;
+ start=gffline->fstart;
+ end=gffline->fend;
+ gseq_id=names->gseqs.addName(gffline->gseqname);
+ track_id=names->tracks.addName(gffline->track);
+ strand=gffline->strand;
+ qlen=gffline->qlen;
+ qstart=gffline->qstart;
+ qend=gffline->qend;
+ //setup flags from gffline
+ isCDS=gffline->is_cds; //for now
+ isGene(gffline->is_gene);
+ isTranscript(gffline->is_transcript || gffline->exontype!=0);
+ //fromGff3(gffline->is_gff3);
+ if (gffline->parents!=NULL && !gffline->is_transcript) {
+ //GTF style -- create a GffObj directly by subfeature
+ //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
+ if (gffline->exontype!=0) { //recognized exon-like feature
+ ftype_id=gff_fid_transcript; //so this is some sort of transcript
+ exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
+ }
+ else {//unrecognized subfeatures
+ //make this GffObj of the same feature type
+ ftype_id=names->feats.addName(gffline->ftype);
+ }
+ if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
+ gffID=Gstrdup(gffline->parents[0]);
+ this->createdByExon(true);
+ //this is likely the first exon/segment of the feature
+ addExon(gfrd, gffline, keepAttr, noExonAttr);
+ }
+ else { //a parented feature with an ID: orphan or premature GFF3 subfeature line
+ if (gffline->is_gff3 && gffline->exontype!=0) {
+ //premature exon given before its parent transcript
+ //create the transcript entry here
+ gffID=Gstrdup(gffline->parents[0]);
+ this->createdByExon(true);
+ //this is the first exon/segment of the transcript
+ addExon(gfrd, gffline, keepAttr, noExonAttr);
+ }
+ else { //unrecognized non-exon feature ? use the ID instead
+ this->hasGffID(true);
+ gffID=Gstrdup(gffline->ID);
+ if (keepAttr) this->parseAttrs(attrs, gffline->info);
+ }
+ }
+ } //non-transcript parented subfeature given directly
+ else {
+ //non-parented feature OR a recognizable transcript
+ //create a parent feature in its own right
+ gscore=gffline->score;
+ if (gffline->ID==NULL || gffline->ID[0]==0)
+ GError("Error: no ID found for GFF record start\n");
+ this->hasGffID(true);
+ gffID=Gstrdup(gffline->ID); //there must be an ID here
+ //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
+ //else
+ ftype_id=names->feats.addName(gffline->ftype);
+ if (gffline->is_transcript)
+ exon_ftype_id=gff_fid_exon;
+ if (keepAttr) this->parseAttrs(attrs, gffline->info);
+ }//no parent
+ if (gffline->gene_name!=NULL) {
+ gene_name=Gstrdup(gffline->gene_name);
+ }
+ if (gffline->gene_id) {
+ geneID=Gstrdup(gffline->gene_id);
+ }
+ else if (gffline->is_transcript && gffline->parents) { //fall back to the first parent ID as gene ID
+ geneID=Gstrdup(gffline->parents[0]);
+ }
+ //GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
+ GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,gffline->gseqname), true);
+ uptr=gsd; //kept so that later code can update the per-sequence statistics
+ /*
+ if (start<gsd->mincoord) gsd->mincoord=start;
+ if (end>gsd->maxcoord) gsd->maxcoord=end;
+ if (this->len()>gsd->maxfeat_len) {
+ gsd->maxfeat_len=this->len();
+ gsd->maxfeat=this;
+ }
+ */
+// Returns the next usable parsed line, or NULL at end of input.
+// A line that is still pending (gffline!=NULL) is returned again, so the
+// caller should free gffline and reset it to NULL after processing it.
+// Skipped silently: lines whose first non-space character is '#', lines
+// shorter than 10 characters, and lines the GffLine constructor flags as skip.
+// Lines with neither an ID nor a parent are dropped, with a warning when
+// gff_warns is set.
+GffLine* GffReader::nextGffLine() {
+  if (gffline!=NULL) return gffline; //caller should free gffline after processing
+  while (gffline==NULL) {
+    int llen=0;
+    buflen=GFF_LINELEN-1;
+    char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
+    if (l==NULL) {
+      return NULL; //end of file
+    }
+    // _crc_result.process_bytes( linebuf, llen );
+    int ns=0; //first nonspace position
+    //isspace() is undefined for negative char values, hence the cast
+    while (l[ns]!=0 && isspace((unsigned char)l[ns])) ns++;
+    if (l[ns]=='#' || llen<10) continue;
+    gffline=new GffLine(this, l);
+    if (gffline->skip) {
+      delete gffline;
+      gffline=NULL;
+      continue;
+    }
+    if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
+      //this might not be needed, already checked in the GffLine constructor
+      if (gff_warns)
+        GMessage("Warning: malformed GFF line, no parent or record Id (skipping)\n");
+      delete gffline;
+      gffline=NULL; //the loop condition then fetches the next line
+    }
+  }
+  return gffline;
+}
+// Builds the composite key "<id>~<ctg>" in a freshly allocated buffer.
+// The caller must free the returned pointer.
+char* GffReader::gfoBuildId(const char* id, const char* ctg) {
+  const int idChars=strlen(id);
+  char* key=NULL;
+  GMALLOC(key, idChars+strlen(ctg)+2); //id + '~' + ctg + terminating NUL
+  strcpy(key, id);
+  char* tail=key+idChars;
+  *tail='~'; //overwrites the NUL written by the first strcpy
+  strcpy(tail+1, ctg);
+  return key;
+}
+// Removes the phash entry stored under the composite key "<id>~<ctg>".
+// NOTE(review): gfoAdd() and gfoFind() in this file use the bare ID as the
+// phash key, so this composite key may never match -- confirm against callers.
+void GffReader::gfoRemove(const char* id, const char* ctg) {
+  char* key=gfoBuildId(id,ctg);
+  phash.Remove(key);
+  GFREE(key); //the key buffer belongs to us
+}
+// Registers gfo in phash under its gffID and returns the stored pointer.
+// All objects sharing an ID live in one GPVec; the vector is created on
+// first use and (re-)stored in the hash on every call.
+GffObj* GffReader::gfoAdd(GffObj* gfo) {
+  GPVec<GffObj>* bucket=phash.Find(gfo->gffID);
+  if (bucket==NULL) {
+    bucket=new GPVec<GffObj>(false);
+  }
+  const int pos=bucket->Add(gfo);
+  phash.Add(gfo->gffID, bucket);
+  return bucket->Get(pos);
+}
+// Appends gfo to an already looked-up same-ID list and returns the stored
+// pointer; phash itself is not touched.
+GffObj* GffReader::gfoAdd(GPVec<GffObj>& glst, GffObj* gfo) {
+  const int pos=glst.Add(gfo);
+  return glst[pos];
+}
+// Looks up a previously stored GffObj by ID and returns the first match.
+//   id     - key for phash
+//   ctg    - when not NULL, the candidate's sequence name must be equal
+//   glst   - when not NULL, receives the whole same-ID list (may be NULL)
+//   strand - when non-zero, must equal the candidate's strand unless the
+//            candidate's strand is '.'
+//   start  - when >0, the candidate must start within GFF_MAX_LOCUS of it
+//   end    - when >0 (and start>0), the candidate must overlap start..end
+// Returns NULL when nothing satisfies the given criteria.
+GffObj* GffReader::gfoFind(const char* id, const char* ctg,
+    GPVec<GffObj>** glst, char strand, uint start, uint end) {
+  GPVec<GffObj>* sameId=phash.Find(id);
+  if (glst) *glst=sameId;
+  if (sameId==NULL) return NULL;
+  for (int k=0;k<sameId->Count();k++) {
+    GffObj* cand=sameId->Get(k);
+    if (ctg!=NULL && strcmp(ctg, cand->getGSeqName())!=0)
+      continue; //different sequence
+    if (strand && cand->strand!='.' && strand != cand->strand)
+      continue; //conflicting strand
+    if (start>0) {
+      if (abs((int)start-(int)cand->start)> (int)GFF_MAX_LOCUS)
+        continue; //too far away to be the same locus
+      if (end>0 && (cand->start>end || cand->end<start))
+        continue; //no overlap with the requested range
+    }
+    //must be the same transcript, according to given comparison criteria
+    return cand;
+  }
+  return NULL;
+}
+GffObj* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) { //NOTE(review): the closing marker on the last line suggests this whole function is commented out (dead code); the opening marker is not visible here -- confirm
+ GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
+ GffObj* r=NULL;
+ if (replaceidx>=0) { //overwrite the gflst slot at replaceidx
+ gflst.Put(replaceidx,newgfo);
+ r=gfoAdd(newgfo);
+ }
+ else { //no slot given: append to gflst
+ int gfoidx=gflst.Add(newgfo); //gfoidx is never read afterwards
+ r=gfoAdd(newgfo);
+ }
+ return r;
+} */
+// Links newgfo as a child of parent and returns newgfo.
+// The child's parent pointer is set only when still NULL; its level becomes
+// parent level + 1. If the parent is a gene, gene_name and geneID that the
+// child lacks are duplicated from the parent.
+GffObj* GffReader::updateParent(GffObj* newgfo, GffObj* parent) {
+  //assert(parent);
+  //assert(newgfo);
+  parent->children.Add(newgfo);
+  if (newgfo->parent==NULL) {
+    newgfo->parent=parent;
+  }
+  newgfo->setLevel(parent->getLevel()+1);
+  if (!parent->isGene()) return newgfo;
+  //inherit gene identification from the gene parent, never overwriting
+  if (newgfo->gene_name==NULL && parent->gene_name!=NULL) {
+    newgfo->gene_name=Gstrdup(parent->gene_name);
+  }
+  if (newgfo->geneID==NULL && parent->geneID!=NULL) {
+    newgfo->geneID=Gstrdup(parent->geneID);
+  }
+  return newgfo;
+}
+// Creates a GffObj for gffline, appends it to gflst and indexes it by ID.
+//   parent - when not NULL, the new object is attached to it (updateParent)
+//   pexon  - when not NULL (and parent given), that exon is removed from parent
+//   glst   - when not NULL, the already looked-up same-ID list to append to;
+//            otherwise the object is registered through phash
+// Returns the pointer stored in the ID index.
+// (Duplicate-ID counting through tids remains disabled, as before.)
+GffObj* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
+    GffObj* parent, GffExon* pexon, GPVec<GffObj>* glst) {
+  GffObj* created=new GffObj(this, gffline, keepAttr, noExonAttr);
+  gflst.Add(created);
+  GffObj* indexed=NULL;
+  if (glst) {
+    indexed=gfoAdd(*glst, created);
+  }
+  else {
+    indexed=gfoAdd(created);
+  }
+  if (parent==NULL) return indexed;
+  updateParent(indexed, parent);
+  if (pexon!=NULL) {
+    parent->removeExon(pexon);
+  }
+  return indexed;
+}
+// Overwrites the main properties of an existing record with those of gffline:
+// feature type, start/end, and the gene / transcript / has-ID flags.
+// With keepAttr the record's attributes are cleared and re-parsed from
+// gffline->info. Returns prevgfo, or NULL when prevgfo is NULL.
+GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline,
+    bool keepAttr) {
+  if (prevgfo==NULL) return NULL;
+  GffObj& rec=*prevgfo;
+  //prevgfo->gffobj->createdByExon(false);
+  rec.ftype_id=rec.names->feats.addName(gffline->ftype);
+  rec.start=gffline->fstart;
+  rec.end=gffline->fend;
+  rec.isGene(gffline->is_gene);
+  rec.isTranscript(gffline->is_transcript || gffline->exontype!=0);
+  rec.hasGffID(gffline->ID!=NULL);
+  if (!keepAttr) return prevgfo;
+  if (rec.attrs!=NULL) {
+    rec.attrs->Clear();
+  }
+  rec.parseAttrs(rec.attrs, gffline->info);
+  return prevgfo;
+}
+// Adds gffline as an exon/segment of the existing record prevgfo.
+//   pex        - pool of ID-carrying non-exon subfeatures; prevgfo is added
+//                to it when the line has an ID, is not exon-like and
+//                addExon() succeeded
+//   noExonAttr - forwarded to addExon(); its negation is passed as keepAttr
+// Returns false only when the segment lies more than GFF_MAX_LOCUS away from
+// prevgfo (likely a duplicate ID); without gff_warns that case exits.
+// A strand conflict discards the line with a message and still returns true.
+bool GffReader::addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
+  bool r=true;
+  if (gffline->strand!=prevgfo->strand) {
+    if (prevgfo->strand=='.') {
+      //parent strand undetermined so far: adopt the exon's strand
+      prevgfo->strand=gffline->strand;
+    }
+    else {
+      //the format has five conversions; the former sixth argument
+      //(getGSeqName()) was never printed and has been dropped
+      GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
+        prevgfo->gffID, prevgfo->strand,
+        gffline->fstart, gffline->fend, gffline->strand);
+      //r=false;
+      return true;
+    }
+  }
+  //gap between the new segment and the span covered so far (0 if they touch/overlap)
+  int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
+    ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
+    0 );
+  if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
+    GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
+    //validation_errors = true;
+    r=false;
+    if (!gff_warns) exit(1);
+  }
+  int eidx=prevgfo->addExon(this, gffline, !noExonAttr, noExonAttr);
+  if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
+    subfPoolAdd(pex, prevgfo);
+  return r;
+}
+// Checks whether any declared parent of gffline is a pooled subfeature.
+// On a hit, returns the pool entry and leaves the composite key that matched
+// in subp_name (allocated by gfoBuildId; the caller releases it).
+// Otherwise returns NULL; every key tried has been passed to GFREE.
+// Parents listed in discarded_ids are skipped when transcriptsOnly is set.
+CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
+  subp_name=NULL;
+  for (int p=0;p<gffline->num_parents;p++) {
+    if (transcriptsOnly && discarded_ids.Find(gffline->parents[p])!=NULL)
+      continue;
+    subp_name=gfoBuildId(gffline->parents[p], gffline->gseqname); //e.g. mRNA name
+    CNonExon* hit=pex.Find(subp_name);
+    if (hit!=NULL) {
+      return hit;
+    }
+    GFREE(subp_name);
+  }
+  return NULL;
+}
+// Remembers the first exon of newgfo as a potential future parent feature.
+// The pool key is built from the reader's current line (member gffline): its
+// ID and sequence name. Nothing is stored when newgfo has no exons.
+void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
+  if (newgfo->exons.Count()<=0) return;
+  //this might become a parent feature later
+  char* poolKey=gfoBuildId(gffline->ID, gffline->gseqname);
+  pex.Add(poolKey, new CNonExon(newgfo, newgfo->exons[0], gffline));
+  GFREE(poolKey);
+}
+// Turns a pooled subfeature into a full GffObj.
+// A new record is created from the line saved in subp and attached to the
+// subfeature's former owner; newGffRec also removes the saved exon from that
+// owner. The pool entry named subp_name is then removed from pex and the
+// owner is flagged promotedChildren. Returns the newly created record.
+GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
+    bool keepAttr, bool noExonAttr) {
+  GffObj* owner=subp->parent; //grandparent of gffline (e.g. gene)
+  //if (prevp!=gflst[subp->idx])
+  // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
+  subp->gffline->discardParent();
+  GffObj* promoted=newGffRec(subp->gffline, keepAttr, noExonAttr, owner, subp->exon);
+  pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
+  owner->promotedChildren(true);
+  return promoted; //returns the holder of newly promoted feature
+}
+//The whole file has to be parsed because exons and other subfeatures can be scattered, unordered in the input
+//Trans-splicing and fusions are only accepted in proper GFF3 format, i.e. multiple features with the same ID
+//are accepted if they are NOT overlapping/continuous
+// *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged
+// and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq)
+void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) { /*
+  Reads every remaining line through nextGffLine() and builds gflst.
+    keepAttr, noExonAttr - forwarded to record/exon creation and to finalize
+    mergeCloseExons      - forwarded to gflst.finalize()
+  Per line:
+   1. a line with an ID that is not exon-like is matched against records
+      already seen (gfoFind): a record created from an exon is updated in
+      place; a true duplicate is rejected when overlapping, merged as an exon
+      when it is a same-strand transcript less than 1000 bases away, or
+      stored as a separate record otherwise;
+   2. a line without parents starts a new record unless step 1 matched;
+   3. a line with parents is attached to each parent found; if none is found,
+      a pooled subfeature is promoted to be its parent, or a new record is
+      created from the line itself.
+  Afterwards gflst is finalized, gseqStats is filled from gseqstats, phash is
+  cleared, and the process exits with status 1 if addExonFeature() reported
+  any validation error.
+ */
+ bool validation_errors = false;
+ //loc_debug=false;
+ GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
+ //and thus could become promoted to parent features
+ while (nextGffLine()!=NULL) {
+ GffObj* prevseen=NULL;
+ GPVec<GffObj>* prevgflst=NULL;
+ if (gffline->ID && gffline->exontype==0) {
+ //>> for a parent-like IDed feature (mRNA, gene, etc.)
+ //look for same ID on the same chromosome/strand/locus
+ prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
+ if (prevseen!=NULL) {
+ //same ID/chromosome combo encountered before
+ if (prevseen->createdByExon()) {
+ if (gff_show_warnings && (prevseen->start<gffline->fstart ||
+ prevseen->end>gffline->fend))
+ GMessage("GFF Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID);
+ //an exon of this ID was given before
+ //this line has the main attributes for this ID
+ updateGffRec(prevseen, gffline, keepAttr);
+ }
+ else {
+ //- duplicate ID -- this must be a discontinuous feature according to GFF3 specs
+ // e.g. a trans-spliced transcript
+ if (prevseen->overlap(gffline->fstart, gffline->fend)) {
+ //overlapping with same ID not allowed
+ GMessage("GFF Error: duplicate/invalid '%s' feature ID=%s\n", gffline->ftype, gffline->ID);
+ //validation_errors = true;
+ if (gff_warns) { //tolerant mode: drop this line and carry on
+ delete gffline;
+ gffline=NULL;
+ continue;
+ }
+ else exit(1);
+ }
+ //create a new entry with the same ID
+ int distance=INT_MAX;
+ if (prevseen->isTranscript() && prevseen->strand==gffline->strand) {
+ if (prevseen->start>=gffline->fstart)
+ distance=prevseen->start-gffline->fend;
+ else
+ distance=gffline->fstart-prevseen->end;
+ }
+ if (distance<1000) {//FIXME: arbitrary proximity threshold (yuck)
+ //exception: make this an exon of previous ID
+ //addExonFeature(prevseen, gffline, pex, noExonAttr);
+ prevseen->addExon(this, gffline, false, true);
+ }
+ else { //create a separate entry (true discontinuous feature)
+ prevseen=newGffRec(gffline, keepAttr, noExonAttr,
+ prevseen->parent, NULL, prevgflst);
+ }
+ } //duplicate ID on the same chromosome
+ } //prevseen != NULL
+ } //parent-like ID feature
+ if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
+ if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
+ }
+ else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
+ //updates all the declared parents with this child
+ bool found_parent=false;
+ GffObj* newgfo=prevseen;
+ GPVec<GffObj>* newgflst=NULL;
+ for (int i=0;i<gffline->num_parents;i++) {
+ if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
+ continue; //skipping discarded parent feature
+ GffObj* parentgfo=NULL;
+ if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript
+ parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
+ &newgflst, gffline->strand, gffline->fstart, gffline->fend);
+ }
+ else {
+ //for exon-like entities we only need a parent to be in locus distance,
+ //on the same strand
+ parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
+ &newgflst, gffline->strand, gffline->fstart);
+ }
+ if (parentgfo!=NULL) { //parent GffObj parsed earlier
+ found_parent=true;
+ if (parentgfo->isGene() && gffline->is_transcript
+ && gffline->exontype==0) {
+ //not an exon, but a transcript parented by a gene
+ if (newgfo) {
+ updateParent(newgfo, parentgfo);
+ }
+ else {
+ newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo);
+ }
+ }
+ else { //potential exon subfeature?
+ //always discards dummy "intron" features
+ if (!(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) {
+ if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
+ validation_errors=true;
+ }
+ }
+ } //overlapping parent feature found
+ } //for each parsed parent Id
+ if (!found_parent) { //new GTF-like record starting here with a subfeature directly
+ //or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
+ //check if this feature isn't parented by a previously stored "exon" subfeature
+ char* subp_name=NULL;
+ CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
+ if (subp!=NULL) { //found a subfeature that is the parent of this gffline
+ //promote that subfeature to a full GffObj
+ GffObj* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
+ //add current gffline as an exon of the newly promoted subfeature
+ if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
+ validation_errors=true;
+ }
+ else { //no parent seen before,
+ //loc_debug=true;
+ GffObj* ngfo=prevseen;
+ if (ngfo==NULL) {
+ //if it's an exon type, create directly the parent with this exon
+ //but if it's recognized as a transcript, the object itself is created
+ ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
+ }
+ if (!ngfo->isTranscript() &&
+ gffline->ID!=NULL && gffline->exontype==0)
+ subfPoolAdd(pex, ngfo);
+ //even those with errors will be added here!
+ }
+ GFREE(subp_name);
+ } //no previous parent found
+ } //parented feature
+ //--
+ delete gffline; //done with this line; NULL makes nextGffLine() read a new one
+ gffline=NULL;
+ }//while gff lines
+ if (gflst.Count()>0) {
+ gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
+ gseqStats.setCount(gseqstats.Last()->gseqid+1); //size taken from the gseqid of the last gseqstats entry
+ for (int gi=0;gi<gseqstats.Count();gi++) {
+ gseqStats.Put(gseqstats[gi]->gseqid, gseqstats[gi]); //copy the pointer only
+ }
+ }
+ // all gff records are now loaded in GList gflst
+ // so we can free the hash
+ phash.Clear();
+ //tids.Clear();
+ if (validation_errors) {
+ exit(1);
+ }
+// Finalizes every record in the list and drops the ones marked as discarded.
+// When mustSort is set the list is first re-sorted with gfo_cmpByLoc.
+// Children of a discarded record lose their parent pointer and, with
+// keepAttrs, inherit the discarded parent's attributes (e.g. pseudo=true).
+// Discarded records are moved to a local list and their slots packed away.
+void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
+    bool keepAttrs, bool noExonAttr) { //if set, enforce sort by locus
+  if (mustSort) { //force (re-)sorting
+    this->setSorted(false);
+    this->setSorted((GCompareProc*)gfo_cmpByLoc);
+  }
+  GList<GffObj> discarded(false,true,false);
+  for (int i=0;i<Count();i++) {
+    //finish the parsing for each GffObj
+    fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
+    GffObj* rec=fList[i];
+    if (!rec->isDiscarded()) continue;
+    discarded.Add(rec);
+    for (int c=0;c<rec->children.Count();c++) {
+      GffObj* child=rec->children[c];
+      child->parent=NULL;
+      if (keepAttrs)
+        child->copyAttrs(rec); //inherit the attributes of discarded parent
+    }
+    this->Forget(i);
+  }
+  if (discarded.Count()>0) {
+    this->Pack();
+  }
+}
+GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) { /*
+  Post-processing applied to a record once all input lines were read.
+  Steps, in order:
+   - udata and uptr are reset;
+   - with gfr->transcriptsOnly, anything that is neither a transcript nor a
+     childless gene is marked as discarded;
+   - a generic transcript with CDstart>0 is re-typed as mRNA;
+   - for exon-bearing transcripts: when mergeCloseExons is set, consecutive
+     exons at most 5 bases apart are merged, then start/end are shrunk to the
+     exon span;
+   - GTF-style attribute reduction: an attribute of the first exon that every
+     other exon carries with the same value is moved to the record level.
+  Returns this.
+  NOTE(review): the opening comment says adjacent/overlapping segments are
+  always merged, but merging only runs when mergeCloseExons is true -- confirm.
+ */
+ //merge
+ //always merge adjacent or overlapping segments
+ //but if mergeCloseExons then merge even when distance is up to 5 bases
+ udata=0;
+ uptr=NULL;
+ if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) { //NOTE(review): gfr is dereferenced here, yet checked against NULL further down
+ isDiscarded(true);
+ }
+ if (ftype_id==gff_fid_transcript && CDstart>0) {
+ ftype_id=gff_fid_mRNA;
+ //exon_ftype_id=gff_fid_exon;
+ }
+ if (exons.Count()>0 && (isTranscript() || exon_ftype_id==gff_fid_exon)) {
+ if (mergeCloseExons) {
+ int mindist=mergeCloseExons ? 5:1; //always 5 inside this branch
+ for (int i=0;i<exons.Count()-1;i++) {
+ int ni=i+1;
+ uint mend=exons[i]->end;
+ while (ni<exons.Count()) {
+ int dist=(int)(exons[ni]->start-mend);
+ if (dist>mindist) break; //no merging with next segment
+ if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
+ GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
+ gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
+ }
+ mend=exons[ni]->end;
+ covlen-=exons[i]->len(); //swap exon i's old length for its extended one
+ exons[i]->end=mend;
+ covlen+=exons[i]->len();
+ covlen-=exons[ni]->len(); //the absorbed exon no longer counts
+ if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
+ exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
+ //use the other exon attributes, if more
+ delete(exons[i]->attrs);
+ exons[i]->attrs=exons[ni]->attrs;
+ exons[ni]->attrs=NULL; //ownership moved to exon i
+ }
+ exons.Delete(ni); //ni now refers to the following exon
+ } //check for merge with next exon
+ } //for each exon
+ } //merge close exons
+ //shrink transcript to the exons' span
+ this->start=exons.First()->start;
+ this->end=exons.Last()->end;
+ //also update the stats for the reference sequence
+ if (uptr!=NULL) { //collect stats about the underlying genomic sequence -- NOTE(review): uptr was set to NULL at the top of this function, so this branch looks unreachable; confirm
+ GSeqStat* gsd=(GSeqStat*)uptr;
+ if (start<gsd->mincoord) gsd->mincoord=start;
+ if (end>gsd->maxcoord) gsd->maxcoord=end;
+ if (this->len()>gsd->maxfeat_len) {
+ gsd->maxfeat_len=this->len();
+ gsd->maxfeat=this;
+ }
+ }
+ this->uptr=NULL;
+ this->udata=0;
+ }
+ //attribute reduction for GTF records
+ if (keepAttrs && !noExonAttr && !hasGffID()
+ && exons.Count()>0 && exons[0]->attrs!=NULL) {
+ bool attrs_discarded=false;
+ for (int a=0;a<exons[0]->attrs->Count();a++) {
+ int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
+ char* attr_name=names->attrs.getName(attr_name_id);
+ char* attr_val =exons[0]->attrs->Get(a)->attr_val;
+ bool sameExonAttr=true;
+ for (int i=1;i<exons.Count();i++) {
+ char* ov=exons[i]->getAttr(attr_name_id);
+ if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
+ sameExonAttr=false;
+ break;
+ }
+ }
+ if (sameExonAttr) {
+ //delete this attribute from exons level
+ attrs_discarded=true;
+ this->addAttr(attr_name, attr_val);
+ for (int i=1;i<exons.Count();i++) {
+ removeExonAttr(*(exons[i]), attr_name_id);
+ }
+ exons[0]->attrs->freeItem(a); //slot a is packed away after the loop
+ }
+ }
+ if (attrs_discarded) exons[0]->attrs->Pack();
+ }
+ return this;
+void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) { /*
+  Parses a ';'-separated list of name=value pairs into atrlist.
+    atrlist - destination list; allocated here when NULL, and deleted and
+              reset to NULL again when it ends up empty
+    info    - attribute text; it is modified in place (each ';' and '='
+              separator is overwritten with a NUL)
+    isExon  - when true, a "protein_id" pair is stored on this object's own
+              attrs (via addAttr) instead of in atrlist
+  Pairs named exon_number, exon or exon_id are skipped; tokens without '='
+  are ignored. Requires names to be set (GError otherwise).
+ */
+ if (names==NULL)
+ GError(ERR_NULL_GFNAMES, "parseAttrs()");
+ if (atrlist==NULL)
+ atrlist=new GffAttrs();
+ char* endinfo=info+strlen(info);
+ char* start=info;
+ char* pch=start;
+ while (start<endinfo) {
+ //skip spaces
+ while (*start==' ' && start<endinfo) start++;
+ pch=strchr(start, ';');
+ if (pch==NULL) pch=endinfo; //last token: runs to the end of the string
+ else {
+ *pch='\0'; //terminate this token; pch then points at the next one
+ pch++;
+ }
+ char* ech=strchr(start,'=');
+ if (ech!=NULL) { // attr=value format found
+ *ech='\0'; //start is now the NUL-terminated attribute name
+ //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
+ if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
+ strcmp(start, "exon_id")==0)
+ { start=pch; continue; }
+ ech++;
+ while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
+ //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
+ //make sure we don't add the same attribute more than once
+ if (isExon && (strcmp(start, "protein_id")==0)) {
+ //Ensembl special case
+ this->addAttr(start, ech);
+ start=pch;
+ continue;
+ }
+ atrlist->add_or_update(names, start, ech);
+ }
+ /*
+ else { //not an attr=value format
+ atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
+ }
+ */
+ start=pch;
+ }
+ if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
+// Stores attrname=attrvalue on this record through add_or_update(), creating
+// the attribute list on first use.
+void GffObj::addAttr(const char* attrname, const char* attrvalue) {
+  if (attrs==NULL) {
+    attrs=new GffAttrs();
+  }
+  //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
+  attrs->add_or_update(names, attrname, attrvalue);
+}
+// Copies to this record every attribute of `from` that it does not have yet.
+// Typically `from` is the parent gene and this is a transcript.
+// Existing attributes are never replaced. A "description" on `from` that
+// equals this record's "product" is skipped (RefSeq gene vs transcript).
+// Does nothing when from or from->attrs is NULL.
+void GffObj::copyAttrs(GffObj* from) {
+  if (from==NULL || from->attrs==NULL) return;
+  if (this->attrs==NULL) {
+    this->attrs=new GffAttrs();
+  }
+  //special RefSeq case
+  int desc_attr_id=names->attrs.getId("description"); //from gene
+  int prod_attr_id=names->attrs.getId("product"); //from transcript (this)
+  char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL;
+  for (int i=0;i<from->attrs->Count();++i) {
+    //this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val);
+    int aid=from->attrs->Get(i)->attr_id;
+    //special case for GenBank refseq genes vs transcripts:
+    if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0)
+      continue; //skip description if product already there and the same
+    bool haveit=false;
+    for (int ai=0;ai<this->attrs->Count();++ai) {
+      //do we have it already?
+      //scan our own list with its own index ai; indexing it with the source
+      //index i compared the wrong entry and could run past the end
+      if (aid==this->attrs->Get(ai)->attr_id) {
+        haveit=true;
+        break; //skip this, don't replace
+      }
+    }
+    if (!haveit)
+      this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val));
+  }
+}
+// Changes the feature type of this record to `feature`.
+// When monoFeature() holds and exons exist, the exon feature type is changed
+// as well.
+void GffObj::setFeatureName(const char* feature) {
+  const int featId=names->feats.addName(feature);
+  const bool retagExons=(monoFeature() && exons.Count()>0);
+  if (retagExons) {
+    this->exon_ftype_id=featId;
+  }
+  this->ftype_id=featId;
+}
+// Changes the reference (genomic) sequence of this record: gseq_id becomes
+// the id that names->gseqs.addName() returns for newname.
+void GffObj::setRefName(const char* newname) {
+  this->gseq_id=names->gseqs.addName(newname);
+}
+// Removes the attribute(s) called attrname from this record.
+//   attrval - when not NULL, only entries with exactly this value are removed
+// Returns the number of entries removed; 0 when there are no attributes, the
+// name is NULL/empty, or the name has no id in the names table.
+int GffObj::removeAttr(const char* attrname, const char* attrval) {
+  if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
+  const int aid=this->names->attrs.getId(attrname);
+  //the id-based overload rejects aid<0 and performs the same scan
+  return this->removeAttr(aid, attrval);
+}
+// Removes the attribute(s) with id aid from this record.
+//   attrval - when not NULL, only entries with exactly this value are removed
+// Returns the number of entries removed (0 when attrs is NULL or aid<0).
+int GffObj::removeAttr(int aid, const char* attrval) {
+  if (aid<0 || this->attrs==NULL) return 0;
+  int removed=0; //could be more than one ?
+  for (int k=0;k<this->attrs->Count();k++) {
+    if (this->attrs->Get(k)->attr_id!=aid) continue;
+    if (attrval!=NULL && strcmp(attrval, this->attrs->Get(k)->attr_val)!=0)
+      continue; //same name, different value
+    removed++;
+    this->attrs->freeItem(k);
+  }
+  //freed slots are squeezed out once, after the scan
+  if (removed>0) this->attrs->Pack();
+  return removed;
+}
+// Removes the attribute(s) called attrname from the given exon.
+//   attrval - when not NULL, only entries with exactly this value are removed
+// Returns the number of entries removed; 0 when the exon has no attributes,
+// the name is NULL/empty, or the name has no id in the names table.
+int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
+  if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
+  const int aid=this->names->attrs.getId(attrname);
+  //the id-based overload rejects aid<0 and performs the same scan
+  return this->removeExonAttr(exon, aid, attrval);
+}
+// Removes the attribute(s) with id aid from the given exon.
+//   attrval - when not NULL, only entries with exactly this value are removed
+// Returns the number of entries removed (0 when exon.attrs is NULL or aid<0).
+int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
+  if (aid<0 || exon.attrs==NULL) return 0;
+  int removed=0; //could be more than one
+  for (int k=0;k<exon.attrs->Count();k++) {
+    if (exon.attrs->Get(k)->attr_id!=aid) continue;
+    if (attrval!=NULL && strcmp(attrval, exon.attrs->Get(k)->attr_val)!=0)
+      continue; //same name, different value
+    removed++;
+    exon.attrs->freeItem(k);
+  }
+  //freed slots are squeezed out once, after the scan
+  if (removed>0) exon.attrs->Pack();
+  return removed;
+}
+// Reports the CDS boundaries, adjusted by the phase.
+// Both outputs are 0 when CDstart or CDend is 0 (no CDS info). Otherwise a
+// CDphase of '1' or '2' shortens the CDS by that many bases: at its end on
+// the '-' strand, at its start on any other strand.
+void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
+  cds_start=0;
+  cds_end=0;
+  const bool haveCDS=(CDstart!=0 && CDend!=0);
+  if (!haveCDS) return; //no CDS info
+  const int phaseShift=(CDphase=='1' || CDphase=='2') ? CDphase-'0' : 0;
+  cds_start=CDstart;
+  cds_end=CDend;
+  if (strand=='-') {
+    cds_end-=phaseShift;
+  }
+  else {
+    cds_start+=phaseShift;
+  }
+}
+void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) { /*
+  Computes the position of the CDS on the spliced transcript.
+    cds_mstart, cds_mend - outputs; both stay 0 when CDstart or CDend is 0.
+  Exons are walked in transcript order (last to first on the '-' strand)
+  while s accumulates the number of bases seen so far; the outputs are
+  derived from s inside the exon that contains CDstart / CDend. A CDphase of
+  '1' or '2' enters the cds_mstart computation as cdsadj.
+  Side effect: unxcoord() is called first to restore normal coordinates.
+ */
+ //sets cds_mstart and cds_mend to the CDS start,end coordinates on the spliced mRNA transcript
+ cds_mstart=0;
+ cds_mend=0;
+ if (CDstart==0 || CDend==0) return; //no CDS info
+ //restore normal coordinates, just in case
+ unxcoord();
+ int cdsadj=0;
+ if (CDphase=='1' || CDphase=='2') {
+ cdsadj=CDphase-'0';
+ }
+ /*
+ uint seqstart=CDstart;
+ uint seqend=CDend;
+ */
+ uint seqstart=exons.First()->start;
+ uint seqend=exons.Last()->end;
+ int s=0; //resulting nucleotide counter
+ if (strand=='-') {
+ for (int x=exons.Count()-1;x>=0;x--) {
+ uint sgstart=exons[x]->start;
+ uint sgend=exons[x]->end;
+ if (seqend<sgstart || seqstart>sgend) continue;
+ if (seqstart>=sgstart && seqstart<=sgend)
+ sgstart=seqstart; //seqstart within this segment
+ if (seqend>=sgstart && seqend<=sgend)
+ sgend=seqend; //seqend within this segment
+ s+=(int)(sgend-sgstart)+1;
+ if (CDstart>=sgstart && CDstart<=sgend) {
+ //CDstart in this segment
+ //and we are getting the whole transcript
+ cds_mend=s-(int)(CDstart-sgstart);
+ }
+ if (CDend>=sgstart && CDend<=sgend) {
+ //CDend in this segment
+ //and we are getting the whole transcript
+ cds_mstart=s-(int)(CDend-cdsadj-sgstart);
+ }
+ } //for each exon
+ } // - strand
+ else { // + strand
+ for (int x=0;x<exons.Count();x++) {
+ uint sgstart=exons[x]->start;
+ uint sgend=exons[x]->end;
+ if (seqend<sgstart || seqstart>sgend) continue;
+ if (seqstart>=sgstart && seqstart<=sgend)
+ sgstart=seqstart; //seqstart within this segment
+ if (seqend>=sgstart && seqend<=sgend)
+ sgend=seqend; //seqend within this segment
+ s+=(int)(sgend-sgstart)+1;
+ /* for (uint i=sgstart;i<=sgend;i++) {
+ spliced[s]=gsubseq[i-gstart];
+ s++;
+ }//for each nt
+ */
+ if (CDstart>=sgstart && CDstart<=sgend) {
+ //CDstart in this segment
+ cds_mstart=s-(int)(sgend-CDstart-cdsadj);
+ }
+ if (CDend>=sgstart && CDend<=sgend) {
+ //CDend in this segment
+ cds_mend=s-(int)(sgend-CDend);
+ }
+ } //for each exon
+ } // + strand
+ //spliced[s]=0;
+ //if (rlen!=NULL) *rlen=s;
+ //return spliced;
+char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst) /*
+  Returns the genomic sequence from the first exon's start to the last exon's
+  end (introns included) as a newly allocated NUL-terminated string; on the
+  '-' strand the bases are emitted in reverse order through ntComplement().
+    faseq  - sequence source; NULL yields a warning and a NULL result
+    rlen   - when not NULL, receives the number of bases written
+    seglst - when not NULL, one GSeg covering the whole result is added
+  Returns NULL when there are no exons. GError is called when the
+  subsequence cannot be fetched. The buffer is not retained here, so the
+  caller is presumably responsible for freeing it.
+ */
+ if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
+ return NULL;
+ }
+ //restore normal coordinates:
+ unxcoord();
+ if (exons.Count()==0) return NULL;
+ int fspan=end-start+1;
+ const char* gsubseq=faseq->subseq(start, fspan);
+ if (gsubseq==NULL) {
+ GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
+ }
+ char* unspliced=NULL;
+ int seqstart=exons.First()->start;
+ int seqend=exons.Last()->end;
+ int unsplicedlen = 0;
+ unsplicedlen += seqend - seqstart + 1;
+ GMALLOC(unspliced, unsplicedlen+1); //one extra byte for the terminating NUL
+ //uint seqstart, seqend;
+ int s = 0; //resulting nucleotide counter
+ if (strand=='-')
+ {
+ if (seglst!=NULL)
+ seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
+ for (int i=seqend;i>=seqstart;i--)
+ {
+ unspliced[s] = ntComplement(gsubseq[i-start]); //gsubseq is indexed relative to start
+ s++;
+ }//for each nt
+ } // - strand
+ else
+ { // + strand
+ if (seglst!=NULL)
+ seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
+ for (int i=seqstart;i<=seqend;i++)
+ {
+ unspliced[s]=gsubseq[i-start];
+ s++;
+ }//for each nt
+ } // + strand
+ //assert(s <= unsplicedlen);
+ unspliced[s]=0;
+ if (rlen!=NULL) *rlen=s;
+ return unspliced;
+// Returns the spliced (exon-only) sequence of this record as a newly
+// allocated NUL-terminated string; on the '-' strand the bases are emitted
+// in reverse order through ntComplement().
+//   faseq     - sequence source; NULL yields a warning and a NULL result
+//   CDSonly   - restrict the output to CDstart..CDend (phase-adjusted);
+//               returns NULL when CDstart is 0
+//   rlen      - when not NULL, receives the number of bases written
+//   cds_start - when not NULL (and !CDSonly, CDstart>0), receives the CDS
+//               start on the spliced sequence
+//   cds_end   - receives the CDS end under the same conditions; it may now
+//               be NULL independently of cds_start
+//   seglst    - when not NULL, one GSeg per emitted exon segment is added
+// Returns NULL when there are no exons. GError is called when the
+// subsequence cannot be fetched. The buffer is not retained here, so the
+// caller is presumably responsible for freeing it.
+char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
+    GList<GSeg>* seglst) {
+  if (CDSonly && CDstart==0) return NULL;
+  if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
+    return NULL;
+  }
+  //restore normal coordinates:
+  unxcoord();
+  if (exons.Count()==0) return NULL;
+  int fspan=end-start+1;
+  const char* gsubseq=faseq->subseq(start, fspan);
+  if (gsubseq==NULL) {
+    GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
+  }
+  if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
+    int endadj=end-start+1-fspan;
+    uint prevend=end;
+    end-=endadj;
+    if (CDend>end) CDend=end;
+    if (exons.Last()->end>end) {
+      exons.Last()->end=end; //this could get us into trouble if exon start is also > end
+      if (exons.Last()->start>exons.Last()->end) {
+        GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
+          prevend,getGSeqName(), getID());
+      }
+      covlen-=endadj;
+    }
+  }
+  char* spliced=NULL;
+  GMALLOC(spliced, covlen+1); //allocate more here
+  uint seqstart, seqend;
+  int cdsadj=0;
+  if (CDphase=='1' || CDphase=='2') {
+    cdsadj=CDphase-'0';
+  }
+  if (CDSonly) {
+    seqstart=CDstart;
+    seqend=CDend;
+    if (strand=='-') seqend-=cdsadj;
+    else seqstart+=cdsadj;
+  }
+  else {
+    seqstart=exons.First()->start;
+    seqend=exons.Last()->end;
+  }
+  int s=0; //resulting nucleotide counter
+  if (strand=='-') {
+    for (int x=exons.Count()-1;x>=0;x--) {
+      uint sgstart=exons[x]->start;
+      uint sgend=exons[x]->end;
+      if (seqend<sgstart || seqstart>sgend) continue;
+      if (seqstart>=sgstart && seqstart<=sgend)
+        sgstart=seqstart; //seqstart within this segment
+      if (seqend>=sgstart && seqend<=sgend)
+        sgend=seqend; //seqend within this segment
+      if (seglst!=NULL)
+        seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
+      for (uint i=sgend;i>=sgstart;i--) {
+        spliced[s] = ntComplement(gsubseq[i-start]);
+        s++;
+      }//for each nt
+      if (!CDSonly && cds_start!=NULL && CDstart>0) {
+        if (CDstart>=sgstart && CDstart<=sgend) {
+          //CDstart in this segment
+          //and we are getting the whole transcript
+          //cds_end is guarded separately: only cds_start was checked above
+          if (cds_end!=NULL) *cds_end=s-(CDstart-sgstart);
+        }
+        if (CDend>=sgstart && CDend<=sgend) {
+          //CDend in this segment
+          //and we are getting the whole transcript
+          *cds_start=s-(CDend-cdsadj-sgstart);
+        }
+      }//update local CDS coordinates
+    } //for each exon
+  } // - strand
+  else { // + strand
+    for (int x=0;x<exons.Count();x++) {
+      uint sgstart=exons[x]->start;
+      uint sgend=exons[x]->end;
+      if (seqend<sgstart || seqstart>sgend) continue;
+      if (seqstart>=sgstart && seqstart<=sgend)
+        sgstart=seqstart; //seqstart within this segment
+      if (seqend>=sgstart && seqend<=sgend)
+        sgend=seqend; //seqend within this segment
+      if (seglst!=NULL)
+        seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
+      for (uint i=sgstart;i<=sgend;i++) {
+        spliced[s]=gsubseq[i-start];
+        s++;
+      }//for each nt
+      if (!CDSonly && cds_start!=NULL && CDstart>0) {
+        if (CDstart>=sgstart && CDstart<=sgend) {
+          //CDstart in this segment
+          //and we are getting the whole transcript
+          *cds_start=s-(sgend-CDstart-cdsadj);
+        }
+        if (CDend>=sgstart && CDend<=sgend) {
+          //CDend in this segment
+          //and we are getting the whole transcript
+          //cds_end is guarded separately: only cds_start was checked above
+          if (cds_end!=NULL) *cds_end=s-(sgend-CDend);
+        }
+      }//update local CDS coordinates
+    } //for each exon
+  } // + strand
+  spliced[s]=0;
+  if (rlen!=NULL) *rlen=s;
+  return spliced;
+}
+// Returns the translation of the spliced sequence as a newly allocated
+// NUL-terminated string, one character per complete codon. On the '-'
+// strand the bases are read in reverse order through ntComplement().
+//   faseq   - sequence source; NULL yields a warning and a NULL result
+//             (same convention as getSpliced() and getUnspliced())
+//   CDSonly - translate only CDstart..CDend (phase-adjusted); returns NULL
+//             when CDstart is 0
+//   rlen    - when not NULL, receives the number of residues written
+// Returns NULL when there are no exons. GError is called when the
+// subsequence cannot be fetched. The buffer is not retained here, so the
+// caller is presumably responsible for freeing it.
+char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
+  if (CDSonly && CDstart==0) return NULL;
+  if (faseq==NULL) { GMessage("Warning: getSplicedTr(NULL,.. ) called!\n");
+    return NULL;
+  }
+  //restore normal coordinates:
+  unxcoord();
+  if (exons.Count()==0) return NULL;
+  int fspan=end-start+1;
+  const char* gsubseq=faseq->subseq(start, fspan);
+  if (gsubseq==NULL) {
+    GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
+  }
+  char* translation=NULL;
+  GMALLOC(translation, (int)(covlen/3)+1); //one residue per codon + NUL
+  uint seqstart, seqend;
+  int cdsadj=0;
+  if (CDphase=='1' || CDphase=='2') {
+    cdsadj=CDphase-'0';
+  }
+  if (CDSonly) {
+    seqstart=CDstart;
+    seqend=CDend;
+    if (strand=='-') seqend-=cdsadj;
+    else seqstart+=cdsadj;
+  }
+  else {
+    seqstart=exons.First()->start;
+    seqend=exons.Last()->end;
+  }
+  Codon codon;
+  int nt=0; //codon nucleotide counter (0..2)
+  int aa=0; //aminoacid count
+  if (strand=='-') {
+    for (int x=exons.Count()-1;x>=0;x--) {
+      uint sgstart=exons[x]->start;
+      uint sgend=exons[x]->end;
+      if (seqend<sgstart || seqstart>sgend) continue;
+      if (seqstart>=sgstart && seqstart<=sgend)
+        sgstart=seqstart; //seqstart within this segment
+      if (seqend>=sgstart && seqend<=sgend) {
+        sgend=seqend; //seqend within this segment
+      }
+      for (uint i=sgend;i>=sgstart;i--) {
+        codon.nuc[nt]=ntComplement(gsubseq[i-start]);
+        nt++;
+        if (nt==3) { //codon complete: emit one residue
+          nt=0;
+          translation[aa]=codon.translate();
+          aa++;
+        }
+      }//for each nt
+    } //for each exon
+  } // - strand
+  else { // + strand
+    for (int x=0;x<exons.Count();x++) {
+      uint sgstart=exons[x]->start;
+      uint sgend=exons[x]->end;
+      if (seqend<sgstart || seqstart>sgend) continue;
+      if (seqstart>=sgstart && seqstart<=sgend)
+        sgstart=seqstart; //seqstart within this segment
+      if (seqend>=sgstart && seqend<=sgend)
+        sgend=seqend; //seqend within this segment
+      for (uint i=sgstart;i<=sgend;i++) {
+        codon.nuc[nt]=gsubseq[i-start];
+        nt++;
+        if (nt==3) { //codon complete: emit one residue
+          nt=0;
+          translation[aa]=codon.translate();
+          aa++;
+        }
+      }//for each nt
+    } //for each exon
+  } // + strand
+  translation[aa]=0;
+  if (rlen!=NULL) *rlen=aa;
+  return translation;
+}
+void GffObj::printSummary(FILE* fout) {
+  // Writes one tab-separated line: ID, strand, start, end, score (2 decimals)
+  // and qcov/10 (1 decimal). A NULL fout means stdout.
+  FILE* out=(fout==NULL) ? stdout : fout;
+  const double qcovOut=(float)qcov/10.0;
+  fprintf(out, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
+      strand, start, end, gscore, qcovOut);
+}
+// Numeric value (0..15) of a character already known to be a hex digit.
+static int gxfHexDigitValue(unsigned char ch) {
+  if (ch>'Z') ch^=0x20; //fold 'a'..'f' to upper case
+  return (ch>'9') ? 10+(ch-'A') : ch-'0';
+}
+
+// Copies s into dbuf, decoding URL-style %XX escapes along the way.
+//   dbuf   - destination; must have room for maxlen+1 bytes
+//   s      - NUL-terminated source (NULL yields an empty result)
+//   maxlen - maximum number of characters written before the terminator
+// An escape is decoded only when it yields a printable ASCII character
+// (code above space and below 0x80); other escapes are copied literally.
+// A decoded ';' is replaced by '.'.
+void decodeHexChars(char* dbuf, const char* s, int maxlen=1023) {
+  if (dbuf==NULL) return;
+  int dlen=0;
+  dbuf[0]=0;
+  if (s==NULL) return;
+  for (const char* p=s;(*p)!=0 && dlen<maxlen;++p) {
+    //p[1] is readable because p[0] is not the terminator; p[2] is only
+    //examined once p[1] is known to be a hex digit (hence not the terminator).
+    //The <cctype> classifiers require a value representable as unsigned char.
+    if (p[0]=='%' && isxdigit((unsigned char)p[1]) && isxdigit((unsigned char)p[2])) {
+      int code=(gxfHexDigitValue((unsigned char)p[1])<<4) +
+               gxfHexDigitValue((unsigned char)p[2]);
+      if (code==';') code='.';
+      //explicit upper bound keeps the result independent of char signedness
+      if (code>' ' && code<0x80) {
+        dbuf[dlen]=(char)code;
+        ++dlen;
+        p+=2; //skip the two hex digits just consumed
+        continue;
+      }
+    }
+    dbuf[dlen]=*p;
+    ++dlen;
+  }
+  dbuf[dlen]=0;
+}
+void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
+    uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars) {
+  // Prints one subfeature line (exon-like segment or CDS segment) of this
+  // object, in GFF3 (gff3==true) or GTF format.
+  //   tlabel, gseqname - source/track column and sequence name column
+  //   iscds            - print the feature type as "CDS" (else getSubfName())
+  //   segstart, segend - coordinates to print
+  //   exidx            - index into exons[] supplying score and attributes,
+  //                      or a negative value for none
+  //   phase            - printed only for CDS lines, '.' otherwise
+  //   cvtChars         - decode %XX escapes in attribute values before printing
+  char dbuf[1024];
+  strcpy(dbuf,".");
+  GffAttrs* xattrs=NULL;
+  if (exidx>=0) {
+    if (exons[exidx]->score) sprintf(dbuf,"%.2f", exons[exidx]->score);
+    xattrs=exons[exidx]->attrs;
+  }
+  if (phase==0 || !iscds) phase='.';
+  const char* ftype=iscds ? "CDS" : getSubfName();
+  if (gff3) {
+    //coordinates are unsigned, so they are printed with %u
+    fprintf(fout,
+        "%s\t%s\t%s\t%u\t%u\t%s\t%c\t%c\tParent=%s",
+        gseqname, tlabel, ftype, segstart, segend, dbuf, strand,
+        phase, gffID);
+    if (xattrs!=NULL) {
+      for (int i=0;i<xattrs->Count();i++) {
+        const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
+        const char* rawval=xattrs->Get(i)->attr_val;
+        if (cvtChars) {
+          decodeHexChars(dbuf, rawval);
+          fprintf(fout,";%s=%s", attrname, dbuf);
+        } else {
+          //a NULL value must not reach %s; print it as empty, which is
+          //also what the cvtChars path produces
+          fprintf(fout,";%s=%s", attrname, (rawval!=NULL) ? rawval : "");
+        }
+      }
+    }
+    fprintf(fout, "\n");
+    return;
+  } //GFF3
+  //for GTF -- we print only transcripts
+  fprintf(fout, "%s\t%s\t%s\t%u\t%u\t%s\t%c\t%c\ttranscript_id \"%s\";",
+      gseqname, tlabel, ftype, segstart, segend, dbuf, strand, phase, gffID);
+  if (geneID)
+    fprintf(fout," gene_id \"%s\";",geneID);
+  if (gene_name!=NULL) {
+    fprintf(fout," gene_name \"%s\";",gene_name);
+  }
+  //segment attributes first, then the attributes of the GffObj itself,
+  //which for GTF are appended to every line
+  GffAttrs* attrSets[2];
+  attrSets[0]=xattrs;
+  attrSets[1]=this->attrs;
+  for (int a=0;a<2;a++) {
+    GffAttrs* alist=attrSets[a];
+    if (alist==NULL) continue;
+    for (int i=0;i<alist->Count();i++) {
+      const char* rawval=alist->Get(i)->attr_val;
+      if (rawval==NULL) continue; //attributes without a value are skipped
+      fprintf(fout, " %s ", names->attrs.getName(alist->Get(i)->attr_id));
+      const char* attrval=rawval;
+      if (cvtChars) {
+        decodeHexChars(dbuf, rawval);
+        attrval=dbuf;
+      }
+      //values that already start with a quote are printed as they are
+      if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
+      else fprintf(fout, "\"%s\";",attrval);
+    }
+  }
+  fprintf(fout, "\n");
+}
+void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
+    const char* tlabel, const char* gfparent, bool cvtChars) {
+  // Prints this object in GFF3 or GTF form, as selected by gffp.
+  // For GFF3 modes (gffp>=pgffAny) a parent line with ID, optional Parent,
+  // geneID, gene_name and all attributes is printed first; then the exon
+  // lines and/or CDS lines are printed through printGxfLine().
+  //   tlabel   - source/track column; NULL selects the object's track name,
+  //              or "gffobj" when track_id is negative
+  //   gfparent - when not NULL, overrides the Parent= value of the GFF3 line
+  //   cvtChars - decode %XX escapes in attribute values before printing
+  char dbuf[1024];
+  if (tlabel==NULL) {
+    tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
+        (char*)"gffobj" ;
+  }
+  unxcoord();
+  const char* gseqname=names->gseqs.Get(gseq_id)->name;
+  bool gff3 = (gffp>=pgffAny);
+  bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
+  bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
+  if (gff3) {
+    //print the GFF3 parent feature line
+    if (gscore>0.0) sprintf(dbuf,"%.2f", gscore);
+    else strcpy(dbuf,".");
+    uint pstart=start;
+    uint pend=end;
+    if (gffp==pgffCDS) { //CDS-only output reports the CDS span
+      pstart=CDstart;
+      pend=CDend;
+    }
+    const char* ftype=getFeatureName();
+    //coordinates are unsigned, so they are printed with %u
+    fprintf(fout,
+        "%s\t%s\t%s\t%u\t%u\t%s\t%c\t.\tID=%s",
+        gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID);
+    if (CDstart>0 && !showCDS) fprintf(fout,";CDS=%u-%u",CDstart,CDend);
+    if (gfparent!=NULL) {
+      //parent override
+      fprintf(fout, ";Parent=%s",gfparent);
+    }
+    else {
+      if (parent!=NULL && !parent->isDiscarded())
+        fprintf(fout, ";Parent=%s",parent->getID());
+    }
+    if (geneID!=NULL)
+      fprintf(fout, ";geneID=%s",geneID);
+    if (gene_name!=NULL)
+      fprintf(fout, ";gene_name=%s",gene_name);
+    if (attrs!=NULL) {
+      for (int i=0;i<attrs->Count();i++) {
+        const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
+        const char* rawval=attrs->Get(i)->attr_val;
+        if (cvtChars) {
+          decodeHexChars(dbuf, rawval);
+          fprintf(fout,";%s=%s", attrname, dbuf);
+        } else {
+          //a NULL value must not reach %s; print it as empty, which is
+          //also what the cvtChars path produces
+          fprintf(fout,";%s=%s", attrname, (rawval!=NULL) ? rawval : "");
+        }
+      }
+    }
+    fprintf(fout,"\n");
+  }// gff3 parent line
+  bool is_cds_only = (gffp==pgffBoth) ? false : isCDS;
+  if (showExon) {
+    //print exons; for CDS-only features make sure exon phases are set first
+    //(a phase below '0' on the last exon is taken to mean "not computed")
+    if (isCDS && exons.Count()>0 &&
+        (strand=='-' || strand=='+') && exons.Last()->phase<'0')
+      updateExonPhase();
+    for (int i=0;i<exons.Count();i++) {
+      printGxfLine(fout, tlabel, gseqname, is_cds_only, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3, cvtChars);
+    }
+  }//printing exons
+  if (showCDS && !is_cds_only && CDstart>0) {
+    if (isCDS) {
+      //every exon is a CDS segment already
+      for (int i=0;i<exons.Count();i++) {
+        printGxfLine(fout, tlabel, gseqname, true, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3, cvtChars);
+      }
+    }
+    else {
+      //derive the CDS segments by clipping the exons to CDstart..CDend
+      GArray<GffCDSeg> cds(true,true);
+      getCDSegs(cds);
+      for (int i=0;i<cds.Count();i++) {
+        printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3, cvtChars);
+      }
+    }
+  } //showCDS
+}
+void GffObj::updateExonPhase() {
+  // Recomputes the phase character ('0'..'2') of every exon of a CDS-only
+  // feature, visiting exons in transcript order (last to first on the '-'
+  // strand). Does nothing unless isCDS is set.
+  if (!isCDS) return;
+  //coding bases accumulated so far, seeded from the CDS phase
+  int cdsacc=0;
+  if (CDphase=='1' || CDphase=='2') {
+    cdsacc+= 3-(CDphase-'0');
+  }
+  const int numExons=exons.Count();
+  const bool minusStrand=(strand=='-');
+  for (int k=0;k<numExons;k++) {
+    const int x=minusStrand ? numExons-1-k : k;
+    exons[x]->phase='0'+ (3-cdsacc%3)%3;
+    cdsacc+=exons[x]->end-exons[x]->start+1;
+  }
+}
+void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
+  // Appends to cds one GffCDSeg per exon overlapping CDstart..CDend, clipped
+  // to that range, with the exon index and the phase character of the
+  // segment. Exons are visited in transcript order (last to first on the
+  // '-' strand) so the phase can be accumulated along the CDS.
+  GffCDSeg cdseg;
+  //coding bases accumulated so far, seeded from the CDS phase
+  int cdsacc=0;
+  if (CDphase=='1' || CDphase=='2') {
+    cdsacc+= 3-(CDphase-'0');
+  }
+  const int numExons=exons.Count();
+  const bool minusStrand=(strand=='-');
+  for (int k=0;k<numExons;k++) {
+    const int x=minusStrand ? numExons-1-k : k;
+    uint segFrom=exons[x]->start;
+    uint segTo=exons[x]->end;
+    if (CDend<segFrom || CDstart>segTo) continue; //exon has no coding bases
+    if (CDstart>=segFrom && CDstart<=segTo)
+      segFrom=CDstart; //CDS starts within this exon
+    if (CDend>=segFrom && CDend<=segTo)
+      segTo=CDend; //CDS ends within this exon
+    cdseg.start=segFrom;
+    cdseg.end=segTo;
+    cdseg.exonidx=x;
+    cdseg.phase='0'+ (3-cdsacc%3)%3;
+    cdsacc+=segTo-segFrom+1;
+    cds.Add(cdseg);
+  } //for each exon
+}
diff --git a/src/gff_utils.cpp b/src/gff_utils.cpp
new file mode 100644
index 0000000..419b153
--- /dev/null
+++ b/src/gff_utils.cpp
@@ -0,0 +1,664 @@
+#include "gff_utils.h"
+extern bool verbose;
+extern bool debugMode;
+//bool debugState=false;
+void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
+  // Writes seq as a FASTA record with 70 characters per line. The ">defline"
+  // header is written only when defline is not empty. When seqlen is not
+  // positive the length is taken from strlen(seq). Nothing is written for a
+  // NULL or empty sequence.
+  if (seq==NULL) return;
+  int len=(seqlen>0)?seqlen:strlen(seq);
+  if (len<=0) return;
+  if (!defline.is_empty())
+    fprintf(f, ">%s\n",defline.chars());
+  const int lineWidth=70;
+  for (int offset=0; offset<len; offset+=lineWidth) {
+    if (offset>0) fputc('\n', f); //separator between full lines
+    int chunk=len-offset;
+    if (chunk>lineWidth) chunk=lineWidth;
+    fwrite(seq+offset, 1, chunk, f);
+  }
+  fputc('\n', f);
+}
+int qsearch_gloci(uint x, GList<GffLocus>& loci) {
+  // Binary search over loci (expected to be ordered by start coordinate).
+  // Returns the index of the first locus whose start is greater than x,
+  // or -1 when there is none, including when the list is empty.
+  if (loci.Count()==0) return -1; //nothing to search; avoids loci[0]/Last() on an empty list
+  //do the simplest tests first:
+  if (loci[0]->start>x) return 0;
+  if (loci.Last()->start<x) return -1;
+  const int maxh=loci.Count()-1;
+  int l=0;
+  int h=maxh;
+  while (l<=h) {
+    const int i=(l+h)>>1;
+    const uint istart=loci[i]->start;
+    if (istart<x) {
+      l=i+1;
+    }
+    else if (istart==x) {
+      //found a matching coordinate: skip every locus starting exactly at x
+      int idx=i;
+      while (idx<=maxh && loci[idx]->start==x) {
+        idx++;
+      }
+      return (idx>maxh) ? -1 : idx;
+    }
+    else {
+      h=i-1;
+    }
+  } //while
+  //no exact match: l is the insertion point; move past any start <= x
+  int idx=l;
+  while (idx<=maxh && loci[idx]->start<=x) {
+    idx++;
+  }
+  return (idx>maxh) ? -1 : idx;
+}
+int qsearch_rnas(uint x, GList<GffObj>& rnas) {
+  // Binary search over rnas (expected to be ordered by start coordinate).
+  // Returns the index of the first transcript whose start is greater than x,
+  // or -1 when there is none, including when the list is empty.
+  if (rnas.Count()==0) return -1; //nothing to search; avoids rnas[0]/Last() on an empty list
+  //do the simplest tests first:
+  if (rnas[0]->start>x) return 0;
+  if (rnas.Last()->start<x) return -1;
+  const int maxh=rnas.Count()-1;
+  int l=0;
+  int h=maxh;
+  while (l<=h) {
+    const int i=(l+h)>>1;
+    const uint istart=rnas[i]->start;
+    if (istart<x) {
+      l=i+1;
+    }
+    else if (istart==x) {
+      //found a matching coordinate: skip every transcript starting exactly at x
+      int idx=i;
+      while (idx<=maxh && rnas[idx]->start==x) {
+        idx++;
+      }
+      return (idx>maxh) ? -1 : idx;
+    }
+    else {
+      h=i-1;
+    }
+  } //while
+  //no exact match: l is the insertion point; move past any start <= x
+  int idx=l;
+  while (idx<=maxh && rnas[idx]->start<=x) {
+    idx++;
+  }
+  return (idx>maxh) ? -1 : idx;
+}
+int cmpRedundant(GffObj& a, GffObj& b) {
+  // Three-way comparison: by exon count first, then by covlen, and finally
+  // by ID string (strcmp) when both are equal.
+  if (a.exons.Count()!=b.exons.Count())
+    return (a.exons.Count()>b.exons.Count())? 1: -1;
+  if (a.covlen!=b.covlen)
+    return (a.covlen>b.covlen)? 1 : -1;
+  return strcmp(a.getID(), b.getID());
+}
+bool tMatch(GffObj& a, GffObj& b) {
+  // Strict intron chain match, or fuzzy single-exon match.
+  // Returns true when a and b have the same number of exons and either
+  //  - both are single-exon and their overlap is at least 80% of the larger
+  //    covlen of the two, or
+  //  - every intron (previous exon end, next exon start) is identical.
+  // Transcripts without exons never match.
+  int imax=a.exons.Count()-1;
+  int jmax=b.exons.Count()-1;
+  if (imax!=jmax) return false; //different number of introns
+  if (imax<0) return false; //no exons at all: nothing to compare
+  if (imax==0) { //single-exon mRNAs
+    //fuzz match for single-exon transfrags:
+    // it's a match if they overlap at least 80% of max len
+    int ovlen=a.exons[0]->overlapLen(b.exons[0]);
+    int maxlen=GMAX(a.covlen,b.covlen);
+    return (ovlen>=maxlen*0.8);
+  }
+  //check that all introns have identical boundaries
+  //(the overlap length formerly accumulated here was never used)
+  for (int i=1;i<=imax;i++) {
+    if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
+        (a.exons[i]->start!=b.exons[i]->start)) {
+      return false; //intron mismatch
+    }
+  }
+  return true;
+}
+bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
+  // Returns true only if ti (which MUST be single-exon) lies within one of
+  // tj's exons without crossing any intron-exon boundary of tj.
+  // With fuzzSpan an overlap of at least 80% of ti's length with the exon is
+  // enough; otherwise ti must be strictly contained in the exon.
+  const int imax=ti.exons.Count()-1;
+  const int jmax=tj.exons.Count()-1;
+  if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
+  const int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
+  for (int j=0;j<=jmax;j++) {
+    //must NOT overlap the introns: ti may extend beyond an exon only at
+    //the outer ends of tj
+    const bool crossesLeft = (j>0 && ti.start<tj.exons[j]->start);
+    if (crossesLeft || (j<jmax && ti.end>tj.exons[j]->end))
+      return false;
+    if (fuzzSpan) {
+      if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
+        return true;
+    }
+    else {
+      //strict containment
+      if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
+        return true;
+    }
+  }
+  return false;
+}
+GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) { // decides whether ti and tj are redundant; returns the one to keep, or NULL
+ // matchAllIntrons==true: transcripts are considered "redundant" only if
+ // they have the exact same number of introns and same splice sites (or none)
+ // (two single-exon transcripts pass this test as soon as their spans are compatible, see below)
+ // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
+ // as long as no intron-exon boundaries are violated; also, a single-exon
+ // transcript will be collapsed if it's contained in one of the exons of the other
+ // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
+ // span of the other
+ // (for two single-exon transcripts with matchAllIntrons==false, fuzzSpan==true requires an overlap of at least 80% of the shorter covlen)
+ // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
+ // (i.e. they may extend each-other in opposite directions)
+ //if redundancy is detected, the "bigger" transcript is returned (otherwise NULL is returned)
+ if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no usable span overlap (spans sharing only one boundary base are rejected too) or different strands
+ int imax=ti.exons.Count()-1; //index of ti's last exon
+ int jmax=tj.exons.Count()-1; //index of tj's last exon
+ GffObj* bigger=NULL;
+ GffObj* smaller=NULL;
+ if (matchAllIntrons) {
+ if (imax!=jmax) return NULL; //different number of exons
+ if (ti.covlen>tj.covlen) {
+ bigger=&ti;
+ if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL; //bigger must span the other one
+ }
+ else { //ti.covlen<=tj.covlen
+ bigger=&tj;
+ if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL; //bigger must span the other one
+ }
+ //check that all introns really match (loop is skipped for single-exon transcripts)
+ for (int i=0;i<imax;i++) {
+ if (ti.exons[i]->end!=tj.exons[i]->end ||
+ ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
+ }
+ return bigger;
+ }
+ //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
+ //int maxlen=0;
+ int minlen=0; //covlen of the transcript with fewer (or equal) bases
+ if (ti.covlen>tj.covlen) {
+ if (tj.exons.Count()>ti.exons.Count()) {
+ //exon count override: more exons wins over more bases
+ bigger=&tj;
+ smaller=&ti;
+ }
+ else {
+ bigger=&ti;
+ smaller=&tj;
+ }
+ //maxlen=ti.covlen;
+ minlen=tj.covlen;
+ }
+ else { //tj has at least as many bases as ti
+ if (ti.exons.Count()>tj.exons.Count()) {
+ //exon count override: more exons wins over more bases
+ bigger=&ti;
+ smaller=&tj;
+ }
+ else {
+ bigger=&tj;
+ smaller=&ti;
+ }
+ //maxlen=tj.covlen;
+ minlen=ti.covlen;
+ }
+ if (imax==0 && jmax==0) {
+ //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
+ if (fuzzSpan) {
+ return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
+ }
+ else {
+ return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL; //strict containment
+ }
+ }
+ //containment is also considered redundancy
+ if (smaller->exons.Count()==1) {
+ //check if this single exon is contained in any of the exons of "bigger"
+ //without violating any intron-exon boundaries
+ return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
+ }
+ //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
+ if (ti.exons[imax]->start<tj.exons[0]->end ||
+ tj.exons[jmax]->start<ti.exons[0]->end )
+ return NULL; //intron chains do not overlap at all
+ //checking full intron chain containment
+ uint eistart=0, eiend=0, ejstart=0, ejend=0; //boundaries of the current intron of ti (ei*) and of tj (ej*)
+ int i=1; //exon idx to the right of the current intron of ti
+ int j=1; //exon idx to the right of the current intron of tj
+ //find the first intron overlap:
+ while (i<=imax && j<=jmax) {
+ eistart=ti.exons[i-1]->end;
+ eiend=ti.exons[i]->start;
+ ejstart=tj.exons[j-1]->end;
+ ejend=tj.exons[j]->start;
+ if (ejend<eistart) { j++; continue; } //tj's intron lies entirely before ti's
+ if (eiend<ejstart) { i++; continue; } //ti's intron lies entirely before tj's
+ //we found an intron overlap
+ break;
+ }
+ if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL; //span containment required
+ if ((i>1 && j>1) || i>imax || j>jmax) {
+ return NULL; //either no intron overlaps found at all
+ //or it's not the first intron for at least one of the transcripts
+ }
+ if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
+ if (j>i) {
+ //i==1, ti's start must not conflict with the previous intron of tj
+ if (ti.start<tj.exons[j-1]->start) return NULL;
+ //so ti's first intron matches an intron AFTER tj's first intron
+ // then tj must contain ti, so ti's last intron must end with or before tj's last intron
+ if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
+ //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
+ }
+ else if (i>j) {
+ //j==1, tj's start must not conflict with the previous intron of ti
+ if (tj.start<ti.exons[i-1]->start) return NULL;
+ //so tj's intron chain starts AFTER ti's
+ // then ti must contain tj, so tj's last intron must end with or before ti's last intron
+ if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
+ //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
+ }
+ //now check that the remaining introns match exactly, in the same sequence
+ i++;
+ j++;
+ while (i<=imax && j<=jmax) {
+ if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
+ ti.exons[i]->start!=tj.exons[j]->start) return NULL;
+ i++;
+ j++;
+ }
+ i--; //step back to the last exon index that was compared
+ j--;
+ if (i==imax && j<jmax) {
+ // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
+ if (ti.end>tj.exons[j]->end) return NULL;
+ }
+ else if (j==jmax && i<imax) {
+ if (tj.end>ti.exons[i]->end) return NULL; // ti has more introns to the right: same check with the roles swapped
+ }
+ return bigger;
+int gseqCmpName(const pointer p1, const pointer p2) {
+  // Comparison callback: orders two GenomicSeqData records by gseq_name (strcmp).
+  GenomicSeqData* lhs=(GenomicSeqData*)p1;
+  GenomicSeqData* rhs=(GenomicSeqData*)p2;
+  return strcmp(lhs->gseq_name, rhs->gseq_name);
+}
+void printLocus(GffLocus* loc, const char* pre) {
+  // Diagnostic dump of a locus: optional prefix (written to stderr), the
+  // locus span, then the comma-separated IDs of its transcripts.
+  // A NULL locus prints nothing; a locus without transcripts prints only
+  // its span.
+  if (loc==NULL) return;
+  if (pre!=NULL) fprintf(stderr, "%s", pre);
+  GMessage(" [%d-%d] : ", loc->start, loc->end);
+  for (int i=0;i<loc->rnas.Count();i++) {
+    if (i==0) GMessage("%s",loc->rnas[i]->getID());
+    else GMessage(",%s",loc->rnas[i]->getID());
+  }
+  GMessage("\n");
+}
+void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
+  // Transfers CDS info from tfrom to the container t when that yields a
+  // larger CDS: each CDS boundary of t is extended to tfrom's boundary,
+  // provided the new boundary still lies within t's own span.
+  if (tfrom->CDstart==0) return; //nothing to transfer
+  if (t->CDstart==0) {
+    //no CDS info on container, just copy it from the contained
+    t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
+    return;
+  }
+  const bool extendsLeft = (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start);
+  if (extendsLeft)
+    t->CDstart=tfrom->CDstart;
+  const bool extendsRight = (tfrom->CDend>t->CDend && tfrom->CDend<=t->end);
+  if (extendsRight)
+    t->CDend=tfrom->CDend;
+}
+bool exonOverlap2Gene(GffObj* t, GffObj& g) {
+  // True when t overlaps the span of g: tested exon by exon when t has
+  // exons, otherwise by the overall span of t.
+  if (t->exons.Count()<=0)
+    return g.overlap(*t);
+  return t->exonOverlap(g.start, g.end);
+}
+void GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, // files t under gdata; optionally clusters it into loci and flags redundant transcripts
+ bool matchAllIntrons, bool fuzzSpan) { // matchAllIntrons and fuzzSpan are passed on to redundantTranscripts()
+ GTData* tdata=new GTData(t); //additional transcript data
+ gdata->tdata.Add(tdata);
+ //int tidx=-1;
+ /*
+ if (debug) {
+ GMessage(">>Placing transcript %s\n", t->getID());
+ debugState=true;
+ }
+ else debugState=false;
+ */
+ //dumb TRNA case for RefSeq: gene parent link missing
+ //try to restore it here; BUT this only works if gene feature comes first
+ if (t->parent==NULL && t->isTranscript()) {
+ int gidx=gdata->gfs.Count()-1; //scan the stored non-transcript features backwards from the last one
+ while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
+ GffObj& g = *(gdata->gfs[gidx]);
+ if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
+ g.children.Add(t); //adopt t as a child of the first matching gene found
+ t->parent=&g;
+ //disable printing of gene if transcriptsOnly
+ if (transcriptsOnly) {
+ g.udata|=4; //tag it as non-printable
+ }
+ const char* geneName=g.getAttr("Name");
+ if (t->getAttr("Name")==NULL && geneName) {
+ t->addAttr("Name", geneName); //inherit the gene's Name when t has none
+ t->addAttr("gene_name", geneName);
+ }
+ t->addAttr("geneID", g.getID());
+ break;
+ }
+ gidx--;
+ }
+ }
+ /*
+ if (t->exons.Count()==0 && t->children.Count()==0 && forceExons) {
+ //a non-mRNA feature with no subfeatures
+ //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
+ //--this could be a single "pseudogene" entry or another genomic region without exons
+ //
+ t->addExon(t->start,t->end);
+ }
+ */
+ if (t->exons.Count()>0) {
+ //tidx=
+ gdata->rnas.Add(t); //added it in sorted order
+ }
+ else {
+ if (t->isGene() || !this->transcriptsOnly)
+ gdata->gfs.Add(t); //exon-less features are kept in gfs (genes always; others unless transcriptsOnly)
+ return; //nothing to do with these non-transcript objects
+ }
+ if (!doCluster) return;
+ if (gdata->loci.Count()==0) {
+ gdata->loci.Add(new GffLocus(t));
+ //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
+ return;
+ }
+ /*
+ //DEBUG: show available loci:
+ if (debug) {
+ GMessage(" [%d loci already:\n", gdata->loci.Count());
+ for (int l=0;l<gdata->loci.Count();l++) {
+ printLocus(gdata->loci[l]);
+ }
+ }
+ */
+ int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
+ //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
+ if (nidx==0) {
+ //cannot have any overlapping loci
+ //if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
+ gdata->loci.Add(new GffLocus(t));
+ return;
+ }
+ if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
+ int lfound=0; //count of parent loci
+ GArray<int> mrgloci(false); //indices of the loci that accepted t, appended while l decreases
+ GList<GffLocus> tloci(true); //candidate parent loci to adopt this -- NOTE(review): not referenced again in this function
+ //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
+ for (int l=nidx-1;l>=0;l--) {
+ GffLocus& loc=*(gdata->loci[l]);
+ if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue; //strands are both known and differ
+ if (t->start>loc.end) {
+ if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
+ continue;
+ }
+ if (loc.start>t->end) {
+ //this should never be the case if nidx was found correctly
+ GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
+ continue;
+ }
+ /*
+ if (debug) {
+ GMessage(" !range overlap found with locus ");
+ printLocus(&loc);
+ }
+ */
+ if (loc.add_RNA(t)) {
+ //will add this transcript to loc
+ lfound++;
+ mrgloci.Add(l);
+ if (collapseRedundant) {
+ //compare to every single transcript in this locus
+ for (int ti=0;ti<loc.rnas.Count();ti++) {
+ if (loc.rnas[ti]==t) continue;
+ GTData* odata=(GTData*)(loc.rnas[ti]->uptr); //presumably the GTData attached to that transcript when it was placed -- verify where uptr is set
+ //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
+ GffObj* container=NULL;
+ if (odata->replaced_by==NULL &&
+ (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
+ if (container==t) {
+ odata->replaced_by=t; //t supersedes the other transcript
+ preserveContainedCDS(t, loc.rnas[ti]);
+ }
+ else {
+ tdata->replaced_by=loc.rnas[ti]; //the other transcript supersedes t
+ preserveContainedCDS(loc.rnas[ti], t);
+ }
+ }
+ }//for each transcript in the exon-overlapping locus
+ } //if doCollapseRedundant
+ } //overlapping locus
+ } //for each existing locus
+ if (lfound==0) {
+ //overlapping loci not found, create a locus with only this mRNA
+ /* if (debug) {
+ GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
+ }
+ */
+ int addidx=gdata->loci.Add(new GffLocus(t));
+ if (addidx<0) {
+ //should never be the case!
+ GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
+ }
+ }
+ else { //found at least one overlapping locus
+ lfound--; //from here on: index of the last entry of mrgloci
+ int locidx=mrgloci[lfound];
+ GffLocus& loc=*(gdata->loci[locidx]);
+ //last locus index found is also the smallest index
+ if (lfound>0) {
+ //more than one loci found parenting this mRNA, merge loci
+ /* if (debug)
+ GMessage(" merging %d loci \n",lfound);
+ */
+ for (int l=0;l<lfound;l++) {
+ int mlidx=mrgloci[l];
+ loc.addMerge(*(gdata->loci[mlidx]), t);
+ gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
+ }
+ }
+ int i=locidx;
+ while (i>0 && loc<*(gdata->loci[i-1])) {
+ //bubble down until it's in the proper order
+ i--;
+ gdata->loci.Swap(i,i+1);
+ }
+ }//found at least one overlapping locus
+void collectLocusData(GList<GenomicSeqData>& ref_data) {
+  // Numbers every locus of every genomic sequence (1-based, running across
+  // all sequences) and records, per locus, how often each gene name and each
+  // "GeneID:" cross-reference occurs among its transcripts.
+  // Gene names and xref tokens are upper-cased before counting; the GeneID
+  // key is the text following the first ':' of the token.
+  int locus_num=0;
+  for (int g=0;g<ref_data.Count();g++) {
+    GenomicSeqData* gdata=ref_data[g];
+    for (int l=0;l<gdata->loci.Count();l++) {
+      GffLocus& loc=*(gdata->loci[l]);
+      GHash<int> gnames(true); //gene names in this locus
+      GHash<int> geneids(true); //Entrez GeneID: numbers
+      for (int i=0;i<loc.rnas.Count();i++) {
+        GffObj& t=*(loc.rnas[i]);
+        GStr gname(t.getGeneName());
+        if (!gname.is_empty()) {
+          gname.upper();
+          int* prevg=gnames.Find(gname.chars());
+          if (prevg!=NULL) (*prevg)++;
+          else gnames.Add(gname, new int(1));
+        }
+        //parse GeneID xrefs, if any:
+        GStr xrefs(t.getAttr("xrefs"));
+        if (!xrefs.is_empty()) {
+          xrefs.startTokenize(",");
+          GStr token;
+          while (xrefs.nextToken(token)) {
+            token.upper();
+            if (token.startsWith("GENEID:")) {
+              token.cut(0,token.index(':')+1); //keep only what follows the prefix
+              int* prevg=geneids.Find(token.chars());
+              if (prevg!=NULL) (*prevg)++;
+              else geneids.Add(token, new int(1));
+            }
+          } //for each xref
+        } //xrefs parsing
+      }//for each transcript
+      locus_num++;
+      loc.locus_num=locus_num;
+      if (gnames.Count()>0) { //collect all gene names associated to this locus
+        gnames.startIterate();
+        int* gfreq=NULL;
+        char* key=NULL;
+        while ((gfreq=gnames.NextData(key))!=NULL) {
+          loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
+        }
+      } //added collected gene_names
+      //test the collected hash, not the destination list: loc.gene_ids is
+      //what gets filled here, so checking it would skip this block
+      if (geneids.Count()>0) { //collect all GeneIDs associated to this locus
+        geneids.startIterate();
+        int* gfreq=NULL;
+        char* key=NULL;
+        while ((gfreq=geneids.NextData(key))!=NULL) {
+          loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
+        }
+      }
+    } //for each locus
+  }//for each genomic sequence
+}
+void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
+    bool doCluster, bool doCollapseRedundant,
+    bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
+  // Reads all records from the loader's input file, filters them and hands
+  // each accepted record to placeGf() under the GenomicSeqData entry of its
+  // genomic sequence (created on demand in seqdata).
+  // Skipped records: "locus" meta-features carrying a "transcripts"
+  // attribute; with noPseudo, features whose type starts with "pseudo" or
+  // that have a pseudo-flag attribute with a true-like value; and records
+  // rejected by gf_validate. Extra objects returned by gf_validate in its
+  // list are placed before the record itself.
+  // The input file is closed at the end unless it is stdin.
+  GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
+  gffr->showWarnings(this->showWarnings);
+  //            keepAttrs             mergeCloseExons        noExonAttr
+  gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
+  GVec<int> pseudoAttrIds;
+  GVec<int> pseudoFeatureIds;
+  if (this->noPseudo) {
+    //feature types whose name starts with "pseudo"
+    GffNameList& fnames = gffr->names->feats;
+    for (int fi=0;fi<fnames.Count();fi++) {
+      char* featName=fnames[fi]->name;
+      if (startsWith(featName, "pseudo")) {
+        pseudoFeatureIds.Add(fnames[fi]->idx);
+      }
+    }
+    //attribute names of the form "pseudo..." or "is" + any one character + "pseudo..." (case-insensitive)
+    GffNameList& attrnames = gffr->names->attrs;
+    for (int ai=0;ai<attrnames.Count();ai++) {
+      char* attrName=attrnames[ai]->name;
+      char* hit=strifind(attrName, "pseudo");
+      if (hit==attrName || (hit==attrName+2 && tolower(attrName[0])=='i' && tolower(attrName[1])=='s')) {
+        pseudoAttrIds.Add(attrnames[ai]->idx);
+      }
+    }
+  }
+  if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
+  //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
+  for (int k=0;k<gffr->gflst.Count();k++) {
+    GffObj* m=gffr->gflst[k];
+    if (strcmp(m->getFeatureName(), "locus")==0 &&
+        m->getAttr("transcripts")!=NULL) {
+      continue; //discard locus meta-features
+    }
+    if (this->noPseudo) {
+      bool is_pseudo=false;
+      for (int fi=0;fi<pseudoFeatureIds.Count() && !is_pseudo;++fi) {
+        if (pseudoFeatureIds[fi]==m->ftype_id) is_pseudo=true;
+      }
+      if (is_pseudo) continue;
+      if (m->attrs!=NULL) {
+        for (int ai=0;ai<pseudoAttrIds.Count();++ai) {
+          char* attrv=m->attrs->getAttr(pseudoAttrIds[ai]);
+          if (attrv==NULL) continue;
+          //values starting with t(rue), y(es) or 1 flag a pseudo feature
+          char fc=tolower(attrv[0]);
+          if (fc=='t' || fc=='y' || fc=='1') {
+            is_pseudo=true;
+            break;
+          }
+        }
+      }
+      if (is_pseudo) continue;
+    }
+    char* rloc=m->getAttr("locus");
+    if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
+      m->removeAttr("locus", rloc);
+    }
+    if (forceExons) {
+      m->exon_ftype_id=gff_fid_exon;
+    }
+    GList<GffObj> gfadd(false,false);
+    if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
+      continue;
+    }
+    m->isUsed(true); //so the gffreader won't destroy it
+    //look up (or create) the entry for this record's genomic sequence
+    int seqIdx=-1;
+    GenomicSeqData seqKey(m->gseq_id);
+    GenomicSeqData* gdata=NULL;
+    if (seqdata.Found(&seqKey,seqIdx)) {
+      gdata=seqdata[seqIdx];
+    }
+    else { //entry not created yet for this genomic seq
+      gdata=new GenomicSeqData(m->gseq_id);
+      seqdata.Add(gdata);
+    }
+    for (int a=0;a<gfadd.Count();a++) {
+      placeGf(gfadd[a], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
+    }
+    placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
+  } //for each read gffObj
+  if (f!=stdin) { fclose(f); f=NULL; }
+  delete gffr;
+}
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libgff.git
More information about the debian-med-commit
mailing list