[med-svn] [Git][med-team/libgclib][upstream] New upstream version 0.12.7+ds
Andreas Tille (@tille)
gitlab at salsa.debian.org
Tue Oct 12 06:42:50 BST 2021
Andreas Tille pushed to branch upstream at Debian Med / libgclib
Commits:
fd5fa4bd by Andreas Tille at 2021-10-12T07:31:11+02:00
New upstream version 0.12.7+ds
- - - - -
19 changed files:
- .gitignore
- GBam.cpp
- GBase.cpp
- GBase.h
- GBitVec.h
- GFastaIndex.cpp
- GHashMap.hh
- + GIntervalTree.hh
- GList.hh
- GResUsage.cpp
- GResUsage.h
- GStr.cpp
- GStr.h
- GVec.hh
- gff.cpp
- gff.h
- htest.cpp
- khashl.hh
- + wyhash.h
Changes:
=====================================
.gitignore
=====================================
@@ -27,3 +27,4 @@
*.exe
*.out
*.app
+/Default/
=====================================
GBam.cpp
=====================================
@@ -2,6 +2,9 @@
#include <ctype.h>
#include "kstring.h"
+#define _cigOp(c) ((c)&BAM_CIGAR_MASK)
+#define _cigLen(c) ((c)>>BAM_CIGAR_SHIFT)
+
//auxiliary functions for low level BAM record creation
uint8_t* realloc_bdata(bam1_t *b, int size) {
if (b->m_data < size) {
@@ -320,47 +323,75 @@ switch (cop) {
void GBamRecord::setupCoordinates() {
const bam1_core_t *c = &b->core;
if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
- uint32_t *p = bam1_cigar(b);
+ uint32_t *cigar = bam1_cigar(b);
+ //uint32_t *p = bam1_cigar(b);
//--- prevent alignment error here (reported by UB-sanitazer):
- uint32_t *cigar= new uint32_t[c->n_cigar];
- memcpy(cigar, p, c->n_cigar * sizeof(uint32_t));
+ //uint32_t *cigar= new uint32_t[c->n_cigar];
+ //memcpy(cigar, p, c->n_cigar * sizeof(uint32_t));
//--- UBsan protection end
int l=0;
mapped_len=0;
clipL=0;
clipR=0;
start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based)
+ GSeg exon;
int exstart=c->pos;
+ bool intron=false;
+ bool ins=false;
for (int i = 0; i < c->n_cigar; ++i) {
- int op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op==BAM_CEQUAL ||
- op == BAM_CDIFF || op == BAM_CDEL) {
- l += cigar[i]>>4;
- }
- else if (op == BAM_CREF_SKIP) { //N
- //intron starts
- //exon ends here
+ unsigned char op = _cigOp(cigar[i]);
+ switch(op) {
+ case BAM_CEQUAL: // =
+ case BAM_CDIFF: // X
+ case BAM_CMATCH: // M
+ case BAM_CDEL: // D
+ l+=_cigLen(cigar[i]);
+ intron=false;
+ ins=false;
+ break;
+ case BAM_CINS: // I
+ //rpos+=cl; //gpos not advanced
+ //take care of cases where there is an ins within an intron
+ ins=true;
+ break;
+ case BAM_CREF_SKIP: // N
+ //intron starts
+ //exon ends here
+ if(!ins || !intron) { // insertion in the middle of an intron --> adjust last exon
+ exon.start=exstart+1;
+ exon.end=c->pos+l;
+ exons.Add(exon);
+ mapped_len+=exon.len();
+ }
has_Introns=true;
- GSeg exon(exstart+1,c->pos+l);
- exons.Add(exon);
- mapped_len+=exon.len();
- l += cigar[i]>>4;
+ l += _cigLen(cigar[i]);
exstart=c->pos+l;
- }
- else if (op == BAM_CSOFT_CLIP) {
- soft_Clipped=true;
- if (l) clipR=(cigar[i]>>4);
- else clipL=(cigar[i]>>4);
- }
- else if (op == BAM_CHARD_CLIP) {
- hard_Clipped=true;
+ intron=true;
+ break;
+ case BAM_CSOFT_CLIP: // S
+ soft_Clipped=true;
+ if (l) clipR=_cigLen(cigar[i]);
+ else clipL=_cigLen(cigar[i]);
+ intron=false; ins=false;
+ break;
+ case BAM_CHARD_CLIP:
+ hard_Clipped=true;
+ intron=false; ins=false;
+ break;
+ case BAM_CPAD:
+ //gpos+=cl;
+ intron=false; ins=false; //?
+ break;
+ default:
+ int cl=_cigLen(cigar[i]);
+ fprintf(stderr, "Unhandled CIGAR operation %d:%d\n", op, cl);
}
}
- GSeg exon(exstart+1,c->pos+l);
+ exon.start=exstart+1;
+ exon.end=c->pos+l;
exons.Add(exon);
mapped_len+=exon.len();
- end=c->pos+l; //genomic end coordinate
- delete[] cigar; //UBsan protection
+ end=exon.end; //genomic end coordinate
}
=====================================
GBase.cpp
=====================================
@@ -62,61 +62,17 @@ void GMessage(const char* format,...){
_vsnprintf(msg, 4095, format, arguments);
msg[4095]=0;
va_end(arguments);
- OutputDebugString(msg);
+ fflush(stderr);
+ //OutputDebugString(msg);
#else
va_list arguments;
va_start(arguments,format);
vfprintf(stderr,format,arguments);
va_end(arguments);
+ fflush(stderr);
#endif
}
-/*************** Memory management routines *****************/
-// Allocate memory
-bool GMalloc(pointer* ptr,unsigned long size){
- //GASSERT(ptr);
- if (size!=0)
- *ptr=malloc(size);
- return *ptr!=NULL;
- }
-
-// Allocate cleaned memory (0 filled)
-bool GCalloc(pointer* ptr,unsigned long size){
- GASSERT(ptr);
- *ptr=calloc(size,1);
- return *ptr!=NULL;
- }
-
-// Resize memory
-bool GRealloc(pointer* ptr,unsigned long size){
- //GASSERT(ptr);
- if (size==0) {
- GFree(ptr);
- return true;
- }
- if (*ptr==NULL) {//simple malloc
- void *p=malloc(size);
- if (p != NULL) {
- *ptr=p;
- return true;
- }
- else return false;
- }//malloc
- else {//realloc
- void *p=realloc(*ptr,size);
- if (p) {
- *ptr=p;
- return true;
- }
- return false;
- }
- }
-// Free memory, resets ptr to NULL afterward
-void GFree(pointer* ptr){
- GASSERT(ptr);
- if (*ptr) free(*ptr);
- *ptr=NULL;
- }
char* Gstrdup(const char* str, int xtracap) {
if (str==NULL) return NULL;
@@ -184,6 +140,44 @@ void Gmktempdir(char* templ) {
#endif
}
+char *to_unix_path(char *p) {
+ if (p != NULL) {
+ char *pp = p;
+ while (*pp != 0) {
+ if (*pp == '\\')
+ *pp = '/';
+ ++pp;
+ }
+ }
+ return p;
+}
+
+char* Grealpath(const char *path, char *resolved_path) {
+#ifdef _WIN32
+ //char *realpath(const char *path, char *resolved_path) {
+ char *ret = NULL;
+ if (path == NULL) {
+ errno = EINVAL;
+ } else if (access(path, R_OK) == 0) {
+ ret = resolved_path;
+ if (ret == NULL) {
+ GMALLOC(ret, _MAX_PATH);
+ }
+ if (ret == NULL) {
+ errno = ENOMEM;
+ } else {
+ ret = _fullpath(ret, path, _MAX_PATH);
+ if (ret == NULL)
+ errno = EIO;
+ }
+ }
+ return to_unix_path(ret);
+ #else
+ return realpath(path, resolved_path);
+ #endif
+}
+
+
int Gmkdir(const char *path, bool recursive, int perms) {
if (path==NULL || path[0]==0) return -1;
mode_t process_mask = umask(0); //is this really needed?
@@ -236,6 +230,16 @@ int Gmkdir(const char *path, bool recursive, int perms) {
return 0;
}
+bool haveStdInput() {
+#ifdef _WIN32
+ HANDLE hIn = GetStdHandle(STD_INPUT_HANDLE);
+ DWORD stype = GetFileType(hIn);
+ return (stype!=FILE_TYPE_CHAR);
+#else
+ return !(isatty(fileno(stdin)));
+#endif
+}
+
FILE* Gfopen(const char *path, char *mode) {
FILE* f=NULL;
if (mode==NULL) f=fopen(path, "rb");
@@ -244,6 +248,80 @@ FILE* Gfopen(const char *path, char *mode) {
GMessage("Error opening file '%s': %s\n", path, strerror(errno));
return f;
}
+#define IS_CHR_DELIM(c) ( c == ' ' || c == '\t' || c == ':' )
+void GRangeParser::parse(char* s) {
+ //parses general range format: [+\-|.]refID[+\-\.][ |:][start][-|.\s]end[\s:][+\-\.]
+ // if ref ID has ':' a space delimited format is preferred
+ // or just separate the ref ID from the coordinate range: [[+/-]ref<space>start[[-..][end]]
+ //the safest way would be to parse from the end in case the ref ID has ':' characters
+ //if the whole chromosome is intended (no coordinates to speficy)
+ this->start=0;
+ this->end=0;
+ this->strand=0;
+ int slen=strlen(s);
+ if (slen==0) return;
+ while (isspace(s[slen-1])) { slen--; s[slen]=0; } //trim trailing spaces
+ while (isspace(*s)) { ++s; --slen; } //trim prefix spaces
+ char c=*s;
+ if (c=='+' || c=='-') {
+ strand=c;
+ ++s;slen--;
+ }
+ if (strand && (*s==':' || *s==' ')) //ignore
+ { s++;slen--; }
+ char* p=s; //parsing position for coordinate string
+ char* isep=strpbrk(s, " \t");
+ if (isep==NULL) isep=strchr(s, ':');
+ if (isep) { //chr (ref) ID ending found
+ p=isep+1;
+ *isep=0;
+ //character after the delimiter can only be a strand if it's followed by another delimiter
+ //e.g. chr1 + 134551-204326 or chr1:+:134551-204326
+ c=*(isep+1);
+ if (strand==0 && (c=='+' || c=='-' || c=='.') && IS_CHR_DELIM(*(isep+2))) {
+ strand=c;
+ p=isep+3;
+ }
+ if (strand==0) {
+ c=*(isep-1); //character before the delimiter could be the strand
+ if (c=='+' || c=='-') { //not '.', sorry
+ isep--;
+ strand=c;
+ *isep=0; //ref is now parsable
+ }
+ }
+ this->refName=Gstrdup(s,isep-1);
+ }
+ //here we are after ref ID (and possibly strand) delimiter
+ char* pend=p;
+ if (isdigit(*pend)) {
+ //parse the start coordinate then
+ do { pend++; } while (isdigit(*pend));
+ c=*pend;
+ *pend=0;
+ this->start=atoi(p);
+ p=pend;
+ *p=c;
+ }
+ while (*p=='-' || *p=='.' || *p==' ' || *p=='\t') ++p;
+ pend=p;
+ while (isdigit(*pend)) pend++;
+ if (pend>p) { //parse the 2nd coordinate
+ c=*pend;
+ *pend=0;
+ this->end=atoi(p);
+ *pend=c;
+ }
+ if (start && end && end<start) Gswap(this->start, this->end);
+ //if (strand==0) { ?
+ c=s[slen-1]; //peek at the end of the string for strand
+ if (c=='+' || c=='-' || c=='.') {
+ if (end || IS_CHR_DELIM(s[slen-2]))
+ strand=c;
+ //slen--;s[slen]=0;
+ }
+}
+
bool GstrEq(const char* a, const char* b) {
if (a==NULL || b==NULL) return false;
=====================================
GBase.h
=====================================
@@ -1,6 +1,6 @@
#ifndef G_BASE_DEFINED
#define G_BASE_DEFINED
-#define GCLIB_VERSION "0.12.1"
+#define GCLIB_VERSION "0.12.7"
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -49,7 +49,7 @@
#define CHPATHSEP '/'
#ifdef __CYGWIN__
#define _BSD_SOURCE
- #endif
+ #endif
#include <unistd.h>
#endif
@@ -165,6 +165,11 @@ typedef int GCompareProc(const pointer item1, const pointer item2);
typedef long GFStoreProc(const pointer item1, FILE* fstorage); //for serialization
typedef pointer GFLoadProc(FILE* fstorage); //for deserialization
+void GError(const char* format,...); // Error routine (aborts program)
+void GMessage(const char* format,...);// Log message to stderr
+// Assert failed routine:- usually not called directly but through GASSERT
+void GAssert(const char* expression, const char* filename, unsigned int lineno);
+
typedef void GFreeProc(pointer item); //usually just delete,
//but may also support structures with embedded dynamic members
@@ -189,9 +194,14 @@ inline int iround(double x) {
return (int)floor(x + 0.5);
}
+char* Grealpath(const char *path, char *resolved_path);
+
int Gmkdir(const char *path, bool recursive=true, int perms = (S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH));
+
void Gmktempdir(char* templ);
+bool haveStdInput(); //if stdin is from a pipe or redirection
+
/****************************************************************************/
inline int Gintcmp(int a, int b) {
@@ -229,21 +239,52 @@ template<class T>
std::is_same<char *, typename std::decay<T>::type>::value
> {};
-/**************** Memory management ***************************/
-
-bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
-bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
-bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
-
-void GFree(pointer* ptr); // Free memory, resets ptr to NULL
-//int saprintf(char **retp, const char *fmt, ...);
-
-void GError(const char* format,...); // Error routine (aborts program)
-void GMessage(const char* format,...);// Log message to stderr
-// Assert failed routine:- usually not called directly but through GASSERT
-void GAssert(const char* expression, const char* filename, unsigned int lineno);
+inline void GFree(pointer* ptr){
+ GASSERT(ptr);
+ if (*ptr) free(*ptr);
+ *ptr=NULL;
+ }
+
+inline bool GMalloc(pointer* ptr,unsigned long size){
+ //GASSERT(ptr);
+ if (size!=0)
+ *ptr=malloc(size);
+ return *ptr!=NULL;
+ }
+
+// Allocate 0-filled memory
+inline bool GCalloc(pointer* ptr,unsigned long size){
+ GASSERT(ptr);
+ *ptr=calloc(size,1);
+ return *ptr!=NULL;
+ }
+
+// Resize memory
+inline bool GRealloc(pointer* ptr,unsigned long size){
+ //GASSERT(ptr);
+ if (size==0) {
+ GFree(ptr);
+ return true;
+ }
+ if (*ptr==NULL) {//simple malloc
+ void *p=malloc(size);
+ if (p != NULL) {
+ *ptr=p;
+ return true;
+ }
+ else return false;
+ }//malloc
+ else {//realloc
+ void *p=realloc(*ptr,size);
+ if (p) {
+ *ptr=p;
+ return true;
+ }
+ return false;
+ }
+ }
template<class T> T* GDupAlloc(T& data) {
T* tmp=NULL;
@@ -280,7 +321,6 @@ char* strupper(char * str);
void* Gmemscan(void *mem, unsigned int len,
void *part, unsigned int partlen);
-
FILE* Gfopen(const char *path, char *mode=NULL);
// test if a char is in a string:
@@ -334,8 +374,7 @@ int djb_hash(const char* cp);
//---- generic base GSeg : genomic segment (interval) --
// coordinates are considered 1-based (so 0 is invalid)
-class GSeg {
- public:
+struct GSeg {
uint start; //start<end always!
uint end;
GSeg(uint s=0,uint e=0) {
@@ -345,17 +384,14 @@ class GSeg {
//check for overlap with other segment
uint len() { return end-start+1; }
bool overlap(GSeg* d) {
- //return start<d->start ? (d->start<=end) : (start<=d->end);
return (start<=d->end && end>=d->start);
}
bool overlap(GSeg& d) {
- //return start<d.start ? (d.start<=end) : (start<=d.end);
return (start<=d.end && end>=d.start);
}
bool overlap(GSeg& d, int fuzz) {
- //return start<d.start ? (d.start<=end+fuzz) : (start<=d.end+fuzz);
return (start<=d.end+fuzz && end+fuzz>=d.start);
}
@@ -430,6 +466,18 @@ class GSeg {
}
};
+struct GRangeParser: GSeg {
+ char* refName=NULL;
+ char strand=0;
+ void parse(char* s);
+ GRangeParser(char* s=NULL):GSeg(0, 0) {
+ if (s) parse(s);
+ }
+ ~GRangeParser() {
+ GFREE(refName);
+ }
+};
+
//basic dynamic array template for primitive types
//which can only grow (reallocate) as needed
=====================================
GBitVec.h
=====================================
@@ -115,8 +115,13 @@ public:
/// GBitVec ctor - Creates a GBitVec of specified number of bits. All
/// bits are initialized to the specified value.
- explicit GBitVec(uint s, bool value = false) : Size(s) {
- Capacity = NumBitWords(s);
+ explicit GBitVec(uint bitsize, bool value = false) : Size(bitsize) {
+ if (bitsize==0) {
+ Capacity=0;
+ fBits=0;
+ return;
+ }
+ Capacity = NumBitWords(bitsize);
//fBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord));
GMALLOC(fBits, Capacity * sizeof(BitWord));
init_words(fBits, Capacity, value);
@@ -128,6 +133,19 @@ public:
return r;
}
+ GBitVec(const GBitVec* RHS) {
+ if (RHS==NULL) {
+ Size = 0;
+ fBits = 0;
+ Capacity = 0;
+ return;
+ }
+ Capacity = NumBitWords(RHS->size());
+ GMALLOC(fBits, Capacity * sizeof(BitWord));
+ memcpy(fBits, RHS->fBits, Capacity * sizeof(BitWord));
+ }
+
+
/// GBitVec copy ctor.
GBitVec(const GBitVec &RHS) : Size(RHS.size()) {
if (Size == 0) {
@@ -326,14 +344,18 @@ public:
// Indexing.
GBitRef operator[](uint Idx) {
//assert (Idx < Size && "Out-of-bounds Bit access.");
- indexCheck(Idx, Size);
+ #ifndef NDEBUG
+ indexCheck(Idx, Size);
+ #endif
return GBitRef(*this, Idx);
}
bool operator[](uint Idx) const {
+ #ifndef NDEBUG
indexCheck(Idx, Size);
- BitWord Mask = 1L << (Idx % BITWORD_SIZE);
- return (fBits[Idx / BITWORD_SIZE] & Mask) != 0;
+ #endif
+ BitWord Mask = 1L << (Idx % BITWORD_SIZE);
+ return (fBits[Idx / BITWORD_SIZE] & Mask) != 0;
}
bool test(uint Idx) const {
=====================================
GFastaIndex.cpp
=====================================
@@ -20,8 +20,7 @@ void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int l
else {
farec=new GFastaRec(seqlen,foffs,llen,llen_full);
records.Add(seqname,farec);
- //farec->seqname=records.getLastKey();
- farec->seqname=seqname;
+ farec->seqname=records.getLastKey();
}
}
=====================================
GHashMap.hh
=====================================
@@ -11,6 +11,7 @@
#define XXH_INLINE_ALL 1
#include "xxhash.h"
+#include "wyhash.h"
template <typename K> struct GHashKey_xxHash32 { //K generic (class, primitive, pointer except const char* )
//template <typename T=K> inline typename std::enable_if< std::is_trivial<T>::value, uint32_t>::type
@@ -40,6 +41,21 @@ template <> struct GHashKey_xxHash<const char*> {
}
};
+
+template <typename K> struct GHashKey_wyHash { //K generic (class, primitive, pointer except const char* )
+ //template <typename T=K> inline typename std::enable_if< std::is_trivial<T>::value, uint32_t>::type
+ uint64_t operator()(const K& s) const { //only works for trivial types!
+ static_assert(std::is_trivial<K>::value, "Error: cannot use this for non-trivial types!\n");
+ return wyhash((const void *) &s, sizeof(K), 0, _wyp);
+ }
+};
+
+template <> struct GHashKey_wyHash<const char*> {
+ inline uint32_t operator()(const char* s) const {
+ return wyhash(s, strlen(s), 0, _wyp);
+ }
+};
+
template <typename K> struct GHashKey_Eq { //K is a type having the == operator defined
inline bool operator()(const K& x, const K& y) const {
return (x == y); //requires == operator to be defined for K
@@ -52,8 +68,9 @@ template <> struct GHashKey_Eq<const char*> {
}
};
-//GHashSet is never making a deep copy of the char* key, it only stores the pointer
-template <typename K=const char*, class Hash=GHashKey_xxHash<K>, class Eq=GHashKey_Eq<K>, typename khInt_t=uint64_t >
+// GHashSet<KType> never makes a deep copy of a char* key, it only stores the pointer
+// - for pointer keys like char*, key allocation must be managed separately (and should always survive the GHashSet)
+template <typename K=const char*, class Hash=GHashKey_wyHash<K>, class Eq=GHashKey_Eq<K>, typename khInt_t=uint64_t >
class GHashSet: public std::conditional< is_char_ptr<K>::value,
klib::KHashSetCached< K, Hash, Eq, khInt_t >,
klib::KHashSet< K, Hash, Eq, khInt_t > >::type {
@@ -78,7 +95,7 @@ public:
}
inline void Clear() {
- this->clear(); //does not shrink !
+ this->clear(); //does not shrink
}
inline void Reset() {
@@ -114,7 +131,7 @@ public:
//returns a pointer to next valid key in the table (NULL if no more)
if (this->count==0) return NULL;
uint32_t nb=this->n_buckets();
- while (i_iter<nb && !this->occupied(i_iter)) i_iter++;
+ while (i_iter<nb && !this->_used(i_iter)) i_iter++;
if (i_iter==nb) return NULL;
K* k=&(this->key(i_iter-1));
++i_iter;
@@ -125,10 +142,12 @@ public:
};
-//GStrSet always allocates a copy of each added string;
-// if you don't want that (keys are shared), just use GHashSet<const char*> instead
-template <class Hash=GHashKey_xxHash<const char*>, class Eq=GHashKey_Eq<const char*>, typename khInt_t=uint64_t>
+// GStrSet always allocates a new copy of each added string;
+// if you don't want that, just use GHashSet<const char*> instead and manage the key allocation separately
+template <class Hash=GHashKey_wyHash<const char*>, class Eq=GHashKey_Eq<const char*>, typename khInt_t=uint64_t>
class GStrSet: public GHashSet<const char*, Hash, Eq, khInt_t> {
+ protected:
+ const char* lastKey=NULL;
public:
inline int Add(const char* ky) { // return -1 if the key already exists
int absent=-1;
@@ -136,16 +155,21 @@ template <class Hash=GHashKey_xxHash<const char*>, class Eq=GHashKey_Eq<const ch
if (absent==1) {//key was actually added
const char* s=Gstrdup(ky);
this->key(i)=s; //store a copy of the key string
+ lastKey=s;
return i;
}
//key was already there
return -1;
}
+ inline const char* getLastKey() { return lastKey; }
+
int Remove(const char* ky) { //return index being removed, or -1 if no such key exists
khInt_t i=this->get(ky);
if (i!=this->end()) {
- GFREE(this->key(i)); //free string copy
+ const char* s=this->key(i);
+ if (s==lastKey) lastKey=NULL;
+ GFREE(s); //free string copy
this->del(i);
return i;
}
@@ -155,16 +179,18 @@ template <class Hash=GHashKey_xxHash<const char*>, class Eq=GHashKey_Eq<const ch
inline void Clear() {
khInt_t nb=this->n_buckets();
for (khInt_t i = 0; i != nb; ++i) {
- if (!this->__kh_used(this->used, i)) continue;
+ if (!this->_used(i)) continue;
//deallocate string copy
GFREE(this->key(i));
}
+ lastKey=NULL;
this->clear(); //does not shrink !
}
inline void Reset() {
this->Clear();
GFREE(this->used); GFREE(this->keys);
+ lastKey=NULL;
this->bits=0; this->count=0;
}
@@ -174,8 +200,12 @@ template <class Hash=GHashKey_xxHash<const char*>, class Eq=GHashKey_Eq<const ch
};
-//generic hash map where keys and values can be of any type
-template <class K, class V, class Hash=GHashKey_xxHash<K>, class Eq=GHashKey_Eq<K>, typename khInt_t=uint64_t>
+// Generic hash map where keys and values can be of any type
+// Note: keys are always copied (shared) as simple value, there is no deep copy/allocation for pointers
+// so pointer keys must me managed separately
+// Note: pointer values are automatically deallocated on container destruction by default,
+// use GHashMap(false) to disable that when V is a pointer
+template <class K, class V, class Hash=GHashKey_wyHash<K>, class Eq=GHashKey_Eq<K>, typename khInt_t=uint64_t>
class GHashMap:public std::conditional< is_char_ptr<K>::value,
klib::KHashMapCached< K, V, Hash, Eq, khInt_t>,
klib::KHashMap< K, V, Hash, Eq, khInt_t> >::type {
@@ -217,7 +247,6 @@ public:
return -1;
}
-
template <typename T=V> inline
typename std::enable_if< std::is_pointer<T>::value, void>::type
Clear() {
@@ -227,7 +256,7 @@ public:
}
khInt_t nb=this->n_buckets();
for (khInt_t i = 0; i != nb; ++i) {
- if (!this->__kh_used(this->used, i)) continue;
+ if (!this->_used(i)) continue;
if (freeItems) delete this->value(i);
}
this->clear();
@@ -235,17 +264,7 @@ public:
template <typename T=V> inline
typename std::enable_if< !std::is_pointer<T>::value, void>::type
- Clear() {
- if (!freeItems) {
- this->clear(); //does not shrink !
- return;
- }
- khInt_t nb=this->n_buckets();
- for (khInt_t i = 0; i != nb; ++i) {
- if (!this->__kh_used(this->used, i)) continue;
- }
- this->clear();
- }
+ Clear() { this->clear(); }
inline void Reset() {
this->Clear();
@@ -258,7 +277,6 @@ public:
}
// -- these can be shared with GHash:
-
GHashMap(bool doFree=std::is_pointer<V>::value):freeItems(doFree) {
static_assert(std::is_trivial<K>::value,
"Error: cannot use this for non-trivial types!\n");
@@ -313,7 +331,7 @@ public:
//returns a pointer to next key entry in the table (NULL if no more)
if (this->count==0) return NULL;
khInt_t nb=this->n_buckets();
- while (i_iter<nb && !this->occupied(i_iter)) i_iter++;
+ while (i_iter<nb && !this->_used(i_iter)) i_iter++;
if (i_iter==nb) return NULL;
val=this->value(i_iter);
K* k=&(this->key(i_iter));
@@ -327,7 +345,7 @@ public:
//returns a pointer to next key entry in the table (NULL if no more)
if (this->count==0) return NULL;
khInt_t nb=this->n_buckets();
- while (i_iter<nb && !this->occupied(i_iter)) i_iter++;
+ while (i_iter<nb && !this->_used(i_iter)) i_iter++;
if (i_iter==nb) return NULL;
val=this->value(i_iter);
K k = this->key(i_iter);
@@ -341,7 +359,7 @@ public:
//returns a pointer to next key entry in the table (NULL if no more)
if (this->count==0) return NULL;
khInt_t nb=this->n_buckets();
- while (i_iter<nb && !this->occupied(i_iter)) i_iter++;
+ while (i_iter<nb && !this->_used(i_iter)) i_iter++;
if (i_iter==nb) return NULL;
T* val=&(this->value(i_iter));
++i_iter;
@@ -354,23 +372,24 @@ public:
//returns a pointer to next key entry in the table (NULL if no more)
if (this->count==0) return NULL;
khInt_t nb=this->n_buckets();
- while (i_iter<nb && !this->occupied(i_iter)) i_iter++;
+ while (i_iter<nb && !this->_used(i_iter)) i_iter++;
if (i_iter==nb) return NULL;
T val=this->value(i_iter);
++i_iter;
return val;
}
-
-
inline uint32_t Count() { return this->count; }
};
-template <class V, class Hash=GHashKey_xxHash<const char*>, class Eq=GHashKey_Eq<const char*>, typename khInt_t=uint64_t >
+// GHash<VType>(doFree=true) -- basic string hashmap
+// Note: this hash map always makes a copy of the string key which can be costly
+// use GHashMap<const char*, VTYPE> for a faster alternative
+template <class V, class Hash=GHashKey_wyHash<const char*>, class Eq=GHashKey_Eq<const char*>, typename khInt_t=uint64_t >
class GHash:public GHashMap<const char*, V, Hash, Eq, khInt_t> {
protected:
-
+ const char* lastKey=NULL;
public:
GHash(bool doFree=true) {
this->freeItems=doFree;
@@ -383,17 +402,23 @@ public:
if (absent==1) { //key was actually added
const char* s=Gstrdup(ky);
this->key(i)=s; //store a copy of the key string
+ lastKey=s;
this->value(i)=val; //value is always copied
return i;
}
return -1;
}
+
+ inline const char* getLastKey() { return lastKey; }
+
template <typename T=V> inline
typename std::enable_if< std::is_pointer<T>::value, int>::type
Remove(const char* ky) { //return index being removed
khInt_t i=this->get(ky);
if (i!=this->end()) {
- GFREE(this->key(i)); //free string copy
+ const char* s=this->key(i);
+ if (s==lastKey) lastKey=NULL;
+ GFREE(s); //free string copy
if (this->freeItems) delete this->value(i);
this->del(i);
return i;
@@ -406,7 +431,9 @@ public:
Remove(const char* ky) { //return index being removed
khInt_t i=this->get(ky);
if (i!=this->end()) {
- GFREE(this->key(i)); //free string copy
+ const char* s=this->key(i);
+ if (s==lastKey) lastKey=NULL;
+ GFREE(s); //free string copy
this->del(i);
return i;
}
@@ -418,10 +445,11 @@ public:
Clear() {
khInt_t nb=this->n_buckets();
for (khInt_t i = 0; i != nb; ++i) {
- if (!this->__kh_used(this->used, i)) continue;
+ if (!this->_used(i)) continue;
if (this->freeItems) delete this->value(i);
GFREE(this->key(i));
}
+ lastKey=NULL;
this->clear();
}
@@ -430,9 +458,10 @@ public:
Clear() {
khInt_t nb=this->n_buckets();
for (khInt_t i = 0; i != nb; ++i) {
- if (!this->__kh_used(this->used, i)) continue;
+ if (! this->_used(i) ) continue;
GFREE(this->key(i));
}
+ lastKey=NULL;
this->clear();
}
=====================================
GIntervalTree.hh
=====================================
@@ -0,0 +1,570 @@
+#ifndef E_INTERVAL_TREE
+#define E_INTERVAL_TREE
+
+#include "GBase.h"
+#include "GVec.hh"
+
+// This is an interval tree implementation based on red-black-trees
+// as described in the book _Introduction_To_Algorithms_ by Cormen, Leisserson, and Rivest.
+
+class GIntervalTreeNode {
+ friend class GIntervalTree;
+protected:
+ GSeg* storedInterval;
+ int key;
+ int high;
+ int maxHigh;
+ int red; /* if red=0 then the node is black */
+ GIntervalTreeNode* left;
+ GIntervalTreeNode* right;
+ GIntervalTreeNode* parent;
+public:
+ void Print(GIntervalTreeNode* nil,
+ GIntervalTreeNode* root) const {
+ printf(", k=%i, h=%i, mH=%i",key,high,maxHigh);
+ printf(" l->key=");
+ if( left == nil) printf("NULL"); else printf("%i",left->key);
+ printf(" r->key=");
+ if( right == nil) printf("NULL"); else printf("%i",right->key);
+ printf(" p->key=");
+ if( parent == root) printf("NULL"); else printf("%i",parent->key);
+ printf(" red=%i\n",red);
+ }
+ GIntervalTreeNode():storedInterval(NULL), key(0), high(0),maxHigh(0),red(0),
+ left(NULL), right(NULL), parent(NULL) {}
+ GIntervalTreeNode(GSeg * newInterval): storedInterval (newInterval),
+ key(newInterval->start), high(newInterval->end) ,
+ maxHigh(high), red(0), left(NULL), right(NULL), parent(NULL) { }
+ ~GIntervalTreeNode() {}
+};
+
+struct G_ITRecursionNode {
+public:
+ // this structure stores the information needed when we take the
+ // right branch in searching for intervals but possibly come back
+ // and check the left branch as well.
+ GIntervalTreeNode * start_node;
+ unsigned int parentIndex;
+ int tryRightBranch;
+} ;
+
+class GIntervalTree {
+private:
+ unsigned int recursionNodeStackSize;
+ G_ITRecursionNode * recursionNodeStack;
+ unsigned int currentParent;
+ unsigned int recursionNodeStackTop;
+protected:
+ // A sentinel is used for root and for nil. root->left should always
+ // point to the node which is the root of the tree. nil points to a
+ // node which should always be black but has arbitrary children and
+ // parent and no key or info; These sentinels are used so
+ // that the root and nil nodes do not require special treatment in the code
+ GIntervalTreeNode* root;
+ GIntervalTreeNode* nil;
+
+ // INPUT: the node to rotate on
+ // Rotates as described in _Introduction_To_Algorithms by
+ // Cormen, Leiserson, Rivest (Chapter 14). Basically this
+ // makes the parent of x be to the left of x, x the parent of
+ // its parent before the rotation and fixes other pointers
+ // accordingly. Also updates the maxHigh fields of x and y
+ // after rotation.
+ void LeftRotate(GIntervalTreeNode* x) {
+ GIntervalTreeNode* y;
+ // originally wrote this function to use the sentinel for
+ // nil to avoid checking for nil. However this introduces a
+ // very subtle bug because sometimes this function modifies
+ // the parent pointer of nil. This can be a problem if a
+ // function which calls LeftRotate also uses the nil sentinel
+ // and expects the nil sentinel's parent pointer to be unchanged
+ // after calling this function. For example, when DeleteFixUP
+ // calls LeftRotate it expects the parent pointer of nil to be
+ // unchanged.
+
+ y=x->right;
+ x->right=y->left;
+
+ if (y->left != nil) y->left->parent=x; // used to use sentinel here
+ // and do an unconditional assignment instead of testing for nil
+
+ y->parent=x->parent;
+
+ // instead of checking if x->parent is the root as in the book, we
+ // count on the root sentinel to implicitly take care of this case
+ if( x == x->parent->left) {
+ x->parent->left=y;
+ } else {
+ x->parent->right=y;
+ }
+ y->left=x;
+ x->parent=y;
+
+ x->maxHigh=GMAX(x->left->maxHigh, GMAX(x->right->maxHigh,x->high));
+ y->maxHigh=GMAX(x->maxHigh,GMAX(y->right->maxHigh,y->high));
+ }
+ // make the parent of x be to the left of x, x the parent of
+ // its parent before the rotation and fixes other pointers
+ // accordingly. Also updates the maxHigh fields of x and y
+ // after rotation.
+ void RightRotate(GIntervalTreeNode*y) {
+ GIntervalTreeNode* x;
+
+ x=y->left;
+ y->left=x->right;
+
+ if (nil != x->right) x->right->parent=y; //used to use sentinel here
+ // and do an unconditional assignment instead of testing for nil
+
+ // instead of checking if x->parent is the root as in the book, we
+ // count on the root sentinel to implicitly take care of this case
+ x->parent=y->parent;
+ if( y == y->parent->left) {
+ y->parent->left=x;
+ } else {
+ y->parent->right=x;
+ }
+ x->right=y;
+ y->parent=x;
+
+ y->maxHigh=GMAX(y->left->maxHigh,GMAX(y->right->maxHigh,y->high));
+ x->maxHigh=GMAX(x->left->maxHigh,GMAX(y->maxHigh,x->high));
+
+ }
+
+ // Inserts z into the tree as if it were a regular binary tree
+ // using the algorithm described in _Introduction_To_Algorithms_
+ // by Cormen et al. This function is only intended to be called
+ // by the InsertTree function and not by the user
+ void TreeInsertHelp(GIntervalTreeNode* z) {
+ // this should only be called by the Insert method
+ GIntervalTreeNode* x;
+ GIntervalTreeNode* y;
+
+ z->left=z->right=nil;
+ y=root;
+ x=root->left;
+ while( x != nil) {
+ y=x;
+ if ( x->key > z->key) {
+ x=x->left;
+ } else { // x->key <= z->key
+ x=x->right;
+ }
+ }
+ z->parent=y;
+ if ( (y == root) ||
+ (y->key > z->key) ) {
+ y->left=z;
+ } else {
+ y->right=z;
+ }
+ #if defined(DEBUG_ASSERT)
+ Assert(!nil->red,"nil not red in ITTreeInsertHelp");
+ Assert((nil->maxHigh=MIN_INT),
+ "nil->maxHigh != MIN_INT in ITTreeInsertHelp");
+ #endif
+ }
+
+ void TreePrintHelper(GIntervalTreeNode* x) const {
+ if (x != nil) {
+ TreePrintHelper(x->left);
+ x->Print(nil,root);
+ TreePrintHelper(x->right);
+ }
+ }
+
+ // FUNCTION: FixUpMaxHigh
+ // INPUTS: x is the node to start from
+ // EFFECTS: Travels up to the root fixing the maxHigh fields after
+ // an insertion or deletion
+ void FixUpMaxHigh(GIntervalTreeNode* x) {
+ while(x != root) {
+ x->maxHigh=GMAX(x->high,GMAX(x->left->maxHigh,x->right->maxHigh));
+ x=x->parent;
+ }
+ }
+
+ // FUNCTION: DeleteFixUp
+ // INPUTS: x is the child of the spliced
+ // out node in DeleteNode.
+ // EFFECT: Performs rotations and changes colors to restore red-black
+ // properties after a node is deleted
+ void DeleteFixUp(GIntervalTreeNode* x) {
+ GIntervalTreeNode * w;
+ GIntervalTreeNode * rootLeft = root->left;
+
+ while( (!x->red) && (rootLeft != x)) {
+ if (x == x->parent->left) {
+ w=x->parent->right;
+ if (w->red) {
+ w->red=0;
+ x->parent->red=1;
+ LeftRotate(x->parent);
+ w=x->parent->right;
+ }
+ if ( (!w->right->red) && (!w->left->red) ) {
+ w->red=1;
+ x=x->parent;
+ } else {
+ if (!w->right->red) {
+ w->left->red=0;
+ w->red=1;
+ RightRotate(w);
+ w=x->parent->right;
+ }
+ w->red=x->parent->red;
+ x->parent->red=0;
+ w->right->red=0;
+ LeftRotate(x->parent);
+ x=rootLeft; // this is to exit while loop
+ }
+ } else { // the code below is has left and right switched from above
+ w=x->parent->left;
+ if (w->red) {
+ w->red=0;
+ x->parent->red=1;
+ RightRotate(x->parent);
+ w=x->parent->left;
+ }
+ if ( (!w->right->red) && (!w->left->red) ) {
+ w->red=1;
+ x=x->parent;
+ } else {
+ if (!w->left->red) {
+ w->right->red=0;
+ w->red=1;
+ LeftRotate(w);
+ w=x->parent->left;
+ }
+ w->red=x->parent->red;
+ x->parent->red=0;
+ w->left->red=0;
+ RightRotate(x->parent);
+ x=rootLeft; // this is to exit while loop
+ }
+ }
+ }
+ x->red=0;
+ }
+
+ // Make sure the maxHigh fields for everything makes sense.
+ void CheckMaxHighFields(GIntervalTreeNode * x) const {
+ if (x != nil) {
+ CheckMaxHighFields(x->left);
+ if(!(CheckMaxHighFieldsHelper(x,x->maxHigh,0) > 0)) {
+ GEXIT("Error found in CheckMaxHighFields.\n");
+ }
+ CheckMaxHighFields(x->right);
+ }
+ }
+
+ int CheckMaxHighFieldsHelper(GIntervalTreeNode * y,
+ const int currentHigh,
+ int match) const {
+ if (y != nil) {
+ match = CheckMaxHighFieldsHelper(y->left,currentHigh,match) ?
+ 1 : match;
+ GVERIFY(y->high <= currentHigh);
+ if (y->high == currentHigh)
+ match = 1;
+ match = CheckMaxHighFieldsHelper(y->right,currentHigh,match) ?
+ 1 : match;
+ }
+ return match;
+ }public:
+ GIntervalTree():recursionNodeStackSize(128),
+ recursionNodeStack(NULL), currentParent(0), recursionNodeStackTop(1),
+ root(new GIntervalTreeNode), nil(new GIntervalTreeNode) {
+ //nil = new IntervalTreeNode;
+ nil->left = nil->right = nil->parent = nil;
+ nil->red = 0;
+ nil->key = nil->high = nil->maxHigh = INT_MIN;
+ nil->storedInterval = NULL;
+
+ //root = new IntervalTreeNode;
+ root->parent = root->left = root->right = nil;
+ root->key = root->high = root->maxHigh = INT_MAX;
+ root->red=0;
+ root->storedInterval = NULL;
+
+ /* the following are used for the Enumerate function */
+ //recursionNodeStackSize = 128;
+ GMALLOC(recursionNodeStack, recursionNodeStackSize*sizeof(G_ITRecursionNode));
+ //recursionNodeStackTop = 1;
+ recursionNodeStack[0].start_node = NULL;
+}
+
+~GIntervalTree() {
+ GIntervalTreeNode * x = root->left;
+ GVec<GIntervalTreeNode *> stuffToFree;
+ if (x != nil) {
+ if (x->left != nil) {
+ stuffToFree.Push(x->left);
+ }
+ if (x->right != nil) {
+ stuffToFree.Push(x->right);
+ }
+ // delete x->storedInterval;
+ delete x;
+ while( stuffToFree.Count()>0 ) {
+ x = stuffToFree.Pop();
+ if (x->left != nil) {
+ stuffToFree.Push(x->left);
+ }
+ if (x->right != nil) {
+ stuffToFree.Push(x->right);
+ }
+ // delete x->storedInterval;
+ delete x;
+ }
+ }
+ delete nil;
+ delete root;
+ GFREE(recursionNodeStack);
+ }
+ void Print() const { TreePrintHelper(root->left); }
+
+
+ // FUNCTION: DeleteNode
+ //
+ // INPUTS: tree is the tree to delete node z from
+ // OUTPUT: returns the Interval stored at deleted node
+ // EFFECT: Deletes z from tree and but don't call destructor
+ // Then calls FixUpMaxHigh to fix maxHigh fields then calls
+ // DeleteFixUp to restore red-black properties
+ GSeg* DeleteNode(GIntervalTreeNode* z) {
+ GIntervalTreeNode* y;
+ GIntervalTreeNode* x;
+ GSeg* returnValue = z->storedInterval;
+
+ y= ((z->left == nil) || (z->right == nil)) ? z : GetSuccessorOf(z);
+ x= (y->left == nil) ? y->right : y->left;
+ if (root == (x->parent = y->parent)) { // assignment of y->p to x->p is intentional
+ root->left=x;
+ } else {
+ if (y == y->parent->left) {
+ y->parent->left=x;
+ } else {
+ y->parent->right=x;
+ }
+ }
+ if (y != z) { // y should not be nil in this case
+ #ifdef DEBUG_ASSERT
+ Assert( (y!=nil),"y is nil in DeleteNode \n");
+ #endif
+ // y is the node to splice out and x is its child
+ y->maxHigh = INT_MIN;
+ y->left=z->left;
+ y->right=z->right;
+ y->parent=z->parent;
+ z->left->parent=z->right->parent=y;
+ if (z == z->parent->left) {
+ z->parent->left=y;
+ } else {
+ z->parent->right=y;
+ }
+ FixUpMaxHigh(x->parent);
+ if (!(y->red)) {
+ y->red = z->red;
+ DeleteFixUp(x);
+ } else
+ y->red = z->red;
+ delete z;
+ } else {
+ FixUpMaxHigh(x->parent);
+ if (!(y->red)) DeleteFixUp(x);
+ delete y;
+ }
+ return returnValue;
+ }
+
+ // Before calling InsertNode the node x should have its key set
+ // FUNCTION: InsertNode
+ // INPUT: newInterval is the interval to insert
+ // OUTPUT: This function returns a pointer to the newly inserted node
+ // which is guaranteed to be valid until this node is deleted.
+ // What this means is if another data structure stores this
+ // pointer then the tree does not need to be searched when this
+ // is to be deleted.
+ // EFFECTS: Creates a node node which contains the appropriate key and
+ // info pointers and inserts it into the tree.
+ GIntervalTreeNode * Insert(GSeg* newInterval) {
+ GIntervalTreeNode* y;
+ GIntervalTreeNode* newNode;
+ GIntervalTreeNode* x = new GIntervalTreeNode(newInterval);
+ TreeInsertHelp(x);
+ FixUpMaxHigh(x->parent);
+ newNode = x;
+ x->red=1;
+ while(x->parent->red) { // use sentinel instead of checking for root
+ if (x->parent == x->parent->parent->left) {
+ y=x->parent->parent->right;
+ if (y->red) {
+ x->parent->red=0;
+ y->red=0;
+ x->parent->parent->red=1;
+ x=x->parent->parent;
+ } else {
+ if (x == x->parent->right) {
+ x=x->parent;
+ LeftRotate(x);
+ }
+ x->parent->red=0;
+ x->parent->parent->red=1;
+ RightRotate(x->parent->parent);
+ }
+ } else { // case for x->parent == x->parent->parent->right
+ // this part is just like the section above with
+ // left and right interchanged
+ y=x->parent->parent->left;
+ if (y->red) {
+ x->parent->red=0;
+ y->red=0;
+ x->parent->parent->red=1;
+ x=x->parent->parent;
+ } else {
+ if (x == x->parent->left) {
+ x=x->parent;
+ RightRotate(x);
+ }
+ x->parent->red=0;
+ x->parent->parent->red=1;
+ LeftRotate(x->parent->parent);
+ }
+ }
+ }
+ root->left->red=0;
+ return(newNode);
+ }
+
+ // FUNCTION: GetSuccessorOf
+ // INPUTS: x is the node we want the successor of
+ // OUTPUT: This function returns the successor of x or NULL if no
+ // successor exists.
+ GIntervalTreeNode * GetPredecessorOf(GIntervalTreeNode* x) const {
+ GIntervalTreeNode* y;
+ if (nil != (y = x->right)) { // assignment to y is intentional
+ while(y->left != nil) { // returns the minium of the right subtree of x
+ y=y->left;
+ }
+ return(y);
+ } else {
+ y=x->parent;
+ while(x == y->right) { // sentinel used instead of checking for nil
+ x=y;
+ y=y->parent;
+ }
+ if (y == root) return(nil);
+ return(y);
+ }
+
+ }
+ // FUNCTION: GetPredecessorOf
+ // INPUTS: x is the node to get predecessor of
+ // OUTPUT: This function returns the predecessor of x or NULL if no
+ // predecessor exists.
+ GIntervalTreeNode * GetSuccessorOf(GIntervalTreeNode* x) const {
+ GIntervalTreeNode* y;
+ if (nil != (y = x->left)) { // assignment to y is intentional
+ while(y->right != nil) { // returns the maximum of the left subtree of x
+ y=y->right;
+ }
+ return(y);
+ } else {
+ y=x->parent;
+ while(x == y->left) {
+ if (y == root) return(nil);
+ x=y;
+ y=y->parent;
+ }
+ return(y);
+ }
+
+ }
+
+ // FUNCTION: Enumerate
+ // INPUTS: tree is the tree to look for intervals overlapping the
+ // closed interval [low,high]
+ // OUTPUT: stack containing pointers to the nodes overlapping
+ // [low,high]
+ // EFFECT: Returns a stack containing pointers to nodes containing
+ // intervals which overlap [low,high] in O(max(N,k*log(N)))
+ // where N is the number of intervals in the tree and k is
+ // the number of overlapping intervals
+
+ // Note: This basic idea for this function comes from the
+ // _Introduction_To_Algorithms_ book by Cormen et al, but
+ // modifications were made to return all overlapping intervals
+ // instead of just the first overlapping interval as in the
+ // book. The natural way to do this would require recursive
+ // calls of a basic search function. I translated the
+ // recursive version into an iterative version with a stack
+ // as described below.
+
+ // The basic idea for the function below is to take the IntervalSearch
+ // function from the book and modify to find all overlapping intervals
+ // instead of just one. This means that any time we take the left
+ // branch down the tree we must also check the right branch if and only if
+ // we find an overlapping interval in that left branch. Note this is a
+ // recursive condition because if we go left at the root then go left
+ // again at the first left child and find an overlap in the left subtree
+ // of the left child of root we must recursively check the right subtree
+ // of the left child of root as well as the right child of root.
+ GVec<GSeg*> * Enumerate(int low, int high) {
+ GVec<GSeg*> * enumResultStack;
+ GIntervalTreeNode* x=root->left;
+ int stuffToDo = (x != nil);
+
+ // Possible speed up: add min field to prune right searches
+
+ #ifdef DEBUG_ASSERT
+ Assert((recursionNodeStackTop == 1),
+ "recursionStack not empty when entering IntervalTree::Enumerate");
+ #endif
+ currentParent = 0;
+ enumResultStack = new GVec<GSeg*>(4);
+
+ while(stuffToDo) {
+ //if (Overlap(low,high,x->key,x->high) ) {
+ if (low<=x->high && x->key<=high) {
+ enumResultStack->Push(x->storedInterval);
+ recursionNodeStack[currentParent].tryRightBranch=1;
+ }
+ if(x->left->maxHigh >= low) { // implies x != nil
+ if ( recursionNodeStackTop == recursionNodeStackSize ) {
+ recursionNodeStackSize *= 2;
+ recursionNodeStack = (G_ITRecursionNode *)
+ realloc(recursionNodeStack,
+ recursionNodeStackSize * sizeof(G_ITRecursionNode));
+ if (recursionNodeStack == NULL)
+ GEXIT("realloc failed in IntervalTree::Enumerate\n");
+ }
+ recursionNodeStack[recursionNodeStackTop].start_node = x;
+ recursionNodeStack[recursionNodeStackTop].tryRightBranch = 0;
+ recursionNodeStack[recursionNodeStackTop].parentIndex = currentParent;
+ currentParent = recursionNodeStackTop++;
+ x = x->left;
+ } else {
+ x = x->right;
+ }
+ stuffToDo = (x != nil);
+ while( (!stuffToDo) && (recursionNodeStackTop > 1) ) {
+ if(recursionNodeStack[--recursionNodeStackTop].tryRightBranch) {
+ x=recursionNodeStack[recursionNodeStackTop].start_node->right;
+ currentParent=recursionNodeStack[recursionNodeStackTop].parentIndex;
+ recursionNodeStack[currentParent].tryRightBranch=1;
+ stuffToDo = ( x != nil);
+ }
+ }
+ }
+ #ifdef DEBUG_ASSERT
+ Assert((recursionNodeStackTop == 1),
+ "recursionStack not empty when exiting IntervalTree::Enumerate");
+ #endif
+ return(enumResultStack);
+ }
+};
+
+
+#endif
=====================================
GList.hh
=====================================
@@ -32,8 +32,8 @@ template <class OBJ> class GArray:public GVec<OBJ> {
GArray(GCompareProc* cmpFunc=NULL);
GArray(bool sorted, bool unique=false);
GArray(int init_capacity, bool sorted, bool unique=false);
- GArray(GArray<OBJ>& array); //copy constructor
- const GArray<OBJ>& operator=(GArray<OBJ>& array);
+ GArray(const GArray<OBJ>& array); //copy constructor
+ GArray<OBJ>& operator=(const GArray<OBJ>& array);
//~GArray();
//assignment operator
void setSorted(GCompareProc* cmpFunc);
@@ -97,9 +97,11 @@ template <class OBJ> class GList:public GPVec<OBJ> {
bool beUnique=false);
GList(bool sorted, bool free_elements=true, bool beUnique=false);
GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false);
- GList(GList<OBJ>& list); //copy constructor?
+ GList(const GList<OBJ>& list); //copy constructor
+ GList(GList<OBJ>&& list); //move constructor
GList(GList<OBJ>* list); //kind of a copy constructor
- const GList<OBJ>& operator=(GList<OBJ>& list);
+ GList<OBJ>& operator=(GList<OBJ>& list); //copy operator
+ GList<OBJ>& operator=(GList<OBJ>&& list); //move operator
//void Clear();
//~GList();
void setSorted(GCompareProc* compareProc);
@@ -154,12 +156,11 @@ template <class OBJ> class GList:public GPVec<OBJ> {
//-------------------- TEMPLATE IMPLEMENTATION-------------------------------
-template <class OBJ> GArray<OBJ>::GArray(GArray<OBJ>& array):GVec<OBJ>(0) { //copy constructor
+template <class OBJ> GArray<OBJ>::GArray(const GArray<OBJ>& array):GVec<OBJ>(0) { //copy constructor
this->fCount=array.fCount;
this->fCapacity=array.fCapacity;
this->fArray=NULL;
if (this->fCapacity>0) {
- //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ));
this->fArray=new OBJ[this->fCapacity];
}
this->fCount=array.fCount;
@@ -169,7 +170,7 @@ template <class OBJ> GArray<OBJ>::GArray(GArray<OBJ>& array):GVec<OBJ>(0) { //co
for (int i=0;i<this->fCount;i++) this->fArray[i]=array[i];
}
-template <class OBJ> const GArray<OBJ>& GArray<OBJ>::operator=(GArray<OBJ>& array) {
+template <class OBJ> GArray<OBJ>& GArray<OBJ>::operator=(const GArray<OBJ>& array) {
if (&array==this) return *this;
GVec<OBJ>::Clear();
this->fCount=array.fCount;
@@ -365,38 +366,27 @@ template <class OBJ> void GArray<OBJ>::Sort() {
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//*=> GList implementation -- sortable array of pointers to OBJ
-template <class OBJ> GList<OBJ>::GList(GList<OBJ>& list):GPVec<OBJ>(list) { //copy constructor
+template <class OBJ> GList<OBJ>::GList(const GList<OBJ>& list):GPVec<OBJ>(list) { //copy constructor
fUnique=list.fUnique;
fCompareProc=list.fCompareProc;
}
-template <class OBJ> GList<OBJ>::GList(GList<OBJ>* plist):GPVec<OBJ>(0) { //another copy constructor
- this->fCapacity=plist->fCapacity;
- this->fList=NULL;
- if (this->fCapacity>0) {
- GMALLOC(this->fList, this->fCapacity*sizeof(OBJ*));
- }
+template <class OBJ> GList<OBJ>::GList(GList<OBJ>&& list):GPVec<OBJ>(list) { //move constructor
+ fUnique=list.fUnique;
+ fCompareProc=list.fCompareProc;
+}
+
+
+template <class OBJ> GList<OBJ>::GList(GList<OBJ>* plist):GPVec<OBJ>(*plist) {
fUnique=plist->fUnique;
fCompareProc=plist->fCompareProc;
- this->fFreeProc=plist->fFreeProc;
- this->fCount=plist->fCount;
- memcpy(this->fList, plist->fList, this->fCount*sizeof(OBJ*));
- //for (int i=0;i<list->fCount;i++) Add(plist->Get(i));
}
template <class OBJ> void GList<OBJ>::Add(GList<OBJ>& list) {
if (list.Count()==0) return;
- if (SORTED) {
- for (int i=0;i<list.Count();i++) Add(list[i]);
- }
- else { //simply copy
- this->setCapacity(this->fCapacity+list.fCount);
- memcpy( & (this->fList[this->fCount]), list.fList, list.fCount*sizeof(OBJ*));
- this->fCount+=list.fCount;
- }
+ for (int i=0;i<list.Count();i++) Add(list[i]);
}
-
template <class OBJ> GList<OBJ>::GList(GCompareProc* compareProc,
GFreeProc* freeProc, bool beUnique) {
fCompareProc = compareProc;
@@ -451,7 +441,7 @@ template <class OBJ> GList<OBJ>::GList(int init_capacity, bool sorted,
}
}
-template <class OBJ> const GList<OBJ>& GList<OBJ>::operator=(GList& list) {
+template <class OBJ> GList<OBJ>& GList<OBJ>::operator=(GList& list) {
if (&list!=this) {
GPVec<OBJ>::Clear();
fCompareProc=list.fCompareProc;
@@ -463,6 +453,20 @@ template <class OBJ> const GList<OBJ>& GList<OBJ>::operator=(GList& list) {
return *this;
}
+template <class OBJ> GList<OBJ>& GList<OBJ>::operator=(GList&& list) {
+ if (&list!=this) {
+ GPVec<OBJ>::Clear();
+ fCompareProc=list.fCompareProc;
+ this->fCount=list.fCount;
+ this->fFreeProc=list.fFreeProc;
+ this->fList=list.fList;
+ list.fList=NULL;
+ list.fCount=0;
+ list.fCapacity=0;
+ }
+ return *this;
+}
+
template <class OBJ> void GList<OBJ>::setSorted(GCompareProc* compareProc) {
GCompareProc* old_proc=fCompareProc;
fCompareProc=compareProc;
@@ -583,7 +587,7 @@ template <class OBJ> bool GList<OBJ>::Found(OBJ* item, int& idx) {
l = 0;
h = this->fCount - 1;
while (l <= h) {
- i = (l + h) >> 1;
+ i = l + ((h-l)>>1);
c = (*fCompareProc)(this->fList[i], item);
if (c < 0) l = i + 1;
else {
=====================================
GResUsage.cpp
=====================================
@@ -149,22 +149,30 @@ size_t getPeakMemUse() {
// -- Windows
PROCESS_MEMORY_COUNTERS info;
GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
- return (size_t)info.PeakWorkingSetSize/1024;
+ return (size_t)info.PeakWorkingSetSize;
+#elif defined(__APPLE__) && defined(__MACH__) && defined(MACH_TASK_BASIC_INFO)
+ struct mach_task_basic_info info;
+ mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+ if ( task_info( mach_task_self( ), MACH_TASK_BASIC_INFO,
+ (task_info_t)&info, &infoCount ) != KERN_SUCCESS )
+ return (size_t)info.resident_size_max;
+ else
+ return (size_t)0L; // Can't access?
#else // defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
// asssume BSD, Linux, or OSX
struct rusage rusage;
getrusage( RUSAGE_SELF, &rusage );
- #if defined(__APPLE__) && defined(__MACH__)
- return (size_t)rusage.ru_maxrss/1024;
- #else
- return (size_t)(rusage.ru_maxrss);
- #endif
+ #if defined(__APPLE__)
+ return (size_t)(rusage.ru_maxrss);
+ #else //linux returns this in kilobytes
+ return (size_t)(rusage.ru_maxrss*1024L);
+ #endif
#endif
}
/**
* Returns the current resident set size (physical memory use) measured
- * in bytes, or zero if the value cannot be determined on this OS.
+ * in bytes
*/
size_t getCurrentMemUse() {
#if defined(_WIN32)
@@ -201,15 +209,10 @@ size_t getCurrentMemUse() {
}
fclose( fp );
int page_size=sysconf(_SC_PAGESIZE);
- return ((size_t)progsize * (size_t)page_size)/1024;
+ return ((size_t)progsize * (size_t)page_size);
#endif
}
-// get_mem_usage(double &, double &) - takes two doubles by reference,
-// attempts to read the system-dependent data for a process' virtual memory
-// size and resident set size, and return the results in KB.
-//
-// On failure, returns 0.0, 0.0
void printMemUsage(FILE* fout) {
double rs= getCurrentMemUse();
rs/=1024;
@@ -226,6 +229,8 @@ double GResUsage::start() {
started=true;
stopped=false;
start_mem=getCurrentMemUse();
+ double mem=(double)start_mem/1024;
+ GMessage(" start_mem=%.2f\n", mem);
getrusage(RUSAGE_SELF, &start_ru);
G_gettime(start_ts);
double tm=start_ts.tv_sec*1000000.0 + start_ts.tv_nsec/1000.0;
@@ -240,6 +245,9 @@ double GResUsage::stop() {
getrusage(RUSAGE_SELF, &stop_ru);
double tm=stop_ts.tv_sec*1000000.0 + stop_ts.tv_nsec/1000.0;
stop_mem=getCurrentMemUse();
+ double mem=(double)stop_mem/1024;
+ GMessage(" stop_mem=%.2f\n", mem);
+
return tm;
}
@@ -271,8 +279,8 @@ double GResUsage::s_elapsed() {
return (et-st);
}
-size_t GResUsage::memoryUsed() {
+double GResUsage::memoryUsed() { //in kilobytes
stopCheck("memoryUsed");
- return (stop_mem-start_mem);
+ return (((double)stop_mem-(double)start_mem)/1024.0);
}
=====================================
GResUsage.h
=====================================
@@ -17,11 +17,11 @@
#endif
#include <time.h>
-// report the memory usage of the current process, rounded to kilobytes
+// report the memory usage of the current process in bytes
size_t getCurrentMemUse(); //current memory usage of the program (RSS)
size_t getPeakMemUse(); //maximum memory usage (RSS) for the program until now
-void printMemUsage(FILE* fout=stderr);
+void printMemUsage(FILE* fout=stderr); //in kilobytes
double get_usecTime();
@@ -42,12 +42,12 @@ class GResUsage {
if (do_start) start();
}
- double start(); //returns microseconds time using clock_gettime(CLOCK_MONOTONIC
+ double start(); //returns microseconds time using clock_gettime(CLOCK_MONOTONIC)
double stop(); //stop the stopwatch, returns the current time in microseconds
double elapsed(); //microseconds elapsed between start and stop (wallclock time)
double u_elapsed(); //microseconds of user time elapsed
double s_elapsed(); //microseconds of system time elapsed
- size_t memoryUsed(); //memory increase between start and stop in KB (can be negative)
+ double memoryUsed(); //memory increase between start and stop in KB (can be negative)
};
#endif
=====================================
GStr.cpp
=====================================
@@ -32,12 +32,12 @@ GStr::Data * GStr::new_data(uint len, uint addcap) {
GStr::Data* GStr::new_data(const char* str, uint addcap) {
//static method to return a new Data object (allocate: length+addcap)
//as a copy of a given string
- if (str==NULL) return &null_data;
- int len=strlen(str);
+ //if (str==NULL) return &null_data;
+ int len= (str==NULL)? 0 : strlen(str);
if (len+addcap > 0) {
Data* data;
GMALLOC(data, sizeof(Data)+len+addcap);
- strcpy(data->chars, str);
+ if (len) strcpy(data->chars, str);
data->ref_count = 0;
data->cap=len+addcap;
data->length = len;
@@ -158,7 +158,7 @@ GStr::GStr(const char *s, uint addcap): my_data(&null_data) {
my_data->ref_count = 1;
}
-GStr::GStr(const int i): my_data(&null_data) {
+GStr::GStr(const int i, uint addcap): my_data(&null_data) {
fTokenDelimiter=NULL;
fTokenizeMode=tkCharSet;
fLastTokenStart=0;
@@ -166,9 +166,7 @@ GStr::GStr(const int i): my_data(&null_data) {
readbufsize=0;
char buf[20];
sprintf(buf,"%d",i);
- const int len = ::strlen(buf);
- prep_data(len);
- ::memcpy(chrs(), buf, len);
+ my_data=new_data(buf, addcap);
}
GStr::GStr(const double f): my_data(&null_data) {
@@ -184,7 +182,7 @@ GStr::GStr(const double f): my_data(&null_data) {
::memcpy(chrs(), buf, len);
}
-GStr::GStr(char c, int n): my_data(&null_data) {
+GStr::GStr(const char c, int n): my_data(&null_data) {
fTokenDelimiter=NULL;
fTokenizeMode=tkCharSet;
fLastTokenStart=0;
@@ -217,8 +215,10 @@ char GStr::operator[](int idx) const {
}
GStr& GStr::operator=(const GStr& s) {
- make_unique(); //edit operation ahead
- replace_data(s.my_data);
+ if (s.my_data!=my_data) {
+ make_unique(); //edit operation ahead
+ replace_data(s.my_data);
+ }
return *this;
}
@@ -233,6 +233,29 @@ GStr& GStr::operator=(const char *s) {
return *this;
}
+GStr& GStr::assign(const char* s) {
+ make_unique(); //edit operation ahead
+ if (s==NULL) {
+ prep_data(0, my_data->cap);
+ return *this;
+ }
+ const int len = ::strlen(s);
+ prep_data(len, my_data->cap-len-2);
+ ::memcpy(my_data->chars, s, len);
+ return *this;
+}
+
+GStr& GStr::assign(const int v) {
+ make_unique(); //edit operation ahead
+ char buf[20];
+ sprintf(buf,"%d",v);
+ const int len = ::strlen(buf);
+ prep_data(len, my_data->cap-len-2);
+ ::memcpy(my_data->chars, buf, len);
+ return *this;
+}
+
+
GStr& GStr::operator=(const double f) {
make_unique(); //edit operation ahead
char buf[20];
=====================================
GStr.h
=====================================
@@ -30,14 +30,18 @@ class GStr {
public:
GStr();
GStr(const GStr& s);
- GStr(const char* s, uint addcap=4);
- GStr(const int i);
+ //minimize reallocation when suffixes are added
+ GStr(const char* s, uint addcap=8);
+ GStr(const int i, uint addcap=8);
+
GStr(const double f);
- GStr(char c, int n = 1);
+ GStr(const char c, int n = 1);
~GStr();
operator const char* () const { return my_data->chars;} //inline here
char& operator[](int index);
char operator[](int index) const;
+ GStr& assign(const char* s); //never shrinks the allocated space
+ GStr& assign(const int v); //never shrinks the allocated space
GStr& operator=(const GStr& s);
GStr& operator=(const char* s);
GStr& operator=(const int i);
=====================================
GVec.hh
=====================================
@@ -7,6 +7,7 @@ Sortable collection of pointers to objects
#define _GVec_HH
#include "GBase.h"
+#include <algorithm>
#define GVEC_INDEX_ERR "GVec error: invalid index: %d\n"
#if defined(NDEBUG) || defined(NODEBUG) || defined(_NDEBUG) || defined(NO_DEBUG)
@@ -23,27 +24,6 @@ Sortable collection of pointers to objects
#define FREEDATA (fFreeProc!=NULL)
-template<class T> struct IsPrimitiveType {
- enum { VAL = 0 };
-};
-
-template<> struct IsPrimitiveType<bool> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<void*> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<char*> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<float> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<double> { enum { VAL = 1 }; };
-
-template<> struct IsPrimitiveType<int> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<unsigned int> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<char> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<unsigned char> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<short> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<unsigned short> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<long> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<unsigned long> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<long long> { enum { VAL = 1 }; };
-template<> struct IsPrimitiveType<unsigned long long> { enum { VAL = 1 }; };
-
template <class OBJ> int DefLTCompareProc(pointer p1, pointer p2) {
OBJ& o1 = *((OBJ*) p1);
OBJ& o2 = *((OBJ*) p2);
@@ -64,12 +44,15 @@ template <class OBJ> class GVec {
GVec(int init_count, const OBJ init_val);
GVec(int init_count, OBJ* init_val, bool delete_initval=true); //convenience constructor for complex vectors
GVec(const GVec<OBJ>& array); //copy constructor
- const GVec<OBJ>& operator=(const GVec<OBJ>& array); //copy operator
+ GVec(GVec<OBJ>&& array); //move constructor
+ GVec<OBJ>& operator=(const GVec<OBJ>& array); //copy operator
+ GVec<OBJ>& operator=(GVec<OBJ>&& array); //move operator
virtual ~GVec();
void Insert(int idx, OBJ item) { Insert(idx, &item); }
void Insert(int idx, OBJ* item);
void idxInsert(int idx, OBJ& item) { Insert(idx, &item); }
void Grow();
+ void Grow(int newCap);
void Grow(int idx, OBJ& item); //grow and add/insert item copy
void Reverse(); //WARNING: will break the sort order if SORTED!
int Add(OBJ* item); // simply append to the end of fArray, reallocating as needed
@@ -109,7 +92,58 @@ template <class OBJ> class GVec {
void Swap(int idx1, int idx2) { Exchange(idx1, idx2); }
int Capacity() { return fCapacity; }
//this will reject identical items in sorted lists only!
- void setCapacity(int NewCapacity);
+ inline void setCapacity(int NewCapacity) {
+ if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE)
+ GError(GVEC_CAPACITY_ERR, NewCapacity);
+ //error: NewCapacity MUST be > fCount
+ //if you want to shrink it use Resize() or setCount()
+ if (NewCapacity!=fCapacity) {
+ if (NewCapacity==0) {
+ delete[] fArray;
+ fArray=nullptr;
+ }
+ else {
+ OBJ* oldArray=fArray;
+ fArray=new OBJ[NewCapacity];
+ if (oldArray) {
+ std::move(&oldArray[0], &oldArray[fCount], & fArray[0]);
+ delete[] oldArray;
+ }
+ //this relies on in-class initializers, default constructors etc.
+ }
+ fCapacity=NewCapacity;
+ }
+ }
+ /*
+ void insertGrow(int NewCapacity, int idx, OBJ& item) {
+ OBJ* newList=new OBJ[NewCapacity];
+ // operator= required!
+ std::move(& fArray[0], & fArray[idx], & newList[0]);
+ newList[idx]=item;
+ //copy data after idx
+ std::move(& fArray[idx], & fArray[fCount], & newList[idx+1]);
+ delete[] fArray;
+ fArray=newList;
+ fCapacity=NewCapacity;
+ fCount++;
+ }
+ */
+
+ template <typename T=OBJ>
+ typename std::enable_if< std::is_trivial<T>::value, void>::type
+ topOff(int NewCount) {
+ memset(fArray+fCount, 0, (NewCount-fCount)*sizeof(OBJ));
+ }
+
+ template <typename T=OBJ>
+ typename std::enable_if< !std::is_trivial<T>::value, void>::type
+ topOff(int NewCount) {
+ OBJ v;
+ if (NewCount>fCount)
+ for (int i=fCount;i<NewCount;i++)
+ fArray[i]=v;
+ }
+
int Count() { return fCount; }
void setCount(int NewCount); // will trim or expand the array as needed
void setCount(int NewCount, OBJ v);
@@ -128,14 +162,14 @@ template <class OBJ> class GVec {
//---- it's faster than GVec<OBJ*> and has item deallocation awareness
template <class OBJ> class GPVec {
protected:
- OBJ** fList; //pointer to an array of pointers to objects
+ OBJ* *fList; //pointer to an array of pointers to objects
int fCount; //total number of entries in list
int fCapacity; //current allocated size
GFreeProc* fFreeProc; //useful for deleting objects
//---
void Expand();
void Grow();
- void Grow(int idx, OBJ* newitem);
+ void Grow(int idx, OBJ* item);
void qSort(int L, int R, GCompareProc* cmpFunc);
public:
static void DefaultFreeProc(pointer item) {
@@ -144,9 +178,11 @@ template <class OBJ> class GPVec {
virtual ~GPVec();
GPVec(int init_capacity=2, bool free_elements=true); //also the default constructor
GPVec(bool free_elements);
- GPVec(GPVec<OBJ>& list); //copy constructor?
- GPVec(GPVec<OBJ>* list); //kind of a copy constructor
- const GPVec<OBJ>& operator=(const GPVec<OBJ>& list);
+ GPVec(const GPVec<OBJ>& list); //copy constructor
+ GPVec(GPVec<OBJ>&& list); //move construstor
+ GPVec(GPVec<OBJ>* list); //similar to a copy constructor
+ GPVec<OBJ>& operator=(const GPVec<OBJ>& list);
+ GPVec<OBJ>& operator=(GPVec<OBJ>&& list);//move assignment operator
inline OBJ* Get(int i) {
TEST_INDEX(i);
return fList[i];
@@ -171,8 +207,8 @@ template <class OBJ> class GPVec {
void Clear();
void Exchange(int idx1, int idx2);
void Swap(int idx1, int idx2) { Exchange(idx1, idx2); }
- OBJ* First() { return (fCount>0)?fList[0]:NULL; }
- OBJ* Last() { return (fCount>0)?fList[fCount-1]:NULL;}
+ OBJ* First() { return (fCount>0)?fList[0]:nullptr; }
+ OBJ* Last() { return (fCount>0)?fList[fCount-1]:nullptr;}
bool isEmpty() { return fCount==0; }
bool notEmpty() { return fCount>0; }
int Capacity() { return fCapacity; }
@@ -181,6 +217,7 @@ template <class OBJ> class GPVec {
void setCount(int NewCount); //the same as setCapacity() but the new item range is filled with NULLs
int Add(OBJ* item); //simply append the pointer copy
void Add(GPVec<OBJ>& list); //add all pointers from another list
+ void addNew(GPVec<OBJ>& list); //add new OBJ copies of items in another list
void Insert(int idx, OBJ* item);
void Move(int curidx, int newidx);
void Put(int idx, OBJ* item);
@@ -198,7 +235,7 @@ template <class OBJ> class GPVec {
template <class OBJ> GVec<OBJ>::GVec(int init_capacity) {
fCount=0;
fCapacity=0;
- fArray=NULL;
+ fArray=nullptr;
setCapacity(init_capacity);
//if (set_count) fCount = init_capacity;
}
@@ -207,7 +244,7 @@ template <class OBJ> GVec<OBJ>::GVec(int init_capacity) {
template <class OBJ> GVec<OBJ>::GVec(int init_count, const OBJ init_val) {
fCount=0;
fCapacity=0;
- fArray=NULL;
+ fArray=nullptr;
setCapacity(init_count);
fCount = init_count;
for (int i=0;i<fCount;i++)
@@ -217,7 +254,7 @@ template <class OBJ> GVec<OBJ>::GVec(int init_count, const OBJ init_val) {
template <class OBJ> GVec<OBJ>::GVec(int init_count, OBJ* init_val, bool delete_initval) {
fCount=0;
fCapacity=0;
- fArray=NULL;
+ fArray=nullptr;
setCapacity(init_count);
fCount = init_count;
for (int i=0;i<fCount;i++)
@@ -227,98 +264,68 @@ template <class OBJ> GVec<OBJ>::GVec(int init_count, OBJ* init_val, bool delete_
template <class OBJ> GVec<OBJ>::GVec(const GVec<OBJ>& array) { //copy constructor
- this->fCount=array.fCount;
- this->fCapacity=array.fCapacity;
- this->fArray=NULL;
- if (this->fCapacity>0) {
- if (IsPrimitiveType<OBJ>::VAL) {
- GMALLOC(fArray, fCapacity*sizeof(OBJ));
- memcpy(fArray, array.fArray, fCount*sizeof(OBJ));
- }
- else {
- fArray=new OBJ[this->fCapacity]; //]()
- // uses OBJ operator=
- for (int i=0;i<this->fCount;i++) fArray[i]=array.fArray[i];
+ fCount=array.fCount;
+ fCapacity=array.fCapacity;
+ fArray=nullptr;
+ if (fCapacity>0) {
+ fArray=new OBJ[this->fCapacity];
+ std::copy(& array.fArray[0], & array.fArray[fCount], & fArray[0]);
}
}
- this->fCount=array.fCount;
+
+template <class OBJ> GVec<OBJ>::GVec(GVec<OBJ>&& array) { //move constructor
+ fCount=array.fCount;
+ fCapacity=array.fCapacity;
+ fArray=array.fArray;
+ array.fArray=nullptr;
+ array.fCount=0;
+ array.fCapacity=0;
}
-template <class OBJ> const GVec<OBJ>& GVec<OBJ>::operator=(const GVec<OBJ>& array) {
+template <class OBJ> GVec<OBJ>& GVec<OBJ>::operator=(const GVec<OBJ>& array) {
if (&array==this) return *this;
Clear();
fCapacity=array.fCapacity;
fCount=array.fCount;
if (fCapacity>0) {
- if (IsPrimitiveType<OBJ>::VAL) {
- GMALLOC(fArray, fCapacity*sizeof(OBJ));
- memcpy(fArray, array.fArray, fCount*sizeof(OBJ));
- }
- else {
- fArray=new OBJ[this->fCapacity]; // ]()
+ fArray=new OBJ[this->fCapacity];
// uses OBJ operator=
- for (int i=0;i<fCount;i++) {
- fArray[i]=array.fArray[i];
- }
- }
+ std::copy(& array.fArray[0], & array.fArray[fCount], & fArray[0]);
}
return *this;
}
-template <class OBJ> GVec<OBJ>::~GVec() {
- this->Clear();
+template <class OBJ> GVec<OBJ>& GVec<OBJ>::operator=(GVec<OBJ>&& array) {
+ if (&array==this) return *this;
+ Clear();
+ fCapacity=array.fCapacity;
+ fCount=array.fCount;
+ fArray=array.fArray;
+ array.fArray=nullptr;
+ array.fCapacity=0;
+ array.fCount=0;
+ return *this;
}
-
-template <class OBJ> void GVec<OBJ>::setCapacity(int NewCapacity) {
- if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE)
- GError(GVEC_CAPACITY_ERR, NewCapacity);
- //error: NewCapacity MUST be > fCount
- //if you want to shrink it use Resize() or setCount()
- if (NewCapacity!=fCapacity) {
- if (NewCapacity==0) {
- if (IsPrimitiveType<OBJ>::VAL) {
- GFREE(fArray);
- } else {
- delete[] fArray;
- fArray=NULL;
- }
- }
- else {
- if (IsPrimitiveType<OBJ>::VAL) {
- GREALLOC(fArray, NewCapacity*sizeof(OBJ));
- //also zero init the new items?
- memset(fArray+fCount, 0, (NewCapacity-fCount)*sizeof(OBJ));
- } else {
- OBJ* oldArray=fArray;
- //fArray=new OBJ[NewCapacity]();
- fArray=new OBJ[NewCapacity];
- for (int i=0;i<this->fCount;i++) {
- fArray[i] = oldArray[i];
- }// we need operator= here
- //wouldn't be faster to use memcpy instead?
- //memcpy(fArray, oldArray, fCount*sizeof(OBJ));
- if (oldArray) delete[] oldArray;
- }
- }
- fCapacity=NewCapacity;
- }
+template <class OBJ> GVec<OBJ>::~GVec() {
+ this->Clear();
}
template <class OBJ> void GVec<OBJ>::Clear() {
fCount=0;
- if (IsPrimitiveType<OBJ>::VAL) {
- GFREE(fArray);
- }
- else {
- delete[] fArray;
- fArray=NULL;
- }
+ delete[] fArray;
+ fArray=nullptr;
fCapacity=0;
}
template <class OBJ> void GVec<OBJ>::Grow() {
- int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
+ int delta = (fCapacity>8) ? (fCapacity>>2) : 2 ;
+ setCapacity(fCapacity + delta);
+}
+
+template <class OBJ> void GVec<OBJ>::Grow(int newCap) {
+ if (newCap<fCapacity) return; //never shrink with this method
+ int delta=newCap-fCapacity+2;
setCapacity(fCapacity + delta);
}
@@ -339,42 +346,23 @@ template <class OBJ> void GVec<OBJ>::Grow(int idx, OBJ& item) {
if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE)
GError(GVEC_CAPACITY_ERR, NewCapacity);
//error: capacity not within range
- //if (NewCapacity!=fCapacity) {
if (idx==fCount) { //append item
- //GREALLOC(fArray, NewCapacity*sizeof(OBJ));
setCapacity(NewCapacity);
fArray[idx]=item;
+ fCount++;
+ return;
}
- else { //insert item at idx
- OBJ* newList;
- if (IsPrimitiveType<OBJ>::VAL) {
- GMALLOC(newList, NewCapacity*sizeof(OBJ));
- //copy data before idx
- memcpy(&newList[0],&fArray[0], idx*sizeof(OBJ));
- newList[idx]=item;
- //copy data after idx
- memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
- //..shouldn't do this (zero init new unused items in the array)
- memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ));
- //data copied:
- GFREE(fArray);
- } else {
- newList=new OBJ[NewCapacity];
- // operator= required!
- for (int i=0;i<idx;i++) {
- newList[i]=fArray[i];
- }
- newList[idx]=item;
- //copy data after idx
- //memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
- for (int i=idx+1;i<=fCount;i++) {
- newList[i]=fArray[i-1];
- }
- delete[] fArray;
- }
- fArray=newList;
- fCapacity=NewCapacity;
- }
+ //expands and inserts item at idx at the same time
+ //insertGrow(NewCapacity, idx, item);
+ OBJ* newList=new OBJ[NewCapacity];
+ // operator= required!
+ std::move(& fArray[0], & fArray[idx], & newList[0]);
+ newList[idx]=item;
+ //copy data after idx
+ std::move(& fArray[idx], & fArray[fCount], & newList[idx+1]);
+ delete[] fArray;
+ fArray=newList;
+ fCapacity=NewCapacity;
fCount++;
}
template <class OBJ> int GVec<OBJ>::Add(OBJ* item) {
@@ -390,7 +378,7 @@ template <class OBJ> void GVec<OBJ>::Add(GVec<OBJ>& list) {
if (list.Count()==0) return;
//simply copy
setCapacity(fCapacity+list.fCount);
- if (IsPrimitiveType<OBJ>::VAL) {
+ if (std::is_trivial<OBJ>::value) {
memcpy( &fArray[fCount], list.fArray, list.fCount*sizeof(OBJ));
}
else {
@@ -434,20 +422,16 @@ template <class OBJ> void GVec<OBJ>::Insert(int idx, OBJ* item) {
//the old idx item all the above will be shifted to idx+1
if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx);
if (fCount==fCapacity) { //need to resize the array
- Grow(idx, *item); //expand and also copy/move data and insert the new item
+ //expand and also copy/move data and insert the new item
+ //more efficient than copying everything and then moving elements
+ Grow(idx, *item);
return;
- }
+ }
//move data around to make room for the new item
if (idx<fCount) {
//copy after-idx items (shift up)
- if (IsPrimitiveType<OBJ>::VAL) {
- memmove(&fArray[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ));
- }
- else {
- for (int i=fCount; i>idx; i--) {
- fArray[i]=fArray[i-1];
- }
- }
+ //this->shiftUp(idx);
+ std::move_backward(& fArray[idx], & fArray[fCount], & fArray[fCount+1]);
}
fArray[idx]=*item;
fCount++;
@@ -477,10 +461,12 @@ template <class OBJ> void GVec<OBJ>::Exchange(int idx1, int idx2) {
}
-template <class OBJ> void GVec<OBJ>::Delete(int index) {
- TEST_INDEX(index);
+template <class OBJ> void GVec<OBJ>::Delete(int idx) {
+ TEST_INDEX(idx);
+ std::move(& fArray[idx+1], & fArray[fCount], & fArray[idx]);
fCount--;
- if (IsPrimitiveType<OBJ>::VAL) {
+ /*
+ if (std::is_trivial<OBJ>::value) {
if (index<fCount)
//move higher elements if any (shift down)
memmove(&fArray[index], &fArray[index+1], (fCount-index)*sizeof(OBJ));
@@ -491,17 +477,16 @@ template <class OBJ> void GVec<OBJ>::Delete(int index) {
index++;
}
}
+ */
}
template <class OBJ> void GVec<OBJ>::setCount(int NewCount) {
if (NewCount<0 || NewCount > MAXLISTSIZE)
GError(GVEC_COUNT_ERR, NewCount);
//if (NewCount > fCapacity) setCapacity(NewCount);
- while(NewCount > fCapacity) Grow();
- if (IsPrimitiveType<OBJ>::VAL && NewCount>fCount) {
- memset(fArray+fCount, 0, (NewCount-fCount)*sizeof(OBJ));
- }
- fCount = NewCount; //new items will be populated by the default object constructor(!)
+ Grow(NewCount);
+ if (NewCount>fCount) this->topOff(NewCount);
+ fCount = NewCount; //new items should be populated by the default object constructor(!)
}
/*
template <class OBJ> void GVec<OBJ>::setCount(int NewCount, OBJ* v) {
@@ -518,7 +503,7 @@ template <class OBJ> void GVec<OBJ>::setCount(int NewCount, OBJ* v) {
template <class OBJ> void GVec<OBJ>::setCount(int NewCount, OBJ v) {
if (NewCount<0 || NewCount > MAXLISTSIZE)
GError(GVEC_COUNT_ERR, NewCount);
- while (NewCount > fCapacity) Grow();
+ Grow(NewCount);
if (NewCount>fCount) {
for (int i=fCount;i<NewCount;i++)
fArray[i]=v;
@@ -531,7 +516,7 @@ template <class OBJ> void GVec<OBJ>::qSort(int l, int r, GCompareProc* cmpFunc)
OBJ p,t;
do {
i = l; j = r;
- p = this->fArray[(l + r) >> 1];
+ p = this->fArray[l+ ((r - l) >> 1)];
do {
while (cmpFunc(&(this->fArray[i]), &p) < 0) i++;
while (cmpFunc(&(this->fArray[j]), &p) > 0) j--;
@@ -552,7 +537,7 @@ template <class OBJ> void GVec<OBJ>::Sort(GCompareProc* cmpFunc) {
GMessage("Warning: NULL compare function given, useless Sort() call.\n");
return;
}
- if (this->fArray!=NULL && this->fCount>0)
+ if (this->fArray!=nullptr && this->fCount>0)
qSort(0, this->fCount-1, cmpFunc);
}
@@ -565,31 +550,45 @@ template <class OBJ> void GVec<OBJ>::Sort() {
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//*=> GPVec implementation
-template <class OBJ> GPVec<OBJ>::GPVec(GPVec& list) { //copy constructor
+template <class OBJ> GPVec<OBJ>::GPVec(const GPVec& list) { //copy constructor
fCount=list.fCount;
fCapacity=list.fCapacity;
- fList=NULL;
+ fList=nullptr;
fFreeProc=list.fFreeProc;
fCount=list.fCount;
if (fCapacity>0) {
- GMALLOC(fList, fCapacity*sizeof(OBJ*));
- memcpy(fList, list.fList, fCount*sizeof(OBJ*));
+ //GMALLOC(fList, fCapacity*sizeof(OBJ*));
+ fList=new OBJ*[fCapacity];
+ std::move(&list.fList[0], &list.fList[fCount], &fList[0]);
+ //memcpy(fList, list.fList, fCount*sizeof(OBJ*));
}
}
+template <class OBJ> GPVec<OBJ>::GPVec(GPVec&& list) { //copy constructor
+ fCount=list.fCount;
+ fCapacity=list.fCapacity;
+ fList=list.fList;
+ fFreeProc=list.fFreeProc;
+ list.fCount=0;
+ list.fCapacity=0;
+ list.fList=nullptr;
+}
+
template <class OBJ> GPVec<OBJ>::GPVec(GPVec* plist) { //another copy constructor
fCount=0;
fCapacity=plist->fCapacity;
- fList=NULL;
+ fList=nullptr;
fFreeProc=plist->fFreeProc;
fCount=plist->fCount;
if (fCapacity>0) {
- GMALLOC(fList, fCapacity*sizeof(OBJ*));
- memcpy(fList, plist->fList, fCount*sizeof(OBJ*));
+ //GMALLOC(fList, fCapacity*sizeof(OBJ*));
+ fList=new OBJ*[fCapacity];
+ std::move(& (plist->fList[0]), & (plist->fList[fCount]), &fList[0]);
+ //memcpy(fList, plist->fList, fCount*sizeof(OBJ*));
}
}
-template <class OBJ> const GPVec<OBJ>& GPVec<OBJ>::operator=(const GPVec& list) {
+template <class OBJ> GPVec<OBJ>& GPVec<OBJ>::operator=(const GPVec& list) {
if (&list!=this) {
Clear();
fFreeProc=list.fFreeProc;
@@ -598,13 +597,31 @@ template <class OBJ> const GPVec<OBJ>& GPVec<OBJ>::operator=(const GPVec& list)
fCount=list.fCount;
fCapacity=list.fCapacity;
if (fCapacity>0) {
- GMALLOC(fList, fCapacity*sizeof(OBJ*));
- memcpy(fList, list.fList, fCount*sizeof(OBJ*));
+ //GMALLOC(fList, fCapacity*sizeof(OBJ*));
+ //memcpy(fList, list.fList, fCount*sizeof(OBJ*));
+ fList=new OBJ*[fCapacity];
+ std::move(&list.fList[0], &list.fList[fCount], &fList[0]);
}
}
return *this;
}
+template <class OBJ> GPVec<OBJ>& GPVec<OBJ>::operator=(GPVec&& list) {
+ if (&list!=this) {
+ Clear();
+ fFreeProc=list.fFreeProc;
+ //Attention: only the *POINTERS* are copied,
+ // the actual objects are NOT duplicated
+ fCount=list.fCount;
+ fCapacity=list.fCapacity;
+ fList=list.fList;
+ list.fList=nullptr;
+ list.fCapacity=0;
+ list.fCount=0;
+ }
+ return *this;
+}
+
template <class OBJ> void GPVec<OBJ>::Add(GPVec<OBJ>& list) {
if (list.Count()==0) return;
@@ -614,6 +631,17 @@ template <class OBJ> void GPVec<OBJ>::Add(GPVec<OBJ>& list) {
fCount+=list.fCount;
}
+template <class OBJ> void GPVec<OBJ>::addNew(GPVec<OBJ>& list) {
+ if (list.Count()==0) return;
+ //requires OBJ copy constructor
+ setCapacity(fCapacity+list.fCount);
+ //memcpy( & (fList[fCount]), list.fList, list.fCount*sizeof(OBJ*));
+ for (int i=fCount;i<list.fCount;i++)
+ fList[i]=new OBJ(*list.fList[i-fCount]);
+ fCount+=list.fCount;
+}
+
+
template <class OBJ> void GPVec<OBJ>::Reverse() {
int l=0;
int r=fCount-1;
@@ -628,7 +656,7 @@ template <class OBJ> void GPVec<OBJ>::Reverse() {
template <class OBJ> GPVec<OBJ>::GPVec(int init_capacity, bool free_elements) {
fCount=0;
fCapacity=0;
- fList=NULL;
+ fList=nullptr;
fFreeProc=(free_elements) ? DefaultFreeProc : NULL;
if (init_capacity>0)
setCapacity(init_capacity);
@@ -637,7 +665,7 @@ template <class OBJ> GPVec<OBJ>::GPVec(int init_capacity, bool free_elements) {
template <class OBJ> GPVec<OBJ>::GPVec(bool free_elements) {
fCount=0;
fCapacity=0;
- fList=NULL;
+ fList=nullptr;
fFreeProc=(free_elements) ? DefaultFreeProc : NULL;
}
@@ -651,11 +679,19 @@ template <class OBJ> void GPVec<OBJ>::setCapacity(int NewCapacity) {
//error: capacity not within range
if (NewCapacity!=fCapacity) {
if (NewCapacity==0) {
- GFREE(fList);
+ //GFREE(fList);
+ delete[] fList;
+ fList=nullptr;
}
else {
- GREALLOC(fList, NewCapacity*sizeof(OBJ*));
- }
+ //GREALLOC(fList, NewCapacity*sizeof(OBJ*));
+ OBJ** oldList=fList;
+ fList=new OBJ*[NewCapacity];
+ if (oldList) {
+ std::move(&oldList[0], &oldList[fCount], & fList[0]);
+ delete[] oldList;
+ }
+ }
fCapacity=NewCapacity;
}
}
@@ -678,7 +714,9 @@ template <class OBJ> void GPVec<OBJ>::Clear() {
(*fFreeProc)(fList[i]);
}
}
- GFREE(fList);
+ //GFREE(fList);
+ delete[] fList;
+ fList=nullptr;
fCount=0;
fCapacity=0;
}
@@ -697,60 +735,46 @@ template <class OBJ> void GPVec<OBJ>::Expand() {
}
template <class OBJ> void GPVec<OBJ>::Grow() {
- /*
- int delta;
- if (fCapacity > 64 ) {
- delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4);
- }
- else {
- delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
- }
- */
- int delta = (fCapacity>8) ? (fCapacity>>2) : 1;
+ int delta = (fCapacity>8) ? (fCapacity>>2) : 1;
setCapacity(fCapacity + delta);
}
-template <class OBJ> void GPVec<OBJ>::Grow(int idx, OBJ* newitem) {
- /*
- int delta;
- if (fCapacity > 64 ) {
- delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4);
- }
- else {
- delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
- }
- */
+template <class OBJ> void GPVec<OBJ>::Grow(int idx, OBJ* item) {
int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ;
int NewCapacity=fCapacity+delta;
if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE)
GError(GVEC_CAPACITY_ERR, NewCapacity);
- //error: capacity not within range
- //if (NewCapacity!=fCapacity) {
- /*if (NewCapacity==0) {
- GFREE(fList);
- }
- else {//add the new item
- */
if (idx==fCount) {
- GREALLOC(fList, NewCapacity*sizeof(OBJ*));
- fList[idx]=newitem;
- }
- else {
- OBJ** newList;
- GMALLOC(newList, NewCapacity*sizeof(OBJ*));
- //copy data before idx
- memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*));
- newList[idx]=newitem;
- //copy data after idx
- memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*));
- memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*));
- //data copied:
- GFREE(fList);
- fList=newList;
- }
- fCount++;
+ //GREALLOC(fList, NewCapacity*sizeof(OBJ*));
+ setCapacity(NewCapacity);
+ fList[idx]=item;
+ fCount++;
+ return;
+ }
+ OBJ** newList=new OBJ*[NewCapacity];
+ std::move(& fList[0], & fList[idx], & newList[0]);
+ newList[idx]=item;
+ //copy data after idx
+ std::move(& fList[idx], & fList[fCount], & newList[idx+1]);
+ delete[] fList;
+ fList=newList;
fCapacity=NewCapacity;
+ fCount++;
}
+ /*
+ GMALLOC(newList, NewCapacity*sizeof(OBJ*));
+ //copy data before idx
+ memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*));
+ newList[idx]=newitem;
+ //copy data after idx
+ memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*));
+ memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*));
+ //data copied:
+ GFREE(fList);
+ fList=newList;
+ fCount++;
+ fCapacity=NewCapacity;
+ */
template <class OBJ> int GPVec<OBJ>::IndexOf(pointer item) {
for (int i=0;i<fCount;i++) {
@@ -867,7 +891,7 @@ template <class OBJ> void GPVec<OBJ>::setCount(int NewCount) {
if (NewCount<0 || NewCount > MAXLISTSIZE)
GError(GVEC_COUNT_ERR, NewCount);
if (NewCount > fCapacity) setCapacity(NewCount);
- if (NewCount > fCount) //pad with NULL pointers
+ if (NewCount > fCount) //pad with NULL pointers!
memset(& fList[fCount], 0, (NewCount - fCount) * sizeof(OBJ*));
fCount = NewCount;
}
@@ -879,7 +903,7 @@ template <class OBJ> void GPVec<OBJ>::qSort(int L, int R, GCompareProc* cmpFunc)
do {
I = L;
J = R;
- P = this->fList[(L + R) >> 1];
+ P = this->fList[L + ((R-L)>>1)];
do {
while (cmpFunc(this->fList[I], P) < 0) I++;
while (cmpFunc(this->fList[J], P) > 0) J--;
=====================================
gff.cpp
=====================================
@@ -31,23 +31,27 @@ void gffnames_unref(GffNames* &n) {
if (n->numrefs==0) { delete n; n=NULL; }
}
-const int CLASSCODE_OVL_RANK = 15;
-int classcode_rank(char c) {
+const byte CLASSCODE_OVL_RANK = 14; //rank value just above 'o' class code
+//rank value < this means exon overlap
+
+const byte CLASSCODE_J_RANK = 6; // all junctional based overlaps
+
+byte classcode_rank(char c) {
switch (c) {
case '=': return 0; //intron chain match or full exon chain match if strict matching is enabled
case '~': return 1; //intron chain match when strict matching is enabled
- case 'c': return 2; //containment, perfect partial match (transfrag < reference)
- case 'k': return 6; // reverse containment (reference < transfrag)
+ case 'c': return 4; //containment, perfect partial match (transfrag contained in reference)
+ case 'k': return 4; // reverse containment (reference contained in transfrag)
case 'm': return 6; // full span overlap with all reference introns either matching or retained
case 'n': return 6; // partial overlap transfrag with at least one intron retention
- case 'j': return 6; // multi-exon transfrag with at least one junction match
+ case 'j': return 6; // multi-exon transfrag overlap with at least one junction match OR intron overlap!
case 'e': return 12; // single exon transfrag partially overlapping an intron of reference (possible pre-mRNA fragment)
- case 'o': return 14; // other generic exon overlap
- //**** >15 = no-overlaps (not on the same strand) from here on *****
+ case 'o': return 12; // other generic exon overlap
+ //**** >14 => no exon overlaps (not on the same strand) from here on *****
case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand (wrong strand mapping?)
case 'x': return 18; // generic overlap on opposite strand (usually wrong strand mapping)
case 'i': return 20; // intra-intron (transfrag fully contained within a reference intron)
- case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns!
+ case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns! (reverse of i)
case 'p': return 90; //polymerase run
case 'r': return 92; //repeats
case 'u': return 94; //intergenic
@@ -364,6 +368,32 @@ bool GffLine::parseSegmentList(GVec<GSeg>& segs, char* str) {
return segs_valid;
}
+void GffLine::ensembl_GFF_ID_process(char*& id) {
+ char* n=NULL;
+ if (startsWith(id, "gene:")) {
+ n=Gstrdup(id+5);
+ GFREE(id);
+ id=n;
+ }
+ else if (startsWith(id, "transcript:")) {
+ n=Gstrdup(id+11);
+ GFREE(id);
+ id=n;
+ }
+}
+
+void GffLine::ensembl_GTF_ID_process(char*& id, const char* ver_attr) {
+ char* v=NULL;
+ v=extractAttr(ver_attr);
+ if (v!=NULL) {
+ char* n=Gstrdup(id, strlen(v)+1);
+ strcat(n,".");strcat(n,v);
+ GFREE(v);
+ GFREE(id);
+ id=n;
+ }
+}
+
GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len(0),
dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0),
@@ -385,13 +415,16 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
line[i]=0;
t[tidx]=line+i+1;
tidx++;
- if (tidx>8) break;
+ //if (tidx>8) break;
}
i++;
}
if (tidx<8) { // ignore non-GFF lines
return;
}
+ if (tidx>9) {
+ GMessage("Warning: unexpected tab character in last column, line truncated:\n\%s\n",l);
+ }
gffWarnings=reader->gff_warns;
gseqname=t[0];
track=t[1];
@@ -519,14 +552,29 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
char *gtf_gid=NULL;
if (reader->is_gff3 || reader->gff_type==0) {
ID=extractAttr("ID=",true);
+ if (ID!=NULL && reader->procEnsemblID()) {
+ ensembl_GFF_ID_process(ID);
+ }
Parent=extractAttr("Parent=",true);
+ if (Parent!=NULL && reader->procEnsemblID()) {
+ ensembl_GFF_ID_process(Parent);
+ }
+
if (reader->gff_type==0) {
if (ID!=NULL || Parent!=NULL) reader->is_gff3=true;
else { //check if it looks like a GTF
gtf_tid=extractAttr("transcript_id", true, true);
- if (gtf_tid==NULL) {
+ if (gtf_tid!=NULL) {
+ if (reader->procEnsemblID())
+ ensembl_GTF_ID_process(gtf_tid, "transcript_version");
+ }
+ else { //NULL gtf_tid, try gene_id
gtf_gid=extractAttr("gene_id", true, true);
- if (gtf_gid==NULL) return; //cannot determine file type yet
+ if (gtf_gid!=NULL) {
+ if (reader->procEnsemblID())
+ ensembl_GTF_ID_process(gtf_gid, "gene_version");
+ }
+ else return; //cannot determine file type yet
}
reader->is_gtf=true;
}
@@ -538,7 +586,6 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
//if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features
if (ID!=NULL) {
//has ID attr so it's likely to be a parent feature
-
//look for explicit gene name
gene_name=getAttrValue("gene_name=");
if (gene_name==NULL) {
@@ -554,18 +601,6 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
if (gene_id==NULL) {
gene_id=getAttrValue("gene_id=");
}
- /*
- if (is_gene) { //--WARNING: this might be mislabeled (e.g. TAIR: "mRNA_TE_gene")
- //---special case: keep the Name and ID attributes of the gene feature
- //if (gene_name==NULL)
- // gene_name=extractAttr("Name=");
- if (gene_id==NULL) //the ID is also gene_id in this case
- gene_id=Gstrdup(ID);
- //skip=false;
- //return;
- //-- we don't care about gene parents.. unless it's a mislabeled "gene" feature
- } //gene feature (probably)
- */
//--parse exons for TLF
char* segstr=extractAttr("exons=");
bool exons_valid=false;
@@ -649,48 +684,13 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
}
} //has Parent field
//special case for gene_id: for genes, this is the ID
- if (is_gene && gene_id==NULL && ID!=NULL) {
- gene_id=Gstrdup(ID);
- }
- //parse other potentially useful GFF3 attributes
- /*
- if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
- p+=7;
- while (*p!=';' && *p!=0 && *p!=' ') p++;
- if (*p!=' ') {
- GError("Error parsing target coordinates from GFF line:\n%s\n",l);
- }
- if (!parseUInt(p,qstart))
- GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
- if (*p!=' ') {
- GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
- }
- p++;
- if (!parseUInt(p,qend))
- GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
- }
- if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
- p+=5;
- if (!parseUInt(p,qstart))
- GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
- if (*p!='-') {
- GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
- }
- p++;
- if (!parseUInt(p,qend))
- GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
- if (*p=='|' || *p==':') {
- p++;
- if (!parseUInt(p,qlen))
- GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
- }
- }//has Qreg attr
- if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
- p+=5;
- if (!parseUInt(p,qlen))
- GError("Error parsing target length from GFF Qlen:\n%s\n",l);
+ if (gene_id==NULL) {
+ if (is_gene) {
+ if (ID!=NULL) gene_id=Gstrdup(ID);
+ } else if (is_transcript) {
+ if (Parent!=NULL) gene_id=Gstrdup(Parent);
+ }
}
- */
} //GFF3
else { // ----------------- GTF syntax ------------------
if (reader->transcripts_Only && !is_t_data) {
@@ -698,8 +698,16 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
}
if (is_gene) {
reader->gtf_gene=true;
- ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack this
+ ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
+ //Ensemble GTF might lack transcript_id !
+ if (ID!=NULL) {
+ if (gtf_tid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(ID, "transcript_version");
+ }
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
+ if (gene_id!=NULL && gtf_gid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(gene_id, "gene_version");
+
if (ID==NULL) {
//no transcript_id -- this should not be valid GTF2 format, but Ensembl (Gencode?)
//has being known to add "gene" features with only gene_id in their GTF
@@ -716,15 +724,23 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len
//something is wrong here, cannot parse the GTF ID
GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
return;
- }
+ } else if (gtf_tid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(ID, "transcript_version");
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
- if (gene_id!=NULL)
+ if (gene_id!=NULL) {
+ if (gtf_gid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(gene_id, "gene_version");
Parent=Gstrdup(gene_id);
+ }
reader->gtf_transcript=true;
is_gtf_transcript=1;
- } else { //must be an exon type
+ } else { //must be an exon type ?
Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
+ if (Parent!=NULL && gtf_tid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(Parent, "transcript_version");
gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID
+ if (gene_id!=NULL && gtf_gid==NULL && reader->procEnsemblID())
+ ensembl_GTF_ID_process(gene_id, "gene_version");
//old pre-GTF2 formats like Jigsaw's (legacy support)
if (Parent==NULL && exontype==exgffExon) {
if (startsWith(track,"jigsaw")) {
@@ -1267,7 +1283,8 @@ GffObj::GffObj(GffReader &gfrd, GffLine& gffline):
gene_name=Gstrdup(gffline.gene_name);
}
if (gffline.gene_id) { //only for gene features or GTF2 gene_id attribute
- geneID=Gstrdup(gffline.gene_id);
+ if (!(this->isGene() && strcmp(gffID, gffline.gene_id)==0))
+ geneID=Gstrdup(gffline.gene_id);
}
/*//we cannot assume parents[0] is a gene! for NCBI miRNA, parent can be a primary_transcript feature!
else if (gffline.is_transcript && gffline.parents!=NULL) {
@@ -1478,7 +1495,6 @@ GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline) {
return prevgfo;
}
-
bool GffReader::readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon*>* pex) {
//this should only be called before prevgfo->finalize()!
bool r=true;
@@ -2330,8 +2346,11 @@ void BED_addAttribute(FILE* fout, int& acc, const char* format,... ) {
va_end(arguments);
}
-void GffObj::printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len) {
+void GffObj::printBED(FILE* fout, bool cvtChars) {
//print a BED-12 line + GFF3 attributes in 13th field
+ const int DBUF_LEN=1024; //there should not be attribute values longer than 1K!
+ char dbuf[DBUF_LEN];
+
int cd_start=CDstart>0? CDstart-1 : start-1;
int cd_end=CDend>0 ? CDend : end;
char cdphase=(CDphase>0) ? CDphase : '0';
@@ -2365,7 +2384,7 @@ void GffObj::printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len) {
continue;
}
if (cvtChars) {
- decodeHexChars(dbuf, attrval, dbuf_len-1);
+ decodeHexChars(dbuf, attrval, DBUF_LEN-1);
BED_addAttribute(fout, numattrs, "%s=%s", attrname, dbuf);
}
else
@@ -2468,7 +2487,20 @@ void GffObj::setRefName(const char* newname) {
this->gseq_id=rid;
}
-
+int GffObj::removeAttrs(GStrSet<>& attrSet) {
+ //remove attributes NOT found in given set of attribute names
+ if (this->attrs==NULL || attrSet.Count()==0) return 0;
+ int delcount=0;
+ for (int i=0;i<this->attrs->Count();i++) {
+ int aid=this->attrs->Get(i)->attr_id;
+ if (!attrSet.hasKey(this->names->attrs.Get(aid)->name)) {
+ delcount++;
+ this->attrs->freeItem(i);
+ }
+ }
+ if (delcount>0) this->attrs->Pack();
+ return delcount;
+}
int GffObj::removeAttr(const char* attrname, const char* attrval) {
if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
@@ -2801,10 +2833,9 @@ void GffObj::printGTab(FILE* fout, char** extraAttrs) {
}
void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
- GffExon* exon, bool gff3, bool cvtChars,
- char* dbuf, int dbuf_len) {
- //strcpy(dbuf,".");
- //if (exon->score>0) sprintf(dbuf,"%.2f", exon->score);
+ GffExon* exon, bool gff3, bool cvtChars) {
+ const int DBUF_LEN=1024; //there should not be attribute values longer than 1K!
+ char dbuf[DBUF_LEN];
exon->score.sprint(dbuf);
if (exon->phase==0 || !iscds) exon->phase='.';
const char* ftype=iscds ? "CDS" : getSubfName();
@@ -2820,7 +2851,7 @@ void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname,
if (exon->attrs->Get(i)->cds!=iscds) continue;
attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id);
if (cvtChars) {
- decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
+ decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, DBUF_LEN-1);
fprintf(fout,";%s=%s", attrname, dbuf);
} else {
fprintf(fout,";%s=%s", attrname, exon->attrs->Get(i)->attr_val);
@@ -2865,7 +2896,7 @@ void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname,
}
fprintf(fout, " %s ",attrname);
if (cvtChars) {
- decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
+ decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, DBUF_LEN-1);
attrval=dbuf;
} else {
attrval=exon->attrs->Get(i)->attr_val;
@@ -2875,94 +2906,108 @@ void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname,
else fprintf(fout, "\"%s\";",attrval);
}
}
- //for GTF, also append the GffObj attributes to each exon line
- // - do not do this when the transcript line is also printed!
- /*
- if (attrs!=NULL) {
- for (int i=0;i<attrs->Count();i++) {
- if (attrs->Get(i)->attr_val==NULL) continue;
- attrname=names->attrs.getName(attrs->Get(i)->attr_id);
- fprintf(fout, " %s ",attrname);
- if (cvtChars) {
- decodeHexChars(dbuf, attrs->Get(i)->attr_val, dbuf_len-1);
- attrval=dbuf;
- } else {
- attrval=attrs->Get(i)->attr_val;
- }
- if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
- else fprintf(fout, "\"%s\";",attrval);
- }
- }
- */
fprintf(fout, "\n");
}//GTF
}
-void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
- const char* tlabel, const char* gfparent, bool cvtChars) {
+bool GffObj::printAttrs(FILE* fout, const char* sep, bool GTFstyle, bool cvtChars, bool sepFirst) {
+ //* this prints sep FIRST and then the list of attributes separated by sep
+ //* does NOT print sep and newline at the end!
+ // returns false if no attribute was printed at all
const int DBUF_LEN=1024; //there should not be attribute values longer than 1K!
char dbuf[DBUF_LEN];
+ const char* prsep=sepFirst ? sep : "";
+ //assumes ID or transcript_ID was already printed (without ending)
+ bool pr=false;
+ if (GTFstyle) {
+ //for GTF also print gene_id here! (and gene_name)
+ char* gid=NULL;
+ if (geneID!=NULL) {
+ gid=geneID;
+ }
+ else {
+ gid=getAttr("gene_id");
+ if (gid==NULL)
+ gid=gffID; //last resort, write gid the same with gffID
+ }
+ if (gid!=NULL) { fprintf(fout, "%sgene_id \"%s\"",prsep, gid); prsep=sep; pr=true; }
+ if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
+ { fprintf(fout, "%sgene_name \"%s\"",prsep, gene_name); prsep=sep; pr=true; }
+ if (attrs!=NULL) {
+ bool trId=false;
+ //bool gId=false;
+ for (int i=0;i<attrs->Count();i++) {
+ const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
+ const char* attrval=attrs->Get(i)->attr_val;
+ if (attrval==NULL || attrval[0]=='\0') continue;
+ if (strcmp(attrname, "transcriptID")==0) {
+ if (trId) continue;
+ trId=true;
+ }
+ if (strcmp(attrname, "transcript_id")==0 && !trId) {
+ attrname="transcriptID";
+ trId=true;
+ }
+ if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL &&
+ strcmp(attrval, gid)==0) continue;
+ if (strcmp(attrname, "gene_id")==0) continue;
+ if (cvtChars) {
+ decodeHexChars(dbuf, attrval, DBUF_LEN-1);
+ fprintf(fout,"%s%s \"%s\"", prsep, attrname, dbuf);
+ }
+ else
+ fprintf(fout,"%s%s \"%s\"", prsep, attrname, attrs->Get(i)->attr_val);
+ prsep=sep;
+ pr=true;
+ }
+ }
+ } else { //for GFF/BED/TLF etc, geneID and gene_name should be printed already
+ //IF stored only separately and NOT as attributes
+ //the initial sep is NOT printed!
+ if (attrs!=NULL) {
+ for (int i=0;i<attrs->Count();i++) {
+ const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
+ const char* attrval=attrs->Get(i)->attr_val;
+ if (attrval==NULL || attrval[0]=='\0') continue;
+ //fprintf(fout,";%s",attrname);
+ if (cvtChars) {
+ decodeHexChars(dbuf, attrval, DBUF_LEN-1);
+ fprintf(fout,"%s%s=%s", prsep, attrname, dbuf);
+ }
+ else
+ fprintf(fout,"%s%s=%s", prsep, attrname, attrs->Get(i)->attr_val);
+ prsep=sep;
+ pr=true;
+ }
+ }
+ }
+ return pr;
+}
+
+void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
+ const char* tlabel, const char* gfparent, bool cvtChars) {
+ char dbuf[10];
if (tlabel==NULL) {
tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
(char*)"gffobj" ;
}
if (gffp==pgffBED) {
- printBED(fout, cvtChars, dbuf, DBUF_LEN);
+ printBED(fout, cvtChars);
return;
}
const char* gseqname=names->gseqs.Get(gseq_id)->name;
bool gff3 = (gffp>=pgffAny && gffp<=pgffTLF);
bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgtfBoth || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
bool showExon = (gffp<=pgtfExon || gffp==pgtfBoth || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
- //if (gscore>0.0) sprintf(dbuf,"%.2f", gscore);
- // else strcpy(dbuf,".");
gscore.sprint(dbuf);
if (gffp<=pgtfBoth && gffp>=pgtfAny) { //GTF output
fprintf(fout,
"%s\t%s\ttranscript\t%d\t%d\t%s\t%c\t.\ttranscript_id \"%s\"",
gseqname, tlabel, start, end, dbuf, strand, gffID);
- char* gid=NULL;
- if (geneID!=NULL) {
- gid=geneID;
- }
- else {
- gid=getAttr("gene_id");
- if (gid==NULL)
- gid=gffID; //last resort, write gid the same with gffID
- }
- if (gid!=NULL) fprintf(fout, "; gene_id \"%s\"",gid);
- if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
- fprintf(fout, "; gene_name \"%s\"",gene_name);
- if (attrs!=NULL) {
- bool trId=false;
- //bool gId=false;
- for (int i=0;i<attrs->Count();i++) {
- const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
- const char* attrval=attrs->Get(i)->attr_val;
- if (attrval==NULL || attrval[0]=='\0') continue;
- if (strcmp(attrname, "transcriptID")==0) {
- if (trId) continue;
- trId=true;
- }
- if (strcmp(attrname, "transcript_id")==0 && !trId) {
- attrname="transcriptID";
- trId=true;
- }
- if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL &&
- strcmp(attrval, gid)==0) continue;
- if (strcmp(attrname, "gene_id")==0) continue;
- if (cvtChars) {
- decodeHexChars(dbuf, attrval, DBUF_LEN-1);
- fprintf(fout,"; %s \"%s\"", attrname, dbuf);
- }
- else
- fprintf(fout,"; %s \"%s\"", attrname, attrs->Get(i)->attr_val);
- }
- }
- fprintf(fout,";\n");
+ printAttrs(fout, "; ", true, cvtChars);
+ fprintf(fout,"\n");
}
- else if (gff3) {
- //print GFF3 transcript line:
+ else if (gff3) { //print GFF3 transcript line:
uint pstart, pend;
if (gffp==pgffCDS) {
pstart=CDstart;
@@ -2976,7 +3021,7 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID);
bool parentPrint=false;
if (gfparent!=NULL && gffp!=pgffTLF) {
- //parent override - also prevents printing gene_name and gene_id
+ //parent override - also prevents printing gene_name and gene_id unless they were given as transcript attributes
fprintf(fout, ";Parent=%s",gfparent);
parentPrint=true;
}
@@ -3009,20 +3054,7 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
fprintf(fout, ";geneID=%s",geneID);
if (gene_name!=NULL && !parentPrint && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
fprintf(fout, ";gene_name=%s",gene_name);
- if (attrs!=NULL) {
- for (int i=0;i<attrs->Count();i++) {
- const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
- const char* attrval=attrs->Get(i)->attr_val;
- if (attrval==NULL || attrval[0]=='\0') continue;
- //fprintf(fout,";%s",attrname);
- if (cvtChars) {
- decodeHexChars(dbuf, attrval, DBUF_LEN-1);
- fprintf(fout,";%s=%s", attrname, dbuf);
- }
- else
- fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val);
- }
- }
+ printAttrs(fout, ";", false, cvtChars);
fprintf(fout,"\n");
}// gff3 transcript line
if (gffp==pgffTLF) return;
@@ -3030,14 +3062,14 @@ void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
if (showExon) {
//print exons
for (int i=0;i<exons.Count();i++) {
- printGxfExon(fout, tlabel, gseqname, is_cds_only, exons[i], gff3, cvtChars, dbuf, DBUF_LEN);
+ printGxfExon(fout, tlabel, gseqname, is_cds_only, exons[i], gff3, cvtChars);
}
}//printing exons
if (showCDS && !is_cds_only && CDstart>0) {
GVec<GffExon> cds;
getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment
for (int i=0;i<cds.Count();i++) {
- printGxfExon(fout, tlabel, gseqname, true, &(cds[i]), gff3, cvtChars, dbuf, DBUF_LEN);
+ printGxfExon(fout, tlabel, gseqname, true, &(cds[i]), gff3, cvtChars);
}
} //printing CDSs
}
@@ -3181,59 +3213,62 @@ char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen, int trange) {
return 0;
}
-//formerly in gffcompare
-char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange) {
- ovlen=0; //total actual exonic overlap
- if (!m.overlap(r.start,r.end)) return 0;
+TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch, int trange) {
+ TOvlData odta;
+ if (!m.overlap(r.start,r.end)) return odta;
int jmax=r.exons.Count()-1;
- char rcode=0;
+ //char rcode=0;
if (m.exons.Count()==1) { //single-exon transfrag
GSeg mseg(m.start, m.end);
if (jmax==0) { //also single-exon ref
- //ovlen=mseg.overlapLen(r.start,r.end);
char eqcode=0;
- if ((eqcode=singleExonTMatch(m, r, ovlen, trange))>0) {
- if (stricterMatch) return eqcode;
- else return '=';
+ if ((eqcode=singleExonTMatch(m, r, odta.ovlen, trange))>0) {
+ odta.ovlcode=(stricterMatch) ? eqcode : '=';
+ return odta;
}
if (m.covlen<r.covlen)
- { if (ovlen >= m.covlen*0.8) return 'c'; } // fuzzy containment
+ { if (odta.ovlen >= m.covlen*0.8) { odta.ovlcode='c'; return odta; } } // fuzzy containment
else
- if (ovlen >= r.covlen*0.8 ) return 'k'; // fuzzy reverse containment
- return 'o'; //just plain overlapping
+ if (odta.ovlen >= r.covlen*0.8 ) { odta.ovlcode='k'; return odta; } // fuzzy reverse containment
+ odta.ovlcode='o';
+ return odta;
+ //just plain overlapping
}
//-- single-exon qry overlaping multi-exon ref
//check full pre-mRNA case (all introns retained): code 'm'
if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)
- return 'm';
+ { odta.ovlcode='m'; return odta; }
for (int j=0;j<=jmax;j++) {
//check if it's ~contained by an exon
int exovlen=mseg.overlapLen(r.exons[j]);
if (exovlen>0) {
- ovlen+=exovlen;
+ odta.ovlen+=exovlen;
if (m.start>r.exons[j]->start-4 && m.end<r.exons[j]->end+4) {
- return 'c'; //close enough to be considered contained in this exon
+ odta.ovlcode='c';
+ return odta; //close enough to be considered contained in this exon
}
}
if (j==jmax) break; //last exon here, no intron to check
//check if it fully covers an intron (retained intron)
if (m.start<r.exons[j]->end && m.end>r.exons[j+1]->start)
- return 'n';
+ { odta.ovlcode='n'; return odta; }
//check if it's fully contained by an intron
if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
- return 'i';
+ { odta.ovlcode='i'; return odta; }
// check if it's a potential pre-mRNA transcript
// (if overlaps this intron at least 10 bases)
uint introvl=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
//iovlen+=introvl;
- if (introvl>=10 && mseg.len()>introvl+10) { rcode='e'; }
+ if (introvl>=10 && mseg.len()>introvl+10) { odta.ovlcode='e'; }
} //for each ref exon
- if (rcode>0) return rcode;
- return 'o'; //plain overlap, uncategorized
+ if (odta.ovlcode>0) return odta;
+ odta.ovlcode='o'; //plain overlap, uncategorized
+ return odta;
} //single-exon transfrag
//-- multi-exon transfrag --
int imax=m.exons.Count()-1;// imax>0 here
+ odta.jbits.resize(imax << 1); //num_junctions = 2 * num_introns
if (jmax==0) { //single-exon reference overlap
//any exon overlap?
GSeg rseg(r.start, r.end);
@@ -3241,35 +3276,52 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
//check if it's ~contained by an exon
int exovlen=rseg.overlapLen(m.exons[i]);
if (exovlen>0) {
- ovlen+=exovlen;
+ odta.ovlen+=exovlen;
if (r.start>m.exons[i]->start-4 && r.end<m.exons[i]->end+4) {
- return 'k'; //reference contained in this assembled exon
+ odta.ovlcode='k';
+ return odta; //reference contained in this assembled exon
}
}
if (i==imax) break;
- if (r.end<m.exons[i+1]->start && r.start>m.exons[i]->end)
- return 'y'; //ref contained in this transfrag intron
+ if (r.end<m.exons[i+1]->start && r.start>m.exons[i]->end) {
+ odta.ovlcode='y'; //ref contained in this transfrag intron
+ return odta;
+ }
}
- return 'o';
- }
- // * check if transfrag contained by a ref intron
+ odta.ovlcode='o';
+ return odta;
+ } // SET reference
+ // -- MET comparison ---
+ // check if qry contained by a ref intron
for (int j=0;j<jmax;j++) {
if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
- return 'i';
+ { odta.ovlcode='i'; return odta; }
}
if (m.exons[imax]->start<r.exons[0]->end) {
//qry intron chain ends before ref intron chain starts
//check if last qry exon plugs the 1st ref intron
if (m.exons[imax]->start<=r.exons[0]->end &&
- m.exons[imax]->end>=r.exons[1]->start) return 'n';
- return 'o'; //only terminal exons overlap
+ m.exons[imax]->end>=r.exons[1]->start) {
+ odta.ovlen=m.exonOverlapLen(r);
+ odta.ovlcode='n';
+ return odta;
+ }
+ odta.ovlen=m.exons[imax]->overlapLen(r.exons[0]);
+ odta.ovlcode='o'; //only terminal exons overlap
+ return odta;
}
else if (r.exons[jmax]->start<m.exons[0]->end) {
//qry intron chain starts after ref intron chain ends
//check if first qry exon plugs the last ref intron
if (m.exons[0]->start<=r.exons[jmax-1]->end &&
- m.exons[0]->end>=r.exons[jmax]->start) return 'n';
- return 'o'; //only terminal exons overlap
+ m.exons[0]->end>=r.exons[jmax]->start) {
+ odta.ovlen=m.exonOverlapLen(r);
+ odta.ovlcode='n';
+ return odta;
+ }
+ odta.ovlen=m.exons[0]->overlapLen(r.exons[jmax]);
+ odta.ovlcode='o'; //only terminal exons overlap
+ return odta;
}
//check intron chain overlap (match, containment, intron retention etc.)
int i=1; //index of exon to the right of current qry intron
@@ -3286,9 +3338,8 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
int imlast=0; //index of exon after last intron match in query
int jmlast=0; //index of exon after last intron match in reference
//--keep track of the last overlapping introns in both qry and ref:
- //int q_last_iovl=0;
- //int r_last_iovl=0;
-
+ odta.ovlen=m.exonOverlapLen(r);
+ //int q_first_iovl=-1, r_first_iovl=-1, q_last_iovl=-1, r_last_iovl=-1;
//check for intron matches
while (i<=imax && j<=jmax) {
uint mstart=m.exons[i-1]->end; //qry intron start-end
@@ -3303,7 +3354,7 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
if (intron_ovl) ichain_match=false;
j++;
continue;
- } //no intron overlap, skipping ref intron
+ } //no overlap with this ref intron, skipping it
if (rstart>mend) { //qry intron ends before ref intron starts
//if qry intron overlaps the exon on the left, we have an intron conflict
if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1))
@@ -3315,14 +3366,26 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
continue;
} //no intron overlap, skipping qry intron
intron_ovl=true;
+
//q_last_iovl=i; //keep track of the last overlapping introns in both qry and ref
//r_last_iovl=j;
//overlapping introns, test junction matching
- bool smatch=(mstart==rstart);
- bool ematch=(mend==rend);
- if (smatch || ematch) junct_match=true;
+ bool smatch=false;
+ if (mstart==rstart) {
+ smatch=true;
+ odta.jbits.set( (i-1)<<1 );
+ odta.numJmatch++;
+ junct_match=true;
+ }
+ bool ematch=false;
+ if (mend==rend) {
+ ematch=true;
+ odta.jbits.set( ((i-1)<<1)+1 );
+ odta.numJmatch++;
+ junct_match=true;
+ }
if (smatch && ematch) {
- //perfect match for this intron
+ //perfect match of this intron
if (jmfirst==0) {
ichain_match=true;
jmfirst=j;
@@ -3353,9 +3416,13 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
if (ichain_match) { //intron (sub-)chain compatible so far (but there could still be conflicts)
if (imfirst==1 && imlast==imax) { // qry full intron chain match
if (jmfirst==1 && jmlast==jmax) {//identical intron chains
- if (stricterMatch) return (abs((int)r.exons[0]->start-(int)m.exons[0]->start)<=trange &&
- abs((int)r.exons.Last()->end-(int)m.exons.Last()->end)<=trange) ? '=' : '~';
- return '=';
+ if (stricterMatch) {
+ odta.ovlcode= (abs((int)r.exons[0]->start-(int)m.exons[0]->start)<=trange &&
+ abs((int)r.exons.Last()->end-(int)m.exons.Last()->end)<=trange) ? '=' : '~';
+ return odta;
+ }
+ odta.ovlcode='=';
+ return odta;
}
// -- a partial intron chain match
if (jmfirst>1) {
@@ -3379,7 +3446,10 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
else { intron_retention=true; ichain_match=false; }
}
}
- if (ichain_match && l_iovh<4 && r_iovh<4) return 'c';
+ if (ichain_match && l_iovh<4 && r_iovh<4) {
+ odta.ovlcode='c';
+ return odta;
+ }
qry_intron_poking=GMAX(l_iovh, r_iovh);
} else if ((jmfirst==1 && jmlast==jmax)) {//ref intron chain match
//check if the reference j-chain is contained in qry i-chain
@@ -3402,7 +3472,10 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
else { ichain_match = false; }
}
}
- if (ichain_match && l_jovh<4 && r_jovh<4) return 'k'; //reverse containment
+ if (ichain_match && l_jovh<4 && r_jovh<4) {
+ odta.ovlcode='k'; //reverse containment
+ return odta;
+ }
ref_intron_poking=GMAX(l_jovh, r_jovh);
}
}
@@ -3415,13 +3488,29 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch, int trange
//if (ref_intron_poking>0 && )
//we just need to have no intron poking going on
if (!intron_conflict && ref_intron_poking<4
- && qry_intron_poking<4) return 'm';
- return 'n';
+ && qry_intron_poking<4) {
+ odta.ovlcode='m';
+ return odta;
+ }
+ odta.ovlcode='n';
+ return odta;
+ }
+ //if (intron_ovl) { ?
+ if (junct_match) {
+ odta.ovlcode='j';
+ return odta;
+ }
+ //what's left could be intron overlap but with no junction match = 'o'
+ if (odta.ovlen>4) {
+ odta.ovlcode='o';
+ return odta;
+ }
+ //but if there is no exon overlap, we have 'i' or 'y'
+ //exons are within the introns of the other!
+ if (m.start>r.start && r.end > m.end) {
+ odta.ovlcode='i';
+ return odta;
}
- if (junct_match) return 'j';
- //we could have 'o' or 'y' here
- //any real exon overlaps?
- ovlen=m.exonOverlapLen(r);
- if (ovlen>4) return 'o';
- return 'y'; //all reference exons are within transfrag introns!
+ odta.ovlcode='y';
+ return odta; //all reference exons are within transfrag introns!
}
=====================================
gff.h
=====================================
@@ -8,8 +8,8 @@
#include "codons.h"
#include "GFaSeqGet.h"
#include "GList.hh"
-//#include "GHash.hh"
#include "GHashMap.hh"
+#include "GBitVec.h"
#ifdef CUFFLINKS
#include <boost/crc.hpp> // for boost::crc_32_type
@@ -24,7 +24,6 @@ extern int gff_fid_CDS;
extern const uint GFF_MAX_LOCUS;
extern const uint GFF_MAX_EXON;
extern const uint GFF_MAX_INTRON;
-extern const int CLASSCODE_OVL_RANK;
//extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
//extern const byte gfo_flagShift_LEVEL;
@@ -57,10 +56,22 @@ typedef void GFFCommentParser(const char* cmline, GfList* gflst); //comment pars
class GffReader;
class GffObj;
-//---transcript overlapping - utility functions:
-int classcode_rank(char c); //returns priority value for class codes
+//---transcript overlapping - utility functions
-char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool stricterMatch=false, int trange=0); //returns: class code
+extern const byte CLASSCODE_OVL_RANK; //rank value just above 'o' class code
+
+byte classcode_rank(char c); //returns priority value for class codes
+
+struct TOvlData {
+ char ovlcode;
+ int ovlen;
+ int16_t numJmatch; //number of matching junctions (not introns)
+ GBitVec jbits; //bit array with 1 bit for each junctions (total = 2 * num_introns)
+ TOvlData(char oc=0, int olen=0, int16_t nmj=0, GBitVec* jb=NULL):ovlcode(oc),
+ ovlen(olen),numJmatch(nmj),jbits(jb) { }
+};
+
+TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch=false, int trange=0);
char transcriptMatch(GffObj& a, GffObj& b, int& ovlen, int trange=0); //generic transcript match test
// -- return '=', '~' or 0
@@ -262,6 +273,8 @@ class GffLine {
GFREE(parents);
parents=NULL;
}
+ void ensembl_GFF_ID_process(char*& id);
+ void ensembl_GTF_ID_process(char*& id, const char* ver_attr);
static char* extractGFFAttr(char*& infostr, const char* oline, const char* pre, bool caseStrict=false,
bool enforce_GTF2=false, int* rlen=NULL, bool deleteAttr=true);
char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL){
@@ -791,7 +804,7 @@ protected:
public:
void removeExon(int idx);
void removeExon(GffExon* p);
- char strand; //true if features are on the reverse complement strand
+ char strand; //'+', '-' or '.'
GffScore gscore;
int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
@@ -865,6 +878,7 @@ public:
void addAttr(const char* attrname, const char* attrvalue);
int removeAttr(const char* attrname, const char* attrval=NULL);
int removeAttr(int aid, const char* attrval=NULL);
+ int removeAttrs(GStrSet<>& attrSet); //remove attributes whose names are NOT in attrSet
int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
@@ -1008,7 +1022,7 @@ public:
void updateCDSPhase(GList<GffExon>& segs); //for CDS-only features, updates GffExon::phase
void printGTab(FILE* fout, char** extraAttrs=NULL);
void printGxfExon(FILE* fout, const char* tlabel, const char* gseqname,
- bool iscds, GffExon* exon, bool gff3, bool cvtChars, char* dbuf, int dbuf_len);
+ bool iscds, GffExon* exon, bool gff3, bool cvtChars);
void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false);
void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) {
@@ -1018,6 +1032,7 @@ public:
const char* gfparent=NULL, bool cvtChars=false) {
printGxf(fout, pgffAny, tlabel, gfparent, cvtChars);
}
+ bool printAttrs(FILE* fout, const char* sep=";", bool GTFstyle=false, bool cvtChars=false, bool sepFirst=true);
void printTranscriptGff(FILE* fout, char* tlabel=NULL,
bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) {
if (isValidTranscript())
@@ -1026,7 +1041,7 @@ public:
void printExonList(FILE* fout); //print comma delimited list of exon intervals
void printCDSList(FILE* fout); //print comma delimited list of CDS intervals
- void printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len);
+ void printBED(FILE* fout, bool cvtChars);
//print a BED-12 line + GFF3 attributes in 13th field
void printSummary(FILE* fout=NULL);
@@ -1166,6 +1181,9 @@ class GffReader {
bool sortByLoc:1; //if records should be sorted by location
bool refAlphaSort:1; //if sortByLoc, reference sequences are
// sorted lexically instead of their id#
+ //Ensembl ID processing:
+ bool xEnsemblID:1; //for ensemble GTF merge gene_version and transcript_version into the ID
+ //for ensemble GFF3, cannot merge version (!), just remove "transcript:" and "gene:" prefixes
bool gff_warns:1;
};
};
@@ -1206,7 +1224,7 @@ class GffReader {
//GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline);
GffObj* updateParent(GffObj* newgfh, GffObj* parent);
- bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon*>* pex=NULL);
+ bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon*>* pex = NULL);
GPVec<GSeqStat> gseqStats; //populated after finalize() with only the ref seqs in this file
GffReader(FILE* f=NULL, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0),
buflen(0), flags(0), fh(f), fname(NULL), commentParser(NULL), gffline(NULL),
@@ -1234,6 +1252,8 @@ class GffReader {
}
*/
void gene2Exon(bool v) { gene2exon=v;}
+ void procEnsemblID(bool v) { xEnsemblID=v;}
+ bool procEnsemblID() { return xEnsemblID; }
void enableSorting(bool sorting=true) { sortByLoc=sorting; }
bool getSorting() { return sortByLoc; }
void isBED(bool v=true) { is_BED=v; } //should be set before any parsing!
@@ -1297,7 +1317,7 @@ class GffReader {
GFREE(linebuf);
//GFREE(lastReadNext);
gffnames_unref(GffObj::names);
- }
+ }
GffLine* nextGffLine();
=====================================
htest.cpp
=====================================
@@ -5,16 +5,13 @@
namespace old {
#include "GHash.hh"
}
+
#include "GResUsage.h"
-#include <cstdint>
#include <iostream>
//#include "tsl/hopscotch_map.h"
//#include "tsl/robin_map.h"
#include <unordered_map>
//#include "ska/bytell_hash_map.hpp"
-
-//#include "khashl.hh"
-//#include "city.h"
#include "GHashMap.hh"
#define USAGE "Usage:\n\
@@ -108,7 +105,7 @@ struct cstr_hash {
};
void run_GHash(GResUsage& swatch, GPVec<HStrData> & hstrs, const char* label) {
- old::GHash<int> ghash;
+ old::GHash<int> ghash; // @suppress("Type cannot be resolved")
int num_add=0, num_rm=0, num_clr=0;
GMessage("----------------- %s ----------------\n", label);
ghash.Clear();
@@ -298,7 +295,8 @@ void run_GHashMap(GResUsage& swatch, GPVec<HStrData> & hstrs, const char* label)
int num_add=0, num_rm=0, num_clr=0;
//GKHashSet<const char*> khset;
//GHashSet<> khset;
- GHash<int, cstr_hash, GHashKey_Eq<const char*>, uint32_t> khset;
+ //GHash<int, cstr_hash, GHashKey_Eq<const char*>, uint32_t> khset;
+ GHashMap<const char*, int> khset;
GMessage("----------------- %s ----------------\n", label);
int cl_i=0;
swatch.start();
@@ -337,7 +335,9 @@ void run_GHashMap(GResUsage& swatch, GPVec<HStrData> & hstrs, const char* label)
void run_GxxHashMap(GResUsage& swatch, GPVec<HStrData> & hstrs, const char* label) {
int num_add=0, num_rm=0, num_clr=0;
- GHash<int> khset;
+ //GHash<int> khset;
+ GHashMap<const char*, int, GHashKey_xxHash32<const char*>,
+ GHashKey_Eq<const char*>, uint32_t > khset;
GMessage("----------------- %s ----------------\n", label);
int cl_i=0;
swatch.start();
@@ -561,16 +561,17 @@ int main(int argc, char* argv[]) {
run_Robin(swatch, strs, "robin no suffix");
showTimings(swatch);
*/
-/*
+
run_Khashl(swatch, sufstrs, "khashl w/ suffix");
showTimings(swatch);
+/*
run_Khashl(swatch, strs, "khashl no suffix");
showTimings(swatch);
*/
- run_GHashMap(swatch, sufstrs, "GHashMap xxHash32 w/ suffix");
+ run_GHashMap(swatch, sufstrs, "GHashMap default w/ suffix");
showTimings(swatch);
- run_GxxHashMap(swatch, sufstrs, "GHashMap xxHash64 w/ suffix");
+ run_GxxHashMap(swatch, sufstrs, "GHashMap xxHash32 w/ suffix");
showTimings(swatch);
//run_GHashMap(swatch, strs, "GHashMap no suffix");
=====================================
khashl.hh
=====================================
@@ -1,3 +1,24 @@
+/* The MIT License
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
#ifndef __AC_KHASHL_HPP
#define __AC_KHASHL_HPP
@@ -6,27 +27,6 @@
#include <cstring> // for memset()
#include <stdint.h> // for uint32_t
-/* // ==> Code example <==
-#include <cstdio>
-#include "khashl.hpp"
-
-int main(void)
-{
- klib::KHashMap<uint32_t, int, std::hash<uint32_t> > h; // NB: C++98 doesn't have std::hash
- uint32_t k;
- int absent;
- h[43] = 1, h[53] = 2, h[63] = 3, h[73] = 4; // one way to insert
- k = h.put(53, &absent), h.value(k) = -2; // another way to insert
- if (!absent) printf("already in the table\n"); // which allows to test presence
- if (h.get(33) == h.end()) printf("not found!\n"); // test presence without insertion
- h.del(h.get(43)); // deletion
- for (k = 0; k != h.end(); ++k) // traversal
- if (h.occupied(k)) // some buckets are not occupied; skip them
- printf("%u => %d\n", h.key(k), h.value(k));
- return 0;
-}
-*/
-
namespace klib {
/***********
@@ -46,13 +46,13 @@ protected:
static inline khint_t __kh_h2b(uint32_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); }
static inline khint_t __kh_h2b(uint64_t hash, khint_t bits) { return hash * 11400714819323198485ULL >> (64 - bits); }
public:
- KHashSet() : bits(0), count(0), used(0), keys(0) {};
+ KHashSet() : bits(0), count(0), used(NULL), keys(NULL) {};
~KHashSet() { std::free(used); std::free(keys); };
inline khint_t n_buckets() const { return used? khint_t(1) << bits : 0; }
inline khint_t end() const { return n_buckets(); }
inline khint_t size() const { return count; }
inline T &key(khint_t x) { return keys[x]; };
- inline bool occupied(khint_t x) const { return (__kh_used(used, x) != 0); }
+ inline bool _used(khint_t i) const { return (used[i>>5] >> (i&0x1fU) & 1U); }
void clear(void) {
if (!used) return;
memset(used, 0, __kh_fsize(n_buckets()) * sizeof(uint32_t));
=====================================
wyhash.h
=====================================
@@ -0,0 +1,266 @@
+// This is free and unencumbered software released into the public domain under The Unlicense (http://unlicense.org/)
+// main repo: https://github.com/wangyi-fudan/wyhash
+// author: 王一 Wang Yi <godspeed_china at yeah.net>
+// contributors: Reini Urban, Dietrich Epp, Joshua Haberman, Tommy Ettinger, Daniel Lemire, Otmar Ertl, cocowalla, leo-yuriev, Diego Barrios Romero, paulie-g, dumblob, Yann Collet, ivte-ms, hyb, James Z.M. Gao, easyaspi314 (Devin), TheOneric
+
+/* quick example:
+ string s="fjsakfdsjkf";
+ uint64_t hash=wyhash(s.c_str(), s.size(), 0, _wyp);
+*/
+
+#ifndef wyhash_final_version_3
+#define wyhash_final_version_3
+
+#ifndef WYHASH_CONDOM
+//protections that produce different results:
+//1: normal valid behavior
+//2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication"
+#define WYHASH_CONDOM 1
+#endif
+
+#ifndef WYHASH_32BIT_MUM
+//0: normal version, slow on 32 bit systems
+//1: faster on 32 bit systems but produces different results, incompatible with wy2u0k function
+#define WYHASH_32BIT_MUM 0
+#endif
+
+//includes
+#include <stdint.h>
+#include <string.h>
+#if defined(_MSC_VER) && defined(_M_X64)
+ #include <intrin.h>
+ #pragma intrinsic(_umul128)
+#endif
+
+//likely and unlikely macros
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+ #define _likely_(x) __builtin_expect(x,1)
+ #define _unlikely_(x) __builtin_expect(x,0)
+#else
+ #define _likely_(x) (x)
+ #define _unlikely_(x) (x)
+#endif
+
+//128bit multiply function
+static inline uint64_t _wyrot(uint64_t x) { return (x>>32)|(x<<32); }
+static inline void _wymum(uint64_t *A, uint64_t *B){
+#if(WYHASH_32BIT_MUM)
+ uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(uint32_t)*B, lh=(uint32_t)*A*(*B>>32), ll=(uint64_t)(uint32_t)*A*(uint32_t)*B;
+ #if(WYHASH_CONDOM>1)
+ *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll;
+ #else
+ *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll;
+ #endif
+#elif defined(__SIZEOF_INT128__)
+ __uint128_t r=*A; r*=*B;
+ #if(WYHASH_CONDOM>1)
+ *A^=(uint64_t)r; *B^=(uint64_t)(r>>64);
+ #else
+ *A=(uint64_t)r; *B=(uint64_t)(r>>64);
+ #endif
+#elif defined(_MSC_VER) && defined(_M_X64)
+ #if(WYHASH_CONDOM>1)
+ uint64_t a, b;
+ a=_umul128(*A,*B,&b);
+ *A^=a; *B^=b;
+ #else
+ *A=_umul128(*A,*B,B);
+ #endif
+#else
+ uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo;
+ uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t<rl;
+ lo=t+(rm1<<32); c+=lo<t; hi=rh+(rm0>>32)+(rm1>>32)+c;
+ #if(WYHASH_CONDOM>1)
+ *A^=lo; *B^=hi;
+ #else
+ *A=lo; *B=hi;
+ #endif
+#endif
+}
+
+//multiply and xor mix function, aka MUM
+static inline uint64_t _wymix(uint64_t A, uint64_t B){ _wymum(&A,&B); return A^B; }
+
+//endian macros
+#ifndef WYHASH_LITTLE_ENDIAN
+ #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #define WYHASH_LITTLE_ENDIAN 1
+ #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #define WYHASH_LITTLE_ENDIAN 0
+ #else
+ #warning could not determine endianness! Falling back to little endian.
+ #define WYHASH_LITTLE_ENDIAN 1
+ #endif
+#endif
+
+//read functions
+#if (WYHASH_LITTLE_ENDIAN)
+static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v;}
+static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return v;}
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return __builtin_bswap64(v);}
+static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return __builtin_bswap32(v);}
+#elif defined(_MSC_VER)
+static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return _byteswap_uint64(v);}
+static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return _byteswap_ulong(v);}
+#else
+static inline uint64_t _wyr8(const uint8_t *p) {
+ uint64_t v; memcpy(&v, p, 8);
+ return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >> 8) & 0xff000000)| ((v << 8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000));
+}
+static inline uint64_t _wyr4(const uint8_t *p) {
+ uint32_t v; memcpy(&v, p, 4);
+ return (((v >> 24) & 0xff)| ((v >> 8) & 0xff00)| ((v << 8) & 0xff0000)| ((v << 24) & 0xff000000));
+}
+#endif
+static inline uint64_t _wyr3(const uint8_t *p, size_t k) { return (((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1];}
+//wyhash main function
+static inline uint64_t wyhash(const void *key, size_t len, uint64_t seed, const uint64_t *secret){
+ const uint8_t *p=(const uint8_t *)key; seed^=*secret; uint64_t a, b;
+ if(_likely_(len<=16)){
+ if(_likely_(len>=4)){ a=(_wyr4(p)<<32)|_wyr4(p+((len>>3)<<2)); b=(_wyr4(p+len-4)<<32)|_wyr4(p+len-4-((len>>3)<<2)); }
+ else if(_likely_(len>0)){ a=_wyr3(p,len); b=0;}
+ else a=b=0;
+ }
+ else{
+ size_t i=len;
+ if(_unlikely_(i>48)){
+ uint64_t see1=seed, see2=seed;
+ do{
+ seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed);
+ see1=_wymix(_wyr8(p+16)^secret[2],_wyr8(p+24)^see1);
+ see2=_wymix(_wyr8(p+32)^secret[3],_wyr8(p+40)^see2);
+ p+=48; i-=48;
+ }while(_likely_(i>48));
+ seed^=see1^see2;
+ }
+ while(_unlikely_(i>16)){ seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed); i-=16; p+=16; }
+ a=_wyr8(p+i-16); b=_wyr8(p+i-8);
+ }
+ return _wymix(secret[1]^len,_wymix(a^secret[1],b^seed));
+}
+
+//the default secret parameters
+static const uint64_t _wyp[4] = {0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, 0x589965cc75374cc3ull};
+
+//a useful 64bit-64bit mix function to produce deterministic pseudo random numbers that can pass BigCrush and PractRand
+static inline uint64_t wyhash64(uint64_t A, uint64_t B){ A^=0xa0761d6478bd642full; B^=0xe7037ed1a0b428dbull; _wymum(&A,&B); return _wymix(A^0xa0761d6478bd642full,B^0xe7037ed1a0b428dbull);}
+
+//The wyrand PRNG that pass BigCrush and PractRand
+static inline uint64_t wyrand(uint64_t *seed){ *seed+=0xa0761d6478bd642full; return _wymix(*seed,*seed^0xe7037ed1a0b428dbull);}
+
+//convert any 64 bit pseudo random numbers to uniform distribution [0,1). It can be combined with wyrand, wyhash64 or wyhash.
+static inline double wy2u01(uint64_t r){ const double _wynorm=1.0/(1ull<<52); return (r>>12)*_wynorm;}
+
+//convert any 64 bit pseudo random numbers to APPROXIMATE Gaussian distribution. It can be combined with wyrand, wyhash64 or wyhash.
+static inline double wy2gau(uint64_t r){ const double _wynorm=1.0/(1ull<<20); return ((r&0x1fffff)+((r>>21)&0x1fffff)+((r>>42)&0x1fffff))*_wynorm-3.0;}
+
+#if(!WYHASH_32BIT_MUM)
+//fast range integer random number generation on [0,k) credit to Daniel Lemire. May not work when WYHASH_32BIT_MUM=1. It can be combined with wyrand, wyhash64 or wyhash.
+static inline uint64_t wy2u0k(uint64_t r, uint64_t k){ _wymum(&r,&k); return k; }
+#endif
+
+//make your own secret
+static inline void make_secret(uint64_t seed, uint64_t *secret){
+ uint8_t c[] = {15, 23, 27, 29, 30, 39, 43, 45, 46, 51, 53, 54, 57, 58, 60, 71, 75, 77, 78, 83, 85, 86, 89, 90, 92, 99, 101, 102, 105, 106, 108, 113, 114, 116, 120, 135, 139, 141, 142, 147, 149, 150, 153, 154, 156, 163, 165, 166, 169, 170, 172, 177, 178, 180, 184, 195, 197, 198, 201, 202, 204, 209, 210, 212, 216, 225, 226, 228, 232, 240 };
+ for(size_t i=0;i<4;i++){
+ uint8_t ok;
+ do{
+ ok=1; secret[i]=0;
+ for(size_t j=0;j<64;j+=8) secret[i]|=((uint64_t)c[wyrand(&seed)%sizeof(c)])<<j;
+ if(secret[i]%2==0){ ok=0; continue; }
+ for(size_t j=0;j<i;j++) {
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+ if(__builtin_popcountll(secret[j]^secret[i])!=32){ ok=0; break; }
+#elif defined(_MSC_VER) && defined(_M_X64)
+ if(_mm_popcnt_u64(secret[j]^secret[i])!=32){ ok=0; break; }
+#else
+ //manual popcount
+ uint64_t x = secret[j]^secret[i];
+ x -= (x >> 1) & 0x5555555555555555;
+ x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
+ x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f;
+ x = (x * 0x0101010101010101) >> 56;
+ if(x!=32){ ok=0; break; }
+#endif
+ }
+ }while(!ok);
+ }
+}
+
+/* This is world's fastest hash map: 2x faster than bytell_hash_map.
+ It does not store the keys, but only the hash/signature of keys.
+ First we use pos=hash1(key) to approximately locate the bucket.
+ Then we search signature=hash2(key) from pos linearly.
+ If we find a bucket with matched signature we report the bucket
+ Or if we meet a bucket whose signifure=0, we report a new position to insert
+ The signature collision probability is very low as we usually searched N~10 buckets.
+ By combining hash1 and hash2, we acturally have 128 bit anti-collision strength.
+ hash1 and hash2 can be the same function, resulting lower collision resistance but faster.
+ The signature is 64 bit, but can be modified to 32 bit if necessary for save space.
+ The above two can be activated by define WYHASHMAP_WEAK_SMALL_FAST
+ simple examples:
+ const size_t size=213432;
+ vector<wyhashmap_t> idx(size); // allocate the index of fixed size. idx MUST be zeroed.
+ vector<value_class> value(size); // we only care about the index, user should maintain his own value vectors.
+ string key="dhskfhdsj" // the object to be inserted into idx
+ size_t pos=wyhashmap(idx.data(), idx.size(), key.c_str(), key.size(), 1); // get the position and insert
+ if(pos<size) value[pos]++; // we process the vallue
+ else cerr<<"map is full\n";
+ pos=wyhashmap(idx.data(), idx.size(), key.c_str(), key.size(), 0); // just lookup by setting insert=0
+ if(pos<size) value[pos]++; // we process the vallue
+ else cerr<<"the key does not exist\n";
+*/
+#ifdef WYHASHMAP_WEAK_SMALL_FAST // for small hashmaps whose size < 2^24 and acceptable collision
+typedef uint32_t wyhashmap_t;
+#else
+typedef uint64_t wyhashmap_t;
+#endif
+
+static inline size_t wyhashmap(wyhashmap_t *idx, size_t idx_size, const void *key, size_t key_size, uint8_t insert, uint64_t *secret){
+ size_t i=1; uint64_t h2; wyhashmap_t sig;
+ do{ sig=h2=wyhash(key,key_size,i,secret); i++; }while(_unlikely_(!sig));
+#ifdef WYHASHMAP_WEAK_SMALL_FAST
+ size_t i0=wy2u0k(h2,idx_size);
+#else
+ size_t i0=wy2u0k(wyhash(key,key_size,0,secret),idx_size);
+#endif
+ for(i=i0; i<idx_size&&idx[i]&&idx[i]!=sig; i++);
+ if(_unlikely_(i==idx_size)){
+ for(i=0; i<i0&&idx[i]&&idx[i]!=sig; i++);
+ if(i==i0) return idx_size;
+ }
+ if(!idx[i]){
+ if(insert) idx[i]=sig;
+ else return idx_size;
+ }
+ return i;
+}
+#endif
+
+/* The Unlicense
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+*/
View it on GitLab: https://salsa.debian.org/med-team/libgclib/-/commit/fd5fa4bdea23f418f69e21fc84af3aa78bd5eaa1
--
View it on GitLab: https://salsa.debian.org/med-team/libgclib/-/commit/fd5fa4bdea23f418f69e21fc84af3aa78bd5eaa1
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211012/0432e411/attachment-0001.htm>
More information about the debian-med-commit
mailing list