[med-svn] [roguenarok] 01/02: New upstream version 1.0
Andreas Tille
tille at debian.org
Mon Mar 13 10:48:38 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository roguenarok.
commit c0bda03bb31419fb66143607a92f79a636189c89
Author: Andreas Tille <tille at debian.org>
Date: Mon Mar 13 11:47:38 2017 +0100
New upstream version 1.0
---
Array.c | 38 +
Array.h | 63 ++
BitVector.c | 130 +++
BitVector.h | 67 ++
Dropset.c | 380 +++++++++
Dropset.h | 80 ++
HashTable.c | 331 ++++++++
HashTable.h | 82 ++
List.c | 490 ++++++++++++
List.h | 86 ++
Makefile | 57 ++
Node.c | 77 ++
Node.h | 51 ++
ProfileElem.c | 184 +++++
ProfileElem.h | 72 ++
README | 14 +
RogueNaRok.c | 2289 +++++++++++++++++++++++++++++++++++++++++++++++++++++
Tree.c | 1462 ++++++++++++++++++++++++++++++++++
Tree.h | 57 ++
common.c | 225 ++++++
common.h | 86 ++
legacy.c | 153 ++++
legacy.h | 143 ++++
newFunctions.c | 197 +++++
newFunctions.h | 45 ++
parallel.c | 276 +++++++
parallel.h | 76 ++
rnr-lsi.c | 410 ++++++++++
rnr-mast.c | 965 ++++++++++++++++++++++
rnr-prune.c | 230 ++++++
rnr-tii.c | 330 ++++++++
sharedVariables.h | 46 ++
32 files changed, 9192 insertions(+)
diff --git a/Array.c b/Array.c
new file mode 100644
index 0000000..a523d92
--- /dev/null
+++ b/Array.c
@@ -0,0 +1,38 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "Array.h"
+
+void freeArray(Array *array)
+{
+ free(array->arrayTable);
+ free(array);
+}
+
diff --git a/Array.h b/Array.h
new file mode 100644
index 0000000..62dad92
--- /dev/null
+++ b/Array.h
@@ -0,0 +1,63 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef ARRAY_H
+#define ARRAY_H
+
+#include <stdlib.h>
+
+typedef struct
+{
+ void *arrayTable;
+ void *commonAttributes;
+ unsigned int length;
+} Array;
+
+
+#define CLONE_ARRAY_FLAT(FUNCNAME, TYPE, TYPEATTR) \
+Array *FUNCNAME (const Array *array) \
+{ \
+ Array *result = CALLOC(1,sizeof(Array)); \
+ result->length = array->length; \
+ result->arrayTable = CALLOC(result->length, sizeof(TYPE)); \
+ memcpy(result->arrayTable, array->arrayTable, array->length * sizeof(TYPE) ); \
+ \
+ if( array->commonAttributes ) \
+ { \
+ result->commonAttributes = CALLOC(1 , sizeof(TYPEATTR)); \
+ memcpy(result->commonAttributes, array->commonAttributes, sizeof(TYPEATTR) ) ; \
+ } \
+ return result; \
+}
+
+void freeArray(Array *array);
+
+
+#endif
diff --git a/BitVector.c b/BitVector.c
new file mode 100644
index 0000000..a52e7a0
--- /dev/null
+++ b/BitVector.c
@@ -0,0 +1,130 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "BitVector.h"
+
+BitVector *mask32;
+
+void initializeMask()
+{
+ int i;
+ mask32 = CALLOC(MASK_LENGTH, sizeof(BitVector));
+ mask32[0] = 1;
+
+ for(i = 1; i < MASK_LENGTH; ++i)
+ mask32[i] = mask32[i-1] << 1;
+}
+
+
+void printBitVector(BitVector *bv, int length)
+{
+ int i ;
+ for(i = 0; i < length * 32; ++i)
+ printf("%d", NTH_BIT_IS_SET(bv, i) ? 1 : 0);
+}
+
+
+void freeBitVectors(unsigned int **v, int n)
+{
+ int i;
+
+ for(i = 1; i < n; i++)
+ free(v[i]);
+}
+
+
+boolean areSameBitVectors(BitVector *a, BitVector *b, int bitVectorLength)
+{
+ int
+ i;
+
+ FOR_0_LIMIT(i,bitVectorLength)
+ if(a[i] != b[i])
+ return FALSE;
+
+ return TRUE;
+}
+
+
+BitVector genericBitCount(BitVector* bitVector, int bitVectorLength)
+{
+ BitVector
+ i,
+ result = 0;
+
+ for(i = 0; i < bitVectorLength; i++)
+ result += BIT_COUNT(bitVector[i]);
+
+ return result;
+}
+
+
+static int iterated_bitcount(BitVector n)
+{
+ int
+ count=0;
+
+ while(n)
+ {
+ count += n & 0x1u ;
+ n >>= 1 ;
+ }
+
+ return count;
+}
+
+void compute_bits_in_16bits(void)
+{
+ BitVector i;
+
+ for (i = 0; i < (0x1u<<16); i++)
+ bits_in_16bits[i] = iterated_bitcount(i);
+
+ return ;
+}
+
+BitVector precomputed16_bitcount (BitVector n)
+{
+ /* works only for 32-bit int*/
+ return bits_in_16bits [n & 0xffffu]
+ + bits_in_16bits [(n >> 16) & 0xffffu] ;
+}
+
+
+
+BitVector *copyBitVector(BitVector *bitVector, int bitVectorLength)
+{
+ BitVector *result = CALLOC(bitVectorLength, sizeof(BitVector));
+ memcpy(result, bitVector,bitVectorLength * sizeof(BitVector));
+ return result;
+}
+
+
+
diff --git a/BitVector.h b/BitVector.h
new file mode 100644
index 0000000..cb2c0dc
--- /dev/null
+++ b/BitVector.h
@@ -0,0 +1,67 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#ifndef BITVECTOR_H
+#define BITVECTOR_H
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+#include "common.h"
+
+typedef unsigned int BitVector;
+
+#define BIT_COUNT(x) precomputed16_bitcount(x)
+#define NUMBER_BITS_IN_COMPLEMENT(bipartition) (mxtips - dropRound - bipartition->numberOfBitsSet)
+#define GET_BITVECTOR_LENGTH(x) (((x) % MASK_LENGTH) ? ((x) / MASK_LENGTH + 1) : ((x) / MASK_LENGTH))
+#define FLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] |= mask32[ (n) % MASK_LENGTH ])
+#define UNFLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] &= ~mask32[ (n) % MASK_LENGTH ])
+#define NTH_BIT_IS_SET(bitVector,n) (bitVector[(n) / MASK_LENGTH] & mask32[(n) % MASK_LENGTH])
+#define NTH_BIT_IS_SET_IN_INT(integer,n) (integer & mask32[n])
+#define MASK_LENGTH 32
+
+char bits_in_16bits [0x1u << 16];
+extern BitVector *mask32;
+
+void initializeMask();
+BitVector genericBitCount(BitVector* bitVector, int bitVectorLength);
+BitVector precomputed16_bitcount (BitVector n);
+void compute_bits_in_16bits(void);
+void printBitVector(BitVector *bv, int length);
+void freeBitVectors(BitVector **v, int n);
+BitVector *copyBitVector(BitVector *bitVector, int bitVectorLength);
+void printBitVector(BitVector *bv, int length);
+
+#endif
+
+
+
diff --git a/Dropset.c b/Dropset.c
new file mode 100644
index 0000000..a008508
--- /dev/null
+++ b/Dropset.c
@@ -0,0 +1,380 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#include "Dropset.h"
+
+unsigned int *randForTaxa = NULL;
+extern int mxtips,
+ maxDropsetSize,
+ bitVectorLength;
+extern BitVector *droppedTaxa,
+ *neglectThose,
+ *paddingBits;
+
+
+void initializeRandForTaxa(int mxtips)
+{
+ int i;
+ randForTaxa = CALLOC(mxtips,sizeof(unsigned int));
+ FOR_0_LIMIT(i,mxtips)
+ randForTaxa[i] = rand();
+}
+
+
+/* is ONLY done for adding OWN elements */
+void addEventToDropsetPrime(Dropset *dropset, int a, int b)
+{
+ List
+ *lIter;
+
+ lIter = dropset->ownPrimeE;
+ while(lIter)
+ {
+ List *next = lIter->next;
+ MergingEvent *me = lIter->value;
+
+ assert(NOT me->isComplex);
+
+ if( me->mergingBipartitions.pair[0] == a
+ || me->mergingBipartitions.pair[1] == b
+ || me->mergingBipartitions.pair[1] == a
+ || me->mergingBipartitions.pair[0] == b)
+ {
+ assert( (me->mergingBipartitions.pair[0] == a)
+ + (me->mergingBipartitions.pair[1] == b)
+ + (me->mergingBipartitions.pair[0] == b)
+ + (me->mergingBipartitions.pair[1] == a)
+ == 2);
+ return;
+ }
+ lIter = next;
+ }
+
+ MergingEvent
+ *result = CALLOC(1,sizeof(MergingEvent));
+ result->mergingBipartitions.pair[0] = b;
+ result->mergingBipartitions.pair[1] = a;
+
+ APPEND(result, dropset->ownPrimeE);
+}
+
+
+List *addEventToDropsetCombining(List *complexEvents, MergingBipartitions primeEvent)
+{
+ int
+ a = primeEvent.pair[0],
+ b = primeEvent.pair[1];
+
+ List
+ *firstElem = NULL,
+ *secondElem = NULL;
+
+ /* find list elems that already contain merging events */
+ List *iter ;
+
+ iter = complexEvents;
+ FOR_LIST(iter)
+ {
+ int res = isInIndexListSpecial(a,b,((MergingEvent*)iter->value)->mergingBipartitions.many);
+ if(res)
+ {
+ if(NOT firstElem)
+ firstElem = iter;
+ else if(NOT secondElem)
+ secondElem = iter;
+ else break;
+ }
+ }
+
+
+ if(firstElem && secondElem)
+ {
+ /* merge bips into first elem */
+ IndexList
+ *il = ((MergingEvent*)secondElem->value)->mergingBipartitions.many,
+ *persistentIndexList = ((MergingEvent*)firstElem->value)->mergingBipartitions.many;
+ FOR_LIST(il)
+ persistentIndexList = appendToIndexListIfNotThere(il->index, persistentIndexList);
+ ((MergingEvent*)firstElem->value)->mergingBipartitions.many = persistentIndexList;
+ freeIndexList(((MergingEvent*)secondElem->value)->mergingBipartitions.many);
+ free((MergingEvent*)secondElem->value);
+
+ /* remove second element */
+ if(complexEvents == secondElem)
+ complexEvents = secondElem->next;
+ else
+ {
+ iter = complexEvents;
+ FOR_LIST(iter)
+ {
+ if(iter->next == secondElem)
+ {
+ iter->next = iter->next->next;
+ break;
+ }
+ }
+ }
+ free(secondElem);
+ }
+ else if(firstElem)
+ {
+ assert( NOT secondElem);
+ MergingEvent *me = firstElem->value;
+ me->mergingBipartitions.many = appendToIndexListIfNotThere(a,me->mergingBipartitions.many);
+ me->mergingBipartitions.many = appendToIndexListIfNotThere(b,me->mergingBipartitions.many);
+ }
+ else
+ {
+ MergingEvent
+ *me = CALLOC(1,sizeof(MergingEvent));
+ me->isComplex = TRUE;
+
+ me->mergingBipartitions.many = NULL;
+ APPEND_INT(a,me->mergingBipartitions.many);
+ APPEND_INT(b,me->mergingBipartitions.many);
+ APPEND(me, complexEvents);
+ }
+
+ return complexEvents;
+}
+
+
+void freeDropsetDeepInHash(void *value)
+{
+ freeDropsetDeep(value, TRUE);
+}
+
+
+void freeDropsetDeep(void *value, boolean extended)
+{
+ Dropset
+ *dropset = (Dropset*) value;
+ List
+ *iter = NULL;
+
+ if(dropset->taxaToDrop)
+ freeIndexList(dropset->taxaToDrop);
+
+ /* free combined events */
+ if(extended && dropset->complexEvents)
+ {
+ iter = dropset->complexEvents;
+ while(iter)
+ {
+ List *next = iter->next;
+ MergingEvent *me = iter->value;
+ freeIndexList(me->mergingBipartitions.many);
+ free(me);
+ iter = next;
+ }
+ freeListFlat(dropset->complexEvents);
+ }
+
+ if(extended && dropset->acquiredPrimeE)
+ freeListFlat(dropset->acquiredPrimeE);
+
+ /* ownPrimeE */
+ iter = dropset->ownPrimeE;
+ while(iter)
+ {
+ List *next = iter->next;
+ free((MergingEvent*)iter->value);
+ iter = next;
+ }
+ freeListFlat(dropset->ownPrimeE);
+
+ free(dropset);
+}
+
+
+void freeDropsetDeepInEnd(void *value)
+{
+ Dropset
+ *dropset = (Dropset*) value;
+ List
+ *iter = NULL;
+
+ if(dropset->taxaToDrop)
+ freeIndexList(dropset->taxaToDrop);
+
+ /* free combined events */
+ if(dropset->complexEvents)
+ freeListFlat(dropset->complexEvents);
+
+ if( dropset->acquiredPrimeE)
+ freeListFlat(dropset->acquiredPrimeE);
+
+ /* ownPrimeE */
+ iter = dropset->ownPrimeE;
+ while(iter)
+ {
+ List *next = iter->next;
+ free((MergingEvent*)iter->value);
+ iter = next;
+ }
+ freeListFlat(dropset->ownPrimeE);
+
+ free(dropset);
+}
+
+
+#ifdef PRINT_VERY_VERBOSE
+void printMergingEventStruct(MergingEvent **mergingEvents, int mxtips)
+{
+ int i;
+ MergingEvent
+ *iter_merge;
+ IndexList *iter_index;
+ PR("MERGING EVENT STRUCT:\n");
+
+ FOR_0_LIMIT(i,mxtips)
+ {
+ PR("%d:\t",i);
+ iter_merge = mergingEvents[i];
+ FOR_LIST(iter_merge)
+ {
+ PR("{");
+ iter_index = iter_merge->mergingBipartitions;
+ FOR_LIST(iter_index)
+ {
+ PR("%d,", iter_index->index);
+ }
+ PR(" : %d} ", (iter_merge->supportGained - iter_merge->supportLost));
+ }
+ PR( "\n");
+ }
+ PR("\n");
+}
+#endif
+
+unsigned int dropsetHashValue(HashTable *hashTable, void *value)
+{
+ unsigned int
+ result = 0;
+
+ Dropset *dropset = (Dropset*)value;
+ IndexList *iter = dropset->taxaToDrop;
+
+ FOR_LIST(iter)
+ {
+ assert(iter->index < mxtips);
+ result ^= randForTaxa[ iter->index ];
+ }
+
+ return result;
+}
+
+
+boolean dropsetEqual(HashTable *hashtable, void *entryA, void *entryB)
+{
+ IndexList *aList = ((Dropset*)entryA)->taxaToDrop,
+ *bList = ((Dropset*)entryB)->taxaToDrop;
+
+ return indexListEqual(aList, bList);
+}
+
+
+IndexList *convertBitVectorToIndexList(BitVector *bv)
+{
+ int i;
+ IndexList
+ *result = NULL;
+
+ FOR_0_LIMIT(i,mxtips)
+ if(NTH_BIT_IS_SET(bv, i))
+ APPEND_INT(i, result);
+
+ return result;
+}
+
+
+boolean elementsEqual(ProfileElem *elemA,ProfileElem *elemB, int a, int b)
+{
+ return (elemA->id == a && elemB->id == b )
+ || (elemB->id == a && elemA->id == b) ;
+}
+
+
+IndexList *getDropset(ProfileElem *elemA, ProfileElem *elemB, boolean complement, BitVector *neglectThose)
+{
+ int i,j,
+ numBit = 0,
+ localBitCount,
+ differenceByte;
+
+ IndexList
+ *result = NULL;
+
+ if(elemA == elemB)
+ return NULL;
+
+ FOR_0_LIMIT(i,bitVectorLength)
+ {
+ if( complement)
+ differenceByte = ~ ((elemA->bitVector[i] ^ elemB->bitVector[i]) | ( droppedTaxa[i] | paddingBits[i] ));
+ else
+ differenceByte = elemA->bitVector[i] ^ elemB->bitVector[i];
+
+ localBitCount = BIT_COUNT(differenceByte);
+
+ if( (numBit += localBitCount) > maxDropsetSize)
+ {
+ freeIndexList(result);
+ return NULL;
+ }
+
+ if( NOT localBitCount)
+ continue;
+
+ FOR_0_LIMIT(j,MASK_LENGTH)
+ {
+ if(NOT localBitCount)
+ break;
+
+ if(NTH_BIT_IS_SET_IN_INT(differenceByte,j))
+ {
+ APPEND_INT((( i * MASK_LENGTH) + j),result);
+ localBitCount--;
+
+ if(NOT NTH_BIT_IS_SET(neglectThose, (i*MASK_LENGTH + j)))
+ {
+ freeIndexList(result);
+ return NULL;
+ }
+ }
+ }
+ }
+
+ assert(numBit);
+
+ return result;
+}
+
diff --git a/Dropset.h b/Dropset.h
new file mode 100644
index 0000000..7a1033c
--- /dev/null
+++ b/Dropset.h
@@ -0,0 +1,80 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#ifndef DROPSET_H
+#define DROPSET_H
+
+#include "common.h"
+#include "List.h"
+#include "ProfileElem.h"
+#include "HashTable.h"
+
+typedef union _mergeBips
+{
+ int pair[2];
+ IndexList *many;
+} MergingBipartitions;
+
+typedef struct _mergingEvent
+{
+ MergingBipartitions mergingBipartitions;
+ boolean isComplex;
+ int supportLost;
+ int supportGained;
+ boolean computed;
+} MergingEvent;
+
+
+typedef struct dropset
+{
+ IndexList *taxaToDrop;
+ int improvement;
+
+ List *ownPrimeE;
+ List *acquiredPrimeE;
+ List *complexEvents;
+} Dropset;
+
+
+boolean dropsetEqual(HashTable *hashtable, void *entryA, void *entryB);
+unsigned int dropsetHashValue(HashTable *hashTable, void *value);
+void removeMergingEvent(Dropset *dropset, List *toBeRemoved);
+List *freeMergingEventReturnNext(List *elem);
+void removeDropsetAndRelated(HashTable *mergingHash, Dropset *dropset);
+void addEventToDropsetPrime(Dropset *dropset, int a, int b);
+List *addEventToDropsetCombining(List *complexEvents, MergingBipartitions primeEvent);
+void freeDropsetDeepInHash(void *value);
+void freeDropsetDeepInEnd(void *value);
+void addEventToDropsetForCombining(Dropset *dropset, IndexList *mergingBips);
+void initializeRandForTaxa(int mxtips);
+void freeDropsetDeep(void *values, boolean freeCombinedM);
+IndexList *getDropset(ProfileElem *elemA, ProfileElem *elemB, boolean complement, BitVector *neglectThose);
+#endif
diff --git a/HashTable.c b/HashTable.c
new file mode 100644
index 0000000..ffa89cb
--- /dev/null
+++ b/HashTable.c
@@ -0,0 +1,331 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "HashTable.h"
+#ifdef PARALLEL
+#include <pthread.h>
+#endif
+
+HashTable *createHashTable(unsigned int size,
+ void *commonAttr,
+ unsigned int (*hashFunction)(HashTable *hash_table, void *value),
+ boolean (*equalFunction)(HashTable *hash_table, void *entryA, void *entryB))
+{
+ static const unsigned int
+ initTable[] = {64, 128, 256, 512, 1024, 2048, 4096,
+ 8192, 16384, 32768, 65536, 131072,
+ 262144, 524288, 1048576, 2097152,
+ 4194304, 8388608, 16777216, 33554432,
+ 67108864, 134217728, 268435456,
+ 536870912, 1073741824, 2147483648U};
+
+ HashTable
+ *hashTable = CALLOC(1, sizeof(HashTable));
+
+ unsigned int
+ tableSize,
+ i,
+#ifdef DEBUG
+ maxSize = (unsigned int)-1,
+#endif
+ primeTableLength = sizeof(initTable)/sizeof(initTable[0]);
+
+ hashTable->hashFunction = hashFunction;
+ hashTable->equalFunction = equalFunction;
+ hashTable->commonAttributes = commonAttr;
+
+#ifdef DEBUG
+ assert(size <= maxSize);
+#endif
+ for(i = 0; initTable[i] < size && i < primeTableLength; ++i);
+ assert(i < primeTableLength);
+
+ tableSize = initTable[i];
+
+#ifdef PARALLEL
+ hashTable->cntLock = CALLOC(1,sizeof(pthread_mutex_t));
+ pthread_mutex_init(hashTable->cntLock, (pthread_mutexattr_t *)NULL);
+
+ hashTable->lockPerSlot = (pthread_mutex_t**)CALLOC(tableSize,sizeof(pthread_mutex_t*));
+ FOR_0_LIMIT(i, tableSize)
+ {
+ hashTable->lockPerSlot[i] = (pthread_mutex_t*) CALLOC(1,sizeof(pthread_mutex_t));
+ pthread_mutex_init(hashTable->lockPerSlot[i], (pthread_mutexattr_t *)NULL);
+ }
+#endif
+
+ hashTable->table = CALLOC(tableSize, sizeof(HashElem*));
+ hashTable->tableSize = tableSize;
+ hashTable->entryCount = 0;
+
+ return hashTable;
+}
+
+
+/* remove without destroying value of hashelem */
+boolean removeElementFromHash(HashTable *hashtable, void *value)
+{
+ unsigned int
+ hashValue = hashtable->hashFunction(hashtable, value),
+ position = hashValue % hashtable->tableSize;
+
+ HashElem *elem = hashtable->table[position];
+
+ if( NOT elem )
+ {
+ assert(0);
+ return FALSE;
+ }
+
+ if(
+ elem->fullKey == hashValue &&
+ hashtable->equalFunction(hashtable, elem->value, value))
+ {
+ hashtable->table[position] = elem->next ;
+ free(elem);
+ hashtable->entryCount-- ;
+
+ return TRUE;
+ }
+
+ while(elem->next)
+ {
+ if(elem->next->fullKey == hashValue && hashtable->equalFunction(hashtable, elem->next->value, value))
+ {
+ void *nextOne = elem->next->next;
+ free(elem->next);
+ elem->next = nextOne;
+ hashtable->entryCount-- ;
+ return TRUE;
+ }
+ elem = elem->next;
+ }
+
+ return FALSE;
+}
+
+
+void *searchHashTableWithInt(HashTable *hashtable, unsigned int hashValue)
+{
+ unsigned int
+ position = hashValue % hashtable->tableSize;
+
+ HashElem
+ *elem;
+
+ for(elem = hashtable->table[position];
+ elem;
+ elem = elem->next)
+ if(elem->fullKey == hashValue)
+ return elem->value;
+
+ return NULL;
+}
+
+
+void *searchHashTable(HashTable *hashtable, void *value, unsigned int hashValue)
+{
+ unsigned int
+ position = hashValue % hashtable->tableSize;
+
+ HashElem
+ *elem;
+
+ for(elem = hashtable->table[position];
+ elem;
+ elem = elem->next)
+ if(elem->fullKey == hashValue &&
+ hashtable->equalFunction(hashtable, elem->value, value))
+ return elem->value;
+
+ return NULL;
+}
+
+
+void insertIntoHashTable(HashTable *hashTable, void *value, unsigned int index)
+{
+ /* just copied this */
+ HashElem
+ *hashElem = CALLOC(1, sizeof(HashElem));
+
+ hashElem->fullKey = index;
+
+ index = hashElem->fullKey % hashTable->tableSize;
+
+ hashElem->value = value;
+ hashElem->next = hashTable->table[index];
+ hashTable->table[index] = hashElem;
+#ifdef PARALLEL
+ pthread_mutex_lock(hashTable->cntLock);
+ hashTable->entryCount++;
+ pthread_mutex_unlock(hashTable->cntLock);
+#else
+ hashTable->entryCount++;
+#endif
+}
+
+
+void destroyHashTable(HashTable *hashTable, void (*freeValue)(void *value))
+{
+ unsigned
+ int i;
+
+ HashElem
+ *elemA,
+ *elemB,
+ **table = hashTable->table;
+
+ for(i = 0; i < hashTable->tableSize; ++i)
+ {
+ elemA = table[i];
+ while(elemA != NULL)
+ {
+ elemB = elemA;
+ elemA = elemA->next;
+ if(freeValue)
+ freeValue(elemB->value);
+ free(elemB);
+ }
+ }
+
+#ifdef PARALLEL
+ pthread_mutex_destroy(hashTable->cntLock);
+ free(hashTable->cntLock);
+
+ FOR_0_LIMIT(i,hashTable->tableSize)
+ {
+ pthread_mutex_destroy(hashTable->lockPerSlot[i]);
+ free(hashTable->lockPerSlot[i]);
+ }
+ free(hashTable->lockPerSlot);
+#endif
+
+ free(hashTable->commonAttributes);
+ free(hashTable->table);
+ free(hashTable);
+}
+
+
+void updateEntryCount(HashTable *hashTable)
+{
+ unsigned int
+ i,
+ result = 0;
+
+ for(i = 0; i < hashTable->tableSize; ++i)
+ {
+ HashElem
+ *elem = ((HashElem**)hashTable->table)[i];
+
+ while(elem)
+ {
+ result++;
+ elem = elem->next;
+ }
+ }
+
+ hashTable->entryCount = result;
+}
+
+
+HashTableIterator *createHashTableIterator(HashTable *hashTable)
+{
+ unsigned
+ int i;
+
+ HashTableIterator
+ *hashTableIterator = CALLOC(1, sizeof(HashTableIterator));
+
+ hashTableIterator->hashTable = hashTable;
+ hashTableIterator->hashElem = NULL;
+ hashTableIterator->index = hashTable->tableSize;
+
+ if( NOT hashTable->entryCount)
+ return hashTableIterator;
+
+ FOR_0_LIMIT(i,hashTable->tableSize)
+ {
+ if(hashTable->table[i])
+ {
+ hashTableIterator->hashElem = hashTable->table[i];
+ hashTableIterator->index = i;
+ break;
+ }
+ }
+
+ return hashTableIterator;
+}
+
+boolean hashTableIteratorNext(HashTableIterator *hashTableIterator)
+{
+ unsigned int
+ i,
+ tableSize = hashTableIterator->hashTable->tableSize;
+
+ HashElem
+ *next = hashTableIterator->hashElem->next;
+
+ if(next)
+ {
+ hashTableIterator->hashElem = next;
+ return TRUE;
+ }
+
+ i = hashTableIterator->index + 1;
+
+ if(i >= tableSize)
+ {
+ hashTableIterator->index = i;
+ return FALSE;
+ }
+
+ while( NOT (next = hashTableIterator->hashTable->table[i]))
+ {
+ if( ++i >= tableSize)
+ {
+ hashTableIterator->index = i;
+ return FALSE;
+ }
+ }
+
+ hashTableIterator->index = i;
+ hashTableIterator->hashElem = next;
+
+ return next != NULL ;
+}
+
+
+void *getCurrentValueFromHashTableIterator(HashTableIterator *hashTableIterator)
+{
+ return ((hashTableIterator->hashElem)
+ ? hashTableIterator->hashElem->value
+ : NULL);
+}
+
diff --git a/HashTable.h b/HashTable.h
new file mode 100644
index 0000000..8d28b7b
--- /dev/null
+++ b/HashTable.h
@@ -0,0 +1,82 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef HASHTABLE_H
+#define HASHTABLE_H
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+
+typedef struct hash_el
+{
+ unsigned int fullKey;
+ void *value;
+ struct hash_el *next;
+} HashElem;
+
+typedef struct hash_table
+{
+ unsigned int tableSize;
+ unsigned int entryCount;
+ void *commonAttributes;
+ unsigned int (*hashFunction)(struct hash_table *h, void *value);
+ boolean (*equalFunction)(struct hash_table *hashtable, void *entrA, void *entryB);
+ HashElem **table;
+#ifdef PARALLEL
+ pthread_mutex_t **lockPerSlot;
+ pthread_mutex_t *cntLock;
+#endif
+} HashTable;
+
+typedef struct
+{
+ HashTable *hashTable;
+ HashElem *hashElem;
+ unsigned int index;
+} HashTableIterator;
+
+#define FOR_HASH(htIter, hashtable) boolean hasNext = TRUE; for(htIter = createHashTableIterator(hashtable); htIter && hasNext ; hasNext = hashTableIteratorNext(htIter))
+#define FOR_HASH_2(htIter, hashtable) hasNext = TRUE; for(htIter = createHashTableIterator(hashtable); hasNext ; hasNext = hashTableIteratorNext(htIter))
+
+HashTable *createHashTable(unsigned int size, void *commonAttr, unsigned int (*hashFunction)(HashTable *hash_table, void *value), boolean (*equalFunction)(HashTable *hash_table, void *entryA, void *entryB));
+HashTableIterator *createHashTableIterator(HashTable *hashTable);
+void *getCurrentValueFromHashTableIterator(HashTableIterator *hashTableIterator);
+boolean hashTableIteratorNext(HashTableIterator *hashTableIterator);
+void *searchHashTableWithInt(HashTable *hashtable, unsigned int hashValue);
+void *searchHashTable(HashTable *hashtable, void *value, unsigned int hashValue);
+void insertIntoHashTable(HashTable *hashTable, void *value, unsigned int index);
+boolean removeElementFromHash(HashTable *hashtable, void *value);
+void destroyHashTable(HashTable *hashTable, void (*freeValue)(void *value));
+
+#endif
diff --git a/List.c b/List.c
new file mode 100644
index 0000000..9805994
--- /dev/null
+++ b/List.c
@@ -0,0 +1,490 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "List.h"
+
+
+
+boolean isInIndexList(int index, IndexList *list)
+{
+ IndexList *iter = list;
+ FOR_LIST(iter)
+ if(iter->index == index)
+ return TRUE;
+
+ return FALSE;
+}
+
+
+int isInIndexListSpecial(int a, int b, IndexList *list)
+{
+ FOR_LIST(list)
+ {
+ if(list->index == a)
+ return 1 ;
+ if(list->index == b)
+ return 2;
+ }
+
+ return 0;
+}
+
+
+
+IndexList *setMinusOf(IndexList *list, IndexList *subtract)
+{
+ IndexList
+ *iter = list,
+ *result = NULL;
+
+ FOR_LIST(iter)
+ {
+ IndexList
+ *iter2 = subtract;
+
+ boolean
+ isThere = FALSE;
+
+ FOR_LIST(iter2)
+ isThere |= iter->index == iter2->index;
+
+ if(NOT isThere)
+ APPEND_INT(iter->index, result);
+ }
+ freeIndexList(list);
+
+ return result;
+}
+
+
+#ifdef NEW_SUBSET
+boolean isSubsetOfReverseOrdered(IndexList *subset, IndexList *set)
+{
+ IndexList
+ *subsetIter = subset;
+ boolean found = FALSE;
+
+ FOR_LIST(subsetIter)
+ {
+ IndexList
+ *setIter = set;
+ int ind = subsetIter->index;
+ FOR_LIST(setIter)
+ {
+ if(ind > setIter->index)
+ return FALSE;
+ else if((found = (ind == setIter->index)))
+ break;
+ }
+
+ if(NOT found)
+ return FALSE;
+ }
+
+ return TRUE;
+}
+#else
+boolean isSubsetOf(IndexList *subset, IndexList *set)
+{
+ boolean result = TRUE;
+
+ FOR_LIST(subset)
+ {
+ result &= isInIndexList(subset->index, set);
+ if( NOT result)
+ return result;
+ }
+
+ return result;
+}
+#endif
+
+
+IndexList *appendToIndexListIfNotThere(int elem, IndexList *list)
+{
+ IndexList *iter = list;
+
+ boolean contains = FALSE;
+ FOR_LIST(iter)
+ {
+ contains |= elem == iter->index;
+ if(contains)
+ break;
+ }
+
+ if( NOT contains)
+ return appendToIndexList(elem, list);
+ else
+ return list;
+}
+
+
+
+IndexList *appendToIndexListIfNotThere2(int elem, IndexList *list)
+{
+ IndexList *iter = list;
+
+ boolean contains = FALSE;
+ FOR_LIST(iter)
+ {
+ contains |= elem == iter->index;
+ if(contains)
+ break;
+ }
+
+ if( NOT contains)
+ {
+ PR("FAIL: it was not there\n");
+ return appendToIndexList(elem, list);
+ }
+ else
+ return list;
+}
+
+
+
+boolean haveIntersection2(IndexList *listA, IndexList *listB)
+{
+ FOR_LIST(listA)
+ {
+ IndexList *iter = listB;
+ FOR_LIST(iter)
+ if(listA->index == iter->index)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+boolean haveIntersection(IndexList *listA, IndexList *listB)
+{
+ FOR_LIST(listA)
+ {
+ IndexList *iter = listB;
+ FOR_LIST(iter)
+ if(listA->index == iter->index)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+
+void freeIndexList(IndexList *list)
+{
+ IndexList
+ *iter;
+
+ for(iter = list; iter;)
+ {
+ list = iter->next;
+ free(iter);
+ iter = list;
+ }
+}
+
+
+boolean indexListContainsIndexListUnordered(IndexList *list, IndexList *subList)
+{
+ FOR_LIST(subList)
+ {
+ boolean contained = FALSE;
+ IndexList
+ *iter = list;
+ FOR_LIST(iter)
+ if(iter->index == subList->index)
+ {
+ contained = TRUE;
+ break;
+ }
+
+ if( NOT contained)
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+
+List *joinLists(List* a, List*b)
+{
+ List *iter;
+
+ if( NOT a)
+ return b;
+
+ for(iter = a; iter ; iter = iter->next)
+ {
+ if( NOT iter->next)
+ {
+ iter->next = b;
+ return a;
+ }
+ }
+
+ assert(0);
+ return NULL;
+}
+
+
+int getMax_indexList(IndexList *list)
+{
+ IndexList
+ *iter = list;
+ int
+ result = 0;
+
+ FOR_LIST(iter)
+ if(result < iter->index)
+ result = iter->index;
+
+ return result;
+}
+
+int lengthIndexList(IndexList *list)
+{
+ IndexList
+ *iter = list;
+
+ int result = 0;
+ FOR_LIST(iter)
+ result++;
+
+ return result;
+}
+
+
+void printIndexList(IndexList *list)
+{
+ PR(">");
+ FOR_LIST(list)
+ PR("%d,", list->index);
+ PR("<");
+}
+
+void printIndexListToFile(FILE *file, IndexList *list)
+{
+ boolean isFirst = TRUE;
+ FOR_LIST(list)
+ if(isFirst)
+ {
+ fprintf(file, "%d", list->index);
+ isFirst = FALSE;
+ }
+ else
+ fprintf(file, ",%d", list->index);
+}
+
+
+IndexList *findFirstCommonElem(IndexList *listA, IndexList *listB)
+{
+ IndexList
+ *iterA = listA,
+ *iterB = listB;
+
+ FOR_LIST(iterA)
+ {
+ iterB = listB;
+ FOR_LIST(iterB)
+ if(iterA->index == iterB->index)
+ return iterA;
+ }
+
+ return NULL;
+}
+
+
+IndexList *concatenateIndexList(IndexList *listA, IndexList *listB)
+{
+ IndexList
+ *iterA = listA;
+
+ if( NOT iterA)
+ return listB;
+
+ FOR_LIST(iterA)
+ if( NOT iterA->next)
+ {
+ iterA->next = listB;
+ return listA;
+ }
+
+ return NULL;
+}
+
+
+
+IndexList *doubleAppendToIndexList(int valueA, int valueB, IndexList *list)
+{
+ IndexList
+ *listElemB = CALLOC(1, sizeof(IndexList)),
+ *listElemA = CALLOC(1, sizeof(IndexList));
+
+ listElemA->index = valueA;
+ listElemB->index = valueB;
+
+ listElemA->next = list;
+ listElemB->next = listElemA;
+
+ return listElemB;
+}
+
+IndexList* appendToIndexList(int value, IndexList *list)
+{
+ IndexList
+ *listElem = CALLOC(1, sizeof(IndexList));
+
+ listElem->index = value;
+ listElem->next = list;
+
+ return listElem;
+}
+
+
+boolean indexListEqual(IndexList *aList, IndexList *bList)
+{
+ IndexList *iterA = aList;
+
+ FOR_LIST(iterA)
+ {
+ boolean indexFound = FALSE;
+ IndexList *iterB = bList;
+ FOR_LIST(iterB)
+ {
+ indexFound = iterA->index == iterB->index;
+ if(indexFound)
+ break;
+ }
+
+ if( NOT indexFound)
+ return FALSE;
+ }
+
+ return lengthIndexList(aList) == lengthIndexList(bList);
+}
+
+
+int lengthOfList(List *list)
+{
+ int length = 0 ;
+ List
+ *iter = list;
+ FOR_LIST(iter)
+ length++;
+ return length;
+}
+
+
+List* appendToList(void *value, List *list)
+{
+ List
+ *listElem = CALLOC(1, sizeof(List));
+
+ listElem->value = value;
+ listElem->next = list;
+
+ return listElem;
+}
+
+List *concatenateLists(List *listA, List *listB)
+{
+ List
+ *iter = listA ;
+
+ if( NOT iter)
+ return listB;
+
+ FOR_LIST(iter)
+ {
+ if( NOT iter->next)
+ {
+ iter->next = listB;
+ return listA;
+ }
+ }
+
+ return NULL;
+}
+
+
+void freeList(List* list)
+{
+ List
+ *iter;
+ for(iter = list; iter;)
+ {
+ list = iter->next;
+ free(iter->value);
+ free(iter);
+ iter = list;
+ }
+}
+
+
+void freeListFlat(List* list)
+{
+ List
+ *iter;
+ for(iter = list; iter;)
+ {
+ list = iter->next;
+ free(iter);
+ iter = list;
+ }
+}
+
+
+boolean elemIsInIndexList(int index, IndexList *list)
+{
+ FOR_LIST(list)
+ {
+ if(index == list->index)
+ return TRUE;
+ }
+ return FALSE;
+}
+
+
+IndexList *joinIndexListsNonRedundant(IndexList *listA, IndexList *listB)
+{
+ IndexList *result = NULL;
+
+ FOR_LIST(listA)
+ APPEND_INT(listA->index, result);
+ freeIndexList(listA);
+
+ FOR_LIST(listB)
+ if( NOT elemIsInIndexList(listB->index, result))
+ APPEND_INT(listB->index, result);
+
+ freeIndexList(listB);
+
+ return result;
+}
diff --git a/List.h b/List.h
new file mode 100644
index 0000000..ef9770e
--- /dev/null
+++ b/List.h
@@ -0,0 +1,86 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef LIST_H
+#define LIST_H
+
+#include <stdlib.h>
+#include "common.h"
+
+
+typedef struct List_{
+ void *value;
+ struct List_ *next;
+} List;
+
+typedef struct _indexList
+{
+ struct _indexList *next;
+ int index;
+} IndexList;
+
+
+void freeList(List* list);
+IndexList *doubleAppendToIndexList(int valueA, int valueB, IndexList *list);
+boolean isSubsetOf(IndexList *subset, IndexList *set);
+List* appendToList(void *value, List *list);
+IndexList *appendToIndexListIfNotThere(int elem, IndexList *list);
+IndexList *appendToIndexListIfNotThere2(int elem, IndexList *list);
+void freeIndexList(IndexList *list);
+IndexList* appendToIndexList(int value, IndexList *list) ;
+boolean indexListEqual(IndexList *aList, IndexList *bList);
+boolean isInIndexList(int index, IndexList *list);
+int isInIndexListSpecial(int a, int b, IndexList *list);
+void freeIndexList(IndexList *list);
+IndexList* appendToIndexList(int value, IndexList *list) ;
+void freeIndexList(IndexList *list);
+IndexList *concatenateIndexList(IndexList *listA, IndexList *listB) ;
+List *concatenateLists(List *listA, List *listB);
+void printIndexList(IndexList *list);
+IndexList *findFirstCommonElem(IndexList *listA, IndexList *listB);
+boolean elemIsInIndexList(int index, IndexList *list);
+List *joinLists(List* a, List*b) ;
+IndexList *joinIndexListsNonRedundant(IndexList *listA, IndexList *listB);
+void freeListFlat(List* list);
+boolean indexListContainsIndexListUnordered(IndexList *list, IndexList *subList);
+boolean haveIntersection(IndexList *listA, IndexList *listB);
+int length_indexList(IndexList *list);
+IndexList *setMinusOf(IndexList *list, IndexList *subtract);
+int lengthIndexList(IndexList *list);
+void printIndexListToFile(FILE *file, IndexList *list);
+int lengthOfList(List *list);
+boolean isSubsetOfReverseOrdered(IndexList *subset, IndexList *set);
+
+#define FOR_LIST(iter) for(;iter;iter = iter->next)
+#define APPEND(elem,list) list = appendToList(elem,list)
+#define APPEND_INT(elem,list) list = appendToIndexList(elem,list)
+#define CONCAT_LISTS(lista,listb) lista = concatenateLists(lista,listb)
+
+#endif
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cbb94e3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,57 @@
+CC = gcc
+
+CFLAGS = -Wall -D_GNU_SOURCE # -DNDEBUG
+LFLAGS = -lm
+
+ifeq ($(mode), debug)
+ CFLAGS += -g
+else
+ CFLAGS += -O3 # -march=native
+ifeq ($(mode), profile)
+ CFLAGS += -pg -g
+endif
+endif
+ifeq ($(mode), parallel)
+CFLAGS += -DPARALLEL -DPORTABLE_PTHREADS
+LFLAGS += -pthread
+endif
+ifeq ($(mode), parallelDebug)
+CFLAGS += -DPARALLEL -g
+LFLAGS += -pthread
+endif
+
+RM = rm -fr
+
+TARGETS = RogueNaRok rnr-prune rnr-lsi rnr-tii rnr-mast
+
+all : $(TARGETS)
+
+rnr-objs = common.o RogueNaRok.o Tree.o BitVector.o HashTable.o List.o Array.o Dropset.o ProfileElem.o legacy.o newFunctions.o parallel.o Node.o
+lsi-objs = rnr-lsi.o common.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o List.o
+tii-objs = rnr-tii.o common.o BitVector.o Tree.o HashTable.o List.o legacy.o newFunctions.o
+mast-objs = rnr-mast.o common.o List.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o
+prune-objs = rnr-prune.o common.o Tree.o BitVector.o HashTable.o legacy.o newFunctions.o List.o
+
+rnr-lsi: $(lsi-objs)
+ $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS)
+rnr-tii: $(tii-objs)
+ $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS)
+rnr-mast: $(mast-objs)
+ $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS)
+rnr-prune: $(prune-objs)
+ $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS)
+
+ifeq ($(mode),parallel)
+RogueNaRok: $(rnr-objs)
+ $(CC) $(LFLAGS) -o $@-parallel $^ $(CFLAGS)
+else
+RogueNaRok: $(rnr-objs)
+ $(CC) $(LFLAGS) -o $@ $^ $(CFLAGS)
+endif
+
+%.o : %.c $(DEPS)
+ $(CC) -c -o $@ $< $(CFLAGS)
+
+clean :
+ $(RM) $(rnr-objs) $(lsi-objs) $(tii-objs) $(mast-objs) $(prune-objs) $(TARGETS) $(TESTS) $(rnr-test-objs) RogueNaRok-parallel
+
diff --git a/Node.c b/Node.c
new file mode 100644
index 0000000..8ce8675
--- /dev/null
+++ b/Node.c
@@ -0,0 +1,77 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "Node.h"
+
+
+IndexList *findAnIndependentComponent(HashTable *allNodes, Node *thisNode)
+{
+ if(thisNode->visited)
+ return NULL;
+
+ IndexList *iter = thisNode->edges;
+ thisNode->visited = TRUE;
+ IndexList *result = NULL;
+ APPEND_INT(thisNode->id, result);
+
+ FOR_LIST(iter)
+ {
+ Node *found = searchHashTableWithInt(allNodes, iter->index);
+
+ if( NOT found->visited)
+ {
+ IndexList *list = findAnIndependentComponent(allNodes, found);
+ result = concatenateIndexList(list, result);
+ }
+ }
+
+ return result;
+}
+
+
+void freeNode(void *value)
+{
+ Node *node = value;
+ freeIndexList(node->edges);
+ free(node);
+}
+
+
+
+boolean nodeEqual(HashTable *hashTable, void *entryA, void *entryB)
+{
+ return ((Node*)entryA)->id == ((Node*)entryB)->id;
+}
+
+
+unsigned int nodeHashValue(HashTable *hashTable, void *value)
+{
+ return ((Node*)value)->id;
+}
diff --git a/Node.h b/Node.h
new file mode 100644
index 0000000..19c8522
--- /dev/null
+++ b/Node.h
@@ -0,0 +1,51 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef NODE_H
+#define NODE_H
+
+#include "common.h"
+#include "List.h"
+#include "HashTable.h"
+
+typedef struct _node
+{
+ int id;
+ boolean visited;
+ IndexList *edges;
+} Node;
+
+boolean nodeEqual(HashTable *hashTable, void *entryA, void *entryB);
+unsigned int nodeHashValue(HashTable *hashTable, void *value);
+void freeNode(void *value);
+IndexList *findAnIndependentComponent(HashTable *allNodes, Node *thisNode);
+
+
+#endif
diff --git a/ProfileElem.c b/ProfileElem.c
new file mode 100644
index 0000000..a6c4522
--- /dev/null
+++ b/ProfileElem.c
@@ -0,0 +1,184 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "ProfileElem.h"
+
+
+CLONE_ARRAY_FLAT(cloneProfileArrayFlat, ProfileElem*, ProfileElemAttr*)
+
+
+void addElemToArray(ProfileElem *elem, Array *array)
+{
+ GET_PROFILE_ELEM(array, array->length) = elem;
+ array->length++;
+}
+
+
+void freeProfileElem(ProfileElem *elem)
+{
+ free(elem->treeVector);
+ free(elem->bitVector);
+ free(elem);
+}
+
+Array* profileToArray(HashTable *profile, boolean updateFrequencyCount, boolean assignIds)
+{
+ HashTableIterator*
+ hashTableIterator = createHashTableIterator(profile);
+
+ Array
+ *result = CALLOC(1, sizeof(Array));
+
+ unsigned int
+ count = 0;
+
+ /* remember to always copy s.t. free() runs w/o problems */
+
+ ProfileElemAttr
+ *profileElemAttr;
+
+ result->commonAttributes = CALLOC(1, sizeof(ProfileElemAttr));
+ result->commonAttributes = memcpy(result->commonAttributes, profile->commonAttributes, sizeof(ProfileElemAttr));
+ profileElemAttr = result->commonAttributes;
+
+ result->length = profile->entryCount;
+ result->arrayTable = CALLOC(profile->entryCount, sizeof(ProfileElem*));
+
+ if( NOT hashTableIterator)
+ return result;
+
+ do
+ {
+ ProfileElem *profileElem = getCurrentValueFromHashTableIterator(hashTableIterator);
+
+ assert(profileElem);
+
+ if(updateFrequencyCount)
+ profileElem->treeVectorSupport = genericBitCount(profileElem->treeVector, profileElemAttr->treeVectorLength);
+
+ if(assignIds)
+ profileElem->id = count;
+
+ ((ProfileElem**)result->arrayTable)[count] = profileElem;
+ assert(profileElem->bitVector && profileElem->treeVector);
+ count++;
+ }
+ while(hashTableIteratorNext(hashTableIterator));
+
+ assert(count == profile->entryCount);
+
+ free(hashTableIterator);
+
+ return result;
+}
+
+int sortBipProfile(const void *a, const void *b)
+{
+ if(*((ProfileElem**)a) == NULL)
+ return 1;
+ if(*((ProfileElem**)b) == NULL)
+ return -1;
+
+ int
+ as = (*(ProfileElem**)a)->numberOfBitsSet,
+ bs = (*(ProfileElem**)b)->numberOfBitsSet;
+
+ if(as == bs)
+ return 0;
+
+ return as < bs ? -1 : 1;
+}
+
+
+int sortById(const void *a, const void *b)
+{
+ int
+ as = (*(ProfileElem**)a)->id,
+ bs = (*(ProfileElem**)b)->id;
+
+ if(as == bs)
+ return 0;
+
+ return as < bs ? -1 : 1;
+}
+
+
+int sortBySupport(const void *a, const void *b)
+{
+ unsigned int
+ as = (*(ProfileElem**)a)->treeVectorSupport,
+ bs = (*(ProfileElem**)b)->treeVectorSupport;
+
+ if(as == bs)
+ return 0;
+ return as > bs ? -1 : 1;
+}
+
+
+/* what is the index in the (ordered) profile of the first element to
+ have at least i bits set? */
+int *createNumBitIndex(Array *bipartitionProfile, int mxtips)
+{
+ int *result = CALLOC(mxtips, sizeof(int));
+ memset(result, -1, mxtips * sizeof(int));
+ qsort(bipartitionProfile->arrayTable, bipartitionProfile->length, sizeof(ProfileElem**), sortBipProfile);
+
+ int
+ i,
+ max = 0,
+ current = 0;
+
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile, i);
+
+ if(NOT elem )
+ break;
+
+ if(elem->numberOfBitsSet != current)
+ {
+ current = elem->numberOfBitsSet;
+ result[current] = i;
+ max = i;
+ }
+ }
+
+ current = max;
+ for(i = mxtips-1; i >= 0; --i)
+ {
+ if(result[i] == -1)
+ result[i] = current;
+ else
+ current = result[i];
+ }
+
+ return result;
+}
diff --git a/ProfileElem.h b/ProfileElem.h
new file mode 100644
index 0000000..42d8efd
--- /dev/null
+++ b/ProfileElem.h
@@ -0,0 +1,72 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef PROFILEELEM_H
+#define PROFILEELEM_H
+
+#include <string.h>
+#include <assert.h>
+
+#include "Array.h"
+#include "HashTable.h"
+#include "common.h"
+#include "BitVector.h"
+
+
+typedef struct
+{
+ BitVector bitVectorLength;
+ BitVector treeVectorLength;
+ BitVector *randForTaxa; /* random numbers to hash the vectors */
+ BitVector lastByte; /* the padding bits */
+} ProfileElemAttr;
+
+
+typedef struct profile_elem
+{
+ BitVector *bitVector;
+ BitVector *treeVector;
+ int treeVectorSupport;
+ boolean isInMLTree;
+ BitVector id;
+ int numberOfBitsSet;
+} ProfileElem;
+
+#define GET_PROFILE_ELEM(array,index) (((ProfileElem**)array->arrayTable)[(index)])
+#define GET_DROPSET_ELEM(array,index) (((Dropset**)array->arrayTable)[(index)])
+
+int *createNumBitIndex(Array *bipartitionProfile, int mxtips);
+int sortById(const void *a, const void *b);
+int sortBySupport(const void *a, const void *b);
+int sortBipProfile(const void *a, const void *b);
+Array *cloneProfileArrayFlat(const Array *array);
+void addElemToArray(ProfileElem *elem, Array *array);
+void freeProfileElem(ProfileElem *elem);
+#endif
diff --git a/README b/README
new file mode 100644
index 0000000..3635648
--- /dev/null
+++ b/README
@@ -0,0 +1,14 @@
+RogueNaRok is an algorithm for the identification of rogue taxa in a tree set.
+
+How do I install it?
+
+Download the code and run the "make" command.
+
+For a parallel version of the RogueNaRok algorithm, use "make
+mode=parallel". Note that, the parallel version requires the
+pthreads-library. Running one of the programs without arguments will
+trigger the help message. Just follow the instructions.
+
+More info is available on
+https://github.com/aberer/RogueNaRok/wiki/RogueNaRok . Also, use this
+site for reporting bugs, ask questions of how to employ RogueNaRok.
diff --git a/RogueNaRok.c b/RogueNaRok.c
new file mode 100644
index 0000000..d3b24ea
--- /dev/null
+++ b/RogueNaRok.c
@@ -0,0 +1,2289 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "Tree.h"
+#include "sharedVariables.h"
+#include "Dropset.h"
+#include "legacy.h"
+#include "newFunctions.h"
+#include "Node.h"
+
+#ifdef PARALLEL
+#include "parallel.h"
+#include <pthread.h>
+#endif
+
+#define PROG_NAME "RogueNaRok"
+#define PROG_VERSION "1.0"
+#define PROG_RELEASE_DATE "2011-10-25"
+
+/* #define PRINT_VERY_VERBOSE */
+/* #define MYDEBUG */
+
+#define PRINT_DROPSETS
+#define PRINT_TIME
+
+/* try to produce minimal dropsets */
+/* #define MIN_DROPSETS */
+
+#define VANILLA_CONSENSUS_OPT 0
+#define ML_TREE_OPT 1
+#define MRE_CONSENSUS_OPT 2
+
+#define HASH_TABLE_SIZE_CONST 100
+
+extern unsigned int *randForTaxa;
+
+int bitVectorLength,
+ treeVectorLength,
+ maxDropsetSize = 1,
+ rogueMode = 0,
+ dropRound = 0,
+ taxaDropped = 0,
+ thresh,
+ numberOfTrees,
+ bestLastTime,
+ *cumScores,
+ cumScore = 0,
+ bestCumEver = 0,
+
+ numBips,
+ mxtips;
+
+Dropset **dropsetPerRound;
+
+boolean computeSupport = TRUE;
+
+BitVector *droppedTaxa,
+ *neglectThose,
+ *paddingBits;
+
+double labelPenalty = 0.,
+ timeInc;
+
+#ifdef MYDEBUG
+void debug_dropsetConsistencyCheck(HashTable *mergingHash)
+{
+ HashTableIterator *htIter;
+ FOR_HASH(htIter, mergingHash)
+ {
+ Dropset *ds = getCurrentValueFromHashTableIterator(htIter);
+
+ if( NOT ds)
+ break;
+
+ HashTableIterator *htIter2;
+ boolean hasNext2 = TRUE;
+ for(htIter2 = createHashTableIterator(mergingHash); htIter2 && hasNext2 ; hasNext2 = hashTableIteratorNext(htIter2))
+ {
+ Dropset *ds2 = getCurrentValueFromHashTableIterator(htIter2);
+ if(ds == ds2)
+ continue;
+
+ if(indexListEqual(ds->taxaToDrop, ds2->taxaToDrop))
+ {
+ PR("duplicate dropset: ");
+ printIndexList(ds->taxaToDrop);
+ PR(" and ");
+ printIndexList(ds2->taxaToDrop);
+ PR("\n");
+ exit(-1);
+ }
+ }
+ free(htIter2);
+ }
+ free(htIter);
+}
+#endif
+
+
+boolean isCompatible(ProfileElem* elemA, ProfileElem* elemB, BitVector *droppedTaxa)
+{
+ unsigned int i;
+
+ unsigned int
+ *A = elemA->bitVector,
+ *C = elemB->bitVector;
+
+ FOR_0_LIMIT(i,bitVectorLength)
+ if(A[i] & C[i] & ~ (droppedTaxa[i] | paddingBits[i]) )
+ break;
+
+ if(i == bitVectorLength)
+ return TRUE;
+
+ FOR_0_LIMIT(i,bitVectorLength)
+ if( ( A[i] & ~C[i] ) & ~ (droppedTaxa[i] | paddingBits[i]) )
+ break;
+
+ if(i == bitVectorLength)
+ return TRUE;
+
+ FOR_0_LIMIT(i,bitVectorLength)
+ if( ( ~A[i] & C[i] ) & ~ (droppedTaxa[i] | paddingBits[i]) )
+ break;
+
+ if(i == bitVectorLength)
+ return TRUE;
+ else
+ return FALSE;
+}
+
+
+#ifdef MYDEBUG
+/* ensures, no merging event occurs twice per dropset */
+void debug_mergingHashSanityCheck(HashTable *mergingHash, int totalNumberOfBips)
+{
+ HashTableIterator *htIter;
+ FOR_HASH(htIter, mergingHash)
+ {
+ Dropset
+ *ds = getCurrentValueFromHashTableIterator(htIter);
+
+ if(NOT ds)
+ break;
+
+ BitVector
+ *bv = CALLOC(totalNumberOfBips, sizeof(BitVector));
+
+ List
+ *meIter = ds->primeEvents;
+
+ FOR_LIST(meIter)
+ {
+ MergingEvent
+ *me = meIter->value;
+ if(me->isComplex)
+ {
+ IndexList *il = me->mergingBipartitions.many;
+ FOR_LIST(il)
+ {
+ assert(NOT NTH_BIT_IS_SET(bv, il->index));
+ FLIP_NTH_BIT(bv, il->index);
+ }
+ }
+ else
+ {
+ assert(NOT NTH_BIT_IS_SET(bv, me->mergingBipartitions.pair[0]));
+ assert(NOT NTH_BIT_IS_SET(bv, me->mergingBipartitions.pair[1]));
+ FLIP_NTH_BIT(bv, me->mergingBipartitions.pair[0]);
+ FLIP_NTH_BIT(bv, me->mergingBipartitions.pair[1]);
+ }
+ }
+ free(bv);
+ }
+ free(htIter);
+}
+#endif
+
+
+boolean myBitVectorEqual(ProfileElem *elemA, ProfileElem *elemB)
+{
+ boolean normalEqual = TRUE,
+ complement = TRUE;
+
+ int i ;
+ FOR_0_LIMIT(i,bitVectorLength)
+ {
+ normalEqual = normalEqual && ( elemA->bitVector[i] == elemB->bitVector[i]);
+ complement = complement && (elemA->bitVector[i] == ~(elemB->bitVector[i] | droppedTaxa[i] | paddingBits[i]));
+ }
+
+ return normalEqual || complement;
+}
+
+
+boolean canMergeWithComplement(ProfileElem *elem)
+{
+ return mxtips - taxaDropped - 2 * elem->numberOfBitsSet <= 2 * maxDropsetSize;
+}
+
+
+boolean bitVectorEqual(ProfileElem *elemA, ProfileElem *elemB)
+{
+ boolean normalEqual = TRUE,
+ complement = TRUE;
+
+ int i ;
+ FOR_0_LIMIT(i,bitVectorLength)
+ {
+ normalEqual = normalEqual && ( elemA->bitVector[i] == elemB->bitVector[i]);
+ complement = complement && (elemA->bitVector[i] == ~(elemB->bitVector[i] | droppedTaxa[i] | paddingBits[i]));
+ }
+
+ return normalEqual || complement;
+}
+
+
+boolean mergedBipVanishes(MergingEvent *me, Array *bipartitionsById, IndexList *taxaToDrop)
+{
+ int vanBits = 0;
+
+ IndexList
+ *iter = taxaToDrop;
+
+ ProfileElem
+ *elem = me->isComplex ? GET_PROFILE_ELEM(bipartitionsById, (me->mergingBipartitions).many->index) : GET_PROFILE_ELEM(bipartitionsById, (me->mergingBipartitions).pair[0]);
+
+ FOR_LIST(iter)
+ if(NTH_BIT_IS_SET(elem->bitVector,iter->index))
+ vanBits++;
+
+ return elem->numberOfBitsSet - vanBits < 2;
+}
+
+
+/* insert dropset. If it is a multi-taxa dropset, gather all merging
+ events of sub-dropsets */
+Dropset *insertOrFindDropset(HashTable *hashtable, Dropset *dropset, unsigned int hashValue)
+{
+ void
+ *result = searchHashTable(hashtable, dropset, hashValue);
+
+ if(result)
+ {
+ freeDropsetDeep(dropset, TRUE);
+ return result;
+ }
+ else
+ {
+ insertIntoHashTable(hashtable, dropset, hashValue);
+ return dropset;
+ }
+}
+
+boolean checkForMergerAndAddEvent(boolean complement, ProfileElem *elemA, ProfileElem *elemB, HashTable *mergingHash)
+{
+ IndexList
+ *dropsetTaxa = getDropset(elemA,elemB,complement, neglectThose);
+
+ if(dropsetTaxa)
+ {
+ Dropset
+ *dropset,
+ *tmp = CALLOC(1,sizeof(Dropset));
+ tmp->taxaToDrop = dropsetTaxa;
+
+ unsigned int hashValue = 0;
+ IndexList *iter = dropsetTaxa;
+ FOR_LIST(iter)
+ {
+ assert(iter->index < mxtips);
+ hashValue ^= randForTaxa[ iter->index ];
+ }
+
+#ifdef PARALLEL
+ int position = hashValue % mergingHash->tableSize;
+ pthread_mutex_lock(mergingHash->lockPerSlot[position]);
+#endif
+ dropset = insertOrFindDropset(mergingHash, tmp, hashValue);
+ addEventToDropsetPrime(dropset, elemA->id, elemB->id);
+#ifdef PARALLEL
+ pthread_mutex_unlock(mergingHash->lockPerSlot[position]);
+#endif
+ return TRUE;
+ }
+ else
+ return FALSE;
+}
+
+
+/* can i trust you tiny function? (practical relevant -> 0) */
+/* boolean bothDropsetsRelevant(ProfileElem *elemA) */
+boolean bothDropsetsRelevant(int numBits)
+{
+ return numBits <= maxDropsetSize && numBits >= mxtips - taxaDropped - maxDropsetSize;
+}
+
+
+int cleanup_applyOneMergerEvent(MergingEvent *mergingEvent, Array *bipartitionsById, BitVector *mergingBipartitions)
+{
+ int
+ j;
+
+ ProfileElem
+ *resultBip, *elem;
+
+ resultBip =
+ mergingEvent->isComplex
+ ? GET_PROFILE_ELEM(bipartitionsById, mergingEvent->mergingBipartitions.many->index)
+ : GET_PROFILE_ELEM(bipartitionsById, mergingEvent->mergingBipartitions.pair[0]);
+
+ if(mergingEvent->isComplex)
+ {
+ IndexList
+ *iterBip = mergingEvent->mergingBipartitions.many->next;
+ FOR_LIST(iterBip)
+ {
+ elem = GET_PROFILE_ELEM(bipartitionsById, iterBip->index);
+ FLIP_NTH_BIT(mergingBipartitions, elem->id);
+ resultBip->isInMLTree |= elem->isInMLTree;
+ FOR_0_LIMIT(j,treeVectorLength)
+ resultBip->treeVector[j] |= elem->treeVector[j];
+ }
+
+ freeIndexList(mergingEvent->mergingBipartitions.many);
+ free(mergingEvent);
+ }
+ else
+ {
+ elem = GET_PROFILE_ELEM(bipartitionsById,mergingEvent->mergingBipartitions.pair[1]);
+ FLIP_NTH_BIT(mergingBipartitions, elem->id);
+ resultBip->isInMLTree |= elem->isInMLTree;
+ FOR_0_LIMIT(j,treeVectorLength)
+ resultBip->treeVector[j] |= elem->treeVector[j];
+ }
+
+ resultBip->treeVectorSupport = genericBitCount(resultBip->treeVector, treeVectorLength);
+ return resultBip->id;
+}
+
+
+int getSupportOfMRETreeHelper(Array *bipartitionProfile, Dropset *dropset)
+{
+ int
+ result = 0,
+ i,j;
+
+ BitVector
+ *taxaDroppedHere = copyBitVector(droppedTaxa, bitVectorLength);
+
+ if(dropset)
+ {
+ IndexList
+ *iter = dropset->taxaToDrop;
+ FOR_LIST(iter)
+ FLIP_NTH_BIT(taxaDroppedHere, iter->index);
+ }
+
+ qsort(bipartitionProfile->arrayTable, bipartitionProfile->length, sizeof(ProfileElem**), sortBySupport);
+
+ Array *mreBips = CALLOC(1,sizeof(Array));
+ mreBips->arrayTable = CALLOC((mxtips-3), sizeof(ProfileElem*));
+ mreBips->length = 0;
+
+#ifdef MYDEBUG
+ for(i = 1; i < bipartitionProfile->length; i ++)
+ assert(GET_PROFILE_ELEM(bipartitionProfile,i-1)->treeVectorSupport >= GET_PROFILE_ELEM(bipartitionProfile,i)->treeVectorSupport);
+#endif
+
+ FOR_0_LIMIT(i, bipartitionProfile->length)
+ if(GET_PROFILE_ELEM(bipartitionProfile,i)->treeVectorSupport > thresh)
+ addElemToArray(GET_PROFILE_ELEM(bipartitionProfile,i), mreBips);
+ else
+ break;
+
+ for(; i < bipartitionProfile->length && mreBips->length < mxtips-3; ++i)
+ {
+ ProfileElem
+ *elemA = GET_PROFILE_ELEM(bipartitionProfile, i);
+ boolean compatibleP = TRUE;
+
+ FOR_0_LIMIT(j,mreBips->length)
+ {
+ ProfileElem
+ *elemB = GET_PROFILE_ELEM(mreBips,j);
+
+ compatibleP &= isCompatible(elemA, elemB, taxaDroppedHere);
+
+ if( NOT compatibleP)
+ break;
+ }
+
+ if(compatibleP)
+ addElemToArray(GET_PROFILE_ELEM(bipartitionProfile,i), mreBips);
+ }
+
+ if(computeSupport)
+ FOR_0_LIMIT(i,mreBips->length)
+ result += GET_PROFILE_ELEM(mreBips,i)->treeVectorSupport;
+ else
+ result = mreBips->length;
+
+ free(mreBips->arrayTable); free(mreBips);
+ free(bipartitionProfile->arrayTable); free(bipartitionProfile);
+
+ return result;
+}
+
+
+void getSupportGainedThreshold(MergingEvent *me, Array *bipartitionsById)
+{
+ int
+ i;
+ me->supportGained = 0;
+ BitVector
+ *tmp;
+ boolean isInMLTree = FALSE;
+
+ if(me->isComplex)
+ {
+ IndexList
+ *iI = me->mergingBipartitions.many;
+
+ int bestPossible = 0;
+ FOR_LIST(iI)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionsById, iI->index);
+ bestPossible += elem->treeVectorSupport;
+ isInMLTree |= elem->isInMLTree;
+ }
+
+ if( rogueMode == VANILLA_CONSENSUS_OPT && bestPossible < thresh)
+ return ;
+ if( rogueMode == ML_TREE_OPT && NOT isInMLTree)
+ return ;
+
+ tmp = CALLOC(treeVectorLength, sizeof(BitVector));
+
+ /* create new bip vector */
+ iI = me->mergingBipartitions.many;
+ FOR_LIST(iI)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionsById, iI->index);
+
+ FOR_0_LIMIT(i, treeVectorLength)
+ tmp[i] |= elem->treeVector[i];
+ }
+ }
+ else
+ {
+ ProfileElem
+ *elemA = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[0]),
+ *elemB = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[1]);
+
+ if(rogueMode == VANILLA_CONSENSUS_OPT && elemA->treeVectorSupport + elemB->treeVectorSupport < thresh)
+ return;
+
+ isInMLTree = elemA->isInMLTree || elemB->isInMLTree;
+ if(rogueMode == ML_TREE_OPT && NOT isInMLTree)
+ return;
+
+ tmp = CALLOC(treeVectorLength, sizeof(BitVector));
+ FOR_0_LIMIT(i,treeVectorLength)
+ tmp[i] = elemA->treeVector[i] | elemB->treeVector[i];
+ }
+
+ int newSup = genericBitCount(tmp, treeVectorLength);
+ switch (rogueMode)
+ {
+ case MRE_CONSENSUS_OPT:
+ {
+ me->supportGained = computeSupport ? newSup : 1;
+ break;
+ }
+ case VANILLA_CONSENSUS_OPT:
+ {
+ if(rogueMode == VANILLA_CONSENSUS_OPT && newSup > thresh)
+ me->supportGained = computeSupport ? newSup : 1 ;
+ break;
+ }
+ case ML_TREE_OPT:
+ {
+ if(isInMLTree )
+ me->supportGained = computeSupport ? newSup : 1 ;
+ break;
+ }
+ default :
+ assert(0);
+ }
+
+ free(tmp);
+}
+
+
+int getSupportOfMRETree(Array *bipartitionsById, Dropset *dropset)
+{
+ List
+ *mergingEvents = NULL;
+ if(dropset)
+ {
+ if(maxDropsetSize == 1)
+ mergingEvents = dropset->ownPrimeE;
+ else
+ {
+ List *iter = dropset->acquiredPrimeE ;
+ FOR_LIST(iter)
+ APPEND(iter->value, mergingEvents);
+ iter = dropset->complexEvents;
+ FOR_LIST(iter)
+ APPEND(iter->value, mergingEvents);
+ }
+ }
+ int
+ i;
+
+ /* initial case */
+ if( NOT dropset)
+ {
+ Array *array = cloneProfileArrayFlat(bipartitionsById);
+ int tmp = getSupportOfMRETreeHelper( array, dropset);
+ return tmp;
+ }
+
+ Array
+ *emergedBips = CALLOC(1,sizeof(Array)),
+ *finalArray = CALLOC(1,sizeof(Array)),
+ *tmpArray = cloneProfileArrayFlat(bipartitionsById);
+ emergedBips->arrayTable = CALLOC(lengthOfList(mergingEvents), sizeof(ProfileElem*));
+ emergedBips->length = 0;
+
+ finalArray->arrayTable = CALLOC(tmpArray->length, sizeof(ProfileElem*));
+ finalArray->length = 0;
+
+ /* kill merging bips from array */
+ List *meIter = mergingEvents ;
+ FOR_LIST(meIter)
+ {
+ MergingEvent *me = meIter->value;
+ if(me->isComplex)
+ {
+ IndexList *iter = me->mergingBipartitions.many;
+ FOR_LIST(iter)
+ GET_PROFILE_ELEM(tmpArray, iter->index) = NULL;
+
+ /* create emerged bips in other array */
+ ProfileElem *elem = CALLOC(1,sizeof(ProfileElem));
+ getSupportGainedThreshold(me,bipartitionsById);
+ elem->treeVectorSupport = me->supportGained;
+ elem->bitVector = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.many->index)->bitVector;
+ GET_PROFILE_ELEM(emergedBips, emergedBips->length) = elem;
+ emergedBips->length++;
+ }
+ else
+ {
+ int a = me->mergingBipartitions.pair[0],
+ b = me->mergingBipartitions.pair[1];
+ GET_PROFILE_ELEM(tmpArray, a) = NULL;
+ GET_PROFILE_ELEM(tmpArray, b) = NULL;
+
+ /* create emerged bips in other array */
+ ProfileElem *elem = CALLOC(1,sizeof(ProfileElem));
+ getSupportGainedThreshold(me,bipartitionsById);
+ elem->treeVectorSupport = me->supportGained;
+ elem->bitVector = GET_PROFILE_ELEM(bipartitionsById, a)->bitVector;
+ GET_PROFILE_ELEM(emergedBips, emergedBips->length) = elem;
+ emergedBips->length++;
+ }
+ }
+
+ /* kill vanishing bips from array */
+ FOR_0_LIMIT(i, tmpArray->length)
+ {
+ if( GET_PROFILE_ELEM(tmpArray,i) )
+ {
+ ProfileElem *elem = GET_PROFILE_ELEM(tmpArray,i);
+ int remainingBits = elem->numberOfBitsSet;
+ IndexList *iter = dropset->taxaToDrop;
+ FOR_LIST(iter)
+ if(NTH_BIT_IS_SET(elem->bitVector, iter->index))
+ remainingBits--;
+ if(remainingBits > 1)
+ addElemToArray(elem, finalArray);
+ }
+ }
+
+ FOR_0_LIMIT(i, emergedBips->length)
+ addElemToArray(GET_PROFILE_ELEM(emergedBips, i), finalArray);
+
+ free(tmpArray->arrayTable); free(tmpArray);
+
+ int result = getSupportOfMRETreeHelper(finalArray, dropset);
+
+ if(maxDropsetSize > 1 )
+ freeListFlat(mergingEvents);
+
+ FOR_0_LIMIT(i,emergedBips->length)
+ free(GET_PROFILE_ELEM(emergedBips,i));
+ free(emergedBips->arrayTable); free(emergedBips);
+
+ return result;
+}
+
+
+boolean bipartitionVanishesP(ProfileElem *elem, Dropset *dropset)
+{
+ IndexList *iter = dropset->taxaToDrop;
+ int result = elem->numberOfBitsSet;
+
+ FOR_LIST(iter)
+ if(NTH_BIT_IS_SET(elem->bitVector, iter->index))
+ result--;
+
+ return result < 2;
+}
+
+
+void removeMergedBipartitions(Array *bipartitionsById, Array *bipartitionProfile, BitVector *mergingBipartitions)
+{
+ int
+ i;
+
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile,i);
+
+ if( NOT elem )
+ continue;
+
+ if(NTH_BIT_IS_SET(mergingBipartitions,elem->id))
+ {
+ GET_PROFILE_ELEM(bipartitionProfile, i) = NULL;
+ GET_PROFILE_ELEM(bipartitionsById, elem->id) = NULL;
+ freeProfileElem(elem);
+#ifdef PRINT_VERY_VERBOSE
+ PR("CLEAN UP: removing %d from bip profile because of merger\n", elem->id);
+#endif
+ }
+ }
+}
+
+
+boolean eventMustBeRecomputed(MergingEvent *meIter, BitVector *mergingBipartitions, BitVector *newCandidates)
+{
+ boolean
+ mustBeRecomputed = FALSE;
+
+ IndexList
+ *mergingBipsIter = meIter->mergingBipartitions.many;
+ FOR_LIST(mergingBipsIter)
+ mustBeRecomputed |=
+ NTH_BIT_IS_SET(mergingBipartitions,mergingBipsIter->index)
+ | NTH_BIT_IS_SET(newCandidates, mergingBipsIter->index);
+
+ return mustBeRecomputed;
+}
+
+
+boolean checkValidityOfEvent(BitVector *obsoleteBips, List *elem)
+{
+ MergingEvent
+ *me = elem->value;
+ boolean
+ killP = FALSE;
+
+ if(me->isComplex)
+ {
+ IndexList
+ *iter = me->mergingBipartitions.many;
+ FOR_LIST(iter)
+ killP |= NTH_BIT_IS_SET(obsoleteBips, iter->index);
+ if(killP)
+ freeIndexList(me->mergingBipartitions.many);
+ }
+ else
+ killP = NTH_BIT_IS_SET(obsoleteBips, me->mergingBipartitions.pair[0]) || NTH_BIT_IS_SET(obsoleteBips, me->mergingBipartitions.pair[1]) ;
+
+ if(killP)
+ {
+ free(me);
+ return FALSE;
+ }
+ else
+ return TRUE;
+}
+
+
+#ifdef LATER
+void printDropset(Dropset *dropset)
+{
+ IndexList *iter;
+ iter = dropset->taxaToDrop;
+ boolean isFirst = TRUE;
+ FOR_LIST(iter)
+ {
+ PR(isFirst ? "%d" : ",%d", iter->index);
+ isFirst = FALSE;
+ }
+ PR(" [%d] : ", dropset->improvement);
+ List *lIter = dropset->primeEvents;
+ FOR_LIST(lIter)
+ {
+ MergingEvent *me = lIter->value;
+ PR("{ %d,%d },", me->mergingBipartitions.pair[0], me->mergingBipartitions.pair[1]);
+ }
+ PR("\n");
+ if(dropset->combinedEvents)
+ {
+ PR("\t`->");
+ lIter = dropset->combinedEvents;
+ FOR_LIST(lIter)
+ {
+ if(((MergingEvent*)lIter->value)->isComplex)
+ {
+ isFirst = TRUE;
+ PR("{ ");
+ iter = ((MergingEvent*)lIter->value)->mergingBipartitions.many;
+ FOR_LIST(iter)
+ {
+ PR(isFirst ? "%d" : ",%d" ,iter->index);
+ isFirst = FALSE;
+ }
+ PR(" },");
+ }
+ else
+ {
+ MergingBipartitions mb = ((MergingEvent*)lIter->value)->mergingBipartitions ;
+ PR("[ %d,%d ],", mb.pair[0], mb.pair[1]);
+ }
+ }
+ PR("\n");
+ }
+}
+#endif
+
+
+#ifdef LATER
+void printMergingHash(HashTable *mergingHash)
+{
+ if(mergingHash->entryCount < 1)
+ {
+ PR("Nothing in mergingHash.\n");
+ return;
+ }
+
+ HashTableIterator *htIter;
+ FOR_HASH(htIter, mergingHash)
+ {
+ Dropset
+ *dropset = getCurrentValueFromHashTableIterator(htIter);
+ printDropset(dropset);
+ }
+}
+#endif
+
+
+#ifdef MYDEBUG
+void debug_assureCleanStructure(HashTable *hashtable, BitVector *mergingBipartitions)
+{
+ HashTableIterator *htIter;
+
+ FOR_HASH(htIter, hashtable)
+ {
+ Dropset *ds = (Dropset*)getCurrentValueFromHashTableIterator(htIter);
+
+ if( NOT ds)
+ break;
+
+ List *meIter = ds->primeEvents;
+ FOR_LIST(meIter)
+ {
+ MergingEvent *me = meIter->value;
+ if(me->isComplex)
+ {
+ IndexList *iter = me->mergingBipartitions.many;
+ FOR_LIST(iter)
+ if(NTH_BIT_IS_SET(mergingBipartitions, iter->index))
+ {
+ PR("%d from merging bipartitions still present. \n", iter->index);
+ printMergingHash(hashtable);
+ exit(-1);
+ }
+ }
+ else
+ {
+ if(NTH_BIT_IS_SET(mergingBipartitions, me->mergingBipartitions.pair[0]) )
+ {
+ PR("%d from merging bipartitions still present. \n", me->mergingBipartitions.pair[0]);
+ printMergingHash(hashtable);
+ exit(-1);
+ }
+ if(NTH_BIT_IS_SET(mergingBipartitions, me->mergingBipartitions.pair[1]))
+ {
+ PR("%d from merging bipartitions still present. \n", me->mergingBipartitions.pair[1]);
+ printMergingHash(hashtable);
+ exit(-1);
+ }
+ }
+ }
+ }
+ free(htIter);
+}
+#endif
+
+
+
+void cleanup_mergingEvents(HashTable *mergingHash, BitVector *mergingBipartitions, BitVector *candidateBips, int length)
+{
+ HashTableIterator
+ *htIter;
+
+ int i;
+ FOR_0_LIMIT(i,GET_BITVECTOR_LENGTH(length))
+ mergingBipartitions[i] |= candidateBips[i];
+
+ if( NOT mergingHash->entryCount)
+ return;
+
+ FOR_HASH(htIter, mergingHash)
+ {
+ Dropset
+ *dropset = getCurrentValueFromHashTableIterator(htIter);
+
+ /* always remove combined events */
+ List *iter = dropset->complexEvents;
+ FOR_LIST(iter)
+ {
+ MergingEvent *me = iter->value;
+ assert(me);
+ if(me && me->isComplex)
+ {
+ freeIndexList(me->mergingBipartitions.many);
+ free(me);
+ }
+ }
+ freeListFlat(dropset->complexEvents);
+
+ /* always remove acquired elems */
+ freeListFlat(dropset->acquiredPrimeE);
+ }
+ free(htIter);
+
+ /* reduce own elements */
+ FOR_HASH_2(htIter, mergingHash)
+ {
+ Dropset
+ *dropset = getCurrentValueFromHashTableIterator(htIter);
+
+ assert(dropset);
+
+ /* prime events */
+ List
+ *iter = dropset->ownPrimeE,
+ *start = NULL;
+ while(iter)
+ {
+ List *next = iter->next;
+ if( checkValidityOfEvent(mergingBipartitions, iter) )
+ APPEND(iter->value, start);
+ free(iter);
+ iter = next;
+ }
+ dropset->ownPrimeE = start;
+ }
+ free(htIter);
+
+#ifdef MYDEBUG
+ debug_assureCleanStructure(mergingHash, mergingBipartitions);
+#endif
+
+ free(mergingBipartitions);
+}
+
+
+/*
+ • inverses the bit vector, if more than half of the remaining taxa
+ bits is set. This is better anyway, is the bit vector do not get
+ that physically heavy (assuming that a 1 weighs more than a 0)
+ • assuming, we already know the numbers of bits set
+ • assuming, bits are unflipped, if the taxon was dropped
+*/
+void unifyBipartitionRepresentation(Array *bipartitionArray, BitVector *droppedTaxa)
+{
+ int
+ i,j,
+ bvLen = GET_BITVECTOR_LENGTH(mxtips),
+ remainingTaxa = mxtips - genericBitCount(droppedTaxa, bvLen);
+
+#ifdef PRINT_VERY_VERBOSE
+ PR("remaining taxa: %d\n", remainingTaxa);
+ PR("inverting bit vectors: ");
+#endif
+
+ FOR_0_LIMIT(i,bipartitionArray->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionArray,i);
+
+ if( elem
+ && elem->numberOfBitsSet > remainingTaxa / 2)
+ {
+
+#ifdef PRINT_VERY_VERBOSE
+ PR("%d (%d bits set), ", elem->id, elem->numberOfBitsSet);
+#endif
+ FOR_0_LIMIT(j,bvLen)
+ elem->bitVector[j] = ~(elem->bitVector[j] | paddingBits[j] | droppedTaxa[j]);
+ elem->numberOfBitsSet = remainingTaxa - elem->numberOfBitsSet;
+ }
+ }
+#ifdef PRINT_VERY_VERBOSE
+ PR("\n");
+#endif
+}
+
+
+void printBipartitionProfile(Array *bipartitionProfile)
+{
+ int i;
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile, i);
+ if(elem)
+ {
+ PR("%d (%d):\t\t", elem->id, elem->numberOfBitsSet);
+ printBitVector(elem->bitVector, GET_BITVECTOR_LENGTH(mxtips));
+ }
+ else
+ break;
+ PR("\n");
+ }
+}
+
+
+int getNumberOfBipsPresent(Array *bipartitionArray)
+{
+ int result = 0;
+ int i;
+
+ FOR_0_LIMIT(i, bipartitionArray->length)
+ {
+ if(GET_PROFILE_ELEM(bipartitionArray,i))
+ result++;
+ }
+
+ return result;
+}
+
+
+int getInitScore(Array *bipartitionProfile)
+{
+ int
+ score = 0 , i;
+
+ if(rogueMode == MRE_CONSENSUS_OPT)
+ return getSupportOfMRETree(bipartitionProfile, NULL);
+
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile,i);
+
+ switch(rogueMode)
+ {
+ case VANILLA_CONSENSUS_OPT:
+ if(elem->treeVectorSupport > thresh)
+ score += computeSupport ? elem->treeVectorSupport : 1 ;
+ break;
+
+ case ML_TREE_OPT:
+ if(elem->isInMLTree)
+ score += computeSupport ? elem->treeVectorSupport : 1;
+ break;
+
+ case MRE_CONSENSUS_OPT:
+ default:
+ assert(0);
+ }
+ }
+
+ return score;
+}
+
+
+void printDropsetImprovement(Dropset *dropset, All *tr, int cumScore)
+{
+#ifndef PRINT_DROPSETS
+ return ;
+#endif
+
+ IndexList
+ *iter = dropset->taxaToDrop;
+ boolean isFirst = TRUE;
+
+ FOR_LIST(iter)
+ {
+ PR( isFirst ? ">%d" : ",%d", iter->index);
+ isFirst = FALSE;
+ }
+
+ isFirst = TRUE;
+ PR("\t");
+ iter = dropset->taxaToDrop;
+ FOR_LIST(iter)
+ {
+ PR(isFirst ? "%s" : ",%s" , tr->nameList[iter->index+1]);
+ isFirst = FALSE;
+ }
+
+ PR("\t");
+ PR("%f\t%f\n",
+ (double)dropset->improvement /(computeSupport ? (double)numberOfTrees : 1) ,
+ (double)cumScore / (double)( (computeSupport ? numberOfTrees : 1) * (mxtips-3)));
+}
+
+
+void fprintRogueNames(All *tr, FILE *file, IndexList *list)
+{
+ boolean isFirst = TRUE;
+
+ FOR_LIST(list)
+ {
+ if(isFirst)
+ {
+ fprintf(file, "%s", tr->nameList[list->index+1]);
+ isFirst = FALSE;
+ }
+ else
+ fprintf(file, ",%s", tr->nameList[list->index+1]);
+ }
+}
+
+
+void printRogueInformationToFile( All *tr, FILE *rogueOutput, int bestCumEver, int *cumScores, Dropset **dropsetInRound)
+{
+ int
+ i=1,j;
+
+ boolean reached = bestCumEver == cumScores[0];
+ while ( NOT reached)
+ {
+ fprintf(rogueOutput, "%d\t", i);
+ printIndexListToFile(rogueOutput, dropsetInRound[i]->taxaToDrop);
+ fprintf(rogueOutput, "\t");
+ fprintRogueNames(tr, rogueOutput, dropsetInRound[i]->taxaToDrop);
+ fprintf(rogueOutput, "\t%f\t%f\n", (double)(cumScores[i] - cumScores[i-1] )/ (double)tr->numberOfTrees,(double)cumScores[i] / (double)((computeSupport ? numberOfTrees : 1 ) * (mxtips-3)) );
+ reached = bestCumEver == cumScores[i];
+ ++i;
+ }
+
+ FOR_0_LIMIT(j,mxtips)
+ if(NOT NTH_BIT_IS_SET(neglectThose,j))
+ {
+ fprintf(rogueOutput, "%d\t%d\t%s\t%s\t%s\n", i, j, tr->nameList[j+1], "NA", "NA");
+ i++;
+ }
+}
+
+
+void findCandidatesForBip(HashTable *mergingHash, ProfileElem *elemA, boolean firstMerge, Array *bipartitionsById, Array *bipartitionProfile, int* indexByNumberBits)
+{
+ ProfileElem
+ *elemB;
+ int indexInBitSortedArray;
+
+ boolean
+ compMerge = canMergeWithComplement(elemA);
+
+ if(firstMerge)
+ {
+ if(NOT compMerge && maxDropsetSize == 1)
+ indexInBitSortedArray = indexByNumberBits[elemA->numberOfBitsSet +1];
+ else
+ indexInBitSortedArray = indexByNumberBits[elemA->numberOfBitsSet];
+ }
+ else
+ indexInBitSortedArray =
+ elemA->numberOfBitsSet - maxDropsetSize < 0 ?
+ indexByNumberBits[0]
+ : indexByNumberBits[elemA->numberOfBitsSet-maxDropsetSize];
+
+ for( ;
+ indexInBitSortedArray < bipartitionProfile->length
+ && (elemB = GET_PROFILE_ELEM(bipartitionProfile,indexInBitSortedArray))
+ && elemB->numberOfBitsSet - elemA->numberOfBitsSet <= maxDropsetSize ;
+ indexInBitSortedArray++)
+ {
+ if(
+ maxDropsetSize == 1 &&
+ NOT compMerge &&
+ elemA->numberOfBitsSet == elemB->numberOfBitsSet)
+ continue;
+
+ boolean foundOne = FALSE;
+ if(compMerge)
+ foundOne = checkForMergerAndAddEvent(TRUE,elemA, elemB, mergingHash);
+
+ if(NOT foundOne || bothDropsetsRelevant(elemA->numberOfBitsSet))
+ checkForMergerAndAddEvent(FALSE, elemA, elemB, mergingHash);
+ }
+}
+
+
+/* HashTable * */
+void createOrUpdateMergingHash(All *tr, HashTable *mergingHash, Array *bipartitionProfile, Array *bipartitionsById, BitVector *candidateBips, boolean firstMerge, int *indexByNumberBits)
+{
+ int i;
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ if(NTH_BIT_IS_SET(candidateBips, i))
+ findCandidatesForBip(mergingHash, GET_PROFILE_ELEM(bipartitionsById, i), firstMerge, bipartitionsById, bipartitionProfile, indexByNumberBits);
+
+ free(candidateBips);
+}
+
+
+void combineEventsForOneDropset(Array *allDropsets, Dropset *refDropset, Array *bipartitionsById)
+{
+ List *allEventsUncombined = NULL;
+ refDropset->acquiredPrimeE = NULL;
+ refDropset->complexEvents = NULL;
+ int eventCntr = 0;
+
+ if(NOT refDropset->taxaToDrop->next)
+ {
+ List *iter = refDropset->ownPrimeE;
+ FOR_LIST(iter)
+ APPEND(iter->value, refDropset->acquiredPrimeE);
+ return;
+ }
+
+ /* gather all events */
+ int i;
+ FOR_0_LIMIT(i,allDropsets->length)
+ {
+ Dropset *currentDropset = GET_DROPSET_ELEM(allDropsets, i);
+ if( isSubsetOf(currentDropset->taxaToDrop, refDropset->taxaToDrop) )
+ {
+ List
+ *iter = currentDropset->ownPrimeE;
+ FOR_LIST(iter)
+ {
+ APPEND(iter->value, allEventsUncombined);
+ eventCntr++;
+ }
+ }
+ }
+
+ /* transform the edges into nodes */
+ HashTable *allNodes = createHashTable(eventCntr * 10, NULL, nodeHashValue, nodeEqual);
+ Node *found;
+ List *iter = allEventsUncombined;
+ FOR_LIST(iter)
+ {
+ MergingEvent *me = (MergingEvent*)iter->value;
+ int a = me->mergingBipartitions.pair[0];
+ int b = me->mergingBipartitions.pair[1];
+
+
+ if( ( found = searchHashTableWithInt(allNodes, a) ) )
+ APPEND_INT(b,found->edges);
+ else
+ {
+ Node *node = CALLOC(1,sizeof(Node));
+ node->id = a;
+ APPEND_INT(b,node->edges);
+ insertIntoHashTable(allNodes, node, a);
+ }
+
+ if( ( found = searchHashTableWithInt(allNodes, b) ) )
+ APPEND_INT(a,found->edges);
+ else
+ {
+ Node *node = CALLOC(1,sizeof(Node));
+ node->id = b;
+ APPEND_INT(a,node->edges);
+ insertIntoHashTable(allNodes, node, b);
+ }
+ }
+
+ iter = allEventsUncombined;
+ FOR_LIST(iter)
+ {
+ MergingEvent *me = (MergingEvent*)iter->value;
+ int a = me->mergingBipartitions.pair[0],
+ b = me->mergingBipartitions.pair[1];
+
+ Node *foundA = searchHashTableWithInt(allNodes,a),
+ *foundB = searchHashTableWithInt(allNodes,b);
+
+ if(NOT foundA->edges->next
+ && NOT foundB->edges->next)
+ {
+ assert(foundA->edges->index == foundB->id );
+ assert(foundB->edges->index == foundA->id );
+ APPEND(me, refDropset->acquiredPrimeE);
+ }
+ else
+ {
+ IndexList
+ *component = findAnIndependentComponent(allNodes,foundA);
+ if( component)
+ {
+ MergingEvent *complexMe = CALLOC(1,sizeof(MergingEvent));
+ complexMe->mergingBipartitions.many = component;
+ complexMe->isComplex = TRUE;
+ APPEND(complexMe,refDropset->complexEvents);
+ }
+ }
+ }
+
+ destroyHashTable(allNodes, freeNode);
+ freeListFlat(allEventsUncombined);
+}
+
+
+HashTable *combineMergerEvents(HashTable *mergingHash, Array *bipartitionsById)
+{
+ /* hash to array */
+ Array *allDropsets = CALLOC(1,sizeof(Array));
+ allDropsets->arrayTable = CALLOC(mergingHash->entryCount, sizeof(Dropset**));
+
+ HashTableIterator *htIter;
+ int cnt = 0;
+ FOR_HASH(htIter, mergingHash)
+ {
+ GET_DROPSET_ELEM(allDropsets, cnt) = getCurrentValueFromHashTableIterator(htIter);
+ cnt++;
+ }
+ free(htIter);
+ assert(cnt == mergingHash->entryCount);
+ allDropsets->length = cnt;
+
+#ifdef PARALLEL
+ globalPArgs->allDropsets = allDropsets;
+ globalPArgs->bipartitionsById =bipartitionsById;
+ numberOfJobs = allDropsets->length;
+ masterBarrier(THREAD_COMBINE_EVENTS, globalPArgs);
+#else
+ int i;
+ FOR_0_LIMIT(i,allDropsets->length)
+ combineEventsForOneDropset(allDropsets, GET_DROPSET_ELEM(allDropsets,i), bipartitionsById);
+#endif
+
+ free(allDropsets->arrayTable);
+ free(allDropsets);
+
+ return mergingHash;
+}
+
+
+void getLostSupportThreshold(MergingEvent *me, Array *bipartitionsById)
+{
+ ProfileElem *elemA, *elemB ;
+ me->supportLost = 0;
+
+ if(me->isComplex)
+ {
+ IndexList *iI = me->mergingBipartitions.many;
+
+ FOR_LIST(iI)
+ {
+ elemA = GET_PROFILE_ELEM(bipartitionsById, iI->index);
+ switch (rogueMode)
+ {
+ case VANILLA_CONSENSUS_OPT :
+ {
+ if(elemA->treeVectorSupport > thresh)
+ me->supportLost += computeSupport ? elemA->treeVectorSupport : 1;
+ break ;
+ }
+ case ML_TREE_OPT:
+ {
+ if(elemA->isInMLTree)
+ me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ;
+ break;
+ }
+ default :
+ assert(0);
+ }
+ }
+ }
+ else
+ {
+ elemA = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[0]);
+ elemB = GET_PROFILE_ELEM(bipartitionsById, me->mergingBipartitions.pair[1]);
+
+ switch(rogueMode)
+ {
+ case MRE_CONSENSUS_OPT:
+ case VANILLA_CONSENSUS_OPT:
+ {
+ if(elemA->treeVectorSupport > thresh)
+ me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ;
+ if(elemB->treeVectorSupport > thresh)
+ me->supportLost += computeSupport ? elemB->treeVectorSupport : 1;
+ break;
+ }
+ case ML_TREE_OPT:
+ {
+ if(elemA->isInMLTree)
+ me->supportLost += computeSupport ? elemA->treeVectorSupport : 1 ;
+ if(elemB->isInMLTree)
+ me->supportLost += computeSupport ? elemB->treeVectorSupport : 1 ;
+ }
+ }
+ }
+}
+
+
+void evaluateDropset(HashTable *mergingHash, Dropset *dropset,Array *bipartitionsById, List *consensusBipsCanVanish )
+{
+ int result = 0;
+ List
+ *allElems = NULL,
+ *elemsToCheck = NULL;
+
+ if(maxDropsetSize == 1)
+ elemsToCheck = dropset->ownPrimeE ;
+ else
+ {
+ List *otherIter = dropset->acquiredPrimeE;
+ FOR_LIST(otherIter)
+ APPEND(otherIter->value, elemsToCheck);
+ otherIter = dropset->complexEvents;
+ FOR_LIST(otherIter)
+ APPEND(otherIter->value, elemsToCheck);
+ allElems = elemsToCheck;
+ }
+
+ BitVector
+ *bipsSeen = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector));
+
+ FOR_LIST(elemsToCheck)
+ {
+ MergingEvent *me = (MergingEvent*)elemsToCheck->value;
+
+ if(NOT me->computed)
+ {
+ getLostSupportThreshold(me, bipartitionsById);
+ getSupportGainedThreshold(me, bipartitionsById);
+ me->computed = TRUE;
+ }
+
+ result -= me->supportLost;
+ if( me->supportGained
+ && NOT mergedBipVanishes(me, bipartitionsById, dropset->taxaToDrop) )
+ result += me->supportGained;
+
+ if(me->isComplex)
+ {
+ IndexList *iI = me->mergingBipartitions.many ;
+ FOR_LIST(iI)
+ {
+ assert(NOT NTH_BIT_IS_SET(bipsSeen, iI->index));
+ if(NTH_BIT_IS_SET(bipsSeen, iI->index))
+ {
+ PR("problem:");
+ printIndexList(me->mergingBipartitions.many);
+ PR("at ");
+ printIndexList(dropset->taxaToDrop);
+ PR("\n");
+ exit(0);
+ }
+ FLIP_NTH_BIT(bipsSeen, iI->index);
+ }
+ }
+ else
+ {
+ assert( NOT NTH_BIT_IS_SET(bipsSeen, me->mergingBipartitions.pair[0]));
+ assert( NOT NTH_BIT_IS_SET(bipsSeen, me->mergingBipartitions.pair[1]));
+ FLIP_NTH_BIT(bipsSeen,me->mergingBipartitions.pair[0]);
+ FLIP_NTH_BIT(bipsSeen,me->mergingBipartitions.pair[1]);
+ }
+ }
+ freeListFlat(allElems);
+
+
+ /* handle vanishing bip */
+ List *iter = consensusBipsCanVanish;
+ FOR_LIST(iter)
+ {
+ ProfileElem *elem = iter->value;
+
+ switch(rogueMode)
+ {
+ case VANILLA_CONSENSUS_OPT :
+ {
+ if(elem->treeVectorSupport > thresh
+ && NOT NTH_BIT_IS_SET(bipsSeen, elem->id)
+ && bipartitionVanishesP(elem,dropset))
+ result -= computeSupport ? elem->treeVectorSupport : 1;
+ break;
+ }
+ case ML_TREE_OPT:
+ {
+ if(elem->isInMLTree
+ && NOT NTH_BIT_IS_SET(bipsSeen, elem->id)
+ && bipartitionVanishesP(elem,dropset))
+ result -= computeSupport ? elem->treeVectorSupport : 1;
+ break;
+ }
+ default:
+ assert(0);
+ }
+ }
+
+ free(bipsSeen);
+ dropset->improvement = result;
+}
+
+
+List *getConsensusBipsCanVanish(Array *bipartitionProfile)
+{
+ List
+ *consensusBipsCanVanish = NULL;
+
+ if(rogueMode == VANILLA_CONSENSUS_OPT
+ || rogueMode == MRE_CONSENSUS_OPT)
+ {
+ int i;
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile, i);
+
+ if(NOT elem)
+ break;
+
+ if(elem->numberOfBitsSet - maxDropsetSize > 1 )
+ break;
+
+ if(elem->treeVectorSupport > thresh)
+ APPEND(elem,consensusBipsCanVanish);
+ }
+ }
+ else if(ML_TREE_OPT)
+ {
+ int i;
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile, i);
+
+ if(NOT elem)
+ break;
+
+ if(elem->isInMLTree)
+ APPEND(elem,consensusBipsCanVanish);
+ }
+ }
+
+ return consensusBipsCanVanish;
+}
+
+
+Dropset *evaluateEvents(HashTable *mergingHash, Array *bipartitionsById, Array *bipartitionProfile)
+{
+ Dropset
+ *result = NULL;
+
+ int i ;
+
+ List
+ *consensusBipsCanVanish = getConsensusBipsCanVanish(bipartitionProfile);
+
+ if( NOT mergingHash->entryCount)
+ return NULL ;
+
+ /* gather dropsets in array */
+ Array *allDropsets = CALLOC(1,sizeof(Array)) ;
+ allDropsets->length = mergingHash->entryCount;
+ allDropsets->arrayTable = CALLOC(mergingHash->entryCount, sizeof(Dropset*));
+
+ int cnt = 0;
+ HashTableIterator *htIter;
+ FOR_HASH(htIter, mergingHash)
+ {
+ GET_DROPSET_ELEM(allDropsets,cnt) = getCurrentValueFromHashTableIterator(htIter);
+ cnt++;
+ }
+ free(htIter);
+ assert(cnt == mergingHash->entryCount);
+
+ /* compute MRE stuff */
+ if(rogueMode == MRE_CONSENSUS_OPT)
+ {
+
+#ifdef PARALLEL
+ numberOfJobs = allDropsets->length;
+ globalPArgs->bipartitionsById = bipartitionsById;
+ globalPArgs->allDropsets = allDropsets;
+ masterBarrier(THREAD_MRE, globalPArgs);
+#else
+
+ FOR_0_LIMIT(i,allDropsets->length)
+ {
+ Dropset *dropset = GET_DROPSET_ELEM(allDropsets, i);
+ dropset->improvement = getSupportOfMRETree(bipartitionsById, dropset) - cumScore;
+ }
+#endif
+ }
+
+
+ /* evaluate dropsets */
+ if(rogueMode != MRE_CONSENSUS_OPT)
+ {
+#ifdef PARALLEL
+ numberOfJobs = allDropsets->length;
+ globalPArgs->mergingHash = mergingHash;
+ globalPArgs->allDropsets = allDropsets;
+ globalPArgs->bipartitionsById = bipartitionsById;
+ globalPArgs->consensusBipsCanVanish = consensusBipsCanVanish;
+ masterBarrier(THREAD_EVALUATE_EVENTS, globalPArgs);
+#else
+ FOR_0_LIMIT(i, allDropsets->length)
+ {
+ Dropset *dropset = GET_DROPSET_ELEM(allDropsets, i);
+ evaluateDropset(mergingHash, dropset, bipartitionsById, consensusBipsCanVanish);
+ }
+#endif
+ }
+
+ FOR_0_LIMIT(i,allDropsets->length)
+ {
+ Dropset
+ *dropset = GET_DROPSET_ELEM(allDropsets, i);
+
+ if(NOT result)
+ result = dropset;
+ else
+ {
+ int drSize = lengthIndexList(dropset->taxaToDrop),
+ resSize = lengthIndexList(result->taxaToDrop);
+
+ double oldQuality = labelPenalty == 0.0
+ ? result->improvement * drSize
+ : (double)(result->improvement / (double)(computeSupport ? numberOfTrees : 1.0)) - (double)resSize;
+ double newQuality = labelPenalty == 0.0
+ ? dropset->improvement * resSize
+ : (double)(dropset->improvement / (double)(computeSupport ? numberOfTrees : 1.0)) - (double)drSize;
+
+ if( (newQuality > oldQuality) )
+ result = dropset;
+ }
+ }
+ freeListFlat(consensusBipsCanVanish);
+
+ free(allDropsets->arrayTable);
+ free(allDropsets);
+
+ if(labelPenalty == 0.0 && result->improvement > 0)
+ return result;
+ else if(labelPenalty != 0.0 && (result->improvement / (computeSupport ? numberOfTrees : 1.0) - lengthIndexList(result->taxaToDrop)) > 0.0 )
+ return result;
+ else
+ return NULL;
+}
+
+
+void cleanup_updateNumBitsAndCleanArrays(Array *bipartitionProfile, Array *bipartitionsById, BitVector *mergingBipartitions, BitVector *newCandidates, Dropset *dropset)
+{
+ int profileIndex;
+
+ FOR_0_LIMIT(profileIndex,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elem = GET_PROFILE_ELEM(bipartitionProfile,profileIndex);
+
+ if( NOT elem )
+ continue;
+
+ /* check if number of bits has changed */
+ if(NOT NTH_BIT_IS_SET(mergingBipartitions,elem->id))
+ {
+ if( mxtips - taxaDropped - 2 * elem->numberOfBitsSet <= 2 * maxDropsetSize )
+ FLIP_NTH_BIT(newCandidates, elem->id);
+ IndexList *iter = dropset->taxaToDrop;
+ boolean taxonDroppedP = FALSE;
+ FOR_LIST(iter)
+ {
+ if(NTH_BIT_IS_SET(elem->bitVector, iter->index))
+ {
+ taxonDroppedP = TRUE;
+ UNFLIP_NTH_BIT(elem->bitVector, iter->index);
+ elem->numberOfBitsSet--;
+ }
+ }
+
+ if(taxonDroppedP)
+ {
+ if(elem->numberOfBitsSet < 2)
+ {
+ UNFLIP_NTH_BIT(newCandidates, elem->id);
+ FLIP_NTH_BIT(mergingBipartitions, elem->id);
+ }
+ else
+ FLIP_NTH_BIT(newCandidates, elem->id);
+ }
+ }
+
+ /* bip has been merged or vanished */
+ if(NTH_BIT_IS_SET(mergingBipartitions,elem->id))
+ {
+ assert(NOT NTH_BIT_IS_SET(newCandidates, elem->id));
+ GET_PROFILE_ELEM(bipartitionProfile, profileIndex) = NULL;
+ GET_PROFILE_ELEM(bipartitionsById, elem->id) = NULL;
+ freeProfileElem(elem);
+ }
+ }
+}
+
+
+BitVector *cleanup_applyAllMergerEvents(Array *bipartitionsById, Dropset *bestDropset, BitVector *mergingBipartitions)
+{
+ BitVector
+ *candidateBips = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector)) ;
+
+ if( bestDropset)
+ {
+#ifdef PRINT_VERY_VERBOSE
+ printDropset(bestDropset);
+#endif
+
+ List *iter = NULL ;
+ if(maxDropsetSize == 1)
+ iter = bestDropset->ownPrimeE;
+ else
+ iter = bestDropset->acquiredPrimeE;
+ FOR_LIST(iter)
+ {
+ int newBipId = cleanup_applyOneMergerEvent((MergingEvent*)iter->value, bipartitionsById, mergingBipartitions);
+ FLIP_NTH_BIT(candidateBips, newBipId);
+ }
+
+ if(maxDropsetSize > 1 )
+ {
+ iter = bestDropset->complexEvents;
+ FOR_LIST(iter)
+ {
+ int newBipId = cleanup_applyOneMergerEvent((MergingEvent*)iter->value, bipartitionsById, mergingBipartitions);
+ FLIP_NTH_BIT(candidateBips, newBipId);
+ }
+ }
+ }
+
+ return candidateBips;
+}
+
+
+void cleanup_rehashDropsets(HashTable *mergingHash, Dropset *bestDropset)
+{
+ if(maxDropsetSize == 1)
+ return;
+
+ IndexList
+ *taxaToDrop = bestDropset->taxaToDrop;
+
+ List *allDropsets = NULL;
+ HashTableIterator *htIter;
+ FOR_HASH(htIter, mergingHash)
+ {
+ Dropset
+ *dropset = getCurrentValueFromHashTableIterator(htIter);
+ allDropsets = appendToList(dropset, allDropsets);
+ }
+ free(htIter);
+
+ List *iter = allDropsets;
+ FOR_LIST(iter)
+ {
+ Dropset
+ *dropset = (Dropset*)iter->value;
+
+ if( NOT dropset)
+ break;
+
+ if(NOT dropset->ownPrimeE || isSubsetOf(dropset->taxaToDrop, taxaToDrop) )
+ {
+ removeElementFromHash(mergingHash, dropset);
+ freeDropsetDeep(dropset, FALSE);
+ }
+ else if(haveIntersection(dropset->taxaToDrop, taxaToDrop)) /* needs reinsert */
+ {
+ removeElementFromHash(mergingHash, dropset);
+
+#ifdef MYDEBUG
+ int length = lengthIndexList(dropset->taxaToDrop);
+#endif
+
+ dropset->taxaToDrop = setMinusOf(dropset->taxaToDrop, taxaToDrop);
+
+#ifdef MYDEBUG
+ assert(length > lengthIndexList(dropset->taxaToDrop));
+#endif
+ unsigned int hv = mergingHash->hashFunction(mergingHash, dropset);
+ Dropset *found = searchHashTable(mergingHash, dropset, hv);
+ if( NOT found)
+ insertIntoHashTable(mergingHash,dropset,hv);
+ else /* reuse the merging events */
+ {
+ List
+ *iter, *next;
+ for(iter = dropset->ownPrimeE; iter; iter = next)
+ {
+ /* TODO potential error: double check, if this stuff did not already occur would be great */
+ next = iter->next;
+ iter->next = found->ownPrimeE;
+ found->ownPrimeE = iter;
+ }
+ freeIndexList(dropset->taxaToDrop);
+ free(dropset);
+ }
+ }
+ }
+ freeListFlat(allDropsets);
+}
+
+BitVector *cleanup(All *tr, HashTable *mergingHash, Dropset *bestDropset, BitVector *candidateBips, Array *bipartitionProfile, Array *bipartitionsById)
+{
+ IndexList
+ *ilIter;
+
+ BitVector
+ *bipsToVanish = CALLOC(GET_BITVECTOR_LENGTH(bipartitionsById->length), sizeof(BitVector));
+
+ /* apply merging events for best dropset */
+ candidateBips = cleanup_applyAllMergerEvents(bipartitionsById, bestDropset, bipsToVanish);
+
+ if(NOT bestDropset)
+ {
+ free(bipsToVanish);
+ return candidateBips;
+ }
+
+ /* add to list of dropped taxa */
+ ilIter = bestDropset->taxaToDrop;
+ FOR_LIST(ilIter)
+ FLIP_NTH_BIT(droppedTaxa,ilIter->index);
+
+ /* remove merging bipartitions from arrays (not candidates) */
+ cleanup_updateNumBitsAndCleanArrays(bipartitionProfile, bipartitionsById, bipsToVanish,candidateBips,bestDropset );
+ removeElementFromHash(mergingHash, bestDropset);
+ cleanup_mergingEvents(mergingHash, bipsToVanish, candidateBips, bipartitionProfile->length);
+
+ cleanup_rehashDropsets(mergingHash, bestDropset);
+
+#ifdef PRINT_VERY_VERBOSE
+ int i;
+ PR("CLEAN UP: need to recompute bipartitions ");
+ FOR_0_LIMIT(i, bipartitionProfile->length)
+ if(NTH_BIT_IS_SET(candidateBips, i))
+ PR("%d,", i);
+ PR("\n");
+#endif
+
+#ifdef MYDEBUG
+ debug_dropsetConsistencyCheck(mergingHash);
+#endif
+
+#ifdef PRINT_TIME
+ PR("[%f] executed the merging events \n", updateTime(&timeInc));
+#endif
+
+#ifdef PRINT_VERY_VERBOSE
+ PR("bips present %d (id) %d (profile)\n", getNumberOfBipsPresent(bipartitionsById), getNumberOfBipsPresent(bipartitionProfile));
+#endif
+
+ cumScore += bestDropset->improvement;
+ if(cumScore > bestCumEver)
+ bestCumEver = cumScore;
+ bestLastTime += bestDropset->improvement;
+ dropsetPerRound[dropRound+1] = bestDropset;
+ cumScores[dropRound+1] = cumScore;
+
+ printDropsetImprovement(bestDropset, tr, cumScore);
+
+ return candidateBips;
+}
+
+
+void doomRogues(All *tr, char *bootStrapFileName, char *dontDropFile, char *treeFile, boolean mreOptimisation, int rawThresh)
+{
+ double startingTime = gettime();
+ timeInc = gettime();
+
+ int
+ *indexByNumberBits,
+ i;
+
+ FILE
+ *bootstrapTreesFile = getNumberOfTrees(tr, bootStrapFileName),
+ *rogueOutput = getOutputFileFromString("droppedRogues");
+
+ BitVector
+ *candidateBips;
+
+ HashTable
+ *mergingHash = NULL;
+
+ numberOfTrees = tr->numberOfTrees;
+
+ if(strlen(treeFile))
+ {
+ rogueMode = ML_TREE_OPT;
+ if(mreOptimisation)
+ {
+ PR("ERROR: Please choose either support in the MRE consensus tree OR the bipartitions in the ML tree for optimization.\n");
+ exit(-1);
+ }
+ PR("mode: optimization of support of ML tree bipartitions in the bootstrap tree set.\n");
+ }
+ else if(mreOptimisation)
+ {
+ rogueMode = MRE_CONSENSUS_OPT;
+ thresh = tr->numberOfTrees * 0.5;
+ PR("mode: optimization on MRE consensus tree. \n");
+ }
+ else
+ {
+ rogueMode = VANILLA_CONSENSUS_OPT;
+ thresh = tr->numberOfTrees * rawThresh / 100;
+ if(thresh == tr->numberOfTrees)
+ thresh--;
+ PR("mode: optimization on consensus tree. Bipartition is part of consensus, if it occurs in more than %d trees\n", thresh);
+ }
+
+ FILE
+ *bestTree = (rogueMode == ML_TREE_OPT) ? myfopen(treeFile,"r") : NULL;
+
+ mxtips = tr->mxtips;
+ tr->bitVectorLength = GET_BITVECTOR_LENGTH(mxtips);
+
+ Array
+ *bipartitionProfile = getOriginalBipArray(tr, bestTree, bootstrapTreesFile);
+
+ if(maxDropsetSize >= mxtips - 3)
+ {
+ PR("\nMaximum dropset size (%d) too large. If we prune %d taxa, then there \n\
+ will be no bipartitions left and thus such a pruned tree set can never \n\
+ have a higher information content (in terms of RBIC) than the original \n\
+ tree.\n", maxDropsetSize, mxtips-3);
+ exit(-1);
+ }
+
+ dropsetPerRound = CALLOC(mxtips, sizeof(Dropset*));
+ Dropset
+ *bestDropset = NULL;
+
+ neglectThose = neglectThoseTaxa(tr, dontDropFile);
+
+ initializeRandForTaxa(mxtips);
+
+ treeVectorLength = GET_BITVECTOR_LENGTH(tr->numberOfTrees);
+ bitVectorLength = GET_BITVECTOR_LENGTH(tr->mxtips);
+ droppedTaxa = CALLOC(bitVectorLength, sizeof(BitVector));
+
+ paddingBits = CALLOC(GET_BITVECTOR_LENGTH(mxtips), sizeof(BitVector));
+ for(i = mxtips; i < GET_BITVECTOR_LENGTH(mxtips) * MASK_LENGTH; ++i)
+ FLIP_NTH_BIT(paddingBits,i);
+
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ {
+ ProfileElem *elem = ((ProfileElem**)bipartitionProfile->arrayTable)[i];
+ elem->numberOfBitsSet = genericBitCount(elem->bitVector, bitVectorLength);
+ }
+
+ Array
+ *bipartitionsById = CALLOC(1,sizeof(Array));
+ bipartitionsById->arrayTable = CALLOC(bipartitionProfile->length, sizeof(ProfileElem*));
+ bipartitionsById->length = bipartitionProfile->length;
+ FOR_0_LIMIT(i,bipartitionsById->length)
+ GET_PROFILE_ELEM(bipartitionsById,i) = GET_PROFILE_ELEM(bipartitionProfile, i);
+ qsort(bipartitionsById->arrayTable, bipartitionsById->length, sizeof(ProfileElem**), sortById);
+
+ numBips = bipartitionProfile->length;
+
+ cumScore = getInitScore(bipartitionProfile);
+ cumScores = CALLOC(mxtips-3, sizeof(int));
+ cumScores[0] = cumScore;
+ bestCumEver = cumScore;
+
+ bestLastTime = cumScore;
+ fprintf(rogueOutput, "num\ttaxNum\ttaxon\trawImprovement\tRBIC\n");
+ fprintf(rogueOutput, "%d\tNA\tNA\t%d\t%f\n", 0, 0, (double)cumScore /(numberOfTrees * (mxtips-3)) );
+
+ PR("[%f] initialisation done (initScore = %f, numBip=%d)\n", updateTime(&timeInc), (double)cumScore / (double)((tr->mxtips-3) * (computeSupport ? tr->numberOfTrees : 1 ) ), bipartitionsById->length);
+
+ boolean firstMerge= TRUE;
+ candidateBips = CALLOC(GET_BITVECTOR_LENGTH(bipartitionProfile->length),sizeof(BitVector));
+ FOR_0_LIMIT(i,bipartitionProfile->length)
+ FLIP_NTH_BIT(candidateBips,i);
+
+ mergingHash = createHashTable(tr->mxtips * maxDropsetSize * HASH_TABLE_SIZE_CONST,
+ NULL,
+ dropsetHashValue,
+ dropsetEqual);
+
+
+
+#ifdef PARALLEL
+ globalPArgs = CALLOC(1,sizeof(parallelArguments));
+ startThreads();
+#endif
+
+ /* main loop */
+ do
+ {
+#ifdef PRINT_VERY_VERBOSE
+ PR("ROUND %d ================================================================================================================================================================================================================\n",dropRound);
+ PR("dropped vector is: ");
+ printBitVector(droppedTaxa, GET_BITVECTOR_LENGTH(mxtips));
+ PR("\n");
+#endif
+
+ /***********/
+ /* prepare */
+ /***********/
+ bestDropset = NULL;
+ unifyBipartitionRepresentation(bipartitionProfile,droppedTaxa);
+ indexByNumberBits = createNumBitIndex(bipartitionProfile, mxtips);
+
+#ifdef PRINT_TIME
+ PR("[%f] sorting bipartition profile\n", updateTime(&timeInc));
+#endif
+
+#ifdef PRINT_VERY_VERBOSE
+ printBipartitionProfile(bipartitionProfile);
+#endif
+
+ /***********************************/
+ /* create / update merging events */
+ /***********************************/
+#ifdef PARALLEL
+ numberOfJobs = bipartitionProfile->length;
+ globalPArgs->mergingHash = mergingHash;
+ globalPArgs->candidateBips = candidateBips;
+ globalPArgs->bipartitionsById = bipartitionsById;
+ globalPArgs->bipartitionProfile = bipartitionProfile ;
+ globalPArgs->indexByNumberBits = indexByNumberBits;
+ globalPArgs->firstMerge = firstMerge;
+ masterBarrier(THREAD_GET_EVENTS, globalPArgs);
+ free(candidateBips);
+#else
+ createOrUpdateMergingHash(tr, mergingHash, bipartitionProfile, bipartitionsById, candidateBips, firstMerge, indexByNumberBits );
+#endif
+ firstMerge = FALSE;
+
+#ifdef MYDEBUG
+ debug_dropsetConsistencyCheck(mergingHash);
+ debug_mergingHashSanityCheck(mergingHash, bipartitionProfile->length);
+#endif
+
+#ifdef PRINT_TIME
+ PR("[%f] computed / updated events\n", updateTime(&timeInc));
+#endif
+
+ /* clear */
+
+ /******************/
+ /* combine events */
+ /******************/
+ if(maxDropsetSize > 1)
+ mergingHash = combineMergerEvents(mergingHash, bipartitionsById);
+
+#ifdef PRINT_TIME
+ PR("[%f] combined events\n", updateTime(&timeInc));
+#endif
+
+#ifdef PRINT_VERY_VERBOSE
+ if(mergingHash->entryCount > 0)
+ printMergingHash(mergingHash);
+#endif
+
+ /**********************/
+ /* evaluate dropsets */
+ /**********************/
+ bestDropset = evaluateEvents(mergingHash, bipartitionsById, bipartitionProfile);
+ free(indexByNumberBits);
+
+#ifdef PRINT_TIME
+ PR("[%f] calculated per dropset improvement\n", updateTime(&timeInc));
+#endif
+
+ /*****************/
+ /* cleanup */
+ /*****************/
+ candidateBips = cleanup(tr, mergingHash, bestDropset, candidateBips, bipartitionProfile, bipartitionsById);
+
+#ifdef MYDEBUG
+ int l,m;
+ FOR_0_LIMIT(l,bipartitionProfile->length)
+ {
+ ProfileElem
+ *elemA = GET_PROFILE_ELEM(bipartitionProfile,l);
+
+ if(NOT elemA )
+ continue;
+
+ for(m = l+1; m < bipartitionProfile->length; ++m)
+ {
+ ProfileElem
+ *elemB = GET_PROFILE_ELEM(bipartitionProfile,m);
+
+ if( NOT elemB)
+ continue;
+
+ if(elemA->numberOfBitsSet == elemB->numberOfBitsSet && myBitVectorEqual(elemA,elemB))
+ {
+ PR("%d and %d are equal!\n", elemA->id, elemB->id);
+ printBitVector(elemA->bitVector, bitVectorLength);
+ PR("\n");
+ printBitVector(elemB->bitVector, bitVectorLength);
+ PR("\n");
+ /* assert(0); */
+ exit(-1);
+ }
+ }
+ }
+#endif
+
+#ifdef PRINT_VERY_VERBOSE
+ PR("dropped vector is: ");
+ printBitVector(droppedTaxa, GET_BITVECTOR_LENGTH(mxtips));
+ PR("\n");
+#endif
+ if(bestDropset)
+ taxaDropped += lengthIndexList(bestDropset->taxaToDrop);
+
+ dropRound++;
+ } while(bestDropset);
+
+ /* print out result */
+ printRogueInformationToFile(tr, rogueOutput, bestCumEver,cumScores, dropsetPerRound);
+
+ PR("total time elapsed: %f\n", updateTime(&startingTime));
+
+ /* free everything */
+ FOR_0_LIMIT(i, bipartitionProfile->length)
+ {
+ ProfileElem *elem = GET_PROFILE_ELEM(bipartitionProfile,i);
+ if(elem)
+ freeProfileElem(elem);
+ }
+ free(((ProfileElemAttr*)bipartitionProfile->commonAttributes));
+ freeArray(bipartitionProfile);
+ freeArray(bipartitionsById);
+ destroyHashTable(mergingHash, freeDropsetDeepInHash);
+
+ fclose(rogueOutput);
+ for(i= 0 ; i < dropRound + 1; ++i)
+ {
+ Dropset *theDropset = dropsetPerRound[i];
+ if(theDropset)
+ freeDropsetDeepInEnd(theDropset);
+ }
+ free(dropsetPerRound);
+ free(neglectThose);
+ free(cumScores);
+ free(paddingBits);
+ free(randForTaxa);
+ free(droppedTaxa);
+ free(candidateBips);
+}
+
+
+void printHelpFile()
+{
+ printVersionInfo(FALSE);
+ printf("This program implements the RogueNaRok algorithm for rogue taxon identification.\n\nSYNTAX: ./%s -i <bootTrees> -n <runId> [-x <excludeFile>] [-c <threshold>] [-b] [-s <dropsetSize>] [-w <workingDir>] [-h]\n", programName);
+ printf("\n\tOBLIGATORY:\n");
+ printf("-i <bootTrees>\n\tA collection of bootstrap trees.\n");
+ printf("-n <runId>\n\tAn identifier for this run.\n");
+ printf("\n\tOPTIONAL:\n");
+ printf("-t <bestKnownTree>\n\tIf a single best-known tree (such as an ML or MP\n\t\
+tree) is provided, RogueNaRok optimizes the bootstrap support in this\n\t\
+best-known tree (still drawn from the bootstrap trees). The threshold\n\t\
+parameter is ignored.\n");
+ printf("-x <excludeFile>\n\ttaxa in this file (one taxon per line) will not be\n\t\
+considered for pruning.\n");
+ printf("-c <threshold>\n\t A threshold or mode for the consensus tree that is\n\t\
+optimized. Specify a value between 50 (majority rule consensus) and\n\t\
+100 (strict consensus) or MR (for the extended majority rule\n\t\
+consensus). Note that rogue taxa identified with respect to different\n\t\
+thresholds can vary substantially. DEFAULT: MR consensus\n");
+ printf("-b\n\tInstead of trying to maximize the support in the consensus tree,\n\t\
+the RogueNaRok will try to maximize the number of bipartition in the\n\t\
+final tree by pruning taxa. DEFAULT: off\n");
+ printf("-L <factor>\n\ta weight factor to penalize for dropset size. \n\t\
+Factor=1 is Pattengale's criterion. The higher the value, the more \n\t\
+conservative the algorithm is in pruning taxa. DEFAULT: 0.0 (=RBIC)\n");
+ printf("-s <dropsetSize>\n\tmaximum size of dropset per iteration. If\n\t\
+dropsetSize == n, then RogueNaRok will test in each iteration which\n\t\
+tuple of n taxa increases optimality criterion the most and prunes\n\t\
+taxa accordingly. This improves the result, but runtimes will\n\t\
+increase at least linearly. DEFAULT: 1\n");
+ printf("-w <workDir>\n\tA working directory where output files are created.\n");
+ printf("-T <num>\n\tExecute RogueNaRok in parallel with <num> threads. You need to compile the program for parallel execution first.\n");
+ printf("-h\n\tThis help file.\n");
+ printf("\nMINIMAL EXAMPLE:\n./%s -i <bootstrapTreeFile> -n run1\n", programName);
+}
+
+
+
+int main(int argc, char *argv[])
+{
+ int
+ c,
+ threshold = 50;
+
+ char
+ *excludeFile = "",
+ *bootTrees = "",
+ *treeFile = "";
+
+ boolean
+ mreOptimisation = FALSE;
+
+ if(sizeof(int) != 4)
+ {
+ printf("I am sorry, RogueNaRok currently does not support your computer architecture. The code assumes that an integer (type int) consists of 4 bytes.\n");
+ assert(sizeof(int) == 4);
+ }
+
+
+ programName = PROG_NAME;
+ programVersion = PROG_VERSION;
+ programReleaseDate = PROG_RELEASE_DATE;
+
+ while ((c = getopt (argc, argv, "i:t:n:x:whc:s:bT:L:")) != -1)
+ switch (c)
+ {
+ case 'i':
+ bootTrees = optarg;
+ break;
+ case 'T':
+ {
+#ifndef PARALLEL
+ printf("\n\nFor running RogueNaRok in parallel, please compile with \n\n");
+ exit(-1);
+#else
+ numberOfThreads = wrapStrToL(optarg);
+#endif
+ break;
+ }
+ case 'b':
+ computeSupport = FALSE;
+ break;
+ case 'n':
+ strcpy(run_id, optarg);
+ break;
+ case 't':
+ treeFile = optarg;
+ break;
+ case 's':
+ maxDropsetSize = wrapStrToL(optarg);
+ break;
+ case 'x':
+ excludeFile = optarg;
+ break;
+ case 'w':
+ strcpy(workdir, optarg) ;
+ break;
+ case 'L':
+ labelPenalty = wrapStrToDouble(optarg);
+ break;
+ case 'c':
+ {
+ if( NOT strcmp(optarg, "MRE"))
+ {
+ mreOptimisation = TRUE;
+ threshold = 50;
+ }
+ else
+ threshold = wrapStrToL(optarg);
+ break;
+ }
+ case 'h':
+ default:
+ {
+ printHelpFile();
+ abort ();
+ }
+ }
+
+ /* initialize fast bit counting */
+ compute_bits_in_16bits();
+ initializeMask();
+
+#ifdef PARALLEL
+ if(NOT numberOfThreads)
+ {
+ printf("\n\nPlease specify the number of threads for parallel execution with -T\n\n");
+ exit(-1);
+ }
+ if(numberOfThreads == 1)
+ {
+ printf("\n\nCalling parallel version of RogueNaRok with 1 thread is deprecated.\n\
+Please compile a sequential version of RogueNaRok instead.\n\n");
+ exit(-1);
+ }
+#endif
+
+ if( NOT strcmp(treeFile, ""))
+ rogueMode = ML_TREE_OPT;
+
+ if( NOT strcmp(bootTrees, ""))
+ {
+ printf("ERROR: Please specify a file containing bootstrap trees via -i.\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if( NOT strcmp(run_id, ""))
+ {
+ printf("ERROR: Please specify a run-id via -n\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if(threshold < 50)
+ {
+ printf("ERROR: Only accepting threshold values between 50 (MR) and 100 (strict).\n");
+ exit(-1);
+ }
+
+ if(threshold != 50 && strcmp(treeFile, "") )
+ {
+ printf("ERROR: threshold option -c not available in combination with best-known tree.\n");
+ exit(-1);
+ }
+
+ All
+ *tr = CALLOC(1,sizeof(All));
+ setupInfoFile();
+ if (NOT setupTree(tr, bootTrees))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+
+ doomRogues(tr,
+ bootTrees,
+ excludeFile,
+ treeFile,
+ mreOptimisation,
+ threshold);
+
+ freeTree(tr);
+ free(mask32);
+ free(infoFileName);
+
+ return 0;
+}
diff --git a/Tree.c b/Tree.c
new file mode 100644
index 0000000..92cefff
--- /dev/null
+++ b/Tree.c
@@ -0,0 +1,1462 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#include "Tree.h"
+
+static int treeGetCh (FILE *fp) ;
+static void insertHashBootstop(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, int treeVectorLength, unsigned int position);
+static void treeEchoContext (FILE *fp1, FILE *fp2, int n);
+static double getBranchLength(All *tr, int perGene, nodeptr p);
+boolean isTip(int number, int maxTips);
+void getxnode (nodeptr p);
+static void insertHashAll(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, unsigned int position);
+static void insertHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int bipNumber, unsigned int position);
+static int countHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, unsigned int position);
+
+
+static unsigned int KISS32(void)
+{
+ static unsigned int
+ x = 123456789,
+ y = 362436069,
+ z = 21288629,
+ w = 14921776,
+ c = 0;
+
+ unsigned int t;
+
+ x += 545925293;
+ y ^= (y<<13);
+ y ^= (y>>17);
+ y ^= (y<<5);
+ t = z + w + c;
+ z = w;
+ c = (t>>31);
+ w = t & 2147483647;
+
+ return (x+y+w);
+}
+
+
+stringHashtable *initStringHashTable(unsigned int n)
+{
+ static const unsigned int initTable[] = {53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317,
+ 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843,
+ 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741};
+
+ stringHashtable *h = (stringHashtable*)malloc(sizeof(stringHashtable));
+
+ unsigned int
+ tableSize,
+ i,
+ primeTableLength = sizeof(initTable)/sizeof(initTable[0]);
+
+
+
+ i = 0;
+
+ while(initTable[i] < n && i < primeTableLength)
+ i++;
+
+ assert(i < primeTableLength);
+
+ tableSize = initTable[i];
+
+ h->table = (stringEntry**)calloc(tableSize, sizeof(stringEntry*));
+ h->tableSize = tableSize;
+
+ return h;
+}
+
+
+static unsigned int hashString(char *p, unsigned int tableSize)
+{
+ unsigned int h = 0;
+
+ for(; *p; p++)
+ h = 31 * h + *p;
+
+ return (h % tableSize);
+}
+
+
+void addword(char *s, stringHashtable *h, int nodeNumber)
+{
+ unsigned int position = hashString(s, h->tableSize);
+ stringEntry *p = h->table[position];
+
+ for(; p!= NULL; p = p->next)
+ {
+ if(strcmp(s, p->word) == 0)
+ return;
+ }
+
+ p = (stringEntry *)malloc(sizeof(stringEntry));
+
+ assert(p);
+
+ p->nodeNumber = nodeNumber;
+ p->word = (char *)malloc((strlen(s) + 1) * sizeof(char));
+
+ strcpy(p->word, s);
+ /* free(s); */
+
+ p->next = h->table[position];
+
+ h->table[position] = p;
+}
+
+
+int getNumberOfTaxa(All *tr, char *bootStrapFile)
+{
+ FILE *f = myfopen(bootStrapFile, "rb");
+
+ char
+ **nameList,
+ buffer[nmlngth + 2];
+
+ int
+ i = 0,
+ c,
+ taxaSize = 1024,
+ taxaCount = 0;
+
+ nameList = (char**)malloc(sizeof(char*) * taxaSize);
+
+ while((c = fgetc(f)) != ';')
+ {
+ if(c == '(' || c == ',')
+ {
+ c = fgetc(f);
+ if(c == '(' || c == ',')
+ ungetc(c, f);
+ else
+ {
+ i = 0;
+
+ do
+ {
+ buffer[i++] = c;
+ c = fgetc(f);
+ }
+ while(c != ':' && c != ')' && c != ',');
+ buffer[i] = '\0';
+
+ for(i = 0; i < taxaCount; i++)
+ {
+ if(strcmp(buffer, nameList[i]) == 0)
+ {
+ printf("A taxon labelled by %s appears twice in the first tree of tree collection %s, exiting ...\n", buffer, bootStrapFile);
+ exit(-1);
+ }
+ }
+
+ if(taxaCount == taxaSize)
+ {
+ taxaSize *= 2;
+ nameList = (char **)realloc(nameList, sizeof(char*) * taxaSize);
+ }
+
+ nameList[taxaCount] = (char*)malloc(sizeof(char) * (strlen(buffer) + 1));
+ strcpy(nameList[taxaCount], buffer);
+
+ taxaCount++;
+
+ ungetc(c, f);
+ }
+ }
+ }
+
+ printf("Found a total of %d taxa in first tree of tree collection %s\n", taxaCount, bootStrapFile);
+ printf("Expecting all remaining trees in collection to have the same taxon set\n\n");
+
+ tr->nameList = (char **)malloc(sizeof(char *) * (taxaCount + 1));
+ for(i = 1; i <= taxaCount; i++)
+ tr->nameList[i] = nameList[i - 1];
+
+ free(nameList);
+
+ tr->nameHash = initStringHashTable(10 * taxaCount);
+ for(i = 1; i <= taxaCount; i++)
+ addword(tr->nameList[i], tr->nameHash, i);
+
+ fclose(f);
+
+ return taxaCount;
+}
+
+
+boolean setupTree (All *tr, char *bootstrapFile)
+{
+ nodeptr p0, p, q;
+ int
+ i,
+ j,
+ k,
+ tips,
+ inter;
+
+ tips = getNumberOfTaxa(tr, bootstrapFile);
+ tr->mxtips = tips;
+
+ tips = tr->mxtips;
+ inter = tr->mxtips - 1;
+ tr->numberOfTrees = -1;
+
+ if (NOT(p0 = (nodeptr) malloc((tips + 3*inter) * sizeof(node))))
+ {
+ printf("ERROR: Unable to obtain sufficient tree memory\n");
+ return FALSE;
+ }
+
+ if (NOT(tr->nodep = (nodeptr *) malloc((2*tr->mxtips) * sizeof(nodeptr))))
+ {
+ printf("ERROR: Unable to obtain sufficient tree memory, too\n");
+ return FALSE;
+ }
+
+ tr->nodep[0] = (node *) NULL; /* Use as 1-based array */
+
+ for (i = 1; i <= tips; i++)
+ {
+ p = p0++;
+
+ p->hash = KISS32(); /* hast table stuff */
+ p->x = 0;
+ p->number = i;
+ p->next = p;
+ p->back = (node *)NULL;
+ p->bInf = (branchInfo *)NULL;
+
+
+ for(k = 0; k < NUM_BRANCHES; k++)
+ {
+ p->xs[k] = 0;
+ p->backs[k] = (nodeptr)NULL;
+ }
+
+ for(k = 0; k < VECTOR_LENGTH; k++)
+ p->isPresent[k] = 0;
+
+ tr->nodep[i] = p;
+ }
+
+ for (i = tips + 1; i <= tips + inter; i++)
+ {
+ q = (node *) NULL;
+ for (j = 1; j <= 3; j++)
+ {
+ p = p0++;
+ if(j == 1)
+ p->x = 1;
+ else
+ p->x = 0;
+ p->number = i;
+ p->next = q;
+ p->bInf = (branchInfo *)NULL;
+ p->back = (node *) NULL;
+ p->hash = 0;
+
+ if(j == 1)
+ for(k = 0; k < NUM_BRANCHES; k++)
+ {
+ p->xs[k] = 1;
+ p->backs[k] = (nodeptr)NULL;
+ }
+ else
+ for(k = 0; k < NUM_BRANCHES; k++)
+ {
+ p->xs[k] = 0;
+ p->backs[k] = (nodeptr)NULL;
+ }
+
+ for(k = 0; k < VECTOR_LENGTH; k++)
+ p->isPresent[k] = 0;
+
+
+ q = p;
+ }
+ p->next->next->next = p;
+ tr->nodep[i] = p;
+ }
+
+ tr->start = (node *) NULL;
+
+ tr->ntips = 0;
+ tr->nextnode = 0;
+
+ for(i = 0; i < tr->numBranches; i++)
+ tr->partitionSmoothed[i] = FALSE;
+
+ return TRUE;
+}
+
+
+nodeptr findAnyTip(nodeptr p, int numsp)
+{
+ return isTip(p->number, numsp) ? p : findAnyTip(p->next->back, numsp);
+}
+
+
+void hookup (nodeptr p, nodeptr q, double *z, int numBranches)
+{
+ int i;
+
+ p->back = q;
+ q->back = p;
+
+ for(i = 0; i < numBranches; i++)
+ p->z[i] = q->z[i] = z[i];
+}
+
+
+static void newviewBipartitions(BitVector **bitVectors, nodeptr p, int numsp, unsigned int vectorLength)
+{
+ if(isTip(p->number, numsp))
+ return;
+ {
+ nodeptr
+ q = p->next->back,
+ r = p->next->next->back;
+ unsigned int
+ *vector = bitVectors[p->number],
+ *left = bitVectors[q->number],
+ *right = bitVectors[r->number];
+ unsigned
+ int i;
+
+ while(NOT p->x)
+ {
+ if(NOT p->x)
+ getxnode(p);
+ }
+
+ p->hash = q->hash ^ r->hash;
+
+ if(isTip(q->number, numsp) && isTip(r->number, numsp))
+ {
+ for(i = 0; i < vectorLength; i++)
+ vector[i] = left[i] | right[i];
+ }
+ else
+ {
+ if(isTip(q->number, numsp) || isTip(r->number, numsp))
+ {
+ if(isTip(r->number, numsp))
+ {
+ nodeptr tmp = r;
+ r = q;
+ q = tmp;
+ }
+
+ while(NOT r->x)
+ {
+ if(NOT r->x)
+ newviewBipartitions(bitVectors, r, numsp, vectorLength);
+ }
+
+ for(i = 0; i < vectorLength; i++)
+ vector[i] = left[i] | right[i];
+ }
+ else
+ {
+ while((NOT r->x) || (NOT q->x))
+ {
+ if(NOT q->x)
+ newviewBipartitions(bitVectors, q, numsp, vectorLength);
+ if(NOT r->x)
+ newviewBipartitions(bitVectors, r, numsp, vectorLength);
+ }
+
+ for(i = 0; i < vectorLength; i++)
+ vector[i] = left[i] | right[i];
+ }
+
+ }
+ }
+}
+
+
+
+void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, hashtable *h, int treeNumber, int function, branchInfo *bInf, int *countBranches, int treeVectorLength, boolean traverseOnly, boolean computeWRF)
+{
+ if(isTip(p->number, numsp))
+ return;
+ else
+ {
+ nodeptr q = p->next;
+
+ do
+ {
+ bitVectorInitravSpecial(bitVectors, q->back, numsp, vectorLength, h, treeNumber, function, bInf, countBranches, treeVectorLength, traverseOnly, computeWRF);
+ q = q->next;
+ }
+ while(q != p);
+
+ newviewBipartitions(bitVectors, p, numsp, vectorLength);
+
+ assert(p->x);
+
+ if(traverseOnly)
+ {
+ if(NOT(isTip(p->back->number, numsp)))
+ *countBranches = *countBranches + 1;
+ return;
+ }
+
+ if(NOT(isTip(p->back->number, numsp)))
+ {
+ unsigned int *toInsert = bitVectors[p->number];
+ unsigned int position = p->hash % h->tableSize;
+
+ switch(function)
+ {
+ case BIPARTITIONS_ALL:
+ insertHashAll(toInsert, h, vectorLength, treeNumber, position);
+ *countBranches = *countBranches + 1;
+ break;
+ case GET_BIPARTITIONS_BEST:
+ insertHash(toInsert, h, vectorLength, *countBranches, position);
+
+ p->bInf = &bInf[*countBranches];
+ p->back->bInf = &bInf[*countBranches];
+ p->bInf->support = 0;
+ p->bInf->oP = p;
+ p->bInf->oQ = p->back;
+
+ *countBranches = *countBranches + 1;
+ break;
+ case DRAW_BIPARTITIONS_BEST:
+ {
+ int found = countHash(toInsert, h, vectorLength, position);
+ if(found >= 0)
+ bInf[found].support = bInf[found].support + 1;
+ *countBranches = *countBranches + 1;
+ }
+ break;
+ case BIPARTITIONS_BOOTSTOP:
+ insertHashBootstop(toInsert, h, vectorLength, treeNumber, treeVectorLength, position);
+ *countBranches = *countBranches + 1;
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ }
+}
+
+
+
+
+void hookupDefault (nodeptr p, nodeptr q, int numBranches)
+{
+ int i;
+
+ p->back = q;
+ q->back = p;
+
+ for(i = 0; i < numBranches; i++)
+ p->z[i] = q->z[i] = defaultz;
+}
+
+
+static boolean treeLabelEnd (int ch)
+{
+ switch (ch)
+ {
+ case EOF:
+ case '\0':
+ case '\t':
+ case '\n':
+ case '\r':
+ case ' ':
+ case ':':
+ case ',':
+ case '(':
+ case ')':
+ case ';':
+ return TRUE;
+ default:
+ break;
+ }
+ return FALSE;
+}
+
+static boolean treeGetLabel (FILE *fp, char *lblPtr, int maxlen)
+{
+ int ch;
+ boolean done, quoted, lblfound;
+
+ if (--maxlen < 0)
+ lblPtr = (char *) NULL;
+ else
+ if (lblPtr == NULL)
+ maxlen = 0;
+
+ ch = getc(fp);
+ done = treeLabelEnd(ch);
+
+ lblfound = NOT done;
+ quoted = (ch == '\'');
+ if (quoted && NOT done)
+ {
+ ch = getc(fp);
+ done = (ch == EOF);
+ }
+
+ while (NOT done)
+ {
+ if (quoted)
+ {
+ if (ch == '\'')
+ {
+ ch = getc(fp);
+ if (ch != '\'')
+ break;
+ }
+ }
+ else
+ if (treeLabelEnd(ch)) break;
+
+ if (--maxlen >= 0) *lblPtr++ = ch;
+ ch = getc(fp);
+ if (ch == EOF) break;
+ }
+
+ if (ch != EOF) (void) ungetc(ch, fp);
+
+ if (lblPtr != NULL) *lblPtr = '\0';
+
+ return lblfound;
+}
+
+static boolean treeFlushLabel (FILE *fp)
+{
+ return treeGetLabel(fp, (char *) NULL, (int) 0);
+}
+
+int lookupWord(char *s, stringHashtable *h)
+{
+ unsigned int position = hashString(s, h->tableSize);
+ stringEntry *p = h->table[position];
+
+ for(; p!= NULL; p = p->next)
+ {
+ if(strcmp(s, p->word) == 0)
+ return p->nodeNumber;
+ }
+
+ return -1;
+}
+
+
+int treeFindTipByLabelString(char *str, All *tr)
+{
+ int lookup = lookupWord(str, tr->nameHash);
+
+ if(lookup > 0)
+ {
+ /* assert(! tr->nodep[lookup]->back); */
+ return lookup;
+ }
+ else
+ {
+ printf("ERROR: Cannot find tree species: %s\n", str);
+ return 0;
+ }
+}
+
+
+int treeFindTipName(FILE *fp, All *tr)
+{
+ char str[nmlngth+2];
+ int n;
+
+ if(treeGetLabel(fp, str, nmlngth+2))
+ n = treeFindTipByLabelString(str, tr);
+ else
+ n = 0;
+
+
+ return n;
+}
+
+
+static boolean treeProcessLength (FILE *fp, double *dptr)
+{
+ int ch;
+
+ if ((ch = treeGetCh(fp)) == EOF) return FALSE; /* Skip comments */
+ (void) ungetc(ch, fp);
+
+ if (fscanf(fp, "%lf", dptr) != 1) {
+ printf("ERROR: treeProcessLength: Problem reading branch length\n");
+ treeEchoContext(fp, stdout, 40);
+ printf("\n");
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+
+static int treeFlushLen (FILE *fp)
+{
+ double dummy;
+ int ch;
+
+ ch = treeGetCh(fp);
+
+ if (ch == ':')
+ {
+ ch = treeGetCh(fp);
+
+ ungetc(ch, fp);
+ if(NOT treeProcessLength(fp, & dummy)) return 0;
+ return 1;
+ }
+
+
+
+ if (ch != EOF) (void) ungetc(ch, fp);
+ return 1;
+}
+
+
+boolean isTip(int number, int maxTips)
+{
+ assert(number > 0);
+
+ if(number <= maxTips)
+ return TRUE;
+ else
+ return FALSE;
+}
+
+
+static char *Tree2StringREC(char *treestr, All *tr, nodeptr p, boolean printBranchLengths, boolean printNames, boolean printLikelihood, boolean rellTree, boolean finalPrint, int perGene, boolean branchLabelSupport, boolean printSHSupport)
+{
+ char *nameptr;
+
+ if(isTip(p->number, tr->mxtips))
+ {
+ if(printNames)
+ {
+ nameptr = tr->nameList[p->number];
+ sprintf(treestr, "%s", nameptr);
+ }
+ else
+ sprintf(treestr, "%d", p->number);
+
+ while (*treestr) treestr++;
+ }
+ else
+ {
+ *treestr++ = '(';
+ treestr = Tree2StringREC(treestr, tr, p->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+ finalPrint, perGene, branchLabelSupport, printSHSupport);
+ *treestr++ = ',';
+ treestr = Tree2StringREC(treestr, tr, p->next->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+ finalPrint, perGene, branchLabelSupport, printSHSupport);
+ if(p == tr->start->back)
+ {
+ *treestr++ = ',';
+ treestr = Tree2StringREC(treestr, tr, p->back, printBranchLengths, printNames, printLikelihood, rellTree,
+ finalPrint, perGene, branchLabelSupport, printSHSupport);
+ }
+ *treestr++ = ')';
+ }
+
+ if(p == tr->start->back)
+ {
+ if(printBranchLengths && NOT rellTree)
+ sprintf(treestr, ":0.0;\n");
+ else
+ sprintf(treestr, ";\n");
+ }
+ else
+ {
+ if(rellTree || branchLabelSupport || printSHSupport)
+ {
+ if(( NOT isTip(p->number, tr->mxtips)) &&
+ ( NOT isTip(p->back->number, tr->mxtips)))
+ {
+ assert(p->bInf != (branchInfo *)NULL);
+
+ if(rellTree)
+ sprintf(treestr, "%d:%8.20f", p->bInf->support, p->z[0]);
+ if(branchLabelSupport)
+ sprintf(treestr, ":%8.20f[%d]", p->z[0], p->bInf->support);
+ if(printSHSupport)
+ sprintf(treestr, ":%8.20f[%d]", getBranchLength(tr, perGene, p), p->bInf->support);
+
+ }
+ else
+ {
+ if(rellTree || branchLabelSupport)
+ sprintf(treestr, ":%8.20f", p->z[0]);
+ if(printSHSupport)
+ sprintf(treestr, ":%8.20f", getBranchLength(tr, perGene, p));
+ }
+ }
+ else
+ {
+ if(printBranchLengths)
+ /* sprintf(treestr, ":%8.20f", getBranchLength(tr, perGene, p)); */
+ sprintf(treestr, ":%8.20f", p->z[0]);
+ else
+ sprintf(treestr, "%s", "\0");
+ }
+ }
+
+ while (*treestr) treestr++;
+ return treestr;
+}
+
+
+static double getBranchLength(All *tr, int perGene, nodeptr p)
+{
+ double
+ z = 0.0,
+ x = 0.0;
+
+ assert(perGene != NO_BRANCHES);
+ assert(tr->fracchange != -1.0);
+ z = p->z[0];
+ if (z < zmin)
+ z = zmin;
+
+ x = -log(z) * tr->fracchange;
+
+ return x; /* here! */
+}
+
+
+static nodeptr uprootTree (All *tr, nodeptr p, boolean readBranchLengths, boolean readConstraint)
+{
+ nodeptr q, r, s, start;
+ int n, i;
+
+ for(i = tr->mxtips + 1; i < 2 * tr->mxtips - 1; i++)
+ assert(i == tr->nodep[i]->number);
+
+
+ if(isTip(p->number, tr->mxtips) || p->back)
+ {
+ printf("ERROR: Unable to uproot tree.\n");
+ printf(" Inappropriate node marked for removal.\n");
+ assert(0);
+ }
+
+ assert(p->back == (nodeptr)NULL);
+
+ tr->nextnode = tr->nextnode - 1;
+
+ assert(tr->nextnode < 2 * tr->mxtips);
+
+ n = tr->nextnode;
+
+ assert(tr->nodep[tr->nextnode]);
+
+ if (n != tr->mxtips + tr->ntips - 1)
+ {
+ printf("ERROR: Unable to uproot tree. Inconsistent\n");
+ printf(" number of tips and nodes for rooted tree.\n");
+ assert(0);
+ }
+
+ q = p->next->back; /* remove p from tree */
+ r = p->next->next->back;
+ assert(p->back == (nodeptr)NULL);
+
+ if(readBranchLengths)
+ {
+ double b[NUM_BRANCHES];
+ int i;
+ for(i = 0; i < tr->numBranches; i++)
+ b[i] = (r->z[i] + q->z[i]);
+ hookup (q, r, b, tr->numBranches);
+ }
+ else
+ hookupDefault(q, r, tr->numBranches);
+
+ if(readConstraint && tr->grouped)
+ {
+ if(tr->constraintVector[p->number] != 0)
+ {
+ printf("Root node to remove should have top-level grouping of 0\n");
+ assert(0);
+ }
+ }
+
+ assert(NOT(isTip(r->number, tr->mxtips) && isTip(q->number, tr->mxtips)));
+
+ assert(p->number > tr->mxtips);
+
+ if(tr->ntips > 2 && p->number != n)
+ {
+ q = tr->nodep[n]; /* transfer last node's conections to p */
+ r = q->next;
+ s = q->next->next;
+
+ if(readConstraint && tr->grouped)
+ tr->constraintVector[p->number] = tr->constraintVector[q->number];
+
+ hookup(p, q->back, q->z, tr->numBranches); /* move connections to p */
+ hookup(p->next, r->back, r->z, tr->numBranches);
+ hookup(p->next->next, s->back, s->z, tr->numBranches);
+
+ q->back = q->next->back = q->next->next->back = (nodeptr) NULL;
+ }
+ else
+ p->back = p->next->back = p->next->next->back = (nodeptr) NULL;
+
+ assert(tr->ntips > 2);
+
+ start = findAnyTip(tr->nodep[tr->mxtips + 1], tr->mxtips);
+
+ assert(isTip(start->number, tr->mxtips));
+ tr->rooted = FALSE;
+ return start;
+}
+
+
+char *Tree2String(char *treestr, All *tr, nodeptr p,
+ boolean printBranchLengths, boolean printNames,
+ boolean printLikelihood, boolean rellTree,
+ boolean finalPrint, int perGene,
+ boolean branchLabelSupport, boolean printSHSupport)
+{
+ Tree2StringREC(treestr, tr, p, printBranchLengths, printNames, printLikelihood, rellTree,
+ finalPrint, perGene, branchLabelSupport, printSHSupport);
+
+ while (*treestr) treestr++;
+
+ return treestr;
+}
+
+
+boolean whitechar (int ch)
+{
+ return (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r');
+}
+
+
+static void treeEchoContext (FILE *fp1, FILE *fp2, int n)
+{ /* treeEchoContext */
+ int ch;
+ boolean waswhite;
+
+ waswhite = TRUE;
+
+ while (n > 0 && ((ch = getc(fp1)) != EOF)) {
+ if (whitechar(ch)) {
+ ch = waswhite ? '\0' : ' ';
+ waswhite = TRUE;
+ }
+ else {
+ waswhite = FALSE;
+ }
+
+ if (ch > '\0') {putc(ch, fp2); n--;}
+ }
+}
+
+
+int treeFinishCom (FILE *fp, char **strp)
+{
+ int ch;
+
+ while ((ch = getc(fp)) != EOF && ch != ']') {
+ if (strp != NULL) *(*strp)++ = ch; /* save character */
+ if (ch == '[') { /* nested comment; find its end */
+ if ((ch = treeFinishCom(fp, strp)) == EOF) break;
+ if (strp != NULL) *(*strp)++ = ch; /* save closing ] */
+ }
+ }
+
+ if (strp != NULL) **strp = '\0'; /* terminate string */
+ return ch;
+}
+
+
+static int treeGetCh (FILE *fp)
+{
+ int ch;
+
+ while ((ch = getc(fp)) != EOF) {
+ if (whitechar(ch)) ;
+ else if (ch == '[') { /* comment; find its end */
+ if ((ch = treeFinishCom(fp, (char **) NULL)) == EOF) break;
+ }
+ else break;
+ }
+
+ return ch;
+}
+
+
+static boolean treeNeedCh (FILE *fp, int c1, char *where)
+{
+ int c2;
+
+ if ((c2 = treeGetCh(fp)) == c1) return TRUE;
+
+ printf("ERROR: Expecting '%c' %s tree; found:", c1, where);
+ if (c2 == EOF)
+ {
+ printf("End-of-File");
+ }
+ else
+ {
+ ungetc(c2, fp);
+ treeEchoContext(fp, stdout, 40);
+ }
+ putchar('\n');
+
+ if(c1 == ':')
+ printf("RAxML may be expecting to read a tree that contains branch lengths\n");
+
+ return FALSE;
+}
+
+
+static boolean addElementLen (FILE *fp, All *tr, nodeptr p, boolean readBranchLengths, boolean readNodeLabels, int *lcount)
+{
+ nodeptr q;
+ int n, ch, fres;
+
+ if ((ch = treeGetCh(fp)) == '(')
+ {
+ n = (tr->nextnode)++;
+ if (n > 2*(tr->mxtips) - 2)
+ {
+ if (tr->rooted || n > 2*(tr->mxtips) - 1)
+ {
+ printf("ERROR: Too many internal nodes. Is tree rooted?\n");
+ printf(" Deepest splitting should be a trifurcation.\n");
+ return FALSE;
+ }
+ else
+ {
+ assert(NOT readNodeLabels);
+ tr->rooted = TRUE;
+ }
+ }
+
+ q = tr->nodep[n];
+
+ if (NOT addElementLen(fp, tr, q->next, readBranchLengths, readNodeLabels, lcount)) return FALSE;
+ if (NOT treeNeedCh(fp, ',', "in")) return FALSE;
+ if (NOT addElementLen(fp, tr, q->next->next, readBranchLengths, readNodeLabels, lcount)) return FALSE;
+ if (NOT treeNeedCh(fp, ')', "in")) return FALSE;
+
+ if(readNodeLabels)
+ {
+ char label[64];
+ int support;
+
+ if(treeGetLabel (fp, label, 10))
+ {
+ int val = sscanf(label, "%d", &support);
+ assert(val == 1);
+ if(val != 1 )
+ PR("error\n");
+
+ /*printf("LABEL %s Number %d\n", label, support);*/
+ p->support = q->support = support;
+ /*printf("%d %d %d %d\n", p->support, q->support, p->number, q->number);*/
+ assert(p->number > tr->mxtips && q->number > tr->mxtips);
+ *lcount = *lcount + 1;
+ }
+ }
+ else
+ (void) treeFlushLabel(fp);
+ }
+ else
+ {
+ ungetc(ch, fp);
+ if ((n = treeFindTipName(fp, tr)) <= 0) return FALSE;
+ q = tr->nodep[n];
+ if (tr->start->number > n) tr->start = q;
+ (tr->ntips)++;
+ }
+
+ if(readBranchLengths)
+ {
+ double branch;
+ if (NOT treeNeedCh(fp, ':', "in")) return FALSE;
+ if (NOT treeProcessLength(fp, &branch)) return FALSE;
+
+ /* printf("Branch %8.20f %d\n", branch, tr->numBranches); */
+ hookup(p, q, &branch, tr->numBranches);
+ /* PR(">>> %d\n", tr->numBranches); */
+ }
+ else
+ {
+ fres = treeFlushLen(fp);
+ if(NOT fres) return FALSE;
+
+ hookupDefault(p, q, tr->numBranches);
+ }
+ return TRUE;
+}
+
+
+int getTreeStringLength(char *fileName)
+{
+ int
+ i = 0;
+ char
+ cc;
+ FILE
+ *f = myfopen(fileName,"r");
+
+ while((cc=getc(f))!='\n') i++;
+ fclose(f);
+
+ return i;
+}
+
+
+int treeReadLen (FILE *fp, All *tr,
+ boolean readBranches, boolean readNodeLabels,
+ boolean topologyOnly, boolean completeTree)
+{
+ nodeptr
+ p;
+
+ int
+ i,
+ ch,
+ lcount = 0;
+
+ for (i = 1; i <= tr->mxtips; i++)
+ {
+ tr->nodep[i]->back = (nodeptr) NULL;
+ if(topologyOnly)
+ tr->nodep[i]->support = -1;
+ }
+
+ for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
+ {
+ tr->nodep[i]->back = (nodeptr)NULL;
+ tr->nodep[i]->next->back = (nodeptr)NULL;
+ tr->nodep[i]->next->next->back = (nodeptr)NULL;
+ tr->nodep[i]->number = i;
+ tr->nodep[i]->next->number = i;
+ tr->nodep[i]->next->next->number = i;
+
+ if(topologyOnly)
+ {
+ tr->nodep[i]->support = -2;
+ tr->nodep[i]->next->support = -2;
+ tr->nodep[i]->next->next->support = -2;
+ }
+ }
+
+ if(topologyOnly)
+ tr->start = tr->nodep[tr->mxtips];
+ else
+ tr->start = tr->nodep[1];
+
+ tr->ntips = 0;
+ tr->nextnode = tr->mxtips + 1;
+
+ for(i = 0; i < tr->numBranches; i++)
+ tr->partitionSmoothed[i] = FALSE;
+
+ tr->rooted = FALSE;
+
+ p = tr->nodep[(tr->nextnode)++];
+
+ while((ch = treeGetCh(fp)) != '(');
+
+ if(NOT topologyOnly)
+ assert(readBranches == FALSE && readNodeLabels == FALSE);
+
+
+ if (NOT addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount))
+ assert(0);
+ if (NOT treeNeedCh(fp, ',', "in"))
+ assert(0);
+ if (NOT addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount))
+ assert(0);
+ if (NOT tr->rooted)
+ {
+ if ((ch = treeGetCh(fp)) == ',')
+ {
+ if (NOT addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount))
+ assert(0);
+ }
+ else
+ { /* A rooted format */
+ tr->rooted = TRUE;
+ if (ch != EOF) (void) ungetc(ch, fp);
+ }
+ }
+ else
+ {
+ p->next->next->back = (nodeptr) NULL;
+ }
+ if (NOT treeNeedCh(fp, ')', "in"))
+ assert(0);
+
+ if(topologyOnly)
+ assert(NOT(tr->rooted && readNodeLabels));
+
+ (void) treeFlushLabel(fp);
+
+ if (NOT treeFlushLen(fp))
+ assert(0);
+
+ if (NOT treeNeedCh(fp, ';', "at end of"))
+ assert(0);
+
+ if (tr->rooted)
+ {
+ assert(NOT readNodeLabels);
+
+ p->next->next->back = (nodeptr) NULL;
+ tr->start = uprootTree(tr, p->next->next, FALSE, FALSE);
+ if (NOT tr->start)
+ {
+ printf("FATAL ERROR UPROOTING TREE\n");
+ assert(0);
+ }
+ }
+ else
+ tr->start = findAnyTip(p, tr->mxtips);
+
+ if(NOT topologyOnly)
+ {
+ if(tr->ntips < tr->mxtips)
+ {
+ if(completeTree)
+ {
+ printBothOpen("Hello this is your friendly RAxML tree parsing routine\n");
+ printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n");
+ printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips);
+ exit(-1);
+ }
+ }
+ }
+
+
+ return lcount;
+}
+
+
+void getxnode (nodeptr p)
+{
+ nodeptr s;
+
+ if ((s = p->next)->x || (s = s->next)->x)
+ {
+ p->x = s->x;
+ s->x = 0;
+ }
+
+ assert(p->x);
+}
+
+
+entry *initEntry(void)
+{
+ entry *e = (entry*)CALLOC(1,sizeof(entry));
+
+ e->bitVector = (unsigned int*)NULL;
+ e->treeVector = (unsigned int*)NULL;
+ e->supportVector = (int*)NULL;
+ e->bipNumber = 0;
+ e->bipNumber2 = 0;
+ e->supportFromTreeset[0] = 0;
+ e->supportFromTreeset[1] = 0;
+ e->next = (entry*)NULL;
+
+ return e;
+}
+
+
+static void insertHashAll(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, unsigned int position)
+{
+ if(h->table[position] != NULL)
+ {
+ entry *e = h->table[position];
+
+ do
+ {
+ unsigned int i;
+
+ for(i = 0; i < vectorLength; i++)
+ if(bitVector[i] != e->bitVector[i])
+ break;
+
+ if(i == vectorLength)
+ {
+ if(treeNumber == 0)
+ e->bipNumber = e->bipNumber + 1;
+ else
+ e->bipNumber2 = e->bipNumber2 + 1;
+ return;
+ }
+
+ e = e->next;
+ }
+ while(e != (entry*)NULL);
+
+ e = initEntry();
+
+ e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int));
+ /* e->bitVector = (unsigned int*)malloc_aligned(vectorLength * sizeof(unsigned int)); */
+ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+
+ memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+ if(treeNumber == 0)
+ e->bipNumber = 1;
+ else
+ e->bipNumber2 = 1;
+
+ e->next = h->table[position];
+ h->table[position] = e;
+ }
+ else
+ {
+ entry *e = initEntry();
+
+ e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int));
+ /* e->bitVector = (unsigned int*)malloc_aligned(vectorLength * sizeof(unsigned int)); */
+ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+ memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+ if(treeNumber == 0)
+ e->bipNumber = 1;
+ else
+ e->bipNumber2 = 1;
+
+ h->table[position] = e;
+ }
+
+ h->entryCount = h->entryCount + 1;
+}
+
+
+static void insertHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int bipNumber, unsigned int position)
+{
+ entry *e = initEntry();
+
+ e->bipNumber = bipNumber;
+ /*e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int)); */
+
+ e->bitVector = (unsigned int*)CALLOC(vectorLength , sizeof(unsigned int));
+ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+ memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+ if(h->table[position] != NULL)
+ {
+ e->next = h->table[position];
+ h->table[position] = e;
+ }
+ else
+ h->table[position] = e;
+
+ h->entryCount = h->entryCount + 1;
+}
+
+
+static int countHash(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, unsigned int position)
+{
+ if(h->table[position] == NULL)
+ return -1;
+ {
+ entry *e = h->table[position];
+
+ do
+ {
+ unsigned int i;
+
+ for(i = 0; i < vectorLength; i++)
+ if(bitVector[i] != e->bitVector[i])
+ goto NEXT;
+
+ return (e->bipNumber);
+ NEXT:
+ e = e->next;
+ }
+ while(e != (entry*)NULL);
+
+ return -1;
+ }
+}
+
+
+static void insertHashBootstop(unsigned int *bitVector, hashtable *h, unsigned int vectorLength, int treeNumber, int treeVectorLength, unsigned int position)
+{
+ if(h->table[position] != NULL)
+ {
+ entry *e = h->table[position];
+
+ do
+ {
+ unsigned int i;
+
+ for(i = 0; i < vectorLength; i++)
+ if(bitVector[i] != e->bitVector[i])
+ break;
+
+ if(i == vectorLength)
+ {
+ e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH];
+ return;
+ }
+
+ e = e->next;
+ }
+ while(e != (entry*)NULL);
+
+ e = initEntry();
+
+ e->bipNumber = h->entryCount;
+
+ /*e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int));*/
+ e->bitVector = (unsigned int*)CALLOC(vectorLength, sizeof(unsigned int));
+ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+
+ e->treeVector = (unsigned int*)CALLOC(treeVectorLength, sizeof(unsigned int));
+
+ e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH];
+ memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+ e->next = h->table[position];
+ h->table[position] = e;
+ }
+ else
+ {
+ entry *e = initEntry();
+
+ e->bipNumber = h->entryCount;
+
+ e->bitVector = (unsigned int*)CALLOC(vectorLength , sizeof(unsigned int));
+ memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+ e->treeVector = (unsigned int*)CALLOC(treeVectorLength, sizeof(unsigned int));
+
+ e->treeVector[treeNumber / MASK_LENGTH] |= mask32[treeNumber % MASK_LENGTH];
+ memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+ h->table[position] = e;
+ }
+
+ h->entryCount = h->entryCount + 1;
+}
+
+
+FILE *getNumberOfTrees(All *tr, char *fileName)
+{
+ FILE
+ *f = myfopen(fileName, "r");
+
+ int
+ trees = 0,
+ ch;
+
+ while((ch = fgetc(f)) != EOF)
+ if(ch == ';')
+ trees++;
+
+ assert(trees > 0);
+
+ tr->numberOfTrees = trees;
+
+ rewind(f);
+
+ return f;
+}
+
+
+/* INTERFACE TO OUTSIDE WOLRD */
+void readBestTree(All *tr, FILE *file)
+{
+ treeReadLen(file, tr, TRUE, FALSE, TRUE, TRUE);
+}
+
+
+void readBootstrapTree(All *tr, FILE *file)
+{
+ treeReadLen(file, tr, FALSE, FALSE, TRUE, TRUE);
+}
+
+
+char *writeTreeToString(All *tr, boolean printBranchLengths)
+{
+ Tree2String(tr->tree_string, tr, tr->start->back, /* */
+ printBranchLengths, TRUE,
+ FALSE, FALSE,
+ TRUE, 0,
+ FALSE, FALSE);
+ return tr->tree_string;
+}
+
+void freeTree(All *tr)
+{
+ int i;
+ for(i = 1; i <= tr->mxtips; ++i)
+ free(tr->nameList[i]);
+ free(tr->nameList);
+
+ for(i = 0; i < tr->nameHash->tableSize; ++i)
+ {
+ stringEntry *elem = tr->nameHash->table[i];
+ while(elem)
+ {
+ stringEntry *iter = elem->next;
+ free(elem->word);
+ free(elem);
+ elem = iter;
+ }
+ }
+ free(tr->nameHash->table);
+ free(tr->nameHash);
+ /* FOR_0_LIMIT(i,((tr->mxtips-1) )) */
+ /* free(tr->nodep[i]); */
+ free(tr->nodep);
+
+ free(tr);
+}
+/* END OF INTERFACE */
+
+
diff --git a/Tree.h b/Tree.h
new file mode 100644
index 0000000..36598f2
--- /dev/null
+++ b/Tree.h
@@ -0,0 +1,57 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#ifndef TREES_H
+#define TREES_H
+
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+
+#include "common.h"
+#include "legacy.h"
+
+extern unsigned int *mask32;
+
+boolean isTip(int number, int maxTips);
+char *writeTreeToString(All *tr, boolean printBranchLengths);
+void readTree(char *fileName);
+boolean setupTree (All *tr, char *bootstrapTrees);
+void readBestTree(All *tr, FILE *file);
+void readBootstrapTree(All *tr, FILE *file);
+void hookupDefault (nodeptr p, nodeptr q, int numBranches);
+nodeptr findAnyTip(nodeptr p, int numsp);
+int treeFindTipByLabelString(char *str, All *tr);
+int getTreeStringLength(char *fileName);
+FILE *getNumberOfTrees(All *tr, char *fileName);
+void freeTree(All *tr);
+#endif
diff --git a/common.c b/common.c
new file mode 100644
index 0000000..9919dc9
--- /dev/null
+++ b/common.c
@@ -0,0 +1,225 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#include "common.h"
+
+#define ALLOW_OVERWRITE_INFO_FILE TRUE
+
+extern void printHelpFile();
+
+/* GLOBAL VARIABLES =( */
+char *infoFileName = "",
+ *programName = "",
+ *programVersion = "",
+ *programReleaseDate ="",
+ run_id[128] = "",
+ workdir[1024] = "";
+
+
+double updateTime(double* time)
+{
+ double tmp = *time;
+ return (*time = gettime()) - tmp;
+}
+
+
+int wrapStrToL(char *string)
+{
+ int errno = 0;
+ char *p;
+ int result = strtol(string, &p, 10);
+ if (errno != 0 || *p != 0 || p == string)
+ {
+ printf("Something went wrong parsing a number.");
+ exit(-1);
+ }
+ return result;
+}
+
+
+double wrapStrToDouble(char *string)
+{
+ int errno = 0;
+ char *p;
+ double result = strtod(string, &p);
+ if (errno != 0 || *p != 0 || p == string)
+ {
+ printf("Something went wrong parsing a number.");
+ exit(-1);
+ }
+ return result;
+}
+
+
+int filexists(char *filename)
+{
+ FILE *fp;
+ int res;
+ fp = fopen(filename,"rb");
+
+ if(fp)
+ {
+ res = 1;
+ fclose(fp);
+ }
+ else
+ res = 0;
+
+ return res;
+}
+
+void printVersionInfo(boolean toInfoFile)
+{
+ if(toInfoFile)
+ PR("\nThis is %s version %s released by Andre J. Aberer in %s.\n\n", programName, programVersion, programReleaseDate);
+ else
+ printf("\nThis is %s version %s released by Andre J. Aberer in %s.\n\n", programName, programVersion, programReleaseDate);
+}
+
+
+FILE *myfopen(const char *path, const char *mode)
+{
+ FILE *fp = fopen(path, mode);
+
+ if(strcmp(mode,"r") == 0 || strcmp(mode,"rb") == 0)
+ {
+ if(fp)
+ return fp;
+ else
+ {
+ if(processID == 0)
+ printf("The file %s you want to open for reading does not exist, exiting ...\n", path);
+ assert(0);
+ exit(-1);
+ return (FILE *)NULL;
+ }
+ }
+ else
+ {
+ if(fp)
+ return fp;
+ else
+ {
+ if(processID == 0)
+ printf("The file %s RAxML wants to open for writing or appending can not be opened [mode: %s], exiting ...\n",
+ path, mode);
+ exit(-1);
+ return (FILE *)NULL;
+ }
+ }
+}
+
+
+void setupInfoFile()
+{
+ char *result = CALLOC(1024, sizeof(char));
+ strcpy(result, workdir);
+ strcat(result, programName);
+ strcat(result, "_info");
+ strcat(result, ".");
+ strcat(result, run_id);
+
+ if( NOT ALLOW_OVERWRITE_INFO_FILE && filexists(result))
+ {
+ printf("The run-id >%s< you specified already exists. Aborting in order to protect the output of this previous run.\n", run_id);
+ exit(-1);
+ }
+
+ FILE *tmp = myfopen(result, "w");
+
+ fclose(tmp);
+ infoFileName = result;
+ printVersionInfo(TRUE);
+}
+
+
+char *lowerTheString(char *string)
+{
+ int
+ i,
+ stringLength = strlen(string);
+ char
+ *result = CALLOC(stringLength, sizeof(char));
+
+ FOR_0_LIMIT(i,stringLength)
+ result[i] = tolower(string[i]);
+
+
+ return result;
+}
+
+
+FILE *getOutputFileFromString(char *fileName)
+{
+
+ char result[1024];
+ strcpy(result, workdir);
+ strcat(result, programName);
+ strcat(result, "_");
+ strcat(result, fileName);
+ strcat(result, ".");
+ strcat(result, run_id);
+
+ return myfopen(result, "w");
+}
+
+
+double gettime(void)
+{
+#ifdef WIN32
+ time_t tp;
+ struct tm localtm;
+ tp = time(NULL);
+ localtm = *localtime(&tp);
+ return 60.0*localtm.tm_min + localtm.tm_sec;
+#else
+ struct timeval ttime;
+ gettimeofday(&ttime , NULL);
+ return ttime.tv_sec + ttime.tv_usec * 0.000001;
+#endif
+}
+
+
+void printBothOpen(const char* format, ... )
+{
+ FILE *f = myfopen(infoFileName, "ab");
+
+ va_list args;
+ va_start(args, format);
+ vfprintf(f, format, args );
+ va_end(args);
+
+ va_start(args, format);
+ vprintf(format, args );
+ va_end(args);
+
+ fclose(f);
+}
diff --git a/common.h b/common.h
new file mode 100644
index 0000000..5f7f7b6
--- /dev/null
+++ b/common.h
@@ -0,0 +1,86 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <math.h>
+#include <ctype.h>
+
+#ifdef WIN32
+#include <direct.h>
+#endif
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+typedef char boolean;
+
+/* GENERAL */
+#define NOT !
+#define TRUE 1
+#define FALSE 0
+#define CALLOC(num, size) calloc(num, size)
+#define ABS(x) (((x)<0) ? (-(x)) : (x))
+#define FOR_0_LIMIT(iter,limit) for(iter=0;iter < (limit); iter++)
+#define FOR_N_LIMIT(iter,n,limit) for(iter=(n); iter < (limit); iter++)
+#define PR printBothOpen
+#define SWAP(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b)))
+#define USE_UPPER_TRIANGLE_LSI(matrix,a,b,c) (matrix[a][b][(c)-(b)])
+#define USE_UPPER_TRIANGLE_TII(matrix,x,y) (matrix[(x)][(y-x)]) /* assumes that x < y */
+#define MIN(a,b) (((a) > (b)) ? (b) : (a))
+#define MAX(a,b) (((a) < (b)) ? (b) : (a))
+
+#define GET_FROM_UPPER_TRIANGLE(matrix,a,b,c) ((b<c) ? matrix[a][b][(c)-(b)] : matrix[a][c][(b)-(c)])
+
+int processID;
+void printVersionInfo(boolean toInfoFile);
+int wrapStrToL(char *string);
+void printBothOpen(const char* format, ... );
+double wrapStrToDouble(char *string);
+char *lowerTheString(char *string);
+FILE *getOutputFileFromString(char *fileName);
+double gettime(void);
+void setupInfoFile();
+double updateTime(double* time);
+FILE *myfopen(const char *path, const char *mode);
+
+#endif
diff --git a/legacy.c b/legacy.c
new file mode 100644
index 0000000..c946704
--- /dev/null
+++ b/legacy.c
@@ -0,0 +1,153 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "legacy.h"
+
+
+void freeHashTable(hashtable *h)
+{
+ unsigned int
+ i,
+ entryCount = 0;
+
+ for(i = 0; i < h->tableSize; i++)
+ {
+ if(h->table[i] != NULL)
+ {
+ entry *e = h->table[i];
+ entry *previous;
+
+ do
+ {
+ previous = e;
+ e = e->next;
+
+ if(previous->bitVector)
+ free(previous->bitVector);
+
+ if(previous->treeVector)
+ free(previous->treeVector);
+
+ if(previous->supportVector)
+ free(previous->supportVector);
+
+ free(previous);
+ entryCount++;
+ }
+ while(e != NULL);
+ }
+
+ }
+
+ assert(entryCount == h->entryCount);
+
+ free(h->table);
+}
+
+hashtable *initHashTable(unsigned int n)
+{
+ static const unsigned int initTable[] = {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648U};
+
+ hashtable *h = (hashtable*)CALLOC(1,sizeof(hashtable));
+
+ unsigned int
+ tableSize,
+ i,
+
+#ifndef NDEBUG
+ maxSize = (unsigned int)-1,
+#endif
+ primeTableLength = sizeof(initTable)/sizeof(initTable[0]);
+
+#ifndef NDEBUG
+ assert(n <= maxSize);
+#endif
+
+ i = 0;
+
+ while(initTable[i] < n && i < primeTableLength)
+ i++;
+
+ assert(i < primeTableLength);
+
+ tableSize = initTable[i];
+
+ /* printf("Hash table init with size %u\n", tableSize); */
+
+ h->table = (entry**)CALLOC(tableSize, sizeof(entry*));
+ h->tableSize = tableSize;
+ h->entryCount = 0;
+
+ return h;
+}
+
+
+BitVector **initBitVector(All *tr, BitVector *vectorLength)
+{
+ BitVector **bitVectors = (BitVector **)CALLOC(2 * tr->mxtips, sizeof(BitVector*));
+ int i;
+
+ if(tr->mxtips % MASK_LENGTH == 0)
+ *vectorLength = tr->mxtips / MASK_LENGTH;
+ else
+ *vectorLength = 1 + (tr->mxtips / MASK_LENGTH);
+
+ for(i = 1; i <= tr->mxtips; i++)
+ {
+ bitVectors[i] = (BitVector *)CALLOC(*vectorLength, sizeof(BitVector));
+ bitVectors[i][(i - 1) / MASK_LENGTH] |= mask32[(i - 1) % MASK_LENGTH];
+ }
+
+ for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
+ bitVectors[i] = (BitVector *)CALLOC(*vectorLength, sizeof(BitVector));
+
+ return bitVectors;
+}
+
+
+ProfileElem *addProfileElem(entry *helem, int vectorLength, int treeVectorLength, int numberOfTrees)
+{
+ ProfileElem *result = CALLOC(1,sizeof(ProfileElem));
+ result->isInMLTree = FALSE;
+ result->bitVector = CALLOC(vectorLength, sizeof(BitVector));
+ result->treeVector = CALLOC(treeVectorLength, sizeof(BitVector));
+ result->bitVector = memcpy(result->bitVector, helem->bitVector, vectorLength * sizeof(BitVector));
+ result->treeVector = memcpy(result->treeVector, helem->treeVector, treeVectorLength * sizeof(BitVector));
+
+ if(NTH_BIT_IS_SET(result->treeVector, numberOfTrees))
+ {
+ result->isInMLTree = TRUE;
+ UNFLIP_NTH_BIT(result->treeVector, numberOfTrees);
+ }
+
+ result->treeVectorSupport = genericBitCount(result->treeVector, treeVectorLength);
+
+ return result;
+}
diff --git a/legacy.h b/legacy.h
new file mode 100644
index 0000000..95137e8
--- /dev/null
+++ b/legacy.h
@@ -0,0 +1,143 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef LEGACY_H
+#define LEGACY_H
+
+#include "common.h"
+#include "BitVector.h"
+#include "List.h"
+#include "Array.h"
+#include "ProfileElem.h"
+
+#define NUM_BRANCHES 128
+
+typedef struct ent
+{
+ unsigned int *bitVector;
+ unsigned int *treeVector;
+ unsigned int amountTips;
+ int *supportVector;
+ unsigned int bipNumber;
+ unsigned int bipNumber2;
+ unsigned int supportFromTreeset[2];
+ struct ent *next;
+} entry;
+
+typedef struct
+{
+ unsigned int *vector;
+ int support;
+ struct noderec *oP;
+ struct noderec *oQ;
+} branchInfo;
+
+typedef struct noderec
+{
+ unsigned int isPresent[NUM_BRANCHES / MASK_LENGTH];
+ struct noderec *backs[NUM_BRANCHES];
+ char xs[NUM_BRANCHES];
+ branchInfo *bInf;
+ double z[NUM_BRANCHES];
+ struct noderec *next;
+ struct noderec *back;
+ unsigned int hash;
+ int support;
+ int number;
+ char x;
+ double **insertionLWs;
+} node, *nodeptr;
+
+typedef struct stringEnt
+{
+ int nodeNumber;
+ char *word;
+ struct stringEnt *next;
+} stringEntry ;
+
+typedef struct
+{
+ unsigned int tableSize;
+ stringEntry **table;
+} stringHashtable;
+
+
+typedef struct _All
+{
+ nodeptr start;
+ int mxtips;
+ int numberOfTrees;
+ int bitVectorLength;
+ nodeptr *nodep;
+ int ntips;
+ int nextnode;
+ int numBranches;
+ boolean partitionSmoothed[NUM_BRANCHES];
+ boolean rooted;
+ stringHashtable *nameHash;
+ boolean grouped;
+ int *constraintVector;
+ char **nameList;
+ char *tree_string;
+ int treeStringLength;
+ double fracchange;
+} All;
+
+typedef struct
+{
+ unsigned int tableSize;
+ entry **table;
+ unsigned int entryCount;
+} hashtable;
+
+
+#define FC_INIT 20
+#define zmin 1.0E-15 /* max branch prop. to -log(zmin) (= 34) */
+#define NO_BRANCHES -1
+
+#define BIPARTITIONS_ALL 0
+#define GET_BIPARTITIONS_BEST 1
+#define DRAW_BIPARTITIONS_BEST 2
+#define BIPARTITIONS_BOOTSTOP 3
+#define BIPARTITIONS_RF 4
+#define defaultz 0.9 /* value of z assigned as starting point */
+#define nmlngth 1024 /* number of characters in species name */
+#define VECTOR_LENGTH (NUM_BRANCHES / MASK_LENGTH)
+
+void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, hashtable *h, int treeNumber, int function, branchInfo *bInf, int *countBranches, int treeVectorLength, boolean traverseOnly, boolean computeWRF);
+hashtable *initHashTable(unsigned int n);
+void freeHashTable(hashtable *h);
+ProfileElem *addProfileElem(entry *helem, int vectorLength, int treeVectorLength, int numberOfTrees) ;
+
+
+BitVector *neglectThoseTaxa(All *tr, char *toDrop);
+void pruneTaxon(All *tr, unsigned int k);
+BitVector **initBitVector(All *tr, BitVector *vectorLength);
+#endif
diff --git a/newFunctions.c b/newFunctions.c
new file mode 100644
index 0000000..8c881fc
--- /dev/null
+++ b/newFunctions.c
@@ -0,0 +1,197 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include "newFunctions.h"
+
+IndexList *parseToDrop(All *tr, FILE *toDrop)
+{
+ IndexList
+ *result = NULL;
+
+ char
+ line[1024];
+
+ while(fgets(line, 1024, toDrop) != NULL)
+ {
+ char bla[1024];
+ sscanf(line, "%s\n", bla);
+ PR("will drop %d\n", treeFindTipByLabelString(bla, tr));
+ result = appendToIndexList(treeFindTipByLabelString(bla, tr), result);
+ }
+
+ return result;
+}
+
+void pruneTaxon(All *tr, unsigned int k)
+{
+ assert(k > 0 && k <= ((unsigned int)(tr->mxtips)));
+
+ nodeptr
+ p = tr->nodep[k],
+ q = p->back,
+ q1 = q->next->back,
+ q2 = q->next->next->back;
+
+ hookupDefault(q1, q2, tr->numBranches);
+
+ tr->start = findAnyTip(q1, tr->mxtips);
+
+ assert(p != tr->start && q != tr->start);
+}
+
+
+BitVector *neglectThoseTaxa(All *tr, char *toDrop)
+{
+ int
+ i = 0;
+
+ BitVector
+ *result = CALLOC(tr->bitVectorLength, sizeof(BitVector));
+
+ for(i = 0; i < tr->mxtips; i++)
+ FLIP_NTH_BIT(result,i);
+
+ if(strlen(toDrop) == 0)
+ return result;
+
+ FILE
+ *toDropFile = myfopen(toDrop,"r");
+
+ IndexList
+ *iter,
+ *toConvert = parseToDrop(tr, toDropFile);
+
+ for(iter = toConvert; iter; iter = iter->next)
+ {
+ UNFLIP_NTH_BIT(result, (iter->index -1) );
+ assert( NOT NTH_BIT_IS_SET(result, iter->index -1 ));
+ }
+
+ freeIndexList(toConvert);
+
+ return result;
+}
+
+
+Array *getOriginalBipArray(All *tr, FILE *bestTree, FILE *treeFile)
+{
+ Array *result = CALLOC(1, sizeof(Array));
+
+ int
+ i,j,bCount = 0,
+ treeVectorLength = GET_BITVECTOR_LENGTH((tr->numberOfTrees+1));
+ unsigned int
+ vectorLength = 0,
+ **setBitVectors = initBitVector(tr, &vectorLength);
+ hashtable
+ *setHtable = initHashTable(tr->mxtips * FC_INIT * 10);
+ nodeptr
+ commonStart = NULL;
+
+ BitVector
+ lastByte = 0;
+
+ for(i = tr->mxtips; i < MASK_LENGTH * vectorLength; ++i)
+ lastByte |= mask32[i % MASK_LENGTH];
+
+ unsigned int
+ *randForTaxa = CALLOC(tr->mxtips, sizeof(unsigned int));
+
+ for(i = 0; i < tr->mxtips; ++i)
+ randForTaxa[i] = rand();
+
+ rewind(treeFile);
+
+ if(bestTree)
+ rewind(bestTree);
+
+ /* get bipartitions of bootstrap set */
+ for( i = 1; i <= tr->numberOfTrees; ++i)
+ {
+ readBootstrapTree(tr, treeFile);
+
+ if( NOT commonStart)
+ commonStart = tr->start;
+ bCount = 0;
+ bitVectorInitravSpecial(setBitVectors, commonStart->back, tr->mxtips, vectorLength, setHtable, (i - 1), BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE);
+ }
+
+ if(bestTree)
+ {
+ readBestTree(tr,bestTree);
+
+ bCount = 0;
+ bitVectorInitravSpecial(setBitVectors, commonStart->back, tr->mxtips, vectorLength, setHtable, tr->numberOfTrees, BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE);
+ assert(bCount == tr->mxtips - 3);
+ }
+
+ result->commonAttributes = CALLOC(1,sizeof(ProfileElemAttr));
+ ((ProfileElemAttr*)result->commonAttributes)->bitVectorLength = vectorLength;
+ ((ProfileElemAttr*)result->commonAttributes)->treeVectorLength = treeVectorLength;
+ ((ProfileElemAttr*)result->commonAttributes)->lastByte = lastByte;
+ ((ProfileElemAttr*)result->commonAttributes)->randForTaxa = randForTaxa;
+ result->length = setHtable->entryCount;
+ result->arrayTable = CALLOC(result->length, sizeof(ProfileElem*));
+
+ j = 0;
+ for(i = 0; i < setHtable->tableSize; ++i)
+ {
+ entry
+ *elem = setHtable->table[i];
+
+ while(elem)
+ {
+ ((ProfileElem**)result->arrayTable)[j] = addProfileElem(elem, vectorLength, treeVectorLength, tr->numberOfTrees);
+ j++;
+ elem = elem->next;
+ }
+ }
+ assert(j == setHtable->entryCount);
+
+ int cnt= 0;
+ for(i = 0; i < result->length; ++i)
+ if(((ProfileElem**)result->arrayTable)[i]->isInMLTree)
+ cnt++;
+ if(bestTree)
+ assert(cnt == tr->mxtips - 3);
+
+ freeHashTable(setHtable);
+ freeBitVectors(setBitVectors, 2 * tr->mxtips);
+ free(setBitVectors);
+ free(setHtable);
+ free(randForTaxa);
+
+ /* TEST */
+ for(i = 0; i < result->length; ++i )
+ ((ProfileElem**)result->arrayTable)[i]->id = i;
+ /* END */
+
+ return result;
+}
diff --git a/newFunctions.h b/newFunctions.h
new file mode 100644
index 0000000..ba94b44
--- /dev/null
+++ b/newFunctions.h
@@ -0,0 +1,45 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef NEW_FUNCTIONS_H
+#define NEW_FUNCTIONS_H
+
+#include "List.h"
+#include "common.h"
+#include "legacy.h"
+#include "Tree.h"
+#include "ProfileElem.h"
+
+IndexList *parseToDrop(All *tr, FILE *toDrop);
+void pruneTaxon(All *tr, unsigned int k);
+BitVector *neglectThoseTaxa(All *tr, char *toDrop);
+Array *getOriginalBipArray(All *tr, FILE *bestTree, FILE *treeFile) ;
+
+#endif
diff --git a/parallel.c b/parallel.c
new file mode 100644
index 0000000..cdb5877
--- /dev/null
+++ b/parallel.c
@@ -0,0 +1,276 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifdef PARALLEL
+#include <stdio.h>
+#include <pthread.h>
+#include "common.h"
+#include "ProfileElem.h"
+#include "Dropset.h"
+#include "parallel.h"
+
+
+void findCandidatesForBip(HashTable *mergingHash, ProfileElem *elemA, boolean firstMerge, Array *bipartitionsById, Array *bipartitionProfile, int* indexByNumberBits);
+void combineEventsForOneDropset(Array *allDropsets, Dropset *refDropset, Array *bipartitionsById);
+int getSupportOfMRETree(Array *bipartitionsById, Dropset *dropset);
+void evaluateDropset(HashTable *mergingHash, Dropset *dropset,Array *bipartitionsById, List *consensusBipsCanVanish );
+extern int cumScore;
+
+#ifndef PORTABLE_PTHREADS
+void pinToCore(int tid)
+{
+ cpu_set_t cpuset;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET( tid, &cpuset);
+
+ if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
+ {
+ printBothOpen("\n\nThere was a problem finding a physical core for thread number %d to run on.\n", tid);
+ printBothOpen("Probably this happend because you are trying to run more threads than you have cores available,\n");
+ printBothOpen("which is a thing you should never ever do again, good bye .... \n\n");
+ assert(0);
+ }
+}
+#endif
+
+
+void execFunction(parallelArguments *pArgs, int tid, int n)
+{
+ int currentJob = threadJob >> 16;
+
+ switch(currentJob)
+ {
+ case THREAD_GET_EVENTS:
+ {
+ HashTable *mergingHash = pArgs->mergingHash;
+ BitVector *candidateBips = pArgs->candidateBips;
+ Array *bipartitionsById = pArgs->bipartitionsById;
+ Array *bipartitionProfile = pArgs->bipartitionProfile;
+ int *indexByNumberBits = pArgs->indexByNumberBits;
+ boolean firstMerge = pArgs->firstMerge ;
+
+
+ boolean done = FALSE ;
+ while (NOT done )
+ {
+ int jobId = bipartitionProfile->length;
+ ProfileElem *jobElem = NULL ;
+ pthread_mutex_lock(&mutex);
+ do
+ {
+ jobId = bipartitionProfile->length - numberOfJobs;
+ numberOfJobs--;
+ }
+ while(numberOfJobs > 0 && NOT NTH_BIT_IS_SET(candidateBips, jobId));
+ done = (numberOfJobs <= 0);
+ pthread_mutex_unlock(&mutex);
+
+ if( bipartitionProfile->length > jobId && (jobElem = GET_PROFILE_ELEM(bipartitionsById, jobId)) )
+ {
+ assert(jobElem);
+ findCandidatesForBip(mergingHash, jobElem, firstMerge , bipartitionsById, bipartitionProfile, indexByNumberBits);
+ }
+ }
+ break;
+ }
+ case THREAD_COMBINE_EVENTS:
+ {
+ Array *allDropsets = globalPArgs->allDropsets;
+ Array *bipartitionsById = globalPArgs->bipartitionsById;
+ int jobId = allDropsets->length;
+
+ boolean done = FALSE ;
+ while (NOT done )
+ {
+ pthread_mutex_lock(&mutex);
+ jobId = allDropsets->length - numberOfJobs;
+ numberOfJobs--;
+ done = (numberOfJobs <= 0);
+ pthread_mutex_unlock(&mutex);
+
+ if( allDropsets->length > jobId )
+ combineEventsForOneDropset(allDropsets, GET_DROPSET_ELEM(allDropsets,jobId), bipartitionsById);
+ }
+ break;
+ }
+ case THREAD_MRE:
+ {
+ Array *allDropsets = globalPArgs->allDropsets,
+ *bipartitionsById = globalPArgs->bipartitionsById ;
+ int jobId = allDropsets->length;
+
+ boolean done = FALSE ;
+ while (NOT done )
+ {
+ pthread_mutex_lock(&mutex);
+ jobId = allDropsets->length - numberOfJobs;
+ numberOfJobs--;
+ done = (numberOfJobs <= 0);
+ pthread_mutex_unlock(&mutex);
+
+ if( allDropsets->length > jobId )
+ {
+ Dropset *dropset = GET_DROPSET_ELEM(allDropsets,jobId);
+ int newSup = getSupportOfMRETree(bipartitionsById, dropset);
+ dropset->improvement = newSup - cumScore;
+ }
+ }
+
+ break;
+ }
+ case THREAD_EVALUATE_EVENTS:
+ {
+ boolean done = FALSE;
+ int jobId = globalPArgs->allDropsets->length;
+ while (NOT done )
+ {
+ pthread_mutex_lock(&mutex);
+ jobId = globalPArgs->allDropsets->length - numberOfJobs;
+ numberOfJobs--;
+ done = (numberOfJobs <= 0);
+ pthread_mutex_unlock(&mutex);
+
+ if(globalPArgs->allDropsets->length > jobId)
+ {
+ Dropset *dropset = GET_DROPSET_ELEM(globalPArgs->allDropsets, jobId);
+ evaluateDropset(globalPArgs->mergingHash, dropset, globalPArgs->bipartitionsById, globalPArgs->consensusBipsCanVanish);
+ }
+ }
+ break;
+ }
+
+ default:
+ printf("Job %d\n", currentJob);
+ assert(0);
+ }
+}
+
+
+void *workerThreadWait(void *tData)
+{
+ threadData *td = (threadData*)tData;
+ parallelArguments
+ *pArgs = td->pArgs;
+ int
+ myCycle = 0;
+
+ const int
+ n = numberOfThreads,
+ tid = td->threadNumber;
+
+#ifndef PORTABLE_PTHREADS
+ pinToCore(tid);
+#endif
+
+ printf("This is worker thread number: %d\n", tid);
+
+ while(1)
+ {
+ while (myCycle == threadJob)
+ ;
+ myCycle = threadJob;
+ execFunction(pArgs, tid, n);
+ barrierBuffer[tid] = 1;
+
+ }
+
+ return (void*)NULL;
+}
+
+
+void startThreads()
+{
+
+ pthread_t *threads;
+ pthread_attr_t attr;
+ int rc, t;
+ threadData *tData;
+
+ jobCycle = 0;
+ threadJob = 0;
+
+#ifndef PORTABLE_PTHREADS
+ pinToCore(0);
+#endif
+
+ printf("\nThis is the master thread\n");
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ pthread_mutex_init(&mutex , (pthread_mutexattr_t *)NULL);
+ threads = (pthread_t *)CALLOC(numberOfThreads , sizeof(pthread_t));
+ tData = (threadData *)CALLOC(numberOfThreads , sizeof(threadData));
+ barrierBuffer = (volatile char *)CALLOC(numberOfThreads, sizeof(volatile char));
+
+ for(t = 0; t < numberOfThreads; t++)
+ barrierBuffer[t] = 0;
+
+ for(t = 1; t < numberOfThreads; t++)
+ {
+ tData[t].pArgs = globalPArgs;
+ tData[t].threadNumber = t;
+ rc = pthread_create(&threads[t], &attr, workerThreadWait, (void *)(&tData[t]));
+ if(rc)
+ {
+ printf("ERROR; return code from pthread_create() is %d\n", rc);
+ exit(-1);
+ }
+ }
+}
+
+
+void masterBarrier(int jobType, parallelArguments *pArgs)
+{
+ const int
+ n = numberOfThreads;
+
+ int
+ i,
+ sum;
+
+ jobCycle = NOT jobCycle;
+ threadJob = (jobType << 16) + jobCycle;
+
+ execFunction(pArgs, 0, n);
+
+ do
+ {
+ for(i = 1, sum = 1; i < n; i++)
+ sum += barrierBuffer[i];
+ }
+ while(sum < n);
+
+ for(i = 1; i < n; i++)
+ barrierBuffer[i] = 0;
+}
+#else
+#endif
diff --git a/parallel.h b/parallel.h
new file mode 100644
index 0000000..a7a6126
--- /dev/null
+++ b/parallel.h
@@ -0,0 +1,76 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+#include "HashTable.h"
+
+
+extern volatile int numberOfThreads;
+extern volatile int numberOfJobs;
+extern volatile int jobCycle;
+extern volatile int threadJob;
+extern volatile char *barrierBuffer;
+extern pthread_mutex_t mutex;
+
+#define THREAD_GET_EVENTS 1
+#define THREAD_COMBINE_EVENTS 2
+#define THREAD_MRE 3
+#define THREAD_EVALUATE_EVENTS 4
+
+typedef struct _parArgs
+{
+ HashTable *mergingHash;
+ BitVector *candidateBips;
+ Array *bipartitionsById;
+ Array *bipartitionProfile;
+ int *indexByNumberBits;
+ boolean firstMerge;
+ Array *allDropsets;
+ List *consensusBipsCanVanish;
+} parallelArguments ;
+
+
+parallelArguments *globalPArgs;
+
+typedef struct
+{
+ parallelArguments *pArgs;
+ int threadNumber;
+} threadData;
+
+
+void masterBarrier(int jobType, parallelArguments *pArgs);
+void startThreads();
+void *workerThreadWait(void *tData);
+void execFunction(parallelArguments *pArgs, int tid, int n);
+
+
+#endif
diff --git a/rnr-lsi.c b/rnr-lsi.c
new file mode 100644
index 0000000..0faa971
--- /dev/null
+++ b/rnr-lsi.c
@@ -0,0 +1,410 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "common.h"
+#include "sharedVariables.h"
+#include "newFunctions.h"
+#include "legacy.h"
+#include "Tree.h"
+#include "BitVector.h"
+
+
+#define PROG_NAME "RnR-lsi"
+#define PROG_VERSION "1.0"
+#define PROG_RELEASE_DATE "2011-10-25"
+
+#define FLIP_NTH_BIT(bitVector,n) (bitVector[(n) / MASK_LENGTH] |= mask32[ (n) % MASK_LENGTH ])
+#define NTH_BIT_IS_SET(bitVector,n) (bitVector[(n) / MASK_LENGTH] & mask32[(n) % MASK_LENGTH])
+
+#define NUMBER_TO_PRUNE(x) ((x) / 2)
+#define SHORT_UNSIGNED_MAX 65535
+
+
+typedef unsigned short int smallNumber;
+extern void freeIndexList(IndexList *list);
+
+typedef struct _scorePerTaxon
+{
+ int taxonId;
+
+ double mrConsensus;
+ double strictConsensus;
+ double bestTreeScore;
+
+ double leafStab;
+ double taxonInstability;
+} ScorePerTaxon;
+
+void printQuadruples(All *tr, smallNumber ***quadruples)
+{
+ int
+ i,j,k;
+
+ for(i = 0; i < tr->mxtips; ++i)
+ {
+ for(j = 0; j < tr->mxtips; ++j)
+ {
+ for(k = 0; k < tr->mxtips; ++k)
+ {
+ if(quadruples[i][j][k])
+ printf("%s|%s,%s\t%d\n", tr->nameList[i+1], tr->nameList[j+1], tr->nameList[k+1], quadruples[i][j][k]);
+ }
+ }
+ }
+}
+
+
+IndexList *extractQuadruplesRecursively(All *tr, nodeptr node, smallNumber ***quadruples, BitVector *neglectThose, boolean isStart)
+{
+ IndexList
+ *leftList, *rightList, *iterA, *iterB, *iterC;
+
+ if( NOT isTip(node->number, tr->mxtips) || isStart)
+ {
+ nodeptr
+ leftNode = (isStart) ? node->back->next->back : node->next->back,
+ rightNode = (isStart) ? node->back->next->next->back : node->next->next->back;
+
+ leftList = extractQuadruplesRecursively(tr, leftNode, quadruples, neglectThose, FALSE);
+ rightList = extractQuadruplesRecursively(tr, rightNode, quadruples, neglectThose,FALSE);
+ }
+ else
+ {
+ IndexList
+ *elem = CALLOC(1,sizeof(IndexList));
+ elem->index = node->number;
+ elem->next = NULL;
+ return elem;
+ }
+
+ /* insert the information */
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ {
+ for(iterB = rightList; iterB; iterB = iterB->next)
+ {
+ for(iterC = iterB->next; iterC; iterC = iterC->next)
+ {
+ int
+ indexB = iterB->index,
+ indexC = iterC->index;
+
+ if(indexB > indexC)
+ SWAP(indexB,indexC);
+
+ USE_UPPER_TRIANGLE_LSI(quadruples, (iterA->index-1), (indexB-1), (indexC-1))++;
+ }
+ }
+ }
+
+ for(iterA = rightList; iterA; iterA = iterA->next)
+ {
+ for(iterB = leftList; iterB; iterB = iterB->next)
+ {
+ for(iterC = iterB->next; iterC; iterC = iterC->next)
+ {
+ int
+ indexB = iterB->index,
+ indexC = iterC->index;
+
+ if(indexB > indexC)
+ SWAP(indexB,indexC);
+
+ USE_UPPER_TRIANGLE_LSI(quadruples, (iterA->index-1), (indexB-1), (indexC-1))++;
+ }
+ }
+ }
+
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ if( NOT iterA->next)
+ {
+ iterA->next = rightList;
+ break;
+ }
+
+ return leftList;
+}
+
+void freeQuads(All *tr, smallNumber ***quads)
+{
+ int i,j;
+ for(i = 0; i < tr->mxtips; ++i)
+ {
+ for(j = 0; j < tr->mxtips; ++j)
+ free(quads[i][j]);
+ free(quads[i]);
+ }
+ free(quads);
+}
+
+
+smallNumber ***initQuads(All *tr)
+{
+ int i, j;
+ smallNumber ***result = CALLOC(tr->mxtips, sizeof(smallNumber**));
+
+
+ for(i = 0; i < tr->mxtips; ++i)
+ {
+ result[i] = CALLOC(tr->mxtips, sizeof(smallNumber*));
+ for(j = 0 ; j < tr->mxtips; ++j)
+ result[i][j] = CALLOC(tr->mxtips - j, sizeof(smallNumber));
+ }
+
+ return result;
+}
+
+
+int intcmp(const void *a, const void *b)
+{
+ return *(int *)b - *(int *)a;
+}
+
+
+void calculateLeafStability(All *tr, char *bootstrapFileName, char *excludeFileName)
+{
+ FILE
+ *outf = getOutputFileFromString("leafStabilityIndices"),
+ *bootstrapFile = getNumberOfTrees(tr,bootstrapFileName);
+
+ BitVector
+ *neglectThose = neglectThoseTaxa(tr, excludeFileName);
+
+ int
+ i, j, k, l;
+
+ if(tr->numberOfTrees >= SHORT_UNSIGNED_MAX)
+ {
+ PR("Sorry, %s is not capable of handling more than %d trees. You may want to adjust the code, if you have sufficient memory at disposal.\n");
+ exit(-1);
+ }
+
+ fprintf(outf,"taxon\tlsDif\tlsEnt\tlsMax\n");
+ PR("taxon\tlsDif\tlsEnt\tlsMax\n");
+
+ /* calculate leaf stability for n-th taxon */
+ for(i = 0; i < tr->mxtips; i++)
+ {
+ if( NOT NTH_BIT_IS_SET(neglectThose,i))
+ {
+ PR("%s\tNA\tNA\tNA\n", tr->nameList[i+1]);
+ fprintf(outf,"%s\tNA\tNA\tNA\n", tr->nameList[i+1]);
+ continue;
+ }
+
+ smallNumber
+ ***quadruples = initQuads(tr);
+
+ double
+ lsDif = 0.0,
+ lsEnt = 0.0,
+ lsMax = 0.0;
+
+ rewind(bootstrapFile);
+
+ for(j = 1; j <= tr->numberOfTrees; ++j)
+ {
+ int k;
+ readBootstrapTree(tr,bootstrapFile);
+ FOR_0_LIMIT(k,tr->mxtips)
+ if( NOT NTH_BIT_IS_SET(neglectThose, k))
+ pruneTaxon(tr,k+1);
+ extractQuadruplesRecursively(tr, tr->nodep[i+1], quadruples, neglectThose, TRUE);
+ }
+
+ /* calculate leaf stability of this leaf */
+ int
+ sum = 0;
+ double
+ cnt = 0.0;
+ for(j = 0; j < tr->mxtips; ++j)
+ {
+ if(NOT NTH_BIT_IS_SET(neglectThose, j))
+ continue;
+
+ for(k = j+1; k < tr->mxtips; ++k)
+ {
+ if(NOT NTH_BIT_IS_SET(neglectThose, k))
+ continue;
+
+ for(l = k+1; l < tr->mxtips; ++l)
+ {
+ if(NOT NTH_BIT_IS_SET(neglectThose, l))
+ continue;
+
+ int rels[3] = {
+
+ GET_FROM_UPPER_TRIANGLE(quadruples,j,k,l),
+ GET_FROM_UPPER_TRIANGLE(quadruples,k,j,l),
+ GET_FROM_UPPER_TRIANGLE(quadruples,l,j,k)
+ };
+
+ sum = rels[0] + rels[1] + rels[2];
+
+ qsort(rels, 3, sizeof(int), intcmp);
+
+ assert( sum == 0 || sum == tr->numberOfTrees);
+ assert(rels[0] >= 0 && rels[1] >= 0 && rels[0] >= rels[1]);
+ assert( (NTH_BIT_IS_SET(neglectThose, j) && NTH_BIT_IS_SET(neglectThose, k) && NTH_BIT_IS_SET(neglectThose, l) && NTH_BIT_IS_SET(neglectThose, i)) );
+
+ if(sum)
+ {
+ lsDif += (double)(rels[0] - rels[1]) / (double)tr->numberOfTrees;
+ lsMax += (double) rels[0] / (double) tr->numberOfTrees ;
+ /* PR(">> %f\n", ((double) rels[0] / (double) tr->numberOfTrees )); */
+
+ double tmp = (double)rels[0] / (double)tr->numberOfTrees;
+ lsEnt -= tmp * log(tmp);
+ if(rels[1] > 0)
+ {
+ double tmp = (double) rels[1] / (double) tr->numberOfTrees;
+ lsEnt -= tmp * log(tmp);
+ }
+ if(rels[2] > 0)
+ {
+ double tmp = (double) rels[2] / (double) tr->numberOfTrees;
+ lsEnt -= tmp * log(tmp);
+ }
+ cnt++;
+ }
+ }
+ }
+ }
+ lsDif /= cnt;
+
+ /* normalize between 0 and 1 */
+ lsEnt /= cnt;
+ lsEnt /= 3. * log(1./3.) * 1./3.;
+ lsEnt = lsEnt + 1;
+
+ /* also normalize */
+ lsMax /= cnt ;
+ lsMax = (lsMax - (1. / 3.)) * 3. / 2. ;
+
+ PR("%s\t%f\t%f\t%f\n", tr->nameList[i+1], lsDif, lsEnt, lsMax);
+ fprintf(outf, "%s\t%f\t%f\t%f\n", tr->nameList[i+1], lsDif, lsEnt, lsMax);
+ freeQuads(tr,quadruples);
+ }
+}
+
+
+void printHelpFile()
+{
+ printVersionInfo(FALSE);
+ printf("This program computes three flavors of leaf stability index for each taxon.\n\nSYNTAX: ./%s -i <bootTrees> -n <runId> [-w <workingDir>] [-h]\n", lowerTheString(programName));
+ printf("\n\tOBLIGATORY:\n");
+ printf("-i <bootTrees>\n\tA collection of bootstrap trees.\n");
+ printf("-n <runId>\n\tAn identifier for this run.\n");
+ printf("\n\tOPTIONAL:\n");
+ printf("-x <excludeFile>\n\tPrune the taxa in the file first (one taxon per line), before computing the lsi.\n");
+ printf("-w <workDir>\n\tA working directory where output files are created.\n");
+ printf("-h\n\tThis help file.\n");
+}
+
+
+int main(int argc, char *argv[])
+{
+ programName = PROG_NAME;
+ programVersion = PROG_VERSION;
+ programReleaseDate = PROG_RELEASE_DATE;
+
+ int
+ c;
+
+ char
+ *excludeFile = "",
+ *bootTrees = "";
+
+ while ((c = getopt (argc, argv, "hi:n:w:m:x:")) != -1)
+ {
+ switch(c)
+ {
+ case 'i':
+ bootTrees = optarg;
+ break;
+ case 'n':
+ strcpy(run_id, optarg);
+ break;
+ case 'w':
+ strcpy(workdir, optarg);
+ break;
+ case 'x':
+ excludeFile = optarg;
+ break;
+ case 'h':
+ default:
+ {
+ printHelpFile();
+ abort ();
+ }
+ }
+ }
+
+
+ if( NOT strcmp(bootTrees, ""))
+ {
+ printf("Please specify a file containing bootstrap trees via -i.\n");
+ printHelpFile(FALSE);
+ exit(-1);
+ }
+
+ if( NOT strcmp(run_id, ""))
+ {
+ printf("Please specify a run-id via -n\n");
+ printHelpFile(FALSE);
+ exit(-1);
+ }
+
+ compute_bits_in_16bits();
+ initializeMask();
+
+ All
+ *tr = CALLOC(1,sizeof(All));
+ setupInfoFile();
+ if (NOT setupTree(tr, bootTrees))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+
+ calculateLeafStability(tr, bootTrees, excludeFile);
+
+ return 0;
+}
+
diff --git a/rnr-mast.c b/rnr-mast.c
new file mode 100644
index 0000000..0d41c09
--- /dev/null
+++ b/rnr-mast.c
@@ -0,0 +1,965 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <unistd.h>
+
+#include "common.h"
+#include "List.h"
+#include "Tree.h"
+#include "BitVector.h"
+#include "sharedVariables.h"
+#include "legacy.h"
+
+
+#define PROG_NAME "RnR-mast"
+#define PROG_VERSION "1.0"
+#define PROG_RELEASE_DATE "2011-10-25"
+
+/*
+ NOTICE: as the numbering of the taxa is 1-based, throughout this
+ file all array indices (that use the taxon number as access code)
+ are also 1-based. This means that we waste a bit of memory, but
+ hopefully avoids index wars...
+*/
+
+
+/* SWITCHES */
+/* #define PRINT_VERY_VERBOSE */
+#define FAST_BV_COMPARISON
+
+boolean areSameBitVectors(BitVector *a, BitVector *b, int bitVectorLength);
+
+typedef struct _BitVectorList
+{
+ struct _BitVectorList *next;
+ BitVector *taxaInMast;
+} MastList;
+
+
+IndexList *traverseForTriples(All *tr, nodeptr node, boolean isStart, BitVector ***rootedTriples, BitVector **bitVectors, int bitVectorLength)
+{
+ int
+ i;
+ IndexList
+ *leftList, *rightList, *iterA, *iterB;
+ BitVector
+ *leftBv, *rightBv;
+
+ if( NOT isTip(node->number, tr->mxtips) || isStart)
+ {
+ nodeptr
+ leftNode = (isStart) ? node->back->next->back : node->next->back,
+ rightNode = (isStart) ? node->back->next->next->back : node->next->next->back;
+ leftBv = bitVectors[leftNode->number];
+ rightBv = bitVectors[rightNode->number];
+ leftList = traverseForTriples(tr, leftNode, FALSE, rootedTriples, bitVectors, bitVectorLength);
+ rightList = traverseForTriples(tr, rightNode, FALSE, rootedTriples, bitVectors, bitVectorLength);
+ }
+ else
+ {
+ IndexList
+ *elem = malloc(sizeof(IndexList));
+ elem->index = node->number;
+ elem->next = NULL;
+ FLIP_NTH_BIT(bitVectors[node->number], node->number);
+ assert(NTH_BIT_IS_SET(bitVectors[node->number], node->number));
+ return elem;
+ }
+
+ /* enter entries in the triple structure */
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ {
+ int
+ indexA = iterA->index;
+
+ assert(NTH_BIT_IS_SET(leftBv, indexA));
+ for(iterB = rightList; iterB; iterB = iterB->next)
+ {
+ int
+ indexB = iterB->index;
+
+ assert(NTH_BIT_IS_SET(rightBv, indexB));
+ for(i = 0; i < bitVectorLength; ++i)
+ {
+ rootedTriples[indexA][indexB][i] |= leftBv[i];
+ rootedTriples[indexB][indexA][i] |= rightBv[i];
+ }
+
+ UNFLIP_NTH_BIT(rootedTriples[indexA][indexB], indexA);
+ UNFLIP_NTH_BIT(rootedTriples[indexB][indexA], indexB);
+ }
+ }
+
+ /* prepare bitvector and list for this root */
+ for(i = 0; i < bitVectorLength; ++i)
+ bitVectors[node->number][i] = (leftBv[i] | rightBv[i]);
+
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ if( NOT iterA->next)
+ {
+ iterA->next = rightList;
+
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ assert(NTH_BIT_IS_SET(bitVectors[node->number], iterA->index));
+ break;
+ }
+
+ /* insert the triples containing the root -- TODO we could omit
+ that, as the root always must be part of the MAST */
+ i = 0;
+ if(isStart)
+ {
+ for(iterA = leftList; iterA; iterA = iterA->next)
+ {
+ for(iterB = leftList; iterB; iterB = iterB->next)
+ if(iterA->index != iterB->index)
+ FLIP_NTH_BIT(rootedTriples[iterA->index][node->number], iterB->index);
+ i++;
+ }
+ /* assert(i == tr->mxtips-1); */
+ }
+
+ return leftList;
+}
+
+
+BitVector*** initializeTriplesStructure(All *tr, int bitVectorLength)
+{
+ int
+ i, j;
+ BitVector
+ ***result = malloc((tr->mxtips + 1) * sizeof(BitVector**));
+
+ for(i = 0; i < tr->mxtips+1; ++i)
+ {
+ result[i] = malloc((tr->mxtips+1) * sizeof(BitVector*));
+ for(j = 0; j < tr->mxtips+1; ++j)
+ result[i][j] = calloc(bitVectorLength, sizeof(BitVector));
+ }
+
+ return result;
+}
+
+void freeTriplesStructure(All *tr, BitVector ***triples)
+{
+ int
+ i,j;
+
+ for(i = 0; i < tr->mxtips+1; ++i)
+ {
+ for(j = 0; j < tr->mxtips+1;++j)
+ free(triples[i][j]);
+ free(triples[i]);
+ }
+ free(triples);
+}
+
+
+BitVector*** getIntersectionOfRootedTriples(FILE *bootstrapFile, All *tr, int startingNodeIndex, BitVector *neglectThose)
+{
+ int
+ i,j,k,treeNum=0,
+ bitVectorLength = GET_BITVECTOR_LENGTH((tr->mxtips+1));
+ IndexList *iter;
+ BitVector
+ **bitVectors = malloc((2 * tr->mxtips - 1) * sizeof(BitVector*)),
+ ***result = initializeTriplesStructure(tr, bitVectorLength),
+ ***theseTriples = initializeTriplesStructure(tr, bitVectorLength);
+ for(i = 0; i < 2 * tr->mxtips - 1; ++i)
+ bitVectors[i] = malloc(bitVectorLength * sizeof(BitVector));
+
+ rewind(bootstrapFile);
+
+ while(treeNum++ < tr->numberOfTrees)
+ {
+ readBootstrapTree(tr,bootstrapFile);
+
+ FOR_0_LIMIT(i, tr->mxtips)
+ if( NOT NTH_BIT_IS_SET(neglectThose, i))
+ pruneTaxon(tr, i+1);
+
+ /* reset bit-vectors */
+ FOR_0_LIMIT(i, (tr->mxtips+1))
+ memset(bitVectors[i], 0, sizeof(BitVector) * bitVectorLength);
+
+ /* reset triples */
+ for(i = 0; i < tr->mxtips+1; ++i)
+ for(j = 0; j < tr->mxtips+1; ++j)
+ memset(theseTriples[i][j], 0, sizeof(BitVector) * bitVectorLength);
+
+ /* get all triples in the current tree and clean up */
+ IndexList *il = traverseForTriples(tr, tr->nodep[startingNodeIndex], TRUE, theseTriples, bitVectors, bitVectorLength);
+ for(iter = il; iter; )
+ {
+ il = iter->next;
+ free(iter);
+ iter = il;
+ }
+
+ /* intersect the triples */
+ for(i = 0; i < tr->mxtips+1; ++i)
+ for(j = 0; j < tr->mxtips+1; ++j)
+ for(k = 0; k < bitVectorLength; ++k)
+ if(treeNum == 1)
+ result[i][j][k] |= theseTriples[i][j][k];
+ else
+ result[i][j][k] &= theseTriples[i][j][k];
+ }
+
+ freeTriplesStructure(tr, theseTriples);
+
+ for(i = 0; i < 2 * tr->mxtips - 1; ++i)
+ free(bitVectors[i]);
+ free(bitVectors);
+
+ return result;
+}
+
+
+typedef struct _agreementMatrix
+{
+ int score;
+ IndexList* valuesA;
+ IndexList* valuesB;
+} AgreementMatrix;
+
+
+/* we mean ax|b by that */
+IndexList* findBestXinAxB(All *tr, int A, int B, AgreementMatrix **agreementMatrix, BitVector ***triples, int *resultScore)
+{
+ int
+ i = 0,
+ maximum = 0;
+ BitVector
+ *currentBv = triples[A][B];
+ IndexList
+ *result = NULL;
+
+ while(i < tr->mxtips)
+ {
+#ifdef FAST_BV_COMPARISON
+ if( ((i & 31) == 0) && NOT currentBv[i]) /* NOTE portability issue!!! */
+ {
+ /* assert((i % 32) == i & (MASK_LENGTH - 1)); */
+ i += MASK_LENGTH;
+ continue;
+ }
+#endif
+
+ if(NTH_BIT_IS_SET(currentBv, i))
+ {
+ int score = ( A < i ) ? agreementMatrix[A][i].score : agreementMatrix[i][A].score;
+ assert(score);
+ if(score > maximum)
+ maximum = score;
+ }
+ ++i;
+ }
+
+ i = 0;
+ while(i < tr->mxtips)
+ {
+#ifdef FAST_BV_COMPARISON
+ if( ((i & 31) == 0) && NOT currentBv[i]) /* NOTE portability issue!!! */
+ {
+ /* assert((i % 32) == i & (MASK_LENGTH - 1)); */
+ i += MASK_LENGTH;
+ continue;
+ }
+#endif
+
+ if(NTH_BIT_IS_SET(currentBv,i))
+ {
+ int score = ( A < i ) ? agreementMatrix[A][i].score : agreementMatrix[i][A].score;
+ assert(score);
+ if(score == maximum)
+ result = appendToIndexList(i,result);
+ }
+ i++;
+ }
+
+ *resultScore = maximum ? maximum : 1;
+
+ return result;
+}
+
+
+IndexList *traverseForMastTable(All *tr, nodeptr node, BitVector ***triples, AgreementMatrix **agreementMatrix, boolean isStart)
+{
+ if(isTip(node->number, tr->mxtips) && NOT isStart)
+ {
+ IndexList *elem = malloc(sizeof(IndexList));
+ elem->next = NULL;
+ elem->index = node->number;
+ return elem;
+ }
+ else
+ {
+ nodeptr
+ leftNode = (isStart) ? node->back->next->back : node->next->back,
+ rightNode = (isStart) ? node->back->next->next->back : node->next->next->back;
+ IndexList
+ *nodesOnLeft = traverseForMastTable(tr, leftNode, triples, agreementMatrix, FALSE),
+ *nodesOnRight = traverseForMastTable(tr, rightNode, triples, agreementMatrix, FALSE),
+ *iterA, *iterB;
+
+ for(iterA = nodesOnLeft; iterA; iterA = iterA->next)
+ {
+ int indexA = iterA->index;
+ for(iterB = nodesOnRight; iterB; iterB = iterB->next)
+ {
+ int indexB = iterB->index;
+
+ int
+ scoreA = 0, scoreB = 0;
+
+ IndexList
+ *xs = findBestXinAxB(tr, indexA, indexB, agreementMatrix, triples, &scoreA),
+ *ys = findBestXinAxB(tr, indexB, indexA, agreementMatrix, triples, &scoreB);
+#ifdef PRINT_VERY_VERBOSE
+ printf("%s,x|%s => x = %s\t%s,y|%s => y = %s \n", tr->nameList[indexA], tr->nameList[indexB],
+ (x) ? tr->nameList[x] : "0", tr->nameList[indexB], tr->nameList[indexA], (y) ? tr->nameList[y] : "0");
+#endif
+
+ AgreementMatrix
+ *currentElem = (indexA > indexB) ? &(agreementMatrix[indexB][indexA]) : &(agreementMatrix[indexA][indexB]);
+ currentElem->score = scoreA + scoreB;
+#ifdef PRINT_VERY_VERBOSE
+ printf("MAST(%s,%s) = %d\n", tr->nameList[indexA], tr->nameList[indexB], currentElem->score);
+#endif
+ currentElem->valuesA = (indexA > indexB) ? ys : xs;
+ currentElem->valuesB = (indexA > indexB) ? xs : ys;
+ }
+ }
+
+ /* join the lists */
+ for(iterA = nodesOnLeft; iterA; iterA = iterA->next)
+ if( NOT iterA->next)
+ {
+ iterA->next = nodesOnRight;
+ break;
+ }
+
+ if(isStart)
+ {
+ int indexA = node->number;
+ for(iterA = nodesOnLeft; iterA; iterA = iterA->next)
+ {
+ int indexB = iterA->index;
+ int
+ scoreA = 0, scoreB = 0;
+
+ IndexList *xs = findBestXinAxB(tr, indexA, indexB, agreementMatrix, triples, &scoreA);
+ IndexList *ys = findBestXinAxB(tr, indexB, indexA, agreementMatrix, triples, &scoreB);
+
+ AgreementMatrix *currentElem =
+ (indexA > indexB) ? &(agreementMatrix[indexB][indexA]) : &(agreementMatrix[indexA][indexB]);
+ currentElem->score = scoreA + scoreB;
+ currentElem->valuesA = (indexA > indexB) ? ys : xs;
+ currentElem->valuesB = (indexA > indexB) ? xs : ys;
+ }
+ }
+
+ return nodesOnLeft;
+ }
+}
+
+
+void printAgreementTable(All *tr, AgreementMatrix **matrix)
+{
+ int
+ i,j;
+
+ printf("\n\n ");
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ {
+ printf("%s,", tr->nameList[i]);
+ }
+ printf("\n");
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ {
+ printf("%s,", tr->nameList[i]);
+ for(j = 1; j <= tr->mxtips; ++j)
+ {
+ if( NOT matrix[i][j].score)
+ printf(" ,");
+ else
+ printf("%d,", matrix[i][j].score);
+ }
+ printf("\n");
+ }
+ printf("\n");
+}
+
+
+List *filterDuplicates(List *masts, int bitVectorLength)
+{
+ List
+ *iter,
+ *result = NULL;
+ int
+ i = 0,j,
+ numElem = 0;
+ boolean *isDuplicate;
+ BitVector **bitVectors;
+
+ for(iter = masts; iter; iter = iter->next)
+ numElem++;
+
+ bitVectors = calloc(numElem, sizeof(BitVector*));
+ i = 0;
+ for(iter = masts; iter;)
+ {
+ bitVectors[i] = (BitVector*)iter->value;
+ i++;
+ masts = iter->next;
+ free(iter);
+ iter = masts;
+ }
+
+ isDuplicate = calloc(numElem, sizeof(boolean));
+ for(i = 0; i < numElem; i++)
+ {
+ if(isDuplicate[i])
+ continue;
+
+ for(j = i+1; j < numElem; ++j)
+ {
+ if(isDuplicate[j])
+ continue;
+
+ isDuplicate[j] = areSameBitVectors(bitVectors[i], bitVectors[j], bitVectorLength);
+ }
+ }
+
+ for(i = 0; i < numElem; ++i)
+ if(isDuplicate[i])
+ free(bitVectors[i]);
+ else
+ result = appendToList(bitVectors[i], result);
+
+
+ free(isDuplicate);
+ free(bitVectors);
+
+ return result;
+}
+
+
+List *backtrace(All *tr, AgreementMatrix **matrix, int cntI, int cntJ, int bitVectorLength, boolean allMasts)
+{
+ List
+ *iIter;
+ IndexList
+ *iter;
+
+ int
+ i;
+
+ List
+ *result = NULL, *fromLeft = NULL, *fromRight = NULL;
+
+ if(cntI > cntJ)
+ SWAP(cntI,cntJ);
+
+ AgreementMatrix
+ *matrixElem = &(matrix[cntI][cntJ]);
+
+ /* int a = 0, b = 0; */
+
+ for(iter = matrixElem->valuesA; iter; iter = iter->next)
+ {
+ List *tmp = backtrace(tr, matrix, cntI,iter->index, bitVectorLength, allMasts);
+ fromLeft = joinLists(fromLeft, tmp);
+ if( NOT allMasts)
+ break;
+
+ /* a++; */
+ }
+
+ for(iter = matrixElem->valuesB; iter; iter = iter->next)
+ {
+ List *tmp = backtrace(tr, matrix, cntJ,iter->index, bitVectorLength, allMasts);
+ fromRight = joinLists(fromRight, tmp);
+ if( NOT allMasts)
+ break;
+ /* b++; */
+ }
+
+ /* printf("%d\t%d\n", a,b); */
+
+ if( NOT (fromLeft || fromRight))
+ result = appendToList(calloc(bitVectorLength, sizeof(BitVector)),result);
+ else if( NOT (fromLeft && fromRight))
+ result = fromLeft ? fromLeft : fromRight;
+ else
+ {
+ List
+ *iterA, *iterB;
+ for(iterA = fromLeft; iterA; iterA = iterA->next)
+ {
+ for(iterB = fromRight; iterB; iterB = iterB->next)
+ {
+ BitVector
+ *new = calloc(bitVectorLength, sizeof(BitVector));
+ for(i = 0; i < bitVectorLength; ++i)
+ new[i] = ((BitVector*)iterA->value)[i] | ((BitVector*)iterB->value)[i];
+ result = appendToList(new, result);
+ }
+ }
+ freeList(fromRight);
+ freeList(fromLeft);
+
+ if(allMasts)
+ result = filterDuplicates(result, bitVectorLength);
+ }
+
+
+
+ /* set the bits of the current entry */
+ for(iIter = result; iIter; iIter = iIter->next)
+ {
+ FLIP_NTH_BIT(((BitVector*)iIter->value), cntI);
+ FLIP_NTH_BIT(((BitVector*)iIter->value), cntJ);
+ }
+
+ return result;
+}
+
+
+List *backtraceMasts(All *tr, AgreementMatrix **matrix, boolean allMasts)
+{
+ int
+ bitVectorLength = GET_BITVECTOR_LENGTH(tr->mxtips),
+ maxScore = 0,
+ i, j;
+
+ List
+ *result = NULL;
+
+ /* find maximum in agreement matrix */
+ for(i = 1; i <= tr->mxtips; ++i)
+ for(j = 1; j <= tr->mxtips; ++j)
+ if(maxScore < matrix[i][j].score)
+ maxScore = matrix[i][j].score;
+ assert(maxScore);
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ for(j = 1; j <= tr->mxtips; ++j)
+ if(matrix[i][j].score == maxScore)
+ {
+ List
+ *elems = backtrace(tr, matrix, i,j, bitVectorLength, allMasts);
+ result = joinLists(result, elems);
+ }
+
+ return result;
+}
+
+
+AgreementMatrix** computeAgreementTable(All *tr, BitVector ***triples, int startingNodeIndex)
+{
+ int
+ i;
+ IndexList *iter;
+ AgreementMatrix
+ **agreementMatrix = malloc((tr->mxtips+1) * sizeof(AgreementMatrix*));
+ for(i = 1; i <= tr->mxtips; ++i)
+ agreementMatrix[i] = calloc(tr->mxtips+1, sizeof(AgreementMatrix));
+
+ IndexList
+ *list = traverseForMastTable(tr, tr->nodep[startingNodeIndex], triples, agreementMatrix, TRUE);
+
+ for(iter = list; iter;)
+ {
+ list = iter->next;
+ free(iter);
+ iter = list;
+ }
+
+#ifdef PRINT_VERY_VERBOSE
+ printAgreementTable(tr, agreementMatrix);
+#endif
+
+ return agreementMatrix;
+}
+
+
+void printBipartition(All *tr, BitVector *bv, int contentLength)
+{
+ int i;
+ for(i = 0; i < contentLength;++i)
+ if(NTH_BIT_IS_SET(bv, i))
+ printf("%s,", tr->nameList[i+1]);
+}
+
+
+void verifyMasts(All *tr, FILE *bootstrapFile, BitVector *taxaToKeep)
+{
+ int
+ bCount = 0,
+ treeVectorLength = GET_BITVECTOR_LENGTH(tr->numberOfTrees),
+ i, j;
+ unsigned int vectorLength = 0;
+ hashtable
+ *htable = initHashTable(tr->mxtips * FC_INIT * 10);
+ BitVector
+ **bitVectors = initBitVector(tr, &vectorLength);
+ entry *e;
+ nodeptr commonStart = NULL;
+
+ rewind(bootstrapFile);
+
+ for(i = 1; i <= tr->numberOfTrees; ++i)
+ {
+ /* treeReadLen(bootstrapFile, tr, FALSE, FALSE, TRUE, adef, TRUE); */
+ readBootstrapTree(tr, bootstrapFile);
+
+ /* prune taxa */
+ for(j = 1; j <= tr->mxtips; ++j)
+ if( NOT (NTH_BIT_IS_SET(taxaToKeep, j)) )
+ pruneTaxon(tr, j+1);
+
+ if(i == 1 )
+ commonStart = tr->start;
+ bitVectorInitravSpecial(bitVectors, commonStart->back, tr->mxtips, vectorLength, htable, (i - 1), BIPARTITIONS_BOOTSTOP, (branchInfo *)NULL, &bCount, treeVectorLength, FALSE, FALSE);
+ }
+
+ for(i = 0; i < htable->tableSize; ++i)
+ for(e = htable->table[i]; e; e = e->next)
+ {
+ if(genericBitCount(e->treeVector, treeVectorLength) != tr->numberOfTrees)
+ {
+ printf("error %d/%d\nbv: ", genericBitCount(e->treeVector, treeVectorLength), tr->numberOfTrees);
+ printBipartition(tr,e->bitVector, tr->mxtips);
+ printf("\nrepresented in: ");
+ printBitVector(e->treeVector, tr->numberOfTrees);
+ printf("\n");
+ }
+ assert(genericBitCount(e->treeVector, treeVectorLength) == tr->numberOfTrees);
+ }
+
+ freeBitVectors(bitVectors, 2 * tr->mxtips);
+ freeHashTable(htable);
+}
+
+int countTips(nodeptr p, int numsp)
+{
+ if(isTip(p->number, numsp))
+ return 1;
+ {
+ nodeptr q;
+ int tips = 0;
+
+ q = p->next;
+ while(q != p)
+ {
+ tips += countTips(q->back, numsp);
+ q = q->next;
+ }
+
+ return tips;
+ }
+}
+
+
+void printMastToFile(All *tr, FILE *bootstrapFile, BitVector *mast, FILE *result)
+{
+ int
+ droppedTaxaNum = 0,
+ tips, i;
+
+ rewind(bootstrapFile);
+ readBootstrapTree(tr,bootstrapFile);
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ if( NOT NTH_BIT_IS_SET(mast,i) )
+ {
+ pruneTaxon(tr,i+1);
+ droppedTaxaNum++;
+ }
+
+ tips = countTips(tr->start, tr->mxtips) + countTips(tr->start->back, tr->mxtips);
+ assert((unsigned)tips == ((unsigned)tr->mxtips - droppedTaxaNum));
+ char *tmp = writeTreeToString(tr, FALSE);
+ fprintf(result, "%s", tmp);
+ printBothOpen("MAST tree: %s", tmp);
+}
+
+
+void printMastsToFile(All *tr, FILE *bootstrapFile, List *masts)
+{
+ FILE
+ *result = getOutputFileFromString("MaximumAgreementSubtree");
+
+ List
+ *iter;
+
+ for(iter = masts; iter ; iter = iter->next)
+ printMastToFile( tr, bootstrapFile, iter->value, result);
+
+ fclose(result);
+}
+
+
+void freeAgreementMatrix(All *tr, AgreementMatrix **agm)
+{
+ int
+ i,j;
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ {
+ for(j = 1; j <= tr->mxtips; ++j)
+ {
+ freeIndexList(agm[i][j].valuesA);
+ freeIndexList(agm[i][j].valuesB);
+ }
+ free(agm[i]);
+ }
+ free(agm);
+}
+
+
+void freeAmatList(All *tr,List *aml)
+{
+ List
+ *iter;
+
+ for(iter = aml; iter;)
+ {
+ aml = iter->next;
+ freeAgreementMatrix(tr, iter->value);
+ free(iter);
+ iter = aml;
+ }
+}
+
+
+void calculateMast(char *bootStrapFileName, All *tr, char *excludeFileName, boolean allMasts)
+{
+ int
+ bitVectorLength = GET_BITVECTOR_LENGTH((tr->mxtips+1)),
+ mast = 0,
+ i,j,k;
+
+ FILE
+ *bootstrapFile = getNumberOfTrees(tr, bootStrapFileName);
+
+ BitVector
+ *taxaToNeglect = neglectThoseTaxa(tr, excludeFileName);
+
+ List
+ *iter,
+ *amastList = NULL;
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ {
+ int
+ currentMast = 0;
+
+ printBothOpen("rooting %d/%d\t%s\n", i, tr->mxtips, tr->nameList[i]);
+
+ BitVector
+ ***commonRootedTriples = getIntersectionOfRootedTriples(bootstrapFile, tr, i, taxaToNeglect);
+ AgreementMatrix
+ **amat = computeAgreementTable(tr, commonRootedTriples, i);
+
+ for(j = 1; j <= tr->mxtips; ++j)
+ for(k = 1 ; k <= tr->mxtips; ++k)
+ if(currentMast < amat[j][k].score)
+ currentMast = amat[j][k].score;
+
+ if(currentMast > mast)
+ {
+ mast = currentMast;
+ freeAmatList(tr,amastList);
+ amastList = NULL;
+ amastList = appendToList(amat, amastList);
+ }
+ else if( allMasts && currentMast == mast )
+ amastList = appendToList(amat, amastList);
+ else
+ freeAgreementMatrix(tr, amat);
+
+ freeTriplesStructure(tr, commonRootedTriples);
+ }
+
+ List
+ *accMasts = NULL;
+
+ for(iter = amastList; iter; iter = iter->next)
+ {
+ List
+ *Mast = backtraceMasts(tr, iter->value, allMasts);
+ accMasts = joinLists(Mast, accMasts);
+ }
+
+ accMasts = filterDuplicates(accMasts, bitVectorLength);
+
+ /* this works, commenting it out, as it only costs additional time */
+ for(iter = accMasts; iter; iter = iter->next)
+ verifyMasts(tr, bootstrapFile, ((BitVector*)iter->value));
+
+ int cnt = 0;
+ for(iter = accMasts; iter; iter = iter->next)
+ cnt++;
+ printBothOpen("number of alternative MASTs (this is not the number of all possible MASTs, if you did not use \"ALL_MAST\"): %d\n", cnt);
+ cnt = genericBitCount((BitVector*)accMasts->value, GET_BITVECTOR_LENGTH(tr->mxtips));
+ printBothOpen("MAST size is: %d\n", cnt);
+
+ /* print */
+ printMastsToFile(tr,bootstrapFile, accMasts);
+
+ freeList(accMasts);
+ freeAmatList(tr, amastList);
+ fclose(bootstrapFile);
+}
+
+
+void printRootedTriples(BitVector ***rootedTriples, All *tr)
+{
+ int
+ i, j, k;
+
+ for(i = 1; i <= tr->mxtips; ++i)
+ for(j = 1; j <= tr->mxtips;++j)
+ for(k = 1; k <= tr->mxtips; ++k)
+ if(NTH_BIT_IS_SET(rootedTriples[i][j], k))
+ printf("%s,%s|%s\n", tr->nameList[i], tr->nameList[k], tr->nameList[j]);
+
+}
+
+
+void printHelpFile()
+{
+ printVersionInfo(FALSE);
+ printf("This program computes maximum agreement trees for unrooted input sets.\n\nSYNTAX: ./%s -i <bootTrees> -n <runId> [-w <workingDir>] [-h] [-a] [-x <excludeFile>]\n", lowerTheString(programName));
+ printf("\nOBLIGATORY:\n");
+ printf("-i <bootTrees>\n\tA collection of bootstrap trees.\n");
+ printf("-n <runId>\n\tAn identifier for this run.\n");
+ printf("\nOPTIONAL\n");
+ printf("-a\n\tCompute all possible MAST trees. Without this flag, you will\n\t\
+ only get a few MASTs that are easy to compute. As there may be an\n\t\
+ exponential number of MASTs, use this option with care.\n");
+ printf("-w <workDir>\n\tA working directory where output files are created.\n");
+ printf("-x <excludeFile>\n\tExclude the taxa in this file (one taxon per line)\n\t\
+ prior to computing the MAST. If you compute all MASTs anyway, this\n\t\
+ option is option will not be useful. However, you can use this option\n\t\
+ to speed up things.\n");
+ printf("-h\n\tThis help file.\n");
+}
+
+
+int main(int argc, char *argv[])
+{
+ programName = PROG_NAME;
+ programVersion = PROG_VERSION;
+ programReleaseDate = PROG_RELEASE_DATE;
+
+ int
+ c;
+
+ boolean
+ computeAllMasts = FALSE;
+
+ char
+ *excludeFile = "",
+ *bootTrees = "";
+
+ while ((c = getopt (argc, argv, "hi:n:aw:x:")) != -1)
+ {
+ switch(c)
+ {
+ case 'i':
+ bootTrees = optarg;
+ break;
+ case 'n':
+ strcpy(run_id, optarg);
+ break;
+ case 'a':
+ computeAllMasts = TRUE;
+ break;
+ case 'x':
+ excludeFile = optarg;
+ break;
+ case 'w':
+ strcpy(workdir, optarg);
+ break;
+ case 'h':
+ default:
+ {
+ printHelpFile();
+ abort ();
+ }
+ }
+ }
+
+
+ if( NOT strcmp(bootTrees, ""))
+ {
+ printf("Please specify a file containing bootstrap trees via -i.\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if( NOT strcmp(run_id, ""))
+ {
+ printf("Please specify a run-id via -n\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+
+ /* maybe? */
+ compute_bits_in_16bits();
+ initializeMask();
+
+
+ All
+ *tr = CALLOC(1,sizeof(All));
+ setupInfoFile();
+ if (NOT setupTree(tr, bootTrees))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+
+ tr->tree_string = CALLOC(getTreeStringLength(bootTrees), sizeof(char));
+ calculateMast(bootTrees, tr, excludeFile, computeAllMasts);
+
+ return 0;
+}
diff --git a/rnr-prune.c b/rnr-prune.c
new file mode 100644
index 0000000..77e6532
--- /dev/null
+++ b/rnr-prune.c
@@ -0,0 +1,230 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+
+#include "common.h"
+#include "Tree.h"
+#include "List.h"
+#include "BitVector.h"
+#include "sharedVariables.h"
+#include "newFunctions.h"
+
+#define PROG_NAME "RnR-prune"
+#define PROG_VERSION "1.0"
+#define PROG_RELEASE_DATE "2011-10-25"
+
+
+extern char run_id[128];
+extern char *workdir;
+extern double masterTime;
+extern int tid;
+extern int NumberOfThreads;
+extern volatile int NumberOfJobs;
+
+void pruneTaxaFromTreeset(char *bootstrapFileName, char *bestTreeFile, char *toDropFileName, All *tr)
+{
+ int
+ i;
+
+ IndexList
+ *iter;
+
+ FILE
+ *toDrop = myfopen(toDropFileName, "r");
+
+ /* drop taxa from bootstrap trees */
+ if( strcmp(bootstrapFileName, ""))
+ {
+ tr->tree_string = CALLOC(10 * getTreeStringLength(bootstrapFileName), sizeof(char));
+ if (NOT setupTree(tr, bootstrapFileName))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+ FILE *TMP = getNumberOfTrees(tr, bootstrapFileName);
+ fclose(TMP);
+ }
+
+ /* drop taxa from best-known tree */
+ if( strcmp(bestTreeFile, ""))
+ {
+ tr->tree_string = CALLOC(10 * getTreeStringLength(bestTreeFile), sizeof(char));
+ if (NOT setupTree(tr, bestTreeFile))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+ FILE *tmp = getNumberOfTrees(tr, bestTreeFile);
+ fclose(tmp);
+ }
+
+ IndexList
+ *indicesToDrop = parseToDrop(tr, toDrop);
+
+ if( strcmp(bootstrapFileName, ""))
+ {
+ FILE
+ *bootstrapFile = getNumberOfTrees(tr, bootstrapFileName),
+ *outf = getOutputFileFromString("prunedBootstraps");
+
+ FOR_0_LIMIT(i,tr->numberOfTrees)
+ {
+ readBootstrapTree(tr, bootstrapFile);
+
+ iter = indicesToDrop;
+ FOR_LIST(iter)
+ pruneTaxon(tr, iter->index);
+
+ char *tmp = writeTreeToString(tr , FALSE);
+ fprintf(outf, "%s", tmp);
+ }
+
+ fclose(outf);
+ }
+
+ if( strcmp(bestTreeFile, ""))
+ {
+ FILE
+ *bestTree = myfopen(bestTreeFile, "r"),
+ *outf = getOutputFileFromString("prunedBestTree");
+
+ readBestTree(tr, bestTree);
+ iter = indicesToDrop;
+ FOR_LIST(iter)
+ pruneTaxon(tr, iter->index);
+
+ char *tmp = writeTreeToString(tr, TRUE);
+ fprintf(outf, "%s", tmp);
+ }
+
+ freeIndexList(indicesToDrop);
+ exit(EXIT_SUCCESS);
+}
+
+static void printHelpFile()
+{
+ printVersionInfo(FALSE);
+ printf("This program prunes a list of taxa from a bootstrap tree or a single tree with branch lengths (such as a best-known ML/MP-tree).\n\nSYNTAX: ./%s [-i <bootTrees> | -t <treeFile>] -x <excludeFile> -n <runId> [-w <workingDir>] [-h]\n", lowerTheString(programName));
+ printf("\n\tOBLIGATORY:\n");
+ printf("-x <excludeFile>\n\tA list of taxa (one taxon per line) to prune from either the bootstrap trees or the single best-known tree.\n");
+ printf("-i <bootTrees>\n\tA collection of bootstrap trees.\n");
+ printf("-t <treeFile>\n\tA single tree with branch lengths. Use either this flag or the -i flag.\n");
+ printf("-n <runId>\n\tAn identifier for this run.\n");
+ printf("\n\tOPTIONAL:\n");
+ printf("-w <workDir>\n\tA working directory where output files are created.\n");
+ printf("-h\n\tThis help file.\n");
+}
+
+
+int main(int argc, char *argv[])
+{
+ int
+ c;
+
+ char
+ *bestTreeFileName = "",
+ *excludeFileName = "",
+ *bootTreesFileName = "";
+
+ programName = PROG_NAME;
+ programVersion = PROG_VERSION;
+ programReleaseDate = PROG_RELEASE_DATE;
+
+ while((c = getopt(argc,argv, "hi:t:x:n:")) != -1)
+ {
+ switch(c)
+ {
+ case 'i':
+ bootTreesFileName = optarg;
+ break;
+ case 'w':
+ strcpy(workdir,optarg);
+ break;
+ case 'n':
+ strcpy(run_id,optarg);
+ break;
+ case 't':
+ bestTreeFileName = optarg;
+ break;
+ case 'x':
+ excludeFileName = optarg;
+ break;
+ case 'h':
+ default:
+ {
+ printHelpFile();
+ abort();
+ }
+ }
+ }
+
+ if( NOT (strcmp(bootTreesFileName, "") || strcmp(bestTreeFileName, "")))
+ {
+ printf("Please specify either a set of bootstrap trees (-i) or a single best-known tree (-t) from which you want to prune taxa.\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if( NOT strcmp(excludeFileName, "") )
+ {
+ printf("Please specify a file containing taxa to prune (one taxon a line) via -x\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if ( NOT strcmp(run_id, ""))
+ {
+ printf("Please specify a run id via -n.\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ compute_bits_in_16bits();
+ initializeMask();
+
+ setupInfoFile();
+
+ All *tr = CALLOC(1,sizeof(All));
+ tr->numBranches = 1;
+ pruneTaxaFromTreeset(bootTreesFileName, bestTreeFileName, excludeFileName, tr);
+
+ return 0;
+}
diff --git a/rnr-tii.c b/rnr-tii.c
new file mode 100644
index 0000000..7c08fa3
--- /dev/null
+++ b/rnr-tii.c
@@ -0,0 +1,330 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "common.h"
+#include "BitVector.h"
+#include "Tree.h"
+#include "List.h"
+#include "sharedVariables.h"
+
+#define PROG_NAME "RnR-tii"
+#define PROG_VERSION "1.0"
+#define PROG_RELEASE_DATE "2011-10-25"
+
+
+typedef unsigned short int nodeDistance_t;
+
+double tiiZ = 2.;
+
+typedef struct _distanceElem
+{
+ nodeptr node;
+ nodeDistance_t distance;
+} DistanceElem;
+
+
+List *gatherDistances(All *tr, nodeptr node, nodeDistance_t **resultBranchDistances, boolean isStart)
+{
+ List
+ *distancesFromLeft,
+ *distancesFromRight,
+ *iterA, *iterB,
+ *end = NULL;
+
+ if( NOT isTip(node->number, tr->mxtips))
+ {
+ /* go down to the tips (DFS) */
+ distancesFromLeft = gatherDistances(tr, node->next->back, resultBranchDistances, FALSE);
+ distancesFromRight = gatherDistances(tr, node->next->next->back, resultBranchDistances, FALSE);
+ }
+ else if(isStart)
+ {
+ distancesFromLeft = gatherDistances(tr, node->back->next->back, resultBranchDistances, FALSE);
+ distancesFromRight = gatherDistances(tr, node->back->next->next->back, resultBranchDistances, FALSE);
+ }
+ else
+ {
+ /* recursion ends */
+ List *distanceList = CALLOC(1,sizeof(List));
+ distanceList->value = CALLOC(1,sizeof(DistanceElem));
+ ((DistanceElem*)distanceList->value)->node = node;
+ ((DistanceElem*)distanceList->value)->distance = 1 ;
+ return distanceList;
+ }
+
+ /* compute distances of branches from the right to branches from the left side */
+ iterA = distancesFromLeft;
+ FOR_LIST(iterA)
+ {
+ for(iterB = distancesFromRight; iterB; iterB = iterB->next)
+ {
+ int
+ a = ((DistanceElem*)iterA->value)->node->number - 1,
+ b = ((DistanceElem*)iterB->value)->node->number - 1;
+
+ if(b < a)
+ SWAP(a,b);
+
+ assert(a != b );
+ USE_UPPER_TRIANGLE_TII(resultBranchDistances,a,b) = ((DistanceElem*)iterA->value)->distance + ((DistanceElem*)iterB->value)->distance;
+ }
+
+ if( NOT iterA->next)
+ end = iterA;
+ }
+
+ assert(end);
+ end->next = distancesFromRight;
+
+ /* adding own value */
+ for(iterA = distancesFromLeft; iterA; iterA = iterA->next)
+ ((DistanceElem*)iterA->value)->distance += 1;
+
+ if(isStart)
+ {
+ int a = node->number - 1;
+ for(iterA = distancesFromLeft; iterA; iterA = iterA->next)
+ {
+ int b = ((DistanceElem*)iterA->value)->node->number -1;
+ if(b < a)
+ SWAP(a,b);
+ assert(b != a);
+ USE_UPPER_TRIANGLE_TII(resultBranchDistances,a,b) = ((DistanceElem*)iterA->value)->distance;
+ }
+
+ /* for(i = 0; i < tr->mxtips; ++i) */
+ /* for(j = i+1; j < tr->mxtips; ++j) */
+ /* printf("%d\t%d\t%d\n", i,j,USE_UPPER_TRIANGLE(resultBranchDistances,i,j)); */
+ }
+
+ return distancesFromLeft;
+}
+
+
+double getOneTaxonomicInstability(All *tr, int i, nodeDistance_t ***distances, unsigned int *remainingTaxa)
+{
+ int
+ j,k,l;
+ double
+ taxInstab = 0.0;
+
+ FOR_0_LIMIT(j,tr->mxtips)
+ {
+ if(j == i || NOT NTH_BIT_IS_SET(remainingTaxa, j))
+ continue;
+
+ FOR_0_LIMIT(k, tr->numberOfTrees)
+ {
+ for(l = k+1; l < tr->numberOfTrees; ++l)
+ {
+ double
+ d1 = (i < j) ? (double)(USE_UPPER_TRIANGLE_TII(distances[k], i, j)) : (double)(USE_UPPER_TRIANGLE_TII(distances[k], j, i)),
+ d2 = (i < j) ? (double)(USE_UPPER_TRIANGLE_TII(distances[l], i, j)) : (double)(USE_UPPER_TRIANGLE_TII(distances[l], j, i)),
+ diff = (d1 > d2) ? (d1 - d2) : (d2 - d1),
+ sum = d1 + d2;
+
+ assert( d1 > 0 && d2 > 0);
+
+ taxInstab += diff / pow(sum, tiiZ); /* NOTE: could also be another exponent, but the mesquite
+ guys use this one (emphasises close relationship) */
+ }
+ }
+ }
+
+ return taxInstab;
+}
+
+
+void getTaxonomicInstability(All *tr, char *treesFileName, char *excludeFile)
+{
+ FILE
+ *outf = getOutputFileFromString("taxonomicInstabilityIndex"),
+ *treesFile = getNumberOfTrees(tr, treesFileName);
+
+ BitVector
+ *neglectThose = neglectThoseTaxa(tr, excludeFile);
+
+ int
+ i,j,k;
+ nodeDistance_t
+ ***distances = CALLOC(tr->numberOfTrees, sizeof(nodeDistance_t**));
+
+ List
+ *iter;
+
+ rewind(treesFile);
+
+ FOR_0_LIMIT(i,tr->numberOfTrees)
+ {
+ distances[i] = CALLOC(tr->mxtips, sizeof(nodeDistance_t*));
+ for(j = 0; j < tr->mxtips; ++j)
+ distances[i][j] = CALLOC(tr->mxtips - j, sizeof(nodeDistance_t));
+ }
+
+ /* calculate distances */
+ FOR_0_LIMIT(i,tr->numberOfTrees)
+ {
+ readBootstrapTree(tr, treesFile);
+
+ FOR_0_LIMIT(j,tr->mxtips)
+ if( NOT NTH_BIT_IS_SET(neglectThose, j))
+ pruneTaxon(tr,j+1);
+
+ /* assert that we get the right labels */
+ iter = gatherDistances(tr, tr->nodep[1], distances[i], TRUE);
+ freeList(iter);
+
+ FOR_0_LIMIT(j,tr->mxtips)
+ for(k = j+1; k < tr->mxtips; ++k)
+ {
+ if( NOT USE_UPPER_TRIANGLE_TII(distances[i],j,k))
+ printf("%s\t%s\t%d\t%d\n", tr->nameList[j+1], tr->nameList[k+1], j,k);
+ assert(USE_UPPER_TRIANGLE_TII(distances[i],j,k));
+ }
+ }
+
+ /* calculate taxonomic instability */
+ FOR_0_LIMIT(i,tr->mxtips)
+ {
+ if(NTH_BIT_IS_SET(neglectThose,i))
+ {
+ double tii = getOneTaxonomicInstability(tr, i, distances, neglectThose);
+ PR("%s\t%f\n", tr->nameList[i+1], tii);
+ fprintf(outf, "%s\t%f\n", tr->nameList[i+1], tii);
+ }
+ else
+ {
+ PR("%s\tNA\n", tr->nameList[i+1]);
+ fprintf(outf, "%s\tNA\n", tr->nameList[i+1]);
+ }
+ }
+ fclose(outf);
+}
+
+
+void printHelpFile()
+{
+ printVersionInfo(FALSE);
+ printf("This program computes the taxonomic instability index.\n\nSYNTAX: ./%s -i <bootTrees> -n <runId> [-w <workingDir>] [-h] [-x <excludeFile>]\n", lowerTheString(programName));
+ printf("\nOBLIGATORY:\n");
+ printf("-i <bootTrees>\n\tA collection of bootstrap trees.\n");
+ printf("-n <runId>\n\tAn identifier for this run.\n");
+ printf("\nOPTIONAL\n");
+ printf("-z <z>\n\tThe exponent used in the TII formula. Use small values to emphasize close relationships and vice versa. DEFAULT: 2\n");
+ printf("-w <workDir>\n\tA working directory where output files are created.\n");
+ printf("-x <excludeFile>\n\tExclude the taxa in this file (one taxon per line) prior to computing the TII.\n");
+ printf("-h\n\tThis help file.\n");
+}
+
+
+int main(int argc, char *argv[])
+{
+ programName = PROG_NAME;
+ programVersion = PROG_VERSION;
+ programReleaseDate = PROG_RELEASE_DATE;
+
+ int
+ c;
+
+ char
+ *excludeFile = "",
+ *bootTrees = "";
+
+ while ((c = getopt (argc, argv, "hi:n:x:w:z:")) != -1)
+ {
+ switch(c)
+ {
+ case 'i':
+ bootTrees = optarg;
+ break;
+ case 'n':
+ strcpy(run_id, optarg);
+ break;
+ case 'x':
+ excludeFile = optarg;
+ break;
+ case 'w':
+ strcpy(workdir, optarg);
+ break;
+ case 'z':
+ tiiZ = wrapStrToL(optarg);
+ break;
+ case 'h':
+ default:
+ {
+ printHelpFile();
+ abort ();
+ }
+ }
+ }
+
+ if( NOT strcmp(bootTrees, ""))
+ {
+ printf("Please specify a file containing bootstrap trees via -i.\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ if( NOT strcmp(run_id, ""))
+ {
+ printf("Please specify a run-id via -n\n");
+ printHelpFile();
+ exit(-1);
+ }
+
+ compute_bits_in_16bits();
+ initializeMask();
+
+ All
+ *tr = CALLOC(1,sizeof(All));
+ setupInfoFile();
+ if (NOT setupTree(tr, bootTrees))
+ {
+ PR("Something went wrong during tree initialisation. Sorry.\n");
+ exit(-1);
+ }
+
+ getTaxonomicInstability(tr, bootTrees, excludeFile);
+
+ return 0;
+}
diff --git a/sharedVariables.h b/sharedVariables.h
new file mode 100644
index 0000000..c829363
--- /dev/null
+++ b/sharedVariables.h
@@ -0,0 +1,46 @@
+/* RogueNaRok is an algorithm for the identification of rogue taxa in a set of phylogenetic trees.
+ *
+ * Moreover, the program collection comes with efficient implementations of
+ * * the unrooted leaf stability by Thorley and Wilkinson
+ * * the taxonomic instability index by Maddinson and Maddison
+ * * a maximum agreement subtree implementation (MAST) for unrooted trees
+ * * a tool for pruning taxa from a tree collection.
+ *
+ * Copyright October 2011 by Andre J. Aberer
+ *
+ * Tree I/O and parallel framework are derived from RAxML by Alexandros Stamatakis.
+ *
+ * This program is free software; you may redistribute it and/or
+ * modify its under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For any other inquiries send an Email to Andre J. Aberer
+ * andre.aberer at googlemail.com
+ *
+ * When publishing work that is based on the results from RogueNaRok, please cite:
+ * Andre J. Aberer, Denis Krompaß, Alexandros Stamatakis. RogueNaRok: an Efficient and Exact Algorithm for Rogue Taxon Identification. (unpublished) 2011.
+ *
+ */
+
+#ifdef PARALLEL
+volatile int numberOfThreads = 0;
+volatile int numberOfJobs;
+volatile int jobCycle;
+volatile int threadJob;
+volatile char *barrierBuffer;
+pthread_mutex_t mutex;
+#endif
+
+extern char *infoFileName,
+ run_id[128],
+ *workdir,
+ *programName,
+ *programVersion,
+ *programReleaseDate;
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/roguenarok.git
More information about the debian-med-commit
mailing list