[med-svn] [piler] 01/02: Imported Upstream version 0~20140707
Afif Elghraoui
afif-guest at moszumanska.debian.org
Fri Nov 20 07:14:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository piler.
commit 9a9b955cfa32a189eb9d8c039e6f82c34f84db89
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Thu Nov 19 20:35:29 2015 -0800
Imported Upstream version 0~20140707
---
Makefile | 25 ++
annot.cpp | 47 ++++
annotedge.cpp | 47 ++++
bitfuncs.h | 20 ++
cons.cpp | 134 +++++++++++
contigs.cpp | 178 ++++++++++++++
crisp.cpp | 640 +++++++++++++++++++++++++++++++++++++++++++++++++
findcc.cpp | 233 ++++++++++++++++++
gff.cpp | 114 +++++++++
gff2.cpp | 253 ++++++++++++++++++++
gffset.cpp | 98 ++++++++
gffset.h | 32 +++
glix.cpp | 304 ++++++++++++++++++++++++
glix.h | 60 +++++
hash.cpp | 164 +++++++++++++
iix.cpp | 160 +++++++++++++
iix.h | 42 ++++
log.cpp | 51 ++++
main.cpp | 87 +++++++
makeannot.cpp | 373 +++++++++++++++++++++++++++++
mem.cpp | 53 +++++
options.cpp | 156 ++++++++++++
params.h | 8 +
piler2.h | 148 ++++++++++++
progress.cpp | 67 ++++++
quit.cpp | 38 +++
readafa.cpp | 86 +++++++
readhits.cpp | 121 ++++++++++
readmfa.cpp | 98 ++++++++
readmotif.cpp | 74 ++++++
readreps.cpp | 126 ++++++++++
readtrs.cpp | 94 ++++++++
tan.cpp | 503 +++++++++++++++++++++++++++++++++++++++
tanmotif2fasta.cpp | 114 +++++++++
tr.cpp | 354 +++++++++++++++++++++++++++
trs.cpp | 682 +++++++++++++++++++++++++++++++++++++++++++++++++++++
trs2fasta.cpp | 142 +++++++++++
types.h | 152 ++++++++++++
usage.cpp | 40 ++++
utils.cpp | 89 +++++++
utils_linux.cpp | 76 ++++++
utils_unix.cpp | 38 +++
utils_win32.cpp | 34 +++
writecrisp.cpp | 61 +++++
writefasta.cpp | 62 +++++
writeimages.cpp | 50 ++++
writepiles.cpp | 31 +++
writetrs.cpp | 44 ++++
48 files changed, 6603 insertions(+)
diff --git a/Makefile b/Makefile
new file mode 100755
index 0000000..d6852f8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+CFLAGS = -O3 -DNDEBUG=1
+LDLIBS = -lm -static
+# LDLIBS = -lm
+
+OBJ = .o
+EXE =
+
+RM = rm -f
+CP = cp
+
+GPP = g++
+LD = $(GPP) $(CFLAGS)
+CPP = $(GPP) -c $(CFLAGS)
+CC = gcc -c $(CFLAGS)
+
+all: piler2
+
+CPPSRC = $(sort $(wildcard *.cpp))
+CPPOBJ = $(subst .cpp,.o,$(CPPSRC))
+
+$(CPPOBJ): %.o: %.cpp
+ $(CPP) $< -o $@
+
+piler2: $(CPPOBJ)
+ $(LD) -o piler2 $(CPPOBJ) $(LDLIBS)
diff --git a/annot.cpp b/annot.cpp
new file mode 100755
index 0000000..c42edf5
--- /dev/null
+++ b/annot.cpp
@@ -0,0 +1,47 @@
+#include "piler2.h"
+
+void Annot()
+ {
+ const char *InputFileName = RequiredValueOpt("annot");
+ const char *RepeatFileName = RequiredValueOpt("rep");
+ const char *OutputFileName = RequiredValueOpt("out");
+
+ ProgressStart("Reading repeat file");
+ int RepCount;
+ RepData *Reps = ReadReps(RepeatFileName, &RepCount);
+ ProgressDone();
+
+ Progress("%d records", RepCount);
+
+ FILE *fInput = OpenStdioFile(InputFileName);
+ FILE *fOutput = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
+
+ ProgressStart("Transferring annotation");
+ GFFRecord Rec;
+ while (GetNextGFFRecord(fInput, Rec))
+ {
+ const bool Rev = Rec.Strand == '-';
+ const char *Annot = MakeAnnot(Rec.SeqName, Rec.Start-1, Rec.End-1, Rev,
+ Reps, RepCount);
+
+ fprintf(fOutput, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c",
+ // 0 1 2 3 4 5 6
+ Rec.SeqName, // 0
+ Rec.Source, // 1
+ Rec.Feature, // 2
+ Rec.Start, // 3
+ Rec.End, // 4
+ Rec.Score, // 5
+ Rec.Strand); // 6
+
+ if (-1 == Rec.Frame)
+ fprintf(fOutput, "\t.");
+ else
+ fprintf(fOutput, "\t%d", Rec.Frame);
+
+ fprintf(fOutput, "\t%s ; Annot \"%s\"\n", Rec.Attrs, Annot);
+ }
+ fclose(fInput);
+ fclose(fOutput);
+ ProgressDone();
+ }
diff --git a/annotedge.cpp b/annotedge.cpp
new file mode 100755
index 0000000..c534305
--- /dev/null
+++ b/annotedge.cpp
@@ -0,0 +1,47 @@
+#include "piler2.h"
+
+void AnnotEdge()
+ {
+ const char *InputFileName = RequiredValueOpt("annotedge");
+ const char *RepeatFileName = RequiredValueOpt("rep");
+ const char *OutputFileName = RequiredValueOpt("out");
+
+ ProgressStart("Reading repeat file");
+ int RepCount;
+ RepData *Reps = ReadReps(RepeatFileName, &RepCount);
+ ProgressDone();
+
+ Progress("%d records", RepCount);
+
+ FILE *fInput = OpenStdioFile(InputFileName);
+ FILE *fOutput = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
+
+ ProgressStart("Transferring annotation");
+ GFFRecord Rec;
+ while (GetNextGFFRecord(fInput, Rec))
+ {
+ const bool Rev = (Rec.Strand == '-');
+ const char *Annot = MakeAnnotEdge(Rec.SeqName, Rec.Start-1, Rec.End-1, Rev,
+ Reps, RepCount);
+
+ fprintf(fOutput, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c",
+ // 0 1 2 3 4 5 6
+ Rec.SeqName, // 0
+ Rec.Source, // 1
+ Rec.Feature, // 2
+ Rec.Start, // 3
+ Rec.End, // 4
+ Rec.Score, // 5
+ Rec.Strand); // 6
+
+ if (-1 == Rec.Frame)
+ fprintf(fOutput, "\t.");
+ else
+ fprintf(fOutput, "\t%d", Rec.Frame);
+
+ fprintf(fOutput, "\t%s ; Annot \"%s\"\n", Rec.Attrs, Annot);
+ }
+ fclose(fInput);
+ fclose(fOutput);
+ ProgressDone();
+ }
diff --git a/bitfuncs.h b/bitfuncs.h
new file mode 100755
index 0000000..d831cb1
--- /dev/null
+++ b/bitfuncs.h
@@ -0,0 +1,20 @@
+#ifndef bitfuncs_h
+#define bitfuncs_h
+
+static inline void SetBit(int *BitVec, int Bit)
+ {
+ int IntIndex = Bit/BITS_PER_INT;
+ int BitIndex = Bit%BITS_PER_INT;
+ int i = BitVec[IntIndex];
+ BitVec[IntIndex] = (i | (1 << BitIndex));
+ }
+
+static inline int BitIsSet(int *BitVec, int Bit)
+ {
+ int IntIndex = Bit/BITS_PER_INT;
+ int BitIndex = Bit%BITS_PER_INT;
+ int i = BitVec[IntIndex];
+ return (i & (1 << BitIndex));
+ }
+
+#endif // bitfuncs_h
diff --git a/cons.cpp b/cons.cpp
new file mode 100755
index 0000000..d555fd4
--- /dev/null
+++ b/cons.cpp
@@ -0,0 +1,134 @@
+#include "piler2.h"
+
+static const char Letter[5] = { 'A', 'C', 'G', 'T', '-'};
+
+static void GetCounts(const char *Seqs, int ColCount, int SeqCount,
+ int ColIndex, int Counts[5])
+ {
+ memset(Counts, 0, 5*sizeof(unsigned));
+ for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
+ {
+ char c = Seqs[SeqIndex*ColCount + ColIndex];
+ c = toupper(c);
+ switch (c)
+ {
+ case 'a':
+ case 'A':
+ ++(Counts[0]);
+ break;
+ case 'c':
+ case 'C':
+ ++(Counts[1]);
+ break;
+ case 'g':
+ case 'G':
+ ++(Counts[2]);
+ break;
+ case 't':
+ case 'T':
+ ++(Counts[3]);
+ break;
+ case '-':
+ ++(Counts[4]);
+ break;
+ }
+ }
+ }
+
+static bool Conserved(const char *Seqs, int ColCount, int SeqCount, int Col)
+ {
+ int Counts[5];
+ GetCounts(Seqs, ColCount, SeqCount, Col, Counts);
+ if (Counts[4] > 0)
+ return false;
+
+ for (int i = 0; i < 4; ++i)
+ if (Counts[i] != 0 && Counts[i] != SeqCount)
+ return false;
+
+ return true;
+ }
+
+static void FindStartEndExact(const char *Seqs, int ColCount, int SeqCount,
+ int *ptrStart, int *ptrEnd)
+ {
+ int BestLength = 0;
+ int BestStart = 0;
+ int BestEnd = 0;
+ int Length = 0;
+ int Start = 0;
+ for (int Col = 0; Col <= ColCount; ++Col)
+ {
+ if (Col != ColCount && Conserved(Seqs, ColCount, SeqCount, Col))
+ {
+ if (Start == -1)
+ {
+ Start = Col;
+ Length = 1;
+ }
+ else
+ ++Length;
+ }
+ else
+ {
+ if (Length > BestLength)
+ {
+ BestStart = Start;
+ BestEnd = Col - 1;
+ if (BestEnd - BestStart + 1 != Length)
+ Quit("BestEnd");
+ }
+ Length = -1;
+ Start = -1;
+ }
+ }
+ *ptrStart = BestStart;
+ *ptrEnd = BestEnd;
+ }
+
+void Cons()
+ {
+ const char *InputFileName = RequiredValueOpt("cons");
+ const char *OutputFileName = RequiredValueOpt("out");
+ const char *Label = RequiredValueOpt("label");
+ bool Exact = FlagOpt("exact");
+
+ ProgressStart("Reading alignment");
+ int ColCount;
+ int SeqCount;
+ const char *Seqs = ReadAFA(InputFileName, &ColCount, &SeqCount);
+ ProgressDone();
+
+ Progress("%d seqs, length %d", SeqCount, ColCount);
+
+ char *ConsSeq = all(char, ColCount+1);
+ int ConsSeqLength = 0;
+
+ int StartCol = 0;
+ int EndCol = ColCount - 1;
+
+ if (Exact)
+ FindStartEndExact(Seqs, ColCount, SeqCount, &StartCol, &EndCol);
+
+ for (int Col = StartCol; Col <= EndCol; ++Col)
+ {
+ int Counts[5];
+ GetCounts(Seqs, ColCount, SeqCount, Col, Counts);
+ int MaxCount = 0;
+ char MaxLetter = 'A';
+ for (int i = 0; i < 4; ++i)
+ {
+ if (Counts[i] > MaxCount)
+ {
+ MaxLetter = Letter[i];
+ MaxCount = Counts[i];
+ }
+ }
+ if (MaxLetter == '-')
+ continue;
+ ConsSeq[ConsSeqLength++] = MaxLetter;
+ }
+
+ FILE *f = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
+ WriteFasta(f, ConsSeq, ConsSeqLength, Label);
+ }
diff --git a/contigs.cpp b/contigs.cpp
new file mode 100755
index 0000000..951f2dc
--- /dev/null
+++ b/contigs.cpp
@@ -0,0 +1,178 @@
+#include "piler2.h"
+
+const int INSANE_LENGTH = 500000000;
+
+// Arbitrarily chosen prime number
+static ContigData *HashTable[HASH_TABLE_SIZE];
+static ContigData **ContigMap;
+static int SeqLength;
+
+void SetSeqLength(int Length)
+ {
+ SeqLength = Length;
+ }
+
+static ContigData *HashLookup(const char *Label)
+ {
+ unsigned h = Hash(Label);
+ assert(h < HASH_TABLE_SIZE);
+ ContigData *p = HashTable[h];
+ while (p != 0)
+ {
+ if (0 == strcmp(Label, p->Label))
+ return p;
+ p = p->Next;
+ }
+ return 0;
+ }
+
+void LogContigs()
+ {
+ Log(" Label Hash Pos Length\n");
+ Log("-------------------- ------ ---------- ----------\n");
+ for (int h = 0; h < HASH_TABLE_SIZE; ++h)
+ {
+ for (const ContigData *p = HashTable[h]; p != 0; p = p->Next)
+ {
+ const ContigData &Contig = *p;
+ Log("%20.20s %6d %10d %10d",
+ Contig.Label,
+ h,
+ Contig.From,
+ Contig.Length);
+ if (Hash(Contig.Label) != h)
+ Log(" ** Hash=%d", Hash(Contig.Label));
+ Log("\n");
+ }
+ }
+ }
+
+static void AppendContig(const char *Label, int Pos)
+ {
+ ContigData *Contig = all(ContigData, 1);
+ Contig->Label = strsave(Label);
+ char *Space = strchr(Contig->Label, ' ');
+ if (0 != Space)
+ *Space = 0;
+
+ unsigned h = Hash(Label);
+ assert(h < HASH_TABLE_SIZE);
+ ContigData *p = HashTable[h];
+ Contig->Length = Pos + 1;
+ Contig->Next = p;
+ HashTable[h] = Contig;
+ }
+
+void AddContigPos(const char *Label, int Pos)
+ {
+ ContigData *Contig = HashLookup(Label);
+ if (0 == Contig)
+ AppendContig(Label, Pos);
+ else
+ {
+ if (Pos > Contig->Length)
+ Contig->Length = Pos;
+ }
+ }
+
+void AddContig(const char *Label, int GlobalPos, int Length)
+ {
+ ContigData *Contig = all(ContigData, 1);
+ Contig->Label = strsave(Label);
+ char *Space = strchr(Contig->Label, ' ');
+ if (0 != Space)
+ *Space = 0;
+
+ unsigned h = Hash(Contig->Label);
+ assert(h < HASH_TABLE_SIZE);
+ ContigData *p = HashTable[h];
+ Contig->From = GlobalPos;
+ Contig->Length = Length;
+ Contig->Next = p;
+ HashTable[h] = Contig;
+ }
+
+int GlobalizeContigs()
+ {
+ int GlobalPos = 0;
+ for (int h = 0; h < HASH_TABLE_SIZE; ++h)
+ {
+ for (ContigData *p = HashTable[h]; p != 0; p = p->Next)
+ {
+ ContigData &Contig = *p;
+ const int Length = p->Length;
+ if (Length < 1 || Length >= INSANE_LENGTH)
+ Quit("GlobalizeContigs, insane length %d", Length);
+ p->From = GlobalPos;
+ GlobalPos += Length;
+
+ // Pad up to start of next bin
+ // (required for contig map)
+ int BinRemainder = GlobalPos%CONTIG_MAP_BIN_SIZE;
+ if (BinRemainder > 0)
+ GlobalPos += CONTIG_MAP_BIN_SIZE - BinRemainder;
+ if (GlobalPos%CONTIG_MAP_BIN_SIZE)
+ Quit("Dumb mistake rounding contig");
+ }
+ }
+ SeqLength = GlobalPos;
+ return SeqLength;
+ }
+
+void MakeContigMap()
+ {
+ if (SeqLength%CONTIG_MAP_BIN_SIZE)
+ Quit("MakeContigMap: expects rounded size");
+
+ const int BinCount = SeqLength/CONTIG_MAP_BIN_SIZE;
+ ContigMap = all(ContigData *, BinCount);
+ zero(ContigMap, ContigData *, BinCount);
+
+ for (int h = 0; h < HASH_TABLE_SIZE; ++h)
+ {
+ for (ContigData *p = HashTable[h]; p != 0; p = p->Next)
+ {
+ ContigData &Contig = *p;
+ int From = p->From;
+ int To = From + p->Length - 1;
+
+ if (From%CONTIG_MAP_BIN_SIZE)
+ Quit("MakeContigMap: expected rounded contig from");
+
+ int BinFrom = From/CONTIG_MAP_BIN_SIZE;
+ int BinTo = To/CONTIG_MAP_BIN_SIZE;
+ for (int Bin = BinFrom; Bin <= BinTo; ++Bin)
+ {
+ if (ContigMap[Bin] != 0)
+ Quit("MakeContigMap: overlap error");
+ ContigMap[Bin] = p;
+ }
+ }
+ }
+ }
+
+const char *GlobalToContig(int GlobalPos, int *ptrContigPos)
+ {
+ if (GlobalPos < 0 || GlobalPos >= SeqLength)
+ Quit("GlobalToContig: invalid pos");
+
+ const int Bin = GlobalPos/CONTIG_MAP_BIN_SIZE;
+ ContigData *Contig = ContigMap[Bin];
+ if (0 == Contig)
+ Quit("GlobalToContig: doesn't map");
+ int ContigPos = GlobalPos - Contig->From;
+ if (ContigPos < 0 || ContigPos >= Contig->Length)
+ Quit("GlobalToContig: out of bounds");
+ *ptrContigPos = ContigPos;
+ return Contig->Label;
+ }
+
+int ContigToGlobal(int ContigPos, const char *Label)
+ {
+ const ContigData *Contig = HashLookup(Label);
+ if (0 == Contig)
+ Quit("ContigToGlobal: contig not found (%s)", Label);
+ if (ContigPos < 0 || ContigPos >= Contig->Length)
+ Quit("ContigToGlobal: out of bounds");
+ return Contig->From + ContigPos;
+ }
diff --git a/crisp.cpp b/crisp.cpp
new file mode 100755
index 0000000..399923f
--- /dev/null
+++ b/crisp.cpp
@@ -0,0 +1,640 @@
+#include "piler2.h"
+#include "bitfuncs.h"
+#include <algorithm>
+
+#define TRACE 0
+
+#define GFFRecord GFFRecord2
+
+/***
+Find CRISPR families.
+
+CRISPR candidate:
+
+ <-----spacer------->
+
+ ------====--------------------====------------ genome
+
+ Pile Pile
+ ^-----------------------^
+ Hit
+
+Candidate if
+ (a) hit length >= MIN_LENGTH_CRISPR and <= MAX_LENGTH_CRISPR, and
+ (b) offset is >= MIN_LENGTH_SPACER and <= MAX_LENGTH_SPACER.
+
+***/
+
+static int MIN_CRISPR_LENGTH = 10;
+static int MAX_CRISPR_LENGTH = 200;
+static int MIN_SPACER_LENGTH = 10;
+static int MAX_SPACER_LENGTH = 200;
+static double MIN_CRISPR_RATIO = 0.8;
+static double MIN_SPACER_RATIO = 0.8;
+static int MIN_FAM_SIZE = 3;
+static int MAX_SPACE_DIFF = 20;
+
+static int g_paramMinFamSize = 3;
+static int g_paramMaxLengthDiffPct = 5;
+static bool g_paramSingleHitCoverage = true;
+
+static PileData *Piles;
+static int PileCount;
+static int EdgeCount;
+
+static int MaxImageCount = 0;
+static int SeqLength;
+static int SeqLengthChunks;
+
+static int GetHitLength(const HitData &Hit)
+ {
+ const int QueryHitLength = Hit.QueryTo - Hit.QueryFrom + 1;
+ const int TargetHitLength = Hit.TargetTo - Hit.TargetFrom + 1;
+ return (QueryHitLength + TargetHitLength)/2;
+ }
+
+static int GetSpacerLength(const HitData &Hit)
+ {
+ if (Hit.QueryFrom > Hit.TargetTo)
+ return Hit.QueryFrom - Hit.TargetTo;
+ else
+ return Hit.TargetFrom - Hit.QueryTo;
+ }
+
+static bool IsCand(const HitData &Hit)
+ {
+ if (Hit.Rev)
+ return false;
+
+ int HitLength = GetHitLength(Hit);
+ if (HitLength > MAX_CRISPR_LENGTH || HitLength < MIN_CRISPR_LENGTH)
+ return false;
+
+ int SpacerLength = GetSpacerLength(Hit);
+ if (SpacerLength > MAX_SPACER_LENGTH || SpacerLength < MIN_SPACER_LENGTH)
+ return false;
+
+ return true;
+ }
+
+static PILE_INDEX_TYPE *IdentifyPiles(int *CopyCount)
+ {
+ PILE_INDEX_TYPE *PileIndexes = all(PILE_INDEX_TYPE, SeqLengthChunks);
+
+#if DEBUG
+ memset(PileIndexes, 0xff, SeqLengthChunks*sizeof(PILE_INDEX_TYPE));
+#endif
+
+ int PileIndex = -1;
+ bool InPile = false;
+ for (int i = 0; i < SeqLengthChunks; ++i)
+ {
+ if (BitIsSet(CopyCount, i))
+ {
+ if (!InPile)
+ {
+ ++PileIndex;
+ if (PileIndex > MAX_STACK_INDEX)
+ Quit("Too many stacks");
+ InPile = true;
+ }
+ PileIndexes[i] = PileIndex;
+ }
+ else
+ InPile = false;
+ }
+ PileCount = PileIndex + 1;
+ return PileIndexes;
+ }
+
+static void IncCopyCountImage(int *CopyCount, int From, int To)
+ {
+ if (From < 0)
+ Quit("From < 0");
+
+ From /= CHUNK_LENGTH;
+ To /= CHUNK_LENGTH;
+
+ if (From >= SeqLengthChunks)
+ {
+ Warning("IncCopyCountImage: From=%d, SeqLength=%d", From, SeqLengthChunks);
+ From = SeqLengthChunks - 1;
+ }
+ if (To >= SeqLengthChunks)
+ {
+ Warning("IncCopyCountImage: To=%d, SeqLength=%d", To, SeqLengthChunks);
+ To = SeqLengthChunks - 1;
+ }
+
+ if (From > To)
+ Quit("From > To");
+
+ for (int i = From; i <= To; ++i)
+ SetBit(CopyCount, i);
+ }
+
+static void IncCopyCount(int *CopyCount, const HitData &Hit)
+ {
+ IncCopyCountImage(CopyCount, Hit.TargetFrom, Hit.TargetTo);
+ IncCopyCountImage(CopyCount, Hit.QueryFrom, Hit.QueryTo);
+ }
+
+static int CmpHits(const void *ptrHit1, const void *ptrHit2)
+ {
+ HitData *Hit1 = (HitData *) ptrHit1;
+ HitData *Hit2 = (HitData *) ptrHit2;
+ return Hit1->QueryFrom - Hit2->QueryFrom;
+ }
+
+static int CmpImages(const void *ptrImage1, const void *ptrImage2)
+ {
+ PileImageData *Image1 = (PileImageData *) ptrImage1;
+ PileImageData *Image2 = (PileImageData *) ptrImage2;
+ return Image1->SIPile - Image2->SIPile;
+ }
+
+static void AssertImagesSorted(PileImageData *Images, int ImageCount)
+ {
+ for (int i = 0; i < ImageCount - 1; ++i)
+ if (Images[i].SIPile > Images[i+1].SIPile)
+ Quit("Images not sorted");
+ }
+
+static void SortImagesPile(PileImageData *Images, int ImageCount)
+ {
+ qsort(Images, ImageCount, sizeof(PileImageData), CmpImages);
+ }
+
+static void SortImages()
+ {
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ SortImagesPile(Pile.Images, Pile.ImageCount);
+#if DEBUG
+ AssertImagesSorted(Pile.Images, Pile.ImageCount);
+#endif
+ }
+ }
+
+static bool FamMemberLt(const FamMemberData &Mem1, const FamMemberData &Mem2)
+ {
+ const PileData &Pile1 = Piles[Mem1.PileIndex];
+ const PileData &Pile2 = Piles[Mem2.PileIndex];
+ return Pile1.From < Pile2.From;
+ }
+
+static void CreatePiles(const HitData *Hits, int HitCount,
+ PILE_INDEX_TYPE *PileIndexes)
+ {
+ Piles = all(PileData, PileCount);
+ zero(Piles, PileData, PileCount);
+ for (int i = 0; i < PileCount; ++i)
+ {
+ Piles[i].FamIndex = -1;
+ Piles[i].SuperFamIndex = -1;
+ Piles[i].Rev = -1;
+ }
+
+// Count images in stack
+ ProgressStart("Create piles: count images");
+ for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex)
+ {
+ const HitData &Hit = Hits[HitIndex];
+
+ int Pos = Hit.QueryFrom/CHUNK_LENGTH;
+ PILE_INDEX_TYPE PileIndex = PileIndexes[Pos];
+ assert(PileIndex == PileIndexes[Hit.QueryTo/CHUNK_LENGTH]);
+ assert(PileIndex >= 0 && PileIndex < PileCount);
+ ++(Piles[PileIndex].ImageCount);
+
+ Pos = Hit.TargetFrom/CHUNK_LENGTH;
+ PileIndex = PileIndexes[Pos];
+ assert(PileIndex >= 0 && PileIndex < PileCount);
+ assert(PileIndex == PileIndexes[Hit.TargetTo/CHUNK_LENGTH]);
+ ++(Piles[PileIndex].ImageCount);
+ }
+ ProgressDone();
+
+// Allocate memory for image list
+ int TotalImageCount = 0;
+ ProgressStart("Create piles: allocate image memory");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ const int ImageCount = Pile.ImageCount;
+ TotalImageCount += ImageCount;
+ assert(ImageCount > 0);
+ Pile.Images = all(PileImageData, ImageCount);
+ }
+ ProgressDone();
+
+// Build image list
+ ProgressStart("Create piles: build image list");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ Pile.ImageCount = 0;
+ Pile.From = -1;
+ Pile.To = -1;
+ }
+
+ for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex)
+ {
+ const HitData &Hit = Hits[HitIndex];
+
+ const bool Rev = Hit.Rev;
+
+ const int Length1 = Hit.QueryTo - Hit.QueryFrom;
+ const int Length2 = Hit.TargetTo - Hit.TargetFrom;
+
+ const int From1 = Hit.QueryFrom;
+ const int From2 = Hit.TargetFrom;
+
+ const int To1 = Hit.QueryTo;
+ const int To2 = Hit.TargetTo;
+
+ const int Pos1 = From1/CHUNK_LENGTH;
+ const int Pos2 = From2/CHUNK_LENGTH;
+
+ PILE_INDEX_TYPE PileIndex1 = PileIndexes[Pos1];
+ PILE_INDEX_TYPE PileIndex2 = PileIndexes[Pos2];
+
+ assert(PileIndex1 == PileIndexes[(From1 + Length1 - 1)/CHUNK_LENGTH]);
+ assert(PileIndex1 >= 0 && PileIndex1 < PileCount);
+
+ assert(PileIndex2 == PileIndexes[(From2 + Length2 - 1)/CHUNK_LENGTH]);
+ assert(PileIndex2 >= 0 && PileIndex2 < PileCount);
+
+ PileData &Pile1 = Piles[PileIndex1];
+ PileImageData &Image1 = Pile1.Images[Pile1.ImageCount++];
+ Image1.SILength = Length2;
+ Image1.SIPile = PileIndex2;
+ Image1.SIRev = Rev;
+
+ PileData &Pile2 = Piles[PileIndex2];
+ PileImageData &Image2 = Pile2.Images[Pile2.ImageCount++];
+ Image2.SILength = Length1;
+ Image2.SIPile = PileIndex1;
+ Image2.SIRev = Rev;
+
+ if (Pile1.From == -1 || From1 < Pile1.From)
+ Pile1.From = From1;
+ if (Pile1.To == -1 || To1 > Pile1.To)
+ Pile1.To = To1;
+
+ if (Pile2.From == -1 || From2 < Pile2.From)
+ Pile2.From = From2;
+ if (Pile2.To == -1 || To2 > Pile2.To)
+ Pile2.To = To2;
+
+ if (Pile1.ImageCount > MaxImageCount)
+ MaxImageCount = Pile1.ImageCount;
+ if (Pile2.ImageCount > MaxImageCount)
+ MaxImageCount = Pile2.ImageCount;
+ }
+ ProgressDone();
+ }
+
+static int PileDist(const PileData &Pile1, const PileData &Pile2)
+ {
+ if (Pile1.From > Pile2.To)
+ return Pile1.From - Pile2.To;
+ else
+ return Pile2.From - Pile1.From;
+ }
+
+static int FindEdgesPile(PileData &Pile, int PileIndex,
+ PILE_INDEX_TYPE Partners[], bool PartnersRev[])
+ {
+ const int PileLength = Pile.To - Pile.From + 1;
+ if (PileLength < MIN_CRISPR_LENGTH || PileLength > MAX_CRISPR_LENGTH)
+ return 0;
+
+ const int ImageCount = Pile.ImageCount;
+
+ int PartnerCount = 0;
+ for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex)
+ {
+ const PileImageData &Image = Pile.Images[ImageIndex];
+ const int PartnerImageLength = Image.SILength;
+ const int PartnerPileIndex = Image.SIPile;
+ const PileData &PartnerPile = Piles[PartnerPileIndex];
+ const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1;
+ const int Dist = PileDist(Pile, PartnerPile);
+
+ if (PartnerPileLength >= MIN_CRISPR_LENGTH &&
+ PartnerPileLength <= MAX_CRISPR_LENGTH &&
+ Dist >= MIN_SPACER_LENGTH &&
+ Dist <= MAX_SPACER_LENGTH)
+ {
+ PartnersRev[PartnerCount] = Image.SIRev;
+ Partners[PartnerCount] = PartnerPileIndex;
+ ++PartnerCount;
+ }
+ }
+ return PartnerCount;
+ }
+
+static void AddEdges(EdgeList &Edges, PILE_INDEX_TYPE PileIndex,
+ PILE_INDEX_TYPE Partners[], bool PartnersRev[], int PartnerCount)
+ {
+ EdgeCount += PartnerCount;
+ for (int i = 0; i < PartnerCount; ++i)
+ {
+ int PileIndex2 = Partners[i];
+ EdgeData Edge;
+ Edge.Node1 = PileIndex;
+ Edge.Node2 = PileIndex2;
+ Edge.Rev = PartnersRev[i];
+ Edges.push_back(Edge);
+ }
+ }
+
+static void FindCandEdges(EdgeList &Edges, int MaxImageCount)
+ {
+ Edges.clear();
+
+ PILE_INDEX_TYPE *Partners = all(PILE_INDEX_TYPE, MaxImageCount);
+ bool *PartnersRev = all(bool, MaxImageCount);
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ int PartnerCount;
+ PartnerCount = FindEdgesPile(Pile, PileIndex, Partners, PartnersRev);
+ AddEdges(Edges, PileIndex, Partners, PartnersRev, PartnerCount);
+ }
+ freemem(Partners);
+ freemem(PartnersRev);
+ }
+
+static void AssignFamsToPiles(FamList &Fams)
+ {
+ int FamIndex = 0;
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ if (Fam->size() < (size_t) g_paramMinFamSize)
+ Quit("Fam size");
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ PileData &Pile = Piles[PileIndex];
+ Pile.FamIndex = FamIndex;
+ Pile.Rev = (int) FamMember.Rev;
+ }
+ ++FamIndex;
+ }
+ }
+
+// Discard families that don't have regular spacing
+typedef std::vector<FamMemberData> FAMVEC;
+
+static void FilterCrispFam(const FamData &Fam, FamList &OutFams)
+ {
+ int PileCount = (int) Fam.size();
+
+ FAMVEC FamVec;
+ FamVec.reserve(PileCount);
+
+ for (FamData::const_iterator p = Fam.begin(); p != Fam.end(); ++p)
+ FamVec.push_back(*p);
+
+ std::sort(FamVec.begin(), FamVec.end(), FamMemberLt);
+
+#if TRACE
+ Log("\n");
+ Log("FilterCrispFam fam size=%d\n", (int) Fam.size());
+ Log("\n");
+ Log("After sort:\n");
+ Log(" Pile From To\n");
+ Log("===== ======= =======\n");
+ for (int i = 0; i < PileCount; ++i)
+ {
+ const FamMemberData &Mem = FamVec[i];
+ int PileIndex = Mem.PileIndex;
+ const PileData &Pile = Piles[PileIndex];
+ Log("%5d %7d %7d\n", PileIndex, Pile.From, Pile.To);
+ }
+ Log("\n");
+#endif
+
+ FamData OutFam;
+ for (int i = 0; i < PileCount; ++i)
+ {
+ const FamMemberData &Mem = FamVec[i];
+
+ if (i == 0 || i == 1)
+ {
+ OutFam.push_back(Mem);
+ continue;
+ }
+
+ const FamMemberData &Mem_1 = FamVec[i-1];
+ const FamMemberData &Mem_2 = FamVec[i-2];
+
+ int PileIndex = Mem.PileIndex;
+ int PileIndex_1 = Mem_1.PileIndex;
+ int PileIndex_2 = Mem_2.PileIndex;
+
+ const PileData &Pile = Piles[PileIndex];
+ const PileData &Pile_1 = Piles[PileIndex_1];
+ const PileData &Pile_2 = Piles[PileIndex_2];
+
+ int Space_12 = Pile_1.From - Pile_2.To;
+ int Space_1 = Pile.From - Pile_1.To;
+#if TRACE
+ {
+ Log("<Pile %d %d-%d> <space %d> <Pile %d %d-%d> <space %d> <Pile %d %d-%d>\n",
+ PileIndex_2,
+ Pile_2.From,
+ Pile_2.To,
+ Space_12,
+ PileIndex_1,
+ Pile_1.From,
+ Pile_2.To,
+ Space_1,
+ PileIndex,
+ Pile.From,
+ Pile.To);
+ }
+#endif
+
+ if (iabs(Space_12 - Space_1) <= MAX_SPACE_DIFF)
+ {
+#if TRACE
+ Log("Add %d to current family\n", PileIndex);
+#endif
+ OutFam.push_back(Mem);
+ }
+ else
+ {
+#if TRACE
+ Log("Space difference too big, fam size so far %d\n", (int) OutFam.size());
+#endif
+ if (OutFam.size() >= (size_t) g_paramMinFamSize)
+ {
+ FamData *NewFam = new FamData;
+ *NewFam = OutFam;
+ OutFams.push_back(NewFam);
+ OutFam = *new FamData;
+ }
+ else
+ OutFam.clear();
+ }
+ }
+
+ if (OutFam.size() >= (size_t) g_paramMinFamSize)
+ {
+ FamData *NewFam = new FamData;
+ *NewFam = OutFam;
+ OutFams.push_back(NewFam);
+ }
+ }
+
+static void FilterCrispFams(const FamList &InFams, FamList &OutFams)
+ {
+ OutFams.clear();
+
+ for (FamList::const_iterator p = InFams.begin(); p != InFams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ FilterCrispFam(*Fam, OutFams);
+ }
+ }
+
+void Crisp()
+ {
+ const char *InputFileName = RequiredValueOpt("crisp");
+
+ const char *OutputFileName = ValueOpt("out");
+ const char *PilesFileName = ValueOpt("piles");
+ const char *ImagesFileName = ValueOpt("images");
+ const char *ArraysFileName = ValueOpt("arrays");
+
+ const char *strMinFamSize = ValueOpt("famsize");
+
+ if (0 == OutputFileName && 0 == PilesFileName && 0 == ImagesFileName)
+ Quit("No output file specified, must be at least one of -out, -piles, -images");
+
+ if (0 != strMinFamSize)
+ g_paramMinFamSize = atoi(strMinFamSize);
+
+ ProgressStart("Read hit file");
+ int HitCount;
+ int SeqLength;
+ HitData *Hits = ReadHits(InputFileName, &HitCount, &SeqLength);
+ ProgressDone();
+ Progress("%d hits", HitCount);
+
+ ProgressStart("Filter candidate hits");
+ int NewHitCount = 0;
+ for (int i = 0; i < HitCount; ++i)
+ {
+ HitData &Hit = Hits[i];
+ if (IsCand(Hit))
+ Hits[NewHitCount++] = Hit;
+ }
+ ProgressDone();
+ Progress("%d of %d candidate hits", NewHitCount, HitCount);
+ HitCount = NewHitCount;
+
+ SeqLengthChunks = (SeqLength + CHUNK_LENGTH - 1)/CHUNK_LENGTH;
+
+ const int BitVectorLength = (SeqLengthChunks + BITS_PER_INT - 1)/BITS_PER_INT;
+ int *CopyCount = all(int, BitVectorLength);
+ zero(CopyCount, int, BitVectorLength);
+
+ ProgressStart("Compute copy counts");
+ for (int i = 0; i < HitCount; ++i)
+ IncCopyCount(CopyCount, Hits[i]);
+ ProgressDone();
+
+ ProgressStart("Identify piles");
+ PILE_INDEX_TYPE *PileIndexes = IdentifyPiles(CopyCount);
+ ProgressDone();
+
+ Progress("%d stacks", PileCount);
+
+ freemem(CopyCount);
+ CopyCount = 0;
+
+ CreatePiles(Hits, HitCount, PileIndexes);
+
+ if (0 != ImagesFileName)
+ {
+ ProgressStart("Writing images file");
+ WriteImages(ImagesFileName, Hits, HitCount, PileIndexes);
+ ProgressDone();
+ }
+
+ freemem(Hits);
+ Hits = 0;
+
+ if (0 != PilesFileName)
+ {
+ ProgressStart("Writing piles file");
+ WritePiles(PilesFileName, Piles, PileCount);
+ ProgressDone();
+ }
+
+ freemem(PileIndexes);
+ PileIndexes = 0;
+
+ if (0 == OutputFileName)
+ return;
+
+ ProgressStart("Find edges");
+ EdgeList Edges;
+ FindCandEdges(Edges, MaxImageCount);
+ ProgressDone();
+
+ Progress("%d edges", (int) Edges.size());
+
+ Progress("Find connected components");
+ FamList Fams;
+ FindConnectedComponents(Edges, Fams, g_paramMinFamSize);
+
+ Progress("Filter");
+ FamList OutFams;
+ FilterCrispFams(Fams, OutFams);
+
+ AssignFamsToPiles(OutFams);
+ ProgressDone();
+
+ Progress("%d arrays", (int) OutFams.size());
+
+ ProgressStart("Write crisp file");
+ WriteCrispFile(OutputFileName, Piles, PileCount);
+ ProgressDone();
+
+ if (0 != ArraysFileName)
+ {
+ FILE *fArray = OpenStdioFile(ArraysFileName, FILEIO_MODE_WriteOnly);
+ ProgressStart("Writing arrays file");
+ int FamIndex = 0;
+ for (PtrFamList p = OutFams.begin(); p != OutFams.end(); ++p)
+ {
+ int Lo = -1;
+ int Hi = -1;
+ FamData *Fam = *p;
+ if (Fam->size() < (size_t) g_paramMinFamSize)
+ Quit("Fam size");
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ PileData &Pile = Piles[PileIndex];
+ if (Lo == -1 || Pile.From < Lo)
+ Lo = Pile.From;
+ if (Hi == -1 || Pile.To > Hi)
+ Hi = Pile.To;
+ }
+ WriteArray(fArray, FamIndex, Lo, Hi);
+ ++FamIndex;
+ }
+ fclose(fArray);
+ ProgressDone();
+ }
+ }
diff --git a/findcc.cpp b/findcc.cpp
new file mode 100755
index 0000000..46a7cb7
--- /dev/null
+++ b/findcc.cpp
@@ -0,0 +1,233 @@
+#include "piler2.h"
+
+enum REVSTATE
+ {
+ REVSTATE_Unknown = 0,
+ REVSTATE_Normal = 1,
+ REVSTATE_Reversed = 2
+ };
+
+struct NeighborData
+ {
+ int NodeIndex;
+ bool Rev;
+ };
+
+typedef std::list<NeighborData> NeighborList;
+typedef NeighborList::iterator PtrNeighborList;
+
+struct NodeData
+ {
+ REVSTATE Rev;
+ int Index;
+ NeighborList *Neighbors;
+ NodeData *Next;
+ NodeData *Prev;
+ NodeData **List;
+ };
+
+static NodeData *Nodes;
+
+static FamData *MakeFam(NodeData *Nodes)
+ {
+ FamData *Fam = new FamData;
+ for (const NodeData *Node = Nodes; Node; Node = Node->Next)
+ {
+ FamMemberData FamMember;
+ FamMember.PileIndex = Node->Index;
+ switch (Node->Rev)
+ {
+ case REVSTATE_Unknown:
+ Quit("REVSTATE_Unknown");
+
+ case REVSTATE_Normal:
+ FamMember.Rev = false;
+ break;
+
+ case REVSTATE_Reversed:
+ FamMember.Rev = true;
+ break;
+ }
+
+ Fam->push_back(FamMember);
+ }
+ return Fam;
+ }
+
+static void AddNodeToList(NodeData *Node, NodeData **List)
+ {
+ NodeData *Head = *List;
+
+ Node->Next = Head;
+ Node->Prev = 0;
+
+ if (Head != 0)
+ Head->Prev = Node;
+
+ Node->List = List;
+ *List = Node;
+ }
+
+static void DeleteNodeFromList(NodeData *Node, NodeData **List)
+ {
+ assert(Node->List == List);
+
+ NodeData *Head = *List;
+
+ if (Node->Next != 0)
+ Node->Next->Prev = Node->Prev;
+
+ if (Node->Prev != 0)
+ Node->Prev->Next = Node->Next;
+ else
+ *List = Node->Next;
+
+ Node->List = 0;
+ }
+
+static NodeData *ListHead(NodeData **List)
+ {
+ return *List;
+ }
+
+static void MoveBetweenLists(NodeData *Node, NodeData **FromList, NodeData **ToList)
+ {
+ DeleteNodeFromList(Node, FromList);
+ AddNodeToList(Node, ToList);
+ }
+
+static bool ListIsEmpty(NodeData **List)
+ {
+ return 0 == *List;
+ }
+
+static bool NodeIsInList(NodeData *Node, NodeData **List)
+ {
+ return Node->List == List;
+ }
+
+static void LogList(NodeData **List)
+ {
+ for (const NodeData *Node = *List; Node; Node = Node->Next)
+ Log(" %d", Node->Index);
+ Log("\n");
+ }
+
+static int GetMaxIndex(EdgeList &Edges)
+ {
+ int MaxIndex = -1;
+ for (PtrEdgeList p = Edges.begin(); p != Edges.end(); ++p)
+ {
+ EdgeData &Edge = *p;
+ if (Edge.Node1 > MaxIndex)
+ MaxIndex = Edge.Node1;
+ if (Edge.Node2 > MaxIndex)
+ MaxIndex = Edge.Node2;
+ }
+ return MaxIndex;
+ }
+
+static REVSTATE RevState(REVSTATE Rev1, bool Rev2)
+ {
+ switch (Rev1)
+ {
+ case REVSTATE_Normal:
+ if (Rev2)
+ return REVSTATE_Reversed;
+ return REVSTATE_Normal;
+
+ case REVSTATE_Reversed:
+ if (Rev2)
+ return REVSTATE_Normal;
+ return REVSTATE_Reversed;
+ }
+ assert(false);
+ return REVSTATE_Unknown;
+ }
+
+int FindConnectedComponents(EdgeList &Edges, FamList &Fams, int MinComponentSize)
+ {
+ Fams.clear();
+
+ if (0 == Edges.size())
+ return 0;
+
+ int NodeCount = GetMaxIndex(Edges) + 1;
+ Nodes = new NodeData[NodeCount];
+
+ for (int i = 0; i < NodeCount; ++i)
+ {
+ Nodes[i].Neighbors = new NeighborList;
+ Nodes[i].Rev = REVSTATE_Unknown;
+ Nodes[i].Index = i;
+ }
+
+ NodeData *NotVisitedList = 0;
+ NodeData *PendingList = 0;
+ NodeData *CurrentList = 0;
+
+ for (PtrEdgeList p = Edges.begin(); p != Edges.end(); ++p)
+ {
+ EdgeData &Edge = *p;
+ int From = Edge.Node1;
+ int To = Edge.Node2;
+
+ assert(From >= 0 && From < NodeCount);
+ assert(To >= 0 && From < NodeCount);
+
+ NeighborData NTo;
+ NTo.NodeIndex = To;
+ NTo.Rev = Edge.Rev;
+ Nodes[From].Neighbors->push_back(NTo);
+
+ NeighborData NFrom;
+ NFrom.NodeIndex = From;
+ NFrom.Rev = Edge.Rev;
+ Nodes[To].Neighbors->push_back(NFrom);
+ }
+
+ for (int i = 0; i < NodeCount; ++i)
+ AddNodeToList(&Nodes[i], &NotVisitedList);
+
+ int FamCount = 0;
+ while (!ListIsEmpty(&NotVisitedList))
+ {
+ int ClassSize = 0;
+ NodeData *Node = ListHead(&NotVisitedList);
+
+ // This node becomes the first in the family
+ // By convention, the first member defines reversal or lack thereof.
+ Node->Rev = REVSTATE_Normal;
+ assert(ListIsEmpty(&PendingList));
+ MoveBetweenLists(Node, &NotVisitedList, &PendingList);
+ while (!ListIsEmpty(&PendingList))
+ {
+ Node = ListHead(&PendingList);
+ assert(REVSTATE_Normal == Node->Rev || REVSTATE_Reversed == Node->Rev);
+ NeighborList *Neighbors = Node->Neighbors;
+ for (PtrNeighborList p = Neighbors->begin(); p != Neighbors->end(); ++p)
+ {
+ NeighborData &Neighbor = *p;
+ int NeighborIndex = Neighbor.NodeIndex;
+ NodeData *NeighborNode = &(Nodes[NeighborIndex]);
+ if (NodeIsInList(NeighborNode, &NotVisitedList))
+ {
+ NeighborNode->Rev = RevState(Node->Rev, Neighbor.Rev);
+ MoveBetweenLists(NeighborNode, &NotVisitedList, &PendingList);
+ }
+ }
+ ++ClassSize;
+ MoveBetweenLists(Node, &PendingList, &CurrentList);
+ }
+
+ if (ClassSize >= MinComponentSize)
+ {
+ FamData *Fam = MakeFam(CurrentList);
+ Fams.push_back(Fam);
+ ++FamCount;
+ }
+
+ CurrentList = 0;
+ }
+ return FamCount;
+ }
diff --git a/gff.cpp b/gff.cpp
new file mode 100755
index 0000000..6098dd1
--- /dev/null
+++ b/gff.cpp
@@ -0,0 +1,114 @@
+#include "piler2.h"
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+
+int GFFLineNr;
+
+// Destructive read -- pokes nuls onto FS
+int GetFields(char *Line, char **Fields, int MaxFields, char FS)
+ {
+ char *p = Line;
+ for (int FieldIndex = 0; FieldIndex < MaxFields; ++FieldIndex)
+ {
+ Fields[FieldIndex] = p;
+ char *Tab = strchr(p, FS);
+ char *End = Tab;
+ if (0 == End)
+ End = strchr(p, '\0');
+ size_t FieldLength = End - p;
+ if (FieldLength > MAX_GFF_FEATURE_LENGTH)
+ Quit("Max GFF field length exceeded, field is %d chars, max=%d, line %d",
+ FieldLength, MAX_GFF_FEATURE_LENGTH, GFFLineNr);
+ if (0 == Tab)
+ return FieldIndex + 1;
+ *Tab = 0;
+ p = Tab + 1;
+ }
+ return MaxFields;
+ }
+
+bool GetNextGFFRecord(FILE *f, GFFRecord &Rec)
+ {
+ for (;;)
+ {
+ ++GFFLineNr;
+ const char TAB = '\t';
+ char Line[MAX_GFF_LINE+1];
+ char *Ok = fgets(Line, sizeof(Line), f);
+ if (NULL == Ok)
+ {
+ if (feof(f))
+ return false;
+ Quit("Error reading GFF file, line=%d feof=%d ftell=%d ferror=%d errno=%d",
+ GFFLineNr, feof(f), ftell(f), ferror(f), errno);
+ }
+ if ('#' == Line[0])
+ continue;
+ size_t n = strlen(Line);
+ if (0 == n)
+ Quit("fgets returned zero-length line");
+ if (Line[n-1] != '\n' && !feof(f))
+ Quit("Max line length in GFF file exceeded, line %d is %d chars long, max=%d",
+ GFFLineNr, n - 1, MAX_GFF_LINE);
+ Line[n-1] = 0; // delete newline
+
+ char *Fields[9];
+ int FieldCount = GetFields(Line, Fields, 9, '\t');
+ if (FieldCount < 8)
+ Quit("GFF record has < 8 fields, line %d", GFFLineNr);
+
+ const char *SeqName = Fields[0];
+ const char *Source = Fields[1];
+ const char *Feature = Fields[2];
+ const char *Start = Fields[3];
+ const char *End = Fields[4];
+ const char *Score = Fields[5];
+ const char *Strand = Fields[6];
+ const char *Frame = Fields[7];
+ const char *Attrs = "";
+ if (FieldCount > 8)
+ {
+ // Truncate attrs if comment found
+ Attrs = Fields[8];
+ // char *Pound = strchr(Attrs, '#');
+ // if (0 != Pound)
+ // *Pound = 0;
+ }
+
+ strcpy(Rec.SeqName, SeqName);
+ strcpy(Rec.Source, Source);
+ strcpy(Rec.Feature, Feature);
+ Rec.Start = atoi(Start);
+ Rec.End = atoi(End);
+ Rec.Score = (float) atof(Score);
+ Rec.Strand = Strand[0];
+ Rec.Frame = Frame[0] == '.' ? -1 : atoi(Frame);
+ strcpy(Rec.Attrs, Attrs);
+ return true;
+ }
+ }
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+void WriteGFFRecord(FILE *f, const GFFRecord &Rec)
+ {
+ fprintf(f, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c",
+ // 0 1 2 3 4 5 6 7 8
+ Rec.SeqName, // 0
+ Rec.Source, // 1
+ Rec.Feature, // 2
+ Rec.Start, // 3
+ Rec.End, // 4
+ Rec.Score, // 5
+ Rec.Strand); // 6
+
+ if (-1 == Rec.Frame)
+ fprintf(f, "\t.");
+ else
+ fprintf(f, "\t%d", Rec.Frame);
+
+ fprintf(f, "\t%s\n", Rec.Attrs);
+ }
diff --git a/gff2.cpp b/gff2.cpp
new file mode 100755
index 0000000..4a799f4
--- /dev/null
+++ b/gff2.cpp
@@ -0,0 +1,253 @@
+#include "piler2.h"
+
+#define GFFRecord GFFRecord2
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+
+static int GFFLineNr2;
+
+bool HasTargetAttrs(const char *Attrs)
+ {
+ const char *ptrTarget = strstr(Attrs, "Target ");
+ return 0 != ptrTarget;
+ }
+
+// Piles 123 456
+// 0123456
+void ParsePilesAttrs(const char *Attrs, int *ptrQueryPile, int *ptrTargetPile)
+ {
+ const char *ptrPiles = strstr(Attrs, "Piles ");
+ if (0 == ptrPiles)
+ Quit("Piles attribute not found");
+
+ const char *ptrValues = ptrPiles + 6;
+ int n = sscanf(ptrValues, "%d %d", ptrQueryPile, ptrTargetPile);
+ if (n != 2)
+ Quit("ParsePilesAttrs, sscanf(%s)=%d", ptrValues, n);
+ }
+
+// BandClust 123
+// 01234567890
+void ParseBandClustAttrs(const char *Attrs, int *ptrBandClustIndex)
+ {
+ const char *ptrBandClust = strstr(Attrs, "BandClust ");
+ if (0 == ptrBandClust)
+ Quit("BandClust attribute not found");
+
+ const char *ptrIndex = ptrBandClust + 10;
+ *ptrBandClustIndex = atoi(ptrIndex);
+ }
+
+// Pyramid 123
+// 01234567890
+void ParsePyramidAttrs(const char *Attrs, int *ptrClustIndex)
+ {
+ const char *p = strstr(Attrs, "Pyramid ");
+ if (0 == p)
+ Quit("Pyramid attribute not found");
+
+ const char *ptrIndex = p + 8;
+ *ptrClustIndex = atoi(ptrIndex);
+ }
+
+// Target SeqName 123 456
+// 01234567
+void ParseTargetAttrs(const char *Attrs, char SeqName[],
+ int SeqNameBytes, int *ptrStart, int *ptrEnd)
+ {
+ const char *ptrTarget = strstr(Attrs, "Target ");
+ if (0 == ptrTarget)
+ Quit("Target attribute not found");
+
+ const char *ptrRest = ptrTarget + 7;
+ const char *ptrBlank = strchr(ptrRest, ' ');
+ if (0 == ptrBlank)
+ Quit("Invalid Target attributes '%s', missing blank", Attrs);
+ size_t NameLength = ptrBlank - ptrRest;
+ if (NameLength >= (size_t) SeqNameBytes)
+ Quit("Target name length too long '%s'", Attrs);
+ memcpy(SeqName, ptrRest, NameLength);
+ SeqName[NameLength] = 0;
+ int n = sscanf(ptrBlank+1, "%d %d", ptrStart, ptrEnd);
+ if (n != 2)
+ Quit("Invalid Target start/end attributes '%s', sscanf=%d", ptrBlank+1);
+ }
+
+// Destructive read -- pokes nuls onto FS
+static int GetFields2(char *Line, char **Fields, int MaxFields, char FS)
+ {
+ char *p = Line;
+ for (int FieldIndex = 0; FieldIndex < MaxFields; ++FieldIndex)
+ {
+ Fields[FieldIndex] = p;
+ char *Tab = strchr(p, FS);
+ char *End = Tab;
+ if (0 == End)
+ End = strchr(p, '\0');
+ size_t FieldLength = End - p;
+ if (FieldLength > MAX_GFF_FEATURE_LENGTH)
+ Quit("Max GFF field length exceeded, field is %d chars, max=%d, line %d",
+ FieldLength, MAX_GFF_FEATURE_LENGTH, GFFLineNr2);
+ if (0 == Tab)
+ return FieldIndex + 1;
+ *Tab = 0;
+ p = Tab + 1;
+ }
+ return MaxFields;
+ }
+
+// WARNING: Line[] and Fields[]are overwritten
+// by each call to GetNextGFFRecord.
+// The Rec agument to GetNextGFFRecord returns
+// pointers into Line[].
+static char Line[MAX_GFF_LINE+1];
+static char *Fields[9];
+
+bool GetNextGFFRecord(FILE *f, GFFRecord &Rec)
+ {
+ for (;;)
+ {
+ ++GFFLineNr2;
+ const char TAB = '\t';
+ char *Ok = fgets(Line, sizeof(Line), f);
+ if (NULL == Ok)
+ {
+ if (feof(f))
+ return false;
+ Quit("Error reading GFF file, line=%d feof=%d ftell=%d ferror=%d errno=%d",
+ GFFLineNr2, feof(f), ftell(f), ferror(f), errno);
+ }
+ if ('#' == Line[0])
+ continue;
+ size_t n = strlen(Line);
+ if (0 == n)
+ Quit("fgets returned zero-length line");
+ if (Line[n-1] != '\n')
+ Quit("Max line length in GFF file exceeded, line %d is %d chars long, max=%d",
+ GFFLineNr2, n - 1, MAX_GFF_LINE);
+ Line[n-1] = 0; // delete newline
+
+ int FieldCount = GetFields2(Line, Fields, 9, '\t');
+ if (FieldCount < 8)
+ Quit("GFF record has < 8 fields, line %d", GFFLineNr2);
+
+ char *SeqName = Fields[0];
+ char *Source = Fields[1];
+ char *Feature = Fields[2];
+ char *Start = Fields[3];
+ char *End = Fields[4];
+ char *Score = Fields[5];
+ char *Strand = Fields[6];
+ char *Frame = Fields[7];
+ char *Attrs = Fields[8];
+
+ // Truncate attrs if comment found
+ char *Pound = strchr(Attrs, '#');
+ if (0 != Pound)
+ *Pound = 0;
+
+ Rec.SeqName = SeqName;
+ Rec.Source = Source;
+ Rec.Feature = Feature;
+ Rec.Start = atoi(Start);
+ Rec.End = atoi(End);
+ Rec.Score = (float) atof(Score);
+ Rec.Strand = Strand[0];
+ Rec.Frame = Frame[0] == '.' ? -1 : atoi(Frame);
+ Rec.Attrs = Attrs;
+
+ return true;
+ }
+ }
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+void WriteGFFRecord(FILE *f, const GFFRecord &Rec)
+ {
+ fprintf(f, "%s\t%s\t%s\t%d\t%d\t%.3g\t%c",
+ // 0 1 2 3 4 5 6 7 8
+ Rec.SeqName, // 0
+ Rec.Source, // 1
+ Rec.Feature, // 2
+ Rec.Start, // 3
+ Rec.End, // 4
+ Rec.Score, // 5
+ Rec.Strand); // 6
+
+ if (-1 == Rec.Frame)
+ fprintf(f, "\t.");
+ else
+ fprintf(f, "\t%d", Rec.Frame);
+
+ fprintf(f, "\t%s\n", Rec.Attrs);
+ }
+
+void SaveGFFStrings(GFFRecord &Rec)
+ {
+ Rec.SeqName = strsave(Rec.SeqName);
+ Rec.Feature = strsave(Rec.Feature);
+ Rec.Source = strsave(Rec.Source);
+ Rec.Attrs = strsave(Rec.Attrs);
+ }
+
+void FreeGFFStrings(GFFRecord &Rec)
+ {
+ free((void *) Rec.SeqName);
+ free((void *) Rec.Feature);
+ free((void *) Rec.Source);
+ free((void *) Rec.Attrs);
+
+ Rec.SeqName = 0;
+ Rec.Feature = 0;
+ Rec.Source = 0;
+ Rec.Attrs = 0;
+ }
+
+void GFFRecordToHit(const GLIX &Glix, const GFFRecord &Rec, HitData &Hit)
+ {
+ if (0 != strcmp(Rec.Feature, "hit"))
+ Quit("GFFRecordToHit: feature=%s", Rec.Feature);
+
+ const int QueryLength = Rec.End - Rec.Start + 1;
+ Hit.QueryFrom = Glix.LocalToGlobal(Rec.SeqName, Rec.Start - 1);
+ Hit.QueryTo = Hit.QueryFrom + QueryLength - 1;
+
+ char TargetName[MAX_GFF_FEATURE_LENGTH+1];
+ int TargetStart;
+ int TargetEnd;
+ ParseTargetAttrs(Rec.Attrs, TargetName, sizeof(TargetName), &TargetStart, &TargetEnd);
+
+ const int TargetLength = TargetEnd - TargetStart + 1;
+ Hit.TargetFrom = Glix.LocalToGlobal(TargetName, TargetStart - 1);
+ Hit.TargetTo = Hit.TargetFrom + TargetLength - 1;
+
+ Hit.Rev = (Rec.Strand == '-');
+ }
+
+void HitToGFFRecord(const GLIX &Glix, const HitData &Hit, GFFRecord &Rec, char AnnotBuffer[])
+ {
+ Rec.Source = "piler";
+ Rec.Feature = "hit";
+ Rec.Score = 0;
+ Rec.Frame = '.';
+ Rec.Strand = (Hit.Rev ? '-' : '+');
+
+ const int QueryLength = Hit.QueryTo - Hit.QueryFrom + 1;
+ int SeqQueryFrom;
+ const char *QueryLabel = Glix.GlobalToSeqPadded(Hit.QueryFrom, &SeqQueryFrom);
+ const int SeqQueryTo = SeqQueryFrom + QueryLength - 1;
+
+ Rec.SeqName = QueryLabel;
+ Rec.Start = SeqQueryFrom + 1;
+ Rec.End = SeqQueryTo + 1;
+
+ const int TargetLength = Hit.TargetTo - Hit.TargetFrom + 1;
+ int SeqTargetFrom;
+ const char *TargetLabel = Glix.GlobalToSeqPadded(Hit.TargetFrom, &SeqTargetFrom);
+ const int SeqTargetTo = SeqTargetFrom + TargetLength - 1;
+ sprintf(AnnotBuffer, "Target %s %d %d", TargetLabel, SeqTargetFrom + 1, SeqTargetTo + 1);
+ Rec.Attrs = AnnotBuffer;
+ }
diff --git a/gffset.cpp b/gffset.cpp
new file mode 100755
index 0000000..59232b0
--- /dev/null
+++ b/gffset.cpp
@@ -0,0 +1,98 @@
+/***
+GFFSet is a container class for a set of GFF
+records. This is to hide issues to do with
+retrieving from a file, efficient memory use
+etc. The initial implementation here is
+designed for simplicity; by hiding the details
+we can improve it later as needed without
+a big impact on the calling code.
+***/
+
+#include "piler2.h"
+
+#define GFFRecord GFFRecord2
+
+GFFSet::GFFSet()
+ {
+ m_Recs = 0;
+ m_RecordCount = 0;
+ m_RecordBufferSize = 0;
+ m_GLIX = 0;
+ m_IIX = 0;
+ }
+
+GFFSet::~GFFSet()
+ {
+ Free();
+ }
+
+void GFFSet::Free()
+ {
+ for (int i = 0; i < m_RecordCount; ++i)
+ FreeGFFStrings(m_Recs[i]);
+ freemem(m_Recs);
+
+ delete m_GLIX;
+ delete m_IIX;
+
+ m_Recs = 0;
+ m_RecordCount = 0;
+ m_RecordBufferSize = 0;
+ m_GLIX = 0;
+ m_IIX = 0;
+ }
+
+void GFFSet::FromFile(FILE *f)
+ {
+ Free();
+
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ SaveGFFStrings(Rec);
+ Add(Rec);
+ }
+ }
+
+// Caller's respsonsibility to allocate memory
+// for string fields, Add() doesn't make copies.
+// SaveGFFStrings() makes copies if needed.
+void GFFSet::Add(const GFFRecord &Rec)
+ {
+ if (m_RecordCount > m_RecordBufferSize)
+ Quit("GFFSet::Add, m_RecordCount > m_BufferSize");
+ if (m_RecordCount == m_RecordBufferSize)
+ {
+ m_RecordBufferSize += BUFFER_SIZE_INC;
+ reall(m_Recs, GFFRecord, m_RecordBufferSize);
+ }
+ m_Recs[m_RecordCount++] = Rec;
+ }
+
+const GFFRecord &GFFSet::Get(int RecordIndex) const
+ {
+ if (RecordIndex < 0 || RecordIndex >= m_RecordCount)
+ Quit("GFFRecord::Get(%d) out of range %d", RecordIndex, m_RecordCount);
+ return m_Recs[RecordIndex];
+ }
+
+void GFFSet::MakeGLIX()
+ {
+ m_GLIX = new GLIX;
+ m_GLIX->Init();
+ m_GLIX->FromGFFSet(*this);
+ }
+
+void GFFSet::MakeIIX()
+ {
+ const int GlobalLength = m_GLIX->GetGlobalLength();
+ m_IIX = new IIX;
+ m_IIX->Init(GlobalLength);
+ m_IIX->SetGLIX(m_GLIX);
+
+ for (int RecordIndex = 0; RecordIndex < m_RecordCount; ++RecordIndex)
+ {
+ const GFFRecord &Rec = Get(RecordIndex);
+ m_IIX->AddLocal(Rec.SeqName, Rec.Start - 1, Rec.End - 1, RecordIndex);
+ }
+ }
diff --git a/gffset.h b/gffset.h
new file mode 100755
index 0000000..efad211
--- /dev/null
+++ b/gffset.h
@@ -0,0 +1,32 @@
+#define GFFRecord GFFRecord2
+
+const int BUFFER_SIZE_INC = 4096;
+
+class GFFSet
+ {
+public:
+ GFFSet();
+ virtual ~GFFSet();
+ void Free();
+
+ void FromFile(FILE *f);
+ void Add(const GFFRecord &Rec);
+
+ void MakeGLIX();
+ void MakeIIX();
+
+ GLIX *GetGLIX() const { return m_GLIX; }
+ IIX *GetIIX() const { return m_IIX; }
+
+ const GFFRecord &Get(int RecordIndex) const;
+ int GetRecordCount() const { return m_RecordCount; }
+
+private:
+ GFFRecord *m_Recs;
+ int m_RecordCount;
+ int m_RecordBufferSize;
+ GLIX *m_GLIX;
+ IIX *m_IIX;
+ };
+
+#undef GFFRecord
diff --git a/glix.cpp b/glix.cpp
new file mode 100755
index 0000000..dfc73ca
--- /dev/null
+++ b/glix.cpp
@@ -0,0 +1,304 @@
+/***
+GLIX = Global Local IndeX
+
+This index provides an efficient mapping of a
+set of sequences to and from a single set of
+global coordinates such that each sequence has
+a unique range of positions. Global coordinates
+are convenient in a number of algorithms, and
+there may be quite large numbers of sequences
+(e.g., contigs or scaffolds in an early assembly),
+so efficiency is useful here.
+
+"Local" coordinate is (Label, SeqPos), where
+SeqPos is 0-based, i.e. is 0, 1 ... (SeqLength-1)
+for each sequence.
+
+"Global" coordindate is GlobalPos, where
+GlobalPos = 0, 1 ... (GlobalLength - 1).
+
+To convert from Local to Global, a hash table
+is used to look up by sequence name.
+
+To convert from Global to Local, the global
+sequence is divided into blocks of length B.
+A vector of length GlobalLength/B contains
+the sequence index for each block. Padding
+is therefore required to ensure that there
+are no blocks that map to two different
+sequences.
+***/
+
+#include "piler2.h"
+
+#define GFFRecord GFFRecord2
+
+GLIX::GLIX()
+ {
+ m_SeqCount = 0;
+ m_GlobalLength = 0;
+ m_HashTableSize = 0;
+ m_Pad = 0;
+ m_HashTable = 0;
+ m_SeqMap = 0;
+ }
+
+GLIX::~GLIX()
+ {
+ }
+
+void GLIX::Free()
+ {
+ // TODO@@
+ }
+
+void GLIX::Init(int Pad, int HashTableSize)
+ {
+ Free();
+
+ m_Pad = Pad;
+ m_HashTableSize = HashTableSize;
+ m_HashTable = all(SEQDATA *, HashTableSize);
+ zero(m_HashTable, SEQDATA *, HashTableSize);
+ }
+
+SEQDATA *GLIX::SeqIndexLookup(const char *Label) const
+ {
+ int h = Hash(Label, m_HashTableSize);
+ assert(h >= 0 && h < m_HashTableSize);
+ for (SEQDATA *i = m_HashTable[h]; i; i = i->Next)
+ if (0 == strcmp(i->Label, Label))
+ return i;
+ return 0;
+ }
+
+SEQDATA *GLIX::AddToIndex(const char *Label, int Length)
+ {
+ int h = Hash(Label, m_HashTableSize);
+ assert(h >= 0 && h < m_HashTableSize);
+
+ SEQDATA *i = all(SEQDATA, 1);
+ i->Label = strsave(Label);
+ i->Length = Length;
+ i->Next = m_HashTable[h];
+ m_HashTable[h] = i;
+ ++m_SeqCount;
+ return i;
+ }
+
+void GLIX::AssignOffsets()
+ {
+ int Offset = 0;
+ for (int h = 0; h < m_HashTableSize; ++h)
+ {
+ for (SEQDATA *i = m_HashTable[h]; i; i = i->Next)
+ {
+ i->Offset = Offset;
+ Offset += i->Length;
+ const int NewOffset = RoundUp(Offset, m_Pad, SEQMAP_BLOCKSIZE);
+ i->RoundedLength = i->Length + NewOffset - Offset;
+ Offset = NewOffset;
+ }
+ }
+ m_GlobalLength = Offset;
+ }
+
+// Add() is typically used when creating a GLIX from GFF records.
+// It can be called multiple times with the same label, the
+// maximum position determines the sequence length.
+void GLIX::Add(const char *Label, int Pos)
+ {
+ SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 == IE)
+ AddToIndex(Label, Pos + 1);
+ else
+ {
+ if (Pos + 1 > IE->Length)
+ IE->Length = Pos + 1;
+ }
+ }
+
+// Insert() is typically used when creating a GLIX from
+// a sequence file. It must be called exactly once per label.
+void GLIX::Insert(const char *Label, int Offset, int Length)
+ {
+ const SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 != IE)
+ Quit("Duplicate sequence label %s", Label);
+
+ int h = Hash(Label, m_HashTableSize);
+ assert(h >= 0 && h < m_HashTableSize);
+
+ SEQDATA *i = all(SEQDATA, 1);
+ i->Label = strsave(Label);
+ i->Length = Length;
+ i->Offset = Offset;
+ i->Next = m_HashTable[h];
+ m_HashTable[h] = i;
+ ++m_SeqCount;
+ }
+
+int GLIX::FromGFFFile(FILE *f)
+ {
+ int RecordCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ ++RecordCount;
+ Add(Rec.SeqName, Rec.End - 1);
+ if (HasTargetAttrs(Rec.Attrs))
+ {
+ char TargetName[MAX_GFF_FEATURE_LENGTH+1];
+ int Start;
+ int End;
+ ParseTargetAttrs(Rec.Attrs, TargetName, sizeof(TargetName), &Start, &End);
+ Add(TargetName, End - 1);
+ }
+ }
+ AssignOffsets();
+ return RecordCount;
+ }
+
+int GLIX::FromGFFSet(const GFFSet &Set)
+ {
+ const int RecordCount = Set.GetRecordCount();
+ for (int i = 0; i < RecordCount; ++i)
+ {
+ const GFFRecord &Rec = Set.Get(i);
+ Add(Rec.SeqName, Rec.End - 1);
+ }
+ AssignOffsets();
+ return RecordCount;
+ }
+
+void GLIX::LogMe() const
+ {
+ Log("m_SeqCount=%d\n", m_SeqCount);
+ Log(" Label Length Hash\n");
+ Log("-------------------- ---------- -------\n");
+ int Count = 0;
+ for (int h = 0; h < m_HashTableSize; ++h)
+ {
+ for (SEQDATA *i = m_HashTable[h]; i; i = i->Next)
+ {
+ Log("%20.20s %10d %7d\n", i->Label, i->Length, h);
+ ++Count;
+ }
+ }
+ if (Count != m_SeqCount)
+ Log("\n**** ERROR **** Count=%d\n", Count);
+ }
+
+int GLIX::GetGlobalLength() const
+ {
+ return m_GlobalLength;
+ }
+
+int GLIX::GetSeqCount() const
+ {
+ return m_SeqCount;
+ }
+
+int GLIX::GetSeqOffset(const char *Label) const
+ {
+ const SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 == IE)
+ Quit("Label not found %s", Label);
+ return IE->Offset;
+ }
+
+int GLIX::GetSeqLength(const char *Label) const
+ {
+ const SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 == IE)
+ Quit("Label not found %s", Label);
+ return IE->Length;
+ }
+
+void GLIX::MakeGlobalToLocalIndex()
+ {
+ if (m_GlobalLength%SEQMAP_BLOCKSIZE)
+ Quit("MakeGlobalToLocalIndex: expects rounded size");
+
+ const int BinCount = (m_GlobalLength + SEQMAP_BLOCKSIZE - 1)/SEQMAP_BLOCKSIZE;
+ m_SeqMap = all(SEQDATA *, BinCount);
+ zero(m_SeqMap, SEQDATA *, BinCount);
+
+ for (int h = 0; h < m_HashTableSize; ++h)
+ {
+ for (SEQDATA *p = m_HashTable[h]; p != 0; p = p->Next)
+ {
+ SEQDATA &IE = *p;
+ int From = p->Offset;
+ int To = From + p->Length - 1;
+
+ if (From%SEQMAP_BLOCKSIZE)
+ Quit("MakeGlobalToLocalIndex: expected rounded contig from");
+
+ int BinFrom = From/SEQMAP_BLOCKSIZE;
+ int BinTo = To/SEQMAP_BLOCKSIZE;
+ for (int Bin = BinFrom; Bin <= BinTo; ++Bin)
+ {
+ if (m_SeqMap[Bin] != 0)
+ Quit("MakeGlobalToLocalIndex: overlap error");
+ m_SeqMap[Bin] = p;
+ }
+ }
+ }
+ }
+
+const char *GLIX::GlobalToSeq(int GlobalPos, int *ptrSeqPos) const
+ {
+ if (GlobalPos < 0 || GlobalPos >= m_GlobalLength)
+ Quit("GlobalToSeqPos: invalid pos");
+ if (0 == m_SeqMap)
+ Quit("GLIX::MakeGlobalToLocalIndex not called");
+
+ const int Bin = GlobalPos/SEQMAP_BLOCKSIZE;
+ SEQDATA *IE = m_SeqMap[Bin];
+ if (0 == IE)
+ Quit("GlobalToSeqPos: doesn't map");
+ int SeqPos = GlobalPos - IE->Offset;
+ if (SeqPos < 0 || SeqPos >= IE->Length)
+ Quit("GlobalToSeqPos: out of bounds");
+ *ptrSeqPos = SeqPos;
+ return IE->Label;
+ }
+
+const char *GLIX::GlobalToSeqPadded(int GlobalPos, int *ptrSeqPos) const
+ {
+ if (GlobalPos < 0 || GlobalPos >= m_GlobalLength)
+ Quit("GlobalToSeqPos: invalid pos");
+ if (0 == m_SeqMap)
+ Quit("GLIX::MakeGlobalToLocalIndex not called");
+
+ const int Bin = GlobalPos/SEQMAP_BLOCKSIZE;
+ SEQDATA *IE = m_SeqMap[Bin];
+ if (0 == IE)
+ Quit("GlobalToSeqPos: doesn't map");
+ int SeqPos = GlobalPos - IE->Offset;
+ if (SeqPos < 0 || SeqPos >= IE->RoundedLength)
+ Quit("GlobalToSeqPos: out of bounds");
+ *ptrSeqPos = SeqPos;
+ return IE->Label;
+ }
+
+int GLIX::LocalToGlobal(const char *Label, int LocalPos) const
+ {
+ const SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 == IE)
+ Quit("ContigToGlobal: contig not found (%s)", Label);
+ if (LocalPos < 0 || LocalPos >= IE->Length)
+ Quit("ContigToGlobal: out of bounds");
+ return IE->Offset + LocalPos;
+ }
+
+int GLIX::LocalToGlobalNoFail(const char *Label, int LocalPos) const
+ {
+ const SEQDATA *IE = SeqIndexLookup(Label);
+ if (0 == IE)
+ return -1;
+ if (LocalPos < 0 || LocalPos >= IE->Length)
+ return -1;
+ return IE->Offset + LocalPos;
+ }
diff --git a/glix.h b/glix.h
new file mode 100755
index 0000000..e210613
--- /dev/null
+++ b/glix.h
@@ -0,0 +1,60 @@
+#define GFFRecord GFFRecord2
+
+const int SEQMAP_BLOCKSIZE = 1024;
+const int DEFAULT_HASH_TABLE_SIZE = 17909;
+const int DEFAULT_PAD = 0;
+
+struct SEQDATA
+ {
+ char *Label;
+ int Length;
+ int RoundedLength;
+ int Offset;
+ SEQDATA *Next;
+ };
+
+class GLIX
+ {
+public:
+ GLIX();
+ virtual ~GLIX();
+ void Free();
+
+ void Init(int Pad = DEFAULT_PAD, int HashTableSize = DEFAULT_HASH_TABLE_SIZE);
+
+ const char *GlobalToSeq(int GlobalPos, int *ptrSeqPos) const;
+ const char *GlobalToSeqPadded(int GlobalPos, int *ptrSeqPos) const;
+ int LocalToGlobal(const char *Label, int LocalPos) const;
+ int LocalToGlobalNoFail(const char *Label, int LocalPos) const;
+
+ int FromGFFFile(FILE *f);
+ int FromGFFSet(const GFFSet &Set);
+
+ void Add(const char *Label, int LocalPos);
+ void Insert(const char *Label, int Offset, int Length);
+
+ void MakeGlobalToLocalIndex();
+
+ int GetGlobalLength() const;
+ int GetSeqCount() const;
+ int GetSeqOffset(const char *Label) const;
+ int GetSeqLength(const char *Label) const;
+ void LogMe() const;
+
+private:
+ SEQDATA *AddToIndex(const char *Label, int Length);
+ void AssignOffsets();
+
+ SEQDATA *SeqIndexLookup(const char *Label) const;
+
+private:
+ SEQDATA **m_HashTable;
+ SEQDATA **m_SeqMap;
+
+ int m_SeqCount;
+ int m_HashTableSize;
+ int m_GlobalLength;
+ int m_Pad;
+ };
+
+#undef GFFRecord
diff --git a/hash.cpp b/hash.cpp
new file mode 100755
index 0000000..14dd92b
--- /dev/null
+++ b/hash.cpp
@@ -0,0 +1,164 @@
+#include "piler2.h"
+
+typedef unsigned long int ub4; /* unsigned 4-byte quantities */
+typedef unsigned char ub1; /* unsigned 1-byte quantities */
+
+#define hashsize(n) ((ub4)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+
+/*
+--------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+For every delta with one or two bits set, and the deltas of all three
+ high bits or all three low bits, whether the original value of a,b,c
+ is almost all zero or is uniformly distributed,
+* If mix() is run forward or backward, at least 32 bits in a,b,c
+ have at least 1/4 probability of changing.
+* If mix() is run forward, every bit of c will change between 1/3 and
+ 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
+mix() was built out of 36 single-cycle latency instructions in a
+ structure that could supported 2x parallelism, like so:
+ a -= b;
+ a -= c; x = (c>>13);
+ b -= c; a ^= x;
+ b -= a; x = (a<<8);
+ c -= a; b ^= x;
+ c -= b; x = (b>>13);
+ ...
+ Unfortunately, superscalar Pentiums and Sparcs can't take advantage
+ of that parallelism. They've also turned some of those single-cycle
+ latency instructions into multi-cycle latency instructions. Still,
+ this is the fastest good hash I could find. There were about 2^^68
+ to choose from. I only looked at a billion or so.
+--------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+ a -= b; a -= c; a ^= (c>>13); \
+ b -= c; b -= a; b ^= (a<<8); \
+ c -= a; c -= b; c ^= (b>>13); \
+ a -= b; a -= c; a ^= (c>>12); \
+ b -= c; b -= a; b ^= (a<<16); \
+ c -= a; c -= b; c ^= (b>>5); \
+ a -= b; a -= c; a ^= (c>>3); \
+ b -= c; b -= a; b ^= (a<<10); \
+ c -= a; c -= b; c ^= (b>>15); \
+}
+
+/*
+--------------------------------------------------------------------
+hash() -- hash a variable-length key into a 32-bit value
+ k : the key (the unaligned variable-length array of bytes)
+ len : the length of the key, counting by bytes
+ initval : can be any 4-byte value
+Returns a 32-bit value. Every bit of the key affects every bit of
+the return value. Every 1-bit and 2-bit delta achieves avalanche.
+About 6*len+35 instructions.
+
+The best hash table sizes are powers of 2. There is no need to do
+mod a prime (mod is sooo slow!). If you need less than 32 bits,
+use a bitmask. For example, if you need only 10 bits, do
+ h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (ub1 **)k, do it like this:
+ for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
+
+By Bob Jenkins, 1996. bob_jenkins at burtleburtle.net. You may use this
+code any way you wish, private, educational, or commercial. It's free.
+
+See http://burtleburtle.net/bob/hash/evahash.html
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable. Do NOT use for cryptographic purposes.
+--------------------------------------------------------------------
+*/
+
+unsigned Hash(const char *key)
+{
+ register ub1 *k = (ub1 *) key;
+ register ub4 length = (ub4) strlen(key); /* the length of the key */
+ const ub4 initval = 12345; /* the previous hash, or an arbitrary value */
+ register ub4 a,b,c,len;
+
+ /* Set up the internal state */
+ len = length;
+ a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ c = initval; /* the previous hash value */
+
+ /*---------------------------------------- handle most of the key */
+ while (len >= 12)
+ {
+ a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
+ b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
+ c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
+ mix(a,b,c);
+ k += 12; len -= 12;
+ }
+
+ /*------------------------------------- handle the last 11 bytes */
+ c += length;
+ switch(len) /* all the case statements fall through */
+ {
+ case 11: c+=((ub4)k[10]<<24);
+ case 10: c+=((ub4)k[9]<<16);
+ case 9 : c+=((ub4)k[8]<<8);
+ /* the first byte of c is reserved for the length */
+ case 8 : b+=((ub4)k[7]<<24);
+ case 7 : b+=((ub4)k[6]<<16);
+ case 6 : b+=((ub4)k[5]<<8);
+ case 5 : b+=k[4];
+ case 4 : a+=((ub4)k[3]<<24);
+ case 3 : a+=((ub4)k[2]<<16);
+ case 2 : a+=((ub4)k[1]<<8);
+ case 1 : a+=k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a,b,c);
+ /*-------------------------------------------- report the result */
+ return c%HASH_TABLE_SIZE;
+}
+
+unsigned Hash(const char *key, unsigned TableSize)
+{
+ register ub1 *k = (ub1 *) key;
+ register ub4 length = (ub4) strlen(key); /* the length of the key */
+ const ub4 initval = 12345; /* the previous hash, or an arbitrary value */
+ register ub4 a,b,c,len;
+
+ /* Set up the internal state */
+ len = length;
+ a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ c = initval; /* the previous hash value */
+
+ /*---------------------------------------- handle most of the key */
+ while (len >= 12)
+ {
+ a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
+ b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
+ c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
+ mix(a,b,c);
+ k += 12; len -= 12;
+ }
+
+ /*------------------------------------- handle the last 11 bytes */
+ c += length;
+ switch(len) /* all the case statements fall through */
+ {
+ case 11: c+=((ub4)k[10]<<24);
+ case 10: c+=((ub4)k[9]<<16);
+ case 9 : c+=((ub4)k[8]<<8);
+ /* the first byte of c is reserved for the length */
+ case 8 : b+=((ub4)k[7]<<24);
+ case 7 : b+=((ub4)k[6]<<16);
+ case 6 : b+=((ub4)k[5]<<8);
+ case 5 : b+=k[4];
+ case 4 : a+=((ub4)k[3]<<24);
+ case 3 : a+=((ub4)k[2]<<16);
+ case 2 : a+=((ub4)k[1]<<8);
+ case 1 : a+=k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a,b,c);
+ /*-------------------------------------------- report the result */
+ return c%TableSize;
+}
diff --git a/iix.cpp b/iix.cpp
new file mode 100755
index 0000000..2578534
--- /dev/null
+++ b/iix.cpp
@@ -0,0 +1,160 @@
+/***
+IIX = Interval IndeX.
+
+Given a fixed set of intervals on a
+sequence of length L, provides an efficient
+way to retrieve all intervals in the set that
+have non-zero overlap with given interval.
+
+This implementation is designed to be simple
+to implement and efficient for the typical
+sets of intervals that I will be dealing
+with -- this is not a good general solution
+to the problem. The main assumption is that
+there are few long intervals.
+
+The sequence is divided into blocks of length B.
+For each block, there is a list of intervals
+that overlap that block. So an indexed interval
+(ii) of length > B will appear in more than one
+list, and in fact any ii of length > 1 may appear
+in more than one list if it happens to cross
+a block boundary. Intervals of length >> B are a
+problem because they will appear in many lists,
+but for my purposes they are rare.
+
+Each interval is associated with a user-defined
+integer. This can be used by the calling code to
+map an ii to some data, e.g. a GFF record. How
+this is done is up to the caller.
+***/
+
+#include "piler2.h"
+
+IIX::IIX()
+ {
+ m_GlobalLength = 0;
+ m_BlockLength = 0;
+ m_BlockCount = 0;
+ m_Blocks = 0;
+ m_GLIX = 0;
+ }
+
+IIX::~IIX()
+ {
+ Free();
+ }
+
+void IIX::Free()
+ {
+ // TODO@@
+ m_GlobalLength = 0;
+ }
+
+int IIX::PosToBlockIndex(int Pos) const
+ {
+ return Pos/m_BlockLength;
+ }
+
+void IIX::Init(int GlobalLength, int BlockLength)
+ {
+ m_GlobalLength = GlobalLength;
+ m_BlockLength = BlockLength;
+ m_BlockCount = (GlobalLength + BlockLength - 1)/BlockLength;
+ m_Blocks = all(INTERVAL *, m_BlockCount);
+ zero(m_Blocks, INTERVAL *, m_BlockCount);
+ m_GLIX = 0;
+ }
+
+bool IIX::ValidInterval(int From, int To) const
+ {
+ if (From < 0 || From >= m_GlobalLength)
+ return false;
+ if (To < 0 || To >= m_GlobalLength)
+ return false;
+ if (From > To)
+ return false;
+ return true;
+ }
+
+void IIX::AddGlobal(int GlobalFrom, int GlobalTo, int User)
+ {
+ assert(ValidInterval(GlobalFrom, GlobalTo));
+
+ const int BlockFrom = PosToBlockIndex(GlobalFrom);
+ const int BlockTo = PosToBlockIndex(GlobalTo);
+ for (int BlockIndex = BlockFrom; BlockIndex <= BlockTo; ++BlockIndex)
+ {
+ INTERVAL *ii = all(INTERVAL, 1);
+
+ ii->From = GlobalFrom;
+ ii->To = GlobalTo;
+ ii->User = User;
+ ii->Next = m_Blocks[BlockIndex];
+
+ m_Blocks[BlockIndex] = ii;
+ }
+ }
+
+void IIX::AddLocal(const char *Label, int LocalFrom, int LocalTo, int User)
+ {
+ if (0 == m_GLIX)
+ Quit("IIX::AddLocal: no GLIX");
+
+ int GlobalFrom = m_GLIX->LocalToGlobal(Label, LocalFrom);
+ int GlobalTo = GlobalFrom + (LocalTo - LocalFrom + 1);
+ return AddGlobal(GlobalFrom, GlobalTo, User);
+ }
+
+int IIX::LookupGlobal(int GlobalFrom, int GlobalTo, int **ptrHits) const
+ {
+ *ptrHits = 0;
+ if (!ValidInterval(GlobalFrom, GlobalTo))
+ return 0;
+
+ int *Hits = 0;
+ int HitCount = 0;
+ int HitBufferSize = 0;
+ const int BlockFrom = PosToBlockIndex(GlobalFrom);
+ const int BlockTo = PosToBlockIndex(GlobalTo);
+ for (int BlockIndex = BlockFrom; BlockIndex <= BlockTo; ++BlockIndex)
+ {
+ for (INTERVAL *ii = m_Blocks[BlockIndex]; ii; ii = ii->Next)
+ {
+ if (Overlap(GlobalFrom, GlobalTo, ii->From, ii->To) > 0)
+ {
+ if (HitCount >= HitBufferSize)
+ {
+ HitBufferSize += 128;
+ reall(Hits, int, HitBufferSize);
+ }
+ Hits[HitCount++] = ii->User;
+ }
+ }
+ }
+ *ptrHits = Hits;
+ return HitCount;
+ }
+
+int IIX::LookupLocal(const char *Label, int LocalFrom, int LocalTo, int **ptrHits) const
+ {
+ int GlobalFrom = m_GLIX->LocalToGlobalNoFail(Label, LocalFrom);
+ if (-1 == GlobalFrom)
+ return 0;
+
+ int GlobalTo = GlobalFrom + (LocalTo - LocalFrom + 1);
+ return LookupGlobal(GlobalFrom, GlobalTo, ptrHits);
+ }
+
+void IIX::LogMe() const
+ {
+ Log("IIX %d blocks\n", m_BlockCount);
+ Log(" Block From To User\n");
+ Log("====== ========== ========== ==========\n");
+ for (int BlockIndex = 0; BlockIndex < m_BlockCount; ++BlockIndex)
+ {
+ for (INTERVAL *ii = m_Blocks[BlockIndex]; ii; ii = ii->Next)
+ Log("%6d %10d %10d %10d\n",
+ BlockIndex, ii->From, ii->To, ii->User);
+ }
+ }
diff --git a/iix.h b/iix.h
new file mode 100755
index 0000000..e49e882
--- /dev/null
+++ b/iix.h
@@ -0,0 +1,42 @@
+const int DEFAULT_BLOCK_LENGTH = 10000;
+
+// From and To are 0-based coordinates,
+// i.e. are >= 0 and < m_GlobalLength.
+struct INTERVAL
+ {
+ int From;
+ int To;
+ int User;
+ INTERVAL *Next;
+ };
+
+class IIX
+ {
+public:
+ IIX();
+ virtual ~IIX();
+
+ void Init(int GlobalLength, int BlockLength = DEFAULT_BLOCK_LENGTH);
+ void SetGLIX(GLIX *ptrGLIX) { m_GLIX = ptrGLIX; }
+
+ void Free();
+ void AddGlobal(int GlobalFrom, int GlobalTo, int User);
+ void AddLocal(const char *Label, int LocalFrom, int LocalTo, int User);
+
+ int LookupGlobal(int GlobalFrom, int GlobalTo, int **ptrHits) const;
+ int LookupLocal(const char *Label, int LocalFrom, int LocalTo, int **ptrHits) const;
+
+ bool ValidInterval(int From, int To) const;
+
+ void LogMe() const;
+
+private:
+ int PosToBlockIndex(int Pos) const;
+
+private:
+ int m_GlobalLength;
+ int m_BlockLength;
+ int m_BlockCount;
+ INTERVAL **m_Blocks;
+ GLIX *m_GLIX;
+ };
diff --git a/log.cpp b/log.cpp
new file mode 100755
index 0000000..f598ca9
--- /dev/null
+++ b/log.cpp
@@ -0,0 +1,51 @@
+#include "piler2.h"
+
+static FILE *g_fLog = 0;
+
+void SetLog()
+ {
+ bool Append = false;
+ const char *FileName = ValueOpt("log");
+ if (0 == FileName)
+ {
+ FileName = ValueOpt("loga");
+ if (0 == FileName)
+ return;
+ Append = true;
+ }
+
+ g_fLog = fopen(FileName, Append ? "a" : "w");
+ if (NULL == g_fLog)
+ {
+ fprintf(stderr, "\n*** FATAL ERROR *** Cannot open log file\n");
+ perror(FileName);
+ exit(1);
+ }
+ }
+
+void Log(const char Format[], ...)
+ {
+ if (0 == g_fLog)
+ return;
+
+ char Str[4096];
+ va_list ArgList;
+ va_start(ArgList, Format);
+ vsprintf(Str, Format, ArgList);
+ fprintf(g_fLog, "%s", Str);
+ fflush(g_fLog);
+ }
+
+void Warning(const char Format[], ...)
+ {
+ char Str[4096];
+ va_list ArgList;
+ va_start(ArgList, Format);
+ vsprintf(Str, Format, ArgList);
+ fprintf(stderr, "\n** WARNING ** %s\n", Str);
+ if (0 != g_fLog)
+ {
+ fprintf(g_fLog, "** WARNING ** %s\n", Str);
+ fflush(g_fLog);
+ }
+ }
diff --git a/main.cpp b/main.cpp
new file mode 100755
index 0000000..07e9959
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,87 @@
+#include "piler2.h"
+
+#if WIN32
+#include <windows.h>
+#endif
+
+bool g_Quiet = false;
+const char *g_ProcessName = "piler2";
+
+int main(int argc, char *argv[])
+ {
+ g_ProcessName = argv[0];
+
+#if WIN32
+// Multi-tasking does not work well in CPU-bound
+// console apps running under Win32.
+// Reducing the process priority allows GUI apps
+// to run responsively in parallel.
+ SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+#endif
+
+ ProcessArgVect(argc - 1, argv + 1);
+ SetLog();
+ g_Quiet = FlagOpt("quiet");
+
+ for (int i = 0; i < argc; ++i)
+ Log("%s ", argv[i]);
+ Log("\n");
+
+ if (ValueOpt("trs") != 0)
+ {
+ TRS();
+ return 0;
+ }
+ else if (ValueOpt("tan") != 0)
+ {
+ Tan();
+ return 0;
+ }
+ else if (ValueOpt("tr") != 0)
+ {
+ TR();
+ return 0;
+ }
+ else if (ValueOpt("trs2fasta"))
+ {
+ TRS2Fasta();
+ return 0;
+ }
+ else if (ValueOpt("tanmotif2fasta"))
+ {
+ Tanmotif2Fasta();
+ return 0;
+ }
+ else if (ValueOpt("cons"))
+ {
+ Cons();
+ return 0;
+ }
+ else if (ValueOpt("annot"))
+ {
+ Annot();
+ return 0;
+ }
+ else if (ValueOpt("annotedge"))
+ {
+ AnnotEdge();
+ return 0;
+ }
+ else if (ValueOpt("crisp"))
+ {
+ Crisp();
+ return 0;
+ }
+ else if (FlagOpt("help"))
+ {
+ Usage();
+ exit(0);
+ }
+ else if (FlagOpt("version"))
+ {
+ fprintf(stderr, PILER_LONG_VERSION "\n");
+ exit(0);
+ }
+ else
+ CommandLineError("No command specified");
+ }
diff --git a/makeannot.cpp b/makeannot.cpp
new file mode 100755
index 0000000..c99b40f
--- /dev/null
+++ b/makeannot.cpp
@@ -0,0 +1,373 @@
+#include "piler2.h"
+
+#define MAX(i, j) ((i) >= (j) ? (i) : (j))
+#define MIN(i, j) ((i) <= (j) ? (i) : (j))
+
+static double MIN_OVERLAP = 0.05;
+const int MAX_HITS = 8;
+const int MAX_STR = 256;
+const int MAP_CHARS = 20;
+const double MAX_MARGIN = 0.05;
+const int EDGE_BASES = 100;
+const int MIN_OVERLAP_EDGE = 32;
+
+struct AnnotHit
+ {
+ int Overlap;
+ int RepIndex;
+ };
+static AnnotHit AHs[MAX_HITS];
+static int AnnotHitCount;
+
+static char Str[MAX_STR+1];
+static int StrPos;
+
+static const RepData *g_Reps;
+static int g_RepCount;
+static bool g_Rev;
+static int g_Mid;
+
+static int InRegion(int ContigFrom, int ContigTo, int Pos)
+ {
+ if (Pos >= ContigFrom && Pos <= ContigTo)
+ return true;
+ if (Pos < ContigFrom && ContigFrom - Pos <= MAX_MARGIN)
+ return true;
+ if (Pos > ContigTo && Pos - ContigTo <= MAX_MARGIN)
+ return true;
+ return false;
+ }
+
+static int CmpAH(const void *a1, const void *a2)
+ {
+ const AnnotHit *h1 = (const AnnotHit *) a1;
+ const AnnotHit *h2 = (const AnnotHit *) a2;
+ int i1 = h1->RepIndex;
+ int i2 = h2->RepIndex;
+ assert(i1 >= 0 && i1 < g_RepCount);
+ assert(i2 >= 0 && i2 < g_RepCount);
+ if (g_Rev)
+ return g_Reps[i2].ContigFrom - g_Reps[i1].ContigFrom;
+ return g_Reps[i1].ContigFrom - g_Reps[i2].ContigFrom;
+ }
+
+static int CmpAHEdge(const void *a1, const void *a2)
+ {
+ const AnnotHit *h1 = (const AnnotHit *) a1;
+ const AnnotHit *h2 = (const AnnotHit *) a2;
+ int i1 = h1->RepIndex;
+ int i2 = h2->RepIndex;
+ assert(i1 >= 0 && i1 < g_RepCount);
+ assert(i2 >= 0 && i2 < g_RepCount);
+
+ const RepData &Rep1 = g_Reps[i1];
+ const RepData &Rep2 = g_Reps[i2];
+
+ int df1 = iabs(Rep1.ContigFrom - g_Mid);
+ int df2 = iabs(Rep2.ContigFrom - g_Mid);
+
+ int dt1 = iabs(Rep1.ContigTo - g_Mid);
+ int dt2 = iabs(Rep2.ContigTo - g_Mid);
+
+ int d1 = MIN(df1, dt1);
+ int d2 = MIN(df2, dt2);
+
+ return d1 - d2;
+ }
+
+static void SafeCatChar(char c)
+ {
+ if (StrPos >= MAX_STR)
+ return;
+ Str[StrPos++] = c;
+ Str[StrPos] = 0;
+ }
+
+static void SafeCat(const char *s)
+ {
+ while (char c = *s++)
+ {
+ if (StrPos >= MAX_STR)
+ break;
+ Str[StrPos++] = c;
+ }
+ Str[StrPos] = 0;
+ }
+
+static int GetAnnots(const char *Label, int ContigFrom, int ContigTo,
+ const RepData *Reps, int RepCount, int MinOverlap)
+ {
+ int AnnotHitCount = 0;
+ for (int RepIndex = 0; RepIndex < RepCount; ++RepIndex)
+ {
+ const RepData &Rep = Reps[RepIndex];
+ if (0 != strcmp(Label, Rep.ContigLabel))
+ continue;
+ int ov = Overlap(ContigFrom, ContigTo, Rep.ContigFrom, Rep.ContigTo);
+ if (ov < MinOverlap)
+ continue;
+
+ // Unconditionally accept if not maxed out
+ if (AnnotHitCount < MAX_HITS)
+ {
+ AHs[AnnotHitCount].Overlap = ov;
+ AHs[AnnotHitCount].RepIndex = RepIndex;
+ ++AnnotHitCount;
+ }
+ else
+ {
+ // Overwrite shortest hit if this is longer
+ int SmallestOverlap = AHs[0].Overlap;
+ int SmallestIndex = 0;
+ for (int i = 1; i < MAX_HITS; ++i)
+ {
+ if (AHs[i].Overlap < SmallestOverlap)
+ {
+ SmallestOverlap = AHs[i].Overlap;
+ SmallestIndex = 0;
+ }
+ }
+ if (ov > SmallestOverlap)
+ {
+ AHs[SmallestIndex].RepIndex = RepIndex;
+ AHs[SmallestIndex].Overlap = ov;
+ }
+ }
+ }
+ return AnnotHitCount;
+ }
+
+static int PosToCharIndex(int RegionFrom, int RegionTo, int Pos, int MapChars)
+ {
+ if (MapChars <= 1)
+ return 0;
+
+ int RegionLength = RegionTo - RegionFrom + 1;
+ if (RegionLength <= 1)
+ return 0;
+
+ if (Pos < RegionFrom)
+ return 0;
+ if (Pos >= RegionTo)
+ return MapChars - 1;
+
+ int CharIndex = ((Pos - RegionFrom)*MapChars)/RegionLength;
+ if (CharIndex < 0 || CharIndex >= MapChars)
+ Quit("PosToCharIndex messed up");
+ return CharIndex;
+ }
+
+static void RevStr()
+ {
+ for (int i = 0; i < MAP_CHARS/2; ++i)
+ {
+ char c1 = Str[i];
+ char c2 = Str[MAP_CHARS-i-1];
+ Str[i] = c2;
+ Str[MAP_CHARS-i-1] = c1;
+ }
+ }
+
+const char *MakeAnnotEdge(const char *Label, int SeqFrom, int SeqTo, bool Rev,
+ const RepData *Reps, int RepCount)
+ {
+ StrPos = 0;
+ for (int i = 0; i < MAP_CHARS; ++i)
+ SafeCatChar('-');
+
+ const int Mid = (SeqFrom + SeqTo)/2;
+ int ContigFrom = SeqFrom - EDGE_BASES/2;
+ int ContigTo = SeqFrom + EDGE_BASES/2;
+
+ int From = ContigFrom;
+ int To = ContigTo;
+ if (From < 0)
+ From = 0;
+
+ AnnotHitCount = GetAnnots(Label, From, To, Reps, RepCount, MIN_OVERLAP_EDGE);
+ if (0 == AnnotHitCount)
+ return Str;
+
+// Global vars needed by CmpAHEdge
+ g_Rev = Rev;
+ g_Reps = Reps;
+ g_RepCount = RepCount;
+ g_Mid = Mid;
+// Sort by edge
+ qsort((void *) AHs, AnnotHitCount, sizeof(AnnotHit), CmpAHEdge);
+
+ for (int AnnotIndex = 0; AnnotIndex < AnnotHitCount; ++AnnotIndex)
+ {
+ const int RepIndex = AHs[AnnotIndex].RepIndex;
+ const RepData &Rep = Reps[RepIndex];
+
+ bool LeftEnd = false;
+ bool RightEnd = false;
+ int RepFrom = Rep.RepeatFrom;
+ if (-1 != RepFrom)
+ {
+ int RepTo = Rep.RepeatTo;
+ int RepLeft = Rep.RepeatLeft;
+ int RepLength = RepTo + RepLeft - 1;
+ if (RepLength <= 0)
+ RepLength = 9999; // hack to avoid div by 0
+
+ int RepContigFrom = Rep.ContigFrom;
+ int RepContigTo = Rep.ContigTo;
+ if (Rep.Rev)
+ {
+ int Tmp = RepFrom;
+ RepFrom = RepLeft;
+ RepLeft = Tmp;
+ }
+ const double LeftMargin = (double) RepFrom / RepLength;
+ const double RightMargin = (double) RepLeft / RepLength;
+ bool LeftInRegion = InRegion(ContigFrom, ContigTo, Rep.ContigFrom);
+ bool RightInRegion = InRegion(ContigFrom, ContigTo, Rep.ContigTo);
+ LeftEnd = (LeftInRegion && LeftMargin <= MAX_MARGIN);
+ RightEnd = (RightInRegion && RightMargin <= MAX_MARGIN);
+ }
+
+ int MapFrom = PosToCharIndex(ContigFrom, ContigTo, Rep.ContigFrom, MAP_CHARS);
+ int MapTo = PosToCharIndex(ContigFrom, ContigTo, Rep.ContigTo, MAP_CHARS);
+
+ const char cLower = 'a' + AnnotIndex;
+ const char cUpper = 'A' + AnnotIndex;
+
+ if (LeftEnd)
+ Str[MapFrom] = cUpper;
+ else
+ Str[MapFrom] = cLower;
+
+ for (int i = MapFrom + 1; i <= MapTo - 1; ++i)
+ Str[i] = cLower;
+
+ if (RightEnd)
+ Str[MapTo] = cUpper;
+ else
+ Str[MapTo] = cLower;
+
+ SafeCat(" ");
+ SafeCat(Rep.RepeatName);
+
+ if (Rep.RepeatFrom >= 0)
+ {
+ int FullRepLength = Rep.RepeatTo + Rep.RepeatLeft;
+ int RepMissing = Rep.RepeatFrom + Rep.RepeatLeft;
+ double Pct = ((FullRepLength - RepMissing)*100.0)/FullRepLength;
+ char s[32];
+ sprintf(s, "(%.0f%%)", Pct);
+ SafeCat(s);
+ }
+ }
+
+ if (Rev)
+ RevStr();
+ return Str;
+ }
+
+const char *MakeAnnot(const char *Label, int SeqFrom, int SeqTo, bool Rev,
+ const RepData *Reps, int RepCount)
+ {
+ int ContigFrom = SeqFrom;
+ int ContigTo = SeqTo;
+ int Length = ContigTo - ContigFrom + 1;
+ const char *strMinRatio = ValueOpt("minratio");
+ if (strMinRatio != 0)
+ MIN_OVERLAP = atof(strMinRatio)/100.0;
+ const int MinOverlap = (int) (Length*MIN_OVERLAP);
+
+ StrPos = 0;
+ for (int i = 0; i < MAP_CHARS; ++i)
+ SafeCatChar('-');
+
+ AnnotHitCount = GetAnnots(Label, ContigFrom, ContigTo, Reps, RepCount,
+ MinOverlap);
+ if (0 == AnnotHitCount)
+ return Str;
+
+// Global vars needed by CmpAH
+ g_Rev = Rev;
+ g_Reps = Reps;
+ g_RepCount = RepCount;
+// Sort by start pos
+ qsort((void *) AHs, AnnotHitCount, sizeof(AnnotHit), CmpAH);
+
+ for (int AnnotIndex = 0; AnnotIndex < AnnotHitCount; ++AnnotIndex)
+ {
+ const int RepIndex = AHs[AnnotIndex].RepIndex;
+ const RepData &Rep = Reps[RepIndex];
+ const int From = MAX(Rep.ContigFrom, ContigFrom);
+ const int To = MIN(Rep.ContigTo, ContigTo);
+
+ bool LeftEnd = false;
+ bool RightEnd = false;
+ int RepFrom = Rep.RepeatFrom;
+ if (-1 != RepFrom)
+ {
+ int RepTo = Rep.RepeatTo;
+ int RepLeft = Rep.RepeatLeft;
+ int RepLength = RepTo + RepLeft - 1;
+ if (RepLength <= 0)
+ RepLength = 9999; // hack to avoid div by 0
+
+ if (Rep.ContigFrom < ContigFrom)
+ RepFrom += (ContigFrom - Rep.ContigFrom);
+ if (Rep.ContigTo > ContigTo)
+ RepTo -= Rep.ContigTo - ContigTo;
+
+ const double LeftMargin = (double) RepFrom / RepLength;
+ const double RightMargin = (double) RepLeft / RepLength;
+ LeftEnd = (LeftMargin <= MAX_MARGIN);
+ RightEnd = (RightMargin <= MAX_MARGIN);
+ }
+
+ int MapFrom = ((From - ContigFrom)*MAP_CHARS)/Length;
+ int MapTo = ((To - ContigFrom)*MAP_CHARS)/Length;
+
+ if (Rev)
+ {
+ int Tmp = MapFrom;
+ MapFrom = MAP_CHARS - MapTo - 1;
+ MapTo = MAP_CHARS - Tmp - 1;
+ }
+
+ if (MapFrom < 0 || MapFrom >= MAP_CHARS || MapTo < 0 || MapTo >= MAP_CHARS)
+ Quit("MakeAnnot: failed to map");
+
+ const char cLower = 'a' + AnnotIndex;
+ const char cUpper = 'A' + AnnotIndex;
+
+ if (LeftEnd)
+ Str[MapFrom] = cUpper;
+ else
+ Str[MapFrom] = cLower;
+
+ for (int i = MapFrom + 1; i <= MapTo - 1; ++i)
+ Str[i] = cLower;
+
+ if (RightEnd)
+ Str[MapTo] = cUpper;
+ else
+ Str[MapTo] = cLower;
+
+ SafeCat(" ");
+ SafeCat(Rep.RepeatName);
+
+ if (Rep.RepeatFrom >= 0)
+ {
+ int FullRepLength = Rep.RepeatTo + Rep.RepeatLeft;
+ int RepMissing = Rep.RepeatFrom + Rep.RepeatLeft;
+ if (ContigFrom > Rep.ContigFrom)
+ RepMissing += ContigFrom - Rep.ContigFrom;
+ if (Rep.ContigTo > ContigTo)
+ RepMissing += Rep.ContigTo - ContigTo;
+ double Pct = ((FullRepLength - RepMissing)*100.0)/FullRepLength;
+ char s[32];
+ sprintf(s, "(%.0f%%)", Pct);
+ SafeCat(s);
+ }
+ }
+
+ return Str;
+ }
diff --git a/mem.cpp b/mem.cpp
new file mode 100755
index 0000000..93bf2a8
--- /dev/null
+++ b/mem.cpp
@@ -0,0 +1,53 @@
+#include "piler2.h"
+
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif
+
+static int AllocatedBytes;
+static int PeakAllocatedBytes;
+
+// Allocate memory, fail on error, track total
+void *allocmem(int bytes)
+ {
+ assert(bytes < 0xfffffff0); // check for overlow
+ char *p = (char *) malloc((size_t) (bytes + 4));
+ if (0 == p)
+ Quit("Out of memory (%d)", bytes);
+ int *pSize = (int *) p;
+ *pSize = bytes;
+ AllocatedBytes += bytes;
+ if (AllocatedBytes > PeakAllocatedBytes)
+ PeakAllocatedBytes = AllocatedBytes;
+ return p + 4;
+ }
+
+void freemem(void *p)
+ {
+ if (0 == p)
+ return;
+ int *pSize = (int *) ((char *) p - 4);
+ int bytes = *pSize;
+ assert(bytes <= AllocatedBytes);
+ AllocatedBytes -= bytes;
+ free(((char *) p) - 4);
+ }
+
+void chkmem()
+ {
+#ifdef _MSC_VER
+ assert(_CrtCheckMemory());
+#endif
+ }
+
+void *reallocmem(void *p, int bytes)
+ {
+ if (0 == p)
+ return allocmem(bytes);
+
+ int *pSize = (int *) ((char *) p - 4);
+ int oldbytes = *pSize;
+ void *newp = allocmem(bytes);
+ memcpy(newp, p, oldbytes);
+ return newp;
+ }
diff --git a/options.cpp b/options.cpp
new file mode 100755
index 0000000..7342aca
--- /dev/null
+++ b/options.cpp
@@ -0,0 +1,156 @@
+#include "piler2.h"
+
+struct VALUE_OPT
+ {
+ const char *m_pstrName;
+ const char *m_pstrValue;
+ };
+
+struct FLAG_OPT
+ {
+ const char *m_pstrName;
+ bool m_bSet;
+ };
+
+static VALUE_OPT ValueOpts[] =
+ {
+ "trs", 0,
+ "trs2", 0,
+ "trs2fasta", 0,
+ "tanmotif2fasta", 0,
+ "cons", 0,
+ "arrays", 0,
+ "out", 0,
+ "piles", 0,
+ "images", 0,
+ "log", 0,
+ "loga", 0,
+ "seq", 0,
+ "path", 0,
+ "rep", 0,
+ "mincover", 0,
+ "maxlengthdiffpct", 0,
+ "maxfam", 0,
+ "label", 0,
+ "annot", 0,
+ "annotedge", 0,
+ "famsize", 0,
+ "prefix", 0,
+ "tan", 0,
+ "pyramid", 0,
+ "motif", 0,
+ "minhits", 0,
+ "maxmargin", 0,
+ "minratio", 0,
+ "tr", 0,
+ "cand", 0,
+ "mintrspacing", 0,
+ "maxtrspacing", 0,
+ "mintrlength", 0,
+ "minspacingratio", 0,
+ "minfam", 0,
+ "minhitratio", 0,
+ "mindistpairs", 0,
+ "crisp", 0,
+ };
+static int ValueOptCount = sizeof(ValueOpts)/sizeof(ValueOpts[0]);
+
+static FLAG_OPT FlagOpts[] =
+ {
+ "multihit", 0,
+ "quiet", 0,
+ "version", 0,
+ "help", 0,
+ "edge", 0,
+ "pilesonly", 0,
+ "exact", 0,
+ };
+static int FlagOptCount = sizeof(FlagOpts)/sizeof(FlagOpts[0]);
+
+void CommandLineError(const char *Format, ...)
+ {
+ va_list ArgList;
+ va_start(ArgList, Format);
+
+ char Str[4096];
+ vsprintf(Str, Format, ArgList);
+ Usage();
+ fprintf(stderr, "\n** Command line error** %s\n", Str);
+ exit(1);
+ }
+
+static bool TestSetFlagOpt(const char *Arg)
+ {
+ for (int i = 0; i < FlagOptCount; ++i)
+ if (!stricmp(Arg, FlagOpts[i].m_pstrName))
+ {
+ FlagOpts[i].m_bSet = true;
+ return true;
+ }
+ return false;
+ }
+
+static bool TestSetValueOpt(const char *Arg, const char *Value)
+ {
+ for (int i = 0; i < ValueOptCount; ++i)
+ if (!stricmp(Arg, ValueOpts[i].m_pstrName))
+ {
+ if (0 == Value)
+ CommandLineError("Option -%s must have value\n", Arg);
+ ValueOpts[i].m_pstrValue = strsave(Value);
+ return true;
+ }
+ return false;
+ }
+
+bool FlagOpt(const char *Name)
+ {
+ for (int i = 0; i < FlagOptCount; ++i)
+ if (!stricmp(Name, FlagOpts[i].m_pstrName))
+ return FlagOpts[i].m_bSet;
+ Quit("FlagOpt(%s) invalid", Name);
+ return false;
+ }
+
+const char *ValueOpt(const char *Name)
+ {
+ for (int i = 0; i < ValueOptCount; ++i)
+ if (!stricmp(Name, ValueOpts[i].m_pstrName))
+ return ValueOpts[i].m_pstrValue;
+ Quit("ValueOpt(%s) invalid", Name);
+ return 0;
+ }
+
+const char *RequiredValueOpt(const char *Name)
+ {
+ const char *s = ValueOpt(Name);
+ if (0 == s)
+ CommandLineError("Required option -%s not specified\n", Name);
+ return s;
+ }
+
+void ProcessArgVect(int argc, char *argv[])
+ {
+ for (int iArgIndex = 0; iArgIndex < argc; )
+ {
+ const char *Arg = argv[iArgIndex];
+ if (Arg[0] != '-')
+ Quit("Command-line option \"%s\" must start with '-'\n", Arg);
+ const char *ArgName = Arg + 1;
+ if (TestSetFlagOpt(ArgName))
+ {
+ ++iArgIndex;
+ continue;
+ }
+
+ char *Value = 0;
+ if (iArgIndex < argc - 1)
+ Value = argv[iArgIndex+1];
+ if (TestSetValueOpt(ArgName, Value))
+ {
+ iArgIndex += 2;
+ continue;
+ }
+ CommandLineError("Invalid command line option \"%s\"\n", ArgName);
+ }
+ }
diff --git a/params.h b/params.h
new file mode 100755
index 0000000..1345644
--- /dev/null
+++ b/params.h
@@ -0,0 +1,8 @@
+const int BITS_PER_INT = 32;
+const int CONTIG_MAP_BIN_SIZE = 1024;
+const int CHUNK_LENGTH = 1;//TODO?
+const int MAX_GFF_LINE = 1023;
+const int MAX_GFF_FEATURE_LENGTH = 128;
+const int HASH_TABLE_SIZE = 17909;
+const int FASTA_BLOCK = 60;
+const int DEFAULT_MAX_FAM = 16;
diff --git a/piler2.h b/piler2.h
new file mode 100755
index 0000000..7f10059
--- /dev/null
+++ b/piler2.h
@@ -0,0 +1,148 @@
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+
+#define PILER_LONG_VERSION "PILER v1.0"
+
+#if _MSC_VER
+#pragma warning(disable:4800) // don't warn about bool->int conversion
+#endif
+
+#ifdef _DEBUG
+#define DEBUG 1
+#endif
+
+#ifdef WIN32
+#define FILEIO_BINARY_MODE 1
+#else
+#define FILEIO_BINARY_MODE 0
+#define stricmp strcasecmp
+#define strnicmp strncasecmp
+#endif
+
+#include "params.h"
+#include "types.h"
+
+extern bool g_Quiet;
+extern const char *g_ProcessName;
+
+// Memory wrappers.
+// Macro hacks, but makes code more readable
+// by hiding cast and sizeof.
+#define all(t, n) (t *) allocmem((n)*sizeof(t))
+#define reall(p, t, n) p = (t *) reallocmem(p, (n)*sizeof(t))
+#define zero(p, t, n) memset(p, 0, (n)*sizeof(t))
+void *allocmem(int bytes);
+void freemem(void *p);
+void *reallocmem(void *p, int bytes);
+
+char *strsave(const char *s);
+
+FILE *OpenStdioFile(const char *FileName, FILEIO_MODE Mode = FILEIO_MODE_ReadOnly);
+void Rewind(FILE *f);
+int GetFileSize(FILE *f);
+
+// FASTA input / output
+// MFA=multi FASTA (>= 1 sequence in file)
+// AFA=aligned FASTA (>= 1 sequence with gaps, must be same length)
+char *ReadMFA(const char FileName[], int *ptrSeqLength);
+char *ReadAFA(const char FileName[], int *ptrSeqLength, int *ptrSeqCount);
+void WriteFasta(FILE *f, const char *Seq, int Length, const char *Label, bool Rev = false);
+
+void Usage();
+void Quit(const char szFormat[], ...);
+
+void SetLog();
+void Log(const char Format[], ...);
+void Warning(const char Format[], ...);
+
+void ProgressStart(const char *Format, ...);
+void Progress(const char *Format, ...);
+void ProgressDone();
+int GetElapsedSecs();
+unsigned GetMaxMemUseBytes();
+
+void ProcessArgVect(int argc, char *argv[]);
+const char *ValueOpt(const char *Name);
+const char *RequiredValueOpt(const char *Name);
+bool FlagOpt(const char *Name);
+void CommandLineError(const char *Format, ...);
+
+unsigned GetPeakMemUseBytes();
+unsigned GetMemUseBytes();
+double GetRAMSize();
+
+int FindConnectedComponents(EdgeList &Edges, FamList &Fams,
+ int MinComponentSize);
+
+void AddContig(const char *Label, int GlobalPos, int Length);
+void AddContigPos(const char *Label, int Pos);
+void LogContigs();
+int GlobalizeContigs();
+const char *GlobalToContig(int GlobalPos, int *ptrContigPos);
+int ContigToGlobal(int ContigPos, const char *Label);
+void MakeContigMap();
+void SetSeqLength(int Length);
+unsigned Hash(const char *key);
+unsigned Hash(const char *key, unsigned TableSize);
+int Overlap(int From1, int To1, int From2, int To2);
+int RoundUp(int i, int MinInc, int MultipleOf);
+
+static inline int iabs(int i)
+ {
+ return i > 0 ? i : -i;
+ }
+
+const char *MakeAnnot(const char *Label, int Start, int End, bool Rev,
+ const RepData *Reps, int RepCount);
+const char *MakeAnnotEdge(const char *Label, int SeqFrom, int SeqTo, bool Rev,
+ const RepData *Reps, int RepCount);
+
+// GFF helpers
+int GetFields(char *Line, char **Fields, int MaxFields, char FS);
+bool GetNextGFFRecord(FILE *f, GFFRecord &Rec);
+void WriteGFFRecord(FILE *f, const GFFRecord &Rec);
+extern int GFFLineNr;
+
+bool GetNextGFFRecord(FILE *f, GFFRecord2 &Rec);
+void WriteGFFRecord(FILE *f, const GFFRecord2 &Rec);
+
+void GFFRecordToHit(const GLIX &Glix, const GFFRecord2 &Rec, HitData &Hit);
+void HitToGFFRecord(const GLIX &Glix, const HitData &Hit, GFFRecord2 &Rec, char AnnotBuffer[]);
+void FreeGFFStrings(GFFRecord2 &Rec);
+void SaveGFFStrings(GFFRecord2 &Rec);
+
+bool HasTargetAttrs(const char *Attrs);
+void ParseTargetAttrs(const char *Attrs, char SeqName[],
+ int SeqNameBytes, int *ptrStart, int *ptrEnd);
+void ParsePilesAttrs(const char *Attrs, int *ptrQueryPile, int *ptrTargetPile);
+
+// GFF input
+HitData *ReadHits(const char *FileName, int *ptrHitCount, int *ptrSeqLength);
+TRSData *ReadTRS(const char *FileName, int *ptrTRSCount);
+RepData *ReadReps(const char *FileName, int *ptrRepCount);
+MotifData *ReadMotif(const char *FileName, int *ptrMotifCount);
+
+// GFF output
+void WriteTRSFile(const char *OutputFileName, const PileData *Piles, int PileCount);
+void WritePiles(const char *OutputFileName, const PileData *Piles, int PileCount);
+void WriteImages(const char *OutputFileName, const HitData *Hits, int PileCount,
+ const PILE_INDEX_TYPE *PileIndexes);
+void WriteCrispFile(const char *OutputFileName, const PileData *Piles, int PileCount);
+void WriteArray(FILE *f, int ArrayIndex, int Lo, int Hi);
+
+// Commands
+void TR();
+void TRS();
+//void TRS2();
+void TRS2Fasta();
+void Cons();
+void Annot();
+void AnnotEdge();
+void Tanmotif2Fasta();
+void Tan();
+void Crisp();
diff --git a/progress.cpp b/progress.cpp
new file mode 100755
index 0000000..c800f39
--- /dev/null
+++ b/progress.cpp
@@ -0,0 +1,67 @@
+#include "piler2.h"
+#include <time.h>
+
+static time_t g_StartTime = time(0);
+static time_t g_StartTimeStep;
+static char *g_Desc = 0;
+
+int GetElapsedSecs()
+ {
+ return (int) (time(0) - g_StartTime);
+ }
+
+static void ProgressPrefix()
+ {
+ int ElapsedSecs = (int) (time(0) - g_StartTime);
+ unsigned MemUseMB = (GetMemUseBytes() + 500000)/1000000;
+ unsigned RAMMB = (unsigned) ((GetRAMSize() + 500000)/1000000);
+ unsigned MemUsePct = (MemUseMB*100)/RAMMB;
+ Log("%4d s %4d Mb (%3d%%) ", ElapsedSecs, MemUseMB, MemUsePct);
+
+ if (!g_Quiet)
+ fprintf(stderr, "%4d s %6d Mb (%3d%%) ", ElapsedSecs, MemUseMB, MemUsePct);
+ }
+
+void ProgressStart(const char *Format, ...)
+ {
+ ProgressPrefix();
+
+ g_StartTimeStep = time(0);
+ char Str[4096];
+ va_list ArgList;
+ va_start(ArgList, Format);
+ vsprintf(Str, Format, ArgList);
+ if (g_Desc != 0)
+ free(g_Desc);
+ g_Desc = strsave(Str);
+ Log("%s\n", Str);
+
+ if (!g_Quiet)
+ fprintf(stderr, "%s\n", Str);
+ }
+
+void ProgressDone()
+ {
+ if (0 == g_Desc)
+ return;
+
+ ProgressPrefix();
+
+ int Secs = (int) (time(0) - g_StartTimeStep);
+ Log("%s done (%d s).\n", g_Desc, Secs);
+
+ if (!g_Quiet)
+ fprintf(stderr, "%s done (%d s).\n", g_Desc, Secs);
+ }
+
+void Progress(const char *Format, ...)
+ {
+ char Str[4096];
+ va_list ArgList;
+ va_start(ArgList, Format);
+ vsprintf(Str, Format, ArgList);
+ Log("%s\n", Str);
+
+ if (!g_Quiet)
+ fprintf(stderr, "%s\n", Str);
+ }
diff --git a/quit.cpp b/quit.cpp
new file mode 100755
index 0000000..084014c
--- /dev/null
+++ b/quit.cpp
@@ -0,0 +1,38 @@
+#include "piler2.h"
+
+#ifdef WIN32
+#define _WIN32_WINNT 0x0400
+#include <windows.h>
+
+void Break()
+ {
+ DebugBreak();
+ }
+#endif
+
+// Exit immediately with error message, printf-style.
+void Quit(const char szFormat[], ...)
+ {
+ va_list ArgList;
+ char szStr[4096];
+
+ va_start(ArgList, szFormat);
+ vsprintf(szStr, szFormat, ArgList);
+
+ fprintf(stderr, "\n*** FATAL ERROR *** %s %s\n", g_ProcessName, szStr);
+
+ Log("\n*** FATAL ERROR *** ");
+ Log("%s\n", szStr);
+
+#if DEBUG
+#ifdef WIN32
+ if (IsDebuggerPresent())
+ {
+ int iBtn = MessageBox(NULL, szStr, "piler", MB_ICONERROR | MB_OKCANCEL);
+ if (IDCANCEL == iBtn)
+ Break();
+ }
+#endif
+#endif
+ exit(1);
+ }
diff --git a/readafa.cpp b/readafa.cpp
new file mode 100755
index 0000000..1587780
--- /dev/null
+++ b/readafa.cpp
@@ -0,0 +1,86 @@
+#include "piler2.h"
+
+char *ReadAFA(FILE *f, int *ptrSeqLength, int *ptrSeqCount)
+ {
+ rewind(f);
+ int FileSize = GetFileSize(f);
+ int BufferSize = FileSize;
+ char *Buffer = all(char, BufferSize);
+
+ char prev_c = '\n';
+ bool InLabel = false;
+ int Pos = 0;
+ int SeqStart = 0;
+ int SeqCount = 0;
+ int SeqLength;
+
+ for (;;)
+ {
+ int c = fgetc(f);
+ if (EOF == c)
+ {
+ if (feof(f))
+ break;
+ Quit("Stream error");
+ }
+ if (InLabel)
+ {
+ if (c == '\r')
+ continue;
+ if ('\n' == c)
+ InLabel = false;
+ }
+ else
+ {
+ if ('>' == c && '\n' == prev_c)
+ {
+ int ThisSeqLength = Pos - SeqStart;
+ if (0 == SeqCount)
+ ;
+ else if (1 == SeqCount)
+ SeqLength = ThisSeqLength;
+ else if (SeqCount > 1)
+ {
+ if (SeqLength != ThisSeqLength)
+ Quit("ReadAFA: %u seqs, sequence lengths differ (a) %d %d",
+ SeqCount, SeqLength, ThisSeqLength);
+ }
+ ++SeqCount;
+ SeqStart = Pos;
+ InLabel = true;
+ }
+ else if (!isspace(c))
+ {
+ if (Pos >= BufferSize)
+ Quit("ReadAFA: buffer too small");
+ Buffer[Pos++] = (c);
+ }
+ }
+ prev_c = c;
+ }
+
+ int ThisSeqLength = Pos - SeqStart;
+ if (1 == SeqCount)
+ SeqLength = ThisSeqLength;
+ else
+ {
+ if (SeqLength != ThisSeqLength)
+ Quit("ReadAFA: %u seqs, sequence lengths differ (b) %d %d",
+ SeqCount, SeqLength, ThisSeqLength);
+ }
+
+ *ptrSeqCount = SeqCount;
+ *ptrSeqLength = SeqLength;
+
+ if (Pos != SeqCount*SeqLength)
+ Quit("ReadAFA: Internal error");
+ return Buffer;
+ }
+
+char *ReadAFA(const char FileName[], int *ptrSeqLength, int *ptrSeqCount)
+ {
+ FILE *f = OpenStdioFile(FileName);
+ char *Seqs = ReadAFA(f, ptrSeqLength, ptrSeqCount);
+ fclose(f);
+ return Seqs;
+ }
diff --git a/readhits.cpp b/readhits.cpp
new file mode 100755
index 0000000..9d6356a
--- /dev/null
+++ b/readhits.cpp
@@ -0,0 +1,121 @@
+#include "piler2.h"
+
+// Destructive: pokes Rec.Attrs
+static char *GetTarget(GFFRecord &Rec, int *ptrStart, int *ptrEnd)
+ {
+ char *Attrs = Rec.Attrs;
+ char *SemiColon = strchr(Attrs, ';');
+ if (0 != SemiColon)
+ *SemiColon = 0;
+ char *Space = strchr(Attrs, ' ');
+ if (0 == Space)
+ Quit("GFF hit record line %d, missing space in attrs", GFFLineNr);
+ *Space = 0;
+ if (0 != strcmp(Attrs, "Target"))
+ Quit("GFF hit record line %d, expected Target as first attr", GFFLineNr);
+ char *Label = Space + 1;
+ Space = strchr(Label, ' ');
+ if (0 == Space)
+ Quit("GFF hit record line %d, missing space following Target label", GFFLineNr);
+ *Space = 0;
+ char *Start = Space + 1;
+ Space = strchr(Start, ' ');
+ if (0 == Space)
+ Quit("GFF hit record line %d, missing space following Target start", GFFLineNr);
+ *Space = 0;
+ char *End = Space + 1;
+ *ptrStart = atoi(Start);
+ *ptrEnd = atoi(End);
+ return Label;
+ }
+
+static int ReadHitsPass1(FILE *f)
+ {
+ GFFLineNr = 0;
+ int HitCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "hit"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ int TargetStart;
+ int TargetEnd;
+ const char *TargetLabel = GetTarget(Rec, &TargetStart, &TargetEnd);
+ if (TargetStart <= 0 || TargetEnd <= 0 || TargetStart > TargetEnd)
+ Warning("GFF line %d: invalid target start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ ++HitCount;
+ AddContigPos(Rec.SeqName, Rec.End - 1);
+ AddContigPos(TargetLabel, TargetEnd - 1);
+ }
+ return HitCount;
+ }
+
+static int ReadHitsPass2(FILE *f, HitData *Hits)
+ {
+ rewind(f);
+
+ GFFLineNr = 0;
+ int HitCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "hit"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ int TargetStart;
+ int TargetEnd;
+ const char *TargetLabel = GetTarget(Rec, &TargetStart, &TargetEnd);
+ if (TargetStart <= 0 || TargetEnd <= 0 || TargetStart > TargetEnd)
+ Warning("GFF line %d: invalid target start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ HitData &Hit = Hits[HitCount];
+
+ const int QueryFrom = ContigToGlobal(Rec.Start, Rec.SeqName);
+ const int TargetFrom = ContigToGlobal(TargetStart, TargetLabel);
+
+ const int QueryLength = Rec.End - Rec.Start + 1;
+ const int TargetLength = TargetEnd - TargetStart + 1;
+
+ Hit.QueryFrom = QueryFrom;
+ Hit.QueryTo = QueryFrom + QueryLength - 1;
+ Hit.TargetFrom = TargetFrom;
+ Hit.TargetTo = TargetFrom + TargetLength - 1;
+
+ if (Rec.Strand == '+')
+ Hit.Rev = false;
+ else if (Rec.Strand == '-')
+ Hit.Rev = true;
+ else
+ Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand);
+
+ ++HitCount;
+ }
+ return HitCount;
+ }
+
+HitData *ReadHits(const char *FileName, int *ptrHitCount, int *ptrSeqLength)
+ {
+ FILE *f = OpenStdioFile(FileName);
+
+ int HitCount = ReadHitsPass1(f);
+ int SeqLength = GlobalizeContigs();
+ MakeContigMap();
+
+ HitData *Hits = all(HitData, HitCount);
+ ReadHitsPass2(f, Hits);
+ fclose(f);
+
+ *ptrSeqLength = SeqLength;
+ *ptrHitCount = HitCount;
+ return Hits;
+ }
diff --git a/readmfa.cpp b/readmfa.cpp
new file mode 100755
index 0000000..4a3d753
--- /dev/null
+++ b/readmfa.cpp
@@ -0,0 +1,98 @@
+#include "piler2.h"
+
+#define APPEND_CHAR(c) \
+ { \
+ if (Pos >= BufferSize) \
+ Quit("ReadMFA: buffer too small"); \
+ Buffer[Pos++] = (c); \
+ }
+
+#define APPEND_LABEL(c) \
+ { \
+ if (LabelLength >= LabelBufferLength-1) \
+ { \
+ LabelBufferLength += 128; \
+ char *NewLabel = all(char, LabelBufferLength); \
+ memcpy(NewLabel, Label, LabelLength); \
+ freemem(Label); \
+ Label = NewLabel; \
+ } \
+ Label[LabelLength] = c; \
+ ++LabelLength; \
+ }
+
+char *ReadMFA(FILE *f, int *ptrLength)
+ {
+ rewind(f);
+ int FileSize = GetFileSize(f);
+ int BufferSize = FileSize;
+ char *Buffer = all(char, BufferSize);
+
+ char prev_c = '\n';
+ bool InLabel = false;
+ int ContigFrom = 0;
+ char *Label = 0;
+ int LabelLength = 0;
+ int LabelBufferLength = 0;
+ int Pos = 0;
+ int ContigStart = 0;
+
+ for (;;)
+ {
+ int c = fgetc(f);
+ if (EOF == c)
+ {
+ if (feof(f))
+ break;
+ Quit("Stream error");
+ }
+ if (InLabel)
+ {
+ if (c == '\r')
+ continue;
+ if ('\n' == c)
+ {
+ Label[LabelLength] = 0;
+ InLabel = false;
+ }
+ else
+ {
+ APPEND_LABEL(c)
+ }
+ }
+ else
+ {
+ if ('>' == c && '\n' == prev_c)
+ {
+ int ContigLength = Pos - ContigStart;
+ if (ContigLength > 0)
+ AddContig(Label, ContigStart, ContigLength);
+
+ ContigStart = Pos;
+ InLabel = true;
+ LabelLength = 0;
+ }
+ else if (!isspace(c))
+ {
+ APPEND_CHAR(c)
+ }
+ }
+ prev_c = c;
+ }
+
+ int ContigLength = Pos - ContigStart;
+ if (ContigLength > 0)
+ AddContig(Label, ContigStart, ContigLength);
+
+ *ptrLength = Pos;
+ SetSeqLength(Pos);
+ return Buffer;
+ }
+
+char *ReadMFA(const char FileName[], int *ptrSeqLength)
+ {
+ FILE *f = OpenStdioFile(FileName);
+ char *Seq = ReadMFA(f, ptrSeqLength);
+ fclose(f);
+ return Seq;
+ }
diff --git a/readmotif.cpp b/readmotif.cpp
new file mode 100755
index 0000000..a662c60
--- /dev/null
+++ b/readmotif.cpp
@@ -0,0 +1,74 @@
+#include "piler2.h"
+
+static int GetFam(GFFRecord &Rec)
+ {
+ char *Attrs = Rec.Attrs;
+ char *Pyr = strstr(Attrs, "Pyramid");
+ if (0 == Pyr)
+ Quit("GFF tandemmotif record line %d, failed to find Pyramid attr", GFFLineNr);
+
+// Pyramid 123
+// 012345678
+ return atoi(Pyr + 8);
+ }
+
+static int ReadMotifPass1(FILE *f)
+ {
+ GFFLineNr = 0;
+ int MotifCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "tandemmotif"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+ ++MotifCount;
+ }
+ return MotifCount;
+ }
+
+static int ReadMotifPass2(FILE *f, MotifData *Motifs)
+ {
+ rewind(f);
+
+ GFFLineNr = 0;
+ int MotifCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "tandemmotif"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ int FamIndex = GetFam(Rec);
+
+ MotifData &Motif = Motifs[MotifCount];
+
+ const int Length = Rec.End - Rec.Start + 1;
+
+ Motif.ContigLabel = strsave(Rec.SeqName);
+ Motif.ContigFrom = Rec.Start - 1;
+ Motif.ContigTo = Motif.ContigFrom + Length - 1;
+ Motif.FamIndex = FamIndex;
+
+ ++MotifCount;
+ }
+ return MotifCount;
+ }
+
+MotifData *ReadMotif(const char *FileName, int *ptrMotifCount)
+ {
+ FILE *f = OpenStdioFile(FileName);
+
+ int MotifCount = ReadMotifPass1(f);
+ MotifData *Motifs = all(MotifData, MotifCount);
+ ReadMotifPass2(f, Motifs);
+ fclose(f);
+
+ *ptrMotifCount = MotifCount;
+ return Motifs;
+ }
diff --git a/readreps.cpp b/readreps.cpp
new file mode 100755
index 0000000..113b7e3
--- /dev/null
+++ b/readreps.cpp
@@ -0,0 +1,126 @@
+#include "piler2.h"
+
+// Destructive: pokes Rec.Attrs
+static char *GetRepeat(GFFRecord &Rec)
+ {
+ char *Attrs = Rec.Attrs;
+ char *Start = strstr(Attrs, "Repeat");
+ if (0 == Start)
+ Quit("GFF line %d, repeat record does not have Repeat attribute", GFFLineNr);
+
+ char *SemiColon = strchr(Start, ';');
+ if (0 != SemiColon)
+ *SemiColon = 0;
+ char *Space = strchr(Start, ' ');
+ if (0 == Space)
+ Quit("GFF repeat record line %d, missing space in attrs", GFFLineNr);
+ return Space + 1;
+ }
+
+// "Repeat HETRP_DM Satellite 1518 1669 0"
+// 0 1 2 3 4
+void ParseRepeat(char *Str, RepData &Rep)
+ {
+ char *Fields[5];
+ int FieldCount = GetFields(Str, Fields, 5, ' ');
+ if (FieldCount != 5)
+ Quit("GFF line %d, Repeat attribute does not have 5 fields");
+
+ const char *RepeatName = Fields[0];
+ const char *RepeatClass = Fields[1];
+ const char *RepeatFrom = Fields[2];
+ const char *RepeatTo = Fields[3];
+ const char *RepeatLeft = Fields[4];
+
+ Rep.RepeatName = strsave(RepeatName);
+ Rep.RepeatClass = strsave(RepeatClass);
+ if (0 == strcmp(RepeatFrom, "."))
+ {
+ Rep.RepeatFrom = -1;
+ Rep.RepeatTo = -1;
+ Rep.RepeatLeft = -1;
+ }
+ else
+ {
+ Rep.RepeatFrom = atoi(RepeatFrom) - 1;
+ Rep.RepeatTo = atoi(RepeatTo) - 1;
+ Rep.RepeatLeft = atoi(RepeatLeft);
+ }
+ }
+
+static int ReadRepsPass1(FILE *f)
+ {
+ GFFLineNr = 0;
+ int RepCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "repeat"))
+ {
+ static bool WarningIssued = false;
+ if (!WarningIssued)
+ {
+ Warning("GFF record feature '%s' is not a repeat", Rec.Feature);
+ WarningIssued = true;
+ }
+ continue;
+ }
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+ ++RepCount;
+ }
+ return RepCount;
+ }
+
+static int ReadRepsPass2(FILE *f, RepData *Reps)
+ {
+ rewind(f);
+
+ GFFLineNr = 0;
+ int RepCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "repeat"))
+ continue;
+
+ static char *Repeat = GetRepeat(Rec);
+
+ RepData &Rep = Reps[RepCount];
+ ParseRepeat(Repeat, Rep);
+
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ const int Length = Rec.End - Rec.Start + 1;
+
+ Rep.ContigLabel = strsave(Rec.SeqName);
+ Rep.ContigFrom = Rec.Start - 1;
+ Rep.ContigTo = Rep.ContigFrom + Length - 1;
+
+ if (Rec.Strand == '+')
+ Rep.Rev = false;
+ else if (Rec.Strand == '-')
+ Rep.Rev = true;
+ else
+ Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand);
+
+ ++RepCount;
+ }
+ return RepCount;
+ }
+
+RepData *ReadReps(const char *FileName, int *ptrRepCount)
+ {
+ FILE *f = OpenStdioFile(FileName);
+
+ int RepCount = ReadRepsPass1(f);
+ RepData *Reps = all(RepData, RepCount);
+ ReadRepsPass2(f, Reps);
+ fclose(f);
+
+ *ptrRepCount = RepCount;
+ return Reps;
+ }
diff --git a/readtrs.cpp b/readtrs.cpp
new file mode 100755
index 0000000..d097340
--- /dev/null
+++ b/readtrs.cpp
@@ -0,0 +1,94 @@
+#include "piler2.h"
+
+// Destructive: pokes Rec.Attrs
+static char *GetFam(GFFRecord &Rec)
+ {
+ char *Attrs = Rec.Attrs;
+ char *SemiColon = strchr(Attrs, ';');
+ if (0 != SemiColon)
+ *SemiColon = 0;
+ char *Space = strchr(Attrs, ' ');
+ if (0 == Space)
+ Quit("GFF trs record line %d, missing space in attrs", GFFLineNr);
+ *Space = 0;
+ if (0 != strcmp(Attrs, "Family"))
+ Quit("GFF trs record line %d, expected Family as first attr", GFFLineNr);
+ return Space + 1;
+ }
+
+static int ReadTRSPass1(FILE *f)
+ {
+ GFFLineNr = 0;
+ int TRSCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "trs"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+ ++TRSCount;
+ }
+ return TRSCount;
+ }
+
+static int ReadTRSPass2(FILE *f, TRSData *TRSs)
+ {
+ rewind(f);
+
+ GFFLineNr = 0;
+ int TRSCount = 0;
+ GFFRecord Rec;
+ while (GetNextGFFRecord(f, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "trs"))
+ continue;
+ if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
+ Warning("GFF line %d: invalid start %d / end %d",
+ GFFLineNr, Rec.Start, Rec.End);
+
+ static char *Fam = GetFam(Rec);
+
+ int FamIndex = 0;
+ int SuperFamIndex = 0;
+ int n = sscanf(Fam, "%d.%d", &FamIndex, &SuperFamIndex);
+ if (n == 1)
+ SuperFamIndex = -1;
+ else if (n != 2)
+ Quit("Invalid Family %s", Fam);
+
+ TRSData &TRS = TRSs[TRSCount];
+
+ const int Length = Rec.End - Rec.Start + 1;
+
+ TRS.ContigLabel = strsave(Rec.SeqName);
+ TRS.ContigFrom = Rec.Start - 1;
+ TRS.ContigTo = TRS.ContigFrom + Length - 1;
+ TRS.FamIndex = FamIndex;
+ TRS.SuperFamIndex = SuperFamIndex;
+
+ if (Rec.Strand == '+')
+ TRS.Rev = false;
+ else if (Rec.Strand == '-')
+ TRS.Rev = true;
+ else
+ Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand);
+
+ ++TRSCount;
+ }
+ return TRSCount;
+ }
+
+TRSData *ReadTRS(const char *FileName, int *ptrTRSCount)
+ {
+ FILE *f = OpenStdioFile(FileName);
+
+ int TRSCount = ReadTRSPass1(f);
+ TRSData *TRSs = all(TRSData, TRSCount);
+ ReadTRSPass2(f, TRSs);
+ fclose(f);
+
+ *ptrTRSCount = TRSCount;
+ return TRSs;
+ }
diff --git a/tan.cpp b/tan.cpp
new file mode 100755
index 0000000..72ca8c9
--- /dev/null
+++ b/tan.cpp
@@ -0,0 +1,503 @@
+#include "piler2.h"
+
+#define GFFRecord GFFRecord2
+
+static int MIN_HIT_COUNT = 2;
+static double MAX_FRACT_MARGIN = 0.05;
+static double MIN_RATIO = 0.5;
+
+struct TanPile
+ {
+ char *Label;
+ int From;
+ int To;
+ HitData *Hits;
+ int HitCount;
+ };
+static TanPile *Piles;
+static int PileCount;
+static int PileBufferSize;
+static int PyramidIndex;
+FILE *fOut;
+FILE *fPyramid;
+FILE *fMotif;
+
+static inline int min3(int i, int j, int k)
+ {
+ return min(i, min(j, k));
+ }
+
+static inline int max3(int i, int j, int k)
+ {
+ return max(i, max(j, k));
+ }
+
+static void TouchPiles(int PileIndex)
+ {
+ if (PileIndex < PileBufferSize)
+ {
+ if (PileIndex >= PileCount)
+ PileCount = PileIndex + 1;
+ return;
+ }
+
+ int NewPileBufferSize = PileBufferSize + 10000;
+ TanPile *NewPiles = all(TanPile, NewPileBufferSize);
+ zero(NewPiles, TanPile, NewPileBufferSize);
+ memcpy(NewPiles, Piles, PileCount*sizeof(TanPile));
+ Piles = NewPiles;
+ PileBufferSize = NewPileBufferSize;
+ PileCount = PileIndex + 1;
+ }
+
+static void AssignHit(int PileIndex, const char *Label, int QueryFrom, int QueryTo,
+ int TargetFrom, int TargetTo, bool Rev)
+ {
+ TanPile &Pile = Piles[PileIndex];
+
+ int HitIndex = Pile.HitCount;
+ HitData &Hit = Pile.Hits[HitIndex];
+
+ Hit.QueryFrom = QueryFrom;
+ Hit.QueryTo = QueryTo;
+ Hit.TargetFrom = TargetFrom;
+ Hit.TargetTo = TargetTo;
+ Hit.Rev = Rev;
+
+ ++(Pile.HitCount);
+ }
+
+static void AddHit(int PileIndex, const char *Label, int QueryFrom, int QueryTo,
+ int TargetFrom, int TargetTo, bool Rev)
+ {
+ TouchPiles(PileIndex);
+
+ TanPile &Pile = Piles[PileIndex];
+ if (0 == Pile.HitCount)
+ {
+ Pile.Label = strsave(Label);
+ Pile.From = min(QueryFrom, TargetFrom);
+ Pile.To = max(QueryTo, TargetTo);
+ }
+ else
+ {
+ if (0 != strcmp(Label, Pile.Label))
+ Quit("Labels disagree");
+ Pile.From = min3(Pile.From, QueryFrom, TargetFrom);
+ Pile.To = max3(Pile.To, QueryTo, TargetTo);
+ }
+ ++(Pile.HitCount);
+ }
+
+bool TandemPair(const HitData &Hit1, const HitData &Hit2)
+ {
+ int Length1 = (Hit1.QueryTo - Hit1.QueryFrom + Hit1.TargetTo- Hit1.TargetFrom)/2;
+ int Length2 = (Hit2.QueryTo - Hit2.QueryFrom + Hit2.TargetTo - Hit2.TargetFrom)/2;
+ int ShorterLength = min(Length1, Length2);
+ int LongerLength = max(Length1, Length2);
+ if ((double) ShorterLength / (double) LongerLength < MIN_RATIO)
+ return false;
+
+ int StartDist = iabs(Hit1.TargetFrom - Hit2.TargetFrom);
+ int EndDist = iabs(Hit1.QueryTo - Hit2.QueryTo);
+
+ double StartMargin = (double) StartDist / (double) ShorterLength;
+ double EndMargin = (double) EndDist / (double) ShorterLength;
+ return StartMargin <= MAX_FRACT_MARGIN && EndMargin <= MAX_FRACT_MARGIN;
+ }
+
+static void WritePyramid(int PyramidIndex, const char *Label, int From, int To)
+ {
+ if (0 == fPyramid)
+ return;
+
+ GFFRecord Rec;
+ Rec.Source = "piler";
+ Rec.Feature = "pyramid";
+ Rec.Score = 0;
+ Rec.Frame = -1;
+ Rec.Strand = '.';
+ Rec.SeqName = Label;
+ Rec.Start = From + 1;
+ Rec.End = To + 1;
+
+ char s[32];
+ sprintf(s, "PyramidIndex %d", PyramidIndex);
+ Rec.Attrs = s;
+
+ WriteGFFRecord(fPyramid, Rec);
+ }
+
+static void InferMotif(int PyramidIndex, const TanPile &Pile, FamData *Fam)
+ {
+ const int MAX_PAIRS = 1024;
+ int DiagDists[MAX_PAIRS];
+ int PairCount = 0;
+ int MinDiagDist = -1;
+ for (PtrFamData q1 = Fam->begin(); q1 != Fam->end(); ++q1)
+ {
+ FamMemberData &FamMember1 = *q1;
+ int HitIndex1 = FamMember1.PileIndex;
+ if (HitIndex1 < 0 || HitIndex1 >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit1 = Pile.Hits[HitIndex1];
+ int Diag1Start = Hit1.TargetFrom - Hit1.QueryFrom;
+ int Diag1End = Hit1.TargetTo - Hit1.QueryTo;
+ int Diag1 = (Diag1Start + Diag1End)/2;
+
+ PtrFamData q2 = q1;
+ for (++q2; q2 != Fam->end(); ++q2)
+ {
+ FamMemberData &FamMember2 = *q2;
+ int HitIndex2 = FamMember2.PileIndex;
+ if (HitIndex2 < 0 || HitIndex2 >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit2 = Pile.Hits[HitIndex2];
+
+ int Diag2Start = Hit2.TargetFrom - Hit2.QueryFrom;
+ int Diag2End = Hit2.TargetTo - Hit2.QueryTo;
+ int Diag2 = (Diag2Start + Diag2End)/2;
+
+ int DiagDist = iabs(Diag1 - Diag2);
+ if (-1 == MinDiagDist || DiagDist < MinDiagDist)
+ MinDiagDist = DiagDist;
+ DiagDists[PairCount++] = DiagDist;
+ if (PairCount == MAX_PAIRS)
+ goto Done;
+ }
+ }
+Done:
+// Find all dists that are no more than 10% bigger than MinDist.
+// The average of these distances is the estimated repeat length.
+ int Count = 0;
+ int Sum = 0;
+ int MaxCandidateDist = (MinDiagDist*110)/100;
+ for (int i = 0; i < PairCount; ++i)
+ {
+ const int Dist = DiagDists[i];
+ if (Dist <= MaxCandidateDist)
+ {
+ Sum += Dist;
+ ++Count;
+ }
+ }
+ if (0 == Count)
+ Quit("Huh?");
+
+ const int EstimatedRepeatLength = Sum/Count;
+ Log("\n");
+ Log("Pyramid %d: min dist = %d ", PyramidIndex, MinDiagDist);
+ Log(" Count=%d Sum=%d Estd=%d\n", Count, Sum, EstimatedRepeatLength);
+
+ GFFRecord Rec;
+ Rec.Source = "piler";
+ Rec.Feature = "tandemmotif";
+ Rec.Score = 0;
+ Rec.Frame = -1;
+ Rec.SeqName = Pile.Label;
+ Rec.Strand = '.';
+
+// By definition, a Pyramid is set of hits for which QueryTo
+// and TargetFrom values are approximately equal.
+// We arbitrarily choose the canonical motif to end
+// at QueryTo. We output this motif and the Target motifs
+// to which it aligns.
+ FamMemberData &FamMember = *(Fam->begin());
+ int HitIndex = FamMember.PileIndex;
+ if (HitIndex < 0 || HitIndex >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit = Pile.Hits[HitIndex];
+
+ const int MotifQueryTo = Hit.QueryTo;
+ const int MotifQueryFrom = MotifQueryTo - EstimatedRepeatLength + 1;
+ const int TargetTo = Hit.TargetTo;
+ const int TargetFrom = TargetTo - EstimatedRepeatLength + 1;
+
+ char s[1024];
+ sprintf(s, "Target %s %d %d ; Pyramid %d",
+ Pile.Label,
+ TargetFrom + 1,
+ TargetTo + 1,
+ PyramidIndex);
+
+ Rec.Start = MotifQueryFrom + 1;
+ Rec.End = MotifQueryTo + 1;
+ Rec.Attrs = s;
+
+ WriteGFFRecord(fMotif, Rec);
+
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int HitIndex = FamMember.PileIndex;
+ if (HitIndex < 0 || HitIndex >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit = Pile.Hits[HitIndex];
+
+ const int To = Hit.TargetTo;
+ const int From = To - EstimatedRepeatLength + 1;
+
+ char s[1024];
+ sprintf(s, "Target %s %d %d ; Pyramid %d",
+ Pile.Label,
+ MotifQueryFrom + 1,
+ MotifQueryTo + 1,
+ PyramidIndex);
+
+ Rec.Start = From + 1;
+ Rec.End = To + 1;
+ Rec.Attrs = s;
+
+ WriteGFFRecord(fMotif, Rec);
+ }
+ }
+
+static void LogPyramid(int PyramidIndex, const TanPile &Pile, FamData *Fam)
+ {
+ Log("\n");
+ Log("Pyramid %d\n", PyramidIndex);
+
+ Log(
+" Label QStart1 QEnd1 TStart1 TEnd1 QStart1 QEnd1 TStart1 TEnd1 Diag Start End\n");
+ Log(
+"---------- -------- -------- -------- -------- -------- -------- -------- -------- ------ ------ ------\n");
+ for (PtrFamData q1 = Fam->begin(); q1 != Fam->end(); ++q1)
+ {
+ FamMemberData &FamMember1 = *q1;
+ int HitIndex1 = FamMember1.PileIndex;
+ if (HitIndex1 < 0 || HitIndex1 >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit1 = Pile.Hits[HitIndex1];
+ int Diag1Start = Hit1.TargetFrom - Hit1.QueryFrom;
+ int Diag1End = Hit1.TargetTo - Hit1.QueryTo;
+ int Diag1 = (Diag1Start + Diag1End)/2;
+
+ PtrFamData q2 = q1;
+ for (++q2; q2 != Fam->end(); ++q2)
+ {
+ FamMemberData &FamMember2 = *q2;
+ int HitIndex2 = FamMember2.PileIndex;
+ if (HitIndex2 < 0 || HitIndex2 >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit2 = Pile.Hits[HitIndex2];
+
+ int Diag2Start = Hit2.TargetFrom - Hit2.QueryFrom;
+ int Diag2End = Hit2.TargetTo - Hit2.QueryTo;
+ int Diag2 = (Diag2Start + Diag2End)/2;
+
+ int DiagDist = iabs(Diag1 - Diag2);
+
+ int StartDist = iabs(Hit1.TargetFrom - Hit2.TargetFrom);
+ int EndDist = iabs(Hit1.QueryTo - Hit2.QueryTo);
+ Log("%10.10s %8d %8d %8d %8d %8d %8d %8d %8d %6d",
+ Pile.Label,
+ Hit1.QueryFrom,
+ Hit1.QueryTo,
+ Hit1.TargetFrom,
+ Hit1.TargetTo,
+ Hit2.QueryFrom,
+ Hit2.QueryTo,
+ Hit2.TargetFrom,
+ Hit2.TargetTo,
+ DiagDist);
+ Log(" %6d", StartDist);
+ Log(" %6d", EndDist);
+ Log("\n");
+ }
+ }
+ }
+
+static void FindPyramids(int PileIndex)
+ {
+ const TanPile &Pile = Piles[PileIndex];
+ const int HitCount = Pile.HitCount;
+ if (HitCount < MIN_HIT_COUNT)
+ return;
+
+ EdgeList Edges;
+ EdgeData Edge;
+ Edge.Rev = false;
+ const HitData *Hits = Pile.Hits;
+ for (int HitIndex1 = 0; HitIndex1 < HitCount; ++HitIndex1)
+ {
+ const HitData &Hit1 = Hits[HitIndex1];
+ Edge.Node1 = HitIndex1;
+ for (int HitIndex2 = 0; HitIndex2 < HitIndex1; ++HitIndex2)
+ {
+ const HitData &Hit2 = Hits[HitIndex2];
+ if (TandemPair(Hit1, Hit2))
+ {
+ Edge.Node2 = HitIndex2;
+ Edges.push_back(Edge);
+ }
+ }
+ }
+
+ FamList Fams;
+ FindConnectedComponents(Edges, Fams, MIN_HIT_COUNT);
+
+ if (0 == Fams.size())
+ return;
+
+ GFFRecord Rec;
+ Rec.Source = "piler";
+ Rec.Feature = "hit";
+ Rec.Score = 0;
+ Rec.Frame = -1;
+ Rec.SeqName = Pile.Label;
+
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ if (0 != fMotif)
+ InferMotif(PyramidIndex, Pile, Fam);
+ LogPyramid(PyramidIndex, Pile, Fam);
+
+ int PyramidFrom = -1;
+ int PyramidTo = -1;
+
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int HitIndex = FamMember.PileIndex;
+ if (HitIndex < 0 || HitIndex >= Pile.HitCount)
+ Quit("Hit index out of range");
+ const HitData &Hit = Pile.Hits[HitIndex];
+
+ char s[1024];
+ sprintf(s, "Target %s %d %d ; Pile %d ; Pyramid %d",
+ Pile.Label,
+ Hit.TargetFrom + 1,
+ Hit.TargetTo + 1,
+ PileIndex,
+ PyramidIndex);
+
+ Rec.Strand = (Hit.Rev ? '-' : '+');
+ Rec.Start = Hit.QueryFrom + 1;
+ Rec.End = Hit.QueryTo + 1;
+ Rec.Attrs = s;
+
+ WriteGFFRecord(fOut, Rec);
+
+ if (PyramidFrom == -1)
+ PyramidFrom = min(Hit.QueryFrom, Hit.TargetFrom);
+ else
+ PyramidFrom = min3(PyramidFrom, Hit.QueryFrom, Hit.TargetFrom);
+
+ if (PyramidTo == -1)
+ PyramidTo = max(Hit.QueryTo, Hit.TargetTo);
+ else
+ PyramidTo = max3(PyramidTo, Hit.QueryTo, Hit.TargetTo);
+ }
+ WritePyramid(PyramidIndex, Pile.Label, PyramidFrom, PyramidTo);
+ ++PyramidIndex;
+ }
+ }
+
+void Tan()
+ {
+// Image file annotated with from-to pile indexes
+// Produced by:
+// piler2 -trs banded_hits.gff -images mainband_images.gff
+ const char *HitFileName = RequiredValueOpt("tan");
+ const char *OutFileName = RequiredValueOpt("out");
+ const char *PyramidFileName = ValueOpt("pyramid");
+ const char *MotifFileName = ValueOpt("motif");
+ const char *strMinHits = ValueOpt("minhits");
+ const char *strMaxMargin = ValueOpt("maxmargin");
+ const char *strMinRatio = ValueOpt("minratio");
+
+ if (0 != strMinHits)
+ MIN_HIT_COUNT = atoi(strMinHits);
+ if (0 != strMaxMargin)
+ MAX_FRACT_MARGIN = atof(strMaxMargin);
+ if (0 != strMinRatio)
+ MIN_RATIO = atof(strMinRatio);
+
+ FILE *fInput = OpenStdioFile(HitFileName);
+
+ ProgressStart("Initialize piles");
+ GFFRecord Rec;
+ int HitCount = 0;
+ while (GetNextGFFRecord(fInput, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "hit"))
+ continue;
+
+ int QueryPileIndex = -1;
+ int TargetPileIndex = -1;
+ ParsePilesAttrs(Rec.Attrs, &QueryPileIndex, &TargetPileIndex);
+ if (QueryPileIndex != TargetPileIndex)
+ continue;
+
+ char TargetLabel[128];
+ int TargetStart;
+ int TargetEnd;
+ ParseTargetAttrs(Rec.Attrs, TargetLabel, sizeof(TargetLabel), &TargetStart, &TargetEnd);
+ if (0 != strcmp(Rec.SeqName, TargetLabel))
+ Quit("Labels don't match");
+
+ const int QueryFrom = Rec.Start - 1;
+ const int QueryTo = Rec.End - 1;
+ const int TargetFrom = TargetStart - 1;
+ const int TargetTo = TargetEnd - 1;
+ const bool Rev = (Rec.Strand == '-');
+
+ AddHit(QueryPileIndex, Rec.SeqName, QueryFrom, QueryTo, TargetFrom, TargetTo, Rev);
+ ++HitCount;
+ }
+ ProgressDone();
+
+ Progress("%d hits, %d piles", HitCount, PileCount);
+
+ ProgressStart("Allocate piles");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ TanPile &Pile = Piles[PileIndex];
+ Pile.Hits = all(HitData, Pile.HitCount);
+ Pile.HitCount = 0;
+ }
+ ProgressDone();
+
+ ProgressStart("Assign hits to piles");
+ Rewind(fInput);
+ while (GetNextGFFRecord(fInput, Rec))
+ {
+ if (0 != strcmp(Rec.Feature, "hit"))
+ continue;
+
+ int QueryPileIndex = -1;
+ int TargetPileIndex = -1;
+ ParsePilesAttrs(Rec.Attrs, &QueryPileIndex, &TargetPileIndex);
+ if (QueryPileIndex != TargetPileIndex)
+ continue;
+
+ char TargetLabel[128];
+ int TargetStart;
+ int TargetEnd;
+ ParseTargetAttrs(Rec.Attrs, TargetLabel, sizeof(TargetLabel), &TargetStart, &TargetEnd);
+ if (0 != strcmp(Rec.SeqName, TargetLabel))
+ Quit("Labels don't match");
+
+ const int QueryFrom = Rec.Start - 1;
+ const int QueryTo = Rec.End - 1;
+ const int TargetFrom = TargetStart - 1;
+ const int TargetTo = TargetEnd - 1;
+ const bool Rev = (Rec.Strand == '-');
+
+ AssignHit(QueryPileIndex, Rec.SeqName, QueryFrom, QueryTo, TargetFrom, TargetTo, Rev);
+ }
+ ProgressDone();
+
+ fOut = OpenStdioFile(OutFileName, FILEIO_MODE_WriteOnly);
+ fPyramid = (0 == PyramidFileName ? 0 : OpenStdioFile(PyramidFileName, FILEIO_MODE_WriteOnly));
+ fMotif = (0 == PyramidFileName ? 0 : OpenStdioFile(MotifFileName, FILEIO_MODE_WriteOnly));
+
+ ProgressStart("Find pyramids");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ FindPyramids(PileIndex);
+ int PyramidCount = PyramidIndex;
+ ProgressDone();
+
+ Progress("%d pyramids", PyramidCount);
+ }
diff --git a/tanmotif2fasta.cpp b/tanmotif2fasta.cpp
new file mode 100755
index 0000000..848c23d
--- /dev/null
+++ b/tanmotif2fasta.cpp
@@ -0,0 +1,114 @@
+#include "piler2.h"
+
+static void LogRecords(const MotifData *Motifs, int MotifCount)
+ {
+ Log(" Contig From To Length Family\n");
+ Log("-------------------- ---------- ---------- ------ ------\n");
+ for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex)
+ {
+ const MotifData &Motif = Motifs[MotifIndex];
+ Log("%20.20s %10d %10d %6d %d\n",
+ Motif.ContigLabel,
+ Motif.ContigFrom,
+ Motif.ContigTo,
+ Motif.ContigTo - Motif.ContigFrom + 1,
+ Motif.FamIndex);
+ }
+ }
+
+static int CmpMotif(const void *s1, const void *s2)
+ {
+ const MotifData *Motif1 = (const MotifData *) s1;
+ const MotifData *Motif2 = (const MotifData *) s2;
+ return Motif1->FamIndex - Motif2->FamIndex;
+ }
+
+static char *FamFileName(const char *Path, int Fam)
+ {
+ static char FileName[256];
+ char s[128];
+ sprintf(s, "/%d", Fam);
+ if (strlen(Path) + strlen(s) + 3 >= sizeof(FileName))
+ Quit("Path name too long");
+ strcpy(FileName, Path);
+ strcat(FileName, s);
+ fprintf(stderr, "\n\n*** FILE NAME *** '%s'\n", FileName);
+ return FileName;
+ }
+
+static char *MotifLabel(const char *Prefix, const MotifData &Motif)
+ {
+ int n = (int) strlen(Motif.ContigLabel) + 128;
+ char *s = all(char, n);
+ if (0 != Prefix)
+ sprintf(s, "%s.%d %s:%d",
+ Prefix,
+ Motif.FamIndex,
+ Motif.ContigLabel,
+ Motif.ContigFrom+1);
+ else
+ sprintf(s, "%d %s:%d",
+ Motif.FamIndex,
+ Motif.ContigLabel,
+ Motif.ContigFrom+1);
+ return s;
+ }
+
+void Tanmotif2Fasta()
+ {
+ const char *MotifFileName = RequiredValueOpt("tanmotif2fasta");
+ const char *SeqFileName = RequiredValueOpt("seq");
+ const char *Path = ValueOpt("path");
+ const char *strMaxFam = ValueOpt("maxfam");
+ const char *Prefix = ValueOpt("prefix");
+
+ int MaxFam = DEFAULT_MAX_FAM;
+ if (strMaxFam != 0)
+ MaxFam = atoi(strMaxFam);
+
+ if (0 == Path)
+ Path = ".";
+
+ ProgressStart("Reading seq file");
+ int SeqLength;
+ const char *Seq = ReadMFA(SeqFileName, &SeqLength);
+ ProgressDone();
+
+ Progress("Seq length %d bases, %.3g Mb", SeqLength, SeqLength/1e6);
+
+ ProgressStart("Read Motif file");
+ int MotifCount;
+ MotifData *Motifs = ReadMotif(MotifFileName, &MotifCount);
+ ProgressDone();
+
+ Progress("%d records", MotifCount);
+
+ ProgressStart("Sorting by family");
+ qsort((void *) Motifs, MotifCount, sizeof(MotifData), CmpMotif);
+ ProgressDone();
+
+ FILE *f = 0;
+ int CurrentFamily = -1;
+ int MemberCount = 0;
+ for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex)
+ {
+ const MotifData &Motif = Motifs[MotifIndex];
+ if (Motif.FamIndex != CurrentFamily)
+ {
+ if (f != 0)
+ fclose(f);
+ char *FastaFileName = FamFileName(Path, Motif.FamIndex);
+ f = OpenStdioFile(FastaFileName, FILEIO_MODE_WriteOnly);
+ CurrentFamily = Motif.FamIndex;
+ MemberCount = 0;
+ }
+ ++MemberCount;
+ if (MemberCount > MaxFam)
+ continue;
+ const int From = ContigToGlobal(Motif.ContigFrom, Motif.ContigLabel);
+ const int Length = Motif.ContigTo - Motif.ContigFrom + 1;
+ char *Label = MotifLabel(Prefix, Motif);
+ WriteFasta(f, Seq + From, Length, Label, false);
+ freemem(Label);
+ }
+ }
diff --git a/tr.cpp b/tr.cpp
new file mode 100755
index 0000000..e6d1690
--- /dev/null
+++ b/tr.cpp
@@ -0,0 +1,354 @@
+#include "piler2.h"
+#if defined(DEBUG) && defined(_MSC_VER)
+#include <crtdbg.h>
+#endif
+
+#define GFFRecord GFFRecord2
+
+/***
+Find LTR families.
+
+1. LTR candidates
+
+ ------===--------------------===------------ genome
+ ^----------------------^
+ Hit
+
+Hit is candidate pair of LTRs bounding a candidate LINE if:
+ (a) hit length >= MIN_LENGTH_LTR and <= MAX_LENGTH_LTR, and
+ (b) offset is >= MIN_LENGTH_LINE and <= MAX_LENGTH_LINE.
+
+2. Same family if a local alignment connects two LTR candidates
+of similar length.
+
+
+ Cand A Cand B
+ ------===........===----------===........===----- genome
+ ^------------^
+ Hit
+***/
+
+static int MIN_LENGTH_LINE = 50;
+static int MAX_LENGTH_LINE = 12000;
+static int MIN_LENGTH_LTR = 50;
+static int MAX_LENGTH_LTR = 2000;
+static double MIN_LINE_RATIO = 0.9;
+static int MIN_FAM_SIZE = 3;
+static double MIN_HIT_LENGTH_RATIO = 0.5;
+static int MIN_DIST_EDGE = 50000;
+
+static EdgeList Edges;
+
+static double Ratio(int i, int j)
+ {
+ if (i == 0 && j == 0)
+ return 0;
+ if (i < j)
+ return (double) i / (double) j;
+ else
+ return (double) j / (double) i;
+ }
+
+static int GetHitLength(const HitData &Hit)
+ {
+ const int QueryHitLength = Hit.QueryTo - Hit.QueryFrom + 1;
+ const int TargetHitLength = Hit.TargetTo - Hit.TargetFrom + 1;
+ return (QueryHitLength + TargetHitLength)/2;
+ }
+
+static int GetLTRLength(const HitData &Hit)
+ {
+ const int QueryLTRLength = Hit.QueryTo - Hit.QueryFrom + 1;
+ const int TargetLTRLength = Hit.TargetTo - Hit.TargetFrom + 1;
+ return (QueryLTRLength + TargetLTRLength)/2;
+ }
+
+static int GetHitFrom(const HitData &Hit)
+ {
+ return min(Hit.QueryFrom, Hit.TargetFrom);
+ }
+
+static int GetHitTo(const HitData &Hit)
+ {
+ return max(Hit.QueryTo, Hit.TargetTo);
+ }
+
+static int GetLINELength(const HitData &Hit)
+ {
+ return GetHitTo(Hit) - GetHitFrom(Hit) + 1;
+ }
+
+static bool IsCandLTR(const HitData &Hit)
+ {
+ const int LTRLength = GetLTRLength(Hit);
+ if (LTRLength < MIN_LENGTH_LTR || LTRLength > MAX_LENGTH_LTR)
+ return false;
+
+ const int LINELength = GetLINELength(Hit);
+ if (LINELength < MIN_LENGTH_LINE || LINELength > MAX_LENGTH_LINE)
+ return false;
+
+ return true;
+ }
+
+static HitData *Cands;
+static int CandCount = 0;
+static int CandBufferSize = 0;
+static void AddCand(const HitData &Hit, IIX &IntervalIndex)
+ {
+ if (CandCount >= CandBufferSize)
+ {
+ CandBufferSize += 10000;
+ Cands = reall(Cands, HitData, CandBufferSize);
+ }
+
+ int From;
+ int To;
+ if (Hit.QueryFrom < Hit.TargetFrom)
+ {
+ From = Hit.QueryFrom;
+ To = Hit.TargetTo;
+ }
+ else
+ {
+ From = Hit.TargetFrom;
+ To = Hit.TargetTo;
+ }
+
+ IntervalIndex.AddGlobal(From, To, CandCount);
+ Cands[CandCount++] = Hit;
+ }
+
+static bool TooClose(const GLIX &HitGlix, const HitData &Hit1, const HitData &Hit2)
+ {
+ const int Hit1From = Hit1.QueryFrom;
+ const int Hit2From = Hit2.QueryFrom;
+ int SeqFrom1;
+ int SeqFrom2;
+ const char *Label1 = HitGlix.GlobalToSeq(Hit1From, &SeqFrom1);
+ const char *Label2 = HitGlix.GlobalToSeq(Hit2From, &SeqFrom2);
+ if (0 != strcmp(Label1, Label2))
+ return false;
+ return iabs(SeqFrom2 - SeqFrom1) < MIN_DIST_EDGE;
+ }
+
+static bool IsEdge(const GLIX &HitGlix, const HitData &Hit, int i, int j)
+ {
+ if (i == j)
+ return false;
+
+ assert(i >= 0 && i < CandCount);
+ assert(j >= 0 && j < CandCount);
+
+ const HitData &Hit_i = Cands[i];
+ const HitData &Hit_j = Cands[j];
+
+ const int HitLength = GetHitLength(Hit);
+ const int HitLength_i = GetHitLength(Hit_i);
+ const int HitLength_j = GetHitLength(Hit_j);
+
+ if (Ratio(HitLength, HitLength_i) < MIN_HIT_LENGTH_RATIO ||
+ Ratio(HitLength, HitLength_j) < MIN_HIT_LENGTH_RATIO ||
+ Ratio(HitLength_i, HitLength_j) < MIN_HIT_LENGTH_RATIO)
+ return false;
+
+ const int LINELength_i = GetLINELength(Hit_i);
+ const int LINELength_j = GetLINELength(Hit_j);
+
+ if (Ratio(LINELength_i, LINELength_j) < MIN_LINE_RATIO)
+ return false;
+
+ if (TooClose(HitGlix, Hit_i, Hit_j))
+ return false;
+
+ return true;
+ }
+
+static void FindEdges(const HitData &Hit, const GLIX &HitGlix, const IIX &IntervalIndex)
+ {
+ int *QueryMatches;
+ int *TargetMatches;
+ const int QueryMatchCount = IntervalIndex.LookupGlobal(Hit.QueryFrom, Hit.QueryTo, &QueryMatches);
+ const int TargetMatchCount = IntervalIndex.LookupGlobal(Hit.TargetFrom, Hit.TargetTo, &TargetMatches);
+ const int MatchCount = QueryMatchCount + TargetMatchCount;
+ int *Matches = all(int, MatchCount);
+ memcpy(Matches, QueryMatches, QueryMatchCount*sizeof(int));
+ memcpy(Matches + QueryMatchCount, TargetMatches, TargetMatchCount*sizeof(int));
+
+ for (int i = 0; i < MatchCount; ++i)
+ {
+ const int m = Matches[i];
+ if (m < 0 || m > CandCount)
+ Quit("m=%d count=%d", m, CandCount);
+ }
+
+ for (int i = 0; i < MatchCount; ++i)
+ {
+ const int CandIndex1 = Matches[i];
+ for (int j = 0; j < i; ++j)
+ {
+ const int CandIndex2 = Matches[j];
+ if (IsEdge(HitGlix, Hit, CandIndex1, CandIndex2))
+ {
+ EdgeData Edge;
+ Edge.Node1 = CandIndex1;
+ Edge.Node2 = CandIndex2;
+ Edge.Rev = Hit.Rev;
+ Edges.push_back(Edge);
+ }
+ }
+ }
+ }
+
+static void WriteOutputFile(FILE *fOut, const GLIX &HitGlix, FamList &Fams)
+ {
+ GFFRecord Rec;
+ Rec.Feature = "tr";
+ Rec.Source = "piler";
+ Rec.Score = 0;
+ Rec.Strand = '.';
+ Rec.Frame = -1;
+
+ int FamIndex = 0;
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int CandIndex = FamMember.PileIndex;
+ assert(CandIndex >= 0 && CandIndex < CandCount);
+ const HitData &Hit = Cands[CandIndex];
+
+ const int GlobalFrom = GetHitFrom(Hit);
+ const int GlobalTo = GetHitTo(Hit);
+ const int Length = GlobalTo - GlobalFrom + 1;
+
+ int SeqFrom;
+ const char *Label = HitGlix.GlobalToSeq(GlobalFrom, &SeqFrom);
+ int SeqTo = SeqFrom + Length - 1;
+
+ Rec.SeqName = Label;
+ Rec.Start = SeqFrom + 1;
+ Rec.End = SeqTo + 1;
+ Rec.Strand = (FamMember.Rev ? '-' : '+');
+
+ char Attrs[1024];
+ sprintf(Attrs, "Family %d ; Cand %d", FamIndex, CandIndex);
+
+ Rec.Attrs = Attrs;
+
+ WriteGFFRecord(fOut, Rec);
+ }
+ ++FamIndex;
+ }
+ }
+
+static void WriteCands(FILE *f, const GLIX &HitGlix)
+ {
+ GFFRecord Rec;
+ for (int CandIndex = 0; CandIndex < CandCount; ++CandIndex)
+ {
+ const HitData &Hit = Cands[CandIndex];
+ char AnnotBuffer[1024];
+ HitToGFFRecord(HitGlix, Hit, Rec, AnnotBuffer);
+ char s[4096];
+ sprintf(s, "%s ; Cand %d", Rec.Attrs, CandIndex);
+ Rec.Attrs = s;
+ WriteGFFRecord(f, Rec);
+ }
+ }
+
+void TR()
+ {
+#if defined(DEBUG) && defined(_MSC_VER)
+ _CrtSetDbgFlag(0); // too expensive
+#endif
+
+ const char *HitFileName = RequiredValueOpt("tr");
+ const char *OutFileName = RequiredValueOpt("out");
+ const char *CandFileName = ValueOpt("cand");
+
+ const char *strMinTrSpacing = ValueOpt("mintrspacing");
+ const char *strMaxTrSpacing = ValueOpt("maxtrspacing");
+ const char *strMinTrLength = ValueOpt("mintrlength");
+ const char *strMaxTrLength = ValueOpt("minspacingratio");
+ const char *strMinFam = ValueOpt("minfam");
+ const char *strMinHitRatio = ValueOpt("minhitratio");
+ const char *strMinDistPairs = ValueOpt("mindistpairs");
+
+ if (0 != strMinTrSpacing)
+ MIN_LENGTH_LINE = atoi(strMinTrSpacing);
+ if (0 != strMaxTrSpacing)
+ MAX_LENGTH_LINE = atoi(strMaxTrSpacing);
+ if (0 != strMinTrLength)
+ MIN_LENGTH_LTR = atoi(strMinTrLength);
+ if (0 != strMaxTrLength)
+ MAX_LENGTH_LTR = atoi(strMaxTrLength);
+ if (0 != strMinFam)
+ MIN_FAM_SIZE = atoi(strMinFam);
+ if (0 != strMinHitRatio)
+ MIN_HIT_LENGTH_RATIO = atoi(strMinHitRatio);
+ if (0 != strMinDistPairs)
+ MIN_DIST_EDGE = atoi(strMinDistPairs);
+
+ FILE *fHit = OpenStdioFile(HitFileName, FILEIO_MODE_ReadOnly);
+
+ ProgressStart("Index hits");
+ GLIX HitGlix;
+ HitGlix.Init();
+ HitGlix.FromGFFFile(fHit);
+ HitGlix.MakeGlobalToLocalIndex();
+ ProgressDone();
+
+ const int GlobalLength = HitGlix.GetGlobalLength();
+ IIX IntervalIndex;
+ IntervalIndex.Init(GlobalLength);
+
+ ProgressStart("Find candidate TRs");
+ Rewind(fHit);
+ GFFRecord Rec;
+ while (GetNextGFFRecord(fHit, Rec))
+ {
+ HitData Hit;
+ GFFRecordToHit(HitGlix, Rec, Hit);
+ if (IsCandLTR(Hit))
+ AddCand(Hit, IntervalIndex);
+ }
+ ProgressDone();
+
+ Progress("%d candidates", CandCount);
+
+ if (0 != CandFileName)
+ {
+ ProgressStart("Write candidates");
+ FILE *fCand = OpenStdioFile(CandFileName, FILEIO_MODE_WriteOnly);
+ WriteCands(fCand, HitGlix);
+ ProgressDone();
+ }
+
+ ProgressStart("Make graph");
+ Rewind(fHit);
+ while (GetNextGFFRecord(fHit, Rec))
+ {
+ HitData Hit;
+ GFFRecordToHit(HitGlix, Rec, Hit);
+ FindEdges(Hit, HitGlix, IntervalIndex);
+ }
+ fclose(fHit);
+ fHit = 0;
+
+ ProgressDone();
+
+ Progress("%d edges", (int) Edges.size());
+
+ ProgressStart("Find families");
+ FamList Fams;
+ FindConnectedComponents(Edges, Fams, MIN_FAM_SIZE);
+ ProgressDone();
+
+ Progress("%d families", (int) Fams.size());
+
+ FILE *fOut = OpenStdioFile(OutFileName, FILEIO_MODE_WriteOnly);
+ WriteOutputFile(fOut, HitGlix, Fams);
+ }
diff --git a/trs.cpp b/trs.cpp
new file mode 100755
index 0000000..4dc047a
--- /dev/null
+++ b/trs.cpp
@@ -0,0 +1,682 @@
+#include "piler2.h"
+#include "bitfuncs.h"
+
+#define TRACE 0
+
+static int g_paramMinFamSize = 3;
+static int g_paramMaxLengthDiffPct = 5;
+static bool g_paramSingleHitCoverage = true;
+
+static PileData *Piles;
+static int PileCount;
+static int EdgeCount;
+
+static int MaxImageCount = 0;
+static int SeqLength;
+static int SeqLengthChunks;
+
+static PILE_INDEX_TYPE *IdentifyPiles(int *CopyCount)
+ {
+ PILE_INDEX_TYPE *PileIndexes = all(PILE_INDEX_TYPE, SeqLengthChunks);
+
+#if DEBUG
+ memset(PileIndexes, 0xff, SeqLengthChunks*sizeof(PILE_INDEX_TYPE));
+#endif
+
+ int PileIndex = -1;
+ bool InPile = false;
+ for (int i = 0; i < SeqLengthChunks; ++i)
+ {
+ if (BitIsSet(CopyCount, i))
+ {
+ if (!InPile)
+ {
+ ++PileIndex;
+ if (PileIndex > MAX_STACK_INDEX)
+ Quit("Too many stacks");
+ InPile = true;
+ }
+ PileIndexes[i] = PileIndex;
+ }
+ else
+ InPile = false;
+ }
+ PileCount = PileIndex + 1;
+ return PileIndexes;
+ }
+
+static void IncCopyCountImage(int *CopyCount, int From, int To)
+ {
+ if (From < 0)
+ Quit("From < 0");
+
+ From /= CHUNK_LENGTH;
+ To /= CHUNK_LENGTH;
+
+ if (From >= SeqLengthChunks)
+ {
+ Warning("IncCopyCountImage: From=%d, SeqLength=%d", From, SeqLengthChunks);
+ From = SeqLengthChunks - 1;
+ }
+ if (To >= SeqLengthChunks)
+ {
+ Warning("IncCopyCountImage: To=%d, SeqLength=%d", To, SeqLengthChunks);
+ To = SeqLengthChunks - 1;
+ }
+
+ if (From > To)
+ Quit("From > To");
+
+ for (int i = From; i <= To; ++i)
+ SetBit(CopyCount, i);
+ }
+
+static void IncCopyCount(int *CopyCount, const HitData &Hit)
+ {
+ IncCopyCountImage(CopyCount, Hit.TargetFrom, Hit.TargetTo);
+ IncCopyCountImage(CopyCount, Hit.QueryFrom, Hit.QueryTo);
+ }
+
+static int CmpHits(const void *ptrHit1, const void *ptrHit2)
+ {
+ HitData *Hit1 = (HitData *) ptrHit1;
+ HitData *Hit2 = (HitData *) ptrHit2;
+ return Hit1->QueryFrom - Hit2->QueryFrom;
+ }
+
+static int CmpImages(const void *ptrImage1, const void *ptrImage2)
+ {
+ PileImageData *Image1 = (PileImageData *) ptrImage1;
+ PileImageData *Image2 = (PileImageData *) ptrImage2;
+ return Image1->SIPile - Image2->SIPile;
+ }
+
+static void AssertImagesSorted(PileImageData *Images, int ImageCount)
+ {
+ for (int i = 0; i < ImageCount - 1; ++i)
+ if (Images[i].SIPile > Images[i+1].SIPile)
+ Quit("Images not sorted");
+ }
+
+static void SortImagesPile(PileImageData *Images, int ImageCount)
+ {
+ qsort(Images, ImageCount, sizeof(PileImageData), CmpImages);
+ }
+
+static void SortImages()
+ {
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ SortImagesPile(Pile.Images, Pile.ImageCount);
+#if DEBUG
+ AssertImagesSorted(Pile.Images, Pile.ImageCount);
+#endif
+ }
+ }
+
+static void CreatePiles(const HitData *Hits, int HitCount,
+ PILE_INDEX_TYPE *PileIndexes)
+ {
+ Piles = all(PileData, PileCount);
+ zero(Piles, PileData, PileCount);
+ for (int i = 0; i < PileCount; ++i)
+ {
+ Piles[i].FamIndex = -1;
+ Piles[i].SuperFamIndex = -1;
+ Piles[i].Rev = -1;
+ }
+
+// Count images in stack
+ ProgressStart("Create stacks: count images");
+ for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex)
+ {
+ const HitData &Hit = Hits[HitIndex];
+
+ int Pos = Hit.QueryFrom/CHUNK_LENGTH;
+ PILE_INDEX_TYPE PileIndex = PileIndexes[Pos];
+ assert(PileIndex == PileIndexes[Hit.QueryTo/CHUNK_LENGTH]);
+ assert(PileIndex >= 0 && PileIndex < PileCount);
+ ++(Piles[PileIndex].ImageCount);
+
+ Pos = Hit.TargetFrom/CHUNK_LENGTH;
+ PileIndex = PileIndexes[Pos];
+ assert(PileIndex >= 0 && PileIndex < PileCount);
+ assert(PileIndex == PileIndexes[Hit.TargetTo/CHUNK_LENGTH]);
+ ++(Piles[PileIndex].ImageCount);
+ }
+ ProgressDone();
+
+// Allocate memory for image list
+ int TotalImageCount = 0;
+ ProgressStart("Create stacks: allocate image memory");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ const int ImageCount = Pile.ImageCount;
+ TotalImageCount += ImageCount;
+ assert(ImageCount > 0);
+ Pile.Images = all(PileImageData, ImageCount);
+ }
+ ProgressDone();
+
+// Build image list
+ ProgressStart("Create stacks: build image list");
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ Pile.ImageCount = 0;
+ Pile.From = -1;
+ Pile.To = -1;
+ }
+
+ for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex)
+ {
+ const HitData &Hit = Hits[HitIndex];
+
+ const bool Rev = Hit.Rev;
+
+ const int Length1 = Hit.QueryTo - Hit.QueryFrom;
+ const int Length2 = Hit.TargetTo - Hit.TargetFrom;
+
+ const int From1 = Hit.QueryFrom;
+ const int From2 = Hit.TargetFrom;
+
+ const int To1 = Hit.QueryTo;
+ const int To2 = Hit.TargetTo;
+
+ const int Pos1 = From1/CHUNK_LENGTH;
+ const int Pos2 = From2/CHUNK_LENGTH;
+
+ PILE_INDEX_TYPE PileIndex1 = PileIndexes[Pos1];
+ PILE_INDEX_TYPE PileIndex2 = PileIndexes[Pos2];
+
+ assert(PileIndex1 == PileIndexes[(From1 + Length1 - 1)/CHUNK_LENGTH]);
+ assert(PileIndex1 >= 0 && PileIndex1 < PileCount);
+
+ assert(PileIndex2 == PileIndexes[(From2 + Length2 - 1)/CHUNK_LENGTH]);
+ assert(PileIndex2 >= 0 && PileIndex2 < PileCount);
+
+ PileData &Pile1 = Piles[PileIndex1];
+ PileImageData &Image1 = Pile1.Images[Pile1.ImageCount++];
+ Image1.SILength = Length2;
+ Image1.SIPile = PileIndex2;
+ Image1.SIRev = Rev;
+
+ PileData &Pile2 = Piles[PileIndex2];
+ PileImageData &Image2 = Pile2.Images[Pile2.ImageCount++];
+ Image2.SILength = Length1;
+ Image2.SIPile = PileIndex1;
+ Image2.SIRev = Rev;
+
+ if (Pile1.From == -1 || From1 < Pile1.From)
+ Pile1.From = From1;
+ if (Pile1.To == -1 || To1 > Pile1.To)
+ Pile1.To = To1;
+
+ if (Pile2.From == -1 || From2 < Pile2.From)
+ Pile2.From = From2;
+ if (Pile2.To == -1 || To2 > Pile2.To)
+ Pile2.To = To2;
+
+ if (Pile1.ImageCount > MaxImageCount)
+ MaxImageCount = Pile1.ImageCount;
+ if (Pile2.ImageCount > MaxImageCount)
+ MaxImageCount = Pile2.ImageCount;
+ }
+ ProgressDone();
+ }
+
+static int FindGlobalEdgesPileMulti(PileData &Pile, int PileIndex,
+ PILE_INDEX_TYPE Partners[], bool PartnersRev[])
+ {
+ const int PileLength = Pile.To - Pile.From + 1;
+ const int MinLength = (PileLength*(100 - g_paramMaxLengthDiffPct))/100;
+ const int MaxLength = (PileLength*(100 + g_paramMaxLengthDiffPct))/100;
+
+ const int ImageCount = Pile.ImageCount;
+ SortImagesPile(Pile.Images, ImageCount);
+
+#if TRACE
+ Log("Pile1 Pile2 Pile1L Pile2L Fract1 Fract2 Global [Multi]\n");
+ Log("------ ------ ------- ------- ------ ------ ------\n");
+#endif
+
+ int CurrentPartnerPileIndex = -1;
+ int BasesCovered = 0;
+ int PartnerCount = 0;
+ for (int ImageIndex = 0; ; ++ImageIndex)
+ {
+ int PartnerPileIndex;
+ int ImageLength = 0;
+ bool Rev = false;
+ if (ImageIndex < ImageCount)
+ {
+ const PileImageData &Image = Pile.Images[ImageIndex];
+ PartnerPileIndex = Image.SIPile;
+ ImageLength = Image.SILength;
+ Rev = Image.SIRev;
+ }
+ else
+ PartnerPileIndex = -1;
+
+ if (PartnerPileIndex == CurrentPartnerPileIndex)
+ BasesCovered += ImageLength;
+ else
+ {
+ if (CurrentPartnerPileIndex != -1)
+ {
+ const PileData &PartnerPile = Piles[CurrentPartnerPileIndex];
+ const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1;
+ bool IsGlobalMatch =
+ PartnerPileLength >= MinLength && PartnerPileLength <= MaxLength &&
+ BasesCovered >= MinLength && PartnerPileIndex != PileIndex;
+#if TRACE
+ Log("%6d %6d %7d %7d %5.0f%% %5.0f%% %c\n",
+ PileIndex,
+ CurrentPartnerPileIndex,
+ PileLength,
+ PartnerPileLength,
+ (BasesCovered*100.0)/PileLength,
+ (BasesCovered*100.0)/PartnerPileLength,
+ IsGlobalMatch ? 'Y' : 'N');
+#endif
+ if (IsGlobalMatch)
+ {
+ PartnersRev[PartnerCount] = Rev; // TODO
+ Partners[PartnerCount] = CurrentPartnerPileIndex;
+ ++PartnerCount;
+ }
+ }
+ CurrentPartnerPileIndex = PartnerPileIndex;
+ BasesCovered = ImageLength;
+ }
+ if (ImageIndex == ImageCount)
+ break;
+ }
+ return PartnerCount;
+ }
+
+static int FindGlobalEdgesPileSingle(PileData &Pile, int PileIndex,
+ PILE_INDEX_TYPE Partners[], bool PartnersRev[])
+ {
+ const int ImageCount = Pile.ImageCount;
+ const int PileLength = Pile.To - Pile.From + 1;
+
+ const int MinLength = (PileLength*(100 - g_paramMaxLengthDiffPct))/100;
+ const int MaxLength = (PileLength*(100 + g_paramMaxLengthDiffPct))/100;
+
+#if TRACE
+ Log("Pile1 Pile2 Pile1L Pile2L Fract1 Fract2 Global [Single]\n");
+ Log("------ ------ ------- ------- ------ ------ ------\n");
+#endif
+ int PartnerCount = 0;
+ for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex)
+ {
+ const PileImageData &Image = Pile.Images[ImageIndex];
+ const int PartnerImageLength = Image.SILength;
+ const int PartnerPileIndex = Image.SIPile;
+ const PileData &PartnerPile = Piles[PartnerPileIndex];
+ const int PartnerPileLength = PartnerPile.To - PartnerPile.From + 1;
+
+ bool IsGlobalImage =
+ PartnerPileLength >= MinLength && PartnerPileLength <= MaxLength &&
+ PartnerImageLength >= MinLength && PartnerImageLength <= MaxLength &&
+ PartnerPileIndex != PileIndex;
+#if TRACE
+ Log("%6d %6d %7d %7d %5.0f%% %5.0f%% %c\n",
+ PileIndex,
+ PartnerPileIndex,
+ PileLength,
+ PartnerPileLength,
+ (PartnerImageLength*100.0)/PileLength,
+ (PartnerImageLength*100.0)/PartnerPileLength,
+ IsGlobalImage ? 'Y' : 'N');
+#endif
+ if (IsGlobalImage)
+ {
+ PartnersRev[PartnerCount] = Image.SIRev;
+ Partners[PartnerCount] = PartnerPileIndex;
+ ++PartnerCount;
+ }
+ }
+ return PartnerCount;
+ }
+
+static void AddEdges(EdgeList &Edges, PILE_INDEX_TYPE PileIndex,
+ PILE_INDEX_TYPE Partners[], bool PartnersRev[], int PartnerCount)
+ {
+ EdgeCount += PartnerCount;
+ for (int i = 0; i < PartnerCount; ++i)
+ {
+ int PileIndex2 = Partners[i];
+ EdgeData Edge;
+ Edge.Node1 = PileIndex;
+ Edge.Node2 = PileIndex2;
+ Edge.Rev = PartnersRev[i];
+ Edges.push_back(Edge);
+ }
+ }
+
+static void FindGlobalEdges(EdgeList &Edges, int MaxImageCount)
+ {
+ Edges.clear();
+
+ PILE_INDEX_TYPE *Partners = all(PILE_INDEX_TYPE, MaxImageCount);
+ bool *PartnersRev = all(bool, MaxImageCount);
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ PileData &Pile = Piles[PileIndex];
+ int PartnerCount;
+ if (g_paramSingleHitCoverage)
+ PartnerCount = FindGlobalEdgesPileSingle(Pile, PileIndex, Partners, PartnersRev);
+ else
+ PartnerCount = FindGlobalEdgesPileMulti(Pile, PileIndex, Partners, PartnersRev);
+ AddEdges(Edges, PileIndex, Partners, PartnersRev, PartnerCount);
+ }
+ freemem(Partners);
+ freemem(PartnersRev);
+ }
+
+static void AssignFamsToPiles(FamList &Fams)
+ {
+ int FamIndex = 0;
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ PileData &Pile = Piles[PileIndex];
+ Pile.FamIndex = FamIndex;
+ Pile.Rev = (int) FamMember.Rev;
+ }
+ ++FamIndex;
+ }
+ }
+
+static inline unsigned TriangleSubscript(unsigned FamCount, unsigned i, unsigned j)
+ {
+ assert(i >= 0 && j >= 0 && i < FamCount && j < FamCount);
+ unsigned v;
+ if (i >= j)
+ v = j + (i*(i - 1))/2;
+ else
+ v = i + (j*(j - 1))/2;
+ assert(v < (FamCount*(FamCount - 1))/2);
+ return v;
+ }
+
+static void FindSuperFamEdges(FamList &Fams, EdgeList &Edges)
+ {
+ const int FamCount = (int) Fams.size();
+
+// Allocate triangular array Related[i][j], value is true
+// iff families i and j are related (i.e., there is a local
+// alignment between some member of i and some member of j).
+ const int TriangleSize = (FamCount*(FamCount - 1))/2;
+ bool *Related = all(bool, TriangleSize);
+ zero(Related, bool, TriangleSize);
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamData *Fam = *p;
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ const PileData &Pile = Piles[PileIndex];
+ const int FamIndex = Pile.FamIndex;
+ if (-1 == FamIndex)
+ continue;
+ const int ImageCount = Pile.ImageCount;
+ for (int ImageIndex = 0; ImageIndex < ImageCount; ++ImageIndex)
+ {
+ const PileImageData &Image = Pile.Images[ImageIndex];
+ const int PartnerPileIndex = Image.SIPile;
+ if (PartnerPileIndex == PileIndex)
+ continue;
+ const PileData &PartnerPile = Piles[PartnerPileIndex];
+ const int PartnerFamIndex = PartnerPile.FamIndex;
+ if (-1 == PartnerFamIndex || PartnerFamIndex == FamIndex)
+ continue;
+ const int Index = TriangleSubscript(FamCount, FamIndex, PartnerFamIndex);
+ assert(Index >= 0 && Index < TriangleSize);
+ Related[Index] = true;
+ }
+ }
+ }
+
+ Edges.clear();
+ for (int i = 0; i < FamCount; ++i)
+ for (int j = i + 1; j < FamCount; ++j)
+ {
+ const int Index = TriangleSubscript(FamCount, i, j);
+ if (Related[Index])
+ {
+// Log("R %d %d\n", i, j);
+ EdgeData Edge;
+ Edge.Node1 = i;
+ Edge.Node2 = j;
+ Edge.Rev = false;
+ Edges.push_back(Edge);
+ }
+ }
+ }
+
+static void AssignSuperFamsToPiles(FamList &Fams, FamList &SuperFams)
+ {
+ const int FamCount = (int) Fams.size();
+ FamData **FamVect = all(FamData *, FamCount);
+
+ int FamIndex = 0;
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamVect[FamIndex] = *p;
+ ++FamIndex;
+ }
+
+ int SuperFamIndex = 0;
+ for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF)
+ {
+ FamData &SFFams = *(*pSF);
+ for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p)
+ {
+ FamMemberData &FamMember = *p;
+ int FamIndex = FamMember.PileIndex;
+ assert(FamIndex >= 0 && FamIndex < FamCount);
+ FamData *Fam = FamVect[FamIndex];
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ assert(PileIndex >= 0 && PileIndex < PileCount);
+ PileData &Pile = Piles[PileIndex];
+ assert(Pile.FamIndex == FamIndex);
+ Pile.SuperFamIndex = SuperFamIndex;
+ }
+ }
+ ++SuperFamIndex;
+ }
+ }
+
+static void FindSingletonSuperFams(FamList &Fams, FamList &SuperFams)
+ {
+ const int FamCount = (int) Fams.size();
+ FamData **FamVect = all(FamData *, FamCount);
+ bool *FamAssigned = all(bool, FamCount);
+
+ int FamIndex = 0;
+ for (PtrFamList p = Fams.begin(); p != Fams.end(); ++p)
+ {
+ FamVect[FamIndex] = *p;
+ FamAssigned[FamIndex] = false;
+ ++FamIndex;
+ }
+
+// Flag families that have been assigned to superfamilies
+ for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF)
+ {
+ FamData &SFFams = *(*pSF);
+ for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p)
+ {
+ FamMemberData &FamMember = *p;
+ int FamIndex = FamMember.PileIndex;
+ assert(FamIndex >= 0 && FamIndex < FamCount);
+ FamAssigned[FamIndex] = true;
+ }
+ }
+
+// Create new superfamily for each unassigned family
+ for (int FamIndex = 0; FamIndex < FamCount; ++FamIndex)
+ {
+ if (FamAssigned[FamIndex])
+ continue;
+
+ FamMemberData Fam;
+ Fam.PileIndex = FamIndex;
+ Fam.Rev = false;
+
+ FamData *SuperFam = new FamData;
+ SuperFam->push_back(Fam);
+
+ SuperFams.push_back(SuperFam);
+ }
+
+// Validate
+ int SuperFamIndex = 0;
+ for (PtrFamList pSF = SuperFams.begin(); pSF != SuperFams.end(); ++pSF)
+ {
+ FamData &SFFams = *(*pSF);
+ for (PtrFamData p = SFFams.begin(); p != SFFams.end(); ++p)
+ {
+ FamMemberData &FamMember = *p;
+ int FamIndex = FamMember.PileIndex;
+ assert(FamIndex >= 0 && FamIndex < FamCount);
+ FamData *Fam = FamVect[FamIndex];
+
+ for (PtrFamData q = Fam->begin(); q != Fam->end(); ++q)
+ {
+ FamMemberData &FamMember = *q;
+ int PileIndex = FamMember.PileIndex;
+ if (PileIndex == 5354)
+ Log("");
+ PileData &Pile = Piles[PileIndex];
+
+ assert(Pile.FamIndex == FamIndex);
+ }
+ }
+ ++SuperFamIndex;
+ }
+ }
+
+void TRS()
+ {
+ const char *InputFileName = RequiredValueOpt("trs");
+
+ const char *OutputFileName = ValueOpt("out");
+ const char *PilesFileName = ValueOpt("piles");
+ const char *ImagesFileName = ValueOpt("images");
+
+ const char *strMinFamSize = ValueOpt("famsize");
+ const char *strMaxLengthDiffPct = ValueOpt("maxlengthdiffpct");
+ g_paramSingleHitCoverage = !FlagOpt("multihit");
+
+ if (0 == OutputFileName && 0 == PilesFileName && 0 == ImagesFileName)
+ Quit("No output file specified, must be at least one of -out, -piles, -images");
+
+ if (0 != strMinFamSize)
+ g_paramMinFamSize = atoi(strMinFamSize);
+ if (0 != strMaxLengthDiffPct)
+ g_paramMaxLengthDiffPct = atoi(strMaxLengthDiffPct);
+
+ Log("singlehit=%s famsize=%d maxlengthdiffpct=%d\n",
+ g_paramSingleHitCoverage ? "True" : "False",
+ g_paramMinFamSize,
+ g_paramMaxLengthDiffPct);
+
+ ProgressStart("Read hit file");
+ int HitCount;
+ int SeqLength;
+ HitData *Hits = ReadHits(InputFileName, &HitCount, &SeqLength);
+ ProgressDone();
+
+ Progress("%d hits", HitCount);
+
+ SeqLengthChunks = (SeqLength + CHUNK_LENGTH - 1)/CHUNK_LENGTH;
+
+ const int BitVectorLength = (SeqLengthChunks + BITS_PER_INT - 1)/BITS_PER_INT;
+ int *CopyCount = all(int, BitVectorLength);
+ zero(CopyCount, int, BitVectorLength);
+
+ ProgressStart("Compute copy counts");
+ for (int i = 0; i < HitCount; ++i)
+ IncCopyCount(CopyCount, Hits[i]);
+ ProgressDone();
+
+ ProgressStart("Identify piles");
+ PILE_INDEX_TYPE *PileIndexes = IdentifyPiles(CopyCount);
+ ProgressDone();
+
+ Progress("%d stacks", PileCount);
+
+ freemem(CopyCount);
+ CopyCount = 0;
+
+ CreatePiles(Hits, HitCount, PileIndexes);
+
+ if (0 != ImagesFileName)
+ {
+ ProgressStart("Writing images file");
+ WriteImages(ImagesFileName, Hits, HitCount, PileIndexes);
+ ProgressDone();
+ }
+
+ freemem(Hits);
+ Hits = 0;
+
+ if (0 != PilesFileName)
+ {
+ ProgressStart("Writing piles file");
+ WritePiles(PilesFileName, Piles, PileCount);
+ ProgressDone();
+ }
+
+ freemem(PileIndexes);
+ PileIndexes = 0;
+
+ if (0 == OutputFileName)
+ return;
+
+ ProgressStart("Find edges");
+ EdgeList Edges;
+ FindGlobalEdges(Edges, MaxImageCount);
+ ProgressDone();
+
+ Progress("%d edges", (int) Edges.size());
+
+ ProgressStart("Find families");
+ FamList Fams;
+ FindConnectedComponents(Edges, Fams, g_paramMinFamSize);
+ AssignFamsToPiles(Fams);
+ ProgressDone();
+
+ Progress("%d families", (int) Fams.size());
+
+ ProgressStart("Find superfamilies");
+ EdgeList SuperEdges;
+ FindSuperFamEdges(Fams, SuperEdges);
+
+ FamList SuperFams;
+ FindConnectedComponents(SuperEdges, SuperFams, 1);
+ FindSingletonSuperFams(Fams, SuperFams);
+
+ AssignSuperFamsToPiles(Fams, SuperFams);
+ ProgressDone();
+
+ Progress("%d superfamilies", (int) SuperFams.size());
+
+ ProgressStart("Write TRS output file");
+ WriteTRSFile(OutputFileName, Piles, PileCount);
+ ProgressDone();
+ }
diff --git a/trs2fasta.cpp b/trs2fasta.cpp
new file mode 100755
index 0000000..b52b625
--- /dev/null
+++ b/trs2fasta.cpp
@@ -0,0 +1,142 @@
+#include "piler2.h"
+
+static void LogRecords(const TRSData *TRSs, int TRSCount)
+ {
+ Log(" Contig From To Length + Family\n");
+ Log("-------------------- ---------- ---------- ------ - ------\n");
+ for (int TRSIndex = 0; TRSIndex < TRSCount; ++TRSIndex)
+ {
+ const TRSData &TRS = TRSs[TRSIndex];
+ Log("%20.20s %10d %10d %6d %c %d.%d\n",
+ TRS.ContigLabel,
+ TRS.ContigFrom,
+ TRS.ContigTo,
+ TRS.ContigTo - TRS.ContigFrom + 1,
+ TRS.Rev ? '-' : '+',
+ TRS.SuperFamIndex,
+ TRS.FamIndex);
+ }
+ }
+
+static int CmpTRS(const void *s1, const void *s2)
+ {
+ const TRSData *TRS1 = (const TRSData *) s1;
+ const TRSData *TRS2 = (const TRSData *) s2;
+ return TRS1->FamIndex - TRS2->FamIndex;
+ }
+
+static char *FamFileName(const char *Path, int Fam, int SuperFam)
+ {
+ static char FileName[256];
+ char s[128];
+ if (SuperFam == -1)
+ sprintf(s, "/%d", Fam);
+ else
+ sprintf(s, "/%d.%d", SuperFam, Fam);
+ if (strlen(Path) + strlen(s) + 3 >= sizeof(FileName))
+ Quit("Path name too long");
+ strcpy(FileName, Path);
+ strcat(FileName, s);
+ return FileName;
+ }
+
+static char *TRSLabel(const char *Prefix, const TRSData &TRS)
+ {
+ int n = (int) strlen(TRS.ContigLabel) + 128;
+ char *s = all(char, n);
+ if (TRS.SuperFamIndex == -1)
+ {
+ if (0 != Prefix)
+ sprintf(s, "%s.%d %s:%d%c",
+ Prefix,
+ TRS.FamIndex,
+ TRS.ContigLabel,
+ TRS.ContigFrom+1,
+ TRS.Rev ? '-' : '+');
+ else
+ sprintf(s, "%d.%s:%d%c",
+ TRS.FamIndex,
+ TRS.ContigLabel,
+ TRS.ContigFrom+1,
+ TRS.Rev ? '-' : '+');
+ }
+ else
+ {
+ if (0 != Prefix)
+ sprintf(s, "%s.%d.%d %s:%d%c",
+ Prefix,
+ TRS.SuperFamIndex,
+ TRS.FamIndex,
+ TRS.ContigLabel,
+ TRS.ContigFrom+1,
+ TRS.Rev ? '-' : '+');
+ else
+ sprintf(s, "%d.%d %s:%d%c",
+ TRS.SuperFamIndex,
+ TRS.FamIndex,
+ TRS.ContigLabel,
+ TRS.ContigFrom+1,
+ TRS.Rev ? '-' : '+');
+ }
+
+ return s;
+ }
+
+void TRS2Fasta()
+ {
+ const char *TRSFileName = RequiredValueOpt("trs2fasta");
+ const char *SeqFileName = RequiredValueOpt("seq");
+ const char *Path = ValueOpt("path");
+ const char *strMaxFam = ValueOpt("maxfam");
+ const char *Prefix = ValueOpt("prefix");
+
+ int MaxFam = DEFAULT_MAX_FAM;
+ if (strMaxFam != 0)
+ MaxFam = atoi(strMaxFam);
+
+ if (0 == Path)
+ Path = ".";
+
+ ProgressStart("Reading seq file");
+ int SeqLength;
+ const char *Seq = ReadMFA(SeqFileName, &SeqLength);
+ ProgressDone();
+
+ Progress("Seq length %d bases, %.3g Mb", SeqLength, SeqLength/1e6);
+
+ ProgressStart("Read TRS file");
+ int TRSCount;
+ TRSData *TRSs = ReadTRS(TRSFileName, &TRSCount);
+ ProgressDone();
+
+ Progress("%d records", TRSCount);
+
+ ProgressStart("Sorting by family");
+ qsort((void *) TRSs, TRSCount, sizeof(TRSData), CmpTRS);
+ ProgressDone();
+
+ FILE *f = 0;
+ int CurrentFamily = -1;
+ int MemberCount = 0;
+ for (int TRSIndex = 0; TRSIndex < TRSCount; ++TRSIndex)
+ {
+ const TRSData &TRS = TRSs[TRSIndex];
+ if (TRS.FamIndex != CurrentFamily)
+ {
+ if (f != 0)
+ fclose(f);
+ char *FastaFileName = FamFileName(Path, TRS.FamIndex, TRS.SuperFamIndex);
+ f = OpenStdioFile(FastaFileName, FILEIO_MODE_WriteOnly);
+ CurrentFamily = TRS.FamIndex;
+ MemberCount = 0;
+ }
+ ++MemberCount;
+ if (MemberCount > MaxFam)
+ continue;
+ const int From = ContigToGlobal(TRS.ContigFrom, TRS.ContigLabel);
+ const int Length = TRS.ContigTo - TRS.ContigFrom + 1;
+ char *Label = TRSLabel(Prefix, TRS);
+ WriteFasta(f, Seq + From, Length, Label, TRS.Rev);
+ freemem(Label);
+ }
+ }
diff --git a/types.h b/types.h
new file mode 100755
index 0000000..9505f14
--- /dev/null
+++ b/types.h
@@ -0,0 +1,152 @@
+#include <list>
+#include <vector>
+using namespace std;
+
+enum FILEIO_MODE
+ {
+ FILEIO_MODE_Undefined = 0,
+ FILEIO_MODE_ReadOnly,
+ FILEIO_MODE_ReadWrite,
+ FILEIO_MODE_WriteOnly
+ };
+
+#if FILEIO_BINARY_MODE
+#define FILIO_STRMODE_ReadOnly "rb"
+#define FILIO_STRMODE_WriteOnly "wb"
+#define FILIO_STRMODE_ReadWrite "w+b"
+#else
+#define FILIO_STRMODE_ReadOnly "r"
+#define FILIO_STRMODE_WriteOnly "w"
+#define FILIO_STRMODE_ReadWrite "w+"
+#endif
+
+struct HitData
+ {
+ int QueryFrom;
+ int QueryTo;
+ int TargetFrom;
+ int TargetTo;
+ bool Rev;
+ };
+
+typedef int PILE_INDEX_TYPE;
+const PILE_INDEX_TYPE MAX_STACK_INDEX = 0x7ffffff0;
+struct PileImageData
+ {
+// Hack to enforce structure packing
+// in a compiler-independent way
+ PILE_INDEX_TYPE Data[3];
+#define SILength Data[0]
+#define SIPile Data[1]
+#define SIRev Data[2]
+ };
+
+struct EdgeData
+ {
+ int Node1;
+ int Node2;
+ bool Rev;
+ };
+typedef list<EdgeData> EdgeList;
+typedef EdgeList::iterator PtrEdgeList;
+
+struct FamMemberData
+ {
+ int PileIndex;
+ bool Rev;
+ };
+typedef list<FamMemberData> FamData;
+typedef FamData::iterator PtrFamData;
+
+typedef list<FamData *> FamList;
+typedef FamList::iterator PtrFamList;
+
+typedef list<int> IntList;
+typedef IntList::iterator PtrIntList;
+
+typedef list<IntList *> ListOfIntListPtrs;
+typedef ListOfIntListPtrs::iterator PtrListOfIntListPtrs;
+
+struct TRSData
+ {
+ char *ContigLabel;
+ int ContigFrom;
+ int ContigTo;
+ int FamIndex;
+ int SuperFamIndex;
+ bool Rev;
+ };
+
+struct MotifData
+ {
+ char *ContigLabel;
+ int ContigFrom;
+ int ContigTo;
+ int FamIndex;
+ };
+
+struct PileData
+ {
+ int From;
+ int To;
+ int ImageCount;
+ PileImageData *Images;
+ int SuperFamIndex;
+ int FamIndex;
+ int Rev;
+ };
+
+struct ContigData
+ {
+ int From;
+ int Length;
+ char *Label;
+ ContigData *Next;
+ };
+
+struct GFFRecord
+ {
+ char SeqName[MAX_GFF_FEATURE_LENGTH+1];
+ char Source[MAX_GFF_FEATURE_LENGTH+1];
+ char Feature[MAX_GFF_FEATURE_LENGTH+1];
+ int Start;
+ int End;
+ float Score;
+ char Strand;
+ int Frame;
+ char Attrs[MAX_GFF_FEATURE_LENGTH+1];
+ };
+
+struct GFFRecord2
+ {
+ const char *SeqName;
+ const char *Source;
+ const char *Feature;
+ int Start;
+ int End;
+ float Score;
+ char Strand;
+ int Frame;
+ const char *Attrs;
+ };
+
+struct RepData
+ {
+ char *ContigLabel;
+ int ContigFrom;
+ int ContigTo;
+ char *RepeatName;
+ char *RepeatClass;
+ int RepeatFrom;
+ int RepeatTo;
+ int RepeatLeft;
+ bool Rev;
+ };
+
+class GFFSet;
+class GLIX;
+class IIX;
+
+#include "gffset.h"
+#include "glix.h"
+#include "iix.h"
diff --git a/usage.cpp b/usage.cpp
new file mode 100755
index 0000000..d444575
--- /dev/null
+++ b/usage.cpp
@@ -0,0 +1,40 @@
+#include "piler2.h"
+
+void Credits()
+ {
+ static bool Displayed = false;
+ if (Displayed)
+ return;
+
+ fprintf(stderr, "\n" PILER_LONG_VERSION "\n");
+ fprintf(stderr, "http://www.drive5.com/piler\n");
+ fprintf(stderr, "Written by Robert C. Edgar\n");
+ fprintf(stderr, "This software is donated to the public domain.\n");
+ fprintf(stderr, "Please visit web site for requested citation.\n");
+ Displayed = true;
+ }
+
+void Usage()
+ {
+ Credits();
+ fprintf(stderr,
+"\n"
+"Usage:\n"
+" piler -trs <hitfile> -out <trsfile> [options]\n"
+" piler -trs2fasta <trsfile> -seq <fastafile> [-path <path>] [-maxfam <maxfam>]\n"
+" piler -cons <alnfile> -out <fastafile> -label <label>\n"
+" piler -annot <gff> -rep <repfile> -out <annotfile>\n"
+" piler -help\n"
+" piler -version\n"
+"\n"
+"Use -quiet to suppress progress messages\n"
+"\n"
+"-trs options:\n"
+" -mincover <n>\n"
+" -maxlengthdiffpct <n>\n"
+" -piles <pilefile>\n"
+" -images <imagefile>\n"
+" -multihit\n"
+"\n"
+"For further information, please see the User Guide.\n");
+ }
diff --git a/utils.cpp b/utils.cpp
new file mode 100755
index 0000000..b03323a
--- /dev/null
+++ b/utils.cpp
@@ -0,0 +1,89 @@
+#include "piler2.h"
+
+char *strsave(const char *s)
+ {
+ if (0 == s)
+ return 0;
+ char *ptrCopy = strdup(s);
+ if (0 == ptrCopy)
+ Quit("Out of memory");
+ return ptrCopy;
+ }
+
+static const char *StdioStrMode(FILEIO_MODE Mode)
+ {
+ switch (Mode)
+ {
+ case FILEIO_MODE_ReadOnly:
+ return FILIO_STRMODE_ReadOnly;
+ case FILEIO_MODE_WriteOnly:
+ return FILIO_STRMODE_WriteOnly;
+ case FILEIO_MODE_ReadWrite:
+ return FILIO_STRMODE_ReadWrite;
+ }
+ Quit("StdioStrMode: Invalid mode");
+ return "r";
+ }
+
+FILE *OpenStdioFile(const char *FileName, FILEIO_MODE Mode)
+ {
+ const char *strMode = StdioStrMode(Mode);
+ FILE *f = fopen(FileName, strMode);
+ if (0 == f)
+ Quit("Cannot open %s, %s [%d]", FileName, strerror(errno), errno);
+ return f;
+ }
+
+void Rewind(FILE *f)
+ {
+ int Ok = fseek(f, 0, SEEK_SET);
+ if (Ok != 0)
+ Quit("Rewind fseek() != 0 errno=%d", errno);
+
+ long CurrPos = ftell(f);
+ if (CurrPos != 0)
+ Quit("FileSize: ftell=%ld errno=%d", CurrPos, errno);
+ }
+
+int GetFileSize(FILE *f)
+ {
+ long CurrPos = ftell(f);
+ if (CurrPos < 0)
+ Quit("FileSize: ftell<0 (CurrPos), errno=%d", errno);
+
+ int Ok = fseek(f, 0, SEEK_END);
+ if (Ok != 0)
+ Quit("FileSize fseek(END) != 0 errno=%d", errno);
+
+ long Size = ftell(f);
+ if (Size < 0)
+ Quit("FileSize: ftell<0 (size), errno=%d", errno);
+
+ Ok = fseek(f, CurrPos, SEEK_SET);
+ if (Ok != 0)
+ Quit("FileSize fseek(restore curr pos) != 0 errno=%d", errno);
+
+ long NewPos = ftell(f);
+ if (CurrPos < 0)
+ Quit("FileSize: ftell=%ld != CurrPos=%ld", CurrPos, NewPos);
+
+ return (int) Size;
+ }
+
+int Overlap(int From1, int To1, int From2, int To2)
+ {
+ return min(To1, To2) - max(From1, From2);
+ }
+
+// Add minimum possible value of j to i such that:
+// (a) j >= MinInc, and
+// (b) (i + j)%MultipleOf = 0.
+int RoundUp(int i, int MinInc, int MultipleOf)
+ {
+ int newi = i + MinInc;
+ int k = newi%MultipleOf;
+ if (k > 0)
+ newi += (MultipleOf - k);
+ assert(newi >= i + MinInc && 0 == newi%MultipleOf);
+ return newi;
+ }
diff --git a/utils_linux.cpp b/utils_linux.cpp
new file mode 100755
index 0000000..f6eda84
--- /dev/null
+++ b/utils_linux.cpp
@@ -0,0 +1,76 @@
+#include "piler2.h"
+
+#if linux || __linux__
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+double GetRAMSize()
+ {
+ const double DEFAULT_RAM = 1e9;
+ static double RAMMB = 0;
+ if (RAMMB != 0)
+ return RAMMB;
+
+ int fd = open("/proc/meminfo", O_RDONLY);
+ if (-1 == fd)
+ return DEFAULT_RAM;
+
+ char Buffer[128];
+ int n = read(fd, Buffer, sizeof(Buffer) - 1);
+ close(fd);
+ fd = -1;
+
+ if (n <= 0)
+ return DEFAULT_RAM;
+
+ Buffer[n] = 0;
+ char *pMem = strstr(Buffer, "Mem: ");
+ if (0 == pMem)
+ return DEFAULT_RAM;
+ int Bytes = atoi(pMem+4);
+ return (double) Bytes;
+ }
+
+static unsigned g_uPeakMemUseBytes;
+
+unsigned GetMaxMemUseBytes()
+ {
+ return g_uPeakMemUseBytes;
+ }
+
+unsigned GetMemUseBytes()
+ {
+ static char statm[64];
+ static int PageSize;
+ if (0 == statm[0])
+ {
+ PageSize = sysconf(_SC_PAGESIZE);
+ pid_t pid = getpid();
+ sprintf(statm, "/proc/%d/statm", (int) pid);
+ }
+
+ int fd = open(statm, O_RDONLY);
+ if (-1 == fd)
+ return 1000000;
+ char Buffer[64];
+ int n = read(fd, Buffer, sizeof(Buffer) - 1);
+ close(fd);
+ fd = -1;
+
+ if (n <= 0)
+ return 1000000;
+
+ Buffer[n] = 0;
+ int Pages = atoi(Buffer);
+
+ unsigned uBytes = Pages*PageSize;
+ if (uBytes > g_uPeakMemUseBytes)
+ g_uPeakMemUseBytes = uBytes;
+ return uBytes;
+ }
+
+#endif
diff --git a/utils_unix.cpp b/utils_unix.cpp
new file mode 100755
index 0000000..0d28c15
--- /dev/null
+++ b/utils_unix.cpp
@@ -0,0 +1,38 @@
+#include "piler2.h"
+
+#if ((unix || __unix) && !(linux || __linux__))
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/unistd.h>
+
+double GetRAMSize()
+ {
+ struct rlimit RL;
+ int Ok = getrlimit(RLIMIT_DATA, &RL);
+ if (Ok != 0)
+ return 1e9;
+ return RL.rlim_cur;
+ }
+
+static unsigned g_uPeakMemUseBytes = 1000000;
+
+unsigned GetPeakMemUseBytes()
+ {
+ return g_uPeakMemUseBytes;
+ }
+
+unsigned GetMemUseBytes()
+ {
+ struct rusage RU;
+ int Ok = getrusage(RUSAGE_SELF, &RU);
+ if (Ok != 0)
+ return 1000000;
+
+ unsigned Bytes = RU.ru_maxrss*1000;
+ if (Bytes > g_uPeakMemUseBytes)
+ g_uPeakMemUseBytes = Bytes;
+ return Bytes;
+ }
+
+#endif
diff --git a/utils_win32.cpp b/utils_win32.cpp
new file mode 100755
index 0000000..851dc02
--- /dev/null
+++ b/utils_win32.cpp
@@ -0,0 +1,34 @@
+#include "piler2.h"
+
+#if WIN32
+#include <windows.h>
+#include <psapi.h>
+
+double GetRAMSize()
+ {
+ MEMORYSTATUS MS;
+ GlobalMemoryStatus(&MS);
+ return (double) MS.dwTotalPhys;
+ }
+
+static unsigned g_uMaxMemUseBytes;
+
+unsigned GetMaxMemUseBytes()
+ {
+ return g_uMaxMemUseBytes;
+ }
+
+unsigned GetMemUseBytes()
+ {
+ HANDLE hProc = GetCurrentProcess();
+ PROCESS_MEMORY_COUNTERS PMC;
+ BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC));
+ if (!bOk)
+ return 1000000;
+ unsigned uBytes = (unsigned) PMC.WorkingSetSize;
+ if (uBytes > g_uMaxMemUseBytes)
+ g_uMaxMemUseBytes = uBytes;
+ return uBytes;
+ }
+
+#endif
diff --git a/writecrisp.cpp b/writecrisp.cpp
new file mode 100755
index 0000000..9f33ef0
--- /dev/null
+++ b/writecrisp.cpp
@@ -0,0 +1,61 @@
+#include "piler2.h"
+
+static void WriteCrisp(FILE *f, const PileData &Pile, int PileIndex)
+ {
+ int FamIndex = Pile.FamIndex;
+ assert(FamIndex >= 0);
+
+ int ContigFrom;
+ const char *ContigLabel = GlobalToContig(Pile.From, &ContigFrom);
+ const int Length = Pile.To - Pile.From + 1;
+ const int ContigTo = ContigFrom + Length - 1;
+ const char Strand = Pile.Rev ? '-' : '+';
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+ fprintf(f, "%s\tpiler\ttrs\t%d\t%d\t0\t%c\t.\tFamily %d ; Pile %d\n",
+ ContigLabel,
+ ContigFrom + 1,
+ ContigTo + 1,
+ Strand,
+ FamIndex,
+ PileIndex);
+ }
+
+void WriteCrispFile(const char *OutputFileName, const PileData *Piles, int PileCount)
+ {
+ FILE *f = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
+
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ const PileData &Pile = Piles[PileIndex];
+ if (-1 == Pile.FamIndex)
+ continue;
+
+ WriteCrisp(f, Pile, PileIndex);
+ }
+
+ fclose(f);
+ }
+
+void WriteArray(FILE *f, int FamIndex, int Lo, int Hi)
+ {
+ assert(FamIndex >= 0);
+
+ int ContigFrom;
+ const char *ContigLabel = GlobalToContig(Lo, &ContigFrom);
+ const int Length = Hi - Lo + 1;
+ const int ContigTo = ContigFrom + Length - 1;
+ const char Strand = '+';
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+ fprintf(f, "%s\tpiler\ttrs\t%d\t%d\t0\t%c\t.\tFamily %d\n",
+ ContigLabel,
+ ContigFrom + 1,
+ ContigTo + 1,
+ Strand,
+ FamIndex);
+ }
diff --git a/writefasta.cpp b/writefasta.cpp
new file mode 100755
index 0000000..c685ba4
--- /dev/null
+++ b/writefasta.cpp
@@ -0,0 +1,62 @@
+#include "piler2.h"
+
+
+unsigned char CompChar[256];
+
+static void InitCompChar()
+ {
+ for (unsigned i = 0; i < 256; ++i)
+ CompChar[i] = (unsigned char) i;
+
+ CompChar['a'] = 't';
+ CompChar['c'] = 'g';
+ CompChar['g'] = 'c';
+ CompChar['t'] = 'a';
+ CompChar['n'] = 'n';
+ CompChar['A'] = 'T';
+ CompChar['C'] = 'G';
+ CompChar['G'] = 'C';
+ CompChar['T'] = 'A';
+ }
+
+void WriteFasta(FILE *f, const char *Seq, int Length, const char *Label, bool Rev)
+ {
+ static bool CompCharInit = false;
+ if (!CompCharInit)
+ {
+ InitCompChar();
+ CompCharInit = true;
+ }
+
+ fprintf(f, ">%s\n", Label);
+
+ if (Rev)
+ {
+ int Index = Length;
+ for (int i = 0; i < Length; i += FASTA_BLOCK)
+ {
+ int n = FASTA_BLOCK;
+ if (i + n > Length)
+ n = Length - i;
+ for (int j = 0; j < n; ++j)
+ {
+ const unsigned char c = Seq[--Index];
+ const unsigned char cComp = CompChar[c];
+ fputc(cComp, f);
+ }
+ fputc('\n', f);
+ }
+ }
+ else
+ {
+ for (int i = 0; i < Length; i += FASTA_BLOCK)
+ {
+ int n = FASTA_BLOCK;
+ if (i + n > Length)
+ n = Length - i;
+ for (int j = 0; j < n; ++j)
+ fputc(*Seq++, f);
+ fputc('\n', f);
+ }
+ }
+ }
diff --git a/writeimages.cpp b/writeimages.cpp
new file mode 100755
index 0000000..fbfd0b7
--- /dev/null
+++ b/writeimages.cpp
@@ -0,0 +1,50 @@
+#include "piler2.h"
+
+static void WriteHit(FILE *f, const HitData &Hit, const PILE_INDEX_TYPE *PileIndexes)
+ {
+ const int TargetFrom = Hit.TargetFrom;
+ const int QueryFrom = Hit.QueryFrom;
+
+ int TargetContigFrom;
+ const char *TargetLabel = GlobalToContig(TargetFrom, &TargetContigFrom);
+
+ int QueryContigFrom;
+ const char *QueryLabel = GlobalToContig(QueryFrom, &QueryContigFrom);
+
+ const int TargetLength = Hit.TargetTo - TargetFrom + 1;
+ const int QueryLength = Hit.QueryTo - QueryFrom + 1;
+
+ const int QueryPileIndex = PileIndexes[QueryFrom];
+ const int TargetPileIndex = PileIndexes[TargetFrom];
+
+ const int QueryContigTo = QueryContigFrom + QueryLength - 1;
+ const int TargetContigTo = TargetContigFrom + TargetLength - 1;
+
+ const char Strand = Hit.Rev ? '-' : '+';
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+ fprintf(f, "%s\tpiler\thit\t%d\t%d\t0\t%c\t.\tTarget %s %d %d ; Piles %d %d\n",
+ QueryLabel,
+ QueryContigFrom + 1,
+ QueryContigTo + 1,
+ Strand,
+ TargetLabel,
+ TargetContigFrom + 1,
+ TargetContigTo + 1,
+ QueryPileIndex,
+ TargetPileIndex);
+ }
+
+void WriteImages(const char *FileName, const HitData *Hits, int HitCount,
+ const PILE_INDEX_TYPE *PileIndexes)
+ {
+ FILE *f = OpenStdioFile(FileName, FILEIO_MODE_WriteOnly);
+ for (int HitIndex = 0; HitIndex < HitCount; ++HitIndex)
+ {
+ const HitData &Hit = Hits[HitIndex];
+ WriteHit(f, Hit, PileIndexes);
+ }
+ fclose(f);
+ }
diff --git a/writepiles.cpp b/writepiles.cpp
new file mode 100755
index 0000000..45b57de
--- /dev/null
+++ b/writepiles.cpp
@@ -0,0 +1,31 @@
+#include "piler2.h"
+
+static void WritePile(FILE *f, const PileData &Pile, int PileIndex)
+ {
+ int ContigFrom;
+ const char *ContigLabel = GlobalToContig(Pile.From, &ContigFrom);
+ const int Length = Pile.To - Pile.From + 1;
+ const int ContigTo = ContigFrom + Length - 1;
+ const char Strand = Pile.Rev ? '-' : '+';
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+ fprintf(f, "%s\tpiler\tpile\t%d\t%d\t0\t%c\t.\tPile %d\n",
+ ContigLabel,
+ ContigFrom + 1,
+ ContigTo + 1,
+ Strand,
+ PileIndex);
+ }
+
+void WritePiles(const char *FileName, const PileData *Piles, int PileCount)
+ {
+ FILE *f = OpenStdioFile(FileName, FILEIO_MODE_WriteOnly);
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ const PileData &Pile = Piles[PileIndex];
+ WritePile(f, Pile, PileIndex);
+ }
+ fclose(f);
+ }
diff --git a/writetrs.cpp b/writetrs.cpp
new file mode 100755
index 0000000..51db18b
--- /dev/null
+++ b/writetrs.cpp
@@ -0,0 +1,44 @@
+#include "piler2.h"
+
+static void WriteTRS(FILE *f, const PileData &Pile, int PileIndex)
+ {
+ int FamIndex = Pile.FamIndex;
+ assert(FamIndex >= 0);
+
+ int SuperFamIndex = Pile.SuperFamIndex;
+ assert(SuperFamIndex >= 0);
+
+ int ContigFrom;
+ const char *ContigLabel = GlobalToContig(Pile.From, &ContigFrom);
+ const int Length = Pile.To - Pile.From + 1;
+ const int ContigTo = ContigFrom + Length - 1;
+ const char Strand = Pile.Rev ? '-' : '+';
+
+// GFF fields are:
+// <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+// 0 1 2 3 4 5 6 7 8 9
+ fprintf(f, "%s\tpiler\ttrs\t%d\t%d\t0\t%c\t.\tFamily %d.%d ; Pile %d\n",
+ ContigLabel,
+ ContigFrom + 1,
+ ContigTo + 1,
+ Strand,
+ FamIndex,
+ SuperFamIndex,
+ PileIndex);
+ }
+
+void WriteTRSFile(const char *OutputFileName, const PileData *Piles, int PileCount)
+ {
+ FILE *f = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
+
+ for (int PileIndex = 0; PileIndex < PileCount; ++PileIndex)
+ {
+ const PileData &Pile = Piles[PileIndex];
+ if (-1 == Pile.FamIndex)
+ continue;
+
+ WriteTRS(f, Pile, PileIndex);
+ }
+
+ fclose(f);
+ }
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/piler.git
More information about the debian-med-commit
mailing list