[med-svn] [libsmithwaterman] 01/02: Imported Upstream version 0.0+20151117
Andreas Tille
tille at debian.org
Wed Jun 22 11:47:11 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository libsmithwaterman.
commit 7c48bcde4e68549156c6e3151319e3856d144546
Author: Andreas Tille <tille at debian.org>
Date: Wed Jun 22 13:44:54 2016 +0200
Imported Upstream version 0.0+20151117
---
.gitignore | 3 +
BandedSmithWaterman.cpp | 670 +++++++++++++++++++++++++++++++++++++
BandedSmithWaterman.h | 111 +++++++
IndelAllele.cpp | 62 ++++
IndelAllele.h | 37 +++
LeftAlign.cpp | 853 ++++++++++++++++++++++++++++++++++++++++++++++++
LeftAlign.h | 32 ++
Makefile | 66 ++++
Mosaik.h | 73 +++++
Repeats.cpp | 69 ++++
Repeats.h | 8 +
SWMain.cpp | 126 +++++++
SmithWatermanGotoh.cpp | 741 +++++++++++++++++++++++++++++++++++++++++
SmithWatermanGotoh.h | 103 ++++++
convert.h | 22 ++
disorder.cpp | 191 +++++++++++
disorder.h | 62 ++++
examples.txt | 76 +++++
libdisorder.LICENSE | 339 +++++++++++++++++++
smithwaterman.cpp | 306 +++++++++++++++++
20 files changed, 3950 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e36f800
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.*.swp
+*.o
+smithwaterman
diff --git a/BandedSmithWaterman.cpp b/BandedSmithWaterman.cpp
new file mode 100644
index 0000000..a961f7c
--- /dev/null
+++ b/BandedSmithWaterman.cpp
@@ -0,0 +1,670 @@
+#include "BandedSmithWaterman.h"
+
+// define our static constants
+const float CBandedSmithWaterman::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
+
+const DirectionType CBandedSmithWaterman::Directions_STOP = 0;
+const DirectionType CBandedSmithWaterman::Directions_LEFT = 1;
+const DirectionType CBandedSmithWaterman::Directions_DIAGONAL = 2;
+const DirectionType CBandedSmithWaterman::Directions_UP = 3;
+
+const PositionType CBandedSmithWaterman::Position_REF_AND_QUERY_ZERO = 0;
+const PositionType CBandedSmithWaterman::Position_REF_ZERO = 1;
+const PositionType CBandedSmithWaterman::Position_QUERY_ZERO = 2;
+const PositionType CBandedSmithWaterman::Position_REF_AND_QUERO_NONZERO = 3;
+
+// constructor
+CBandedSmithWaterman::CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth)
+: mCurrentMatrixSize(0)
+, mCurrentAnchorSize(0)
+, mCurrentAQSumSize(0)
+, mBandwidth(bandWidth)
+, mPointers(NULL)
+, mMatchScore(matchScore)
+, mMismatchScore(mismatchScore)
+, mGapOpenPenalty(gapOpenPenalty)
+, mGapExtendPenalty(gapExtendPenalty)
+, mAnchorGapScores(NULL)
+, mBestScores(NULL)
+, mReversedAnchor(NULL)
+, mReversedQuery(NULL)
+, mUseHomoPolymerGapOpenPenalty(false)
+{
+ CreateScoringMatrix();
+
+ //if((bandWidth % 2) != 1) {
+ //printf("ERROR: The bandwidth must be an odd number.\n");
+ //exit(1);
+ //}
+
+ try {
+ mBestScores = new float[bandWidth + 2];
+ mAnchorGapScores = new float[bandWidth + 2];
+ } catch(bad_alloc) {
+ printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
+ exit(1);
+ }
+}
+
+// destructor
+CBandedSmithWaterman::~CBandedSmithWaterman(void) {
+ if(mPointers) delete [] mPointers;
+ if(mAnchorGapScores) delete [] mAnchorGapScores;
+ if(mBestScores) delete [] mBestScores;
+ if(mReversedAnchor) delete [] mReversedAnchor;
+ if(mReversedQuery) delete [] mReversedQuery;
+}
+
+// aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm
+void CBandedSmithWaterman::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr) {
+
+
+
+ unsigned int rowStart = min(hr.first.first, (unsigned int)hr.second.first);
+ hr.first.first -= rowStart;
+ hr.second.first -= rowStart;
+
+ //bool isLegalBandWidth = (s2.length() - hr.QueryBegin) > (mBandwidth / 2);
+ // isLegalBandWidth = isLegalBandWidth && ((s1.length() - hr.Begin) > (mBandwidth / 2));
+
+
+
+ // check the lengths of the input sequences
+ //if( (s1.length() <= 0) || (s2.length() <= 0) || (s1.length() < s2.length()) ) {
+ // printf("ERROR: An unexpected sequence length was encountered during pairwise alignment.\n");
+ // printf("Sequence lengths are listed as following:\n");
+ // printf("1. Reference length: %u\n2. Query length: %u\n", s1.length(), s2.length());
+ //printf("3. Hash region in reference:%4u-%4u\n", hr.Begin + rowStart, hr.End);
+ //printf("4. Hash region in query: %4u-%4u\n", hr.QueryBegin + rowStart, hr.QueryEnd);
+ // exit(1);
+ //}
+
+
+ // determine the hash region type
+ unsigned int rowOffset;
+ unsigned int columnOffset;
+ PositionType positionType;
+
+ if(hr.first.first == 0) {
+ if(hr.second.first == 0) {
+ rowOffset = 1;
+ columnOffset = (mBandwidth / 2) + 1;
+ positionType = Position_REF_AND_QUERY_ZERO;
+ } else {
+ rowOffset = 1 - hr.second.first;
+ columnOffset = (mBandwidth / 2) + 1 + hr.second.first;
+ positionType = Position_REF_ZERO;
+ }
+ } else {
+ if(hr.second.first == 0) {
+ rowOffset = 1;
+ columnOffset = (mBandwidth / 2) + 1 - hr.first.first;
+ positionType = Position_QUERY_ZERO;
+ } else {
+ rowOffset = 1 - hr.second.first;
+ columnOffset = (mBandwidth / 2) + 1 + hr.second.first - hr.first.first;
+ positionType = Position_REF_AND_QUERO_NONZERO;
+ }
+ }
+
+ // =========================
+ // Reinitialize the matrices
+ // =========================
+
+ ReinitializeMatrices(positionType, s1.length(), s2.length(), hr);
+
+ // =======================================
+ // Banded Smith-Waterman forward algorithm
+ // =======================================
+
+ unsigned int bestColumn = 0;
+ unsigned int bestRow = 0;
+ float bestScore = FLOAT_NEGATIVE_INFINITY;
+ float currentQueryGapScore;
+
+ // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively
+ unsigned int rowNum = hr.second.first;
+ unsigned int columnNum = hr.first.first;
+
+ // indicates how many rows including blank elements in the Banded SmithWaterman
+ int numBlankElements = (mBandwidth / 2) - columnNum;
+
+ //cout << numBlankElements << endl;
+ // upper triangle matrix in Banded Smith-Waterman
+ for( ; numBlankElements > 0; numBlankElements--, rowNum++){
+ // in the upper triangle matrix, we always start at the 0th column
+ columnNum = 0;
+
+ // columnEnd indicates how many columns which should be dealt with in the current row
+ unsigned int columnEnd = min((mBandwidth - numBlankElements), ((unsigned int) s1.length() - columnNum + 1) );
+ currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
+ for( unsigned int j = 0; j < columnEnd; j++){
+ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
+ //cout << s1[columnNum] << s2[rowNum] << score << endl;
+ UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
+ columnNum++;
+ }
+
+ // replace the columnNum to the middle column in the Smith-Waterman matrix
+ columnNum = columnNum - (mBandwidth / 2);
+ }
+ // complete matrix in Banded Smith-Waterman
+ unsigned int completeNum = min((s1.length() - columnNum - (mBandwidth / 2)), (s2.length() - rowNum));
+ //cout << completeNum << endl;
+ for(unsigned int i = 0; i < completeNum; i++, rowNum++){
+ columnNum = columnNum - (mBandwidth / 2);
+
+ // there are mBandwidth columns which should be dealt with in each row
+ currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
+
+ for(unsigned int j = 0; j < mBandwidth; j++){
+ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
+ UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
+ //cout << s1[columnNum] << s2[rowNum] << score << endl;
+ columnNum++;
+ }
+
+ // replace the columnNum to the middle column in the Smith-Waterman matrix
+ // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1).
+ columnNum = columnNum - (mBandwidth / 2);
+ }
+
+ // lower triangle matrix
+ numBlankElements = min(mBandwidth, ((unsigned int) s2.length() - rowNum));
+ columnNum = columnNum - (mBandwidth / 2);
+ for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) {
+
+ mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;;
+ // columnEnd indicates how many columns which should be dealt with
+ currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
+
+ for( unsigned int j = columnNum; j < s1.length(); j++){
+ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
+ UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
+ //cout << s1[columnNum] << s2[rowNum] << score << endl;
+ columnNum++;
+ }
+
+ // replace the columnNum to the middle column in the Smith-Waterman matrix
+ columnNum = columnNum - mBandwidth + i + 2;
+ }
+
+ // =========================================
+ // Banded Smith-Waterman backtrace algorithm
+ // =========================================
+
+ Traceback(referenceAl, cigarAl, s1, s2, bestRow, bestColumn, rowOffset, columnOffset);
+
+}
+
+// calculates the score during the forward algorithm
+float CBandedSmithWaterman::CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) {
+
+ // initialize
+ const unsigned int row = rowNum + rowOffset;
+ const unsigned int column = columnOffset - rowNum + columnNum;
+ const unsigned int position = row * (mBandwidth + 2) + column;
+
+ // retrieve the similarity scores
+ const float similarityScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A'];
+ const float totalSimilarityScore = mBestScores[column] + similarityScore;
+
+ // ================================
+ // open a gap in the query sequence
+ // ================================
+
+ float queryGapExtendScore = currentQueryGapScore - mGapExtendPenalty;
+ float queryGapOpenScore = mBestScores[column - 1] - mGapOpenPenalty;
+
+ // compute the homo-polymer gap score if enabled
+ if(mUseHomoPolymerGapOpenPenalty)
+ if((rowNum > 1) && (s2[rowNum] == s2[rowNum - 1]))
+ queryGapOpenScore = mBestScores[column - 1] - mHomoPolymerGapOpenPenalty;
+
+ if(queryGapExtendScore > queryGapOpenScore) {
+ currentQueryGapScore = queryGapExtendScore;
+ mPointers[position].mSizeOfHorizontalGaps = mPointers[position - 1].mSizeOfHorizontalGaps + 1;
+ } else currentQueryGapScore = queryGapOpenScore;
+
+
+ // ====================================
+ // open a gap in the reference sequence
+ // ====================================
+
+
+ float anchorGapExtendScore = mAnchorGapScores[column + 1] - mGapExtendPenalty;
+ float anchorGapOpenScore = mBestScores[column + 1] - mGapOpenPenalty;
+
+ // compute the homo-polymer gap score if enabled
+ if(mUseHomoPolymerGapOpenPenalty)
+ if((columnNum > 1) && (s1[columnNum] == s1[columnNum - 1]))
+ anchorGapOpenScore = mBestScores[column + 1] - mHomoPolymerGapOpenPenalty;
+
+ if(anchorGapExtendScore > anchorGapOpenScore) {
+ mAnchorGapScores[column] = anchorGapExtendScore;
+ mPointers[position].mSizeOfVerticalGaps = mPointers[position - mBandwidth - 1].mSizeOfVerticalGaps + 1;
+ } else mAnchorGapScores[column] = anchorGapOpenScore;
+
+ // ======================================
+ // calculate the best score and direction
+ // ======================================
+
+ //mBestScores[column] = MaxFloats(totalSimilarityScore, mAnchorGapScores[column], currentQueryGapScore);
+ mBestScores[column] = MaxFloats(totalSimilarityScore, currentQueryGapScore, mAnchorGapScores[column]);
+
+ // determine the traceback direction
+ // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
+ if(mBestScores[column] == 0) mPointers[position].Direction = Directions_STOP;
+ else if(mBestScores[column] == totalSimilarityScore) mPointers[position].Direction = Directions_UP;
+ else if(mBestScores[column] == currentQueryGapScore) mPointers[position].Direction = Directions_LEFT;
+ else mPointers[position].Direction = Directions_DIAGONAL;
+
+ return mBestScores[column];
+}
+
+// corrects the homopolymer gap order for forward alignments
+void CBandedSmithWaterman::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
+
+
+ // this is only required for alignments with mismatches
+ //if(al.NumMismatches == 0) return;
+ if ( numMismatches == 0 ) return;
+
+ // localize the alignment data
+ //char* pReference = al.Reference.Data();
+ //char* pQuery = al.Query.Data();
+ //const unsigned int numBases = al.Reference.Length();
+ char* pReference = mReversedAnchor;
+ char* pQuery = mReversedQuery;
+
+ // initialize
+ bool hasReferenceGap = false, hasQueryGap = false;
+ char* pNonGapSeq = NULL;
+ char* pGapSeq = NULL;
+ char nonGapBase = 'J';
+
+ // identify gapped regions
+ for(unsigned int i = 0; i < numBases; i++) {
+
+ // check if the current position is gapped
+ hasReferenceGap = false;
+ hasQueryGap = false;
+
+ if(pReference[i] == GAP) {
+ hasReferenceGap = true;
+ pNonGapSeq = pQuery;
+ pGapSeq = pReference;
+ nonGapBase = pQuery[i];
+ }
+
+ if(pQuery[i] == GAP) {
+ hasQueryGap = true;
+ pNonGapSeq = pReference;
+ pGapSeq = pQuery;
+ nonGapBase = pReference[i];
+ }
+
+ // continue if we don't have any gaps
+ if(!hasReferenceGap && !hasQueryGap) continue;
+
+ // sanity check
+ if(hasReferenceGap && hasQueryGap) {
+ printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
+ exit(1);
+ }
+
+ // find the non-gapped length (forward)
+ unsigned short numGappedBases = 0;
+ unsigned short nonGapLength = 0;
+ unsigned short testPos = i;
+ while(testPos < numBases) {
+
+ const char gs = pGapSeq[testPos];
+ const char ngs = pNonGapSeq[testPos];
+
+ bool isPartofHomopolymer = false;
+ if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
+ if(!isPartofHomopolymer) break;
+
+ if(gs == GAP) numGappedBases++;
+ else nonGapLength++;
+ testPos++;
+ }
+
+ // fix the gap order
+ if(numGappedBases != 0) {
+ char* pCurrentSequence = pGapSeq + i;
+ memset(pCurrentSequence, nonGapBase, nonGapLength);
+ pCurrentSequence += nonGapLength;
+ memset(pCurrentSequence, GAP, numGappedBases);
+ }
+
+ // increment
+ i += numGappedBases + nonGapLength - 1;
+ }
+}
+
+// creates a simple scoring matrix to align the nucleotides and the ambiguity code N
+void CBandedSmithWaterman::CreateScoringMatrix(void) {
+
+ unsigned int nIndex = 13;
+ unsigned int xIndex = 23;
+
+ // define the N score to be 1/4 of the span between mismatch and match
+ //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
+
+ // calculate the scoring matrix
+ for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
+ for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
+
+ // N.B. matching N to everything (while conceptually correct) leads to some
+ // bad alignments, lets make N be a mismatch instead.
+
+ // add the matches or mismatches to the hashtable (N is a mismatch)
+ if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
+ else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
+ else if(i == j) mScoringMatrix[i][j] = mMatchScore;
+ else mScoringMatrix[i][j] = mMismatchScore;
+ }
+ }
+
+ // add ambiguity codes
+ mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
+ mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
+ // add ambiguity codes
+ mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
+ mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
+ mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
+ mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
+
+ mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
+ mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
+ mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
+ mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
+
+ mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
+ mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
+ mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
+ mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
+
+ mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
+ mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
+ mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
+ mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
+
+ mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
+ mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
+ mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
+ mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
+
+ mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
+ mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
+ mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
+ mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
+
+ mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
+ mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
+ mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
+ mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
+ mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
+ mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
+
+ mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
+ mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
+ mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
+ mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
+ mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
+ mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
+
+ mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
+ mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
+ mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
+ mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
+ mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
+ mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
+
+ mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
+ mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
+ mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
+ mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
+ mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
+ mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
+}
+
+// enables homo-polymer scoring
+void CBandedSmithWaterman::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
+ mUseHomoPolymerGapOpenPenalty = true;
+ mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
+}
+
+// reinitializes the matrices
+void CBandedSmithWaterman::ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr) {
+
+/*
+ try {
+ mBestScores = new float[mBandwidth + 2];
+ mAnchorGapScores = new float[mBandwidth + 2];
+ } catch(bad_alloc) {
+ printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
+ exit(1);
+ }
+*/
+
+ const unsigned int numColumns = mBandwidth + 2;
+ unsigned int numRows = 0;
+
+ switch(positionType) {
+ case Position_REF_AND_QUERY_ZERO:
+ numRows = s2Length + 1;
+ break;
+ case Position_REF_ZERO:
+ numRows = s2Length - hr.second.first + 2;
+ break;
+ case Position_QUERY_ZERO:
+ numRows = min(s2Length + 1, s1Length - hr.first.first + 2);
+ break;
+ case Position_REF_AND_QUERO_NONZERO:
+ numRows = min(s1Length - hr.first.first + 2, s2Length - hr.second.first + 2);
+ break;
+ }
+
+ // update the size of the backtrace matrix
+ if((numColumns * numRows) > mCurrentMatrixSize) {
+
+ mCurrentMatrixSize = numColumns * numRows;
+ if(mPointers) delete [] mPointers;
+
+ try {
+ mPointers = new ElementInfo[mCurrentMatrixSize];
+ } catch(bad_alloc) {
+ printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
+ exit(1);
+ }
+ }
+
+ // initialize our backtrace matrix
+ ElementInfo defaultElement;
+ defaultElement.Direction = Directions_STOP;
+ defaultElement.mSizeOfHorizontalGaps = 1;
+ defaultElement.mSizeOfVerticalGaps = 1;
+
+ uninitialized_fill(mPointers, mPointers + mCurrentMatrixSize, defaultElement);
+
+ // update the sequence character arrays
+ if((s1Length + s2Length) > mCurrentAQSumSize) {
+
+ mCurrentAQSumSize = s1Length + s2Length;
+ if(mReversedAnchor) delete [] mReversedAnchor;
+ if(mReversedQuery) delete [] mReversedQuery;
+
+ try {
+ mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
+ mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
+ } catch(bad_alloc) {
+ printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
+ exit(1);
+ }
+ }
+
+ // initialize the gap score and score vectors
+ uninitialized_fill(mAnchorGapScores, mAnchorGapScores + mBandwidth + 2, FLOAT_NEGATIVE_INFINITY);
+ memset((char*)mBestScores, 0, SIZEOF_FLOAT * (mBandwidth + 2));
+ mBestScores[0] = FLOAT_NEGATIVE_INFINITY;
+ mBestScores[mBandwidth + 1] = FLOAT_NEGATIVE_INFINITY;
+}
+
+// performs the backtrace algorithm
+void CBandedSmithWaterman::Traceback(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset){
+
+
+ unsigned int currentRow = bestRow;
+ unsigned int currentColumn = bestColumn;
+ unsigned int currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
+
+
+ // record the numbers of row and column before the current row and column
+ unsigned int previousRow = bestRow;
+ unsigned int previousColumn = bestColumn;
+
+ unsigned int gappedAnchorLen = 0;
+ unsigned int gappedQueryLen = 0;
+ unsigned int numMismatches = 0;
+
+ bool keepProcessing = true;
+ while(keepProcessing) {
+ unsigned int nVerticalGap = 0;
+ unsigned int nHorizontalGap = 0;
+ switch(mPointers[currentPosition].Direction){
+ case Directions_DIAGONAL:
+ nVerticalGap = mPointers[currentPosition].mSizeOfVerticalGaps;
+ for(unsigned int i = 0; i < nVerticalGap; i++){
+ mReversedAnchor[gappedAnchorLen++] = GAP;
+ mReversedQuery[gappedQueryLen++] = s2[currentRow];
+
+ numMismatches++;
+
+ previousRow = currentRow;
+ previousColumn = currentColumn;
+
+ currentRow--;
+ }
+ break;
+
+ case Directions_STOP:
+ keepProcessing = false;
+ //mReversedAnchor[gappedAnchorLen+1]='\0';
+ //mReversedQuery [gappedQueryLen+1]='\0';
+ break;
+
+ case Directions_UP:
+
+ mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
+ mReversedQuery[gappedQueryLen++] = s2[currentRow];
+
+ if(s1[currentColumn] != s2[currentRow]) numMismatches++;
+ previousRow = currentRow;
+ previousColumn = currentColumn;
+
+ currentRow--;
+ currentColumn--;
+ break;
+
+ case Directions_LEFT:
+ nHorizontalGap = mPointers[currentPosition].mSizeOfHorizontalGaps;
+ for(unsigned int i = 0; i < nHorizontalGap; i++){
+
+ mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
+ mReversedQuery[gappedQueryLen++] = GAP;
+
+ numMismatches++;
+
+ previousRow = currentRow;
+ previousColumn = currentColumn;
+
+
+ currentColumn--;
+ }
+ break;
+ }
+ currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
+ }
+
+ // correct the reference and query sequence order
+ mReversedAnchor[gappedAnchorLen] = 0;
+ mReversedQuery [gappedQueryLen] = 0;
+ reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
+ reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
+
+ //alignment.Reference = mReversedAnchor;
+ //alignment.Query = mReversedQuery;
+
+ // assign the alignment endpoints
+ //alignment.ReferenceBegin = previousColumn;
+ //alignment.ReferenceEnd = bestColumn;
+ referenceAl = previousColumn;
+ /*
+ if(alignment.IsReverseComplement){
+ alignment.QueryBegin = s2.length() - bestRow - 1;
+ alignment.QueryEnd = s2.length() - previousRow - 1;
+ } else {
+ alignment.QueryBegin = previousRow;
+ alignment.QueryEnd = bestRow;
+ }
+ */
+
+ //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
+ //alignment.NumMismatches = numMismatches;
+
+ const unsigned int alLength = strlen(mReversedAnchor);
+ unsigned int m = 0, d = 0, i = 0;
+ bool dashRegion = false;
+ ostringstream oCigar;
+
+ if ( previousRow != 0 )
+ oCigar << previousRow << 'S';
+
+ for ( unsigned int j = 0; j < alLength; j++ ) {
+ // m
+ if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
+ if ( dashRegion ) {
+ if ( d != 0 ) oCigar << d << 'D';
+ else oCigar << i << 'I';
+ }
+ dashRegion = false;
+ m++;
+ d = 0;
+ i = 0;
+ }
+ // I or D
+ else {
+ if ( !dashRegion )
+ oCigar << m << 'M';
+ dashRegion = true;
+ m = 0;
+ if ( mReversedAnchor[j] == GAP ) {
+ if ( d != 0 ) oCigar << d << 'D';
+ i++;
+ d = 0;
+ }
+ else {
+ if ( i != 0 ) oCigar << i << 'I';
+ d++;
+ i = 0;
+ }
+ }
+ }
+
+ if ( m != 0 ) oCigar << m << 'M';
+ else if ( d != 0 ) oCigar << d << 'D';
+ else if ( i != 0 ) oCigar << i << 'I';
+
+ if ( ( bestRow + 1 ) != s2.length() )
+ oCigar << s2.length() - bestRow - 1 << 'S';
+
+ cigarAl = oCigar.str();
+
+
+ // correct the homopolymer gap order
+ CorrectHomopolymerGapOrder(alLength, numMismatches);
+
+}
diff --git a/BandedSmithWaterman.h b/BandedSmithWaterman.h
new file mode 100644
index 0000000..ca78ac2
--- /dev/null
+++ b/BandedSmithWaterman.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <iostream>
+#include <algorithm>
+#include <memory>
+//#include "Alignment.h"
+#include "Mosaik.h"
+//#include "HashRegion.h"
+#include <string.h>
+#include <stdio.h>
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+#define MOSAIK_NUM_NUCLEOTIDES 26
+#define GAP '-'
+
+typedef unsigned char DirectionType;
+typedef unsigned char PositionType;
+
+struct ElementInfo {
+ unsigned int Direction : 2;
+ unsigned int mSizeOfVerticalGaps : 15;
+ unsigned int mSizeOfHorizontalGaps : 15;
+};
+
+class CBandedSmithWaterman {
+public:
+ // constructor
+ CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth);
+ // destructor
+ ~CBandedSmithWaterman(void);
+ // aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm
+ void Align(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr);
+ // enables homo-polymer scoring
+ void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
+private:
+ // calculates the score during the forward algorithm
+ float CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset);
+ // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
+ void CreateScoringMatrix(void);
+ // corrects the homopolymer gap order for forward alignments
+ void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
+ // returns the maximum floating point number
+ static inline float MaxFloats(const float& a, const float& b, const float& c);
+ // reinitializes the matrices
+ void ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr);
+ // performs the backtrace algorithm
+ void Traceback(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset);
+ // updates the best score during the forward algorithm
+ inline void UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score);
+ // our simple scoring matrix
+ float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
+ // keep track of maximum initialized sizes
+ unsigned int mCurrentMatrixSize;
+ unsigned int mCurrentAnchorSize;
+ unsigned int mCurrentAQSumSize;
+ unsigned int mBandwidth;
+ // define our backtrace directions
+ const static DirectionType Directions_STOP;
+ const static DirectionType Directions_LEFT;
+ const static DirectionType Directions_DIAGONAL;
+ const static DirectionType Directions_UP;
+ // store the backtrace pointers
+ ElementInfo* mPointers;
+ // define our position types
+ const static PositionType Position_REF_AND_QUERY_ZERO;
+ const static PositionType Position_REF_ZERO;
+ const static PositionType Position_QUERY_ZERO;
+ const static PositionType Position_REF_AND_QUERO_NONZERO;
+ // define scoring constants
+ const float mMatchScore;
+ const float mMismatchScore;
+ const float mGapOpenPenalty;
+ const float mGapExtendPenalty;
+ // score if xi aligns to a gap after yi
+ float* mAnchorGapScores;
+ // best score of alignment x1...xi to y1...yi
+ float* mBestScores;
+ // our reversed alignment
+ char* mReversedAnchor;
+ char* mReversedQuery;
+ // define static constants
+ static const float FLOAT_NEGATIVE_INFINITY;
+ // toggles the use of the homo-polymer gap open penalty
+ bool mUseHomoPolymerGapOpenPenalty;
+ float mHomoPolymerGapOpenPenalty;
+};
+
+// returns the maximum floating point number
+inline float CBandedSmithWaterman::MaxFloats(const float& a, const float& b, const float& c) {
+ float max = 0.0f;
+ if(a > max) max = a;
+ if(b > max) max = b;
+ if(c > max) max = c;
+ return max;
+}
+
+// updates the best score during the forward algorithm
+inline void CBandedSmithWaterman::UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score) {
+
+ //const unsigned int row = rowNum + rowOffset;
+ //const unsigned int column = columnOffset - rowNum + columnNum;
+
+ if(score > bestScore) {
+ bestRow = rowNum;
+ bestColumn = columnNum;
+ bestScore = score;
+ }
+}
diff --git a/IndelAllele.cpp b/IndelAllele.cpp
new file mode 100644
index 0000000..80b5fac
--- /dev/null
+++ b/IndelAllele.cpp
@@ -0,0 +1,62 @@
+#include "IndelAllele.h"
+
+using namespace std;
+
+
+bool IndelAllele::homopolymer(void) {
+ string::iterator s = sequence.begin();
+ char c = *s++;
+ while (s != sequence.end()) {
+ if (c != *s++) return false;
+ }
+ return true;
+}
+
+int IndelAllele::readLength(void) {
+ if (insertion) {
+ return length;
+ } else {
+ return 0;
+ }
+}
+
+int IndelAllele::referenceLength(void) {
+ if (insertion) {
+ return 0;
+ } else {
+ return length;
+ }
+}
+
+bool homopolymer(string sequence) {
+ string::iterator s = sequence.begin();
+ char c = *s++;
+ while (s != sequence.end()) {
+ if (c != *s++) return false;
+ }
+ return true;
+}
+
+ostream& operator<<(ostream& out, const IndelAllele& indel) {
+ string t = indel.insertion ? "i" : "d";
+ out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.length << ":" << indel.sequence;
+ return out;
+}
+
+bool operator==(const IndelAllele& a, const IndelAllele& b) {
+ return (a.insertion == b.insertion
+ && a.length == b.length
+ && a.position == b.position
+ && a.sequence == b.sequence);
+}
+
+bool operator!=(const IndelAllele& a, const IndelAllele& b) {
+ return !(a==b);
+}
+
+bool operator<(const IndelAllele& a, const IndelAllele& b) {
+ ostringstream as, bs;
+ as << a;
+ bs << b;
+ return as.str() < bs.str();
+}
diff --git a/IndelAllele.h b/IndelAllele.h
new file mode 100644
index 0000000..247c01f
--- /dev/null
+++ b/IndelAllele.h
@@ -0,0 +1,37 @@
+#ifndef __INDEL_ALLELE_H
+#define __INDEL_ALLELE_H
+
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+
+class IndelAllele {
+ friend ostream& operator<<(ostream&, const IndelAllele&);
+ friend bool operator==(const IndelAllele&, const IndelAllele&);
+ friend bool operator!=(const IndelAllele&, const IndelAllele&);
+ friend bool operator<(const IndelAllele&, const IndelAllele&);
+public:
+ bool insertion;
+ int length;
+ int referenceLength(void);
+ int readLength(void);
+ int position;
+ int readPosition;
+ string sequence;
+
+ bool homopolymer(void);
+
+ IndelAllele(bool i, int l, int p, int rp, string s)
+ : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
+ { }
+};
+
+bool homopolymer(string sequence);
+ostream& operator<<(ostream& out, const IndelAllele& indel);
+bool operator==(const IndelAllele& a, const IndelAllele& b);
+bool operator!=(const IndelAllele& a, const IndelAllele& b);
+bool operator<(const IndelAllele& a, const IndelAllele& b);
+
+#endif
diff --git a/LeftAlign.cpp b/LeftAlign.cpp
new file mode 100644
index 0000000..3e526aa
--- /dev/null
+++ b/LeftAlign.cpp
@@ -0,0 +1,853 @@
+#include "LeftAlign.h"
+
+//bool debug;
+#define VERBOSE_DEBUG
+
+// Attempts to left-realign all the indels represented by the alignment cigar.
+//
+// This is done by shifting all indels as far left as they can go without
+// mismatch, then merging neighboring indels of the same class. leftAlign
+// updates the alignment cigar with changes, and returns true if realignment
+// changed the alignment cigar.
+//
+// To left-align, we move multi-base indels left by their own length as long as
+// the preceding bases match the inserted or deleted sequence. After this
+// step, we handle multi-base homopolymer indels by shifting them one base to
+// the left until they mismatch the reference.
+//
+// To merge neighboring indels, we iterate through the set of left-stabilized
+// indels. For each indel we add a new cigar element to the new cigar. If a
+// deletion follows a deletion, or an insertion occurs at the same place as
+// another insertion, we merge the events by extending the previous cigar
+// element.
+//
+// In practice, we must call this function until the alignment is stabilized.
+//
+bool leftAlign(string& querySequence, string& cigar, string& baseReferenceSequence, int& offset, bool debug) {
+
+ debug = false;
+
+ string referenceSequence = baseReferenceSequence.substr(offset);
+
+ int arsOffset = 0; // pointer to insertion point in aligned reference sequence
+ string alignedReferenceSequence, alignedQuerySequence;
+ if (debug) alignedReferenceSequence = referenceSequence;
+ if (debug) alignedQuerySequence = querySequence;
+ int aabOffset = 0;
+
+ // store information about the indels
+ vector<IndelAllele> indels;
+
+ int rp = 0; // read position, 0-based relative to read
+ int sp = 0; // sequence position
+
+ string softBegin;
+ string softEnd;
+
+ string cigarbefore = cigar;
+
+ vector<pair<int, string> > cigarData = splitCIGAR(cigar);
+ for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
+ c != cigarData.end(); ++c) {
+ unsigned int l = c->first;
+ string t = c->second;
+ if (debug) cerr << l << t << " " << sp << " " << rp << endl;
+ if (t == "M") { // match or mismatch
+ sp += l;
+ rp += l;
+ } else if (t == "D") { // deletion
+ indels.push_back(IndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
+ if (debug) { cerr << indels.back() << endl; alignedQuerySequence.insert(rp + aabOffset, string(l, '-')); }
+ aabOffset += l;
+ sp += l; // update reference sequence position
+ } else if (t == "I") { // insertion
+ indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
+ if (debug) { cerr << indels.back() << endl; alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-')); }
+ arsOffset += l;
+ rp += l;
+ } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
+ // remove these bases from the refseq and read seq, but don't modify the alignment sequence
+ if (rp == 0) {
+ alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
+ //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
+ softBegin = querySequence.substr(0, l);
+ } else {
+ alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
+ //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
+ softEnd = querySequence.substr(querySequence.size() - l, l);
+ }
+ rp += l;
+ } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
+ } else if (t == "N") { // skipped region in the reference not present in read, aka splice
+ sp += l;
+ }
+ }
+
+
+ if (debug) cerr << "| " << cigarbefore << endl
+ << "| " << alignedReferenceSequence << endl
+ << "| " << alignedQuerySequence << endl;
+
+ // if no indels, return the alignment
+ if (indels.empty()) { return false; }
+
+ if (debug) {
+ for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
+ cerr << endl;
+ }
+
+ // for each indel, from left to right
+ // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
+ // move the indel left
+
+ vector<IndelAllele>::iterator previous = indels.begin();
+ for (vector<IndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
+
+ // left shift by repeats
+ //
+ // from 1 base to the length of the indel, attempt to shift left
+ // if the move would cause no change in alignment optimality (no
+ // introduction of mismatches, and by definition no change in gap
+ // length), move to the new position.
+ // in practice this moves the indel left when we reach the size of
+ // the repeat unit.
+ //
+ int steppos, readsteppos;
+ IndelAllele& indel = *id;
+ int i = 1;
+
+ while (i <= indel.length) {
+
+ int steppos = indel.position - i;
+ int readsteppos = indel.readPosition - i;
+
+ if (debug) {
+ if (steppos >= 0 && readsteppos >= 0) {
+ cerr << "refseq flank " << referenceSequence.substr(steppos, indel.length) << endl;
+ cerr << "qryseq flank " << querySequence.substr(readsteppos, indel.length) << endl;
+ cerr << "indelseq " << indel.sequence << endl;
+ }
+ }
+
+ while (steppos >= 0 && readsteppos >= 0
+ && indel.sequence == referenceSequence.substr(steppos, indel.length)
+ && indel.sequence == querySequence.substr(readsteppos, indel.length)
+ && (id == indels.begin()
+ || (previous->insertion && steppos >= previous->position)
+ || (!previous->insertion && steppos >= previous->position + previous->length))) {
+ LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
+ indel.position -= i;
+ indel.readPosition -= i;
+ steppos = indel.position - i;
+ readsteppos = indel.readPosition - i;
+ }
+ do {
+ ++i;
+ } while (i <= indel.length && indel.length % i != 0);
+ }
+
+
+
+ // left shift indels with exchangeable flanking sequence
+ //
+ // for example:
+ //
+ // GTTACGTT GTTACGTT
+ // GT-----T ----> G-----TT
+ //
+ // GTGTGACGTGT GTGTGACGTGT
+ // GTGTG-----T ----> GTG-----TGT
+ //
+ // GTGTG-----T GTG-----TGT
+ // GTGTGACGTGT ----> GTGTGACGTGT
+ //
+ //
+
+ steppos = indel.position - 1;
+ readsteppos = indel.readPosition - 1;
+ while (steppos >= 0 && readsteppos >= 0
+ && querySequence.at(readsteppos) == referenceSequence.at(steppos)
+ && referenceSequence.size() > steppos + indel.length
+ && indel.sequence.at((int) indel.sequence.size() - 1) == referenceSequence.at(steppos + indel.length) // are the exchanged bases going to match wrt. the reference?
+ && querySequence.at(readsteppos) == indel.sequence.at((int) indel.sequence.size() - 1)
+ && (id == indels.begin()
+ || (previous->insertion && indel.position - 1 >= previous->position)
+ || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
+ if (debug) cerr << (indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl;
+ indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
+ indel.position -= 1;
+ indel.readPosition -= 1;
+ if (debug) cerr << indel << endl;
+ steppos = indel.position - 1;
+ readsteppos = indel.readPosition - 1;
+ //if (debug && steppos && readsteppos) cerr << querySequence.at(readsteppos) << " ==? " << referenceSequence.at(steppos) << endl;
+ //if (debug && steppos && readsteppos) cerr << indel.sequence.at((int) indel.sequence.size() - 1) << " ==? " << referenceSequence.at(steppos + indel.length) << endl;
+ }
+ // tracks previous indel, so we don't run into it with the next shift
+ previous = id;
+ }
+
+ if (debug) {
+ for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
+ cerr << endl;
+ }
+
+ if (debug) cerr << "bring together floating indels" << endl;
+
+ // bring together floating indels
+ // from left to right
+ // check if we could merge with the next indel
+ // if so, adjust so that we will merge in the next step
+ if (indels.size() > 1) {
+ previous = indels.begin();
+ for (vector<IndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
+ IndelAllele& indel = *id;
+ // parsimony: could we shift right and merge with the previous indel?
+ // if so, do it
+ int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
+ int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
+ if (previous->insertion == indel.insertion
+ && ((previous->insertion
+ && (previous->position < indel.position
+ && previous->readPosition < indel.readPosition))
+ ||
+ (!previous->insertion
+ && (previous->position + previous->length < indel.position)
+ && (previous->readPosition < indel.readPosition)
+ ))) {
+ if (previous->homopolymer()) {
+ string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
+ string readseq = querySequence.substr(prev_end_read, indel.position - prev_end_ref);
+ if (debug) cerr << "seq: " << seq << endl << "readseq: " << readseq << endl;
+ if (previous->sequence.at(0) == seq.at(0)
+ && homopolymer(seq)
+ && homopolymer(readseq)) {
+ if (debug) cerr << "moving " << *previous << " right to "
+ << (indel.insertion ? indel.position : indel.position - previous->length) << endl;
+ previous->position = indel.insertion ? indel.position : indel.position - previous->length;
+ previous->readPosition = !indel.insertion ? indel.readPosition : indel.readPosition - previous->readLength(); // should this be readLength?
+ }
+ }
+ /*
+ else {
+ int pos = previous->position;
+ int readpos = previous->readPosition;
+ while (pos < (int) referenceSequence.length() &&
+ ((previous->insertion && pos + previous->length <= indel.position)
+ ||
+ (!previous->insertion && pos + previous->length < indel.position))
+ && previous->sequence == referenceSequence.substr(pos + previous->length, previous->length)
+ && previous->sequence == querySequence.substr(readpos + previous->length, previous->length)
+ ) {
+ pos += previous->length;
+ readpos += previous->length;
+ }
+ string seq = previous->sequence;
+ if (pos > previous->position) {
+ // wobble bases right to left as far as we can go
+ int steppos = previous->position + seq.size();
+ int readsteppos = previous->readPosition + seq.size();
+
+ while (querySequence.at(readsteppos) == referenceSequence.at(steppos)
+ && querySequence.at(readsteppos) == seq.at(0)
+ && (id == indels.begin()
+ || (indel.insertion && pos + seq.size() - 1 <= indel.position)
+ || (!previous->insertion && indel.position - 1 >= pos + previous->length))) {
+ seq = seq.substr(1) + seq.at(0);
+ ++pos;
+ ++readpos;
+ steppos = pos + 1;
+ readsteppos = readpos + 1;
+ }
+
+ if (((previous->insertion && pos + previous->length == indel.position)
+ ||
+ (!previous->insertion && pos == indel.position - previous->length))
+ ) {
+ if (debug) cerr << "right-merging tandem repeat: moving " << *previous << " right to " << pos << endl;
+ previous->position = pos;
+ previous->readPosition = readpos;
+ previous->sequence = seq;
+ }
+ }
+ }
+ */
+ }
+ previous = id;
+ }
+ }
+
+ if (debug) {
+ for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
+ cerr << endl;
+ }
+
+
+ if (debug) cerr << "bring in indels at ends of read" << endl;
+
+ // try to "bring in" repeat indels at the end, for maximum parsimony
+ //
+ // e.g.
+ //
+ // AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
+ // AGAAAG------AAAGAAAAAGAACCAAGAAGAAAA
+ //
+ // has no information which distinguishes it from:
+ //
+ // AGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
+ // AGAAAG--AAAGAAAAAGAACCAAGAAGAAAA
+ //
+ // here we take the parsimonious explanation
+
+ if (!indels.empty()) {
+ // deal with the first indel
+ // the first deletion ... or the biggest deletion
+ vector<IndelAllele>::iterator a = indels.begin();
+ vector<IndelAllele>::iterator del = indels.begin();
+ for (; a != indels.end(); ++a) {
+ //if (!a->insertion && a->length > biggestDel->length) biggestDel = a;
+ if (!a->insertion && a->length) del = a;
+ if (!del->insertion) {
+ //if (!indel.insertion) { // only handle deletions like this for now
+ //if (!indel.insertion && !(indels.size() > 1 && indel.readPosition == indels.at(1).readPosition)) { // only handle deletions like this for now
+ int insertedBpBefore = 0;
+ int deletedBpBefore = 0;
+ for (vector<IndelAllele>::iterator i = indels.begin(); i != del; ++i) {
+ if (i->insertion) insertedBpBefore += i->length;
+ else deletedBpBefore += i->length;
+ }
+ IndelAllele& indel = *del;
+ int minsize = indel.length;
+ int flankingLength = indel.readPosition;
+ if (debug) cerr << indel << endl;
+ string flanking = querySequence.substr(0, flankingLength);
+ if (debug) cerr << flanking << endl;
+
+ size_t p = referenceSequence.substr(0, indel.position + indel.length).rfind(flanking);
+ if (p == string::npos) {
+ if (debug) cerr << "flanking not found" << endl;
+ } else {
+ if (debug) {
+ cerr << "flanking is at " << p << endl;
+ cerr << "minsize would be " << (indel.position + indel.length) - ((int) p + flankingLength) << endl;
+ }
+ minsize = (indel.position + indel.length) - ((int) p + flankingLength);
+ }
+
+ if (debug) cerr << minsize << endl;
+
+ if (minsize >= 0 && minsize < indel.length) {
+
+ int softdiff = softBegin.size();
+ if (!softBegin.empty()) { // remove soft clips if we can
+ if (flankingLength < softBegin.size()) {
+ softBegin = softBegin.substr(0, flankingLength - softBegin.size());
+ softdiff -= softBegin.size();
+ } else {
+ softBegin.clear();
+ }
+ }
+
+ // the new read position == the current read position
+ // the new reference position == the flanking length size
+ // the positional offset of the reference sequence == the new position of the deletion - the flanking length
+
+ int diff = indel.length - minsize - softdiff + deletedBpBefore - insertedBpBefore;
+ //int querydiff = indel.length - minsize - softBegin.size() - insertedBpBefore + deletedBpBefore;
+ if (debug) cerr << "adjusting " << indel.length <<" " << minsize << " " << softdiff << " " << diff << endl;
+ offset += diff;
+ ///
+ indel.length = minsize;
+ indel.sequence = indel.sequence.substr(indel.sequence.size() - minsize);
+ indel.position = flankingLength;
+ indel.readPosition = indel.position; // if we have removed all the sequence before, this should be ==
+ referenceSequence = referenceSequence.substr(diff);
+
+ for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
+ if (i < del) {
+ i->length = 0; // remove
+ } else if (i > del) {
+ i->position -= diff;
+ }
+ }
+ }
+ if (debug) cerr << indel << endl;
+
+ // now, do the same for the reverse
+ if (indel.length > 0) {
+ int minsize = indel.length + 1;
+ int flankingLength = querySequence.size() - indel.readPosition + indel.readLength();
+ string flanking = querySequence.substr(indel.readPosition + indel.readLength(), flankingLength);
+ int indelRefEnd = indel.position + indel.referenceLength();
+
+ size_t p = referenceSequence.find(flanking, indel.position);
+ if (p == string::npos) {
+ if (debug)
+ cerr << "flanking not found" << endl;
+ } else {
+ if (debug) {
+ cerr << "flanking is at " << p << endl;
+ cerr << "minsize would be " << (int) p - indel.position << endl;
+ }
+ minsize = (int) p - indel.position;
+ }
+
+ if (debug) cerr << "minsize " << minsize << endl;
+ if (minsize >= 0 && minsize <= indel.length) {
+ //referenceSequence = referenceSequence.substr(0, referenceSequence.size() - (indel.length - minsize));
+ if (debug) cerr << "adjusting " << indel << endl;
+ indel.length = minsize;
+ indel.sequence = indel.sequence.substr(0, minsize);
+ //cerr << indel << endl;
+ if (!softEnd.empty()) { // remove soft clips if we can
+ if (flankingLength < softEnd.size()) {
+ softEnd = softEnd.substr(flankingLength - softEnd.size());
+ } else {
+ softEnd.clear();
+ }
+ }
+ for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
+ if (i > del) {
+ i->length = 0; // remove
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (debug) {
+ for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
+ cerr << endl;
+ }
+
+ if (debug) cerr << "parsing indels" << endl;
+
+ // if soft clipping can be reduced by adjusting the tailing indels in the read, do it
+ // TODO
+
+ /*
+ int numEmptyIndels = 0;
+
+ if (!indels.empty()) {
+ vector<IndelAllele>::iterator a = indels.begin();
+ while (a != indels.end()) {
+ if (debug) cerr << "parsing " << *a << endl;
+ if (!(a->length > 0 && a->position >= 0)) {
+ ++numEmptyIndels;
+ }
+ ++a;
+ }
+ }
+ */
+
+ for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
+ if (i->length == 0) continue;
+ if (i->insertion) {
+ if (querySequence.substr(i->readPosition, i->readLength()) != i->sequence) {
+ cerr << "failure: " << *i << " should be " << querySequence.substr(i->readPosition, i->readLength()) << endl;
+ cerr << baseReferenceSequence << endl;
+ cerr << querySequence << endl;
+ throw 1;
+ }
+ } else {
+ if (referenceSequence.substr(i->position, i->length) != i->sequence) {
+ cerr << "failure: " << *i << " should be " << referenceSequence.substr(i->position, i->length) << endl;
+ cerr << baseReferenceSequence << endl;
+ cerr << querySequence << endl;
+ throw 1;
+ }
+ }
+ }
+
+ if (indels.size() > 1) {
+ vector<IndelAllele>::iterator id = indels.begin();
+ while ((id + 1) != indels.end()) {
+ if (debug) {
+ cerr << "indels: ";
+ for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
+ cerr << endl;
+ //for (vector<IndelAllele>::iterator a = newIndels.begin(); a != newIndels.end(); ++a) cerr << *a << " ";
+ //cerr << endl;
+ }
+
+ // get the indels to try to merge
+ while (id->length == 0 && (id + 1) != indels.end()) ++id;
+ vector<IndelAllele>::iterator idn = (id + 1);
+ while (idn != indels.end() && idn->length == 0) ++idn;
+ if (idn == indels.end()) break;
+
+ IndelAllele& indel = *idn;
+ IndelAllele& last = *id;
+ if (debug) cerr << "trying " << last << " against " << indel << endl;
+
+ int lastend = last.insertion ? last.position : (last.position + last.length);
+ if (indel.position == lastend) {
+ if (debug) cerr << "indel.position " << indel.position << " lastend " << lastend << endl;
+ if (indel.insertion == last.insertion) {
+ last.length += indel.length;
+ last.sequence += indel.sequence;
+ indel.length = 0;
+ indel.sequence.clear();
+ id = idn;
+ } else if (last.length && indel.length) { // if the end of the previous == the start of the current, cut it off of both the ins and the del
+ if (debug) cerr << "Merging " << last << " " << indel << endl;
+ int matchsize = 1;
+ int biggestmatchsize = 0;
+
+ while (matchsize <= last.sequence.size() && matchsize <= indel.sequence.size()) {
+ if (last.sequence.substr(last.sequence.size() - matchsize) == indel.sequence.substr(0, matchsize)) {
+ biggestmatchsize = matchsize;
+ }
+ ++matchsize;
+ }
+ if (debug) cerr << "biggestmatchsize " << biggestmatchsize << endl;
+
+ last.sequence = last.sequence.substr(0, last.sequence.size() - biggestmatchsize);
+ last.length -= biggestmatchsize;
+ indel.sequence = indel.sequence.substr(biggestmatchsize);
+ indel.length -= biggestmatchsize;
+ if (indel.insertion) indel.readPosition += biggestmatchsize;
+ else indel.position += biggestmatchsize;
+
+ if (indel.length > 0) {
+ id = idn;
+ }
+ }
+ } else {
+ if (last.insertion != indel.insertion) {
+ if (debug) cerr << "merging by overlap " << last << " " << indel << endl;
+ // see if we can slide the sequence in between these two indels together
+ string lastOverlapSeq;
+ string indelOverlapSeq;
+
+ if (last.insertion) {
+ lastOverlapSeq =
+ last.sequence
+ + querySequence.substr(last.readPosition + last.readLength(),
+ indel.readPosition - (last.readPosition + last.readLength()));
+ indelOverlapSeq =
+ referenceSequence.substr(last.position + last.referenceLength(),
+ indel.position - (last.position + last.referenceLength()))
+ + indel.sequence;
+ } else {
+ lastOverlapSeq =
+ last.sequence
+ + referenceSequence.substr(last.position + last.referenceLength(),
+ indel.position - (last.position + last.referenceLength()));
+ indelOverlapSeq =
+ querySequence.substr(last.readPosition + last.readLength(),
+ indel.readPosition - (last.readPosition + last.readLength()))
+ + indel.sequence;
+ }
+
+ if (debug) {
+ if (!last.insertion) {
+ if (last.insertion) cerr << string(last.length, '-');
+ cerr << lastOverlapSeq;
+ if (indel.insertion) cerr << string(indel.length, '-');
+ cerr << endl;
+ if (!last.insertion) cerr << string(last.length, '-');
+ cerr << indelOverlapSeq;
+ if (!indel.insertion) cerr << string(indel.length, '-');
+ cerr << endl;
+ } else {
+ if (last.insertion) cerr << string(last.length, '-');
+ cerr << indelOverlapSeq;
+ if (indel.insertion) cerr << string(indel.length, '-');
+ cerr << endl;
+ if (!last.insertion) cerr << string(last.length, '-');
+ cerr << lastOverlapSeq;
+ if (!indel.insertion) cerr << string(indel.length, '-');
+ cerr << endl;
+ }
+ }
+
+
+ int dist = min(last.length, indel.length);
+ int matchingInBetween = indel.position - (last.position + last.referenceLength());
+ int previousMatchingInBetween = matchingInBetween;
+ //int matchingInBetween = indel.position - last.position;
+ if (debug) cerr << "matchingInBetween " << matchingInBetween << endl;
+ if (debug) cerr << "dist " << dist << endl;
+ int mindist = matchingInBetween - dist;
+ if (lastOverlapSeq == indelOverlapSeq) {
+ matchingInBetween = lastOverlapSeq.size();
+ } else {
+ // TODO change to use string::find()
+ for (int i = dist; i > 0; --i) {
+ if (debug) cerr << "lastoverlap: "
+ << lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
+ << " thisoverlap: "
+ << indelOverlapSeq.substr(0, i + previousMatchingInBetween) << endl;
+ if (lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
+ == indelOverlapSeq.substr(0, i + previousMatchingInBetween)) {
+ matchingInBetween = previousMatchingInBetween + i;
+ break;
+ }
+ }
+ }
+ //cerr << last << " " << indel << endl;
+ if (matchingInBetween > 0 && matchingInBetween > previousMatchingInBetween) {
+ if (debug) cerr << "matching " << matchingInBetween << "bp between " << last << " " << indel
+ << " was matching " << previousMatchingInBetween << endl;
+ int diff = matchingInBetween - previousMatchingInBetween;
+ last.length -= diff;
+ last.sequence = last.sequence.substr(0, last.length);
+ indel.length -= diff;
+ indel.sequence = indel.sequence.substr(diff);
+ if (!indel.insertion) indel.position += diff;
+ else indel.readPosition += diff;
+ if (debug) cerr << last << " " << indel << endl;
+ }// else if (matchingInBetween == 0 || matchingInBetween == indel.position - last.position) {
+ //if (!newIndels.empty()) newIndels.pop_back();
+ //} //else { newIndels.push_back(indel); }
+ id = idn;
+ //newIndels.push_back(indel);
+ } else {
+ id = idn;
+ //newIndels.push_back(indel);
+ }
+ }
+ }
+ }
+
+ vector<IndelAllele> newIndels;
+ for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
+ if (!i->insertion && i->position == 0) { offset += i->length;
+ } else if (i->length > 0) newIndels.push_back(*i); // remove dels at front
+ }
+
+ // for each indel
+ // if ( we're matched up to the previous insertion (or deletion)
+ // and it's also an insertion or deletion )
+ // merge the indels
+ //
+ // and simultaneously reconstruct the cigar
+ //
+ // if there are spurious deletions at the start and end of the read, remove them
+ // if there are spurious insertions after soft-clipped bases, make them soft clips
+
+ vector<pair<int, string> > newCigar;
+
+ if (!softBegin.empty()) {
+ newCigar.push_back(make_pair(softBegin.size(), "S"));
+ }
+
+ if (newIndels.empty()) {
+
+ int remainingReadBp = querySequence.size() - softEnd.size() - softBegin.size();
+ newCigar.push_back(make_pair(remainingReadBp, "M"));
+
+ if (!softEnd.empty()) {
+ newCigar.push_back(make_pair(softEnd.size(), "S"));
+ }
+
+ cigar = joinCIGAR(newCigar);
+
+ // check if we're realigned
+ if (cigar == cigarbefore) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ vector<IndelAllele>::iterator id = newIndels.begin();
+ vector<IndelAllele>::iterator last = id++;
+
+ if (last->position > 0) {
+ newCigar.push_back(make_pair(last->position, "M"));
+ newCigar.push_back(make_pair(last->length, (last->insertion ? "I" : "D")));
+ } else if (last->position == 0) { // discard floating indels
+ if (last->insertion) newCigar.push_back(make_pair(last->length, "S"));
+ else newCigar.push_back(make_pair(last->length, "D"));
+ } else {
+ cerr << "negative indel position " << *last << endl;
+ }
+
+ int lastend = last->insertion ? last->position : (last->position + last->length);
+ LEFTALIGN_DEBUG(*last << ",");
+
+ for (; id != newIndels.end(); ++id) {
+ IndelAllele& indel = *id;
+ if (indel.length == 0) continue; // remove 0-length indels
+ if (debug) cerr << indel << " " << *last << endl;
+ LEFTALIGN_DEBUG(indel << ",");
+ if ((id + 1) == newIndels.end()
+ && (indel.insertion && indel.position == referenceSequence.size()
+ || (!indel.insertion && indel.position + indel.length == referenceSequence.size()))
+ ) {
+ if (indel.insertion) {
+ if (!newCigar.empty() && newCigar.back().second == "S") {
+ newCigar.back().first += indel.length;
+ } else {
+ newCigar.push_back(make_pair(indel.length, "S"));
+ }
+ }
+ } else if (indel.position < lastend) {
+ cerr << "impossibility?: indel realigned left of another indel" << endl;
+ return false;
+ } else if (indel.position == lastend) {
+ // how?
+ if (indel.insertion == last->insertion) {
+ pair<int, string>& op = newCigar.back();
+ op.first += indel.length;
+ } else {
+ newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
+ }
+ } else if (indel.position > lastend) { // also catches differential indels, but with the same position
+ if (!newCigar.empty() && newCigar.back().second == "M") newCigar.back().first += indel.position - lastend;
+ else newCigar.push_back(make_pair(indel.position - lastend, "M"));
+ newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
+ }
+
+ last = id;
+ lastend = last->insertion ? last->position : (last->position + last->length);
+
+ if (debug) {
+ for (vector<pair<int, string> >::iterator c = newCigar.begin(); c != newCigar.end(); ++c)
+ cerr << c->first << c->second;
+ cerr << endl;
+ }
+
+ }
+
+ int remainingReadBp = querySequence.size() - (last->readPosition + last->readLength()) - softEnd.size();
+ if (remainingReadBp > 0) {
+ if (debug) cerr << "bp remaining = " << remainingReadBp << endl;
+ if (newCigar.back().second == "M") newCigar.back().first += remainingReadBp;
+ else newCigar.push_back(make_pair(remainingReadBp, "M"));
+ }
+
+ if (newCigar.back().second == "D") newCigar.pop_back(); // remove trailing deletions
+
+ if (!softEnd.empty()) {
+ if (newCigar.back().second == "S") newCigar.back().first += softEnd.size();
+ else newCigar.push_back(make_pair(softEnd.size(), "S"));
+ }
+
+ LEFTALIGN_DEBUG(endl);
+
+ cigar = joinCIGAR(newCigar);
+
+ LEFTALIGN_DEBUG(cigar << endl);
+
+ // check if we're realigned
+ if (cigar == cigarbefore) {
+ return false;
+ } else {
+ return true;
+ }
+
+}
+
+int countMismatches(string& querySequence, string& cigar, string referenceSequence) {
+
+ int mismatches = 0;
+ int sp = 0;
+ int rp = 0;
+ vector<pair<int, string> > cigarData = splitCIGAR(cigar);
+ for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
+ c != cigarData.end(); ++c) {
+ unsigned int l = c->first;
+ string t = c->second;
+ if (t == "M") { // match or mismatch
+ for (int i = 0; i < l; ++i) {
+ if (querySequence.at(rp) != referenceSequence.at(sp))
+ ++mismatches;
+ ++sp;
+ ++rp;
+ }
+ } else if (t == "D") { // deletion
+ sp += l; // update reference sequence position
+ } else if (t == "I") { // insertion
+ rp += l; // update read position
+ } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
+ rp += l;
+ } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
+ } else if (t == "N") { // skipped region in the reference not present in read, aka splice
+ sp += l;
+ }
+ }
+
+ return mismatches;
+
+}
+
+// Iteratively left-aligns the indels in the alignment until we have a stable
+// realignment. Returns true on realignment success or non-realignment.
+// Returns false if we exceed the maximum number of realignment iterations.
+//
+bool stablyLeftAlign(string querySequence, string& cigar, string referenceSequence, int& offset, int maxiterations, bool debug) {
+
+ if (!leftAlign(querySequence, cigar, referenceSequence, offset)) {
+
+ LEFTALIGN_DEBUG("did not realign" << endl);
+ return true;
+
+ } else {
+
+ while (leftAlign(querySequence, cigar, referenceSequence, offset) && --maxiterations > 0) {
+ LEFTALIGN_DEBUG("realigning ..." << endl);
+ }
+
+ if (maxiterations <= 0) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+}
+
+string mergeCIGAR(const string& c1, const string& c2) {
+ vector<pair<int, string> > cigar1 = splitCIGAR(c1);
+ vector<pair<int, string> > cigar2 = splitCIGAR(c2);
+ // check if the middle elements are the same
+ if (cigar1.back().second == cigar2.front().second) {
+ cigar1.back().first += cigar2.front().first;
+ cigar2.erase(cigar2.begin());
+ }
+ for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
+ cigar1.push_back(*c);
+ }
+ return joinCIGAR(cigar1);
+}
+
+vector<pair<int, string> > splitCIGAR(const string& cigarStr) {
+ vector<pair<int, string> > cigar;
+ string number;
+ string type;
+ // strings go [Number][Type] ...
+ for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+ char c = *s;
+ if (isdigit(c)) {
+ if (type.empty()) {
+ number += c;
+ } else {
+ // signal for next token, push back the last pair, clean up
+ cigar.push_back(make_pair(atoi(number.c_str()), type));
+ number.clear();
+ type.clear();
+ number += c;
+ }
+ } else {
+ type += c;
+ }
+ }
+ if (!number.empty() && !type.empty()) {
+ cigar.push_back(make_pair(atoi(number.c_str()), type));
+ }
+ return cigar;
+}
+
+string joinCIGAR(const vector<pair<int, string> >& cigar) {
+ string cigarStr;
+ for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
+ if (c->first) {
+ cigarStr += convert(c->first) + c->second;
+ }
+ }
+ return cigarStr;
+}
diff --git a/LeftAlign.h b/LeftAlign.h
new file mode 100644
index 0000000..7fb796e
--- /dev/null
+++ b/LeftAlign.h
@@ -0,0 +1,32 @@
+#ifndef __LEFTALIGN_H
+#define __LEFTALIGN_H
+
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <list>
+#include <utility>
+#include <sstream>
+
+#include "IndelAllele.h"
+#include "convert.h"
+
+#ifdef VERBOSE_DEBUG
+#define LEFTALIGN_DEBUG(msg) \
+ if (debug) { cerr << msg; }
+#else
+#define LEFTALIGN_DEBUG(msg)
+#endif
+
+using namespace std;
+
+bool leftAlign(string& alternateQuery, string& cigar, string& referenceSequence, int& offset, bool debug = false);
+bool stablyLeftAlign(string alternateQuery, string& cigar, string referenceSequence, int& offset, int maxiterations = 20, bool debug = false);
+int countMismatches(string& alternateQuery, string& cigar, string& referenceSequence);
+
+string mergeCIGAR(const string& c1, const string& c2);
+vector<pair<int, string> > splitCIGAR(const string& cigarStr);
+string joinCIGAR(const vector<pair<int, string> >& cigar);
+
+
+#endif
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3c62e93
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,66 @@
+# =========================================
+# MOSAIK Banded Smith-Waterman Makefile
+# (c) 2009 Michael Stromberg & Wan-Ping Lee
+# =========================================
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= smithwaterman.cpp BandedSmithWaterman.cpp SmithWatermanGotoh.cpp Repeats.cpp LeftAlign.cpp IndelAllele.cpp
+OBJECTS= $(SOURCES:.cpp=.o) disorder.o
+OBJECTS_NO_MAIN= disorder.o BandedSmithWaterman.o SmithWatermanGotoh.o Repeats.o LeftAlign.o IndelAllele.o
+
+# ----------------
+# compiler options
+# ----------------
+
+# Use ?= to allow overriding from the env or command-line
+CXX?= c++
+CXXFLAGS?= -O3
+OBJ?= sw.o
+
+# I don't think := is useful here, since there is nothing to expand
+LDFLAGS:= -Wl,-s
+#CXXFLAGS=-g
+EXE:= smithwaterman
+LIBS=
+
+all: $(EXE) $(OBJ)
+
+.PHONY: all
+
+libsw.a: smithwaterman.o BandedSmithWaterman.o SmithWatermanGotoh.o LeftAlign.o Repeats.o IndelAllele.o disorder.o
+ ar rs $@ smithwaterman.o SmithWatermanGotoh.o disorder.o BandedSmithWaterman.o LeftAlign.o Repeats.o IndelAllele.o
+
+sw.o: BandedSmithWaterman.o SmithWatermanGotoh.o LeftAlign.o Repeats.o IndelAllele.o disorder.o
+ ld -r $^ -o sw.o -L.
+ #$(CXX) $(CFLAGS) -c -o smithwaterman.cpp $(OBJECTS_NO_MAIN) -I.
+
+### @$(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $^ -I.
+$(EXE): smithwaterman.o BandedSmithWaterman.o SmithWatermanGotoh.o disorder.o LeftAlign.o Repeats.o IndelAllele.o
+ $(CXX) $(CFLAGS) $^ -I. -o $@
+
+#smithwaterman: $(OBJECTS)
+# $(CXX) $(CXXFLAGS) -o $@ $< -I.
+
+smithwaterman.o: smithwaterman.cpp disorder.o
+ $(CXX) $(CXXFLAGS) -c -o $@ smithwaterman.cpp -I.
+
+disorder.o: disorder.cpp disorder.h
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+BandedSmithWaterman.o: BandedSmithWaterman.cpp BandedSmithWaterman.h
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+SmithWatermanGotoh.o: SmithWatermanGotoh.cpp SmithWatermanGotoh.h disorder.o
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+Repeats.o: Repeats.cpp
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+LeftAlign.o: LeftAlign.cpp
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+IndelAllele.o: IndelAllele.cpp
+ $(CXX) $(CXXFLAGS) -c -o $@ $< -I.
+
+.PHONY: clean
+
+clean:
+ @echo "Cleaning up."
+ @rm -f *.o $(PROGRAM) *~
diff --git a/Mosaik.h b/Mosaik.h
new file mode 100644
index 0000000..24d70d0
--- /dev/null
+++ b/Mosaik.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#ifndef WIN32
+//#include "SafeFunctions.h"
+#endif
+
+// ==============
+// MOSAIK version
+// ==============
+
+#define MOSAIK_VERSION_DATE "2009-02-11"
+
+// adopt a major.minor.build version number [1].[1].[3]
+const unsigned char MOSAIK_MAJOR_VERSION = 0;
+const unsigned char MOSAIK_MINOR_VERSION = 9;
+const unsigned short MOSAIK_BUILD_VERSION = 899;
+
+// ================================
+// Platform specific variable sizes
+// ================================
+
+// Windows Vista 32-bit
+// Fedora Core 7 32-bit
+// Fedora Core 6 64-bit
+// Itanium2 64-bit
+#define SIZEOF_CHAR 1
+#define SIZEOF_WCHAR 2
+#define SIZEOF_SHORT 2
+#define SIZEOF_INT 4
+#define SIZEOF_FLOAT 4
+#define SIZEOF_DOUBLE 8
+#define SIZEOF_UINT64 8
+#define MOSAIK_LITTLE_ENDIAN 1
+
+#ifdef WIN32
+typedef signed long long int64_t;
+typedef unsigned long long uint64_t;
+#endif
+
+#define NEGATIVE_ONE_INT 0xffffffff
+#define NEGATIVE_TWO_INT 0xfffffffe
+#define NEGATIVE_THREE_INT 0xfffffffd
+#define NEGATIVE_FOUR_INT 0xfffffffc
+#define MAX_SHORT 0xffff
+
+// ==========================
+// Platform specific file I/O
+// ==========================
+
+#ifdef WIN32
+const char OS_DIRECTORY_SEPARATOR = '\\';
+#else
+const char OS_DIRECTORY_SEPARATOR = '/';
+#endif
+
+#define DIRECTORY_NAME_LENGTH 255
+
+// ====================================
+// Enable unit test diagnostic messages
+// ====================================
+
+#ifdef UNITTEST
+#define SILENTMODE if(0)
+#else
+#define SILENTMODE
+#endif
+
+// =================
+// Aligner constants
+// =================
+
+const double HASH_REGION_EXTENSION_PERCENT = 0.025;
+const unsigned char REFERENCE_SEQUENCE_QUALITY = 40;
diff --git a/Repeats.cpp b/Repeats.cpp
new file mode 100644
index 0000000..289e7c7
--- /dev/null
+++ b/Repeats.cpp
@@ -0,0 +1,69 @@
+#include "Repeats.h"
+
+map<string, int> repeatCounts(long int position, const string& sequence, int maxsize) {
+ map<string, int> counts;
+ for (int i = 1; i <= maxsize; ++i) {
+ // subseq here i bases
+ string seq = sequence.substr(position, i);
+ // go left.
+
+ int j = position - i;
+ int leftsteps = 0;
+ while (j >= 0 && seq == sequence.substr(j, i)) {
+ j -= i;
+ ++leftsteps;
+ }
+
+ // go right.
+ j = position;
+
+ int rightsteps = 0;
+ while (j + i <= sequence.size() && seq == sequence.substr(j, i)) {
+ j += i;
+ ++rightsteps;
+ }
+ // if we went left and right a non-zero number of times,
+ if (leftsteps + rightsteps > 1) {
+ counts[seq] = leftsteps + rightsteps;
+ }
+ }
+
+ // filter out redundant repeat information
+ if (counts.size() > 1) {
+ map<string, int> filteredcounts;
+ map<string, int>::iterator c = counts.begin();
+ string prev = c->first;
+ filteredcounts[prev] = c->second; // shortest sequence
+ ++c;
+ for (; c != counts.end(); ++c) {
+ int i = 0;
+ string seq = c->first;
+ while (i + prev.length() <= seq.length() && seq.substr(i, prev.length()) == prev) {
+ i += prev.length();
+ }
+ if (i < seq.length()) {
+ filteredcounts[seq] = c->second;
+ prev = seq;
+ }
+ }
+ return filteredcounts;
+ } else {
+ return counts;
+ }
+}
+
+bool isRepeatUnit(const string& seq, const string& unit) {
+
+ if (seq.size() % unit.size() != 0) {
+ return false;
+ } else {
+ int maxrepeats = seq.size() / unit.size();
+ for (int i = 0; i < maxrepeats; ++i) {
+ if (seq.substr(i * unit.size(), unit.size()) != unit) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+}
diff --git a/Repeats.h b/Repeats.h
new file mode 100644
index 0000000..2efc0ea
--- /dev/null
+++ b/Repeats.h
@@ -0,0 +1,8 @@
+#include <iostream>
+#include <string>
+#include <map>
+
+using namespace std;
+
+map<string, int> repeatCounts(long int pos, const string& seq, int maxsize);
+bool isRepeatUnit(const string& seq, const string& unit);
diff --git a/SWMain.cpp b/SWMain.cpp
new file mode 100644
index 0000000..6ef3421
--- /dev/null
+++ b/SWMain.cpp
@@ -0,0 +1,126 @@
+#include <iostream>
+#include <string.h>
+//#include "Alignment.h"
+//#include "Benchmark.h"
+//#include "HashRegion.h"
+#include "SmithWatermanGotoh.h"
+#include "BandedSmithWaterman.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+/*
+ printf("------------------------------------------------------------------------------\n");
+ printf("Banded Smith-Waterman Algorithm (worst case)\n");
+ printf("Michael Stromberg & Wan-Ping Lee Marth Lab, Boston College Biology Department\n");
+ printf("------------------------------------------------------------------------------\n\n");
+*/
+ // this version simulates the worst case of only a fragment hashing to the
+ // reference sequence. Basically a non-centered diagonal in the Smith-Waterman
+ // dynamic programming matrix.
+
+ // here we simulate a region on the reference that occurs between position 4001
+ // and position 4136. During hashing, only the first 20 bases in the query
+ // matched perfectly.
+
+ // define the start and end coordinates of the entire reference region
+ //const unsigned int start = 4001;
+ //const unsigned int end = 4136;
+
+ //const unsigned int testStart = atoi(argv[1]);
+ //const unsigned int testEnd = atoi(argv[2]);
+ //const unsigned int testQueryStart = atoi(argv[3]);
+ //const unsigned int testQueryEnd = atoi(argv[4]);
+
+ //cout << endl<< "=====================================================" << endl;
+ //cout << testStart << "\t" << testQueryStart << endl;
+
+ // define the 20 b:q
+ // ases that matched perfectly
+ //HashRegion hr;
+
+ //=====================================================
+ // defind the hash region
+ // first.first: reference begin
+ // first.second: reference end
+ // second.first: query begin
+ // second.second: query end
+ //=====================================================
+
+ pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
+ hr.first.first = 5;
+ hr.first.second = 13;
+ hr.second.first = 0;
+ hr.second.second = 8;
+
+ //=====================================================
+
+ // for 76 bp reads, we expect as much as 12 mismatches - however this does not
+ // translate to a bandwidth of 12 * 2 + 1 since most of these will be
+ // substitution errors
+ const unsigned char bandwidth = 11;
+
+ // initialize
+ const char* pReference = "ATGGCGGGGATCGGGACACTCGCCGGTGCGGGTACCCTA";
+ const char* pQuery = "GGGGATCGGGACACTCGCTCTCCGGTGCGGGTA";
+
+ const unsigned int referenceLen = strlen(pReference);
+ const unsigned int queryLen = strlen(pQuery);
+
+ // ==============================================================================================
+ // benchmarking reference on koi.bc.edu when NUM_ITERATIONS = 38000 on 76 bp read (1 try):
+ // CPU time: 23.920 s, wall time: 24.012 s (1582.6 alignments/s)
+ // ==============================================================================================
+ //const unsigned int NUM_ITERATIONS = 38000;
+ //unsigned int NUM_ITERATIONS = 1;
+
+ // create a new Smith-Waterman alignment object
+ CSmithWatermanGotoh sw(10.0f, -9.0f, 15.0f, 6.66f);
+ CBandedSmithWaterman bsw(10.0f, -9.0f, 15.0f, 6.66f, bandwidth);
+
+ // start timing the algorithm
+ //CBenchmark bench;
+ //bench.Start();
+
+ // perform NUM_ITERATIONS alignments
+ //Alignment bswAl;
+ //Alignment swAl;
+ // referenceBegin, referenceEnd
+ unsigned int referenceSW, referenceBSW;
+ string cigarSW, cigarBSW;
+ //for(unsigned int i = 0; i < NUM_ITERATIONS; i++) {
+ sw.Align(referenceSW, cigarSW, pReference, referenceLen, pQuery, queryLen);
+ bsw.Align(referenceBSW, cigarBSW, pReference, referenceLen, pQuery, queryLen, hr);
+ //}
+
+ // stop timing the algorithm
+ //bench.Stop();
+
+ // calculate the alignments per second
+ //double elapsedWallTime = bench.GetElapsedWallTime();
+ //double alignmentsPerSecond = (double)NUM_ITERATIONS / elapsedWallTime;
+
+ // show our results
+ //printf("%d\t%d\n", al.ReferenceBegin,al.QueryBegin);
+
+ printf("Smith-Waterman\n");
+ printf("reference: %s %3u\n", cigarSW.c_str(), referenceSW);
+ printf("Banded Smith-Waterman\n");
+ printf("reference: %s %3u\n", cigarBSW.c_str(), referenceBSW);
+ /*
+ printf("Smith-Waterman\n");
+ printf("reference: %s %3u %3u\n", swAl.Reference.CData(), swAl.ReferenceBegin, swAl.ReferenceEnd);
+ printf("query: %s %3u %3u\n", swAl.Query.CData(), swAl.QueryBegin, swAl.QueryEnd);
+ printf("mismatches: %u\n", swAl.NumMismatches);
+ printf("\n");
+ printf("Banded Smith-Waterman\n");
+ printf("reference: %s %3u %3u\n", bswAl.Reference.CData(), bswAl.ReferenceBegin, bswAl.ReferenceEnd);
+ printf("query: %s %3u %3u\n", bswAl.Query.CData(), bswAl.QueryBegin, bswAl.QueryEnd);
+ printf("mismatches: %u\n", bswAl.NumMismatches);
+ */
+ //printf("alignments/s: %.1f\n\n", alignmentsPerSecond);
+
+ //bench.DisplayTime("BandedSmithWaterman");
+
+ return 0;
+}
diff --git a/SmithWatermanGotoh.cpp b/SmithWatermanGotoh.cpp
new file mode 100644
index 0000000..49f687f
--- /dev/null
+++ b/SmithWatermanGotoh.cpp
@@ -0,0 +1,741 @@
+#include "SmithWatermanGotoh.h"
+
+const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
+
+const char CSmithWatermanGotoh::Directions_STOP = 0;
+const char CSmithWatermanGotoh::Directions_LEFT = 1;
+const char CSmithWatermanGotoh::Directions_DIAGONAL = 2;
+const char CSmithWatermanGotoh::Directions_UP = 3;
+
+const int CSmithWatermanGotoh::repeat_size_max = 12;
+
+CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty)
+ : mCurrentMatrixSize(0)
+ , mCurrentAnchorSize(0)
+ , mCurrentQuerySize(0)
+ , mCurrentAQSumSize(0)
+ , mMatchScore(matchScore)
+ , mMismatchScore(mismatchScore)
+ , mGapOpenPenalty(gapOpenPenalty)
+ , mGapExtendPenalty(gapExtendPenalty)
+ , mPointers(NULL)
+ , mSizesOfVerticalGaps(NULL)
+ , mSizesOfHorizontalGaps(NULL)
+ , mQueryGapScores(NULL)
+ , mBestScores(NULL)
+ , mReversedAnchor(NULL)
+ , mReversedQuery(NULL)
+ , mUseHomoPolymerGapOpenPenalty(false)
+ , mUseEntropyGapOpenPenalty(false)
+ , mUseRepeatGapExtensionPenalty(false)
+{
+ CreateScoringMatrix();
+}
+
+CSmithWatermanGotoh::~CSmithWatermanGotoh(void) {
+ if(mPointers) delete [] mPointers;
+ if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
+ if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
+ if(mQueryGapScores) delete [] mQueryGapScores;
+ if(mBestScores) delete [] mBestScores;
+ if(mReversedAnchor) delete [] mReversedAnchor;
+ if(mReversedQuery) delete [] mReversedQuery;
+}
+
+// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
+void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {
+
+ if((s1.length() == 0) || (s2.length() == 0)) {
+ cout << "ERROR: Found a read with a zero length." << endl;
+ exit(1);
+ }
+
+ unsigned int referenceLen = s1.length() + 1;
+ unsigned int queryLen = s2.length() + 1;
+ unsigned int sequenceSumLength = s1.length() + s2.length();
+
+ // reinitialize our matrices
+
+ if((referenceLen * queryLen) > mCurrentMatrixSize) {
+
+ // calculate the new matrix size
+ mCurrentMatrixSize = referenceLen * queryLen;
+
+ // delete the old arrays
+ if(mPointers) delete [] mPointers;
+ if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
+ if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
+
+ try {
+
+ // initialize the arrays
+ mPointers = new char[mCurrentMatrixSize];
+ mSizesOfVerticalGaps = new short[mCurrentMatrixSize];
+ mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];
+
+ } catch(bad_alloc) {
+ cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
+ exit(1);
+ }
+ }
+
+ // initialize the traceback matrix to STOP
+ memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
+ for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;
+
+ // initialize the gap matrices to 1
+ uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
+ uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);
+
+
+ // initialize our repeat counts if they are needed
+ vector<map<string, int> > referenceRepeats;
+ vector<map<string, int> > queryRepeats;
+ int queryBeginRepeatBases = 0;
+ int queryEndRepeatBases = 0;
+ if (mUseRepeatGapExtensionPenalty) {
+ for (unsigned int i = 0; i < queryLen; ++i)
+ queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
+ for (unsigned int i = 0; i < referenceLen; ++i)
+ referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));
+
+ // keep only the biggest repeat
+ vector<map<string, int> >::iterator q = queryRepeats.begin();
+ for (; q != queryRepeats.end(); ++q) {
+ map<string, int>::iterator biggest = q->begin();
+ map<string, int>::iterator z = q->begin();
+ for (; z != q->end(); ++z)
+ if (z->first.size() > biggest->first.size()) biggest = z;
+ z = q->begin();
+ while (z != q->end()) {
+ if (z != biggest)
+ q->erase(z++);
+ else ++z;
+ }
+ }
+
+ q = referenceRepeats.begin();
+ for (; q != referenceRepeats.end(); ++q) {
+ map<string, int>::iterator biggest = q->begin();
+ map<string, int>::iterator z = q->begin();
+ for (; z != q->end(); ++z)
+ if (z->first.size() > biggest->first.size()) biggest = z;
+ z = q->begin();
+ while (z != q->end()) {
+ if (z != biggest)
+ q->erase(z++);
+ else ++z;
+ }
+ }
+
+ // remove repeat information from ends of queries
+ // this results in the addition of spurious flanking deletions in repeats
+ map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
+ if (!qrend.empty()) {
+ int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
+ for (int i = 0; i < queryEndRepeatBases; ++i)
+ queryRepeats.at(queryRepeats.size() - 2 - i).clear();
+ }
+
+ map<string, int>& qrbegin = queryRepeats.front();
+ if (!qrbegin.empty()) {
+ int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
+ for (int i = 0; i < queryBeginRepeatBases; ++i)
+ queryRepeats.at(i).clear();
+ }
+
+ }
+
+ int entropyWindowSize = 8;
+ vector<float> referenceEntropies;
+ vector<float> queryEntropies;
+ if (mUseEntropyGapOpenPenalty) {
+ for (unsigned int i = 0; i < queryLen; ++i)
+ queryEntropies.push_back(
+ shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
+ entropyWindowSize));
+ for (unsigned int i = 0; i < referenceLen; ++i)
+ referenceEntropies.push_back(
+ shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
+ entropyWindowSize));
+ }
+
+ // normalize entropies
+ /*
+ float qsum = 0;
+ float qnorm = 0;
+ float qmax = 0;
+ for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
+ qsum += *q;
+ if (*q > qmax) qmax = *q;
+ }
+ qnorm = qsum / queryEntropies.size();
+ for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
+ *q = *q / qsum + qmax;
+
+ float rsum = 0;
+ float rnorm = 0;
+ float rmax = 0;
+ for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
+ rsum += *r;
+ if (*r > rmax) rmax = *r;
+ }
+ rnorm = rsum / referenceEntropies.size();
+ for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
+ *r = *r / rsum + rmax;
+ */
+
+ //
+ // construct
+ //
+
+ // reinitialize our query-dependent arrays
+ if(s2.length() > mCurrentQuerySize) {
+
+ // calculate the new query array size
+ mCurrentQuerySize = s2.length();
+
+ // delete the old arrays
+ if(mQueryGapScores) delete [] mQueryGapScores;
+ if(mBestScores) delete [] mBestScores;
+
+ // initialize the arrays
+ try {
+
+ mQueryGapScores = new float[mCurrentQuerySize + 1];
+ mBestScores = new float[mCurrentQuerySize + 1];
+
+ } catch(bad_alloc) {
+ cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
+ exit(1);
+ }
+ }
+
+ // reinitialize our reference+query-dependent arrays
+ if(sequenceSumLength > mCurrentAQSumSize) {
+
+ // calculate the new reference array size
+ mCurrentAQSumSize = sequenceSumLength;
+
+ // delete the old arrays
+ if(mReversedAnchor) delete [] mReversedAnchor;
+ if(mReversedQuery) delete [] mReversedQuery;
+
+ // initialize the arrays
+ try {
+
+ mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
+ mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
+
+ } catch(bad_alloc) {
+ cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
+ exit(1);
+ }
+ }
+
+ // initialize the gap score and score vectors
+ uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
+ memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);
+
+ float similarityScore, totalSimilarityScore, bestScoreDiagonal;
+ float queryGapExtendScore, queryGapOpenScore;
+ float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;
+
+ unsigned int BestColumn = 0;
+ unsigned int BestRow = 0;
+ BestScore = FLOAT_NEGATIVE_INFINITY;
+
+ for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {
+
+ currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
+ bestScoreDiagonal = mBestScores[0];
+
+ for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {
+
+ // calculate our similarity score
+ similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];
+
+ // fill the matrices
+ totalSimilarityScore = bestScoreDiagonal + similarityScore;
+
+ //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;
+
+ queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
+ queryGapOpenScore = mBestScores[j] - mGapOpenPenalty;
+
+ // compute the homo-polymer gap score if enabled
+ if(mUseHomoPolymerGapOpenPenalty)
+ if((j > 1) && (s2[j - 1] == s2[j - 2]))
+ queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
+
+ // compute the entropy gap score if enabled
+ if (mUseEntropyGapOpenPenalty) {
+ queryGapOpenScore =
+ mBestScores[j] - mGapOpenPenalty
+ * max(queryEntropies.at(j), referenceEntropies.at(i))
+ * mEntropyGapOpenPenalty;
+ }
+
+ int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;
+
+ if (mUseRepeatGapExtensionPenalty) {
+ map<string, int>& repeats = queryRepeats[j];
+ // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
+ if (!repeats.empty()) {
+
+ const pair<string, int>& repeat = *repeats.begin();
+ int repeatsize = repeat.first.size();
+ if (gaplen != repeatsize && gaplen % repeatsize != 0) {
+ gaplen = gaplen / repeatsize + repeatsize;
+ }
+
+ if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
+ string gapseq = string(&s1[i], gaplen);
+ if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
+ queryGapExtendScore = mQueryGapScores[j]
+ + mRepeatGapExtensionPenalty / (float) gaplen;
+ // mMaxRepeatGapExtensionPenalty)
+ } else {
+ queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
+ }
+ }
+ } else {
+ queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
+ }
+ }
+
+ if(queryGapExtendScore > queryGapOpenScore) {
+ mQueryGapScores[j] = queryGapExtendScore;
+ mSizesOfVerticalGaps[l] = gaplen;
+ } else mQueryGapScores[j] = queryGapOpenScore;
+
+ referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
+ referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty;
+
+ // compute the homo-polymer gap score if enabled
+ if(mUseHomoPolymerGapOpenPenalty)
+ if((i > 1) && (s1[i - 1] == s1[i - 2]))
+ referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
+
+ // compute the entropy gap score if enabled
+ if (mUseEntropyGapOpenPenalty) {
+ referenceGapOpenScore =
+ mBestScores[j - 1] - mGapOpenPenalty
+ * max(queryEntropies.at(j), referenceEntropies.at(i))
+ * mEntropyGapOpenPenalty;
+ }
+
+ gaplen = mSizesOfHorizontalGaps[l - 1] + 1;
+
+ if (mUseRepeatGapExtensionPenalty) {
+ map<string, int>& repeats = referenceRepeats[i];
+ // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
+ if (!repeats.empty()) {
+
+ const pair<string, int>& repeat = *repeats.begin();
+ int repeatsize = repeat.first.size();
+ if (gaplen != repeatsize && gaplen % repeatsize != 0) {
+ gaplen = gaplen / repeatsize + repeatsize;
+ }
+
+ if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
+ string gapseq = string(&s2[j], gaplen);
+ if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
+ referenceGapExtendScore = currentAnchorGapScore
+ + mRepeatGapExtensionPenalty / (float) gaplen;
+ //mMaxRepeatGapExtensionPenalty)
+ } else {
+ referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
+ }
+ }
+ } else {
+ referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
+ }
+ }
+
+ if(referenceGapExtendScore > referenceGapOpenScore) {
+ currentAnchorGapScore = referenceGapExtendScore;
+ mSizesOfHorizontalGaps[l] = gaplen;
+ } else currentAnchorGapScore = referenceGapOpenScore;
+
+ bestScoreDiagonal = mBestScores[j];
+ mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
+
+
+ // determine the traceback direction
+ // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
+ if(mBestScores[j] == 0) mPointers[l] = Directions_STOP;
+ else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
+ else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP;
+ else mPointers[l] = Directions_LEFT;
+
+ // set the traceback start at the current cell i, j and score
+ if(mBestScores[j] > BestScore) {
+ BestRow = i;
+ BestColumn = j;
+ BestScore = mBestScores[j];
+ }
+ }
+ }
+
+ //
+ // traceback
+ //
+
+ // aligned sequences
+ int gappedAnchorLen = 0; // length of sequence #1 after alignment
+ int gappedQueryLen = 0; // length of sequence #2 after alignment
+ int numMismatches = 0; // the mismatched nucleotide count
+
+ char c1, c2;
+
+ int ci = BestRow;
+ int cj = BestColumn;
+ int ck = ci * queryLen;
+
+ // traceback flag
+ bool keepProcessing = true;
+
+ while(keepProcessing) {
+ //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl;
+
+ // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
+ switch(mPointers[ck + cj]) {
+
+ case Directions_DIAGONAL:
+ c1 = s1[--ci];
+ c2 = s2[--cj];
+ ck -= queryLen;
+
+ mReversedAnchor[gappedAnchorLen++] = c1;
+ mReversedQuery[gappedQueryLen++] = c2;
+
+ // increment our mismatch counter
+ if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
+ break;
+
+ case Directions_STOP:
+ keepProcessing = false;
+ break;
+
+ case Directions_UP:
+ for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
+ if (ci <= 0) {
+ keepProcessing = false;
+ break;
+ }
+ mReversedAnchor[gappedAnchorLen++] = s1[--ci];
+ mReversedQuery[gappedQueryLen++] = GAP;
+ ck -= queryLen;
+ numMismatches++;
+ }
+ break;
+
+ case Directions_LEFT:
+ for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
+ if (cj <= 0) {
+ keepProcessing = false;
+ break;
+ }
+ mReversedAnchor[gappedAnchorLen++] = GAP;
+ mReversedQuery[gappedQueryLen++] = s2[--cj];
+ numMismatches++;
+ }
+ break;
+ }
+ }
+
+ // define the reference and query sequences
+ mReversedAnchor[gappedAnchorLen] = 0;
+ mReversedQuery[gappedQueryLen] = 0;
+
+ // catch sequences with different lengths
+ if(gappedAnchorLen != gappedQueryLen) {
+ cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
+ exit(1);
+ }
+
+ // reverse the strings and assign them to our alignment structure
+ reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
+ reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
+
+ //alignment.Reference = mReversedAnchor;
+ //alignment.Query = mReversedQuery;
+
+ // set the reference endpoints
+ //alignment.ReferenceBegin = ci;
+ //alignment.ReferenceEnd = BestRow - 1;
+ referenceAl = ci;
+
+ // set the query endpoints
+ /*
+ if(alignment.IsReverseComplement) {
+ alignment.QueryBegin = s2Length - BestColumn;
+ alignment.QueryEnd = s2Length - cj - 1;
+ // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
+ } else {
+ alignment.QueryBegin = cj;
+ alignment.QueryEnd = BestColumn - 1;
+ // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
+ }
+ */
+
+ // set the query length and number of mismatches
+ //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
+ //alignment.NumMismatches = numMismatches;
+
+ unsigned int alLength = strlen(mReversedAnchor);
+ unsigned int m = 0, d = 0, i = 0;
+ bool dashRegion = false;
+ ostringstream oCigar (ostringstream::out);
+ int insertedBases = 0;
+
+ if ( cj != 0 ) {
+ if ( cj > 0 ) {
+ oCigar << cj << 'S';
+ } else { // how do we get negative cj's?
+ referenceAl -= cj;
+ alLength += cj;
+ }
+ }
+
+ for ( unsigned int j = 0; j < alLength; j++ ) {
+ // m
+ if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
+ if ( dashRegion ) {
+ if ( d != 0 ) oCigar << d << 'D';
+ else { oCigar << i << 'I'; insertedBases += i; }
+ }
+ dashRegion = false;
+ m++;
+ d = 0;
+ i = 0;
+ }
+ else {
+ if ( !dashRegion && m )
+ oCigar << m << 'M';
+ dashRegion = true;
+ m = 0;
+ if ( mReversedAnchor[j] == GAP ) {
+ if ( d != 0 ) oCigar << d << 'D';
+ i++;
+ d = 0;
+ }
+ else {
+ if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
+ d++;
+ i = 0;
+ }
+ }
+ }
+ if ( m != 0 ) oCigar << m << 'M';
+ else if ( d != 0 ) oCigar << d << 'D';
+ else if ( i != 0 ) oCigar << i << 'I';
+
+ if ( BestColumn != s2.length() )
+ oCigar << s2.length() - BestColumn << 'S';
+
+ cigarAl = oCigar.str();
+
+ // fix the gap order
+ CorrectHomopolymerGapOrder(alLength, numMismatches);
+
+ if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
+ int offset = 0;
+ string oldCigar;
+ try {
+ oldCigar = cigarAl;
+ stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
+ } catch (...) {
+ cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
+ cigarAl = oldCigar; // undo the failed left-realignment attempt
+ offset = 0;
+ }
+ referenceAl += offset;
+ }
+
+}
+
+// creates a simple scoring matrix to align the nucleotides and the ambiguity code N
+void CSmithWatermanGotoh::CreateScoringMatrix(void) {
+
+ unsigned int nIndex = 13;
+ unsigned int xIndex = 23;
+
+ // define the N score to be 1/4 of the span between mismatch and match
+ //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
+
+ // calculate the scoring matrix
+ for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
+ for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
+
+ // N.B. matching N to everything (while conceptually correct) leads to some
+ // bad alignments, lets make N be a mismatch instead.
+
+ // add the matches or mismatches to the hashtable (N is a mismatch)
+ if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
+ else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
+ else if(i == j) mScoringMatrix[i][j] = mMatchScore;
+ else mScoringMatrix[i][j] = mMismatchScore;
+ }
+ }
+
+ // add ambiguity codes
+ mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
+ mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
+ mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
+ mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
+
+ mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
+ mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
+ mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
+ mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
+
+ mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
+ mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
+ mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
+ mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
+
+ mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
+ mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
+ mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
+ mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
+
+ mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
+ mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
+ mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
+ mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
+
+ mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
+ mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
+ mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
+ mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
+
+ mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
+ mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
+ mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
+ mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
+ mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
+ mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
+
+ mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
+ mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
+ mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
+ mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
+ mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
+ mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
+
+ mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
+ mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
+ mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
+ mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
+ mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
+ mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
+
+ mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
+ mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
+ mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
+ mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
+ mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
+ mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
+}
+
+// enables homo-polymer scoring
+void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
+ mUseHomoPolymerGapOpenPenalty = true;
+ mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
+}
+
+// enables entropy-based gap open penalty
+void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) {
+ mUseEntropyGapOpenPenalty = true;
+ mEntropyGapOpenPenalty = enGapOpenPenalty;
+}
+
+// enables repeat-aware gap extension penalty
+void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) {
+ mUseRepeatGapExtensionPenalty = true;
+ mRepeatGapExtensionPenalty = rGapExtensionPenalty;
+ mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty;
+}
+
+// corrects the homopolymer gap order for forward alignments
+void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
+
+ // this is only required for alignments with mismatches
+ //if(al.NumMismatches == 0) return;
+ if ( numMismatches == 0 ) return;
+
+ // localize the alignment data
+ //char* pReference = al.Reference.Data();
+ //char* pQuery = al.Query.Data();
+ //const unsigned int numBases = al.Reference.Length();
+ char* pReference = mReversedAnchor;
+ char* pQuery = mReversedQuery;
+
+ // initialize
+ bool hasReferenceGap = false, hasQueryGap = false;
+ char* pNonGapSeq = NULL;
+ char* pGapSeq = NULL;
+ char nonGapBase = 'J';
+
+ // identify gapped regions
+ for(unsigned int i = 0; i < numBases; i++) {
+
+ // check if the current position is gapped
+ hasReferenceGap = false;
+ hasQueryGap = false;
+
+ if(pReference[i] == GAP) {
+ hasReferenceGap = true;
+ pNonGapSeq = pQuery;
+ pGapSeq = pReference;
+ nonGapBase = pQuery[i];
+ }
+
+ if(pQuery[i] == GAP) {
+ hasQueryGap = true;
+ pNonGapSeq = pReference;
+ pGapSeq = pQuery;
+ nonGapBase = pReference[i];
+ }
+
+ // continue if we don't have any gaps
+ if(!hasReferenceGap && !hasQueryGap) continue;
+
+ // sanity check
+ if(hasReferenceGap && hasQueryGap) {
+ printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
+ exit(1);
+ }
+
+ // find the non-gapped length (forward)
+ unsigned short numGappedBases = 0;
+ unsigned short nonGapLength = 0;
+ unsigned short testPos = i;
+ while(testPos < numBases) {
+
+ const char gs = pGapSeq[testPos];
+ const char ngs = pNonGapSeq[testPos];
+
+ bool isPartofHomopolymer = false;
+ if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
+ if(!isPartofHomopolymer) break;
+
+ if(gs == GAP) numGappedBases++;
+ else nonGapLength++;
+ testPos++;
+ }
+
+ // fix the gap order
+ if(numGappedBases != 0) {
+ char* pCurrentSequence = pGapSeq + i;
+ memset(pCurrentSequence, nonGapBase, nonGapLength);
+ pCurrentSequence += nonGapLength;
+ memset(pCurrentSequence, GAP, numGappedBases);
+ }
+
+ // increment
+ i += numGappedBases + nonGapLength - 1;
+ }
+}
diff --git a/SmithWatermanGotoh.h b/SmithWatermanGotoh.h
new file mode 100644
index 0000000..da15036
--- /dev/null
+++ b/SmithWatermanGotoh.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <iostream>
+#include <algorithm>
+#include <memory>
+//#include "Alignment.h"
+#include "Mosaik.h"
+#include <stdio.h>
+#include <string.h>
+#include <sstream>
+#include <string>
+#include "disorder.h"
+#include "Repeats.h"
+#include "LeftAlign.h"
+
+using namespace std;
+
+#define MOSAIK_NUM_NUCLEOTIDES 26
+#define GAP '-'
+
+class CSmithWatermanGotoh {
+public:
+ // constructor
+ CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty);
+ // destructor
+ ~CSmithWatermanGotoh(void);
+ // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
+ void Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2);
+ // enables homo-polymer scoring
+ void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
+ // enables non-repeat gap open penalty
+ void EnableEntropyGapPenalty(float enGapOpenPenalty);
+ // enables repeat gap extension penalty
+ void EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor = 10);
+ // record the best score for external use
+ float BestScore;
+private:
+ // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
+ void CreateScoringMatrix(void);
+ // corrects the homopolymer gap order for forward alignments
+ void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
+ // returns the maximum floating point number
+ static inline float MaxFloats(const float& a, const float& b, const float& c);
+ // our simple scoring matrix
+ float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
+ // keep track of maximum initialized sizes
+ unsigned int mCurrentMatrixSize;
+ unsigned int mCurrentAnchorSize;
+ unsigned int mCurrentQuerySize;
+ unsigned int mCurrentAQSumSize;
+ // define our traceback directions
+ // N.B. This used to be defined as an enum, but gcc doesn't like being told
+ // which storage class to use
+ const static char Directions_STOP;
+ const static char Directions_LEFT;
+ const static char Directions_DIAGONAL;
+ const static char Directions_UP;
+ // repeat structure determination
+ const static int repeat_size_max;
+ // define scoring constants
+ const float mMatchScore;
+ const float mMismatchScore;
+ const float mGapOpenPenalty;
+ const float mGapExtendPenalty;
+ // store the backtrace pointers
+ char* mPointers;
+ // store the vertical gap sizes - assuming gaps are not longer than 32768 bases long
+ short* mSizesOfVerticalGaps;
+ // store the horizontal gap sizes - assuming gaps are not longer than 32768 bases long
+ short* mSizesOfHorizontalGaps;
+ // score if xi aligns to a gap after yi
+ float* mQueryGapScores;
+ // best score of alignment x1...xi to y1...yi
+ float* mBestScores;
+ // our reversed alignment
+ char* mReversedAnchor;
+ char* mReversedQuery;
+ // define static constants
+ static const float FLOAT_NEGATIVE_INFINITY;
+ // toggles the use of the homo-polymer gap open penalty
+ bool mUseHomoPolymerGapOpenPenalty;
+ // specifies the homo-polymer gap open penalty
+ float mHomoPolymerGapOpenPenalty;
+ // toggles the use of the entropy gap open penalty
+ bool mUseEntropyGapOpenPenalty;
+ // specifies the entropy gap open penalty (multiplier)
+ float mEntropyGapOpenPenalty;
+ // toggles the use of the repeat gap extension penalty
+ bool mUseRepeatGapExtensionPenalty;
+ // specifies the repeat gap extension penalty
+ float mRepeatGapExtensionPenalty;
+ // specifies the max repeat gap extension penalty
+ float mMaxRepeatGapExtensionPenalty;
+};
+
+// returns the maximum floating point number
+inline float CSmithWatermanGotoh::MaxFloats(const float& a, const float& b, const float& c) {
+ float max = 0.0f;
+ if(a > max) max = a;
+ if(b > max) max = b;
+ if(c > max) max = c;
+ return max;
+}
diff --git a/convert.h b/convert.h
new file mode 100644
index 0000000..399bcea
--- /dev/null
+++ b/convert.h
@@ -0,0 +1,22 @@
+#ifndef __CONVERT_H
+#define __CONVERT_H
+
+#include <sstream>
+
+// converts the string into the specified type, setting r to the converted
+// value and returning true/false on success or failure
+template<typename T>
+bool convert(const std::string& s, T& r) {
+ std::istringstream iss(s);
+ iss >> r;
+ return iss.eof() ? true : false;
+}
+
+template<typename T>
+std::string convert(const T& r) {
+ std::ostringstream iss;
+ iss << r;
+ return iss.str();
+}
+
+#endif
diff --git a/disorder.cpp b/disorder.cpp
new file mode 100644
index 0000000..67684b4
--- /dev/null
+++ b/disorder.cpp
@@ -0,0 +1,191 @@
+/***************************************************************************
+ * libdisorder: A Library for Measuring Byte Stream Entropy
+ * Copyright (C) 2010 Michael E. Locasto
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the:
+ * Free Software Foundation, Inc.
+ * 59 Temple Place, Suite 330
+ * Boston, MA 02111-1307 USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#include <math.h> //for log2()
+#include <stdio.h> //for NULL
+#include "disorder.h"
+
+#if defined(__FreeBSD__)
+#define log2(x) (log((x)) * (1./M_LN2))
+#endif
+
+/** Frequecies for each byte */
+static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
+static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
+static int m_num_tokens = 0; //actual number of `seen' tokens, max 256
+static float m_maxent = 0.0;
+static float m_ratio = 0.0;
+static int LIBDISORDER_INITIALIZED = 0;
+
+static void
+initialize_lib()
+{
+ int i = 0;
+ if(1==LIBDISORDER_INITIALIZED)
+ return;
+
+ m_num_tokens = 0;
+
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ m_token_freqs[i]=0;
+ m_token_probs[i]=0.0;
+ }
+
+ LIBDISORDER_INITIALIZED = 1;
+}
+
+/**
+ * Set m_num_tokens by iterating over m_token_freq[] and maintaining
+ * a counter of each position that does not hold the value of zero.
+ */
+static void
+count_num_tokens()
+{
+ int i = 0;
+ int counter = 0;
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ if(0!=m_token_freqs[i])
+ {
+ counter++;
+ }
+ }
+ m_num_tokens = counter;
+ return;
+}
+
+/**
+ * Sum frequencies for each token (i.e., byte values 0 through 255)
+ * We assume the `length' parameter is correct.
+ *
+ * This function is available only to functions in this file.
+ */
+static void
+get_token_frequencies(char* buf,
+ long long length)
+{
+ int i=0;
+ char* itr=NULL;
+ unsigned char c=0;
+
+ itr = buf;
+
+ //reset number of tokens
+ m_num_tokens = 0;
+
+ //make sure freqency and probability arrays are cleared
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ m_token_freqs[i] = 0;
+ m_token_probs[i] = 0.0;
+ }
+
+ for(i=0;i<length;i++)
+ {
+ c = (unsigned char)*itr;
+ //assert(0<=c<LIBDO_MAX_BYTES);
+ m_token_freqs[c]++;
+ itr++;
+ }
+}
+
+/**
+ * Return entropy (in bits) of this buffer of bytes. We assume that the
+ * `length' parameter is correct. This implementation is a translation
+ * of the PHP code found here:
+ *
+ * http://onlamp.com/pub/a/php/2005/01/06/entropy.html
+ *
+ * with a helpful hint on the `foreach' statement from here:
+ *
+ * http://php.net/manual/en/control-structures.foreach.php
+ */
+float shannon_H(char* buf,
+ long long length)
+{
+ int i = 0;
+ float bits = 0.0;
+ char* itr=NULL; //values of itr should be zero to 255
+ unsigned char token;
+ int num_events = 0; //`length' parameter
+ float freq = 0.0; //loop variable for holding freq from m_token_freq[]
+ float entropy = 0.0; //running entropy sum
+
+ if(NULL==buf || 0==length)
+ return 0.0;
+
+ if(0==LIBDISORDER_INITIALIZED)
+ initialize_lib();
+
+ itr = buf;
+ m_maxent = 0.0;
+ m_ratio = 0.0;
+ num_events = length;
+ get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
+ //set m_num_tokens by counting unique m_token_freqs entries
+ count_num_tokens();
+
+ if(m_num_tokens>LIBDO_MAX_BYTES)
+ {
+ //report error somehow?
+ return 0.0;
+ }
+
+ //iterate through whole m_token_freq array, but only count
+ //spots that have a registered token (i.e., freq>0)
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ if(0!=m_token_freqs[i])
+ {
+ token = i;
+ freq = ((float)m_token_freqs[token]);
+ m_token_probs[token] = (freq / ((float)num_events));
+ entropy += m_token_probs[token] * log2(m_token_probs[token]);
+ }
+ }
+
+ bits = -1.0 * entropy;
+ m_maxent = log2(m_num_tokens);
+ m_ratio = bits / m_maxent;
+
+ return bits;
+}
+
+int
+get_num_tokens()
+{
+ return m_num_tokens;
+}
+
+float
+get_max_entropy()
+{
+ return m_maxent;
+}
+
+float
+get_entropy_ratio()
+{
+ return m_ratio;
+}
diff --git a/disorder.h b/disorder.h
new file mode 100644
index 0000000..3458774
--- /dev/null
+++ b/disorder.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+ * libdisorder: A Library for Measuring Byte Stream Entropy
+ * Copyright (C) 2010 Michael E. Locasto
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the:
+ * Free Software Foundation, Inc.
+ * 59 Temple Place, Suite 330
+ * Boston, MA 02111-1307 USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#ifndef __DISORDER_H_
+#define __DISORDER_H_
+
+/** Max number of bytes (i.e., tokens) */
+#define LIBDO_MAX_BYTES 256
+
+/** A convienance value for clients of this library. Feel free to change
+ * if you plan to use a larger buffer. You can also safely ignore it, as
+ * libdisorder does not use this value internally; it relies on the
+ * client-supplied `length' parameter.
+ *
+ * NB: Might become deprecated because it is potentially misleading and
+ * has zero relationship to any library internal state.
+ */
+#define LIBDO_BUFFER_LEN 16384
+
+/**
+ * Given a pointer to an array of bytes, return a float indicating the
+ * level of entropy in bits (a number between zero and eight),
+ * assuming a space of 256 possible byte values. The second argument
+ * indicates the number of bytes in the sequence. If this sequence
+ * runs into unallocated memory, this function should fail with a
+ * SIGSEGV.
+ */
+float shannon_H(char*, long long);
+
+/** Report the number of (unique) tokens seen. This is _not_ the
+ number of individual events seen. For example, if the library sees
+ the string `aaab', the number of events is 4 and the number of
+ tokens is 2. */
+int get_num_tokens(void);
+
+/** Returns maximum entropy for byte distributions log2(256)=8 bits*/
+float get_max_entropy(void);
+
+/** Returns the ratio of entropy to maxentropy */
+float get_entropy_ratio(void);
+
+#endif
diff --git a/examples.txt b/examples.txt
new file mode 100644
index 0000000..aefe062
--- /dev/null
+++ b/examples.txt
@@ -0,0 +1,76 @@
+TCTGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACACACACACACACACACACATACACACACA TCTGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACA
+AAAGGCTGGGGACCACTGATCTAAATACACCAATAAAAAGAAAAAGATTGTAAGATTGGATTTTAAAAGACCTGACTCTATACTGACCACAAAAAAAAACCCTCACT AAAGGCTGGGGACCACTGATCTAAATACACCAATAAAAAGAAAAAGATTGTAAGATTGGATTTTAAAAGACCTGACTCTATACTGACCACAAAAAAAACC
+TGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACACACACACACACACACACATACACA TGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACGCACACACACACATACACACAC
+ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT
+ATGCATGCCTCTCTCTCTCTCTCTGTCGCTCTCTCTCTCTCTCTCTCTCTCTCTCT ATGCATGCCTCTCTCTCTCTCTCTCGCTCTCTCTCTCTCTCTCTCT
+ACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCT GGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC
+GCCTGGGCAACATAGTGAGACCTTGTCTCTACAAATAGTTAAAAAAAAAAAAAAAATTAGCCAGGTGTGGTGGTGCACACATGT GCCTGGGCAACATAGTGAGACCTTGTCTCTACAAATAGTTAAAAAAAAAAAAAATTAGCCAGGTGTGGTGGTGCACACATGT
+TTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCG TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAACCTCCG
+AGTAATGGAAATACTGTTTATCATCATTAGAGTTGGGTGTATACTACTACATTACTCTCTCTCTCTCTCTATATATATATATATATATATATATATTTTTTTTT AGTAATGGAAATACTGTTTATCATCATTAGAGTTGGGTGTATACTACTACATTACTCTCTCTCTCTCTCTCTCTATATATATATATATATATTTTTTTTT
+AGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAAATACAAAAAT AGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAAATACAAAAAT
+CATACACACACACACACACACACACACACACACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGG CATACACACACACACACACACATACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGG
+GGGAGGCTGAGGCAGGAGGATCACACCACTGCACTTTAGCCTGAATACTGAGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATC AGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTGTGATGGAATT
+ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT
+CATACACACACACACACACACACACACACACACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGGC CATACACACACACACACACACATACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGGC
+TATACAGATTACTTTTATAGCTGATGAGGCAAGTCCTTCTATCATTGTTTCAAAGATTCCATGGCTTTTACTGAACATTTTCTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTTTC TTTTCTTTTTTCTTTTTTTTTTTTTGAGACGGAGTTTC
+GAGAGAGGCGCGCGCGCGCGTGCGCACGCACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG GAGAGAGGCGCGCGCGCGCGCGCGCACACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG
+TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATATGTGTGTGTGTGTGTGTGTGTGTGTATATATATATATATATATATATATCAGTTTGCCCTTGCTGGATAACAAA TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATGTGTGTGTGTGTGTATATATATATCAGTTTGCCCTTG
+AGCAAACACCTATTGTGCATTTTCTTTTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTGAGACGGAGTTTCGCTCTTGTTGTCCAGGCTAGAGTACGATGG AGCAAACACCTATTGTGCATTTTCTTTTCCTTCTTTCTTTCTTTTTTTTTTTTTTTGAGACGGAGTTTCGCTCTTGTTGTCCAGTCTAGAGTCAGTGG
+GATTTTGGTATATTGGTCTTACATTTTTTCACTTTGCTGAACTCATTTATTAGTTCTAATTCATGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTCTGTGTGTGTGTGTGTCTGTGTGTT GATTTTGGTATATTGGTCTTACATTTTTTCACTTTGCTGAACTCATTTATTAGTTCTAATTCATGGGGTGTGTGTGTGTGTGTGTGTGTGTCTGTGTGTG
+CCAGCCTGGGCGACAGAGTGAGACTCCATCTCAAAACAAACAAACAAACAAACAAACAAACAAACAACTCCACTAAGACTTTTGGGACAACCTGTACCTACCTGACCTGCCTTTCCATCTTTAATGCTCTTCT CCAGCCTGGGCGACAGAGTGAGACTCCATCTCAAAACAAACAAACAAACAAACAAACAAACAACTCCACTAAGACTTTTGGGACAACCTGTACCTATCT
+TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATATGTGTGTGTGTGTGTGTGTGTGTGTATATATATATATATATATATATATCAGTTTGCCCTTGCTGGATAACAAA TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATGTGTGTGTGTGTGTATATATATATCAGTTTGCCCTTG
+TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC
+GAGAAAGAAACTATTTAGCATTGTGGCTTTCATAATTTCTTTCTTTCTTTTTTTTTTTTTTTTTGAAGCAGAGTCTAGCTCTGTCGCCCAGGCTGGAAAGCAGTGGTGCAAACTC TTAGCATTGTGGCTTTCATAATTTCTTTCTTTTTTTTTTTTTTTTTTTTTAAGCAGAGTCTAGCTCTGTCCCCCCGGCTGGAAAGCAGTGGTGCAACCTC
+GGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATAT GTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTATATATATATATATACACACATATATATATGTATATATATGTATATATATATGTGTAT
+GAGAGAGGCGCGCGCGCGCGTGCGCACGCACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG GAGAGAGGCGCGCGCGCGCGCGCGCACACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG
+ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT
+TTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCGCCTCC TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAACCTCCGCCTCT
+GAGCCGAGATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAAACAAAAAAAAACAAAAAATCCCCAGCA ATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAGCAAAAAAAACAAAAAACCCCCAGCA
+GAGCCGAGATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAAACAAAAAAAAACAAAAAATCCCCAGCA ATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAGCAAAAAAAACAAAAAACCCCCAGCA
+TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTG AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTG
+AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC
+TATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGC TATATAATATCTAATATATATAATATATATATATAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGA
+AGAGGCAGTAGATTTAGGGACCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTC TAGATTTAGGGAGCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATT
+ACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTC GGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC
+CAAGGCTCCAAAATTTTCAGATGAAATGTACAGAAAGAATACAATCTCATTTTAATAGTTTTTTTTTTTTTTAAAAAGGTCCTTGACCAATTCCCCAAGGTCCAT CAAGGCTCCAAAATTTTCAGATGAAATGTACAGAAAGAATACAATCTCATTTTAATAGTTTTTTTTTTTTAAAAAAGGTCCTTGACCAATTCCCCAAGGT
+AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC
+TACTTCATATGTTCCACGCTCTGGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGT TTCCCATTTCTGTTTATGTTAAAAACAAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAA
+GGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGTAGGAGGGGGCTACTTGAGATCAGGTGGTCACGCTGAGAGTTGA TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT
+TTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT
+ACCTTGAATATGAACACTCTAAATGCTCCACTTAAGAGGCACAGAGTAGCAAGTTGGAAAAGAAAAGAAAAGAAAAGAAAAGAAAACTCATCTGTCTGCAG ACCTTGAATATGAACACTCTAAATGCTCCACTTAAGAGGCACAGAGTAGCAAGTTGGAAAAGAAAAGAAAAGAAAAGAAAACAAAACAAAACTCATCTGT
+TTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAAC TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAAC
+AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAG AGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTGACTAGGCTAACATTTTGGTATATATCTTTCCAG
+GACCTTTCTATATATGGTTTTACAATCGGATCAATCGAGATCCCCTCCCCTCCTTAGAGGCCACTAATAAAAAAGAAGAACCAAGAAAAAGAAAAAGAAAGA GACCTTTCTATATATGGTTTTACAATCGGATCAATCGAGATCCCCTCCCCTCCTTAGAGGCCACTAATAAAAAAGAAGAACCAAGAAAAAGAAAAAGA
+AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAG AGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTGACTAGGCTAACATTTTGGTATATATCTTTCCAG
+ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT
+GGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGTAGGAGGGGGCTACTTGAGATCAGGTGGTCACGCTGAGAGTTGA TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT
+TAAGAAAGAATCGTCAAAAAAAGAAGTGTAGTGTAATAAAATATTAGTCATACATAAACGAACAATTAAACAAAAAACAGAACAAAGCACACACACACACACA TAAGAAAGAATCGTCAAAAAAAGAAGTGTAGTGTAATAAAATATTAGTCATACATAAACGAACAATTAAACAAAAAACAGAACAAAGCACACACACACACACA
+TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC
+AAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAGTCCGGATCCTGTGTGACTGAGTGTGTATATGCATATGTATTATTTTCAAC AAGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCCGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCA
+GAATGCACATTTGGTCCTTCATTAACTCATTTACTCACAATTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTTTCACTCTTGTCGCCCAGGCTGGAGTGCAATGGTGTGA ACAATTTTTTTTTTTTTTTTTTTGAGACGGAGTTTCACTCTTGTCGCCCAGGCTGGAGTGCAATGGTGTGA
+CCACTGCACTCCAGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAA CCACTGCACTCCAGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAA
+ATTCCAATACTATTCAATTGTTCCACAGCACAGAAAAGAAAAAATTCTAAATTCTTTCTATGGAACAAAAAAATCATCAATGACACCTGACCAAAGATGGCACACACACACACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA GGGACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA
+GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT AAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT
+GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT CAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT
+ATTGCTTGAGCCCAGGAGTTCAGGGCTGCAATGAGCTATGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGTGAGATCCTGTCTCTAAAATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATGTGTGTGTGTCTGTACATATACGTATATATATATGTGTGTATATATACAT AATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATGTGTGTGTGTCTGTACATATACGTATATATATATGTGTGTATATATACAT
+GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TCTCATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC
+GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TATATATATATATATATATATATATATATATATATTGAGACAGTCTCGCTCTGTCACCC
+ATTCCAATACTATTCAATTGTTCCACAGCACAGAAAAGAAAAAATTCTAAATTCTTTCTATGGAACAAAAAAATCATCAATGACACCTGACCAAAGATGGCACACACACACACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA GGGACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA
+GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TATATATATATATATATATATATATATATATATATTGAGACAGTCTCGCTCTGTCACCC
+CACAAGAACTGCAATTCCTAGGCAACTGCTAGTGCTGTGCTGGGCTCAGAGGCAGTAGATTTAGGGACCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTCAGCCTCCCAAGTAGCTGGTGATGGCAGTGGCAGCCCATCTGGAGTGGACGCTGCCATCAAGCCAGCTGCAGCAGGGAGGGACAGCTGGGGCTGCACAT GGTCAGAGAGAGTATATAGAGAGAGCACACACCAGAGAGAGACACGCTGGGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC
+CGCCTGTAGTCTCAGCTATTAATATTTGGGAGGCTGAGGCAGGAGGATCACACCACTGCACTTTAGCCTGAATACTGAGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCGCCTCCTGGGTTCAAGCAATTCTGCTTCAGCCTCCCGAGTGGCTGGGATTATAGT CTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTGGGTTGGAATTTTACTCTTGTTG
+CTCGGCTCACTGCAAGCTCCACCTCCCGGGTTCACGCCATTCTCCTGCCTCAGTCTCCCAAGTAGCTGGGACTACAGGTGCCCACCACCACGTCTGGCTAATTTTTTTGTATTTTTAGTAGAGATGGGGATTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACTTTGTGATCCACCCGCCTCAGCCTCCCAAAGCCTCCTTCACTTTTCTTTATTAGCCTCAACCCCATGATTCACCACTCCAAGTACTCCCTTGCCAGCATCCTCAAATCCCAATACCATTTTTAAAATTTTTTAA ATTTTTTTGTATTTTTAGTAGAGATGGGGATTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACTTTGTGATCCGCCCGCCTCAGCCTCCCAAAGCC
+TTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATG TTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATG
+GGAGGCAGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATATGTAT CCTGTGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATACATATATATATATATGTATATATATGTATATA
+TAATTTACAACAACACGTAAGTTGTTACTCTGTAAACCCTTGCCTCCCCCCCACCCCCCACCCAATTGGGTCTTTTTTTTTTTTCTCTCTCTCCATGCTTCTGCAGTGACTCTTAAGTAGCATTTTTAAAAACTTC CCACCCCCCACCCAATTGGGTCTTTTTTTTTTTTCTCTCTCTCCATGCTTCTGCAGTGACTCTTAAGTAGCATTTTTTAAAACTTCTATTTATTTTAAAA
+GTTCACACTTTTGGCCATACCCAGGGTCAGCCATGAAGATTGTTCTCAGAATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTTTTTCTTTTCTTCTTTCTCTCTCTTTCTTTCTCTCTCTCTCTCTCCTTCCTTCCTTCCTTCT ATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTCTTTTTTCTTTTCT
+GTTCACACTTTTGGCCATACCCAGGGTCAGCCATGAAGATTGTTCTCAGAATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTTTTTCTTTTCTTCTTTCTCTCTCTTTCTTTCTCTCTCTCTCTCTCCTTCCTTCCTTCCTTCT ATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTCTTTTTTCTTTTCT
+GGAGGCAGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATATGTAT CCTGTGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATACATATATATATATATGTATATATATGTATATA
+CTCCCACTGGGGTAATCCATCTTTTCTTTTAATTATTTTCCTTTTGAGATATATTAAAAATGCAAAAAAAAAAATTTTTATTTTTTTGAGACGGAGTCTCGCTCTATCACCTAGGCTGGAGTGCAGTGGCATGATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTTCAACTGATTCTCCTGCCTCAGCCTCCTGAATAG ATATTAAAAATGCAAAAAAAAATTTTTTTTATTTTTTTGAGACGGAGTCTCGCTCTATCGCCTAGGCTGGAGTGCAGTGGCATGATCTTGGCTCACTGCA
+GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT CAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT
+CAAGATCGTGCCGCTGCACTCCAGCCCAGGTGACAGAGCGAGACTTCATCTCAAAAAAAAAAAGGGCGCCAAACATCTACTGTGTACCCACAAAAATTAAAATTATAAAAAGACGGCATCAGCAATCCCAGGAGGTGATGTGTCCCTGGTTGGTGTACCTCAGGAGTTGCTGCATTTGCCTCACATCACCATGTGAGATAA CAAAAAAAAAAAAGGGCGCCAAACATCTACTGTGTACCCACAAAAATTAAAATTATAAAAAGACGGCATCAGCAATCCCAGGAGGTGATGTGTCCCTGGT
+CCTCTTAGGTGGTCACATCCTAGAGAGGGGGAAATTACATCAGAAAAGGACCAATGCCAAATTACAGCAACAAAGGGAAAGTAATCCTGGAAGCTGATTTAAGCTATGTGACTGTGTCTTCAATTAAAATATTCAGTCCCTTCCCCTCCCCCTCCCCCTCCCTTCCGTCTCCCTCTGTTGCTGAGGCTGGACTG CCAATGCCAAATTACAGCAACAAAGGGAAAGTAATCCTGGAAGCTGATTTAAGCTATGTGACTGTGTCTTCAATTAAAATATTCAGTCCCTCCCCCTCCC
+CCAAGTGACCCTTTCACCTCAGCTTCCCAAGTAGCTGGGATTACAGGTGCACACCAACTGTGCTTTGCAGTTTTGTTTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTTTGTTTTTTTTACAAGCCCACAACATCATAGC TCCTCAGCGTCCCAAGTAGCTGGGAGTAGAGATGCACACCAACTGTGCTTTGCAGTTTTGTTTGTGTGTGTGTGTGTGTGTGTGTGTGTATTTTGTTTTT
+CAAGATTTGCCACTGCACTCCAGCCTGGGTGACAGAGTGAGACTGTATCTCAAAAAAAAAAAATAAATAAATAAAGAGAATAAGGCATTTGATATAGTTTCTTTGCATAACAAAATACAAATAAAACCACATTCTATTTCATCTAAACACTTTCTCCCAAGTCATTCTCTCTTAGCCTCA GGACAGGGGGAGACTGTATCTCAAAAAAAAAAAAAAAATAAATAAAGAGAATAAGGCATTTGATATAGTTTCTTTGCATAACAAAATACAAATAAAACCA
+GATCATGCCACTGCACTCCAGCCTGGGCAAAAGAGCGAGACTCTGTCTCAAAAAAAAAAAAAAAATCCAGAAAGAATTGGCACACCTATGTTGTTAAGTTTTCCAATCCAAGAATACTGTATTCCTTATCATTTTT AAAAAAAAAAAAAAATCCAGAAAGAATTGGCACCCTTTTGTTGTAAAGTTTTCCATTCCAAGAAAACTGGATTCCTTCTTTTTTTTTTCTTTTTGTTTCT
+TCCCTCCCTTCCTCCCTTTCTCTCCCTCTCCCTCTCTTTCTTTCTCTCTCTCTCTTTCTCCCCTTCTTTCTTTCTTTCTCTCTCTCTTTTTCTTTCTTTCTTTCTTTCTTTC TCCCTCCCTTCCTCCCTTTCTCTCCCTCTCCCTCTCTTTCTTTCTCTCTCTCTCTTTCTCCCCTTCTTTTTTTCTCTCTCTCTCTTTTTTTTTCTTTCTC
diff --git a/libdisorder.LICENSE b/libdisorder.LICENSE
new file mode 100644
index 0000000..8bbcff9
--- /dev/null
+++ b/libdisorder.LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/smithwaterman.cpp b/smithwaterman.cpp
new file mode 100644
index 0000000..92aa73c
--- /dev/null
+++ b/smithwaterman.cpp
@@ -0,0 +1,306 @@
+#include <iostream>
+#include <string.h>
+#include <string>
+#include <sstream>
+#include <getopt.h>
+#include <utility>
+#include <vector>
+#include <stdlib.h>
+#include "SmithWatermanGotoh.h"
+#include "BandedSmithWaterman.h"
+
+using namespace std;
+
+/* Returns the Reverse Complement of a DNA Sequence, from the alphabet {A,T,C,G,N} */
+string reverseComplement(string read) {
+
+ // Declare the (empty) reverse complement read as a string
+ string rc_read;
+
+ // Reverse Read
+ rc_read.assign(read.rbegin(), read.rend());
+
+ // Complement. Note that not IUPAC compliant. Uses the alphabet {A,T,C,G,N}
+ string::iterator t;
+ for (t = rc_read.begin(); t != rc_read.end(); ++t) {
+ switch (*t) {
+ case 'A':
+ *t = 'T';
+ break;
+ case 'T':
+ *t = 'A';
+ break;
+ case 'C':
+ *t = 'G';
+ break;
+ case 'G':
+ *t = 'C';
+ break;
+ case 'N':
+ *t = 'N';
+ break;
+ default:
+ cout << "Unknown Nucleotide!";
+ break;
+ }
+ }
+
+ // Return the Read (faster if done through pointers?)
+ return rc_read;
+}
+
+
+void printSummary(void) {
+ cerr << "usage: smithwaterman [options] <reference sequence> <query sequence>" << endl
+ << endl
+ << "options:" << endl
+ << " -m, --match-score the match score (default 10.0)" << endl
+ << " -n, --mismatch-score the mismatch score (default -9.0)" << endl
+ << " -g, --gap-open-penalty the gap open penalty (default 15.0)" << endl
+ << " -z, --entropy-gap-open-penalty enable entropy scaling of the gap open penalty" << endl
+ << " -e, --gap-extend-penalty the gap extend penalty (default 6.66)" << endl
+ << " -r, --repeat-gap-extend-penalty use repeat information when generating gap extension penalties" << endl
+ << " -b, --bandwidth bandwidth to use (default 0, or non-banded algorithm)" << endl
+ << " -p, --print-alignment print out the alignment" << endl
+ << " -R, --reverse-complement report the reverse-complement alignment if it scores better" << endl
+ << endl
+ << "When called with literal reference and query sequences, smithwaterman" << endl
+ << "prints the cigar match positional string and the match position for the" << endl
+ << "query sequence against the reference sequence." << endl;
+}
+
+
+int main (int argc, char** argv) {
+
+ int c;
+
+ string reference;
+ string query;
+
+ int bandwidth = 0;
+
+ float matchScore = 10.0f;
+ float mismatchScore = -9.0f;
+ float gapOpenPenalty = 15.0f;
+ float gapExtendPenalty = 6.66f;
+ float entropyGapOpenPenalty = 0.0f;
+ bool useRepeatGapExtendPenalty = false;
+ float repeatGapExtendPenalty = 1.0f;
+
+ bool print_alignment = false;
+ bool tryReverseComplement = false;
+
+ while (true) {
+ static struct option long_options[] =
+ {
+ {"help", no_argument, 0, 'h'},
+ {"match-score", required_argument, 0, 'm'},
+ {"mismatch-score", required_argument, 0, 'n'},
+ {"gap-open-penalty", required_argument, 0, 'g'},
+ {"entropy-gap-open-penalty", required_argument, 0, 'z'},
+ {"gap-extend-penalty", required_argument, 0, 'e'},
+ {"repeat-gap-extend-penalty", required_argument, 0, 'r'},
+ {"print-alignment", required_argument, 0, 'p'},
+ {"bandwidth", required_argument, 0, 'b'},
+ {"reverse-complement", no_argument, 0, 'R'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hpRzm:n:g:r:e:b:r:",
+ long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case 0:
+ /* If this option set a flag, do nothing else now. */
+ if (long_options[option_index].flag != 0)
+ break;
+ printf ("option %s", long_options[option_index].name);
+ if (optarg)
+ printf (" with arg %s", optarg);
+ printf ("\n");
+ break;
+
+ case 'R':
+ tryReverseComplement = true;
+ break;
+
+ case 'm':
+ matchScore = atof(optarg);
+ break;
+
+ case 'n':
+ mismatchScore = atof(optarg);
+ break;
+
+ case 'g':
+ gapOpenPenalty = atof(optarg);
+ break;
+
+ case 'z':
+ entropyGapOpenPenalty = 1;
+ break;
+
+ case 'r':
+ useRepeatGapExtendPenalty = true;
+ repeatGapExtendPenalty = atof(optarg);
+ break;
+
+ case 'e':
+ gapExtendPenalty = atof(optarg);
+ break;
+
+ case 'b':
+ bandwidth = atoi(optarg);
+ break;
+
+ case 'p':
+ print_alignment = true;
+ break;
+
+ case 'h':
+ printSummary();
+ exit(0);
+ break;
+
+ case '?':
+ /* getopt_long already printed an error message. */
+ printSummary();
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ /* Print any remaining command line arguments (not options). */
+ if (optind == argc - 2) {
+ //cerr << "fasta file: " << argv[optind] << endl;
+ reference = string(argv[optind]);
+ ++optind;
+ query = string(argv[optind]);
+ } else {
+ cerr << "please specify a reference and query sequence" << endl
+ << "execute " << argv[0] << " --help for command-line usage" << endl;
+ exit(1);
+ }
+
+ // initialize
+
+ unsigned int referencePos;
+ string cigar;
+
+ float bestScore = 0;
+
+ bool alignedReverse = false;
+
+ // create a new Smith-Waterman alignment object
+ if (bandwidth > 0) {
+ pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
+ hr.first.first = 2;
+ hr.first.second = 18;
+ hr.second.first = 1;
+ hr.second.second = 17;
+ CBandedSmithWaterman bsw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty, bandwidth);
+ bsw.Align(referencePos, cigar, reference, query, hr);
+ } else {
+ CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+ if (useRepeatGapExtendPenalty)
+ sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+ if (entropyGapOpenPenalty > 0)
+ sw.EnableEntropyGapPenalty(entropyGapOpenPenalty);
+ sw.Align(referencePos, cigar, reference, query);
+ bestScore = sw.BestScore;
+ if (tryReverseComplement) {
+ string queryRevC = reverseComplement(query);
+ sw.Align(referencePos, cigar, reference, query);
+ if (sw.BestScore > bestScore) {
+ alignedReverse = true;
+ bestScore = sw.BestScore;
+ query = queryRevC;
+ }
+ }
+ }
+
+ printf("%s %3u %f %s\n", cigar.c_str(), referencePos, bestScore, (alignedReverse ? "-" : "+"));
+
+ // optionally print out the alignment
+ if (print_alignment) {
+ int alignmentLength = 0;
+ int len;
+ string slen;
+ vector<pair<int, char> > cigarData;
+ for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
+ switch (*c) {
+ case 'I':
+ len = atoi(slen.c_str());
+ slen.clear();
+ cigarData.push_back(make_pair(len, *c));
+ break;
+ case 'D':
+ len = atoi(slen.c_str());
+ alignmentLength += len;
+ slen.clear();
+ cigarData.push_back(make_pair(len, *c));
+ break;
+ case 'M':
+ len = atoi(slen.c_str());
+ alignmentLength += len;
+ slen.clear();
+ cigarData.push_back(make_pair(len, *c));
+ break;
+ case 'S':
+ len = atoi(slen.c_str());
+ slen.clear();
+ cigarData.push_back(make_pair(len, *c));
+ break;
+ default:
+ len = 0;
+ slen += *c;
+ break;
+ }
+ }
+
+ string gapped_ref = string(reference).substr(referencePos, alignmentLength);
+ string gapped_query = string(query);
+
+ int refpos = 0;
+ int readpos = 0;
+ for (vector<pair<int, char> >::iterator c = cigarData.begin(); c != cigarData.end(); ++c) {
+ int len = c->first;
+ switch (c->second) {
+ case 'I':
+ gapped_ref.insert(refpos, string(len, '-'));
+ readpos += len;
+ refpos += len;
+ break;
+ case 'D':
+ gapped_query.insert(readpos, string(len, '-'));
+ refpos += len;
+ readpos += len;
+ break;
+ case 'M':
+ readpos += len;
+ refpos += len;
+ break;
+ case 'S':
+ readpos += len;
+ gapped_ref.insert(refpos, string(len, '*'));
+ refpos += len;
+ break;
+ default:
+ break;
+ }
+ }
+
+ cout << gapped_ref << endl << gapped_query << endl;
+ }
+
+ return 0;
+
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libsmithwaterman.git
More information about the debian-med-commit
mailing list