[med-svn] r2475 - in trunk/packages: . last-align last-align/branches last-align/branches/upstream last-align/branches/upstream/current last-align/branches/upstream/current/src
plessy at alioth.debian.org
plessy at alioth.debian.org
Thu Sep 11 06:55:42 UTC 2008
Author: plessy
Date: 2008-09-11 06:55:41 +0000 (Thu, 11 Sep 2008)
New Revision: 2475
Added:
trunk/packages/last-align/
trunk/packages/last-align/branches/
trunk/packages/last-align/branches/upstream/
trunk/packages/last-align/branches/upstream/current/
trunk/packages/last-align/branches/upstream/current/src/
trunk/packages/last-align/branches/upstream/current/src/Alignment.cc
trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh
trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc
trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc
trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc
trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc
trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh
trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh
trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh
trunk/packages/last-align/branches/upstream/current/src/lastal.cc
trunk/packages/last-align/branches/upstream/current/src/lastdb.cc
Log:
[svn-inject] Installing original source of last-align
Added: trunk/packages/last-align/branches/upstream/current/src/Alignment.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/Alignment.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/Alignment.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,201 @@
+// Copyright 2008 Martin C. Frith
+
+#include "Alignment.hh"
+#include "XdropAligner.hh"
+#include "MultiSequence.hh"
+#include "Alphabet.hh"
+#include "GeneralizedAffineGapCosts.hh"
+#include "stringify.hh"
+#include <iomanip>
+#include <cassert>
+
+// make C++ tolerable:
+#define CI(type) std::vector<type>::const_iterator
+
+namespace cbrc{
+
+void Alignment::fromSegmentPair( const SegmentPair& sp ){
+ blocks.assign( 1, sp );
+ score = sp.score;
+}
+
+void Alignment::makeXdrop( XdropAligner& aligner, const SegmentPair& seed,
+ const uchar* seq1, const uchar* seq2,
+ const int scoreMatrix[MAT][MAT], int maxDrop,
+ const GeneralizedAffineGapCosts& gap ){
+ assert( seed.size > 0 ); // relax this requirement?
+
+ int reverseScore =
+ aligner.fill( seq1, seq2, seed.beg1(), seed.beg2(),
+ XdropAligner::REVERSE, scoreMatrix, maxDrop, gap );
+
+ aligner.traceback( blocks, seq1, seq2, seed.beg1(), seed.beg2(),
+ XdropAligner::REVERSE, scoreMatrix, gap );
+
+ if( !blocks.empty() &&
+ blocks.back().end1() == seed.beg1() &&
+ blocks.back().end2() == seed.beg2() ){
+ blocks.back().size += seed.size;
+ }
+ else blocks.push_back(seed);
+
+ int forwardScore =
+ aligner.fill( seq1, seq2, seed.end1(), seed.end2(),
+ XdropAligner::FORWARD, scoreMatrix, maxDrop, gap );
+
+ std::vector<SegmentPair> forwardBlocks;
+ aligner.traceback( forwardBlocks, seq1, seq2, seed.end1(), seed.end2(),
+ XdropAligner::FORWARD, scoreMatrix, gap );
+
+ SegmentPair& b = blocks.back();
+ if( !forwardBlocks.empty() &&
+ b.end1() == forwardBlocks.back().beg1() &&
+ b.end2() == forwardBlocks.back().beg2() ){
+ b.size += forwardBlocks.back().size;
+ forwardBlocks.pop_back();
+ }
+
+ std::reverse( forwardBlocks.begin(), forwardBlocks.end() );
+ blocks.insert( blocks.end(), forwardBlocks.begin(), forwardBlocks.end() );
+
+ score = reverseScore + seed.score + forwardScore;
+}
+
+bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2,
+ const int scoreMatrix[MAT][MAT], int maxDrop,
+ const GeneralizedAffineGapCosts& gap ){
+ int maxScore = 0;
+ int runningScore = 0;
+
+ for( CI(SegmentPair) i = blocks.begin(); i < blocks.end(); ++i ){
+ if( i > blocks.begin() ){
+ indexT gapSize1 = i->beg1() - (i-1)->end1();
+ indexT gapSize2 = i->beg2() - (i-1)->end2();
+ runningScore -= gap.cost( gapSize1, gapSize2 );
+ if( runningScore <= 0 || runningScore < maxScore - maxDrop ){
+ return false;
+ }
+ }
+
+ const uchar* s1 = seq1 + i->beg1();
+ const uchar* s2 = seq2 + i->beg2();
+ const uchar* e1 = seq1 + i->end1();
+
+ while( s1 < e1 ){
+ runningScore += scoreMatrix[ *s1++ ][ *s2++ ];
+ if( runningScore > maxScore ) maxScore = runningScore;
+ else if( runningScore <= 0 || // non-optimal prefix
+ s1 == e1 && i+1 == blocks.end() || // non-optimal suffix
+ runningScore < maxScore - maxDrop ){ // excessive score drop
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+void Alignment::write( const MultiSequence& seq1, const MultiSequence& seq2,
+ char strand, const Alphabet& alph, int format,
+ std::ostream& os ) const{
+ /**/ if( format == 0 ) writeTab( seq1, seq2, strand, os );
+ else if( format == 1 ) writeMaf( seq1, seq2, strand, alph, os );
+}
+
+void Alignment::writeTab( const MultiSequence& seq1, const MultiSequence& seq2,
+ char strand, std::ostream& os ) const{
+ indexT size2 = seq2.ends.back();
+ indexT w1 = seq1.whichSequence( beg1() );
+ indexT w2 = seq2.whichSequence( strand == '+' ? beg2() : size2 - beg2() );
+ indexT seqStart1 = seq1.seqBeg(w1);
+ indexT seqStart2 = strand == '+' ? seq2.seqBeg(w2) : size2 - seq2.seqEnd(w2);
+
+ os << score << '\t'
+ << seq1.seqName(w1) << '\t'
+ << beg1() - seqStart1 << '\t'
+ << range1() << '\t'
+ << '+' << '\t'
+ << seq1.seqLen(w1) << '\t'
+ << seq2.seqName(w2) << '\t'
+ << beg2() - seqStart2 << '\t'
+ << range2() << '\t'
+ << strand << '\t'
+ << seq2.seqLen(w2) << '\t';
+
+ for( unsigned i = 0; i < blocks.size(); ++i ){
+ if( i > 0 ){
+ os << blocks[i].beg1() - blocks[i-1].end1() << ':'
+ << blocks[i].beg2() - blocks[i-1].end2() << ',';
+ }
+ os << blocks[i].size << ( i+1 < blocks.size() ? ',' : '\n' );
+ }
+}
+
+void Alignment::writeMaf( const MultiSequence& seq1, const MultiSequence& seq2,
+ char strand, const Alphabet& alph,
+ std::ostream& os ) const{
+ indexT size2 = seq2.ends.back();
+ indexT w1 = seq1.whichSequence( beg1() );
+ indexT w2 = seq2.whichSequence( strand == '+' ? beg2() : size2 - beg2() );
+ indexT seqStart1 = seq1.seqBeg(w1);
+ indexT seqStart2 = strand == '+' ? seq2.seqBeg(w2) : size2 - seq2.seqEnd(w2);
+
+ const std::string n1 = seq1.seqName(w1);
+ const std::string n2 = seq2.seqName(w2);
+ const std::string b1 = stringify( beg1() - seqStart1 );
+ const std::string b2 = stringify( beg2() - seqStart2 );
+ const std::string r1 = stringify( range1() );
+ const std::string r2 = stringify( range2() );
+ const std::string s1 = stringify( seq1.seqLen(w1) );
+ const std::string s2 = stringify( seq2.seqLen(w2) );
+
+ const std::size_t nw = std::max( n1.size(), n2.size() );
+ const std::size_t bw = std::max( b1.size(), b2.size() );
+ const std::size_t rw = std::max( r1.size(), r2.size() );
+ const std::size_t sw = std::max( s1.size(), s2.size() );
+
+ os << "a score=" << score << '\n'
+ << "s "
+ << std::setw( nw ) << std::left << n1 << std::right << ' '
+ << std::setw( bw ) << b1 << ' '
+ << std::setw( rw ) << r1 << ' ' << '+' << ' '
+ << std::setw( sw ) << s1 << ' ' << topString( seq1.seq, alph ) << '\n'
+ << "s "
+ << std::setw( nw ) << std::left << n2 << std::right << ' '
+ << std::setw( bw ) << b2 << ' '
+ << std::setw( rw ) << r2 << ' ' << strand << ' '
+ << std::setw( sw ) << s2 << ' ' << botString( seq2.seq, alph ) << '\n'
+ << '\n'; // blank line afterwards
+}
+
+std::string Alignment::topString( const std::vector<uchar>& seq,
+ const Alphabet& alph ) const{
+ std::string s;
+ for( unsigned i = 0; i < blocks.size(); ++i ){
+ if( i > 0 ){
+ s.append( alph.rtString( seq.begin() + blocks[i-1].end1(),
+ seq.begin() + blocks[i].beg1() ) );
+ s.append( blocks[i].beg2() - blocks[i-1].end2(), '-' );
+ }
+ s.append( alph.rtString( seq.begin() + blocks[i].beg1(),
+ seq.begin() + blocks[i].end1() ) );
+ }
+ return s;
+}
+
+std::string Alignment::botString( const std::vector<uchar>& seq,
+ const Alphabet& alph ) const{
+ std::string s;
+ for( unsigned i = 0; i < blocks.size(); ++i ){
+ if( i > 0 ){
+ s.append( blocks[i].beg1() - blocks[i-1].end1(), '-' );
+ s.append( alph.rtString( seq.begin() + blocks[i-1].end2(),
+ seq.begin() + blocks[i].beg2() ) );
+ }
+ s.append( alph.rtString( seq.begin() + blocks[i].beg2(),
+ seq.begin() + blocks[i].end2() ) );
+ }
+ return s;
+}
+
+} // end namespace cbrc
Added: trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,58 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds alignments, and includes a procedure to
+// non-redundantize alignments that share endpoints.
+
+#ifndef ALIGNMENTPOT_HH
+#define ALIGNMENTPOT_HH
+#include "Alignment.hh"
+#include <vector>
+
+namespace cbrc{
+
+struct AlignmentPot{
+ typedef std::vector<Alignment>::iterator iterator;
+
+ // add an alignment to the pot
+ void add( const Alignment& aln ) { items.push_back(aln); }
+
+ // the number of alignments in the pot
+ std::size_t size() { return items.size(); }
+
+ // erase any alignment that shares an endpoint with a higher-scoring
+ // alignment
+ void eraseSuboptimal();
+
+ // sort the alignments in descending order of score
+ void sort() { std::sort( items.begin(), items.end(), moreScore ); }
+
+ // data:
+ std::vector<Alignment> items;
+
+ static bool moreScore( const Alignment& x, const Alignment& y ){
+ // Try to break ties, so that alignments come in a consistent
+ // order. This makes it easier to compare different results.
+ return x.score != y.score ? x.score > y.score : lessBeg( x, y );
+ }
+
+ static bool lessBeg( const Alignment& x, const Alignment& y ){
+ return
+ x.beg1() != y.beg1() ? x.beg1() < y.beg1() :
+ x.beg2() != y.beg2() ? x.beg2() < y.beg2() :
+ x.score > y.score;
+ }
+
+ static bool lessEnd( const Alignment& x, const Alignment& y ){
+ return
+ x.end1() != y.end1() ? x.end1() < y.end1() :
+ x.end2() != y.end2() ? x.end2() < y.end2() :
+ x.score > y.score;
+ }
+
+ static void mark( Alignment& a ) { a.blocks[0].score = -1; }
+
+ static bool isMarked( Alignment& a ) { return a.blocks[0].score == -1; }
+};
+
+} // end namespace cbrc
+#endif // ALIGNMENTPOT_HH
Added: trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,140 @@
+// Copyright 2008 Martin C. Frith
+
+#include "Alphabet.hh"
+#include <istream>
+#include <ostream>
+#include <stdexcept>
+
+namespace cbrc{
+
+const char* Alphabet::dna = "ACGT";
+
+// U=selenocysteine, O=pyrrolysine, *=stop?
+const char* Alphabet::protein = "ACDEFGHIKLMNPQRSTVW";
+
+// need to allow "*", because blosum62 includes it:
+const char* Alphabet::all = "ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
+
+void Alphabet::fromString( const std::string& alphString ){
+ letters = alphString;
+ init();
+}
+
+void Alphabet::makeCaseInsensitive(){
+ for( const char* i = all; *i; ++i ){
+ uchar upper = std::toupper( *i );
+ uchar lower = std::tolower( *i );
+ encode[lower] = encode[upper];
+ }
+}
+
+void Alphabet::tr( std::vector<uchar>::iterator beg,
+ std::vector<uchar>::iterator end ) const{
+ for( /* noop */; beg < end; ++beg ){
+ uchar code = encode[ *beg ];
+ if( code == 255 ){
+ throw std::runtime_error( std::string("bad symbol: ") + char(*beg) );
+ }
+ *beg = code;
+ }
+}
+
+std::string Alphabet::rtString( std::vector<uchar>::const_iterator beg,
+ std::vector<uchar>::const_iterator end ) const{
+ std::string s;
+ for( /* noop */; beg < end; ++beg ){
+ s += decode[ *beg ];
+ }
+ return s;
+}
+
+void Alphabet::rc( std::vector<uchar>::iterator beg,
+ std::vector<uchar>::iterator end ) const{
+ std::reverse( beg, end );
+
+ for( /* noop */; beg < end; ++beg ){
+ *beg = complement[ *beg ];
+ }
+}
+
+// perhaps slower than it could be:
+void Alphabet::mergeImproperCodes( std::vector<uchar>::iterator beg,
+ std::vector<uchar>::iterator end ) const{
+ for( /* noop */; beg < end; ++beg ){
+ if( *beg > size ) *beg = size;
+ }
+}
+
+void Alphabet::init(){
+ size = letters.size();
+
+ // std::toupper doesn't work here!
+ std::transform( letters.begin(), letters.end(), letters.begin(), toupper );
+
+ std::fill_n( encode, 256, 255 ); // fill encode with value 255
+
+ uchar code = 0;
+
+ // add the "real" alphabet letters:
+ for( std::string::iterator i = letters.begin(); i < letters.end(); ++i ){
+ uchar letter = *i;
+ if( letter == ' ' || encode[ letter ] != 255 ){
+ throw std::runtime_error( "bad alphabet: " + letters );
+ }
+ code = newCode( letter, code );
+ }
+
+ // add space as a delimiter symbol:
+ code = newCode( ' ', code );
+
+ // add any remaining letters (e.g. ambiguous nucleotides):
+ for( const char* i = all; *i; ++i ){
+ uchar letter = *i;
+ if( encode[ letter ] == 255 ) code = newCode( letter, code );
+ }
+
+ // add lowercase letters:
+ for( const char* i = all; *i; ++i ){
+ uchar letter = std::tolower( *i );
+ if( encode[ letter ] == 255 ) code = newCode( letter, code );
+ }
+
+ makeComplement();
+}
+
+Alphabet::uchar Alphabet::newCode( uchar letter, uchar code ){
+ encode[letter] = code;
+ decode[code] = letter;
+ canonical[code] = encode[ std::toupper(letter) ];
+ return code + 1;
+}
+
+void Alphabet::makeComplement(){
+ static const char* x = "ACGTRYSWKMBDHVN";
+ static const char* y = "TGCAYRSWMKVHDBN";
+
+ for( unsigned i = 0; i < 256; ++i ){
+ complement[i] = i;
+ }
+
+ for( unsigned i = 0; x[i] && y[i]; ++i ){
+ uchar xUpper = std::toupper( x[i] );
+ uchar yUpper = std::toupper( y[i] );
+ complement[ encode[xUpper] ] = encode[yUpper];
+ uchar xLower = std::tolower( x[i] );
+ uchar yLower = std::tolower( y[i] );
+ complement[ encode[xLower] ] = encode[yLower];
+ }
+}
+
+std::ostream& operator<<( std::ostream& s, const Alphabet& a ){
+ return s << a.letters;
+}
+
+std::istream& operator>>( std::istream& s, Alphabet& a ){
+ s >> a.letters;
+ if( s ) a.init();
+ return s;
+}
+
+} // end namespace cbrc
Added: trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,115 @@
+// Copyright 2008 Martin C. Frith
+
+#include "MultiSequence.hh"
+#include "io.hh"
+#include <sstream>
+#include <cassert>
+
+namespace cbrc{
+
+void MultiSequence::initForAppending( indexT padSizeIn ){
+ padSize = padSizeIn;
+ nameEnds.push_back( names.size() );
+ finish();
+}
+
+void MultiSequence::reinitForAppending(){
+ seq.erase( seq.begin(), seq.begin() + ends.back() - padSize );
+ names.erase( names.begin(),
+ names.begin() + nameEnds[ finishedSequences() ] );
+ ends.resize(1);
+ nameEnds.resize(1);
+ if( !names.empty() ) nameEnds.push_back( names.size() );
+}
+
+void MultiSequence::fromFiles( const std::string& baseName, indexT seqCount ){
+ ends.resize( seqCount + 1 ); // unwanted zero-fill
+ vectorFromBinaryFile( ends, baseName + ".ssp" );
+
+ seq.resize( ends.back() ); // unwanted zero-fill
+ vectorFromBinaryFile( seq, baseName + ".tis" );
+
+ nameEnds.resize( seqCount + 1 ); // unwanted zero-fill
+ vectorFromBinaryFile( nameEnds, baseName + ".sds" );
+
+ names.resize( nameEnds.back() ); // unwanted zero-fill
+ vectorFromBinaryFile( names, baseName + ".des" );
+
+ padSize = ends[0];
+}
+
+void MultiSequence::toFiles( const std::string& baseName ) const{
+ vectorToBinaryFile( ends, baseName + ".ssp" );
+
+ memoryToBinaryFile( seq.begin(), seq.begin() + ends.back(),
+ baseName + ".tis" );
+
+ memoryToBinaryFile( nameEnds.begin(), nameEnds.begin() + ends.size(),
+ baseName + ".sds" );
+
+ memoryToBinaryFile( names.begin(),
+ names.begin() + nameEnds[ finishedSequences() ],
+ baseName + ".des" );
+}
+
+// probably slower than it could be:
+std::istream&
+MultiSequence::appendFromFasta( std::istream& stream, std::size_t maxBytes ){
+ uchar c;
+
+ if( isFinished() ){
+ while( stream >> c ){
+ if( c == '>' ) break;
+ }
+ if( !stream ) return stream;
+
+ // get the first word of the FASTA title line:
+ std::string line, word;
+ std::getline( stream, line );
+ std::istringstream iss(line);
+ iss >> word;
+ names.insert( names.end(), word.begin(), word.end() );
+ nameEnds.push_back( names.size() );
+ }
+
+ while( isFinishable(maxBytes) && stream >> c ){ // skips whitespace
+ if( c == '>' ){
+ stream.unget();
+ break;
+ }else{
+ seq.push_back(c);
+ }
+ }
+
+ if( isFinishable(maxBytes) ) finish();
+ if( stream.eof() && !stream.bad() ) stream.clear();
+ return stream;
+}
+
+void MultiSequence::finish(){
+ seq.insert( seq.end(), padSize, ' ' );
+ ends.push_back( seq.size() );
+}
+
+void MultiSequence::unfinish(){
+ ends.pop_back();
+ seq.erase( seq.end() - padSize, seq.end() );
+}
+
+bool MultiSequence::isFinishable( std::size_t maxBytes ) const{
+ return seq.size() + padSize <= maxBytes;
+}
+
+MultiSequence::indexT MultiSequence::whichSequence( indexT coordinate ) const{
+ std::vector<indexT>::const_iterator u =
+ std::upper_bound( ends.begin(), ends.end(), coordinate );
+ assert( u != ends.begin() && u != ends.end() );
+ return u - ends.begin() - 1;
+}
+
+std::string MultiSequence::seqName( indexT seqNum ) const{
+ return std::string( names.begin() + nameEnds[ seqNum ],
+ names.begin() + nameEnds[ seqNum + 1 ] );
+}
+
+} // end namespace cbrc
Added: trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,46 @@
+// Copyright 2008 Martin C. Frith
+
+#include "PeriodicSpacedSeed.hh"
+#include <stdexcept>
+#include <istream>
+#include <ostream>
+
+namespace cbrc{
+
+void PeriodicSpacedSeed::fromString( const std::string& s ){
+ pattern = s;
+ init();
+}
+
+void PeriodicSpacedSeed::init(){
+ if( pattern.empty() || pattern[0] != '1' ){
+ throw std::runtime_error("bad mask: " + pattern);
+ }
+
+ offsets.clear();
+ unsigned off = 1;
+
+ for( std::size_t i = 1; i < pattern.size(); ++i ){
+ if( pattern[i] == '1' ){
+ offsets.push_back(off);
+ off = 0;
+ }
+ ++off;
+ }
+
+ offsets.push_back(off);
+
+ maxOffset = *max_element( offsets.begin(), offsets.end() );
+}
+
+std::ostream& operator<<( std::ostream& s, const PeriodicSpacedSeed& m ){
+ return s << m.pattern;
+}
+
+std::istream& operator>>( std::istream& s, PeriodicSpacedSeed& m ){
+ s >> m.pattern;
+ if( s ) m.init();
+ return s;
+}
+
+} // end namespace cbrc
Added: trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,195 @@
+// Copyright 2008 Martin C. Frith
+
+#include "ScoreMatrix.hh"
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <cassert>
+//#include <iostream> // for debugging
+
+#define ERR(x) throw std::runtime_error(x)
+
+namespace cbrc{
+
+// copied from NCBI BLAST:
+const char* ScoreMatrix::blosum62 = "\
+ A R N D C Q E G H I L K M F P S T W Y V B J Z X *\n\
+A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4\n\
+R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4\n\
+N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4\n\
+D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4\n\
+C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4\n\
+Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4\n\
+E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4\n\
+G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4\n\
+H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4\n\
+I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4\n\
+L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4\n\
+K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4\n\
+M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4\n\
+F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4\n\
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4\n\
+S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4\n\
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4\n\
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4\n\
+Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4\n\
+V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4\n\
+B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4\n\
+J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4\n\
+Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4\n\
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4\n\
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1\n\
+";
+
+void ScoreMatrix::matchMismatch( int match, int mismatch,
+ const std::string& letters ){
+ rows = letters;
+ cols = letters;
+ std::size_t size = letters.size();
+
+ cells.resize( size );
+
+ for( std::size_t i = 0; i < size; ++i ){
+ cells[i].assign( size, -mismatch );
+ cells[i][i] = match;
+ }
+}
+
+void ScoreMatrix::fromFile( const std::string& fileName ){
+ std::ifstream file( fileName.c_str() );
+ if( !file ) ERR( "can't open file: " + fileName );
+ file >> *this;
+ if( !file ) ERR( "can't read file: " + fileName );
+}
+
+void ScoreMatrix::fromString( const std::string& matString ){
+ std::istringstream iss(matString);
+ iss >> *this;
+ assert(iss);
+}
+
+void ScoreMatrix::init( const uchar encode[] ){
+ assert( !rows.empty() && !cols.empty() );
+
+ std::transform( rows.begin(), rows.end(), rows.begin(), toupper );
+ std::transform( cols.begin(), cols.end(), cols.begin(), toupper );
+
+ minScore = cells[0][0];
+ maxScore = cells[0][0];
+
+ for( std::size_t i = 0; i < rows.size(); ++i ){
+ for( std::size_t j = 0; j < cols.size(); ++j ){
+ minScore = std::min( minScore, cells[i][j] );
+ maxScore = std::max( maxScore, cells[i][j] );
+ }
+ }
+
+ // set default score = minScore:
+ for( unsigned i = 0; i < MAT; ++i ){
+ for( unsigned j = 0; j < MAT; ++j ){
+ caseSensitive[i][j] = minScore;
+ caseInsensitive[i][j] = minScore;
+ }
+ }
+
+ for( std::size_t i = 0; i < rows.size(); ++i ){
+ for( std::size_t j = 0; j < cols.size(); ++j ){
+ uchar x = encode[ rows[i] ];
+ uchar y = encode[ cols[j] ];
+ uchar a = encode[ std::tolower( rows[i] ) ];
+ uchar b = encode[ std::tolower( cols[j] ) ];
+ if( a >= MAT ) ERR( std::string("bad symbol: ") + rows[i] );
+ if( b >= MAT ) ERR( std::string("bad symbol: ") + cols[j] );
+ caseSensitive[x][y] = cells[i][j];
+ caseInsensitive[x][y] = cells[i][j];
+ caseInsensitive[x][b] = cells[i][j];
+ caseInsensitive[a][y] = cells[i][j];
+ caseInsensitive[a][b] = cells[i][j];
+ }
+ }
+
+ // set a hugely negative score for the delimiter symbol:
+ uchar z = encode[' '];
+ assert( z < MAT );
+ for( unsigned i = 0; i < MAT; ++i ){
+ caseSensitive[z][i] = -INF;
+ caseSensitive[i][z] = -INF;
+ caseInsensitive[z][i] = -INF;
+ caseInsensitive[i][z] = -INF;
+ }
+}
+
+void ScoreMatrix::writeCommented( std::ostream& stream ) const{
+ stream << "# " << ' ';
+ for( std::size_t i = 0; i < cols.size(); ++i ){
+ stream << ' ' << std::setw(OUTPAD) << cols[i];
+ }
+ stream << '\n';
+
+ for( std::size_t i = 0; i < rows.size(); ++i ){
+ stream << "# " << rows[i];
+ for( std::size_t j = 0; j < cols.size(); ++j ){
+ stream << ' ' << std::setw(OUTPAD) << cells[i][j];
+ }
+ stream << '\n';
+ }
+}
+
+std::istream& operator>>( std::istream& stream, ScoreMatrix& m ){
+ std::string tmpRows;
+ std::string tmpCols;
+ std::vector< std::vector<int> > tmpCells;
+ std::string line;
+ int state = 0;
+
+ while( std::getline( stream, line ) ){
+ std::istringstream iss(line);
+ char c;
+ if( !(iss >> c) ) continue; // skip blank lines
+ if( state == 0 ){
+ if( c == '#' ) continue; // skip comment lines at the top
+ do{
+ tmpCols.push_back(c);
+ }while( iss >> c );
+ state = 1;
+ }
+ else{
+ tmpRows.push_back(c);
+ tmpCells.resize( tmpCells.size() + 1 );
+ int score;
+ while( iss >> score ){
+ tmpCells.back().push_back(score);
+ }
+ if( tmpCells.back().size() != tmpCols.size() ) ERR( "bad score matrix" );
+ }
+ }
+
+ if( stream.eof() && !stream.bad() && !tmpRows.empty() ){
+ stream.clear();
+ m.rows.swap(tmpRows);
+ m.cols.swap(tmpCols);
+ m.cells.swap(tmpCells);
+ }
+
+ return stream;
+}
+
+std::ostream& operator<<( std::ostream& stream, const ScoreMatrix& m ){
+ for( std::size_t i = 0; i < m.cols.size(); ++i ){
+ stream << ' ' << std::setw(m.OUTPAD) << m.cols[i];
+ }
+ stream << '\n';
+
+ for( std::size_t i = 0; i < m.rows.size(); ++i ){
+ stream << m.rows[i];
+ for( std::size_t j = 0; j < m.cols.size(); ++j ){
+ stream << ' ' << std::setw(m.OUTPAD) << m.cells[i][j];
+ }
+ stream << '\n';
+ }
+
+ return stream;
+}
+
+} // end namespace cbrc
Added: trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,43 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds a score matrix for aligning pairs of residues,
+// e.g. blosum62. The delimiter symbol (space) aligned to anything
+// gets a score of -INF.
+
+#ifndef SCOREMATRIX_HH
+#define SCOREMATRIX_HH
+#include <string>
+#include <vector>
+#include <iosfwd>
+
+namespace cbrc{
+
+struct ScoreMatrix{
+ typedef unsigned char uchar;
+
+ enum { INF = INT_MAX / 2 }; // big, but try to avoid overflow
+ enum { MAT = 64 }; // matrix size = MAT x MAT
+ enum { OUTPAD = 2 }; // cell-padding for output
+
+ static const char* blosum62;
+
+ void matchMismatch( int match, int mismatch, const std::string& letters );
+ void fromFile( const std::string& fileName );
+ void fromString( const std::string& s );
+ void init( const uchar encode[] ); // unspecified letters get minScore
+ void writeCommented( std::ostream& stream ) const; // write preceded by "#"
+
+ std::string rows; // row headings (letters)
+ std::string cols; // column headings (letters)
+ std::vector< std::vector<int> > cells; // scores
+ int caseSensitive[MAT][MAT];
+ int caseInsensitive[MAT][MAT];
+ int minScore;
+ int maxScore;
+};
+
+std::istream& operator>>( std::istream& stream, ScoreMatrix& mat );
+std::ostream& operator<<( std::ostream& stream, const ScoreMatrix& mat );
+
+} // end namespace cbrc
+#endif // SCOREMATRIX_HH
Added: trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,88 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds a suffix array. The suffix array is just a list
+// of numbers indicating positions in a text, sorted according to the
+// alphabetical order of the text suffixes starting at these
+// positions. A query sequence can then be matched to the suffix
+// array using binary search. For faster matching, we use "buckets",
+// which store the matching regions for all k-mers for k=1 to, say,
+// 12. In addition, we allow skipping of positions while sorting and
+// matching, using a periodic spaced seed.
+
+// To allow for "improper" letters in the text (e.g. non-ACGT for
+// DNA), the buckets are slightly cunning. For each k-mer size k, if
+// the alphabet size is A (e.g. 4 for DNA), we store A^k + A^(k-1)
+// numbers. The A^k numbers are the starting locations in the suffix
+// array of every "proper" k-mer. The extra A^(k-1) numbers are the
+// starting positions of improper k-mers.
+
+#ifndef SUFFIXARRAY_HH
+#define SUFFIXARRAY_HH
+#include <vector>
+
+namespace cbrc{
+
+struct SuffixArray{
+ typedef unsigned indexT;
+ typedef unsigned char uchar;
+
+ SuffixArray( const std::vector<uchar>& t, const std::vector<indexT>& m,
+ unsigned a )
+ : text(t), mask(m), alphSize(a), maskSize( m.size() ){}
+
+ // Add (unsorted) indices for text positions between beg and end,
+ // and return 1. Positions starting with delimiters aren't added.
+ // If the index size would exceed maxBytes, don't add anything, and
+ // return 0.
+ int makeIndex( indexT beg, indexT end, indexT step, std::size_t maxBytes );
+
+ std::size_t indexBytes() const{ return index.size() * sizeof(indexT); }
+ void sortIndex();
+ void makeBuckets( indexT bucketDepth );
+ void clear();
+
+ void fromFiles( const std::string& baseName,
+ indexT indexNum, indexT bucketDepth );
+ void toFiles( const std::string& baseName ) const;
+
+ // Find the smallest match to the text, starting at the given
+ // position in the query, such that there are at most maxHits
+ // matches, and the match-depth is at least minDepth. Return the
+ // range of matching indices via beg and end.
+ void match( const indexT*& beg, const indexT*& end,
+ const uchar* queryPtr,
+ indexT maxHits, indexT minDepth ) const;
+
+ // Count matches of all sizes, starting at the given position in the
+ // query. Don't try this for large self-comparisons!
+ void countMatches( std::vector<unsigned long long>& counts,
+ const uchar* queryPtr ) const;
+
+ const std::vector<uchar>& text;
+ const std::vector<indexT>& mask;
+ unsigned alphSize; // excluding delimiter symbol
+ unsigned maskSize; // same as mask.size(), but faster!!!
+ std::vector<indexT> index; // sorted indices
+ std::vector< std::vector<indexT> > bucketNest;
+ std::vector<indexT> bucketMask; // extended mask, to avoid modulus
+ indexT bucketMaskTotal;
+
+ static void equalRange( const indexT*& beg, const indexT*& end,
+ const uchar* textBase, uchar symbol );
+ static const indexT* lowerBound( const indexT* beg, const indexT* end,
+ const uchar* textBase, uchar symbol );
+ static const indexT* upperBound( const indexT* beg, const indexT* end,
+ const uchar* textBase, uchar symbol );
+
+ void makeBucketMask( indexT bucketDepth );
+
+ void radixSort( indexT* beg, indexT* end,
+ indexT depth, const uchar* textBase );
+ void insertionSort( indexT* beg, indexT* end,
+ indexT depth, const uchar* textBase );
+ void insertionSortSimple( indexT* beg, indexT* end,
+ const uchar* textBase );
+};
+
+} // end namespace cbrc
+#endif // SUFFIXARRAY_HH
Added: trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,77 @@
+// Copyright 2008 Martin C. Frith
+
+// X-drop algorithm for gapped extension of alignments.
+
+#ifndef XDROPALIGNER_HH
+#define XDROPALIGNER_HH
+#include <vector>
+
+namespace cbrc{
+
+class GeneralizedAffineGapCosts;
+class SegmentPair;
+
+class XdropAligner{
+public:
+ typedef unsigned char uchar;
+
+ enum direction{ FORWARD, REVERSE };
+
+ // Extend an alignment, from the given start point, in the given direction
+ // Return the score (bestScore)
+ int fill( const uchar* seq1, const uchar* seq2,
+ size_t start1, size_t start2, direction dir,
+ const int sm[64][64], int maxDrop,
+ const GeneralizedAffineGapCosts& gap );
+
+ // Get the ungapped segments of the extension, putting them in "chunks"
+ // The extension might be empty!
+ void traceback( std::vector< SegmentPair >& chunks,
+ const uchar* seq1, const uchar* seq2,
+ size_t start1, size_t start2, direction dir,
+ const int sm[64][64],
+ const GeneralizedAffineGapCosts& gap ) const;
+
+private:
+ typedef std::vector< std::vector< int > > matrix_t;
+
+ enum { INF = INT_MAX / 2 }; // big, but try to avoid overflow
+
+ int bestScore;
+ size_t bestAntiDiagonal;
+ size_t bestPos1;
+
+ // maybe use std::vector< std::vector< int[3] > > instead of x, y, z???
+ matrix_t x;
+ matrix_t y;
+ matrix_t z;
+ std::vector< size_t > offsets;
+
+ static int drop( int score, int minScore );
+ static bool isDelimiter( uchar c, const int sm[64][64] );
+ static const uchar* seqPtr( const uchar* seq, size_t start,
+ direction dir, size_t pos );
+ static int match( const uchar* seq1, const uchar* seq2,
+ size_t start1, size_t start2, direction dir,
+ const int sm[64][64],
+ size_t antiDiagonal, size_t seq1pos );
+
+ size_t fillBeg( size_t antiDiagonal ) const;
+ size_t fillEnd( size_t antiDiagonal ) const;
+ int cell( const matrix_t& matrix,
+ size_t antiDiagonal, size_t seq1pos ) const;
+ int hori( const matrix_t& matrix,
+ size_t antiDiagonal, size_t seq1pos ) const;
+ int vert( const matrix_t& matrix,
+ size_t antiDiagonal, size_t seq1pos ) const;
+ int diag( const matrix_t& matrix,
+ size_t antiDiagonal, size_t seq1pos ) const;
+ void reset( const GeneralizedAffineGapCosts& gap );
+ void initScores( size_t antiDiagonal, size_t reservation );
+ void updateBest( int score, size_t antiDiagonal, const int* x0 );
+ size_t newFillBeg( size_t k1, size_t k2, size_t off1, size_t end1 ) const;
+ size_t newFillEnd( size_t k1, size_t k2, size_t off1, size_t end1 ) const;
+};
+
+} // end namespace cbrc
+#endif // XDROPALIGNER_HH
Added: trunk/packages/last-align/branches/upstream/current/src/lastal.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/lastal.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/lastal.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,420 @@
+// Copyright 2008 Martin C. Frith
+
+// BLAST-like pair-wise sequence alignment, using suffix arrays.
+
+#include "LastalArguments.hh"
+#include "SuffixArray.hh"
+#include "XdropAligner.hh"
+#include "AlignmentPot.hh"
+#include "Alignment.hh"
+#include "SegmentPairPot.hh"
+#include "SegmentPair.hh"
+#include "ScoreMatrix.hh"
+#include "Alphabet.hh"
+#include "MultiSequence.hh"
+#include "PeriodicSpacedSeed.hh"
+#include "DiagonalTable.hh"
+#include "GeneralizedAffineGapCosts.hh"
+#include "io.hh"
+#include "stringify.hh"
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <cassert>
+
+#define LOG(x) if( args.verbosity > 0 ) std::cerr << "lastal: " << x << '\n'
+
+typedef unsigned indexT;
+typedef unsigned char uchar;
+typedef unsigned long long countT;
+
+// Set up a scoring matrix, based on the user options
+void makeScoreMatrix( cbrc::ScoreMatrix& sm, cbrc::LastalArguments& args,
+ const cbrc::Alphabet& alph ){
+ if( !args.matrixFile.empty() ){
+ sm.fromFile( args.matrixFile );
+ }
+ else if( args.matchScore < 0 && args.mismatchCost < 0 &&
+ alph.letters == alph.protein ){
+ sm.fromString( sm.blosum62 );
+ }
+ else{
+ // if matchScore or mismatchCost weren't set, use default values:
+ if( args.matchScore < 0 ) args.matchScore = 1;
+ if( args.mismatchCost < 0 ) args.mismatchCost = 1;
+ sm.matchMismatch( args.matchScore, args.mismatchCost, alph.letters );
+ }
+
+ sm.init( alph.encode );
+}
+
+// Read the .prj file for the whole database
+void readOuterPrj( const std::string& fileName, cbrc::Alphabet& alph,
+ cbrc::PeriodicSpacedSeed& mask, unsigned& volumes ){
+ std::ifstream f( fileName.c_str() );
+ if( !f ) throw std::runtime_error("can't open file: " + fileName );
+
+ std::string word;
+ while( std::getline( f >> std::ws, word, '=' ) ){ // eat leading whitespace
+ /**/ if( word == "alphabet" ) f >> alph;
+ else if( word == "spacedseed" ) f >> mask;
+ else if( word == "volumes" ) f >> volumes;
+ else f >> word;
+ }
+
+ if( f.eof() && !f.bad() ) f.clear();
+ if( alph.letters.empty() || mask.pattern.empty() || volumes == -1u ){
+ f.setstate( std::ios::failbit );
+ }
+ if( !f ) throw std::runtime_error("can't read file: " + fileName);
+}
+
+// Read a per-volume .prj file, with info about a database volume
+void readInnerPrj( const std::string& fileName, indexT& seqCount,
+ indexT& delimiterNum, indexT& bucketDepth ){
+ std::ifstream f( fileName.c_str() );
+ if( !f ) throw std::runtime_error("can't open file: " + fileName );
+
+ std::string word;
+ while( std::getline( f >> std::ws, word, '=' ) ){ // eat leading whitespace
+ /**/ if( word == "numofsequences" ) f >> seqCount;
+ else if( word == "specialcharacters" ) f >> delimiterNum;
+ else if( word == "prefixlength" ) f >> bucketDepth;
+ else f >> word;
+ }
+
+ if( f.eof() && !f.bad() ) f.clear();
+ if( seqCount == -1u || delimiterNum == -1u || bucketDepth == -1u ){
+ f.setstate( std::ios::failbit );
+ }
+ if( !f ) throw std::runtime_error("can't read file: " + fileName);
+}
+
+// Write match counts for each query sequence
+void writeCounts( const std::vector< std::vector<countT> >& matchCounts,
+ const cbrc::MultiSequence& query, indexT minDepth,
+ std::ostream& out ){
+ for( indexT i = 0; i < matchCounts.size(); ++i ){
+ out << query.names[i] << '\n';
+
+ for( indexT j = minDepth-1; j < matchCounts[i].size(); ++j ){
+ out << j+1 << '\t' << matchCounts[i][j] << '\n';
+ }
+ }
+}
+
+// Count all matches, of all sizes, of a query batch against a suffix array
+void countMatches( std::vector< std::vector<countT> >& matchCounts,
+ const cbrc::MultiSequence& query,
+ const cbrc::SuffixArray& sa,
+ const cbrc::LastalArguments& args, char strand ){
+ indexT seqNum = strand == '+' ? 0 : query.finishedSequences() - 1;
+
+ for( indexT i = 0; i < query.ends.back(); i += args.queryStep ){
+ if( strand == '+' ){
+ for( ;; ){
+ if( seqNum == query.finishedSequences() ) return;
+ if( query.seqEnd(seqNum) > i ) break;
+ ++seqNum;
+ }
+ }
+ else{
+ for( ;; ){
+ if( seqNum == -1u ) return;
+ if( query.seqBeg(seqNum) < query.ends.back() - i ) break;
+ --seqNum;
+ }
+ }
+
+ sa.countMatches( matchCounts[seqNum], &query.seq[i] );
+ }
+}
+
+// Find query matches to the suffix array, and do gapless extensions
+void alignGapless( cbrc::SegmentPairPot& gaplessAlns,
+ const cbrc::MultiSequence& query,
+ const cbrc::MultiSequence& text,
+ const cbrc::SuffixArray& sa,
+ const cbrc::LastalArguments& args,
+ const cbrc::ScoreMatrix& sm,
+ const cbrc::Alphabet& alph,
+ char strand, std::ostream& out ){
+ const std::vector<uchar>& qseq = query.seq;
+ const std::vector<uchar>& tseq = text.seq;
+ const int (*mat)[64] =
+ args.maskLowercase < 2 ? sm.caseInsensitive : sm.caseSensitive;
+ cbrc::DiagonalTable dt; // record already-covered positions on each diagonal
+ countT gaplessExtensionCount = 0, gaplessAlignmentCount = 0;
+
+ for( indexT i = 0; i < query.ends.back(); i += args.queryStep ){
+ const indexT* beg;
+ const indexT* end;
+ sa.match( beg, end, &qseq[i], args.oneHitMultiplicity, args.minHitDepth );
+
+ for( /* noop */; beg < end; ++beg ){ // loop over suffix-array matches
+ if( dt.isCovered( i, *beg ) ) continue;
+
+ // tried: first get the score only, not the endpoints: slower!
+ cbrc::SegmentPair sp; // do the gapless extension:
+ sp.makeXdrop( *beg, i, &tseq[0], &qseq[0], mat, args.maxDropGapless );
+ ++gaplessExtensionCount;
+
+ // Tried checking the score after isOptimal & addEndpoint, but
+ // the number of extensions decreased by < 10%, and it was
+ // slower overall.
+ if( sp.score < args.minScoreGapless ) continue;
+
+ if( !sp.isOptimal( &tseq[0], &qseq[0], mat, args.maxDropGapless ) ){
+ continue; // ignore sucky gapless extensions
+ }
+
+ dt.addEndpoint( sp.end2(), sp.end1() );
+
+ if( args.outputType == 1 ){ // we just want gapless alignments
+ cbrc::Alignment aln;
+ aln.fromSegmentPair(sp);
+ aln.write( text, query, strand, alph, args.outputFormat, out );
+ }
+ else gaplessAlns.add(sp); // add the gapless alignment to the pot
+ ++gaplessAlignmentCount;
+ }
+ }
+
+ LOG( "gapless extensions=" << gaplessExtensionCount );
+ LOG( "gapless alignments=" << gaplessAlignmentCount );
+}
+
+// Do gapped extensions of the gapless alignments
+void alignGapped( cbrc::AlignmentPot& gappedAlns,
+ cbrc::SegmentPairPot& gaplessAlns,
+ const std::vector<uchar>& text,
+ const std::vector<uchar>& query,
+ const cbrc::LastalArguments& args,
+ const cbrc::ScoreMatrix& sm,
+ const cbrc::Alphabet& alph ){
+ cbrc::XdropAligner aligner;
+ cbrc::GeneralizedAffineGapCosts gap( args.gapExistCost, args.gapExtendCost,
+ args.gapPairCost );
+ const int (*mat)[64] =
+ args.maskLowercase < 3 ? sm.caseInsensitive : sm.caseSensitive;
+ countT gappedExtensionCount = 0;
+
+ gaplessAlns.sort(); // sort the gapless alignments by score, highest first
+
+ for( size_t i = 0; i < gaplessAlns.size(); ++i ){
+ const cbrc::SegmentPair& sp = gaplessAlns.get(i);
+
+ if( sp.score == 0 ) continue; // it has been marked as redundant
+
+ cbrc::SegmentPair seed;
+
+ // Redo gapless extension, using gapped score parameters. Without
+ // this, if we self-compare a huge sequence, we risk getting a
+ // huge gapped extension. (Do this before sorting them?)
+ seed.makeXdrop( sp.beg1(), sp.beg2(), &text[0], &query[0],
+ mat, args.maxDropGapped );
+
+ // Shrink the seed to its longest run of identical matches. This
+ // trims off possibly unreliable parts of the gapless alignment.
+ seed.maxIdenticalRun( &text[0], &query[0], alph.canonical, mat );
+
+ cbrc::Alignment aln; // do gapped extension from each end of the seed:
+ aln.makeXdrop( aligner, seed, &text[0], &query[0],
+ mat, args.maxDropGapped, gap );
+ ++gappedExtensionCount;
+
+ if( aln.score < args.minScoreGapped ) continue;
+
+ if( !aln.isOptimal( &text[0], &query[0], mat, args.maxDropGapped, gap ) ){
+ // If retained, non-"optimal" alignments can hide "optimal"
+ // alignments, e.g. during non-reduntantization.
+ continue;
+ }
+
+ gaplessAlns.markAllOverlaps( aln.blocks );
+ gaplessAlns.markTandemRepeats( seed, args.maxRepeatDistance );
+ gappedAlns.add(aln); // add the gapped alignment to the pot
+ }
+
+ LOG( "gapped extensions=" << gappedExtensionCount );
+ LOG( "gapped alignments=" << gappedAlns.size() );
+}
+
+// Scan one batch of query sequences against one database volume
+void scan( const cbrc::MultiSequence& query, const cbrc::MultiSequence& text,
+ const cbrc::SuffixArray& sa, const cbrc::LastalArguments& args,
+ const cbrc::Alphabet& alph, const cbrc::ScoreMatrix& sm,
+ std::vector< std::vector<countT> >& matchCounts,
+ char strand, std::ostream& out ){
+ LOG( "scanning..." );
+
+ if( args.outputType == 0 ){ // we just want match counts
+ countMatches( matchCounts, query, sa, args, strand );
+ return;
+ }
+
+ cbrc::SegmentPairPot gaplessAlns;
+ alignGapless( gaplessAlns, query, text, sa, args, sm, alph, strand, out );
+ if( args.outputType == 1 ) return; // we just want gapless alignments
+
+ cbrc::AlignmentPot gappedAlns;
+ alignGapped( gappedAlns, gaplessAlns, text.seq, query.seq, args, sm, alph );
+
+ if( args.outputType == 3 ){ // we want non-redundant alignments
+ gappedAlns.eraseSuboptimal();
+ LOG( "nonredundant gapped alignments=" << gappedAlns.size() );
+ }
+
+ gappedAlns.sort(); // sort by score
+ for( size_t i = 0; i < gappedAlns.size(); ++i ){
+ gappedAlns.items[i].write( text, query, strand, alph,
+ args.outputFormat, out );
+ }
+}
+
+// Scan one batch of query sequences against all database volumes
+void scanAllVolumes( cbrc::MultiSequence& query,
+ const cbrc::LastalArguments& args,
+ const cbrc::Alphabet& alph,
+ const cbrc::PeriodicSpacedSeed& mask,
+ const cbrc::ScoreMatrix& sm,
+ unsigned volumes, std::ostream& out ){
+ std::vector< std::vector<countT> > matchCounts; // used if outputType == 0
+ if( args.outputType == 0 ) matchCounts.resize( query.finishedSequences() );
+
+ for( unsigned i = 0; i < volumes; ++i ){
+ std::string baseName = args.lastdbName + cbrc::stringify(i);
+ LOG( "reading " << baseName << "..." );
+
+ indexT seqCount = -1u, delimiterNum = -1u, bucketDepth = -1u;
+ readInnerPrj( baseName + ".prj", seqCount, delimiterNum, bucketDepth );
+
+ cbrc::MultiSequence text;
+ text.fromFiles( baseName, seqCount );
+
+ cbrc::SuffixArray sa( text.seq, mask.offsets, alph.size );
+ sa.fromFiles( baseName, text.ends.back() - delimiterNum, bucketDepth );
+
+ if( args.strand == 2 && i > 0 ){
+ LOG( "re-reverse complementing..." );
+ alph.rc( query.seq.begin(), query.seq.begin() + query.ends.back() );
+ }
+
+ if( args.strand != 0 ){
+ scan( query, text, sa, args, alph, sm, matchCounts, '+', out );
+ }
+
+ if( args.strand == 2 || args.strand == 0 && i == 0 ){
+ LOG( "reverse complementing..." );
+ alph.rc( query.seq.begin(), query.seq.begin() + query.ends.back() );
+ }
+
+ if( args.strand != 1 ){
+ scan( query, text, sa, args, alph, sm, matchCounts, '-', out );
+ }
+ }
+
+ if( args.outputType == 0 ){
+ LOG( "writing..." );
+ writeCounts( matchCounts, query, args.minHitDepth, out );
+ }
+
+ LOG( "done!" );
+}
+
+void writeHeader( std::ostream& out, const cbrc::LastalArguments& args,
+ const cbrc::ScoreMatrix sm ){
+ out << "# LAST version " <<
+#include "version.hh"
+ << "\n";
+ out << "#\n";
+ args.writeCommented( out );
+ out << "#\n";
+
+ if( args.outputType == 0 ){ // we just want hit counts
+ out << "# depth\tcount\n";
+ }
+ else{ // we want alignments
+ sm.writeCommented( out );
+ out << "#\n";
+ out << "# Coordinates are 0-based. For - strand matches, coordinates\n";
+ out << "# in the reverse complement of the 2nd sequence are used.\n";
+ out << "#\n";
+
+ if( args.outputFormat == 0 ){ // tabular format
+ out << "# score\tname1\tstart1\talnSize1\tstrand1\tseqSize1\t"
+ << "name2\tstart2\talnSize2\tstrand2\tseqSize2\tblocks\n";
+ }
+ else{ // MAF format
+ out << "# name start alnSize strand seqSize alignment\n";
+ }
+ }
+
+ out << "#\n";
+}
+
+// Read the next sequence, adding it to the MultiSequence
+std::istream&
+appendFromFasta( cbrc::MultiSequence& query,
+ const cbrc::LastalArguments& args, const cbrc::Alphabet& alph,
+ std::istream& in ){
+ std::size_t maxSeqBytes = args.batchSize;
+ if( query.finishedSequences() == 0 ) maxSeqBytes = std::size_t(-1);
+
+ indexT oldSeqSize = query.seq.size();
+
+ query.appendFromFasta( in, maxSeqBytes );
+
+ // encode the newly-read sequence
+ alph.tr( query.seq.begin() + oldSeqSize, query.seq.end() );
+
+ return in;
+}
+
+int main( int argc, char** argv )
+try{
+ std::clock_t startTime = std::clock();
+ cbrc::LastalArguments args( argc, argv );
+ cbrc::Alphabet alph;
+ cbrc::PeriodicSpacedSeed mask;
+ cbrc::ScoreMatrix sm;
+ unsigned volumes = -1u; // initialize it to an "error" value
+ readOuterPrj( args.lastdbName + ".prj", alph, mask, volumes );
+ makeScoreMatrix( sm, args, alph ); // before alph.makeCaseInsensitive!
+ args.setDefaults( alph.letters == alph.dna, alph.letters == alph.protein,
+ sm.maxScore );
+ if( args.maskLowercase < 1 ) alph.makeCaseInsensitive();
+
+ std::ofstream outFileStream;
+ std::ostream& out = cbrc::openOut( args.outFile, outFileStream );
+ writeHeader( out, args, sm );
+ cbrc::MultiSequence query;
+ query.initForAppending( mask.maxOffset );
+ alph.tr( query.seq.begin(), query.seq.end() );
+
+ for( char** i = argv + args.inputStart; i < argv + argc; ++i ){
+ LOG( "reading " << *i << "..." );
+ std::ifstream inFileStream;
+ std::istream& in = cbrc::openIn( *i, inFileStream );
+
+ while( appendFromFasta( query, args, alph, in ) ){
+ if( !query.isFinished() ){
+ scanAllVolumes( query, args, alph, mask, sm, volumes, out );
+ query.reinitForAppending();
+ }
+ }
+ }
+
+ if( query.finishedSequences() > 0 ){
+ scanAllVolumes( query, args, alph, mask, sm, volumes, out );
+ }
+
+ out << "# CPU time: " << (clock() - startTime + 0.0) / CLOCKS_PER_SEC
+ << " seconds\n";
+
+ return EXIT_SUCCESS;
+}
+catch( const std::exception& e ) {
+ std::cerr << "lastal: " << e.what() << '\n';
+ return EXIT_FAILURE;
+}
Added: trunk/packages/last-align/branches/upstream/current/src/lastdb.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/lastdb.cc (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/lastdb.cc 2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,167 @@
+// Copyright 2008 Martin C. Frith
+
+// Read fasta-format sequences; construct a suffix array of them; and
+// write the results to files.
+
+#include "LastdbArguments.hh"
+#include "SuffixArray.hh"
+#include "Alphabet.hh"
+#include "MultiSequence.hh"
+#include "PeriodicSpacedSeed.hh"
+#include "io.hh"
+#include "stringify.hh"
+#include <stdexcept>
+#include <fstream>
+#include <iostream>
+
+#define LOG(x) if( args.verbosity > 0 ) std::cerr << "lastdb: " << x << '\n'
+
+typedef unsigned indexT;
+
+// Set up an alphabet (e.g. DNA or protein), based on the user options
+void makeAlphabet( cbrc::Alphabet& alph, const cbrc::LastdbArguments& args ){
+ if( !args.userAlphabet.empty() ) alph.fromString( args.userAlphabet );
+ else if( args.isProtein ) alph.fromString( alph.protein );
+ else alph.fromString( alph.dna );
+ if( !args.isCaseSensitive ) alph.makeCaseInsensitive();
+}
+
+// Find the maximum bucket depth, such that the total number of
+// buckets in all layers is at most 1/4 the number of indices
+indexT defaultBucketDepth( unsigned alphSize, indexT indexNum ){
+ indexT maxBuckets = indexNum / 4;
+ indexT layerBuckets = (alphSize + 1); // buckets in outermost layer
+ indexT totalBuckets = layerBuckets;
+ if( totalBuckets > maxBuckets ) return 0; // no buckets at all!
+
+ for( indexT depth = 1; /* noop */; ++depth ){
+ if( layerBuckets > maxBuckets / alphSize ) return depth;
+ layerBuckets *= alphSize; // number of buckets in next layer
+ if( totalBuckets > maxBuckets - layerBuckets ) return depth;
+ totalBuckets += layerBuckets;
+ }
+}
+
+// Write the .prj file for the whole database
+void writeOuterPrj( const std::string& fileName, const cbrc::Alphabet& alph,
+ const cbrc::PeriodicSpacedSeed& mask, unsigned volumes ){
+ std::ofstream f( fileName.c_str() );
+ if( !f ) throw std::runtime_error("can't open file: " + fileName );
+ f << "version=" <<
+#include "version.hh"
+ << '\n';
+ f << "alphabet=" << alph << '\n';
+ f << "spacedseed=" << mask << '\n';
+ f << "volumes=" << volumes << '\n';
+ if( !f ) throw std::runtime_error("can't write file: " + fileName);
+}
+
+// Write a per-volume .prj file, with info about a database volume
+void writeInnerPrj( const std::string& fileName,
+ const cbrc::MultiSequence& multi,
+ const cbrc::SuffixArray& sa ){
+ std::ofstream f( fileName.c_str() );
+ if( !f ) throw std::runtime_error("can't open file: " + fileName );
+ f << "totallength=" << multi.ends.back() << '\n';
+ f << "specialcharacters=" << multi.ends.back() - sa.index.size() << '\n';
+ f << "numofsequences=" << multi.finishedSequences() << '\n';
+ f << "prefixlength=" << sa.bucketNest.size() << '\n';
+ if( !f ) throw std::runtime_error("can't write file: " + fileName);
+}
+
+// Make one database volume, from one batch of sequences
+void makeVolume( cbrc::MultiSequence& multi, cbrc::SuffixArray& sa,
+ const cbrc::LastdbArguments& args, const cbrc::Alphabet& alph,
+ unsigned volumeNumber ){
+ std::string baseName = args.lastdbName + cbrc::stringify(volumeNumber);
+
+ LOG( "writing tis, des, ssp, sds..." );
+ multi.toFiles( baseName );
+
+ LOG( "recoding..." );
+ alph.mergeImproperCodes( multi.seq.begin(), multi.seq.end() );
+
+ LOG( "sorting..." );
+ sa.sortIndex();
+
+ LOG( "bucketing..." );
+ sa.makeBuckets( args.bucketDepth == -1u
+ ? defaultBucketDepth( alph.size, sa.index.size() )
+ : args.bucketDepth );
+
+ LOG( "writing suf, bck..." );
+ sa.toFiles( baseName );
+
+ LOG( "writing prj..." );
+ writeInnerPrj( baseName + ".prj", multi, sa );
+
+ LOG( "done!" );
+}
+
+// Read the next sequence, adding it to the MultiSequence and the SuffixArray
+std::istream&
+appendFromFasta( cbrc::MultiSequence& multi, cbrc::SuffixArray& sa,
+ const cbrc::LastdbArguments& args, const cbrc::Alphabet& alph,
+ std::istream& in ){
+ std::size_t maxSeqBytes = args.volumeSize - sa.indexBytes();
+ if( multi.finishedSequences() == 0 ) maxSeqBytes = std::size_t(-1);
+
+ indexT oldSeqSize = multi.seq.size();
+
+ multi.appendFromFasta( in, maxSeqBytes );
+
+ // encode the newly-read sequence
+ alph.tr( multi.seq.begin() + oldSeqSize, multi.seq.end() );
+
+ if( in && multi.isFinished() ){
+ std::size_t maxIndexBytes = args.volumeSize - multi.seq.size();
+ if( multi.finishedSequences() == 1 ) maxIndexBytes = std::size_t(-1);
+
+ if( !sa.makeIndex( *(multi.ends.end() - 2), *(multi.ends.end() - 1),
+ args.indexStep, maxIndexBytes ) ){
+ multi.unfinish();
+ }
+ }
+
+ return in;
+}
+
+int main( int argc, char** argv )
+try{
+ cbrc::LastdbArguments args( argc, argv );
+ cbrc::Alphabet alph;
+ cbrc::PeriodicSpacedSeed mask;
+ cbrc::MultiSequence multi;
+ makeAlphabet( alph, args );
+ mask.fromString( args.maskPattern );
+ multi.initForAppending( mask.maxOffset );
+ alph.tr( multi.seq.begin(), multi.seq.end() );
+ cbrc::SuffixArray sa( multi.seq, mask.offsets, alph.size );
+ unsigned volumeNumber = 0;
+
+ for( char** i = argv + args.inputStart; i < argv + argc; ++i ){
+ LOG( "reading " << *i << "..." );
+ std::ifstream inFileStream;
+ std::istream& in = cbrc::openIn( *i, inFileStream );
+
+ while( appendFromFasta( multi, sa, args, alph, in ) ){
+ if( !multi.isFinished() ){
+ makeVolume( multi, sa, args, alph, volumeNumber++ );
+ sa.clear();
+ multi.reinitForAppending();
+ }
+ }
+ }
+
+ if( multi.finishedSequences() > 0 ){
+ makeVolume( multi, sa, args, alph, volumeNumber++ );
+ }
+
+ writeOuterPrj( args.lastdbName + ".prj", alph, mask, volumeNumber );
+
+ return EXIT_SUCCESS;
+}
+catch( const std::exception& e ) {
+ std::cerr << "lastdb: " << e.what() << '\n';
+ return EXIT_FAILURE;
+}
More information about the debian-med-commit
mailing list