[med-svn] r2475 - in trunk/packages: . last-align last-align/branches last-align/branches/upstream last-align/branches/upstream/current last-align/branches/upstream/current/src

Thu Sep 11 06:55:42 UTC 2008

Author: plessy
Date: 2008-09-11 06:55:41 +0000 (Thu, 11 Sep 2008)
New Revision: 2475

Added:
   trunk/packages/last-align/
   trunk/packages/last-align/branches/
   trunk/packages/last-align/branches/upstream/
   trunk/packages/last-align/branches/upstream/current/
   trunk/packages/last-align/branches/upstream/current/src/
   trunk/packages/last-align/branches/upstream/current/src/Alignment.cc
   trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh
   trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc
   trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc
   trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc
   trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc
   trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh
   trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh
   trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh
   trunk/packages/last-align/branches/upstream/current/src/lastal.cc
   trunk/packages/last-align/branches/upstream/current/src/lastdb.cc
Log:
[svn-inject] Installing original source of last-align

Added: trunk/packages/last-align/branches/upstream/current/src/Alignment.cc
===================================================================

--- trunk/packages/last-align/branches/upstream/current/src/Alignment.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/Alignment.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,201 @@
+// Copyright 2008 Martin C. Frith
+
+#include "Alignment.hh"
+#include "XdropAligner.hh"
+#include "MultiSequence.hh"
+#include "Alphabet.hh"
+#include "GeneralizedAffineGapCosts.hh"
+#include "stringify.hh"
+#include <iomanip>
+#include <cassert>
+
+// make C++ tolerable:
+#define CI(type) std::vector<type>::const_iterator
+
+namespace cbrc{
+
+void Alignment::fromSegmentPair( const SegmentPair& sp ){
+  blocks.assign( 1, sp );
+  score = sp.score;
+}
+
+void Alignment::makeXdrop( XdropAligner& aligner, const SegmentPair& seed,
+			   const uchar* seq1, const uchar* seq2,
+			   const int scoreMatrix[MAT][MAT], int maxDrop,
+			   const GeneralizedAffineGapCosts& gap ){
+  assert( seed.size > 0 );  // relax this requirement?
+
+  int reverseScore =
+    aligner.fill( seq1, seq2, seed.beg1(), seed.beg2(),
+		  XdropAligner::REVERSE, scoreMatrix, maxDrop, gap );
+
+  aligner.traceback( blocks, seq1, seq2, seed.beg1(), seed.beg2(),
+		     XdropAligner::REVERSE, scoreMatrix, gap );
+
+  if( !blocks.empty() &&
+      blocks.back().end1() == seed.beg1() &&
+      blocks.back().end2() == seed.beg2() ){
+    blocks.back().size += seed.size;
+  }
+  else blocks.push_back(seed);
+
+  int forwardScore =
+    aligner.fill( seq1, seq2, seed.end1(), seed.end2(),
+		  XdropAligner::FORWARD, scoreMatrix, maxDrop, gap );
+
+  std::vector<SegmentPair> forwardBlocks;
+  aligner.traceback( forwardBlocks, seq1, seq2, seed.end1(), seed.end2(),
+		     XdropAligner::FORWARD, scoreMatrix, gap );
+
+  SegmentPair& b = blocks.back();
+  if( !forwardBlocks.empty() &&
+      b.end1() == forwardBlocks.back().beg1() &&
+      b.end2() == forwardBlocks.back().beg2() ){
+    b.size += forwardBlocks.back().size;
+    forwardBlocks.pop_back();
+  }
+
+  std::reverse( forwardBlocks.begin(), forwardBlocks.end() );
+  blocks.insert( blocks.end(), forwardBlocks.begin(), forwardBlocks.end() );
+
+  score = reverseScore + seed.score + forwardScore;
+}
+
+bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2,
+			   const int scoreMatrix[MAT][MAT], int maxDrop,
+			   const GeneralizedAffineGapCosts& gap ){
+  int maxScore = 0;
+  int runningScore = 0;
+
+  for( CI(SegmentPair) i = blocks.begin(); i < blocks.end(); ++i ){
+    if( i > blocks.begin() ){
+      indexT gapSize1 = i->beg1() - (i-1)->end1();
+      indexT gapSize2 = i->beg2() - (i-1)->end2();
+      runningScore -= gap.cost( gapSize1, gapSize2 );
+      if( runningScore <= 0 || runningScore < maxScore - maxDrop ){
+	return false;
+      }
+    }
+
+    const uchar* s1 = seq1 + i->beg1();
+    const uchar* s2 = seq2 + i->beg2();
+    const uchar* e1 = seq1 + i->end1();
+
+    while( s1 < e1 ){
+      runningScore += scoreMatrix[ *s1++ ][ *s2++ ];
+      if( runningScore > maxScore ) maxScore = runningScore;
+      else if( runningScore <= 0 ||                  // non-optimal prefix
+	       s1 == e1 && i+1 == blocks.end() ||    // non-optimal suffix
+	       runningScore < maxScore - maxDrop ){  // excessive score drop
+	return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void Alignment::write( const MultiSequence& seq1, const MultiSequence& seq2,
+		       char strand, const Alphabet& alph, int format,
+		       std::ostream& os ) const{
+  /**/ if( format == 0 ) writeTab( seq1, seq2, strand, os );
+  else if( format == 1 ) writeMaf( seq1, seq2, strand, alph, os );
+}
+
+void Alignment::writeTab( const MultiSequence& seq1, const MultiSequence& seq2,
+			  char strand, std::ostream& os ) const{
+  indexT size2 = seq2.ends.back();
+  indexT w1 = seq1.whichSequence( beg1() );
+  indexT w2 = seq2.whichSequence( strand == '+' ? beg2() : size2 - beg2() );
+  indexT seqStart1 = seq1.seqBeg(w1);
+  indexT seqStart2 = strand == '+' ? seq2.seqBeg(w2) : size2 - seq2.seqEnd(w2);
+
+  os << score << '\t'
+     << seq1.seqName(w1) << '\t'
+     << beg1() - seqStart1 << '\t'
+     << range1() << '\t'
+     << '+' << '\t'
+     << seq1.seqLen(w1) << '\t'
+     << seq2.seqName(w2) << '\t'
+     << beg2() - seqStart2 << '\t'
+     << range2() << '\t'
+     << strand << '\t'
+     << seq2.seqLen(w2) << '\t';
+
+  for( unsigned i = 0; i < blocks.size(); ++i ){
+    if( i > 0 ){
+      os << blocks[i].beg1() - blocks[i-1].end1() << ':'
+	 << blocks[i].beg2() - blocks[i-1].end2() << ',';
+    }
+    os << blocks[i].size << ( i+1 < blocks.size() ? ',' : '\n' );
+  }
+}
+
+void Alignment::writeMaf( const MultiSequence& seq1, const MultiSequence& seq2,
+			  char strand, const Alphabet& alph,
+			  std::ostream& os ) const{
+  indexT size2 = seq2.ends.back();
+  indexT w1 = seq1.whichSequence( beg1() );
+  indexT w2 = seq2.whichSequence( strand == '+' ? beg2() : size2 - beg2() );
+  indexT seqStart1 = seq1.seqBeg(w1);
+  indexT seqStart2 = strand == '+' ? seq2.seqBeg(w2) : size2 - seq2.seqEnd(w2);
+
+  const std::string n1 = seq1.seqName(w1);
+  const std::string n2 = seq2.seqName(w2);
+  const std::string b1 = stringify( beg1() - seqStart1 );
+  const std::string b2 = stringify( beg2() - seqStart2 );
+  const std::string r1 = stringify( range1() );
+  const std::string r2 = stringify( range2() );
+  const std::string s1 = stringify( seq1.seqLen(w1) );
+  const std::string s2 = stringify( seq2.seqLen(w2) );
+
+  const std::size_t nw = std::max( n1.size(), n2.size() );
+  const std::size_t bw = std::max( b1.size(), b2.size() );
+  const std::size_t rw = std::max( r1.size(), r2.size() );
+  const std::size_t sw = std::max( s1.size(), s2.size() );
+
+  os << "a score=" << score << '\n'
+     << "s "
+     << std::setw( nw ) << std::left << n1 << std::right << ' '
+     << std::setw( bw ) << b1 << ' '
+     << std::setw( rw ) << r1 << ' ' << '+' << ' '
+     << std::setw( sw ) << s1 << ' ' << topString( seq1.seq, alph ) << '\n'
+     << "s "
+     << std::setw( nw ) << std::left << n2 << std::right << ' '
+     << std::setw( bw ) << b2 << ' '
+     << std::setw( rw ) << r2 << ' ' << strand << ' '
+     << std::setw( sw ) << s2 << ' ' << botString( seq2.seq, alph ) << '\n'
+     << '\n';  // blank line afterwards
+}
+
+std::string Alignment::topString( const std::vector<uchar>& seq,
+				  const Alphabet& alph ) const{
+  std::string s;
+  for( unsigned i = 0; i < blocks.size(); ++i ){
+    if( i > 0 ){
+      s.append( alph.rtString( seq.begin() + blocks[i-1].end1(),
+			       seq.begin() + blocks[i].beg1() ) );
+      s.append( blocks[i].beg2() - blocks[i-1].end2(), '-' );
+    }
+    s.append( alph.rtString( seq.begin() + blocks[i].beg1(),
+			     seq.begin() + blocks[i].end1() ) );
+  }
+  return s;
+}
+
+std::string Alignment::botString( const std::vector<uchar>& seq,
+				  const Alphabet& alph ) const{
+  std::string s;
+  for( unsigned i = 0; i < blocks.size(); ++i ){
+    if( i > 0 ){
+      s.append( blocks[i].beg1() - blocks[i-1].end1(), '-' );
+      s.append( alph.rtString( seq.begin() + blocks[i-1].end2(),
+			       seq.begin() + blocks[i].beg2() ) );
+    }
+    s.append( alph.rtString( seq.begin() + blocks[i].beg2(),
+			     seq.begin() + blocks[i].end2() ) );
+  }
+  return s;
+}
+
+}  // end namespace cbrc

Added: trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/AlignmentPot.hh	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,58 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds alignments, and includes a procedure to
+// non-redundantize alignments that share endpoints.
+
+#ifndef ALIGNMENTPOT_HH
+#define ALIGNMENTPOT_HH
+#include "Alignment.hh"
+#include <vector>
+
+namespace cbrc{
+
+struct AlignmentPot{
+  typedef std::vector<Alignment>::iterator iterator;
+
+  // add an alignment to the pot
+  void add( const Alignment& aln ) { items.push_back(aln); }
+
+  // the number of alignments in the pot
+  std::size_t size() { return items.size(); }
+
+  // erase any alignment that shares an endpoint with a higher-scoring
+  // alignment
+  void eraseSuboptimal();
+
+  // sort the alignments in descending order of score
+  void sort() { std::sort( items.begin(), items.end(), moreScore ); }
+
+  // data:
+  std::vector<Alignment> items;
+
+  static bool moreScore( const Alignment& x, const Alignment& y ){
+    // Try to break ties, so that alignments come in a consistent
+    // order.  This makes it easier to compare different results.
+    return x.score != y.score ? x.score > y.score : lessBeg( x, y );
+  }
+
+  static bool lessBeg( const Alignment& x, const Alignment& y ){
+    return
+      x.beg1() != y.beg1() ? x.beg1() < y.beg1() :
+      x.beg2() != y.beg2() ? x.beg2() < y.beg2() :
+      x.score > y.score;
+  }
+
+  static bool lessEnd( const Alignment& x, const Alignment& y ){
+    return
+      x.end1() != y.end1() ? x.end1() < y.end1() :
+      x.end2() != y.end2() ? x.end2() < y.end2() :
+      x.score > y.score;
+  }
+
+  static void mark( Alignment& a ) { a.blocks[0].score = -1; }
+
+  static bool isMarked( Alignment& a ) { return a.blocks[0].score == -1; }
+};
+
+}  // end namespace cbrc
+#endif  // ALIGNMENTPOT_HH

Added: trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/Alphabet.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,140 @@
+// Copyright 2008 Martin C. Frith
+
+#include "Alphabet.hh"
+#include <istream>
+#include <ostream>
+#include <stdexcept>
+
+namespace cbrc{
+
+const char* Alphabet::dna = "ACGT";
+
+// U=selenocysteine, O=pyrrolysine, *=stop?
+const char* Alphabet::protein = "ACDEFGHIKLMNPQRSTVW";
+
+// need to allow "*", because blosum62 includes it:
+const char* Alphabet::all = "ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
+
+void Alphabet::fromString( const std::string& alphString ){
+  letters = alphString;
+  init();
+}
+
+void Alphabet::makeCaseInsensitive(){
+  for( const char* i = all; *i; ++i ){
+    uchar upper = std::toupper( *i );
+    uchar lower = std::tolower( *i );
+    encode[lower] = encode[upper];
+  }
+}
+
+void Alphabet::tr( std::vector<uchar>::iterator beg,
+		   std::vector<uchar>::iterator end ) const{
+  for( /* noop */; beg < end; ++beg ){
+    uchar code = encode[ *beg ];
+    if( code == 255 ){
+      throw std::runtime_error( std::string("bad symbol: ") + char(*beg) );
+    }
+    *beg = code;
+  }
+}
+
+std::string Alphabet::rtString( std::vector<uchar>::const_iterator beg,
+				std::vector<uchar>::const_iterator end ) const{
+  std::string s;
+  for( /* noop */; beg < end; ++beg ){
+    s += decode[ *beg ];
+  }
+  return s;
+}
+
+void Alphabet::rc( std::vector<uchar>::iterator beg,
+		   std::vector<uchar>::iterator end ) const{
+  std::reverse( beg, end );
+
+  for( /* noop */; beg < end; ++beg ){
+    *beg = complement[ *beg ];
+  }
+}
+
+// perhaps slower than it could be:
+void Alphabet::mergeImproperCodes( std::vector<uchar>::iterator beg,
+				   std::vector<uchar>::iterator end ) const{
+  for( /* noop */; beg < end; ++beg ){
+    if( *beg > size ) *beg = size;
+  }
+}
+
+void Alphabet::init(){
+  size = letters.size();
+
+  // std::toupper doesn't work here!
+  std::transform( letters.begin(), letters.end(), letters.begin(), toupper );
+
+  std::fill_n( encode, 256, 255 );  // fill encode with value 255
+
+  uchar code = 0;
+
+  // add the "real" alphabet letters:
+  for( std::string::iterator i = letters.begin(); i < letters.end(); ++i ){
+    uchar letter = *i;
+    if( letter == ' ' || encode[ letter ] != 255 ){
+      throw std::runtime_error( "bad alphabet: " + letters );
+    }
+    code = newCode( letter, code );
+  }
+
+  // add space as a delimiter symbol:
+  code = newCode( ' ', code );
+
+  // add any remaining letters (e.g. ambiguous nucleotides):
+  for( const char* i = all; *i; ++i ){
+    uchar letter = *i;
+    if( encode[ letter ] == 255 ) code = newCode( letter, code );
+  }
+
+  // add lowercase letters:
+  for( const char* i = all; *i; ++i ){
+    uchar letter = std::tolower( *i );
+    if( encode[ letter ] == 255 ) code = newCode( letter, code );
+  }
+
+  makeComplement();
+}
+
+Alphabet::uchar Alphabet::newCode( uchar letter, uchar code ){
+  encode[letter] = code;
+  decode[code] = letter;
+  canonical[code] = encode[ std::toupper(letter) ];
+  return code + 1;
+}
+
+void Alphabet::makeComplement(){
+  static const char* x = "ACGTRYSWKMBDHVN";
+  static const char* y = "TGCAYRSWMKVHDBN";
+
+  for( unsigned i = 0; i < 256; ++i ){
+    complement[i] = i;
+  }
+
+  for( unsigned i = 0; x[i] && y[i]; ++i ){
+    uchar xUpper = std::toupper( x[i] );
+    uchar yUpper = std::toupper( y[i] );
+    complement[ encode[xUpper] ] = encode[yUpper];
+    uchar xLower = std::tolower( x[i] );
+    uchar yLower = std::tolower( y[i] );
+    complement[ encode[xLower] ] = encode[yLower];
+  }
+}
+
+std::ostream& operator<<( std::ostream& s, const Alphabet& a ){
+  return s << a.letters;
+}
+
+std::istream& operator>>( std::istream& s, Alphabet& a ){
+  s >> a.letters;
+  if( s ) a.init();
+  return s;
+}
+
+}  // end namespace cbrc

Added: trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/MultiSequence.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,115 @@
+// Copyright 2008 Martin C. Frith
+
+#include "MultiSequence.hh"
+#include "io.hh"
+#include <sstream>
+#include <cassert>
+
+namespace cbrc{
+
+void MultiSequence::initForAppending( indexT padSizeIn ){
+  padSize = padSizeIn;
+  nameEnds.push_back( names.size() );
+  finish();
+}
+
+void MultiSequence::reinitForAppending(){
+  seq.erase( seq.begin(), seq.begin() + ends.back() - padSize );
+  names.erase( names.begin(),
+	       names.begin() + nameEnds[ finishedSequences() ] );
+  ends.resize(1);
+  nameEnds.resize(1);
+  if( !names.empty() ) nameEnds.push_back( names.size() );
+}
+
+void MultiSequence::fromFiles( const std::string& baseName, indexT seqCount ){
+  ends.resize( seqCount + 1 );  // unwanted zero-fill
+  vectorFromBinaryFile( ends, baseName + ".ssp" );
+
+  seq.resize( ends.back() );  // unwanted zero-fill
+  vectorFromBinaryFile( seq, baseName + ".tis" );
+
+  nameEnds.resize( seqCount + 1 );  // unwanted zero-fill
+  vectorFromBinaryFile( nameEnds, baseName + ".sds" );
+
+  names.resize( nameEnds.back() );  // unwanted zero-fill
+  vectorFromBinaryFile( names, baseName + ".des" );
+
+  padSize = ends[0];
+}
+
+void MultiSequence::toFiles( const std::string& baseName ) const{
+  vectorToBinaryFile( ends, baseName + ".ssp" );
+
+  memoryToBinaryFile( seq.begin(), seq.begin() + ends.back(),
+		      baseName + ".tis" );
+
+  memoryToBinaryFile( nameEnds.begin(), nameEnds.begin() + ends.size(),
+		      baseName + ".sds" );
+
+  memoryToBinaryFile( names.begin(),
+		      names.begin() + nameEnds[ finishedSequences() ],
+		      baseName + ".des" );
+}
+
+// probably slower than it could be:
+std::istream&
+MultiSequence::appendFromFasta( std::istream& stream, std::size_t maxBytes ){
+  uchar c;
+
+  if( isFinished() ){
+    while( stream >> c ){
+      if( c == '>' ) break;
+    }
+    if( !stream ) return stream;
+
+    // get the first word of the FASTA title line:
+    std::string line, word;
+    std::getline( stream, line );
+    std::istringstream iss(line);
+    iss >> word;
+    names.insert( names.end(), word.begin(), word.end() );
+    nameEnds.push_back( names.size() );
+  }
+
+  while( isFinishable(maxBytes) && stream >> c ){  // skips whitespace
+    if( c == '>' ){
+      stream.unget();
+      break;
+    }else{
+      seq.push_back(c);
+    }
+  }
+
+  if( isFinishable(maxBytes) ) finish();
+  if( stream.eof() && !stream.bad() ) stream.clear();
+  return stream;
+}
+
+void MultiSequence::finish(){
+  seq.insert( seq.end(), padSize, ' ' );
+  ends.push_back( seq.size() );
+}
+
+void MultiSequence::unfinish(){
+  ends.pop_back();
+  seq.erase( seq.end() - padSize, seq.end() );
+}
+
+bool MultiSequence::isFinishable( std::size_t maxBytes ) const{
+  return seq.size() + padSize <= maxBytes;
+}
+
+MultiSequence::indexT MultiSequence::whichSequence( indexT coordinate ) const{
+  std::vector<indexT>::const_iterator u =
+    std::upper_bound( ends.begin(), ends.end(), coordinate );
+  assert( u != ends.begin() && u != ends.end() );
+  return u - ends.begin() - 1;
+}
+
+std::string MultiSequence::seqName( indexT seqNum ) const{
+  return std::string( names.begin() + nameEnds[ seqNum ],
+		      names.begin() + nameEnds[ seqNum + 1 ] );
+}
+
+}  // end namespace cbrc

Added: trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/PeriodicSpacedSeed.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,46 @@
+// Copyright 2008 Martin C. Frith
+
+#include "PeriodicSpacedSeed.hh"
+#include <stdexcept>
+#include <istream>
+#include <ostream>
+
+namespace cbrc{
+
+void PeriodicSpacedSeed::fromString( const std::string& s ){
+  pattern = s;
+  init();
+}
+
+void PeriodicSpacedSeed::init(){
+  if( pattern.empty() || pattern[0] != '1' ){
+    throw std::runtime_error("bad mask: " + pattern);
+  }
+
+  offsets.clear();
+  unsigned off = 1;
+
+  for( std::size_t i = 1; i < pattern.size(); ++i ){
+    if( pattern[i] == '1' ){
+      offsets.push_back(off);
+      off = 0;
+    }
+    ++off;
+  }
+
+  offsets.push_back(off);
+
+  maxOffset = *max_element( offsets.begin(), offsets.end() );
+}
+
+std::ostream& operator<<( std::ostream& s, const PeriodicSpacedSeed& m ){
+  return s << m.pattern;
+}
+
+std::istream& operator>>( std::istream& s, PeriodicSpacedSeed& m ){
+  s >> m.pattern;
+  if( s ) m.init();
+  return s;
+}
+
+}  // end namespace cbrc

Added: trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,195 @@
+// Copyright 2008 Martin C. Frith
+
+#include "ScoreMatrix.hh"
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <cassert>
+//#include <iostream>  // for debugging
+
+#define ERR(x) throw std::runtime_error(x)
+
+namespace cbrc{
+
+// copied from NCBI BLAST:
+const char* ScoreMatrix::blosum62 = "\
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  J  Z  X  *\n\
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1 -1 -1 -4\n\
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2  0 -1 -4\n\
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  4 -3  0 -1 -4\n\
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4 -3  1 -1 -4\n\
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4\n\
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0 -2  4 -1 -4\n\
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1 -3  4 -1 -4\n\
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -4 -2 -1 -4\n\
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0 -3  0 -1 -4\n\
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3  3 -3 -1 -4\n\
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4  3 -3 -1 -4\n\
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0 -3  1 -1 -4\n\
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3  2 -1 -1 -4\n\
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3  0 -3 -1 -4\n\
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4\n\
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0 -2  0 -1 -4\n\
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1 -1 -1 -4\n\
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -2 -2 -1 -4\n\
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -1 -2 -1 -4\n\
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3  2 -2 -1 -4\n\
+B -2 -1  4  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4 -3  0 -1 -4\n\
+J -1 -2 -3 -3 -1 -2 -3 -4 -3  3  3 -3  2  0 -3 -2 -1 -2 -1  2 -3  3 -3 -1 -4\n\
+Z -1  0  0  1 -3  4  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -2 -2 -2  0 -3  4 -1 -4\n\
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4\n\
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1\n\
+";
+
+void ScoreMatrix::matchMismatch( int match, int mismatch,
+				 const std::string& letters ){
+  rows = letters;
+  cols = letters;
+  std::size_t size = letters.size();
+
+  cells.resize( size );
+
+  for( std::size_t i = 0; i < size; ++i ){
+    cells[i].assign( size, -mismatch );
+    cells[i][i] = match;
+  }
+}
+
+void ScoreMatrix::fromFile( const std::string& fileName ){
+  std::ifstream file( fileName.c_str() );
+  if( !file ) ERR( "can't open file: " + fileName );
+  file >> *this;
+  if( !file ) ERR( "can't read file: " + fileName );
+}
+
+void ScoreMatrix::fromString( const std::string& matString ){
+  std::istringstream iss(matString);
+  iss >> *this;
+  assert(iss);
+}
+
+void ScoreMatrix::init( const uchar encode[] ){
+  assert( !rows.empty() && !cols.empty() );
+
+  std::transform( rows.begin(), rows.end(), rows.begin(), toupper );
+  std::transform( cols.begin(), cols.end(), cols.begin(), toupper );
+
+  minScore = cells[0][0];
+  maxScore = cells[0][0];
+
+  for( std::size_t i = 0; i < rows.size(); ++i ){
+    for( std::size_t j = 0; j < cols.size(); ++j ){
+      minScore = std::min( minScore, cells[i][j] );
+      maxScore = std::max( maxScore, cells[i][j] );
+    }
+  }
+
+  // set default score = minScore:
+  for( unsigned i = 0; i < MAT; ++i ){
+    for( unsigned j = 0; j < MAT; ++j ){
+      caseSensitive[i][j] = minScore;
+      caseInsensitive[i][j] = minScore;
+    }
+  }
+
+  for( std::size_t i = 0; i < rows.size(); ++i ){
+    for( std::size_t j = 0; j < cols.size(); ++j ){
+      uchar x = encode[ rows[i] ];
+      uchar y = encode[ cols[j] ];
+      uchar a = encode[ std::tolower( rows[i] ) ];
+      uchar b = encode[ std::tolower( cols[j] ) ];
+      if( a >= MAT ) ERR( std::string("bad symbol: ") + rows[i] );
+      if( b >= MAT ) ERR( std::string("bad symbol: ") + cols[j] );
+      caseSensitive[x][y] = cells[i][j];
+      caseInsensitive[x][y] = cells[i][j];
+      caseInsensitive[x][b] = cells[i][j];
+      caseInsensitive[a][y] = cells[i][j];
+      caseInsensitive[a][b] = cells[i][j];
+    }
+  }
+
+  // set a hugely negative score for the delimiter symbol:
+  uchar z = encode[' '];
+  assert( z < MAT );
+  for( unsigned i = 0; i < MAT; ++i ){
+    caseSensitive[z][i] = -INF;
+    caseSensitive[i][z] = -INF;
+    caseInsensitive[z][i] = -INF;
+    caseInsensitive[i][z] = -INF;    
+  }
+}
+
+void ScoreMatrix::writeCommented( std::ostream& stream ) const{
+  stream << "# " << ' ';
+  for( std::size_t i = 0; i < cols.size(); ++i ){
+    stream << ' ' << std::setw(OUTPAD) << cols[i];
+  }
+  stream << '\n';
+
+  for( std::size_t i = 0; i < rows.size(); ++i ){
+    stream << "# " << rows[i];
+    for( std::size_t j = 0; j < cols.size(); ++j ){
+      stream << ' ' << std::setw(OUTPAD) << cells[i][j];
+    }
+    stream << '\n';
+  }
+}
+
+std::istream& operator>>( std::istream& stream, ScoreMatrix& m ){
+  std::string tmpRows;
+  std::string tmpCols;
+  std::vector< std::vector<int> > tmpCells;
+  std::string line;
+  int state = 0;
+
+  while( std::getline( stream, line ) ){
+    std::istringstream iss(line);
+    char c;
+    if( !(iss >> c) ) continue;  // skip blank lines
+    if( state == 0 ){
+      if( c == '#' ) continue;  // skip comment lines at the top
+      do{
+	tmpCols.push_back(c);
+      }while( iss >> c );
+      state = 1;
+    }
+    else{
+      tmpRows.push_back(c);
+      tmpCells.resize( tmpCells.size() + 1 );
+      int score;
+      while( iss >> score ){
+	tmpCells.back().push_back(score);
+      }
+      if( tmpCells.back().size() != tmpCols.size() ) ERR( "bad score matrix" );
+    }
+  }
+
+  if( stream.eof() && !stream.bad() && !tmpRows.empty() ){
+    stream.clear();
+    m.rows.swap(tmpRows);
+    m.cols.swap(tmpCols);
+    m.cells.swap(tmpCells);
+  }
+
+  return stream;
+}
+
+std::ostream& operator<<( std::ostream& stream, const ScoreMatrix& m ){
+  for( std::size_t i = 0; i < m.cols.size(); ++i ){
+    stream << ' ' << std::setw(m.OUTPAD) << m.cols[i];
+  }
+  stream << '\n';
+
+  for( std::size_t i = 0; i < m.rows.size(); ++i ){
+    stream << m.rows[i];
+    for( std::size_t j = 0; j < m.cols.size(); ++j ){
+      stream << ' ' << std::setw(m.OUTPAD) << m.cells[i][j];
+    }
+    stream << '\n';
+  }
+
+  return stream;
+}
+
+}  // end namespace cbrc

Added: trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/ScoreMatrix.hh	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,43 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds a score matrix for aligning pairs of residues,
+// e.g. blosum62.  The delimiter symbol (space) aligned to anything
+// gets a score of -INF.
+
+#ifndef SCOREMATRIX_HH
+#define SCOREMATRIX_HH
+#include <string>
+#include <vector>
+#include <iosfwd>
+
+namespace cbrc{
+
+struct ScoreMatrix{
+  typedef unsigned char uchar;
+
+  enum { INF = INT_MAX / 2 };  // big, but try to avoid overflow
+  enum { MAT = 64 };           // matrix size = MAT x MAT
+  enum { OUTPAD = 2 };         // cell-padding for output
+
+  static const char* blosum62;
+
+  void matchMismatch( int match, int mismatch, const std::string& letters );
+  void fromFile( const std::string& fileName );
+  void fromString( const std::string& s );
+  void init( const uchar encode[] );  // unspecified letters get minScore
+  void writeCommented( std::ostream& stream ) const;  // write preceded by "#"
+
+  std::string rows;                       // row headings (letters)
+  std::string cols;                       // column headings (letters)
+  std::vector< std::vector<int> > cells;  // scores
+  int caseSensitive[MAT][MAT];
+  int caseInsensitive[MAT][MAT];
+  int minScore;
+  int maxScore;
+};
+
+std::istream& operator>>( std::istream& stream, ScoreMatrix& mat );
+std::ostream& operator<<( std::ostream& stream, const ScoreMatrix& mat );
+
+}  // end namespace cbrc
+#endif  // SCOREMATRIX_HH

Added: trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/SuffixArray.hh	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,88 @@
+// Copyright 2008 Martin C. Frith
+
+// This struct holds a suffix array.  The suffix array is just a list
+// of numbers indicating positions in a text, sorted according to the
+// alphabetical order of the text suffixes starting at these
+// positions.  A query sequence can then be matched to the suffix
+// array using binary search.  For faster matching, we use "buckets",
+// which store the matching regions for all k-mers for k=1 to, say,
+// 12.  In addition, we allow skipping of positions while sorting and
+// matching, using a periodic spaced seed.
+
+// To allow for "improper" letters in the text (e.g. non-ACGT for
+// DNA), the buckets are slightly cunning.  For each k-mer size k, if
+// the alphabet size is A (e.g. 4 for DNA), we store A^k + A^(k-1)
+// numbers.  The A^k numbers are the starting locations in the suffix
+// array of every "proper" k-mer.  The extra A^(k-1) numbers are the
+// starting positions of improper k-mers.
+
+#ifndef SUFFIXARRAY_HH
+#define SUFFIXARRAY_HH
+#include <vector>
+
+namespace cbrc{
+
+struct SuffixArray{
+  typedef unsigned indexT;
+  typedef unsigned char uchar;
+
+  SuffixArray( const std::vector<uchar>& t, const std::vector<indexT>& m,
+	       unsigned a )
+    : text(t), mask(m), alphSize(a), maskSize( m.size() ){}
+
+  // Add (unsorted) indices for text positions between beg and end,
+  // and return 1.  Positions starting with delimiters aren't added.
+  // If the index size would exceed maxBytes, don't add anything, and
+  // return 0.
+  int makeIndex( indexT beg, indexT end, indexT step, std::size_t maxBytes );
+
+  std::size_t indexBytes() const{ return index.size() * sizeof(indexT); }
+  void sortIndex();
+  void makeBuckets( indexT bucketDepth );
+  void clear();
+
+  void fromFiles( const std::string& baseName,
+		  indexT indexNum, indexT bucketDepth );
+  void toFiles( const std::string& baseName ) const;
+
+  // Find the smallest match to the text, starting at the given
+  // position in the query, such that there are at most maxHits
+  // matches, and the match-depth is at least minDepth.  Return the
+  // range of matching indices via beg and end.
+  void match( const indexT*& beg, const indexT*& end,
+	      const uchar* queryPtr,
+	      indexT maxHits, indexT minDepth ) const;
+
+  // Count matches of all sizes, starting at the given position in the
+  // query.  Don't try this for large self-comparisons!
+  void countMatches( std::vector<unsigned long long>& counts,
+		     const uchar* queryPtr ) const;
+
+  const std::vector<uchar>& text;
+  const std::vector<indexT>& mask;
+  unsigned alphSize;  // excluding delimiter symbol
+  unsigned maskSize;  // same as mask.size(), but faster!!!
+  std::vector<indexT> index;  // sorted indices
+  std::vector< std::vector<indexT> > bucketNest;
+  std::vector<indexT> bucketMask;  // extended mask, to avoid modulus
+  indexT bucketMaskTotal;
+
+  static void equalRange( const indexT*& beg, const indexT*& end,
+			  const uchar* textBase, uchar symbol );
+  static const indexT* lowerBound( const indexT* beg, const indexT* end,
+				   const uchar* textBase, uchar symbol );
+  static const indexT* upperBound( const indexT* beg, const indexT* end,
+				   const uchar* textBase, uchar symbol );
+
+  void makeBucketMask( indexT bucketDepth );
+
+  void radixSort( indexT* beg, indexT* end,
+		  indexT depth, const uchar* textBase );
+  void insertionSort( indexT* beg, indexT* end,
+		      indexT depth, const uchar* textBase );
+  void insertionSortSimple( indexT* beg, indexT* end,
+			    const uchar* textBase );
+};
+
+}  // end namespace cbrc
+#endif  // SUFFIXARRAY_HH

Added: trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/XdropAligner.hh	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,77 @@
+// Copyright 2008 Martin C. Frith
+
+// X-drop algorithm for gapped extension of alignments.
+
+#ifndef XDROPALIGNER_HH
+#define XDROPALIGNER_HH
+#include <vector>
+
+namespace cbrc{
+
+class GeneralizedAffineGapCosts;
+class SegmentPair;
+
+class XdropAligner{
+public:
+  typedef unsigned char uchar;
+
+  enum direction{ FORWARD, REVERSE };
+
+  // Extend an alignment, from the given start point, in the given direction
+  // Return the score (bestScore)
+  int fill( const uchar* seq1, const uchar* seq2,
+	    size_t start1, size_t start2, direction dir,
+	    const int sm[64][64], int maxDrop,
+	    const GeneralizedAffineGapCosts& gap );
+
+  // Get the ungapped segments of the extension, putting them in "chunks"
+  // The extension might be empty!
+  void traceback( std::vector< SegmentPair >& chunks,
+		  const uchar* seq1, const uchar* seq2,
+		  size_t start1, size_t start2, direction dir,
+		  const int sm[64][64],
+		  const GeneralizedAffineGapCosts& gap ) const;
+
+private:
+  typedef std::vector< std::vector< int > > matrix_t;
+
+  enum { INF = INT_MAX / 2 };  // big, but try to avoid overflow
+
+  int bestScore;
+  size_t bestAntiDiagonal;
+  size_t bestPos1;
+
+  // maybe use std::vector< std::vector< int[3] > > instead of x, y, z???
+  matrix_t x;
+  matrix_t y;
+  matrix_t z;
+  std::vector< size_t > offsets;
+
+  static int drop( int score, int minScore );
+  static bool isDelimiter( uchar c, const int sm[64][64] );
+  static const uchar* seqPtr( const uchar* seq, size_t start,
+			      direction dir, size_t pos );
+  static int match( const uchar* seq1, const uchar* seq2,
+		    size_t start1, size_t start2, direction dir,
+		    const int sm[64][64],
+		    size_t antiDiagonal, size_t seq1pos );
+
+  size_t fillBeg( size_t antiDiagonal ) const;
+  size_t fillEnd( size_t antiDiagonal ) const;
+  int cell( const matrix_t& matrix,
+	    size_t antiDiagonal, size_t seq1pos ) const;
+  int hori( const matrix_t& matrix,
+	    size_t antiDiagonal, size_t seq1pos ) const;
+  int vert( const matrix_t& matrix,
+	    size_t antiDiagonal, size_t seq1pos ) const;
+  int diag( const matrix_t& matrix,
+	    size_t antiDiagonal, size_t seq1pos ) const;
+  void reset( const GeneralizedAffineGapCosts& gap );
+  void initScores( size_t antiDiagonal, size_t reservation );
+  void updateBest( int score, size_t antiDiagonal, const int* x0 );
+  size_t newFillBeg( size_t k1, size_t k2, size_t off1, size_t end1 ) const;
+  size_t newFillEnd( size_t k1, size_t k2, size_t off1, size_t end1 ) const;
+};
+
+}  // end namespace cbrc
+#endif  // XDROPALIGNER_HH

Added: trunk/packages/last-align/branches/upstream/current/src/lastal.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/lastal.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/lastal.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,420 @@
+// Copyright 2008 Martin C. Frith
+
+// BLAST-like pair-wise sequence alignment, using suffix arrays.
+
+#include "LastalArguments.hh"
+#include "SuffixArray.hh"
+#include "XdropAligner.hh"
+#include "AlignmentPot.hh"
+#include "Alignment.hh"
+#include "SegmentPairPot.hh"
+#include "SegmentPair.hh"
+#include "ScoreMatrix.hh"
+#include "Alphabet.hh"
+#include "MultiSequence.hh"
+#include "PeriodicSpacedSeed.hh"
+#include "DiagonalTable.hh"
+#include "GeneralizedAffineGapCosts.hh"
+#include "io.hh"
+#include "stringify.hh"
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <cassert>
+
+#define LOG(x) if( args.verbosity > 0 ) std::cerr << "lastal: " << x << '\n'
+
+typedef unsigned indexT;
+typedef unsigned char uchar;
+typedef unsigned long long countT;
+
+// Set up a scoring matrix, based on the user options
+void makeScoreMatrix( cbrc::ScoreMatrix& sm, cbrc::LastalArguments& args,
+		      const cbrc::Alphabet& alph ){
+  if( !args.matrixFile.empty() ){
+    sm.fromFile( args.matrixFile );
+  }
+  else if( args.matchScore < 0 && args.mismatchCost < 0 &&
+	   alph.letters == alph.protein ){
+    sm.fromString( sm.blosum62 );
+  }
+  else{
+    // if matchScore or mismatchCost weren't set, use default values:
+    if( args.matchScore < 0 )    args.matchScore = 1;
+    if( args.mismatchCost < 0 )  args.mismatchCost = 1;
+    sm.matchMismatch( args.matchScore, args.mismatchCost, alph.letters );
+  }
+
+  sm.init( alph.encode );
+}
+
+// Read the .prj file for the whole database
+void readOuterPrj( const std::string& fileName, cbrc::Alphabet& alph,
+		   cbrc::PeriodicSpacedSeed& mask, unsigned& volumes ){
+  std::ifstream f( fileName.c_str() );
+  if( !f ) throw std::runtime_error("can't open file: " + fileName );
+
+  std::string word;
+  while( std::getline( f >> std::ws, word, '=' ) ){  // eat leading whitespace
+    /**/ if( word == "alphabet" ) f >> alph;
+    else if( word == "spacedseed" ) f >> mask;
+    else if( word == "volumes" ) f >> volumes;
+    else f >> word;
+  }
+
+  if( f.eof() && !f.bad() ) f.clear();
+  if( alph.letters.empty() || mask.pattern.empty() || volumes == -1u ){
+    f.setstate( std::ios::failbit );
+  }
+  if( !f ) throw std::runtime_error("can't read file: " + fileName);
+}
+
+// Read a per-volume .prj file, with info about a database volume
+void readInnerPrj( const std::string& fileName, indexT& seqCount,
+		   indexT& delimiterNum, indexT& bucketDepth ){
+  std::ifstream f( fileName.c_str() );
+  if( !f ) throw std::runtime_error("can't open file: " + fileName );
+
+  std::string word;
+  while( std::getline( f >> std::ws, word, '=' ) ){  // eat leading whitespace
+    /**/ if( word == "numofsequences" ) f >> seqCount;
+    else if( word == "specialcharacters" ) f >> delimiterNum;
+    else if( word == "prefixlength" ) f >> bucketDepth;
+    else f >> word;
+  }
+
+  if( f.eof() && !f.bad() ) f.clear();
+  if( seqCount == -1u || delimiterNum == -1u || bucketDepth == -1u ){
+    f.setstate( std::ios::failbit );
+  }
+  if( !f ) throw std::runtime_error("can't read file: " + fileName);
+}
+
+// Write match counts for each query sequence
+void writeCounts( const std::vector< std::vector<countT> >& matchCounts,
+		  const cbrc::MultiSequence& query, indexT minDepth,
+		  std::ostream& out ){
+  for( indexT i = 0; i < matchCounts.size(); ++i ){
+    out << query.names[i] << '\n';
+
+    for( indexT j = minDepth-1; j < matchCounts[i].size(); ++j ){
+      out << j+1 << '\t' << matchCounts[i][j] << '\n';
+    }
+  }
+}
+
+// Count all matches, of all sizes, of a query batch against a suffix array
+void countMatches( std::vector< std::vector<countT> >& matchCounts,
+		   const cbrc::MultiSequence& query,
+		   const cbrc::SuffixArray& sa,
+		   const cbrc::LastalArguments& args, char strand ){
+  indexT seqNum = strand == '+' ? 0 : query.finishedSequences() - 1;
+
+  for( indexT i = 0; i < query.ends.back(); i += args.queryStep ){
+    if( strand == '+' ){
+      for( ;; ){
+	if( seqNum == query.finishedSequences() ) return;
+	if( query.seqEnd(seqNum) > i ) break;
+	++seqNum;
+      }
+    }
+    else{
+      for( ;; ){
+	if( seqNum == -1u ) return;
+	if( query.seqBeg(seqNum) < query.ends.back() - i ) break;
+	--seqNum;
+      }
+    }
+
+    sa.countMatches( matchCounts[seqNum], &query.seq[i] );
+  }
+}
+
+// Find query matches to the suffix array, and do gapless extensions
+void alignGapless( cbrc::SegmentPairPot& gaplessAlns,
+		   const cbrc::MultiSequence& query,
+		   const cbrc::MultiSequence& text,
+		   const cbrc::SuffixArray& sa,
+		   const cbrc::LastalArguments& args,
+		   const cbrc::ScoreMatrix& sm,
+		   const cbrc::Alphabet& alph,
+		   char strand, std::ostream& out ){
+  const std::vector<uchar>& qseq = query.seq;
+  const std::vector<uchar>& tseq = text.seq;
+  const int (*mat)[64] =
+    args.maskLowercase < 2 ? sm.caseInsensitive : sm.caseSensitive;
+  cbrc::DiagonalTable dt;  // record already-covered positions on each diagonal
+  countT gaplessExtensionCount = 0, gaplessAlignmentCount = 0;
+
+  for( indexT i = 0; i < query.ends.back(); i += args.queryStep ){
+    const indexT* beg;
+    const indexT* end;
+    sa.match( beg, end, &qseq[i], args.oneHitMultiplicity, args.minHitDepth );
+
+    for( /* noop */; beg < end; ++beg ){  // loop over suffix-array matches
+      if( dt.isCovered( i, *beg ) ) continue;
+
+      // tried: first get the score only, not the endpoints: slower!
+      cbrc::SegmentPair sp;  // do the gapless extension:
+      sp.makeXdrop( *beg, i, &tseq[0], &qseq[0], mat, args.maxDropGapless );
+      ++gaplessExtensionCount;
+
+      // Tried checking the score after isOptimal & addEndpoint, but
+      // the number of extensions decreased by < 10%, and it was
+      // slower overall.
+      if( sp.score < args.minScoreGapless ) continue;
+
+      if( !sp.isOptimal( &tseq[0], &qseq[0], mat, args.maxDropGapless ) ){
+	continue;  // ignore sucky gapless extensions
+      }
+
+      dt.addEndpoint( sp.end2(), sp.end1() );
+
+      if( args.outputType == 1 ){  // we just want gapless alignments
+	cbrc::Alignment aln;
+	aln.fromSegmentPair(sp);
+	aln.write( text, query, strand, alph, args.outputFormat, out );
+      }
+      else gaplessAlns.add(sp);  // add the gapless alignment to the pot
+      ++gaplessAlignmentCount;
+    }
+  }
+
+  LOG( "gapless extensions=" << gaplessExtensionCount );
+  LOG( "gapless alignments=" << gaplessAlignmentCount );
+}
+
+// Do gapped extensions of the gapless alignments
+void alignGapped( cbrc::AlignmentPot& gappedAlns,
+		  cbrc::SegmentPairPot& gaplessAlns,
+		  const std::vector<uchar>& text,
+		  const std::vector<uchar>& query,
+		  const cbrc::LastalArguments& args,
+		  const cbrc::ScoreMatrix& sm,
+		  const cbrc::Alphabet& alph ){
+  cbrc::XdropAligner aligner;
+  cbrc::GeneralizedAffineGapCosts gap( args.gapExistCost, args.gapExtendCost,
+				       args.gapPairCost );
+  const int (*mat)[64] =
+    args.maskLowercase < 3 ? sm.caseInsensitive : sm.caseSensitive;
+  countT gappedExtensionCount = 0;
+
+  gaplessAlns.sort();  // sort the gapless alignments by score, highest first
+
+  for( size_t i = 0; i < gaplessAlns.size(); ++i ){
+    const cbrc::SegmentPair& sp = gaplessAlns.get(i);
+
+    if( sp.score == 0 ) continue;  // it has been marked as redundant
+
+    cbrc::SegmentPair seed;
+
+    // Redo gapless extension, using gapped score parameters.  Without
+    // this, if we self-compare a huge sequence, we risk getting a
+    // huge gapped extension.  (Do this before sorting them?)
+    seed.makeXdrop( sp.beg1(), sp.beg2(), &text[0], &query[0],
+		    mat, args.maxDropGapped );
+
+    // Shrink the seed to its longest run of identical matches.  This
+    // trims off possibly unreliable parts of the gapless alignment.
+    seed.maxIdenticalRun( &text[0], &query[0], alph.canonical, mat );
+
+    cbrc::Alignment aln;  // do gapped extension from each end of the seed:
+    aln.makeXdrop( aligner, seed, &text[0], &query[0],
+		   mat, args.maxDropGapped, gap );
+    ++gappedExtensionCount;
+
+    if( aln.score < args.minScoreGapped ) continue;
+
+    if( !aln.isOptimal( &text[0], &query[0], mat, args.maxDropGapped, gap ) ){
+      // If retained, non-"optimal" alignments can hide "optimal"
+      // alignments, e.g. during non-reduntantization.
+      continue;
+    }
+
+    gaplessAlns.markAllOverlaps( aln.blocks );
+    gaplessAlns.markTandemRepeats( seed, args.maxRepeatDistance ); 
+    gappedAlns.add(aln);  // add the gapped alignment to the pot
+ }
+
+  LOG( "gapped extensions=" << gappedExtensionCount );
+  LOG( "gapped alignments=" << gappedAlns.size() );
+}
+
+// Scan one batch of query sequences against one database volume
+void scan( const cbrc::MultiSequence& query, const cbrc::MultiSequence& text,
+	   const cbrc::SuffixArray& sa, const cbrc::LastalArguments& args,
+	   const cbrc::Alphabet& alph, const cbrc::ScoreMatrix& sm,
+	   std::vector< std::vector<countT> >& matchCounts,
+	   char strand, std::ostream& out ){
+  LOG( "scanning..." );
+
+  if( args.outputType == 0 ){  // we just want match counts
+    countMatches( matchCounts, query, sa, args, strand );
+    return;
+  }
+
+  cbrc::SegmentPairPot gaplessAlns;
+  alignGapless( gaplessAlns, query, text, sa, args, sm, alph, strand, out );
+  if( args.outputType == 1 ) return;  // we just want gapless alignments
+
+  cbrc::AlignmentPot gappedAlns;
+  alignGapped( gappedAlns, gaplessAlns, text.seq, query.seq, args, sm, alph );
+
+  if( args.outputType == 3 ){  // we want non-redundant alignments
+    gappedAlns.eraseSuboptimal();
+    LOG( "nonredundant gapped alignments=" << gappedAlns.size() );
+  }
+
+  gappedAlns.sort();  // sort by score
+  for( size_t i = 0; i < gappedAlns.size(); ++i ){
+    gappedAlns.items[i].write( text, query, strand, alph,
+			       args.outputFormat, out );
+  }
+}
+
+// Scan one batch of query sequences against all database volumes
+void scanAllVolumes( cbrc::MultiSequence& query,
+		     const cbrc::LastalArguments& args,
+		     const cbrc::Alphabet& alph,
+		     const cbrc::PeriodicSpacedSeed& mask,
+		     const cbrc::ScoreMatrix& sm,
+		     unsigned volumes, std::ostream& out ){
+  std::vector< std::vector<countT> > matchCounts;  // used if outputType == 0
+  if( args.outputType == 0 ) matchCounts.resize( query.finishedSequences() );
+
+  for( unsigned i = 0; i < volumes; ++i ){
+    std::string baseName = args.lastdbName + cbrc::stringify(i);
+    LOG( "reading " << baseName << "..." );
+
+    indexT seqCount = -1u, delimiterNum = -1u, bucketDepth = -1u;
+    readInnerPrj( baseName + ".prj", seqCount, delimiterNum, bucketDepth );
+
+    cbrc::MultiSequence text;
+    text.fromFiles( baseName, seqCount );
+
+    cbrc::SuffixArray sa( text.seq, mask.offsets, alph.size );
+    sa.fromFiles( baseName, text.ends.back() - delimiterNum, bucketDepth );
+
+    if( args.strand == 2 && i > 0 ){
+      LOG( "re-reverse complementing..." );
+      alph.rc( query.seq.begin(), query.seq.begin() + query.ends.back() );
+    }
+
+    if( args.strand != 0 ){
+      scan( query, text, sa, args, alph, sm, matchCounts, '+', out );
+    }
+
+    if( args.strand == 2 || args.strand == 0 && i == 0 ){
+      LOG( "reverse complementing..." );
+      alph.rc( query.seq.begin(), query.seq.begin() + query.ends.back() );
+    }
+
+    if( args.strand != 1 ){
+      scan( query, text, sa, args, alph, sm, matchCounts, '-', out );
+    }
+  }
+
+  if( args.outputType == 0 ){
+    LOG( "writing..." );
+    writeCounts( matchCounts, query, args.minHitDepth, out );
+  }
+
+  LOG( "done!" );
+}
+
+void writeHeader( std::ostream& out, const cbrc::LastalArguments& args,
+		  const cbrc::ScoreMatrix sm ){
+  out << "# LAST version " <<
+#include "version.hh"
+      << "\n";
+  out << "#\n";
+  args.writeCommented( out );
+  out << "#\n";
+
+  if( args.outputType == 0 ){  // we just want hit counts
+    out << "# depth\tcount\n";
+  }
+  else{  // we want alignments
+    sm.writeCommented( out );
+    out << "#\n";
+    out << "# Coordinates are 0-based.  For - strand matches, coordinates\n";
+    out << "# in the reverse complement of the 2nd sequence are used.\n";
+    out << "#\n";
+
+    if( args.outputFormat == 0 ){  // tabular format
+      out << "# score\tname1\tstart1\talnSize1\tstrand1\tseqSize1\t"
+	  << "name2\tstart2\talnSize2\tstrand2\tseqSize2\tblocks\n";
+    }
+    else{  // MAF format
+      out << "# name start alnSize strand seqSize alignment\n";
+    }
+  }
+
+  out << "#\n";
+}
+
+// Read the next sequence, adding it to the MultiSequence
+std::istream&
+appendFromFasta( cbrc::MultiSequence& query,
+		 const cbrc::LastalArguments& args, const cbrc::Alphabet& alph,
+		 std::istream& in ){
+  std::size_t maxSeqBytes = args.batchSize;
+  if( query.finishedSequences() == 0 ) maxSeqBytes = std::size_t(-1);
+
+  indexT oldSeqSize = query.seq.size();
+
+  query.appendFromFasta( in, maxSeqBytes );
+
+  // encode the newly-read sequence
+  alph.tr( query.seq.begin() + oldSeqSize, query.seq.end() );
+
+  return in;
+}
+
+int main( int argc, char** argv )
+try{
+  std::clock_t startTime = std::clock();
+  cbrc::LastalArguments args( argc, argv );
+  cbrc::Alphabet alph;
+  cbrc::PeriodicSpacedSeed mask;
+  cbrc::ScoreMatrix sm;
+  unsigned volumes = -1u;  // initialize it to an "error" value
+  readOuterPrj( args.lastdbName + ".prj", alph, mask, volumes );
+  makeScoreMatrix( sm, args, alph );  // before alph.makeCaseInsensitive!
+  args.setDefaults( alph.letters == alph.dna, alph.letters == alph.protein,
+		    sm.maxScore );
+  if( args.maskLowercase < 1 ) alph.makeCaseInsensitive();
+
+  std::ofstream outFileStream;
+  std::ostream& out = cbrc::openOut( args.outFile, outFileStream );
+  writeHeader( out, args, sm );
+  cbrc::MultiSequence query;
+  query.initForAppending( mask.maxOffset );
+  alph.tr( query.seq.begin(), query.seq.end() );
+
+  for( char** i = argv + args.inputStart; i < argv + argc; ++i ){
+    LOG( "reading " << *i << "..." );
+    std::ifstream inFileStream;
+    std::istream& in = cbrc::openIn( *i, inFileStream );
+
+    while( appendFromFasta( query, args, alph, in ) ){
+      if( !query.isFinished() ){
+	scanAllVolumes( query, args, alph, mask, sm, volumes, out );
+	query.reinitForAppending();
+      }
+    }
+  }
+
+  if( query.finishedSequences() > 0 ){
+    scanAllVolumes( query, args, alph, mask, sm, volumes, out );
+  }
+
+  out << "# CPU time: " << (clock() - startTime + 0.0) / CLOCKS_PER_SEC
+      << " seconds\n";
+
+  return EXIT_SUCCESS;
+}
+catch( const std::exception& e ) {
+  std::cerr << "lastal: " << e.what() << '\n';
+  return EXIT_FAILURE;
+}

Added: trunk/packages/last-align/branches/upstream/current/src/lastdb.cc
===================================================================
--- trunk/packages/last-align/branches/upstream/current/src/lastdb.cc	                        (rev 0)
+++ trunk/packages/last-align/branches/upstream/current/src/lastdb.cc	2008-09-11 06:55:41 UTC (rev 2475)
@@ -0,0 +1,167 @@
+// Copyright 2008 Martin C. Frith
+
+// Read fasta-format sequences; construct a suffix array of them; and
+// write the results to files.
+
+#include "LastdbArguments.hh"
+#include "SuffixArray.hh"
+#include "Alphabet.hh"
+#include "MultiSequence.hh"
+#include "PeriodicSpacedSeed.hh"
+#include "io.hh"
+#include "stringify.hh"
+#include <stdexcept>
+#include <fstream>
+#include <iostream>
+
+#define LOG(x) if( args.verbosity > 0 ) std::cerr << "lastdb: " << x << '\n'
+
+typedef unsigned indexT;
+
+// Set up an alphabet (e.g. DNA or protein), based on the user options
+void makeAlphabet( cbrc::Alphabet& alph, const cbrc::LastdbArguments& args ){
+  if( !args.userAlphabet.empty() )  alph.fromString( args.userAlphabet );
+  else if( args.isProtein )         alph.fromString( alph.protein );
+  else                              alph.fromString( alph.dna );
+  if( !args.isCaseSensitive )       alph.makeCaseInsensitive();
+}
+
+// Find the maximum bucket depth, such that the total number of
+// buckets in all layers is at most 1/4 the number of indices
+indexT defaultBucketDepth( unsigned alphSize, indexT indexNum ){
+  indexT maxBuckets = indexNum / 4;
+  indexT layerBuckets = (alphSize + 1);  // buckets in outermost layer
+  indexT totalBuckets = layerBuckets;
+  if( totalBuckets > maxBuckets ) return 0;  // no buckets at all!
+
+  for( indexT depth = 1; /* noop */; ++depth ){
+    if( layerBuckets > maxBuckets / alphSize ) return depth;
+    layerBuckets *= alphSize;  // number of buckets in next layer
+    if( totalBuckets > maxBuckets - layerBuckets ) return depth;
+    totalBuckets += layerBuckets;
+  }
+}
+
+// Write the .prj file for the whole database
+void writeOuterPrj( const std::string& fileName, const cbrc::Alphabet& alph,
+		    const cbrc::PeriodicSpacedSeed& mask, unsigned volumes ){
+  std::ofstream f( fileName.c_str() );
+  if( !f ) throw std::runtime_error("can't open file: " + fileName );
+  f << "version=" <<
+#include "version.hh"
+    << '\n';
+  f << "alphabet=" << alph << '\n';
+  f << "spacedseed=" << mask << '\n';
+  f << "volumes=" << volumes << '\n';
+  if( !f ) throw std::runtime_error("can't write file: " + fileName);
+}
+
+// Write a per-volume .prj file, with info about a database volume
+void writeInnerPrj( const std::string& fileName,
+		    const cbrc::MultiSequence& multi,
+		    const cbrc::SuffixArray& sa ){
+  std::ofstream f( fileName.c_str() );
+  if( !f ) throw std::runtime_error("can't open file: " + fileName );
+  f << "totallength=" << multi.ends.back() << '\n';
+  f << "specialcharacters=" << multi.ends.back() - sa.index.size() << '\n';
+  f << "numofsequences=" << multi.finishedSequences() << '\n';
+  f << "prefixlength=" << sa.bucketNest.size() << '\n';
+  if( !f ) throw std::runtime_error("can't write file: " + fileName);
+}
+
+// Make one database volume, from one batch of sequences
+void makeVolume( cbrc::MultiSequence& multi, cbrc::SuffixArray& sa,
+		 const cbrc::LastdbArguments& args, const cbrc::Alphabet& alph,
+		 unsigned volumeNumber ){
+  std::string baseName = args.lastdbName + cbrc::stringify(volumeNumber);
+
+  LOG( "writing tis, des, ssp, sds..." );
+  multi.toFiles( baseName );
+
+  LOG( "recoding..." );
+  alph.mergeImproperCodes( multi.seq.begin(), multi.seq.end() );
+
+  LOG( "sorting..." );
+  sa.sortIndex();
+
+  LOG( "bucketing..." );
+  sa.makeBuckets( args.bucketDepth == -1u
+		  ? defaultBucketDepth( alph.size, sa.index.size() )
+		  : args.bucketDepth );
+
+  LOG( "writing suf, bck..." );
+  sa.toFiles( baseName );
+
+  LOG( "writing prj..." );
+  writeInnerPrj( baseName + ".prj", multi, sa );
+
+  LOG( "done!" );
+}
+
+// Read the next sequence, adding it to the MultiSequence and the SuffixArray
+std::istream&
+appendFromFasta( cbrc::MultiSequence& multi, cbrc::SuffixArray& sa,
+		 const cbrc::LastdbArguments& args, const cbrc::Alphabet& alph,
+		 std::istream& in ){
+  std::size_t maxSeqBytes = args.volumeSize - sa.indexBytes();
+  if( multi.finishedSequences() == 0 ) maxSeqBytes = std::size_t(-1);
+
+  indexT oldSeqSize = multi.seq.size();
+
+  multi.appendFromFasta( in, maxSeqBytes );
+
+  // encode the newly-read sequence
+  alph.tr( multi.seq.begin() + oldSeqSize, multi.seq.end() );
+
+  if( in && multi.isFinished() ){
+    std::size_t maxIndexBytes = args.volumeSize - multi.seq.size();
+    if( multi.finishedSequences() == 1 ) maxIndexBytes = std::size_t(-1);
+
+    if( !sa.makeIndex( *(multi.ends.end() - 2), *(multi.ends.end() - 1),
+		       args.indexStep, maxIndexBytes ) ){
+      multi.unfinish();
+    }
+  }
+
+  return in;
+}
+
+int main( int argc, char** argv )
+try{
+  cbrc::LastdbArguments args( argc, argv );
+  cbrc::Alphabet alph;
+  cbrc::PeriodicSpacedSeed mask;
+  cbrc::MultiSequence multi;
+  makeAlphabet( alph, args );
+  mask.fromString( args.maskPattern );
+  multi.initForAppending( mask.maxOffset );
+  alph.tr( multi.seq.begin(), multi.seq.end() );
+  cbrc::SuffixArray sa( multi.seq, mask.offsets, alph.size );
+  unsigned volumeNumber = 0;
+
+  for( char** i = argv + args.inputStart; i < argv + argc; ++i ){
+    LOG( "reading " << *i << "..." );
+    std::ifstream inFileStream;
+    std::istream& in = cbrc::openIn( *i, inFileStream );
+
+    while( appendFromFasta( multi, sa, args, alph, in ) ){
+      if( !multi.isFinished() ){
+	makeVolume( multi, sa, args, alph, volumeNumber++ );
+	sa.clear();
+	multi.reinitForAppending();
+      }
+    }
+  }
+
+  if( multi.finishedSequences() > 0 ){
+    makeVolume( multi, sa, args, alph, volumeNumber++ );
+  }
+
+  writeOuterPrj( args.lastdbName + ".prj", alph, mask, volumeNumber );
+
+  return EXIT_SUCCESS;
+}
+catch( const std::exception& e ) {
+  std::cerr << "lastdb: " << e.what() << '\n';
+  return EXIT_FAILURE;
+}