[med-svn] r41 - in trunk/packages: . probcons probcons/branches probcons/branches/upstream probcons/branches/upstream/current

Sun May 7 14:55:29 UTC 2006

Author: charles-guest
Date: 2006-05-07 14:55:27 +0000 (Sun, 07 May 2006)
New Revision: 41

Added:
   trunk/packages/probcons/
   trunk/packages/probcons/branches/
   trunk/packages/probcons/branches/upstream/
   trunk/packages/probcons/branches/upstream/current/
   trunk/packages/probcons/branches/upstream/current/CompareToRef.cc
   trunk/packages/probcons/branches/upstream/current/Defaults.h
   trunk/packages/probcons/branches/upstream/current/EvolutionaryTree.h
   trunk/packages/probcons/branches/upstream/current/FileBuffer.h
   trunk/packages/probcons/branches/upstream/current/FixRef.cc
   trunk/packages/probcons/branches/upstream/current/Main.cc
   trunk/packages/probcons/branches/upstream/current/MakeGnuPlot.cc
   trunk/packages/probcons/branches/upstream/current/Makefile
   trunk/packages/probcons/branches/upstream/current/MultiSequence.h
   trunk/packages/probcons/branches/upstream/current/ProbabilisticModel.h
   trunk/packages/probcons/branches/upstream/current/ProjectPairwise.cc
   trunk/packages/probcons/branches/upstream/current/README
   trunk/packages/probcons/branches/upstream/current/SafeVector.h
   trunk/packages/probcons/branches/upstream/current/ScoreType.h
   trunk/packages/probcons/branches/upstream/current/Sequence.h
   trunk/packages/probcons/branches/upstream/current/SparseMatrix.h
   trunk/packages/probcons/tags/
Log:
[svn-inject] Installing original source of probcons

Added: trunk/packages/probcons/branches/upstream/current/CompareToRef.cc
===================================================================

--- trunk/packages/probcons/branches/upstream/current/CompareToRef.cc	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/CompareToRef.cc	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,348 @@
+/////////////////////////////////////////////////////////////////
+// CompareToRef.cc
+//
+// Program for scoring alignments according to the SUM-OF-PAIRS
+// or COLUMN score.
+/////////////////////////////////////////////////////////////////
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <limits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+
+const char CORE_BLOCK = 'h';
+typedef pair<int,int> PII;
+bool useCoreBlocks = false;
+bool useColScore = false;
+bool useCaps = false;
+bool useBaliAnnot = false;
+bool makeAnnot = false;
+
+/////////////////////////////////////////////////////////////////
+// Function prototypes
+/////////////////////////////////////////////////////////////////
+
+set<PII> ComputePairs (MultiSequence *align, bool isRef);
+set<VI> ComputeColumns (MultiSequence *align, bool isRef);
+string GetName (string s);
+set<int> coreCols;
+
+set<VI> refCols, testCols;
+set<PII> refPairs, testPairs;
+VI annotation;
+
+/////////////////////////////////////////////////////////////////
+// main()
+//
+// Main program.
+/////////////////////////////////////////////////////////////////
+
+int main (int argc, char **argv){
+
+  // check arguments
+  if (argc < 3){
+    cerr << "Usage: score TEST_ALIGNMENT REFERENCE_ALIGNMENT [BALIBASE_ANNOT_FILE] [-col] [-core] [-caps] [-annot FILENAME]" << endl;
+    exit (1);
+  }
+
+  // try opening file
+  FileBuffer infile (argv[1]);
+
+  MultiSequence *testAlign;
+  if (infile.fail()){
+    cerr << "ERROR: Could not open file '" << argv[1] << "' for reading." << endl;
+    testAlign = NULL;
+  }
+  else {
+    testAlign = new MultiSequence(); assert (testAlign);
+    testAlign->LoadMFA (infile);
+  }
+  infile.close();
+
+  MultiSequence *refAlign = new MultiSequence (string (argv[2])); assert (refAlign);
+  
+  string outFilename = "";
+
+  for (int i = 3; i < argc; i++){
+    if (strcmp (argv[i], "-core") == 0)
+      useCoreBlocks = true;
+    else if (strcmp (argv[i], "-col") == 0)
+      useColScore = true;
+    else if (strcmp (argv[i], "-caps") == 0)
+      useCaps = true;
+    else if (strcmp (argv[i], "-annot") == 0){
+      makeAnnot = true;
+      outFilename = string (argv[++i]);
+    }
+    else { // annotation file
+      useBaliAnnot = true;
+
+      ifstream annotFile (argv[i]);
+      if (annotFile.fail()){
+        cerr << "ERROR: Could not read BAliBASE annotation file." << endl;
+        exit (1);
+      }
+
+      SafeVector<int> *indices = refAlign->GetSequence(0)->GetMapping();
+
+      char buffer[10000];
+      while (annotFile.getline (buffer, 10000)){
+        istringstream ss;
+        ss.str (string (buffer));
+
+        string s;
+
+        if ((ss >> s) && s == string ("BPOS")){
+          while (ss >> s){
+            int begin=-1, end=-1;
+            if (sscanf (s.c_str(), "%d=%d", &begin, &end) == 2){
+              for (int i = (*indices)[begin]; i <= (*indices)[end]; i++)
+                coreCols.insert (i);
+            }
+          }
+        }
+      }
+
+      delete indices;
+
+      annotFile.close();
+    }
+  }
+
+  if (useColScore) makeAnnot = false;
+
+  if (testAlign){
+    for (int i = 0; i < testAlign->GetNumSequences(); i++){
+      bool found = false;
+      
+      for (int j = 0; !found && j < refAlign->GetNumSequences(); j++){
+	if (testAlign->GetSequence(i)->GetHeader() == refAlign->GetSequence(j)->GetHeader())
+	  found = true;
+      }
+      
+      if (!found){
+	testAlign->RemoveSequence (i);
+	i--;
+      }
+    }
+    
+    for (int i = 0; i < refAlign->GetNumSequences(); i++){
+      bool found = false;
+      
+      for (int j = 0; !found && j < testAlign->GetNumSequences(); j++){
+	if (refAlign->GetSequence(i)->GetHeader() == testAlign->GetSequence(j)->GetHeader())
+	  found = true;
+      }
+      
+      if (!found){
+	refAlign->RemoveSequence (i);
+	i--;
+      }
+    }
+    
+    testAlign->SortByHeader();
+    refAlign->SortByHeader();
+  }
+
+  int TP = 0;
+  int TPFN = 0;
+  int TPFP = 0;
+  double FD, FM;
+  if (useColScore){
+    refCols = ComputeColumns (refAlign, true);
+    if (testAlign) testCols = ComputeColumns (testAlign, false);
+    set<VI> colIntersect;
+    insert_iterator<set<VI> > colIntersectIter (colIntersect, colIntersect.begin());
+    set_intersection (testCols.begin(), testCols.end(), refCols.begin(), refCols.end(), colIntersectIter);
+    TP = (int) colIntersect.size();
+    TPFN = (int) refCols.size();
+    if (testAlign) TPFP = (int) testCols.size();
+  }
+  else {
+    refPairs = ComputePairs (refAlign, true);
+    if (testAlign) testPairs = ComputePairs (testAlign, false);
+    set<PII> pairIntersect;
+
+    insert_iterator<set<PII> > pairIntersectIter (pairIntersect, pairIntersect.begin());
+    set_intersection (testPairs.begin(), testPairs.end(), refPairs.begin(), refPairs.end(), pairIntersectIter);
+    TP = (int) pairIntersect.size();
+    TPFN = (int) refPairs.size();
+    if (testAlign) TPFP = (int) testPairs.size();
+  }
+
+  FD = (double) TP / TPFN;
+  FM = (double) TP / TPFP;
+  
+  cout << GetName(string (argv[2])) << " " << TP << " " << TPFN << " " << TPFP << " " << FD << " " << FM << endl;
+
+  if (makeAnnot){
+    ofstream outfile (outFilename.c_str());
+    for (int i = 0; i < (int) annotation.size(); i++){
+      outfile << annotation[i] << endl;
+    }
+    outfile.close();
+  }
+
+  if (testAlign) delete testAlign;
+  delete refAlign;
+}
+
+int GetOffset (Sequence *testSeq, Sequence *refSeq){
+  string test = testSeq->GetString();
+  string ref = refSeq->GetString();
+
+  for (int i = 0; i < (int) test.length(); i++) test[i] = toupper(test[i]);
+  for (int i = 0; i < (int) ref.length(); i++) ref[i] = toupper(ref[i]);
+
+  size_t offset = test.find (ref, 0);
+  if (offset == string::npos){
+    cerr << "ERROR: Reference string not found in original sequence!" << endl;
+    cerr << "       test = " << test << endl;
+    cerr << "       ref = " << ref << endl;
+    exit (1);
+  }
+
+  cerr << "Offset found: " << offset << endl;
+
+  return (int) offset;
+}
+
+string GetName (string s){
+
+  size_t index1 = s.rfind ('/');
+  size_t index2 = s.rfind ('.');
+
+  if (index1 == string::npos) index1 = 0; else index1++;
+  if (index2 == string::npos) index2 = s.length();
+
+  if (index2 < index1) index2 = s.length();
+
+  return s.substr (index1, index2 - index1);
+}
+
+bool isCore (char ch, int col){
+  if (ch == '-') return false;
+  if (useBaliAnnot){
+    return coreCols.find (col) != coreCols.end();
+  }
+  if (useCaps){
+    return ch >= 'A' && ch <= 'Z';
+  }
+  return ch == CORE_BLOCK;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputePairs
+//
+// Returns the set of all matching pairs.
+/////////////////////////////////////////////////////////////////
+
+set<PII> ComputePairs (MultiSequence *align, bool isRef){
+  int N = align->GetNumSequences();
+  int L = align->GetSequence(0)->GetLength();
+
+  // retrieve all sequence data pointers
+  SafeVector<SafeVector<char>::iterator> seqs (N);
+  for (int i = 0; i < N; i++){
+    seqs[i] = align->GetSequence(i)->GetDataPtr();
+    assert (align->GetSequence(i)->GetLength() == L);
+  }
+
+  set<PII> ret;
+  VI ctr(N);
+
+  // compute pairs
+  for (int i = 1; i <= L; i++){
+
+    // ctr keeps track of the current position in each sequence
+    for (int j = 0; j < N; j++){
+      ctr[j] += (seqs[j][i] != '-');
+    }
+
+    int good = 0;
+    int ct = 0;
+
+    // check for all matching pairs
+    for (int j = 0; j < N - 1; j++){
+      for (int k = j + 1; k < N; k++){
+	
+	// skip if one of the sequences is gapped
+	if (seqs[j][i] == '-' || seqs[k][i] == '-') continue;
+
+	// check for core blocks in the reference sequence
+	if (isRef && useCoreBlocks)
+	  if (!isCore (seqs[j][i], i) || !isCore (seqs[k][i], i)) continue;
+      
+	// if all ok, then add pair to list of pairs
+	pair<int,int> p (10000 * j + ctr[j], 10000 * k + ctr[k]);
+
+	// if we're making an annotation, compute annotation statistics
+	if (makeAnnot && !isRef){
+	  ct++;
+	  if (refPairs.find (p) != refPairs.end()) good++;
+	}
+        ret.insert (p);
+      }
+    }
+
+    // build annotation
+    if (makeAnnot && !isRef){
+      annotation.push_back ((ct == 0) ? 0 : 100 * good / ct);
+    }
+
+  }
+
+  return ret;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeColumns
+//
+// Returns the set of all columns.
+/////////////////////////////////////////////////////////////////
+
+set<VI> ComputeColumns (MultiSequence *align,  bool isRef){
+  int N = align->GetNumSequences();
+  int L = align->GetSequence(0)->GetLength();
+
+  // retrieve all sequence data pointers
+  SafeVector<SafeVector<char>::iterator> seqs (N);
+  for (int i = 0; i < N; i++){
+    seqs[i] = align->GetSequence(i)->GetDataPtr();
+  }
+
+  set<VI> ret;
+  VI ctr(N);
+
+  // compute pairs
+  for (int i = 1; i <= L; i++){
+
+    // ctr keeps track of the current position in each sequence
+    for (int j = 0; j < N; j++){
+      ctr[j] += (seqs[j][i] != '-');
+    }
+
+    // add column, pick only positions that are matched
+    SafeVector<int> column (N);
+    bool useThisColumn = !useCoreBlocks;
+
+    for (int j = 0; j < N; j++){
+      if (isCore (seqs[j][i], i)) useThisColumn = true;
+      column[j] = (seqs[j][i] == '-') ? -1 : ctr[j];
+    }
+
+    if (useThisColumn || !isRef)
+      ret.insert (column);
+  }
+
+  return ret;
+}

Added: trunk/packages/probcons/branches/upstream/current/Defaults.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/Defaults.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/Defaults.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,63 @@
+/////////////////////////////////////////////////////////////////
+// Defaults.h
+//
+// Default constants for use in PROBCONS.  The emission
+// probabilities were computed using the program used to build
+// the BLOSUM62 matrix from the BLOCKS 5.0 dataset.  Transition
+// parameters were obtained via unsupervised EM training on the
+// BALIBASE 2.0 benchmark alignment database.
+/////////////////////////////////////////////////////////////////
+
+#ifndef DEFAULTS_H
+#define DEFAULTS_H
+
+#include <string>
+
+using namespace std;
+
+/*
+float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 };
+float gapOpen1Default[] = { 0.1375414133, 0.1375414133 };
+float gapExtend1Default[] = { 0.7832147479, 0.7832147479 };
+*/
+
+float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f };
+float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f };
+float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f };
+
+float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f, 8.615339902e-05f, 0.1591759622f, 0.1591759622 };
+float gapOpen2Default[] = { 0.0119511066f, 0.0119511066f, 0.008008334786f, 0.008008334786 };
+float gapExtend2Default[] = { 0.3965826333f, 0.3965826333f, 0.8988758326f, 0.8988758326 };
+
+string alphabetDefault = "ARNDCQEGHILKMFPSTWYV";
+float emitSingleDefault[20] = {
+  0.07831005f, 0.05246024f, 0.04433257f, 0.05130349f, 0.02189704f,
+  0.03585766f, 0.05615771f, 0.07783433f, 0.02601093f, 0.06511648f,
+  0.09716489f, 0.05877077f, 0.02438117f, 0.04463228f, 0.03940142f,
+  0.05849916f, 0.05115306f, 0.01203523f, 0.03124726f, 0.07343426f
+};
+
+float emitPairsDefault[20][20] = {
+  {0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00145515f, 0.00044701f, 0.00042479f, 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00219102f, 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f, 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00597898f, 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f, 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00114353f, 0.00132105f, 0.00141205f, 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f, 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f, 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f, 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f, 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f, 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f, 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00148878f, 0.00076734f, 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f, 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f, 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f, 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f, 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00230618f, 0.00106268f, 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f, 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f, 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f, 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f, 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f, 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f},
+  {0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f, 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f, 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f, 0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f},
+  {0.00039119f, 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f, 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f, 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f, 0.00024961f, 0.00563625f, 0.0f, 0.0f},
+  {0.00131840f, 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f, 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f, 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f, 0.00094759f, 0.00069226f, 0.00999315f, 0.0f},
+  {0.00533241f, 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f, 0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f, 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f, 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f}
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/EvolutionaryTree.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/EvolutionaryTree.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/EvolutionaryTree.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,176 @@
+/////////////////////////////////////////////////////////////////
+// EvolutionaryTree.h
+//
+// Utilities for reading/writing multiple sequence data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef EVOLUTIONARYTREE_H
+#define EVOLUTIONARYTREE_H
+
+#include <string>
+#include <list>
+#include <stdio.h>
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "Sequence.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////
+// TreeNode
+//
+// The fundamental unit for representing an alignment tree.  The
+// guide tree is represented as a binary tree.
+/////////////////////////////////////////////////////////////////
+
+class TreeNode {
+  int sequenceLabel;                  // sequence label
+  TreeNode *left, *right, *parent;    // pointers to left, right children
+
+  /////////////////////////////////////////////////////////////////
+  // TreeNode::PrintNode()
+  //
+  // Internal routine used to print out the sequence comments
+  // associated with the evolutionary tree, using a hierarchical
+  // parenthesized format.
+  /////////////////////////////////////////////////////////////////
+
+  void PrintNode (ostream &outfile, const MultiSequence *sequences) const {
+
+    // if this is a leaf node, print out the associated sequence comment
+    if (sequenceLabel >= 0)
+      outfile << sequences->GetSequence (sequenceLabel)->GetHeader();
+
+    // otherwise, it must have two children; print out their subtrees recursively
+    else {
+      assert (left);
+      assert (right);
+
+      outfile << "(";
+      left->PrintNode (outfile, sequences);
+      outfile << " ";
+      right->PrintNode (outfile, sequences);
+      outfile << ")";
+    }
+  }
+
+ public:
+
+  /////////////////////////////////////////////////////////////////
+  // TreeNode::TreeNode()
+  //
+  // Constructor for a tree node.  Note that sequenceLabel = -1
+  // implies that the current node is not a leaf in the tree.
+  /////////////////////////////////////////////////////////////////
+
+  TreeNode (int sequenceLabel) : sequenceLabel (sequenceLabel),
+    left (NULL), right (NULL), parent (NULL) {
+    assert (sequenceLabel >= -1);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // TreeNode::~TreeNode()
+  //
+  // Destructor for a tree node.  Recursively deletes all children.
+  /////////////////////////////////////////////////////////////////
+
+  ~TreeNode (){
+    if (left){ delete left; left = NULL; }
+    if (right){ delete right; right = NULL; }
+    parent = NULL;
+  }
+
+
+  // getters
+  int GetSequenceLabel () const { return sequenceLabel; }
+  TreeNode *GetLeftChild () const { return left; }
+  TreeNode *GetRightChild () const { return right; }
+  TreeNode *GetParent () const { return parent; }
+
+  // setters
+  void SetSequenceLabel (int sequenceLabel){ this->sequenceLabel = sequenceLabel; assert (sequenceLabel >= -1); }
+  void SetLeftChild (TreeNode *left){ this->left = left; }
+  void SetRightChild (TreeNode *right){ this->right = right; }
+  void SetParent (TreeNode *parent){ this->parent = parent; }
+
+  /////////////////////////////////////////////////////////////////
+  // TreeNode::ComputeTree()
+  //
+  // Routine used to compute an evolutionary tree based on the
+  // given distance matrix.  We assume the distance matrix has the
+  // form, distMatrix[i][j] = expected accuracy of aligning i with j.
+  /////////////////////////////////////////////////////////////////
+
+  static TreeNode *ComputeTree (const VVF &distMatrix){
+
+    int numSeqs = distMatrix.size();                 // number of sequences in distance matrix
+    VVF distances (numSeqs, VF (numSeqs));           // a copy of the distance matrix
+    SafeVector<TreeNode *> nodes (numSeqs, NULL);    // list of nodes for each sequence
+    SafeVector<int> valid (numSeqs, 1);              // valid[i] tells whether or not the ith
+                                                     // nodes in the distances and nodes array
+                                                     // are valid
+
+    // initialization: make a copy of the distance matrix
+    for (int i = 0; i < numSeqs; i++)
+      for (int j = 0; j < numSeqs; j++)
+        distances[i][j] = distMatrix[i][j];
+
+    // initialization: create all the leaf nodes
+    for (int i = 0; i < numSeqs; i++){
+      nodes[i] = new TreeNode (i);
+      assert (nodes[i]);
+    }
+
+    // repeat until only a single node left
+    for (int numNodesLeft = numSeqs; numNodesLeft > 1; numNodesLeft--){
+      float bestProb = -1;
+      pair<int,int> bestPair;
+
+      // find the closest pair
+      for (int i = 0; i < numSeqs; i++) if (valid[i]){
+        for (int j = i+1; j < numSeqs; j++) if (valid[j]){
+          if (distances[i][j] > bestProb){
+            bestProb = distances[i][j];
+            bestPair = make_pair(i, j);
+          }
+        }
+      }
+
+      // merge the closest pair
+      TreeNode *newParent = new TreeNode (-1);
+      newParent->SetLeftChild (nodes[bestPair.first]);
+      newParent->SetRightChild (nodes[bestPair.second]);
+      nodes[bestPair.first]->SetParent (newParent);
+      nodes[bestPair.second]->SetParent (newParent);
+      nodes[bestPair.first] = newParent;
+      nodes[bestPair.second] = NULL;
+
+      // now update the distance matrix
+      for (int i = 0; i < numSeqs; i++) if (valid[i]){
+        distances[bestPair.first][i] = distances[i][bestPair.first]
+          = (distances[i][bestPair.first] + distances[i][bestPair.second]) * bestProb / 2;
+      }
+
+      // finally, mark the second node entry as no longer valid
+      valid[bestPair.second] = 0;
+    }
+
+    assert (nodes[0]);
+    return nodes[0];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // TreeNode::Print()
+  //
+  // Print out the subtree associated with this node in a
+  // parenthesized representation.
+  /////////////////////////////////////////////////////////////////
+
+  void Print (ostream &outfile, const MultiSequence *sequences) const {
+    outfile << "Alignment tree: ";
+    PrintNode (outfile, sequences);
+    outfile << endl;
+  }
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/FileBuffer.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/FileBuffer.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/FileBuffer.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,104 @@
+/////////////////////////////////////////////////////////////////
+// FileBuffer.h
+//
+// Buffered file reading.
+/////////////////////////////////////////////////////////////////
+
+
+#ifndef FILEBUFFER_H
+#define FILEBUFFER_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+
+using namespace std;
+
+const int BufferSize = 1000;
+
+/////////////////////////////////////////////////////////////////
+// FileBuffer
+//
+// Class for buffering file reading.
+/////////////////////////////////////////////////////////////////
+
+class FileBuffer {
+  ifstream file;
+  char buffer[BufferSize];
+  int currPos;
+  int size;
+  bool isEOF;
+  bool isValid;
+  bool canUnget;
+
+ public:
+
+  // Some common routines
+
+  FileBuffer (const char *filename) : file (filename), currPos (0), size (0), isEOF (false), isValid (!file.fail()), canUnget (false){}
+  ~FileBuffer (){ close(); }
+  bool fail () const { return !isValid; }
+  bool eof () const { return (!isValid || isEOF); }
+  void close(){ file.close(); isValid = false; }
+
+  /////////////////////////////////////////////////////////////////
+  // FileBuffer::Get()
+  //
+  // Retrieve a character from the file buffer.  Returns true if
+  // and only if a character is read.
+  /////////////////////////////////////////////////////////////////
+
+  bool Get (char &ch){
+
+    // check to make sure that there's more stuff in the file
+    if (!isValid || isEOF) return false;
+
+    // if the buffer is empty, it's time to reload it
+    if (currPos == size){
+      file.read (buffer, BufferSize);
+      size = file.gcount();
+      isEOF = (size == 0);
+      currPos = 0;
+      if (isEOF) return false;
+    }
+
+    // store the read character
+    ch = buffer[currPos++];
+    canUnget = true;
+    return true;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // FileBuffer::UnGet()
+  //
+  // Unretrieve the most recently read character from the file
+  // buffer.  Note that this allows only a one-level undo.
+  /////////////////////////////////////////////////////////////////
+
+  void UnGet (){
+    assert (canUnget);
+    assert (isValid);
+    assert (currPos > 0);
+    currPos--;
+    assert (currPos < size);
+    isEOF = false;
+    canUnget = false;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // FileBuffer::GetLine()
+  //
+  // Retrieve characters of text until a newline character is
+  // encountered.  Terminates properly on end-of-file condition.
+  /////////////////////////////////////////////////////////////////
+
+  void GetLine (string &s){
+    char ch;
+    s = "";
+    while (Get (ch) && ch != '\n')
+      s += ch;
+  }
+
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/FixRef.cc
===================================================================
--- trunk/packages/probcons/branches/upstream/current/FixRef.cc	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/FixRef.cc	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,1000 @@
+/////////////////////////////////////////////////////////////////
+// Main.cc
+/////////////////////////////////////////////////////////////////
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "Defaults.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "EvolutionaryTree.h"
+#include "SparseMatrix.h"
+#include <string>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+
+string matrixFilename = "";
+string parametersInputFilename = "";
+string parametersOutputFilename = "no training";
+
+bool enableTraining = false;
+bool enableVerbose = false;
+int numConsistencyReps = 2;
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+float gapOpenPenalty = 0;
+float gapContinuePenalty = 0;
+VF initDistrib (NumMatrixTypes);
+VF gapOpen (2*NumInsertStates);
+VF gapExtend (2*NumInsertStates);
+SafeVector<char> alphabet;
+VVF emitPairs;
+VF emitSingle;
+
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+/////////////////////////////////////////////////////////////////
+// Function prototypes
+/////////////////////////////////////////////////////////////////
+
+void PrintHeading();
+void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
+                      const VF &gapExtend, const char *filename);
+MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model);
+SafeVector<string> ParseParams (int argc, char **argv);
+void ReadParameters ();
+MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
+                                      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                      const ProbabilisticModel &model);
+MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
+                                const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                const ProbabilisticModel &model);
+void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
+void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model, MultiSequence* &alignment);
+//float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs);
+
+/////////////////////////////////////////////////////////////////
+// main()
+//
+// Calls all initialization routines and runs the PROBCONS
+// aligner.
+/////////////////////////////////////////////////////////////////
+
+int main (int argc, char **argv){
+
+  if (argc != 3){
+    cerr << "Usage: FixRef inputfile reffile" << endl;
+    exit (1);
+  }
+
+  string inputFilename = string (argv[1]);
+  string refFilename = string (argv[2]);
+
+  ReadParameters();
+
+  // build new model for aligning
+  ProbabilisticModel model (initDistrib, gapOpen, gapExtend, 
+                            alphabet, emitPairs, emitSingle);
+
+  MultiSequence *inputSeq = new MultiSequence(); inputSeq->LoadMFA (inputFilename);
+  MultiSequence *refSeq = new MultiSequence(); refSeq->LoadMFA (refFilename);
+
+  SafeVector<char> *ali = new SafeVector<char>;
+
+  if (refSeq->GetNumSequences() != 2){
+    cerr << "ERROR: Expected two sequences in reference alignment." << endl;
+    exit (1);
+  }
+  set<int> s; s.insert (0);
+  MultiSequence *ref1 = refSeq->Project (s);
+  s.clear(); s.insert (1);
+  MultiSequence *ref2 = refSeq->Project (s);
+
+  for (int i = 0; i < inputSeq->GetNumSequences(); i++){
+    if (inputSeq->GetSequence(i)->GetHeader() == ref1->GetSequence(0)->GetHeader()){
+      ref1->AddSequence (inputSeq->GetSequence(i)->Clone());
+    }
+    if (inputSeq->GetSequence(i)->GetHeader() == ref2->GetSequence(0)->GetHeader())
+      ref2->AddSequence (inputSeq->GetSequence(i)->Clone());
+  }
+  if (ref1->GetNumSequences() != 2){
+    cerr << "ERROR: Expected two sequences in reference1 alignment." << endl;
+    exit (1);
+  }
+  if (ref2->GetNumSequences() != 2){
+    cerr << "ERROR: Expected two sequences in reference2 alignment." << endl;
+    exit (1);
+  }
+
+  ref1->GetSequence(0)->SetLabel(0);
+  ref2->GetSequence(0)->SetLabel(0);
+  ref1->GetSequence(1)->SetLabel(1);
+  ref2->GetSequence(1)->SetLabel(1);
+
+  //  cerr << "Aligning..." << endl;
+
+  // now, we can perform the alignments and write them out
+  MultiSequence *alignment1 = DoAlign (ref1,
+                                       ProbabilisticModel (initDistrib, gapOpen, gapExtend, 
+                                                           alphabet, emitPairs, emitSingle));
+
+  //cerr << "Aligning second..." << endl;
+  MultiSequence *alignment2 = DoAlign (ref2,
+                                       ProbabilisticModel (initDistrib, gapOpen, gapExtend, 
+                                                           alphabet, emitPairs, emitSingle));
+
+  SafeVector<char>::iterator iter1 = alignment1->GetSequence(0)->GetDataPtr();
+  SafeVector<char>::iterator iter2 = alignment1->GetSequence(1)->GetDataPtr();
+  for (int i = 1; i <= alignment1->GetSequence(0)->GetLength(); i++){
+    if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
+    if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
+  }
+  iter1 = alignment2->GetSequence(0)->GetDataPtr();
+  iter2 = alignment2->GetSequence(1)->GetDataPtr();
+  for (int i = 1; i <= alignment2->GetSequence(0)->GetLength(); i++){
+    if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
+    if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
+  }
+  //alignment1->WriteMFA (cout);
+  //alignment2->WriteMFA (cout);
+
+  int a1 = 0, a = 0;
+  int b1 = 0, b = 0;
+
+  for (int i = 1; i <= refSeq->GetSequence(0)->GetLength(); i++){
+
+    // catch up in filler sequences
+    if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
+      while (true){
+        a++;
+        if (alignment1->GetSequence(0)->GetPosition(a) != '-') break;
+        ali->push_back ('X');
+      }
+    }
+    if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
+      while (true){
+        b++;
+        if (alignment2->GetSequence(0)->GetPosition(b) != '-') break;
+        ali->push_back ('Y');
+      }
+    }
+
+    if (refSeq->GetSequence(0)->GetPosition(i) != '-' &&
+        refSeq->GetSequence(1)->GetPosition(i) != '-'){
+      //cerr << "M: " << refSeq->GetSequence(0)->GetPosition(i) << refSeq->GetSequence(1)->GetPosition(i) << endl;
+      ali->push_back ('B');
+    }
+    else if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
+      //cerr << "X" << endl;
+      ali->push_back ('X');
+    }
+    else if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
+      //cerr << "Y" << endl;
+      ali->push_back ('Y');
+    }
+  }
+
+  while (a < alignment1->GetSequence(0)->GetLength()){
+    a++;
+    ali->push_back ('X');
+    if (alignment1->GetSequence(0)->GetPosition(a) != '-') a1++;
+  }
+  while (b < alignment2->GetSequence(0)->GetLength()){
+    b++;
+    ali->push_back ('Y');
+    if (alignment2->GetSequence(0)->GetPosition(b) != '-') b1++;
+  }
+
+  Sequence *seq1 = alignment1->GetSequence(1)->AddGaps (ali, 'X');
+  Sequence *seq2 = alignment2->GetSequence(1)->AddGaps (ali, 'Y');
+  seq1->WriteMFA (cout, 60);
+  seq2->WriteMFA (cout, 60);
+
+  delete seq1;
+  delete seq2;
+
+  delete ali;
+  delete alignment1;
+  delete alignment2;
+  delete inputSeq;
+  delete refSeq;
+}
+
+/////////////////////////////////////////////////////////////////
+// PrintHeading()
+//
+// Prints heading for PROBCONS program.
+/////////////////////////////////////////////////////////////////
+
+void PrintHeading (){
+  cerr << endl
+       << "PROBCONS version 1.02 - align multiple protein sequences and print to standard output" << endl
+       << "Copyright (C) 2004  Chuong Ba Do" << endl
+       << endl;
+}
+
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints PROBCONS parameters to STDERR.  If a filename is
+// specified, then the parameters are also written to the file.
+/////////////////////////////////////////////////////////////////
+
+void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
+                      const VF &gapExtend, const char *filename){
+
+  // print parameters to the screen
+  cerr << message << endl
+       << "    initDistrib[] = { ";
+  for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
+  cerr << "}" << endl
+       << "        gapOpen[] = { ";
+  for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
+  cerr << "}" << endl
+       << "      gapExtend[] = { ";
+  for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
+  cerr << "}" << endl
+       << endl;
+
+  // if a file name is specified
+  if (filename){
+
+    // attempt to open the file for writing
+    FILE *file = fopen (filename, "w");
+    if (!file){
+      cerr << "ERROR: Unable to write parameter file: " << filename << endl;
+      exit (1);
+    }
+
+    // if successful, then write the parameters to the file
+    for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
+    for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
+    for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
+    fclose (file);
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// DoAlign()
+//
+// First computes all pairwise posterior probability matrices.
+// Then, computes new parameters if training, or a final
+// alignment, otherwise.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model){
+
+  assert (sequences);
+
+  const int numSeqs = sequences->GetNumSequences();
+  VVF distances (numSeqs, VF (numSeqs, 0));
+  SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+  // do all pairwise alignments
+  for (int a = 0; a < numSeqs-1; a++){
+    for (int b = a+1; b < numSeqs; b++){
+      Sequence *seq1 = sequences->GetSequence (a);
+      Sequence *seq2 = sequences->GetSequence (b);
+
+      // verbose output
+      if (enableVerbose)
+        cerr << "(" << a+1 << ") " << seq1->GetHeader() << " vs. "
+             << "(" << b+1 << ") " << seq2->GetHeader() << ": ";
+
+      // compute forward and backward probabilities
+      VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
+      VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
+
+      // if we are training, then we'll simply want to compute the
+      // expected counts for each region within the matrix separately;
+      // otherwise, we'll need to put all of the regions together and
+      // assemble a posterior probability match matrix
+
+      // compute posterior probability matrix
+      VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
+
+      // compute "expected accuracy" distance for evolutionary tree computation
+      pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
+                                                                          seq2->GetLength(),
+                                                                          *posterior);
+
+      float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
+
+      if (enableVerbose)
+        cerr << setprecision (10) << distance << endl;
+
+      // save posterior probability matrices in sparse format
+      distances[a][b] = distances[b][a] = distance;
+      sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
+      sparseMatrices[b][a] = sparseMatrices[a][b]->ComputeTranspose();
+
+      delete alignment.first;
+      delete posterior;
+
+      delete forward;
+      delete backward;
+    }
+  }
+
+  if (!enableTraining){
+    if (enableVerbose)
+      cerr << endl;
+
+    // now, perform the consistency transformation the desired number of times
+    for (int i = 0; i < numConsistencyReps; i++)
+      DoRelaxation (sequences, sparseMatrices);
+
+    // compute the evolutionary tree
+    TreeNode *tree = TreeNode::ComputeTree (distances);
+
+    //tree->Print (cerr, sequences);
+    //cerr << endl;
+
+    // make the final alignment
+    MultiSequence *alignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
+    delete tree;
+
+    return alignment;
+  }
+
+  return NULL;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Attempts to parse an integer from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger (char *data, int *val){
+  char *endPtr;
+  long int retVal;
+
+  assert (val);
+
+  errno = 0;
+  retVal = strtol (data, &endPtr, 0);
+  if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
+  if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
+  if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
+  *val = (int) retVal;
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Attempts to parse a float from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat (char *data, float *val){
+  char *endPtr;
+  double retVal;
+
+  assert (val);
+
+  errno = 0;
+  retVal = strtod (data, &endPtr);
+  if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
+  if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
+  *val = (float) retVal;
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parse all command-line options.
+/////////////////////////////////////////////////////////////////
+
+SafeVector<string> ParseParams (int argc, char **argv){
+
+  if (argc < 2){
+
+    cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
+         << "you are welcome to redistribute it under certain conditions.  See the" << endl
+         << "file COPYING.txt for details." << endl
+         << endl
+         << "Usage:" << endl
+         << "       probcons [OPTION]... [MFAFILE]..." << endl
+         << endl
+         << "Description:" << endl
+         << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
+         << endl
+         << "       -t, --train FILENAME" << endl
+         << "              compute EM transition probabilities, store in FILENAME (default: "
+         << parametersOutputFilename << ")" << endl
+         << endl
+         << "       -m, --matrixfile FILENAME" << endl
+         << "              read transition parameters from FILENAME (default: "
+         << matrixFilename << ")" << endl
+         << endl
+         << "       -p, --paramfile FILENAME" << endl
+         << "              read scoring matrix probabilities from FILENAME (default: "
+         << parametersInputFilename << ")" << endl
+         << endl
+         << "       -c, --consistency REPS" << endl
+         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
+         << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
+         << endl
+         << "       -ir, --iterative-refinement REPS" << endl
+         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
+         << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
+         << endl
+         << "       -pre, --pre-training REPS" << endl
+         << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
+         << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
+         << endl
+         << "       -go, --gap-open VALUE" << endl
+         << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
+         << endl
+         << "       -ge, --gap-extension VALUE" << endl
+         << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
+         << endl
+         << "       -v, --verbose" << endl
+         << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
+         << endl;
+
+    exit (1);
+  }
+
+  SafeVector<string> sequenceNames;
+  int tempInt;
+  float tempFloat;
+
+  for (int i = 1; i < argc; i++){
+    if (argv[i][0] == '-'){
+
+      // training
+      if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
+        enableTraining = true;
+        if (i < argc - 1)
+          parametersOutputFilename = string (argv[++i]);
+        else {
+          cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // scoring matrix file
+      else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--matrixfile")){
+        if (i < argc - 1)
+          matrixFilename = string (argv[++i]);
+        else {
+          cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // transition/initial distribution parameter file
+      else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
+        if (i < argc - 1)
+          parametersInputFilename = string (argv[++i]);
+        else {
+          cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of consistency transformations
+      else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numConsistencyReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of randomized partitioning iterative refinement passes
+      else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numIterativeRefinementReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of EM pre-training rounds
+      else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numPreTrainingReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // gap open penalty
+      else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
+        if (i < argc - 1){
+          if (!GetFloat (argv[++i], &tempFloat)){
+            cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempFloat > 0){
+              cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
+              exit (1);
+            }
+            else
+              gapOpenPenalty = tempFloat;
+          }
+        }
+        else {
+          cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // gap extension penalty
+      else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
+        if (i < argc - 1){
+          if (!GetFloat (argv[++i], &tempFloat)){
+            cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempFloat > 0){
+              cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
+              exit (1);
+            }
+            else
+              gapContinuePenalty = tempFloat;
+          }
+        }
+        else {
+          cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // verbose reporting
+      else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
+        enableVerbose = true;
+      }
+
+      // bad arguments
+      else {
+        cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
+        exit (1);
+      }
+    }
+    else {
+      sequenceNames.push_back (string (argv[i]));
+    }
+  }
+
+  return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Read initial distribution, transition, and emission
+// parameters from a file.
+/////////////////////////////////////////////////////////////////
+
+void ReadParameters (){
+
+  ifstream data;
+
+  // read initial state distribution and transition parameters
+  if (parametersInputFilename == string ("")){
+    if (NumInsertStates == 1){
+      for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
+    }
+    else if (NumInsertStates == 2){
+      for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
+    }
+    else {
+      cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
+           << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
+      exit (1);
+    }
+  }
+  else {
+    data.open (parametersInputFilename.c_str());
+    if (data.fail()){
+      cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
+      exit (1);
+    }
+    for (int i = 0; i < NumMatrixTypes; i++) data >> initDistrib[i];
+    for (int i = 0; i < 2*NumInsertStates; i++) data >> gapOpen[i];
+    for (int i = 0; i < 2*NumInsertStates; i++) data >> gapExtend[i];
+    data.close();
+  }
+
+  // read emission parameters
+  int alphabetSize = 20;
+
+  // allocate memory
+  alphabet = SafeVector<char>(alphabetSize);
+  emitPairs = VVF (alphabetSize, VF (alphabetSize, 0));
+  emitSingle = VF (alphabetSize);
+
+  if (matrixFilename == string ("")){
+    for (int i = 0; i < alphabetSize; i++) alphabet[i] = alphabetDefault[i];
+    for (int i = 0; i < alphabetSize; i++){
+      emitSingle[i] = emitSingleDefault[i];
+      for (int j = 0; j <= i; j++){
+        emitPairs[i][j] = emitPairs[j][i] = (i == j);
+      }
+    }
+  }
+  else {
+    data.open (matrixFilename.c_str());
+    if (data.fail()){
+      cerr << "ERROR: Unable to read scoring matrix file: " << matrixFilename << endl;
+      exit (1);
+    }
+
+    for (int i = 0; i < alphabetSize; i++) data >> alphabet[i];
+    for (int i = 0; i < alphabetSize; i++){
+      for (int j = 0; j <= i; j++){
+        data >> emitPairs[i][j];
+        emitPairs[j][i] = emitPairs[i][j];
+      }
+    }
+    for (int i = 0; i < alphabetSize; i++){
+      char ch;
+      data >> ch;
+      assert (ch == alphabet[i]);
+    }
+    for (int i = 0; i < alphabetSize; i++) data >> emitSingle[i];
+    data.close();
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Process the tree recursively.  Returns the aligned sequences
+// corresponding to a node or leaf of the tree.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
+                            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model){
+  MultiSequence *result;
+
+  // check if this is a node of the alignment tree
+  if (tree->GetSequenceLabel() == -1){
+    MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
+    MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
+
+    assert (alignLeft);
+    assert (alignRight);
+
+    result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
+    assert (result);
+
+    delete alignLeft;
+    delete alignRight;
+  }
+
+  // otherwise, this is a leaf of the alignment tree
+  else {
+    result = new MultiSequence(); assert (result);
+    result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
+  }
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Compute the final alignment by calling ProcessTree(), then
+// performing iterative refinement as needed.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
+                                      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                      const ProbabilisticModel &model){
+
+  MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
+
+  // iterative refinement
+  for (int i = 0; i < numIterativeRefinementReps; i++)
+    DoIterativeRefinement (sparseMatrices, model, alignment);
+
+  cerr << endl;
+
+  // return final alignment
+  return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Returns the alignment of two MultiSequence objects.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
+                                const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                const ProbabilisticModel &model){
+
+  // print some info about the alignment
+  if (enableVerbose){
+    for (int i = 0; i < align1->GetNumSequences(); i++)
+      cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
+    cerr << "] vs. ";
+    for (int i = 0; i < align2->GetNumSequences(); i++)
+      cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
+    cerr << "]: ";
+  }
+
+  VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices);
+  pair<SafeVector<char> *, float> alignment;
+
+  // choose the alignment routine depending on the "cosmetic" gap penalties used
+  if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
+    alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
+  else
+    alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
+                                                        *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
+                                                        gapOpenPenalty, gapContinuePenalty);
+
+  delete posterior;
+
+  if (enableVerbose){
+
+    // compute total length of sequences
+    int totLength = 0;
+    for (int i = 0; i < align1->GetNumSequences(); i++)
+      for (int j = 0; j < align2->GetNumSequences(); j++)
+        totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
+
+    // give an "accuracy" measure for the alignment
+    cerr << alignment.second / totLength << endl;
+  }
+
+  // now build final alignment
+  MultiSequence *result = new MultiSequence();
+  for (int i = 0; i < align1->GetNumSequences(); i++)
+    result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
+  for (int i = 0; i < align2->GetNumSequences(); i++)
+    result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
+  result->SortByLabel();
+
+  // free temporary alignment
+  delete alignment.first;
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the consistency transformation.  The
+// formula used is:
+//                     1
+//    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
+//                    |S| z in S  k
+//
+// where S = {x, y, all other sequences...}
+//
+/////////////////////////////////////////////////////////////////
+
+void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
+  const int numSeqs = sequences->GetNumSequences();
+
+  SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+  // for every pair of sequences
+  for (int i = 0; i < numSeqs; i++){
+    for (int j = i+1; j < numSeqs; j++){
+      Sequence *seq1 = sequences->GetSequence (i);
+      Sequence *seq2 = sequences->GetSequence (j);
+
+      if (enableVerbose)
+        cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
+             << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
+
+      // get the original posterior matrix
+      VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
+      VF &posterior = *posteriorPtr;
+
+      const int seq1Length = seq1->GetLength();
+      const int seq2Length = seq2->GetLength();
+
+      // contribution from the summation where z = x and z = y
+      for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
+
+      if (enableVerbose)
+        cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+      // contribution from all other sequences
+      for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
+        Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
+      }
+
+      // now renormalization
+      for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
+
+      // save the new posterior matrix
+      newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
+      newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
+
+      if (enableVerbose)
+        cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+      delete posteriorPtr;
+
+      if (enableVerbose)
+        cerr << "done." << endl;
+    }
+  }
+
+  // now replace the old posterior matrices
+  for (int i = 0; i < numSeqs; i++){
+    for (int j = 0; j < numSeqs; j++){
+      delete sparseMatrices[i][j];
+      sparseMatrices[i][j] = newSparseMatrices[i][j];
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
+
+  assert (matXZ);
+  assert (matZY);
+
+  int lengthX = matXZ->GetSeq1Length();
+  int lengthY = matZY->GetSeq2Length();
+  assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+  // for every x[i]
+  for (int i = 1; i <= lengthX; i++){
+    SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
+    SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
+
+    VF::iterator base = posterior.begin() + i * (lengthY + 1);
+
+    // iterate through all x[i]-z[k]
+    while (XZptr != XZend){
+      SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
+      SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
+      const float XZval = XZptr->second;
+
+      // iterate through all z[k]-y[j]
+      while (ZYptr != ZYend){
+        base[ZYptr->first] += XZval * ZYptr->second;;
+        ZYptr++;
+      }
+      XZptr++;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partionining iterative
+// refinement.
+/////////////////////////////////////////////////////////////////
+
+void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model, MultiSequence* &alignment){
+  set<int> groupOne, groupTwo;
+
+  // create two separate groups
+  for (int i = 0; i < alignment->GetNumSequences(); i++){
+    if (random() % 2)
+      groupOne.insert (i);
+    else
+      groupTwo.insert (i);
+  }
+
+  if (groupOne.empty() || groupTwo.empty()) return;
+
+  // project into the two groups
+  MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
+  MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
+  delete alignment;
+
+  // realign
+  alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
+}
+
+/*
+float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs){
+  int totLength = 0;
+  float score = 0;
+
+  for (int a = 0; a < alignment->GetNumSequences(); a++){
+    for (int b = a+1; b < alignment->GetNumSequences(); b++){
+      Sequence *seq1 = alignment->GetSequence(a);
+      Sequence *seq2 = alignment->GetSequence(b);
+
+      const int seq1Length = sequences->GetSequence(seq1->GetLabel())->GetLength();
+      const int seq2Length = sequences->GetSequence(seq2->GetLabel())->GetLength();
+
+      totLength += min (seq1Length, seq2Length);
+
+      int pos1 = 0, pos2 = 0;
+      for (int i = 1; i <= seq1->GetLength(); i++){
+        char ch1 = seq1->GetPosition(i);
+        char ch2 = seq2->GetPosition(i);
+
+        if (ch1 != '-') pos1++;
+        if (ch2 != '-') pos2++;
+        if (ch1 != '-' && ch2 != '-'){
+          score += sparseMatrices[a * numSeqs + b]->GetValue (pos1, pos2);
+        }
+      }
+    }
+  }
+
+  return score / totLength;
+}
+*/

Added: trunk/packages/probcons/branches/upstream/current/Main.cc
===================================================================
--- trunk/packages/probcons/branches/upstream/current/Main.cc	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/Main.cc	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,1445 @@
+/////////////////////////////////////////////////////////////////
+// Main.cc
+//
+// Main routines for PROBCONS program.
+/////////////////////////////////////////////////////////////////
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "Defaults.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "EvolutionaryTree.h"
+#include "SparseMatrix.h"
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+
+string parametersInputFilename = "";
+string parametersOutputFilename = "no training";
+string annotationFilename = "";
+
+bool enableTraining = false;
+bool enableVerbose = false;
+bool enableAllPairs = false;
+bool enableAnnotation = false;
+bool enableViterbi = false;
+bool enableClustalWOutput = false;
+bool enableTrainEmissions = false;
+bool enableAlignOrder = false;
+int numConsistencyReps = 2;
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+float cutoff = 0;
+float gapOpenPenalty = 0;
+float gapContinuePenalty = 0;
+
+VF initDistrib (NumMatrixTypes);
+VF gapOpen (2*NumInsertStates);
+VF gapExtend (2*NumInsertStates);
+VVF emitPairs (256, VF (256, 1e-10));
+VF emitSingle (256, 1e-5);
+
+string alphabet = alphabetDefault;
+
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+/////////////////////////////////////////////////////////////////
+// Function prototypes
+/////////////////////////////////////////////////////////////////
+
+void PrintHeading();
+void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
+                      const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
+MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
+			VVF &emitPairs, VF &emitSingle);
+SafeVector<string> ParseParams (int argc, char **argv);
+void ReadParameters ();
+MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
+                                      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                      const ProbabilisticModel &model);
+MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
+                                const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                const ProbabilisticModel &model);
+SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences, 
+						      SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
+void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
+
+set<int> GetSubtree (const TreeNode *tree);
+void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                              const ProbabilisticModel &model, MultiSequence* &alignment,
+                              const TreeNode *tree);
+void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model, MultiSequence* &alignment);
+void WriteAnnotation (MultiSequence *alignment,
+		      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+int ComputeScore (const SafeVector<pair<int, int> > &active, 
+		  const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+
+
+/////////////////////////////////////////////////////////////////
+// main()
+//
+// Calls all initialization routines and runs the PROBCONS
+// aligner.
+/////////////////////////////////////////////////////////////////
+
+int main (int argc, char **argv){
+
+  // print PROBCONS heading
+  PrintHeading();
+  
+  // parse program parameters
+  SafeVector<string> sequenceNames = ParseParams (argc, argv);
+  ReadParameters();
+  PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+  // now, we'll process all the files given as input.  If we are given
+  // several filenames as input, then we'll load all of those sequences
+  // simultaneously, as long as we're not training.  On the other hand,
+  // if we are training, then we'll treat each file as a separate
+  // training instance
+  
+  // if we are training
+  if (enableTraining){
+
+    // build new model for aligning
+    ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
+
+    // prepare to average parameters
+    for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
+    for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
+    for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
+    if (enableTrainEmissions){
+      for (int i = 0; i < (int) emitPairs.size(); i++)
+	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
+      for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
+    }
+   
+    // align each file individually
+    for (int i = 0; i < (int) sequenceNames.size(); i++){
+
+      VF thisInitDistrib (NumMatrixTypes);
+      VF thisGapOpen (2*NumInsertStates);
+      VF thisGapExtend (2*NumInsertStates);
+      VVF thisEmitPairs (256, VF (256, 1e-10));
+      VF thisEmitSingle (256, 1e-5);
+      
+      // load sequence file
+      MultiSequence *sequences = new MultiSequence(); assert (sequences);
+      cerr << "Loading sequence file: " << sequenceNames[i] << endl;
+      sequences->LoadMFA (sequenceNames[i], true);
+
+      // align sequences
+      DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
+
+      // add in contribution of the derived parameters
+      for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
+      for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
+      for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
+      if (enableTrainEmissions){
+	for (int i = 0; i < (int) emitPairs.size(); i++) 
+	  for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
+	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
+      }
+
+      delete sequences;
+    }
+
+    // compute new parameters and print them out
+    for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
+    for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
+    for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
+    if (enableTrainEmissions){
+      for (int i = 0; i < (int) emitPairs.size(); i++) 
+	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
+      for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
+    }
+    
+    PrintParameters ("Trained parameter set:",
+                     initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
+                     parametersOutputFilename.c_str());
+  }
+
+  // if we are not training, we must simply want to align some sequences
+  else {
+
+    // load all files together
+    MultiSequence *sequences = new MultiSequence(); assert (sequences);
+    for (int i = 0; i < (int) sequenceNames.size(); i++){
+      cerr << "Loading sequence file: " << sequenceNames[i] << endl;
+      sequences->LoadMFA (sequenceNames[i], true);
+    }
+
+    // do all "pre-training" repetitions first
+    for (int ct = 0; ct < numPreTrainingReps; ct++){
+      enableTraining = true;
+
+      // build new model for aligning
+      ProbabilisticModel model (initDistrib, gapOpen, gapExtend, 
+                                emitPairs, emitSingle);
+
+      // do initial alignments
+      DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
+
+      // print new parameters
+      PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+      enableTraining = false;
+    }
+
+    // now, we can perform the alignments and write them out
+    MultiSequence *alignment = DoAlign (sequences,
+                                        ProbabilisticModel (initDistrib, gapOpen, gapExtend,  emitPairs, emitSingle),
+                                        initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
+    
+    if (!enableAllPairs){
+      if (enableClustalWOutput)
+	alignment->WriteALN (cout);
+      else
+	alignment->WriteMFA (cout);
+    }
+    delete alignment;
+    delete sequences;
+   
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// PrintHeading()
+//
+// Prints heading for PROBCONS program.
+/////////////////////////////////////////////////////////////////
+
+void PrintHeading (){
+  cerr << endl
+       << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
+       << "Written by Chuong Do" << endl
+       << endl;
+}
+
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints PROBCONS parameters to STDERR.  If a filename is
+// specified, then the parameters are also written to the file.
+/////////////////////////////////////////////////////////////////
+
+void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
+                      const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
+
+  // print parameters to the screen
+  cerr << message << endl
+       << "    initDistrib[] = { ";
+  for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
+  cerr << "}" << endl
+       << "        gapOpen[] = { ";
+  for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
+  cerr << "}" << endl
+       << "      gapExtend[] = { ";
+  for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
+  cerr << "}" << endl
+       << endl;
+
+  /*
+  for (int i = 0; i < 5; i++){
+    for (int j = 0; j <= i; j++){
+      cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
+    }
+    cerr << endl;
+    }*/
+
+  // if a file name is specified
+  if (filename){
+
+    // attempt to open the file for writing
+    FILE *file = fopen (filename, "w");
+    if (!file){
+      cerr << "ERROR: Unable to write parameter file: " << filename << endl;
+      exit (1);
+    }
+
+    // if successful, then write the parameters to the file
+    for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
+    for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
+    for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
+    fprintf (file, "%s\n", alphabet.c_str());
+    for (int i = 0; i < (int) alphabet.size(); i++){
+      for (int j = 0; j <= i; j++)
+	fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
+      fprintf (file, "\n");
+    }
+    for (int i = 0; i < (int) alphabet.size(); i++)
+      fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
+    fprintf (file, "\n");
+    fclose (file);
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// DoAlign()
+//
+// First computes all pairwise posterior probability matrices.
+// Then, computes new parameters if training, or a final
+// alignment, otherwise.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, 
+			VF &gapExtend, VVF &emitPairs, VF &emitSingle){
+
+  assert (sequences);
+
+  const int numSeqs = sequences->GetNumSequences();
+  VVF distances (numSeqs, VF (numSeqs, 0));
+  SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+
+
+  if (enableTraining){
+    // prepare to average parameters
+    for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
+    for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
+    for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
+    if (enableTrainEmissions){
+      for (int i = 0; i < (int) emitPairs.size(); i++)
+	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
+      for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
+    }
+  }
+
+  // skip posterior calculations if we just want to do Viterbi alignments
+  if (!enableViterbi){
+
+    // do all pairwise alignments for posterior probability matrices
+    for (int a = 0; a < numSeqs-1; a++){
+      for (int b = a+1; b < numSeqs; b++){
+	Sequence *seq1 = sequences->GetSequence (a);
+	Sequence *seq2 = sequences->GetSequence (b);
+	
+	// verbose output
+	if (enableVerbose)
+	  cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
+	       << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
+	
+	// compute forward and backward probabilities
+	VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
+	VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
+	
+	// if we are training, then we'll simply want to compute the
+	// expected counts for each region within the matrix separately;
+	// otherwise, we'll need to put all of the regions together and
+	// assemble a posterior probability match matrix
+	
+	// so, if we're training
+	if (enableTraining){
+	  
+	  // compute new parameters
+	  VF thisInitDistrib (NumMatrixTypes);
+	  VF thisGapOpen (2*NumInsertStates);
+	  VF thisGapExtend (2*NumInsertStates);
+	  VVF thisEmitPairs (256, VF (256, 1e-10));
+	  VF thisEmitSingle (256, 1e-5);
+	  
+	  model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
+
+	  // add in contribution of the derived parameters
+	  for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
+	  for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
+	  for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
+	  if (enableTrainEmissions){
+	    for (int i = 0; i < (int) emitPairs.size(); i++) 
+	      for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
+	    for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
+	  }
+
+	  // let us know that we're done.
+	  if (enableVerbose) cerr << "done." << endl;
+	}
+	else {
+
+	  // compute posterior probability matrix
+	  VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
+
+	  // compute sparse representations
+	  sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
+	  sparseMatrices[b][a] = NULL; 
+	  
+	  if (!enableAllPairs){
+	    // perform the pairwise sequence alignment
+	    pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
+										seq2->GetLength(),
+										*posterior);
+	    
+	    // compute "expected accuracy" distance for evolutionary tree computation
+	    float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
+	    distances[a][b] = distances[b][a] = distance;
+	    
+	    if (enableVerbose)
+	      cerr << setprecision (10) << distance << endl;
+	    
+	      delete alignment.first;
+	  }
+	  else {
+	    // let us know that we're done.
+	    if (enableVerbose) cerr << "done." << endl;
+	  }
+	  
+	  delete posterior;
+	}
+	
+	delete forward;
+	delete backward;
+      }
+    }
+  }
+
+  // now average out parameters derived
+  if (enableTraining){
+
+    // compute new parameters
+    for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
+    for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
+    for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
+
+    if (enableTrainEmissions){
+      for (int i = 0; i < (int) emitPairs.size(); i++)
+	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
+      for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
+    }
+  }
+
+  // see if we still want to do some alignments
+  else {
+
+    if (!enableViterbi){
+      
+      // perform the consistency transformation the desired number of times
+      for (int r = 0; r < numConsistencyReps; r++){
+	SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
+
+	// now replace the old posterior matrices
+	for (int i = 0; i < numSeqs; i++){
+	  for (int j = 0; j < numSeqs; j++){
+	    delete sparseMatrices[i][j];
+	    sparseMatrices[i][j] = newSparseMatrices[i][j];
+	  }
+	}
+      }
+    }
+
+    MultiSequence *finalAlignment = NULL;
+
+    if (enableAllPairs){
+      for (int a = 0; a < numSeqs-1; a++){
+	for (int b = a+1; b < numSeqs; b++){
+	  Sequence *seq1 = sequences->GetSequence (a);
+	  Sequence *seq2 = sequences->GetSequence (b);
+	  
+	  if (enableVerbose)
+	    cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
+		 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
+
+	  
+	  // perform the pairwise sequence alignment
+	  pair<SafeVector<char> *, float> alignment;
+	  if (enableViterbi)
+	    alignment = model.ComputeViterbiAlignment (seq1, seq2);
+	  else {
+
+	    // build posterior matrix
+	    VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
+	    int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
+	    for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
+
+	    alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
+	    delete posterior;
+	  }
+
+	  // write pairwise alignments 
+	  string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
+	  ofstream outfile (name.c_str());
+	  
+	  MultiSequence *result = new MultiSequence();
+	  result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
+	  result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
+	  if (enableClustalWOutput)
+	    result->WriteALN (outfile);
+	  else
+	    result->WriteMFA (outfile);
+	  
+	  outfile.close();
+	  
+	  delete alignment.first;
+	}
+      }
+    }
+    
+    // now if we still need to do a final multiple alignment
+    else {
+      
+      if (enableVerbose)
+	cerr << endl;
+      
+      // compute the evolutionary tree
+      TreeNode *tree = TreeNode::ComputeTree (distances);
+      
+      tree->Print (cerr, sequences);
+      cerr << endl;
+      
+      // make the final alignment
+      finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
+      
+      // build annotation
+      if (enableAnnotation){
+	WriteAnnotation (finalAlignment, sparseMatrices);
+      }
+
+      delete tree;
+    }
+
+    if (!enableViterbi){
+      // delete sparse matrices
+      for (int a = 0; a < numSeqs-1; a++){
+	for (int b = a+1; b < numSeqs; b++){
+	  delete sparseMatrices[a][b];
+	  delete sparseMatrices[b][a];
+	}
+      }
+    }
+
+    return finalAlignment;
+  }
+
+  return NULL;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Attempts to parse an integer from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger (char *data, int *val){
+  char *endPtr;
+  long int retVal;
+
+  assert (val);
+
+  errno = 0;
+  retVal = strtol (data, &endPtr, 0);
+  if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
+  if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
+  if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
+  *val = (int) retVal;
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Attempts to parse a float from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat (char *data, float *val){
+  char *endPtr;
+  double retVal;
+
+  assert (val);
+
+  errno = 0;
+  retVal = strtod (data, &endPtr);
+  if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
+  if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
+  *val = (float) retVal;
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parse all command-line options.
+/////////////////////////////////////////////////////////////////
+
+SafeVector<string> ParseParams (int argc, char **argv){
+
+  if (argc < 2){
+
+    cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
+         << "you are welcome to redistribute it under certain conditions.  See the" << endl
+         << "file COPYING.txt for details." << endl
+         << endl
+         << "Usage:" << endl
+         << "       probcons [OPTION]... [MFAFILE]..." << endl
+         << endl
+         << "Description:" << endl
+         << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
+         << endl
+         << "       -clustalw" << endl
+         << "              use CLUSTALW output format instead of MFA" << endl
+         << endl
+         << "       -c, --consistency REPS" << endl
+         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
+         << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
+         << endl
+         << "       -ir, --iterative-refinement REPS" << endl
+         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
+         << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
+         << endl
+	 << "       -pre, --pre-training REPS" << endl
+	 << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
+	 << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
+	 << endl
+	 << "       -pairs" << endl
+         << "              generate all-pairs pairwise alignments" << endl
+         << endl
+	 << "       -viterbi" << endl
+	 << "              use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
+	 << endl
+         << "       -v, --verbose" << endl
+         << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
+         << endl
+         << "       -annot FILENAME" << endl
+         << "              write annotation for multiple alignment to FILENAME" << endl
+         << endl
+         << "       -t, --train FILENAME" << endl
+         << "              compute EM transition probabilities, store in FILENAME (default: "
+         << parametersOutputFilename << ")" << endl
+         << endl
+         << "       -e, --emissions" << endl
+         << "              also reestimate emission probabilities (default: "
+         << (enableTrainEmissions ? "on" : "off") << ")" << endl
+         << endl
+	 << "       -p, --paramfile FILENAME" << endl
+	 << "              read parameters from FILENAME (default: "
+	 << parametersInputFilename << ")" << endl
+	 << endl
+	 << "       -a, --alignment-order" << endl
+	 << "              print sequences in alignment order rather than input order (default: "
+	 << (enableAlignOrder ? "on" : "off") << ")" << endl
+	 << endl;
+    //     	 << "       -go, --gap-open VALUE" << endl
+    //     	 << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
+    //	 << endl
+    //	 << "       -ge, --gap-extension VALUE" << endl
+    //	 << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
+    //	 << endl
+    //         << "       -co, --cutoff CUTOFF" << endl
+    //         << "              subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
+    //         << endl
+    
+    exit (1);
+  }
+
+  SafeVector<string> sequenceNames;
+  int tempInt;
+  float tempFloat;
+
+  for (int i = 1; i < argc; i++){
+    if (argv[i][0] == '-'){
+
+      // training
+      if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
+        enableTraining = true;
+        if (i < argc - 1)
+          parametersOutputFilename = string (argv[++i]);
+        else {
+          cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+      
+      // emission training
+      else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
+        enableTrainEmissions = true;
+      }
+
+      // parameter file
+      else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
+        if (i < argc - 1)
+          parametersInputFilename = string (argv[++i]);
+        else {
+          cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of consistency transformations
+      else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numConsistencyReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of randomized partitioning iterative refinement passes
+      else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numIterativeRefinementReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // number of EM pre-training rounds
+      else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
+        if (i < argc - 1){
+          if (!GetInteger (argv[++i], &tempInt)){
+            cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
+              cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
+                   << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
+              exit (1);
+            }
+            else
+              numPreTrainingReps = tempInt;
+          }
+        }
+        else {
+          cerr << "ERROR: Integer expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // gap open penalty
+      else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
+        if (i < argc - 1){
+          if (!GetFloat (argv[++i], &tempFloat)){
+            cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempFloat > 0){
+              cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
+              exit (1);
+            }
+            else
+              gapOpenPenalty = tempFloat;
+          }
+        }
+        else {
+          cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // gap extension penalty
+      else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
+        if (i < argc - 1){
+          if (!GetFloat (argv[++i], &tempFloat)){
+            cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempFloat > 0){
+              cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
+              exit (1);
+            }
+            else
+              gapContinuePenalty = tempFloat;
+          }
+        }
+        else {
+          cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // all-pairs pairwise alignments
+      else if (!strcmp (argv[i], "-pairs")){
+        enableAllPairs = true;
+      }
+
+      // all-pairs pairwise Viterbi alignments
+      else if (!strcmp (argv[i], "-viterbi")){
+        enableAllPairs = true;
+	enableViterbi = true;
+      }
+
+      // annotation files
+      else if (!strcmp (argv[i], "-annot")){
+        enableAnnotation = true;
+        if (i < argc - 1)
+	  annotationFilename = argv[++i];
+        else {
+          cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // clustalw output format
+      else if (!strcmp (argv[i], "-clustalw")){
+	enableClustalWOutput = true;
+      }
+
+      // cutoff
+      else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
+        if (i < argc - 1){
+          if (!GetFloat (argv[++i], &tempFloat)){
+            cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
+            exit (1);
+          }
+          else {
+            if (tempFloat < 0 || tempFloat > 1){
+              cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
+              exit (1);
+            }
+            else
+              cutoff = tempFloat;
+          }
+        }
+        else {
+          cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
+          exit (1);
+        }
+      }
+
+      // verbose reporting
+      else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
+        enableVerbose = true;
+      }
+
+      // alignment order
+      else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
+	enableAlignOrder = true;
+      }
+
+      // bad arguments
+      else {
+        cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
+        exit (1);
+      }
+    }
+    else {
+      sequenceNames.push_back (string (argv[i]));
+    }
+  }
+
+  if (enableTrainEmissions && !enableTraining){
+    cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
+    exit (1);
+  }
+
+  return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Read initial distribution, transition, and emission
+// parameters from a file.
+/////////////////////////////////////////////////////////////////
+
+void ReadParameters (){
+
+  ifstream data;
+
+  emitPairs = VVF (256, VF (256, 1e-10));
+  emitSingle = VF (256, 1e-5);
+
+  // read initial state distribution and transition parameters
+  if (parametersInputFilename == string ("")){
+    if (NumInsertStates == 1){
+      for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
+    }
+    else if (NumInsertStates == 2){
+      for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
+      for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
+    }
+    else {
+      cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
+           << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
+      exit (1);
+    }
+
+    alphabet = alphabetDefault;
+
+    for (int i = 0; i < (int) alphabet.length(); i++){
+      emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
+      emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
+      for (int j = 0; j <= i; j++){
+	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
+	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
+      }
+    }
+  }
+  else {
+    data.open (parametersInputFilename.c_str());
+    if (data.fail()){
+      cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
+      exit (1);
+    }
+    
+    string line[3];
+    for (int i = 0; i < 3; i++){
+      if (!getline (data, line[i])){
+	cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
+	exit (1);
+      }
+    }
+    istringstream data2;
+    data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
+    data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
+    data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
+
+    if (!getline (data, line[0])){
+      cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
+      exit (1);
+    }
+    
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    string token;
+    data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
+
+    for (int i = 0; i < (int) alphabet.size(); i++){
+      for (int j = 0; j <= i; j++){
+	float val;
+        data >> val;
+	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
+	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
+	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
+	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
+	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
+	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
+	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
+	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
+      }
+    }
+
+    for (int i = 0; i < (int) alphabet.size(); i++){
+      float val;
+      data >> val;
+      emitSingle[(unsigned char) tolower(alphabet[i])] = val;
+      emitSingle[(unsigned char) toupper(alphabet[i])] = val;
+    }
+    data.close();
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Process the tree recursively.  Returns the aligned sequences
+// corresponding to a node or leaf of the tree.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
+                            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model){
+  MultiSequence *result;
+
+  // check if this is a node of the alignment tree
+  if (tree->GetSequenceLabel() == -1){
+    MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
+    MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
+
+    assert (alignLeft);
+    assert (alignRight);
+
+    result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
+    assert (result);
+
+    delete alignLeft;
+    delete alignRight;
+  }
+
+  // otherwise, this is a leaf of the alignment tree
+  else {
+    result = new MultiSequence(); assert (result);
+    result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
+  }
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Compute the final alignment by calling ProcessTree(), then
+// performing iterative refinement as needed.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
+                                      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                      const ProbabilisticModel &model){
+
+  MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
+
+  if (enableAlignOrder){
+    alignment->SaveOrdering();
+    enableAlignOrder = false;
+  }
+
+  // tree-based refinement
+  // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
+
+  // iterative refinement
+  for (int i = 0; i < numIterativeRefinementReps; i++)
+    DoIterativeRefinement (sparseMatrices, model, alignment);
+
+  cerr << endl;
+
+  // return final alignment
+  return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Returns the alignment of two MultiSequence objects.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
+                                const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                                const ProbabilisticModel &model){
+
+  // print some info about the alignment
+  if (enableVerbose){
+    for (int i = 0; i < align1->GetNumSequences(); i++)
+      cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
+    cerr << "] vs. ";
+    for (int i = 0; i < align2->GetNumSequences(); i++)
+      cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
+    cerr << "]: ";
+  }
+
+  VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
+  pair<SafeVector<char> *, float> alignment;
+
+  // choose the alignment routine depending on the "cosmetic" gap penalties used
+  if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
+    alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
+  else
+    alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
+                                                        *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
+                                                        gapOpenPenalty, gapContinuePenalty);
+
+  delete posterior;
+
+  if (enableVerbose){
+
+    // compute total length of sequences
+    int totLength = 0;
+    for (int i = 0; i < align1->GetNumSequences(); i++)
+      for (int j = 0; j < align2->GetNumSequences(); j++)
+        totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
+
+    // give an "accuracy" measure for the alignment
+    cerr << alignment.second / totLength << endl;
+  }
+
+  // now build final alignment
+  MultiSequence *result = new MultiSequence();
+  for (int i = 0; i < align1->GetNumSequences(); i++)
+    result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
+  for (int i = 0; i < align2->GetNumSequences(); i++)
+    result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
+  if (!enableAlignOrder)
+    result->SortByLabel();
+
+  // free temporary alignment
+  delete alignment.first;
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the consistency transformation.  The
+// formula used is:
+//                     1
+//    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
+//                    |S| z in S  k
+//
+// where S = {x, y, all other sequences...}
+//
+/////////////////////////////////////////////////////////////////
+
+SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences, 
+						      SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
+  const int numSeqs = sequences->GetNumSequences();
+
+  SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+  // for every pair of sequences
+  for (int i = 0; i < numSeqs; i++){
+    for (int j = i+1; j < numSeqs; j++){
+      Sequence *seq1 = sequences->GetSequence (i);
+      Sequence *seq2 = sequences->GetSequence (j);
+
+      if (enableVerbose)
+        cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
+             << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
+
+      // get the original posterior matrix
+      VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
+      VF &posterior = *posteriorPtr;
+
+      const int seq1Length = seq1->GetLength();
+      const int seq2Length = seq2->GetLength();
+
+      // contribution from the summation where z = x and z = y
+      for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
+
+      if (enableVerbose)
+        cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+      // contribution from all other sequences
+      for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
+	if (k < i)
+	  Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
+	else if (k > i && k < j)
+	  Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
+	else {
+	  SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
+	  Relax (sparseMatrices[i][k], temp, posterior);
+	  delete temp;
+	}
+      }
+
+      // now renormalization
+      for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
+
+      // mask out positions not originally in the posterior matrix
+      SparseMatrix *matXY = sparseMatrices[i][j];
+      for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
+      for (int x = 1; x <= seq1Length; x++){
+	SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+	SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+	VF::iterator base = posterior.begin() + x * (seq2Length + 1);
+	int curr = 0;
+	while (XYptr != XYend){
+
+	  // zero out all cells until the first filled column
+	  while (curr < XYptr->first){
+	    base[curr] = 0;
+	    curr++;
+	  }
+
+	  // now, skip over this column
+	  curr++;
+	  ++XYptr;
+	}
+	
+	// zero out cells after last column
+	while (curr <= seq2Length){
+	  base[curr] = 0;
+	  curr++;
+	}
+      }
+
+      // save the new posterior matrix
+      newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
+      newSparseMatrices[j][i] = NULL;
+
+      if (enableVerbose)
+        cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+      delete posteriorPtr;
+
+      if (enableVerbose)
+        cerr << "done." << endl;
+    }
+  }
+  
+  return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
+
+  assert (matXZ);
+  assert (matZY);
+
+  int lengthX = matXZ->GetSeq1Length();
+  int lengthY = matZY->GetSeq2Length();
+  assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+  // for every x[i]
+  for (int i = 1; i <= lengthX; i++){
+    SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
+    SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
+
+    VF::iterator base = posterior.begin() + i * (lengthY + 1);
+
+    // iterate through all x[i]-z[k]
+    while (XZptr != XZend){
+      SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
+      SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
+      const float XZval = XZptr->second;
+
+      // iterate through all z[k]-y[j]
+      while (ZYptr != ZYend){
+        base[ZYptr->first] += XZval * ZYptr->second;
+        ZYptr++;
+      }
+      XZptr++;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
+
+  assert (matZX);
+  assert (matZY);
+
+  int lengthZ = matZX->GetSeq1Length();
+  int lengthY = matZY->GetSeq2Length();
+
+  // for every z[k]
+  for (int k = 1; k <= lengthZ; k++){
+    SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
+    SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
+
+    // iterate through all z[k]-x[i]
+    while (ZXptr != ZXend){
+      SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
+      SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
+      const float ZXval = ZXptr->second;
+      VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
+
+      // iterate through all z[k]-y[j]
+      while (ZYptr != ZYend){
+        base[ZYptr->first] += ZXval * ZYptr->second;
+        ZYptr++;
+      }
+      ZXptr++;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// GetSubtree
+//
+// Returns set containing all leaf labels of the current subtree.
+/////////////////////////////////////////////////////////////////
+
+set<int> GetSubtree (const TreeNode *tree){
+  set<int> s;
+
+  if (tree->GetSequenceLabel() == -1){
+    s = GetSubtree (tree->GetLeftChild());
+    set<int> t = GetSubtree (tree->GetRightChild());
+
+    for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
+      s.insert (*iter);
+  }
+  else {
+    s.insert (tree->GetSequenceLabel());
+  }
+
+  return s;
+}
+
+/////////////////////////////////////////////////////////////////
+// TreeBasedBiPartitioning
+//
+// Uses the iterative refinement scheme from MUSCLE.
+/////////////////////////////////////////////////////////////////
+
+void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                              const ProbabilisticModel &model, MultiSequence* &alignment,
+                              const TreeNode *tree){
+  // check if this is a node of the alignment tree
+  if (tree->GetSequenceLabel() == -1){
+    TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
+    TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
+
+    set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
+    set<int> rightSubtree = GetSubtree (tree->GetRightChild());
+    set<int> leftSubtreeComplement, rightSubtreeComplement;
+
+    // calculate complement of each subtree
+    for (int i = 0; i < alignment->GetNumSequences(); i++){
+      if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
+      if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
+    }
+
+    // perform realignments for edge to left child
+    if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
+      MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
+      MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
+      delete alignment;
+      alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
+    }
+
+    // perform realignments for edge to right child
+    if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
+      MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
+      MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
+      delete alignment;
+      alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partionining iterative
+// refinement.
+/////////////////////////////////////////////////////////////////
+
+void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+                            const ProbabilisticModel &model, MultiSequence* &alignment){
+  set<int> groupOne, groupTwo;
+
+  // create two separate groups
+  for (int i = 0; i < alignment->GetNumSequences(); i++){
+    if (rand() % 2)
+      groupOne.insert (i);
+    else
+      groupTwo.insert (i);
+  }
+
+  if (groupOne.empty() || groupTwo.empty()) return;
+
+  // project into the two groups
+  MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
+  MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
+  delete alignment;
+
+  // realign
+  alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
+
+  delete groupOneSeqs;
+  delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Computes annotation for multiple alignment and write values
+// to a file.
+/////////////////////////////////////////////////////////////////
+
+void WriteAnnotation (MultiSequence *alignment, 
+		      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
+  ofstream outfile (annotationFilename.c_str());
+  
+  if (outfile.fail()){
+    cerr << "ERROR: Unable to write annotation file." << endl;
+    exit (1);
+  }
+
+  const int alignLength = alignment->GetSequence(0)->GetLength();
+  const int numSeqs = alignment->GetNumSequences();
+  
+  SafeVector<int> position (numSeqs, 0);
+  SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
+  for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
+  SafeVector<pair<int,int> > active;
+  active.reserve (numSeqs);
+  
+  // for every column
+  for (int i = 1; i <= alignLength; i++){
+    
+    // find all aligned residues in this particular column
+    active.clear();
+    for (int j = 0; j < numSeqs; j++){
+      if (seqs[j][i] != '-'){
+	active.push_back (make_pair(j, ++position[j]));
+      }
+    }
+    
+    outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
+  }
+  
+  outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column.
+/////////////////////////////////////////////////////////////////
+
+int ComputeScore (const SafeVector<pair<int, int> > &active, 
+		  const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
+
+  if (active.size() <= 1) return 0;
+  
+  // ALTERNATIVE #1: Compute the average alignment score.
+
+  float val = 0;
+  for (int i = 0; i < (int) active.size(); i++){
+    for (int j = i+1; j < (int) active.size(); j++){
+      val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
+    }
+  }
+
+  return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
+  
+}

Added: trunk/packages/probcons/branches/upstream/current/MakeGnuPlot.cc
===================================================================
--- trunk/packages/probcons/branches/upstream/current/MakeGnuPlot.cc	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/MakeGnuPlot.cc	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,58 @@
+/////////////////////////////////////////////////////////////////
+// MakeGnuPlot.cc
+/////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+int main (int argc, char **argv){
+  
+  if (argc == 1 || argc > 3){
+    cerr << "Usage: makegnuplot annotscores [refscores]" << endl;
+    exit (1);
+  }
+
+  ifstream data (argv[1]);
+
+  if (data.fail()){
+    cerr << "ERROR: Could not open file " << argv[1] << endl;
+    exit (1);
+  }
+  
+  int x, ct = 0;
+  while (data >> x) ct++;
+  data.close();
+  
+  ofstream out ("temporary_gnuplot_script");
+  
+  if (out.fail()){
+    cerr << "ERROR: Could not create temporary file." << endl;
+    exit (1);
+  }
+
+  out << "set title \"Column Reliability Scores\"" << endl
+      << "set xlabel \"Alignment Position\"" << endl
+      << "set ylabel \"Column Reliability\"" << endl
+      << "set xr [1:" << ct << "]" << endl
+      << "set term postscript enhanced color" << endl
+      << "set output \"reliability.ps\"" << endl;
+  
+  if (argc == 3){
+    out << "set style fill solid 0.5 noborder" << endl
+	<< "plot \"" << argv[2] << "\" title \"actual\" with boxes lt 2, \\" << endl
+	<< "     \"" << argv[1] << "\" title \"predicted\" with histeps lt 1 lw 3" << endl;
+  }
+  else 
+    out << "plot \"" << argv[1] << "\" title \"predicted\" with histeps lt 1 lw 3" << endl;
+
+  out.close();
+
+  if (system ("gnuplot temporary_gnuplot_script") == -1){
+    cerr << "ERROR: Could not run Gnuplot correctly." << endl;
+    exit (1);
+  }
+  
+  //system ("rm temporary_gnuplot_script");
+}

Added: trunk/packages/probcons/branches/upstream/current/Makefile
===================================================================
--- trunk/packages/probcons/branches/upstream/current/Makefile	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/Makefile	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,56 @@
+################################################################################
+# Makefile for probcons
+################################################################################
+
+################################################################################
+# 1) Choose C++ compiler.
+################################################################################
+
+CXX = g++
+
+################################################################################
+# 2) Set C++ flags.
+#    a) DEBUG mode -- no optimizations, enable SafeVector checking, no inlining
+#    b) PROFILE mode -- for gprof
+#    c) RELEASE mode
+################################################################################
+
+OTHERFLAGS = -DNumInsertStates=2 -DVERSION="1.10"
+
+# debug mode    
+#CXXFLAGS = -g -W -Wall -pedantic -DENABLE_CHECKS -fno-inline $(OTHERFLAGS)
+
+# profile mode
+#CXXFLAGS = -pg -W -Wall -pedantic $(OTHERFLAGS)
+
+# release mode
+#CXXFLAGS = -O3 -W -Wall -pedantic -DNDEBUG $(OTHERFLAGS) -mmmx -msse -msse2 -mfpmath=sse -march=pentium4 -mcpu=pentium4 -funroll-loops -fomit-frame-pointer 
+CXXFLAGS = -O3 -W -Wall -pedantic -DNDEBUG $(OTHERFLAGS) -funroll-loops
+
+################################################################################
+# 3) Dependencies
+################################################################################
+
+TARGETS = probcons compare project makegnuplot
+
+.PHONY : all
+all : $(TARGETS)
+
+probcons : MultiSequence.h ProbabilisticModel.h ScoreType.h Sequence.h FileBuffer.h SparseMatrix.h EvolutionaryTree.h Defaults.h SafeVector.h Main.cc
+	$(CXX) $(CXXFLAGS) -lm -o probcons Main.cc 
+
+compare : MultiSequence.h Sequence.h FileBuffer.h SafeVector.h CompareToRef.cc
+	$(CXX) $(CXXFLAGS) -o compare CompareToRef.cc
+
+fixref : MultiSequence.h ProbabilisticModel.h ScoreType.h Sequence.h FileBuffer.h SparseMatrix.h EvolutionaryTree.h Defaults.h SafeVector.h FixRef.cc
+	$(CXX) $(CXXFLAGS) -o fixref FixRef.cc
+
+project : MultiSequence.h Sequence.h SafeVector.h ProjectPairwise.cc
+	$(CXX) $(CXXFLAGS) -o project ProjectPairwise.cc
+
+makegnuplot : MakeGnuPlot.cc
+	$(CXX) $(CXXFLAGS) -o makegnuplot MakeGnuPlot.cc
+
+.PHONY : clean
+clean:
+	rm -f $(TARGETS)

Added: trunk/packages/probcons/branches/upstream/current/MultiSequence.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/MultiSequence.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/MultiSequence.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,710 @@
+////////////////////////////////////////////////////////////////
+// MultiSequence.h
+//
+// Utilities for reading/writing multiple sequence data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef MULTISEQUENCE_H
+#define MULTISEQUENCE_H
+
+#include <cctype>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <set>
+#include "SafeVector.h"
+#include "Sequence.h"
+#include "FileBuffer.h"
+
+/////////////////////////////////////////////////////////////////
+// MultiSequence
+//
+// Class for multiple sequence alignment input/output.
+/////////////////////////////////////////////////////////////////
+
+class MultiSequence {
+
+  SafeVector<Sequence *> *sequences;
+
+ public:
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::MultiSequence()
+  //
+  // Default constructor.
+  /////////////////////////////////////////////////////////////////
+
+  MultiSequence () : sequences (NULL) {}
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::MultiSequence()
+  //
+  // Constructor.  Load MFA from a FileBuffer object.
+  /////////////////////////////////////////////////////////////////
+
+  MultiSequence (FileBuffer &infile) : sequences (NULL) {
+    LoadMFA (infile);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::MultiSequence()
+  //
+  // Constructor.  Load MFA from a filename.
+  /////////////////////////////////////////////////////////////////
+
+  MultiSequence (const string &filename) : sequences (NULL){
+    LoadMFA (filename);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::~MultiSequence()
+  //
+  // Destructor.  Gets rid of sequence objects contained in the
+  // multiple alignment.
+  /////////////////////////////////////////////////////////////////
+
+  ~MultiSequence(){
+
+    // if sequences allocated
+    if (sequences){
+
+      // free all sequences
+      for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
+        assert (*iter);
+        delete *iter;
+        *iter = NULL;
+      }
+
+      // free sequence vector
+      delete sequences;
+      sequences = NULL;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::LoadMFA()
+  //
+  // Load MFA from a filename.
+  /////////////////////////////////////////////////////////////////
+
+  void LoadMFA (const string &filename, bool stripGaps = false){
+
+    // try opening file
+    FileBuffer infile (filename.c_str());
+
+    if (infile.fail()){
+      cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl;
+      exit (1);
+    }
+
+    // if successful, then load using other LoadMFA() routine
+    LoadMFA (infile, stripGaps);
+
+    infile.close();
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::LoadMFA()
+  //
+  // Load MSF from a FileBuffer object.
+  /////////////////////////////////////////////////////////////////
+
+  void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){
+
+    SafeVector<SafeVector<char> *> seqData;
+    SafeVector<string> seqNames;
+    SafeVector<int> seqLengths;
+
+    istringstream in;
+    bool valid = true;
+    bool missingHeader = false;
+    bool clustalW = false;
+
+    // read until data starts
+    while (!infile.eof() && header.find ("..", 0) == string::npos){
+      if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){
+	clustalW = true;
+	break;
+      }
+      infile.GetLine (header);
+      if (header.find ("//", 0) != string::npos){
+        missingHeader = true;
+        break;
+      }
+    }
+
+    // read until end-of-file
+    while (valid){
+      infile.GetLine (header);
+      if (infile.eof()) break;
+
+      string word;
+      in.clear();
+      in.str(header);
+
+      // check if there's anything on this line
+      if (in >> word){
+
+	// clustalw name parsing
+	if (clustalW){
+	  if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){
+	    seqNames.push_back (word);
+	    seqData.push_back (new SafeVector<char>());
+	    seqLengths.push_back (0);
+	    seqData[(int) seqData.size() - 1]->push_back ('@');
+	  }	  
+	}
+
+        // look for new sequence label
+        if (word == string ("Name:")){
+          if (in >> word){
+            seqNames.push_back (word);
+            seqData.push_back (new SafeVector<char>());
+            seqLengths.push_back (0);
+            seqData[(int) seqData.size() - 1]->push_back ('@');
+          }
+          else
+            valid = false;
+        }
+
+        // check if this is sequence data
+        else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){
+          int index = find (seqNames.begin(), seqNames.end(), word) - seqNames.begin();
+
+          // read all remaining characters on the line
+          char ch;
+          while (in >> ch){
+            if (isspace (ch)) continue;
+            if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
+            if (ch == '.') ch = '-';
+	    if (stripGaps && ch == '-') continue;
+            if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
+              cerr << "ERROR: Unknown character encountered: " << ch << endl;
+              exit (1);
+            }
+
+            // everything's ok so far, so just store this character.
+            seqData[index]->push_back (ch);
+            seqLengths[index]++;
+          }
+        }
+        else if (missingHeader){
+          seqNames.push_back (word);
+          seqData.push_back (new SafeVector<char>());
+          seqLengths.push_back (0);
+          seqData[(int) seqData.size() - 1]->push_back ('@');
+
+          int index = (int) seqNames.size() - 1;
+
+          // read all remaining characters on the line
+          char ch;
+          while (in >> ch){
+            if (isspace (ch)) continue;
+            if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
+            if (ch == '.') ch = '-';
+	    if (stripGaps && ch == '-') continue;
+            if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
+              cerr << "ERROR: Unknown character encountered: " << ch << endl;
+              exit (1);
+            }
+
+            // everything's ok so far, so just store this character.
+            seqData[index]->push_back (ch);
+            seqLengths[index]++;
+          }
+        }
+      }
+    }
+
+    // check for errors
+    if (seqNames.size() == 0){
+      cerr << "ERROR: No sequences read!" << endl;
+      exit (1);
+    }
+
+    assert (!sequences);
+    sequences = new SafeVector<Sequence *>;
+    for (int i = 0; i < (int) seqNames.size(); i++){
+      if (seqLengths[i] == 0){
+        cerr << "ERROR: Sequence of zero length!" << endl;
+        exit (1);
+      }
+      Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i);
+      sequences->push_back (seq);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::LoadMFA()
+  //
+  // Load MFA from a FileBuffer object.
+  /////////////////////////////////////////////////////////////////
+
+  void LoadMFA (FileBuffer &infile, bool stripGaps = false){
+
+    // check to make sure that file reading is ok
+    if (infile.fail()){
+      cerr << "ERROR: Error reading file." << endl;
+      exit (1);
+    }
+
+    // read all sequences
+    while (true){
+
+      // get the sequence label as being the current # of sequences
+      // NOTE: sequence labels here are zero-based
+      int index = (!sequences) ? 0 : sequences->size();
+
+      // read the sequence
+      Sequence *seq = new Sequence (infile, stripGaps);
+      if (seq->Fail()){
+
+        // check if alternative file format (i.e. not MFA)
+        if (index == 0){
+          string header = seq->GetHeader();
+          if (header.length() > 0 && header[0] != '>'){
+
+            // try MSF format
+            ParseMSF (infile, header);
+            break;
+          }
+        }
+
+        delete seq;
+        break;
+      }
+      seq->SetLabel (index);
+
+      // add the sequence to the list of current sequences
+      if (!sequences) sequences = new SafeVector<Sequence *>;
+      sequences->push_back (seq);
+    }
+
+    // make sure at least one sequence was read
+    if (!sequences){
+      cerr << "ERROR: No sequences read." << endl;
+      exit (1);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::AddSequence()
+  //
+  // Add another sequence to an existing sequence list
+  /////////////////////////////////////////////////////////////////
+
+  void AddSequence (Sequence *sequence){
+    assert (sequence);
+    assert (!sequence->Fail());
+
+    // add sequence
+    if (!sequences) sequences = new SafeVector<Sequence *>;
+    sequences->push_back (sequence);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::RemoveSequence()
+  //
+  // Remove a sequence from the MultiSequence
+  /////////////////////////////////////////////////////////////////
+
+  void RemoveSequence (int index){
+    assert (sequences);
+
+    assert (index >= 0 && index < (int) sequences->size());
+    delete (*sequences)[index];
+
+    sequences->erase (sequences->begin() + index);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::WriteMFA()
+  //
+  // Write MFA to the outfile.  Allows the user to specify the
+  // number of columns for the output.  Also, useIndices determines
+  // whether or not the actual sequence comments will be printed
+  // out or whether the artificially assigned sequence labels will
+  // be used instead.
+  /////////////////////////////////////////////////////////////////
+
+  void WriteMFA (ostream &outfile, int numColumns = 60, bool useIndices = false){
+    if (!sequences) return;
+
+    // loop through all sequences and write them out
+    for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
+      (*iter)->WriteMFA (outfile, numColumns, useIndices);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::GetAnnotationChar()
+  //
+  // Return CLUSTALW annotation for column.
+  /////////////////////////////////////////////////////////////////
+
+  char GetAnnotationChar (SafeVector<char> &column){
+    SafeVector<int> counts (256, 0);
+    int allChars = (int) column.size();
+    
+    for (int i = 0; i < allChars; i++){
+      counts[(unsigned char) toupper(column[i])]++;
+    }
+    
+    allChars -= counts[(unsigned char) '-'];
+    if (allChars == 1) return ' ';
+    
+    for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*';
+    
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'T'] + 
+	counts[(unsigned char) 'A'] == allChars) 
+      return ':';
+    
+    if (counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'E'] + 
+	counts[(unsigned char) 'Q'] +
+	counts[(unsigned char) 'K'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'Q'] +
+	counts[(unsigned char) 'K'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'D'] + 
+	counts[(unsigned char) 'E'] +
+	counts[(unsigned char) 'Q'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'Q'] + 
+	counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'R'] +
+	counts[(unsigned char) 'K'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'M'] + 
+	counts[(unsigned char) 'I'] + 
+	counts[(unsigned char) 'L'] +
+	counts[(unsigned char) 'V'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'M'] + 
+	counts[(unsigned char) 'I'] + 
+	counts[(unsigned char) 'L'] +
+	counts[(unsigned char) 'F'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'Y'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'F'] + 
+	counts[(unsigned char) 'Y'] + 
+	counts[(unsigned char) 'W'] == allChars) 
+      return ':';
+
+    if (counts[(unsigned char) 'C'] + 
+	counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'A'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'A'] + 
+	counts[(unsigned char) 'T'] + 
+	counts[(unsigned char) 'V'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'A'] + 
+	counts[(unsigned char) 'G'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'T'] + 
+	counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'K'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'T'] + 
+	counts[(unsigned char) 'P'] + 
+	counts[(unsigned char) 'A'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'G'] + 
+	counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'D'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'S'] + 
+	counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'D'] + 
+	counts[(unsigned char) 'E'] + 
+	counts[(unsigned char) 'Q'] + 
+	counts[(unsigned char) 'K'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'D'] + 
+	counts[(unsigned char) 'E'] + 
+	counts[(unsigned char) 'Q'] + 
+	counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'K'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'N'] + 
+	counts[(unsigned char) 'E'] + 
+	counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'Q'] + 
+	counts[(unsigned char) 'R'] + 
+	counts[(unsigned char) 'K'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'F'] + 
+	counts[(unsigned char) 'V'] + 
+	counts[(unsigned char) 'L'] + 
+	counts[(unsigned char) 'I'] + 
+	counts[(unsigned char) 'M'] == allChars) 
+      return '.';
+
+    if (counts[(unsigned char) 'H'] + 
+	counts[(unsigned char) 'F'] + 
+	counts[(unsigned char) 'Y'] == allChars) 
+      return '.';
+
+    return ' ';
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::WriteALN()
+  //
+  // Write ALN to the outfile.  Allows the user to specify the
+  // number of columns for the output.  
+  /////////////////////////////////////////////////////////////////
+
+  void WriteALN (ostream &outfile, int numColumns = 60){
+    if (!sequences) return;
+
+    outfile << "PROBCONS version " << VERSION << " multiple sequence alignment" << endl;
+
+    int longestComment = 0;
+    SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
+    SafeVector<int> lengths (GetNumSequences());
+    for (int i = 0; i < GetNumSequences(); i++){
+      ptrs[i] = GetSequence (i)->GetDataPtr();
+      lengths[i] = GetSequence (i)->GetLength();
+      longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
+    }
+    longestComment += 4;
+
+    int writtenChars = 0;    
+    bool allDone = false;
+
+    while (!allDone){
+      outfile << endl;
+      allDone = true;
+
+      // loop through all sequences and write them out
+      for (int i = 0; i < GetNumSequences(); i++){
+
+	if (writtenChars < lengths[i]){
+	  outfile << GetSequence(i)->GetName();
+	  for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
+	    outfile << ' ';
+
+	  for (int j = 0; j < numColumns; j++){
+	    if (writtenChars + j < lengths[i])
+	      outfile << ptrs[i][writtenChars + j + 1];
+	    else
+	      break;
+	  }
+	  
+	  outfile << endl;
+	  
+	  if (writtenChars + numColumns < lengths[i]) allDone = false;
+	}
+      }
+
+      // write annotation line
+      for (int j = 0; j < longestComment; j++)
+	outfile << ' ';
+
+      for (int j = 0; j < numColumns; j++){
+	SafeVector<char> column;
+
+	for (int i = 0; i < GetNumSequences(); i++)
+	  if (writtenChars + j < lengths[i])
+	    column.push_back (ptrs[i][writtenChars + j + 1]);
+	
+	if (column.size() > 0)
+	  outfile << GetAnnotationChar (column);	
+      }
+
+      outfile << endl;
+      writtenChars += numColumns;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::GetSequence()
+  //
+  // Retrieve a sequence from the MultiSequence object.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence* GetSequence (int i){
+    assert (sequences);
+    assert (0 <= i && i < (int) sequences->size());
+
+    return (*sequences)[i];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::GetSequence()
+  //
+  // Retrieve a sequence from the MultiSequence object
+  // (const version).
+  /////////////////////////////////////////////////////////////////
+
+  const Sequence* GetSequence (int i) const {
+    assert (sequences);
+    assert (0 <= i && i < (int) sequences->size());
+
+    return (*sequences)[i];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::GetNumSequences()
+  //
+  // Returns the number of sequences in the MultiSequence.
+  /////////////////////////////////////////////////////////////////
+
+  int GetNumSequences () const {
+    if (!sequences) return 0;
+    return (int) sequences->size();
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::SortByHeader()
+  //
+  // Organizes the sequences according to their sequence headers
+  // in ascending order.
+  /////////////////////////////////////////////////////////////////
+
+  void SortByHeader () {
+    assert (sequences);
+
+    // a quick and easy O(n^2) sort
+    for (int i = 0; i < (int) sequences->size()-1; i++){
+      for (int j = i+1; j < (int) sequences->size(); j++){
+        if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader())
+          swap ((*sequences)[i], (*sequences)[j]);
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::SortByLabel()
+  //
+  // Organizes the sequences according to their sequence labels
+  // in ascending order.
+  /////////////////////////////////////////////////////////////////
+
+  void SortByLabel () {
+    assert (sequences);
+
+    // a quick and easy O(n^2) sort
+    for (int i = 0; i < (int) sequences->size()-1; i++){
+      for (int j = i+1; j < (int) sequences->size(); j++){
+        if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel())
+          swap ((*sequences)[i], (*sequences)[j]);
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::SaveOrdering()
+  //
+  // Relabels sequences so as to preserve the current ordering.
+  /////////////////////////////////////////////////////////////////
+
+  void SaveOrdering () {
+    assert (sequences);
+    
+    for (int i = 0; i < (int) sequences->size(); i++)
+      (*sequences)[i]->SetSortLabel (i);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // MultiSequence::Project()
+  //
+  // Given a set of indices, extract all sequences from the current
+  // MultiSequence object whose index is included in the set.
+  // Then, project the multiple alignments down to the desired
+  // subset, and return the projection as a new MultiSequence
+  // object.
+  /////////////////////////////////////////////////////////////////
+
+  MultiSequence *Project (const set<int> &indices){
+    SafeVector<SafeVector<char>::iterator> oldPtrs (indices.size());
+    SafeVector<SafeVector<char> *> newPtrs (indices.size());
+
+    assert (indices.size() != 0);
+
+    // grab old data
+    int i = 0;
+    for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
+      oldPtrs[i++] = GetSequence (*iter)->GetDataPtr();
+    }
+
+    // compute new length
+    int oldLength = GetSequence (*indices.begin())->GetLength();
+    int newLength = 0;
+    for (i = 1; i <= oldLength; i++){
+
+      // check to see if there is a gap in every sequence of the set
+      bool found = false;
+      for (int j = 0; !found && j < (int) indices.size(); j++)
+        found = (oldPtrs[j][i] != '-');
+
+      // if not, then this column counts towards the sequence length
+      if (found) newLength++;
+    }
+
+    // build new alignments
+    for (i = 0; i < (int) indices.size(); i++){
+      newPtrs[i] = new SafeVector<char>(); assert (newPtrs[i]);
+      newPtrs[i]->push_back ('@');
+    }
+
+    // add all needed columns
+    for (i = 1; i <= oldLength; i++){
+
+      // make sure column is not gapped in all sequences in the set
+      bool found = false;
+      for (int j = 0; !found && j < (int) indices.size(); j++)
+        found = (oldPtrs[j][i] != '-');
+
+      // if not, then add it
+      if (found){
+        for (int j = 0; j < (int) indices.size(); j++)
+          newPtrs[j]->push_back (oldPtrs[j][i]);
+      }
+    }
+
+    // wrap sequences in MultiSequence object
+    MultiSequence *ret = new MultiSequence();
+    i = 0;
+    for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
+      ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength,
+                                      GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel()));
+    }
+
+    return ret;
+  }
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/ProbabilisticModel.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/ProbabilisticModel.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/ProbabilisticModel.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,1089 @@
+/////////////////////////////////////////////////////////////////
+// ProbabilisticModel.h
+//
+// Routines for (1) posterior probability computations
+//              (2) chained anchoring
+//              (3) maximum weight trace alignment
+/////////////////////////////////////////////////////////////////
+
+#ifndef PROBABILISTICMODEL_H
+#define PROBABILISTICMODEL_H
+
+#include <list>
+#include <cmath>
+#include <cstdio>
+#include "SafeVector.h"
+#include "ScoreType.h"
+#include "SparseMatrix.h"
+#include "MultiSequence.h"
+
+using namespace std;
+
+const int NumMatchStates = 1;                                    // note that in this version the number
+                                                                 // of match states is fixed at 1...will
+                                                                 // change in future versions
+const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
+
+/////////////////////////////////////////////////////////////////
+// ProbabilisticModel
+//
+// Class for storing the parameters of a probabilistic model and
+// performing different computations based on those parameters.
+// In particular, this class handles the computation of
+// posterior probabilities that may be used in alignment.
+/////////////////////////////////////////////////////////////////
+
+class ProbabilisticModel {
+
+  float initialDistribution[NumMatrixTypes];               // holds the initial probabilities for each state
+  float transProb[NumMatrixTypes][NumMatrixTypes];         // holds all state-to-state transition probabilities
+  float matchProb[256][256];                               // emission probabilities for match states
+  float insProb[256][NumMatrixTypes];                      // emission probabilities for insert states
+
+ public:
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ProbabilisticModel()
+  //
+  // Constructor.  Builds a new probabilistic model using the
+  // given parameters.
+  /////////////////////////////////////////////////////////////////
+
+  ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
+                      const VVF &emitPairs, const VF &emitSingle){
+
+    // build transition matrix
+    VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
+    transMat[0][0] = 1;
+    for (int i = 0; i < NumInsertStates; i++){
+      transMat[0][2*i+1] = gapOpen[2*i];
+      transMat[0][2*i+2] = gapOpen[2*i+1];
+      transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]);
+      assert (transMat[0][0] > 0);
+      transMat[2*i+1][2*i+1] = gapExtend[2*i];
+      transMat[2*i+2][2*i+2] = gapExtend[2*i+1];
+      transMat[2*i+1][2*i+2] = 0;
+      transMat[2*i+2][2*i+1] = 0;
+      transMat[2*i+1][0] = 1 - gapExtend[2*i];
+      transMat[2*i+2][0] = 1 - gapExtend[2*i+1];
+    }
+
+    // create initial and transition probability matrices
+    for (int i = 0; i < NumMatrixTypes; i++){
+      initialDistribution[i] = LOG (initDistribMat[i]);
+      for (int j = 0; j < NumMatrixTypes; j++)
+        transProb[i][j] = LOG (transMat[i][j]);
+    }
+
+    // create insertion and match probability matrices
+    for (int i = 0; i < 256; i++){
+      for (int j = 0; j < NumMatrixTypes; j++)
+        insProb[i][j] = LOG (emitSingle[i]);
+      for (int j = 0; j < 256; j++)
+        matchProb[i][j] = LOG (emitPairs[i][j]);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeForwardMatrix()
+  //
+  // Computes a set of forward probability matrices for aligning
+  // seq1 and seq2.
+  //
+  // For efficiency reasons, a single-dimensional floating-point
+  // array is used here, with the following indexing scheme:
+  //
+  //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+  //    refers to the probability of aligning through j characters
+  //    of the first sequence, k characters of the second sequence,
+  //    and ending in state i.
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    // retrieve the points to the beginning of each sequence
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    // create matrix
+    VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (forwardPtr);
+    VF &forward = *forwardPtr;
+
+    // initialization condition
+    forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = 
+      initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
+   
+    for (int k = 0; k < NumInsertStates; k++){
+      forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = 
+	initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
+      forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = 
+	initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; 
+    }
+    
+    // remember offset for each index combination
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // compute forward scores
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+
+	if (i > 1 || j > 1){
+	  if (i > 0 && j > 0){
+	    forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
+	    for (int k = 1; k < NumMatrixTypes; k++)
+	      LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
+	    forward[0 + ij] += matchProb[c1][c2];
+	  }
+	  if (i > 0){
+	    for (int k = 0; k < NumInsertStates; k++)
+	      forward[2*k+1 + ij] = insProb[c1][k] +
+		LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
+			 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
+	  }
+	  if (j > 0){
+	    for (int k = 0; k < NumInsertStates; k++)
+	      forward[2*k+2 + ij] = insProb[c2][k] +
+		LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
+			 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
+	  }
+	}
+
+        ij += NumMatrixTypes;
+        i1j += NumMatrixTypes;
+        ij1 += NumMatrixTypes;
+        i1j1 += NumMatrixTypes;
+      }
+    }
+
+    return forwardPtr;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeBackwardMatrix()
+  //
+  // Computes a set of backward probability matrices for aligning
+  // seq1 and seq2.
+  //
+  // For efficiency reasons, a single-dimensional floating-point
+  // array is used here, with the following indexing scheme:
+  //
+  //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+  //    refers to the probability of starting in state i and
+  //    aligning from character j+1 to the end of the first
+  //    sequence and from character k+1 to the end of the second
+  //    sequence.
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    // create matrix
+    VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (backwardPtr);
+    VF &backward = *backwardPtr;
+
+    // initialization condition
+    for (int k = 0; k < NumMatrixTypes; k++)
+      backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
+
+    // remember offset for each index combination
+    int ij = (seq1Length+1) * (seq2Length+1) - 1;
+    int i1j = ij + seq2Length + 1;
+    int ij1 = ij + 1;
+    int i1j1 = ij + seq2Length + 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // compute backward scores
+    for (int i = seq1Length; i >= 0; i--){
+      unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
+      for (int j = seq2Length; j >= 0; j--){
+        unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
+
+        if (i < seq1Length && j < seq2Length){
+          const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
+          for (int k = 0; k < NumMatrixTypes; k++)
+            LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
+        }
+        if (i < seq1Length){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
+            LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
+          }
+        }
+        if (j < seq2Length){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
+            LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
+          }
+        }
+
+        ij -= NumMatrixTypes;
+        i1j -= NumMatrixTypes;
+        ij1 -= NumMatrixTypes;
+        i1j1 -= NumMatrixTypes;
+      }
+    }
+
+    return backwardPtr;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeTotalProbability()
+  //
+  // Computes the total probability of an alignment given
+  // the forward and backward matrices.
+  /////////////////////////////////////////////////////////////////
+
+  float ComputeTotalProbability (int seq1Length, int seq2Length,
+                                 const VF &forward, const VF &backward) const {
+
+    // compute total probability
+    float totalForwardProb = LOG_ZERO;
+    float totalBackwardProb = LOG_ZERO;
+    for (int k = 0; k < NumMatrixTypes; k++){
+      LOG_PLUS_EQUALS (totalForwardProb,
+                       forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + 
+		       backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
+    }
+
+    totalBackwardProb = 
+      forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
+      backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
+
+    for (int k = 0; k < NumInsertStates; k++){
+      LOG_PLUS_EQUALS (totalBackwardProb,
+		       forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
+		       backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
+      LOG_PLUS_EQUALS (totalBackwardProb,
+		       forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
+		       backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
+    }
+
+    //    cerr << totalForwardProb << " " << totalBackwardProb << endl;
+    
+    return (totalForwardProb + totalBackwardProb) / 2;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputePosteriorMatrix()
+  //
+  // Computes the posterior probability matrix based on
+  // the forward and backward matrices.
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
+                              const VF &forward, const VF &backward) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
+                                               forward, backward);
+
+    // compute posterior matrices
+    VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
+    VF &posterior = *posteriorPtr;
+
+    int ij = 0;
+    VF::iterator ptr = posterior.begin();
+
+    for (int i = 0; i <= seq1Length; i++){
+      for (int j = 0; j <= seq2Length; j++){
+        *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
+        ij += NumMatrixTypes;
+      }
+    }
+
+    posterior[0] = 0;
+
+    return posteriorPtr;
+  }
+
+  /*
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeExpectedCounts()
+  //
+  // Computes the expected counts for the various transitions.
+  /////////////////////////////////////////////////////////////////
+
+  VVF *ComputeExpectedCounts () const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    // compute total probability
+    float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
+                                               forward, backward);
+
+    // initialize expected counts
+    VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
+    VVF &counts = *countsPtr;
+
+    // remember offset for each index combination
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // compute expected counts
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+
+        if (i > 0 && j > 0){
+          for (int k = 0; k < NumMatrixTypes; k++)
+            LOG_PLUS_EQUALS (counts[k][0],
+                             forward[k + i1j1] + transProb[k][0] +
+                             matchProb[c1][c2] + backward[0 + ij]);
+        }
+        if (i > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (counts[0][2*k+1],
+                             forward[0 + i1j] + transProb[0][2*k+1] +
+                             insProb[c1][k] + backward[2*k+1 + ij]);
+            LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
+                             forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+                             insProb[c1][k] + backward[2*k+1 + ij]);
+          }
+        }
+        if (j > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (counts[0][2*k+2],
+                             forward[0 + ij1] + transProb[0][2*k+2] +
+                             insProb[c2][k] + backward[2*k+2 + ij]);
+            LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
+                             forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+                             insProb[c2][k] + backward[2*k+2 + ij]);
+          }
+        }
+
+        ij += NumMatrixTypes;
+        i1j += NumMatrixTypes;
+        ij1 += NumMatrixTypes;
+        i1j1 += NumMatrixTypes;
+      }
+    }
+
+    // scale all expected counts appropriately
+    for (int i = 0; i < NumMatrixTypes; i++)
+      for (int j = 0; j < NumMatrixTypes; j++)
+        counts[i][j] -= totalProb;
+
+  }
+  */
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeNewParameters()
+  //
+  // Computes a new parameter set based on the expected counts
+  // given.
+  /////////////////////////////////////////////////////////////////
+
+  void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
+			     const VF &forward, const VF &backward,
+                             VF &initDistribMat, VF &gapOpen,
+                             VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
+    
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    // compute total probability
+    float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
+                                               forward, backward);
+    
+    // initialize expected counts
+    VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
+    VF initCounts (NumMatrixTypes, LOG_ZERO);
+    VVF pairCounts (256, VF (256, LOG_ZERO));
+    VF singleCounts (256, LOG_ZERO);
+    
+    // remember offset for each index combination
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // compute initial distribution posteriors
+    initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
+			     backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
+			     forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + 
+			     backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
+    for (int k = 0; k < NumInsertStates; k++){
+      initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
+				   backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],
+				   forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + 
+				   backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
+      initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
+				   backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],
+				   forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + 
+				   backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
+    }
+
+    // compute expected counts
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
+
+	if (i > 0 && j > 0){
+	  if (enableTrainEmissions && i == 1 && j == 1){
+	    LOG_PLUS_EQUALS (pairCounts[c1][c2],
+			     initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
+	    LOG_PLUS_EQUALS (pairCounts[c2][c1],
+			     initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
+	  }
+
+	  for (int k = 0; k < NumMatrixTypes; k++){
+	    LOG_PLUS_EQUALS (transCounts[k][0],
+			     forward[k + i1j1] + transProb[k][0] +
+			     matchProb[c1][c2] + backward[0 + ij]);
+	    if (enableTrainEmissions && i != 1 || j != 1){
+	      LOG_PLUS_EQUALS (pairCounts[c1][c2],
+			       forward[k + i1j1] + transProb[k][0] +
+			       matchProb[c1][c2] + backward[0 + ij]);
+	      LOG_PLUS_EQUALS (pairCounts[c2][c1],
+			       forward[k + i1j1] + transProb[k][0] +
+			       matchProb[c2][c1] + backward[0 + ij]);
+	    }
+	  }
+	}
+	if (i > 0){
+	  for (int k = 0; k < NumInsertStates; k++){
+	    LOG_PLUS_EQUALS (transCounts[0][2*k+1],
+			     forward[0 + i1j] + transProb[0][2*k+1] +
+			     insProb[c1][k] + backward[2*k+1 + ij]);
+	    LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
+			     forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+			     insProb[c1][k] + backward[2*k+1 + ij]);
+	    if (enableTrainEmissions){
+	      if (i == 1 && j == 0){
+		LOG_PLUS_EQUALS (singleCounts[c1],
+				 initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
+	      }
+	      else {
+		LOG_PLUS_EQUALS (singleCounts[c1],
+				 forward[0 + i1j] + transProb[0][2*k+1] +
+				 insProb[c1][k] + backward[2*k+1 + ij]);
+		LOG_PLUS_EQUALS (singleCounts[c1],
+				 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+				 insProb[c1][k] + backward[2*k+1 + ij]);
+	      }
+	    }
+	  }
+	}
+	if (j > 0){
+	  for (int k = 0; k < NumInsertStates; k++){
+	    LOG_PLUS_EQUALS (transCounts[0][2*k+2],
+			     forward[0 + ij1] + transProb[0][2*k+2] +
+			     insProb[c2][k] + backward[2*k+2 + ij]);
+	    LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
+			     forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+			     insProb[c2][k] + backward[2*k+2 + ij]);
+	    if (enableTrainEmissions){
+	      if (i == 0 && j == 1){
+		LOG_PLUS_EQUALS (singleCounts[c2],
+				 initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
+	      }
+	      else {
+		LOG_PLUS_EQUALS (singleCounts[c2],
+				 forward[0 + ij1] + transProb[0][2*k+2] +
+				 insProb[c2][k] + backward[2*k+2 + ij]);
+		LOG_PLUS_EQUALS (singleCounts[c2],
+				 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+				 insProb[c2][k] + backward[2*k+2 + ij]);
+	      }
+	    }
+	  }
+	}
+      
+        ij += NumMatrixTypes;
+        i1j += NumMatrixTypes;
+        ij1 += NumMatrixTypes;
+        i1j1 += NumMatrixTypes;
+      }
+    }
+
+    // scale all expected counts appropriately
+    for (int i = 0; i < NumMatrixTypes; i++){
+      initCounts[i] -= totalProb;
+      for (int j = 0; j < NumMatrixTypes; j++)
+        transCounts[i][j] -= totalProb;
+    }
+    if (enableTrainEmissions){
+      for (int i = 0; i < 256; i++){
+	for (int j = 0; j < 256; j++)
+	  pairCounts[i][j] -= totalProb;
+	singleCounts[i] -= totalProb;
+      }
+    }
+
+    // compute new initial distribution
+    float totalInitDistribCounts = 0;
+    for (int i = 0; i < NumMatrixTypes; i++)
+      totalInitDistribCounts += exp (initCounts[i]); // should be 2
+    initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
+    for (int k = 0; k < NumInsertStates; k++){
+      float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
+      initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
+    }
+
+    // compute total counts for match state
+    float inMatchStateCounts = 0;
+    for (int i = 0; i < NumMatrixTypes; i++)
+      inMatchStateCounts += exp (transCounts[0][i]);
+    for (int i = 0; i < NumInsertStates; i++){
+
+      // compute total counts for gap state
+      float inGapStateCounts =
+        exp (transCounts[2*i+1][0]) +
+        exp (transCounts[2*i+1][2*i+1]) +
+        exp (transCounts[2*i+2][0]) +
+        exp (transCounts[2*i+2][2*i+2]);
+
+      gapOpen[2*i] = gapOpen[2*i+1] =
+        (exp (transCounts[0][2*i+1]) +
+         exp (transCounts[0][2*i+2])) /
+        (2 * inMatchStateCounts);
+
+      gapExtend[2*i] = gapExtend[2*i+1] =
+        (exp (transCounts[2*i+1][2*i+1]) +
+         exp (transCounts[2*i+2][2*i+2])) /
+        inGapStateCounts;
+    }
+
+    if (enableTrainEmissions){
+      float totalPairCounts = 0;
+      float totalSingleCounts = 0;
+      for (int i = 0; i < 256; i++){
+	for (int j = 0; j <= i; j++)
+	  totalPairCounts += exp (pairCounts[j][i]);
+	totalSingleCounts += exp (singleCounts[i]);
+      }
+      
+      for (int i = 0; i < 256; i++) if (!islower ((char) i)){
+	int li = (int)((unsigned char) tolower ((char) i));
+	for (int j = 0; j <= i; j++) if (!islower ((char) j)){
+	  int lj = (int)((unsigned char) tolower ((char) j));
+	  emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = 
+	    emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
+	}
+	emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
+      }
+    }
+  }
+    
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeAlignment()
+  //
+  // Computes an alignment based on the given posterior matrix.
+  // This is done by finding the maximum summing path (or
+  // maximum weight trace) through the posterior matrix.  The
+  // final alignment is returned as a pair consisting of:
+  //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
+  //        denote insertions in one of the two sequences and
+  //        B's denote that both sequences are present (i.e.
+  //        matches).
+  //    (2) a float indicating the sum achieved
+  /////////////////////////////////////////////////////////////////
+
+  pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length,
+                                                    const VF &posterior) const {
+
+    float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
+    float *oldRow = twoRows;
+    float *newRow = twoRows + seq2Length + 1;
+
+    char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
+    char *tracebackPtr = tracebackMatrix;
+
+    VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+    // initialization
+    for (int i = 0; i <= seq2Length; i++){
+      oldRow[i] = 0;
+      *(tracebackPtr++) = 'L';
+    }
+
+    // fill in matrix
+    for (int i = 1; i <= seq1Length; i++){
+
+      // initialize left column
+      newRow[0] = 0;
+      posteriorPtr++;
+      *(tracebackPtr++) = 'U';
+
+      // fill in rest of row
+      for (int j = 1; j <= seq2Length; j++){
+        ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
+                           'D', 'L', 'U', &newRow[j], tracebackPtr++);
+      }
+
+      // swap rows
+      float *temp = oldRow;
+      oldRow = newRow;
+      newRow = temp;
+    }
+
+    // store best score
+    float total = oldRow[seq2Length];
+    delete [] twoRows;
+
+    // compute traceback
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int r = seq1Length, c = seq2Length;
+    while (r != 0 || c != 0){
+      char ch = tracebackMatrix[r*(seq2Length+1) + c];
+      switch (ch){
+      case 'L': c--; alignment->push_back ('Y'); break;
+      case 'U': r--; alignment->push_back ('X'); break;
+      case 'D': c--; r--; alignment->push_back ('B'); break;
+      default: assert (false);
+      }
+    }
+
+    delete [] tracebackMatrix;
+
+    reverse (alignment->begin(), alignment->end());
+
+    return make_pair(alignment, total);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
+  //
+  // Similar to ComputeAlignment() except with gap penalties.
+  /////////////////////////////////////////////////////////////////
+
+  pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
+                                                                    MultiSequence *align2,
+                                                                    const VF &posterior, int numSeqs1,
+                                                                    int numSeqs2,
+                                                                    float gapOpenPenalty,
+                                                                    float gapContinuePenalty) const {
+    int seq1Length = align1->GetSequence(0)->GetLength();
+    int seq2Length = align2->GetSequence(0)->GetLength();
+    SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
+    SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
+
+    // grab character data
+    for (int i = 0; i < align1->GetNumSequences(); i++)
+      dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
+    for (int i = 0; i < align2->GetNumSequences(); i++)
+      dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
+
+    // the number of active sequences at any given column is defined to be the
+    // number of non-gap characters in that column; the number of gap opens at
+    // any given column is defined to be the number of gap characters in that
+    // column where the previous character in the respective sequence was not
+    // a gap
+    SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
+    SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
+
+    // compute number of active sequences and gap opens for each group
+    for (int i = 0; i < align1->GetNumSequences(); i++){
+      SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
+      numActive1[0] = numGapOpens1[0] = 0;
+      for (int j = 1; j <= seq1Length; j++){
+        if (dataPtr[j] != '-'){
+          numActive1[j]++;
+          numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
+        }
+      }
+    }
+    for (int i = 0; i < align2->GetNumSequences(); i++){
+      SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
+      numActive2[0] = numGapOpens2[0] = 0;
+      for (int j = 1; j <= seq2Length; j++){
+        if (dataPtr[j] != '-'){
+          numActive2[j]++;
+          numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
+        }
+      }
+    }
+
+    VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
+    VF continuingPenalty1 (numSeqs1+1);
+    VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
+    VF continuingPenalty2 (numSeqs2+1);
+
+    // precompute penalties
+    for (int i = 0; i <= numSeqs1; i++)
+      for (int j = 0; j <= numSeqs2; j++)
+        openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
+    for (int i = 0; i <= numSeqs1; i++)
+      continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
+    for (int i = 0; i <= numSeqs2; i++)
+      for (int j = 0; j <= numSeqs1; j++)
+        openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
+    for (int i = 0; i <= numSeqs2; i++)
+      continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
+
+    float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);
+    float *oldRowMatch = twoRows;
+    float *newRowMatch = twoRows + (seq2Length+1);
+    float *oldRowInsertX = twoRows + 2*(seq2Length+1);
+    float *newRowInsertX = twoRows + 3*(seq2Length+1);
+    float *oldRowInsertY = twoRows + 4*(seq2Length+1);
+    float *newRowInsertY = twoRows + 5*(seq2Length+1);
+
+    char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
+    char *tracebackPtr = tracebackMatrix;
+
+    VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+    // initialization
+    for (int i = 0; i <= seq2Length; i++){
+      oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
+      oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
+      *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
+      tracebackPtr += 3;
+    }
+
+    // fill in matrix
+    for (int i = 1; i <= seq1Length; i++){
+
+      // initialize left column
+      newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
+      newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
+      posteriorPtr++;
+      *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
+      tracebackPtr += 3;
+
+      // fill in rest of row
+      for (int j = 1; j <= seq2Length; j++){
+
+        // going to MATCH state
+        ChooseBestOfThree (oldRowMatch[j-1],
+                           oldRowInsertX[j-1],
+                           oldRowInsertY[j-1],
+                           'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
+        newRowMatch[j] += *(posteriorPtr++);
+
+        // going to INSERT X state
+        ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                           oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
+                           oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                           'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
+
+        // going to INSERT Y state
+        ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                           newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                           newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
+                           'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
+      }
+
+      // swap rows
+      float *temp;
+      temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
+      temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
+      temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
+    }
+
+    // store best score
+    float total;
+    char matrix;
+    ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
+                       'M', 'X', 'Y', &total, &matrix);
+
+    delete [] twoRows;
+
+    // compute traceback
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int r = seq1Length, c = seq2Length;
+    while (r != 0 || c != 0){
+
+      int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
+      char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];
+      switch (matrix){
+      case 'Y': c--; alignment->push_back ('Y'); break;
+      case 'X': r--; alignment->push_back ('X'); break;
+      case 'M': c--; r--; alignment->push_back ('B'); break;
+      default: assert (false);
+      }
+      matrix = ch;
+    }
+
+    delete [] tracebackMatrix;
+
+    reverse (alignment->begin(), alignment->end());
+
+    return make_pair(alignment, 1.0f);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeViterbiAlignment()
+  //
+  // Computes the highest probability pairwise alignment using the
+  // probabilistic model.  The final alignment is returned as a
+  //  pair consisting of:
+  //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
+  //        denote insertions in one of the two sequences and
+  //        B's denote that both sequences are present (i.e.
+  //        matches).
+  //    (2) a float containing the log probability of the best
+  //        alignment (not used)
+  /////////////////////////////////////////////////////////////////
+
+  pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
+    
+    assert (seq1);
+    assert (seq2);
+    
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    
+    // retrieve the points to the beginning of each sequence
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+    
+    // create viterbi matrix
+    VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (viterbiPtr);
+    VF &viterbi = *viterbiPtr;
+
+    // create traceback matrix
+    VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1);
+    assert (tracebackPtr);
+    VI &traceback = *tracebackPtr;
+
+    // initialization condition
+    for (int k = 0; k < NumMatrixTypes; k++)
+      viterbi[k] = initialDistribution[k];
+
+    // remember offset for each index combination
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // compute viterbi scores
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+
+        if (i > 0 && j > 0){
+          for (int k = 0; k < NumMatrixTypes; k++){
+	    float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2];
+	    if (viterbi[0 + ij] < newVal){
+	      viterbi[0 + ij] = newVal;
+	      traceback[0 + ij] = k;
+	    }
+	  }
+        }
+        if (i > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+	    float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1];
+	    float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1];
+	    if (valFromMatch >= valFromIns){
+	      viterbi[2*k+1 + ij] = valFromMatch;
+	      traceback[2*k+1 + ij] = 0;
+	    }
+	    else {
+	      viterbi[2*k+1 + ij] = valFromIns;
+	      traceback[2*k+1 + ij] = 2*k+1;
+	    }
+	  }
+	}
+        if (j > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+	    float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2];
+	    float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2];
+	    if (valFromMatch >= valFromIns){
+	      viterbi[2*k+2 + ij] = valFromMatch;
+	      traceback[2*k+2 + ij] = 0;
+	    }
+	    else {
+	      viterbi[2*k+2 + ij] = valFromIns;
+	      traceback[2*k+2 + ij] = 2*k+2;
+	    }
+	  }
+        }
+
+        ij += NumMatrixTypes;
+        i1j += NumMatrixTypes;
+        ij1 += NumMatrixTypes;
+        i1j1 += NumMatrixTypes;
+      }
+    }
+
+    // figure out best terminating cell
+    float bestProb = LOG_ZERO;
+    int state = -1;
+    for (int k = 0; k < NumMatrixTypes; k++){
+      float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k];
+      if (bestProb < thisProb){
+	bestProb = thisProb;
+	state = k;
+      }
+    }
+    assert (state != -1);
+
+    delete viterbiPtr;
+
+    // compute traceback
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int r = seq1Length, c = seq2Length;
+    while (r != 0 || c != 0){
+      int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)];
+      
+      if (state == 0){ c--; r--; alignment->push_back ('B'); }
+      else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
+      else { c--; alignment->push_back ('Y'); }
+      
+      state = newState;
+    }
+
+    delete tracebackPtr;
+
+    reverse (alignment->begin(), alignment->end());
+    
+    return make_pair(alignment, bestProb);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::BuildPosterior()
+  //
+  // Builds a posterior probability matrix needed to align a pair
+  // of alignments.  Mathematically, the returned matrix M is
+  // defined as follows:
+  //    M[i,j] =     sum          sum      f(s,t,i,j)
+  //             s in align1  t in align2
+  // where
+  //                  [  P(s[i'] <--> t[j'])
+  //                  [       if s[i'] is a letter in the ith column of align1 and
+  //                  [          t[j'] it a letter in the jth column of align2
+  //    f(s,t,i,j) =  [
+  //                  [  0    otherwise
+  //
+  /////////////////////////////////////////////////////////////////
+
+  VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,
+                      const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		      float cutoff = 0.0f) const {
+    const int seq1Length = align1->GetSequence(0)->GetLength();
+    const int seq2Length = align2->GetSequence(0)->GetLength();
+
+    VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);
+    VF &posterior = *posteriorPtr;
+    VF::iterator postPtr = posterior.begin();
+
+    // for each s in align1
+    for (int i = 0; i < align1->GetNumSequences(); i++){
+      int first = align1->GetSequence(i)->GetLabel();
+      SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
+
+      // for each t in align2
+      for (int j = 0; j < align2->GetNumSequences(); j++){
+        int second = align2->GetSequence(j)->GetLabel();
+        SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
+
+	if (first < second){
+
+	  // get the associated sparse matrix
+	  SparseMatrix *matrix = sparseMatrices[first][second];
+	  
+	  for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
+	    SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
+	    int base = (*mapping1)[ii] * (seq2Length+1);
+	    int rowSize = matrix->GetRowSize(ii);
+	    
+	    // add in all relevant values
+	    for (int jj = 0; jj < rowSize; jj++)
+	      posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
+	    
+	    // subtract cutoff 
+	    for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
+	      posterior[base + (*mapping2)[jj]] -= cutoff;
+	  }
+
+	} else {
+
+	  // get the associated sparse matrix
+	  SparseMatrix *matrix = sparseMatrices[second][first];
+	  
+	  for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
+	    SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
+	    int base = (*mapping2)[jj];
+	    int rowSize = matrix->GetRowSize(jj);
+	    
+	    // add in all relevant values
+	    for (int ii = 0; ii < rowSize; ii++)
+	      posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;
+	    
+	    // subtract cutoff 
+	    for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
+	      posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;
+	  }
+
+	}
+	
+
+        delete mapping2;
+      }
+
+      delete mapping1;
+    }
+
+    return posteriorPtr;
+  }
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/ProjectPairwise.cc
===================================================================
--- trunk/packages/probcons/branches/upstream/current/ProjectPairwise.cc	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/ProjectPairwise.cc	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,71 @@
+/////////////////////////////////////////////////////////////////
+// ProjectPairwise
+//
+// Program for projecting multiple alignments to all pairwise
+// alignments.
+/////////////////////////////////////////////////////////////////
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <limits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+
+bool compressGaps = true;
+
+/////////////////////////////////////////////////////////////////
+// main()
+//
+// Main program.
+/////////////////////////////////////////////////////////////////
+
+int main (int argc, char **argv){
+
+  // check arguments
+  if (argc < 2){
+    cerr << "Usage: project ALIGNMENT [-nocompressgaps]" << endl;
+    exit (1);
+  }
+
+  for (int i = 2; i < argc; i++){
+    if (strcmp (argv[i], "-nocompressgaps") == 0)
+      compressGaps = false;
+    else {
+      cerr << "Unrecognized option: " << argv[i] << endl;
+      exit (1);
+    }
+  }
+
+  MultiSequence *align = new MultiSequence (string (argv[1])); assert (align);
+
+  int N = align->GetNumSequences();
+  for (int i = 0; i < N; i++){
+    for (int j = i+1; j < N; j++){
+      string name = align->GetSequence(i)->GetHeader() + "-" + align->GetSequence(j)->GetHeader() + ".fasta";
+      ofstream outfile (name.c_str());
+
+      if (compressGaps){
+	set<int> s;
+	s.insert (i); s.insert (j);
+	MultiSequence *proj = align->Project (s);
+	proj->WriteMFA (outfile);
+	delete proj;
+      }
+      else {
+	align->GetSequence(i)->WriteMFA (outfile, 60);
+	align->GetSequence(j)->WriteMFA (outfile, 60);
+      }
+      outfile.close();
+    }
+  }
+
+  delete align;
+}

Added: trunk/packages/probcons/branches/upstream/current/README
===================================================================
--- trunk/packages/probcons/branches/upstream/current/README	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/README	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,107 @@
+
+                          PROBCONS
+                          ~~~~~~~~                          
+
+   Probabilistic consistency-based multiple sequence alignment 
+
+-----------------------------------------------------------------
+                                                             
+PROBCONS is a novel tool for generating  multiple  alignments
+of protein sequences.  Using a combination  of  probabilistic 
+modeling and consistency-based alignment techniques, PROBCONS
+has achieved the highest accuracy of all alignment methods to
+date. 
+
+PROBCONS was developed by Chuong B. Do in collaboration  with 
+Michael Brudno in the research group  of  Serafim  Batzoglou,
+Department of Computer Science, Stanford University.
+
+For more information on the algorithms, please see
+
+     Do, C.B., Brudno, M., and Batzoglou, S. (2004) PROBCONS:
+     Probabilistic Consistency-based  Multiple  Alignment  of
+     Amino Acid Sequences.  12th International Conference  on
+     Intelligent Systems for Molecular Biology.  In press.
+
+and
+
+     Do, C.B., Brudno, M., and Batzoglou, S. (2004) PROBCONS:
+     Probabilistic Consistency-based  Multiple  Alignment  of
+     Amino Acid Sequences.  The 19th National  Conference  on
+     Artificial Intelligence (AAAI-04).  In press.
+
+-----------------------------------------------------------------
+
+PROBCONS has been made  freely  available  as  PUBLIC  DOMAIN
+software and hence is not subject to copyright in the  United
+States.  This system and/or any portion of  the  source  code
+may be used, modified, or redistributed without restrictions.  
+PROBCONS is distributed WITHOUT WARRANTY, express or implied.
+The authors accept NO LEGAL LIABILITY OR  RESPONSIBILITY  for
+loss due to reliance on the program.
+   
+-----------------------------------------------------------------
+
+Version History
+
+1.0, 3/23/2004 (Chuong Do)
+   -- initial release
+
+1.01, 3/25/2004 (Chuong Do)
+   -- fixed error in training procedure
+   -- retrained default parameters for 1 and 2 pairs of insert
+      states
+
+1.02, 4/17/2004 (Chuong Do)
+   -- replaced LOG_ADD and EXP routines
+   -- added support for reading MSF format files
+   -- added two extra utilities for scoring PROBCONS alignments
+      (for benchmarking purposes)
+      -- added the "compare" program for scoring alignments
+         according to a reference alignment with respect to 
+         sum-of-pairs and column scores
+      -- added the "fixref" program for adjusting PREFAB 
+	 alignments to contain all letters of the input 
+	 sequences; basically the main program for PROBCONS
+         "hacked" to get the job done
+
+1.03, 5/3/2004 (Chuong Do)
+   -- added option to do all-pairs pairwise alignments instead
+      of constructing a full multiple alignment
+   -- added support for reading DIALIGN style files
+   -- enabled support for using BAliBASE annotations for scoring
+      BAliBASE alignments
+   -- several minor bug fixes thanks to Bob Edgar
+   -- added "project" program to project multiple alignment to
+      pairwise alignments
+
+1.04, 5/9/2004 (Chuong Do)
+   -- switched over to default of one-insert state pair
+   -- retrained default parameters
+   -- added annotation scores
+   -- small changes to model topology to make end gaps symmetrical
+   -- added makegnuplot utility to plot annotation scores
+
+1.05, 5/26/2004 (Chuong Do)
+   -- added cutoff filtering for posterior scores
+   -- made small corrections to recurrences for computing alignments
+   -- added CLUSTALW output support
+
+1.06, 7/13/2004 (Chuong Do)
+   -- ProbCons is now PUBLIC DOMAIN software.
+
+1.07, 8/30/2004 (Chuong Do)
+   -- Fixed CLUSTALW output for sequence names (thanks to John Calley
+      for pointing this out)
+
+1.08, 8/31/2004 (Chuong Do)
+   -- Added option for alignment order output (-a).
+
+1.09, 9/1/2004 (Chuong Do)
+   -- PROBCONS now allows input files with existing gaps -- these are
+      automatically stripped before alignment.
+
+1.10, 3/16/2005 (Chuong Do)
+   -- Reduced memory consumption by
+      -- not storing posterior matrix transposes
+      -- restricting consistency-derived posterior matrices to original posterior matrix

Added: trunk/packages/probcons/branches/upstream/current/SafeVector.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/SafeVector.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/SafeVector.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,56 @@
+/////////////////////////////////////////////////////////////////
+// SafeVector.h
+//
+// STL vector with array bounds checking.  To enable bounds
+// checking, #define ENABLE_CHECKS.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SAFEVECTOR_H
+#define SAFEVECTOR_H
+
+#include <cassert>
+#include <vector>
+
+/////////////////////////////////////////////////////////////////
+// SafeVector
+//
+// Class derived from the STL std::vector for bounds checking.
+/////////////////////////////////////////////////////////////////
+
+template<class TYPE>
+class SafeVector : public std::vector<TYPE>{
+ public:
+
+  // miscellaneous constructors
+  SafeVector() : std::vector<TYPE>() {}
+  SafeVector (size_t size) : std::vector<TYPE>(size) {}
+  SafeVector (size_t size, const TYPE &value) : std::vector<TYPE>(size, value) {}
+  SafeVector (const SafeVector &source) : std::vector<TYPE>(source) {}
+
+#ifdef ENABLE_CHECKS
+
+  // [] array bounds checking
+  TYPE &operator[](int index){
+    assert (index >= 0 && index < (int) size());
+    return std::vector<TYPE>::operator[] ((size_t) index);
+  }
+
+  // [] const array bounds checking
+  const TYPE &operator[] (int index) const {
+    assert (index >= 0 && index < (int) size());
+    return std::vector<TYPE>::operator[] ((size_t) index) ;
+  }
+
+#endif
+
+};
+
+// some commonly used vector types
+typedef SafeVector<int> VI;
+typedef SafeVector<VI> VVI;
+typedef SafeVector<VVI> VVVI;
+typedef SafeVector<float> VF;
+typedef SafeVector<VF> VVF;
+typedef SafeVector<VVF> VVVF;
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/ScoreType.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/ScoreType.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/ScoreType.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,340 @@
+/////////////////////////////////////////////////////////////////
+// ScoreType.h
+//
+// Routines for doing math operations in PROBCONS.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SCORETYPE_H
+#define SCORETYPE_H
+
+#include <cmath>
+#include <algorithm>
+#include <cfloat>
+
+typedef float ScoreType;
+
+const float LOG_ZERO = -2e20;
+const float LOG_ONE = 0.0;
+
+/////////////////////////////////////////////////////////////////
+// LOG()
+//
+// Compute the logarithm of x.
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOG (ScoreType x){
+  return log (x);
+}
+
+/////////////////////////////////////////////////////////////////
+// EXP()
+//
+// Computes exp(x).
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType EXP (ScoreType x){
+  //return exp(x);
+  if (x > -2){
+    if (x > -0.5){
+      if (x > 0)
+	return exp(x);
+      return (((0.03254409303190190000*x + 0.16280432765779600000)*x + 0.49929760485974900000)*x + 0.99995149601363700000)*x + 0.99999925508501600000;
+    }
+    if (x > -1)
+      return (((0.01973899026052090000*x + 0.13822379685007000000)*x + 0.48056651562365000000)*x + 0.99326940370383500000)*x + 0.99906756856399500000;
+    return (((0.00940528203591384000*x + 0.09414963667859410000)*x + 0.40825793595877300000)*x + 0.93933625499130400000)*x + 0.98369508190545300000;
+  }
+  if (x > -8){
+    if (x > -4)
+      return (((0.00217245711583303000*x + 0.03484829428350620000)*x + 0.22118199801337800000)*x + 0.67049462206469500000)*x + 0.83556950223398500000;
+    return (((0.00012398771025456900*x + 0.00349155785951272000)*x + 0.03727721426017900000)*x + 0.17974997741536900000)*x + 0.33249299994217400000;
+  }
+  if (x > -16)
+    return (((0.00000051741713416603*x + 0.00002721456879608080)*x + 0.00053418601865636800)*x + 0.00464101989351936000)*x + 0.01507447981459420000;
+  return 0;
+}
+
+/*
+/////////////////////////////////////////////////////////////////
+// LOOKUP()
+//
+// Computes log (exp (x) + 1), for 0 <= x <= 7.5.
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOOKUP (ScoreType x){
+  //return log (exp(x) + 1);
+  if (x < 2){
+    if (x < 0.5){
+      if (x < 0)
+	return log (exp(x) + 1);
+      return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000;
+    }
+    if (x < 1)
+      return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000;
+    return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000;
+  }
+  if (x < 8){
+    if (x < 4)
+      return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000;
+    return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000;
+  }
+  if (x < 16)
+    return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000;
+  return x;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP_SLOW()
+//
+// Computes log (exp (x) + 1).
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOOKUP_SLOW (ScoreType x){
+  return log (exp (x) + 1);
+}
+
+/////////////////////////////////////////////////////////////////
+// MAX()
+//
+// Compute max of three numbers
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){
+  if (x >= y){
+    if (x >= z)
+      return x;
+    return z;
+  }
+  if (y >= z)
+    return y;
+  return z;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){
+  if (x < y)
+    x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+  else
+    x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS_SLOW()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){
+  if (x < y)
+    x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x;
+  else
+    x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add two log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOG_ADD (ScoreType x, ScoreType y){
+  if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+  return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+}
+*/
+
+/*
+/////////////////////////////////////////////////////////////////
+// LOG()
+//
+// Compute the logarithm of x.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG (float x){
+  return log (x);
+}
+
+/////////////////////////////////////////////////////////////////
+// EXP()
+//
+// Computes exp(x), fr -4.6 <= x <= 0.
+/////////////////////////////////////////////////////////////////
+
+inline float EXP (float x){
+  assert (x <= 0.00f);
+  if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f;
+  return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f;
+  //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f;
+}
+*/
+
+const float EXP_UNDERFLOW_THRESHOLD = -4.6;
+const float LOG_UNDERFLOW_THRESHOLD = 7.5;
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP()
+//
+// Computes log (exp (x) + 1), for 0 <= x <= 7.5.
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP (float x){
+  assert (x >= 0.00f);
+  assert (x <= LOG_UNDERFLOW_THRESHOLD);
+  //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f;
+  if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f;
+  if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f;
+  if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f;
+  assert (x <= LOG_UNDERFLOW_THRESHOLD);
+  return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f;
+
+  //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP_SLOW()
+//
+// Computes log (exp (x) + 1).
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP_SLOW (float x){
+  return log (exp (x) + 1);
+}
+
+/////////////////////////////////////////////////////////////////
+// MAX()
+//
+// Compute max of three numbers
+/////////////////////////////////////////////////////////////////
+
+inline float MAX (float x, float y, float z){
+  if (x >= y){
+    if (x >= z)
+      return x;
+    return z;
+  }
+  if (y >= z)
+    return y;
+  return z;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS (float &x, float y){
+  if (x < y)
+    x = (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOOKUP(y-x) + x;
+  else
+    x = (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : LOOKUP(x-y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS_SLOW()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS_SLOW (float &x, float y){
+  if (x < y)
+    x = (x == LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x;
+  else
+    x = (y == LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add two log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x, float y){
+  if (x < y) return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOOKUP(y-x) + x;
+  return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : LOOKUP(x-y) + y;
+}
+
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add three log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x1, float x2, float x3){
+  return LOG_ADD (x1, LOG_ADD (x2, x3));
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add four log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x1, float x2, float x3, float x4){
+  return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, x4)));
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add five log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5){
+  return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, x5))));
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add siz log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5, float x6){
+  return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, LOG_ADD (x5, x6)))));
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add seven log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5, float x6, float x7){
+  return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, LOG_ADD (x5, LOG_ADD (x6, x7))))));
+}
+
+/////////////////////////////////////////////////////////////////
+// ChooseBestOfThree()
+//
+// Store the largest of three values x1, x2, and x3 in *x.  Also
+// if xi is the largest value, then store bi in *b.
+/////////////////////////////////////////////////////////////////
+
+inline void ChooseBestOfThree (float x1, float x2, float x3, char b1, char b2, char b3, float *x, char *b){
+  if (x1 >= x2){
+    if (x1 >= x3){
+      *x = x1;
+      *b = b1;
+      return;
+    }
+    *x = x3;
+    *b = b3;
+    return;
+  }
+  if (x2 >= x3){
+    *x = x2;
+    *b = b2;
+    return;
+  }
+  *x = x3;
+  *b = b3;
+}
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/Sequence.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/Sequence.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/Sequence.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,416 @@
+/////////////////////////////////////////////////////////////////
+// Sequence.h
+//
+// Class for reading/manipulating single sequence character data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SEQUENCE_H
+#define SEQUENCE_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cctype>
+#include <cstdlib>
+#include "SafeVector.h"
+#include "FileBuffer.h"
+
+/////////////////////////////////////////////////////////////////
+// Sequence
+//
+// Class for storing sequence information.
+/////////////////////////////////////////////////////////////////
+
+class Sequence {
+
+  bool isValid;                // a boolean indicating whether the sequence data is valid or not
+  string header;               // string containing the comment line of the FASTA file
+  SafeVector<char> *data;      // pointer to character data
+  int length;                  // length of the sequence
+  int sequenceLabel;           // integer sequence label, typically to indicate the ordering of sequences
+                               //   in a Multi-FASTA file
+  int inputLabel;              // position of sequence in original input
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Sequence()
+  //
+  // Default constructor.  Does nothing.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {}
+
+ public:
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Sequence()
+  //
+  // Constructor.  Reads the sequence from a FileBuffer.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) {
+
+    // read until the first non-blank line
+    while (!infile.eof()){
+      infile.GetLine (header);
+      if (header.length() != 0) break;
+    }
+
+    // check to make sure that it is a correct header line
+    if (header[0] == '>'){
+
+      // if so, remove the leading ">"
+      header = header.substr (1);
+
+      // remove any leading or trailing white space in the header comment
+      while (header.length() > 0 && isspace (header[0])) header = header.substr (1);
+      while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1);
+
+      // get ready to read the data[] array; note that data[0] is always '@'
+      char ch;
+      data = new SafeVector<char>; assert (data);
+      data->push_back ('@');
+
+      // get a character from the file
+      while (infile.Get(ch)){
+
+        // if we've reached a new comment line, put the character back and stop
+        if (ch == '>'){ infile.UnGet(); break; }
+
+        // skip whitespace
+        if (isspace (ch)) continue;
+
+        // substitute gap character
+        if (ch == '.') ch = '-';
+	if (stripGaps && ch == '-') continue;
+
+        // check for known characters
+        if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
+          cerr << "ERROR: Unknown character encountered: " << ch << endl;
+          exit (1);
+        }
+
+        // everything's ok so far, so just store this character.
+        data->push_back(ch);
+        ++length;
+      }
+
+      // sequence must contain data in order to be valid
+      isValid = length > 0;
+      if (!isValid){
+        delete data;
+        data = NULL;
+      }
+    }
+  }
+
+  
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Sequence()
+  //
+  // Constructor.  Builds a sequence from existing data.  Note
+  // that the data must use one-based indexing where data[0] should
+  // be set to '@'.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence (SafeVector<char> *data, string header, int length, int sequenceLabel, int inputLabel) :
+    isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) {
+      assert (data);
+      assert ((*data)[0] == '@');
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Sequence()
+  //
+  // Destructor.  Release allocated memory.
+  /////////////////////////////////////////////////////////////////
+
+  ~Sequence (){
+    if (data){
+      assert (isValid);
+      delete data;
+      data = NULL;
+      isValid = false;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetHeader()
+  //
+  // Return the string comment associated with this sequence.
+  /////////////////////////////////////////////////////////////////
+
+  string GetHeader () const {
+    return header;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetName()
+  //
+  // Return the first word of the string comment associated with this sequence.
+  /////////////////////////////////////////////////////////////////
+
+  string GetName () const {
+    char name[1024];
+    sscanf (header.c_str(), "%s", name);
+    return string(name);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetDataPtr()
+  //
+  // Return the iterator to data associated with this sequence.
+  /////////////////////////////////////////////////////////////////
+
+  SafeVector<char>::iterator GetDataPtr(){
+    assert (isValid);
+    assert (data);
+    return data->begin();
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetPosition()
+  //
+  // Return the character at position i.  Recall that the character
+  // data is stored with one-based indexing.
+  /////////////////////////////////////////////////////////////////
+
+  char GetPosition (int i) const {
+    assert (isValid);
+    assert (data);
+    assert (i >= 1 && i <= length);
+    return (*data)[i];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::SetLabel()
+  //
+  // Sets the sequence label to i.
+  /////////////////////////////////////////////////////////////////
+
+  void SetLabel (int i){
+    assert (isValid);
+    sequenceLabel = i;
+    inputLabel = i;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::SetSortLabel()
+  //
+  // Sets the sequence sorting label to i.
+  /////////////////////////////////////////////////////////////////
+
+  void SetSortLabel (int i){
+    assert (isValid);
+    sequenceLabel = i;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetLabel()
+  //
+  // Retrieves the input label.
+  /////////////////////////////////////////////////////////////////
+
+  int GetLabel () const {
+    assert (isValid);
+    return inputLabel;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetSortLabel()
+  //
+  // Retrieves the sorting label.
+  /////////////////////////////////////////////////////////////////
+
+  int GetSortLabel () const {
+    assert (isValid);
+    return sequenceLabel;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Fail()
+  //
+  // Checks to see if the sequence successfully loaded.
+  /////////////////////////////////////////////////////////////////
+
+  bool Fail () const {
+    return !isValid;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Length()
+  //
+  // Returns the length of the sequence.
+  /////////////////////////////////////////////////////////////////
+
+  int GetLength () const {
+    assert (isValid);
+    assert (data);
+    return length;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::WriteMFA()
+  //
+  // Writes the sequence to outfile in MFA format.  Uses numColumns
+  // columns per line.  If useIndex is set to false, then the
+  // header is printed as normal, but if useIndex is true, then
+  // ">S###" is printed where ### represents the sequence label.
+  /////////////////////////////////////////////////////////////////
+
+  void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const {
+    assert (isValid);
+    assert (data);
+    assert (!outfile.fail());
+
+    // print out heading
+    if (useIndex)
+      outfile << ">S" << GetLabel() << endl;
+    else
+      outfile << ">" << header << endl;
+
+    // print out character data
+    int ct = 1;
+    for (; ct <= length; ct++){
+      outfile << (*data)[ct];
+      if (ct % numColumns == 0) outfile << endl;
+    }
+    if ((ct-1) % numColumns != 0) outfile << endl;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Clone()
+  //
+  // Returns a new deep copy of the seqeuence.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence *Clone () const {
+    Sequence *ret = new Sequence();
+    assert (ret);
+
+    ret->isValid = isValid;
+    ret->header = header;
+    ret->data = new SafeVector<char>; assert (ret->data);
+    *(ret->data) = *data;
+    ret->length = length;
+    ret->sequenceLabel = sequenceLabel;
+    ret->inputLabel = inputLabel;
+
+    return ret;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetRange()
+  //
+  // Returns a new sequence object consisting of a range of
+  // characters from the current seuquence.
+  /////////////////////////////////////////////////////////////////
+
+  Sequence *GetRange (int start, int end) const {
+    Sequence *ret = new Sequence();
+    assert (ret);
+
+    assert (start >= 1 && start <= length);
+    assert (end >= 1 && end <= length);
+    assert (start <= end);
+
+    ret->isValid = isValid;
+    ret->header = header;
+    ret->data = new SafeVector<char>; assert (ret->data);
+    ret->data->push_back ('@');
+    for (int i = start; i <= end; i++)
+      ret->data->push_back ((*data)[i]);
+    ret->length = end - start + 1;
+    ret->sequenceLabel = sequenceLabel;
+    ret->inputLabel = inputLabel;
+
+    return ret;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::AddGaps()
+  //
+  // Given an SafeVector<char> containing the skeleton for an
+  // alignment and the identity of the current character, this
+  // routine will create a new sequence with all necesssary gaps added.
+  // For instance,
+  //    alignment = "XXXBBYYYBBYYXX"
+  //    id = 'X'
+  // will perform the transformation
+  //    "ATGCAGTCA" --> "ATGCC---GT--CA"
+  //                    (XXXBBYYYBBYYXX)
+  /////////////////////////////////////////////////////////////////
+
+  Sequence *AddGaps (SafeVector<char> *alignment, char id){
+    Sequence *ret = new Sequence();
+    assert (ret);
+
+    ret->isValid = isValid;
+    ret->header = header;
+    ret->data = new SafeVector<char>; assert (ret->data);
+    ret->length = (int) alignment->size();
+    ret->sequenceLabel = sequenceLabel;
+    ret->inputLabel = inputLabel;
+    ret->data->push_back ('@');
+
+    SafeVector<char>::iterator dataIter = data->begin() + 1;
+    for (SafeVector<char>::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){
+      if (*iter == 'B' || *iter == id){
+        ret->data->push_back (*dataIter);
+        ++dataIter;
+      }
+      else
+        ret->data->push_back ('-');
+    }
+
+    return ret;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetString()
+  //
+  // Returns the sequence as a string with gaps removed.
+  /////////////////////////////////////////////////////////////////
+
+  string GetString (){
+    string s = "";
+    for (int i = 1; i <= length; i++){
+      if ((*data)[i] != '-') s += (*data)[i];
+    }
+    return s;
+  }
+
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::GetMapping()
+  //
+  // Returns a SafeVector<int> containing the indices of every
+  // character in the sequence.  For instance, if the data is
+  // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
+  /////////////////////////////////////////////////////////////////
+
+  SafeVector<int> *GetMapping () const {
+    SafeVector<int> *ret = new SafeVector<int>(1, 0);
+    for (int i = 1; i <= length; i++){
+      if ((*data)[i] != '-') ret->push_back (i);
+    }
+    return ret;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // Sequence::Highlight()
+  //
+  // Changes all positions with score >= cutoff to upper case and
+  // all positions with score < cutoff to lower case.
+  /////////////////////////////////////////////////////////////////
+
+  void Highlight (const SafeVector<float> &scores, const float cutoff){
+    for (int i = 1; i <= length; i++){
+      if (scores[i-1] >= cutoff)
+        (*data)[i] = toupper ((*data)[i]);
+      else
+        (*data)[i] = tolower ((*data)[i]);
+    }
+  }
+};
+
+#endif

Added: trunk/packages/probcons/branches/upstream/current/SparseMatrix.h
===================================================================
--- trunk/packages/probcons/branches/upstream/current/SparseMatrix.h	2006-05-07 10:44:13 UTC (rev 40)
+++ trunk/packages/probcons/branches/upstream/current/SparseMatrix.h	2006-05-07 14:55:27 UTC (rev 41)
@@ -0,0 +1,253 @@
+/////////////////////////////////////////////////////////////////
+// SparseMatrix.h
+//
+// Sparse matrix computations
+/////////////////////////////////////////////////////////////////
+
+#ifndef SPARSEMATRIX_H
+#define SPARSEMATRIX_H
+
+#include <iostream>
+
+using namespace std;
+
+const float POSTERIOR_CUTOFF = 0.01;         // minimum posterior probability
+                                             // value that is maintained in the
+                                             // sparse matrix representation
+
+typedef pair<int,float> PIF;                 // Sparse matrix entry type
+                                             //   first --> column
+                                             //   second --> value
+
+/////////////////////////////////////////////////////////////////
+// SparseMatrix
+//
+// Class for sparse matrix computations
+/////////////////////////////////////////////////////////////////
+
+class SparseMatrix {
+
+  int seq1Length, seq2Length;                     // dimensions of matrix
+  VI rowSize;                                     // rowSize[i] = # of cells in row i
+  SafeVector<PIF> data;                           // data values
+  SafeVector<SafeVector<PIF>::iterator> rowPtrs;  // pointers to the beginning of each row
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::SparseMatrix()
+  //
+  // Private constructor.
+  /////////////////////////////////////////////////////////////////
+
+  SparseMatrix (){}
+
+ public:
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::SparseMatrix()
+  //
+  // Constructor.  Builds a sparse matrix from a posterior matrix.
+  // Note that the expected format for the posterior matrix is as
+  // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row
+  // and 0th column are ignored (they should contain all zeroes).
+  /////////////////////////////////////////////////////////////////
+
+  SparseMatrix (int seq1Length, int seq2Length, const VF &posterior) :
+    seq1Length (seq1Length), seq2Length (seq2Length) {
+
+    int numCells = 0;
+
+    assert (seq1Length > 0);
+    assert (seq2Length > 0);
+
+    // calculate memory required; count the number of cells in the
+    // posterior matrix above the threshold
+    VF::const_iterator postPtr = posterior.begin();
+    for (int i = 0; i <= seq1Length; i++){
+      for (int j = 0; j <= seq2Length; j++){
+        if (*(postPtr++) >= POSTERIOR_CUTOFF){
+          assert (i != 0 && j != 0);
+          numCells++;
+        }
+      }
+    }
+    
+    // allocate memory
+    data.resize(numCells);
+    rowSize.resize (seq1Length + 1); rowSize[0] = -1;
+    rowPtrs.resize (seq1Length + 1); rowPtrs[0] = data.end();
+
+    // build sparse matrix
+    postPtr = posterior.begin() + seq2Length + 1;           // note that we're skipping the first row here
+    SafeVector<PIF>::iterator dataPtr = data.begin();
+    for (int i = 1; i <= seq1Length; i++){
+      postPtr++;                                            // and skipping the first column of each row
+      rowPtrs[i] = dataPtr;
+      for (int j = 1; j <= seq2Length; j++){
+        if (*postPtr >= POSTERIOR_CUTOFF){
+          dataPtr->first = j;
+          dataPtr->second = *postPtr;
+          dataPtr++;
+        }
+        postPtr++;
+      }
+      rowSize[i] = dataPtr - rowPtrs[i];
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetRowPtr()
+  //
+  // Returns the pointer to a particular row in the sparse matrix.
+  /////////////////////////////////////////////////////////////////
+
+  SafeVector<PIF>::iterator GetRowPtr (int row) const {
+    assert (row >= 1 && row <= seq1Length);
+    return rowPtrs[row];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetValue()
+  //
+  // Returns value at a particular row, column.
+  /////////////////////////////////////////////////////////////////
+
+  float GetValue (int row, int col){
+    assert (row >= 1 && row <= seq1Length);
+    assert (col >= 1 && col <= seq2Length);
+    for (int i = 0; i < rowSize[row]; i++){
+      if (rowPtrs[row][i].first == col) return rowPtrs[row][i].second;
+    }
+    return 0;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetRowSize()
+  //
+  // Returns the number of entries in a particular row.
+  /////////////////////////////////////////////////////////////////
+
+  int GetRowSize (int row) const {
+    assert (row >= 1 && row <= seq1Length);
+    return rowSize[row];
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetSeq1Length()
+  //
+  // Returns the first dimension of the matrix.
+  /////////////////////////////////////////////////////////////////
+
+  int GetSeq1Length () const {
+    return seq1Length;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetSeq2Length()
+  //
+  // Returns the second dimension of the matrix.
+  /////////////////////////////////////////////////////////////////
+
+  int GetSeq2Length () const {
+    return seq2Length;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetRowPtr
+  //
+  // Returns the pointer to a particular row in the sparse matrix.
+  /////////////////////////////////////////////////////////////////
+
+  int GetNumCells () const {
+    return data.size();
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::Print()
+  //
+  // Prints out a sparse matrix.
+  /////////////////////////////////////////////////////////////////
+
+  void Print (ostream &outfile) const {
+    outfile << "Sparse Matrix:" << endl;
+    for (int i = 1; i <= seq1Length; i++){
+      outfile << "  " << i << ":";
+      for (int j = 0; j < rowSize[i]; j++){
+        outfile << " (" << rowPtrs[i][j].first << "," << rowPtrs[i][j].second << ")";
+      }
+      outfile << endl;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::ComputeTranspose()
+  //
+  // Returns a new sparse matrix containing the transpose of the
+  // current matrix.
+  /////////////////////////////////////////////////////////////////
+
+  SparseMatrix *ComputeTranspose () const {
+
+    // create a new sparse matrix
+    SparseMatrix *ret = new SparseMatrix();
+    int numCells = data.size();
+
+    ret->seq1Length = seq2Length;
+    ret->seq2Length = seq1Length;
+
+    // allocate memory
+    ret->data.resize (numCells);
+    ret->rowSize.resize (seq2Length + 1); ret->rowSize[0] = -1;
+    ret->rowPtrs.resize (seq2Length + 1); ret->rowPtrs[0] = ret->data.end();
+
+    // compute row sizes
+    for (int i = 1; i <= seq2Length; i++) ret->rowSize[i] = 0;
+    for (int i = 0; i < numCells; i++)
+      ret->rowSize[data[i].first]++;
+
+    // compute row ptrs
+    for (int i = 1; i <= seq2Length; i++){
+      ret->rowPtrs[i] = (i == 1) ? ret->data.begin() : ret->rowPtrs[i-1] + ret->rowSize[i-1];
+    }
+
+    // now fill in data
+    SafeVector<SafeVector<PIF>::iterator> currPtrs = ret->rowPtrs;
+
+    for (int i = 1; i <= seq1Length; i++){
+      SafeVector<PIF>::iterator row = rowPtrs[i];
+      for (int j = 0; j < rowSize[i]; j++){
+        currPtrs[row[j].first]->first = i;
+        currPtrs[row[j].first]->second = row[j].second;
+        currPtrs[row[j].first]++;
+      }
+    }
+
+    return ret;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // SparseMatrix::GetPosterior()
+  //
+  // Return the posterior representation of the sparse matrix.
+  /////////////////////////////////////////////////////////////////
+
+  VF *GetPosterior () const {
+
+    // create a new posterior matrix
+    VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
+    VF &posterior = *posteriorPtr;
+
+    // build the posterior matrix
+    for (int i = 0; i < (seq1Length+1) * (seq2Length+1); i++) posterior[i] = 0;
+    for (int i = 1; i <= seq1Length; i++){
+      VF::iterator postPtr = posterior.begin() + i * (seq2Length+1);
+      for (int j = 0; j < rowSize[i]; j++){
+        postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second;
+      }
+    }
+
+    return posteriorPtr;
+  }
+
+};
+
+#endif