[med-svn] r82 - in trunk/packages: . muscle muscle/branches
muscle/branches/upstream muscle/branches/upstream/current
Charles Plessy
charles-guest at costa.debian.org
Mon Aug 7 00:09:14 UTC 2006
Author: charles-guest
Date: 2006-08-07 00:08:59 +0000 (Mon, 07 Aug 2006)
New Revision: 82
Added:
trunk/packages/muscle/
trunk/packages/muscle/branches/
trunk/packages/muscle/branches/upstream/
trunk/packages/muscle/branches/upstream/current/
trunk/packages/muscle/branches/upstream/current/Makefile
trunk/packages/muscle/branches/upstream/current/aligngivenpath.cpp
trunk/packages/muscle/branches/upstream/current/aligngivenpathsw.cpp
trunk/packages/muscle/branches/upstream/current/aligntwomsas.cpp
trunk/packages/muscle/branches/upstream/current/aligntwoprofs.cpp
trunk/packages/muscle/branches/upstream/current/aln.cpp
trunk/packages/muscle/branches/upstream/current/alpha.cpp
trunk/packages/muscle/branches/upstream/current/alpha.h
trunk/packages/muscle/branches/upstream/current/anchors.cpp
trunk/packages/muscle/branches/upstream/current/bittraceback.cpp
trunk/packages/muscle/branches/upstream/current/blosumla.cpp
trunk/packages/muscle/branches/upstream/current/clust.cpp
trunk/packages/muscle/branches/upstream/current/clust.h
trunk/packages/muscle/branches/upstream/current/cluster.cpp
trunk/packages/muscle/branches/upstream/current/cluster.h
trunk/packages/muscle/branches/upstream/current/clustset.h
trunk/packages/muscle/branches/upstream/current/clustsetdf.h
trunk/packages/muscle/branches/upstream/current/clustsetmsa.h
trunk/packages/muscle/branches/upstream/current/clwwt.cpp
trunk/packages/muscle/branches/upstream/current/color.cpp
trunk/packages/muscle/branches/upstream/current/cons.cpp
trunk/packages/muscle/branches/upstream/current/diaglist.cpp
trunk/packages/muscle/branches/upstream/current/diaglist.h
trunk/packages/muscle/branches/upstream/current/diffobjscore.cpp
trunk/packages/muscle/branches/upstream/current/diffpaths.cpp
trunk/packages/muscle/branches/upstream/current/difftrees.cpp
trunk/packages/muscle/branches/upstream/current/difftreese.cpp
trunk/packages/muscle/branches/upstream/current/distcalc.cpp
trunk/packages/muscle/branches/upstream/current/distcalc.h
trunk/packages/muscle/branches/upstream/current/distfunc.cpp
trunk/packages/muscle/branches/upstream/current/distfunc.h
trunk/packages/muscle/branches/upstream/current/distpwkimura.cpp
trunk/packages/muscle/branches/upstream/current/domuscle.cpp
trunk/packages/muscle/branches/upstream/current/dosp.cpp
trunk/packages/muscle/branches/upstream/current/dpregionlist.h
trunk/packages/muscle/branches/upstream/current/dpreglist.cpp
trunk/packages/muscle/branches/upstream/current/dpreglist.h
trunk/packages/muscle/branches/upstream/current/drawtree.cpp
trunk/packages/muscle/branches/upstream/current/edgelist.cpp
trunk/packages/muscle/branches/upstream/current/edgelist.h
trunk/packages/muscle/branches/upstream/current/enumopts.cpp
trunk/packages/muscle/branches/upstream/current/enumopts.h
trunk/packages/muscle/branches/upstream/current/enums.h
trunk/packages/muscle/branches/upstream/current/enumtostr.cpp
trunk/packages/muscle/branches/upstream/current/estring.cpp
trunk/packages/muscle/branches/upstream/current/estring.h
trunk/packages/muscle/branches/upstream/current/fasta.cpp
trunk/packages/muscle/branches/upstream/current/fasta2.cpp
trunk/packages/muscle/branches/upstream/current/fastclust.cpp
trunk/packages/muscle/branches/upstream/current/fastdist.cpp
trunk/packages/muscle/branches/upstream/current/fastdistjones.cpp
trunk/packages/muscle/branches/upstream/current/fastdistkbit.cpp
trunk/packages/muscle/branches/upstream/current/fastdistkmer.cpp
trunk/packages/muscle/branches/upstream/current/fastdistmafft.cpp
trunk/packages/muscle/branches/upstream/current/fastdistnuc.cpp
trunk/packages/muscle/branches/upstream/current/fastscorepath2.cpp
trunk/packages/muscle/branches/upstream/current/finddiags.cpp
trunk/packages/muscle/branches/upstream/current/finddiagsn.cpp
trunk/packages/muscle/branches/upstream/current/gapscoredimer.h
trunk/packages/muscle/branches/upstream/current/glbalign.cpp
trunk/packages/muscle/branches/upstream/current/glbalign352.cpp
trunk/packages/muscle/branches/upstream/current/glbaligndiag.cpp
trunk/packages/muscle/branches/upstream/current/glbalignle.cpp
trunk/packages/muscle/branches/upstream/current/glbalignsimple.cpp
trunk/packages/muscle/branches/upstream/current/glbalignsp.cpp
trunk/packages/muscle/branches/upstream/current/glbalignspn.cpp
trunk/packages/muscle/branches/upstream/current/glbalignss.cpp
trunk/packages/muscle/branches/upstream/current/glbalndimer.cpp
trunk/packages/muscle/branches/upstream/current/globals.cpp
trunk/packages/muscle/branches/upstream/current/globalslinux.cpp
trunk/packages/muscle/branches/upstream/current/globalswin32.cpp
trunk/packages/muscle/branches/upstream/current/gonnet.cpp
trunk/packages/muscle/branches/upstream/current/gonnet.h
trunk/packages/muscle/branches/upstream/current/gotowt.cpp
trunk/packages/muscle/branches/upstream/current/henikoffweight.cpp
trunk/packages/muscle/branches/upstream/current/henikoffweightpb.cpp
trunk/packages/muscle/branches/upstream/current/html.cpp
trunk/packages/muscle/branches/upstream/current/hydro.cpp
trunk/packages/muscle/branches/upstream/current/intmath.cpp
trunk/packages/muscle/branches/upstream/current/intmath.h
trunk/packages/muscle/branches/upstream/current/local.cpp
trunk/packages/muscle/branches/upstream/current/main.cpp
trunk/packages/muscle/branches/upstream/current/makerootmsa.cpp
trunk/packages/muscle/branches/upstream/current/makerootmsab.cpp
trunk/packages/muscle/branches/upstream/current/mhack.cpp
trunk/packages/muscle/branches/upstream/current/mk
trunk/packages/muscle/branches/upstream/current/mpam200.cpp
trunk/packages/muscle/branches/upstream/current/msa.cpp
trunk/packages/muscle/branches/upstream/current/msa.h
trunk/packages/muscle/branches/upstream/current/msa2.cpp
trunk/packages/muscle/branches/upstream/current/msadist.h
trunk/packages/muscle/branches/upstream/current/msadistkimura.cpp
trunk/packages/muscle/branches/upstream/current/msf.cpp
trunk/packages/muscle/branches/upstream/current/muscle.cpp
trunk/packages/muscle/branches/upstream/current/muscle.h
trunk/packages/muscle/branches/upstream/current/muscleout.cpp
trunk/packages/muscle/branches/upstream/current/nucmx.cpp
trunk/packages/muscle/branches/upstream/current/nwdasimple.cpp
trunk/packages/muscle/branches/upstream/current/nwdasimple2.cpp
trunk/packages/muscle/branches/upstream/current/nwdasmall.cpp
trunk/packages/muscle/branches/upstream/current/nwrec.cpp
trunk/packages/muscle/branches/upstream/current/nwsmall.cpp
trunk/packages/muscle/branches/upstream/current/objscore.cpp
trunk/packages/muscle/branches/upstream/current/objscore.h
trunk/packages/muscle/branches/upstream/current/objscore2.cpp
trunk/packages/muscle/branches/upstream/current/objscoreda.cpp
trunk/packages/muscle/branches/upstream/current/onexception.cpp
trunk/packages/muscle/branches/upstream/current/options.cpp
trunk/packages/muscle/branches/upstream/current/outweights.cpp
trunk/packages/muscle/branches/upstream/current/pam200mafft.cpp
trunk/packages/muscle/branches/upstream/current/params.cpp
trunk/packages/muscle/branches/upstream/current/params.h
trunk/packages/muscle/branches/upstream/current/phy.cpp
trunk/packages/muscle/branches/upstream/current/phy2.cpp
trunk/packages/muscle/branches/upstream/current/phy3.cpp
trunk/packages/muscle/branches/upstream/current/phy4.cpp
trunk/packages/muscle/branches/upstream/current/phyfromclust.cpp
trunk/packages/muscle/branches/upstream/current/phyfromfile.cpp
trunk/packages/muscle/branches/upstream/current/physeq.cpp
trunk/packages/muscle/branches/upstream/current/phytofile.cpp
trunk/packages/muscle/branches/upstream/current/posgap.cpp
trunk/packages/muscle/branches/upstream/current/ppscore.cpp
trunk/packages/muscle/branches/upstream/current/profdb.cpp
trunk/packages/muscle/branches/upstream/current/profile.cpp
trunk/packages/muscle/branches/upstream/current/profile.h
trunk/packages/muscle/branches/upstream/current/profilefrommsa.cpp
trunk/packages/muscle/branches/upstream/current/progalign.cpp
trunk/packages/muscle/branches/upstream/current/progress.cpp
trunk/packages/muscle/branches/upstream/current/progressivealign.cpp
trunk/packages/muscle/branches/upstream/current/pwpath.cpp
trunk/packages/muscle/branches/upstream/current/pwpath.h
trunk/packages/muscle/branches/upstream/current/readmx.cpp
trunk/packages/muscle/branches/upstream/current/realigndiffs.cpp
trunk/packages/muscle/branches/upstream/current/realigndiffse.cpp
trunk/packages/muscle/branches/upstream/current/refine.cpp
trunk/packages/muscle/branches/upstream/current/refinehoriz.cpp
trunk/packages/muscle/branches/upstream/current/refinesubfams.cpp
trunk/packages/muscle/branches/upstream/current/refinetree.cpp
trunk/packages/muscle/branches/upstream/current/refinetreee.cpp
trunk/packages/muscle/branches/upstream/current/refinevert.cpp
trunk/packages/muscle/branches/upstream/current/refinew.cpp
trunk/packages/muscle/branches/upstream/current/savebest.cpp
trunk/packages/muscle/branches/upstream/current/scoregaps.cpp
trunk/packages/muscle/branches/upstream/current/scorehistory.cpp
trunk/packages/muscle/branches/upstream/current/scorehistory.h
trunk/packages/muscle/branches/upstream/current/scorepp.cpp
trunk/packages/muscle/branches/upstream/current/seq.cpp
trunk/packages/muscle/branches/upstream/current/seq.h
trunk/packages/muscle/branches/upstream/current/seqvect.cpp
trunk/packages/muscle/branches/upstream/current/seqvect.h
trunk/packages/muscle/branches/upstream/current/setblosumweights.cpp
trunk/packages/muscle/branches/upstream/current/setgscweights.cpp
trunk/packages/muscle/branches/upstream/current/setnewhandler.cpp
trunk/packages/muscle/branches/upstream/current/spfast.cpp
trunk/packages/muscle/branches/upstream/current/sptest.cpp
trunk/packages/muscle/branches/upstream/current/stabilize.cpp
trunk/packages/muscle/branches/upstream/current/subfam.cpp
trunk/packages/muscle/branches/upstream/current/subfams.cpp
trunk/packages/muscle/branches/upstream/current/sw.cpp
trunk/packages/muscle/branches/upstream/current/termgaps.cpp
trunk/packages/muscle/branches/upstream/current/textfile.cpp
trunk/packages/muscle/branches/upstream/current/textfile.h
trunk/packages/muscle/branches/upstream/current/threewaywt.cpp
trunk/packages/muscle/branches/upstream/current/timing.h
trunk/packages/muscle/branches/upstream/current/traceback.cpp
trunk/packages/muscle/branches/upstream/current/tracebackopt.cpp
trunk/packages/muscle/branches/upstream/current/tracebacksw.cpp
trunk/packages/muscle/branches/upstream/current/tree.h
trunk/packages/muscle/branches/upstream/current/treefrommsa.cpp
trunk/packages/muscle/branches/upstream/current/types.h
trunk/packages/muscle/branches/upstream/current/typetostr.cpp
trunk/packages/muscle/branches/upstream/current/unixio.h
trunk/packages/muscle/branches/upstream/current/upgma2.cpp
trunk/packages/muscle/branches/upstream/current/usage.cpp
trunk/packages/muscle/branches/upstream/current/validateids.cpp
trunk/packages/muscle/branches/upstream/current/vtml2.cpp
trunk/packages/muscle/branches/upstream/current/writescorefile.cpp
trunk/packages/muscle/tags/
Log:
[svn-inject] Installing original source of muscle
Added: trunk/packages/muscle/branches/upstream/current/Makefile
===================================================================
--- trunk/packages/muscle/branches/upstream/current/Makefile 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/Makefile 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,36 @@
+# Porting notes:
+# For Solaris and other platforms where the logf function
+# is missing from the math library, add the following line
+# to the end of muscle.h:
+# #define logf(x) ((float) log(x))
+# Using -static increases the executable size and thus gives a very
+# small increase in start time, but is more portable (the binding
+# to dynamic libraries often breaks when a new library is released).
+# On OSX, using -static gives the error "ld: can't locate file for: -lcrt0.o",
+# this is fixed by deleting "-static" from the LDLIBS line.
+
+# NDEBUG=1 compiles out every assert() in the sources.
+CFLAGS = -O3 -funroll-loops -Winline -DNDEBUG=1
+LDLIBS = -lm -static
+# LDLIBS = -lm
+
+OBJ = .o
+EXE =
+
+RM = rm -f
+CP = cp
+
+GPP = g++
+LD = $(GPP) $(CFLAGS)
+CPP = $(GPP) -c $(CFLAGS)
+
+# "all" and "clean" name actions, not files; keep make from being fooled
+# by a file that happens to carry one of these names.
+.PHONY: all clean
+
+all: muscle
+
+# Every .cpp file in this directory is part of the program.
+CPPSRC = $(sort $(wildcard *.cpp))
+# patsubst rewrites only the trailing .cpp suffix; subst would replace
+# every occurrence of ".cpp" anywhere inside a file name.
+CPPOBJ = $(patsubst %.cpp,%.o,$(CPPSRC))
+
+# Static pattern rule: compile each source file to its object file.
+$(CPPOBJ): %.o: %.cpp
+	$(CPP) $< -o $@
+
+# Link all objects, then strip symbols from the executable.
+muscle: $(CPPOBJ)
+	$(LD) -o muscle $(CPPOBJ) $(LDLIBS)
+	strip muscle
+
+# Remove everything this Makefile builds.
+clean:
+	$(RM) $(CPPOBJ) muscle
Added: trunk/packages/muscle/branches/upstream/current/aligngivenpath.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/aligngivenpath.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/aligngivenpath.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,802 @@
+#include "muscle.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "profile.h"
+
+#define TRACE 0
+
+// Write one profile position to the log: residue group, all-gaps flag,
+// occupancy, the four gap-transition frequencies and every strictly
+// positive entry among the first 20 letter counts.
+static void LogPP(const ProfPos &PP)
+	{
+	Log("ResidueGroup %u\n", PP.m_uResidueGroup);
+	Log("AllGaps %d\n", PP.m_bAllGaps);
+	Log("Occ %.3g\n", PP.m_fOcc);
+	Log("LL=%.3g LG=%.3g GL=%.3g GG=%.3g\n", PP.m_LL, PP.m_LG, PP.m_GL, PP.m_GG);
+
+	Log("Freqs ");
+	for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+		{
+	// Letters with no (positive) count are left out of the listing.
+		if (!(PP.m_fcCounts[uLetter] > 0))
+			continue;
+		Log("%c=%.3g ", LetterToChar(uLetter), PP.m_fcCounts[uLetter]);
+		}
+	Log("\n");
+	}
+
+// Check that position i of profile PA equals position i of profile PB.
+// On the first mismatching field the function calls Quit() with a message
+// naming that field; for the scalar fields both positions are logged first.
+//   PA, PB  profiles to compare (arrays of ProfPos).
+//   i       index of the position to compare in both arrays.
+static void AssertProfPosEq(const ProfPos *PA, const ProfPos *PB, unsigned i)
+ {
+ const ProfPos &PPA = PA[i];
+ const ProfPos &PPB = PB[i];
+// eq(x): exact comparison of member m_<x> using !=.
+// be(x): comparison of member m_<x> through BTEq (presumably a
+// tolerance-based floating-point comparison -- TODO confirm).
+#define eq(x) if (PPA.m_##x != PPB.m_##x) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); }
+#define be(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); }
+ eq(bAllGaps)
+ eq(uResidueGroup)
+
+ be(LL)
+ be(LG)
+ be(GL)
+ be(GG)
+ be(fOcc)
+ be(scoreGapOpen)
+ be(scoreGapClose)
+
+// Per-letter fields: only the first 20 entries are compared.
+ for (unsigned j = 0; j < 20; ++j)
+ {
+// eqj / bej: per-letter variants of eq / be; they report j in the message
+// but do not log the two positions. eqj is currently unused because its
+// only call below is commented out.
+#define eqj(x) if (PPA.m_##x != PPB.m_##x) Quit("AssertProfPosEq j=%u " #x, j);
+#define bej(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) Quit("AssertProfPosEq j=%u " #x, j);
+ bej(fcCounts[j]);
+// eqj(uSortOrder[j]) // may differ due to ties, don't check?
+ bej(AAScores[j])
+#undef eqj
+#undef bej
+ }
+// The helper macros are local to this function; remove them again.
+#undef eq
+#undef be
+ }
+
+// Check that two profiles are equal: the lengths must match (otherwise
+// Quit() is called) and every position is compared with AssertProfPosEq.
+void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB)
+	{
+	if (uLengthB != uLengthA)
+		Quit("AssertProfsEq: lengths differ %u %u", uLengthA, uLengthB);
+
+	unsigned uPos = 0;
+	while (uPos < uLengthB)
+		{
+		AssertProfPosEq(PA, PB, uPos);
+		++uPos;
+		}
+	}
+
+#if DEBUG
+// Debug-only consistency check of the gap-transition frequencies of a
+// profile. For every position the four frequencies must sum to 1, and the
+// "letter in the shared column" totals of neighbouring positions must agree.
+static void ValidateProf(const ProfPos *Prof, unsigned uLength)
+	{
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		const ProfPos &Cur = Prof[uPos];
+
+		const FCOUNT fcTotal = Cur.m_LL + Cur.m_LG + Cur.m_GL + Cur.m_GG;
+		assert(BTEq(fcTotal, 1.0));
+
+	// Letter frequency of the previous column, seen from both sides.
+		if (uPos != 0)
+			{
+			const ProfPos &Prev = Prof[uPos - 1];
+			const FCOUNT fcSeenFromPrev = Prev.m_LL + Prev.m_GL;
+			const FCOUNT fcSeenFromCur = Cur.m_LL + Cur.m_LG;
+			assert(BTEq(fcSeenFromPrev, fcSeenFromCur));
+			}
+
+	// Letter frequency of this column, seen from here and from the next one.
+		if (uPos + 1 < uLength)
+			{
+			const ProfPos &Next = Prof[uPos + 1];
+			const FCOUNT fcSeenFromCur = Cur.m_LL + Cur.m_GL;
+			const FCOUNT fcSeenFromNext = Next.m_LL + Next.m_LG;
+			assert(BTEq(fcSeenFromCur, fcSeenFromNext));
+			}
+		}
+	}
+#else
+// Outside DEBUG builds the check expands to nothing.
+#define ValidateProf(Prof, Length) /* empty */
+#endif
+
+// Derive the dependent fields of Prof[uPos] from its frequencies: sort
+// order, residue group, occupancy, position-specific gap open/close scores
+// and the per-letter scores against the current score matrix.
+//   Prof     profile being updated in place.
+//   uLength  number of positions in Prof.
+//   uPos     position to update.
+static void ScoresFromFreqsPos(ProfPos *Prof, unsigned uLength, unsigned uPos)
+	{
+	ProfPos &Col = Prof[uPos];
+	SortCounts(Col.m_fcCounts, Col.m_uSortOrder);
+	Col.m_uResidueGroup = ResidueGroupFromFCounts(Col.m_fcCounts);
+
+// Occupancy = frequency of a letter in this column.
+	Col.m_fOcc = Col.m_LL + Col.m_GL;
+
+// Gap open in this column = letter in the previous column, gap here,
+// which is exactly the LG class of this position.
+	const FCOUNT fcOpen = Col.m_LG;
+
+// Gap close in this column = gap here, letter in the next column, which
+// is the GL class of the next position. The last position has no
+// successor, so every gap in it (GG + LG) is treated as closing.
+	FCOUNT fcClose;
+	if (uPos + 1 >= uLength)
+		fcClose = Col.m_GG + Col.m_LG;
+	else
+		fcClose = Prof[uPos + 1].m_GL;
+
+	Col.m_scoreGapOpen = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen/2.0);
+	Col.m_scoreGapClose = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen/2.0);
+#if DOUBLE_AFFINE
+	Col.m_scoreGapOpen2 = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen2/2.0);
+	Col.m_scoreGapClose2 = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen2/2.0);
+#endif
+
+// Score of aligning each letter against this column: the
+// frequency-weighted sum over one row of the score matrix.
+	for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+		{
+		SCORE scoreLetter = 0;
+		for (unsigned uOther = 0; uOther < g_AlphaSize; ++uOther)
+			scoreLetter += Col.m_fcCounts[uOther]*(*g_ptrScoreMatrix)[uLetter][uOther];
+		Col.m_AAScores[uLetter] = scoreLetter;
+		}
+	}
+
+// Recompute the derived scores of every position of a profile from its
+// frequencies (see ScoresFromFreqsPos), first position to last.
+void ProfScoresFromFreqs(ProfPos *Prof, unsigned uLength)
+	{
+	unsigned uPos = 0;
+	while (uPos < uLength)
+		{
+		ScoresFromFreqsPos(Prof, uLength, uPos);
+		++uPos;
+		}
+	}
+
+// Append one "delete" column to the combined alignment: the rows of A
+// receive column uColIndexA of msaA, the rows of B (stored after the rows
+// of A) receive '-'. Advances uColIndexA and uColIndexCombined by one.
+static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
+  unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
+  unsigned &uColIndexCombined)
+	{
+#if	TRACE
+	Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
+	  uColIndexA, uColIndexCombined);
+#endif
+	const unsigned uSrcCol = uColIndexA;
+	const unsigned uDstCol = uColIndexCombined;
+
+	for (unsigned uRowA = 0; uRowA < uSeqCountA; ++uRowA)
+		msaCombined.SetChar(uRowA, uDstCol, msaA.GetChar(uRowA, uSrcCol));
+
+	for (unsigned uRowB = 0; uRowB < uSeqCountB; ++uRowB)
+		{
+		const unsigned uDstRow = uSeqCountA + uRowB;
+		msaCombined.SetChar(uDstRow, uDstCol, '-');
+		}
+
+	uColIndexCombined = uDstCol + 1;
+	uColIndexA = uSrcCol + 1;
+	}
+
+// Append one "insert" column to the combined alignment: the rows of A
+// receive '-', the rows of B (stored after the rows of A) receive column
+// uColIndexB of msaB. Advances uColIndexB and uColIndexCombined by one.
+static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
+  unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
+  unsigned &uColIndexCombined)
+	{
+#if	TRACE
+	Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
+	  uColIndexB, uColIndexCombined);
+#endif
+	const unsigned uSrcCol = uColIndexB;
+	const unsigned uDstCol = uColIndexCombined;
+
+	for (unsigned uRowA = 0; uRowA < uSeqCountA; ++uRowA)
+		msaCombined.SetChar(uRowA, uDstCol, '-');
+
+	for (unsigned uRowB = 0; uRowB < uSeqCountB; ++uRowB)
+		{
+		const unsigned uDstRow = uSeqCountA + uRowB;
+		msaCombined.SetChar(uDstRow, uDstCol, msaB.GetChar(uRowB, uSrcCol));
+		}
+
+	uColIndexCombined = uDstCol + 1;
+	uColIndexB = uSrcCol + 1;
+	}
+
+// Append a run of unaligned ("template insert") columns to the combined
+// alignment. uColCountA columns are copied from msaA and uColCountB columns
+// from msaB, each character passed through UnalignChar; the shorter of the
+// two runs is padded with '.' so that both groups of rows grow by
+// max(uColCountA, uColCountB) columns.
+//   msaA, msaB            source alignments.
+//   uColIndexA/B          first source column; advanced by uColCountA/B.
+//   uSeqCountA/B          number of rows taken from A / B; rows of B are
+//                         stored after the rows of A in msaCombined.
+//   uColIndexCombined     first destination column; advanced by the
+//                         number of columns written.
+static void AppendTplInserts(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
+  const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
+  unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
+	{
+#if	TRACE
+	Log("AppendTplInserts ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
+	  uColIndexA, uColIndexB, uColIndexCombined);
+#endif
+// Width of the appended region is the longer of the two runs.
+	unsigned uNewColCount = uColCountA;
+	if (uColCountB > uNewColCount)
+		uNewColCount = uColCountB;
+
+// Rows of A: copied columns first, then '.' padding.
+	for (unsigned n = 0; n < uColCountA; ++n)
+		{
+		for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
+			{
+			char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
+			c = UnalignChar(c);
+			msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
+			}
+		}
+	for (unsigned n = uColCountA; n < uNewColCount; ++n)
+		{
+		for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
+			msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
+		}
+
+// Rows of B: copied columns first, then '.' padding.
+	for (unsigned n = 0; n < uColCountB; ++n)
+		{
+		for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
+			{
+			char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
+			c = UnalignChar(c);
+			msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
+			}
+		}
+	for (unsigned n = uColCountB; n < uNewColCount; ++n)
+		{
+		for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
+			msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
+		}
+
+	uColIndexCombined += uNewColCount;
+	uColIndexA += uColCountA;
+	uColIndexB += uColCountB;
+	}
+
+// Append one "match" column to the combined alignment: the rows of A
+// receive column uColIndexA of msaA and the rows of B (stored after the
+// rows of A) receive column uColIndexB of msaB. All three column indexes
+// are advanced by one.
+static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
+  unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
+  MSA &msaCombined, unsigned &uColIndexCombined)
+	{
+#if	TRACE
+	Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
+	  uColIndexA, uColIndexB, uColIndexCombined);
+#endif
+	const unsigned uSrcColA = uColIndexA;
+	const unsigned uSrcColB = uColIndexB;
+	const unsigned uDstCol = uColIndexCombined;
+
+	for (unsigned uRowA = 0; uRowA < uSeqCountA; ++uRowA)
+		msaCombined.SetChar(uRowA, uDstCol, msaA.GetChar(uRowA, uSrcColA));
+
+	for (unsigned uRowB = 0; uRowB < uSeqCountB; ++uRowB)
+		{
+		const unsigned uDstRow = uSeqCountA + uRowB;
+		msaCombined.SetChar(uDstRow, uDstCol, msaB.GetChar(uRowB, uSrcColB));
+		}
+
+	uColIndexA = uSrcColA + 1;
+	uColIndexB = uSrcColB + 1;
+	uColIndexCombined = uDstCol + 1;
+	}
+
+// Build msaCombined by merging msaA and msaB along the pair-wise path Path.
+// The rows of A come first, followed by the rows of B; names and ids are
+// copied from the inputs. Each path edge contributes one column:
+//   'M'  column of A next to column of B      (AppendMatch)
+//   'D'  column of A next to gaps in B rows   (AppendDelete)
+//   'I'  gaps in A rows next to column of B   (AppendInsert)
+// All consistency checks are assert()s, so they disappear when NDEBUG is
+// defined; in such a build an edge with any other type is silently skipped.
+void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB,
+ MSA &msaCombined)
+ {
+ msaCombined.Clear();
+
+// NOTE(review): the trace header below still says "FastAlignProfiles",
+// not the name of this function.
+#if TRACE
+ Log("FastAlignProfiles\n");
+ Log("Template A:\n");
+ msaA.LogMe();
+ Log("Template B:\n");
+ msaB.LogMe();
+#endif
+
+ const unsigned uColCountA = msaA.GetColCount();
+ const unsigned uColCountB = msaB.GetColCount();
+
+ const unsigned uSeqCountA = msaA.GetSeqCount();
+ const unsigned uSeqCountB = msaB.GetSeqCount();
+
+ msaCombined.SetSeqCount(uSeqCountA + uSeqCountB);
+
+// Copy sequence names into combined MSA
+ for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
+ {
+ msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA));
+ msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA));
+ }
+
+ for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
+ {
+ msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB));
+ msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB));
+ }
+
+// Next unread column of A and of B, and next column to write.
+ unsigned uColIndexA = 0;
+ unsigned uColIndexB = 0;
+ unsigned uColIndexCombined = 0;
+ const unsigned uEdgeCount = Path.GetEdgeCount();
+ for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+ {
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+#if TRACE
+ Log("\nEdge %u %c%u.%u\n",
+ uEdgeIndex,
+ Edge.cType,
+ Edge.uPrefixLengthA,
+ Edge.uPrefixLengthB);
+#endif
+ const char cType = Edge.cType;
+ const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
+// NOTE: this loop-local uColCountA shadows the function-level constant of
+// the same name. Inside the loop it is the number of columns of A lying
+// between the read position and the column addressed by this edge.
+ unsigned uColCountA = 0;
+ if (uPrefixLengthA > 0)
+ {
+ const unsigned uNodeIndexA = uPrefixLengthA - 1;
+ const unsigned uTplColIndexA = uNodeIndexA;
+ if (uTplColIndexA > uColIndexA)
+ uColCountA = uTplColIndexA - uColIndexA;
+ }
+
+ const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
+// Same shadowing as for uColCountA above.
+ unsigned uColCountB = 0;
+ if (uPrefixLengthB > 0)
+ {
+ const unsigned uNodeIndexB = uPrefixLengthB - 1;
+ const unsigned uTplColIndexB = uNodeIndexB;
+ if (uTplColIndexB > uColIndexB)
+ uColCountB = uTplColIndexB - uColIndexB;
+ }
+
+// TODO: This code looks like a hangover from HMM estimation -- can we delete it?
+// Both counts are asserted to be zero, in which case the call below
+// writes no columns and moves no index.
+ assert(uColCountA == 0);
+ assert(uColCountB == 0);
+ AppendTplInserts(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB,
+ uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+
+ switch (cType)
+ {
+ case 'M':
+ {
+ assert(uPrefixLengthA > 0);
+ assert(uPrefixLengthB > 0);
+ const unsigned uColA = uPrefixLengthA - 1;
+ const unsigned uColB = uPrefixLengthB - 1;
+ assert(uColIndexA == uColA);
+ assert(uColIndexB == uColB);
+ AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
+ msaCombined, uColIndexCombined);
+ break;
+ }
+ case 'D':
+ {
+ assert(uPrefixLengthA > 0);
+ const unsigned uColA = uPrefixLengthA - 1;
+ assert(uColIndexA == uColA);
+ AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+ break;
+ }
+ case 'I':
+ {
+ assert(uPrefixLengthB > 0);
+ const unsigned uColB = uPrefixLengthB - 1;
+ assert(uColIndexB == uColB);
+ AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+ break;
+ }
+ default:
+ assert(false);
+ }
+ }
+// Here uColCountA / uColCountB are the function-level constants again
+// (total column counts of msaA / msaB), so these are the numbers of
+// columns not consumed by the path.
+ unsigned uInsertColCountA = uColCountA - uColIndexA;
+ unsigned uInsertColCountB = uColCountB - uColIndexB;
+
+// TODO: This code looks like a hangover from HMM estimation -- can we delete it?
+ assert(uInsertColCountA == 0);
+ assert(uInsertColCountB == 0);
+ AppendTplInserts(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
+ uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+
+// One output column per path edge.
+ assert(msaCombined.GetColCount() == uEdgeCount);
+ }
+
+// Stand-in profile position used by the SetGaps* / SetFreqs helpers when a
+// prefix length is 0, i.e. when there is no real column to read. It has no
+// letter counts, occupancy 1 and all transition mass on LL.
+// The initializers are positional, so they depend on the member order of
+// ProfPos (declared outside this file -- the labels below are the original
+// author's; TODO confirm against profile.h). Members that are not listed
+// are zero-initialized by aggregate initialization.
+static const ProfPos PPStart =
+ {
+ false, //m_bAllGaps;
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_uSortOrder[21];
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_fcCounts[20];
+ 1.0, // m_LL;
+ 0.0, // m_LG;
+ 0.0, // m_GL;
+ 0.0, // m_GG;
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_AAScores (label originally read m_ALScores; this file only uses m_AAScores)
+ 0, // m_uResidueGroup;
+ 1.0, // m_fOcc;
+ 0.0, // m_fcStartOcc;
+ 0.0, // m_fcEndOcc;
+ 0.0, // m_scoreGapOpen;
+ 0.0, // m_scoreGapClose;
+ };
+
+// SetGapsMM: previous edge 'M', current edge 'M'.
+// In a two-letter class the first letter is the state of the previous
+// column and the second the state of this column (L = letter, G = gap).
+// Both inputs contribute their classes unchanged, weighted by wA / wB:
+//   A (Ai-1 Ai):  LL->LL  LG->LG  GL->GL  GG->GG
+//   B (Bj-1 Bj):  LL->LL  LG->LG  GL->GL  GG->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsMM(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wA*ptrA->m_LL + wB*ptrB->m_LL;
+	ptrOut->m_LG = wA*ptrA->m_LG + wB*ptrB->m_LG;
+	ptrOut->m_GL = wA*ptrA->m_GL + wB*ptrB->m_GL;
+	ptrOut->m_GG = wA*ptrA->m_GG + wB*ptrB->m_GG;
+	}
+
+// SetGapsMD: previous edge 'M', current edge 'D'.
+// A contributes its classes unchanged. B is gapped in this column, so
+// only the state of its last real column matters:
+//   A (Ai-1 Ai):  LL->LL  LG->LG  GL->GL  GG->GG
+//   B (Bj   - ):  ?L->LG  ?G->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsMD(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wA*ptrA->m_LL;
+	ptrOut->m_LG = wA*ptrA->m_LG + wB*(ptrB->m_LL + ptrB->m_GL);
+	ptrOut->m_GL = wA*ptrA->m_GL;
+	ptrOut->m_GG = wA*ptrA->m_GG + wB*(ptrB->m_LG + ptrB->m_GG);
+	}
+
+// SetGapsDD: previous edge 'D', current edge 'D'.
+// A contributes its classes unchanged. B is gapped in both the previous
+// and the current column, so its whole weight goes to GG:
+//   A (Ai-1 Ai):  LL->LL  LG->LG  GL->GL  GG->GG
+//   B ( -   - ):  ??->GG
+// PB and uPrefixLengthB are not read; they are present so that all
+// SetGaps* helpers share one parameter list.
+static void SetGapsDD(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wA*ptrA->m_LL;
+	ptrOut->m_LG = wA*ptrA->m_LG;
+	ptrOut->m_GL = wA*ptrA->m_GL;
+	ptrOut->m_GG = wA*ptrA->m_GG + wB;
+	}
+
+// SetGapsMI: previous edge 'M', current edge 'I'.
+// B contributes its classes unchanged. A is gapped in this column, so
+// only the state of its last real column matters:
+//   A (Ai   - ):  ?L->LG  ?G->GG
+//   B (Bj-1 Bj):  LL->LL  LG->LG  GL->GL  GG->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsMI(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wB*ptrB->m_LL;
+	ptrOut->m_LG = wB*ptrB->m_LG + wA*(ptrA->m_LL + ptrA->m_GL);
+	ptrOut->m_GL = wB*ptrB->m_GL;
+	ptrOut->m_GG = wB*ptrB->m_GG + wA*(ptrA->m_LG + ptrA->m_GG);
+	}
+
+// SetGapsDM: previous edge 'D', current edge 'M'.
+// A contributes its classes unchanged. B was gapped in the previous
+// column, so only the state of its current column matters:
+//   A (Ai-1 Ai):  LL->LL  LG->LG  GL->GL  GG->GG
+//   B ( -   Bj):  ?L->GL  ?G->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsDM(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wA*ptrA->m_LL;
+	ptrOut->m_LG = wA*ptrA->m_LG;
+	ptrOut->m_GL = wA*ptrA->m_GL + wB*(ptrB->m_LL + ptrB->m_GL);
+	ptrOut->m_GG = wA*ptrA->m_GG + wB*(ptrB->m_LG + ptrB->m_GG);
+	}
+
+// SetGapsIM: previous edge 'I', current edge 'M'.
+// B contributes its classes unchanged. A was gapped in the previous
+// column, so only the state of its current column matters:
+//   A ( -   Ai):  ?L->GL  ?G->GG
+//   B (Bj-1 Bj):  LL->LL  LG->LG  GL->GL  GG->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsIM(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wB*ptrB->m_LL;
+	ptrOut->m_LG = wB*ptrB->m_LG;
+	ptrOut->m_GL = wB*ptrB->m_GL + wA*(ptrA->m_LL + ptrA->m_GL);
+	ptrOut->m_GG = wB*ptrB->m_GG + wA*(ptrA->m_LG + ptrA->m_GG);
+	}
+
+// SetGapsID: previous edge 'I', current edge 'D'.
+// A was gapped in the previous column and B is gapped in this one, so no
+// row can have a letter in both columns and LL is 0:
+//   A ( -   Ai):  ?L->GL  ?G->GG
+//   B (Bj   - ):  ?L->LG  ?G->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsID(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = 0;
+	ptrOut->m_LG = wB*ptrB->m_GL + wB*ptrB->m_LL;
+	ptrOut->m_GL = wA*ptrA->m_GL + wA*ptrA->m_LL;
+	ptrOut->m_GG = wA*(ptrA->m_LG + ptrA->m_GG) + wB*(ptrB->m_LG + ptrB->m_GG);
+	}
+
+// SetGapsDI: previous edge 'D', current edge 'I'.
+// B was gapped in the previous column and A is gapped in this one, so no
+// row can have a letter in both columns and LL is 0:
+//   A (Ai   - ):  ?L->LG  ?G->GG
+//   B ( -   Bj):  ?L->GL  ?G->GG
+// A prefix length of 0 selects PPStart instead of a real column.
+static void SetGapsDI(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = 0;
+	ptrOut->m_LG = wA*ptrA->m_GL + wA*ptrA->m_LL;
+	ptrOut->m_GL = wB*ptrB->m_GL + wB*ptrB->m_LL;
+	ptrOut->m_GG = wA*(ptrA->m_LG + ptrA->m_GG) + wB*(ptrB->m_LG + ptrB->m_GG);
+	}
+
+// SetGapsII: previous edge 'I', current edge 'I'.
+// B contributes its classes unchanged. A is gapped in both the previous
+// and the current column, so its whole weight goes to GG:
+//   A ( -   - ):  ??->GG
+//   B (Bj-1 Bj):  LL->LL  LG->LG  GL->GL  GG->GG
+// PA and uPrefixLengthA are not read; they are present so that all
+// SetGaps* helpers share one parameter list.
+static void SetGapsII(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	ptrOut->m_LL = wB*ptrB->m_LL;
+	ptrOut->m_LG = wB*ptrB->m_LG;
+	ptrOut->m_GL = wB*ptrB->m_GL;
+	ptrOut->m_GG = wB*ptrB->m_GG + wA;
+	}
+
+// Fill the first 20 letter counts of output column uColIndexOut by mixing
+// the column of A and the column of B addressed by the prefix lengths
+// (PPStart stands in when a prefix length is 0).
+//  - g_bNormalizeCounts off: plain weighted sum wA*A + wB*B.
+//  - g_bNormalizeCounts on:  each input is additionally scaled by its
+//    occupancy and its share of the total weight, and the result is
+//    rescaled to sum to 1 unless the total is not positive.
+static void SetFreqs(
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	const ProfPos *ptrA = (uPrefixLengthA > 0) ? &PA[uPrefixLengthA - 1] : &PPStart;
+	const ProfPos *ptrB = (uPrefixLengthB > 0) ? &PB[uPrefixLengthB - 1] : &PPStart;
+	ProfPos *ptrOut = &POut[uColIndexOut];
+
+	if (!g_bNormalizeCounts)
+		{
+		for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+			ptrOut->m_fcCounts[uLetter] =
+			  wA*ptrA->m_fcCounts[uLetter] + wB*ptrB->m_fcCounts[uLetter];
+		return;
+		}
+
+	const FCOUNT fA = ptrA->m_fOcc*wA/(wA + wB);
+	const FCOUNT fB = ptrB->m_fOcc*wB/(wA + wB);
+
+	FCOUNT fTotal = 0;
+	for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+		{
+		const FCOUNT f = fA*ptrA->m_fcCounts[uLetter] + fB*ptrB->m_fcCounts[uLetter];
+		ptrOut->m_fcCounts[uLetter] = f;
+		fTotal += f;
+		}
+
+// Nothing to rescale when the column carries no counts at all.
+	if (!(fTotal > 0))
+		return;
+	for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+		ptrOut->m_fcCounts[uLetter] /= fTotal;
+	}
+
+// Fill the gap-transition frequencies (m_LL, m_LG, m_GL, m_GG) of output
+// column uColIndexOut by calling the SetGaps<prev><cur> helper that matches
+// the previous and the current edge type. Calls Quit("Bad cPrevType") when
+// no helper matches; the caller validates cType before calling.
+static void SetGapsForEdge(char cPrevType, char cType,
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos *POut, unsigned uColIndexOut)
+	{
+	typedef void (*SETGAPS)(const ProfPos *, unsigned, WEIGHT,
+	  const ProfPos *, unsigned, WEIGHT, ProfPos *, unsigned);
+
+	SETGAPS ptrSetGaps = 0;
+	switch (cType)
+		{
+	case 'M':
+		switch (cPrevType)
+			{
+		case 'M': ptrSetGaps = SetGapsMM; break;
+		case 'D': ptrSetGaps = SetGapsDM; break;
+		case 'I': ptrSetGaps = SetGapsIM; break;
+		default: break;
+			}
+		break;
+	case 'D':
+		switch (cPrevType)
+			{
+		case 'M': ptrSetGaps = SetGapsMD; break;
+		case 'D': ptrSetGaps = SetGapsDD; break;
+		case 'I': ptrSetGaps = SetGapsID; break;
+		default: break;
+			}
+		break;
+	case 'I':
+		switch (cPrevType)
+			{
+		case 'M': ptrSetGaps = SetGapsMI; break;
+		case 'D': ptrSetGaps = SetGapsDI; break;
+		case 'I': ptrSetGaps = SetGapsII; break;
+		default: break;
+			}
+		break;
+	default:
+		break;
+		}
+
+	if (0 == ptrSetGaps)
+		{
+		Quit("Bad cPrevType");
+		return;
+		}
+	ptrSetGaps(PA, uPrefixLengthA, wA, PB, uPrefixLengthB, wB, POut, uColIndexOut);
+	}
+
+// Build the profile of the alignment of profiles PA and PB along Path.
+// Every path edge yields one output position:
+//   'M'  letter counts mixed from A (weight wA) and B (weight wB)
+//   'D'  letter counts from A only (B weight 0)
+//   'I'  letter counts from B only (A weight 0)
+// Gap-transition frequencies always use wA and wB and depend on the
+// previous edge type as well; the path is treated as if preceded by an
+// 'M' edge. The derived scores are then computed by ProfScoresFromFreqs.
+//
+//   Path                    pair-wise path to follow.
+//   PA, wA / PB, wB         input profiles and their weights; wA + wB is
+//                           asserted to be 1.
+//   uPrefixLengthA/B        not read by this function.
+//   ptrPOut, ptruLengthOut  receive the new profile and its length (the
+//                           edge count). The array is allocated with
+//                           new[] here and handed to the caller.
+//
+// An edge type other than 'M', 'D' or 'I' ends in Quit("Bad cType").
+// Previously this was only an assert, which is compiled out when NDEBUG is
+// defined and then left an output position uninitialised.
+void AlignTwoProfsGivenPath(const PWPath &Path,
+  const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
+  const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
+  ProfPos **ptrPOut, unsigned *ptruLengthOut)
+	{
+#if	TRACE
+	Log("AlignTwoProfsGivenPath wA=%.3g wB=%.3g Path=\n", wA, wB);
+	Path.LogMe();
+#endif
+	assert(BTEq(wA + wB, 1.0));
+
+	const unsigned uEdgeCount = Path.GetEdgeCount();
+	ProfPos *POut = new ProfPos[uEdgeCount];
+
+// uColIndexA / uColIndexB are only reported by the TRACE output.
+	unsigned uColIndexA = 0;
+	unsigned uColIndexB = 0;
+	unsigned uColIndexOut = 0;
+	char cPrevType = 'M';
+	for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+		{
+		const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+		const char cType = Edge.cType;
+
+	// Named differently from the parameters so that nothing is shadowed.
+		const unsigned uEdgePrefixLengthA = Edge.uPrefixLengthA;
+		const unsigned uEdgePrefixLengthB = Edge.uPrefixLengthB;
+
+#if	TRACE
+		Log("\nEdge %u %c%u.%u ColA=%u ColB=%u\n",
+		  uEdgeIndex,
+		  Edge.cType,
+		  Edge.uPrefixLengthA,
+		  Edge.uPrefixLengthB,
+		  uColIndexA,
+		  uColIndexB);
+#endif
+
+	// Weights used for the letter counts: the gapped side contributes 0.
+		WEIGHT wFreqA = wA;
+		WEIGHT wFreqB = wB;
+		switch (cType)
+			{
+		case 'M':
+			assert(uEdgePrefixLengthA > 0);
+			assert(uEdgePrefixLengthB > 0);
+			break;
+		case 'D':
+			assert(uEdgePrefixLengthA > 0);
+			wFreqB = 0;
+			break;
+		case 'I':
+			assert(uEdgePrefixLengthB > 0);
+			wFreqA = 0;
+			break;
+		default:
+			Quit("Bad cType");
+			}
+
+		POut[uColIndexOut].m_bAllGaps = false;
+		SetFreqs(
+		  PA, uEdgePrefixLengthA, wFreqA,
+		  PB, uEdgePrefixLengthB, wFreqB,
+		  POut, uColIndexOut);
+		SetGapsForEdge(cPrevType, cType,
+		  PA, uEdgePrefixLengthA, wA,
+		  PB, uEdgePrefixLengthB, wB,
+		  POut, uColIndexOut);
+
+	// 'M' consumes a column of both inputs, 'D' of A only, 'I' of B only.
+		if (cType != 'I')
+			++uColIndexA;
+		if (cType != 'D')
+			++uColIndexB;
+		++uColIndexOut;
+
+		cPrevType = cType;
+		}
+	assert(uColIndexOut == uEdgeCount);
+
+	ProfScoresFromFreqs(POut, uEdgeCount);
+	ValidateProf(POut, uEdgeCount);
+
+	*ptrPOut = POut;
+	*ptruLengthOut = uEdgeCount;
+
+#if	TRACE
+	Log("AlignTwoProfsGivenPath:\n");
+	ListProfile(POut, uEdgeCount, 0);
+#endif
+	}
Added: trunk/packages/muscle/branches/upstream/current/aligngivenpathsw.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/aligngivenpathsw.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/aligngivenpathsw.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,237 @@
+#include "muscle.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "profile.h"
+
+#define TRACE 0
+
+ // Emit one combined column for a 'D' edge: the current column of A is copied
+ // into A's rows and every row that belongs to B receives a gap ('-').
+ // Both uColIndexA and uColIndexCombined are advanced by one.
+ static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
+   unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
+   unsigned &uColIndexCombined)
+     {
+#if TRACE
+     Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
+       uColIndexA, uColIndexCombined);
+#endif
+     // Capture source/destination columns and step both cursors.
+     const unsigned uSrcCol = uColIndexA++;
+     const unsigned uDstCol = uColIndexCombined++;
+
+     // Rows of A: copy the characters straight across.
+     unsigned uRowA = 0;
+     while (uRowA < uSeqCountA)
+         {
+         msaCombined.SetChar(uRowA, uDstCol, msaA.GetChar(uRowA, uSrcCol));
+         ++uRowA;
+         }
+
+     // Rows of B are stored after the rows of A; B has nothing in this column.
+     for (unsigned uRowB = 0; uRowB != uSeqCountB; ++uRowB)
+         msaCombined.SetChar(uSeqCountA + uRowB, uDstCol, '-');
+     }
+
+ // Emit one combined column for an 'I' edge: every row that belongs to A
+ // receives a gap ('-') and the current column of B is copied into B's rows.
+ // Both uColIndexB and uColIndexCombined are advanced by one.
+ static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
+   unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
+   unsigned &uColIndexCombined)
+     {
+#if TRACE
+     Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
+       uColIndexB, uColIndexCombined);
+#endif
+     // Capture source/destination columns and step both cursors.
+     const unsigned uSrcCol = uColIndexB++;
+     const unsigned uDstCol = uColIndexCombined++;
+
+     // A contributes nothing to this column.
+     for (unsigned uRowA = 0; uRowA != uSeqCountA; ++uRowA)
+         msaCombined.SetChar(uRowA, uDstCol, '-');
+
+     // Rows of B are stored after the rows of A.
+     unsigned uRowB = 0;
+     while (uRowB < uSeqCountB)
+         {
+         msaCombined.SetChar(uSeqCountA + uRowB, uDstCol, msaB.GetChar(uRowB, uSrcCol));
+         ++uRowB;
+         }
+     }
+
+ // Append a block of columns that the path leaves unaligned.
+ //
+ // uColCountA columns of A (starting at uColIndexA) and uColCountB columns of
+ // B (starting at uColIndexB) are written side by side, starting at
+ // uColIndexCombined. Each copied character is passed through UnalignChar().
+ // The block is as wide as the longer of the two runs; the shorter side is
+ // padded on the right with '.'.
+ //
+ // On return uColIndexA and uColIndexB have advanced by their own run lengths
+ // and uColIndexCombined by the width of the block.
+ static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
+   const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
+   unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
+     {
+#if TRACE
+     Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
+       uColIndexA, uColIndexB, uColIndexCombined);
+#endif
+     // Width of the emitted block = max(uColCountA, uColCountB).
+     unsigned uNewColCount = uColCountA;
+     if (uColCountB > uNewColCount)
+         uNewColCount = uColCountB;
+
+     // Rows of A: copy the run, then pad.
+     for (unsigned n = 0; n < uColCountA; ++n)
+         {
+         for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
+             {
+             char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
+             c = UnalignChar(c);
+             msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
+             }
+         }
+     for (unsigned n = uColCountA; n < uNewColCount; ++n)
+         {
+         for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
+             msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
+         }
+
+     // Rows of B (stored after the rows of A): copy the run, then pad.
+     for (unsigned n = 0; n < uColCountB; ++n)
+         {
+         for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
+             {
+             char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
+             c = UnalignChar(c);
+             msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
+             }
+         }
+     for (unsigned n = uColCountB; n < uNewColCount; ++n)
+         {
+         for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
+             msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
+         }
+
+     uColIndexCombined += uNewColCount;
+     uColIndexA += uColCountA;
+     uColIndexB += uColCountB;
+     }
+
+ // Emit one combined column for an 'M' edge: the current column of A fills
+ // A's rows and the current column of B fills B's rows (stored after A's).
+ // uColIndexA, uColIndexB and uColIndexCombined are each advanced by one.
+ static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
+   unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
+   MSA &msaCombined, unsigned &uColIndexCombined)
+     {
+#if TRACE
+     Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
+       uColIndexA, uColIndexB, uColIndexCombined);
+#endif
+
+     // Capture the three columns involved and step all cursors.
+     const unsigned uSrcColA = uColIndexA++;
+     const unsigned uSrcColB = uColIndexB++;
+     const unsigned uDstCol = uColIndexCombined++;
+
+     unsigned uRowA = 0;
+     while (uRowA < uSeqCountA)
+         {
+         msaCombined.SetChar(uRowA, uDstCol, msaA.GetChar(uRowA, uSrcColA));
+         ++uRowA;
+         }
+
+     unsigned uRowB = 0;
+     while (uRowB < uSeqCountB)
+         {
+         msaCombined.SetChar(uSeqCountA + uRowB, uDstCol, msaB.GetChar(uRowB, uSrcColB));
+         ++uRowB;
+         }
+     }
+
+ // Build msaCombined from msaA and msaB by following Path.
+ //
+ // The rows of A come first in msaCombined, followed by the rows of B; names
+ // and ids are copied across. For each edge, any template columns lying
+ // between the current position and the column named by the edge are first
+ // emitted as an unaligned block, then the edge itself ('M', 'D' or 'I') is
+ // emitted as one column. Whatever columns remain after the last edge are
+ // emitted as a final unaligned block.
+ // NOTE(review): "SW" presumably means a local (Smith-Waterman) path - confirm.
+ void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
+   MSA &msaCombined)
+     {
+     msaCombined.Clear();
+
+#if TRACE
+     Log("AlignTwoMSAsGivenPathSW\n");
+     Log("Template A:\n");
+     msaA.LogMe();
+     Log("Template B:\n");
+     msaB.LogMe();
+#endif
+
+     const unsigned uSeqCountA = msaA.GetSeqCount();
+     const unsigned uSeqCountB = msaB.GetSeqCount();
+     const unsigned uColCountA = msaA.GetColCount();
+     const unsigned uColCountB = msaB.GetColCount();
+
+     const unsigned uSeqCountCombined = uSeqCountA + uSeqCountB;
+     msaCombined.SetSeqCount(uSeqCountCombined);
+
+     // Copy sequence names and ids: rows of A first, then rows of B.
+     for (unsigned uRow = 0; uRow < uSeqCountCombined; ++uRow)
+         {
+         const bool bFromA = (uRow < uSeqCountA);
+         const MSA &msaSrc = bFromA ? msaA : msaB;
+         const unsigned uSrcRow = bFromA ? uRow : uRow - uSeqCountA;
+         msaCombined.SetSeqName(uRow, msaSrc.GetSeqName(uSrcRow));
+         msaCombined.SetSeqId(uRow, msaSrc.GetSeqId(uSrcRow));
+         }
+
+     unsigned uColIndexA = 0;
+     unsigned uColIndexB = 0;
+     unsigned uColIndexCombined = 0;
+     const unsigned uEdgeCount = Path.GetEdgeCount();
+     for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+         {
+         const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+#if TRACE
+         Log("\nEdge %u %c%u.%u\n",
+           uEdgeIndex,
+           Edge.cType,
+           Edge.uPrefixLengthA,
+           Edge.uPrefixLengthB);
+#endif
+         const char cType = Edge.cType;
+         const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
+         const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
+
+         // Number of columns of A that precede the edge's column
+         // (prefix length - 1) and have not been emitted yet.
+         unsigned uSkippedColsA = 0;
+         if (uPrefixLengthA > 0 && uPrefixLengthA - 1 > uColIndexA)
+             uSkippedColsA = (uPrefixLengthA - 1) - uColIndexA;
+
+         // Same for B.
+         unsigned uSkippedColsB = 0;
+         if (uPrefixLengthB > 0 && uPrefixLengthB - 1 > uColIndexB)
+             uSkippedColsB = (uPrefixLengthB - 1) - uColIndexB;
+
+         AppendUnalignedTerminals(msaA, uColIndexA, uSkippedColsA, msaB, uColIndexB,
+           uSkippedColsB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+
+         if ('M' == cType)
+             {
+             assert(uPrefixLengthA > 0);
+             assert(uPrefixLengthB > 0);
+             assert(uColIndexA == uPrefixLengthA - 1);
+             assert(uColIndexB == uPrefixLengthB - 1);
+             AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
+               msaCombined, uColIndexCombined);
+             }
+         else if ('D' == cType)
+             {
+             assert(uPrefixLengthA > 0);
+             assert(uColIndexA == uPrefixLengthA - 1);
+             AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+             }
+         else if ('I' == cType)
+             {
+             assert(uPrefixLengthB > 0);
+             assert(uColIndexB == uPrefixLengthB - 1);
+             AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+             }
+         else
+             assert(false);
+         }
+
+     // Trailing columns not covered by the path.
+     unsigned uInsertColCountA = uColCountA - uColIndexA;
+     unsigned uInsertColCountB = uColCountB - uColIndexB;
+
+     AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
+       uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
+     }
Added: trunk/packages/muscle/branches/upstream/current/aligntwomsas.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/aligntwomsas.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/aligntwomsas.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,41 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+#include "pwpath.h"
+#include "textfile.h"
+#include "timing.h"
+
+ // Align two multiple alignments profile-to-profile.
+ //
+ // Profiles are built from msa1 and msa2, aligned with GlobalAlign(), and the
+ // resulting path is applied to the two inputs to produce msaOut.
+ //
+ //   bLockLeft   set the gap-open score of the first position of each
+ //               profile to MINUS_INFINITY.
+ //   bLockRight  set the gap-close score of the last position of each
+ //               profile to MINUS_INFINITY.
+ //   Path        receives the alignment path.
+ //
+ // Returns the score reported by GlobalAlign(). The temporary profiles are
+ // released before returning.
+ SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
+   bool bLockLeft, bool bLockRight)
+     {
+     const unsigned uLengthA = msa1.GetColCount();
+     const unsigned uLengthB = msa2.GetColCount();
+
+     ProfPos *PA = ProfileFromMSA(msa1);
+     ProfPos *PB = ProfileFromMSA(msa2);
+
+     // A zero-length profile has no first/last position to lock; without
+     // these guards uLength-1 would wrap around and index far out of bounds.
+     if (bLockLeft)
+         {
+         if (uLengthA > 0)
+             PA[0].m_scoreGapOpen = MINUS_INFINITY;
+         if (uLengthB > 0)
+             PB[0].m_scoreGapOpen = MINUS_INFINITY;
+         }
+
+     if (bLockRight)
+         {
+         if (uLengthA > 0)
+             PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY;
+         if (uLengthB > 0)
+             PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY;
+         }
+
+     SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
+
+     AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);
+
+     delete[] PA;
+     delete[] PB;
+
+     return Score;
+     }
Added: trunk/packages/muscle/branches/upstream/current/aligntwoprofs.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/aligntwoprofs.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/aligntwoprofs.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,31 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+#include "pwpath.h"
+
+SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+
+ // Align two profiles and build the merged profile.
+ //
+ //   PA, uLengthA, wA   first profile, its length and its weight
+ //   PB, uLengthB, wB   second profile, its length and its weight
+ //   Path               receives the alignment path from GlobalAlign()
+ //   ptrPout            receives the merged profile
+ //   ptruLengthOut      receives the length of the merged profile
+ //
+ // The weights are normalised to wA/(wA+wB) and wB/(wA+wB) before merging.
+ // Returns the score reported by GlobalAlign().
+ SCORE AlignTwoProfs(
+   const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
+   const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
+   PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut)
+     {
+     assert(uLengthA < 100000);
+     assert(uLengthB < 100000);
+
+     SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
+
+     // Profile A must be paired with its own length (uLengthA, not uLengthB).
+     AlignTwoProfsGivenPath(Path, PA, uLengthA, wA/(wA + wB), PB, uLengthB, wB/(wA + wB),
+       ptrPout, ptruLengthOut);
+
+#if HYDRO
+     if (ALPHA_Amino == g_Alpha)
+         Hydro(*ptrPout, *ptruLengthOut);
+#endif
+     return Score;
+     }
Added: trunk/packages/muscle/branches/upstream/current/aln.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/aln.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/aln.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,170 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <ctype.h>
+#include "msa.h"
+#include "textfile.h"
+
+const unsigned uCharsPerLine = 60;
+const int MIN_NAME = 10;
+const int MAX_NAME = 32;
+
+static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex);
+
+ // Write this alignment to File in CLUSTAL (.aln) layout.
+ //
+ // A header line is written first (the CLUSTAL W banner when g_bClwStrict is
+ // set, otherwise a MUSCLE banner followed by a blank line). The alignment is
+ // then written in blocks of at most uCharsPerLine columns. Each block holds
+ // one line per sequence (label, a space, residues in upper case) followed
+ // by a consensus line.
+ //
+ // Labels are the sequence name up to the first blank, truncated to MAX_NAME
+ // characters, and padded so all labels share one width between MIN_NAME and
+ // MAX_NAME. An alignment with zero columns produces the header only.
+ void MSA::ToAlnFile(TextFile &File) const
+     {
+     if (g_bClwStrict)
+         File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n");
+     else
+         {
+         File.PutString("MUSCLE ("
+           MUSCLE_MAJOR_VERSION "." MUSCLE_MINOR_VERSION ")"
+           " multiple sequence alignment\n");
+         File.PutString("\n");
+         }
+
+     // Common label width: longest label, clamped to [MIN_NAME, MAX_NAME].
+     int iLongestNameLength = 0;
+     for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+         {
+         const char *ptrName = GetSeqName(uSeqIndex);
+         const char *ptrBlank = strchr(ptrName, ' ');
+         int iLength;
+         if (0 != ptrBlank)
+             iLength = (int) (ptrBlank - ptrName);
+         else
+             iLength = (int) strlen(ptrName);
+         if (iLength > iLongestNameLength)
+             iLongestNameLength = iLength;
+         }
+     if (iLongestNameLength > MAX_NAME)
+         iLongestNameLength = MAX_NAME;
+     if (iLongestNameLength < MIN_NAME)
+         iLongestNameLength = MIN_NAME;
+
+     // With no columns there are no blocks; (0 - 1) would wrap around.
+     const unsigned uColCount = GetColCount();
+     const unsigned uLineCount =
+       (0 == uColCount) ? 0 : (uColCount - 1)/uCharsPerLine + 1;
+     for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
+         {
+         File.PutString("\n");
+         unsigned uStartColIndex = uLineIndex*uCharsPerLine;
+         unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
+         if (uEndColIndex >= uColCount)
+             uEndColIndex = uColCount - 1;
+         char Name[MAX_NAME+1];
+         for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+             {
+             const char *ptrName = GetSeqName(uSeqIndex);
+             const char *ptrBlank = strchr(ptrName, ' ');
+             int iLength;
+             if (0 != ptrBlank)
+                 iLength = (int) (ptrBlank - ptrName);
+             else
+                 iLength = (int) strlen(ptrName);
+             if (iLength > MAX_NAME)
+                 iLength = MAX_NAME;
+             // Blank-fill, overlay the label, cut at the common width.
+             memset(Name, ' ', MAX_NAME);
+             memcpy(Name, ptrName, iLength);
+             Name[iLongestNameLength] = 0;
+
+             File.PutFormat("%s ", Name);
+             for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
+               ++uColIndex)
+                 {
+                 const char c = GetChar(uSeqIndex, uColIndex);
+                 // toupper() requires a value representable as unsigned char.
+                 File.PutFormat("%c", toupper((unsigned char) c));
+                 }
+             File.PutString("\n");
+             }
+
+         // Consensus line, indented by an all-blank label.
+         memset(Name, ' ', MAX_NAME);
+         Name[iLongestNameLength] = 0;
+         File.PutFormat("%s ", Name);
+         for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
+           ++uColIndex)
+             {
+             const char c = GetAlnConsensusChar(*this, uColIndex);
+             File.PutChar(c);
+             }
+         File.PutString("\n");
+         }
+     }
+
+ // Return the CLUSTAL consensus symbol for column uColIndex of alignment a.
+ //
+ //   '*'  exactly one distinct letter code occurs in the column
+ //   ':'  (amino acids only) all letters fall inside one 'strong' group
+ //   '.'  (amino acids only) all letters fall inside one 'weaker' group
+ //   ' '  otherwise, and always for non-amino alphabets unless '*'
+ //
+ // The set of letters present in the column is collected as a bit map with
+ // one bit per extended letter code; a group matches when no bit outside the
+ // group is set.
+ static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex)
+     {
+     const unsigned uSeqCount = a.GetSeqCount();
+     unsigned BitMap = 0;
+     unsigned Count = 0;
+     for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+         {
+         unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex);
+         assert(uLetter < 32);
+         // Unsigned shift: bit 31 would overflow a signed int.
+         unsigned Bit = (1u << uLetter);
+         if (!(BitMap & Bit))
+             ++Count;
+         BitMap |= Bit;
+         }
+
+ // '*' indicates positions which have a single, fully conserved residue
+     if (1 == Count)
+         return '*';
+
+     if (ALPHA_Amino != g_Alpha)
+         return ' ';
+
+#define B(a) (1 << AX_##a)
+#define S2(a, b) S(B(a) | B(b))
+#define S3(a, b, c) S(B(a) | B(b) | B(c))
+#define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d))
+#define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':';
+
+#define W3(a, b, c) W(B(a) | B(b) | B(c))
+#define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d))
+#define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e))
+#define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f))
+#define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.';
+
+ // ':' indicates that one of the following 'strong'
+ // groups is fully conserved
+ // STA
+ // NEQK
+ // NHQK
+ // NDEQ
+ // QHRK
+ // MILV
+ // MILF
+ // HY
+ // FYW
+ //
+     S3(S, T, A)
+     S4(N, E, Q, K)
+     S4(N, H, Q, K)
+     S4(N, D, E, Q)
+     S4(Q, H, R, K)
+     S4(M, I, L, V)
+     S4(M, I, L, F)
+     S2(H, Y)
+     S3(F, Y, W)
+
+ // '.' indicates that one of the following 'weaker'
+ // groups is fully conserved
+ // CSA
+ // ATV
+ // SAG
+ // STNK
+ // STPA
+ // SGND
+ // SNDEQK
+ // NDEQHK
+ // NEQHRK
+ // FVLIM
+ // HFY
+     W3(C, S, A)
+     W3(A, T, V)
+     W3(S, A, G)
+     W4(S, T, N, K)
+     W4(S, T, P, A)
+     W4(S, G, N, D)
+     W6(S, N, D, E, Q, K)
+     W6(N, D, E, Q, H, K)
+     W6(N, E, Q, H, R, K)
+     W5(F, V, L, I, M)
+     W3(H, F, Y)
+
+ // The helper macros are only meaningful inside this function.
+#undef B
+#undef S2
+#undef S3
+#undef S4
+#undef S
+#undef W3
+#undef W4
+#undef W5
+#undef W6
+#undef W
+
+     return ' ';
+     }
Added: trunk/packages/muscle/branches/upstream/current/alpha.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/alpha.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/alpha.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,283 @@
+#include "muscle.h"
+#include <ctype.h>
+
+/***
+From Bioperl docs:
+Extended DNA / RNA alphabet
+------------------------------------------
+Symbol Meaning Nucleic Acid
+------------------------------------------
+ A A Adenine
+ C C Cytosine
+ G G Guanine
+ T T Thymine
+ U U Uracil
+ M A or C
+ R A or G
+ W A or T
+ S C or G
+ Y C or T
+ K G or T
+ V A or C or G
+ H A or C or T
+ D A or G or T
+ B C or G or T
+ X G or A or T or C
+ N G or A or T or C
+
+IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
+ Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
+***/
+
+unsigned g_CharToLetter[MAX_CHAR];
+unsigned g_CharToLetterEx[MAX_CHAR];
+
+char g_LetterToChar[MAX_ALPHA];
+char g_LetterExToChar[MAX_ALPHA_EX];
+
+char g_UnalignChar[MAX_CHAR];
+char g_AlignChar[MAX_CHAR];
+
+bool g_IsWildcardChar[MAX_CHAR];
+bool g_IsResidueChar[MAX_CHAR];
+
+ALPHA g_Alpha = ALPHA_Undefined;
+unsigned g_AlphaSize = 0;
+
+// Res(c, Letter): register character c as an ordinary residue with code
+// Letter. Both the upper- and lower-case forms of c map to Letter in
+// g_CharToLetter and g_CharToLetterEx and are flagged in g_IsResidueChar;
+// Letter decodes to the upper-case form in g_LetterToChar and
+// g_LetterExToChar. g_AlignChar maps either case to upper case and
+// g_UnalignChar maps either case to lower case.
+// The expansion is a braced block, so invocations take no trailing ';'.
+#define Res(c, Letter) \
+ { \
+ const unsigned char Upper = (unsigned char) toupper(c); \
+ const unsigned char Lower = (unsigned char) tolower(c); \
+ g_CharToLetter[Upper] = Letter; \
+ g_CharToLetter[Lower] = Letter; \
+ g_CharToLetterEx[Upper] = Letter; \
+ g_CharToLetterEx[Lower] = Letter; \
+ g_LetterToChar[Letter] = Upper; \
+ g_LetterExToChar[Letter] = Upper; \
+ g_IsResidueChar[Upper] = true; \
+ g_IsResidueChar[Lower] = true; \
+ g_AlignChar[Upper] = Upper; \
+ g_AlignChar[Lower] = Upper; \
+ g_UnalignChar[Upper] = Lower; \
+ g_UnalignChar[Lower] = Lower; \
+ }
+
+// Wild(c, Letter): register character c as a wildcard with extended code
+// Letter. Unlike Res(), only the extended tables (g_CharToLetterEx,
+// g_LetterExToChar) are filled in; g_CharToLetter and g_LetterToChar are
+// left untouched. Both cases of c are flagged in g_IsResidueChar and in
+// g_IsWildcardChar, and the align/unalign case mappings are set as for Res().
+// The expansion is a braced block, so invocations take no trailing ';'.
+#define Wild(c, Letter) \
+ { \
+ const unsigned char Upper = (unsigned char) toupper(c); \
+ const unsigned char Lower = (unsigned char) tolower(c); \
+ g_CharToLetterEx[Upper] = Letter; \
+ g_CharToLetterEx[Lower] = Letter; \
+ g_LetterExToChar[Letter] = Upper; \
+ g_IsResidueChar[Upper] = true; \
+ g_IsResidueChar[Lower] = true; \
+ g_AlignChar[Upper] = Upper; \
+ g_AlignChar[Lower] = Upper; \
+ g_UnalignChar[Upper] = Lower; \
+ g_UnalignChar[Lower] = Lower; \
+ g_IsWildcardChar[Lower] = true; \
+ g_IsWildcardChar[Upper] = true; \
+ }
+
+ // Number of ordinary (non-wildcard) letters in the given alphabet:
+ // 20 for amino acids, 4 for RNA and DNA. Any other value is reported
+ // through Quit(); 0 is returned if Quit() comes back.
+ static unsigned GetAlphaSize(ALPHA Alpha)
+     {
+     if (ALPHA_Amino == Alpha)
+         return 20;
+
+     if (ALPHA_RNA == Alpha || ALPHA_DNA == Alpha)
+         return 4;
+
+     Quit("Invalid Alpha=%d", Alpha);
+     return 0;
+     }
+
+ // Reset every alphabet lookup table to its "nothing registered" state:
+ // char->letter tables are filled with 0xff bytes, letter->char and the
+ // align/unalign tables with '?', and the wildcard/residue flags with false.
+ // Resetting g_IsResidueChar as well means a later SetAlpha() call does not
+ // inherit residue flags from a previously selected alphabet.
+ static void InitArrays()
+     {
+     memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter));
+     memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx));
+
+     memset(g_LetterToChar, '?', sizeof(g_LetterToChar));
+     memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar));
+
+     // Each table is sized by its own sizeof.
+     memset(g_AlignChar, '?', sizeof(g_AlignChar));
+     memset(g_UnalignChar, '?', sizeof(g_UnalignChar));
+
+     memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar));
+     memset(g_IsResidueChar, 0, sizeof(g_IsResidueChar));
+     }
+
+ // Register c as a gap character: it maps to AX_GAP in the extended
+ // char->letter table, AX_GAP decodes back to it, and both the align and
+ // unalign mappings leave it unchanged.
+ static void SetGapChar(char c)
+     {
+     const unsigned char uGap = (unsigned char) c;
+
+     g_AlignChar[uGap] = uGap;
+     g_UnalignChar[uGap] = uGap;
+     g_CharToLetterEx[uGap] = AX_GAP;
+     g_LetterExToChar[AX_GAP] = uGap;
+     }
+
+ // Fill the alphabet tables for DNA: A, C, G and T are ordinary residues;
+ // the remaining symbols listed here are registered as wildcards.
+ static void SetAlphaDNA()
+ {
+ Res('A', NX_A)
+ Res('C', NX_C)
+ Res('G', NX_G)
+ Res('T', NX_T)
+ Wild('M', NX_M)
+ Wild('R', NX_R)
+ Wild('W', NX_W)
+ Wild('S', NX_S)
+ Wild('Y', NX_Y)
+ Wild('K', NX_K)
+ Wild('V', NX_V)
+ Wild('H', NX_H)
+ Wild('D', NX_D)
+ Wild('B', NX_B)
+ Wild('X', NX_X)
+ Wild('N', NX_N)
+ }
+
+ // Fill the alphabet tables for RNA: A, C, G, U and T are ordinary residues;
+ // the remaining symbols listed here are registered as wildcards.
+ // Order matters: NX_U and NX_T are the same code, so the later Res('T')
+ // overwrites the letter->char entry written by Res('U') and that code
+ // decodes as 'T'.
+ static void SetAlphaRNA()
+ {
+ Res('A', NX_A)
+ Res('C', NX_C)
+ Res('G', NX_G)
+ Res('U', NX_U)
+ Res('T', NX_T)
+ Wild('M', NX_M)
+ Wild('R', NX_R)
+ Wild('W', NX_W)
+ Wild('S', NX_S)
+ Wild('Y', NX_Y)
+ Wild('K', NX_K)
+ Wild('V', NX_V)
+ Wild('H', NX_H)
+ Wild('D', NX_D)
+ Wild('B', NX_B)
+ Wild('X', NX_X)
+ Wild('N', NX_N)
+ }
+
+ // Fill the alphabet tables for amino acids: the twenty letters below are
+ // ordinary residues; B, X and Z are registered as wildcards.
+ static void SetAlphaAmino()
+ {
+ Res('A', AX_A)
+ Res('C', AX_C)
+ Res('D', AX_D)
+ Res('E', AX_E)
+ Res('F', AX_F)
+ Res('G', AX_G)
+ Res('H', AX_H)
+ Res('I', AX_I)
+ Res('K', AX_K)
+ Res('L', AX_L)
+ Res('M', AX_M)
+ Res('N', AX_N)
+ Res('P', AX_P)
+ Res('Q', AX_Q)
+ Res('R', AX_R)
+ Res('S', AX_S)
+ Res('T', AX_T)
+ Res('V', AX_V)
+ Res('W', AX_W)
+ Res('Y', AX_Y)
+
+ Wild('B', AX_B)
+ Wild('X', AX_X)
+ Wild('Z', AX_Z)
+ }
+
+ // Select the active alphabet.
+ //
+ // All lookup tables are reset, '.' and '-' are registered as gap
+ // characters, the tables for the requested alphabet are filled in, and
+ // g_AlphaSize / g_Alpha are updated. An unknown Alpha is reported through
+ // Quit(). When g_bVerbose is set the chosen alphabet is logged.
+ void SetAlpha(ALPHA Alpha)
+     {
+     InitArrays();
+
+     SetGapChar('.');
+     SetGapChar('-');
+
+     switch (Alpha)
+         {
+     case ALPHA_Amino:
+         SetAlphaAmino();
+         break;
+
+     case ALPHA_DNA:
+         SetAlphaDNA();
+         // Must not fall through: the RNA tables would also register 'U'.
+         break;
+
+     case ALPHA_RNA:
+         SetAlphaRNA();
+         break;
+
+     default:
+         Quit("Invalid Alpha=%d", Alpha);
+         }
+
+     g_AlphaSize = GetAlphaSize(Alpha);
+     g_Alpha = Alpha;
+
+     if (g_bVerbose)
+         Log("Alphabet %s\n", ALPHAToStr(g_Alpha));
+     }
+
+ // Wildcard character for the active alphabet (g_Alpha): 'X' for amino
+ // acids, 'N' for DNA and RNA. Any other alphabet is reported through
+ // Quit(); '?' is returned if Quit() comes back.
+ char GetWildcardChar()
+     {
+     if (ALPHA_Amino == g_Alpha)
+         return 'X';
+
+     if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha)
+         return 'N';
+
+     Quit("Invalid Alpha=%d", g_Alpha);
+     return '?';
+     }
+
+ // True if c is one of A, C, G, T, U, R, Y or N in either case.
+ // The NUL character is rejected explicitly: strchr() treats the string
+ // terminator as part of the string and would otherwise report a match.
+ bool IsNucleo(char c)
+     {
+     return 0 != c && strchr("ACGTURYNacgturyn", c) != 0;
+     }
+
+ // True if c is one of A, G, C, T or N in either case.
+ // The NUL character is rejected explicitly: strchr() treats the string
+ // terminator as part of the string and would otherwise report a match.
+ bool IsDNA(char c)
+     {
+     return 0 != c && strchr("AGCTNagctn", c) != 0;
+     }
+
+ // True if c is one of A, G, C, U or N in either case.
+ // The NUL character is rejected explicitly: strchr() treats the string
+ // terminator as part of the string and would otherwise report a match.
+ bool IsRNA(char c)
+     {
+     return 0 != c && strchr("AGCUNagcun", c) != 0;
+     }
+
+static char InvalidLetters[256];
+static int InvalidLetterCount = 0;
+
+ // Forget all invalid letters recorded so far. The counter is reset along
+ // with the per-character flags; otherwise ReportInvalidLetters() would
+ // still emit a warning (with an empty letter list) after a clear.
+ void ClearInvalidLetterWarning()
+     {
+     memset(InvalidLetters, 0, sizeof(InvalidLetters));
+     InvalidLetterCount = 0;
+     }
+
+ // Record that the invalid letter c was seen: its flag is set and the
+ // running count of invalid letters is incremented. The second argument
+ // is not used here.
+ void InvalidLetterWarning(char c, char w)
+     {
+     const unsigned char uIndex = (unsigned char) c;
+     InvalidLetterCount += 1;
+     InvalidLetters[uIndex] = 1;
+     }
+
+ // If any invalid letters were recorded, issue a single warning that names
+ // the assumed alphabet and lists each distinct invalid character once, in
+ // ascending character-code order.
+ void ReportInvalidLetters()
+     {
+     if (InvalidLetterCount == 0)
+         return;
+
+     // 256 possible characters plus the terminator; zero-filled so the
+     // collected list is always NUL-terminated.
+     char Str[257] = { 0 };
+
+     int iLength = 0;
+     for (int iChar = 0; iChar != 256; ++iChar)
+         {
+         if (0 == InvalidLetters[iChar])
+             continue;
+         Str[iLength] = (char) iChar;
+         ++iLength;
+         }
+     Warning("Assuming %s (see -seqtype option), invalid letters found: %s",
+       ALPHAToStr(g_Alpha), Str);
+     }
Added: trunk/packages/muscle/branches/upstream/current/alpha.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/alpha.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/alpha.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,106 @@
+#ifndef alpha_h
+#define alpha_h
+
+bool StrHasAmino(const char *Str);
+bool StrHasGap(const char *Str);
+void ClearInvalidLetterWarning();
+void InvalidLetterWarning(char c, char w);
+void ReportInvalidLetters();
+
+extern unsigned g_CharToLetter[];
+extern unsigned g_CharToLetterEx[];
+
+extern char g_LetterToChar[];
+extern char g_LetterExToChar[];
+
+extern char g_UnalignChar[];
+extern char g_AlignChar[];
+
+extern bool g_IsWildcardChar[];
+extern bool g_IsResidueChar[];
+
+// Table accessors. Character arguments are cast to unsigned char before
+// indexing so that characters with negative char values stay in range.
+// Letter arguments (u) are used as indices unchanged.
+#define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)])
+#define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)])
+
+#define LetterToChar(u) (g_LetterToChar[u])
+#define LetterExToChar(u) (g_LetterExToChar[u])
+
+// Character classification. IsGapChar accepts '-' and '.' only and
+// evaluates its argument twice.
+#define IsResidueChar(c) (g_IsResidueChar[(unsigned char) (c)])
+#define IsGapChar(c) ('-' == (c) || '.' == (c))
+#define IsWildcardChar(c) (g_IsWildcardChar[(unsigned char) (c)])
+
+// Case mappings looked up in g_AlignChar / g_UnalignChar.
+#define AlignChar(c) (g_AlignChar[(unsigned char) (c)])
+#define UnalignChar(c) (g_UnalignChar[(unsigned char) (c)])
+
+ // AX=Amino alphabet with eXtensions (B, Z and X)
+ // The twenty standard residues come first (values 0..19), followed by the
+ // wildcard codes and finally the gap code. AX_COUNT is the total number
+ // of codes, gap included.
+ enum AX
+ {
+ AX_A,
+ AX_C,
+ AX_D,
+ AX_E,
+ AX_F,
+ AX_G,
+ AX_H,
+ AX_I,
+ AX_K,
+ AX_L,
+ AX_M,
+ AX_N,
+ AX_P,
+ AX_Q,
+ AX_R,
+ AX_S,
+ AX_T,
+ AX_V,
+ AX_W,
+ AX_Y,
+
+ AX_X, // Any
+
+ AX_B, // D or N
+ AX_Z, // E or Q
+
+ AX_GAP,
+ };
+ const unsigned AX_COUNT = AX_GAP + 1;
+
+ // NX=Nucleotide alphabet with extensions
+ // The four bases come first (values 0..3); U shares the code of T. They are
+ // followed by the ambiguity codes and finally the gap code. NX_COUNT is
+ // the total number of codes, gap included.
+ enum NX
+ {
+ NX_A,
+ NX_C,
+ NX_G,
+ NX_T,
+ NX_U = NX_T,
+
+ NX_M, // AC
+ NX_R, // AG
+ NX_W, // AT
+ NX_S, // CG
+ NX_Y, // CT
+ NX_K, // GT
+ NX_V, // ACG
+ NX_H, // ACT
+ NX_D, // AGT
+ NX_B, // CGT
+ NX_X, // GATC
+ NX_N, // GATC
+ NX_GAP
+ };
+ const unsigned NX_COUNT = NX_GAP + 1;
+
+const unsigned MAX_ALPHA = 20;
+const unsigned MAX_ALPHA_EX = AX_COUNT;
+const unsigned MAX_CHAR = 256;
+
+extern ALPHA g_Alpha;
+extern unsigned g_AlphaSize;
+
+void SetAlpha(ALPHA Alpha);
+char GetWildcardChar();
+bool IsNucleo(char c);
+bool IsDNA(char c);
+bool IsRNA(char c);
+
+#endif // alpha_h
Added: trunk/packages/muscle/branches/upstream/current/anchors.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/anchors.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/anchors.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,218 @@
+#include "muscle.h"
+#include "msa.h"
+#include "objscore.h"
+
+#define TRACE 0
+
+ // Smooth Score[0..uCount-1] with a centred moving average of width
+ // uWindowLength, writing the result to SmoothScore[0..uCount-1].
+ // Each input score is capped at dCeil before it enters the average.
+ // uWindowLength must be odd, otherwise Quit() is called. If uCount is
+ // not greater than uWindowLength every output is 0. Otherwise the first
+ // and last uWindowLength/2 outputs, where the window would not fit, are 0.
+ static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength,
+ SCORE SmoothScore[], double dCeil)
+ {
+ // Cap x at dCeil.
+#define Ceil(x) ((SCORE) ((x) > dCeil ? dCeil : (x)))
+
+ if (1 != uWindowLength%2)
+ Quit("WindowSmooth, length=%u", uWindowLength);
+
+ if (uCount <= uWindowLength)
+ {
+ for (unsigned i = 0; i < uCount; ++i)
+ SmoothScore[i] = 0;
+ return;
+ }
+
+ // Half-width of the window; the margins on both ends get 0.
+ const unsigned w2 = uWindowLength/2;
+ for (unsigned i = 0; i < w2; ++i)
+ {
+ SmoothScore[i] = 0;
+ SmoothScore[uCount - i - 1] = 0;
+ }
+
+ // Sum of the first full window, which is centred on index w2.
+ SCORE scoreWindowTotal = 0;
+ for (unsigned i = 0; i < uWindowLength; ++i)
+ {
+ scoreWindowTotal += Ceil(Score[i]);
+ }
+
+ // Slide the window one column at a time: emit the average for centre i,
+ // then drop the leftmost score and add the next one on the right.
+ for (unsigned i = w2; ; ++i)
+ {
+ SmoothScore[i] = scoreWindowTotal/uWindowLength;
+ // Last centre for which the window still fits.
+ if (i == uCount - w2 - 1)
+ break;
+
+ scoreWindowTotal -= Ceil(Score[i - w2]);
+ scoreWindowTotal += Ceil(Score[i + w2 + 1]);
+ }
+#undef Ceil
+ }
+
+ // Select the columns whose score clears a threshold "graded on a curve".
+ // A column's height is its position between the average score (height 0.0)
+ // and the maximum score (height 1.0); columns whose height is at least
+ // dThreshold are written, in ascending order, to BestCols, and their
+ // number is stored through ptruBestColCount.
+ static void FindBestColsGrade(const SCORE Score[], unsigned uCount,
+   double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount)
+     {
+     // One pass gathers both the total and the maximum.
+     SCORE scoreTotal = 0;
+     SCORE scoreMax = MINUS_INFINITY;
+     for (unsigned i = 0; i < uCount; ++i)
+         {
+         scoreTotal += Score[i];
+         if (Score[i] > scoreMax)
+             scoreMax = Score[i];
+         }
+     const SCORE scoreAvg = scoreTotal / uCount;
+
+     unsigned uSelected = 0;
+     for (unsigned i = 0; i < uCount; ++i)
+         {
+         const SCORE s = Score[i];
+         const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg);
+         if (dHeight >= dThreshold)
+             BestCols[uSelected++] = i;
+         }
+     *ptruBestColCount = uSelected;
+     }
+
+ // A column qualifies as "best" only when all of these hold:
+ // (1) its score is at least dMinScore,
+ // (2) its smoothed score is at least dMinSmoothScore,
+ // (3) it contains no gaps.
+ // Qualifying column indexes are written, in ascending order, to BestCols
+ // and their number is stored through ptruBestColCount.
+ static void FindBestColsCombo(const MSA &msa, const SCORE Score[],
+   const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore,
+   unsigned BestCols[], unsigned *ptruBestColCount)
+     {
+     const unsigned uColCount = msa.GetColCount();
+
+     unsigned uSelected = 0;
+     for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+         {
+         // Tests are evaluated left to right, so the gap scan only runs
+         // for columns that pass both score tests.
+         const bool bRejected =
+           Score[uCol] < dMinScore ||
+           SmoothScore[uCol] < dMinSmoothScore ||
+           msa.ColumnHasGap(uCol);
+         if (!bRejected)
+             BestCols[uSelected++] = uCol;
+         }
+     *ptruBestColCount = uSelected;
+     }
+
+ // Debug listing: after a header row, one log line per column showing the
+ // column index, the characters of every sequence in that column, the raw
+ // and smoothed scores, and a " <-- Best" marker for each occurrence of
+ // the column in BestCols.
+ static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[],
+   unsigned BestCols[], unsigned uBestColCount)
+     {
+     const unsigned uSeqCount = msa.GetSeqCount();
+     const unsigned uColCount = msa.GetColCount();
+
+     // Header: last digit of each sequence index.
+     Log("Col ");
+     for (unsigned uSeq = 0; uSeq != uSeqCount; ++uSeq)
+         Log("%u", uSeq%10);
+     Log(" ");
+
+     for (unsigned uCol = 0; uCol != uColCount; ++uCol)
+         {
+         Log("%3u ", uCol);
+         for (unsigned uSeq = 0; uSeq != uSeqCount; ++uSeq)
+             Log("%c", msa.GetChar(uSeq, uCol));
+
+         Log(" %10.3f", Score[uCol]);
+         Log(" %10.3f", SmoothScore[uCol]);
+
+         for (unsigned uBest = 0; uBest != uBestColCount; ++uBest)
+             {
+             if (uCol == BestCols[uBest])
+                 Log(" <-- Best");
+             }
+         Log("\n");
+         }
+     }
+
+ // Thin out the best columns so that at most one anchor is chosen per
+ // window of uWindowLength columns.
+ //
+ // BestCols is scanned left to right. Each window starts at the next
+ // unprocessed best column and covers every following best column that
+ // lies fewer than uWindowLength columns to its right.
+ //   - One best column in the window: it becomes the anchor.
+ //   - Two: the higher-scoring one is chosen (the later one on a tie).
+ //   - More than two: the one closest to the center of the window is
+ //     chosen (the earlier one on a tie).
+ // Anchors are written to AnchorCols and their number is stored through
+ // ptruAnchorColCount.
+ // NOTE(review): assumes BestCols is in ascending order, as produced by
+ // FindBestColsCombo - confirm for any other caller.
+ static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[],
+   unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[],
+   unsigned *ptruAnchorColCount)
+     {
+     unsigned uAnchorColCount = 0;
+     for (unsigned n = 0; n < uBestColCount; /* update inside loop */)
+         {
+         unsigned uBestColIndex = BestCols[n];
+
+         // Count the best columns after n that fall inside the window.
+         unsigned uCountWithinWindow = 0;
+         for (unsigned i = n + 1; i < uBestColCount; ++i)
+             {
+             unsigned uBestColIndex2 = BestCols[i];
+             if (uBestColIndex2 - uBestColIndex >= uWindowLength)
+                 break;
+             ++uCountWithinWindow;
+             }
+
+         unsigned uAnchorCol = uBestColIndex;
+         if (1 == uCountWithinWindow)
+             {
+             unsigned uBestColIndex2 = BestCols[n+1];
+             if (Scores[uBestColIndex] > Scores[uBestColIndex2])
+                 uAnchorCol = uBestColIndex;
+             else
+                 uAnchorCol = uBestColIndex2;
+             }
+         else if (uCountWithinWindow > 1)
+             {
+             // Measure distance from the window center, and consider every
+             // candidate in the window: entries n .. n+uCountWithinWindow.
+             const int iWindowCenter = (int) (uBestColIndex + uWindowLength/2);
+             int iClosestDist = (int) uWindowLength;
+             unsigned uClosestCol = uBestColIndex;
+             for (unsigned i = n; i <= n + uCountWithinWindow; ++i)
+                 {
+                 unsigned uColIndex = BestCols[i];
+                 int iDist = (int) uColIndex - iWindowCenter;
+                 if (iDist < 0)
+                     iDist = -iDist;
+                 if (iDist < iClosestDist)
+                     {
+                     uClosestCol = uColIndex;
+                     iClosestDist = iDist;
+                     }
+                 }
+             uAnchorCol = uClosestCol;
+             }
+         AnchorCols[uAnchorColCount] = uAnchorCol;
+         ++uAnchorColCount;
+
+         // Skip past everything that was merged into this window.
+         n += uCountWithinWindow + 1;
+         }
+     *ptruAnchorColCount = uAnchorColCount;
+     }
+
+ // Choose anchor columns for msa.
+ //
+ // Alignments with fewer than 16 columns get no anchors. Otherwise
+ // per-column letter scores are computed and smoothed, columns passing the
+ // score / smoothed-score / no-gap tests are collected, and nearby
+ // candidates are merged using g_uAnchorSpacing as the window. Results go
+ // to AnchorCols and *ptruAnchorColCount.
+ void FindAnchorCols(const MSA &msa, unsigned AnchorCols[],
+   unsigned *ptruAnchorColCount)
+     {
+     const unsigned uColCount = msa.GetColCount();
+     if (uColCount < 16)
+         {
+         *ptruAnchorColCount = 0;
+         return;
+         }
+
+     // Scratch buffers, one entry per column; released before returning.
+     SCORE *ptrMatchScore = new SCORE[uColCount];
+     SCORE *ptrSmoothScore = new SCORE[uColCount];
+     unsigned *ptrBestCols = new unsigned[uColCount];
+
+     GetLetterScores(msa, ptrMatchScore);
+     WindowSmooth(ptrMatchScore, uColCount, g_uSmoothWindowLength, ptrSmoothScore,
+       g_dSmoothScoreCeil);
+
+     unsigned uBestColCount;
+     FindBestColsCombo(msa, ptrMatchScore, ptrSmoothScore, g_dMinBestColScore,
+       g_dMinSmoothScore, ptrBestCols, &uBestColCount);
+
+#if TRACE
+     ListBestCols(msa, ptrMatchScore, ptrSmoothScore, ptrBestCols, uBestColCount);
+#endif
+
+     MergeBestCols(ptrMatchScore, ptrBestCols, uBestColCount, g_uAnchorSpacing,
+       AnchorCols, ptruAnchorColCount);
+
+     delete[] ptrBestCols;
+     delete[] ptrSmoothScore;
+     delete[] ptrMatchScore;
+     }
Added: trunk/packages/muscle/branches/upstream/current/bittraceback.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/bittraceback.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/bittraceback.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,206 @@
+#include "muscle.h"
+#include "pwpath.h"
+
+#define TRACE 0
+
+ // Translate an edge type for storage in a path: 'E' becomes 'D',
+ // 'J' becomes 'I', and every other character is returned unchanged.
+ static char XlatEdgeType(char c)
+     {
+     switch (c)
+         {
+     case 'E':
+         return 'D';
+     case 'J':
+         return 'I';
+     default:
+         return c;
+         }
+     }
+
+ // Render a traceback bit field as text of the form "?M ?D ?I", where each
+ // '?' is replaced by the predecessor state encoded for M, D and I.
+ // The result lives in a static buffer that is reused by every call; a
+ // position is only rewritten when its field matches one of the values
+ // tested below, so it otherwise keeps whatever an earlier call left there.
+ static const char *BitsToStr(char Bits)
+     {
+     static char Str[] = "xM xD xI";
+
+     // Predecessor of M.
+     if ((Bits & BIT_xM) == BIT_MM)
+         Str[0] = 'M';
+     else if ((Bits & BIT_xM) == BIT_DM)
+         Str[0] = 'D';
+     else if ((Bits & BIT_xM) == BIT_IM)
+         Str[0] = 'I';
+
+     // Predecessor of D.
+     if ((Bits & BIT_xD) == BIT_MD)
+         Str[3] = 'M';
+     else if ((Bits & BIT_xD) == BIT_DD)
+         Str[3] = 'D';
+
+     // Predecessor of I.
+     if ((Bits & BIT_xI) == BIT_MI)
+         Str[6] = 'M';
+     else if ((Bits & BIT_xI) == BIT_II)
+         Str[6] = 'I';
+
+     return Str;
+     }
+
+ // Decode one step of the traceback: given the bit field stored for a cell
+ // (Bits) and the type of the edge currently being processed (cType),
+ // return the edge type selected by the sub-field of Bits that belongs to
+ // cType. The 'E' and 'J' states are only compiled in when DOUBLE_AFFINE
+ // is set. An unknown cType, or a sub-field value matching none of the
+ // expected constants, is reported through Quit(); '?' is returned if
+ // Quit() comes back.
+ static inline char XChar(char Bits, char cType)
+ {
+ switch (cType)
+ {
+ case 'M':
+ {
+ // Sub-field selected by BIT_xM.
+ switch (Bits & BIT_xM)
+ {
+ case BIT_MM:
+ return 'M';
+ case BIT_DM:
+ return 'D';
+ case BIT_IM:
+ return 'I';
+#if DOUBLE_AFFINE
+ case BIT_EM:
+ return 'E';
+ case BIT_JM:
+ return 'J';
+#endif
+ }
+ Quit("Huh!?");
+ return '?';
+ }
+ case 'D':
+ {
+ // Sub-field selected by BIT_xD.
+ switch (Bits & BIT_xD)
+ {
+ case BIT_MD:
+ return 'M';
+ case BIT_DD:
+ return 'D';
+ }
+ Quit("Huh!?");
+ return '?';
+ }
+ case 'I':
+ {
+ // Sub-field selected by BIT_xI.
+ switch (Bits & BIT_xI)
+ {
+ case BIT_MI:
+ return 'M';
+ case BIT_II:
+ return 'I';
+ }
+ Quit("Huh!?");
+ return '?';
+ }
+#if DOUBLE_AFFINE
+ case 'E':
+ {
+ // Sub-field selected by BIT_xE.
+ switch (Bits & BIT_xE)
+ {
+ case BIT_ME:
+ return 'M';
+ case BIT_EE:
+ return 'E';
+ }
+ Quit("Huh!?");
+ return '?';
+ }
+ case 'J':
+ {
+ // Sub-field selected by BIT_xJ.
+ switch (Bits & BIT_xJ)
+ {
+ case BIT_MJ:
+ return 'M';
+ case BIT_JJ:
+ return 'J';
+ }
+ Quit("Huh!?");
+ return '?';
+ }
+#endif
+ default:
+ Quit("Huh?");
+ return '?';
+ }
+ }
+
+ // Rebuild an alignment path from a bit-encoded traceback matrix.
+ //
+ //   TraceBack           matrix indexed [prefix length A][prefix length B]
+ //   uLengthA, uLengthB  prefix lengths of the cell where the walk starts
+ //   LastEdge            type of the final edge of the alignment
+ //   Path                cleared, then filled with the edges in order
+ //
+ // Starting at (uLengthA, uLengthB) the walk prepends the current edge to
+ // Path, looks up the preceding edge type with XChar(), and steps back by
+ // one in A and/or B according to the current edge type. It stops when
+ // both prefix lengths reach zero. 'E' and 'J' edges are stored in Path as
+ // 'D' and 'I' respectively but keep their own type for the walk.
+ // An inconsistent matrix (stepping below zero, unknown edge type) is
+ // reported through Quit().
+ void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB,
+   char LastEdge, PWPath &Path)
+     {
+#if TRACE
+     Log("BitTraceBack\n");
+#endif
+     Path.Clear();
+
+     PWEdge Edge;
+     Edge.uPrefixLengthA = uLengthA;
+     Edge.uPrefixLengthB = uLengthB;
+     Edge.cType = LastEdge;
+     for (;;)
+         {
+#if TRACE
+         Log("Prepend %c%d.%d\n", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB);
+#endif
+         // Store the translated type, but keep walking with the real one.
+         char cSave = Edge.cType;
+         Edge.cType = XlatEdgeType(cSave);
+         Path.PrependEdge(Edge);
+         Edge.cType = cSave;
+
+         unsigned PLA = Edge.uPrefixLengthA;
+         unsigned PLB = Edge.uPrefixLengthB;
+         char Bits = TraceBack[PLA][PLB];
+         char NextEdgeType = XChar(Bits, Edge.cType);
+#if TRACE
+         Log("XChar(%s, %c) = %c\n", BitsToStr(Bits), Edge.cType, NextEdgeType);
+#endif
+         switch (Edge.cType)
+             {
+         case 'M':
+             {
+             if (Edge.uPrefixLengthA == 0)
+                 Quit("BitTraceBack MA=0");
+             if (Edge.uPrefixLengthB == 0)
+                 Quit("BitTraceBack MB=0");
+             --(Edge.uPrefixLengthA);
+             --(Edge.uPrefixLengthB);
+             break;
+             }
+         case 'D':
+         case 'E':
+             {
+             if (Edge.uPrefixLengthA == 0)
+                 Quit("BitTraceBack DA=0");
+             --(Edge.uPrefixLengthA);
+             break;
+             }
+         case 'I':
+         case 'J':
+             {
+             if (Edge.uPrefixLengthB == 0)
+                 Quit("BitTraceBack IB=0");
+             --(Edge.uPrefixLengthB);
+             break;
+             }
+         default:
+             // %c needs the character, not the whole PWEdge object.
+             Quit("BitTraceBack: Invalid edge %c", Edge.cType);
+             }
+
+         if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB)
+             break;
+
+         Edge.cType = NextEdgeType;
+         }
+
+#if TRACE
+     Path.LogMe();
+#endif
+     }
Added: trunk/packages/muscle/branches/upstream/current/blosumla.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/blosumla.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/blosumla.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,118 @@
+#include "muscle.h"
+
+#define GAPVAL 0.3
+#define GAPGAPVAL 5.0
+
+// Blosum62 log-average factor matrix
+// 20 x 20 table of float factors. Rows and columns follow the letter order
+// of the S_ROW parameter list: A C D E F G H I K L M N P Q R S T V W Y.
+// NOTE(review): the entries look symmetric (M[i][j] == M[j][i]) -- confirm
+// before relying on it.
+static float Blosum62LA[20][20] =
+ {
+// v(x) casts one literal to float.
+#define v(x) ((float) x)
+// S_ROW expands to one brace-enclosed row of 20 floats followed by a comma.
+// The first two arguments (row number n and letter c) are labels for the
+// reader only; they do not appear in the expansion.
+// Neither macro is #undef'd after the table.
+#define S_ROW(n, c, A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \
+ v(R), v(S), v(T), v(V), v(W), v(Y) },
+
+// Blosum62 log average matrix
+// A C D E F
+// G H I K L
+// M N P Q R
+// S T V W Y
+S_ROW( 0, 'A', 3.9029401, 0.8679881, 0.5446049, 0.7412640, 0.4648942,
+ 1.0568696, 0.5693654, 0.6324813, 0.7753898, 0.6019460,
+ 0.7231498, 0.5883077, 0.7541214, 0.7568035, 0.6126988,
+ 1.4721037, 0.9844022, 0.9364584, 0.4165484, 0.5426125)
+
+S_ROW( 1, 'C', 0.8679881, 19.5765802, 0.3014542, 0.2859347, 0.4389910,
+ 0.4203886, 0.3550472, 0.6534589, 0.3491296, 0.6422760,
+ 0.6113537, 0.3978026, 0.3795628, 0.3657796, 0.3089379,
+ 0.7384148, 0.7405530, 0.7558448, 0.4499807, 0.4342013)
+
+S_ROW( 2, 'D', 0.5446049, 0.3014542, 7.3979253, 1.6878109, 0.2989696,
+ 0.6343015, 0.6785593, 0.3390155, 0.7840905, 0.2866128,
+ 0.3464547, 1.5538520, 0.5987177, 0.8970811, 0.5732000,
+ 0.9135051, 0.6947898, 0.3365004, 0.2321050, 0.3456829)
+
+S_ROW( 3, 'E', 0.7412640, 0.2859347, 1.6878109, 5.4695276, 0.3307441,
+ 0.4812675, 0.9600400, 0.3305223, 1.3082782, 0.3728734,
+ 0.5003421, 0.9112983, 0.6792027, 1.9017376, 0.9607983,
+ 0.9503570, 0.7414260, 0.4289431, 0.3743021, 0.4964664)
+
+S_ROW( 4, 'F', 0.4648942, 0.4389910, 0.2989696, 0.3307441, 8.1287983,
+ 0.3406407, 0.6519893, 0.9457698, 0.3440433, 1.1545978,
+ 1.0043715, 0.3542882, 0.2874440, 0.3339729, 0.3807263,
+ 0.4399736, 0.4816930, 0.7450894, 1.3743775, 2.7693817)
+
+S_ROW( 5, 'G', 1.0568696, 0.4203886, 0.6343015, 0.4812675, 0.3406407,
+ 6.8763075, 0.4929663, 0.2750096, 0.5888716, 0.2845039,
+ 0.3954865, 0.8637114, 0.4773858, 0.5386498, 0.4499840,
+ 0.9035965, 0.5792712, 0.3369551, 0.4216898, 0.3487141)
+
+S_ROW( 6, 'H', 0.5693654, 0.3550472, 0.6785593, 0.9600400, 0.6519893,
+ 0.4929663, 13.5060070, 0.3262878, 0.7788884, 0.3806759,
+ 0.5841316, 1.2220028, 0.4728797, 1.1679835, 0.9170473,
+ 0.7367319, 0.5575021, 0.3394474, 0.4440859, 1.7979036)
+
+S_ROW( 7, 'I', 0.6324813, 0.6534589, 0.3390155, 0.3305223, 0.9457698,
+ 0.2750096, 0.3262878, 3.9979299, 0.3963730, 1.6944349,
+ 1.4777449, 0.3279345, 0.3846629, 0.3829375, 0.3547509,
+ 0.4431634, 0.7798163, 2.4175121, 0.4088732, 0.6303898)
+
+S_ROW( 8, 'K', 0.7753898, 0.3491296, 0.7840905, 1.3082782, 0.3440433,
+ 0.5888716, 0.7788884, 0.3963730, 4.7643359, 0.4282702,
+ 0.6253033, 0.9398419, 0.7037741, 1.5543233, 2.0768092,
+ 0.9319192, 0.7929060, 0.4565429, 0.3589319, 0.5321784)
+
+S_ROW( 9, 'L', 0.6019460, 0.6422760, 0.2866128, 0.3728734, 1.1545978,
+ 0.2845039, 0.3806759, 1.6944349, 0.4282702, 3.7966214,
+ 1.9942957, 0.3100430, 0.3711219, 0.4773261, 0.4739194,
+ 0.4288939, 0.6603292, 1.3142355, 0.5680359, 0.6920589)
+
+S_ROW(10, 'M', 0.7231498, 0.6113537, 0.3464547, 0.5003421, 1.0043715,
+ 0.3954865, 0.5841316, 1.4777449, 0.6253033, 1.9942957,
+ 6.4814549, 0.4745299, 0.4238960, 0.8642486, 0.6226249,
+ 0.5985578, 0.7938018, 1.2689365, 0.6103022, 0.7083636)
+
+S_ROW(11, 'N', 0.5883077, 0.3978026, 1.5538520, 0.9112983, 0.3542882,
+ 0.8637114, 1.2220028, 0.3279345, 0.9398419, 0.3100430,
+ 0.4745299, 7.0940964, 0.4999337, 1.0005835, 0.8586298,
+ 1.2315289, 0.9841525, 0.3690340, 0.2777841, 0.4860309)
+
+S_ROW(12, 'P', 0.7541214, 0.3795628, 0.5987177, 0.6792027, 0.2874440,
+ 0.4773858, 0.4728797, 0.3846629, 0.7037741, 0.3711219,
+ 0.4238960, 0.4999337, 12.8375452, 0.6412803, 0.4815348,
+ 0.7555033, 0.6888962, 0.4430825, 0.2818321, 0.3635216)
+
+S_ROW(13, 'Q', 0.7568035, 0.3657796, 0.8970811, 1.9017376, 0.3339729,
+ 0.5386498, 1.1679835, 0.3829375, 1.5543233, 0.4773261,
+ 0.8642486, 1.0005835, 0.6412803, 6.2444210, 1.4057958,
+ 0.9655559, 0.7913219, 0.4667781, 0.5093584, 0.6110951)
+
+S_ROW(14, 'R', 0.6126988, 0.3089379, 0.5732000, 0.9607983, 0.3807263,
+ 0.4499840, 0.9170473, 0.3547509, 2.0768092, 0.4739194,
+ 0.6226249, 0.8586298, 0.4815348, 1.4057958, 6.6655769,
+ 0.7671661, 0.6777544, 0.4200721, 0.3951049, 0.5559652)
+
+S_ROW(15, 'S', 1.4721037, 0.7384148, 0.9135051, 0.9503570, 0.4399736,
+ 0.9035965, 0.7367319, 0.4431634, 0.9319192, 0.4288939,
+ 0.5985578, 1.2315289, 0.7555033, 0.9655559, 0.7671661,
+ 3.8428476, 1.6139205, 0.5652240, 0.3853031, 0.5575206)
+
+S_ROW(16, 'T', 0.9844022, 0.7405530, 0.6947898, 0.7414260, 0.4816930,
+ 0.5792712, 0.5575021, 0.7798163, 0.7929060, 0.6603292,
+ 0.7938018, 0.9841525, 0.6888962, 0.7913219, 0.6777544,
+ 1.6139205, 4.8321048, 0.9809432, 0.4309317, 0.5731577)
+
+S_ROW(17, 'V', 0.9364584, 0.7558448, 0.3365004, 0.4289431, 0.7450894,
+ 0.3369551, 0.3394474, 2.4175121, 0.4565429, 1.3142355,
+ 1.2689365, 0.3690340, 0.4430825, 0.4667781, 0.4200721,
+ 0.5652240, 0.9809432, 3.6921553, 0.3744576, 0.6580390)
+
+S_ROW(18, 'W', 0.4165484, 0.4499807, 0.2321050, 0.3743021, 1.3743775,
+ 0.4216898, 0.4440859, 0.4088732, 0.3589319, 0.5680359,
+ 0.6103022, 0.2777841, 0.2818321, 0.5093584, 0.3951049,
+ 0.3853031, 0.4309317, 0.3744576, 38.1077830, 2.1098056)
+
+S_ROW(19, 'Y', 0.5426125, 0.4342013, 0.3456829, 0.4964664, 2.7693817,
+ 0.3487141, 1.7979036, 0.6303898, 0.5321784, 0.6920589,
+ 0.7083636, 0.4860309, 0.3635216, 0.6110951, 0.5559652,
+ 0.5575206, 0.5731577, 0.6580390, 2.1098056, 9.8322054)
+ };
Added: trunk/packages/muscle/branches/upstream/current/clust.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clust.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clust.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,666 @@
+#include "muscle.h"
+#include "clust.h"
+#include "clustset.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+// Constructs an empty clustering. Every pointer member starts out null so
+// that the destructor's delete[] calls are safe even when Create() is never
+// called (delete[] of a null pointer is a no-op; of an uninitialised one it
+// is undefined behaviour).
+Clust::Clust()
+    {
+    m_Nodes = 0;
+    m_ClusterIndexToNodeIndex = 0;
+    m_NodeIndexToClusterIndex = 0;
+    m_dDist = 0;
+    m_ptrSet = 0;
+    m_ptrClusterList = 0;
+
+    m_uNodeCount = 0;
+    m_uLeafCount = 0;
+    m_uClusterCount = 0;
+    m_uTriangularMatrixSize = 0;
+
+// m_CentroidStyle is assigned together with m_JoinStyle by Create().
+    m_JoinStyle = JOIN_Undefined;
+    }
+
+// Releases the three arrays this object deletes: the cluster-to-node index
+// map, the distance vector and the node array.
+Clust::~Clust()
+    {
+    delete[] m_ClusterIndexToNodeIndex;
+    delete[] m_dDist;
+    delete[] m_Nodes;
+    }
+
+// Builds the complete cluster tree for the leaves supplied by Set.
+//   Set     provides the leaf count and the pairwise leaf distances
+//   Method  clustering method; selects the join style and the linkage
+// Quits on an unknown method or when there are fewer than two leaves.
+void Clust::Create(ClustSet &Set, CLUSTER Method)
+    {
+    m_ptrSet = &Set;
+
+// Records the leaf count and allocates the distance vector.
+    SetLeafCount(Set.GetLeafCount());
+
+    switch (Method)
+        {
+    case CLUSTER_UPGMA:
+        m_CentroidStyle = LINKAGE_Avg;
+        m_JoinStyle = JOIN_NearestNeighbor;
+        break;
+
+    case CLUSTER_UPGMAMax:
+        m_CentroidStyle = LINKAGE_Max;
+        m_JoinStyle = JOIN_NearestNeighbor;
+        break;
+
+    case CLUSTER_UPGMAMin:
+        m_CentroidStyle = LINKAGE_Min;
+        m_JoinStyle = JOIN_NearestNeighbor;
+        break;
+
+    case CLUSTER_UPGMB:
+        m_CentroidStyle = LINKAGE_Biased;
+        m_JoinStyle = JOIN_NearestNeighbor;
+        break;
+
+    case CLUSTER_NeighborJoining:
+        m_CentroidStyle = LINKAGE_NeighborJoining;
+        m_JoinStyle = JOIN_NeighborJoining;
+        break;
+
+    default:
+        Quit("Clust::Create, invalid method %d", Method);
+        }
+
+    if (m_uLeafCount <= 1)
+        Quit("Clust::Create: no leaves");
+
+    m_uNodeCount = 2*m_uLeafCount - 1;
+    m_Nodes = new ClustNode[m_uNodeCount];
+    m_ClusterIndexToNodeIndex = new unsigned[m_uLeafCount];
+
+// Leaves occupy the first m_uLeafCount slots; each starts as a
+// one-member cluster on the cluster list.
+    m_ptrClusterList = 0;
+    for (unsigned uLeaf = 0; uLeaf < m_uLeafCount; ++uLeaf)
+        {
+        ClustNode &Leaf = m_Nodes[uLeaf];
+        Leaf.m_uIndex = uLeaf;
+        Leaf.m_uSize = 1;
+        Leaf.m_uLeafIndexes = new unsigned[1];
+        Leaf.m_uLeafIndexes[0] = uLeaf;
+        AddToClusterList(uLeaf);
+        }
+
+// The remaining slots are internal nodes, empty until they are joined.
+    for (unsigned uInternal = m_uLeafCount; uInternal < m_uNodeCount; ++uInternal)
+        {
+        ClustNode &Internal = m_Nodes[uInternal];
+        Internal.m_uIndex = uInternal;
+        Internal.m_uSize = 0;
+        }
+
+// Compute initial distance matrix between leaves
+    SetProgressDesc("Build dist matrix");
+    const unsigned uPairCount = (m_uLeafCount*(m_uLeafCount - 1))/2;
+    unsigned uPairsDone = 0;
+    for (unsigned uRow = 0; uRow < m_uLeafCount; ++uRow)
+        {
+        for (unsigned uCol = 0; uCol < uRow; ++uCol)
+            {
+            const float dLeafDist = (float) m_ptrSet->ComputeDist(*this, uRow, uCol);
+            SetDist(uRow, uCol, dLeafDist);
+        // Report progress once every 10000 pairs.
+            if (0 == uPairsDone%10000)
+                Progress(uPairsDone, uPairCount);
+            ++uPairsDone;
+            }
+        }
+    ProgressStepsDone();
+
+// Call CreateCluster once for each internal node in the tree
+    SetProgressDesc("Build guide tree");
+    m_uClusterCount = m_uLeafCount;
+    const unsigned uInternalNodeCount = m_uNodeCount - m_uLeafCount;
+    for (unsigned uStep = 1; uStep <= uInternalNodeCount; ++uStep)
+        {
+        Progress(uStep, uInternalNodeCount);
+        CreateCluster();
+        }
+    ProgressStepsDone();
+    }
+
+// Performs one agglomeration step: chooses the pair of clusters to merge,
+// joins them under the next free internal node and fills in the distances
+// from the new node to every remaining cluster.
+void Clust::CreateCluster()
+    {
+    unsigned uLeftNodeIndex;
+    unsigned uRightNodeIndex;
+    float dLeftLength;
+    float dRightLength;
+    ChooseJoin(&uLeftNodeIndex, &uRightNodeIndex, &dLeftLength, &dRightLength);
+
+// With L leaves, 2L-1 nodes and L clusters at the first call this yields L,
+// the first internal node; each later call yields the next index.
+    const unsigned uNewNodeIndex = m_uNodeCount - m_uClusterCount + 1;
+
+    JoinNodes(uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength,
+      uNewNodeIndex);
+
+#if TRACE
+    Log("Merge New=%u L=%u R=%u Ld=%7.2g Rd=%7.2g\n",
+      uNewNodeIndex, uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength);
+#endif
+
+// Compute distances to other clusters
+    --m_uClusterCount;
+    for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
+      uNodeIndex = GetNextCluster(uNodeIndex))
+        {
+        if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
+            continue;
+
+        if (uNewNodeIndex == uNodeIndex)
+            continue;
+
+        const float dDist = ComputeDist(uNewNodeIndex, uNodeIndex);
+        SetDist(uNewNodeIndex, uNodeIndex, dDist);
+        }
+
+// The metric pass only does work when REDLACK is defined, so the whole
+// traversal is compiled out otherwise instead of walking the list for
+// nothing.
+// NOTE(review): REDLACK may be a misspelling of REDBLACK -- confirm.
+#if REDLACK
+    for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
+      uNodeIndex = GetNextCluster(uNodeIndex))
+        {
+        if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
+            continue;
+
+        if (uNewNodeIndex == uNodeIndex)
+            continue;
+
+        const float dMetric = ComputeMetric(uNewNodeIndex, uNodeIndex);
+        InsertMetric(uNewNodeIndex, uNodeIndex, dMetric);
+        }
+#endif
+    }
+
+// Dispatches to the join-selection routine that matches m_JoinStyle.
+// All four output pointers are passed through to that routine unchanged.
+// Quits if the join style is neither nearest-neighbor nor neighbor-joining.
+void Clust::ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+  float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    if (JOIN_NearestNeighbor == m_JoinStyle)
+        {
+        ChooseJoinNearestNeighbor(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
+          ptrdRightLength);
+        return;
+        }
+
+    if (JOIN_NeighborJoining == m_JoinStyle)
+        {
+        ChooseJoinNeighborJoining(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
+          ptrdRightLength);
+        return;
+        }
+
+    Quit("Clust::ChooseJoin, Invalid join style %u", m_JoinStyle);
+    }
+
+// Chooses the pair of clusters with the smallest metric and derives the
+// branch lengths from the new parent down to each child.
+//   ptruLeftIndex, ptruRightIndex    receive the node indexes of the pair
+//   ptrdLeftLength, ptrdRightLength  receive half the pair distance minus
+//                                    the height of the respective child
+void Clust::ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex,
+  unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    unsigned uMinLeftNodeIndex;
+    unsigned uMinRightNodeIndex;
+    GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
+
+    const float dMinDist = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
+
+    const float dLeftHeight = GetHeight(uMinLeftNodeIndex);
+    const float dRightHeight = GetHeight(uMinRightNodeIndex);
+
+    *ptruLeftIndex = uMinLeftNodeIndex;
+    *ptruRightIndex = uMinRightNodeIndex;
+    *ptrdLeftLength = dMinDist/2 - dLeftHeight;
+    *ptrdRightLength = dMinDist/2 - dRightHeight;
+    }
+
+// Chooses the pair of clusters with the smallest metric and computes the
+// two branch lengths from the pair distance and the Calc_r() values:
+//   left  = (d(L,R) + r(L) - r(R)) / 2
+//   right = (d(L,R) - r(L) + r(R)) / 2
+//   ptruLeftIndex, ptruRightIndex    receive the node indexes of the pair
+//   ptrdLeftLength, ptrdRightLength  receive the branch lengths above
+void Clust::ChooseJoinNeighborJoining(unsigned *ptruLeftIndex,
+  unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    unsigned uMinLeftNodeIndex;
+    unsigned uMinRightNodeIndex;
+    GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
+
+    const float dDistLR = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
+    const float rL = Calc_r(uMinLeftNodeIndex);
+    const float rR = Calc_r(uMinRightNodeIndex);
+
+    const float dLeftLength = (dDistLR + rL - rR)/2;
+    const float dRightLength = (dDistLR - rL + rR)/2;
+
+    *ptruLeftIndex = uMinLeftNodeIndex;
+    *ptruRightIndex = uMinRightNodeIndex;
+    *ptrdLeftLength = dLeftLength;
+    *ptrdRightLength = dRightLength;
+    }
+
+// Makes node uNodeIndex the parent of uLeftIndex and uRightIndex.
+//   dLeftLength, dRightLength  branch lengths stored on the two children
+// The parent's leaf list becomes the left child's leaves followed by the
+// right child's. On the cluster list the two children are replaced by the
+// parent.
+void Clust::JoinNodes(unsigned uLeftIndex, unsigned uRightIndex, float dLeftLength,
+  float dRightLength, unsigned uNodeIndex)
+    {
+    ClustNode &Parent = m_Nodes[uNodeIndex];
+    ClustNode &Left = m_Nodes[uLeftIndex];
+    ClustNode &Right = m_Nodes[uRightIndex];
+
+// Wire up the tree links in both directions.
+    Parent.m_ptrLeft = &Left;
+    Parent.m_ptrRight = &Right;
+    Left.m_ptrParent = &Parent;
+    Right.m_ptrParent = &Parent;
+
+    Left.m_dLength = dLeftLength;
+    Right.m_dLength = dRightLength;
+
+    const unsigned uLeftSize = Left.m_uSize;
+    const unsigned uRightSize = Right.m_uSize;
+    Parent.m_uSize = uLeftSize + uRightSize;
+
+// The parent must not own a leaf list yet.
+    assert(0 == Parent.m_uLeafIndexes);
+    Parent.m_uLeafIndexes = new unsigned[Parent.m_uSize];
+
+    unsigned *ptrOut = Parent.m_uLeafIndexes;
+    for (unsigned n = 0; n < uLeftSize; ++n)
+        *ptrOut++ = Left.m_uLeafIndexes[n];
+    for (unsigned n = 0; n < uRightSize; ++n)
+        *ptrOut++ = Right.m_uLeafIndexes[n];
+
+    DeleteFromClusterList(uLeftIndex);
+    DeleteFromClusterList(uRightIndex);
+    AddToClusterList(uNodeIndex);
+    }
+
+// Returns the sum of distances from uNodeIndex to every other cluster on
+// the cluster list, divided by (cluster count - 2). Returns 0 when exactly
+// two clusters remain, which avoids the division by zero.
+float Clust::Calc_r(unsigned uNodeIndex) const
+    {
+    const unsigned uClusterCount = GetClusterCount();
+    if (2 == uClusterCount)
+        return 0;
+
+    float dTotal = 0;
+    unsigned uOther = GetFirstCluster();
+    while (uOther != uInsane)
+        {
+        if (uOther != uNodeIndex)
+            dTotal += GetDist(uNodeIndex, uOther);
+        uOther = GetNextCluster(uOther);
+        }
+    return dTotal/(uClusterCount - 2);
+    }
+
+// Returns the distance between the freshly joined node uNewNodeIndex and an
+// existing cluster uNodeIndex, using the routine selected by
+// m_CentroidStyle. Quits (and then returns NaN) on an unknown style.
+float Clust::ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    if (LINKAGE_Avg == m_CentroidStyle)
+        return ComputeDistAverageLinkage(uNewNodeIndex, uNodeIndex);
+
+    if (LINKAGE_Min == m_CentroidStyle)
+        return ComputeDistMinLinkage(uNewNodeIndex, uNodeIndex);
+
+    if (LINKAGE_Max == m_CentroidStyle)
+        return ComputeDistMaxLinkage(uNewNodeIndex, uNodeIndex);
+
+    if (LINKAGE_Biased == m_CentroidStyle)
+        return ComputeDistMAFFT(uNewNodeIndex, uNodeIndex);
+
+    if (LINKAGE_NeighborJoining == m_CentroidStyle)
+        return ComputeDistNeighborJoining(uNewNodeIndex, uNodeIndex);
+
+    Quit("Clust::ComputeDist, invalid centroid style %u", m_CentroidStyle);
+    return (float) g_dNAN;
+    }
+
+// Minimum linkage: the smaller of the distances from uNodeIndex to the two
+// children of uNewNodeIndex.
+float Clust::ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeft = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRight = GetRightIndex(uNewNodeIndex);
+    const float dToLeft = GetDist(uLeft, uNodeIndex);
+    const float dToRight = GetDist(uRight, uNodeIndex);
+    if (dToLeft < dToRight)
+        return dToLeft;
+    return dToRight;
+    }
+
+// Maximum linkage: the larger of the distances from uNodeIndex to the two
+// children of uNewNodeIndex.
+float Clust::ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeft = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRight = GetRightIndex(uNewNodeIndex);
+    const float dToLeft = GetDist(uLeft, uNodeIndex);
+    const float dToRight = GetDist(uRight, uNodeIndex);
+    if (dToLeft > dToRight)
+        return dToLeft;
+    return dToRight;
+    }
+
+// Average linkage: the unweighted mean of the distances from uNodeIndex to
+// the two children of uNewNodeIndex (cluster sizes are not taken into
+// account).
+float Clust::ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeft = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRight = GetRightIndex(uNewNodeIndex);
+    const float dToLeft = GetDist(uLeft, uNodeIndex);
+    const float dToRight = GetDist(uRight, uNodeIndex);
+    const float dSum = dToLeft + dToRight;
+    return dSum/2;
+    }
+
+// Neighbor-joining update: with L and R the children of uNewNodeIndex and
+// k = uNodeIndex, returns (d(L,k) + d(R,k) - d(L,R)) / 2.
+float Clust::ComputeDistNeighborJoining(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeft = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRight = GetRightIndex(uNewNodeIndex);
+    const float dBetweenChildren = GetDist(uLeft, uRight);
+    const float dToLeft = GetDist(uLeft, uNodeIndex);
+    const float dToRight = GetDist(uRight, uNodeIndex);
+    return (dToLeft + dToRight - dBetweenChildren)/2;
+    }
+
+// This is a mysterious variant of UPGMA reverse-engineered from MAFFT source.
+// Blends minimum and average linkage with weight g_dSUEFF:
+//   min(dL, dR)*(1 - g_dSUEFF) + (dL + dR)*g_dSUEFF/2
+// where dL and dR are the distances from uNodeIndex to the two children of
+// uNewNodeIndex.
+float Clust::ComputeDistMAFFT(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeft = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRight = GetRightIndex(uNewNodeIndex);
+
+// The child-to-child distance is looked up but does not enter the formula.
+    const float dBetweenChildren = GetDist(uLeft, uRight);
+    (void) dBetweenChildren;
+
+    const float dToLeft = GetDist(uLeft, uNodeIndex);
+    const float dToRight = GetDist(uRight, uNodeIndex);
+
+    float dMinDistLR = dToRight;
+    if (dToLeft < dToRight)
+        dMinDistLR = dToLeft;
+    const float dSumDistLR = dToLeft + dToRight;
+
+    const float dDist = dMinDistLR*(1 - g_dSUEFF) + dSumDistLR*g_dSUEFF/2;
+    return dDist;
+    }
+
+// Returns the current value of m_uClusterCount.
+unsigned Clust::GetClusterCount() const
+    {
+    const unsigned uCount = m_uClusterCount;
+    return uCount;
+    }
+
+// Writes a human-readable dump of this object to the log: a summary line,
+// the lower triangle of the distance matrix, then one table row per node.
+void Clust::LogMe() const
+ {
+ Log("Clust %u leaves, %u nodes, %u clusters.\n",
+ m_uLeafCount, m_uNodeCount, m_uClusterCount);
+
+// Column headers and the matrix rows below all stop one short of the node
+// count, so the last node (the root index) gets no row or column.
+ Log("Distance matrix\n");
+ const unsigned uNodeCount = GetNodeCount();
+ Log(" ");
+ for (unsigned i = 0; i < uNodeCount - 1; ++i)
+ Log(" %7u", i);
+ Log("\n");
+
+ Log(" ");
+ for (unsigned i = 0; i < uNodeCount - 1; ++i)
+ Log(" ------");
+ Log("\n");
+
+// Lower triangle only (j < i).
+// NOTE(review): entries that were never passed to SetDist are read here
+// as-is; the vector is not zero-filled when allocated -- confirm this is
+// acceptable for a debug dump.
+ for (unsigned i = 0; i < uNodeCount - 1; ++i)
+ {
+ Log("%4u: ", i);
+ for (unsigned j = 0; j < i; ++j)
+ Log(" %7.2g", GetDist(i, j));
+ Log("\n");
+ }
+
+// Node table: parent, left and right columns are left blank when the
+// corresponding pointer is null.
+ Log("\n");
+ Log("Node Size Prnt Left Rght Length Name\n");
+ Log("---- ---- ---- ---- ---- ------ ----\n");
+ for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+ {
+ const ClustNode &Node = m_Nodes[uNodeIndex];
+ Log("%4u %4u", uNodeIndex, Node.m_uSize);
+ if (0 != Node.m_ptrParent)
+ Log(" %4u", Node.m_ptrParent->m_uIndex);
+ else
+ Log(" ");
+
+ if (0 != Node.m_ptrLeft)
+ Log(" %4u", Node.m_ptrLeft->m_uIndex);
+ else
+ Log(" ");
+
+ if (0 != Node.m_ptrRight)
+ Log(" %4u", Node.m_ptrRight->m_uIndex);
+ else
+ Log(" ");
+
+// The last node is the root index (see GetRootNodeIndex); no branch length
+// is printed for it.
+ if (uNodeIndex != m_uNodeCount - 1)
+ Log(" %7.3g", Node.m_dLength);
+// Names are only looked up for leaves.
+ if (IsLeaf(uNodeIndex))
+ {
+ const char *ptrName = GetNodeName(uNodeIndex);
+ if (0 != ptrName)
+ Log(" %s", ptrName);
+ }
+ if (GetRootNodeIndex() == uNodeIndex)
+ Log(" [ROOT]");
+ Log("\n");
+ }
+ }
+
+// Returns the node stored at uNodeIndex; quits if the index is not below
+// m_uNodeCount.
+const ClustNode &Clust::GetNode(unsigned uNodeIndex) const
+    {
+    const bool bInRange = (uNodeIndex < m_uNodeCount);
+    if (!bInRange)
+        Quit("ClustNode::GetNode(%u) %u", uNodeIndex, m_uNodeCount);
+    return m_Nodes[uNodeIndex];
+    }
+
+// True when uNodeIndex falls in the leaf range, i.e. is below m_uLeafCount.
+bool Clust::IsLeaf(unsigned uNodeIndex) const
+    {
+    return m_uLeafCount > uNodeIndex;
+    }
+
+// Returns the m_uSize field of node uNodeIndex (range-checked by GetNode).
+unsigned Clust::GetClusterSize(unsigned uNodeIndex) const
+    {
+    return GetNode(uNodeIndex).m_uSize;
+    }
+
+// Returns the index of the left child of uNodeIndex; quits if the node has
+// no left child.
+unsigned Clust::GetLeftIndex(unsigned uNodeIndex) const
+    {
+    const ClustNode *ptrLeft = GetNode(uNodeIndex).m_ptrLeft;
+    if (0 == ptrLeft)
+        Quit("Clust::GetLeftIndex: leaf");
+    return ptrLeft->m_uIndex;
+    }
+
+// Returns the index of the right child of uNodeIndex; quits if the node has
+// no right child.
+unsigned Clust::GetRightIndex(unsigned uNodeIndex) const
+    {
+    const ClustNode *ptrRight = GetNode(uNodeIndex).m_ptrRight;
+    if (0 == ptrRight)
+        Quit("Clust::GetRightIndex: leaf");
+    return ptrRight->m_uIndex;
+    }
+
+// Returns the m_dLength field of node uNodeIndex (range-checked by GetNode).
+float Clust::GetLength(unsigned uNodeIndex) const
+    {
+    return GetNode(uNodeIndex).m_dLength;
+    }
+
+// Records the number of leaves and allocates the distance vector for the
+// resulting node count (2*leaves - 1). Quits if uLeafCount is 0 or 1.
+// The vector's contents are not initialised here.
+void Clust::SetLeafCount(unsigned uLeafCount)
+    {
+    if (uLeafCount <= 1)
+        Quit("Clust::SetLeafCount(%u)", uLeafCount);
+
+    m_uLeafCount = uLeafCount;
+    const unsigned uNodeCount = GetNodeCount();
+
+// Triangular matrix size excluding diagonal (all zeros in our case).
+    m_uTriangularMatrixSize = (uNodeCount*(uNodeCount - 1))/2;
+
+// Free any vector left over from an earlier call so that calling this (or
+// Create) again does not leak it. The pointer is cleared before the new
+// allocation so the destructor never sees a stale value if new throws.
+    delete[] m_dDist;
+    m_dDist = 0;
+    m_dDist = new float[m_uTriangularMatrixSize];
+    }
+
+// Returns the current value of m_uLeafCount.
+unsigned Clust::GetLeafCount() const
+    {
+    const unsigned uCount = m_uLeafCount;
+    return uCount;
+    }
+
+// Maps an unordered pair of node indexes to its slot in the triangular
+// distance vector: lo + hi*(hi - 1)/2, where hi is the larger index.
+// Quits if either index is not below the node count.
+// NOTE(review): equal indexes are not rejected here; the vector excludes
+// the diagonal, so such a call yields the slot of another pair -- confirm
+// that no caller passes uIndex1 == uIndex2.
+unsigned Clust::VectorIndex(unsigned uIndex1, unsigned uIndex2) const
+    {
+    const unsigned uNodeCount = GetNodeCount();
+    if (uIndex1 >= uNodeCount || uIndex2 >= uNodeCount)
+        Quit("DistVectorIndex(%u,%u) %u", uIndex1, uIndex2, uNodeCount);
+
+    unsigned uHi = uIndex1;
+    unsigned uLo = uIndex2;
+    if (uIndex1 < uIndex2)
+        {
+        uHi = uIndex2;
+        uLo = uIndex1;
+        }
+
+    const unsigned v = uLo + (uHi*(uHi - 1))/2;
+    assert(v < m_uTriangularMatrixSize);
+    return v;
+    }
+
+// Returns the stored distance for the pair (uIndex1, uIndex2); the order of
+// the two indexes does not matter (see VectorIndex).
+float Clust::GetDist(unsigned uIndex1, unsigned uIndex2) const
+    {
+    return m_dDist[VectorIndex(uIndex1, uIndex2)];
+    }
+
+// Stores dDist as the distance for the pair (uIndex1, uIndex2); the order
+// of the two indexes does not matter (see VectorIndex).
+void Clust::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist)
+    {
+    m_dDist[VectorIndex(uIndex1, uIndex2)] = dDist;
+    }
+
+// Returns the height of a node: 0 for a leaf, otherwise the mean of
+// (branch length + height) taken over the left and the right child.
+// Recurses down both subtrees.
+float Clust::GetHeight(unsigned uNodeIndex) const
+    {
+    if (IsLeaf(uNodeIndex))
+        return 0;
+
+    const unsigned uLeft = GetLeftIndex(uNodeIndex);
+    const unsigned uRight = GetRightIndex(uNodeIndex);
+    const float dLeftBranch = GetLength(uLeft);
+    const float dRightBranch = GetLength(uRight);
+    const float dViaLeft = dLeftBranch + GetHeight(uLeft);
+    const float dViaRight = dRightBranch + GetHeight(uRight);
+    return (dViaLeft + dViaRight)/2;
+    }
+
+// Returns the name the cluster set reports for leaf uNodeIndex; quits when
+// the node is not a leaf.
+const char *Clust::GetNodeName(unsigned uNodeIndex) const
+    {
+    const bool bLeaf = IsLeaf(uNodeIndex);
+    if (!bLeaf)
+        Quit("Clust::GetNodeName, is not leaf");
+    return m_ptrSet->GetLeafName(uNodeIndex);
+    }
+
+// Returns the id the cluster set reports for leaf uNodeIndex, or 0 when the
+// index is outside the leaf range.
+unsigned Clust::GetNodeId(unsigned uNodeIndex) const
+    {
+    if (uNodeIndex < GetLeafCount())
+        return m_ptrSet->GetLeafId(uNodeIndex);
+    return 0;
+    }
+
+// Returns entry uLeafIndex of the leaf list of node uNodeIndex.
+// Quits if uLeafIndex is not below the node's size, or if the stored value
+// is not below m_uNodeCount.
+unsigned Clust::GetLeaf(unsigned uNodeIndex, unsigned uLeafIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    if (uLeafIndex >= Node.m_uSize)
+        Quit("Clust::GetLeaf, invalid index");
+
+    const unsigned uLeaf = Node.m_uLeafIndexes[uLeafIndex];
+    if (uLeaf >= m_uNodeCount)
+        Quit("Clust::GetLeaf, index out of range");
+    return uLeaf;
+    }
+
+// Returns the node index at the head of the cluster list, or uInsane when
+// the list is empty.
+unsigned Clust::GetFirstCluster() const
+    {
+    if (0 != m_ptrClusterList)
+        return m_ptrClusterList->m_uIndex;
+    return uInsane;
+    }
+
+// Returns the index of the node that follows uIndex on the cluster list, or
+// uInsane when uIndex has no successor. uIndex is not range-checked.
+unsigned Clust::GetNextCluster(unsigned uIndex) const
+    {
+    const ClustNode *ptrNext = m_Nodes[uIndex].m_ptrNextCluster;
+    if (0 != ptrNext)
+        return ptrNext->m_uIndex;
+    return uInsane;
+    }
+
+// Unlinks node uNodeIndex from the doubly linked cluster list and clears
+// its own list pointers. If the node has no predecessor it must be the
+// list head, which then moves to its successor.
+void Clust::DeleteFromClusterList(unsigned uNodeIndex)
+    {
+    assert(uNodeIndex < m_uNodeCount);
+    ClustNode &Node = m_Nodes[uNodeIndex];
+    ClustNode *ptrBefore = Node.m_ptrPrevCluster;
+    ClustNode *ptrAfter = Node.m_ptrNextCluster;
+
+    if (0 != ptrBefore)
+        ptrBefore->m_ptrNextCluster = ptrAfter;
+    else
+        {
+        assert(m_ptrClusterList == &Node);
+        m_ptrClusterList = ptrAfter;
+        }
+
+    if (0 != ptrAfter)
+        ptrAfter->m_ptrPrevCluster = ptrBefore;
+
+    Node.m_ptrPrevCluster = 0;
+    Node.m_ptrNextCluster = 0;
+    }
+
+// Pushes node uNodeIndex onto the front of the doubly linked cluster list.
+void Clust::AddToClusterList(unsigned uNodeIndex)
+    {
+    assert(uNodeIndex < m_uNodeCount);
+    ClustNode &Node = m_Nodes[uNodeIndex];
+    ClustNode *ptrOldHead = m_ptrClusterList;
+
+    if (0 != ptrOldHead)
+        ptrOldHead->m_ptrPrevCluster = &Node;
+
+    Node.m_ptrPrevCluster = 0;
+    Node.m_ptrNextCluster = ptrOldHead;
+
+    m_ptrClusterList = &Node;
+    }
+
+// Returns the join metric for the pair (uIndex1, uIndex2) using the routine
+// that matches m_JoinStyle. Quits (and then returns 0) on any other style.
+float Clust::ComputeMetric(unsigned uIndex1, unsigned uIndex2) const
+    {
+    if (JOIN_NearestNeighbor == m_JoinStyle)
+        return ComputeMetricNearestNeighbor(uIndex1, uIndex2);
+
+    if (JOIN_NeighborJoining == m_JoinStyle)
+        return ComputeMetricNeighborJoining(uIndex1, uIndex2);
+
+    Quit("Clust::ComputeMetric");
+    return 0;
+    }
+
+// Neighbor-joining metric for the pair (i, j): d(i,j) - (r(i) + r(j)),
+// with r() as computed by Calc_r.
+float Clust::ComputeMetricNeighborJoining(unsigned i, unsigned j) const
+    {
+    const float ri = Calc_r(i);
+    const float rj = Calc_r(j);
+    const float dij = GetDist(i, j);
+    return (float) (dij - (ri + rj));
+    }
+
+// Nearest-neighbor metric for the pair (i, j): simply the stored distance.
+float Clust::ComputeMetricNearestNeighbor(unsigned i, unsigned j) const
+    {
+    const float dij = GetDist(i, j);
+    return (float) dij;
+    }
+
+// Scans every unordered pair of nodes on the cluster list and reports the
+// pair with the smallest metric.
+//   ptruIndex1, ptruIndex2  receive the pair; both are uInsane if no pair
+//                           had a metric below PLUS_INFINITY
+// Returns the smallest metric found (PLUS_INFINITY if none). On ties the
+// first pair encountered is kept.
+float Clust::GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const
+    {
+    unsigned uBestLeft = uInsane;
+    unsigned uBestRight = uInsane;
+    float dBest = PLUS_INFINITY;
+
+    for (unsigned uLeft = GetFirstCluster(); uLeft != uInsane;
+      uLeft = GetNextCluster(uLeft))
+        {
+    // Pairing only with later list entries visits each pair once.
+        for (unsigned uRight = GetNextCluster(uLeft); uRight != uInsane;
+          uRight = GetNextCluster(uRight))
+            {
+            const float dCandidate = ComputeMetric(uLeft, uRight);
+            if (!(dCandidate < dBest))
+                continue;
+
+            dBest = dCandidate;
+            uBestLeft = uLeft;
+            uBestRight = uRight;
+            }
+        }
+
+    *ptruIndex1 = uBestLeft;
+    *ptruIndex2 = uBestRight;
+    return dBest;
+    }
+
+// Finds the pair of clusters with the smallest metric by delegating to the
+// brute-force scan; outputs and return value are those of
+// GetMinMetricBruteForce.
+float Clust::GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const
+    {
+    const float dMinMetric = GetMinMetricBruteForce(ptruIndex1, ptruIndex2);
+    return dMinMetric;
+    }
Added: trunk/packages/muscle/branches/upstream/current/clust.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clust.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clust.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,148 @@
+#ifndef Clust_h
+#define Clust_h
+
+class Clust;
+class ClustNode;
+class ClustSet;
+class Phylip;
+class SortedNode;
+
+// NOTE(review): presumably the "no node" value for the unsigned node
+// handles taken and returned by the RB* members of Clust -- confirm against
+// their definitions, which are not in this header.
+const unsigned RB_NIL = ((unsigned) 0xfff0);
+
+// One node of the cluster tree built by Clust. A node carries its tree
+// links (parent, left, right), its links on Clust's list of current
+// clusters (next, prev) and the list of leaf indexes it covers.
+class ClustNode
+ {
+public:
+// Starts with "insane" index, size and length, and all pointers null.
+ ClustNode()
+ {
+ m_uIndex = uInsane;
+ m_uSize = uInsane;
+ m_dLength = (float) dInsane;
+ m_ptrLeft = 0;
+ m_ptrRight = 0;
+ m_ptrParent = 0;
+ m_ptrNextCluster = 0;
+ m_ptrPrevCluster = 0;
+ m_uLeafIndexes = 0;
+ }
+// Frees the leaf index array.
+// NOTE(review): no copy constructor or assignment operator is declared, so
+// copying a ClustNode would make two objects delete the same array --
+// confirm nodes are only ever handled by reference or pointer.
+ ~ClustNode()
+ {
+ delete[] m_uLeafIndexes;
+ }
+// Position of this node in Clust::m_Nodes.
+ unsigned m_uIndex;
+// Number of entries in m_uLeafIndexes.
+ unsigned m_uSize;
+// Branch length, set on a child when it is joined under a parent.
+ float m_dLength;
+ ClustNode *m_ptrLeft;
+ ClustNode *m_ptrRight;
+ ClustNode *m_ptrParent;
+// Links on Clust's doubly linked list of current clusters.
+ ClustNode *m_ptrNextCluster;
+ ClustNode *m_ptrPrevCluster;
+// Array of m_uSize leaf indexes, deleted by the destructor.
+ unsigned *m_uLeafIndexes;
+ };
+
+// Builds a binary cluster tree over the leaves of a ClustSet. Nodes live in
+// one array of 2*leaves - 1 entries: leaves first, then internal nodes, with
+// the root in the last slot. Pairwise distances are kept in a triangular
+// vector indexed through VectorIndex().
+class Clust
+ {
+public:
+ Clust();
+ virtual ~Clust();
+
+// Builds the whole tree for the given set and clustering method.
+ void Create(ClustSet &Set, CLUSTER Method);
+
+ unsigned GetLeafCount() const;
+
+ unsigned GetClusterCount() const;
+ unsigned GetClusterSize(unsigned uNodeIndex) const;
+ unsigned GetLeaf(unsigned uClusterIndex, unsigned uLeafIndex) const;
+
+// Node count is derived from the leaf count; the root index is derived from
+// m_uNodeCount, which is assigned in Create().
+ unsigned GetNodeCount() const { return 2*m_uLeafCount - 1; }
+ const ClustNode &GetRoot() const { return m_Nodes[GetRootNodeIndex()]; }
+ unsigned GetRootNodeIndex() const { return m_uNodeCount - 1; }
+
+// Per-node accessors.
+ const ClustNode &GetNode(unsigned uNodeIndex) const;
+ bool IsLeaf(unsigned uNodeIndex) const;
+ unsigned GetLeftIndex(unsigned uNodeIndex) const;
+ unsigned GetRightIndex(unsigned uNodeIndex) const;
+ float GetLength(unsigned uNodeIndex) const;
+ float GetHeight(unsigned uNodeIndex) const;
+ const char *GetNodeName(unsigned uNodeIndex) const;
+ unsigned GetNodeId(unsigned uNodeIndex) const;
+
+ JOIN GetJoinStyle() const { return m_JoinStyle; }
+ LINKAGE GetCentroidStyle() const { return m_CentroidStyle; }
+
+// Distance vector access; the order of the two indexes does not matter.
+ void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
+ float GetDist(unsigned uIndex1, unsigned uIndex2) const;
+
+ void ToPhylip(Phylip &tree);
+
+ void LogMe() const;
+
+// Everything below is effectively public: the access specifier is
+// commented out.
+//private:
+ void SetLeafCount(unsigned uLeafCount);
+
+// One agglomeration step and its helpers.
+ void CreateCluster();
+ void JoinNodes(unsigned uLeftNodeIndex, unsigned uRightNodeIndex,
+ float dLeftLength, float dRightLength, unsigned uNewNodeIndex);
+
+ void ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+ float *ptrdLeftLength, float *ptrdRightLength);
+ void ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+ float *ptrdLeftLength, float *ptrdRightLength);
+ void ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+ float *ptrdLeftLength, float *ptrdRightLength);
+
+// Distance from a newly joined node to an existing cluster, one routine per
+// linkage style.
+ float ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex);
+ float ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+ float ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+ float ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+ float ComputeDistNeighborJoining(unsigned uNewNewIndex, unsigned uNodeIndex);
+ float ComputeDistMAFFT(unsigned uNewNewIndex, unsigned uNodeIndex);
+
+ float Calc_r(unsigned uNodeIndex) const;
+
+ unsigned VectorIndex(unsigned uIndex1, unsigned uIndex2) const;
+
+// Iteration over the list of current clusters; both return uInsane at the
+// end of the list.
+ unsigned GetFirstCluster() const;
+ unsigned GetNextCluster(unsigned uNodeIndex) const;
+
+ float ComputeMetric(unsigned uIndex1, unsigned uIndex2) const;
+ float ComputeMetricNearestNeighbor(unsigned i, unsigned j) const;
+ float ComputeMetricNeighborJoining(unsigned i, unsigned j) const;
+
+// NOTE(review): of the metric functions below, only GetMinMetric and
+// GetMinMetricBruteForce are defined in clust.cpp; the others, and the RB*
+// and ValidateRB* members further down, have no definition there -- confirm
+// where (or whether) they are defined.
+ void InitMetric(unsigned uMaxNodeIndex);
+ void InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric);
+ float GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
+ float GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
+ void DeleteMetric(unsigned uIndex);
+ void DeleteMetric(unsigned uIndex1, unsigned uIndex2);
+ void ListMetric() const;
+
+ void DeleteFromClusterList(unsigned uNodeIndex);
+ void AddToClusterList(unsigned uNodeIndex);
+
+ void RBDelete(unsigned RBNode);
+ unsigned RBInsert(unsigned i, unsigned j, float fMetric);
+
+ unsigned RBNext(unsigned RBNode) const;
+ unsigned RBPrev(unsigned RBNode) const;
+ unsigned RBMin(unsigned RBNode) const;
+ unsigned RBMax(unsigned RBNode) const;
+
+ void ValidateRB(const char szMsg[] = 0) const;
+ void ValidateRBNode(unsigned Node, const char szMsg[]) const;
+
+//private:
+ JOIN m_JoinStyle;
+ LINKAGE m_CentroidStyle;
+// Node array of m_uNodeCount entries, allocated in Create().
+ ClustNode *m_Nodes;
+// Allocated in Create() and freed by the destructor.
+ unsigned *m_ClusterIndexToNodeIndex;
+// NOTE(review): never assigned or read in clust.cpp -- confirm it is used.
+ unsigned *m_NodeIndexToClusterIndex;
+ unsigned m_uLeafCount;
+ unsigned m_uNodeCount;
+ unsigned m_uClusterCount;
+// Length of m_dDist: n*(n - 1)/2 for n nodes.
+ unsigned m_uTriangularMatrixSize;
+ float *m_dDist;
+// Not owned: points at the set passed to Create().
+ ClustSet *m_ptrSet;
+// Head of the doubly linked list of current clusters.
+ ClustNode *m_ptrClusterList;
+ };
+
+#endif // Clust_h
Added: trunk/packages/muscle/branches/upstream/current/cluster.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/cluster.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/cluster.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,339 @@
+#include "muscle.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Smaller of two floats; d2 is returned when d1 is not strictly less.
+static inline float Min(float d1, float d2)
+	{
+	if (d1 < d2)
+		return d1;
+	return d2;
+	}
+
+// Larger of two floats; d2 is returned when d1 is not strictly greater.
+static inline float Max(float d1, float d2)
+	{
+	if (d1 > d2)
+		return d1;
+	return d2;
+	}
+
+// Arithmetic mean of two floats: the sum is formed in float, halved in
+// double and narrowed back to float.
+static inline float Mean(float d1, float d2)
+	{
+	const float fSum = d1 + d2;
+	const double dHalf = fSum/2.0;
+	return (float) dHalf;
+	}
+
+#if _DEBUG
+// Debug-only consistency check of the disjoint list.
+//
+// Walks the doubly-linked list that starts at m_ptrDisjoints and verifies
+// that each node's prev/next links agree with its neighbours, that only the
+// list head has a null prev link, and that the list holds no more than
+// m_uNodeCount entries (which would indicate a cycle). It then checks that
+// the list length equals the number of parentless nodes among the first
+// uNodeCount entries of m_Nodes. Any inconsistency is logged and reported
+// through Quit().
+void ClusterTree::Validate(unsigned uNodeCount)
+ {
+ unsigned n;
+ ClusterNode *pNode;
+ unsigned uDisjointListCount = 0;
+ for (pNode = m_ptrDisjoints; pNode; pNode = pNode->GetNextDisjoint())
+ {
+ ClusterNode *pPrev = pNode->GetPrevDisjoint();
+ ClusterNode *pNext = pNode->GetNextDisjoint();
+ // The predecessor, if any, must point forward to this node.
+ if (0 != pPrev)
+ {
+ if (pPrev->GetNextDisjoint() != pNode)
+ {
+ Log("Prev->This mismatch, prev=\n");
+ pPrev->LogMe();
+ Log("This=\n");
+ pNode->LogMe();
+ Quit("ClusterTree::Validate()");
+ }
+ }
+ else
+ {
+ // A node without a predecessor must be the list head.
+ if (pNode != m_ptrDisjoints)
+ {
+ Log("[%u]->prev = 0 but != m_ptrDisjoints=%d\n",
+ pNode->GetIndex(),
+ m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff);
+ pNode->LogMe();
+ Quit("ClusterTree::Validate()");
+ }
+ }
+ // The successor, if any, must point back to this node.
+ if (0 != pNext)
+ {
+ if (pNext->GetPrevDisjoint() != pNode)
+ {
+ Log("Next->This mismatch, next=\n");
+ pNext->LogMe();
+ Log("This=\n");
+ pNode->LogMe();
+ Quit("ClusterTree::Validate()");
+ }
+ }
+ // More list entries than nodes means the list loops back on itself.
+ ++uDisjointListCount;
+ if (uDisjointListCount > m_uNodeCount)
+ Quit("Loop in disjoint list");
+ }
+
+ // Every parentless node among the first uNodeCount nodes should be on
+ // the disjoint list, and vice versa.
+ unsigned uParentlessNodeCount = 0;
+ for (n = 0; n < uNodeCount; ++n)
+ if (0 == m_Nodes[n].GetParent())
+ ++uParentlessNodeCount;
+
+ if (uDisjointListCount != uParentlessNodeCount)
+ Quit("Disjoints = %u Parentless = %u\n", uDisjointListCount,
+ uParentlessNodeCount);
+ }
+#else // !_DEBUG
+// In non-debug builds calls to Validate(...) expand to nothing.
+#define Validate(uNodeCount) // empty
+#endif
+
+// Write a one-line description of this node to the log: its index, its
+// weights, the indexes of the nodes it links to (0xffffffff stands for a
+// null link) and the indexes of all leaves in the sub-tree under it.
+void ClusterNode::LogMe() const
+	{
+	const unsigned uNone = 0xffffffff;
+	const unsigned uLeafTotal = GetClusterSize();
+	const unsigned uLeft = m_ptrLeft ? m_ptrLeft->GetIndex() : uNone;
+	const unsigned uRight = m_ptrRight ? m_ptrRight->GetIndex() : uNone;
+	const unsigned uParent = m_ptrParent ? m_ptrParent->GetIndex() : uNone;
+	const unsigned uNext = m_ptrNextDisjoint ? m_ptrNextDisjoint->GetIndex() : uNone;
+	const unsigned uPrev = m_ptrPrevDisjoint ? m_ptrPrevDisjoint->GetIndex() : uNone;
+
+	Log("[%02u] w=%5.3f CW=%5.3f LBW=%5.3f RBW=%5.3f LWT=%5.3f RWT=%5.3f L=%02d R=%02d P=%02d NxDj=%02d PvDj=%02d Sz=%02d {",
+	  m_uIndex,
+	  m_dWeight,
+	  GetClusterWeight(),
+	  GetLeftBranchWeight(),
+	  GetRightBranchWeight(),
+	  GetLeftWeight(),
+	  GetRightWeight(),
+	  uLeft,
+	  uRight,
+	  uParent,
+	  uNext,
+	  uPrev,
+	  uLeafTotal);
+
+	unsigned uLeaf = 0;
+	while (uLeaf < uLeafTotal)
+		{
+		Log(" %u", GetClusterLeaf(uLeaf)->GetIndex());
+		++uLeaf;
+		}
+	Log(" }\n");
+	}
+
+// Number of leaves in the sub-tree rooted at this node. A node without
+// children is itself a leaf and counts as one.
+unsigned ClusterNode::GetClusterSize() const
+	{
+	const bool bIsLeaf = (0 == m_ptrLeft && 0 == m_ptrRight);
+	if (bIsLeaf)
+		return 1;
+
+	const unsigned uUnderLeft = (0 != m_ptrLeft) ? m_ptrLeft->GetClusterSize() : 0;
+	const unsigned uUnderRight = (0 != m_ptrRight) ? m_ptrRight->GetClusterSize() : 0;
+	const unsigned uTotal = uUnderLeft + uUnderRight;
+	assert(uTotal > 0);
+	return uTotal;
+	}
+
+// Sum of the weights of every node in the sub-tree rooted at this node,
+// this node included. Children are accumulated left first, then right,
+// and this node's own weight is added last.
+double ClusterNode::GetClusterWeight() const
+	{
+	const ClusterNode *Children[2] = { m_ptrLeft, m_ptrRight };
+	double dSubTreeWeight = 0.0;
+	for (unsigned uChild = 0; uChild < 2; ++uChild)
+		{
+		if (0 == Children[uChild])
+			continue;
+		dSubTreeWeight += Children[uChild]->GetClusterWeight();
+		}
+	return dSubTreeWeight + GetWeight();
+	}
+
+// Difference between this node's weight and its left child's weight,
+// or 0.0 when there is no left child.
+double ClusterNode::GetLeftBranchWeight() const
+	{
+	if (0 == m_ptrLeft)
+		return 0.0;
+
+	const double dOwnWeight = GetWeight();
+	const double dChildWeight = m_ptrLeft->GetWeight();
+	return dOwnWeight - dChildWeight;
+	}
+
+// Difference between this node's weight and its right child's weight,
+// or 0.0 when there is no right child.
+double ClusterNode::GetRightBranchWeight() const
+	{
+	if (0 == m_ptrRight)
+		return 0.0;
+
+	const double dOwnWeight = GetWeight();
+	const double dChildWeight = m_ptrRight->GetWeight();
+	return dOwnWeight - dChildWeight;
+	}
+
+// Cluster weight of the right sub-tree plus this node's own weight,
+// or 0.0 when there is no right child.
+double ClusterNode::GetRightWeight() const
+	{
+	if (0 == m_ptrRight)
+		return 0.0;
+	const double dSubTree = m_ptrRight->GetClusterWeight();
+	return dSubTree + GetWeight();
+	}
+
+// Cluster weight of the left sub-tree plus this node's own weight,
+// or 0.0 when there is no left child.
+double ClusterNode::GetLeftWeight() const
+	{
+	if (0 == m_ptrLeft)
+		return 0.0;
+	const double dSubTree = m_ptrLeft->GetClusterWeight();
+	return dSubTree + GetWeight();
+	}
+
+// Return the uLeafIndex'th leaf (0-based, left sub-tree first) in the
+// sub-tree under this node.
+//
+// A node without children is its own only leaf and is returned whatever
+// uLeafIndex is. A node with a single child delegates to that child,
+// whichever side it is on; this matches GetClusterSize(), which counts
+// leaves through either child. (Previously a node with only a left child
+// returned itself even though it is not a leaf.)
+//
+// uLeafIndex is not range-checked; callers are expected to pass a value
+// below GetClusterSize().
+const ClusterNode *ClusterNode::GetClusterLeaf(unsigned uLeafIndex) const
+	{
+	if (0 == m_ptrLeft && 0 == m_ptrRight)
+		return this;
+
+	if (0 == m_ptrRight)
+		return m_ptrLeft->GetClusterLeaf(uLeafIndex);
+	if (0 == m_ptrLeft)
+		return m_ptrRight->GetClusterLeaf(uLeafIndex);
+
+	// Both children present: leaves of the left sub-tree come first.
+	const unsigned uLeftLeafCount = m_ptrLeft->GetClusterSize();
+	if (uLeafIndex < uLeftLeafCount)
+		return m_ptrLeft->GetClusterLeaf(uLeafIndex);
+	return m_ptrRight->GetClusterLeaf(uLeafIndex - uLeftLeafCount);
+	}
+
+// Unlink ptrNode from the doubly-linked disjoint list, updating the list
+// head when the node was first in the list.
+void ClusterTree::DeleteFromDisjoints(ClusterNode *ptrNode)
+	{
+	ClusterNode *const ptrBefore = ptrNode->GetPrevDisjoint();
+	ClusterNode *const ptrAfter = ptrNode->GetNextDisjoint();
+
+	if (0 == ptrBefore)
+		m_ptrDisjoints = ptrAfter;
+	else
+		ptrBefore->SetNextDisjoint(ptrAfter);
+
+	if (0 != ptrAfter)
+		ptrAfter->SetPrevDisjoint(ptrBefore);
+
+#if _DEBUG
+// Clearing the removed node's own links is not needed by the algorithm;
+// it keeps the state tidy and supports Validate().
+	ptrNode->SetPrevDisjoint(0);
+	ptrNode->SetNextDisjoint(0);
+#endif
+	}
+
+// Insert ptrNode at the head of the doubly-linked disjoint list.
+void ClusterTree::AddToDisjoints(ClusterNode *ptrNode)
+	{
+	ClusterNode *const ptrOldHead = m_ptrDisjoints;
+
+	ptrNode->SetPrevDisjoint(0);
+	ptrNode->SetNextDisjoint(ptrOldHead);
+	if (0 != ptrOldHead)
+		ptrOldHead->SetPrevDisjoint(ptrNode);
+
+	m_ptrDisjoints = ptrNode;
+	}
+
+// Construct an empty tree: no nodes, no leaves, empty disjoint list.
+// All four data members are initialized (m_uLeafCount used to be left
+// indeterminate until Create() was called).
+ClusterTree::ClusterTree() :
+	m_ptrDisjoints(0),
+	m_Nodes(0),
+	m_uNodeCount(0),
+	m_uLeafCount(0)
+	{
+	}
+
+// Release the node array allocated by Create(), if any (m_Nodes is 0 for
+// a tree that was never created).
+ClusterTree::~ClusterTree()
+ {
+ delete[] m_Nodes;
+ }
+
+// Log the index of the head of the disjoint list (0xffffffff if the list
+// is empty) followed by one line per node.
+void ClusterTree::LogMe() const
+	{
+	const unsigned uHeadIndex =
+	  m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff;
+	Log("Disjoints=%d\n", uHeadIndex);
+
+	const ClusterNode *ptrNode = m_Nodes;
+	for (unsigned uRemaining = m_uNodeCount; uRemaining > 0; --uRemaining)
+		{
+		ptrNode->LogMe();
+		++ptrNode;
+		}
+	}
+
+// Return the root of the tree, which is the last entry of the node array
+// (Create() allocates joined nodes after the leaves, in join order).
+// Returns 0 for a tree that has no nodes, instead of forming a pointer
+// from index (unsigned) 0 - 1.
+ClusterNode *ClusterTree::GetRoot() const
+	{
+	if (0 == m_Nodes || 0 == m_uNodeCount)
+		return 0;
+	return &m_Nodes[m_uNodeCount - 1];
+	}
+
+// Build a binary cluster tree over the items of Dist by agglomerative
+// clustering: the closest pair of disjoint clusters is joined repeatedly
+// until a single cluster remains.
+//
+// The original comment describes this as the UPGMA algorithm of Durbin
+// et al. p166. Note that the distance from a newly joined cluster to any
+// other cluster is taken here as the minimum of the two children's
+// distances to that cluster.
+//
+// Layout: leaves occupy m_Nodes[0 .. m_uLeafCount-1]; joined nodes follow
+// in the order they are created, so the last node is the root. The weight
+// of a joined node is the distance between the two clusters it joins.
+//
+// Errors are reported through Quit(), which the surrounding code treats
+// as fatal.
+void ClusterTree::Create(const DistFunc &Dist)
+	{
+	m_uLeafCount = Dist.GetCount();
+
+	// 2*N - 1 wraps around for N = 0 in unsigned arithmetic.
+	if (0 == m_uLeafCount)
+		Quit("ClusterTree::Create: no leaves");
+	m_uNodeCount = 2*m_uLeafCount - 1;
+
+	delete[] m_Nodes;
+	m_Nodes = new ClusterNode[m_uNodeCount];
+
+	unsigned i;
+	for (i = 0; i < m_uNodeCount; ++i)
+		m_Nodes[i].SetIndex(i);
+
+	// Initially every leaf is its own cluster: chain the leaves into the
+	// doubly-linked disjoint list.
+	for (i = 0; i + 1 < m_uLeafCount; ++i)
+		m_Nodes[i].SetNextDisjoint(&m_Nodes[i+1]);
+
+	for (i = 1; i < m_uLeafCount; ++i)
+		m_Nodes[i].SetPrevDisjoint(&m_Nodes[i-1]);
+
+	m_ptrDisjoints = &m_Nodes[0];
+
+	// Working distance matrix with room for leaves and joined nodes; the
+	// leaf-to-leaf part is copied from the caller's matrix.
+	DistFunc ClusterDist;
+	ClusterDist.SetCount(m_uNodeCount);
+	for (i = 0; i < m_uLeafCount; ++i)
+		for (unsigned j = 0; j < m_uLeafCount; ++j)
+			{
+			float dDist = Dist.GetDist(i, j);
+			ClusterDist.SetDist(i, j, dDist);
+			}
+
+	Validate(m_uLeafCount);
+
+// Iteration. N-1 joins needed to create a binary tree from N leaves.
+	for (unsigned uJoinIndex = m_uLeafCount; uJoinIndex < m_uNodeCount;
+	  ++uJoinIndex)
+		{
+		// Find closest pair of clusters
+		unsigned uIndexClosest1 = 0;
+		unsigned uIndexClosest2 = 0;
+		bool bFound = false;
+		double dDistClosest = 9e99;
+		for (ClusterNode *ptrNode1 = m_ptrDisjoints; ptrNode1;
+		  ptrNode1 = ptrNode1->GetNextDisjoint())
+			{
+			for (ClusterNode *ptrNode2 = ptrNode1->GetNextDisjoint(); ptrNode2;
+			  ptrNode2 = ptrNode2->GetNextDisjoint())
+				{
+				unsigned i1 = ptrNode1->GetIndex();
+				unsigned i2 = ptrNode2->GetIndex();
+				double dDist = ClusterDist.GetDist(i1, i2);
+				if (dDist < dDistClosest)
+					{
+					bFound = true;
+					dDistClosest = dDist;
+					uIndexClosest1 = i1;
+					uIndexClosest2 = i2;
+					}
+				}
+			}
+		// Checked in all builds: without a pair there is nothing valid
+		// to join.
+		if (!bFound)
+			Quit("ClusterTree::Create: no closest pair found");
+
+		ClusterNode &Join = m_Nodes[uJoinIndex];
+		ClusterNode &Child1 = m_Nodes[uIndexClosest1];
+		ClusterNode &Child2 = m_Nodes[uIndexClosest2];
+
+		Join.SetLeft(&Child1);
+		Join.SetRight(&Child2);
+		Join.SetWeight(dDistClosest);
+
+		Child1.SetParent(&Join);
+		Child2.SetParent(&Join);
+
+		DeleteFromDisjoints(&Child1);
+		DeleteFromDisjoints(&Child2);
+		AddToDisjoints(&Join);
+
+		// Calculate distance of every remaining disjoint cluster to the
+		// new cluster created by the join
+		for (ClusterNode *ptrNode = m_ptrDisjoints; ptrNode;
+		  ptrNode = ptrNode->GetNextDisjoint())
+			{
+			unsigned uNodeIndex = ptrNode->GetIndex();
+			// The new node is already on the disjoint list. Its distances
+			// to its own children were never set, and a distance from the
+			// node to itself is not needed, so skip it.
+			if (uNodeIndex == uJoinIndex)
+				continue;
+			float dDist1 = ClusterDist.GetDist(uNodeIndex, uIndexClosest1);
+			float dDist2 = ClusterDist.GetDist(uNodeIndex, uIndexClosest2);
+			float dDist = Min(dDist1, dDist2);
+			ClusterDist.SetDist(uJoinIndex, uNodeIndex, dDist);
+			}
+		Validate(uJoinIndex+1);
+		}
+	}
Added: trunk/packages/muscle/branches/upstream/current/cluster.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/cluster.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/cluster.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,86 @@
+class DistFunc;
+
+// A node of a ClusterTree.
+//
+// Each node has an index, a weight, a second free-use weight (Weight2),
+// links to its left and right children and its parent, and prev/next links
+// for the tree's doubly-linked list of disjoint clusters. A null link is
+// stored as 0. Only ClusterTree (a friend) can modify structure and index;
+// Weight2 can be set by anyone.
+class ClusterNode
+	{
+	friend class ClusterTree;
+public:
+	// A new node has zero weights, index 0 and no links.
+	ClusterNode() :
+		m_dWeight(0.0),
+		m_dWeight2(0.0),
+		m_uIndex(0),
+		m_ptrLeft(0),
+		m_ptrRight(0),
+		m_ptrParent(0),
+		m_ptrNextDisjoint(0),
+		m_ptrPrevDisjoint(0)
+		{
+		}
+	~ClusterNode() {}
+
+public:
+	unsigned GetIndex() const { return m_uIndex; }
+	ClusterNode *GetLeft() const { return m_ptrLeft; }
+	ClusterNode *GetRight() const { return m_ptrRight; }
+	ClusterNode *GetParent() const { return m_ptrParent; }
+	double GetWeight() const { return m_dWeight; }
+
+	// Queries over the sub-tree rooted at this node (defined in cluster.cpp).
+	const ClusterNode *GetClusterLeaf(unsigned uLeafIndex) const;
+	unsigned GetClusterSize() const;
+	double GetClusterWeight() const;
+	double GetLeftBranchWeight() const;
+	double GetRightBranchWeight() const;
+	double GetLeftWeight() const;
+	double GetRightWeight() const;
+
+	void LogMe() const;
+
+	double GetWeight2() const { return m_dWeight2; }
+	void SetWeight2(double dWeight2) { m_dWeight2 = dWeight2; }
+
+protected:
+	void SetIndex(unsigned uIndex) { m_uIndex = uIndex; }
+	void SetWeight(double dWeight) { m_dWeight = dWeight; }
+	void SetLeft(ClusterNode *ptrLeft) { m_ptrLeft = ptrLeft; }
+	void SetRight(ClusterNode *ptrRight) { m_ptrRight = ptrRight; }
+	void SetParent(ClusterNode *ptrParent) { m_ptrParent = ptrParent; }
+	void SetNextDisjoint(ClusterNode *ptrNode) { m_ptrNextDisjoint = ptrNode; }
+	void SetPrevDisjoint(ClusterNode *ptrNode) { m_ptrPrevDisjoint = ptrNode; }
+
+	// const like the other link getters, so they can also be used on a
+	// const node.
+	ClusterNode *GetNextDisjoint() const { return m_ptrNextDisjoint; }
+	ClusterNode *GetPrevDisjoint() const { return m_ptrPrevDisjoint; }
+
+private:
+	double m_dWeight;
+	double m_dWeight2;
+	unsigned m_uIndex;
+	ClusterNode *m_ptrLeft;
+	ClusterNode *m_ptrRight;
+	ClusterNode *m_ptrParent;
+	ClusterNode *m_ptrNextDisjoint;
+	ClusterNode *m_ptrPrevDisjoint;
+	};
+
+// Binary tree of clusters built from a distance function.
+//
+// The tree owns its nodes as a single array (m_Nodes, released with
+// delete[] in the destructor). While Create() runs, the clusters that have
+// not yet been joined are kept on a doubly-linked "disjoint" list headed
+// by m_ptrDisjoints.
+//
+// NOTE(review): no copy constructor or assignment operator is declared
+// here, so a copied tree would share m_Nodes with the original -- confirm
+// that trees are never copied.
+class ClusterTree
+ {
+public:
+ ClusterTree();
+ virtual ~ClusterTree();
+
+ // Build the tree from the distances in DF, replacing any previous nodes.
+ void Create(const DistFunc &DF);
+
+ ClusterNode *GetRoot() const;
+ void LogMe() const;
+
+protected:
+ // NOTE(review): no definition of Join is visible in cluster.cpp.
+ void Join(ClusterNode *ptrNode1, ClusterNode *ptrNode2,
+ ClusterNode *ptrJoin);
+ // Maintenance of the disjoint list.
+ void AddToDisjoints(ClusterNode *ptrNode);
+ void DeleteFromDisjoints(ClusterNode *ptrNode);
+ // Consistency check; cluster.cpp defines it only when _DEBUG is set.
+ void Validate(unsigned uNodeCount);
+
+private:
+ ClusterNode *m_ptrDisjoints; // head of the disjoint list, 0 if empty
+ ClusterNode *m_Nodes; // node array owned by the tree
+ unsigned m_uNodeCount; // number of entries in m_Nodes
+ unsigned m_uLeafCount; // number of leaves, set by Create()
+ };
Added: trunk/packages/muscle/branches/upstream/current/clustset.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clustset.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clustset.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,21 @@
+#ifndef ClustSet_h
+#define ClustSet_h
+
+enum JOIN;
+enum LINKAGE;
+class Clust;
+
+// Abstract interface to a set of items to be clustered: it supplies the
+// number of leaves, a name and an id for each leaf, a distance between two
+// nodes, and a hook that is invoked when two nodes are joined.
+class ClustSet
+	{
+public:
+	// Virtual so that a derived set can be destroyed safely through a
+	// ClustSet pointer or reference.
+	virtual ~ClustSet() {}
+
+	virtual unsigned GetLeafCount() = 0;
+	virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
+	  unsigned uNodeIndex2) = 0;
+	virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
+	  unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
+	  double *ptrdLeftLength, double *ptrdRightLength) = 0;
+	virtual const char *GetLeafName(unsigned uNodeIndex) = 0;
+	virtual unsigned GetLeafId(unsigned uNodeIndex) = 0;
+	};
+
+#endif // ClustSet_h
Added: trunk/packages/muscle/branches/upstream/current/clustsetdf.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clustsetdf.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clustsetdf.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,48 @@
+#ifndef ClustSetDF_h
+#define ClustSetDF_h
+
+class MSA;
+class Clust;
+
+#include "clustset.h"
+#include "distfunc.h"
+#include "msa.h"
+
+// ClustSet backed by a DistFunc: leaf count, leaf names, leaf ids and
+// distances are all read from the distance function passed to the
+// constructor. Only a pointer to the DistFunc is stored, so the DistFunc
+// must outlive this object.
+class ClustSetDF : public ClustSet
+ {
+public:
+ ClustSetDF(const DistFunc &DF) :
+ m_ptrDF(&DF)
+ {
+ }
+
+public:
+ virtual unsigned GetLeafCount()
+ {
+ return m_ptrDF->GetCount();
+ }
+ virtual const char *GetLeafName(unsigned uNodeIndex)
+ {
+ return m_ptrDF->GetName(uNodeIndex);
+ }
+ virtual unsigned GetLeafId(unsigned uNodeIndex)
+ {
+ return m_ptrDF->GetId(uNodeIndex);
+ }
+ // Not supported for this kind of set; reports an error through Quit().
+ virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
+ unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
+ double *ptrdLeftLength, double *ptrdRightLength)
+ {
+ Quit("ClustSetDF::JoinNodes, should never be called");
+ }
+ // Distance looked up directly in the DistFunc; C is not used.
+ virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
+ unsigned uNodeIndex2)
+ {
+ return m_ptrDF->GetDist(uNodeIndex1, uNodeIndex2);
+ }
+
+private:
+ const DistFunc *m_ptrDF; // not owned
+ };
+
+#endif // ClustSetDF_h
Added: trunk/packages/muscle/branches/upstream/current/clustsetmsa.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clustsetmsa.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clustsetmsa.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,55 @@
+#ifndef ClustSetMSA_h
+#define ClustSetMSA_h
+
+class MSA;
+class Clust;
+
+#include "clustset.h"
+#include "msadist.h"
+
+// ClustSet backed by an MSA and an MSADist.
+// Leaf count, names and ids come from the sequences of the MSA; the
+// distance between two leaves is computed by the MSADist.
+// Computes distances between leaves, never between
+// joined clusters (leaves this to distance matrix method).
+// Only pointers to the MSA and the MSADist are stored, so both must
+// outlive this object.
+class ClustSetMSA : public ClustSet
+ {
+public:
+ ClustSetMSA(const MSA &msa, MSADist &MD) :
+ m_ptrMSA(&msa),
+ m_ptrMSADist(&MD)
+ {
+ }
+
+public:
+ virtual unsigned GetLeafCount()
+ {
+ return m_ptrMSA->GetSeqCount();
+ }
+ virtual const char *GetLeafName(unsigned uNodeIndex)
+ {
+ return m_ptrMSA->GetSeqName(uNodeIndex);
+ }
+ virtual unsigned GetLeafId(unsigned uNodeIndex)
+ {
+ return m_ptrMSA->GetSeqId(uNodeIndex);
+ }
+ // Not supported for this kind of set; reports an error through Quit().
+ virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
+ unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
+ double *ptrdLeftLength, double *ptrdRightLength)
+ {
+ Quit("ClustSetMSA::JoinNodes, should never be called");
+ }
+ // Delegates to the MSADist, passing the two indexes through as
+ // sequence indexes of the MSA; C is not used.
+ virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
+ unsigned uNodeIndex2)
+ {
+ return m_ptrMSADist->ComputeDist(*m_ptrMSA, uNodeIndex1, uNodeIndex2);
+ }
+
+public:
+ // Declared only; the definition is not in this header.
+ const MSA &GetMSA();
+
+private:
+ const MSA *m_ptrMSA; // not owned
+ MSADist *m_ptrMSADist; // not owned
+ };
+
+#endif // ClustSetMSA_h
Added: trunk/packages/muscle/branches/upstream/current/clwwt.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/clwwt.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/clwwt.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,190 @@
+#include "muscle.h"
+#include "tree.h"
+#include "msa.h"
+
+/***
+Compute weights by the CLUSTALW method.
+Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29;
+see also CLUSTALW paper.
+
+Weights are computed from the edge lengths of a rooted tree.
+
+Define the strength of an edge to be its length divided by the number
+of leaves under that edge. The weight of a sequence is then the sum
+of edge strengths on the path from the root to the leaf.
+
+Example.
+
+                 0.2
+               -----A      0.1
+             -x         ------- B       0.7
+               --------y           ----------- C
+                  0.3   ----------z
+                            0.4    -------------- D
+                                         0.8
+
+Edge    Length  Leaves  Strength
+----    ------  ------  --------
+xy      0.3     3       0.1
+xA      0.2     1       0.2
+yz      0.4     2       0.2
+yB      0.1     1       0.1
+zC      0.7     1       0.7
+zD      0.8     1       0.8
+
+Leaf    Path        Strengths           Weight
+----    ----        ---------           ------
+A       xA          0.2                 0.2
+B       xy-yB       0.1 + 0.1           0.2
+C       xy-yz-zC    0.1 + 0.2 + 0.7     1.0
+D       xy-yz-zD    0.1 + 0.2 + 0.8     1.1
+
+***/
+
+#define TRACE 0
+
+// Count the leaves under node uNodeIndex, recording the count of every
+// node visited in LeavesUnderNode[] (indexed by node index). A leaf counts
+// as one. Returns the count for uNodeIndex.
+static unsigned CountLeaves(const Tree &tree, unsigned uNodeIndex,
+  unsigned LeavesUnderNode[])
+	{
+	unsigned uLeafTotal = 1;
+	if (!tree.IsLeaf(uNodeIndex))
+		{
+		const unsigned uLeftChild = tree.GetLeft(uNodeIndex);
+		const unsigned uRightChild = tree.GetRight(uNodeIndex);
+		// The right sub-tree is visited before the left one.
+		const unsigned uUnderRight = CountLeaves(tree, uRightChild, LeavesUnderNode);
+		const unsigned uUnderLeft = CountLeaves(tree, uLeftChild, LeavesUnderNode);
+		uLeafTotal = uUnderRight + uUnderLeft;
+		}
+	LeavesUnderNode[uNodeIndex] = uLeafTotal;
+	return uLeafTotal;
+	}
+
+// Compute sequence weights from a rooted tree by the method described in
+// the comment at the top of this file.
+//
+// Weights[n] receives the weight of the leaf with leaf index n; the caller
+// must provide room for tree.GetLeafCount() entries.
+//
+// Special cases: with no leaves nothing is written; a single leaf gets
+// weight 1.0; two leaves get 0.5 each. Otherwise the strength of each
+// non-root node is its edge length to its parent divided by the number of
+// leaves under it, a leaf's weight is the sum of strengths on its path to
+// the root (a sum below 0.0001 is replaced by 1.0), and the resulting
+// weights are passed to Normalize().
+//
+// Errors (unrooted tree, inconsistent leaf counts) are reported through
+// Quit().
+void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[])
+	{
+#if TRACE
+	Log("CalcClustalWWeights\n");
+	tree.LogMe();
+#endif
+
+	const unsigned uLeafCount = tree.GetLeafCount();
+	if (0 == uLeafCount)
+		return;
+	else if (1 == uLeafCount)
+		{
+		Weights[0] = (WEIGHT) 1.0;
+		return;
+		}
+	else if (2 == uLeafCount)
+		{
+		Weights[0] = (WEIGHT) 0.5;
+		Weights[1] = (WEIGHT) 0.5;
+		return;
+		}
+
+	if (!tree.IsRooted())
+		Quit("CalcClustalWWeights requires rooted tree");
+
+	const unsigned uNodeCount = tree.GetNodeCount();
+	unsigned *LeavesUnderNode = new unsigned[uNodeCount];
+	memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned));
+
+	// Sanity check: every leaf must be reachable from the root.
+	const unsigned uRootNodeIndex = tree.GetRootNodeIndex();
+	unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode);
+	if (uLeavesUnderRoot != uLeafCount)
+		Quit("CalcClustalWWeights: Internal error, root count %u %u",
+		  uLeavesUnderRoot, uLeafCount);
+
+#if TRACE
+	Log("Node Leaves Length Strength\n");
+	Log("---- ------ -------- --------\n");
+	// 1234 123456 12345678 12345678
+#endif
+
+	// Strength of the edge above each node; the root has no such edge.
+	double *Strengths = new double[uNodeCount];
+	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+		{
+		if (tree.IsRoot(uNodeIndex))
+			{
+			Strengths[uNodeIndex] = 0.0;
+			continue;
+			}
+		const unsigned uParent = tree.GetParent(uNodeIndex);
+		const double dLength = tree.GetEdgeLength(uNodeIndex, uParent);
+		const unsigned uLeaves = LeavesUnderNode[uNodeIndex];
+		const double dStrength = dLength / (double) uLeaves;
+		Strengths[uNodeIndex] = dStrength;
+#if TRACE
+		Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength);
+#endif
+		}
+
+#if TRACE
+	Log("\n");
+	Log(" Seq Path..Weight\n");
+	Log("-------------------- ------------\n");
+#endif
+	for (unsigned n = 0; n < uLeafCount; ++n)
+		{
+		const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n);
+#if TRACE
+		Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex);
+#endif
+		if (!tree.IsLeaf(uLeafNodeIndex))
+			Quit("CalcClustalWWeights: leaf");
+
+		// Sum the strengths along the path from the leaf up to the root.
+		double dWeight = 0;
+		unsigned uNode = uLeafNodeIndex;
+		while (!tree.IsRoot(uNode))
+			{
+			dWeight += Strengths[uNode];
+			uNode = tree.GetParent(uNode);
+#if TRACE
+			Log("->%u(%g)", uNode, Strengths[uNode]);
+#endif
+			}
+		// A (near-)zero weight is replaced by 1.0 before normalization.
+		if (dWeight < 0.0001)
+			{
+#if TRACE
+			Log("zero->one");
+#endif
+			dWeight = 1.0;
+			}
+		Weights[n] = (WEIGHT) dWeight;
+#if TRACE
+		Log(" = %g\n", dWeight);
+#endif
+		}
+
+	delete[] Strengths;
+	delete[] LeavesUnderNode;
+
+	Normalize(Weights, uLeafCount);
+	}
+
+// Set the weight of each sequence from the given tree.
+//
+// Weights are computed per tree leaf by CalcClustalWWeights, then stored
+// on the sequence whose id matches the leaf's id. Finally
+// NormalizeWeights(1.0) is applied.
+void MSA::SetClustalWWeights(const Tree &tree)
+	{
+	const unsigned uLeafCount = tree.GetLeafCount();
+
+	// CalcClustalWWeights writes one entry per leaf, so the buffer is
+	// sized by the tree's leaf count (it used to be sized by the sequence
+	// count, which overflows when the tree has more leaves than the MSA
+	// has sequences).
+	WEIGHT *Weights = new WEIGHT[uLeafCount];
+
+	CalcClustalWWeights(tree, Weights);
+
+	for (unsigned n = 0; n < uLeafCount; ++n)
+		{
+		const WEIGHT w = Weights[n];
+		const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n);
+		const unsigned uId = tree.GetLeafId(uLeafNodeIndex);
+		const unsigned uSeqIndex = GetSeqIndex(uId);
+#if DEBUG
+		// Compare the names themselves, not the addresses of the strings.
+		if (0 != strcmp(GetSeqName(uSeqIndex), tree.GetLeafName(uLeafNodeIndex)))
+			Quit("MSA::SetClustalWWeights: names don't match");
+#endif
+		SetSeqWeight(uSeqIndex, w);
+		}
+	NormalizeWeights((WEIGHT) 1.0);
+
+	delete[] Weights;
+	}
Added: trunk/packages/muscle/branches/upstream/current/color.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/color.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/color.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,189 @@
+#include "muscle.h"
+#include "msa.h"
+
+// 23x23 table of substitution scores, indexed [row][column] by the letter
+// codes that toi() produces. Rows and columns are in the order shown in the
+// header comment: the letters A..Z without J, O and U. The table is
+// symmetric in the rows listed here; the B row repeats the D row and the
+// Z row repeats the E row, and every score involving X is -1.
+static int Blosum62[23][23] =
+ {
+// A B C D E F G H I K L M N P Q R S T V W X Y Z
+ +4, -2, +0, -2, -1, -2, +0, -2, -1, -1, -1, -1, -2, -1, -1, -1, +1, +0, +0, -3, -1, -2, -1, // A
+ -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // B
+ +0, -3, +9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -1, -2, -4, // C
+ -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // D
+ -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // E
+
+ -2, -3, -2, -3, -3, +6, -3, -1, +0, -3, +0, +0, -3, -4, -3, -3, -2, -2, -1, +1, -1, +3, -3, // F
+ +0, -1, -3, -1, -2, -3, +6, -2, -4, -2, -4, -3, +0, -2, -2, -2, +0, -2, -3, -2, -1, -3, -2, // G
+ -2, -1, -3, -1, +0, -1, -2, +8, -3, -1, -3, -2, +1, -2, +0, +0, -1, -2, -3, -2, -1, +2, +0, // H
+ -1, -3, -1, -3, -3, +0, -4, -3, +4, -3, +2, +1, -3, -3, -3, -3, -2, -1, +3, -3, -1, -1, -3, // I
+ -1, -1, -3, -1, +1, -3, -2, -1, -3, +5, -2, -1, +0, -1, +1, +2, +0, -1, -2, -3, -1, -2, +1, // K
+
+ -1, -4, -1, -4, -3, +0, -4, -3, +2, -2, +4, +2, -3, -3, -2, -2, -2, -1, +1, -2, -1, -1, -3, // L
+ -1, -3, -1, -3, -2, +0, -3, -2, +1, -1, +2, +5, -2, -2, +0, -1, -1, -1, +1, -1, -1, -1, -2, // M
+ -2, +1, -3, +1, +0, -3, +0, +1, -3, +0, -3, -2, +6, -2, +0, +0, +1, +0, -3, -4, -1, -2, +0, // N
+ -1, -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, +7, -1, -2, -1, -1, -2, -4, -1, -3, -1, // P
+ -1, +0, -3, +0, +2, -3, -2, +0, -3, +1, -2, +0, +0, -1, +5, +1, +0, -1, -2, -2, -1, -1, +2, // Q
+
+ -1, -2, -3, -2, +0, -3, -2, +0, -3, +2, -2, -1, +0, -2, +1, +5, -1, -1, -3, -3, -1, -2, +0, // R
+ +1, +0, -1, +0, +0, -2, +0, -1, -2, +0, -2, -1, +1, -1, +0, -1, +4, +1, -2, -3, -1, -2, +0, // S
+ +0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, +0, -1, -1, -1, +1, +5, +0, -2, -1, -2, -1, // T
+ +0, -3, -1, -3, -2, -1, -3, -3, +3, -2, +1, +1, -3, -2, -2, -3, -2, +0, +4, -3, -1, -1, -2, // V
+ -3, -4, -2, -4, -3, +1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3,+11, -1, +2, -3, // W
+
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // X
+ -2, -3, -2, -3, -2, +3, -3, +2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, +2, -1, +7, -2, // Y
+ -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // Z
+ };
+
+// Maps the letters 'A'..'Z' (by offset from 'A') to a row/column index of
+// the score table, or -1 for letters without an entry (J, O, U).
+static const int toi_tab[26] =
+	{
+	0,	// A
+	1,	// B
+	2,	// C
+	3,	// D
+	4,	// E
+	5,	// F
+	6,	// G
+	7,	// H
+	8,	// I
+	-1,	// J
+	9,	// K
+	10,	// L
+	11,	// M
+	12,	// N
+	-1,	// O
+	13,	// P
+	14,	// Q
+	15,	// R
+	16,	// S
+	17,	// T
+	-1,	// U
+	18,	// V
+	19,	// W
+	20,	// X
+	21,	// Y
+	22,	// Z
+	};
+
+// Convert a residue character (either case) to its score-table index.
+// Returns -1 for J, O, U and for any character that is not a letter
+// A-Z / a-z; such characters used to index outside toi_tab.
+static int toi(char c)
+	{
+	// The <ctype.h> functions require a value representable as unsigned
+	// char, so a possibly negative char must be converted first.
+	const int iUpper = toupper((unsigned char) c);
+	if (iUpper < 'A' || iUpper > 'Z')
+		return -1;
+	return toi_tab[iUpper - 'A'];
+	}
+
+// Score of aligning residue characters c1 and c2, looked up in Blosum62.
+// toi() returns -1 for characters without a table entry (e.g. J, O, U);
+// indexing with -1 would read outside the table, so such characters are
+// scored as the wildcard X instead.
+// NOTE(review): treating unknown residues as X is a choice made here;
+// confirm it suits the intended use.
+static int BlosumScore(char c1, char c2)
+	{
+	const int iWildcard = toi('X');
+	int i1 = toi(c1);
+	int i2 = toi(c2);
+	if (i1 < 0)
+		i1 = iWildcard;
+	if (i2 < 0)
+		i2 = iWildcard;
+	return Blosum62[i1][i2];
+	}
+
+/***
+Average pairwise score of a column.
+
+Consider a column with 5 As and 3 Bs.
+There are:
+	5x4 pairs of As.
+	3x2 pairs of Bs.
+	5x3x2 AB pairs
+	8x7 = 5x4 + 3x2 + 5x3x2 pairs of letters
+
+Gaps are ignored, and so are characters that have no entry in the score
+table (toi() returns -1 for them; counting them would write outside
+iCounts). Returns -9 when fewer than two characters remain.
+***/
+static double BlosumScoreCol(const MSA &a, unsigned uColIndex)
+	{
+	int iCounts[23];
+	memset(iCounts, 0, sizeof(iCounts));
+	const unsigned uSeqCount = a.GetSeqCount();
+	unsigned uCharCount = 0;
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		char c = a.GetChar(uSeqIndex, uColIndex);
+		if (IsGapChar(c))
+			continue;
+		int iChar = toi(c);
+		if (iChar < 0 || iChar >= 23)
+			continue;
+		++iCounts[iChar];
+		++uCharCount;
+		}
+	if (uCharCount < 2)
+		return -9;
+
+	// Sum over ordered pairs: n*(n-1) pairs of the same letter, and
+	// 2*n1*n2 pairs for each two different letters.
+	int iTotalScore = 0;
+	for (int i1 = 0; i1 < 23; ++i1)
+		{
+		int iCounts1 = iCounts[i1];
+		iTotalScore += iCounts1*(iCounts1 - 1)*Blosum62[i1][i1];
+		for (int i2 = i1 + 1; i2 < 23; ++i2)
+			iTotalScore += iCounts[i2]*iCounts1*2*Blosum62[i1][i2];
+		}
+	int iPairCount = uCharCount*(uCharCount - 1);
+	return (double) iTotalScore / (double) iPairCount;
+	}
+
+/***
+Assign a color code to every cell of one column.
+
+Consider a column with 5 As and 3 Bs.
+A residue of type Q scores:
+	5xAQ + 3xBQ
+
+The column gets a color level from its average pairwise score
+(BlosumScoreCol): 3 for a score >= 3.0, 1 for a score >= 0.2, else 0.
+A letter receives that level if its score against the most frequent
+letter of the column is at least the column score, otherwise 0.
+Gaps and characters without a score-table entry get color 0; the latter
+are also left out of the letter counts (counting them would write outside
+iCounts, because toi() returns -1 for them).
+***/
+static void AssignColorsCol(const MSA &a, unsigned uColIndex, int **Colors)
+	{
+	int iCounts[23];
+	memset(iCounts, 0, sizeof(iCounts));
+	const unsigned uSeqCount = a.GetSeqCount();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const char c = a.GetChar(uSeqIndex, uColIndex);
+		if (IsGapChar(c))
+			continue;
+		const int iChar = toi(c);
+		if (iChar < 0 || iChar >= 23)
+			continue;
+		++iCounts[iChar];
+		}
+
+	// Most frequent letter; the first one wins on ties, and letter 0 is
+	// chosen when the column has no countable letters.
+	int iMostConservedType = -1;
+	int iMostConservedCount = -1;
+	for (unsigned i = 0; i < 23; ++i)
+		{
+		if (iCounts[i] > iMostConservedCount)
+			{
+			iMostConservedType = i;
+			iMostConservedCount = iCounts[i];
+			}
+		}
+
+	double dColScore = BlosumScoreCol(a, uColIndex);
+	int iColColor;
+	if (dColScore >= 3.0)
+		iColColor = 3;
+	else if (dColScore >= 0.2)
+		iColColor = 1;
+	else
+		iColColor = 0;
+
+	int Color[23];
+	for (unsigned uLetter = 0; uLetter < 23; ++uLetter)
+		{
+		double dScore = Blosum62[uLetter][iMostConservedType];
+		if (dScore >= dColScore)
+			Color[uLetter] = iColColor;
+		else
+			Color[uLetter] = 0;
+		}
+
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const char c = a.GetChar(uSeqIndex, uColIndex);
+		if (IsGapChar(c))
+			{
+			Colors[uSeqIndex][uColIndex] = 0;
+			continue;
+			}
+		const int iLetter = toi(c);
+		if (iLetter >= 0 && iLetter < 23)
+			Colors[uSeqIndex][uColIndex] = Color[iLetter];
+		else
+			Colors[uSeqIndex][uColIndex] = 0;
+		}
+	}
+
+// Fill Colors[seq][col] for every column of the alignment, one column at
+// a time, left to right.
+void AssignColors(const MSA &a, int **Colors)
+	{
+	const unsigned uColTotal = a.GetColCount();
+	unsigned uCol = 0;
+	while (uCol < uColTotal)
+		{
+		AssignColorsCol(a, uCol, Colors);
+		++uCol;
+		}
+	}
Added: trunk/packages/muscle/branches/upstream/current/cons.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/cons.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/cons.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,118 @@
+/***
+Conservation value for a column in an MSA is defined as the number
+of times the most common letter appears divided by the number of
+sequences.
+***/
+
+#include "muscle.h"
+#include "msa.h"
+#include <math.h>
+
+// Mean of GetCons() over all columns that are not gap columns.
+// The alignment is expected to have at least one sequence and at least
+// one non-gap column (checked by assert only).
+double MSA::GetAvgCons() const
+	{
+	assert(GetSeqCount() > 0);
+
+	unsigned uCounted = 0;
+	double dTotal = 0;
+	for (unsigned uCol = 0; uCol < GetColCount(); ++uCol)
+		{
+		if (IsGapColumn(uCol))
+			continue;
+		dTotal += GetCons(uCol);
+		++uCounted;
+		}
+
+	assert(uCounted > 0);
+	const double dMean = dTotal / uCounted;
+	assert(dMean > 0 && dMean <= 1);
+	return dMean;
+	}
+
+// Conservation of one column: the number of times the most common letter
+// appears divided by the number of sequences. Gaps and the characters
+// X, B and Z (either case) are not counted. Returns 1 when nothing was
+// counted, i.e. for a column holding only gaps and/or X, B, Z.
+double MSA::GetCons(unsigned uColIndex) const
+	{
+	// Clear the whole array, not just the first g_AlphaSize entries, so
+	// that no entry is ever read uninitialized.
+	unsigned Counts[MAX_ALPHA];
+	for (unsigned uLetter = 0; uLetter < MAX_ALPHA; ++uLetter)
+		Counts[uLetter] = 0;
+
+	unsigned uMaxCount = 0;
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		if (IsGap(uSeqIndex, uColIndex))
+			continue;
+		// toupper() needs a value representable as unsigned char.
+		const int iUpper = toupper((unsigned char) GetChar(uSeqIndex, uColIndex));
+		if ('X' == iUpper || 'B' == iUpper || 'Z' == iUpper)
+			continue;
+		const unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
+		// NOTE(review): the range of GetLetter() is not visible here; a
+		// value outside the array is skipped rather than counted.
+		if (uLetter >= MAX_ALPHA)
+			continue;
+		const unsigned uCount = Counts[uLetter] + 1;
+		if (uCount > uMaxCount)
+			uMaxCount = uCount;
+		Counts[uLetter] = uCount;
+		}
+
+// Cons is undefined for all-gap column
+	if (0 == uMaxCount)
+		return 1;
+
+	double dCons = (double) uMaxCount / (double) uSeqCount;
+	assert(dCons > 0 && dCons <= 1);
+	return dCons;
+	}
+
+// Fractional identity of a pair of sequences: the number of columns where
+// both sequences have the same character divided by the number of columns
+// where neither has a gap. Columns with one or both positions gapped are
+// ignored. Returns 0 when no column qualifies.
+double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const
+	{
+	unsigned uCompared = 0;
+	unsigned uIdentical = 0;
+	const unsigned uColTotal = GetColCount();
+	for (unsigned uCol = 0; uCol < uColTotal; ++uCol)
+		{
+		const char cFirst = GetChar(uSeqIndex1, uCol);
+		const char cSecond = GetChar(uSeqIndex2, uCol);
+		if (IsGapChar(cFirst) || IsGapChar(cSecond))
+			continue;
+		++uCompared;
+		if (cFirst == cSecond)
+			++uIdentical;
+		}
+	if (0 == uCompared)
+		return 0;
+	return (double) uIdentical / (double) uCompared;
+	}
+
+// Fractional group identity of a pair of sequences: the number of columns
+// where both letters fall in the same residue group (per ResidueGroup[])
+// divided by the number of columns compared. Columns where either
+// position is a gap or a wildcard are ignored. Returns 0 when no column
+// qualifies.
+double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,
+  unsigned uSeqIndex2) const
+	{
+	extern unsigned ResidueGroup[];
+
+	unsigned uCompared = 0;
+	unsigned uSameGroup = 0;
+	const unsigned uColTotal = GetColCount();
+	for (unsigned uCol = 0; uCol < uColTotal; ++uCol)
+		{
+		const bool bSkip =
+		  IsGap(uSeqIndex1, uCol) ||
+		  IsGap(uSeqIndex2, uCol) ||
+		  IsWildcard(uSeqIndex1, uCol) ||
+		  IsWildcard(uSeqIndex2, uCol);
+		if (bSkip)
+			continue;
+
+		const unsigned uFirstLetter = GetLetter(uSeqIndex1, uCol);
+		const unsigned uSecondLetter = GetLetter(uSeqIndex2, uCol);
+		const unsigned uFirstGroup = ResidueGroup[uFirstLetter];
+		const unsigned uSecondGroup = ResidueGroup[uSecondLetter];
+		++uCompared;
+		if (uFirstGroup == uSecondGroup)
+			++uSameGroup;
+		}
+	if (0 == uCompared)
+		return 0;
+	return (double) uSameGroup / (double) uCompared;
+	}
Added: trunk/packages/muscle/branches/upstream/current/diaglist.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/diaglist.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/diaglist.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,378 @@
+#include "muscle.h"
+#include "diaglist.h"
+#include "pwpath.h"
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+// Appends a copy of d to the end of the list.
+// Calls Quit() when the list already holds MAX_DIAGS entries.
+void DiagList::Add(const Diag &d)
+    {
+    if (MAX_DIAGS == m_uCount)
+        Quit("DiagList::Add, overflow %u", m_uCount);
+    m_Diags[m_uCount++] = d;
+    }
+
+// Convenience overload: builds a Diag from its three fields and appends it.
+void DiagList::Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength)
+    {
+    Diag NewDiag;
+    NewDiag.m_uLength = uLength;
+    NewDiag.m_uStartPosB = uStartPosB;
+    NewDiag.m_uStartPosA = uStartPosA;
+    Add(NewDiag);
+    }
+
+// Returns the diagonal stored at uIndex.
+// Calls Quit() when uIndex is not below the current count.
+const Diag &DiagList::Get(unsigned uIndex) const
+    {
+    if (!(uIndex < m_uCount))
+        Quit("DiagList::Get(%u), count=%u", uIndex, m_uCount);
+    return m_Diags[uIndex];
+    }
+
+// Writes the list to the log as a table, one row per diagonal.
+void DiagList::LogMe() const
+    {
+    Log("DiagList::LogMe, count=%u\n", m_uCount);
+    Log(" n StartA StartB Length\n");
+    Log("--- ------ ------ ------\n");
+    unsigned uRow = 0;
+    while (uRow < m_uCount)
+        {
+        const Diag &Entry = m_Diags[uRow];
+        Log("%3u %6u %6u %6u\n",
+          uRow, Entry.m_uStartPosA, Entry.m_uStartPosB, Entry.m_uLength);
+        ++uRow;
+        }
+    }
+
+// Rebuilds this list from the runs of consecutive match ('M') edges in Path.
+// Every run of at least g_uMinDiagLength match edges becomes one diagonal;
+// its start position in each sequence is the prefix length of the first
+// edge of the run minus one. Shorter runs are dropped.
+void DiagList::FromPath(const PWPath &Path)
+    {
+    Clear();
+
+    const unsigned uEdgeCount = Path.GetEdgeCount();
+    unsigned uLength = 0;
+// Initialized so that the Add() calls below never read an indeterminate
+// value, even if g_uMinDiagLength is 0 and no match edge has been seen yet.
+    unsigned uStartPosA = 0;
+    unsigned uStartPosB = 0;
+    for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+        {
+        const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+
+        // Typical cases
+        if (Edge.cType == 'M')
+            {
+            // First edge of a new run: remember where the run starts.
+            if (0 == uLength)
+                {
+                uStartPosA = Edge.uPrefixLengthA - 1;
+                uStartPosB = Edge.uPrefixLengthB - 1;
+                }
+            ++uLength;
+            }
+        else
+            {
+            // A non-match edge ends the current run.
+            if (uLength >= g_uMinDiagLength)
+                Add(uStartPosA, uStartPosB, uLength);
+            uLength = 0;
+            }
+        }
+
+// Special case for last edge: a run that extends to the end of the
+// path has not been flushed by the loop.
+    if (uLength >= g_uMinDiagLength)
+        Add(uStartPosA, uStartPosB, uLength);
+    }
+
+// True iff d shares at least one cell with some diagonal in the list.
+bool DiagList::NonZeroIntersection(const Diag &d) const
+    {
+    unsigned uPos = 0;
+    while (uPos < m_uCount)
+        {
+        if (DiagOverlap(d, m_Diags[uPos]) > 0)
+            return true;
+        ++uPos;
+        }
+    return false;
+    }
+
+// DiagOverlap returns the number of matrix cells shared by
+// the two diagonals, i.e. the length of the intersection
+// of the two sets of cells.
+unsigned DiagOverlap(const Diag &d1, const Diag &d2)
+    {
+// Along a diagonal the difference B - A is constant, so it
+// identifies the line the diagonal lies on. Diagonals on
+// different lines cannot have a cell in common.
+    const int iOffset1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA;
+    const int iOffset2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA;
+    if (iOffset1 == iOffset2)
+        {
+// On the same line the projections onto A and onto B overlap
+// by the same amount; either one gives the answer.
+        assert(DiagOverlapA(d1, d2) == DiagOverlapB(d1, d2));
+        return DiagOverlapA(d1, d2);
+        }
+    return 0;
+    }
+
+// DiagOverlapA returns the length of the overlap between the
+// projections of the two diagonals onto the A axis
+// (0 when the projections are disjoint).
+unsigned DiagOverlapA(const Diag &d1, const Diag &d2)
+    {
+    const unsigned uLast1 = d1.m_uStartPosA + d1.m_uLength - 1;
+    const unsigned uLast2 = d2.m_uStartPosA + d2.m_uLength - 1;
+    const unsigned uSharedLast = MIN(uLast1, uLast2);
+    const unsigned uSharedFirst = MAX(d1.m_uStartPosA, d2.m_uStartPosA);
+
+    const int iSpan = (int) uSharedLast - (int) uSharedFirst + 1;
+    return iSpan < 0 ? 0 : (unsigned) iSpan;
+    }
+
+// DiagOverlapB returns the length of the overlap between the
+// projections of the two diagonals onto the B axis
+// (0 when the projections are disjoint).
+unsigned DiagOverlapB(const Diag &d1, const Diag &d2)
+    {
+    const unsigned uLast1 = d1.m_uStartPosB + d1.m_uLength - 1;
+    const unsigned uLast2 = d2.m_uStartPosB + d2.m_uLength - 1;
+    const unsigned uSharedLast = MIN(uLast1, uLast2);
+    const unsigned uSharedFirst = MAX(d1.m_uStartPosB, d2.m_uStartPosB);
+
+    const int iSpan = (int) uSharedLast - (int) uSharedFirst + 1;
+    return iSpan < 0 ? 0 : (unsigned) iSpan;
+    }
+
+// Tells whether the two diagonals can lie on one path through
+// the DP matrix. They can if they share cells, or if their
+// projections overlap on neither the A axis nor the B axis.
+// A false result means they "contradict" each other.
+bool DiagCompatible(const Diag &d1, const Diag &d2)
+    {
+    return DiagOverlap(d1, d2) > 0 ||
+      (0 == DiagOverlapA(d1, d2) && 0 == DiagOverlapB(d1, d2));
+    }
+
+// Returns the length of the "break" between two diagonals, i.e. the
+// number of cells separating the end of the earlier diagonal from the
+// start of the later one.
+// Returns 0 when the diagonals are not on the same diagonal line
+// (different B - A offset) or when they touch or overlap.
+unsigned DiagBreak(const Diag &d1, const Diag &d2)
+    {
+    int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA;
+    int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA;
+    if (c1 != c2)
+        return 0;
+
+// Each diagonal's end uses its own length. The original used
+// d1.m_uLength for d2 as well, which gave a wrong break whenever
+// the two lengths differ.
+    int iMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA);
+    int iMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1,
+      d2.m_uStartPosA + d2.m_uLength - 1);
+    int iBreak = iMaxStart - iMinEnd - 1;
+    if (iBreak < 0)
+        return 0;
+    return (unsigned) iBreak;
+    }
+
+// Merge diagonals that are continuations of each other with
+// short breaks of up to length g_uMaxDiagBreak.
+// In a sorted list of diagonals, we only have to check
+// consecutive entries. NOTE: currently a no-op, see the return below.
+void MergeDiags(DiagList &DL)
+ {
+ return; // merging is disabled: everything below is unreachable until the TODO is resolved
+#if DEBUG
+ if (!DL.IsSorted())
+ Quit("MergeDiags: !IsSorted");
+#endif
+
+// TODO: Fix this!
+// Breaks must be with no offset (no gaps)
+ const unsigned uCount = DL.GetCount();
+ if (uCount <= 1)
+ return;
+
+ DiagList NewList;
+
+ Diag MergedDiag;
+ const Diag *ptrPrev = &DL.Get(0); // diagonal (or merged run) that the next entry may extend
+ for (unsigned i = 1; i < uCount; ++i)
+ {
+ const Diag *ptrDiag = &DL.Get(i);
+ unsigned uBreakLength = DiagBreak(*ptrPrev, *ptrDiag); // DiagBreak also returns 0 for diagonals with different offsets, so those would be merged too
+ if (uBreakLength <= g_uMaxDiagBreak)
+ {
+ MergedDiag.m_uStartPosA = ptrPrev->m_uStartPosA;
+ MergedDiag.m_uStartPosB = ptrPrev->m_uStartPosB;
+ MergedDiag.m_uLength = ptrPrev->m_uLength + ptrDiag->m_uLength
+ + uBreakLength;
+ ptrPrev = &MergedDiag; // keep extending the merged run
+ }
+ else
+ {
+ NewList.Add(*ptrPrev); // run is finished: emit it and start a new one
+ ptrPrev = ptrDiag;
+ }
+ }
+ NewList.Add(*ptrPrev); // emit the last run
+ DL.Copy(NewList);
+ }
+
+// Removes diagonals that cannot appear together on one path.
+// Expects the list to be sorted by m_uStartPosA (checked by asserts).
+//
+// Pass 1: for every incompatible pair, if one diagonal is more than
+// four times longer than the other only the shorter one is flagged;
+// otherwise both are flagged.
+// Pass 2: among the diagonals still unflagged, any pair whose order
+// in B is not strictly increasing, or that is incompatible, is
+// flagged (both members).
+// Finally the unflagged diagonals are compacted to the front of the
+// array, preserving their order.
+void DiagList::DeleteIncompatible()
+    {
+    assert(IsSorted());
+
+    if (m_uCount < 2)
+        return;
+
+    bool *bFlagForDeletion = new bool[m_uCount];
+    for (unsigned i = 0; i < m_uCount; ++i)
+        bFlagForDeletion[i] = false;
+
+    for (unsigned i = 0; i < m_uCount; ++i)
+        {
+        const Diag &di = m_Diags[i];
+        for (unsigned j = i + 1; j < m_uCount; ++j)
+            {
+            const Diag &dj = m_Diags[j];
+
+            // Verify sorted correctly
+            assert(di.m_uStartPosA <= dj.m_uStartPosA);
+
+            // If two diagonals are incompatible and
+            // one is much longer than the other,
+            // keep the longer one.
+            if (!DiagCompatible(di, dj))
+                {
+                if (di.m_uLength > dj.m_uLength*4)
+                    bFlagForDeletion[j] = true;
+                else if (dj.m_uLength > di.m_uLength*4)
+                    bFlagForDeletion[i] = true;
+                else
+                    {
+                    bFlagForDeletion[i] = true;
+                    bFlagForDeletion[j] = true;
+                    }
+                }
+            }
+        }
+
+    for (unsigned i = 0; i < m_uCount; ++i)
+        {
+        const Diag &di = m_Diags[i];
+        if (bFlagForDeletion[i])
+            continue;
+
+        for (unsigned j = i + 1; j < m_uCount; ++j)
+            {
+            const Diag &dj = m_Diags[j];
+            if (bFlagForDeletion[j])
+                continue;
+
+            // Verify sorted correctly
+            assert(di.m_uStartPosA <= dj.m_uStartPosA);
+
+            // If sort order in B different from sorted order in A,
+            // either diags are incompatible or we detected a repeat
+            // or permutation.
+            if (di.m_uStartPosB >= dj.m_uStartPosB || !DiagCompatible(di, dj))
+                {
+                bFlagForDeletion[i] = true;
+                bFlagForDeletion[j] = true;
+                }
+            }
+        }
+
+    // Compact the survivors in place. The write position never
+    // overtakes the read position, so no scratch array is needed.
+    unsigned uNewCount = 0;
+    for (unsigned i = 0; i < m_uCount; ++i)
+        {
+        if (bFlagForDeletion[i])
+            continue;
+
+        m_Diags[uNewCount] = m_Diags[i];
+        ++uNewCount;
+        }
+    m_uCount = uNewCount;
+
+    // The flag array was previously never released (memory leak).
+    delete[] bFlagForDeletion;
+    }
+
+// Replaces the contents of this list with the entries of DL, in order.
+// This list is cleared before DL's count is read.
+void DiagList::Copy(const DiagList &DL)
+    {
+    Clear();
+    const unsigned uSourceCount = DL.GetCount();
+    unsigned uPos = 0;
+    while (uPos < uSourceCount)
+        {
+        Add(DL.Get(uPos));
+        ++uPos;
+        }
+    }
+
+// Check if sorted in increasing order of m_uStartPosA. NOTE: the check is currently switched off, see below.
+bool DiagList::IsSorted() const
+ {
+ return true; // NOTE(review): unconditional return -- always reports "sorted"; the loop below is unreachable. Confirm this is intended before relying on IsSorted().
+ unsigned uCount = GetCount();
+ for (unsigned i = 1; i < uCount; ++i)
+ if (m_Diags[i-1].m_uStartPosA > m_Diags[i].m_uStartPosA) // adjacent pair out of order
+ return false;
+ return true;
+ }
+
+// Sorts the list by increasing m_uStartPosA.
+// Plain bubble sort: adjacent out-of-order entries are swapped,
+// and passes repeat until a pass makes no swap. Speed is not a
+// concern because the lists stay short.
+void DiagList::Sort()
+    {
+    if (m_uCount < 2)
+        return;
+
+    bool bSwapped;
+    do
+        {
+        bSwapped = false;
+        for (unsigned uNext = 1; uNext < m_uCount; ++uNext)
+            {
+            Diag &Lower = m_Diags[uNext - 1];
+            Diag &Upper = m_Diags[uNext];
+            if (Lower.m_uStartPosA > Upper.m_uStartPosA)
+                {
+                const Diag Held = Lower;
+                Lower = Upper;
+                Upper = Held;
+                bSwapped = true;
+                }
+            }
+        }
+    while (bSwapped);
+    }
+
+//void TestDiag()
+// {
+// Diag d1;
+// Diag d2;
+// Diag d3;
+//
+// d1.m_uStartPosA = 0;
+// d1.m_uStartPosB = 1;
+// d1.m_uLength = 32;
+//
+// d2.m_uStartPosA = 55;
+// d2.m_uStartPosB = 70;
+// d2.m_uLength = 36;
+//
+// d3.m_uStartPosA = 102;
+// d3.m_uStartPosB = 122;
+// d3.m_uLength = 50;
+//
+// DiagList DL;
+// DL.Add(d1);
+// DL.Add(d2);
+// DL.Add(d3);
+//
+// Log("Before DeleteIncompatible:\n");
+// DL.LogMe();
+// DL.DeleteIncompatible();
+//
+// Log("After DeleteIncompatible:\n");
+// DL.LogMe();
+//
+// MergeDiags(DL);
+// Log("After Merge:\n");
+// DL.LogMe();
+//
+// DPRegionList RL;
+// DiagListToDPRegionList(DL, RL, 200, 200);
+// RL.LogMe();
+// }
Added: trunk/packages/muscle/branches/upstream/current/diaglist.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/diaglist.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/diaglist.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,89 @@
+#ifndef diaglist_h
+#define diaglist_h
+
+const unsigned EMPTY = (unsigned) ~0; // all bits set; NOTE(review): presumably a "no value" sentinel -- not used in the visible part of this header
+const unsigned MAX_DIAGS = 1024; // capacity of DiagList::m_Diags
+
+struct Diag // one diagonal segment: m_uLength cells starting at (m_uStartPosA, m_uStartPosB)
+ {
+ unsigned m_uStartPosA; // position of the first cell along the A axis
+ unsigned m_uStartPosB; // position of the first cell along the B axis
+ unsigned m_uLength; // number of cells; last cell is at start + length - 1 on both axes
+ };
+
+struct Rect // NOTE(review): presumably a rectangular region of the A x B matrix; not used in the visible part of this header -- confirm
+ {
+ unsigned m_uStartPosA; // start along A
+ unsigned m_uStartPosB; // start along B
+ unsigned m_uLengthA; // extent along A
+ unsigned m_uLengthB; // extent along B
+ };
+
+// Fixed-capacity list of diagonals: at most MAX_DIAGS entries held
+// in an embedded array, so no heap memory is owned by the object.
+class DiagList
+    {
+public:
+    DiagList() : m_uCount(0)
+        {
+        }
+    ~DiagList()
+        {
+        Free();
+        }
+
+public:
+// Creation
+    // Empties the list.
+    void Clear()
+        {
+        Free();
+        }
+    void FromPath(const PWPath &Path);
+    void Add(const Diag &d);
+    void Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength);
+    void DeleteIncompatible();
+
+// Accessors
+    // Number of diagonals currently stored.
+    unsigned GetCount() const
+        {
+        return m_uCount;
+        }
+    const Diag &Get(unsigned uIndex) const;
+
+// Operations
+    void Sort();
+    void Copy(const DiagList &DL);
+
+// Query
+    // True iff the given diagonal overlaps, in whole or in part,
+    // some diagonal in the list.
+    bool NonZeroIntersection(const Diag &d) const;
+    bool IsSorted() const;
+
+// Diagnostics
+    void LogMe() const;
+
+private:
+    // Nothing to release; just resets the entry count.
+    void Free()
+        {
+        m_uCount = 0;
+        }
+
+private:
+    unsigned m_uCount;
+    Diag m_Diags[MAX_DIAGS];
+    };
+
+unsigned DiagOverlap(const Diag &d1, const Diag &d2); // number of cells shared by d1 and d2 (0 if on different diagonal lines)
+unsigned DiagOverlapA(const Diag &d1, const Diag &d2); // overlap of the projections onto the A axis
+unsigned DiagOverlapB(const Diag &d1, const Diag &d2); // overlap of the projections onto the B axis
+unsigned DiagBreak(const Diag &d1, const Diag &d2); // length of the break between two diagonals
+bool DiagCompatible(const Diag &d1, const Diag &d2); // true if both can be on the same path through the DP matrix
+void CheckDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, // defined elsewhere
+ unsigned uLengthB, const MSA &msaA, const MSA &msaB, const PWPath &Path);
+void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, // defined elsewhere; DL is passed by non-const reference
+ unsigned uLengthY, DiagList &DL);
+void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, // defined elsewhere; NOTE(review): presumably the nucleotide variant of FindDiags -- confirm
+ unsigned uLengthY, DiagList &DL);
+void MergeDiags(DiagList &DL); // currently a no-op: its definition returns immediately
+
+#endif // diaglist_h
Added: trunk/packages/muscle/branches/upstream/current/diffobjscore.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/diffobjscore.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/diffobjscore.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,162 @@
+#include "muscle.h"
+#include "msa.h"
+#include "objscore.h"
+#include "profile.h"
+
+#define TRACE 0
+#define COMPARE_3_52 0
+#define BRUTE_LETTERS 0
+
+static SCORE ScoreColLetters(const MSA &msa, unsigned uColIndex) // weighted sum-of-pairs letter score of one column, computed from per-letter weight totals instead of all sequence pairs
+ {
+ SCOREMATRIX &Mx = *g_ptrScoreMatrix;
+ const unsigned uSeqCount = msa.GetSeqCount();
+
+#if BRUTE_LETTERS
+ SCORE BruteScore = 0; // reference value: explicit sum over all pairs, used only to cross-check the fast result below
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
+ {
+ unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex);
+ if (uLetter1 >= g_AlphaSize)
+ continue;
+ WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
+ for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)
+ {
+ unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex);
+ if (uLetter2 >= g_AlphaSize)
+ continue;
+ WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
+ BruteScore += w1*w2*Mx[uLetter1][uLetter2];
+ }
+ }
+#endif
+
+ double N = 0; // total sequence weight; only used for the early exit below
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
+ {
+ WEIGHT w = msa.GetSeqWeight(uSeqIndex1);
+ N += w;
+ }
+ if (N <= 0)
+ return 0;
+
+ FCOUNT Freqs[20]; // NOTE(review): fixed size 20 but indexed by letters below g_AlphaSize -- assumes g_AlphaSize <= 20, confirm
+ memset(Freqs, 0, sizeof(Freqs));
+ SCORE Score = 0;
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
+ {
+ unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex);
+ if (uLetter >= g_AlphaSize) // letters outside the alphabet are skipped
+ continue;
+ WEIGHT w = msa.GetSeqWeight(uSeqIndex1);
+ Freqs[uLetter] += w; // accumulate total weight per letter
+ Score -= w*w*Mx[uLetter][uLetter]; // remove each sequence's pairing with itself, which the f*f terms below include
+ }
+
+ for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1)
+ {
+ const FCOUNT f1 = Freqs[uLetter1];
+ Score += f1*f1*Mx[uLetter1][uLetter1];
+ for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2)
+ {
+ const FCOUNT f2 = Freqs[uLetter2];
+ Score += 2*f1*f2*Mx[uLetter1][uLetter2]; // factor 2 covers both (a,b) and (b,a); only Mx[a][b] with a < b is read
+ }
+ }
+ Score /= 2; // ordered pairs were summed; halve to count each unordered pair once
+#if BRUTE_LETTERS
+ assert(BTEq(BruteScore, Score));
+#endif
+ return Score;
+ }
+
+// Sums the per-column letter score (ScoreColLetters) over the columns
+// listed in Edges[0 .. uEdgeCount-1]. Each entry of Edges is used as a
+// column index into msa and must be below msa.GetColCount().
+static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[],
+  unsigned uEdgeCount)
+    {
+// Letters
+    SCORE Score = 0;
+    for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+        {
+        const unsigned uColIndex = Edges[uEdgeIndex];
+        // The column count is needed only for this check, so it is
+        // queried inside the assert and leaves no unused local when
+        // asserts are compiled out.
+        assert(uColIndex < msa.GetColCount());
+        Score += ScoreColLetters(msa, uColIndex);
+        }
+    return Score;
+    }
+
+// Fills Scores[c] with the letter score of column c for every column
+// of msa. Scores must have room for msa.GetColCount() entries.
+void GetLetterScores(const MSA &msa, SCORE Scores[])
+    {
+    const unsigned uCols = msa.GetColCount();
+    unsigned uCol = 0;
+    while (uCol < uCols)
+        {
+        Scores[uCol] = ScoreColLetters(msa, uCol);
+        ++uCol;
+        }
+    }
+
+SCORE DiffObjScore( // returns (letters + gaps score of msa2) - (letters + gaps score of msa1), each evaluated only on the listed columns
+ const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, // Path1 is not referenced in the body
+ const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2) // Path2 is not referenced in the body
+ {
+#if TRACE
+ {
+ Log("============DiffObjScore===========\n");
+ Log("msa1:\n");
+ msa1.LogMe();
+ Log("\n");
+ Log("Cols1: ");
+ for (unsigned i = 0; i < uEdgeCount1; ++i)
+ Log(" %u", Edges1[i]);
+ Log("\n\n");
+ Log("msa2:\n");
+ msa2.LogMe();
+ Log("Cols2: ");
+ for (unsigned i = 0; i < uEdgeCount2; ++i)
+ Log(" %u", Edges2[i]);
+ Log("\n\n");
+ }
+#endif
+
+#if COMPARE_3_52
+ extern SCORE g_SPScoreLetters;
+ extern SCORE g_SPScoreGaps;
+ SCORE SP1 = ObjScoreSP(msa1); // the two globals are read right after each call, before the next call
+ SCORE SPLetters1 = g_SPScoreLetters;
+ SCORE SPGaps1 = g_SPScoreGaps;
+
+ SCORE SP2 = ObjScoreSP(msa2);
+ SCORE SPLetters2 = g_SPScoreLetters;
+ SCORE SPGaps2 = g_SPScoreGaps;
+ SCORE SPDiffLetters = SPLetters2 - SPLetters1;
+ SCORE SPDiffGaps = SPGaps2 - SPGaps1;
+ SCORE SPDiff = SPDiffLetters + SPDiffGaps;
+#endif
+
+ SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1); // Edges entries are used as column indexes by ScoreLetters
+ SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2);
+
+ SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1); // ScoreGaps is defined elsewhere
+ SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2);
+
+ SCORE DiffLetters = Letters2 - Letters1;
+ SCORE DiffGaps = Gaps2 - Gaps1;
+ SCORE Diff = DiffLetters + DiffGaps;
+
+#if COMPARE_3_52
+ Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n",
+ SPLetters1, SPLetters2, SPDiffLetters);
+
+ Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n",
+ Letters1, Letters2, DiffLetters);
+
+ Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n",
+ SPGaps1, SPGaps2, SPDiffGaps);
+
+ Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n",
+ Gaps1, Gaps2, DiffGaps);
+
+ Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff);
+#endif
+
+ return Diff;
+ }
Added: trunk/packages/muscle/branches/upstream/current/diffpaths.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/diffpaths.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/diffpaths.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,114 @@
+#include "muscle.h"
+#include "pwpath.h"
+
+#define TRACE 0
+
+// Compares two paths edge by edge and reports where they differ.
+//
+// On return Edges1[0 .. *ptruDiffCount1-1] holds the indexes of the
+// edges of p1 that have no equal counterpart in p2, and likewise
+// Edges2 / *ptruDiffCount2 for p2. The caller must supply arrays with
+// room for every edge of the respective path.
+//
+// The paths are walked in parallel. Edges with the same prefix
+// lengths are compared directly; otherwise the edge that is behind
+// in A or in B is reported as a diff and its path advances. Once one
+// path is exhausted, all remaining edges of the other are diffs.
+// An empty path is handled without calling GetEdge() on it.
+void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[],
+  unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2)
+    {
+#if TRACE
+    Log("DiffPaths\n");
+    Log("p1=");
+    p1.LogMe();
+    Log("p2=");
+    p2.LogMe();
+#endif
+    const unsigned uEdgeCount1 = p1.GetEdgeCount();
+    const unsigned uEdgeCount2 = p2.GetEdgeCount();
+
+    unsigned uDiffCount1 = 0;
+    unsigned uDiffCount2 = 0;
+    unsigned uEdgeIndex1 = 0;
+    unsigned uEdgeIndex2 = 0;
+    while (uEdgeIndex1 < uEdgeCount1 && uEdgeIndex2 < uEdgeCount2)
+        {
+        const unsigned uEdgeIndexTop1 = uEdgeIndex1;
+        const unsigned uEdgeIndexTop2 = uEdgeIndex2;
+        const PWEdge *Edge1 = &p1.GetEdge(uEdgeIndex1);
+        const PWEdge *Edge2 = &p2.GetEdge(uEdgeIndex2);
+#if TRACE
+        Log("e1[%u] PLA%u PLB%u %c, e2[%u] PLA%u PLB %u %c DC1=%u DC2=%u\n",
+          uEdgeIndex1, Edge1->uPrefixLengthA, Edge1->uPrefixLengthB, Edge1->cType,
+          uEdgeIndex2, Edge2->uPrefixLengthA, Edge2->uPrefixLengthB, Edge2->cType,
+          uDiffCount1, uDiffCount2);
+#endif
+        if (Edge1->uPrefixLengthA == Edge2->uPrefixLengthA &&
+          Edge1->uPrefixLengthB == Edge2->uPrefixLengthB)
+            {
+            // Same cell reached by both paths: a diff only if the
+            // edges themselves are not equal.
+            if (!Edge1->Equal(*Edge2))
+                {
+                Edges1[uDiffCount1++] = uEdgeIndex1;
+                Edges2[uDiffCount2++] = uEdgeIndex2;
+                }
+            ++uEdgeIndex1;
+            ++uEdgeIndex2;
+            }
+
+        else if (Edge2->uPrefixLengthA < Edge1->uPrefixLengthA ||
+          Edge2->uPrefixLengthB < Edge1->uPrefixLengthB)
+            Edges2[uDiffCount2++] = uEdgeIndex2++;
+
+        else if (Edge1->uPrefixLengthA < Edge2->uPrefixLengthA ||
+          Edge1->uPrefixLengthB < Edge2->uPrefixLengthB)
+            Edges1[uDiffCount1++] = uEdgeIndex1++;
+
+        // Safety net: every iteration must advance at least one path.
+        if (uEdgeIndex1 == uEdgeIndexTop1 && uEdgeIndex2 == uEdgeIndexTop2)
+            Quit("DiffPaths stuck");
+        }
+
+    // At most one of these loops does any work: whatever is left on
+    // the path that was not exhausted is a diff.
+    while (uEdgeIndex2 < uEdgeCount2)
+        Edges2[uDiffCount2++] = uEdgeIndex2++;
+    while (uEdgeIndex1 < uEdgeCount1)
+        Edges1[uDiffCount1++] = uEdgeIndex1++;
+
+#if TRACE
+    Log("DiffCount1=%u (%u %u)\n", uDiffCount1, uEdgeCount1, uEdgeCount2);
+    Log("Diffs1=");
+    for (unsigned i = 0; i < uDiffCount1; ++i)
+        {
+        const PWEdge e = p1.GetEdge(Edges1[i]);
+        Log(" %u=%c%u.%u", Edges1[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB);
+        }
+    Log("\n");
+    Log("DiffCount2=%u\n", uDiffCount2);
+    Log("Diffs2=");
+    for (unsigned i = 0; i < uDiffCount2; ++i)
+        {
+        const PWEdge e = p2.GetEdge(Edges2[i]);
+        Log(" %u=%c%u.%u", Edges2[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB);
+        }
+    Log("\n");
+#endif
+    *ptruDiffCount1 = uDiffCount1;
+    *ptruDiffCount2 = uDiffCount2;
+    }
+
+// Smoke test for DiffPaths: a three-match path against a path that
+// replaces the middle match with a 'D' edge followed by an 'I' edge.
+// The results are not checked here.
+void TestDiffPaths()
+    {
+    PWPath p1;
+    PWPath p2;
+
+    // First path: M M M
+    p1.AppendEdge('M', 1, 1);
+    p1.AppendEdge('M', 2, 2);
+    p1.AppendEdge('M', 3, 3);
+
+    // Second path: M D I M
+    p2.AppendEdge('M', 1, 1);
+    p2.AppendEdge('D', 2, 1);
+    p2.AppendEdge('I', 2, 2);
+    p2.AppendEdge('M', 3, 3);
+
+    unsigned DiffEdges1[64];
+    unsigned DiffEdges2[64];
+    unsigned uCount1;
+    unsigned uCount2;
+    DiffPaths(p1, p2, DiffEdges1, &uCount1, DiffEdges2, &uCount2);
+    }
Added: trunk/packages/muscle/branches/upstream/current/difftrees.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/difftrees.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/difftrees.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,381 @@
+#include "muscle.h"
+#include "tree.h"
+
+#define TRACE 0
+
+/***
+Algorithm to compare two trees, X and Y.
+
+A node x in X and node y in Y are defined to be
+similar iff the set of leaves in the subtree under
+x is identical to the set of leaves under y.
+
+A node is defined to be dissimilar iff it is not
+similar to any node in the other tree.
+
+Nodes x and y are defined to be married iff every
+node in the subtree under x is similar to a node
+in the subtree under y. Married nodes are considered
+to be equal. The subtrees under two married nodes can
+at most differ by exchanges of left and right branches,
+which we do not consider to be significant here.
+
+A node is defined to be a bachelor iff it is not
+married. If a node is a bachelor, then it has a
+dissimilar node in its subtree, and it follows
+immediately from the definition of marriage that its
+parent is also a bachelor. Hence all nodes on the path
+from a bachelor node to the root are bachelors.
+
+We assume the trees have the same set of leaves, so
+every leaf is trivially both similar and married to
+the same leaf in the opposite tree. Bachelor nodes
+are therefore always internal (i.e., non-leaf) nodes.
+
+A node is defined to be a diff iff (a) it is married
+and (b) its parent is a bachelor. The subtree under
+a diff is maximally similar to the other tree. (In
+other words, you cannot extend the subtree without
+adding a bachelor).
+
+The set of diffs is the subset of the two trees that
+we consider to be identical.
+
+Example:
+
+ -----A
+ -----k
+ ----j -----B
+--i -----C
+ ------D
+
+
+ -----A
+ -----p
+ ----n -----B
+--m -----D
+ ------C
+
+
+The following pairs of internal nodes are similar.
+
+ Nodes Set of leaves
+ ----- -------------
+ k,p A,B
+ i,m A,B,C,D
+
+Bachelors in the first tree are i and j, bachelors
+in the second tree are m and n.
+
+Node k and p are married, but i and m are not (because j
+and n are bachelors). The diffs are C, D and k.
+
+The set of bachelor nodes can be viewed as the internal
+nodes of a tree, the leaves of which are diffs. (To see
+that there can't be disjoint subtrees, note that the path
+from a diff to a root is all bachelor nodes, so there is
+always a path between two diffs that goes through the root).
+We call this tree the "diffs tree".
+
+There is a simple O(N) algorithm to build the diffs tree.
+To achieve O(N) we avoid traversing a given subtree multiple
+times and also avoid comparing lists of leaves.
+
+We visit nodes in depth-first order (i.e., a node is visited
+before its parent).
+
+If either child of a node is a bachelor, we flag it as
+a bachelor.
+
+If both children of the node we are visiting are married,
+we check whether the spouses of those children have the
+same parent in the other tree. If the parents are different,
+the current node is a bachelor. If they have the same parent,
+then the node we are visiting is the spouse of that parent.
+We assign this newly identified married couple a unique integer
+id. The id of a node is in one-to-one correspondence with the
+set of leaves in its subtree. Two nodes have the same set of
+leaves iff they have the same id. Bachelor nodes do not get
+an id.
+***/
+
+// Recursively copies the bachelor part of tree into Diffs.
+//
+// uTreeNodeIndex is the node of tree being visited and uDiffsNodeIndex
+// the node of Diffs that corresponds to it.
+// - If the tree node is a diff, every leaf below it is mapped to
+//   uDiffsNodeIndex in IdToDiffsLeafNodeIndex (indexed by leaf id)
+//   and the recursion stops.
+// - Otherwise a branch is appended under uDiffsNodeIndex and the two
+//   children are visited; the right child of the new branch is taken
+//   to be the node index following the left one.
+// Reaching a leaf that is not a diff is a fatal error.
+static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex,
+  const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex,
+  unsigned IdToDiffsLeafNodeIndex[])
+    {
+#if TRACE
+    Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n",
+      uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex));
+#endif
+    if (!bIsDiff[uTreeNodeIndex])
+        {
+        if (tree.IsLeaf(uTreeNodeIndex))
+            Quit("BuildDiffs: should never reach leaf");
+
+        const unsigned uLeftChild = tree.GetLeft(uTreeNodeIndex);
+        const unsigned uRightChild = tree.GetRight(uTreeNodeIndex);
+
+        const unsigned uNewLeft = Diffs.AppendBranch(uDiffsNodeIndex);
+        const unsigned uNewRight = uNewLeft + 1;
+
+        BuildDiffs(tree, uLeftChild, bIsDiff, Diffs, uNewLeft, IdToDiffsLeafNodeIndex);
+        BuildDiffs(tree, uRightChild, bIsDiff, Diffs, uNewRight, IdToDiffsLeafNodeIndex);
+        return;
+        }
+
+    // Diff node: the buffer is sized for all leaves of the tree;
+    // GetLeaves reports how many it actually stored.
+    unsigned uSubtreeLeafCount = tree.GetLeafCount();
+    unsigned *SubtreeLeaves = new unsigned[uSubtreeLeafCount];
+    GetLeaves(tree, uTreeNodeIndex, SubtreeLeaves, &uSubtreeLeafCount);
+    for (unsigned uPos = 0; uPos < uSubtreeLeafCount; ++uPos)
+        {
+        const unsigned uLeaf = SubtreeLeaves[uPos];
+        const unsigned uId = tree.GetLeafId(uLeaf);
+        if (uId >= tree.GetLeafCount())
+            Quit("BuildDiffs, id out of range");
+        IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex;
+#if TRACE
+        Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex);
+#endif
+        }
+    delete[] SubtreeLeaves;
+    }
+
+void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs, // builds the "diffs tree" of Tree1 versus Tree2 into Diffs (algorithm described in the comment at the top of this file)
+ unsigned IdToDiffsLeafNodeIndex[]) // out: for each leaf id, the node index in Diffs that the leaf was mapped to
+ {
+#if TRACE
+ Log("Tree1:\n");
+ Tree1.LogMe();
+ Log("\n");
+ Log("Tree2:\n");
+ Tree2.LogMe();
+#endif
+
+ if (!Tree1.IsRooted() || !Tree2.IsRooted())
+ Quit("DiffTrees: requires rooted trees");
+
+ const unsigned uNodeCount = Tree1.GetNodeCount();
+ const unsigned uNodeCount2 = Tree2.GetNodeCount();
+
+ const unsigned uLeafCount = Tree1.GetLeafCount();
+ const unsigned uLeafCount2 = Tree2.GetLeafCount(); // only used by the assert below
+ assert(uLeafCount == uLeafCount2);
+
+ if (uNodeCount != uNodeCount2)
+ Quit("DiffTrees: different node counts");
+
+// Allocate tables so we can convert tree node index to
+// and from the unique id with a O(1) lookup.
+ unsigned *NodeIndexToId1 = new unsigned[uNodeCount]; // Tree1 node index -> id
+ unsigned *IdToNodeIndex2 = new unsigned[uNodeCount]; // id -> Tree2 node index
+
+ bool *bIsBachelor1 = new bool[uNodeCount];
+ bool *bIsDiff1 = new bool[uNodeCount];
+
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ {
+ NodeIndexToId1[uNodeIndex] = uNodeCount;
+ bIsBachelor1[uNodeIndex] = false;
+ bIsDiff1[uNodeIndex] = false;
+
+ // Use uNodeCount as value meaning "not set".
+ IdToNodeIndex2[uNodeIndex] = uNodeCount;
+ }
+
+// Initialize node index <-> id lookup tables
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ {
+ if (Tree1.IsLeaf(uNodeIndex))
+ {
+ const unsigned uId = Tree1.GetLeafId(uNodeIndex);
+ if (uId >= uNodeCount) // NOTE(review): bound is the node count although ids are later treated as 0 .. (leaf count - 1) -- confirm intended bound
+ Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
+ NodeIndexToId1[uNodeIndex] = uId;
+ }
+
+ if (Tree2.IsLeaf(uNodeIndex))
+ {
+ const unsigned uId = Tree2.GetLeafId(uNodeIndex);
+ if (uId >= uNodeCount)
+ Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
+ IdToNodeIndex2[uId] = uNodeIndex;
+ }
+ }
+
+// Validity check. This verifies that every id in
+// 0 .. (leaf count - 1) was assigned to some leaf of Tree2
+// (note that the id<N check above does not rule
+// out two leaves having duplicate ids, which leaves another id unset).
+ for (unsigned uId = 0; uId < uLeafCount; ++uId)
+ {
+ unsigned uNodeIndex2 = IdToNodeIndex2[uId];
+ if (uNodeCount == uNodeIndex2)
+ Quit("DiffTrees, check 2");
+ }
+
+// Ids assigned to internal nodes are N, N+1 ...
+// An internal node id uniquely identifies a set
+// of two or more leaves.
+ unsigned uInternalNodeId = uLeafCount;
+
+// Depth-first traversal of tree.
+// The order guarantees that a node is visited before
+// its parent is visited.
+ for (unsigned uNodeIndex1 = Tree1.FirstDepthFirstNode();
+ NULL_NEIGHBOR != uNodeIndex1;
+ uNodeIndex1 = Tree1.NextDepthFirstNode(uNodeIndex1))
+ {
+#if TRACE
+ Log("Main loop: Node1=%u IsLeaf=%d IsBachelor=%d\n",
+ uNodeIndex1,
+ Tree1.IsLeaf(uNodeIndex1),
+ bIsBachelor1[uNodeIndex1]);
+#endif
+
+ // Leaves are trivial; nothing to do. Nodes already flagged as bachelors are skipped too.
+ if (Tree1.IsLeaf(uNodeIndex1) || bIsBachelor1[uNodeIndex1])
+ continue;
+
+ // If either child is a bachelor, flag
+ // this node as a bachelor and continue.
+ unsigned uLeft1 = Tree1.GetLeft(uNodeIndex1);
+ if (bIsBachelor1[uLeft1])
+ {
+ bIsBachelor1[uNodeIndex1] = true;
+ continue;
+ }
+
+ unsigned uRight1 = Tree1.GetRight(uNodeIndex1);
+ if (bIsBachelor1[uRight1])
+ {
+ bIsBachelor1[uNodeIndex1] = true;
+ continue;
+ }
+
+ // Both children are married.
+ // Married nodes are guaranteed to have an id.
+ unsigned uIdLeft = NodeIndexToId1[uLeft1];
+ unsigned uIdRight = NodeIndexToId1[uRight1];
+
+ if (uIdLeft == uNodeCount || uIdRight == uNodeCount)
+ Quit("DiffTrees, check 5");
+
+ // uLeft2 is the spouse of uLeft1, and similarly for uRight2.
+ unsigned uLeft2 = IdToNodeIndex2[uIdLeft];
+ unsigned uRight2 = IdToNodeIndex2[uIdRight];
+
+ if (uLeft2 == uNodeCount || uRight2 == uNodeCount)
+ Quit("DiffTrees, check 6");
+
+ // If the spouses of uLeft1 and uRight1 have the same
+ // parent, then this parent is the spouse of uNodeIndex1.
+ // Otherwise, uNodeIndex1 is a bachelor.
+ unsigned uParentLeft2 = Tree2.GetParent(uLeft2);
+ unsigned uParentRight2 = Tree2.GetParent(uRight2);
+
+#if TRACE
+ Log("L1=%u R1=%u L2=%u R2=%u PL2=%u PR2=%u\n",
+ uLeft1,
+ uRight1,
+ uLeft2,
+ uRight2,
+ uParentLeft2,
+ uParentRight2);
+#endif
+
+ if (uParentLeft2 == uParentRight2)
+ {
+ NodeIndexToId1[uNodeIndex1] = uInternalNodeId; // new married couple gets the next free id
+ IdToNodeIndex2[uInternalNodeId] = uParentLeft2;
+ ++uInternalNodeId;
+ }
+ else
+ bIsBachelor1[uNodeIndex1] = true;
+ }
+
+ unsigned uDiffCount = 0; // counted below but not used afterwards in this function; the root special case is not counted
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ {
+ if (bIsBachelor1[uNodeIndex])
+ continue;
+ if (Tree1.IsRoot(uNodeIndex))
+ {
+ // Special case: if no bachelors, consider the
+ // root a diff. (The test below is always true here: bachelors were skipped above.)
+ if (!bIsBachelor1[uNodeIndex])
+ bIsDiff1[uNodeIndex] = true;
+ continue;
+ }
+ const unsigned uParent = Tree1.GetParent(uNodeIndex);
+ if (bIsBachelor1[uParent]) // married node with a bachelor parent: a diff
+ {
+ bIsDiff1[uNodeIndex] = true;
+ ++uDiffCount;
+ }
+ }
+
+#if TRACE
+ Log("Tree1:\n");
+ Log("Node Id Bach Diff Name\n");
+ Log("---- ---- ---- ---- ----\n");
+ for (unsigned n = 0; n < uNodeCount; ++n)
+ {
+ Log("%4u %4u %d %d",
+ n,
+ NodeIndexToId1[n],
+ bIsBachelor1[n],
+ bIsDiff1[n]);
+ if (Tree1.IsLeaf(n))
+ Log(" %s", Tree1.GetLeafName(n));
+ Log("\n");
+ }
+ Log("\n");
+ Log("Tree2:\n");
+ Log("Node Id Name\n");
+ Log("---- ---- ----\n");
+ for (unsigned n = 0; n < uNodeCount; ++n)
+ {
+ Log("%4u ", n);
+ if (Tree2.IsLeaf(n))
+ Log(" %s", Tree2.GetLeafName(n));
+ Log("\n");
+ }
+#endif
+
+ Diffs.CreateRooted();
+ const unsigned uDiffsRootIndex = Diffs.GetRootNodeIndex();
+ const unsigned uRootIndex1 = Tree1.GetRootNodeIndex();
+
+ for (unsigned n = 0; n < uLeafCount; ++n)
+ IdToDiffsLeafNodeIndex[n] = uNodeCount; // uNodeCount again means "not set"
+
+ BuildDiffs(Tree1, uRootIndex1, bIsDiff1, Diffs, uDiffsRootIndex,
+ IdToDiffsLeafNodeIndex);
+
+#if TRACE
+ Log("\n");
+ Log("Diffs:\n");
+ Diffs.LogMe();
+ Log("\n");
+ Log("IdToDiffsLeafNodeIndex:");
+ for (unsigned n = 0; n < uLeafCount; ++n)
+ {
+ if (n%16 == 0)
+ Log("\n");
+ else
+ Log(" ");
+ Log("%u=%u", n, IdToDiffsLeafNodeIndex[n]);
+ }
+ Log("\n");
+#endif
+
+ for (unsigned n = 0; n < uLeafCount; ++n)
+ if (IdToDiffsLeafNodeIndex[n] == uNodeCount) // every leaf id must have been mapped by BuildDiffs
+ Quit("TreeDiffs check 7");
+
+ delete[] NodeIndexToId1;
+ delete[] IdToNodeIndex2;
+
+ delete[] bIsBachelor1;
+ delete[] bIsDiff1;
+ }
Added: trunk/packages/muscle/branches/upstream/current/difftreese.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/difftreese.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/difftreese.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,235 @@
+#include "muscle.h"
+#include "tree.h"
+
+#define TRACE 0
+
+/***
+Algorithm to compare two trees, X and Y.
+
+A node x in X and node y in Y are defined to be
+similar iff the set of leaves in the subtree under
+x is identical to the set of leaves under y.
+
+A node is defined to be dissimilar iff it is not
+similar to any node in the other tree.
+
+Nodes x and y are defined to be married iff every
+node in the subtree under x is similar to a node
+in the subtree under y. Married nodes are considered
+to be equal. The subtrees under two married nodes can
+at most differ by exchanges of left and right branches,
+which we do not consider to be significant here.
+
+A node is changed (a "bachelor") iff it is not married. If a
+node is changed, then it has a dissimilar node in its subtree,
+and it follows immediately from the definition of marriage
+that its parent is also changed. Hence all nodes on the
+path from a changed node to the root are changed.
+
+We assume the trees have the same set of leaves, so
+every leaf is trivially both similar and married to
+the same leaf in the opposite tree. Changed nodes
+are therefore always internal (i.e., non-leaf) nodes.
+
+Example:
+
+ -----A
+ -----k
+ ----j -----B
+--i -----C
+ ------D
+
+
+ -----A
+ -----p
+ ----n -----B
+--m -----D
+ ------C
+
+
+The following pairs of internal nodes are similar.
+
+ Nodes Set of leaves
+ ----- -------------
+ k,p A,B
+ i,m A,B,C,D
+
+Changed nodes in the first tree are i and j, changed nodes
+in the second tree are m and n.
+
+Node k and p are married, but i and m are not (because j
+and n are changed). The diffs are C, D and k.
+
+To achieve O(N) we avoid traversing a given subtree multiple
+times and also avoid comparing lists of leaves.
+
+We visit nodes in depth-first order (i.e., a node is visited
+before its parent).
+
+If either child of a node is changed, we flag it as changed.
+
+If both children of the node we are visiting are married,
+we check whether the spouses of those children have the
+same parent in the other tree. If the parents are different,
+the current node is a bachelor. If they have the same parent,
+then the node we are visiting is the spouse of that parent.
+We assign this newly identified married couple a unique integer
+id. The id of a node is in one-to-one correspondence with the
+set of leaves in its subtree. Two nodes have the same set of
+leaves iff they have the same id. Changed nodes do not get
+an id.
+***/
+
+// Map every node of NewTree to its married partner in OldTree.
+//
+// On return NewNodeIndexToOldNodeIndex[n] holds, for each node index n of
+// NewTree, either the index of the OldTree node it is paired with or
+// NODE_CHANGED when no such node exists. Leaves are paired through their
+// leaf ids; an internal node is paired when both of its children are paired
+// and the partners of those children share a parent in OldTree.
+//
+// The caller supplies the array, which is written at every index below
+// NewTree.GetNodeCount(). Quits if either tree is unrooted or if the node or
+// leaf counts of the two trees differ.
+void DiffTreesE(const Tree &NewTree, const Tree &OldTree,
+  unsigned NewNodeIndexToOldNodeIndex[])
+    {
+#if TRACE
+    Log("DiffTreesE NewTree:\n");
+    NewTree.LogMe();
+    Log("\n");
+    Log("OldTree:\n");
+    OldTree.LogMe();
+#endif
+
+    if (!NewTree.IsRooted() || !OldTree.IsRooted())
+        Quit("DiffTrees: requires rooted trees");
+
+    const unsigned uNodeCount = NewTree.GetNodeCount();
+    const unsigned uOldNodeCount = OldTree.GetNodeCount();
+    const unsigned uLeafCount = NewTree.GetLeafCount();
+    const unsigned uOldLeafCount = OldTree.GetLeafCount();
+    if (uNodeCount != uOldNodeCount || uLeafCount != uOldLeafCount)
+        Quit("DiffTreesE: different node counts");
+
+        {
+        // Leaf id -> node index in OldTree. Every slot starts as uNodeCount,
+        // which is never a valid node index, so a leaf id that is missing
+        // from OldTree trips the assert below instead of reading
+        // uninitialized memory.
+        unsigned *IdToOldNodeIndex = new unsigned[uNodeCount];
+        for (unsigned n = 0; n < uNodeCount; ++n)
+            IdToOldNodeIndex[n] = uNodeCount;
+
+        for (unsigned uOldNodeIndex = 0; uOldNodeIndex < uNodeCount; ++uOldNodeIndex)
+            {
+            if (OldTree.IsLeaf(uOldNodeIndex))
+                {
+                unsigned Id = OldTree.GetLeafId(uOldNodeIndex);
+                // Guard the write into the uNodeCount-sized table.
+                assert(Id < uNodeCount);
+                IdToOldNodeIndex[Id] = uOldNodeIndex;
+                }
+            }
+
+    // Initialize NewNodeIndexToOldNodeIndex[]
+    // All internal nodes are marked as changed, but may be updated later.
+        for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
+            {
+            if (NewTree.IsLeaf(uNewNodeIndex))
+                {
+                unsigned uId = NewTree.GetLeafId(uNewNodeIndex);
+                assert(uId < uLeafCount);
+
+                unsigned uOldNodeIndex = IdToOldNodeIndex[uId];
+                assert(uOldNodeIndex < uNodeCount);
+
+                NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldNodeIndex;
+                }
+            else
+                NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
+            }
+        delete[] IdToOldNodeIndex;
+        }
+
+// Depth-first traversal of tree.
+// The order guarantees that a node is visited before
+// its parent is visited.
+    for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode();
+      NULL_NEIGHBOR != uNewNodeIndex;
+      uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex))
+        {
+        if (NewTree.IsLeaf(uNewNodeIndex))
+            continue;
+
+        // If either child is changed, flag this node (not the child, which
+        // is already flagged) as changed and continue.
+        const unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
+        const unsigned uOldLeft = NewNodeIndexToOldNodeIndex[uNewLeft];
+        if (NODE_CHANGED == uOldLeft)
+            {
+            NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
+            continue;
+            }
+
+        const unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
+        const unsigned uOldRight = NewNodeIndexToOldNodeIndex[uNewRight];
+        if (NODE_CHANGED == uOldRight)
+            {
+            NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
+            continue;
+            }
+
+        // Both children are paired: this node is paired iff the partners of
+        // its children have a common parent in the old tree.
+        const unsigned uOldParentLeft = OldTree.GetParent(uOldLeft);
+        const unsigned uOldParentRight = OldTree.GetParent(uOldRight);
+        if (uOldParentLeft == uOldParentRight)
+            NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldParentLeft;
+        else
+            NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
+        }
+
+#if TRACE
+        {
+        Log("NewToOld ");
+        for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
+            {
+            Log(" [%3u]=", uNewNodeIndex);
+            if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewNodeIndex])
+                Log(" X");
+            else
+                Log("%3u", NewNodeIndexToOldNodeIndex[uNewNodeIndex]);
+            if ((uNewNodeIndex+1)%8 == 0)
+                Log("\n ");
+            }
+        Log("\n");
+        }
+#endif
+
+#if DEBUG
+        {
+        // Self-check: leaves must map to the leaf with the same id, and every
+        // paired internal node must have children paired with the children
+        // of its partner, in either left/right order.
+        for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
+            {
+            unsigned uOld = NewNodeIndexToOldNodeIndex[uNewNodeIndex];
+            if (NewTree.IsLeaf(uNewNodeIndex))
+                {
+                if (uOld >= uNodeCount)
+                    {
+                    Log("NewNode=%u uOld=%u > uNodeCount=%u\n",
+                      uNewNodeIndex, uOld, uNodeCount);
+                    Quit("Diff check failed");
+                    }
+                unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex);
+                unsigned uIdOld = OldTree.GetLeafId(uOld);
+                if (uIdNew != uIdOld)
+                    {
+                    Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n",
+                      uNewNodeIndex, uOld, uIdNew, uIdOld);
+                    Quit("Diff check failed");
+                    }
+                continue;
+                }
+
+            if (NODE_CHANGED == uOld)
+                continue;
+
+            unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
+            unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
+
+            unsigned uOldLeft = OldTree.GetLeft(uOld);
+            unsigned uOldRight = OldTree.GetRight(uOld);
+
+            unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft];
+            unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight];
+
+            bool bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight);
+            bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft);
+            if (!bSameNotRotated && !bSameRotated)
+                {
+                Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight);
+                Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight);
+                Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner);
+                Quit("Diff check failed");
+                }
+            }
+        }
+#endif
+    }
Added: trunk/packages/muscle/branches/upstream/current/distcalc.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/distcalc.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/distcalc.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,72 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "distcalc.h"
+#include "msa.h"
+
+// Bind this calculator to DF. Only the address is kept, so DF has to stay
+// alive for as long as the other members are called.
+void DistCalcDF::Init(const DistFunc &DF)
+    {
+    this->m_ptrDF = &DF;
+    }
+
+// Store in Dist[0..i-1] the distance from item i to each lower-indexed
+// item, as reported by the bound DistFunc.
+void DistCalcDF::CalcDistRange(unsigned i, dist_t Dist[]) const
+    {
+    unsigned uOther = 0;
+    while (uOther < i)
+        {
+        Dist[uOther] = m_ptrDF->GetDist(i, uOther);
+        ++uOther;
+        }
+    }
+
+// Number of items, forwarded from the bound DistFunc.
+unsigned DistCalcDF::GetCount() const
+    {
+    const unsigned uItemCount = m_ptrDF->GetCount();
+    return uItemCount;
+    }
+
+// Id of item i, forwarded from the bound DistFunc.
+unsigned DistCalcDF::GetId(unsigned i) const
+    {
+    const unsigned uId = m_ptrDF->GetId(i);
+    return uId;
+    }
+
+// Name of item i, forwarded from the bound DistFunc.
+const char *DistCalcDF::GetName(unsigned i) const
+    {
+    const char *ptrName = m_ptrDF->GetName(i);
+    return ptrName;
+    }
+
+// Bind this calculator to an alignment and choose the distance measure.
+// Only the address of msa is kept, so msa has to stay alive for as long
+// as the other members are called.
+void DistCalcMSA::Init(const MSA &msa, DISTANCE Distance)
+    {
+    this->m_Distance = Distance;
+    this->m_ptrMSA = &msa;
+    }
+
+// Store in Dist[0..i-1] the distance from sequence i to each lower-indexed
+// sequence. The pairwise percent identity from the alignment is converted
+// with KimuraDist or PctIdToMAFFTDist according to m_Distance; any other
+// setting calls Quit.
+void DistCalcMSA::CalcDistRange(unsigned i, dist_t Dist[]) const
+    {
+    unsigned uOther = 0;
+    while (uOther < i)
+        {
+        const float fPctId = (float) m_ptrMSA->GetPctIdentityPair(i, uOther);
+        if (DISTANCE_PctIdKimura == m_Distance)
+            Dist[uOther] = (float) KimuraDist(fPctId);
+        else if (DISTANCE_PctIdLog == m_Distance)
+            Dist[uOther] = (float) PctIdToMAFFTDist(fPctId);
+        else
+            Quit("DistCalcMSA: Invalid DISTANCE_%u", m_Distance);
+        ++uOther;
+        }
+    }
+
+// Number of sequences in the bound alignment.
+unsigned DistCalcMSA::GetCount() const
+    {
+    const unsigned uSeqCount = m_ptrMSA->GetSeqCount();
+    return uSeqCount;
+    }
+
+// Id of sequence i in the bound alignment.
+unsigned DistCalcMSA::GetId(unsigned i) const
+    {
+    const unsigned uSeqId = m_ptrMSA->GetSeqId(i);
+    return uSeqId;
+    }
+
+// Name of sequence i in the bound alignment.
+const char *DistCalcMSA::GetName(unsigned i) const
+    {
+    const char *ptrSeqName = m_ptrMSA->GetSeqName(i);
+    return ptrSeqName;
+    }
Added: trunk/packages/muscle/branches/upstream/current/distcalc.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/distcalc.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/distcalc.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,45 @@
+#ifndef DistCalc_h
+#define DistCalc_h
+
+typedef float dist_t;
+const dist_t BIG_DIST = (dist_t) 1e29;
+
+class DistFunc;
+
+// Abstract source of pairwise distances between indexed items.
+// Implemented by DistCalcDF and DistCalcMSA.
+class DistCalc
+    {
+public:
+    // Virtual so an implementation can be destroyed safely through a
+    // DistCalc pointer or reference.
+    virtual ~DistCalc() {}
+
+    // Write distances involving item i into Dist[]. The implementations in
+    // distcalc.cpp fill Dist[j] for every j < i.
+    virtual void CalcDistRange(unsigned i, dist_t Dist[]) const = 0;
+
+    // Number of items distances are defined over.
+    virtual unsigned GetCount() const = 0;
+
+    // Id and name associated with item i.
+    virtual unsigned GetId(unsigned i) const = 0;
+    virtual const char *GetName(unsigned i) const = 0;
+    };
+
+// DistCalc implementation that forwards every query to a DistFunc.
+class DistCalcDF : public DistCalc
+ {
+public:
+ // Bind to DF; only its address is stored.
+ void Init(const DistFunc &DF);
+ virtual void CalcDistRange(unsigned i, dist_t Dist[]) const;
+ virtual unsigned GetCount() const;
+ virtual unsigned GetId(unsigned i) const;
+ virtual const char *GetName(unsigned i) const;
+
+private:
+ // Not owned. No constructor is declared, so this is unset until Init().
+ const DistFunc *m_ptrDF;
+ };
+
+// DistCalc implementation that derives distances from the pairwise
+// percent identities of an alignment (MSA).
+class DistCalcMSA : public DistCalc
+ {
+public:
+ // Bind to msa (only its address is stored) and select the measure.
+ void Init(const MSA &msa, DISTANCE Distance);
+ virtual void CalcDistRange(unsigned i, dist_t Dist[]) const;
+ virtual unsigned GetCount() const;
+ virtual unsigned GetId(unsigned i) const;
+ virtual const char *GetName(unsigned i) const;
+
+private:
+ // Not owned. No constructor is declared, so both members are unset
+ // until Init().
+ const MSA *m_ptrMSA;
+ DISTANCE m_Distance;
+ };
+
+#endif // DistCalc_h
Added: trunk/packages/muscle/branches/upstream/current/distfunc.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/distfunc.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/distfunc.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,113 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include <assert.h>
+
+// Start empty: no items and no storage allocated.
+DistFunc::DistFunc()
+  : m_uCount(0),
+    m_uCacheCount(0),
+    m_Dists(0),
+    m_Names(0),
+    m_Ids(0)
+    {
+    }
+
+// Release the name strings and the three arrays.
+DistFunc::~DistFunc()
+    {
+    if (0 != m_Names)
+        {
+        // m_Names was allocated with m_uCacheCount zero-filled slots, and
+        // SetCount() can lower m_uCount without releasing anything. Walk
+        // the whole allocation so names stored before a shrink are freed
+        // too; free() of a null slot is a no-op.
+        for (unsigned i = 0; i < m_uCacheCount; ++i)
+            free(m_Names[i]);
+        }
+    delete[] m_Dists;
+    delete[] m_Names;
+    delete[] m_Ids;
+    }
+
+// Distance stored for the ordered pair (uIndex1, uIndex2).
+float DistFunc::GetDist(unsigned uIndex1, unsigned uIndex2) const
+    {
+    const unsigned uSlot = VectorIndex(uIndex1, uIndex2);
+    return m_Dists[uSlot];
+    }
+
+// Current number of items.
+unsigned DistFunc::GetCount() const
+    {
+    return this->m_uCount;
+    }
+
+// Set the number of items to uCount.
+//
+// If the existing storage is large enough it is kept untouched and only
+// the count changes. Otherwise all storage is released and reallocated
+// for uCount items, with distances zeroed, names null and ids set to
+// all-ones bytes.
+void DistFunc::SetCount(unsigned uCount)
+    {
+    if (uCount <= m_uCacheCount)
+        {
+        m_uCount = uCount;
+        return;
+        }
+
+    // Growing: release everything held so far. Previously only m_Dists was
+    // deleted, so the name and id arrays and every saved name string leaked.
+    if (0 != m_Names)
+        {
+        for (unsigned i = 0; i < m_uCacheCount; ++i)
+            free(m_Names[i]);
+        }
+    delete[] m_Dists;
+    delete[] m_Names;
+    delete[] m_Ids;
+    m_Dists = 0;
+    m_Names = 0;
+    m_Ids = 0;
+    m_uCacheCount = 0;
+
+    m_uCount = uCount;
+    const unsigned uLength = VectorLength();
+    m_Dists = new float[uLength];
+    m_Names = new char *[m_uCount];
+    m_Ids = new unsigned[m_uCount];
+    m_uCacheCount = uCount;
+
+    memset(m_Names, 0, m_uCount*sizeof(char *));
+    memset(m_Ids, 0xff, m_uCount*sizeof(unsigned));
+    memset(m_Dists, 0, uLength*sizeof(float));
+    }
+
+// Store dDist for the pair in both orders, keeping the matrix symmetric.
+void DistFunc::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist)
+    {
+    const unsigned uForward = VectorIndex(uIndex1, uIndex2);
+    m_Dists[uForward] = dDist;
+    const unsigned uReverse = VectorIndex(uIndex2, uIndex1);
+    m_Dists[uReverse] = dDist;
+    }
+
+// Offset of (uIndex1, uIndex2) in m_Dists, laid out row by row with
+// m_uCount entries per row.
+unsigned DistFunc::VectorIndex(unsigned uIndex1, unsigned uIndex2) const
+    {
+    assert(uIndex1 < m_uCount && uIndex2 < m_uCount);
+    const unsigned uRowStart = uIndex1*m_uCount;
+    return uRowStart + uIndex2;
+    }
+
+// Number of entries in the full square distance matrix.
+unsigned DistFunc::VectorLength() const
+    {
+    const unsigned uSide = m_uCount;
+    return uSide*uSide;
+    }
+
+// Store a private copy of szName as the name of item uIndex, releasing any
+// name previously stored there.
+void DistFunc::SetName(unsigned uIndex, const char szName[])
+    {
+    assert(uIndex < m_uCount);
+    // Copy before freeing, in case szName is the currently stored name
+    // (e.g. obtained from GetName()).
+    char *ptrNewName = strsave(szName);
+    free(m_Names[uIndex]);
+    m_Names[uIndex] = ptrNewName;
+    }
+
+// Record uId as the id of item uIndex.
+void DistFunc::SetId(unsigned uIndex, unsigned uId)
+    {
+    assert(uIndex < m_uCount);
+    unsigned &uSlot = m_Ids[uIndex];
+    uSlot = uId;
+    }
+
+// Name stored for item uIndex; null if none has been set since the
+// storage was allocated.
+const char *DistFunc::GetName(unsigned uIndex) const
+    {
+    assert(uIndex < m_uCount);
+    const char *ptrName = m_Names[uIndex];
+    return ptrName;
+    }
+
+// Id stored for item uIndex.
+unsigned DistFunc::GetId(unsigned uIndex) const
+    {
+    assert(uIndex < m_uCount);
+    const unsigned uId = m_Ids[uIndex];
+    return uId;
+    }
+
+// Write the matrix to the log: a row of column indexes, a row of
+// (truncated) names, then the lower triangle including the diagonal,
+// one row per item. Unset names print as empty strings.
+void DistFunc::LogMe() const
+    {
+    Log("DistFunc::LogMe count=%u\n", m_uCount);
+
+    Log(" ");
+    unsigned uCol = 0;
+    while (uCol < m_uCount)
+        {
+        Log(" %7u", uCol);
+        ++uCol;
+        }
+    Log("\n");
+
+    Log(" ");
+    for (uCol = 0; uCol < m_uCount; ++uCol)
+        {
+        const char *ptrName = m_Names[uCol];
+        if (0 == ptrName)
+            ptrName = "";
+        Log(" %7.7s", ptrName);
+        }
+    Log("\n");
+
+    for (unsigned uRow = 0; uRow < m_uCount; ++uRow)
+        {
+        const char *ptrRowName = m_Names[uRow];
+        if (0 == ptrRowName)
+            ptrRowName = "";
+        Log("%4u %10.10s : ", uRow, ptrRowName);
+        for (uCol = 0; uCol <= uRow; ++uCol)
+            Log(" %7.4g", GetDist(uRow, uCol));
+        Log("\n");
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/distfunc.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/distfunc.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/distfunc.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,36 @@
+#ifndef DistFunc_h
+#define DistFunc_h
+
+// Square matrix of pairwise float distances between uCount items, plus a
+// name and an id per item.
+// NOTE(review): the class owns raw arrays but declares no copy constructor
+// or assignment operator, so copying an instance would share and then
+// double-free them -- confirm no caller copies it.
+class DistFunc
+ {
+public:
+ DistFunc();
+ virtual ~DistFunc();
+
+public:
+ // Set the item count; storage is reallocated only when it must grow.
+ virtual void SetCount(unsigned uCount);
+ // Store dDist for the pair in both orders.
+ virtual void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
+
+ // SetName stores a copy of szName made with strsave().
+ void SetName(unsigned uIndex, const char szName[]);
+ void SetId(unsigned uIndex, unsigned uId);
+ const char *GetName(unsigned uIndex) const;
+ unsigned GetId(unsigned uIndex) const;
+
+ virtual float GetDist(unsigned uIndex1, unsigned uIndex2) const;
+ virtual unsigned GetCount() const;
+
+ // Diagnostics
+ void LogMe() const;
+
+protected:
+ // Offset of a pair in m_Dists (row uIndex, column uIndex2).
+ unsigned VectorIndex(unsigned uIndex, unsigned uIndex2) const;
+ // m_uCount*m_uCount.
+ unsigned VectorLength() const;
+
+private:
+ unsigned m_uCount; // current number of items
+ unsigned m_uCacheCount; // number of items the arrays were allocated for
+ float *m_Dists; // distances, m_uCount entries per row
+ char **m_Names; // per-item names, released with free()
+ unsigned *m_Ids; // per-item ids
+ };
+
+#endif // DistFunc_h
Added: trunk/packages/muscle/branches/upstream/current/distpwkimura.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/distpwkimura.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/distpwkimura.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,45 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "pwpath.h"
+
+// Fill DF with Kimura distances between every pair of sequences in v.
+//
+// Each pair is aligned with AlignTwoMSAs, the percent identity of the
+// resulting two-row alignment is taken and converted with KimuraDist.
+// DF is resized to v.Length() items. The sequence weighting method is
+// switched to Henikoff for the duration of the call and restored
+// afterwards.
+void DistPWKimura(const SeqVect &v, DistFunc &DF)
+    {
+    SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
+    SetSeqWeightMethod(SEQWEIGHT_Henikoff);
+
+    const unsigned uSeqCount = v.Length();
+    DF.SetCount(uSeqCount);
+
+    // The loops below visit every unordered pair (i, j) with j < i exactly
+    // once, which is N*(N-1)/2 pairs. The total used to be N*(N+1)/2, so
+    // the progress report could never reach completion.
+    const unsigned uPairCount =
+      (uSeqCount < 2) ? 0 : (uSeqCount*(uSeqCount - 1))/2;
+    unsigned uCount = 0;
+    SetProgressDesc("PWKimura distance");
+    for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
+        {
+        const Seq &s1 = v.GetSeq(uSeqIndex1);
+        MSA msa1;
+        msa1.FromSeq(s1);
+        for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
+            {
+            // Report progress every 20 pairs.
+            if (0 == uCount%20)
+                Progress(uCount, uPairCount);
+            ++uCount;
+            const Seq &s2 = v.GetSeq(uSeqIndex2);
+            MSA msa2;
+            msa2.FromSeq(s2);
+
+            PWPath Path;
+            MSA msaOut;
+            AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);
+
+            double dPctId = msaOut.GetPctIdentityPair(0, 1);
+            float f = (float) KimuraDist(dPctId);
+
+            DF.SetDist(uSeqIndex1, uSeqIndex2, f);
+            }
+        }
+    ProgressStepsDone();
+
+    SetSeqWeightMethod(SeqWeightSave);
+    }
Added: trunk/packages/muscle/branches/upstream/current/domuscle.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/domuscle.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/domuscle.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,299 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "seqvect.h"
+#include "distfunc.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "timing.h"
+
+static char g_strUseTreeWarning[] =
+"\n******** WARNING ****************\n"
+"\nYou specified the -usetree option.\n"
+"Note that a good evolutionary tree may NOT be a good\n"
+"guide tree for multiple alignment. For more details,\n"
+"please refer to the user guide. To disable this\n"
+"warning, use -usetree_nowarn <treefilename>.\n\n";
+
+// Top-level driver for a standard alignment run.
+//
+// Reads the input FASTA file, selects the alphabet and scoring, obtains a
+// guide tree (from -usetree or from the sequences), builds a progressive
+// alignment, optionally refines the tree and the alignment, and writes
+// the result with MuscleOutput. Several options cause an early return:
+// a single input sequence, clustering only (g_bCluster with a Tree1
+// file), weight output, or an iteration limit of 1 / only two sequences.
+void DoMuscle()
+    {
+    SetOutputFileName(g_pstrOutFileName);
+    SetInputFileName(g_pstrInFileName);
+
+    SetMaxIters(g_uMaxIters);
+    SetSeqWeightMethod(g_SeqWeight1);
+
+    TextFile fileIn(g_pstrInFileName);
+    SeqVect v;
+    v.FromFASTAFile(fileIn);
+    const unsigned uSeqCount = v.Length();
+
+    if (0 == uSeqCount)
+        Quit("No sequences in input file");
+
+    ALPHA Alpha = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alpha = v.GuessAlpha();
+        break;
+
+    case SEQTYPE_Protein:
+        Alpha = ALPHA_Amino;
+        break;
+
+    case SEQTYPE_DNA:
+        Alpha = ALPHA_DNA;
+        break;
+
+    case SEQTYPE_RNA:
+        Alpha = ALPHA_RNA;
+        break;
+
+    default:
+        Quit("Invalid seq type");
+        }
+    SetAlpha(Alpha);
+    v.FixAlpha();
+
+    PTR_SCOREMATRIX UserMatrix = 0;
+    if (0 != g_pstrMatrixFileName)
+        {
+        const char *FileName = g_pstrMatrixFileName;
+        char *NewFileName = 0;
+        // MUSCLE_MXPATH, when set, is prepended as a directory.
+        const char *Path = getenv("MUSCLE_MXPATH");
+        if (Path != 0)
+            {
+            // Path + '/' + FileName + terminating NUL.
+            size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
+            NewFileName = new char[n];
+            sprintf(NewFileName, "%s/%s", Path, FileName);
+            FileName = NewFileName;
+            }
+            {
+            // Inner scope so the TextFile is destroyed before the buffer
+            // holding its name is released below.
+            TextFile File(FileName);
+            UserMatrix = ReadMx(File);
+            }
+        // Previously leaked.
+        delete[] NewFileName;
+        g_Alpha = ALPHA_Amino;
+        g_PPScore = PPSCORE_SP;
+        }
+
+    SetPPScore();
+
+    if (0 != UserMatrix)
+        g_ptrScoreMatrix = UserMatrix;
+
+    unsigned uMaxL = 0;
+    unsigned uTotL = 0;
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        unsigned L = v.GetSeq(uSeqIndex).Length();
+        uTotL += L;
+        if (L > uMaxL)
+            uMaxL = L;
+        }
+
+    SetIter(1);
+    g_bDiags = g_bDiags1;
+    SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);
+
+    SetMuscleSeqVect(v);
+
+    MSA::SetIdCount(uSeqCount);
+
+// Initialize sequence ids.
+// From this point on, ids must somehow propagate from here.
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        v.SetSeqId(uSeqIndex, uSeqIndex);
+
+    // An empty input was already rejected above, so only the
+    // single-sequence case needs special handling here.
+    if (1 == uSeqCount)
+        {
+        TextFile fileOut(g_pstrOutFileName, true);
+        v.ToFile(fileOut);
+        return;
+        }
+
+    // At least two sequences from here on.
+    MHackStart(v);
+
+// First iteration
+    Tree GuideTree;
+    if (0 != g_pstrUseTreeFileName)
+        {
+        // Discourage users...
+        // Print through "%s" so the text is never parsed as a format string.
+        if (!g_bUseTreeNoWarn)
+            fprintf(stderr, "%s", g_strUseTreeWarning);
+
+        // Read tree from file
+        TextFile TreeFile(g_pstrUseTreeFileName);
+        GuideTree.FromFile(TreeFile);
+
+        // Make sure tree is rooted
+        if (!GuideTree.IsRooted())
+            Quit("User tree must be rooted");
+
+        if (GuideTree.GetLeafCount() != uSeqCount)
+            Quit("User tree does not match input sequences");
+
+        // Every leaf label must name an input sequence.
+        const unsigned uNodeCount = GuideTree.GetNodeCount();
+        for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+            {
+            if (!GuideTree.IsLeaf(uNodeIndex))
+                continue;
+            const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
+            unsigned uSeqIndex;
+            bool SeqFound = v.FindName(LeafName, &uSeqIndex);
+            if (!SeqFound)
+                Quit("Label %s in tree does not match sequences", LeafName);
+            }
+
+        // Set ids
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            {
+            const char *SeqName = v.GetSeqName(uSeqIndex);
+            unsigned uLeafIndex = GuideTree.GetLeafNodeIndex(SeqName);
+            GuideTree.SetLeafId(uLeafIndex, uSeqIndex);
+            }
+        }
+    else
+        TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);
+
+    const char *Tree1 = ValueOpt("Tree1");
+    if (0 != Tree1)
+        {
+        TextFile f(Tree1, true);
+        GuideTree.ToFile(f);
+        if (g_bCluster)
+            return;
+        }
+
+    SetMuscleTree(GuideTree);
+    ValidateMuscleIds(GuideTree);
+
+    MSA msa;
+    ProgNode *ProgNodes = 0;
+    if (g_bLow)
+        ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
+    else
+        ProgressiveAlign(v, GuideTree, msa);
+    SetCurrentAlignment(msa);
+
+    if (0 != g_pstrComputeWeightsFileName)
+        {
+        extern void OutWeights(const char *FileName, const MSA &msa);
+        SetMSAWeightsMuscle(msa);
+        OutWeights(g_pstrComputeWeightsFileName, msa);
+        return;
+        }
+
+    ValidateMuscleIds(msa);
+
+    if (1 == g_uMaxIters || 2 == uSeqCount)
+        {
+        //TextFile fileOut(g_pstrOutFileName, true);
+        //MHackEnd(msa);
+        //msa.ToFile(fileOut);
+        MuscleOutput(msa);
+        return;
+        }
+
+    // Second iteration (tree refinement) is skipped for a user tree.
+    if (0 == g_pstrUseTreeFileName)
+        {
+        g_bDiags = g_bDiags2;
+        SetIter(2);
+
+        if (g_bLow)
+            {
+            if (0 != g_uMaxTreeRefineIters)
+                RefineTreeE(msa, v, GuideTree, ProgNodes);
+            }
+        else
+            RefineTree(msa, GuideTree);
+
+        const char *Tree2 = ValueOpt("Tree2");
+        if (0 != Tree2)
+            {
+            TextFile f(Tree2, true);
+            GuideTree.ToFile(f);
+            }
+        }
+
+    SetSeqWeightMethod(g_SeqWeight2);
+    SetMuscleTree(GuideTree);
+
+    // Two iterations are accounted for above; the rest go to refinement.
+    if (g_bAnchors)
+        RefineVert(msa, GuideTree, g_uMaxIters - 2);
+    else
+        RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);
+
+#if 0
+// Refining by subfamilies is disabled as it didn't give better
+// results. I tried doing this before and after RefineHoriz.
+// Should get back to this as it seems like this should work.
+    RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
+#endif
+
+    ValidateMuscleIds(msa);
+    ValidateMuscleIds(GuideTree);
+
+    //TextFile fileOut(g_pstrOutFileName, true);
+    //MHackEnd(msa);
+    //msa.ToFile(fileOut);
+    MuscleOutput(msa);
+    }
+
+// Program entry after option parsing: log the command line, dispatch to
+// the operation selected by the global option flags (DoMuscle when none
+// is set), then log timing (TIMING builds only), diagonal savings and the
+// finish time.
+void Run()
+    {
+    SetStartTime();
+    Log("Started %s\n", GetTimeAsStr());
+    int iArg = 0;
+    while (iArg < g_argc)
+        {
+        Log("%s ", g_argv[iArg]);
+        ++iArg;
+        }
+    Log("\n");
+
+#if TIMING
+    TICKS t1 = GetClockTicks();
+#endif
+    // The first flag that is set wins; the order is significant.
+    if (g_bRefine)
+        {
+        Refine();
+        }
+    else if (g_bRefineW)
+        {
+        extern void DoRefineW();
+        DoRefineW();
+        }
+    else if (g_bProfDB)
+        {
+        ProfDB();
+        }
+    else if (g_bSW)
+        {
+        Local();
+        }
+    else if (0 != g_pstrSPFileName)
+        {
+        DoSP();
+        }
+    else if (g_bProfile)
+        {
+        Profile();
+        }
+    else if (g_bPPScore)
+        {
+        PPScore();
+        }
+    else if (g_bPAS)
+        {
+        ProgAlignSubFams();
+        }
+    else
+        {
+        DoMuscle();
+        }
+
+#if TIMING
+    extern TICKS g_ticksDP;
+    extern TICKS g_ticksObjScore;
+    TICKS t2 = GetClockTicks();
+    TICKS TotalTicks = t2 - t1;
+    // Whatever was not spent in DP or objective scoring.
+    TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore;
+    double dSecs = TicksToSecs(TotalTicks);
+    const double dTotal = (double) TotalTicks;
+    double PctDP = (double) g_ticksDP*100.0/dTotal;
+    double PctOS = (double) g_ticksObjScore*100.0/dTotal;
+    double PctOther = (double) ticksOther*100.0/dTotal;
+    Log(" Ticks Secs Pct\n");
+    Log(" ============ ======= =====\n");
+    Log("DP %12ld %7.2f %5.1f%%\n",
+      (long) g_ticksDP, TicksToSecs(g_ticksDP), PctDP);
+    Log("OS %12ld %7.2f %5.1f%%\n",
+      (long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS);
+    Log("Other %12ld %7.2f %5.1f%%\n",
+      (long) ticksOther, TicksToSecs(ticksOther), PctOther);
+    Log("Total %12ld %7.2f 100.0%%\n", (long) TotalTicks, dSecs);
+#endif
+
+    ListDiagSavings();
+    Log("Finished %s\n", GetTimeAsStr());
+    }
Added: trunk/packages/muscle/branches/upstream/current/dosp.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/dosp.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/dosp.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,60 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "msa.h"
+#include "objscore.h"
+#include "tree.h"
+#include "profile.h"
+
+// Compute the SP objective score of the alignment in g_pstrSPFileName
+// and report it both to the log and to stderr.
+void DoSP()
+    {
+    TextFile f(g_pstrSPFileName);
+
+    MSA a;
+    a.FromFile(f);
+
+    // Pick the alphabet from the requested sequence type, guessing from
+    // the alignment when the type is automatic.
+    ALPHA Alpha = ALPHA_Undefined;
+    if (SEQTYPE_Auto == g_SeqType)
+        Alpha = a.GuessAlpha();
+    else if (SEQTYPE_Protein == g_SeqType)
+        Alpha = ALPHA_Amino;
+    else if (SEQTYPE_DNA == g_SeqType)
+        Alpha = ALPHA_DNA;
+    else if (SEQTYPE_RNA == g_SeqType)
+        Alpha = ALPHA_RNA;
+    else
+        Quit("Invalid SeqType");
+    SetAlpha(Alpha);
+    a.FixAlpha();
+
+    SetPPScore();
+
+    const unsigned uSeqCount = a.GetSeqCount();
+    if (0 == uSeqCount)
+        Quit("No sequences in input file %s", g_pstrSPFileName);
+
+    // Ids are simply the row indexes.
+    MSA::SetIdCount(uSeqCount);
+    unsigned uRow = 0;
+    while (uRow < uSeqCount)
+        {
+        a.SetSeqId(uRow, uRow);
+        ++uRow;
+        }
+
+    // Sequence weights are derived from a tree built from the alignment.
+    SetSeqWeightMethod(g_SeqWeight1);
+    Tree tree;
+    TreeFromMSA(a, tree, g_Cluster2, g_Distance2, g_Root2);
+    SetMuscleTree(tree);
+    SetMSAWeightsMuscle((MSA &) a);
+
+    const SCORE SP = ObjScoreSP(a);
+
+    Log("File=%s;SP=%.4g\n", g_pstrSPFileName, SP);
+    fprintf(stderr, "File=%s;SP=%.4g\n", g_pstrSPFileName, SP);
+    }
Added: trunk/packages/muscle/branches/upstream/current/dpregionlist.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/dpregionlist.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/dpregionlist.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,73 @@
+#ifndef DPRegionList_h
+#define DPRegionList_h
+
+#include "diaglist.h"
+
+// Discriminator for DPRegion: selects which union member is meaningful.
+// NOTE(review): dpreglist.h declares the same enum under a different
+// include guard; including both headers would redefine it.
+enum DPREGIONTYPE
+ {
+ DPREGIONTYPE_Unknown,
+ DPREGIONTYPE_Diag,
+ DPREGIONTYPE_Rect
+ };
+
+// One region of a DP matrix: either a diagonal or a rectangle.
+struct DPRegion
+ {
+ // Tells which member of the union below is in use.
+ DPREGIONTYPE m_Type;
+ union
+ {
+ Diag m_Diag;
+ Rect m_Rect;
+ };
+ };
+
+const unsigned MAX_DPREGIONS = 1024;
+
+// Fixed-capacity list of up to MAX_DPREGIONS DPRegion entries, stored
+// inline in the object.
+// NOTE(review): dpreglist.h declares this class too (with an extra
+// GetDPArea member, which dpreglist.cpp defines); this header looks like
+// a stale duplicate -- confirm before including it anywhere.
+class DPRegionList
+ {
+public:
+ DPRegionList()
+ {
+ m_uCount = 0;
+ }
+ ~DPRegionList()
+ {
+ Free();
+ }
+
+public:
+// Creation
+ // Empties the list; no storage is released.
+ void Clear()
+ {
+ Free();
+ }
+ void Add(const DPRegion &r);
+
+// Accessors
+ unsigned GetCount() const
+ {
+ return m_uCount;
+ }
+ // uIndex must be below GetCount(); checked only by assert.
+ const DPRegion &Get(unsigned uIndex) const
+ {
+ assert(uIndex < m_uCount);
+ return m_DPRegions[uIndex];
+ }
+
+// Diagnostics
+ void LogMe() const;
+
+private:
+ // Resets the count only; the array is a member, so nothing is freed.
+ void Free()
+ {
+ m_uCount = 0;
+ }
+
+private:
+ unsigned m_uCount;
+ DPRegion m_DPRegions[MAX_DPREGIONS];
+ };
+
+void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
+ unsigned uLengthA, unsigned uLengthB);
+
+#endif // DPRegionList_h
Added: trunk/packages/muscle/branches/upstream/current/dpreglist.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/dpreglist.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/dpreglist.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,108 @@
+#include "muscle.h"
+#include "dpreglist.h"
+
+// Sum of LengthA*LengthB over the rectangular regions in the list;
+// diagonal regions contribute nothing.
+unsigned DPRegionList::GetDPArea() const
+    {
+    unsigned uTotal = 0;
+    unsigned uIndex = 0;
+    while (uIndex < m_uCount)
+        {
+        const DPRegion &Region = m_DPRegions[uIndex];
+        ++uIndex;
+        if (DPREGIONTYPE_Rect != Region.m_Type)
+            continue;
+        uTotal += Region.m_Rect.m_uLengthA*Region.m_Rect.m_uLengthB;
+        }
+    return uTotal;
+    }
+
+// Append a copy of r to the list. Quits if the list already holds
+// MAX_DPREGIONS entries.
+void DPRegionList::Add(const DPRegion &r)
+    {
+    // >= rather than == so a corrupted count can never index past the
+    // array; m_uCount is unsigned, hence %u.
+    if (m_uCount >= MAX_DPREGIONS)
+        Quit("DPRegionList::Add, overflow %u", m_uCount);
+    m_DPRegions[m_uCount] = r;
+    ++m_uCount;
+    }
+
+// Write one line per region to the log: its index, its type and its
+// inclusive start/end positions in A and B. A region whose type is
+// neither Diag nor Rect is reported as an error line.
+void DPRegionList::LogMe() const
+    {
+    Log("DPRegionList::LogMe, count=%u\n", m_uCount);
+    Log("Region Type StartA StartB EndA EndB\n");
+    Log("------ ---- ------ ------ ---- ----\n");
+    for (unsigned uIndex = 0; uIndex < m_uCount; ++uIndex)
+        {
+        const DPRegion &Region = m_DPRegions[uIndex];
+        Log("%6u ", uIndex);
+        switch (Region.m_Type)
+            {
+        case DPREGIONTYPE_Diag:
+            Log("Diag %6u %6u %6u %6u\n",
+              Region.m_Diag.m_uStartPosA,
+              Region.m_Diag.m_uStartPosB,
+              Region.m_Diag.m_uStartPosA + Region.m_Diag.m_uLength - 1,
+              Region.m_Diag.m_uStartPosB + Region.m_Diag.m_uLength - 1);
+            break;
+
+        case DPREGIONTYPE_Rect:
+            Log("Rect %6u %6u %6u %6u\n",
+              Region.m_Rect.m_uStartPosA,
+              Region.m_Rect.m_uStartPosB,
+              Region.m_Rect.m_uStartPosA + Region.m_Rect.m_uLengthA - 1,
+              Region.m_Rect.m_uStartPosB + Region.m_Rect.m_uLengthB - 1);
+            break;
+
+        default:
+            Log(" *** ERROR *** Type=%u\n", Region.m_Type);
+            break;
+            }
+        }
+    }
+
+// Convert a list of diagonals into the list of DP regions to compute.
+//
+// For each diagonal in DL, in order, RL receives a rectangle running from
+// the end of the previous diagonal (or the origin) to the start of this
+// one, followed by the diagonal itself trimmed by g_uDiagMargin at both
+// ends when anything is left of it. A final rectangle covers the
+// remainder up to (uLengthA, uLengthB). Quits if g_uDiagMargin exceeds
+// half of g_uMinDiagLength.
+void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
+  unsigned uLengthA, unsigned uLengthB)
+    {
+    // The message used to state the constraint as "<= 2*diaglength",
+    // which is not what is tested here.
+    if (g_uDiagMargin > g_uMinDiagLength/2)
+        Quit("Invalid parameters, diagmargin=%u must be <= diaglength/2 (diaglength=%u)",
+          g_uDiagMargin, g_uMinDiagLength);
+
+    unsigned uStartPosA = 0;
+    unsigned uStartPosB = 0;
+    const unsigned uDiagCount = DL.GetCount();
+    DPRegion r;
+    for (unsigned uDiagIndex = 0; uDiagIndex < uDiagCount; ++uDiagIndex)
+        {
+        const Diag &d = DL.Get(uDiagIndex);
+        assert(d.m_uLength >= g_uMinDiagLength);
+        const unsigned uStartVertexA = d.m_uStartPosA + g_uDiagMargin - 1;
+        const unsigned uStartVertexB = d.m_uStartPosB + g_uDiagMargin - 1;
+        const unsigned uEndVertexA = d.m_uStartPosA + d.m_uLength - g_uDiagMargin;
+        const unsigned uEndVertexB = d.m_uStartPosB + d.m_uLength - g_uDiagMargin;
+
+        // Rectangle between the previous end point and this diagonal.
+        r.m_Type = DPREGIONTYPE_Rect;
+        r.m_Rect.m_uStartPosA = uStartPosA;
+        r.m_Rect.m_uStartPosB = uStartPosB;
+
+        assert(uStartVertexA + 1 >= uStartPosA);
+        assert(uStartVertexB + 1 >= uStartPosB);
+        r.m_Rect.m_uLengthA = uStartVertexA + 1 - uStartPosA;
+        r.m_Rect.m_uLengthB = uStartVertexB + 1 - uStartPosB;
+        RL.Add(r);
+
+        // The trimmed diagonal, if it is not empty.
+        if (uEndVertexA > uStartVertexA + 1)
+            {
+            const unsigned uDiagLengthMinusCaps = uEndVertexA - uStartVertexA - 1;
+
+            r.m_Type = DPREGIONTYPE_Diag;
+            r.m_Diag.m_uStartPosA = uStartVertexA + 1;
+            r.m_Diag.m_uStartPosB = uStartVertexB + 1;
+            assert(uEndVertexA - uStartVertexA == uEndVertexB - uStartVertexB);
+            r.m_Diag.m_uLength = uDiagLengthMinusCaps;
+            RL.Add(r);
+            }
+
+        uStartPosA = uEndVertexA;
+        uStartPosB = uEndVertexB;
+        }
+
+    assert((int) uLengthA - (int) uStartPosA >= (int) g_uDiagMargin);
+    assert((int) uLengthB - (int) uStartPosB >= (int) g_uDiagMargin);
+
+    // Closing rectangle up to the ends of both sequences.
+    r.m_Type = DPREGIONTYPE_Rect;
+    r.m_Rect.m_uStartPosA = uStartPosA;
+    r.m_Rect.m_uStartPosB = uStartPosB;
+
+    assert(uLengthA >= uStartPosA);
+    assert(uLengthB >= uStartPosB);
+    r.m_Rect.m_uLengthA = uLengthA - uStartPosA;
+    r.m_Rect.m_uLengthB = uLengthB - uStartPosB;
+    RL.Add(r);
+    }
Added: trunk/packages/muscle/branches/upstream/current/dpreglist.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/dpreglist.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/dpreglist.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,76 @@
+#ifndef dpreglist_h
+#define dpreglist_h
+
+#include "diaglist.h"
+
+// Discriminator for DPRegion: selects which union member is meaningful.
+enum DPREGIONTYPE
+ {
+ DPREGIONTYPE_Unknown,
+ DPREGIONTYPE_Diag,
+ DPREGIONTYPE_Rect
+ };
+
+// One region of a DP matrix: either a diagonal or a rectangle.
+struct DPRegion
+ {
+ // Tells which member of the union below is in use.
+ DPREGIONTYPE m_Type;
+ union
+ {
+ Diag m_Diag;
+ Rect m_Rect;
+ };
+ };
+
+const unsigned MAX_DPREGIONS = 1024;
+
+// Fixed-capacity list of up to MAX_DPREGIONS DPRegion entries, stored
+// inline in the object.
+class DPRegionList
+ {
+public:
+ DPRegionList()
+ {
+ m_uCount = 0;
+ }
+ ~DPRegionList()
+ {
+ Free();
+ }
+
+public:
+// Creation
+ // Empties the list; no storage is released.
+ void Clear()
+ {
+ Free();
+ }
+ void Add(const DPRegion &r);
+
+// Accessors
+ unsigned GetCount() const
+ {
+ return m_uCount;
+ }
+
+ // uIndex must be below GetCount(); checked only by assert.
+ const DPRegion &Get(unsigned uIndex) const
+ {
+ assert(uIndex < m_uCount);
+ return m_DPRegions[uIndex];
+ }
+
+ // Total area of the rectangular regions (defined in dpreglist.cpp).
+ unsigned GetDPArea() const;
+
+// Diagnostics
+ void LogMe() const;
+
+private:
+ // Resets the count only; the array is a member, so nothing is freed.
+ void Free()
+ {
+ m_uCount = 0;
+ }
+
+private:
+ unsigned m_uCount;
+ DPRegion m_DPRegions[MAX_DPREGIONS];
+ };
+
+void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
+ unsigned uLengthA, unsigned uLengthB);
+
+#endif // dpreglist_h
Added: trunk/packages/muscle/branches/upstream/current/drawtree.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/drawtree.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/drawtree.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,41 @@
+#include "muscle.h"
+#include "tree.h"
+
+/***
+Simple tree drawing algorithm.
+
+y coordinate of node is index in depth-first traversal.
+x coordinate is distance from root.
+***/
+
+// Number of parent links between uNodeIndex and the root of the tree
+// (zero for the root itself).
+static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
+    {
+    const unsigned uRoot = tree.GetRootNodeIndex();
+    unsigned uSteps = 0;
+    for (unsigned uNode = uNodeIndex; uNode != uRoot; uNode = tree.GetParent(uNode))
+        ++uSteps;
+    return uSteps;
+    }
+
+// Log the subtree under uNodeIndex in in-order: left subtree, then this
+// node's index indented in proportion to its distance from the root,
+// then the right subtree.
+static void DrawNode(const Tree &tree, unsigned uNodeIndex)
+    {
+    const bool bIsLeaf = tree.IsLeaf(uNodeIndex);
+    if (!bIsLeaf)
+        DrawNode(tree, tree.GetLeft(uNodeIndex));
+
+    // Five indent steps per level below the root.
+    const unsigned uIndent = 5*DistFromRoot(tree, uNodeIndex);
+    for (unsigned i = 0; i < uIndent; ++i)
+        Log(" ");
+    // uNodeIndex is unsigned, so print it with %u rather than %d.
+    Log("%u\n", uNodeIndex);
+
+    if (!bIsLeaf)
+        DrawNode(tree, tree.GetRight(uNodeIndex));
+    }
+
+// Log a simple drawing of the whole tree, starting from its root.
+void DrawTree(const Tree &tree)
+    {
+    DrawNode(tree, tree.GetRootNodeIndex());
+    }
Added: trunk/packages/muscle/branches/upstream/current/edgelist.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/edgelist.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/edgelist.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,88 @@
+#include "muscle.h"
+#include "edgelist.h"
+
+EdgeList::EdgeList()
+    : m_uCount(0), m_uCacheSize(0), m_uNode1(0), m_uNode2(0)
+    {
+// Starts empty with no arrays allocated; Add() calls Expand() to get storage.
+    }
+
+EdgeList::~EdgeList()
+    {
+// Clear() releases both node arrays.
+    this->Clear();
+    }
+
+void EdgeList::Clear()
+    {
+// Free each node array and null its pointer, then reset the bookkeeping
+// so that the list is empty and owns no storage.
+    delete[] m_uNode1;
+    m_uNode1 = 0;
+
+    delete[] m_uNode2;
+    m_uNode2 = 0;
+
+    m_uCacheSize = 0;
+    m_uCount = 0;
+    }
+
+// Append the edge (uNode1, uNode2) to the end of the list, growing the
+// node arrays first if every allocated slot is already in use.
+void EdgeList::Add(unsigned uNode1, unsigned uNode2)
+    {
+// Grow only when the arrays are full. The previous test
+// (m_uCount <= m_uCacheSize) was always true, so both arrays were
+// reallocated and copied on every single call.
+    if (m_uCount >= m_uCacheSize)
+        Expand();
+    m_uNode1[m_uCount] = uNode1;
+    m_uNode2[m_uCount] = uNode2;
+    ++m_uCount;
+    }
+
+unsigned EdgeList::GetCount() const
+    {
+// Number of edges currently stored.
+    const unsigned uEdgeCount = m_uCount;
+    return uEdgeCount;
+    }
+
+// Fetch edge number uIndex (0-based) into *ptruNode1 and *ptruNode2.
+// Calls Quit() if uIndex does not refer to a stored edge.
+void EdgeList::GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const
+    {
+// Valid indexes are 0 .. m_uCount-1. The previous test (uIndex > m_uCount)
+// let uIndex == m_uCount through and read one element past the last edge.
+    if (uIndex >= m_uCount)
+        Quit("EdgeList::GetEdge(%u) count=%u", uIndex, m_uCount);
+    *ptruNode1 = m_uNode1[uIndex];
+    *ptruNode2 = m_uNode2[uIndex];
+    }
+
+// Make this list hold the same edges, in the same order, as rhs.
+void EdgeList::Copy(const EdgeList &rhs)
+    {
+// Copying a list onto itself must be a no-op: the Clear() below would
+// otherwise empty rhs too, and every edge would be lost.
+    if (this == &rhs)
+        return;
+
+    Clear();
+    const unsigned uCount = rhs.GetCount();
+    for (unsigned n = 0; n < uCount; ++n)
+        {
+        unsigned uNode1;
+        unsigned uNode2;
+        rhs.GetEdge(n, &uNode1, &uNode2);
+        Add(uNode1, uNode2);
+        }
+    }
+
+void EdgeList::Expand()
+    {
+// Allocate arrays 512 entries larger than the current ones, move the
+// existing edges across, then release the old arrays.
+    const unsigned uGrownSize = m_uCacheSize + 512;
+    unsigned *GrownNode1 = new unsigned[uGrownSize];
+    unsigned *GrownNode2 = new unsigned[uGrownSize];
+    if (0 != m_uCount)
+        {
+        memcpy(GrownNode1, m_uNode1, m_uCount*sizeof(unsigned));
+        memcpy(GrownNode2, m_uNode2, m_uCount*sizeof(unsigned));
+        }
+
+    delete[] m_uNode1;
+    m_uNode1 = GrownNode1;
+
+    delete[] m_uNode2;
+    m_uNode2 = GrownNode2;
+
+    m_uCacheSize = uGrownSize;
+    }
+
+void EdgeList::LogMe() const
+    {
+// Log every edge as "node1->node2" on one line, with a separator between
+// consecutive edges and a newline at the end.
+    unsigned uEdge = 0;
+    while (uEdge < m_uCount)
+        {
+        if (0 != uEdge)
+            Log(" ");
+        Log("%u->%u", m_uNode1[uEdge], m_uNode2[uEdge]);
+        ++uEdge;
+        }
+    Log("\n");
+    }
Added: trunk/packages/muscle/branches/upstream/current/edgelist.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/edgelist.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/edgelist.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,28 @@
+#ifndef EdgeList_h
+#define EdgeList_h
+
+// Growable list of edges. Each edge is a pair of unsigned node values
+// (uNode1, uNode2) kept in two parallel heap arrays.
+// NOTE(review): the class owns raw arrays but declares no copy constructor
+// or assignment operator here, so copying an EdgeList by value would
+// presumably leave two objects deleting the same arrays -- use Copy().
+class EdgeList
+ {
+public:
+ EdgeList();
+ virtual ~EdgeList();
+
+public:
+// Remove all edges and free the arrays.
+ void Clear();
+// Append one edge.
+ void Add(unsigned uNode1, unsigned uNode2);
+// Number of edges stored.
+ unsigned GetCount() const;
+// Read edge uIndex (0-based) into the two output pointers.
+ void GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const;
+// Replace the contents of this list with the edges of rhs.
+ void Copy(const EdgeList &rhs);
+// Write all edges to the log.
+ void LogMe() const;
+
+private:
+// Enlarge both arrays, preserving the stored edges.
+ void Expand();
+
+private:
+// Number of edges in use.
+ unsigned m_uCount;
+// Number of entries allocated in each of m_uNode1 and m_uNode2.
+ unsigned m_uCacheSize;
+// Parallel arrays: edge n is (m_uNode1[n], m_uNode2[n]).
+ unsigned *m_uNode1;
+ unsigned *m_uNode2;
+ };
+
+#endif // EdgeList_h
Added: trunk/packages/muscle/branches/upstream/current/enumopts.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/enumopts.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/enumopts.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,8 @@
+#include "muscle.h"
+#include "enumopts.h"
+
+// Expand enums.h into one table per enum type t: an array t_Opts[] of
+// EnumOpt entries { "Value", t_Value }, terminated by a { 0, 0 } entry.
+// s(t) opens the array, each c(t, x) contributes one entry, e(t) appends
+// the terminator and closes the array.
+#define s(t) EnumOpt t##_Opts[] = {
+#define c(t, x) #x, t##_##x,
+#define e(t) 0, 0 };
+
+#include "enums.h"
Added: trunk/packages/muscle/branches/upstream/current/enumopts.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/enumopts.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/enumopts.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,16 @@
+#ifndef enumopts_h
+#define enumopts_h
+
+// One entry of an option table: the textual name of an enum value and the
+// value itself as an int. Tables built from this type in enumopts.cpp end
+// with an entry whose fields are both 0.
+struct EnumOpt
+ {
+ const char *pstrOpt;
+ int iValue;
+ };
+
+// Expand enums.h into one extern declaration "EnumOpt t_Opts[]" per enum
+// type t. Only s(t) produces output here; c and e expand to nothing.
+#define s(t) extern EnumOpt t##_Opts[];
+#define c(t, x) /* empty */
+#define e(t) /* empty */
+#include "enums.h"
+
+
+#endif // enumopts_h
Added: trunk/packages/muscle/branches/upstream/current/enums.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/enums.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/enums.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,95 @@
+// enums.h
+// Define enum types.
+// Exploit macro hacks to avoid lots of repetitive typing.
+// Generally I am opposed to macro hacks because of the
+// highly obscure code that results, but in this case it
+// makes maintenance much easier and less error-prone.
+// The idea is that this file can be included in different
+// places with different definitions of s (Start), c (Case)
+// and e (End). See types.h.
+
+// Each group below has the form s(TYPE), one c(TYPE, Value) line per
+// enumerator, then e(TYPE). What a group expands to is decided entirely by
+// the s, c and e macros that the including file defines beforehand.
+s(ALPHA)
+c(ALPHA, Amino)
+c(ALPHA, DNA)
+c(ALPHA, RNA)
+e(ALPHA)
+
+s(SEQTYPE)
+c(SEQTYPE, Protein)
+c(SEQTYPE, DNA)
+c(SEQTYPE, RNA)
+c(SEQTYPE, Auto)
+e(SEQTYPE)
+
+s(ROOT)
+c(ROOT, Pseudo)
+c(ROOT, MidLongestSpan)
+c(ROOT, MinAvgLeafDist)
+e(ROOT)
+
+s(CLUSTER)
+c(CLUSTER, UPGMA)
+c(CLUSTER, UPGMAMax)
+c(CLUSTER, UPGMAMin)
+c(CLUSTER, UPGMB)
+c(CLUSTER, NeighborJoining)
+e(CLUSTER)
+
+s(JOIN)
+c(JOIN, NearestNeighbor)
+c(JOIN, NeighborJoining)
+e(JOIN)
+
+s(LINKAGE)
+c(LINKAGE, Min)
+c(LINKAGE, Avg)
+c(LINKAGE, Max)
+c(LINKAGE, NeighborJoining)
+c(LINKAGE, Biased)
+e(LINKAGE)
+
+s(DISTANCE)
+c(DISTANCE, Kmer6_6)
+c(DISTANCE, Kmer20_3)
+c(DISTANCE, Kmer20_4)
+c(DISTANCE, Kbit20_3)
+c(DISTANCE, Kmer4_6)
+c(DISTANCE, PctIdKimura)
+c(DISTANCE, PctIdLog)
+c(DISTANCE, PWKimura)
+e(DISTANCE)
+
+s(PPSCORE)
+c(PPSCORE, LE)
+c(PPSCORE, SP)
+c(PPSCORE, SV)
+c(PPSCORE, SPN)
+e(PPSCORE)
+
+s(SEQWEIGHT)
+c(SEQWEIGHT, None)
+c(SEQWEIGHT, Henikoff)
+c(SEQWEIGHT, HenikoffPB)
+c(SEQWEIGHT, GSC)
+c(SEQWEIGHT, ClustalW)
+c(SEQWEIGHT, ThreeWay)
+e(SEQWEIGHT)
+
+s(OBJSCORE)
+c(OBJSCORE, SP) // Sum of Pairs of sequences
+c(OBJSCORE, DP) // Dynamic Programming score
+c(OBJSCORE, XP) // Cross Pairs = sum of pairs between two MSAs
+c(OBJSCORE, PS) // sum of Prof-Seq score for all seqs in MSA
+c(OBJSCORE, SPF) // sum of pairs, fast approximation
+c(OBJSCORE, SPM) // sp if <= 100 seqs, spf otherwise
+e(OBJSCORE)
+
+s(TERMGAPS)
+c(TERMGAPS, Full)
+c(TERMGAPS, Half)
+c(TERMGAPS, Ext)
+e(TERMGAPS)
+
+// Drop the caller's definitions so that the next inclusion of this file
+// can supply its own s, c and e.
+#undef s
+#undef c
+#undef e
Added: trunk/packages/muscle/branches/upstream/current/enumtostr.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/enumtostr.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/enumtostr.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,16 @@
+#include "muscle.h"
+#include <stdio.h>
+
+// Shared buffer for the text returned by an XXXToStr() function when the
+// value matches none of the known cases. Being a single static buffer,
+// its contents are overwritten by the next such call.
+static char szMsg[64];
+
+// Define XXXToStr(XXX x) functions for each enum type XXX.
+// XXX_Undefined yields "Undefined", each listed value yields its name, and
+// any other value is formatted as "XXX_<number>" into szMsg.
+#define s(t) const char *t##ToStr(t x) { switch (x) { case t##_Undefined: return "Undefined";
+#define c(t, x) case t##_##x: return #x;
+#define e(t) } sprintf(szMsg, #t "_%d", x); return szMsg; }
+#include "enums.h"
+
+// Define StrToXXX(const char *Str) functions for each enum type XXX.
+// Str is compared with each value name using stricmp; if nothing matches,
+// Quit() is called and XXX_Undefined is the fall-through return value.
+#define s(t) t StrTo##t(const char *Str) { if (0) ;
+#define c(t, x) else if (0 == stricmp(#x, Str)) return t##_##x;
+#define e(t) Quit("Invalid value %s for type %s", Str, #t); return t##_Undefined; }
+#include "enums.h"
Added: trunk/packages/muscle/branches/upstream/current/estring.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/estring.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/estring.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,689 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "estring.h"
+#include "seq.h"
+#include "msa.h"
+
+/***
+An "estring" is an edit string that operates on a sequence.
+An estring is represented as a vector of integers.
+It is interpreted in order of increasing index.
+A positive value n means copy n letters.
+A negative value -n means insert n indels.
+Zero marks the end of the vector.
+Consecutive entries must have opposite sign, i.e. the
+shortest possible representation must be used.
+
+A "tpair" is a traceback path for a pairwise alignment
+represented as two estrings, one for each sequence.
+***/
+
+// Pack two edge-type characters into one integer (first character in the
+// high byte, second in the low byte) so that a pair of types can be used
+// as a single switch/case value. The arguments and the whole expansion are
+// parenthesized so that expression arguments expand safely.
+#define c2(c,d) ((((unsigned char) (c)) << 8) | ((unsigned char) (d)))
+
+unsigned LengthEstring(const short es[])
+    {
+// Number of entries that precede the terminating zero.
+    unsigned uLength = 0;
+    while (0 != es[uLength])
+        ++uLength;
+    return uLength;
+    }
+
+short *EstringNewCopy(const short es[])
+    {
+// Duplicate the estring, terminating zero included, into a fresh new[]
+// array that the caller becomes responsible for.
+    const unsigned uEntries = LengthEstring(es) + 1;
+    short *esCopy = new short[uEntries];
+    for (unsigned i = 0; i < uEntries; ++i)
+        esCopy[i] = es[i];
+    return esCopy;
+    }
+
+void LogEstring(const short es[])
+    {
+// Log the entries between angle brackets with a separator between
+// consecutive entries; the terminating zero is not printed.
+    Log("<");
+    unsigned uEntry = 0;
+    while (0 != es[uEntry])
+        {
+        if (0 != uEntry)
+            Log(" ");
+        Log("%d", es[uEntry]);
+        ++uEntry;
+        }
+    Log(">");
+    }
+
+static bool EstringsEq(const short es1[], const short es2[])
+    {
+// Walk both estrings in step. They are equal if the terminating zero is
+// reached in both at the same position without a mismatch before it.
+    unsigned i = 0;
+    while (es1[i] == es2[i])
+        {
+        if (0 == es1[i])
+            return true;
+        ++i;
+        }
+    return false;
+    }
+
+static void EstringCounts(const short es[], unsigned *ptruSymbols,
+  unsigned *ptruIndels)
+    {
+// Sum the positive entries (letters copied) and the magnitudes of the
+// negative entries (indels inserted) separately.
+    unsigned uCopied = 0;
+    unsigned uGaps = 0;
+    for (const short *ptrEntry = es; 0 != *ptrEntry; ++ptrEntry)
+        {
+        const short n = *ptrEntry;
+        if (n > 0)
+            uCopied += n;
+        else
+            uGaps += -n;
+        }
+    *ptruSymbols = uCopied;
+    *ptruIndels = uGaps;
+    }
+
+static char *EstringOp(const short es[], const char s[])
+    {
+// Apply the estring to the nul-terminated string s and return the result
+// as a new[] nul-terminated string: positive entries copy letters from s,
+// negative entries emit '-' characters.
+    unsigned uSymbols;
+    unsigned uIndels;
+    EstringCounts(es, &uSymbols, &uIndels);
+    assert((unsigned) strlen(s) == uSymbols);
+
+    char *sResult = new char[uSymbols + uIndels + 1];
+    char *ptrOut = sResult;
+    const char *ptrIn = s;
+    for (const short *ptrEntry = es; 0 != *ptrEntry; ++ptrEntry)
+        {
+        int nRun = *ptrEntry;
+        if (nRun > 0)
+            {
+            while (nRun-- > 0)
+                *ptrOut++ = *ptrIn++;
+            }
+        else
+            {
+            while (nRun++ < 0)
+                *ptrOut++ = '-';
+            }
+        }
+    assert(0 == *ptrIn);
+    *ptrOut = 0;
+    return sResult;
+    }
+
+void EstringOp(const short es[], const Seq &sIn, Seq &sOut)
+    {
+// Apply the estring to sIn, writing the gapped result to sOut (which is
+// cleared first and given the name of sIn).
+#if DEBUG
+    unsigned uSymbols;
+    unsigned uIndels;
+    EstringCounts(es, &uSymbols, &uIndels);
+    assert(sIn.Length() == uSymbols);
+#endif
+    sOut.Clear();
+    sOut.SetName(sIn.GetName());
+
+    int iInPos = 0;
+    for (unsigned uEntry = 0; 0 != es[uEntry]; ++uEntry)
+        {
+        const int nRun = es[uEntry];
+        if (nRun < 0)
+            {
+        // Negative entry: emit -nRun indels.
+            for (int i = nRun; i < 0; ++i)
+                sOut.push_back('-');
+            }
+        else
+            {
+        // Positive entry: copy the next nRun letters of the input.
+            for (int i = 0; i < nRun; ++i)
+                {
+                const char cLetter = sIn[iInPos++];
+                sOut.push_back(cLetter);
+                }
+            }
+        }
+    }
+
+unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)
+    {
+// Apply the estring to sIn and store the gapped result as the only row of
+// the alignment a. Returns the number of columns written.
+    unsigned uSymbols;
+    unsigned uIndels;
+    EstringCounts(es, &uSymbols, &uIndels);
+    assert(sIn.Length() == uSymbols);
+
+    const unsigned uColCount = uSymbols + uIndels;
+
+    a.Clear();
+    a.SetSize(1, uColCount);
+    a.SetSeqName(0, sIn.GetName());
+    a.SetSeqId(0, sIn.GetId());
+
+    unsigned uInPos = 0;
+    unsigned uCol = 0;
+    for (const short *ptrEntry = es; 0 != *ptrEntry; ++ptrEntry)
+        {
+        const int nRun = *ptrEntry;
+        if (nRun > 0)
+            {
+            for (int i = 0; i < nRun; ++i)
+                {
+                const char cLetter = sIn[uInPos++];
+                a.SetChar(0, uCol++, cLetter);
+                }
+            }
+        else
+            {
+            for (int i = nRun; i < 0; ++i)
+                a.SetChar(0, uCol++, '-');
+            }
+        }
+    assert(uCol == uColCount);
+    return uColCount;
+    }
+
+// Convert a pairwise traceback path into two estrings, one per sequence.
+// For sequence A, edges of type M and D count as letters (positive runs)
+// and edges of type I as indels (negative runs). For sequence B, M and I
+// count as letters and D as indels.
+// *ptresA and *ptresB receive new[]-allocated, zero-terminated arrays; an
+// empty path yields two estrings containing only the terminator.
+void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)
+ {
+// First pass to determine size of estrings esA and esB
+ const unsigned uEdgeCount = Path.GetEdgeCount();
+ if (0 == uEdgeCount)
+ {
+ short *esA = new short[1];
+ short *esB = new short[1];
+ esA[0] = 0;
+ esB[0] = 0;
+ *ptresA = esA;
+ *ptresB = esB;
+ return;
+ }
+
+// Each estring starts with one run; a new run begins whenever consecutive
+// edges switch between "letter" and "indel" for that sequence.
+ unsigned iLengthA = 1;
+ unsigned iLengthB = 1;
+ const char cFirstEdgeType = Path.GetEdge(0).cType;
+ char cPrevEdgeType = cFirstEdgeType;
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+ {
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+ char cEdgeType = Edge.cType;
+
+ switch (c2(cPrevEdgeType, cEdgeType))
+ {
+// Same edge type: no run boundary in either estring.
+ case c2('M', 'M'):
+ case c2('D', 'D'):
+ case c2('I', 'I'):
+ break;
+
+// M<->D changes only B (letter <-> indel); A sees a letter both times.
+ case c2('D', 'M'):
+ case c2('M', 'D'):
+ ++iLengthB;
+ break;
+
+// M<->I changes only A; B sees a letter both times.
+ case c2('I', 'M'):
+ case c2('M', 'I'):
+ ++iLengthA;
+ break;
+
+// I<->D flips both estrings.
+ case c2('I', 'D'):
+ case c2('D', 'I'):
+ ++iLengthB;
+ ++iLengthA;
+ break;
+
+ default:
+ assert(false);
+ }
+ cPrevEdgeType = cEdgeType;
+ }
+
+// Pass2 for seq A
+ {
+// One extra slot for the terminating zero.
+ short *esA = new short[iLengthA+1];
+ unsigned iA = 0;
+ switch (Path.GetEdge(0).cType)
+ {
+ case 'M':
+ case 'D':
+ esA[0] = 1;
+ break;
+
+ case 'I':
+ esA[0] = -1;
+ break;
+
+ default:
+ assert(false);
+ }
+
+ char cPrevEdgeType = cFirstEdgeType;
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+ {
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+ char cEdgeType = Edge.cType;
+
+ switch (c2(cPrevEdgeType, cEdgeType))
+ {
+// Still in a letter run of A: lengthen it.
+ case c2('M', 'M'):
+ case c2('D', 'D'):
+ case c2('D', 'M'):
+ case c2('M', 'D'):
+ ++(esA[iA]);
+ break;
+
+// Indel run ends, letter run of length 1 begins.
+ case c2('I', 'D'):
+ case c2('I', 'M'):
+ ++iA;
+ esA[iA] = 1;
+ break;
+
+// Letter run ends, indel run of length 1 begins.
+ case c2('M', 'I'):
+ case c2('D', 'I'):
+ ++iA;
+ esA[iA] = -1;
+ break;
+
+// Still in an indel run of A: lengthen it (more negative).
+ case c2('I', 'I'):
+ --(esA[iA]);
+ break;
+
+ default:
+ assert(false);
+ }
+
+ cPrevEdgeType = cEdgeType;
+ }
+ assert(iA == iLengthA - 1);
+ esA[iLengthA] = 0;
+ *ptresA = esA;
+ }
+
+ {
+// Pass2 for seq B
+// Mirror image of the pass for A, with the roles of I and D exchanged.
+ short *esB = new short[iLengthB+1];
+ unsigned iB = 0;
+ switch (Path.GetEdge(0).cType)
+ {
+ case 'M':
+ case 'I':
+ esB[0] = 1;
+ break;
+
+ case 'D':
+ esB[0] = -1;
+ break;
+
+ default:
+ assert(false);
+ }
+
+ char cPrevEdgeType = cFirstEdgeType;
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+ {
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+ char cEdgeType = Edge.cType;
+
+ switch (c2(cPrevEdgeType, cEdgeType))
+ {
+ case c2('M', 'M'):
+ case c2('I', 'I'):
+ case c2('I', 'M'):
+ case c2('M', 'I'):
+ ++(esB[iB]);
+ break;
+
+ case c2('D', 'I'):
+ case c2('D', 'M'):
+ ++iB;
+ esB[iB] = 1;
+ break;
+
+ case c2('M', 'D'):
+ case c2('I', 'D'):
+ ++iB;
+ esB[iB] = -1;
+ break;
+
+ case c2('D', 'D'):
+ --(esB[iB]);
+ break;
+
+ default:
+ assert(false);
+ }
+
+ cPrevEdgeType = cEdgeType;
+ }
+ assert(iB == iLengthB - 1);
+ esB[iLengthB] = 0;
+ *ptresB = esB;
+ }
+
+#if DEBUG
+ {
+// Consistency checks: letter counts must match the prefix lengths of the
+// last edge, each estring must span every edge, and converting the
+// estrings back must reproduce the original path.
+ const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);
+ unsigned uSymbols;
+ unsigned uIndels;
+ EstringCounts(*ptresA, &uSymbols, &uIndels);
+ assert(uSymbols == LastEdge.uPrefixLengthA);
+ assert(uSymbols + uIndels == uEdgeCount);
+
+ EstringCounts(*ptresB, &uSymbols, &uIndels);
+ assert(uSymbols == LastEdge.uPrefixLengthB);
+ assert(uSymbols + uIndels == uEdgeCount);
+
+ PWPath TmpPath;
+ EstringsToPath(*ptresA, *ptresB, TmpPath);
+ TmpPath.AssertEqual(Path);
+ }
+#endif
+ }
+
+// Rebuild a pairwise path from the two estrings of a tpair.
+// Path is cleared, then one edge is appended per alignment column:
+//   letter in A, letter in B -> 'M'
+//   letter in A, indel in B  -> 'D'
+//   indel in A,  letter in B -> 'I'
+// Each edge records the number of letters of A and B consumed so far.
+// NOTE(review): invalid combinations are only caught by assert; with
+// asserts compiled out cType would be used uninitialized.
+void EstringsToPath(const short esA[], const short esB[], PWPath &Path)
+ {
+ Path.Clear();
+ unsigned iA = 0;
+ unsigned iB = 0;
+// nA and nB hold what is left of the current run of each estring
+// (positive = letters remaining, negative = indels remaining).
+ int nA = esA[iA++];
+ int nB = esB[iB++];
+ unsigned uPrefixLengthA = 0;
+ unsigned uPrefixLengthB = 0;
+ for (;;)
+ {
+ char cType;
+ if (nA > 0)
+ {
+ if (nB > 0)
+ {
+ cType = 'M';
+ --nA;
+ --nB;
+ }
+ else if (nB < 0)
+ {
+ cType = 'D';
+ --nA;
+ ++nB;
+ }
+ else
+ assert(false);
+ }
+ else if (nA < 0)
+ {
+ if (nB > 0)
+ {
+ cType = 'I';
+ ++nA;
+ --nB;
+ }
+ else
+ assert(false);
+ }
+ else
+ assert(false);
+
+ switch (cType)
+ {
+ case 'M':
+ ++uPrefixLengthA;
+ ++uPrefixLengthB;
+ break;
+ case 'D':
+ ++uPrefixLengthA;
+ break;
+ case 'I':
+ ++uPrefixLengthB;
+ break;
+ }
+
+ PWEdge Edge;
+ Edge.cType = cType;
+ Edge.uPrefixLengthA = uPrefixLengthA;
+ Edge.uPrefixLengthB = uPrefixLengthB;
+ Path.AppendEdge(Edge);
+
+// Current run of A used up: stop if A is finished (B must finish at the
+// same point), otherwise load the next run of A.
+ if (nA == 0)
+ {
+ if (0 == esA[iA])
+ {
+ assert(0 == esB[iB]);
+ break;
+ }
+ nA = esA[iA++];
+ }
+// Current run of B used up: load the next run of B.
+ if (nB == 0)
+ nB = esB[iB++];
+ }
+ }
+
+/***
+Multiply two estrings to make a third estring.
+The product of two estrings e1*e2 is defined to be
+the estring that produces the same result as applying
+e1 then e2. Multiplication is not commutative. In fact,
+the reversed order is undefined unless both estrings
+consist of a single, identical, positive entry.
+A primary motivation for using estrings is that
+multiplication is very fast, reducing the time
+needed to construct the root alignment.
+
+Example
+
+ <-1,3>(XXX) = -XXX
+ <2,-1,2>(-XXX) = -X-XX
+
+Therefore,
+
+ <-1,3>*<2,-1,2> = <-1,1,-1,2>
+***/
+
+static bool CanMultiplyEstrings(const short es1[], const short es2[])
+    {
+// The product es1*es2 is defined when the output length of es1 (letters
+// plus indels) equals the number of letters that es2 copies.
+    unsigned uSymbols1;
+    unsigned uIndels1;
+    EstringCounts(es1, &uSymbols1, &uIndels1);
+
+    unsigned uSymbols2;
+    unsigned uIndels2;
+    EstringCounts(es2, &uSymbols2, &uIndels2);
+
+    const unsigned uOutputLength1 = uSymbols1 + uIndels1;
+    return uOutputLength1 == uSymbols2;
+    }
+
+static inline void AppendGaps(short esp[], int &ip, int n)
+    {
+// ip is the index of the last entry written (-1 if none). If that entry
+// is already negative, n is merged into it; otherwise n starts a new entry.
+    const bool bExtendLast = (-1 != ip) && (esp[ip] < 0);
+    if (bExtendLast)
+        esp[ip] += n;
+    else
+        esp[++ip] = n;
+    }
+
+static inline void AppendSymbols(short esp[], int &ip, int n)
+    {
+// ip is the index of the last entry written (-1 if none). If that entry
+// is already positive, n is merged into it; otherwise n starts a new entry.
+    const bool bExtendLast = (-1 != ip) && (esp[ip] > 0);
+    if (bExtendLast)
+        esp[ip] += n;
+    else
+        esp[++ip] = n;
+    }
+
+// Compute the product es1*es2 (apply es1, then es2) into esp.
+// esp is a caller-supplied array that receives the zero-terminated result;
+// the DEBUG checks below expect the result to use fewer than
+// LengthEstring(es1) + LengthEstring(es2) + 1 entries before the terminator.
+// The inputs must satisfy CanMultiplyEstrings (asserted).
+void MulEstrings(const short es1[], const short es2[], short esp[])
+ {
+ assert(CanMultiplyEstrings(es1, es2));
+
+// i1 indexes the next unread entry of es1, n1 is what remains of the
+// current es1 run, ip is the index of the last entry written to esp.
+ unsigned i1 = 0;
+ int ip = -1;
+ int n1 = es1[i1++];
+ for (unsigned i2 = 0; ; ++i2)
+ {
+ int n2 = es2[i2];
+ if (0 == n2)
+ break;
+ if (n2 > 0)
+ {
+// es2 copies n2 columns of es1's output: consume es1 runs until n2
+// columns have been passed through, keeping their type (gap or letter).
+ for (;;)
+ {
+ if (n1 < 0)
+ {
+ if (n2 > -n1)
+ {
+// Whole gap run of es1 fits; more of n2 remains.
+ AppendGaps(esp, ip, n1);
+ n2 += n1;
+ n1 = es1[i1++];
+ }
+ else if (n2 == -n1)
+ {
+ AppendGaps(esp, ip, n1);
+ n1 = es1[i1++];
+ break;
+ }
+ else
+ {
+// Only part of the gap run is consumed; keep the rest in n1.
+ assert(n2 < -n1);
+ AppendGaps(esp, ip, -n2);
+ n1 += n2;
+ break;
+ }
+ }
+ else
+ {
+ assert(n1 > 0);
+ if (n2 > n1)
+ {
+// Whole letter run of es1 fits; more of n2 remains.
+ AppendSymbols(esp, ip, n1);
+ n2 -= n1;
+ n1 = es1[i1++];
+ }
+ else if (n2 == n1)
+ {
+ AppendSymbols(esp, ip, n1);
+ n1 = es1[i1++];
+ break;
+ }
+ else
+ {
+// Only part of the letter run is consumed; keep the rest in n1.
+ assert(n2 < n1);
+ AppendSymbols(esp, ip, n2);
+ n1 -= n2;
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+// es2 inserts gaps of its own; they go straight to the output.
+ assert(n2 < 0);
+ AppendGaps(esp, ip, n2);
+ }
+ }
+ esp[++ip] = 0;
+
+#if DEBUG
+ {
+ int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);
+ assert(ip < MaxLen);
+// Adjacent entries must alternate in sign.
+// NOTE(review): ip is the index of the terminator here, so the bound
+// i < ip - 2 appears to leave the last adjacent pair unchecked -- confirm.
+ if (ip >= 2)
+ for (int i = 0; i < ip - 2; ++i)
+ {
+ if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))
+ {
+ Log("Bad result of MulEstring: ");
+ LogEstring(esp);
+ Quit("Assert failed (alternating signs)");
+ }
+ }
+ unsigned uSymbols1;
+ unsigned uSymbols2;
+ unsigned uSymbolsp;
+ unsigned uIndels1;
+ unsigned uIndels2;
+ unsigned uIndelsp;
+ EstringCounts(es1, &uSymbols1, &uIndels1);
+ EstringCounts(es2, &uSymbols2, &uIndels2);
+ EstringCounts(esp, &uSymbolsp, &uIndelsp);
+ if (uSymbols1 + uIndels1 != uSymbols2)
+ {
+ Log("Bad result of MulEstring: ");
+ LogEstring(esp);
+ Quit("Assert failed (counts1 %u %u %u)",
+ uSymbols1, uIndels1, uSymbols2);
+ }
+ }
+#endif
+ }
+
+// Diagnostic helper for TestEstrings(). Logs the strings obtained by
+// applying es1 and es2 to runs of 'X', logs es1, es2 and the expected
+// product esa, then computes the product with MulEstrings, logs it
+// (flagging *ERROR* if it differs from esa) and logs the result of applying
+// the product to a run of 'X'.
+// The symbol counts of es1 and es2 must be below 4096, the size of the
+// scratch buffers used here.
+static void test(const short es1[], const short es2[], const short esa[])
+    {
+    unsigned uSymbols1;
+    unsigned uSymbols2;
+    unsigned uIndels1;
+    unsigned uIndels2;
+    EstringCounts(es1, &uSymbols1, &uIndels1);
+    EstringCounts(es2, &uSymbols2, &uIndels2);
+
+    char s[4096];
+    memset(s, 'X', sizeof(s));
+    s[uSymbols1] = 0;
+
+    char *s1 = EstringOp(es1, s);
+    char *s12 = EstringOp(es2, s1);
+
+    memset(s, 'X', sizeof(s));
+    s[uSymbols2] = 0;
+    char *s2 = EstringOp(es2, s);
+
+    Log("%s * %s = %s\n", s1, s2, s12);
+
+    LogEstring(es1);
+    Log(" * ");
+    LogEstring(es2);
+    Log(" = ");
+    LogEstring(esa);
+    Log("\n");
+
+    short esp[4096];
+    MulEstrings(es1, es2, esp);
+    LogEstring(esp);
+    if (!EstringsEq(esp, esa))
+        Log(" *ERROR* ");
+    Log("\n");
+
+    memset(s, 'X', sizeof(s));
+    s[uSymbols1] = 0;
+    char *sp = EstringOp(esp, s);
+    Log("%s\n", sp);
+    Log("\n==========\n\n");
+
+// EstringOp returns new[] storage; release it (previously leaked).
+    delete[] s1;
+    delete[] s12;
+    delete[] s2;
+    delete[] sp;
+    }
+
+// Manual test driver for estring multiplication. Directs the log to a
+// hard-coded file, runs test() on one pair of estrings and then terminates
+// the process with exit(0). Earlier cases are kept commented out.
+// Because esa is the empty estring in the active case, test() reports
+// *ERROR* for any non-empty product; the output is meant to be inspected
+// by hand.
+void TestEstrings()
+ {
+ SetListFileName("c:\\tmp\\muscle.log", false);
+ //{
+ //short es1[] = { -1, 1, -1, 0 };
+ //short es2[] = { 1, -1, 2, 0 };
+ //short esa[] = { -2, 1, -1, 0 };
+ //test(es1, es2, esa);
+ //}
+ //{
+ //short es1[] = { 2, -1, 2, 0 };
+ //short es2[] = { 1, -1, 3, -1, 1, 0 };
+ //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };
+ //test(es1, es2, esa);
+ //}
+ //{
+ //short es1[] = { -1, 3, 0 };
+ //short es2[] = { 2, -1, 2, 0 };
+ //short esa[] = { -1, 1, -1, 2, 0 };
+ //test(es1, es2, esa);
+ //}
+ //{
+ //short es1[] = { -1, 1, -1, 1, 0};
+ //short es2[] = { 4, 0 };
+ //short esa[] = { -1, 1, -1, 1, 0};
+ //test(es1, es2, esa);
+ //}
+ //{
+ //short es1[] = { 1, -1, 1, -1, 0};
+ //short es2[] = { 4, 0 };
+ //short esa[] = { 1, -1, 1, -1, 0};
+ //test(es1, es2, esa);
+ //}
+ //{
+ //short es1[] = { 1, -1, 1, -1, 0};
+ //short es2[] = { -1, 4, -1, 0 };
+ //short esa[] = { -1, 1, -1, 1, -2, 0};
+ //test(es1, es2, esa);
+ //}
+ {
+ short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};
+ short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };
+ short esa[] = { 0 };
+ test(es1, es2, esa);
+ }
+ exit(0);
+ }
Added: trunk/packages/muscle/branches/upstream/current/estring.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/estring.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/estring.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,13 @@
+// Public interface of estring.cpp (edit strings).
+// NOTE(review): the include guard is named pathsum_h although this file is
+// estring.h; if another header uses the same guard, one of the two would be
+// skipped -- confirm.
+#ifndef pathsum_h
+#define pathsum_h
+
+// Convert a path to a pair of new[]-allocated estrings, and back.
+void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB);
+void EstringsToPath(const short esA[], const short esB[], PWPath &Path);
+// esp = es1*es2 (apply es1, then es2); esp is supplied by the caller.
+void MulEstrings(const short es1[], const short es2[], short esp[]);
+// Apply an estring to a sequence, producing a gapped sequence or a
+// one-row alignment (the latter returns the column count).
+void EstringOp(const short es[], const Seq &sIn, Seq &sOut);
+unsigned EstringOp(const short es[], const Seq &sIn, MSA &a);
+void LogEstring(const short es[]);
+// Number of entries before the terminating zero.
+unsigned LengthEstring(const short es[]);
+// new[]-allocated copy, terminator included.
+short *EstringNewCopy(const short es[]);
+
+#endif // pathsum_h
Added: trunk/packages/muscle/branches/upstream/current/fasta.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fasta.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fasta.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,56 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <ctype.h>
+#include "msa.h"
+#include "textfile.h"
+
+const unsigned FASTA_BLOCK = 60;
+
+// Replace the contents of this alignment with the records of a FASTA
+// file. Records are read with GetFastaSeq until it returns 0, and each one
+// is added with AppendSeq. Gap characters are kept (DeleteGaps is false).
+void MSA::FromFASTAFile(TextFile &File)
+    {
+    Clear();
+
+    FILE *f = File.GetStdioFile();
+
+    for (;;)
+        {
+        char *Label;
+        unsigned uSeqLength;
+        char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false);
+        if (0 == SeqData)
+            break;
+    // NOTE(review): SeqData and Label are new[]-allocated by GetFastaSeq;
+    // presumably AppendSeq takes ownership of both -- confirm.
+        AppendSeq(SeqData, uSeqLength, Label);
+        }
+    }
+
+// Write the alignment to File in FASTA format: for each sequence a
+// ">name" line followed by its columns in lines of at most FASTA_BLOCK
+// characters.
+void MSA::ToFASTAFile(TextFile &File) const
+    {
+    const unsigned uColCount = GetColCount();
+    assert(uColCount > 0);
+    const unsigned uSeqCount = GetSeqCount();
+
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        File.PutString(">");
+        File.PutString(GetSeqName(uSeqIndex));
+        File.PutString("\n");
+
+    // Walk the columns one block at a time. Bounding the loop by the
+    // column count, rather than by a line count derived from
+    // uColCount - 1, keeps an empty alignment from wrapping around to a
+    // huge unsigned value when asserts are compiled out.
+        unsigned uColIndex = 0;
+        while (uColIndex < uColCount)
+            {
+            unsigned uLetters = uColCount - uColIndex;
+            if (uLetters > FASTA_BLOCK)
+                uLetters = FASTA_BLOCK;
+            for (unsigned i = 0; i < uLetters; ++i)
+                {
+                char c = GetChar(uSeqIndex, uColIndex);
+                File.PutChar(c);
+                ++uColIndex;
+                }
+            File.PutChar('\n');
+            }
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/fasta2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fasta2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fasta2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,117 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <errno.h>
+
+const int BUFFER_BYTES = 16*1024;
+const int CR = '\r';
+const int NL = '\n';
+
+// Append character c to the growable array Buffer. Expects three variables
+// in the enclosing scope: Buffer (char *), Pos (characters stored so far)
+// and BufferLength (bytes allocated). When the array is full it is
+// reallocated BUFFER_BYTES larger. The macro expands to a braced block and
+// is written without a trailing semicolon.
+// The copy is skipped while nothing has been allocated yet, so memcpy is
+// never handed a null source pointer.
+#define ADD(c) \
+ { \
+ if (Pos >= BufferLength) \
+ { \
+ const int NewBufferLength = BufferLength + BUFFER_BYTES; \
+ char *NewBuffer = new char[NewBufferLength]; \
+ if (BufferLength > 0) \
+ memcpy(NewBuffer, Buffer, BufferLength); \
+ delete[] Buffer; \
+ Buffer = NewBuffer; \
+ BufferLength = NewBufferLength; \
+ } \
+ Buffer[Pos++] = (c); \
+ }
+
+// Get next sequence from file.
+//
+// Reads one FASTA record from f: a label line that starts with '>',
+// followed by sequence data up to the next '>' at the start of a line or
+// end-of-file.
+//   f             Stream positioned at the start of a record or at EOF.
+//   ptrSeqLength  Receives the number of characters in the returned data.
+//   ptrLabel      Receives a new[]-allocated, nul-terminated label (the
+//                 text after '>', without CR or NL).
+//   DeleteGaps    If true, gap characters are dropped instead of stored.
+// Returns a new[]-allocated array of sequence characters (letters are
+// upper-cased; the array is NOT nul-terminated), or 0 at end-of-file.
+// Records without sequence data are skipped. Whitespace is ignored, other
+// invalid characters are skipped with a warning, and malformed input or a
+// read error is reported through Quit().
+char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps)
+    {
+    unsigned BufferLength = 0;
+    unsigned Pos = 0;
+    char *Buffer = 0;
+
+    const int cFirst = fgetc(f);
+    if (EOF == cFirst)
+        return 0;
+    if ('>' != cFirst)
+        Quit("Invalid file format, expected '>' to start FASTA label");
+
+    for (;;)
+        {
+        int c = fgetc(f);
+        if (EOF == c)
+            Quit("End-of-file or input error in FASTA label");
+
+    // Ignore CR (discard, do not include in label)
+        if (CR == c)
+            continue;
+
+    // NL terminates label
+        if (NL == c)
+            break;
+
+    // All other characters added to label
+        ADD(c)
+        }
+
+// Nul-terminate label
+    ADD(0)
+    *ptrLabel = Buffer;
+
+// Start a fresh buffer for the sequence data.
+    BufferLength = 0;
+    Pos = 0;
+    Buffer = 0;
+    int PreviousChar = NL;
+    for (;;)
+        {
+        int c = fgetc(f);
+        if (EOF == c)
+            {
+            if (feof(f))
+                break;
+            else if (ferror(f))
+                Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s",
+                  errno, strerror(errno));
+            else
+                Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s",
+                  errno, strerror(errno));
+            }
+
+        if ('>' == c)
+            {
+        // A '>' directly after a newline starts the next record: push it
+        // back for the next call and finish this record.
+            if (NL == PreviousChar)
+                {
+                ungetc(c, f);
+                break;
+                }
+            else
+                Quit("Unexpected '>' in FASTA sequence data");
+            }
+        else if (isspace(c))
+            ;
+        else if (IsGapChar(c))
+            {
+            if (!DeleteGaps)
+                ADD(c)
+            }
+        else if (isalpha(c))
+            {
+            c = toupper(c);
+            ADD(c)
+            }
+        else if (isprint(c))
+            {
+            Warning("Invalid character '%c' in FASTA sequence data, ignored", c);
+            continue;
+            }
+        else
+            {
+            Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c);
+            continue;
+            }
+        PreviousChar = c;
+        }
+
+    if (0 == Pos)
+        {
+    // Empty record: no sequence buffer was allocated (Buffer is still 0),
+    // but the label was. Release it before moving on to the next record
+    // so that skipped records do not leak their labels.
+        delete[] *ptrLabel;
+        *ptrLabel = 0;
+        return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps);
+        }
+
+    *ptrSeqLength = Pos;
+    return Buffer;
+    }
Added: trunk/packages/muscle/branches/upstream/current/fastclust.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastclust.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastclust.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,57 @@
+#include "muscle.h"
+#include "seqvect.h"
+#include "distfunc.h"
+#include "clust.h"
+#include "clustsetdf.h"
+#include "tree.h"
+#include "clust.h"
+#include "distcalc.h"
+#include <math.h>
+
+static void TreeFromSeqVect_NJ(const DistFunc &DF, CLUSTER Cluster, Tree &tree)
+    {
+// Wrap the distance function in a cluster set, cluster it with the
+// requested method and convert the clustering into a tree.
+    ClustSetDF DistSet(DF);
+    Clust Clustering;
+    Clustering.Create(DistSet, Cluster);
+    tree.FromClust(Clustering);
+    }
+
+static void TreeFromSeqVect_UPGMA(const DistFunc &DF, CLUSTER Cluster, Tree &tree)
+    {
+// Translate the clustering method into the linkage used by UPGMA2; any
+// method without a linkage equivalent is rejected through Quit().
+    LINKAGE Linkage = LINKAGE_Undefined;
+    if (CLUSTER_UPGMA == Cluster)
+        Linkage = LINKAGE_Avg;
+    else if (CLUSTER_UPGMAMin == Cluster)
+        Linkage = LINKAGE_Min;
+    else if (CLUSTER_UPGMAMax == Cluster)
+        Linkage = LINKAGE_Max;
+    else if (CLUSTER_UPGMB == Cluster)
+        Linkage = LINKAGE_Biased;
+    else
+        Quit("TreeFromSeqVect_UPGMA, CLUSTER_%u not supported", Cluster);
+
+    DistCalcDF DistCalc;
+    DistCalc.Init(DF);
+    UPGMA2(DistCalc, tree, Linkage);
+    }
+
+void TreeFromSeqVect(const SeqVect &v, Tree &tree, CLUSTER Cluster,
+  DISTANCE Distance, ROOT Root)
+    {
+// Compute distances between the unaligned sequences, build the tree with
+// either neighbor joining or a UPGMA variant, then apply the rooting method.
+    DistFunc Distances;
+    DistUnaligned(v, Distance, Distances);
+
+    if (CLUSTER_NeighborJoining != Cluster)
+        TreeFromSeqVect_UPGMA(Distances, Cluster, tree);
+    else
+        TreeFromSeqVect_NJ(Distances, Cluster, tree);
+
+    FixRoot(tree, Root);
+    }
Added: trunk/packages/muscle/branches/upstream/current/fastdist.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdist.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdist.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,50 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+
+void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF)
+    {
+// Fill DF with pairwise distances between the sequences of v using the
+// selected method, then copy each sequence's name and id into DF.
+// Methods not listed in the switch are rejected through Quit().
+    const unsigned uSeqTotal = v.Length();
+
+    switch (DistMethod)
+        {
+    case DISTANCE_Kmer6_6:
+        DistKmer6_6(v, DF);
+        break;
+    case DISTANCE_Kmer20_3:
+        DistKmer20_3(v, DF);
+        break;
+    case DISTANCE_Kmer20_4:
+        FastDistKmer(v, DF);
+        break;
+    case DISTANCE_Kbit20_3:
+        DistKbit20_3(v, DF);
+        break;
+    case DISTANCE_Kmer4_6:
+        DistKmer4_6(v, DF);
+        break;
+    case DISTANCE_PWKimura:
+        DistPWKimura(v, DF);
+        break;
+    default:
+        Quit("DistUnaligned, unsupported distance method %d", DistMethod);
+        }
+
+    unsigned uIndex = 0;
+    while (uIndex < uSeqTotal)
+        {
+        const Seq &seq = *(v[uIndex]);
+        const char *ptrSeqName = seq.GetName();
+        const unsigned uSeqId = seq.GetId();
+
+        DF.SetName(uIndex, ptrSeqName);
+        DF.SetId(uIndex, uSeqId);
+        ++uIndex;
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/fastdistjones.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdistjones.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdistjones.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,206 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+#include <math.h>
+
+// Number of distinct triples (3-letter words) over a 20-letter alphabet.
+const unsigned TRIPLE_COUNT = 20*20*20;
+
+// Occurrence statistics for one triple across all input sequences.
+struct TripleCount
+ {
+ unsigned m_uSeqCount; // How many sequences have this triple?
+ unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s
+ };
+// Table with one entry per triple; allocated inside DistKmer20_3.
+static TripleCount *TripleCounts;
+
+// Fill DF with a triplet-based score for every pair of sequences in v
+// (3-letter words over a 20-letter alphabet).
+//
+// For each pair the shared-triple count is the sum, over all triples, of
+// min(occurrences in sequence 1, occurrences in sequence 2). The value
+// finally stored is that sum divided by (length of the shorter sequence
+// - 2). Pairs whose shorter sequence has fewer than 3 letters, or that
+// share no triple, are set to 1.0. The diagonal is set to 0. Triples that
+// contain a letter outside the 20-letter alphabet are ignored.
+//
+// Fixes relative to the previous version: sequences shorter than 2 no
+// longer make the scan loop wrap around (unsigned uSeqLength - 2); SeqList
+// is released with free() to match malloc() and holds full unsigned
+// indexes; the per-triple count arrays are freed; allocations are
+// checked; the progress counter actually advances.
+//
+// WARNING: Sequences MUST be stripped of gaps and upper case!
+void DistKmer20_3(const SeqVect &v, DistFunc &DF)
+    {
+    const unsigned uSeqCount = v.Length();
+
+    DF.SetCount(uSeqCount);
+    if (0 == uSeqCount)
+        return;
+    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+        {
+        DF.SetDist(uSeq1, uSeq1, 0);
+        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+            DF.SetDist(uSeq1, uSeq2, 0);
+        }
+
+    const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
+    TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
+    if (0 == TripleCounts)
+        Quit("Not enough memory (TripleCounts)");
+    memset(TripleCounts, 0, uTripleArrayBytes);
+
+    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+        {
+        TripleCount &tc = *(TripleCounts + uWord);
+        const unsigned uBytes = uSeqCount*sizeof(short);
+        tc.m_Counts = (unsigned short *) malloc(uBytes);
+        if (0 == tc.m_Counts)
+            Quit("Not enough memory (TripleCounts)");
+        memset(tc.m_Counts, 0, uBytes);
+        }
+
+// Count the occurrences of every triple in every sequence.
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        Seq &s = *(v[uSeqIndex]);
+        const unsigned uSeqLength = s.Length();
+    // uPos + 2 < uSeqLength (not uPos < uSeqLength - 2): the subtraction
+    // would wrap around for sequences shorter than 2 letters.
+        for (unsigned uPos = 0; uPos + 2 < uSeqLength; ++uPos)
+            {
+            const unsigned uLetter1 = CharToLetterEx(s[uPos]);
+            if (uLetter1 >= 20)
+                continue;
+            const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
+            if (uLetter2 >= 20)
+                continue;
+            const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
+            if (uLetter3 >= 20)
+                continue;
+
+            const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
+            assert(uWord < TRIPLE_COUNT);
+
+            TripleCount &tc = *(TripleCounts + uWord);
+            const unsigned uOldCount = tc.m_Counts[uSeqIndex];
+            if (0 == uOldCount)
+                ++(tc.m_uSeqCount);
+
+            ++(tc.m_Counts[uSeqIndex]);
+            }
+        }
+
+#if TRACE
+    {
+    Log("TripleCounts\n");
+    unsigned uGrandTotal = 0;
+    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+        {
+        const TripleCount &tc = *(TripleCounts + uWord);
+        if (0 == tc.m_uSeqCount)
+            continue;
+
+        const unsigned uLetter3 = uWord/(20*20);
+        const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
+        const unsigned uLetter1 = uWord%20;
+        Log("Word %6u %c%c%c %6u",
+          uWord,
+          LetterToCharAmino(uLetter1),
+          LetterToCharAmino(uLetter2),
+          LetterToCharAmino(uLetter3),
+          tc.m_uSeqCount);
+
+        unsigned uSeqCountWithThisWord = 0;
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            {
+            const unsigned uCount = tc.m_Counts[uSeqIndex];
+            if (uCount > 0)
+                {
+                ++uSeqCountWithThisWord;
+                Log(" %u=%u", uSeqIndex, uCount);
+                uGrandTotal += uCount;
+                }
+            }
+        if (uSeqCountWithThisWord != tc.m_uSeqCount)
+            Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
+        Log("\n");
+        }
+
+    unsigned uTotalBySeqLength = 0;
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        Seq &s = *(v[uSeqIndex]);
+        const unsigned uSeqLength = s.Length();
+        if (uSeqLength > 2)
+            uTotalBySeqLength += uSeqLength - 2;
+        }
+    if (uGrandTotal != uTotalBySeqLength)
+        Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
+    }
+#endif
+
+// SeqList holds the indexes of the sequences that contain the current
+// triple. Entries are full unsigned indexes (the allocation was already
+// sized for unsigned) so that large inputs are not truncated.
+    const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
+    unsigned *SeqList = (unsigned *) malloc(uSeqListBytes);
+    if (0 == SeqList)
+        Quit("Not enough memory (SeqList)");
+
+    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+        {
+        const TripleCount &tc = *(TripleCounts + uWord);
+        if (0 == tc.m_uSeqCount)
+            continue;
+
+        unsigned uSeqCountFound = 0;
+        memset(SeqList, 0, uSeqListBytes);
+
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            {
+            if (tc.m_Counts[uSeqIndex] > 0)
+                {
+                SeqList[uSeqCountFound] = uSeqIndex;
+                ++uSeqCountFound;
+                if (uSeqCountFound == tc.m_uSeqCount)
+                    break;
+                }
+            }
+        assert(uSeqCountFound == tc.m_uSeqCount);
+
+    // Add min(count1, count2) for this triple to every pair that has it.
+        for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
+            {
+            const unsigned uSeqIndex1 = SeqList[uSeq1];
+            const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
+            for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+                {
+                const unsigned uSeqIndex2 = SeqList[uSeq2];
+                const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
+                const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
+                const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
+                DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
+                }
+            }
+        }
+
+// Everything above was obtained with malloc, so release it with free,
+// including the per-triple count arrays.
+    free(SeqList);
+    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+        free(TripleCounts[uWord].m_Counts);
+    free(TripleCounts);
+    TripleCounts = 0;
+
+// Normalize the accumulated counts by the number of triples in the
+// shorter sequence of each pair.
+    unsigned uDone = 0;
+    const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
+    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+        {
+        DF.SetDist(uSeq1, uSeq1, 0.0);
+
+        const Seq &s1 = *(v[uSeq1]);
+        const unsigned uLength1 = s1.Length();
+
+        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+            {
+        // Report progress once per 1000 pairs.
+            if (uDone%1000 == 0)
+                Progress(uDone, uTotal);
+            ++uDone;
+
+            const Seq &s2 = *(v[uSeq2]);
+            const unsigned uLength2 = s2.Length();
+            unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
+            if (uMinLength < 3)
+                {
+                DF.SetDist(uSeq1, uSeq2, 1.0);
+                continue;
+                }
+
+            const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
+            if (dTripleCount == 0)
+                {
+                DF.SetDist(uSeq1, uSeq2, 1.0);
+                continue;
+                }
+            double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
+            //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
+            //if (dEstimatedPairwiseIdentity > 1)
+            //    dEstimatedPairwiseIdentity = 1;
+//          DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
+            DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);
+
+#if TRACE
+            {
+        // The estimated-identity variable is commented out above, so the
+        // trace reports the value that is actually stored.
+            Log("%s - %s Triplet count = %g Lengths %u, %u Normalized score = %g\n",
+              s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
+              dNormalizedTripletScore);
+            }
+#endif
+            }
+        }
+    ProgressStepsDone();
+    }
Added: trunk/packages/muscle/branches/upstream/current/fastdistkbit.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdistkbit.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdistkbit.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,109 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+#include <math.h>
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+// Set one bit in Bits[] for every distinct 3-mer of s over the 20-letter
+// amino acid alphabet. Bit number = base-20 code of the 3-mer, so Bits must
+// provide at least 8000 bits (1000 bytes). Bits are OR-ed in; the buffer is
+// not cleared here. K-mers containing a letter for which CharToLetterEx()
+// returns a value above AX_Y are not recorded.
+ static void SetKmerBitVector(const Seq &s, byte Bits[])
+ {
+ const unsigned uLength = s.Length();
+ const unsigned k = 3; // kmer length
+
+ // A sequence shorter than k has no k-mers. Returning here also keeps the
+ // priming loop below from indexing past the end of s.
+ if (uLength < k)
+ return;
+
+ unsigned i = 0; // index of the next letter to read
+ unsigned c = 0; // rolling base-20 code of the most recent letters
+ unsigned h = 0; // value i must reach before c is wildcard-free again
+
+ // Prime the rolling code with the first k-1 letters.
+ for (unsigned j = 0; j < k - 1; ++j)
+ {
+ unsigned x = CharToLetterEx(s[i++]);
+ if (x <= AX_Y)
+ c = c*20 + x;
+ else
+ {
+ c = 0;
+ // Same rule as in the main loop: k more letters are needed
+ // before a k-mer free of this wildcard has been read.
+ h = i + k;
+ }
+ }
+
+ // Read exactly one letter per pass; i is advanced only by the i++ below.
+ while (i < uLength)
+ {
+ unsigned x = CharToLetterEx(s[i++]);
+ if (x <= AX_Y)
+ c = (c*20 + x)%8000;
+ else
+ {
+ c = 0;
+ h = i + k;
+ }
+ if (i >= h)
+ {
+ unsigned ByteOffset = c/8;
+ unsigned BitOffset = c%8;
+ Bits[ByteOffset] |= (1 << BitOffset);
+ }
+ }
+ }
+
+// Return the number of bit positions that are set in both of two
+// 1000-byte (8000-bit) vectors.
+ static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[])
+ {
+ const byte * const p1end = Bits1 + 1000;
+ const byte *p2 = Bits2;
+
+ unsigned uCount = 0;
+ for (const byte *p1 = Bits1; p1 != p1end; ++p1)
+ {
+ // Combine the two bytes into a single word: *p1 in bits 0-7 and
+ // *p2 in bits 8-15. After n right shifts, bit 0 is bit n of *p1
+ // and bit 8 is bit n of *p2, so the position is common only when
+ // both of those bits are set.
+ unsigned b = *p1 | (*p2 << 8);
+ while (b != 0)
+ {
+ if ((b & 0x101) == 0x101)
+ ++uCount;
+ b >>= 1;
+ }
+ ++p2;
+ }
+ return uCount;
+ }
+
+// Fill DF with a 3-mer bit-vector measure for every pair (i, j), j < i, of
+// sequences in v. Each sequence gets a vector with one bit per possible
+// 3-mer; the value stored for a pair is K / min(L1, L2), where K is the
+// number of bits common to the two vectors and L1, L2 are the lengths.
+// NOTE(review): diagonal entries are not written by this function.
+void DistKbit20_3(const SeqVect &v, DistFunc &DF)
+ {
+ // 20^3 = 8,000 distinct kmers in the 20-letter alphabet, one bit each,
+ // i.e. 1,000 bytes per sequence.
+ const unsigned BYTES_PER_SEQ = 1000;
+
+ const unsigned uSeqCount = v.Length();
+ DF.SetCount(uSeqCount);
+
+ const unsigned uBytes = uSeqCount*BYTES_PER_SEQ;
+ byte *BitVector = new byte[uBytes];
+ memset(BitVector, 0, uBytes);
+
+ SetProgressDesc("K-bit distance matrix");
+ for (unsigned n = 0; n < uSeqCount; ++n)
+ SetKmerBitVector(*v[n], BitVector + n*BYTES_PER_SEQ);
+
+ const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
+ unsigned uDone = 0;
+ for (unsigned uRow = 0; uRow < uSeqCount; ++uRow)
+ {
+ const byte *RowBits = BitVector + uRow*BYTES_PER_SEQ;
+ const unsigned uRowLength = v[uRow]->Length();
+ for (unsigned uCol = 0; uCol < uRow; ++uCol)
+ {
+ const byte *ColBits = BitVector + uCol*BYTES_PER_SEQ;
+ const unsigned uColLength = v[uCol]->Length();
+ const float fCount = (float) CommonBitCount(RowBits, ColBits);
+
+ // NOTE(review): uShorter is 0 when either sequence is empty, in
+ // which case the division below does not give a finite number.
+ const unsigned uShorter = uRowLength < uColLength ? uRowLength : uColLength;
+ const float fDist = fCount / uShorter;
+ DF.SetDist(uRow, uCol, fDist);
+
+ // Report progress once every 10000 pairs.
+ if (uDone%10000 == 0)
+ Progress(uDone, uTotal);
+ ++uDone;
+ }
+ }
+ ProgressStepsDone();
+
+ delete[] BitVector;
+ }
Added: trunk/packages/muscle/branches/upstream/current/fastdistkmer.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdistkmer.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdistkmer.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,247 @@
+#include "muscle.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "seq.h"
+#include "distfunc.h"
+#include <math.h>
+
+#define TRACE 0
+
+/***
+Some candidate alphabets considered because they
+have high correlations and small table sizes.
+Correlation coefficient is between k-mer distance
+and %id D measured from a CLUSTALW alignment.
+Table size is N^k where N is size of alphabet.
+A is standard (uncompressed) amino alphabet.
+
+ Correlation
+Alpha N k Table Size all 25-50%
+----- -- - ---------- ---- ------
+A 20 3 8,000 0.943 0.575
+A 20 4 160,000 0.962 0.685 <<
+LiA 14 4 38,416 0.966 0.645
+SEB 14 4 38,416 0.964 0.634
+LiA 13 4 28,561 0.965 0.640
+LiA 12 4 20,736 0.963 0.620
+LiA 10 5 100,000 0.964 0.652
+
+We select A with k=4 because it has the best
+correlations. The only drawback is a large table
+size, but space is readily available and the only
+additional time cost is in resetting the table to
+zero, which can be done quickly with memset or by
+keeping a list of the k-mers that were found (should
+test to see which is faster, and may vary by compiler
+and processor type). It also has the minor advantage
+that we don't need to convert the alphabet.
+
+Fractional identity d is estimated as follows.
+
+ F = fractional k-mer count
+ if F is 0: F = 0.01
+ Y = log(0.02 + F)
+ d = (Y + 4.1)/4.12 (from the fitted line Y = -4.1 + 4.12*d)
+
+The constant 0.02 was chosen to make the relationship
+between Y and D linear. The constants -4.1 and 4.12
+were chosen to fit a straight line to the scatterplot
+of Y vs D.
+***/
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+// K is the k-mer length and N the alphabet size. A k-mer is encoded as a
+// K-digit base-N number (first letter most significant), so N, N_2 and N_3
+// are the place values and N_4 = N^K is the number of distinct k-mers.
+ const unsigned K = 4;
+ const unsigned N = 20;
+ const unsigned N_2 = 20*20;
+ const unsigned N_3 = 20*20*20;
+ const unsigned N_4 = 20*20*20*20;
+
+// Number of entries in a k-mer count table.
+ const unsigned TABLE_SIZE = N_4;
+
+// Debugging aid: turn a 4-mer table index back into its four letters.
+// The text is held in a static buffer, so each call overwrites the result
+// of the previous one.
+ const char *KmerToStr(unsigned Kmer)
+ {
+ static char s[5];
+
+ // Peel off base-N digits starting with the least significant one, which
+ // is the last letter of the k-mer, and fill the buffer right to left.
+ unsigned uRest = Kmer;
+ for (int i = 3; i >= 0; --i)
+ {
+ s[i] = LetterToChar(uRest%N);
+ uRest /= N;
+ }
+ return s;
+ }
+
+// Count how often each K-mer occurs in a letter-coded sequence.
+//   s           the sequence as letter codes, expected to lie in [0, N)
+//   uSeqLength  number of letters in s
+//   KmerCounts  output table of TABLE_SIZE counters; cleared here first
+// A sequence shorter than K contains no k-mers and gives an all-zero table.
+void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
+ {
+#if TRACE
+ Log("CountKmers\n");
+#endif
+ memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
+
+ // Without this guard the code below would read s[0..K-1] beyond the end
+ // of a short sequence, and the loop's equality test against ptrSeqEnd
+ // would never be satisfied.
+ if (uSeqLength < K)
+ return;
+
+ const byte *ptrKmerStart = s;
+ const byte *ptrKmerEnd = s + K;
+ const byte *ptrSeqEnd = s + uSeqLength;
+
+ // Table index of the first k-mer, first letter most significant.
+ unsigned Kmer = s[0]*N_3 + s[1]*N_2 + s[2]*N + s[3];
+
+ for (;;)
+ {
+ assert(Kmer < TABLE_SIZE);
+
+#if TRACE
+ Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
+#endif
+ // NOTE(review): the counters have type byte; assuming that is an
+ // 8-bit unsigned type, a k-mer seen 256 times wraps back to 0.
+ ++(KmerCounts[Kmer]);
+
+ if (ptrKmerEnd == ptrSeqEnd)
+ break;
+
+ // Compute k-mer as function of previous k-mer:
+ // 1. Subtract first letter from previous k-mer.
+ // 2. Multiply by N.
+ // 3. Add next letter.
+ const unsigned uLeading = (*ptrKmerStart++) * N_3;
+ Kmer = (Kmer - uLeading)*N;
+ Kmer += *ptrKmerEnd++;
+ }
+ }
+
+// Count the k-mers shared by two letter-coded sequences: the sum, over the
+// distinct k-mers of Seq, of min(occurrences in Seq, occurrences in Seq2).
+//   Seq, uSeqLength    first sequence (letter codes) and its length
+//   KmerCounts1        k-mer count table of Seq (TABLE_SIZE entries)
+//   Seq2, uSeqLength2  second sequence (letter codes) and its length
+// Returns 0 when either sequence is shorter than K, because such a
+// sequence has no k-mers at all.
+unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
+ const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
+ {
+ // Guard before any k-mer is formed: the code below reads Seq[0..K-1]
+ // unconditionally and ends its loop on an equality test that a short
+ // sequence would never meet.
+ if (uSeqLength < K || uSeqLength2 < K)
+ return 0;
+
+ // NOTE(review): TABLE_SIZE bytes of automatic storage per call.
+ byte KmerCounts2[TABLE_SIZE];
+ CountKmers(Seq2, uSeqLength2, KmerCounts2);
+
+ const byte *ptrKmerStart = Seq;
+ const byte *ptrKmerEnd = Seq + K;
+ const byte *ptrSeqEnd = Seq + uSeqLength;
+
+ // Table index of the first k-mer, first letter most significant.
+ unsigned Kmer = Seq[0]*N_3 + Seq[1]*N_2 + Seq[2]*N + Seq[3];
+
+ unsigned uCommonCount = 0;
+ for (;;)
+ {
+ assert(Kmer < TABLE_SIZE);
+
+ const byte Count1 = KmerCounts1[Kmer];
+ const byte Count2 = KmerCounts2[Kmer];
+
+ uCommonCount += MIN(Count1, Count2);
+
+ // Hack so we don't double-count: once a k-mer has contributed, its
+ // entry is cleared so later occurrences in Seq add nothing.
+ KmerCounts2[Kmer] = 0;
+
+ if (ptrKmerEnd == ptrSeqEnd)
+ break;
+
+ // Compute k-mer as function of previous k-mer:
+ // 1. Subtract first letter from previous k-mer.
+ // 2. Multiply by N.
+ // 3. Add next letter.
+ const unsigned uLeading = (*ptrKmerStart++) * N_3;
+ Kmer = (Kmer - uLeading)*N;
+ Kmer += *ptrKmerEnd++;
+ }
+ return uCommonCount;
+ }
+
+// Write the letter code of every character of s into Letters[], which must
+// have room for s.Length() entries.
+ static void SeqToLetters(const Seq &s, byte Letters[])
+ {
+ const unsigned uLength = s.Length();
+ for (unsigned i = 0; i < uLength; ++i)
+ {
+ const char cRaw = s.GetChar(i);
+ // The k-mer counting code has no notion of wildcards, so each
+ // wildcard is arbitrarily replaced by one specific amino acid.
+ const char cUsed = IsWildcardChar(cRaw) ? 'A' : cRaw;
+ Letters[i] = CharToLetter(cUsed);
+ }
+ }
+
+// Fill DF with k-mer based distances for all pairs of sequences in v.
+// For each pair, F = (common k-mer count) / (number of k-mers in the
+// shorter sequence); F == 0 is replaced by 0.01 and the value stored is
+// 1 - F. A pair in which either sequence is shorter than K has no k-mers
+// to share and is treated as F == 0.
+void FastDistKmer(const SeqVect &v, DistFunc &DF)
+ {
+ // NOTE(review): TABLE_SIZE bytes of automatic storage.
+ byte KmerCounts[TABLE_SIZE];
+
+ const unsigned uSeqCount = v.GetSeqCount();
+
+ DF.SetCount(uSeqCount);
+ if (0 == uSeqCount)
+ return;
+
+// Initialize distance matrix to zero
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ DF.SetDist(uSeq1, uSeq1, 0);
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+ DF.SetDist(uSeq1, uSeq2, 0);
+ }
+
+// The two letter buffers are sized for the longest sequence
+ unsigned uMaxLength = 0;
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ const Seq &s = v.GetSeq(uSeqIndex);
+ unsigned uSeqLength = s.Length();
+ if (uSeqLength > uMaxLength)
+ uMaxLength = uSeqLength;
+ }
+ if (0 == uMaxLength)
+ return;
+
+ byte *Seq1Letters = new byte[uMaxLength];
+ byte *Seq2Letters = new byte[uMaxLength];
+
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
+ {
+ const Seq &s1 = v.GetSeq(uSeqIndex1);
+ const unsigned uSeqLength1 = s1.Length();
+
+ SeqToLetters(s1, Seq1Letters);
+ // Only sequences with at least K letters contain a k-mer. When this
+ // one does not, KmerCounts is left alone; it is never read, because
+ // every pair below then has uMinLength < K.
+ if (uSeqLength1 >= K)
+ CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
+
+ for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
+ ++uSeqIndex2)
+ {
+ const Seq &s2 = v.GetSeq(uSeqIndex2);
+ const unsigned uSeqLength2 = s2.Length();
+
+ unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
+ unsigned uCommonKmerCount = 0;
+ double F = 0.0;
+ // uMinLength - K + 1 is the number of k-mers in the shorter
+ // sequence. It is only meaningful for uMinLength >= K; below that
+ // the unsigned expression would be zero or wrap around.
+ if (uMinLength >= K)
+ {
+ SeqToLetters(s2, Seq2Letters);
+ uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
+ KmerCounts, Seq2Letters, uSeqLength2);
+ F = (double) uCommonKmerCount / (uMinLength - K + 1);
+ }
+ if (0.0 == F)
+ F = 0.01;
+ // Y, EstimatedPctId and KD are reported by the TRACE log only; the
+ // distance actually stored is 1 - F.
+ double Y = log(0.02 + F);
+ double EstimatedPctId = Y/4.12 + 0.995;
+ double KD = KimuraDist(EstimatedPctId);
+// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
+ DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
+#if TRACE
+ Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
+ uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
+#endif
+ }
+ }
+
+ delete[] Seq1Letters;
+ delete[] Seq2Letters;
+ }
Added: trunk/packages/muscle/branches/upstream/current/fastdistmafft.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdistmafft.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdistmafft.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,290 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+#include <math.h>
+
+#define TRACE 0
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+// Number of distinct 6-tuples over 6 residue groups (6^6). Count1 and
+// Count2 are per-tuple occurrence counters for the two sequences that are
+// currently being compared.
+ const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
+ static unsigned char Count1[TUPLE_COUNT];
+ static unsigned char Count2[TUPLE_COUNT];
+
+// Amino acid groups according to MAFFT (sextet5)
+// 0 = A G P S T
+// 1 = I L M V
+// 2 = N D Q E B Z
+// 3 = R H K
+// 4 = F W Y
+// 5 = C
+// 6 = X . - U
+// The table below maps a letter code to its group number; the AX_* name in
+// each trailing comment is the letter that entry stands for. Group 6 is
+// never produced: X and gap are mapped to group 0 (see the TODO entries).
+ unsigned ResidueGroup[] =
+ {
+ 0, // AX_A,
+ 5, // AX_C,
+ 2, // AX_D,
+ 2, // AX_E,
+ 4, // AX_F,
+ 0, // AX_G,
+ 3, // AX_H,
+ 1, // AX_I,
+ 3, // AX_K,
+ 1, // AX_L,
+ 1, // AX_M,
+ 2, // AX_N,
+ 0, // AX_P,
+ 2, // AX_Q,
+ 3, // AX_R,
+ 0, // AX_S,
+ 0, // AX_T,
+ 1, // AX_V,
+ 4, // AX_W,
+ 4, // AX_Y,
+
+ 2, // AX_B, // D or N
+ 2, // AX_Z, // E or Q
+ 0, // AX_X, // Unknown // ******** TODO *************
+ // This isn't the correct way of avoiding group 6
+ 0 // AX_GAP, // ******** TODO ******************
+ };
+// Number of entries in ResidueGroup; letter codes must be below this.
+ unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
+
+// Debugging aid: format a tuple code as six base-6 digits, most significant
+// digit first. The text lives in a static buffer that the next call
+// overwrites.
+ static char *TupleToStr(int t)
+ {
+ static char s[7];
+
+ // Fill right to left, taking the least significant digit first.
+ int iRest = t;
+ for (int i = 5; i >= 0; --i)
+ {
+ s[i] = '0' + iRest%6;
+ iRest /= 6;
+ }
+ return s;
+ }
+
+// Return the code of the 6-tuple that starts at uLetters[n]: the residue
+// groups of the six letters read as a base-6 number, first letter most
+// significant. Reads uLetters[n] through uLetters[n+5].
+ static unsigned GetTuple(const unsigned uLetters[], unsigned n)
+ {
+ unsigned uCode = 0;
+ for (unsigned i = 0; i < 6; ++i)
+ {
+ const unsigned uLetter = uLetters[n + i];
+ assert(uLetter < uResidueGroupCount);
+ uCode = uCode*6 + ResidueGroup[uLetter];
+ }
+ return uCode;
+ }
+
+// Clear Count[] (TUPLE_COUNT entries), then count the tuples that start at
+// positions 0 .. uTupleCount-1 of L.
+ static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
+ {
+ memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
+ unsigned uStart = 0;
+ while (uStart < uTupleCount)
+ {
+ Count[GetTuple(L, uStart)] += 1;
+ ++uStart;
+ }
+ }
+
+// Log one line, tuple text and count, for every tuple with a nonzero count.
+ static void ListCount(const unsigned char Count[])
+ {
+ for (unsigned uTuple = 0; uTuple < TUPLE_COUNT; ++uTuple)
+ {
+ if (Count[uTuple] != 0)
+ Log("%s %u\n", TupleToStr(uTuple), Count[uTuple]);
+ }
+ }
+
+// Fill DF with 6-tuple distances over the 6 residue groups for all pairs of
+// sequences in v.
+// Pass 1 computes, for every pair (including each sequence with itself),
+// the number of shared tuples. Pass 2 turns those counts into the distance
+// min(3*(C11 - C12)/C11, 3*(C22 - C12)/C22), where C12 is the shared count
+// of the pair and C11, C22 are the self counts (a self count of 0 is
+// replaced by 1).
+void DistKmer6_6(const SeqVect &v, DistFunc &DF)
+ {
+ const unsigned uSeqCount = v.Length();
+
+ DF.SetCount(uSeqCount);
+ if (0 == uSeqCount)
+ return;
+
+// Initialize distance matrix to zero
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ DF.SetDist(uSeq1, uSeq1, 0);
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+ DF.SetDist(uSeq1, uSeq2, 0);
+ }
+
+// Convert to letters
+ unsigned **Letters = new unsigned *[uSeqCount];
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ Seq &s = *(v[uSeqIndex]);
+ const unsigned uSeqLength = s.Length();
+ unsigned *L = new unsigned[uSeqLength];
+ Letters[uSeqIndex] = L;
+ for (unsigned n = 0; n < uSeqLength; ++n)
+ {
+ char c = s[n];
+ L[n] = CharToLetterEx(c);
+ assert(L[n] < uResidueGroupCount);
+ }
+ }
+
+// Square table of shared tuple counts, all entries zero to start with
+ unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
+ for (unsigned n = 0; n < uSeqCount; ++n)
+ {
+ uCommonTupleCount[n] = new unsigned[uSeqCount];
+ memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
+ }
+
+ const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
+ unsigned uCount = 0;
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ Seq &seq1 = *(v[uSeq1]);
+ const unsigned uSeqLength1 = seq1.Length();
+ if (uSeqLength1 < 5)
+ continue;
+
+ // A sequence of length n has n - 5 tuples of six letters.
+ const unsigned uTupleCount = uSeqLength1 - 5;
+ const unsigned *L = Letters[uSeq1];
+ CountTuples(L, uTupleCount, Count1);
+#if TRACE
+ {
+ Log("Seq1=%d\n", uSeq1);
+ Log("Groups:\n");
+ for (unsigned n = 0; n < uSeqLength1; ++n)
+ Log("%u", ResidueGroup[L[n]]);
+ Log("\n");
+
+ Log("Tuples:\n");
+ ListCount(Count1);
+ }
+#endif
+
+ SetProgressDesc("K-mer dist pass 1");
+ for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
+ {
+ if (0 == uCount%500)
+ Progress(uCount, uPairCount);
+ ++uCount;
+ Seq &seq2 = *(v[uSeq2]);
+ const unsigned uSeqLength2 = seq2.Length();
+ if (uSeqLength2 < 5)
+ {
+ if (uSeq1 == uSeq2)
+ DF.SetDist(uSeq1, uSeq2, 0);
+ else
+ DF.SetDist(uSeq1, uSeq2, 1);
+ continue;
+ }
+
+ // First pass through seq 2 to count tuples
+ const unsigned uTupleCount = uSeqLength2 - 5;
+ const unsigned *L = Letters[uSeq2];
+ CountTuples(L, uTupleCount, Count2);
+#if TRACE
+ Log("Seq2=%d Counts=\n", uSeq2);
+ ListCount(Count2);
+#endif
+
+ // Second pass to accumulate sum of shared tuples
+ // MAFFT defines this as the sum over unique tuples
+ // in seq2 of the minimum of the number of tuples found
+ // in the two sequences.
+ unsigned uSum = 0;
+ for (unsigned n = 0; n < uTupleCount; ++n)
+ {
+ const unsigned uTuple = GetTuple(L, n);
+ uSum += MIN(Count1[uTuple], Count2[uTuple]);
+
+ // This is a hack to make sure each unique tuple counted only once.
+ Count2[uTuple] = 0;
+ }
+#if TRACE
+ {
+ Seq &s1 = *(v[uSeq1]);
+ Seq &s2 = *(v[uSeq2]);
+ const char *pName1 = s1.GetName();
+ const char *pName2 = s2.GetName();
+ Log("Common count %s(%d) - %s(%d) =%u\n",
+ pName1, uSeq1, pName2, uSeq2, uSum);
+ }
+#endif
+ uCommonTupleCount[uSeq1][uSeq2] = uSum;
+ uCommonTupleCount[uSeq2][uSeq1] = uSum;
+ }
+ }
+ ProgressStepsDone();
+
+ uCount = 0;
+ SetProgressDesc("K-mer dist pass 2");
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ // A self count of zero is replaced by 1 so the divisions below are
+ // always defined.
+ double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
+ if (0 == dCommonTupleCount11)
+ dCommonTupleCount11 = 1;
+
+ DF.SetDist(uSeq1, uSeq1, 0);
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+ {
+ if (0 == uCount%500)
+ Progress(uCount, uPairCount);
+ ++uCount;
+
+ double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
+ if (0 == dCommonTupleCount22)
+ dCommonTupleCount22 = 1;
+
+ const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
+ /dCommonTupleCount11;
+ const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
+ /dCommonTupleCount22;
+
+ // dMinDist is the value used for tree-building in MAFFT
+ const double dMinDist = MIN(dDist1, dDist2);
+ DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
+
+ //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
+ //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
+ // **** TODO **** why does this make score slightly worse??
+ //const double dKimuraDist = KimuraDist(dEstimatedPctId);
+ //DF.SetDist(uSeq1, uSeq2, dKimuraDist);
+ }
+ }
+ ProgressStepsDone();
+
+ // Release both per-sequence arrays before the arrays of pointers that
+ // hold them.
+ for (unsigned n = 0; n < uSeqCount; ++n)
+ {
+ delete[] uCommonTupleCount[n];
+ delete[] Letters[n];
+ }
+ delete[] uCommonTupleCount;
+ delete[] Letters;
+ }
+
+// Convert a fractional identity to a distance: -log of the identity, with
+// identities below 0.05 treated as 0.05.
+double PctIdToMAFFTDist(double dPctId)
+ {
+ const double dFloor = 0.05;
+ const double dClamped = (dPctId < dFloor) ? dFloor : dPctId;
+ return -log(dClamped);
+ }
+
+// Height for a given fractional identity; currently the same value as
+// PctIdToMAFFTDist().
+double PctIdToHeightMAFFT(double dPctId)
+ {
+ const double dHeight = PctIdToMAFFTDist(dPctId);
+ return dHeight;
+ }
Added: trunk/packages/muscle/branches/upstream/current/fastdistnuc.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastdistnuc.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastdistnuc.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,265 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+#include <math.h>
+
+#define TRACE 0
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+// Size of the tuple count tables: 6-tuples encoded in base 6 (6^6 entries).
+// Count1 and Count2 are per-tuple occurrence counters for the two sequences
+// that are currently being compared.
+ const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
+ static unsigned char Count1[TUPLE_COUNT];
+ static unsigned char Count2[TUPLE_COUNT];
+
+// Nucleotide groups according to MAFFT (sextet5)
+// 0 = A
+// 1 = C
+// 2 = G
+// 3 = T
+// 4 = other
+
+// Maps a letter code to its group number; the NX_* name in each trailing
+// comment is the letter that entry stands for.
+ static unsigned ResidueGroup[] =
+ {
+ 0, // NX_A,
+ 1, // NX_C,
+ 2, // NX_G,
+ 3, // NX_T/U
+ 4, // NX_N,
+ 4, // NX_R,
+ 4, // NX_Y,
+ 4, // NX_GAP
+ };
+// Number of entries in ResidueGroup; letter codes must be below this.
+ static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
+
+// Debugging aid: format a tuple code as six base-6 digits, most significant
+// digit first. The text lives in a static buffer that the next call
+// overwrites.
+ static char *TupleToStr(int t)
+ {
+ static char s[7];
+
+ // Work from the last character back to the first, consuming the least
+ // significant digit each time.
+ int iRemaining = t;
+ int iSlot = 6;
+ while (iSlot > 0)
+ {
+ --iSlot;
+ s[iSlot] = '0' + iRemaining%6;
+ iRemaining /= 6;
+ }
+ return s;
+ }
+
+// Return the code of the 6-tuple that starts at uLetters[n]: the groups of
+// the six letters read as a base-6 number, first letter most significant.
+// Reads uLetters[n] through uLetters[n+5].
+ static unsigned GetTuple(const unsigned uLetters[], unsigned n)
+ {
+ unsigned uCode = 0;
+ for (unsigned uOffset = 0; uOffset < 6; ++uOffset)
+ {
+ const unsigned uLetter = uLetters[n + uOffset];
+ assert(uLetter < uResidueGroupCount);
+ uCode = uCode*6 + ResidueGroup[uLetter];
+ }
+ return uCode;
+ }
+
+// Clear Count[] (TUPLE_COUNT entries), then count the tuples that start at
+// positions 0 .. uTupleCount-1 of L.
+ static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
+ {
+ memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
+ unsigned uStart = 0;
+ while (uStart < uTupleCount)
+ {
+ Count[GetTuple(L, uStart)] += 1;
+ ++uStart;
+ }
+ }
+
+// Log one line, tuple text and count, for every tuple with a nonzero count.
+ static void ListCount(const unsigned char Count[])
+ {
+ for (unsigned uTuple = 0; uTuple < TUPLE_COUNT; ++uTuple)
+ {
+ if (Count[uTuple] != 0)
+ Log("%s %u\n", TupleToStr(uTuple), Count[uTuple]);
+ }
+ }
+
+// Fill DF with 6-tuple distances for all pairs of nucleotide sequences in
+// v. Quits unless the global alphabet g_Alpha is DNA or RNA.
+// Pass 1 computes, for every pair (including each sequence with itself),
+// the number of shared tuples. Pass 2 turns those counts into the distance
+// min(3*(C11 - C12)/C11, 3*(C22 - C12)/C22), where C12 is the shared count
+// of the pair and C11, C22 are the self counts (a self count of 0 is
+// replaced by 1).
+void DistKmer4_6(const SeqVect &v, DistFunc &DF)
+ {
+ if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)
+ Quit("DistKmer4_6 requires nucleo alphabet");
+
+ const unsigned uSeqCount = v.Length();
+
+ DF.SetCount(uSeqCount);
+ if (0 == uSeqCount)
+ return;
+
+// Initialize distance matrix to zero
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ DF.SetDist(uSeq1, uSeq1, 0);
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+ DF.SetDist(uSeq1, uSeq2, 0);
+ }
+
+// Convert to letters
+ unsigned **Letters = new unsigned *[uSeqCount];
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ Seq &s = *(v[uSeqIndex]);
+ const unsigned uSeqLength = s.Length();
+ unsigned *L = new unsigned[uSeqLength];
+ Letters[uSeqIndex] = L;
+ for (unsigned n = 0; n < uSeqLength; ++n)
+ {
+ char c = s[n];
+ L[n] = CharToLetterEx(c);
+ // Every letter code of 4 or more is collapsed to 4, which
+ // ResidueGroup maps to group 4 ("other").
+ if (L[n] >= 4)
+ L[n] = 4;
+ }
+ }
+
+// Square table of shared tuple counts, all entries zero to start with
+ unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
+ for (unsigned n = 0; n < uSeqCount; ++n)
+ {
+ uCommonTupleCount[n] = new unsigned[uSeqCount];
+ memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
+ }
+
+ // NOTE(review): uPairCount counts pairs including the diagonal, which
+ // matches the pass 1 loops; pass 2 visits only the pairs below the
+ // diagonal but reports progress against the same total.
+ const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
+ unsigned uCount = 0;
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ Seq &seq1 = *(v[uSeq1]);
+ const unsigned uSeqLength1 = seq1.Length();
+ // Sequences shorter than 5 are skipped; their row of shared counts
+ // keeps the zeros it was initialized with.
+ if (uSeqLength1 < 5)
+ continue;
+
+ // A sequence of length n has n - 5 tuples of six letters.
+ const unsigned uTupleCount = uSeqLength1 - 5;
+ const unsigned *L = Letters[uSeq1];
+ CountTuples(L, uTupleCount, Count1);
+#if TRACE
+ {
+ Log("Seq1=%d\n", uSeq1);
+ Log("Groups:\n");
+ for (unsigned n = 0; n < uSeqLength1; ++n)
+ Log("%u", ResidueGroup[L[n]]);
+ Log("\n");
+
+ Log("Tuples:\n");
+ ListCount(Count1);
+ }
+#endif
+
+ SetProgressDesc("K-mer dist pass 1");
+ for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
+ {
+ if (0 == uCount%500)
+ Progress(uCount, uPairCount);
+ ++uCount;
+ Seq &seq2 = *(v[uSeq2]);
+ const unsigned uSeqLength2 = seq2.Length();
+ if (uSeqLength2 < 5)
+ {
+ // These values are written again in pass 2 for every
+ // off-diagonal pair.
+ if (uSeq1 == uSeq2)
+ DF.SetDist(uSeq1, uSeq2, 0);
+ else
+ DF.SetDist(uSeq1, uSeq2, 1);
+ continue;
+ }
+
+ // First pass through seq 2 to count tuples
+ const unsigned uTupleCount = uSeqLength2 - 5;
+ const unsigned *L = Letters[uSeq2];
+ CountTuples(L, uTupleCount, Count2);
+#if TRACE
+ Log("Seq2=%d Counts=\n", uSeq2);
+ ListCount(Count2);
+#endif
+
+ // Second pass to accumulate sum of shared tuples
+ // MAFFT defines this as the sum over unique tuples
+ // in seq2 of the minimum of the number of tuples found
+ // in the two sequences.
+ unsigned uSum = 0;
+ for (unsigned n = 0; n < uTupleCount; ++n)
+ {
+ const unsigned uTuple = GetTuple(L, n);
+ uSum += MIN(Count1[uTuple], Count2[uTuple]);
+
+ // This is a hack to make sure each unique tuple counted only once.
+ Count2[uTuple] = 0;
+ }
+#if TRACE
+ {
+ Seq &s1 = *(v[uSeq1]);
+ Seq &s2 = *(v[uSeq2]);
+ const char *pName1 = s1.GetName();
+ const char *pName2 = s2.GetName();
+ Log("Common count %s(%d) - %s(%d) =%u\n",
+ pName1, uSeq1, pName2, uSeq2, uSum);
+ }
+#endif
+ // Store symmetrically so pass 2 can index the table either way.
+ uCommonTupleCount[uSeq1][uSeq2] = uSum;
+ uCommonTupleCount[uSeq2][uSeq1] = uSum;
+ }
+ }
+ ProgressStepsDone();
+
+ uCount = 0;
+ SetProgressDesc("K-mer dist pass 2");
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+ {
+ // NOTE(review): s1 and pName1 are not used in the rest of this loop.
+ Seq &s1 = *(v[uSeq1]);
+ const char *pName1 = s1.GetName();
+
+ // A self count of zero is replaced by 1 so the divisions below are
+ // always defined.
+ double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
+ if (0 == dCommonTupleCount11)
+ dCommonTupleCount11 = 1;
+
+ DF.SetDist(uSeq1, uSeq1, 0);
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+ {
+ if (0 == uCount%500)
+ Progress(uCount, uPairCount);
+ ++uCount;
+
+ double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
+ if (0 == dCommonTupleCount22)
+ dCommonTupleCount22 = 1;
+
+ const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
+ /dCommonTupleCount11;
+ const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
+ /dCommonTupleCount22;
+
+ // dMinDist is the value used for tree-building in MAFFT
+ const double dMinDist = MIN(dDist1, dDist2);
+ DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
+
+ //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
+ //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
+ // **** TODO **** why does this make score slightly worse??
+ //const double dKimuraDist = KimuraDist(dEstimatedPctId);
+ //DF.SetDist(uSeq1, uSeq2, dKimuraDist);
+ }
+ }
+ ProgressStepsDone();
+
+ // Release the per-sequence arrays, then the arrays of pointers.
+ for (unsigned n = 0; n < uSeqCount; ++n)
+ {
+ delete[] uCommonTupleCount[n];
+ delete[] Letters[n];
+ }
+ delete[] uCommonTupleCount;
+ delete[] Letters;
+ }
Added: trunk/packages/muscle/branches/upstream/current/fastscorepath2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/fastscorepath2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/fastscorepath2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,165 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+
+// Score an alignment path between two profiles edge by edge, writing a
+// table with one row per edge to the log, and return the total score.
+//   PA, uLengthA  profile A and its number of positions
+//   PB, uLengthB  profile B and its number of positions
+//   Path          the alignment path; each edge has a type ('M', 'D' or
+//                 'I') and the prefix lengths of A and B it ends at
+// Per edge: an 'M' edge adds ScoreProfPos2() of the two positions, plus a
+// gap-close score if the previous edge was 'D' or 'I'. A 'D' or 'I' edge
+// adds a gap-open score if the previous edge was the start state 'S' or
+// 'M', and nothing if it extends a gap of the same type. After the last
+// edge a gap-close score is added if the path ends in 'D' or 'I'.
+// Quits on a 'U' edge, an unknown edge type, a zero prefix length where a
+// position is needed, or a 'D' edge directly next to an 'I' edge.
+SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA,
+ const ProfPos *PB, unsigned uLengthB, const PWPath &Path)
+ {
+ const unsigned uEdgeCount = Path.GetEdgeCount();
+ Log("Edge SS PLA PLB Match Gap Total\n");
+ Log("---- -- --- --- ----- --- -----\n");
+ // 'S' is the start state, i.e. the "previous type" of the first edge.
+ char cType = 'S';
+ SCORE scoreTotal = 0;
+ for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+ {
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+ const char cPrevType = cType;
+ cType = Edge.cType;
+ const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
+ const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
+ // bMatch and bGap only decide which columns of the log row are filled.
+ bool bGap = false;
+ bool bMatch = false;
+ SCORE scoreGap = 0;
+ SCORE scoreMatch = 0;
+
+ switch (cType)
+ {
+ case 'M':
+ {
+ if (0 == uPrefixLengthA || 0 == uPrefixLengthB)
+ Quit("FastScorePath2, M zero length");
+
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+ bMatch = true;
+ scoreMatch = ScoreProfPos2(PPA, PPB);
+
+ // A match right after a gap closes that gap; the close score is
+ // taken from the position before the matched one.
+ if ('D' == cPrevType)
+ {
+ bGap = true;
+ assert(uPrefixLengthA > 1);
+ scoreGap = PA[uPrefixLengthA-2].m_scoreGapClose;
+ }
+ else if ('I' == cPrevType)
+ {
+ bGap = true;
+ assert(uPrefixLengthB > 1);
+ scoreGap = PB[uPrefixLengthB-2].m_scoreGapClose;
+ }
+ break;
+ }
+
+ case 'D':
+ {
+ if (0 == uPrefixLengthA)
+ Quit("FastScorePath2, D zero length");
+
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ bGap = true;
+ switch (cPrevType)
+ {
+ case 'S':
+ scoreGap = PPA.m_scoreGapOpen;
+ break;
+ case 'M':
+ scoreGap = PPA.m_scoreGapOpen;
+ break;
+ case 'D':
+// scoreGap = g_scoreGapExtend;
+ scoreGap = 0;
+ break;
+ case 'I':
+ // NOTE(review): here the previous edge is 'I' and this one is 'D',
+ // yet the message reads "DI", the same text as in the 'I' case.
+ Quit("FastScorePath2 DI");
+ }
+ break;
+ }
+
+ case 'I':
+ {
+ if (0 == uPrefixLengthB)
+ Quit("FastScorePath2, I zero length");
+
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+ bGap = true;
+ switch (cPrevType)
+ {
+ case 'S':
+ scoreGap = PPB.m_scoreGapOpen;
+ break;
+ case 'M':
+ scoreGap = PPB.m_scoreGapOpen;
+ break;
+ case 'I':
+ scoreGap = 0;
+// scoreGap = g_scoreGapExtend;
+ break;
+ case 'D':
+ Quit("FastScorePath2 DI");
+ }
+ break;
+ }
+
+ case 'U':
+ {
+ // No break follows; presumably Quit() does not return -- TODO confirm.
+ Quit("FastScorePath2 U");
+ }
+
+ default:
+ Quit("FastScorePath2: invalid type %c", cType);
+ }
+
+ // One log row per edge: index, previous and current type, prefix
+ // lengths, then match score, gap score, edge total, running total.
+ Log("%4u %c%c %4u %4u ", uEdgeIndex, cPrevType, cType,
+ uPrefixLengthA, uPrefixLengthB);
+ if (bMatch)
+ Log("%7.1f ", scoreMatch);
+ else
+ Log(" ");
+ if (bGap)
+ Log("%7.1f ", scoreGap);
+ else
+ Log(" ");
+ SCORE scoreEdge = scoreMatch + scoreGap;
+ scoreTotal += scoreEdge;
+ Log("%7.1f %7.1f", scoreEdge, scoreTotal);
+ Log("\n");
+ }
+
+ // Terminal gap: a path that ends in 'D' or 'I' is charged the gap-close
+ // score of the last position of the corresponding profile.
+ SCORE scoreGap = 0;
+// if (!g_bTermGapsHalf)
+ switch (cType)
+ {
+ case 'M':
+ scoreGap = 0;
+ break;
+
+ case 'D':
+ {
+ const ProfPos &LastPPA = PA[uLengthA - 1];
+ scoreGap = LastPPA.m_scoreGapClose;
+ break;
+ }
+
+ case 'I':
+ {
+ const ProfPos &LastPPB = PB[uLengthB - 1];
+ scoreGap = LastPPB.m_scoreGapClose;
+ break;
+ }
+
+ case 'U':
+ Quit("Unaligned regions not supported");
+
+ // cType is still 'S' only when the path has no edges.
+ case 'S':
+ break;
+
+ default:
+ Quit("Invalid type %c", cType);
+ }
+
+ Log(" %cE %4u %4u %7.1f\n", cType, uLengthA, uLengthB, scoreGap);
+ scoreTotal += scoreGap;
+
+ Log("Total = %g\n", scoreTotal);
+ return scoreTotal;
+ }
Added: trunk/packages/muscle/branches/upstream/current/finddiags.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/finddiags.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/finddiags.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,161 @@
+#include "muscle.h"
+#include "profile.h"
+#include "diaglist.h"
+
+#define TRACE 0
+
+// Tuple length used for seeding diagonals, and the number of distinct
+// tuples of that length in base 6 (6^5).
+ const unsigned KTUP = 5;
+ const unsigned KTUPS = 6*6*6*6*6;
+// Indexed by tuple code: a start position of that tuple in the profile
+// being indexed, or EMPTY.
+ static unsigned TuplePos[KTUPS];
+
+// Debugging aid: format a tuple code as five base-6 digits, most
+// significant digit first. The text lives in a static buffer that the next
+// call overwrites.
+ static char *TupleToStr(int t)
+ {
+ static char s[7];
+
+ // Fill the five digit positions right to left, least significant digit
+ // first; the two remaining bytes of the static buffer are never written.
+ int iRest = t;
+ for (int i = 4; i >= 0; --i)
+ {
+ s[i] = '0' + iRest%6;
+ iRest /= 6;
+ }
+ return s;
+ }
+
+// Return the code of the 5-tuple of residue groups that starts at profile
+// position uPos, as a base-6 number with the first position least
+// significant. Returns EMPTY as soon as one of the five positions has the
+// group RESIDUE_GROUP_MULTIPLE; later positions are then not read.
+ static unsigned GetTuple(const ProfPos *PP, unsigned uPos)
+ {
+ unsigned uCode = 0;
+ unsigned uPlaceValue = 1;
+ for (unsigned i = 0; i < 5; ++i)
+ {
+ const unsigned uGroup = PP[uPos + i].m_uResidueGroup;
+ if (RESIDUE_GROUP_MULTIPLE == uGroup)
+ return EMPTY;
+ uCode += uGroup*uPlaceValue;
+ uPlaceValue *= 6;
+ }
+ return uCode;
+ }
+
+// Find diagonals (runs of positions with identical residue groups) shared
+// by two amino acid profiles and store them in DL.
+//   PX, uLengthX  first profile and its number of positions
+//   PY, uLengthY  second profile and its number of positions
+//   DL            output list; cleared first. Each entry is added as
+//                 (start in X, start in Y, length).
+// Quits unless the global alphabet g_Alpha is amino acid. DL is left empty
+// when either profile has fewer than 12 positions. Only matches of at
+// least g_uMinDiagLength positions are added.
+void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY,
+ unsigned uLengthY, DiagList &DL)
+ {
+ if (ALPHA_Amino != g_Alpha)
+ Quit("FindDiags: requires amino acid alphabet");
+
+ DL.Clear();
+
+ if (uLengthX < 12 || uLengthY < 12)
+ return;
+
+// Set A to shorter profile, B to longer
+ const ProfPos *PA;
+ const ProfPos *PB;
+ unsigned uLengthA;
+ unsigned uLengthB;
+ bool bSwap;
+ if (uLengthX < uLengthY)
+ {
+ bSwap = false;
+ PA = PX;
+ PB = PY;
+ uLengthA = uLengthX;
+ uLengthB = uLengthY;
+ }
+ else
+ {
+ // Also taken when the lengths are equal.
+ bSwap = true;
+ PA = PY;
+ PB = PX;
+ uLengthA = uLengthY;
+ uLengthB = uLengthX;
+ }
+
+// Build tuple map for the longer profile, B
+ // Both lengths are at least 12 at this point, so this check cannot fire.
+ if (uLengthB < KTUP)
+ Quit("FindDiags: profile too short");
+
+ // NOTE(review): memset stores the low byte of EMPTY in every byte, so the
+ // entries compare equal to EMPTY only if EMPTY consists of identical
+ // bytes (e.g. all bits set) -- confirm against its definition.
+ memset(TuplePos, EMPTY, sizeof(TuplePos));
+
+ // A later occurrence of a tuple overwrites an earlier one, so TuplePos
+ // ends up holding the last position visited for each tuple.
+ // NOTE(review): the bound stops one short of uLengthB - KTUP, the last
+ // start position at which a full tuple still fits -- confirm intended.
+ for (unsigned uPos = 0; uPos < uLengthB - KTUP; ++uPos)
+ {
+ const unsigned uTuple = GetTuple(PB, uPos);
+ if (EMPTY == uTuple)
+ continue;
+ TuplePos[uTuple] = uPos;
+ }
+
+// Find matches
+ for (unsigned uPosA = 0; uPosA < uLengthA - KTUP; ++uPosA)
+ {
+ const unsigned uTuple = GetTuple(PA, uPosA);
+ if (EMPTY == uTuple)
+ continue;
+ const unsigned uPosB = TuplePos[uTuple];
+ if (EMPTY == uPosB)
+ continue;
+
+ // This tuple is found in both profiles
+ unsigned uStartPosA = uPosA;
+ unsigned uStartPosB = uPosB;
+
+ // Try to extend the match forwards
+ unsigned uEndPosA = uPosA + KTUP - 1;
+ unsigned uEndPosB = uPosB + KTUP - 1;
+ for (;;)
+ {
+ // Stop at the end of either profile, at a position whose group is
+ // RESIDUE_GROUP_MULTIPLE, or at the first mismatch.
+ if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB)
+ break;
+ const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup;
+ if (RESIDUE_GROUP_MULTIPLE == uAAGroupA)
+ break;
+ const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup;
+ if (RESIDUE_GROUP_MULTIPLE == uAAGroupB)
+ break;
+ if (uAAGroupA != uAAGroupB)
+ break;
+ ++uEndPosA;
+ ++uEndPosB;
+ }
+ // Continue the search after the match; the loop's ++uPosA then moves
+ // to the first position beyond it.
+ uPosA = uEndPosA;
+
+#if TRACE
+ {
+ Log("Match: A %4u-%4u ", uStartPosA, uEndPosA);
+ for (unsigned n = uStartPosA; n <= uEndPosA; ++n)
+ Log("%c", 'A' + PA[n].m_uResidueGroup);
+ Log("\n");
+ Log(" B %4u-%4u ", uStartPosB, uEndPosB);
+ for (unsigned n = uStartPosB; n <= uEndPosB; ++n)
+ Log("%c", 'A' + PB[n].m_uResidueGroup);
+ Log("\n");
+ }
+#endif
+
+ const unsigned uLength = uEndPosA - uStartPosA + 1;
+ assert(uEndPosB - uStartPosB + 1 == uLength);
+
+ if (uLength >= g_uMinDiagLength)
+ {
+ // Undo the A/B swap so the diagonal is reported as (X, Y).
+ if (bSwap)
+ DL.Add(uStartPosB, uStartPosA, uLength);
+ else
+ DL.Add(uStartPosA, uStartPosB, uLength);
+ }
+ }
+ }
Added: trunk/packages/muscle/branches/upstream/current/finddiagsn.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/finddiagsn.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/finddiagsn.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,152 @@
+#include "muscle.h"
+#include "profile.h"
+#include "diaglist.h"
+
+#define TRACE 0
+
+// pow4(i) = 4^i = 2^(2*i). The argument is parenthesized so that compound
+// arguments such as pow4(n + 1) expand as intended.
+#define pow4(i) (1 << (2*(i)))
+// Tuple length: number of consecutive profile columns packed into one code.
+const unsigned K = 7;
+// Number of distinct tuple codes when each column contributes a base-4 digit.
+const unsigned KTUPS = pow4(K);
+// Table indexed by tuple code; FindDiagsNuc stores a position in the longer
+// profile for each tuple it finds there, and EMPTY for tuples not found.
+static unsigned TuplePos[KTUPS];
+
+// Decodes tuple code t into its K letters (one base-4 digit per letter, most
+// significant digit first) using LetterToChar.
+// Returns a pointer to a static, NUL-terminated buffer that is overwritten by
+// the next call.
+static char *TupleToStr(int t)
+    {
+    // K letters plus the terminating NUL. The previous K-byte buffer was
+    // filled completely, so the returned pointer was not a valid C string.
+    static char s[K + 1];
+
+    for (unsigned i = 0; i < K; ++i)
+        {
+        // Base-4 digit number i of t; digit 0 is the last letter of the tuple.
+        const unsigned Letter = (t/(pow4(i)))%4;
+        // Letter is unsigned, so only the upper bound can fail (it does for
+        // a negative t, whose remainder converts to a huge value).
+        assert(Letter < 4);
+        s[K-i-1] = LetterToChar(Letter);
+        }
+    s[K] = 0;
+
+    return s;
+    }
+
+// Packs the residue groups of the K columns starting at uPos into a single
+// base-4 number, first column most significant. Gives up and returns EMPTY
+// at the first column whose group is RESIDUE_GROUP_MULTIPLE.
+static unsigned GetTuple(const ProfPos *PP, unsigned uPos)
+    {
+    const unsigned uStop = uPos + K;
+    unsigned uCode = 0;
+
+    for (unsigned uCol = uPos; uCol != uStop; ++uCol)
+        {
+        const unsigned uGroup = PP[uCol].m_uResidueGroup;
+        if (uGroup == RESIDUE_GROUP_MULTIPLE)
+            return EMPTY;
+        uCode = 4*uCode + uGroup;
+        }
+
+    return uCode;
+    }
+
+// Finds diagonals (runs of consecutive columns whose residue groups are equal
+// in both profiles) between two nucleotide profiles.
+//   PX, uLengthX   first profile and its number of columns
+//   PY, uLengthY   second profile and its number of columns
+//   DL             cleared on entry; receives one entry per run of at least
+//                  g_uMinDiagLength columns, always added in (X, Y) order
+// Quits if g_Alpha is neither ALPHA_DNA nor ALPHA_RNA. Leaves DL empty if
+// either profile has fewer than K + 16 columns.
+void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY,
+ unsigned uLengthY, DiagList &DL)
+ {
+ if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)
+ Quit("FindDiagsNuc: requires nucleo alphabet");
+
+ DL.Clear();
+
+// 16 is arbitrary slop, no principled reason for this.
+ if (uLengthX < K + 16 || uLengthY < K + 16)
+ return;
+
+// Set A to shorter profile, B to longer
+// (when the lengths are equal the else branch runs, so A = Y and B = X)
+ const ProfPos *PA;
+ const ProfPos *PB;
+ unsigned uLengthA;
+ unsigned uLengthB;
+ bool bSwap;
+ if (uLengthX < uLengthY)
+ {
+ bSwap = false;
+ PA = PX;
+ PB = PY;
+ uLengthA = uLengthX;
+ uLengthB = uLengthY;
+ }
+ else
+ {
+ bSwap = true;
+ PA = PY;
+ PB = PX;
+ uLengthA = uLengthY;
+ uLengthB = uLengthX;
+ }
+
+#if TRACE
+ Log("FindDiagsNuc(LengthA=%d LengthB=%d\n", uLengthA, uLengthB);
+#endif
+
+// Build tuple map for the longer profile, B
+// (this check cannot fire: lengths below K + 16 already returned above)
+ if (uLengthB < K)
+ Quit("FindDiags: profile too short");
+
+// NOTE(review): memset writes one byte value into every byte, so each slot
+// ends up equal to EMPTY only if all bytes of EMPTY are identical -- confirm
+// against the definition of EMPTY.
+ memset(TuplePos, EMPTY, sizeof(TuplePos));
+
+// NOTE(review): start position uLengthB - K, whose tuple would end on the
+// last column, is not visited by this bound -- confirm this is intended.
+ for (unsigned uPos = 0; uPos < uLengthB - K; ++uPos)
+ {
+ const unsigned uTuple = GetTuple(PB, uPos);
+ if (EMPTY == uTuple)
+ continue;
+ // A later occurrence overwrites an earlier one, so the table keeps the
+ // last position in B at which each tuple starts.
+ TuplePos[uTuple] = uPos;
+ }
+
+// Find matches
+ for (unsigned uPosA = 0; uPosA < uLengthA - K; ++uPosA)
+ {
+ const unsigned uTuple = GetTuple(PA, uPosA);
+ if (EMPTY == uTuple)
+ continue;
+ const unsigned uPosB = TuplePos[uTuple];
+ if (EMPTY == uPosB)
+ continue;
+
+ // This tuple is found in both profiles
+ unsigned uStartPosA = uPosA;
+ unsigned uStartPosB = uPosB;
+
+ // Try to extend the match forwards
+ unsigned uEndPosA = uPosA + K - 1;
+ unsigned uEndPosB = uPosB + K - 1;
+ for (;;)
+ {
+ // Stop at the last column of either profile
+ if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB)
+ break;
+ const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup;
+ if (RESIDUE_GROUP_MULTIPLE == uAAGroupA)
+ break;
+ const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup;
+ if (RESIDUE_GROUP_MULTIPLE == uAAGroupB)
+ break;
+ if (uAAGroupA != uAAGroupB)
+ break;
+ ++uEndPosA;
+ ++uEndPosB;
+ }
+ // Together with the loop's ++uPosA, scanning of A resumes at the first
+ // column after this match.
+ uPosA = uEndPosA;
+
+#if TRACE
+ {
+ Log("Match: A %4u-%4u ", uStartPosA, uEndPosA);
+ for (unsigned n = uStartPosA; n <= uEndPosA; ++n)
+ Log("%c", LetterToChar(PA[n].m_uResidueGroup));
+ Log("\n");
+ Log(" B %4u-%4u ", uStartPosB, uEndPosB);
+ for (unsigned n = uStartPosB; n <= uEndPosB; ++n)
+ Log("%c", LetterToChar(PB[n].m_uResidueGroup));
+ Log("\n");
+ }
+#endif
+
+ const unsigned uLength = uEndPosA - uStartPosA + 1;
+ assert(uEndPosB - uStartPosB + 1 == uLength);
+
+ if (uLength >= g_uMinDiagLength)
+ {
+ // Undo the A/B swap so the first coordinate always refers to PX
+ if (bSwap)
+ DL.Add(uStartPosB, uStartPosA, uLength);
+ else
+ DL.Add(uStartPosA, uStartPosB, uLength);
+ }
+ }
+ }
Added: trunk/packages/muscle/branches/upstream/current/gapscoredimer.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/gapscoredimer.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/gapscoredimer.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,69 @@
+// source code generated by dimer.py
+
+// Gap score for the "MM" case: each product of one LL/LG/GL/GG member of PPA
+// with one of PPB is charged the open, extend or ambiguous penalty it is
+// grouped under, and the weighted sums are added.
+// NOTE(review): presumably m_LL..m_GG are letter/gap pair frequencies and
+// "MM" names a pair of DP states -- confirm against dimer.py.
+static SCORE GapScoreMM(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_LG) +
+ g_scoreGapExtend*(PPA.m_LL*PPB.m_GG + PPA.m_GG*PPB.m_LL) +
+ g_scoreGapAmbig*(PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL);
+ }
+
+// Gap score for the "MD" case. Only the m_LL and m_GL members of PPA take
+// part; their products with the members of PPB are charged the open, extend
+// or ambiguous penalty they are grouped under.
+// NOTE(review): presumably "MD" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreMD(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
+ g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) +
+ g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG);
+ }
+
+// Gap score for the "MI" case. Only the m_LL and m_GL members of PPB take
+// part; their products with the members of PPA are charged the open, extend
+// or ambiguous penalty they are grouped under.
+// NOTE(review): presumably "MI" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreMI(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
+ g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
+ g_scoreGapAmbig*(PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_GL);
+ }
+
+// Gap score for the "DM" case: products of PPA and PPB members charged the
+// open, extend or ambiguous penalty they are grouped under. Only the m_LL and
+// m_GL members of PPB appear in the open term.
+// NOTE(review): presumably "DM" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreDM(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL) +
+ g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) +
+ g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL);
+ }
+
+// Gap score for the "DD" case. No open penalty is charged: PPA.m_LL times
+// every member of PPB is charged the extend penalty, and PPA.m_GL times every
+// member of PPB the ambiguous penalty.
+// NOTE(review): presumably "DD" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreDD(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GL + PPA.m_LL*PPB.m_GG) +
+ g_scoreGapAmbig*(PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GL + PPA.m_GL*PPB.m_GG);
+ }
+
+// Gap score for the "DI" case. No extend penalty is charged: products of the
+// m_LL/m_GL members of both profiles are charged the open penalty, and
+// products of PPA's m_LG/m_GG with PPB's m_LL/m_GL the ambiguous penalty.
+// NOTE(review): presumably "DI" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreDI(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
+ g_scoreGapAmbig*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL);
+ }
+
+// Gap score for the "IM" case: products of PPA and PPB members charged the
+// open, extend or ambiguous penalty they are grouped under. Only the m_LL and
+// m_GL members of PPA appear in the open term.
+// NOTE(review): presumably "IM" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreIM(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_GL*PPB.m_LG) +
+ g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
+ g_scoreGapAmbig*(PPA.m_LL*PPB.m_GG + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL);
+ }
+
+// Gap score for the "ID" case. No extend penalty is charged: products of the
+// m_LL/m_GL members of both profiles are charged the open penalty, and
+// products of PPA's m_LL/m_GL with PPB's m_LG/m_GG the ambiguous penalty.
+// NOTE(review): presumably "ID" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreID(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) +
+ g_scoreGapAmbig*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG);
+ }
+
+// Gap score for the "II" case. No open penalty is charged: every member of
+// PPA times PPB.m_LL is charged the extend penalty, and every member of PPA
+// times PPB.m_GL the ambiguous penalty.
+// NOTE(review): presumably "II" names a pair of DP states -- confirm against
+// dimer.py.
+static SCORE GapScoreII(const ProfPos &PPA, const ProfPos &PPB)
+ {
+ return
+ g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LG*PPB.m_LL + PPA.m_GL*PPB.m_LL + PPA.m_GG*PPB.m_LL) +
+ g_scoreGapAmbig*(PPA.m_LL*PPB.m_GL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GL + PPA.m_GG*PPB.m_GL);
+ }
Added: trunk/packages/muscle/branches/upstream/current/glbalign.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalign.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalign.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,165 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "timing.h"
+#include "textfile.h"
+#include "msa.h"
+#include "profile.h"
+
+#if !VER_3_52
+
+#define COMPARE_SIMPLE 0
+
+#if TIMING
+// Running total of clock ticks spent inside GlobalAlign (TIMING builds only).
+TICKS g_ticksDP = 0;
+#endif
+
+#if 1
+extern bool g_bKeepSimpleDP;
+SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+
+// In this build the diagonal-free aligner is simply GlobalAlign: the
+// arguments are forwarded unchanged and its score is returned.
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+    const SCORE scoreAlign = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
+    return scoreAlign;
+    }
+
+#if COMPARE_SIMPLE
+
+// Self-checking variant (COMPARE_SIMPLE builds): runs GlobalAlignSimple and
+// NWSmall on the same input, logs both paths and quits if they are not
+// equal. Path receives the NWSmall path and the NWSmall score is returned.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+#if TIMING
+    const TICKS ticksStart = GetClockTicks();
+#endif
+    g_bKeepSimpleDP = true;
+    PWPath PathSimple;
+    GlobalAlignSimple(PA, uLengthA, PB, uLengthB, PathSimple);
+
+    const SCORE scoreSmall = NWSmall(PA, uLengthA, PB, uLengthB, Path);
+
+    const bool bSamePath = Path.Equal(PathSimple);
+    if (!bSamePath)
+        {
+        Log("Simple:\n");
+        PathSimple.LogMe();
+        Log("Small:\n");
+        Path.LogMe();
+        Quit("Paths differ");
+        }
+
+#if TIMING
+    const TICKS ticksEnd = GetClockTicks();
+    g_ticksDP += (ticksEnd - ticksStart);
+#endif
+    return scoreSmall;
+    }
+
+#else // COMPARE_SIMPLE
+
+// Aligns the two profiles with NWSmall and returns its score; Path receives
+// the alignment path. In TIMING builds the elapsed ticks are added to
+// g_ticksDP.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+#if TIMING
+    const TICKS ticksStart = GetClockTicks();
+#endif
+    const SCORE scoreSmall = NWSmall(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+    const TICKS ticksEnd = GetClockTicks();
+    g_ticksDP += (ticksEnd - ticksStart);
+#endif
+    return scoreSmall;
+    }
+
+#endif
+
+#else // 1
+
+// Replaces Path with uLengthB edges of type 'I': the A prefix length stays 0
+// while the B prefix length runs from 1 to uLengthB.
+static void AllInserts(PWPath &Path, unsigned uLengthB)
+    {
+    Path.Clear();
+
+    PWEdge Edge;
+    Edge.cType = 'I';
+    Edge.uPrefixLengthA = 0;
+
+    unsigned uPrefixB = 1;
+    while (uPrefixB <= uLengthB)
+        {
+        Edge.uPrefixLengthB = uPrefixB;
+        Path.AppendEdge(Edge);
+        ++uPrefixB;
+        }
+    }
+
+// Replaces Path with uLengthA edges of type 'D': the B prefix length stays 0
+// while the A prefix length runs from 1 to uLengthA.
+static void AllDeletes(PWPath &Path, unsigned uLengthA)
+    {
+    Path.Clear();
+
+    PWEdge Edge;
+    Edge.cType = 'D';
+    Edge.uPrefixLengthB = 0;
+
+    unsigned uPrefixA = 1;
+    while (uPrefixA <= uLengthA)
+        {
+        Edge.uPrefixLengthA = uPrefixA;
+        Path.AppendEdge(Edge);
+        ++uPrefixA;
+        }
+    }
+
+// Aligns two profiles. An empty A yields an all-insert path and an empty B
+// an all-delete path, both with score 0; otherwise the work is handed to
+// GlobalAlignDiags or GlobalAlignNoDiags depending on g_bDiags.
+// In TIMING builds only the non-empty case adds its ticks to g_ticksDP.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+#if TIMING
+    const TICKS ticksStart = GetClockTicks();
+#endif
+    if (0 == uLengthA)
+        {
+        AllInserts(Path, uLengthB);
+        return 0;
+        }
+    if (0 == uLengthB)
+        {
+        AllDeletes(Path, uLengthA);
+        return 0;
+        }
+
+    const SCORE scoreAlign = g_bDiags ?
+      GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path) :
+      GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+    const TICKS ticksEnd = GetClockTicks();
+    g_ticksDP += (ticksEnd - ticksStart);
+#endif
+    return scoreAlign;
+    }
+
+// Chooses the full-matrix aligner: GlobalAlignDimer when g_bDimer is set,
+// otherwise the aligner matching g_PPScore (LE, SP/SV or SPN). Quits on any
+// other g_PPScore value.
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+    if (g_bDimer)
+        return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_LE == g_PPScore)
+        return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_SP == g_PPScore || PPSCORE_SV == g_PPScore)
+        return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_SPN == g_PPScore)
+        return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path);
+
+    Quit("Invalid PP score (GlobalAlignNoDiags)");
+    return 0;
+    }
+
+#endif
+
+#endif // !VER_3_52
Added: trunk/packages/muscle/branches/upstream/current/glbalign352.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalign352.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalign352.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,55 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "timing.h"
+#include "textfile.h"
+#include "msa.h"
+#include "profile.h"
+
+#if VER_3_52
+
+#if TIMING
+// Running total of clock ticks spent inside GlobalAlign (TIMING builds only).
+TICKS g_ticksDP = 0;
+#endif
+
+// Aligns two profiles through GlobalAlignDiags when g_bDiags is set and
+// through GlobalAlignNoDiags otherwise, returning the score of whichever
+// ran. In TIMING builds the elapsed ticks are added to g_ticksDP.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+#if TIMING
+    const TICKS ticksStart = GetClockTicks();
+#endif
+    const SCORE scoreAlign = g_bDiags ?
+      GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path) :
+      GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+    const TICKS ticksEnd = GetClockTicks();
+    g_ticksDP += (ticksEnd - ticksStart);
+#endif
+    return scoreAlign;
+    }
+
+// Chooses the full-matrix aligner: GlobalAlignDimer when g_bDimer is set,
+// otherwise the aligner matching g_PPScore (LE, SP/SV or SPN). Quits on any
+// other g_PPScore value.
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+    if (g_bDimer)
+        return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_LE == g_PPScore)
+        return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_SP == g_PPScore || PPSCORE_SV == g_PPScore)
+        return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path);
+
+    if (PPSCORE_SPN == g_PPScore)
+        return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path);
+
+    Quit("Invalid PP score (GlobalAlignNoDiags)");
+    return 0;
+    }
+
+#endif // VER_3_52
Added: trunk/packages/muscle/branches/upstream/current/glbaligndiag.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbaligndiag.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbaligndiag.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,172 @@
+#include "muscle.h"
+#include "dpreglist.h"
+#include "diaglist.h"
+#include "pwpath.h"
+#include "profile.h"
+#include "timing.h"
+
+#define TRACE 0
+#define TRACE_PATH 0
+#define LIST_DIAGS 0
+
+// Running totals of DP matrix cells over all calls to GlobalAlignDiags;
+// ListDiagSavings reports their difference as a percentage.
+// Cells a full DP over both profiles would fill (uLengthA*uLengthB per call).
+static double g_dDPAreaWithoutDiags = 0.0;
+// Cells in the rectangular regions actually passed to GlobalAlignNoDiags.
+static double g_dDPAreaWithDiags = 0.0;
+
+// Adds uOffsetA / uOffsetB to the A / B prefix lengths of every edge in Path.
+// The edges are reached through the read-only GetEdge accessor and modified
+// in place by casting the const away.
+static void OffsetPath(PWPath &Path, unsigned uOffsetA, unsigned uOffsetB)
+    {
+    const unsigned uCount = Path.GetEdgeCount();
+    unsigned uIndex = 0;
+    while (uIndex < uCount)
+        {
+        const PWEdge &EdgeRO = Path.GetEdge(uIndex);
+
+        // Deliberate bypass of the class interface: write through the
+        // const reference.
+        PWEdge &EdgeRW = (PWEdge &) EdgeRO;
+        EdgeRW.uPrefixLengthA += uOffsetA;
+        EdgeRW.uPrefixLengthB += uOffsetB;
+
+        ++uIndex;
+        }
+    }
+
+// Replaces Path with one 'M' edge per position of diagonal d; edge number n
+// (from 0) has prefix lengths start + n + 1 in both A and B.
+static void DiagToPath(const Diag &d, PWPath &Path)
+    {
+    Path.Clear();
+
+    const unsigned uEdgeCount = d.m_uLength;
+    for (unsigned uDone = 0; uDone != uEdgeCount; ++uDone)
+        {
+        const unsigned uPrefixLength = uDone + 1;
+
+        PWEdge Edge;
+        Edge.cType = 'M';
+        Edge.uPrefixLengthA = d.m_uStartPosA + uPrefixLength;
+        Edge.uPrefixLengthB = d.m_uStartPosB + uPrefixLength;
+        Path.AppendEdge(Edge);
+        }
+    }
+
+// Appends every edge of RegPath, in order, to the end of Path.
+static void AppendRegPath(PWPath &Path, const PWPath &RegPath)
+    {
+    const unsigned uCount = RegPath.GetEdgeCount();
+    unsigned uIndex = 0;
+    while (uIndex < uCount)
+        {
+        const PWEdge &Edge = RegPath.GetEdge(uIndex);
+        Path.AppendEdge(Edge);
+        ++uIndex;
+        }
+    }
+
+// Aligns two profiles using diagonals to reduce the DP area.
+// Diagonals are found (FindDiags for amino acids, FindDiagsNuc for DNA/RNA),
+// sorted, pruned of incompatible entries and merged; the resulting list is
+// converted to a list of regions. Diagonal regions become runs of 'M' edges,
+// rectangular regions are aligned with GlobalAlignNoDiags, and the partial
+// paths are appended to Path in region order.
+//   PA, uLengthA   first profile and its number of columns
+//   PB, uLengthB   second profile and its number of columns
+//   Path           receives the appended edges
+// Always returns 0; no alignment score is computed here.
+// Quits on an unsupported alphabet or an unknown region type.
+// NOTE(review): Path is appended to without being cleared first -- confirm
+// that callers always pass an empty path.
+SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+    unsigned uLengthB, PWPath &Path)
+    {
+#if LIST_DIAGS
+    TICKS t1 = GetClockTicks();
+#endif
+
+    DiagList DL;
+
+    if (ALPHA_Amino == g_Alpha)
+        FindDiags(PA, uLengthA, PB, uLengthB, DL);
+    else if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha)
+        FindDiagsNuc(PA, uLengthA, PB, uLengthB, DL);
+    else
+        Quit("GlobalAlignDiags: bad alpha");
+
+#if TRACE
+    Log("GlobalAlignDiags, diag list:\n");
+    DL.LogMe();
+#endif
+
+    DL.Sort();
+    DL.DeleteIncompatible();
+
+#if TRACE
+    Log("After DeleteIncompatible:\n");
+    DL.LogMe();
+#endif
+
+    MergeDiags(DL);
+
+#if TRACE
+    Log("After MergeDiags:\n");
+    DL.LogMe();
+#endif
+
+    DPRegionList RL;
+    DiagListToDPRegionList(DL, RL, uLengthA, uLengthB);
+
+#if TRACE
+    Log("RegionList:\n");
+    RL.LogMe();
+#endif
+
+#if LIST_DIAGS
+        {
+        TICKS t2 = GetClockTicks();
+        unsigned uArea = RL.GetDPArea();
+        Log("ticks=%ld\n", (long) (t2 - t1));
+        Log("area=%u\n", uArea);
+        }
+#endif
+
+    // Multiply as doubles: the product of two unsigned lengths wraps around
+    // for long profiles, which would corrupt the savings statistics.
+    g_dDPAreaWithoutDiags += (double) uLengthA*(double) uLengthB;
+
+    double dDPAreaWithDiags = 0.0;
+    const unsigned uRegionCount = RL.GetCount();
+    for (unsigned uRegionIndex = 0; uRegionIndex < uRegionCount; ++uRegionIndex)
+        {
+        const DPRegion &r = RL.Get(uRegionIndex);
+
+        PWPath RegPath;
+        if (DPREGIONTYPE_Diag == r.m_Type)
+            {
+            // A diagonal needs no DP: it is a straight run of matches
+            DiagToPath(r.m_Diag, RegPath);
+#if TRACE_PATH
+            Log("DiagToPath, path=\n");
+            RegPath.LogMe();
+#endif
+            }
+        else if (DPREGIONTYPE_Rect == r.m_Type)
+            {
+            const unsigned uRegStartPosA = r.m_Rect.m_uStartPosA;
+            const unsigned uRegStartPosB = r.m_Rect.m_uStartPosB;
+            const unsigned uRegLengthA = r.m_Rect.m_uLengthA;
+            const unsigned uRegLengthB = r.m_Rect.m_uLengthB;
+            const ProfPos *RegPA = PA + uRegStartPosA;
+            const ProfPos *RegPB = PB + uRegStartPosB;
+
+            // As above, avoid unsigned wrap-around in the area product
+            dDPAreaWithDiags += (double) uRegLengthA*(double) uRegLengthB;
+            GlobalAlignNoDiags(RegPA, uRegLengthA, RegPB, uRegLengthB, RegPath);
+#if TRACE_PATH
+            Log("GlobalAlignNoDiags RegPath=\n");
+            RegPath.LogMe();
+#endif
+            // The sub-alignment is relative to the region start; shift its
+            // prefix lengths into whole-profile coordinates.
+            OffsetPath(RegPath, uRegStartPosA, uRegStartPosB);
+#if TRACE_PATH
+            Log("After offset path, RegPath=\n");
+            RegPath.LogMe();
+#endif
+            }
+        else
+            Quit("GlobalAlignDiags, Invalid region type %u", r.m_Type);
+
+        AppendRegPath(Path, RegPath);
+#if TRACE_PATH
+        Log("After AppendPath, path=");
+        Path.LogMe();
+#endif
+        }
+
+#if TRACE
+        {
+        double dDPAreaWithoutDiags = (double) uLengthA*(double) uLengthB;
+        Log("DP area with diags %.3g without %.3g pct saved %.3g %%\n",
+          dDPAreaWithDiags, dDPAreaWithoutDiags, (1.0 - dDPAreaWithDiags/dDPAreaWithoutDiags)*100.0);
+        }
+#endif
+    g_dDPAreaWithDiags += dDPAreaWithDiags;
+    return 0;
+    }
+
+// Logs the percentage of DP area saved by diagonals, accumulated over all
+// calls to GlobalAlignDiags. Does nothing unless both g_bVerbose and g_bDiags
+// are set.
+void ListDiagSavings()
+    {
+    if (!g_bVerbose || !g_bDiags)
+        return;
+    const double dAreaSaved = g_dDPAreaWithoutDiags - g_dDPAreaWithDiags;
+    // If no area has been accumulated the ratio would be 0/0 and the log
+    // would show NaN; report zero savings instead.
+    double dPct = 0.0;
+    if (g_dDPAreaWithoutDiags > 0.0)
+        dPct = dAreaSaved*100.0/g_dDPAreaWithoutDiags;
+    Log("DP area saved by diagonals %-4.1f%%\n", dPct);
+    }
Added: trunk/packages/muscle/branches/upstream/current/glbalignle.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalignle.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalignle.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,435 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+
+#define OCC 1
+
+// Work buffers for GlobalAlignLE, allocated and grown by AllocDPMem and kept
+// between calls in the file-static DPM below.
+struct DP_MEMORY
+ {
+ // Allocated length of each per-position array (0 before first allocation)
+ unsigned uLength;
+ // Per-column copies of m_scoreGapOpen / m_scoreGapClose of profiles A and B
+ SCORE *GapOpenA;
+ SCORE *GapOpenB;
+ SCORE *GapCloseA;
+ SCORE *GapCloseB;
+ // Previous, current and spare rows of the M (match) and D (delete) scores;
+ // GlobalAlignLE passes each triple to Rotate after finishing a row
+ SCORE *MPrev;
+ SCORE *MCurr;
+ SCORE *MWork;
+ SCORE *DPrev;
+ SCORE *DCurr;
+ SCORE *DWork;
+ // [letter][column of B]: copy of PB[column].m_AAScores[letter]; 20 rows
+ SCORE **ScoreMxB;
+#if OCC
+ // Per-column copies of m_fOcc of profiles A and B
+ FCOUNT *OccA;
+ FCOUNT *OccB;
+#endif
+ // [column of A][0..19]: copies of m_uSortOrder of profile A
+ unsigned **SortOrderA;
+ // Per column of B: row index i recorded when the delete score was last
+ // improved by a new gap open
+ unsigned *uDeletePos;
+ // [column of A][letter]: copies of m_fcCounts of profile A
+ FCOUNT **FreqsA;
+ // uLength x uLength matrix of traceback codes
+ int **TraceBack;
+ };
+
+// Single shared instance; statics are zero-initialized, so uLength starts at
+// 0, which AllocDPMem treats as "nothing allocated yet".
+static struct DP_MEMORY DPM;
+
+// Makes sure the shared DPM buffers can hold max(uLengthA, uLengthB) + 1
+// entries. If the current allocation is already strictly larger nothing
+// happens; otherwise all buffers are released and re-created with extra
+// head-room. Buffer contents are not preserved.
+static void AllocDPMem(unsigned uLengthA, unsigned uLengthB)
+    {
+    // Number of letter slots kept per column
+    const unsigned uLetterCount = 20;
+
+    // Longest prefix length that must be addressable
+    unsigned uNewLength = uLengthB;
+    if (uLengthA > uLengthB)
+        uNewLength = uLengthA;
+    ++uNewLength;
+    if (uNewLength < DPM.uLength)
+        return;
+
+    // Pad by 256, then step up to a multiple of 32 (a value that is already
+    // a multiple of 32 still gains a full 32).
+    uNewLength += 256;
+    uNewLength = uNewLength + 32 - uNewLength%32;
+
+    // Release the previous generation of buffers, if there is one
+    const unsigned uPrevLength = DPM.uLength;
+    if (0 != uPrevLength)
+        {
+        for (unsigned uRow = 0; uRow != uPrevLength; ++uRow)
+            {
+            delete[] DPM.TraceBack[uRow];
+            delete[] DPM.FreqsA[uRow];
+            delete[] DPM.SortOrderA[uRow];
+            }
+        for (unsigned uLetter = 0; uLetter != uLetterCount; ++uLetter)
+            delete[] DPM.ScoreMxB[uLetter];
+
+        delete[] DPM.MPrev;
+        delete[] DPM.MCurr;
+        delete[] DPM.MWork;
+        delete[] DPM.DPrev;
+        delete[] DPM.DCurr;
+        delete[] DPM.DWork;
+        delete[] DPM.uDeletePos;
+        delete[] DPM.GapOpenA;
+        delete[] DPM.GapOpenB;
+        delete[] DPM.GapCloseA;
+        delete[] DPM.GapCloseB;
+        delete[] DPM.SortOrderA;
+        delete[] DPM.FreqsA;
+        delete[] DPM.ScoreMxB;
+        delete[] DPM.TraceBack;
+#if OCC
+        delete[] DPM.OccA;
+        delete[] DPM.OccB;
+#endif
+        }
+
+    DPM.uLength = uNewLength;
+
+    // Flat per-position arrays
+    DPM.GapOpenA = new SCORE[uNewLength];
+    DPM.GapOpenB = new SCORE[uNewLength];
+    DPM.GapCloseA = new SCORE[uNewLength];
+    DPM.GapCloseB = new SCORE[uNewLength];
+#if OCC
+    DPM.OccA = new FCOUNT[uNewLength];
+    DPM.OccB = new FCOUNT[uNewLength];
+#endif
+
+    // Row-pointer tables and DP rows
+    DPM.SortOrderA = new unsigned*[uNewLength];
+    DPM.FreqsA = new FCOUNT*[uNewLength];
+    DPM.ScoreMxB = new SCORE*[uLetterCount];
+    DPM.MPrev = new SCORE[uNewLength];
+    DPM.MCurr = new SCORE[uNewLength];
+    DPM.MWork = new SCORE[uNewLength];
+
+    DPM.DPrev = new SCORE[uNewLength];
+    DPM.DCurr = new SCORE[uNewLength];
+    DPM.DWork = new SCORE[uNewLength];
+    DPM.uDeletePos = new unsigned[uNewLength];
+
+    DPM.TraceBack = new int*[uNewLength];
+
+    // Second-level arrays
+    for (unsigned uLetter = 0; uLetter != uLetterCount; ++uLetter)
+        DPM.ScoreMxB[uLetter] = new SCORE[uNewLength];
+
+    for (unsigned uRow = 0; uRow != uNewLength; ++uRow)
+        {
+        DPM.SortOrderA[uRow] = new unsigned[uLetterCount];
+        DPM.FreqsA[uRow] = new FCOUNT[uLetterCount];
+        DPM.TraceBack[uRow] = new int[uNewLength];
+        }
+    }
+
+// Global alignment of profile PA (uLengthA columns) with profile PB
+// (uLengthB columns). The traceback matrix is converted to Path by
+// TraceBackToPath and the best score found for the full profiles is returned.
+// A pair of columns is scored as logf(sum over letters of the letter's count
+// in A times its m_AAScores entry in B) minus g_scoreCenter, multiplied by
+// both occupancies when OCC is set; a pair whose sum is zero scores -2.5.
+// Works in the file-static DPM buffers (grown on demand by AllocDPMem), so
+// the function is not reentrant.
+// NOTE(review): the alphabet size 20 is hard-coded throughout.
+SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path)
+ {
+ SetTermGaps(PA, uLengthA);
+ SetTermGaps(PB, uLengthB);
+
+ const unsigned uPrefixCountA = uLengthA + 1;
+ const unsigned uPrefixCountB = uLengthB + 1;
+
+ AllocDPMem(uLengthA, uLengthB);
+
+ // Local aliases for the shared work buffers
+ SCORE *GapOpenA = DPM.GapOpenA;
+ SCORE *GapOpenB = DPM.GapOpenB;
+ SCORE *GapCloseA = DPM.GapCloseA;
+ SCORE *GapCloseB = DPM.GapCloseB;
+
+ unsigned **SortOrderA = DPM.SortOrderA;
+ FCOUNT **FreqsA = DPM.FreqsA;
+ SCORE **ScoreMxB = DPM.ScoreMxB;
+ SCORE *MPrev = DPM.MPrev;
+ SCORE *MCurr = DPM.MCurr;
+ SCORE *MWork = DPM.MWork;
+
+ SCORE *DPrev = DPM.DPrev;
+ SCORE *DCurr = DPM.DCurr;
+ SCORE *DWork = DPM.DWork;
+
+#if OCC
+ FCOUNT *OccA = DPM.OccA;
+ FCOUNT *OccB = DPM.OccB;
+#endif
+
+ unsigned *uDeletePos = DPM.uDeletePos;
+
+ int **TraceBack = DPM.TraceBack;
+
+ // Copy the per-column data of A into the flat work arrays
+ for (unsigned i = 0; i < uLengthA; ++i)
+ {
+ GapOpenA[i] = PA[i].m_scoreGapOpen;
+ GapCloseA[i] = PA[i].m_scoreGapClose;
+#if OCC
+ OccA[i] = PA[i].m_fOcc;
+#endif
+
+ for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+ {
+ SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter];
+ FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter];
+ }
+ }
+
+ // Copy the per-column data of B
+ for (unsigned j = 0; j < uLengthB; ++j)
+ {
+ GapOpenB[j] = PB[j].m_scoreGapOpen;
+ GapCloseB[j] = PB[j].m_scoreGapClose;
+#if OCC
+ OccB[j] = PB[j].m_fOcc;
+#endif
+ }
+
+ // Transpose B's letter scores to [letter][column] so that the inner loops
+ // below walk one letter's scores contiguously
+ for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+ {
+ for (unsigned j = 0; j < uLengthB; ++j)
+ ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter];
+ }
+
+ // Traceback codes written below: 0 is what remains when neither the delete
+ // nor the insert candidate beats the M predecessor, a positive value is
+ // stored when the delete candidate wins and a negative value when the
+ // insert candidate wins (see the asserts in the main loop).
+ // NOTE(review): how TraceBackToPath decodes these values is not visible here.
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ memset(TraceBack[i], 0, uPrefixCountB*sizeof(int));
+
+// Special case for i=0
+ unsigned **ptrSortOrderA = SortOrderA;
+ FCOUNT **ptrFreqsA = FreqsA;
+ assert(ptrSortOrderA == &(SortOrderA[0]));
+ assert(ptrFreqsA == &(FreqsA[0]));
+ TraceBack[0][0] = 0;
+
+ SCORE scoreSum = 0;
+ unsigned *ptrSortOrderAi = SortOrderA[0];
+ const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
+ FCOUNT *ptrFreqsAi = FreqsA[0];
+ // Letters are visited in sort order; the first zero count ends the sum
+ for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+ {
+ const unsigned uLetter = *ptrSortOrderAi;
+ const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+ if (0 == fcLetter)
+ break;
+ scoreSum += fcLetter*ScoreMxB[uLetter][0];
+ }
+ if (0 == scoreSum)
+ MPrev[0] = -2.5;
+ else
+ {
+#if OCC
+ MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0];
+#else
+ MPrev[0] = (logf(scoreSum) - g_scoreCenter);
+#endif
+ }
+
+// D(0,0) is -infinity (requires I->D).
+ DPrev[0] = MINUS_INFINITY;
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ // Only way to get M(0, j) looks like this:
+ // A ----X
+ // B XXXXX
+ // 0 j
+ // So gap-open at j=0, gap-close at j-1.
+ SCORE scoreSum = 0;
+ unsigned *ptrSortOrderAi = SortOrderA[0];
+ const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
+ FCOUNT *ptrFreqsAi = FreqsA[0];
+ for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+ {
+ const unsigned uLetter = *ptrSortOrderAi;
+ const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+ if (0 == fcLetter)
+ break;
+ scoreSum += fcLetter*ScoreMxB[uLetter][j];
+ }
+ // Note that the zero-sum case gets -2.5 without the gap penalties
+ if (0 == scoreSum)
+ MPrev[j] = -2.5;
+ else
+ {
+#if OCC
+ MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] +
+ GapOpenB[0] + GapCloseB[j-1];
+#else
+ MPrev[j] = (logf(scoreSum) - g_scoreCenter) +
+ GapOpenB[0] + GapCloseB[j-1];
+#endif
+ }
+ TraceBack[0][j] = -(int) j;
+
+ // Assume no D->I transitions, then can't be a delete if only
+ // one letter from A.
+ DPrev[j] = MINUS_INFINITY;
+ }
+
+ SCORE IPrev_j_1;
+ // Main recursion: one pass per remaining column i of A
+ for (unsigned i = 1; i < uLengthA; ++i)
+ {
+ ++ptrSortOrderA;
+ ++ptrFreqsA;
+ assert(ptrSortOrderA == &(SortOrderA[i]));
+ assert(ptrFreqsA == &(FreqsA[i]));
+
+ SCORE *ptrMCurr_j = MCurr;
+ memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE));
+ const FCOUNT *FreqsAi = *ptrFreqsA;
+
+ const unsigned *SortOrderAi = *ptrSortOrderA;
+ const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20;
+ const SCORE *ptrMCurrMax = MCurr + uLengthB;
+ // Accumulate, letter by letter, the raw sums for column i of A against
+ // every column of B into MCurr
+ for (const unsigned *ptrSortOrderAi = SortOrderAi;
+ ptrSortOrderAi != ptrSortOrderAiEnd;
+ ++ptrSortOrderAi)
+ {
+ const unsigned uLetter = *ptrSortOrderAi;
+ SCORE *NSBR_Letter = ScoreMxB[uLetter];
+ const FCOUNT fcLetter = FreqsAi[uLetter];
+ if (0 == fcLetter)
+ break;
+ SCORE *ptrNSBR = NSBR_Letter;
+ for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr)
+ *ptrMCurr += fcLetter*(*ptrNSBR++);
+ }
+
+#if OCC
+ const FCOUNT OccAi = OccA[i];
+#endif
+ // Turn the raw sums into column-pair scores
+ for (unsigned j = 0; j < uLengthB; ++j)
+ {
+ if (MCurr[j] == 0)
+ MCurr[j] = -2.5;
+ else
+#if OCC
+ MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j];
+#else
+ MCurr[j] = (logf(MCurr[j]) - g_scoreCenter);
+#endif
+ }
+
+ ptrMCurr_j = MCurr;
+ unsigned *ptrDeletePos = uDeletePos;
+
+ // Special case for j=0
+ // Only way to get M(i, 0) looks like this:
+ // 0 i
+ // A XXXXX
+ // B ----X
+ // So gap-open at i=0, gap-close at i-1.
+ assert(ptrMCurr_j == &(MCurr[0]));
+ *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1];
+
+ ++ptrMCurr_j;
+
+ int *ptrTraceBack_ij = TraceBack[i];
+ *ptrTraceBack_ij++ = (int) i;
+
+ // Delete score for column 0: keep the carried value or open a new gap
+ // here, recording row i as the gap start in the latter case
+ SCORE *ptrMPrev_j = MPrev;
+ SCORE *ptrDPrev = DPrev;
+ SCORE d = *ptrDPrev;
+ SCORE DNew = *ptrMPrev_j + GapOpenA[i];
+ if (DNew > d)
+ {
+ d = DNew;
+ *ptrDeletePos = i;
+ }
+
+ SCORE *ptrDCurr = DCurr;
+
+ assert(ptrDCurr == &(DCurr[0]));
+ *ptrDCurr = d;
+
+ // Can't have an insert if no letters from B
+ IPrev_j_1 = MINUS_INFINITY;
+
+ unsigned uInsertPos = 0;
+ const SCORE scoreGapOpenAi = GapOpenA[i];
+ const SCORE scoreGapCloseAi_1 = GapCloseA[i-1];
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ // Here, MPrev_j is preserved from previous
+ // iteration so with current i,j is M[i-1][j-1]
+ SCORE MPrev_j = *ptrMPrev_j;
+ SCORE INew = MPrev_j + GapOpenB[j];
+ if (INew > IPrev_j_1)
+ {
+ IPrev_j_1 = INew;
+ uInsertPos = j;
+ }
+
+ // Best predecessor of M(i,j): start from M[i-1][j-1], then try the
+ // closing delete and the closing insert
+ SCORE scoreMax = MPrev_j;
+
+ assert(ptrDPrev == &(DPrev[j-1]));
+ SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1;
+ if (scoreD > scoreMax)
+ {
+ scoreMax = scoreD;
+ assert(ptrDeletePos == &(uDeletePos[j-1]));
+ *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos;
+ assert(*ptrTraceBack_ij > 0);
+ }
+ ++ptrDeletePos;
+
+ SCORE scoreI = IPrev_j_1 + GapCloseB[j-1];
+ if (scoreI > scoreMax)
+ {
+ scoreMax = scoreI;
+ *ptrTraceBack_ij = (int) uInsertPos - (int) j;
+ assert(*ptrTraceBack_ij < 0);
+ }
+
+ assert(ptrSortOrderA == &(SortOrderA[i]));
+ assert(ptrFreqsA == &(FreqsA[i]));
+
+ // MCurr[j] already holds the column-pair score; add the best predecessor
+ *ptrMCurr_j += scoreMax;
+ assert(ptrMCurr_j == &(MCurr[j]));
+ ++ptrMCurr_j;
+
+ // Update the delete score for column j (same rule as for column 0)
+ MPrev_j = *(++ptrMPrev_j);
+ assert(ptrDPrev == &(DPrev[j]));
+ SCORE d = *ptrDPrev;
+ SCORE DNew = MPrev_j + scoreGapOpenAi;
+ if (DNew > d)
+ {
+ d = DNew;
+ assert(ptrDeletePos == &uDeletePos[j]);
+ *ptrDeletePos = i;
+ }
+ assert(ptrDCurr + 1 == &(DCurr[j]));
+ *(++ptrDCurr) = d;
+
+ ++ptrTraceBack_ij;
+ }
+
+ // NOTE(review): Rotate is defined elsewhere; presumably it makes the
+ // current row the previous row for the next i -- confirm.
+ Rotate(MPrev, MCurr, MWork);
+ Rotate(DPrev, DCurr, DWork);
+ }
+
+// Special case for i=uLengthA
+ SCORE IPrev = MINUS_INFINITY;
+
+ // NOTE(review): uInsertPos is assigned only inside the loop below. If the
+ // loop never assigns it (for example uLengthB < 2) and the insert branch at
+ // the end were taken, it would be read uninitialized.
+ unsigned uInsertPos;
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ SCORE INew = MPrev[j-1] + GapOpenB[j];
+ if (INew > IPrev)
+ {
+ uInsertPos = j;
+ IPrev = INew;
+ }
+ }
+
+// Special case for i=uLengthA, j=uLengthB
+ SCORE scoreMax = MPrev[uLengthB-1];
+ int iTraceBack = 0;
+
+ SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1];
+ if (scoreD > scoreMax)
+ {
+ scoreMax = scoreD;
+ iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1];
+ }
+
+ SCORE scoreI = IPrev + GapCloseB[uLengthB-1];
+ if (scoreI > scoreMax)
+ {
+ scoreMax = scoreI;
+ iTraceBack = (int) uInsertPos - (int) uLengthB;
+ }
+
+ TraceBack[uLengthA][uLengthB] = iTraceBack;
+
+ TraceBackToPath(TraceBack, uLengthA, uLengthB, Path);
+
+ return scoreMax;
+ }
Added: trunk/packages/muscle/branches/upstream/current/glbalignsimple.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalignsimple.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalignsimple.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,368 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+#if 1 // SINGLE_AFFINE
+
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPI;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBI;
+
+// Format a score for the TRACE listings.
+// Scores below -100000 are shown as a fixed " *" marker, everything else
+// as "%6.1f".
+// Returns a pointer to a string literal or to a function-static buffer;
+// the buffer is overwritten by the next call, so the result must be
+// consumed before calling again.
+static const char *LocalScoreToStr(SCORE s)
+    {
+    static char str[16];
+    if (s < -100000)
+        return " *";
+// Bounded write: a large positive score needs more than 15 characters
+// in %6.1f form and would overrun str with plain sprintf.
+    snprintf(str, sizeof(str), "%6.1f", s);
+    return str;
+    }
+
+// Log one trace-back matrix as a table: a header line with one column
+// per prefix length of B, then one line per prefix length of A.
+// Each header/row label is the prefix length followed by the consensus
+// character of the last profile position in that prefix (blank for the
+// empty prefix). Cells are read through the TBM() macro.
+static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+// Header line
+    Log(" ");
+    for (unsigned uColB = 0; uColB != uPrefixCountB; ++uColB)
+        {
+        char cLabelB = ' ';
+        if (0 != uColB)
+            cLabelB = ConsensusChar(PB[uColB - 1]);
+        Log(" %4u:%c", uColB, cLabelB);
+        }
+    Log("\n");
+
+// One line per prefix of A
+    for (unsigned uRowA = 0; uRowA != uPrefixCountA; ++uRowA)
+        {
+        char cLabelA = ' ';
+        if (0 != uRowA)
+            cLabelA = ConsensusChar(PA[uRowA - 1]);
+        Log("%4u:%c ", uRowA, cLabelA);
+
+        for (unsigned uColB = 0; uColB != uPrefixCountB; ++uColB)
+            Log(" %6c", TBM(uRowA, uColB));
+        Log("\n");
+        }
+    }
+
+// Log one score matrix as a table: a header line with one column per
+// prefix length of B, then one line per prefix length of A.
+// Each header/row label is the prefix length followed by the consensus
+// character of the last profile position in that prefix (blank for the
+// empty prefix). Cells are read through the DPM() macro and formatted
+// with LocalScoreToStr.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+// Header line
+    Log(" ");
+    for (unsigned uColB = 0; uColB != uPrefixCountB; ++uColB)
+        {
+        char cLabelB = ' ';
+        if (0 != uColB)
+            cLabelB = ConsensusChar(PB[uColB - 1]);
+        Log(" %4u:%c", uColB, cLabelB);
+        }
+    Log("\n");
+
+// One line per prefix of A
+    for (unsigned uRowA = 0; uRowA != uPrefixCountA; ++uRowA)
+        {
+        char cLabelA = ' ';
+        if (0 != uRowA)
+            cLabelA = ConsensusChar(PA[uRowA - 1]);
+        Log("%4u:%c ", uRowA, cLabelA);
+
+        for (unsigned uColB = 0; uColB != uPrefixCountB; ++uColB)
+            Log(" %s", LocalScoreToStr(DPM(uRowA, uColB)));
+        Log("\n");
+        }
+    }
+
+// Global alignment of two profiles using full M/D/I dynamic-programming
+// matrices of uPrefixCountA x uPrefixCountB cells each.
+//
+//   PA, uLengthA   first profile and its number of positions (must be > 0)
+//   PB, uLengthB   second profile and its number of positions (must be > 0)
+//   Path           receives the alignment path (cleared first)
+//
+// Returns the best of the three scores at cell (uLengthA, uLengthB), the
+// D and I candidates including the gap-close score of the last position.
+//
+// Memory: if g_bKeepSimpleDP is set, the M/D/I score and trace-back
+// matrices are handed over to g_DPM/g_DPD/g_DPI and g_TBM/g_TBD/g_TBI and
+// are NOT freed here; otherwise they are freed before returning. The
+// letter-score matrix DPL_ is never handed over and is always freed.
+SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+    {
+    assert(uLengthB > 0 && uLengthA > 0);
+
+    SetTermGaps(PA, uLengthA);
+    SetTermGaps(PB, uLengthB);
+
+    const unsigned uPrefixCountA = uLengthA + 1;
+    const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+    const size_t LM = uPrefixCountA*uPrefixCountB;
+    SCORE *DPL_ = new SCORE[LM];
+    SCORE *DPM_ = new SCORE[LM];
+    SCORE *DPD_ = new SCORE[LM];
+    SCORE *DPI_ = new SCORE[LM];
+
+    char *TBM_ = new char[LM];
+    char *TBD_ = new char[LM];
+    char *TBI_ = new char[LM];
+
+// '?' marks trace-back cells that were never written; following one
+// during trace-back ends in Quit("Invalid edge ...").
+    memset(TBM_, '?', LM);
+    memset(TBD_, '?', LM);
+    memset(TBI_, '?', LM);
+
+    DPM(0, 0) = 0;
+    DPD(0, 0) = MINUS_INFINITY;
+    DPI(0, 0) = MINUS_INFINITY;
+
+    DPM(1, 0) = MINUS_INFINITY;
+    DPD(1, 0) = PA[0].m_scoreGapOpen;
+    TBD(1, 0) = 'D';
+    DPI(1, 0) = MINUS_INFINITY;
+
+    DPM(0, 1) = MINUS_INFINITY;
+    DPD(0, 1) = MINUS_INFINITY;
+    DPI(0, 1) = PB[0].m_scoreGapOpen;
+    TBI(0, 1) = 'I';
+
+// Empty prefix of B is special case
+    for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+        {
+        // M=LetterA+LetterB, impossible with empty prefix
+        DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+        // D=LetterA+GapB
+        DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend;
+        TBD(uPrefixLengthA, 0) = 'D';
+
+        // I=GapA+LetterB, impossible with empty prefix
+        DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+        }
+
+// Empty prefix of A is special case
+    for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        {
+        // M=LetterA+LetterB, impossible with empty prefix
+        DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+        // D=LetterA+GapB, impossible with empty prefix
+        DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+
+        // I=GapA+LetterB
+        DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend;
+        TBI(0, uPrefixLengthB) = 'I';
+        }
+
+// Special case to agree with NWFast, no D-I transitions so...
+    DPD(uLengthA, 0) = MINUS_INFINITY;
+//  DPI(0, uLengthB) = MINUS_INFINITY;
+
+// ============
+// Main DP loop
+// ============
+// scoreGapCloseA/B hold the gap-close score of the PREVIOUS position of
+// A/B (MINUS_INFINITY for the first one); they are updated at the end of
+// each iteration.
+    SCORE scoreGapCloseB = MINUS_INFINITY;
+    for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        {
+        const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+        SCORE scoreGapCloseA = MINUS_INFINITY;
+        for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+            {
+            const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+            {
+            // Match M=LetterA+LetterB
+            SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+            DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL;
+
+            SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+            SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+            SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+
+            // Ties are resolved in the order M, D, I
+            SCORE scoreBest;
+            if (scoreMM >= scoreDM && scoreMM >= scoreIM)
+                {
+                scoreBest = scoreMM;
+                TBM(uPrefixLengthA, uPrefixLengthB) = 'M';
+                }
+            else if (scoreDM >= scoreMM && scoreDM >= scoreIM)
+                {
+                scoreBest = scoreDM;
+                TBM(uPrefixLengthA, uPrefixLengthB) = 'D';
+                }
+            else
+                {
+                assert(scoreIM >= scoreMM && scoreIM >= scoreDM);
+                scoreBest = scoreIM;
+                TBM(uPrefixLengthA, uPrefixLengthB) = 'I';
+                }
+            DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
+            }
+
+            {
+            // Delete D=LetterA+GapB
+            SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+              PA[uPrefixLengthA-1].m_scoreGapOpen;
+            SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend;
+
+            SCORE scoreBest;
+            if (scoreMD >= scoreDD)
+                {
+                scoreBest = scoreMD;
+                TBD(uPrefixLengthA, uPrefixLengthB) = 'M';
+                }
+            else
+                {
+                assert(scoreDD >= scoreMD);
+                scoreBest = scoreDD;
+                TBD(uPrefixLengthA, uPrefixLengthB) = 'D';
+                }
+            DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+            }
+
+            // Insert I=GapA+LetterB
+            {
+            SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+              PB[uPrefixLengthB - 1].m_scoreGapOpen;
+            SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend;
+
+            SCORE scoreBest;
+            if (scoreMI >= scoreII)
+                {
+                scoreBest = scoreMI;
+                TBI(uPrefixLengthA, uPrefixLengthB) = 'M';
+                }
+            else
+                {
+                assert(scoreII > scoreMI);
+                scoreBest = scoreII;
+                TBI(uPrefixLengthA, uPrefixLengthB) = 'I';
+                }
+            DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+            }
+
+            scoreGapCloseA = PPA.m_scoreGapClose;
+            }
+        scoreGapCloseB = PPB.m_scoreGapClose;
+        }
+
+#if TRACE
+    Log("\n");
+    Log("Simple DPL:\n");
+    ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple DPM:\n");
+    ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple DPD:\n");
+    ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple DPI:\n");
+    ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple TBM:\n");
+    ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple TBD:\n");
+    ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("\n");
+    Log("Simple TBI:\n");
+    ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+// Trace-back
+// ==========
+    Path.Clear();
+
+// Find last edge
+    SCORE M = DPM(uLengthA, uLengthB);
+    SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose;
+    SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose;
+    char cEdgeType = '?';
+
+    SCORE BestScore = MINUS_INFINITY;
+    if (M >= D && M >= I)
+        {
+        cEdgeType = 'M';
+        BestScore = M;
+        }
+    else if (D >= M && D >= I)
+        {
+        cEdgeType = 'D';
+        BestScore = D;
+        }
+    else
+        {
+        assert(I >= M && I >= D);
+        cEdgeType = 'I';
+        BestScore = I;
+        }
+
+#if TRACE
+    Log("Simple: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", M, D, I, cEdgeType);
+#endif
+
+// Walk back from (uLengthA, uLengthB) to (0, 0), prepending one edge per
+// step. The trace-back cell of the current state names the state of the
+// preceding edge.
+    unsigned PLA = uLengthA;
+    unsigned PLB = uLengthB;
+    for (;;)
+        {
+        PWEdge Edge;
+        Edge.cType = cEdgeType;
+        Edge.uPrefixLengthA = PLA;
+        Edge.uPrefixLengthB = PLB;
+#if TRACE
+        Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB);
+#endif
+        Path.PrependEdge(Edge);
+
+        switch (cEdgeType)
+            {
+        case 'M':
+            assert(PLA > 0);
+            assert(PLB > 0);
+            cEdgeType = TBM(PLA, PLB);
+            --PLA;
+            --PLB;
+            break;
+
+        case 'D':
+            assert(PLA > 0);
+            cEdgeType = TBD(PLA, PLB);
+            --PLA;
+            break;
+
+        case 'I':
+            assert(PLB > 0);
+            cEdgeType = TBI(PLA, PLB);
+            --PLB;
+            break;
+
+        default:
+            Quit("Invalid edge %c", cEdgeType);
+            }
+        if (0 == PLA && 0 == PLB)
+            break;
+        }
+    Path.Validate();
+
+//  SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path);
+
+#if TRACE
+    SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path);
+    Path.LogMe();
+// Two Log calls: LocalScoreToStr returns a shared static buffer, so two
+// results cannot be passed to the same call.
+    Log("Score = %s", LocalScoreToStr(BestScore));
+    Log(" Path = %s\n", LocalScoreToStr(scorePath));
+#endif
+
+// DPL_ is only read by the TRACE listing above and is never published
+// through a global, so it must be released on both paths.
+    delete[] DPL_;
+
+    if (g_bKeepSimpleDP)
+        {
+        // Ownership of these matrices passes to the globals
+        g_DPM = DPM_;
+        g_DPD = DPD_;
+        g_DPI = DPI_;
+
+        g_TBM = TBM_;
+        g_TBD = TBD_;
+        g_TBI = TBI_;
+        }
+    else
+        {
+        delete[] DPM_;
+        delete[] DPD_;
+        delete[] DPI_;
+
+        delete[] TBM_;
+        delete[] TBD_;
+        delete[] TBI_;
+        }
+
+    return BestScore;
+    }
+
+#endif // SINGLE_AFFINE
Added: trunk/packages/muscle/branches/upstream/current/glbalignsp.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalignsp.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalignsp.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,374 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+
+// Scratch buffers for GlobalAlignSP, allocated by AllocDPMem and reused
+// across calls.
+struct DP_MEMORY
+ {
+ // Number of entries allocated per buffer; 0 means nothing is allocated.
+ unsigned uLength;
+ // Per-position copies of m_scoreGapOpen / m_scoreGapClose from PA and PB.
+ SCORE *GapOpenA;
+ SCORE *GapOpenB;
+ SCORE *GapCloseA;
+ SCORE *GapCloseB;
+ // M-state score rows; the three pointers are passed to Rotate() after
+ // each row of A has been processed.
+ SCORE *MPrev;
+ SCORE *MCurr;
+ SCORE *MWork;
+ // D-state score rows, rotated the same way as the M rows.
+ SCORE *DPrev;
+ SCORE *DCurr;
+ SCORE *DWork;
+ // 20 rows (one per letter) of uLength scores:
+ // ScoreMxB[letter][j] = PB[j].m_AAScores[letter].
+ SCORE **ScoreMxB;
+ // uLength rows of 20: copy of PA[i].m_uSortOrder.
+ unsigned **SortOrderA;
+ // Per column j: the row i stored when opening a new delete at (i, j)
+ // scored better than the existing D value.
+ unsigned *uDeletePos;
+ // uLength rows of 20: copy of PA[i].m_fcCounts.
+ FCOUNT **FreqsA;
+ // uLength x uLength trace-back cells: 0, a positive row offset
+ // (i - uDeletePos) or a negative column offset (uInsertPos - j).
+ int **TraceBack;
+ };
+
+// File-local instance; static storage, so uLength starts at 0.
+static struct DP_MEMORY DPM;
+
+// Make sure the file-static DPM buffers can hold an alignment of two
+// profiles with uLengthA and uLengthB positions.
+// Does nothing if the current buffers are already large enough; otherwise
+// frees them and allocates new ones with some head-room. Buffer contents
+// are not preserved and new buffers are not initialized.
+static void AllocDPMem(unsigned uLengthA, unsigned uLengthB)
+    {
+// Letters per profile position (size of the per-letter tables)
+    const unsigned uAlphaSize = 20;
+
+// Max prefix length
+    unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1;
+
+// Buffers holding exactly uLength entries are big enough, so only a
+// strictly larger request forces a reallocation.
+    if (uLength <= DPM.uLength)
+        return;
+
+// Add 256 to allow for future expansion and
+// round up to next multiple of 32.
+    uLength += 256;
+    uLength += 32 - uLength%32;
+
+    const unsigned uOldLength = DPM.uLength;
+    if (uOldLength > 0)
+        {
+        for (unsigned i = 0; i < uOldLength; ++i)
+            {
+            delete[] DPM.TraceBack[i];
+            delete[] DPM.FreqsA[i];
+            delete[] DPM.SortOrderA[i];
+            }
+        for (unsigned n = 0; n < uAlphaSize; ++n)
+            delete[] DPM.ScoreMxB[n];
+
+        delete[] DPM.MPrev;
+        delete[] DPM.MCurr;
+        delete[] DPM.MWork;
+        delete[] DPM.DPrev;
+        delete[] DPM.DCurr;
+        delete[] DPM.DWork;
+        delete[] DPM.uDeletePos;
+        delete[] DPM.GapOpenA;
+        delete[] DPM.GapOpenB;
+        delete[] DPM.GapCloseA;
+        delete[] DPM.GapCloseB;
+        delete[] DPM.SortOrderA;
+        delete[] DPM.FreqsA;
+        delete[] DPM.ScoreMxB;
+        delete[] DPM.TraceBack;
+        }
+
+    DPM.uLength = uLength;
+
+    DPM.GapOpenA = new SCORE[uLength];
+    DPM.GapOpenB = new SCORE[uLength];
+    DPM.GapCloseA = new SCORE[uLength];
+    DPM.GapCloseB = new SCORE[uLength];
+
+    DPM.SortOrderA = new unsigned*[uLength];
+    DPM.FreqsA = new FCOUNT*[uLength];
+    DPM.ScoreMxB = new SCORE*[uAlphaSize];
+    DPM.MPrev = new SCORE[uLength];
+    DPM.MCurr = new SCORE[uLength];
+    DPM.MWork = new SCORE[uLength];
+
+    DPM.DPrev = new SCORE[uLength];
+    DPM.DCurr = new SCORE[uLength];
+    DPM.DWork = new SCORE[uLength];
+    DPM.uDeletePos = new unsigned[uLength];
+
+    DPM.TraceBack = new int*[uLength];
+
+    for (unsigned uLetter = 0; uLetter < uAlphaSize; ++uLetter)
+        DPM.ScoreMxB[uLetter] = new SCORE[uLength];
+
+// The trace-back matrix is square: uLength rows of uLength cells
+    for (unsigned i = 0; i < uLength; ++i)
+        {
+        DPM.SortOrderA[i] = new unsigned[uAlphaSize];
+        DPM.FreqsA[i] = new FCOUNT[uAlphaSize];
+        DPM.TraceBack[i] = new int[uLength];
+        }
+    }
+
+// Global alignment of two profiles over a 20-letter alphabet, keeping
+// only the previous and current rows of the M and D scores plus a full
+// integer trace-back matrix.
+//
+//   PA, uLengthA   first profile and its number of positions (must be > 0)
+//   PB, uLengthB   second profile and its number of positions (must be > 0)
+//   Path           filled in by TraceBackToPath from the trace-back matrix
+//
+// Returns the best score for the final cell.
+//
+// Here i and j are 0-based positions in A and B. A trace-back cell holds
+// 0 (nothing better than the diagonal M score was found), a positive
+// value i - uDeletePos[...] when a delete was chosen, or a negative
+// value uInsertPos - j when an insert was chosen.
+// Works on the file-static DPM buffers, so it is not reentrant.
+SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+    {
+// MPrev[uLengthB-1] and GapCloseA[uLengthA-1] are read below, so both
+// profiles must be non-empty (same precondition as GlobalAlignSimple).
+    assert(uLengthB > 0 && uLengthA > 0);
+
+    const unsigned uPrefixCountA = uLengthA + 1;
+    const unsigned uPrefixCountB = uLengthB + 1;
+
+    AllocDPMem(uLengthA, uLengthB);
+
+    SCORE *GapOpenA = DPM.GapOpenA;
+    SCORE *GapOpenB = DPM.GapOpenB;
+    SCORE *GapCloseA = DPM.GapCloseA;
+    SCORE *GapCloseB = DPM.GapCloseB;
+
+    unsigned **SortOrderA = DPM.SortOrderA;
+    FCOUNT **FreqsA = DPM.FreqsA;
+    SCORE **ScoreMxB = DPM.ScoreMxB;
+    SCORE *MPrev = DPM.MPrev;
+    SCORE *MCurr = DPM.MCurr;
+    SCORE *MWork = DPM.MWork;
+
+    SCORE *DPrev = DPM.DPrev;
+    SCORE *DCurr = DPM.DCurr;
+    SCORE *DWork = DPM.DWork;
+    unsigned *uDeletePos = DPM.uDeletePos;
+
+    int **TraceBack = DPM.TraceBack;
+
+// Copy the per-position data of both profiles into flat arrays
+    for (unsigned i = 0; i < uLengthA; ++i)
+        {
+        GapOpenA[i] = PA[i].m_scoreGapOpen;
+        GapCloseA[i] = PA[i].m_scoreGapClose;
+
+        for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+            {
+            SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter];
+            FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter];
+            }
+        }
+
+    for (unsigned j = 0; j < uLengthB; ++j)
+        {
+        GapOpenB[j] = PB[j].m_scoreGapOpen;
+        GapCloseB[j] = PB[j].m_scoreGapClose;
+        }
+
+    for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+        {
+        for (unsigned j = 0; j < uLengthB; ++j)
+            ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter];
+        }
+
+    for (unsigned i = 0; i < uPrefixCountA; ++i)
+        memset(TraceBack[i], 0, uPrefixCountB*sizeof(int));
+
+// Special case for i=0
+    unsigned **ptrSortOrderA = SortOrderA;
+    FCOUNT **ptrFreqsA = FreqsA;
+    assert(ptrSortOrderA == &(SortOrderA[0]));
+    assert(ptrFreqsA == &(FreqsA[0]));
+    TraceBack[0][0] = 0;
+
+// Letter score of A[0] against B[0]: letters are visited in sort order
+// and the loop stops at the first letter with a zero count.
+    SCORE scoreSum = 0;
+    unsigned *ptrSortOrderAi = SortOrderA[0];
+    const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
+    FCOUNT *ptrFreqsAi = FreqsA[0];
+    for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+        {
+        const unsigned uLetter = *ptrSortOrderAi;
+        const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+        if (0 == fcLetter)
+            break;
+        scoreSum += fcLetter*ScoreMxB[uLetter][0];
+        }
+    MPrev[0] = scoreSum - g_scoreCenter;
+
+// D(0,0) is -infinity (requires I->D).
+    DPrev[0] = MINUS_INFINITY;
+
+    for (unsigned j = 1; j < uLengthB; ++j)
+        {
+        // Only way to get M(0, j) looks like this:
+        //      A ----X
+        //      B XXXXX
+        //        0   j
+        // So gap-open at j=0, gap-close at j-1.
+        SCORE scoreSum = 0;
+        unsigned *ptrSortOrderAi = SortOrderA[0];
+        const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20;
+        FCOUNT *ptrFreqsAi = FreqsA[0];
+        for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+            {
+            const unsigned uLetter = *ptrSortOrderAi;
+            const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+            if (0 == fcLetter)
+                break;
+            scoreSum += fcLetter*ScoreMxB[uLetter][j];
+            }
+        MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1];
+        TraceBack[0][j] = -(int) j;
+
+        // Assume no D->I transitions, then can't be a delete if only
+        // one letter from A.
+        DPrev[j] = MINUS_INFINITY;
+        }
+
+    SCORE IPrev_j_1;
+    for (unsigned i = 1; i < uLengthA; ++i)
+        {
+        ++ptrSortOrderA;
+        ++ptrFreqsA;
+        assert(ptrSortOrderA == &(SortOrderA[i]));
+        assert(ptrFreqsA == &(FreqsA[i]));
+
+        // First pass over the row: MCurr[j] = letter score of A[i] vs B[j]
+        SCORE *ptrMCurr_j = MCurr;
+        memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE));
+        const FCOUNT *FreqsAi = *ptrFreqsA;
+
+        const unsigned *SortOrderAi = *ptrSortOrderA;
+        const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20;
+        const SCORE *ptrMCurrMax = MCurr + uLengthB;
+        for (const unsigned *ptrSortOrderAi = SortOrderAi;
+          ptrSortOrderAi != ptrSortOrderAiEnd;
+          ++ptrSortOrderAi)
+            {
+            const unsigned uLetter = *ptrSortOrderAi;
+            SCORE *NSBR_Letter = ScoreMxB[uLetter];
+            const FCOUNT fcLetter = FreqsAi[uLetter];
+            if (0 == fcLetter)
+                break;
+            SCORE *ptrNSBR = NSBR_Letter;
+            for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr)
+                *ptrMCurr += fcLetter*(*ptrNSBR++);
+            }
+
+        for (unsigned j = 0; j < uLengthB; ++j)
+            MCurr[j] -= g_scoreCenter;
+
+        ptrMCurr_j = MCurr;
+        unsigned *ptrDeletePos = uDeletePos;
+
+        // Special case for j=0
+        // Only way to get M(i, 0) looks like this:
+        //        0   i
+        //      A XXXXX
+        //      B ----X
+        // So gap-open at i=0, gap-close at i-1.
+        assert(ptrMCurr_j == &(MCurr[0]));
+        *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1];
+
+        ++ptrMCurr_j;
+
+        int *ptrTraceBack_ij = TraceBack[i];
+        *ptrTraceBack_ij++ = (int) i;
+
+        SCORE *ptrMPrev_j = MPrev;
+        SCORE *ptrDPrev = DPrev;
+        SCORE d = *ptrDPrev;
+        SCORE DNew = *ptrMPrev_j + GapOpenA[i];
+        if (DNew > d)
+            {
+            d = DNew;
+            *ptrDeletePos = i;
+            }
+
+        SCORE *ptrDCurr = DCurr;
+
+        assert(ptrDCurr == &(DCurr[0]));
+        *ptrDCurr = d;
+
+        // Can't have an insert if no letters from B
+        IPrev_j_1 = MINUS_INFINITY;
+
+        // Initialized so that the read below is defined even if no INew
+        // ever beats IPrev_j_1.
+        unsigned uInsertPos = 0;
+        const SCORE scoreGapOpenAi = GapOpenA[i];
+        const SCORE scoreGapCloseAi_1 = GapCloseA[i-1];
+
+        for (unsigned j = 1; j < uLengthB; ++j)
+            {
+            // Here, MPrev_j is preserved from previous
+            // iteration so with current i,j is M[i-1][j-1]
+            SCORE MPrev_j = *ptrMPrev_j;
+            SCORE INew = MPrev_j + GapOpenB[j];
+            if (INew > IPrev_j_1)
+                {
+                IPrev_j_1 = INew;
+                uInsertPos = j;
+                }
+
+            SCORE scoreMax = MPrev_j;
+
+            assert(ptrDPrev == &(DPrev[j-1]));
+            SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1;
+            if (scoreD > scoreMax)
+                {
+                scoreMax = scoreD;
+                assert(ptrDeletePos == &(uDeletePos[j-1]));
+                *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos;
+                assert(*ptrTraceBack_ij > 0);
+                }
+            ++ptrDeletePos;
+
+            SCORE scoreI = IPrev_j_1 + GapCloseB[j-1];
+            if (scoreI > scoreMax)
+                {
+                scoreMax = scoreI;
+                *ptrTraceBack_ij = (int) uInsertPos - (int) j;
+                assert(*ptrTraceBack_ij < 0);
+                }
+
+            assert(ptrSortOrderA == &(SortOrderA[i]));
+            assert(ptrFreqsA == &(FreqsA[i]));
+
+            *ptrMCurr_j += scoreMax;
+            assert(ptrMCurr_j == &(MCurr[j]));
+            ++ptrMCurr_j;
+
+            // Advance to M[i-1][j] and update the D state for column j
+            MPrev_j = *(++ptrMPrev_j);
+            assert(ptrDPrev == &(DPrev[j]));
+            SCORE d = *ptrDPrev;
+            SCORE DNew = MPrev_j + scoreGapOpenAi;
+            if (DNew > d)
+                {
+                d = DNew;
+                assert(ptrDeletePos == &uDeletePos[j]);
+                *ptrDeletePos = i;
+                }
+            assert(ptrDCurr + 1 == &(DCurr[j]));
+            *(++ptrDCurr) = d;
+
+            ++ptrTraceBack_ij;
+            }
+
+        Rotate(MPrev, MCurr, MWork);
+        Rotate(DPrev, DCurr, DWork);
+        }
+
+// Special case for i=uLengthA
+    SCORE IPrev = MINUS_INFINITY;
+
+// Initialized: the loop below does not run when uLengthB is 1
+    unsigned uInsertPos = 0;
+
+    for (unsigned j = 1; j < uLengthB; ++j)
+        {
+        SCORE INew = MPrev[j-1] + GapOpenB[j];
+        if (INew > IPrev)
+            {
+            uInsertPos = j;
+            IPrev = INew;
+            }
+        }
+
+// Special case for i=uLengthA, j=uLengthB
+    SCORE scoreMax = MPrev[uLengthB-1];
+    int iTraceBack = 0;
+
+    SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1];
+    if (scoreD > scoreMax)
+        {
+        scoreMax = scoreD;
+        iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1];
+        }
+
+    SCORE scoreI = IPrev + GapCloseB[uLengthB-1];
+    if (scoreI > scoreMax)
+        {
+        scoreMax = scoreI;
+        iTraceBack = (int) uInsertPos - (int) uLengthB;
+        }
+
+    TraceBack[uLengthA][uLengthB] = iTraceBack;
+
+    TraceBackToPath(TraceBack, uLengthA, uLengthB, Path);
+
+    return scoreMax;
+    }
Added: trunk/packages/muscle/branches/upstream/current/glbalignspn.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalignspn.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalignspn.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,409 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+
+// Scratch buffers for GlobalAlignSPN, allocated by AllocDPMem, released
+// by FreeDPMemSPN and reused across calls.
+struct DP_MEMORY
+ {
+ // Number of entries allocated per buffer; 0 means nothing is allocated.
+ unsigned uLength;
+ // Per-position copies of m_scoreGapOpen / m_scoreGapClose from PA and PB.
+ SCORE *GapOpenA;
+ SCORE *GapOpenB;
+ SCORE *GapCloseA;
+ SCORE *GapCloseB;
+ // M-state score rows; the three pointers are passed to Rotate() after
+ // each row of A has been processed.
+ SCORE *MPrev;
+ SCORE *MCurr;
+ SCORE *MWork;
+ // D-state score rows, rotated the same way as the M rows.
+ SCORE *DPrev;
+ SCORE *DCurr;
+ SCORE *DWork;
+ // 4 rows (one per letter) of uLength scores:
+ // ScoreMxB[letter][j] = PB[j].m_AAScores[letter].
+ SCORE **ScoreMxB;
+ // uLength rows of 4: copy of PA[i].m_uSortOrder.
+ unsigned **SortOrderA;
+ // Per column j: the row i stored when opening a new delete at (i, j)
+ // scored better than the existing D value.
+ unsigned *uDeletePos;
+ // uLength rows of 4: copy of PA[i].m_fcCounts.
+ FCOUNT **FreqsA;
+ // uLength x uLength trace-back cells: 0, a positive row offset
+ // (i - uDeletePos) or a negative column offset (uInsertPos - j).
+ int **TraceBack;
+ };
+
+// File-local instance; static storage, so uLength starts at 0.
+static struct DP_MEMORY DPM;
+
+// Release every buffer held in the file-static DPM structure.
+// Safe to call when nothing is allocated and safe to call repeatedly:
+// DPM.uLength is reset to 0, which both this function and AllocDPMem
+// treat as "nothing allocated".
+void FreeDPMemSPN()
+    {
+    const unsigned uOldLength = DPM.uLength;
+    if (0 == uOldLength)
+        return;
+
+    for (unsigned i = 0; i < uOldLength; ++i)
+        {
+        delete[] DPM.TraceBack[i];
+        delete[] DPM.FreqsA[i];
+        delete[] DPM.SortOrderA[i];
+        }
+    for (unsigned n = 0; n < 4; ++n)
+        delete[] DPM.ScoreMxB[n];
+
+    delete[] DPM.MPrev;
+    delete[] DPM.MCurr;
+    delete[] DPM.MWork;
+    delete[] DPM.DPrev;
+    delete[] DPM.DCurr;
+    delete[] DPM.DWork;
+    delete[] DPM.uDeletePos;
+    delete[] DPM.GapOpenA;
+    delete[] DPM.GapOpenB;
+    delete[] DPM.GapCloseA;
+    delete[] DPM.GapCloseB;
+    delete[] DPM.SortOrderA;
+    delete[] DPM.FreqsA;
+    delete[] DPM.ScoreMxB;
+    delete[] DPM.TraceBack;
+
+// Without this reset a later AllocDPMem would either free the same
+// buffers again or return early and keep using freed memory.
+    DPM.uLength = 0;
+    }
+
+// Make sure the file-static DPM buffers can hold an alignment of two
+// profiles with uLengthA and uLengthB positions.
+// Does nothing if the current buffers are already large enough; otherwise
+// frees them and allocates new ones with some head-room. Buffer contents
+// are not preserved and new buffers are not initialized.
+static void AllocDPMem(unsigned uLengthA, unsigned uLengthB)
+    {
+// Letters per profile position (size of the per-letter tables)
+    const unsigned uAlphaSize = 4;
+
+// Max prefix length
+    unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1;
+
+// Buffers holding exactly uLength entries are big enough, so only a
+// strictly larger request forces a reallocation.
+    if (uLength <= DPM.uLength)
+        return;
+
+// Add 256 to allow for future expansion and
+// round up to next multiple of 32.
+    uLength += 256;
+    uLength += 32 - uLength%32;
+
+// Release the old buffers (no-op when nothing is allocated yet)
+    FreeDPMemSPN();
+
+    DPM.uLength = uLength;
+
+    DPM.GapOpenA = new SCORE[uLength];
+    DPM.GapOpenB = new SCORE[uLength];
+    DPM.GapCloseA = new SCORE[uLength];
+    DPM.GapCloseB = new SCORE[uLength];
+
+    DPM.SortOrderA = new unsigned*[uLength];
+    DPM.FreqsA = new FCOUNT*[uLength];
+    DPM.ScoreMxB = new SCORE*[uAlphaSize];
+    DPM.MPrev = new SCORE[uLength];
+    DPM.MCurr = new SCORE[uLength];
+    DPM.MWork = new SCORE[uLength];
+
+    DPM.DPrev = new SCORE[uLength];
+    DPM.DCurr = new SCORE[uLength];
+    DPM.DWork = new SCORE[uLength];
+    DPM.uDeletePos = new unsigned[uLength];
+
+    DPM.TraceBack = new int*[uLength];
+
+    for (unsigned uLetter = 0; uLetter < uAlphaSize; ++uLetter)
+        DPM.ScoreMxB[uLetter] = new SCORE[uLength];
+
+// The trace-back matrix is square: uLength rows of uLength cells
+    for (unsigned i = 0; i < uLength; ++i)
+        {
+        DPM.SortOrderA[i] = new unsigned[uAlphaSize];
+        DPM.FreqsA[i] = new FCOUNT[uAlphaSize];
+        DPM.TraceBack[i] = new int[uLength];
+        }
+    }
+
+// Global alignment of two nucleotide profiles (4-letter alphabet),
+// keeping only the previous and current rows of the M and D scores plus
+// a full integer trace-back matrix.
+//
+//   PA, uLengthA   first profile and its number of positions (must be > 0)
+//   PB, uLengthB   second profile and its number of positions (must be > 0)
+//   Path           filled in by TraceBackToPath from the trace-back matrix
+//
+// Returns the best score for the final cell. Calls Quit() if the global
+// alphabet g_Alpha is neither DNA nor RNA.
+//
+// Here i and j are 0-based positions in A and B. A trace-back cell holds
+// 0 (nothing better than the diagonal M score was found), a positive
+// value i - uDeletePos[...] when a delete was chosen, or a negative
+// value uInsertPos - j when an insert was chosen.
+// Works on the file-static DPM buffers, so it is not reentrant.
+SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+    {
+// Both nucleotide alphabets are accepted. The previous test
+// (DNA != g_Alpha || RNA == g_Alpha) rejected RNA although the message
+// says "must be nucleo".
+    if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)
+        Quit("GlobalAlignSPN: must be nucleo");
+
+// MPrev[uLengthB-1] and GapCloseA[uLengthA-1] are read below, so both
+// profiles must be non-empty.
+    assert(uLengthB > 0 && uLengthA > 0);
+
+    const unsigned uPrefixCountA = uLengthA + 1;
+    const unsigned uPrefixCountB = uLengthB + 1;
+
+    AllocDPMem(uLengthA, uLengthB);
+
+    SCORE *GapOpenA = DPM.GapOpenA;
+    SCORE *GapOpenB = DPM.GapOpenB;
+    SCORE *GapCloseA = DPM.GapCloseA;
+    SCORE *GapCloseB = DPM.GapCloseB;
+
+    unsigned **SortOrderA = DPM.SortOrderA;
+    FCOUNT **FreqsA = DPM.FreqsA;
+    SCORE **ScoreMxB = DPM.ScoreMxB;
+    SCORE *MPrev = DPM.MPrev;
+    SCORE *MCurr = DPM.MCurr;
+    SCORE *MWork = DPM.MWork;
+
+    SCORE *DPrev = DPM.DPrev;
+    SCORE *DCurr = DPM.DCurr;
+    SCORE *DWork = DPM.DWork;
+    unsigned *uDeletePos = DPM.uDeletePos;
+
+    int **TraceBack = DPM.TraceBack;
+
+// Copy the per-position data of both profiles into flat arrays
+    for (unsigned i = 0; i < uLengthA; ++i)
+        {
+        GapOpenA[i] = PA[i].m_scoreGapOpen;
+        GapCloseA[i] = PA[i].m_scoreGapClose;
+
+        for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
+            {
+            SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter];
+            FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter];
+            }
+        }
+
+    for (unsigned j = 0; j < uLengthB; ++j)
+        {
+        GapOpenB[j] = PB[j].m_scoreGapOpen;
+        GapCloseB[j] = PB[j].m_scoreGapClose;
+        }
+
+    for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
+        {
+        for (unsigned j = 0; j < uLengthB; ++j)
+            ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter];
+        }
+
+    for (unsigned i = 0; i < uPrefixCountA; ++i)
+        memset(TraceBack[i], 0, uPrefixCountB*sizeof(int));
+
+// Special case for i=0
+    unsigned **ptrSortOrderA = SortOrderA;
+    FCOUNT **ptrFreqsA = FreqsA;
+    assert(ptrSortOrderA == &(SortOrderA[0]));
+    assert(ptrFreqsA == &(FreqsA[0]));
+    TraceBack[0][0] = 0;
+
+// Letter score of A[0] against B[0]: letters are visited in sort order
+// and the loop stops at the first letter with a zero count.
+    SCORE scoreSum = 0;
+    unsigned *ptrSortOrderAi = SortOrderA[0];
+    const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4;
+    FCOUNT *ptrFreqsAi = FreqsA[0];
+    for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+        {
+        const unsigned uLetter = *ptrSortOrderAi;
+        const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+        if (0 == fcLetter)
+            break;
+        scoreSum += fcLetter*ScoreMxB[uLetter][0];
+        }
+    MPrev[0] = scoreSum - g_scoreCenter;
+
+// D(0,0) is -infinity (requires I->D).
+    DPrev[0] = MINUS_INFINITY;
+
+    for (unsigned j = 1; j < uLengthB; ++j)
+        {
+        // Only way to get M(0, j) looks like this:
+        //      A ----X
+        //      B XXXXX
+        //        0   j
+        // So gap-open at j=0, gap-close at j-1.
+        SCORE scoreSum = 0;
+        unsigned *ptrSortOrderAi = SortOrderA[0];
+        const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4;
+        FCOUNT *ptrFreqsAi = FreqsA[0];
+        for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi)
+            {
+            const unsigned uLetter = *ptrSortOrderAi;
+            const FCOUNT fcLetter = ptrFreqsAi[uLetter];
+            if (0 == fcLetter)
+                break;
+            scoreSum += fcLetter*ScoreMxB[uLetter][j];
+            }
+        MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1];
+        TraceBack[0][j] = -(int) j;
+
+        // Assume no D->I transitions, then can't be a delete if only
+        // one letter from A.
+        DPrev[j] = MINUS_INFINITY;
+        }
+
+    SCORE IPrev_j_1;
+    for (unsigned i = 1; i < uLengthA; ++i)
+        {
+        ++ptrSortOrderA;
+        ++ptrFreqsA;
+        assert(ptrSortOrderA == &(SortOrderA[i]));
+        assert(ptrFreqsA == &(FreqsA[i]));
+
+        // First pass over the row: MCurr[j] = letter score of A[i] vs B[j]
+        SCORE *ptrMCurr_j = MCurr;
+        memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE));
+        const FCOUNT *FreqsAi = *ptrFreqsA;
+
+        const unsigned *SortOrderAi = *ptrSortOrderA;
+        const unsigned *ptrSortOrderAiEnd = SortOrderAi + 4;
+        const SCORE *ptrMCurrMax = MCurr + uLengthB;
+        for (const unsigned *ptrSortOrderAi = SortOrderAi;
+          ptrSortOrderAi != ptrSortOrderAiEnd;
+          ++ptrSortOrderAi)
+            {
+            const unsigned uLetter = *ptrSortOrderAi;
+            SCORE *NSBR_Letter = ScoreMxB[uLetter];
+            const FCOUNT fcLetter = FreqsAi[uLetter];
+            if (0 == fcLetter)
+                break;
+            SCORE *ptrNSBR = NSBR_Letter;
+            for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr)
+                *ptrMCurr += fcLetter*(*ptrNSBR++);
+            }
+
+        for (unsigned j = 0; j < uLengthB; ++j)
+            MCurr[j] -= g_scoreCenter;
+
+        ptrMCurr_j = MCurr;
+        unsigned *ptrDeletePos = uDeletePos;
+
+        // Special case for j=0
+        // Only way to get M(i, 0) looks like this:
+        //        0   i
+        //      A XXXXX
+        //      B ----X
+        // So gap-open at i=0, gap-close at i-1.
+        assert(ptrMCurr_j == &(MCurr[0]));
+        *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1];
+
+        ++ptrMCurr_j;
+
+        int *ptrTraceBack_ij = TraceBack[i];
+        *ptrTraceBack_ij++ = (int) i;
+
+        SCORE *ptrMPrev_j = MPrev;
+        SCORE *ptrDPrev = DPrev;
+        SCORE d = *ptrDPrev;
+        SCORE DNew = *ptrMPrev_j + GapOpenA[i];
+        if (DNew > d)
+            {
+            d = DNew;
+            *ptrDeletePos = i;
+            }
+
+        SCORE *ptrDCurr = DCurr;
+
+        assert(ptrDCurr == &(DCurr[0]));
+        *ptrDCurr = d;
+
+        // Can't have an insert if no letters from B
+        IPrev_j_1 = MINUS_INFINITY;
+
+        // Initialized so that the read below is defined even if no INew
+        // ever beats IPrev_j_1.
+        unsigned uInsertPos = 0;
+        const SCORE scoreGapOpenAi = GapOpenA[i];
+        const SCORE scoreGapCloseAi_1 = GapCloseA[i-1];
+
+        for (unsigned j = 1; j < uLengthB; ++j)
+            {
+            // Here, MPrev_j is preserved from previous
+            // iteration so with current i,j is M[i-1][j-1]
+            SCORE MPrev_j = *ptrMPrev_j;
+            SCORE INew = MPrev_j + GapOpenB[j];
+            if (INew > IPrev_j_1)
+                {
+                IPrev_j_1 = INew;
+                uInsertPos = j;
+                }
+
+            SCORE scoreMax = MPrev_j;
+
+            assert(ptrDPrev == &(DPrev[j-1]));
+            SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1;
+            if (scoreD > scoreMax)
+                {
+                scoreMax = scoreD;
+                assert(ptrDeletePos == &(uDeletePos[j-1]));
+                *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos;
+                assert(*ptrTraceBack_ij > 0);
+                }
+            ++ptrDeletePos;
+
+            SCORE scoreI = IPrev_j_1 + GapCloseB[j-1];
+            if (scoreI > scoreMax)
+                {
+                scoreMax = scoreI;
+                *ptrTraceBack_ij = (int) uInsertPos - (int) j;
+                assert(*ptrTraceBack_ij < 0);
+                }
+
+            assert(ptrSortOrderA == &(SortOrderA[i]));
+            assert(ptrFreqsA == &(FreqsA[i]));
+
+            *ptrMCurr_j += scoreMax;
+            assert(ptrMCurr_j == &(MCurr[j]));
+            ++ptrMCurr_j;
+
+            // Advance to M[i-1][j] and update the D state for column j
+            MPrev_j = *(++ptrMPrev_j);
+            assert(ptrDPrev == &(DPrev[j]));
+            SCORE d = *ptrDPrev;
+            SCORE DNew = MPrev_j + scoreGapOpenAi;
+            if (DNew > d)
+                {
+                d = DNew;
+                assert(ptrDeletePos == &uDeletePos[j]);
+                *ptrDeletePos = i;
+                }
+            assert(ptrDCurr + 1 == &(DCurr[j]));
+            *(++ptrDCurr) = d;
+
+            ++ptrTraceBack_ij;
+            }
+
+        Rotate(MPrev, MCurr, MWork);
+        Rotate(DPrev, DCurr, DWork);
+        }
+
+// Special case for i=uLengthA
+    SCORE IPrev = MINUS_INFINITY;
+
+// Initialized: the loop below does not run when uLengthB is 1
+    unsigned uInsertPos = 0;
+
+    for (unsigned j = 1; j < uLengthB; ++j)
+        {
+        SCORE INew = MPrev[j-1] + GapOpenB[j];
+        if (INew > IPrev)
+            {
+            uInsertPos = j;
+            IPrev = INew;
+            }
+        }
+
+// Special case for i=uLengthA, j=uLengthB
+    SCORE scoreMax = MPrev[uLengthB-1];
+    int iTraceBack = 0;
+
+    SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1];
+    if (scoreD > scoreMax)
+        {
+        scoreMax = scoreD;
+        iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1];
+        }
+
+    SCORE scoreI = IPrev + GapCloseB[uLengthB-1];
+    if (scoreI > scoreMax)
+        {
+        scoreMax = scoreI;
+        iTraceBack = (int) uInsertPos - (int) uLengthB;
+        }
+
+    TraceBack[uLengthA][uLengthB] = iTraceBack;
+
+    TraceBackToPath(TraceBack, uLengthA, uLengthB, Path);
+
+    return scoreMax;
+    }
Added: trunk/packages/muscle/branches/upstream/current/glbalignss.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalignss.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalignss.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,318 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+#include "seq.h"
+
+extern SCOREMATRIX VTML_SP;
+
+// #define SUBST(i, j) Subst(seqA, seqB, i, j)
+#define SUBST(i, j) MxRowA[i][seqB.GetLetter(j)]
+
+// Substitution score for position i of seqA against position j of seqB:
+// the VTML_SP entry for the two letters plus g_scoreCenter.
+// Both positions are range-checked by assert only.
+static SCORE Subst(const Seq &seqA, const Seq &seqB, unsigned i, unsigned j)
+    {
+    assert(i < seqA.Length());
+    assert(j < seqB.Length());
+
+    const unsigned uRow = seqA.GetLetter(i);
+    const unsigned uCol = seqB.GetLetter(j);
+    return VTML_SP[uRow][uCol] + g_scoreCenter;
+    }
+
+struct DP_MEMORY // Work buffers reused by successive GlobalAlignSS calls
+ {
+ unsigned uLength; // element count of every array below, as set by AllocDPMem
+ SCORE *MPrev; // M rows (previous / current / spare), passed to Rotate after each row
+ SCORE *MCurr;
+ SCORE *MWork;
+ SCORE *DPrev; // D rows (previous / current / spare), passed to Rotate after each row
+ SCORE *DCurr;
+ SCORE *DWork;
+ SCORE **MxRowA; // one VTML_SP row pointer per position of A (filled by RowFromSeq)
+ unsigned *LettersB; // one letter index per position of B (filled by LettersFromSeq)
+ unsigned *uDeletePos; // per column: row index i stored when opening a delete scored best
+ int **TraceBack; // uLength rows of uLength ints
+ };
+
+static struct DP_MEMORY DPM; // static storage: uLength is zero until the first AllocDPMem
+
+// Make sure the shared DPM buffers can hold an alignment of two sequences
+// of length uLengthA and uLengthB. Buffers only ever grow: when more room
+// is needed the old arrays are freed and larger ones (with slack) are
+// allocated. Contents are not preserved across a reallocation.
+static void AllocDPMem(unsigned uLengthA, unsigned uLengthB)
+    {
+// Max prefix length
+    unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1;
+
+// Buffers of exactly uLength elements are already large enough, so
+// reallocate only when strictly more room is required.
+    if (uLength <= DPM.uLength)
+        return;
+
+// Add 256 to allow for future expansion and
+// round up to next multiple of 32.
+    uLength += 256;
+    uLength += 32 - uLength%32;
+
+    const unsigned uOldLength = DPM.uLength;
+    if (uOldLength > 0)
+        {
+        // TraceBack rows must be released before the row-pointer array.
+        for (unsigned i = 0; i < uOldLength; ++i)
+            delete[] DPM.TraceBack[i];
+
+        delete[] DPM.MPrev;
+        delete[] DPM.MCurr;
+        delete[] DPM.MWork;
+        delete[] DPM.DPrev;
+        delete[] DPM.DCurr;
+        delete[] DPM.DWork;
+        delete[] DPM.MxRowA;
+        delete[] DPM.LettersB;
+        delete[] DPM.uDeletePos;
+        delete[] DPM.TraceBack;
+        }
+
+    DPM.uLength = uLength;
+
+    DPM.MPrev = new SCORE[uLength];
+    DPM.MCurr = new SCORE[uLength];
+    DPM.MWork = new SCORE[uLength];
+
+    DPM.DPrev = new SCORE[uLength];
+    DPM.DCurr = new SCORE[uLength];
+    DPM.DWork = new SCORE[uLength];
+    DPM.MxRowA = new SCORE *[uLength];
+    DPM.LettersB = new unsigned[uLength];
+    DPM.uDeletePos = new unsigned[uLength];
+
+    DPM.TraceBack = new int*[uLength];
+
+    for (unsigned i = 0; i < uLength; ++i)
+        DPM.TraceBack[i] = new int[uLength];
+    }
+
+// For every position of s, store in Row[] a pointer to the VTML_SP row of
+// that position's letter. Letters with index 20 or above use the AX_X row.
+// Row must have room for s.Length() entries.
+static void RowFromSeq(const Seq &s, SCORE *Row[])
+    {
+    const unsigned uCount = s.Length();
+    for (unsigned uPos = 0; uPos < uCount; ++uPos)
+        {
+        const unsigned uLetter = CharToLetter(s.GetChar(uPos));
+        Row[uPos] = VTML_SP[uLetter < 20 ? uLetter : AX_X];
+        }
+    }
+
+// Convert every character of s to its letter index and store it in
+// Letters[]. Indexes of 20 or above are replaced by AX_X.
+// Letters must have room for s.Length() entries.
+static void LettersFromSeq(const Seq &s, unsigned Letters[])
+    {
+    const unsigned uCount = s.Length();
+    for (unsigned uPos = 0; uPos < uCount; ++uPos)
+        {
+        const unsigned uLetter = CharToLetter(s.GetChar(uPos));
+        Letters[uPos] = (uLetter < 20) ? uLetter : AX_X;
+        }
+    }
+
+/* Global alignment of seqA against seqB by dynamic programming.
+ * Uses the shared DPM buffers (sized by AllocDPMem), fills the TraceBack
+ * matrix row by row, converts it to Path with TraceBackToPath and
+ * returns the best score of the final cell.
+ * TraceBack encoding as written below: a positive value is stored when a
+ * delete wins (i minus the recorded delete position), a negative value
+ * when an insert wins (insert position minus j).
+ * NOTE(review): MPrev[0] reads MxRowA[0] and LettersB[0], so both
+ * sequences are presumably expected to be non-empty -- confirm callers.
+ */ SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path)
+ {
+ const unsigned uLengthA = seqA.Length();
+ const unsigned uLengthB = seqB.Length();
+ const unsigned uPrefixCountA = uLengthA + 1; // referenced only by the DEBUG memset below
+ const unsigned uPrefixCountB = uLengthB + 1; // referenced only by the DEBUG memset below
+
+ AllocDPMem(uLengthA, uLengthB); // grow the shared buffers if necessary
+
+ SCORE *MPrev = DPM.MPrev;
+ SCORE *MCurr = DPM.MCurr;
+ SCORE *MWork = DPM.MWork;
+
+ SCORE *DPrev = DPM.DPrev;
+ SCORE *DCurr = DPM.DCurr;
+ SCORE *DWork = DPM.DWork;
+ SCORE **MxRowA = DPM.MxRowA;
+ unsigned *LettersB = DPM.LettersB;
+
+ RowFromSeq(seqA, MxRowA); // MxRowA[i] = VTML_SP row for letter i of A
+ LettersFromSeq(seqB, LettersB); // LettersB[j] = letter index of position j of B
+
+ unsigned *uDeletePos = DPM.uDeletePos;
+
+ int **TraceBack = DPM.TraceBack;
+
+#if DEBUG
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ memset(TraceBack[i], 0, uPrefixCountB*sizeof(int));
+#endif
+
+// Special case for i=0
+ TraceBack[0][0] = 0;
+ MPrev[0] = MxRowA[0][LettersB[0]]; // substitution score of A[0] against B[0]
+
+// D(0,0) is -infinity (requires I->D).
+ DPrev[0] = MINUS_INFINITY;
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ unsigned uLetterB = LettersB[j];
+
+ // Only way to get M(0, j) looks like this:
+ // A ----X
+ // B XXXXX
+ // 0 j
+ // So gap-open at j=0, gap-close at j-1.
+ MPrev[j] = MxRowA[0][uLetterB] + g_scoreGapOpen/2; // term gaps half
+ TraceBack[0][j] = -(int) j; // negative value: an insert spanning j columns
+
+ // Assume no D->I transitions, then can't be a delete if only
+ // one letter from A.
+ DPrev[j] = MINUS_INFINITY;
+ }
+
+ SCORE IPrev_j_1; // running best insert score, reset at the start of every row
+ for (unsigned i = 1; i < uLengthA; ++i)
+ {
+ SCORE *ptrMCurr_j = MCurr;
+ memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); // every entry is overwritten by the loop below
+
+ const SCORE *RowA = MxRowA[i];
+ const SCORE *ptrRowA = MxRowA[i]; // not referenced again in this function
+ const SCORE *ptrMCurrEnd = ptrMCurr_j + uLengthB;
+ unsigned *ptrLettersB = LettersB;
+ for (; ptrMCurr_j != ptrMCurrEnd; ++ptrMCurr_j)
+ {
+ *ptrMCurr_j = RowA[*ptrLettersB]; // substitution score for (i, j); best predecessor is added later
+ ++ptrLettersB;
+ }
+
+ unsigned *ptrDeletePos = uDeletePos;
+
+ // Special case for j=0
+ // Only way to get M(i, 0) looks like this:
+ // 0 i
+ // A XXXXX
+ // B ----X
+ // So gap-open at i=0, gap-close at i-1.
+ ptrMCurr_j = MCurr;
+ assert(ptrMCurr_j == &(MCurr[0]));
+ *ptrMCurr_j += g_scoreGapOpen/2; // term gaps half
+
+ ++ptrMCurr_j;
+
+ int *ptrTraceBack_ij = TraceBack[i];
+ *ptrTraceBack_ij++ = (int) i; // TraceBack[i][0] = i, then advance to column 1
+
+ SCORE *ptrMPrev_j = MPrev;
+ SCORE *ptrDPrev = DPrev;
+ SCORE d = *ptrDPrev; // D for column 0: keep the previous value or open a new delete
+ SCORE DNew = *ptrMPrev_j + g_scoreGapOpen;
+ if (DNew > d)
+ {
+ d = DNew;
+ *ptrDeletePos = i; // remember the row at which this delete was opened
+ }
+
+ SCORE *ptrDCurr = DCurr;
+
+ assert(ptrDCurr == &(DCurr[0]));
+ *ptrDCurr = d;
+
+ // Can't have an insert if no letters from B
+ IPrev_j_1 = MINUS_INFINITY;
+
+ unsigned uInsertPos; // NOTE(review): assigned only when INew beats IPrev_j_1, read in the scoreI branch
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ // Here, MPrev_j is preserved from previous
+ // iteration so with current i,j is M[i-1][j-1]
+ SCORE MPrev_j = *ptrMPrev_j;
+ SCORE INew = MPrev_j + g_scoreGapOpen;
+ if (INew > IPrev_j_1)
+ {
+ IPrev_j_1 = INew;
+ uInsertPos = j;
+ }
+
+ SCORE scoreMax = MPrev_j; /* start with the match predecessor.
+ NOTE(review): if neither branch below wins, *ptrTraceBack_ij is not
+ written in this iteration and keeps whatever it held before (it is
+ only zeroed by the DEBUG memset) -- confirm this is intended. */
+
+ assert(ptrDPrev == &(DPrev[j-1]));
+ SCORE scoreD = *ptrDPrev++;
+ if (scoreD > scoreMax)
+ {
+ scoreMax = scoreD;
+ assert(ptrDeletePos == &(uDeletePos[j-1]));
+ *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; // positive: rows since the delete was opened
+ assert(*ptrTraceBack_ij > 0);
+ }
+ ++ptrDeletePos;
+
+ SCORE scoreI = IPrev_j_1;
+ if (scoreI > scoreMax)
+ {
+ scoreMax = scoreI;
+ *ptrTraceBack_ij = (int) uInsertPos - (int) j; // negative: columns back to the insert position
+ assert(*ptrTraceBack_ij < 0);
+ }
+
+ *ptrMCurr_j += scoreMax; // substitution score stored earlier plus best predecessor
+ assert(ptrMCurr_j == &(MCurr[j]));
+ ++ptrMCurr_j;
+
+ MPrev_j = *(++ptrMPrev_j); // now M[i-1][j]
+ assert(ptrDPrev == &(DPrev[j]));
+ SCORE d = *ptrDPrev;
+ SCORE DNew = MPrev_j + g_scoreGapOpen;
+ if (DNew > d)
+ {
+ d = DNew;
+ assert(ptrDeletePos == &uDeletePos[j]);
+ *ptrDeletePos = i;
+ }
+ assert(ptrDCurr + 1 == &(DCurr[j]));
+ *(++ptrDCurr) = d;
+
+ ++ptrTraceBack_ij;
+ }
+
+ Rotate(MPrev, MCurr, MWork); // Rotate is defined elsewhere; presumably makes the row just filled the new "previous" row
+ Rotate(DPrev, DCurr, DWork);
+ }
+
+// Special case for i=uLengthA
+ SCORE IPrev = MINUS_INFINITY;
+
+ unsigned uInsertPos; // NOTE(review): stays unassigned if the loop below never finds INew > IPrev
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ SCORE INew = MPrev[j-1];
+ if (INew > IPrev)
+ {
+ uInsertPos = j;
+ IPrev = INew;
+ }
+ }
+
+// Special case for i=uLengthA, j=uLengthB
+ SCORE scoreMax = MPrev[uLengthB-1];
+ int iTraceBack = 0; // stays 0 when the match score wins
+
+ SCORE scoreD = DPrev[uLengthB-1] - g_scoreGapOpen/2; // term gaps half
+ if (scoreD > scoreMax)
+ {
+ scoreMax = scoreD;
+ iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1];
+ }
+
+ SCORE scoreI = IPrev - g_scoreGapOpen/2;
+ if (scoreI > scoreMax)
+ {
+ scoreMax = scoreI;
+ iTraceBack = (int) uInsertPos - (int) uLengthB;
+ }
+
+ TraceBack[uLengthA][uLengthB] = iTraceBack;
+
+ TraceBackToPath(TraceBack, uLengthA, uLengthB, Path);
+
+ return scoreMax;
+ }
Added: trunk/packages/muscle/branches/upstream/current/glbalndimer.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/glbalndimer.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/glbalndimer.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,390 @@
+#include "muscle.h"
+#include <math.h>
+#include <stdio.h> // for sprintf
+#include "pwpath.h"
+#include "profile.h"
+#include "gapscoredimer.h"
+
+#define TRACE 0
+
+static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ const char *TBM_, const char *TBD_, const char *TBI_,
+ unsigned uLengthA, unsigned uLengthB, PWPath &Path);
+
+// Format a score for the TRACE listings. MINUS_INFINITY is shown as a
+// star; anything else is printed with "%6.3g" into a static buffer, so
+// the returned pointer is only valid until the next call.
+static const char *LocalScoreToStr(SCORE s)
+    {
+    static char szBuffer[16];
+    if (s != MINUS_INFINITY)
+        {
+        sprintf(szBuffer, "%6.3g", s);
+        return szBuffer;
+        }
+    return " *";
+    }
+
+#if TRACE
+// TRACE helper: write one DP score matrix to the log. The heading line
+// carries each B prefix length with its consensus character; every
+// following line is one A prefix with its row of scores.
+// DPM() is a macro defined outside this file section.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+    Log(" ");
+    for (unsigned b = 0; b < uPrefixCountB; ++b)
+        {
+        const char cB = (b > 0) ? ConsensusChar(PB[b - 1]) : ' ';
+        Log(" %4u:%c", b, cB);
+        }
+    Log("\n");
+
+    for (unsigned a = 0; a < uPrefixCountA; ++a)
+        {
+        const char cA = (a > 0) ? ConsensusChar(PA[a - 1]) : ' ';
+        Log("%4u:%c ", a, cA);
+        for (unsigned b = 0; b < uPrefixCountB; ++b)
+            Log(" %s", LocalScoreToStr(DPM(a, b)));
+        Log("\n");
+        }
+    }
+
+// TRACE helper: write one trace-back matrix to the log. Two heading
+// lines give the B prefix lengths and their consensus characters; each
+// following line is one A prefix with its row of trace-back characters.
+// TBM() is a macro defined outside this file section.
+static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+    Log(" ");
+    for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        Log("%2u", uPrefixLengthB);    // %u: the argument is unsigned
+    Log("\n");
+    Log(" ");
+    for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        {
+        char c = ' ';
+        if (uPrefixLengthB > 0)
+            c = ConsensusChar(PB[uPrefixLengthB - 1]);
+        Log(" %c", c);
+        }
+    Log("\n");
+    for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+        {
+        char c = ' ';
+        if (uPrefixLengthA > 0)
+            c = ConsensusChar(PA[uPrefixLengthA - 1]);
+        Log("%4u:%c ", uPrefixLengthA, c);
+        for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+            Log(" %c", TBM(uPrefixLengthA, uPrefixLengthB));
+        Log("\n");
+        }
+    }
+#endif // TRACE
+
+// Profile position passed to the gap-score functions on the matrix edges
+// (see its use in GlobalAlignDimer).
+static ProfPos PPTerm;
+
+// Fill in PPTerm: not all-gaps, m_LL and m_fOcc set to 1, the other three
+// transition fields set to 0. Always returns true so it can serve as
+// the initializer of a static flag.
+static bool InitializePPTerm()
+    {
+    PPTerm.m_bAllGaps = false;
+    PPTerm.m_fOcc = 1;
+
+    PPTerm.m_LL = 1;
+    PPTerm.m_LG = 0;
+    PPTerm.m_GL = 0;
+    PPTerm.m_GG = 0;
+
+    return true;
+    }
+
+// Initializing this flag runs InitializePPTerm during static initialization.
+static bool PPTermInitialized = InitializePPTerm();
+
+// Score of two profile positions: the count-weighted sum of PPB's
+// per-letter scores over the letters present in PPA is taken, then its
+// logarithm is scaled by the product of both occupancies.
+// A sum of exactly zero yields the fixed score -2.5.
+static SCORE ScoreProfPosDimerLE(const ProfPos &PPA, const ProfPos &PPB)
+    {
+    SCORE Sum = 0;
+    unsigned n = 0;
+    while (n < 20)
+        {
+        // Letters are visited in PPA's sort order; the first zero count
+        // ends the scan.
+        const unsigned uLetter = PPA.m_uSortOrder[n];
+        const FCOUNT fcLetter = PPA.m_fcCounts[uLetter];
+        if (0 == fcLetter)
+            break;
+        Sum += fcLetter*PPB.m_AAScores[uLetter];
+        ++n;
+        }
+
+    if (0 == Sum)
+        return -2.5;
+
+    SCORE logScore = logf(Sum);
+    return (SCORE) (logScore*(PPA.m_fOcc * PPB.m_fOcc));
+    }
+
+// Score of two profile positions: the count-weighted sum of PPB's
+// per-letter scores over the letters present in PPA.
+static SCORE ScoreProfPosDimerPSP(const ProfPos &PPA, const ProfPos &PPB)
+    {
+    SCORE Sum = 0;
+    unsigned n = 0;
+    while (n < 20)
+        {
+        // Letters are visited in PPA's sort order; the first zero count
+        // ends the scan.
+        const unsigned uLetter = PPA.m_uSortOrder[n];
+        const FCOUNT fcLetter = PPA.m_fcCounts[uLetter];
+        if (0 == fcLetter)
+            break;
+        Sum += fcLetter*PPB.m_AAScores[uLetter];
+        ++n;
+        }
+    return Sum;
+    }
+
+// Dispatch on the global g_PPScore setting: PPSCORE_LE uses the
+// log-expectation variant, PPSCORE_SP and PPSCORE_SV use the PSP variant.
+// Any other value is reported through Quit.
+static SCORE ScoreProfPosDimer(const ProfPos &PPA, const ProfPos &PPB)
+    {
+    if (PPSCORE_LE == g_PPScore)
+        return ScoreProfPosDimerLE(PPA, PPB);
+
+    if (PPSCORE_SP == g_PPScore || PPSCORE_SV == g_PPScore)
+        return ScoreProfPosDimerPSP(PPA, PPB);
+
+    Quit("Invalid g_PPScore");
+    return 0;
+    }
+
+// Global alignment dynamic programming over two profiles PA (uLengthA positions) and PB (uLengthB positions).
+// This variant optimizes the profile-profile SP score under the
+// dimer approximation. Fills Path via TraceBackDimer and returns the score that function reports.
+SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path)
+ {
+ assert(uLengthB > 0 && uLengthA > 0);
+
+ const unsigned uPrefixCountA = uLengthA + 1;
+ const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+ const size_t LM = uPrefixCountA*uPrefixCountB; // NOTE(review): the product is formed in unsigned before widening to size_t
+ SCORE *DPM_ = new SCORE[LM]; // best score ending in a match
+ SCORE *DPD_ = new SCORE[LM]; // best score ending in a delete (letter in A, gap in B)
+ SCORE *DPI_ = new SCORE[LM]; // best score ending in an insert (gap in A, letter in B)
+
+ char *TBM_ = new char[LM]; // trace-back characters, one matrix per state
+ char *TBD_ = new char[LM];
+ char *TBI_ = new char[LM];
+
+ DPM(0, 0) = 0; /* DPM/DPD/DPI/TBM/TBD/TBI are macros defined outside this
+ file section; presumably they index the flat arrays above by
+ (prefix length in A, prefix length in B) -- confirm in the header. */
+ DPD(0, 0) = MINUS_INFINITY;
+ DPI(0, 0) = MINUS_INFINITY;
+
+ TBM(0, 0) = 'S'; // 'S' marks a start cell, '?' a cell with no valid predecessor
+ TBD(0, 0) = '?';
+ TBI(0, 0) = '?';
+
+ DPM(1, 0) = MINUS_INFINITY;
+ DPD(1, 0) = GapScoreMD(PA[0], PPTerm); // first letter of A against the terminal position
+ DPI(1, 0) = MINUS_INFINITY;
+
+ TBM(1, 0) = '?';
+ TBD(1, 0) = 'S';
+ TBI(1, 0) = '?';
+
+ DPM(0, 1) = MINUS_INFINITY;
+ DPD(0, 1) = MINUS_INFINITY;
+ DPI(0, 1) = GapScoreMI(PPTerm, PB[0]); // first letter of B against the terminal position
+
+ TBM(0, 1) = '?';
+ TBD(0, 1) = '?';
+ TBI(0, 1) = 'S';
+
+// Empty prefix of B is special case
+ for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ // M=LetterA+LetterB, impossible with empty prefix
+ DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+ TBM(uPrefixLengthA, 0) = '?';
+
+ // D=LetterA+GapB
+ DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) +
+ GapScoreDD(PA[uPrefixLengthA - 1], PPTerm);
+ TBD(uPrefixLengthA, 0) = 'D';
+
+ // I=GapA+LetterB, impossible with empty prefix
+ DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+ TBI(uPrefixLengthA, 0) = '?';
+ }
+
+// Empty prefix of A is special case
+ for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ // M=LetterA+LetterB, impossible with empty prefix
+ DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+ TBM(0, uPrefixLengthB) = '?';
+
+ // D=LetterA+GapB, impossible with empty prefix
+ DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+ TBD(0, uPrefixLengthB) = '?';
+
+ // I=GapA+LetterB
+ DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) +
+ GapScoreII(PPTerm, PB[uPrefixLengthB - 1]);
+ TBI(0, uPrefixLengthB) = 'I';
+ }
+
+// ============
+// Main DP loop
+// ============
+ for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+ for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ {
+ // Match M=LetterA+LetterB
+ SCORE scoreLL = ScoreProfPosDimer(PPA, PPB);
+
+ SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreMM(PPA, PPB);
+ SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreDM(PPA, PPB);
+ SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreIM(PPA, PPB);
+
+ SCORE scoreBest = scoreMM; // ties keep the earlier candidate: M, then D, then I
+ char c = 'M';
+ if (scoreDM > scoreBest)
+ {
+ scoreBest = scoreDM;
+ c = 'D';
+ }
+ if (scoreIM > scoreBest)
+ {
+ scoreBest = scoreIM;
+ c = 'I';
+ }
+
+ DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
+ TBM(uPrefixLengthA, uPrefixLengthB) = c; // state of the chosen predecessor
+ }
+ {
+ // Delete D=LetterA+GapB
+ SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + GapScoreMD(PPA, PPB);
+ SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + GapScoreDD(PPA, PPB);
+ SCORE scoreID = DPI(uPrefixLengthA-1, uPrefixLengthB) + GapScoreID(PPA, PPB);
+
+ SCORE scoreBest = scoreMD;
+ char c = 'M';
+ if (scoreDD > scoreBest)
+ {
+ scoreBest = scoreDD;
+ c = 'D';
+ }
+ if (scoreID > scoreBest)
+ {
+ scoreBest = scoreID;
+ c = 'I';
+ }
+
+ DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+ TBD(uPrefixLengthA, uPrefixLengthB) = c;
+ }
+ {
+ // Insert I=GapA+LetterB
+ SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + GapScoreMI(PPA, PPB);
+ SCORE scoreDI = DPD(uPrefixLengthA, uPrefixLengthB-1) + GapScoreDI(PPA, PPB);
+ SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + GapScoreII(PPA, PPB);
+
+ SCORE scoreBest = scoreMI;
+ char c = 'M';
+ if (scoreDI > scoreBest)
+ {
+ scoreBest = scoreDI;
+ c = 'D';
+ }
+ if (scoreII > scoreBest)
+ {
+ scoreBest = scoreII;
+ c = 'I';
+ }
+
+ DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+ TBI(uPrefixLengthA, uPrefixLengthB) = c;
+ }
+ }
+ }
+
+#if TRACE
+ Log("DPM:\n");
+ ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+ Log("DPD:\n");
+ ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+ Log("DPI:\n");
+ ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+ Log("TBM:\n");
+ ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
+ Log("TBD:\n");
+ ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
+ Log("TBI:\n");
+ ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+ SCORE Score = TraceBackDimer(DPM_, DPD_, DPI_, TBM_, TBD_, TBI_,
+ uLengthA, uLengthB, Path);
+
+#if TRACE
+ Log("GlobalAlignDimer score = %.3g\n", Score);
+#endif
+
+ delete[] DPM_; // the matrices are only needed until the path has been built
+ delete[] DPD_;
+ delete[] DPI_;
+
+ delete[] TBM_;
+ delete[] TBD_;
+ delete[] TBI_;
+
+ return Score;
+ }
+
+/* Build Path by walking the trace-back matrices from the final cell
+ * (uLengthA, uLengthB) back to (0, 0), prepending one edge per step.
+ * The starting state is whichever of M, D, I scores highest in the final
+ * cell (M wins ties, then D); that score is the return value.
+ * An edge type other than 'M', 'D' or 'I' is reported through Quit.
+ */ static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ const char *TBM_, const char *TBD_, const char *TBI_,
+ unsigned uLengthA, unsigned uLengthB, PWPath &Path)
+ {
+ const unsigned uPrefixCountA = uLengthA + 1; // not named again below; presumably consumed by the DPM/TBM macros -- confirm
+
+ unsigned uPrefixLengthA = uLengthA;
+ unsigned uPrefixLengthB = uLengthB;
+
+ char cEdge = 'M';
+ SCORE scoreMax = DPM(uLengthA, uLengthB);
+ if (DPD(uLengthA, uLengthB) > scoreMax)
+ {
+ scoreMax = DPD(uLengthA, uLengthB);
+ cEdge = 'D';
+ }
+ if (DPI(uLengthA, uLengthB) > scoreMax)
+ {
+ scoreMax = DPI(uLengthA, uLengthB);
+ cEdge = 'I';
+ }
+
+ for (;;)
+ {
+ if (0 == uPrefixLengthA && 0 == uPrefixLengthB) // reached the origin: path complete
+ break;
+
+ PWEdge Edge;
+ Edge.cType = cEdge;
+ Edge.uPrefixLengthA = uPrefixLengthA;
+ Edge.uPrefixLengthB = uPrefixLengthB;
+ Path.PrependEdge(Edge); // edges are found last-to-first, so each goes to the front
+
+#if TRACE
+ Log("PLA=%u PLB=%u Edge=%c\n", uPrefixLengthA, uPrefixLengthB, cEdge);
+#endif
+ switch (cEdge)
+ {
+ case 'M': // match consumes one position from both A and B
+ assert(uPrefixLengthA > 0 && uPrefixLengthB > 0);
+ cEdge = TBM(uPrefixLengthA, uPrefixLengthB);
+ --uPrefixLengthA;
+ --uPrefixLengthB;
+ break;
+ case 'D': // delete consumes one position from A only
+ assert(uPrefixLengthA > 0);
+ cEdge = TBD(uPrefixLengthA, uPrefixLengthB);
+ --uPrefixLengthA;
+ break;
+ case 'I': // insert consumes one position from B only
+ assert(uPrefixLengthB > 0);
+ cEdge = TBI(uPrefixLengthA, uPrefixLengthB);
+ --uPrefixLengthB;
+ break;
+ default:
+ Quit("Invalid edge PLA=%u PLB=%u %c", uPrefixLengthA, uPrefixLengthB, cEdge);
+ }
+ }
+#if TRACE
+ Path.LogMe();
+#endif
+ return scoreMax;
+ }
Added: trunk/packages/muscle/branches/upstream/current/globals.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/globals.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/globals.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,267 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <time.h>
+#include <errno.h>
+
+#if WIN32
+#include <windows.h>
+#include <share.h>
+#endif
+
+#ifndef MAX_PATH
+#define MAX_PATH 260
+#endif
+
+static char g_strListFileName[MAX_PATH];
+static bool g_bListFileAppend = false;
+
+static SEQWEIGHT g_SeqWeight = SEQWEIGHT_Undefined;
+
+// Record the sequence weighting method in the file-level setting.
+void SetSeqWeightMethod(SEQWEIGHT Method)
+    {
+    g_SeqWeight = Method;
+    }
+
+// Return the sequence weighting method last stored by SetSeqWeightMethod
+// (SEQWEIGHT_Undefined until then).
+SEQWEIGHT GetSeqWeightMethod()
+    {
+    return g_SeqWeight;
+    }
+
+// Set the name of the list (log) file and whether Log() appends to it
+// or truncates it. Names that do not fit in g_strListFileName are
+// rejected through Quit rather than overflowing the buffer (the former
+// assert gave no protection in release builds).
+void SetListFileName(const char *ptrListFileName, bool bAppend)
+    {
+    if (strlen(ptrListFileName) >= sizeof(g_strListFileName))
+        Quit("Log file name too long (max %u characters)",
+          (unsigned) (sizeof(g_strListFileName) - 1));
+    strcpy(g_strListFileName, ptrListFileName);
+    g_bListFileAppend = bAppend;
+    }
+
+// printf-style logging to the list file named by g_strListFileName.
+// Does nothing when no list file name has been set. The file is opened
+// on first use (append or truncate according to g_bListFileAppend) and
+// kept open; if it cannot be opened the process exits with
+// EXIT_NotStarted. Output is flushed after every call.
+void Log(const char szFormat[], ...)
+    {
+    if (0 == g_strListFileName[0])
+        return;
+
+    static FILE *f = NULL;
+    if (NULL == f)
+        {
+        const char *mode = g_bListFileAppend ? "a" : "w";
+        f = _fsopen(g_strListFileName, mode, _SH_DENYNO);
+        }
+    if (NULL == f)
+        {
+        perror(g_strListFileName);
+        exit(EXIT_NotStarted);
+        }
+
+// Format straight into the stream: the former fixed 4096-byte buffer
+// filled by vsprintf could be overrun by a long message.
+    va_list ArgList;
+    va_start(ArgList, szFormat);
+    vfprintf(f, szFormat, ArgList);
+    va_end(ArgList);
+    fflush(f);
+    }
+
+// Current local time in asctime() layout with the trailing newline
+// removed. The text lives in a static buffer, so it is overwritten by
+// the next call.
+const char *GetTimeAsStr()
+    {
+    static char szStr[32];
+    time_t tNow;
+    time(&tNow);
+    strcpy(szStr, asctime(localtime(&tNow)));
+
+    // asctime() output is 24 characters followed by '\n'.
+    assert('\n' == szStr[24]);
+    szStr[24] = 0;
+    return szStr;
+    }
+
+// Exit immediately with error message, printf-style.
+// The message goes to stderr and to the log file; the process then
+// exits with EXIT_FatalError. Messages longer than the local buffer are
+// truncated instead of overrunning it.
+void Quit(const char szFormat[], ...)
+    {
+    va_list ArgList;
+    char szStr[4096];
+
+    va_start(ArgList, szFormat);
+    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
+    va_end(ArgList);
+
+    fprintf(stderr, "\n*** ERROR *** %s\n", szStr);
+
+    Log("\n*** FATAL ERROR *** ");
+    Log("%s\n", szStr);
+    Log("Stopped %s\n", GetTimeAsStr());
+
+#ifdef WIN32
+    // Under a debugger, offer the chance to break before exiting.
+    if (IsDebuggerPresent())
+        {
+        int iBtn = MessageBox(NULL, szStr, "muscle", MB_ICONERROR | MB_OKCANCEL);
+        if (IDCANCEL == iBtn)
+            Break();
+        }
+#endif
+    exit(EXIT_FatalError);
+    }
+
+// Report a warning, printf-style, on stderr and in the log file.
+// Messages longer than the local buffer are truncated instead of
+// overrunning it.
+void Warning(const char szFormat[], ...)
+    {
+    va_list ArgList;
+    char szStr[4096];
+
+    va_start(ArgList, szFormat);
+    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
+    va_end(ArgList);
+
+    fprintf(stderr, "\n*** WARNING *** %s\n", szStr);
+    Log("\n*** WARNING *** %s\n", szStr);
+    }
+
+// Strip space characters from both ends of szStr, in place.
+// The leading blanks are removed first, then the trailing ones.
+void TrimBlanks(char szStr[])
+    {
+    TrimLeadingBlanks(szStr);
+    TrimTrailingBlanks(szStr);
+    }
+
+// Remove leading space characters from szStr, in place.
+// The blanks are counted first and the remainder is moved once, instead
+// of shifting the whole string for every blank.
+void TrimLeadingBlanks(char szStr[])
+    {
+    size_t uBlanks = 0;
+    while (' ' == szStr[uBlanks])
+        ++uBlanks;
+    if (0 == uBlanks)
+        return;
+
+    // +1 moves the terminating NUL as well; the regions overlap, hence memmove.
+    memmove(szStr, szStr + uBlanks, strlen(szStr + uBlanks) + 1);
+    }
+
+// Remove trailing space characters from szStr, in place, by moving the
+// terminating NUL backwards over them.
+void TrimTrailingBlanks(char szStr[])
+    {
+    char *ptrEnd = szStr + strlen(szStr);
+    while (ptrEnd != szStr && ' ' == ptrEnd[-1])
+        *--ptrEnd = 0;
+    }
+
+// Verbosity switch; this implementation is unconditionally on.
+bool Verbose()
+    {
+    return true;
+    }
+
+// Parse a score from text with atof(); text that atof cannot parse
+// therefore yields 0.
+SCORE StrToScore(const char *pszStr)
+    {
+    const double dValue = atof(pszStr);
+    return (SCORE) dValue;
+    }
+
+// Delete every space, tab, newline and carriage return from szStr,
+// compacting the remaining characters in place.
+void StripWhitespace(char szStr[])
+    {
+    char *ptrOut = szStr;
+    for (const char *ptrIn = szStr; 0 != *ptrIn; ++ptrIn)
+        {
+        const char c = *ptrIn;
+        if (' ' == c || '\t' == c || '\n' == c || '\r' == c)
+            continue;
+        *ptrOut++ = c;
+        }
+    *ptrOut = 0;
+    }
+
+// Delete every '-' character from szStr, compacting the remaining
+// characters in place.
+void StripGaps(char szStr[])
+    {
+    char *ptrOut = szStr;
+    for (const char *ptrIn = szStr; 0 != *ptrIn; ++ptrIn)
+        {
+        if ('-' == *ptrIn)
+            continue;
+        *ptrOut++ = *ptrIn;
+        }
+    *ptrOut = 0;
+    }
+
+// True if Str is an optional '+' or '-' followed by one or more decimal
+// digits and nothing else. The empty string and a sign with no digits
+// are rejected.
+bool IsValidSignedInteger(const char *Str)
+    {
+    if ('+' == *Str || '-' == *Str)
+        ++Str;
+
+    // At least one digit must follow the optional sign.
+    if (0 == *Str)
+        return false;
+
+    for (; 0 != *Str; ++Str)
+        {
+        // <ctype.h> functions need a value in unsigned char range.
+        if (!isdigit((unsigned char) *Str))
+            return false;
+        }
+    return true;
+    }
+
+// True if Str consists of one or more decimal digits and nothing else
+// (no sign, no blanks).
+bool IsValidInteger(const char *Str)
+    {
+    if (0 == *Str)
+        return false;
+
+    for (; 0 != *Str; ++Str)
+        {
+        // <ctype.h> functions need a value in unsigned char range.
+        if (!isdigit((unsigned char) *Str))
+            return false;
+        }
+    return true;
+    }
+
+// Is c valid as first character in an identifier?
+// (a letter or an underscore)
+bool isidentf(char c)
+    {
+    // <ctype.h> functions need a value in unsigned char range.
+    return isalpha((unsigned char) c) || '_' == c;
+    }
+
+// Is c valid character in an identifier?
+// (a letter, a digit or an underscore)
+bool isident(char c)
+    {
+    // <ctype.h> functions need a value in unsigned char range.
+    const unsigned char uc = (unsigned char) c;
+    return isalpha(uc) || isdigit(uc) || '_' == c;
+    }
+
+// True if Str starts with a character accepted by isidentf and every
+// character is accepted by isident. The empty string is rejected
+// because its first character (NUL) fails isidentf.
+bool IsValidIdentifier(const char *Str)
+    {
+    if (!isidentf(Str[0]))
+        return false;
+
+    for (const char *p = Str; 0 != *p; ++p)
+        {
+        if (!isident(*p))
+            return false;
+        }
+    return true;
+    }
+
+// Take the log file name from the command-line options: "loga" selects
+// append mode, otherwise "log" is used. When neither is given nothing
+// changes. A name that does not fit in g_strListFileName is rejected
+// through Quit instead of overflowing the buffer.
+void SetLogFile()
+    {
+    const char *strFileName = ValueOpt("loga");
+    if (0 != strFileName)
+        g_bListFileAppend = true;
+    else
+        strFileName = ValueOpt("log");
+    if (0 == strFileName)
+        return;
+
+    if (strlen(strFileName) >= sizeof(g_strListFileName))
+        Quit("Log file name too long (max %u characters)",
+          (unsigned) (sizeof(g_strListFileName) - 1));
+    strcpy(g_strListFileName, strFileName);
+    }
+
+// Get filename, stripping any extension and directory parts.
+// Both '/' and '\' are treated as directory separators. The extension
+// is everything from the last '.' of the file name itself, so a dot in a
+// directory component is ignored. At most uBytes-1 characters are
+// copied and szName is always NUL-terminated (nothing is written when
+// uBytes is 0).
+void NameFromPath(const char szPath[], char szName[], unsigned uBytes)
+    {
+    if (0 == uBytes)
+        return;
+
+    // The name starts after the last separator of either kind.
+    const char *pstrBegin = szPath;
+    for (const char *p = szPath; 0 != *p; ++p)
+        if ('/' == *p || '\\' == *p)
+            pstrBegin = p + 1;
+
+    // Look for the extension dot inside the name only.
+    const char *pstrEnd = strrchr(pstrBegin, '.');
+    if (0 == pstrEnd)
+        pstrEnd = pstrBegin + strlen(pstrBegin);
+
+    size_t uNameLength = (size_t) (pstrEnd - pstrBegin);
+    if (uNameLength > uBytes - 1)
+        uNameLength = uBytes - 1;
+    memcpy(szName, pstrBegin, uNameLength);
+    szName[uNameLength] = 0;
+    }
+
+// Duplicate s with strdup(); allocation failure is reported through
+// Quit. The caller owns the returned copy.
+char *strsave(const char *s)
+    {
+    char *ptrDup = strdup(s);
+    if (0 != ptrDup)
+        return ptrDup;
+
+    Quit("Out of memory");
+    return ptrDup;
+    }
+
+// True if c may appear in the text of a floating-point number: a digit,
+// a decimal point, a sign, or an exponent letter (e, E, d, D).
+bool IsValidFloatChar(char c)
+    {
+    // <ctype.h> functions need a value in unsigned char range.
+    if (isdigit((unsigned char) c))
+        return true;
+
+    switch (c)
+        {
+    case '.':
+    case '+':
+    case '-':
+    case 'e':
+    case 'E':
+    case 'd':
+    case 'D':
+        return true;
+    default:
+        return false;
+        }
+    }
+
+// Assertion back end: when b is false, stop through Quit with a message
+// naming the source file, line and the asserted expression text.
+void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg)
+    {
+    if (!b)
+        Quit("%s(%d): MY_ASSERT(%s)", file, line, msg);
+    }
Added: trunk/packages/muscle/branches/upstream/current/globalslinux.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/globalslinux.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/globalslinux.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,163 @@
+#include "muscle.h"
+
+#ifndef WIN32
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+const int ONE_MB = 1000000;
+const int MEM_WARNING_THRESHOLD = 20*ONE_MB;
+
+// Return a quiet NaN built from its bit pattern (all exponent and
+// mantissa bits set, sign clear).
+// The pattern is held in one 64-bit integer and copied with memcpy: the
+// previous unsigned long[2] array is 16 bytes where long is 64 bits, so
+// reading its first 8 bytes as a double gave a tiny denormal rather than
+// a NaN, and the pointer cast also broke the aliasing rules.
+double GetNAN()
+    {
+    static const unsigned long long NANBits = 0x7fffffffffffffffULL;
+    double dNAN = 0;
+    if (sizeof(dNAN) == sizeof(NANBits))
+        memcpy(&dNAN, &NANBits, sizeof(dNAN));
+    return dNAN;
+    }
+
+double g_dNAN = GetNAN();
+
+// Heap check hook. Intentionally empty in this (non-Windows) build.
+void chkmem(const char szMsg[])
+    {
+    // No heap checker is called here; szMsg is unused.
+    }
+
+// Debugger break hook. Intentionally empty in this (non-Windows) build.
+void Break()
+    {
+    // Nothing to do: no debugger break is requested here.
+    }
+
+static char szCmdLine[4096];
+
+void *ptrStartBreak = sbrk(0);
+
+// Return the command line text accumulated by SaveCmdLine (an empty
+// string until SaveCmdLine has been called).
+const char *GetCmdLine()
+    {
+    return szCmdLine;
+    }
+
+// Memory use of this process in units of 1e6 bytes, computed from the
+// first number in /proc/<pid>/statm multiplied by the system page size.
+// Returns -1 if the file cannot be opened and 0 if it cannot be read
+// (a warning is issued the first time the read fails).
+double GetMemUseMB()
+    {
+    static char statm[64];
+    static int PageSize;
+    if (0 == statm[0])
+        {
+        // First call: cache the page size and the statm path.
+        PageSize = sysconf(_SC_PAGESIZE);
+        pid_t pid = getpid();
+        sprintf(statm, "/proc/%d/statm", (int) pid);
+        }
+
+    int fd = open(statm, O_RDONLY);
+    if (-1 == fd)
+        return -1;
+    char Buffer[64];
+    int n = read(fd, Buffer, sizeof(Buffer) - 1);
+    // Keep read()'s errno: close() below may overwrite it.
+    const int iReadErrno = errno;
+    close(fd);
+
+    if (n <= 0)
+        {
+        static bool Warned = false;
+        if (!Warned)
+            {
+            Warned = true;
+            Warning("*Warning* Cannot read %s errno=%d %s",
+              statm, iReadErrno, strerror(iReadErrno));
+            }
+        return 0;
+        }
+    Buffer[n] = 0;
+    int Pages = atoi(Buffer);
+
+    return ((double) Pages * (double) PageSize)/1e6;
+    }
+
+// Append the program arguments, separated by single blanks, to the
+// szCmdLine buffer returned by GetCmdLine. Text that does not fit in
+// the buffer is dropped rather than written past its end.
+void SaveCmdLine(int argc, char *argv[])
+    {
+    size_t uUsed = strlen(szCmdLine);
+    for (int i = 0; i < argc; ++i)
+        {
+        const size_t uRoom = sizeof(szCmdLine) - uUsed;
+        const int n = snprintf(szCmdLine + uUsed, uRoom, "%s%s",
+          (i > 0) ? " " : "", argv[i]);
+
+        // Stop on a formatting error or once the buffer is full;
+        // snprintf has already NUL-terminated what fitted.
+        if (n < 0 || (size_t) n >= uRoom)
+            break;
+        uUsed += (size_t) n;
+        }
+    }
+
+double dPeakMemUseMB = 0;
+
+// Highest memory use seen so far. The current use is sampled first via
+// CheckMemUse so the peak includes this moment.
+double GetPeakMemUseMB()
+    {
+    CheckMemUse();
+    return dPeakMemUseMB;
+    }
+
+// CPU clock in GHz: the value of the CPUGHZ environment variable parsed
+// with atof(), or 2.5 when the variable is not set.
+double GetCPUGHz()
+    {
+    const char *e = getenv("CPUGHZ");
+    if (0 == e)
+        return 2.5;
+    return atof(e);
+    }
+
+// Sample the current memory use and raise the recorded peak if the
+// sample exceeds it.
+void CheckMemUse()
+    {
+    const double dNowMB = GetMemUseMB();
+    if (dNowMB > dPeakMemUseMB)
+        dPeakMemUseMB = dNowMB;
+    }
+
+// RAM size in MB taken from the "MemTotal:" line of /proc/meminfo, using
+// the same scaling as before (the reported number times 1000, divided
+// by 1e6). Returns DEFAULT_RAM (500) with a one-time warning if the file
+// cannot be opened or read or has no MemTotal line.
+// The arithmetic is done in double: the former int computation
+// overflowed once the reported figure exceeded about 2.1 million.
+// A successfully parsed value is cached for later calls.
+double GetRAMSizeMB()
+    {
+    const double DEFAULT_RAM = 500;
+    static double RAMMB = 0;
+    if (RAMMB != 0)
+        return RAMMB;
+
+    int fd = open("/proc/meminfo", O_RDONLY);
+    if (-1 == fd)
+        {
+        static bool Warned = false;
+        if (!Warned)
+            {
+            Warned = true;
+            Warning("*Warning* Cannot open /proc/meminfo errno=%d %s",
+              errno, strerror(errno));
+            }
+        return DEFAULT_RAM;
+        }
+    char Buffer[1024];
+    int n = read(fd, Buffer, sizeof(Buffer) - 1);
+    // Keep read()'s errno: close() below may overwrite it.
+    const int iReadErrno = errno;
+    close(fd);
+
+    if (n <= 0)
+        {
+        static bool Warned = false;
+        if (!Warned)
+            {
+            Warned = true;
+            Warning("*Warning* Cannot read /proc/meminfo errno=%d %s",
+              iReadErrno, strerror(iReadErrno));
+            }
+        return DEFAULT_RAM;
+        }
+    Buffer[n] = 0;
+    char *pMem = strstr(Buffer, "MemTotal: ");
+    if (0 == pMem)
+        {
+        static bool Warned = false;
+        if (!Warned)
+            {
+            Warned = true;
+            Warning("*Warning* 'MemTotal:' not found in /proc/meminfo");
+            }
+        return DEFAULT_RAM;
+        }
+
+    // pMem+9 is just past "MemTotal:"; atof skips the blanks that follow.
+    const double dReported = atof(pMem + 9);
+    RAMMB = (dReported*1000.0)/1e6;
+    return RAMMB;
+    }
+
+#endif // !WIN32
Added: trunk/packages/muscle/branches/upstream/current/globalswin32.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/globalswin32.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/globalswin32.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,100 @@
+#include "muscle.h"
+
+#if WIN32
+#include <windows.h>
+#include <crtdbg.h>
+#include <psapi.h>
+#include <float.h>
+#include <stdio.h>
+
+// printf-style output to the debugger via OutputDebugString.
+// Messages longer than the local buffer are truncated instead of
+// overrunning it.
+void DebugPrintf(const char *szFormat, ...)
+    {
+    va_list ArgList;
+    char szStr[4096];
+
+    va_start(ArgList, szFormat);
+    // _vsnprintf does not terminate on truncation, so reserve the last
+    // byte and set it explicitly.
+    _vsnprintf(szStr, sizeof(szStr) - 1, szFormat, ArgList);
+    szStr[sizeof(szStr) - 1] = 0;
+    va_end(ArgList);
+
+    OutputDebugString(szStr);
+    }
+
+// Return a NaN by reinterpreting two words (low word first) as a double;
+// the result is verified with _isnan in assert-enabled builds.
+double GetNAN()
+    {
+    static unsigned long nan[2]={0xffffffff, 0x7fffffff};
+    const double *ptrNAN = ( double* )nan;
+    double dNAN = *ptrNAN;
+    assert(_isnan(dNAN));
+    return dNAN;
+    }
+
+double g_dNAN = GetNAN();
+
+// Run the CRT heap check; a failed check stops the program through Quit
+// with szMsg identifying the call site.
+void chkmem(const char szMsg[])
+    {
+    const bool bHeapOk = (0 != _CrtCheckMemory());
+    if (!bHeapOk)
+        Quit("chkmem(%s)", szMsg);
+    }
+
+// Break into the debugger, but only when one is attached.
+void Break()
+    {
+    if (!IsDebuggerPresent())
+        return;
+    DebugBreak();
+    }
+
+// Command line of the process as reported by GetCommandLine().
+const char *GetCmdLine()
+    {
+    return GetCommandLine();
+    }
+
+static unsigned uPeakMemUseBytes;
+
+// Physical memory reported as available by GlobalMemoryStatus
+// (dwAvailPhys), in units of 1e6 bytes.
+double GetRAMSizeMB()
+    {
+    MEMORYSTATUS Status;
+    GlobalMemoryStatus(&Status);
+    return Status.dwAvailPhys/1e6;
+    }
+
+// Working-set size of this process, rounded to the nearest 1e6 bytes.
+// Side effect: raises uPeakMemUseBytes when the current size exceeds it.
+// The GetProcessMemoryInfo result is checked by assert only.
+double GetMemUseMB()
+    {
+    PROCESS_MEMORY_COUNTERS PMC;
+    BOOL bOk = GetProcessMemoryInfo(GetCurrentProcess(), &PMC, sizeof(PMC));
+    assert(bOk);
+
+    const unsigned uWorkingSetBytes = (unsigned) PMC.WorkingSetSize;
+    if (uWorkingSetBytes > uPeakMemUseBytes)
+        uPeakMemUseBytes = uWorkingSetBytes;
+
+    // Adding half a unit before dividing rounds to the nearest unit.
+    return (uWorkingSetBytes + 500000.0)/1000000.0;
+    }
+
+// Largest working-set size recorded so far by GetMemUseMB, rounded the
+// same way as GetMemUseMB.
+double GetPeakMemUseMB()
+    {
+    const double dPeakBytes = uPeakMemUseBytes + 500000.0;
+    return dPeakBytes/1000000.0;
+    }
+
+// Sample memory use now. The return value is not needed: the call is
+// made for its side effect of updating uPeakMemUseBytes.
+void CheckMemUse()
+    {
+    (void) GetMemUseMB();
+    }
+
+// CPU clock in GHz: the CPUGHZ environment variable parsed with atof(),
+// or 2.5 when it is not set. A value below 0.1 or above 1000 is
+// rejected through Quit.
+double GetCPUGHz()
+    {
+    const char *e = getenv("CPUGHZ");
+    const double dGHz = (0 != e) ? atof(e) : 2.5;
+
+    const bool bTooLow = dGHz < 0.1;
+    const bool bTooHigh = dGHz > 1000.0;
+    if (bTooLow || bTooHigh)
+        Quit("Invalid value '%s' for environment variable CPUGHZ", e);
+    return dGHz;
+    }
+#endif // WIN32
Added: trunk/packages/muscle/branches/upstream/current/gonnet.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/gonnet.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/gonnet.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,499 @@
+#include "muscle.h"
+#include "gonnet.h"
+
+#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) /* one braced 20-entry row, each value divided by 4.0; the expansion ends in a comma so rows can follow one another */ \
+ { A/4.0, C/4.0, D/4.0, E/4.0, F/4.0, G/4.0, H/4.0, I/4.0, K/4.0, L/4.0, M/4.0, N/4.0, P/4.0, Q/4.0, R/4.0, S/4.0, T/4.0, V/4.0, W/4.0, Y/4.0 },
+
+static double Gonnet80[20][20] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 1990, 1140, 930, 1070, 600, 1130, 850, 810, 940, 810,
+ 980, 900, 1080, 1020, 880, 1380, 1190, 1180, 370, 590) // A
+
+ROW( 1140, 2780, 310, 300, 850, 630, 810, 700, 360, 690,
+ 850, 690, 310, 480, 640, 1090, 900, 1030, 810, 920) // C
+
+ROW( 930, 310, 2200, 1550, 130, 980, 1070, 180, 1030, 150,
+ 360, 1450, 820, 1150, 800, 1100, 1000, 350, 0, 550) // D
+
+ROW( 1070, 300, 1550, 2120, 220, 770, 1070, 510, 1280, 490,
+ 710, 1110, 890, 1470, 1010, 1050, 970, 730, 260, 500) // E
+
+ROW( 600, 850, 130, 220, 2380, 90, 980, 1090, 350, 1310,
+ 1270, 490, 310, 540, 340, 470, 620, 930, 1400, 1730) // F
+
+ROW( 1130, 630, 980, 770, 90, 2210, 710, 100, 740, 200,
+ 410, 1060, 660, 800, 810, 1080, 720, 380, 430, 300) // G
+
+ROW( 850, 810, 1070, 1070, 980, 710, 2510, 600, 1120, 670,
+ 860, 1330, 790, 1380, 1140, 990, 1000, 590, 810, 1450) // H
+
+ROW( 810, 700, 180, 510, 1090, 100, 600, 2100, 650, 1460,
+ 1490, 530, 490, 640, 530, 620, 960, 1650, 610, 770) // I
+
+ROW( 940, 360, 1030, 1280, 350, 740, 1120, 650, 2090, 660,
+ 870, 1220, 870, 1410, 1570, 1040, 1090, 700, 350, 640) // K
+
+ROW( 810, 690, 150, 490, 1310, 200, 670, 1460, 660, 2010,
+ 1550, 450, 660, 850, 660, 600, 750, 1270, 800, 890) // L
+
+ROW( 980, 850, 360, 710, 1270, 410, 860, 1490, 870, 1550,
+ 2410, 620, 460, 1050, 710, 830, 990, 1250, 790, 870) // M
+
+ROW( 900, 690, 1450, 1110, 490, 1060, 1330, 530, 1220, 450,
+ 620, 2210, 760, 1180, 1020, 1290, 1170, 550, 380, 850) // N
+
+ROW( 1080, 310, 820, 890, 310, 660, 790, 490, 870, 660,
+ 460, 760, 2380, 1000, 790, 1100, 1040, 670, 120, 480) // P
+
+ROW( 1020, 480, 1150, 1470, 540, 800, 1380, 640, 1410, 850,
+ 1050, 1180, 1000, 2190, 1350, 1090, 1060, 730, 620, 710) // Q
+
+ROW( 880, 640, 800, 1010, 340, 810, 1140, 530, 1570, 660,
+ 710, 1020, 790, 1350, 2210, 970, 970, 640, 830, 740) // R
+
+ROW( 1380, 1090, 1100, 1050, 470, 1080, 990, 620, 1040, 600,
+ 830, 1290, 1100, 1090, 970, 2020, 1490, 810, 520, 780) // S
+
+ROW( 1190, 900, 1000, 970, 620, 720, 1000, 960, 1090, 750,
+ 990, 1170, 1040, 1060, 970, 1490, 2050, 1150, 370, 660) // T
+
+ROW( 1180, 1030, 350, 730, 930, 380, 590, 1650, 700, 1270,
+ 1250, 550, 670, 730, 640, 810, 1150, 2040, 440, 770) // V
+
+ROW( 370, 810, 0, 260, 1400, 430, 810, 610, 350, 800,
+ 790, 380, 120, 620, 830, 520, 370, 440, 2970, 1470) // W
+
+ROW( 590, 920, 550, 500, 1730, 300, 1450, 770, 640, 890,
+ 870, 850, 480, 710, 740, 780, 660, 770, 1470, 2470) // Y
+ };
+
+static double Gonnet120[20][20] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 1550, 950, 780, 870, 480, 930, 700, 690, 770, 660,
+ 790, 760, 900, 840, 730, 1120, 980, 960, 280, 480) // A
+
+ROW( 950, 2400, 270, 280, 700, 510, 650, 600, 320, 570,
+ 700, 550, 280, 400, 510, 890, 750, 850, 670, 760) // C
+
+ROW( 780, 270, 1780, 1310, 90, 820, 890, 160, 880, 140,
+ 320, 1220, 680, 970, 690, 910, 830, 310, 0, 430) // D
+
+ROW( 870, 280, 1310, 1680, 180, 650, 900, 410, 1070, 390,
+ 560, 950, 740, 1210, 860, 870, 810, 580, 180, 400) // E
+
+ROW( 480, 700, 90, 180, 1980, 40, 820, 930, 290, 1110,
+ 1070, 380, 240, 430, 280, 380, 490, 790, 1230, 1510) // F
+
+ROW( 930, 510, 820, 650, 40, 1860, 590, 90, 620, 140,
+ 310, 890, 550, 660, 660, 900, 610, 310, 300, 220) // G
+
+ROW( 700, 650, 890, 900, 820, 590, 2060, 480, 940, 540,
+ 680, 1100, 650, 1130, 950, 820, 820, 490, 680, 1220) // H
+
+ROW( 690, 600, 160, 410, 930, 90, 480, 1680, 520, 1240,
+ 1250, 410, 400, 530, 430, 520, 790, 1380, 500, 650) // I
+
+ROW( 770, 320, 880, 1070, 290, 620, 940, 520, 1650, 520,
+ 690, 1010, 720, 1160, 1320, 860, 900, 570, 280, 520) // K
+
+ROW( 660, 570, 140, 390, 1110, 140, 540, 1240, 520, 1620,
+ 1300, 350, 520, 660, 520, 490, 620, 1090, 670, 760) // L
+
+ROW( 790, 700, 320, 560, 1070, 310, 680, 1250, 690, 1300,
+ 1910, 500, 400, 820, 580, 670, 800, 1060, 650, 740) // M
+
+ROW( 760, 550, 1220, 950, 380, 890, 1100, 410, 1010, 350,
+ 500, 1760, 640, 970, 860, 1060, 960, 460, 280, 680) // N
+
+ROW( 900, 280, 680, 740, 240, 550, 650, 400, 720, 520,
+ 400, 640, 2010, 820, 660, 910, 860, 540, 70, 370) // P
+
+ROW( 840, 400, 970, 1210, 430, 660, 1130, 530, 1160, 660,
+ 820, 970, 820, 1700, 1120, 890, 870, 600, 470, 580) // Q
+
+ROW( 730, 510, 690, 860, 280, 660, 950, 430, 1320, 520,
+ 580, 860, 660, 1120, 1790, 810, 800, 520, 660, 590) // R
+
+ROW( 1120, 890, 910, 870, 380, 900, 820, 520, 860, 490,
+ 670, 1060, 910, 890, 810, 1560, 1220, 680, 390, 610) // S
+
+ROW( 980, 750, 830, 810, 490, 610, 820, 790, 900, 620,
+ 800, 960, 860, 870, 800, 1220, 1600, 930, 290, 540) // T
+
+ROW( 960, 850, 310, 580, 790, 310, 490, 1380, 570, 1090,
+ 1060, 460, 540, 600, 520, 680, 930, 1610, 370, 630) // V
+
+ROW( 280, 670, 0, 180, 1230, 300, 680, 500, 280, 670,
+ 650, 280, 70, 470, 660, 390, 290, 370, 2620, 1290) // W
+
+ROW( 480, 760, 430, 400, 1510, 220, 1220, 650, 520, 760,
+ 740, 680, 370, 580, 590, 610, 540, 630, 1290, 2070) // Y
+ };
+
+static SCORE Gonnet160[20][20] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 1240, 810, 670, 740, 400, 800, 600, 600, 660, 560,
+ 660, 660, 770, 710, 620, 940, 830, 790, 230, 410) // A
+
+ROW( 810, 2130, 250, 260, 600, 440, 550, 530, 300, 490,
+ 590, 470, 260, 360, 430, 760, 640, 720, 570, 650) // C
+
+ROW( 670, 250, 1480, 1120, 80, 710, 770, 160, 770, 130,
+ 280, 1040, 590, 840, 620, 780, 720, 290, 0, 360) // D
+
+ROW( 740, 260, 1120, 1370, 160, 570, 770, 350, 910, 330,
+ 470, 830, 640, 1010, 750, 750, 700, 480, 140, 340) // E
+
+ROW( 400, 600, 80, 160, 1690, 20, 710, 810, 250, 970,
+ 920, 310, 200, 370, 250, 330, 420, 700, 1100, 1340) // F
+
+ROW( 800, 440, 710, 570, 20, 1600, 510, 80, 540, 110,
+ 260, 760, 480, 570, 570, 770, 540, 260, 230, 180) // G
+
+ROW( 600, 550, 770, 770, 710, 510, 1710, 410, 800, 460,
+ 570, 930, 560, 950, 810, 700, 700, 430, 590, 1050) // H
+
+ROW( 600, 530, 160, 350, 810, 80, 410, 1370, 430, 1080,
+ 1070, 340, 350, 460, 370, 450, 660, 1180, 440, 580) // I
+
+ROW( 660, 300, 770, 910, 250, 540, 800, 430, 1330, 440,
+ 570, 860, 620, 980, 1130, 740, 760, 480, 240, 430) // K
+
+ROW( 560, 490, 130, 330, 970, 110, 460, 1080, 440, 1350,
+ 1120, 300, 430, 540, 430, 420, 540, 950, 580, 670) // L
+
+ROW( 660, 590, 280, 470, 920, 260, 570, 1070, 570, 1120,
+ 1540, 420, 360, 660, 490, 550, 670, 920, 560, 650) // M
+
+ROW( 660, 470, 1040, 830, 310, 760, 930, 340, 860, 300,
+ 420, 1430, 560, 830, 740, 890, 810, 400, 230, 560) // N
+
+ROW( 770, 260, 590, 640, 200, 480, 560, 350, 620, 430,
+ 360, 560, 1740, 700, 570, 780, 740, 460, 40, 300) // P
+
+ROW( 710, 360, 840, 1010, 370, 570, 950, 460, 980, 540,
+ 660, 830, 700, 1340, 950, 760, 740, 510, 380, 490) // Q
+
+ROW( 620, 430, 620, 750, 250, 570, 810, 370, 1130, 430,
+ 490, 740, 570, 950, 1490, 690, 690, 440, 540, 490) // R
+
+ROW( 940, 760, 780, 750, 330, 770, 700, 450, 740, 420,
+ 550, 890, 780, 760, 690, 1220, 1010, 580, 310, 500) // S
+
+ROW( 830, 640, 720, 700, 420, 540, 700, 660, 760, 540,
+ 670, 810, 740, 740, 690, 1010, 1280, 780, 240, 460) // T
+
+ROW( 790, 720, 290, 480, 700, 260, 430, 1180, 480, 950,
+ 920, 400, 460, 510, 440, 580, 780, 1310, 330, 540) // V
+
+ROW( 230, 570, 0, 140, 1100, 230, 590, 440, 240, 580,
+ 560, 230, 40, 380, 540, 310, 240, 330, 2360, 1160) // W
+
+ROW( 410, 650, 360, 340, 1340, 180, 1050, 580, 430, 670,
+ 650, 560, 300, 490, 490, 500, 460, 540, 1160, 1780) // Y
+ };
+
+double Gonnet16[21][21] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 124, 81, 67, 74, 40, 80, 60, 60, 66, 56,
+ 66, 66, 77, 71, 62, 94, 83, 79, 23, 41) // A
+
+ROW( 81, 213, 25, 26, 60, 44, 55, 53, 30, 49,
+ 59, 47, 26, 36, 43, 76, 64, 72, 57, 65) // C
+
+ROW( 67, 25, 148, 112, 8, 71, 77, 16, 77, 13,
+ 28, 104, 59, 84, 62, 78, 72, 29, 0, 36) // D
+
+ROW( 74, 26, 112, 137, 16, 57, 77, 35, 91, 33,
+ 47, 83, 64, 101, 75, 75, 70, 48, 14, 34) // E
+
+ROW( 40, 60, 8, 16, 169, 2, 71, 81, 25, 97,
+ 92, 31, 20, 37, 25, 33, 42, 70, 110, 134) // F
+
+ROW( 80, 44, 71, 57, 2, 160, 51, 8, 54, 11,
+ 26, 76, 48, 57, 57, 77, 54, 26, 23, 18) // G
+
+ROW( 60, 55, 77, 77, 71, 51, 171, 41, 80, 46,
+ 57, 93, 56, 95, 81, 70, 70, 43, 59, 105) // H
+
+ROW( 60, 53, 16, 35, 81, 8, 41, 137, 43, 108,
+ 107, 34, 35, 46, 37, 45, 66, 118, 44, 58) // I
+
+ROW( 66, 30, 77, 91, 25, 54, 80, 43, 133, 44,
+ 57, 86, 62, 98, 113, 74, 76, 48, 24, 43) // K
+
+ROW( 56, 49, 13, 33, 97, 11, 46, 108, 44, 135,
+ 112, 30, 43, 54, 43, 42, 54, 95, 58, 67) // L
+
+ROW( 66, 59, 28, 47, 92, 26, 57, 107, 57, 112,
+ 154, 42, 36, 66, 49, 55, 67, 92, 56, 65) // M
+
+ROW( 66, 47, 104, 83, 31, 76, 93, 34, 86, 30,
+ 42, 143, 56, 83, 74, 89, 81, 40, 23, 56) // N
+
+ROW( 77, 26, 59, 64, 20, 48, 56, 35, 62, 43,
+ 36, 56, 174, 70, 57, 78, 74, 46, 4, 30) // P
+
+ROW( 71, 36, 84, 101, 37, 57, 95, 46, 98, 54,
+ 66, 83, 70, 134, 95, 76, 74, 51, 38, 49) // Q
+
+ROW( 62, 43, 62, 75, 25, 57, 81, 37, 113, 43,
+ 49, 74, 57, 95, 149, 69, 69, 44, 54, 49) // R
+
+ROW( 94, 76, 78, 75, 33, 77, 70, 45, 74, 42,
+ 55, 89, 78, 76, 69, 122, 101, 58, 31, 50) // S
+
+ROW( 83, 64, 72, 70, 42, 54, 70, 66, 76, 54,
+ 67, 81, 74, 74, 69, 101, 128, 78, 24, 46) // T
+
+ROW( 79, 72, 29, 48, 70, 26, 43, 118, 48, 95,
+ 92, 40, 46, 51, 44, 58, 78, 131, 33, 54) // V
+
+ROW( 23, 57, 0, 14, 110, 23, 59, 44, 24, 58,
+ 56, 23, 4, 38, 54, 31, 24, 33, 236, 116) // W
+
+ROW( 41, 65, 36, 34, 134, 18, 105, 58, 43, 67,
+ 65, 56, 30, 49, 49, 50, 46, 54, 116, 178) // Y
+ };
+
+static double Gonnet250[20][20] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 760, 570, 490, 520, 290, 570, 440, 440, 480, 400,
+ 450, 490, 550, 500, 460, 630, 580, 530, 160, 300) // A
+
+ROW( 570, 1670, 200, 220, 440, 320, 390, 410, 240, 370,
+ 430, 340, 210, 280, 300, 530, 470, 520, 420, 470) // C
+
+ROW( 490, 200, 990, 790, 70, 530, 560, 140, 570, 120,
+ 220, 740, 450, 610, 490, 570, 520, 230, 0, 240) // D
+
+ROW( 520, 220, 790, 880, 130, 440, 560, 250, 640, 240,
+ 320, 610, 470, 690, 560, 540, 510, 330, 90, 250) // E
+
+ROW( 290, 440, 70, 130, 1220, 0, 510, 620, 190, 720,
+ 680, 210, 140, 260, 200, 240, 300, 530, 880, 1030) // F
+
+ROW( 570, 320, 530, 440, 0, 1180, 380, 70, 410, 80,
+ 170, 560, 360, 420, 420, 560, 410, 190, 120, 120) // G
+
+ROW( 440, 390, 560, 560, 510, 380, 1120, 300, 580, 330,
+ 390, 640, 410, 640, 580, 500, 490, 320, 440, 740) // H
+
+ROW( 440, 410, 140, 250, 620, 70, 300, 920, 310, 800,
+ 770, 240, 260, 330, 280, 340, 460, 830, 340, 450) // I
+
+ROW( 480, 240, 570, 640, 190, 410, 580, 310, 840, 310,
+ 380, 600, 460, 670, 790, 530, 530, 350, 170, 310) // K
+
+ROW( 400, 370, 120, 240, 720, 80, 330, 800, 310, 920,
+ 800, 220, 290, 360, 300, 310, 390, 700, 450, 520) // L
+
+ROW( 450, 430, 220, 320, 680, 170, 390, 770, 380, 800,
+ 950, 300, 280, 420, 350, 380, 460, 680, 420, 500) // M
+
+ROW( 490, 340, 740, 610, 210, 560, 640, 240, 600, 220,
+ 300, 900, 430, 590, 550, 610, 570, 300, 160, 380) // N
+
+ROW( 550, 210, 450, 470, 140, 360, 410, 260, 460, 290,
+ 280, 430, 1280, 500, 430, 560, 530, 340, 20, 210) // P
+
+ROW( 500, 280, 610, 690, 260, 420, 640, 330, 670, 360,
+ 420, 590, 500, 790, 670, 540, 520, 370, 250, 350) // Q
+
+ROW( 460, 300, 490, 560, 200, 420, 580, 280, 790, 300,
+ 350, 550, 430, 670, 990, 500, 500, 320, 360, 340) // R
+
+ROW( 630, 530, 570, 540, 240, 560, 500, 340, 530, 310,
+ 380, 610, 560, 540, 500, 740, 670, 420, 190, 330) // S
+
+ROW( 580, 470, 520, 510, 300, 410, 490, 460, 530, 390,
+ 460, 570, 530, 520, 500, 670, 770, 520, 170, 330) // T
+
+ROW( 530, 520, 230, 330, 530, 190, 320, 830, 350, 700,
+ 680, 300, 340, 370, 320, 420, 520, 860, 260, 410) // V
+
+ROW( 160, 420, 0, 90, 880, 120, 440, 340, 170, 450,
+ 420, 160, 20, 250, 360, 190, 170, 260, 1940, 930) // W
+
+ROW( 300, 470, 240, 250, 1030, 120, 740, 450, 310, 520,
+ 500, 380, 210, 350, 340, 330, 330, 410, 930, 1300) // Y
+ };
+
+static double Gonnet350[20][20] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 450, 390, 350, 360, 210, 400, 310, 310, 340, 280,
+ 310, 350, 380, 350, 330, 410, 390, 350, 110, 210) // A
+
+ROW( 390, 1280, 160, 180, 320, 230, 270, 300, 190, 280,
+ 310, 240, 170, 210, 220, 360, 330, 370, 310, 340) // C
+
+ROW( 350, 160, 640, 540, 50, 390, 400, 110, 410, 100,
+ 160, 500, 330, 430, 370, 400, 370, 170, 0, 170) // D
+
+ROW( 360, 180, 540, 550, 100, 330, 390, 180, 440, 170,
+ 220, 440, 350, 460, 410, 380, 360, 230, 60, 180) // E
+
+ROW( 210, 320, 50, 100, 860, 0, 360, 460, 140, 530,
+ 490, 150, 100, 190, 150, 170, 220, 400, 700, 770) // F
+
+ROW( 400, 230, 390, 330, 0, 860, 280, 60, 310, 50,
+ 120, 400, 280, 310, 310, 400, 300, 140, 50, 80) // G
+
+ROW( 310, 270, 400, 390, 360, 280, 680, 220, 400, 240,
+ 270, 430, 300, 420, 410, 350, 340, 240, 320, 500) // H
+
+ROW( 310, 300, 110, 180, 460, 60, 220, 620, 220, 570,
+ 540, 170, 190, 240, 200, 240, 320, 570, 260, 340) // I
+
+ROW( 340, 190, 410, 440, 140, 310, 400, 220, 530, 210,
+ 260, 420, 330, 450, 530, 370, 370, 250, 120, 210) // K
+
+ROW( 280, 280, 100, 170, 530, 50, 240, 570, 210, 630,
+ 560, 160, 200, 240, 210, 220, 280, 510, 340, 400) // L
+
+ROW( 310, 310, 160, 220, 490, 120, 270, 540, 260, 560,
+ 580, 210, 210, 280, 240, 260, 310, 490, 320, 370) // M
+
+ROW( 350, 240, 500, 440, 150, 400, 430, 170, 420, 160,
+ 210, 550, 320, 410, 390, 410, 390, 220, 110, 250) // N
+
+ROW( 380, 170, 330, 350, 100, 280, 300, 190, 330, 200,
+ 210, 320, 910, 350, 310, 390, 370, 240, 10, 150) // P
+
+ROW( 350, 210, 430, 460, 190, 310, 420, 240, 450, 240,
+ 280, 410, 350, 470, 450, 370, 360, 260, 160, 240) // Q
+
+ROW( 330, 220, 370, 410, 150, 310, 410, 200, 530, 210,
+ 240, 390, 310, 450, 630, 360, 350, 230, 230, 230) // R
+
+ROW( 410, 360, 400, 380, 170, 400, 350, 240, 370, 220,
+ 260, 410, 390, 370, 360, 450, 430, 290, 130, 230) // S
+
+ROW( 390, 330, 370, 360, 220, 300, 340, 320, 370, 280,
+ 310, 390, 370, 360, 350, 430, 460, 350, 120, 230) // T
+
+ROW( 350, 370, 170, 230, 400, 140, 240, 570, 250, 510,
+ 490, 220, 240, 260, 230, 290, 350, 560, 210, 310) // V
+
+ROW( 110, 310, 0, 60, 700, 50, 320, 260, 120, 340,
+ 320, 110, 10, 160, 230, 130, 120, 210, 1590, 740) // W
+
+ROW( 210, 340, 170, 180, 770, 80, 500, 340, 210, 400,
+ 370, 250, 150, 240, 230, 230, 230, 310, 740, 920) // Y
+ };
+
+const t_ROW *GetGonnetMatrix(unsigned N) // Maps a matrix number (80, 120, 250 or 350) to the matching Gonnet table in this file.
+ {
+ switch (N)
+ {
+ case 80:
+ return Gonnet80;
+ case 120:
+ return Gonnet120;
+ //case 16:
+ // return Gonnet16;
+ //case 160:
+ // return Gonnet160;
+ case 250:
+ return Gonnet250;
+ case 350:
+ return Gonnet350;
+ }
+ Quit("Invalid Gonnet%u", N); // every other N ends up here, including the disabled 16 and 160
+ return 0; // NOTE(review): presumably reached only if Quit() returns -- confirm
+ }
+
+//SCORE GetGonnetGapOpen(unsigned N)
+// {
+// switch (N)
+// {
+// case 80:
+// return -639;
+// case 120:
+// return -863;
+// case 160:
+// return -611;
+// case 250:
+// return -308;
+// case 350:
+// return -158;
+// }
+// Quit("Invalid Gonnet%u", N);
+// return 0;
+// }
+
+SCORE GetGonnetGapOpen(unsigned N)
+ {
+// Gap-open score paired with each supported Gonnet matrix number.
+// Any other number is reported through Quit().
+ if (80 == N)
+ return -1000;
+ if (120 == N)
+ return -800;
+ if (160 == N)
+ return -700;
+ if (250 == N)
+ return -200;
+ if (350 == N)
+ return -175;
+// No supported number matched.
+ Quit("Invalid Gonnet%u", N);
+ return 0;
+ }
+
+SCORE GetGonnetGapExtend(unsigned N)
+ {
+// Gap-extend score paired with each supported Gonnet matrix number.
+// Any other number is reported through Quit().
+ if (80 == N)
+ return 350;
+ if (120 == N)
+ return 200;
+ if (160 == N)
+ return 175;
+ if (250 == N)
+ return 20;
+ if (350 == N)
+ return 20;
+// No supported number matched.
+ Quit("Invalid Gonnet%u", N);
+ return 0;
+ }
+
+//double GonnetLookup[400][400];
+//
+//static bool InitGonnetLookup()
+// {
+// for (unsigned i = 0; i < 400; ++i)
+// {
+// const unsigned A1 = i/20;
+// const unsigned A2 = i%20;
+// for (unsigned j = 0; j <= i; ++j)
+// {
+// const unsigned B1 = j/20;
+// const unsigned B2 = j%20;
+//
+// const double s00 = Gonnet16[A1][B1];
+// const double s01 = Gonnet16[A1][B2];
+// const double s10 = Gonnet16[A2][B1];
+// const double s11 = Gonnet16[A2][B2];
+//
+// GonnetLookup[i][j] = GonnetLookup[j][i] = (s00 + s01 + s10 + s11)/4;
+// }
+// }
+// return true;
+// }
+//
+//static bool bGonnetLookupInitialized = InitGonnetLookup();
Added: trunk/packages/muscle/branches/upstream/current/gonnet.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/gonnet.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/gonnet.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,12 @@
+#ifndef Gonnet_h
+#define Gonnet_h
+
+typedef double t_ROW[20]; // one matrix row: 20 double entries
+
+const t_ROW *GetGonnetMatrix(unsigned N); // table for matrix number N
+SCORE GetGonnetGapOpen(unsigned N); // gap-open score for matrix number N
+SCORE GetGonnetGapExtend(unsigned N); // gap-extend score for matrix number N
+
+extern double GonnetLookup[400][400]; // NOTE(review): the only definition visible in gonnet.cpp is commented out -- confirm one exists before relying on this declaration
+
+#endif // Gonnet_h
Added: trunk/packages/muscle/branches/upstream/current/gotowt.cpp
===================================================================
Added: trunk/packages/muscle/branches/upstream/current/henikoffweight.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/henikoffweight.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/henikoffweight.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,84 @@
+#include "muscle.h"
+#include "msa.h"
+
+/***
+Compute Henikoff weights.
+Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
+J. Mol. Biol., 243(4):574-578.
+
+Award each different residue an equal share of the weight, and then divide up
+that weight equally among the sequences sharing the same residue. So if in a
+position of a multiple alignment, r different residues are represented, a residue
+represented in only one sequence contributes a score of 1/r to that sequence, whereas a
+residue represented in s sequences contributes a score of 1/(r*s) to each of the s
+sequences. For each sequence, the contributions from each position are summed to give
+a sequence weight.
+
+See also HenikoffWeightPB.
+***/
+
+void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const // Adds the contribution of column uColIndex to every sequence's entry in m_Weights.
+ {
+ const unsigned uSeqCount = GetSeqCount();
+
+// Compute letter counts in this column
+ unsigned uLetterCount[MAX_ALPHA];
+ memset(uLetterCount, 0, sizeof(uLetterCount));
+ unsigned uDifferentLetterCount = 0; // number of distinct letters seen in this column
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
+ if (uLetter >= 20) // values of 20 and above are not counted
+ continue;
+ unsigned uNewCount = uLetterCount[uLetter] + 1;
+ uLetterCount[uLetter] = uNewCount;
+ if (1 == uNewCount) // first occurrence of this letter
+ ++uDifferentLetterCount;
+ }
+
+// Compute weight contributions
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
+ if (uLetter >= 20) // same filter as in the counting pass
+ continue;
+ const unsigned uCount = uLetterCount[uLetter];
+ unsigned uDenom = uCount*uDifferentLetterCount; // sequences sharing the letter times distinct letters
+ if (uDenom == 0) // guards the division below
+ continue;
+ m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom);
+ }
+ }
+
+void MSA::SetHenikoffWeights() const
+ {
+// Fills m_Weights with Henikoff weights; zero, one or two sequences are handled with fixed values.
+ const unsigned uColCount = GetColCount();
+ const unsigned uSeqCount = GetSeqCount();
+ switch (uSeqCount)
+ {
+ case 0:
+ return;
+ case 1:
+ m_Weights[0] = (WEIGHT) 1.0;
+ return;
+ case 2:
+ m_Weights[0] = (WEIGHT) 0.5;
+ m_Weights[1] = (WEIGHT) 0.5;
+ return;
+ }
+
+// Start every weight at zero, then add each column's contribution.
+ for (unsigned s = 0; s < uSeqCount; ++s)
+ m_Weights[s] = 0.0;
+ for (unsigned c = 0; c < uColCount; ++c)
+ CalcHenikoffWeightsCol(c);
+
+// Sequences consisting only of gaps are forced back to weight zero.
+ for (unsigned s = 0; s < uSeqCount; ++s)
+ {
+ if (IsGapSeq(s))
+ m_Weights[s] = 0.0;
+ }
+ Normalize(m_Weights, uSeqCount);
+ }
Added: trunk/packages/muscle/branches/upstream/current/henikoffweightpb.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/henikoffweightpb.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/henikoffweightpb.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,124 @@
+#include "muscle.h"
+#include "msa.h"
+
+/***
+Compute Henikoff weights.
+Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
+J. Mol. Biol., 243(4):574-578.
+
+Award each different residue an equal share of the weight, and then divide up
+that weight equally among the sequences sharing the same residue. So if in a
+position of a multiple alignment, r different residues are represented, a residue
+represented in only one sequence contributes a score of 1/r to that sequence, whereas a
+residue represented in s sequences contributes a score of 1/(r*s) to each of the s
+sequences. For each sequence, the contributions from each position are summed to give
+a sequence weight.
+
+Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,
+and (b) ignores columns that are perfectly conserved.
+
+>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<
+***/
+
+void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const // Adds column uColIndex's contribution to m_Weights; perfectly conserved columns add nothing.
+ {
+ const unsigned uSeqCount = GetSeqCount();
+
+// Compute letter counts in this column
+ unsigned uLetterCount[MAX_ALPHA+1]; // the extra slot, index MAX_ALPHA, counts gaps and wildcards together
+ memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));
+ unsigned uLetter;
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
+ uLetter = MAX_ALPHA;
+ else
+ uLetter = GetLetter(uSeqIndex, uColIndex);
+ ++(uLetterCount[uLetter]);
+ }
+
+// Check for special case of perfect conservation
+ for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter) // this uLetter shadows the one declared above
+ {
+ unsigned uCount = uLetterCount[uLetter];
+ if (uCount > 0) // only the first non-empty slot has to be inspected
+ {
+ // Perfectly conserved?
+ if (uCount == uSeqCount)
+ return;
+ else
+ // If count > 0 but less than nr. sequences, can't be conserved
+ break;
+ }
+ }
+
+// Compute weight contributions
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ unsigned uLetter;
+ if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
+ uLetter = MAX_ALPHA;
+ else
+ uLetter = GetLetter(uSeqIndex, uColIndex);
+ const unsigned uCount = uLetterCount[uLetter];
+ m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount); // NOTE(review): divides by the letter's count only; the file header describes a 1/r share split s ways -- confirm whether leaving out r is intended
+ }
+ }
+
+bool MSA::IsGapSeq(unsigned uSeqIndex) const // True when IsGap() holds for every column of the sequence (also when there are no columns).
+ {
+ const unsigned uCols = GetColCount();
+ unsigned uCol = 0;
+ while (uCol < uCols && IsGap(uSeqIndex, uCol))
+ ++uCol;
+ return uCol == uCols;
+ }
+
+void MSA::SetUniformWeights() const // Gives every sequence the weight 1/N; writes nothing when N is 0.
+ {
+ const unsigned uN = GetSeqCount();
+ if (uN > 0)
+ {
+ const WEIGHT wEach = (WEIGHT) (1.0 / uN);
+ for (unsigned i = 0; i != uN; ++i)
+ m_Weights[i] = wEach;
+ }
+ }
+
+void MSA::SetHenikoffWeightsPB() const // Fills m_Weights using CalcHenikoffWeightsColPB() per column, then normalizes.
+ {
+ const unsigned uColCount = GetColCount();
+ const unsigned uSeqCount = GetSeqCount();
+
+ if (0 == uSeqCount)
+ return;
+ else if (1 == uSeqCount)
+ {
+ m_Weights[0] = 1.0;
+ return;
+ }
+ else if (2 == uSeqCount)
+ {
+ m_Weights[0] = (WEIGHT) 0.5;
+ m_Weights[1] = (WEIGHT) 0.5;
+ return;
+ }
+
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ m_Weights[uSeqIndex] = 0.0;
+
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+ CalcHenikoffWeightsColPB(uColIndex);
+
+// Set all-gap seqs weight to 0
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ if (IsGapSeq(uSeqIndex))
+ m_Weights[uSeqIndex] = 0.0;
+
+// Check for special case of identical sequences, which will cause all
+// columns to be skipped because they're perfectly conserved.
+ if (VectorIsZero(m_Weights, uSeqCount))
+ VectorSet(m_Weights, uSeqCount, 1.0);
+
+ Normalize(m_Weights, uSeqCount);
+ }
Added: trunk/packages/muscle/branches/upstream/current/html.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/html.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/html.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,136 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <ctype.h>
+#include "msa.h"
+#include "textfile.h"
+
+const unsigned uCharsPerLine = 60; // alignment columns written per output row in MSA::ToHTMLFile
+const int MIN_NAME = 10; // lower bound applied to the width of the name column
+const int MAX_NAME = 32; // upper bound applied to the width of the name column; also sizes the Name buffer
+
+extern void AssignColors(const MSA &a, int **Colors); // defined outside this file; called by MakeColors()
+
+static int **MakeColors(const MSA &a)
+ {
+// Allocates a zero-filled [sequence][column] int table with new[], lets AssignColors() fill it, and returns it.
+ const unsigned uRows = a.GetSeqCount();
+ const unsigned uCols = a.GetColCount();
+ int **Table = new int *[uRows];
+ for (unsigned uRow = 0; uRow != uRows; ++uRow)
+ {
+ Table[uRow] = new int[uCols];
+ memset(Table[uRow], 0, uCols*sizeof(int));
+ }
+ AssignColors(a, Table);
+ return Table;
+ }
+
+static void ChangeColor(TextFile &File, int From, int To) // Writes the SPAN switch for color code To, unless To equals From.
+ {
+ if (From == To)
+ return;
+
+#define COLOR_WHITE "FFFFFF"
+#define COLOR_GRAY "C0C0C0"
+#define COLOR_BLACK "000000"
+#define COLOR_RED "FF0000"
+#define COLOR_GREEN "00FF00"
+#define COLOR_BLUE "5590FF"
+#define COLOR_LIGHTBLUE "77FFFF"
+
+#define X(c) File.PutString("</SPAN><SPAN STYLE=\"background-color:#" c "\">"); // closes the current SPAN and opens one with background c
+ switch (To) // codes other than 0..3 write nothing
+ {
+ case 0:
+ X(COLOR_WHITE)
+ break;
+ case 1:
+ X(COLOR_GRAY)
+ break;
+ case 2:
+ X(COLOR_BLUE)
+ break;
+ case 3:
+ X(COLOR_LIGHTBLUE)
+ break;
+ }
+ }
+
+#define COLOR_WINDOW "FFEEE0"
+
+// Writes this alignment to File as an HTML page: a PRE block holding row groups of uCharsPerLine columns, with a background color per residue.
+void MSA::ToHTMLFile(TextFile &File) const
+ {
+ File.PutString("<HTML>\n");
+ File.PutString("<BODY BGCOLOR=\"#" COLOR_WINDOW "\">\n");
+ File.PutString("<PRE>");
+ int **Colors = MakeColors(*this); // allocated with new[]; released before returning
+ int iLongestNameLength = 0; // width of the name column, clamped to [MIN_NAME, MAX_NAME] below
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+ {
+ const char *ptrName = GetSeqName(uSeqIndex);
+ const char *ptrBlank = strchr(ptrName, ' ');
+ int iLength; // length of the name up to its first blank, if any
+ if (0 != ptrBlank)
+ iLength = (int) (ptrBlank - ptrName);
+ else
+ iLength = (int) strlen(ptrName);
+ if (iLength > iLongestNameLength)
+ iLongestNameLength = iLength;
+ }
+ if (iLongestNameLength > MAX_NAME)
+ iLongestNameLength = MAX_NAME;
+ if (iLongestNameLength < MIN_NAME)
+ iLongestNameLength = MIN_NAME;
+
+// Ceiling division: same result as (n - 1)/uCharsPerLine + 1 for n > 0, and 0 row groups (no unsigned wrap) for n == 0.
+ unsigned uLineCount = (GetColCount() + uCharsPerLine - 1)/uCharsPerLine;
+ int CurrentColor = -1;
+ for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
+ {
+ File.PutString("\n");
+ unsigned uStartColIndex = uLineIndex*uCharsPerLine;
+ unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
+ if (uEndColIndex >= GetColCount())
+ uEndColIndex = GetColCount() - 1;
+ char Name[MAX_NAME+1];
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+ {
+ const char *ptrName = GetSeqName(uSeqIndex);
+ const char *ptrBlank = strchr(ptrName, ' ');
+ int iLength;
+ if (0 != ptrBlank)
+ iLength = (int) (ptrBlank - ptrName);
+ else
+ iLength = (int) strlen(ptrName);
+ if (iLength > MAX_NAME)
+ iLength = MAX_NAME;
+ memset(Name, ' ', MAX_NAME); // blank-pad, then overlay the name and cut at the column width
+ memcpy(Name, ptrName, iLength);
+ Name[iLongestNameLength] = 0;
+ CurrentColor = -1; // forces a SPAN switch on the first residue of each row
+ File.PutString("<SPAN STYLE=\"background-color:#" COLOR_WINDOW "\">");
+ File.PutFormat("%s ", Name);
+ File.PutString("<SPAN STYLE=\"background-color:#FFFFFF\">");
+ for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; ++uColIndex)
+ {
+ const int Color = Colors[uSeqIndex][uColIndex];
+ ChangeColor(File, CurrentColor, Color);
+ CurrentColor = Color;
+ const char c = GetChar(uSeqIndex, uColIndex);
+ if (Color == 0) // color 0 residues are written in lower case, all others in upper case
+ File.PutFormat("%c", tolower((unsigned char) c));
+ else
+ File.PutFormat("%c", toupper((unsigned char) c));
+ }
+ File.PutString("\n");
+ }
+ }
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) // release the rows, then the row table, from MakeColors()
+ delete[] Colors[uSeqIndex];
+ delete[] Colors;
+ File.PutString("</SPAN>\n");
+ File.PutString("</PRE>\n");
+ File.PutString("</BODY>\n");
+ File.PutString("</HTML>\n");
+ }
Added: trunk/packages/muscle/branches/upstream/current/hydro.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/hydro.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/hydro.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,37 @@
+#include "muscle.h"
+#include "profile.h"
+
+// Apply hydrophobicity heuristic to a profile: once g_uHydrophobicRunLength consecutive columns qualify, each column of that run (and every further column extending it) gets its gap open/close scores multiplied by g_dHydroFactor.
+void Hydro(ProfPos *Prof, unsigned uLength)
+ {
+ if (0 == g_uHydrophobicRunLength)
+ return;
+
+ unsigned uRunLength = 0; // qualifying columns seen in a row, ending at the current column
+ for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
+ {
+ ProfPos &PP = Prof[uColIndex];
+ bool bHydro = (PP.m_fOcc > 0.999 && IsHydrophobic(PP.m_fcCounts));
+ if (bHydro)
+ {
+ ++uRunLength;
+ if (uRunLength > g_uHydrophobicRunLength)
+ {
+ PP.m_scoreGapOpen *= (SCORE) g_dHydroFactor;
+ PP.m_scoreGapClose *= (SCORE) g_dHydroFactor;
+ }
+ else if (uRunLength == g_uHydrophobicRunLength)
+ {
+ for (unsigned n = uColIndex + 1 - g_uHydrophobicRunLength; // first column of the run; cannot wrap since the run length never exceeds uColIndex + 1
+ n <= uColIndex; ++n)
+ {
+ ProfPos &PPRun = Prof[n];
+ PPRun.m_scoreGapOpen *= (SCORE) g_dHydroFactor;
+ PPRun.m_scoreGapClose *= (SCORE) g_dHydroFactor;
+ }
+ }
+ }
+ else
+ uRunLength = 0;
+ }
+ }
Added: trunk/packages/muscle/branches/upstream/current/intmath.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/intmath.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/intmath.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,352 @@
+#include "muscle.h"
+#include <math.h>
+
+PROB ScoreToProb(SCORE Score)
+    {
+    // 2^(Score/INTSCALE); scores at or below the -infinity sentinel give 0.
+    const double dExponent = (double) Score/INTSCALE;
+    return (Score <= MINUS_INFINITY) ? (PROB) 0.0 : (PROB) pow(2.0, dExponent);
+    }
+
+static const double log2e = log2(exp(1.0));
+
+double lnTolog2(double ln)  // rescale a natural logarithm to base 2
+    {
+    return log2e*ln;
+    }
+
+double log2(double x)
+    {
+    // log(x)/log(2), with the reciprocal of ln 2 cached so that each call
+    // costs one multiplication rather than a division.
+    static const double dInvLn2 = 1.0/log(2.0);
+
+    if (x == 0)
+        return MINUS_INFINITY;
+    return dInvLn2*log(x);
+    }
+
+SCORE ProbToScore(PROB Prob)
+    {
+    // Score is log2(Prob); probability zero maps to the -infinity sentinel.
+    // INTSCALE is not applied here (an older scaled, floor()ed form was retired).
+    const bool bZero = (Prob == 0.0);
+    return bZero ? MINUS_INFINITY : (SCORE) log2(Prob);
+    }
+
+WEIGHT DoubleToWeight(double d)  // scale a non-negative real weight by INTSCALE
+    {
+    assert(0 <= d);
+    return (WEIGHT) (d*INTSCALE);
+    }
+
+double WeightToDouble(WEIGHT w)  // divide the INTSCALE factor back out of a weight
+    {
+    return ((double) w)/((double) INTSCALE);
+    }
+
+SCORE DoubleToScore(double d)  // scale a real value by INTSCALE and narrow it to SCORE
+    {
+    return (SCORE) (((double) INTSCALE)*d);
+    }
+
+bool ScoreEq(SCORE s1, SCORE s2)  // approximate equality of two scores; forwards to BTEq
+ {
+ return BTEq(s1, s2);
+ }
+
+static bool BTEq2(BASETYPE b1, BASETYPE b2)
+    {
+    // Equal when within 1e-4 absolutely, or within 0.5% of |b1| + |b2|.
+    const double dAbsDiff = fabs(b1 - b2);
+    const bool bClose = (dAbsDiff < 0.0001);
+    const double dMagnitude = fabs(b1) + fabs(b2);
+    return bClose || dAbsDiff/dMagnitude < 0.005;
+    }
+
+bool BTEq(double b1, double b2)  // narrows both values to BASETYPE, then applies the BTEq2 tolerance test
+ {
+ return BTEq2((BASETYPE) b1, (BASETYPE) b2);
+ }
+
+const double dLn2 = log(2);
+
+// pow2(x) = 2^x, evaluated as e^(x ln 2); the -infinity sentinel maps to 0.
+double pow2(double x)
+    {
+    const bool bIsMinusInf = (x == MINUS_INFINITY);
+    const double dNatExponent = dLn2*x;
+    return bIsMinusInf ? 0 : exp(dNatExponent);
+    }
+
+// lp2(x) = log2(1 + 2^-x), meant for x >= 0
+double lp2(double x)
+    {
+    return log2(pow2(-x) + 1);
+    }
+
+SCORE SumLog(SCORE x, SCORE y)  // log2(2^x + 2^y)
+    {
+    const double dSum = pow2(x) + pow2(y);
+    return (SCORE) log2(dSum);
+    }
+
+SCORE SumLog(SCORE x, SCORE y, SCORE z)  // log2(2^x + 2^y + 2^z)
+    {
+    const double dSum = pow2(x) + pow2(y) + pow2(z);
+    return (SCORE) log2(dSum);
+    }
+
+SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z)  // log2(2^w + 2^x + 2^y + 2^z)
+    {
+    const double dSum = pow2(w) + pow2(x) + pow2(y) + pow2(z);
+    return (SCORE) log2(dSum);
+    }
+
+SCORE lp2Fast(SCORE x)  // table-driven approximation of lp2(x) for x >= 0
+    {
+    assert(x >= 0);
+    const int iTableSize = 1000;
+    const double dRange = 20.0;  // inputs at or beyond this value return 0
+    const double dScale = dRange/iTableSize;  // width of one table bucket
+    static SCORE s_Table[iTableSize];
+    static bool s_bReady = false;
+    if (!s_bReady)
+        {
+        for (int iSlot = iTableSize - 1; iSlot >= 0; --iSlot)
+            s_Table[iSlot] = (SCORE) lp2(iSlot*dScale);  // lp2 at the bucket's lower edge
+        s_bReady = true;
+        }
+    if (dRange <= x)
+        return 0.0;
+    const int iSlot = (int) (x/dScale);
+    assert(0 <= iSlot && iSlot < iTableSize);
+    const SCORE Approx = s_Table[iSlot];
+    assert(BTEq(Approx, lp2(x)));
+    return Approx;
+    }
+
+// SumLogFast(x, y) approximates SumLog(x, y) = log2(2^x + 2^y) using lp2Fast.
+SCORE SumLogFast(SCORE x, SCORE y)
+    {
+    // A -infinity operand contributes nothing, so the other operand is the
+    // answer (which is -infinity itself when both are).
+    if (x == MINUS_INFINITY)
+        return y;
+    if (y == MINUS_INFINITY)
+        return x;
+
+    // log2(2^hi + 2^lo) = hi + lp2(hi - lo), with hi the larger operand.
+    const bool bXLarger = (x > y);
+    const SCORE Hi = bXLarger ? x : y;
+    const SCORE Lo = bXLarger ? y : x;
+    const SCORE Approx = Hi + lp2Fast(Hi - Lo);
+    // NOTE(review): exact == against a table-based value looks likely to
+    // fire when asserts are enabled -- confirm the intended tolerance.
+    assert(SumLog(x, y) == Approx);
+    return Approx;
+    }
+
+SCORE SumLogFast(SCORE x, SCORE y, SCORE z)  // three terms: x combined with the sum of y and z
+    {
+    const SCORE Approx = SumLogFast(x, SumLogFast(y, z));
+    assert(SumLog(x, y, z) == Approx);
+    return Approx;
+    }
+
+SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z)  // four terms: (w, x) combined with (y, z)
+    {
+    const SCORE Approx = SumLogFast(SumLogFast(w, x), SumLogFast(y, z));
+    assert(SumLog(w, x, y, z) == Approx);
+    return Approx;
+    }
+
+double VecSum(const double v[], unsigned n)  // left-to-right sum of the first n entries of v
+    {
+    double dTotal = 0.0;
+    for (const double *p = v; p != v + n; ++p)
+        dTotal += *p;
+    return dTotal;
+    }
+
+void Normalize(PROB p[], unsigned n)  // divide p[0..n-1] in place by their sum; calls Quit() when the sum is 0
+    {
+    PROB Total = 0.0;
+    for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+        Total += p[uIndex];
+    if (Total == 0.0)
+        Quit("Normalize, sum=0");
+
+    for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+        p[uIndex] /= Total;
+    }
+
+void NormalizeUnlessZero(PROB p[], unsigned n)  // divide p[0..n-1] in place by their sum; a zero sum leaves p untouched
+    {
+    PROB Total = 0.0;
+    for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+        Total += p[uIndex];
+    if (Total != 0.0)
+        {
+        for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+            p[uIndex] /= Total;
+        }
+    }
+
+void Normalize(PROB p[], unsigned n, double dRequiredTotal)  // rescale p[0..n-1] in place so they sum to dRequiredTotal
+    {
+    double dTotal = 0.0;
+    for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+        dTotal += p[uIndex];
+    if (dTotal == 0.0)
+        Quit("Normalize, sum=0");
+
+    const PROB Scale = (PROB) (dRequiredTotal/dTotal);
+    for (unsigned uIndex = 0; uIndex < n; ++uIndex)
+        p[uIndex] *= Scale;
+    }
+
+bool VectorIsZero(const double dValues[], unsigned n)  // true iff each of the first n entries equals 0
+    {
+    unsigned uIndex = 0;
+    while (uIndex < n && dValues[uIndex] == 0.0)
+        ++uIndex;
+    return uIndex == n;
+    }
+
+void VectorSet(double dValues[], unsigned n, double d)  // store d in each of the first n entries
+    {
+    for (double *p = dValues; p != dValues + n; ++p)
+        *p = d;
+    }
+
+bool VectorIsZero(const float dValues[], unsigned n)  // true iff each of the first n entries equals 0
+    {
+    unsigned uIndex = 0;
+    while (uIndex < n && dValues[uIndex] == 0.0)
+        ++uIndex;
+    return uIndex == n;
+    }
+
+void VectorSet(float dValues[], unsigned n, float d)  // store d in each of the first n entries
+    {
+    for (float *p = dValues; p != dValues + n; ++p)
+        *p = d;
+    }
+
+double Correl(const double P[], const double Q[], unsigned uCount)  // Pearson correlation coefficient of P and Q
+    {
+    // Pass 1: means of the two samples.
+    double dTotalP = 0.0, dTotalQ = 0.0;
+    for (unsigned i = 0; i < uCount; ++i)
+        {
+        dTotalP += P[i];
+        dTotalQ += Q[i];
+        }
+    const double dMeanP = dTotalP/uCount;
+    const double dMeanQ = dTotalQ/uCount;
+
+    // Pass 2: co-deviation and the two squared-deviation sums.
+    double dCoDev = 0.0, dSqDevP = 0.0, dSqDevQ = 0.0;
+    for (unsigned i = 0; i < uCount; ++i)
+        {
+        const double dDevP = P[i] - dMeanP;
+        const double dDevQ = Q[i] - dMeanQ;
+        dCoDev += dDevP*dDevQ;
+        dSqDevP += dDevP*dDevP;
+        dSqDevQ += dDevQ*dDevQ;
+        }
+    // A zero numerator is reported as 0 without performing the division.
+    if (dCoDev == 0)
+        return 0;
+    const double dCorrel = dCoDev / sqrt(dSqDevP*dSqDevQ);
+    return dCorrel;
+    }
+
+float Correl(const float P[], const float Q[], unsigned uCount)  // single-precision Pearson correlation coefficient of P and Q
+    {
+    // Pass 1: means of the two samples.
+    float fTotalP = 0.0, fTotalQ = 0.0;
+    for (unsigned i = 0; i < uCount; ++i)
+        {
+        fTotalP += P[i];
+        fTotalQ += Q[i];
+        }
+    const float fMeanP = fTotalP/uCount;
+    const float fMeanQ = fTotalQ/uCount;
+
+    // Pass 2: co-deviation and the two squared-deviation sums.
+    float fCoDev = 0.0, fSqDevP = 0.0, fSqDevQ = 0.0;
+    for (unsigned i = 0; i < uCount; ++i)
+        {
+        const float fDevP = P[i] - fMeanP;
+        const float fDevQ = Q[i] - fMeanQ;
+        fCoDev += fDevP*fDevQ;
+        fSqDevP += fDevP*fDevP;
+        fSqDevQ += fDevQ*fDevQ;
+        }
+    // A zero numerator is reported as 0 without performing the division.
+    if (fCoDev == 0)
+        return 0;
+    const float fCorrel = fCoDev / (float) sqrt(fSqDevP*fSqDevQ);
+    return fCorrel;
+    }
+
+// Rank each entry of P into Ranks (1-based), allowing for ties: tied values
+// all receive the mean of the positions they occupy. Deliberately quadratic;
+// correctness and simplicity are priorities over speed here.
+void Rank(const float P[], float Ranks[], unsigned uCount)
+    {
+    for (unsigned uTarget = 0; uTarget < uCount; ++uTarget)
+        {
+        const double dTarget = P[uTarget];
+        unsigned uBelow = 0;
+        unsigned uTied = 0;  // counts the target entry itself as well
+        unsigned uAbove = 0;
+        for (unsigned uOther = 0; uOther < uCount; ++uOther)
+            {
+            const double dOther = P[uOther];
+            if (dOther < dTarget)
+                ++uBelow;
+            else if (dOther == dTarget)
+                ++uTied;
+            else
+                ++uAbove;
+            }
+        assert(uTied >= 1);
+        assert(uBelow + uTied + uAbove == uCount);
+        Ranks[uTarget] = (float) (1 + uBelow + (uTied - 1)/2.0);
+        }
+    }
+
+void Rank(const double P[], double Ranks[], unsigned uCount)  // 1-based ranks of P; tied values share the mean of their positions
+    {
+    for (unsigned uTarget = 0; uTarget < uCount; ++uTarget)
+        {
+        const double dTarget = P[uTarget];
+        unsigned uBelow = 0;
+        unsigned uTied = 0;  // counts the target entry itself as well
+        unsigned uAbove = 0;
+        for (unsigned uOther = 0; uOther < uCount; ++uOther)
+            {
+            const double dOther = P[uOther];
+            if (dOther < dTarget)
+                ++uBelow;
+            else if (dOther == dTarget)
+                ++uTied;
+            else
+                ++uAbove;
+            }
+        assert(uTied >= 1);
+        assert(uBelow + uTied + uAbove == uCount);
+        Ranks[uTarget] = (double) (1 + uBelow + (uTied - 1)/2.0);
+        }
+    }
+
+FCOUNT SumCounts(const FCOUNT Counts[])  // total of the first 20 entries of Counts
+    {
+    FCOUNT Total = 0;
+    for (const FCOUNT *p = Counts; p != Counts + 20; ++p)
+        Total += *p;
+    return Total;
+    }
Added: trunk/packages/muscle/branches/upstream/current/intmath.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/intmath.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/intmath.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,210 @@
+// IntMath.h: Header for doing fractional math with integers for speed.
+
+#ifndef IntMath_h
+#define IntMath_h
+
+typedef float BASETYPE;
+//typedef double BASETYPE;
+
+// Scaling factor used to store certain floating point
+// values as integers to a few significant figures.
+//const int INTSCALE = 1000;
+const int INTSCALE = 1;
+
+// Type for a probability in range 0.0 to 1.0.
+typedef BASETYPE PROB;
+
+// Type for a log-odds integer score.
+// Stored as log2(PROB)*INTSCALE.
+//typedef int SCORE;
+typedef BASETYPE SCORE;
+
+// Type for a weight.
+// Stored as w*INTSCALE where w is in range 0.0 to 1.0.
+//typedef unsigned WEIGHT;
+typedef BASETYPE WEIGHT;
+
+// Type for a fractional weighted count stored as n*WEIGHT/N
+// where n=measured count (integer >= 0) and N is total for
+// the distribution (e.g., n=number of residues of a given
+// type in a column, N=number of residues in the column).
+// Hence values in an FCOUNT variable range from 0..INTSCALE
+// as an integer, representing "true" values 0.0 to 1.0.
+//typedef unsigned FCOUNT;
+typedef BASETYPE FCOUNT;
+
+// Representation of -infinity. Value should
+// be large and negative, but not so large
+// that adding a few of them overflows.
+// TODO: Multiplied by 10 to work around bug
+// when aligning Bali 1ckaA in ref4, which is
+// so long that B->Mmax got to -infinity, causing
+// traceback to fail.
+//const int MINUS_INFINITY = -10000000;
+const BASETYPE MINUS_INFINITY = (BASETYPE) -1e37;
+const BASETYPE PLUS_INFINITY = (BASETYPE) 1e37;
+
+// Probability relative to a null model
+typedef double RPROB;
+
+PROB ScoreToProb(SCORE Score);
+SCORE ProbToScore(PROB Prob);
+SCORE DoubleToScore(double d);
+WEIGHT DoubleToWeight(double d);
+double WeightToDouble(WEIGHT w);
+SCORE MulScoreWeight(SCORE Score, WEIGHT Weight);
+bool ScoreEq(SCORE s1, SCORE s2);
+bool BTEq(double b1, double b2);
+
+static inline double ScoreToDouble(SCORE Score)  // divide the INTSCALE factor back out of a score; inline like the header's other helpers
+    {
+    return (double) Score / (double) INTSCALE;
+    }
+
+#if 0
+// In-line assembler for Result = (x*y)/z
+// Note that imul and idiv will do 64-bit arithmetic
+// on 32-bit operands, so this shouldn't overflow
+// Can't write this efficiently in C/C++ (would
+// often overflow 32 bits).
+#define MulDivAssign(Result, x, y, z) \
+ { \
+ int X = (x); \
+ int Y = (y); \
+ int Z = (z); \
+ _asm mov eax,X \
+ _asm imul Y \
+ _asm mov ecx,Z \
+ _asm idiv ecx \
+ _asm mov Result,eax \
+ }
+#else
+#define MulDivAssign(Result, x, y, z) Result = (((x)*(y))/(z))
+#endif
+
+#define MulScoreWeight(r, s, w) MulDivAssign(r, s, w, INTSCALE)
+#define MulWeightWCount(r, wt, wc) MulDivAssign(r, wt, wc, INTSCALE)
+#define MulFCountScore(r, fc, sc) MulDivAssign(r, fc, sc, INTSCALE)
+
+#if _DEBUG
+
+static inline SCORE Add2(SCORE a, SCORE b)
+    {
+    // Checked addition (debug builds): -infinity absorbs the other operand,
+    // and a sum that drops below the sentinel is clamped back to it.
+    if (a == MINUS_INFINITY || b == MINUS_INFINITY)
+        return MINUS_INFINITY;
+
+    const SCORE Total = a + b;
+    if (Total < MINUS_INFINITY)
+        return MINUS_INFINITY;
+    return Total;
+    }
+
+static inline SCORE Add3(SCORE a, SCORE b, SCORE c)  // (a + b) + c through Add2
+ {
+ return Add2(Add2(a, b), c);
+ }
+
+static inline SCORE Add4(SCORE a, SCORE b, SCORE c, SCORE d)  // (a + b) + (c + d) through Add2
+ {
+ return Add2(Add2(a, b), Add2(c, d));
+ }
+
+static inline SCORE Add5(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e)  // pairs (a, b) and (c, d) via Add2, then Add3 with e
+ {
+ return Add3(Add2(a, b), Add2(c, d), e);
+ }
+
+static inline SCORE Add6(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f)  // three Add2 pairs combined by Add3
+ {
+ return Add3(Add2(a, b), Add2(c, d), Add2(e, f));
+ }
+
+static inline SCORE Add7(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f, SCORE g)  // three Add2 pairs plus g, combined by Add4
+ {
+ return Add4(Add2(a, b), Add2(c, d), Add2(e, f), g);
+ }
+
+static inline SCORE Mul2(SCORE a, SCORE b)
+    {
+    // Checked multiplication (debug builds): a -infinity operand makes the
+    // result -infinity; otherwise the plain product is returned.
+    const bool bEitherMinusInf =
+      (a == MINUS_INFINITY || b == MINUS_INFINITY);
+    if (bEitherMinusInf)
+        return MINUS_INFINITY;
+
+    return a*b;
+    }
+
+static inline SCORE Sub2(SCORE a, SCORE b)
+    {
+    // Checked subtraction (debug builds): a -infinity operand on either
+    // side gives -infinity, and a result below the sentinel is clamped.
+    if (a == MINUS_INFINITY || b == MINUS_INFINITY)
+        return MINUS_INFINITY;
+
+    const SCORE Delta = a - b;
+    if (Delta < MINUS_INFINITY)
+        return MINUS_INFINITY;
+    return Delta;
+    }
+
+static inline SCORE Div2(SCORE a, int b)
+    {
+    // Checked division (debug builds): -infinity divided by anything stays -infinity.
+    const bool bMinusInf = (a == MINUS_INFINITY);
+    return bMinusInf ? MINUS_INFINITY : a/b;
+    }
+
+//static inline SCORE MulScoreWeight(SCORE s, WEIGHT w)
+// {
+// SCORE Prod = s*(SCORE) w;
+// assert(Prod < OVERFLOW_WARN);
+// extern void Log(const char Format[], ...);
+// if (Prod/(SCORE) w != s)
+// Log("**WARRNING MulScoreWeight Prod=%d w=%d Prod/w=%d s=%d\n",
+// Prod,
+// w,
+// Prod/(SCORE) w,
+// s);
+// assert(Prod/ (SCORE) w == s);
+// return Prod/INTSCALE;
+// }
+//
+//static inline WCOUNT MulWeightWCount(WEIGHT wt, WCOUNT wc)
+// {
+// return (wt*wc)/INTSCALE;
+// }
+
+#else
+#define Add2(a, b) ((a) + (b))
+#define Sub2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) - (b)))
+#define Div2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) / (b)))
+#define Add3(a, b, c) ((a) + (b) + (c))
+#define Add4(a, b, c, d) ((a) + (b) + (c) + (d))
+#define Add5(a, b, c, d, e) ((a) + (b) + (c) + (d) + (e))
+#define Add6(a, b, c, d, e, f) ((a) + (b) + (c) + (d) + (e) + (f))
+#define Add7(a, b, c, d, e, f, g) ((a) + (b) + (c) + (d) + (e) + (f) + (g))
+//#define MulScoreWeight(s, w) (((s)*(SCORE) (w))/INTSCALE)
+#define Mul2(a, b) ((a)*(b))
+#endif
+
+//static inline SCORE MulFCountScore(FCOUNT fc, SCORE sc)
+// {
+//// Fast way to say "if (fc >= 2^15 || sc >= 2^15)":
+// if ((fc | sc) & 0xffff1000)
+// {
+// SCORE Score = ((fc+5)/10)*sc;
+// assert(Score < assert);
+// OVERFLOW_WARN(Score > MINUS_INFINITY);
+// return Score/(INTSCALE/10);
+// }
+// SCORE Score = fc*sc;
+// assert(Score < OVERFLOW_WARN);
+// assert(Score > MINUS_INFINITY);
+// return Score/INTSCALE;
+// }
+
+#endif // IntMath_h
Added: trunk/packages/muscle/branches/upstream/current/local.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/local.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/local.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,100 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "msa.h"
+#include "profile.h"
+#include "pwpath.h"
+#include "tree.h"
+
+#define TRACE 0
+
+static void MSAFromFileName(const char *FileName, MSA &a)  // fill alignment a from the text file named FileName
+    {
+    TextFile InFile(FileName);
+    a.FromFile(InFile);
+    }
+
+static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree)
+    {
+    // Give every sequence an id equal to its row index, build tree from
+    // the alignment, register it, then derive the profile.
+    for (unsigned uRow = 0, uRowCount = msa.GetSeqCount(); uRow < uRowCount; ++uRow)
+        msa.SetSeqId(uRow, uRow);
+    TreeFromMSA(msa, tree, g_Cluster1, g_Distance1, g_Root1);
+    SetMuscleTree(tree);
+    return ProfileFromMSA(msa);
+    }
+
+void Local()  // align the two input alignments (-in1, -in2) with SW and write the combined alignment to the output file
+    {
+    if (!g_pstrFileName1 || !g_pstrFileName2)
+        Quit("Must specify both -in1 and -in2 for -sw");
+
+    SetSeqWeightMethod(g_SeqWeight1);
+
+    // Read both input alignments.
+    MSA msa1;
+    MSA msa2;
+    MSAFromFileName(g_pstrFileName1, msa1);
+    MSAFromFileName(g_pstrFileName2, msa2);
+
+    ALPHA Alphabet = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alphabet = msa1.GuessAlpha();  // guessed from the first input only
+        break;
+
+    case SEQTYPE_Protein:
+        Alphabet = ALPHA_Amino;
+        break;
+
+    case SEQTYPE_DNA:
+        Alphabet = ALPHA_DNA;
+        break;
+
+    case SEQTYPE_RNA:
+        Alphabet = ALPHA_RNA;
+        break;
+
+    default:
+        Quit("Invalid SeqType");
+        }
+    SetAlpha(Alphabet);
+
+    msa1.FixAlpha();
+    msa2.FixAlpha();
+
+    if (Alphabet == ALPHA_DNA || Alphabet == ALPHA_RNA)
+        SetPPScore(PPSCORE_SPN);
+
+    // The id count is set to the larger of the two sequence counts.
+    const unsigned uSeqCount1 = msa1.GetSeqCount();
+    const unsigned uSeqCount2 = msa2.GetSeqCount();
+    MSA::SetIdCount(uSeqCount1 > uSeqCount2 ? uSeqCount1 : uSeqCount2);
+
+    // Column counts are read before the profiles are built.
+    unsigned uLength1 = msa1.GetColCount();
+    unsigned uLength2 = msa2.GetColCount();
+    Tree tree1;
+    Tree tree2;
+
+    ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1);
+    ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2);
+
+    PWPath Path;
+    SW(Prof1, uLength1, Prof2, uLength2, Path);
+
+#if TRACE
+    Path.LogMe();
+#endif
+
+    MSA msaOut;
+    AlignTwoMSAsGivenPathSW(Path, msa1, msa2, msaOut);
+
+#if TRACE
+    msaOut.LogMe();
+#endif
+
+    TextFile fileOut(g_pstrOutFileName, true);
+    msaOut.ToFile(fileOut);
+    }
Added: trunk/packages/muscle/branches/upstream/current/main.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/main.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/main.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,66 @@
+#include "muscle.h"
+#include <stdio.h>
+#ifdef WIN32
+#include <windows.h> // for SetPriorityClass()
+#include <io.h> // for isatty()
+#else
+#include <unistd.h> // for isatty()
+#endif
+
+int g_argc;
+char **g_argv;
+
+int main(int argc, char **argv)
+    {
+#ifdef WIN32
+// Multi-tasking does not work well in CPU-bound
+// console apps running under Win32.
+// Reducing the process priority allows GUI apps
+// to run responsively in parallel.
+    SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+#endif
+    g_argc = argc;
+    g_argv = argv;
+
+    SetNewHandler();
+    SetStartTime();
+    ProcessArgVect(argc - 1, argv + 1);
+    SetParams();
+    SetLogFile();
+
+// Program entry point. The arguments (minus argv[0]) are parsed above;
+// a version request and a missing command are answered below before
+// Run() does the real work. Every path leaves through exit().
+
+    if (g_bVersion)
+        {
+        printf(MUSCLE_LONG_VERSION "\n");
+        exit(EXIT_SUCCESS);
+        }
+
+    if (!g_bQuiet)
+        Credits();
+
+    if (MissingCommand() && isatty(0))  // no command given and stdin is a terminal
+        {
+        Usage();
+        exit(EXIT_SUCCESS);
+        }
+
+    if (g_bCatchExceptions)
+        {
+        try
+            {
+            Run();
+            }
+        catch (...)
+            {
+            OnException();
+            exit(EXIT_Except);
+            }
+        }
+    else
+        Run();
+
+    exit(EXIT_Success);
+    }
Added: trunk/packages/muscle/branches/upstream/current/makerootmsa.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/makerootmsa.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/makerootmsa.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,230 @@
+#include "muscle.h"
+#include "tree.h"
+#include "seqvect.h"
+#include "profile.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "estring.h"
+
+#define TRACE 0
+#define VALIDATE 0
+
+static void PathSeq(const Seq &s, const PWPath &Path, bool bRight, Seq &sOut)
+    {
+    // Thread sequence s through alignment path Path, writing the gapped
+    // result to sOut. Edge types 'M' and 'I' consume the next residue of s;
+    // 'D' emits a gap. When bRight is set, 'I' and 'D' are swapped first.
+    // No e-strings are built here: the path alone determines the output,
+    // and nothing in this function would read or release them.
+    const unsigned uEdgeCount = Path.GetEdgeCount();
+
+    sOut.Clear();
+    sOut.SetName(s.GetName());
+    unsigned uPos = 0;  // index of the next unconsumed residue of s
+    for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+        {
+        const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
+        char cType = Edge.cType;
+        if (bRight)
+            {
+            if (cType == 'I')
+                cType = 'D';
+            else if (cType == 'D')
+                cType = 'I';
+            }
+        switch (cType)
+            {
+        case 'M':
+            sOut.AppendChar(s[uPos++]);
+            break;
+        case 'D':
+            sOut.AppendChar('-');
+            break;
+        case 'I':
+            sOut.AppendChar(s[uPos++]);
+            break;
+        default:
+            Quit("PathSeq, invalid edge type %c", cType);
+            }
+        }
+    }
+
+#if VALIDATE
+
+static void MakeRootSeq(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex,
+  const ProgNode Nodes[], Seq &sRoot)
+    {
+    // Walk from the leaf up to the root, re-threading the sequence through
+    // the alignment path stored at each ancestor.
+    sRoot.Copy(s);
+    unsigned uChild = uLeafNodeIndex;
+    unsigned uParent = GuideTree.GetParent(uChild);
+    while (NULL_NEIGHBOR != uParent)
+        {
+        const bool bSwap = (GuideTree.GetLeft(uParent) == uChild);
+        Seq sThreaded;
+        PathSeq(sRoot, Nodes[uParent].m_Path, bSwap, sThreaded);
+        sRoot.Copy(sThreaded);
+        uChild = uParent;
+        uParent = GuideTree.GetParent(uChild);
+        }
+    }
+
+#endif // VALIDATE
+
+static short *MakeRootSeqE(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex,
+  const ProgNode Nodes[], Seq &sRoot, short *Estring1, short *Estring2)
+    {
+    // Estring1/Estring2 are caller-supplied scratch buffers used alternately
+    // for the running product; the one holding the final e-string is returned.
+    short *EstringCurr = Estring1;
+    short *EstringNext = Estring2;
+    const unsigned uSeqLength = s.Length();
+    EstringCurr[0] = uSeqLength;
+    EstringCurr[1] = 0;
+    unsigned uChild = uLeafNodeIndex;
+    for (;;)
+        {
+        const unsigned uParent = GuideTree.GetParent(uChild);
+        if (uParent == NULL_NEIGHBOR)
+            break;
+        const bool bIsLeftChild = (GuideTree.GetLeft(uParent) == uChild);
+        uChild = uParent;
+        // A left child is combined with the parent's m_EstringL, any other child with m_EstringR.
+        const ProgNode &Parent = Nodes[uParent];
+        const short *EstringNode = bIsLeftChild ? Parent.m_EstringL : Parent.m_EstringR;
+
+        MulEstrings(EstringCurr, EstringNode, EstringNext);
+#if TRACE
+        Log("\n");
+        Log("Curr=");
+        LogEstring(EstringCurr);
+        Log("\n");
+        Log("Node=");
+        LogEstring(EstringNode);
+        Log("\n");
+        Log("Prod=");
+        LogEstring(EstringNext);
+        Log("\n");
+#endif
+        short *EstringProduct = EstringNext;  // the product becomes the current e-string
+        EstringNext = EstringCurr;
+        EstringCurr = EstringProduct;
+        }
+    EstringOp(EstringCurr, s, sRoot);
+
+#if TRACE
+    Log("Root estring=");
+    LogEstring(EstringCurr);
+    Log("\n");
+    Log("Root seq=");
+    sRoot.LogMe();
+#endif
+    return EstringCurr;
+    }
+
+static unsigned GetFirstNodeIndex(const Tree &tree)
+    {
+    // With g_bStable set the scan starts at node index 0; otherwise it
+    // starts at the tree's first depth-first node.
+    return g_bStable ? 0 : tree.FirstDepthFirstNode();
+    }
+
+static unsigned GetNextNodeIndex(const Tree &tree, unsigned uPrevNodeIndex)
+    {
+    // Returns the next leaf after uPrevNodeIndex, or NULL_NEIGHBOR when
+    // there are no more. The visiting order depends on g_bStable.
+    if (!g_bStable)
+        {
+        // Follow the tree's depth-first order, skipping non-leaf nodes.
+        unsigned uNode = tree.NextDepthFirstNode(uPrevNodeIndex);
+        while (NULL_NEIGHBOR != uNode && !tree.IsLeaf(uNode))
+            uNode = tree.NextDepthFirstNode(uNode);
+        return uNode;
+        }
+
+    // g_bStable: scan upward by node index for the next leaf.
+    const unsigned uNodeCount = tree.GetNodeCount();
+    for (unsigned uNode = uPrevNodeIndex + 1; uNode < uNodeCount; ++uNode)
+        {
+        if (tree.IsLeaf(uNode))
+            return uNode;
+        }
+    // Ran off the end of the node table without finding another leaf.
+    return NULL_NEIGHBOR;
+    }
+
+void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[],
+  MSA &a)
+    {
+#if TRACE
+    Log("MakeRootMSA Tree=");
+    GuideTree.LogMe();
+#endif
+    const unsigned uSeqCount = v.GetSeqCount();
+    unsigned uColCount = uInsane;  // fixed by the first root sequence produced
+    unsigned uSeqIndex = 0;  // next output row of a
+    // The two scratch e-strings are sized from the edge count of the root path.
+    const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
+    const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path;
+    const unsigned uRootColCount = RootPath.GetEdgeCount();
+    const unsigned uEstringSize = uRootColCount + 1;
+    short *Estring1 = new short[uEstringSize];
+    short *Estring2 = new short[uEstringSize];
+    SetProgressDesc("Root alignment");
+    // Emit one row of a per leaf, in the order given by GetFirstNodeIndex/GetNextNodeIndex.
+    unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree);
+    do
+        {
+        Progress(uSeqIndex, uSeqCount);
+
+        unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
+        const Seq &s = *(v[uId]);
+
+        Seq sRootE;
+        short *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE,
+          Estring1, Estring2);
+        Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es);  // es points into a scratch buffer; the leaf keeps a copy
+
+#if VALIDATE
+        Seq sRoot;
+        MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot);
+        if (!sRoot.Eq(sRootE))
+            {
+            Log("sRoot=");
+            sRoot.LogMe();
+            Log("sRootE=");
+            sRootE.LogMe();
+            Quit("Root seqs differ");
+            }
+#endif
+
+#if TRACE
+        Log("MakeRootSeq=\n");
+        sRootE.LogMe();  // sRoot is declared only under VALIDATE; log the sequence that is actually used
+#endif
+        if (uInsane == uColCount)
+            {
+            uColCount = sRootE.Length();
+            a.SetSize(uSeqCount, uColCount);
+            }
+        else
+            {
+            assert(uColCount == sRootE.Length());  // every root sequence must have the same length
+            }
+        a.SetSeqName(uSeqIndex, s.GetName());
+        a.SetSeqId(uSeqIndex, uId);
+        for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+            a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]);
+        ++uSeqIndex;
+
+        uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex);
+        }
+    while (NULL_NEIGHBOR != uTreeNodeIndex);
+
+    delete[] Estring1;
+    delete[] Estring2;
+
+    ProgressStepsDone();
+    assert(uSeqIndex == uSeqCount);
+    }
Added: trunk/packages/muscle/branches/upstream/current/makerootmsab.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/makerootmsab.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/makerootmsab.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,62 @@
+#include "muscle.h"
+#include "tree.h"
+#include "profile.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "pwpath.h"
+
+static void DoSeq(Seq &s, unsigned uSeqIndex, const ProfPos *RootProf,
+  unsigned uRootProfLength, MSA &msaOut)  // align s to the root profile and store the result as row uSeqIndex of msaOut
+    {
+    MSA msaSeq;
+    msaSeq.FromSeq(s);
+    const unsigned uSeqLength = s.Length();
+    // Stand-in alignment for the root profile: a single row of '?' columns.
+    MSA msaDummy;
+    msaDummy.SetSize(1, uRootProfLength);
+    msaDummy.SetSeqId(0, 0);
+    msaDummy.SetSeqName(0, "Dummy0");
+    for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex)
+        msaDummy.SetChar(0, uColIndex, '?');
+    // Gap open/close scores of the sequence profile are forced to MINUS_INFINITY.
+    ProfPos *SeqProf = ProfileFromMSA(msaSeq);  // NOTE(review): never released in this function -- confirm ownership
+    for (unsigned uColIndex = 0; uColIndex < uSeqLength; ++uColIndex)
+        {
+        ProfPos &PP = SeqProf[uColIndex];
+        PP.m_scoreGapOpen = MINUS_INFINITY;
+        PP.m_scoreGapClose = MINUS_INFINITY;
+        }
+    // Only the path is needed; the output profile is checked and discarded.
+    ProfPos *ProfOut;
+    unsigned uLengthOut;
+    PWPath Path;
+    AlignTwoProfs(SeqProf, uSeqLength, 1.0, RootProf, uRootProfLength, 1.0,
+      Path, &ProfOut, &uLengthOut);
+    assert(uLengthOut == uRootProfLength);  // compare, not assign: '=' overwrote uLengthOut and only tested for non-zero
+    delete[] ProfOut;
+
+    MSA msaCombined;
+    AlignTwoMSAsGivenPath(Path, msaSeq, msaDummy, msaCombined);
+    // Row 0 of the combined alignment is copied into row uSeqIndex of msaOut.
+    msaCombined.LogMe();
+    msaOut.SetSeqName(uSeqIndex, s.GetName());
+    msaOut.SetSeqId(uSeqIndex, s.GetId());
+    for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex)
+        msaOut.SetChar(uSeqIndex, uColIndex, msaCombined.GetChar(0, uColIndex));
+    }
+
+// Steven Brenner's O(NL^2) proposal for creating a root alignment:
+// align every sequence, one at a time, to the profile stored at the
+// root node. Compare the e-string solution, which is O(NL log N).
+void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[],
+  MSA &a)
+    {
+    const unsigned uSeqCount = v.Length();
+    const ProgNode &Root = Nodes[GuideTree.GetRootNodeIndex()];
+    const ProfPos *RootProfile = Root.m_Prof;
+    const unsigned uRootColCount = Root.m_uLength;
+    a.SetSize(uSeqCount, uRootColCount);
+
+    for (unsigned uRow = 0; uRow < uSeqCount; ++uRow)
+        DoSeq(*v[uRow], uRow, RootProfile, uRootColCount, a);
+    }
Added: trunk/packages/muscle/branches/upstream/current/mhack.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/mhack.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/mhack.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,64 @@
+#include "muscle.h"
+#include "seqvect.h"
+#include "msa.h"
+
+/***
+Methionine hack.
+Most proteins start with M.
+This results in odd-looking alignments with the terminal Ms aligned followed
+immediately by gaps.
+Hack this by treating terminal M like X.
+***/
+
+static bool *M;
+
+void MHackStart(SeqVect &v)  // amino-acid input only: replace a leading M/m with 'X' and flag that sequence's id in M
+    {
+    if (g_Alpha != ALPHA_Amino)
+        return;
+    const unsigned uSeqCount = v.Length();
+    M = new bool[uSeqCount];
+    memset(M, 0, uSeqCount*sizeof(M[0]));
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        Seq &s = v.GetSeq(uSeqIndex);
+        if (0 == s.Length())
+            continue;
+        const unsigned uId = s.GetId();
+        const char cFirst = s[0];
+        if ('M' == cFirst || 'm' == cFirst)
+            {
+            M[uId] = true;
+            s[0] = 'X';
+            }
+        }
+    }
+
+void MHackEnd(MSA &msa)
+    {
+    // Undo MHackStart: for every row whose id is flagged in M, write 'M'
+    // into the first non-gap column, then release the flag array.
+    // Does nothing for non-amino alphabets or when no flags exist.
+    if (ALPHA_Amino != g_Alpha || 0 == M)
+        return;
+
+    const unsigned uSeqCount = msa.GetSeqCount();
+    const unsigned uColCount = msa.GetColCount();
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        if (!M[msa.GetSeqId(uSeqIndex)])
+            continue;
+
+        // Skip leading gaps; a row made up entirely of gaps is left alone.
+        unsigned uColIndex = 0;
+        while (uColIndex < uColCount && msa.IsGap(uSeqIndex, uColIndex))
+            ++uColIndex;
+        if (uColIndex < uColCount)
+            msa.SetChar(uSeqIndex, uColIndex, 'M');
+        }
+
+    // Clearing the pointer turns a second call into a no-op.
+    delete[] M;
+    M = 0;
+
+    }
Added: trunk/packages/muscle/branches/upstream/current/mk
===================================================================
--- trunk/packages/muscle/branches/upstream/current/mk 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/mk 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,10 @@
+ofiles=`echo *.o`  # NOTE(review): never referenced below -- confirm it can go
+
+# find -name "*.o" -exec "rm" "{}" ";"
+
+make -f Makefile 2> make.err  # build; stderr from make is captured in make.err
+
+# rm *.o
+
+cat make.err  # show the captured stderr output
+ls -l muscle  # list the muscle file
Property changes on: trunk/packages/muscle/branches/upstream/current/mk
___________________________________________________________________
Name: svn:executable
+
Added: trunk/packages/muscle/branches/upstream/current/mpam200.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/mpam200.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/mpam200.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,107 @@
+#include "muscle.h"
+
+const float PAM_200_CENTER = (float) 20.0;  // offset added to every PAM200 cell by v()
+
+#define v(x) ((float) (x) + PAM_200_CENTER)  // argument parenthesized so the cast applies to the whole expression
+#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \
+ v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) },
+
+float PAM200[32][32] =
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+ROW( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67,
+ 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A
+ROW( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196,
+ -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C
+ROW( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361,
+ -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D
+ROW( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327,
+ -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E
+ROW( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320,
+ 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F
+ROW( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324,
+ -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G
+ROW( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92,
+ -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H
+ROW( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288,
+ 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I
+ROW( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249,
+ -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K
+ROW( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591,
+ 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L
+ROW( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369,
+ 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M
+ROW( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223,
+ -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N
+ROW( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53,
+ -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P
+ROW( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86,
+ -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q
+ROW( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170,
+ -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R
+ROW( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69,
+ -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S
+ROW( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41,
+ 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T
+ROW( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239,
+ 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V
+ROW( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66,
+ -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W
+ROW( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29,
+ -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y
+ };
+
+#undef v
+#define v(x) ((float) (x))  // uncentered cell; argument parenthesized so the cast applies to the whole expression
+#define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \
+ v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) },
+
+float PAM200NoCenter[32][32] =
+
+ {
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+RNC( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67,
+ 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A
+RNC( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196,
+ -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C
+RNC( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361,
+ -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D
+RNC( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327,
+ -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E
+RNC( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320,
+ 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F
+RNC( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324,
+ -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G
+RNC( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92,
+ -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H
+RNC( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288,
+ 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I
+RNC( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249,
+ -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K
+RNC( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591,
+ 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L
+RNC( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369,
+ 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M
+RNC( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223,
+ -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N
+RNC( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53,
+ -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P
+RNC( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86,
+ -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q
+RNC( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170,
+ -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R
+RNC( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69,
+ -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S
+RNC( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41,
+ 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T
+RNC( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239,
+ 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V
+RNC( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66,
+ -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W
+RNC( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29,
+ -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y
+ };
Added: trunk/packages/muscle/branches/upstream/current/msa.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msa.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msa.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,851 @@
+#include "muscle.h"
+#include "msa.h"
+#include "textfile.h"
+#include "seq.h"
+#include <math.h>
+
+// Column capacity step: SetSeqCount() uses it as the initial per-sequence
+// capacity and SetChar() grows every sequence buffer by this many columns.
+const unsigned DEFAULT_SEQ_LENGTH = 500;
+
+// Static member shared by all MSA objects; assigned by MSA::SetIdCount().
+unsigned MSA::m_uIdCount = 0;
+
+// Construct an empty alignment: zero sequences, zero columns and no
+// buffers allocated. Members are listed in their declaration order.
+MSA::MSA() :
+    m_uSeqCount(0),
+    m_uColCount(0),
+    m_uCacheSeqLength(0),
+    m_uCacheSeqCount(0),
+    m_szSeqs(0),
+    m_szNames(0),
+    m_IdToSeqIndex(0),
+    m_SeqIndexToId(0),
+    m_Weights(0)
+    {
+    }
+
+// Destroy the alignment; every owned buffer is released by Free().
+MSA::~MSA()
+    {
+    Free();
+    }
+
+// Release every buffer owned by the alignment and return the object to the
+// empty state established by the constructor.
+//
+// The cache counters are reset too. AppendSeq() decides whether it must call
+// ExpandCache() by comparing m_uSeqCount with m_uCacheSeqCount; leaving a
+// stale non-zero m_uCacheSeqCount behind would let it store through the
+// now-null m_szSeqs pointer.
+//
+// Only the first m_uSeqCount slots of m_szSeqs / m_szNames are deleted.
+// NOTE(review): ExpandCache() also allocates buffers for slots beyond
+// m_uSeqCount; those look like they are never freed here -- confirm.
+void MSA::Free()
+    {
+    for (unsigned n = 0; n < m_uSeqCount; ++n)
+        {
+        delete[] m_szSeqs[n];
+        delete[] m_szNames[n];
+        }
+
+    delete[] m_szSeqs;
+    delete[] m_szNames;
+    delete[] m_Weights;
+    delete[] m_IdToSeqIndex;
+    delete[] m_SeqIndexToId;
+
+    m_uSeqCount = 0;
+    m_uColCount = 0;
+    m_uCacheSeqCount = 0;
+    m_uCacheSeqLength = 0;
+
+    m_szSeqs = 0;
+    m_szNames = 0;
+    m_Weights = 0;
+
+    m_IdToSeqIndex = 0;
+    m_SeqIndexToId = 0;
+    }
+
+// Discard the current contents and allocate storage for uSeqCount sequences,
+// each with room for uColCount characters plus a terminating NUL.
+//
+// On return the column count is 0 (SetChar() raises it as characters are
+// written) and every name pointer is null. Outside DEBUG builds the weights
+// and the sequence characters are left uninitialised. The id maps are
+// allocated only if SetIdCount() has already set a non-zero id count.
+void MSA::SetSize(unsigned uSeqCount, unsigned uColCount)
+    {
+    Free();
+
+    m_uSeqCount = uSeqCount;
+    m_uCacheSeqLength = uColCount;
+    m_uColCount = 0;
+
+    const bool bNothingToAllocate = (0 == uSeqCount && 0 == uColCount);
+    if (bNothingToAllocate)
+        return;
+
+    m_szSeqs = new char *[uSeqCount];
+    m_szNames = new char *[uSeqCount];
+    m_Weights = new WEIGHT[uSeqCount];
+
+    for (unsigned i = 0; i < uSeqCount; ++i)
+        {
+        char *ptrSeq = new char[uColCount+1];
+        m_szSeqs[i] = ptrSeq;
+        m_szNames[i] = 0;
+#if DEBUG
+        m_Weights[i] = BTInsane;
+        memset(ptrSeq, '?', uColCount);
+#endif
+        ptrSeq[uColCount] = 0;
+        }
+
+    if (0 == m_uIdCount)
+        return;
+
+    m_IdToSeqIndex = new unsigned[m_uIdCount];
+    m_SeqIndexToId = new unsigned[m_uSeqCount];
+#if DEBUG
+    memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));
+    memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));
+#endif
+    }
+
+// Dump the alignment to the log in blocks of 50 columns. Each block has two
+// ruler lines (units digit of every column index, then the index of every
+// tenth column) followed by one line per sequence: name limited to 12
+// characters, weight unless it equals BTInsane, the residues and, when an
+// id map exists, the sequence id.
+void MSA::LogMe() const
+    {
+    const unsigned uTotalCols = GetColCount();
+    if (0 == uTotalCols)
+        {
+        Log("MSA empty\n");
+        return;
+        }
+
+    const unsigned uColsPerLine = 50;
+    const unsigned uBlockCount = (uTotalCols - 1)/uColsPerLine + 1;
+    for (unsigned uBlock = 0; uBlock < uBlockCount; ++uBlock)
+        {
+        const unsigned uFrom = uBlock*uColsPerLine;
+        unsigned uTo = uTotalCols;
+        if (uTo - uFrom + 1 > uColsPerLine)
+            uTo = uFrom + uColsPerLine;
+
+        // Ruler line 1: last digit of each column index.
+        Log(" ");
+        for (unsigned uCol = uFrom; uCol < uTo; ++uCol)
+            Log("%u", uCol%10);
+        Log("\n");
+
+        // Ruler line 2: index of every tenth column; the final block also
+        // reports the total column count.
+        Log(" ");
+        for (unsigned uCol = uFrom; uCol + 9 < uTo; uCol += 10)
+            Log("%-10u", uCol);
+        if (uBlock + 1 == uBlockCount)
+            Log(" %-10u", uTotalCols);
+        Log("\n");
+
+        for (unsigned uSeq = 0; uSeq < m_uSeqCount; ++uSeq)
+            {
+            Log("%12.12s", m_szNames[uSeq]);
+            if (BTInsane == m_Weights[uSeq])
+                Log(" ");
+            else
+                Log(" (%5.3f)", m_Weights[uSeq]);
+            Log(" ");
+            for (unsigned uCol = uFrom; uCol < uTo; ++uCol)
+                Log("%c", GetChar(uSeq, uCol));
+            if (0 != m_SeqIndexToId)
+                Log(" [%5u]", m_SeqIndexToId[uSeq]);
+            Log("\n");
+            }
+        Log("\n\n");
+        }
+    }
+
+// Return the character stored at (uSeqIndex, uIndex). Quit() is called when
+// either index is outside the current sequence / column counts.
+char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const
+    {
+// TODO: Performance cost?
+    const bool bInRange = (uSeqIndex < m_uSeqCount && uIndex < m_uColCount);
+    if (!bInRange)
+        Quit("MSA::GetChar(%u/%u,%u/%u)",
+          uSeqIndex, m_uSeqCount, uIndex, m_uColCount);
+
+// assert(IsLegalChar(c));
+    return m_szSeqs[uSeqIndex][uIndex];
+    }
+
+// Return the letter code (via CharToLetter) of the character at
+// (uSeqIndex, uIndex). Codes of 20 or more are reported through Quit().
+unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const
+    {
+// TODO: Performance cost?
+    const char cStored = GetChar(uSeqIndex, uIndex);
+    const unsigned uLetter = CharToLetter(cStored);
+    if (uLetter < 20)
+        return uLetter;
+
+    // Re-read the raw character for the error message, defaulting to a
+    // blank when the indexes are out of range.
+    char cShown = ' ';
+    if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount)
+        cShown = m_szSeqs[uSeqIndex][uIndex];
+    Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u",
+      uSeqIndex, m_uSeqCount, uIndex, m_uColCount, cShown, uLetter);
+    return uLetter;
+    }
+
+// Return CharToLetterEx() of the character at (uSeqIndex, uIndex); unlike
+// GetLetter() the resulting code is not range-checked here.
+unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const
+    {
+// TODO: Performance cost?
+    const char cStored = GetChar(uSeqIndex, uIndex);
+    return CharToLetterEx(cStored);
+    }
+
+// Replace the name of sequence uSeqIndex with a private copy of szName.
+//
+// uSeqIndex  index of the sequence; Quit() is called if it is out of range.
+// szName     NUL-terminated name to copy; must not be null.
+//
+// The error message now receives szName for its %s conversion (previously
+// the sequence count was passed where a string was expected). The copy is
+// made before the old name is released so that passing the currently stored
+// name, e.g. SetSeqName(i, GetSeqName(i)), does not read freed memory.
+void MSA::SetSeqName(unsigned uSeqIndex, const char szName[])
+    {
+    if (uSeqIndex >= m_uSeqCount)
+        Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, szName, m_uSeqCount);
+
+    const size_t uBytes = strlen(szName) + 1;
+    char *ptrCopy = new char[uBytes];
+    memcpy(ptrCopy, szName, uBytes);
+
+    delete[] m_szNames[uSeqIndex];
+    m_szNames[uSeqIndex] = ptrCopy;
+    }
+
+// Return the stored name pointer of sequence uSeqIndex (null until
+// SetSeqName() has been called for it). Quit() is called on a bad index.
+const char *MSA::GetSeqName(unsigned uSeqIndex) const
+    {
+    const bool bValid = (uSeqIndex < m_uSeqCount);
+    if (!bValid)
+        Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount);
+    return m_szNames[uSeqIndex];
+    }
+
+// True if the character at (uSeqIndex, uIndex) satisfies IsGapChar().
+bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const
+    {
+    const char cStored = GetChar(uSeqIndex, uIndex);
+    const bool bGap = IsGapChar(cStored);
+    return bGap;
+    }
+
+// True if the character at (uSeqIndex, uIndex) satisfies IsWildcardChar().
+bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const
+    {
+    const char cStored = GetChar(uSeqIndex, uIndex);
+    const bool bWildcard = IsWildcardChar(cStored);
+    return bWildcard;
+    }
+
+// Store c at (uSeqIndex, uIndex), extending the alignment if needed.
+//
+// Writing exactly one past the allocated length grows every sequence buffer
+// by DEFAULT_SEQ_LENGTH columns (new columns are filled with '?'). Writing
+// at or beyond the current column count raises the column count to
+// uIndex + 1. Any other out-of-range index is reported through Quit().
+void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c)
+    {
+    if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength)
+        Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex);
+
+    if (uIndex == m_uCacheSeqLength)
+        {
+        const unsigned uOldLength = m_uCacheSeqLength;
+        const unsigned uGrownLength = uOldLength + DEFAULT_SEQ_LENGTH;
+        for (unsigned uSeq = 0; uSeq < m_uSeqCount; ++uSeq)
+            {
+            char *ptrOld = m_szSeqs[uSeq];
+            char *ptrGrown = new char[uGrownLength+1];
+            memcpy(ptrGrown, ptrOld, uOldLength);
+            memset(ptrGrown + uOldLength, '?', DEFAULT_SEQ_LENGTH);
+            ptrGrown[uGrownLength] = 0;
+            delete[] ptrOld;
+            m_szSeqs[uSeq] = ptrGrown;
+            }
+
+        m_uColCount = uIndex;
+        m_uCacheSeqLength = uGrownLength;
+        }
+
+    if (m_uColCount <= uIndex)
+        m_uColCount = uIndex + 1;
+    m_szSeqs[uSeqIndex][uIndex] = c;
+    }
+
+// Extract sequence uSeqIndex into seq with gaps removed and letters
+// upper-cased; seq is cleared first and receives the sequence name.
+//
+// Any non-gap character that is not alphabetic is reported through Quit().
+// Characters are converted to unsigned char before being handed to
+// isalpha()/toupper(): those functions are only defined for values
+// representable as unsigned char (or EOF), so a plain char with the high
+// bit set must not be passed directly.
+void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const
+    {
+    assert(uSeqIndex < m_uSeqCount);
+
+    seq.Clear();
+
+    for (unsigned n = 0; n < m_uColCount; ++n)
+        {
+        const char c = GetChar(uSeqIndex, n);
+        if (IsGapChar(c))
+            continue;
+
+        const unsigned char uc = (unsigned char) c;
+        if (!isalpha(uc))
+            Quit("Invalid character '%c' in sequence", c);
+        seq.push_back((char) toupper(uc));
+        }
+
+    const char *ptrName = GetSeqName(uSeqIndex);
+    seq.SetName(ptrName);
+    }
+
+// True if any position of any sequence is a gap.
+bool MSA::HasGap() const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    const unsigned uColCount = GetColCount();
+    for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+        {
+        for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+            {
+            if (IsGap(uSeq, uCol))
+                return true;
+            }
+        }
+    return false;
+    }
+
+// A letter code is legal when it is below 20.
+bool MSA::IsLegalLetter(unsigned uLetter) const
+    {
+    const bool bLegal = (uLetter < 20);
+    return bLegal;
+    }
+
+// Reset the alignment to uSeqCount sequences with the default column
+// capacity (DEFAULT_SEQ_LENGTH). Existing contents are discarded.
+void MSA::SetSeqCount(unsigned uSeqCount)
+    {
+    Free();
+    SetSize(uSeqCount, DEFAULT_SEQ_LENGTH);
+    }
+
+// Copy column uFromCol over column uToCol in every sequence. Copying a
+// column onto itself is a no-op.
+void MSA::CopyCol(unsigned uFromCol, unsigned uToCol)
+    {
+    assert(uFromCol < GetColCount());
+    assert(uToCol < GetColCount());
+    if (uFromCol != uToCol)
+        {
+        const unsigned uSeqCount = GetSeqCount();
+        for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+            SetChar(uSeq, uToCol, GetChar(uSeq, uFromCol));
+        }
+    }
+
+// Make this alignment a copy of msa: same dimensions, names, ids and
+// characters. Weights are not copied. The ids are read with GetSeqId(),
+// which indexes the source's id map without a null check.
+void MSA::Copy(const MSA &msa)
+    {
+    Free();
+    const unsigned uRows = msa.GetSeqCount();
+    const unsigned uCols = msa.GetColCount();
+    SetSize(uRows, uCols);
+
+    for (unsigned uRow = 0; uRow < uRows; ++uRow)
+        {
+        const char *ptrName = msa.GetSeqName(uRow);
+        SetSeqName(uRow, ptrName);
+
+        const unsigned uId = msa.GetSeqId(uRow);
+        SetSeqId(uRow, uId);
+
+        for (unsigned uCol = 0; uCol < uCols; ++uCol)
+            SetChar(uRow, uCol, msa.GetChar(uRow, uCol));
+        }
+    }
+
+// True if every sequence has a gap in column uColIndex.
+bool MSA::IsGapColumn(unsigned uColIndex) const
+    {
+    assert(GetSeqCount() > 0);
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        if (!IsGap(uSeq, uColIndex))
+            return false;
+        ++uSeq;
+        }
+    return true;
+    }
+
+// Look a sequence up by name using stricmp(). On success the index of the
+// first match is stored in *ptruSeqIndex and true is returned; otherwise
+// false is returned and *ptruSeqIndex is left untouched.
+bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        if (0 == stricmp(ptrSeqName, GetSeqName(uSeq)))
+            {
+            *ptruSeqIndex = uSeq;
+            return true;
+            }
+        ++uSeq;
+        }
+    return false;
+    }
+
+// Remove column uColIndex from every sequence by shifting the following
+// characters one position to the left, then decrement the column count.
+// The shift covers (column count - uColIndex) bytes, i.e. it also moves the
+// byte that sits just past the last column.
+void MSA::DeleteCol(unsigned uColIndex)
+    {
+    assert(uColIndex < m_uColCount);
+    const size_t uBytes = m_uColCount - uColIndex;
+    if (uBytes > 0)
+        {
+        const unsigned uSeqCount = GetSeqCount();
+        for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+            {
+            char *ptrDst = m_szSeqs[uSeq] + uColIndex;
+            memmove(ptrDst, ptrDst + 1, uBytes);
+            }
+        }
+    --m_uColCount;
+    }
+
+// Remove uColCount consecutive columns starting at uColIndex by deleting
+// the column at uColIndex repeatedly.
+void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount)
+    {
+    unsigned uRemaining = uColCount;
+    while (uRemaining > 0)
+        {
+        DeleteCol(uColIndex);
+        --uRemaining;
+        }
+    }
+
+// Load the alignment from File; currently this always delegates to the
+// FASTA reader.
+void MSA::FromFile(TextFile &File)
+    {
+    FromFASTAFile(File);
+    }
+
+// Weights sum to 1, WCounts sum to NIC
+// Return the weight of sequence uSeqIndex; Quit() is called if the stored
+// value equals wInsane.
+WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    const WEIGHT wStored = m_Weights[uSeqIndex];
+    if (wInsane == wStored)
+        Quit("Seq weight not set");
+    return wStored;
+    }
+
+// Store w as the weight of sequence uSeqIndex. Declared const: only the
+// array the object points to is modified, not the object's own members.
+void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    WEIGHT *ptrSlot = m_Weights + uSeqIndex;
+    *ptrSlot = w;
+    }
+
+// Scale all sequence weights so that they add up to wDesiredTotal. Nothing
+// is changed when the current total is exactly zero.
+void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const
+    {
+    WEIGHT wSum = 0;
+    for (unsigned i = 0; i < m_uSeqCount; ++i)
+        wSum += m_Weights[i];
+
+    if (0 == wSum)
+        return;
+
+    const WEIGHT wScale = wDesiredTotal/wSum;
+    for (unsigned i = 0; i < m_uSeqCount; ++i)
+        m_Weights[i] *= wScale;
+    }
+
+// Placeholder: always reports "not implemented" through Quit().
+void MSA::CalcWeights() const
+    {
+    Quit("Calc weights not implemented");
+    }
+
+// Log character c followed by enough blanks to fill a field of uWidth
+// characters (uWidth - 1 blanks).
+//
+// The loop counts from 1 so that uWidth == 0 emits no padding; the previous
+// bound "n < uWidth - 1" wrapped around for a zero width.
+static void FmtChar(char c, unsigned uWidth)
+    {
+    Log("%c", c);
+    for (unsigned n = 1; n < uWidth; ++n)
+        Log(" ");
+    }
+
+// Log u left-justified in a field of uWidth characters, padded with
+// blanks. A value of zero is shown as "." instead of "0".
+static void FmtInt(unsigned u, unsigned uWidth)
+    {
+    static char szStr[1024];
+    assert(uWidth < sizeof(szStr));
+    if (0 == u)
+        strcpy(szStr, ".");
+    else
+        sprintf(szStr, "%u", u);
+    Log(szStr);
+
+    // Pad from the end of the text up to the field width.
+    const unsigned uTextLength = (unsigned) strlen(szStr);
+    for (unsigned uPos = uTextLength; uPos < uWidth; ++uPos)
+        Log(" ");
+    }
+
+// Log u left-justified in a field of uWidth characters, padded with
+// blanks. Unlike FmtInt(), zero is shown as "0".
+static void FmtInt0(unsigned u, unsigned uWidth)
+    {
+    static char szStr[1024];
+    assert(uWidth < sizeof(szStr));
+    sprintf(szStr, "%u", u);
+    Log(szStr);
+
+    // Pad from the end of the text up to the field width.
+    const unsigned uTextLength = (unsigned) strlen(szStr);
+    for (unsigned uPos = uTextLength; uPos < uWidth; ++uPos)
+        Log(" ");
+    }
+
+// Log n blanks.
+static void FmtPad(unsigned n)
+    {
+    unsigned uRemaining = n;
+    while (uRemaining > 0)
+        {
+        Log(" ");
+        --uRemaining;
+        }
+    }
+
+// Turn this object into a one-sequence alignment holding s: same name,
+// same characters and, when SetSize() allocated an id map, the same id.
+void MSA::FromSeq(const Seq &s)
+    {
+    const unsigned uLength = s.Length();
+    SetSize(1, uLength);
+    SetSeqName(0, s.GetName());
+    if (0 != m_SeqIndexToId)
+        SetSeqId(0, s.GetId());
+
+    unsigned uPos = 0;
+    while (uPos < uLength)
+        {
+        SetChar(0, uPos, s[uPos]);
+        ++uPos;
+        }
+    }
+
+// Count the non-gap characters of sequence uSeqIndex in columns
+// 0..uColIndex inclusive.
+unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const
+    {
+    assert(uSeqIndex < GetSeqCount());
+    assert(uColIndex < GetColCount());
+
+    unsigned uNonGaps = 0;
+    for (unsigned uCol = 0; uCol <= uColIndex; ++uCol)
+        {
+        if (IsGap(uSeqIndex, uCol))
+            continue;
+        ++uNonGaps;
+        }
+    return uNonGaps;
+    }
+
+// Copy the characters and name of sequence uFromSeqIndex of msaFrom into
+// sequence uToSeqIndex of this alignment. If this alignment has no columns
+// yet, it adopts the column count of msaFrom. The size compatibility
+// requirements are checked by assert only.
+void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex)
+    {
+    assert(uToSeqIndex < m_uSeqCount);
+    const unsigned uSrcCols = msaFrom.GetColCount();
+    assert(m_uColCount == uSrcCols ||
+      (0 == m_uColCount && uSrcCols <= m_uCacheSeqLength));
+
+    const char *ptrSrc = msaFrom.GetSeqBuffer(uFromSeqIndex);
+    memcpy(m_szSeqs[uToSeqIndex], ptrSrc, uSrcCols);
+    SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex));
+    if (0 == m_uColCount)
+        m_uColCount = uSrcCols;
+    }
+
+// Return a read-only pointer to the internal character buffer of sequence
+// uSeqIndex. The index is checked by assert only.
+const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    const char *ptrBuffer = m_szSeqs[uSeqIndex];
+    return ptrBuffer;
+    }
+
+// Remove sequence uSeqIndex from the alignment, shifting the following
+// sequences down by one slot. All weights are discarded (m_Weights becomes
+// null), so they must be recomputed afterwards.
+//
+// Fixes relative to the previous version:
+//  - the sequence and name buffers are arrays (new char[]), so they are
+//    released with delete[] rather than scalar delete;
+//  - only the (m_uSeqCount - uSeqIndex - 1) pointers that follow the removed
+//    slot are moved; the old count moved one pointer too many and read one
+//    element past the end of the pointer arrays;
+//  - the vacated last slot is nulled so no stale duplicate pointer remains;
+//  - an out-of-range index is reported through Quit(), as SetSeqName() and
+//    GetSeqName() do.
+//
+// NOTE(review): the id maps (m_SeqIndexToId / m_IdToSeqIndex) are not
+// updated here, as before -- confirm callers do not rely on them afterwards.
+void MSA::DeleteSeq(unsigned uSeqIndex)
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    if (uSeqIndex >= m_uSeqCount)
+        {
+        Quit("MSA::DeleteSeq(%u), count=%u", uSeqIndex, m_uSeqCount);
+        return;
+        }
+
+    delete[] m_szSeqs[uSeqIndex];
+    delete[] m_szNames[uSeqIndex];
+
+    const unsigned uLast = m_uSeqCount - 1;
+    const size_t uBytesToMove = (size_t) (uLast - uSeqIndex)*sizeof(char *);
+    if (uBytesToMove > 0)
+        {
+        memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove);
+        memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove);
+        }
+    m_szSeqs[uLast] = 0;
+    m_szNames[uLast] = 0;
+
+    m_uSeqCount = uLast;
+
+    delete[] m_Weights;
+    m_Weights = 0;
+    }
+
+// True if column uColIndex holds a gap in every sequence (also true for an
+// alignment with no sequences).
+bool MSA::IsEmptyCol(unsigned uColIndex) const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        if (!IsGap(uSeq, uColIndex))
+            return false;
+        ++uSeq;
+        }
+    return true;
+    }
+
+//void MSA::DeleteEmptyCols(bool bProgress)
+// {
+// unsigned uColCount = GetColCount();
+// for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+// {
+// if (IsEmptyCol(uColIndex))
+// {
+// if (bProgress)
+// {
+// Log("Deleting col %u of %u\n", uColIndex, uColCount);
+// printf("Deleting col %u of %u\n", uColIndex, uColCount);
+// }
+// DeleteCol(uColIndex);
+// --uColCount;
+// }
+// }
+// }
+
+// Placeholder: reports "not implemented" through Quit() and returns 0.
+unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const
+    {
+    Quit("MSA::AlignedColIndexToColIndex not implemented");
+    return 0;
+    }
+
+// Return the sum of all stored sequence weights. The values are read
+// directly from m_Weights, without the "not set" check of GetSeqWeight().
+WEIGHT MSA::GetTotalSeqWeight() const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    WEIGHT wSum = 0;
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        wSum += m_Weights[uSeq];
+        ++uSeq;
+        }
+    return wSum;
+    }
+
+// Compare sequence uSeqIndex1 of a1 with sequence uSeqIndex2 of a2 after
+// extracting both with GetSeq() and stripping gaps; the comparison ignores
+// case (Seq::EqIgnoreCase).
+bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
+  unsigned uSeqIndex2)
+    {
+    Seq seqFirst;
+    Seq seqSecond;
+
+    a1.GetSeq(uSeqIndex1, seqFirst);
+    a2.GetSeq(uSeqIndex2, seqSecond);
+
+    seqFirst.StripGaps();
+    seqSecond.StripGaps();
+
+    const bool bSame = seqFirst.EqIgnoreCase(seqSecond);
+    return bSame;
+    }
+
+// Return the number of non-gap positions in sequence uSeqIndex.
+unsigned MSA::GetSeqLength(unsigned uSeqIndex) const
+    {
+    assert(uSeqIndex < GetSeqCount());
+
+    const unsigned uColCount = GetColCount();
+    unsigned uNonGaps = 0;
+    for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+        {
+        if (IsGap(uSeqIndex, uCol))
+            continue;
+        ++uNonGaps;
+        }
+    return uNonGaps;
+    }
+
+// Pairwise identity of two sequences of this alignment.
+//
+// Only columns where neither sequence has a gap are considered.
+// *ptruPosCount receives the number of such columns and *ptrPWID the
+// percentage (0..100) of them in which both characters are equal (exact,
+// case-sensitive comparison). *ptrPWID is 0 when no column qualifies.
+void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID,
+  unsigned *ptruPosCount) const
+    {
+    assert(uSeqIndex1 < GetSeqCount());
+    assert(uSeqIndex2 < GetSeqCount());
+
+    const unsigned uColCount = GetColCount();
+    unsigned uCompared = 0;
+    unsigned uIdentical = 0;
+    for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+        {
+        const char cFirst = GetChar(uSeqIndex1, uCol);
+        if (IsGapChar(cFirst))
+            continue;
+        const char cSecond = GetChar(uSeqIndex2, uCol);
+        if (IsGapChar(cSecond))
+            continue;
+        ++uCompared;
+        if (cFirst == cSecond)
+            ++uIdentical;
+        }
+
+    *ptruPosCount = uCompared;
+    if (0 == uCompared)
+        *ptrPWID = 0;
+    else
+        *ptrPWID = 100.0 * (double) uIdentical / (double) uCompared;
+    }
+
+// Mark every sequence weight as unset by storing BTInsane.
+void MSA::UnWeight()
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        m_Weights[uSeq] = BTInsane;
+        ++uSeq;
+        }
+    }
+
+// Return how many distinct letters occur in column uColIndex. Gaps and
+// wildcards are skipped; only letter codes below g_AlphaSize are counted
+// in the result.
+unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const
+    {
+    assert(uColIndex < GetColCount());
+
+    unsigned Counts[MAX_ALPHA];
+    memset(Counts, 0, sizeof(Counts));
+
+    const unsigned uSeqCount = GetSeqCount();
+    for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+        {
+        const bool bSkip = IsGap(uSeq, uColIndex) || IsWildcard(uSeq, uColIndex);
+        if (bSkip)
+            continue;
+        const unsigned uLetter = GetLetter(uSeq, uColIndex);
+        Counts[uLetter] += 1;
+        }
+
+    unsigned uDistinct = 0;
+    for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+        {
+        if (0 != Counts[uLetter])
+            ++uDistinct;
+        }
+    return uDistinct;
+    }
+
+// Occupancy of column uColIndex: the fraction of sequences that do not
+// have a gap there. No guard is applied for an alignment with zero
+// sequences (the division is then 0/0 in floating point).
+double MSA::GetOcc(unsigned uColIndex) const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uOccupied = 0;
+    for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+        {
+        if (!IsGap(uSeq, uColIndex))
+            ++uOccupied;
+        }
+    return (double) uOccupied / (double) uSeqCount;
+    }
+
+// Write the alignment to File in the format selected by the global output
+// flags, tested in this order: MSF, Aln, HTML, Phylip sequential, Phylip
+// interleaved; FASTA is the fallback. Afterwards a score file is written
+// if g_pstrScoreFileName is set.
+void MSA::ToFile(TextFile &File) const
+    {
+    if (g_bMSF)
+        {
+        ToMSFFile(File);
+        }
+    else if (g_bAln)
+        {
+        ToAlnFile(File);
+        }
+    else if (g_bHTML)
+        {
+        ToHTMLFile(File);
+        }
+    else if (g_bPHYS)
+        {
+        ToPhySequentialFile(File);
+        }
+    else if (g_bPHYI)
+        {
+        ToPhyInterleavedFile(File);
+        }
+    else
+        {
+        ToFASTAFile(File);
+        }
+
+    const bool bWantScores = (0 != g_pstrScoreFileName);
+    if (bWantScores)
+        WriteScoreFile(*this);
+    }
+
+// True if at least one sequence has a gap in column uColIndex.
+bool MSA::ColumnHasGap(unsigned uColIndex) const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uSeqCount)
+        {
+        if (IsGap(uSeq, uColIndex))
+            return true;
+        ++uSeq;
+        }
+    return false;
+    }
+
+// Set the class-wide number of sequence ids. The first call with a
+// non-zero value fixes the count; later calls leave it unchanged and call
+// Quit() if they ask for a larger count.
+void MSA::SetIdCount(unsigned uIdCount)
+    {
+    if (0 == m_uIdCount)
+        {
+        m_uIdCount = uIdCount;
+        return;
+        }
+
+    // Count already established: it can never grow.
+    if (uIdCount > m_uIdCount)
+        Quit("MSA::SetIdCount: cannot increase count");
+    }
+
+// Associate id uId with sequence uSeqIndex, recording the mapping in both
+// directions. The two lookup tables are allocated on first use and filled
+// with 0xff bytes; Quit() is called if SetIdCount() has not set a count.
+void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId)
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    assert(uId < m_uIdCount);
+
+    const bool bMapsMissing = (0 == m_SeqIndexToId);
+    if (bMapsMissing)
+        {
+        if (0 == m_uIdCount)
+            Quit("MSA::SetSeqId, SetIdCount has not been called");
+
+        const size_t uIdBytes = m_uIdCount*sizeof(unsigned);
+        const size_t uSeqBytes = m_uSeqCount*sizeof(unsigned);
+
+        m_IdToSeqIndex = new unsigned[m_uIdCount];
+        m_SeqIndexToId = new unsigned[m_uSeqCount];
+
+        memset(m_IdToSeqIndex, 0xff, uIdBytes);
+        memset(m_SeqIndexToId, 0xff, uSeqBytes);
+        }
+
+    m_SeqIndexToId[uSeqIndex] = uId;
+    m_IdToSeqIndex[uId] = uSeqIndex;
+    }
+
+// Return the sequence index recorded for id uId in the id-to-index table.
+// All validity checks are asserts only.
+unsigned MSA::GetSeqIndex(unsigned uId) const
+    {
+    assert(uId < m_uIdCount);
+    assert(0 != m_IdToSeqIndex);
+    const unsigned uFound = m_IdToSeqIndex[uId];
+    assert(uFound < m_uSeqCount);
+    return uFound;
+    }
+
+// Search the index-to-id table for uId. On success the index of the first
+// matching sequence is stored in *ptruIndex and true is returned; otherwise
+// false is returned and *ptruIndex is left untouched. The table pointer is
+// not checked for null.
+bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const
+    {
+    unsigned uSeq = 0;
+    while (uSeq < m_uSeqCount)
+        {
+        if (m_SeqIndexToId[uSeq] == uId)
+            {
+            *ptruIndex = uSeq;
+            return true;
+            }
+        ++uSeq;
+        }
+    return false;
+    }
+
+// Return the id recorded for sequence uSeqIndex in the index-to-id table.
+// Range checks are asserts only; the table pointer is not checked for null.
+unsigned MSA::GetSeqId(unsigned uSeqIndex) const
+    {
+    assert(uSeqIndex < m_uSeqCount);
+    const unsigned uFound = m_SeqIndexToId[uSeqIndex];
+    assert(uFound < m_uIdCount);
+    return uFound;
+    }
+
+// Report whether sequence weights have been assigned, judged by the first
+// weight differing from BTInsane.
+//
+// Returns false when there is no weight to inspect: m_Weights is null
+// after Free() and after DeleteSeq(), and an alignment with zero sequences
+// has no first element. Previously both cases dereferenced m_Weights[0].
+bool MSA::WeightsSet() const
+    {
+    if (0 == m_Weights || 0 == m_uSeqCount)
+        return false;
+    return BTInsane != m_Weights[0];
+    }
+
+// Build msaOut from the sequences of msaIn whose ids are listed in Ids
+// (uIdCount entries), in the order given. Each output sequence keeps its
+// id, name and all columns of the input alignment.
+void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,
+  MSA &msaOut)
+    {
+    const unsigned uCols = msaIn.GetColCount();
+    msaOut.SetSize(uIdCount, uCols);
+
+    for (unsigned uOut = 0; uOut < uIdCount; ++uOut)
+        {
+        const unsigned uId = Ids[uOut];
+        const unsigned uIn = msaIn.GetSeqIndex(uId);
+        const char *ptrName = msaIn.GetSeqName(uIn);
+
+        msaOut.SetSeqId(uOut, uId);
+        msaOut.SetSeqName(uOut, ptrName);
+
+        unsigned uCol = 0;
+        while (uCol < uCols)
+            {
+            msaOut.SetChar(uOut, uCol, msaIn.GetChar(uIn, uCol));
+            ++uCol;
+            }
+        }
+    }
+
+// Caller must allocate ptrSeq and ptrLabel as new char[n].
+// Append a sequence, taking ownership of both buffers (they are stored as
+// given, not copied). When all cached slots are in use the cache is first
+// expanded by four slots via ExpandCache().
+// NOTE(review): the slot being assigned may already hold a buffer
+// allocated by ExpandCache(); it is overwritten without being freed --
+// confirm whether that is an intended trade-off.
+void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel)
+    {
+    if (m_uSeqCount > m_uCacheSeqCount)
+        Quit("Internal error MSA::AppendSeq");
+
+    const bool bCacheFull = (m_uSeqCount == m_uCacheSeqCount);
+    if (bCacheFull)
+        ExpandCache(m_uSeqCount + 4, uSeqLength);
+
+    const unsigned uSlot = m_uSeqCount;
+    m_szSeqs[uSlot] = ptrSeq;
+    m_szNames[uSlot] = ptrLabel;
+    m_uSeqCount = uSlot + 1;
+    }
+
+// Reallocate the per-sequence pointer and weight arrays with room for
+// uSeqCount sequences of uColCount columns.
+//
+// Existing entries are carried over; each additional slot gets a freshly
+// allocated character buffer of uColCount bytes (no terminator byte), while
+// its name pointer and weight are left uninitialised. Quit() is called if
+// id maps exist, if uSeqCount is smaller than the current sequence count,
+// or if sequences exist and uColCount differs from the current column
+// count. The sequence count itself is not changed.
+void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount)
+    {
+    const bool bHasIdMaps = (0 != m_IdToSeqIndex || 0 != m_SeqIndexToId);
+    if (bHasIdMaps || uSeqCount < m_uSeqCount)
+        Quit("Internal error MSA::ExpandCache");
+
+    if (0 != m_uSeqCount && m_uColCount != uColCount)
+        Quit("Internal error MSA::ExpandCache, ColCount changed");
+
+    char **NewSeqs = new char *[uSeqCount];
+    char **NewNames = new char *[uSeqCount];
+    WEIGHT *NewWeights = new WEIGHT[uSeqCount];
+
+    // Slots already in use: carry pointers and weights over.
+    unsigned uSlot = 0;
+    for (; uSlot < m_uSeqCount; ++uSlot)
+        {
+        NewSeqs[uSlot] = m_szSeqs[uSlot];
+        NewNames[uSlot] = m_szNames[uSlot];
+        NewWeights[uSlot] = m_Weights[uSlot];
+        }
+
+    // Remaining slots: allocate a character buffer for each.
+    for (; uSlot < uSeqCount; ++uSlot)
+        {
+        char *ptrFresh = new char[uColCount];
+        NewSeqs[uSlot] = ptrFresh;
+#if DEBUG
+        memset(ptrFresh, '?', uColCount);
+#endif
+        }
+
+    delete[] m_szSeqs;
+    delete[] m_szNames;
+    delete[] m_Weights;
+
+    m_szSeqs = NewSeqs;
+    m_szNames = NewNames;
+    m_Weights = NewWeights;
+
+    m_uCacheSeqCount = uSeqCount;
+    m_uCacheSeqLength = uColCount;
+    m_uColCount = uColCount;
+    }
+
+// Replace every character that is neither a residue nor a gap with the
+// wildcard character. Each replacement is recorded with
+// InvalidLetterWarning() and the collected warnings are reported once at
+// the end.
+void MSA::FixAlpha()
+    {
+    ClearInvalidLetterWarning();
+    for (unsigned uSeq = 0; uSeq < m_uSeqCount; ++uSeq)
+        {
+        for (unsigned uCol = 0; uCol < m_uColCount; ++uCol)
+            {
+            const char cStored = GetChar(uSeq, uCol);
+            if (IsResidueChar(cStored) || IsGapChar(cStored))
+                continue;
+
+            const char cWild = GetWildcardChar();
+            // Warning("Invalid letter '%c', replaced by '%c'", c, w);
+            InvalidLetterWarning(cStored, cWild);
+            SetChar(uSeq, uCol, cWild);
+            }
+        }
+    ReportInvalidLetters();
+    }
+
+// Guess the alphabet of the alignment.
+//
+// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap
+// letters belong to the nucleotide alphabet, guess nucleo.
+// Otherwise amino. RNA is tested before DNA. Sequences are scanned row by
+// row, columns left to right.
+//
+// An alignment with no sequences or no columns is reported as amino; the
+// previous version divided by the column count without checking it for
+// zero. The scan now uses nested loops instead of a single flat counter,
+// so the position cannot wrap around on very large alignments.
+ALPHA MSA::GuessAlpha() const
+    {
+    const unsigned CHAR_COUNT = 100;
+    const unsigned MIN_NUCLEO_PCT = 95;
+
+    const unsigned uSeqCount = GetSeqCount();
+    const unsigned uColCount = GetColCount();
+    if (0 == uSeqCount || 0 == uColCount)
+        return ALPHA_Amino;
+
+    unsigned uDNACount = 0;
+    unsigned uRNACount = 0;
+    unsigned uTotal = 0;
+    for (unsigned uSeqIndex = 0;
+      uSeqIndex < uSeqCount && uTotal < CHAR_COUNT; ++uSeqIndex)
+        {
+        for (unsigned uColIndex = 0;
+          uColIndex < uColCount && uTotal < CHAR_COUNT; ++uColIndex)
+            {
+            const char c = GetChar(uSeqIndex, uColIndex);
+            if (IsGapChar(c))
+                continue;
+            if (IsDNA(c))
+                ++uDNACount;
+            if (IsRNA(c))
+                ++uRNACount;
+            ++uTotal;
+            }
+        }
+
+    // Nothing but gaps: no evidence for a nucleotide alphabet.
+    if (0 == uTotal)
+        return ALPHA_Amino;
+
+    if ((uRNACount*100)/uTotal >= MIN_NUCLEO_PCT)
+        return ALPHA_RNA;
+    if ((uDNACount*100)/uTotal >= MIN_NUCLEO_PCT)
+        return ALPHA_DNA;
+    return ALPHA_Amino;
+    }
Added: trunk/packages/muscle/branches/upstream/current/msa.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msa.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msa.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,179 @@
+#ifndef MSA_h
+#define MSA_h
+
+const int MAX_SEQ_NAME = 63;
+struct PathEdge;
+class TextFile;
+class Seq;
+class ClusterNode;
+class NodeCounts;
+class DataBuffer;
+
+// Container for a set of equal-length character sequences (an alignment,
+// presumably "multiple sequence alignment"): one character buffer, one name
+// and one weight per sequence, plus optional id <-> index lookup tables.
+//
+// NOTE(review): the class owns raw buffers that Free() deletes, but it
+// declares no copy constructor or assignment operator; Copy() is the
+// explicit way to duplicate an alignment.
+class MSA
+ {
+public:
+ MSA();
+ virtual ~MSA();
+
+public:
+// Ways to create an MSA
+ void FromFile(TextFile &File);
+ void FromFASTAFile(TextFile &File);
+ void FromSeq(const Seq &s);
+
+// Output; ToFile() selects one of the format-specific writers.
+ void ToFile(TextFile &File) const;
+ void ToFASTAFile(TextFile &File) const;
+ void ToMSFFile(TextFile &File, const char *ptrComment = 0) const;
+ void ToAlnFile(TextFile &File) const;
+ void ToHTMLFile(TextFile &File) const;
+ void ToPhySequentialFile(TextFile &File) const;
+ void ToPhyInterleavedFile(TextFile &File) const;
+
+// Sizing and element access
+ void SetSize(unsigned uSeqCount, unsigned uColCount);
+ void SetSeqCount(unsigned uSeqCount);
+ char GetChar(unsigned uSeqIndex, unsigned uIndex) const;
+ unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const;
+ unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const;
+ const char *GetSeqName(unsigned uSeqIndex) const;
+ unsigned GetSeqId(unsigned uSeqIndex) const;
+ unsigned GetSeqIndex(unsigned uId) const;
+ bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const;
+ double GetOcc(unsigned uColIndex) const;
+ void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
+ FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
+ FCOUNT *fcGapExtend, FCOUNT *ptrfOcc,
+ FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const;
+ bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const;
+ bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const;
+ bool IsGapColumn(unsigned uColIndex) const;
+ bool ColumnHasGap(unsigned uColIndex) const;
+ bool IsGapSeq(unsigned uSeqIndex) const;
+
+ void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c);
+ void SetSeqName(unsigned uSeqIndex, const char szName[]);
+ void SetSeqId(unsigned uSeqIndex, unsigned uId);
+ bool HasGap() const;
+ bool IsLegalLetter(unsigned uLetter) const;
+ void GetSeq(unsigned uSeqIndex, Seq &seq) const;
+ void Copy(const MSA &msa);
+ double GetCons(unsigned uColIndex) const;
+ double GetAvgCons() const;
+ double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;
+ bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const;
+ void DeleteCol(unsigned uColIndex);
+ void DeleteColumns(unsigned uColIndex, unsigned uColCount);
+ void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex);
+ void DeleteSeq(unsigned uSeqIndex);
+// void DeleteEmptyCols(bool bProgress = false);
+ bool IsEmptyCol(unsigned uColIndex) const;
+
+// Sequence weights. The setters are const because they write through the
+// m_Weights pointer rather than changing a member of the object itself.
+ WEIGHT GetSeqWeight(unsigned uSeqIndex) const;
+ WEIGHT GetTotalSeqWeight() const;
+ void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const;
+ void NormalizeWeights(WEIGHT wTotal) const;
+ bool WeightsSet() const;
+
+ unsigned GetGCGCheckSum(unsigned uSeqIndex) const;
+
+ ALPHA GuessAlpha() const;
+ void FixAlpha();
+
+ unsigned UniqueResidueTypes(unsigned uColIndex) const;
+
+ void UnWeight();
+
+ void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const;
+ void ValidateBreakMatrices() const;
+ unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const;
+ const char *GetSeqBuffer(unsigned uSeqIndex) const;
+ unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const;
+ unsigned GetSeqLength(unsigned uSeqIndex) const;
+ void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID,
+ unsigned *ptruPosCount) const;
+
+ void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[],
+ int iMap2[]) const;
+
+ void LogMe() const;
+ void ListWeights() const;
+
+ void GapInfoToDataBuffer(DataBuffer &Buffer) const;
+ void GapInfoFromDataBuffer(const DataBuffer &Buffer);
+ double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;
+
+// Release all storage; the alignment becomes empty.
+ void Clear()
+ {
+ Free();
+ }
+// Number of sequences currently in the alignment.
+ unsigned GetSeqCount() const
+ {
+ return m_uSeqCount;
+ }
+// Number of columns currently in use (not the allocated capacity).
+ unsigned GetColCount() const
+ {
+ return m_uColCount;
+ }
+
+ static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
+ unsigned uSeqIndex2);
+
+// Sets the class-wide id count (m_uIdCount) shared by all MSA objects.
+ static void SetIdCount(unsigned uIdCount);
+
+private:
+ friend void SetMSAWeightsMuscle(MSA &msa);
+ friend void SetThreeWayWeightsMuscle(MSA &msa);
+ void SetHenikoffWeightsPB() const;
+ void SetHenikoffWeights() const;
+ void SetGSCWeights() const;
+ void SetUniformWeights() const;
+ void SetClustalWWeights(const Tree &tree);
+
+ void Free();
+ void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel);
+ void ExpandCache(unsigned uSeqCount, unsigned uColCount);
+ void CalcWeights() const;
+ void GetNameFromFASTAAnnotationLine(const char szLine[],
+ char szName[], unsigned uBytes);
+ void CopyCol(unsigned uFromCol, unsigned uToCol);
+ unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const;
+ void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const;
+ unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const;
+ void SetSubtreeWeight2(const ClusterNode *ptrNode) const;
+ void SetSubtreeGSCWeight(ClusterNode *ptrNode) const;
+
+ void CalcHenikoffWeightsColPB(unsigned uColIndex) const;
+ void CalcHenikoffWeightsCol(unsigned uColIndex) const;
+
+private:
+// Number of sequences in use.
+ unsigned m_uSeqCount;
+// Number of columns in use.
+ unsigned m_uColCount;
+// Allocated columns per sequence buffer (set by SetSize/ExpandCache, grown
+// by SetChar).
+ unsigned m_uCacheSeqLength;
+// Allocated sequence slots as tracked by ExpandCache()/AppendSeq().
+ unsigned m_uCacheSeqCount;
+// Per-sequence character buffers and names, each allocated with new char[].
+ char **m_szSeqs;
+ char **m_szNames;
+
+// Number of sequence ids, shared by all MSA objects (see SetIdCount).
+ static unsigned m_uIdCount;
+
+// Lookup tables between sequence ids and sequence indexes; null until
+// allocated by SetSize() or SetSeqId().
+ unsigned *m_IdToSeqIndex;
+ unsigned *m_SeqIndexToId;
+
+// One weight per sequence; BTInsane marks "not set" (see UnWeight).
+ WEIGHT *m_Weights;
+ };
+
+void SeqVectFromMSA(const MSA &msa, SeqVect &v);
+void DeleteGappedCols(MSA &msa);
+void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
+ MSA &msaOut);
+void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat);
+void MSAAppend(MSA &msa1, const MSA &msa2);
+void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
+ MSA &msaOut);
+void AssertMSAEq(const MSA &msa1, const MSA &msa2);
+void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2);
+void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,
+ MSA &msaOut);
+void SetMSAWeightsMuscle(MSA &msa);
+void SetClustalWWeightsMuscle(MSA &msa);
+void SetThreeWayWeightsMuscle(MSA &msa);
+
+#endif // MSA_h
Added: trunk/packages/muscle/branches/upstream/current/msa2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msa2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msa2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,531 @@
+#include "muscle.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "profile.h"
+#include "tree.h"
+
+// These global variables are a hack to allow the tree
+// dependent iteration code to communicate the edge
+// used to divide the tree. The three-way weighting
+// scheme needs to know this edge in order to compute
+// sequence weights.
+// g_ptrMuscleTree is file-local and is assigned by SetMuscleTree();
+// it is a borrowed pointer (nothing in this file deletes it).
+static const Tree *g_ptrMuscleTree = 0;
+// The two nodes joined by the split edge. While either one is
+// NULL_NEIGHBOR, SetThreeWayWeightsMuscle() falls back to
+// Henikoff PB weights.
+unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;
+unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;
+
+// Summarize one alignment column as weighted letter counts plus gap
+// statistics, using the per-sequence weights returned by GetSeqWeight().
+//
+// Inputs:
+//   uColIndex        column to summarize.
+//   bNormalize       if true and at least one ordinary (non-wildcard)
+//                    letter was seen, fcCounts[] is divided by the total
+//                    weight of those ordinary letters.
+// Outputs:
+//   fcCounts[]       g_AlphaSize entries, zeroed here and then filled with
+//                    the weight of each letter. A wildcard spreads its
+//                    weight evenly over the letters it may stand for.
+//   *ptrfcGapStart   weight of sequences gapped here but not in the
+//                    previous column (or gapped at all, in column 0).
+//   *ptrfcGapEnd     weight of sequences gapped here but not in the next
+//                    column (or gapped at all, in the last column).
+//   *ptrfcGapExtend  weight of sequences gapped here and in both
+//                    neighbouring columns (0 for the first/last column).
+//   *ptrfOcc         1 - (weight of sequences gapped here); presumably the
+//                    weights are normalized to sum to 1 -- confirm with callers.
+//   *ptrfcLL/LG/GL/GG  weight per (previous column, this column) state
+//                    pair, L = letter, G = gap. Column 0 is treated as if
+//                    preceded by a letter.
+void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
+  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
+  FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
+  FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
+    {
+    const unsigned uSeqCount = GetSeqCount();
+    const unsigned uColCount = GetColCount();
+
+    memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
+    WEIGHT wTotal = 0;
+    FCOUNT fGap = 0;
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        const WEIGHT w = GetSeqWeight(uSeqIndex);
+        if (IsGap(uSeqIndex, uColIndex))
+            {
+            fGap += w;
+            continue;
+            }
+        else if (IsWildcard(uSeqIndex, uColIndex))
+            {
+        // Wildcard weight goes into fcCounts[] but is deliberately
+        // not added to wTotal (unchanged from the original code).
+            const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
+            switch (g_Alpha)
+                {
+            case ALPHA_Amino:
+                switch (uLetter)
+                    {
+                case AX_B: // D or N
+                    fcCounts[AX_D] += w/2;
+                    fcCounts[AX_N] += w/2;
+                    break;
+                case AX_Z: // E or Q
+                    fcCounts[AX_E] += w/2;
+                    fcCounts[AX_Q] += w/2;
+                    break;
+                default: // any
+                    {
+                    const FCOUNT f = w/20;
+                    for (unsigned n = 0; n < 20; ++n)
+                        fcCounts[n] += f;
+                    break;
+                    }
+                    }
+                break;
+
+            case ALPHA_DNA:
+            case ALPHA_RNA:
+                switch (uLetter)
+                    {
+                case AX_R: // G or A
+                    fcCounts[NX_G] += w/2;
+                    fcCounts[NX_A] += w/2;
+                    break;
+                case AX_Y: // C or T/U
+                    fcCounts[NX_C] += w/2;
+                    fcCounts[NX_T] += w/2;
+                    break;
+                default: // any
+                    {
+                // Four nucleotides share the weight, so each gets w/4.
+                // (This used to be w/20, copied from the amino case,
+                // which silently dropped 80% of the weight.)
+                    const FCOUNT f = w/4;
+                    for (unsigned n = 0; n < 4; ++n)
+                        fcCounts[n] += f;
+                    break;
+                    }
+                    }
+                break;
+
+            default:
+                Quit("Alphabet %d not supported", g_Alpha);
+                }
+            continue;
+            }
+        unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
+        fcCounts[uLetter] += w;
+        wTotal += w;
+        }
+    *ptrfOcc = (float) (1.0 - fGap);
+
+    if (bNormalize && wTotal > 0)
+        {
+    // Small tolerance for rounding; anything larger means the
+    // sequence weights were not normalized.
+        if (wTotal > 1.001)
+            Quit("wTotal=%g\n", wTotal);
+        for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+            fcCounts[uLetter] /= wTotal;
+//      AssertNormalized(fcCounts);
+        }
+
+// Gap-open weight: gap here, letter (or alignment start) before.
+    FCOUNT fcStartCount = 0;
+    if (uColIndex == 0)
+        {
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            if (IsGap(uSeqIndex, uColIndex))
+                fcStartCount += GetSeqWeight(uSeqIndex);
+        }
+    else
+        {
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))
+                fcStartCount += GetSeqWeight(uSeqIndex);
+        }
+
+// Gap-close weight: gap here, letter (or alignment end) after.
+    FCOUNT fcEndCount = 0;
+    if (uColCount - 1 == uColIndex)
+        {
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            if (IsGap(uSeqIndex, uColIndex))
+                fcEndCount += GetSeqWeight(uSeqIndex);
+        }
+    else
+        {
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))
+                fcEndCount += GetSeqWeight(uSeqIndex);
+        }
+
+// Transition weights between the previous column and this one.
+    FCOUNT LL = 0;
+    FCOUNT LG = 0;
+    FCOUNT GL = 0;
+    FCOUNT GG = 0;
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        WEIGHT w = GetSeqWeight(uSeqIndex);
+        bool bLetterHere = !IsGap(uSeqIndex, uColIndex);
+        bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));
+        if (bLetterHere)
+            {
+            if (bLetterPrev)
+                LL += w;
+            else
+                GL += w;
+            }
+        else
+            {
+            if (bLetterPrev)
+                LG += w;
+            else
+                GG += w;
+            }
+        }
+
+// Gap-extend weight: only defined for interior columns.
+    FCOUNT fcExtendCount = 0;
+    if (uColIndex > 0 && uColIndex < uColCount - 1)
+        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+            if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&
+              IsGap(uSeqIndex, uColIndex + 1))
+                fcExtendCount += GetSeqWeight(uSeqIndex);
+
+    *ptrfcLL = LL;
+    *ptrfcLG = LG;
+    *ptrfcGL = GL;
+    *ptrfcGG = GG;
+    *ptrfcGapStart = fcStartCount;
+    *ptrfcGapEnd = fcEndCount;
+    *ptrfcGapExtend = fcExtendCount;
+    }
+
+// Return true if the given column has no gaps and all
+// its residues are in the same biochemical group.
+// Return true if column uColIndex of msa has no gaps and every residue
+// in it maps to the same entry of ResidueGroup[] (defined elsewhere).
+// Quits on an alignment with no sequences.
+bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)
+    {
+    extern unsigned ResidueGroup[];
+
+// The loop below walks down the sequences of one column, so the bound
+// must be the sequence count (this used to read GetColCount()).
+    const unsigned uSeqCount = msa.GetSeqCount();
+    if (0 == uSeqCount)
+        Quit("MSAColIsConservative: empty alignment");
+
+    if (msa.IsGap(0, uColIndex))
+        return false;
+
+// NOTE(review): the first sequence is read with GetLetterEx() and the
+// rest with GetLetter(); kept as in the original -- confirm whether
+// wildcards are expected here.
+    unsigned uLetter = msa.GetLetterEx(0, uColIndex);
+    const unsigned uGroup = ResidueGroup[uLetter];
+
+    for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        if (msa.IsGap(uSeqIndex, uColIndex))
+            return false;
+        uLetter = msa.GetLetter(uSeqIndex, uColIndex);
+        if (ResidueGroup[uLetter] != uGroup)
+            return false;
+        }
+    return true;
+    }
+
+// Build msaOut from uSeqCount consecutive rows of msaIn, beginning at
+// row uFromSeqIndex. Sequence names and characters are copied; all
+// columns of msaIn are kept.
+void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
+  MSA &msaOut)
+    {
+    const unsigned uCols = msaIn.GetColCount();
+    msaOut.SetSize(uSeqCount, uCols);
+
+    for (unsigned uOut = 0; uOut < uSeqCount; ++uOut)
+        {
+        const unsigned uIn = uFromSeqIndex + uOut;
+        msaOut.SetSeqName(uOut, msaIn.GetSeqName(uIn));
+
+        for (unsigned uCol = 0; uCol < uCols; ++uCol)
+            msaOut.SetChar(uOut, uCol, msaIn.GetChar(uIn, uCol));
+        }
+    }
+
+// Build msaOut from uColCount consecutive columns of msaIn, beginning
+// at column uFromColIndex. Sequence names and ids are copied with the
+// characters. Quits if the requested range runs past the last column.
+void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
+  MSA &msaOut)
+    {
+    const unsigned uSeqCount = msaIn.GetSeqCount();
+    const unsigned uInColCount = msaIn.GetColCount();
+
+// The last column read is uFromColIndex + uColCount - 1, which must be
+// a valid index, i.e. strictly less than uInColCount. The previous
+// test used '>' and so accepted a range one column too long.
+    if (uFromColIndex + uColCount - 1 >= uInColCount)
+        Quit("MSAFromColRange, out of bounds");
+
+    msaOut.SetSize(uSeqCount, uColCount);
+
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        const char *ptrName = msaIn.GetSeqName(uSeqIndex);
+        unsigned uId = msaIn.GetSeqId(uSeqIndex);
+        msaOut.SetSeqName(uSeqIndex, ptrName);
+        msaOut.SetSeqId(uSeqIndex, uId);
+
+        for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+            {
+            const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);
+            msaOut.SetChar(uSeqIndex, uColIndex, c);
+            }
+        }
+    }
+
+// Replace the contents of v with the rows of msa, each with its gaps
+// stripped and its name copied. Rows that become empty after
+// stripping are still appended.
+void SeqVectFromMSA(const MSA &msa, SeqVect &v)
+    {
+    v.Clear();
+    const unsigned uRows = msa.GetSeqCount();
+    unsigned uRow = 0;
+    while (uRow < uRows)
+        {
+        Seq seqRow;
+        msa.GetSeq(uRow, seqRow);
+        seqRow.StripGaps();
+        seqRow.SetName(msa.GetSeqName(uRow));
+        v.AppendSeq(seqRow);
+        ++uRow;
+        }
+    }
+
+// Remove every column for which IsGapColumn() is true.
+// The index only advances when the current column is kept, because
+// DeleteCol() shifts the following columns down; the column count is
+// re-read on every pass for the same reason.
+void DeleteGappedCols(MSA &msa)
+    {
+    unsigned uCol = 0;
+    while (uCol < msa.GetColCount())
+        {
+        if (!msa.IsGapColumn(uCol))
+            {
+            ++uCol;
+            continue;
+            }
+        msa.DeleteCol(uCol);
+        }
+    }
+
+// Build msaOut from the rows of msaIn listed in uSeqIndexes[0..uSeqCount-1],
+// in that order. Names, ids and all columns are copied.
+void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
+  MSA &msaOut)
+    {
+    const unsigned uCols = msaIn.GetColCount();
+    msaOut.SetSize(uSeqCount, uCols);
+    for (unsigned uOut = 0; uOut < uSeqCount; ++uOut)
+        {
+        const unsigned uIn = uSeqIndexes[uOut];
+        const char *ptrName = msaIn.GetSeqName(uIn);
+        const unsigned uId = msaIn.GetSeqId(uIn);
+        msaOut.SetSeqName(uOut, ptrName);
+        msaOut.SetSeqId(uOut, uId);
+        for (unsigned uCol = 0; uCol < uCols; ++uCol)
+            msaOut.SetChar(uOut, uCol, msaIn.GetChar(uIn, uCol));
+        }
+    }
+
+// Quit unless msa1 and msa2 hold the same sequences, compared with
+// Seq::EqIgnoreCaseAndGaps(). Rows are matched by sequence id, so row
+// order may differ. On a mismatch both sequences are logged first.
+void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)
+    {
+    const unsigned uRows = msa1.GetSeqCount();
+    if (msa2.GetSeqCount() != uRows)
+        Quit("Seq count differs");
+
+    for (unsigned uRow1 = 0; uRow1 < uRows; ++uRow1)
+        {
+        Seq seqIn;
+        msa1.GetSeq(uRow1, seqIn);
+
+    // Find the row of msa2 that carries the same id.
+        const unsigned uRow2 = msa2.GetSeqIndex(msa1.GetSeqId(uRow1));
+
+        Seq seqOut;
+        msa2.GetSeq(uRow2, seqOut);
+
+        if (seqIn.EqIgnoreCaseAndGaps(seqOut))
+            continue;
+
+        Log("Input:\n");
+        seqIn.LogMe();
+        Log("Output:\n");
+        seqOut.LogMe();
+        Quit("Seq %s differ ", msa1.GetSeqName(uRow1));
+        }
+    }
+
+// Quit unless msa1 and msa2 hold the same sequences, compared with
+// Seq::Eq(). Rows are matched by sequence id, so row order may
+// differ. On a mismatch both sequences are logged first.
+void AssertMSAEq(const MSA &msa1, const MSA &msa2)
+    {
+    const unsigned uRows = msa1.GetSeqCount();
+    if (msa2.GetSeqCount() != uRows)
+        Quit("Seq count differs");
+
+    for (unsigned uRow1 = 0; uRow1 < uRows; ++uRow1)
+        {
+        Seq seqIn;
+        msa1.GetSeq(uRow1, seqIn);
+
+    // Find the row of msa2 that carries the same id.
+        const unsigned uRow2 = msa2.GetSeqIndex(msa1.GetSeqId(uRow1));
+
+        Seq seqOut;
+        msa2.GetSeq(uRow2, seqOut);
+
+        if (seqIn.Eq(seqOut))
+            continue;
+
+        Log("Input:\n");
+        seqIn.LogMe();
+        Log("Output:\n");
+        seqOut.LogMe();
+        Quit("Seq %s differ ", msa1.GetSeqName(uRow1));
+        }
+    }
+
+// Assign sequence weights to msa using the scheme currently returned
+// by GetSeqWeightMethod(). Quits on an unrecognized scheme.
+void SetMSAWeightsMuscle(MSA &msa)
+    {
+    const SEQWEIGHT WeightMethod = GetSeqWeightMethod();
+    switch (WeightMethod)
+        {
+    case SEQWEIGHT_None:
+        msa.SetUniformWeights();
+        break;
+
+    case SEQWEIGHT_Henikoff:
+        msa.SetHenikoffWeights();
+        break;
+
+    case SEQWEIGHT_HenikoffPB:
+        msa.SetHenikoffWeightsPB();
+        break;
+
+    case SEQWEIGHT_GSC:
+        msa.SetGSCWeights();
+        break;
+
+    case SEQWEIGHT_ClustalW:
+        SetClustalWWeightsMuscle(msa);
+        break;
+
+    case SEQWEIGHT_ThreeWay:
+        SetThreeWayWeightsMuscle(msa);
+        break;
+
+    default:
+        Quit("SetMSAWeightsMuscle, Invalid method=%d", WeightMethod);
+        }
+    }
+
+// Weights indexed by sequence id, allocated and filled by
+// SetMuscleTree() when the ClustalW weighting scheme is selected.
+// Null until then.
+static WEIGHT *g_MuscleWeights;
+// Number of entries in g_MuscleWeights (the tree's leaf count).
+static unsigned g_uMuscleIdCount;
+
+// Return the cached weight for sequence id uId.
+// Quits if the cache has not been built or uId is out of range.
+WEIGHT GetMuscleSeqWeightById(unsigned uId)
+    {
+    if (!g_MuscleWeights)
+        Quit("g_MuscleWeights = 0");
+
+    const bool bInRange = (uId < g_uMuscleIdCount);
+    if (!bInRange)
+        Quit("GetMuscleSeqWeightById(%u): count=%u",
+          uId, g_uMuscleIdCount);
+
+    return g_MuscleWeights[uId];
+    }
+
+// Remember tree as the current guide tree (the pointer is kept, so
+// the tree must outlive its use). If the ClustalW weighting scheme is
+// active, also rebuild the per-id weight cache from the tree.
+void SetMuscleTree(const Tree &tree)
+    {
+    g_ptrMuscleTree = &tree;
+
+    const bool bClustalW = (SEQWEIGHT_ClustalW == GetSeqWeightMethod());
+    if (bClustalW)
+        {
+    // Drop any previous cache before allocating the new one.
+        delete[] g_MuscleWeights;
+
+        const unsigned uLeaves = tree.GetLeafCount();
+        g_uMuscleIdCount = uLeaves;
+        g_MuscleWeights = new WEIGHT[uLeaves];
+        CalcClustalWWeights(tree, g_MuscleWeights);
+        }
+    }
+
+// Copy the cached ClustalW weights into msa, looked up by sequence id,
+// then normalize the weights of msa to a total of 1.
+// Quits if the cache is missing or an id is out of range.
+void SetClustalWWeightsMuscle(MSA &msa)
+    {
+    if (!g_MuscleWeights)
+        Quit("g_MuscleWeights = 0");
+
+    const unsigned uRows = msa.GetSeqCount();
+    for (unsigned uRow = 0; uRow < uRows; ++uRow)
+        {
+        const unsigned uSeqId = msa.GetSeqId(uRow);
+        if (g_uMuscleIdCount <= uSeqId)
+            Quit("SetClustalWWeightsMuscle: id out of range");
+        const WEIGHT wSeq = g_MuscleWeights[uSeqId];
+        msa.SetSeqWeight(uRow, wSeq);
+        }
+    msa.NormalizeWeights((WEIGHT) 1.0);
+    }
+
+#define LOCAL_VERBOSE 0
+
+// Assign three-way weights to msa, computed from the current guide
+// tree and the split edge (g_uTreeSplitNode1, g_uTreeSplitNode2), then
+// normalize them to a total of 1. If no split edge has been set, falls
+// back to Henikoff PB weights. Quits if an id in msa is out of range
+// or if a split edge is set but no guide tree has been registered.
+void SetThreeWayWeightsMuscle(MSA &msa)
+    {
+    if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)
+        {
+        msa.SetHenikoffWeightsPB();
+        return;
+        }
+
+// The tree pointer is only set by SetMuscleTree(); fail with a clear
+// message instead of dereferencing null.
+    if (0 == g_ptrMuscleTree)
+        Quit("SetThreeWayWeightsMuscle: g_ptrMuscleTree = 0");
+
+    const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
+    WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];
+
+    CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
+      Weights);
+
+// Weights[] is indexed by sequence id, not by row of msa.
+    const unsigned uMSASeqCount = msa.GetSeqCount();
+    for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)
+        {
+        const unsigned uId = msa.GetSeqId(uSeqIndex);
+        if (uId >= uMuscleSeqCount)
+            Quit("SetThreeWayWeightsMuscle: id out of range");
+        msa.SetSeqWeight(uSeqIndex, Weights[uId]);
+        }
+#if LOCAL_VERBOSE
+    {
+    Log("SetThreeWayWeightsMuscle\n");
+    for (unsigned n = 0; n < uMSASeqCount; ++n)
+        {
+        const unsigned uId = msa.GetSeqId(n);
+        Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
+        }
+    }
+#endif
+    msa.NormalizeWeights((WEIGHT) 1.0);
+
+    delete[] Weights;
+    }
+
+// Append msa2 at the end of msa1
+// Append the columns of msa2 to the right of msa1, in place.
+// Rows are matched by sequence id, so msa2 may list the sequences in
+// a different order. msa1 is not resized explicitly here.
+// NOTE(review): presumably MSA::SetChar() grows the row when writing
+// past the current last column -- confirm.
+void MSAAppend(MSA &msa1, const MSA &msa2)
+    {
+    const unsigned uSeqCount = msa1.GetSeqCount();
+
+    const unsigned uColCount1 = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        unsigned uId = msa1.GetSeqId(uSeqIndex);
+        unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
+        for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
+            {
+            const char c = msa2.GetChar(uSeqIndex2, uColIndex);
+            msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
+            }
+        }
+    }
+
+// "Catenate" two MSAs (by bad analogy with UNIX cat command).
+// msa1 and msa2 must have same sequence names, but possibly
+// in a different order.
+// msaCat is the combined alignment produced by appending
+// sequences in msa2 to sequences in msa1.
+// Fill msaCat with each row of msa1 followed by the row of msa2 that
+// has the same sequence name. A row with no match in msa2 is padded
+// with '-' over the msa2 columns. Only names are set on msaCat.
+void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)
+    {
+    const unsigned uRows = msa1.GetSeqCount();
+    const unsigned uCols1 = msa1.GetColCount();
+    const unsigned uCols2 = msa2.GetColCount();
+
+    msaCat.SetSize(uRows, uCols1 + uCols2);
+
+    for (unsigned uRow = 0; uRow < uRows; ++uRow)
+        {
+    // Left part: the row of msa1, unchanged.
+        for (unsigned uCol = 0; uCol < uCols1; ++uCol)
+            msaCat.SetChar(uRow, uCol, msa1.GetChar(uRow, uCol));
+
+        const char *ptrName = msa1.GetSeqName(uRow);
+        msaCat.SetSeqName(uRow, ptrName);
+
+    // Right part: the same-named row of msa2, or gaps if absent.
+        unsigned uRow2;
+        const bool bMatched = msa2.GetSeqIndex(ptrName, &uRow2);
+        for (unsigned uCol = 0; uCol < uCols2; ++uCol)
+            {
+            const char c = bMatched ? msa2.GetChar(uRow2, uCol) : '-';
+            msaCat.SetChar(uRow, uCols1 + uCol, c);
+            }
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/msadist.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msadist.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msadist.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,34 @@
+#ifndef MSADist_h
+#define MSADist_h
+
+#include <math.h>
+
+// Pairwise distance between two rows of an alignment, derived from
+// their percent identity using the measure chosen at construction.
+class MSADist
+    {
+public:
+    MSADist(DISTANCE Distance) : m_Distance(Distance)
+        {
+        }
+
+// Return the distance between rows uSeqIndex1 and uSeqIndex2 of msa.
+// Quits if the stored measure is not one handled here.
+    double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2)
+        {
+        const double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2);
+
+        if (DISTANCE_PctIdKimura == m_Distance)
+            return KimuraDist(dPctId);
+
+        if (DISTANCE_PctIdLog == m_Distance)
+            {
+        // Clamp so the logarithm stays bounded for very low identity.
+            const double dClamped = (dPctId < 0.05) ? 0.05 : dPctId;
+            return -log(dClamped);
+            }
+
+        Quit("MSADist::ComputeDist, invalid DISTANCE_%u", m_Distance);
+        return 0;
+        }
+
+private:
+    DISTANCE m_Distance;
+    };
+
+#endif // MSADist_h
Added: trunk/packages/muscle/branches/upstream/current/msadistkimura.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msadistkimura.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msadistkimura.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,88 @@
+#include "muscle.h"
+#include "msa.h"
+#include <math.h>
+
+// "Standard" NJ distance: the Kimura measure.
+// This is defined to be:
+//
+// -log_e(1 - p - p*p/5)
+//
+// where p is the fraction of residues that differ, i.e.:
+//
+// p = (1 - fractional_conservation)
+//
+// This measure is infinite for p = 0.8541 and is considered
+// unreliable for p >= 0.75 (according to the ClustalW docs).
+// ClustalW uses a table lookup for values > 0.75.
+// The following table was copied from the ClustalW file dayhoff.h.
+
+// Lookup table used by KimuraDist() for p in [0.75, 0.93]: entry i is
+// the estimated distance in PAMs for an observed difference of
+// (75.0 + i/10) percent. KimuraDist() divides the entry by 100.
+static int dayhoff_pams[]={
+ 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
+ 196, /* 75.1% observed d; 196 PAMs estimated */
+ 197, 198, 199, 200, 200, 201, 202, 203,
+ 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
+ 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
+ 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
+ 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
+ 252, 253, 254, 255, 257, 258,
+ 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
+ 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
+ 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
+ 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
+ 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
+ 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
+ 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
+ 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
+ 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
+ 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
+ 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
+ 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
+ /* 92.9% observed; 945 PAMs */
+ 988 /* 93.0% observed; 988 PAMs */
+};
+// Number of entries in dayhoff_pams[], used to bounds-check the index.
+static int iTableEntries = sizeof(dayhoff_pams)/sizeof(dayhoff_pams[0]);
+
+// Convert fractional identity dPctId into a Kimura distance.
+//   p = 1 - dPctId is the fraction of differing residues.
+//   p <  0.75          : Kimura's formula -ln(1 - p - p*p/5)
+//   0.75 <= p <= 0.93  : table lookup in dayhoff_pams[], divided by 100
+//   p >  0.93          : fixed value 10.0 (per ClustalW)
+double KimuraDist(double dPctId)
+    {
+    const double dDiff = 1 - dPctId;
+
+// Typical case: Kimura's empirical formula.
+    if (dDiff < 0.75)
+        return -log(1 - dDiff - (dDiff*dDiff)/5);
+
+// Per ClustalW, anything over 93% difference is reported as 10.0.
+    if (dDiff > 0.93)
+        return 10.0;
+
+// Remaining range is covered by the table, one entry per 0.1%.
+    assert(dDiff <= 1 && dDiff >= 0.75);
+// Thanks to Michael Hoel for pointing out a bug
+// in the table index calculation in versions <= 3.52.
+    const int iEntry = (int) ((dDiff - 0.75)*1000 + 0.5);
+    const bool bEntryOk = (iEntry >= 0 && iEntry < iTableEntries);
+    if (!bEntryOk)
+        Quit("Internal error in MSADistKimura::ComputeDist");
+
+    return dayhoff_pams[iEntry] / 100.0;
+    }
+
+//double MSADistKimura::ComputeDist(const MSA &msa, unsigned uSeqIndex1,
+// unsigned uSeqIndex2)
+// {
+// double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2);
+// return KimuraDist(dPctId);
+// }
+
+// Invert Kimura's formula: given d = -ln(1 - p - p*p/5), recover p and
+// return the fractional identity 1 - p. Rearranged, p is the positive
+// root of 0.2*p*p + p - (1 - exp(-d)) = 0. The table-lookup range of
+// KimuraDist() is not inverted here.
+double KimuraDistToPctId(double dKimuraDist)
+    {
+    const double dQuadA = 0.2;
+    const double dQuadB = 1;
+    const double dQuadC = 1.0 - exp(-dKimuraDist);
+    const double dDiscriminant = dQuadB*dQuadB + 4*dQuadA*dQuadC;
+    const double dDiff = (-dQuadB + sqrt(dDiscriminant))/(2*dQuadA);
+    return 1 - dDiff;
+    }
+
+// Height for fractional identity dPctId; simply the Kimura distance
+// returned by KimuraDist().
+double PctIdToHeightKimura(double dPctId)
+    {
+    const double dHeight = KimuraDist(dPctId);
+    return dHeight;
+    }
Added: trunk/packages/muscle/branches/upstream/current/msf.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/msf.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/msf.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,121 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <ctype.h>
+#include "msa.h"
+#include "textfile.h"
+
+// Longest sequence name written to MSF output; also sizes the static
+// buffer in GetPaddedName() (MAX_NAME + 1 bytes).
+const int MAX_NAME = 63;
+
+// MSF body layout: alignment columns per output line, and the column
+// interval at which ToMSFFile() inserts a separating blank.
+const unsigned uCharsPerLine = 50;
+const unsigned uCharsPerBlock = 10;
+
+// Truncate at first white space or MAX_NAME, whichever comes
+// first, then pad with blanks up to PadLength.
+// Return Name truncated at its first blank or tab (or at MAX_NAME
+// characters, whichever comes first), padded with blanks and cut to
+// exactly PadLength characters. PadLength is clamped to [0, MAX_NAME].
+// The result lives in a static buffer that the next call overwrites.
+static const char *GetPaddedName(const char *Name, int PadLength)
+    {
+    static char PaddedName[MAX_NAME+1];
+
+// Keep the terminator inside the buffer whatever the caller passes.
+    if (PadLength < 0)
+        PadLength = 0;
+    if (PadLength > MAX_NAME)
+        PadLength = MAX_NAME;
+
+    memset(PaddedName, ' ', MAX_NAME);
+    size_t n = strcspn(Name, " \t");
+// Without this clamp a name longer than MAX_NAME with no white space
+// would be copied past the end of PaddedName.
+    if (n > (size_t) MAX_NAME)
+        n = (size_t) MAX_NAME;
+    memcpy(PaddedName, Name, n);
+    PaddedName[PadLength] = 0;
+    return PaddedName;
+    }
+
+// Skip the leading run of characters of s that are not in t.
+// Returns null when that run is empty (s is empty or starts with a
+// character from t); otherwise returns the position just after the
+// run, which is the terminating null of s when no character of t
+// occurs in s.
+// NOTE(review): no caller is visible in this file and the null-at-
+// position-0 result looks unintended -- confirm before relying on it.
+static const char *strfind(const char *s, const char *t)
+    {
+    const size_t uSkip = strcspn(s, t);
+    return (0 != uSkip) ? s + uSkip : 0;
+    }
+
+// GCG checksum code kindly provided by Eric Martel.
+// GCG-style checksum of row uSeqIndex: each character code is
+// multiplied by a position factor cycling through 1..57, and the
+// running total is kept modulo 10000.
+unsigned MSA::GetGCGCheckSum(unsigned uSeqIndex) const
+    {
+    const unsigned uCols = GetColCount();
+    unsigned uSum = 0;
+    for (unsigned uCol = 0; uCol < uCols; ++uCol)
+        {
+        const unsigned uCode = (unsigned) GetChar(uSeqIndex, uCol);
+        const unsigned uFactor = uCol%57 + 1;
+        uSum = (uSum + uCode*uFactor) % 10000;
+        }
+    return uSum;
+    }
+
+// Rewrite every gap position of a as '.', the gap character written
+// to MSF files by ToMSFFile().
+static void MSFFixGaps(MSA &a)
+    {
+    const int Rows = a.GetSeqCount();
+    const int Cols = a.GetColCount();
+    for (int Row = 0; Row < Rows; ++Row)
+        for (int Col = 0; Col < Cols; ++Col)
+            {
+            if (!a.IsGap(Row, Col))
+                continue;
+            a.SetChar(Row, Col, '.');
+            }
+    }
+
+// Write this alignment to File in MSF (PileUp) format.
+// ptrComment, if non-null, is written as a "Comment:" line.
+// Despite being const, this method modifies the object: sequence
+// weights are recomputed and every gap character is rewritten as '.'.
+void MSA::ToMSFFile(TextFile &File, const char *ptrComment) const
+ {
+// Cast away const, yuck
+ SetMSAWeightsMuscle((MSA &) *this);
+ MSFFixGaps((MSA &) *this);
+
+ File.PutString("PileUp\n");
+
+ if (0 != ptrComment)
+ File.PutFormat("Comment: %s\n", ptrComment);
+ else
+ File.PutString("\n");
+
+// 'N' for nucleotide alphabets, 'A' otherwise. The overall checksum
+// is written as a fixed 0000.
+ char seqtype = (g_Alpha == ALPHA_DNA || g_Alpha == ALPHA_RNA) ? 'N' : 'A';
+ File.PutFormat(" MSF: %u Type: %c Check: 0000 ..\n\n",
+ GetColCount(), seqtype);
+
+// First pass: find the longest name (after truncation at white space
+// or MAX_NAME) so that all names can be padded to the same width.
+ int iLongestNameLength = 0;
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+ {
+ const char *Name = GetSeqName(uSeqIndex);
+ const char *PaddedName = GetPaddedName(Name, MAX_NAME);
+ int iLength = (int) strcspn(PaddedName, " \t");
+ if (iLength > iLongestNameLength)
+ iLongestNameLength = iLength;
+ }
+
+// Header: one "Name:" line per sequence with length, checksum, weight.
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+ {
+ const char *Name = GetSeqName(uSeqIndex);
+ const char *PaddedName = GetPaddedName(Name, iLongestNameLength);
+ File.PutFormat(" Name: %s", PaddedName);
+ File.PutFormat(" Len: %u Check: %5u Weight: %g\n",
+ GetColCount(), GetGCGCheckSum(uSeqIndex), GetSeqWeight(uSeqIndex));
+ }
+ File.PutString("\n//\n");
+// Nothing to write after the header for an alignment with no columns
+// (also avoids the unsigned underflow in GetColCount() - 1 below).
+ if (0 == GetColCount())
+ return;
+
+// Body: the alignment in groups of uCharsPerLine columns, one line
+// per sequence within each group.
+ unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
+ for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
+ {
+ File.PutString("\n");
+ unsigned uStartColIndex = uLineIndex*uCharsPerLine;
+ unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
+// The last group may be shorter than uCharsPerLine.
+ if (uEndColIndex >= GetColCount())
+ uEndColIndex = GetColCount() - 1;
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
+ {
+ const char *Name = GetSeqName(uSeqIndex);
+ const char *PaddedName = GetPaddedName(Name, iLongestNameLength);
+ File.PutFormat("%s ", PaddedName);
+ for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
+ ++uColIndex)
+ {
+// Separator before every block of uCharsPerBlock columns.
+ if (0 == uColIndex%uCharsPerBlock)
+ File.PutString(" ");
+ char c = GetChar(uSeqIndex, uColIndex);
+ File.PutFormat("%c", c);
+ }
+ File.PutString("\n");
+ }
+ }
+ }
Added: trunk/packages/muscle/branches/upstream/current/muscle.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/muscle.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/muscle.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,130 @@
+#include "muscle.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+
+// Top-level driver: align the sequences in v and store the result in
+// msaOut. Steps, as visible here:
+//   1. choose the alphabet from g_SeqType (guessing it for Auto);
+//   2. iteration 1: build a guide tree and align progressively;
+//   3. iteration 2: refine the tree (skipped when g_uMaxIters is 1 or
+//      there are only two sequences);
+//   4. remaining g_uMaxIters - 2 iterations: RefineVert or RefineHoriz.
+// Quits if v is empty or g_SeqType is not recognized.
+void MUSCLE(SeqVect &v, MSA &msaOut)
+    {
+    const unsigned uSeqCount = v.Length();
+
+    if (0 == uSeqCount)
+        Quit("No sequences in input file");
+
+    ALPHA Alpha = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alpha = v.GuessAlpha();
+        break;
+
+    case SEQTYPE_Protein:
+        Alpha = ALPHA_Amino;
+        break;
+
+    case SEQTYPE_RNA:
+        Alpha = ALPHA_RNA;
+        break;
+
+    case SEQTYPE_DNA:
+        Alpha = ALPHA_DNA;
+        break;
+
+    default:
+        Quit("Invalid seq type");
+        }
+    SetAlpha(Alpha);
+    v.FixAlpha();
+
+// Nucleotide input overrides the profile score and the first-pass
+// distance measure.
+    if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
+        {
+        SetPPScore(PPSCORE_SPN);
+        g_Distance1 = DISTANCE_Kmer4_6;
+        }
+
+// Longest and total sequence length, for the statistics below.
+    unsigned uMaxL = 0;
+    unsigned uTotL = 0;
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        unsigned L = v.GetSeq(uSeqIndex).Length();
+        uTotL += L;
+        if (L > uMaxL)
+            uMaxL = L;
+        }
+
+    SetIter(1);
+    g_bDiags = g_bDiags1;
+    SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);
+
+    MSA::SetIdCount(uSeqCount);
+
+//// Initialize sequence ids.
+//// From this point on, ids must somehow propogate from here.
+//  for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+//      v.SetSeqId(uSeqIndex, uSeqIndex);
+
+    if (uSeqCount > 1)
+        MHackStart(v);
+
+// Presumably unreachable: the empty case already called Quit() above,
+// which is expected not to return. Kept as in the original.
+    if (0 == uSeqCount)
+        {
+        msaOut.Clear();
+        return;
+        }
+
+    if (1 == uSeqCount && ALPHA_Amino == Alpha)
+        {
+        const Seq &s = v.GetSeq(0);
+        msaOut.FromSeq(s);
+        return;
+        }
+
+// First iteration
+    Tree GuideTree;
+    TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);
+
+    SetMuscleTree(GuideTree);
+
+// Only ProgressiveAlignE returns a node array; on the other path
+// ProgNodes stays null.
+    ProgNode *ProgNodes = 0;
+    if (g_bLow)
+        ProgNodes = ProgressiveAlignE(v, GuideTree, msaOut);
+    else
+        ProgressiveAlign(v, GuideTree, msaOut);
+    SetCurrentAlignment(msaOut);
+
+// NOTE(review): when g_bLow is set, ProgNodes is not released on this
+// early return -- looks like a leak; confirm before adding cleanup.
+    if (1 == g_uMaxIters || 2 == uSeqCount)
+        {
+        MHackEnd(msaOut);
+        return;
+        }
+
+    g_bDiags = g_bDiags2;
+    SetIter(2);
+
+    if (g_bLow)
+        {
+        if (0 != g_uMaxTreeRefineIters)
+            RefineTreeE(msaOut, v, GuideTree, ProgNodes);
+        }
+    else
+        RefineTree(msaOut, GuideTree);
+
+// Release the progressive-alignment nodes. Guarded because ProgNodes
+// is null when g_bLow is false; the unguarded loop indexed a null
+// pointer in that case.
+    if (0 != ProgNodes)
+        {
+        extern void DeleteProgNode(ProgNode &Node);
+        const unsigned uNodeCount = GuideTree.GetNodeCount();
+        for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+            DeleteProgNode(ProgNodes[uNodeIndex]);
+
+        delete[] ProgNodes;
+        ProgNodes = 0;
+        }
+
+// Switch to the second-stage weighting scheme and re-register the
+// (possibly refined) tree before refining the alignment itself.
+    SetSeqWeightMethod(g_SeqWeight2);
+    SetMuscleTree(GuideTree);
+
+    if (g_bAnchors)
+        RefineVert(msaOut, GuideTree, g_uMaxIters - 2);
+    else
+        RefineHoriz(msaOut, GuideTree, g_uMaxIters - 2, false, false);
+
+    MHackEnd(msaOut);
+    }
Added: trunk/packages/muscle/branches/upstream/current/muscle.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/muscle.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/muscle.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,328 @@
+#if DEBUG && !_DEBUG
+#define _DEBUG 1
+#endif
+
+#if _DEBUG && !DEBUG
+#define DEBUG 1
+#endif
+
+#if _MSC_VER
+#define TIMING 0
+#endif
+
+#define VER_3_52 0
+
+#ifdef _MSC_VER // Microsoft compiler
+#pragma warning(disable : 4800) // disable int-bool conversion warning
+#endif
+
+#define MUSCLE_LONG_VERSION "MUSCLE v3.6 by Robert C. Edgar"
+#define MUSCLE_MAJOR_VERSION "3"
+#define MUSCLE_MINOR_VERSION "6"
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define DOUBLE_AFFINE 0
+#define SINGLE_AFFINE 1
+#define PAF 0
+#define HYDRO 1
+
+#include "types.h"
+#include "intmath.h"
+#include "alpha.h"
+#include "params.h"
+
+#ifndef _WIN32
+#define stricmp strcasecmp
+#define strnicmp strncasecmp
+#define _snprintf snprintf
+#define _fsopen(name, mode, share) fopen((name), (mode))
+#endif
+
+#if DEBUG
+#undef assert
+#define assert(b) Call_MY_ASSERT(__FILE__, __LINE__, b, #b)
+void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg);
+#else
+#define assert(exp) ((void)0)
+#endif
+
+extern int g_argc;
+extern char **g_argv;
+
+#define Rotate(a, b, c) { SCORE *tmp = a; a = b; b = c; c = tmp; }
+
+const double VERY_LARGE_DOUBLE = 1e20;
+
+extern unsigned g_uTreeSplitNode1;
+extern unsigned g_uTreeSplitNode2;
+
+// Number of elements in array a[]
+#define countof(a) (sizeof(a)/sizeof(a[0]))
+
+// Maximum of two of any type
+#define Max2(a, b) ((a) > (b) ? (a) : (b))
+
+// Maximum of three of any type
+#define Max3(a, b, c) Max2(Max2(a, b), c)
+
+// Minimum of two of any type
+#define Min2(a, b) ((a) < (b) ? (a) : (b))
+
+// Maximum of four of any type
+#define Max4(a, b, c, d) Max2(Max2(a, b), Max2(c, d))
+
+const double VERY_NEGATIVE_DOUBLE = -9e29;
+const float VERY_NEGATIVE_FLOAT = (float) -9e29;
+
+const double BLOSUM_DIST = 0.62; // todo settable
+
+// insane value for uninitialized variables
+// (one sentinel per basic type used in the code base)
+const unsigned uInsane = 8888888;
+const int iInsane = 8888888;
+const SCORE scoreInsane = 8888888;
+const char cInsane = (char) 0xcd; // int 3 instruction, used e.g. for uninit. memory
+const double dInsane = VERY_NEGATIVE_DOUBLE;
+const float fInsane = VERY_NEGATIVE_FLOAT;
+const char INVALID_STATE = '*';
+const BASETYPE BTInsane = (BASETYPE) dInsane;
+const WEIGHT wInsane = BTInsane;
+
+extern double g_dNAN;
+
+extern unsigned long g_tStart;
+
+void Quit(const char szFormat[], ...);
+void Warning(const char szFormat[], ...);
+void TrimBlanks(char szStr[]);
+void TrimLeadingBlanks(char szStr[]);
+void TrimTrailingBlanks(char szStr[]);
+void Log(const char szFormat[], ...);
+bool Verbose();
+const char *ScoreToStr(SCORE Score);
+const char *ScoreToStrL(SCORE Score);
+SCORE StrToScore(const char *pszStr);
+void Break();
+
+double VecSum(const double v[], unsigned n);
+bool IsValidInteger(const char *Str);
+bool IsValidSignedInteger(const char *Str);
+bool IsValidIdentifier(const char *Str);
+bool IsValidFloatChar(char c);
+bool isident(char c);
+bool isidentf(char c);
+
+void TreeFromSeqVect(const SeqVect &c, Tree &tree, CLUSTER Cluster,
+ DISTANCE Distance, ROOT Root);
+void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster,
+ DISTANCE Distance, ROOT Root);
+
+void StripGaps(char szStr[]);
+void StripWhitespace(char szStr[]);
+const char *GetTimeAsStr();
+unsigned CalcBLOSUMWeights(MSA &Aln, ClusterTree &BlosumCluster);
+void CalcGSCWeights(MSA &Aln, const ClusterTree &BlosumCluster);
+void AssertNormalized(const PROB p[]);
+void AssertNormalizedOrZero(const PROB p[]);
+void AssertNormalized(const double p[]);
+bool VectorIsZero(const double dValues[], unsigned n);
+void VectorSet(double dValues[], unsigned n, double d);
+bool VectorIsZero(const float dValues[], unsigned n);
+void VectorSet(float dValues[], unsigned n, float d);
+
+#if _WIN32
+double log2(double x); // Defined in <math.h> on Linux
+#endif
+
+double pow2(double x);
+double lnTolog2(double ln);
+
+double lp2(double x);
+SCORE SumLog(SCORE x, SCORE y);
+SCORE SumLog(SCORE x, SCORE y, SCORE z);
+SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z);
+
+double lp2Fast(double x);
+double SumLogFast(double x, double y);
+double SumLogFast(double x, double y, double z);
+double SumLogFast(double w, double x, double y, double z);
+
+void chkmem(const char szMsg[] = "");
+
+void Normalize(PROB p[], unsigned n);
+void Normalize(PROB p[], unsigned n, double dRequiredTotal);
+void NormalizeUnlessZero(PROB p[], unsigned n);
+
+void DebugPrintf(const char szFormat[], ...);
+void SetListFileName(const char *ptrListFileName, bool bAppend);
+void ModelFromAlign(const char *strInputFileName, const char *strModelFileName,
+ double dMaxNIC);
+double GetMemUseMB();
+double GetRAMSizeMB();
+double GetPeakMemUseMB();
+void CheckMemUse();
+const char *ElapsedTimeAsString();
+char *SecsToHHMMSS(long lSecs, char szStr[]);
+double GetCPUGHz();
+SCORE GetBlosum62(unsigned uLetterA, unsigned uLetterB);
+SCORE GetBlosum62d(unsigned uLetterA, unsigned uLetterB);
+SCORE GetBlosum50(unsigned uLetterA, unsigned uLetterB);
+void AssertNormalizedDist(const PROB p[], unsigned N);
+void CmdLineError(const char *Format, ...);
+void Fatal(const char *Format, ...);
+void InitCmd();
+void ExecCommandLine(int argc, char *argv[]);
+void DoCmd();
+void SetLogFile();
+void NameFromPath(const char szPath[], char szName[], unsigned uBytes);
+char *strsave(const char *s);
+void DistKmer20_3(const SeqVect &v, DistFunc &DF);
+void DistKbit20_3(const SeqVect &v, DistFunc &DF);
+void DistKmer6_6(const SeqVect &v, DistFunc &DF);
+void DistKmer4_6(const SeqVect &v, DistFunc &DF);
+void DistPWKimura(const SeqVect &v, DistFunc &DF);
+void FastDistKmer(const SeqVect &v, DistFunc &DF);
+void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF);
+double PctIdToMAFFTDist(double dPctId);
+double KimuraDist(double dPctId);
+void SetFastParams();
+void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB);
+void ValidateMuscleIds(const MSA &msa);
+void ValidateMuscleIds(const Tree &tree);
+void TraceBackToPath(int **TraceBack, unsigned uLengthA,
+ unsigned uLengthB, PWPath &Path);
+void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB,
+ char LastEdge, PWPath &Path);
+SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
+ bool bLockLeft = false, bool bLockRight = false);
+SCORE AlignTwoProfs(
+ const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
+ const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
+ PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut);
+void AlignTwoProfsGivenPath(const PWPath &Path,
+ const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
+ const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
+ ProfPos **ptrPOut, unsigned *ptruLengthOut);
+void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
+ MSA &msaCombined);
+void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB,
+ MSA &msaCombined);
+SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA,
+ const ProfPos *PB, unsigned uLengthB, const PWPath &Path);
+SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2,
+ WEIGHT *Weights);
+SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path);
+bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight);
+bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters);
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+
+void SetInputFileName(const char *pstrFileName);
+void SetIter(unsigned uIter);
+void IncIter();
+void SetMaxIters(unsigned uMaxIters);
+void Progress(unsigned uStep, unsigned uTotalSteps);
+void Progress(const char *szFormat, ...);
+void SetStartTime();
+void ProgressStepsDone();
+void SetProgressDesc(const char szDesc[]);
+void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL);
+
+void SetNewHandler();
+void SaveCurrentAlignment();
+void SetCurrentAlignment(MSA &msa);
+void SetOutputFileName(const char *out);
+
+#if DEBUG // Debug builds: the id-checking helpers are declared as functions.
+void SetMuscleSeqVect(SeqVect &v);
+void SetMuscleInputMSA(MSA &msa);
+void ValidateMuscleIds(const MSA &msa);
+void ValidateMuscleIds(const Tree &tree);
+#else // Otherwise every call to these names expands to nothing.
+#define SetMuscleSeqVect(x) /* empty */
+#define SetMuscleInputMSA(x) /* empty */
+#define ValidateMuscleIds(x) /* empty */
+#endif // DEBUG
+
+void ProcessArgVect(int argc, char *argv[]);
+void ProcessArgStr(const char *Str);
+void Usage();
+void SetParams();
+
+void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]);
+unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]);
+FCOUNT SumCounts(const FCOUNT Counts[]);
+
+bool FlagOpt(const char *Name);
+const char *ValueOpt(const char *Name);
+void DoMuscle();
+void ProfDB();
+void DoSP();
+void ProgAlignSubFams();
+void Run();
+void ListParams();
+void OnException();
+void SetSeqWeightMethod(SEQWEIGHT Method);
+SEQWEIGHT GetSeqWeightMethod();
+WEIGHT GetMuscleSeqWeightById(unsigned uId);
+void ListDiagSavings();
+void CheckMaxTime();
+const char *MaxSecsToStr();
+unsigned long GetStartTime();
+
+void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a);
+ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a);
+
+void CalcDistRangeKmer6_6(const MSA &msa, unsigned uRow, float Dist[]);
+void CalcDistRangeKmer20_3(const MSA &msa, unsigned uRow, float Dist[]);
+void CalcDistRangeKmer20_4(const MSA &msa, unsigned uRow, float Dist[]);
+void CalcDistRangePctIdKimura(const MSA &msa, unsigned uRow, float Dist[]);
+void CalcDistRangePctIdLog(const MSA &msa, unsigned uRow, float Dist[]);
+
+void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a);
+void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a);
+
+void Refine();
+void Local();
+void Profile();
+void PPScore();
+void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage);
+
+char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel,
+ bool DeleteGaps = true);
+SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path);
+void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[],
+ unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2);
+void SetPPScore(bool bRespectFlagOpts = true);
+void SetPPScore(PPSCORE p);
+SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+bool MissingCommand();
+void Credits();
+void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut);
+void MHackStart(SeqVect &v);
+void MHackEnd(MSA &msa);
+void WriteScoreFile(const MSA &msa);
+char ConsensusChar(const ProfPos &PP);
+void Stabilize(const MSA &msa, MSA &msaStable);
+void MuscleOutput(MSA &msa);
+PTR_SCOREMATRIX ReadMx(TextFile &File);
Added: trunk/packages/muscle/branches/upstream/current/muscleout.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/muscleout.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/muscleout.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,109 @@
+#include "muscle.h"
+#include "msa.h"
+#include "params.h"
+#include "textfile.h"
+
+// Write the finished alignment to every destination selected on the command
+// line:
+//   1. the -out file (g_pstrOutFileName), in the one format chosen by a flag
+//      option (g_bFASTA, g_bMSF, g_bAln, g_bHTML, g_bPHYI, g_bPHYS);
+//   2. one additional file for each value option that names its own file;
+//   3. the score file, when g_pstrScoreFileName is set.
+//
+// msa is only handed to the MSA::To*File writers and to WriteScoreFile.
+static void DoOutput(MSA &msa)
+	{
+// True when some value option will write the alignment to a file of its own.
+	const bool bAnyValueOutput =
+	  0 != g_pstrFASTAOutFileName ||
+	  0 != g_pstrMSFOutFileName ||
+	  0 != g_pstrClwOutFileName ||
+	  0 != g_pstrClwStrictOutFileName ||
+	  0 != g_pstrHTMLOutFileName ||
+	  0 != g_pstrPHYIOutFileName ||
+	  0 != g_pstrPHYSOutFileName;
+
+// Flag options, at most one used (because only one -out filename)
+	TextFile fileOut(g_pstrOutFileName, true);
+	if (g_bFASTA)
+		msa.ToFASTAFile(fileOut);
+	else if (g_bMSF)
+		msa.ToMSFFile(fileOut);
+	else if (g_bAln)
+		msa.ToAlnFile(fileOut);
+	else if (g_bHTML)
+		msa.ToHTMLFile(fileOut);
+	else if (g_bPHYI)
+		msa.ToPhyInterleavedFile(fileOut);
+	else if (g_bPHYS)
+		msa.ToPhySequentialFile(fileOut);
+	else if (strcmp(g_pstrOutFileName, "-") != 0 || !bAnyValueOutput)
+		{
+	// No format flag was given, so fall back to FASTA. A named -out file
+	// always gets it. The "-" name (presumably standard output; confirm in
+	// TextFile) gets it only when no value option writes the alignment
+	// elsewhere, so that a run never ends without emitting the alignment.
+		msa.ToFASTAFile(fileOut);
+		}
+
+	fileOut.Close();
+
+// Value options
+	if (g_pstrFASTAOutFileName)
+		{
+		TextFile File(g_pstrFASTAOutFileName, true);
+		msa.ToFASTAFile(File);
+		}
+
+	if (g_pstrMSFOutFileName)
+		{
+		TextFile File(g_pstrMSFOutFileName, true);
+		msa.ToMSFFile(File);
+		}
+
+	if (g_pstrClwOutFileName)
+		{
+		TextFile File(g_pstrClwOutFileName, true);
+		msa.ToAlnFile(File);
+		}
+
+	if (g_pstrClwStrictOutFileName)
+		{
+	// Note: the strict flag is a global and stays set after this call.
+		g_bClwStrict = true;
+		TextFile File(g_pstrClwStrictOutFileName, true);
+		msa.ToAlnFile(File);
+		}
+
+	if (g_pstrHTMLOutFileName)
+		{
+		TextFile File(g_pstrHTMLOutFileName, true);
+		msa.ToHTMLFile(File);
+		}
+
+	if (g_pstrPHYIOutFileName)
+		{
+	// Interleaved PHYLIP, matching what the g_bPHYI flag writes above.
+		TextFile File(g_pstrPHYIOutFileName, true);
+		msa.ToPhyInterleavedFile(File);
+		}
+
+	if (g_pstrPHYSOutFileName)
+		{
+		TextFile File(g_pstrPHYSOutFileName, true);
+		msa.ToPhySequentialFile(File);
+		}
+
+	if (0 != g_pstrScoreFileName)
+		WriteScoreFile(msa);
+	}
+
+// Final output step: calls MHackEnd on msa, then passes the alignment to
+// DoOutput. When g_bStable is set, the copy produced by Stabilize is written
+// instead and msa itself is cleared first.
+void MuscleOutput(MSA &msa)
+	{
+	MHackEnd(msa);
+
+	if (!g_bStable)
+		{
+		DoOutput(msa);
+		return;
+		}
+
+	MSA msaStable;
+	Stabilize(msa, msaStable);
+	msa.Clear();	// the input is no longer needed; release it before writing
+	DoOutput(msaStable);
+	}
Added: trunk/packages/muscle/branches/upstream/current/nucmx.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nucmx.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nucmx.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,23 @@
+#include "muscle.h"
+
+// BLASTZ default parameters
+// open 400, extend 30, matrix as below
+
+const float NUC_EXTEND = 30;
+const float NUC_SP_CENTER = 2*NUC_EXTEND;
+
+// v() converts one raw matrix entry to float and shifts it by NUC_SP_CENTER.
+// The argument is parenthesised so that any expression passed in is cast as
+// a whole. ROW() builds one four-entry row (A, C, G, T) of the table.
+#define v(x)	((float) (x) + NUC_SP_CENTER)
+#define ROW(A, C, G, T) \
+	{ v(A), v(C), v(G), v(T) },
+
+// Only the first four rows and columns are listed; the remaining entries of
+// the 32x32 array are zero-initialised.
+float NUC_SP[32][32] =
+	{
+//         A        C        G        T
+ROW(      91,    -114,     -31,    -123) // A
+
+ROW(    -114,     100,    -125,     -31) // C
+
+ROW(     -31,    -125,     100,    -114) // G
+
+ROW(    -123,     -31,    -114,      91) // T
+	};
+
+// The helper macros are only meant for the table above.
+#undef ROW
+#undef v
Added: trunk/packages/muscle/branches/upstream/current/nwdasimple.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nwdasimple.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nwdasimple.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,494 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+bool g_bKeepSimpleDP; // If true, NWDASimple stores its matrices in the pointers below instead of deleting them.
+SCORE *g_DPM; // Kept score matrix, state M.
+SCORE *g_DPD; // Kept score matrix, state D.
+SCORE *g_DPE; // Kept score matrix, state E.
+SCORE *g_DPI; // Kept score matrix, state I.
+SCORE *g_DPJ; // Kept score matrix, state J.
+char *g_TBM; // Kept trace-back matrix, state M.
+char *g_TBD; // Kept trace-back matrix, state D.
+char *g_TBE; // Kept trace-back matrix, state E.
+char *g_TBI; // Kept trace-back matrix, state I.
+char *g_TBJ; // Kept trace-back matrix, state J.
+
+#if DOUBLE_AFFINE
+
+// Translate an internal edge type for the path: 'E' becomes 'D', 'J' becomes
+// 'I', and every other character is returned unchanged.
+static char XlatEdgeType(char c)
+	{
+	switch (c)
+		{
+	case 'E':
+		return 'D';
+	case 'J':
+		return 'I';
+	default:
+		return c;
+		}
+	}
+
+// Format a score for the trace listings with "%6.1f"; scores below -100000
+// are shown as " *" instead.
+//
+// The result points to static storage. A small ring of buffers is used so
+// that several results can be alive at once (the trace code passes two of
+// them to a single Log call), and snprintf bounds the write so that a very
+// large score cannot overrun the buffer.
+static const char *LocalScoreToStr(SCORE s)
+	{
+	enum { BUFFER_COUNT = 4, BUFFER_BYTES = 64 };
+	static char Buffers[BUFFER_COUNT][BUFFER_BYTES];
+	static unsigned uNext = 0;
+
+	if (s < -100000)
+		return " *";
+
+	char *str = Buffers[uNext];
+	uNext = (uNext + 1) % BUFFER_COUNT;
+	snprintf(str, BUFFER_BYTES, "%6.1f", s);
+	return str;
+	}
+
+// Log one trace-back matrix as a table: a header line with the prefix
+// lengths of B, then one line per prefix length of A. Each prefix length is
+// labelled with the consensus character of the profile position it ends on
+// (blank for the empty prefix). Cells are read through the TBM() macro.
+static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+		{
+		const char cLabelB = (0 == uColB) ? ' ' : ConsensusChar(PB[uColB - 1]);
+		Log(" %4u:%c", uColB, cLabelB);
+		}
+	Log("\n");
+
+	for (unsigned uRowA = 0; uRowA < uPrefixCountA; ++uRowA)
+		{
+		const char cLabelA = (0 == uRowA) ? ' ' : ConsensusChar(PA[uRowA - 1]);
+		Log("%4u:%c ", uRowA, cLabelA);
+
+		for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+			Log(" %6c", TBM(uRowA, uColB));
+
+		Log("\n");
+		}
+	}
+
+// Log one score matrix as a table: a header line with the prefix lengths of
+// B, then one line per prefix length of A. Each prefix length is labelled
+// with the consensus character of the profile position it ends on (blank
+// for the empty prefix). Cells are read through the DPM() macro and
+// formatted by LocalScoreToStr.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+		{
+		const char cLabelB = (0 == uColB) ? ' ' : ConsensusChar(PB[uColB - 1]);
+		Log(" %4u:%c", uColB, cLabelB);
+		}
+	Log("\n");
+
+	for (unsigned uRowA = 0; uRowA < uPrefixCountA; ++uRowA)
+		{
+		const char cLabelA = (0 == uRowA) ? ' ' : ConsensusChar(PA[uRowA - 1]);
+		Log("%4u:%c ", uRowA, cLabelA);
+
+		for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+			Log(" %s", LocalScoreToStr(DPM(uRowA, uColB)));
+
+		Log("\n");
+		}
+	}
+
+// Double-affine global alignment of two profiles using full
+// (uLengthA+1) x (uLengthB+1) matrices.
+//
+// Five states are kept per cell: M (letter in A against letter in B), D and
+// E (letter in A against gap in B), I and J (gap in A against letter in B).
+// D and I are scored with m_scoreGapOpen / g_scoreGapExtend /
+// m_scoreGapClose, E and J with the corresponding "2" penalties.
+//
+//	PA, uLengthA	Profile A and its number of positions (must be > 0).
+//	PB, uLengthB	Profile B and its number of positions (must be > 0).
+//	Path		Cleared, then filled with the best path; E and J edges
+//			are reported as D and I respectively.
+//
+// Returns the score of the best path.
+//
+// If g_bKeepSimpleDP is set, the M/D/E/I/J score and trace-back matrices are
+// not freed but handed over through g_DPM ... g_TBJ; otherwise they are
+// deleted before returning. The letter-pair matrix DPL_ is always deleted.
+SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	assert(uLengthB > 0 && uLengthA > 0);
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+// (the product is formed in size_t so it cannot wrap in unsigned arithmetic)
+	const size_t LM = (size_t) uPrefixCountA*uPrefixCountB;
+	SCORE *DPL_ = new SCORE[LM];
+	SCORE *DPM_ = new SCORE[LM];
+	SCORE *DPD_ = new SCORE[LM];
+	SCORE *DPE_ = new SCORE[LM];
+	SCORE *DPI_ = new SCORE[LM];
+	SCORE *DPJ_ = new SCORE[LM];
+
+	char *TBM_ = new char[LM];
+	char *TBD_ = new char[LM];
+	char *TBE_ = new char[LM];
+	char *TBI_ = new char[LM];
+	char *TBJ_ = new char[LM];
+
+// '?' marks trace-back cells that were never written
+	memset(TBM_, '?', LM);
+	memset(TBD_, '?', LM);
+	memset(TBE_, '?', LM);
+	memset(TBI_, '?', LM);
+	memset(TBJ_, '?', LM);
+
+// Cell (0,0): both prefixes empty
+	DPM(0, 0) = 0;
+	DPD(0, 0) = MINUS_INFINITY;
+	DPE(0, 0) = MINUS_INFINITY;
+	DPI(0, 0) = MINUS_INFINITY;
+	DPJ(0, 0) = MINUS_INFINITY;
+
+// Cell (1,0): first letter of A against a gap, i.e. a gap is opened
+	DPM(1, 0) = MINUS_INFINITY;
+	DPD(1, 0) = PA[0].m_scoreGapOpen;
+	DPE(1, 0) = PA[0].m_scoreGapOpen2;
+	TBD(1, 0) = 'D';
+	TBE(1, 0) = 'E';
+	DPI(1, 0) = MINUS_INFINITY;
+	DPJ(1, 0) = MINUS_INFINITY;
+
+// Cell (0,1): first letter of B against a gap, i.e. a gap is opened
+	DPM(0, 1) = MINUS_INFINITY;
+	DPD(0, 1) = MINUS_INFINITY;
+	DPE(0, 1) = MINUS_INFINITY;
+	DPI(0, 1) = PB[0].m_scoreGapOpen;
+	DPJ(0, 1) = PB[0].m_scoreGapOpen2;
+	TBI(0, 1) = 'I';
+	TBJ(0, 1) = 'J';
+
+// Empty prefix of B is special case
+	for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+		DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend;
+		DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2;
+
+		TBD(uPrefixLengthA, 0) = 'D';
+		TBE(uPrefixLengthA, 0) = 'E';
+
+		DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+		DPJ(uPrefixLengthA, 0) = MINUS_INFINITY;
+		}
+
+// Empty prefix of A is special case
+	for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+		DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+		DPE(0, uPrefixLengthB) = MINUS_INFINITY;
+
+		DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend;
+		DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2;
+
+		TBI(0, uPrefixLengthB) = 'I';
+		TBJ(0, uPrefixLengthB) = 'J';
+		}
+
+// Special case to agree with NWFast, no D-I transitions so...
+	DPD(uLengthA, 0) = MINUS_INFINITY;
+	DPE(uLengthA, 0) = MINUS_INFINITY;
+//	DPI(0, uLengthB) = MINUS_INFINITY;
+//	DPJ(0, uLengthB) = MINUS_INFINITY;
+
+// ============
+// Main DP loop
+// ============
+// The gap-close scores carried into each iteration are those of the previous
+// position (MINUS_INFINITY for the first one); they are updated at the end
+// of each iteration.
+	SCORE scoreGapCloseB = MINUS_INFINITY;
+	SCORE scoreGapClose2B = MINUS_INFINITY;
+	for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+		SCORE scoreGapCloseA = MINUS_INFINITY;
+		SCORE scoreGapClose2A = MINUS_INFINITY;
+		for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+			{
+			const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+			{
+		// Match M=LetterA+LetterB
+			SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+			DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL;
+
+			SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+			SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+			SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A;
+			SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+			SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B;
+
+		// Ties are resolved in the order M, D, E, I, J
+			SCORE scoreBest;
+			if (scoreMM >= scoreDM && scoreMM >= scoreEM && scoreMM >= scoreIM && scoreMM >= scoreJM)
+				{
+				scoreBest = scoreMM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else if (scoreDM >= scoreMM && scoreDM >= scoreEM && scoreDM >= scoreIM && scoreDM >= scoreJM)
+				{
+				scoreBest = scoreDM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			else if (scoreEM >= scoreMM && scoreEM >= scoreDM && scoreEM >= scoreIM && scoreEM >= scoreJM)
+				{
+				scoreBest = scoreEM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM)
+				{
+				scoreBest = scoreIM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			else
+				{
+				assert(scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM);
+				scoreBest = scoreJM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
+			}
+
+			{
+		// Delete D=LetterA+GapB
+			SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen;
+			SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMD >= scoreDD)
+				{
+				scoreBest = scoreMD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreDD >= scoreMD);
+				scoreBest = scoreDD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			{
+		// Delete E=LetterA+GapB
+			SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen2;
+			SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2;
+
+			SCORE scoreBest;
+			if (scoreME >= scoreEE)
+				{
+				scoreBest = scoreME;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreEE >= scoreME);
+				scoreBest = scoreEE;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert I=GapA+LetterB
+			{
+			SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB - 1].m_scoreGapOpen;
+			SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMI >= scoreII)
+				{
+				scoreBest = scoreMI;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreII > scoreMI);
+				scoreBest = scoreII;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert J=GapA+LetterB
+			{
+			SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB - 1].m_scoreGapOpen2;
+			SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2;
+
+			SCORE scoreBest;
+			if (scoreMJ >= scoreJJ)
+				{
+				scoreBest = scoreMJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreJJ > scoreMJ);
+				scoreBest = scoreJJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			scoreGapCloseA = PPA.m_scoreGapClose;
+			scoreGapClose2A = PPA.m_scoreGapClose2;
+			}
+		scoreGapCloseB = PPB.m_scoreGapClose;
+		scoreGapClose2B = PPB.m_scoreGapClose2;
+		}
+
+#if TRACE
+	Log("\n");
+	Log("DA Simple DPL:\n");
+	ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple DPM:\n");
+	ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple DPD:\n");
+	ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple DPE:\n");
+	ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple DPI:\n");
+	ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple DPJ:\n");
+	ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple TBM:\n");
+	ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple TBD:\n");
+	ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple TBE:\n");
+	ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple TBI:\n");
+	ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("\n");
+	Log("DA Simple TBJ:\n");
+	ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+// Trace-back
+// ==========
+	Path.Clear();
+
+// Find last edge
+// (a path ending in a gap state still has to pay for closing that gap)
+	SCORE M = DPM(uLengthA, uLengthB);
+	SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose;
+	SCORE E = DPE(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose2;
+	SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose;
+	SCORE J = DPJ(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose2;
+	char cEdgeType = '?';
+
+	SCORE BestScore = M;
+	cEdgeType = 'M';
+	if (D > BestScore)
+		{
+		cEdgeType = 'D';
+		BestScore = D;
+		}
+	if (E > BestScore)
+		{
+		cEdgeType = 'E';
+		BestScore = E;
+		}
+	if (I > BestScore)
+		{
+		cEdgeType = 'I';
+		BestScore = I;
+		}
+	if (J > BestScore)
+		{
+		cEdgeType = 'J';
+		BestScore = J;
+		}
+
+#if TRACE
+	Log("DA Simple: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n",
+	  M, D, E, I, J, cEdgeType);
+#endif
+
+// Walk back from the last cell to (0,0), prepending one edge per step. The
+// trace-back matrix of the current state names the state of the previous
+// edge.
+	unsigned PLA = uLengthA;
+	unsigned PLB = uLengthB;
+	for (;;)
+		{
+		PWEdge Edge;
+		Edge.cType = XlatEdgeType(cEdgeType);
+		Edge.uPrefixLengthA = PLA;
+		Edge.uPrefixLengthB = PLB;
+#if TRACE
+		Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB);
+#endif
+		Path.PrependEdge(Edge);
+
+		switch (cEdgeType)
+			{
+		case 'M':
+			assert(PLA > 0);
+			assert(PLB > 0);
+			cEdgeType = TBM(PLA, PLB);
+			--PLA;
+			--PLB;
+			break;
+
+		case 'D':
+			assert(PLA > 0);
+			cEdgeType = TBD(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'E':
+			assert(PLA > 0);
+			cEdgeType = TBE(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'I':
+			assert(PLB > 0);
+			cEdgeType = TBI(PLA, PLB);
+			--PLB;
+			break;
+
+		case 'J':
+			assert(PLB > 0);
+			cEdgeType = TBJ(PLA, PLB);
+			--PLB;
+			break;
+
+		default:
+			Quit("Invalid edge %c", cEdgeType);
+			}
+		if (0 == PLA && 0 == PLB)
+			break;
+		}
+	Path.Validate();
+
+//	SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path);
+
+#if TRACE
+	SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path);
+	Path.LogMe();
+	Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath));
+#endif
+
+// The letter-pair scores are never handed over through a global, so they are
+// released on every path.
+	delete[] DPL_;
+
+	if (g_bKeepSimpleDP)
+		{
+		g_DPM = DPM_;
+		g_DPD = DPD_;
+		g_DPE = DPE_;
+		g_DPI = DPI_;
+		g_DPJ = DPJ_;
+
+		g_TBM = TBM_;
+		g_TBD = TBD_;
+		g_TBE = TBE_;
+		g_TBI = TBI_;
+		g_TBJ = TBJ_;
+		}
+	else
+		{
+		delete[] DPM_;
+		delete[] DPD_;
+		delete[] DPE_;
+		delete[] DPI_;
+		delete[] DPJ_;
+
+		delete[] TBM_;
+		delete[] TBD_;
+		delete[] TBE_;
+		delete[] TBI_;
+		delete[] TBJ_;
+		}
+
+	return BestScore;
+	}
+
+#endif // DOUBLE_AFFINE
Added: trunk/packages/muscle/branches/upstream/current/nwdasimple2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nwdasimple2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nwdasimple2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,549 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "profile.h"
+
+#if DOUBLE_AFFINE
+
+#define TRACE 0
+
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPE;
+extern SCORE *g_DPI;
+extern SCORE *g_DPJ;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBE;
+extern char *g_TBI;
+extern char *g_TBJ;
+
+// Edge types 'E' and 'J' are reported in the path as 'D' and 'I'; all other
+// characters pass through unchanged.
+static char XlatEdgeType(char c)
+	{
+	const char cTranslated = ('E' == c) ? 'D' : (('J' == c) ? 'I' : c);
+	return cTranslated;
+	}
+
+// Format a score for the trace listings with "%6.1f"; scores below -100000
+// are shown as " *" instead.
+//
+// The result points to a static buffer that the next call overwrites. The
+// write is bounded with snprintf so that a very large score cannot overrun
+// the buffer.
+static const char *LocalScoreToStr(SCORE s)
+	{
+	static char str[64];
+	if (s < -100000)
+		return " *";
+	snprintf(str, sizeof(str), "%6.1f", s);
+	return str;
+	}
+
+// Log a score matrix as a table. The header line lists the prefix lengths of
+// B; each following line starts with a prefix length of A. Every prefix
+// length carries the consensus character of the last profile position it
+// covers (blank for length 0). Cells are read through the DPM() macro.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	unsigned uB = 0;
+	while (uB < uPrefixCountB)
+		{
+		char cB = ' ';
+		if (0 != uB)
+			cB = ConsensusChar(PB[uB - 1]);
+		Log(" %4u:%c", uB, cB);
+		++uB;
+		}
+	Log("\n");
+
+	unsigned uA = 0;
+	while (uA < uPrefixCountA)
+		{
+		char cA = ' ';
+		if (0 != uA)
+			cA = ConsensusChar(PA[uA - 1]);
+		Log("%4u:%c ", uA, cA);
+
+		for (uB = 0; uB < uPrefixCountB; ++uB)
+			Log(" %s", LocalScoreToStr(DPM(uA, uB)));
+
+		Log("\n");
+		++uA;
+		}
+	}
+
+// Log a trace-back matrix as a table. The header line lists the prefix
+// lengths of B; each following line starts with a prefix length of A. Every
+// prefix length carries the consensus character of the last profile position
+// it covers (blank for length 0). Cells are read through the TBM() macro.
+static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	unsigned uB = 0;
+	while (uB < uPrefixCountB)
+		{
+		char cB = ' ';
+		if (0 != uB)
+			cB = ConsensusChar(PB[uB - 1]);
+		Log(" %4u:%c", uB, cB);
+		++uB;
+		}
+	Log("\n");
+
+	unsigned uA = 0;
+	while (uA < uPrefixCountA)
+		{
+		char cA = ' ';
+		if (0 != uA)
+			cA = ConsensusChar(PA[uA - 1]);
+		Log("%4u:%c ", uA, cA);
+
+		for (uB = 0; uB < uPrefixCountB; ++uB)
+			Log(" %6c", TBM(uA, uB));
+
+		Log("\n");
+		++uA;
+		}
+	}
+
+// Same table layout as ListDP, but each cell is logged after subtracting
+// (prefix length of A + prefix length of B) * g_scoreGapExtend from it.
+static void ListDPM(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+		{
+		const char cLabelB = (uColB > 0) ? ConsensusChar(PB[uColB - 1]) : ' ';
+		Log(" %4u:%c", uColB, cLabelB);
+		}
+	Log("\n");
+
+	for (unsigned uRowA = 0; uRowA < uPrefixCountA; ++uRowA)
+		{
+		const char cLabelA = (uRowA > 0) ? ConsensusChar(PA[uRowA - 1]) : ' ';
+		Log("%4u:%c ", uRowA, cLabelA);
+
+		for (unsigned uColB = 0; uColB < uPrefixCountB; ++uColB)
+			{
+			const SCORE scoreOffset = (uRowA + uColB)*g_scoreGapExtend;
+			const SCORE scoreShown = DPM(uRowA, uColB) - scoreOffset;
+			Log(" %s", LocalScoreToStr(scoreShown));
+			}
+
+		Log("\n");
+		}
+	}
+
+extern SCORE ScoreProfPos2(const ProfPos &PP, const ProfPos &PPB);
+
+// Double-affine global alignment of two profiles using full
+// (uLengthA+1) x (uLengthB+1) matrices.
+//
+// Five states are kept per cell: M (letter in A against letter in B), D and
+// E (letter in A against gap in B), I and J (gap in A against letter in B).
+// D and I are scored with m_scoreGapOpen / g_scoreGapExtend /
+// m_scoreGapClose, E and J with the corresponding "2" penalties. The
+// gap-close scores of the final positions are added into the last cell of
+// the D/E/I/J matrices before the trace-back.
+//
+//	PA, uLengthA	Profile A and its number of positions (must be > 0).
+//	PB, uLengthB	Profile B and its number of positions (must be > 0).
+//	Path		Cleared, then filled with the best path; E and J edges
+//			are reported as D and I respectively.
+//
+// Returns the score of the best path. Quits if the path does not account for
+// every position of A and of B.
+//
+// If g_bKeepSimpleDP is set, the M/D/E/I/J score and trace-back matrices are
+// not freed but handed over through g_DPM ... g_TBJ; otherwise they are
+// deleted before returning. The letter-pair matrix DPL_ is always deleted.
+SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	assert(uLengthB > 0 && uLengthA > 0);
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+// (the product is formed in size_t so it cannot wrap in unsigned arithmetic)
+	const size_t LM = (size_t) uPrefixCountA*uPrefixCountB;
+	SCORE *DPM_ = new SCORE[LM];
+	SCORE *DPD_ = new SCORE[LM];
+	SCORE *DPE_ = new SCORE[LM];
+	SCORE *DPI_ = new SCORE[LM];
+	SCORE *DPJ_ = new SCORE[LM];
+	SCORE *DPL_ = new SCORE[LM];
+
+	char *TBM_ = new char[LM];
+	char *TBD_ = new char[LM];
+	char *TBE_ = new char[LM];
+	char *TBI_ = new char[LM];
+	char *TBJ_ = new char[LM];
+
+	memset(DPM_, 0, LM*sizeof(SCORE));
+	memset(DPD_, 0, LM*sizeof(SCORE));
+	memset(DPE_, 0, LM*sizeof(SCORE));
+	memset(DPI_, 0, LM*sizeof(SCORE));
+	memset(DPJ_, 0, LM*sizeof(SCORE));
+
+//	memset(DPL_, 0, LM*sizeof(SCORE));
+
+// '?' marks trace-back cells that were never written
+	memset(TBM_, '?', LM);
+	memset(TBD_, '?', LM);
+	memset(TBE_, '?', LM);
+	memset(TBI_, '?', LM);
+	memset(TBJ_, '?', LM);
+
+// Cell (0,0): both prefixes empty
+	DPM(0, 0) = 0;
+	DPD(0, 0) = MINUS_INFINITY;
+	DPE(0, 0) = MINUS_INFINITY;
+	DPI(0, 0) = MINUS_INFINITY;
+	DPJ(0, 0) = MINUS_INFINITY;
+
+// Cell (1,0): first letter of A against a gap, i.e. a gap is opened
+	DPM(1, 0) = MINUS_INFINITY;
+	DPD(1, 0) = PA[0].m_scoreGapOpen;
+	DPE(1, 0) = PA[0].m_scoreGapOpen2;
+	DPI(1, 0) = MINUS_INFINITY;
+	DPJ(1, 0) = MINUS_INFINITY;
+
+// Cell (0,1): first letter of B against a gap, i.e. a gap is opened
+	DPM(0, 1) = MINUS_INFINITY;
+	DPD(0, 1) = MINUS_INFINITY;
+	DPE(0, 1) = MINUS_INFINITY;
+	DPI(0, 1) = PB[0].m_scoreGapOpen;
+	DPJ(0, 1) = PB[0].m_scoreGapOpen2;
+
+// Empty prefix of B is special case
+	for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+	// M=LetterA+LetterB, impossible with empty prefix
+		DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+	// D=LetterA+GapB
+		DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend;
+		TBD(uPrefixLengthA, 0) = 'D';
+
+		DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2;
+		TBE(uPrefixLengthA, 0) = 'E';
+
+	// I=GapA+LetterB, impossible with empty prefix
+		DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+		DPJ(uPrefixLengthA, 0) = MINUS_INFINITY;
+		}
+
+// Empty prefix of A is special case
+	for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+	// M=LetterA+LetterB, impossible with empty prefix
+		DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+	// D=LetterA+GapB, impossible with empty prefix
+		DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+		DPE(0, uPrefixLengthB) = MINUS_INFINITY;
+
+	// I=GapA+LetterB
+		DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend;
+		TBI(0, uPrefixLengthB) = 'I';
+
+		DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2;
+		TBJ(0, uPrefixLengthB) = 'J';
+		}
+
+// ============
+// Main DP loop
+// ============
+	for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+	// Gap-close scores are taken from the previous position of B; there is
+	// none for the first position.
+		SCORE scoreGapCloseB;
+		if (uPrefixLengthB == 1)
+			scoreGapCloseB = MINUS_INFINITY;
+		else
+			scoreGapCloseB = PB[uPrefixLengthB-2].m_scoreGapClose;
+
+		SCORE scoreGapClose2B;
+		if (uPrefixLengthB == 1)
+			scoreGapClose2B = MINUS_INFINITY;
+		else
+			scoreGapClose2B = PB[uPrefixLengthB-2].m_scoreGapClose2;
+
+		for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+			{
+			const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+			{
+		// Match M=LetterA+LetterB
+			SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+			DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL;
+
+		// Same rule for A: close scores come from the previous position
+			SCORE scoreGapCloseA;
+			if (uPrefixLengthA == 1)
+				scoreGapCloseA = MINUS_INFINITY;
+			else
+				scoreGapCloseA = PA[uPrefixLengthA-2].m_scoreGapClose;
+
+			SCORE scoreGapClose2A;
+			if (uPrefixLengthA == 1)
+				scoreGapClose2A = MINUS_INFINITY;
+			else
+				scoreGapClose2A = PA[uPrefixLengthA-2].m_scoreGapClose2;
+
+			SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+			SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+			SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A;
+			SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+			SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B;
+
+		// Ties are resolved in the order M, D, E, I, J
+			SCORE scoreBest;
+			if (scoreMM >= scoreDM && scoreMM >= scoreIM && scoreMM >= scoreEM && scoreMM >= scoreJM)
+				{
+				scoreBest = scoreMM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else if (scoreDM >= scoreMM && scoreDM >= scoreIM && scoreDM >= scoreEM && scoreDM >= scoreJM)
+				{
+				scoreBest = scoreDM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			else if (scoreEM >= scoreMM && scoreEM >= scoreIM && scoreEM >= scoreDM && scoreEM >= scoreJM)
+				{
+				scoreBest = scoreEM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM)
+				{
+				scoreBest = scoreIM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			else if (scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM)
+				{
+				scoreBest = scoreJM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			else
+				Quit("Max failed (M)");
+
+			DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
+			}
+
+			{
+		// Delete D=LetterA+GapB
+			SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen;
+			SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) +
+			  g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMD >= scoreDD)
+				{
+				scoreBest = scoreMD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreDD >= scoreMD);
+				scoreBest = scoreDD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			{
+		// Delete E=LetterA+GapB
+			SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen2;
+			SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) +
+			  g_scoreGapExtend2;
+
+			SCORE scoreBest;
+			if (scoreME >= scoreEE)
+				{
+				scoreBest = scoreME;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreEE >= scoreME);
+				scoreBest = scoreEE;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert I=GapA+LetterB
+			{
+			SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB-1].m_scoreGapOpen;
+			SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) +
+			  g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMI >= scoreII)
+				{
+				scoreBest = scoreMI;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreII > scoreMI);
+				scoreBest = scoreII;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert J=GapA+LetterB
+		// (unlike the other gap states, a tie here continues the J gap)
+			{
+			SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB-1].m_scoreGapOpen2;
+			SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) +
+			  g_scoreGapExtend2;
+
+			SCORE scoreBest;
+			if (scoreMJ > scoreJJ)
+				{
+				scoreBest = scoreMJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreJJ >= scoreMJ);
+				scoreBest = scoreJJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+			}
+		}
+
+// Special case: close gaps at end of alignment
+	DPD(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose;
+	DPE(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose2;
+
+	DPI(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose;
+	DPJ(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose2;
+
+#if TRACE
+	Log("DPL:\n");
+	ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPM:\n");
+	ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPD:\n");
+	ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPE:\n");
+	ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPI:\n");
+	ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPJ:\n");
+	ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBM:\n");
+	ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBD:\n");
+	ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBE:\n");
+	ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBI:\n");
+	ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBJ:\n");
+	ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+// ==========
+// Trace-back
+// ==========
+
+	Path.Clear();
+
+// Find last edge
+	char cEdgeType = '?';
+	SCORE BestScore = MINUS_INFINITY;
+	SCORE M = DPM(uLengthA, uLengthB);
+	SCORE D = DPD(uLengthA, uLengthB);
+	SCORE E = DPE(uLengthA, uLengthB);
+	SCORE I = DPI(uLengthA, uLengthB);
+	SCORE J = DPJ(uLengthA, uLengthB);
+
+	if (M >= D && M >= E && M >= I && M >= J)
+		{
+		cEdgeType = 'M';
+		BestScore = M;
+		}
+	else if (D >= M && D >= E && D >= I && D >= J)
+		{
+		cEdgeType = 'D';
+		BestScore = D;
+		}
+	else if (E >= M && E >= D && E >= I && E >= J)
+		{
+		cEdgeType = 'E';
+		BestScore = E;
+		}
+	else if (I >= M && I >= D && I >= E && I >= J)
+		{
+		cEdgeType = 'I';
+		BestScore = I;
+		}
+	else if (J >= M && J >= D && J >= E && J >= I)
+		{
+		cEdgeType = 'J';
+		BestScore = J;
+		}
+	else
+		Quit("Bad max");
+
+// Walk back from the last cell to (0,0), prepending one edge per step. The
+// trace-back matrix of the current state names the state of the previous
+// edge.
+	unsigned PLA = uLengthA;
+	unsigned PLB = uLengthB;
+	unsigned ECount = 0;
+	unsigned JCount = 0;
+	for (;;)
+		{
+#if TRACE
+		Log("TraceBack: %c%u.%u\n", cEdgeType, PLA, PLB);
+#endif
+		PWEdge Edge;
+		Edge.cType = XlatEdgeType(cEdgeType);
+		Edge.uPrefixLengthA = PLA;
+		Edge.uPrefixLengthB = PLB;
+		Path.PrependEdge(Edge);
+
+		switch (cEdgeType)
+			{
+		case 'M':
+			assert(PLA > 0);
+			assert(PLB > 0);
+			cEdgeType = TBM(PLA, PLB);
+			--PLA;
+			--PLB;
+			break;
+
+		case 'D':
+			assert(PLA > 0);
+			cEdgeType = TBD(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'E':
+			++ECount;
+			assert(PLA > 0);
+			cEdgeType = TBE(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'I':
+			assert(PLB > 0);
+			cEdgeType = TBI(PLA, PLB);
+			--PLB;
+			break;
+
+		case 'J':
+			++JCount;
+			assert(PLB > 0);
+			cEdgeType = TBJ(PLA, PLB);
+			--PLB;
+			break;
+
+		default:
+			Quit("Invalid edge %c", cEdgeType);
+			}
+		if (0 == PLA && 0 == PLB)
+			break;
+		}
+	//if (ECount > 0 || JCount > 0)
+	//	fprintf(stderr, "E=%d J=%d\n", ECount, JCount);
+	Path.Validate();
+
+// Every position of A and of B must be consumed exactly once by the path
+	if (Path.GetMatchCount() + Path.GetDeleteCount() != uLengthA)
+		Quit("Path count A");
+	if (Path.GetMatchCount() + Path.GetInsertCount() != uLengthB)
+		Quit("Path count B");
+
+// The letter-pair scores are never handed over through a global, so they are
+// released on every path.
+	delete[] DPL_;
+
+	if (g_bKeepSimpleDP)
+		{
+		g_DPM = DPM_;
+		g_DPD = DPD_;
+		g_DPE = DPE_;
+		g_DPI = DPI_;
+		g_DPJ = DPJ_;
+
+		g_TBM = TBM_;
+		g_TBD = TBD_;
+		g_TBE = TBE_;
+		g_TBI = TBI_;
+		g_TBJ = TBJ_;
+		}
+	else
+		{
+		delete[] DPM_;
+		delete[] DPD_;
+		delete[] DPE_;
+		delete[] DPI_;
+		delete[] DPJ_;
+
+		delete[] TBM_;
+		delete[] TBD_;
+		delete[] TBE_;
+		delete[] TBI_;
+		delete[] TBJ_;
+		}
+
+#if TRACE
+	Log("BestScore=%.6g\n", BestScore);
+#endif
+	return BestScore;
+	}
+
+#endif // DOUBLE_AFFINE
Added: trunk/packages/muscle/branches/upstream/current/nwdasmall.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nwdasmall.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nwdasmall.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,947 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+#if DOUBLE_AFFINE
+
+// NW double affine small memory, term gaps fully penalized
+// (so up to caller to adjust in profile if desired).
+
+#define TRACE 0
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+#if TRACE
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPE;
+extern SCORE *g_DPI;
+extern SCORE *g_DPJ;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBE;
+extern char *g_TBI;
+extern char *g_TBJ;
+#endif
+
+#if TRACE
+#define ALLOC_TRACE() \
+ const SCORE UNINIT = MINUS_INFINITY; \
+ const size_t LM = uPrefixCountA*uPrefixCountB; \
+ \
+ SCORE *DPM_ = new SCORE[LM]; \
+ SCORE *DPD_ = new SCORE[LM]; \
+ SCORE *DPE_ = new SCORE[LM]; \
+ SCORE *DPI_ = new SCORE[LM]; \
+ SCORE *DPJ_ = new SCORE[LM]; \
+ \
+ char *TBM_ = new char[LM]; \
+ char *TBD_ = new char[LM]; \
+ char *TBE_ = new char[LM]; \
+ char *TBI_ = new char[LM]; \
+ char *TBJ_ = new char[LM]; \
+ \
+ memset(TBM_, '?', LM); \
+ memset(TBD_, '?', LM); \
+ memset(TBE_, '?', LM); \
+ memset(TBI_, '?', LM); \
+ memset(TBJ_, '?', LM); \
+ \
+ for (unsigned i = 0; i <= uLengthA; ++i) \
+ for (unsigned j = 0; j <= uLengthB; ++j) \
+ { \
+ DPM(i, j) = UNINIT; \
+ DPD(i, j) = UNINIT; \
+ DPE(i, j) = UNINIT; \
+ DPI(i, j) = UNINIT; \
+ DPJ(i, j) = UNINIT; \
+ }
+#else
+#define ALLOC_TRACE()
+#endif
+
+#if TRACE
+#define SetDPM(i, j, x) DPM(i, j) = x
+#define SetDPD(i, j, x) DPD(i, j) = x
+#define SetDPE(i, j, x) DPE(i, j) = x
+#define SetDPI(i, j, x) DPI(i, j) = x
+#define SetDPJ(i, j, x) DPJ(i, j) = x
+#define SetTBM(i, j, x) TBM(i, j) = x
+#define SetTBD(i, j, x) TBD(i, j) = x
+#define SetTBE(i, j, x) TBE(i, j) = x
+#define SetTBI(i, j, x) TBI(i, j) = x
+#define SetTBJ(i, j, x) TBJ(i, j) = x
+#else
+#define SetDPM(i, j, x) /* empty */
+#define SetDPD(i, j, x) /* empty */
+#define SetDPE(i, j, x) /* empty */
+#define SetDPI(i, j, x) /* empty */
+#define SetDPJ(i, j, x) /* empty */
+#define SetTBM(i, j, x) /* empty */
+#define SetTBD(i, j, x) /* empty */
+#define SetTBE(i, j, x) /* empty */
+#define SetTBI(i, j, x) /* empty */
+#define SetTBJ(i, j, x) /* empty */
+#endif
+
+#define RECURSE_D(i, j) \
+ { \
+ SCORE DD = DRow[j] + e; \
+ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\
+ if (DD > MD) \
+ { \
+ DRow[j] = DD; \
+ SetTBD(i, j, 'D'); \
+ } \
+ else \
+ { \
+ DRow[j] = MD; \
+ SetBitTBD(TB, i, j, 'M'); \
+ SetTBD(i, j, 'M'); \
+ } \
+ SetDPD(i, j, DRow[j]); \
+ }
+
+#define RECURSE_E(i, j) \
+ { \
+ SCORE EE = ERow[j] + e2; \
+ SCORE ME = MPrev[j] + PA[i-1].m_scoreGapOpen2;\
+ if (EE > ME) \
+ { \
+ ERow[j] = EE; \
+ SetTBE(i, j, 'E'); \
+ } \
+ else \
+ { \
+ ERow[j] = ME; \
+ SetBitTBE(TB, i, j, 'M'); \
+ SetTBE(i, j, 'M'); \
+ } \
+ SetDPE(i, j, ERow[j]); \
+ }
+
+#define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j)
+#define RECURSE_E_ATerm(j) RECURSE_E(uLengthA, j)
+
+#define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB)
+#define RECURSE_E_BTerm(j) RECURSE_E(i, uLengthB)
+
+#define RECURSE_I(i, j) \
+ { \
+ Iij += e; \
+ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\
+ if (MI >= Iij) \
+ { \
+ Iij = MI; \
+ SetBitTBI(TB, i, j, 'M'); \
+ SetTBI(i, j, 'M'); \
+ } \
+ else \
+ SetTBI(i, j, 'I'); \
+ SetDPI(i, j, Iij); \
+ }
+
+#define RECURSE_J(i, j) /* J(i,j): second insert state; updates the running Jij */ \
+ { \
+ Jij += e2; /* extend the gap that is already open */ \
+ SCORE MJ = MCurr[j-1] + PB[j-1].m_scoreGapOpen2;\
+ if (MJ >= Jij) /* opening from M wins ties */ \
+ { \
+ Jij = MJ; \
+ SetBitTBJ(TB, i, j, 'M'); \
+ SetTBJ(i, j, 'M'); \
+ } \
+ else \
+ SetTBJ(i, j, 'J'); /* was 'I': copy/paste slip from RECURSE_I */ \
+ SetDPJ(i, j, Jij); \
+ }
+
+#define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j)
+#define RECURSE_J_ATerm(j) RECURSE_J(uLengthA, j)
+
+#define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB)
+#define RECURSE_J_BTerm(j) RECURSE_J(i, uLengthB)
+
+#define RECURSE_M(i, j) \
+ { \
+ SCORE Best = MCurr[j]; /* MM */ \
+ SetTBM(i+1, j+1, 'M'); \
+ SetBitTBM(TB, i+1, j+1, 'M'); \
+ \
+ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \
+ if (DM > Best) \
+ { \
+ Best = DM; \
+ SetTBM(i+1, j+1, 'D'); \
+ SetBitTBM(TB, i+1, j+1, 'D'); \
+ } \
+ \
+ SCORE EM = ERow[j] + PA[i-1].m_scoreGapClose2; \
+ if (EM > Best) \
+ { \
+ Best = EM; \
+ SetTBM(i+1, j+1, 'E'); \
+ SetBitTBM(TB, i+1, j+1, 'E'); \
+ } \
+ \
+ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \
+ if (IM > Best) \
+ { \
+ Best = IM; \
+ SetTBM(i+1, j+1, 'I'); \
+ SetBitTBM(TB, i+1, j+1, 'I'); \
+ } \
+ \
+ SCORE JM = Jij + PB[j-1].m_scoreGapClose2; \
+ if (JM > Best) \
+ { \
+ Best = JM; \
+ SetTBM(i+1, j+1, 'J'); \
+ SetBitTBM(TB, i+1, j+1, 'J'); \
+ } \
+ MNext[j+1] += Best; \
+ SetDPM(i+1, j+1, MNext[j+1]); \
+ }
+
+#if TRACE
+// Approximate equality for DP scores: true when both are below -100000, or they differ
+// by less than 1e-4, or the difference is under 0.5% of the summed magnitudes.
+static bool LocalEq(BASETYPE b1, BASETYPE b2)
+ {
+ const bool bBothMinusInf = (b1 < -100000 && b2 < -100000);
+ const double dDelta = fabs(b1 - b2);
+ if (bBothMinusInf || dDelta < 0.0001)
+ return true;
+ return dDelta/(fabs(b1) + fabs(b2)) < 0.005;
+ }
+
+static char Get_M_Char(char Bits)
+ {
+ switch (Bits & BIT_xM)
+ {
+ case BIT_MM:
+ return 'M';
+ case BIT_DM:
+ return 'D';
+ case BIT_EM:
+ return 'E';
+ case BIT_IM:
+ return 'I';
+ case BIT_JM:
+ return 'J';
+ }
+ Quit("Huh?");
+ return '?';
+ }
+
+static char Get_D_Char(char Bits)
+ {
+ return (Bits & BIT_xD) ? 'M' : 'D';
+ }
+
+static char Get_E_Char(char Bits)
+ {
+ return (Bits & BIT_xE) ? 'M' : 'E';
+ }
+
+static char Get_I_Char(char Bits)
+ {
+ return (Bits & BIT_xI) ? 'M' : 'I';
+ }
+
+static char Get_J_Char(char Bits)
+ {
+ return (Bits & BIT_xJ) ? 'M' : 'J';
+ }
+
+static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ if (0 == g_DP)
+ {
+ Log("***DPDIFF*** DP%c=NULL\n", c);
+ return true;
+ }
+
+ SCORE *DPM_ = g_DP;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ if (!LocalEq(DPM(i, j), DPD(i, j)))
+ {
+ Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Small = %.2g\n",
+ c, i, j, DPM(i, j), DPD(i, j));
+ return false;
+ }
+ return true;
+ }
+
+static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBE_, char *TBI_, char *TBJ_,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ if (!g_bKeepSimpleDP)
+ return true;
+ SCORE *DPM_ = g_DPM;
+ bool Eq = true;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBM(i, j);
+ char c2 = Get_M_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000)
+ {
+ Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto D;
+ }
+ }
+
+D:
+ SCORE *DPD_ = g_DPD;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBD(i, j);
+ char c2 = Get_D_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000)
+ {
+ Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto E;
+ }
+ }
+E:
+ SCORE *DPE_ = g_DPE;
+ if (0 == TBE_)
+ goto I;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBE(i, j);
+ char c2 = Get_E_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPE(i, j) > -100000)
+ {
+ Log("TBE(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto I;
+ }
+ }
+I:
+ SCORE *DPI_ = g_DPI;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBI(i, j);
+ char c2 = Get_I_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPI(i, j) > -100000)
+ {
+ Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto J;
+ }
+ }
+J:
+ SCORE *DPJ_ = g_DPJ;
+ if (0 == DPJ_)
+ goto Done;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBJ(i, j);
+ char c2 = Get_J_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPJ(i, j) > -100000)
+ {
+ Log("TBJ(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto Done;
+ }
+ }
+Done:
+ if (Eq)
+ Log("TB success\n");
+ return Eq;
+ }
+
+static const char *LocalScoreToStr(SCORE s)
+ {
+ static char str[16];
+ if (s < -100000)
+ return " *";
+ sprintf(str, "%6.1f", s);
+ return str;
+ }
+
+static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ Log(" ");
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = ' ';
+ if (uPrefixLengthB > 0)
+ c = ConsensusChar(PB[uPrefixLengthB - 1]);
+ Log(" %4u:%c", uPrefixLengthB, c);
+ }
+ Log("\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
+ Log("\n");
+ }
+ }
+
+static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ Log(" ");
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = ' ';
+ if (uPrefixLengthB > 0)
+ c = ConsensusChar(PB[uPrefixLengthB - 1]);
+ Log(" %4u:%c", uPrefixLengthB, c);
+ }
+ Log("\n");
+ Log("Bit TBM:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBD:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBE:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_E_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBI:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBJ:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_J_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+ }
+
+static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ Log(" ");
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = ' ';
+ if (uPrefixLengthB > 0)
+ c = ConsensusChar(PB[uPrefixLengthB - 1]);
+ Log(" %4u:%c", uPrefixLengthB, c);
+ }
+ Log("\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = TBM(uPrefixLengthA, uPrefixLengthB);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+ }
+
+// Formats the traceback bits of one cell as "xM xD xE xI xJ" (14 chars + NUL).
+// The result lives in a static buffer, so each call overwrites the previous one.
+static const char *BitsToStr(char Bits)
+ {
+ static char Str[32];
+ sprintf(Str, "%cM %cD %cE %cI %cJ",
+ Get_M_Char(Bits), Get_D_Char(Bits), Get_E_Char(Bits),
+ Get_I_Char(Bits), Get_J_Char(Bits));
+ // The original fell off the end of this non-void function (undefined behaviour).
+ return Str;
+ }
+#endif // TRACE
+
+static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c)
+ {
+ char Bit;
+ switch (c)
+ {
+ case 'M':
+ Bit = BIT_MM;
+ break;
+ case 'D':
+ Bit = BIT_DM;
+ break;
+#if DOUBLE_AFFINE
+ case 'E':
+ Bit = BIT_EM;
+ break;
+ case 'I':
+ Bit = BIT_IM;
+ break;
+ case 'J':
+ Bit = BIT_JM;
+ break;
+#endif
+ default:
+ Quit("Huh?!");
+ }
+ TB[i][j] &= ~BIT_xM;
+ TB[i][j] |= Bit;
+ }
+
+// Stores in the D field of TB[i][j] which state precedes D(i,j):
+// 'M' selects BIT_MD, 'D' selects BIT_DD; any other code calls Quit.
+static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c)
+ {
+ char Bit;
+ if ('M' == c)
+ Bit = BIT_MD;
+ else if ('D' == c)
+ Bit = BIT_DD;
+ else
+ Quit("Huh?!");
+
+ // Clear the D field, then set the new value.
+ char *pCell = &TB[i][j];
+ *pCell &= ~BIT_xD;
+ *pCell |= Bit;
+ }
+
+static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c)
+ {
+ char Bit;
+ switch (c)
+ {
+ case 'M':
+ Bit = BIT_MI;
+ break;
+ case 'I':
+ Bit = BIT_II;
+ break;
+ default:
+ Quit("Huh?!");
+ }
+ TB[i][j] &= ~BIT_xI;
+ TB[i][j] |= Bit;
+ }
+
+#if DOUBLE_AFFINE
+static inline void SetBitTBE(char **TB, unsigned i, unsigned j, char c)
+ {
+ char Bit;
+ switch (c)
+ {
+ case 'M':
+ Bit = BIT_ME;
+ break;
+ case 'E':
+ Bit = BIT_EE;
+ break;
+ default:
+ Quit("Huh?!");
+ }
+ TB[i][j] &= ~BIT_xE;
+ TB[i][j] |= Bit;
+ }
+
+// Stores in the J field of TB[i][j] which state precedes J(i,j):
+// 'M' selects BIT_MJ, 'J' selects BIT_JJ; any other code calls Quit.
+static inline void SetBitTBJ(char **TB, unsigned i, unsigned j, char c)
+ {
+ char Bit;
+ if ('M' == c)
+ Bit = BIT_MJ;
+ else if ('J' == c)
+ Bit = BIT_JJ;
+ else
+ Quit("Huh?!");
+
+ // Clear the J field, then set the new value.
+ char *pCell = &TB[i][j];
+ *pCell &= ~BIT_xJ;
+ *pCell |= Bit;
+ }
+#endif
+
+#if TRACE
+#define LogMatrices() \
+ { \
+ Log("Bit DPM:\n"); \
+ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPD:\n"); \
+ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPE:\n"); \
+ LogDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPI:\n"); \
+ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPJ:\n"); \
+ LogDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit TB:\n"); \
+ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \
+ bool Same; \
+ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPM success\n"); \
+ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPD success\n"); \
+ Same = DPEq('E', g_DPE, DPE_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPE success\n"); \
+ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPI success\n"); \
+ Same = DPEq('J', g_DPJ, DPJ_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPJ success\n"); \
+ CompareTB(TB, g_TBM, g_TBD, g_TBE, g_TBI, g_TBJ, uPrefixCountA, uPrefixCountB);\
+ }
+#else
+#define LogMatrices() /* empty */
+#endif
+
+// NW double affine, small memory: aligns profile PA (uLengthA columns) with PB (uLengthB
+// columns) using two gap states per profile (D/E scored from PA, I/J scored from PB).
+// Scores live in rolling rows; per-cell choices are stored in the bit matrix TB, which is
+// handed to BitTraceBack along with Path. Returns 0 (the best score is not returned).
+SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path)
+ {
+ assert(uLengthB > 0 && uLengthA > 0);
+// NOTE(review): the terminal gap open/close scores are negated in place through a const
+// cast and are not restored before returning -- confirm that callers expect this.
+ ProfPos *pa0 = (ProfPos *) PA;
+ ProfPos *pb0 = (ProfPos *) PB;
+ ProfPos *paa = (ProfPos *) (PA + uLengthA - 1);
+ ProfPos *pbb = (ProfPos *) (PB + uLengthB - 1);
+
+ pa0->m_scoreGapOpen *= -1;
+ pb0->m_scoreGapOpen *= -1;
+ paa->m_scoreGapClose *= -1;
+ pbb->m_scoreGapClose *= -1;
+ pa0->m_scoreGapOpen2 *= -1;
+ pb0->m_scoreGapOpen2 *= -1;
+ paa->m_scoreGapClose2 *= -1;
+ pbb->m_scoreGapClose2 *= -1;
+
+ const unsigned uPrefixCountA = uLengthA + 1;
+ const unsigned uPrefixCountB = uLengthB + 1;
+ const SCORE e = g_scoreGapExtend;
+ const SCORE e2 = g_scoreGapExtend2;
+ const SCORE min_e = MIN(g_scoreGapExtend, g_scoreGapExtend2);
+
+ ALLOC_TRACE()
+
+ SCORE *MCurr = new SCORE[uPrefixCountB];
+ SCORE *MNext = new SCORE[uPrefixCountB];
+ SCORE *MPrev = new SCORE[uPrefixCountB];
+ SCORE *DRow = new SCORE[uPrefixCountB];
+ SCORE *ERow = new SCORE[uPrefixCountB];
+
+ char **TB = new char *[uPrefixCountA];
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ {
+ TB[i] = new char [uPrefixCountB];
+ memset(TB[i], 0, uPrefixCountB);
+ }
+
+ SCORE Iij = MINUS_INFINITY;
+ SetDPI(0, 0, Iij);
+ SCORE Jij = MINUS_INFINITY;
+ SetDPJ(0, 0, Jij);
+
+ Iij = PB[0].m_scoreGapOpen;
+ SetDPI(0, 1, Iij);
+ Jij = PB[0].m_scoreGapOpen2;
+ SetDPJ(0, 1, Jij);
+
+ for (unsigned j = 2; j <= uLengthB; ++j)
+ {
+ Iij += e;
+ Jij += e2;
+ SetDPI(0, j, Iij);
+ SetDPJ(0, j, Jij);
+ SetTBI(0, j, 'I');
+ SetTBJ(0, j, 'J');
+ }
+
+ for (unsigned j = 0; j <= uLengthB; ++j)
+ {
+ DRow[j] = MINUS_INFINITY;
+ ERow[j] = MINUS_INFINITY;
+ SetDPD(0, j, DRow[j]);
+ SetDPE(0, j, ERow[j]);
+ SetTBD(0, j, 'D');
+ SetTBE(0, j, 'E');
+ }
+
+ MPrev[0] = 0;
+ SetDPM(0, 0, MPrev[0]);
+ for (unsigned j = 1; j <= uLengthB; ++j)
+ {
+ MPrev[j] = MINUS_INFINITY;
+ SetDPM(0, j, MPrev[j]);
+ }
+
+ MCurr[0] = MINUS_INFINITY;
+ SetDPM(1, 0, MCurr[0]);
+
+ MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+ SetDPM(1, 1, MCurr[1]);
+ SetBitTBM(TB, 1, 1, 'M');
+ SetTBM(1, 1, 'M');
+
+ for (unsigned j = 2; j <= uLengthB; ++j)
+ {
+ SCORE M = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen +
+ (j - 2)*e + PB[j-2].m_scoreGapClose;
+ SCORE M2 = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen2 +
+ (j - 2)*e2 + PB[j-2].m_scoreGapClose2;
+
+ if (M >= M2)
+ {
+ MCurr[j] = M;
+ SetBitTBM(TB, 1, j, 'I');
+ SetTBM(1, j, 'I');
+ }
+ else
+ {
+ MCurr[j] = M2;
+ SetBitTBM(TB, 1, j, 'J');
+ SetTBM(1, j, 'J');
+ }
+ SetDPM(1, j, MCurr[j]);
+ }
+
+// Main DP loop
+ for (unsigned i = 1; i < uLengthA; ++i)
+ {
+ Iij = MINUS_INFINITY;
+ Jij = MINUS_INFINITY;
+ SetDPI(i, 0, Iij);
+ SetDPJ(i, 0, Jij);
+
+ DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e;
+ ERow[0] = PA[0].m_scoreGapOpen2 + (i - 1)*e2;
+ SetDPD(i, 0, DRow[0]);
+ SetDPE(i, 0, ERow[0]);
+
+ MCurr[0] = MINUS_INFINITY;
+ if (i == 1)
+ {
+ MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+ SetBitTBM(TB, i, 1, 'M');
+ SetTBM(i, 1, 'M');
+ }
+ else
+ {
+ SCORE M = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen +
+ (i - 2)*e + PA[i-2].m_scoreGapClose;
+ SCORE M2 = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen2 +
+ (i - 2)*e2 + PA[i-2].m_scoreGapClose2;
+ if (M >= M2)
+ {
+ MCurr[1] = M;
+ SetBitTBM(TB, i, 1, 'D');
+ SetTBM(i, 1, 'D');
+ }
+ else
+ {
+ MCurr[1] = M2;
+ SetBitTBM(TB, i, 1, 'E');
+ SetTBM(i, 1, 'E');
+ }
+ }
+ SetDPM(i, 0, MCurr[0]);
+ SetDPM(i, 1, MCurr[1]);
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ MNext[j+1] = ScoreProfPos2(PA[i], PB[j]);
+
+ for (unsigned j = 1; j < uLengthB; ++j)
+ {
+ RECURSE_D(i, j)
+ RECURSE_E(i, j)
+ RECURSE_I(i, j)
+ RECURSE_J(i, j)
+ RECURSE_M(i, j)
+ }
+ // Special case for j=uLengthB
+ RECURSE_D_BTerm(i)
+ RECURSE_E_BTerm(i)
+ RECURSE_I_BTerm(i)
+ RECURSE_J_BTerm(i)
+
+ // Prev := Curr, Curr := Next, Next := Prev
+ Rotate(MPrev, MCurr, MNext);
+ }
+
+// Special case for i=uLengthA
+ MCurr[0] = MINUS_INFINITY;
+// M(|A|,1): the leading |A|-1 columns of A are deleted through the D or the E state,
+// whichever scores better. Skipped when |A| = 1: row 1 was initialised above, and
+ if (uLengthA > 1) // uLengthA - 2 would wrap and index PA[-1].
+ {
+ SCORE M = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e +
+ PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose;
+ SCORE M2 = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e2 +
+ PA[0].m_scoreGapOpen2 + PA[uLengthA-2].m_scoreGapClose2;
+ if (M >= M2)
+ {
+ MCurr[1] = M;
+ SetBitTBM(TB, uLengthA, 1, 'D');
+ SetTBM(uLengthA, 1, 'D');
+ }
+ else
+ {
+ MCurr[1] = M2;
+ SetBitTBM(TB, uLengthA, 1, 'E');
+ SetTBM(uLengthA, 1, 'E');
+ }
+ }
+ SetDPM(uLengthA, 0, MCurr[0]);
+ SetDPM(uLengthA, 1, MCurr[1]);
+
+ DRow[0] = MINUS_INFINITY;
+ ERow[0] = MINUS_INFINITY;
+ SetDPD(uLengthA, 0, DRow[0]);
+ SetDPE(uLengthA, 0, ERow[0]);
+
+ for (unsigned j = 1; j <= uLengthB; ++j)
+ {
+ RECURSE_D_ATerm(j);
+ RECURSE_E_ATerm(j);
+ }
+
+ Iij = MINUS_INFINITY;
+ Jij = MINUS_INFINITY;
+
+ for (unsigned j = 1; j <= uLengthB; ++j)
+ {
+ RECURSE_I_ATerm(j)
+ RECURSE_J_ATerm(j)
+ }
+
+ LogMatrices();
+
+ SCORE MAB = MCurr[uLengthB];
+ SCORE DAB = DRow[uLengthB] + PA[uLengthA-1].m_scoreGapClose;
+ SCORE EAB = ERow[uLengthB] + PA[uLengthA-1].m_scoreGapClose2;
+ SCORE IAB = Iij + PB[uLengthB-1].m_scoreGapClose;
+ SCORE JAB = Jij + PB[uLengthB-1].m_scoreGapClose2;
+
+ SCORE Score = MAB;
+ char cEdgeType = 'M';
+ if (DAB > Score)
+ {
+ Score = DAB;
+ cEdgeType = 'D';
+ }
+ if (EAB > Score)
+ {
+ Score = EAB;
+ cEdgeType = 'E';
+ }
+ if (IAB > Score)
+ {
+ Score = IAB;
+ cEdgeType = 'I';
+ }
+ if (JAB > Score)
+ {
+ Score = JAB;
+ cEdgeType = 'J';
+ }
+
+#if TRACE
+ Log(" Small: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n",
+ MAB, DAB, EAB, IAB, JAB, cEdgeType);
+#endif
+
+ BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path);
+
+#if DEBUG // was misspelled DBEUG, so this check was never compiled in
+ Path.Validate();
+#endif
+
+ delete[] MCurr;
+ delete[] MNext;
+ delete[] MPrev;
+ delete[] DRow;
+ delete[] ERow;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ delete[] TB[i];
+ delete[] TB;
+
+ return 0;
+ }
+#endif // DOUBLE_AFFINE
Added: trunk/packages/muscle/branches/upstream/current/nwrec.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nwrec.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nwrec.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,137 @@
+/***
+Needleman-Wunsch recursions
+
+Notation: i,j are prefix lengths so are in
+ranges i = [0,|A|] and j = [0,|B|].
+
+Profile positions are in ranges [0,|A|-1]
+and [0,|B|-1] so prefix length i corresponds
+to position (i-1) in the profile, and similarly
+for j.
+
+Terminal gap scoring
+--------------------
+Terminal gaps are scored as with open [close]
+penalties only at the left [right] terminal,
+as follows:
+
+ 0 i
+ | |
+ A XXXXX...
+ B ---XX...
+
+ i |A|-1
+ | |
+ A ...XXXXX
+ B ...XX---
+
+In these examples, open / close penalty at position
+i is included, but close / open penalty at |A|-1 /
+0 is not included.
+
+This is implemented by setting the open [close]
+penalty to zero in the first [last] position of
+each profile.
+
+Consider adding a column to a sub-alignment. After the
+column is added, there are i letters from A and j letters
+from B.
+
+The column starts a left-terminal gap if:
+ Delete with i=1, j=0 or
+ Insert with i=0, j=1.
+
+The column ends a left-terminal gap if:
+ Match following Delete with j=1, or
+ Match following Insert with i=1.
+
+The column starts a right-terminal gap if:
+ Delete following a Match and i=|A|, or
+ Insert following a Match and j=|B|.
+
+The column ends a right-terminal gap if:
+ Match with i=|A|, j=|B| following Delete or Insert.
+
+RECURSION RELATIONS
+===================
+
+ i-1
+ |
+DD A ..X X
+ B ..- -
+
+MD A ..X X
+ B ..X -
+
+D(i,j) = max
+ D(i-1,j) + e
+ M(i-1,j) + goA(i-1)
+Valid for:
+ i = [1,|A|-1]
+ j = [1,|B|]
+
+I(i,j) By symmetry with D(i,j).
+
+ i-2
+ | i-1
+ | |
+MM A ..X X
+ B ..X X
+
+DM A ..X X
+ B ..- X
+
+IM A ..- X
+ B ..X X
+ | |
+ | j-1
+ j-2
+
+M(i,j) = L(i-1,j-1) + max
+ M(i-1,j-1)
+ D(i-1,j-1) + gcA(i-2)
+ I(i-1,j-1) + gcB(j-2)
+Valid for:
+ i = [2,|A|]
+ j = [2,|B|]
+
+Equivalently:
+
+M(i+1,j+1) = L(i,j) + max
+ M(i,j)
+ D(i,j) + gcA(i-1)
+ I(i,j) + gcB(j-1)
+
+Valid for:
+ i = [1,|A|-1]
+ j = [1,|B|-1]
+
+Boundary conditions
+===================
+
+A XXXX
+B ----
+ D(0,0) = -infinity
+
+ D(i,0) = ie
+ i = [1,|A|]
+
+ D(0,j) = -infinity
+ j = [0,|B|]
+
+I(0,0), I(0,j) and I(i,0) by symmetry with D.
+
+ M(0,0) = 0
+ M(i,0) = -infinity, i > 0
+ M(0,j) = -infinity, j > 0
+
+A X
+B -
+ D(1,0) = e
+ D(1,j) = -infinity, j = [1,|B|]
+ (assuming no I-D allowed).
+
+ D(0,1) = -infinity
+ D(1,1) = -infinity
+ D(i,1) = max.
+***/
Added: trunk/packages/muscle/branches/upstream/current/nwsmall.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/nwsmall.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/nwsmall.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,660 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+// NW small memory
+
+#define TRACE 0
+
+#if TRACE
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPI;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBI;
+#endif
+
+#if TRACE
+#define ALLOC_TRACE() \
+ const SCORE UNINIT = MINUS_INFINITY; \
+ const size_t LM = uPrefixCountA*uPrefixCountB; \
+ \
+ SCORE *DPM_ = new SCORE[LM]; \
+ SCORE *DPD_ = new SCORE[LM]; \
+ SCORE *DPI_ = new SCORE[LM]; \
+ \
+ char *TBM_ = new char[LM]; \
+ char *TBD_ = new char[LM]; \
+ char *TBI_ = new char[LM]; \
+ \
+ memset(TBM_, '?', LM); \
+ memset(TBD_, '?', LM); \
+ memset(TBI_, '?', LM); \
+ \
+ for (unsigned i = 0; i <= uLengthA; ++i) \
+ for (unsigned j = 0; j <= uLengthB; ++j) \
+ { \
+ DPM(i, j) = UNINIT; \
+ DPD(i, j) = UNINIT; \
+ DPI(i, j) = UNINIT; \
+ }
+#else
+#define ALLOC_TRACE()
+#endif
+
+#if TRACE
+#define SetDPM(i, j, x) DPM(i, j) = x
+#define SetDPD(i, j, x) DPD(i, j) = x
+#define SetDPI(i, j, x) DPI(i, j) = x
+#define SetTBM(i, j, x) TBM(i, j) = x
+#define SetTBD(i, j, x) TBD(i, j) = x
+#define SetTBI(i, j, x) TBI(i, j) = x
+#else
+#define SetDPM(i, j, x) /* empty */
+#define SetDPD(i, j, x) /* empty */
+#define SetDPI(i, j, x) /* empty */
+#define SetTBM(i, j, x) /* empty */
+#define SetTBD(i, j, x) /* empty */
+#define SetTBI(i, j, x) /* empty */
+#endif
+
+#define RECURSE_D(i, j) \
+ { \
+ SCORE DD = DRow[j] + e; \
+ SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\
+ if (DD > MD) \
+ { \
+ DRow[j] = DD; \
+ SetTBD(i, j, 'D'); \
+ } \
+ else \
+ { \
+ DRow[j] = MD; \
+ /* SetBitTBD(TB, i, j, 'M'); */ \
+ TBRow[j] &= ~BIT_xD; \
+ TBRow[j] |= BIT_MD; \
+ SetTBD(i, j, 'M'); \
+ } \
+ SetDPD(i, j, DRow[j]); \
+ }
+
+#define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j)
+#define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB)
+
+#define RECURSE_I(i, j) \
+ { \
+ Iij += e; \
+ SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\
+ if (MI >= Iij) \
+ { \
+ Iij = MI; \
+ /* SetBitTBI(TB, i, j, 'M'); */ \
+ TBRow[j] &= ~BIT_xI; \
+ TBRow[j] |= BIT_MI; \
+ SetTBI(i, j, 'M'); \
+ } \
+ else \
+ SetTBI(i, j, 'I'); \
+ SetDPI(i, j, Iij); \
+ }
+
+#define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j)
+#define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB)
+
+#define RECURSE_M(i, j) \
+ { \
+ SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \
+ SCORE IM = Iij + PB[j-1].m_scoreGapClose; \
+ SCORE MM = MCurr[j]; \
+ TB[i+1][j+1] &= ~BIT_xM; \
+ if (MM >= DM && MM >= IM) \
+ { \
+ MNext[j+1] += MM; \
+ SetDPM(i+1, j+1, MNext[j+1]); \
+ SetTBM(i+1, j+1, 'M'); \
+ /* SetBitTBM(TB, i+1, j+1, 'M'); */ \
+ TB[i+1][j+1] |= BIT_MM; \
+ } \
+ else if (DM >= MM && DM >= IM) \
+ { \
+ MNext[j+1] += DM; \
+ SetDPM(i+1, j+1, MNext[j+1]); \
+ SetTBM(i+1, j+1, 'D'); \
+ /* SetBitTBM(TB, i+1, j+1, 'D'); */ \
+ TB[i+1][j+1] |= BIT_DM; \
+ } \
+ else \
+ { \
+ assert(IM >= MM && IM >= DM); \
+ MNext[j+1] += IM; \
+ SetDPM(i+1, j+1, MNext[j+1]); \
+ SetTBM(i+1, j+1, 'I'); \
+ /* SetBitTBM(TB, i+1, j+1, 'I'); */ \
+ TB[i+1][j+1] |= BIT_IM; \
+ } \
+ }
+
+#if TRACE
+static bool LocalEq(BASETYPE b1, BASETYPE b2)
+ {
+ if (b1 < -100000 && b2 < -100000)
+ return true;
+ double diff = fabs(b1 - b2);
+ if (diff < 0.0001)
+ return true;
+ double sum = fabs(b1) + fabs(b2);
+ return diff/sum < 0.005;
+ }
+
+static char Get_M_Char(char Bits)
+ {
+ switch (Bits & BIT_xM)
+ {
+ case BIT_MM:
+ return 'M';
+ case BIT_DM:
+ return 'D';
+ case BIT_IM:
+ return 'I';
+ }
+ Quit("Huh?");
+ return '?';
+ }
+
+static char Get_D_Char(char Bits)
+ {
+ return (Bits & BIT_xD) ? 'M' : 'D';
+ }
+
+static char Get_I_Char(char Bits)
+ {
+ return (Bits & BIT_xI) ? 'M' : 'I';
+ }
+
+static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ SCORE *DPM_ = g_DP;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ if (!LocalEq(DPM(i, j), DPD(i, j)))
+ {
+ Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Fast = %.2g\n",
+ c, i, j, DPM(i, j), DPD(i, j));
+ return false;
+ }
+ return true;
+ }
+
+static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBI_,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ SCORE *DPM_ = g_DPM;
+ bool Eq = true;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBM(i, j);
+ char c2 = Get_M_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000)
+ {
+ Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto D;
+ }
+ }
+
+D:
+ SCORE *DPD_ = g_DPD;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBD(i, j);
+ char c2 = Get_D_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000)
+ {
+ Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto I;
+ }
+ }
+I:
+ SCORE *DPI_ = g_DPI;
+ for (unsigned i = 0; i < uPrefixCountA; ++i)
+ for (unsigned j = 0; j < uPrefixCountB; ++j)
+ {
+ char c1 = TBI(i, j);
+ char c2 = Get_I_Char(TB[i][j]);
+ if (c1 != '?' && c1 != c2 && DPI(i, j) > -100000)
+ {
+ Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+ Eq = false;
+ goto Done;
+ }
+ }
+Done:
+ if (Eq)
+ Log("TB success\n");
+ return Eq;
+ }
+
+static const char *LocalScoreToStr(SCORE s)
+ {
+ static char str[16];
+ if (s < -100000)
+ return " *";
+ sprintf(str, "%6.1f", s);
+ return str;
+ }
+
+static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ Log(" ");
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = ' ';
+ if (uPrefixLengthB > 0)
+ c = ConsensusChar(PB[uPrefixLengthB - 1]);
+ Log(" %4u:%c", uPrefixLengthB, c);
+ }
+ Log("\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
+ Log("\n");
+ }
+ }
+
+static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB,
+ unsigned uPrefixCountA, unsigned uPrefixCountB)
+ {
+ Log(" ");
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = ' ';
+ if (uPrefixLengthB > 0)
+ c = ConsensusChar(PB[uPrefixLengthB - 1]);
+ Log(" %4u:%c", uPrefixLengthB, c);
+ }
+ Log("\n");
+ Log("Bit TBM:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBD:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log("Bit TBI:\n");
+ for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+ {
+ char c = ' ';
+ if (uPrefixLengthA > 0)
+ c = ConsensusChar(PA[uPrefixLengthA - 1]);
+ Log("%4u:%c ", uPrefixLengthA, c);
+ for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+ {
+ char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+ Log(" %6c", c);
+ }
+ Log("\n");
+ }
+ }
+
+// Trace helper: logs a traceback matrix as a table, one row per prefix
+// of A and one column per prefix of B. Cells are read through the TBM()
+// macro, which is why the parameter names TBM_ and uPrefixCountB must
+// stay as they are.
+static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+    // Column header.
+    Log(" ");
+    for (unsigned j = 0; j < uPrefixCountB; ++j)
+        {
+        char cB = ' ';
+        if (j > 0)
+            cB = ConsensusChar(PB[j - 1]);
+        Log(" %4u:%c", j, cB);
+        }
+    Log("\n");
+
+    // One labelled row per prefix of A.
+    for (unsigned i = 0; i < uPrefixCountA; ++i)
+        {
+        char cA = ' ';
+        if (i > 0)
+            cA = ConsensusChar(PA[i - 1]);
+        Log("%4u:%c ", i, cA);
+
+        for (unsigned j = 0; j < uPrefixCountB; ++j)
+            {
+            const char cCell = TBM(i, j);
+            Log(" %6c", cCell);
+            }
+        Log("\n");
+        }
+    }
+
+// Formats the M, D and I traceback choices packed into Bits as a short
+// string such as "xM xD xI" (x being the character reported by
+// Get_M_Char / Get_D_Char / Get_I_Char).
+//
+// Returns a pointer to a static buffer: the result is overwritten by the
+// next call and must not be freed.
+static const char *BitsToStr(char Bits)
+    {
+    // 8 visible characters plus the terminating NUL.
+    static char Str[9];
+
+    sprintf(Str, "%cM %cD %cI",
+      Get_M_Char(Bits),
+      Get_D_Char(Bits),
+      Get_I_Char(Bits));
+
+    // The original fell off the end of this non-void function, so
+    // callers received an indeterminate pointer.
+    return Str;
+    }
+#endif // TRACE
+
+// Records in TB[i][j] which state ('M', 'D' or 'I') precedes the M state
+// of that cell. Only the bits covered by BIT_xM are replaced; any other
+// character is reported through Quit().
+static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c)
+    {
+    char Bit;
+    if ('M' == c)
+        Bit = BIT_MM;
+    else if ('D' == c)
+        Bit = BIT_DM;
+    else if ('I' == c)
+        Bit = BIT_IM;
+    else
+        Quit("Huh?!");
+
+    char &Cell = TB[i][j];
+    Cell &= ~BIT_xM;
+    Cell |= Bit;
+    }
+
+// Records in TB[i][j] which state ('M' or 'D') precedes the D state of
+// that cell. Only the bits covered by BIT_xD are replaced; any other
+// character is reported through Quit().
+static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c)
+    {
+    char Bit;
+    if ('M' == c)
+        Bit = BIT_MD;
+    else if ('D' == c)
+        Bit = BIT_DD;
+    else
+        Quit("Huh?!");
+
+    char &Cell = TB[i][j];
+    Cell &= ~BIT_xD;
+    Cell |= Bit;
+    }
+
+// Records in TB[i][j] which state ('M' or 'I') precedes the I state of
+// that cell. Only the bits covered by BIT_xI are replaced; any other
+// character is reported through Quit().
+static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c)
+    {
+    char Bit;
+    if ('M' == c)
+        Bit = BIT_MI;
+    else if ('I' == c)
+        Bit = BIT_II;
+    else
+        Quit("Huh?!");
+
+    char &Cell = TB[i][j];
+    Cell &= ~BIT_xI;
+    Cell |= Bit;
+    }
+
+// LogMatrices(): trace-only dump of the DP state.
+// When TRACE is non-zero it logs the three score matrices (DPM_, DPD_,
+// DPI_) and the bit-packed traceback TB, then compares them against the
+// reference matrices g_DPM / g_DPD / g_DPI and g_TBM / g_TBD / g_TBI,
+// logging a "success" line for each score matrix that matches.
+// The macro expands in the caller's scope and relies on the caller's
+// locals PA, PB, TB, uPrefixCountA and uPrefixCountB.
+// When TRACE is zero it expands to nothing.
+#if TRACE
+#define LogMatrices() \
+ { \
+ Log("Bit DPM:\n"); \
+ LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPD:\n"); \
+ LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit DPI:\n"); \
+ LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \
+ Log("Bit TB:\n"); \
+ LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \
+ bool Same; \
+ Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPM success\n"); \
+ Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPD success\n"); \
+ Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\
+ if (Same) \
+ Log("DPI success\n"); \
+ CompareTB(TB, g_TBM, g_TBD, g_TBI, uPrefixCountA, uPrefixCountB);\
+ }
+#else
+#define LogMatrices() /* empty */
+#endif
+
+static unsigned uCachePrefixCountB;
+static unsigned uCachePrefixCountA;
+static SCORE *CacheMCurr;
+static SCORE *CacheMNext;
+static SCORE *CacheMPrev;
+static SCORE *CacheDRow;
+static char **CacheTB;
+
+// Makes sure the file-level DP buffers (four score rows and the
+// traceback matrix) can hold uPrefixCountA x uPrefixCountB cells.
+// If either dimension is too small, everything is freed and
+// re-allocated with 1024 entries of slack per dimension, so previous
+// buffer contents are not preserved.
+static void AllocCache(unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+    const bool bRowsFit = (uPrefixCountA <= uCachePrefixCountA);
+    const bool bColsFit = (uPrefixCountB <= uCachePrefixCountB);
+    if (bRowsFit && bColsFit)
+        return;
+
+    // Drop the old buffers. On the first call the pointers are null and
+    // the cached row count is zero, so this is a no-op.
+    delete[] CacheMCurr;
+    delete[] CacheMNext;
+    delete[] CacheMPrev;
+    delete[] CacheDRow;
+    for (unsigned uRow = 0; uRow < uCachePrefixCountA; ++uRow)
+        delete[] CacheTB[uRow];
+    delete[] CacheTB;
+
+    // Over-allocate so that slightly longer inputs reuse the cache.
+    const unsigned uSlack = 1024;
+    uCachePrefixCountA = uPrefixCountA + uSlack;
+    uCachePrefixCountB = uPrefixCountB + uSlack;
+
+    CacheMCurr = new SCORE[uCachePrefixCountB];
+    CacheMNext = new SCORE[uCachePrefixCountB];
+    CacheMPrev = new SCORE[uCachePrefixCountB];
+    CacheDRow = new SCORE[uCachePrefixCountB];
+
+    CacheTB = new char *[uCachePrefixCountA];
+    for (unsigned uRow = 0; uRow < uCachePrefixCountA; ++uRow)
+        CacheTB[uRow] = new char [uCachePrefixCountB];
+    }
+
+// Aligns profile PA (uLengthA positions) to profile PB (uLengthB
+// positions) by dynamic programming over three states (M, D, I). The
+// choice made at each cell is stored in a bit-packed traceback matrix,
+// and the resulting path is written to Path by BitTraceBack().
+//
+// Both lengths must be non-zero, otherwise Quit() is called.
+// The score rows and the traceback matrix come from the file-level cache
+// (see AllocCache), so this function is not re-entrant.
+//
+// Returns 0 in all cases. The best terminal score is computed (local
+// Score) but deliberately left unreturned here, because callers may
+// rely on the existing return value.
+//
+// The SetDPx / SetTBx / RECURSE_* / ALLOC_TRACE macros are defined
+// earlier in the file and use the local variable names below, so those
+// names must not be changed.
+SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+    {
+    if (0 == uLengthB || 0 == uLengthA )
+        Quit("Internal error, NWSmall: length=0");
+
+    SetTermGaps(PA, uLengthA);
+    SetTermGaps(PB, uLengthB);
+
+    const unsigned uPrefixCountA = uLengthA + 1;
+    const unsigned uPrefixCountB = uLengthB + 1;
+    const SCORE e = g_scoreGapExtend;
+
+    ALLOC_TRACE()
+
+    AllocCache(uPrefixCountA, uPrefixCountB);
+
+    SCORE *MCurr = CacheMCurr;
+    SCORE *MNext = CacheMNext;
+    SCORE *MPrev = CacheMPrev;
+    SCORE *DRow = CacheDRow;
+
+    // The cache is reused between calls: clear the part of the traceback
+    // matrix that this alignment will touch.
+    char **TB = CacheTB;
+    for (unsigned i = 0; i < uPrefixCountA; ++i)
+        memset(TB[i], 0, uPrefixCountB);
+
+    // Row 0, I state: a gap opened at B[0] and extended along the row.
+    SCORE Iij = MINUS_INFINITY;
+    SetDPI(0, 0, Iij);
+
+    Iij = PB[0].m_scoreGapOpen;
+    SetDPI(0, 1, Iij);
+
+    for (unsigned j = 2; j <= uLengthB; ++j)
+        {
+        Iij += e;
+        SetDPI(0, j, Iij);
+        SetTBI(0, j, 'I');
+        }
+
+    // Row 0, D state: unreachable.
+    for (unsigned j = 0; j <= uLengthB; ++j)
+        {
+        DRow[j] = MINUS_INFINITY;
+        SetDPD(0, j, DRow[j]);
+        SetTBD(0, j, 'D');
+        }
+
+    // Row 0, M state: only the origin is reachable.
+    MPrev[0] = 0;
+    SetDPM(0, 0, MPrev[0]);
+    for (unsigned j = 1; j <= uLengthB; ++j)
+        {
+        MPrev[j] = MINUS_INFINITY;
+        SetDPM(0, j, MPrev[j]);
+        }
+
+    // Row 1, M state.
+    MCurr[0] = MINUS_INFINITY;
+    SetDPM(1, 0, MCurr[0]);
+
+    MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+    SetDPM(1, 1, MCurr[1]);
+    SetBitTBM(TB, 1, 1, 'M');
+    SetTBM(1, 1, 'M');
+
+    for (unsigned j = 2; j <= uLengthB; ++j)
+        {
+        MCurr[j] = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen +
+          (j - 2)*e + PB[j-2].m_scoreGapClose;
+        SetDPM(1, j, MCurr[j]);
+        SetBitTBM(TB, 1, j, 'I');
+        SetTBM(1, j, 'I');
+        }
+
+// Main DP loop
+    for (unsigned i = 1; i < uLengthA; ++i)
+        {
+        char *TBRow = TB[i];
+
+        Iij = MINUS_INFINITY;
+        SetDPI(i, 0, Iij);
+
+        DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e;
+        SetDPD(i, 0, DRow[0]);
+
+        MCurr[0] = MINUS_INFINITY;
+        if (i == 1)
+            {
+            MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+            SetBitTBM(TB, i, 1, 'M');
+            SetTBM(i, 1, 'M');
+            }
+        else
+            {
+            MCurr[1] = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen +
+              (i - 2)*e + PA[i-2].m_scoreGapClose;
+            SetBitTBM(TB, i, 1, 'D');
+            SetTBM(i, 1, 'D');
+            }
+        SetDPM(i, 0, MCurr[0]);
+        SetDPM(i, 1, MCurr[1]);
+
+        // Seed the next row with the match scores; the RECURSE_M step
+        // below works on these values.
+        for (unsigned j = 1; j < uLengthB; ++j)
+            MNext[j+1] = ScoreProfPos2(PA[i], PB[j]);
+
+        for (unsigned j = 1; j < uLengthB; ++j)
+            {
+            RECURSE_D(i, j)
+            RECURSE_I(i, j)
+            RECURSE_M(i, j)
+            }
+        // Special case for j=uLengthB
+        RECURSE_D_BTerm(i)
+        RECURSE_I_BTerm(i)
+
+        // Prev := Curr, Curr := Next, Next := Prev
+        Rotate(MPrev, MCurr, MNext);
+        }
+
+// Special case for i=uLengthA
+    char *TBRow = TB[uLengthA];
+    MCurr[0] = MINUS_INFINITY;
+    // When uLengthA == 1 the last row is row 1, whose first match cell
+    // was already set above with an 'M' traceback. The expression below
+    // would then compute (uLengthA - 2) with unsigned wrap-around and
+    // read PA[uLengthA-2] out of bounds, so it is skipped in that case.
+    if (uLengthA > 1)
+        {
+        MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e +
+          PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose;
+        SetBitTBM(TB, uLengthA, 1, 'D');
+        SetTBM(uLengthA, 1, 'D');
+        }
+    SetDPM(uLengthA, 0, MCurr[0]);
+    SetDPM(uLengthA, 1, MCurr[1]);
+
+    DRow[0] = MINUS_INFINITY;
+    SetDPD(uLengthA, 0, DRow[0]);
+    for (unsigned j = 1; j <= uLengthB; ++j)
+        RECURSE_D_ATerm(j);
+
+    Iij = MINUS_INFINITY;
+    for (unsigned j = 1; j <= uLengthB; ++j)
+        RECURSE_I_ATerm(j)
+
+    LogMatrices();
+
+    // Pick the best of the three states at the bottom-right cell; ties
+    // keep the earlier candidate (M before D before I).
+    SCORE MAB = MCurr[uLengthB];
+    SCORE DAB = DRow[uLengthB];
+    SCORE IAB = Iij;
+
+    SCORE Score = MAB;
+    char cEdgeType = 'M';
+    if (DAB > Score)
+        {
+        Score = DAB;
+        cEdgeType = 'D';
+        }
+    if (IAB > Score)
+        {
+        Score = IAB;
+        cEdgeType = 'I';
+        }
+
+#if TRACE
+    Log(" Fast: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n",
+      MAB, DAB, IAB, cEdgeType);
+#endif
+
+    BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path);
+
+    // Was "#if DBEUG", a typo that made this check impossible to enable.
+#if DEBUG
+    Path.Validate();
+#endif
+
+    return 0;
+    }
Added: trunk/packages/muscle/branches/upstream/current/objscore.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/objscore.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/objscore.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,113 @@
+#include "muscle.h"
+#include "msa.h"
+#include "objscore.h"
+#include "profile.h"
+#include "timing.h"
+
+#if TIMING
+TICKS g_ticksObjScore = 0;
+#endif
+
+// Computes the objective score selected by g_ObjScore for an alignment
+// split into two groups of sequences.
+//
+// OBJSCORE_SPM is resolved first: XP for alignments of at most 100
+// sequences, SPF otherwise. The DP and XP scores are computed on the two
+// sub-alignments extracted with SeqIndexes1 / SeqIndexes2; the SP, SPF
+// and PS scores are computed on the whole alignment and ignore the
+// index arrays. Any other value is reported through Quit().
+//
+// Side effect: sequence weights are (re)computed, for SP/SPF/PS on msa
+// itself despite the const qualifier.
+SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
+  unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2)
+    {
+#if TIMING
+    TICKS t1 = GetClockTicks();
+#endif
+    const unsigned uSeqCount = msa.GetSeqCount();
+
+    OBJSCORE OS = g_ObjScore;
+    if (OBJSCORE_SPM == g_ObjScore)
+        OS = (uSeqCount <= 100) ? OBJSCORE_XP : OBJSCORE_SPF;
+
+    MSA msa1;
+    MSA msa2;
+
+    SCORE Score = 0;
+    switch (OS)
+        {
+    case OBJSCORE_DP:
+    case OBJSCORE_XP:
+        MSAFromSeqSubset(msa, SeqIndexes1, uSeqCount1, msa1);
+        MSAFromSeqSubset(msa, SeqIndexes2, uSeqCount2, msa2);
+
+        SetMSAWeightsMuscle(msa1);
+        SetMSAWeightsMuscle(msa2);
+
+        if (OBJSCORE_DP == OS)
+            Score = ObjScoreDP(msa1, msa2);
+        else
+            Score = ObjScoreXP(msa1, msa2);
+        break;
+
+    case OBJSCORE_SP:
+    case OBJSCORE_SPF:
+    case OBJSCORE_PS:
+        // Yuck -- casting away const (design flaw)
+        SetMSAWeightsMuscle(const_cast<MSA &>(msa));
+
+        if (OBJSCORE_SP == OS)
+            Score = ObjScoreSP(msa);
+        else if (OBJSCORE_PS == OS)
+            Score = ObjScorePS(msa);
+        else
+            Score = ObjScoreSPDimer(msa);
+        break;
+
+    default:
+        Quit("Invalid g_ObjScore=%d", g_ObjScore);
+        }
+#if TIMING
+    TICKS t2 = GetClockTicks();
+    g_ticksObjScore += (t2 - t1);
+#endif
+    return Score;
+    }
+
+// Same as ObjScore(), but the two groups are given as sequence ids,
+// which are translated to sequence indexes with msa.GetSeqIndex().
+// In DOUBLE_AFFINE builds the groups are ignored and the double-affine
+// score of the whole alignment is returned instead.
+SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[],
+  unsigned uCount1, const unsigned Ids2[], unsigned uCount2)
+    {
+#if TIMING
+    TICKS t1 = GetClockTicks();
+#endif
+    unsigned *SeqIndexes1 = new unsigned[uCount1];
+    unsigned *SeqIndexes2 = new unsigned[uCount2];
+
+    for (unsigned n = 0; n < uCount1; ++n)
+        SeqIndexes1[n] = msa.GetSeqIndex(Ids1[n]);
+
+    for (unsigned n = 0; n < uCount2; ++n)
+        SeqIndexes2[n] = msa.GetSeqIndex(Ids2[n]);
+
+#if DOUBLE_AFFINE
+    extern SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps);
+    SCORE Letters, Gaps;
+    SCORE dObjScore = ObjScoreDA(msa, &Letters, &Gaps);
+#else
+    SCORE dObjScore = ObjScore(msa, SeqIndexes1, uCount1, SeqIndexes2, uCount2);
+#endif
+
+    // Free the index arrays in every configuration. The original only
+    // did so in the DOUBLE_AFFINE branch and leaked both arrays on each
+    // call in the default build.
+    delete[] SeqIndexes1;
+    delete[] SeqIndexes2;
+
+#if TIMING
+    TICKS t2 = GetClockTicks();
+    g_ticksObjScore += (t2 - t1);
+#endif
+    return dObjScore;
+    }
Added: trunk/packages/muscle/branches/upstream/current/objscore.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/objscore.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/objscore.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,30 @@
+#ifndef ObjScore_h
+#define ObjScore_h
+
+// Objective-score functions for multiple alignments.
+// This header expects SCORE, MSA, ProfPos and PWPath to be declared
+// before it is included.
+
+// Gap score of the pairwise alignment induced between one sequence of
+// msa1 and one sequence of msa2 (both must have the same column count).
+SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
+ const MSA &msa2, unsigned uSeqIndex2);
+// Substitution (letter) score of the same induced pairwise alignment.
+SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
+ const MSA &msa2, unsigned uSeqIndex2);
+SCORE ScoreGaps(const MSA &msa, const unsigned Cols[], unsigned ColCount);
+
+// Objective score (kind selected by g_ObjScore) for an alignment split
+// into two groups given as sequence indexes.
+SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
+ unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2);
+
+// Same as ObjScore, with the groups given as sequence ids.
+SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[],
+ unsigned uCount1, const unsigned Ids2[], unsigned uCount2);
+
+void GetLetterScores(const MSA &msa, SCORE LetterScores[]);
+
+// MatchScore, when non-null, receives one value per alignment column.
+SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[] = 0);
+SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE ObjScoreXP(const MSA &msa, const MSA &msa2);
+SCORE ObjScoreSPDimer(const MSA &msa);
+// NOTE: takes ownership of PA and PB -- both arrays are freed with
+// delete[] before the function returns.
+SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
+ SCORE MatchScore[] = 0);
+
+SCORE DiffObjScore(
+ const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1,
+ const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2);
+
+#endif // ObjScore_h
Added: trunk/packages/muscle/branches/upstream/current/objscore2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/objscore2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/objscore2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,522 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+#include "objscore.h"
+
+#define TRACE 0
+#define TRACE_SEQPAIR 0
+#define TEST_SPFAST 0
+
+extern SCOREMATRIX VTML_LA;
+extern SCOREMATRIX PAM200;
+extern SCOREMATRIX PAM200NoCenter;
+extern SCOREMATRIX VTML_SP;
+extern SCOREMATRIX VTML_SPNoCenter;
+extern SCOREMATRIX NUC_SP;
+
+SCORE g_SPScoreLetters;
+SCORE g_SPScoreGaps;
+
+// Gap-open score to charge for a terminal gap, according to the global
+// g_TermGaps policy:
+//   TERMGAPS_Full  -> 0
+//   TERMGAPS_Half  -> g_scoreGapOpen/2 when Gap is true, else 0
+//   TERMGAPS_Ext   -> g_scoreGapExtend when Gap is true, else 0
+// Any other policy value is reported through Quit().
+static SCORE TermGapScore(bool Gap)
+    {
+    if (TERMGAPS_Full == g_TermGaps)
+        return 0;
+
+    if (TERMGAPS_Half == g_TermGaps)
+        {
+        if (!Gap)
+            return 0;
+        return g_scoreGapOpen/2;
+        }
+
+    if (TERMGAPS_Ext == g_TermGaps)
+        {
+        if (!Gap)
+            return 0;
+        return g_scoreGapExtend;
+        }
+
+    Quit("TermGapScore?!");
+    return 0;
+    }
+
+// Sums the substitution-matrix scores (*g_ptrScoreMatrix) over the
+// columns in which both the sequence uSeqIndex1 of msa1 and the sequence
+// uSeqIndex2 of msa2 carry a letter code below g_AlphaSize. Columns
+// where either side reports a code >= g_AlphaSize are skipped.
+//
+// Both alignments must have the same number of columns, otherwise
+// Quit() is called. An alignment with zero columns scores 0.
+SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
+  const MSA &msa2, unsigned uSeqIndex2)
+    {
+    const unsigned uColCount = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+    if (uColCount != uColCount2)
+        Quit("ScoreSeqPairLetters, different lengths");
+
+    // With no columns, "uColCount - 1" below would wrap around to
+    // UINT_MAX and the scoring loop would run out of bounds.
+    if (0 == uColCount)
+        return 0;
+
+#if TRACE_SEQPAIR
+    {
+    Log("\n");
+    Log("ScoreSeqPairLetters\n");
+    MSA msaTmp;
+    msaTmp.SetSize(2, uColCount);
+    msaTmp.CopySeq(0, msa1, uSeqIndex1);
+    msaTmp.CopySeq(1, msa2, uSeqIndex2);
+    msaTmp.LogMe();
+    }
+#endif
+
+    SCORE scoreLetters = 0;
+
+    // First column that is not a gap in both sequences.
+    unsigned uColStart = 0;
+    bool bLeftTermGap = false;
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+        if (!bGap1 || !bGap2)
+            {
+            if (bGap1 || bGap2)
+                bLeftTermGap = true;
+            uColStart = uColIndex;
+            break;
+            }
+        }
+
+    // Last column that is not a gap in both sequences.
+    unsigned uColEnd = uColCount - 1;
+    bool bRightTermGap = false;
+    for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
+        if (!bGap1 || !bGap2)
+            {
+            if (bGap1 || bGap2)
+                bRightTermGap = true;
+            uColEnd = (unsigned) iColIndex;
+            break;
+            }
+        }
+
+#if TRACE_SEQPAIR
+    Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
+#endif
+
+    for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
+        {
+        unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);
+        if (uLetter1 >= g_AlphaSize)
+            continue;
+        unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);
+        if (uLetter2 >= g_AlphaSize)
+            continue;
+
+        SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
+        scoreLetters += scoreMatch;
+        }
+    return scoreLetters;
+    }
+
+// Gap score of the pairwise alignment induced between the sequence
+// uSeqIndex1 of msa1 and the sequence uSeqIndex2 of msa2.
+//
+// Columns where both sequences have a gap are ignored. A run of gaps in
+// one sequence costs g_scoreGapOpen for its first column and
+// g_scoreGapExtend for each further column. A gap starting at the first
+// scored column is charged TermGapScore(true) instead of g_scoreGapOpen,
+// and the same substitution is made for a gap still open at the end.
+//
+// Both alignments must have the same number of columns, otherwise
+// Quit() is called. An alignment with zero columns scores 0.
+SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
+  const MSA &msa2, unsigned uSeqIndex2)
+    {
+    const unsigned uColCount = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+    if (uColCount != uColCount2)
+        Quit("ScoreSeqPairGaps, different lengths");
+
+    // With no columns, "uColCount - 1" below would wrap around to
+    // UINT_MAX and the scoring loop would run out of bounds.
+    if (0 == uColCount)
+        return 0;
+
+#if TRACE_SEQPAIR
+    {
+    Log("\n");
+    Log("ScoreSeqPairGaps\n");
+    MSA msaTmp;
+    msaTmp.SetSize(2, uColCount);
+    msaTmp.CopySeq(0, msa1, uSeqIndex1);
+    msaTmp.CopySeq(1, msa2, uSeqIndex2);
+    msaTmp.LogMe();
+    }
+#endif
+
+    SCORE scoreGaps = 0;
+    bool bGapping1 = false;
+    bool bGapping2 = false;
+
+    // First column that is not a gap in both sequences.
+    unsigned uColStart = 0;
+    bool bLeftTermGap = false;
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+        if (!bGap1 || !bGap2)
+            {
+            if (bGap1 || bGap2)
+                bLeftTermGap = true;
+            uColStart = uColIndex;
+            break;
+            }
+        }
+
+    // Last column that is not a gap in both sequences.
+    unsigned uColEnd = uColCount - 1;
+    bool bRightTermGap = false;
+    for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
+        if (!bGap1 || !bGap2)
+            {
+            if (bGap1 || bGap2)
+                bRightTermGap = true;
+            uColEnd = (unsigned) iColIndex;
+            break;
+            }
+        }
+
+#if TRACE_SEQPAIR
+    Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
+#endif
+
+    for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+
+        if (bGap1 && bGap2)
+            continue;
+
+        if (bGap1)
+            {
+            if (!bGapping1)
+                {
+#if TRACE_SEQPAIR
+                Log("Gap open seq 1 col %d\n", uColIndex);
+#endif
+                if (uColIndex == uColStart)
+                    scoreGaps += TermGapScore(true);
+                else
+                    scoreGaps += g_scoreGapOpen;
+                bGapping1 = true;
+                }
+            else
+                scoreGaps += g_scoreGapExtend;
+            continue;
+            }
+
+        else if (bGap2)
+            {
+            if (!bGapping2)
+                {
+#if TRACE_SEQPAIR
+                Log("Gap open seq 2 col %d\n", uColIndex);
+#endif
+                if (uColIndex == uColStart)
+                    scoreGaps += TermGapScore(true);
+                else
+                    scoreGaps += g_scoreGapOpen;
+                bGapping2 = true;
+                }
+            else
+                scoreGaps += g_scoreGapExtend;
+            continue;
+            }
+
+        // Both sequences have a letter here: any open gap is closed.
+        bGapping1 = false;
+        bGapping2 = false;
+        }
+
+    // A gap still open at the last scored column is terminal: replace its
+    // open penalty by the terminal-gap score.
+    // NOTE(review): if that gap also started at uColStart it was already
+    // charged TermGapScore(true) rather than g_scoreGapOpen, so this
+    // adjustment looks off for that case -- confirm intended scoring.
+    if (bGapping1 || bGapping2)
+        {
+        scoreGaps -= g_scoreGapOpen;
+        scoreGaps += TermGapScore(true);
+        }
+    return scoreGaps;
+    }
+
+// Sum-of-pairs objective score: the weighted sum, over every unordered
+// pair of sequences in msa, of the letter score plus the gap score of
+// the pairwise alignment induced by msa. The weight of a pair is the
+// product of the two sequence weights.
+//
+// Side effects: g_SPScoreLetters and g_SPScoreGaps receive the weighted
+// letter and gap totals. If MatchScore is non-null its first
+// GetColCount() entries are set to zero (no per-column score is
+// computed here).
+SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])
+    {
+#if TRACE
+    Log("==================ObjScoreSP==============\n");
+    Log("msa=\n");
+    msa.LogMe();
+#endif
+    g_SPScoreLetters = 0;
+    g_SPScoreGaps = 0;
+
+    if (0 != MatchScore)
+        {
+        const unsigned uColCount = msa.GetColCount();
+        for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+            MatchScore[uCol] = 0;
+        }
+
+    const unsigned uSeqCount = msa.GetSeqCount();
+    SCORE scoreTotal = 0;
+    unsigned uPairCount = 0;
+#if TRACE
+    Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n");
+    Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n");
+#endif
+    for (unsigned uSeqA = 0; uSeqA < uSeqCount; ++uSeqA)
+        {
+        const WEIGHT wA = msa.GetSeqWeight(uSeqA);
+        for (unsigned uSeqB = uSeqA + 1; uSeqB < uSeqCount; ++uSeqB)
+            {
+            const WEIGHT wB = msa.GetSeqWeight(uSeqB);
+            const WEIGHT wPair = wA*wB;
+
+            SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqA, msa, uSeqB);
+            SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqA, msa, uSeqB);
+            SCORE scorePair = scoreLetters + scoreGaps;
+            ++uPairCount;
+
+            scoreTotal += wPair*scorePair;
+
+            g_SPScoreLetters += wPair*scoreLetters;
+            g_SPScoreGaps += wPair*scoreGaps;
+#if TRACE
+            Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n",
+              uSeqA,
+              uSeqB,
+              wA,
+              wB,
+              scoreLetters,
+              scoreGaps,
+              scorePair,
+              scorePair*wA*wB,
+              scoreTotal,
+              msa.GetSeqName(uSeqA),
+              msa.GetSeqName(uSeqB));
+#endif
+            }
+        }
+#if TEST_SPFAST
+    {
+    SCORE f = ObjScoreSPFast(msa);
+    Log("Fast = %.6g\n", f);
+    Log("Brute = %.6g\n", scoreTotal);
+    if (BTEq(f, scoreTotal))
+        Log("Agree\n");
+    else
+        Log("** DISAGREE **\n");
+    }
+#endif
+    // The total is returned as is, not averaged over uPairCount.
+    return scoreTotal;
+    }
+
+// Objective score defined as the profile-profile score that dynamic
+// programming optimizes. The two alignments must have the same number
+// of columns (Quit() otherwise). A profile is built from each alignment
+// and handed to ObjScoreDP_Profs, which also releases both profiles.
+SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])
+    {
+    const unsigned uColCount1 = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+    if (uColCount2 != uColCount1)
+        Quit("ObjScoreDP, must be same length");
+
+    const ProfPos *ProfA = ProfileFromMSA(msa1);
+    const ProfPos *ProfB = ProfileFromMSA(msa2);
+
+    return ObjScoreDP_Profs(ProfA, ProfB, uColCount1, MatchScore);
+    }
+
+// Scores two profiles of equal length column by column and returns the
+// total of match and gap scores.
+//
+// For each column:
+//  - both profiles all-gap: contributes nothing;
+//  - exactly one profile all-gap: contributes the other profile's
+//    gap-close score if the run of all-gap columns ends here, plus its
+//    gap-open score if the run starts here (columns strictly inside a
+//    run contribute nothing);
+//  - otherwise: contributes ScoreProfPos2 of the two positions.
+//
+// If MatchScore is non-null, MatchScore[col] receives the match score of
+// the column (0 for gapped columns).
+//
+// NOTE: PA and PB are freed with delete[] before returning, so the
+// caller must pass heap arrays it no longer needs.
+SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
+ SCORE MatchScore[])
+ {
+//#if TRACE
+// Log("Profile 1:\n");
+// ListProfile(PA, uColCount, &msa1);
+//
+// Log("Profile 2:\n");
+// ListProfile(PB, uColCount, &msa2);
+//#endif
+
+ SCORE scoreTotal = 0;
+
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+ {
+ const ProfPos &PPA = PA[uColIndex];
+ const ProfPos &PPB = PB[uColIndex];
+
+ SCORE scoreGap = 0;
+ SCORE scoreMatch = 0;
+ // If gapped column...
+ if (PPA.m_bAllGaps && PPB.m_bAllGaps)
+ scoreGap = 0;
+ else if (PPA.m_bAllGaps)
+ {
+ // Last column of a run of all-gap columns in A: close the gap.
+ if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)
+ scoreGap = PPB.m_scoreGapClose;
+ // First column of the run: open the gap.
+ if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)
+ scoreGap += PPB.m_scoreGapOpen;
+ //if (0 == scoreGap)
+ // scoreGap = PPB.m_scoreGapExtend;
+ }
+ else if (PPB.m_bAllGaps)
+ {
+ // Same as above with the roles of A and B swapped.
+ if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)
+ scoreGap = PPA.m_scoreGapClose;
+ if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)
+ scoreGap += PPA.m_scoreGapOpen;
+ //if (0 == scoreGap)
+ // scoreGap = PPA.m_scoreGapExtend;
+ }
+ else
+ scoreMatch = ScoreProfPos2(PPA, PPB);
+
+ if (0 != MatchScore)
+ MatchScore[uColIndex] = scoreMatch;
+
+ scoreTotal += scoreMatch + scoreGap;
+
+ // Optional per-column trace, driven by globals defined elsewhere.
+ extern bool g_bTracePPScore;
+ extern MSA *g_ptrPPScoreMSA1;
+ extern MSA *g_ptrPPScoreMSA2;
+ if (g_bTracePPScore)
+ {
+ const MSA &msa1 = *g_ptrPPScoreMSA1;
+ const MSA &msa2 = *g_ptrPPScoreMSA2;
+ const unsigned uSeqCount1 = msa1.GetSeqCount();
+ const unsigned uSeqCount2 = msa2.GetSeqCount();
+
+ for (unsigned n = 0; n < uSeqCount1; ++n)
+ Log("%c", msa1.GetChar(n, uColIndex));
+ Log(" ");
+ for (unsigned n = 0; n < uSeqCount2; ++n)
+ Log("%c", msa2.GetChar(n, uColIndex));
+ Log(" %10.3f", scoreMatch);
+ if (scoreGap != 0)
+ Log(" %10.3f", scoreGap);
+ Log("\n");
+ }
+ }
+
+ // Ownership of both profiles was transferred to this function.
+ delete[] PA;
+ delete[] PB;
+
+ return scoreTotal;
+ }
+
+// Profile-sequence objective score: every sequence of the alignment is
+// scored against the profile built from the whole alignment (so each
+// sequence is also scored against itself), and the per-sequence scores
+// are summed with the sequence weights.
+//
+// Per column, a gap contributes the profile's gap-open score when it
+// starts a run of gaps and the gap-close score when it ends one;
+// wildcards contribute nothing; any other letter contributes the
+// profile's m_AAScores entry for that letter.
+//
+// Requires g_PPScore == PPSCORE_LE (Quit() otherwise). If MatchScore is
+// non-null it receives, per column, the weighted sum of letter scores.
+SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
+    {
+    if (g_PPScore != PPSCORE_LE)
+        Quit("FastScoreMSA_LASimple: LA");
+
+    const unsigned uSeqCount = msa.GetSeqCount();
+    const unsigned uColCount = msa.GetColCount();
+
+    const ProfPos *Prof = ProfileFromMSA(msa);
+
+    const bool bWantColumns = (0 != MatchScore);
+    if (bWantColumns)
+        for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+            MatchScore[uCol] = 0;
+
+    SCORE scoreTotal = 0;
+    for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+        {
+        const WEIGHT weightSeq = msa.GetSeqWeight(uSeq);
+        SCORE scoreSeq = 0;
+        for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+            {
+            const ProfPos &PP = Prof[uCol];
+            if (msa.IsGap(uSeq, uCol))
+                {
+                // A run of gaps opens at the first column or after a
+                // letter, and closes at the last column or before one.
+                const bool bOpen = (0 == uCol) || !msa.IsGap(uSeq, uCol - 1);
+                const bool bClose = (uCol + 1 == uColCount) ||
+                  !msa.IsGap(uSeq, uCol + 1);
+
+                if (bOpen)
+                    scoreSeq += PP.m_scoreGapOpen;
+                if (bClose)
+                    scoreSeq += PP.m_scoreGapClose;
+                }
+            else if (!msa.IsWildcard(uSeq, uCol))
+                {
+                const unsigned uLetter = msa.GetLetter(uSeq, uCol);
+                const SCORE scoreMatch = PP.m_AAScores[uLetter];
+                if (bWantColumns)
+                    MatchScore[uCol] += weightSeq*scoreMatch;
+                scoreSeq += scoreMatch;
+                }
+            }
+        scoreTotal += weightSeq*scoreSeq;
+        }
+
+    delete[] Prof;
+    return scoreTotal;
+    }
+
+// Cross-profile (XP) score: the weighted sum, over every pair made of
+// one sequence of msa1 and one sequence of msa2, of the letter score
+// plus the gap score of the induced pairwise alignment. Pairs inside the
+// same alignment are not scored, so for two fixed groups the XP score
+// and the SP score change by the same amount when the groups are
+// re-aligned to each other.
+//
+// Both alignments must have the same number of columns, and at least
+// one pair must exist (Quit() otherwise).
+SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)
+    {
+    const unsigned uColCount1 = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+    if (uColCount1 != uColCount2)
+        Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);
+
+    const unsigned uSeqCount1 = msa1.GetSeqCount();
+    const unsigned uSeqCount2 = msa2.GetSeqCount();
+
+#if TRACE
+    Log(" Score Weight Weight Total\n");
+    Log("---------- ------ ------ ----------\n");
+#endif
+
+    SCORE scoreTotal = 0;
+    unsigned uPairCount = 0;
+    for (unsigned uSeqA = 0; uSeqA < uSeqCount1; ++uSeqA)
+        {
+        const WEIGHT wA = msa1.GetSeqWeight(uSeqA);
+        for (unsigned uSeqB = 0; uSeqB < uSeqCount2; ++uSeqB)
+            {
+            const WEIGHT wB = msa2.GetSeqWeight(uSeqB);
+            const SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqA, msa2, uSeqB);
+            const SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqA, msa2, uSeqB);
+            const SCORE scorePair = scoreLetters + scoreGaps;
+            scoreTotal += wA*wB*scorePair;
+            ++uPairCount;
+#if TRACE
+            Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n",
+              scorePair,
+              wA,
+              wB,
+              scorePair*wA*wB,
+              msa1.GetSeqName(uSeqA),
+              msa2.GetSeqName(uSeqB));
+#endif
+            }
+        }
+    if (0 == uPairCount)
+        Quit("0 == uPairCount");
+
+#if TRACE
+    Log("msa1=\n");
+    msa1.LogMe();
+    Log("msa2=\n");
+    msa2.LogMe();
+    Log("XP=%g\n", scoreTotal);
+#endif
+    // The total is returned as is, not averaged over uPairCount.
+    return scoreTotal;
+    }
Added: trunk/packages/muscle/branches/upstream/current/objscoreda.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/objscoreda.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/objscoreda.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,289 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+#include "objscore.h"
+
+#if DOUBLE_AFFINE
+
+#define TRACE 0
+#define TEST_SPFAST 0
+
+// Affine penalty of a gap of uLength columns: the open score g plus the
+// extend score e for every column after the first.
+// Term is currently ignored: terminal gaps are scored like internal
+// ones (the per-policy terminal-gap handling that used to live here is
+// disabled).
+static SCORE GapPenalty(unsigned uLength, bool Term, SCORE g, SCORE e)
+    {
+    const unsigned uExtendCount = uLength - 1;
+    return g + uExtendCount*e;
+    }
+
+// Penalty of a gap of uLength columns under the global gap parameters.
+// In DOUBLE_AFFINE builds the gap is scored with both parameter sets
+// and the larger of the two scores is returned (the second set wins
+// ties); otherwise only the first set is used.
+static SCORE GapPenalty(unsigned uLength, bool Term)
+    {
+    const SCORE scoreFirst = GapPenalty(uLength, Term, g_scoreGapOpen, g_scoreGapExtend);
+#if DOUBLE_AFFINE
+    const SCORE scoreSecond = GapPenalty(uLength, Term, g_scoreGapOpen2, g_scoreGapExtend2);
+    return (scoreFirst > scoreSecond) ? scoreFirst : scoreSecond;
+#else
+    return scoreFirst;
+#endif
+    }
+
+static const MSA *g_ptrMSA1;
+static const MSA *g_ptrMSA2;
+static unsigned g_uSeqIndex1;
+static unsigned g_uSeqIndex2;
+
+// Trace helper: draws the gap spanning columns uStart..uEnd under the
+// two sequences last registered in g_ptrMSA1/g_ptrMSA2 ('-' for a
+// column gapped in one sequence, '.' for a column gapped in both), then
+// logs its length, terminal flags and penalty.
+// Quits if a column in the range has no gap at all, or if the number of
+// '-' columns differs from uGapLength.
+static void LogGap(unsigned uStart, unsigned uEnd, unsigned uGapLength,
+  bool bNTerm, bool bCTerm)
+    {
+    Log("%16.16s ", "");
+    for (unsigned uCol = 0; uCol < uStart; ++uCol)
+        Log(" ");
+
+    unsigned uDrawnLength = 0;
+    for (unsigned uCol = uStart; uCol <= uEnd; ++uCol)
+        {
+        const bool bGap1 = g_ptrMSA1->IsGap(g_uSeqIndex1, uCol);
+        const bool bGap2 = g_ptrMSA2->IsGap(g_uSeqIndex2, uCol);
+        if (!(bGap1 || bGap2))
+            Quit("Error -- neither gapping");
+        if (bGap1 && bGap2)
+            {
+            Log(".");
+            continue;
+            }
+        ++uDrawnLength;
+        Log("-");
+        }
+
+    const SCORE s = GapPenalty(uGapLength, bNTerm || bCTerm);
+    Log(" L=%d N%d C%d s=%.3g", uGapLength, bNTerm, bCTerm, s);
+    Log("\n");
+    if (uDrawnLength != uGapLength)
+        Quit("Lengths differ");
+    }
+
+// Scores the pairwise alignment induced between the sequence uSeqIndex1
+// of msa1 and the sequence uSeqIndex2 of msa2.
+//
+// The score is the sum of substitution scores over columns where both
+// sequences have a non-wildcard letter, plus GapPenalty() for every run
+// of gaps in either sequence (columns gapped in both are ignored).
+//
+// Outputs: *ptrLetters receives the substitution part, *ptrGaps the gap
+// part; the return value is their sum.
+// Side effect: the file-level g_ptrMSA1 / g_ptrMSA2 / g_uSeqIndex1 /
+// g_uSeqIndex2 are set for the benefit of LogGap().
+// Both alignments must have the same number of columns (Quit()
+// otherwise).
+static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1,
+  const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps)
+    {
+    g_ptrMSA1 = &msa1;
+    g_ptrMSA2 = &msa2;
+    g_uSeqIndex1 = uSeqIndex1;
+    g_uSeqIndex2 = uSeqIndex2;
+
+    const unsigned uColCount = msa1.GetColCount();
+    const unsigned uColCount2 = msa2.GetColCount();
+    if (uColCount != uColCount2)
+        Quit("ScoreSeqPair, different lengths");
+
+#if TRACE
+    Log("ScoreSeqPair\n");
+    Log("%16.16s ", msa1.GetSeqName(uSeqIndex1));
+    for (unsigned i = 0; i < uColCount; ++i)
+        Log("%c", msa1.GetChar(uSeqIndex1, i));
+    Log("\n");
+    Log("%16.16s ", msa2.GetSeqName(uSeqIndex2));
+    // Second sequence comes from msa2 (the original read it from msa1,
+    // printing the wrong row whenever the two alignments differ).
+    for (unsigned i = 0; i < uColCount; ++i)
+        Log("%c", msa2.GetChar(uSeqIndex2, i));
+    Log("\n");
+#endif
+
+    SCORE scoreTotal = 0;
+
+// Substitution scores
+    // While scoring, remember the first and last letter column of each
+    // sequence; they identify terminal gaps in the second pass.
+    unsigned uFirstLetter1 = uInsane;
+    unsigned uFirstLetter2 = uInsane;
+    unsigned uLastLetter1 = uInsane;
+    unsigned uLastLetter2 = uInsane;
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+        bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex);
+        bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex);
+
+        if (!bGap1)
+            {
+            if (uInsane == uFirstLetter1)
+                uFirstLetter1 = uColIndex;
+            uLastLetter1 = uColIndex;
+            }
+        if (!bGap2)
+            {
+            if (uInsane == uFirstLetter2)
+                uFirstLetter2 = uColIndex;
+            uLastLetter2 = uColIndex;
+            }
+
+        if (bGap1 || bGap2 || bWildcard1 || bWildcard2)
+            continue;
+
+        unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex);
+        unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex);
+
+        SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
+        scoreTotal += scoreMatch;
+#if TRACE
+        Log("%c <-> %c = %7.1f %10.1f\n",
+          msa1.GetChar(uSeqIndex1, uColIndex),
+          msa2.GetChar(uSeqIndex2, uColIndex),
+          scoreMatch,
+          scoreTotal);
+#endif
+        }
+
+    *ptrLetters = scoreTotal;
+
+// Gap penalties
+    unsigned uGapLength = uInsane;
+    unsigned uGapStartCol = uInsane;
+    bool bGapping1 = false;
+    bool bGapping2 = false;
+
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+        bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+
+        if (bGap1 && bGap2)
+            continue;
+
+        if (bGapping1)
+            {
+            if (bGap1)
+                ++uGapLength;
+            else
+                {
+                // Gap in sequence 1 ends here: charge it.
+                bGapping1 = false;
+                bool bNTerm = (uFirstLetter2 == uGapStartCol);
+                bool bCTerm = (uLastLetter2 + 1 == uColIndex);
+                SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
+                scoreTotal += scoreGap;
+#if TRACE
+                LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
+                Log("GAP %7.1f %10.1f\n",
+                  scoreGap,
+                  scoreTotal);
+#endif
+                }
+            continue;
+            }
+        else
+            {
+            if (bGap1)
+                {
+                uGapStartCol = uColIndex;
+                bGapping1 = true;
+                uGapLength = 1;
+                continue;
+                }
+            }
+
+        if (bGapping2)
+            {
+            if (bGap2)
+                ++uGapLength;
+            else
+                {
+                // Gap in sequence 2 ends here: charge it.
+                bGapping2 = false;
+                bool bNTerm = (uFirstLetter1 == uGapStartCol);
+                bool bCTerm = (uLastLetter1 + 1 == uColIndex);
+                SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
+                scoreTotal += scoreGap;
+#if TRACE
+                LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
+                Log("GAP %7.1f %10.1f\n",
+                  scoreGap,
+                  scoreTotal);
+#endif
+                }
+            }
+        else
+            {
+            if (bGap2)
+                {
+                uGapStartCol = uColIndex;
+                bGapping2 = true;
+                uGapLength = 1;
+                }
+            }
+        }
+
+    // A gap still open after the last column is a C-terminal gap.
+    if (bGapping1 || bGapping2)
+        {
+        SCORE scoreGap = GapPenalty(uGapLength, true);
+        scoreTotal += scoreGap;
+#if TRACE
+        LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true);
+        Log("GAP %7.1f %10.1f\n",
+          scoreGap,
+          scoreTotal);
+#endif
+        }
+    *ptrGaps = scoreTotal - *ptrLetters;
+    return scoreTotal;
+    }
+
+// The usual sum-of-pairs objective score: sum the score
+// of the alignment of each pair of sequences.
+// Double-affine variant of the sum-of-pairs score: every unordered pair
+// of sequences in msa is scored with ScoreSeqPair() and the results are
+// summed, each weighted by the product of the two sequence weights.
+// *ptrLetters and *ptrGaps receive the weighted substitution and gap
+// parts of the total; the return value is the weighted total.
+SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps)
+    {
+    const unsigned uSeqCount = msa.GetSeqCount();
+#if TRACE
+    msa.LogMe();
+    Log(" Score Weight Weight Total\n");
+    Log("---------- ------ ------ ----------\n");
+#endif
+    SCORE scoreTotal = 0;
+    SCORE TotalLetters = 0;
+    SCORE TotalGaps = 0;
+    for (unsigned uSeqA = 0; uSeqA < uSeqCount; ++uSeqA)
+        {
+        const WEIGHT wA = msa.GetSeqWeight(uSeqA);
+        for (unsigned uSeqB = uSeqA + 1; uSeqB < uSeqCount; ++uSeqB)
+            {
+            const WEIGHT wB = msa.GetSeqWeight(uSeqB);
+            SCORE PairLetters;
+            SCORE PairGaps;
+            const SCORE scorePair = ScoreSeqPair(msa, uSeqA, msa, uSeqB,
+              &PairLetters, &PairGaps);
+            scoreTotal += wA*wB*scorePair;
+            TotalLetters += wA*wB*PairLetters;
+            TotalGaps += wA*wB*PairGaps;
+#if TRACE
+            Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n",
+              scorePair,
+              wA,
+              wB,
+              scorePair*wA*wB,
+              uSeqA,
+              msa.GetSeqName(uSeqA),
+              uSeqB,
+              msa.GetSeqName(uSeqB));
+#endif
+            }
+        }
+    *ptrLetters = TotalLetters;
+    *ptrGaps = TotalGaps;
+    return scoreTotal;
+    }
+
+#endif // DOUBLE_AFFINE
Added: trunk/packages/muscle/branches/upstream/current/onexception.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/onexception.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/onexception.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,15 @@
+#include "muscle.h"
+#include <stdio.h>
+
+// Text written to stderr and to the log by OnException.
+static const char szOnExceptionMessage[] =
+    "\nFatal error, exception caught.\n";
+
+// Reports that an exception was caught: writes the message to stderr
+// and to the log, logs the finish time, then terminates the process
+// with exit code EXIT_Except. Does not return.
+void OnException()
+    {
+    // The message is passed as data, never as the format string, so a
+    // future '%' in the text cannot be interpreted as a conversion.
+    fputs(szOnExceptionMessage, stderr);
+    Log("%s", szOnExceptionMessage);
+    Log("Finished %s\n", GetTimeAsStr());
+    exit(EXIT_Except);
+    }
Added: trunk/packages/muscle/branches/upstream/current/options.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/options.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/options.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,233 @@
+#include "muscle.h"
+#include <stdio.h>
+
+// One command-line option that takes a value (-name value).
+struct VALUE_OPT
+ {
+ // Option name without the leading '-'; lookups in this file compare
+ // it with stricmp.
+ const char *m_pstrName;
+ // 0 until the option is given; TestSetValueOpt stores a strsave()
+ // copy of the value here.
+ const char *m_pstrValue;
+ };
+
+// One command-line option that takes no value (-name).
+struct FLAG_OPT
+ {
+ // Option name without the leading '-'; lookups in this file compare
+ // it with stricmp.
+ const char *m_pstrName;
+ // Set to true by TestSetFlagOpt when the flag is given.
+ bool m_bSet;
+ };
+
+// Every option that takes a value. Each entry starts with a null
+// value; TestSetValueOpt fills the value in when the option is given.
+static VALUE_OPT ValueOpts[] =
+    {
+    { "in", 0 },
+    { "in1", 0 },
+    { "in2", 0 },
+    { "out", 0 },
+    { "MaxIters", 0 },
+    { "MaxHours", 0 },
+    { "GapOpen", 0 },
+    { "GapOpen2", 0 },
+    { "GapExtend", 0 },
+    { "GapExtend2", 0 },
+    { "GapAmbig", 0 },
+    { "Center", 0 },
+    { "SmoothScoreCeil", 0 },
+    { "MinBestColScore", 0 },
+    { "MinSmoothScore", 0 },
+    { "ObjScore", 0 },
+    { "SmoothWindow", 0 },
+    { "RefineWindow", 0 },
+    { "FromWindow", 0 },
+    { "ToWindow", 0 },
+    { "SaveWindow", 0 },
+    { "WindowOffset", 0 },
+    { "FirstWindow", 0 },
+    { "AnchorSpacing", 0 },
+    { "Log", 0 },
+    { "LogA", 0 },
+    { "MaxTrees", 0 },
+    { "SUEFF", 0 },
+    { "Distance1", 0 },
+    { "Distance2", 0 },
+    { "Weight1", 0 },
+    { "Weight2", 0 },
+    { "Cluster1", 0 },
+    { "Cluster2", 0 },
+    { "Root1", 0 },
+    { "Root2", 0 },
+    { "Tree1", 0 },
+    { "Tree2", 0 },
+    { "UseTree", 0 },
+    { "UseTree_NoWarn", 0 },
+    { "DiagLength", 0 },
+    { "DiagMargin", 0 },
+    { "DiagBreak", 0 },
+    { "Hydro", 0 },
+    { "HydroFactor", 0 },
+    { "SPScore", 0 },
+    { "SeqType", 0 },
+    { "MaxMB", 0 },
+    { "ComputeWeights", 0 },
+    { "MaxSubFam", 0 },
+    { "ScoreFile", 0 },
+    { "TermGaps", 0 },
+    { "FASTAOut", 0 },
+    { "CLWOut", 0 },
+    { "CLWStrictOut", 0 },
+    { "HTMLOut", 0 },
+    { "MSFOut", 0 },
+    { "PHYIOut", 0 },
+    { "PHYSOut", 0 },
+    { "Matrix", 0 },
+    };
+// Number of entries in ValueOpts.
+static int ValueOptCount = sizeof(ValueOpts)/sizeof(*ValueOpts);
+
+// Every option that takes no value. All flags start cleared;
+// TestSetFlagOpt sets a flag when it is given.
+static FLAG_OPT FlagOpts[] =
+    {
+    { "LE", false },
+    { "SP", false },
+    { "SV", false },
+    { "SPN", false },
+    { "Core", false },
+    { "NoCore", false },
+    { "Diags1", false },
+    { "Diags2", false },
+    { "Diags", false },
+    { "Quiet", false },
+    { "MSF", false },
+    { "Verbose", false },
+    { "Anchors", false },
+    { "NoAnchors", false },
+    { "Refine", false },
+    { "RefineW", false },
+    { "SW", false },
+    { "Profile", false },
+    { "PPScore", false },
+    { "Cluster", false },
+    { "Brenner", false },
+    { "Dimer", false },
+    { "clw", false },
+    { "clwstrict", false },
+    { "HTML", false },
+    { "Version", false },
+    { "Stable", false },
+    { "Group", false },
+    { "FASTA", false },
+    { "ProfDB", false },
+    { "PAS", false },
+    { "PHYI", false },
+    { "PHYS", false },
+    };
+// Number of entries in FlagOpts.
+static int FlagOptCount = sizeof(FlagOpts)/sizeof(*FlagOpts);
+
+// If Arg names a known flag (case-insensitive), marks that flag as
+// set and returns true; otherwise returns false.
+static bool TestSetFlagOpt(const char *Arg)
+    {
+    FLAG_OPT *const ptrEnd = FlagOpts + FlagOptCount;
+    for (FLAG_OPT *ptrOpt = FlagOpts; ptrOpt != ptrEnd; ++ptrOpt)
+        {
+        if (0 != stricmp(Arg, ptrOpt->m_pstrName))
+            continue;
+        ptrOpt->m_bSet = true;
+        return true;
+        }
+    return false;
+    }
+
+// If Arg names a known value option (case-insensitive), stores a
+// strsave() copy of Value in the table and returns true. A known
+// option with a null Value prints an error and exits with
+// EXIT_NotStarted. Returns false if Arg is not a value option.
+static bool TestSetValueOpt(const char *Arg, const char *Value)
+    {
+    VALUE_OPT *const ptrEnd = ValueOpts + ValueOptCount;
+    for (VALUE_OPT *ptrOpt = ValueOpts; ptrOpt != ptrEnd; ++ptrOpt)
+        {
+        if (0 != stricmp(Arg, ptrOpt->m_pstrName))
+            continue;
+
+        // The value is only required once the name is known to match.
+        if (0 == Value)
+            {
+            fprintf(stderr, "Option -%s must have value\n", Arg);
+            exit(EXIT_NotStarted);
+            }
+        ptrOpt->m_pstrValue = strsave(Value);
+        return true;
+        }
+    return false;
+    }
+
+// Returns whether the flag option Name (case-insensitive) has been
+// set. Calls Quit if Name is not in the flag table.
+bool FlagOpt(const char *Name)
+    {
+    int iOpt = 0;
+    while (iOpt < FlagOptCount)
+        {
+        const FLAG_OPT &Opt = FlagOpts[iOpt];
+        if (0 == stricmp(Name, Opt.m_pstrName))
+            return Opt.m_bSet;
+        ++iOpt;
+        }
+    Quit("FlagOpt(%s) invalid", Name);
+    return false;
+    }
+
+// Returns the stored value of option Name (case-insensitive), or 0
+// if the option has not been given. Calls Quit if Name is not in the
+// value-option table.
+const char *ValueOpt(const char *Name)
+    {
+    int iOpt = 0;
+    while (iOpt < ValueOptCount)
+        {
+        const VALUE_OPT &Opt = ValueOpts[iOpt];
+        if (0 == stricmp(Name, Opt.m_pstrName))
+            return Opt.m_pstrValue;
+        ++iOpt;
+        }
+    Quit("ValueOpt(%s) invalid", Name);
+    return 0;
+    }
+
+// Parses argv[0..argc-1] as a sequence of "-flag" and "-name value"
+// options and records them in the option tables. Every argument in an
+// option position must begin with '-'. A malformed or unknown option
+// prints a message to stderr and exits with EXIT_NotStarted.
+void ProcessArgVect(int argc, char *argv[])
+    {
+    int iNext = 0;
+    while (iNext < argc)
+        {
+        const char *pstrArg = argv[iNext];
+        if ('-' != pstrArg[0])
+            {
+            fprintf(stderr, "Command-line option \"%s\" must start with '-'\n", pstrArg);
+            exit(EXIT_NotStarted);
+            }
+        const char *pstrName = pstrArg + 1;
+
+        // Flags are tried first and consume a single argument.
+        if (TestSetFlagOpt(pstrName))
+            {
+            iNext += 1;
+            continue;
+            }
+
+        // Otherwise the following argument, if any, is the value.
+        char *pstrValue = 0;
+        if (iNext + 1 < argc)
+            pstrValue = argv[iNext + 1];
+        if (!TestSetValueOpt(pstrName, pstrValue))
+            {
+            fprintf(stderr, "Invalid command line option \"%s\"\n", pstrName);
+            Usage();
+            exit(EXIT_NotStarted);
+            }
+        iNext += 2;
+        }
+    }
+
+// Splits ArgStr into whitespace-separated tokens and passes them to
+// ProcessArgVect. A null ArgStr is ignored. More than MAX_ARGS tokens
+// is a fatal error (Quit). The split is done on a private copy, which
+// is freed before returning.
+void ProcessArgStr(const char *ArgStr)
+    {
+    const int MAX_ARGS = 64;
+    char *argv[MAX_ARGS];
+
+    if (0 == ArgStr)
+        return;
+
+// Modifiable copy
+    char *StrCopy = strsave(ArgStr);
+
+    int argc = 0;
+    bool bInArg = false;
+    for (char *Str = StrCopy; 0 != *Str; ++Str)
+        {
+        // isspace() is only defined for EOF and values representable
+        // as unsigned char; a plain char may be negative for bytes
+        // >= 0x80, so convert before the call.
+        if (isspace((unsigned char) *Str))
+            {
+            // Terminate the current token in place.
+            *Str = 0;
+            bInArg = false;
+            }
+        else if (!bInArg)
+            {
+            // First character of a new token.
+            bInArg = true;
+            if (argc >= MAX_ARGS)
+                Quit("Too many args in MUSCLE_CMDLINE");
+            argv[argc++] = Str;
+            }
+        }
+    ProcessArgVect(argc, argv);
+    free(StrCopy);
+    }
+
+// Logs one line per flag option: its name and 0/1 state.
+void ListFlagOpts()
+    {
+    const FLAG_OPT *const ptrEnd = FlagOpts + FlagOptCount;
+    for (const FLAG_OPT *ptrOpt = FlagOpts; ptrOpt != ptrEnd; ++ptrOpt)
+        Log("%s %d\n", ptrOpt->m_pstrName, ptrOpt->m_bSet);
+    }
Added: trunk/packages/muscle/branches/upstream/current/outweights.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/outweights.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/outweights.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,17 @@
+#include "muscle.h"
+#include "msa.h"
+
+// Writes one "<name>\t<weight>" line per sequence of msa to the file
+// FileName, which is created or truncated. Calls Quit if the file
+// cannot be opened.
+void OutWeights(const char *FileName, const MSA &msa)
+    {
+    FILE *fOut = fopen(FileName, "w");
+    if (0 == fOut)
+        Quit("Cannot open '%s'", FileName);
+
+    const unsigned uCount = msa.GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uCount)
+        {
+        const char *pstrName = msa.GetSeqName(uSeq);
+        const WEIGHT wSeq = msa.GetSeqWeight(uSeq);
+        fprintf(fOut, "%s\t%.3g\n", pstrName, wSeq);
+        ++uSeq;
+        }
+    fclose(fOut);
+    }
Added: trunk/packages/muscle/branches/upstream/current/pam200mafft.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/pam200mafft.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/pam200mafft.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,32 @@
+#include "muscle.h"
+
+// Adjusted PAM200 scoring matrix as used by default in MAFFT.
+// Katoh, Misawa, Kuma and Miyata (2002), NAR 30(14), 3059-3066.
+//
+// 23 x 23 table. Rows and columns follow the letter order given in
+// the header comment below; each row is tagged with its letter in a
+// trailing comment. Every entry in the B, Z and X rows and columns
+// is 0.
+
+static float PAM200[23][23] =
+ {
+// A C D E F G H I K L M N P Q R S T V W Y B Z X
+ 408, 20, 54, 52, -182, 179, -68, 109, -35, -47, 39, 106, 206, -14, -12, 257, 293, 191, -306, -219, 0, 0, 0, // A
+ 20, 1190, -228, -295, 94, 6, 63, -131, -184, -176, -112, -29, -122, -195, 49, 185, 13, -49, 199, 333, 0, 0, 0, // C
+ 54, -228, 645, 516, -399, 168, 98, -225, 75, -341, -235, 352, -149, 142, -44, 65, 7, -147, -418, -128, 0, 0, 0, // D
+ 52, -295, 516, 630, -460, 145, 45, -225, 195, -307, -222, 186, -121, 299, 54, -10, -36, -130, -366, -285, 0, 0, 0, // E
+ -182, 94, -399, -460, 908, -387, 82, 100, -423, 340, 87, -216, -160, -274, -307, -31, -153, 51, 19, 604, 0, 0, 0, // F
+ 179, 6, 168, 145, -387, 682, -94, -196, -14, -304, -226, 99, -57, -48, 117, 175, 41, -73, -38, -329, 0, 0, 0, // G
+ -68, 63, 98, 45, 82, -94, 786, -185, 164, -72, -132, 258, 86, 388, 277, 55, -15, -197, -181, 488, 0, 0, 0, // H
+ 109, -131, -225, -225, 100, -196, -185, 574, -204, 308, 411, -94, -95, -202, -188, 1, 182, 489, -254, -133, 0, 0, 0, // I
+ -35, -184, 75, 195, -423, -14, 164, -204, 652, -229, -98, 206, -66, 335, 486, 22, 39, -207, -196, -244, 0, 0, 0, // K
+ -47, -176, -341, -307, 340, -304, -72, 308, -229, 611, 389, -203, 73, -66, -150, -49, -21, 259, -46, -9, 0, 0, 0, // L
+ 39, -112, -235, -222, 87, -226, -132, 411, -98, 389, 776, -111, -78, -104, -109, -29, 149, 351, -209, -162, 0, 0, 0, // M
+ 106, -29, 352, 186, -216, 99, 258, -94, 206, -203, -111, 536, -1, 108, 93, 260, 188, -98, -359, 12, 0, 0, 0, // N
+ 206, -122, -149, -121, -160, -57, 86, -95, -66, 73, -78, -1, 756, 142, 25, 241, 159, -55, -353, -206, 0, 0, 0, // P
+ -14, -195, 142, 299, -274, -48, 388, -202, 335, -66, -104, 108, 142, 655, 321, 7, -15, -175, -223, -53, 0, 0, 0, // Q
+ -12, 49, -44, 54, -307, 117, 277, -188, 486, -150, -109, 93, 25, 321, 626, 48, 16, -181, 124, -113, 0, 0, 0, // R
+ 257, 185, 65, -10, -31, 175, 55, 1, 22, -49, -29, 260, 241, 7, 48, 373, 279, 28, -193, -35, 0, 0, 0, // S
+ 293, 13, 7, -36, -153, 41, -15, 182, 39, -21, 149, 188, 159, -15, 16, 279, 442, 163, -323, -170, 0, 0, 0, // T
+ 191, -49, -147, -130, 51, -73, -197, 489, -207, 259, 351, -98, -55, -175, -181, 28, 163, 525, -225, -177, 0, 0, 0, // V
+ -306, 199, -418, -366, 19, -38, -181, -254, -196, -46, -209, -359, -353, -223, 124, -193, -323, -225, 1495, 83, 0, 0, 0, // W
+ -219, 333, -128, -285, 604, -329, 488, -133, -244, -9, -162, 12, -206, -53, -113, -35, -170, -177, 83, 999, 0, 0, 0, // Y
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Z
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // X
+ };
Added: trunk/packages/muscle/branches/upstream/current/params.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/params.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/params.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,608 @@
+#include "muscle.h"
+#include "objscore.h"
+#include "profile.h"
+#include "enumopts.h"
+
+// Fraction of GetRAMSizeMB() that SetParams assigns to g_uMaxMB when
+// -MaxMB is not given.
+const double DEFAULT_MAX_MB_FRACT = 0.8;
+
+// Scoring parameters. g_scoreCenter and g_scoreGapExtend are assigned
+// by the SetDefaults* functions; the others keep these initial values
+// unless overridden from the command line or by SetParams.
+SCORE g_scoreCenter = 0;
+SCORE g_scoreGapExtend = 0;
+SCORE g_scoreGapOpen2 = MINUS_INFINITY;
+SCORE g_scoreGapExtend2 = MINUS_INFINITY;
+// Computed in SetParams as g_scoreGapOpen*g_scoreAmbigFactor.
+SCORE g_scoreGapAmbig = 0;
+// Set from the -GapAmbig option.
+SCORE g_scoreAmbigFactor = 0;
+
+// Score matrices defined in other translation units.
+extern SCOREMATRIX VTML_LA;
+extern SCOREMATRIX PAM200;
+extern SCOREMATRIX PAM200NoCenter;
+extern SCOREMATRIX VTML_SP;
+extern SCOREMATRIX VTML_SPNoCenter;
+extern SCOREMATRIX NUC_SP;
+
+// Matrix currently selected; assigned by the SetDefaults* functions.
+PTR_SCOREMATRIX g_ptrScoreMatrix;
+
+// File names taken from the command line by SetParams.
+// Input and output default to "-"; every other name stays null
+// unless its option is given.
+const char *g_pstrInFileName = "-";
+const char *g_pstrOutFileName = "-";
+const char *g_pstrFASTAOutFileName = 0;
+const char *g_pstrMSFOutFileName = 0;
+const char *g_pstrClwOutFileName = 0;
+const char *g_pstrClwStrictOutFileName = 0;
+const char *g_pstrHTMLOutFileName = 0;
+const char *g_pstrPHYIOutFileName = 0;
+const char *g_pstrPHYSOutFileName = 0;
+
+// Set from -in1 / -in2.
+const char *g_pstrFileName1 = 0;
+const char *g_pstrFileName2 = 0;
+
+// Set from -SPScore / -Matrix.
+const char *g_pstrSPFileName = 0;
+const char *g_pstrMatrixFileName = 0;
+
+// Set from -UseTree or -UseTree_NoWarn; the flag records that the
+// _NoWarn spelling was used.
+const char *g_pstrUseTreeFileName = 0;
+bool g_bUseTreeNoWarn = false;
+
+// No explicit initializer: as globals these start out null.
+const char *g_pstrComputeWeightsFileName;
+const char *g_pstrScoreFileName;
+
+// NOTE(review): nothing in this file assigns these two; confirm
+// whether they are still used.
+const char *g_pstrProf1FileName = 0;
+const char *g_pstrProf2FileName = 0;
+
+// Numeric tunables with their built-in defaults. Each can be
+// overridden from the command line in SetParams (option name in
+// the trailing comment).
+unsigned g_uSmoothWindowLength = 7;
+// -SmoothWindow (above), -AnchorSpacing, -MaxTrees
+unsigned g_uAnchorSpacing = 32;
+unsigned g_uMaxTreeRefineIters = 1;
+
+// -RefineWindow, -FromWindow, -ToWindow, -SaveWindow, -WindowOffset
+unsigned g_uRefineWindow = 200;
+unsigned g_uWindowFrom = 0;
+unsigned g_uWindowTo = 0;
+unsigned g_uSaveWindow = uInsane;
+unsigned g_uWindowOffset = 0;
+
+// -MaxSubFam
+unsigned g_uMaxSubFamCount = 5;
+
+// -Hydro, -HydroFactor
+unsigned g_uHydrophobicRunLength = 5;
+float g_dHydroFactor = (float) 1.2;
+
+// -DiagLength, -DiagBreak, -DiagMargin
+unsigned g_uMinDiagLength = 24; // TODO alpha -- should depend on alphabet?
+unsigned g_uMaxDiagBreak = 1;
+unsigned g_uDiagMargin = 5;
+
+// -SUEFF
+float g_dSUEFF = (float) 0.1;
+
+// Boolean switches with their built-in defaults. Most are set from
+// the like-named command-line flag in SetParams.
+// g_bPrecompiledCenter is cleared by SetParams when -Dimer is given.
+bool g_bPrecompiledCenter = true;
+// Assigned by the SetDefaults* functions.
+bool g_bNormalizeCounts = false;
+bool g_bDiags1 = false;
+bool g_bDiags2 = false;
+bool g_bAnchors = true;
+bool g_bQuiet = false;
+bool g_bVerbose = false;
+bool g_bRefine = false;
+bool g_bRefineW = false;
+bool g_bProfDB = false;
+// Computed by SetParams from CanDoLowComplexity(), not a flag.
+bool g_bLow = false;
+bool g_bSW = false;
+bool g_bCluster = false;
+bool g_bProfile = false;
+bool g_bPPScore = false;
+bool g_bBrenner = false;
+bool g_bDimer = false;
+bool g_bVersion = false;
+bool g_bStable = false;
+bool g_bFASTA = false;
+bool g_bPAS = false;
+
+// Default depends on the build; -Core clears it and -NoCore sets it.
+#if DEBUG
+bool g_bCatchExceptions = false;
+#else
+bool g_bCatchExceptions = true;
+#endif
+
+// Output format switches. g_bAln is set by -clw and by -clwstrict.
+bool g_bMSF = false;
+bool g_bAln = false;
+bool g_bClwStrict = false;
+bool g_bHTML = false;
+bool g_bPHYI = false;
+bool g_bPHYS = false;
+
+// Resource limits. g_ulMaxSecs == 0 is reported as "(No limit)" by
+// MaxSecsToStr. g_uMaxMB is always reassigned by SetParams (from
+// -MaxMB, or from the RAM size when the option is absent).
+unsigned g_uMaxIters = 8;
+unsigned long g_ulMaxSecs = 0;
+unsigned g_uMaxMB = 500;
+
+// Enumerated choices with their built-in defaults.
+PPSCORE g_PPScore = PPSCORE_LE;
+OBJSCORE g_ObjScore = OBJSCORE_SPM;
+
+SEQWEIGHT g_SeqWeight1 = SEQWEIGHT_ClustalW;
+SEQWEIGHT g_SeqWeight2 = SEQWEIGHT_ClustalW;
+
+DISTANCE g_Distance1 = DISTANCE_Kmer6_6;
+DISTANCE g_Distance2 = DISTANCE_PctIdKimura;
+
+CLUSTER g_Cluster1 = CLUSTER_UPGMB;
+CLUSTER g_Cluster2 = CLUSTER_UPGMB;
+
+ROOT g_Root1 = ROOT_Pseudo;
+ROOT g_Root2 = ROOT_Pseudo;
+
+// No explicit initializer: as a global this starts out false.
+// NOTE(review): nothing in this file assigns it (SetParams uses a
+// local "Diags" instead); confirm whether it is still used.
+bool g_bDiags;
+
+SEQTYPE g_SeqType = SEQTYPE_Auto;
+
+TERMGAPS g_TermGaps = TERMGAPS_Half;
+
+//------------------------------------------------------
+// These parameters depend on the chosen prof-prof
+// score (g_PPScore) and are initialized to "Undefined"
+// (fInsane). The SetDefaults* functions assign the
+// real values.
+float g_dSmoothScoreCeil = fInsane;
+float g_dMinBestColScore = fInsane;
+float g_dMinSmoothScore = fInsane;
+SCORE g_scoreGapOpen = fInsane;
+//------------------------------------------------------
+
+// Converts the leading decimal number in s to unsigned; returns 0 if
+// s does not start with a number. Uses strtoul rather than atoi so
+// that values above INT_MAX have defined behavior (atoi is undefined
+// when the result is out of range). A leading '-' wraps modulo 2^N,
+// exactly as the previous (unsigned) atoi(s) did.
+static unsigned atou(const char *s)
+    {
+    return (unsigned) strtoul(s, 0, 10);
+    }
+
+// Returns the time limit as text: SecsToStr(g_ulMaxSecs), or
+// "(No limit)" when the limit is zero.
+const char *MaxSecsToStr()
+    {
+    if (0 != g_ulMaxSecs)
+        return SecsToStr(g_ulMaxSecs);
+    return "(No limit)";
+    }
+
+// Writes the program version and the current value of the global
+// parameters to the log, one "label value" line each.
+void ListParams()
+    {
+    Log("\n");
+    Log("%s\n", MUSCLE_LONG_VERSION);
+    Log("http://www.drive5.com/muscle\n");
+    Log("\n");
+    Log("Profile-profile score %s\n", PPSCOREToStr(g_PPScore));
+    Log("Max iterations %u\n", g_uMaxIters);
+    Log("Max trees %u\n", g_uMaxTreeRefineIters);
+    Log("Max time %s\n", MaxSecsToStr());
+    Log("Max MB %u\n", g_uMaxMB);
+    Log("Gap open %g\n", g_scoreGapOpen);
+    Log("Gap extend (dimer) %g\n", g_scoreGapExtend);
+    Log("Gap ambig factor %g\n", g_scoreAmbigFactor);
+    Log("Gap ambig penalty %g\n", g_scoreGapAmbig);
+    Log("Center (LE) %g\n", g_scoreCenter);
+    Log("Term gaps %s\n", TERMGAPSToStr(g_TermGaps));
+
+    Log("Smooth window length %u\n", g_uSmoothWindowLength);
+    Log("Refine window length %u\n", g_uRefineWindow);
+    Log("Min anchor spacing %u\n", g_uAnchorSpacing);
+    Log("Min diag length (lambda) %u\n", g_uMinDiagLength);
+    Log("Diag margin (mu) %u\n", g_uDiagMargin);
+    // NOTE(review): the label says "Min" but the variable is
+    // g_uMaxDiagBreak -- confirm which one is intended.
+    Log("Min diag break %u\n", g_uMaxDiagBreak);
+    Log("Hydrophobic window %u\n", g_uHydrophobicRunLength);
+
+    Log("Hydrophobic gap factor %g\n", g_dHydroFactor);
+    Log("Smooth score ceiling %g\n", g_dSmoothScoreCeil);
+    Log("Min best col score %g\n", g_dMinBestColScore);
+    Log("Min anchor score %g\n", g_dMinSmoothScore);
+    Log("SUEFF %g\n", g_dSUEFF);
+
+    Log("Brenner root MSA %s\n", BoolToStr(g_bBrenner));
+    Log("Normalize counts %s\n", BoolToStr(g_bNormalizeCounts));
+    Log("Diagonals (1) %s\n", BoolToStr(g_bDiags1));
+    Log("Diagonals (2) %s\n", BoolToStr(g_bDiags2));
+    Log("Anchors %s\n", BoolToStr(g_bAnchors));
+    Log("MSF output format %s\n", BoolToStr(g_bMSF));
+    Log("Phylip interleaved %s\n", BoolToStr(g_bPHYI));
+    Log("Phylip sequential %s\n", BoolToStr(g_bPHYS));
+    Log("ClustalW output format %s\n", BoolToStr(g_bAln));
+    Log("Catch exceptions %s\n", BoolToStr(g_bCatchExceptions));
+    Log("Quiet %s\n", BoolToStr(g_bQuiet));
+    Log("Refine %s\n", BoolToStr(g_bRefine));
+    // Label corrected from the misspelling "ProdfDB"; the option and
+    // the variable are both spelled ProfDB.
+    Log("ProfDB %s\n", BoolToStr(g_bProfDB));
+    Log("Low complexity profiles %s\n", BoolToStr(g_bLow));
+
+    Log("Objective score %s\n", OBJSCOREToStr(g_ObjScore));
+
+    Log("Distance method (1) %s\n", DISTANCEToStr(g_Distance1));
+    Log("Clustering method (1) %s\n", CLUSTERToStr(g_Cluster1));
+    Log("Root method (1) %s\n", ROOTToStr(g_Root1));
+    Log("Sequence weighting (1) %s\n", SEQWEIGHTToStr(g_SeqWeight1));
+
+    Log("Distance method (2) %s\n", DISTANCEToStr(g_Distance2));
+    Log("Clustering method (2) %s\n", CLUSTERToStr(g_Cluster2));
+    Log("Root method (2) %s\n", ROOTToStr(g_Root2));
+    Log("Sequence weighting (2) %s\n", SEQWEIGHTToStr(g_SeqWeight2));
+
+    Log("\n");
+    }
+
+// Installs the default parameters for the LE profile-profile score:
+// the VTML_LA matrix with normalized counts.
+static void SetDefaultsLE()
+    {
+    g_ptrScoreMatrix = &VTML_LA;
+    g_bNormalizeCounts = true;
+
+    // Earlier values, kept for reference: gap open -3.00, center -0.55.
+    g_scoreCenter = static_cast<SCORE>(-0.52);
+    g_scoreGapOpen = static_cast<SCORE>(-2.9);
+
+    // Earlier values, kept for reference: ceil 5.0, best col 4.0,
+    // smooth 2.0.
+    g_dMinSmoothScore = 1.0;
+    g_dMinBestColScore = 2.0;
+    g_dSmoothScoreCeil = 3.0;
+
+    g_Distance2 = DISTANCE_PctIdKimura;
+    g_Distance1 = DISTANCE_Kmer6_6;
+    }
+
+// Installs the default parameters for the SP profile-profile score:
+// the PAM200 matrix without count normalization.
+static void SetDefaultsSP()
+    {
+    g_ptrScoreMatrix = &PAM200;
+    g_bNormalizeCounts = false;
+
+    // center pre-added into score mx
+    g_scoreCenter = 0.0;
+    g_scoreGapOpen = -1439;
+
+    g_dMinSmoothScore = 125.0;
+    g_dMinBestColScore = 300.0;
+    g_dSmoothScoreCeil = 200.0;
+
+    g_Distance2 = DISTANCE_PctIdKimura;
+    g_Distance1 = DISTANCE_Kmer6_6;
+    }
+
+// Installs the default parameters for the SV profile-profile score:
+// the VTML_SP matrix without count normalization.
+static void SetDefaultsSV()
+    {
+    g_ptrScoreMatrix = &VTML_SP;
+    g_bNormalizeCounts = false;
+
+    // center pre-added into score mx
+    g_scoreCenter = 0.0;
+    g_scoreGapOpen = -300;
+
+    g_dMinSmoothScore = 40.0;
+    g_dMinBestColScore = 130.0;
+    g_dSmoothScoreCeil = 90.0;
+
+    g_Distance2 = DISTANCE_PctIdKimura;
+    g_Distance1 = DISTANCE_Kmer6_6;
+    }
+
+//static void SetDefaultsSPN()
+// {
+// g_ptrScoreMatrix = &NUC_SP;
+//
+// g_scoreGapOpen = -400;
+// g_scoreCenter = 0.0; // center pre-added into score mx
+//
+// g_bNormalizeCounts = false;
+//
+// g_dSmoothScoreCeil = 999.0; // disable
+// g_dMinBestColScore = 90;
+// g_dMinSmoothScore = 90;
+//
+// g_Distance1 = DISTANCE_Kmer4_6;
+// g_Distance2 = DISTANCE_PctIdKimura;
+// }
+
+// Installs the default parameters for the SPN profile-profile score
+// on a DNA alphabet: the NUC_SP matrix without count normalization.
+static void SetDefaultsSPN_DNA()
+    {
+    g_ptrScoreMatrix = &NUC_SP;
+    g_bNormalizeCounts = false;
+
+    g_scoreGapExtend = 0.0;
+    // center pre-added into score mx
+    g_scoreCenter = 0.0;
+    g_scoreGapOpen = -400;
+
+    g_dMinSmoothScore = 90;
+    g_dMinBestColScore = 90;
+    // disable
+    g_dSmoothScoreCeil = 999.0;
+
+    g_Distance2 = DISTANCE_PctIdKimura;
+    g_Distance1 = DISTANCE_Kmer4_6;
+    }
+
+// Installs the default parameters for the SPN profile-profile score
+// on an RNA alphabet: the NUC_SP matrix without count normalization.
+static void SetDefaultsSPN_RNA()
+    {
+    g_ptrScoreMatrix = &NUC_SP;
+    g_bNormalizeCounts = false;
+
+    g_scoreGapExtend = 0.0;
+    // total center = NUC_EXTEND - 300
+    g_scoreCenter = -300;
+    g_scoreGapOpen = -420;
+
+    g_dMinSmoothScore = 90;
+    g_dMinBestColScore = 90;
+    // disable
+    g_dSmoothScoreCeil = 999.0;
+
+    g_Distance2 = DISTANCE_PctIdKimura;
+    g_Distance1 = DISTANCE_Kmer4_6;
+    }
+
+// If flag OptName was given on the command line, stores
+// bValueIfFlagSet in *ptrParam; otherwise leaves *ptrParam alone.
+static void FlagParam(const char *OptName, bool *ptrParam, bool bValueIfFlagSet)
+    {
+    if (!FlagOpt(OptName))
+        return;
+    *ptrParam = bValueIfFlagSet;
+    }
+
+// If value option OptName was given, points *ptrptrParam at its
+// value; otherwise leaves *ptrptrParam alone.
+static void StrParam(const char *OptName, const char **ptrptrParam)
+    {
+    const char *pstrValue = ValueOpt(OptName);
+    if (0 == pstrValue)
+        return;
+    *ptrptrParam = pstrValue;
+    }
+
+// If value option OptName was given, converts its value with atof
+// and stores it in *ptrParam; otherwise leaves *ptrParam alone.
+static void FloatParam(const char *OptName, float *ptrParam)
+    {
+    const char *pstrValue = ValueOpt(OptName);
+    if (0 == pstrValue)
+        return;
+    *ptrParam = (float) atof(pstrValue);
+    }
+
+// If value option OptName was given, converts its value with atou
+// and stores it in *ptrParam; otherwise leaves *ptrParam alone.
+static void UintParam(const char *OptName, unsigned *ptrParam)
+    {
+    const char *pstrValue = ValueOpt(OptName);
+    if (0 == pstrValue)
+        return;
+    *ptrParam = atou(pstrValue);
+    }
+
+// If value option OptName was given, looks its value up
+// (case-insensitive) in Opts, a table terminated by an entry whose
+// pstrOpt is null, and stores the matching iValue in *Param.
+// Calls Quit if the value is not in the table. Leaves *Param alone
+// when the option was not given.
+static void EnumParam(const char *OptName, EnumOpt *Opts, int *Param)
+    {
+    const char *pstrValue = ValueOpt(OptName);
+    if (0 == pstrValue)
+        return;
+
+    for (EnumOpt *ptrOpt = Opts; ; ++ptrOpt)
+        {
+        // Reached the terminator without a match.
+        if (0 == ptrOpt->pstrOpt)
+            Quit("Invalid parameter -%s %s", OptName, pstrValue);
+        if (0 != stricmp(pstrValue, ptrOpt->pstrOpt))
+            continue;
+        *Param = ptrOpt->iValue;
+        return;
+        }
+    }
+
+// Calls the SetDefaults* function that matches g_PPScore (and, for
+// PPSCORE_SPN, g_Alpha). Calls Quit on an unrecognized value.
+static void SetPPDefaultParams()
+    {
+    if (PPSCORE_SP == g_PPScore)
+        SetDefaultsSP();
+    else if (PPSCORE_LE == g_PPScore)
+        SetDefaultsLE();
+    else if (PPSCORE_SV == g_PPScore)
+        SetDefaultsSV();
+    else if (PPSCORE_SPN == g_PPScore)
+        {
+        // Nucleotide defaults differ between DNA and RNA.
+        if (ALPHA_DNA == g_Alpha)
+            SetDefaultsSPN_DNA();
+        else if (ALPHA_RNA == g_Alpha)
+            SetDefaultsSPN_RNA();
+        else
+            Quit("Invalid alpha %d", g_Alpha);
+        }
+    else
+        Quit("Invalid g_PPScore");
+    }
+
+// Applies command-line overrides for the parameters whose defaults
+// depend on the profile-profile score. Options that were not given
+// leave their parameter unchanged.
+static void SetPPCommandLineParams()
+    {
+    // Option name -> float parameter it overrides.
+    struct FLOAT_BINDING
+        {
+        const char *m_pstrOpt;
+        float *m_ptrParam;
+        };
+    const FLOAT_BINDING Bindings[] =
+        {
+        { "GapOpen", &g_scoreGapOpen },
+        { "GapOpen2", &g_scoreGapOpen2 },
+        { "GapExtend", &g_scoreGapExtend },
+        { "GapExtend2", &g_scoreGapExtend2 },
+        { "GapAmbig", &g_scoreAmbigFactor },
+        { "Center", &g_scoreCenter },
+        { "SmoothScoreCeil", &g_dSmoothScoreCeil },
+        { "MinBestColScore", &g_dMinBestColScore },
+        { "MinSmoothScore", &g_dMinSmoothScore },
+        };
+    const unsigned uBindingCount = sizeof(Bindings)/sizeof(Bindings[0]);
+    for (unsigned u = 0; u < uBindingCount; ++u)
+        FloatParam(Bindings[u].m_pstrOpt, Bindings[u].m_ptrParam);
+
+    EnumParam("Distance1", DISTANCE_Opts, (int *) &g_Distance1);
+    EnumParam("Distance2", DISTANCE_Opts, (int *) &g_Distance2);
+    }
+
+// Chooses the profile-profile score and loads its parameters.
+// When bRespectFlagOpts is true, the first of -SP, -LE, -SV, -SPN
+// that was given replaces g_PPScore. The score is then adjusted to
+// the alphabet: a nucleotide alphabet forces SPN, and SPN on an
+// amino alphabet falls back to LE. Finally the defaults for the
+// score are installed, command-line overrides are applied, and the
+// parameters are logged when g_bVerbose is set.
+void SetPPScore(bool bRespectFlagOpts)
+    {
+    if (bRespectFlagOpts)
+        {
+        // Checked in this order; the first flag that is set wins.
+        struct FLAG_SCORE
+            {
+            const char *m_pstrFlag;
+            PPSCORE m_Score;
+            };
+        const FLAG_SCORE FlagScores[] =
+            {
+            { "SP", PPSCORE_SP },
+            { "LE", PPSCORE_LE },
+            { "SV", PPSCORE_SV },
+            { "SPN", PPSCORE_SPN },
+            };
+        const unsigned uCount = sizeof(FlagScores)/sizeof(FlagScores[0]);
+        for (unsigned u = 0; u < uCount; ++u)
+            {
+            if (!FlagOpt(FlagScores[u].m_pstrFlag))
+                continue;
+            g_PPScore = FlagScores[u].m_Score;
+            break;
+            }
+        }
+
+    const bool bAminoScore = PPSCORE_LE == g_PPScore ||
+      PPSCORE_SP == g_PPScore || PPSCORE_SV == g_PPScore;
+    if (bAminoScore)
+        {
+        if (ALPHA_RNA == g_Alpha || ALPHA_DNA == g_Alpha)
+            g_PPScore = PPSCORE_SPN;
+        }
+    else if (PPSCORE_SPN == g_PPScore)
+        {
+        if (ALPHA_Amino == g_Alpha)
+            g_PPScore = PPSCORE_LE;
+        }
+
+    SetPPDefaultParams();
+    SetPPCommandLineParams();
+
+    if (g_bVerbose)
+        ListParams();
+    }
+
+// Sets g_PPScore to p, then runs the bool overload with flag options
+// respected, so a -SP/-LE/-SV/-SPN flag given on the command line
+// still takes precedence over p.
+void SetPPScore(PPSCORE p)
+    {
+    const bool bRespectFlagOpts = true;
+    g_PPScore = p;
+    SetPPScore(bRespectFlagOpts);
+    }
+
+// Sets g_ulMaxSecs from the -MaxHours option (hours, may be
+// fractional). g_ulMaxSecs is left unchanged when the option is
+// absent, zero, negative or not a number.
+static void SetMaxSecs()
+    {
+    float fMaxHours = 0.0;
+    FloatParam("MaxHours", &fMaxHours);
+    // Converting a negative floating value to an unsigned type is
+    // undefined behavior, so anything that is not strictly positive
+    // is ignored. Written as !(x > 0) so that NaN is rejected too.
+    if (!(fMaxHours > 0.0))
+        return;
+    g_ulMaxSecs = (unsigned long) (fMaxHours*60*60);
+    }
+
+// True when the first sequence weighting is ClustalW and either only
+// one iteration will run or the second weighting is ClustalW too.
+static bool CanDoLowComplexity()
+    {
+    const bool bFirstIsClustalW = (SEQWEIGHT_ClustalW == g_SeqWeight1);
+    const bool bSingleIter = (1 == g_uMaxIters);
+    return bFirstIsClustalW &&
+      (bSingleIter || SEQWEIGHT_ClustalW == g_SeqWeight2);
+    }
+
+// True when no input was named on the command line: the input file
+// is still the default "-" and neither g_pstrFileName1 nor
+// g_pstrSPFileName has been set.
+bool MissingCommand()
+    {
+    const bool bDefaultInput = (0 == strcmp(g_pstrInFileName, "-"));
+    return bDefaultInput && 0 == g_pstrFileName1 && 0 == g_pstrSPFileName;
+    }
+
+// Copies command-line options from the option tables into the global
+// parameters. An option that was not given leaves its global at the
+// current value. Statement order matters below: where two options
+// write the same global, the later call wins.
+void SetParams()
+ {
+ SetMaxSecs();
+
+ // Input / output file names.
+ StrParam("in", &g_pstrInFileName);
+ StrParam("out", &g_pstrOutFileName);
+
+ StrParam("FASTAOut", &g_pstrFASTAOutFileName);
+ StrParam("ClwOut", &g_pstrClwOutFileName);
+ StrParam("ClwStrictOut", &g_pstrClwStrictOutFileName);
+ StrParam("HTMLOut", &g_pstrHTMLOutFileName);
+ StrParam("PHYIOut", &g_pstrPHYIOutFileName);
+ StrParam("PHYSOut", &g_pstrPHYSOutFileName);
+ StrParam("MSFOut", &g_pstrMSFOutFileName);
+
+ StrParam("in1", &g_pstrFileName1);
+ StrParam("in2", &g_pstrFileName2);
+
+ StrParam("Matrix", &g_pstrMatrixFileName);
+ StrParam("SPScore", &g_pstrSPFileName);
+
+ // -UseTree_NoWarn is read first so that a non-null name at this
+ // point means the _NoWarn spelling was used. -UseTree, read next,
+ // overwrites the file name if both were given.
+ StrParam("UseTree_NoWarn", &g_pstrUseTreeFileName);
+ if (0 != g_pstrUseTreeFileName)
+ g_bUseTreeNoWarn = true;
+
+ StrParam("UseTree", &g_pstrUseTreeFileName);
+ StrParam("ComputeWeights", &g_pstrComputeWeightsFileName);
+ StrParam("ScoreFile", &g_pstrScoreFileName);
+
+ // -Core disables exception catching, -NoCore enables it; -NoCore
+ // wins if both are given.
+ FlagParam("Core", &g_bCatchExceptions, false);
+ FlagParam("NoCore", &g_bCatchExceptions, true);
+
+ FlagParam("Diags1", &g_bDiags1, true);
+ FlagParam("Diags2", &g_bDiags2, true);
+
+ // -Diags is shorthand for -Diags1 -Diags2.
+ bool Diags = false;
+ FlagParam("Diags", &Diags, true);
+ if (Diags)
+ {
+ g_bDiags1 = true;
+ g_bDiags2 = true;
+ }
+
+ // -NoAnchors wins if both are given.
+ FlagParam("Anchors", &g_bAnchors, true);
+ FlagParam("NoAnchors", &g_bAnchors, false);
+
+ FlagParam("Quiet", &g_bQuiet, true);
+ FlagParam("Verbose", &g_bVerbose, true);
+ FlagParam("Version", &g_bVersion, true);
+ // -Stable sets g_bStable, -Group clears it; -Group wins if both
+ // are given.
+ FlagParam("Stable", &g_bStable, true);
+ FlagParam("Group", &g_bStable, false);
+ FlagParam("Refine", &g_bRefine, true);
+ FlagParam("RefineW", &g_bRefineW, true);
+ FlagParam("ProfDB", &g_bProfDB, true);
+ FlagParam("SW", &g_bSW, true);
+ FlagParam("Cluster", &g_bCluster, true);
+ FlagParam("Profile", &g_bProfile, true);
+ FlagParam("PPScore", &g_bPPScore, true);
+ FlagParam("Brenner", &g_bBrenner, true);
+ FlagParam("Dimer", &g_bDimer, true);
+
+ // Output format flags.
+ FlagParam("MSF", &g_bMSF, true);
+ FlagParam("PHYI", &g_bPHYI, true);
+ FlagParam("PHYS", &g_bPHYS, true);
+ FlagParam("clw", &g_bAln, true);
+ FlagParam("HTML", &g_bHTML, true);
+ FlagParam("FASTA", &g_bFASTA, true);
+ FlagParam("PAS", &g_bPAS, true);
+
+ // -clwstrict turns on both g_bAln and g_bClwStrict.
+ bool b = false;
+ FlagParam("clwstrict", &b, true);
+ if (b)
+ {
+ g_bAln = true;
+ g_bClwStrict = true;
+ }
+
+ // Unsigned tunables.
+ UintParam("MaxIters", &g_uMaxIters);
+ UintParam("MaxTrees", &g_uMaxTreeRefineIters);
+ UintParam("SmoothWindow", &g_uSmoothWindowLength);
+ UintParam("RefineWindow", &g_uRefineWindow);
+ UintParam("FromWindow", &g_uWindowFrom);
+ UintParam("ToWindow", &g_uWindowTo);
+ UintParam("SaveWindow", &g_uSaveWindow);
+ UintParam("WindowOffset", &g_uWindowOffset);
+ UintParam("AnchorSpacing", &g_uAnchorSpacing);
+ UintParam("DiagLength", &g_uMinDiagLength);
+ UintParam("DiagMargin", &g_uDiagMargin);
+ UintParam("DiagBreak", &g_uMaxDiagBreak);
+ UintParam("Hydro", &g_uHydrophobicRunLength);
+ UintParam("MaxSubFam", &g_uMaxSubFamCount);
+
+ FloatParam("SUEFF", &g_dSUEFF);
+ FloatParam("HydroFactor", &g_dHydroFactor);
+
+ // Enumerated choices; an unrecognized value is fatal (see EnumParam).
+ EnumParam("ObjScore", OBJSCORE_Opts, (int *) &g_ObjScore);
+ EnumParam("TermGaps", TERMGAPS_Opts, (int *) &g_TermGaps);
+
+ EnumParam("Weight1", SEQWEIGHT_Opts, (int *) &g_SeqWeight1);
+ EnumParam("Weight2", SEQWEIGHT_Opts, (int *) &g_SeqWeight2);
+
+ EnumParam("Cluster1", CLUSTER_Opts, (int *) &g_Cluster1);
+ EnumParam("Cluster2", CLUSTER_Opts, (int *) &g_Cluster2);
+
+ EnumParam("Root1", ROOT_Opts, (int *) &g_Root1);
+ EnumParam("Root2", ROOT_Opts, (int *) &g_Root2);
+
+ EnumParam("SeqType", SEQTYPE_Opts, (int *) &g_SeqType);
+
+ // NOTE(review): neither g_scoreGapOpen nor g_scoreAmbigFactor is
+ // assigned in this function (they are set by SetPPScore's helpers),
+ // so this product uses whatever values they hold when SetParams is
+ // called -- confirm the intended call order.
+ g_scoreGapAmbig = g_scoreGapOpen*g_scoreAmbigFactor;
+ g_bLow = CanDoLowComplexity();
+
+ if (g_bDimer)
+ g_bPrecompiledCenter = false;
+
+ // Without -MaxMB, the limit defaults to a fixed fraction
+ // (DEFAULT_MAX_MB_FRACT) of the RAM size reported by GetRAMSizeMB().
+ UintParam("MaxMB", &g_uMaxMB);
+ if (0 == ValueOpt("MaxMB"))
+ g_uMaxMB = (unsigned) (GetRAMSizeMB()*DEFAULT_MAX_MB_FRACT);
+ }
Added: trunk/packages/muscle/branches/upstream/current/params.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/params.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/params.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,114 @@
+#ifndef params_h
+#define params_h
+
+// File names set from the command line (defined in params.cpp).
+// Main input and output.
+extern const char *g_pstrInFileName;
+extern const char *g_pstrOutFileName;
+
+// Per-format output file names.
+extern const char *g_pstrFASTAOutFileName;
+extern const char *g_pstrMSFOutFileName;
+extern const char *g_pstrClwOutFileName;
+extern const char *g_pstrClwStrictOutFileName;
+extern const char *g_pstrHTMLOutFileName;
+extern const char *g_pstrPHYIOutFileName;
+extern const char *g_pstrPHYSOutFileName;
+
+// Set from -in1 / -in2.
+extern const char *g_pstrFileName1;
+extern const char *g_pstrFileName2;
+
+// Set from -SPScore / -Matrix.
+extern const char *g_pstrSPFileName;
+extern const char *g_pstrMatrixFileName;
+
+// Set from -UseTree or -UseTree_NoWarn.
+extern const char *g_pstrUseTreeFileName;
+extern bool g_bUseTreeNoWarn;
+
+// Set from -ComputeWeights / -ScoreFile.
+extern const char *g_pstrComputeWeightsFileName;
+extern const char *g_pstrScoreFileName;
+
+// Gap and center scores (defined in params.cpp).
+extern SCORE g_scoreGapOpen;
+extern SCORE g_scoreCenter;
+extern SCORE g_scoreGapExtend;
+extern SCORE g_scoreGapAmbig;
+
+// Second pair of gap penalties, declared only in DOUBLE_AFFINE builds.
+// NOTE(review): params.cpp defines both variables unconditionally --
+// confirm the guard here is intentional.
+#if DOUBLE_AFFINE
+extern SCORE g_scoreGapOpen2;
+extern SCORE g_scoreGapExtend2;
+#endif
+
+// Numeric tunables (defined in params.cpp).
+// Set from -SmoothWindow, -AnchorSpacing, -MaxTrees.
+extern unsigned g_uSmoothWindowLength;
+extern unsigned g_uAnchorSpacing;
+extern unsigned g_uMaxTreeRefineIters;
+
+// Set from -DiagLength, -DiagBreak, -DiagMargin.
+extern unsigned g_uMinDiagLength;
+extern unsigned g_uMaxDiagBreak;
+extern unsigned g_uDiagMargin;
+
+// Set from -RefineWindow, -FromWindow, -ToWindow, -SaveWindow,
+// -WindowOffset.
+extern unsigned g_uRefineWindow;
+extern unsigned g_uWindowFrom;
+extern unsigned g_uWindowTo;
+extern unsigned g_uSaveWindow;
+extern unsigned g_uWindowOffset;
+
+// Set from -MaxSubFam.
+extern unsigned g_uMaxSubFamCount;
+
+// Set from -Hydro, -HydroFactor.
+extern unsigned g_uHydrophobicRunLength;
+extern float g_dHydroFactor;
+
+// The first three depend on the profile-profile score and can be
+// overridden with the like-named options; g_dSUEFF is set from -SUEFF.
+extern float g_dSmoothScoreCeil;
+extern float g_dMinBestColScore;
+extern float g_dMinSmoothScore;
+extern float g_dSUEFF;
+
+// Boolean switches (defined in params.cpp).
+extern bool g_bPrecompiledCenter;
+extern bool g_bNormalizeCounts;
+extern bool g_bDiags1;
+extern bool g_bDiags2;
+extern bool g_bDiags;
+extern bool g_bAnchors;
+extern bool g_bCatchExceptions;
+
+// Output format switches.
+extern bool g_bMSF;
+extern bool g_bAln;
+extern bool g_bClwStrict;
+extern bool g_bHTML;
+extern bool g_bPHYI;
+extern bool g_bPHYS;
+
+// Mode and behavior switches.
+extern bool g_bQuiet;
+extern bool g_bVerbose;
+extern bool g_bRefine;
+extern bool g_bRefineW;
+// NOTE(review): no definition of g_bRefineX appears among the
+// globals in params.cpp -- confirm it is defined elsewhere.
+extern bool g_bRefineX;
+extern bool g_bLow;
+extern bool g_bSW;
+extern bool g_bCluster;
+extern bool g_bProfile;
+extern bool g_bProfDB;
+extern bool g_bPPScore;
+extern bool g_bBrenner;
+extern bool g_bDimer;
+extern bool g_bVersion;
+extern bool g_bStable;
+extern bool g_bFASTA;
+extern bool g_bPAS;
+
+// Enumerated choices (defined in params.cpp).
+extern PPSCORE g_PPScore;
+extern OBJSCORE g_ObjScore;
+
+// Settings with suffix 1, set from -Distance1, -Cluster1, -Root1,
+// -Weight1.
+extern DISTANCE g_Distance1;
+extern CLUSTER g_Cluster1;
+extern ROOT g_Root1;
+extern SEQWEIGHT g_SeqWeight1;
+
+// Settings with suffix 2, set from -Distance2, -Cluster2, -Root2,
+// -Weight2.
+extern DISTANCE g_Distance2;
+extern CLUSTER g_Cluster2;
+extern ROOT g_Root2;
+extern SEQWEIGHT g_SeqWeight2;
+
+// Resource limits: iterations, seconds (0 is reported as no limit),
+// megabytes.
+extern unsigned g_uMaxIters;
+extern unsigned long g_ulMaxSecs;
+extern unsigned g_uMaxMB;
+
+// Set from -SeqType and -TermGaps.
+extern SEQTYPE g_SeqType;
+extern TERMGAPS g_TermGaps;
+
+#endif // params_h
Added: trunk/packages/muscle/branches/upstream/current/phy.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phy.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phy.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,1069 @@
+#include "muscle.h"
+#include "tree.h"
+#include <math.h>
+
+#define TRACE 0
+
+/***
+Node has 0 to 3 neighbors:
+ 0 neighbors: singleton root
+ 1 neighbor: leaf, neighbor is parent
+ 2 neighbors: non-singleton root
+ 3 neighbors: internal node (other than root)
+
+Minimal rooted tree is single node.
+Minimal unrooted tree is single edge.
+Leaf node always has nulls in neighbors 2 and 3, neighbor 1 is parent.
+When tree is rooted, neighbor 1=parent, 2=left, 3=right.
+***/
+
+// Check that two nodes list each other in one of their neighbor slots and,
+// when node 1 reports an edge length, that both ends store the same value.
+// Quit() is called on an out-of-range index, a one-sided link, or a length
+// mismatch; the tree is dumped with LogMe() first in the latter two cases.
+void Tree::AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const
+{
+    const bool bInRange =
+      uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount;
+    if (!bInRange)
+        Quit("AssertAreNeighbors(%u,%u), are %u nodes",
+          uNodeIndex1, uNodeIndex2, m_uNodeCount);
+
+    // Does node 1 reference node 2?
+    const bool bForward =
+      m_uNeighbor1[uNodeIndex1] == uNodeIndex2 ||
+      m_uNeighbor2[uNodeIndex1] == uNodeIndex2 ||
+      m_uNeighbor3[uNodeIndex1] == uNodeIndex2;
+    if (!bForward)
+    {
+        LogMe();
+        Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2);
+    }
+
+    // Does node 2 reference node 1?
+    const bool bBackward =
+      m_uNeighbor1[uNodeIndex2] == uNodeIndex1 ||
+      m_uNeighbor2[uNodeIndex2] == uNodeIndex1 ||
+      m_uNeighbor3[uNodeIndex2] == uNodeIndex1;
+    if (!bBackward)
+    {
+        LogMe();
+        Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2);
+    }
+
+    // The length is stored once per end; both copies must match.
+    if (!HasEdgeLength(uNodeIndex1, uNodeIndex2))
+        return;
+    if (GetEdgeLength(uNodeIndex1, uNodeIndex2) !=
+      GetEdgeLength(uNodeIndex2, uNodeIndex1))
+    {
+        LogMe();
+        Quit("Tree::AssertAreNeighbors, Edge length disagrees %u, %u",
+          uNodeIndex1, uNodeIndex2);
+    }
+}
+
+// Check the structural invariants of one node, calling Quit() on the first
+// violation found (the tree is dumped with LogMe() first, except for the
+// range check):
+//   - the index is below m_uNodeCount;
+//   - a node with exactly two neighbors only occurs as the root of a
+//     rooted tree;
+//   - neighbor slots 2 and 3 are either both null or both set;
+//   - every non-null neighbor links back (see AssertAreNeighbors);
+//   - no node index appears in more than one slot;
+//   - in a rooted tree only the root lacks a parent, and every other node
+//     is the left or right child of its parent.
+void Tree::ValidateNode(unsigned uNodeIndex) const
+{
+    if (uNodeIndex >= m_uNodeCount)
+        Quit("ValidateNode(%u), %u nodes", uNodeIndex, m_uNodeCount);
+
+    const unsigned uNeighborCount = GetNeighborCount(uNodeIndex);
+
+    if (2 == uNeighborCount)
+    {
+        if (!m_bRooted)
+        {
+            LogMe();
+            Quit("Tree::ValidateNode: Node %u has two neighbors, tree is not rooted",
+              uNodeIndex);
+        }
+        if (uNodeIndex != m_uRootNodeIndex)
+        {
+            LogMe();
+            Quit("Tree::ValidateNode: Node %u has two neighbors, but not root node=%u",
+              uNodeIndex, m_uRootNodeIndex);
+        }
+    }
+
+    const unsigned n1 = m_uNeighbor1[uNodeIndex];
+    const unsigned n2 = m_uNeighbor2[uNodeIndex];
+    const unsigned n3 = m_uNeighbor3[uNodeIndex];
+
+    // The messages below now include the node index; previously the index
+    // was passed to Quit() without a matching conversion and never printed.
+    if (NULL_NEIGHBOR == n2 && NULL_NEIGHBOR != n3)
+    {
+        LogMe();
+        Quit("Tree::ValidateNode(%u), n2=null, n3!=null", uNodeIndex);
+    }
+    if (NULL_NEIGHBOR == n3 && NULL_NEIGHBOR != n2)
+    {
+        LogMe();
+        Quit("Tree::ValidateNode(%u), n3=null, n2!=null", uNodeIndex);
+    }
+
+    if (n1 != NULL_NEIGHBOR)
+        AssertAreNeighbors(uNodeIndex, n1);
+    if (n2 != NULL_NEIGHBOR)
+        AssertAreNeighbors(uNodeIndex, n2);
+    if (n3 != NULL_NEIGHBOR)
+        AssertAreNeighbors(uNodeIndex, n3);
+
+    // Any two slots holding the same non-null index. The pairs (n1,n2) and
+    // (n1,n3) are covered by the first term, (n2,n3) by the second.
+    const bool bDuplicate =
+      (n1 != NULL_NEIGHBOR && (n1 == n2 || n1 == n3)) ||
+      (n2 != NULL_NEIGHBOR && n2 == n3);
+    if (bDuplicate)
+    {
+        LogMe();
+        Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex);
+    }
+
+    if (IsRooted())
+    {
+        if (NULL_NEIGHBOR == GetParent(uNodeIndex))
+        {
+            if (uNodeIndex != m_uRootNodeIndex)
+            {
+                LogMe();
+                Quit("Tree::ValidateNode(%u), no parent", uNodeIndex);
+            }
+        }
+        else if (GetLeft(GetParent(uNodeIndex)) != uNodeIndex &&
+          GetRight(GetParent(uNodeIndex)) != uNodeIndex)
+        {
+            LogMe();
+            Quit("Tree::ValidateNode(%u), parent / child mismatch", uNodeIndex);
+        }
+    }
+}
+
+// Run ValidateNode() on every node index in [0, m_uNodeCount).
+void Tree::Validate() const
+{
+    unsigned uNodeIndex = 0;
+    while (uNodeIndex < m_uNodeCount)
+    {
+        ValidateNode(uNodeIndex);
+        ++uNodeIndex;
+    }
+}
+
+// Return true when uNodeIndex2 occupies one of the three neighbor slots of
+// uNodeIndex1. Only node 1's slots are examined.
+bool Tree::IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const
+{
+    assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount);
+
+    if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2)
+        return true;
+    if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2)
+        return true;
+    return m_uNeighbor3[uNodeIndex1] == uNodeIndex2;
+}
+
+// Return the length stored at node 1 for the edge leading to node 2.
+// The value comes from whichever neighbor slot of node 1 holds node 2;
+// slot 3 is used when neither slot 1 nor slot 2 matches.
+double Tree::GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const
+{
+    assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount);
+    assert(HasEdgeLength(uNodeIndex1, uNodeIndex2));
+
+    const double *ptrLengths = m_dEdgeLength3;
+    if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2)
+        ptrLengths = m_dEdgeLength1;
+    else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2)
+        ptrLengths = m_dEdgeLength2;
+    else
+        assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2);
+
+    return ptrLengths[uNodeIndex1];
+}
+
+// Grow every per-node array by a fixed increment of 100 entries.
+// New arrays are allocated, the first m_uCacheCount entries of each old
+// array are copied across, the old arrays are released and the members are
+// re-pointed. In the new arrays the id slots start out filled with 0xff
+// bytes and the name slots with zero bytes; all other slots beyond the
+// copied range are left uninitialised.
+void Tree::ExpandCache()
+{
+    const unsigned uNodeCount = 100;
+    unsigned uNewCacheCount = m_uCacheCount + uNodeCount;
+    unsigned *uNewNeighbor1 = new unsigned[uNewCacheCount];
+    unsigned *uNewNeighbor2 = new unsigned[uNewCacheCount];
+    unsigned *uNewNeighbor3 = new unsigned[uNewCacheCount];
+
+    unsigned *uNewIds = new unsigned[uNewCacheCount];
+    memset(uNewIds, 0xff, uNewCacheCount*sizeof(unsigned));
+
+    double *dNewEdgeLength1 = new double[uNewCacheCount];
+    double *dNewEdgeLength2 = new double[uNewCacheCount];
+    double *dNewEdgeLength3 = new double[uNewCacheCount];
+    double *dNewHeight = new double[uNewCacheCount];
+
+    bool *bNewHasEdgeLength1 = new bool[uNewCacheCount];
+    bool *bNewHasEdgeLength2 = new bool[uNewCacheCount];
+    bool *bNewHasEdgeLength3 = new bool[uNewCacheCount];
+    bool *bNewHasHeight = new bool[uNewCacheCount];
+
+    char **ptrNewName = new char *[uNewCacheCount];
+    memset(ptrNewName, 0, uNewCacheCount*sizeof(char *));
+
+    if (m_uCacheCount > 0)
+    {
+        const size_t uUnsignedBytes = m_uCacheCount*sizeof(unsigned);
+        memcpy(uNewNeighbor1, m_uNeighbor1, uUnsignedBytes);
+        memcpy(uNewNeighbor2, m_uNeighbor2, uUnsignedBytes);
+        memcpy(uNewNeighbor3, m_uNeighbor3, uUnsignedBytes);
+
+        memcpy(uNewIds, m_Ids, uUnsignedBytes);
+
+        const size_t uEdgeBytes = m_uCacheCount*sizeof(double);
+        memcpy(dNewEdgeLength1, m_dEdgeLength1, uEdgeBytes);
+        memcpy(dNewEdgeLength2, m_dEdgeLength2, uEdgeBytes);
+        memcpy(dNewEdgeLength3, m_dEdgeLength3, uEdgeBytes);
+        memcpy(dNewHeight, m_dHeight, uEdgeBytes);
+
+        // Each flag array is copied from its own source. Previously the
+        // slot-2 and slot-3 flags were both overwritten with slot 1's.
+        const size_t uBoolBytes = m_uCacheCount*sizeof(bool);
+        memcpy(bNewHasEdgeLength1, m_bHasEdgeLength1, uBoolBytes);
+        memcpy(bNewHasEdgeLength2, m_bHasEdgeLength2, uBoolBytes);
+        memcpy(bNewHasEdgeLength3, m_bHasEdgeLength3, uBoolBytes);
+        memcpy(bNewHasHeight, m_bHasHeight, uBoolBytes);
+
+        // Only the pointer table is copied; the name strings themselves
+        // keep their addresses.
+        const size_t uNameBytes = m_uCacheCount*sizeof(char *);
+        memcpy(ptrNewName, m_ptrName, uNameBytes);
+
+        delete[] m_uNeighbor1;
+        delete[] m_uNeighbor2;
+        delete[] m_uNeighbor3;
+
+        delete[] m_Ids;
+
+        delete[] m_dEdgeLength1;
+        delete[] m_dEdgeLength2;
+        delete[] m_dEdgeLength3;
+        delete[] m_dHeight;    // was never released before (leak)
+
+        delete[] m_bHasEdgeLength1;
+        delete[] m_bHasEdgeLength2;
+        delete[] m_bHasEdgeLength3;
+        delete[] m_bHasHeight;
+
+        delete[] m_ptrName;
+    }
+
+    m_uCacheCount = uNewCacheCount;
+    m_uNeighbor1 = uNewNeighbor1;
+    m_uNeighbor2 = uNewNeighbor2;
+    m_uNeighbor3 = uNewNeighbor3;
+    m_Ids = uNewIds;
+    m_dEdgeLength1 = dNewEdgeLength1;
+    m_dEdgeLength2 = dNewEdgeLength2;
+    m_dEdgeLength3 = dNewEdgeLength3;
+    m_dHeight = dNewHeight;
+    m_bHasEdgeLength1 = bNewHasEdgeLength1;
+    m_bHasEdgeLength2 = bNewHasEdgeLength2;
+    m_bHasEdgeLength3 = bNewHasEdgeLength3;
+    m_bHasHeight = bNewHasHeight;
+    m_ptrName = ptrNewName;
+}
+
+// Build a rooted tree made of one node and no edges.
+// That node has index 0 and is recorded as the root.
+void Tree::CreateRooted()
+{
+    Clear();
+    ExpandCache();
+
+    const unsigned uRoot = 0;
+
+    m_uNodeCount = 1;
+    m_uRootNodeIndex = uRoot;
+    m_bRooted = true;
+
+    // No neighbors in any slot.
+    m_uNeighbor1[uRoot] = NULL_NEIGHBOR;
+    m_uNeighbor2[uRoot] = NULL_NEIGHBOR;
+    m_uNeighbor3[uRoot] = NULL_NEIGHBOR;
+
+    // No edge lengths and no cached height.
+    m_bHasEdgeLength1[uRoot] = false;
+    m_bHasEdgeLength2[uRoot] = false;
+    m_bHasEdgeLength3[uRoot] = false;
+    m_bHasHeight[uRoot] = false;
+
+#if DEBUG
+    Validate();
+#endif
+}
+
+// Creates unrooted tree with single edge of length dEdgeLength.
+// Nodes for that edge are always 0 and 1; each stores the other in
+// neighbor slot 1 together with the edge length.
+void Tree::CreateUnrooted(double dEdgeLength)
+{
+    Clear();
+    ExpandCache();
+
+    // Register both nodes, mirroring CreateRooted() which sets the count
+    // to 1. Previously the count was not set here, so the debug Validate()
+    // and any later node-count check did not see the two nodes.
+    // NOTE(review): assumes Clear() leaves m_uNodeCount at 0 - confirm.
+    m_uNodeCount = 2;
+
+    m_uNeighbor1[0] = 1;
+    m_uNeighbor2[0] = NULL_NEIGHBOR;
+    m_uNeighbor3[0] = NULL_NEIGHBOR;
+
+    m_uNeighbor1[1] = 0;
+    m_uNeighbor2[1] = NULL_NEIGHBOR;
+    m_uNeighbor3[1] = NULL_NEIGHBOR;
+
+    m_dEdgeLength1[0] = dEdgeLength;
+    m_dEdgeLength1[1] = dEdgeLength;
+
+    m_bHasEdgeLength1[0] = true;
+    m_bHasEdgeLength1[1] = true;
+
+    // The remaining flags are not initialised by ExpandCache(); clear them
+    // explicitly, as CreateRooted() does for its node.
+    m_bHasEdgeLength2[0] = false;
+    m_bHasEdgeLength3[0] = false;
+    m_bHasEdgeLength2[1] = false;
+    m_bHasEdgeLength3[1] = false;
+    m_bHasHeight[0] = false;
+    m_bHasHeight[1] = false;
+
+    m_bRooted = false;
+
+#if DEBUG
+    Validate();
+#endif
+}
+
+// Replace the name stored for a leaf: the previous pointer is passed to
+// free() and the result of strsave(ptrName) is stored in its place.
+void Tree::SetLeafName(unsigned uNodeIndex, const char *ptrName)
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(IsLeaf(uNodeIndex));
+
+    char *&ptrSlot = m_ptrName[uNodeIndex];
+    free(ptrSlot);
+    ptrSlot = strsave(ptrName);
+}
+
+// Store uId as the id of the given leaf node.
+void Tree::SetLeafId(unsigned uNodeIndex, unsigned uId)
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(IsLeaf(uNodeIndex));
+
+    unsigned &uSlot = m_Ids[uNodeIndex];
+    uSlot = uId;
+}
+
+// Return the name pointer stored for the given leaf node. The pointer
+// refers to storage held by the tree; it is returned as stored and may be
+// null if no name was set.
+const char *Tree::GetLeafName(unsigned uNodeIndex) const
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(IsLeaf(uNodeIndex));
+
+    const char *ptrStored = m_ptrName[uNodeIndex];
+    return ptrStored;
+}
+
+// Return the id stored for the given leaf node.
+unsigned Tree::GetLeafId(unsigned uNodeIndex) const
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(IsLeaf(uNodeIndex));
+
+    const unsigned uStoredId = m_Ids[uNodeIndex];
+    return uStoredId;
+}
+
+// Append a new branch.
+// This adds two new nodes and joins them to an existing leaf node, which
+// receives them in its neighbor slots 2 and 3. Each new node stores the
+// existing node in slot 1 and has slots 2 and 3 null. All edge lengths
+// written here are 0, and the new nodes' length and height flags are false.
+// Return value is k, new nodes have indexes k and k+1 respectively.
+// Quit() is called if the tree holds no nodes.
+unsigned Tree::AppendBranch(unsigned uExistingLeafIndex)
+{
+    if (0 == m_uNodeCount)
+        Quit("Tree::AppendBranch: tree has not been created");
+
+#if DEBUG
+    assert(uExistingLeafIndex < m_uNodeCount);
+    if (!IsLeaf(uExistingLeafIndex))
+    {
+        LogMe();
+        Quit("AppendBranch(%u): not leaf", uExistingLeafIndex);
+    }
+#endif
+
+    // Same threshold as the former "m_uNodeCount >= m_uCacheCount - 2",
+    // but written as an addition so that it cannot wrap around when the
+    // capacity is smaller than 2 (which skipped the expansion entirely).
+    if (m_uNodeCount + 2 >= m_uCacheCount)
+        ExpandCache();
+
+    const unsigned uNewLeaf1 = m_uNodeCount;
+    const unsigned uNewLeaf2 = m_uNodeCount + 1;
+
+    m_uNodeCount += 2;
+
+    assert(m_uNeighbor2[uExistingLeafIndex] == NULL_NEIGHBOR);
+    assert(m_uNeighbor3[uExistingLeafIndex] == NULL_NEIGHBOR);
+
+    // Link the existing node to its two new children ...
+    m_uNeighbor2[uExistingLeafIndex] = uNewLeaf1;
+    m_uNeighbor3[uExistingLeafIndex] = uNewLeaf2;
+
+    // ... and the children back to it.
+    m_uNeighbor1[uNewLeaf1] = uExistingLeafIndex;
+    m_uNeighbor1[uNewLeaf2] = uExistingLeafIndex;
+
+    m_uNeighbor2[uNewLeaf1] = NULL_NEIGHBOR;
+    m_uNeighbor2[uNewLeaf2] = NULL_NEIGHBOR;
+
+    m_uNeighbor3[uNewLeaf1] = NULL_NEIGHBOR;
+    m_uNeighbor3[uNewLeaf2] = NULL_NEIGHBOR;
+
+    m_dEdgeLength2[uExistingLeafIndex] = 0;
+    m_dEdgeLength3[uExistingLeafIndex] = 0;
+
+    m_dEdgeLength1[uNewLeaf1] = 0;
+    m_dEdgeLength2[uNewLeaf1] = 0;
+    m_dEdgeLength3[uNewLeaf1] = 0;
+
+    m_dEdgeLength1[uNewLeaf2] = 0;
+    m_dEdgeLength2[uNewLeaf2] = 0;
+    m_dEdgeLength3[uNewLeaf2] = 0;
+
+    m_bHasEdgeLength1[uNewLeaf1] = false;
+    m_bHasEdgeLength2[uNewLeaf1] = false;
+    m_bHasEdgeLength3[uNewLeaf1] = false;
+
+    m_bHasEdgeLength1[uNewLeaf2] = false;
+    m_bHasEdgeLength2[uNewLeaf2] = false;
+    m_bHasEdgeLength3[uNewLeaf2] = false;
+
+    m_bHasHeight[uNewLeaf1] = false;
+    m_bHasHeight[uNewLeaf2] = false;
+
+    return uNewLeaf1;
+}
+
+// Write a table of the tree to the log: a header whose column titles
+// depend on whether the tree is rooted, then one row per node showing each
+// non-null neighbor with its stored edge length, a [ROOT] marker on the
+// root of a rooted tree, and the node name when one is set.
+void Tree::LogMe() const
+{
+    Log("Tree::LogMe %u nodes, ", m_uNodeCount);
+
+    if (!IsRooted())
+    {
+        Log("unrooted.\n");
+        Log("\n");
+        Log("Index Nbr_1 Length1 Nbr_2 Length2 Nbr_3 Length3 Name\n");
+        Log("----- ----- ------- ----- ------- ----- ------- ----\n");
+    }
+    else
+    {
+        Log("rooted.\n");
+        Log("\n");
+        Log("Index Parnt LengthP Left LengthL Right LengthR Name\n");
+        Log("----- ----- ------- ---- ------- ----- ------- ----\n");
+    }
+
+    // The three neighbor slots and their lengths, walked in slot order.
+    const unsigned *const Neighbors[3] =
+      { m_uNeighbor1, m_uNeighbor2, m_uNeighbor3 };
+    const double *const Lengths[3] =
+      { m_dEdgeLength1, m_dEdgeLength2, m_dEdgeLength3 };
+
+    for (unsigned uRow = 0; uRow < m_uNodeCount; ++uRow)
+    {
+        Log("%5u ", uRow);
+        for (unsigned uSlot = 0; uSlot < 3; ++uSlot)
+        {
+            const unsigned uNbr = Neighbors[uSlot][uRow];
+            if (NULL_NEIGHBOR == uNbr)
+                Log(" ");
+            else
+                Log("%5u %7.3g ", uNbr, Lengths[uSlot][uRow]);
+        }
+        if (m_bRooted && uRow == m_uRootNodeIndex)
+            Log("[ROOT] ");
+        const char *ptrLabel = m_ptrName[uRow];
+        if (0 != ptrLabel)
+            Log("%s", ptrLabel);
+        Log("\n");
+    }
+}
+
+// Store dLength for the edge between the two nodes at both ends and mark
+// the length as present at both ends. At each end the slot holding the
+// other node is used; slot 3 is the fallback when slots 1 and 2 do not
+// match.
+void Tree::SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2,
+  double dLength)
+{
+    assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount);
+    assert(IsEdge(uNodeIndex1, uNodeIndex2));
+
+    double *ptrLength = 0;
+    bool *ptrHasLength = 0;
+
+    // End stored at node 1.
+    if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2)
+    {
+        ptrLength = &m_dEdgeLength1[uNodeIndex1];
+        ptrHasLength = &m_bHasEdgeLength1[uNodeIndex1];
+    }
+    else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2)
+    {
+        ptrLength = &m_dEdgeLength2[uNodeIndex1];
+        ptrHasLength = &m_bHasEdgeLength2[uNodeIndex1];
+    }
+    else
+    {
+        assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2);
+        ptrLength = &m_dEdgeLength3[uNodeIndex1];
+        ptrHasLength = &m_bHasEdgeLength3[uNodeIndex1];
+    }
+    *ptrLength = dLength;
+    *ptrHasLength = true;
+
+    // End stored at node 2.
+    if (m_uNeighbor1[uNodeIndex2] == uNodeIndex1)
+    {
+        ptrLength = &m_dEdgeLength1[uNodeIndex2];
+        ptrHasLength = &m_bHasEdgeLength1[uNodeIndex2];
+    }
+    else if (m_uNeighbor2[uNodeIndex2] == uNodeIndex1)
+    {
+        ptrLength = &m_dEdgeLength2[uNodeIndex2];
+        ptrHasLength = &m_bHasEdgeLength2[uNodeIndex2];
+    }
+    else
+    {
+        assert(m_uNeighbor3[uNodeIndex2] == uNodeIndex1);
+        ptrLength = &m_dEdgeLength3[uNodeIndex2];
+        ptrHasLength = &m_bHasEdgeLength3[uNodeIndex2];
+    }
+    *ptrLength = dLength;
+    *ptrHasLength = true;
+}
+
+// Turn a rooted tree into an unrooted one by appending one new node and
+// attaching it to node 0 (the root) through neighbor slot 1, with an edge
+// of length 0. The new node has no other neighbors.
+// Returns the index of the new node. Quit() is called if the tree is not
+// rooted.
+unsigned Tree::UnrootFromFile()
+{
+#if TRACE
+    Log("Before unroot:\n");
+    LogMe();
+#endif
+
+    if (!m_bRooted)
+        Quit("Tree::Unroot, not rooted");
+
+// Convention: root node is always node zero
+    assert(IsRoot(0));
+    assert(NULL_NEIGHBOR == m_uNeighbor1[0]);
+
+    // Make room for the extra node before writing to its slots, as
+    // RootUnrootedTree() does; previously no capacity check was made.
+    if (m_uNodeCount == m_uCacheCount)
+        ExpandCache();
+
+    const unsigned uThirdNode = m_uNodeCount++;
+
+    m_uNeighbor1[0] = uThirdNode;
+    m_uNeighbor1[uThirdNode] = 0;
+
+    m_uNeighbor2[uThirdNode] = NULL_NEIGHBOR;
+    m_uNeighbor3[uThirdNode] = NULL_NEIGHBOR;
+
+    // The new edge has length 0 at both ends and is flagged as having a
+    // length at both ends (node 0's flag used to be left untouched).
+    m_dEdgeLength1[0] = 0;
+    m_dEdgeLength1[uThirdNode] = 0;
+    m_bHasEdgeLength1[0] = true;
+    m_bHasEdgeLength1[uThirdNode] = true;
+
+    // Remaining per-node fields of the new node get defined values.
+    m_bHasEdgeLength2[uThirdNode] = false;
+    m_bHasEdgeLength3[uThirdNode] = false;
+    m_bHasHeight[uThirdNode] = false;
+    m_ptrName[uThirdNode] = 0;
+
+    m_bRooted = false;
+
+#if TRACE
+    Log("After unroot:\n");
+    LogMe();
+#endif
+
+    return uThirdNode;
+}
+
+// GetFirstNeighbor / GetSecondNeighbor play the role of GetLeft / GetRight
+// in an unrooted tree: given a node and one of its known neighbors, they
+// yield the other neighbors of that node. Which one counts as "first" is
+// simply slot order and carries no meaning.
+// This function returns the first non-null neighbor of uNodeIndex, in slot
+// order, that differs from uNeighborIndex, or NULL_NEIGHBOR when there is
+// none (as for a leaf).
+unsigned Tree::GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(uNeighborIndex < m_uNodeCount);
+    assert(IsEdge(uNodeIndex, uNeighborIndex));
+
+    unsigned uSlot = 0;
+    while (uSlot < 3)
+    {
+        const unsigned uCandidate = GetNeighbor(uNodeIndex, uSlot);
+        const bool bSkip =
+          NULL_NEIGHBOR == uCandidate || uNeighborIndex == uCandidate;
+        if (!bSkip)
+            return uCandidate;
+        ++uSlot;
+    }
+    return NULL_NEIGHBOR;
+}
+
+// Return the second non-null neighbor of uNodeIndex, in slot order, that
+// differs from uNeighborIndex, or NULL_NEIGHBOR when fewer than two such
+// neighbors exist.
+unsigned Tree::GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(uNeighborIndex < m_uNodeCount);
+    assert(IsEdge(uNodeIndex, uNeighborIndex));
+
+    unsigned uMatches = 0;
+    for (unsigned uSlot = 0; uSlot < 3; ++uSlot)
+    {
+        const unsigned uCandidate = GetNeighbor(uNodeIndex, uSlot);
+        if (NULL_NEIGHBOR == uCandidate || uNeighborIndex == uCandidate)
+            continue;
+        ++uMatches;
+        if (2 == uMatches)
+            return uCandidate;
+    }
+    return NULL_NEIGHBOR;
+}
+
+// Count the leaves on the uNodeIndex2 side of the edge
+// (uNodeIndex1, uNodeIndex2) of an unrooted tree, i.e. in the sub-tree
+// obtained by cutting that edge and treating uNodeIndex2 as its root.
+// *ptrdTotalDistance receives, for a leaf, the length of the edge itself;
+// for an internal node, the sum of the values reported by its two
+// sub-trees (the length of the edge to uNodeIndex1 is not added).
+unsigned Tree::GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2,
+  double *ptrdTotalDistance) const
+{
+    assert(!IsRooted());
+
+    if (!IsLeaf(uNodeIndex2))
+    {
+        // The two neighbors of uNodeIndex2 other than uNodeIndex1 are the
+        // roots of the sub-trees to recurse into.
+        const unsigned uFirst = GetFirstNeighbor(uNodeIndex2, uNodeIndex1);
+        const unsigned uSecond = GetSecondNeighbor(uNodeIndex2, uNodeIndex1);
+
+        double dFirstDistance;
+        double dSecondDistance;
+
+        const unsigned uFirstCount =
+          GetLeafCountUnrooted(uNodeIndex2, uFirst, &dFirstDistance);
+        const unsigned uSecondCount =
+          GetLeafCountUnrooted(uNodeIndex2, uSecond, &dSecondDistance);
+
+        *ptrdTotalDistance = dFirstDistance + dSecondDistance;
+        return uFirstCount + uSecondCount;
+    }
+
+    *ptrdTotalDistance = GetEdgeLength(uNodeIndex1, uNodeIndex2);
+    return 1;
+}
+
+// Convert an unrooted tree into a rooted one by inserting a new root node
+// on one edge. The edge (uNode1, uNode2) and the two lengths used for the
+// new root's edges are the values written by FindRoot() through its
+// pointer arguments (presumably chosen according to Method; FindRoot is
+// not defined in this file - confirm there).
+// The new root is appended at index m_uNodeCount. Validate() is always
+// run at the end.
+void Tree::RootUnrootedTree(ROOT Method)
+ {
+ assert(!IsRooted());
+#if TRACE
+ Log("Tree::RootUnrootedTree, before:");
+ LogMe();
+#endif
+
+ unsigned uNode1;
+ unsigned uNode2;
+ double dLength1;
+ double dLength2;
+ FindRoot(*this, &uNode1, &uNode2, &dLength1, &dLength2, Method);
+
+// Make room for the root node if the arrays are full.
+ if (m_uNodeCount == m_uCacheCount)
+ ExpandCache();
+ m_uRootNodeIndex = m_uNodeCount++;
+
+// NOTE(review): dEdgeLength is not used below; in builds with assertions
+// enabled the call still checks that the edge exists and has a length.
+ double dEdgeLength = GetEdgeLength(uNode1, uNode2);
+
+// Root: no parent (slot 1), the two edge ends become slots 2 and 3.
+ m_uNeighbor1[m_uRootNodeIndex] = NULL_NEIGHBOR;
+ m_uNeighbor2[m_uRootNodeIndex] = uNode1;
+ m_uNeighbor3[m_uRootNodeIndex] = uNode2;
+
+// In uNode1, replace the reference to uNode2 by a reference to the root.
+ if (m_uNeighbor1[uNode1] == uNode2)
+ m_uNeighbor1[uNode1] = m_uRootNodeIndex;
+ else if (m_uNeighbor2[uNode1] == uNode2)
+ m_uNeighbor2[uNode1] = m_uRootNodeIndex;
+ else
+ {
+ assert(m_uNeighbor3[uNode1] == uNode2);
+ m_uNeighbor3[uNode1] = m_uRootNodeIndex;
+ }
+
+// Likewise in uNode2, replace the reference to uNode1.
+ if (m_uNeighbor1[uNode2] == uNode1)
+ m_uNeighbor1[uNode2] = m_uRootNodeIndex;
+ else if (m_uNeighbor2[uNode2] == uNode1)
+ m_uNeighbor2[uNode2] = m_uRootNodeIndex;
+ else
+ {
+ assert(m_uNeighbor3[uNode2] == uNode1);
+ m_uNeighbor3[uNode2] = m_uRootNodeIndex;
+ }
+
+// Rearrange both sub-trees so that slot 1 of every node holds its parent.
+ OrientParent(uNode1, m_uRootNodeIndex);
+ OrientParent(uNode2, m_uRootNodeIndex);
+
+// Lengths are set after orientation; SetEdgeLength updates both ends.
+ SetEdgeLength(m_uRootNodeIndex, uNode1, dLength1);
+ SetEdgeLength(m_uRootNodeIndex, uNode2, dLength2);
+
+ m_bHasHeight[m_uRootNodeIndex] = false;
+
+ m_ptrName[m_uRootNodeIndex] = 0;
+
+ m_bRooted = true;
+
+#if TRACE
+ Log("\nPhy::RootUnrootedTree, after:");
+ LogMe();
+#endif
+
+ Validate();
+ }
+
+// Return the "has length" flag stored at node 1 for the edge leading to
+// node 2. The flag comes from whichever slot of node 1 holds node 2;
+// slot 3 is used when neither slot 1 nor slot 2 matches.
+bool Tree::HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const
+{
+    assert(uNodeIndex1 < m_uNodeCount);
+    assert(uNodeIndex2 < m_uNodeCount);
+    assert(IsEdge(uNodeIndex1, uNodeIndex2));
+
+    const bool *ptrFlags = m_bHasEdgeLength3;
+    if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2)
+        ptrFlags = m_bHasEdgeLength1;
+    else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2)
+        ptrFlags = m_bHasEdgeLength2;
+    else
+        assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2);
+
+    return ptrFlags[uNodeIndex1];
+}
+
+// Rearrange the neighbor slots of uNodeIndex so that uParentNodeIndex sits
+// in slot 1, then do the same recursively for the nodes left in slots 2
+// and 3 with uNodeIndex as their parent. Does nothing when uNodeIndex is
+// NULL_NEIGHBOR, which ends the recursion below leaves.
+// When the parent is found in slot 2 or 3, the whole slot contents are
+// exchanged with slot 1: neighbor index, edge length and the "has length"
+// flag. (The flag used to stay behind, so it could end up describing the
+// wrong edge.)
+void Tree::OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex)
+{
+    if (NULL_NEIGHBOR == uNodeIndex)
+        return;
+
+    if (m_uNeighbor1[uNodeIndex] != uParentNodeIndex)
+    {
+        // Pick the slot currently holding the parent; slot 3 is the
+        // fallback, as before.
+        unsigned *ptrNeighbor = 0;
+        double *ptrLength = 0;
+        bool *ptrHasLength = 0;
+        if (m_uNeighbor2[uNodeIndex] == uParentNodeIndex)
+        {
+            ptrNeighbor = &m_uNeighbor2[uNodeIndex];
+            ptrLength = &m_dEdgeLength2[uNodeIndex];
+            ptrHasLength = &m_bHasEdgeLength2[uNodeIndex];
+        }
+        else
+        {
+            assert(m_uNeighbor3[uNodeIndex] == uParentNodeIndex);
+            ptrNeighbor = &m_uNeighbor3[uNodeIndex];
+            ptrLength = &m_dEdgeLength3[uNodeIndex];
+            ptrHasLength = &m_bHasEdgeLength3[uNodeIndex];
+        }
+
+        const double dParentLength = *ptrLength;
+        const bool bParentHasLength = *ptrHasLength;
+
+        *ptrNeighbor = m_uNeighbor1[uNodeIndex];
+        *ptrLength = m_dEdgeLength1[uNodeIndex];
+        *ptrHasLength = m_bHasEdgeLength1[uNodeIndex];
+
+        m_uNeighbor1[uNodeIndex] = uParentNodeIndex;
+        m_dEdgeLength1[uNodeIndex] = dParentLength;
+        m_bHasEdgeLength1[uNodeIndex] = bParentHasLength;
+    }
+
+    OrientParent(m_uNeighbor2[uNodeIndex], uNodeIndex);
+    OrientParent(m_uNeighbor3[uNodeIndex], uNodeIndex);
+}
+
+// Return the starting node of the depth-first traversal: the leaf reached
+// from the root by following left children only.
+unsigned Tree::FirstDepthFirstNode() const
+{
+    assert(IsRooted());
+
+    unsigned uCurrent = m_uRootNodeIndex;
+    for (;;)
+    {
+        if (IsLeaf(uCurrent))
+            return uCurrent;
+        uCurrent = GetLeft(uCurrent);
+    }
+}
+
+// Return the starting node of the mirrored depth-first traversal: the leaf
+// reached from the root by following right children only.
+unsigned Tree::FirstDepthFirstNodeR() const
+{
+    assert(IsRooted());
+
+    unsigned uCurrent = m_uRootNodeIndex;
+    for (;;)
+    {
+        if (IsLeaf(uCurrent))
+            return uCurrent;
+        uCurrent = GetRight(uCurrent);
+    }
+}
+
+// Return the node that follows uNodeIndex in the depth-first traversal
+// started by FirstDepthFirstNode():
+//   - for the root, NULL_NEIGHBOR (end of traversal);
+//   - for a right child, its parent;
+//   - otherwise, the leaf reached by going to the parent's right child and
+//     then following left children.
+unsigned Tree::NextDepthFirstNode(unsigned uNodeIndex) const
+{
+#if TRACE
+    Log("NextDepthFirstNode(%3u) ", uNodeIndex);
+#endif
+
+    assert(IsRooted());
+    assert(uNodeIndex < m_uNodeCount);
+
+    if (IsRoot(uNodeIndex))
+    {
+#if TRACE
+        Log(">> Node %u is root, end of traversal\n", uNodeIndex);
+#endif
+        return NULL_NEIGHBOR;
+    }
+
+    const unsigned uParent = GetParent(uNodeIndex);
+    const unsigned uRightOfParent = GetRight(uParent);
+    if (uRightOfParent == uNodeIndex)
+    {
+#if TRACE
+        Log(">> Is right branch, return parent=%u\n", uParent);
+#endif
+        return uParent;
+    }
+
+    unsigned uCurrent = uRightOfParent;
+#if TRACE
+    Log(">> Descend left from right sibling=%u ... ", uCurrent);
+#endif
+    for (; !IsLeaf(uCurrent); uCurrent = GetLeft(uCurrent))
+        ;
+
+#if TRACE
+    Log("bottom out at leaf=%u\n", uCurrent);
+#endif
+    return uCurrent;
+}
+
+// Mirror image of NextDepthFirstNode(), for the traversal started by
+// FirstDepthFirstNodeR():
+//   - for the root, NULL_NEIGHBOR (end of traversal);
+//   - for a left child, its parent;
+//   - otherwise, the leaf reached by going to the parent's left child and
+//     then following right children.
+unsigned Tree::NextDepthFirstNodeR(unsigned uNodeIndex) const
+{
+#if TRACE
+    Log("NextDepthFirstNode(%3u) ", uNodeIndex);
+#endif
+
+    assert(IsRooted());
+    assert(uNodeIndex < m_uNodeCount);
+
+    if (IsRoot(uNodeIndex))
+    {
+#if TRACE
+        Log(">> Node %u is root, end of traversal\n", uNodeIndex);
+#endif
+        return NULL_NEIGHBOR;
+    }
+
+    const unsigned uParent = GetParent(uNodeIndex);
+    const unsigned uLeftOfParent = GetLeft(uParent);
+    if (uLeftOfParent == uNodeIndex)
+    {
+#if TRACE
+        Log(">> Is left branch, return parent=%u\n", uParent);
+#endif
+        return uParent;
+    }
+
+    unsigned uCurrent = uLeftOfParent;
+#if TRACE
+    Log(">> Descend right from left sibling=%u ... ", uCurrent);
+#endif
+    for (; !IsLeaf(uCurrent); uCurrent = GetRight(uCurrent))
+        ;
+
+#if TRACE
+    Log("bottom out at leaf=%u\n", uCurrent);
+#endif
+    return uCurrent;
+}
+
+// Convert a rooted tree into an unrooted one by removing the root node.
+// The root's two children become direct neighbors (each stores the other
+// in slot 1); when both root edges have lengths, the joined edge gets
+// their sum. The root's entry is then deleted from the per-node arrays,
+// later entries move down by one, and neighbor indexes above the old root
+// index are decremented to match. Validate() is run at the end.
+void Tree::UnrootByDeletingRoot()
+{
+    assert(IsRooted());
+    assert(m_uNodeCount >= 3);
+
+    const unsigned uRoot = m_uRootNodeIndex;
+    const unsigned uLeft = GetLeft(uRoot);
+    const unsigned uRight = GetRight(uRoot);
+
+    m_uNeighbor1[uLeft] = uRight;
+    m_uNeighbor1[uRight] = uLeft;
+
+    // These queries read the root's own slots, which are still intact.
+    bool bHasEdgeLength = HasEdgeLength(uRoot, uLeft) &&
+      HasEdgeLength(uRoot, uRight);
+    if (bHasEdgeLength)
+    {
+        double dEdgeLength = GetEdgeLength(uRoot, uLeft) +
+          GetEdgeLength(uRoot, uRight);
+        m_dEdgeLength1[uLeft] = dEdgeLength;
+        m_dEdgeLength1[uRight] = dEdgeLength;
+    }
+
+// Remove root node entry from arrays.
+    // Only the entries after the root move: m_uNodeCount - uRoot - 1 of
+    // them. The former count was one larger and read the element at index
+    // m_uNodeCount, which is past the last node and can be past the end
+    // of the arrays when they are full.
+    const size_t uMoveCount = m_uNodeCount - uRoot - 1;
+
+    const size_t uUnsBytes = uMoveCount*sizeof(unsigned);
+    memmove(m_uNeighbor1 + uRoot, m_uNeighbor1 + uRoot + 1, uUnsBytes);
+    memmove(m_uNeighbor2 + uRoot, m_uNeighbor2 + uRoot + 1, uUnsBytes);
+    memmove(m_uNeighbor3 + uRoot, m_uNeighbor3 + uRoot + 1, uUnsBytes);
+    // Ids were previously left in place and so no longer lined up with
+    // nodes that had moved down.
+    memmove(m_Ids + uRoot, m_Ids + uRoot + 1, uUnsBytes);
+
+    const size_t uDoubleBytes = uMoveCount*sizeof(double);
+    memmove(m_dEdgeLength1 + uRoot, m_dEdgeLength1 + uRoot + 1, uDoubleBytes);
+    memmove(m_dEdgeLength2 + uRoot, m_dEdgeLength2 + uRoot + 1, uDoubleBytes);
+    memmove(m_dEdgeLength3 + uRoot, m_dEdgeLength3 + uRoot + 1, uDoubleBytes);
+    memmove(m_dHeight + uRoot, m_dHeight + uRoot + 1, uDoubleBytes);
+
+    const size_t uBoolBytes = uMoveCount*sizeof(bool);
+    memmove(m_bHasEdgeLength1 + uRoot, m_bHasEdgeLength1 + uRoot + 1, uBoolBytes);
+    memmove(m_bHasEdgeLength2 + uRoot, m_bHasEdgeLength2 + uRoot + 1, uBoolBytes);
+    memmove(m_bHasEdgeLength3 + uRoot, m_bHasEdgeLength3 + uRoot + 1, uBoolBytes);
+    memmove(m_bHasHeight + uRoot, m_bHasHeight + uRoot + 1, uBoolBytes);
+
+    const size_t uPtrBytes = uMoveCount*sizeof(char *);
+    memmove(m_ptrName + uRoot, m_ptrName + uRoot + 1, uPtrBytes);
+
+    --m_uNodeCount;
+    m_bRooted = false;
+
+    // The vacated last slot would otherwise keep a second copy of a name
+    // pointer that now also lives one slot lower.
+    m_ptrName[m_uNodeCount] = 0;
+
+// Fix up table entries: indexes above the deleted root shift down by one.
+    for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+    {
+        unsigned *const Slots[3] =
+          {
+          &m_uNeighbor1[uNodeIndex],
+          &m_uNeighbor2[uNodeIndex],
+          &m_uNeighbor3[uNodeIndex]
+          };
+        for (unsigned n = 0; n < 3; ++n)
+        {
+            unsigned &uNeighbor = *Slots[n];
+            if (uNeighbor != NULL_NEIGHBOR && uNeighbor > uRoot)
+                --uNeighbor;
+        }
+    }
+
+    Validate();
+}
+
+// Return the node a leaf is attached to. In a rooted tree this is
+// GetParent(); otherwise it is the first non-null entry among neighbor
+// slots 1 and 2, falling back to whatever slot 3 holds.
+unsigned Tree::GetLeafParent(unsigned uNodeIndex) const
+{
+    assert(IsLeaf(uNodeIndex));
+
+    if (IsRooted())
+        return GetParent(uNodeIndex);
+
+    const unsigned uSlot1 = m_uNeighbor1[uNodeIndex];
+    if (NULL_NEIGHBOR != uSlot1)
+        return uSlot1;
+
+    const unsigned uSlot2 = m_uNeighbor2[uNodeIndex];
+    return (NULL_NEIGHBOR != uSlot2) ? uSlot2 : m_uNeighbor3[uNodeIndex];
+}
+
+// Return the height of a node in a rooted tree; Quit() is called when the
+// tree is not rooted. A leaf has height 0. For an internal node the height
+// is the mean of (edge length + child height) over its left and right
+// children, with negative edge lengths counted as 0. The result is stored
+// in m_dHeight / m_bHasHeight and reused on later calls.
+// TODO (from the original author): not efficient for large trees.
+double Tree::GetNodeHeight(unsigned uNodeIndex) const
+{
+    if (!IsRooted())
+        Quit("Tree::GetNodeHeight: undefined unless rooted tree");
+
+    if (IsLeaf(uNodeIndex))
+        return 0.0;
+
+    if (m_bHasHeight[uNodeIndex])
+        return m_dHeight[uNodeIndex];
+
+    const unsigned uLeftChild = GetLeft(uNodeIndex);
+    const unsigned uRightChild = GetRight(uNodeIndex);
+
+    const double dRawLeft = GetEdgeLength(uNodeIndex, uLeftChild);
+    const double dRawRight = GetEdgeLength(uNodeIndex, uRightChild);
+    const double dLeftLength = (dRawLeft < 0) ? 0 : dRawLeft;
+    const double dRightLength = (dRawRight < 0) ? 0 : dRawRight;
+
+    const double dViaLeft = dLeftLength + GetNodeHeight(uLeftChild);
+    const double dViaRight = dRightLength + GetNodeHeight(uRightChild);
+    const double dResult = (dViaLeft + dViaRight)/2;
+
+    m_bHasHeight[uNodeIndex] = true;
+    m_dHeight[uNodeIndex] = dResult;
+    return dResult;
+}
+
+// Return the subscript (0, 1 or 2) of the neighbor slot of uNodeIndex that
+// holds uNeighborIndex, or NULL_NEIGHBOR when no slot holds it.
+unsigned Tree::GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const
+{
+    assert(uNodeIndex < m_uNodeCount);
+    assert(uNeighborIndex < m_uNodeCount);
+
+    unsigned uSubscript = NULL_NEIGHBOR;
+    if (m_uNeighbor1[uNodeIndex] == uNeighborIndex)
+        uSubscript = 0;
+    else if (m_uNeighbor2[uNodeIndex] == uNeighborIndex)
+        uSubscript = 1;
+    else if (m_uNeighbor3[uNodeIndex] == uNeighborIndex)
+        uSubscript = 2;
+    return uSubscript;
+}
+
+// Return the content of neighbor slot uNeighborSubscript (0, 1 or 2) of
+// the given node. Any other subscript is reported through Quit().
+unsigned Tree::GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const
+{
+    if (0 == uNeighborSubscript)
+        return m_uNeighbor1[uNodeIndex];
+    if (1 == uNeighborSubscript)
+        return m_uNeighbor2[uNodeIndex];
+    if (2 == uNeighborSubscript)
+        return m_uNeighbor3[uNodeIndex];
+
+    Quit("Tree::GetNeighbor, sub=%u", uNeighborSubscript);
+    return NULL_NEIGHBOR;
+}
+
+// Return the node index of the uLeafIndex-th leaf (counting from 0), where
+// leaves are numbered in order of increasing node index. Quit() is called
+// when the tree has too few leaves.
+// TODO (from the original author): check if this linear scan is a
+// performance issue; a lookup table could be cached.
+unsigned Tree::LeafIndexToNodeIndex(unsigned uLeafIndex) const
+{
+    const unsigned uTotal = GetNodeCount();
+    unsigned uLeavesToSkip = uLeafIndex;
+    for (unsigned uNode = 0; uNode < uTotal; ++uNode)
+    {
+        if (!IsLeaf(uNode))
+            continue;
+        if (0 == uLeavesToSkip)
+            return uNode;
+        --uLeavesToSkip;
+    }
+    Quit("LeafIndexToNodeIndex: out of range");
+    return 0;
+}
+
+// Return the index of the first leaf, in node-index order, whose stored
+// name compares equal to ptrName. Quit() is called when no leaf matches.
+// Leaves without a name are skipped: name slots start out as null
+// pointers, and handing one to strcmp() is undefined behavior.
+unsigned Tree::GetLeafNodeIndex(const char *ptrName) const
+{
+    const unsigned uNodeCount = GetNodeCount();
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+    {
+        if (!IsLeaf(uNodeIndex))
+            continue;
+        const char *ptrLeafName = GetLeafName(uNodeIndex);
+        if (0 == ptrLeafName)
+            continue;
+        if (0 == strcmp(ptrName, ptrLeafName))
+            return uNodeIndex;
+    }
+    Quit("Tree::GetLeafNodeIndex, name not found");
+    return 0;
+}
+
+// Make this tree a copy of another one. InitCache() is called with the
+// source's node count, the numeric and flag arrays are copied entry by
+// entry for that many nodes, and the root index and rooted flag are taken
+// over. Leaf names are duplicated with strsave(); the name slots of
+// non-leaf nodes are set to null.
+void Tree::Copy(const Tree &tree)
+{
+    const unsigned uSourceNodes = tree.GetNodeCount();
+    InitCache(uSourceNodes);
+
+    m_uNodeCount = uSourceNodes;
+    m_uRootNodeIndex = tree.m_uRootNodeIndex;
+    m_bRooted = tree.m_bRooted;
+
+    // Neighbor links and ids.
+    const size_t uBytesUnsigned = uSourceNodes*sizeof(unsigned);
+    memcpy(m_uNeighbor1, tree.m_uNeighbor1, uBytesUnsigned);
+    memcpy(m_uNeighbor2, tree.m_uNeighbor2, uBytesUnsigned);
+    memcpy(m_uNeighbor3, tree.m_uNeighbor3, uBytesUnsigned);
+    memcpy(m_Ids, tree.m_Ids, uBytesUnsigned);
+
+    // Edge lengths and heights.
+    const size_t uBytesDouble = uSourceNodes*sizeof(double);
+    memcpy(m_dEdgeLength1, tree.m_dEdgeLength1, uBytesDouble);
+    memcpy(m_dEdgeLength2, tree.m_dEdgeLength2, uBytesDouble);
+    memcpy(m_dEdgeLength3, tree.m_dEdgeLength3, uBytesDouble);
+    memcpy(m_dHeight, tree.m_dHeight, uBytesDouble);
+
+    // Presence flags.
+    const size_t uBytesBool = uSourceNodes*sizeof(bool);
+    memcpy(m_bHasEdgeLength1, tree.m_bHasEdgeLength1, uBytesBool);
+    memcpy(m_bHasEdgeLength2, tree.m_bHasEdgeLength2, uBytesBool);
+    memcpy(m_bHasEdgeLength3, tree.m_bHasEdgeLength3, uBytesBool);
+    memcpy(m_bHasHeight, tree.m_bHasHeight, uBytesBool);
+
+    // Names: each leaf gets its own string.
+    for (unsigned uNode = 0; uNode < m_uNodeCount; ++uNode)
+    {
+        if (!tree.IsLeaf(uNode))
+        {
+            m_ptrName[uNode] = 0;
+            continue;
+        }
+        const char *ptrSourceName = tree.GetLeafName(uNode);
+        m_ptrName[uNode] = strsave(ptrSourceName);
+    }
+
+#if DEBUG
+    Validate();
+#endif
+}
+
+// Build a rooted tree from a vector description.
+// With N = uLeafCount, leaves have node indexes 0..N-1 and internal nodes
+// N..2N-2. The Left/Right/LeftLength/RightLength vectors have one entry
+// per internal node and are subscripted by (node index - N); the values in
+// Left and Right are node indexes in 0..2N-2. Example: N=6 and Left[2]=1
+// says that the third internal node (node index 8) has the second leaf
+// (node index 1) as its left child.
+// uRoot is the vector subscript of the root, so its node index is
+// uRoot + N. LeafIds and LeafNames supply one id and one name per leaf;
+// names are duplicated with strsave().
+// Validate() is run on the finished tree.
+void Tree::Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[],
+  const unsigned Right[], const float LeftLength[], const float RightLength[],
+  const unsigned LeafIds[], char **LeafNames)
+{
+    Clear();
+
+    m_uNodeCount = 2*uLeafCount - 1;
+    InitCache(m_uNodeCount);
+
+    // Leaves: id and a private copy of the name.
+    for (unsigned uLeaf = 0; uLeaf < uLeafCount; ++uLeaf)
+    {
+        m_Ids[uLeaf] = LeafIds[uLeaf];
+        m_ptrName[uLeaf] = strsave(LeafNames[uLeaf]);
+    }
+
+    // Internal nodes: children go in slots 2 and 3, and each child records
+    // this node in its slot 1, with the edge length stored at both ends.
+    for (unsigned uInternal = 0; uLeafCount + uInternal < m_uNodeCount; ++uInternal)
+    {
+        const unsigned uNode = uLeafCount + uInternal;
+        const unsigned uLeftChild = Left[uInternal];
+        const unsigned uRightChild = Right[uInternal];
+        const float fLeftLength = LeftLength[uInternal];
+        const float fRightLength = RightLength[uInternal];
+
+        // Left edge.
+        m_uNeighbor2[uNode] = uLeftChild;
+        m_dEdgeLength2[uNode] = fLeftLength;
+        m_bHasEdgeLength2[uNode] = true;
+
+        // Right edge.
+        m_uNeighbor3[uNode] = uRightChild;
+        m_dEdgeLength3[uNode] = fRightLength;
+        m_bHasEdgeLength3[uNode] = true;
+
+        // Back links, left child first and right child second.
+        m_uNeighbor1[uLeftChild] = uNode;
+        m_uNeighbor1[uRightChild] = uNode;
+
+        m_dEdgeLength1[uLeftChild] = fLeftLength;
+        m_dEdgeLength1[uRightChild] = fRightLength;
+
+        m_bHasEdgeLength1[uLeftChild] = true;
+        m_bHasEdgeLength1[uRightChild] = true;
+    }
+
+    m_bRooted = true;
+    m_uRootNodeIndex = uRoot + uLeafCount;
+
+    Validate();
+}
Added: trunk/packages/muscle/branches/upstream/current/phy2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phy2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phy2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,282 @@
+#include "muscle.h"
+#include "tree.h"
+
+#define TRACE 0
+
+// Advance the enumeration state ES to the next edge of the tree,
+// visiting nodes in depth-first order. On success the edge is
+// stored as (ES.m_uNodeIndex1, ES.m_uNodeIndex2) = (node, parent
+// of node) and true is returned. Returns false when done.
+bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES)
+    {
+    unsigned uChild = uInsane;
+
+    if (ES.m_bInit)
+        {
+        // Continue from the node reported by the previous call.
+        uChild = tree.NextDepthFirstNode(ES.m_uNodeIndex1);
+        if (NULL_NEIGHBOR == uChild)
+            return false;
+
+        // Step past the root of a rooted tree.
+        if (tree.IsRooted() && tree.IsRoot(uChild))
+            {
+            uChild = tree.NextDepthFirstNode(uChild);
+            if (NULL_NEIGHBOR == uChild)
+                return false;
+            }
+        }
+    else
+        {
+        // First call: a tree with at most one node has no edges.
+        if (tree.GetNodeCount() <= 1)
+            {
+            ES.m_uNodeIndex1 = NULL_NEIGHBOR;
+            ES.m_uNodeIndex2 = NULL_NEIGHBOR;
+            return false;
+            }
+        uChild = tree.FirstDepthFirstNode();
+        ES.m_bInit = true;
+        }
+
+    const unsigned uParent = tree.GetParent(uChild);
+    ES.m_uNodeIndex1 = uChild;
+    ES.m_uNodeIndex2 = uParent;
+    return true;
+    }
+
+// Reverse-order counterpart of PhyEnumEdges: advance ES to the
+// next edge, visiting nodes with FirstDepthFirstNodeR /
+// NextDepthFirstNodeR. On success the edge is stored as
+// (ES.m_uNodeIndex1, ES.m_uNodeIndex2) = (node, parent of node)
+// and true is returned. Returns false when done.
+bool PhyEnumEdgesR(const Tree &tree, PhyEnumEdgeState &ES)
+    {
+    unsigned uNode1 = uInsane;
+
+    if (!ES.m_bInit)
+        {
+        // First call: a tree with at most one node has no edges.
+        if (tree.GetNodeCount() <= 1)
+            {
+            ES.m_uNodeIndex1 = NULL_NEIGHBOR;
+            ES.m_uNodeIndex2 = NULL_NEIGHBOR;
+            return false;
+            }
+        // NOTE(review): if the reverse traversal starts at the root of a
+        // rooted tree, the first edge reported has no parent -- confirm
+        // against FirstDepthFirstNodeR.
+        uNode1 = tree.FirstDepthFirstNodeR();
+        ES.m_bInit = true;
+        }
+    else
+        {
+        uNode1 = tree.NextDepthFirstNodeR(ES.m_uNodeIndex1);
+        if (NULL_NEIGHBOR == uNode1)
+            return false;
+        if (tree.IsRooted() && tree.IsRoot(uNode1))
+            {
+            // Step past the root using the same (reverse) iterator as
+            // the rest of this enumeration; the forward iterator was
+            // used here previously, mixing two traversal orders.
+            uNode1 = tree.NextDepthFirstNodeR(uNode1);
+            if (NULL_NEIGHBOR == uNode1)
+                return false;
+            }
+        }
+    unsigned uNode2 = tree.GetParent(uNode1);
+
+    ES.m_uNodeIndex1 = uNode1;
+    ES.m_uNodeIndex2 = uNode2;
+    return true;
+    }
+
+// Append to Leaves[] every leaf reached from uNodeIndex1 without
+// going back through uNodeIndex2 (the node we arrived from).
+// *ptruCount is the number of entries already in Leaves[] and is
+// incremented for each leaf appended.
+static void GetLeavesSubtree(const Tree &tree, unsigned uNodeIndex1,
+  const unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount)
+    {
+    if (!tree.IsLeaf(uNodeIndex1))
+        {
+        // The two neighbors of this node other than uNodeIndex2.
+        const unsigned uFirst = tree.GetFirstNeighbor(uNodeIndex1, uNodeIndex2);
+        const unsigned uSecond = tree.GetSecondNeighbor(uNodeIndex1, uNodeIndex2);
+
+        if (uFirst != NULL_NEIGHBOR)
+            GetLeavesSubtree(tree, uFirst, uNodeIndex1, Leaves, ptruCount);
+        if (uSecond != NULL_NEIGHBOR)
+            GetLeavesSubtree(tree, uSecond, uNodeIndex1, Leaves, ptruCount);
+        return;
+        }
+
+    const unsigned uSlot = *ptruCount;
+    Leaves[uSlot] = uNodeIndex1;
+    *ptruCount = uSlot + 1;
+    }
+
+// Collect into Leaves[] the leaves on the uNodeIndex1 side of the
+// edge (uNodeIndex1, uNodeIndex2); *ptruCount receives how many
+// were stored.
+static void PhyGetLeaves(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2,
+  unsigned Leaves[], unsigned *ptruCount)
+    {
+    // Start from an empty list; the recursive helper appends.
+    ptruCount[0] = 0;
+    GetLeavesSubtree(tree, uNodeIndex1, uNodeIndex2, Leaves, ptruCount);
+    }
+
+// Enumerate the leaf bipartitions induced by the edges of the tree.
+// Each call advances ES by one edge (via PhyEnumEdges) and writes the
+// leaves on either side of that edge to Leaves1[] / Leaves2[], with
+// their counts in *ptruCount1 / *ptruCount2.
+// Returns false when there are no more edges; in that case both
+// counts are set to zero.
+bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES,
+  unsigned Leaves1[], unsigned *ptruCount1,
+  unsigned Leaves2[], unsigned *ptruCount2)
+    {
+    bool bOk = PhyEnumEdges(tree, ES);
+    if (!bOk)
+        {
+        *ptruCount1 = 0;
+        *ptruCount2 = 0;
+        return false;
+        }
+
+// Special case: in a rooted tree, both edges from the root
+// give the same bipartition, so skip one of them.
+    if (tree.IsRooted() && tree.IsRoot(ES.m_uNodeIndex2)
+      && tree.GetRight(ES.m_uNodeIndex2) == ES.m_uNodeIndex1)
+        {
+        bOk = PhyEnumEdges(tree, ES);
+        if (!bOk)
+            {
+            // Report empty partitions here too, so that both "done"
+            // paths leave the outputs in the same state.
+            *ptruCount1 = 0;
+            *ptruCount2 = 0;
+            return false;
+            }
+        }
+
+    PhyGetLeaves(tree, ES.m_uNodeIndex1, ES.m_uNodeIndex2, Leaves1, ptruCount1);
+    PhyGetLeaves(tree, ES.m_uNodeIndex2, ES.m_uNodeIndex1, Leaves2, ptruCount2);
+
+    // The two sides together must account for every leaf.
+    if (*ptruCount1 + *ptruCount2 != tree.GetLeafCount())
+        Quit("PhyEnumBiParts %u + %u != %u",
+          *ptruCount1, *ptruCount2, tree.GetLeafCount());
+#if DEBUG
+    {
+    // Expensive check: all entries are leaves and the sides are disjoint.
+    for (unsigned i = 0; i < *ptruCount1; ++i)
+        {
+        if (!tree.IsLeaf(Leaves1[i]))
+            Quit("PhyEnumByParts: not leaf");
+        for (unsigned j = 0; j < *ptruCount2; ++j)
+            {
+            if (!tree.IsLeaf(Leaves2[j]))
+                Quit("PhyEnumByParts: not leaf");
+            if (Leaves1[i] == Leaves2[j])
+                Quit("PhyEnumByParts: dupe");
+            }
+        }
+    }
+#endif
+
+    return true;
+    }
+
+// Disabled (#if 0) manual test driver for PhyEnumBiParts: reads a tree
+// from a hard-coded file, then logs every bipartition it enumerates.
+// NOTE(review): Leaves1/Leaves2 are never freed and bDone is unused;
+// harmless while this stays compiled out.
+#if 0
+void TestBiPart()
+ {
+ SetListFileName("c:\\tmp\\lobster.log", false);
+ Tree tree;
+ TextFile fileIn("c:\\tmp\\test.phy");
+ tree.FromFile(fileIn);
+ tree.LogMe();
+
+ const unsigned uNodeCount = tree.GetNodeCount();
+ unsigned *Leaves1 = new unsigned[uNodeCount];
+ unsigned *Leaves2 = new unsigned[uNodeCount];
+
+ PhyEnumEdgeState ES;
+ bool bDone = false;
+ for (;;)
+ {
+ unsigned uCount1 = uInsane;
+ unsigned uCount2 = uInsane;
+ bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2);
+ Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n",
+ bOk,
+ ES.m_bInit,
+ ES.m_uNodeIndex1,
+ ES.m_uNodeIndex2);
+ if (!bOk)
+ break;
+ Log("\n");
+ Log("Part1: ");
+ for (unsigned n = 0; n < uCount1; ++n)
+ Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n]));
+ Log("\n");
+ Log("Part2: ");
+ for (unsigned n = 0; n < uCount2; ++n)
+ Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n]));
+ Log("\n");
+ }
+ }
+#endif
+
+// Append to Leaves[] the leaves of the subtree under uNodeIndex,
+// skipping the whole subtree whose top node is uExclude.
+// *ptruCount is the number of entries already in Leaves[] and is
+// incremented for each leaf appended.
+static void GetLeavesSubtreeExcluding(const Tree &tree, unsigned uNodeIndex,
+  unsigned uExclude, unsigned Leaves[], unsigned *ptruCount)
+    {
+    // Prune the excluded subtree.
+    if (uExclude == uNodeIndex)
+        return;
+
+    if (!tree.IsLeaf(uNodeIndex))
+        {
+        const unsigned uChildL = tree.GetLeft(uNodeIndex);
+        const unsigned uChildR = tree.GetRight(uNodeIndex);
+
+        if (uChildL != NULL_NEIGHBOR)
+            GetLeavesSubtreeExcluding(tree, uChildL, uExclude, Leaves, ptruCount);
+        if (uChildR != NULL_NEIGHBOR)
+            GetLeavesSubtreeExcluding(tree, uChildR, uExclude, Leaves, ptruCount);
+        return;
+        }
+
+    const unsigned uSlot = *ptruCount;
+    Leaves[uSlot] = uNodeIndex;
+    *ptruCount = uSlot + 1;
+    }
+
+// Fill Leaves[] with the leaves under uNodeIndex, leaving out the
+// subtree rooted at uExclude; *ptruCount receives how many were
+// stored.
+void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex,
+  unsigned uExclude, unsigned Leaves[], unsigned *ptruCount)
+    {
+    // Start from an empty list; the recursive helper appends.
+    ptruCount[0] = 0;
+    GetLeavesSubtreeExcluding(tree, uNodeIndex, uExclude, Leaves, ptruCount);
+    }
+
+// Write the node indexes of all internal (non-leaf) nodes of the tree
+// to NodeIndexes[], sorted by increasing node height.
+// NodeIndexes[] must have room for (node count - 1)/2 entries.
+// Quits if the tree has fewer than 3 nodes or if the number of
+// internal nodes found differs from (node count - 1)/2.
+void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[])
+    {
+    const unsigned uNodeCount = tree.GetNodeCount();
+    if (uNodeCount < 3)
+        Quit("GetInternalNodesInHeightOrder: %u nodes, none are internal",
+          uNodeCount);
+    const unsigned uInternalNodeCount = (uNodeCount - 1)/2;
+    double *Heights = new double[uInternalNodeCount];
+
+    unsigned uIndex = 0;
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+        {
+        if (tree.IsLeaf(uNodeIndex))
+            continue;
+        // Check before storing: a tree with more internal nodes than
+        // expected would otherwise write past the end of Heights[]
+        // (and possibly NodeIndexes[]) before the check below fires.
+        if (uIndex >= uInternalNodeCount)
+            Quit("Internal error: GetInternalNodesInHeightOrder");
+        NodeIndexes[uIndex] = uNodeIndex;
+        Heights[uIndex] = tree.GetNodeHeight(uNodeIndex);
+        ++uIndex;
+        }
+    if (uIndex != uInternalNodeCount)
+        Quit("Internal error: GetInternalNodesInHeightOrder");
+
+// Simple but slow bubble sort (probably don't care about speed here).
+// Heights[] and NodeIndexes[] are swapped in step so they stay paired.
+    bool bDone = false;
+    while (!bDone)
+        {
+        bDone = true;
+        for (unsigned i = 0; i < uInternalNodeCount - 1; ++i)
+            {
+            if (Heights[i] > Heights[i+1])
+                {
+                double dTmp = Heights[i];
+                Heights[i] = Heights[i+1];
+                Heights[i+1] = dTmp;
+
+                unsigned uTmp = NodeIndexes[i];
+                NodeIndexes[i] = NodeIndexes[i+1];
+                NodeIndexes[i+1] = uTmp;
+                bDone = false;
+                }
+            }
+        }
+#if TRACE
+    Log("Internal node index Height\n");
+    Log("------------------- --------\n");
+    // 1234567890123456789 123456789
+    for (unsigned n = 0; n < uInternalNodeCount; ++n)
+        Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]);
+#endif
+    delete[] Heights;
+    }
+
+// Raise every edge length in the tree that is below dMinEdgeLength
+// up to dMinEdgeLength. Edges without a length are left untouched.
+// Each edge is examined from both of its end nodes.
+void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength)
+    {
+    const unsigned uTotalNodes = tree.GetNodeCount();
+    for (unsigned uNode = 0; uNode < uTotalNodes; ++uNode)
+        {
+        const unsigned uDegree = tree.GetNeighborCount(uNode);
+        for (unsigned uSub = 0; uSub < uDegree; ++uSub)
+            {
+            const unsigned uOther = tree.GetNeighbor(uNode, uSub);
+            if (tree.HasEdgeLength(uNode, uOther) &&
+              tree.GetEdgeLength(uNode, uOther) < dMinEdgeLength)
+                tree.SetEdgeLength(uNode, uOther, dMinEdgeLength);
+            }
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/phy3.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phy3.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phy3.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,469 @@
+#include "muscle.h"
+#include "tree.h"
+#include "edgelist.h"
+
+#define TRACE 0
+
+// Per-directed-edge record used while rooting a tree. An edge
+// Node1->Node2 describes the part of the tree reached by stepping
+// from Node1 to Node2 and continuing away from Node1.
+struct EdgeInfo
+    {
+    // Start in a fully defined "not set" state. Every member is
+    // initialized so that code which inspects a record before
+    // testing m_bSet reads defined values rather than
+    // indeterminate ones.
+    EdgeInfo()
+        {
+        m_bSet = false;
+        m_uNode1 = NULL_NEIGHBOR;
+        m_uNode2 = NULL_NEIGHBOR;
+        m_dMaxDistToLeaf = 0.0;
+        m_dTotalDistToLeaves = 0.0;
+        m_uMaxStep = NULL_NEIGHBOR;
+        m_uMostDistantLeaf = NULL_NEIGHBOR;
+        m_uLeafCount = 0;
+        }
+// Is data in this structure valid (i.e, has been set)?
+    bool m_bSet;
+
+// Node at start of this edge
+    unsigned m_uNode1;
+
+// Node at end of this edge
+    unsigned m_uNode2;
+
+// Maximum distance from Node2 to a leaf
+    double m_dMaxDistToLeaf;
+
+// Sum of distances from Node2 to all leaves under Node2
+    double m_dTotalDistToLeaves;
+
+// Next node on path from Node2 to most distant leaf
+    unsigned m_uMaxStep;
+
+// Most distant leaf from Node2 (used for debugging only)
+    unsigned m_uMostDistantLeaf;
+
+// Number of leaves under Node2
+    unsigned m_uLeafCount;
+    };
+
+static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs,
+ unsigned *ptruNode1, unsigned *ptruNode2,
+ double *ptrdLength1, double *ptrdLength2);
+static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs,
+ unsigned *ptruNode1, unsigned *ptruNode2,
+ double *ptrdLength1, double *ptrdLength2);
+
+// Log a table of every EdgeInfo record that has been set.
+// EIs holds three records (one per neighbor slot) for each of the
+// uNodeCount nodes.
+static void ListEIs(EdgeInfo **EIs, unsigned uNodeCount)
+    {
+    Log("Node1 Node2 MaxDist TotDist MostDist LeafCount Step\n");
+    Log("----- ----- ------- ------- -------- --------- ----\n");
+    // 12345 12345 1234567 1234567 12345678 123456789
+
+    for (unsigned uFrom = 0; uFrom < uNodeCount; ++uFrom)
+        {
+        const EdgeInfo *Row = EIs[uFrom];
+        for (unsigned uSlot = 0; uSlot < 3; ++uSlot)
+            {
+            const EdgeInfo &Info = Row[uSlot];
+            if (Info.m_bSet)
+                {
+                Log("%5u %5u %7.3g %7.3g %8u %9u",
+                  Info.m_uNode1,
+                  Info.m_uNode2,
+                  Info.m_dMaxDistToLeaf,
+                  Info.m_dTotalDistToLeaves,
+                  Info.m_uMostDistantLeaf,
+                  Info.m_uLeafCount);
+                // The step column is left blank when there is no next step.
+                if (Info.m_uMaxStep != NULL_NEIGHBOR)
+                    Log(" %4u", Info.m_uMaxStep);
+                Log("\n");
+                }
+            }
+        }
+    }
+
+// Fill in the EdgeInfo record for the directed edge uNode1->uNode2,
+// stored at EIs[uNode1][subscript of uNode2 among uNode1's neighbors].
+// If uNode2 is a leaf the record is trivial. Otherwise it is derived
+// from the records of the edges uNode2->uNode3 for every neighbor
+// uNode3 of uNode2 other than uNode1; those records must already be
+// set, else Quit is called.
+static void CalcInfo(const Tree &tree, unsigned uNode1, unsigned uNode2, EdgeInfo **EIs)
+ {
+ const unsigned uNeighborIndex = tree.GetNeighborSubscript(uNode1, uNode2);
+ EdgeInfo &EI = EIs[uNode1][uNeighborIndex];
+ EI.m_uNode1 = uNode1;
+ EI.m_uNode2 = uNode2;
+
+// Base case: a leaf is its own most distant leaf, at distance zero.
+ if (tree.IsLeaf(uNode2))
+ {
+ EI.m_dMaxDistToLeaf = 0;
+ EI.m_dTotalDistToLeaves = 0;
+ EI.m_uMaxStep = NULL_NEIGHBOR;
+ EI.m_uMostDistantLeaf = uNode2;
+ EI.m_uLeafCount = 1;
+ EI.m_bSet = true;
+ return;
+ }
+
+// Accumulators over the outgoing edges of uNode2.
+ double dMaxDistToLeaf = -1e29;
+ double dTotalDistToLeaves = 0.0;
+ unsigned uLeafCount = 0;
+ unsigned uMostDistantLeaf = NULL_NEIGHBOR;
+ unsigned uMaxStep = NULL_NEIGHBOR;
+
+ const unsigned uNeighborCount = tree.GetNeighborCount(uNode2);
+ for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub)
+ {
+ const unsigned uNode3 = tree.GetNeighbor(uNode2, uSub);
+// Don't go back along the edge we arrived by.
+ if (uNode3 == uNode1)
+ continue;
+ const EdgeInfo &EINext = EIs[uNode2][uSub];
+ if (!EINext.m_bSet)
+ Quit("CalcInfo: internal error, dist %u->%u not known",
+ uNode2, uNode3);
+
+
+ uLeafCount += EINext.m_uLeafCount;
+
+// Every leaf beyond uNode3 is one edge length further from uNode2
+// than it is from uNode3.
+ const double dEdgeLength = tree.GetEdgeLength(uNode2, uNode3);
+ const double dTotalDist = EINext.m_dTotalDistToLeaves +
+ EINext.m_uLeafCount*dEdgeLength;
+ dTotalDistToLeaves += dTotalDist;
+
+// Track the neighbor that leads to the most distant leaf.
+ const double dDist = EINext.m_dMaxDistToLeaf + dEdgeLength;
+ if (dDist > dMaxDistToLeaf)
+ {
+ dMaxDistToLeaf = dDist;
+ uMostDistantLeaf = EINext.m_uMostDistantLeaf;
+ uMaxStep = uNode3;
+ }
+ }
+ if (NULL_NEIGHBOR == uMaxStep || NULL_NEIGHBOR == uMostDistantLeaf ||
+ 0 == uLeafCount)
+ Quit("CalcInfo: internal error 2");
+
+// NOTE(review): dThisDist is never used below; the length of the edge
+// uNode1->uNode2 itself is not included in the stored distances.
+ const double dThisDist = tree.GetEdgeLength(uNode1, uNode2);
+ EI.m_dMaxDistToLeaf = dMaxDistToLeaf;
+ EI.m_dTotalDistToLeaves = dTotalDistToLeaves;
+ EI.m_uMaxStep = uMaxStep;
+ EI.m_uMostDistantLeaf = uMostDistantLeaf;
+ EI.m_uLeafCount = uLeafCount;
+ EI.m_bSet = true;
+ }
+
+// Has the EdgeInfo record for the directed edge uNodeFrom->uNodeTo
+// been set?
+static bool Known(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom,
+  unsigned uNodeTo)
+    {
+    const EdgeInfo *Row = EIs[uNodeFrom];
+    const EdgeInfo &Info = Row[tree.GetNeighborSubscript(uNodeFrom, uNodeTo)];
+    return Info.m_bSet;
+    }
+
+// Are the EdgeInfo records set for every edge leaving uNodeTo,
+// other than the one leading back to uNodeFrom?
+static bool AllKnownOut(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom,
+  unsigned uNodeTo)
+    {
+    const unsigned uDegree = tree.GetNeighborCount(uNodeTo);
+    for (unsigned uSlot = 0; uSlot < uDegree; ++uSlot)
+        {
+        const unsigned uOther = tree.GetNeighbor(uNodeTo, uSlot);
+        if (uOther != uNodeFrom && !EIs[uNodeTo][uSlot].m_bSet)
+            return false;
+        }
+    return true;
+    }
+
+// Choose where to place the root of an unrooted tree.
+// On return the root lies on the edge (*ptruNode1, *ptruNode2), at
+// distance *ptrdLength1 from *ptruNode1 and *ptrdLength2 from
+// *ptruNode2, as selected by RootMethod.
+// Quits if the tree is already rooted, has fewer than 2 nodes, or
+// RootMethod is not one of the supported methods.
+void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2,
+  double *ptrdLength1, double *ptrdLength2,
+  ROOT RootMethod)
+    {
+#if TRACE
+    tree.LogMe();
+#endif
+    if (tree.IsRooted())
+        Quit("FindRoot: tree already rooted");
+
+    const unsigned uNodeCount = tree.GetNodeCount();
+
+    if (uNodeCount < 2)
+        Quit("Root: don't support trees with < 2 edges");
+
+    // One EdgeInfo per (node, neighbor slot); a node has at most 3 neighbors.
+    EdgeInfo **EIs = new EdgeInfo *[uNodeCount];
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+        EIs[uNodeIndex] = new EdgeInfo[3];
+
+    // Seed the work list with the edges that point at leaves; their
+    // EdgeInfo can be computed without any other information.
+    EdgeList Edges;
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+        if (tree.IsLeaf(uNodeIndex))
+            {
+            unsigned uParent = tree.GetNeighbor1(uNodeIndex);
+            Edges.Add(uParent, uNodeIndex);
+            }
+
+#if TRACE
+    Log("Edges: ");
+    Edges.LogMe();
+#endif
+
+// Main loop: iterate until all distances known
+    for (;;)
+        {
+        EdgeList NextEdges;
+
+#if TRACE
+        Log("\nTop of main loop\n");
+        Log("Edges: ");
+        Edges.LogMe();
+        Log("MDs:\n");
+        ListEIs(EIs, uNodeCount);
+#endif
+
+        // For all edges
+        const unsigned uEdgeCount = Edges.GetCount();
+        if (0 == uEdgeCount)
+            break;
+        for (unsigned n = 0; n < uEdgeCount; ++n)
+            {
+            unsigned uNodeFrom;
+            unsigned uNodeTo;
+            Edges.GetEdge(n, &uNodeFrom, &uNodeTo);
+
+            CalcInfo(tree, uNodeFrom, uNodeTo, EIs);
+#if TRACE
+            Log("Edge %u -> %u\n", uNodeFrom, uNodeTo);
+#endif
+            // An edge Neighbor->From becomes computable once every
+            // other edge leaving From is known; queue it for the
+            // next pass unless it is already done.
+            const unsigned uNeighborCount = tree.GetNeighborCount(uNodeFrom);
+            for (unsigned i = 0; i < uNeighborCount; ++i)
+                {
+                const unsigned uNeighborIndex = tree.GetNeighbor(uNodeFrom, i);
+                if (!Known(tree, EIs, uNeighborIndex, uNodeFrom) &&
+                  AllKnownOut(tree, EIs, uNeighborIndex, uNodeFrom))
+                    NextEdges.Add(uNeighborIndex, uNodeFrom);
+                }
+            }
+        Edges.Copy(NextEdges);
+        }
+
+#if TRACE
+    ListEIs(EIs, uNodeCount);
+#endif
+
+    switch (RootMethod)
+        {
+    case ROOT_MidLongestSpan:
+        RootByMidLongestSpan(tree, EIs, ptruNode1, ptruNode2,
+          ptrdLength1, ptrdLength2);
+        break;
+
+    case ROOT_MinAvgLeafDist:
+        RootByMinAvgLeafDist(tree, EIs, ptruNode1, ptruNode2,
+          ptrdLength1, ptrdLength2);
+        break;
+
+    default:
+        Quit("Invalid RootMethod=%d", RootMethod);
+        }
+
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+        delete[] EIs[uNodeIndex];
+    delete[] EIs;
+    }
+
+// Place the root at the midpoint of the longest leaf-to-leaf path.
+// For each leaf, the longest path starting there has length
+// (edge to its neighbor) + (max distance from that neighbor to a
+// leaf, taken from EIs). The leaf with the longest such path is
+// chosen, and the path is then walked until half its length has
+// been covered. Outputs the edge containing that point and the
+// distances from the point to each end of the edge.
+static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs,
+ unsigned *ptruNode1, unsigned *ptruNode2,
+ double *ptrdLength1, double *ptrdLength2)
+ {
+ const unsigned uNodeCount = tree.GetNodeCount();
+
+// Find the leaf at one end of the longest span.
+// NOTE(review): uMostDistantLeaf is assigned below but never read.
+ unsigned uLeaf1 = NULL_NEIGHBOR;
+ unsigned uMostDistantLeaf = NULL_NEIGHBOR;
+ double dMaxDist = -VERY_LARGE_DOUBLE;
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ {
+ if (!tree.IsLeaf(uNodeIndex))
+ continue;
+
+ const unsigned uNode2 = tree.GetNeighbor1(uNodeIndex);
+ if (NULL_NEIGHBOR == uNode2)
+ Quit("RootByMidLongestSpan: internal error 0");
+ const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNode2);
+// The record for the leaf's edge is expected in slot 0; the check
+// for internal error 2 below verifies this.
+ const EdgeInfo &EI = EIs[uNodeIndex][0];
+ if (!EI.m_bSet)
+ Quit("RootByMidLongestSpan: internal error 1");
+ if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNode2)
+ Quit("RootByMidLongestSpan: internal error 2");
+ const double dSpanLength = dEdgeLength + EI.m_dMaxDistToLeaf;
+ if (dSpanLength > dMaxDist)
+ {
+ dMaxDist = dSpanLength;
+ uLeaf1 = uNodeIndex;
+ uMostDistantLeaf = EI.m_uMostDistantLeaf;
+ }
+ }
+
+ if (NULL_NEIGHBOR == uLeaf1)
+ Quit("RootByMidLongestSpan: internal error 3");
+
+// The root goes halfway along the longest span.
+ const double dTreeHeight = dMaxDist/2.0;
+ unsigned uNode1 = uLeaf1;
+ unsigned uNode2 = tree.GetNeighbor1(uLeaf1);
+ double dAccumSpanLength = 0;
+
+#if TRACE
+ Log("RootByMidLongestSpan: span=%u", uLeaf1);
+#endif
+
+// Walk the span from uLeaf1, one edge per iteration, following the
+// m_uMaxStep links toward the most distant leaf.
+ for (;;)
+ {
+ const double dEdgeLength = tree.GetEdgeLength(uNode1, uNode2);
+#if TRACE
+ Log("->%u(%g;%g)", uNode2, dEdgeLength, dAccumSpanLength);
+#endif
+// Does the midpoint fall on the current edge?
+ if (dAccumSpanLength + dEdgeLength >= dTreeHeight)
+ {
+ *ptruNode1 = uNode1;
+ *ptruNode2 = uNode2;
+ *ptrdLength1 = dTreeHeight - dAccumSpanLength;
+ *ptrdLength2 = dEdgeLength - *ptrdLength1;
+#if TRACE
+ {
+ const EdgeInfo &EI = EIs[uLeaf1][0];
+ Log("...\n");
+ Log("Midpoint: Leaf1=%u Leaf2=%u Node1=%u Node2=%u Length1=%g Length2=%g\n",
+ uLeaf1, EI.m_uMostDistantLeaf, *ptruNode1, *ptruNode2, *ptrdLength1, *ptrdLength2);
+ }
+#endif
+ return;
+ }
+
+// Reaching a leaf without passing the midpoint should not happen.
+ if (tree.IsLeaf(uNode2))
+ Quit("RootByMidLongestSpan: internal error 4");
+
+ dAccumSpanLength += dEdgeLength;
+ const unsigned uSub = tree.GetNeighborSubscript(uNode1, uNode2);
+ const EdgeInfo &EI = EIs[uNode1][uSub];
+ if (!EI.m_bSet)
+ Quit("RootByMidLongestSpan: internal error 5");
+
+// Advance to the next edge on the path.
+ uNode1 = uNode2;
+ uNode2 = EI.m_uMaxStep;
+ }
+ }
+
+/***
+Root by balancing average distance to leaves.
+The root is a point p such that the average
+distance to leaves to the left of p is the
+same as the average distance to leaves to the right.
+
+This is the method used by CLUSTALW, which
+was originally used in PROFILEWEIGHT:
+
+ Thompson et al. (1994) CABIOS (10) 1, 19-29.
+***/
+
+// Place the root at a point where the average distance to the
+// leaves on one side equals the average distance to the leaves on
+// the other side. Every edge is examined; among the edges on which
+// such a balance point exists, the one giving the smallest tree
+// height is chosen. Outputs the chosen edge and the distances from
+// the root point to each end of it. Quits if no edge qualifies.
+static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs,
+  unsigned *ptruNode1, unsigned *ptruNode2,
+  double *ptrdLength1, double *ptrdLength2)
+    {
+    const unsigned uNodeCount = tree.GetNodeCount();
+    const unsigned uLeafCount = tree.GetLeafCount();
+    unsigned uNode1 = NULL_NEIGHBOR;
+    unsigned uNode2 = NULL_NEIGHBOR;
+    double dMinHeight = VERY_LARGE_DOUBLE;
+    double dBestLength1 = VERY_LARGE_DOUBLE;
+    double dBestLength2 = VERY_LARGE_DOUBLE;
+
+    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+        {
+        const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex);
+        for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub)
+            {
+            const unsigned uNeighborIndex = tree.GetNeighbor(uNodeIndex, uSub);
+
+            // Avoid visiting same edge a second time in reversed order.
+            if (uNeighborIndex < uNodeIndex)
+                continue;
+
+            const unsigned uSubRev = tree.GetNeighborSubscript(uNeighborIndex, uNodeIndex);
+            if (NULL_NEIGHBOR == uSubRev)
+                Quit("RootByMinAvgLeafDist, internal error 1");
+
+            // Get info for edges Node1->Node2 and Node2->Node1 (reversed)
+            const EdgeInfo &EI = EIs[uNodeIndex][uSub];
+            const EdgeInfo &EIRev = EIs[uNeighborIndex][uSubRev];
+
+            // Both records must have been computed before any of
+            // their other fields are meaningful, so test this first
+            // and for both directions.
+            if (!EI.m_bSet || !EIRev.m_bSet)
+                Quit("RootByMinAvgLeafDist, internal error 3");
+            if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNeighborIndex ||
+              EIRev.m_uNode1 != uNeighborIndex || EIRev.m_uNode2 != uNodeIndex)
+                Quit("RootByMinAvgLeafDist, internal error 2");
+            if (uLeafCount != EI.m_uLeafCount + EIRev.m_uLeafCount)
+                Quit("RootByMinAvgLeafDist, internal error 4");
+
+            const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNeighborIndex);
+            if (dEdgeLength != tree.GetEdgeLength(uNeighborIndex, uNodeIndex))
+                Quit("RootByMinAvgLeafDist, internal error 5");
+
+            // Consider point p on edge 12 in tree (1=Node, 2=Neighbor).
+            //
+            // ----- ----
+            //     | |
+            //     1----p--2
+            //     | |
+            // ----- ----
+            //
+            // Define:
+            //    ADLp = average distance to leaves to left of point p.
+            //    ADRp = average distance to leaves to right of point p.
+            //    L = edge length = distance 12
+            //    x = distance 1p
+            // So distance p2 = L - x.
+            // Average distance from p to leaves on left of p is:
+            //    ADLp = ADL1 + x
+            // Average distance from p to leaves on right of p is:
+            //    ADRp = ADR2 + (L - x)
+            // To be a root, we require these two distances to be equal,
+            //    ADLp = ADRp
+            //    ADL1 + x = ADR2 + (L - x)
+            // Solving for x,
+            //    x = (ADR2 - ADL1 + L)/2
+            // If 0 <= x <= L, we can place the root on edge 12.
+
+            const double ADL1 = EI.m_dTotalDistToLeaves / EI.m_uLeafCount;
+            const double ADR2 = EIRev.m_dTotalDistToLeaves / EIRev.m_uLeafCount;
+
+            const double x = (ADR2 - ADL1 + dEdgeLength)/2.0;
+            if (x >= 0 && x <= dEdgeLength)
+                {
+                const double dLength1 = x;
+                const double dLength2 = dEdgeLength - x;
+                const double dHeight1 = EI.m_dMaxDistToLeaf + dLength1;
+                const double dHeight2 = EIRev.m_dMaxDistToLeaf + dLength2;
+                const double dHeight = dHeight1 >= dHeight2 ? dHeight1 : dHeight2;
+#if TRACE
+                Log("Candidate root Node1=%u Node2=%u Height=%g\n",
+                  uNodeIndex, uNeighborIndex, dHeight);
+#endif
+                // Keep the candidate with the smallest height.
+                if (dHeight < dMinHeight)
+                    {
+                    uNode1 = uNodeIndex;
+                    uNode2 = uNeighborIndex;
+                    dBestLength1 = dLength1;
+                    dBestLength2 = dLength2;
+                    dMinHeight = dHeight;
+                    }
+                }
+            }
+        }
+
+    if (NULL_NEIGHBOR == uNode1 || NULL_NEIGHBOR == uNode2)
+        Quit("RootByMinAvgLeafDist, internal error 6");
+
+#if TRACE
+    Log("Best root Node1=%u Node2=%u Length1=%g Length2=%g Height=%g\n",
+      uNode1, uNode2, dBestLength1, dBestLength2, dMinHeight);
+#endif
+
+    *ptruNode1 = uNode1;
+    *ptruNode2 = uNode2;
+    *ptrdLength1 = dBestLength1;
+    *ptrdLength2 = dBestLength2;
+    }
+
+// Re-root a rooted tree using the given method. With ROOT_Pseudo
+// the existing root (assigned by clustering) is kept; otherwise the
+// root is deleted and the now unrooted tree is rooted again by Method.
+// Quits if the tree is not rooted.
+void FixRoot(Tree &tree, ROOT Method)
+    {
+    if (!tree.IsRooted())
+        Quit("FixRoot: expecting rooted tree");
+
+    if (ROOT_Pseudo != Method)
+        {
+        tree.UnrootByDeletingRoot();
+        tree.RootUnrootedTree(Method);
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/phy4.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phy4.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phy4.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,295 @@
+#include "muscle.h"
+#include "tree.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+// Cut a rooted tree at height dMaxHeight. Subtrees[] receives the
+// index of every non-root node whose own height is <= dMaxHeight
+// while its parent's height is > dMaxHeight; *ptruSubtreeCount
+// receives the number of such nodes. Quits if the tree is not rooted.
+void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[],
+  unsigned *ptruSubtreeCount)
+    {
+    if (!tree.IsRooted())
+        Quit("ClusterByHeight: requires rooted tree");
+
+#if TRACE
+    Log("ClusterByHeight, max height=%g\n", dMaxHeight);
+#endif
+
+    const unsigned uTotalNodes = tree.GetNodeCount();
+    unsigned uFound = 0;
+    for (unsigned uNode = 0; uNode < uTotalNodes; ++uNode)
+        {
+        // The root has no parent to compare against.
+        if (tree.IsRoot(uNode))
+            continue;
+
+        const unsigned uAbove = tree.GetParent(uNode);
+        const double dOwnHeight = tree.GetNodeHeight(uNode);
+        const double dAboveHeight = tree.GetNodeHeight(uAbove);
+
+#if TRACE
+        Log("Node %3u Height %5.2f ParentHeight %5.2f\n",
+          uNode, dOwnHeight, dAboveHeight);
+#endif
+        // Only nodes whose parent edge crosses the cut line qualify.
+        if (!(dAboveHeight > dMaxHeight) || !(dOwnHeight <= dMaxHeight))
+            continue;
+
+        Subtrees[uFound] = uNode;
+#if TRACE
+        Log("Subtree[%u]=%u\n", uFound, uNode);
+#endif
+        ++uFound;
+        }
+    *ptruSubtreeCount = uFound;
+    }
+
+// One splitting step: among the uCount nodes in Subfams[], find the
+// internal node that owns the highest child, replace it in place by
+// its left child and store its right child at Subfams[uCount].
+// Quits if no entry of Subfams[] is an internal node.
+static void ClusterBySubfamCount_Iteration(const Tree &tree, unsigned Subfams[],
+  unsigned uCount)
+    {
+// Find highest child node of current set of subfamilies.
+    double dBestHeight = -1e20;
+    bool bFound = false;
+    unsigned uBestSlot = 0;
+
+    for (unsigned uSlot = 0; uSlot < uCount; ++uSlot)
+        {
+        const unsigned uCandidate = Subfams[uSlot];
+        if (tree.IsLeaf(uCandidate))
+            continue;
+
+        // Examine the left child first, then the right child.
+        for (unsigned uSide = 0; uSide < 2; ++uSide)
+            {
+            const unsigned uChild = (0 == uSide) ?
+              tree.GetLeft(uCandidate) : tree.GetRight(uCandidate);
+            const double dChildHeight = tree.GetNodeHeight(uChild);
+            if (dChildHeight > dBestHeight)
+                {
+                dBestHeight = dChildHeight;
+                uBestSlot = uSlot;
+                bFound = true;
+                }
+            }
+        }
+
+    if (!bFound)
+        Quit("CBSFCIter: failed to find highest child");
+
+    const unsigned uSplitNode = Subfams[uBestSlot];
+    const unsigned uSplitLeft = tree.GetLeft(uSplitNode);
+    const unsigned uSplitRight = tree.GetRight(uSplitNode);
+
+// The parent's slot is reused for the left child, and the right
+// child is appended to the list.
+    Subfams[uBestSlot] = uSplitLeft;
+    Subfams[uCount] = uSplitRight;
+
+#if TRACE
+    {
+    Log("Iter %3u:", uCount);
+    for (unsigned n = 0; n < uCount; ++n)
+        Log(" %u", Subfams[n]);
+    Log("\n");
+    }
+#endif
+    }
+
+// Divide a tree containing N leaves into k families by
+// cutting the tree with a horizontal line at some height.
+// Each internal node defines a height for the cut, so
+// considering all internal nodes enumerates all distinct
+// cuts. Internal nodes are visited in decreasing order of
+// height; visiting a node corresponds to moving the line
+// down to the height of that node. The cut is taken to be
+// "infinitesimally below" the node, so the effect is to
+// remove the node from the list of subfamilies and add its
+// two children. A parent must be visited before its children
+// (so care may be needed to handle zero edge lengths properly).
+// N is assumed to be small, hence the dumb O(N^2) code; for
+// large N a list of nodes sorted by height would be better.
+//
+// Subfams[] receives the node indexes of the subfamilies and
+// *ptruSubfamCount the number stored.
+void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount,
+  unsigned Subfams[], unsigned *ptruSubfamCount)
+    {
+    const unsigned uTotalNodes = tree.GetNodeCount();
+    const unsigned uTotalLeaves = (uTotalNodes + 1)/2;
+
+// Special case: empty tree
+    if (0 == uTotalNodes)
+        {
+        *ptruSubfamCount = 0;
+        return;
+        }
+
+// Special case: more subfamilies than leaves, so each leaf
+// becomes a subfamily of its own.
+// NOTE(review): this assumes the leaves are node indexes
+// 0..N-1 -- confirm against the tree's numbering.
+    if (uSubfamCount >= uTotalLeaves)
+        {
+        unsigned uLeaf = 0;
+        while (uLeaf < uTotalLeaves)
+            {
+            Subfams[uLeaf] = uLeaf;
+            ++uLeaf;
+            }
+        *ptruSubfamCount = uTotalLeaves;
+        return;
+        }
+
+// Start with the root as the only subfamily, then split
+// repeatedly; each step adds one subfamily.
+    Subfams[0] = tree.GetRootNodeIndex();
+    for (unsigned uHave = 1; uHave < uSubfamCount; ++uHave)
+        ClusterBySubfamCount_Iteration(tree, Subfams, uHave);
+
+    *ptruSubfamCount = uSubfamCount;
+    }
+
+// Append the leaves under uNodeIndex to Leaves[], left subtree
+// before right subtree. uLeafCount is the number of entries
+// already in Leaves[] and is updated in place.
+static void GetLeavesRecurse(const Tree &tree, unsigned uNodeIndex,
+  unsigned Leaves[], unsigned &uLeafCount /* in-out */)
+    {
+    if (!tree.IsLeaf(uNodeIndex))
+        {
+        const unsigned uChildL = tree.GetLeft(uNodeIndex);
+        const unsigned uChildR = tree.GetRight(uNodeIndex);
+
+        GetLeavesRecurse(tree, uChildL, Leaves, uLeafCount);
+        GetLeavesRecurse(tree, uChildR, Leaves, uLeafCount);
+        return;
+        }
+
+    Leaves[uLeafCount++] = uNodeIndex;
+    }
+
+// Fill Leaves[] with the leaves of the subtree under uNodeIndex;
+// *ptruLeafCount receives how many were stored.
+void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[],
+  unsigned *ptruLeafCount)
+    {
+    // Count locally and publish the total once the walk is done.
+    unsigned uFound = 0;
+    GetLeavesRecurse(tree, uNodeIndex, Leaves, uFound);
+    *ptruLeafCount = uFound;
+    }
+
+// Make this tree a pruned copy of the rooted tree `tree`, in which
+// each of the uSubfamCount nodes listed in Subfams[] becomes a leaf
+// named "Subfam_<k>" (k counted from 1).
+// Pruned node indexes 0..uSubfamCount-1 are the leaves, in Subfams[]
+// order; internal nodes follow, with the root last.
+// Quits if `tree` is not rooted, or if the number of internal nodes
+// found does not match a binary tree with uSubfamCount leaves.
+void Tree::PruneTree(const Tree &tree, unsigned Subfams[],
+  unsigned uSubfamCount)
+    {
+    if (!tree.IsRooted())
+        Quit("Tree::PruneTree: requires rooted tree");
+
+    Clear();
+
+    m_uNodeCount = 2*uSubfamCount - 1;
+    InitCache(m_uNodeCount);
+
+    const unsigned uUnprunedNodeCount = tree.GetNodeCount();
+
+    // Index translation tables; NULL_NEIGHBOR marks "no mapping".
+    unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount];
+    unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount];
+
+    for (unsigned n = 0; n < uUnprunedNodeCount; ++n)
+        uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR;
+
+    for (unsigned n = 0; n < m_uNodeCount; ++n)
+        uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR;
+
+// Create mapping between unpruned and pruned node indexes
+    unsigned uInternalNodeIndex = uSubfamCount;
+    for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
+        {
+        unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex];
+        uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex;
+        uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex;
+        // Walk up toward the root, giving each ancestor not seen
+        // before the next free internal node index.
+        for (;;)
+            {
+            uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex);
+            if (tree.IsRoot(uUnprunedNodeIndex))
+                break;
+
+            // Already visited this node?
+            if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex])
+                break;
+
+            uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex;
+            uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex;
+
+            ++uInternalNodeIndex;
+            }
+        }
+
+    // The root takes the last internal node index.
+    const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex();
+    uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex;
+    uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex;
+
+#if TRACE
+    {
+    Log("Pruned to unpruned:\n");
+    for (unsigned i = 0; i < m_uNodeCount; ++i)
+        Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]);
+    Log("\n");
+    Log("Unpruned to pruned:\n");
+    for (unsigned i = 0; i < uUnprunedNodeCount; ++i)
+        {
+        unsigned n = uUnprunedToPrunedIndex[i];
+        if (n != NULL_NEIGHBOR)
+            Log(" [%u]=%u", i, n);
+        }
+    Log("\n");
+    }
+#endif
+
+    if (uInternalNodeIndex != m_uNodeCount - 1)
+        Quit("Tree::PruneTree, Internal error");
+
+// Nodes 0, 1 ... are the leaves
+    for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
+        {
+        char szName[32];
+        sprintf(szName, "Subfam_%u", uSubfamIndex + 1);
+        m_ptrName[uSubfamIndex] = strsave(szName);
+        }
+
+    // Copy the child links and edge lengths of each internal node,
+    // translated to pruned indexes.
+    for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount;
+      ++uPrunedNodeIndex)
+        {
+        unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex];
+
+        const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex);
+        const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex);
+
+        const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft];
+        const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight];
+
+        const double dLeftLength =
+          tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft);
+        const double dRightLength =
+          tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight);
+
+        m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft;
+        m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight;
+
+        m_dEdgeLength1[uPrunedLeft] = dLeftLength;
+        m_dEdgeLength1[uPrunedRight] = dRightLength;
+
+        m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex;
+        m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex;
+
+        m_bHasEdgeLength1[uPrunedLeft] = true;
+        m_bHasEdgeLength1[uPrunedRight] = true;
+
+        m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength;
+        m_dEdgeLength3[uPrunedNodeIndex] = dRightLength;
+
+        m_bHasEdgeLength2[uPrunedNodeIndex] = true;
+        m_bHasEdgeLength3[uPrunedNodeIndex] = true;
+        }
+
+    m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex];
+
+    m_bRooted = true;
+
+    Validate();
+
+    // Release both translation tables (the pruned-to-unpruned table
+    // was previously leaked).
+    delete[] uUnprunedToPrunedIndex;
+    delete[] uPrunedToUnprunedIndex;
+    }
+
+// Translate uCount leaf node indexes in Leaves[] into leaf ids,
+// writing the id of Leaves[k] to Ids[k].
+void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount,
+  unsigned Ids[])
+    {
+    unsigned uSlot = 0;
+    while (uSlot < uCount)
+        {
+        const unsigned uLeafNode = Leaves[uSlot];
+        Ids[uSlot] = tree.GetLeafId(uLeafNode);
+        ++uSlot;
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/phyfromclust.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phyfromclust.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phyfromclust.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,95 @@
+#include "muscle.h"
+#include "tree.h"
+#include "clust.h"
+
+// Allocate the per-node arrays with uCacheCount entries each and
+// put every entry into its "unset" state (no neighbor, no edge
+// length, no height, no name, no id).
+// Arrays held previously are not freed here.
+// NOTE(review): callers in view call Clear() first -- presumably
+// that releases them; confirm.
+void Tree::InitCache(unsigned uCacheCount)
+    {
+    m_uCacheCount = uCacheCount;
+
+    m_uNeighbor1 = new unsigned[m_uCacheCount];
+    m_uNeighbor2 = new unsigned[m_uCacheCount];
+    m_uNeighbor3 = new unsigned[m_uCacheCount];
+
+    m_Ids = new unsigned[m_uCacheCount];
+
+    m_dEdgeLength1 = new double[m_uCacheCount];
+    m_dEdgeLength2 = new double[m_uCacheCount];
+    m_dEdgeLength3 = new double[m_uCacheCount];
+    m_dHeight = new double[m_uCacheCount];
+
+    m_bHasEdgeLength1 = new bool[m_uCacheCount];
+    m_bHasEdgeLength2 = new bool[m_uCacheCount];
+    m_bHasEdgeLength3 = new bool[m_uCacheCount];
+    m_bHasHeight = new bool[m_uCacheCount];
+
+    m_ptrName = new char *[m_uCacheCount];
+
+    // Initialize exactly the entries that were allocated. The bound
+    // is the cache size rather than m_uNodeCount so that the loop
+    // can neither run past the arrays nor leave entries
+    // uninitialized if the two ever differ.
+    for (unsigned uNodeIndex = 0; uNodeIndex < m_uCacheCount; ++uNodeIndex)
+        {
+        m_uNeighbor1[uNodeIndex] = NULL_NEIGHBOR;
+        m_uNeighbor2[uNodeIndex] = NULL_NEIGHBOR;
+        m_uNeighbor3[uNodeIndex] = NULL_NEIGHBOR;
+        m_bHasEdgeLength1[uNodeIndex] = false;
+        m_bHasEdgeLength2[uNodeIndex] = false;
+        m_bHasEdgeLength3[uNodeIndex] = false;
+        m_bHasHeight[uNodeIndex] = false;
+        m_dEdgeLength1[uNodeIndex] = dInsane;
+        m_dEdgeLength2[uNodeIndex] = dInsane;
+        m_dEdgeLength3[uNodeIndex] = dInsane;
+        m_dHeight[uNodeIndex] = dInsane;
+        m_ptrName[uNodeIndex] = 0;
+        m_Ids[uNodeIndex] = uInsane;
+        }
+    }
+
+// Make this tree a copy of the cluster C, using the same node
+// indexes. Leaves take their name and id from C; each internal
+// node is linked to its left and right children with the lengths
+// C reports for those children.
+void Tree::FromClust(Clust &C)
+    {
+    Clear();
+
+    m_uNodeCount = C.GetNodeCount();
+    InitCache(m_uNodeCount);
+
+// Cluster is always rooted. An unrooted cluster
+// is represented by a pseudo-root, which we fix later.
+    m_bRooted = true;
+    const unsigned uClustRoot = C.GetRootNodeIndex();
+    m_uRootNodeIndex = uClustRoot;
+    // The root has no parent link.
+    m_uNeighbor1[uClustRoot] = NULL_NEIGHBOR;
+    m_bHasEdgeLength1[uClustRoot] = false;
+
+    for (unsigned uNode = 0; uNode < m_uNodeCount; ++uNode)
+        {
+        if (!C.IsLeaf(uNode))
+            {
+            const unsigned uChildL = C.GetLeftIndex(uNode);
+            const unsigned uChildR = C.GetRightIndex(uNode);
+
+            const double dLenL = C.GetLength(uChildL);
+            const double dLenR = C.GetLength(uChildR);
+
+            // Links from this node down to its children.
+            m_uNeighbor2[uNode] = uChildL;
+            m_uNeighbor3[uNode] = uChildR;
+
+            // Links from each child back up to this node.
+            m_dEdgeLength1[uChildL] = dLenL;
+            m_dEdgeLength1[uChildR] = dLenR;
+
+            m_uNeighbor1[uChildL] = uNode;
+            m_uNeighbor1[uChildR] = uNode;
+
+            m_bHasEdgeLength1[uChildL] = true;
+            m_bHasEdgeLength1[uChildR] = true;
+
+            m_dEdgeLength2[uNode] = dLenL;
+            m_dEdgeLength3[uNode] = dLenR;
+
+            m_bHasEdgeLength2[uNode] = true;
+            m_bHasEdgeLength3[uNode] = true;
+            }
+        else
+            {
+            const char *szLeafName = C.GetNodeName(uNode);
+            m_ptrName[uNode] = strsave(szLeafName);
+            m_Ids[uNode] = C.GetNodeId(uNode);
+            }
+        }
+    Validate();
+    }
Added: trunk/packages/muscle/branches/upstream/current/phyfromfile.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phyfromfile.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phyfromfile.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,269 @@
+#include "muscle.h"
+#include "tree.h"
+#include "textfile.h"
+
+#define TRACE 0
+
+// Tokens in Newick files are:
+// ( ) : , ;
+// string
+// 'string'
+// "string"
+// [ comment ]
+//
+// We can't safely distinguish between identifiers and floating point
+// numbers at the lexical level (because identifiers may be numeric,
+// or start with digits), so both edge lengths and identifiers are
+// returned as strings.
+
+// Return a printable name for a Newick token type ("??" if the value
+// is not one of the known enumerators).
+const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const
+    {
+    switch (NTT)
+        {
+    case NTT_Unknown:
+        return "Unknown";
+    case NTT_Lparen:
+        return "Lparen";
+    case NTT_Rparen:
+        return "Rparen";
+    case NTT_Colon:
+        return "Colon";
+    case NTT_Comma:
+        return "Comma";
+    case NTT_Semicolon:
+        return "Semicolon";
+    case NTT_String:
+        return "String";
+    case NTT_SingleQuotedString:
+        return "SingleQuotedString";
+    case NTT_DoubleQuotedString:
+        return "DoubleQuotedString";
+    case NTT_Comment:
+        return "Comment";
+        }
+    return "??";
+    }
+
+// Read the next Newick token from File.
+//
+// szToken receives the token text as a NUL-terminated string; uBytes is
+// the capacity of szToken. Single-character tokens ( ) : , ; are returned
+// with their own type. Unquoted and quoted strings are both returned as
+// NTT_String, without the surrounding quotes. Comments [...] are skipped
+// and the token that follows them is returned instead.
+//
+// If end-of-file is reached in the middle of a token the token type that
+// was in progress is returned (NTT_String, NTT_SingleQuotedString,
+// NTT_DoubleQuotedString or NTT_Comment).
+//
+// Quits if the token does not fit into szToken.
+NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const
+    {
+// Skip leading white space
+    File.SkipWhite();
+
+    char c;
+    File.GetCharX(c);
+
+// In case a single-character token
+    szToken[0] = c;
+    szToken[1] = 0;
+
+    unsigned uBytesCopied = 0;
+    NEWICK_TOKEN_TYPE TT;
+    switch (c)
+        {
+    case '(':
+        return NTT_Lparen;
+
+    case ')':
+        return NTT_Rparen;
+
+    case ':':
+        return NTT_Colon;
+
+    case ';':
+        return NTT_Semicolon;
+
+    case ',':
+        return NTT_Comma;
+
+    case '\'':
+        TT = NTT_SingleQuotedString;
+        File.GetCharX(c);
+    // Empty quoted string: the character just read is already the
+    // closing quote, so it must not be copied as token content.
+        if ('\'' == c)
+            {
+            szToken[0] = 0;
+            return NTT_String;
+            }
+        break;
+
+    case '"':
+        TT = NTT_DoubleQuotedString;
+        File.GetCharX(c);
+    // Same empty-string handling as for single quotes.
+        if ('"' == c)
+            {
+            szToken[0] = 0;
+            return NTT_String;
+            }
+        break;
+
+    case '[':
+        TT = NTT_Comment;
+        break;
+
+    default:
+        TT = NTT_String;
+        break;
+        }
+
+    for (;;)
+        {
+    // Comment text is discarded, everything else is accumulated.
+        if (TT != NTT_Comment)
+            {
+            if (uBytesCopied < uBytes - 2)
+                {
+                szToken[uBytesCopied++] = c;
+                szToken[uBytesCopied] = 0;
+                }
+            else
+                Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken);
+            }
+        bool bEof = File.GetChar(c);
+        if (bEof)
+            return TT;
+
+        switch (TT)
+            {
+        case NTT_String:
+        // A structural character ends the string and belongs to the
+        // next token, so it is pushed back.
+            if (0 != strchr("():;,", c))
+                {
+                File.PushBack(c);
+                return NTT_String;
+                }
+        // Cast: passing a negative char to isspace is undefined.
+            if (isspace((unsigned char) c))
+                return NTT_String;
+            break;
+
+        case NTT_SingleQuotedString:
+            if ('\'' == c)
+                return NTT_String;
+            break;
+
+        case NTT_DoubleQuotedString:
+            if ('"' == c)
+                return NTT_String;
+            break;
+
+        case NTT_Comment:
+        // End of comment: return whatever token comes next.
+            if (']' == c)
+                return GetToken(File, szToken, uBytes);
+            break;
+
+        default:
+            Quit("Tree::GetToken, invalid TT=%u", TT);
+            }
+        }
+    }
+
+// NOTE: this hack must come after definition of Tree::GetToken.
+#if TRACE
+#define GetToken GetTokenVerbose
+#endif
+
+// Read a tree in Newick format from File into this object.
+void Tree::FromFile(TextFile &File)
+    {
+// Start out rooted; converted on the fly if the file turns out
+// to describe an unrooted tree.
+    CreateRooted();
+
+    double dLength;
+    const bool bRootHasLength = GetGroupFromFile(File, 0, &dLength);
+
+// After the first group: ';' means rooted, ',' means unrooted
+// (a third sub-group follows).
+    char szNext[16];
+    const NEWICK_TOKEN_TYPE NextType = GetToken(File, szNext, sizeof(szNext));
+
+    if (NTT_Semicolon != NextType)
+        {
+        if (NTT_Comma != NextType)
+            Quit("Tree::FromFile, expected ';' or ',', got '%s'", szNext);
+
+        const unsigned uThird = UnrootFromFile();
+        if (GetGroupFromFile(File, uThird, &dLength))
+            SetEdgeLength(0, uThird, dLength);
+        }
+    else if (bRootHasLength)
+        Log(" *** Warning *** edge length on root group in Newick file %s\n",
+          File.GetFileName());
+
+    Validate();
+    }
+
+// Parse one Newick group from File and attach it at node uNodeIndex.
+// A group is either a leaf name or a parenthesised pair "(left,right)",
+// optionally followed by ":length".
+// Returns true if an edge length followed the group, in which case
+// *ptrdEdgeLength receives it (parsed with atof, so malformed numbers
+// are not diagnosed). Returns false otherwise; *ptrdEdgeLength is then
+// left untouched.
+// Quits on any unexpected token.
+bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex,
+ double *ptrdEdgeLength)
+ {
+ char szToken[1024];
+ NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken));
+
+// Group is either leaf name or (left, right).
+ if (NTT_String == NTT)
+ {
+ SetLeafName(uNodeIndex, szToken);
+#if TRACE
+ Log("Group is leaf '%s'\n", szToken);
+#endif
+ }
+ else if (NTT_Lparen == NTT)
+ {
+// NOTE(review): relies on AppendBranch creating the two children at
+// consecutive node indexes -- confirm against AppendBranch.
+ const unsigned uLeft = AppendBranch(uNodeIndex);
+ const unsigned uRight = uLeft + 1;
+
+ // Left sub-group...
+#if TRACE
+ Log("Got '(', group is compound, expect left sub-group\n");
+#endif
+ double dEdgeLength;
+ bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength);
+#if TRACE
+ if (bLeftLength)
+ Log("Edge length for left sub-group: %.3g\n", dEdgeLength);
+ else
+ Log("No edge length for left sub-group\n");
+#endif
+ if (bLeftLength)
+ SetEdgeLength(uNodeIndex, uLeft, dEdgeLength);
+
+ // ... then comma ...
+#if TRACE
+ Log("Expect comma\n");
+#endif
+ NTT = GetToken(File, szToken, sizeof(szToken));
+ if (NTT_Comma != NTT)
+ Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken);
+
+ // ...then right sub-group...
+#if TRACE
+ Log("Expect right sub-group\n");
+#endif
+ bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength);
+ if (bRightLength)
+ SetEdgeLength(uNodeIndex, uRight, dEdgeLength);
+#if TRACE
+ if (bRightLength)
+ Log("Edge length for right sub-group: %.3g\n", dEdgeLength);
+ else
+ Log("No edge length for right sub-group\n");
+#endif
+
+ // ... then closing parenthesis.
+#if TRACE
+ Log("Expect closing parenthesis (or comma if > 2-ary)\n");
+#endif
+ NTT = GetToken(File, szToken, sizeof(szToken));
+ if (NTT_Rparen == NTT)
+ ;
+ else if (NTT_Comma == NTT)
+ {
+// A comma after the second sub-group means a third sub-group follows.
+// The comma is handed back so that the caller (see Tree::FromFile) sees
+// it and reads the third group itself; no edge length is reported.
+ File.PushBack(',');
+ return false;
+ }
+ else
+ Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken);
+ }
+ else
+ Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'",
+ szToken);
+
+// Group may optionally be followed by edge length.
+// The character is read directly (not via GetToken) so that anything
+// other than ':' can be pushed back unchanged for the caller.
+ File.SkipWhite();
+ char c;
+ File.GetCharX(c);
+#if TRACE
+ Log("Character following group, could be colon, is '%c'\n", c);
+#endif
+ if (':' == c)
+ {
+ NTT = GetToken(File, szToken, sizeof(szToken));
+ if (NTT_String != NTT)
+ Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken);
+ *ptrdEdgeLength = atof(szToken);
+ return true;
+ }
+ File.PushBack(c);
+ return false;
+ }
Added: trunk/packages/muscle/branches/upstream/current/physeq.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/physeq.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/physeq.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,128 @@
+#include "muscle.h"
+#include "msa.h"
+#include "textfile.h"
+
+const int BLOCKSIZE = 60;
+
+// Map a character of a sequence name to one that is safe in a
+// PHYLIP file: Newick/PHYLIP punctuation and non-printable
+// characters become '_', everything else is returned unchanged.
+static char FixChar(char c)
+    {
+    switch (c)
+        {
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case ':':
+    case ';':
+    case ',':
+        return '_';
+        }
+// Cast: passing a negative char (e.g. a byte >= 0x80 where char is
+// signed) to isprint is undefined behaviour.
+    if (!isprint((unsigned char) c))
+        return '_';
+    return c;
+    }
+
+// Apply FixChar in place to every character of a NUL-terminated name.
+static void FixName(char Name[])
+    {
+    for (char *ptrChar = Name; 0 != *ptrChar; ++ptrChar)
+        *ptrChar = FixChar(*ptrChar);
+    }
+
+// Write the alignment in PHYLIP sequential format.
+// Header line is "<seq count> <column count>". Each sequence is written
+// in full before the next one: a name truncated/padded to 10 characters,
+// then the residues in lines of BLOCKSIZE columns (the first line holds
+// BLOCKSIZE - 10 because of the name), grouped in tens separated by a
+// space. Letters are upper-cased.
+void MSA::ToPhySequentialFile(TextFile &File) const
+    {
+    const unsigned SeqCount = GetSeqCount();
+    const unsigned ColCount = GetColCount();
+
+// %u: both counts are unsigned.
+    File.PutFormat("%u %u\n", SeqCount, ColCount);
+
+    if (0 == ColCount)
+        return;
+
+    for (unsigned Seq = 0; Seq < SeqCount; ++Seq)
+        {
+    // Name field: at most 10 characters, cleaned by FixName.
+        char Name[11];
+        const char *ptrName = GetSeqName(Seq);
+        size_t n = strlen(ptrName);
+        if (n > 10)
+            n = 10;
+        memcpy(Name, ptrName, n);
+        Name[n] = 0;
+        FixName(Name);
+        File.PutFormat("%-10.10s", Name);
+
+    // Unsigned counters avoid signed/unsigned comparison with ColCount.
+        unsigned BlockIndex = 0;
+        unsigned Col = 0;
+        for (;;)
+            {
+            const unsigned MaxCols = (BlockIndex == 0) ? (BLOCKSIZE - 10) : BLOCKSIZE;
+            for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock)
+                {
+                if (Col == ColCount)
+                    break;
+            // Space before every group of ten, except at the very
+            // start of a continuation line.
+                if (ColsThisBlock%10 == 0 && (BlockIndex == 0 || ColsThisBlock > 0))
+                    File.PutChar(' ');
+                char c = GetChar(Seq, Col);
+            // ctype functions need an unsigned char value.
+                if (isalpha((unsigned char) c))
+                    c = (char) toupper((unsigned char) c);
+                File.PutChar(c);
+                ++Col;
+                }
+            File.PutChar('\n');
+            if (Col == ColCount)
+                break;
+            ++BlockIndex;
+            }
+        }
+    }
+
+// Write the alignment in PHYLIP interleaved format.
+// Header line is "<seq count> <column count>". Output is a series of
+// blocks separated by a blank line; each block holds one line per
+// sequence. Only the first block carries the 10-character names and
+// therefore holds BLOCKSIZE - 10 columns, later blocks hold BLOCKSIZE.
+// Residues are grouped in tens and letters are upper-cased.
+void MSA::ToPhyInterleavedFile(TextFile &File) const
+    {
+    const unsigned SeqCount = GetSeqCount();
+    const unsigned ColCount = GetColCount();
+
+// %u: both counts are unsigned.
+    File.PutFormat("%u %u\n", SeqCount, ColCount);
+
+// With no sequences the column counter below would never advance and
+// the block loop would not terminate, so stop here as well.
+    if (0 == ColCount || 0 == SeqCount)
+        return;
+
+    unsigned Col = 0;
+    for (;;)
+        {
+        const unsigned ColBlockStart = Col;
+        const unsigned MaxCols = (ColBlockStart == 0) ? (BLOCKSIZE - 10) : BLOCKSIZE;
+
+        for (unsigned Seq = 0; Seq < SeqCount; ++Seq)
+            {
+            if (0 == ColBlockStart)
+                {
+            // Name field: at most 10 characters, cleaned by FixName.
+                char Name[11];
+                const char *ptrName = GetSeqName(Seq);
+                size_t n = strlen(ptrName);
+                if (n > 10)
+                    n = 10;
+                memcpy(Name, ptrName, n);
+                Name[n] = 0;
+                FixName(Name);
+                File.PutFormat("%-10.10s", Name);
+                }
+
+        // Every sequence restarts at the first column of this block.
+            Col = ColBlockStart;
+            for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock)
+                {
+                if (Col == ColCount)
+                    break;
+                if (ColsThisBlock%10 == 0 && (0 == ColBlockStart || ColsThisBlock > 0))
+                    File.PutChar(' ');
+                char c = GetChar(Seq, Col);
+            // ctype functions need an unsigned char value.
+                if (isalpha((unsigned char) c))
+                    c = (char) toupper((unsigned char) c);
+                File.PutChar(c);
+                ++Col;
+                }
+            File.PutChar('\n');
+            }
+        if (Col == ColCount)
+            break;
+        File.PutChar('\n');
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/phytofile.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/phytofile.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/phytofile.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,86 @@
+#include "muscle.h"
+#include "tree.h"
+#include "textfile.h"
+
+// Return the lowest-indexed node that is not a leaf, or NULL_NEIGHBOR
+// if every node is a leaf (or the tree is empty).
+unsigned Tree::GetAnyNonLeafNode() const
+    {
+    unsigned uNode = 0;
+    while (uNode < m_uNodeCount)
+        {
+        if (!IsLeaf(uNode))
+            return uNode;
+        ++uNode;
+        }
+    return NULL_NEIGHBOR;
+    }
+
+// Write the tree to File in Newick format.
+// A rooted tree is written starting from the root. An unrooted tree is
+// written as a three-way group around an arbitrary internal node.
+void Tree::ToFile(TextFile &File) const
+    {
+    if (IsRooted())
+        {
+        ToFileNodeRooted(File, m_uRootNodeIndex);
+        File.PutString(";\n");
+        return;
+        }
+
+// Unrooted.
+    const unsigned uNodeIndex = GetAnyNonLeafNode();
+
+// GetAnyNonLeafNode returns NULL_NEIGHBOR when there is no internal
+// node; using that as an array index below would read out of bounds.
+    if (NULL_NEIGHBOR == uNodeIndex)
+        Quit("Tree::ToFile, unrooted tree has no internal node");
+
+    File.PutString("(\n");
+    ToFileNodeUnrooted(File, m_uNeighbor1[uNodeIndex], uNodeIndex);
+    File.PutString(",\n");
+    ToFileNodeUnrooted(File, m_uNeighbor2[uNodeIndex], uNodeIndex);
+    File.PutString(",\n");
+    ToFileNodeUnrooted(File, m_uNeighbor3[uNodeIndex], uNodeIndex);
+    File.PutString(");\n");
+    }
+
+// Write the subtree seen from uParent through uNodeIndex of an unrooted
+// tree: a leaf name, or "(first,second)" for an internal node, followed
+// by ":length" when the edge to uParent has a length.
+void Tree::ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const
+    {
+    assert(!IsRooted());
+
+    if (IsLeaf(uNodeIndex))
+        File.PutString(GetName(uNodeIndex));
+    else
+        {
+        File.PutString("(\n");
+        ToFileNodeUnrooted(File, GetFirstNeighbor(uNodeIndex, uParent), uNodeIndex);
+        File.PutString(",\n");
+        ToFileNodeUnrooted(File, GetSecondNeighbor(uNodeIndex, uParent), uNodeIndex);
+        File.PutString(")");
+        }
+
+    if (HasEdgeLength(uNodeIndex, uParent))
+        File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent));
+    File.PutString("\n");
+    }
+
+// Write the subtree below uNodeIndex of a rooted tree. Internal nodes
+// and the root are wrapped in parentheses; every node except the root
+// is followed by ":length" when the edge to its parent has a length.
+void Tree::ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const
+    {
+    assert(IsRooted());
+
+    const bool bLeaf = IsLeaf(uNodeIndex);
+    const bool bRoot = IsRoot(uNodeIndex);
+    const bool bParens = bRoot || !bLeaf;
+
+    if (bParens)
+        File.PutString("(\n");
+
+    if (bLeaf)
+        File.PutString(GetName(uNodeIndex));
+    else
+        {
+        ToFileNodeRooted(File, GetLeft(uNodeIndex));
+        File.PutString(",\n");
+        ToFileNodeRooted(File, GetRight(uNodeIndex));
+        }
+
+    if (bParens)
+        File.PutString(")");
+
+    if (!bRoot)
+        {
+        const unsigned uUp = GetParent(uNodeIndex);
+        if (HasEdgeLength(uNodeIndex, uUp))
+            File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uUp));
+        }
+    File.PutString("\n");
+    }
Added: trunk/packages/muscle/branches/upstream/current/posgap.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/posgap.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/posgap.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,141 @@
+#include "muscle.h"
+
+// Pascarella and Argos gap factors,
+// after Table 1 in Thompson et al., ClustalW NAR paper.
+// One factor per amino acid letter index; PAFactor() averages them
+// weighted by the per-letter counts of a profile position.
+// NOTE(review): the last two entries are labelled Y then W; confirm that
+// this matches the letter indexing used by the callers.
+static double PAFFacs[20] =
+ {
+ 1.13, // A
+ 1.13, // C
+ 0.96, // D
+ 1.31, // E
+ 1.20, // F
+ 0.61, // G
+ 1.00, // H
+ 1.32, // I
+ 0.96, // K
+ 1.21, // L
+ 1.29, // M
+ 0.62, // N
+ 0.74, // P
+ 1.07, // Q
+ 0.72, // R
+ 0.76, // S
+ 0.89, // T
+ 1.25, // V
+ 1.00, // Y
+ 1.23, // W
+ };
+
+// (Not used: does not appear to work well).
+// Count-weighted mean of the PAFFacs table over the 20 amino acid
+// letters of one profile position. Returns 0.5 when all counts are zero.
+// Quits unless the current alphabet is amino acid.
+SCORE PAFactor(const FCOUNT fcCounts[])
+    {
+    if (ALPHA_Amino != g_Alpha)
+        Quit("PAFFactor: requires amino acid sequence");
+
+    double dWeightedSum = 0;
+    FCOUNT fTotal = 0;
+    for (unsigned i = 0; i < 20; ++i)
+        {
+        const FCOUNT fThis = fcCounts[i];
+        dWeightedSum += fThis*PAFFacs[i];
+        fTotal += fThis;
+        }
+
+    if (0 == fTotal)
+        return 0.5;
+    return (SCORE) (dWeightedSum/fTotal);
+    }
+
+// Per-letter flag: true if the amino acid is treated as hydrophilic.
+// Indexed by letter; used by both IsHydrophilic() overloads.
+// NOTE(review): the last two entries are labelled Y then W; confirm that
+// this matches the letter indexing used by the callers.
+static bool Hydrophilic[20] =
+ {
+ false, // A
+ false, // C
+ true, // D
+ true, // E
+ false, // F
+ true, // G
+ false, // H
+ false, // I
+ true, // K
+ false, // L
+ false, // M
+ true, // N
+ true, // P
+ true, // Q
+ true, // R
+ true, // S
+ false, // T
+ false, // V
+ false, // Y
+ false, // W
+ };
+
+// True if every letter with a positive fractional count is flagged
+// hydrophilic (vacuously true when all counts are zero).
+// Quits unless the current alphabet is amino acid.
+bool IsHydrophilic(const FCOUNT fcCounts[])
+    {
+    if (ALPHA_Amino != g_Alpha)
+        Quit("IsHydrophilic: requires amino acid sequence");
+
+    for (unsigned i = 0; i < 20; ++i)
+        {
+        if (Hydrophilic[i])
+            continue;
+        if (fcCounts[i] > 0)
+            return false;
+        }
+    return true;
+    }
+
+// Integer-count overload: true if every letter that occurs at least
+// once is flagged hydrophilic.
+// Quits unless the current alphabet is amino acid.
+bool IsHydrophilic(const unsigned uCounts[])
+    {
+    if (ALPHA_Amino != g_Alpha)
+        Quit("IsHydrophilic: requires amino acid sequence");
+
+    for (unsigned i = 0; i < 20; ++i)
+        {
+        if (Hydrophilic[i])
+            continue;
+        if (uCounts[i] > 0)
+            return false;
+        }
+    return true;
+    }
+
+// LIVCATMFYWHK
+// Venn Pascarella B&T Me
+// L y y y
+// I y y y
+// V y y y
+// C y n
+// A y y y
+// T N n
+// M y y y
+// F y y y
+// Y n n
+// W y n
+// H n n
+// K n n
+// Per-letter flag: true if the amino acid is treated as hydrophobic.
+// Indexed by letter; used by IsHydrophobic().
+// The entries set to true are A C F H I L M T V Y W.
+// NOTE(review): the last two entries are labelled Y then W; confirm that
+// this matches the letter indexing used by the callers.
+static bool Hydrophobic[20] =
+ {
+ true, // A
+ true, // C
+ false, // D
+ false, // E
+ true, // F
+ false, // G
+ true, // H
+ true, // I
+ false, // K
+ true, // L
+ true, // M
+ false, // N
+ false, // P
+ false, // Q
+ false, // R
+ false, // S
+ true, // T
+ true, // V
+ true, // Y
+ true, // W
+ };
+
+// True if every letter with a positive fractional count is flagged
+// hydrophobic (vacuously true when all counts are zero).
+// Quits unless the current alphabet is amino acid.
+bool IsHydrophobic(const FCOUNT fcCounts[])
+    {
+    if (ALPHA_Amino != g_Alpha)
+        Quit("IsHydrophobic: requires amino acid sequence");
+
+    for (unsigned i = 0; i < 20; ++i)
+        {
+        if (Hydrophobic[i])
+            continue;
+        if (fcCounts[i] > 0.0)
+            return false;
+        }
+    return true;
+    }
Added: trunk/packages/muscle/branches/upstream/current/ppscore.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/ppscore.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/ppscore.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,93 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "objscore.h"
+
+bool g_bTracePPScore = false;
+MSA *g_ptrPPScoreMSA1 = 0;
+MSA *g_ptrPPScoreMSA2 = 0;
+
+// Number the sequences of msa 0..N-1, build a tree for it, install
+// that tree as the current MUSCLE tree and return the profile of msa.
+// The returned array comes from ProfileFromMSA.
+static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree)
+    {
+    const unsigned uCount = msa.GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uCount)
+        {
+        msa.SetSeqId(uSeq, uSeq);
+        ++uSeq;
+        }
+
+    TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1);
+    SetMuscleTree(tree);
+
+    ProfPos *Prof = ProfileFromMSA(msa);
+    return Prof;
+    }
+
+// Implements the -ppscore command: read two alignments of equal length
+// from the files given by -in1 and -in2, build a profile for each and
+// report the profile-profile objective score to the log and to stdout.
+void PPScore()
+    {
+    if (0 == g_pstrFileName1 || 0 == g_pstrFileName2)
+        Quit("-ppscore needs -in1 and -in2");
+
+    SetSeqWeightMethod(g_SeqWeight1);
+
+    TextFile file1(g_pstrFileName1);
+    TextFile file2(g_pstrFileName2);
+
+    MSA msa1;
+    MSA msa2;
+
+    msa1.FromFile(file1);
+    msa2.FromFile(file2);
+
+    const unsigned uLength1 = msa1.GetColCount();
+    const unsigned uLength2 = msa2.GetColCount();
+
+    if (uLength1 != uLength2)
+        Quit("Profiles must have the same length");
+
+// Choose the alphabet: forced by the sequence type option, or guessed
+// from the first alignment.
+    ALPHA Alpha = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alpha = msa1.GuessAlpha();
+        break;
+
+    case SEQTYPE_Protein:
+        Alpha = ALPHA_Amino;
+        break;
+
+    case SEQTYPE_DNA:
+        Alpha = ALPHA_DNA;
+        break;
+
+    case SEQTYPE_RNA:
+        Alpha = ALPHA_RNA;
+        break;
+
+    default:
+        Quit("Invalid SeqType");
+        }
+    SetAlpha(Alpha);
+
+    msa1.FixAlpha();
+    msa2.FixAlpha();
+
+    if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
+        SetPPScore(PPSCORE_SPN);
+
+    const unsigned uSeqCount1 = msa1.GetSeqCount();
+    const unsigned uSeqCount2 = msa2.GetSeqCount();
+    const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? uSeqCount1 : uSeqCount2);
+    MSA::SetIdCount(uMaxSeqCount);
+
+    Tree tree1;
+    Tree tree2;
+    ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1);
+    ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2);
+
+// Globals consulted by the scoring code for tracing.
+// NOTE(review): these keep pointing at msa1/msa2 after this function
+// returns -- confirm nothing reads them later.
+    g_bTracePPScore = true;
+    g_ptrPPScoreMSA1 = &msa1;
+    g_ptrPPScoreMSA2 = &msa2;
+
+    SCORE Score = ObjScoreDP_Profs(Prof1, Prof2, uLength1);
+
+    Log("Score=%.4g\n", Score);
+    printf("Score=%.4g\n", Score);
+
+// The profiles are allocated with new[] by ProfileFromMSA and are
+// owned by this function.
+    delete[] Prof1;
+    delete[] Prof2;
+    }
Added: trunk/packages/muscle/branches/upstream/current/profdb.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/profdb.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/profdb.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,54 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "seqvect.h"
+#include "distfunc.h"
+#include "msa.h"
+#include "tree.h"
+#include "clust.h"
+#include "profile.h"
+#include "clustsetmsa.h"
+
+// Align every sequence of a FASTA database (second input file) to a
+// profile alignment (first input file), one sequence at a time, and
+// write the resulting alignment to the output file.
+void ProfDB()
+    {
+    SetOutputFileName(g_pstrOutFileName);
+    SetInputFileName(g_pstrFileName2);
+    SetStartTime();
+
+// The profile file is opened once only; the original opened it twice
+// and never used the first handle.
+    TextFile file1(g_pstrFileName1);
+    TextFile file2(g_pstrFileName2);
+
+    SetMaxIters(g_uMaxIters);
+    SetSeqWeightMethod(g_SeqWeight1);
+
+    MSA msa1;
+    msa1.FromFile(file1);
+
+    const unsigned uSeqCount1 = msa1.GetSeqCount();
+    if (0 == uSeqCount1)
+        Quit("No sequences in input alignment");
+
+    SeqVect v;
+    v.FromFASTAFile(file2);
+    const unsigned uSeqCount2 = v.Length();
+    if (0 == uSeqCount2)
+        Quit("No sequences in input sequence file");
+
+    MSA::SetIdCount(uSeqCount1 + uSeqCount2);
+    SetProgressDesc("Align sequence database to profile");
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount2; ++uSeqIndex)
+        {
+        Progress(uSeqIndex, uSeqCount2);
+        Seq &s = *(v[uSeqIndex]);
+        s.SetId(0);
+
+    // Wrap the sequence in a one-row alignment, align it to the
+    // growing profile and carry the result into the next round.
+        MSA msaTmp;
+        msaTmp.FromSeq(s);
+        MSA msaOut;
+        ProfileProfile(msa1, msaTmp, msaOut);
+        msa1.Copy(msaOut);
+        }
+    ProgressStepsDone();
+
+    TextFile fileOut(g_pstrOutFileName, true);
+    msa1.ToFile(fileOut);
+    }
Added: trunk/packages/muscle/branches/upstream/current/profile.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/profile.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/profile.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,180 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "objscore.h"
+
+// Number the sequences of msa 0..N-1, build a tree for it, install
+// that tree as the current MUSCLE tree and return the profile of msa.
+// The returned array comes from ProfileFromMSA.
+static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree)
+    {
+    const unsigned uCount = msa.GetSeqCount();
+    unsigned uSeq = 0;
+    while (uSeq < uCount)
+        {
+        msa.SetSeqId(uSeq, uSeq);
+        ++uSeq;
+        }
+
+    TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1);
+    SetMuscleTree(tree);
+
+    ProfPos *Prof = ProfileFromMSA(msa);
+    return Prof;
+    }
+
+// Align two alignments to each other, keeping the columns of each
+// intact, and store the combined alignment in msaOut.
+// Sets the global alphabet (and, for nucleotides, the profile score
+// function) as a side effect, and renumbers the sequence ids of msa1
+// and msa2.
+void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut)
+    {
+    ALPHA Alpha = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alpha = msa1.GuessAlpha();
+        break;
+
+    case SEQTYPE_Protein:
+        Alpha = ALPHA_Amino;
+        break;
+
+    case SEQTYPE_DNA:
+        Alpha = ALPHA_DNA;
+        break;
+
+    case SEQTYPE_RNA:
+        Alpha = ALPHA_RNA;
+        break;
+
+    default:
+        Quit("Invalid SeqType");
+        }
+    SetAlpha(Alpha);
+
+    msa1.FixAlpha();
+    msa2.FixAlpha();
+
+    if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
+        SetPPScore(PPSCORE_SPN);
+
+    const unsigned uLength1 = msa1.GetColCount();
+    const unsigned uLength2 = msa2.GetColCount();
+
+    Tree tree1;
+    Tree tree2;
+    ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1);
+    ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2);
+
+    PWPath Path;
+    ProfPos *ProfOut;
+    unsigned uLengthOut;
+    AlignTwoProfs(Prof1, uLength1, 1.0, Prof2, uLength2, 1.0, Path, &ProfOut, &uLengthOut);
+
+    AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);
+
+// The input profiles are allocated with new[] by ProfileFromMSA and
+// are no longer needed; without this they leak on every call.
+// NOTE(review): ProfOut is presumably allocated by AlignTwoProfs and
+// is also never released here -- confirm its ownership.
+    delete[] Prof1;
+    delete[] Prof2;
+    }
+
+// Implements the -profile command: read the two alignments named by
+// -in1 and -in2, align them to each other and write the result.
+void Profile()
+    {
+    if (0 == g_pstrFileName1 || 0 == g_pstrFileName2)
+        Quit("-profile needs -in1 and -in2");
+
+    SetSeqWeightMethod(g_SeqWeight1);
+
+    TextFile file1(g_pstrFileName1);
+    TextFile file2(g_pstrFileName2);
+
+    MSA msa1;
+    MSA msa2;
+    msa1.FromFile(file1);
+    msa2.FromFile(file2);
+
+// Alphabet is forced by the sequence type option, or guessed from
+// the first alignment.
+    ALPHA Alpha = ALPHA_Undefined;
+    switch (g_SeqType)
+        {
+    case SEQTYPE_Auto:
+        Alpha = msa1.GuessAlpha();
+        break;
+    case SEQTYPE_Protein:
+        Alpha = ALPHA_Amino;
+        break;
+    case SEQTYPE_DNA:
+        Alpha = ALPHA_DNA;
+        break;
+    case SEQTYPE_RNA:
+        Alpha = ALPHA_RNA;
+        break;
+    default:
+        Quit("Invalid seq type");
+        }
+    SetAlpha(Alpha);
+    msa1.FixAlpha();
+    msa2.FixAlpha();
+    SetPPScore();
+
+// Ids must be able to cover the sequences of both inputs together.
+    const unsigned uCount1 = msa1.GetSeqCount();
+    const unsigned uCount2 = msa2.GetSeqCount();
+    MSA::SetIdCount(uCount1 + uCount2);
+
+    MSA msaOut;
+    ProfileProfile(msa1, msa2, msaOut);
+    MuscleOutput(msaOut);
+    }
Added: trunk/packages/muscle/branches/upstream/current/profile.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/profile.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/profile.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,127 @@
+#ifndef FastProf2_h
+#define FastProf2_h
+
+#include "msa.h"
+#include "pwpath.h"
+#include <math.h> // for log function
+
+class DiagList;
+class WeightList;
+
+// One column (position) of a profile, as filled in by ProfileFromMSA.
+struct ProfPos
+ {
+// True if the alignment column consists of gaps only.
+ bool m_bAllGaps;
+// Letter indexes ordered by decreasing count (see SortCounts).
+ unsigned m_uSortOrder[21];
+// Fractional weighted count of each letter in the column.
+ FCOUNT m_fcCounts[20];
+// Four counts returned together by MSA::GetFractionalWeightedCounts.
+// NOTE(review): presumably letter/gap transition frequencies
+// (letter-letter, letter-gap, gap-letter, gap-gap) -- confirm.
+ FCOUNT m_LL;
+ FCOUNT m_LG;
+ FCOUNT m_GL;
+ FCOUNT m_GG;
+// m_AAScores[i] = sum over j of m_fcCounts[j] * ScoreMatrix[i][j].
+ SCORE m_AAScores[20];
+// Residue group shared by all letters present in the column, or
+// RESIDUE_GROUP_MULTIPLE.
+ unsigned m_uResidueGroup;
+// Occupancy of the column.
+ FCOUNT m_fOcc;
+// 1 - (gap-start fraction) and 1 - (gap-end fraction).
+ FCOUNT m_fcStartOcc;
+ FCOUNT m_fcEndOcc;
+// Position-specific gap penalties: start/end occupancy times half of
+// the global gap-open score.
+ SCORE m_scoreGapOpen;
+ SCORE m_scoreGapClose;
+#if DOUBLE_AFFINE
+// Same, derived from the second gap-open score.
+ SCORE m_scoreGapOpen2;
+ SCORE m_scoreGapClose2;
+#endif
+// SCORE m_scoreGapExtend;
+ };
+
+// Per-tree-node state of progressive alignment: the alignment and
+// profile built at the node, the path that produced it and the edit
+// strings for the left and right children.
+struct ProgNode
+    {
+// All scalar members start in a defined state. The original left
+// m_uLength and m_Weight uninitialised.
+    ProgNode()
+        {
+        m_Prof = 0;
+        m_EstringL = 0;
+        m_EstringR = 0;
+        m_uLength = 0;
+        m_Weight = 0;
+        }
+    MSA m_MSA;
+    ProfPos *m_Prof;
+    PWPath m_Path;
+    short *m_EstringL;
+    short *m_EstringR;
+    unsigned m_uLength;
+    WEIGHT m_Weight;
+    };
+
+extern unsigned ResidueGroup[];
+const unsigned RESIDUE_GROUP_MULTIPLE = (unsigned) ~0;
+
+extern PTR_SCOREMATRIX g_ptrScoreMatrix;
+
+ProfPos *ProfileFromMSA(const MSA &a);
+
+SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ PWPath &Path);
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, PWPath &Path);
+void ProgressiveAlign(const SeqVect &v, const Tree &tree, MSA &a);
+SCORE MSAPairSP(const MSA &msa1, const MSA &msa2);
+
+void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB,
+ MSA &msaCombined);
+
+void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA = 0);
+SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB);
+SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA,
+ const ProfPos *PB, unsigned uLengthB, const PWPath &Path);
+bool IsHydrophilic(const FCOUNT fcCounts[]);
+int PAM200_Letter(unsigned uLetter1, unsigned uLetter2);
+SCORE AverageMatchScore(const PWPath &Path, unsigned uEdgeIndex,
+ unsigned uWindowLength);
+void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength,
+ SCORE SmoothScore[], double dCeil = 9e29);
+SCORE FastScoreMSA_LA(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE FastScoreMSA_NS(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE FastScoreMSA_SP(const MSA &msa, SCORE MatchScore[] = 0);
+bool RefineMSA(MSA &msa, const Tree &tree);
+SCORE MSAQScore(const MSA &msa, SCORE MatchScore[] = 0);
+bool RefineBiParts(MSA &msa, const Tree &tree, bool R);
+void FindAnchorCols(const MSA &msa, unsigned AnchorCols[],
+ unsigned *ptruAnchorColCount);
+double PctIdToHeight(double dPctId);
+double PctIdToHeightKimura(double dPctId);
+double PctIdToHeightMAFFT(double dPctId);
+double PctIdToMAFFTDist(double dPctId);
+bool RefineBlocks(MSA &msa, const Tree &tree);
+bool RefineSubfams(MSA &msaIn, const Tree &tree, unsigned uIters);
+void SetMuscleTree(const Tree &tree);
+void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]);
+void RealignDiffs(const MSA &msaIn, const Tree &Diffs,
+ const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut);
+void RealignDiffsE(const MSA &msaIn, const SeqVect &v,
+ const Tree &NewTree, const Tree &OldTree,
+ const unsigned uNewNodeIndexToOldNodeIndex[],
+ MSA &msaOut, ProgNode *OldProgNodes);
+void RefineTree(MSA &msa, Tree &tree);
+void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes);
+void SetScoreMatrix();
+extern bool IsHydrophobic(const FCOUNT fcCounts[]);
+void Hydro(ProfPos *Prof, unsigned uLength);
+void SetTermGaps(const ProfPos *Prof, unsigned uLength);
+
+// Macros to simulate 2D matrices on top of flat arrays.
+// Each macro X(PLA, PLB) indexes the array X_ at (PLB)*uPrefixCountA + (PLA),
+// so consecutive PLA values are adjacent in memory. The code using a macro
+// must have both the array (DPM_, TBM_, ...) and a variable named
+// uPrefixCountA in scope.
+#define DPL(PLA, PLB) DPL_[(PLB)*uPrefixCountA + (PLA)]
+#define DPM(PLA, PLB) DPM_[(PLB)*uPrefixCountA + (PLA)]
+#define DPD(PLA, PLB) DPD_[(PLB)*uPrefixCountA + (PLA)]
+#define DPE(PLA, PLB) DPE_[(PLB)*uPrefixCountA + (PLA)]
+#define DPI(PLA, PLB) DPI_[(PLB)*uPrefixCountA + (PLA)]
+#define DPJ(PLA, PLB) DPJ_[(PLB)*uPrefixCountA + (PLA)]
+#define DPU(PLA, PLB) DPU_[(PLB)*uPrefixCountA + (PLA)]
+#define TBM(PLA, PLB) TBM_[(PLB)*uPrefixCountA + (PLA)]
+#define TBD(PLA, PLB) TBD_[(PLB)*uPrefixCountA + (PLA)]
+#define TBE(PLA, PLB) TBE_[(PLB)*uPrefixCountA + (PLA)]
+#define TBI(PLA, PLB) TBI_[(PLB)*uPrefixCountA + (PLA)]
+#define TBJ(PLA, PLB) TBJ_[(PLB)*uPrefixCountA + (PLA)]
+
+SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB);
+SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB);
+SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB);
+SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB);
+
+#endif // FastProf2_h
Added: trunk/packages/muscle/branches/upstream/current/profilefrommsa.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/profilefrommsa.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/profilefrommsa.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,318 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+
+#define TRACE 0
+
+// Log a fractional count with three decimals; values within 1e-5 of
+// zero are logged as blank filler instead.
+static void LogF(FCOUNT f)
+    {
+    const bool bNearZero = (f > -0.00001 && f < 0.00001);
+    if (bNearZero)
+        {
+        Log(" ");
+        return;
+        }
+    Log(" %5.3f", f);
+    }
+
+// Format a score for logging. Scores outside [-1e10, 1e10] are shown
+// as a star. The result points to a static buffer that is overwritten
+// by the next call.
+static const char *LocalScoreToStr(SCORE s)
+    {
+    static char szBuffer[16];
+    const bool bOutOfRange = (s < -1e10 || s > 1e10);
+    if (bOutOfRange)
+        return " *";
+    sprintf(szBuffer, "%5.1f", s);
+    return szBuffer;
+    }
+
+#if DOUBLE_AFFINE
+// Log a profile as two tables (double-affine build): first the
+// occupancy, transition counts and gap scores of every position, then
+// the residue group and per-letter counts of every position. If ptrMSA
+// is given, the corresponding alignment column is appended to each row.
+void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA)
+    {
+    Log(" Pos Occ LL LG GL GG Open Close Open2 Clos2\n");
+    Log(" --- --- -- -- -- -- ---- ----- ----- -----\n");
+    for (unsigned n = 0; n < uLength; ++n)
+        {
+        const ProfPos &PP = Prof[n];
+        Log("%5u", n);
+        LogF(PP.m_fOcc);
+        LogF(PP.m_LL);
+        LogF(PP.m_LG);
+        LogF(PP.m_GL);
+        LogF(PP.m_GG);
+    // One Log call per score: LocalScoreToStr returns a static buffer.
+        Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen));
+        Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose));
+        Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen2));
+        Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose2));
+        if (0 != ptrMSA)
+            {
+            const unsigned uSeqCount = ptrMSA->GetSeqCount();
+            Log(" ");
+            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+                Log("%c", ptrMSA->GetChar(uSeqIndex, n));
+            }
+        Log("\n");
+        }
+
+    Log("\n");
+    Log(" Pos G");
+    for (unsigned n = 0; n < g_AlphaSize; ++n)
+        Log(" %c", LetterExToChar(n));
+    Log("\n");
+    Log(" --- -");
+    for (unsigned n = 0; n < g_AlphaSize; ++n)
+        Log(" -----");
+    Log("\n");
+
+    for (unsigned n = 0; n < uLength; ++n)
+        {
+        const ProfPos &PP = Prof[n];
+        Log("%5u", n);
+    // Compare against the named unsigned constant rather than -1,
+    // pass no stray argument, and print the unsigned with %u.
+        if (RESIDUE_GROUP_MULTIPLE == PP.m_uResidueGroup)
+            Log(" -");
+        else
+            Log(" %u", PP.m_uResidueGroup);
+
+        for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+            {
+            FCOUNT f = PP.m_fcCounts[uLetter];
+            if (f == 0.0)
+                Log(" ");
+            else
+                Log(" %5.3f", f);
+            }
+        if (0 != ptrMSA)
+            {
+            const unsigned uSeqCount = ptrMSA->GetSeqCount();
+            Log(" ");
+            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+                Log("%c", ptrMSA->GetChar(uSeqIndex, n));
+            }
+        Log("\n");
+        }
+    }
+#endif // DOUBLE_AFFINE
+
+#if SINGLE_AFFINE
+// Log a profile as two tables (single-affine build): first the
+// occupancy, transition counts and gap scores of every position, then
+// the residue group and per-letter counts of every position. If ptrMSA
+// is given, the corresponding alignment column is appended to each row.
+void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA)
+    {
+    Log(" Pos Occ LL LG GL GG Open Close\n");
+    Log(" --- --- -- -- -- -- ---- -----\n");
+    for (unsigned n = 0; n < uLength; ++n)
+        {
+        const ProfPos &PP = Prof[n];
+        Log("%5u", n);
+        LogF(PP.m_fOcc);
+        LogF(PP.m_LL);
+        LogF(PP.m_LG);
+        LogF(PP.m_GL);
+        LogF(PP.m_GG);
+        Log(" %5.1f", -PP.m_scoreGapOpen);
+        Log(" %5.1f", -PP.m_scoreGapClose);
+        if (0 != ptrMSA)
+            {
+            const unsigned uSeqCount = ptrMSA->GetSeqCount();
+            Log(" ");
+            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+                Log("%c", ptrMSA->GetChar(uSeqIndex, n));
+            }
+        Log("\n");
+        }
+
+    Log("\n");
+    Log(" Pos G");
+    for (unsigned n = 0; n < g_AlphaSize; ++n)
+        Log(" %c", LetterExToChar(n));
+    Log("\n");
+    Log(" --- -");
+    for (unsigned n = 0; n < g_AlphaSize; ++n)
+        Log(" -----");
+    Log("\n");
+
+    for (unsigned n = 0; n < uLength; ++n)
+        {
+        const ProfPos &PP = Prof[n];
+        Log("%5u", n);
+    // Compare against the named unsigned constant rather than -1,
+    // pass no stray argument, and print the unsigned with %u.
+        if (RESIDUE_GROUP_MULTIPLE == PP.m_uResidueGroup)
+            Log(" -");
+        else
+            Log(" %u", PP.m_uResidueGroup);
+
+        for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+            {
+            FCOUNT f = PP.m_fcCounts[uLetter];
+            if (f == 0.0)
+                Log(" ");
+            else
+                Log(" %5.3f", f);
+            }
+        if (0 != ptrMSA)
+            {
+            const unsigned uSeqCount = ptrMSA->GetSeqCount();
+            Log(" ");
+            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+                Log("%c", ptrMSA->GetChar(uSeqIndex, n));
+            }
+        Log("\n");
+        }
+    }
+#endif
+
+// Fill SortOrder[0..g_AlphaSize-1] with letter indexes ordered by
+// decreasing count. Letters with equal counts keep ascending index
+// order (adjacent exchange only on strict "less than").
+void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[])
+    {
+    static unsigned InitialSortOrder[MAX_ALPHA] =
+        {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+        };
+    memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned));
+
+// Repeat passes until one completes without an exchange.
+    for (bool bSwapped = true; bSwapped; )
+        {
+        bSwapped = false;
+        for (unsigned k = 0; k < g_AlphaSize - 1; ++k)
+            {
+            const unsigned uFirst = SortOrder[k];
+            const unsigned uSecond = SortOrder[k+1];
+            if (fcCounts[uFirst] < fcCounts[uSecond])
+                {
+                SortOrder[k] = uSecond;
+                SortOrder[k+1] = uFirst;
+                bSwapped = true;
+                }
+            }
+        }
+    }
+
+// Return the residue group shared by all amino acid letters with a
+// non-zero count, or RESIDUE_GROUP_MULTIPLE if they belong to more than
+// one group or no letter is present.
+static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[])
+    {
+    unsigned uGroup = RESIDUE_GROUP_MULTIPLE;
+    bool bSeen = false;
+    for (unsigned i = 0; i < 20; ++i)
+        {
+        if (0 != fcCounts[i])
+            {
+            const unsigned uThisGroup = ResidueGroup[i];
+            if (!bSeen)
+                {
+                bSeen = true;
+                uGroup = uThisGroup;
+                }
+            else if (uThisGroup != uGroup)
+                return RESIDUE_GROUP_MULTIPLE;
+            }
+        }
+    return uGroup;
+    }
+
+// Nucleotide version: each of the four letters is its own group, so
+// the result is the single letter present, or RESIDUE_GROUP_MULTIPLE
+// if more than one letter (or none) has a non-zero count.
+static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[])
+    {
+    unsigned uGroup = RESIDUE_GROUP_MULTIPLE;
+    bool bSeen = false;
+    for (unsigned i = 0; i < 4; ++i)
+        {
+        if (0 != fcCounts[i])
+            {
+            const unsigned uThisGroup = i;
+            if (!bSeen)
+                {
+                bSeen = true;
+                uGroup = uThisGroup;
+                }
+            else if (uThisGroup != uGroup)
+                return RESIDUE_GROUP_MULTIPLE;
+            }
+        }
+    return uGroup;
+    }
+
+// Dispatch to the amino acid or nucleotide group finder according to
+// the current alphabet. Quits for any other alphabet.
+unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[])
+    {
+    if (ALPHA_Amino == g_Alpha)
+        return AminoGroupFromFCounts(fcCounts);
+
+    if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha)
+        return NucleoGroupFromFCounts(fcCounts);
+
+    Quit("ResidueGroupFromFCounts: bad alpha");
+    return 0;
+    }
+
+// Build a profile (one ProfPos per column) from an alignment.
+// Sequence weights of the alignment are (re)computed as a side effect.
+// The returned array has GetColCount() elements, is allocated with
+// new[] and must be released by the caller with delete[].
+ProfPos *ProfileFromMSA(const MSA &a)
+    {
+    const unsigned uColCount = a.GetColCount();
+
+// Yuck -- cast away const (inconsistent design here).
+    SetMSAWeightsMuscle((MSA &) a);
+
+    ProfPos *Pos = new ProfPos[uColCount];
+
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        ProfPos &PP = Pos[uColIndex];
+
+        PP.m_bAllGaps = a.IsGapColumn(uColIndex);
+
+        FCOUNT fcGapStart;
+        FCOUNT fcGapEnd;
+        FCOUNT fcGapExtend;
+        FCOUNT fOcc;
+        a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts,
+          &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc,
+          &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG);
+        PP.m_fOcc = fOcc;
+
+        SortCounts(PP.m_fcCounts, PP.m_uSortOrder);
+
+        PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts);
+
+    // Expected substitution score of each letter against this column.
+        for (unsigned i = 0; i < g_AlphaSize; ++i)
+            {
+            SCORE scoreSum = 0;
+            for (unsigned j = 0; j < g_AlphaSize; ++j)
+                scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j];
+            PP.m_AAScores[i] = scoreSum;
+            }
+
+        SCORE sStartOcc = (SCORE) (1.0 - fcGapStart);
+        SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd);
+
+        PP.m_fcStartOcc = sStartOcc;
+        PP.m_fcEndOcc = sEndOcc;
+
+    // The gap-open score is split in half between opening and closing.
+        PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2;
+        PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2;
+#if DOUBLE_AFFINE
+        PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2;
+        PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2;
+#endif
+//      PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend);
+
+#if PAF
+    // Was misspelt ALHPA_Amino, which cannot compile when PAF is on.
+        if (ALPHA_Amino == g_Alpha && sStartOcc > 0.5)
+            {
+            extern SCORE PAFactor(const FCOUNT fcCounts[]);
+            SCORE paf = PAFactor(PP.m_fcCounts);
+            PP.m_scoreGapOpen *= paf;
+            PP.m_scoreGapClose *= paf;
+            }
+#endif
+        }
+
+#if HYDRO
+    if (ALPHA_Amino == g_Alpha)
+        Hydro(Pos, uColCount);
+#endif
+
+#if TRACE
+    {
+    Log("ProfileFromMSA\n");
+    ListProfile(Pos, uColCount, &a);
+    }
+#endif
+    return Pos;
+    }
Added: trunk/packages/muscle/branches/upstream/current/progalign.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/progalign.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/progalign.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,206 @@
+#include "muscle.h"
+#include "tree.h"
+#include "seqvect.h"
+#include "profile.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "distfunc.h"
+#include "textfile.h"
+#include "estring.h"
+
+#define TRACE 0
+#define VALIDATE 0
+#define TRACE_LENGTH_DELTA 0
+
+// Write the names of all leaves found by GetLeaves() for node uNodeIndex
+// to the log as a comma-separated list (no trailing newline).
+static void LogLeafNames(const Tree &tree, unsigned uNodeIndex)
+    {
+    // Scratch buffer with one slot per tree node.
+    unsigned *LeafIndexes = new unsigned[tree.GetNodeCount()];
+    unsigned uLeafTotal;
+    GetLeaves(tree, uNodeIndex, LeafIndexes, &uLeafTotal);
+
+    unsigned n = 0;
+    while (n < uLeafTotal)
+        {
+        if (0 != n)
+            Log(",");
+        Log("%s", tree.GetLeafName(LeafIndexes[n]));
+        ++n;
+        }
+
+    delete[] LeafIndexes;
+    }
+
+// Progressive alignment of the sequences in v, visiting GuideTree in the
+// order given by FirstDepthFirstNode()/NextDepthFirstNode().
+//
+// Leaves get a one-sequence MSA, a profile and their sequence weight.
+// Internal nodes are built by aligning the profiles of their two children
+// (AlignTwoProfs); the resulting path is converted to a pair of edit
+// strings. The final alignment is written to `a` by MakeRootMSA() or
+// MakeRootMSABrenner().
+//
+// Returns the ProgNode array allocated here with new[]; it is not freed
+// in this function, so the caller is responsible for it.
+ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a)
+ {
+ assert(GuideTree.IsRooted());
+
+#if TRACE
+ Log("GuideTree:\n");
+ GuideTree.LogMe();
+#endif
+
+ const unsigned uSeqCount = v.Length();
+ // A rooted binary tree with uSeqCount leaves has 2*uSeqCount - 1 nodes.
+ const unsigned uNodeCount = 2*uSeqCount - 1;
+ // NOTE(review): uIterCount is not referenced below; the Progress() call
+ // recomputes uSeqCount - 1 instead.
+ const unsigned uIterCount = uSeqCount - 1;
+
+ WEIGHT *Weights = new WEIGHT[uSeqCount];
+ CalcClustalWWeights(GuideTree, Weights);
+
+ ProgNode *ProgNodes = new ProgNode[uNodeCount];
+
+ unsigned uJoin = 0;
+ unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
+ SetProgressDesc("Align node");
+ do
+ {
+ if (GuideTree.IsLeaf(uTreeNodeIndex))
+ {
+ // Leaf: wrap the single sequence in an MSA and build its profile.
+ if (uTreeNodeIndex >= uNodeCount)
+ Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
+ ProgNode &Node = ProgNodes[uTreeNodeIndex];
+ unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
+ if (uId >= uSeqCount)
+ Quit("Seq index out of range");
+ const Seq &s = *(v[uId]);
+ Node.m_MSA.FromSeq(s);
+ Node.m_MSA.SetSeqId(0, uId);
+ Node.m_uLength = Node.m_MSA.GetColCount();
+ Node.m_Weight = Weights[uId];
+ // TODO: Term gaps settable
+ Node.m_Prof = ProfileFromMSA(Node.m_MSA);
+ Node.m_EstringL = 0;
+ Node.m_EstringR = 0;
+#if TRACE
+ Log("Leaf id=%u\n", uId);
+ Log("MSA=\n");
+ Node.m_MSA.LogMe();
+ Log("Profile (from MSA)=\n");
+ ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA);
+#endif
+ }
+ else
+ {
+ // Internal node: align the profiles of the two children.
+ Progress(uJoin, uSeqCount - 1);
+ ++uJoin;
+
+ const unsigned uMergeNodeIndex = uTreeNodeIndex;
+ ProgNode &Parent = ProgNodes[uMergeNodeIndex];
+
+ const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
+ const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
+
+ if (g_bVerbose)
+ {
+ Log("Align: (");
+ LogLeafNames(GuideTree, uLeft);
+ Log(") (");
+ LogLeafNames(GuideTree, uRight);
+ Log(")\n");
+ }
+
+ ProgNode &Node1 = ProgNodes[uLeft];
+ ProgNode &Node2 = ProgNodes[uRight];
+
+#if TRACE
+ Log("AlignTwoMSAs:\n");
+#endif
+ AlignTwoProfs(
+ Node1.m_Prof, Node1.m_uLength, Node1.m_Weight,
+ Node2.m_Prof, Node2.m_uLength, Node2.m_Weight,
+ Parent.m_Path,
+ &Parent.m_Prof, &Parent.m_uLength);
+#if TRACE_LENGTH_DELTA
+ {
+ unsigned L = Node1.m_uLength;
+ unsigned R = Node2.m_uLength;
+ unsigned P = Parent.m_Path.GetEdgeCount();
+ unsigned Max = L > R ? L : R;
+ unsigned d = P - Max;
+ Log("LD%u;%u;%u;%u\n", L, R, P, d);
+ }
+#endif
+ PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR);
+
+ Parent.m_Weight = Node1.m_Weight + Node2.m_Weight;
+
+#if VALIDATE
+ // Cross-check: redo the merge on the MSAs themselves and compare the
+ // resulting profiles and path with the profile-based result above.
+ {
+#if TRACE
+ Log("AlignTwoMSAs:\n");
+#endif
+ PWPath TmpPath;
+ AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath);
+ ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true);
+ ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true);
+ unsigned uLength = Parent.m_MSA.GetColCount();
+ ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true);
+
+#if TRACE
+ Log("Node1 MSA=\n");
+ Node1.m_MSA.LogMe();
+
+ Log("Node1 prof=\n");
+ ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA);
+ Log("Node1 prof (from MSA)=\n");
+ ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA);
+
+ AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount());
+
+ Log("Node2 prof=\n");
+ ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA);
+
+ Log("Node2 MSA=\n");
+ Node2.m_MSA.LogMe();
+
+ Log("Node2 prof (from MSA)=\n");
+ ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA);
+
+ AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount());
+
+ TmpPath.AssertEqual(Parent.m_Path);
+
+ Log("Parent MSA=\n");
+ Parent.m_MSA.LogMe();
+
+ Log("Parent prof=\n");
+ ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA);
+
+ Log("Parent prof (from MSA)=\n");
+ ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA);
+
+#endif // TRACE
+ AssertProfsEq(Parent.m_Prof, Parent.m_uLength,
+ TmpProf, Parent.m_MSA.GetColCount());
+ delete[] P1;
+ delete[] P2;
+ delete[] TmpProf;
+ }
+#endif // VALIDATE
+
+ // The children's MSAs are no longer needed once the parent is built.
+ Node1.m_MSA.Clear();
+ Node2.m_MSA.Clear();
+
+ // Don't delete profiles, may need them for tree refinement.
+ //delete[] Node1.m_Prof;
+ //delete[] Node2.m_Prof;
+ //Node1.m_Prof = 0;
+ //Node2.m_Prof = 0;
+ }
+ uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
+ }
+ while (NULL_NEIGHBOR != uTreeNodeIndex);
+ ProgressStepsDone();
+
+ // Build the final alignment in `a` from the per-node results.
+ if (g_bBrenner)
+ MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a);
+ else
+ MakeRootMSA(v, GuideTree, ProgNodes, a);
+
+#if VALIDATE
+ {
+ unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
+ const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
+ AssertMSAEq(a, RootProgNode.m_MSA);
+ }
+#endif
+
+ delete[] Weights;
+ return ProgNodes;
+ }
Added: trunk/packages/muscle/branches/upstream/current/progress.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/progress.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/progress.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,186 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <time.h>
+
+// Functions that provide visible feedback to the user
+// that progress is being made.
+
+static unsigned g_uIter = 0; // Main MUSCLE iteration 1, 2..
+static unsigned g_uLocalMaxIters = 0; // Max iters
+static FILE *g_fProgress = stderr; // Default to standard error
+static char g_strFileName[32]; // File name
+static time_t g_tLocalStart; // Start time
+static char g_strDesc[32]; // Description
+static bool g_bWipeDesc = false; // Set by ProgressStepsDone(); makes the next Progress(step, total) pad over a longer previous description
+static int g_nPrevDescLength; // strlen(g_strDesc) recorded by ProgressStepsDone()
+static unsigned g_uTotalSteps; // uTotalSteps of the most recent Progress(step, total) call
+
+// Return the current memory use, truncated to whole MB.
+// If a limit is set (g_uMaxMB != 0) and it is exceeded, report the problem
+// on stderr, save the current alignment and terminate the process.
+double GetCheckMemUseMB()
+    {
+    const unsigned uUsedMB = (unsigned) GetMemUseMB();
+    const bool bOverLimit = (0 != g_uMaxMB && uUsedMB > g_uMaxMB);
+    if (bOverLimit)
+        {
+        fprintf(stderr, "\n\n*** MAX MEMORY %u MB EXCEEDED***\n", g_uMaxMB);
+        fprintf(stderr, "Memory allocated so far %u MB, physical RAM %u MB\n",
+          uUsedMB, (unsigned) GetRAMSizeMB());
+        fprintf(stderr, "Use -maxmb <n> option to increase limit, where <n> is in MB.\n");
+        SaveCurrentAlignment();
+        exit(EXIT_FatalError);
+        }
+    return uUsedMB;
+    }
+
+// Format the number of seconds elapsed since SetStartTime() using SecsToStr().
+const char *ElapsedTimeAsStr()
+    {
+    const time_t tNow = time(0);
+    return SecsToStr((unsigned long) (tNow - g_tLocalStart));
+    }
+
+// Format peak memory use as "<MB> MB(<pct>%)", where the percentage is
+// relative to the RAM size reported by GetRAMSizeMB() and capped at 100.
+// The peak is tracked across calls: the largest MB value seen so far is
+// what gets printed. A negative MB yields the empty string.
+// Returns a pointer to a static buffer that is overwritten by the next call.
+const char *MemToStr(double MB)
+    {
+    if (MB < 0)
+        return "";
+
+    // The previous 9-byte buffer only had room for 8 characters, so any
+    // value of 10 MB or more ("10 MB(1%)") overflowed it. Use a buffer
+    // with ample room and a bounded formatter.
+    static char Str[32];
+    static double MaxMB = 0;
+    static double RAMMB = 0;
+
+    if (RAMMB == 0)
+        RAMMB = GetRAMSizeMB();
+
+    if (MB > MaxMB)
+        MaxMB = MB;
+    double Pct = (MaxMB*100.0)/RAMMB;
+    if (Pct > 100)
+        Pct = 100;
+    snprintf(Str, sizeof(Str), "%.0f MB(%.0f%%)", MaxMB, Pct);
+    return Str;
+    }
+
+// Record the input file name for progress output: NameFromPath() is asked
+// to fill g_strFileName (at most sizeof(g_strFileName) bytes) from the path.
+void SetInputFileName(const char *pstrFileName)
+ {
+ NameFromPath(pstrFileName, g_strFileName, sizeof(g_strFileName));
+ }
+
+// Report sequence statistics on the progress stream and, when verbose,
+// in the log as well. Does nothing in quiet mode.
+void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL)
+    {
+    if (!g_bQuiet)
+        {
+        fprintf(g_fProgress, "%s %u seqs, max length %u, avg length %u\n",
+          g_strFileName, uSeqCount, uMaxL, uAvgL);
+        if (g_bVerbose)
+            {
+            Log("%u seqs, max length %u, avg length %u\n",
+              uSeqCount, uMaxL, uAvgL);
+            }
+        }
+    }
+
+// Remember the current wall-clock time as the start of the run.
+void SetStartTime()
+    {
+    g_tLocalStart = time(0);
+    }
+
+// Return the start time recorded in g_tLocalStart, cast to unsigned long.
+unsigned long GetStartTime()
+ {
+ return (unsigned long) g_tLocalStart;
+ }
+
+// Set the iteration number shown in progress output.
+void SetIter(unsigned uIter)
+ {
+ g_uIter = uIter;
+ }
+
+// Advance the iteration number shown in progress output by one.
+void IncIter()
+ {
+ ++g_uIter;
+ }
+
+// Store the maximum iteration count in g_uLocalMaxIters.
+void SetMaxIters(unsigned uMaxIters)
+ {
+ g_uLocalMaxIters = uMaxIters;
+ }
+
+// Set the description shown after the percentage in progress lines.
+// Descriptions longer than the buffer are truncated; the result is
+// always NUL-terminated.
+void SetProgressDesc(const char szDesc[])
+    {
+    const unsigned uLast = sizeof(g_strDesc) - 1;
+    strncpy(g_strDesc, szDesc, uLast);
+    g_strDesc[uLast] = 0;
+    }
+
+// Write n blanks to the progress stream; nothing is written when n <= 0.
+static void Wipe(int n)
+    {
+    for (; n > 0; --n)
+        fprintf(g_fProgress, " ");
+    }
+
+// Print a free-form, printf-style progress message on its own line,
+// prefixed with elapsed time and peak memory use.
+// CheckMaxTime() is always called; nothing is printed in quiet mode.
+// Messages longer than the internal 4096-byte buffer are truncated.
+void Progress(const char *szFormat, ...)
+    {
+    CheckMaxTime();
+
+    if (g_bQuiet)
+        return;
+
+    double MB = GetCheckMemUseMB();
+
+    char szStr[4096];
+    va_list ArgList;
+    va_start(ArgList, szFormat);
+    // Bounded formatting: the unbounded vsprintf used previously could
+    // write past the end of szStr for long messages.
+    vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
+    va_end(ArgList);
+
+    fprintf(g_fProgress, "\n%8.8s %12s %s",
+      ElapsedTimeAsStr(),
+      MemToStr(MB),
+      szStr);
+
+    fprintf(g_fProgress, "\n");
+    fflush(g_fProgress);
+    }
+
+// Print (and overwrite in place, via '\r') a progress line showing elapsed
+// time, peak memory, iteration number, percentage done and the current
+// description. uStep is zero-based, so step uStep of uTotalSteps is shown
+// as (uStep + 1)/uTotalSteps. Remembers uTotalSteps for ProgressStepsDone().
+// CheckMaxTime() is always called; nothing is printed in quiet mode.
+void Progress(unsigned uStep, unsigned uTotalSteps)
+    {
+    CheckMaxTime();
+
+    if (g_bQuiet)
+        return;
+
+    // With zero total steps there is nothing left to do; report 100%
+    // instead of dividing by zero and printing "nan"/"inf". This happens
+    // when ProgressStepsDone() runs before any step has been reported.
+    double dPct = 100.0;
+    if (0 != uTotalSteps)
+        dPct = ((uStep + 1)*100.0)/uTotalSteps;
+    double MB = GetCheckMemUseMB();
+    fprintf(g_fProgress, "%8.8s %12s Iter %3u %6.2f%% %s",
+      ElapsedTimeAsStr(),
+      MemToStr(MB),
+      g_uIter,
+      dPct,
+      g_strDesc);
+
+    if (g_bWipeDesc)
+        {
+        // Blank out the tail of a longer description left on this line.
+        int n = g_nPrevDescLength - (int) strlen(g_strDesc);
+        Wipe(n);
+        g_bWipeDesc = false;
+        }
+
+    fprintf(g_fProgress, "\r");
+
+    g_uTotalSteps = uTotalSteps;
+    }
+
+// Finish a sequence of Progress(step, total) calls: log a summary when
+// verbose, then (unless quiet) print the final progress line, end it with
+// a newline and arrange for the next description to wipe this one.
+void ProgressStepsDone()
+    {
+    CheckMaxTime();
+
+    if (g_bVerbose)
+        {
+        const double dMemMB = GetCheckMemUseMB();
+        Log("Elapsed time %8.8s Peak memory use %12s Iteration %3u %s\n",
+          ElapsedTimeAsStr(),
+          MemToStr(dMemMB),
+          g_uIter,
+          g_strDesc);
+        }
+
+    if (!g_bQuiet)
+        {
+        Progress(g_uTotalSteps - 1, g_uTotalSteps);
+        fprintf(g_fProgress, "\n");
+        g_nPrevDescLength = (int) strlen(g_strDesc);
+        g_bWipeDesc = true;
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/progressivealign.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/progressivealign.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/progressivealign.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,76 @@
+#include "muscle.h"
+#include <math.h>
+#include "tree.h"
+#include "seqvect.h"
+#include "profile.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "distfunc.h"
+
+#define TRACE 0
+
+// Progressive alignment of the sequences in v, visiting GuideTree in the
+// order given by FirstDepthFirstNode()/NextDepthFirstNode(). Each leaf
+// becomes a one-sequence MSA; each internal node is the AlignTwoMSAs()
+// result of its two children. The root node's MSA is copied to `a`, and
+// all per-node working storage is released before returning.
+void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a)
+ {
+ assert(GuideTree.IsRooted());
+
+#if TRACE
+ Log("GuideTree:\n");
+ GuideTree.LogMe();
+#endif
+
+ const unsigned uSeqCount = v.Length();
+ // A rooted binary tree with uSeqCount leaves has 2*uSeqCount - 1 nodes.
+ const unsigned uNodeCount = 2*uSeqCount - 1;
+
+ ProgNode *ProgNodes = new ProgNode[uNodeCount];
+
+ unsigned uJoin = 0;
+ unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
+ SetProgressDesc("Align node");
+ do
+ {
+ if (GuideTree.IsLeaf(uTreeNodeIndex))
+ {
+ // Leaf: wrap the single sequence in an MSA.
+ if (uTreeNodeIndex >= uNodeCount)
+ Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
+ ProgNode &Node = ProgNodes[uTreeNodeIndex];
+ unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
+ if (uId >= uSeqCount)
+ Quit("Seq index out of range");
+ const Seq &s = *(v[uId]);
+ Node.m_MSA.FromSeq(s);
+ Node.m_MSA.SetSeqId(0, uId);
+ Node.m_uLength = Node.m_MSA.GetColCount();
+ }
+ else
+ {
+ // Internal node: align the two child MSAs into the parent.
+ Progress(uJoin, uSeqCount - 1);
+ ++uJoin;
+
+ const unsigned uMergeNodeIndex = uTreeNodeIndex;
+ ProgNode &Parent = ProgNodes[uMergeNodeIndex];
+
+ const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
+ const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
+
+ ProgNode &Node1 = ProgNodes[uLeft];
+ ProgNode &Node2 = ProgNodes[uRight];
+
+ PWPath Path;
+ AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
+ Parent.m_uLength = Parent.m_MSA.GetColCount();
+
+ // The children's MSAs are no longer needed once the parent is built.
+ Node1.m_MSA.Clear();
+ Node2.m_MSA.Clear();
+ }
+ uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
+ }
+ while (NULL_NEIGHBOR != uTreeNodeIndex);
+ ProgressStepsDone();
+
+ // The root node now holds the complete alignment.
+ unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
+ const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
+ a.Copy(RootProgNode.m_MSA);
+
+ delete[] ProgNodes;
+ ProgNodes = 0;
+ }
Added: trunk/packages/muscle/branches/upstream/current/pwpath.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/pwpath.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/pwpath.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,386 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "seq.h"
+#include "textfile.h"
+#include "msa.h"
+
+// Construct an empty path that owns no edge storage.
+PWPath::PWPath() :
+    m_uEdgeCount(0),
+    m_uArraySize(0),
+    m_Edges(0)
+    {
+    }
+
+// Destructor: releases the edge array via Clear().
+PWPath::~PWPath()
+ {
+ Clear();
+ }
+
+// Discard all edges and free the edge array, returning the path to its
+// freshly-constructed state.
+void PWPath::Clear()
+    {
+    m_uEdgeCount = 0;
+    m_uArraySize = 0;
+    delete[] m_Edges;
+    m_Edges = 0;
+    }
+
+// Grow the edge array by uAdditionalEdgeCount slots, preserving the
+// edges already stored.
+void PWPath::ExpandPath(unsigned uAdditionalEdgeCount)
+    {
+    const unsigned uNewSize = m_uArraySize + uAdditionalEdgeCount;
+    PWEdge *NewEdges = new PWEdge[uNewSize];
+
+    if (0 != m_uEdgeCount)
+        memcpy(NewEdges, m_Edges, m_uEdgeCount*sizeof(PWEdge));
+
+    delete[] m_Edges;
+    m_Edges = NewEdges;
+    m_uArraySize = uNewSize;
+    }
+
+// Add Edge at the end of the path, growing the array by 200 slots when
+// it is unallocated or down to its last free slot.
+void PWPath::AppendEdge(const PWEdge &Edge)
+    {
+    const bool bNeedRoom =
+      (0 == m_uArraySize) || (m_uArraySize == m_uEdgeCount + 1);
+    if (bNeedRoom)
+        ExpandPath(200);
+
+    m_Edges[m_uEdgeCount++] = Edge;
+    }
+
+// Convenience overload: build an edge from its three fields and append it.
+void PWPath::AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB)
+    {
+    PWEdge NewEdge;
+    NewEdge.cType = cType;
+    NewEdge.uPrefixLengthA = uPrefixLengthA;
+    NewEdge.uPrefixLengthB = uPrefixLengthB;
+    AppendEdge(NewEdge);
+    }
+
+// Insert Edge at the front of the path, shifting existing edges up by one.
+// Grows the array by 1000 slots when it is unallocated or down to its
+// last free slot.
+void PWPath::PrependEdge(const PWEdge &Edge)
+    {
+    if (0 == m_uArraySize || m_uArraySize == m_uEdgeCount + 1)
+        ExpandPath(1000);
+
+    // Source and destination overlap, hence memmove rather than memcpy.
+    if (0 != m_uEdgeCount)
+        memmove(&m_Edges[1], &m_Edges[0], m_uEdgeCount*sizeof(PWEdge));
+
+    m_Edges[0] = Edge;
+    m_uEdgeCount += 1;
+    }
+
+// Return the edge at uEdgeIndex. The index is range-checked by assert()
+// only; there is no other bounds check in this function.
+const PWEdge &PWPath::GetEdge(unsigned uEdgeIndex) const
+ {
+ assert(uEdgeIndex < m_uEdgeCount);
+ return m_Edges[uEdgeIndex];
+ }
+
+// Check that consecutive edges are consistent: starting from the prefix
+// lengths of the first edge, each later edge must advance A and/or B
+// exactly as its type dictates (M: both, D: A only, I: B only).
+// Calls Quit() on the first inconsistency. An empty path is valid.
+// Edges whose type is not M, D or I are not checked.
+void PWPath::Validate() const
+    {
+    const unsigned uEdgeCount = GetEdgeCount();
+    if (0 == uEdgeCount)
+        return;
+
+    // The first edge supplies the starting prefix lengths; it is not
+    // itself validated.
+    const PWEdge &FirstEdge = GetEdge(0);
+    unsigned uPrefixLengthA = FirstEdge.uPrefixLengthA;
+    unsigned uPrefixLengthB = FirstEdge.uPrefixLengthB;
+    for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+        {
+        const PWEdge &Edge = GetEdge(uEdgeIndex);
+        switch (Edge.cType)
+            {
+        case 'M':
+            if (uPrefixLengthA + 1 != Edge.uPrefixLengthA)
+                Quit("PWPath::Validate MA %u", uPrefixLengthA);
+            if (uPrefixLengthB + 1 != Edge.uPrefixLengthB)
+                Quit("PWPath::Validate MB %u", uPrefixLengthB);
+            ++uPrefixLengthA;
+            ++uPrefixLengthB;
+            break;
+        case 'D':
+            if (uPrefixLengthA + 1 != Edge.uPrefixLengthA)
+                Quit("PWPath::Validate DA %u", uPrefixLengthA);
+            if (uPrefixLengthB != Edge.uPrefixLengthB)
+                Quit("PWPath::Validate DB %u", uPrefixLengthB);
+            ++uPrefixLengthA;
+            break;
+        case 'I':
+            if (uPrefixLengthA != Edge.uPrefixLengthA)
+                Quit("PWPath::Validate IA %u", uPrefixLengthA);
+            if (uPrefixLengthB + 1 != Edge.uPrefixLengthB)
+                Quit("PWPath::Validate IB %u", uPrefixLengthB);
+            ++uPrefixLengthB;
+            break;
+            }
+        }
+    }
+
+// Write the path to the log as space-separated "<type><A>.<B>" items,
+// with a line break after edge indexes that are non-zero multiples of
+// ten and after the last edge.
+void PWPath::LogMe() const
+    {
+    for (unsigned uEdgeIndex = 0; uEdgeIndex < GetEdgeCount(); ++uEdgeIndex)
+        {
+        const PWEdge &Edge = GetEdge(uEdgeIndex);
+        if (uEdgeIndex > 0)
+            Log(" ");
+        // The prefix lengths are unsigned, so they must be printed with
+        // %u (as AssertEqual does), not %d.
+        Log("%c%u.%u",
+          Edge.cType,
+          Edge.uPrefixLengthA,
+          Edge.uPrefixLengthB);
+        if ((uEdgeIndex > 0 && uEdgeIndex%10 == 0) ||
+         uEdgeIndex == GetEdgeCount() - 1)
+            Log("\n");
+        }
+    }
+
+// Replace the contents of this path with a copy of every edge in Path.
+void PWPath::Copy(const PWPath &Path)
+    {
+    Clear();
+    const unsigned uTotal = Path.GetEdgeCount();
+    for (unsigned n = 0; n != uTotal; ++n)
+        AppendEdge(Path.GetEdge(n));
+    }
+
+// Build the path implied by two equal-length alignments: for each column,
+// letters in both give 'M', a letter only in A gives 'D', a letter only
+// in B gives 'I'. Columns that are all-gap in both are skipped.
+// Quits if the two alignments differ in column count.
+void PWPath::FromMSAPair(const MSA &msaA, const MSA &msaB)
+    {
+    const unsigned uColCount = msaA.GetColCount();
+    if (uColCount != msaB.GetColCount())
+        Quit("PWPath::FromMSAPair, lengths differ");
+
+    Clear();
+
+    unsigned uLenA = 0;
+    unsigned uLenB = 0;
+    for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+        {
+        const bool bLetterA = !msaA.IsGapColumn(uCol);
+        const bool bLetterB = !msaB.IsGapColumn(uCol);
+
+        // Gap in both alignments: the column contributes no edge.
+        if (!bLetterA && !bLetterB)
+            continue;
+
+        char cType;
+        if (bLetterA && bLetterB)
+            cType = 'M';
+        else if (bLetterA)
+            cType = 'D';
+        else
+            cType = 'I';
+
+        // Prefix lengths count the letters consumed so far, including
+        // the current column.
+        if (bLetterA)
+            ++uLenA;
+        if (bLetterB)
+            ++uLenB;
+
+        AppendEdge(cType, uLenA, uLenB);
+        }
+    }
+
+// Very similar to HMMPath::FromFile, should consolidate.
+//
+// Read a path in the token format written by ToFile():
+//   Path
+//   edges <count>
+//   <index> <type> <prefixA> <prefixB>     (repeated <count> times)
+//   //
+// Indexes must run 0, 1, 2, ...; the type must be one of M, D, I or S.
+// Any deviation is reported through Quit(). Existing edges are discarded.
+void PWPath::FromFile(TextFile &File)
+    {
+    Clear();
+    char szToken[1024];
+    File.GetTokenX(szToken, sizeof(szToken));
+    if (0 != strcmp(szToken, "Path"))
+        Quit("Invalid path file (Path)");
+
+    File.GetTokenX(szToken, sizeof(szToken));
+    if (0 != strcmp(szToken, "edges"))
+        Quit("Invalid path file (edges)");
+
+    File.GetTokenX(szToken, sizeof(szToken));
+    if (!IsValidInteger(szToken))
+        Quit("Invalid path file (edges value)");
+
+    const unsigned uEdgeCount = (unsigned) atoi(szToken);
+    for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
+        {
+        // index
+        File.GetTokenX(szToken, sizeof(szToken));
+        if (!IsValidInteger(szToken))
+            Quit("Invalid path file, invalid index '%s'", szToken);
+        unsigned n = (unsigned) atoi(szToken);
+        if (n != uEdgeIndex)
+            Quit("Invalid path file, expecting edge %u got %u", uEdgeIndex, n);
+
+        // type
+        File.GetTokenX(szToken, sizeof(szToken));
+        if (1 != strlen(szToken))
+            Quit("Invalid path file, expecting state, got '%s'", szToken);
+        const char cType = szToken[0];
+        if ('M' != cType && 'D' != cType && cType != 'I' && 'S' != cType)
+            Quit("Invalid path file, expecting state, got '%c'", cType);
+
+        // prefix length A
+        File.GetTokenX(szToken, sizeof(szToken));
+        if (!IsValidInteger(szToken))
+            Quit("Invalid path file, bad prefix length A '%s'", szToken);
+        const unsigned uPrefixLengthA = (unsigned) atoi(szToken);
+
+        // prefix length B
+        File.GetTokenX(szToken, sizeof(szToken));
+        if (!IsValidInteger(szToken))
+            Quit("Invalid path file, bad prefix length B '%s'", szToken);
+        const unsigned uPrefixLengthB = (unsigned) atoi(szToken);
+
+        PWEdge Edge;
+        Edge.cType = cType;
+        Edge.uPrefixLengthA = uPrefixLengthA;
+        Edge.uPrefixLengthB = uPrefixLengthB;
+        AppendEdge(Edge);
+        }
+    File.GetTokenX(szToken, sizeof(szToken));
+    if (0 != strcmp(szToken, "//"))
+        Quit("Invalid path file (//)");
+    }
+
+// Write the path as text: a "Path" line, an "edges <count>" line, one
+// "<index> <type> <prefixA> <prefixB>" line per edge, then "//".
+void PWPath::ToFile(TextFile &File) const
+    {
+    const unsigned uTotal = GetEdgeCount();
+
+    File.PutString("Path\n");
+    File.PutFormat("edges %u\n", uTotal);
+
+    unsigned n = 0;
+    while (n < uTotal)
+        {
+        const PWEdge &e = GetEdge(n);
+        File.PutFormat("%u %c %u %u\n",
+          n,
+          e.cType,
+          e.uPrefixLengthA,
+          e.uPrefixLengthB);
+        ++n;
+        }
+
+    File.PutString("//\n");
+    }
+
+// Quit with a diagnostic (after logging both paths) unless Path has the
+// same number of edges as this path and every pair of corresponding
+// edges is identical.
+void PWPath::AssertEqual(const PWPath &Path) const
+    {
+    const unsigned uMyCount = GetEdgeCount();
+    if (Path.GetEdgeCount() != uMyCount)
+        {
+        Log("PWPath::AssertEqual, this=\n");
+        LogMe();
+        Log("\nOther path=\n");
+        Path.LogMe();
+        Log("\n");
+        Quit("PWPath::AssertEqual, Edge count different %u %u\n",
+          uMyCount, Path.GetEdgeCount());
+        }
+
+    for (unsigned n = 0; n < uMyCount; ++n)
+        {
+        const PWEdge &Mine = GetEdge(n);
+        const PWEdge &Theirs = Path.GetEdge(n);
+        if (Mine.Equal(Theirs))
+            continue;
+
+        Log("PWPath::AssertEqual, this=\n");
+        LogMe();
+        Log("\nOther path=\n");
+        Path.LogMe();
+        Log("\n");
+        Log("This edge %c%u.%u, other edge %c%u.%u\n",
+          Mine.cType, Mine.uPrefixLengthA, Mine.uPrefixLengthB,
+          Theirs.cType, Theirs.uPrefixLengthA, Theirs.uPrefixLengthB);
+        Quit("PWPath::AssertEqual, edge %u different\n", n);
+        }
+    }
+
+// True if Path has the same number of edges as this path and every pair
+// of corresponding edges has the same type and prefix lengths.
+bool PWPath::Equal(const PWPath &Path) const
+    {
+    const unsigned uMyCount = GetEdgeCount();
+    if (Path.GetEdgeCount() != uMyCount)
+        return false;
+
+    for (unsigned n = 0; n < uMyCount; ++n)
+        {
+        if (!GetEdge(n).Equal(Path.GetEdge(n)))
+            return false;
+        }
+    return true;
+    }
+
+// Count the edges of type 'M'.
+unsigned PWPath::GetMatchCount() const
+    {
+    unsigned uTally = 0;
+    for (unsigned n = GetEdgeCount(); n > 0; --n)
+        {
+        if (GetEdge(n - 1).cType == 'M')
+            uTally += 1;
+        }
+    return uTally;
+    }
+
+// Count the edges of type 'I'.
+unsigned PWPath::GetInsertCount() const
+    {
+    unsigned uTally = 0;
+    for (unsigned n = GetEdgeCount(); n > 0; --n)
+        {
+        if (GetEdge(n - 1).cType == 'I')
+            uTally += 1;
+        }
+    return uTally;
+    }
+
+// Count the edges of type 'D'.
+unsigned PWPath::GetDeleteCount() const
+    {
+    unsigned uTally = 0;
+    for (unsigned n = GetEdgeCount(); n > 0; --n)
+        {
+        if (GetEdge(n - 1).cType == 'D')
+            uTally += 1;
+        }
+    return uTally;
+    }
+
+// Build a path from a string of edge types, e.g. "DMDMIM". Prefix lengths
+// are accumulated from zero: 'M' advances both A and B, 'D' advances A,
+// 'I' advances B. Any other character is reported through Quit().
+void PWPath::FromStr(const char Str[])
+    {
+    Clear();
+    unsigned uLenA = 0;
+    unsigned uLenB = 0;
+    for (const char *p = Str; 0 != *p; ++p)
+        {
+        const char c = *p;
+        if ('M' == c)
+            {
+            ++uLenA;
+            ++uLenB;
+            }
+        else if ('D' == c)
+            ++uLenA;
+        else if ('I' == c)
+            ++uLenB;
+        else
+            Quit("PWPath::FromStr, invalid state %c", c);
+
+        AppendEdge(c, uLenA, uLenB);
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/pwpath.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/pwpath.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/pwpath.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,100 @@
+#ifndef PWPath_h
+#define PWPath_h
+
+/***
+Each PWEdge in a PWPath specifies a column in a pair-wise (PW) alignment.
+"Path" is by analogy with the path through an HMM.
+Edge types are:
+
+ 'M' LetterA + LetterB
+ 'D' LetterA + GapB
+ 'I' GapA + LetterB
+
+The mnemonic is Match, Delete, Insert (with respect to A).
+Here is a global alignment of sequences A and B.
+
+ A: AMQT-F
+ B: -M-TIF
+
+The path for this example is:
+
+ Edge cType uPrefixLengthA uPrefixLengthB
+ 0 D 1 0
+ 1 M 2 1
+ 2 D 3 1
+ 3 M 4 2
+ 4 I 4 3
+ 5 M 5 4
+
+Given the starting positions in each alignment (e.g., column zero for
+a global alignment), the prefix length fields are redundant; they are
+included only for convenience and as a sanity check, we are not trying
+to optimize for speed or space here. We use prefix lengths rather than
+column indexes because of the problem of representing the special case
+of a gap in the first position.
+***/
+
+class Seq;
+class MSA;
+class SatchmoParams;
+class PW;
+class TextFile;
+class PWScore;
+
+// One column of a pair-wise alignment: its type ('M', 'D' or 'I') and the
+// number of letters of A and of B consumed up to and including this column.
+class PWEdge
+    {
+public:
+    char cType;
+    unsigned uPrefixLengthA;
+    unsigned uPrefixLengthB;
+
+    // True if all three fields of e match this edge.
+    bool Equal(const PWEdge &e) const
+        {
+        if (uPrefixLengthA != e.uPrefixLengthA)
+            return false;
+        if (uPrefixLengthB != e.uPrefixLengthB)
+            return false;
+        return cType == e.cType;
+        }
+    };
+
+// A pair-wise alignment path: a growable array of PWEdge, one per
+// alignment column. Copying is only possible through Copy(); the copy
+// constructor and assignment operator are declared private.
+class PWPath
+ {
+// Disable compiler defaults
+private:
+ PWPath &operator=(const PWPath &rhs);
+ PWPath(const PWPath &rhs);
+
+public:
+ PWPath();
+ virtual ~PWPath();
+
+public:
+ // Discard all edges and free the edge array.
+ void Clear();
+ // Build from a string of edge types ('M', 'D', 'I').
+ void FromStr(const char Str[]);
+ // Replace contents with a copy of Path.
+ void Copy(const PWPath &Path);
+ void AppendEdge(const PWEdge &Edge);
+ void AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB);
+ void PrependEdge(const PWEdge &Edge);
+ unsigned GetEdgeCount() const { return m_uEdgeCount; }
+ const PWEdge &GetEdge(unsigned uEdgeIndex) const;
+ void Validate(const PWScore &PWS) const;
+ // Check that consecutive edges have consistent prefix lengths.
+ void Validate() const;
+ void LogMe() const;
+ // Text serialization; FromFile() reads what ToFile() writes.
+ void FromFile(TextFile &File);
+ void ToFile(TextFile &File) const;
+ // Derive the path from the gap columns of two equal-length alignments.
+ void FromMSAPair(const MSA &msaA, const MSA &msaB);
+ // Quit with a diagnostic if Path differs from this path.
+ void AssertEqual(const PWPath &Path) const;
+ bool Equal(const PWPath &Path) const;
+ // Number of edges of type 'M', 'D' and 'I' respectively.
+ unsigned GetMatchCount() const;
+ unsigned GetDeleteCount() const;
+ unsigned GetInsertCount() const;
+
+private:
+ // Grow the edge array by the given number of slots.
+ void ExpandPath(unsigned uAdditionalEdgeCount);
+
+private:
+ unsigned m_uEdgeCount; // Number of edges currently stored
+ unsigned m_uArraySize; // Allocated capacity of m_Edges, in edges
+ PWEdge *m_Edges; // Edge array (0 when nothing is allocated)
+ };
+
+#endif // PWPath_h
Added: trunk/packages/muscle/branches/upstream/current/readmx.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/readmx.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/readmx.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,156 @@
+#include "muscle.h"
+#include "textfile.h"
+
+#define TRACE 0
+
+const int MAX_LINE = 4096; // Size of the line buffer used by ReadMx
+const int MAX_HEADINGS = 32; // Capacity of Heading[]
+static char Heading[MAX_HEADINGS]; // Column header characters, in file order
+static unsigned HeadingCount = 0; // Number of entries of Heading[] in use
+static float Mx[32][32]; // Matrix filled in by ReadMx, which returns its address
+
+// Dump the first 20x20 entries of Mx to the log, with a header row and a
+// leading letter on each row (letters obtained from LetterToChar).
+static void LogMx()
+    {
+    Log("Matrix\n");
+    Log(" ");
+    for (int Col = 0; Col < 20; ++Col)
+        Log(" %c", LetterToChar(Col));
+    Log("\n");
+
+    for (int Row = 0; Row < 20; ++Row)
+        {
+        Log("%c ", LetterToChar(Row));
+        for (int Col = 0; Col < 20; ++Col)
+            Log("%5.1f", Mx[Row][Col]);
+        Log("\n");
+        }
+    Log("\n");
+    }
+
+// Return the position of c among the column headings read so far;
+// Quit if c is not one of them.
+static unsigned MxCharToLetter(char c)
+    {
+    unsigned uPos = 0;
+    while (uPos < HeadingCount)
+        {
+        if (c == Heading[uPos])
+            return uPos;
+        ++uPos;
+        }
+    Quit("Letter '%c' has no heading", c);
+    return 0;
+    }
+
+// Read a substitution matrix from a text file into the static array Mx
+// and return its address.
+//
+// Expected layout: optional '#' comment lines, then a header line that
+// starts with a blank and lists the column symbols, then one data line
+// per row consisting of the row symbol followed by whitespace-separated
+// values. A trailing '*' heading is dropped. Only cells whose row and
+// column symbols satisfy IsResidueChar() are stored. Malformed input is
+// reported through Quit(); an asymmetric matrix only produces a warning.
+//
+// NOTE(review): the row loop counts skipped lines (comments, non-residue
+// rows) against HeadingCount, and the column loop stops at
+// HeadingCount - 1; both are kept as they were -- confirm that this is
+// intended for files without a trailing '*' column.
+PTR_SCOREMATRIX ReadMx(TextFile &File)
+    {
+// Find column headers
+    char Line[MAX_LINE];
+    for (;;)
+        {
+        bool EndOfFile = File.GetLine(Line, sizeof(Line));
+        if (EndOfFile)
+            Quit("Premature EOF in matrix file");
+
+        if (Line[0] == '#')
+            continue;
+        else if (Line[0] == ' ')
+            break;
+        else
+            Quit("Invalid line in matrix file: '%s'", Line);
+        }
+
+// Read column headers
+    HeadingCount = 0;
+    for (char *p = Line; *p; ++p)
+        {
+        char c = *p;
+        if (isspace((unsigned char) c))
+            continue;
+        // Heading[] has room for MAX_HEADINGS symbols only; never write
+        // past its end, even for a malformed header line.
+        if (HeadingCount >= (unsigned) MAX_HEADINGS)
+            Quit("Error in matrix file: more than %d headers, line='%s'",
+              MAX_HEADINGS, Line);
+        else
+            Heading[HeadingCount++] = c;
+        }
+
+    if (HeadingCount > 0 && Heading[HeadingCount-1] == '*')
+        --HeadingCount;
+
+    if (HeadingCount < 20)
+        Quit("Error in matrix file: < 20 headers, line='%s'", Line);
+
+#if TRACE
+    {
+    Log("ReadMx\n");
+    Log("%d headings: ", HeadingCount);
+    for (unsigned i = 0; i < HeadingCount; ++i)
+        Log("%c", Heading[i]);
+    Log("\n");
+    }
+#endif
+
+// Zero out matrix
+    for (int i = 0; i < MAX_ALPHA; ++i)
+        for (int j = 0; j < MAX_ALPHA; ++j)
+            Mx[i][j] = 0.0;
+
+// Read data lines
+    for (unsigned RowIndex = 0; RowIndex < HeadingCount; ++RowIndex)
+        {
+        bool EndOfFile = File.GetTrimLine(Line, sizeof(Line));
+        if (EndOfFile)
+            Quit("Premature EOF in matrix file");
+
+#if TRACE
+        Log("Line=%s\n", Line);
+#endif
+        if (Line[0] == '#')
+            continue;
+
+        char c = Line[0];
+#if TRACE
+        Log("Row char=%c\n", c);
+#endif
+        if (!IsResidueChar(c))
+            continue;
+
+        unsigned RowLetter = CharToLetter(c);
+#if TRACE
+        Log("Row letter = %u\n", RowLetter);
+#endif
+
+        char *p = Line + 1;
+        // maxp addresses the terminating NUL; scanning never goes past it.
+        char *maxp = Line + strlen(Line);
+        for (unsigned Col = 0; Col < HeadingCount - 1; ++Col)
+            {
+            while (p < maxp && isspace((unsigned char) *p))
+                ++p;
+            if (p >= maxp)
+                Quit("Too few fields in line of matrix file: '%s'", Line);
+            char *Value = p;
+            // Stop at whitespace or at the end of the line, whichever
+            // comes first (NUL is not whitespace).
+            while (p < maxp && !isspace((unsigned char) *p))
+                ++p;
+            float v = (float) atof(Value);
+            char HeaderChar = Heading[Col];
+            if (IsResidueChar(HeaderChar))
+                {
+                unsigned ColLetter = CharToLetter(HeaderChar);
+                Mx[RowLetter][ColLetter] = v;
+                }
+            if (p < maxp)
+                ++p;
+            }
+        }
+
+// Sanity check for symmetry
+    for (int i = 0; i < 20; ++i)
+        for (int j = 0; j < i; ++j)
+            {
+            if (Mx[i][j] != Mx[j][i])
+                {
+                // i and j are letter indexes; LetterToChar turns them
+                // into the characters that %c expects.
+                Warning("Matrix is not symmetrical, %c->%c=%g, %c->%c=%g",
+                  LetterToChar(i),
+                  LetterToChar(j),
+                  Mx[i][j],
+                  LetterToChar(j),
+                  LetterToChar(i),
+                  Mx[j][i]);
+                goto ExitLoop;
+                }
+            }
+ExitLoop:;
+
+    if (g_bVerbose)
+        LogMx();
+
+    return &Mx;
+    }
Added: trunk/packages/muscle/branches/upstream/current/realigndiffs.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/realigndiffs.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/realigndiffs.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,115 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "pwpath.h"
+
+#define TRACE 0
+
+// Progressive alignment according to a diffs tree.
+
+// Fill Node.m_MSA with the sequences of msaIn whose entry in
+// IdToDiffsTreeNodeIndex equals uDiffsNodeIndex, then apply
+// DeleteGappedCols() to the result. Quits if no sequence maps to the node.
+static void MakeNode(const MSA &msaIn, const Tree &Diffs, unsigned uDiffsNodeIndex,
+  const unsigned IdToDiffsTreeNodeIndex[], ProgNode &Node)
+    {
+    const unsigned uSeqTotal = msaIn.GetSeqCount();
+    unsigned *SelectedIds = new unsigned[uSeqTotal];
+
+    // Collect the ids that belong to this node of the diffs tree.
+    unsigned uSelected = 0;
+    for (unsigned uId = 0; uId < uSeqTotal; ++uId)
+        {
+        if (uDiffsNodeIndex != IdToDiffsTreeNodeIndex[uId])
+            continue;
+        SelectedIds[uSelected++] = uId;
+        }
+    if (0 == uSelected)
+        Quit("MakeNode: no seqs in diff");
+
+    MSASubsetByIds(msaIn, SelectedIds, uSelected, Node.m_MSA);
+
+#if DEBUG
+    ValidateMuscleIds(Node.m_MSA);
+#endif
+
+    DeleteGappedCols(Node.m_MSA);
+    delete[] SelectedIds;
+    }
+
+// Progressive alignment according to the Diffs tree. Each leaf's MSA is
+// built by MakeNode() from the sequences of msaIn that
+// IdToDiffsTreeNodeIndex maps to that leaf; each internal node is the
+// AlignTwoMSAs() result of its two children. The root node's MSA is
+// copied to msaOut. Quits if the tree has an even number of nodes.
+void RealignDiffs(const MSA &msaIn, const Tree &Diffs,
+ const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut)
+ {
+ assert(Diffs.IsRooted());
+
+#if TRACE
+ Log("RealignDiffs\n");
+ Log("Diff tree:\n");
+ Diffs.LogMe();
+#endif
+
+ const unsigned uNodeCount = Diffs.GetNodeCount();
+ if (uNodeCount%2 == 0)
+ Quit("RealignDiffs: Expected odd number of nodes");
+
+ // Number of internal (merge) nodes; used as the progress total.
+ const unsigned uMergeCount = (uNodeCount - 1)/2;
+
+ ProgNode *ProgNodes = new ProgNode[uNodeCount];
+
+ unsigned uJoin = 0;
+ SetProgressDesc("Refine tree");
+ for (unsigned uDiffsNodeIndex = Diffs.FirstDepthFirstNode();
+ NULL_NEIGHBOR != uDiffsNodeIndex;
+ uDiffsNodeIndex = Diffs.NextDepthFirstNode(uDiffsNodeIndex))
+ {
+ if (Diffs.IsLeaf(uDiffsNodeIndex))
+ {
+ // Leaf: extract this node's sequences from the input alignment.
+ assert(uDiffsNodeIndex < uNodeCount);
+ if (uDiffsNodeIndex >= uNodeCount)
+ Quit("TreeNodeIndex=%u NodeCount=%u\n", uDiffsNodeIndex, uNodeCount);
+
+ ProgNode &Node = ProgNodes[uDiffsNodeIndex];
+ MakeNode(msaIn, Diffs, uDiffsNodeIndex, IdToDiffsTreeNodeIndex, Node);
+
+ Node.m_uLength = Node.m_MSA.GetColCount();
+ }
+ else
+ {
+ // Internal node: align the two child MSAs into the parent.
+ Progress(uJoin, uMergeCount);
+ ++uJoin;
+ const unsigned uMergeNodeIndex = uDiffsNodeIndex;
+ ProgNode &Parent = ProgNodes[uMergeNodeIndex];
+
+ const unsigned uLeft = Diffs.GetLeft(uDiffsNodeIndex);
+ const unsigned uRight = Diffs.GetRight(uDiffsNodeIndex);
+
+ ProgNode &Node1 = ProgNodes[uLeft];
+ ProgNode &Node2 = ProgNodes[uRight];
+
+ PWPath Path;
+ AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
+
+#if TRACE
+ {
+ Log("Combined:\n");
+ Parent.m_MSA.LogMe();
+ }
+#endif
+
+ // The children's MSAs are no longer needed once the parent is built.
+ Node1.m_MSA.Clear();
+ Node2.m_MSA.Clear();
+ }
+ }
+ ProgressStepsDone();
+
+ // The root node now holds the complete alignment.
+ unsigned uRootNodeIndex = Diffs.GetRootNodeIndex();
+ const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
+ msaOut.Copy(RootProgNode.m_MSA);
+
+#if DEBUG
+ AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
+#endif
+
+ delete[] ProgNodes;
+ ProgNodes = 0;
+ }
Added: trunk/packages/muscle/branches/upstream/current/realigndiffse.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/realigndiffse.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/realigndiffse.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,142 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "pwpath.h"
+#include "seqvect.h"
+#include "estring.h"
+
+#define TRACE 0
+
+// Free the three heap arrays a ProgNode holds (its profile and both edit
+// strings) and null each pointer, so calling this again on the same node
+// is harmless.
+void DeleteProgNode(ProgNode &Node)
+	{
+	delete[] Node.m_Prof;
+	Node.m_Prof = 0;
+
+	delete[] Node.m_EstringL;
+	Node.m_EstringL = 0;
+
+	delete[] Node.m_EstringR;
+	Node.m_EstringR = 0;
+	}
+
+// Move a node's data from OldNode into NewNode.
+// The profile pointer, both edit-string pointers, the length and the weight
+// are copied across; when bSwapLR is true the left and right edit strings
+// trade places. OldNode's three pointers are then nulled, so afterwards
+// only NewNode refers to the arrays.
+static void MakeNode(ProgNode &OldNode, ProgNode &NewNode, bool bSwapLR)
+	{
+	NewNode.m_EstringL = bSwapLR ? OldNode.m_EstringR : OldNode.m_EstringL;
+	NewNode.m_EstringR = bSwapLR ? OldNode.m_EstringL : OldNode.m_EstringR;
+
+	NewNode.m_Prof = OldNode.m_Prof;
+	NewNode.m_uLength = OldNode.m_uLength;
+	NewNode.m_Weight = OldNode.m_Weight;
+
+	// Detach the source node from everything that was handed over.
+	OldNode.m_Prof = 0;
+	OldNode.m_EstringL = 0;
+	OldNode.m_EstringR = 0;
+	}
+/*
+ * RealignDiffsE builds msaOut for NewTree while reusing per-node results
+ * that were already computed for OldTree.
+ *
+ * uNewNodeIndexToOldNodeIndex[i] is either NODE_CHANGED or the index of the
+ * OldTree node that new node i corresponds to. Unchanged nodes have their
+ * profile and edit strings moved out of OldProgNodes (MakeNode nulls the
+ * pointers in the old entry); changed nodes are re-aligned from their two
+ * children with AlignTwoProfs. The root alignment is then produced by
+ * MakeRootMSA, or MakeRootMSABrenner when g_bBrenner is set.
+ *
+ * msaIn is only used by the DEBUG consistency check. Quits if NewTree has
+ * an even number of nodes. All NewProgNodes storage is released before
+ * returning.
+ */
+void RealignDiffsE(const MSA &msaIn, const SeqVect &v,
+ const Tree &NewTree, const Tree &OldTree,
+ const unsigned uNewNodeIndexToOldNodeIndex[],
+ MSA &msaOut, ProgNode *OldProgNodes)
+ {
+ assert(OldProgNodes != 0);
+
+ const unsigned uNodeCount = NewTree.GetNodeCount();
+ if (uNodeCount%2 == 0)
+ Quit("RealignDiffs: Expected odd number of nodes");
+
+ const unsigned uMergeCount = (uNodeCount - 1)/2; // only used as the progress total below
+ ProgNode *NewProgNodes = new ProgNode[uNodeCount];
+ // Pass 1: move the data of every unchanged node from the old array into the new one.
+ for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
+ {
+ if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uNewNodeIndex])
+ continue; // changed nodes are rebuilt in pass 2
+
+ unsigned uOldNodeIndex = uNewNodeIndexToOldNodeIndex[uNewNodeIndex];
+ assert(uNewNodeIndex < uNodeCount);
+ assert(uOldNodeIndex < uNodeCount);
+
+ ProgNode &NewNode = NewProgNodes[uNewNodeIndex];
+ ProgNode &OldNode = OldProgNodes[uOldNodeIndex];
+ bool bSwapLR = false;
+ if (!NewTree.IsLeaf(uNewNodeIndex))
+ {
+ // The two children may appear in the opposite order in the new tree;
+ // if so the left/right edit strings have to be exchanged.
+ unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
+ unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
+ unsigned uOld = uNewNodeIndexToOldNodeIndex[uNewNodeIndex]; // same value as uOldNodeIndex
+ unsigned uOldLeft = OldTree.GetLeft(uOld);
+ unsigned uOldRight = OldTree.GetRight(uOld);
+ assert(uOldLeft < uNodeCount && uOldRight < uNodeCount);
+ if (uOldLeft != uNewNodeIndexToOldNodeIndex[uNewLeft])
+ {
+ assert(uOldLeft == uNewNodeIndexToOldNodeIndex[uNewRight]);
+ bSwapLR = true;
+ }
+ }
+ MakeNode(OldNode, NewNode, bSwapLR);
+#if TRACE
+ Log("MakeNode old=%u new=%u swap=%d length=%u weight=%.3g\n",
+ uOldNodeIndex, uNewNodeIndex, bSwapLR, NewNode.m_uLength, NewNode.m_Weight);
+#endif
+ }
+ // Pass 2: re-align every changed node from its two children.
+ unsigned uJoin = 0;
+ SetProgressDesc("Refine tree");
+ // NOTE(review): the traversal presumably yields children before parents,
+ // because each step consumes the children's profiles -- confirm in Tree.
+ for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode();
+ NULL_NEIGHBOR != uNewNodeIndex;
+ uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex))
+ {
+ if (NODE_CHANGED != uNewNodeIndexToOldNodeIndex[uNewNodeIndex])
+ continue; // already populated in pass 1
+
+ Progress(uJoin, uMergeCount - 1);
+ ++uJoin;
+
+ const unsigned uMergeNodeIndex = uNewNodeIndex;
+ ProgNode &Parent = NewProgNodes[uMergeNodeIndex];
+
+ const unsigned uLeft = NewTree.GetLeft(uNewNodeIndex);
+ const unsigned uRight = NewTree.GetRight(uNewNodeIndex);
+
+ ProgNode &Node1 = NewProgNodes[uLeft];
+ ProgNode &Node2 = NewProgNodes[uRight];
+
+ AlignTwoProfs(
+ Node1.m_Prof, Node1.m_uLength, Node1.m_Weight,
+ Node2.m_Prof, Node2.m_uLength, Node2.m_Weight,
+ Parent.m_Path,
+ &Parent.m_Prof, &Parent.m_uLength);
+ PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR);
+
+ Parent.m_Weight = Node1.m_Weight + Node2.m_Weight;
+ // Free the children's profiles now and null the pointers so that
+ // DeleteProgNode at the end does not free them a second time.
+ delete[] Node1.m_Prof;
+ delete[] Node2.m_Prof;
+
+ Node1.m_Prof = 0;
+ Node2.m_Prof = 0;
+ }
+
+ ProgressStepsDone();
+
+ if (g_bBrenner)
+ MakeRootMSABrenner((SeqVect &) v, NewTree, NewProgNodes, msaOut); // the cast drops const from v
+ else
+ MakeRootMSA(v, NewTree, NewProgNodes, msaOut);
+
+#if DEBUG
+ AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
+#endif
+ // Release whatever each new node still holds, then the array itself.
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ DeleteProgNode(NewProgNodes[uNodeIndex]);
+
+ delete[] NewProgNodes;
+ }
Added: trunk/packages/muscle/branches/upstream/current/refine.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refine.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refine.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,79 @@
+#include "muscle.h"
+#include "textfile.h"
+#include "seqvect.h"
+#include "distfunc.h"
+#include "msa.h"
+#include "tree.h"
+#include "clust.h"
+#include "profile.h"
+#include "clustsetmsa.h"
+/*
+ * Refine: read an existing alignment from g_pstrInFileName, refine it and
+ * write the result with MuscleOutput.
+ *
+ * Steps: register file names and start time, load the MSA (Quit if it has
+ * no sequences), select the alphabet from g_SeqType, assign sequence ids,
+ * build a guide tree with TreeFromMSA, then run RefineVert (if g_bAnchors)
+ * or RefineHoriz for up to g_uMaxIters iterations.
+ */
+void Refine()
+ {
+ SetOutputFileName(g_pstrOutFileName);
+ SetInputFileName(g_pstrInFileName);
+ SetStartTime();
+
+ SetMaxIters(g_uMaxIters);
+ SetSeqWeightMethod(g_SeqWeight1);
+
+ TextFile fileIn(g_pstrInFileName);
+ MSA msa;
+ msa.FromFile(fileIn);
+
+ const unsigned uSeqCount = msa.GetSeqCount();
+ if (0 == uSeqCount)
+ Quit("No sequences in input file");
+ // Pick the alphabet: guessed from the data for SEQTYPE_Auto, otherwise as requested.
+ ALPHA Alpha = ALPHA_Undefined;
+ switch (g_SeqType)
+ {
+ case SEQTYPE_Auto:
+ Alpha = msa.GuessAlpha();
+ break;
+
+ case SEQTYPE_Protein:
+ Alpha = ALPHA_Amino;
+ break;
+
+ case SEQTYPE_DNA:
+ Alpha = ALPHA_DNA;
+ break;
+
+ case SEQTYPE_RNA:
+ Alpha = ALPHA_RNA;
+ break;
+
+ default:
+ Quit("Invalid SeqType");
+ }
+ SetAlpha(Alpha);
+ msa.FixAlpha();
+
+ if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
+ SetPPScore(PPSCORE_SPN); // nucleotide input switches the profile-profile score to PPSCORE_SPN
+
+ MSA::SetIdCount(uSeqCount);
+
+// Initialize sequence ids: each sequence's id is its index in the input MSA.
+// From this point on, ids must somehow propagate from here.
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ msa.SetSeqId(uSeqIndex, uSeqIndex);
+ SetMuscleInputMSA(msa);
+
+ Tree GuideTree;
+ TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);
+ SetMuscleTree(GuideTree);
+
+ if (g_bAnchors)
+ RefineVert(msa, GuideTree, g_uMaxIters);
+ else
+ RefineHoriz(msa, GuideTree, g_uMaxIters, false, false); // neither end of the alignment is locked
+
+ ValidateMuscleIds(msa);
+ ValidateMuscleIds(GuideTree);
+
+// TextFile fileOut(g_pstrOutFileName, true);
+// msa.ToFile(fileOut);
+ MuscleOutput(msa);
+ }
Added: trunk/packages/muscle/branches/upstream/current/refinehoriz.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinehoriz.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinehoriz.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,288 @@
+#include "muscle.h"
+#include "tree.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "profile.h"
+#include "scorehistory.h"
+#include "objscore.h"
+
+unsigned g_uRefineHeightSubtree; // progress counter: reset per iteration in RefineHoriz, incremented per split in RefineHeightParts
+unsigned g_uRefineHeightSubtreeTotal; // progress total passed to Progress(); set per iteration in RefineHoriz
+
+#define TRACE 0
+#define DIFFOBJSCORE 0
+
+// Re-align two groups of sequences taken from msaIn and keep the result
+// only if it scores better.
+//
+// Leaves1/Leaves2 (uCount1/uCount2 entries) are leaf node indexes of tree
+// naming the two groups. Each group is extracted from msaIn by sequence id,
+// passed through DeleteGappedCols and the two groups are aligned to each
+// other with AlignTwoMSAs (bLockLeft/bLockRight are forwarded unchanged).
+//
+// If the pair-wise path is the same before and after, nothing is scored:
+// both scores are reported as 0 and false is returned. Otherwise the
+// objective score of both alignments is computed and msaIn is replaced by
+// the re-alignment when its score is strictly higher.
+//
+// *ptrscoreBefore / *ptrscoreAfter are written on every path (0 when no
+// scoring took place). Returns true if msaIn was replaced.
+static bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[],
+  unsigned uCount1, const unsigned Leaves2[], unsigned uCount2,
+  SCORE *ptrscoreBefore, SCORE *ptrscoreAfter,
+  bool bLockLeft, bool bLockRight)
+	{
+#if TRACE
+	Log("TryRealign, msaIn=\n");
+	msaIn.LogMe();
+#endif
+
+	// The caller reads both scores unconditionally, so give them a defined
+	// value before any early return.
+	*ptrscoreBefore = 0;
+	*ptrscoreAfter = 0;
+
+	const unsigned uSeqCount = msaIn.GetSeqCount();
+
+	unsigned *Ids1 = new unsigned[uSeqCount];
+	unsigned *Ids2 = new unsigned[uSeqCount];
+
+	LeafIndexesToIds(tree, Leaves1, uCount1, Ids1);
+	LeafIndexesToIds(tree, Leaves2, uCount2, Ids2);
+
+	MSA msa1;
+	MSA msa2;
+
+	MSASubsetByIds(msaIn, Ids1, uCount1, msa1);
+	MSASubsetByIds(msaIn, Ids2, uCount2, msa2);
+
+#if DEBUG
+	ValidateMuscleIds(msa1);
+	ValidateMuscleIds(msa2);
+#endif
+
+// Computing the objective score may be expensive for
+// large numbers of sequences. As a speed optimization,
+// we check whether the alignment changes. If it does
+// not change, there is no need to compute the objective
+// score. We test for the alignment changing by comparing
+// the Viterbi paths before and after re-aligning.
+	PWPath pathBefore;
+	pathBefore.FromMSAPair(msa1, msa2);
+
+	DeleteGappedCols(msa1);
+	DeleteGappedCols(msa2);
+
+	// Nothing to align if either group has no columns left.
+	if (0 == msa1.GetColCount() || 0 == msa2.GetColCount())
+		{
+		delete[] Ids1;
+		delete[] Ids2;
+		return false;
+		}
+
+	MSA msaRealigned;
+	PWPath pathAfter;
+
+	AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight);
+
+	bool bAnyChanges = !pathAfter.Equal(pathBefore);
+	unsigned uDiffCount1;
+	unsigned uDiffCount2;
+	// NOTE(review): fixed capacity; whether DiffPaths can write more than
+	// 10000 entries for long paths is not visible here -- confirm.
+	static unsigned Edges1[10000];
+	static unsigned Edges2[10000];
+	DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2);
+
+#if TRACE
+	Log("TryRealign, msa1=\n");
+	msa1.LogMe();
+	Log("\nmsa2=\n");
+	msa2.LogMe();
+	Log("\nRealigned (changes %s)=\n", bAnyChanges ? "TRUE" : "FALSE");
+	msaRealigned.LogMe();
+#endif
+
+	if (!bAnyChanges)
+		{
+		// Scores stay at 0: identical paths mean identical alignments.
+		delete[] Ids1;
+		delete[] Ids2;
+		return false;
+		}
+
+	SetMSAWeightsMuscle(msaIn);
+	SetMSAWeightsMuscle(msaRealigned);
+
+#if DIFFOBJSCORE
+	const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1,
+	  msaRealigned, pathAfter, Edges2, uDiffCount2);
+	bool bAccept = (scoreDiff > 0);
+	*ptrscoreBefore = 0;
+	*ptrscoreAfter = scoreDiff;
+	//const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2);
+	//const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2);
+	//Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore);
+#else
+	const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2);
+	const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2);
+
+	bool bAccept = (scoreAfter > scoreBefore);
+
+#if TRACE
+	Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? "TRUE" : "FALSE");
+#endif
+
+	*ptrscoreBefore = scoreBefore;
+	*ptrscoreAfter = scoreAfter;
+#endif
+
+	if (bAccept)
+		msaIn.Copy(msaRealigned);
+	delete[] Ids1;
+	delete[] Ids2;
+	return bAccept;
+	}
+
+// Make one pass over the internal nodes of the tree, trying to improve
+// msaIn by re-aligning the two groups of sequences defined by an edge
+// below each node.
+//
+// For each entry of InternalNodeIndexes, bRight selects the child that
+// defines the split (right child if true, left child otherwise). Group 1
+// is the leaves GetLeaves returns for that child; group 2 is what
+// GetLeavesExcluding returns for the root with that child excluded. The
+// root itself is skipped when bRight is false.
+//
+// After each attempt the larger of the before/after scores is stored in
+// History; if History.SetScore reports a repeat, *ptrbOscillating is set
+// and the pass stops immediately. *ptrbAnyChanges reports whether any
+// re-alignment was accepted. bReversed is only used by TRACE logging.
+static void RefineHeightParts(MSA &msaIn, const Tree &tree,
+  const unsigned InternalNodeIndexes[], bool bReversed, bool bRight,
+  unsigned uIter,
+  ScoreHistory &History,
+  bool *ptrbAnyChanges, bool *ptrbOscillating, bool bLockLeft, bool bLockRight)
+	{
+	*ptrbOscillating = false;
+
+	const unsigned uSeqCount = msaIn.GetSeqCount();
+	const unsigned uNodeTotal = uSeqCount - 1;
+	const unsigned uRoot = tree.GetRootNodeIndex();
+
+	// Scratch buffers for the two leaf groups of the current split.
+	unsigned *Group1 = new unsigned[uSeqCount];
+	unsigned *Group2 = new unsigned[uSeqCount];
+
+	bool bChanged = false;
+	for (unsigned k = 0; k < uNodeTotal; ++k)
+		{
+		const unsigned uNode = InternalNodeIndexes[k];
+		if (tree.IsRoot(uNode) && !bRight)
+			continue;
+
+		const unsigned uChild = bRight ? tree.GetRight(uNode) : tree.GetLeft(uNode);
+
+		g_uTreeSplitNode1 = uNode;
+		g_uTreeSplitNode2 = uChild;
+
+		unsigned uSize1;
+		unsigned uSize2;
+		GetLeaves(tree, uChild, Group1, &uSize1);
+		GetLeavesExcluding(tree, uRoot, uChild, Group2, &uSize2);
+
+#if TRACE
+		Log("\nRefineHeightParts node %u\n", uNode);
+		Log("Group1=");
+		for (unsigned n = 0; n < uSize1; ++n)
+			Log(" %u(%s)", Group1[n], tree.GetName(Group1[n]));
+		Log("\n");
+		Log("Group2=");
+		for (unsigned n = 0; n < uSize2; ++n)
+			Log(" %u(%s)", Group2[n], tree.GetName(Group2[n]));
+		Log("\n");
+#endif
+
+		SCORE scoreBefore;
+		SCORE scoreAfter;
+		const bool bAccepted = TryRealign(msaIn, tree, Group1, uSize1, Group2, uSize2,
+		  &scoreBefore, &scoreAfter, bLockLeft, bLockRight);
+		SetCurrentAlignment(msaIn);
+
+		++g_uRefineHeightSubtree;
+		Progress(g_uRefineHeightSubtree, g_uRefineHeightSubtreeTotal);
+
+#if TRACE
+		if (uIter > 0)
+			Log("Before %g %g\n", scoreBefore,
+			  History.GetScore(uIter - 1, uNode, bReversed, bRight));
+#endif
+		// Record the better of the two scores for this split.
+		SCORE scoreBest = scoreBefore;
+		if (scoreAfter > scoreBefore)
+			scoreBest = scoreAfter;
+
+		if (History.SetScore(uIter, uNode, bRight, scoreBest))
+			{
+			*ptrbOscillating = true;
+			break;
+			}
+
+		bChanged = bChanged || bAccepted;
+		}
+
+	delete[] Group1;
+	delete[] Group2;
+
+	*ptrbAnyChanges = bChanged;
+	}
+/*
+ * RefineHoriz: iteratively refine msaIn by re-aligning tree bipartitions.
+ *
+ * Quits if tree is not rooted; returns false without doing anything when
+ * there are fewer than 3 sequences. Runs at most uIters iterations. Each
+ * iteration calls RefineHeightParts twice over the internal nodes (first
+ * with bRight = true, then with bRight = false), using the height-ordered
+ * node list on even iterations and its reverse on odd ones. Iteration stops
+ * early when RefineHeightParts reports oscillation or when an iteration
+ * accepts no change. bLockLeft / bLockRight are passed through unchanged.
+ */
+// Return true if any changes made
+bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft,
+ bool bLockRight)
+ {
+#if TRACE
+ tree.LogMe();
+#endif
+
+ if (!tree.IsRooted())
+ Quit("RefineHeight: requires rooted tree");
+
+ const unsigned uSeqCount = msaIn.GetSeqCount();
+ if (uSeqCount < 3)
+ return false;
+
+ const unsigned uInternalNodeCount = uSeqCount - 1;
+ unsigned *InternalNodeIndexes = new unsigned[uInternalNodeCount];
+ unsigned *InternalNodeIndexesR = new unsigned[uInternalNodeCount]; // same list in reverse order
+
+ GetInternalNodesInHeightOrder(tree, InternalNodeIndexes);
+
+ ScoreHistory History(uIters, 2*uSeqCount - 1);
+
+ bool bAnyChangesAnyIter = false;
+ for (unsigned n = 0; n < uInternalNodeCount; ++n)
+ InternalNodeIndexesR[uInternalNodeCount - 1 - n] = InternalNodeIndexes[n];
+
+ for (unsigned uIter = 0; uIter < uIters; ++uIter)
+ {
+ bool bAnyChangesThisIter = false;
+ IncIter();
+ SetProgressDesc("Refine biparts");
+ g_uRefineHeightSubtree = 0;
+ g_uRefineHeightSubtreeTotal = uInternalNodeCount*2 - 1; // two passes; the bRight=false pass skips the root
+
+ bool bReverse = (uIter%2 != 0); // odd iterations walk the nodes in reverse order
+ unsigned *Internals;
+ if (bReverse)
+ Internals = InternalNodeIndexesR;
+ else
+ Internals = InternalNodeIndexes;
+
+ bool bOscillating;
+ for (unsigned i = 0; i < 2; ++i)
+ {
+ bool bAnyChanges = false;
+ bool bRight;
+ switch (i) // pass 0 splits at right children, pass 1 at left children
+ {
+ case 0:
+ bRight = true;
+ break;
+ case 1:
+ bRight = false;
+ break;
+ default:
+ Quit("RefineHeight default case");
+ }
+ RefineHeightParts(msaIn, tree, Internals, bReverse, bRight,
+ uIter,
+ History,
+ &bAnyChanges, &bOscillating, bLockLeft, bLockRight);
+ if (bOscillating)
+ {
+ ProgressStepsDone();
+ goto Osc; // leave both loops; cleanup happens at the label
+ }
+ if (bAnyChanges)
+ {
+ bAnyChangesThisIter = true;
+ bAnyChangesAnyIter = true;
+ }
+ }
+
+ ProgressStepsDone();
+ if (bOscillating) // not reachable as true: the oscillating case already jumped to Osc
+ break;
+
+ if (!bAnyChangesThisIter)
+ break; // converged: a full iteration accepted nothing
+ }
+
+Osc:
+ delete[] InternalNodeIndexes;
+ delete[] InternalNodeIndexesR;
+
+ return bAnyChangesAnyIter;
+ }
Added: trunk/packages/muscle/branches/upstream/current/refinesubfams.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinesubfams.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinesubfams.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,212 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "clust.h"
+#include "profile.h"
+#include "pwpath.h"
+
+#define TRACE 0
+
+static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[],
+ unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa);
+
+// Pick the subfamilies of a tree.
+// The result is an array of node indexes, one per subfamily, allocated here
+// with new[] and handed to the caller through *ptrptrSubfams (the caller
+// must delete[] it); the number of entries goes to *ptruSubfamCount.
+// Groups are first chosen by height (ClusterByHeight with dMaxHeight, which
+// should approximate minimum percent identity). If that produces more than
+// uMaxSubfamCount groups, ClusterBySubfamCount is used instead to cut the
+// tree at the maximum allowed number of subfamilies.
+static void GetSubfams(const Tree &tree, double dMaxHeight,
+  unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount)
+	{
+	unsigned *Nodes = new unsigned[tree.GetNodeCount()];
+	unsigned uFound;
+
+	ClusterByHeight(tree, dMaxHeight, Nodes, &uFound);
+	const bool bTooMany = (uFound > uMaxSubfamCount);
+	if (bTooMany)
+		ClusterBySubfamCount(tree, uMaxSubfamCount, Nodes, &uFound);
+
+	*ptrptrSubfams = Nodes;
+	*ptruSubfamCount = uFound;
+	}
+
+// Write the subfamily assignment to the log: a count line, a two-line table
+// header, then one row per leaf giving the 1-based subfamily number and the
+// leaf's name, with an empty line after each subfamily.
+static void LogSubfams(const Tree &tree, const unsigned Subfams[],
+  unsigned uSubfamCount)
+	{
+	const unsigned uNodeCount = tree.GetNodeCount();
+
+	Log("%u subfamilies found\n", uSubfamCount);
+	Log("Subfam Sequence\n");
+	Log("------ --------\n");
+
+	// Scratch buffer with one slot per tree node.
+	unsigned *Members = new unsigned[uNodeCount];
+	for (unsigned s = 0; s < uSubfamCount; ++s)
+		{
+		unsigned uMemberCount;
+		GetLeaves(tree, Subfams[s], Members, &uMemberCount);
+
+		const unsigned uLabel = s + 1;
+		for (unsigned m = 0; m < uMemberCount; ++m)
+			Log("%6u %s\n", uLabel, tree.GetLeafName(Members[m]));
+		Log("\n");
+		}
+	delete[] Members;
+	}
+
+// Refine an alignment one subfamily at a time.
+//
+// The tree is cut into subfamilies by GetSubfams (height limit 0.6, at most
+// 16 subfamilies). For each subfamily its sequences are extracted from msa,
+// passed through DeleteGappedCols and, if the subfamily has more than two
+// sequences, refined with RefineVert (g_bAnchors) or RefineHoriz against a
+// tree re-estimated from the subfamily alignment. If any subfamily changed,
+// the subfamily alignments are merged back into msa with
+// ProgressiveAlignSubfams.
+//
+// Returns true if any subfamily alignment changed; returns false
+// immediately when msa has fewer than 3 sequences.
+bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	if (uSeqCount < 3)
+		return false;
+
+	const double dMaxHeight = 0.6;
+	const unsigned uMaxSubfamCount = 16;
+
+	unsigned *Subfams;
+	unsigned uSubfamCount;
+	GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount);
+	assert(uSubfamCount <= uSeqCount);
+
+	if (g_bVerbose)
+		LogSubfams(tree, Subfams, uSubfamCount);
+
+	MSA *SubfamMSAs = new MSA[uSubfamCount];
+	unsigned *Leaves = new unsigned[uSeqCount];
+	unsigned *Ids = new unsigned[uSeqCount];
+
+	bool bAnyChanges = false;
+	for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
+		{
+		unsigned uSubfam = Subfams[uSubfamIndex];
+		unsigned uLeafCount;
+		GetLeaves(tree, uSubfam, Leaves, &uLeafCount);
+		assert(uLeafCount <= uSeqCount);
+
+		LeafIndexesToIds(tree, Leaves, uLeafCount, Ids);
+
+		MSA &msaSubfam = SubfamMSAs[uSubfamIndex];
+		MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam);
+		DeleteGappedCols(msaSubfam);
+
+#if TRACE
+		Log("Subfam %u MSA=\n", uSubfamIndex);
+		msaSubfam.LogMe();
+#endif
+
+		// Subfamilies of one or two sequences are kept as extracted.
+		if (msaSubfam.GetSeqCount() <= 2)
+			continue;
+
+		// TODO /////////////////////////////////////////
+		// Try using existing tree, may actually hurt to
+		// re-estimate, may also be a waste of CPU & mem.
+		/////////////////////////////////////////////////
+		Tree SubfamTree;
+		TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2);
+
+		bool bAnyChangesThisSubfam;
+		if (g_bAnchors)
+			bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters);
+		else
+			bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false);
+#if TRACE
+		Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam);
+#endif
+		if (bAnyChangesThisSubfam)
+			bAnyChanges = true;
+		}
+
+	if (bAnyChanges)
+		ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa);
+
+	// Release every scratch array, including Ids.
+	delete[] Ids;
+	delete[] Leaves;
+	delete[] Subfams;
+	delete[] SubfamMSAs;
+
+	return bAnyChanges;
+	}
+
+// Merge the per-subfamily alignments back into one alignment by walking the
+// tree and aligning siblings pair-wise.
+//
+// Subfams[i] is the tree node of subfamily i and SubfamMSAs[i] its
+// alignment. Each subfamily node starts out "ready" with a private copy of
+// its alignment. In depth-first order, every internal node whose two
+// children are both ready gets the AlignTwoMSAs result of the children; the
+// children's alignments are then freed. The alignment left at the root is
+// copied into msa.
+//
+// All temporary alignments and both bookkeeping arrays are released before
+// returning.
+static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[],
+  unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa)
+	{
+	const unsigned uNodeCount = tree.GetNodeCount();
+
+	bool *Ready = new bool[uNodeCount];
+	MSA **MSAs = new MSA *[uNodeCount];
+	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+		{
+		Ready[uNodeIndex] = false;
+		MSAs[uNodeIndex] = 0;
+		}
+
+	// Seed the subfamily nodes with copies of their alignments.
+	for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
+		{
+		unsigned uNodeIndex = Subfams[uSubfamIndex];
+		Ready[uNodeIndex] = true;
+		MSA *ptrMSA = new MSA;
+		// TODO: Wasteful copy, needs re-design
+		ptrMSA->Copy(SubfamMSAs[uSubfamIndex]);
+		MSAs[uNodeIndex] = ptrMSA;
+		}
+
+	for (unsigned uNodeIndex = tree.FirstDepthFirstNode();
+	  NULL_NEIGHBOR != uNodeIndex;
+	  uNodeIndex = tree.NextDepthFirstNode(uNodeIndex))
+		{
+		if (tree.IsLeaf(uNodeIndex))
+			continue;
+
+		unsigned uRight = tree.GetRight(uNodeIndex);
+		unsigned uLeft = tree.GetLeft(uNodeIndex);
+		// Nodes inside a subfamily have no ready children and are skipped.
+		if (!Ready[uRight] || !Ready[uLeft])
+			continue;
+
+		MSA *ptrLeft = MSAs[uLeft];
+		MSA *ptrRight = MSAs[uRight];
+		assert(ptrLeft != 0 && ptrRight != 0);
+
+		MSA *ptrParent = new MSA;
+
+		PWPath Path;
+		AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path);
+
+		MSAs[uNodeIndex] = ptrParent;
+		Ready[uNodeIndex] = true;
+		Ready[uLeft] = false;
+		Ready[uRight] = false;
+
+		// The children's alignments are merged into the parent; free them.
+		delete MSAs[uLeft];
+		delete MSAs[uRight];
+		MSAs[uLeft] = 0;
+		MSAs[uRight] = 0;
+		}
+
+#if DEBUG
+	{
+	unsigned uReadyCount = 0;
+	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+		{
+		if (Ready[uNodeIndex])
+			{
+			assert(tree.IsRoot(uNodeIndex));
+			++uReadyCount;
+			assert(0 != MSAs[uNodeIndex]);
+			}
+		else
+			assert(0 == MSAs[uNodeIndex]);
+		}
+	assert(1 == uReadyCount);
+	}
+#endif
+
+	const unsigned uRoot = tree.GetRootNodeIndex();
+	MSA *ptrRootAlignment = MSAs[uRoot];
+
+	msa.Copy(*ptrRootAlignment);
+
+	delete ptrRootAlignment;
+	MSAs[uRoot] = 0;
+
+	// Release the per-node bookkeeping arrays.
+	delete[] Ready;
+	delete[] MSAs;
+
+#if TRACE
+	Log("After refine subfamilies, root alignment=\n");
+	msa.LogMe();
+#endif
+	}
Added: trunk/packages/muscle/branches/upstream/current/refinetree.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinetree.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinetree.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,59 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include <stdio.h>
+
+// Repeatedly re-estimate the guide tree from the current alignment and
+// re-align according to the differences between the new and previous tree.
+//
+// Quits if tree and msa disagree on the number of sequences; does nothing
+// for fewer than 3 sequences. Runs at most g_uMaxTreeRefineIters rounds.
+// In each round tree is replaced by the newly estimated tree. The loop ends
+// when the difference count derived from the Diffs tree is zero or is not
+// smaller than in the previous round; otherwise msa is replaced by the
+// output of RealignDiffs and registered with SetCurrentAlignment.
+void RefineTree(MSA &msa, Tree &tree)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	if (uSeqCount != tree.GetLeafCount())
+		Quit("Refine tree, tree has different number of nodes");
+
+	if (uSeqCount < 3)
+		return;
+
+#if DEBUG
+	ValidateMuscleIds(msa);
+	ValidateMuscleIds(tree);
+#endif
+
+	unsigned *LeafMap = new unsigned[uSeqCount];
+	Tree NewTree;
+	unsigned uBestDiffs = uSeqCount;
+	for (unsigned uPass = 0; uPass < g_uMaxTreeRefineIters; ++uPass)
+		{
+		TreeFromMSA(msa, NewTree, g_Cluster2, g_Distance2, g_Root2);
+
+#if DEBUG
+		ValidateMuscleIds(NewTree);
+#endif
+
+		Tree Diffs;
+		DiffTrees(NewTree, tree, Diffs, LeafMap);
+
+		tree.Copy(NewTree);
+
+		const unsigned uDiffs = (Diffs.GetNodeCount() - 1)/2;
+		const bool bConverged = (0 == uDiffs);
+		const bool bNoProgress = (uDiffs >= uBestDiffs);
+		if (bConverged || bNoProgress)
+			{
+			ProgressStepsDone();
+			break;
+			}
+		uBestDiffs = uDiffs;
+
+		MSA Realigned;
+		RealignDiffs(msa, Diffs, LeafMap, Realigned);
+
+#if DEBUG
+		ValidateMuscleIds(Realigned);
+#endif
+
+		msa.Copy(Realigned);
+		SetCurrentAlignment(msa);
+		}
+
+	delete[] LeafMap;
+	}
Added: trunk/packages/muscle/branches/upstream/current/refinetreee.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinetreee.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinetreee.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,51 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+// Single round of tree refinement that reuses per-node data (ProgNodes)
+// from the previous progressive alignment.
+//
+// Quits if tree and msa disagree on the number of sequences; does nothing
+// for fewer than 3 sequences. A new tree is estimated from msa and compared
+// with tree by DiffTreesE. Only when the new tree's root is marked
+// NODE_CHANGED is the alignment rebuilt with RealignDiffsE, after which
+// tree and msa are replaced by the new tree and alignment. msa is always
+// registered with SetCurrentAlignment before returning.
+void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	if (uSeqCount != tree.GetLeafCount())
+		Quit("Refine tree, tree has different number of nodes");
+
+	if (uSeqCount < 3)
+		return;
+
+#if DEBUG
+	ValidateMuscleIds(msa);
+	ValidateMuscleIds(tree);
+#endif
+
+	// One map slot per node of the current tree.
+	unsigned *NewToOld = new unsigned[tree.GetNodeCount()];
+
+	Tree NewTree;
+	TreeFromMSA(msa, NewTree, g_Cluster2, g_Distance2, g_Root2);
+
+#if DEBUG
+	ValidateMuscleIds(NewTree);
+#endif
+
+	DiffTreesE(NewTree, tree, NewToOld);
+
+	const unsigned uNewRoot = NewTree.GetRootNodeIndex();
+	const bool bRootChanged = (NODE_CHANGED == NewToOld[uNewRoot]);
+	if (bRootChanged)
+		{
+		MSA Realigned;
+		RealignDiffsE(msa, v, NewTree, tree, NewToOld, Realigned, ProgNodes);
+		tree.Copy(NewTree);
+		msa.Copy(Realigned);
+#if DEBUG
+		ValidateMuscleIds(Realigned);
+#endif
+		}
+
+	delete[] NewToOld;
+
+	SetCurrentAlignment(msa);
+	ProgressStepsDone();
+	}
Added: trunk/packages/muscle/branches/upstream/current/refinevert.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinevert.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinevert.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,159 @@
+#include "muscle.h"
+#include "profile.h"
+#include "msa.h"
+#include "pwpath.h"
+#include "seqvect.h"
+#include "clust.h"
+#include "tree.h"
+
+#define TRACE 0
+// Column interval [m_uBestColLeft, m_uBestColRight) of an alignment, as filled in by ColsToRanges.
+struct Range
+ {
+ unsigned m_uBestColLeft; // first column of the range
+ unsigned m_uBestColRight; // one past the last column (RefineVert uses Right - Left as the column count)
+ };
+
+// Log how much dynamic-programming area the anchor columns save.
+//
+// The full area is uColCount squared; the area actually needed is the sum
+// of the squared lengths of the uRangeCount ranges. The saving is logged as
+// a percentage together with uAnchorColCount. Does nothing unless both
+// g_bVerbose and g_bAnchors are set.
+//
+// Areas are computed in double so that alignments with 65536 or more
+// columns do not wrap around unsigned arithmetic.
+static void ListVertSavings(unsigned uColCount, unsigned uAnchorColCount,
+  const Range *Ranges, unsigned uRangeCount)
+	{
+	if (!g_bVerbose || !g_bAnchors)
+		return;
+
+	const double dColCount = (double) uColCount;
+	const double dTotalArea = dColCount*dColCount;
+
+	double dArea = 0.0;
+	for (unsigned i = 0; i < uRangeCount; ++i)
+		{
+		const double dLength =
+		  (double) (Ranges[i].m_uBestColRight - Ranges[i].m_uBestColLeft);
+		dArea += dLength*dLength;
+		}
+
+	// An empty alignment has no area to save; report 0% rather than NaN.
+	double dPct = 0.0;
+	if (dTotalArea > 0.0)
+		dPct = (dTotalArea - dArea)*100.0/dTotalArea;
+
+	Log("Anchor columns found %u\n", uAnchorColCount);
+	Log("DP area saved by anchors %-4.1f%%\n", dPct);
+	}
+
+// Turn a list of anchor columns into the column ranges between them.
+// N anchor columns produce N+1 ranges: the first starts at column 0, each
+// later one starts at the previous anchor column, and the last ends at
+// uColCount. Ranges must have room for uBestColCount + 1 entries.
+static void ColsToRanges(const unsigned BestCols[], unsigned uBestColCount,
+  unsigned uColCount, Range Ranges[])
+	{
+	// Left edge of the range being written; carried from one range to the next.
+	unsigned uLeft = 0;
+	for (unsigned i = 0; i < uBestColCount; ++i)
+		{
+		const unsigned uRight = BestCols[i];
+		Ranges[i].m_uBestColLeft = uLeft;
+		Ranges[i].m_uBestColRight = uRight;
+		uLeft = uRight;
+		}
+
+	// Final range: from the last anchor (or column 0) to the end.
+	Ranges[uBestColCount].m_uBestColLeft = uLeft;
+	Ranges[uBestColCount].m_uBestColRight = uColCount;
+	}
+/*
+ * RefineVert: refine msaIn block by block between anchor columns.
+ *
+ * Anchor columns are located with FindAnchorCols and converted to column
+ * ranges by ColsToRanges. Each range is cut out of msaIn, refined with
+ * RefineHoriz (ranges of exactly one column are copied unrefined, empty
+ * ranges are skipped) and appended to a new alignment. msaIn is replaced
+ * by that new alignment only if some block changed.
+ *
+ * Returns false immediately when msaIn has fewer than 3 columns or fewer
+ * than 3 sequences.
+ */
+// Return true if any changes made
+bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters)
+ {
+ bool bAnyChanges = false;
+
+ const unsigned uColCountIn = msaIn.GetColCount();
+ const unsigned uSeqCountIn = msaIn.GetSeqCount();
+
+ if (uColCountIn < 3 || uSeqCountIn < 3)
+ return false;
+
+ unsigned *AnchorCols = new unsigned[uColCountIn];
+ unsigned uAnchorColCount;
+ SetMSAWeightsMuscle(msaIn);
+ FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount);
+
+ const unsigned uRangeCount = uAnchorColCount + 1; // N anchor columns give N+1 blocks
+ Range *Ranges = new Range[uRangeCount];
+
+#if TRACE
+ Log("%u ranges\n", uRangeCount);
+#endif
+
+ ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges);
+ ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount);
+
+#if TRACE
+ {
+ Log("Anchor cols: ");
+ for (unsigned i = 0; i < uAnchorColCount; ++i)
+ Log(" %u", AnchorCols[i]);
+ Log("\n");
+
+ Log("Ranges:\n");
+ for (unsigned i = 0; i < uRangeCount; ++i)
+ Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight);
+ }
+#endif
+
+ delete[] AnchorCols; // only Ranges is needed from here on
+
+ MSA msaOut;
+ msaOut.SetSize(uSeqCountIn, 0);
+ // Give the output alignment the same sequence names and ids as the input.
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex)
+ {
+ const char *ptrName = msaIn.GetSeqName(uSeqIndex);
+ unsigned uId = msaIn.GetSeqId(uSeqIndex);
+ msaOut.SetSeqName(uSeqIndex, ptrName);
+ msaOut.SetSeqId(uSeqIndex, uId);
+ }
+
+ for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex)
+ {
+ MSA msaRange;
+
+ const Range &r = Ranges[uRangeIndex];
+
+ const unsigned uFromColIndex = r.m_uBestColLeft;
+ const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex;
+
+ if (0 == uRangeColCount)
+ continue; // empty block, nothing to copy
+ else if (1 == uRangeColCount)
+ {
+ // A single column is copied through without refinement.
+ MSAFromColRange(msaIn, uFromColIndex, 1, msaRange);
+ MSAAppend(msaOut, msaRange);
+ continue;
+ }
+ MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange);
+
+#if TRACE
+ Log("\n-------------\n");
+ Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount);
+ Log("Before:\n");
+ msaRange.LogMe();
+#endif
+
+ bool bLockLeft = (0 != uRangeIndex); // every block except the first is locked on the left
+ bool bLockRight = (uRangeCount - 1 != uRangeIndex); // every block except the last is locked on the right
+ bool bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight);
+ bAnyChanges = (bAnyChanges || bAnyChangesThisBlock);
+
+#if TRACE
+ Log("After:\n");
+ msaRange.LogMe();
+#endif
+
+ MSAAppend(msaOut, msaRange);
+
+#if TRACE
+ Log("msaOut after Cat:\n");
+ msaOut.LogMe();
+#endif
+ }
+
+#if DEBUG
+// Sanity check
+ AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
+#endif
+
+ delete[] Ranges;
+ if (bAnyChanges)
+ msaIn.Copy(msaOut); // the input is left untouched when no block changed
+ return bAnyChanges;
+ }
Added: trunk/packages/muscle/branches/upstream/current/refinew.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/refinew.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/refinew.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,227 @@
+#include "muscle.h"
+#include "msa.h"
+#include "seqvect.h"
+#include "textfile.h"
+
+#define MEMDEBUG 0
+
+#if MEMDEBUG
+#include <crtdbg.h>
+#endif
+
+void MUSCLE(SeqVect &v, MSA &msaOut);
+
+// Append the columns of msa2 to the right of msa1.
+// Rows are matched by sequence id: for each sequence of msa1 the row with
+// the same id in msa2 supplies the characters; if msa2 has no such row the
+// new columns are filled with '-' for that sequence.
+void AppendMSA(MSA &msa1, const MSA &msa2)
+	{
+	const unsigned uSeqCount = msa1.GetSeqCount();
+
+	// Column counts are sampled once, before msa1 is written to.
+	const unsigned uColCount1 = msa1.GetColCount();
+	const unsigned uColCount2 = msa2.GetColCount();
+
+	for (unsigned uRow = 0; uRow < uSeqCount; ++uRow)
+		{
+		unsigned uRow2;
+		const bool bHaveRow = msa2.GetSeqIndex(msa1.GetSeqId(uRow), &uRow2);
+
+		for (unsigned uCol = 0; uCol < uColCount2; ++uCol)
+			{
+			const char c = bHaveRow ? msa2.GetChar(uRow2, uCol) : '-';
+			msa1.SetChar(uRow, uColCount1 + uCol, c);
+			}
+		}
+	}
+
+// Build sequence s from row uSeqIndex of msa, restricted to the columns
+// uColFrom..uColTo (both inclusive). s takes the row's name and id; gap
+// characters are dropped, every other character is appended in column order.
+static void SeqFromMSACols(const MSA &msa, unsigned uSeqIndex, unsigned uColFrom,
+  unsigned uColTo, Seq &s)
+	{
+	s.Clear();
+	s.SetName(msa.GetSeqName(uSeqIndex));
+	s.SetId(msa.GetSeqId(uSeqIndex));
+
+	for (unsigned uCol = uColFrom; uCol <= uColTo; ++uCol)
+		{
+		const char c = msa.GetChar(uSeqIndex, uCol);
+		if (IsGapChar(c))
+			continue;
+		s.AppendChar(c);
+		}
+	}
+
+// Fill v with one ungapped sequence per row of msa, each taken from the
+// columns uColFrom..uColTo (inclusive) via SeqFromMSACols. Any previous
+// content of v is cleared first.
+static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo,
+  SeqVect &v)
+	{
+	v.Clear();
+
+	const unsigned uRows = msa.GetSeqCount();
+	unsigned uRow = 0;
+	while (uRow < uRows)
+		{
+		Seq Row;
+		SeqFromMSACols(msa, uRow, uColFrom, uColTo, Row);
+		v.AppendSeq(Row);
+		++uRow;
+		}
+	}
+
+// Re-align msaIn window by window and collect the result in msaOut.
+//
+// The columns are divided into windows of g_uRefineWindow columns starting
+// at column g_uWindowOffset. Windows g_uWindowFrom..g_uWindowTo (inclusive)
+// are processed; a g_uWindowTo of 0 is replaced by the index of the last
+// window. For each window the ungapped sequences are extracted, aligned
+// from scratch with MUSCLE() and appended to msaOut with AppendMSA. When
+// the window index equals g_uSaveWindow, the window's input alignment,
+// input sequences and output alignment are also written to
+// win<N>_inaln.tmp, win<N>_inseqs.tmp and win<N>_outaln.tmp.
+//
+// Progress is reported on stderr. Quits if g_uRefineWindow is zero.
+void RefineW(const MSA &msaIn, MSA &msaOut)
+	{
+	// The window size is used as a divisor below.
+	if (0 == g_uRefineWindow)
+		Quit("RefineW: window size must be greater than zero");
+
+	const unsigned uSeqCount = msaIn.GetSeqCount();
+	const unsigned uColCount = msaIn.GetColCount();
+
+// Reserve same nr seqs, 20% more cols
+	const unsigned uReserveColCount = (uColCount*120)/100;
+	msaOut.SetSize(uSeqCount, uReserveColCount);
+
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex));
+		msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex));
+		}
+
+	const unsigned uWindowCount = (uColCount + g_uRefineWindow - 1)/g_uRefineWindow;
+	if (0 == g_uWindowTo)
+		g_uWindowTo = uWindowCount - 1;
+
+#if MEMDEBUG
+	_CrtSetBreakAlloc(1560);
+#endif
+
+	// Columns before the offset are taken over from the input unchanged.
+	if (g_uWindowOffset > 0)
+		MSAFromColRange(msaIn, 0, g_uWindowOffset, msaOut);
+
+	fprintf(stderr, "\n");
+	for (unsigned uWindowIndex = g_uWindowFrom; uWindowIndex <= g_uWindowTo; ++uWindowIndex)
+		{
+		fprintf(stderr, "Window %u of %u \r", uWindowIndex, uWindowCount);
+		const unsigned uColFrom = g_uWindowOffset + uWindowIndex*g_uRefineWindow;
+		unsigned uColTo = uColFrom + g_uRefineWindow - 1;
+		// The last window may be shorter than the others.
+		if (uColTo >= uColCount)
+			uColTo = uColCount - 1;
+		assert(uColTo >= uColFrom);
+
+		SeqVect v;
+		SeqVectFromMSACols(msaIn, uColFrom, uColTo, v);
+
+#if MEMDEBUG
+		_CrtMemState s1;
+		_CrtMemCheckpoint(&s1);
+#endif
+
+		MSA msaTmp;
+		MUSCLE(v, msaTmp);
+		AppendMSA(msaOut, msaTmp);
+		if (uWindowIndex == g_uSaveWindow)
+			{
+			MSA msaInTmp;
+			unsigned un = uColTo - uColFrom + 1;
+			MSAFromColRange(msaIn, uColFrom, un, msaInTmp);
+
+			// "win" + at most 10 digits + suffix always fits in 256 bytes.
+			char fn[256];
+			sprintf(fn, "win%u_inaln.tmp", uWindowIndex);
+			TextFile fIn(fn, true);
+			msaInTmp.ToFile(fIn);
+
+			sprintf(fn, "win%u_inseqs.tmp", uWindowIndex);
+			TextFile fv(fn, true);
+			v.ToFile(fv);
+
+			sprintf(fn, "win%u_outaln.tmp", uWindowIndex);
+			TextFile fOut(fn, true);
+			msaTmp.ToFile(fOut);
+			}
+
+#if MEMDEBUG
+		void FreeDPMemSPN();
+		FreeDPMemSPN();
+
+		_CrtMemState s2;
+		_CrtMemCheckpoint(&s2);
+
+		_CrtMemState s;
+		_CrtMemDifference(&s, &s1, &s2);
+
+		_CrtMemDumpStatistics(&s);
+		_CrtMemDumpAllObjectsSince(&s1);
+		exit(1);
+#endif
+//#if DEBUG
+//		AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp);
+//#endif
+		}
+	fprintf(stderr, "\n");
+
+//	AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment!
+	}
+/*
+ * DoRefineW: read an existing alignment from g_pstrInFileName, re-align it
+ * window by window with RefineW and write the result with MuscleOutput.
+ *
+ * Steps: register file names and start time, load the MSA (Quit if it has
+ * no sequences), assign sequence ids, select the alphabet from g_SeqType,
+ * then run RefineW into a fresh output alignment.
+ */
+void DoRefineW()
+ {
+ SetOutputFileName(g_pstrOutFileName);
+ SetInputFileName(g_pstrInFileName);
+ SetStartTime();
+
+ SetMaxIters(g_uMaxIters);
+ SetSeqWeightMethod(g_SeqWeight1);
+
+ TextFile fileIn(g_pstrInFileName);
+ MSA msa;
+ msa.FromFile(fileIn);
+
+ const unsigned uSeqCount = msa.GetSeqCount();
+ if (0 == uSeqCount)
+ Quit("No sequences in input file");
+
+ MSA::SetIdCount(uSeqCount);
+
+// Initialize sequence ids: each sequence's id is its index in the input MSA.
+// From this point on, ids must somehow propagate from here.
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ msa.SetSeqId(uSeqIndex, uSeqIndex);
+ SetMuscleInputMSA(msa);
+ // Pick the alphabet: guessed from the data for SEQTYPE_Auto, otherwise as requested.
+ ALPHA Alpha = ALPHA_Undefined;
+ switch (g_SeqType)
+ {
+ case SEQTYPE_Auto:
+ Alpha = msa.GuessAlpha();
+ break;
+
+ case SEQTYPE_Protein:
+ Alpha = ALPHA_Amino;
+ break;
+
+ case SEQTYPE_DNA:
+ Alpha = ALPHA_DNA;
+ break;
+
+ case SEQTYPE_RNA:
+ Alpha = ALPHA_RNA;
+ break;
+
+ default:
+ Quit("Invalid SeqType");
+ }
+ SetAlpha(Alpha);
+ msa.FixAlpha();
+
+ if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
+ SetPPScore(PPSCORE_SPN); // nucleotide input switches the profile-profile score to PPSCORE_SPN
+
+ MSA msaOut;
+ RefineW(msa, msaOut);
+
+// ValidateMuscleIds(msa);
+
+// TextFile fileOut(g_pstrOutFileName, true);
+// msaOut.ToFile(fileOut);
+ MuscleOutput(msaOut);
+ }
Added: trunk/packages/muscle/branches/upstream/current/savebest.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/savebest.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/savebest.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,66 @@
+#include "muscle.h"
+#include "msa.h"
+#include "textfile.h"
+#include <time.h>
+
+static MSA *ptrBestMSA;
+static const char *pstrOutputFileName;
+
+// Remember the file name SaveCurrentAlignment() will write to.
+// Only the pointer is stored; the string is not copied, so the caller must
+// keep it alive for as long as a save may happen.
+void SetOutputFileName(const char *out)
+ {
+ pstrOutputFileName = out;
+ }
+
+// Register msa as the alignment SaveCurrentAlignment() will write.
+// The address of msa is stored, not a copy, so msa must outlive any
+// subsequent save.
+void SetCurrentAlignment(MSA &msa)
+ {
+ ptrBestMSA = &msa;
+ }
+
+// Write the alignment registered with SetCurrentAlignment() to the file
+// registered with SetOutputFileName(), in FASTA format.
+// Exits with EXIT_FatalError when no alignment or no output file name has
+// been registered, or when the function is re-entered while a save is
+// still in progress.
+void SaveCurrentAlignment()
+	{
+	static bool bCalled = false;
+	if (bCalled)
+		{
+		fprintf(stderr,
+		  "\nRecursive call to SaveCurrentAlignment, giving up attempt to save.\n");
+		exit(EXIT_FatalError);
+		}
+	// Arm the re-entrancy guard. It was previously never set, so the
+	// check above could not fire.
+	bCalled = true;
+
+	if (0 == ptrBestMSA)
+		{
+		fprintf(stderr, "\nAlignment not completed, cannot save.\n");
+		Log("Alignment not completed, cannot save.\n");
+		exit(EXIT_FatalError);
+		}
+
+	if (0 == pstrOutputFileName)
+		{
+		fprintf(stderr, "\nOutput file name not specified, cannot save.\n");
+		exit(EXIT_FatalError);
+		}
+
+	fprintf(stderr, "\nSaving current alignment ...\n");
+
+	TextFile fileOut(pstrOutputFileName, true);
+	ptrBestMSA->ToFASTAFile(fileOut);
+
+	fprintf(stderr, "Current alignment saved to \"%s\".\n", pstrOutputFileName);
+	Log("Current alignment saved to \"%s\".\n", pstrOutputFileName);
+
+	// Save finished normally: later, non-nested calls remain allowed.
+	bCalled = false;
+	}
+
+// If a time limit (g_ulMaxSecs) is configured and more than that many
+// seconds have passed since GetStartTime(), log the fact, save the current
+// alignment and exit with EXIT_Success.  A limit of 0 means no limit.
+void CheckMaxTime()
+	{
+	if (0 == g_ulMaxSecs)
+		return;
+
+	time_t Now = time(0);
+	time_t ElapsedSecs = Now - GetStartTime();
+	if (ElapsedSecs <= (time_t) g_ulMaxSecs)
+		return;
+
+	// "%ul" is "%u" followed by a literal 'l'; time_t needs an explicit
+	// cast to match a well-defined conversion.
+	Log("Max time %s exceeded, elapsed seconds = %lu\n",
+	  MaxSecsToStr(), (unsigned long) ElapsedSecs);
+
+	SaveCurrentAlignment();
+	exit(EXIT_Success);
+	}
Added: trunk/packages/muscle/branches/upstream/current/scoregaps.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/scoregaps.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/scoregaps.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,201 @@
+#include "muscle.h"
+#include "msa.h"
+#include "objscore.h"
+
+#define TRACE 0
+
+// One gap (run of gap columns) in a sequence, kept on a singly linked list.
+struct GAPINFO
+ {
+ GAPINFO *Next; // next gap in the same list, or 0 at the end
+ unsigned Start; // first column of the gap
+ unsigned End; // last column of the gap (inclusive)
+ };
+
+static GAPINFO **g_Gaps;
+static GAPINFO *g_FreeList;
+static unsigned g_MaxSeqCount;
+static unsigned g_MaxColCount;
+static unsigned g_ColCount;
+static bool *g_ColDiff;
+
+// Pop one GAPINFO from the free list, refilling the list with a fresh
+// chunk of 256 nodes when it is empty.  The returned node's fields are
+// not initialised for the caller.
+static GAPINFO *NewGapInfo()
+	{
+	if (0 == g_FreeList)
+		{
+		const int NEWCOUNT = 256;
+		GAPINFO *Chunk = new GAPINFO[NEWCOUNT];
+		// Thread the chunk back to front so node i points at node i+1
+		// and the last node terminates the list.
+		GAPINFO *Following = 0;
+		for (int i = NEWCOUNT - 1; i >= 0; --i)
+			{
+			Chunk[i].Next = Following;
+			Following = &Chunk[i];
+			}
+		g_FreeList = Following;
+		}
+	GAPINFO *Head = g_FreeList;
+	g_FreeList = Head->Next;
+	return Head;
+	}
+
+// Return a node obtained from NewGapInfo() to the head of the free list.
+// No memory is released to the heap.
+static void FreeGapInfo(GAPINFO *GI)
+ {
+ GI->Next = g_FreeList;
+ g_FreeList = GI;
+ }
+
+// Record, on g_Gaps[SeqIndex], every gap of sequence SeqIndex that covers
+// at least one column flagged in g_ColDiff.  Gaps are prepended, so the
+// list ends up in right-to-left order.
+// TODO: This could be much faster, no need to look
+// at all columns.
+static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex)
+	{
+	const unsigned ColCount = msa.GetColCount();
+	unsigned Col = 0;
+	while (Col < ColCount)
+		{
+		if (!msa.IsGap(SeqIndex, Col))
+			{
+			++Col;
+			continue;
+			}
+
+		// Col is the first column of a gap; walk to its end.
+		const unsigned Start = Col;
+		bool Intersects = false;
+		do
+			{
+			if (g_ColDiff[Col])
+				Intersects = true;
+			++Col;
+			}
+		while (Col < ColCount && msa.IsGap(SeqIndex, Col));
+
+		if (Intersects)
+			{
+			GAPINFO *GI = NewGapInfo();
+			GI->Start = Start;
+			GI->End = Col - 1;
+			GI->Next = g_Gaps[SeqIndex];
+			g_Gaps[SeqIndex] = GI;
+			}
+
+		// Col is now either ColCount or a column already known not to
+		// be a gap, so it need not be examined again.
+		++Col;
+		}
+	}
+
+// Affine penalty for a gap of the given length: one open plus (Length-1)
+// extensions.  With DOUBLE_AFFINE the larger of the two affine functions
+// is returned.  A zero-length gap costs nothing.  Term is not used.
+static SCORE Penalty(unsigned Length, bool Term)
+	{
+	if (Length == 0)
+		return 0;
+	const SCORE First = g_scoreGapOpen + g_scoreGapExtend*(Length - 1);
+#if DOUBLE_AFFINE
+	const SCORE Second = g_scoreGapOpen2 + g_scoreGapExtend2*(Length - 1);
+	return (First > Second) ? First : Second;
+#else
+	return First;
+#endif
+	}
+
+//static SCORE ScorePair(unsigned Seq1, unsigned Seq2)
+// {
+//#if TRACE
+// {
+// Log("ScorePair(%d,%d)\n", Seq1, Seq2);
+// Log("Gaps seq 1: ");
+// for (GAPINFO *GI = g_Gaps[Seq1]; GI; GI = GI->Next)
+// Log(" %d-%d", GI->Start, GI->End);
+// Log("\n");
+// Log("Gaps seq 2: ");
+// for (GAPINFO *GI = g_Gaps[Seq2]; GI; GI = GI->Next)
+// Log(" %d-%d", GI->Start, GI->End);
+// Log("\n");
+// }
+//#endif
+// return 0;
+// }
+
+// Hand every GAPINFO node on g_Gaps[0..SeqCount-1] back to the free list
+// and reset those list heads.
+static void ReleaseGapLists(unsigned SeqCount)
+	{
+	for (unsigned Seq = 0; Seq < SeqCount; ++Seq)
+		{
+		GAPINFO *GI = g_Gaps[Seq];
+		while (0 != GI)
+			{
+			GAPINFO *Next = GI->Next;
+			FreeGapInfo(GI);
+			GI = Next;
+			}
+		g_Gaps[Seq] = 0;
+		}
+	}
+
+// Weighted sum over all sequence pairs of ScoreSeqPairGaps().
+//   msa          alignment to score
+//   DiffCols     indexes of columns that changed; each must be < column count
+//   DiffColCount number of entries in DiffCols
+// The per-sequence lists of gaps intersecting DiffCols are rebuilt on every
+// call (they feed the TRACE output) and are released before returning, so
+// the node pool no longer grows with each call.
+SCORE ScoreGaps(const MSA &msa, const unsigned DiffCols[], unsigned DiffColCount)
+	{
+#if TRACE
+	{
+	Log("ScoreGaps\n");
+	Log("DiffCols ");
+	for (unsigned i = 0; i < DiffColCount; ++i)
+		Log(" %u", DiffCols[i]);
+	Log("\n");
+	Log("msa=\n");
+	msa.LogMe();
+	Log("\n");
+	}
+#endif
+	const unsigned SeqCount = msa.GetSeqCount();
+	const unsigned ColCount = msa.GetColCount();
+	g_ColCount = ColCount;
+
+	// Grow the per-sequence list-head array with some slack.
+	if (SeqCount > g_MaxSeqCount)
+		{
+		delete[] g_Gaps;
+		g_MaxSeqCount = SeqCount + 256;
+		g_Gaps = new GAPINFO *[g_MaxSeqCount];
+		}
+	memset(g_Gaps, 0, SeqCount*sizeof(GAPINFO *));
+
+	// Grow the per-column flag array with some slack.
+	if (ColCount > g_MaxColCount)
+		{
+		delete[] g_ColDiff;
+		g_MaxColCount = ColCount + 256;
+		g_ColDiff = new bool[g_MaxColCount];
+		}
+
+	memset(g_ColDiff, 0, g_ColCount*sizeof(bool));
+	for (unsigned i = 0; i < DiffColCount; ++i)
+		{
+		unsigned Col = DiffCols[i];
+		assert(Col < ColCount);
+		g_ColDiff[Col] = true;
+		}
+
+	for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
+		FindIntersectingGaps(msa, SeqIndex);
+
+#if TRACE
+	{
+	Log("\n");
+	Log("Intersecting gaps:\n");
+	Log(" ");
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		Log("%c", g_ColDiff[Col] ? '*' : ' ');
+	Log("\n");
+	Log(" ");
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		Log("%d", Col%10);
+	Log("\n");
+	for (unsigned Seq = 0; Seq < SeqCount; ++Seq)
+		{
+		Log("%3d: ", Seq);
+		for (unsigned Col = 0; Col < ColCount; ++Col)
+			Log("%c", msa.GetChar(Seq, Col));
+		Log(" :: ");
+		for (GAPINFO *GI = g_Gaps[Seq]; GI; GI = GI->Next)
+			Log(" (%d,%d)", GI->Start, GI->End);
+		Log(" >%s\n", msa.GetSeqName(Seq));
+		}
+	Log("\n");
+	}
+#endif
+
+	SCORE Score = 0;
+	for (unsigned Seq1 = 0; Seq1 < SeqCount; ++Seq1)
+		{
+		const WEIGHT w1 = msa.GetSeqWeight(Seq1);
+		for (unsigned Seq2 = Seq1 + 1; Seq2 < SeqCount; ++Seq2)
+			{
+			const WEIGHT w2 = msa.GetSeqWeight(Seq2);
+//			const SCORE Pair = ScorePair(Seq1, Seq2);
+			const SCORE Pair = ScoreSeqPairGaps(msa, Seq1, msa, Seq2);
+			Score += w1*w2*Pair;
+#if TRACE
+			Log("Seq1=%u Seq2=%u ScorePair=%.4g w1=%.4g w2=%.4g Sum=%.4g\n",
+			  Seq1, Seq2, Pair, w1, w2, Score);
+#endif
+			}
+		}
+
+	// The gap lists are not needed past this point; recycle their nodes.
+	ReleaseGapLists(SeqCount);
+
+	return Score;
+	}
Added: trunk/packages/muscle/branches/upstream/current/scorehistory.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/scorehistory.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/scorehistory.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,101 @@
+#include "muscle.h"
+#include "scorehistory.h"
+#include <stdio.h>
+
+#define TRACE 0
+
+// Allocate a uIters x (2*uNodeCount) table of scores plus a parallel table
+// of "has been set" flags, with every flag cleared.
+ScoreHistory::ScoreHistory(unsigned uIters, unsigned uNodeCount)
+	{
+	m_uIters = uIters;
+	m_uNodeCount = uNodeCount;
+
+	// Two slots per node: one per value of bRight.
+	const unsigned uSlotCount = uNodeCount*2;
+
+	m_Score = new SCORE *[uIters];
+	m_bScoreSet = new bool *[uIters];
+	for (unsigned uIter = 0; uIter < uIters; ++uIter)
+		{
+		m_Score[uIter] = new SCORE[uSlotCount];
+		m_bScoreSet[uIter] = new bool[uSlotCount];
+		memset(m_bScoreSet[uIter], 0, uSlotCount*sizeof(bool));
+		}
+	}
+
+// Release every per-iteration row, then the two row-pointer arrays.
+ScoreHistory::~ScoreHistory()
+	{
+	unsigned uIter = 0;
+	while (uIter < m_uIters)
+		{
+		delete[] m_Score[uIter];
+		delete[] m_bScoreSet[uIter];
+		++uIter;
+		}
+	delete[] m_Score;
+	delete[] m_bScoreSet;
+	}
+
+// Record Score for (uIter, uNodeIndex, bRight).
+// Before recording, the score is compared with the values stored for the
+// same node/side in iterations 0 .. uIter-2.  If any of them is equal the
+// function reports progress as done and returns true without storing
+// anything; otherwise the score is stored and false is returned.
+// Quits if an index is out of range or an earlier iteration has no score.
+// NOTE(review): iteration uIter-1 is deliberately or accidentally excluded
+// from the comparison -- confirm which.
+bool ScoreHistory::SetScore(unsigned uIter, unsigned uNodeIndex, bool bRight, SCORE Score)
+	{
+#if TRACE
+	Log("ScoreHistory::SetScore(Iter=%u Node=%u Right=%d Score=%g)\n",
+	  uIter, uNodeIndex, bRight, Score);
+#endif
+	if (uIter >= m_uIters)
+		Quit("ScoreHistory::SetScore-1");
+	if (uNodeIndex >= m_uNodeCount)
+		Quit("ScoreHistory::SetScore-2");
+
+	const unsigned uSlot = 2*uNodeIndex + (bRight ? 1u : 0u);
+	for (unsigned uPrevIter = 0; uPrevIter + 1 < uIter; ++uPrevIter)
+		{
+		if (!m_bScoreSet[uPrevIter][uSlot])
+			{
+			LogMe();
+			Quit("ScoreHistory::SetScore-3");
+			}
+		if (Score == m_Score[uPrevIter][uSlot])
+			{
+			ProgressStepsDone();
+#if TRACE
+			Log("Oscillating\n");
+#endif
+			return true;
+			}
+		}
+	m_bScoreSet[uIter][uSlot] = true;
+	m_Score[uIter][uSlot] = Score;
+	return false;
+	}
+
+// Dump every recorded score to the log, one line per (iteration, node,
+// side).  Output stops at the first iteration that has no score set at all.
+void ScoreHistory::LogMe() const
+	{
+	Log("ScoreHistory\n");
+	Log("Iter Node Right Score\n");
+	Log("---- ---- ----- ---------\n");
+	const unsigned uSlotCount = m_uNodeCount*2;
+	for (unsigned uIter = 0; uIter < m_uIters; ++uIter)
+		{
+		const bool *bSet = m_bScoreSet[uIter];
+		const SCORE *Scores = m_Score[uIter];
+
+		// Find the first slot that has a score; none means we are done.
+		unsigned uFirst = 0;
+		while (uFirst < uSlotCount && !bSet[uFirst])
+			++uFirst;
+		if (uFirst == uSlotCount)
+			return;
+
+		for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+			{
+			const unsigned uLeft = 2*uNodeIndex;
+			const unsigned uRight = uLeft + 1;
+			if (bSet[uLeft])
+				Log("%4u %4u F %9.3f\n", uIter, uNodeIndex, Scores[uLeft]);
+			if (bSet[uRight])
+				Log("%4u %4u T %9.3f\n", uIter, uNodeIndex, Scores[uRight]);
+			}
+		}
+	}
+
+// Return the score recorded for (uIter, uNodeIndex, bRight).
+// Quits if either index is out of range or no score was recorded there.
+// bReverse is accepted for interface compatibility and is not used.
+SCORE ScoreHistory::GetScore(unsigned uIter, unsigned uNodeIndex,
+  bool bReverse, bool bRight) const
+	{
+	// Same range checks as SetScore; without them a bad index reads
+	// outside the tables.
+	if (uIter >= m_uIters)
+		Quit("ScoreHistory::GetScore-1");
+	if (uNodeIndex >= m_uNodeCount)
+		Quit("ScoreHistory::GetScore-2");
+
+	const unsigned uIndex = uNodeIndex*2 + bRight;
+	if (!m_bScoreSet[uIter][uIndex])
+		Quit("ScoreHistory::GetScore");
+	return m_Score[uIter][uIndex];
+	}
Added: trunk/packages/muscle/branches/upstream/current/scorehistory.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/scorehistory.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/scorehistory.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,21 @@
+#ifndef ScoreHistory_h
+#define ScoreHistory_h
+
+// Table of scores indexed by iteration, node and side (left/right), with a
+// parallel table recording which entries have been set.
+// NOTE(review): the class owns raw arrays but declares no copy constructor
+// or assignment operator; copying an instance would double-delete.
+class ScoreHistory
+ {
+public:
+ ScoreHistory(unsigned uIters, unsigned uInternalNodeCount);
+ ~ScoreHistory();
+ // Stores Score; returns true (without storing) when an equal score was
+ // already recorded for the same node/side in an earlier iteration.
+ bool SetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bRight, SCORE Score);
+ void LogMe() const;
+ SCORE GetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bReversed,
+ bool bRight) const;
+
+private:
+ SCORE **m_Score; // [iteration][2*node + right]
+ bool **m_bScoreSet; // same shape; true once the score was stored
+ unsigned m_uIters;
+ unsigned m_uNodeCount;
+ };
+
+#endif // ScoreHistory_h
Added: trunk/packages/muscle/branches/upstream/current/scorepp.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/scorepp.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/scorepp.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,104 @@
+#include "muscle.h"
+#include "profile.h"
+
+// Character for a profile position: '-' when no letter has a positive
+// count, otherwise the letter with the highest count (lowest index wins
+// ties), passed through UnalignChar() when more than one letter is present.
+char ConsensusChar(const ProfPos &PP)
+	{
+	unsigned uBestLetter = 0;
+	FCOUNT fcBest = PP.m_fcCounts[0];
+	unsigned uLettersPresent = 0;
+	for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
+		{
+		const FCOUNT fc = PP.m_fcCounts[uLetter];
+		if (fc > 0)
+			++uLettersPresent;
+		if (fc > fcBest)
+			{
+			fcBest = fc;
+			uBestLetter = uLetter;
+			}
+		}
+	if (0 == uLettersPresent)
+		return '-';
+	const char c = LetterToChar(uBestLetter);
+	if (uLettersPresent > 1)
+		return UnalignChar(c);
+	return c;
+	}
+
+// Log-based score of two profile positions: the count-weighted sum of
+// PPB's per-letter scores over PPA's letters, log-transformed, centred by
+// g_scoreCenter and scaled by both occupancies.  A zero sum yields -2.5.
+// Letters are visited in PPA.m_uSortOrder and the scan stops at the first
+// zero count (presumably the order is by decreasing count -- confirm).
+SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB)
+	{
+	SCORE Sum = 0;
+	unsigned uRank = 0;
+	while (uRank < 20)
+		{
+		const unsigned uLetter = PPA.m_uSortOrder[uRank];
+		const FCOUNT fc = PPA.m_fcCounts[uLetter];
+		if (0 == fc)
+			break;
+		Sum += fc*PPB.m_AAScores[uLetter];
+		++uRank;
+		}
+	if (0 == Sum)
+		return -2.5;
+	SCORE logSum = logf(Sum);
+	return (SCORE) ((logSum - g_scoreCenter)*(PPA.m_fOcc * PPB.m_fOcc));
+	}
+
+// Count-weighted sum of PPB's per-letter scores over PPA's letters (up to
+// 20, stopping at the first zero count in sort order), minus g_scoreCenter.
+SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB)
+	{
+	SCORE Sum = 0;
+	unsigned uRank = 0;
+	while (uRank < 20)
+		{
+		const unsigned uLetter = PPA.m_uSortOrder[uRank];
+		const FCOUNT fc = PPA.m_fcCounts[uLetter];
+		if (0 == fc)
+			break;
+		Sum += fc*PPB.m_AAScores[uLetter];
+		++uRank;
+		}
+	return Sum - g_scoreCenter;
+	}
+
+// Count-weighted sum of PPB's per-letter scores over PPA's letters (up to
+// 20, stopping at the first zero count in sort order), minus g_scoreCenter.
+SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB)
+	{
+	SCORE Sum = 0;
+	unsigned uRank = 0;
+	while (uRank < 20)
+		{
+		const unsigned uLetter = PPA.m_uSortOrder[uRank];
+		const FCOUNT fc = PPA.m_fcCounts[uLetter];
+		if (0 == fc)
+			break;
+		Sum += fc*PPB.m_AAScores[uLetter];
+		++uRank;
+		}
+	return Sum - g_scoreCenter;
+	}
+
+// Four-letter variant of the profile-position score: count-weighted sum of
+// PPB's per-letter scores over at most 4 of PPA's letters (stopping at the
+// first zero count in sort order), minus g_scoreCenter.
+SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB)
+	{
+	SCORE Sum = 0;
+	unsigned uRank = 0;
+	while (uRank < 4)
+		{
+		const unsigned uLetter = PPA.m_uSortOrder[uRank];
+		const FCOUNT fc = PPA.m_fcCounts[uLetter];
+		if (0 == fc)
+			break;
+		Sum += fc*PPB.m_AAScores[uLetter];
+		++uRank;
+		}
+	return Sum - g_scoreCenter;
+	}
+
+// Dispatch to the profile-position scoring function selected by g_PPScore.
+// Quits on an unrecognised value.
+// NOTE(review): PPSCORE_SP selects the ...NS function and PPSCORE_SV the
+// ...SP function; this mirrors the existing mapping -- confirm intended.
+SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB)
+	{
+	switch (g_PPScore)
+		{
+	case PPSCORE_SP:
+		return ScoreProfPos2NS(PPA, PPB);
+	case PPSCORE_LE:
+		return ScoreProfPos2LA(PPA, PPB);
+	case PPSCORE_SV:
+		return ScoreProfPos2SP(PPA, PPB);
+	case PPSCORE_SPN:
+		return ScoreProfPos2SPN(PPA, PPB);
+	default:
+		break;
+		}
+	Quit("Invalid g_PPScore");
+	return 0;
+	}
Added: trunk/packages/muscle/branches/upstream/current/seq.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/seq.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/seq.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,342 @@
+#include "muscle.h"
+#include "seq.h"
+#include "textfile.h"
+#include "msa.h"
+//#include <ctype.h>
+
+const size_t MAX_FASTA_LINE = 16000;
+
+// Replace the sequence name with a private copy of ptrName.
+// The previous name buffer is released first, so ptrName must not point
+// into it.
+void Seq::SetName(const char *ptrName)
+	{
+	delete[] m_ptrName;
+	const size_t uBytes = strlen(ptrName) + 1;	// include terminating NUL
+	m_ptrName = new char[uBytes];
+	memcpy(m_ptrName, ptrName, uBytes);
+	}
+
+// Write the sequence as one FASTA record: a ">name" line followed by the
+// residues wrapped at 60 characters per line.
+void Seq::ToFASTAFile(TextFile &File) const
+	{
+	File.PutFormat(">%s\n", m_ptrName);
+	const unsigned uLength = Length();
+	unsigned uOnThisLine = 0;
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		if (60 == uOnThisLine)
+			{
+			File.PutString("\n");
+			uOnThisLine = 0;
+			}
+		File.PutChar(at(uPos));
+		++uOnThisLine;
+		}
+	File.PutString("\n");
+	}
+
+// Read the next FASTA record from File into this sequence, replacing any
+// previous contents.  White space and gap characters are dropped, residues
+// are upper-cased, printable non-residue characters are replaced by the
+// wildcard (with a warning) and non-printable bytes are fatal.
+// Return true on end-of-file
+bool Seq::FromFASTAFile(TextFile &File)
+	{
+	Clear();
+
+	char szLine[MAX_FASTA_LINE];
+	bool bEof = File.GetLine(szLine, sizeof(szLine));
+	if (bEof)
+		return true;
+	if ('>' != szLine[0])
+		Quit("Expecting '>' in FASTA file %s line %u",
+		  File.GetFileName(), File.GetLineNr());
+
+	size_t n = strlen(szLine);
+	if (1 == n)
+		Quit("Missing annotation following '>' in FASTA file %s line %u",
+		  File.GetFileName(), File.GetLineNr());
+
+	// n bytes hold the n-1 characters after '>' plus the NUL.
+	m_ptrName = new char[n];
+	strcpy(m_ptrName, szLine + 1);
+
+	// Pos tracks the start of the line about to be read so that a '>'
+	// line belonging to the next record can be pushed back.
+	TEXTFILEPOS Pos = File.GetPos();
+	for (;;)
+		{
+		bEof = File.GetLine(szLine, sizeof(szLine));
+		if (bEof)
+			{
+			if (0 == size())
+				{
+				Quit("Empty sequence in FASTA file %s line %u",
+				  File.GetFileName(), File.GetLineNr());
+				return true;
+				}
+			return false;
+			}
+		if ('>' == szLine[0])
+			{
+			if (0 == size())
+				Quit("Empty sequence in FASTA file %s line %u",
+				  File.GetFileName(), File.GetLineNr());
+			// Rewind to beginning of this line, it's the start of the
+			// next sequence.
+			File.SetPos(Pos);
+			return false;
+			}
+		const char *ptrChar = szLine;
+		while (char c = *ptrChar++)
+			{
+			// <ctype.h> functions require a value representable as
+			// unsigned char; plain char may be negative for bytes
+			// >= 0x80, which are exactly the ones reported below.
+			const unsigned char uc = (unsigned char) c;
+			if (isspace(uc))
+				continue;
+			if (IsGapChar(c))
+				continue;
+			if (!IsResidueChar(c))
+				{
+				if (isprint(uc))
+					{
+					char w = GetWildcardChar();
+					Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'",
+					  c, File.GetFileName(), File.GetLineNr(), w);
+					c = w;
+					}
+				else
+					Quit("Invalid byte hex %02x in FASTA file %s line %d",
+					  (unsigned char) c, File.GetFileName(), File.GetLineNr());
+				}
+			c = (char) toupper((unsigned char) c);
+			push_back(c);
+			}
+		Pos = File.GetPos();
+		}
+	}
+
+// Fill msa with a single row holding this sequence's non-gap characters,
+// in order, and give that row this sequence's name.
+// NOTE(review): msa is sized 1x1 and then written at increasing columns;
+// this relies on MSA::SetChar growing the row -- confirm.
+void Seq::ExtractUngapped(MSA &msa) const
+	{
+	msa.Clear();
+	const unsigned uLength = Length();
+	msa.SetSize(1, 1);
+	unsigned uOut = 0;
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		const char c = at(uPos);
+		if (IsGapChar(c))
+			continue;
+		msa.SetChar(0, uOut, c);
+		++uOut;
+		}
+	msa.SetSeqName(0, m_ptrName);
+	}
+
+// Make this sequence a copy of rhs: characters, name and id.
+// rhs must have a name and an id (GetId() quits when the id is unset).
+void Seq::Copy(const Seq &rhs)
+	{
+	clear();
+	const unsigned uLength = rhs.Length();
+	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
+		push_back(rhs.at(uColIndex));
+
+	// Build the new name first, then release the old one, so the buffer
+	// this sequence already owned is no longer leaked.
+	const char *ptrName = rhs.GetName();
+	size_t n = strlen(ptrName) + 1;
+	char *ptrNewName = new char[n];
+	strcpy(ptrNewName, ptrName);
+	delete[] m_ptrName;
+	m_ptrName = ptrNewName;
+
+	SetId(rhs.GetId());
+	}
+
+// Make this sequence hold rhs's characters in reverse order, with rhs's
+// name.  The id is not copied.
+void Seq::CopyReversed(const Seq &rhs)
+	{
+	clear();
+	const unsigned uLength = rhs.Length();
+	// Only used inside the loop, so the wrap-around for an empty rhs is
+	// never observed.
+	const unsigned uBase = rhs.Length() - 1;
+	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
+		push_back(rhs.at(uBase - uColIndex));
+
+	// Build the new name first, then release the old one, so the buffer
+	// this sequence already owned is no longer leaked.
+	const char *ptrName = rhs.GetName();
+	size_t n = strlen(ptrName) + 1;
+	char *ptrNewName = new char[n];
+	strcpy(ptrNewName, ptrName);
+	delete[] m_ptrName;
+	m_ptrName = ptrNewName;
+	}
+
+// Remove every gap character from the sequence, preserving the order of
+// the remaining characters.
+void Seq::StripGaps()
+	{
+	for (CharVect::iterator p = begin(); p != end(); )
+		{
+		char c = *p;
+		if (IsGapChar(c))
+			// erase() invalidates p; continue from the iterator it
+			// returns (the element after the erased one).
+			p = erase(p);
+		else
+			++p;
+		}
+	}
+
+// Remove every gap and white-space character from the sequence,
+// preserving the order of the remaining characters.
+void Seq::StripGapsAndWhitespace()
+	{
+	for (CharVect::iterator p = begin(); p != end(); )
+		{
+		char c = *p;
+		if (isspace(c) || IsGapChar(c))
+			// erase() invalidates p; continue from the iterator it
+			// returns (the element after the erased one).
+			p = erase(p);
+		else
+			++p;
+		}
+	}
+
+// Convert every lower-case character of the sequence to upper case in
+// place; other characters are left untouched.
+void Seq::ToUpper()
+	{
+	const unsigned uLength = Length();
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		const char c = (*this)[uPos];
+		if (islower(c))
+			(*this)[uPos] = toupper(c);
+		}
+	}
+
+// Letter code (via CharToLetter) of the character at position uIndex.
+// uIndex must be less than Length().
+unsigned Seq::GetLetter(unsigned uIndex) const
+	{
+	assert(uIndex < Length());
+	const char chResidue = operator[](uIndex);
+	const unsigned uLetter = CharToLetter(chResidue);
+	return uLetter;
+	}
+
+// Position-by-position comparison ignoring letter case.  Where this
+// sequence has a gap the other must have a gap too (any gap character);
+// elsewhere the upper-cased characters must match.  Lengths must be equal.
+bool Seq::EqIgnoreCase(const Seq &s) const
+	{
+	const unsigned uLength = Length();
+	if (uLength != s.Length())
+		return false;
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		const char cMine = at(uPos);
+		const char cTheirs = s.at(uPos);
+		if (IsGapChar(cMine))
+			{
+			if (IsGapChar(cTheirs))
+				continue;
+			return false;
+			}
+		if (toupper(cMine) != toupper(cTheirs))
+			return false;
+		}
+	return true;
+	}
+
+// Exact comparison: same length and identical character at every position.
+// Names and ids are not compared.
+bool Seq::Eq(const Seq &s) const
+	{
+	const unsigned uLength = Length();
+	if (s.Length() != uLength)
+		return false;
+	unsigned uPos = 0;
+	while (uPos < uLength)
+		{
+		if (at(uPos) != s.at(uPos))
+			return false;
+		++uPos;
+		}
+	return true;
+	}
+
+// Compare the two sequences after conceptually removing all gap characters
+// and ignoring letter case.  Each sequence is walked with its own cursor;
+// on every round the next non-gap character of each is fetched (-1 once a
+// sequence is exhausted) and the two are compared.
+bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const
+ {
+ const unsigned uThisLength = Length();
+ const unsigned uOtherLength = s.Length();
+
+ unsigned uThisPos = 0;
+ unsigned uOtherPos = 0;
+
+ int cThis;
+ int cOther;
+ for (;;)
+ {
+ // Both cursors at the end with no mismatch so far: equal.
+ if (uThisPos == uThisLength && uOtherPos == uOtherLength)
+ break;
+
+ // Set cThis to next non-gap character in this string
+ // or -1 if end-of-string.
+ for (;;)
+ {
+ if (uThisPos == uThisLength)
+ {
+ cThis = -1;
+ break;
+ }
+ else
+ {
+ cThis = at(uThisPos);
+ ++uThisPos;
+ if (!IsGapChar(cThis))
+ {
+ cThis = toupper(cThis);
+ break;
+ }
+ }
+ }
+
+ // Set cOther to next non-gap character in s
+ // or -1 if end-of-string.
+ for (;;)
+ {
+ if (uOtherPos == uOtherLength)
+ {
+ cOther = -1;
+ break;
+ }
+ else
+ {
+ cOther = s.at(uOtherPos);
+ ++uOtherPos;
+ if (!IsGapChar(cOther))
+ {
+ cOther = toupper(cOther);
+ break;
+ }
+ }
+ }
+
+ // Compare characters at corresponding ungapped position
+ // (one side being -1 means it ran out of residues first).
+ if (cThis != cOther)
+ return false;
+ }
+ return true;
+ }
+
+// Number of characters in the sequence that are not gap characters.
+unsigned Seq::GetUngappedLength() const
+	{
+	unsigned uResidues = 0;
+	const unsigned uLength = Length();
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		if (IsGapChar((*this)[uPos]))
+			continue;
+		++uResidues;
+		}
+	return uResidues;
+	}
+
+// Write the sequence to the log as ">name" followed by all characters on
+// a single line.
+void Seq::LogMe() const
+	{
+	Log(">%s\n", m_ptrName);
+	unsigned uPos = 0;
+	const unsigned uLength = Length();
+	while (uPos < uLength)
+		{
+		Log("%c", at(uPos));
+		++uPos;
+		}
+	Log("\n");
+	}
+
+// Replace the contents with the characters of pstrSeq (copied verbatim, no
+// filtering) and the name with a copy of pstrName.  The id is not changed.
+void Seq::FromString(const char *pstrSeq, const char *pstrName)
+	{
+	clear();
+	const unsigned uLength = (unsigned) strlen(pstrSeq);
+	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
+		push_back(pstrSeq[uColIndex]);
+
+	// Build the new name first, then release the old one, so the buffer
+	// this sequence already owned is no longer leaked.
+	size_t n = strlen(pstrName) + 1;
+	char *ptrNewName = new char[n];
+	strcpy(ptrNewName, pstrName);
+	delete[] m_ptrName;
+	m_ptrName = ptrNewName;
+	}
+
+// True when at least one character of the sequence is a gap character.
+bool Seq::HasGap() const
+	{
+	const unsigned uLength = Length();
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		if (IsGapChar((*this)[uPos]))
+			return true;
+		}
+	return false;
+	}
+
+// Replace every character that is not a residue character with the
+// wildcard character, reporting each replacement through
+// InvalidLetterWarning().
+void Seq::FixAlpha()
+	{
+	const unsigned uLength = Length();
+	for (unsigned uPos = 0; uPos < uLength; ++uPos)
+		{
+		const char c = (*this)[uPos];
+		if (IsResidueChar(c))
+			continue;
+		const char w = GetWildcardChar();
+		// Warning("Invalid residue '%c', replaced by '%c'", c, w);
+		InvalidLetterWarning(c, w);
+		(*this)[uPos] = w;
+		}
+	}
Added: trunk/packages/muscle/branches/upstream/current/seq.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/seq.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/seq.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,85 @@
+#ifndef Seq_h
+#define Seq_h
+
+#include <vector>
+
+class TextFile;
+class MSA;
+
+typedef std::vector<char> CharVect;
+
+// A biological sequence: a vector of characters plus an owned name string
+// and a numeric id (uInsane while unset).
+class Seq : public CharVect
+	{
+public:
+	Seq()
+		{
+		m_ptrName = 0;
+		// Mark the id as "not set" so GetId() reports it instead of
+		// returning an indeterminate value.
+		m_uId = uInsane;
+		// Start with moderate size to avoid
+		// thrashing the heap.
+		reserve(200);
+		}
+	virtual ~Seq()
+		{
+		delete[] m_ptrName;
+		}
+
+private:
+// Not implemented; prevent use of copy c'tor and assignment.
+	Seq(const Seq &);
+	Seq &operator=(const Seq &);
+
+public:
+	// Drop characters and name, and mark the id as unset.
+	void Clear()
+		{
+		clear();
+		delete[] m_ptrName;
+		m_ptrName = 0;
+		m_uId = uInsane;
+		}
+	// Name string owned by this object, or 0 when no name is set.
+	const char *GetName() const
+		{
+		return m_ptrName;
+		}
+	// Id of the sequence; quits when no id has been set.
+	unsigned GetId() const
+		{
+		if (uInsane == m_uId)
+			Quit("Seq::GetId, id not set");
+		return m_uId;
+		}
+	void SetId(unsigned uId) { m_uId = uId; }
+
+	bool FromFASTAFile(TextFile &File);
+	void ToFASTAFile(TextFile &File) const;
+	void ExtractUngapped(MSA &msa) const;
+
+	void FromString(const char *pstrSeq, const char *pstrName);
+	void Copy(const Seq &rhs);
+	void CopyReversed(const Seq &rhs);
+	void StripGaps();
+	void StripGapsAndWhitespace();
+	void ToUpper();
+	void SetName(const char *ptrName);
+	unsigned GetLetter(unsigned uIndex) const;
+	unsigned Length() const { return (unsigned) size(); }
+	bool Eq(const Seq &s) const;
+	bool EqIgnoreCase(const Seq &s) const;
+	bool EqIgnoreCaseAndGaps(const Seq &s) const;
+	bool HasGap() const;
+	unsigned GetUngappedLength() const;
+	void LogMe() const;
+	char GetChar(unsigned uIndex) const { return operator[](uIndex); }
+	void SetChar(unsigned uIndex, char c) { operator[](uIndex) = c; }
+	void AppendChar(char c) { push_back(c); }
+	void FixAlpha();
+
+// Outside Windows builds, at() is redefined as unchecked indexing.
+#ifndef _WIN32
+	reference at(size_type i) { return operator[](i); }
+	const_reference at(size_type i) const { return operator[](i); }
+#endif
+
+private:
+	char *m_ptrName;	// owned; allocated with new[]
+	unsigned m_uId;		// uInsane while unset
+	};
+
+#endif // Seq.h
Added: trunk/packages/muscle/branches/upstream/current/seqvect.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/seqvect.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/seqvect.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,290 @@
+#include "muscle.h"
+#include "seqvect.h"
+#include "textfile.h"
+#include "msa.h"
+
+const size_t MAX_FASTA_LINE = 16000;
+
+// The vector owns the Seq objects it points to; delete them on destruction.
+SeqVect::~SeqVect()
+ {
+ Clear();
+ }
+
+// Delete every owned Seq and leave the vector empty.
+void SeqVect::Clear()
+	{
+	for (size_t n = 0; n < size(); ++n)
+		delete (*this)[n];
+	// Drop the now-dangling pointers; leaving them in place caused stale
+	// entries after FromFASTAFile() and a second delete in the destructor.
+	clear();
+	}
+
+// Write every sequence, in order, to File as FASTA records.
+void SeqVect::ToFASTAFile(TextFile &File) const
+	{
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->ToFASTAFile(File);
+		++uSeqIndex;
+		}
+	}
+
+// Replace the contents with all sequences read from File.
+// Records are fetched with GetFastaSeq() until it returns 0; each record's
+// data and label buffers are copied into a new Seq and then released.
+void SeqVect::FromFASTAFile(TextFile &File)
+	{
+	Clear();
+
+	FILE *f = File.GetStdioFile();
+	for (;;)
+		{
+		char *Label;
+		unsigned uLength;
+		char *SeqData = GetFastaSeq(f, &uLength, &Label);
+		if (0 == SeqData)
+			break;
+
+		Seq *ptrSeq = new Seq;
+		for (const char *p = SeqData; p != SeqData + uLength; ++p)
+			ptrSeq->push_back(*p);
+		ptrSeq->SetName(Label);
+		push_back(ptrSeq);
+
+		delete[] SeqData;
+		delete[] Label;
+		}
+	}
+
+// Build msa from the sequences, one row each, padding shorter sequences on
+// the right with '.' up to the length of the longest.  An empty vector
+// just clears msa.
+void SeqVect::PadToMSA(MSA &msa)
+	{
+	const unsigned uSeqCount = Length();
+	if (0 == uSeqCount)
+		{
+		msa.Clear();
+		return;
+		}
+
+	// Width of the alignment = longest sequence.
+	unsigned uWidth = 0;
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const unsigned uLength = at(uSeqIndex)->Length();
+		if (uLength > uWidth)
+			uWidth = uLength;
+		}
+
+	msa.SetSize(uSeqCount, uWidth);
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		msa.SetSeqName(uSeqIndex, ptrSeq->GetName());
+		const unsigned uLength = ptrSeq->Length();
+		for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
+			msa.SetChar(uSeqIndex, uColIndex, ptrSeq->at(uColIndex));
+		for (unsigned uColIndex = uLength; uColIndex < uWidth; ++uColIndex)
+			msa.SetChar(uSeqIndex, uColIndex, '.');
+		}
+	}
+
+// Replace the contents with deep copies of every sequence in rhs.
+// rhs must not be this object.
+void SeqVect::Copy(const SeqVect &rhs)
+	{
+	// Delete the sequences currently owned (plain clear() leaked them),
+	// then make sure no stale pointers remain in the vector.
+	Clear();
+	clear();
+	unsigned uSeqCount = rhs.Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = rhs.at(uSeqIndex);
+		Seq *ptrSeqCopy = new Seq;
+		ptrSeqCopy->Copy(*ptrSeq);
+		push_back(ptrSeqCopy);
+		}
+	}
+
+// Apply Seq::StripGaps() to every sequence.
+void SeqVect::StripGaps()
+	{
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->StripGaps();
+		++uSeqIndex;
+		}
+	}
+
+// Apply Seq::StripGapsAndWhitespace() to every sequence.
+void SeqVect::StripGapsAndWhitespace()
+	{
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->StripGapsAndWhitespace();
+		++uSeqIndex;
+		}
+	}
+
+// Apply Seq::ToUpper() to every sequence.
+void SeqVect::ToUpper()
+	{
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->ToUpper();
+		++uSeqIndex;
+		}
+	}
+
+// Case-insensitive search for a sequence named ptrName.
+// On success stores the index of the first match in *ptruIndex and returns
+// true; otherwise returns false and leaves *ptruIndex untouched.
+bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const
+	{
+	const unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const char *ptrCandidate = at(uSeqIndex)->GetName();
+		if (stricmp(ptrCandidate, ptrName) != 0)
+			continue;
+		*ptruIndex = uSeqIndex;
+		return true;
+		}
+	return false;
+	}
+
+// Append a deep copy of s; the vector owns the copy.
+// Seq::Copy() is used, so s must have a name and an id set.
+void SeqVect::AppendSeq(const Seq &s)
+ {
+ Seq *ptrSeqCopy = new Seq;
+ ptrSeqCopy->Copy(s);
+ push_back(ptrSeqCopy);
+ }
+
+// Write every sequence to the log via Seq::LogMe().
+void SeqVect::LogMe() const
+	{
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->LogMe();
+		++uSeqIndex;
+		}
+	}
+
+// Name of the sequence at uSeqIndex (must be a valid index).
+const char *SeqVect::GetSeqName(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	return at(uSeqIndex)->GetName();
+	}
+
+// Id of the sequence at uSeqIndex (must be a valid index).
+unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	return at(uSeqIndex)->GetId();
+	}
+
+// Id of the first sequence whose name equals Name exactly (case-sensitive).
+// Quits when no sequence has that name.
+unsigned SeqVect::GetSeqIdFromName(const char *Name) const
+	{
+	const unsigned uSeqCount = GetSeqCount();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		if (0 == strcmp(Name, GetSeqName(uSeqIndex)))
+			return GetSeqId(uSeqIndex);
+		++uSeqIndex;
+		}
+	Quit("SeqVect::GetSeqIdFromName(%s): not found", Name);
+	return 0;
+	}
+
+// Reference to the first sequence whose id equals uId.
+// Quits when no sequence has that id.
+Seq &SeqVect::GetSeqById(unsigned uId)
+	{
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned i = 0; i < uSeqCount; ++i)
+		{
+		if (GetSeqId(i) == uId)
+			return GetSeq(i);
+		}
+	// Message now names this function and prints the unsigned id with
+	// the matching conversion.
+	Quit("SeqVect::GetSeqById(%u): not found", uId);
+	// Not reached when Quit() terminates; present only to satisfy the
+	// return type.
+	return (Seq &) *((Seq *) 0);
+	}
+
+// Length of the sequence at uSeqIndex (must be a valid index).
+unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	return at(uSeqIndex)->Length();
+	}
+
+// Mutable reference to the sequence at uSeqIndex (must be a valid index).
+Seq &SeqVect::GetSeq(unsigned uSeqIndex)
+ {
+ assert(uSeqIndex < size());
+ return *at(uSeqIndex);
+ }
+
+// Read-only reference to the sequence at uSeqIndex (must be a valid index).
+const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const
+ {
+ assert(uSeqIndex < size());
+ return *at(uSeqIndex);
+ }
+
+// Assign uId to the sequence at uSeqIndex (must be a valid index).
+void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId)
+	{
+	assert(uSeqIndex < size());
+	at(uSeqIndex)->SetId(uId);
+	}
+
+// Guess the alphabet from the first CHAR_COUNT non-gap characters, taken
+// sequence by sequence.  If at least MIN_NUCLEO_PCT percent of them are
+// DNA letters the answer is ALPHA_DNA, else if that share are RNA letters
+// it is ALPHA_RNA, otherwise (including when there is nothing to sample)
+// ALPHA_Amino.
+ALPHA SeqVect::GuessAlpha() const
+	{
+	const unsigned CHAR_COUNT = 100;
+	const unsigned MIN_NUCLEO_PCT = 95;
+
+	unsigned uDNACount = 0;
+	unsigned uRNACount = 0;
+	unsigned uTotal = 0;
+
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned uSeqIndex = 0;
+	  uSeqIndex < uSeqCount && uTotal < CHAR_COUNT; ++uSeqIndex)
+		{
+		const Seq &s = GetSeq(uSeqIndex);
+		const unsigned uSeqLength = s.Length();
+		for (unsigned uPos = 0;
+		  uPos < uSeqLength && uTotal < CHAR_COUNT; ++uPos)
+			{
+			const char c = s.at(uPos);
+			if (IsGapChar(c))
+				continue;
+			if (IsDNA(c))
+				++uDNACount;
+			if (IsRNA(c))
+				++uRNACount;
+			++uTotal;
+			}
+		}
+
+	if (0 == uTotal)
+		return ALPHA_Amino;
+	if ((uDNACount*100)/uTotal >= MIN_NUCLEO_PCT)
+		return ALPHA_DNA;
+	if ((uRNACount*100)/uTotal >= MIN_NUCLEO_PCT)
+		return ALPHA_RNA;
+	return ALPHA_Amino;
+	}
+
+// Run Seq::FixAlpha() on every sequence, bracketed by
+// ClearInvalidLetterWarning() before and ReportInvalidLetters() after.
+void SeqVect::FixAlpha()
+	{
+	ClearInvalidLetterWarning();
+	const unsigned uSeqCount = Length();
+	unsigned uSeqIndex = 0;
+	while (uSeqIndex < uSeqCount)
+		{
+		at(uSeqIndex)->FixAlpha();
+		++uSeqIndex;
+		}
+	ReportInvalidLetters();
+	}
Added: trunk/packages/muscle/branches/upstream/current/seqvect.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/seqvect.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/seqvect.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,63 @@
+#ifndef SeqVect_h
+#define SeqVect_h
+
+#include <vector>
+#include "seq.h"
+
+typedef std::vector<Seq *> SeqVectBase;
+
+// A vector of heap-allocated Seq objects.  The destructor deletes the
+// pointed-to sequences, so the vector owns them; copying is disabled.
+class SeqVect : public SeqVectBase
+ {
+public:
+ SeqVect() {}
+ virtual ~SeqVect();
+
+private:
+// Not implemented; prevent use of copy c'tor and assignment.
+ SeqVect(const SeqVect &);
+ SeqVect &operator=(const SeqVect &);
+
+public:
+ // File I/O; FromFile/ToFile are aliases for the FASTA variants.
+ void FromFile(TextFile &File)
+ {
+ FromFASTAFile(File);
+ }
+
+ void FromFASTAFile(TextFile &File);
+ void ToFASTAFile(TextFile &File) const;
+
+ void ToFile(TextFile &File) const
+ {
+ ToFASTAFile(File);
+ }
+
+ void PadToMSA(MSA &msa);
+ void Copy(const SeqVect &rhs);
+ void StripGaps();
+ void StripGapsAndWhitespace();
+ void ToUpper();
+ void Clear();
+ // Length() and GetSeqCount() both return the number of sequences.
+ unsigned Length() const { return (unsigned) size(); }
+ unsigned GetSeqCount() const { return (unsigned) size(); }
+ void AppendSeq(const Seq &s);
+ bool FindName(const char *ptrName, unsigned *ptruIndex) const;
+ void LogMe() const;
+ const char *GetSeqName(unsigned uSeqIndex) const;
+ unsigned GetSeqId(unsigned uSeqIndex) const;
+ unsigned GetSeqIdFromName(const char *Name) const;
+ unsigned GetSeqLength(unsigned uSeqIndex) const;
+ void SetSeqId(unsigned uSeqIndex, unsigned uId);
+ Seq &GetSeq(unsigned uIndex);
+ Seq &GetSeqById(unsigned uId);
+ const Seq &GetSeq(unsigned uIndex) const;
+
+ ALPHA GuessAlpha() const;
+ void FixAlpha();
+
+// Outside Windows builds, at() is redefined as unchecked indexing.
+#ifndef _WIN32
+ reference at(size_type i) { return operator[](i); }
+ const_reference at(size_type i) const { return operator[](i); }
+#endif
+ };
+
+#endif // SeqVect_h
Added: trunk/packages/muscle/branches/upstream/current/setblosumweights.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/setblosumweights.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/setblosumweights.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,131 @@
+/***
+Code for implementing HMMer's "BLOSUM weighting" algorithm.
+
+The algorithm was deduced by reverse-engineering the HMMer code.
+
+The HMMer documentation refers to BLOSUM weighting as "Henikoff
+simple filter weighting"
+
+The name BLOSUM implied to me that HMMer would be using a
+substitution probability matrix to compute distances, but this
+turned out not to be the case.
+
+It is notable, not to say puzzling, that the HMMer BLOSUM weighting
+algorithm is guaranteed to produce an integral NIC (number-of-independent-
+counts, also known as effective sequence count). Presumably Eddy must
+have known this, though he doesn't comment on it and he computes & stores
+the value in a float.
+
+Here's the algorithm:
+
+Distances between two sequences are based on the average of a simple
+binary equal (one) / not equal (zero) at each position. The only thing
+that has anything to do with BLOSUM in this calculation is an obscure
+(to me) constant value of 0.62. The sequences are clustered using this
+distance. If the pairwise identity (fraction of identical positions)
+is less than 0.62, they get assigned to disjoint clusters, the final
+number of disjoint clusters is the NIC. This makes some intuitive sense:
+I would interpret this by saying that if a set of sequences are close
+enough they count as one sequence. The weight for each sequence within a
+disjoint cluster is then determined to be 1 / (clustersize), from which it
+follows that the sum of all weights is equal to the number of disjoint
+clusters and is thus guaranteed to be an integer value. It is exactly this
+sum that HMMer uses for the NIC, by default.
+
+The individual BLOSUM sequence weights are not used for anything else in
+HMMer, unless you specify that BLOSUM weighting should override the default
+GSC weighting. GSC weighting uses a different clustering algorithm to
+determine relative weights. The BLOSUM NIC is then distributed over the
+GSC tree according to those relative weights.
+***/
+
+#include "muscle.h"
+#include "msa.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Give every leaf (sequence) below ptrNode the same weight.
+// A null node is accepted and ignored, so a missing child can be
+// passed straight through by the recursion.
+void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const
+    {
+    if (0 == ptrNode)
+        return;
+
+    const ClusterNode *ptrRightChild = ptrNode->GetRight();
+    const ClusterNode *ptrLeftChild = ptrNode->GetLeft();
+    const bool bIsLeaf = (0 == ptrRightChild && 0 == ptrLeftChild);
+
+    if (!bIsLeaf)
+        {
+    // Internal node: push the weight down the left branch, then the right.
+        SetBLOSUMSubtreeWeight(ptrLeftChild, dWeight);
+        SetBLOSUMSubtreeWeight(ptrRightChild, dWeight);
+        return;
+        }
+
+// Leaf: the node index selects the slot in the weight table.
+    const unsigned uSeq = ptrNode->GetIndex();
+    const WEIGHT wLeaf = DoubleToWeight(dWeight);
+    m_Weights[uSeq] = wLeaf;
+    }
+
+// Traverse a subtree looking for clusters where all
+// the leaves are sufficiently similar that they
+// should be weighted as a group, i.e. given a weight
+// of 1/N where N is the cluster size. The idea is
+// to avoid sample bias where we have closely related
+// sequences in the input alignment.
+// The weight at a node is the distance between
+// the two closest sequences in the left and right
+// subtrees under that node. "Sufficiently similar"
+// is defined as being where that minimum distance
+// is less than the dMinDist threshold. I don't know
+// why the clustering is done using a minimum rather
+// than a maximum or average, either of which would
+// seem more natural to me.
+// Return value is number of groups under this node.
+// A "group" is the cluster found under a node with a
+// weight less than the minimum.
+unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const
+    {
+// An absent subtree contains no groups.
+    if (0 == ptrNode)
+        return 0;
+
+    const bool bOneGroup = ptrNode->GetWeight() < dMinDist;
+    if (!bOneGroup)
+        {
+    // Node weight is at or above the threshold: the groups are whatever
+    // the two branches contain. Left is processed before right.
+        unsigned uGroups = SetBLOSUMNodeWeight(ptrNode->GetLeft(), dMinDist);
+        uGroups += SetBLOSUMNodeWeight(ptrNode->GetRight(), dMinDist);
+        return uGroups;
+        }
+
+// The whole subtree is one group: each member gets 1/size.
+    const unsigned uMembers = ptrNode->GetClusterSize();
+    assert(uMembers > 0);
+    const double dShare = 1.0 / uMembers;
+    SetBLOSUMSubtreeWeight(ptrNode, dShare);
+    return 1;
+    }
+
+// Cluster the sequences on pairwise identity and assign BLOSUM weights.
+// BlosumCluster receives the cluster tree. The return value is the
+// number of groups found, i.e. HMMer's "effective sequence count".
+unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const
+    {
+    DistFunc Dists;
+    const unsigned uCount = GetSeqCount();
+    Dists.SetCount(uCount);
+
+// Only pairs with uRow < uCol are filled in. The distance is one minus
+// the identity, which is asserted to lie in [0, 1].
+    for (unsigned uRow = 0; uRow < uCount; ++uRow)
+        {
+        for (unsigned uCol = uRow + 1; uCol < uCount; ++uCol)
+            {
+            const double dIdentity = GetPctIdentityPair(uRow, uCol);
+            assert(dIdentity >= 0.0 && dIdentity <= 1.0);
+            Dists.SetDist(uRow, uCol, (float) (1.0 - dIdentity));
+            }
+        }
+
+    BlosumCluster.Create(Dists);
+
+// Walking the tree from the root sets the weights as a side effect.
+    return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST);
+    }
Added: trunk/packages/muscle/branches/upstream/current/setgscweights.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/setgscweights.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/setgscweights.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,195 @@
+/***
+Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.
+The algorithm was deduced by reverse-engineering the
+HMMer code.
+
+I used an alternative representation that I prefer over
+HMMer's. The HMMer code is full of tree manipulations
+that do something to the left child and then the equivalent
+thing to the right child. It was clear that there must be
+a re-formulation that does everything once for each node,
+which would reduce the number of operations expressed
+in the code by a factor of two. This gives a more elegant
+and less error-prone way to code it.
+
+These notes explain the correspondence between my design
+and Eddy's.
+
+HMMer stores a data structure phylo_s for each non-leaf
+node in the cluster tree. This structure contains the
+following fields:
+
+ diff Weight of the node
+ lblen Left branch length
+ rblen Right branch length
+
+The lblen and rblen branch lengths are calculated as:
+
+ this.lblen = this.diff - left.diff
+ this.rblen = this.diff - right.diff
+
+My code stores one ClusterNode data structure per node
+in the cluster tree, including leaves. I store only the
+weight. I can recover the HMMer branch length fields
+in a trivial O(1) calculation as follows:
+
+ lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()
+ rblen = Node.GetWeight() - Node.GetRight()->GetWeight()
+
+For the GSC weights calculation, HMMer constructs the
+following vectors, which have entries for all nodes,
+including leaves:
+
+ lwt Left weight
+ rwt Right weight
+
+The "left weight" is calculated as the sum of the weights in
+all the nodes reachable through the left branch, including
+the node itself. (This is not immediately obvious from the
+code, which does the calculation using branch lengths rather
+than weights, but this is an equivalent, and to my mind clearer,
+statement of what they are). Similarly, the "right weight" is
+the sum of all weights reachable via the right branch. I define
+the "cluster weight" to be the summed weight of all nodes in the
+subtree under the node, including the node itself. I provide
+a function Node.GetClusterWeight() which calculates the cluster
+weight using a O(ln N) recursion through the tree. The lwt and
+rwt values can be recovered as follows:
+
+ lwt = Node.GetLeft()->GetClusterWeight()
+ + Node.GetWeight()
+
+ rwt = Node.GetRight()->GetClusterWeight()
+ + Node.GetWeight()
+
+HMMer calculates a further vector fwt as follows.
+
+ this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)
+
+This applies to nodes reached via a left branch, for nodes reached
+via a right branch:
+
+ this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)
+
+The values of fwt at the leaf nodes are the final GSC weights.
+We derive the various terms using our equivalents.
+
+ parent.lwt = Parent.GetLeft()->GetClusterWeight()
+ + Parent.GetWeight()
+
+ parent.rwt = Parent.GetRight()->GetClusterWeight()
+ + Parent.GetWeight()
+
+ parent.lwt + parent.rwt =
+ { Parent.GetLeft()->GetClusterWeight()
+ + Parent.GetRight()->GetClusterWeight()
+ + Parent.GetWeight() }
+ + Parent.GetWeight()
+
+We recognize the term {...} as the cluster weight of the
+parent, so
+
+ parent.lwt + parent.rwt
+ = Parent.GetClusterWeight()
+ + Parent.GetWeight()
+
+As you would expect, repeating this exercise for parent.rwt gives
+exactly the same expression.
+
+The GSC weights (fwt) are stored in the Weight2 field of the cluster
+tree, the Weight field stores the original (BLOSUM) weights used
+as input to this algorithm.
+***/
+
+#include "muscle.h"
+#include "msa.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Copy the Weight2 value held at each leaf of the subtree into the
+// MSA's per-sequence weight table. A null node is ignored.
+void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const
+    {
+    if (0 == ptrNode)
+        return;
+
+    const ClusterNode *ptrR = ptrNode->GetRight();
+    const ClusterNode *ptrL = ptrNode->GetLeft();
+
+    if (0 != ptrR || 0 != ptrL)
+        {
+    // Internal node: visit the left branch, then the right.
+        SetSubtreeWeight2(ptrL);
+        SetSubtreeWeight2(ptrR);
+        return;
+        }
+
+// Leaf: store its Weight2 under the leaf's index.
+    const unsigned uSeq = ptrNode->GetIndex();
+    const double dLeafWeight2 = ptrNode->GetWeight2();
+    const WEIGHT wLeaf = DoubleToWeight(dLeafWeight2);
+    m_Weights[uSeq] = wLeaf;
+    }
+
+// Compute Weight2 for ptrNode from its parent's Weight2, then do the
+// same for everything below it. ptrNode must have a parent: the parent
+// pointer is dereferenced without a null check, so this must not be
+// called on the root.
+void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const
+    {
+    if (0 == ptrNode)
+        return;
+
+    ClusterNode *ptrUp = ptrNode->GetParent();
+    const double dUpWeight2 = ptrUp->GetWeight2();
+    const double dUpClusterWeight = ptrUp->GetClusterWeight();
+
+    double dShare;
+    if (0.0 != dUpClusterWeight)
+        {
+    // Cluster weights are recomputed here rather than cached, so each
+    // node's cluster weight is calculated twice overall. Weighting is
+    // cheap enough that this is not worth optimizing.
+        const double dMyClusterWeight = ptrNode->GetClusterWeight();
+        const double dUpWeight = ptrUp->GetWeight();
+
+        const double dNum = dMyClusterWeight + dUpWeight;
+        const double dDenom = dUpClusterWeight + dUpWeight;
+        dShare = dUpWeight2*(dNum/dDenom);
+        }
+    else
+        {
+    // Parent's cluster weight is zero: divide the parent's weight in
+    // proportion to cluster sizes instead.
+        const double dMySize = ptrNode->GetClusterSize();
+        const double dUpSize = ptrUp->GetClusterSize();
+        dShare = dUpWeight2*dMySize/dUpSize;
+        }
+    ptrNode->SetWeight2(dShare);
+
+    SetSubtreeGSCWeight(ptrNode->GetLeft());
+    SetSubtreeGSCWeight(ptrNode->GetRight());
+    }
+
+// Compute GSC sequence weights: build the BLOSUM cluster tree, derive
+// Weight2 for every node starting from 1.0 at the root, then copy the
+// leaf values into the MSA's weight table.
+void MSA::SetGSCWeights() const
+    {
+// Only the tree is needed here; the returned group count is discarded.
+    ClusterTree Tree;
+    CalcBLOSUMWeights(Tree);
+
+    ClusterNode *ptrTop = Tree.GetRoot();
+    ptrTop->SetWeight2(1.0);
+
+// The root has no parent, so start the recursion at its children.
+    SetSubtreeGSCWeight(ptrTop->GetLeft());
+    SetSubtreeGSCWeight(ptrTop->GetRight());
+
+    SetSubtreeWeight2(ptrTop);
+    }
+
+// Write each sequence's weight and name to the log, followed by the
+// sum of all weights.
+void MSA::ListWeights() const
+    {
+    const unsigned uCount = GetSeqCount();
+    Log("Weights:\n");
+
+    WEIGHT wSum = 0;
+    unsigned uSeq = 0;
+    while (uSeq < uCount)
+        {
+        wSum += GetSeqWeight(uSeq);
+        Log("%6.3f %s\n", GetSeqWeight(uSeq), GetSeqName(uSeq));
+        ++uSeq;
+        }
+
+    Log("Total weights = %6.3f, should be 1.0\n", wSum);
+    }
Added: trunk/packages/muscle/branches/upstream/current/setnewhandler.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/setnewhandler.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/setnewhandler.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,22 @@
+#include "muscle.h"
+#include <stdio.h>
+#include <new>
+
+const int ONE_MB = 1024*1024;
+const size_t RESERVE_BYTES = 8*ONE_MB; // Size of the block that SetNewHandler sets aside.
+static void *EmergencyReserve = 0; // Allocated in SetNewHandler, released in OnOutOfMemory.
+
+// New-handler: runs when operator new cannot allocate. Reports the
+// failure, saves the current alignment and terminates the program.
+void OnOutOfMemory()
+    {
+// Release the reserve first so the reporting and saving below have
+// some memory to work with.
+    free(EmergencyReserve);
+
+    fputs("\n*** OUT OF MEMORY ***\n", stderr);
+    const double dUsedMB = GetMemUseMB();
+    fprintf(stderr, "Memory allocated so far %g MB\n", dUsedMB);
+
+    SaveCurrentAlignment();
+    exit(EXIT_FatalError);
+    }
+
+// Set aside RESERVE_BYTES of memory and register OnOutOfMemory as the
+// C++ new-handler. The malloc result is not checked; a null reserve is
+// harmless because free(0) does nothing.
+void SetNewHandler()
+    {
+    void *ptrReserve = malloc(RESERVE_BYTES);
+    EmergencyReserve = ptrReserve;
+    std::set_new_handler(OnOutOfMemory);
+    }
Added: trunk/packages/muscle/branches/upstream/current/spfast.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/spfast.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/spfast.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,269 @@
+#include "muscle.h"
+#include "profile.h"
+
+#define TRACE 0
+
+enum // Gap state of one sequence across (previous column, this column): L = letter, G = gap. Value is (this is gap) + 2*(previous is gap).
+ {
+ LL = 0,
+ LG = 1,
+ GL = 2,
+ GG = 3,
+ };
+
+// Printable name of a gap type (LL, LG, GL or GG), for trace output.
+// The result points at a string literal and must not be modified,
+// hence the const return type. Any other value is reported via Quit.
+static const char *GapTypeToStr(int GapType)
+    {
+    switch (GapType)
+        {
+    case LL: return "LL";
+    case LG: return "LG";
+    case GL: return "GL";
+    case GG: return "GG";
+    default: break;
+        }
+    Quit("Invalid gap type");
+// Fallback value for the compiler in case Quit returns.
+    return "?";
+    }
+
+static SCORE GapScoreMatrix[4][4];
+
+// Fill GapScoreMatrix, the score for pairing two sequences whose gap
+// types are the row and column index. Most entries are zero; the
+// non-zero ones are derived from g_scoreGapOpen. Quits if the result
+// is not symmetric.
+static void InitGapScoreMatrix()
+    {
+    const SCORE t = (SCORE) 0.2;
+
+// Start from all zeros, then set the non-zero entries.
+    for (int r = 0; r < 4; ++r)
+        for (int c = 0; c < 4; ++c)
+            GapScoreMatrix[r][c] = 0;
+
+// Full gap-open penalty.
+    GapScoreMatrix[LL][LG] = g_scoreGapOpen;
+    GapScoreMatrix[LG][LL] = g_scoreGapOpen;
+    GapScoreMatrix[LG][GL] = g_scoreGapOpen;
+    GapScoreMatrix[GL][LG] = g_scoreGapOpen;
+
+// Reduced penalty for LG against GG.
+    GapScoreMatrix[LG][GG] = t*g_scoreGapOpen; // approximation!
+    GapScoreMatrix[GG][LG] = t*g_scoreGapOpen; // approximation!
+
+// Sanity check on the table above.
+    for (int i = 0; i < 4; ++i)
+        for (int j = 0; j < i; ++j)
+            if (GapScoreMatrix[i][j] != GapScoreMatrix[j][i])
+                Quit("GapScoreMatrix not symmetrical");
+    }
+
+// Letter score of one column computed the direct way: every pair of
+// sequences is visited and w1*w2*matrix[letter1][letter2] is summed.
+// Sequences whose letter code in this column is >= 20 are left out.
+// Used as a cross-check for ObjScoreSPCol when TRACE is enabled.
+static SCORE SPColBrute(const MSA &msa, unsigned uColIndex)
+    {
+    SCORE Total = 0;
+    const unsigned uRows = msa.GetSeqCount();
+    for (unsigned uRowA = 0; uRowA < uRows; ++uRowA)
+        {
+        const WEIGHT wA = msa.GetSeqWeight(uRowA);
+        const unsigned uLetterA = msa.GetLetterEx(uRowA, uColIndex);
+        if (uLetterA >= 20)
+            continue;
+
+    // Pair with every earlier row so each pair is counted once.
+        for (unsigned uRowB = 0; uRowB < uRowA; ++uRowB)
+            {
+            const WEIGHT wB = msa.GetSeqWeight(uRowB);
+            const unsigned uLetterB = msa.GetLetterEx(uRowB, uColIndex);
+            if (uLetterB < 20)
+                {
+                const SCORE Term = wA*wB*(*g_ptrScoreMatrix)[uLetterA][uLetterB];
+#if TRACE
+                Log("Check %c %c w1=%.3g w2=%.3g Mx=%.3g t=%.3g\n",
+                  LetterToCharAmino(uLetterA),
+                  LetterToCharAmino(uLetterB),
+                  wA,
+                  wB,
+                  (*g_ptrScoreMatrix)[uLetterA][uLetterB],
+                  Term);
+#endif
+                Total += Term;
+                }
+            }
+        }
+    return Total;
+    }
+
+static SCORE SPGapFreqs(const FCOUNT Freqs[]) // Returns the sum over all (i, j) gap-type pairs of Freqs[i]*Freqs[j]*GapScoreMatrix[i][j]; Freqs has 4 entries indexed by gap type.
+ {
+#if TRACE
+ Log("Freqs=");
+ for (unsigned i = 0; i < 4; ++i)
+ if (Freqs[i] != 0)
+ Log(" %s=%.3g", GapTypeToStr(i), Freqs[i]);
+ Log("\n");
+#endif
+
+ SCORE TotalOffDiag = 0; // Terms with j < i (lower triangle only).
+ SCORE TotalDiag = 0; // Terms with i == j.
+ for (unsigned i = 0; i < 4; ++i)
+ {
+ const FCOUNT fi = Freqs[i];
+ if (0 == fi) // A zero frequency contributes nothing to this row.
+ continue;
+ const float *Row = GapScoreMatrix[i];
+ SCORE diagt = fi*fi*Row[i];
+ TotalDiag += diagt;
+#if TRACE
+ Log("SPFGaps %s %s + Mx=%.3g TotalDiag += %.3g\n",
+ GapTypeToStr(i),
+ GapTypeToStr(i),
+ Row[i],
+ diagt);
+#endif
+ SCORE Sum = 0;
+ for (unsigned j = 0; j < i; ++j)
+ {
+ SCORE t = Freqs[j]*Row[j];
+#if TRACE
+ if (Freqs[j] != 0)
+ Log("SPFGaps %s %s + Mx=%.3g Sum += %.3g\n",
+ GapTypeToStr(i),
+ GapTypeToStr(j),
+ Row[j],
+ fi*t);
+#endif
+ Sum += t;
+ }
+ TotalOffDiag += fi*Sum; // fi is factored out of the inner sum.
+ }
+#if TRACE
+ Log("SPFGap TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n",
+ TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag);
+#endif
+ return TotalOffDiag*2 + TotalDiag; // Doubling the lower triangle stands in for the upper one; this relies on the matrix being symmetric.
+ }
+
+static SCORE SPFreqs(const FCOUNT Freqs[]) // Returns the sum over all (i, j) letter pairs of Freqs[i]*Freqs[j]*(*g_ptrScoreMatrix)[i][j]; Freqs has 20 entries indexed by letter.
+ {
+#if TRACE
+ Log("Freqs=");
+ for (unsigned i = 0; i < 20; ++i)
+ if (Freqs[i] != 0)
+ Log(" %c=%.3g", LetterToCharAmino(i), Freqs[i]);
+ Log("\n");
+#endif
+
+ SCORE TotalOffDiag = 0; // Terms with j < i (lower triangle only).
+ SCORE TotalDiag = 0; // Terms with i == j.
+ for (unsigned i = 0; i < 20; ++i)
+ {
+ const FCOUNT fi = Freqs[i];
+ if (0 == fi) // A zero frequency contributes nothing to this row.
+ continue;
+ const float *Row = (*g_ptrScoreMatrix)[i];
+ SCORE diagt = fi*fi*Row[i];
+ TotalDiag += diagt;
+#if TRACE
+ Log("SPF %c %c + Mx=%.3g TotalDiag += %.3g\n",
+ LetterToCharAmino(i),
+ LetterToCharAmino(i),
+ Row[i],
+ diagt);
+#endif
+ SCORE Sum = 0;
+ for (unsigned j = 0; j < i; ++j)
+ {
+ SCORE t = Freqs[j]*Row[j];
+#if TRACE
+ if (Freqs[j] != 0)
+ Log("SPF %c %c + Mx=%.3g Sum += %.3g\n",
+ LetterToCharAmino(i),
+ LetterToCharAmino(j),
+ Row[j],
+ fi*t);
+#endif
+ Sum += t;
+ }
+ TotalOffDiag += fi*Sum; // fi is factored out of the inner sum.
+ }
+#if TRACE
+ Log("SPF TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n",
+ TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag);
+#endif
+ return TotalOffDiag*2 + TotalDiag; // Doubling the lower triangle stands in for the upper one; NOTE(review): assumes the score matrix is symmetric - confirm.
+ }
+
+static SCORE ObjScoreSPCol(const MSA &msa, unsigned uColIndex) // Score of one column from weighted letter and gap-type frequencies, with each sequence's pairing against itself subtracted out.
+ {
+ FCOUNT Freqs[20];
+ FCOUNT GapFreqs[4];
+
+ memset(Freqs, 0, sizeof(Freqs));
+ memset(GapFreqs, 0, sizeof(GapFreqs));
+
+ const unsigned uSeqCount = msa.GetSeqCount();
+#if TRACE
+ Log("Weights=");
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ Log(" %u=%.3g", uSeqIndex, msa.GetSeqWeight(uSeqIndex));
+ Log("\n");
+#endif
+ SCORE SelfOverCount = 0; // Sum of w*w*M[letter][letter]: the self-pair terms that SPFreqs includes.
+ SCORE GapSelfOverCount = 0; // Same correction for the gap-type frequencies.
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ WEIGHT w = msa.GetSeqWeight(uSeqIndex);
+
+ bool bGapThisCol = msa.IsGap(uSeqIndex, uColIndex);
+ bool bGapPrevCol = (uColIndex == 0 ? false : msa.IsGap(uSeqIndex, uColIndex - 1)); // The first column is treated as following a letter.
+ int GapType = bGapThisCol + 2*bGapPrevCol; // Yields LL=0, LG=1, GL=2, GG=3.
+ assert(GapType >= 0 && GapType < 4);
+ GapFreqs[GapType] += w;
+ SCORE gapt = w*w*GapScoreMatrix[GapType][GapType];
+ GapSelfOverCount += gapt;
+
+ if (bGapThisCol) // Gaps do not count towards letter frequencies.
+ continue;
+ unsigned uLetter = msa.GetLetterEx(uSeqIndex, uColIndex);
+ if (uLetter >= 20) // Letter codes of 20 and above have no slot in Freqs and are skipped.
+ continue;
+ Freqs[uLetter] += w;
+ SCORE t = w*w*(*g_ptrScoreMatrix)[uLetter][uLetter];
+#if TRACE
+ Log("FastCol compute freqs & SelfOverCount %c w=%.3g M=%.3g SelfOverCount += %.3g\n",
+ LetterToCharAmino(uLetter), w, (*g_ptrScoreMatrix)[uLetter][uLetter], t);
+#endif
+ SelfOverCount += t;
+ }
+ SCORE SPF = SPFreqs(Freqs);
+ SCORE Col = SPF - SelfOverCount;
+
+ SCORE SPFGaps = SPGapFreqs(GapFreqs);
+ SCORE ColGaps = SPFGaps - GapSelfOverCount;
+#if TRACE
+ Log("SPF=%.3g - SelfOverCount=%.3g = %.3g\n", SPF, SelfOverCount, Col);
+ Log("SPFGaps=%.3g - GapsSelfOverCount=%.3g = %.3g\n", SPFGaps, GapSelfOverCount, ColGaps);
+#endif
+ return Col + ColGaps; // Each pair is still counted in both orders here; ObjScoreSPDimer halves the total.
+ }
+
+// Sum-of-pairs objective score of an alignment, computed column by
+// column from letter and gap-type frequencies (see ObjScoreSPCol).
+// Returns half the summed column scores, since each column score counts
+// every pair of sequences in both orders.
+SCORE ObjScoreSPDimer(const MSA &msa)
+    {
+// The gap score matrix is rebuilt on every call, so it always reflects
+// the current value of g_scoreGapOpen. It has 16 entries, so the cost
+// is negligible.
+    InitGapScoreMatrix();
+
+    SCORE Total = 0;
+    const unsigned uColCount = msa.GetColCount();
+    for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+        {
+        SCORE Col = ObjScoreSPCol(msa, uColIndex);
+#if TRACE
+        {
+        SCORE ColCheck = SPColBrute(msa, uColIndex);
+        Log("FastCol=%.3g CheckCol=%.3g\n", Col, ColCheck);
+        }
+#endif
+        Total += Col;
+        }
+#if TRACE
+    Log("Total/2 = %.3g (final result from fast)\n", Total/2);
+#endif
+    return Total/2;
+    }
Added: trunk/packages/muscle/branches/upstream/current/sptest.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/sptest.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/sptest.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,176 @@
+#include "muscle.h"
+#include "objscore.h"
+#include "msa.h"
+#include "textfile.h"
+#include "pwpath.h"
+
+const unsigned INDELS = 1;
+
+// Choose two distinct random positions in [1, L-2] at which Str holds
+// 'M', and return them through pi1 and pi2. The first and last
+// positions of the string are never chosen.
+// Quits if fewer than two such positions exist: the random search
+// below would otherwise never terminate, or would take a remainder
+// modulo zero when L == 2.
+static void GetPos(const char Str[], unsigned L, int *pi1, int *pi2)
+    {
+    unsigned uCandidates = 0;
+    for (unsigned i = 1; i + 1 < L; ++i)
+        if (Str[i] == 'M')
+            ++uCandidates;
+    if (uCandidates < 2)
+        Quit("GetPos: fewer than two interior 'M' positions");
+
+// At least two candidates exist, so L-2 >= 2 and both loops terminate.
+    int i1;
+    for (;;)
+        {
+        i1 = rand()%(L-2) + 1;
+        if (Str[i1] == 'M')
+            break;
+        }
+    int i2;
+    for (;;)
+        {
+        i2 = rand()%(L-2) + 1;
+        if (i1 != i2 && Str[i2] == 'M')
+            break;
+        }
+    *pi1 = i1;
+    *pi2 = i2;
+    }
+
+// Write a random alignment path into Str: uSeqLength + uIndelCount
+// characters, all 'M' except that each of the uIndelCount rounds turns
+// one 'M' into 'D' and another into 'I'. Str must have room for the
+// path plus a terminating zero. The path is also written to the log.
+static void MakePath(unsigned uSeqLength, unsigned uIndelCount, char Str[])
+    {
+    const unsigned uLen = uSeqLength + uIndelCount;
+
+    unsigned uPos = 0;
+    while (uPos < uLen)
+        Str[uPos++] = 'M';
+    Str[uLen] = 0;
+
+    for (unsigned uRound = 0; uRound != uIndelCount; ++uRound)
+        {
+        int iDelete;
+        int iInsert;
+        GetPos(Str, uLen, &iDelete, &iInsert);
+        Str[iDelete] = 'D';
+        Str[iInsert] = 'I';
+        }
+
+    Log("MakePath=%s\n", Str);
+    }
+
+// Developer test harness. Reads two alignments of equal width from
+// fixed file names, joins them along two random paths, and logs the SP
+// score of each joined alignment next to the XP score of its two
+// halves, together with the differences between the two paths.
+// The path strings and sequence index lists live in fixed-size stack
+// buffers, so the input sizes are checked against them before use.
+void SPTest()
+    {
+    SetPPScore(PPSCORE_SV);
+
+    SetListFileName("c:\\tmp\\muscle.log", false);
+
+    TextFile file1("c:\\tmp\\msa1.afa");
+    TextFile file2("c:\\tmp\\msa2.afa");
+
+    MSA msa1;
+    MSA msa2;
+
+    msa1.FromFile(file1);
+    msa2.FromFile(file2);
+
+    Log("msa1=\n");
+    msa1.LogMe();
+    Log("msa2=\n");
+    msa2.LogMe();
+
+    const unsigned uColCount = msa1.GetColCount();
+    if (msa2.GetColCount() != uColCount)
+        Quit("Different lengths");
+
+    const unsigned uSeqCount1 = msa1.GetSeqCount();
+    const unsigned uSeqCount2 = msa2.GetSeqCount();
+    const unsigned uSeqCount = uSeqCount1 + uSeqCount2;
+
+// Capacity checks for the fixed-size buffers declared below.
+    const unsigned uPathBufSize = 1024;
+    const unsigned uMaxSeqs = 1024;
+    if (uColCount + INDELS >= uPathBufSize)
+        Quit("SPTest: alignment too long for path buffer");
+    if (uSeqCount1 > uMaxSeqs || uSeqCount2 > uMaxSeqs)
+        Quit("SPTest: too many sequences");
+
+    MSA::SetIdCount(uSeqCount);
+
+    for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+        {
+        msa1.SetSeqWeight(uSeqIndex1, 1.0);
+        msa1.SetSeqId(uSeqIndex1, uSeqIndex1);
+        }
+
+// Ids of the second alignment follow on from those of the first.
+    for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+        {
+        msa2.SetSeqWeight(uSeqIndex2, 1.0);
+        msa2.SetSeqId(uSeqIndex2, uSeqCount1 + uSeqIndex2);
+        }
+
+    MSA alnA;
+    MSA alnB;
+
+    char strPathA[uPathBufSize];
+    char strPathB[uPathBufSize];
+    MakePath(uColCount, INDELS, strPathA);
+    MakePath(uColCount, INDELS, strPathB);
+
+    PWPath PathA;
+    PWPath PathB;
+    PathA.FromStr(strPathA);
+    PathB.FromStr(strPathB);
+
+    Log("PathA=\n");
+    PathA.LogMe();
+    Log("PathB=\n");
+    PathB.LogMe();
+
+    AlignTwoMSAsGivenPath(PathA, msa1, msa2, alnA);
+    AlignTwoMSAsGivenPath(PathB, msa1, msa2, alnB);
+
+    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+        {
+        alnA.SetSeqWeight(uSeqIndex, 1.0);
+        alnB.SetSeqWeight(uSeqIndex, 1.0);
+        }
+
+// Row indexes of each input alignment within the joined alignments.
+    unsigned Seqs1[uMaxSeqs];
+    unsigned Seqs2[uMaxSeqs];
+
+    for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+        Seqs1[uSeqIndex1] = uSeqIndex1;
+
+    for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+        Seqs2[uSeqIndex2] = uSeqCount1 + uSeqIndex2;
+
+    MSA msaA1;
+    MSA msaA2;
+    MSA msaB1;
+    MSA msaB2;
+    MSAFromSeqSubset(alnA, Seqs1, uSeqCount1, msaA1);
+    MSAFromSeqSubset(alnB, Seqs1, uSeqCount1, msaB1);
+    MSAFromSeqSubset(alnA, Seqs2, uSeqCount2, msaA2);
+    MSAFromSeqSubset(alnB, Seqs2, uSeqCount2, msaB2);
+
+    for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+        {
+        msaA1.SetSeqWeight(uSeqIndex1, 1.0);
+        msaB1.SetSeqWeight(uSeqIndex1, 1.0);
+        }
+
+    for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+        {
+        msaA2.SetSeqWeight(uSeqIndex2, 1.0);
+        msaB2.SetSeqWeight(uSeqIndex2, 1.0);
+        }
+
+    Log("msaA1=\n");
+    msaA1.LogMe();
+
+    Log("msaB1=\n");
+    msaB1.LogMe();
+
+    Log("msaA2=\n");
+    msaA2.LogMe();
+
+    Log("msaB2=\n");
+    msaB2.LogMe();
+
+    Log("alnA=\n");
+    alnA.LogMe();
+
+    Log("AlnB=\n");
+    alnB.LogMe();
+
+    Log("\nSPA\n---\n");
+    SCORE SPA = ObjScoreSP(alnA);
+    Log("\nSPB\n---\n");
+    SCORE SPB = ObjScoreSP(alnB);
+
+    Log("\nXPA\n---\n");
+    SCORE XPA = ObjScoreXP(msaA1, msaA2);
+    Log("\nXPB\n---\n");
+    SCORE XPB = ObjScoreXP(msaB1, msaB2);
+
+    Log("SPA=%.4g SPB=%.4g Diff=%.4g\n", SPA, SPB, SPA - SPB);
+    Log("XPA=%.4g XPB=%.4g Diff=%.4g\n", XPA, XPB, XPA - XPB);
+    }
Added: trunk/packages/muscle/branches/upstream/current/stabilize.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/stabilize.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/stabilize.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,20 @@
+#include "muscle.h"
+#include "msa.h"
+
+// Build msaStable from msa with rows reordered so that row number
+// equals sequence id. Only names and characters are copied.
+void Stabilize(const MSA &msa, MSA &msaStable)
+    {
+    const unsigned uRows = msa.GetSeqCount();
+    const unsigned uCols = msa.GetColCount();
+    msaStable.SetSize(uRows, uCols);
+
+    unsigned uId = 0;
+    while (uId < uRows)
+        {
+    // Row of msa that currently holds the sequence with this id.
+        const unsigned uFrom = msa.GetSeqIndex(uId);
+        msaStable.SetSeqName(uId, msa.GetSeqName(uFrom));
+
+        for (unsigned uCol = 0; uCol != uCols; ++uCol)
+            {
+            const char ch = msa.GetChar(uFrom, uCol);
+            msaStable.SetChar(uId, uCol, ch);
+            }
+        ++uId;
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/subfam.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/subfam.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/subfam.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,384 @@
+#include "muscle.h"
+#include "tree.h"
+#include "textfile.h" // for test only
+#include "msa.h"
+#include "seqvect.h"
+#include "profile.h"
+#ifndef _MSC_VER
+#include <unistd.h> // for unlink
+#endif
+
+#define TRACE 0
+
+/***
+Find subfamilies from tree by following criteria:
+
+(a) number of leaves <= max,
+(b) is monophyletic, i.e. most recent common ancestor is parent
+of no more than one subfamily.
+***/
+
+// Post-order walk of the tree below uNodeIndex. Returns the number of
+// leaves under the node and appends subfamily root nodes to SubFams,
+// advancing uSubFamCount. A child becomes a subfamily when it fits
+// within uMaxLeafCount but its parent does not. If the whole tree fits,
+// the root is the single subfamily.
+static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,
+  unsigned SubFams[], unsigned &uSubFamCount)
+    {
+    if (tree.IsLeaf(uNodeIndex))
+        return 1;
+
+    const unsigned uL = tree.GetLeft(uNodeIndex);
+    const unsigned uR = tree.GetRight(uNodeIndex);
+    const unsigned uLeavesL = SubFamRecurse(tree, uL, uMaxLeafCount, SubFams, uSubFamCount);
+    const unsigned uLeavesR = SubFamRecurse(tree, uR, uMaxLeafCount, SubFams, uSubFamCount);
+    const unsigned uLeaves = uLeavesL + uLeavesR;
+
+    if (uLeaves <= uMaxLeafCount)
+        {
+    // This node still fits, so the decision is left to an ancestor.
+    // The root has no ancestor and must record itself.
+        if (tree.IsRoot(uNodeIndex))
+            {
+            if (uSubFamCount != 0)
+                Quit("Error in SubFamRecurse");
+            SubFams[uSubFamCount++] = uNodeIndex;
+            }
+        return uLeaves;
+        }
+
+// Too big here: each child that fits is a subfamily.
+    if (uLeavesL <= uMaxLeafCount)
+        SubFams[uSubFamCount++] = uL;
+    if (uLeavesR <= uMaxLeafCount)
+        SubFams[uSubFamCount++] = uR;
+    return uLeaves;
+    }
+
+// Find the subfamilies of a tree. On return SubFams[0..count-1] holds
+// the node index of each subfamily root and *ptruSubFamCount holds the
+// count. The caller provides the storage for SubFams.
+void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)
+    {
+    unsigned &uCount = *ptruSubFamCount;
+    uCount = 0;
+
+    const unsigned uRoot = tree.GetRootNodeIndex();
+    SubFamRecurse(tree, uRoot, uMaxLeafCount, SubFams, uCount);
+
+#if TRACE
+    {
+    Log("\n");
+    Log("Tree:\n");
+    tree.LogMe();
+    //void DrawTree(const Tree &tree);
+    //DrawTree(tree);
+    Log("\n");
+    Log("%d subfams:\n", *ptruSubFamCount);
+    for (unsigned i = 0; i < *ptruSubFamCount; ++i)
+        Log(" %d=%d", i, SubFams[i]);
+    Log("\n");
+    }
+#endif
+    }
+
+//unsigned SubFams[9999];
+//unsigned uSubFamCount;
+//
+//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
+// {
+// const unsigned uRoot = tree.GetRootNodeIndex();
+// unsigned uDist = 0;
+// while (uNodeIndex != uRoot)
+// {
+// ++uDist;
+// uNodeIndex = tree.GetParent(uNodeIndex);
+// }
+// return uDist;
+// }
+//
+//static void DrawNode(const Tree &tree, unsigned uNodeIndex)
+// {
+// if (!tree.IsLeaf(uNodeIndex))
+// DrawNode(tree, tree.GetLeft(uNodeIndex));
+//
+// unsigned uDist = DistFromRoot(tree, uNodeIndex);
+// for (unsigned i = 0; i < 5*uDist; ++i)
+// Log(" ");
+// Log("%d", uNodeIndex);
+// for (unsigned i = 0; i < uSubFamCount; ++i)
+// if (uNodeIndex == SubFams[i])
+// {
+// Log("*");
+// break;
+// }
+// Log("\n");
+//
+// if (!tree.IsLeaf(uNodeIndex))
+// DrawNode(tree, tree.GetRight(uNodeIndex));
+// }
+//
+//static void DrawTree(const Tree &tree)
+// {
+// unsigned uRoot = tree.GetRootNodeIndex();
+// DrawNode(tree, uRoot);
+// }
+//
+//void TestSubFams(const char *FileName)
+// {
+// Tree tree;
+// TextFile f(FileName);
+// tree.FromFile(f);
+// SubFam(tree, 5, SubFams, &uSubFamCount);
+// DrawTree(tree);
+// }
+
+// Set NodeInSubFam[] to true for every node strictly below uNodeIndex.
+// The entry for uNodeIndex itself is not touched.
+static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])
+    {
+    const bool bHasChildren = !tree.IsLeaf(uNodeIndex);
+    if (bHasChildren)
+        {
+        const unsigned uChildL = tree.GetLeft(uNodeIndex);
+        const unsigned uChildR = tree.GetRight(uNodeIndex);
+
+        NodeInSubFam[uChildL] = true;
+        NodeInSubFam[uChildR] = true;
+
+        SetInFam(tree, uChildL, NodeInSubFam);
+        SetInFam(tree, uChildR, NodeInSubFam);
+        }
+    }
+
+// Align the sequences under one guide-tree node with an external
+// program. The sequences are written to a temporary FASTA file, the
+// aligner is run through the shell, and its output is read back into
+// msaOut. Each row of msaOut is then given the id of the sequence in
+// vAll that has the same name.
+// Quits if the command reports failure or returns a different number
+// of sequences than were sent.
+// NOTE(review): the temporary file names are fixed and relative, so two
+// runs in the same directory would overwrite each other's files.
+void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,
+  MSA &msaOut)
+    {
+    const unsigned uSeqCount = vAll.GetSeqCount();
+
+    const char *InTmp = "asf_in.tmp";
+    const char *OutTmp = "asf_out.tmp";
+
+// Collect the sequences at the leaves below this node.
+    unsigned *Leaves = new unsigned[uSeqCount];
+    unsigned uLeafCount;
+    GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);
+
+    SeqVect v;
+    for (unsigned i = 0; i < uLeafCount; ++i)
+        {
+        unsigned uLeafNodeIndex = Leaves[i];
+        unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);
+        Seq &s = vAll.GetSeqById(uId);
+        v.AppendSeq(s);
+        }
+
+// The leaf list is not needed past this point.
+    delete[] Leaves;
+    Leaves = 0;
+
+#if TRACE
+    {
+    Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);
+    for (unsigned i = 0; i < uLeafCount; ++i)
+        Log(" %s", v.GetSeqName(i));
+    Log("\n");
+    }
+#endif
+
+    TextFile fIn(InTmp, true);
+
+    v.ToFASTAFile(fIn);
+    fIn.Close();
+
+// Both names are short constants, so the command fits the buffer.
+    char CmdLine[4096];
+    sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);
+//  sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);
+    const int iStatus = system(CmdLine);
+    if (0 != iStatus)
+        {
+    // Without this check a failed run would go on to read a missing
+    // or empty output file.
+        unlink(InTmp);
+        unlink(OutTmp);
+        Quit("AlignSubFam: external aligner command failed");
+        }
+
+    TextFile fOut(OutTmp);
+    msaOut.FromFile(fOut);
+
+// The loop below indexes msaOut by leaf count, so the two must agree.
+    if (msaOut.GetSeqCount() != uLeafCount)
+        {
+        unlink(InTmp);
+        unlink(OutTmp);
+        Quit("AlignSubFam: aligner output has wrong number of sequences");
+        }
+
+// The aligner may reorder rows, so ids are recovered by name.
+    for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)
+        {
+        const char *Name = msaOut.GetSeqName(uSeqIndex);
+        unsigned uId = vAll.GetSeqIdFromName(Name);
+        msaOut.SetSeqId(uSeqIndex, uId);
+        }
+
+    unlink(InTmp);
+    unlink(OutTmp);
+    }
+
+void ProgAlignSubFams() // Driver: read sequences, build a guide tree, split it into subfamilies, align each subfamily via AlignSubFam, merge the results up the tree and write the root alignment.
+ {
+ MSA msaOut;
+
+ SetOutputFileName(g_pstrOutFileName);
+ SetInputFileName(g_pstrInFileName);
+
+ SetMaxIters(g_uMaxIters);
+ SetSeqWeightMethod(g_SeqWeight1);
+
+ TextFile fileIn(g_pstrInFileName);
+ SeqVect v;
+ v.FromFASTAFile(fileIn);
+ const unsigned uSeqCount = v.Length();
+
+ if (0 == uSeqCount)
+ Quit("No sequences in input file");
+
+ ALPHA Alpha = ALPHA_Undefined;
+ switch (g_SeqType) // Take the alphabet from the sequence-type setting, or guess it from the data.
+ {
+ case SEQTYPE_Auto:
+ Alpha = v.GuessAlpha();
+ break;
+
+ case SEQTYPE_Protein:
+ Alpha = ALPHA_Amino;
+ break;
+
+ case SEQTYPE_DNA:
+ Alpha = ALPHA_DNA;
+ break;
+
+ case SEQTYPE_RNA:
+ Alpha = ALPHA_RNA;
+ break;
+
+ default:
+ Quit("Invalid seq type");
+ }
+ SetAlpha(Alpha);
+ v.FixAlpha();
+
+ if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) // Nucleotide input overrides the scoring scheme and first-stage distance measure.
+ {
+ SetPPScore(PPSCORE_SPN);
+ g_Distance1 = DISTANCE_Kmer4_6;
+ }
+
+ unsigned uMaxL = 0;
+ unsigned uTotL = 0;
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ {
+ unsigned L = v.GetSeq(uSeqIndex).Length();
+ uTotL += L;
+ if (L > uMaxL)
+ uMaxL = L;
+ }
+
+ SetIter(1);
+ g_bDiags = g_bDiags1;
+ SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); // Third argument is the integer mean length.
+
+ MSA::SetIdCount(uSeqCount);
+
+// Initialize sequence ids.
+// From this point on, ids must somehow propagate from here.
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+ v.SetSeqId(uSeqIndex, uSeqIndex);
+
+ if (uSeqCount > 1)
+ MHackStart(v);
+
+ if (0 == uSeqCount) // NOTE(review): looks unreachable if Quit above does not return - confirm.
+ {
+ msaOut.Clear();
+ return;
+ }
+
+ if (1 == uSeqCount && ALPHA_Amino == Alpha) // NOTE(review): returns without writing msaOut to the output file, and only for amino input - confirm intended.
+ {
+ const Seq &s = v.GetSeq(0);
+ msaOut.FromSeq(s);
+ return;
+ }
+
+ Tree GuideTree;
+ TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);
+ SetMuscleTree(GuideTree);
+
+ MSA msa;
+ if (g_bLow)
+ {
+ ProgNode *ProgNodes = 0;
+ ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
+ delete[] ProgNodes; // Only msa is wanted from this pass.
+ }
+ else
+ ProgressiveAlign(v, GuideTree, msa);
+ SetCurrentAlignment(msa);
+ TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); // Replace the guide tree with one built from the first alignment.
+ SetMuscleTree(GuideTree);
+
+ unsigned *SubFams = new unsigned[uSeqCount];
+ unsigned uSubFamCount;
+ SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);
+
+ SetProgressDesc("Align node");
+ const unsigned uNodeCount = 2*uSeqCount - 1;
+
+ ProgNode *ProgNodes = new ProgNode[uNodeCount];
+ bool *NodeIsSubFam = new bool[uNodeCount]; // True for the root node of each subfamily.
+ bool *NodeInSubFam = new bool[uNodeCount]; // True for nodes strictly below a subfamily root.
+
+ for (unsigned i = 0; i < uNodeCount; ++i)
+ {
+ NodeIsSubFam[i] = false;
+ NodeInSubFam[i] = false;
+ }
+
+ for (unsigned i = 0; i < uSubFamCount; ++i)
+ {
+ unsigned uNodeIndex = SubFams[i];
+ assert(uNodeIndex < uNodeCount);
+ NodeIsSubFam[uNodeIndex] = true;
+ SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
+ }
+
+ unsigned uJoin = 0;
+ unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
+ do
+ {
+ if (NodeIsSubFam[uTreeNodeIndex])
+ {
+#if TRACE
+ Log("Node %d: align subfam\n", uTreeNodeIndex);
+#endif
+ ProgNode &Node = ProgNodes[uTreeNodeIndex];
+ AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
+ Node.m_uLength = Node.m_MSA.GetColCount();
+ }
+ else if (!NodeInSubFam[uTreeNodeIndex]) // Node above all subfamilies: merge the alignments of its two children.
+ {
+#if TRACE
+ Log("Node %d: align two subfams\n", uTreeNodeIndex);
+#endif
+ Progress(uJoin, uSubFamCount - 1); // NOTE(review): unsigned subtraction would wrap if uSubFamCount were 0 - confirm it cannot be.
+ ++uJoin;
+
+ const unsigned uMergeNodeIndex = uTreeNodeIndex;
+ ProgNode &Parent = ProgNodes[uMergeNodeIndex];
+
+ const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
+ const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
+
+ ProgNode &Node1 = ProgNodes[uLeft];
+ ProgNode &Node2 = ProgNodes[uRight];
+
+ PWPath Path;
+ AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
+ Parent.m_uLength = Parent.m_MSA.GetColCount();
+
+ Node1.m_MSA.Clear(); // Child alignments are no longer needed once merged.
+ Node2.m_MSA.Clear();
+ }
+ else
+ {
+#if TRACE
+ Log("Node %d: in subfam\n", uTreeNodeIndex);
+#endif
+ ; // Inside a subfamily: handled when the subfamily root is aligned.
+ }
+ uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
+ }
+ while (NULL_NEIGHBOR != uTreeNodeIndex);
+ ProgressStepsDone();
+
+ unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
+ ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
+
+ TextFile fOut(g_pstrOutFileName, true);
+ MHackEnd(RootProgNode.m_MSA);
+ RootProgNode.m_MSA.ToFile(fOut);
+
+ delete[] NodeInSubFam;
+ delete[] NodeIsSubFam;
+ delete[] ProgNodes;
+ delete[] SubFams;
+
+ ProgNodes = 0;
+ NodeInSubFam = 0;
+ NodeIsSubFam = 0;
+ SubFams = 0;
+ }
Added: trunk/packages/muscle/branches/upstream/current/subfams.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/subfams.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/subfams.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,65 @@
+#include "muscle.h"
+#include "distfunc.h"
+
+const float INFINITY = float(1e29);
+const unsigned NILL = uInsane;
+
+static float *ShortestPathEstimate;
+static unsigned *Predecessor;
+
+// Locate the pair of objects in DF that are farthest apart.
+// On return *ptrIndex1 < *ptrIndex2 hold the indexes of that pair; when
+// several pairs share the largest distance, the first one met while scanning
+// i upwards and then j > i upwards is kept.
+// Fewer than two objects is reported through Quit().
+static void GetMostDistantPair(DistFunc &DF, unsigned *ptrIndex1, unsigned *ptrIndex2)
+    {
+    const unsigned uCount = DF.GetCount();
+    if (uCount < 2)
+        Quit("GetMostDistantPair: < 2 seqs");
+
+    unsigned uBestRow = uInsane;
+    unsigned uBestCol = uInsane;
+    float fBestDist = -1;
+    for (unsigned uRow = 0; uRow < uCount; ++uRow)
+        for (unsigned uCol = uRow + 1; uCol < uCount; ++uCol)
+            {
+            const float fDist = DF.GetDist(uRow, uCol);
+            // Only a strictly larger distance replaces the current best.
+            if (!(fDist > fBestDist))
+                continue;
+            fBestDist = fDist;
+            uBestRow = uRow;
+            uBestCol = uCol;
+            }
+
+    assert(uBestRow != uInsane);
+    assert(uBestCol != uInsane);
+
+    *ptrIndex1 = uBestRow;
+    *ptrIndex2 = uBestCol;
+    }
+
+// Reset the file-level ShortestPathEstimate / Predecessor tables for a
+// single-source shortest-path run that starts at node uIndex: every node
+// gets an estimate of INFINITY and no predecessor (NILL), then the estimate
+// of the source itself is set to 0.
+// Both tables must already have room for DF.GetCount() entries; nothing in
+// this file allocates them.
+static void InitializeSingleSource(DistFunc &DF, unsigned uIndex)
+    {
+    // The count used to be hard-coded to 0, which turned the reset loop
+    // below into a no-op and left stale estimates in place.
+    const unsigned uNodeCount = DF.GetCount();
+
+    for (unsigned i = 0; i < uNodeCount; ++i)
+        {
+        ShortestPathEstimate[i] = INFINITY;
+        Predecessor[i] = NILL;
+        }
+    ShortestPathEstimate[uIndex] = 0;
+    }
+
+// Edge relaxation step: if reaching v through u is shorter than the best
+// estimate recorded for v so far, adopt the shorter estimate and remember u
+// as the predecessor of v.
+static void Relax(DistFunc &DF, unsigned u, unsigned v)
+    {
+    const float fEdge = DF.GetDist(u, v);
+    const float fViaU = ShortestPathEstimate[u] + fEdge;
+    if (!(ShortestPathEstimate[v] > fViaU))
+        return;
+
+    ShortestPathEstimate[v] = fViaU;
+    Predecessor[v] = u;
+    }
+
+// Placeholder: the body is empty, so a call performs no work and leaves
+// ShortestPathEstimate / Predecessor untouched. DF and uIndex are unused.
+void ShortestPath(DistFunc &DF, unsigned uIndex)
+ {
+ }
Added: trunk/packages/muscle/branches/upstream/current/sw.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/sw.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/sw.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,206 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+// Textbook Smith-Waterman affine gap implementation.
+
+#define TRACE 0
+
+// Format a score for the trace listings.
+// MINUS_INFINITY is shown as a star; any other value is printed with
+// "%6.2f". The result points into a static buffer, so it is overwritten by
+// the next call and must be consumed (or copied) before calling again.
+static const char *LocalScoreToStr(SCORE s)
+    {
+    static char str[16];
+    if (MINUS_INFINITY == s)
+        return " *";
+    // Bounded write: a score of large magnitude needs more than 15
+    // characters and would otherwise run past the end of str.
+    snprintf(str, sizeof(str), "%6.2f", s);
+    return str;
+    }
+
+// Write one dynamic-programming matrix to the log as a table.
+// Rows are prefixes of A and columns are prefixes of B; each row and column
+// is labelled with its prefix length and the consensus character of the
+// profile position that ends the prefix (blank for the empty prefix).
+// Cells are read through the DPM() accessor and rendered by LocalScoreToStr.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+    {
+    // Column header line
+    Log(" ");
+    for (unsigned uCol = 0; uCol < uPrefixCountB; ++uCol)
+        {
+        const char cLabel = (uCol > 0) ? ConsensusChar(PB[uCol - 1]) : ' ';
+        Log(" %4u:%c", uCol, cLabel);
+        }
+    Log("\n");
+
+    // One line per prefix of A
+    for (unsigned uRow = 0; uRow < uPrefixCountA; ++uRow)
+        {
+        const char cLabel = (uRow > 0) ? ConsensusChar(PA[uRow - 1]) : ' ';
+        Log("%4u:%c ", uRow, cLabel);
+        for (unsigned uCol = 0; uCol < uPrefixCountB; ++uCol)
+            Log(" %s", LocalScoreToStr(DPM(uRow, uCol)));
+        Log("\n");
+        }
+    }
+
+// Smith-Waterman local alignment of two profiles with affine gap scores.
+//
+// PA/uLengthA and PB/uLengthB are the two profiles; both lengths must be
+// non-zero. Three (uLengthA+1) x (uLengthB+1) matrices are filled, indexed by
+// prefix lengths of A and B:
+//   M  last column is LetterA+LetterB
+//   D  last column is LetterA+GapB
+//   I  last column is GapA+LetterB
+// The best-scoring M cell and the three matrices are handed to TraceBackSW
+// together with Path. The return value is the score of that cell.
+SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+    {
+    assert(uLengthB > 0 && uLengthA > 0);
+
+    const unsigned uPrefixCountA = uLengthA + 1;
+    const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+    const size_t LM = uPrefixCountA*uPrefixCountB;
+    SCORE *DPM_ = new SCORE[LM];
+    SCORE *DPD_ = new SCORE[LM];
+    SCORE *DPI_ = new SCORE[LM];
+
+    DPM(0, 0) = 0;
+    DPD(0, 0) = MINUS_INFINITY;
+    DPI(0, 0) = MINUS_INFINITY;
+
+    DPM(1, 0) = MINUS_INFINITY;
+    DPD(1, 0) = MINUS_INFINITY;
+    DPI(1, 0) = MINUS_INFINITY;
+
+    DPM(0, 1) = MINUS_INFINITY;
+    DPD(0, 1) = MINUS_INFINITY;
+    DPI(0, 1) = MINUS_INFINITY;
+
+// Empty prefix of B is special case
+    for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+        {
+        // M=LetterA+LetterB, impossible with empty prefix
+        DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+        // D=LetterA+GapB, never optimal in local alignment with gap penalties
+        DPD(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+        // I=GapA+LetterB, impossible with empty prefix
+        DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+        }
+
+// Empty prefix of A is special case
+    for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        {
+        // M=LetterA+LetterB, impossible with empty prefix
+        DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+        // D=LetterA+GapB, impossible with empty prefix
+        DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+
+        // I=GapA+LetterB, never optimal in local alignment with gap penalties
+        DPI(0, uPrefixLengthB) = MINUS_INFINITY;
+        }
+
+    // Best M cell seen so far; this is where the traceback starts.
+    SCORE scoreMax = MINUS_INFINITY;
+    unsigned uPrefixLengthAMax = uInsane;
+    unsigned uPrefixLengthBMax = uInsane;
+
+// ============
+// Main DP loop
+// ============
+    // The gap-close scores lag one position behind the loop indexes: they
+    // belong to the profile position that precedes the current one.
+    SCORE scoreGapCloseB = MINUS_INFINITY;
+    for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+        {
+        const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+        SCORE scoreGapCloseA = MINUS_INFINITY;
+        for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+            {
+            const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+                {
+                // Match M=LetterA+LetterB
+                SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+
+                SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+                SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+                SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+
+                SCORE scoreBest;
+                if (scoreMM >= scoreDM && scoreMM >= scoreIM)
+                    scoreBest = scoreMM;
+                else if (scoreDM >= scoreMM && scoreDM >= scoreIM)
+                    scoreBest = scoreDM;
+                else
+                    {
+                    assert(scoreIM >= scoreMM && scoreIM >= scoreDM);
+                    scoreBest = scoreIM;
+                    }
+                // Local alignment: a negative-scoring prefix is dropped and
+                // the alignment restarts at this cell.
+                if (scoreBest < 0)
+                    scoreBest = 0;
+                scoreBest += scoreLL;
+                if (scoreBest > scoreMax)
+                    {
+                    scoreMax = scoreBest;
+                    uPrefixLengthAMax = uPrefixLengthA;
+                    uPrefixLengthBMax = uPrefixLengthB;
+                    }
+                DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+                }
+
+                {
+                // Delete D=LetterA+GapB
+                SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+                  PA[uPrefixLengthA-1].m_scoreGapOpen;
+                SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB);
+
+                SCORE scoreBest;
+                if (scoreMD >= scoreDD)
+                    scoreBest = scoreMD;
+                else
+                    {
+                    assert(scoreDD >= scoreMD);
+                    scoreBest = scoreDD;
+                    }
+                DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+                }
+
+            // Insert I=GapA+LetterB
+                {
+                SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+                  PB[uPrefixLengthB - 1].m_scoreGapOpen;
+                SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1);
+
+                SCORE scoreBest;
+                if (scoreMI >= scoreII)
+                    scoreBest = scoreMI;
+                else
+                    {
+                    assert(scoreII > scoreMI);
+                    scoreBest = scoreII;
+                    }
+                DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+                }
+
+            scoreGapCloseA = PPA.m_scoreGapClose;
+            }
+        scoreGapCloseB = PPB.m_scoreGapClose;
+        }
+
+#if TRACE
+    // The matrices have uPrefixCountA x uPrefixCountB cells. The loop
+    // variables used here previously are out of scope at this point.
+    Log("DPM:\n");
+    ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("DPD:\n");
+    ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+    Log("DPI:\n");
+    ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+    assert(scoreMax == DPM(uPrefixLengthAMax, uPrefixLengthBMax));
+    TraceBackSW(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_,
+      uPrefixLengthAMax, uPrefixLengthBMax, Path);
+
+#if TRACE
+    SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path);
+    Path.LogMe();
+    // LocalScoreToStr returns a pointer into one static buffer, so the two
+    // scores are logged by separate calls; in a single call both %s
+    // arguments would show the same text.
+    Log("Score = %s", LocalScoreToStr(scoreMax));
+    Log(" Path = %s\n", LocalScoreToStr(scorePath));
+#endif
+
+    delete[] DPM_;
+    delete[] DPD_;
+    delete[] DPI_;
+
+    return scoreMax;
+    }
Added: trunk/packages/muscle/branches/upstream/current/termgaps.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/termgaps.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/termgaps.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,36 @@
+#include "muscle.h"
+#include "profile.h"
+
+// Adjust the terminal gap scores of a profile according to g_TermGaps.
+//
+// Only the gap-open score of the first position and the gap-close score of
+// the last position are touched. A score equal to MINUS_INFINITY is always
+// left alone, and the last position is only modified when uLength > 1.
+//   TERMGAPS_Full  no change
+//   TERMGAPS_Half  both terminal scores are set to 0
+//   TERMGAPS_Ext   both terminal scores have their sign flipped
+// Any other setting is reported through Quit(). An empty profile is ignored.
+// Note that the profile is modified even though Prof is declared const.
+void SetTermGaps(const ProfPos *Prof, unsigned uLength)
+    {
+    if (0 == uLength)
+        return;
+
+    ProfPos *First = (ProfPos *) Prof;
+    ProfPos *Last = (ProfPos *) (Prof + uLength - 1);
+
+    switch (g_TermGaps)
+        {
+    case TERMGAPS_Full:
+        break;
+
+    case TERMGAPS_Half:
+    // -infinity check for lock left/right
+        if (First->m_scoreGapOpen != MINUS_INFINITY)
+            First->m_scoreGapOpen = 0;
+
+        if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY)
+            Last->m_scoreGapClose = 0;
+        // This case used to fall through into TERMGAPS_Ext, which negated
+        // the zeros that had just been stored.
+        break;
+
+    case TERMGAPS_Ext:
+        if (First->m_scoreGapOpen != MINUS_INFINITY)
+            First->m_scoreGapOpen *= -1;
+
+        if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY)
+            Last->m_scoreGapClose *= -1;
+        break;
+
+    default:
+        Quit("Invalid g_TermGaps");
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/textfile.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/textfile.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/textfile.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,345 @@
+#include "muscle.h"
+#include "textfile.h"
+#include <errno.h>
+
+// Open a text file by name.
+// The name "-" selects a standard stream: stdout when bWrite is set, stdin
+// otherwise. Any other name is opened with fopen in binary mode ("wb" for
+// writing, "rb" for reading). Failure to open is reported through Quit().
+TextFile::TextFile(const char szFileName[], bool bWrite)
+    {
+    const bool bStdStream = (0 == strcmp(szFileName, "-"));
+
+    FILE *ptrFile = 0;
+    if (bStdStream)
+        ptrFile = bWrite ? stdout : stdin;
+    else
+        ptrFile = fopen(szFileName, bWrite ? "wb" : "rb");
+
+    if (0 == ptrFile)
+        Quit("Cannot open '%s' errno=%d\n", szFileName, errno);
+    Init(ptrFile, szFileName);
+    }
+
+// Common constructor tail: remember the stream, keep a private copy of its
+// name (released with free() in the destructor) and reset the read state to
+// "start of line 1, nothing pushed back".
+void TextFile::Init(FILE *ptrFile, const char *ptrFileName)
+    {
+    m_ptrName = strdup(ptrFileName);
+    m_ptrFile = ptrFile;
+
+    m_cPushedBack = -1;
+    m_bLastCharWasEOL = true;
+    m_uColNr = 0;
+    m_uLineNr = 1;
+#if DEBUG
+    setbuf(m_ptrFile, 0);
+#endif
+    }
+
+// Wrap an already-open stdio stream.
+// ptrFileName is the name reported by GetFileName(). It used to be ignored,
+// so every stream wrapped this way was recorded as "-". A null name still
+// falls back to "-".
+TextFile::TextFile(FILE *ptrFile, const char *ptrFileName)
+    {
+    Init(ptrFile, 0 != ptrFileName ? ptrFileName : "-");
+    }
+
+// Close the stream unless it is absent or one of the three standard
+// streams, then release the copy of the file name made in Init().
+TextFile::~TextFile()
+    {
+    if (0 != m_ptrFile)
+        {
+        const bool bStdStream =
+          (stdin == m_ptrFile || stdout == m_ptrFile || stderr == m_ptrFile);
+        if (!bStdStream)
+            fclose(m_ptrFile);
+        }
+    free(m_ptrName);
+    }
+
+// Read the next line into szLine (capacity uBytes, always zero-terminated
+// because the buffer is cleared first). Carriage returns are dropped and the
+// terminating newline is not stored.
+// Returns true on end-of-file, false when a newline ended the line. A zero
+// size buffer or a line that does not fit is reported through Quit().
+bool TextFile::GetLine(char szLine[], unsigned uBytes)
+    {
+    if (0 == uBytes)
+        Quit("TextFile::GetLine, buffer zero size");
+
+    memset(szLine, 0, uBytes);
+
+    unsigned uUsed = 0;
+    char c;
+    while (!GetChar(c))
+        {
+        if ('\n' == c)
+            return false;
+        if ('\r' == c)
+            continue;
+        // Keep the last byte free for the terminator.
+        if (uUsed >= uBytes - 1)
+            Quit("TextFile::GetLine: input buffer too small, line %u",
+              m_uLineNr);
+        else
+            szLine[uUsed++] = c;
+        }
+    return true;
+    }
+
+// Like GetLine, but each line is passed through TrimBlanks and lines that
+// are empty afterwards are skipped.
+// Returns true on end-of-file, false once a non-empty line is in szLine.
+bool TextFile::GetTrimLine(char szLine[], unsigned uBytes)
+    {
+    do
+        {
+        if (GetLine(szLine, uBytes))
+            return true;
+        TrimBlanks(szLine);
+        }
+    while (0 == szLine[0]);
+    return false;
+    }
+
+// Seek back to the start of the stream and restore the read state that
+// Init() establishes for a freshly opened file.
+void TextFile::Rewind()
+    {
+    fseek(m_ptrFile, 0, SEEK_SET);
+    m_uLineNr = 1;
+    m_uColNr = 0;
+    m_bLastCharWasEOL = true;
+    // Drop any pushed-back character: it belongs to the old position and
+    // would otherwise be returned by the first GetChar() after the rewind.
+    m_cPushedBack = -1;
+    }
+
+// Write one character and keep the line / column counters in step:
+// a newline starts the next line at column 1, anything else advances the
+// column.
+void TextFile::PutChar(char c)
+    {
+    // fputc reports the byte written as an unsigned char converted to int,
+    // so the check must apply the same conversion; comparing against the
+    // plain char fails for bytes above 0x7f where char is signed.
+    const int iWritten = fputc(c, m_ptrFile);
+    assert(iWritten == (int) (unsigned char) c);
+    (void) iWritten;
+    if ('\n' == c)
+        {
+        ++m_uLineNr;
+        m_uColNr = 1;
+        }
+    else
+        ++m_uColNr;
+    }
+
+// Write a zero-terminated string to the stream. The line and column
+// counters are not updated. A write failure is caught by assert only.
+void TextFile::PutString(const char szLine[])
+    {
+    const int iResult = fputs(szLine, m_ptrFile);
+    assert(iResult >= 0);
+    }
+
+// printf-style output to the stream.
+// The text is formatted into a fixed 4096-byte buffer and written with
+// PutString. Output that does not fit, or a formatting error, is reported
+// through Quit() rather than being written past the end of the buffer.
+void TextFile::PutFormat(const char szFormat[], ...)
+    {
+    char szStr[4096];
+    va_list ArgList;
+    va_start(ArgList, szFormat);
+    const int iLength = vsnprintf(szStr, sizeof(szStr), szFormat, ArgList);
+    va_end(ArgList);
+    if (iLength < 0 || (size_t) iLength >= sizeof(szStr))
+        Quit("TextFile::PutFormat: formatted output too long");
+    PutString(szStr);
+    }
+
+// GetLine for callers that require a line: end-of-file is reported through
+// Quit() instead of being returned.
+void TextFile::GetLineX(char szLine[], unsigned uBytes)
+    {
+    if (GetLine(szLine, uBytes))
+        Quit("end-of-file in GetLineX");
+    }
+
+// Read the next token into szToken (capacity uBytes).
+//
+// Leading white space is skipped. A character listed in szCharTokens is a
+// token by itself. Any other token runs until white space, end-of-file or
+// one of the szCharTokens characters, which is pushed back for the next
+// call.
+// Returns true when end-of-file was reached; if that happens while skipping
+// white space, szToken is left untouched. Returns false otherwise. A token
+// that does not fit is reported through Quit().
+bool TextFile::GetToken(char szToken[], unsigned uBytes, const char szCharTokens[])
+    {
+// Skip leading white space
+    char c;
+    for (;;)
+        {
+        bool bEof = GetChar(c);
+        if (bEof)
+            return true;
+        // The <ctype.h> classifiers need an unsigned char value; a plain
+        // char may be negative for bytes above 0x7f.
+        if (!isspace((unsigned char) c))
+            break;
+        }
+
+// Check for special case single-character tokens
+    if (0 != strchr(szCharTokens, c))
+        {
+        assert(uBytes >= 2);
+        szToken[0] = c;
+        szToken[1] = 0;
+        return false;
+        }
+
+// Loop until token terminated by white space, EOF or special
+    unsigned uBytesCopied = 0;
+    for (;;)
+        {
+        if (uBytesCopied < uBytes - 1)
+            szToken[uBytesCopied++] = c;
+        else
+            Quit("TextFile::GetToken: input buffer too small, line %u",
+              m_uLineNr);
+        bool bEof = GetChar(c);
+        if (bEof)
+            {
+            szToken[uBytesCopied] = 0;
+            return true;
+            }
+        // Check for special case single-character tokens
+        if (0 != strchr(szCharTokens, c))
+            {
+            PushBack(c);
+            assert(uBytesCopied > 0 && uBytesCopied < uBytes);
+            szToken[uBytesCopied] = 0;
+            return false;
+            }
+        if (isspace((unsigned char) c))
+            {
+            assert(uBytesCopied > 0 && uBytesCopied < uBytes);
+            szToken[uBytesCopied] = 0;
+            return false;
+            }
+        }
+    }
+
+// GetToken for callers that require a token: an end-of-file result is
+// reported through Quit() instead of being returned.
+void TextFile::GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[])
+    {
+    if (GetToken(szToken, uBytes, szCharTokens))
+        Quit("End-of-file in GetTokenX");
+    }
+
+// Consume the rest of the current line, up to and including the newline
+// (or up to end-of-file). Everything skipped is expected to be white space;
+// this is checked by assert only.
+void TextFile::Skip()
+    {
+    for (;;)
+        {
+        char c;
+        bool bEof = GetChar(c);
+        if (bEof || '\n' == c)
+            return;
+        // <ctype.h> classifiers need an unsigned char value.
+        assert(isspace((unsigned char) c));
+        }
+    }
+
+// GetPos / SetPos save and restore a read position: the stream offset
+// together with the line and column counters. Two implementations follow,
+// selected by _WIN32. Neither touches m_cPushedBack or m_bLastCharWasEOL.
+#ifdef _WIN32
+
+// fgetpos-based variant. The fpos_t value is treated as an integer and
+// narrowed to unsigned; failure is caught by assert only.
+TEXTFILEPOS TextFile::GetPos()
+ {
+ fpos_t p;
+ int i = fgetpos(m_ptrFile, &p);
+ assert(0 == i);
+ assert(p >= 0);
+ TEXTFILEPOS Pos;
+ Pos.uOffset = (unsigned) p;
+ Pos.uLineNr = m_uLineNr;
+ Pos.uColNr = m_uColNr;
+ return Pos;
+ }
+
+// Restore a position obtained from GetPos(); failure of fsetpos is caught
+// by assert only.
+void TextFile::SetPos(TEXTFILEPOS Pos)
+ {
+ fpos_t p = (fpos_t) Pos.uOffset;
+ int i = fsetpos(m_ptrFile, &p);
+ assert(0 == i);
+ m_uLineNr = Pos.uLineNr;
+ m_uColNr = Pos.uColNr;
+ }
+
+#else
+
+// ftell-based variant. The result of ftell is stored without an error
+// check and narrowed to unsigned.
+TEXTFILEPOS TextFile::GetPos()
+ {
+ TEXTFILEPOS Pos;
+ Pos.uOffset = ftell(m_ptrFile);
+ Pos.uLineNr = m_uLineNr;
+ Pos.uColNr = m_uColNr;
+ return Pos;
+ }
+
+// Restore a position obtained from GetPos(); the result of fseek is not
+// checked.
+void TextFile::SetPos(TEXTFILEPOS Pos)
+ {
+ fseek(m_ptrFile, Pos.uOffset, SEEK_SET);
+ m_uLineNr = Pos.uLineNr;
+ m_uColNr = Pos.uColNr;
+ }
+
+#endif
+
+// Read the next character into c.
+// Returns true on end-of-file (c is then not assigned), false otherwise.
+// A character stored by PushBack() is returned first, without updating the
+// line / column counters. A read error is reported through Quit().
+bool TextFile::GetChar(char &c)
+ {
+ // -1 marks "nothing pushed back".
+ // NOTE(review): a pushed-back char holding -1 (byte 0xff where char is
+ // signed) would be indistinguishable from the marker - confirm whether
+ // such input can occur.
+ if (-1 != m_cPushedBack)
+ {
+ c = (char) m_cPushedBack;
+ m_cPushedBack = -1;
+ return false;
+ }
+
+ int ic = fgetc(m_ptrFile);
+ if (ic < 0)
+ {
+ if (feof(m_ptrFile))
+ {
+ // Hack to fix up a non-empty text file that is missing
+ // and end-of-line character in the last line.
+ // One synthetic newline is returned; the next call then reports
+ // end-of-file because m_bLastCharWasEOL has been set.
+ if (!m_bLastCharWasEOL && m_uLineNr > 0)
+ {
+ c = '\n';
+ m_bLastCharWasEOL = true;
+ return false;
+ }
+ return true;
+ }
+ Quit("TextFile::GetChar, error %s", strerror(errno));
+ }
+ c = (char) ic;
+ // Track the position: a newline starts the next line at column 1.
+ if ('\n' == c)
+ {
+ m_bLastCharWasEOL = true;
+ ++m_uLineNr;
+ m_uColNr = 1;
+ }
+ else
+ {
+ m_bLastCharWasEOL = false;
+ ++m_uColNr;
+ }
+ return false;
+ }
+
+// GetChar for callers that require a character: end-of-file is reported
+// through Quit() instead of being returned.
+void TextFile::GetCharX(char &c)
+    {
+    if (GetChar(c))
+        Quit("End-of-file in GetCharX");
+    }
+
+// Read characters until one that is not white space is found and return it
+// in c. End-of-file before such a character is reported through Quit().
+void TextFile::GetNonblankChar(char &c)
+    {
+    do
+        {
+        bool bEof = GetChar(c);
+        if (bEof)
+            // The message used to name GetCharX, pointing at the wrong
+            // function.
+            Quit("End-of-file in GetNonblankChar");
+        }
+    // <ctype.h> classifiers need an unsigned char value.
+    while (isspace((unsigned char) c));
+    }
+
+// Advance to the start of the next line. Nothing is read when the previous
+// character was already a newline. End-of-file before a newline is reported
+// through Quit().
+void TextFile::SkipLine()
+    {
+    if (m_bLastCharWasEOL)
+        return;
+
+    char c;
+    do
+        {
+        if (GetChar(c))
+            Quit("End-of-file in SkipLine");
+        }
+    while ('\n' != c);
+    }
+
+// Skip white space. The first character that is not white space is pushed
+// back so that the next read returns it. End-of-file before such a
+// character is reported through Quit().
+void TextFile::SkipWhite()
+    {
+    for (;;)
+        {
+        char c;
+        bool bEof = GetChar(c);
+        if (bEof)
+            Quit("End-of-file in SkipWhite");
+        // <ctype.h> classifiers need an unsigned char value.
+        if (!isspace((unsigned char) c))
+            {
+            PushBack(c);
+            break;
+            }
+        }
+    }
Added: trunk/packages/muscle/branches/upstream/current/textfile.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/textfile.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/textfile.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,69 @@
+#ifndef TextFile_h
+#define TextFile_h
+
+#include <stdio.h>
+
+// Saved read position of a TextFile, as produced by TextFile::GetPos() and
+// consumed by TextFile::SetPos().
+struct TEXTFILEPOS
+ {
+ // Position within the stream, as an unsigned value.
+ unsigned uOffset;
+ // Line counter of the TextFile at the time of the snapshot.
+ unsigned uLineNr;
+ // Column counter of the TextFile at the time of the snapshot.
+ unsigned uColNr;
+ };
+
+const unsigned TextFileBufferSize = 256;
+
+// Thin wrapper around a stdio stream that adds line / column tracking, a
+// one-character push-back and a few token-level read helpers.
+// The object owns a copy of the file name. It holds raw pointers and does
+// not define copy operations, so instances should not be copied.
+class TextFile
+    {
+private:
+// no default c'tor, not implemented
+    TextFile();
+
+public:
+    virtual ~TextFile();
+
+    // Open by name; "-" selects stdin or stdout depending on bWrite.
+    TextFile(const char szFileName[], bool bWrite = false);
+    // Wrap an already-open stream.
+    TextFile(FILE *ptrFile, const char *ptrFileName = "-");
+    // Close the stream now. Safe to call more than once: the pointer is
+    // cleared after the first call and later calls do nothing, where a
+    // second call used to pass a null stream to fclose.
+    void Close() { if (0 != m_ptrFile) { fclose(m_ptrFile); m_ptrFile = 0; } }
+
+    // Line readers. The bool results are true on end-of-file; the X
+    // variants report end-of-file as an error instead.
+    bool GetLine(char szLine[], unsigned uBytes);
+    bool GetTrimLine(char szLine[], unsigned uBytes);
+    void GetLineX(char szLine[], unsigned uBytes);
+
+    // Token readers. Characters in szCharTokens are tokens by themselves.
+    bool GetToken(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}");
+    void GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}");
+
+    void Skip();
+    void SkipLine();
+    void SkipWhite();
+    void Rewind();
+    TEXTFILEPOS GetPos();
+    void SetPos(TEXTFILEPOS Pos);
+    bool GetChar(char &c);
+    void GetCharX(char &c);
+    void GetNonblankChar(char &c);
+
+    unsigned GetLineNr() { return m_uLineNr; }
+
+    void PutString(const char szLine[]);
+    void PutFormat(const char szFormat[], ...);
+    void PutChar(char c);
+
+    const char *GetFileName() { return m_ptrName; }
+
+    // Store one character to be returned by the next GetChar(). Only one
+    // character is remembered; a second call replaces the first.
+    void PushBack(int c) { m_cPushedBack = c; }
+
+    FILE *GetStdioFile() const { return m_ptrFile; }
+
+private:
+    void Init(FILE *ptrFile, const char *ptrFileName);
+
+private:
+    FILE *m_ptrFile;            // underlying stream, 0 after Close()
+    unsigned m_uLineNr;         // current line number, starts at 1
+    unsigned m_uColNr;          // current column counter
+    char *m_ptrName;            // owned copy of the file name
+    bool m_bLastCharWasEOL;     // true if the last character read was '\n'
+    int m_cPushedBack;          // pushed-back character, -1 if none
+    };
+
+#endif // TextFile_h
Added: trunk/packages/muscle/branches/upstream/current/threewaywt.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/threewaywt.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/threewaywt.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,342 @@
+#include "muscle.h"
+#include "tree.h"
+#include <math.h>
+
+#define TRACE 0
+
+/***
+Sequence weights derived from a tree using Gotoh's
+three-way method.
+
+ Gotoh (1995) CABIOS 11(5), 543-51.
+
+Each edge e is assigned a weight w(e).
+
+Consider first a tree with three leaves A,B and C
+having branch lengths a, b and c, as follows.
+
+ B
+ |
+ b
+ |
+ A---a---R---c---C
+
+The internal node is denoted by R.
+
+Define:
+
+ S = (bc + ca + ab)
+ x = bc(a + b)(a + c)
+ y = a(b + c)FS
+
+Here F is a tunable normalization factor which is
+approximately 1.0. Then the edge weight for AR
+is computed as:
+
+ w(AR) = sqrt(x/y)
+
+Similar expressions for the other edges follow by
+symmetry.
+
+For a tree with more than three edges, the weight
+of an edge that ends in a leaf is computed from
+the three-way tree that includes the edge and
+its two neighbors. The weight of an internal edge
+is computed as the product of the weights for that
+edge derived from the two three-way subtrees that
+include that edge.
+
+For example, consider the following tree.
+
+ B
+ |
+ A--R--V--C
+ |
+ D
+
+Here, w(RV) is computed as the product of the
+two values for w(RV) derived from the three-way
+trees with leaves ABV and RCD respectively.
+
+The calculation is done using "Gotoh lengths",
+not the real edge lengths.
+
+The Gotoh length G of a directed edge is calculated
+recursively as:
+
+ G = d + LR/(L + R)
+
+where d is the length of the edge, and L and R are
+the Gotoh lengths of the left and right edges adjoining
+the terminal end of the edge. If the edge terminates on
+a leaf, then G=d.
+
+Pairwise sequence weights are computed as the
+product of edge weights on the path that connects
+their leaves.
+
+If the tree is split into two subtrees by deleting
+a given edge e, then the pairwise weights factorize.
+For operations on profiles formed from the two
+subtrees, it is possible to assign a weight to a
+sequence as the product of edge weights on a path
+from e to its leaf.
+***/
+
+// The xxxUnrooted helpers view a rooted tree as though the root node had
+// been removed, so that the two nodes below the root are directly joined.
+//
+// Return the first neighbor of uNode1 other than uNode2 in that view.
+// Neither argument may be the root. If the nodes are not joined by an edge
+// they must both be children of the root.
+static unsigned GetFirstNeighborUnrooted(const Tree &tree, unsigned uNode1,
+  unsigned uNode2)
+    {
+    if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2))
+        Quit("GetFirstNeighborUnrooted, should never be called with root");
+
+    if (tree.IsEdge(uNode1, uNode2))
+        {
+        const unsigned uCandidate = tree.GetFirstNeighbor(uNode1, uNode2);
+        if (!tree.IsRoot(uCandidate))
+            return uCandidate;
+        // Step across the root to the node on its other side.
+        return tree.GetFirstNeighbor(uCandidate, uNode1);
+        }
+
+    // No real edge: the pair is only adjacent once the root is removed.
+    const bool bBothBelowRoot = tree.IsRoot(tree.GetParent(uNode1)) &&
+      tree.IsRoot(tree.GetParent(uNode2));
+    if (!bBothBelowRoot)
+        Quit("GetFirstNeighborUnrooted, not edge");
+    return tree.GetFirstNeighbor(uNode1, tree.GetRootNodeIndex());
+    }
+
+// Return the second neighbor of uNode1 other than uNode2, treating the tree
+// as if the root node had been removed.
+// Neither argument may be the root. If the nodes are not joined by an edge
+// they must both be children of the root.
+static unsigned GetSecondNeighborUnrooted(const Tree &tree, unsigned uNode1,
+  unsigned uNode2)
+    {
+    // The error messages in this function used to name
+    // GetFirstNeighborUnrooted, pointing at the wrong function.
+    if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2))
+        Quit("GetSecondNeighborUnrooted, should never be called with root");
+    if (!tree.IsEdge(uNode1, uNode2))
+        {
+        if (!tree.IsRoot(tree.GetParent(uNode1)) ||
+          !tree.IsRoot(tree.GetParent(uNode2)))
+            Quit("GetSecondNeighborUnrooted, not edge");
+        const unsigned uRoot = tree.GetRootNodeIndex();
+        return tree.GetSecondNeighbor(uNode1, uRoot);
+        }
+
+    unsigned uNeighbor = tree.GetSecondNeighbor(uNode1, uNode2);
+    // Step across the root to the node on its other side.
+    if (tree.IsRoot(uNeighbor))
+        return tree.GetFirstNeighbor(uNeighbor, uNode1);
+    return uNeighbor;
+    }
+
+// Return neighbor number uSub of uNode1, treating the tree as if the root
+// node had been removed: when the neighbor is the root, the node on the
+// other side of the root is returned instead.
+// Returns NULL_NEIGHBOR when uNode1 has no neighbor in that slot.
+static unsigned GetNeighborUnrooted(const Tree &tree, unsigned uNode1,
+  unsigned uSub)
+    {
+    unsigned uNeighbor = tree.GetNeighbor(uNode1, uSub);
+    // Callers test the result against NULL_NEIGHBOR, so an empty slot is
+    // possible; do not pass that sentinel on to IsRoot().
+    if (NULL_NEIGHBOR == uNeighbor)
+        return NULL_NEIGHBOR;
+    if (tree.IsRoot(uNeighbor))
+        return tree.GetFirstNeighbor(uNeighbor, uNode1);
+    return uNeighbor;
+    }
+
+// Return the neighbor slot (0..2) of uNode1 that leads to uNode2 when the
+// tree is viewed with its root removed.
+// For a real edge the tree's own subscript is returned. Otherwise both
+// nodes must be children of the root and the slot is found by search.
+static unsigned GetNeighborSubscriptUnrooted(const Tree &tree, unsigned uNode1,
+  unsigned uNode2)
+    {
+    if (tree.IsEdge(uNode1, uNode2))
+        return tree.GetNeighborSubscript(uNode1, uNode2);
+
+    const bool bBothBelowRoot = tree.IsRoot(tree.GetParent(uNode1)) &&
+      tree.IsRoot(tree.GetParent(uNode2));
+    if (!bBothBelowRoot)
+        Quit("GetNeighborSubscriptUnrooted, not edge");
+
+    for (unsigned uSlot = 0; uSlot < 3; ++uSlot)
+        {
+        if (uNode2 == GetNeighborUnrooted(tree, uNode1, uSlot))
+            return uSlot;
+        }
+
+    Quit("GetNeighborSubscriptUnrooted, not a neighbor");
+    return NULL_NEIGHBOR;
+    }
+
+// Length of the edge uNode1-uNode2 when the tree is viewed with its root
+// removed. For a real edge this is the tree's edge length. For the two
+// children of the root it is the sum of their two edges to the root.
+// Neither argument may be the root.
+static double GetEdgeLengthUnrooted(const Tree &tree, unsigned uNode1,
+  unsigned uNode2)
+    {
+    if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2))
+        Quit("GetEdgeLengthUnrooted, should never be called with root");
+
+    if (tree.IsEdge(uNode1, uNode2))
+        return tree.GetEdgeLength(uNode1, uNode2);
+
+    const bool bBothBelowRoot = tree.IsRoot(tree.GetParent(uNode1)) &&
+      tree.IsRoot(tree.GetParent(uNode2));
+    if (!bBothBelowRoot)
+        Quit("GetEdgeLengthUnrooted, not edge");
+
+    const unsigned uRoot = tree.GetRootNodeIndex();
+    return tree.GetEdgeLength(uNode1, uRoot) +
+      tree.GetEdgeLength(uNode2, uRoot);
+    }
+
+// Gotoh length of the directed edge R->A:
+//
+//    G = d + LR/(L + R)
+//
+// d is the (unrooted) length of the edge, clamped to be non-negative, and
+// L, R are the Gotoh lengths of the two edges leaving A away from R. For a
+// leaf A the result is d alone. When L + R is zero the second term is
+// taken to be zero.
+double GetGotohLength(const Tree &tree, unsigned R, unsigned A)
+    {
+    double dEdge = GetEdgeLengthUnrooted(tree, R, A);
+    if (dEdge < 0)
+        dEdge = 0;
+
+    if (tree.IsLeaf(A))
+        return dEdge;
+
+    const unsigned uChild1 = GetFirstNeighborUnrooted(tree, A, R);
+    const unsigned uChild2 = GetSecondNeighborUnrooted(tree, A, R);
+    const double dLen1 = GetGotohLength(tree, A, uChild1);
+    const double dLen2 = GetGotohLength(tree, A, uChild2);
+
+    const double dTotal = dLen1 + dLen2;
+    double dCombined = 0;
+    if (dTotal != 0)
+        dCombined = (dLen1*dLen2)/dTotal;
+    return dEdge + dCombined;
+    }
+
+// Return weight of edge A-R in three-way subtree that has
+// leaves A,B,C and internal node R.
+// The branch lengths a, b, c are the Gotoh lengths of R->A, R->B and R->C,
+// and the weight is sqrt(x/y) with
+//    S = bc + ca + ab
+//    x = bc(a + b)(a + c)
+//    y = a(b + c)FS
+// R must not be a leaf; that case is reported through Quit().
+static double GotohWeightThreeWay(const Tree &tree, unsigned A,
+ unsigned B, unsigned C, unsigned R)
+ {
+ // Normalization factor F from the formula above.
+ const double F = 1.0;
+
+ if (tree.IsLeaf(R))
+ Quit("GotohThreeWay: R must be internal node");
+
+ double a = GetGotohLength(tree, R, A);
+ double b = GetGotohLength(tree, R, B);
+ double c = GetGotohLength(tree, R, C);
+
+ double S = b*c + c*a + a*b;
+ double x = b*c*(a + b)*(a + c);
+ double y = a*(b + c)*F*S;
+
+// With non-negative lengths y is zero when a is zero or when b and c are
+// both zero. Return a neutral weight of 1 instead of dividing by a
+// vanishing y.
+ if (y < 0.001)
+ return 1.0;
+ return sqrt(x/y);
+ }
+
+// Weight of the edge uNodeIndex1-uNodeIndex2.
+// Each end of the edge that is an internal node contributes the three-way
+// weight of the subtree centred on that end; a leaf end contributes 1. The
+// result is the product of the two contributions.
+static double GotohWeightEdge(const Tree &tree, unsigned uNodeIndex1,
+  unsigned uNodeIndex2)
+    {
+    double dWeightAt1 = 1.0;
+    if (!tree.IsLeaf(uNodeIndex1))
+        {
+        // Three-way subtree centred on node 1, looking towards node 2.
+        const unsigned uOther1 = GetFirstNeighborUnrooted(tree, uNodeIndex1, uNodeIndex2);
+        const unsigned uOther2 = GetSecondNeighborUnrooted(tree, uNodeIndex1, uNodeIndex2);
+        dWeightAt1 = GotohWeightThreeWay(tree, uNodeIndex2, uOther1, uOther2, uNodeIndex1);
+        }
+
+    double dWeightAt2 = 1.0;
+    if (!tree.IsLeaf(uNodeIndex2))
+        {
+        // Three-way subtree centred on node 2, looking towards node 1.
+        const unsigned uOther1 = GetFirstNeighborUnrooted(tree, uNodeIndex2, uNodeIndex1);
+        const unsigned uOther2 = GetSecondNeighborUnrooted(tree, uNodeIndex2, uNodeIndex1);
+        dWeightAt2 = GotohWeightThreeWay(tree, uNodeIndex1, uOther1, uOther2, uNodeIndex2);
+        }
+
+    return dWeightAt1*dWeightAt2;
+    }
+
+// Fill EdgeWeights with the Gotoh weight of every edge of the tree, viewed
+// with its root removed.
+// EdgeWeights[node][sub] receives the weight of the edge from node to its
+// neighbor number sub (sub in 0..2); each weight is stored at both ends of
+// the edge. The root's row and empty neighbor slots are not written.
+void CalcThreeWayEdgeWeights(const Tree &tree, WEIGHT **EdgeWeights)
+ {
+ const unsigned uNodeCount = tree.GetNodeCount();
+ for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1)
+ {
+ if (tree.IsRoot(uNodeIndex1))
+ continue;
+ for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1)
+ {
+ const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1);
+ if (NULL_NEIGHBOR == uNodeIndex2)
+ continue;
+
+ // Avoid computing same edge twice in reversed order
+ if (uNodeIndex2 < uNodeIndex1)
+ continue;
+
+ const WEIGHT w = (WEIGHT) GotohWeightEdge(tree, uNodeIndex1, uNodeIndex2);
+ // Slot of node 2 that leads back to node 1.
+ const unsigned uSub2 = GetNeighborSubscriptUnrooted(tree, uNodeIndex2, uNodeIndex1);
+#if DEBUG
+ {
+ // Consistency checks: the slots are mutual and the weight does not
+ // depend on the direction in which the edge is given.
+ assert(uNodeIndex2 == GetNeighborUnrooted(tree, uNodeIndex1, uSub1));
+ assert(uNodeIndex1 == GetNeighborUnrooted(tree, uNodeIndex2, uSub2));
+ const WEIGHT wRev = (WEIGHT) GotohWeightEdge(tree, uNodeIndex2, uNodeIndex1);
+ if (!BTEq(w, wRev))
+ Quit("CalcThreeWayWeights: rev check failed %g %g",
+ w, wRev);
+ }
+#endif
+ EdgeWeights[uNodeIndex1][uSub1] = w;
+ EdgeWeights[uNodeIndex2][uSub2] = w;
+ }
+ }
+ }
+
+// Walk the subtree reached through the directed edge uNode1->uNode2 and
+// store a weight for every leaf in it, indexed by leaf id.
+// At a leaf the stored value is dPathWeight plus the length of the final
+// edge; at an internal node dPathWeight is multiplied by the edge length
+// before descending into both onward neighbors.
+// Neither node may be the root.
+static void SetSeqWeights(const Tree &tree, unsigned uNode1, unsigned uNode2,
+  double dPathWeight, WEIGHT *Weights)
+    {
+    if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2))
+        Quit("SetSeqWeights, should never be called with root");
+
+    const double dEdgeLength = GetEdgeLengthUnrooted(tree, uNode1, uNode2);
+    if (!tree.IsLeaf(uNode2))
+        {
+        const unsigned uNext1 = GetFirstNeighborUnrooted(tree, uNode2, uNode1);
+        const unsigned uNext2 = GetSecondNeighborUnrooted(tree, uNode2, uNode1);
+        dPathWeight *= dEdgeLength;
+        SetSeqWeights(tree, uNode2, uNext1, dPathWeight, Weights);
+        SetSeqWeights(tree, uNode2, uNext2, dPathWeight, Weights);
+        return;
+        }
+
+    const unsigned uLeafId = tree.GetLeafId(uNode2);
+    Weights[uLeafId] = (WEIGHT) (dPathWeight + dEdgeLength);
+    }
+
+// Compute sequence weights for the two subtrees on either side of the edge
+// uNode1-uNode2 and store them in Weights, indexed by leaf id.
+// If one of the two nodes is the root it is replaced by the root's other
+// neighbor, so that the edge is expressed in the unrooted view.
+// NOTE(review): the EdgeWeights table built here is only read by the TRACE
+// listing; SetSeqWeights does not receive it - confirm this is intended.
+void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2,
+ WEIGHT *Weights)
+ {
+#if TRACE
+ Log("CalcThreeWayEdgeWeights\n");
+ tree.LogMe();
+#endif
+
+ if (tree.IsRoot(uNode1))
+ uNode1 = tree.GetFirstNeighbor(uNode1, uNode2);
+ else if (tree.IsRoot(uNode2))
+ uNode2 = tree.GetFirstNeighbor(uNode2, uNode1);
+ // One row of three neighbor slots per node; freed at the end.
+ const unsigned uNodeCount = tree.GetNodeCount();
+ WEIGHT **EdgeWeights = new WEIGHT *[uNodeCount];
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ EdgeWeights[uNodeIndex] = new WEIGHT[3];
+
+ CalcThreeWayEdgeWeights(tree, EdgeWeights);
+
+#if TRACE
+ {
+ Log("Node1 Node2 Length Gotoh EdgeWt\n");
+ Log("----- ----- ------ ------ ------\n");
+ for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1)
+ {
+ if (tree.IsRoot(uNodeIndex1))
+ continue;
+ for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1)
+ {
+ const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1);
+ if (NULL_NEIGHBOR == uNodeIndex2)
+ continue;
+ if (uNodeIndex2 < uNodeIndex1)
+ continue;
+ const WEIGHT ew = EdgeWeights[uNodeIndex1][uSub1];
+ const double d = GetEdgeLengthUnrooted(tree, uNodeIndex1, uNodeIndex2);
+ const double g = GetGotohLength(tree, uNodeIndex1, uNodeIndex2);
+ Log("%5u %5u %6.3f %6.3f %6.3f\n", uNodeIndex1, uNodeIndex2, d, g, ew);
+ }
+ }
+ }
+#endif
+
+ // Weight the leaves on each side of the edge in turn.
+ SetSeqWeights(tree, uNode1, uNode2, 0.0, Weights);
+ SetSeqWeights(tree, uNode2, uNode1, 0.0, Weights);
+
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+ delete[] EdgeWeights[uNodeIndex];
+ delete[] EdgeWeights;
+ }
Added: trunk/packages/muscle/branches/upstream/current/timing.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/timing.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/timing.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,24 @@
+#if WIN32
+
+typedef unsigned __int64 TICKS;
+
+#pragma warning(disable:4035)
+// Read the processor's time-stamp counter: the two emitted bytes 0x0f 0x31
+// form the RDTSC instruction.
+// There is deliberately no return statement; the value is whatever the
+// instruction leaves in the return registers, which is why warning 4035 is
+// disabled just above.
+inline TICKS GetClockTicks()
+ {
+ _asm
+ {
+ _emit 0x0f
+ _emit 0x31
+ }
+ }
+
+#define StartTimer() __int64 t1__ = GetClockTicks()
+
+#define GetElapsedTicks() (GetClockTicks() - t1__)
+
+// Convert a tick count to seconds by dividing by a fixed 2.5e9.
+// NOTE(review): presumably this assumes a 2.5 GHz counter - confirm before
+// relying on the absolute values.
+static double TicksToSecs(TICKS t)
+ {
+ return (__int64) t/2.5e9;
+ }
+
+#endif // WIN32
Added: trunk/packages/muscle/branches/upstream/current/traceback.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/traceback.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/traceback.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,208 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+#include <math.h>
+
+#define TRACE 0
+
+// Approximate score equality, with a tolerance of 0.1. The arguments are
+// parenthesized so that expression arguments keep their meaning.
+#define EQ(a, b) (fabs((a) - (b)) < 0.1)
+
+// Recover an alignment path from filled dynamic-programming matrices.
+//
+// DPM_, DPD_ and DPI_ are the M, D and I matrices for profiles PA (length
+// uLengthA) and PB (length uLengthB); both lengths must be non-zero.
+// The walk starts in the last cell, in whichever state scores best after
+// the final gap-close scores have been added to D and I, and moves
+// backwards. At each step the predecessor state is the one whose
+// recomputed score matches the stored cell (within the EQ tolerance), and
+// the current edge is prepended to Path. The walk ends when the
+// predecessor is the start state 'S'. Failure to find a matching
+// predecessor is reported through Quit().
+// Returns the score of the state the walk started in.
+SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ PWPath &Path)
+ {
+#if TRACE
+ Log("\n");
+ Log("TraceBack LengthA=%u LengthB=%u\n", uLengthA, uLengthB);
+#endif
+ assert(uLengthB > 0 && uLengthA > 0);
+
+ const unsigned uPrefixCountA = uLengthA + 1;
+ const unsigned uPrefixCountB = uLengthB + 1;
+
+ Path.Clear();
+
+ unsigned uPrefixLengthA = uLengthA;
+ unsigned uPrefixLengthB = uLengthB;
+
+ const SCORE scoreM = DPM(uPrefixLengthA, uPrefixLengthB);
+ SCORE scoreD = DPD(uPrefixLengthA, uPrefixLengthB);
+ SCORE scoreI = DPI(uPrefixLengthA, uPrefixLengthB);
+
+ const ProfPos &LastPPA = PA[uLengthA - 1];
+ const ProfPos &LastPPB = PB[uLengthB - 1];
+
+ // A path that ends in a gap state still has to close that gap.
+ scoreD += LastPPA.m_scoreGapClose;
+ scoreI += LastPPB.m_scoreGapClose;
+
+ // Pick the final state; ties are resolved in the order M, D, I.
+ char cEdgeType = cInsane;
+ SCORE scoreMax;
+ if (scoreM >= scoreD && scoreM >= scoreI)
+ {
+ scoreMax = scoreM;
+ cEdgeType = 'M';
+ }
+ else if (scoreD >= scoreM && scoreD >= scoreI)
+ {
+ scoreMax = scoreD;
+ cEdgeType = 'D';
+ }
+ else
+ {
+ assert(scoreI >= scoreM && scoreI >= scoreD);
+ scoreMax = scoreI;
+ cEdgeType = 'I';
+ }
+
+ for (;;)
+ {
+ if ('S' == cEdgeType)
+ break;
+
+ PWEdge Edge;
+ Edge.cType = cEdgeType;
+ Edge.uPrefixLengthA = uPrefixLengthA;
+ Edge.uPrefixLengthB = uPrefixLengthB;
+ Path.PrependEdge(Edge);
+
+ char cPrevEdgeType;
+ unsigned uPrevPrefixLengthA = uPrefixLengthA;
+ unsigned uPrevPrefixLengthB = uPrefixLengthB;
+
+ switch (cEdgeType)
+ {
+ case 'M':
+ {
+ assert(uPrefixLengthA > 0);
+ assert(uPrefixLengthB > 0);
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+ const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB);
+ const SCORE scoreMatch = ScoreProfPos2(PPA, PPB);
+
+ // A match can follow the start state only in cell (1, 1).
+ SCORE scoreSM;
+ if (1 == uPrefixLengthA && 1 == uPrefixLengthB)
+ scoreSM = scoreMatch;
+ else
+ scoreSM = MINUS_INFINITY;
+
+ SCORE scoreMM = MINUS_INFINITY;
+ SCORE scoreDM = MINUS_INFINITY;
+ SCORE scoreIM = MINUS_INFINITY;
+ if (uPrefixLengthA > 1 && uPrefixLengthB > 1)
+ scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + scoreMatch;
+ if (uPrefixLengthA > 1)
+ {
+ SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose;
+ scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch;
+ }
+ if (uPrefixLengthB > 1)
+ {
+ SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose;
+ scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch;
+ }
+
+ if (EQ(scoreMM, Score))
+ cPrevEdgeType = 'M';
+ else if (EQ(scoreDM, Score))
+ cPrevEdgeType = 'D';
+ else if (EQ(scoreIM, Score))
+ cPrevEdgeType = 'I';
+ else if (EQ(scoreSM, Score))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack: failed to match M score=%g M=%g D=%g I=%g S=%g",
+ Score, scoreMM, scoreDM, scoreIM, scoreSM);
+
+ --uPrevPrefixLengthA;
+ --uPrevPrefixLengthB;
+ break;
+ }
+
+ case 'D':
+ {
+ assert(uPrefixLengthA > 0);
+ const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB);
+
+ SCORE scoreMD = MINUS_INFINITY;
+ SCORE scoreDD = MINUS_INFINITY;
+ SCORE scoreSD = MINUS_INFINITY;
+ // Column 0: the path consists of deletes only, back to the start.
+ if (uPrefixLengthB == 0)
+ {
+ if (uPrefixLengthA == 1)
+ scoreSD = PA[0].m_scoreGapOpen;
+ else
+ scoreSD = DPD(uPrefixLengthA - 1, 0);
+ }
+ if (uPrefixLengthA > 1)
+ {
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ SCORE scoreTransMD = PPA.m_scoreGapOpen;
+ scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD;
+ scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB);
+ }
+
+ if (EQ(Score, scoreMD))
+ cPrevEdgeType = 'M';
+ else if (EQ(Score, scoreDD))
+ cPrevEdgeType = 'D';
+ else if (EQ(Score, scoreSD))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack: failed to match D");
+
+ --uPrevPrefixLengthA;
+ break;
+ }
+
+ case 'I':
+ {
+ assert(uPrefixLengthB > 0);
+ const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB);
+
+ SCORE scoreMI = MINUS_INFINITY;
+ SCORE scoreII = MINUS_INFINITY;
+ SCORE scoreSI = MINUS_INFINITY;
+ // Row 0: the path consists of inserts only, back to the start.
+ if (uPrefixLengthA == 0)
+ {
+ if (uPrefixLengthB == 1)
+ scoreSI = PB[0].m_scoreGapOpen;
+ else
+ scoreSI = DPI(0, uPrefixLengthB - 1);
+ }
+ if (uPrefixLengthB > 1)
+ {
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+ SCORE scoreTransMI = PPB.m_scoreGapOpen;
+ scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI;
+ scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1);
+ }
+
+ if (EQ(Score, scoreMI))
+ cPrevEdgeType = 'M';
+ else if (EQ(Score, scoreII))
+ cPrevEdgeType = 'I';
+ else if (EQ(Score, scoreSI))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack: failed to match I");
+
+ --uPrevPrefixLengthB;
+ break;
+ }
+
+ default:
+ assert(false);
+ }
+#if TRACE
+ Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB);
+ Log("\n");
+#endif
+ // Step back to the predecessor cell and state.
+ cEdgeType = cPrevEdgeType;
+ uPrefixLengthA = uPrevPrefixLengthA;
+ uPrefixLengthB = uPrevPrefixLengthB;
+ }
+
+ return scoreMax;
+ }
Added: trunk/packages/muscle/branches/upstream/current/tracebackopt.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/tracebackopt.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/tracebackopt.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,73 @@
+#include "muscle.h"
+#include "pwpath.h"
+
+// Expand a run-length encoded trace-back matrix into an explicit path.
+//
+// TraceBack[i][j] is read at the current prefix lengths (i, j):
+//   0   -> a single match edge ('M'),
+//   > 0 -> that many consecutive 'D' edges (each consumes a column of A),
+//   < 0 -> that many consecutive 'I' edges (each consumes a column of B).
+// After a gap run that does not reach (0, 0), a match edge follows.
+// The walk starts at (uLengthA, uLengthB) and ends at (0, 0); edges are
+// prepended, so Path is left in start-to-end order.
+void TraceBackToPath(int **TraceBack, unsigned uLengthA,
+  unsigned uLengthB, PWPath &Path)
+    {
+    Path.Clear();
+
+    PWEdge Edge;
+    Edge.uPrefixLengthA = uLengthA;
+    Edge.uPrefixLengthB = uLengthB;
+
+    while (Edge.uPrefixLengthA != 0 || Edge.uPrefixLengthB != 0)
+        {
+        int iRun = TraceBack[Edge.uPrefixLengthA][Edge.uPrefixLengthB];
+#if TRACE
+        Log("TraceBack[%u][%u] = %d\n",
+          Edge.uPrefixLengthA, Edge.uPrefixLengthB, iRun);
+#endif
+        if (iRun > 0)
+            {
+        // Run of deletes: step back along A only.
+            Edge.cType = 'D';
+            for (; iRun > 0; --iRun)
+                {
+                assert(Edge.uPrefixLengthA > 0);
+
+                Path.PrependEdge(Edge);
+                --(Edge.uPrefixLengthA);
+                }
+            }
+        else if (iRun < 0)
+            {
+        // Run of inserts: step back along B only.
+            Edge.cType = 'I';
+            for (; iRun < 0; ++iRun)
+                {
+                assert(Edge.uPrefixLengthB > 0);
+
+                Path.PrependEdge(Edge);
+                --(Edge.uPrefixLengthB);
+                }
+            }
+
+    // Only a gap run can have moved us onto the origin here; for a
+    // zero entry the loop condition already guarantees we are not there.
+        if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB)
+            break;
+
+    // Match edge: steps back along both A and B.
+        assert(Edge.uPrefixLengthA > 0);
+        assert(Edge.uPrefixLengthB > 0);
+
+        Edge.cType = 'M';
+        Path.PrependEdge(Edge);
+        --(Edge.uPrefixLengthA);
+        --(Edge.uPrefixLengthB);
+        }
+
+#if TRACE
+    Log("TraceBackToPath ");
+    Path.LogMe();
+#endif
+    }
Added: trunk/packages/muscle/branches/upstream/current/tracebacksw.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/tracebacksw.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/tracebacksw.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,186 @@
+#include "muscle.h"
+#include "profile.h"
+#include "pwpath.h"
+#include <math.h>
+
+#define TRACE 0
+
+#define EQ(a, b) (fabs(a-b) < 0.1)
+
+// Trace back through the M/D/I score matrices of a local alignment
+// (the "SW" suffix presumably stands for Smith-Waterman -- confirm) and
+// build the corresponding path.
+//
+// The walk starts with an 'M' edge at (uPrefixLengthAMax, uPrefixLengthBMax)
+// and prepends one edge per iteration until the predecessor is the start
+// state 'S'. No back-pointers are stored: for each cell the candidate
+// predecessor scores are recomputed and compared with the stored score
+// using EQ (tolerance 0.1); the first candidate that matches wins, and
+// Quit() is called if none matches.
+//
+// DPM(), DPD() and DPI() are macros defined outside this function;
+// presumably they index DPM_, DPD_ and DPI_ using uPrefixCountA/B -- confirm.
+void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+ unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
+ unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path)
+ {
+#if TRACE
+ Log("\n");
+ Log("TraceBackSW LengthA=%u LengthB=%u PLAMax=%u PLBMax=%u\n",
+ uLengthA, uLengthB, uPrefixLengthAMax, uPrefixLengthBMax);
+#endif
+ assert(uLengthB > 0 && uLengthA > 0);
+
+ const unsigned uPrefixCountA = uLengthA + 1;
+ const unsigned uPrefixCountB = uLengthB + 1;
+
+ Path.Clear();
+
+ unsigned uPrefixLengthA = uPrefixLengthAMax;
+ unsigned uPrefixLengthB = uPrefixLengthBMax;
+
+ // scoreMax is read here but not used again in this function.
+ SCORE scoreMax = DPM(uPrefixLengthA, uPrefixLengthB);
+ char cEdgeType = 'M';
+
+ for (;;)
+ {
+ // 'S' is the start state: the path is complete.
+ if ('S' == cEdgeType)
+ break;
+
+ PWEdge Edge;
+ Edge.cType = cEdgeType;
+ Edge.uPrefixLengthA = uPrefixLengthA;
+ Edge.uPrefixLengthB = uPrefixLengthB;
+ Path.PrependEdge(Edge);
+
+ // NOTE(review): cPrevEdgeType stays uninitialized if Quit() ever returns.
+ char cPrevEdgeType;
+ unsigned uPrevPrefixLengthA = uPrefixLengthA;
+ unsigned uPrevPrefixLengthB = uPrefixLengthB;
+
+ switch (cEdgeType)
+ {
+ case 'M':
+ {
+ assert(uPrefixLengthA > 0);
+ assert(uPrefixLengthB > 0);
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+ const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB);
+ const SCORE scoreMatch = ScoreProfPos2(PPA, PPB);
+
+ // Start->Match is a candidate only in the first cell (1, 1).
+ SCORE scoreSM;
+ if (1 == uPrefixLengthA && 1 == uPrefixLengthB)
+ scoreSM = scoreMatch;
+ else
+ scoreSM = MINUS_INFINITY;
+
+ SCORE scoreMM = MINUS_INFINITY;
+ SCORE scoreDM = MINUS_INFINITY;
+ SCORE scoreIM = MINUS_INFINITY;
+ if (uPrefixLengthA > 1 && uPrefixLengthB > 1)
+ {
+ SCORE scoreTrans = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+ scoreMM = scoreTrans + scoreMatch;
+ }
+ if (uPrefixLengthA > 1)
+ {
+ SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose;
+ scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch;
+ }
+ if (uPrefixLengthB > 1)
+ {
+ SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose;
+ scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch;
+ }
+
+ // Order matters: with the 0.1 tolerance several candidates can match.
+ if (EQ(scoreMM, Score))
+ cPrevEdgeType = 'M';
+ else if (EQ(scoreDM, Score))
+ cPrevEdgeType = 'D';
+ else if (EQ(scoreIM, Score))
+ cPrevEdgeType = 'I';
+ else if (EQ(scoreSM, Score))
+ cPrevEdgeType = 'S';
+ // Cell score equals the bare match score: the alignment starts here.
+ else if (EQ(scoreMatch, Score))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack2: failed to match M score=%g M=%g D=%g I=%g S=%g",
+ Score, scoreMM, scoreDM, scoreIM, scoreSM);
+
+ --uPrevPrefixLengthA;
+ --uPrevPrefixLengthB;
+ break;
+ }
+
+ case 'D':
+ {
+ assert(uPrefixLengthA > 0);
+ const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB);
+
+ SCORE scoreMD = MINUS_INFINITY;
+ SCORE scoreDD = MINUS_INFINITY;
+ SCORE scoreSD = MINUS_INFINITY;
+ // Start->Delete is a candidate only when no column of B is consumed.
+ if (uPrefixLengthB == 0)
+ {
+ if (uPrefixLengthA == 1)
+ scoreSD = PA[0].m_scoreGapOpen;
+ else
+ scoreSD = DPD(uPrefixLengthA - 1, 0);
+ }
+ if (uPrefixLengthA > 1)
+ {
+ const ProfPos &PPA = PA[uPrefixLengthA - 1];
+ SCORE scoreTransMD = PPA.m_scoreGapOpen;
+ scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD;
+ scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB);
+ }
+
+ if (EQ(Score, scoreMD))
+ cPrevEdgeType = 'M';
+ else if (EQ(Score, scoreDD))
+ cPrevEdgeType = 'D';
+ else if (EQ(Score, scoreSD))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack2: failed to match D");
+
+ --uPrevPrefixLengthA;
+ break;
+ }
+
+ case 'I':
+ {
+ assert(uPrefixLengthB > 0);
+ const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB);
+
+ SCORE scoreMI = MINUS_INFINITY;
+ SCORE scoreII = MINUS_INFINITY;
+ SCORE scoreSI = MINUS_INFINITY;
+ // Start->Insert is a candidate only when no column of A is consumed.
+ if (uPrefixLengthA == 0)
+ {
+ if (uPrefixLengthB == 1)
+ scoreSI = PB[0].m_scoreGapOpen;
+ else
+ scoreSI = DPI(0, uPrefixLengthB - 1);
+ }
+ if (uPrefixLengthB > 1)
+ {
+ const ProfPos &PPB = PB[uPrefixLengthB - 1];
+ SCORE scoreTransMI = PPB.m_scoreGapOpen;
+ scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI;
+ scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1);
+ }
+
+ if (EQ(Score, scoreMI))
+ cPrevEdgeType = 'M';
+ else if (EQ(Score, scoreII))
+ cPrevEdgeType = 'I';
+ else if (EQ(Score, scoreSI))
+ cPrevEdgeType = 'S';
+ else
+ Quit("TraceBack2: failed to match I");
+
+ --uPrevPrefixLengthB;
+ break;
+ }
+
+ default:
+ assert(false);
+ }
+#if TRACE
+ Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB);
+ Log("\n");
+#endif
+ // Step to the predecessor cell and state.
+ cEdgeType = cPrevEdgeType;
+ uPrefixLengthA = uPrevPrefixLengthA;
+ uPrefixLengthB = uPrevPrefixLengthB;
+ }
+ }
Added: trunk/packages/muscle/branches/upstream/current/tree.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/tree.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/tree.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,339 @@
+#ifndef tree_h
+#define tree_h
+
+#include <limits.h>
+
+class Clust;
+
+const unsigned NULL_NEIGHBOR = UINT_MAX;
+
+// Token kinds used by Tree::GetToken and Tree::NTTStr (declared in the
+// Tree class); presumably these are the lexical tokens of the Newick
+// tree file format -- confirm against the GetToken implementation.
+enum NEWICK_TOKEN_TYPE
+ {
+ NTT_Unknown,
+
+// Returned from Tree::GetToken:
+ NTT_Lparen,
+ NTT_Rparen,
+ NTT_Colon,
+ NTT_Comma,
+ NTT_Semicolon,
+ NTT_String,
+
+// Following are never returned from Tree::GetToken:
+ NTT_SingleQuotedString,
+ NTT_DoubleQuotedString,
+ NTT_Comment
+ };
+
+// Binary tree stored as parallel per-node arrays.
+//
+// Every node has up to three neighbor slots; an unused slot holds
+// NULL_NEIGHBOR. In a rooted tree slot 1 is the parent, slot 2 the left
+// child and slot 3 the right child (see GetParent/GetLeft/GetRight).
+// The arrays are released with delete[] in Clear(), and each node name is
+// released with free().
+//
+// NOTE(review): no copy constructor or assignment operator is declared, so
+// the implicit ones copy the array pointers; use Copy() to duplicate a tree.
+class Tree
+    {
+public:
+// Construct an empty tree: no nodes, no arrays, not rooted.
+    Tree()
+        {
+        m_uNodeCount = 0;
+        m_uCacheCount = 0;
+        m_uNeighbor1 = 0;
+        m_uNeighbor2 = 0;
+        m_uNeighbor3 = 0;
+        m_dEdgeLength1 = 0;
+        m_dEdgeLength2 = 0;
+        m_dEdgeLength3 = 0;
+        m_dHeight = 0;
+        m_bHasEdgeLength1 = 0;
+        m_bHasEdgeLength2 = 0;
+        m_bHasEdgeLength3 = 0;
+        m_bHasHeight = 0;
+        m_ptrName = 0;
+        m_Ids = 0;
+    // Previously left uninitialized; GetLeafCount(), IsRooted() and
+    // IsRoot() read these on a freshly constructed tree.
+        m_bRooted = false;
+        m_uRootNodeIndex = 0;
+        }
+    virtual ~Tree()
+        {
+        Clear();
+        }
+
+// Release all per-node storage and return to the empty state.
+// Safe to call repeatedly: every pointer is nulled after delete[].
+    void Clear()
+        {
+        for (unsigned n = 0; n < m_uNodeCount; ++n)
+            free(m_ptrName[n]);
+
+        m_uNodeCount = 0;
+        m_uCacheCount = 0;
+
+        delete[] m_uNeighbor1;
+        delete[] m_uNeighbor2;
+        delete[] m_uNeighbor3;
+        delete[] m_dEdgeLength1;
+        delete[] m_dEdgeLength2;
+        delete[] m_dEdgeLength3;
+        delete[] m_bHasEdgeLength1;
+        delete[] m_bHasEdgeLength2;
+        delete[] m_bHasEdgeLength3;
+        delete[] m_ptrName;
+        delete[] m_Ids;
+        delete[] m_bHasHeight;
+        delete[] m_dHeight;
+
+        m_uNeighbor1 = 0;
+        m_uNeighbor2 = 0;
+        m_uNeighbor3 = 0;
+        m_dEdgeLength1 = 0;
+        m_dEdgeLength2 = 0;
+        m_dEdgeLength3 = 0;
+    // These three were deleted above but not reset, which left dangling
+    // pointers and a double delete[] on the next Clear() (for example
+    // an explicit Clear() followed by the destructor).
+        m_bHasEdgeLength1 = 0;
+        m_bHasEdgeLength2 = 0;
+        m_bHasEdgeLength3 = 0;
+        m_ptrName = 0;
+        m_Ids = 0;
+        m_uRootNodeIndex = 0;
+        m_bHasHeight = 0;
+        m_dHeight = 0;
+
+        m_bRooted = false;
+        }
+
+// Creation and manipulation
+    void CreateRooted();
+    void CreateUnrooted(double dEdgeLength);
+
+    void FromFile(TextFile &File);
+    void FromClust(Clust &C);
+
+    void Copy(const Tree &tree);
+
+    void Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[],
+      const unsigned Right[], const float LeftLength[], const float RightLength[],
+      const unsigned LeafIds[], char *LeafNames[]);
+    unsigned AppendBranch(unsigned uExistingNodeIndex);
+    void SetLeafName(unsigned uNodeIndex, const char *ptrName);
+    void SetLeafId(unsigned uNodeIndex, unsigned uId);
+    void SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2,
+      double dLength);
+
+    void RootUnrootedTree(unsigned uNodeIndex1, unsigned uNodeIndex2);
+    void RootUnrootedTree(ROOT Method);
+    void UnrootByDeletingRoot();
+
+// Saving to file
+    void ToFile(TextFile &File) const;
+
+// Accessor functions
+    unsigned GetNodeCount() const
+        {
+        return m_uNodeCount;
+        }
+
+// Leaf count derived from the node count: (N + 1)/2 when rooted,
+// (N + 2)/2 when unrooted.
+    unsigned GetLeafCount() const
+        {
+        if (m_bRooted)
+            {
+            assert(m_uNodeCount%2 == 1);
+            return (m_uNodeCount + 1)/2;
+            }
+        else
+            {
+            assert(m_uNodeCount%2 == 0);
+            return (m_uNodeCount + 2)/2;
+            }
+        }
+
+    unsigned GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const;
+
+    unsigned GetNeighbor1(unsigned uNodeIndex) const
+        {
+        assert(uNodeIndex < m_uNodeCount);
+        return m_uNeighbor1[uNodeIndex];
+        }
+
+    unsigned GetNeighbor2(unsigned uNodeIndex) const
+        {
+        assert(uNodeIndex < m_uNodeCount);
+        return m_uNeighbor2[uNodeIndex];
+        }
+
+    unsigned GetNeighbor3(unsigned uNodeIndex) const
+        {
+        assert(uNodeIndex < m_uNodeCount);
+        return m_uNeighbor3[uNodeIndex];
+        }
+
+// Rooted trees only: neighbor slot 1 is the parent.
+    unsigned GetParent(unsigned uNodeIndex) const
+        {
+        assert(m_bRooted && uNodeIndex < m_uNodeCount);
+        return m_uNeighbor1[uNodeIndex];
+        }
+
+    bool IsRooted() const
+        {
+        return m_bRooted;
+        }
+
+// Rooted trees only: neighbor slot 2 is the left child.
+    unsigned GetLeft(unsigned uNodeIndex) const
+        {
+        assert(m_bRooted && uNodeIndex < m_uNodeCount);
+        return m_uNeighbor2[uNodeIndex];
+        }
+
+// Rooted trees only: neighbor slot 3 is the right child.
+    unsigned GetRight(unsigned uNodeIndex) const
+        {
+        assert(m_bRooted && uNodeIndex < m_uNodeCount);
+        return m_uNeighbor3[uNodeIndex];
+        }
+
+    const char *GetName(unsigned uNodeIndex) const
+        {
+        assert(uNodeIndex < m_uNodeCount);
+        return m_ptrName[uNodeIndex];
+        }
+
+    unsigned GetRootNodeIndex() const
+        {
+        assert(m_bRooted);
+        return m_uRootNodeIndex;
+        }
+
+// Number of neighbor slots of the node that are not NULL_NEIGHBOR.
+    unsigned GetNeighborCount(unsigned uNodeIndex) const
+        {
+    // Same bounds check as the other per-node accessors.
+        assert(uNodeIndex < m_uNodeCount);
+        const unsigned n1 = m_uNeighbor1[uNodeIndex];
+        const unsigned n2 = m_uNeighbor2[uNodeIndex];
+        const unsigned n3 = m_uNeighbor3[uNodeIndex];
+        return (NULL_NEIGHBOR != n1) + (NULL_NEIGHBOR != n2) + (NULL_NEIGHBOR != n3);
+        }
+
+// A node is a leaf if it has exactly one neighbor, or if it is the
+// only node in the tree.
+    bool IsLeaf(unsigned uNodeIndex) const
+        {
+        assert(uNodeIndex < m_uNodeCount);
+        if (1 == m_uNodeCount)
+            return true;
+        return 1 == GetNeighborCount(uNodeIndex);
+        }
+
+    bool IsRoot(unsigned uNodeIndex) const
+        {
+        return IsRooted() && m_uRootNodeIndex == uNodeIndex;
+        }
+
+    unsigned GetLeafId(unsigned uNodeIndex) const;
+    unsigned GetLeafNodeIndex(const char *ptrName) const;
+    bool IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
+    bool HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
+    double GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
+    const char *GetLeafName(unsigned uNodeIndex) const;
+    unsigned GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const;
+    double GetNodeHeight(unsigned uNodeIndex) const;
+
+// Depth-first traversal
+    unsigned FirstDepthFirstNode() const;
+    unsigned NextDepthFirstNode(unsigned uNodeIndex) const;
+
+    unsigned FirstDepthFirstNodeR() const;
+    unsigned NextDepthFirstNodeR(unsigned uNodeIndex) const;
+
+// Equivalent of GetLeft/Right in unrooted tree, works in rooted tree too.
+    unsigned GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const;
+    unsigned GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const;
+
+// Getting parent node in unrooted tree defined iff leaf
+    unsigned GetLeafParent(unsigned uNodeIndex) const;
+
+// Misc
+    const char *NTTStr(NEWICK_TOKEN_TYPE NTT) const;
+    void FindCenterByLongestSpan(unsigned *ptrNodeIndex1,
+      unsigned *ptrNodeIndex2) const;
+    void PruneTree(const Tree &tree, unsigned Subfams[],
+      unsigned uSubfamCount);
+    unsigned LeafIndexToNodeIndex(unsigned uLeafIndex) const;
+
+// Debugging & trouble-shooting support
+    void Validate() const;
+    void ValidateNode(unsigned uNodeIndex) const;
+    void AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const;
+    void LogMe() const;
+
+private:
+    unsigned UnrootFromFile();
+
+// GetToken wrapper that also logs the token type and text.
+    NEWICK_TOKEN_TYPE GetTokenVerbose(TextFile &File, char szToken[],
+      unsigned uBytes) const
+        {
+        NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, uBytes);
+        Log("GetToken %10.10s %s\n", NTTStr(NTT), szToken);
+        return NTT;
+        }
+
+    void InitCache(unsigned uCacheCount);
+    void ExpandCache();
+    NEWICK_TOKEN_TYPE GetToken(TextFile &File, char szToken[], unsigned uBytes) const;
+    bool GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength);
+    unsigned GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2,
+      double *ptrdTotalDistance) const;
+    void ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const;
+    void ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const;
+    void OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex);
+    double FromClustNode(const Clust &C, unsigned uClustNodeIndex, unsigned uPhyNodeIndex);
+    unsigned GetAnyNonLeafNode() const;
+
+// Yuck. Data is made public for the convenience of Tree::Copy.
+// There has to be a better way.
+public:
+    unsigned m_uNodeCount;
+    unsigned m_uCacheCount;
+    unsigned *m_uNeighbor1;
+    unsigned *m_uNeighbor2;
+    unsigned *m_uNeighbor3;
+    double *m_dEdgeLength1;
+    double *m_dEdgeLength2;
+    double *m_dEdgeLength3;
+    double *m_dHeight;
+    bool *m_bHasEdgeLength1;
+    bool *m_bHasEdgeLength2;
+    bool *m_bHasEdgeLength3;
+    bool *m_bHasHeight;
+    unsigned *m_Ids;
+    char **m_ptrName;
+    bool m_bRooted;
+    unsigned m_uRootNodeIndex;
+    };
+
+// Iteration state passed by reference to PhyEnumBiParts / PhyEnumBiPartsR.
+// A new state is marked uninitialized and both node indexes are set to
+// NULL_NEIGHBOR.
+struct PhyEnumEdgeState
+    {
+    bool m_bInit;
+    unsigned m_uNodeIndex1;
+    unsigned m_uNodeIndex2;
+
+    PhyEnumEdgeState() :
+      m_bInit(false),
+      m_uNodeIndex1(NULL_NEIGHBOR),
+      m_uNodeIndex2(NULL_NEIGHBOR)
+        {
+        }
+    };
+
+const unsigned NODE_CHANGED = (unsigned) (~0);
+
+extern bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES,
+ unsigned Leaves1[], unsigned *ptruCount1,
+ unsigned Leaves2[], unsigned *ptruCount2);
+extern bool PhyEnumBiPartsR(const Tree &tree, PhyEnumEdgeState &ES,
+ unsigned Leaves1[], unsigned *ptruCount1,
+ unsigned Leaves2[], unsigned *ptruCount2);
+extern void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[],
+ unsigned *ptruSubtreeCount);
+void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount,
+ unsigned Subfams[], unsigned *ptruSubfamCount);
+void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[],
+ unsigned *ptruLeafCount);
+void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex,
+ unsigned uExclude, unsigned Leaves[], unsigned *ptruCount);
+void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]);
+void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength);
+void LeafIndexesToLeafNames(const Tree &tree, const unsigned Leaves[], unsigned uCount,
+ char *Names[]);
+void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount,
+ unsigned Ids[]);
+void MSASeqSubset(const MSA &msaIn, char *Names[], unsigned uSeqCount,
+ MSA &msaOut);
+void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,
+ unsigned IdToDiffsLeafNodeIndex[]);
+void DiffTreesE(const Tree &NewTree, const Tree &OldTree,
+ unsigned NewNodeIndexToOldNodeIndex[]);
+void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2,
+ double *ptrdLength1, double *ptrdLength2,
+ ROOT RootMethod);
+void FixRoot(Tree &tree, ROOT RootMethod);
+
+#endif // tree_h
Added: trunk/packages/muscle/branches/upstream/current/treefrommsa.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/treefrommsa.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/treefrommsa.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,55 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "clust.h"
+#include "clustsetmsa.h"
+#include "distcalc.h"
+
+// Build a tree from an alignment via the Clust class: wrap the MSA and the
+// distance measure in a ClustSetMSA, cluster it with the requested method,
+// then convert the clustering into `tree`.
+static void TreeFromMSA_NJ(const MSA &msa, Tree &tree, CLUSTER Cluster,
+  DISTANCE Distance)
+    {
+    MSADist Metric(Distance);
+    ClustSetMSA LeafSet(msa, Metric);
+
+    Clust Clustering;
+    Clustering.Create(LeafSet, Cluster);
+
+    tree.FromClust(Clustering);
+    }
+
+// Build a tree from an alignment with UPGMA2. The CLUSTER value selects the
+// linkage rule; any value other than the four UPGMA variants is reported
+// through Quit().
+static void TreeFromMSA_UPGMA(const MSA &msa, Tree &tree, CLUSTER Cluster,
+  DISTANCE Distance)
+    {
+    LINKAGE Linkage = LINKAGE_Undefined;
+    if (CLUSTER_UPGMA == Cluster)
+        Linkage = LINKAGE_Avg;
+    else if (CLUSTER_UPGMAMin == Cluster)
+        Linkage = LINKAGE_Min;
+    else if (CLUSTER_UPGMAMax == Cluster)
+        Linkage = LINKAGE_Max;
+    else if (CLUSTER_UPGMB == Cluster)
+        Linkage = LINKAGE_Biased;
+    else
+        Quit("TreeFromMSA_UPGMA, CLUSTER_%u not supported", Cluster);
+
+    DistCalcMSA Calc;
+    Calc.Init(msa, Distance);
+    UPGMA2(Calc, tree, Linkage);
+    }
+
+// Build a guide tree from an alignment. Neighbor-joining goes through
+// TreeFromMSA_NJ, every other clustering method through TreeFromMSA_UPGMA;
+// the result is then passed to FixRoot with the requested root method.
+void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster,
+  DISTANCE Distance, ROOT Root)
+    {
+    if (CLUSTER_NeighborJoining != Cluster)
+        TreeFromMSA_UPGMA(msa, tree, Cluster, Distance);
+    else
+        TreeFromMSA_NJ(msa, tree, Cluster, Distance);
+
+    FixRoot(tree, Root);
+    }
Added: trunk/packages/muscle/branches/upstream/current/types.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/types.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/types.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,117 @@
+#ifndef types_h
+#define types_h
+
+typedef unsigned char byte;
+typedef unsigned short ushort;
+
+typedef float SCOREMATRIX[32][32];
+typedef SCOREMATRIX *PTR_SCOREMATRIX;
+
+class MSA;
+class Seq;
+class ClusterTree;
+class DistFunc;
+class TextFile;
+class PWPath;
+class Tree;
+class SeqVect;
+class DistCalc;
+
+struct ProgNode;
+struct ProfPos;
+
+#if SINGLE_AFFINE
+// Compress M, D and I trace-back matrices into 4 bits
+// Layout, as given by the masks below: bits 0-1 (BIT_xM) select the
+// predecessor of M, bit 2 (BIT_xD) the predecessor of D and bit 3
+// (BIT_xI) the predecessor of I. Several enumerators share the value
+// zero, so they are only meaningful after masking with the matching BIT_x*.
+enum
+ {
+ BIT_MM = 0x00,
+ BIT_DM = 0x01,
+ BIT_IM = 0x02,
+ BIT_xM = 0x03,
+
+ BIT_DD = 0x00,
+ BIT_MD = 0x04,
+ // ID not allowed
+ BIT_xD = 0x04,
+
+ BIT_II = 0x00,
+ BIT_MI = 0x08,
+ // DI not allowed
+ BIT_xI = 0x08,
+ };
+
+#endif
+
+#if DOUBLE_AFFINE
+// Compress M, D, E, I and J trace-back matrices into 7 bits
+// Layout, as given by the masks below: bits 0-2 (BIT_xM) select the
+// predecessor of M; bits 3, 4, 5 and 6 (BIT_xD, BIT_xE, BIT_xI, BIT_xJ)
+// each select between two predecessors of D, E, I and J respectively.
+// Several enumerators share the value zero, so they are only meaningful
+// after masking with the matching BIT_x*.
+enum
+ {
+ BIT_MM = 0x00,
+ BIT_DM = 0x01,
+ BIT_EM = 0x02,
+ BIT_IM = 0x03,
+ BIT_JM = 0x04,
+ BIT_xM = 0x07,
+
+ BIT_DD = 0x00,
+ BIT_MD = 0x08,
+ // [EIJ]D not allowed
+ BIT_xD = 0x08,
+
+ BIT_EE = 0x00,
+ BIT_ME = 0x10,
+ // [DIJ]E not allowed
+ BIT_xE = 0x10,
+
+ BIT_II = 0x00,
+ BIT_MI = 0x20,
+ // [EDJ]I not allowed
+ BIT_xI = 0x20,
+
+ BIT_JJ = 0x00,
+ BIT_MJ = 0x40,
+ // [EDI]J not allowed
+ BIT_xJ = 0x40,
+ };
+#endif
+
+// Exit status values; presumably passed to exit() by the program's
+// termination paths -- confirm against the callers.
+enum EXIT
+ {
+ EXIT_Success = 0,
+ EXIT_NotStarted = 1,
+ EXIT_FatalError = 2,
+ EXIT_Except = 3,
+ };
+
+// Result of comparing a node of a new tree with an old tree.
+// NOTE(review): NODECMP_Undefined and NODECMP_Same are both 0, so the two
+// cannot be told apart at run time -- confirm this is intended.
+enum NODECMP
+ {
+ NODECMP_Undefined = 0,
+ NODECMP_Same = 0, // equivalent to node in old tree
+ NODECMP_Diff = 1, // equivalent & parent is changed
+ NODECMP_Changed = 2 // no equivalent node in old tree
+ };
+
+// Declare enums using macro hacks (see enums.h).
+#define s(t) enum t { t##_Undefined = 0,
+#define c(t, x) t##_##x,
+#define e(t) };
+#include "enums.h"
+
+// Declare conversion function XXXToStr(XXX x)
+// for each enum type XXX.
+#define s(t) const char *t##ToStr(t x);
+#define c(t, x) /* empty */
+#define e(t) /* empty */
+#include "enums.h"
+
+// Declare conversion function StrToXXX(const char *Str)
+// for each enum type XXX.
+#define s(t) t StrTo##t(const char *Str);
+#define c(t, x) /* empty */
+#define e(t) /* empty */
+#include "enums.h"
+
+const char *BoolToStr(bool b);
+const char *SecsToStr(unsigned long Secs);
+
+#endif // types_h
Added: trunk/packages/muscle/branches/upstream/current/typetostr.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/typetostr.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/typetostr.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,58 @@
+#include "muscle.h"
+#include <stdio.h>
+
+// Format a duration in seconds as "hh:mm:ss". The hours field is at least
+// two digits wide and grows as needed.
+//
+// Returns a pointer to a static buffer that is overwritten by the next
+// call, so the result must be used or copied before calling again.
+const char *SecsToStr(unsigned long Secs)
+    {
+// Sized for a 64-bit unsigned long: at most 16 digits of hours plus
+// ":mm:ss" and the terminating NUL is 23 bytes. The previous 16-byte
+// buffer could overflow for large values.
+    static char Str[32];
+
+    const unsigned long hh = Secs/(60*60);
+    const unsigned long mm = (Secs/60)%60;
+    const unsigned long ss = Secs%60;
+
+// %lu matches unsigned long; the previous %d with long arguments was a
+// format/argument mismatch (undefined behavior where long is wider
+// than int).
+    sprintf(Str, "%02lu:%02lu:%02lu", hh, mm, ss);
+    return Str;
+    }
+
+// Name of a boolean value: "True" or "False" (string literals).
+const char *BoolToStr(bool b)
+    {
+    if (b)
+        return "True";
+    return "False";
+    }
+
+// Format a score with "%8g"; scores at or below MINUS_INFINITY are shown
+// as a star.
+//
+// The result lives in one of 16 static slots used round-robin, so up to 16
+// results can be alive at once (e.g. several calls within one printf-like
+// argument list); older results are overwritten after that.
+const char *ScoreToStr(SCORE Score)
+    {
+    enum { SLOT_COUNT = 16, SLOT_BYTES = 16 };
+    static char Slots[SLOT_COUNT][SLOT_BYTES];
+    static int iSlot = 0;
+
+    if (MINUS_INFINITY >= Score)
+        return " *";
+
+    iSlot = (iSlot + 1)%SLOT_COUNT;
+    char *ptrSlot = Slots[iSlot];
+    sprintf(ptrSlot, "%8g", Score);
+    return ptrSlot;
+    }
+
+// Left-justified version of ScoreToStr: formats with "%.3g" and shows
+// scores at or below MINUS_INFINITY as "*".
+//
+// Uses its own set of 16 static slots, filled round-robin, so several
+// results can be alive at once within one printf-like argument list.
+const char *ScoreToStrL(SCORE Score)
+    {
+    enum { SLOT_COUNT = 16, SLOT_BYTES = 16 };
+    static char Slots[SLOT_COUNT][SLOT_BYTES];
+    static int iSlot = 0;
+
+    if (MINUS_INFINITY >= Score)
+        return "*";
+
+    iSlot = (iSlot + 1)%SLOT_COUNT;
+    char *ptrSlot = Slots[iSlot];
+    sprintf(ptrSlot, "%.3g", Score);
+    return ptrSlot;
+    }
+
+// Format a weight by delegating to ScoreToStr; the returned pointer is
+// whatever ScoreToStr returns for the same value.
+const char *WeightToStr(WEIGHT w)
+ {
+ return ScoreToStr(w);
+ }
Added: trunk/packages/muscle/branches/upstream/current/unixio.h
===================================================================
--- trunk/packages/muscle/branches/upstream/current/unixio.h 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/unixio.h 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,11 @@
+#ifdef WIN32
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+#if !defined(WIN32) && !defined(O_BINARY)
+#define O_BINARY 0
+#endif
Added: trunk/packages/muscle/branches/upstream/current/upgma2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/upgma2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/upgma2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,395 @@
+#include "muscle.h"
+#include "tree.h"
+#include "distcalc.h"
+
+// UPGMA clustering in O(N^2) time and space.
+
+#define TRACE 0
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define AVG(x, y) (((x) + (y))/2)
+
+static unsigned g_uLeafCount;
+static unsigned g_uTriangleSize;
+static unsigned g_uInternalNodeCount;
+static unsigned g_uInternalNodeIndex;
+
+// Triangular distance matrix is g_Dist, which is allocated
+// as a one-dimensional vector of length g_uTriangleSize.
+// TriangleSubscript(i,j) maps row,column=i,j to the subscript
+// into this vector.
+// Row / column coordinates are a bit messy.
+// Initially they are leaf indexes 0..N-1.
+// But each time we create a new node (=new cluster, new subtree),
+// we re-use one of the two rows that become available (the children
+// of the new node). This saves memory.
+// We keep track of this through the g_uNodeIndex vector.
+static dist_t *g_Dist;
+
+// Distance to nearest neighbor in row i of distance matrix.
+// Subscript is distance matrix row.
+static dist_t *g_MinDist;
+
+// Nearest neighbor to row i of distance matrix.
+// Subscript is distance matrix row.
+static unsigned *g_uNearestNeighbor;
+
+// Node index of row i in distance matrix.
+// Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
+// Subscript is distance matrix row.
+static unsigned *g_uNodeIndex;
+
+// The following vectors are defined on internal nodes,
+// subscripts are internal node index 0..N-2.
+// For g_uLeft/Right, value is the node index 0 .. 2N-2
+// because a child can be internal or leaf.
+static unsigned *g_uLeft;
+static unsigned *g_uRight;
+static dist_t *g_Height;
+static dist_t *g_LeftLength;
+static dist_t *g_RightLength;
+
+// Map an unordered pair of row indexes to its position in the triangular
+// distance vector: the larger index selects the row, the smaller one the
+// offset within it, giving smaller + larger*(larger - 1)/2.
+static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)
+    {
+#if DEBUG
+    if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
+        Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
+#endif
+    const bool bFirstIsRow = (uIndex1 >= uIndex2);
+    const unsigned uRow = bFirstIsRow ? uIndex1 : uIndex2;
+    const unsigned uCol = bFirstIsRow ? uIndex2 : uIndex1;
+
+    const unsigned v = uCol + (uRow*(uRow - 1))/2;
+    assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
+    return v;
+    }
+
+// Debug dump of the clustering state to the log: the live part of the
+// distance matrix, the per-row nearest-neighbor table, and the internal
+// nodes created so far. Rows whose node index is uInsane are skipped.
+// Only called from UPGMA2 when TRACE is enabled.
+static void ListState()
+ {
+ Log("Dist matrix\n");
+ Log(" ");
+ // Column headings: node index of each live row.
+ for (unsigned i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Log(" %5u", g_uNodeIndex[i]);
+ }
+ Log("\n");
+
+ for (unsigned i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Log("%5u ", g_uNodeIndex[i]);
+ for (unsigned j = 0; j < g_uLeafCount; ++j)
+ {
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+ if (i == j)
+ Log(" ");
+ else
+ {
+ unsigned v = TriangleSubscript(i, j);
+ Log("%5.2g ", g_Dist[v]);
+ }
+ }
+ Log("\n");
+ }
+
+ Log("\n");
+ Log(" i Node NrNb Dist\n");
+ Log("----- ----- ----- --------\n");
+ for (unsigned i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Log("%5u %5u %5u %8.3f\n",
+ i,
+ g_uNodeIndex[i],
+ g_uNearestNeighbor[i],
+ g_MinDist[i]);
+ }
+
+ Log("\n");
+ Log(" Node L R Height LLength RLength\n");
+ Log("----- ----- ----- ------ ------- -------\n");
+ // NOTE(review): the bound is inclusive and uses whatever value
+ // g_uInternalNodeIndex currently has; when called before UPGMA2's main
+ // loop has set it, that is a value left over from earlier -- confirm the
+ // index cannot exceed the internal-node arrays.
+ for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i)
+ Log("%5u %5u %5u %6.2g %6.2g %6.2g\n",
+ i,
+ g_uLeft[i],
+ g_uRight[i],
+ g_Height[i],
+ g_LeftLength[i],
+ g_RightLength[i]);
+ }
+
+// Build a rooted tree by UPGMA-style agglomerative clustering.
+//
+// DC supplies the leaf count, ids, names and pairwise distances; Linkage
+// selects how the distance from a merged node to every other row is
+// derived from its two children (average, minimum, maximum, or the
+// g_dSUEFF-weighted mix of average and minimum). The result is written
+// to `tree` through Tree::Create.
+//
+// All working state lives in the file-scope statics above, which are
+// allocated at the top and released before returning.
+// NOTE(review): g_uLeafCount - 1 and g_uLeafCount - 2 wrap around for fewer
+// than two leaves; presumably callers guarantee at least two -- confirm.
+void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)
+ {
+ g_uLeafCount = DC.GetCount();
+
+ g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
+ g_uInternalNodeCount = g_uLeafCount - 1;
+
+ g_Dist = new dist_t[g_uTriangleSize];
+
+ g_uNodeIndex = new unsigned[g_uLeafCount];
+ g_uNearestNeighbor = new unsigned[g_uLeafCount];
+ g_MinDist = new dist_t[g_uLeafCount];
+ unsigned *Ids = new unsigned [g_uLeafCount];
+ char **Names = new char *[g_uLeafCount];
+
+ g_uLeft = new unsigned[g_uInternalNodeCount];
+ g_uRight = new unsigned[g_uInternalNodeCount];
+ g_Height = new dist_t[g_uInternalNodeCount];
+ g_LeftLength = new dist_t[g_uInternalNodeCount];
+ g_RightLength = new dist_t[g_uInternalNodeCount];
+
+ // Row i initially represents leaf i; no nearest neighbor is known yet.
+ for (unsigned i = 0; i < g_uLeafCount; ++i)
+ {
+ g_MinDist[i] = BIG_DIST;
+ g_uNodeIndex[i] = i;
+ g_uNearestNeighbor[i] = uInsane;
+ Ids[i] = DC.GetId(i);
+ Names[i] = strsave(DC.GetName(i));
+ }
+
+ for (unsigned i = 0; i < g_uInternalNodeCount; ++i)
+ {
+ g_uLeft[i] = uInsane;
+ g_uRight[i] = uInsane;
+ g_LeftLength[i] = BIG_DIST;
+ g_RightLength[i] = BIG_DIST;
+ g_Height[i] = BIG_DIST;
+ }
+
+// Compute initial NxN triangular distance matrix.
+// Store minimum distance for each full (not triangular) row.
+// Loop from 1, not 0, because "row" is 0, 1 ... i-1,
+// so nothing to do when i=0.
+ for (unsigned i = 1; i < g_uLeafCount; ++i)
+ {
+ dist_t *Row = g_Dist + TriangleSubscript(i, 0);
+ DC.CalcDistRange(i, Row);
+ for (unsigned j = 0; j < i; ++j)
+ {
+ const dist_t d = Row[j];
+ if (d < g_MinDist[i])
+ {
+ g_MinDist[i] = d;
+ g_uNearestNeighbor[i] = j;
+ }
+ if (d < g_MinDist[j])
+ {
+ g_MinDist[j] = d;
+ g_uNearestNeighbor[j] = i;
+ }
+ }
+ }
+
+#if TRACE
+ Log("Initial state:\n");
+ ListState();
+#endif
+
+ // One iteration per internal node: merge the closest pair of live rows.
+ for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;
+ ++g_uInternalNodeIndex)
+ {
+#if TRACE
+ Log("\n");
+ Log("Internal node index %5u\n", g_uInternalNodeIndex);
+ Log("-------------------------\n");
+#endif
+
+ // Find nearest neighbors
+ unsigned Lmin = uInsane;
+ unsigned Rmin = uInsane;
+ dist_t dtMinDist = BIG_DIST;
+ for (unsigned j = 0; j < g_uLeafCount; ++j)
+ {
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+
+ dist_t d = g_MinDist[j];
+ if (d < dtMinDist)
+ {
+ dtMinDist = d;
+ Lmin = j;
+ Rmin = g_uNearestNeighbor[j];
+ assert(uInsane != Rmin);
+ assert(uInsane != g_uNodeIndex[Rmin]);
+ }
+ }
+
+ assert(Lmin != uInsane);
+ assert(Rmin != uInsane);
+ assert(dtMinDist != BIG_DIST);
+
+#if TRACE
+ Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
+ Lmin,
+ g_uNodeIndex[Lmin],
+ Rmin,
+ g_uNodeIndex[Rmin],
+ dtMinDist);
+#endif
+
+ // Compute distances to new node
+ // New node overwrites row currently assigned to Lmin
+ dist_t dtNewMinDist = BIG_DIST;
+ unsigned uNewNearestNeighbor = uInsane;
+ for (unsigned j = 0; j < g_uLeafCount; ++j)
+ {
+ if (j == Lmin || j == Rmin)
+ continue;
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+
+ const unsigned vL = TriangleSubscript(Lmin, j);
+ const unsigned vR = TriangleSubscript(Rmin, j);
+ const dist_t dL = g_Dist[vL];
+ const dist_t dR = g_Dist[vR];
+ // Assigned by every case below except default, which calls Quit().
+ dist_t dtNewDist;
+
+ switch (Linkage)
+ {
+ case LINKAGE_Avg:
+ dtNewDist = AVG(dL, dR);
+ break;
+
+ case LINKAGE_Min:
+ dtNewDist = MIN(dL, dR);
+ break;
+
+ case LINKAGE_Max:
+ dtNewDist = MAX(dL, dR);
+ break;
+
+ case LINKAGE_Biased:
+ dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
+ break;
+
+ default:
+ Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);
+ }
+
+ // Nasty special case.
+ // If nearest neighbor of j is Lmin or Rmin, then make the new
+ // node (which overwrites the row currently occupied by Lmin)
+ // the nearest neighbor. This situation can occur when there are
+ // equal distances in the matrix. If we don't make this fix,
+ // the nearest neighbor pointer for j would become invalid.
+ // (We don't need to test for == Lmin, because in that case
+ // the net change needed is zero due to the change in row
+ // numbering).
+ // NOTE(review): g_MinDist[j] is left unchanged here even though the
+ // distance from j to the merged row is now dtNewDist.
+ if (g_uNearestNeighbor[j] == Rmin)
+ g_uNearestNeighbor[j] = Lmin;
+
+#if TRACE
+ Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
+ j, Lmin, dL, Rmin, dR, dtNewDist);
+#endif
+ g_Dist[vL] = dtNewDist;
+ if (dtNewDist < dtNewMinDist)
+ {
+ dtNewMinDist = dtNewDist;
+ uNewNearestNeighbor = j;
+ }
+ }
+
+ // NOTE(review): the left operand repeats the loop condition, so these two
+ // asserts can never fire; presumably the intent was to exempt only the
+ // final merge, where no other row remains -- confirm.
+ assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
+ assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
+
+ // The new node's height is half the distance between its two children;
+ // each branch length is that height minus the child's own height
+ // (zero for a leaf).
+ const unsigned v = TriangleSubscript(Lmin, Rmin);
+ const dist_t dLR = g_Dist[v];
+ const dist_t dHeightNew = dLR/2;
+ const unsigned uLeft = g_uNodeIndex[Lmin];
+ const unsigned uRight = g_uNodeIndex[Rmin];
+ const dist_t HeightLeft =
+ uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
+ const dist_t HeightRight =
+ uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
+
+ g_uLeft[g_uInternalNodeIndex] = uLeft;
+ g_uRight[g_uInternalNodeIndex] = uRight;
+ g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
+ g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
+ g_Height[g_uInternalNodeIndex] = dHeightNew;
+
+ // Row for left child overwritten by row for new node
+ g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
+ g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
+ g_MinDist[Lmin] = dtNewMinDist;
+
+ // Delete row for right child
+ g_uNodeIndex[Rmin] = uInsane;
+
+#if TRACE
+ Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
+ g_uInternalNodeIndex, Lmin, Rmin);
+ ListState();
+#endif
+ }
+
+ // The last internal node created (index g_uLeafCount - 2) is the root.
+ unsigned uRoot = g_uLeafCount - 2;
+ tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,
+ Ids, Names);
+
+#if TRACE
+ tree.LogMe();
+#endif
+
+ delete[] g_Dist;
+
+ delete[] g_uNodeIndex;
+ delete[] g_uNearestNeighbor;
+ delete[] g_MinDist;
+ delete[] g_Height;
+
+ delete[] g_uLeft;
+ delete[] g_uRight;
+ delete[] g_LeftLength;
+ delete[] g_RightLength;
+
+ // Names were duplicated with strsave() above and are released with free().
+ for (unsigned i = 0; i < g_uLeafCount; ++i)
+ free(Names[i]);
+ delete[] Names;
+ delete[] Ids;
+ }
+
+// DistCalc implementation backed by a fixed 5x5 symmetric distance
+// matrix; used by Test() below to exercise UPGMA2.
+class DistCalcTest : public DistCalc
+    {
+// Copy columns 0 .. i-1 of row i of the fixed matrix into Dist.
+    virtual void CalcDistRange(unsigned i, dist_t Dist[]) const
+        {
+        static dist_t Fixture[5][5] =
+            {
+            {  0,  2, 14, 14, 20 },
+            {  2,  0, 14, 14, 20 },
+            { 14, 14,  0,  4, 20 },
+            { 14, 14,  4,  0, 20 },
+            { 20, 20, 20, 20,  0 },
+            };
+        unsigned uCol = 0;
+        while (uCol < i)
+            {
+            Dist[uCol] = Fixture[i][uCol];
+            ++uCol;
+            }
+        }
+
+// Always five leaves.
+    virtual unsigned GetCount() const
+        {
+        return 5;
+        }
+
+// The id of a leaf is its index.
+    virtual unsigned GetId(unsigned i) const
+        {
+        return i;
+        }
+
+// Every leaf has the same name.
+    virtual const char *GetName(unsigned i) const
+        {
+        return "name";
+        }
+    };
+
+// Manual smoke test: runs UPGMA2 with average linkage on the fixed
+// DistCalcTest matrix. The resulting tree is not checked.
+// NOTE(review): the log file name is a hard-coded Windows path.
+void Test()
+ {
+ SetListFileName("c:\\tmp\\lobster.log", false);
+ DistCalcTest DC;
+ Tree tree;
+ UPGMA2(DC, tree, LINKAGE_Avg);
+ }
Added: trunk/packages/muscle/branches/upstream/current/usage.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/usage.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/usage.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,47 @@
+#include "muscle.h"
+#include <stdio.h>
+
+// Writes the banner (MUSCLE_LONG_VERSION, project URL, licence note and
+// citation) to stderr. Only the first call prints anything; every later
+// call returns without output.
+void Credits()
+	{
+	static bool AlreadyShown = false;
+	if (!AlreadyShown)
+		{
+		// Adjacent string literals are joined by the compiler, so this is
+		// one format string carrying the same text as before.
+		fprintf(stderr,
+		  "\n" MUSCLE_LONG_VERSION "\n\n"
+		  "http://www.drive5.com/muscle\n"
+		  "This software is donated to the public domain.\n"
+		  "Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\n");
+		AlreadyShown = true;
+		}
+	}
+
+// Prints the banner (through Credits) followed by the short command-line
+// help text to stderr.
+void Usage()
+ {
+ Credits();
+ // The help text is passed as the fprintf format string, so a literal
+ // percent sign has to be written as "%%" (see the -maxmb line).
+ fprintf(stderr,
+"\n"
+"Basic usage\n"
+"\n"
+" muscle -in <inputfile> -out <outputfile>\n"
+"\n"
+"Common options (for a complete list please see the User Guide):\n"
+"\n"
+" -in <inputfile> Input file in FASTA format (default stdin)\n"
+" -out <outputfile> Output alignment in FASTA format (default stdout)\n"
+" -diags Find diagonals (faster for similar sequences)\n"
+" -maxiters <n> Maximum number of iterations (integer, default 16)\n"
+" -maxhours <h> Maximum time to iterate in hours (default no limit)\n"
+" -maxmb <m> Maximum memory to allocate in Mb (default 80%% of RAM)\n"
+" -html Write output in HTML format (default FASTA)\n"
+" -msf Write output in GCG MSF format (default FASTA)\n"
+" -clw Write output in CLUSTALW format (default FASTA)\n"
+" -clwstrict As -clw, with 'CLUSTAL W (1.81)' header\n"
+" -log[a] <logfile> Log to file (append if -loga, overwrite if -log)\n"
+" -quiet Do not write progress messages to stderr\n"
+" -stable Output sequences in input order (default is -group)\n"
+" -group Group sequences by similarity (this is the default)\n"
+" -version Display version information and exit\n"
+"\n"
+"Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2\n"
+"Fastest possible (amino acids): -maxiters 1 -diags -sv -distance1 kbit20_3\n"
+"Fastest possible (nucleotides): -maxiters 1 -diags\n");
+ }
Added: trunk/packages/muscle/branches/upstream/current/validateids.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/validateids.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/validateids.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,105 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "seqvect.h"
+
+#if DEBUG
+// Reference data for the ValidateMuscleIds checks in this file (DEBUG builds
+// only). A sequence id is used as the index into whichever one has been set.
+// g_ptrMuscleSeqVect stays null until SetMuscleSeqVect is called.
+static SeqVect *g_ptrMuscleSeqVect = 0;
+static MSA MuscleInputMSA;
+
+// Registers msa as the reference alignment by calling Copy() on the
+// file-local MuscleInputMSA.
+void SetMuscleInputMSA(MSA &msa)
+ {
+ MuscleInputMSA.Copy(msa);
+ }
+
+// Registers v as the reference sequence vector. Only the address of v is
+// stored, not a copy, so v must stay alive for as long as the validation
+// functions in this file may be called.
+void SetMuscleSeqVect(SeqVect &v)
+ {
+ g_ptrMuscleSeqVect = &v;
+ }
+
+// For every sequence in msa, looks up the registered SeqVect at index
+// GetSeqId() and requires the two names to be equal; calls Quit on a
+// mismatch. g_ptrMuscleSeqVect is dereferenced without a null check.
+void ValidateMuscleIdsSeqVect(const MSA &msa)
+	{
+	const unsigned SeqCount = msa.GetSeqCount();
+	unsigned i = 0;
+	while (i < SeqCount)
+		{
+		const unsigned Id = msa.GetSeqId(i);
+		const char *NameInMSA = msa.GetSeqName(i);
+		const char *NameInVect = g_ptrMuscleSeqVect->GetSeqName(Id);
+		const bool Same = (strcmp(NameInMSA, NameInVect) == 0);
+		if (!Same)
+			Quit("ValidateMuscleIdsSeqVect, names don't match");
+		++i;
+		}
+	}
+
+// For every sequence in msa, compares its name with the name stored in
+// MuscleInputMSA at index GetSeqId(). On a difference, logs both alignments,
+// the id and the two names, then calls Quit.
+void ValidateMuscleIdsMSA(const MSA &msa)
+	{
+	const unsigned SeqCount = msa.GetSeqCount();
+	for (unsigned i = 0; i != SeqCount; ++i)
+		{
+		const unsigned Id = msa.GetSeqId(i);
+		const char *TestName = msa.GetSeqName(i);
+		const char *InputName = MuscleInputMSA.GetSeqName(Id);
+		if (strcmp(TestName, InputName) == 0)
+			continue;
+
+		// Mismatch: dump everything needed to diagnose it before quitting
+		Log("Input MSA:\n");
+		MuscleInputMSA.LogMe();
+		Log("MSA being tested:\n");
+		msa.LogMe();
+		Log("Id=%u\n", Id);
+		Log("Input name=%s\n", InputName);
+		Log("Test name=%s\n", TestName);
+		Quit("ValidateMuscleIdsMSA, names don't match");
+		}
+	}
+
+// Validates the ids of msa against whichever reference has been registered:
+// the SeqVect takes precedence, then a non-empty MuscleInputMSA. Calls Quit
+// when neither is available.
+void ValidateMuscleIds(const MSA &msa)
+	{
+	if (g_ptrMuscleSeqVect != 0)
+		{
+		ValidateMuscleIdsSeqVect(msa);
+		return;
+		}
+	if (MuscleInputMSA.GetSeqCount() != 0)
+		{
+		ValidateMuscleIdsMSA(msa);
+		return;
+		}
+	Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount()");
+	}
+
+// For every leaf of tree, looks up the registered SeqVect at index
+// GetLeafId() and requires the leaf name to equal the name found there;
+// calls Quit on a mismatch. Internal nodes are ignored.
+// g_ptrMuscleSeqVect is dereferenced without a null check.
+void ValidateMuscleIdsSeqVect(const Tree &tree)
+	{
+	const unsigned NodeCount = tree.GetNodeCount();
+	for (unsigned Node = 0; Node != NodeCount; ++Node)
+		{
+		if (tree.IsLeaf(Node))
+			{
+			const unsigned Id = tree.GetLeafId(Node);
+			const char *LeafName = tree.GetLeafName(Node);
+			const char *VectName = g_ptrMuscleSeqVect->GetSeqName(Id);
+			if (strcmp(LeafName, VectName) != 0)
+				Quit("ValidateMuscleIds: names don't match");
+			}
+		}
+	}
+
+// For every leaf of tree, compares the leaf name with the name stored in
+// MuscleInputMSA at index GetLeafId(); calls Quit on a mismatch.
+// Internal nodes are ignored.
+void ValidateMuscleIdsMSA(const Tree &tree)
+	{
+	const unsigned NodeCount = tree.GetNodeCount();
+	for (unsigned Node = 0; Node != NodeCount; ++Node)
+		{
+		if (tree.IsLeaf(Node))
+			{
+			const unsigned Id = tree.GetLeafId(Node);
+			const char *LeafName = tree.GetLeafName(Node);
+			const char *InputName = MuscleInputMSA.GetSeqName(Id);
+			if (strcmp(LeafName, InputName) != 0)
+				Quit("ValidateMuscleIds: names don't match");
+			}
+		}
+	}
+
+// Validates the leaf ids of tree against whichever reference has been
+// registered: the SeqVect takes precedence, then a non-empty MuscleInputMSA.
+// Calls Quit when neither is available.
+void ValidateMuscleIds(const Tree &tree)
+	{
+	if (g_ptrMuscleSeqVect != 0)
+		{
+		ValidateMuscleIdsSeqVect(tree);
+		return;
+		}
+	if (MuscleInputMSA.GetSeqCount() != 0)
+		{
+		ValidateMuscleIdsMSA(tree);
+		return;
+		}
+	Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount");
+	}
+#endif
Added: trunk/packages/muscle/branches/upstream/current/vtml2.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/vtml2.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/vtml2.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,145 @@
+#include "muscle.h"
+
+// Note: We use 32x32 arrays rather than 20x20 as this may give the compiler
+// optimizer an opportunity to make subscript arithmetic more efficient
+// (multiplying by 32 is same as shifting left by 5 bits).
+
+// v() converts one table entry to float. The argument is parenthesized so
+// that the cast applies to the whole expression, not just its first operand.
+// ROW() builds the brace-enclosed initializer for one matrix row from 20
+// values; the expansion ends in a comma so rows can simply be listed one
+// after another.
+#define v(x) ((float) (x))
+#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \
+ v(R), v(S), v(T), v(V), v(W), v(Y) },
+
+
+// A C D E F G H I K L
+// M N P Q R S T V W Y
+// VTML200
+// 20 rows of 20 values, stored in the top-left corner of a 32x32 array; the
+// elements that are not listed are zero-initialized. Each ROW() spans two
+// source lines and takes its values in the order
+// A C D E F G H I K L M N P Q R S T V W Y; the trailing comment on a row
+// names the residue that row belongs to.
+float VTML_LA[32][32] =
+ {
+ROW( 2.25080, 1.31180, 0.82704, 0.88740, 0.55520, 1.09860, 0.71673, 0.80805, 0.81213, 0.68712,
+ 0.79105, 0.86777, 0.99328, 0.86644, 0.72821, 1.33924, 1.20373, 1.05956, 0.38107, 0.54373) // A
+
+ROW( 1.31180,15.79469, 0.39862, 0.42329, 0.49882, 0.65541, 0.67100, 0.97185, 0.46414, 0.55673,
+ 0.90230, 0.63236, 0.54479, 0.47895, 0.56465, 1.18490, 0.99069, 1.21604, 0.28988, 0.91338) // C
+
+ROW( 0.82704, 0.39862, 4.18833, 2.06850, 0.25194, 0.90937, 1.01617, 0.32860, 1.03391, 0.31300,
+ 0.42498, 1.80888, 0.81307, 1.20043, 0.63712, 1.03001, 0.88191, 0.43557, 0.26313, 0.37947) // D
+
+ROW( 0.88740, 0.42329, 2.06850, 3.08354, 0.33456, 0.77183, 0.94536, 0.43151, 1.35989, 0.45579,
+ 0.53423, 1.15745, 0.82832, 1.66752, 0.84500, 0.98693, 0.88132, 0.54047, 0.24519, 0.52025) // E
+
+ROW( 0.55520, 0.49882, 0.25194, 0.33456, 6.08351, 0.30140, 1.02191, 1.10969, 0.37069, 1.50587,
+ 1.41207, 0.42850, 0.41706, 0.48113, 0.41970, 0.56867, 0.57172, 0.91256, 2.02494, 3.44675) // F
+
+ROW( 1.09860, 0.65541, 0.90937, 0.77183, 0.30140, 5.62829, 0.64191, 0.28432, 0.67874, 0.30549,
+ 0.37739, 1.01012, 0.60851, 0.65996, 0.63660, 1.03448, 0.68435, 0.40728, 0.36034, 0.35679) // G
+
+ROW( 0.71673, 0.67100, 1.01617, 0.94536, 1.02191, 0.64191, 6.05494, 0.50783, 1.03822, 0.60887,
+ 0.55685, 1.28619, 0.72275, 1.41503, 1.24635, 0.93344, 0.83543, 0.54817, 0.81780, 1.81552) // H
+
+ROW( 0.80805, 0.97185, 0.32860, 0.43151, 1.10969, 0.28432, 0.50783, 3.03766, 0.49310, 1.88886,
+ 1.75039, 0.44246, 0.44431, 0.53213, 0.48153, 0.55603, 0.88168, 2.37367, 0.68494, 0.70035) // I
+
+ROW( 0.81213, 0.46414, 1.03391, 1.35989, 0.37069, 0.67874, 1.03822, 0.49310, 2.72883, 0.52739,
+ 0.68244, 1.15671, 0.82911, 1.51333, 2.33521, 0.93858, 0.92730, 0.55467, 0.39944, 0.52549) // K
+
+ROW( 0.68712, 0.55673, 0.31300, 0.45579, 1.50587, 0.30549, 0.60887, 1.88886, 0.52739, 3.08540,
+ 2.14480, 0.43539, 0.53630, 0.62771, 0.53025, 0.53468, 0.69924, 1.50372, 0.82822, 0.89854) // L
+
+ROW( 0.79105, 0.90230, 0.42498, 0.53423, 1.41207, 0.37739, 0.55685, 1.75039, 0.68244, 2.14480,
+ 4.04057, 0.55603, 0.48415, 0.76770, 0.66775, 0.62409, 0.87759, 1.42742, 0.52278, 0.72067) // M
+
+ROW( 0.86777, 0.63236, 1.80888, 1.15745, 0.42850, 1.01012, 1.28619, 0.44246, 1.15671, 0.43539,
+ 0.55603, 3.36000, 0.69602, 1.13490, 0.98603, 1.31366, 1.11252, 0.50603, 0.35810, 0.68349) // N
+
+ROW( 0.99328, 0.54479, 0.81307, 0.82832, 0.41706, 0.60851, 0.72275, 0.44431, 0.82911, 0.53630,
+ 0.48415, 0.69602, 7.24709, 0.90276, 0.74827, 1.03719, 0.83014, 0.56795, 0.37867, 0.33127) // P
+
+ROW( 0.86644, 0.47895, 1.20043, 1.66752, 0.48113, 0.65996, 1.41503, 0.53213, 1.51333, 0.62771,
+ 0.76770, 1.13490, 0.90276, 2.86937, 1.50116, 0.99561, 0.93103, 0.61085, 0.29926, 0.51971) // Q
+
+ROW( 0.72821, 0.56465, 0.63712, 0.84500, 0.41970, 0.63660, 1.24635, 0.48153, 2.33521, 0.53025,
+ 0.66775, 0.98603, 0.74827, 1.50116, 4.28698, 0.84662, 0.80673, 0.51422, 0.47569, 0.59592) // R
+
+ROW( 1.33924, 1.18490, 1.03001, 0.98693, 0.56867, 1.03448, 0.93344, 0.55603, 0.93858, 0.53468,
+ 0.62409, 1.31366, 1.03719, 0.99561, 0.84662, 2.13816, 1.52911, 0.67767, 0.45129, 0.66767) // S
+
+ROW( 1.20373, 0.99069, 0.88191, 0.88132, 0.57172, 0.68435, 0.83543, 0.88168, 0.92730, 0.69924,
+ 0.87759, 1.11252, 0.83014, 0.93103, 0.80673, 1.52911, 2.58221, 0.98702, 0.31541, 0.57954) // T
+
+ROW( 1.05956, 1.21604, 0.43557, 0.54047, 0.91256, 0.40728, 0.54817, 2.37367, 0.55467, 1.50372,
+ 1.42742, 0.50603, 0.56795, 0.61085, 0.51422, 0.67767, 0.98702, 2.65580, 0.43419, 0.63805) // V
+
+ROW( 0.38107, 0.28988, 0.26313, 0.24519, 2.02494, 0.36034, 0.81780, 0.68494, 0.39944, 0.82822,
+ 0.52278, 0.35810, 0.37867, 0.29926, 0.47569, 0.45129, 0.31541, 0.43419,31.39564, 2.51433) // W
+
+ROW( 0.54373, 0.91338, 0.37947, 0.52025, 3.44675, 0.35679, 1.81552, 0.70035, 0.52549, 0.89854,
+ 0.72067, 0.68349, 0.33127, 0.51971, 0.59592, 0.66767, 0.57954, 0.63805, 2.51433, 7.50693) // Y
+ };
+
+// Constant offset that the v() macro below adds to every value it converts.
+const float VTML_SP_CENTER = (float) 22.0;
+
+// Replace the earlier v()/ROW() pair: v() now adds VTML_SP_CENTER before
+// converting to float, and ROW() takes a 21st value (X). The macro argument
+// is parenthesized so the offset is added to the whole argument expression
+// even when it contains operators of lower precedence than '+'.
+#undef ROW
+#undef v
+#define v(x) ((float) ((x) + VTML_SP_CENTER))
+#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \
+ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) },
+
+// VTML 240
+// 21 rows of 21 values (the 20 residues plus X) stored in the top-left corner
+// of a 32x32 array; the elements that are not listed are zero-initialized.
+// Every listed value goes through the v() macro in effect here, which adds
+// VTML_SP_CENTER, so the stored number is the literal plus that offset (the
+// X row and column, written as 0, are stored as VTML_SP_CENTER).
+float VTML_SP[32][32] =
+ {
+// A C D E F G H I K L M N P Q R S T V W Y X
+ROW( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A
+ROW( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C
+ROW( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D
+ROW( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E
+ROW( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F
+ROW( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G
+ROW( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H
+ROW( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I
+ROW( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K
+ROW( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L
+ROW( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M
+ROW( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N
+ROW( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P
+ROW( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q
+ROW( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R
+ROW( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S
+ROW( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T
+ROW( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V
+ROW( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W
+ROW( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y
+ROW( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X
+ };
+
+// Redefine v() as a plain conversion to float with no offset added, and
+// define RNC(): a 21-value row initializer built from this v(). Like ROW(),
+// its expansion ends in a comma so rows can be listed back to back.
+#undef v
+#define v(x) ((float) (x))
+#define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \
+ { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \
+ v(R), v(S), v(T), v(V), v(W), v(Y), v(X) },
+
+// 21 rows of 21 values (the 20 residues plus X) stored in the top-left corner
+// of a 32x32 array; the elements that are not listed are zero-initialized.
+// Rows are built with RNC(), so each literal is stored as written, converted
+// to float with no offset added.
+float VTML_SPNoCenter[32][32] =
+ {
+// A C D E F G H I K L M N P Q R S T V W Y X
+RNC( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A
+RNC( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C
+RNC( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D
+RNC( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E
+RNC( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F
+RNC( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G
+RNC( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H
+RNC( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I
+RNC( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K
+RNC( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L
+RNC( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M
+RNC( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N
+RNC( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P
+RNC( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q
+RNC( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R
+RNC( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S
+RNC( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T
+RNC( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V
+RNC( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W
+RNC( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y
+RNC( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X
+ };
Added: trunk/packages/muscle/branches/upstream/current/writescorefile.cpp
===================================================================
--- trunk/packages/muscle/branches/upstream/current/writescorefile.cpp 2006-07-10 12:54:50 UTC (rev 81)
+++ trunk/packages/muscle/branches/upstream/current/writescorefile.cpp 2006-08-07 00:08:59 UTC (rev 82)
@@ -0,0 +1,69 @@
+#include "muscle.h"
+#include "msa.h"
+#include <errno.h>
+
+// Substitution matrices defined in other source files and indexed here as
+// [letter1][letter2]: VTML_SP is read for ALPHA_Amino, NUC_SP for
+// ALPHA_DNA and ALPHA_RNA.
+extern float VTML_SP[32][32];
+extern float NUC_SP[32][32];
+
+// Returns the average pairwise substitution score of column uCol of msa.
+// A pair of sequences is counted only when both positions are non-gap and
+// both letters are below g_AlphaSize. Scores are read from VTML_SP when
+// g_Alpha is ALPHA_Amino and from NUC_SP when it is ALPHA_DNA or ALPHA_RNA;
+// any other g_Alpha is reported through Quit. Returns 0 when no pair
+// qualifies.
+static double GetColScore(const MSA &msa, unsigned uCol)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	unsigned uPairCount = 0;
+	double dSum = 0.0;
+	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+		{
+		if (msa.IsGap(uSeq1, uCol))
+			continue;
+		unsigned uLetter1 = msa.GetLetterEx(uSeq1, uCol);
+		if (uLetter1 >= g_AlphaSize)
+			continue;
+		for (unsigned uSeq2 = uSeq1 + 1; uSeq2 < uSeqCount; ++uSeq2)
+			{
+			if (msa.IsGap(uSeq2, uCol))
+				continue;
+			unsigned uLetter2 = msa.GetLetterEx(uSeq2, uCol);
+			if (uLetter2 >= g_AlphaSize)
+				continue;
+			// Initialized so that no indeterminate value is ever added
+			// to dSum, even if the Quit call below were to return.
+			double Score = 0.0;
+			switch (g_Alpha)
+				{
+			case ALPHA_Amino:
+				Score = VTML_SP[uLetter1][uLetter2];
+				break;
+			case ALPHA_DNA:
+			case ALPHA_RNA:
+				Score = NUC_SP[uLetter1][uLetter2];
+				break;
+			default:
+				Quit("GetColScore: invalid alpha=%d", g_Alpha);
+				}
+			dSum += Score;
+			++uPairCount;
+			}
+		}
+	// Avoid dividing by zero when the column has no scorable pair
+	if (0 == uPairCount)
+		return 0;
+	return dSum / uPairCount;
+	}
+
+// Writes one text line per alignment column to the file named by
+// g_pstrScoreFileName: the column score from GetColScore formatted with
+// "%10.3f ", followed by the character of every sequence at that column.
+// Calls Quit if the file cannot be opened, or if writing or closing it
+// reports an error, so that a truncated score file is not left unnoticed.
+void WriteScoreFile(const MSA &msa)
+	{
+	FILE *f = fopen(g_pstrScoreFileName, "w");
+	if (0 == f)
+		Quit("Cannot open score file '%s' errno=%d", g_pstrScoreFileName, errno);
+
+	const unsigned uColCount = msa.GetColCount();
+	const unsigned uSeqCount = msa.GetSeqCount();
+	for (unsigned uCol = 0; uCol < uColCount; ++uCol)
+		{
+		double Score = GetColScore(msa, uCol);
+		fprintf(f, "%10.3f ", Score);
+		for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq)
+			{
+			char c = msa.GetChar(uSeq, uCol);
+			fputc(c, f);
+			}
+		fputc('\n', f);
+		}
+
+	// Output is buffered, so a failure may only show up in the stream's
+	// error flag or when fclose flushes the remaining data. Read the flag
+	// first because the stream is unusable after fclose.
+	const bool bWriteError = (0 != ferror(f));
+	const bool bCloseError = (0 != fclose(f));
+	if (bWriteError || bCloseError)
+		Quit("Error writing score file '%s' errno=%d", g_pstrScoreFileName, errno);
+	}
More information about the debian-med-commit
mailing list