[med-svn] [clustalo] 01/04: Imported Upstream version 1.2.2
Andreas Tille
tille at debian.org
Mon Jul 4 09:28:00 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository clustalo.
commit bacfc626b5362497d5be8f54cb866d70cbdff095
Author: Andreas Tille <tille at debian.org>
Date: Mon Jul 4 11:14:49 2016 +0200
Imported Upstream version 1.2.2
---
Doxyfile | 2 +-
INSTALL | 64 ++++
configure | 20 +-
configure.ac | 6 +-
src/clustal-omega-config.h | 10 +-
src/clustal-omega.c | 78 ++++-
src/clustal-omega.h | 2 +
src/clustal/hhalign_wrapper.c | 171 ++++++++--
src/clustal/ktuple_pair.c | 8 +-
src/clustal/mbed.c | 11 +-
src/clustal/pair_dist.c | 38 ++-
src/clustal/pair_dist.h | 6 +-
src/clustal/seq.c | 106 ++++++-
src/clustal/seq.h | 8 +-
src/clustal/symmatrix.c | 48 ++-
src/hhalign/hash-C.h | 6 +-
src/hhalign/hhalign.cpp | 12 +-
src/hhalign/hhalign.h | 4 +-
src/hhalign/hhalignment-C.h | 687 ++++++++++++++++++++++++----------------
src/hhalign/hhfullalignment-C.h | 3 +-
src/hhalign/hhfunc-C.h | 7 +-
src/hhalign/hhhalfalignment-C.h | 3 +-
src/hhalign/hhhit-C.h | 7 +-
src/hhalign/hhhitlist-C.h | 8 +-
src/hhalign/hhhmm-C.h | 3 +-
src/hhalign/util-C.h | 3 +-
src/mymain.c | 22 +-
src/squid/a2m.c | 9 +-
src/squid/msa.c | 78 ++++-
src/squid/msa.h | 7 +-
src/squid/sqio.c | 21 +-
src/squid/squid.h | 3 +
src/squid/stockholm.c | 99 +++++-
33 files changed, 1160 insertions(+), 400 deletions(-)
diff --git a/Doxyfile b/Doxyfile
index 6a5c02f..0c9b5c9 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -31,7 +31,7 @@ PROJECT_NAME = "Clustal Omega"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
-PROJECT_NUMBER = 1.2.1
+PROJECT_NUMBER = 1.2.2
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
diff --git a/INSTALL b/INSTALL
index ede75b5..b7c0645 100644
--- a/INSTALL
+++ b/INSTALL
@@ -282,6 +282,70 @@ not `/usr/local'. It is recommended to use the following options:
./configure --prefix=/boot/common
+
+ On Windows do
+
+ 1. Preparation
+
+ 1.1. Install free MinGW64 on Windows 7. Download
+ mingw-w64-bin_x86_64-mingw_20111101_sezero.zip from
+ http://sourceforge.net/projects/mingw-w64/files/Toolchains
+ targetting Win64/Personal Builds/sezero_4.5_20111101/, extract
+ it, move mingw64 folder to C:\ and rename it to mingw. MinGW64
+ provides tools to develop 64-bit Windows applications using
+ gcc and g++. With MinGW64, some software developed for Linux
+ platform can be built on Windows.
+
+ 1.2. There is a file named pthreads-w64.zip in C:\mingw
+ folder. Extract it under C:\mingw folder.
+
+ 1.3. Download MSYS-1.0.11.exe from
+ http://sourceforge.net/projects/mingw/files/MSYS/Base/msys-core/msys-1.0.11/
+ and install it.
+
+ 1.4. Download Clustal Omega source from
+ http://www.clustal.org/omega/clustal-omega-x.x.x.tar.gz (where
+ x.x.x is the current version)
+
+ 1.5. Copy downloaded file to MSYS. If you installed MSYS in
+ C:\msys and your account is Administrator, copy it to
+ C:\msys\1.0\home\Administrator. You can do it using Windows
+ explorer.
+
+ 1.6. Download argtable2-13.tar.gz from
+ http://argtable.sourceforge.net/ and copy it to MSYS. This is
+ required by Clustal Omega.
+
+ 2. Configuration and Build process
+
+ 2.1. Launch MSYS and extract argtable2 source as tar xfz
+ argtable2-13.tar.gz.
+
+ 2.2. Extract Clustal Omega source as tar xfz
+ clustal-omega-1.2.0.tar.gz.
+
+ 2.3. cd argtable2-13; ./configure; make; make install
+
+ 2.4. cd ~/clustal-omega-1.2.0
+
+ 2.5. ./configure CFLAGS='-I/usr/local/include
+ -DSRE_STRICT_ANSI' LDFLAGS='-L/usr/local/lib'
+
+ 2.6. make; make install
+
+ 2.7. You can find clustalo.exe in /usr/local/bin folder which
+ is C:\msys\1.0\local/bin.
+
+ 2.8. Following DLLs are necessary to run clustalo.exe. Put
+ them in the same folder where clustalo.exe
+ exists. C:\mingw\bin\libcc_sjlj-1.dll,
+ C:\mingw\bin\libgomp-1.dll, C:\mingw\bin\libstdc++-6.dll,
+ C:\mingw\bin\pthreadGC2-w64.dll
+
+ (lifted from
+ http://www.blaststation.com/freestuff/en/howtoBuildx64ClustalO.php,
+ as of 2014-10-14)
+
Specifying the System Type
==========================
diff --git a/configure b/configure
index 2bc6c4e..b2a728b 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for Clustal Omega 1.2.1.
+# Generated by GNU Autoconf 2.63 for Clustal Omega 1.2.2.
#
# Report bugs to <clustalw at ucd.ie>.
#
@@ -745,8 +745,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='Clustal Omega'
PACKAGE_TARNAME='clustal-omega'
-PACKAGE_VERSION='1.2.1'
-PACKAGE_STRING='Clustal Omega 1.2.1'
+PACKAGE_VERSION='1.2.2'
+PACKAGE_STRING='Clustal Omega 1.2.2'
PACKAGE_BUGREPORT='clustalw at ucd.ie'
# Factoring default headers for most tests.
@@ -1483,7 +1483,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Clustal Omega 1.2.1 to adapt to many kinds of systems.
+\`configure' configures Clustal Omega 1.2.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1553,7 +1553,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Clustal Omega 1.2.1:";;
+ short | recursive ) echo "Configuration of Clustal Omega 1.2.2:";;
esac
cat <<\_ACEOF
@@ -1659,7 +1659,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Clustal Omega configure 1.2.1
+Clustal Omega configure 1.2.2
generated by GNU Autoconf 2.63
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1673,7 +1673,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Clustal Omega $as_me 1.2.1, which was
+It was created by Clustal Omega $as_me 1.2.2, which was
generated by GNU Autoconf 2.63. Invocation command line was
$ $0 $@
@@ -2492,7 +2492,7 @@ fi
# Define the identity of the package.
PACKAGE='clustal-omega'
- VERSION='1.2.1'
+ VERSION='1.2.2'
cat >>confdefs.h <<_ACEOF
@@ -21790,7 +21790,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Clustal Omega $as_me 1.2.1, which was
+This file was extended by Clustal Omega $as_me 1.2.2, which was
generated by GNU Autoconf 2.63. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -21853,7 +21853,7 @@ Report bugs to <bug-autoconf at gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\
-Clustal Omega config.status 1.2.1
+Clustal Omega config.status 1.2.2
configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/configure.ac b/configure.ac
index d825cb8..aa10e17 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
# configure.ac for Clustal Omega
#
-# RCS $Id: configure.ac 292 2014-02-28 14:26:55Z fabian $
+# RCS $Id: configure.ac 310 2016-06-13 14:11:35Z fabian $
# release
@@ -26,7 +26,9 @@
#PACKAGE_CODENAME="FilumVitae"
#AC_INIT([Clustal Omega], [1.2.0], [clustalw at ucd.ie])
#PACKAGE_CODENAME="AndreaGiacomo"
-AC_INIT([Clustal Omega], [1.2.1], [clustalw at ucd.ie])
+#AC_INIT([Clustal Omega], [1.2.1], [clustalw at ucd.ie])
+#PACKAGE_CODENAME="AndreaGiacomo"
+AC_INIT([Clustal Omega], [1.2.2], [clustalw at ucd.ie])
PACKAGE_CODENAME="AndreaGiacomo"
# The AC_INIT macro can take any source file as an argument. It just
diff --git a/src/clustal-omega-config.h b/src/clustal-omega-config.h
index 300343a..ba86373 100644
--- a/src/clustal-omega-config.h
+++ b/src/clustal-omega-config.h
@@ -199,7 +199,9 @@
/* #undef MINGW */
/* No-debug Mode */
-/* #undef NDEBUG */
+#ifndef CLUSTAL_OMEGA_NDEBUG
+#define CLUSTAL_OMEGA_NDEBUG /**/
+#endif
/* Some strange OS */
/* #undef OTHEROS */
@@ -226,7 +228,7 @@
/* Define to the full name and version of this package. */
#ifndef CLUSTAL_OMEGA_PACKAGE_STRING
-#define CLUSTAL_OMEGA_PACKAGE_STRING "Clustal Omega 1.2.1"
+#define CLUSTAL_OMEGA_PACKAGE_STRING "Clustal Omega 1.2.2"
#endif
/* Define to the one symbol short name of this package. */
@@ -236,7 +238,7 @@
/* Define to the version of this package. */
#ifndef CLUSTAL_OMEGA_PACKAGE_VERSION
-#define CLUSTAL_OMEGA_PACKAGE_VERSION "1.2.1"
+#define CLUSTAL_OMEGA_PACKAGE_VERSION "1.2.2"
#endif
/* The size of `fpos_t', as computed by sizeof. */
@@ -299,7 +301,7 @@
/* Version number of package */
#ifndef CLUSTAL_OMEGA_VERSION
-#define CLUSTAL_OMEGA_VERSION "1.2.1"
+#define CLUSTAL_OMEGA_VERSION "1.2.2"
#endif
/* Define if using the dmalloc debugging malloc package */
diff --git a/src/clustal-omega.c b/src/clustal-omega.c
index 8fff342..f6ccfd2 100644
--- a/src/clustal-omega.c
+++ b/src/clustal-omega.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: clustal-omega.c 290 2013-09-20 15:18:12Z fabian $
+ * RCS $Id: clustal-omega.c 304 2016-06-13 13:39:13Z fabian $
*/
#ifdef HAVE_CONFIG_H
@@ -207,7 +207,8 @@ SetDefaultAlnOpts(opts_t *prOpts) {
prOpts->ppcHMMInput = NULL;
prOpts->iHMMInputFiles = 0;
-
+ prOpts->pcHMMBatch = NULL; /* FS, r291 -> */
+
prOpts->iNumIterations = 0;
prOpts->bIterationsAuto = FALSE;
prOpts->iMaxGuidetreeIterations = INT_MAX;
@@ -958,7 +959,7 @@ SetAutoOptions(opts_t *prOpts, int iNumSeq) {
int
Align(mseq_t *prMSeq,
mseq_t *prMSeqProfile,
- opts_t *prOpts) {
+ opts_t *prOpts) { /* Note DEVEL 291: at this stage pppcHMMBNames is set but ppiHMMBindex is not */
/* HMM
*/
@@ -980,6 +981,10 @@ Align(mseq_t *prMSeq,
/* last dAlnScore for iteration */
double dLastAlnScore = -666.666;
+ /* HMM batch file */
+ char **ppcHMMbatch = NULL; /* names of unique HMM files */
+ int iHMMbatch = 0; /* number of unique HMM files */
+
int i, j; /* aux */
assert(NULL != prMSeq);
@@ -1027,17 +1032,57 @@ Align(mseq_t *prMSeq,
}
- /* Read backgrounds HMMs and store in prHMMs
+ /* Read backgrounds HMMs and store in prHMMs (Devel 291)
*
*/
- if (0 < prOpts->iHMMInputFiles) {
+ if (NULL != prOpts->pcHMMBatch){
+ int i, j, k;
+
+ for (i = 0; i < prMSeq->nseqs; i++){
+
+ if (NULL != prMSeq->pppcHMMBNames[i]){
+ for (j = 0; NULL != prMSeq->pppcHMMBNames[i][j]; j++){
+
+ for (k = 0; k < iHMMbatch; k++){
+ if (0 == strcmp(ppcHMMbatch[k], prMSeq->pppcHMMBNames[i][j])){
+ prMSeq->ppiHMMBindex[i][j] = k;
+ break; /* HMM already registered */
+ }
+ } /* went through HMM batch files already identified */
+ if (k == iHMMbatch){
+ FILE *pfHMM = NULL;
+ if (NULL == (pfHMM = fopen(prMSeq->pppcHMMBNames[i][j], "r"))){
+ prMSeq->ppiHMMBindex[i][j] = -1;
+ Log(&rLog, LOG_WARN, "Background HMM %s for %s (%d/%d) does not exist",
+ prMSeq->pppcHMMBNames[i][j], prMSeq->sqinfo[i].name, i, j);
+ }
+ else {
+ fclose(pfHMM); pfHMM = NULL;
+ ppcHMMbatch = (char **)realloc(ppcHMMbatch, (iHMMbatch+1)*sizeof(char *));
+ ppcHMMbatch[iHMMbatch] = strdup(prMSeq->pppcHMMBNames[i][j]);
+ prMSeq->ppiHMMBindex[i][j] = k;
+ iHMMbatch++;
+ }
+ }
+
+ } /* j = 0; NULL != prMSeq->pppcHMMBNames[i][j] */
+ } /* NULL != prMSeq->pppcHMMBNames[i] */
+ else {
+ /* void */
+ }
+ } /* 0 <= i < prMSeq->nseqs */
+
+ } /* there was a HMM batch file */
+
+
+ if (0 < prOpts->iHMMInputFiles) {
int iHMMInfileIndex;
/**
* @warning old structure used to be initialised like this:
* hmm_light rHMM = {0};
*/
- prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light));
+ prHMMs = (hmm_light *) CKMALLOC( (prOpts->iHMMInputFiles) * sizeof(hmm_light));
for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) {
char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex];
@@ -1077,6 +1122,24 @@ Align(mseq_t *prMSeq,
CKFREE(prOpts->ppcHMMInput);
} /* there were background HMM files */
+ /** read HMMs specific to individual sequences
+ */
+ if (iHMMbatch > 0){
+ int i;
+
+ prHMMs = (hmm_light *) realloc( prHMMs, (prOpts->iHMMInputFiles + iHMMbatch + 1) * sizeof(hmm_light));
+
+ for (i = 0; i < iHMMbatch; i++){
+ char *pcHMMInput = ppcHMMbatch[i];
+
+ if (OK != readHMMWrapper(&prHMMs[i + prOpts->iHMMInputFiles], pcHMMInput)){
+ Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput);
+ return -1;
+ }
+
+ } /* 0 <= i < iHMMbatch */
+
+ } /* there were HMM batch files */
/* If the input ("non-profile") sequences are aligned, then turn
@@ -1172,6 +1235,9 @@ Align(mseq_t *prMSeq,
if (prOpts->iMaxHMMIterations < 0){
Log(&rLog, LOG_VERBOSE,
"iMaxHMMIterations < 0 (%d), will not perform alignment", prOpts->iMaxHMMIterations);
+ if (NULL != piOrderLR){
+ free(piOrderLR); piOrderLR = NULL;
+ }
return 0;
}
diff --git a/src/clustal-omega.h b/src/clustal-omega.h
index e6360db..d0d483c 100644
--- a/src/clustal-omega.h
+++ b/src/clustal-omega.h
@@ -121,6 +121,8 @@ typedef struct {
/** number of provided HMM input files. not really a user
option but need for ppcHMMInput */
int iHMMInputFiles;
+ /** HMM batch-file, specify HMMs for individual sequences. FS, r291 -> */
+ char *pcHMMBatch;
/* Iteration
*/
diff --git a/src/clustal/hhalign_wrapper.c b/src/clustal/hhalign_wrapper.c
index 287d90b..269fc56 100644
--- a/src/clustal/hhalign_wrapper.c
+++ b/src/clustal/hhalign_wrapper.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhalign_wrapper.c 290 2013-09-20 15:18:12Z fabian $
+ * RCS $Id: hhalign_wrapper.c 306 2016-06-13 13:49:04Z fabian $
*/
#ifdef HAVE_CONFIG_H
@@ -50,7 +50,7 @@
*/
void SetDefaultHhalignPara(hhalign_para *prHhalignPara)
{
- prHhalignPara->iMacRamMB = 2048; /* 2048 default|give 2GB to MAC algorithm */
+ prHhalignPara->iMacRamMB = 8000; /* default|give just under 8GB to MAC algorithm, FS, 2016-04-18, went from 2GB to 8GB */
prHhalignPara->bIsDna = false; /* protein mode unless we say otherwise */
prHhalignPara->bIsRna = false;
prHhalignPara->pca = -UNITY;
@@ -550,7 +550,8 @@ PosteriorProbabilities(mseq_t *prMSeq, hmm_light rHMMalignment, hhalign_para rHh
zcError[0] = '\0';
hhalign(ppcCopy, 1, NULL,
ppcRepresent, 0, NULL,
- &dScore, &rHMMalignment, NULL, NULL, NULL, NULL,
+ &dScore, &rHMMalignment, &rHMMalignment,
+ NULL, NULL, NULL, NULL,
rHhalignPara, &prHHscores[iS], 0/* debug */, FALSE /* Log-Level*/,
zcAux, zcError);
if (NULL != strstr(zcError, "Viterbi")){
@@ -682,7 +683,7 @@ PileUp(mseq_t *prMSeq, hhalign_para rHhalignPara, int iClustersize)
* external HMM not used at this stage (prHMM=NULL) */
iHHret = hhalign(ppcProfilePro, iI, NULL/*pdWeightL*/,
ppcProfileSeq, 1, NULL/*pdWeightR*/,
- &dScore, prHMM,
+ &dScore, prHMM, prHMM,
pcConsensPro, pcReprsntPro,
pcConsensSeq, pcReprsntSeq,
rHhalignPara, &rHHscores,
@@ -732,7 +733,7 @@ PileUp(mseq_t *prMSeq, hhalign_para rHhalignPara, int iClustersize)
prHMM = &rHMMprofile;
hhalign(ppcReprsnt, 0, NULL,
ppcProfileSeq, 1, NULL,
- &dScore, prHMM,
+ &dScore, prHMM, prHMM,
pcConsensPro, pcReprsntPro,
pcConsensSeq, pcReprsntSeq,
rHhalignPara, &rHHscores,
@@ -893,7 +894,10 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
double *pdWeightsL = NULL; /* sequence weights of left profile */
double *pdWeightsR = NULL; /* sequence weights of right profile */
int iMergeNodeCounter = 0;
- hmm_light *prHMM = NULL;
+ hmm_light *prHMM = NULL;
+ hmm_light *prHMMleft = NULL;
+ hmm_light *prHMMrght = NULL;
+ hmm_light *prHMMnull = NULL;
bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
#if TIMING
char zcStopwatchMsg[1024];
@@ -903,15 +907,28 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
#endif
hhalign_scores rHHscores = {0};
- if (NULL != prHMMList) {
+#if 0 /* DEVEL 291 */
+ if (NULL != prHMMList) { /* FIXME DEVEL 291: replace this outer test with iHMMCount>0*/
if (iHMMCount>1) {
Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
}
- prHMM = &(prHMMList[0]);
+ prHMM = &(prHMMList[0]); /* FIXME DEVEL 291: check for NULL */
} else {
/* FIXME: prHMM not allowed to be NULL and therefore pseudo allocated here */
prHMM = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
}
+#else
+ prHMMnull = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
+ if ( (iHMMCount > 0) && (NULL != prHMMList) ){
+ prHMM = &(prHMMList[0]);
+ if (iHMMCount > 1) {
+ Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
+ }
+ }
+ else {
+ prHMM = prHMMnull; /* prHMM not allowed to be NULL and therefore assigned to pseudo allocated */
+ }
+#endif
assert(NULL!=prMSeq);
@@ -1075,7 +1092,38 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
}
}
+#if 1 /* DEVEL 291 */
+ /*
+ * Note: if there is a HMM-batch file, then prMSeq->ppiHMMBindex is not NULL;
+ * ppiLeafList[iL/iR][0] is the 'lead' sequence in a profile;
+ * prMSeq->ppiHMMBindex[ppiLeafList[iL][0]] are the HMMs associated with the lead sequence;
+ * this could be NULL if there are no HMMs associated with this particular sequence
+ * at the moment only 1st HMM can be used, prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0];
+ * the index of this HMM can be '-1' if the specified HMM file does not exist
+ * Note: we only use prHMMleft/prHMMrght, even if global HMM (--hmm-in) is specified
+ **/
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) &&
+ (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] > -1) ){
+ prHMMleft = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0]]);
+ }
+ else if (iHMMCount > 0){
+ prHMMleft = prHMM;
+ }
+ else {
+ prHMMleft = prHMMnull;
+ }
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) &&
+ (prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] > -1) ){
+ prHMMrght = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]]);
+ }
+ else if (iHMMCount > 0){
+ prHMMrght = prHMM;
+ }
+ else {
+ prHMMrght = prHMMnull;
+ }
+#endif
/* align individual sequences to HMM;
* - use representative sequence to get gapping
@@ -1086,15 +1134,16 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
* full HMM but that does not seem to work at all
* -- try harder! Fail better!
*/
- if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
+ /*if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
+ if (0 != prHMMleft->L){
int i, j;
- pcReprsnt1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt1[i] = prHMM->seq[prHMM->ncons][i+1];
+ pcReprsnt1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ for (i = 0; i < prHMMleft->L; i++){
+ pcReprsnt1[i] = prHMMleft->seq[prHMMleft->ncons][i+1];
}
ppcCopy1 = CKCALLOC(piLeafCount[iL], sizeof(char *));
for (j = 0; j < piLeafCount[iL]; j++){
- ppcCopy1[j] = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ ppcCopy1[j] = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
for (i = 0; i < (int) strlen(ppcProfile1[0]); i++){
ppcCopy1[j][i] = ppcProfile1[j][i];
}
@@ -1118,13 +1167,13 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
FS, r236 -> r237
*/
int iLenA = strlen(ppcCopy1[0]);
- int iLenH = prHMM->L;
+ int iLenH = prHMMleft->L;
int iHHret = 0;
if (iLenH < iLenA){
iHHret = hhalign(ppcReprsnt1, 0/* only one representative seq*/, NULL,
ppcCopy1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMleft,
NULL, NULL, NULL, NULL,
rHhalignPara, &rHHscores,
iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
@@ -1133,7 +1182,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
else {
iHHret = hhalign(ppcCopy1, piLeafCount[iL], pdWeightsL,
ppcReprsnt1, 0/* only one representative seq*/, NULL,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMleft,
NULL, NULL, NULL, NULL,
rHhalignPara, &rHHscores,
iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
@@ -1159,8 +1208,8 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
* it only contains a gap if all sequences of the profile
* have a gap at this position
*/
- pcConsens1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
+ pcConsens1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ for (i = 0; i < prHMMleft->L; i++){
for (j = 0, pcConsens1[i] = '-'; (j < piLeafCount[iL]) && ('-' == pcConsens1[i]); j++){
pcConsens1[i] = ppcCopy1[j][i];
}
@@ -1173,16 +1222,17 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
#endif
} /* ( (1 == piLeafCount[iL]) && (0 != prHMM->L) ) */
- if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
+ /*if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
+ if (0 != prHMMrght->L){
int i, j;
- pcReprsnt2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt2[i] = prHMM->seq[prHMM->ncons][i+1];
+ pcReprsnt2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ for (i = 0; i < prHMMrght->L; i++){
+ pcReprsnt2[i] = prHMMrght->seq[prHMMrght->ncons][i+1];
}
ppcCopy2 = CKCALLOC(piLeafCount[iR], sizeof(char *));
for (j = 0; j < piLeafCount[iR]; j++){
- ppcCopy2[j] = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ ppcCopy2[j] = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
for (i = 0; i < (int) strlen(ppcProfile2[0]); i++){
ppcCopy2[j][i] = ppcProfile2[j][i];
}
@@ -1206,13 +1256,13 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
FS, r236 -> r237
*/
int iLenA = strlen(ppcCopy2[0]);
- int iLenH = prHMM->L;
+ int iLenH = prHMMrght->L;
int iHHret = 0;
if (iLenH < iLenA){
iHHret = hhalign(ppcReprsnt2, 0/* only one representative seq */, NULL,
ppcCopy2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMrght,
NULL, NULL, NULL, NULL,
rHhalignPara, &rHHscores,
iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
@@ -1221,7 +1271,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
else {
iHHret = hhalign(ppcCopy2, piLeafCount[iR], pdWeightsR,
ppcReprsnt2, 0/* only one representative seq */, NULL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMrght,
NULL, NULL, NULL, NULL,
rHhalignPara, &rHHscores,
iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
@@ -1247,8 +1297,8 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
* it only contains a gap if all sequences of the profile
* have a gap at this position
*/
- pcConsens2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
+ pcConsens2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ for (i = 0; i < prHMMrght->L; i++){
for (j = 0, pcConsens2[i] = '-'; (j < piLeafCount[iR]) && ('-' == pcConsens2[i]); j++){
pcConsens2[i] = ppcCopy2[j][i];
}
@@ -1291,7 +1341,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
int iOldMacRam = rHhalignPara.iMacRamMB;
iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMrght,
pcConsens1, pcReprsnt1,
pcConsens2, pcReprsnt2,
rHhalignPara, &rHHscores,
@@ -1310,6 +1360,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
the only thing we can do (easily) is to re-run it in Viterbi mode,
for this set MAC-RAM=0, set it back to its original value after 2nd try.
FS, r241 -> r243 */
+
if (RETURN_FROM_MAC == iHHret){
/* Note: the default way to run hhalign() is to initially select MAC
by giving it all the memory it needs. MAC may fail due to overflow (repeats?).
@@ -1328,7 +1379,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMrght,
pcConsens1, pcReprsnt1,
pcConsens2, pcReprsnt2,
rHhalignPara, &rHHscores,
@@ -1353,7 +1404,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
int iOldMacRam = rHhalignPara.iMacRamMB;
iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMleft,
pcConsens2, pcReprsnt2,
pcConsens1, pcReprsnt1,
rHhalignPara, &rHHscores,
@@ -1372,6 +1423,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
the only thing we can do (easily) is to re-run it in Viterbi mode,
for this set MAC-RAM=0, set it back to its original value after 2nd try.
FS, r241 -> r243 */
+
if (RETURN_FROM_MAC == iHHret){
/* see above */
rHhalignPara.iMacRamMB = 0;
@@ -1382,7 +1434,7 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMleft,
pcConsens2, pcReprsnt2,
pcConsens1, pcReprsnt1,
rHhalignPara, &rHHscores,
@@ -1402,7 +1454,56 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
} /* 1st invocation failed */
} /* 2nd profile was shorter than 1st */
-
+
+ /*
+ * at this stage have performed alignment of 2 profiles/sequences.
+ * if HMM batch information had been used then have choices:
+ * (i) if HMM info only intended for initial alignment (of sequences) then make both HMMs stale;
+ * (iia) if alignment of 2 profiles/sequences where same HMM used, then retain;
+ * (iib) if alignment of 2 profiles/sequences where different HMMs used, then make both stale;
+ * (iii) some majority voting
+ */
+#if 0 /* always make HMM batch stale (after 1st invocation) */
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) ){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1;
+ }
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1;
+ }
+#else /* retain HMMs if they were the same for both profiles */
+ if (NULL != prMSeq->ppiHMMBindex) {
+ int i;
+
+ if ( (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
+ if ( prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] == -1){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is conservative, could do H[iL] = H[iR] */
+ for (i = 0; i < piLeafCount[iR]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
+ }
+ }
+ else if ( prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] == -1){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is conservative, could do H[iR] = H[iL] */
+ for (i = 0; i < piLeafCount[iL]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
+ }
+ }
+ else if (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is NOT conservative, mandatory */
+ for (i = 0; i < piLeafCount[iL]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
+ }
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is NOT conservative, mandatory */
+ for (i = 0; i < piLeafCount[iR]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
+ }
+ }
+ else {
+ /* void, HMMs should be same */
+ }
+ }
+ } /* there was a HMM batch */
+#endif
+
if (rLog.iLogLevelEnabled <= LOG_DEBUG){
int i;
@@ -1513,9 +1614,13 @@ HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
TranslateUnknown2Ambiguity(prMSeq);
ReAttachLeadingGaps(prMSeq, iProfProfSeparator);
+#if 0 /* DEVEL 291 */
if (NULL == prHMMList){
CKFREE(prHMM);
}
+#else
+ CKFREE(prHMMnull);
+#endif
CKFREE(ppcProfile2);
CKFREE(ppcProfile1);
CKFREE(ppiLeafList[piOrderLR[DIFF_NODE*(iNodeCount-1)+PRNT_NODE]]);
diff --git a/src/clustal/ktuple_pair.c b/src/clustal/ktuple_pair.c
index 3ea9ca9..3e49a02 100644
--- a/src/clustal/ktuple_pair.c
+++ b/src/clustal/ktuple_pair.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $
+ * RCS $Id: ktuple_pair.c 305 2016-06-13 13:46:02Z fabian $
*
*
* K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID
@@ -649,7 +649,7 @@ KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq,
/* int uStepNo, uTotalStepNo; */
ktuple_param_t aln_param = default_protein_param;
bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
-
+
if(prProgress == NULL) {
NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
@@ -822,7 +822,9 @@ KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq,
#pragma omp critical(ktuple)
#if 0
{
- printf("steps: %d\n", private_step_no);
+ int tid;
+ tid = omp_get_thread_num();
+ printf("%s:%d: tid %d: steps %d\n", __FILE__, __LINE__, tid, private_step_no);
}
#endif
#endif
diff --git a/src/clustal/mbed.c b/src/clustal/mbed.c
index 0977607..33fd2de 100644
--- a/src/clustal/mbed.c
+++ b/src/clustal/mbed.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: mbed.c 283 2013-06-10 17:42:14Z fabian $
+ * RCS $Id: mbed.c 300 2016-06-13 13:29:58Z fabian $
*
*
* Reimplementation from scratch of mBed (Blackshields et al., 2010;
@@ -306,9 +306,9 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
int iSeqIndex;
int iSeedIndex;
/* indices for restoring order */
- int *restore;
+ int *restore = NULL;
/* sorted copy of piSeeds */
- int *piSortedSeeds;
+ int *piSortedSeeds = NULL;
#if TIMING
Stopwatch_t *stopwatch = StopwatchCreate();
@@ -446,6 +446,7 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
FreeSymMatrix(&prDistmat);
CKFREE(restore);
+ CKFREE(piSortedSeeds);
#if TIMING
StopwatchStop(stopwatch);
StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
@@ -1267,8 +1268,8 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
for (iI = 0; iI < prKMeansResult->iNClusters; iI++) {
for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
- fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s )\t %s\n",
- iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name, ppcClusterSplits[iRealIndex]);
+ fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n",
+ iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]);
}
}
fclose(pfClust); pfClust = NULL;
diff --git a/src/clustal/pair_dist.c b/src/clustal/pair_dist.c
index 7d318b5..e6dbdc3 100644
--- a/src/clustal/pair_dist.c
+++ b/src/clustal/pair_dist.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: pair_dist.c 288 2013-07-29 13:15:50Z andreas $
+ * RCS $Id: pair_dist.c 301 2016-06-13 13:32:55Z fabian $
*/
#ifdef HAVE_CONFIG_H
@@ -35,6 +35,9 @@
#include "progress.h"
#include "util.h"
+/* Made iend/jend const unsigned long int (originally just int), FS, 2016-04-04
+ */
+
/* Up to rev 173 we had a USE_SYM_KTUPLE switch implemented here. When active
* ktuple distances were computed twice for each pair and averaged. Idea was
@@ -57,8 +60,8 @@ KimuraCorrection(double frac_id);
static int
SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
- int istart, int iend,
- int jstart, int jend,
+ int istart, const unsigned long int iend,
+ int jstart, const unsigned long int jend,
bool use_KimuraCorrection, progress_t *prProgress,
unsigned long int *ulStepNo, unsigned long int ulTotalStepNo);
@@ -167,8 +170,8 @@ KimuraCorrection(double p)
*/
int
SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
- int istart, int iend,
- int jstart, int jend,
+ int istart, const unsigned long int iend,
+ int jstart, const unsigned long int jend,
bool use_kimura, progress_t *prProgress,
unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
{
@@ -272,8 +275,8 @@ SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
*/
int
PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type, bool bPercID,
- int istart, int iend,
- int jstart, int jend,
+ int istart, const unsigned long int iend,
+ int jstart, const unsigned long int jend,
char *fdist_in, char *fdist_out)
{
int uSeqIndex;
@@ -315,26 +318,33 @@ PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type, bool bPerc
hence making even chunk sizes is slightly fiddlier
*/
ulTotalStepNo = iend*jend - iend*iend/2 + iend/2;
-
+
/* FIXME: can get rid of iChunkStart, iChunkEnd now that we're using the arrays */
iChunkStart = iend;
for(iChunk = 0; iChunk <= iNumberOfThreads; iChunk++)
{
iChunkEnd = iChunkStart;
- if(iChunk == iNumberOfThreads - 1)
+ if (iChunk == iNumberOfThreads - 1){
iChunkStart = 0;
- else
+ }
+ else if (iend == jend){
iChunkStart = iend - ((double)(iend - istart) * sqrt(((double)iChunk + 1.0)/(double)iNumberOfThreads));
+ }
+ else {
+ iChunkStart = iend - (iend - istart) * (iChunk + 1) / (double)(iNumberOfThreads);
+ }
iChunkStarts[iChunk] = iChunkStart;
iChunkEnds[iChunk] = iChunkEnd;
+ /*printf("%s:%d: C=%d, ie=%d, is=%d, je=%d, js=%d, Cstart=%d, Cend=%d, diff=%d\n",
+ __FILE__, __LINE__, iChunk, iend, istart, jend, jstart, iChunkStart, iChunkEnd, iChunkEnd-iChunkStart);*/
}
if (PAIRDIST_KTUPLE == pairdist_type) {
Log(&rLog, LOG_INFO, "Calculating pairwise ktuple-distances...");
-
+
NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
- "Ktuple-distance calculation progress", bPrintCR);
+ "Ktuple-distance calculation progress", bPrintCR);
#ifdef HAVE_OPENMP
#pragma omp parallel for private(iChunk) schedule(dynamic)
#endif
@@ -394,7 +404,7 @@ PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type, bool bPerc
}
#endif /* random/proper distance calculation */
-
+
/* optional printing of matrix to file
*/
if (NULL != fdist_out) {
@@ -420,7 +430,7 @@ PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type, bool bPerc
ProgressDone(prProgress);
FreeProgress(&prProgress);
}
-
+
return 0;
}
/*** end: PairDistances() ***/
diff --git a/src/clustal/pair_dist.h b/src/clustal/pair_dist.h
index d210329..e760506 100644
--- a/src/clustal/pair_dist.h
+++ b/src/clustal/pair_dist.h
@@ -13,7 +13,7 @@
********************************************************************/
/*
- * RCS $Id: pair_dist.h 283 2013-06-10 17:42:14Z fabian $
+ * RCS $Id: pair_dist.h 302 2016-06-13 13:35:50Z fabian $
*/
@@ -34,8 +34,8 @@
extern int
PairDistances(symmatrix_t **distmat, mseq_t *mseq, const int pairdist_type, bool bPercID,
- const int istart, const int iend,
- const int jstart, const int jend,
+ const int istart, const unsigned long int iend,
+ const int jstart, const unsigned long int jend,
char *fdist_in, char *fdist_out);
#endif
diff --git a/src/clustal/seq.c b/src/clustal/seq.c
index a7e9323..27cca14 100644
--- a/src/clustal/seq.c
+++ b/src/clustal/seq.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: seq.c 291 2014-02-27 18:20:54Z fabian $
+ * RCS $Id: seq.c 298 2014-11-07 12:18:36Z fabian $
*
*
* Module for sequence/alignment IO and misc.
@@ -37,7 +37,7 @@
#include "util.h"
#include "log.h"
#include "seq.h"
-
+/*#include "../../mymemmonitor.h"*/
#define ALLOW_ONLY_PROTEIN 0 // DD
@@ -419,11 +419,11 @@ SeqTypeToStr(int iSeqType)
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
- int iMaxNumSeq, int iMaxSeqLen)
+ int iMaxNumSeq, int iMaxSeqLen, char *pcHMMBatch)
{
SQFILE *dbfp; /* sequence file descriptor */
- char *cur_seq;
- SQINFO cur_sqinfo;
+ char *cur_seq = NULL;
+ SQINFO cur_sqinfo = {0};
int iSeqIdx; /* sequence counter */
int iSeqPos; /* sequence string position counter */
@@ -462,8 +462,8 @@ ReadSequences(mseq_t *prMSeq, char *seqfile,
*/
while (ReadSeq(dbfp, dbfp->format,
&cur_seq,
- &cur_sqinfo)) {
-
+ &cur_sqinfo)) {
+
if (prMSeq->nseqs+1>iMaxNumSeq) {
Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
iMaxNumSeq, cur_sqinfo.name, seqfile);
@@ -489,7 +489,8 @@ ReadSequences(mseq_t *prMSeq, char *seqfile,
prMSeq->sqinfo = (SQINFO *)
- CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
+ CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
+ memset(&prMSeq->sqinfo[prMSeq->nseqs], 0, sizeof(SQINFO));
SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);
#ifdef TRACE
@@ -549,7 +550,7 @@ ReadSequences(mseq_t *prMSeq, char *seqfile,
prMSeq->nseqs++;
- FreeSequence(cur_seq, &cur_sqinfo);
+ FreeSequence(cur_seq, &cur_sqinfo);
}
SeqfileClose(dbfp);
@@ -616,6 +617,83 @@ ReadSequences(mseq_t *prMSeq, char *seqfile,
Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);
+ prMSeq->pppcHMMBNames = NULL;
+ prMSeq->ppiHMMBindex = NULL;
+
+ /* read HMM-batch file if existent */
+ if (NULL != pcHMMBatch) {
+
+ enum {MAXLINE=10000};
+ FILE *pfHMMBatch = NULL;
+ char zcScanline[MAXLINE] = {0};
+ char *pcToken = NULL;
+ char *pcSeqName = NULL;
+ int iSeq = 0;
+
+ /* check that file exists */
+ if (NULL == (pfHMMBatch = fopen(pcHMMBatch, "r"))){
+ Log(&rLog, LOG_ERROR, "Failed to open HMM-batch file %s for reading", pcHMMBatch);
+ return -1;
+ }
+
+ /* initialise names and indices */
+ prMSeq->pppcHMMBNames = (char ***)CKMALLOC(prMSeq->nseqs * sizeof(char **));
+ for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
+ prMSeq->pppcHMMBNames[iSeq] = NULL;
+ }
+ prMSeq->ppiHMMBindex = (int **)CKMALLOC(prMSeq->nseqs * sizeof(int *));
+ for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
+ prMSeq->ppiHMMBindex[iSeq] = (int *)CKMALLOC(1 * sizeof(int));
+ prMSeq->ppiHMMBindex[iSeq][0] = -1;
+ }
+
+ /* read batch file line-by-line */
+ while (NULL != fgets(zcScanline, MAXLINE, pfHMMBatch)){
+
+ pcToken = strtok(zcScanline, " \040\t\n");
+ if (NULL == pcToken){
+ continue;
+ }
+ else {
+ pcSeqName = pcToken;
+ }
+ /* identify sequence label from batch file in labels read from sequence file */
+ for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
+ int iHMM = 0;
+
+ if (0 == strcmp(pcSeqName, prMSeq->sqinfo[iSeq].name)){
+
+ while (NULL != (pcToken = strtok(NULL, " \040\t\n"))){
+ prMSeq->pppcHMMBNames[iSeq] = (char **)CKREALLOC(prMSeq->pppcHMMBNames[iSeq],
+ (iHMM+2) * sizeof(char *));
+ prMSeq->pppcHMMBNames[iSeq][iHMM] = CkStrdup(pcToken);
+ prMSeq->ppiHMMBindex[iSeq] = (int *)CKREALLOC(prMSeq->ppiHMMBindex[iSeq],
+ (iHMM+2) * sizeof(int));
+ prMSeq->ppiHMMBindex[iSeq][iHMM] = 0;
+ iHMM++;
+ prMSeq->pppcHMMBNames[iSeq][iHMM] = NULL;
+ prMSeq->ppiHMMBindex[iSeq][iHMM] = 0;
+ }
+ break;
+ }
+
+ } /* 0 <= iSeq < prMSeq->nseqs */
+ if (iSeq >= prMSeq->nseqs) {
+ Log(&rLog, LOG_WARN,
+ "sequence %s not found in input sequences (%s), will be ignored",
+ pcSeqName, seqfile);
+ }
+
+ } /* !EOF */
+
+ fclose(pfHMMBatch); pfHMMBatch = NULL;
+
+ } /* there was a HMM batch file */
+ else {
+ prMSeq->pppcHMMBNames = NULL;
+ prMSeq->ppiHMMBindex = NULL;
+ } /* there was no HMM batch file */
+
return 0;
}
/*** end: ReadSequences ***/
@@ -644,6 +722,8 @@ NewMSeq(mseq_t **prMSeq)
(*prMSeq)->sqinfo = NULL;
(*prMSeq)->filename = NULL;
(*prMSeq)->tree_order = NULL;
+ (*prMSeq)->pppcHMMBNames = NULL;
+ (*prMSeq)->ppiHMMBindex = NULL;
}
/*** end: NewMSeq ***/
@@ -765,6 +845,14 @@ FreeMSeq(mseq_t **mseq)
CKFREE((*mseq)->tree_order);
}
+ if (NULL != (*mseq)->pppcHMMBNames){ /* FS, r291 -> */
+ for (i = 0; (*mseq)->pppcHMMBNames[i] && (i < (*mseq)->nseqs); i++){
+ int iIter = 0;
+ for (iIter = 0; NULL != (*mseq)->pppcHMMBNames[i][iIter]; iIter++){
+ CKFREE((*mseq)->pppcHMMBNames[i][iIter]);
+ }
+ }
+ }
(*mseq)->seqtype = SEQTYPE_UNKNOWN;
(*mseq)->nseqs = 0;
diff --git a/src/clustal/seq.h b/src/clustal/seq.h
index 17f1113..8f9fe1b 100644
--- a/src/clustal/seq.h
+++ b/src/clustal/seq.h
@@ -13,7 +13,7 @@
********************************************************************/
/*
- * RCS $Id: seq.h 289 2013-09-17 10:09:37Z fabian $
+ * RCS $Id: seq.h 296 2014-10-07 12:15:41Z fabian $
*/
#ifndef CLUSTALO_SEQ_H
@@ -111,6 +111,10 @@ typedef struct {
*
*/
SQINFO *sqinfo;
+
+ /* HMM batch information */
+ char ***pppcHMMBNames;
+ int **ppiHMMBindex;
} mseq_t;
extern void
@@ -131,7 +135,7 @@ SeqTypeToStr(int seqtype);
extern int
ReadSequences(mseq_t *prMSeq_p, char *pcSeqFile,
int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
- int iMaxNumSeq, int iMaxSeqLen);
+ int iMaxNumSeq, int iMaxSeqLen, char *pcHMMBatch);
extern void
NewMSeq(mseq_t **mseq);
diff --git a/src/clustal/symmatrix.c b/src/clustal/symmatrix.c
index 350ca65..b37d883 100644
--- a/src/clustal/symmatrix.c
+++ b/src/clustal/symmatrix.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: symmatrix.c 288 2013-07-29 13:15:50Z andreas $
+ * RCS $Id: symmatrix.c 303 2016-06-13 13:37:47Z fabian $
*
*
* Functions for symmetric (square) matrices including diagonal.
@@ -77,6 +77,10 @@ NewSymMatrix(symmatrix_t **symmat, int nrows, int ncols)
fprintf(stderr, "Couldn't allocate memory (%s|%s)\n",
__FILE__, __FUNCTION__);
return -1;
+ }
+ else {
+ (*symmat)->nrows = nrows;
+ (*symmat)->ncols = ncols;
}
(*symmat)->data = (double **) malloc(nrows * sizeof(double *));
@@ -87,6 +91,38 @@ NewSymMatrix(symmatrix_t **symmat, int nrows, int ncols)
*symmat = NULL;
return -1;
}
+
+#ifdef HAVE_OPENMP
+ /* one malloc for the full matrix data structure
+ assumes ncols >= nrows (asserted above) */
+ double *mat_data;
+ int elements_to_date=0;
+ if ((mat_data = (double *)malloc(((nrows+1)*nrows/2 + (ncols-nrows)*nrows) * sizeof(double))) == NULL) {
+ fprintf(stderr, "Couldn't allocate MPI memory (%s|%s)\n", __FILE__, __FUNCTION__);
+ free((*symmat)->data);
+ free(*symmat);
+ *symmat = NULL;
+ return -1;
+ }
+ for (i=0; i<nrows; i++) {
+ (*symmat)->data[i] = &mat_data[elements_to_date];
+ elements_to_date += ncols-i;
+#ifdef TRACE
+ fprintf(stderr, "DEBUG(%s|%s():%d) initialising symmat with the number of the beast\n",
+ __FILE__, __FUNCTION__, __LINE__);
+#endif
+ {
+ int j;
+ for (j=0; j<ncols-i; j++) {
+#ifdef TRACE
+ (*symmat)->data[i][j] = -666.0;
+#else
+ (*symmat)->data[i][j] = 0.0;
+#endif
+ }
+ }
+ }
+#else /* not HAVE_OPENMP */
for (i=0; i<nrows; i++) {
(*symmat)->data[i] = (double *) calloc((ncols-i), sizeof(double));
if (NULL == (*symmat)->data[i]) {
@@ -112,10 +148,8 @@ NewSymMatrix(symmatrix_t **symmat, int nrows, int ncols)
}
#endif
}
+#endif /* HAVE_OPENMP */
- (*symmat)->nrows = nrows;
- (*symmat)->ncols = ncols;
-
return 0;
}
/*** end: new_symmatrix ***/
@@ -242,12 +276,18 @@ SymMatrixGetValueP(double **val, symmatrix_t *symmat,
void
FreeSymMatrix(symmatrix_t **symmat)
{
+#ifndef HAVE_OPENMP
int i;
+#endif
if (NULL != (*symmat)) {
if (NULL != (*symmat)->data) {
+#ifdef HAVE_OPENMP
+ free((*symmat)->data[0]);
+#else
for (i=0; i<(*symmat)->nrows; i++) {
free((*symmat)->data[i]);
}
+#endif
free((*symmat)->data);
}
}
diff --git a/src/hhalign/hash-C.h b/src/hhalign/hash-C.h
index 24b7815..3ddae4b 100644
--- a/src/hhalign/hash-C.h
+++ b/src/hhalign/hash-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hash-C.h 143 2010-10-14 13:11:14Z andreas $
+ * RCS $Id: hash-C.h 309 2016-06-13 14:10:11Z fabian $
*/
// hash.C
@@ -27,7 +27,7 @@
// * works also if maximal size is exceeded, but gets slower by a factor ~num_keys/num_slots
/*
- * RCS $Id: hash-C.h 143 2010-10-14 13:11:14Z andreas $
+ * RCS $Id: hash-C.h 309 2016-06-13 14:10:11Z fabian $
*/
#ifndef HASH
@@ -59,7 +59,7 @@ using std::ofstream;
#endif
#include "hash.h"
-
+//#include "new_new.h" /* memory tracking */
//////////////////////////////////////////////////////////////////////////////
diff --git a/src/hhalign/hhalign.cpp b/src/hhalign/hhalign.cpp
index 2905f39..6fdcc96 100644
--- a/src/hhalign/hhalign.cpp
+++ b/src/hhalign/hhalign.cpp
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhalign.cpp 290 2013-09-20 15:18:12Z fabian $
+ * RCS $Id: hhalign.cpp 309 2016-06-13 14:10:11Z fabian $
*/
/* hhalign.C:
@@ -677,7 +677,7 @@ ProcessArguments(int argc, char** argv)
extern "C" int
hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
char **ppcSecndProf, int iSecndCnt, double *pdWeightsR,
- double *dScore_p, hmm_light *prHMM,
+ double *dScore_p, hmm_light *prHMM, hmm_light *prHMM2,
char *pcPrealigned1, char *pcRepresent1,
char *pcPrealigned2, char *pcRepresent2,
hhalign_para rHhalignPara, hhalign_scores *rHHscores,
@@ -879,7 +879,7 @@ hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
tL = strlen(ppcSecndProf[0]);
}
else {
- tL = prHMM->L;
+ tL = prHMM2->L;
}
@@ -942,7 +942,7 @@ hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
and add pseudocounts etc. */
t.cQT = 't';
if (OK != ReadAndPrepare(INTERN_ALN_2_HMM,
- ppcSecndProf, iSecndCnt, prHMM,
+ ppcSecndProf, iSecndCnt, prHMM2,
pcPrealigned2, pcRepresent2, pdWeightsR,
infile, t)) {
sprintf(zcError, "%s:%s:%d: Problem Reading/Preparing t-profile (len=%d)\n",
@@ -1344,7 +1344,7 @@ hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
hitlist.Reset();
while (!hitlist.End())
hitlist.ReadNext().Delete(); // Delete content of hit object
-
+
if (strucfile && par.wstruc>0)
{
for (int i=0; i<q.L+2; i++){
@@ -1396,7 +1396,7 @@ hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
t.ClobberGlobal();
}
hitlist.ClobberGlobal();
-
+
return iRetVal;
} /* this is the end of hhalign() //end main */
diff --git a/src/hhalign/hhalign.h b/src/hhalign/hhalign.h
index 5103de6..1942ea5 100644
--- a/src/hhalign/hhalign.h
+++ b/src/hhalign/hhalign.h
@@ -15,14 +15,14 @@
********************************************************************/
/*
- * RCS $Id: hhalign.h 284 2013-06-12 10:10:11Z fabian $
+ * RCS $Id: hhalign.h 296 2014-10-07 12:15:41Z fabian $
*/
extern int
hhalign(char **ppcFirstProf, int iFirstCnt, double *pdWeightsL,
char **ppcSecndProf, int iSecndCnt, double *pdWeightsR,
- double *dScore_p, hmm_light *prHMM,
+ double *dScore_p, hmm_light *prHMM, hmm_light *prHMM2,
char *pcPrealigned1, char *pcRepresent1,
char *pcPrealigned2, char *pcRepresent2,
hhalign_para rHhalignPara, hhalign_scores *rHHscores,
diff --git a/src/hhalign/hhalignment-C.h b/src/hhalign/hhalignment-C.h
index c89aa45..a9aa299 100644
--- a/src/hhalign/hhalignment-C.h
+++ b/src/hhalign/hhalignment-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhalignment-C.h 236 2011-04-14 11:30:04Z fabian $
+ * RCS $Id: hhalignment-C.h 309 2016-06-13 14:10:11Z fabian $
*/
@@ -61,6 +61,11 @@ using std::ofstream;
#include "hhhmm.h"
#endif
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+//#include "new_new.h" /* memory tracking */
+
enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS};
@@ -516,7 +521,7 @@ Alignment::Compress(const char infile[])
i--;
if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths
L=imin(L,i);
- }
+ } // end for (k)
if (unequal_lengths) break;
//Replace GAP with ENDGAP for all end gaps /* MR1 */
@@ -559,6 +564,10 @@ Alignment::Compress(const char infile[])
// Conversion to integer representation, checking for unequal lengths and initialization
if (nres==NULL) nres=new(int[N_in]);
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static), private(l)
+#endif
for (k=0; k<N_in; k++)
{
if (!keep[k]) continue;
@@ -582,6 +591,10 @@ Alignment::Compress(const char infile[])
for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++;
for (a=0; a<20; a++) if(nl[a]) naa++;
if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
for (k=0; k<N_in; k++)
if (keep[k] && (X[k][l]<20) )
{
@@ -601,6 +614,10 @@ Alignment::Compress(const char infile[])
}
// Add up percentage of gaps
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(k)
+#endif
for (l=1; l<=L; l++)
{
float res=0;
@@ -725,6 +742,10 @@ Alignment::Compress(const char infile[])
}
i++;
this->l[i]=l;
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
for (k=0; k<N_in; k++)
{
if (keep[k])
@@ -1032,7 +1053,6 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
int i; // counts residues
int n; // number of sequences accepted so far
-
// Initialize in[k]
for (n=k=0; k<N_in; k++) if (keep[k]==KEEP_ALWAYS) {in[k]=2/*KEEP_ALWAYS??*/; n++;} else in[k]=0;
@@ -1051,7 +1071,9 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
}
// Determine number of residues nres[k]?
- if ( (nres==NULL) || (sizeof(nres)<N_in*sizeof(int)) )
+ // KB: memory leak as sizeof(nres) just gives the size of an int pointer
+ //if ( (nres==NULL) || (sizeof(nres)<N_in*sizeof(int)) )
+ if (nres==NULL)
{
nres=new(int[N_in]);
for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
@@ -1131,7 +1153,7 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
if (diff>=qdiff_max) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
}
// printf(" qsc=%6.2f qid=%6.2f \n",qsc_sum/nres[k],100.0*(1.0-(float)(diff)/nres[k]));
- }
+ } // end for (k)
if (seqid1>seqid2)
{
@@ -1194,6 +1216,10 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
// printf("\n");
// Loop over all candidate sequences kk (-> k)
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(k, i, diff_min_frac, jj, j, first_kj, last_kj, cov_kj, diff_suff, diff)
+#endif
for (kk=0; kk<N_in; kk++)
{
if (inkk[kk]) continue; // seq k already accepted
@@ -1202,7 +1228,16 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
if (keep[k]==KEEP_ALWAYS) {inkk[kk]=2; continue;} // accept all marked sequences (no n++, since this has been done already)
// Calculate max-seq-id threshold seqidk for sequence k (as maximum over idmaxwin[i])
- if (seqid>=100) {in[k]=inkk[kk]=1; n++; continue;}
+// if (seqid>=100) {in[k]=inkk[kk]=1; n++; continue;}
+ if (seqid>=100) {
+ in[k]=inkk[kk]=1;
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ n++;
+ continue;
+ }
float seqidk=seqid1;
for (i=first[k]; i<=last[k]; i++)
if (idmaxwin[i]>seqidk) seqidk=idmaxwin[i];
@@ -1240,8 +1275,18 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
if (jj>=kk) // did loop reach end? => accept k. Otherwise reject k (the shorter of the two)
{
in[k]=inkk[kk]=1;
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
n++;
- for (i=first[k]; i<=last[k]; i++) N[i]++; // update number of sequences at position i
+ for (i=first[k]; i<=last[k]; i++) {
+#if 0
+//#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ N[i]++; // update number of sequences at position i
+ }
// printf("%i %20.20s accepted\n",k,sname[k]);
}
// else
@@ -1556,7 +1601,8 @@ Alignment::FrequenciesAndTransitions(HMM& q, char* in)
for (k=0; k<N_in; k++) X[k][0]=ENDGAP; // make sure that sequences ENTER subalignment j for j=1
for (k=0; k<N_in; k++) X[k][L+1]=ENDGAP; // does it have an influence?
-#ifdef HAVE_OPENMP
+#if 0
+//#ifdef HAVE_OPENMP
if(par.wg != 1)
{
#pragma omp parallel sections
@@ -1836,122 +1882,169 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
// Global weights?
if (par.wg==1)
- for (k=0; k<N_in; k++) wi[k]=wg[k];
+ for (k=0; k<N_in; k++)
+ wi[k]=wg[k];
// Initialization
q.Neff_HMM=0.0f;
Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0]
n = new(int*[L+2]);
- for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
- for (j=1; j<=L; j++)
- for (a=0; a<NAA+3; a++) n[j][a]=0;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(a)
+#endif
+ for (j=1; j<=L; j++) {
+ n[j]=new(int[NAA+3]);
+ for (a=0; a<NAA+3; a++)
+ n[j][a]=0;
+ }
//////////////////////////////////////////////////////////////////////////////////////////////
// Main loop through alignment columns
for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
- {
-
+ {
if (par.wg==0)
- {
-
- change=0;
- // Check all sequences k and update n[j][a] and ri[j] if necessary
- for (k=0; k<N_in; k++)
- {
- if (!in[k]) continue;
- if (X[k][i-1]>=ANY && X[k][i]<ANY)
- { // ... if sequence k was NOT included in i-1 and has to be included for column i
- change=1;
- nseqi++;
- for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
- }
- else if (X[k][i-1]<ANY && X[k][i]>=ANY)
- { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
- change=1;
- nseqi--;
- for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
- }
- } //end for (k)
- nseqs[i]=nseqi;
-
- // If subalignment changed: update weights wi[k] and Neff[i]
- if (change)
- {
- // Initialize weights and numbers of residues for subalignment i
- ncol=0;
- for (k=0; k<N_in; k++) wi[k]=1E-8; // for pathological alignments all wi[k] can get 0; /* MR1 */
-
- // sum wi[k] over all columns j and sequences k of subalignment
- for (j=1; j<=L; j++)
- {
- // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
- if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
- naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
- if (naa==0) continue;
- ncol++;
- for (k=0; k<N_in; k++)
- {
- if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
- {
- // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
- wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
- }
- }
- }
-
- // Check whether number of columns in subalignment is sufficient
- if (ncol<NCOLMIN)
- // Take global weights
- for (k=0; k<N_in; k++)
- if(in[k] && X[k][i]<ANY) wi[k]=wg[k]; else wi[k]=0.0;
-
- // Calculate Neff[i]
- Neff[i]=0.0;
- for (j=1; j<=L; j++)
- {
- // do at least a fraction MAXENDGAPFRA of sequences in subalignment contain an end gap in j?
- if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
- for (a=0; a<20; a++) fj[a]=0;
- for (k=0; k<N_in; k++)
- if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
- fj[ (int)X[k][j] ]+=wi[k];
- NormalizeTo1(fj,NAA);
- for (a=0; a<20; a++)
- if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
- }
- if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
-
- }
-
- else //no update was necessary; copy values for i-1
- {
- Neff[i]=Neff[i-1];
- }
- }
+ {
+ change=0;
+ // Check all sequences k and update n[j][a] and ri[j] if necessary
+ for (k=0; k<N_in; k++)
+ {
+ if (!in[k]) continue;
+ if (X[k][i-1]>=ANY && X[k][i]<ANY)
+ { // ... if sequence k was NOT included in i-1 and has to be included for column i
+ change=1;
+ nseqi++;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+ for (int j=1; j<=L; j++)
+ n[j][ (int)X[k][j]]++;
+ }
+ else if (X[k][i-1]<ANY && X[k][i]>=ANY)
+ { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
+ change=1;
+ nseqi--;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+ for (int j=1; j<=L; j++)
+ n[j][ (int)X[k][j]]--;
+ }
+ } //end for (k)
+ nseqs[i]=nseqi;
+
+ // If subalignment changed: update weights wi[k] and Neff[i]
+ if (change)
+ {
+ // Initialize weights and numbers of residues for subalignment i
+ ncol=0;
+ for (k=0; k<N_in; k++)
+ wi[k]=1E-8; // for pathological alignments all wi[k] can get 0; /* MR1 */
+
+ // sum wi[k] over all columns j and sequences k of subalignment
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(naa, a, k)
+#endif
+ for (j=1; j<=L; j++)
+ {
+ // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
+ if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
+ naa=0;
+ for (a=0; a<20; a++)
+ if(n[j][a])
+ naa++;
+ if (naa==0) continue;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ ncol++;
+ for (k=0; k<N_in; k++)
+ {
+ if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
+ {
+ // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
+ }
+ }
+ } // end for (j)
+
+ // Check whether number of columns in subalignment is sufficient
+ if (ncol<NCOLMIN)
+ // Take global weights
+ for (k=0; k<N_in; k++)
+ if(in[k] && X[k][i]<ANY)
+ wi[k]=wg[k];
+ else wi[k]=0.0;
+
+ // Calculate Neff[i]
+ Neff[i]=0.0;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(a, k, fj)
+#endif
+ for (j=1; j<=L; j++)
+ {
+ // do at least a fraction MAXENDGAPFRA of sequences in subalignment contain an end gap in j?
+ if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
+ for (a=0; a<20; a++) fj[a]=0;
+ for (k=0; k<N_in; k++)
+ if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
+ fj[ (int)X[k][j] ]+=wi[k];
+ NormalizeTo1(fj,NAA);
+ for (a=0; a<20; a++)
+ if (fj[a]>1E-10)
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ Neff[i]-=fj[a]*fast_log2(fj[a]);
+ }
+ if (ncol>0)
+ Neff[i]=pow(2.0,Neff[i]/ncol);
+ else
+ Neff[i]=1.0;
+ } // end change
+
+ else //no update was necessary; copy values for i-1
+ {
+ Neff[i]=Neff[i-1];
+ }
+ } // end par.wg==0
// Calculate amino acid frequencies q.f[i][a] from weights wi[k]
- for (a=0; a<20; a++) q.f[i][a]=0;
- for (k=0; k<N_in; k++) if (in[k]) q.f[i][ (int)X[k][i] ]+=wi[k];
+ for (a=0; a<20; a++)
+ q.f[i][a]=0;
+ for (k=0; k<N_in; k++)
+ if (in[k])
+ q.f[i][ (int)X[k][i] ]+=wi[k];
NormalizeTo1(q.f[i],NAA,pb);
// Calculate transition probabilities from M state
q.tr[i][M2M]=q.tr[i][M2D]=q.tr[i][M2I]=0.0;
for (k=0; k<N_in; k++) //for all sequences
- {
- if (!in[k]) continue;
- //if input alignment is local ignore transitions from and to end gaps
- if (X[k][i]<ANY) //current state is M
- {
- if (I[k][i]) //next state is I
- q.tr[i][M2I]+=wi[k];
- else if (X[k][i+1]<=ANY) //next state is M
- q.tr[i][M2M]+=wi[k];
- else if (X[k][i+1]==GAP) //next state is D
- q.tr[i][M2D]+=wi[k];
- }
- } // end for(k)
+ {
+ if (!in[k]) continue;
+ //if input alignment is local ignore transitions from and to end gaps
+ if (X[k][i]<ANY) //current state is M
+ {
+ if (I[k][i]) //next state is I
+ q.tr[i][M2I]+=wi[k];
+ else if (X[k][i+1]<=ANY) //next state is M
+ q.tr[i][M2M]+=wi[k];
+ else if (X[k][i+1]==GAP) //next state is D
+ q.tr[i][M2D]+=wi[k];
+ }
+ } // end for(k)
// Normalize and take log
sum = q.tr[i][M2M]+q.tr[i][M2I]+q.tr[i][M2D]+FLT_MIN;
q.tr[i][M2M]=log2(q.tr[i][M2M]/sum);
@@ -1959,17 +2052,17 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
q.tr[i][M2D]=log2(q.tr[i][M2D]/sum);
// for (k=0; k<N_in; k++) if (in[k]) w[k][i]=wi[k];
- }
- // DD TODO:fill in all the missing Neff values
+ } // end loop through alignment columns i
+ // DD TODO:fill in all the missing Neff values
- // end loop through alignment columns i
//////////////////////////////////////////////////////////////////////////////////////////////
delete[](wi); wi=NULL;
// delete n[][]
for (j=1; j<=L; j++){
- delete[](n[j]); (n[j]) = NULL;
+ delete[](n[j]);
+ (n[j]) = NULL;
}
delete[](n); (n) = NULL;
@@ -1982,41 +2075,60 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
q.Neff_M[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
// Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
- for (a=0; a<20; a++) q.f[0][a]=q.f[L+1][a]=pb[a];
+ for (a=0; a<20; a++)
+ q.f[0][a]=q.f[L+1][a]=pb[a];
// Assign Neff_M[i] and calculate average over alignment, Neff_M[0]
if (par.wg==1)
- {
+ {
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(a)
+#endif
for (i=1; i<=L; i++)
- {
- float sum=0.0f;
- for (a=0; a<20; a++)
- if (q.f[i][a]>1E-10) sum -= q.f[i][a]*fast_log2(q.f[i][a]);
- q.Neff_HMM+=pow(2.0,sum);
- }
+ {
+ float sum=0.0f;
+ for (a=0; a<20; a++)
+ if (q.f[i][a]>1E-10)
+ sum -= q.f[i][a]*fast_log2(q.f[i][a]);
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ q.Neff_HMM+=pow(2.0,sum);
+ }
q.Neff_HMM/=L;
float Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
float scale=log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(k)
+#endif
for (i=1; i<=L; i++)
- {
- float w_M=-1.0/N_filtered;
- for (k=0; k<N_in; k++)
- if (in[k] && X[k][i]<=ANY) w_M+=wg[k];
- if (w_M<0) q.Neff_M[i]=1.0;
- else q.Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M);
- // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q.Neff_HMM,Nlim,w_M,q.Neff_M[i]);
- }
- }
+ {
+ float w_M=-1.0/N_filtered;
+ for (k=0; k<N_in; k++)
+ if (in[k] && X[k][i]<=ANY)
+ w_M+=wg[k];
+ if (w_M<0)
+ q.Neff_M[i]=1.0;
+ else
+ q.Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M);
+ // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q.Neff_HMM,Nlim,w_M,q.Neff_M[i]);
+ }
+ }
else
- {
+ {
for (i=1; i<=L; i++)
- {
- q.Neff_HMM+=Neff[i];
- q.Neff_M[i]=Neff[i];
- if (q.Neff_M[i] == 0) { q.Neff_M[i] = 1; } /* MR1 */
- }
+ {
+ q.Neff_HMM+=Neff[i];
+ q.Neff_M[i]=Neff[i];
+ if (q.Neff_M[i] == 0) {
+ q.Neff_M[i] = 1;
+ } /* MR1 */
+ }
q.Neff_HMM/=L;
- }
+ } // end par.wg==1
delete[] Neff; Neff = NULL;
@@ -2266,158 +2378,200 @@ Alignment::Transitions_from_D_state(HMM& q, char* in)
// Global weights?
if (par.wg==1)
{
- for (k=0; k<N_in; k++) wi[k]=wg[k];
+ for (k=0; k<N_in; k++)
+ wi[k]=wg[k];
Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
scale=log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with dels at specific pos
}
// Initialization
n = new(int*[L+2]);
- for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
- for (j=1; j<=L; j++)
- for (a=0; a<NAA+3; a++) n[j][a]=0;
-
-
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(a)
+#endif
+ for (j=1; j<=L; j++) {
+ n[j]=new(int[NAA+3]);
+ for (a=0; a<NAA+3; a++)
+ n[j][a]=0;
+ }
//////////////////////////////////////////////////////////////////////////////////////////////
// Main loop through alignment columns
for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
+ {
+ if (par.wg==0) // if local weights
{
- if (par.wg==0) // if local weights
+ change=0;
+ // Check all sequences k and update n[j][a] and ri[j] if necessary
+ for (k=0; k<N_in; k++)
+ {
+ if (!in[k]) continue;
+ if (X[k][i-1]!=GAP && X[k][i]==GAP)
+ { // ... if sequence k was NOT included in i-1 and has to be included for column i
+ change=1;
+ nseqi++;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+ for (int j=1; j<=L; j++)
+ n[j][ (int)X[k][j]]++;
+ }
+ else if (X[k][i-1]==GAP && X[k][i]!=GAP)
+ { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
+ change=1;
+ nseqi--;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+ for (int j=1; j<=L; j++)
+ n[j][ (int)X[k][j]]--;
+ }
+ } //end for (k)
+ nseqs[i]=nseqi;
+
+ // If there is no sequence in subalignment j ...
+ if (nseqi==0)
+ {
+ ncol=0;
+ Neff[i]=0.0; // effective number of sequences = 0!
+ q.tr[i][D2M]=-100000;
+ q.tr[i][D2D]=-100000;
+ continue;
+ }
+
+ // If subalignment changed: update weights wi[k] and Neff[i]
+ if (change)
+ {
+ // Initialize weights and numbers of residues for subalignment i
+ ncol=0;
+ for (k=0; k<N_in; k++)
+ wi[k]=0.0;
+
+ // sum wg[k][i] over all columns j and sequences k of subalignment
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(naa, a, k)
+#endif
+ for (j=1; j<=L; j++)
{
- change=0;
- // Check all sequences k and update n[j][a] and ri[j] if necessary
+ if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
+ naa=0;
+ for (a=0; a<20; a++)
+ if(n[j][a])
+ naa++;
+ if (naa==0) continue;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ ncol++;
for (k=0; k<N_in; k++)
+ {
+ if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
{
- if (!in[k]) continue;
- if (X[k][i-1]!=GAP && X[k][i]==GAP)
- { // ... if sequence k was NOT included in i-1 and has to be included for column i
- change=1;
- nseqi++;
- for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
- }
- else if (X[k][i-1]==GAP && X[k][i]!=GAP)
- { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
- change=1;
- nseqi--;
- for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
- }
- } //end for (k)
- nseqs[i]=nseqi;
-
- // If there is no sequence in subalignment j ...
- if (nseqi==0)
- {
- ncol=0;
- Neff[i]=0.0; // effective number of sequences = 0!
- q.tr[i][D2M]=-100000;
- q.tr[i][D2D]=-100000;
- continue;
+ if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Di=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
}
+ }
+ } // end for (j)
+ // Check whether number of columns in subalignment is sufficient
+ if (ncol<NCOLMIN)
+ // Take global weights
+ for (k=0; k<N_in; k++)
+ if(in[k] && X[k][i]==GAP)
+ wi[k]=wg[k];
+ else
+ wi[k]=0.0;
+
+ // Calculate Neff[i]
+ Neff[i]=0.0;
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp parallel for schedule(static) private(a, k, fj)
+#endif
+ for (j=1; j<=L; j++)
+ {
+ if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
+ for (a=0; a<20; a++) fj[a]=0;
+ for (k=0; k<N_in; k++)
+ if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
+ fj[ (int)X[k][j] ]+=wi[k];
+ NormalizeTo1(fj,NAA);
+ for (a=0; a<20; a++)
+ if (fj[a]>1E-10)
+//#if 0
+#ifdef HAVE_OPENMP
+#pragma omp atomic
+#endif
+ Neff[i]-=fj[a]*fast_log2(fj[a]);
+ }
+ if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
- // If subalignment changed: update weights wi[k] and Neff[i]
- if (change)
- {
- // Initialize weights and numbers of residues for subalignment i
- ncol=0;
- for (k=0; k<N_in; k++) wi[k]=0.0;
-
- // sum wg[k][i] over all columns j and sequences k of subalignment
- for (j=1; j<=L; j++)
- {
- if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
- naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
- if (naa==0) continue;
- ncol++;
- for (k=0; k<N_in; k++)
- {
- if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
- {
- if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Di=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
- wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
- }
- }
- }
-
- // Check whether number of columns in subalignment is sufficient
- if (ncol<NCOLMIN)
- // Take global weights
- for (k=0; k<N_in; k++)
- if(in[k] && X[k][i]==GAP) wi[k]=wg[k]; else wi[k]=0.0;
-
- // Calculate Neff[i]
- Neff[i]=0.0;
- for (j=1; j<=L; j++)
- {
- if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
- for (a=0; a<20; a++) fj[a]=0;
- for (k=0; k<N_in; k++)
- if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
- fj[ (int)X[k][j] ]+=wi[k];
- NormalizeTo1(fj,NAA);
- for (a=0; a<20; a++)
- if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
- }
- if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
-
- }
+ }
- else //no update was necessary; copy values for i-1
- {
- Neff[i]=Neff[i-1];
- }
+ else //no update was necessary; copy values for i-1
+ {
+ Neff[i]=Neff[i-1];
+ }
- // Calculate transition probabilities from D state
- q.tr[i][D2M]=q.tr[i][D2D]=0.0;
- for (k=0; k<N_in; k++) //for all sequences
- {
- if (in[k] && X[k][i]==GAP) //current state is D
- {
- if (X[k][i+1]==GAP) //next state is D
- q.tr[i][D2D]+=wi[k];
- else if (X[k][i+1]<=ANY) //next state is M
- q.tr[i][D2M]+=wi[k];
- }
- } // end for(k)
+ // Calculate transition probabilities from D state
+ q.tr[i][D2M]=q.tr[i][D2D]=0.0;
+ for (k=0; k<N_in; k++) //for all sequences
+ {
+ if (in[k] && X[k][i]==GAP) //current state is D
+ {
+ if (X[k][i+1]==GAP) //next state is D
+ q.tr[i][D2D]+=wi[k];
+ else if (X[k][i+1]<=ANY) //next state is M
+ q.tr[i][D2M]+=wi[k];
}
+ } // end for(k)
+ }
- else // fast global weights?
+ else // fast global weights?
+ {
+ float w_D=-1.0/N_filtered;
+ ncol=0;
+ q.tr[i][D2M]=q.tr[i][D2D]=0.0;
+ // Calculate amino acid frequencies fj[a] from weights wg[k]
+ for (k=0; k<N_in; k++) //for all sequences
+ if (in[k] && X[k][i]==GAP) //current state is D
{
- float w_D=-1.0/N_filtered;
- ncol=0;
- q.tr[i][D2M]=q.tr[i][D2D]=0.0;
- // Calculate amino acid frequencies fj[a] from weights wg[k]
- for (k=0; k<N_in; k++) //for all sequences
- if (in[k] && X[k][i]==GAP) //current state is D
- {
- ncol++;
- w_D+=wg[k];
- if (X[k][i+1]==GAP) //next state is D
- q.tr[i][D2D]+=wi[k];
- else if (X[k][i+1]<=ANY) //next state is M
- q.tr[i][D2M]+=wi[k];
- }
- if (ncol>0)
- {
- if (w_D<0) Neff[i]=1.0;
- else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D);
- // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_D,Neff[i]);
- }
- else
- {
- Neff[i]=0.0; // effective number of sequences = 0!
- q.tr[i][D2M]=-100000;
- q.tr[i][D2D]=-100000;
- continue;
- }
+ ncol++;
+ w_D+=wg[k];
+ if (X[k][i+1]==GAP) //next state is D
+ q.tr[i][D2D]+=wi[k];
+ else if (X[k][i+1]<=ANY) //next state is M
+ q.tr[i][D2M]+=wi[k];
}
+ if (ncol>0)
+ {
+ if (w_D<0) Neff[i]=1.0;
+ else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D);
+ // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_D,Neff[i]);
+ }
+ else
+ {
+ Neff[i]=0.0; // effective number of sequences = 0!
+ q.tr[i][D2M]=-100000;
+ q.tr[i][D2D]=-100000;
+ continue;
+ }
+ }
- // Normalize and take log
- sum = q.tr[i][D2M]+q.tr[i][D2D];
- q.tr[i][D2M]=log2(q.tr[i][D2M]/sum);
- q.tr[i][D2D]=log2(q.tr[i][D2D]/sum);
+ // Normalize and take log
+ sum = q.tr[i][D2M]+q.tr[i][D2D];
+ q.tr[i][D2M]=log2(q.tr[i][D2M]/sum);
+ q.tr[i][D2D]=log2(q.tr[i][D2D]/sum);
- }
+ }
// end loop through alignment columns i
//////////////////////////////////////////////////////////////////////////////////////////////
@@ -2426,15 +2580,14 @@ Alignment::Transitions_from_D_state(HMM& q, char* in)
q.Neff_D[0]=99.999;
// Assign Neff_D[i]
- for (i=1; i<=L; i++)
+ for (i=1; i<=L; i++) {
q.Neff_D[i]=Neff[i];
-
- delete[](wi); wi = NULL;/* FIXME: FS */
- // delete n[][]
- for (j=1; j<=L; j++){
- delete[](n[j]); (n[j]) = NULL;
+ delete[](n[i]);
+ (n[i]) = NULL;
}
+
delete[](n); (n) = NULL;
+ delete[](wi); wi = NULL;/* FIXME: FS */
delete[] Neff; Neff = NULL;
return;
diff --git a/src/hhalign/hhfullalignment-C.h b/src/hhalign/hhfullalignment-C.h
index 5fa477d..9140c12 100644
--- a/src/hhalign/hhfullalignment-C.h
+++ b/src/hhalign/hhfullalignment-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhfullalignment-C.h 284 2013-06-12 10:10:11Z fabian $
+ * RCS $Id: hhfullalignment-C.h 309 2016-06-13 14:10:11Z fabian $
*/
// hhfullalignment.C
@@ -48,6 +48,7 @@ using std::endl;
#include "hhhit.h"
#include "hhhalfalignment.h"
#endif
+//#include "new_new.h" /* memory tracking */
diff --git a/src/hhalign/hhfunc-C.h b/src/hhalign/hhfunc-C.h
index cb41ef2..8b4152b 100644
--- a/src/hhalign/hhfunc-C.h
+++ b/src/hhalign/hhfunc-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhfunc-C.h 290 2013-09-20 15:18:12Z fabian $
+ * RCS $Id: hhfunc-C.h 309 2016-06-13 14:10:11Z fabian $
*/
/*
@@ -29,6 +29,7 @@
// hhfunc.C
+//#include "new_new.h" /* memory tracking */
/**
* AddBackgroundFrequencies()
@@ -218,7 +219,7 @@ AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoT
*
* @param[in] iRnPtype
* type of read/prepare
- * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};
+ * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM, INTERN_HMM_2_HMM};
* @param[in] ppcProf
* alignment
* @param[in] iCnt
@@ -1141,7 +1142,7 @@ readHMMWrapper(hmm_light *rHMM_p,
} /* (0 <= iA < AMINOACIDS) */
rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax);
} /* (0 <= i < rHMM_p->L) */
-
+ rHMM_p->seq[rHMM_p->ncons][i] = '\0'; /* FS, r291 -> */
} /* ncons not set */
diff --git a/src/hhalign/hhhalfalignment-C.h b/src/hhalign/hhhalfalignment-C.h
index 25c4e3c..53f1d97 100644
--- a/src/hhalign/hhhalfalignment-C.h
+++ b/src/hhalign/hhhalfalignment-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhhalfalignment-C.h 227 2011-03-28 17:03:09Z fabian $
+ * RCS $Id: hhhalfalignment-C.h 309 2016-06-13 14:10:11Z fabian $
*/
// hhfullalignment.C
@@ -47,6 +47,7 @@ using std::endl;
#include "hhalignment.h" // class Alignment
#include "hhhit.h"
#endif
+//#include "new_new.h" /* memory tracking */
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
diff --git a/src/hhalign/hhhit-C.h b/src/hhalign/hhhit-C.h
index 981ad18..e047946 100644
--- a/src/hhalign/hhhit-C.h
+++ b/src/hhalign/hhhit-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhhit-C.h 284 2013-06-12 10:10:11Z fabian $
+ * RCS $Id: hhhit-C.h 308 2016-06-13 14:07:35Z fabian $
*/
// hhhit.C
@@ -47,6 +47,7 @@ using std::ofstream;
#include "hhalignment.h" // class Alignment
#include "hhhitlist.h" // class HitList
#endif
+//#include "new_new.h" /* memory tracking */
#define CALCULATE_MAX6(max, var1, var2, var3, var4, var5, var6, varb) \
if (var1>var2) { max=var1; varb=STOP;} \
@@ -159,10 +160,10 @@ Hit::Delete()
delete[] dbfile; dbfile = NULL;
for (int k=0; k<n_display; k++)
{
- //delete[] sname[k]; sname[k] = NULL;
+ delete[] sname[k]; sname[k] = NULL; /* this seems to be the long lost piece of memory, FS, 2016-06-09 */
delete[] seq[k]; seq[k] = NULL;
}
- //delete[] sname; sname = NULL;
+ delete[] sname; sname = NULL; /* this seems to be the long lost piece of memory, FS, 2016-06-09 */
delete[] seq; seq = NULL;
}
}
diff --git a/src/hhalign/hhhitlist-C.h b/src/hhalign/hhhitlist-C.h
index ec2a5ef..95777fc 100644
--- a/src/hhalign/hhhitlist-C.h
+++ b/src/hhalign/hhhitlist-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhhitlist-C.h 284 2013-06-12 10:10:11Z fabian $
+ * RCS $Id: hhhitlist-C.h 307 2016-06-13 14:04:39Z fabian $
*/
// hhhitlist.C
@@ -49,6 +49,7 @@ using std::endl;
#include "hhhalfalignment.h"
#include "hhfullalignment.h"
#endif
+//#include "new_new.h" /* memory tracking */
//////////////////////////////////////////////////////////////////////////////
@@ -3191,7 +3192,10 @@ HitList::ClobberGlobal(void){
}
// printf("POINTER:\t\t\t%p=TAIL\n", tail);
-
+ if ( (NULL != current) && (head != current) ){
+ delete current; /* this seems to be the long lost piece of memory, FS, 2016-04-15 */
+ current = NULL;
+ }
head->next = tail;
tail->prev = head;
size = 0;
diff --git a/src/hhalign/hhhmm-C.h b/src/hhalign/hhhmm-C.h
index 84760a3..ce4d7db 100644
--- a/src/hhalign/hhhmm-C.h
+++ b/src/hhalign/hhhmm-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: hhhmm-C.h 284 2013-06-12 10:10:11Z fabian $
+ * RCS $Id: hhhmm-C.h 309 2016-06-13 14:10:11Z fabian $
*/
@@ -45,6 +45,7 @@ using std::ofstream;
#include "hhdecl-C.h"
#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
#endif
+//#include "new_new.h" /* memory tracking */
// #ifndef WNLIB
// #define WNLIB
diff --git a/src/hhalign/util-C.h b/src/hhalign/util-C.h
index b422f35..224ce17 100644
--- a/src/hhalign/util-C.h
+++ b/src/hhalign/util-C.h
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: util-C.h 155 2010-11-17 12:18:47Z fabian $
+ * RCS $Id: util-C.h 309 2016-06-13 14:10:11Z fabian $
*/
// Utility subroutines
@@ -29,6 +29,7 @@
#include <time.h> // clock
#endif
#include <sys/time.h>
+//#include "new_new.h" /* memory tracking */
/////////////////////////////////////////////////////////////////////////////////////
// Arithmetics
diff --git a/src/mymain.c b/src/mymain.c
index c2dba54..16806e5 100644
--- a/src/mymain.c
+++ b/src/mymain.c
@@ -15,7 +15,7 @@
********************************************************************/
/*
- * RCS $Id: mymain.c 290 2013-09-20 15:18:12Z fabian $
+ * RCS $Id: mymain.c 296 2014-10-07 12:15:41Z fabian $
*/
#ifdef HAVE_CONFIG_H
@@ -140,6 +140,7 @@ SetDefaultUserOpts(cmdline_opts_t *opts)
opts->bIsProfile = FALSE;
opts->aln_opts.bUseKimura = FALSE;
opts->aln_opts.bPercID = FALSE;
+ opts->aln_opts.pcHMMBatch = NULL;
opts->iMaxNumSeq = INT_MAX;
opts->iMaxSeqLen = INT_MAX;
@@ -358,6 +359,8 @@ ParseCommandLine(cmdline_opts_t *user_opts, int argc, char **argv)
struct arg_file *opt_hmm_in = arg_filen(NULL, "hmm-in", "<file>",
/*min*/ 0, /*max*/ 128,
"HMM input files");
+ struct arg_file *opt_hmm_batch = arg_file0(NULL, "hmm-batch", "<file>", /* FS, r291 -> */
+ "specify HMMs for individual sequences");
struct arg_lit *opt_dealign = arg_lit0(NULL, "dealign",
"Dealign input sequences");
struct arg_file *opt_profile1 = arg_file0(NULL, "profile1,p1",
@@ -491,6 +494,7 @@ ParseCommandLine(cmdline_opts_t *user_opts, int argc, char **argv)
void *argtable[] = {rem_seq_input,
opt_seqin,
opt_hmm_in,
+ opt_hmm_batch, /* FS, r291 -> */
opt_dealign,
opt_profile1,
opt_profile2,
@@ -785,6 +789,16 @@ ParseCommandLine(cmdline_opts_t *user_opts, int argc, char **argv)
}
+ /* HMM Batch, FS, r291 ->
+ */
+ user_opts->aln_opts.pcHMMBatch = NULL;
+ if (opt_hmm_batch->count>0){
+ user_opts->aln_opts.pcHMMBatch = CkStrdup(opt_hmm_batch->filename[0]);
+ if (! FileExists(user_opts->aln_opts.pcHMMBatch)){
+ Log(&rLog, LOG_FATAL, "File '%s' does not exist.", user_opts->aln_opts.pcHMMBatch);
+ }
+ }
+
/* Pair distance method
*/
if (opt_pairdist->count > 0) {
@@ -1058,7 +1072,7 @@ MyMain(int argc, char **argv)
if (ReadSequences(prMSeq, cmdline_opts.pcSeqInfile,
cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat,
cmdline_opts.bIsProfile, cmdline_opts.bDealignInputSeqs,
- cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) {
+ cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen, cmdline_opts.aln_opts.pcHMMBatch)) {
Log(&rLog, LOG_FATAL, "Reading sequence file '%s' failed", cmdline_opts.pcSeqInfile);
}
#if TRACE
@@ -1106,7 +1120,7 @@ MyMain(int argc, char **argv)
if (ReadSequences(prMSeqProfile1, cmdline_opts.pcProfile1Infile,
cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat,
cmdline_opts.bIsProfile, cmdline_opts.bDealignInputSeqs,
- cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) {
+ cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen, cmdline_opts.aln_opts.pcHMMBatch)) {
Log(&rLog, LOG_FATAL, "Reading sequences from profile file '%s' failed",
cmdline_opts.pcProfile1Infile);
}
@@ -1132,7 +1146,7 @@ MyMain(int argc, char **argv)
if (ReadSequences(prMSeqProfile2, cmdline_opts.pcProfile2Infile,
cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat,
cmdline_opts.bIsProfile, cmdline_opts.bDealignInputSeqs,
- cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) {
+ cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen, cmdline_opts.aln_opts.pcHMMBatch)) {
Log(&rLog, LOG_FATAL, "Reading sequences from profile file '%s' failed",
cmdline_opts.pcProfile2Infile);
}
diff --git a/src/squid/a2m.c b/src/squid/a2m.c
index 28fe3a3..b5032ff 100644
--- a/src/squid/a2m.c
+++ b/src/squid/a2m.c
@@ -11,7 +11,7 @@
*
* reading/writing A2M (aligned FASTA) files.
*
- * RCS $Id: a2m.c 290 2013-09-20 15:18:12Z fabian $ (Original squid RCS Id: a2m.c,v 1.1 1999/07/15 22:26:40 eddy Exp)
+ * RCS $Id: a2m.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: a2m.c,v 1.1 1999/07/15 22:26:40 eddy Exp)
*/
#include <stdio.h>
@@ -148,4 +148,9 @@ WriteA2M(FILE *fp, MSA *msa)
fprintf(fp, "\n");*/
}
-}
+
+#ifdef CLUSTALO
+ free(buf); buf = NULL;
+#endif
+
+} /* this is the end of WriteA2M() */
diff --git a/src/squid/msa.c b/src/squid/msa.c
index 20200dd..7c19858 100644
--- a/src/squid/msa.c
+++ b/src/squid/msa.c
@@ -13,7 +13,7 @@
* SQUID's interface for multiple sequence alignment
* manipulation: access to the MSA object.
*
- * RCS $Id: msa.c 291 2014-02-27 18:20:54Z fabian $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp)
+ * RCS $Id: msa.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp)
*/
#include <stdio.h>
@@ -98,6 +98,8 @@ MSAAlloc(int nseq, int alen)
msa->sslen = NULL;
msa->sa = NULL;
msa->salen = NULL;
+ msa->co = NULL;
+ msa->colen = NULL;
msa->index = GKIInit();
msa->lastidx = 0;
@@ -253,6 +255,7 @@ MSAFree(MSA *msa)
Free2DArray((void **) msa->sqdesc, msa->nseq);
Free2DArray((void **) msa->ss, msa->nseq);
Free2DArray((void **) msa->sa, msa->nseq);
+ Free2DArray((void **) msa->co, msa->nseq);
if (msa->sqlen != NULL) free(msa->sqlen);
if (msa->wgt != NULL) free(msa->wgt);
@@ -663,7 +666,7 @@ MSAVerifyParse(MSA *msa)
if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
msa->name != NULL ? msa->name : "");
- msa->alen = msa->sqlen[0];
+ msa->alen = msa->sqlen[0];
/* We can rely on msa->sqname[] being valid for any index,
* because of the way the line parsers always store any name
@@ -681,7 +684,7 @@ MSAVerifyParse(MSA *msa)
msa->sqname[idx],
msa->name != NULL ? msa->name : "");
/* all aseq must be same length. */
- if (msa->sqlen[idx] != msa->alen)
+ if (msa->sqlen[idx] != msa->alen)
Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
msa->sqname[idx], msa->sqlen[idx], msa->alen,
msa->name != NULL ? msa->name : "");
@@ -698,13 +701,13 @@ MSAVerifyParse(MSA *msa)
}
/* if cons SS is present, must have length right */
- if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen)
+ if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen) /* FIXME */
Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
strlen(msa->ss_cons), msa->alen,
msa->name != NULL ? msa->name : "");
/* if cons SA is present, must have length right */
- if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen)
+ if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen) /* FIXME */
Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
strlen(msa->sa_cons), msa->alen,
msa->name != NULL ? msa->name : "");
@@ -728,6 +731,70 @@ MSAVerifyParse(MSA *msa)
}
+/* @* MSAVerifyParseDub */
+void
+MSAVerifyParseDub(MSA *msa)
+{
+ int idx;
+
+ if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
+ msa->name != NULL ? msa->name : "");
+
+ msa->alen = msa->sqlen[0];
+
+ /* We can rely on msa->sqname[] being valid for any index,
+ * because of the way the line parsers always store any name
+ * they add to the index.
+ */
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ /* aseq is required. */
+ if (msa->aseq[idx] == NULL)
+ Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
+ msa->name != NULL ? msa->name : "");
+ /* either all weights must be set, or none of them */
+ if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
+ Die("Parse error: some weights are set, but %s doesn't have one in alignment %s",
+ msa->sqname[idx],
+ msa->name != NULL ? msa->name : "");
+ /* this is Dublin format, aseq don't have to be same length. */
+ if (msa->sqlen[idx] > msa->alen) {
+ msa->alen = msa->sqlen[idx];
+ }
+ /* if SS is present, must have length right */
+ if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->sqlen[idx])
+ Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
+ msa->sqname[idx], msa->sslen[idx], msa->sqlen[idx],
+ msa->name != NULL ? msa->name : "");
+ /* if SA is present, must have length right */
+ if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->sqlen[idx])
+ Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
+ msa->sqname[idx], msa->salen[idx], msa->sqlen[idx],
+ msa->name != NULL ? msa->name : "");
+
+ /* if CO is present, must have length right */
+ if (msa->co != NULL && msa->co[idx] != NULL && msa->colen[idx] != msa->sqlen[idx])
+ Die("Parse error: #=GR CO annotation for %s: length %d, expected %d in alignment %s",
+ msa->sqname[idx], msa->colen[idx], msa->sqlen[idx],
+ msa->name != NULL ? msa->name : "");
+ }
+
+
+
+
+ /* Check that all or no weights are set */
+ if (!(msa->flags & MSA_SET_WGT))
+ FSet(msa->wgt, msa->nseq, 1.0); /* default weights */
+
+ /* Clean up a little from the parser */
+ if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
+ if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
+ if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }
+ if (msa->colen != NULL) { free(msa->colen); msa->colen = NULL; }
+
+ return;
+} /* this is the end of MSAVerifyParseDub() */
+
/* Function: MSAFileOpen()
@@ -919,6 +986,7 @@ MSAFileRead(MSAFILE *afp)
case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break;
case MSAFILE_SELEX: msa = ReadSELEX(afp); break;
case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break;
+ case MSAFILE_DUBLIN: msa = ReadDublin(afp); break; /* -> r296, FS */
default:
Die("MSAFILE corrupted: bad format index");
}
diff --git a/src/squid/msa.h b/src/squid/msa.h
index dec244b..ef0854d 100644
--- a/src/squid/msa.h
+++ b/src/squid/msa.h
@@ -16,7 +16,7 @@
* Header file for SQUID's multiple sequence alignment
* manipulation code.
*
- * RCS $Id: msa.h 291 2014-02-27 18:20:54Z fabian $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp)
+ * RCS $Id: msa.h 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp)
*/
#include <stdio.h> /* FILE support */
@@ -130,6 +130,7 @@ typedef struct msa_struct {
char **sqacc; /* accession numbers for individual sequences */
char **sqdesc; /* description lines for individual sequences */
char **ss; /* per-seq secondary structure annotation, or NULL */
+ char **co; /* per-seq confidence of secondary structure annotation, or NULL, -> r296, FS */
char **sa; /* per-seq surface accessibility annotation, or NULL */
float cutoff[MSA_MAXCUTOFFS]; /* NC, TC, GA cutoffs propagated to Pfam/Rfam */
int cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */
@@ -171,6 +172,7 @@ typedef struct msa_struct {
int *sqlen; /* individual sequence lengths during parsing */
int *sslen; /* individual ss lengths during parsing */
int *salen; /* individual sa lengths during parsing */
+ int *colen; /* individual co lengths during parsing */
int lastidx; /* last index we saw; use for guessing next */
} MSA;
#define MSA_SET_WGT (1 << 0) /* track whether wgts were set, or left at default 1.0 */
@@ -214,6 +216,7 @@ typedef struct msafile_struct {
#define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */
#ifdef CLUSTALO
#define MSAFILE_VIENNA 108 /* Vienna: concatenated fasta */
+#define MSAFILE_DUBLIN 109 /* Dublin: modified Stockholm format */
#endif
#define IsAlignmentFormat(fmt) ((fmt) > 100)
@@ -248,6 +251,7 @@ extern void MSAAppendGC(MSA *msa, char *tag, char *value);
extern char *MSAGetGC(MSA *msa, char *tag);
extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value);
extern void MSAVerifyParse(MSA *msa);
+extern void MSAVerifyParseDub(MSA *msa);
extern int MSAGetSeqidx(MSA *msa, char *name, int guess);
extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo);
@@ -304,6 +308,7 @@ extern void WriteSELEXOneBlock(FILE *fp, MSA *msa);
/* from stockholm.c
*/
+extern MSA *ReadDublin(MSAFILE *afp);
extern MSA *ReadStockholm(MSAFILE *afp);
extern void WriteStockholm(FILE *fp, MSA *msa);
extern void WriteStockholmOneBlock(FILE *fp, MSA *msa);
diff --git a/src/squid/sqio.c b/src/squid/sqio.c
index aa17744..dfd403a 100644
--- a/src/squid/sqio.c
+++ b/src/squid/sqio.c
@@ -336,7 +336,12 @@ FreeSequence(char *seq, SQINFO *sqinfo)
free(sqinfo->sa);
}
}
-}
+ if (sqinfo->flags & SQINFO_CO){
+ if (NULL != sqinfo->co){ /* FS, r296 -> */
+ free(sqinfo->co);
+ }
+ }
+} /* this is the end of FreeSequence() */
int
SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
@@ -438,6 +443,7 @@ SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type;
if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss);
if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);
+ if (sq2->flags & SQINFO_CO) sq1->co = Strdup(sq2->co);
}
/* Function: ToDNA()
@@ -1124,6 +1130,16 @@ ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
#endif
sqinfo->flags |= SQINFO_SA;
}
+ if (V->msa->co != NULL && V->msa->co[V->msa->lastidx] != NULL) {
+/* AW: stopping squid from dealigning sequences and corresponding info */
+#ifdef CLUSTALO
+ sqinfo->co = sre_strdup(V->msa->co[V->msa->lastidx], V->msa->alen);
+#else
+ MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
+ V->msa->co[V->msa->lastidx], &(sqinfo->co));
+#endif
+ sqinfo->flags |= SQINFO_CO;
+ } /* co */
V->msa->lastidx++;
}
else {
@@ -1236,6 +1252,9 @@ SeqfileFormat(FILE *fp)
strncmp(buf, "!!NA_SEQUENCE", 13) == 0)
{ fmt = SQFILE_GCG; goto DONE; }
+ if (strncmp(buf, "# DUBLIN 1.", 11) == 0) /* -> r296, FS */
+ { fmt = MSAFILE_DUBLIN; goto DONE; }
+
if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)
{ fmt = MSAFILE_STOCKHOLM; goto DONE; }
diff --git a/src/squid/squid.h b/src/squid/squid.h
index f86bb5c..a987bfb 100644
--- a/src/squid/squid.h
+++ b/src/squid/squid.h
@@ -147,6 +147,7 @@ struct seqinfo_s {
int type; /* kRNA, kDNA, kAmino, or kOther */
char *ss; /* 0..len-1 secondary structure string */
char *sa; /* 0..len-1 % side chain surface access. */
+ char *co; /* 0..len-1 secondary struct confidence */
};
typedef struct seqinfo_s SQINFO;
@@ -161,6 +162,7 @@ typedef struct seqinfo_s SQINFO;
#define SQINFO_OLEN (1 << 8)
#define SQINFO_SS (1 << 9)
#define SQINFO_SA (1 << 10)
+#define SQINFO_CO (1 << 11)
/****************************************************
@@ -244,6 +246,7 @@ extern int aa_index[]; /* convert 0..19 indices to 0..26 */
/* 17 was Clustal. Now alignment format*/
#ifdef CLUSTALO
#define SQFILE_VIENNA 18 /* Vienna format: concatenated fasta */
+#define SQFILE_DUBLIN 19 /* unaligned version of Stockholm */
#endif
#define IsUnalignedFormat(fmt) ((fmt) && (fmt) < 100)
diff --git a/src/squid/stockholm.c b/src/squid/stockholm.c
index b5f0cb4..9e7aee5 100644
--- a/src/squid/stockholm.c
+++ b/src/squid/stockholm.c
@@ -24,7 +24,7 @@
* MSAFree(msa);
* }
*
- * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
+ * RCS $Id: stockholm.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
*/
#include <stdio.h>
#include <string.h>
@@ -157,6 +157,89 @@ alignment file, right?), please either:\n\
return msa;
}
+/* Function: ReadDublin()
+ * Date: 2014-10-15
+ *
+ * Purpose: Parse the next sequences from an open Dublin
+ * format file. Return the sequences, or
+ * NULL if there are no more sequences in the file.
+ *
+ * Args: afp - open alignment file
+ *
+ * Returns: MSA * - an alignment object.
+ * caller responsible for an MSAFree()
+ * NULL if no more alignments
+ *
+ * Diagnostics:
+ * Will Die() here with a (potentially) useful message
+ * if a parsing error occurs
+ */
+MSA *
+ReadDublin(MSAFILE *afp)
+{
+ MSA *msa;
+ char *s;
+ int status;
+
+ if (feof(afp->f)) return NULL;
+
+ /* Initialize allocation of the MSA.
+ */
+ msa = MSAAlloc(10, 0);
+
+ /* Check the magic Dublin header line.
+ * We have to skip blank lines here, else we perceive
+ * trailing blank lines in a file as a format error when
+ * reading in multi-record mode.
+ */
+ do {
+ if ((s = MSAFileGetLine(afp)) == NULL) {
+ MSAFree(msa);
+ return NULL;
+ }
+ } while (IsBlankline(s));
+
+ if (strncmp(s, "# DUBLIN 1.", 11) != 0)
+ Die("\
+File %s doesn't appear to be in Dublin format.\n",
+ afp->fname);
+
+ /* Read the alignment file one line at a time.
+ */
+ while ((s = MSAFileGetLine(afp)) != NULL)
+ {
+ while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
+
+ if (*s == '#') {
+ if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
+ else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
+ else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
+ else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
+ else status = parse_comment(msa, s);
+ }
+ else if (strncmp(s, "//", 2) == 0) break;
+ else if (*s == '\n') continue;
+ else status = parse_sequence(msa, s);
+
+ if (status == 0)
+ Die("Dublin format parse error: line %d of file %s while reading alignment %s",
+ afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
+ }
+
+ if (s == NULL && msa->nseq != 0)
+ Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
+
+ if (s == NULL && msa->nseq == 0) {
+ /* probably just some junk at end of file */
+ MSAFree(msa);
+ return NULL;
+ }
+
+ MSAVerifyParseDub(msa);
+ return msa;
+
+} /* this is the end of ReadDublin() */
+
/* Function: WriteStockholm()
* Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
@@ -579,6 +662,20 @@ parse_gr(MSA *msa, char *buf)
}
msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
}
+ else if (strcmp(featurename, "CO") == 0)
+ {
+ if (msa->co == NULL)
+ {
+ msa->co = MallocOrDie(sizeof(char *) * msa->nseqalloc);
+ msa->colen = MallocOrDie(sizeof(int) * msa->nseqalloc);
+ for (j = 0; j < msa->nseqalloc; j++)
+ {
+ msa->co[j] = NULL;
+ msa->colen[j] = 0;
+ }
+ }
+ msa->colen[seqidx] = sre_strcat(&(msa->co[seqidx]), msa->colen[seqidx], text, len);
+ }
else
MSAAppendGR(msa, featurename, seqidx, text);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/clustalo.git
More information about the debian-med-commit
mailing list