[med-svn] r127 - trunk/packages/poa/branches/upstream/current
Charles Plessy
charles-guest at costa.debian.org
Tue Sep 26 04:34:36 UTC 2006
Author: charles-guest
Date: 2006-09-26 04:34:02 +0000 (Tue, 26 Sep 2006)
New Revision: 127
Added:
trunk/packages/poa/branches/upstream/current/align_score.h
trunk/packages/poa/branches/upstream/current/blosum80_trunc.mat
trunk/packages/poa/branches/upstream/current/make_pscores.pl
trunk/packages/poa/branches/upstream/current/msa_format.c
trunk/packages/poa/branches/upstream/current/msa_format.h
trunk/packages/poa/branches/upstream/current/multidom.pscore
Modified:
trunk/packages/poa/branches/upstream/current/Makefile
trunk/packages/poa/branches/upstream/current/README
trunk/packages/poa/branches/upstream/current/align_lpo.c
trunk/packages/poa/branches/upstream/current/align_lpo2.c
trunk/packages/poa/branches/upstream/current/align_lpo_po.c
trunk/packages/poa/branches/upstream/current/align_lpo_po2.c
trunk/packages/poa/branches/upstream/current/align_score.c
trunk/packages/poa/branches/upstream/current/buildup_lpo.c
trunk/packages/poa/branches/upstream/current/lpo.c
trunk/packages/poa/branches/upstream/current/lpo.h
trunk/packages/poa/branches/upstream/current/lpo_format.c
trunk/packages/poa/branches/upstream/current/main.c
trunk/packages/poa/branches/upstream/current/seq_util.c
trunk/packages/poa/branches/upstream/current/seq_util.h
Log:
Load /tmp/tmp.N8MrAD/poa-2.0 into
trunk/packages/poa/branches/upstream/current.
Modified: trunk/packages/poa/branches/upstream/current/Makefile
===================================================================
--- trunk/packages/poa/branches/upstream/current/Makefile 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/Makefile 2006-09-26 04:34:02 UTC (rev 127)
@@ -13,6 +13,7 @@
black_flag.o \
seq_util.o \
fasta_format.o \
+ msa_format.o \
align_lpo2.o \
align_lpo_po2.o \
buildup_lpo.o \
@@ -27,7 +28,7 @@
CC = gcc
#CFLAGS= -g -ansi-strict -W -Wall -DUSE_WEIGHTED_LINKS -DUSE_PROJECT_HEADER -I.
-CFLAGS= -g -DUSE_WEIGHTED_LINKS -DUSE_PROJECT_HEADER -I. -DTRUNCATE_GAP_LENGTH=8
+CFLAGS= -g -DUSE_WEIGHTED_LINKS -DUSE_PROJECT_HEADER -I.
# -I$(HOME)/lib/include
# -DREPORT_MAX_ALLOC
@@ -48,5 +49,3 @@
what:
@echo poa: partial-order based sequence alignment program
@echo liblpo.a: partial-order alignment and utilities function library
-
-
Modified: trunk/packages/poa/branches/upstream/current/README
===================================================================
--- trunk/packages/poa/branches/upstream/current/README 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/README 2006-09-26 04:34:02 UTC (rev 127)
@@ -1,19 +1,25 @@
-POA installation notes Sept. 2001
+
+-- POA INSTALLATION NOTES --
+September 2001, updated March 2004.
+
Chris Lee
Dept. of Chemistry & Biochemistry
UCLA
+
I. COMPILATION
+
To compile this program, simply type 'make poa'.
+This produces an executable for sequence alignment (poa) and also a
+linkable library liblpo.a. The software has been compiled and tested
+on LINUX and Mac OS X.
-This produces an executable for sequence alignment (poa) and also a linkable
-library liblpo.a. The software has been compiled and tested on LINUX.
II. RUNNING POA
-Poa has a variety of command line options. Running poa without any arguments
-will print a list of the possible command line arguments. POA may be used to
-construct a PO-MSA, or to analyze a PO-MSA.
+POA has a variety of command line options. Running POA without any
+arguments will print a list of the possible command line arguments.
+POA may be used to construct a PO-MSA, or to analyze a PO-MSA.
A. Constructing a PO-MSA
-------------------------
@@ -21,62 +27,116 @@
1. Required Input:
i. An Alignment Score Matrix File:
-A score matrix file is required, because poa uses it to get the residue
-alphabet and indexing. Even if poa is not being used to perform
-multiple sequence alignment, this file must be provided. Any basic alignment
-matrix which may be used with BLAST may be used here. This file must be the
-first command line argument without a flag in order to be interpreted by poa
-as the score matrix file. An example score matrix file, blosum80.mat, is
-provided in this directory.
-Note: Poa is Case Sensitive.
-Poa can align amino acid sequences to nucleotide sequences. In order to
-distinguish amino acid residues from nucleic acid residues, poa is case
-sensitive. Residues that are upper case are interpreted as amino acids
-while residues that are lower case are interpreted as nucleotides. Poa
-can handle mixed score matrices containing scores for aligning amino acid
-residues to nucleotide residues as long as the column and row labels of the
-matrix are case sensitive. See blosum80.mat file for an example of just
-such a mixed score matrix.
+A score matrix file is required, because POA uses it to get the
+residue alphabet and indexing. Even if POA is not being used to
+perform multiple sequence alignment, this file must be provided. Any
+basic alignment matrix which may be used with BLAST may be used here.
+This file must be the first command line argument without a flag in
+order to be interpreted by POA as the score matrix file. Two example
+score matrix files, blosum80.mat and blosum80_trunc.mat, are provided
+in this directory. They includes scores for matching nucleotides, as
+well as amino acids. Header lines may be used to specify gap
+parameters, as in the examples:
-ii. A FASTA File:
-A FASTA file is required only if poa is being used to construct a
+GAP-PENALTIES=A B C
+GAP-TRUNCATION-LENGTH=T
+GAP-DECAY-LENGTH=D
+
+means that the gap opening penalty is A; the gap extension penalty is
+B until the gap length reaches T; the gap extension penalty decreases
+linearly from B to C for gap lengths between T and T+D; and the gap
+extension penalty is C for all longer gaps. An additional line,
+"GAP-PENALTIES-X=Ax Bx Cx", can be inserted after the GAP-PENALTIES
+line to specify the opening and extension penalties for gaps in the
+first sequence relative to the second sequence in an alignment, i.e.,
+for asymmetric gap scoring. Use the -v flag to see what gap penalties
+POA is using for a given run.
+
+NOTE: POA is case-sensitive.
+In order to distinguish amino acid residues from nucleic acid
+residues, POA is case sensitive. Residues that are uppercase are
+interpreted as amino acids, while residues that are lowercase are
+interpreted as nucleotides. POA can handle mixed score matrices
+containing both amino acid and nucleotide scores, as long as the
+column and row labels of the matrix are case-sensitive. The
+blosum80.mat file is an example.
+
+ii. A FASTA file, or MSA Files in PO, CLUSTAL or PIR Format
+A FASTA file is required only if POA is being used to construct a
new PO-MSA from a list of sequences, or to align a list of sequences to
an already existing PO-MSA (see Analyzing a PO-MSA below). This FASTA file
-should contain sequences to be aligned by poa. The command line argument
-to get poa to accept a FASTA file as input is '-read_fasta FILENAME'. Poa
-will interpret FILENAME as the FASTA sequence file. An example file,
-multidom.seq, is provided in this directory.
+should contain sequences to be aligned by POA. The command line argument
+to get POA to accept a FASTA file as input is '-read_fasta FILENAME'. POA
+will interpret FILENAME as the FASTA sequence file. An example file,
+multidom.seq, is provided in this directory.
-Poa is case sensitive (see note above). All residues in protein sequences
-in the FASTA file must be upper case to be interpreted as amino acids by poa,
-while all residues in nucleotide sequences in the FASTA file must be lower
-case to be interpreted as nucleotides by poa. To switch the case of all of
-the letters in the FASTA file to upper case, use the '-toupper' command
-line argument. To switch the case of all the letters in the FASTA file to
-lower case, use the '-tolower' command line argument.
+POA is case-sensitive (see NOTE above). All residues in the FASTA
+file must be uppercase to be interpreted as amino acids by POA, or
+lowercase to be interpreted as nucleotides. To switch the case of all
+of the letters in the FASTA file to uppercase, use the '-toupper'
+command line argument. To switch the case of all the letters in the
+FASTA file to lowercase, use the '-tolower' command line argument.
+POA will also read in a set of MSA files to be aligned (see below).
+
+
2. MSA Construction Options:
-i. Aggressive Fusion:
-During the building up of a PO-MSA, if a node i with label 'a' is aligned
-to an align ring which already contains a node j with label 'a', poa simply
-adds the node to the align ring. It is possible to force poa to do
-aggressive fusion, so that when a node i with label 'a' is aligned to
-an align ring which already contains a node j with label 'a', node i is
-fused to node j. The command line argument for accomplishing this is
-'-fuse_all'.
+i. Global vs. Local Alignment
+POA will build alignments using local or global alignment. The
+default is set to local aligment. To call global alignment, use the
+'-do_global' flag.
+
+ii. Iterative vs. Progressive Alignment
+
+POA will build the alignment iteratively, aligning sequences and MSAs
+in the order they are provided. It will also align sequences and MSAs
+in the order dictated by a guide tree built from a matrix of pairwise
+similarity scores. To call progressive alignment, use the
+'-do_progressive' flag.
+
+To provide POA with a set of pairwise similarity scores, use the
+'-read_pairscores' flag followed by the name of the text file
+containing the list of pairwise similarity scores. This file should
+be a tab-delimited file, where each row contains two sequence names
+followed by the pairwise similarity score. The included file
+"multidom.pscore" shows an example.
+
+Example row in pairscore file:
+ABL1_HUMAN MATK_HUMAN 260.0
+
+To quickly compute a set of pairwise similarity scores, run BLAST, and
+set the similarity scores to the set of BLAST bitscores. A simple
+BLAST driver/parser is provided as "make_pscores.pl"; you may need to
+modify this script for your particular configuration. If the
+'-do_progressive' flag is specified without a corresponding pairscore
+file, POA will compute pairwise similarity scores itself, by
+performing all pairwise alignments. This is very slow, and we do not
+recommend it.
+
+iii. Aggressive Fusion:
+
+By default, during the building up of a PO-MSA, if a node i with label
+'A' is aligned to a node j with label 'B' that belongs to an align
+ring containing a third node k with label 'A', POA simply adds node i
+to the j-k align ring. It is possible to force POA to do aggressive
+fusion, so that node i is instead fused to node k. Use the '-fuse_all'
+flag to accomplish this.
+
+
3. MSA Output Formats:
-Poa can output a PO-MSA in several formats simultaneously including CLUSTAL,
-PIR, and PO. The PO format is the best format since it contains all of the
-information in the PO-MSA. The other formats accurately represent the MSA,
-but since they are RC-MSA formats, they may lose some of the information in the
-full PO-MSA.
+POA can output a PO-MSA in several formats simultaneously, including
+CLUSTAL, PIR, and PO. The PO format is the best format since it
+contains all of the information in the PO-MSA. The other formats
+accurately represent the MSA, but since they are RC-MSA formats, they
+may lose some of the information in the full PO-MSA.
+
i. CLUSTAL format:
-This format is the standard CLUSTAL format. The command line argument to get
-the MSA output in this format is '-clustal FILENAME'.
+This format is the standard CLUSTAL format. The command line argument
+to get the MSA output in this format is '-clustal FILENAME'.
ii. PIR format:
This format is the standard PIR format, which is like FASTA with a '.'
@@ -84,87 +144,112 @@
output in this format is '-pir FILENAME'.
iii. PO format:
-This format is the standard PO format. It is described below in the section
-PO format. The command line argument to get the MSA output in this format is
-'-po FILENAME'.
+This format is the standard PO format. It is described below in the
+section PO format. The command line argument to get the MSA output in
+this format is '-po FILENAME'.
-Example: Constructing a MSA of Four Protein Sequences
-Running poa with the following statement will take the fasta formatted
-sequences in the multidom.seq file, construct a PO-MSA using the scoring
-matrix in the file blosum80.mat, and then output the PO-MSA in CLUSTAL format
-in the file multidom.aln.
+EXAMPLE: Constructing an MSA of Four Protein Sequences
-poa -clustal multidom.aln blosum80.mat multidom.seq
+Running POA with the following statement will take the FASTA-formatted
+sequences in the multidom.seq file, construct a PO-MSA using the
+scoring matrix in the file blosum80.mat, and then output the PO-MSA in
+CLUSTAL format to the file multidom.aln.
-The output should be identical to the results of figs. 6 & 7 in the paper.
+poa -read_fasta multidom.seq -clustal multidom.aln blosum80.mat
+
4. Other Output:
-i. Score Matrix
-Poa will also print to stdout the score matrix stored in the '.mat' file.
-The command line argument to get poa to do this is '-printmatrix LETTERSET',
-where LETTERSET is a string of letters to be printed with the score matrix.
-For example, if the score matrix is designed for protein alignment the
-letter set might be 'ARNDCQEGHILKMFPSTWYV'.
+i. Score Matrix
-ii. Verbose Mode
-Poa will run in verbose mode, printing additional information generated
-during the run to stdout. The command line argument to get poa to do run in
-verbose mode is '-v'.
+POA will also print to stdout the score matrix stored in the '.mat'
+file. The command line argument to get POA to do this is
+'-printmatrix LETTERSET', where LETTERSET is a string of letters to be
+printed with the score matrix. For example, if the score matrix is
+designed for protein alignment the letter set might be
+'ARNDCQEGHILKMFPSTWYV'.
+ii. Verbose Mode
+
+POA will run in verbose mode, printing additional information
+generated during the run (such as the set of gap scores used) to
+stdout. The command line argument for verbose mode is '-v'.
+
+
B. Analyzing a PO-MSA
-----------------------
-Poa can also take a PO format file as input and rebuild the PO-MSA data
-structure. Once this data structure has been rebuilt, it may be analyzed
-for features. In 'liblpo.a', the linkable poa library, we have included the
-functions necessary to do heaviest bundling and thereby find consensus
-sequences in the PO-MSA (the details of the heaviest bundling algorithm
-are described elsewhere). Poa has been written so that users may create
-their own functions for analyzing a PO-MSA. We have not included in the
-'liblpo.a' library the functions that we wrote to analyze PO-MSAs
-constructed with ESTs and genome sequence to find snps and alternative
-splice sites. However, it is possible to design modular library functions
-that will look for highly specific biological features in any PO-MSA data
-structure.
+POA can also take as input an MSA in PO, CLUSTAL or PIR file format
+and rebuild the PO-MSA data structure. Once this data structure has
+been rebuilt, it may be analyzed for features. In "liblpo.a", the
+linkable POA library, we have included the functions necessary to do
+heaviest bundling and thereby find consensus sequences in the PO-MSA
+(the details of the heaviest bundling algorithm are described
+elsewhere). POA has been written so that users may create their own
+functions for analyzing a PO-MSA. We have not included in the
+"liblpo.a" library the functions that we wrote to analyze PO-MSAs
+constructed with ESTs and genome sequence to find snps and alternative
+splice sites. However, it is possible to design modular library
+functions that will look for highly specific biological features in
+any PO-MSA data structure.
+
1. Required Input:
Before the PO-MSA data structure can be analyzed it must be built. It can
-be built either from a PO file or from a FASTA file, or from both a PO file
-and a FASTA file.
+be built iteratively or using a guide tree, or converted from another file
+type. POA can align a set of FASTA-formatted sequences to each other or
+to an existing PO-MSA. It can align two PO-MSAs. It can also align an
+arbitrary set of PO-MSAs.
-Note: POA Requires Either A PO File or a FASTA File
-If neither files are read in by POA it will terminate early, since it
+Note: POA Requires Either An MSA File or a FASTA File
+If neither type of file is read in by POA it will terminate early, since it
has not received any sequence data.
-i. A PO file:
-Poa will read in a PO formatted file. The command line argument to get
-poa to read in a PO formatted file and rebuild the PO-MSA data structure
-is '-read_po FILENAME'.
+i. An MSA file in PO, CLUSTAL or PIR format:
-It is possible to filter the PO-MSA data structure as it is being rebuilt. In
-order to filter the PO-MSA in the PO file to include only a subset of sequences
-use the command line argument '-subset FILENAME', where the file named FILENAME
-contains the list of sequence names to be included in the new PO-MSA. In order
-to filter the PO-MSA in the PO file to exclude a subset of sequences use the
-command line argument '-remove FILENAME', where the file named FILENAME contains
-the list of sequences to be excluded from the new PO-MSA. The names of
-sequences to be included or excluded should be in the format 'SOURCENAME= *"
-as they are in the PO file. Lists of sequence source names can be created by
-using the unix grep utility on the PO file.
+POA will read in an MSA file in PO, CLUSTAL or PIR format. The
+command line argument to get poa to read in an MSA file and rebuild
+the PO-MSA data structure is '-read_msa FILENAME'. POA automatically
+determines whether the MSA file is in PO, CLUSTAL, or PIR format. POA
+will read in a second MSA file when the '-read_msa2 FILENAME' flag is
+used. POA will read in a set of MSA files using the '-read_msa_list
+FILENAME' flag. The file should contain a list of names of MSA files.
-ii. A FASTA File:
-The FASTA file should contain sequences to be aligned by poa. The command line
-argument to get poa to accept a FASTA file as input is '-read_fasta FILENAME'.
-Poa will interpret FILENAME as the FASTA sequence file. An example file,
-multidom.seq, is provided in this directory. (See note above on case sensitivity).
+It is possible to filter the PO-MSA data structure as it is being
+rebuilt. In order to filter the PO-MSA in the MSA file to include
+only a subset of sequences use the command line argument '-subset
+FILENAME', where the file named FILENAME contains the list of sequence
+names to be included in the new PO-MSA. In order to filter the PO-MSA
+in the MSA file to exclude a subset of sequences, use the command line
+argument '-remove FILENAME', where the file named FILENAME contains
+the list of sequences to be excluded from the new PO-MSA. The names
+of sequences to be included or excluded should be in the format
+"SOURCENAME=*", as they are in a PO file. Lists of sequence source
+names can be created by using the unix grep utility on the PO file.
+Each line in the list of sequences to be filtered should read,
+"SOURCENAME=" followed by the name of the sequence,
+e.g. "SOURCENAME=ABL1_HUMAN". To filter the second PO-MSA read in
+using the '-read_msa2 FILENAME', use the '-subset2' and '-remove2'
+flags.
-Note: POA Can Take Both A PO File And A FASTA File As Input
-If both the '-read_po FILENAME' argument and the '-read_fasta FILENAME'
-argument are given to poa on the command line, then poa will first rebuild the
-PO-MSA in the PO file, and then it will align the sequences in the FASTA file
-to this PO-MSA.
+ii. A FASTA File:
+The FASTA file should contain sequences to be aligned by POA. The
+command line argument to get POA to accept a FASTA file as input is
+'-read_fasta FILENAME'. POA will interpret FILENAME as the FASTA
+sequence file. An example file, "multidom.seq", is provided in this
+directory. (See note above on case-sensitivity).
+
+NOTE: POA Can Take Both An MSA File And A FASTA File As Input
+If both the '-read_msa FILENAME' argument and the '-read_fasta FILENAME'
+argument are given to POA on the command line, then POA will first rebuild the
+PO-MSA in the MSA file, and then it will align the sequences in the FASTA file
+to this PO-MSA. Similarly, if both the '-read_msa_list FILENAME' flag
+and the '-read_fasta FILENAME' flag are given to POA, then POA will
+rebuild all of the PO-MSAs and will align them to each other and to the
+sequences in the FASTA file.
+
+
2. Additional PO Utilities:
i. Consensus Generation Via Heaviest Bundling Algorithm:
@@ -173,7 +258,7 @@
function adds the new consensus sequences to the PO-MSA by storing new
consensus sequence indices on the in the PO-MSA nodes corresponding to
the consensus sequence paths. The sequence source names for consensus
-sequences generated by heaviest bundling is CONSENS'i' where 'i' is the
+sequences generated by heaviest bundling are CONSENS'i' where 'i' is the
index of the bundle corresponding to the consensus sequence.
The heaviest bundling algorithm can also take as input a bundling
@@ -189,12 +274,13 @@
in the SOURCEINFO line for each sequence the index of the bundle to which
that sequence belongs is give. Finally, using the command line argument
'-best' restricts the MSA output to the consensus sequences generated
-by heaviest bundling.
+by heaviest bundling (NB: this applies to PIR output only).
+
III. PO FILE FORMAT
****************************HEADER****************************************
-VERSION= ~Current version of poa~
+VERSION= ~Current version of POA,e.g. LPO.1.0~
NAME= ~Name of PO-MSA. Defaults to name of 1st sequence in PO-MSA~
TITLE= ~Title of PO-MSA. Defaults to title of 1st sequence in PO-MSA~
LENGTH= ~Number of nodes in PO-MSA~
@@ -232,4 +318,3 @@
For more information, see http://www.bioinformatics.ucla.edu/poa.
-
Modified: trunk/packages/poa/branches/upstream/current/align_lpo.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_lpo.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/align_lpo.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -167,30 +167,14 @@
insert_x_try,insert_x_score,insert_y_score,best_score= -999999;
LPORowFreeList_T *free_list=NULL;
LastRightList_T *last_right=NULL;
+
+ int max_gap_length;
+ LPOScore_T *gap_penalty_x, *gap_penalty_y;
- LPOScore_T *gap_penalty_x, *gap_penalty_y;
- double gap_decrement;
- int L_trunc = m->trunc_gap_length, L_decay = m->decay_gap_length;
+ max_gap_length = m->max_gap_length;
+ gap_penalty_x = m->gap_penalty_x;
+ gap_penalty_y = m->gap_penalty_y;
- CALLOC (gap_penalty_x, L_trunc + L_decay + 1, LPOScore_T);
- CALLOC (gap_penalty_y, L_trunc + L_decay + 1, LPOScore_T);
-
- gap_penalty_x[0] = m->gap_penalty_set[0][0]; /* INITIALIZE GAP PENALTY LOOKUP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1];
- gap_decrement = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_x[L_trunc + L_decay] = m->gap_penalty_set[0][2]; /* (REDUNDANT) */
-
- gap_penalty_y[0] = m->gap_penalty_set[1][0]; /* DIFFERENT PENALTY FOR Y-GAP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1];
- gap_decrement = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_y[L_trunc + L_decay] = m->gap_penalty_set[1][2]; /* (REDUNDANT) */
-
CALLOC(score,len_x,LPOScore_T *); /* ALLOCATE MATRIX STORAGE: ROW POINTERS */
CALLOC(move,len_x,LPOMove_T *);
CALLOC(gap_length,len_x,LPOGapLength_T *);
@@ -237,8 +221,8 @@
insert_x=i_x;
insert_x_score=insert_x_try;
new_gap_len=current_gap_length+1;
- if (new_gap_len>L_trunc+L_decay) /* PREVENT OVERFLOW */
- new_gap_len=L_trunc+L_decay;
+ if (new_gap_len>max_gap_length) /* PREVENT OVERFLOW */
+ new_gap_len=max_gap_length;
}
/* FIND BEST [X,1] MOVE */
@@ -285,8 +269,8 @@
my_move[j].x=0;
my_move[j].y=1;
my_gap_length[j]=current_gap_length+1;
- if (my_gap_length[j]>L_trunc+L_decay) /* PREVENT OVERFLOW */
- my_gap_length[j]=L_trunc+L_decay;
+ if (my_gap_length[j]>max_gap_length) /* PREVENT OVERFLOW */
+ my_gap_length[j]=max_gap_length;
}
if (my_score[j]>best_score) { /* RECORD BEST MOVE */
best_score=my_score[j];
@@ -310,9 +294,6 @@
x_to_y,y_to_x);
- FREE (gap_penalty_x);
- FREE (gap_penalty_y);
-
free_row_list(nfree_list,free_list);
FREE(move_base); /* DUMP ALLOCATED MATRIX */
FREE(score);
Modified: trunk/packages/poa/branches/upstream/current/align_lpo2.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_lpo2.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/align_lpo2.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -168,11 +168,10 @@
DPScore_T *curr_score = NULL, *prev_score = NULL, *init_col_score = NULL, *my_score, *swap;
+ int max_gap_length;
LPOScore_T *gap_penalty_x, *gap_penalty_y;
- double gap_decrement;
int *next_gap_array, *next_perp_gap_array;
- int L_trunc = m->trunc_gap_length, L_decay = m->decay_gap_length;
-
+
LPOScore_T try_score, insert_x_score, insert_y_score, match_score;
int insert_x_x, insert_x_gap;
int insert_y_y, insert_y_gap;
@@ -180,35 +179,20 @@
long n_edges = 0;
- CALLOC (gap_penalty_x, L_trunc + L_decay + 2, LPOScore_T);
- CALLOC (gap_penalty_y, L_trunc + L_decay + 2, LPOScore_T);
- CALLOC (next_gap_array, L_trunc + L_decay + 2, int);
- CALLOC (next_perp_gap_array, L_trunc + L_decay + 2, int);
-
- gap_penalty_x[0] = m->gap_penalty_set[0][0]; /* INITIALIZE GAP PENALTY LOOKUP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1];
- gap_decrement = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_x[L_trunc + L_decay] = m->gap_penalty_set[0][2]; /* (REDUNDANT) */
- gap_penalty_x[L_trunc + L_decay + 1] = 0; /* INITIAL STATE FOR LOCAL ALIGNMENT */
+ max_gap_length = m->max_gap_length;
+ gap_penalty_x = m->gap_penalty_x;
+ gap_penalty_y = m->gap_penalty_y;
+ CALLOC (next_gap_array, max_gap_length + 2, int);
+ CALLOC (next_perp_gap_array, max_gap_length + 2, int);
- gap_penalty_y[0] = m->gap_penalty_set[1][0]; /* DIFFERENT PENALTY FOR Y-GAP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1];
- gap_decrement = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_y[L_trunc + L_decay] = m->gap_penalty_set[1][2]; /* (REDUNDANT) */
- gap_penalty_y[L_trunc + L_decay + 1] = 0; /* INITIAL STATE FOR LOCAL ALIGNMENT */
-
- for (i=0; i<L_trunc+L_decay+2; i++) {
- /* GAP LENGTH EXTENSION RULE: */
- /* 0->1, 1->2, 2->3, ..., M-1->M, M->M, M+1->M+1. */
- next_gap_array[i] = (i < L_trunc+L_decay) ? i+1 : i;
- next_perp_gap_array[i] = (DOUBLE_GAP_SCORING && i < L_trunc+L_decay+1) ? 0 : next_gap_array[i];
+ /* GAP LENGTH EXTENSION RULE: */
+ /* 0->1, 1->2, 2->3, ..., M-1->M, M->M, M+1->M+1. */
+ for (i=0; i<max_gap_length+1; i++) {
+ next_gap_array[i] = (i<max_gap_length) ? i+1 : i;
+ next_perp_gap_array[i] = (DOUBLE_GAP_SCORING ? 0 : next_gap_array[i]);
}
+ next_gap_array[max_gap_length+1] = max_gap_length+1;
+ next_perp_gap_array[max_gap_length+1] = max_gap_length+1;
get_seq_left_and_final (lposeq_x, &x_left, &is_final_node_x);
@@ -358,8 +342,6 @@
x_to_y, y_to_x);
- FREE (gap_penalty_x);
- FREE (gap_penalty_y);
FREE (next_gap_array);
FREE (next_perp_gap_array);
Modified: trunk/packages/poa/branches/upstream/current/align_lpo_po.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_lpo_po.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/align_lpo_po.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -106,29 +106,13 @@
LPOScore_T match_score,previous_score,
insert_x_try,insert_x_score,insert_y_score,best_score= -999999;
- LPOScore_T *gap_penalty_x, *gap_penalty_y;
- double gap_decrement;
- int L_trunc = m->trunc_gap_length, L_decay = m->decay_gap_length;
-
- CALLOC (gap_penalty_x, L_trunc + L_decay + 1, LPOScore_T);
- CALLOC (gap_penalty_y, L_trunc + L_decay + 1, LPOScore_T);
-
- gap_penalty_x[0] = m->gap_penalty_set[0][0]; /* INITIALIZE GAP PENALTY LOOKUP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1];
- gap_decrement = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_x[L_trunc + L_decay] = m->gap_penalty_set[0][2]; /* (REDUNDANT) */
-
- gap_penalty_y[0] = m->gap_penalty_set[1][0]; /* DIFFERENT PENALTY FOR Y-GAP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1];
- gap_decrement = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_y[L_trunc + L_decay] = m->gap_penalty_set[1][2]; /* (REDUNDANT) */
-
+ int max_gap_length;
+ LPOScore_T *gap_penalty_x, *gap_penalty_y;
+
+ max_gap_length = m->max_gap_length;
+ gap_penalty_x = m->gap_penalty_x;
+ gap_penalty_y = m->gap_penalty_y;
+
CALLOC(score,len_x,LPOScore_T *); /* ALLOCATE MATRIX STORAGE: ROW POINTERS */
CALLOC(move,len_x,LPOMove_T *);
CALLOC(gap_length,len_x,LPOGapLength_T *);
@@ -167,8 +151,8 @@
insert_x=i_x;
insert_x_score=insert_x_try;
new_gap_len=current_gap_length+1;
- if (new_gap_len>TRUNCATE_GAP_LENGTH) /* PREVENT OVERFLOW */
- new_gap_len=TRUNCATE_GAP_LENGTH;
+ if (new_gap_len>max_gap_length) /* PREVENT OVERFLOW */
+ new_gap_len=max_gap_length;
}
/* FIND BEST [X,Y] MOVE */
@@ -239,8 +223,8 @@
my_move[j].x=0;
my_move[j].y=insert_y;
my_gap_length[j]=current_gap_length+1;
- if (my_gap_length[j]>TRUNCATE_GAP_LENGTH) /* PREVENT OVERFLOW */
- my_gap_length[j]=TRUNCATE_GAP_LENGTH;
+ if (my_gap_length[j]>max_gap_length) /* PREVENT OVERFLOW */
+ my_gap_length[j]=max_gap_length;
}
if (my_score[j]>best_score) { /* RECORD BEST MOVE */
best_score=my_score[j];
@@ -254,8 +238,6 @@
trace_back_lpo_po_alignment(len_x,seq_x,len_y,seq_y,move,best_x,best_y,
x_to_y,y_to_x);
- FREE (gap_penalty_x);
- FREE (gap_penalty_y);
FREE(score_base); /* FREE MEMORY */
FREE(move_base);
Modified: trunk/packages/poa/branches/upstream/current/align_lpo_po2.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_lpo_po2.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/align_lpo_po2.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -21,64 +21,87 @@
}
DPScore_T;
-/** is node 'i' the first node in any sequence in lposeq? */
-static int is_initial_node (int i, LPOSequence_T *lposeq)
+
+
+#define LPO_INITIAL_NODE 1
+#define LPO_FINAL_NODE 2
+
+static void get_lpo_stats (LPOSequence_T *lposeq,
+ int *n_nodes_ptr, int *n_edges_ptr, int **node_type_ptr,
+ int **refs_from_right_ptr, int *max_rows_alloced_ptr,
+ LPOLetterLink_T ***left_links_ptr)
{
- LPOLetterSource_T *src = &((lposeq->letter[i]).source);
- while (src != NULL && src->iseq >= 0) {
- if (src->ipos == 0) {
- return 1;
+ int i, j, rows_alloced = 0, max_rows_alloced = 0, n_edges = 0, len = lposeq->length;
+ LPOLetter_T *seq = lposeq->letter;
+ int *node_type;
+ int *refs_from_right, *tmp;
+ LPOLetterSource_T *src;
+ LPOLetterLink_T *lnk, **left_links;
+
+ CALLOC (node_type, len, int);
+ CALLOC (refs_from_right, len, int);
+ CALLOC (tmp, len, int);
+ CALLOC (left_links, len, LPOLetterLink_T *);
+
+ for (i=0; i<len; i++) {
+
+ /* NODES CONTAINING THE FIRST RESIDUE IN ANY SEQ ARE 'INITIAL'; */
+ /* DITTO, LAST RESIDUE IN ANY SEQ, 'FINAL'. */
+ for (src = &(seq[i].source); src != NULL && src->iseq >= 0; src = src->more) {
+ if (src->ipos == 0) {
+ node_type[i] = (node_type[i] | LPO_INITIAL_NODE);
+ }
+ if (src->ipos == (lposeq->source_seq[src->iseq]).length - 1) {
+ node_type[i] = (node_type[i] | LPO_FINAL_NODE);
+ }
}
- src = src->more;
+
+ /* COUNTING THE LEFT-LINKS BACK TO EACH NODE ALLOWS FOR EFFICIENT */
+ /* MEMORY MANAGEMENT OF 'SCORE' ROWS (in align_lpo_po). */
+ for (lnk = &(seq[i].left); lnk != NULL && lnk->ipos >= 0; lnk = lnk->more) {
+ refs_from_right[lnk->ipos]++;
+ n_edges++;
+ }
}
- return 0;
-}
-/** is node 'i' the last node in any sequence in lposeq? */
-static int is_final_node (int i, LPOSequence_T *lposeq)
-{
- LPOLetterSource_T *src = &((lposeq->letter[i]).source);
- while (src != NULL && src->iseq >= 0) {
- if (src->ipos == (lposeq->source_seq[src->iseq]).length - 1) {
- return 1;
+ /* ALL 'INITIAL' NODES (1st in some seq) MUST BE LEFT-LINKED TO -1. */
+ /* THIS ALLOWS FREE ALIGNMENT TO ANY 'BRANCH' IN GLOBAL ALIGNMENT. */
+ for (i=0; i<len; i++) {
+ if ((node_type[i] & LPO_INITIAL_NODE) && seq[i].left.ipos != -1) {
+ CALLOC (left_links[i], 1, LPOLetterLink_T);
+ left_links[i]->ipos = -1;
+ left_links[i]->score = 0;
+ left_links[i]->more = &seq[i].left;
}
- src = src->more;
+ else {
+ left_links[i] = &seq[i].left;
+ }
}
- return 0;
-}
-
-
-static void get_seq_left_and_final (LPOSequence_T *lposeq_x,
- LPOLetterLink_T ***x_left_ptr,
- int **is_final_node_ptr)
-{
- int i, len_x = lposeq_x->length;
- LPOLetter_T *seq_x = lposeq_x->letter;
- int *is_final_node_x = NULL;
- LPOLetterLink_T **x_left = NULL;
- CALLOC (is_final_node_x, len_x, int);
-
- for (i=0; i<len_x; i++) {
- is_final_node_x[i] = is_final_node (i, lposeq_x);
+ for (i=0; i<len; i++) {
+ tmp[i] = refs_from_right[i];
}
- CALLOC (x_left, len_x, LPOLetterLink_T *);
-
- for (i=0; i<len_x; i++) {
- if (is_initial_node (i, lposeq_x) && seq_x[i].left.ipos != -1) {
- CALLOC (x_left[i], 1, LPOLetterLink_T);
- x_left[i]->ipos = -1;
- x_left[i]->score = 0;
- x_left[i]->more = &seq_x[i].left;
+ for (i=0; i<len; i++) {
+ rows_alloced++;
+ if (rows_alloced > max_rows_alloced) {
+ max_rows_alloced = rows_alloced;
}
- else {
- x_left[i] = &seq_x[i].left;
+ for (lnk = &(seq[i].left); lnk != NULL && lnk->ipos >= 0; lnk = lnk->more) {
+ if ((--tmp[lnk->ipos]) == 0) {
+ rows_alloced--;
+ }
}
}
- (*is_final_node_ptr) = is_final_node_x;
- (*x_left_ptr) = x_left;
+ FREE (tmp);
+
+ (*n_nodes_ptr) = len;
+ (*n_edges_ptr) = n_edges;
+ (*node_type_ptr) = node_type;
+ (*refs_from_right_ptr) = refs_from_right;
+ (*max_rows_alloced_ptr) = max_rows_alloced;
+ (*left_links_ptr) = left_links;
}
@@ -145,7 +168,8 @@
}
-/** performs partial order alignment:
+/** (align_lpo_po:)
+ performs partial order alignment:
lposeq_x and lposeq_y are partial orders;
returns the alignment in x_to_y[] and y_to_x[], and also
returns the alignment score as the return value.
@@ -160,66 +184,73 @@
(int, int, LPOLetter_T *, LPOLetter_T *, ResidueScoreMatrix_T *),
int use_global_alignment)
{
- int len_x = lposeq_x->length;
- int len_y = lposeq_y->length;
LPOLetter_T *seq_x = lposeq_x->letter;
LPOLetter_T *seq_y = lposeq_y->letter;
- int i, j, xcount, ycount, prev_gap, next_gap;
+ int len_x, len_y;
+ int n_edges_x, n_edges_y;
+ int *node_type_x, *node_type_y;
+ int *refs_from_right_x, *refs_from_right_y;
+ int max_rows_alloced_x, max_rows_alloced_y, n_score_rows_alloced = 0;
+
+ int i, j, xcount, ycount, prev_gap;
int best_x = -1, best_y = -1;
- LPOScore_T best_score = -999999;
- int *is_final_node_x, *is_final_node_y;
+ LPOScore_T min_score = -999999, best_score = -999999;
+ int possible_end_square;
LPOLetterLink_T **x_left = NULL, **y_left = NULL, *xl, *yl;
DPMove_T **move = NULL, *my_move;
DPScore_T *curr_score = NULL, *prev_score = NULL, *init_col_score = NULL, *my_score;
DPScore_T **score_rows = NULL;
-
+
+ int max_gap_length;
LPOScore_T *gap_penalty_x, *gap_penalty_y;
- double gap_decrement;
int *next_gap_array, *next_perp_gap_array;
- int L_trunc = m->trunc_gap_length, L_decay = m->decay_gap_length;
LPOScore_T try_score, insert_x_score, insert_y_score, match_score;
int insert_x_x, insert_x_gap;
int insert_y_y, insert_y_gap;
int match_x, match_y;
- long n_edges = 0;
+ get_lpo_stats (lposeq_x, &len_x, &n_edges_x, &node_type_x, &refs_from_right_x, &max_rows_alloced_x, &x_left);
+ get_lpo_stats (lposeq_y, &len_y, &n_edges_y, &node_type_y, &refs_from_right_y, &max_rows_alloced_y, &y_left);
- CALLOC (gap_penalty_x, L_trunc + L_decay + 2, LPOScore_T);
- CALLOC (gap_penalty_y, L_trunc + L_decay + 2, LPOScore_T);
- CALLOC (next_gap_array, L_trunc + L_decay + 2, int);
- CALLOC (next_perp_gap_array, L_trunc + L_decay + 2, int);
+ /*
+ fprintf (stdout, "sequence x: %ld nodes, %ld edges, %ld rows at most --> %ld mem\n", len_x, n_edges_x, max_rows_alloced_x, max_rows_alloced_x * len_y);
+ fprintf (stdout, "sequence y: %ld nodes, %ld edges, %ld rows at most --> %ld mem\n", len_y, n_edges_y, max_rows_alloced_y, max_rows_alloced_y * len_x);
+ */
+
+ /* INITIALIZE GAP PENALTIES: */
+ max_gap_length = m->max_gap_length;
+ gap_penalty_x = m->gap_penalty_x;
+ gap_penalty_y = m->gap_penalty_y;
+ CALLOC (next_gap_array, max_gap_length + 2, int);
+ CALLOC (next_perp_gap_array, max_gap_length + 2, int);
- gap_penalty_x[0] = m->gap_penalty_set[0][0]; /* INITIALIZE GAP PENALTY LOOKUP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1];
- gap_decrement = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_x[i] = m->gap_penalty_set[0][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_x[L_trunc + L_decay] = m->gap_penalty_set[0][2]; /* (REDUNDANT) */
- gap_penalty_x[L_trunc + L_decay + 1] = 0; /* INITIAL STATE FOR LOCAL ALIGNMENT */
-
- gap_penalty_y[0] = m->gap_penalty_set[1][0]; /* DIFFERENT PENALTY FOR Y-GAP */
- for (i=1;i<L_trunc;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1];
- gap_decrement = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(L_decay + 1));
- for (i=L_trunc;i<L_trunc+L_decay+1;i++)
- gap_penalty_y[i] = m->gap_penalty_set[1][1] - (i-L_trunc+1) * gap_decrement;
- gap_penalty_y[L_trunc + L_decay] = m->gap_penalty_set[1][2]; /* (REDUNDANT) */
- gap_penalty_y[L_trunc + L_decay + 1] = 0; /* INITIAL STATE FOR LOCAL ALIGNMENT */
-
- for (i=0; i<L_trunc+L_decay+2; i++) {
+ for (i=0; i<max_gap_length+1; i++) {
/* GAP LENGTH EXTENSION RULE: */
- /* 0->1, 1->2, 2->3, ..., M-1->M, M->M, M+1->M+1. */
- next_gap_array[i] = (i < L_trunc+L_decay) ? i+1 : i;
- next_perp_gap_array[i] = (DOUBLE_GAP_SCORING && i < L_trunc+L_decay+1) ? 0 : next_gap_array[i];
+ /* 0->1, 1->2, 2->3, ..., M-1->M; but M->M. */
+ next_gap_array[i] = (i<max_gap_length) ? i+1 : i;
+ /* PERPENDICULAR GAP (i.e. X FOR A GROWING Y-GAP) IS KEPT AT 0 IF DOUBLE-GAP-SCORING (old scoring) IS USED. */
+ next_perp_gap_array[i] = (DOUBLE_GAP_SCORING ? 0 : next_gap_array[i]);
}
- get_seq_left_and_final (lposeq_x, &x_left, &is_final_node_x);
- get_seq_left_and_final (lposeq_y, &y_left, &is_final_node_y);
+ /* GAP LENGTH = M+1 IS USED FOR INITIAL STATE. */
+ /* THIS MUST BE TREATED DIFFERENTLY FOR GLOBAL v. LOCAL ALIGNMENT: */
+ if (0 == use_global_alignment) { /* FREE EXTENSION OF INITIAL GAP (FOR LOCAL ALIGNMENT) */
+ gap_penalty_x[max_gap_length+1] = gap_penalty_y[max_gap_length+1] = 0;
+ next_gap_array[max_gap_length+1] = next_perp_gap_array[max_gap_length+1] = max_gap_length+1;
+ }
+ else { /* TREAT INITIAL GAP LIKE ANY OTHER (FOR GLOBAL ALIGNMENT) */
+ gap_penalty_x[max_gap_length+1] = gap_penalty_x[0];
+ gap_penalty_y[max_gap_length+1] = gap_penalty_y[0];
+ next_gap_array[max_gap_length+1] = next_gap_array[0];
+ next_perp_gap_array[max_gap_length+1] = next_perp_gap_array[0];
+ }
+
+ /* ALLOCATE MEMORY FOR 'MOVE' AND 'SCORE' MATRICES: */
+
CALLOC (move, len_y, DPMove_T *);
for (i=0; i<len_y; i++) {
CALLOC (move[i], len_x, DPMove_T);
@@ -230,25 +261,23 @@
CALLOC (score_rows, len_y+1, DPScore_T *);
score_rows = &(score_rows[1]);
- for (i=-1; i<len_y; i++) {
- CALLOC (score_rows[i], len_x+1, DPScore_T);
- score_rows[i] = &(score_rows[i][1]);
- }
+ CALLOC (score_rows[-1], len_x+1, DPScore_T);
+ score_rows[-1] = &(score_rows[-1][1]);
curr_score = score_rows[-1];
+
+
+ /* FILL INITIAL ROW (-1). */
+ /* GAP LENGTH = M+1 IS USED FOR INITIAL STATE. */
- /* FILL INITIAL ROW. */
- /* GLOBAL ALIGNMENT: no free gaps */
- /* LOCAL ALIGNMENT: free initial gap */
-
curr_score[-1].score = 0;
- curr_score[-1].gap_x = curr_score[-1].gap_y =
- (use_global_alignment) ? 0 : TRUNCATE_GAP_LENGTH+1;
+ curr_score[-1].gap_x = curr_score[-1].gap_y = max_gap_length+1;
for (i=0; i<len_x; i++) {
+ curr_score[i].score = min_score;
for (xcount = 1, xl = x_left[i]; xl != NULL; xcount++, xl = xl->more) {
prev_gap = curr_score[xl->ipos].gap_x;
try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap];
- if (xcount == 1 || try_score > curr_score[i].score) {
+ if (try_score > curr_score[i].score) {
curr_score[i].score = try_score;
curr_score[i].gap_x = next_gap_array[prev_gap];
curr_score[i].gap_y = next_perp_gap_array[prev_gap];
@@ -256,14 +285,15 @@
}
}
- /* FILL INITIAL COLUMN. */
+ /* FILL INITIAL COLUMN (-1). */
init_col_score[-1] = curr_score[-1];
for (i=0; i<len_y; i++) {
+ init_col_score[i].score = min_score;
for (ycount = 1, yl = y_left[i]; yl != NULL; ycount++, yl = yl->more) {
prev_gap = init_col_score[yl->ipos].gap_y;
try_score = init_col_score[yl->ipos].score + yl->score - gap_penalty_y[prev_gap];
- if (ycount == 1 || try_score > init_col_score[i].score) {
+ if (try_score > init_col_score[i].score) {
init_col_score[i].score = try_score;
init_col_score[i].gap_x = next_perp_gap_array[prev_gap];
init_col_score[i].gap_y = next_gap_array[prev_gap];
@@ -278,13 +308,28 @@
/* OUTER LOOP (i-th position in LPO y): */
for (i=0; i<len_y; i++) {
+ /* ALLOCATE MEMORY FOR 'SCORE' ROW i: */
+ CALLOC (score_rows[i], len_x+1, DPScore_T);
+ score_rows[i] = &(score_rows[i][1]);
+ n_score_rows_alloced++;
+
curr_score = score_rows[i];
-
curr_score[-1] = init_col_score[i];
/* INNER LOOP (j-th position in LPO x): */
for (j=0; j<len_x; j++) {
+
+ match_score = (use_global_alignment) ? min_score : 0;
+ match_x = match_y = 0;
+ insert_x_score = insert_y_score = min_score;
+ insert_x_x = insert_y_y = 0;
+ insert_x_gap = insert_y_gap = 0;
+
+ /* THIS SQUARE CAN END THE ALIGNMENT IF WE'RE USING LOCAL ALIGNMENT, */
+ /* OR IF BOTH THE X- AND Y-NODES CONTAIN THE END OF A SEQUENCE. */
+ possible_end_square = ((0 == use_global_alignment) || ((node_type_x[j] & LPO_FINAL_NODE) && (node_type_y[i] & LPO_FINAL_NODE)));
+
/* LOOP OVER y-predecessors: */
for (ycount = 1, yl = y_left[i]; yl != NULL; ycount++, yl = yl->more) {
@@ -293,18 +338,18 @@
/* IMPROVE Y-INSERTION?: trace back to (i'=yl->ipos, j) */
prev_gap = prev_score[j].gap_y;
try_score = prev_score[j].score + yl->score - gap_penalty_y[prev_gap];
- if (ycount == 1 || try_score > insert_y_score) {
+ if (try_score > insert_y_score) {
insert_y_score = try_score;
insert_y_y = ycount;
insert_y_gap = prev_gap;
}
- /* LOOP OVER x-predecessors: */
+ /* LOOP OVER x-predecessors (INSIDE y-predecessor LOOP): */
for (xcount = 1, xl = x_left[j]; xl != NULL; xcount++, xl = xl->more) {
/* IMPROVE XY-MATCH?: trace back to (i'=yl->ipos, j'=xl->ipos) */
try_score = prev_score[xl->ipos].score + xl->score + yl->score;
- if ((xcount == 1 && ycount == 1) || try_score > match_score) {
+ if (try_score > match_score) {
match_score = try_score;
match_x = xcount;
match_y = ycount;
@@ -312,26 +357,27 @@
}
}
- /* LOOP OVER x-predecessors */
- /* IMPROVE X-INSERTION?: trace back to (i, j'=xl->ipos) */
+ /* LOOP OVER x-predecessors (OUTSIDE y-predecessor LOOP): */
for (xcount = 1, xl = x_left[j]; xl != NULL; xcount++, xl = xl->more) {
+
+ /* IMPROVE X-INSERTION?: trace back to (i, j'=xl->ipos) */
prev_gap = curr_score[xl->ipos].gap_x;
try_score = curr_score[xl->ipos].score + xl->score - gap_penalty_x[prev_gap];
- if (xcount == 1 || try_score > insert_x_score) {
+ if (try_score > insert_x_score) {
insert_x_score = try_score;
insert_x_x = xcount;
insert_x_gap = prev_gap;
}
}
- if (0 == use_global_alignment && match_score <= 0) {
- match_score = 0;
- match_x = match_y = 0; /* FIRST ALIGNED PAIR */
+ /* USE CUSTOM OR DEFAULT SCORING FUNCTION: */
+ if (scoring_function != NULL) {
+ match_score += scoring_function (j, i, seq_x, seq_y, m);
}
+ else {
+ match_score += m->score[seq_x[i].letter][seq_y[j].letter];
+ }
- n_edges += (xcount-1)*(ycount-1);
- match_score += scoring_function (j, i, seq_x, seq_y, m);
-
my_score = &curr_score[j];
my_move = &move[i][j];
@@ -353,7 +399,6 @@
}
else {
/* Y-INSERTION */
- next_gap = next_gap_array[insert_y_gap];
my_score->score = insert_y_score;
my_score->gap_x = next_perp_gap_array[insert_y_gap];
my_score->gap_y = next_gap_array[insert_y_gap];
@@ -361,9 +406,9 @@
my_move->y = insert_y_y;
}
- /* RECORD BEST START FOR TRACEBACK */
- /* KEEPING ONLY FINAL-FINAL BESTS FOR GLOBAL ALIGNMENT */
- if (my_score->score >= best_score && (0 == use_global_alignment || (is_final_node_x[j] && is_final_node_y[i]))) {
+ /* RECORD BEST ALIGNMENT END FOR TRACEBACK: */
+ if (possible_end_square && my_score->score >= best_score) {
+ /* BREAK TIES BY CHOOSING MINIMUM (x,y): */
if (my_score->score > best_score || (j == best_x && i < best_y) || j < best_x) {
best_score = my_score->score;
best_x = j;
@@ -371,39 +416,54 @@
}
}
}
+
+ /* UPDATE # OF REFS TO 'SCORE' ROWS; FREE MEMORY WHEN POSSIBLE: */
+ for (yl = y_left[i]; yl != NULL; yl = yl->more) if ((j = yl->ipos) >= 0) {
+ if ((--refs_from_right_y[j]) == 0) {
+ score_rows[j] = &(score_rows[j][-1]);
+ FREE (score_rows[j]);
+ n_score_rows_alloced--;
+ }
+ }
+ if (refs_from_right_y[i] == 0) {
+ score_rows[i] = &(score_rows[i][-1]);
+ FREE (score_rows[i]);
+ n_score_rows_alloced--;
+ }
}
IF_GUARD(best_x>=len_x || best_y>=len_y,1.1,(ERRTXT,"Bounds exceeded!\nbest_x,best_y:%d,%d\tlen:%d,%d\n",best_x,best_y,len_x,len_y),CRASH);
- /*
- fprintf (stderr, "aligned (%d nodes, %ld edges) to (%d nodes)\n", len_x, n_edges, len_y);
- fprintf (stderr, "best score %d @ (%d %d)\n", best_score, best_x, best_y);
- */
-
+ /**/
+ fprintf (stderr, "aligned (%d nodes, %ld edges) to (%d nodes, %ld edges): ", len_x, n_edges_x, len_y, n_edges_y);
+ fprintf (stderr, "best %s score = %d @ (%d %d)\n", (use_global_alignment ? "global" : "local"), best_score, best_x, best_y);
+ /**/
+
/* DYNAMIC PROGRAMING MATRIX COMPLETE, NOW TRACE BACK FROM best_x, best_y */
trace_back_lpo_alignment (len_x, len_y, move, x_left, y_left,
best_x, best_y,
x_to_y, y_to_x);
- FREE (gap_penalty_x);
- FREE (gap_penalty_y);
+ /* CLEAN UP AND RETURN: */
+
+ FREE (node_type_x);
+ FREE (node_type_y);
+
+ FREE (refs_from_right_x);
+ FREE (refs_from_right_y);
+
FREE (next_gap_array);
FREE (next_perp_gap_array);
- for (i=-1; i<len_y; i++) {
- score_rows[i] = &(score_rows[i][-1]);
- FREE (score_rows[i]);
- }
+ score_rows[-1] = &(score_rows[-1][-1]);
+ FREE (score_rows[-1]);
score_rows = &(score_rows[-1]);
FREE (score_rows);
init_col_score = &(init_col_score[-1]);
FREE (init_col_score);
-
- FREE (is_final_node_x);
- FREE (is_final_node_y);
-
+
for (i=0; i<len_x; i++) {
if (x_left[i] != &seq_x[i].left) {
FREE (x_left[i]);
Modified: trunk/packages/poa/branches/upstream/current/align_score.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_score.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/align_score.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -17,10 +17,10 @@
-/* CATIE: YOU CAN PUT ANY SCORING METHOD YOU WANT INSIDE THIS
+/* YOU CAN PUT ANY SCORING METHOD YOU WANT INSIDE THIS
FUNCTION. JUST REPLACE THE CONTENTS OF THE FUNCTION WITH
YOUR SCORING METHOD */
-LPOScore_T caties_scoring_function(int i,
+LPOScore_T matrix_scoring_function(int i,
int j,
LPOLetter_T seq_x[],
LPOLetter_T seq_y[],
Added: trunk/packages/poa/branches/upstream/current/align_score.h
===================================================================
--- trunk/packages/poa/branches/upstream/current/align_score.h (rev 0)
+++ trunk/packages/poa/branches/upstream/current/align_score.h 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,15 @@
+#ifndef ALIGN_SCORE_HEADER_INCLUDED
+#define ALIGN_SCORE_HEADER_INCLUDED
+
+#include <default.h>
+#include <poa.h>
+#include <seq_util.h>
+
+/*********************************************************** align_score.c */
+LPOScore_T matrix_scoring_function(int i,
+ int j,
+ LPOLetter_T seq_x[],
+ LPOLetter_T seq_y[],
+ ResidueScoreMatrix_T *m);
+
+#endif
Added: trunk/packages/poa/branches/upstream/current/blosum80_trunc.mat
===================================================================
--- trunk/packages/poa/branches/upstream/current/blosum80_trunc.mat (rev 0)
+++ trunk/packages/poa/branches/upstream/current/blosum80_trunc.mat 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,42 @@
+# Blosum80
+# Matrix made by matblas from blosum80.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 80
+# Entropy = 0.9868, Expected = -0.7442
+GAP-TRUNCATION-LENGTH=10
+GAP-DECAY-LENGTH=5
+GAP-PENALTIES=12 6 0
+ A R N D C Q E G H I L K M F P S T W Y V B Z X ? a g t c u ] n
+A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9
+R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -9 -9 -9 -9 -9 -9 -9 -9
+N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9
+D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -9 -9 -9 -9 -9 -9 -9 -9
+C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -9 -9 -9 -9 -9 -9 -9 -9
+Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -9 -9 -9 -9 -9 -9 -9 -9
+E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -9 -9 -9 -9 -9 -9 -9 -9
+G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9
+H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -9 -9 -9 -9 -9 -9 -9 -9
+I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -9 -9 -9 -9 -9 -9 -9 -9
+L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -9 -9 -9 -9 -9 -9 -9 -9
+K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -9 -9 -9 -9 -9 -9 -9 -9
+M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -9 -9 -9 -9 -9 -9 -9 -9
+F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -9 -9 -9 -9 -9 -9 -9 -9
+P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -9 -9 -9 -9 -9 -9 -9 -9
+S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -9 -9 -9 -9 -9 -9 -9 -9
+T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -9 -9 -9 -9 -9 -9 -9 -9
+W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -9 -9 -9 -9 -9 -9 -9 -9
+Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -9 -9 -9 -9 -9 -9 -9 -9
+V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -9 -9 -9 -9 -9 -9 -9 -9
+B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -9 -9 -9 -9 -9 -9 -9 -9
+Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -9 -9 -9 -9 -9 -9 -9 -9
+X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -9 -9 -9 -9 -9 -9 -9 -9
+? -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9
+a -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 4 -2 -2 -2 -2 -9 0
+g -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 4 -2 -2 -2 -9 0
+t -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0
+c -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 -2 4 -2 -9 0
+u -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -2 -2 4 -2 4 -9 0
+] -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9
+n -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 0 0 0 0 0 -9 0
Modified: trunk/packages/poa/branches/upstream/current/buildup_lpo.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/buildup_lpo.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/buildup_lpo.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -6,7 +6,15 @@
#include "lpo.h"
-
+/** if two align-rings are aligned to each other, make sure
+ that the (single) aligned residue pair consists of identical
+ residues, if possible.
+ :
+ ((a,-),(b,-),(c,d),(-,a),(-b)) ==>
+ ((a,a),(b,-),(c,-),(-,d),(-b)) OR ((a,-),(b,b),(c,-),(-,d)).
+ :
+ ((a,-),(c,d),(-,b)) ==> self.
+*/
void fuse_ring_identities(int len_x,LPOLetter_T seq_x[],
int len_y,LPOLetter_T seq_y[],
LPOLetterRef_T al_x[],
@@ -16,17 +24,64 @@
LOOP (i,len_y) {
if (al_y[i]<0 || seq_x[al_y[i]].letter == seq_y[i].letter)
continue; /* NOT ALIGNED, OR ALREADY IDENTICAL, SO SKIP */
- for (j=seq_x[al_y[i]].align_ring;j!=al_y[i];j=seq_x[j].align_ring)
+ for (j=seq_x[al_y[i]].align_ring;j!=al_y[i];j=seq_x[j].align_ring) {
if (seq_x[j].letter == seq_y[i].letter) { /* IDENTICAL! SO FUSE! */
al_x[al_y[i]]= INVALID_LETTER_POSITION; /* DISCONNECT FROM OLD */
al_y[i]=j; /* CONNECT TO NEW IDENTITY */
al_x[j]=i;
break; /* SEARCH YE NO FURTHER */
}
+ }
}
}
+/** if two align-rings are aligned to each other, make sure
+ that as many identical-residue pairs are fused as possible.
+ :
+ ((a,-),(b,-),(c,d),(-,a),(-b)) ==>
+ ((a,a),(b,b),(c,-),(-,d)).
+ :
+ ((a,-),(c,d),(-,b)) ==> self.
+ NB:
+ THIS DOES NOT WORK with the current fuse_lpo function.
+ The fusion
+*/
+void full_fuse_ring_identities (int len_x, LPOLetter_T *seq_x,
+ int len_y, LPOLetter_T *seq_y,
+ LPOLetterRef_T *al_x,
+ LPOLetterRef_T *al_y)
+{
+ int i, ip, j, jp; /* i LABELS POS IN seq_x, j LABELS POS IN seq_y */
+ int ck; /* WAS IDENTICAL-RESIDUE PAIR FOUND? */
+ for (i=0; i<len_x; i++) if ((j = al_x[i]) >= 0) {
+ al_x[i] = al_y[j] = INVALID_LETTER_POSITION; /* DISCONNECT FROM OLD */
+
+ /* i,j ARE AN ALIGNED PAIR. WALK THROUGH RESPECTIVE RINGS: */
+ ip=i; jp=j; ck=0;
+ do /* while (ip!=i) */ {
+ do /* while (jp!=j) */ {
+ if (seq_x[ip].letter == seq_y[jp].letter) { /* IDENTICAL! SO FUSE! */
+ al_x[ip] = jp;
+ al_y[jp] = ip;
+ ck=1;
+ jp=j; /* EXIT TO OUTER LOOP... AT MOST ONE FUSED TO EACH POS IN seq_y. */
+ }
+ else {
+ jp = seq_y[jp].align_ring;
+ }
+ }
+ while (jp!=j);
+ ip = seq_x[ip].align_ring;
+ }
+ while (ip!=i);
+
+ if (ck==0) { /* NO IDENTICAL-RESIDUE PAIR FOUND, SO RECONNECT ORIGINAL */
+ al_x[i] = j;
+ al_y[j] = i;
+ }
+ }
+}
/** aligns the sequences in seq[] to the sequence or partial order in
@@ -37,7 +92,7 @@
int nseq,LPOSequence_T seq[],
ResidueScoreMatrix_T *score_matrix,
int use_aggressive_fusion,
- int use_global_alignment)
+ int use_global_alignment)
{
int i,max_alloc=0,total_alloc;
LPOLetterRef_T *al1=NULL,*al2=NULL;
@@ -54,12 +109,12 @@
fprintf(stderr,"max_alloc: %d bytes\n",max_alloc);
#endif
if (max_alloc>POA_MAX_ALLOC) {
- WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.1 $");
+ WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $");
break; /* JUST RETURN AND FINISH */
}
}
- align_lpo(new_seq,&seq[i],
- score_matrix,&al1,&al2,use_global_alignment); /* ALIGN ONE MORE SEQ */
+ align_lpo_po (new_seq,&seq[i],
+ score_matrix,&al1,&al2,NULL,use_global_alignment); /* ALIGN ONE MORE SEQ */
if (use_aggressive_fusion)
fuse_ring_identities(new_seq->length,new_seq->letter,
seq[i].length,seq[i].letter,al1,al2);
@@ -74,7 +129,7 @@
return new_seq;
}
/**@memo example: aligning a set of sequences to a partial order:
- lpo_out=buildup_lpo(lpo_in,nseq,seq,&score_matrix,0);
+ lpo_out=buildup_lpo(lpo_in,nseq,seq,&score_matrix);
*/
@@ -154,7 +209,7 @@
LPOSequence_T *buildup_clipped_lpo(LPOSequence_T *new_seq,
int nseq,LPOSequence_T seq[],
ResidueScoreMatrix_T *score_matrix,
- int use_global_alignment)
+ int use_global_alignment)
{
int i,ntemp,offset=0,nidentity,length_max=0,match_length=0;
int total_alloc,max_alloc=0;
@@ -175,12 +230,12 @@
new_seq->length,seq[i].length);
#endif
if (max_alloc>POA_MAX_ALLOC) {
- WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.1 $");
+ WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $");
break; /* JUST RETURN AND FINISH */
}
}
- align_lpo(new_seq, &seq[i],
- score_matrix,&al1,&al2,use_global_alignment); /* ALIGN ONE MORE SEQ */
+ align_lpo_po (new_seq, &seq[i],
+ score_matrix,&al1,&al2,NULL,use_global_alignment); /* ALIGN ONE MORE SEQ */
ntemp=seq[i].length; /* SAVE letter[] BEFORE CLIPPING IT TO ALIGNED AREA*/
temp=seq[i].letter;
if ((nidentity=clip_unaligned_ends(seq+i,al2,/*THERE IS AN ALIGNED REGION*/
@@ -204,112 +259,229 @@
}
-int find_seq_name(int nseq,LPOSequence_T seq[],char name[])
+/** which LPOSeq is called, or holds a sequence called, `name'? */
+int find_seq_name (int nseq, LPOSequence_T **seq, char name[])
{
- int i;
- LOOP (i,nseq)
- if (0==strcmp(seq[i].name,name))
+ int i,j;
+ for (i=0;i<nseq;i++) if (seq[i]) {
+ if (0==strcmp(seq[i]->name,name))
return i;
+ for (j=0;j<seq[i]->nsource_seq;j++) {
+ if (0==strcmp(seq[i]->source_seq[j].name,name))
+ return i;
+ }
+ }
return -1;
}
typedef struct {
double score;
- double bitscore;
int i;
int j;
-} SeqPairScore_T;
+}
+SeqPairScore_T;
-/* SORT IN ASCENDING ORDER BY score, THEN DESCENDING ORDER by bitscore */
-int seqpair_score_qsort_cmp(const void *void_a,const void *void_b)
+/* SORT IN DESCENDING ORDER BY score (SO HIGH SIMILARITY SCORES MERGE FIRST). */
+/* FOR TIES, USE ITERATIVE MERGE ORDER (1-2, then 1-3, then 1-4, etc.) */
+int seqpair_score_qsort_cmp (const void *void_a, const void *void_b)
{
- const SeqPairScore_T *a=(const SeqPairScore_T *)void_a,
- *b=(const SeqPairScore_T *)void_b;
- if (a->score < b->score)
+ const SeqPairScore_T *a = (const SeqPairScore_T *)void_a;
+ const SeqPairScore_T *b = (const SeqPairScore_T *)void_b;
+
+ if (a->score > b->score)
return -1;
- else if (a->score == b->score) {
- if (a->bitscore > b->bitscore)
- return -1;
- else if (a->bitscore == b->bitscore)
- return 0;
- }
- else
+ else if (a->score < b->score)
return 1;
+
+ if (a->i > b->i)
+ return 1;
+ else if (a->i < b->i)
+ return -1;
+
+ if (a->j > b->j)
+ return 1;
+ else if (a->j < b->j)
+ return -1;
+
+ return 0;
}
-
-
-SeqPairScore_T *read_seqpair_scorefile(int nseq,LPOSequence_T seq[],
- FILE *ifile,int *p_nscore)
+SeqPairScore_T *read_seqpair_scorefile (int nseq, LPOSequence_T **seq,
+ ResidueScoreMatrix_T *score_matrix,
+ LPOScore_T (*scoring_function)
+ (int,int,LPOLetter_T [],LPOLetter_T [],
+ ResidueScoreMatrix_T *),
+ int use_global_alignment,
+ int do_progressive, FILE *ifile, int *p_nscore)
{
- int i,j,nscore=0;
- SeqPairScore_T *score=NULL;
- double v,x;
+ int i,j,nscore=0,max_nscore=0;
+ int *adj_score = NULL;
+ SeqPairScore_T *score_list=NULL;
+ LPOLetterRef_T *al1=NULL,*al2=NULL;
+ double x, min_score=0.0;
char name1[256],name2[256];
-
- CALLOC(score,nseq*nseq/2,SeqPairScore_T);
- while (fscanf(ifile," %s %s %lf %lf",name1,name2,
- &v,&x)==4) { /*READ SCORE FILE*/
- i=find_seq_name(nseq,seq,name1);
- j=find_seq_name(nseq,seq,name2);
- if (i<0 || j<0) {
- WARN_MSG(USERR,(ERRTXT,"invalid sequence pair, not found: %s,%s",name1,name2),"$Revision: 1.2.2.1 $");
- FREE(score);
- return NULL;
+
+ CALLOC (adj_score, nseq, int);
+
+ max_nscore = nseq*nseq;
+ CALLOC (score_list, max_nscore, SeqPairScore_T);
+
+ if (ifile) { /* IF PAIR SCORE FILE (PROGRESSIVE ASSUMED) */
+ while (fscanf(ifile," %s %s %lf",name1,name2,&x)==3) { /* READ SCORE FILE */
+ i=find_seq_name(nseq,seq,name1);
+ j=find_seq_name(nseq,seq,name2);
+ if (i<0 || j<0) {
+ WARN_MSG(USERR,(ERRTXT,"invalid sequence pair, not found: %s,%s",name1,name2),"$Revision: 1.2.2.9 $");
+ FREE (score_list);
+ FREE (adj_score);
+ return NULL;
+ }
+
+ /* fprintf(stderr,"i=%d,j=%d,x=%.2f\n",i,j,x); */
+ fprintf(stderr,"Saving score from file %d (%s), %d (%s) : %.2f\n",i,name1,j,name2,x);
+ if (i<j) { int swap=i;i=j;j=swap; } /* DON'T SAVE UPPER (DUPLICATE?) HALF OF THE MATRIX */
+ score_list[nscore].i = i;
+ score_list[nscore].j = j;
+ score_list[nscore].score = x;
+ if (x<min_score) min_score = x;
+ nscore++;
+ if (nscore==max_nscore) {
+ max_nscore *= 2;
+ REALLOC (score_list, max_nscore, SeqPairScore_T);
+ }
+ if (j==i-1) {
+ adj_score[i]=1;
+ }
}
-
- /* fprintf(stderr,"i=%d,j=%d,x=%e\n",i,j,x);*/
- if (i<j) { /* DON'T SAVE UPPER, DUPLICATE HALF OF THE MATRIX */
- /* fprintf(stderr,"Saving score %s,%s:%e\n",name1,name2,x);*/
- score[nscore].score=x; /* SAVE THE SCORE INTO THE MATRIX */
- score[nscore].bitscore=v;
- score[nscore].i=i;
- score[nscore].j=j;
+ }
+ else if (do_progressive) { /* IF PROGRESSIVE BUT NO PAIR SCORE FILE */
+ for (i=0;i<nseq;i++) for (j=0;j<i;j++) { /* SCORE IS BASED ON LOCAL ALIGNMENT */
+ x = align_lpo_po (seq[i],seq[j],
+ score_matrix,&al1,&al2,scoring_function,use_global_alignment);
+ FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */
+ FREE(al2);
+ fprintf(stderr,"Saving alignment score %d (%s), %d (%s) : %.2f\n",i,seq[i]->name,j,seq[j]->name,x);
+ score_list[nscore].i = i;
+ score_list[nscore].j = j;
+ score_list[nscore].score = x;
+ if (x<min_score) min_score = x;
nscore++;
+ if (nscore==max_nscore) {
+ max_nscore *= 2;
+ REALLOC (score_list, max_nscore, SeqPairScore_T);
+ }
+ if (j==i-1) {
+ adj_score[i]=1;
+ }
}
}
-
- /* NOW SORT THESE IN ASCENDING ORDER AND HAND BACK TO CALLER */
- qsort(score,nscore,sizeof(SeqPairScore_T),seqpair_score_qsort_cmp);
+ else { /* NOT DOING PROGRESSIVE ALIGNMENT */
+ /* USE DEFAULT (=0.0) PAIRSCORES, ENSURING ITERATIVE ALIGNMENT: */
+ fprintf(stderr,"Performing iterative alignment...\n");
+ }
+
+ for (i=1;i<nseq;i++) {
+ if (0 == adj_score[i]) {
+ score_list[nscore].i = i;
+ score_list[nscore].j = i-1;
+ score_list[nscore].score = min_score - 1.0;
+ nscore++;
+ if (nscore==max_nscore) {
+ max_nscore *= 2;
+ REALLOC (score_list, max_nscore, SeqPairScore_T);
+ }
+ }
+ }
+ FREE (adj_score);
+
+ /* NOW SORT LIST IN DESCENDING ORDER AND HAND BACK TO CALLER: */
+ qsort(score_list,nscore,sizeof(SeqPairScore_T),seqpair_score_qsort_cmp);
if (p_nscore) /* RETURN LENGTH OF PAIR SCORE TABLE IF REQUESTED */
*p_nscore=nscore;
- return score;
+ return score_list;
}
-LPOSequence_T *buildup_progressive_lpo(int nseq,LPOSequence_T seq[],
+LPOSequence_T *buildup_progressive_lpo(int nseq,LPOSequence_T **all_seqs,
ResidueScoreMatrix_T *score_matrix,
int use_aggressive_fusion,
- char score_file[],
+ int do_progressive,
+ char score_file[],
LPOScore_T (*scoring_function)
(int,int,LPOLetter_T [],LPOLetter_T [],
ResidueScoreMatrix_T *),
- int use_global_alignment)
+ int use_global_alignment,
+ int preserve_sequence_order)
{
- int i,j,max_alloc=0,total_alloc;
- LPOLetterRef_T *al1=NULL,*al2=NULL;
+ int i,j,k,max_alloc=0,total_alloc,min_counts=0;
SeqPairScore_T *score=NULL;
- FILE *ifile;
LPOSequence_T *new_seq=NULL;
+ FILE *ifile=NULL;
int *seq_cluster=NULL,cluster_i,cluster_j,nscore=0,iscore;
+ int *initial_nseq, *cluster_size, *seq_id_in_cluster;
+ int nseq_tot;
- ifile=fopen(score_file,"r");
- if (ifile) {
- if ((score=read_seqpair_scorefile(nseq,seq,ifile,&nscore))==NULL)
+
+ /* INITIALIZE ALL UNINITIALIZED SEQS: */
+ for (i=0;i<nseq;i++) {
+ if (all_seqs[i]->letter == NULL) {
+ initialize_seqs_as_lpo(1,all_seqs[i],score_matrix);
+ }
+ lpo_index_symbols(all_seqs[i],score_matrix); /* MAKE SURE LPO IS TRANSLATED */
+ }
+
+ /* RETURN IF NOTHING TO ALIGN */
+ if (nseq<=0)
+ return NULL;
+ else if (nseq==1)
+ return all_seqs[0];
+
+
+ new_seq = all_seqs[0];
+
+ CALLOC(seq_cluster,nseq,int); /* MAPS SEQS (or CLUSTERS) TO CLUSTER THEY'RE IN */
+ CALLOC(seq_id_in_cluster,nseq,int); /* INDEXES SEQS (or CLUSTERS) WITHIN EACH CLUSTER */
+ CALLOC(cluster_size,nseq,int); /* COUNTS SEQS IN SAME CLUSTER (updated w/ merges) */
+ CALLOC(initial_nseq,nseq,int); /* COUNTS SEQS INITIALLY IN EACH CLUSTER (not updated w/ merges) */
+
+ for (i=nseq_tot=0;i<nseq;i++) {
+ seq_cluster[i] = i; /* CREATE TRIVIAL MAPPING, EACH SEQ ITS OWN CLUSTER */
+ seq_id_in_cluster[i] = 0;
+ cluster_size[i] = all_seqs[i]->nsource_seq;
+ initial_nseq[i] = cluster_size[i];
+ nseq_tot += cluster_size[i];
+ }
+
+ if (score_file) {
+ ifile=fopen(score_file,"r");
+ if (ifile==NULL) {
+ WARN_MSG(USERR,(ERRTXT,"Error reading pair score file %s.\nExiting",
+ score_file),"$Revision: 1.2.2.9 $");
goto free_and_exit;
- fclose(ifile);
+ }
}
- else
+ else {
+ ifile = NULL;
+ }
+
+
+ score = read_seqpair_scorefile(nseq,all_seqs,score_matrix,scoring_function,use_global_alignment,
+ do_progressive,ifile,&nscore);
+ if (score==NULL) {
+ WARN_MSG(USERR,(ERRTXT,"Error generating pair scores (file %s).\nExiting",
+ score_file ? score_file : "unspecified"),"$Revision: 1.2.2.9 $");
goto free_and_exit;
-
- CALLOC(seq_cluster,nseq,int); /* MAPS SEQS TO CLUSTER THEY'RE IN */
- LOOP (i,nseq) /* CREATE TRIVIAL MAPPING, EACH SEQ ITS OWN CLUSTER */
- seq_cluster[i]=i;
-
+ }
+ if (ifile)
+ fclose (ifile);
+
for (iscore=0;iscore<nscore;iscore++) {
+
+ /* NB: NEW CLUSTER ID WILL BE MINIMUM OF INPUT IDs,
+ SO MASTER CLUSTER WILL ALWAYS BE CLUSTER 0. */
if (seq_cluster[score[iscore].i] < seq_cluster[score[iscore].j]) {
cluster_i=seq_cluster[score[iscore].i];
cluster_j=seq_cluster[score[iscore].j];
@@ -321,56 +493,98 @@
else /* CLUSTERS ALREADY FUSED, SO SKIP THIS PAIR */
continue;
- fprintf(stderr,"Fusing cluster %s --> %s... score %e,%lf\n",
- seq[cluster_j].name,seq[cluster_i].name,score[iscore].score,
- score[iscore].bitscore);
- new_seq=seq+cluster_i; /* THIS WILL BECOME THE NEW MASTER CLUSTER */
- if (seq[cluster_i].letter == NULL) /* NOT INITIALIZED AT ALL YET */
- initialize_seqs_as_lpo(1,seq+cluster_i,score_matrix);
- if (seq[cluster_j].letter == NULL) /* NOT INITIALIZED AT ALL YET */
- initialize_seqs_as_lpo(1,seq+cluster_j,score_matrix);
- total_alloc=new_seq->length*seq[cluster_j].length
- + sizeof(LPOLetter_T)*new_seq->length;
+ fprintf(stderr,"Fusing cluster %d (%s, nseq=%d) --> %d (%s, nseq=%d)... score %.2f\n",
+ cluster_j,all_seqs[cluster_j]->name,all_seqs[cluster_j]->nsource_seq,
+ cluster_i,all_seqs[cluster_i]->name,all_seqs[cluster_i]->nsource_seq,
+ score[iscore].score);
+
+ new_seq = all_seqs[cluster_i];
+ total_alloc = new_seq->length * (sizeof(LPOLetter_T) + all_seqs[cluster_j]->length);
if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */
max_alloc=total_alloc;
#ifdef REPORT_MAX_ALLOC
fprintf(stderr,"max_alloc: %d bytes\n",max_alloc);
#endif
if (max_alloc>POA_MAX_ALLOC) {
- WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.1 $");
+ WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.9 $");
break; /* JUST RETURN AND FINISH */
}
}
-
+
#ifdef USE_LOCAL_NEUTRALITY_CORRECTION /* NO LONGER USED */
if (score_matrix->nfreq>0) { /* CALCULATE BALANCED SCORING ON EACH PO */
balance_matrix_score(new_seq->length,new_seq->letter,score_matrix);
- balance_matrix_score(seq[cluster_j].length,seq[cluster_j].letter,
+ balance_matrix_score(all_seqs[cluster_j]->length,all_seqs[cluster_j]->letter,
score_matrix);
}
#endif
- align_lpo_po(new_seq, &seq[cluster_j],
- score_matrix,&al1,&al2,
- scoring_function,use_global_alignment); /* ALIGN ONE MORE SEQ */
- if (use_aggressive_fusion)
- fuse_ring_identities(new_seq->length,new_seq->letter,
- seq[cluster_j].length,seq[cluster_j].letter,
- al1,al2);
- fuse_lpo(new_seq,seq+cluster_j,al1,al2); /* BUILD COMPOSITE LPO */
+ buildup_pairwise_lpo(new_seq,all_seqs[cluster_j],score_matrix,
+ use_aggressive_fusion,
+ scoring_function,use_global_alignment);
- free_lpo_letters(seq[cluster_j].length,seq[cluster_j].letter,TRUE);
- seq[cluster_j].letter=NULL; /*MARK AS FREED. DON'T LEAVE DANGLING POINTER*/
- FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */
- FREE(al2);
+ LOOP (i,nseq) { /* APPEND ALL MEMBERS OF cluster_j TO cluster_i */
+ if (seq_cluster[i] == cluster_j) {
+ seq_cluster[i] = cluster_i;
+ seq_id_in_cluster[i] += cluster_size[cluster_i];
+ }
+ }
+ cluster_size[cluster_i] += cluster_size[cluster_j];
+ cluster_size[cluster_j] = 0;
+ }
+
+ if (preserve_sequence_order) { /* PUT SEQUENCES WITHIN LPO BACK IN THEIR ORIGINAL ORDER: */
+ int *perm;
+ CALLOC (perm, nseq_tot, int);
- LOOP (i,nseq) /* REINDEX ALL MEMBERS OF cluster_j TO JOIN cluster_i */
- if (seq_cluster[i]==cluster_j)
- seq_cluster[i]=cluster_i;
+ for (i=nseq_tot=0; i<nseq; i++) {
+ for (j=0; j<initial_nseq[i]; j++) {
+ perm[seq_id_in_cluster[i] + j] = (nseq_tot++);
+ }
+ }
+ for (i=0; i<nseq_tot; i++) printf ("%d ", perm[i]); printf ("\n");
+
+ reindex_lpo_source_seqs (new_seq, perm);
+ FREE (perm);
}
+
+ free_and_exit:
+ FREE (initial_nseq);
+ FREE (seq_cluster);
+ FREE (cluster_size);
+ FREE (seq_id_in_cluster);
+ FREE (score);
+
+ return new_seq; /* RETURN THE FINAL MASTER CLUSTER... */
+}
- free_and_exit:
- FREE(score);
- FREE(seq_cluster);
- return new_seq; /* RETURN THE FINAL MASTER CLUSTER */
+
+LPOSequence_T *buildup_pairwise_lpo(LPOSequence_T seq1[],LPOSequence_T seq2[],
+ ResidueScoreMatrix_T *score_matrix,
+ int use_aggressive_fusion,
+ LPOScore_T (*scoring_function)
+ (int,int,LPOLetter_T [],LPOLetter_T [],
+ ResidueScoreMatrix_T *),
+ int use_global_alignment)
+{
+ int min_counts1=0;
+ int min_counts2=0;
+ LPOLetterRef_T *al1=NULL,*al2=NULL;
+
+ lpo_index_symbols(seq1,score_matrix); /* MAKE SURE LPO IS TRANSLATED */
+ lpo_index_symbols(seq2,score_matrix); /* MAKE SURE LPO IS TRANSLATED */
+ align_lpo_po (seq1, seq2, score_matrix, &al1, &al2,
+ scoring_function, use_global_alignment); /* ALIGN TWO POS */
+ if (use_aggressive_fusion)
+ fuse_ring_identities(seq1->length,seq1->letter,
+ seq2->length,seq2->letter,al1,al2);
+ fuse_lpo(seq1,seq2,al1,al2); /* BUILD COMPOSITE LPO */
+
+ /* FREE LETTERS IN SECOND LPO */
+ free_lpo_letters(seq2->length,seq2->letter,TRUE);
+ seq2->letter=NULL; /*MARK AS FREED. DON'T LEAVE DANGLING POINTER*/
+ FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */
+ FREE(al2);
+ return seq1; /* RETURN THE FINAL LPO */
}
+
Modified: trunk/packages/poa/branches/upstream/current/lpo.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/lpo.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/lpo.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -258,8 +258,53 @@
}
+void reindex_lpo_source_seqs (LPOSequence_T *seq, int *perm)
+{
+ int i, j, len, nseq, *map, *invmap;
+ LPOSourceInfo_T tmp;
+ LPOLetterSource_T *src;
+
+ len = seq->length;
+ nseq = seq->nsource_seq;
+
+ CALLOC (map, len, int);
+ CALLOC (invmap, len, int);
+
+ /* BUILD INITIAL MAP AND INVERSE MAP: */
+ for (i=0; i<nseq; i++) {
+ map[i] = invmap[i] = -1;
+ }
+ for (i=0; i<nseq; i++) {
+ map[i] = perm[i];
+ IF_GUARD (map[i]>=nseq || map[i]<0 || invmap[map[i]]!=-1, 1.1, (ERRTXT,"Bad argument! 'perm' must be a permutation of [0,%d]\n",nseq-1), CRASH);
+ invmap[map[i]] = i;
+ }
+
+ /* RENUMBER SEQS IN 'source_info' ENTRIES FOR ALL LETTERS: */
+ for (i=0; i<len; i++) {
+ for (src=&(seq->letter[i].source); src!=NULL && src->ipos>=0; src=src->more) {
+ src->iseq = map[src->iseq];
+ }
+ }
+ /* SHUFFLE 'source_seq' ENTRIES, IN-PLACE, TO NEW ORDER: */
+ /* (THIS DESTROYS map AND invmap.) */
+ for (i=0; i<nseq; i++) if ((j = invmap[i]) != i) {
+ /* SWAP POSITIONS i AND j: */
+ tmp = seq->source_seq[i];
+ seq->source_seq[i] = seq->source_seq[j];
+ seq->source_seq[j] = tmp;
+ /* UPDATE map AND invmap: */
+ map[j] = map[i];
+ invmap[map[i]] = j;
+ map[i] = invmap[i] = i;
+ }
+
+ FREE (map);
+ FREE (invmap);
+}
+
void copy_lpo_letter(LPOLetter_T *new,LPOLetter_T *old,
LPOLetterRef_T old_to_new[],
int iseq_new[])
Modified: trunk/packages/poa/branches/upstream/current/lpo.h
===================================================================
--- trunk/packages/poa/branches/upstream/current/lpo.h 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/lpo.h 2006-09-26 04:34:02 UTC (rev 127)
@@ -13,6 +13,11 @@
void initialize_seqs_as_lpo(int nseq, Sequence_T seq[],ResidueScoreMatrix_T *m);
void lpo_index_symbols(LPOSequence_T *lpo,ResidueScoreMatrix_T *m);
+/** reindex source sequences in `seq' so that the sequence in position i
+ ends up in position perm[i]. `perm' must be a permutation of the
+ integers [0,seq->nsource_seq-1]. */
+void reindex_lpo_source_seqs (LPOSequence_T *seq, int *perm);
+
int save_lpo_source(LPOSequence_T *seq,
char name[],
char title[],
@@ -90,15 +95,25 @@
ResidueScoreMatrix_T *score_matrix,
int use_global_alignment);
-LPOSequence_T *buildup_progressive_lpo(int nseq,LPOSequence_T seq[],
+LPOSequence_T *buildup_progressive_lpo(int nseq, LPOSequence_T **seqs,
ResidueScoreMatrix_T *score_matrix,
int use_aggressive_fusion,
- char score_file[],
+ int do_progressive,
+ char score_file[],
LPOScore_T (*scoring_function)
(int,int,LPOLetter_T [],LPOLetter_T [],
ResidueScoreMatrix_T *),
- int use_global_alignment);
+ int use_global_alignment,
+ int preserve_sequence_order);
+LPOSequence_T *buildup_pairwise_lpo(LPOSequence_T seq1[],LPOSequence_T seq2[],
+ ResidueScoreMatrix_T *score_matrix,
+ int use_aggressive_fusion,
+ LPOScore_T (*scoring_function)
+ (int,int,LPOLetter_T [],LPOLetter_T [],
+ ResidueScoreMatrix_T *),
+ int use_global_alignment);
+
/**************************************************** lpo_format.c */
void write_lpo(FILE *ifile,LPOSequence_T *seq,
ResidueScoreMatrix_T *score_matrix);
Modified: trunk/packages/poa/branches/upstream/current/lpo_format.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/lpo_format.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/lpo_format.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -54,13 +54,15 @@
int i,j,length,nsource_seq,istart,field_id,*pos_count=NULL,value;
int weight,bundle_id,last_alloc=0;
LPOSequence_T *seq=NULL;
- char c,name[1024],title[4096],version[256];
+ char c,name[1024]="",title[4096]="",version[256]="";
LPOLetterSource_T save_source={0,0,NULL};
CALLOC(seq,1,LPOSequence_T);
fscanf(ifile,"VERSION=%s",version);
- if (fscanf(ifile," NAME=%[^\n] TITLE=%[^\n] LENGTH=%d SOURCECOUNT=%d",
- name,title,&length,&nsource_seq)!=4)
+ fscanf(ifile," NAME=%[^\n]",name);
+ fscanf(ifile," TITLE=%[^\n]",title);
+ if (fscanf(ifile," LENGTH=%d SOURCECOUNT=%d",
+ &length,&nsource_seq)!=2)
return NULL;
STRNCPY(seq->name,name,SEQUENCE_NAME_MAX);
seq->title=strdup(title);
@@ -146,7 +148,7 @@
int nlink,*link_list=NULL,*match_pos=NULL,*ring_old=NULL;
int *pos_compact=NULL,npos_compact=0,keep_this_letter,retention_mode;
LPOSequence_T *seq=NULL;
- char c,name[1024],title[4096],version[256];
+ char c,name[1024]="",title[4096]="",version[256]="";
LPOLetterSource_T save_source={0,0,NULL},*source=NULL;
if (remove_listed_sequences)
@@ -156,8 +158,10 @@
CALLOC(seq,1,LPOSequence_T);
fscanf(ifile,"VERSION=%s",version);
- if (fscanf(ifile," NAME=%[^\n] TITLE=%[^\n] LENGTH=%d SOURCECOUNT=%d",
- name,title,&length,&nsource_seq)!=4)
+ fscanf(ifile," NAME=%[^\n]",name);
+ fscanf(ifile," TITLE=%[^\n]",title);
+ if (fscanf(ifile," LENGTH=%d SOURCECOUNT=%d",
+ &length,&nsource_seq)!=2)
return NULL;
STRNCPY(seq->name,name,SEQUENCE_NAME_MAX);
seq->title=strdup(title);
@@ -174,9 +178,13 @@
}
LOOPF(i,nsource_seq) { /* SAVE SOURCE INFO LIST */
- if (fscanf(ifile," SOURCENAME=%[^\n] SOURCEINFO=%d %d %d %d %[^\n]",
- name,&length,&istart,&weight,&bundle_id,title)!=6)
+ if (fscanf(ifile," SOURCENAME=%[^\n] SOURCEINFO=%d %d %d %d",
+ name,&length,&istart,&weight,&bundle_id)!=5)
return NULL;
+ /* SKIP WHITESPACE BEFORE TITLE; ALLOW EMPTY TITLE. */
+ fscanf(ifile,"%*[ \t]");
+ if (fscanf(ifile,"%[^\n]",title)!=1)
+ title[0] = '\0';
STRNCPY(seq->source_seq[i].name,name,SEQUENCE_NAME_MAX);
seq->source_seq[i].length=length;
seq->source_seq[i].istart=istart;
@@ -376,10 +384,6 @@
}
if (p_seq_pos)
*p_seq_pos = seq_pos;
- if (p_seq_pos)
- *p_seq_pos = seq_pos;
- if (p_seq_pos)
- *p_seq_pos = seq_pos;
return nring;
}
Modified: trunk/packages/poa/branches/upstream/current/main.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/main.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/main.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -1,76 +1,80 @@
-#include <lpo.h>
+#include "lpo.h"
+#include "msa_format.h"
+#include "align_score.h"
-extern LPOScore_T caties_scoring_function(int i,
- int j,
- LPOLetter_T seq_x[],
- LPOLetter_T seq_y[],
- ResidueScoreMatrix_T *m)
- ; /* INCLUDE THIS HERE SO WE CAN PASS IT TO buildup_progressive_lpo() */
+static LPOSequence_T *read_partial_order_file (char *po_filename, char *subset_filename, int remove_listed_seqs, int keep_all_links, int do_switch_case, ResidueScoreMatrix_T *mat);
-
int main(int argc,char *argv[])
{
- int i,ibundle=ALL_BUNDLES,nframe_seq=0,use_reverse_complement=0;
+ int i,j,ibundle=ALL_BUNDLES,nframe_seq=0,use_reverse_complement=0;
int nseq=0,do_switch_case=dont_switch_case,do_analyze_bundles=0;
- char score_file[256],seq_file[256],*comment=NULL,*al_name="test align";
+ int nseq_in_list=0,n_input_seqs=0,max_input_seqs=0;
+ char score_file[256],seq_file[256],po_list_entry_filename[256],*comment=NULL,*al_name="test align";
ResidueScoreMatrix_T score_matrix; /* DEFAULT GAP PENALTIES*/
- Sequence_T *seq=NULL,*lpo_out=NULL,*frame_seq=NULL,*dna_lpo=NULL,
- *lpo_in=NULL;
- FILE *seq_ifile=NULL,*errfile=stderr,*logfile=NULL,*lpo_file_out=NULL,
- *subset_ifile=NULL;
+ LPOSequence_T *seq=NULL,*lpo_out=NULL,*frame_seq=NULL,*dna_lpo=NULL,*lpo_in=NULL;
+ LPOSequence_T **input_seqs=NULL;
+ FILE *errfile=stderr,*logfile=NULL,*lpo_file_out=NULL,*po_list_file=NULL,*seq_ifile=NULL;
char *print_matrix_letters=NULL,*fasta_out=NULL,*po_out=NULL,*matrix_filename=NULL,
- *seq_filename=NULL,*frame_dna_filename=NULL,*po_filename=NULL,
- *hbmin=NULL,*numeric_data=NULL,*numeric_data_name="Nmiscall",*dna_to_aa=NULL,
- *pair_score_file=NULL,*aafreq_file=NULL,*termval_file=NULL,
- *bold_seq_name=NULL,*subset_file=NULL,*rm_subset_file=NULL;
+ *seq_filename=NULL,*frame_dna_filename=NULL,*po_filename=NULL,*po2_filename=NULL,
+ *po_list_filename=NULL, *hbmin=NULL,*numeric_data=NULL,*numeric_data_name="Nmiscall",
+ *dna_to_aa=NULL,*pair_score_file=NULL,*aafreq_file=NULL,*termval_file=NULL,
+ *bold_seq_name=NULL,*subset_file=NULL,*subset2_file=NULL,*rm_subset_file=NULL,
+ *rm_subset2_file=NULL;
float bundling_threshold=0.9;
int exit_code=0,count_sequence_errors=0,please_print_snps=0,
report_consensus_seqs=0,report_major_allele=0,use_aggressive_fusion=0;
int show_allele_evidence=0,please_collapse_lines=0,keep_all_links=0;
- int remove_listed_seqs=0,please_report_similarity;
+ int remove_listed_seqs=0,remove_listed_seqs2=0,please_report_similarity;
+ int do_global=0, do_progressive=0, do_preserve_sequence_order=0;
char *reference_seq_name="CONSENS%d",*clustal_out=NULL;
- int use_global_alignment=0;
-
black_flag_init(argv[0],PROGRAM_VERSION);
if (argc<2) {
- fprintf(stderr,"\nUsage: %s [OPTIONS] matrixfile\n\n"
-" OPTIONS\t\t\tFUNCTION\n"
-" -------\t\t\t________\n"
+ fprintf(stderr,"\nUsage: %s [OPTIONS] MATRIXFILE\n"
+"Align a set of sequences or alignments using the scores in MATRIXFILE.\n"
+"Example: %s -read_fasta multidom.seq -clustal m.aln blosum80.mat\n\n"
"INPUT:\n"
-" -read_fasta FILENAME\t\tReads in FASTA sequence file.\n"
-" -tolower\t\t\tForces FASTA sequences to lowercase\n"
-"\t\t\t\t(nucleotides in our matrix files)\n"
-" -toupper\t\t\tForces FASTA sequences to UPPERCASE\n"
-"\t\t\t\t(amino acids in our matrix files)\n"
-" -read_po FILENAME\t\tReads in PO file.\n"
-" -subset FILENAME\t\tFilters PO-MSA to include list of seqs in file.\n"
-" -remove FILENAME\t\tFilters PO-MSA to exclude list of seqs in file.\n"
-"\n"
+" -read_fasta FILE Read in FASTA sequence file.\n"
+" -read_msa FILE Read in MSA alignment file.\n"
+" -read_msa2 FILE Read in second MSA file. \n"
+" -subset FILE Filter MSA to include list of seqs in file.\n"
+" -subset2 FILE Filter second MSA to include list of seqs in file.\n"
+" -remove FILE Filter MSA to exclude list of seqs in file.\n"
+" -remove2 FILE Filter second MSA to exclude list of seqs in file.\n"
+" -read_msa_list FILE Read an MSA from each filename listed in file.\n"
+" -tolower Force FASTA/MSA sequences to lowercase\n"
+" (nucleotides in our matrix files)\n"
+" -toupper Force FASTA/MSA sequences to UPPERCASE\n"
+" (amino acids in our matrix files)\n"
"\nALIGNMENT:\n"
-" -pair_score FILENAME\t\tReads tab-delimited file of sequence-sequence\n"
-"\t\t\t\tsimilarity scores for constructing a guide-tree\n"
-"\t\t\t\tand performing progressive alignment using\n"
-"\t\t\t\tPO-PO alignment steps.\n"
-" -fuse_all\t\t\tFuses identical letters on align rings.\n"
+" -do_global Do global alignment.\n"
+" -do_progressive Perform progressive alignment using a guide tree\n"
+" built by neighbor joining from a set of\n"
+" sequence-sequence similarity scores.\n"
+" -read_pairscores FILE Read tab-delimited file of similarity scores.\n"
+" (If not provided, scores are constructed\n"
+" using pairwise sequence alignment.)\n"
+" -fuse_all Fuse identical letters on align rings.\n"
"\nANALYSIS:\n"
-" -hb\t\t\t\tPerforms heaviest bundling to generate consensi.\n"
-" -hbmin VALUE\t\t\tBundles into heaviest bundle seqs with percent id >= value.\n"
+" -hb Perform heaviest bundling to generate consensi.\n"
+" -hbmin VALUE Include in heaviest bundle sequences with\n"
+" percent ID (as a fraction) >= value.\n"
"\nOUTPUT:\n"
-" -best\t\t\t\tRestricts MSA output to heaviest bundles.\n"
-" -pir FILENAME\t\t\tWrites out MSA in PIR format.\n"
-" -clustal FILENAME\t\tWrites out MSA in CLUSTAL format.\n"
-" -po FILENAME\t\t\tWrites out MSA in PO format.\n"
-" -printmatrix LETTERSET\tPrints score matrix to stdout.\n"
-" -v\t\t\t\tRuns in verbose mode.\n\n"
-" Note: Either the -read_fasta or -read_po argument must be used with poa,\n"
-" since a FASTA file or PO file must be read in by poa.\n\n"
-"For more information, see http://www.bioinformatics.ucla.edu/poa\n\n"
- ,argv[0]);
+" -pir FILE Write out MSA in PIR format.\n"
+" -clustal FILE Write out MSA in CLUSTAL format.\n"
+" -po FILE Write out MSA in PO format.\n"
+" -preserve_seqorder Write out MSA with sequences in their input order.\n"
+" -printmatrix LETTERS Print score matrix to stdout.\n"
+" -best Restrict MSA output to heaviest bundles (PIR only).\n"
+" -v Run in verbose mode (e.g. output gap penalties).\n\n"
+" NOTE: One of the -read_fasta, -read_msa, or -read_msa_list arguments\n"
+" must be used, since a sequence or alignment file is required.\n\n"
+"For more information, see http://www.bioinformatics.ucla.edu/poa.\n\n"
+ ,argv[0],argv[0]);
exit(-1);
}
@@ -81,160 +85,237 @@
ARGMATCH_VAL("-best",ibundle,0); /*RESTRICT FASTA OUTPUT TO HB */
ARGMATCH_VAL("-hb",do_analyze_bundles,1);/*CALCULATE HEAVIEST BUNDLING*/
ARGGET("-printmatrix",print_matrix_letters);
- ARGGET("-read_po",po_filename); /* READ A PO FILE FOR ALIGNMENT/ANALYSIS*/
+ ARGGET("-read_msa",po_filename); /* READ A MSA FILE FOR ALIGNMENT/ANALYSIS*/
+ ARGGET("-read_msa2",po2_filename); /* READ A SECOND MSA FILE FOR ALIGNMENT/ANALYSIS*/
+ ARGGET("-read_msa_list",po_list_filename); /* READ A LIST OF MSAs FOR ALIGNMENT/ANALYSIS */
ARGGET("-pir",fasta_out); /* SAVE FASTA-PIR FORMAT ALIGNMENT FILE */
ARGGET("-clustal",clustal_out); /* SAVE CLUSTAL FORMAT ALIGNMENT FILE */
ARGGET("-po",po_out); /* SAVE PO FORMAT ALIGNMENT FILE */
+ ARGMATCH("-preserve_seqorder",do_preserve_sequence_order); /* DO PRESERVE SEQUENCE ORDER */
ARGGET("-hbmin",hbmin); /* SET THRESHOLD FOR BUNDLING */
ARGMATCH("-fuse_all",use_aggressive_fusion);
- ARGMATCH("-global",use_global_alignment);
- ARGGET("-pair_score",pair_score_file); /* FILENAME TO READ PAIR SCORES*/
+ ARGMATCH("-do_global",do_global); /* DO GLOBAL */
+ ARGGET("-read_pairscores",pair_score_file); /* FILENAME TO READ PAIR SCORES*/
+ ARGMATCH("-do_progressive", do_progressive); /* DO PROGRESSIVE ALIGNMENT */
ARGGET("-subset",subset_file); /* FILENAME TO READ SEQ SUBSET LIST*/
+ ARGGET("-subset2",subset2_file); /* FILENAME TO READ SEQ SUBSET LIST*/
ARGGET("-remove",rm_subset_file); /* FILENAME TO READ SEQ REMOVAL LIST*/
+ ARGGET("-remove2",rm_subset2_file); /* FILENAME TO READ SEQ REMOVAL LIST*/
ARGGET("-read_fasta",seq_filename); /* READ FASTA FILE FOR ALIGNMENT */
- NEXTARG(matrix_filename); /* NON-FLAG ARG SHOULD BE MATRIX FILE */
-
+ NEXTARG(matrix_filename); /* NON-FLAG ARG SHOULD BE MATRIX FILE */
}
- if (rm_subset_file) { /* TREAT SUBSET FILE AS LIST OF SEQS TO REMOVE */
- subset_file=rm_subset_file;
- remove_listed_seqs=1;
+
+ /** CHECK FOR CONFLICTING FLAGS **/
+
+ if (po_list_filename && (po_filename || po2_filename)) {
+ WARN_MSG(USERR,(ERRTXT, "Error: The -read_po_list and -read_po flags cannot be used at the same time.\nExiting."), "$Revision: 1.2.2.9 $");
+ exit_code = 1;
+ goto free_memory_and_exit;
}
-
+
+ if (((subset_file || rm_subset_file) && !po_filename) || ((subset2_file || rm_subset2_file) && !po2_filename)) {
+ WARN_MSG(USERR,(ERRTXT, "Error: Each -subset/-remove flag must have a corresponding -read_po flag.\nExiting."),"$Revision: 1.2.2.9 $");
+ exit_code = 1;
+ goto free_memory_and_exit;
+ }
+
+ if ((subset_file && rm_subset_file) || (subset2_file && rm_subset2_file)) {
+ WARN_MSG(USERR,(ERRTXT, "Error: The -subset and -remove flags cannot be used at the same time.\nExiting."),"$Revision: 1.2.2.9 $");
+ exit_code = 1;
+ goto free_memory_and_exit;
+ }
+
+ if (rm_subset_file) {
+ subset_file = rm_subset_file;
+ remove_listed_seqs = 1;
+ }
+ if (rm_subset2_file) {
+ subset2_file = rm_subset2_file;
+ remove_listed_seqs2 = 1;
+ }
+
if (hbmin)
- bundling_threshold=atof(hbmin);
+ bundling_threshold=atof(hbmin);
if (!matrix_filename ||
read_score_matrix(matrix_filename,&score_matrix)<=0){/* READ MATRIX */
- WARN_MSG(USERR,(ERRTXT,"Error reading matrix file %s. Exiting",
- matrix_filename? matrix_filename: "(null)"),"$Revision: 1.2 $");
+ WARN_MSG(USERR,(ERRTXT,"Error reading matrix file %s.\nExiting",
+ matrix_filename ? matrix_filename: "because none specified"),"$Revision: 1.2.2.9 $");
exit_code=1; /* SIGNAL ERROR CONDITION */
goto free_memory_and_exit;
}
-
+
if (logfile) {
- fprintf(logfile, "X-Gap Penalties: %d %d %d\n",
+ fprintf(logfile,"X-Gap Penalties (Open, Aff1, Aff2; LTrunc, LDecay): %d %d %d %d %d\n",
score_matrix.gap_penalty_set[0][0],
score_matrix.gap_penalty_set[0][1],
- score_matrix.gap_penalty_set[0][2]);
- fprintf(logfile, "Y-Gap Penalties: %d %d %d\n",
+ score_matrix.gap_penalty_set[0][2],
+ score_matrix.trunc_gap_length,
+ score_matrix.decay_gap_length);
+ fprintf(logfile,"X-Gap Penalties (0, 1, 2, ...): ");
+ for (i=0; i<=score_matrix.max_gap_length; i++) {
+ fprintf (logfile, "%d ", score_matrix.gap_penalty_x[i]);
+ }
+ fprintf(logfile,"... \n");
+ fprintf(logfile,"Y-Gap Penalties (Open, Aff1, Aff2; LTrunc, LDecay): %d %d %d %d %d\n",
score_matrix.gap_penalty_set[1][0],
score_matrix.gap_penalty_set[1][1],
- score_matrix.gap_penalty_set[1][2]);
+ score_matrix.gap_penalty_set[1][2],
+ score_matrix.trunc_gap_length,
+ score_matrix.decay_gap_length);
+ fprintf(logfile,"Y-Gap Penalties (0, 1, 2, ...): ");
+ for (i=0; i<=score_matrix.max_gap_length; i++) {
+ fprintf (logfile, "%d ", score_matrix.gap_penalty_y[i]);
+ }
+ fprintf(logfile,"... \n");
}
if (print_matrix_letters) /* USER WANTS US TO PRINT A MATRIX */
print_score_matrix(stdout,&score_matrix,print_matrix_letters
/*"ARNDCQEGHILKMFPSTWYV"*/);
-
- if (po_filename) { /* READ PARTIAL ORDER FILE */
- if (seq_ifile=fopen(po_filename,"r")) {
- if (subset_file) { /* LIST OF SEQS TO FILTER THE PO WITH */
- subset_ifile=fopen(subset_file,"r");
- if (!subset_ifile) {
- WARN_MSG(USERR,(ERRTXT,"Error reading subset file %s. Exiting",
- subset_file),"$Revision: 1.2 $");
- exit_code=1; /* SIGNAL ERROR CONDITION */
- goto free_memory_and_exit;
- }
- lpo_in=read_lpo_select(seq_ifile,subset_ifile,keep_all_links,
- remove_listed_seqs);
- fclose(subset_ifile);
- }
- else
- lpo_in=read_lpo(seq_ifile); /* READ LPO NORMALLY W/O FILTERING */
- fclose(seq_ifile);
+
+
+ /** READ INPUT FILES **/
+
+ n_input_seqs = 0;
+ max_input_seqs = 10;
+ CALLOC (input_seqs, max_input_seqs, LPOSequence_T *);
+
+ if (po_filename) {
+ lpo_in = read_partial_order_file (po_filename, subset_file, remove_listed_seqs, keep_all_links, do_switch_case, &score_matrix);
+ if (lpo_in == NULL) {
+ exit_code = 1;
+ goto free_memory_and_exit;
}
- if (!lpo_in) {
- WARN_MSG(USERR,(ERRTXT,"Error reading PO file %s!!!\nExiting.",
- po_filename),"$Revision: 1.2 $");
- exit_code=1; /* SIGNAL ERROR CONDITION */
+ fprintf(errfile,"...Read %d sequences from MSA file %s...\n",lpo_in->nsource_seq,po_filename);
+ input_seqs[n_input_seqs++] = lpo_in;
+ lpo_in = NULL;
+ }
+
+ if (po2_filename) {
+ lpo_in = read_partial_order_file (po2_filename, subset2_file, remove_listed_seqs2, keep_all_links, do_switch_case, &score_matrix);
+ if (lpo_in == NULL) {
+ exit_code = 1;
goto free_memory_and_exit;
}
+ fprintf(errfile,"...Read %d sequences from second MSA file %s...\n",lpo_in->nsource_seq,po2_filename);
+ input_seqs[n_input_seqs++] = lpo_in;
+ lpo_in = NULL;
}
-
- if (seq_filename) { /* READ SEQUENCES TO ALIGN */
- seq_ifile=fopen(seq_filename,"r"); /* READ SEQUENCE DATABASE */
- if (seq_ifile) {
- nseq=read_fasta(seq_ifile,&seq,do_switch_case,&comment);
- fclose(seq_ifile);
+
+ if (po_list_filename) {
+ po_list_file = fopen (po_list_filename, "r");
+ while (po_list_file && fscanf (po_list_file, " %s", po_list_entry_filename) == 1) {
+ lpo_in = read_partial_order_file (po_list_entry_filename, NULL, 0, 0, do_switch_case, &score_matrix);
+ if (lpo_in == NULL) {
+ exit_code = 1;
+ goto free_memory_and_exit;
+ }
+ fprintf(errfile,"...Read %d sequences from PO list entry %s...\n",lpo_in->nsource_seq,po_list_entry_filename);
+ nseq_in_list += lpo_in->nsource_seq;
+ input_seqs[n_input_seqs++] = lpo_in;
+ lpo_in = NULL;
+ if (n_input_seqs == max_input_seqs) {
+ max_input_seqs *= 2;
+ REALLOC (input_seqs, max_input_seqs, LPOSequence_T *);
+ }
}
+ if (nseq_in_list==0) {
+ WARN_MSG(USERR,(ERRTXT,"Error reading PO list file %s.\nExiting",
+ po_list_file),"$Revision: 1.2.2.9 $");
+ exit_code=1; /* SIGNAL ERROR CONDITION */
+ goto free_memory_and_exit;
+ }
}
- if (nseq>0) { /* INITIALIZE THE SEQUENCES AND RUN THE ALIGNMENT */
- fprintf(errfile,"...Read %d sequences from %s...\n",nseq,seq_filename);
- /*initialize_seqs_as_lpo(nseq,seq,&score_matrix); */
- if (lpo_in) { /* ADD TO OUR EXISTING ALIGNMENT */
- lpo_out=buildup_lpo(lpo_in,nseq,seq,&score_matrix,
- use_aggressive_fusion,
- use_global_alignment);/*BUILD ALIGNMENT*/
- lpo_in=NULL; /* DATA STRUCTURE NOW POINTED TO BY lpo_out,
- SO DON'T FREE IT TWICE!! */
+ if (seq_filename) {
+ seq_ifile = fopen (seq_filename, "r");
+ if (seq_ifile == NULL) {
+ WARN_MSG(USERR,(ERRTXT,"Couldn't open sequence file %s.\nExiting",
+ seq_filename),"$Revision: 1.2.2.9 $");
+ exit_code=1; /* SIGNAL ERROR CONDITION */
+ goto free_memory_and_exit;
}
- else if (pair_score_file) {
- lpo_out=buildup_progressive_lpo(nseq,seq,&score_matrix,
- use_aggressive_fusion,
- pair_score_file,
- caties_scoring_function,
- use_global_alignment);
+ nseq = read_fasta (seq_ifile, &seq, do_switch_case, &comment);
+ fclose (seq_ifile);
+ if (nseq == 0) {
+ WARN_MSG(USERR,(ERRTXT,"Error reading sequence file %s.\nExiting",
+ seq_filename),"$Revision: 1.2.2.9 $");
+ exit_code=1; /* SIGNAL ERROR CONDITION */
+ goto free_memory_and_exit;
}
- else /* OTHERWISE JUST BUILDUP ON TOP OF FIRST READ SEQUENCE */
- lpo_out=buildup_lpo(seq,nseq-1,seq+1,&score_matrix,
- use_aggressive_fusion,
- use_global_alignment);
+ fprintf(errfile,"...Read %d sequences from sequence file %s...\n",nseq,seq_filename);
+ for (i=0; i<nseq; i++) {
+ input_seqs[n_input_seqs++] = &(seq[i]);
+ initialize_seqs_as_lpo(1,&(seq[i]),&score_matrix);
+ if (n_input_seqs == max_input_seqs) {
+ max_input_seqs *= 2;
+ REALLOC (input_seqs, max_input_seqs, LPOSequence_T *);
+ }
+ }
}
- else if (lpo_in) /* FILTERED LPO... CAN PRINT OUT AS LPO */
- lpo_out=lpo_in;
- else { /* HMM... NO DATA TO WORK WITH AT ALL. MUST BE AN ERROR. COMPLAIN!*/
- WARN_MSG(USERR,(ERRTXT,"Error reading sequence file %s and PO file %s. Exiting.",
- seq_filename? seq_filename: "because none specified",
- po_filename? po_filename: "because none specified"),"$Revision: 1.2 $");
+
+
+ /** BUILD AND ANALYZE OUTPUT PO-MSA **/
+
+ if (n_input_seqs == 0) { /* HMM.. NO DATA. */
+ WARN_MSG(USERR,(ERRTXT,"No input sequences were provided; use one of the -read_ flags.\nExiting."), "$Revision: 1.2.2.9 $");
exit_code=1; /* SIGNAL ERROR CONDITION */
goto free_memory_and_exit;
}
+ else {
+ lpo_out = buildup_progressive_lpo (n_input_seqs, input_seqs, &score_matrix,
+ use_aggressive_fusion, do_progressive, pair_score_file,
+ matrix_scoring_function, do_global, do_preserve_sequence_order);
+ }
if (comment) { /* SAVE THE COMMENT LINE AS TITLE OF OUR LPO */
FREE(lpo_out->title);
lpo_out->title=strdup(comment);
}
-
- if (do_analyze_bundles) /* DIVIDE INTO BUNDLES W/ CONSENSUS */
+
+ /* DIVIDE INTO BUNDLES W/ CONSENSUS USING PERCENT ID */
+ if (do_analyze_bundles)
generate_lpo_bundles(lpo_out,bundling_threshold);
if (po_out) { /* WRITE FINAL PARTIAL ORDER ALIGNMENT TO OUTPUT */
if (lpo_file_out=fopen(po_out, "w")) {
write_lpo(lpo_file_out,lpo_out,&score_matrix);
fclose(lpo_file_out);
+ fprintf(errfile,"...Wrote %d sequences to PO file %s...\n",lpo_out->nsource_seq,po_out);
}
else {
WARN_MSG(USERR,(ERRTXT,"*** Could not save PO file %s. Exiting.",
- po_out),"$Revision: 1.2 $");
+ po_out),"$Revision: 1.2.2.9 $");
exit_code=1; /* SIGNAL ERROR CONDITION */
}
}
- if (fasta_out) { /* WRITE FINAL ALIGNMENT IN FASTA FORMAT */
- if (seq_ifile=fopen(fasta_out,"w")) { /* FASTA ALIGNMENT*/
+ if (fasta_out) { /* WRITE FINAL ALIGNMENT IN FASTA-PIR FORMAT */
+ if (seq_ifile=fopen(fasta_out,"w")) { /* FASTA-PIR ALIGNMENT*/
write_lpo_bundle_as_fasta(seq_ifile,lpo_out,score_matrix.nsymbol,
score_matrix.symbol,ibundle);
fclose(seq_ifile);
+ fprintf(errfile,"...Wrote %d sequences to FASTA-PIR file %s...\n",lpo_out->nsource_seq,fasta_out);
}
else {
- WARN_MSG(USERR,(ERRTXT,"*** Could not save FASTA file %s. Exiting.",
- fasta_out),"$Revision: 1.2 $");
+ WARN_MSG(USERR,(ERRTXT,"*** Could not save FASTA-PIR file %s. Exiting.",
+ fasta_out),"$Revision: 1.2.2.9 $");
exit_code=1; /* SIGNAL ERROR CONDITION */
}
}
- if (clustal_out) { /* WRITE FINAL ALIGNMENT IN FASTA FORMAT */
- if (seq_ifile=fopen(clustal_out,"w")) { /* FASTA ALIGNMENT*/
+ if (clustal_out) { /* WRITE FINAL ALIGNMENT IN CLUSTAL FORMAT */
+ if (seq_ifile=fopen(clustal_out,"w")) { /* CLUSTAL ALIGNMENT*/
export_clustal_seqal(seq_ifile,lpo_out,score_matrix.nsymbol,
score_matrix.symbol);
fclose(seq_ifile);
+ fprintf(errfile,"...Wrote %d sequences to CLUSTAL file %s...\n",lpo_out->nsource_seq,clustal_out);
}
else {
WARN_MSG(USERR,(ERRTXT,"*** Could not save CLUSTAL file %s. Exiting.",
- fasta_out),"$Revision: 1.2 $");
+ clustal_out),"$Revision: 1.2.2.9 $");
exit_code=1; /* SIGNAL ERROR CONDITION */
}
}
@@ -242,14 +323,64 @@
free_memory_and_exit: /* FREE ALL DYNAMICALLY ALLOCATED DATA!!!! */
+
if (dna_lpo)
free_lpo_sequence(dna_lpo,TRUE);
- if (lpo_in)
- free_lpo_sequence(lpo_in,TRUE);
- if (lpo_out != seq && lpo_out != lpo_in)
- free_lpo_sequence(lpo_out,TRUE);
- LOOPB (i,nseq) /* seq[] WAS ALLOCATED AS ONE ARRAY, SO FREE seq[0] LAST */
- free_lpo_sequence(seq+i,i==0); /* FREE BLOCK AFTER ALL HOLDERS EMPTY*/
- exit(exit_code);
+
+ for (i=0;i<n_input_seqs;i++) {
+ for (j=0;j<nseq;j++) {
+ if (input_seqs[i]==&(seq[j]))
+ break;
+ }
+ free_lpo_sequence(input_seqs[i],(j==nseq));
+ }
+ FREE (input_seqs);
+ if (nseq>0) FREE (seq);
+
+ exit (exit_code);
}
+
+
+static LPOSequence_T *read_partial_order_file (char *po_filename, char *subset_filename, int remove_listed_seqs, int keep_all_links, int do_switch_case, ResidueScoreMatrix_T *mat)
+{
+ LPOSequence_T *lpo_in;
+ FILE *po_file=NULL, *subset_file=NULL;
+
+ if (!po_filename)
+ return NULL;
+
+ po_file = fopen (po_filename, "r");
+ if (!po_file) {
+ WARN_MSG (USERR, (ERRTXT,"Couldn't open MSA file %s.\nExiting.",po_filename), "$Revision: 1.2.2.9 $");
+ return NULL;
+ }
+
+ if (subset_filename) {
+ subset_file = fopen (subset_filename, "r");
+ if (!subset_file) {
+ WARN_MSG (USERR, (ERRTXT,"Couldn't open subset file %s.\nExiting.",subset_filename), "$Revision: 1.2.2.9 $");
+ return NULL;
+ }
+ }
+
+ if (subset_file) {
+ lpo_in = read_msa_select (po_file, UNKNOWN_MSA, subset_file, keep_all_links, remove_listed_seqs, do_switch_case, mat);
+ fclose (subset_file);
+ fclose (po_file);
+ if (lpo_in==NULL || lpo_in->nsource_seq == 0) {
+ WARN_MSG (USERR, (ERRTXT,"MSA file %s, filtered with subset file %s, couldn't be read or contains no sequences.\nExiting.", po_filename, subset_filename), "$Revision: 1.2.2.9 $");
+ return NULL;
+ }
+ }
+ else {
+ lpo_in = read_msa (po_file, UNKNOWN_MSA, do_switch_case, mat);
+ fclose (po_file);
+ if (lpo_in==NULL || lpo_in->nsource_seq == 0) {
+ WARN_MSG (USERR, (ERRTXT,"MSA file %s couldn't be read or contains no sequences.\nExiting.", po_filename), "$Revision: 1.2.2.9 $");
+ return NULL;
+ }
+ }
+
+ return lpo_in;
+}
Added: trunk/packages/poa/branches/upstream/current/make_pscores.pl
===================================================================
--- trunk/packages/poa/branches/upstream/current/make_pscores.pl (rev 0)
+++ trunk/packages/poa/branches/upstream/current/make_pscores.pl 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+#
+# Usage: make_pscores.pl SEQFILE SCOREFILE
+#
+# Runs BLAST and writes output to a file "SEQFILE.out".
+# This file is parsed into lines of the form "seqname1 \t seqname2 \t bitscore"
+# (suitable for input to POA), which are written to SCOREFILE.
+#
+# NB: The bitscore increases with increasing sequence similarity.
+#
+
+$seq_file = $ARGV[0];
+$pscore_file = $ARGV[1];
+
+open(PSCORE_OUT, ">$pscore_file");
+system("./formatdb -i $seq_file -p T");
+system("./blastall -p blastp -d $seq_file -i $seq_file -M BLOSUM80 -o $seq_file.out");
+
+open(BLAST_OUT, "<$seq_file.out");
+while(<BLAST_OUT>){
+ @my_parse = split(/\s/, $_);
+ if ($my_parse[0] =~ /^Query=/){
+ $seq_name1 = $my_parse[1];
+ while(<BLAST_OUT>){
+ if ($_ =~ />/){
+ last;
+ }
+ if ($_ =~ /bits/){
+ while(<BLAST_OUT>){
+ if ($_ =~ />/){
+ last;
+ }
+ @other_parse = split(/\s+/, $_);
+ $seq_name2 = $other_parse[0];
+ $bit_score = $other_parse[1];
+ if ($seq_name2 ne ""){
+ printf PSCORE_OUT "$seq_name1\t$seq_name2\t$bit_score.0\n";
+ }
+ }
+ if ($_ =~ />/){
+ last;
+ }
+ }
+ }
+ }
+}
+close(BLAST_OUT);
+close(PSCORE_OUT);
Property changes on: trunk/packages/poa/branches/upstream/current/make_pscores.pl
___________________________________________________________________
Name: svn:executable
+ *
Added: trunk/packages/poa/branches/upstream/current/msa_format.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/msa_format.c (rev 0)
+++ trunk/packages/poa/branches/upstream/current/msa_format.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,619 @@
+
+
+/****************/
+/* msa_format.c */
+/****************/
+
+/* ---
+ Implementation of "msa_format.h":
+
+ Functions for reading CLUSTAL- and FASTA-PIR-formatted files
+ into the LPOSequence_T data structure, and for determining file
+ type from the initial line(s) of a file.
+ --- */
+
+
+
+#include "msa_format.h"
+
+
+/** is `ch' an allowed residue? (a-z OR A-Z OR ? OR [ OR ]) */
+static int is_residue_char (char ch);
+
+/** is `ch' an allowed gap? (. OR -) */
+static int is_gap_char (char ch);
+
+/** could `ch' be the first character of a sequence name? (NOT # AND NOT * AND NOT whitespace) */
+static int is_name_first_char (char ch);
+
+
+/** decide which sequences in an RC-MSA alignment to keep based on a filter file `select_ifile'.
+ remove discarded sequences and reindex the remaining ones.
+ returns the NEW number of sequences. */
+static int filter_sequence_set (int n_seqs, FILE *select_ifile, int remove_listed_sequences,
+ char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths);
+
+
+/** remove sequence bad_seq_id from each letter containing it.
+ ASSUMES THAT NO LETTER CONTAINS _ONLY_ bad_seq_id... e.g. bad_seq_id IS A CONSENSUS.
+ either keep or don't keep links present only in the removed sequence. */
+static void strip_seq_from_lpo (LPOSequence_T *lposeq, int bad_seq_id, int keep_all_links);
+
+
+static LPOSequence_T *read_clustal (FILE *ifile, const char *first_line,
+ FILE *select_ifile, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix);
+
+static LPOSequence_T *read_pir (FILE *ifile, const char *first_line,
+ FILE *select_ifile, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix);
+
+
+
+/** Reads an MSA from a file. If `format' is UNKNOWN_MSA, the file
+ format is determined from the first line(s) of the file.
+ Uses `select_ifile', if non-NULL, to filter the sequence set.
+ Uppercases or lowercases sequence characters according to `do_switch_case'.
+ Indexes LPO symbols using `score_matrix'.
+*/
+LPOSequence_T *read_msa_select (FILE *ifile, msa_file_format format,
+ FILE *select_ifile, int keep_all_links, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix)
+{
+ static char line[512]="";
+
+ /* USE format TO FIX FILE FORMAT IF POSSIBLE. */
+ /* OTHERWISE, PO FILES START WITH 'VERSION=' (and this line is discarded),
+ FASTA-PIR FILES START WITH '>', AND CLUSTAL FILES WITH 'CLUSTAL' OR SIMPLY
+ WITH THE FIRST ALIGNMENT LINE.
+ LINES STARTING WITH whitespace OR '#' OR '*' ARE IGNORED. */
+
+ while (format!=UNKNOWN_MSA || fgets (line, sizeof(line)-1, ifile)) {
+
+ if (format==PIR_MSA || line[0] == '>') {
+ return read_pir (ifile, line,
+ select_ifile, remove_listed_sequences,
+ do_switch_case, score_matrix);
+ }
+ else if (format==PO_MSA || 0==strncmp(line,"VERSION=",8)) {
+ if (select_ifile != NULL) {
+ return read_lpo_select (ifile, select_ifile, keep_all_links, remove_listed_sequences);
+ }
+ else {
+ return read_lpo (ifile);
+ }
+ }
+ else if (format==CLUSTAL_MSA || 0==strncmp(line,"CLUSTAL",7)) {
+ return read_clustal (ifile, line,
+ select_ifile, remove_listed_sequences,
+ do_switch_case, score_matrix);
+ }
+ else if (line[0] == '#' || line[0] == '*' || line[0] == ' ' || line[0] == '\t' || line[0] == '\n' || line[0] == '\r') {
+ continue;
+ }
+ else {
+ WARN_MSG(USERR,(ERRTXT, "Unable to determine MSA file type... trying CLUSTAL.\n"),"$Revision: 1.1.2.3 $");
+ return read_clustal (ifile, line,
+ select_ifile, remove_listed_sequences,
+ do_switch_case, score_matrix);
+ }
+ }
+
+ WARN_MSG(USERR,(ERRTXT, "No data in MSA file.\n"),"$Revision: 1.1.2.3 $");
+
+ return NULL;
+}
+
+/** Reads an MSA from a file (cf. read_msa_select).
+ */
+LPOSequence_T *read_msa (FILE *ifile, msa_file_format format,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix)
+{
+ return read_msa_select (ifile, format,
+ NULL, 0, 0,
+ do_switch_case, score_matrix);
+}
+
+
+
+
+/** Reads a CLUSTAL-formatted alignment file.
+ */
+LPOSequence_T *read_clustal (FILE *fp, const char *first_line,
+ FILE *select_ifile, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix)
+{
+ int i, j, n_seqs=0, curr_seq=0, expect_repeats=0, expect_header=1, line_num=0;
+ char ch;
+ char **seq_names=NULL, **seq_titles=NULL, **aln_mat=NULL;
+ int *aln_lengths=NULL;
+ char line[512]="", name[512]="", aln[512]="";
+ LPOSequence_T *lposeq = NULL;
+
+ while ((first_line!=NULL && line_num==0) || fgets (line, sizeof(line)-1, fp)) {
+ if (first_line!=NULL && line_num==0) {
+ strcpy (line, first_line);
+ }
+ line_num++;
+
+ if (expect_header) { /* LOOKING FOR 'CLUSTAL' HEADER LINE */
+ if (0 == strncmp(line,"CLUSTAL",7)) { /* FOUND IT; STOP LOOKING */
+ expect_header=0;
+ continue;
+ }
+ else if (0 == is_name_first_char(line[0])) { /* COMMENT LINE; KEEP LOOKING */
+ continue;
+ }
+ else {
+ expect_header=0; /* FOUND SOMETHING ELSE; NO HEADER, SO CONTINUE */
+ }
+ }
+
+ if (0 == is_name_first_char(line[0])) { /* BLOCK SEPARATOR? */
+ curr_seq = 0;
+ if (n_seqs>0) {
+ expect_repeats=1; /* YES; NOW EXPECT SAME SEQUENCES OVER AGAIN IN EACH BLOCK */
+ }
+ continue;
+ }
+
+ if (sscanf (line, "%s %[^\n\r]", name, aln) < 2) {
+ WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading CLUSTAL-formatted file near line %d: \n>>>\n%s<<<\nBailing out.\n",line_num,line),
+ "$Revision: 1.1.2.3 $");
+ goto free_memory_and_exit;
+ }
+
+ if (0 == expect_repeats) { /* FIRST BLOCK STILL, SO MAKE ROOM FOR NEW SEQ */
+ n_seqs++;
+
+ REALLOC (seq_names, n_seqs, char *);
+ seq_names[n_seqs-1] = strdup(name);
+ REALLOC (seq_titles, n_seqs, char *);
+ seq_titles[n_seqs-1] = strdup("");
+
+ REALLOC (aln_mat, n_seqs, char *);
+ aln_mat[n_seqs-1] = strdup(" ");
+ REALLOC (aln_lengths, n_seqs, int);
+ aln_lengths[n_seqs-1] = 0;
+ }
+ else if (curr_seq>=n_seqs || strcmp(name,seq_names[curr_seq])) { /* NAME SHOULD BE A REPEAT */
+ WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading CLUSTAL-formatted file at line %d: \n>>>\n%s<<<\nSequence name (%s) does not match expected sequence name (%s). Bailing out.\n",line_num,line,name,seq_names[curr_seq]),
+ "$Revision: 1.1.2.3 $");
+ goto free_memory_and_exit;
+ }
+
+ REALLOC (aln_mat[curr_seq], aln_lengths[curr_seq] + strlen(aln), char);
+ for (i=0; i<strlen(aln); i++) {
+ ch = aln[i];
+ if (is_residue_char(ch) || is_gap_char(ch)) {
+ aln_mat[curr_seq][aln_lengths[curr_seq]++] = aln[i];
+ }
+ }
+ curr_seq++;
+ }
+
+ n_seqs = filter_sequence_set (n_seqs, select_ifile, remove_listed_sequences, seq_names, seq_titles, aln_mat, aln_lengths);
+
+ lposeq = lpo_from_aln_mat (n_seqs, seq_names, seq_titles, aln_mat, aln_lengths, do_switch_case, score_matrix);
+
+ free_memory_and_exit:
+ for (i=0; i<n_seqs; i++) {
+ FREE (seq_names[i]);
+ FREE (seq_titles[i]);
+ FREE (aln_mat[i]);
+ }
+ FREE (seq_names);
+ FREE (seq_titles);
+ FREE (aln_mat);
+ FREE (aln_lengths);
+
+ if (n_seqs>0) {
+ strcpy (lposeq->name, lposeq->source_seq[0].name);
+ FREE (lposeq->title);
+ lposeq->title = strdup (lposeq->source_seq[0].title);
+ }
+
+ return lposeq;
+}
+
+/** Reads a FASTA-PIR-formatted alignment file.
+ */
+LPOSequence_T *read_pir (FILE *fp, const char *first_line,
+ FILE *select_ifile, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix)
+{
+ int i, j, n_seqs=0, curr_seq=0, line_num=0;
+ char ch;
+ char **seq_names=NULL, **seq_titles=NULL, **aln_mat=NULL;
+ int *aln_lengths=NULL, *keep_seq=NULL;
+ char line[512]="", name[512]="", title[512]="", aln[512]="";
+ LPOSequence_T *lposeq = NULL;
+
+ while ((first_line!=NULL && line_num==0) || fgets (line, sizeof(line)-1, fp)) {
+ if (first_line!=NULL && line_num==0) {
+ strcpy (line, first_line);
+ }
+ line_num++;
+
+ if (line[0] == '>') { /* HEADER LINE FOR NEW SEQUENCE */
+ title[0] = '\0';
+ if (sscanf (line, ">%s %[^\n\r]", name, title) < 1) {
+ WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading PIR-formatted file near line %d (no sequence name?):\n>>>\n%s<<<\nBailing out.\n",line_num,line),
+ "$Revision: 1.1.2.3 $");
+ goto free_memory_and_exit;
+ }
+
+ n_seqs++;
+ curr_seq=n_seqs-1;
+ REALLOC (seq_names, n_seqs, char *);
+ seq_names[n_seqs-1] = strdup(name);
+ REALLOC (seq_titles, n_seqs, char *);
+ seq_titles[n_seqs-1] = strdup(title);
+ REALLOC (aln_mat, n_seqs, char *);
+ aln_mat[n_seqs-1] = strdup(" ");
+ REALLOC (aln_lengths, n_seqs, int);
+ aln_lengths[n_seqs-1] = 0;
+ }
+ else if (line[0]=='#' || line[0]=='*') { /* COMMENT LINE */
+ continue;
+ }
+ else { /* ALIGNMENT ROW FOR CURRENT SEQUENCE */
+ sscanf (line, "%[^\n\r]", aln);
+ if (n_seqs==0) {
+ WARN_MSG(USERR,(ERRTXT, "Error: Trouble reading PIR-formatted file near line %d (no preceding '>seqname' line?):\n>>>\n%s<<<\nBailing out.\n",line_num,line),
+ "$Revision: 1.1.2.3 $");
+ goto free_memory_and_exit;
+ }
+
+ REALLOC (aln_mat[curr_seq], aln_lengths[curr_seq] + strlen(aln), char);
+ for (i=0; i<strlen(aln); i++) {
+ ch = aln[i];
+ if (is_residue_char(ch) || is_gap_char(ch)) {
+ aln_mat[curr_seq][aln_lengths[curr_seq]++] = aln[i];
+ }
+ }
+ }
+ }
+
+ n_seqs = filter_sequence_set (n_seqs, select_ifile, remove_listed_sequences, seq_names, seq_titles, aln_mat, aln_lengths);
+
+ lposeq = lpo_from_aln_mat (n_seqs, seq_names, seq_titles, aln_mat, aln_lengths, do_switch_case, score_matrix);
+
+ free_memory_and_exit:
+ for (i=0; i<n_seqs; i++) {
+ FREE (seq_names[i]);
+ FREE (seq_titles[i]);
+ FREE (aln_mat[i]);
+ }
+ FREE (seq_names);
+ FREE (seq_titles);
+ FREE (aln_mat);
+ FREE (aln_lengths);
+
+ if (n_seqs>0) {
+ strcpy (lposeq->name, lposeq->source_seq[0].name);
+ FREE (lposeq->title);
+ lposeq->title = strdup (lposeq->source_seq[0].title);
+ }
+
+ return lposeq;
+}
+
+/** Creates an LPO from an RC-MSA alignment matrix.
+ */
+LPOSequence_T *lpo_from_aln_mat (int n_seqs, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths, int do_switch_case, ResidueScoreMatrix_T *score_matrix)
+{
+ int i, j, k, len, col, res_id, letter_id;
+ char ch;
+ LPOSequence_T *curr_seq, *lposeq;
+ int **column_ids; /** which column contains each residue */
+ int **res_ids; /** which residue is in each column */
+ int *al_x, *al_y;
+ int max_aln_length = 0;
+ char *consens_row;
+
+ if (n_seqs==0)
+ return NULL;
+
+ CALLOC (column_ids, n_seqs+1, int *);
+ CALLOC (res_ids, n_seqs+1, int *);
+ for (i=0; i<n_seqs; i++) {
+ if (aln_lengths[i] > max_aln_length) {
+ max_aln_length = aln_lengths[i];
+ }
+ }
+ for (i=0; i<n_seqs+1; i++) {
+ CALLOC (column_ids[i], max_aln_length, int);
+ CALLOC (res_ids[i], max_aln_length, int);
+ }
+
+ /* GET CONSENSUS ROW THAT ALIGNS TO SOME OTHER ROW IN EACH COLUMN */
+ /* CONSENSUS ROW IS _NECESSARY_ TO DEAL WITH THE POSSIBILITY THAT, e.g., UNORDERED RESIDUES
+ IN seqs 0 & 1 ARE ORDERED BY ALIGNMENT WITH seq 2. */
+
+ CALLOC (consens_row, max_aln_length, char);
+ for (col=0; col<max_aln_length; col++) {
+ consens_row[col] = '-';
+ for (i=0; i<n_seqs; i++) {
+ if (col < aln_lengths[i] && is_residue_char(aln_mat[i][col])) {
+ consens_row[col] = aln_mat[i][col];
+ break;
+ }
+ }
+ }
+
+ /* MAKE SEQUENCE FROM CONSENSUS ROW */
+
+ CALLOC (lposeq, 1, LPOSequence_T);
+ strcpy (lposeq->name, "consens_row");
+ lposeq->title = strdup ("");
+
+ CALLOC (lposeq->sequence, max_aln_length+1, char);
+ for (len=col=0; col<max_aln_length; col++) {
+ ch = consens_row[col];
+ if (do_switch_case == switch_case_to_lower) {
+ ch = tolower(ch);
+ }
+ else if (do_switch_case == switch_case_to_upper) {
+ ch = toupper(ch);
+ }
+ if (is_residue_char(ch)) {
+ lposeq->sequence[len] = ch;
+ column_ids[0][len] = col;
+ res_ids[0][col] = len;
+ len++;
+ }
+ }
+ lposeq->length = len;
+ lposeq->sequence[col] = '\0';
+ REALLOC (lposeq->sequence, len+1, char);
+
+ initialize_seqs_as_lpo (1, lposeq, score_matrix);
+
+ for (i=0; i<n_seqs; i++) {
+
+ CALLOC (curr_seq, 1, LPOSequence_T);
+ strcpy (curr_seq->name, seq_names[i]);
+ curr_seq->title = strdup (seq_titles[i]);
+
+ /* READ CHARACTERS FROM ALIGNMENT ROW INTO SEQUENCE: */
+ CALLOC (curr_seq->sequence, aln_lengths[i]+1, char);
+ for (len=col=0; col<aln_lengths[i]; col++) {
+ ch = aln_mat[i][col];
+ if (do_switch_case == switch_case_to_lower) {
+ ch = tolower(ch);
+ }
+ else if (do_switch_case == switch_case_to_upper) {
+ ch = toupper(ch);
+ }
+ if (is_residue_char(ch)) {
+ curr_seq->sequence[len] = ch;
+ column_ids[i+1][len] = col;
+ res_ids[i+1][col] = len;
+ len++;
+ }
+ }
+ curr_seq->length = len;
+ curr_seq->sequence[len] = '\0';
+ REALLOC (curr_seq->sequence, len+1, char);
+
+ initialize_seqs_as_lpo (1, curr_seq, score_matrix);
+
+ /* ALIGN THIS SEQUENCE TO EXISTING ALIGNMENT USING CONSENSUS ROW */
+
+ build_seq_to_po_index (lposeq);
+
+ CALLOC (al_x, len, int);
+ CALLOC (al_y, lposeq->length, int);
+
+ for (j=0; j<lposeq->length; j++) {
+ al_y[j] = INVALID_LETTER_POSITION;
+ }
+
+ /* FOR EACH RESIDUE IN THIS SEQUENCE: */
+ for (j=0; j<len; j++) {
+ col = column_ids[i+1][j];
+
+ /* GET AN OLD NODE IN CONSENSUS ROW ALIGNED TO THIS ONE: */
+ res_id = res_ids[0][col];
+
+ /* STORE INFO IN al_x,al_y: */
+ letter_id = lposeq->source_seq[0].seq_to_po[res_id];
+ al_x[j] = letter_id;
+ al_y[letter_id] = j;
+ }
+
+ /* DO THE FUSION-BASED BUILDUP */
+
+ fuse_ring_identities (lposeq->length, lposeq->letter,
+ curr_seq->length, curr_seq->letter,
+ al_y, al_x);
+
+ fuse_lpo (lposeq, curr_seq, al_y, al_x);
+
+ free_lpo_sequence (curr_seq, 0);
+ FREE (al_x);
+ FREE (al_y);
+ FREE (curr_seq);
+ }
+
+ /* STRIP OUT CONSENSUS SEQUENCE */
+
+ strip_seq_from_lpo (lposeq, /*remove #*/ 0, /*0=don't keep all links*/ 0);
+
+ for (i=0; i<n_seqs+1; i++) {
+ FREE (column_ids[i]);
+ FREE (res_ids[i]);
+ }
+ FREE (column_ids);
+ FREE (res_ids);
+ FREE (consens_row);
+
+ return lposeq;
+}
+
+
+static int is_residue_char (char ch)
+{
+ if (ch>='a' && ch<='z') return 1;
+ if (ch>='A' && ch<='Z') return 1;
+ if (ch=='?' || ch=='[' || ch==']') return 1;
+ return 0;
+}
+
+static int is_gap_char (char ch)
+{
+ if (ch=='.' || ch=='-') return 1;
+ return 0;
+}
+
+static int is_name_first_char (char ch)
+{
+ if (ch=='\n' || ch=='\r' || ch==' ' || ch=='\t' || ch=='#' || ch=='*') return 0;
+ return 1;
+}
+
+static int filter_sequence_set (int n_seqs, FILE *select_ifile, int remove_listed_sequences,
+ char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths)
+{
+ int i, j;
+ int *keep_seq;
+ char name[512]="";
+
+ /* DECIDE WHICH SEQUENCES TO KEEP: */
+
+ CALLOC (keep_seq, n_seqs, int);
+
+ for (i=0; i<n_seqs; i++) {
+ keep_seq[i] = ((!select_ifile || remove_listed_sequences) ? 1 : 0);
+ }
+
+ while (select_ifile && fscanf (select_ifile, "SOURCENAME=%[^\r\n]%*[\r\n]", name) >= 1) {
+ for (i=0; i<n_seqs; i++) {
+ if (0 == strcmp(seq_names[i],name)) {
+ keep_seq[i] = (remove_listed_sequences ? 0 : 1);
+ }
+ }
+ }
+
+ /* FREE MEM FOR DISCARDED SEQUENCES: */
+ for (i=0; i<n_seqs; i++) if (keep_seq[i]==0) {
+ FREE (seq_names[i]);
+ FREE (seq_titles[i]);
+ FREE (aln_mat[i]);
+ }
+
+ /* COMPACT SEQUENCE LIST (MOVE i<--j): */
+ for (i=j=0; j<n_seqs; i++, j++) {
+ while (j<n_seqs && keep_seq[j]==0)
+ j++;
+ if (j==n_seqs)
+ break;
+ if (j<n_seqs && i!=j) {
+ seq_names[i] = seq_names[j];
+ seq_titles[i] = seq_titles[j];
+ aln_mat[i] = aln_mat[j];
+ aln_lengths[i] = aln_lengths[j];
+ }
+ }
+
+ FREE (keep_seq);
+
+ return i;
+}
+
+
+
+void strip_seq_from_lpo (LPOSequence_T *lposeq, int bad_seq_id, int keep_all_links)
+{
+ int i, j, id_left, id_right, len, n_seqs;
+ LPOLetterLink_T *lnk, *tmp_lnk;
+ LPOLetterSource_T *src, *prev, *tmp_src;
+ LPOLetter_T *lett;
+
+ n_seqs = lposeq->nsource_seq - 1;
+
+ /* FREE MEM ASSOCIATED WITH BAD SEQ */
+ FREE (lposeq->source_seq[bad_seq_id].title);
+ FREE (lposeq->source_seq[bad_seq_id].sequence);
+ FREE (lposeq->source_seq[bad_seq_id].seq_to_po);
+ FREE (lposeq->source_seq[bad_seq_id].po_to_seq);
+
+ /* COMPACT SOURCE LIST */
+ for (i=0; i<n_seqs; i++) if (i>=bad_seq_id) {
+ lposeq->source_seq[i] = lposeq->source_seq[i+1];
+ }
+ /* This de-allocation causes a bug in conjunction with
+ later REBUFF calls: */
+ /* REALLOC (lposeq->source_seq, n_seqs, LPOSourceInfo_T); */
+ lposeq->nsource_seq = n_seqs;
+
+
+ /* RENUMBER SOURCES IN EACH LETTER */
+ for (i=0; i<lposeq->length; i++) {
+ lett = &(lposeq->letter[i]);
+ prev = NULL;
+ src = &(lett->source);
+ while (src != NULL && src->iseq >= 0) {
+ if (src->iseq == bad_seq_id) {
+ if (prev) { /* NOT IN LIST HEAD, SO RELINK FROM PREV AND FREE */
+ prev->more = src->more;
+ FREE (src);
+ src = prev->more;
+ }
+ else { /* IN LIST HEAD, SO REASSIGN HEAD DATA, RELINK, FREE */
+ if (src->more) {
+ tmp_src = src->more;
+ src->ipos = tmp_src->ipos;
+ src->iseq = tmp_src->iseq;
+ src->more = tmp_src->more;
+ FREE (tmp_src);
+ }
+ else {
+ src->ipos = -1;
+ src->iseq = -1;
+ }
+ }
+ }
+ else {
+ if (src->iseq>=bad_seq_id)
+ src->iseq--;
+ prev = src;
+ src = src->more;
+ }
+ }
+ }
+
+ if (0==keep_all_links) {
+
+ build_seq_to_po_index (lposeq);
+
+ /* FREE ALL LINK INFO */
+ for (i=0; i<lposeq->length; i++) {
+ lett = &(lposeq->letter[i]);
+ for (lnk = &(lett->left); lnk->more != NULL; ) {
+ tmp_lnk = lnk->more;
+ lnk->more = tmp_lnk->more;
+ FREE (tmp_lnk);
+ }
+ lnk->ipos = -1;
+ for (lnk = &(lett->right); lnk->more != NULL; ) {
+ tmp_lnk = lnk->more;
+ lnk->more = tmp_lnk->more;
+ FREE (tmp_lnk);
+ }
+ lnk->ipos = -1;
+ }
+
+ /* REBUILD LINK INFO FROM REMAINING SEQS */
+ for (i=0; i<n_seqs; i++) {
+ len = lposeq->source_seq[i].length;
+ for (j=0; j<len-1; j++) {
+ id_left = lposeq->source_seq[i].seq_to_po[j];
+ id_right = lposeq->source_seq[i].seq_to_po[j+1];
+ add_lpo_link (&(lposeq->letter[id_left].right), id_right);
+ add_lpo_link (&(lposeq->letter[id_right].left), id_left);
+ }
+ }
+ }
+}
Added: trunk/packages/poa/branches/upstream/current/msa_format.h
===================================================================
--- trunk/packages/poa/branches/upstream/current/msa_format.h (rev 0)
+++ trunk/packages/poa/branches/upstream/current/msa_format.h 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,52 @@
+
+
+/****************/
+/* msa_format.h */
+/****************/
+
+/* ---
+ Functions for reading CLUSTAL- and FASTA-PIR-formatted files
+ into the LPOSequence_T data structure, and for determining file
+ type from the initial line(s) of a file.
+ --- */
+
+
+#ifndef MSA_FORMAT_HEADER_INCLUDED
+#define MSA_FORMAT_HEADER_INCLUDED
+
+#include "default.h"
+#include "seq_util.h"
+#include "lpo.h"
+
+/** types of MSA files supported */
+typedef enum {
+ UNKNOWN_MSA, CLUSTAL_MSA, PIR_MSA, PO_MSA
+}
+msa_file_format;
+
+
+/** Reads an MSA from a file (cf. read_msa_select).
+ */
+LPOSequence_T *read_msa (FILE *ifile, msa_file_format format,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix);
+
+
+/** Reads an MSA from a file. If `format' is UNKNOWN_MSA, the file
+ format is determined from the first line(s) of the file.
+ Uses `select_ifile', if non-NULL, to filter the sequence set.
+ Uppercases or lowercases sequence characters according to `do_switch_case'.
+ Indexes LPO symbols using `score_matrix'.
+*/
+LPOSequence_T *read_msa_select (FILE *ifile, msa_file_format format,
+ FILE *select_ifile, int keep_all_links, int remove_listed_sequences,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix);
+
+
+/** Creates an LPO from an RC-MSA alignment matrix.
+ */
+LPOSequence_T *lpo_from_aln_mat (int n_seqs, char **seq_names, char **seq_titles, char **aln_mat, int *aln_lengths,
+ int do_switch_case, ResidueScoreMatrix_T *score_matrix);
+
+
+
+#endif /* MSA_FORMAT_HEADER_INCLUDED */
Added: trunk/packages/poa/branches/upstream/current/multidom.pscore
===================================================================
--- trunk/packages/poa/branches/upstream/current/multidom.pscore (rev 0)
+++ trunk/packages/poa/branches/upstream/current/multidom.pscore 2006-09-26 04:34:02 UTC (rev 127)
@@ -0,0 +1,16 @@
+ABL1_HUMAN ABL1_HUMAN 2475.0
+ABL1_HUMAN MATK_HUMAN 260.0
+ABL1_HUMAN GRB2_HUMAN 100.0
+ABL1_HUMAN CRKL_HUMAN 63.0
+CRKL_HUMAN CRKL_HUMAN 736.0
+CRKL_HUMAN GRB2_HUMAN 70.0
+CRKL_HUMAN ABL1_HUMAN 60.0
+CRKL_HUMAN MATK_HUMAN 55.0
+GRB2_HUMAN GRB2_HUMAN 551.0
+GRB2_HUMAN ABL1_HUMAN 100.0
+GRB2_HUMAN MATK_HUMAN 77.0
+GRB2_HUMAN CRKL_HUMAN 70.0
+MATK_HUMAN MATK_HUMAN 1215.0
+MATK_HUMAN ABL1_HUMAN 279.0
+MATK_HUMAN GRB2_HUMAN 69.0
+MATK_HUMAN CRKL_HUMAN 55.0
Modified: trunk/packages/poa/branches/upstream/current/seq_util.c
===================================================================
--- trunk/packages/poa/branches/upstream/current/seq_util.c 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/seq_util.c 2006-09-26 04:34:02 UTC (rev 127)
@@ -94,7 +94,7 @@
ifile=fopen(filename,"r");
if (!ifile) {
- WARN_MSG(USERR,(ERRTXT,"Can't open alignment matrix from %s\n",filename),"$Revision: 1.2.2.1 $");
+ WARN_MSG(USERR,(ERRTXT,"Can't open alignment matrix from %s\n",filename),"$Revision: 1.2.2.2 $");
return -2; /* FAILED TO FIND FILE TO READ */
}
@@ -165,6 +165,37 @@
}
fclose(ifile);
+ /* CONSTRUCT GAP PENALTY ARRAYS FROM GAP PARAMETERS: */
+ m->max_gap_length = m->trunc_gap_length + m->decay_gap_length;
+ CALLOC (m->gap_penalty_x, m->max_gap_length+2, LPOScore_T);
+ CALLOC (m->gap_penalty_y, m->max_gap_length+2, LPOScore_T);
+
+ /*** GAP OPENING PENALTY @ L=0->1 */
+ m->gap_penalty_x[0] = m->gap_penalty_set[0][0];
+ m->gap_penalty_y[0] = m->gap_penalty_set[1][0];
+
+ /*** 1st AFFINE EXTENSION PENALTY (A1) @ L=1->2,2->3,...T-1->T */
+ for (i=1;i<m->trunc_gap_length;i++) {
+ m->gap_penalty_x[i] = m->gap_penalty_set[0][1];
+ m->gap_penalty_y[i] = m->gap_penalty_set[1][1];
+ }
+
+ /*** DECAYING EXTENSION PENALTY (A1-->A2; skipped if D=0) @ L=T->T+1,...T+D-1->T+D */
+ for (i=0;i<m->decay_gap_length;i++) {
+ double dec_x = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(m->decay_gap_length + 1));
+ double dec_y = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(m->decay_gap_length + 1));
+ m->gap_penalty_x[i+m->trunc_gap_length] = m->gap_penalty_set[0][1] - (i+1) * dec_x;
+ m->gap_penalty_y[i+m->trunc_gap_length] = m->gap_penalty_set[1][1] - (i+1) * dec_y;
+ }
+
+ /*** 2nd AFFINE EXTENSION PENALTY (A2) @ L>=T+D */
+ m->gap_penalty_x[m->max_gap_length] = m->gap_penalty_set[0][2];
+ m->gap_penalty_y[m->max_gap_length] = m->gap_penalty_set[1][2];
+
+ m->gap_penalty_x[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */
+ m->gap_penalty_y[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */
+
+
LOOPF (i,nsymb) {
Score_matrix_row= m->score[i]; /* ROW TO USE FOR SORTING best_match */
LOOP (j,nsymb)
Modified: trunk/packages/poa/branches/upstream/current/seq_util.h
===================================================================
--- trunk/packages/poa/branches/upstream/current/seq_util.h 2006-09-25 13:21:49 UTC (rev 126)
+++ trunk/packages/poa/branches/upstream/current/seq_util.h 2006-09-26 04:34:02 UTC (rev 127)
@@ -35,9 +35,14 @@
char symbol[MATRIX_SYMBOL_MAX];
ResidueScore_T score[MATRIX_SYMBOL_MAX][MATRIX_SYMBOL_MAX];
int best_match[MATRIX_SYMBOL_MAX][MATRIX_SYMBOL_MAX];
- ResidueScore_T gap_penalty_set[2][3];
+
+ ResidueScore_T gap_penalty_set[2][3];
int trunc_gap_length;
int decay_gap_length;
+
+ ResidueScore_T *gap_penalty_x, *gap_penalty_y;
+ int max_gap_length;
+
int nfreq; /* STORE FREQUENCIES OF AMINO ACIDS FOR BALANCING MATRIX...*/
char freq_symbol[MATRIX_SYMBOL_MAX];
float freq[MATRIX_SYMBOL_MAX];
More information about the debian-med-commit
mailing list