[med-svn] r163 - in trunk/packages: . clustalw clustalw/branches
clustalw/branches/upstream clustalw/branches/upstream/current
Charles Plessy
charles-guest at alioth.debian.org
Mon Dec 4 01:55:57 CET 2006
Author: charles-guest
Date: 2006-12-04 01:55:49 +0100 (Mon, 04 Dec 2006)
New Revision: 163
Added:
trunk/packages/clustalw/
trunk/packages/clustalw/branches/
trunk/packages/clustalw/branches/upstream/
trunk/packages/clustalw/branches/upstream/current/
trunk/packages/clustalw/branches/upstream/current/README_W
trunk/packages/clustalw/branches/upstream/current/README_X
trunk/packages/clustalw/branches/upstream/current/alnscore.c
trunk/packages/clustalw/branches/upstream/current/amenu.c
trunk/packages/clustalw/branches/upstream/current/calcgapcoeff.c
trunk/packages/clustalw/branches/upstream/current/calcprf1.c
trunk/packages/clustalw/branches/upstream/current/calcprf2.c
trunk/packages/clustalw/branches/upstream/current/calctree.c
trunk/packages/clustalw/branches/upstream/current/clustalv.doc
trunk/packages/clustalw/branches/upstream/current/clustalw.c
trunk/packages/clustalw/branches/upstream/current/clustalw.doc
trunk/packages/clustalw/branches/upstream/current/clustalw.h
trunk/packages/clustalw/branches/upstream/current/clustalw.ms
trunk/packages/clustalw/branches/upstream/current/clustalw.new
trunk/packages/clustalw/branches/upstream/current/clustalw_help
trunk/packages/clustalw/branches/upstream/current/clustalx.c
trunk/packages/clustalw/branches/upstream/current/clustalx.html
trunk/packages/clustalw/branches/upstream/current/clustalx_help
trunk/packages/clustalw/branches/upstream/current/coldna.par
trunk/packages/clustalw/branches/upstream/current/colprint.par
trunk/packages/clustalw/branches/upstream/current/colprot.par
trunk/packages/clustalw/branches/upstream/current/dayhoff.h
trunk/packages/clustalw/branches/upstream/current/gcgcheck.c
trunk/packages/clustalw/branches/upstream/current/general.h
trunk/packages/clustalw/branches/upstream/current/globin.pep
trunk/packages/clustalw/branches/upstream/current/gon90.bla
trunk/packages/clustalw/branches/upstream/current/interface.c
trunk/packages/clustalw/branches/upstream/current/makefile
trunk/packages/clustalw/branches/upstream/current/makefile.alpha
trunk/packages/clustalw/branches/upstream/current/makefile.linux
trunk/packages/clustalw/branches/upstream/current/makefile.sgi
trunk/packages/clustalw/branches/upstream/current/makefile.sun
trunk/packages/clustalw/branches/upstream/current/malign.c
trunk/packages/clustalw/branches/upstream/current/matrices.h
trunk/packages/clustalw/branches/upstream/current/matrixseries.gon
trunk/packages/clustalw/branches/upstream/current/pairalign.c
trunk/packages/clustalw/branches/upstream/current/param.h
trunk/packages/clustalw/branches/upstream/current/prfalign.c
trunk/packages/clustalw/branches/upstream/current/random.c
trunk/packages/clustalw/branches/upstream/current/readmat.c
trunk/packages/clustalw/branches/upstream/current/sequence.c
trunk/packages/clustalw/branches/upstream/current/showpair.c
trunk/packages/clustalw/branches/upstream/current/trees.c
trunk/packages/clustalw/branches/upstream/current/util.c
trunk/packages/clustalw/branches/upstream/current/xcolor.c
trunk/packages/clustalw/branches/upstream/current/xdisplay.c
trunk/packages/clustalw/branches/upstream/current/xmenu.c
trunk/packages/clustalw/branches/upstream/current/xmenu.h
trunk/packages/clustalw/branches/upstream/current/xscore.c
trunk/packages/clustalw/branches/upstream/current/xutils.c
trunk/packages/clustalw/tags/
Log:
[svn-inject] Installing original source of clustalw
Added: trunk/packages/clustalw/branches/upstream/current/README_W
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/README_W 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/README_W 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,280 @@
+******************************************************************************
+
+ CLUSTAL W Multiple Sequence Alignment Program
+ (version 1.83, Feb 2003)
+
+******************************************************************************
+
+
+Please send bug reports, comments etc. to one of:-
+ gibson at embl-heidelberg.de
+ thompson at igbmc.u-strasbg.fr
+ d.higgins at ucc.ie
+
+
+******************************************************************************
+
+ POLICY ON COMMERCIAL DISTRIBUTION OF CLUSTAL W
+
+Clustal W is freely available to the user community. However, Clustal W is
+increasingly being distributed as part of commercial sequence analysis
+packages. To help us safeguard future maintenance and development, commercial
+distributors of Clustal W must take out a NON-EXCLUSIVE LICENCE. Anyone
+wishing to commercially distribute version 1.81 of Clustal W should contact the
+authors unless they have previously taken out a licence.
+
+******************************************************************************
+
+Clustal W is written in ANSI-C and can be run on any machine with an ANSI-C
+compiler. Executables are provided for several major platforms.
+
+Changes since CLUSTAL X Version 1.82
+------------------------------------
+
+1. The FASTA format has been added to the list of alignment output options.
+
+2. It is now possible to save the residue ranges (appended after the sequence
+names) when saving a specified range of the alignment.
+
+3. The efficiency of the neighbour-joining algorithm has been improved. This
+work was done by Tadashi Koike at the Center for Information Biology and DNA Data
+Bank of Japan and FUJITSU Limited.
+
+Some example speedups are given below : (timings on a SPARC64 CPU)
+
+No. of sequences original NJ new NJ
+ 200 0' 12" 0.1"
+ 500 9' 19" 1.4"
+ 1000 XXXX 0' 31"
+
+Changes since version 1.8
+--------------------------
+
+1. ClustalW now returns error codes for some common errors when exiting. This
+may be useful for people who run clustalw automatically from within a script.
+Error codes are:
+ 1 bad command line option
+ 2 cannot open sequence file
+ 3 wrong format in sequence file
+ 4 sequence file contains only 1 sequence (for multiple alignments)
+
+2. Alignments can now be saved in Nexus format, for compatibility with PAUP,
+MacClade etc. For a description of the Nexus format, see:
+Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
+NEXUS: an extensible file format for systematic information.
+Systematic Biology 46:590-621.
+
+3. Phylogenetic trees can also be saved in nexus format.
+
+4. A ClustalW icon has been designed for MAC and PC systems.
+
+
+Changes since version 1.74
+--------------------------
+
+1. Some work has been done to automatically select the optimal parameters
+depending on the set of sequences to be aligned. The Gonnet series of residue
+comparison matrices are now used by default. The Blosum series remains as an
+option. The default gap extension penalty for proteins has been changed to 0.2
+(was 0.05). The 'delay divergent sequences' option has been changed to 30%
+residue identity (was 40%).
+
+2. The default parameters used when the 'Negative matrix' option is selected
+have been optimised. This option may help when the sequences to be aligned are
+not superposable over their whole lengths (e.g. in the presence of N/C terminal
+extensions).
+
+3. A bug in the calculation of phylogenetic trees for 2 sequences has been
+fixed.
+
+4. A command line option has been added to turn off the sequence weighting
+calculation.
+
+5. The phylogenetic tree calculation now ignores any ambiguity codes in the
+sequences.
+
+6. A bug in the memory access during the calculation of profiles has been
+fixed. (Thanks to Haruna Cofer at SGI).
+
+7. A bug has been fixed in the 'transition weight' option for nucleic acid
+sequences. (Thanks to Chanan Rubin at Compugen).
+
+8. An option has been added to read in a series of comparison matrices from a
+file. This option is only applicable for protein sequences. For details of the
+file format, see the on-line documentation.
+
+9. The MSF output file format has been changed. The sequence weights
+calculated by Clustal W are now included in the header.
+
+10. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
+involved the alignment of new sequences to an existing profile using the fast
+pairwise alignment option; the second was caused by changing the default
+options for the fast pairwise alignments.
+
+11. A bug in the alignment of a small number of sequences has been fixed.
+Previously a Guide Tree was not calculated for less than 4 sequences.
+
+
+Changes since version 1.6
+-------------------------
+
+1. The static arrays used by clustalw for storing the alignment data have been
+replaced by dynamically allocated memory. There is now no limit on the number
+or length of sequences which can be input.
+
+2. The alignment of DNA sequences now offers a new hard-coded matrix, as well
+as the identity matrix used previously. The new matrix is the default scoring
+matrix used by the BESTFIT program of the GCG package for the comparison of
+nucleic acid sequences. X's and N's are treated as matches to any IUB ambiguity
+symbol. All matches score 1.9; all mismatches for IUB symbols score 0.0.
+
+3. The transition weight option for aligning nucleotide sequences has been
+changed from an on/off toggle to a weight between 0 and 1. A weight of zero
+means that the transitions are scored as mismatches; a weight of 1 gives
+transitions the full match score. For distantly related DNA sequences, the
+weight should be near to zero; for closely related sequences it can be useful
+to assign a higher score.
+
+4. The RSF sequence alignment file format used by GCG Version 9 can now be
+read.
+
+5. The clustal sequence alignment file format has been changed to allow
+sequence names longer than 10 characters. The maximum length allowed is set in
+clustalw.h by the statement:
+#define MAXNAMES 10
+
+For the fasta format, the name is taken as the first string after the '>'
+character, stopping at the first white space. (Previously, the first 10
+characters were taken, replacing blanks by underscores).
+
+6. The bootstrap values written in the phylip tree file format can be assigned
+either to branches or nodes. The default is to write the values on the nodes,
+as this can be read by several commonly-used tree display programs. But note
+that this can lead to confusion if the tree is rooted and the bootstraps may
+be better attached to the internal branches: Software developers should ensure
+they can read the branch label format.
+
+7. The sequence weighting used during sequence to profile alignments has been
+changed. The tree weight is now multiplied by the percent identity of the
+new sequence compared with the most closely related sequence in the profile.
+
+8. The sequence weighting used during profile to profile alignments has been
+changed. A guide tree is now built for each profile separately and the
+sequence weights calculated from the two trees. The weights for each
+sequence are then multiplied by the percent identity of the sequence compared
+with the most closely related sequence in the opposite profile.
+
+9. The adjustment of the Gap Opening and Gap Extension Penalties for sequences
+of unequal length has been improved.
+
+10. The default order of the sequences in the output alignment file has been
+changed. Previously the default was to output the sequences in the same order
+as the input file. Now the default is to use the order in which the sequences
+were aligned (from the guide tree/dendrogram), thus automatically grouping
+closely related sequences.
+
+11. The option to 'Reset Gaps between alignments' has been switched off by
+default.
+
+12. The conservation line output in the clustal format alignment file has been
+changed. Three characters are now used:
+'*' indicates positions which have a single, fully conserved residue
+':' indicates that one of the following 'strong' groups is fully conserved:-
+ STA
+ NEQK
+ NHQK
+ NDEQ
+ QHRK
+ MILV
+ MILF
+ HY
+ FYW
+
+'.' indicates that one of the following 'weaker' groups is fully conserved:-
+ CSA
+ ATV
+ SAG
+ STNK
+ STPA
+ SGND
+ SNDEQK
+ NDEQHK
+ NEQHRK
+ FVLIM
+ HFY
+
+These are all the positively scoring groups that occur in the Gonnet Pam250
+matrix. The strong and weak groups are defined as strong score >0.5 and weak
+score =<0.5 respectively.
+
+13. A bug in the modification of the Myers and Miller alignment algorithm
+for residue-specific gap penalties has been fixed. This occasionally caused
+new gaps to be opened a few residues away from the optimal position.
+
+14. The GCG/MSF input format no longer needs the word PILEUP on the first
+line. Several versions can now be recognised:-
+ 1. The word PILEUP as the first word in the file
+ 2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ as the first word in the file
+ 3. The characters MSF on the first line of the file, and the
+ characters .. at the end of that line.
+
+15. The standard command line separator for UNIX systems has been changed from
+'/' to '-'. ie. to give options on the command line, you now type
+
+ clustalw input.aln -gapopen=8.0
+
+instead of clustalw input.aln /gapopen=8.0
+
+
+ ATTENTION SOFTWARE DEVELOPERS!!
+ -------------------------------
+
+The CLUSTAL sequence alignment output format was modified from version 1.7:
+
+1. Names longer than 10 chars are now allowed. (The maximum is specified in
+clustalw.h by '#define MAXNAMES'.)
+
+2. The consensus line now consists of three characters: '*',':' and '.'. (Only
+the '*' and '.' were previously used.)
+
+3. An option (not the default) has been added, allowing the user to print out
+sequence numbers at the end of each line of the alignment output.
+
+4. Both RNA bases (U) and base ambiguities are now supported in nucleic acid
+sequences. In the past, all characters (upper or lower case) other than
+a,c,g,t or u were converted to N. Now the following characters are recognised
+and retained in the alignment output: ABCDGHKMNRSTUVWXY (upper or lower case).
+
+5. A Blank line inadvertently added in the version 1.6 header has been taken
+out again.
+
+ CLUSTAL REFERENCES
+ ------------------
+
+Details of algorithms, implementation and useful tips on usage of Clustal
+programs can be found in the following publications:
+
+Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
+Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
+
+Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
+The ClustalX windows interface: flexible strategies for multiple sequence
+alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
+
+Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
+multiple sequence alignments. Methods Enzymol., 266, 383-402.
+
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
+sensitivity of progressive multiple sequence alignment through sequence
+weighting, positions-specific gap penalties and weight matrix choice. Nucleic
+Acids Research, 22:4673-4680.
+
+Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
+multiple sequence alignment. CABIOS 8,189-191.
+
+Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
+alignments on a microcomputer. CABIOS 5,151-153.
+
+Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
+sequence alignment on a microcomputer. Gene 73,237-244.
Added: trunk/packages/clustalw/branches/upstream/current/README_X
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/README_X 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/README_X 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,392 @@
+******************************************************************************
+
+ CLUSTAL X Multiple Sequence Alignment Program
+ (version 1.83, Feb 2003)
+
+******************************************************************************
+
+This README contains notes on version CHANGES and help with INSTALLATION
+
+Clustal X provides a new window-based user interface to the Clustal W multiple
+alignment program. It uses the Vibrant multi-platform user interface
+development library, developed by the National Center for Biotechnology
+Information (Bldg 38A, NIH 8600 Rockville Pike, Bethesda, MD 20894) as part of
+their NCBI SOFTWARE DEVELOPMENT TOOLKIT. The toolkit is available by
+anonymous ftp from ncbi.nlm.nih.gov
+
+Please e-mail bug reports/complaints/suggestions (polite if possible) to
+ Julie Thompson at julie at igbmc.u-strasbg.fr
+ or Toby Gibson at gibson at embl-heidelberg.de
+
+
+******************************************************************************
+
+ POLICY ON COMMERCIAL DISTRIBUTION OF CLUSTAL W and X
+
+Clustal W and X are freely available to the user community. However, Clustal W
+is increasingly being distributed as part of commercial sequence analysis
+packages. To help us safeguard future maintenance and development, commercial
+distributors of Clustal X must take out a non-exclusive licence. Anyone
+wishing to commercially distribute version 1.81 of Clustal X should contact the
+authors unless they have previously taken out a licence.
+
+******************************************************************************
+
+Changes since CLUSTAL X Version 1.82
+------------------------------------
+
+1. The FASTA format has been added to the list of alignment output options.
+
+2. It is now possible to save the residue ranges (appended after the sequence
+names) when saving a specified range of the alignment.
+
+3. The efficiency of the neighbour-joining algorithm has been improved. This
+work was done by Tadashi Koike at the Center for Information Biology and DNA Data
+Bank of Japan and FUJITSU Limited.
+
+Some example speedups are given below : (timings on a SPARC64 CPU)
+
+No. of sequences original NJ new NJ
+ 200 0' 12" 0.1"
+ 500 9' 19" 1.4"
+ 1000 XXXX 0' 31"
+
+
+Changes since CLUSTAL X Version 1.8
+-----------------------------------
+
+1. ClustalX now returns error codes for some common errors when exiting. This
+may be useful for people who run clustalx automatically from within a script.
+Error codes are:
+ 1 bad command line option
+ 2 cannot open sequence file
+ 3 wrong format in sequence file
+ 4 sequence file contains only 1 sequence (for multiple alignments)
+
+2. Alignments can now be saved in Nexus format, for compatibility with PAUP,
+MacClade etc. For a description of the Nexus format, see:
+Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
+NEXUS: an extensible file format for systematic information.
+Systematic Biology 46:590-621.
+
+3. Phylogenetic trees can also be saved in nexus format.
+
+4. A bug causing ClustalX to crash during cut-and-paste operations has been fixed.
+
+5. A bug on PC systems, causing an error message when writing to files with
+space characters in the filename has been fixed.
+
+6. The Quality Curve is now displayed as a bar chart, instead of a line plot.
+(Thanks to Michele Clamp, michele at ebi.ac.uk, who used this format in the JalView
+editor.)
+
+7. A bug in the 'Save Profile' option, causing the default profile filename to
+be lost has been fixed.
+
+8. A ClustalX icon has been designed for MAC and PC systems.
+
+
+Changes since CLUSTAL X Version 1.65b
+-------------------------------------
+
+1. Some work has been done to automatically select the optimal parameters
+depending on the set of sequences to be aligned. The Gonnet series of residue
+comparison matrices are now used by default. The Blosum series remains as an
+option. The default gap extension penalty for proteins has been changed to 0.2
+(was 0.05). The 'delay divergent sequences' option has been changed to 30%
+residue identity (was 40%).
+
+2. The default parameters used when the 'Negative matrix' option is selected
+have been optimised. This option may help when the sequences to be aligned are
+not superposable over their whole lengths (e.g. in the presence of N/C terminal
+extensions).
+
+3. An option has been added to save the quality scores displayed underneath the
+sequence window to a text file.
+
+4. The 'Hide Low-scoring segments' option has been moved from the Low-scoring
+parameter window to the Quality menu, and has been changed to 'Show Low-scoring
+segments'.
+
+5. An option has been added to allow the user to search for a string in the
+sequences.
+
+6. An option has been added to the postscript output to print on US Letter size
+paper.
+
+7. A bug in the display of the message at the bottom of the window causing the
+text to disappear when the window was resized has been fixed.
+
+8. The font for the Help window has been changed to Courier.
+
+9. A bug in the calculation of phylogenetic trees for 2 sequences has been
+fixed.
+
+10. A command line option has been added to turn off the sequence weighting
+calculation.
+
+11. The phylogenetic tree calculation now ignores any ambiguity codes in the
+sequences.
+
+12. A bug in the memory access during the calculation of profiles has been
+fixed. (Thanks to Haruna Cofer at SGI).
+
+13. A bug has been fixed in the 'transition weight' option for nucleic acid
+sequences. (Thanks to Chanan Rubin at Compugen).
+
+14. An option has been added to allow the user to read in a series of residue
+comparison matrices from a file.
+
+15. The MSF output file format has been changed. The sequence weights
+calculated by ClustalX are now included in the header.
+
+16. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
+involved the alignment of new sequences to an existing profile using the fast
+pairwise alignment option; the second was caused by changing the default
+options for the fast pairwise alignments.
+
+17. A bug in the alignment of a small number of sequences has been fixed.
+Previously a Guide Tree was not calculated for less than 4 sequences.
+
+18. Several bugs affecting use of secondary structure masks in Clustal X (but
+not in Clustal W) have been fixed.
+
+
+Changes since Version 1.5b
+--------------------------
+
+1. The window displayed under MS Windows has previously been a fixed size. The
+window can now be resized by dragging the window frame.
+
+2. An option has been added to read in a series of comparison matrices from a
+file. This option is only applicable for protein sequences. For details of
+the file format, see the on-line documentation.
+
+3. A new DNA comparison matrix has been added. This is the default scoring
+matrix used by BESTFIT for the comparison of nucleic acid sequences. X's and N's
+are treated as matches to any IUB ambiguity symbol. All matches score 1.9; all
+mismatches for IUB symbols score 0.
+The previous system used by ClustalW, in which matches score 1.0 and mismatches
+score 0 remains as an option. All matches for IUB symbols will also score 0.
+
+4. You can now read a comparison matrix for DNA sequences from a file. The
+matrix file should be in the same format as for the Blast program.
+
+5. The 'Reset gaps before alignment' has been changed to 'Reset new gaps
+before alignments'. A new option 'Reset ALL gaps before alignment' has been
+added.
+RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
+sequences during multiple alignment if you wish to change the parameters and
+try again.
+RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
+gaps which were read in from the sequence input file.
+
+6. The 'Realign Residue Range' option has been changed. By default, gap
+opening and extension penalties are now applied to the ends of the alignment
+range in order to penalise terminal gaps. If the REALIGN SEGMENT END GAP
+PENALTIES option is switched off, gaps can be introduced at the ends of the
+residue range at no cost.
+
+7. The MSF output file format has been changed. The sequence weights calculated
+by ClustalX are now included in the header.
+
+8. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
+involved the alignment of new sequences to an existing profile using the
+fast pairwise alignment option; the second was caused by changing the default
+options for the fast pairwise alignments.
+
+9. A bug in the postscript output file has been fixed. The residue numbers
+printed at the right hand side of the alignment were not always correct.
+
+10. A bug in the alignment of a small number of sequences has been fixed.
+Previously a Guide Tree was not calculated for less than 4 sequences.
+
+11. A bug which occurred after frequent cut-and-paste operations has been
+fixed.
+
+12. A new file called clustalx.html contains an html'ised version of the
+on-line help. The file can be viewed using a World Wide Web viewer, such as
+Netscape.
+
+
+New Features since ClustalW
+---------------------------
+
+1. A subset of sequences in an alignment may be selected and realigned to a
+profile made from the unselected sequences. This may be useful when trying to
+align very divergent sequences which have been badly aligned in the initial
+full multiple alignment.
+
+
+2. A range of the sequence alignment can be selected for realignment. A new
+phylogenetic guide tree is built based only on the residue range selected.
+The selected residues are then aligned, and pasted back into the full sequence
+alignment. This may be useful for aligning small sections of the alignment
+which have been badly aligned in the full sequence alignment, or which have a
+very different guide tree structure from the tree built using the full
+sequences.
+
+
+3. Clustal X provides a versatile coloring scheme for the sequence alignment
+display. The sequences (or profiles) are colored automatically, when they are
+loaded. Sequences can be colored either by assigning a color to specific
+residues, or on the basis of an alignment consensus. In the latter case,
+the alignment consensus is calculated automatically, and the residues in each
+column are colored according to the consensus character assigned to the column.
+In this way, for example, conserved hydrophilic or hydrophobic positions can
+be highlighted.
+
+
+4. An 'Alignment Quality Score' is plotted below the alignment. This is an
+estimate of the conservation of each column in the alignment. Highly conserved
+columns will have a high quality score, less conserved positions will be
+marked by a low score.
+
+
+5. 'Exceptional' residues in the alignment that cause the low quality scores
+described above, can be highlighted. These can be expected to occur at a
+moderate frequency in all the sequences because of their steady divergence
+due to the natural processes of evolution. However, clustering of highlighted
+residues is a strong indication of misalignment.
+Occasionally, highlighted residues may also point to regions of some biological
+significance.
+
+6. Low-scoring segments in the alignment can be highlighted. The segments are
+defined as those regions which score negatively in a forward and backward
+summation of the alignment profile scores. See the online help for more
+details.
+
+7. The new GCG9 MSF,RSF formats are now recognised as input formats for
+clustalx. The alignments cannot be written out in these formats however.
+
+The code has been tested on UNIX (SGI, SUN, DIGITAL) and Macintosh. Compiled
+executables are provided for these systems. If you wish to recompile the
+source files, you will first need to install the NCBI toolkit on your machine.
+Then, to compile the program on UNIX, edit the makefile to point to your NCBI
+include and library files, and type:
+
+ make -f makefile.sun
+or make -f makefile.sgi
+or make -f makefile.osf
+
+
+To run the program, type clustalx. A window is displayed with a pull-down menu
+bar which allows all functions to be selected and all alignment parameters
+may be modified, if desired.
+
+
+Documentation for ClustalW (clustalw.doc) is included in the directory. Online
+help is also available for most options of Clustal X by selecting HELP from
+the menu bar.
+
+Help is also available on the WWW at
+
+www-igbmc.u-strasbg.fr/BioInfo/ClustalX/
+www-igbmc.u-strasbg.fr/BioInfo/ClustalW/
+www.U.arizona.edu/~schluter/ClustalW/index.html
+
+
+INSTALLATION (for Unix, PC and MAC)
+------------
+
+UNIX
+----
+
+Executables are provided in the appropriate archives for Digital UNIX 4.0 on
+Alphas, Sun OS 5.6, Silicon Graphics IRIX 6.2 and LINUX (libc6 must be
+installed). If you wish to run on another platform, you will need to recompile
+Clustal X for yourself.
+
+The executable file clustalx should be copied to one of the directories
+specified in your PATH environment variable. The files called *.par and
+clustalx_help should also be copied to the same directory.
+
+Recompiling ClustalX:
+
+First of all, you need the NCBI Vibrant toolkit installed on your machine. If
+this is not already done, you can get the toolkit by anonymous ftp to
+ncbi.nlm.nih.gov.
+You should then copy one of the makefiles supplied in the unix archives to
+'makefile' and edit it, changing the NCBI_INC and NCBI_LIB paths for your
+system.
+
+You make the program with:
+make -f makefile
+
+This produces the executable file clustalx. You can then proceed with the
+installation as described above.
+
+
+MS WINDOWS
+----------
+
+We supply an executable file (clustalx.exe) which will run under MS Windows
+(32 bit). The directory containing the executable (plus the files named *.par,
+and clustalx.hlp) should be added to your path defined in the autoexec.bat
+file.
+
+
+Recompiling ClustalX:
+
+First of all, you need the NCBI Vibrant toolkit installed on your machine. If
+this is not already done, you can get the toolkit by anonymous ftp to
+ncbi.nlm.nih.gov.
+
+A makefile is supplied which can be used as a guide for recompiling the
+ClustalX source code. You will need to edit it for your system. In
+particular the NCBI_INC and NCBI_LIB paths should point to your installation.
+
+
+MAC
+---
+
+An executable program called clustalx is supplied for Power Macintoshes.
+For 68K machines, you will need to recompile the code yourself. The
+program may need up to 10m of memory to run depending on the number and
+length of your sequences. The memory allocation can be adjusted with the
+Get Info (%I) command from the Finder if you have problems. Just double click
+the executable file name or icon and off you go (we hope). The files *.par and
+clustalx_help should be stored in the same directory as the clustalx program.
+
+Recompiling ClustalX:
+
+First of all, you need the NCBI Vibrant toolkit installed on your machine. If
+this is not already done, you can get the toolkit by anonymous ftp to
+ncbi.nlm.nih.gov.
+
+We used the Metrowerks Codewarrior C compiler to compile the ClustalX files,
+but another ANSI C compiler should work. You need to compile all the *.c
+files supplied in the archive, then link them together with the NCBI Toolkit
+libraries 'ncbi' and 'vibrant'.
+
+
+ CLUSTAL REFERENCES
+ ------------------
+
+Details of algorithms, implementation and useful tips on usage of Clustal
+programs can be found in the following publications:
+
+Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
+Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
+
+Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
+The ClustalX windows interface: flexible strategies for multiple sequence
+alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
+
+Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
+multiple sequence alignments. Methods Enzymol., 266, 383-402.
+
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
+sensitivity of progressive multiple sequence alignment through sequence
+weighting, positions-specific gap penalties and weight matrix choice. Nucleic
+Acids Research, 22:4673-4680.
+
+Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
+multiple sequence alignment. CABIOS 8,189-191.
+
+Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
+alignments on a microcomputer. CABIOS 5,151-153.
+
+Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
+sequence alignment on a microcomputer. Gene 73,237-244.
+
Added: trunk/packages/clustalw/branches/upstream/current/alnscore.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/alnscore.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/alnscore.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "clustalw.h"
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+/*
+ * Prototypes
+ */
+
+static sint count_gaps(sint s1, sint s2, sint l);
+
+/*
+ * Global Variables
+ */
+
+extern float gap_open;
+extern sint nseqs;
+extern sint *seqlen_array;
+extern short blosum45mt[];
+extern short def_aa_xref[];
+extern sint debug;
+extern sint max_aa;
+extern char **seq_array;
+
+
+/*
+ * aln_score()
+ *
+ * Computes an overall score for the current alignment and reports it
+ * through info(). For every pair of sequences the substitution score of
+ * each compared column is added (looked up in the matrix built by
+ * get_matrix() from blosum45mt), and 100 * gap_open is subtracted for each
+ * gap reported by count_gaps(). The accumulated total is divided by 100
+ * before being printed.
+ *
+ * Reads the globals nseqs, seqlen_array, seq_array, max_aa and gap_open.
+ *
+ * Return values:
+ * none
+ */
+void aln_score(void)
+{
+ static short *mat_xref, *matptr;
+ static sint maxres;
+ static sint s1,s2,c1,c2;
+ static sint ngaps;
+ static sint i,l1,l2;
+ static lint score;
+ static sint matrix[NUMRES][NUMRES];
+
+/* calculate an overall score for the alignment by summing the
+scores for each pairwise alignment */
+
+ matptr = blosum45mt;
+ mat_xref = def_aa_xref;
+ maxres = get_matrix(matptr, mat_xref, matrix, TRUE, 100);
+ if (maxres == 0)
+ {
+/* the matrix requested above is blosum45mt, so name that one in the message */
+ fprintf(stdout,"Error: matrix blosum45 not found\n");
+ return;
+ }
+
+ score=0;
+ for (s1=1;s1<=nseqs;s1++)
+ {
+ for (s2=1;s2<s1;s2++)
+ {
+
+ l1 = seqlen_array[s1];
+ l2 = seqlen_array[s2];
+/* NOTE(review): this loop (and the one in count_gaps) stops at i < length.
+   If positions are 1-based with an inclusive length, the last column is
+   never scored - confirm against the seq_array layout before changing. */
+ for (i=1;i<l1 && i<l2;i++)
+ {
+ c1 = seq_array[s1][i];
+ c2 = seq_array[s2][i];
+/* only codes in the range 0..max_aa contribute a matrix score */
+ if ((c1>=0) && (c1<=max_aa) && (c2>=0) && (c2<=max_aa))
+ score += matrix[c1][c2];
+ }
+
+ ngaps = count_gaps(s1, s2, l1);
+
+ score -= 100 * gap_open * ngaps;
+
+ }
+ }
+
+ score /= 100;
+
+ info("Alignment Score %d", (pint)score);
+
+}
+
+/*
+ * count_gaps()
+ *
+ * Counts gaps between sequences s1 and s2 over positions 1 .. l-1.
+ * A position is treated as a gap in a sequence when its code is greater
+ * than max_aa. run1/run2 hold the length of the gap run ending at each
+ * position in s1/s2; a gap is counted at a position where exactly one
+ * sequence has a gap and its previous run is not longer than the other's.
+ *
+ * Return values:
+ * the number of gaps counted
+ */
+static sint count_gaps(sint s1, sint s2, sint l)
+{
+    sint pos, total;
+    sint gap1, gap2;
+    sint *run1, *run2;
+
+    run1 = (sint *)ckalloc((l+2) * sizeof(sint));
+    run2 = (sint *)ckalloc((l+2) * sizeof(sint));
+
+    total = 0;
+    run1[0] = 0;
+    run2[0] = 0;
+
+    for (pos=1; pos<l; pos++)
+    {
+        gap1 = (seq_array[s1][pos] > max_aa) ? 1 : 0;
+        gap2 = (seq_array[s2][pos] > max_aa) ? 1 : 0;
+
+        if (gap1 && !gap2)
+        {
+            /* gap only in s1 */
+            if (run1[pos-1] <= run2[pos-1]) total++;
+        }
+        else if (!gap1 && gap2)
+        {
+            /* gap only in s2 */
+            if (run1[pos-1] >= run2[pos-1]) total++;
+        }
+
+        run1[pos] = gap1 ? run1[pos-1]+1 : 0;
+        run2[pos] = gap2 ? run2[pos-1]+1 : 0;
+    }
+
+    run1=ckfree((void *)run1);
+    run2=ckfree((void *)run2);
+
+    return(total);
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/amenu.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/amenu.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/amenu.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1317 @@
+/* Menus and command line interface for Clustal W */
+/* DES was here MARCH. 1994 */
+/* DES was here SEPT. 1994 */
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <setjmp.h>
+#include "clustalw.h"
+
+static jmp_buf jmpbuf;
+#ifndef VMS
+#ifndef AIX
+#define BADSIG (void (*)())-1
+#endif
+#endif
+
+static void jumper(int);
+
+/* Signal handler installed for SIGINT by the menus: jumps back to the
+   point saved in jmpbuf by setjmp(), which then sees the value 1. */
+static void jumper(int i)
+{
+    (void)i; /* the signal number is not used */
+    longjmp(jmpbuf,1);
+}
+
+
+/*
+* Prototypes
+*/
+
+
+static void pair_menu(void);
+static void multi_menu(void);
+static void gap_penalties_menu(void);
+static void multiple_align_menu(void); /* multiple alignments menu */
+static void profile_align_menu(void); /* profile " " */
+static void phylogenetic_tree_menu(void); /* NJ trees/distances menu */
+static void format_options_menu(void); /* format of alignment output */
+static void tree_format_options_menu(void); /* format of tree output */
+static void ss_options_menu(void);
+static sint secstroutput_options(void);
+static sint read_matrix(char *title,MatMenu menu, char *matnam, sint matn, short *mat, short *xref);
+
+/*
+* Global variables
+*/
+
+extern float gap_open, gap_extend;
+extern float dna_gap_open, dna_gap_extend;
+extern float prot_gap_open, prot_gap_extend;
+extern float pw_go_penalty, pw_ge_penalty;
+extern float dna_pw_go_penalty, dna_pw_ge_penalty;
+extern float prot_pw_go_penalty, prot_pw_ge_penalty;
+extern float transition_weight;
+extern char revision_level[];
+extern sint wind_gap,ktup,window,signif;
+extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
+extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
+extern sint nseqs;
+extern sint divergence_cutoff;
+extern sint debug;
+extern Boolean neg_matrix;
+extern Boolean quick_pairalign;
+extern Boolean reset_alignments_new; /* DES */
+extern Boolean reset_alignments_all; /* DES */
+extern sint gap_dist;
+extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
+extern sint output_order;
+extern sint profile_no;
+extern short usermat[], pw_usermat[];
+extern short aa_xref[], pw_aa_xref[];
+extern short userdnamat[], pw_userdnamat[];
+extern short dna_xref[], pw_dna_xref[];
+
+extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
+extern Boolean cl_seq_numbers;
+extern Boolean seqRange; /* to append sequence range with seq names, Ranu */
+
+extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
+extern Boolean output_fasta; /* Ramu */
+
+extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances,output_tree_nexus;
+extern sint bootstrap_format;
+extern Boolean tossgaps, kimura;
+extern Boolean percent;
+extern Boolean usemenu;
+extern Boolean showaln, save_parameters;
+extern Boolean dnaflag;
+extern Boolean use_ambiguities;
+
+
+extern char hyd_residues[];
+extern char mtrxname[], pw_mtrxname[];
+extern char dnamtrxname[], pw_dnamtrxname[];
+extern char seqname[];
+
+extern sint output_struct_penalties;
+extern Boolean use_ss1, use_ss2;
+
+extern Boolean empty;
+extern Boolean profile1_empty, profile2_empty; /* whether or not profiles */
+
+extern char profile1_name[FILENAMELEN+1];
+extern char profile2_name[FILENAMELEN+1];
+
+extern Boolean use_endgaps;
+extern sint matnum,pw_matnum;
+extern sint dnamatnum,pw_dnamatnum;
+
+extern sint helix_penalty;
+extern sint strand_penalty;
+extern sint loop_penalty;
+extern sint helix_end_minus;
+extern sint helix_end_plus;
+extern sint strand_end_minus;
+extern sint strand_end_plus;
+extern sint helix_end_penalty;
+extern sint strand_end_penalty;
+
+extern MatMenu matrix_menu;
+extern MatMenu pw_matrix_menu;
+extern MatMenu dnamatrix_menu;
+
+/* Output file names handed to the alignment and tree routines called from
+   the menus; the first four are reset to empty whenever a menu calls
+   seq_input(). */
+static char phylip_name[FILENAMELEN]="";
+static char clustal_name[FILENAMELEN]="";
+static char dist_name[FILENAMELEN]="";
+static char nexus_name[FILENAMELEN]="";
+static char fasta_name[FILENAMELEN]="";
+
+/* Tree file names passed to profile_align() for profile 1 and profile 2. */
+static char p1_tree_name[FILENAMELEN]="";
+static char p2_tree_name[FILENAMELEN]="";
+
+/* Menu labels for the secondary structure output setting; indexed by
+   output_struct_penalties (0..3). */
+static char *secstroutput_txt[] = {
+ "Secondary Structure",
+ "Gap Penalty Mask",
+ "Structure and Penalty Mask",
+ "None" };
+
+
+/* Line buffers for menu input; allocated by init_amenu(). */
+static char *lin1, *lin2, *lin3;
+
+static int firstres =0; /* range of alignment for saving as ... */
+static int lastres = 0;
+
+/*
+* init_amenu()
+*
+* Allocates the three menu input buffers lin1, lin2 and lin3 with
+* ckalloc(), each MAXLINE+1 characters long.
+*
+* Return values:
+* none
+*/
+void init_amenu(void)
+{
+    const size_t nbytes = (MAXLINE+1) * sizeof (char);
+
+    lin1 = (char *)ckalloc( nbytes );
+    lin2 = (char *)ckalloc( nbytes );
+    lin3 = (char *)ckalloc( nbytes );
+}
+
+/*
+* main_menu()
+*
+* Top-level interactive menu. Prints the menu, reads a line with getstr()
+* and dispatches on its first character (case-insensitive). The loop never
+* ends on its own; the Q/X choice leaves the program through exit(0).
+*
+* Return values:
+* none
+*/
+void main_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+ while(TRUE) {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," **************************************************************\n");
+ fprintf(stdout," ******** CLUSTAL %s Multiple Sequence Alignments ********\n",revision_level);
+ fprintf(stdout," **************************************************************\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," 1. Sequence Input From Disc\n");
+ fprintf(stdout," 2. Multiple Alignments\n");
+ fprintf(stdout," 3. Profile / Structure Alignments\n");
+ fprintf(stdout," 4. Phylogenetic trees\n");
+ fprintf(stdout,"\n");
+ fprintf(stdout," S. Execute a system command\n");
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," X. EXIT (leave program)\n\n\n");
+
+ getstr("Your choice",lin1);
+
+ switch(toupper(*lin1)) {
+/* new input: forget the output file names remembered from earlier runs */
+ case '1': seq_input(FALSE);
+ phylip_name[0]=EOS;
+ clustal_name[0]=EOS;
+ dist_name[0]=EOS;
+ nexus_name[0]=EOS;
+ break;
+ case '2': multiple_align_menu();
+ break;
+ case '3': profile_align_menu();
+ break;
+ case '4': phylogenetic_tree_menu();
+ break;
+ case 'S': do_system();
+ break;
+ case '?':
+ case 'H': get_help('1');
+ break;
+ case 'Q':
+ case 'X': exit(0);
+ break;
+ default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+
+
+
+/*
+* multiple_align_menu()
+*
+* Interactive menu for multiple alignments. Repeats until the user enters
+* an empty line, Q or X. Choices run an alignment or guide-tree step with
+* phylip_name, toggle the quick_pairalign, reset_alignments_new and showaln
+* flags, or open the parameter and output-format sub-menus.
+*
+* Return values:
+* none
+*/
+static void multiple_align_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE)
+ {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout,"****** MULTIPLE ALIGNMENT MENU ******\n");
+ fprintf(stdout,"\n\n");
+
+
+ fprintf(stdout," 1. Do complete multiple alignment now (%s)\n",
+ (!quick_pairalign) ? "Slow/Accurate" : "Fast/Approximate");
+ fprintf(stdout," 2. Produce guide tree file only\n");
+ fprintf(stdout," 3. Do alignment using old guide tree file\n\n");
+ fprintf(stdout," 4. Toggle Slow/Fast pairwise alignments = %s\n\n",
+ (!quick_pairalign) ? "SLOW" : "FAST");
+ fprintf(stdout," 5. Pairwise alignment parameters\n");
+ fprintf(stdout," 6. Multiple alignment parameters\n\n");
+ fprintf(stdout," 7. Reset gaps before alignment?");
+ if(reset_alignments_new)
+ fprintf(stdout," = ON\n");
+ else
+ fprintf(stdout," = OFF\n");
+ fprintf(stdout," 8. Toggle screen display = %s\n",
+ (!showaln) ? "OFF" : "ON");
+ fprintf(stdout," 9. Output format options\n");
+ fprintf(stdout,"\n");
+
+ fprintf(stdout," S. Execute a system command\n");
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+ getstr("Your choice",lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+ {
+ case '1': align(phylip_name);
+ break;
+ case '2': make_tree(phylip_name);
+ break;
+ case '3': get_tree(phylip_name);
+ break;
+ case '4': quick_pairalign ^= TRUE;
+ break;
+ case '5': pair_menu();
+ break;
+ case '6': multi_menu();
+ break;
+/* turning reset_alignments_new on also turns reset_alignments_all off */
+ case '7': reset_alignments_new ^= TRUE;
+ if(reset_alignments_new==TRUE)
+ reset_alignments_all=FALSE;
+ break;
+ case '8': showaln ^= TRUE;
+ break;
+ case '9': format_options_menu();
+ break;
+ case 'S': do_system();
+ break;
+ case '?':
+ case 'H': get_help('2');
+ break;
+ case 'Q':
+ case 'X': return;
+
+ default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+
+
+
+/*
+* profile_align_menu()
+*
+* Interactive menu for profile and structure alignments. Repeats until the
+* user enters an empty line, Q or X. Choices load profile 1 or 2, run a
+* profile or sequence-to-profile alignment, toggle the quick_pairalign and
+* showaln flags, or open the parameter, output-format and secondary
+* structure sub-menus.
+*
+* Return values:
+* none
+*/
+static void profile_align_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE)
+ {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout,"****** PROFILE AND STRUCTURE ALIGNMENT MENU ******\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," 1. Input 1st. profile ");
+ if (!profile1_empty) fprintf(stdout,"(loaded)");
+ fprintf(stdout,"\n");
+ fprintf(stdout," 2. Input 2nd. profile/sequences ");
+ if (!profile2_empty) fprintf(stdout,"(loaded)");
+ fprintf(stdout,"\n\n");
+ fprintf(stdout," 3. Align 2nd. profile to 1st. profile\n");
+ fprintf(stdout," 4. Align sequences to 1st. profile (%s)\n\n",
+ (!quick_pairalign) ? "Slow/Accurate" : "Fast/Approximate");
+ fprintf(stdout," 5. Toggle Slow/Fast pairwise alignments = %s\n\n",
+ (!quick_pairalign) ? "SLOW" : "FAST");
+ fprintf(stdout," 6. Pairwise alignment parameters\n");
+ fprintf(stdout," 7. Multiple alignment parameters\n\n");
+ fprintf(stdout," 8. Toggle screen display = %s\n",
+ (!showaln) ? "OFF" : "ON");
+ fprintf(stdout," 9. Output format options\n");
+ fprintf(stdout," 0. Secondary structure options\n");
+ fprintf(stdout,"\n");
+ fprintf(stdout," S. Execute a system command\n");
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+ getstr("Your choice",lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+ {
+/* NOTE(review): the copies from seqname below are unbounded; the size of
+   seqname is not visible here - confirm it fits in FILENAMELEN+1 */
+ case '1': profile_no = 1; /* 1 => 1st profile */
+ profile_input();
+ strcpy(profile1_name, seqname);
+ break;
+ case '2': profile_no = 2; /* 2 => 2nd profile */
+ profile_input();
+ strcpy(profile2_name, seqname);
+ break;
+ case '3': profile_align(p1_tree_name,p2_tree_name); /* align the 2 alignments now */
+ break;
+ case '4': new_sequence_align(phylip_name); /* align new sequences to profile 1 */
+ break;
+ case '5': quick_pairalign ^= TRUE;
+ break;
+ case '6': pair_menu();
+ break;
+ case '7': multi_menu();
+ break;
+ case '8': showaln ^= TRUE;
+ break;
+ case '9': format_options_menu();
+ break;
+ case '0': ss_options_menu();
+ break;
+ case 'S': do_system();
+ break;
+ case '?':
+ case 'H': get_help('6');
+ break;
+ case 'Q':
+ case 'X': return;
+
+ default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+/*
+* ss_options_menu()
+*
+* Interactive menu for the secondary structure options. Repeats until the
+* user enters an empty line. Choices toggle use_ss1 / use_ss2, select the
+* structure output setting, or prompt with getint() for the helix, strand,
+* loop and terminal penalties and the terminal position counts.
+*
+* Return values:
+* none
+*/
+static void ss_options_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE) {
+
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* SECONDARY STRUCTURE OPTIONS *********\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," 1. Use profile 1 secondary structure / penalty mask ");
+ if(use_ss1)
+ fprintf(stdout,"= YES\n");
+ else
+ fprintf(stdout,"= NO\n");
+ fprintf(stdout," 2. Use profile 2 secondary structure / penalty mask ");
+ if(use_ss2)
+ fprintf(stdout,"= YES\n");
+ else
+ fprintf(stdout,"= NO\n");
+ fprintf(stdout,"\n");
+ fprintf(stdout," 3. Output in alignment ");
+ fprintf(stdout,"= %s\n",secstroutput_txt[output_struct_penalties]);
+ fprintf(stdout,"\n");
+
+ fprintf(stdout," 4. Helix gap penalty :%d\n",(pint)helix_penalty);
+ fprintf(stdout," 5. Strand gap penalty :%d\n",(pint)strand_penalty);
+ fprintf(stdout," 6. Loop gap penalty :%d\n",(pint)loop_penalty);
+
+ fprintf(stdout," 7. Secondary structure terminal penalty :%d\n",(pint)helix_end_penalty);
+ fprintf(stdout," 8. Helix terminal positions within :%d outside :%d\n",
+ (pint)helix_end_minus,(pint)helix_end_plus);
+ fprintf(stdout," 9. Strand terminal positions within :%d outside :%d\n",
+ (pint)strand_end_minus,(pint)strand_end_plus);
+
+ fprintf(stdout,"\n\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+ if( *lin2 == EOS) {
+ return;
+ }
+
+ switch(toupper(*lin2)) {
+ case '1': use_ss1 ^= TRUE;
+ break;
+ case '2': use_ss2 ^= TRUE;
+ break;
+ case '3': output_struct_penalties = secstroutput_options();
+ break;
+ case '4':
+ fprintf(stdout,"Helix Penalty Currently: %d\n",(pint)helix_penalty);
+ helix_penalty=getint("Enter number",1,9,helix_penalty);
+ break;
+ case '5':
+ fprintf(stdout,"Strand Gap Penalty Currently: %d\n",(pint)strand_penalty);
+ strand_penalty=getint("Enter number",1,9,strand_penalty);
+ break;
+ case '6':
+ fprintf(stdout,"Loop Gap Penalty Currently: %d\n",(pint)loop_penalty);
+ loop_penalty=getint("Enter number",1,9,loop_penalty);
+ break;
+/* one terminal penalty is entered; it is stored for both helix and strand */
+ case '7':
+ fprintf(stdout,"Secondary Structure Terminal Penalty Currently: %d\n",
+ (pint)helix_end_penalty);
+ helix_end_penalty=getint("Enter number",1,9,helix_end_penalty);
+ strand_end_penalty = helix_end_penalty;
+ break;
+ case '8':
+ fprintf(stdout,"Helix Terminal Positions Currently: \n");
+ fprintf(stdout," within helix: %d outside helix: %d\n",
+ (pint)helix_end_minus,(pint)helix_end_plus);
+ helix_end_minus=getint("Enter number of residues within helix",0,3,helix_end_minus);
+ helix_end_plus=getint("Enter number of residues outside helix",0,3,helix_end_plus);
+ break;
+ case '9':
+ fprintf(stdout,"Strand Terminal Positions Currently: \n");
+ fprintf(stdout," within strand: %d outside strand: %d\n",
+ (pint)strand_end_minus,(pint)strand_end_plus);
+ strand_end_minus=getint("Enter number of residues within strand",0,3,strand_end_minus);
+ strand_end_plus=getint("Enter number of residues outside strand",0,3,strand_end_plus);
+ break;
+ case '?':
+ case 'H':
+ get_help('B');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+/*
+* secstroutput_options()
+*
+* Interactive menu that selects what secondary structure information is
+* written with the alignment. The four choices correspond to the entries
+* of secstroutput_txt[].
+*
+* Return values:
+* 0..3 for menu choices 1..4; the current value of
+* output_struct_penalties when the user enters an empty line, Q or X.
+*/
+static sint secstroutput_options(void)
+{
+
+ while(TRUE)
+ {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* Secondary Structure Output Menu *********\n");
+ fprintf(stdout,"\n\n");
+
+
+ fprintf(stdout," 1. %s\n",secstroutput_txt[0]);
+ fprintf(stdout," 2. %s\n",secstroutput_txt[1]);
+ fprintf(stdout," 3. %s\n",secstroutput_txt[2]);
+ fprintf(stdout," 4. %s\n",secstroutput_txt[3]);
+ fprintf(stdout," H. HELP\n\n");
+ fprintf(stdout,
+" -- Current output is %s ",secstroutput_txt[output_struct_penalties]);
+ fprintf(stdout,"--\n");
+
+
+ getstr("\n\nEnter number (or [RETURN] to exit)",lin2);
+ if(*lin2 == EOS) return(output_struct_penalties);
+
+ switch(toupper(*lin2))
+ {
+ case '1': return(0);
+ case '2': return(1);
+ case '3': return(2);
+ case '4': return(3);
+ case '?':
+ case 'H': get_help('C');
+/* show the menu again; previously this fell through and returned 0 */
+ break;
+/* leaving the menu must not change the setting, same as [RETURN] */
+ case 'Q':
+ case 'X': return(output_struct_penalties);
+
+ default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+
+/*
+* phylogenetic_tree_menu()
+*
+* Interactive menu for phylogenetic trees. Repeats until the user enters an
+* empty line, Q or X. Choices read an alignment, toggle the tossgaps and
+* kimura flags, draw or bootstrap a tree, or open the tree output-format
+* sub-menu.
+*
+* Return values:
+* none
+*/
+static void phylogenetic_tree_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE)
+ {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout,"****** PHYLOGENETIC TREE MENU ******\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," 1. Input an alignment\n");
+ fprintf(stdout," 2. Exclude positions with gaps? ");
+ if(tossgaps)
+ fprintf(stdout,"= ON\n");
+ else
+ fprintf(stdout,"= OFF\n");
+ fprintf(stdout," 3. Correct for multiple substitutions? ");
+ if(kimura)
+ fprintf(stdout,"= ON\n");
+ else
+ fprintf(stdout,"= OFF\n");
+ fprintf(stdout," 4. Draw tree now\n");
+ fprintf(stdout," 5. Bootstrap tree\n");
+ fprintf(stdout," 6. Output format options\n");
+ fprintf(stdout,"\n");
+ fprintf(stdout," S. Execute a system command\n");
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+ getstr("Your choice",lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+ {
+/* new input: forget the output file names remembered from earlier runs */
+ case '1': seq_input(FALSE);
+ phylip_name[0]=EOS;
+ clustal_name[0]=EOS;
+ dist_name[0]=EOS;
+ nexus_name[0]=EOS;
+ break;
+ case '2': tossgaps ^= TRUE;
+ break;
+ case '3': kimura ^= TRUE;;
+ break;
+ case '4': phylogenetic_tree(phylip_name,clustal_name,dist_name,nexus_name,"amenu.pim");
+ break;
+ case '5': bootstrap_tree(phylip_name,clustal_name,nexus_name);
+ break;
+ case '6': tree_format_options_menu();
+ break;
+ case 'S': do_system();
+ break;
+ case '?':
+ case 'H': get_help('7');
+ break;
+ case 'Q':
+ case 'X': return;
+
+ default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+/*
+* tree_format_options_menu()
+*
+* Interactive menu for the tree output formats. Repeats until the user
+* enters an empty line. Choices 1-4 toggle the output_tree_* flags and
+* choice 5 switches bootstrap_format between BS_NODE_LABELS and
+* BS_BRANCH_LABELS.
+*
+* Return values:
+* none
+*/
+static void tree_format_options_menu(void) /* format of tree output */
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE) {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ****** Format of Phylogenetic Tree Output ******\n");
+ fprintf(stdout,"\n\n");
+ fprintf(stdout," 1. Toggle CLUSTAL format tree output = %s\n",
+ (!output_tree_clustal) ? "OFF" : "ON");
+ fprintf(stdout," 2. Toggle Phylip format tree output = %s\n",
+ (!output_tree_phylip) ? "OFF" : "ON");
+ fprintf(stdout," 3. Toggle Phylip distance matrix output = %s\n",
+ (!output_tree_distances)? "OFF" : "ON");
+ fprintf(stdout," 4. Toggle Nexus format tree output = %s\n\n",
+ (!output_tree_nexus)? "OFF" : "ON");
+ fprintf(stdout," 5. Toggle Phylip bootstrap positions = %s\n\n",
+(bootstrap_format==BS_NODE_LABELS) ? "NODE LABELS" : "BRANCH LABELS");
+ fprintf(stdout,"\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+ if(*lin2 == EOS) return;
+
+ switch(toupper(*lin2)) {
+ case '1':
+ output_tree_clustal ^= TRUE;
+ break;
+ case '2':
+ output_tree_phylip ^= TRUE;
+ break;
+ case '3':
+ output_tree_distances ^= TRUE;
+ break;
+ case '4':
+ output_tree_nexus ^= TRUE;
+ break;
+ case '5':
+ if (bootstrap_format == BS_NODE_LABELS)
+ bootstrap_format = BS_BRANCH_LABELS;
+ else
+ bootstrap_format = BS_NODE_LABELS;
+ break;
+ case '?':
+ case 'H':
+ get_help('0');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+/*
+* format_options_menu()
+*
+* Interactive menu for the alignment output formats. Repeats until the
+* user enters an empty line. Most choices toggle one output flag; choice 9
+* switches output_order between INPUT and ALIGNED; choice 0 writes the
+* alignment output file(s) immediately for sequences 1..nseqs, provided
+* sequences are loaded.
+*
+* Return values:
+* none
+*/
+static void format_options_menu(void) /* format of alignment output */
+{
+ char path[FILENAMELEN+1];
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE) {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* Format of Alignment Output *********\n");
+ fprintf(stdout,"\n\n");
+ fprintf(stdout," F. Toggle FASTA format output = %s\n\n",
+ (!output_fasta) ? "OFF" : "ON");
+ fprintf(stdout," 1. Toggle CLUSTAL format output = %s\n",
+ (!output_clustal) ? "OFF" : "ON");
+ fprintf(stdout," 2. Toggle NBRF/PIR format output = %s\n",
+ (!output_nbrf) ? "OFF" : "ON");
+ fprintf(stdout," 3. Toggle GCG/MSF format output = %s\n",
+ (!output_gcg) ? "OFF" : "ON");
+ fprintf(stdout," 4. Toggle PHYLIP format output = %s\n",
+ (!output_phylip) ? "OFF" : "ON");
+ fprintf(stdout," 5. Toggle NEXUS format output = %s\n",
+ (!output_nexus) ? "OFF" : "ON");
+ fprintf(stdout," 6. Toggle GDE format output = %s\n\n",
+ (!output_gde) ? "OFF" : "ON");
+ fprintf(stdout," 7. Toggle GDE output case = %s\n",
+ (!lowercase) ? "UPPER" : "LOWER");
+
+ fprintf(stdout," 8. Toggle CLUSTALW sequence numbers = %s\n",
+ (!cl_seq_numbers) ? "OFF" : "ON");
+ fprintf(stdout," 9. Toggle output order = %s\n\n",
+ (output_order==0) ? "INPUT FILE" : "ALIGNED");
+
+ fprintf(stdout," 0. Create alignment output file(s) now?\n\n");
+ fprintf(stdout," T. Toggle parameter output = %s\n",
+ (!save_parameters) ? "OFF" : "ON");
+ fprintf(stdout," R. Toggle sequence range numbers = %s\n",
+ (!seqRange) ? "OFF" : "ON");
+ fprintf(stdout,"\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+ if(*lin2 == EOS) return;
+
+ switch(toupper(*lin2)) {
+ case '1':
+ output_clustal ^= TRUE;
+ break;
+ case '2':
+ output_nbrf ^= TRUE;
+ break;
+ case '3':
+ output_gcg ^= TRUE;
+ break;
+ case '4':
+ output_phylip ^= TRUE;
+ break;
+ case '5':
+ output_nexus ^= TRUE;
+ break;
+ case '6':
+ output_gde ^= TRUE;
+ break;
+ case '7':
+ lowercase ^= TRUE;
+ break;
+ case '8':
+ cl_seq_numbers ^= TRUE;
+ break;
+ case '9':
+ if (output_order == INPUT) output_order = ALIGNED;
+ else output_order = INPUT;
+ break;
+ case 'F':
+ output_fasta ^= TRUE;
+ break;
+ case 'R':
+ seqRange ^= TRUE;
+ break;
+
+ case '0': /* DES */
+ if(empty) {
+ error("No sequences loaded");
+ break;
+ }
+/* the output path is derived from the input sequence file name */
+ get_path(seqname,path);
+ if(!open_alignment_output(path)) break;
+ create_alignment_output(1,nseqs);
+ break;
+ case 'T': save_parameters ^= TRUE;
+ break;
+ case '?':
+ case 'H':
+ get_help('5');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+* pair_menu()
+*
+* Interactive menu for the pairwise alignment parameters. On entry the
+* working parameters (pw_go_penalty, pw_ge_penalty, ktup, window, signif,
+* wind_gap) are loaded from the DNA or protein set according to dnaflag;
+* when the user leaves with an empty line they are stored back into the
+* same set.
+*
+* Return values:
+* none
+*/
+static void pair_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ if(dnaflag) {
+ pw_go_penalty = dna_pw_go_penalty;
+ pw_ge_penalty = dna_pw_ge_penalty;
+ ktup = dna_ktup;
+ window = dna_window;
+ signif = dna_signif;
+ wind_gap = dna_wind_gap;
+
+ }
+ else {
+ pw_go_penalty = prot_pw_go_penalty;
+ pw_ge_penalty = prot_pw_ge_penalty;
+ ktup = prot_ktup;
+ window = prot_window;
+ signif = prot_signif;
+ wind_gap = prot_wind_gap;
+
+ }
+
+ while(TRUE) {
+
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* PAIRWISE ALIGNMENT PARAMETERS *********\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," Slow/Accurate alignments:\n\n");
+
+ fprintf(stdout," 1. Gap Open Penalty :%4.2f\n",pw_go_penalty);
+ fprintf(stdout," 2. Gap Extension Penalty :%4.2f\n",pw_ge_penalty);
+/* pw_matnum is chosen from pw_matrix_menu (see case '3'), so the title
+   must be looked up in that same menu */
+ fprintf(stdout," 3. Protein weight matrix :%s\n" ,
+ pw_matrix_menu.opt[pw_matnum-1].title);
+ fprintf(stdout," 4. DNA weight matrix :%s\n" ,
+ dnamatrix_menu.opt[pw_dnamatnum-1].title);
+ fprintf(stdout,"\n");
+
+ fprintf(stdout," Fast/Approximate alignments:\n\n");
+
+ fprintf(stdout," 5. Gap penalty :%d\n",(pint)wind_gap);
+ fprintf(stdout," 6. K-tuple (word) size :%d\n",(pint)ktup);
+ fprintf(stdout," 7. No. of top diagonals :%d\n",(pint)signif);
+ fprintf(stdout," 8. Window size :%d\n\n",(pint)window);
+
+ fprintf(stdout," 9. Toggle Slow/Fast pairwise alignments ");
+ if(quick_pairalign)
+ fprintf(stdout,"= FAST\n\n");
+ else
+ fprintf(stdout,"= SLOW\n\n");
+
+
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+ if( *lin2 == EOS) {
+/* leaving the menu: save the working values back to the active set */
+ if(dnaflag) {
+ dna_pw_go_penalty = pw_go_penalty;
+ dna_pw_ge_penalty = pw_ge_penalty;
+ dna_ktup = ktup;
+ dna_window = window;
+ dna_signif = signif;
+ dna_wind_gap = wind_gap;
+
+ }
+ else {
+ prot_pw_go_penalty = pw_go_penalty;
+ prot_pw_ge_penalty = pw_ge_penalty;
+ prot_ktup = ktup;
+ prot_window = window;
+ prot_signif = signif;
+ prot_wind_gap = wind_gap;
+
+ }
+
+ return;
+ }
+
+ switch(toupper(*lin2)) {
+ case '1':
+ fprintf(stdout,"Gap Open Penalty Currently: %4.2f\n",pw_go_penalty);
+ pw_go_penalty=(float)getreal("Enter number",(double)0.0,(double)100.0,(double)pw_go_penalty);
+ break;
+ case '2':
+ fprintf(stdout,"Gap Extension Penalty Currently: %4.2f\n",pw_ge_penalty);
+ pw_ge_penalty=(float)getreal("Enter number",(double)0.0,(double)10.0,(double)pw_ge_penalty);
+ break;
+ case '3':
+ pw_matnum = read_matrix("PROTEIN",pw_matrix_menu,pw_mtrxname,pw_matnum,pw_usermat,pw_aa_xref);
+ break;
+ case '4':
+ pw_dnamatnum = read_matrix("DNA",dnamatrix_menu,pw_dnamtrxname,pw_dnamatnum,pw_userdnamat,pw_dna_xref);
+ break;
+ case '5':
+ fprintf(stdout,"Gap Penalty Currently: %d\n",(pint)wind_gap);
+ wind_gap=getint("Enter number",1,500,wind_gap);
+ break;
+/* the accepted k-tuple range is 1..4 for DNA and 1..2 otherwise */
+ case '6':
+ fprintf(stdout,"K-tuple Currently: %d\n",(pint)ktup);
+ if(dnaflag)
+ ktup=getint("Enter number",1,4,ktup);
+ else
+ ktup=getint("Enter number",1,2,ktup);
+ break;
+ case '7':
+ fprintf(stdout,"Top diagonals Currently: %d\n",(pint)signif);
+ signif=getint("Enter number",1,50,signif);
+ break;
+ case '8':
+ fprintf(stdout,"Window size Currently: %d\n",(pint)window);
+ window=getint("Enter number",1,50,window);
+ break;
+ case '9': quick_pairalign ^= TRUE;
+ break;
+ case '?':
+ case 'H':
+ get_help('3');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+/*
+* multi_menu()
+*
+* Interactive menu for the multiple alignment parameters. On entry
+* gap_open and gap_extend are loaded from the DNA or protein values
+* according to dnaflag; when the user leaves with an empty line they are
+* stored back into the same pair.
+*
+* Return values:
+* none
+*/
+static void multi_menu(void)
+{
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ if(dnaflag) {
+ gap_open = dna_gap_open;
+ gap_extend = dna_gap_extend;
+ }
+ else {
+ gap_open = prot_gap_open;
+ gap_extend = prot_gap_extend;
+ }
+
+ while(TRUE) {
+
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* MULTIPLE ALIGNMENT PARAMETERS *********\n");
+ fprintf(stdout,"\n\n");
+
+ fprintf(stdout," 1. Gap Opening Penalty :%4.2f\n",gap_open);
+ fprintf(stdout," 2. Gap Extension Penalty :%4.2f\n",gap_extend);
+
+ fprintf(stdout," 3. Delay divergent sequences :%d %%\n\n",(pint)divergence_cutoff);
+
+ fprintf(stdout," 4. DNA Transitions Weight :%1.2f\n\n",transition_weight);
+ fprintf(stdout," 5. Protein weight matrix :%s\n"
+ ,matrix_menu.opt[matnum-1].title);
+ fprintf(stdout," 6. DNA weight matrix :%s\n"
+ ,dnamatrix_menu.opt[dnamatnum-1].title);
+ fprintf(stdout," 7. Use negative matrix :%s\n\n",(!neg_matrix) ? "OFF" : "ON");
+ fprintf(stdout," 8. Protein Gap Parameters\n\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+
+ if(*lin2 == EOS) {
+/* leaving the menu: save the working values back to the active pair */
+ if(dnaflag) {
+ dna_gap_open = gap_open;
+ dna_gap_extend = gap_extend;
+ }
+ else {
+ prot_gap_open = gap_open;
+ prot_gap_extend = gap_extend;
+ }
+ return;
+ }
+
+ switch(toupper(*lin2)) {
+ case '1':
+ fprintf(stdout,"Gap Opening Penalty Currently: %4.2f\n",gap_open);
+ gap_open=(float)getreal("Enter number",(double)0.0,(double)100.0,(double)gap_open);
+ break;
+ case '2':
+ fprintf(stdout,"Gap Extension Penalty Currently: %4.2f\n",gap_extend);
+ gap_extend=(float)getreal("Enter number",(double)0.0,(double)10.0,(double)gap_extend);
+ break;
+ case '3':
+ fprintf(stdout,"Min Identity Currently: %d\n",(pint)divergence_cutoff);
+ divergence_cutoff=getint("Enter number",0,100,divergence_cutoff);
+ break;
+ case '4':
+/* %f needs a floating-point argument, so the float is passed without an integer cast */
+ fprintf(stdout,"Transition Weight Currently: %1.2f\n",transition_weight);
+ transition_weight=(float)getreal("Enter number",(double)0.0,(double)1.0,(double)transition_weight);
+ break;
+ case '5':
+ matnum = read_matrix("PROTEIN",matrix_menu,mtrxname,matnum,usermat,aa_xref);
+ break;
+ case '6':
+ dnamatnum = read_matrix("DNA",dnamatrix_menu,dnamtrxname,dnamatnum,userdnamat,dna_xref);
+ break;
+ case '7':
+ neg_matrix ^= TRUE;
+ break;
+ case '8':
+ gap_penalties_menu();
+ break;
+ case '?':
+ case 'H':
+ get_help('4');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+
+
+/*
+* gap_penalties_menu()
+*
+* Interactive menu for the protein gap parameters. Repeats until the user
+* enters an empty line. Choices toggle no_pref_penalties,
+* no_hyd_penalties and use_endgaps, replace the hyd_residues string, or
+* prompt for gap_dist.
+*
+* Return values:
+* none
+*/
+static void gap_penalties_menu(void)
+{
+ char c;
+ sint i;
+ sint maxlen;
+ int catchint;
+
+/* set SIGINT to ignored; catchint is non-zero unless it was already ignored */
+ catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
+ if (catchint) {
+/* a longjmp() from jumper() resumes here with a non-zero value */
+ if (setjmp(jmpbuf) != 0)
+ fprintf(stdout,"\n.. Interrupt\n");
+#ifdef UNIX
+ if (signal(SIGINT,jumper) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#else
+ if (signal(SIGINT,SIG_DFL) == BADSIG)
+ fprintf(stdout,"Error: signal\n");
+#endif
+ }
+
+
+ while(TRUE) {
+
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," ********* PROTEIN GAP PARAMETERS *********\n");
+ fprintf(stdout,"\n\n\n");
+
+ fprintf(stdout," 1. Toggle Residue-Specific Penalties :%s\n\n",(no_pref_penalties) ? "OFF" : "ON");
+ fprintf(stdout," 2. Toggle Hydrophilic Penalties :%s\n",(no_hyd_penalties) ? "OFF" : "ON");
+ fprintf(stdout," 3. Hydrophilic Residues :%s\n\n"
+ ,hyd_residues);
+ fprintf(stdout," 4. Gap Separation Distance :%d\n",(pint)gap_dist);
+ fprintf(stdout," 5. Toggle End Gap Separation :%s\n\n",(!use_endgaps) ? "OFF" : "ON");
+ fprintf(stdout," H. HELP\n\n\n");
+
+ getstr("Enter number (or [RETURN] to exit)",lin2);
+
+ if(*lin2 == EOS) return;
+
+/* <ctype.h> functions need an unsigned char value (or EOF) */
+ switch(toupper((unsigned char)*lin2)) {
+ case '1':
+ no_pref_penalties ^= TRUE;
+ break;
+ case '2':
+ no_hyd_penalties ^= TRUE;
+ break;
+ case '3':
+ fprintf(stdout,"Hydrophilic Residues Currently: %s\n",hyd_residues);
+
+ getstr("Enter residues (or [RETURN] to quit)",lin1);
+ if (*lin1 != EOS) {
+/* The new list is written over the old one and never made longer than the
+   current string (or 26 letters): the size of hyd_residues is not visible
+   here. The length is taken once, before the string is modified. */
+ maxlen = (sint)strlen(hyd_residues);
+ for (i=0;i<maxlen && i<26;i++) {
+ c = lin1[i];
+/* stop at the first non-letter, which includes the end of the input */
+ if (isalpha((unsigned char)c))
+ hyd_residues[i] = (char)toupper((unsigned char)c);
+ else
+ break;
+ }
+ hyd_residues[i] = EOS;
+ }
+ break;
+ case '4':
+ fprintf(stdout,"Gap Separation Distance Currently: %d\n",(pint)gap_dist);
+ gap_dist=getint("Enter number",0,100,gap_dist);
+ break;
+ case '5':
+ use_endgaps ^= TRUE;
+ break;
+ case '?':
+ case 'H':
+ get_help('A');
+ break;
+ default:
+ fprintf(stdout,"\n\nUnrecognised Command\n\n");
+ break;
+ }
+ }
+}
+
+
+
+
+/*
+* read_matrix()
+*
+* Interactive menu for choosing a weight matrix. title is printed in the
+* menu heading and menu supplies the options. Choosing one of the first
+* noptions-1 entries copies that entry's string into matnam; choosing the
+* last entry calls user_mat() with mat and xref and, if that succeeds,
+* copies the user file name into matnam.
+*
+* Return values:
+* the 1-based number of the selected option, or matn unchanged if the
+* user leaves without making a new selection
+*/
+static sint read_matrix(char *title,MatMenu menu, char *matnam, sint matn, short *mat, short *xref)
+{
+    static char userfile[FILENAMELEN+1];
+    int opt;
+    int key;
+    int choice;
+
+    for(;;)
+    {
+        fprintf(stdout,"\n\n\n");
+        fprintf(stdout," ********* %s WEIGHT MATRIX MENU *********\n",title);
+        fprintf(stdout,"\n\n");
+
+        for(opt=0;opt<menu.noptions;opt++)
+            fprintf(stdout," %d. %s\n",opt+1,menu.opt[opt].title);
+        fprintf(stdout," H. HELP\n\n");
+        fprintf(stdout,
+" -- Current matrix is the %s ",menu.opt[matn-1].title);
+        /* the last option is the user-supplied matrix file */
+        if(matn == menu.noptions) fprintf(stdout,"(file = %s)",userfile);
+        fprintf(stdout,"--\n");
+
+
+        getstr("\n\nEnter number (or [RETURN] to exit)",lin2);
+        if(*lin2 == EOS) break;
+
+        key = toupper(*lin2);
+        choice = key-'0';
+
+        if(choice>0 && choice<menu.noptions) {
+            strcpy(matnam,menu.opt[choice-1].string);
+            matn=choice;
+            continue;
+        }
+
+        if (choice==menu.noptions) {
+            if(user_mat(userfile, mat, xref)) {
+                strcpy(matnam,userfile);
+                matn=choice;
+            }
+            continue;
+        }
+
+        if (key=='?' || key=='H')
+            get_help('8');
+        else
+            fprintf(stdout,"\n\nUnrecognised Command\n\n");
+    }
+
+    return(matn);
+}
+
+
+/*
+* prompt_for_yes_no()
+*
+* Prints title on its own line, then asks prompt followed by
+* "(y/n) ? [y]" through getstr(). A prompt too long for the local buffer
+* is truncated rather than overflowing it.
+*
+* Return values:
+* 'n' if the reply starts with 'n' or 'N', otherwise 'y'
+*/
+char prompt_for_yes_no(char *title,char *prompt)
+{
+ static const char suffix[] = "(y/n) ? [y]";
+ char line[80];
+ char lin2[80];
+
+ fprintf(stdout,"\n%s\n",title);
+/* sizeof(suffix) counts its terminator, so this leaves exactly enough
+   room in line[] for the suffix and the final NUL */
+ line[0] = '\0';
+ strncat(line, prompt, sizeof(line) - sizeof(suffix));
+ strcat(line, suffix);
+/* NOTE(review): getstr() is given an 80-byte buffer here; whether it can
+   write more than that is not visible from this function - confirm */
+ getstr(line,lin2);
+ if ((*lin2 != 'n') && (*lin2 != 'N'))
+ return('y');
+ else
+ return('n');
+
+}
+
+
+/*
+* fatal()
+*
+* Writes "FATAL ERROR: " followed by the formatted message to stdout and
+* then ends the program with exit status 1. msg is a printf-style format
+* string; any further arguments supply its values.
+*
+* Return values:
+* none (does not return)
+*/
+
+void fatal( char *msg,...)
+{
+    va_list args;
+
+    fputs("\n\nFATAL ERROR: ", stdout);
+    va_start(args,msg);
+    vfprintf(stdout,msg,args);
+    va_end(args);
+    fputs("\n\n", stdout);
+    exit(1);
+}
+
+/*
+* error()
+*
+* Writes "ERROR: " followed by the formatted message to stdout. msg is a
+* printf-style format string; any further arguments supply its values.
+*
+* Return values:
+* none
+*/
+
+void error( char *msg,...)
+{
+    va_list args;
+
+    fputs("\n\nERROR: ", stdout);
+    va_start(args,msg);
+    vfprintf(stdout,msg,args);
+    va_end(args);
+    fputs("\n\n", stdout);
+}
+
+/*
+* warning()
+*
+* Writes "WARNING: " followed by the formatted message to stdout. msg is
+* a printf-style format string; any further arguments supply its values.
+*
+* Return values:
+* none
+*/
+
+void warning( char *msg,...)
+{
+    va_list args;
+
+    fputs("\n\nWARNING: ", stdout);
+    va_start(args,msg);
+    vfprintf(stdout,msg,args);
+    va_end(args);
+    fputs("\n\n", stdout);
+}
+
+/*
+* info()
+*
+* Writes a newline followed by the formatted message to stdout; no
+* trailing newline is added. msg is a printf-style format string; any
+* further arguments supply its values.
+*
+* Return values:
+* none
+*/
+
+void info( char *msg,...)
+{
+    va_list args;
+
+    fputs("\n", stdout);
+    va_start(args,msg);
+    vfprintf(stdout,msg,args);
+    va_end(args);
+}
Added: trunk/packages/clustalw/branches/upstream/current/calcgapcoeff.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/calcgapcoeff.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/calcgapcoeff.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,497 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "clustalw.h"
+
+
+/*
+ * Prototypes
+ */
+void calc_p_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
+void calc_h_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
+void calc_v_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
+sint local_penalty(sint penalty, sint n, sint *pweight, sint *hweight, sint *vweight);
+float percentid(char *s1, char *s2,sint length);
+/*
+ * Global variables
+ */
+
+extern sint gap_dist;
+extern sint max_aa;
+extern sint debug;
+extern Boolean dnaflag;
+extern Boolean use_endgaps;
+extern Boolean endgappenalties;
+extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
+extern char hyd_residues[];
+extern char *amino_acid_codes;
+
+/* vwindow is the number of residues used for a window for the variable zone penalties */
+/* vll is the lower limit for the variable zone penalties (vll < pen < 1.0) */
+int vll=50; /* calc_v_penalties() raises any window weight below this value up to it (weights there are on a 0-100 scale) */
+int vwindow=5; /* calc_v_penalties() examines columns i-vwindow up to, but not including, i+vwindow */
+
+sint vlut[26][26] = { /* identity table indexed by (letter - 'A') for both residues: 1 on the diagonal, 0 elsewhere; read by calc_v_penalties() */
+/* A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
+/*A*/ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*B*/ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*C*/ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*D*/ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*E*/ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*F*/ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*G*/ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*H*/ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*I*/ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*J*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*K*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*L*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*M*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*N*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*O*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*P*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*Q*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*R*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+/*S*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+/*T*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+/*U*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+/*V*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+/*W*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+/*X*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+/*Y*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+/*Z*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+ };
+
+/* pascarella probabilities for opening a gap at specific residues */
+char pr[] = {'A' , 'C', 'D', 'E', 'F', 'G', 'H', 'K', 'I', 'L',
+ 'M' , 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'Y', 'W'}; /* 20 residue letters; calc_p_penalties() uses the index of a letter here to index pas_op[] */
+sint pas_op[] = { 87, 87,104, 69, 80,139,100,104, 68, 79,
+ 71,137,126, 93,128,124,111, 75,100, 77}; /* read by calc_p_penalties(), which adds 180-pas_op[j] for each matching residue */
+sint pas_op2[] ={ 88, 57,111, 98, 75,126, 95, 97, 70, 90,
+ 60,122,110,107, 91,125,124, 81,106, 88}; /* not read anywhere in this file */
+sint pal_op[] = { 84, 69,128, 78, 88,176, 53, 95, 55, 49,
+ 52,148,147,100, 91,129,105, 51,128, 88}; /* not read anywhere in this file */
+
+float reduced_gap = 1.0; /* factor applied by calc_gap_coeff() to the opening penalty of columns that already contain gaps */
+Boolean nvar_pen,nhyd_pen,npref_pen; /* working copies of no_var_penalties, no_hyd_penalties, no_pref_penalties; calc_gap_coeff() may override them */
+sint gdist; /* local copy of gap_dist */
+
+/*
+ * calc_gap_coeff()
+ *
+ * Fills in the gap opening (GAPCOL) and gap extension (LENCOL) penalties
+ * of profile[] for an existing alignment.
+ *
+ * alignment         sequences, indexed [sequence][column]; a value < 0 or
+ *                   > max_aa counts as a gap
+ * gaps              output, prf_length entries: number of sequences with
+ *                   a gap in each column
+ * profile           output; row j+1 receives the penalties of column j,
+ *                   row 0 the penalties in front of the first column
+ * struct_penalties  any value other than NONE switches on the mask
+ * gap_penalty_mask  one digit character per column; '1'..'9' multiply
+ *                   both penalties of that column
+ * first_seq,
+ * last_seq          sequences first_seq .. last_seq-1 are used
+ * prf_length        number of columns
+ * gapcoef, lencoef  base opening / extension penalty
+ *
+ * Return values:
+ * none
+ */
+void calc_gap_coeff(char **alignment, sint *gaps, sint **profile, Boolean struct_penalties,
+        char *gap_penalty_mask, sint first_seq, sint last_seq,
+        sint prf_length, sint gapcoef, sint lencoef)
+{
+
+    char c;
+    sint i, j;
+    sint is, ie;
+    static sint numseq,val,pcid;
+    static sint *gap_pos;
+    static sint *v_weight, *p_weight, *h_weight;
+    static float scale;
+
+    numseq = last_seq - first_seq;
+    /* percent identity is only worked out for a pair of sequences */
+    if(numseq == 2)
+    {
+        pcid=percentid(alignment[first_seq],alignment[first_seq+1],prf_length);
+    }
+    else pcid=0;
+
+    for (j=0; j<prf_length; j++)
+        gaps[j] = 0;
+/*
+    Check for a gap penalty mask
+*/
+    if (struct_penalties != NONE)
+    {
+        nvar_pen = nhyd_pen = npref_pen = TRUE;
+        gdist = 0;
+    }
+    else if (no_var_penalties == FALSE && pcid > 60)
+    {
+        /* NOTE(review): gdist is not assigned on this path, so it keeps the value left by the previous call - confirm this is intended */
+        if(debug>0) fprintf(stderr,"Using variable zones to set gap penalties (pcid = %d)\n",pcid);
+        nhyd_pen = npref_pen = TRUE;
+        nvar_pen = FALSE;
+    }
+    else
+    {
+        nvar_pen = TRUE;
+        nhyd_pen = no_hyd_penalties;
+        npref_pen = no_pref_penalties;
+        gdist = gap_dist;
+    }
+
+    for (i=first_seq; i<last_seq; i++)
+    {
+/*
+    Include end gaps as gaps ?
+*/
+        is = 0;
+        ie = prf_length;
+        if (use_endgaps == FALSE && endgappenalties==FALSE)
+        {
+            /* skip the run of gaps at the start of this sequence ... */
+            for (j=0; j<prf_length; j++)
+            {
+                c = alignment[i][j];
+                if ((c < 0) || (c > max_aa))
+                    is++;
+                else
+                    break;
+            }
+            /* ... and the run of gaps at its end */
+            for (j=prf_length-1; j>=0; j--)
+            {
+                c = alignment[i][j];
+                if ((c < 0) || (c > max_aa))
+                    ie--;
+                else
+                    break;
+            }
+        }
+
+        for (j=is; j<ie; j++)
+        {
+            if ((alignment[i][j] < 0) || (alignment[i][j] > max_aa))
+                gaps[j]++;
+        }
+    }
+
+    if ((!dnaflag) && (nvar_pen == FALSE))
+    {
+        v_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+        calc_v_penalties(alignment, prf_length, first_seq, last_seq, v_weight);
+    }
+
+
+    if ((!dnaflag) && (npref_pen == FALSE))
+    {
+        p_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+        calc_p_penalties(alignment, prf_length, first_seq, last_seq, p_weight);
+    }
+
+    if ((!dnaflag) && (nhyd_pen == FALSE))
+    {
+        h_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+        calc_h_penalties(alignment, prf_length, first_seq, last_seq, h_weight);
+    }
+
+    gap_pos = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+/*
+    mark the residues close to an existing gap (set gaps[i] = -ve)
+*/
+    if (dnaflag || (gdist <= 0))
+    {
+        for (i=0;i<prf_length;i++) gap_pos[i] = gaps[i];
+    }
+    else
+    {
+        i=0;
+        while (i<prf_length)
+        {
+            if (gaps[i] <= 0)
+            {
+                gap_pos[i] = gaps[i];
+                i++;
+            }
+            else
+            {
+                /* columns just in front of the gap run */
+                for (j = -gdist+1; j<0; j++)
+                {
+                    if ((i+j>=0) && (i+j<prf_length) &&
+                        ((gaps[i+j] == 0) || (gaps[i+j] < j))) gap_pos[i+j] = j;
+                }
+                /* the gap run itself; the index is tested before gaps[] is read so the scan stops at the last column */
+                while ((i<prf_length) && (gaps[i] > 0))
+                {
+                    gap_pos[i] = gaps[i];
+                    i++;
+                }
+                /* columns just behind the gap run */
+                for (j = 0; j<gdist; j++)
+                {
+                    /* never read gaps[] beyond the end of the profile */
+                    if (i+j>=prf_length) break;
+                    if (gaps[i+j] > 0) break;
+                    if ((gaps[i+j] == 0) || (gaps[i+j] < -j)) gap_pos[i+j] = -j-1;
+                }
+                i += j;
+            }
+        }
+    }
+    if (debug>1)
+    {
+        fprintf(stdout,"gap open %d gap ext %d\n",(pint)gapcoef,(pint)lencoef);
+        fprintf(stdout,"gaps:\n");
+        for(i=0;i<prf_length;i++) fprintf(stdout,"%d ", (pint)gaps[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"gap_pos:\n");
+        for(i=0;i<prf_length;i++) fprintf(stdout,"%d ", (pint)gap_pos[i]);
+        fprintf(stdout,"\n");
+    }
+
+
+    for (j=0;j<prf_length; j++)
+    {
+
+        if (gap_pos[j] <= 0)
+        {
+/*
+    apply residue-specific and hydrophilic gap penalties.
+*/
+            if (!dnaflag) {
+                profile[j+1][GAPCOL] = local_penalty(gapcoef, j,
+                        p_weight, h_weight, v_weight);
+                profile[j+1][LENCOL] = lencoef;
+            }
+            else {
+                profile[j+1][GAPCOL] = gapcoef;
+                profile[j+1][LENCOL] = lencoef;
+            }
+
+/*
+    increase gap penalty near to existing gaps.
+*/
+            if (gap_pos[j] < 0)
+            {
+                profile[j+1][GAPCOL] *= 2.0+2.0*(gdist+gap_pos[j])/gdist;
+            }
+
+
+        }
+        else
+        {
+            /* column already holds gaps: scale the opening penalty by the share of sequences without one */
+            scale = ((float)(numseq-gaps[j])/(float)numseq) * reduced_gap;
+            profile[j+1][GAPCOL] = scale*gapcoef;
+            profile[j+1][LENCOL] = 0.5 * lencoef;
+        }
+/*
+    apply the gap penalty mask
+*/
+        if (struct_penalties != NONE)
+        {
+            val = gap_penalty_mask[j]-'0';
+            if (val > 0 && val < 10)
+            {
+                profile[j+1][GAPCOL] *= val;
+                profile[j+1][LENCOL] *= val;
+            }
+        }
+/*
+    make sure no penalty is zero - even for all-gap positions
+*/
+        if (profile[j+1][GAPCOL] <= 0) profile[j+1][GAPCOL] = 1;
+        if (profile[j+1][LENCOL] <= 0) profile[j+1][LENCOL] = 1;
+    }
+
+/* set the penalties at the beginning and end of the profile */
+    if(endgappenalties==TRUE)
+    {
+        profile[0][GAPCOL] = gapcoef;
+        profile[0][LENCOL] = lencoef;
+    }
+    else
+    {
+        profile[0][GAPCOL] = 0;
+        profile[0][LENCOL] = 0;
+        profile[prf_length][GAPCOL] = 0;
+        profile[prf_length][LENCOL] = 0;
+    }
+    if (debug>0)
+    {
+        fprintf(stdout,"Opening penalties:\n");
+        for(i=0;i<=prf_length;i++) fprintf(stdout," %d:%d ",i, (pint)profile[i][GAPCOL]);
+        fprintf(stdout,"\n");
+    }
+    if (debug>0)
+    {
+        fprintf(stdout,"Extension penalties:\n");
+        for(i=0;i<=prf_length;i++) fprintf(stdout,"%d:%d ",i, (pint)profile[i][LENCOL]);
+        fprintf(stdout,"\n");
+    }
+    /* release exactly the weight arrays that were allocated above */
+    if ((!dnaflag) && (nvar_pen == FALSE))
+        v_weight=ckfree((void *)v_weight);
+
+    if ((!dnaflag) && (npref_pen == FALSE))
+        p_weight=ckfree((void *)p_weight);
+
+    if ((!dnaflag) && (nhyd_pen == FALSE))
+        h_weight=ckfree((void *)h_weight);
+
+
+    gap_pos=ckfree((void *)gap_pos);
+}
+
+/*
+ * calc_v_penalties()
+ *
+ * For every column of the pair of sequences aln[fs] and aln[fs+1], scores
+ * the residues in the window [column-vwindow, column+vwindow) with vlut
+ * and stores a weight between vll and 100 in weight[column].
+ * Positions where either sequence has a gap (code < 0 or > max_aa) are
+ * left out; a window with nothing to compare gets weight 100.
+ * ls is not used.
+ */
+void calc_v_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
+{
+    sint col, pos, win_start, win_end;
+    sint score, compared;
+    char res1, res2;
+
+    for (col = 0; col < n; col++)
+    {
+        /* clip the window to the columns that exist */
+        win_start = col - vwindow;
+        if (win_start < 0) win_start = 0;
+        win_end = col + vwindow;
+        if (win_end > n) win_end = n;
+
+        score = 0;
+        compared = 0;
+        for (pos = win_start; pos < win_end; pos++)
+        {
+            res1 = aln[fs][pos];
+            res2 = aln[fs+1][pos];
+            if ((res1 >= 0) && (res1 <= max_aa) && (res2 >= 0) && (res2 <= max_aa))
+            {
+                score += vlut[amino_acid_codes[res1]-'A'][amino_acid_codes[res2]-'A'];
+                compared++;
+            }
+        }
+
+        /* rescale (score + compared) / (2 * compared) to a percentage */
+        if (compared > 0)
+            score = ((score + compared) * 100) / (2 * compared);
+        else
+            score = 100;
+
+        /* never go below the lower limit */
+        if (score < vll) score = vll;
+        weight[col] = score;
+    }
+}
+
+/*
+ * calc_p_penalties()
+ *
+ * For every column i, adds 180-pas_op[j] for each sequence fs..ls-1 whose
+ * residue in that column is the letter pr[j], then divides the sum by the
+ * number of sequences and stores it in weight[i].
+ * Gaps (code < 0 or > max_aa) and letters not listed in pr[] add nothing.
+ */
+void calc_p_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
+{
+    /* size of the pr[]/pas_op[] tables; the former hard-coded 22 ran past their 20 entries */
+    const sint npr = (sint)(sizeof(pr) / sizeof(pr[0]));
+    char ix;
+    sint j,k,numseq;
+    sint i;
+
+    numseq = ls - fs;
+    for (i=0;i<n;i++)
+    {
+        weight[i] = 0;
+        for (k=fs;k<ls;k++)
+        {
+            ix = aln[k][i];
+            if ((ix < 0) || (ix > max_aa)) continue;
+            for (j=0;j<npr;j++)
+            {
+                if (amino_acid_codes[ix] == pr[j])
+                {
+                    weight[i] += (180-pas_op[j]);
+                    break;
+                }
+            }
+        }
+        weight[i] /= numseq;
+    }
+
+}
+
+/*
+ * calc_h_penalties()
+ *
+ * For each sequence fs..ls-1, finds runs of more than 3 consecutive
+ * columns whose residue is listed in hyd_residues and adds 100 to weight[]
+ * for every column of such a run. The totals are then divided by the
+ * number of sequences.
+ */
+void calc_h_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
+{
+
+/*
+    weight[] is the length of the hydrophilic run of residues.
+*/
+    char ix;
+    sint nh,j,k;
+    sint i,e,s;
+    sint *hyd;
+    float scale;
+
+    hyd = (sint *)ckalloc((n+2) * sizeof(sint));
+    nh = (sint)strlen(hyd_residues);
+    for (i=0;i<n;i++)
+        weight[i] = 0;
+
+    for (k=fs;k<ls;k++)
+    {
+        /* hyd[i] = 1 where sequence k has a residue from hyd_residues */
+        for (i=0;i<n;i++)
+        {
+            hyd[i] = 0;
+            ix = aln[k][i];
+            if ((ix < 0) || (ix > max_aa)) continue;
+            for (j=0;j<nh;j++)
+            {
+                if (amino_acid_codes[ix] == hyd_residues[j])
+                {
+                    hyd[i] = 1;
+                    break;
+                }
+            }
+        }
+        i = 0;
+        while (i < n)
+        {
+            if (hyd[i] == 0) i++;
+            else
+            {
+                s = i;
+                /* test the index first: hyd[n] is never filled in */
+                while ((i<n) && (hyd[i] != 0)) i++;
+                e = i;
+                if (e-s > 3)
+                    for (j=s; j<e; j++) weight[j] += 100;
+            }
+        }
+    }
+
+    scale = ls - fs;
+    for (i=0;i<n;i++)
+        weight[i] /= scale;
+
+    hyd=ckfree((void *)hyd);
+
+    if (debug>1)
+    {
+        for(i=0;i<n;i++) fprintf(stdout,"%d ", (pint)weight[i]);
+        fprintf(stdout,"\n");
+    }
+
+}
+
+/*
+ * local_penalty()
+ *
+ * Returns penalty scaled for column n: by vweight[n]/100 when variable
+ * zone penalties are on, then by 0.5 when hydrophilic penalties are on and
+ * hweight[n] is positive, otherwise by pweight[n]/100 when residue
+ * specific penalties are on. Returns 1 for DNA.
+ */
+sint local_penalty(sint penalty, sint n, sint *pweight, sint *hweight, sint *vweight)
+{
+    float factor = 1.0;
+    Boolean in_hyd_run;
+
+    if (dnaflag) return(1);
+
+    if (nvar_pen == FALSE)
+        factor *= (float)vweight[n]/100.0;
+
+    /* the hydrophilic rule, when it fires, replaces the residue specific one */
+    in_hyd_run = ((nhyd_pen == FALSE) && (hweight[n] > 0)) ? TRUE : FALSE;
+    if (in_hyd_run == TRUE)
+        factor *= 0.5;
+    else if (npref_pen == FALSE)
+        factor *= ((float)pweight[n]/100.0);
+
+    factor *= penalty;
+    return((sint)factor);
+}
+
+/*
+ * percentid()
+ *
+ * Returns the percentage of positions at which s1 and s2 hold the same
+ * code, counted over the positions where s1 holds a code in 0..max_aa-1.
+ * Scanning stops after length positions or after the first position where
+ * either sequence holds -3. Returns 0 when nothing was counted.
+ */
+float percentid(char *s1, char *s2,sint length)
+{
+    sint pos;
+    sint same = 0, compared = 0;
+
+    for (pos = 0; pos < length; pos++)
+    {
+        if ((s1[pos] >= 0) && (s1[pos] < max_aa))
+        {
+            compared++;
+            if (s1[pos] == s2[pos]) same++;
+        }
+        if ((s1[pos] == (-3)) || (s2[pos] == (-3))) break;
+    }
+
+    if (compared == 0) return(0);
+    return(100.0 * (float)same / (float)compared);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/calcprf1.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/calcprf1.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/calcprf1.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "clustalw.h"
+
+
+/*
+ * Prototypes
+ */
+
+/*
+ * Global variables
+ */
+
+extern sint max_aa,gap_pos1,gap_pos2;
+
+void calc_prf1(sint **profile, char **alignment, sint *gaps,
+ sint matrix[NUMRES][NUMRES],
+ sint *seq_weight, sint prf_length, sint first_seq, sint last_seq)
+{ /* Fills profile rows 1..prf_length (residue codes 0..max_aa plus gap_pos1 and gap_pos2) with weighted matrix scores for sequences first_seq..last_seq-1; gaps[] is read, not computed, here. */
+
+ sint **weighting, sum2, d, i, res;
+ sint numseq;
+ sint r, pos;
+ int f;
+ float scale;
+
+ weighting = (sint **) ckalloc( (NUMRES+2) * sizeof (sint *) ); /* weighting[code][column] = summed weight of the sequences showing that code in that column */
+ for (i=0;i<NUMRES+2;i++)
+ weighting[i] = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+
+ numseq = last_seq-first_seq;
+
+ sum2 = 0; /* total weight of all sequences used */
+ for (i=first_seq; i<last_seq; i++)
+ sum2 += seq_weight[i];
+
+ for (r=0; r<prf_length; r++)
+ {
+ for (d=0; d<=max_aa; d++)
+ {
+ weighting[d][r] = 0;
+ for (i=first_seq; i<last_seq; i++)
+ if (d == alignment[i][r]) weighting[d][r] += seq_weight[i];
+ }
+ weighting[gap_pos1][r] = 0; /* the two gap codes are tallied like residues */
+ for (i=first_seq; i<last_seq; i++)
+ if (gap_pos1 == alignment[i][r]) weighting[gap_pos1][r] += seq_weight[i];
+ weighting[gap_pos2][r] = 0;
+ for (i=first_seq; i<last_seq; i++)
+ if (gap_pos2 == alignment[i][r]) weighting[gap_pos2][r] += seq_weight[i];
+ }
+
+ for (pos=0; pos< prf_length; pos++)
+ {
+ if (gaps[pos] == numseq) /* every sequence has a gap here: use the matrix scores against gap_pos1 directly */
+ {
+ for (res=0; res<=max_aa; res++)
+ {
+ profile[pos+1][res] = matrix[res][gap_pos1];
+ }
+ profile[pos+1][gap_pos1] = matrix[gap_pos1][gap_pos1];
+ profile[pos+1][gap_pos2] = matrix[gap_pos2][gap_pos1];
+ }
+ else
+ {
+ scale = (float)(numseq-gaps[pos]) / (float)numseq; /* share of sequences without a gap in this column */
+ for (res=0; res<=max_aa; res++)
+ {
+ f = 0; /* weighted sum of matrix scores of every code seen in the column against res */
+ for (d=0; d<=max_aa; d++)
+ f += (weighting[d][pos] * matrix[d][res]);
+ f += (weighting[gap_pos1][pos] * matrix[gap_pos1][res]);
+ f += (weighting[gap_pos2][pos] * matrix[gap_pos2][res]);
+ profile[pos+1][res] = (sint )(((float)f / (float)sum2)*scale); /* NOTE(review): sum2 is not checked for zero before this division */
+ }
+ f = 0; /* same sum, scored against gap_pos1 */
+ for (d=0; d<=max_aa; d++)
+ f += (weighting[d][pos] * matrix[d][gap_pos1]);
+ f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos1]);
+ f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos1]);
+ profile[pos+1][gap_pos1] = (sint )(((float)f / (float)sum2)*scale);
+ f = 0; /* same sum, scored against gap_pos2 */
+ for (d=0; d<=max_aa; d++)
+ f += (weighting[d][pos] * matrix[d][gap_pos2]);
+ f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos2]);
+ f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos2]);
+ profile[pos+1][gap_pos2] = (sint )(((float)f / (float)sum2)*scale);
+ }
+ }
+
+ for (i=0;i<NUMRES+2;i++)
+ weighting[i]=ckfree((void *)weighting[i]);
+ weighting=ckfree((void *)weighting);
+
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/calcprf2.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/calcprf2.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/calcprf2.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,73 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "clustalw.h"
+
+/*
+ * Prototypes
+ */
+/*
+ * Global variables
+ */
+
+extern sint max_aa,gap_pos1,gap_pos2;
+
+/*
+ * Share (scaled to 0..10) of the total sequence weight that belongs to the
+ * sequences first_seq..last_seq-1 showing code in column col; 0 when the
+ * total weight is 0.
+ */
+static sint prf2_share(char **alignment, sint *seq_weight,
+        sint first_seq, sint last_seq, sint col, sint code, sint total)
+{
+    sint seq, matched = 0;
+
+    if (total == 0) return(0);
+
+    for (seq = first_seq; seq < last_seq; seq++)
+    {
+        if (code == alignment[seq][col]) matched += seq_weight[seq];
+    }
+    return((sint)(10 * (float)matched / (float)total));
+}
+
+/*
+ * calc_prf2()
+ *
+ * Fills profile rows 1..prf_length: for every residue code 0..max_aa and
+ * for the two gap codes, the entry is 10 times the weight of the sequences
+ * showing that code in the column divided by the total weight of
+ * sequences first_seq..last_seq-1 (all entries are 0 if that total is 0).
+ */
+void calc_prf2(sint **profile, char **alignment,
+        sint *seq_weight,sint prf_length, sint first_seq, sint last_seq)
+{
+    sint total, col, code, seq;
+
+    /* the weight total is the same for every column */
+    total = 0;
+    for (seq = first_seq; seq < last_seq; seq++)
+        total += seq_weight[seq];
+
+    for (col = 0; col < prf_length; col++)
+    {
+        for (code = 0; code <= max_aa; code++)
+            profile[col+1][code] = prf2_share(alignment, seq_weight,
+                    first_seq, last_seq, col, code, total);
+        profile[col+1][gap_pos1] = prf2_share(alignment, seq_weight,
+                first_seq, last_seq, col, gap_pos1, total);
+        profile[col+1][gap_pos2] = prf2_share(alignment, seq_weight,
+                first_seq, last_seq, col, gap_pos2, total);
+    }
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/calctree.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/calctree.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/calctree.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,984 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include "clustalw.h"
+
+#define MAXERRS 10
+
+/*
+ * Prototypes
+ */
+static void create_tree(treeptr ptree, treeptr parent);
+static void create_node(treeptr pptr, treeptr parent);
+static treeptr insert_node(treeptr pptr);
+static void skip_space(FILE *fd);
+static treeptr avail(void);
+static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist);
+static treeptr reroot(treeptr ptree, sint nseqs);
+static treeptr insert_root(treeptr p, float diff);
+static float calc_root_mean(treeptr root, float *maxdist);
+static float calc_mean(treeptr nptr, float *maxdist, sint nseqs);
+static void order_nodes(void);
+static sint calc_weight(sint leaf);
+static void group_seqs(treeptr p, sint *next_groups, sint nseqs);
+static void mark_group1(treeptr p, sint *groups, sint n);
+static void mark_group2(treeptr p, sint *groups, sint n);
+static void save_set(sint n, sint *groups);
+static void clear_tree_nodes(treeptr p);
+
+
+/*
+ * Global variables
+ */
+extern Boolean interactive;
+extern Boolean distance_tree;
+extern Boolean usemenu;
+extern sint debug;
+extern double **tmat;
+extern sint **sets;
+extern sint nsets;
+extern char **names;
+extern sint *seq_weight;
+extern Boolean no_weights;
+
+char ch; /* character most recently read from the tree file by read_tree()/create_tree() */
+FILE *fd; /* tree file; opened and closed in read_tree() */
+treeptr *lptr; /* leaf nodes in the order create_tree() met them in the tree file */
+treeptr *olptr; /* leaf nodes indexed by alignment sequence; filled in by read_tree() */
+treeptr *nptr; /* internal (non-leaf) nodes in the order they were created */
+treeptr *ptrs; /* every node, internal and leaf, in the order it was recorded */
+sint nnodes = 0; /* number of entries used in nptr */
+sint ntotal = 0; /* number of entries used in ptrs */
+Boolean rooted_tree = TRUE; /* set to FALSE by create_tree() when a node has a third branch */
+static treeptr seq_tree,root; /* seq_tree: first node allocated by read_tree(); root: node read_tree() settles on as the root */
+static sint *groups, numseq; /* groups: scratch array of create_sets(); numseq: number of leaves read from the tree file */
+
+/*
+ * calc_seq_weights()
+ *
+ * Stores a weight for each sequence first_seq..last_seq-1 in sweight[].
+ * With at least two sequences, a distance tree and weighting enabled, the
+ * weights come from calc_weight() and are scaled so that they add up to
+ * about INT_SCALE_FACTOR (each at least 1). Otherwise every sequence gets
+ * INT_SCALE_FACTOR / number of sequences.
+ * Does nothing when the range is empty.
+ */
+void calc_seq_weights(sint first_seq, sint last_seq, sint *sweight)
+{
+    sint i, nseqs;
+    sint temp, sum, *weight;
+
+    nseqs = last_seq-first_seq;
+    /* nothing to weight; this also keeps the division by nseqs below safe */
+    if (nseqs <= 0) return;
+
+/*
+    If there are at least two sequences....
+*/
+    if ((nseqs >= 2) && (distance_tree == TRUE) && (no_weights == FALSE))
+    {
+/*
+    Calculate sequence weights based on Phylip tree.
+*/
+        weight = (sint *)ckalloc((last_seq+1) * sizeof(sint));
+
+        for (i=first_seq; i<last_seq; i++)
+            weight[i] = calc_weight(i);
+
+/*
+    Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+*/
+
+        sum = 0;
+        for (i=first_seq; i<last_seq; i++)
+            sum += weight[i];
+
+        if (sum == 0)
+        {
+            /* all weights are zero: fall back to equal weights, whose sum is the sequence count (not last_seq) */
+            for (i=first_seq; i<last_seq; i++)
+                weight[i] = 1;
+            sum = nseqs;
+        }
+
+        for (i=first_seq; i<last_seq; i++)
+        {
+            sweight[i] = (weight[i] * INT_SCALE_FACTOR) / sum;
+            if (sweight[i] < 1) sweight[i] = 1;
+        }
+
+        weight=ckfree((void *)weight);
+
+    }
+
+    else
+    {
+/*
+    Otherwise, use identity weights.
+*/
+        temp = INT_SCALE_FACTOR / nseqs;
+        for (i=first_seq; i<last_seq; i++)
+            sweight[i] = temp;
+    }
+
+}
+
+/*
+ * create_sets()
+ *
+ * Resets nsets and rebuilds the grouping sets for sequences
+ * first_seq..last_seq-1. With two or more sequences the sets are taken
+ * from the tree through group_seqs(); with fewer, each set puts sequences
+ * 0..i in group 1, sequence i+1 in group 2 and the rest in group 0.
+ */
+void create_sets(sint first_seq, sint last_seq)
+{
+    sint upto, k, nseqs;
+
+    nsets = 0;
+    nseqs = last_seq-first_seq;
+
+    /* both cases work on the same scratch array */
+    groups = (sint *)ckalloc((nseqs+1) * sizeof(sint));
+
+    if (nseqs >= 2)
+    {
+        group_seqs(root, groups, nseqs);
+    }
+    else
+    {
+        for (upto=0; upto<nseqs-1; upto++)
+        {
+            for (k=0; k<nseqs; k++)
+            {
+                if (k<=upto) groups[k] = 1;
+                else if (k==upto+1) groups[k] = 2;
+                else groups[k] = 0;
+            }
+            save_set(nseqs, groups);
+        }
+    }
+
+    groups=ckfree((void *)groups);
+}
+
+/*
+ * read_tree()
+ *
+ * Reads the tree in treefile, checks that it has one leaf for each
+ * sequence first_seq..last_seq-1, roots it if necessary and links every
+ * sequence to its leaf (olptr[]) by comparing names case-insensitively,
+ * with blanks in the alignment names treated as '_'.
+ *
+ * Return values:
+ * 1 on success, 0 after reporting a problem through error()
+ */
+sint read_tree(char *treefile, sint first_seq, sint last_seq)
+{
+
+    char c;
+    char name1[MAXNAMES+1], name2[MAXNAMES+1];
+    sint i, j, k;
+    Boolean found;
+
+    numseq = 0;
+    nnodes = 0;
+    ntotal = 0;
+    rooted_tree = TRUE;
+
+#ifdef VMS
+    if ((fd = fopen(treefile,"r","rat=cr","rfm=var")) == NULL)
+#else
+    if ((fd = fopen(treefile, "r")) == NULL)
+#endif
+    {
+        error("cannot open %s", treefile);
+        return((sint)0);
+    }
+
+    /* a tree has to start with an opening bracket */
+    skip_space(fd);
+    ch = (char)getc(fd);
+    if (ch != '(')
+    {
+        error("Wrong format in tree file %s", treefile);
+        fclose(fd); /* do not leave the file open on this error path */
+        return((sint)0);
+    }
+    rewind(fd);
+
+    distance_tree = TRUE;
+
+/*
+    Allocate memory for tree
+*/
+    nptr = (treeptr *)ckalloc(3*(last_seq-first_seq+1) * sizeof(treeptr));
+    ptrs = (treeptr *)ckalloc(3*(last_seq-first_seq+1) * sizeof(treeptr));
+    lptr = (treeptr *)ckalloc((last_seq-first_seq+1) * sizeof(treeptr));
+    olptr = (treeptr *)ckalloc((last_seq+1) * sizeof(treeptr));
+
+    seq_tree = avail();
+    set_info(seq_tree, NULL, 0, "", 0.0);
+
+    create_tree(seq_tree,NULL);
+    fclose(fd);
+
+
+    if (numseq != last_seq-first_seq)
+    {
+        error("tree not compatible with alignment\n(%d sequences in alignment and %d in tree", (pint)last_seq-first_seq,(pint)numseq);
+        return((sint)0);
+    }
+
+/*
+    If the tree is unrooted, reroot the tree - ie. minimise the difference
+    between the mean root->leaf distances for the left and right branches of
+    the tree.
+*/
+
+    if (distance_tree == FALSE)
+    {
+        if (rooted_tree == FALSE)
+        {
+            error("input tree is unrooted and has no distances.\nCannot align sequences");
+            return((sint)0);
+        }
+    }
+
+    if (rooted_tree == FALSE)
+    {
+        root = reroot(seq_tree, last_seq-first_seq+1);
+    }
+    else
+    {
+        root = seq_tree;
+    }
+
+/*
+    calculate the 'order' of each node.
+*/
+    order_nodes();
+
+    if (numseq >= 2)
+    {
+/*
+    If there are at least two sequences....
+*/
+/*
+    assign the sequence nodes (in the same order as in the alignment file)
+*/
+        for (i=first_seq; i<last_seq; i++)
+        {
+            if (strlen(names[i+1]) > MAXNAMES)
+                warning("name %s is too long for PHYLIP tree format (max %d chars)", names[i+1],MAXNAMES);
+
+            /* name2 = alignment name in lower case, blanks replaced by '_' */
+            for (k=0; k< strlen(names[i+1]) && k<MAXNAMES ; k++)
+            {
+                c = names[i+1][k];
+                if ((c>0x40) && (c<0x5b)) c=c | 0x20;
+                if (c == ' ') c = '_';
+                name2[k] = c;
+            }
+            name2[k]='\0';
+            found = FALSE;
+            for (j=0; j<numseq; j++)
+            {
+                /* name1 = leaf name in lower case */
+                for (k=0; k< strlen(lptr[j]->name) && k<MAXNAMES ; k++)
+                {
+                    c = lptr[j]->name[k];
+                    if ((c>0x40) && (c<0x5b)) c=c | 0x20;
+                    name1[k] = c;
+                }
+                name1[k]='\0';
+                if (strcmp(name1, name2) == 0)
+                {
+                    olptr[i] = lptr[j];
+                    found = TRUE;
+                }
+            }
+            if (found == FALSE)
+            {
+                error("tree not compatible with alignment:\n%s not found", name2);
+                return((sint)0);
+            }
+        }
+
+    }
+    return((sint)1);
+}
+
+/*
+ * create_tree()
+ *
+ * Recursive reader for one subtree of the tree file fd. ptree is the node
+ * to fill in and parent its parent. A '(' starts an internal node with
+ * two branches (a third branch gets an extra node from insert_node() and
+ * marks the tree as unrooted); anything else is read as a leaf name. An
+ * optional ":distance" follows either kind. On return ch holds the
+ * character that follows the subtree.
+ */
+static void create_tree(treeptr ptree, treeptr parent)
+{
+    treeptr p;
+
+    sint i, type;
+    float dist;
+    char name[MAXNAMES+1];
+    int c;
+
+/*
+    is this a node or a leaf ?
+*/
+    skip_space(fd);
+    ch = (char)getc(fd);
+    if (ch == '(')
+    {
+/*
+    this must be a node....
+*/
+        type = NODE;
+        name[0] = '\0';
+        ptrs[ntotal] = nptr[nnodes] = ptree;
+        nnodes++;
+        ntotal++;
+
+        create_node(ptree, parent);
+
+        p = ptree->left;
+        create_tree(p, ptree);
+
+        if ( ch == ',')
+        {
+            p = ptree->right;
+            create_tree(p, ptree);
+            if ( ch == ',')
+            {
+                /* third branch: hang the node read so far under a new node */
+                ptree = insert_node(ptree);
+                ptrs[ntotal] = nptr[nnodes] = ptree;
+                nnodes++;
+                ntotal++;
+                p = ptree->right;
+                create_tree(p, ptree);
+                rooted_tree = FALSE;
+            }
+        }
+
+        skip_space(fd);
+        ch = (char)getc(fd);
+    }
+/*
+    ...otherwise, this is a leaf
+*/
+    else
+    {
+        type = LEAF;
+        ptrs[ntotal++] = lptr[numseq++] = ptree;
+/*
+    get the sequence name
+*/
+        name[0] = ch;
+        i = 1;
+        /* read into an int so that EOF is recognised and a truncated file cannot loop forever */
+        c = getc(fd);
+        while ((c != EOF) && (c != ':') && (c != ',') && (c != ')'))
+        {
+            if (i < MAXNAMES) name[i++] = (char)c;
+            c = getc(fd);
+        }
+        ch = (char)c;
+        name[i] = '\0';
+        if (ch != ':')
+        {
+            distance_tree = FALSE;
+        }
+    }
+
+/*
+    get the distance information
+*/
+    dist = 0.0;
+    if (ch == ':')
+    {
+        skip_space(fd);
+        fscanf(fd,"%f",&dist); /* dist stays 0.0 when no number can be read */
+        skip_space(fd);
+        ch = (char)getc(fd);
+    }
+    set_info(ptree, parent, type, name, dist);
+
+
+}
+
+/*
+ * create_node()
+ *
+ * Records parent as the parent of pptr and gives pptr two freshly
+ * allocated children, left first.
+ */
+static void create_node(treeptr pptr, treeptr parent)
+{
+    pptr->parent = parent;
+    pptr->left = avail();
+    pptr->right = avail();
+}
+
+/*
+ * insert_node()
+ *
+ * Creates a new internal node whose left child is pptr and whose right
+ * child is a freshly allocated empty node, makes it the parent of pptr
+ * and returns it.
+ */
+static treeptr insert_node(treeptr pptr)
+{
+
+    treeptr newnode;
+
+    newnode = avail();
+/*
+    Only the right child is allocated here. create_node() would also
+    allocate a left child, which was then overwritten by pptr and never
+    freed.
+*/
+    newnode->parent = pptr->parent;
+    newnode->right = avail();
+
+    newnode->left = pptr;
+    pptr->parent = newnode;
+
+    /* NOTE(review): pptr->parent is newnode at this point, so newnode is recorded as its own parent here, exactly as before; create_tree() calls set_info() on this node again after insert_node() returns */
+    set_info(newnode, pptr->parent, NODE, "", 0.0);
+
+    return(newnode);
+}
+
+/*
+ * skip_space()
+ *
+ * Consumes white space from fd; the first character that is not white
+ * space (or EOF) is pushed back with ungetc().
+ */
+static void skip_space(FILE *fd)
+{
+    int next = getc(fd);
+
+    while (isspace(next))
+        next = getc(fd);
+
+    ungetc(next, fd);
+}
+
+/*
+ * avail()
+ *
+ * Allocates one tree node and returns it with no parent, no children, an
+ * empty name and distance, leaf and order all set to zero.
+ */
+static treeptr avail(void)
+{
+    treeptr node = ckalloc(sizeof(stree));
+
+    /* links */
+    node->parent = NULL;
+    node->left = NULL;
+    node->right = NULL;
+
+    /* payload */
+    node->name[0] = '\0';
+    node->dist = 0.0;
+    node->leaf = 0;
+    node->order = 0;
+
+    return(node);
+}
+
+/*
+ * clear_tree()
+ *
+ * Frees every node reachable from p (from the current root when p is
+ * NULL) and then the four node pointer tables allocated by read_tree().
+ */
+void clear_tree(treeptr p)
+{
+    clear_tree_nodes(p);
+
+    /* the tables only hold pointers to the nodes freed above */
+    olptr=ckfree((void *)olptr);
+    lptr=ckfree((void *)lptr);
+    ptrs=ckfree((void *)ptrs);
+    nptr=ckfree((void *)nptr);
+}
+
+/*
+ * clear_tree_nodes()
+ *
+ * Frees the subtree below p, children first, then p itself. A NULL
+ * argument means the current root.
+ */
+static void clear_tree_nodes(treeptr p)
+{
+    treeptr node = (p == NULL) ? root : p;
+
+    if (node->left != NULL)
+        clear_tree_nodes(node->left);
+    if (node->right != NULL)
+        clear_tree_nodes(node->right);
+
+    node->left = NULL;
+    node->right = NULL;
+    ckfree((void *)node);
+}
+
+/*
+ * set_info()
+ *
+ * Stores parent, leaf flag, name and distance in node p and resets its
+ * order to 0. When the stored leaf flag equals TRUE the child links are
+ * cleared as well.
+ */
+static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist)
+{
+    strcpy(p->name, pname);
+    p->dist = pdist;
+    p->order = 0;
+    p->parent = parent;
+    p->leaf = pleaf;
+
+    if (p->leaf != TRUE)
+        return;
+
+    /* a leaf has no children */
+    p->left = NULL;
+    p->right = NULL;
+}
+
+static treeptr reroot(treeptr ptree, sint nseqs)
+{ /* Picks the node above which a root is to be placed, inserts the root with insert_root() and returns it; nseqs is only passed on to calc_mean(). */
+
+ treeptr p, rootnode, rootptr;
+ float diff, mindiff = 0.0, mindepth = 1.0, maxdist;
+ sint i;
+ Boolean first = TRUE;
+
+/*
+ find the difference between the means of leaf->node
+ distances on the left and on the right of each node
+*/
+ rootptr = ptree; /* stays ptree if no node passes the test below */
+ for (i=0; i<ntotal; i++)
+ {
+ p = ptrs[i];
+ if (p->parent == NULL)
+ diff = calc_root_mean(p, &maxdist);
+ else
+ diff = calc_mean(p, &maxdist, nseqs);
+
+ if ((diff == 0) || ((diff > 0) && (diff < 2 * p->dist))) /* candidate: difference is zero, or positive and below twice the branch length above p */
+ {
+ if ((maxdist < mindepth) || (first == TRUE)) /* keep the candidate with the smallest maxdist; the first candidate is always taken */
+ {
+ first = FALSE;
+ rootptr = p;
+ mindepth = maxdist;
+ mindiff = diff;
+ }
+ }
+
+ }
+
+/*
+ insert a new node as the ancestor of the node which produces the shallowest
+ tree.
+*/
+ if (rootptr == ptree) /* ptree itself was selected (or nothing was): root above its right child instead */
+ {
+ mindiff = rootptr->left->dist + rootptr->right->dist;
+ rootptr = rootptr->right;
+ }
+ rootnode = insert_root(rootptr, mindiff);
+
+ diff = calc_root_mean(rootnode, &maxdist); /* the result is not used afterwards */
+
+ return(rootnode);
+}
+
+static treeptr insert_root(treeptr p, float diff)
+{ /* Inserts a new root on the branch between p and its parent, reverses the parent links from there up to the old top node, removes that node and returns the new root. */
+ treeptr newp, prev, q, t;
+ float dist, prevdist,td;
+
+ newp = avail();
+
+ t = p->parent;
+ prevdist = t->dist;
+
+ p->parent = newp;
+
+ dist = p->dist;
+
+ p->dist = diff / 2; /* split the old branch: p keeps diff/2, limited to the range 0..dist */
+ if (p->dist < 0.0) p->dist = 0.0;
+ if (p->dist > dist) p->dist = dist;
+
+ t->dist = dist - p->dist; /* the rest of the branch goes to the former parent */
+
+ newp->left = t;
+ newp->right = p;
+ newp->parent = NULL;
+ newp->dist = 0.0;
+ newp->leaf = NODE;
+
+ if (t->left == p) t->left = t->parent; /* the slot that held p now points to t's former parent */
+ else t->right = t->parent;
+
+ prev = t;
+ q = t->parent;
+
+ t->parent = newp;
+
+ while (q != NULL) /* walk up the old path, turning each parent link into a child link */
+ {
+ if (q->left == prev)
+ {
+ q->left = q->parent;
+ q->parent = prev;
+ td = q->dist; /* branch lengths move one node along the reversed path */
+ q->dist = prevdist;
+ prevdist = td;
+ prev = q;
+ q = q->left;
+ }
+ else
+ {
+ q->right = q->parent;
+ q->parent = prev;
+ td = q->dist;
+ q->dist = prevdist;
+ prevdist = td;
+ prev = q;
+ q = q->right;
+ }
+ }
+
+/*
+ remove the old root node
+*/
+ q = prev; /* prev is the last node visited, i.e. the former top of the tree */
+ if (q->left == NULL)
+ {
+ dist = q->dist;
+ q = q->right; /* its remaining child takes its place and inherits its branch length */
+ q->dist += dist;
+ q->parent = prev->parent;
+ if (prev->parent->left == prev)
+ prev->parent->left = q;
+ else
+ prev->parent->right = q;
+ prev->right = NULL; /* prev is unlinked here but not freed in this function */
+ }
+ else
+ {
+ dist = q->dist;
+ q = q->left;
+ q->dist += dist;
+ q->parent = prev->parent;
+ if (prev->parent->left == prev)
+ prev->parent->left = q;
+ else
+ prev->parent->right = q;
+ prev->left = NULL;
+ }
+
+ return(newp);
+}
+
+/*
+ * calc_root_mean()
+ *
+ * Measures the distance from every leaf up to root, sorts the leaves by
+ * the child of root they descend from and returns the mean distance on
+ * the left side minus the mean distance on the right side. The largest
+ * distance found is stored in *maxdist.
+ */
+static float calc_root_mean(treeptr root, float *maxdist)
+{
+    float path_len, left_total = 0.0, right_total = 0.0;
+    float left_mean, right_mean, result;
+    treeptr node;
+    sint leaf;
+    sint left_count = 0, right_count = 0;
+
+    *maxdist = 0;
+    for (leaf = 0; leaf < numseq; leaf++)
+    {
+        /* climb from the leaf to the child of root heading its branch */
+        path_len = 0.0;
+        for (node = lptr[leaf]; node->parent != root; node = node->parent)
+            path_len += node->dist;
+        path_len += node->dist;
+
+        if (node == root->left)
+        {
+            left_total += path_len;
+            left_count++;
+        }
+        else
+        {
+            right_total += path_len;
+            right_count++;
+        }
+        if (path_len > (*maxdist)) *maxdist = path_len;
+    }
+
+    left_mean = left_total / left_count;
+    right_mean = right_total / right_count;
+
+    result = left_mean - right_mean;
+    return(result);
+}
+
+
+static float calc_mean(treeptr nptr, float *maxdist, sint nseqs)
+{ /* For node nptr: returns the mean distance of the leaves outside its subtree minus the mean distance of the leaves inside it; *maxdist receives the largest per-leaf distance accumulated in dist. nseqs sizes the two work arrays. */
+ float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff;
+ treeptr p, *path2root;
+ float *dist2node;
+ sint depth = 0, i,j , n = 0;
+ sint nl , nr;
+ sint direction, found;
+
+ path2root = (treeptr *)ckalloc(nseqs * sizeof(treeptr)); /* nodes from nptr up to the top of the tree */
+ dist2node = (float *)ckalloc(nseqs * sizeof(float)); /* running sum of branch lengths along that path */
+/*
+ determine all nodes between the selected node and the root;
+*/
+ depth = (*maxdist) = dist = 0;
+ nl = nr = 0;
+ p = nptr;
+ while (p != NULL)
+ {
+ path2root[depth] = p;
+ dist += p->dist;
+ dist2node[depth] = dist;
+ p = p->parent;
+ depth++;
+ }
+
+/*
+ *nl = *nr = 0;
+ for each leaf, determine whether the leaf is left or right of the node.
+ (RIGHT = descendant, LEFT = not descendant)
+*/
+ for (i=0; i< numseq; i++)
+ {
+ p = lptr[i];
+ if (p == nptr)
+ {
+ direction = RIGHT;
+ dist = 0.0;
+ }
+ else
+ {
+ direction = LEFT;
+ dist = 0.0;
+/*
+ find the common ancestor.
+*/
+ found = FALSE;
+ n = 0;
+ while ((found == FALSE) && (p->parent != NULL))
+ {
+ for (j=0; j< depth; j++)
+ if (p->parent == path2root[j])
+ {
+ found = TRUE;
+ n = j; /* index on the path of the first ancestor shared with nptr */
+ }
+ dist += p->dist;
+ p = p->parent;
+ }
+ if (p == nptr) direction = RIGHT; /* the climb ended on nptr itself, so the leaf lies below it */
+ }
+
+ if (direction == LEFT)
+ {
+ lsum += dist;
+ lsum += dist2node[n-1]; /* adds the path length from nptr up to just below the shared ancestor; NOTE(review): n is not checked to be > 0 here */
+ nl++;
+ }
+ else
+ {
+ rsum += dist;
+ nr++;
+ }
+
+ if (dist > (*maxdist)) *maxdist = dist;
+ }
+
+ dist2node=ckfree((void *)dist2node);
+ path2root=ckfree((void *)path2root);
+
+ lmean = lsum / nl; /* NOTE(review): nl and nr are not checked for zero */
+ rmean = rsum / nr;
+
+ diff = lmean - rmean;
+ return(diff);
+}
+
+/*
+ * order_nodes()
+ *
+ * Adds one to the order of every node on the path from each leaf up to
+ * the top of the tree, the leaf itself included.
+ */
+static void order_nodes(void)
+{
+    sint leaf;
+    treeptr node;
+
+    for (leaf = 0; leaf < numseq; leaf++)
+    {
+        for (node = lptr[leaf]; node != NULL; node = node->parent)
+            node->order++;
+    }
+}
+
+
+/*
+ * calc_weight()
+ *
+ * Walks from the leaf of sequence leaf up to (not including) the top node,
+ * adding each branch length divided by the order of the node below it,
+ * and returns the sum times 100, truncated to an integer.
+ */
+static sint calc_weight(sint leaf)
+{
+    treeptr node;
+    float weight = 0.0;
+
+    for (node = olptr[leaf]; node->parent != NULL; node = node->parent)
+        weight += node->dist / node->order;
+
+    weight *= 100.0;
+
+    return((sint)weight);
+}
+
+static void group_seqs(treeptr p, sint *next_groups, sint nseqs)
+{ /* Recursively builds one set per node that has a right branch: sequences under the left branch get 1, those under the right branch get 2, all others 0. The result for p is saved with save_set() and copied to next_groups for the caller. */
+ sint i;
+ sint *tmp_groups;
+
+ tmp_groups = (sint *)ckalloc((nseqs+1) * sizeof(sint));
+ for (i=0;i<nseqs;i++)
+ tmp_groups[i] = 0;
+
+ if (p->left != NULL)
+ {
+ if (p->left->leaf == NODE)
+ {
+ group_seqs(p->left, next_groups, nseqs); /* next_groups now marks every sequence under the left branch */
+ for (i=0;i<nseqs;i++)
+ if (next_groups[i] != 0) tmp_groups[i] = 1;
+ }
+ else
+ {
+ mark_group1(p->left, tmp_groups, nseqs);
+ }
+
+ }
+
+ if (p->right != NULL)
+ {
+ if (p->right->leaf == NODE)
+ {
+ group_seqs(p->right, next_groups, nseqs); /* overwrites next_groups with the sequences under the right branch */
+ for (i=0;i<nseqs;i++)
+ if (next_groups[i] != 0) tmp_groups[i] = 2;
+ }
+ else
+ {
+ mark_group2(p->right, tmp_groups, nseqs);
+ }
+ save_set(nseqs, tmp_groups); /* a set is only saved for nodes with a right branch */
+ }
+ for (i=0;i<nseqs;i++)
+ next_groups[i] = tmp_groups[i];
+
+ tmp_groups=ckfree((void *)tmp_groups);
+
+}
+
+/*
+ * mark_group1()
+ *
+ * Sets groups[i] to 1 for the sequence whose leaf is p and to 0 for every
+ * other sequence 0..n-1.
+ */
+static void mark_group1(treeptr p, sint *groups, sint n)
+{
+    sint seq;
+
+    for (seq = 0; seq < n; seq++)
+        groups[seq] = (olptr[seq] == p) ? 1 : 0;
+}
+
+/*
+ * mark_group2()
+ *
+ * Sets groups[i] to 2 for the sequence whose leaf is p; every other
+ * sequence that is already marked (non-zero) is set to 1, unmarked ones
+ * stay 0.
+ */
+static void mark_group2(treeptr p, sint *groups, sint n)
+{
+    sint seq;
+
+    for (seq = 0; seq < n; seq++)
+    {
+        if (olptr[seq] == p)
+        {
+            groups[seq] = 2;
+            continue;
+        }
+        if (groups[seq] != 0)
+            groups[seq] = 1;
+    }
+}
+
+/*
+ * save_set()
+ *
+ * Appends groups[0..n-1] to sets[] as a new set: the set count is raised
+ * by one and the values are stored in sets[nsets][1..n].
+ */
+static void save_set(sint n, sint *groups)
+{
+    sint member;
+
+    nsets++;
+    for (member = 1; member <= n; member++)
+        sets[nsets][member] = groups[member-1];
+}
+
+
+
+/*
+ * calc_similarities()
+ *
+ * Fills tmat[1..nseqs][1..nseqs] with percent similarities
+ * (100 - 100 * distance). With two or more sequences the distances are
+ * path lengths through the guide tree, limited to the range 0.01..1.0;
+ * pairs further apart than 1.1 are reported and, in interactive mode, the
+ * user is asked whether to continue.
+ *
+ * Return values:
+ * 1 on success, 0 if the user chose not to continue (tmat is then left
+ * untouched)
+ */
+sint calc_similarities(sint nseqs)
+{
+    sint depth = 0, i,j, k, n;
+    sint found;
+    sint nerrs, seq1[MAXERRS],seq2[MAXERRS];
+    treeptr p, *path2root;
+    float dist;
+    float *dist2node, bad_dist[MAXERRS];
+    double **dmat;
+    char err_mess[1024],err1[MAXLINE],reply[MAXLINE];
+
+    path2root = (treeptr *)ckalloc((nseqs) * sizeof(treeptr));
+    dist2node = (float *)ckalloc((nseqs) * sizeof(float));
+    dmat = (double **)ckalloc((nseqs) * sizeof(double *));
+    for (i=0;i<nseqs;i++)
+        dmat[i] = (double *)ckalloc((nseqs) * sizeof(double));
+
+    if (nseqs >= 2)
+    {
+/*
+    for each leaf, determine all nodes between the leaf and the root;
+*/
+        for (i = 0;i<nseqs; i++)
+        {
+            depth = dist = 0;
+            p = olptr[i];
+            while (p != NULL)
+            {
+                path2root[depth] = p;
+                dist += p->dist;
+                dist2node[depth] = dist;
+                p = p->parent;
+                depth++;
+            }
+
+/*
+    for each pair....
+*/
+            for (j=0; j < i; j++)
+            {
+                p = olptr[j];
+                dist = 0.0;
+/*
+    find the common ancestor.
+*/
+                found = FALSE;
+                n = 0;
+                while ((found == FALSE) && (p->parent != NULL))
+                {
+                    for (k=0; k< depth; k++)
+                        if (p->parent == path2root[k])
+                        {
+                            found = TRUE;
+                            n = k;
+                        }
+                    dist += p->dist;
+                    p = p->parent;
+                }
+
+                /* path from leaf j up to the common ancestor plus path from leaf i up to just below it */
+                dmat[i][j] = dist + dist2node[n-1];
+            }
+        }
+
+        nerrs = 0;
+        for (i=0;i<nseqs;i++)
+        {
+            dmat[i][i] = 0.0;
+            for (j=0;j<i;j++)
+            {
+                if (dmat[i][j] < 0.01) dmat[i][j] = 0.01;
+                if (dmat[i][j] > 1.0) {
+                    /* remember up to MAXERRS badly divergent pairs for the report */
+                    if (dmat[i][j] > 1.1 && nerrs<MAXERRS) {
+                        seq1[nerrs] = i;
+                        seq2[nerrs] = j;
+                        bad_dist[nerrs] = dmat[i][j];
+                        nerrs++;
+                    }
+                    dmat[i][j] = 1.0;
+                }
+            }
+        }
+        if (nerrs>0)
+        {
+            strcpy(err_mess,"The following sequences are too divergent to be aligned:\n");
+            for (i=0;i<nerrs && i<5;i++)
+            {
+                sprintf(err1," %s and %s (distance %1.3f)\n",
+                        names[seq1[i]+1],
+                        names[seq2[i]+1],bad_dist[i]);
+                strcat(err_mess,err1);
+            }
+            strcat(err_mess,"(All distances should be between 0.0 and 1.0)\n");
+            strcat(err_mess,"This may not be fatal but you have been warned!\n");
+            strcat(err_mess,"SUGGESTION: Remove one or more problem sequences and try again");
+            if(interactive)
+                (*reply)=prompt_for_yes_no(err_mess,"Continue ");
+            else (*reply) = 'y';
+            if ((*reply != 'y') && (*reply != 'Y'))
+            {
+                /* release the work arrays before giving up; they used to leak on this path */
+                path2root=ckfree((void *)path2root);
+                dist2node=ckfree((void *)dist2node);
+                for (i=0;i<nseqs;i++) dmat[i]=ckfree((void *)dmat[i]);
+                dmat=ckfree((void *)dmat);
+                return((sint)0);
+            }
+        }
+    }
+    else
+    {
+        for (i=0;i<nseqs;i++)
+        {
+            for (j=0;j<i;j++)
+            {
+                dmat[i][j] = tmat[i+1][j+1];
+            }
+        }
+    }
+
+    path2root=ckfree((void *)path2root);
+    dist2node=ckfree((void *)dist2node);
+    /* convert the distances to percent similarities, mirrored across the diagonal */
+    for (i=0;i<nseqs;i++)
+    {
+        tmat[i+1][i+1] = 0.0;
+        for (j=0;j<i;j++)
+        {
+            tmat[i+1][j+1] = 100.0 - (dmat[i][j]) * 100.0;
+            tmat[j+1][i+1] = tmat[i+1][j+1];
+        }
+    }
+
+    for (i=0;i<nseqs;i++) dmat[i]=ckfree((void *)dmat[i]);
+    dmat=ckfree((void *)dmat);
+
+    return((sint)1);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalv.doc
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalv.doc 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalv.doc 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1978 @@
+
+
+
+ Clustal V Multiple Sequence Alignments.
+
+ Documentation (Installation and Usage).
+
+ Des Higgins
+ European Molecular Biology Laboratory
+ Postfach 10.2209
+ D-6900 Heidelberg
+ Germany.
+
+ higgins at EMBL-Heidelberg.DE
+
+
+*******************************************************************
+
+
+ Contents.
+
+
+ 1 Overview
+
+ 2 Installation
+
+ 3 Interactive usage
+
+ 4 Command-line interface
+
+ 5 Algorithms and references
+
+
+*******************************************************************
+
+ 1. Overview
+
+This document describes how to install and use ClustalV on various
+machines. ClustalV is a complete upgrade and rewrite of the Clustal
+package of multiple alignment programs (Higgins and Sharp, 1988 and
+1989). The original programs were written in Fortran for
+microcomputers running MSDOS. You carried out a complete alignment
+by running 3 programs in succession. Later, these were merged into
+a single menu driven program with on-line help, for VAX/VMS.
+ClustalV was written in C and has all of the features of the old
+programs plus many new ones. It has been compiled and tested using
+VAX/VMS C, Decstation ULTRIX C, Gnu C for Sun workstations, Turbo C
+for IBM PC's and Think C for Apple Mac's. The original Clustal was
+written by Des Higgins while he was a Post-Doc in the lab of Paul
+Sharp in the Genetics Department, Trinity College, Dublin 2,
+Ireland.
+
+The main feature of the old package was the ability to carry out
+reliable multiple alignments of many sequences. The sensitivity of
+the program is as good as from any other program we have tried, with
+the exception of the programs of Vingron and Argos (1991), while it
+works in reasonable time on a microcomputer. The programs of
+Vingron and Argos are specialised for finding distant similarities
+between proteins but require mainframes or workstations and are more
+difficult to use.
+
+The main new features are: profile alignments (alignments of old
+alignments); phylogenetic trees (Neighbor Joining trees calculated
+after multiple alignment with a bootstrapping option); better
+sequence input (automatically recognise and read NBRF/PIR, Pearson
+(Fasta) or EMBL/SwissProt formats); flexible alignment output
+(choose one of: old Clustal format, NBRF/PIR, GCG msf format or
+Phylip format); full command line interface (everything that you can
+do interactively can be specified on the command line).
+
+In version 7 of the GCG package, there is a program called PILEUP
+which uses a very similar algorithm to the one in ClustalV. There
+are 2 main differences between the programs: 1) the metric used to
+compare the sequences for the initial "guide tree" uses a full
+global, optimal alignment in PILEUP instead of the fast, approximate
+ones in ClustalV. This makes PILEUP much slower for the comparison
+of long sequences. In principle, the distances calculated from
+PILEUP will be more sensitive than ours, but in practice it will not
+make much difference, except in difficult cases. 2) During the
+multiple alignment, terminal gaps are penalised in ClustalV but not
+in PILEUP. This will make the PILEUP alignments better when the
+sequences are of very different lengths (has no effect if there are
+no large terminal gaps).
+
+
+This software may be distributed and used freely, provided that you
+do not modify it or this documentation in any way without the
+permission of the authors.
+
+If you wish to refer to ClustalV, please cite:
+Higgins,D.G. Bleasby,A.J. and Fuchs,R. (1991) CLUSTAL V: improved software
+for multiple sequence alignment. CABIOS, vol. 8, 189-191.
+
+The overall multiple alignment algorithm was described in:
+Higgins,D.G. and Sharp,P.M. (1989). Fast and sensitive multiple
+sequence alignments on a microcomputer. CABIOS, vol. 5, 151-153.
+
+
+ACKNOWLEDGEMENTS.
+
+D.H. would particularly like to thank Paul Sharp, in whose lab. this
+work originated. We also thank Manolo Gouy, Gene Myers, Peter Rice
+and Martin Vingron for suggestions, bug-fixes and help.
+
+Des Higgins and Rainer Fuchs,
+EMBL Data Library, Heidelberg, Germany.
+
+Alan Bleasby,
+Daresbury, UK.
+
+JUNE 1991
+*******************************************************************
+
+ 2. Installation.
+
+
+
+As far as possible, we have tried to make ClustalV portable to any
+machine with a standard C compiler (proposed ANSI C standard). The
+source code, as supplied by us, has been compiled and tested using
+the following compilers:
+
+VAX/VMS C
+Ultrix C (on a Decstation 2100)
+Gnu C on a Sun 4 workstation
+Think C on an Apple Macintosh SE
+Turbo C on an IBM AT.
+
+In each case, one must make 1 change to 1 line of code in 1 header
+file. This is described below. The exact capacity of the program
+(how many sequences of what length can be aligned) will depend of
+course on available memory but can also be set in this header file.
+
+The package comes as 9 C source files; 3 header files; 1 file of on-
+line help; this documentation file; 3 make files:
+
+Source code: clustalv.c, amenu.c, gcgcheck.c, myers.c, sequence.c,
+ showpair.c, trees.c, upgma.c, util.c
+
+Header files: clustalv.h, general.h, matrices.h
+
+On-Line help: clustalv.hlp (must be renamed or defined as
+ clustalv_help except on PC's)
+
+Documentation: clustalv.doc (this file).
+
+Makefiles: makefile.sun (gnu c on Sun), vmslink.com (vax/vms),
+ makefile.ult (ultrix).
+
+
+
+
+
+
+
+Before compiling ClustalV you must look at and possibly change
+clustalv.h, shown below.
+
+/*******************CLUSTALV.H********************************/
+
+/*
+Main header file for ClustalV. Uncomment ONE of the following lines
+depending on which compiler you wish to use.
+*/
+
+#define VMS 1 /* VAX VMS */
+
+/*#define MAC 1 Think_C for MacIntosh */
+
+/*#define MSDOS 1 Turbo C for PC's */
+
+/*#define UNIX 1 Ultrix for Decstations or Gnu C for Sun */
+
+/*************************************************************/
+
+#include "general.h"
+
+#define MAXNAMES 10
+#define MAXTITLES 60
+#define FILENAMELEN 256
+
+#define UNKNOWN 0
+#define EMBLSWISS 1
+#define PIR 2
+#define PEARSON 3
+
+#define PAGE_LEN 22
+
+#if VMS
+#define DIRDELIM ']'
+#define MAXLEN 3000
+#define MAXN 150
+#define FSIZE 15000
+#define LINELENGTH 60
+#define GCG_LINELENGTH 50
+
+#elif MAC
+#define DIRDELIM ':'
+#define MAXLEN 2600
+#define MAXN 30
+#define FSIZE 10000
+#define LINELENGTH 50
+#define GCG_LINELENGTH 50
+
+#elif MSDOS
+#define DIRDELIM '\\'
+#define MAXLEN 1300
+#define MAXN 30
+#define FSIZE 5000
+#define LINELENGTH 50
+#define GCG_LINELENGTH 50
+
+#elif UNIX
+#define DIRDELIM '/'
+#define MAXLEN 3000
+#define MAXN 50
+#define FSIZE 15000
+#define LINELENGTH 60
+#define GCG_LINELENGTH 50
+#endif
+/*****************end*of*CLUSTALV.H***************************/
+
+
+
+First, you must remove the comments from one of the first 10 lines.
+There are 4 'define' compiler directives here (e.g. #define VMS 1),
+and you should use one of these, depending on which system you wish
+to work. So choose one of these, remove its comments (if it is
+already commented out) and put comments around any of the others
+that are still active. If you wish to use a different system, you
+will need to insert a new line with a new keyword (which you must
+invent) to identify your system. Most of the rest of this header
+file is taken up with a block of 'define' statements for each system
+type; e.g. the VAX/VMS block is:
+
+#if VMS
+#define DIRDELIM ']'
+#define MAXLEN 3000
+#define MAXN 150
+#define FSIZE 15000
+#define LINELENGTH 60
+#define GCG_LINELENGTH 50
+
+In this block, you can specify the maximum number of sequences to be
+allowed (MAXN); the maximum sequence length, including gaps
+(MAXLEN); FSIZE declares the size of some workspace, used by the
+fast 2 sequence comparison routines and should be APPROXIMATELY 4
+times MAXLEN; LINELENGTH is the length of the blocks of alignment
+output in the output files; GCG_LINELENGTH is the same but for the
+GCG compatible output only. Finally, DIRDELIM is the character used
+to specify directories and subdirectories in file names. It should
+be the character used to separate the file name itself from the
+directory name (e.g. in VMS, file names are like:
+$drive:[dir1.dir2.dir3]filename.ext;2 so ']' is used as DIRDELIM).
+
+So, if you want to use a system, not covered in Clustalv.h, you will
+have to insert a new block, like the above one. To compile and link
+the program, we supply 3 makefiles: one each for VAX/VMS, Ultrix
+and GNU C for Sun workstations.
+
+
+
+VAX/VMS
+
+Compile and link the program with the
+supplied makefile for vms: vmslink.com .
+
+$ @vmslink
+
+This will produce clustalv.exe (and a lot of .obj files which you can delete).
+
+The on-line help file (clustalv.hlp) should be 'defined' as
+clustalv_help as follows:
+
+$ def clustalv_help $drive:[dir1.dir2]clustalv.hlp
+
+where $drive is the drive designation and [dir1.dir2] is the
+directory where clustalv.hlp is kept.
+
+To make use of the command-line interface, you must make clustalv a
+'foreign' command with:
+
+$ clustalv :== $$drive:[dir1.dir2]clustalv
+
+where $drive is the drive designation and [dir1.dir2] is the
+directory where clustalv.exe is kept.
+
+
+
+IBM PC/MSDOS/TURBO C
+
+Create a makefile (something.prj) with the names of the source files
+(clustalv.c, amenu.c etc.) and 'make' this using the HUGE memory
+model. You will get half a dozen warnings from the compiler about
+pieces of code that look suspicious to it but ignore these. The
+help file should remain as clustalv.hlp . To run the program using
+the default settings in Clustalv.h, you need approximately 500k of
+memory. To reduce this, the main influence on memory usage is the
+parameter MAXLEN; reduce MAXLEN to reduce memory usage.
+
+
+
+Apple Mac/THINK_C version 4.0.2
+
+This version of the program is not at all Mac like. It runs in a
+window, the inside of which looks just like a normal character based
+terminal. In the future we might put a proper Mac interface on it
+but do not have the time right now. With the default settings in
+the header file ClustalV.h, you need just over 800k of memory to run
+the program. To reduce this, reduce MAXLEN; this is easily the
+biggest influence on memory usage. To compile the program and save
+it as an application you need to 'set the application type'; here
+you specify how much memory (in kilobytes (k)) the application will
+need. You should set this to 900k to run the application as it is
+OR reduce MAXLEN in the header. To compile the program you have to
+create a 'project'; you 'add' the names of the 9 source files to the
+project AND the name of the ANSI library. The source code is too
+large to compile in one compilation unit. You will get a 'link
+error: code segment too big' if you try to compile and link as is.
+You should compile amenu.c (the biggest source file) as a separate
+unit ..... you will have to read the manual/ask someone/mail me to
+find out what this is.
+
+
+*******************************************************************
+
+ 3. Interactive usage.
+
+
+
+Interactive usage of Clustal V is completely menu driven. On-line
+help is provided, defaults are offered for all parameters and file
+names. With a little effort it should be completely self
+explanatory. The main menu, which appears when you run the
+program, is shown below. Each item brings you to a sub menu.
+
+
+
+Main menu for Clustal V:
+
+
+ 1. Sequence Input From Disc
+ 2. Multiple Alignments
+ 3. Profile Alignments
+ 4. Phylogenetic trees
+
+ S. Execute a system command
+ H. HELP
+ X. EXIT (leave program)
+
+
+Your choice:
+
+
+
+The options S and H appear on all the main menus. H will provide
+help and if you type S you will be asked to enter a command, such as
+DIR or LS, which will be sent to the system (does not work on
+Mac's). Before carrying out an alignment, you must use option 1
+(sequence input); the format for sequences is explained below.
+Under menu item 2 you will be able to automatically align your
+sequences to each other. Menu item 3 allows you to do profile
+alignments. These are alignments of old alignments. This allows
+you to build up a multiple alignment in stages or add a new sequence
+to an old alignment. You can calculate phylogenetic trees from
+alignments using menu item 4.
+
+
+
+
+ ******************************
+ * SEQUENCE INPUT. *
+ ******************************
+
+
+All sequences should be in 1 file. Three formats are automatically
+recognised and used: NBRF/PIR, EMBL/SwissProt and FASTA (Pearson and
+Lipman (1988) format).
+
+***
+Users of the Wisconsin GCG package should use the command TONBRF
+(recently changed to TOPIR) to reformat their sequences before use.
+***
+
+Sequences can be in upper or lower case. For proteins, the only
+symbols recognised are: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y and
+for DNA/RNA use: A,C,G and T (or U). Any other letters of the
+alphabet will be treated as X (proteins) or N (DNA/RNA) for unknown.
+All other symbols (blanks, digits etc.) will be ignored EXCEPT for
+the hyphen "-" which can be used to specify a gap. This last point
+is especially useful for 2 reasons: 1) you can fix the positions of
+some gaps in advance; 2) the alignment output from this program can
+be written out in NBRF format using "-"'s to specify gaps; these
+alignments can be used again as input, either for profile alignments
+or for phylogenetic trees.
+
+If you are using an editor to create sequence files, use the FASTA
+format as it is by far the simplest (see below). If you have access
+to utility programs for generating/converting the NBRF/PIR format
+then use it in preference.
+
+
+
+FASTA (PEARSON AND LIPMAN, 1988) FORMAT: The sequences are
+delimited by an angle bracket ">" in column 1. The text immediately
+after the ">" is used as a title. Everything on the following line
+until the next ">" or the end of the file is one sequence.
+
+e.g.
+
+> RABSTOUT rabbit Guinness receptor
+ LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
+ ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC
+> MUSNOSE mouse nose drying factor
+ mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
+ fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfdv
+> HSHEAVEN human Guinness receptor repeat
+ mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
+ fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
+ mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
+ fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
+
+
+
+NBRF/PIR FORMAT is similar to FASTA format but immediately
+after the ">", you find the characters "P1;" if the sequences are
+protein or "DL;" if they are nucleic acid. Clustalv looks for the
+";" character as the third character after the ">". If it finds one
+it assumes that the format is NBRF; if not, FASTA format is assumed.
+The text after the ";" is treated as a sequence name while the
+entire next line is treated as a title. The sequence is terminated
+by a star "*" and the next sequence can then begin (with a >P1; etc
+). This is just the basic format description (there are other
+variations and rules).
+
+ANY files/sequences in GCG format can be converted to this format
+using the TONBRF command (now TOPIR) of the Wisconsin GCG package.
+
+
+e.g.
+
+>P1;RABSTOUT
+rabbit Guinness receptor
+LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
+ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC*
+>P1;MUSNOSE
+mouse nose drying factor
+mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
+fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfd
+*
+>P1;HSHEAVEN
+human Guinness receptor repeat protein.
+mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
+fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
+mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
+fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv*
+
+
+
+
+EMBL/SWISSPROT FORMAT: Do not try to create files with this
+format unless you have utilities to help. If you are just using an
+editor, use one of the above formats. If you do use this format,
+the program will ignore everything between the ID line (line
+beginning with the characters "ID") and the SQ line. The sequence
+is then read from between the SQ line and the "//" characters.
+
+
+
+It is critically important for the program to know whether or not it
+is aligning DNA or protein sequences. The input routines attempt to
+guess which type of sequence is being used by counting the number of
+A,C,G,T or U's in the sequences. If the total is more than 85% of
+the sequence length then DNA is assumed. If you use very bizarre
+sequences (proteins with really strange aa compositions or DNA
+sequences with loads of strange ambiguity codes) you might confuse
+the program. It is difficult to do but be careful.
+
+
+
+
+
+ ******************************
+ * MULTIPLE ALIGNMENT MENU. *
+ ******************************
+
+The multiple alignment menu is shown below. Before explaining how
+to use it, you must be introduced briefly to the alignment strategy.
+If you do not follow this, try using option 1 anyway; the entire
+process will be carried out automatically.
+
+To do a complete multiple alignment, we need to know the approximate
+relationships of the sequences to each other (which ones are most
+similar to each other). We do this by calculating a crude
+phylogenetic tree which we call a dendrogram (to distinguish it from
+the more sensitive trees available under the phylogenetic tree
+menu). This dendrogram is used as a guide to align bigger and
+bigger groups of sequences during the multiple alignment. The
+dendrogram is calculated in 2 stages: 1) all pairs of sequences are
+compared using the fast/approximate method of Wilbur and Lipman
+(1983); the result of each comparison is a similarity score. 2) the
+similarity scores are used to construct the dendrogram using the
+UPGMA cluster analysis method of Sneath and Sokal (1973).
+
+The construction of the dendrogram can be very time consuming if you
+wish to align many sequences (e.g. for 100 sequences you need to
+carry out 100x99/2 sequence comparisons = 4950). During every
+multiple alignment, a dendrogram is constructed and saved to a file
+(something.dnd). These can be reused later.
+
+
+
+
+
+
+
+
+******Multiple*Alignment*Menu******
+
+
+ 1. Do complete multiple alignment now
+ 2. Produce dendrogram file only
+ 3. Use old dendrogram file
+ 4. Pairwise alignment parameters
+ 5. Multiple alignment parameters
+ 6. Output format options
+
+ S. Execute a system command
+ H. HELP
+ or press [RETURN] to go back to main menu
+
+
+Your choice:
+
+
+So, if in doubt, and you have already loaded some sequences from the
+main menu, just try option 1 and press the <Return> key in response
+to any questions. You will be prompted for 2 file names e.g. if the
+sequence input file was called DRINK.PEP, you will be offered
+DRINK.ALN as the file to contain the alignment and DRINK.DND for the
+dendrogram.
+
+If you wish to repeat a multiple alignment (e.g. to experiment with
+different gap penalties) but do not wish to make a dendrogram all
+over again use menu item 3 (providing you are using the same
+sequences). Similarly, menu item 2 allows you to produce the
+dendrogram file only.
+
+
+
+
+PAIRWISE ALIGNMENT PARAMETERS:
+
+The parameters that control the initial fast/approximate comparisons
+can be set from menu item 4 which looks like:
+
+
+ ********* WILBUR/LIPMAN PAIRWISE ALIGNMENT PARAMETERS *********
+
+
+ 1. Toggle Scoring Method :Percentage
+ 2. Gap Penalty :3
+ 3. K-tuple :1
+ 4. No. of top diagonals :5
+ 5. Window size :5
+
+ H. HELP
+
+
+Enter number (or [RETURN] to exit):
+
+
+
+The similarity scores are calculated from fast alignments generated
+by the method of Wilbur and Lipman (1983). These are 'hash' or
+'word' or 'k-tuple' alignments carried out in 3 stages.
+
+First you mark the positions of every fragment of sequence, K-tuple
+long (for proteins, the default length is 1 residue, for DNA it is 2
+bases) in both sequences. Then you locate all k-tuple matches
+between the 2 sequences. At this stage you have to imagine a dot-
+matrix plot between the 2 sequences with each k-tuple match as a
+dot. You find those diagonals in the plot with most matches (you
+take the "No. of top diagonals" best ones) and mark all diagonals
+within "Window size" of each top diagonal. This process will define
+diagonal bands in the plot where you hope the most likely regions of
+similarity will lie.
+
+The final alignment stage is to find that head to tail arrangement
+of k-tuple matches from these diagonal regions that will give the
+highest score. The score is calculated as the number of exactly
+matching residues in this alignment minus a "gap penalty" for every
+gap that was introduced. When you toggle "Scoring method" you
+choose between expressing these similarity scores as raw scores or
+expressed as a percentage of the shorter sequence length.
+
+K-TUPLE SIZE: Can be 1 or 2 for proteins; 1 to 4 for DNA.
+Increase this to increase speed; decrease to improve sensitivity.
+
+GAP PENALTY: The number of matching residues that must be found
+in order to introduce a gap. This should be larger than K-Tuple
+Size. This has little effect on speed or sensitivity.
+
+NO. OF TOP DIAGONALS: The number of best diagonals in the
+imaginary dot-matrix plot that are considered. Decrease (must be
+greater than zero) to increase speed; increase to improve
+sensitivity.
+
+WINDOW SIZE: The number of diagonals around each "top" diagonal
+that are considered. Decrease for speed; increase for greater
+sensitivity.
+
+SCORING METHOD: The similarity scores may be expressed as raw scores
+(number of identical residues minus a "gap penalty" for each gap) or
+as percentage scores. If the sequences are of very different
+lengths, percentage scores make more sense.
+
+
+
+CHANGING THE PAIRWISE ALIGNMENT PARAMETERS
+
+The main reason for wanting to change the above parameters is SPEED
+(especially on microcomputers), NOT SENSITIVITY. The dendrograms
+that are produced can only show the relationships between the
+sequences APPROXIMATELY because the similarity scores are calculated
+from separate pairwise alignments; not from a multiple alignment
+(that is what we eventually hope to produce). If the groupings of
+the sequences are "obvious", the above method should work well; if
+the relationships are obscure or weakly represented by the data, it
+will not make much difference playing with the parameters. The main
+factor influencing speed is the K-TUPLE SIZE followed by the WINDOW
+SIZE.
+
+The alignments are carried out in a small amount of memory.
+Occasionally (it is hard to predict), you will run out of memory
+while doing these alignments; when this happens, it will say on the
+screen: "Sequences (a,b) partially aligned" (instead of "Sequences
+(a,b) aligned"). This means that the alignment score for these
+sequences will be approximate; it is not a problem unless many of
+the alignments do this. It can be fixed by using less sensitive
+parameters or increasing parameter FSIZE in clustalv.h .
+
+
+THE DENDROGRAM ITSELF
+
+The similarity scores generated by the fast comparison of all the
+sequences are used to construct a dendrogram by the UPGMA method of
+Sneath and Sokal (1973). This is a form of cluster analysis and the
+end result produces something that looks like a tree. It represents
+the similarity of the sequences as a hierarchy. The dendrogram is
+written to a file in a machine readable format and is shown below
+for an example with 6 sequences.
+
+
+ 91.0 0 0 2 012000 ! seq 2 joins seq 3 at 91% ID.
+ 72.0 1 0 3 011200 ! seq 4 joins seqs 2,3 at 72%
+ 71.1 0 0 2 000012 ! seq 5 joins seq 6 at 71%
+ 35.5 0 2 4 122200 ! seq 1 joins seqs 2,3,4
+ 21.7 4 3 6 111122 ! seqs 1,2,3,4 join seqs 5,6
+
+This LOOKS complicated but you do not normally need to care what is
+in here. Anyway, each row represents the joining together of 2 or
+more sequences. You progress from the top down, joining more and
+more sequences until all are joined together; for N sequences you
+have N-1 groupings hence there are 5 rows in the above file (there
+were 6 sequences). In each row, the first number is the similarity
+score of this grouping; ignore the next three columns for the
+moment; the last 6 digits in the line show which sequences are
+grouped; there is one digit for each sequence (the first digit is
+for the first sequence). The rule is: in each row, all of the "1"s
+join all of the "2"s; the zero's do nothing.
+
+Hence, in the first row, sequence 2 joins sequence 3 at a similarity
+level of 91% identity; next, sequence 4 joins the previous grouping
+of 2 plus 3 at a level of 72% etc. This is shown diagrammatically
+below. Before leaving the dendrogram format, the other 3 columns of
+numbers are: a pointer to the row from which the "1" sequences were
+last joined (or zero if only one of them); a pointer to the row in
+which the "2"s were last joined; the total number of sequences
+joined in this line.
+
+
+
+
+ I------ 2
+ I------I
+ I I------ 3 Diagram of the sequence similarity
+ I----I
+ I I------------- 4 relationships shown in the above
+ I--I
+ I I------------------ 1 dendrogram file (branch lengths are
+ ----I
+ I I------------- 5 not to scale).
+ I-------I
+ I------------- 6
+
+
+
+
+
+
+
+
+
+MULTIPLE ALIGNMENT PARAMETERS:
+
+
+Having calculated a dendrogram between a set of sequences, the final
+multiple alignment is carried out by a series of alignments of
+larger and larger groups of sequences. The order is determined by
+the dendrogram so that the most similar sequences get aligned first.
+Any gaps that are introduced in the early alignments are fixed.
+When two groups of sequences are aligned against each other, a full
+protein weight matrix (such as a Dayhoff PAM 250) is used. Two gap
+penalties are offered: a "FIXED" penalty for opening up a gap and a
+"FLOATING" penalty for extending a gap.
+
+
+ ********* MULTIPLE ALIGNMENT PARAMETERS *********
+
+
+ 1. Fixed Gap Penalty :10
+ 2. Floating Gap Penalty :10
+ 3. Toggle Transitions (DNA):Weighted
+ 4. Protein weight matrix :PAM 250
+
+ H. HELP
+
+
+Enter number (or [RETURN] to exit):
+
+
+FIXED GAP PENALTY: Reduce this to encourage gaps of all sizes;
+increase it to discourage them. Terminal gaps are penalised the same
+as all others. BEWARE of making this too small (approx 5 or so); if
+the penalty is too small, the program may prefer to align each
+sequence opposite one long gap.
+
+FLOATING GAP PENALTY: Reduce this to encourage longer gaps;
+increase it to shorten them. Terminal gaps are penalised the same as
+all others. BEWARE of making this too small (approx 5 or so); if
+the penalty is too small, the program may prefer to align each
+sequence opposite one long gap.
+
+
+DNA TRANSITIONS = WEIGHTED or UNWEIGHTED: By default, transitions
+(A versus G; C versus T) are weighted more strongly than
+transversions (an A aligned with a G will be preferred to an A
+aligned with a C or a T). You can make all pairs of nucleotides
+equally weighted with this option.
+
+PROTEIN WEIGHT MATRIX: For protein comparisons, a weight matrix is
+used to differentially weight different pairs of aligned amino
+acids. The default is the well known Dayhoff PAM 250 matrix. We
+also offer a PAM 100 matrix, an identity matrix (all weights are the
+same for exact matches) or allow you to give the name of a file with
+your own matrix. The weight matrices used by Clustal V are shown in
+full in the Algorithms and References section of this documentation.
+
+If you input a matrix from a file, it must be in the following
+format. Use a 20x20 matrix only (entries for the 20 "normal" amino
+acids only; no ambiguity codes etc.). Input the lower left triangle
+of the matrix, INCLUDING the diagonal. The order of the amino acids
+(rows and columns) must be: CSTPAGNDEQHRKMILVFYW. The values can be
+in free format separated by spaces (not commas). The PAM 250 matrix
+is shown below in this format.
+
+ 12
+ 0 2
+ -2 1 3
+ -3 1 0 6
+ -2 1 1 1 2
+ -3 1 0 -1 1 5
+ -4 1 0 -1 0 0 2
+ -5 0 0 -1 0 1 2 4
+ -5 0 0 -1 0 0 1 3 4
+ -5 -1 -1 0 0 -1 1 2 2 4
+ -3 -1 -1 0 -1 -2 2 1 1 3 6
+ -4 0 -1 0 -2 -3 0 -1 -1 1 2 6
+ -5 0 0 -1 -1 -2 1 0 0 1 0 3 5
+ -5 -2 -1 -2 -1 -3 -2 -3 -2 -1 -2 0 0 6
+ -2 -1 0 -2 -1 -3 -2 -2 -2 -2 -2 -2 -2 2 5
+ -6 -3 -2 -3 -2 -4 -3 -4 -3 -2 -2 -3 -3 4 2 6
+ -2 -1 0 -1 0 -1 -2 -2 -2 -2 -2 -2 -2 2 4 2 4
+ -4 -3 -3 -5 -4 -5 -4 -6 -5 -5 -2 -4 -5 0 1 2 -1 9
+ 0 -3 -3 -5 -3 -5 -2 -4 -4 -4 0 -4 -4 -2 -1 -1 -2 7 10
+ -8 -2 -5 -6 -6 -7 -4 -7 -7 -5 -3 2 -3 -4 -5 -2 -6 0 0 17
+
+Values must be integers and can be all positive or positive and
+negative as above. These are SIMILARITY values.
+
+
+
+
+ALIGNMENT OUTPUT OPTIONS:
+
+By default, the alignment goes to a file in a self explanatory
+"blocked" alignment format. This format is fine for displaying the
+results but requires heavy editing if you wish to use the alignment
+with other software. To help, we provide 3 other formats which can
+be turned on or off. If you have a sequence data set or alignment
+in memory, you can also ask for output files in whatever formats are
+turned on, NOW. The menu you use to choose format is shown below.
+
+***
+We draw your attention to NBRF/PIR format in particular. This
+format is EXACTLY the same as one of the input formats. Therefore,
+alignments written in this format can be used again as input (to the
+profile alignments or phylogenetic trees).
+***
+
+
+ ********* Format of Alignment Output *********
+
+
+ 1. Toggle CLUSTAL format output = ON
+ 2. Toggle NBRF/PIR format output = OFF
+ 3. Toggle GCG format output = OFF
+ 4. Toggle PHYLIP format output = OFF
+
+ 5. Create alignment output file(s) now?
+ H. HELP
+
+
+Enter number (or [RETURN] to exit):
+
+
+
+CLUSTAL FORMAT: This is a self explanatory alignment. The
+alignment is written out in blocks. Identities are highlighted and
+(if you use a PAM 250 matrix) positions in the alignment where all
+of the residues are "similar" to each other (PAM 250 score of 8 or
+more) are indicated.
+
+NBRF/PIR FORMAT: This is the usual NBRF/PIR format with gaps
+indicated by hyphens ("-"). As we have stressed before, this format
+is EXACTLY compatible with the sequence input format. Therefore you
+can read in these alignments again for profile alignments or for
+calculating phylogenetic trees.
+
+GCG FORMAT: In version 7 of the Wisconsin GCG package, a new
+multiple sequence format was introduced. This is the MSF (Multiple
+Sequence Format) format. It can be used as input to the GCG
+sequence editor or any of the GCG programs that make use of multiple
+alignments. THIS FORMAT IS ONLY SUPPORTED IN VERSION 7 OF THE GCG
+PACKAGE OR LATER.
+
+PHYLIP FORMAT: This format can be used by the Phylip package of
+Joe Felsenstein (see the references/algorithms section for details
+of how to get it). Phylip allows you to do a huge range of
+phylogenetic analyses (we just offer one method in this program) and
+is probably the most widely used set of programs for drawing trees.
+It also works on just about every computer you can think of,
+providing you have a decent Pascal compiler.
+
+
+
+
+
+ ******************************
+ * PROFILE ALIGNMENT MENU. *
+ ******************************
+
+
+
+This menu is for taking two old alignments (or single sequences) and
+aligning them with each other. The result is one bigger alignment.
+The menu is very similar to the multiple alignment menu except that
+there is no mention of dendrograms here (they are not needed) and
+you need to input two sets of sequences. The menu looks like this:
+
+
+
+******Profile*Alignment*Menu******
+
+
+ 1. Input 1st. profile/sequence
+ 2. Input 2nd. profile/sequence
+ 3. Do alignment now
+ 4. Alignment parameters
+ 5. Output format options
+
+ S. Execute a system command
+ H. HELP
+ or press [RETURN] to go back to main menu
+
+
+Your choice:
+
+
+You must input profile number 1 first. When both profiles are
+loaded, use item 3 (Do alignment now) and the 2 profiles will be
+aligned. Items 4 and 5 (parameters and output options) are
+identical to the equivalent options on the multiple alignment menu.
+
+The same input routines that are used for general input are used
+here i.e. sequences must be in NBRF/PIR, EMBL/SwissProt or FASTA
+format, with gaps indicated by hyphens ("-"). This is why we have
+continually drawn your attention to the NBRF/PIR format as a useful
+output format.
+
+Either profile can consist of just one sequence. Therefore, if you
+have a favourite alignment of sequences that you are working on and
+wish to add a new sequence, you can use this menu, provided the
+alignment is in the correct format.
+
+The total number of sequences in the two profiles must be less
+than or equal to the MAXN parameter set in the clustalv.h header
+file.
+
+
+
+
+
+
+
+
+
+
+
+ ******************************
+ * PHYLOGENETIC TREE MENU. *
+ ******************************
+
+
+This menu allows you to input an alignment and calculate a
+phylogenetic tree. You can also calculate a tree if you have just
+carried out a multiple alignment and the alignment is still in
+memory. THE SEQUENCES MUST BE ALIGNED ALREADY!!!!!! The tree will
+look strange if the sequences are not already aligned. You can also
+"BOOTSTRAP" the tree to show confidence levels for groupings. This
+is SLOW on microcomputers but works fine on workstations or
+mainframes.
+
+
+
+******Phylogenetic*tree*Menu******
+
+
+ 1. Input an alignment
+ 2. Exclude positions with gaps? = OFF
+ 3. Correct for multiple substitutions? = OFF
+ 4. Draw tree now
+ 5. Bootstrap tree
+
+ S. Execute a system command
+ H. HELP
+ or press [RETURN] to go back to main menu
+
+
+Your choice:
+
+
+
+
+The same input routine that is used for general input is used here
+i.e. sequences must be in NBRF/PIR, EMBL/SwissProt or FASTA format,
+with gaps indicated by hyphens ("-"). This is why we have
+continually drawn your attention to the NBRF/PIR format as a useful
+output format.
+
+If you have input an alignment, then just use item 4 to draw a tree.
+The method used is the Neighbor Joining method of Saitou and Nei
+(1987). This is a "distance method". First, percent divergence
+figures are calculated between all pairs of sequence. These
+divergence figures are then used by the NJ method to give the tree.
+Example trees will be shown below.
+
+There are two options which can be used to control the way the
+distances are calculated. These are set by options 2 and 3 in the
+menu.
+
+EXCLUDE POSITIONS WITH GAPS? This option allows you to ignore all
+alignment positions (columns) where there is a gap in ANY sequence.
+This guarantees that "like" is compared with "like" in all distances
+i.e. the same positions are used to calculate all distances. It
+also means that the distances will be "metric". The disadvantage of
+using this option is that you throw away much of the data if there
+are many gaps. If the total number of gaps is small, it has little
+effect.
+
+CORRECT FOR MULTIPLE SUBSTITUTIONS? As sequences diverge,
+substitutions accumulate. It becomes increasingly likely that more
+than one substitution (as a result of a mutation) will have happened
+at a site where you observe just one difference now. This option
+allows you to use formulae developed by Motoo Kimura to correct for
+this effect. It has the effect of stretching long branches in trees
+while leaving short ones relatively untouched. The desired effect
+is to try and make distances proportional to time since divergence.
+
+The tree is sent to a file called BLAH.NJ, where BLAH.SEQ is the
+name of the input, alignment file. An example is shown below for 6
+globin sequences.
+
+
+
+ DIST = percentage divergence (/100)
+ Length = number of sites used in comparison
+
+ 1 vs. 2 DIST = 0.5683; length = 139
+ 1 vs. 3 DIST = 0.5540; length = 139
+ 1 vs. 4 DIST = 0.5315; length = 111
+ 1 vs. 5 DIST = 0.7447; length = 141
+ 1 vs. 6 DIST = 0.7571; length = 140
+ 2 vs. 3 DIST = 0.0897; length = 145
+ 2 vs. 4 DIST = 0.1391; length = 115
+ 2 vs. 5 DIST = 0.7517; length = 145
+ 2 vs. 6 DIST = 0.7431; length = 144
+ 3 vs. 4 DIST = 0.0957; length = 115
+ 3 vs. 5 DIST = 0.7379; length = 145
+ 3 vs. 6 DIST = 0.7361; length = 144
+ 4 vs. 5 DIST = 0.7304; length = 115
+ 4 vs. 6 DIST = 0.7368; length = 114
+ 5 vs. 6 DIST = 0.2697; length = 152
+
+
+ Neighbor-joining Method
+
+ Saitou, N. and Nei, M. (1987) The Neighbor-joining Method:
+ A New Method for Reconstructing Phylogenetic Trees.
+ Mol. Biol. Evol., 4(4), 406-425
+
+
+ This is an UNROOTED tree
+
+ Numbers in parentheses are branch lengths
+
+
+ Cycle 1 = SEQ: 5 ( 0.13382) joins SEQ: 6 ( 0.13592)
+
+ Cycle 2 = SEQ: 1 ( 0.28142) joins Node: 5 ( 0.33462)
+
+ Cycle 3 = SEQ: 2 ( 0.05879) joins SEQ: 3 ( 0.03086)
+
+ Cycle 4 (Last cycle, trichotomy):
+
+ Node: 1 ( 0.20798) joins
+ Node: 2 ( 0.02341) joins
+ SEQ: 4 ( 0.04915)
+
+
+
+The output file first shows the percent divergence (distance)
+figures between each pair of sequence. Then a description of a NJ
+tree is given. This description shows which sequences (SEQ:) or
+which groups of sequences (NODE: , a node is numbered using the
+lowest sequence that belongs to it) join at each level of the tree.
+
+This is an unrooted tree!! This means that the direction of
+evolution through the tree is not shown. This can only be inferred
+in one of two ways:
+1) assume a degree of constancy in the molecular clock and place the
+root (bottom of the tree; the point where all the sequences radiate
+from) half way along the longest branch. **OR**
+2) use an "outgroup", a sequence from an organism that you "know"
+must be outside of the rest of the sequences i.e. root the tree
+manually, on biological grounds.
+
+The above tree can be represented diagrammatically as follows:
+
+
+ SEQ 1 SEQ 4
+ I I
+ 13.6 I 28.1 I 4.9 5.9
+ SEQ 6 ----------I I I I--------- SEQ 2
+ I I I I
+ I--------I-----------I----------I
+ 13.4 I 33.5 20.8 2.3 I 3.1
+ SEQ 5 ----------I I--------- SEQ 3
+
+
+The figures along each branch are percent divergences along that
+branch. If you root the tree by placing the root along the longest
+branch (33.5%) then you can draw it again as follows, this time
+rooted:
+
+
+
+ 13.6
+ I-------------------- SEQ 6
+ I---------I 13.4
+ I I-------------------- SEQ 5
+ I 33.5
+ -----I 28.1
+ I I-------------------- SEQ 1
+ I I
+ I---------I 4.9
+ I 20.8 I----------- SEQ 4
+ I--------I
+ I 5.9
+ I 2.3 I----- SEQ 2
+ I-----I 3.1
+ I----- SEQ 3
+
+
+
+The longest branch (33.5% between 5,6 and 1,2,3,4) is split between
+the 2 bottom branches of the tree. As it happens in this particular
+case, sequences 5 and 6 are myoglobins while sequences 1,2,3 and 4
+are alpha and beta globins, so you could also justify the above
+rooting on biological grounds. If you do not have any particular
+need or evidence for the position of the root, then LEAVE THE TREE
+UNROOTED. Unrooted trees do not look as pretty as rooted ones but
+it is usual to leave them unrooted if you do not have any evidence
+for the position of the root.
+
+
+BOOTSTRAPPING: Different sets of sequences and different tree
+drawing methods may give different topologies (branching orders) for
+parts of a tree that are weakly supported by the data. It is useful
+to have an indication of the degree of error in the tree. There are
+several ways of doing this, some of them rather technical. We
+provide one general purpose method in this program, which makes use
+of a technique called bootstrapping (see Felsenstein, 1985).
+
+In the case of sequence alignments, bootstrapping involves taking
+random samples of positions from the alignment. If the alignment
+has N positions, each bootstrap sample consists of a random sample
+of N positions, taken WITH REPLACEMENT i.e. in any given sample,
+some sites may be sampled several times, others not at all. Then,
+with each sample of sites, you calculate a distance matrix as usual
+and draw a tree. If the data very strongly support just one tree
+then the sample trees will be very similar to each other and to the
+original tree, drawn without bootstrapping. However, if parts of
+the tree are not well supported, then the sample trees will vary
+considerably in how they represent these parts.
+
+In practice, you should use a very large number of bootstrap
+replicates (1000 is recommended, even if it means running the
+program for an hour on a slow microcomputer; on a workstation it
+will be MUCH faster). For each grouping on the tree, you record the
+number of times this grouping occurs in the sample trees. For a
+group to be considered "significant" at the 95% level (or P <= 0.05
+in statistical terms) you expect the grouping to show up in >= 95%
+of the sample trees. If this happens, then you can say that the
+grouping is significant, given the data set and the method used to
+draw the tree.
+
+So, when you use the bootstrap option, a NJ tree is drawn as before
+and then you are asked to say how many bootstrap samples you want
+(1000 is the default) and you are asked to give a seed number for
+the random number generator. If you give the same seed number in
+future, you will get the same results (we hope). Remember to give
+different seed numbers if you wish to carry out genuinely different
+bootstrap sampling experiments. Below is the output file from using
+the same data for the 6 globin sequences as used before. The output
+file has the same name as the input file with the extension ".njb".
+
+//
+STUFF DELETED .... same as for the ordinary NJ output
+//
+ Bootstrap Confidence Limits
+
+
+ Random number generator seed = 99
+
+ Number of bootstrap trials = 1000
+
+
+ Diagrammatic representation of the above tree:
+
+ Each row represents 1 tree cycle; defining 2 groups.
+
+ Each column is 1 sequence; the stars in each line show 1 group;
+ the dots show the other
+
+ Numbers show occurences in bootstrap samples.
+
+****.. 1000
+.***.. 1000 <- This is the answer!!
+*..*** 812
+122311
+
+
+For an unrooted tree with N sequences, there are actually only N-3
+genuinely different groupings that we can test (this is the number
+of "internal branches"; each internal branch splits the sequences
+into 2 groups). In this example, we have 6 sequences with 3
+internal branches in the reference tree. In the bootstrap
+resampling, we count how often each of these internal branches
+occurs. Here, we find that the branch which splits 1,2,3 and 4
+versus 5 and 6 occurs in all 1000 samples; the branch which splits
+2,3 and 4 versus 1,5 and 6 occurs in 1000; the branch which splits 2
+and 3 versus 1,4,5 and 6 occurs in 812/1000 samples. We can put
+these figures on to the diagrammatic representation we made earlier
+of our unrooted NJ tree as follows:
+
+
+
+ SEQ 1 SEQ 4
+ I I
+ I I
+ SEQ 6 ----------I I I I--------- SEQ 2
+ I 1000 I 1000 I 812 I
+ I--------I-----------I----------I
+ I I
+ SEQ 5 ----------I I--------- SEQ 3
+
+
+
+You can equally put these confidence figures on the rooted tree (in
+fact the interpretation is simpler with rooted trees). With the
+unrooted tree, the grouping of sequence 5 with 6 is significant (as
+is the grouping of sequences 1,2,3 and 4). Equally the grouping of
+sequences 1,5 and 6 is significant (the same as saying that 2,3 and
+4 group significantly). However, the grouping of 2 and 3 is not
+significant, although it is relatively strongly supported.
+
+Unfortunately, there is a small complication in the interpretation
+of these results. In statistical hypothesis testing, it is not
+valid to make multiple simultaneous tests and to treat the result of
+each test completely independently. In the above case, if you have
+one particular test (grouping) that you wish to make in advance, it
+is valid to test IT ALONE and to simply show the other bootstrap
+figures for reference. If you do not have any particular test in
+mind before you do the bootstrapping, you can just show all of the
+figures and use the 95% level as an ARBITRARY cut off to show those
+groups that are very strongly supported; but not mention anything
+about SIGNIFICANCE testing. In the literature, it is common
+practice to simply show the figures with a tree; they frequently
+speak for themselves.
+
+
+
+*******************************************************************
+
+ 4. Command Line Interface.
+
+
+
+You can do almost everything that can be done from the menus, using
+a command line interface. In this mode, the program will take all of
+its instructions as "switches" when you activate it; no questions
+will be asked; if there are no errors, the program just does an
+analysis and stops. It does not work so well on the MAC but is
+still possible. To get you started we will show you the 2 simplest
+uses of the command line as it looks on VAX/VMS. On all other
+machines (except the MAC) it works in the same way.
+
+$ clustalv /help **OR** $ clustalv /check
+
+Both of the above switches give you a one page summary of the
+command line on the screen and then the program stops.
+
+
+$ clustalv proteins.seq **OR** $ clustalv /infile=proteins.seq
+
+This will read the sequences from the file 'proteins.seq' and do a
+complete multiple alignment. Default parameters will be used, the
+program will try to tell whether or not the sequences are DNA or
+protein and the output will go to a file called 'proteins.aln' . A
+dendrogram file called 'proteins.dnd' will also be created. Thus
+the default action for the program, when it successfully reads in an
+input file is to do a full multiple alignment. Some further
+examples of command line usage will be given later.
+
+Command line switches can be abbreviated but MAKE SURE YOU DO NOT
+MAKE THEM AMBIGUOUS. No attempt will be made to detect ambiguity.
+Use enough characters to distinguish each switch uniquely.
+
+
+
+
+
+
+
+The full list of allowed switches is given below:
+
+
+ DATA (sequences)
+
+/INFILE=file.ext :input sequences. If you give an input file and
+ nothing else as a switch, the default action is
+ to do a complete multiple alignment. The input
+ file can also be specified by giving it as the
+ first command line parameter with no "/" in
+ front of it e.g $ clustalv file.ext .
+
+/PROFILE1=file.ext :You use these two switches to give the names of
+/PROFILE2=file.ext two profiles. The default action is to align
+ the two. You must give the names of both profile
+ files.
+
+
+
+ VERBS (do things)
+
+/HELP :list the command line parameters on the screen.
+/CHECK
+
+/ALIGN :do full multiple alignment. This is the default
+ action if no other switches except for input files
+ are given.
+
+/TREE :calculate NJ tree. If this is the only action
+ specified (e.g. $ clustalv proteins.seq/tree ) it IS
+ ASSUMED THAT THE SEQUENCES ARE ALREADY ALIGNED. If
+ the sequences are not already aligned, you should
+ also give the /ALIGN switch. This will align the
+ sequences first, output an alignment file and
+ calculate the tree in memory.
+
+/BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps;
+ default = 1000). If this is the only action
+ specified (e.g. $ clustalv proteins.seq/bootstrap )
+ it IS ASSUMED THAT THE SEQUENCES ARE ALREADY ALIGNED.
+ If the sequences are not already aligned, you should
+ also give the /ALIGN switch. This will align the
+ sequences first, output an alignment file and
+ calculate the bootstraps in memory. You can set the
+ number of bootstrap trials here (e.g./bootstrap=500).
+ You can set the seed number for the random number
+ generator with /seed=n.
+
+
+
+ PARAMETERS (set things)
+
+***Pairwise alignments:***
+
+/KTUP=n :word size
+
+/TOPDIAGS=n :number of best diagonals
+
+/WINDOW=n :window around best diagonals
+
+/PAIRGAP=n :gap penalty
+
+
+
+***Multiple alignments:***
+
+/FIXEDGAP=n :fixed length gap pen.
+
+/FLOATGAP=n :variable length gap pen.
+
+/MATRIX= :PAM100 or ID or file name. The default weight matrix
+ for proteins is PAM 250.
+
+/TYPE=p or d :type is protein or DNA. This allows you to
+ explicitly override the program's attempt at guessing
+ the type of the sequence. It is only useful if you
+ are using sequences with a VERY strange composition.
+
+/OUTPUT= :GCG or PHYLIP or PIR. The default output is
+ Clustal format.
+
+/TRANSIT :transitions not weighted. The default is to weight
+ transitions as more favourable than other mismatches
+ in DNA alignments. This switch makes all nucleotide
+ mismatches equally weighted.
+
+
+***Trees:***
+
+/KIMURA :use Kimura's correction on distances.
+
+/TOSSGAPS :ignore positions with a gap in ANY sequence.
+
+/SEED=n :seed number for bootstraps.
+
+
+
+
+EXAMPLES:
+
+These examples use the VAX/VMS $ prompt; otherwise, command-line
+usage is the same on all machines except the Macintosh.
+
+
+$ clustalv proteins.seq OR $ clustalv /infile=proteins.seq
+
+Read whatever sequences are in the file "proteins.seq" and do a full
+multiple alignment; output will go to the files: "proteins.dnd"
+(dendrogram) and "proteins.aln" (alignment).
+
+
+$ clustalv proteins.seq/ktup=2/matrix=pam100/output=pir
+
+Same as last example but use K-Tuple size of 2; use a PAM 100
+protein weight matrix; write the alignment out in NBRF/PIR format
+(goes to a file called "proteins.pir").
+
+
+$ clustalv /profile1=proteins.seq/profile2=more.seq/type=p/fixed=11
+
+Take the alignment in "proteins.seq" and align it with "more.seq"
+using default values for everything except the fixed gap penalty
+which is set to 11. The sequence type is explicitly set to
+PROTEIN.
+
+
+$ clustalv proteins.pir/tree/kimura
+
+Take the sequences in proteins.pir (they MUST BE ALIGNED ALREADY)
+and calculate a phylogenetic tree using Kimura's correction for
+distances.
+
+
+$ clustalv proteins.pir/align/tree/kimura
+
+Same as the previous example, EXCEPT THAT AN ALIGNMENT IS DONE
+FIRST.
+
+
+$ clustalv proteins.seq/align/boot=500/seed=99/tossgaps/type=p
+
+Take the sequences in proteins.seq; they are explicitly set to be
+protein; align them; bootstrap a tree using 500 samples and a seed
+number of 99.
+
+
+*******************************************************************
+
+ 5. Algorithms and references.
+
+
+
+In this section, we will try to BRIEFLY describe the algorithms used
+in ClustalV and give references. The topics covered are:
+
+
+ -Multiple alignments
+
+ -Profile alignments
+
+ -Protein weight matrices
+
+ -Phylogenetic trees
+
+ -distances
+
+ -NJ method
+
+ -Bootstrapping
+
+ -Phylip
+
+ -References
+
+
+
+
+
+
+MULTIPLE ALIGNMENTS.
+
+The approach used in ClustalV is a modified version of the method of
+Feng and Doolittle (1987) who aligned the sequences in larger and
+larger groups according to the branching order in an initial
+phylogenetic tree. This approach allows a very useful combination
+of computational tractability and sensitivity.
+
+The positions of gaps that are generated in early alignments remain
+through later stages. This can be justified because gaps that arise
+from the comparison of closely related sequences should not be moved
+because of later alignment with more distantly related sequences.
+At each alignment stage, you align two groups of already aligned
+sequences. This is done using a dynamic programming algorithm where
+one allows the residues that occur in every sequence at each
+alignment position to contribute to the alignment score. A Dayhoff
+(1978) PAM matrix is used in protein comparisons.
+
+The details of the algorithm used in ClustalV have been published in
+Higgins and Sharp (1989). This was an improved version of an
+earlier algorithm published in Higgins and Sharp (1988). First, you
+calculate a crude similarity measure between every pair of sequence.
+This is done using the fast, approximate alignment algorithm of
+Wilbur and Lipman (1983). Then, these scores are used to calculate
+a "guide tree" or dendrogram, which will tell the multiple alignment
+stage in which order to align the sequences for the final multiple
+alignment. This "guide tree" is calculated using the UPGMA method
+of Sneath and Sokal (1973). UPGMA is a fancy name for one type of
+average linkage cluster analysis, invented by Sokal and Michener
+(1958).
+
+Having calculated the dendrogram, the sequences are aligned in
+larger and larger groups. At each alignment stage, we use the
+algorithm of Myers and Miller (1988) for the optimal alignments.
+This algorithm is a very memory efficient variation of Gotoh's
+algorithm (Gotoh, 1982). It is because of this algorithm that
+ClustalV can work on microcomputers. Each of these alignments
+consists of aligning 2 alignments, using what we call "profile
+alignments".
+
+
+PROFILE ALIGNMENTS.
+
+We use the term "profile alignment" to describe the alignment of 2
+alignments. We use this term because the method is a simple
+extension of the profile method of Gribskov, et al. (1987) for
+aligning 1 sequence with an alignment. Normally, with a 2 sequence
+alignment, you use a weight matrix (e.g. a PAM 250 matrix) to give a
+score between the pairs of aligned residues. The alignment is
+considered "optimal" if it gives the best total score for aligned
+residues minus penalties for any gaps (insertions or deletions) that
+must be introduced.
+
+Profile alignments are a simple extension of 2 sequence alignments
+in that you can treat each of the two input alignments as single
+sequences but you calculate the score at aligned positions as the
+average weight matrix score of all the residues in one alignment
+versus all those in the other e.g. if you have 2 alignments with I
+and J sequences respectively; the score at any position is the
+average of all the I times J scores of the residues compared
+separately. Any gaps that are introduced are placed in all of the
+sequences of an alignment at the same position. The profile
+alignments offered in the "profile alignment menu" are also
+calculated in this way.
+
+
+PROTEIN WEIGHT MATRICES.
+
+There are 3 built-in weight matrices used by clustalV. These are
+the PAM 100 and PAM 250 matrices of Dayhoff (1978) and an identity
+matrix. Each matrix is given as the bottom left half, including the
+diagonal of a 20 by 20 matrix. The order of the rows and columns is
+CSTPAGNDEQHRKMILVFYW.
+
+
+PAM 250
+
+C 12
+S 0 2
+T -2 1 3
+P -3 1 0 6
+A -2 1 1 1 2
+G -3 1 0 -1 1 5
+N -4 1 0 -1 0 0 2
+D -5 0 0 -1 0 1 2 4
+E -5 0 0 -1 0 0 1 3 4
+Q -5 -1 -1 0 0 -1 1 2 2 4
+H -3 -1 -1 0 -1 -2 2 1 1 3 6
+R -4 0 -1 0 -2 -3 0 -1 -1 1 2 6
+K -5 0 0 -1 -1 -2 1 0 0 1 0 3 5
+M -5 -2 -1 -2 -1 -3 -2 -3 -2 -1 -2 0 0 6
+I -2 -1 0 -2 -1 -3 -2 -2 -2 -2 -2 -2 -2 2 5
+L -6 -3 -2 -3 -2 -4 -3 -4 -3 -2 -2 -3 -3 4 2 6
+V -2 -1 0 -1 0 -1 -2 -2 -2 -2 -2 -2 -2 2 4 2 4
+F -4 -3 -3 -5 -4 -5 -4 -6 -5 -5 -2 -4 -5 0 1 2 -1 9
+Y 0 -3 -3 -5 -3 -5 -2 -4 -4 -4 0 -4 -4 -2 -1 -1 -2 7 10
+W -8 -2 -5 -6 -6 -7 -4 -7 -7 -5 -3 2 -3 -4 -5 -2 -6 0 0 17
+----------------------------------------------------------------
+ C S T P A G N D E Q H R K M I L V F Y W
+
+
+IDENTITY MATRIX
+
+10
+ 0 10
+ 0 0 10
+ 0 0 0 10
+ 0 0 0 0 10
+ 0 0 0 0 0 10
+ 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
+
+
+
+
+
+PAM 100
+
+ 14
+ -1 6
+ -5 2 7
+ -6 1 -1 10
+ -5 2 2 1 6
+ -8 1 -3 -3 1 8
+ -8 2 0 -3 -1 -1 7
+-11 -1 -2 -4 -1 -1 4 8
+-11 -2 -3 -3 0 -2 1 5 8
+-11 -3 -3 -1 -2 -5 -1 1 4 9
+ -6 -4 -5 -2 -5 -7 2 -1 -2 4 11
+ -6 -1 -4 -2 -5 -8 -3 -6 -5 1 1 10
+-11 -2 -1 -4 -4 -5 1 -2 -2 -1 -3 3 8
+-11 -4 -2 -6 -3 -8 -5 -8 -6 -2 -7 -2 1 13
+ -5 -4 -1 -6 -3 -7 -4 -6 -5 -5 -7 -4 4 2 9
+-12 -7 -5 -5 -5 -8 -6 -9 -7 -3 -5 -7 -6 4 2 9
+ -4 -4 -1 -4 0 -4 -5 -6 -5 -5 -6 -6 -6 1 5 1 8
+-10 -5 -6 -9 -7 -8 -6 -11 -11 -10 -4 -7-11 -2 0 0 -5 12
+ -2 -6 -6 -11 -6 -11 -3 -9 -7 -9 -1-10-10 -8 -4 -5 -6 6 13
+-13 -4 -10 -11 -11 -13 -8 -13 -14 -11 -7 1 -9-11-12 -7-14 -2 -2 19
+
+
+
+
+PHYLOGENETIC TREES.
+
+There are two COMMONLY used approaches for inferring phylogenetic
+trees from sequence data: parsimony and distance methods. There are
+other approaches which are probably superior in theory but which are
+yet to be used widely. This does not mean that they are no use; we
+(the authors of this program at any rate) simply do not know enough
+about them yet. You should see the documentation accompanying the
+Phylip package and some of the references there for an explanation
+of the different methods and what assumptions are implied when you
+use them.
+
+There is a constant debate in the literature as to the merits of
+different methods but unfortunately, a lot of what is said is
+incomprehensible or inaccurate. It is also a field that is prone to
+having highly opinionated schools of thought. This is a pity
+because it prevents rational discussion of the pro's and con's of
+the different methods. The approach adopted in ClustalV is to
+supply just one method and to produce alignments in a format that
+can be used by Phylip. In simple cases, the trees produced will be
+as "good" (reliable, robust) as those from ANY other method. In
+more complicated cases, there is no single magic recipe that we can
+supply that will work well in even most situations.
+
+The method we provide is the Neighbor Joining method (NJ) of Saitou
+and Nei (1987) which is a distance method. We use this for three
+reasons: it is conceptually and computationally simple; it is fast;
+it gives "good" trees in simple cases. It is difficult to prove that
+one tree is "better" than another if you do not know the true
+phylogeny; the few systematic surveys of methods show it to work
+more or less as well as any other method ON AVERAGE. Another reason
+for using the NJ method is that it is very commonly used; THIS IS A
+BAD REASON SCIENTIFICALLY but at least you will not feel lonely if
+you use it.
+
+The NJ method works on a matrix of distances (the distance matrix)
+between all pairs of sequence to be analysed. These distances are
+related to the degree of divergence between the sequences. It is
+normal to calculate the distances from the sequences after they are
+multiply aligned. If you calculate them from separate alignments
+(as done for the dendrograms in another part of this program), you
+may increase the error considerably.
+
+
+DISTANCES
+
+The simplest measure of distance between sequences is percent
+divergence (100% minus percent identity). For two sequences, you
+count how many positions differ between them (ignoring all positions
+with a gap or an unknown residue) and divide by the number of
+positions considered. It is common practice to also ignore all
+positions in the alignment where there is a GAP in ANY of the
+sequences (Tossgaps ? option in the menu). Usually, you express the
+percent distance divided by 100 (gives distances between 0.0 and
+1.0).
+
+This measure of distance is perfectly adequate (with some further
+modification described below) for rRNA sequences. However it treats
+all residues identically e.g. all amino acid substitutions are
+equally weighted. It also treats all positions identically e.g. it
+does not take account of different rates of substitution in
+different positions of different codons in protein coding DNA
+sequences; see Li et al (1985) for a distance measure that does.
+Despite these shortcomings, these percent identity distances do work
+well in practice in a wide variety of situations.
+
+In a simple world, you would like a distance to be proportional to
+the time since the sequences diverged. If this were EXACTLY true,
+then the calculation of the tree would be a simple matter of algebra
+(UPGMA does this for you) and the branch lengths will be nice and
+meaningful (times). In practice this OBVIOUSLY depends on the
+existence and quality of the "molecular clock", a subject of on-
+going debate. However, even if there is a good clock, there is a
+further problem with estimating divergences. As sequences diverge,
+they become "saturated" with mutations. Sites can have
+substitutions more than once. Calculated distances will
+underestimate actual divergence times; the greater the divergence,
+the greater the discrepancy. There are various methods for dealing
+with this and we provide two commonly used ones, both due to Motoo
+Kimura; one for proteins and one for DNA.
+
+
+For distance K (percent divergence /100 ) ...
+
+Correction for Protein distances: (Kimura, 1983).
+
+ Corrected K = -ln(1.0 - K - (K * K/5.0))
+
+
+
+Correction for nucleotide distances: Kimura's 2-parameter method
+(Kimura, 1980).
+
+ Corrected K = 0.5*ln(a) + 0.25*ln(b)
+
+ where a = 1/(1 - 2*P - Q)
+ and b = 1/(1 - 2*Q)
+
+ P and Q are the proportions of transitions (A<-->G, C<-->T)
+ and transversions occurring between the sequences.
+
+
+One paradoxical effect of these corrections, is that distances can
+be corrected to have more than 100% divergence. That is because,
+for very highly diverged sequences of length N, you can estimate
+that more than N substitutions have occurred by correcting the
+observed distance in the above ways. Don't panic!
+
+
+
+NEIGHBOR JOINING TREES.
+
+VERY briefly, the NJ method works as follows. You start by placing
+the sequences in a star topology (no internal branches). You then
+find that internal branch (take 2 sequences; join them; connect them
+to the rest by the internal branch) which when added to the tree
+will minimise the total branch length. The two joined sequences
+(neighbours) are merged into a single sequence and the process is
+repeated. For an unrooted tree with N sequences, there are N-3
+internal branches. The above process is repeated N-3 times to give
+the final tree. The full details are given in Saitou and Nei
+(1987).
+
+As explained elsewhere in the documentation, you can only root the
+tree by one of two methods:
+
+1) assume a degree of constancy in the molecular clock and place the
+root along the longest branch (internal or external). Methods that
+appear to produce rooted trees automatically are often just doing
+this without letting you know; this is true of UPGMA.
+
+2) root the tree on biological grounds. The usual method is to
+include an "outgroup", a sequence that you are certain will branch
+to the outside of the tree.
+
+
+
+BOOTSTRAPPING.
+
+Bootstrapping is a general purpose technique that can be used for
+placing confidence limits on statistics that you estimate without
+any knowledge of the underlying distribution (e.g. a normal or
+poisson distribution). In the case of phylogenetic trees, there are
+several analytical methods for placing confidence limits on
+groupings (actually on the internal branches) but these are either
+restricted to particular tree drawing methods or only work on small
+trees of 4 or 5 sequences. Felsenstein (1985) showed how to use
+bootstrapping to calculate confidence limits on trees. His approach
+is completely general and can be applied to any tree drawing method.
+The main assumption of the method in this context is that the sites
+in the alignment are independent; this will be true of some sequence
+alignments (e.g. pseudogenes) but not others (e.g. rRNA's). What
+effect lack of independence will have on the results is not known.
+
+The method works by taking random samples of data from the complete
+data set. You compute the test statistic (tree in this case) on
+each sample. Variation in the statistic computed from the samples
+gives a measure of variation in the statistic which can be used to
+calculate confidence intervals. Each random sample is the same size
+as the complete data set and is taken WITH REPLACEMENT i.e. a data
+point can be selected more than once (or not at all) in any given
+sample.
+
+In the case of an alignment N residues long, each random sample is a
+random selection of N sites from the alignment. For each sample, we
+calculate a distance matrix and tree in the usual way. Variation in
+the sample trees compared to a tree calculated from the full data
+set gives an indication of how well supported the tree is by the
+data. If the sample trees are very similar to each other and to the
+full tree, then the tree is "strongly" supported; if the sample
+trees show great variation, then the tree will be weakly supported.
+In practice, you usually find some parts of a tree well supported,
+others weakly. This can be seen by counting how often each
+monophyletic group in the full tree occurs in the sample trees.
+
+For a particular grouping, one considers it to be significant at the
+95% level (P <= 0.05) if it occurs in 95% of the bootstrap samples.
+If a grouping is significant, it is significant with respect to the
+particular data set and method used for drawing the tree.
+Biological "significance" is another matter.
+
+
+PHYLIP.
+
+The Phylip package was written by Joe Felsenstein, University of
+Washington, USA. It provides Pascal source code for a large number
+of programs for doing most types of phylogenetic analyses. The
+Phylip format alignments produced by this program can be used by all
+of the Phylip programs, version 3.4 or later (March 1991). It is
+freely available from him as follows.
+
+
+
+================= PHYLIP information sheet =====================
+
+ PHYLIP - Phylogeny Inference Package (version 3.3)
+
+This is a FREE package of programs for inferring phylogenies and
+carrying out certain related tasks. At present it contains 28
+programs, which carry out different algorithms on different kinds of
+data. The programs in the package are:
+
+ ---------- Programs for molecular sequence data ----------
+PROTPARS Protein parsimony
+DNAPARS Parsimony method for DNA
+DNAMOVE Interactive DNA parsimony
+DNAPENNY Branch and bound for DNA
+DNABOOT Bootstraps DNA parsimony
+DNACOMP Compatibility for DNA
+DNAINVAR Phylogenetic invariants
+DNAML Maximum likelihood method
+DNAMLK DNAML with molecular clock
+DNADIST Distances from sequences
+RESTML ML for restriction sites
+
+ ----------- Programs for distance matrix data ------------
+FITCH Fitch-Margoliash and least-squares methods
+KITSCH Fitch-Margoliash and least squares methods with
+ evolutionary clock
+
+ --- Programs for gene frequencies and continuous characters --
+CONTML Maximum likelihood method
+GENDIST Computes genetic distances
+
+ ------------- Programs for discrete state data -----------
+MIX Wagner, Camin-Sokal, and mixed parsimony criteria
+MOVE Interactive Wagner, C-S, mixed parsimony program
+PENNY Finds all most parsimonious trees by branch-and-bound
+BOOT Bootstrap confidence interval on mixed parsimony methods
+DOLLOP, DOLMOVE, DOLPENNY, DOLBOOT same as preceding four
+ programs, but for the Dollo and polymorphism parsimony
+ criteria
+CLIQUE Compatibility method
+FACTOR recode multistate characters
+
+ ---- Programs for plotting trees and consensus trees ----
+DRAWGRAM Draws cladograms and phenograms on screens, plotters and
+ printers
+DRAWTREE Draws unrooted phylogenies on screens, plotters and
+ printers
+CONSENSE Majority-rule and strict consensus trees
+
+The package includes extensive documentation files that provide the
+information necessary to use and modify the programs.
+
+COMPATIBILITY: The programs are written in a very standard subset of
+Pascal, a language that is available on most computers (including
+microcomputers). The programs require only trivial modifications to
+run on most machines: for example they work with only minor
+modifications with Turbo Pascal, and without modifications on VAX
+VMS Pascal. Pascal source code is distributed in the regular version
+of PHYLIP: compiled object code is not. To use that version, you
+must have a Pascal compiler.
+
+DISKETTE DISTRIBUTION: The package is distributed in a variety of
+microcomputer diskette formats. You should send FORMATTED
+diskettes, which I will return with the package written on them.
+Unfortunately, I cannot write any Apple formats. See below for how
+many diskettes to send. The programs on the magnetic tape or
+electronic network versions may of course also be moved to
+microcomputers using a terminal program.
+
+PRECOMPILED VERSIONS: Precompiled executable programs for PCDOS
+systems are available from me. Specify the "PCDOS executable
+version" and send the number of extra diskettes indicated below.
+An Apple Macintosh version with precompiled code is available from
+Willem Ellis, Instituut voor Taxonomische Zoologie, Zoologisch
+Museum, Universiteit van Amsterdam, Plantage Middenlaan 64, 1018DH
+Amsterdam, Netherlands, who asks that you send 5 800K diskettes.
+
+HOW MANY DISKETTES TO SEND: The following table shows for different
+PCDOS formats how many diskettes to send, and how many extra
+diskettes to send for the PCDOS executable version:
+
+Diskette size Density For source code For executables, send
+ in addition
+ 3.5 inch 1.44 Mb 2 1
+ 5.25 inch 1.2 Mb 2 2
+ 3.5 inch 720 Kb 4 2
+ 5.25 inch 360 Kb 7 4
+
+Some other formats are also available. You MUST tell me EXACTLY
+which of these formats you need. The diskettes MUST be formatted by
+you before being sent to me. Sending an extra diskette may be
+helpful.
+
+NETWORK DISTRIBUTION: The package is also available by distribution
+of the files directly over electronic networks, and by anonymous ftp
+from evolution.genetics.washington.edu. Contact me by electronic
+mail for details.
+
+TAPE DISTRIBUTION: The programs are also distributed on a magnetic
+tape provided by you (which should be a small tape and need only be
+able to hold two megabytes) in the following format: 9-track, ASCII,
+odd parity, unlabelled, 6250 bpi (unless otherwise indicated).
+Logical record: 80 bytes, physical record: 3200 bytes (i.e. blocking
+factor 40). There are a total of 71 files. The first one describes
+the contents of the package.
+
+POLICIES: The package is distributed free. I do not make it
+available or support it in South Africa. The package will be
+written on the diskettes or tape, which will be mailed back. They
+can be sent to:
+
+ Joe Felsenstein
+Electronic mail addresses: Department of Genetics SK-50
+ Internet: joe at genetics.washington.edu University of Washington
+ Bitnet/EARN: felsenst at uwavm Seattle, Washington 98195
+ UUCP: uw-beaver!evolution.genetics!joe U.S.A.
+
+
+===================== End of Phylip Info. Sheet ====================
+
+
+
+
+REFERENCES.
+
+Dayhoff, M.O., Schwartz, R.M. and Orcutt, B.C. (1978) in Atlas of
+Protein Sequence and Structure, Vol. 5 supplement 3, Dayhoff, M.O.
+(ed.), NBRF, Washington, p. 345.
+
+Felsenstein, J. (1985) Confidence limits on phylogenies: an
+approach using the bootstrap. Evolution 39, 783-791.
+
+Feng, D.-F. and Doolittle, R.F. (1987) Progressive sequence
+alignment as a prerequisite to correct phylogenetic trees.
+J.Mol.Evol. 25, 351-360.
+
+Gotoh, O. (1982) An improved algorithm for matching biological
+sequences. J.Mol.Biol. 162, 705-708.
+
+Gribskov, M., McLachlan, A.D. and Eisenberg, D. (1987) Profile
+analysis: detection of distantly related proteins. PNAS USA 84,
+4355-4358.
+
+Higgins, D.G. and Sharp, P.M. (1988) CLUSTAL: a package for
+performing multiple sequence alignments on a microcomputer. Gene
+73, 237-244.
+
+Higgins, D.G. and Sharp, P.M. (1989) Fast and sensitive multiple
+sequence alignments on a microcomputer. CABIOS 5, 151-153.
+
+Kimura, M. (1980) A simple method for estimating evolutionary
+rates of base substitutions through comparative studies of
+nucleotide sequences. J. Mol. Evol. 16, 111-120.
+
+Kimura, M. (1983) The Neutral Theory of Molecular Evolution.
+Cambridge University Press, Cambridge, England.
+
+Li, W.-H., Wu, C.-I. and Luo, C.-C. (1985) A new method for
+estimating synonymous and nonsynonymous rates of nucleotide
+substitution considering the relative likelihood of nucleotide and
+codon changes. Mol.Biol.Evol. 2, 150-174.
+
+Myers, E.W. and Miller, W. (1988) Optimal alignments in linear
+space. CABIOS 4, 11-17.
+
+Pearson, W.R. and Lipman, D.J. (1988) Improved tools for biological
+sequence comparison. PNAS USA 85, 2444-2448.
+
+Saitou, N. and Nei, M. (1987) The neighbor-joining method: a new
+method for reconstructing phylogenetic trees. Mol.Biol.Evol. 4,
+406-425.
+
+Sneath, P.H.A. and Sokal, R.R. (1973) Numerical Taxonomy. Freeman,
+San Francisco.
+
+Sokal, R.R. and Michener, C.D. (1958) A statistical method for
+evaluating systematic relationships. Univ.Kansas Sci.Bull. 38,
+1409-1438.
+
+Vingron, M. and Argos, P. (1991) Motif recognition and alignment
+for many sequences by comparison of dot matrices. J.Mol.Biol. 218,
+33-43.
+
+Wilbur, W.J. and Lipman, D.J. (1983) Rapid similarity searches of
+nucleic acid and protein data banks. PNAS USA 80, 726-730.
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalw.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalw.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalw.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,122 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#ifdef MAC
+#include <console.h>
+#endif
+#include "clustalw.h"
+
+/*
+* Prototypes
+*
+* Routines defined in other source files and called from main() below.
+* NOTE(review): main() also calls ckfree(), which has no prototype in this
+* list; presumably it is declared in clustalw.h - confirm.
+*/
+
+#ifdef MAC
+/* only declared for MAC builds, where main() calls it to obtain argc/argv */
+extern int ccommand(char ***);
+#endif
+
+extern void *ckalloc(size_t);
+extern void init_amenu(void);
+extern void init_interface(void);
+extern void init_matrix(void);
+extern void fill_chartab(void);
+extern void parse_params(Boolean);
+extern void main_menu(void);
+
+/*
+* Global variables
+*
+* None of these is declared static, so all have external linkage.
+* NOTE(review): presumably they are shared with the other clustalw source
+* files - confirm against clustalw.h and the files that reference them.
+*/
+double **tmat;
+
+/* version string of this release */
+char revision_level[] = "W (1.83)"; /* JULIE feb 2001*/
+
+/* FALSE at start-up; main() sets it to TRUE just before calling main_menu() */
+Boolean interactive=FALSE;
+
+/* name of the help file; a different name is used when MSDOS is defined */
+#ifdef MSDOS
+ char *help_file_name = "clustalw.hlp";
+#else
+ char *help_file_name = "clustalw_help";
+#endif
+
+sint max_names; /* maximum length of names in current alignment file */
+
+float gap_open, gap_extend;
+float pw_go_penalty, pw_ge_penalty;
+
+FILE *tree;
+FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile,
+ *gde_outfile, *nexus_outfile;
+FILE *fasta_outfile; /* Ramu */
+
+sint *seqlen_array;
+sint max_aln_length;
+short usermat[NUMRES][NUMRES], pw_usermat[NUMRES][NUMRES];
+short def_aa_xref[NUMRES+1], aa_xref[NUMRES+1], pw_aa_xref[NUMRES+1];
+short userdnamat[NUMRES][NUMRES], pw_userdnamat[NUMRES][NUMRES];
+short def_dna_xref[NUMRES+1], dna_xref[NUMRES+1], pw_dna_xref[NUMRES+1];
+sint nseqs;
+sint nsets;
+sint *output_index;
+sint **sets;
+sint *seq_weight;
+sint max_aa;
+sint gap_pos1;
+sint gap_pos2;
+sint mat_avscore;
+sint profile_no;
+
+/* main() sets this to FALSE before parse_params() and to TRUE before main_menu() */
+Boolean usemenu;
+Boolean dnaflag;
+Boolean distance_tree;
+
+char **seq_array;
+char **names,**titles;
+/* copies of argv[1..argc-1]; allocated by main() and freed after parse_params() returns */
+char **args;
+char seqname[FILENAMELEN+1];
+
+char *gap_penalty_mask1 = NULL, *gap_penalty_mask2 = NULL;
+char *sec_struct_mask1 = NULL, *sec_struct_mask2 = NULL;
+sint struct_penalties;
+char *ss_name1 = NULL, *ss_name2 = NULL;
+
+Boolean user_series = FALSE;
+UserMatSeries matseries;
+short usermatseries[MAXMAT][NUMRES][NUMRES];
+short aa_xrefseries[MAXMAT][NUMRES+1];
+
+/*
+* Program entry point.
+*
+* Calls the start-up initialisation routines and then, when command line
+* arguments are present, copies argv[1..argc-1] into the global 'args'
+* array and calls parse_params() with 'usemenu' switched off. The copies
+* are released once parse_params() returns. Finally 'usemenu' and
+* 'interactive' are switched on and the main menu is started.
+*/
+int main(int argc,char **argv)
+{
+	sint k;
+	sint n_copied;
+
+#ifdef MAC
+	argc=ccommand(&argv);
+#endif
+
+	/* start-up initialisation */
+	init_amenu();
+	init_interface();
+	init_matrix();
+
+	fill_chartab();
+
+	if(argc>1) {
+		/* argv[0] is not copied */
+		n_copied = argc-1;
+
+		/* argc slots are requested although only argc-1 are filled */
+		args = (char **)ckalloc(argc * sizeof(char *));
+
+		k = 0;
+		while(k<n_copied)
+		{
+			char *src = argv[k+1];
+
+			args[k]=(char *)ckalloc((strlen(src)+1) * sizeof(char));
+			strcpy(args[k],src);
+			++k;
+		}
+
+		usemenu=FALSE;
+		parse_params(FALSE);
+
+		/* release the copies, then the array that held them */
+		for(k=0;k<n_copied;k++)
+			ckfree(args[k]);
+		ckfree(args);
+	}
+
+	usemenu=TRUE;
+	interactive=TRUE;
+
+	main_menu();
+
+	exit(0);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalw.doc
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalw.doc 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalw.doc 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,757 @@
+README for Clustal W version 1.7 June 1997
+
+ Clustal W version 1.7 Documentation
+
+This file provides some notes on the latest changes, installation and usage
+of the Clustal W multiple sequence alignment program.
+
+
+
+Julie Thompson (Thompson at EMBL-Heidelberg.DE)
+Toby Gibson (Gibson at EMBL-Heidelberg.DE)
+
+European Molecular Biology Laboratory
+Meyerhofstrasse 1
+D 69117 Heidelberg
+Germany
+
+
+Des Higgins (Higgins at ucc.ie)
+
+University of County Cork
+Cork
+Ireland
+
+
+Please e-mail bug reports/complaints/suggestions (polite if possible)
+to Toby Gibson or Des Higgins.
+
+
+
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994)
+CLUSTAL W: improving the sensitivity of progressive multiple sequence alignment
+through sequence weighting, position-specific gap penalties and weight matrix
+choice. Nucleic Acids Research, 22:4673-4680.
+
+--------------------------------------------------------------
+
+What's New (June 1997) in Version 1.7 (since version 1.6).
+
+
+1. The static arrays used by clustalw for storing the alignment data have been
+replaced by dynamically allocated memory. There is now no limit on the number
+or length of sequences which can be input.
+
+2. The alignment of DNA sequences now offers a new hard-coded matrix, as well
+as the identity matrix used previously. The new matrix is the default scoring
+matrix used by the BESTFIT program of the GCG package for the comparison of
+nucleic acid sequences. X's and N's are treated as matches to any IUB ambiguity
+symbol. All matches score 1.9; all mismatches for IUB symbols score 0.0.
+
+3. The transition weight option for aligning nucleotide sequences has been
+changed from an on/off toggle to a weight between 0 and 1. A weight of zero
+means that the transitions are scored as mismatches; a weight of 1 gives
+transitions the full match score. For distantly related DNA sequences, the
+weight should be near to zero; for closely related sequences it can be useful
+to assign a higher score.
+
+4. The RSF sequence alignment file format used by GCG Version 9 can now be
+read.
+
+5. The clustal sequence alignment file format has been changed to allow
+sequence names longer than 10 characters. The maximum length allowed is set in
+clustalw.h by the statement:
+#define MAXNAMES 10
+
+For the fasta format, the name is taken as the first string after the '>'
+character, stopping at the first white space. (Previously, the first 10
+characters were taken, replacing blanks by underscores).
+
+6. The bootstrap values written in the phylip tree file format can be assigned
+either to branches or nodes. The default is to write the values on the nodes,
+as this can be read by several commonly-used tree display programs. But note
+that this can lead to confusion if the tree is rooted and the bootstraps may
+be better attached to the internal branches: Software developers should ensure
+they can read the branch label format.
+
+7. The sequence weighting used during sequence to profile alignments has been
+changed. The tree weight is now multiplied by the percent identity of the
+new sequence compared with the most closely related sequence in the profile.
+
+8. The sequence weighting used during profile to profile alignments has been
+changed. A guide tree is now built for each profile separately and the
+sequence weights calculated from the two trees. The weights for each
+sequence are then multiplied by the percent identity of the sequence compared
+with the most closely related sequence in the opposite profile.
+
+9. The adjustment of the Gap Opening and Gap Extension Penalties for sequences
+of unequal length has been improved.
+
+10. The default order of the sequences in the output alignment file has been
+changed. Previously the default was to output the sequences in the same order
+as the input file. Now the default is to use the order in which the sequences
+were aligned (from the guide tree/dendrogram), thus automatically grouping
+closely related sequences.
+
+11. The option to 'Reset Gaps between alignments' has been switched off by
+default.
+
+12. The conservation line output in the clustal format alignment file has been
+changed. Three characters are now used:
+'*' indicates positions which have a single, fully conserved residue
+':' indicates that one of the following 'strong' groups is fully conserved:-
+ STA
+ NEQK
+ NHQK
+ NDEQ
+ QHRK
+ MILV
+ MILF
+ HY
+ FYW
+
+'.' indicates that one of the following 'weaker' groups is fully conserved:-
+ CSA
+ ATV
+ SAG
+ STNK
+ STPA
+ SGND
+ SNDEQK
+ NDEQHK
+ NEQHRK
+ FVLIM
+ HFY
+
+These are all the positively scoring groups that occur in the Gonnet Pam250
+matrix. The strong and weak groups are defined as strong score >0.5 and weak
+score <=0.5 respectively.
+
+13. A bug in the modification of the Myers and Miller alignment algorithm
+for residue-specific gap penalties has been fixed. This occasionally caused
+new gaps to be opened a few residues away from the optimal position.
+
+14. The GCG/MSF input format no longer needs the word PILEUP on the first
+line. Several versions can now be recognised:-
+ 1. The word PILEUP as the first word in the file
+ 2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ as the first word in the file
+ 3. The characters MSF on the first line of the file, and the
+ characters .. at the end of that line.
+
+15. The standard command line separator for UNIX systems has been changed from
+'/' to '-'. ie. to give options on the command line, you now type
+
+ clustalw input.aln -gapopen=8.0
+
+instead of clustalw input.aln /gapopen=8.0
+
+
+ ATTENTION SOFTWARE DEVELOPERS!!
+ -------------------------------
+
+The CLUSTAL sequence alignment output format has been modified:
+
+1. Names longer than 10 chars are now allowed. (The maximum is specified in
+clustalw.h by '#define MAXNAMES'.)
+
+2. The consensus line now consists of three characters: '*',':' and '.'. (Only
+the '*' and '.' were previously used.)
+
+3. An option (not the default) has been added, allowing the user to print out
+sequence numbers at the end of each line of the alignment output.
+
+4. Both RNA bases (U) and base ambiguities are now supported in nucleic acid
+sequences. In the past, all characters (upper or lower case) other than
+a,c,g,t or u were converted to N. Now the following characters are recognised
+and retained in the alignment output: ABCDGHKMNRSTUVWXY (upper or lower case).
+
+5. A Blank line inadvertently added in the version 1.6 header has been taken
+out again.
+
+
+--------------------------------------------------------------
+
+What's New (March 1996) in Version 1.6 (since version 1.5).
+
+
+1) Improved handling of sequences of unequal length. Previously, we
+increased the gap extension penalties for both sequences if the two sequences
+(or groups of previously aligned sequences) were of different lengths.
+Now, we increase the gap opening and extension penalties for the shorter
+sequence only. This helps prevent short sequences being stretched out
+along longer ones.
+
+2) Added the "Gonnet" series of weight matrices (from Gaston Gonnet and
+co-workers at the ETH in Zurich). Fixed a bug in the matrix
+choice menu; now PAM matrices can be selected ok.
+
+3) Added secondary structure/gap penalty masks. These allow you to
+include, in an alignment, a position specific set of gap penalties.
+You can either set a gap opening penalty at each position or specify
+the secondary structure (if protein; alpha helix, beta strand or loop)
+and have gap penalties set automatically. This, basically, is used to make
+gaps harder to open inside helices or strands.
+
+These masks are only used in the "profile alignment" menu. They may be read in
+as part of an alignment in a special format (see the on-line help for
+details) or associated with each sequence, if the sequences are in Swiss Prot
+format and secondary structure information is given. All of the mask
+parameters can be set from the profile alignment menu. Basically, the
+mask is made up of a series of numbers between 1 and 9, one per position.
+The gap opening penalty at a position is calculated as the starting penalty
+multiplied by the mask value at that site.
+
+4) Added command line options /profile and /sequences.
+These allow users to choose between normal profile alignment where the
+two profiles (pre-existing alignments specified in the files
+/profile1= and /profile2=) are merged/aligned with each other (/profile)
+and the case where the individual sequences in /profile2 are aligned
+sequentially with the alignment in /profile1 (/sequences).
+
+5) Fixed bug in modified Myers and Miller algorithm - gap penalty score
+was not always calculated properly for type 2 midpoints. This is the core
+alignment algorithm.
+
+6) Only allows one output file format to be selected from command line
+- ie. multiple output alignment files are not allowed.
+
+7) Fixed 'bad calls to ckfree' error during calculation of phylip distance
+matrix.
+
+8) Fixed command line options /gapopen /gapext /type=protein /negative.
+
+9) Allowed user to change command line separator on UNIX from '/' to '-'.
+This allows unix users to use the more conventional '-' symbol
+for separating command line options. "/" can then be used in unix
+file names on the command line. The symbol that is used
+is specified in the file clustalw.h which must be edited if you
+wish to change it (and the program must then be recompiled). Find the
+block of code in clustalw.h that corresponds to the operating system you
+are using. These blocks are started by one of the following:
+
+#ifdef VMS
+#elif MAC
+#elif MSDOS
+#elif UNIX
+
+On the next line after each is the line:
+
+#define COMMANDSEP '/'
+
+Change this in the appropriate block of code (e.g. the UNIX block) to
+
+#define COMMANDSEP '-'
+
+if you wish to use the "-" character as command separator.
+
+
+
+--------------------------------------------------------------
+
+What's New (April 1995) in Version 1.5 (since version 1.3).
+
+1) ported to MAC and PC. These versions are quite slow unless you
+have a nice beefy machine. On a Power Mac or a Pentium box
+it is nice and fast. Two precompiled versions are supplied for Macs
+(Power mac and old mac versions).
+Mac: 1500 residues by 100 sequences
+Power Mac 3000 " " " "
+PC 1500 " " " "
+
+2) alignment of new sequences to an alignment. Fixed a serious bug
+which assigned weights to the wrong sequences. Now also, weights
+sequences according to distance from the incoming sequence. The
+new weights are: tree weights * similarity to incoming sequence.
+The tree weights are the old weights that we derive from the tree
+connecting all the sequences in the existing alignment.
+
+3) for all platforms, output linelength = 60.
+
+4) Bootstrap files (*.phb): the "final" node (arbitrary trichotomy
+at the end of the neighbor-joining process) is labelled as
+TRICHOTOMY in the bootstrap output files. This is to help
+link bootstrap figures with nodes when you reroot the tree.
+
+5) Command line /bootstrap option now more robust.
+
+--------------------------------------------------------------
+INTRODUCTION
+
+
+
+This document gives some BRIEF notes about usage of the Clustal W
+multiple alignment program for UNIX and VMS machines. Clustal W
+is a major update and rewrite of the Clustal V program which
+was described in:
+
+Higgins, D.G., Bleasby, A.J. and Fuchs, R. (1992)
+CLUSTAL V: improved software for multiple sequence alignment.
+Computer Applications in the Biosciences (CABIOS), 8(2):189-191.
+
+The main new features are a greatly improved (more sensitive)
+multiple alignment procedure for proteins and improved support
+for different file formats. This software was described in:
+
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994)
+CLUSTAL W: improving the sensitivity of progressive multiple
+sequence alignment through sequence weighting, position specific
+gap penalties and weight matrix choice.
+Nucleic Acids Research, 22(22):4673-4680.
+
+
+The usage of Clustal W is largely the same as for
+Clustal V, details of which are described in clustalv.doc. Details of the
+new alignment algorithms are described in the manuscript by
+Thompson et al. above, an ascii/text version of which is included
+(clustalw.ms). This file lists some of the details not covered by either
+of the above documents.
+
+
+There are brief notes on the following topics:
+
+1) Installation for VMS and UNIX and MAC and PC
+2) File input
+3) file output
+4) changes to the alignment algorithms
+5) minor modifications to the phylogenetic tree and bootstrapping methods
+6) summary of the command line usage.
+
+-------------------------------------------------------------------
+
+1) INSTALLATION (for Unix, VAX/VMS, PC and MAC)
+
+
+
+*****IMPORTANT*****
+If you wish to recompile the program (or compile it for the first
+time; you will have to do this with UNIX):
+first check the file CLUSTALW.H which needs to be changed if you
+move the code between unix and vms machines. At the top
+of the file are four lines which define one of VMS, MSDOS, MAC or
+UNIX to be 1. All of these EXCEPT one must be commented out
+using enclosed /* ... */.
+*******************
+
+
+Unix
+-----
+
+Make files are supplied for unix machines. The code was compiled and
+tested using Decstation (Ultrix), SUN (Gnu C compiler/gcc), Silicon
+Graphics (IRIX) and DEC/Alpha (OSF1). We have not tested the code on any other
+systems. Just use makefile to make on most systems. For Sun, you need to
+have the GNU C (gcc) compiler installed ... use the file makefile.sun in this
+case. You make the program with:
+make (or make -f makefile.sun)
+
+This produces the file clustalw which can be run by typing clustalw and
+pressing return. The help file is called clustalw_help
+
+
+VMS
+----
+
+There is a small DCL command file (VMSLINK.COM) to compile and link the
+code for VMS machines (vax or alpha). This procedure just compiles the
+source files and links using default settings. Run it using:
+$ @vmslink
+This produces Clustalw.exe which can be run using the run command:
+$ run clustalw
+
+The intermediate object files can be deleted with:
+$ del *.obj;
+
+There is an extensive command line facility. To use this, you must
+create a symbol to run the program (and put this in your login.com file).
+e.g.
+$ clustalw :== $$drive:[dir.dir]clustalw
+where $drive is the drive on which the executable file is stored (clustalw.exe)
+and [dir.dir] is the full directory specification. NOTE THE EXTRA DOLLAR SIGN.
+Then the program can be run using the command:
+$ clustalw
+
+
+PC
+__
+
+We supply an executable file (Clustalw.exe) which will run using MSDOS.
+It will also run under windows (as a DOS application)
+*** IF you have a maths coprocessor***. If you do not have a maths chip
+(e.g. 80387), the program can only be run under MSDOS. In the latter case,
+you must have the file EMU387.exe in the same directory as CLUSTALW.EXE.
+This file emulates a maths chip if you do not have one.
+
+
+We generated the executable file using gnu c for MSDOS.
+It will also compile (with about 10,000 warning messages)
+using Microsoft C but we have not tested it and there appear to be problems
+with the executable.
+
+You will need to use a "memory extender" to allow the program to get at more
+than 640kb of memory.
+
+
+
+MAC
+---
+
+The code compiles for Power Mac and older macs using Metrowerks CodeWarrior
+C compiler. We supply 2 executable programs (one each for PowerMac and
+older mac: ClustalwPPC and Clustalw68k). These need up to
+10mb of memory to run which needs to be adjusted with the Get Info (%I)
+command from the Finder if you have problems. Just double click the
+executable file name or icon and off you go (we hope).
+
+As a special treat for Mac users, we supply an executable and brief readme
+file for NJPLOT. This is a really nice program by Manolo Gouy
+(University of Lyon, France) that allows you to import the trees
+made by Clustal W and display them/manipulate them. It will properly
+display the bootstrap figures from the *.phb files. It can export the
+trees in PICT format which can then be used by MacDraw for example.
+
+
+-------------------------------------------------------------------------
+
+2) FILE INPUT (sequences to be aligned)
+
+
+
+The sequences must all be in one file (or two files for a "profile alignment")
+in ONE of the following formats:
+
+FASTA (Pearson), NBRF/PIR, EMBL/Swiss Prot, GDE, CLUSTAL, GCG/MSF, GCG9/RSF.
+
+The program tries to "guess" which format is being used and whether
+the sequences are nucleic acid (DNA/RNA) or amino acid (proteins). The
+format is recognised by the first characters in the file. This is kind
+of stupid/crude but works most of the time and it is difficult
+to do reliably any other way.
+
+
+Format First non blank word or character in the file.
+...............................................................
+FASTA >
+NBRF >P1; or >D1;
+EMBL/SWISS ID
+GDE protein %
+GDE nucleotide #
+CLUSTAL CLUSTAL (blocked multiple alignments)
+GCG/MSF PILEUP or !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ or MSF on the first line, and '..' at the end of line
+GCG9/RSF !!RICH_SEQUENCE
+
+Note, that the only way of spotting that a file is MSF format is if
+the word PILEUP appears at the very beginning of the file. If you
+produce this format from software other than the GCG pileup program,
+then you will have to insert the word PILEUP at the start of the file.
+Similarly, if you use clustal format, the word CLUSTAL must appear first.
+
+All of these formats can be used to read in AN EXISTING FULL ALIGNMENT.
+With CLUSTAL format, this is just the same as the output format of this
+program and Clustal V. If you use PILEUP or CLUSTAL format, all sequences
+must be the same length, INCLUDING GAPS ("-" in clustal format; "." in MSF).
+With the other formats, sequences can be gapped with "-" characters. If you
+read in any gaps these are kept during any later alignments. You can use
+this facility to read in an alignment in order to calculate a phylogenetic
+tree OR to output the same alignment in a different format (from the
+output format options menu of the multiple alignment menu) e.g. read
+in a GCG/MSF format alignment and output a PHYLIP format alignment. This is
+also useful to read in one reference alignment and to add one or more new
+sequences to it using the "profile alignment" facilities.
+
+DNA vs. PROTEIN: the program will count the number of A,C,G,T,U and N
+characters. If 85% or more of the characters in a sequence are as above,
+then DNA/RNA is assumed, protein otherwise.
+
+-------------------------------------------------------------------------
+
+
+3) FILE OUTPUT
+
+
+1) the alignments.
+
+In the multiple alignment and profile alignment menus, there is a menu
+item to control the output format(s).
+
+The alignment output format can be set to any (or all) of:
+CLUSTAL (a self explanatory blocked alignment)
+NBRF/PIR (same as input format but with "-" characters for gaps)
+MSF (the main GCG package multiple alignment format)
+PHYLIP (Joe Felsenstein's phylogeny inference package. Gaps are set to
+ "-" characters. For some programs (e.g. PROTPARS/DNAPARS) these
+ should be changed to "?" characters for unknown residues.)
+GDE (Used by Steven Smith's GDE package)
+
+You can also choose between having the sequences in the same order as in
+the input file or writing them out in an order that more closely matches the
+order used to carry out the multiple alignment.
+
+
+2) The trees.
+
+Believe it or not, we now use the New Hampshire (nested parentheses)
+format as default for our trees. This format is compatible with e.g. the
+PHYLIP package. If you want to view a tree, you can use the RETREE or
+DRAWGRAM/DRAWTREE programs of PHYLIP. This format is used for all our
+trees, even the initial guide trees for deciding the order of multiple
+alignment. The output trees from the phylogenetic tree menu can also be
+requested in our old verbose/cryptic format. This may be more useful
+if, for example, you wish to see the bootstrap figures. The bootstrap
+trees in the default New Hampshire format give the bootstrap figures
+as extra labels which can be viewed very easily using TREETOOL which is
+available as part of the GDE package. TREETOOL is available from the
+RDP project by ftp from rdp.life.uiuc.edu.
+
+The New Hampshire format is only useful if you have software to display or
+manipulate the trees. The PHYLIP package is highly recommended if you intend
+to do much work with trees and includes programs for doing this. If you do
+not have such software, request the trees in the older clustal format
+and see the documentation for Clustal V (clustalv.doc). WE DO NOT PROVIDE
+ANY DIRECT MEANS FOR VIEWING TREES GRAPHICALLY.
+
+-------------------------------------------------------------------------
+
+4) THE ALIGNMENT ALGORITHMS
+
+
+The basic algorithm is the same as for Clustal V and is described in some
+detail in clustalv.doc. The new modifications are described in detail in
+clustalw.ms. Here we just list some notes to help answer some of the most
+obvious questions.
+
+
+Terminal Gaps
+
+In the original Clustal V program, terminal gaps were penalised the same
+as all other gaps. This caused some ugly side effects e.g.
+
+acgtacgtacgtacgt acgtacgtacgtacgt
+a----cgtacgtacgt gets the same score as ----acgtacgtacgt
+
+NOW, terminal gaps are free. This is better on average and stops silly
+effects like single residues jumping to the edge of the alignment. However,
+it is not perfect. It does mean that if there should be a gap near the end
+of the alignment, the program may be reluctant to insert it i.e.
+
+cccccgggccccc cccccgggccccc
+ccccc---ccccc may be considered worse (lower score) than cccccccccc---
+
+In the right hand case above, the terminal gap is free and may score higher
+than the left hand alignment. This can be prevented by lowering the gap
+opening and extension penalties. It is difficult to get this right all the
+time. Please watch the ends of your alignments.
+
+
+
+Speed of the initial (pairwise) alignments (fast approximate/slow accurate)
+
+By default, the initial pairwise alignments are now carried out using a full
+dynamic programming algorithm. This is more accurate than the older hash/
+k-tuple based alignments (Wilbur and Lipman) but is MUCH slower. On a fast
+workstation you may not notice but on a slow box, the difference is extreme.
+You can set the alignment method from the menus easily to the older, faster
+method.
+
+
+
+Delaying alignment of distant sequences
+
+The user can set a cut off to delay the alignment of the most divergent
+sequences in a data set until all other sequences have been aligned. By
+default, this is set to 40% which means that if a sequence is less than 40%
+identical to any other sequence, its alignment will be delayed.
+
+
+
+Iterative realignment/Reset gaps between alignments
+
+By default, if you align a set of sequences a second time (e.g. with changed
+gap penalties), the gaps from the first alignment are discarded. You can
+set this from the menus so that older gaps will be kept between alignments.
+This can sometimes give better alignments by keeping the gaps (do not reset
+them) and doing the full multiple alignment a second time. Sometimes, the
+alignment will converge on a better solution; sometimes the new alignment will
+be the same as the first. There can be a strange side effect: you can get
+columns of nothing but gaps introduced.
+
+Any gaps that are read in from the input file are always kept, regardless
+of the setting of this switch. If you read in a full multiple alignment, the "reset
+gaps" switch has no effect. The old gaps will remain and if you carry out
+a multiple alignment, any new gaps will be added in. If you wish to carry out
+a full new alignment of a set of sequences that are already aligned in a file
+you must input the sequences without gaps.
+
+
+
+Profile alignment
+
+By profile alignment, we simply mean the alignment of old alignments/sequences.
+In this context, a profile is just an existing alignment (or even a set of
+unaligned sequences; see below). This allows you to
+read in an old alignment (in any of the allowed input formats) and align
+one or more new sequences to it. From the profile alignment menu, you
+are allowed to read in 2 profiles. Either profile can be a full alignment
+OR a single sequence. In the simplest mode, you simply align the two profiles
+to each other. This is useful if you want to gradually build up a full
+multiple alignment.
+
+A second option is to align the sequences from the second profile, one at
+a time to the first profile. This is done, taking the underlying tree between
+the sequences into account. This is useful if you have a set of new sequences
+(not aligned) and you wish to add them all to an older alignment.
+
+----------------------------------------------------------------------------
+
+5) CHANGES TO THE PHYLOGENETIC TREE CALCULATIONS AND SOME HINTS.
+
+
+
+IMPROVED DISTANCE CALCULATIONS FOR PROTEIN TREES
+
+
+The phylogenetic trees in Clustal W (the real trees that you calculate
+AFTER alignment; not the guide trees used to decide the branching order
+for multiple alignment) use the Neighbor-Joining method of Saitou and
+Nei based on a matrix of "distances" between all sequences. These distances
+can be corrected for "multiple hits". This is normal practice when accurate
+trees are needed. This correction stretches distances (especially large ones)
+to try to correct for the fact that OBSERVED distances (mean number of
+differences per site) greatly underestimate the actual number that happened
+during evolution.
+
+In Clustal V we used a simple formula to convert an observed distance to one
+that is corrected for multiple hits. The observed distance is the mean number
+of differences per site in an alignment (ignoring sites with a gap) and is
+therefore always between 0.0 (for identical sequences) and 1.0 (no residues the
+same at any site). These distances can be multiplied by 100 to give percent
+difference values. 100 minus percent difference gives percent identity.
+The formula we use to correct for multiple hits is from Motoo Kimura
+(Kimura, M. The neutral Theory of Molecular Evolution, Camb.Univ.Press, 1983,
+page 75) and is:
+
+K = -Ln(1 - D - (D.D)/5) where D is the observed distance and K is
+ corrected distance.
+
+This formula gives mean number of estimated substitutions per site and, in
+contrast to D (the observed number), can be greater than 1 i.e. more than
+one substitution per site, on average. For example, if you observe 0.8
+differences per site (80% difference; 20% identity), then the above formula
+predicts that there have been 2.5 substitutions per site over the course
+of evolution since the 2 sequences diverged. This can also be expressed in
+PAM units by multiplying by 100 (mean number of substitutions per 100 residues).
+The PAM scale of evolution and its derivation/calculation comes from the
+work of Margaret Dayhoff and co workers (the famous Dayhoff PAM series
+of weight matrices also came from this work). Dayhoff et al constructed
+an elaborate model of protein evolution based on observed frequencies
+of substitution between very closely related proteins. Using this model,
+they derived a table relating observed distances to predicted PAM distances.
+Kimura's formula, above, is just a "curve fitting" approximation to this table.
+It is very accurate in the range 0.75 > D > 0.0 but becomes increasingly
+inaccurate at high D (>0.75) and fails completely at around D = 0.85.
+
+To circumvent this problem, we calculated all the values for K corresponding
+to D above 0.75 directly using the Dayhoff model and store these in an
+internal table, used by Clustal W. This table is declared in the file dayhoff.h and
+gives values of K for all D between 0.75 and 0.93 in intervals of 0.001 i.e.
+for D = 0.750, 0.751, 0.752 ...... 0.929, 0.930. For any observed D
+higher than 0.930, we arbitrarily set K to 10.0. This sounds drastic but
+with real sequences, distances of 0.93 (less than 7% identity) are rare.
+If your data set includes sequences with this degree of divergence, you
+will have great difficulty getting accurate trees by ANY method; the alignment
+itself will be very difficult (to construct and to evaluate).
+
+There are some important
+things to note. Firstly, this formula works well if your sequences are
+of average amino acid composition and if the amino acids substitute according
+to the original Dayhoff model. In other cases, it may be misleading. Secondly,
+it is based only on observed percent distance i.e. it does not DIRECTLY
+take conservative substitutions into account. Thirdly, the error on the
+estimated PAM distances may be VERY great for high distances; at very high
+distance (e.g. over 85%) it may give largely arbitrary corrected distances.
+In most cases, however, the correction is still worth using; the trees will
+be more accurate and the branch lengths will be more realistic.
+
+A far more sophisticated distance correction based on a full Dayhoff
+model which DOES take conservative substitutions and actual amino acid
+composition into account, may be found in the PROTDIST program of the
+PHYLIP package. For serious tree makers, this program is highly recommended.
+
+
+
+TWO NOTES ON BOOTSTRAPPING...
+
+When you use the BOOTSTRAP in Clustal W to estimate the reliability of parts
+of a tree, many of the uncorrected distances may randomly exceed the arbitrary cut
+off of 0.93 (sequences only 7% identical) if the sequences are distantly
+related. This will happen randomly i.e. even if none of the pairs of
+sequences are less than 7% identical, the bootstrap samples may contain pairs
+of sequences that do exceed this cut off.
+If this happens, you will be warned. In practice, this can
+happen with many data sets. It is not a serious problem if it happens rarely.
+If it does happen (you are warned when it happens and told how often the
+problem occurs), you should consider removing the most distantly
+related sequences and/or using the PHYLIP package instead.
+
+
+A further problem arises in almost exactly the opposite situation: when
+you bootstrap a data set which contains 3 or more sequences that are identical
+or almost identical. Here, the sets of identical sequences should be shown
+as a multifurcation (several sequences joining at the same part of the tree).
+Because the Neighbor-Joining method only gives strictly dichotomous trees
+(never more than 2 sequences join at one time), this cannot be exactly
+represented. In practice, this is NOT a problem as there will be some
+internal branches of zero length separating the sequences. If you
+display the tree with all branch lengths, you will still see a multifurcation.
+However, when you bootstrap
+the tree, only the branching orders are stored and counted. In the case
+of multifurcations, the exact branching order is arbitrary but the program
+will always get the same branching order, depending only on the input order
+of the sequences. In practice, this is only a problem in situations where
+you have a set of sequences where all of them are VERY similar. In this case,
+you can find very high support for some groupings which will disappear if you
+run the analysis with a different input order. Again, the PHYLIP package
+deals with this by offering a JUMBLE option to shuffle the input order
+of your sequences between each bootstrap sample.
+
+----------------------------------------------------------------------------
+
+6) SUMMARY OF THE COMMAND LINE USAGE
+
+Clustal W is designed to be run interactively. However, there are many
+situations where it is convenient to run it from the command line, especially
+if you wish to run it from another piece of software (e.g. SeqApp or GDE).
+All parameters can be set from the command line by giving options after the
+clustalw command. On UNIX, options should be preceded by '-'; all other systems
+use the '/' character.
+
+If anything is put on the command line, the program will (attempt to) carry
+out whatever is requested and will exit. If you wish to use the command
+line to set some parameters and then go into interactive mode, use the
+command line switch: interactive .... e.g.
+
+clustalw -quicktree -interactive on UNIX
+or
+clustalw /quicktree /interactive on VMS,MAC and PC
+
+will set the default initial alignment mode to fast/approximate and will then
+go to the main menu.
+
+
+To see a list of all the command line parameters, type:
+
+clustalw -options on UNIX
+or
+clustalw /options on VMS,MAC and PC
+
+and you will see a list with no explanation.
+
+
+To get (VERY BRIEF) help on command line usage, use the /HELP or /CHECK
+(-help or -check on UNIX systems) options. Otherwise, the command line
+usage is self explanatory or is explained in clustalv.doc. The defaults
+for all parameters are set in the file param.h which can be changed easily
+(remember to recompile the program afterwards :-).
+
+------------------------------------------------------------------------------
Property changes on: trunk/packages/clustalw/branches/upstream/current/clustalw.doc
___________________________________________________________________
Name: svn:executable
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalw.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalw.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalw.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,250 @@
+/*#include "/us1/user/julie/dmalloc/malloc.h"*/
+/*********************CLUSTALW.H*********************************************/
+/****************************************************************************/
+
+ /*
+ Main header file for ClustalW. Uncomment ONE of the following 4 lines
+ depending on which compiler you wish to use.
+ */
+
+/*#define VMS 1 VAX or ALPHA VMS */
+
+/*#define MAC 1 Think_C for Macintosh */
+
+/*#define MSDOS 1 Turbo C for PC's */
+
+#define UNIX 1 /*Ultrix/Decstation, Gnu C for
+ Sun, IRIX/SGI, OSF1/ALPHA */
+
+/***************************************************************************/
+/***************************************************************************/
+
+
+#include "general.h"
+
+#define MAXNAMES 30 /* Max chars read for seq. names */
+#define MAXTITLES 60 /* Title length */
+#define FILENAMELEN 256 /* Max. file name length */
+
+#define UNKNOWN 0
+#define EMBLSWISS 1
+#define PIR 2
+#define PEARSON 3
+#define GDE 4
+#define CLUSTAL 5 /* DES */
+#define MSF 6 /* DES */
+#define RSF 7 /* JULIE */
+#define USER 8 /* DES */
+#define PHYLIP 9 /* DES */
+#define NEXUS 10/* DES */
+#define FASTA 11/* Ramu */
+
+#define NONE 0
+#define SECST 1
+#define GMASK 2
+
+#define PROFILE 0
+#define SEQUENCE 1
+
+#define BS_NODE_LABELS 2
+#define BS_BRANCH_LABELS 1
+
+#define PAGE_LEN 22 /* Number of lines of help sent to screen */
+
+#define PAGEWIDTH 80 /* maximum characters on output file page */
+#define LINELENGTH 60 /* Output file line length */
+#define GCG_LINELENGTH 50
+
+#ifdef VMS /* Defaults for VAX VMS */
+#define COMMANDSEP '/'
+#define DIRDELIM ']' /* Last character before file name in full file
+ specs */
+#define INT_SCALE_FACTOR 1000 /* Scaling factor to convert float to integer for profile scores */
+
+#elif MAC /* Defaults when MAC is defined non-zero */
+#define COMMANDSEP '/'
+#define DIRDELIM ':'
+#define INT_SCALE_FACTOR 100 /* Scaling factor to convert float to integer for profile scores */
+
+#elif MSDOS /* Defaults when MSDOS is defined non-zero */
+#define COMMANDSEP '/'
+#define DIRDELIM '\\'
+#define INT_SCALE_FACTOR 100 /* Scaling factor to convert float to integer for profile scores */
+
+#elif UNIX /* Defaults when UNIX is defined non-zero (the platform selected at the top of this header) */
+#define COMMANDSEP '-'
+#define DIRDELIM '/'
+#define INT_SCALE_FACTOR 1000 /* Scaling factor to convert float to integer for profile scores */
+#endif /* NOTE: there is no #else branch; if none of VMS/MAC/MSDOS/UNIX is selected, these three macros stay undefined */
+
+#define NUMRES 32 /* max size of comparison matrix */
+
+#define INPUT 0
+#define ALIGNED 1
+
+#define LEFT 1
+#define RIGHT 2
+
+#define NODE 0
+#define LEAF 1
+
+#define GAPCOL 32 /* position of gap open penalty in profile */
+#define LENCOL 33 /* position of gap extension penalty in profile */
+
+typedef struct node { /* phylogenetic tree structure: one node, linked to other nodes of the same type */
+ struct node *left; /* link to another node (left branch) */
+ struct node *right; /* link to another node (right branch) */
+ struct node *parent; /* link to another node; presumably the one above this node -- TODO confirm */
+ float dist; /* presumably a branch length / distance -- TODO confirm against calctree.c */
+ sint leaf; /* presumably takes the NODE (0) / LEAF (1) values #defined in this header -- TODO confirm */
+ int order;
+ char name[64]; /* fixed-size character buffer for the node name */
+} stree, *treeptr; /* stree names the node type, treeptr a pointer to it */
+
+typedef struct { /* one menu entry: two fixed 30-char text fields */
+ char title[30];
+ char string[30];
+} MatMenuEntry;
+
+typedef struct { /* a fixed-capacity list of MatMenuEntry items */
+ int noptions; /* presumably the number of opt[] slots in use -- TODO confirm */
+ MatMenuEntry opt[10]; /* capacity is a literal 10, not tied to any named constant */
+} MatMenu;
+
+#define MAXMAT 10 /* capacity of UserMatSeries.mat[] */
+
+typedef struct { /* one matrix belonging to a series */
+ int llimit; /* presumably the lower limit of the range this matrix applies to -- TODO confirm */
+ int ulimit; /* presumably the upper limit of that range -- TODO confirm */
+ short *matptr; /* pointer to short data; same type as the matptr argument of get_matrix() */
+ short *aa_xref; /* pointer to short data; same type as the xref argument of get_matrix() */
+} SeriesMat;
+
+typedef struct { /* a series holding up to MAXMAT SeriesMat entries */
+ int nmat; /* presumably the number of mat[] entries in use -- TODO confirm */
+ SeriesMat mat[MAXMAT];
+} UserMatSeries;
+
+
+/*
+ Prototypes
+*/
+
+/* alnscore.c */
+void aln_score(void);
+/* interface.c */
+void parse_params(Boolean);
+void init_amenu(void);
+void init_interface(void);
+void main_menu(void);
+FILE *open_output_file(char *, char *, char *, char *);
+FILE *open_explicit_file(char *);
+sint seq_input(Boolean);
+Boolean open_alignment_output(char *);
+void create_alignment_output(sint fseq,sint lseq);
+void align(char *phylip_name);
+void profile_align(char *p1_tree_name,char *p2_tree_name);/* Align 2 alignments */
+void make_tree(char *phylip_name);
+void get_tree(char *phylip_name);
+sint profile_input(void); /* read a profile */
+void new_sequence_align(char *phylip_name);
+Boolean user_mat(char *, short *, short *);
+Boolean user_mat_series(char *, short *, short *);
+void get_help(char);
+void clustal_out(FILE *, sint, sint, sint, sint);
+void nbrf_out(FILE *, sint, sint, sint, sint);
+void gcg_out(FILE *, sint, sint, sint, sint);
+void phylip_out(FILE *, sint, sint, sint, sint);
+void gde_out(FILE *, sint, sint, sint, sint);
+void nexus_out(FILE *, sint, sint, sint, sint);
+void fasta_out(FILE *, sint, sint, sint, sint);
+void print_sec_struct_mask(int prf_length,char *mask,char *struct_mask);
+void fix_gaps(void);
+
+
+/* calcgapcoeff.c */
+void calc_gap_coeff(char **alignment, sint *gaps, sint **profile, Boolean struct_penalties,
+ char *gap_penalty_mask, sint first_seq, sint last_seq,
+ sint prf_length, sint gapcoef, sint lencoef);
+/* calcprf1.c */
+void calc_prf1(sint **profile, char **alignment, sint *gaps, sint matrix[NUMRES ][NUMRES ],
+ sint *seq_weight, sint prf_length, sint first_seq, sint last_seq);
+/* calcprf2.c */
+void calc_prf2(sint **profile, char **alignment, sint *seq_weight, sint prf_length,
+ sint first_seq, sint last_seq);
+/* calctree.c */
+void calc_seq_weights(sint first_seq, sint last_seq,sint *seq_weight);
+void create_sets(sint first_seq, sint last_seq);
+sint read_tree(char *treefile, sint first_seq, sint last_seq);
+void clear_tree(treeptr p);
+sint calc_similarities(sint nseqs);
+/* clustalw.c */
+int main(int argc, char **argv);
+/* gcgcheck.c */
+int SeqGCGCheckSum(char *seq, sint len);
+/* malign.c */
+sint malign(sint istart,char *phylip_name);
+sint seqalign(sint istart,char *phylip_name);
+sint palign1(void);
+float countid(sint s1, sint s2);
+sint palign2(char *p1_tree_name,char *p2_tree_name);
+/* pairalign.c */
+sint pairalign(sint istart, sint iend, sint jstart, sint jend);
+/* prfalign.c */
+lint prfalign(sint *group, sint *aligned);
+/* random.c */
+unsigned long linrand(unsigned long r);
+unsigned long addrand(unsigned long r);
+void addrandinit(unsigned long s);
+/* readmat.c */
+void init_matrix(void);
+sint get_matrix(short *matptr, short *xref, sint matrix[NUMRES ][NUMRES ], Boolean neg_flag,
+ sint scale);
+sint read_user_matrix(char *filename, short *usermat, short *xref);
+sint read_matrix_series(char *filename, short *usermat, short *xref);
+int getargs(char *inline1, char *args[], int max);
+/* sequence.c */
+void fill_chartab(void);
+sint readseqs(sint first_seq);
+/* showpair.c */
+void show_pair(sint istart, sint iend, sint jstart, sint jend);
+/* trees.c */
+void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name);
+void bootstrap_tree(char *phylip_name,char *clustal_name, char *nexus_name);
+sint dna_distance_matrix(FILE *tree);
+sint prot_distance_matrix(FILE *tree);
+void guide_tree(FILE *tree,int first_seq,sint nseqs);
+
+void calc_percidentity(FILE *pfile);
+
+/* util.c */
+
+void alloc_aln(sint nseqs);
+void realloc_aln(sint first_seq,sint nseqs);
+void free_aln(sint nseqs);
+void alloc_seq(sint seq_no,sint length);
+void realloc_seq(sint seq_no,sint length);
+void free_seq(sint seq_no);
+
+void *ckalloc(size_t bytes);
+void *ckrealloc(void *ptr, size_t bytes);
+void *ckfree(void *ptr);
+char prompt_for_yes_no(char *title,char *prompt);
+void fatal(char *msg, ...);
+void error(char *msg, ...);
+void warning(char *msg, ...);
+void info(char *msg, ...);
+char *rtrim(char *str);
+char *blank_to_(char *str);
+char *upstr(char *str);
+char *lowstr(char *str);
+void getstr(char *instr, char *outstr);
+double getreal(char *instr, double minx, double maxx, double def);
+int getint(char *instr, int minx, int maxx, int def);
+void do_system(void);
+Boolean linetype(char *line, char *code);
+Boolean keyword(char *line, char *code);
+Boolean blankline(char *line);
+void get_path(char *str, char *path);
+
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalw.ms
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalw.ms 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalw.ms 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,794 @@
+This is just an ASCII text version of the manuscript describing
+Clustal W, without the figures. It was published:
+
+Nucleic Acids Research, 22(22):4673-4680.
+
+
+
+CLUSTAL W: improving the sensitivity of progressive multiple
+sequence alignment through sequence weighting, position specific
+gap penalties and weight matrix choice.
+
+
+
+Julie D. Thompson, Desmond G. Higgins1 and Toby J. Gibson*
+
+European Molecular Biology Laboratory
+Postfach 102209
+Meyerhofstrasse 1
+D-69012 Heidelberg
+Germany
+
+
+Phone: +49-6221-387398
+Fax: +49-6221-387306
+E-mail: Gibson at EMBL-Heidelberg.DE
+ Des.Higgins at EBI.AC.UK
+ Thompson at EMBL-Heidelberg.DE
+
+
+Keywords: Multiple alignment, phylogenetic tree, weight matrix, gap
+ penalty, dynamic programming, sequence weighting.
+
+
+1 Current address:
+European Bioinformatics Institute
+Hinxton Hall
+Hinxton
+Cambridge CB10 1RQ
+UK.
+
+* To whom correspondence should be addressed
+
+
+ABSTRACT
+
+The sensitivity of the commonly used progressive multiple sequence
+alignment method has been greatly improved for the alignment of divergent
+protein sequences. Firstly, individual weights are assigned to each sequence
+in a partial alignment in order to downweight near-duplicate sequences and
+upweight the most divergent ones. Secondly, amino acid substitution
+matrices are varied at different alignment stages according to the divergence
+of the sequences to be aligned. Thirdly, residue specific gap penalties and
+locally reduced gap penalties in hydrophilic regions encourage new gaps in
+potential loop regions rather than regular secondary structure. Fourthly,
+positions in early alignments where gaps have been opened receive locally
+reduced gap penalties to encourage the opening up of new gaps at these
+positions. These modifications are incorporated into a new program,
+CLUSTAL W which is freely available.
+
+
+INTRODUCTION
+
+The simultaneous alignment of many nucleotide or amino acid sequences is
+now an essential tool in molecular biology. Multiple alignments are used to
+find diagnostic patterns to characterise protein families; to detect or
+demonstrate homology between new sequences and existing families of
+sequences; to help predict the secondary and tertiary structures of new
+sequences; to suggest oligonucleotide primers for PCR; and as an essential prelude
+to molecular evolutionary analysis. The rate of appearance of new sequence
+data is steadily increasing and the development of efficient and accurate
+automatic methods for multiple alignment is, therefore, of major
+importance. The majority of automatic multiple alignments are now carried
+out using the "progressive" approach of Feng and Doolittle (1). In this paper,
+we describe a number of improvements to the progressive multiple
+alignment method which greatly improve the sensitivity without sacrificing
+any of the speed and efficiency which makes this approach so practical. The
+new methods are made available in a program called CLUSTAL W which is
+freely available and portable to a wide variety of computers and operating
+systems.
+
+In order to align just two sequences, it is standard practice to use dynamic
+programming (2). This guarantees a mathematically optimal alignment,
+given a table of scores for matches and mismatches between all amino acids
+or nucleotides (e.g. the PAM250 matrix (3) or BLOSUM62 matrix (4)) and
+penalties for insertions or deletions of different lengths. Attempts at
+generalising dynamic programming to multiple alignments are limited to
+small numbers of short sequences (5). For much more than eight or so
+proteins of average length, the problem is uncomputable given current
+computer power. Therefore, all of the methods capable of handling larger
+problems in practical timescales make use of heuristics. Currently, the most
+widely used approach is to exploit the fact that homologous sequences are
+evolutionarily related. One can build up a multiple alignment progressively
+by a series of pairwise alignments, following the branching order in a
+phylogenetic tree (1). One first aligns the most closely related sequences,
+gradually adding in the more distant ones. This approach is sufficiently fast
+to allow alignments of virtually any size. Further, in simple cases, the
+quality of the alignments is excellent, as judged by the ability to correctly align
+corresponding domains from sequences of known secondary or tertiary
+structure (6). In more difficult cases, the alignments give good starting points
+for further automatic or manual refinement.
+
+This approach works well when the data set consists of sequences of different
+degrees of divergence. Pairwise alignment of very closely related sequences
+can be carried out very accurately. The correct answer may often be obtained
+using a wide range of parameter values (gap penalties and weight matrix). By
+the time the most distantly related sequences are aligned, one already has a
+sample of aligned sequences which gives important information about the
+variability at each position. The positions of the gaps that were introduced
+during the early alignments of the closely related sequences are not changed
+as new sequences are added. This is justified because the placement of gaps
+in alignments between closely related sequences is much more accurate than
+between distantly related ones. When all of the sequences are highly
+divergent (e.g. less than approximately 25-30% identity between any pair of
+sequences), this progressive approach becomes much less reliable.
+
+There are two major problems with the progressive approach: the local
+minimum problem and the choice of alignment parameters. The local
+minimum problem stems from the "greedy" nature of the alignment strategy.
+The algorithm greedily adds sequences together, following the initial tree.
+There is no guarantee that the global optimal solution, as defined by some
+overall measure of multiple alignment quality (7,8), or anything close to it,
+will be found. More specifically, any mistakes (misaligned regions) made
+early in the alignment process cannot be corrected later as new information
+from other sequences is added. This problem is frequently thought of as
+mainly resulting from an incorrect branching order in the initial tree. The
+initial trees are derived from a matrix of distances between separately aligned
+pairs of sequences and are much less reliable than trees from complete
+multiple alignments. In our experience, however, the real problem is caused
+simply by errors in the initial alignments. Even if the topology of the guide
+tree is correct, each alignment step in the multiple alignment process may
+have some percentage of the residues misaligned. This percentage will be
+very low on average for very closely related sequences but will increase as
+sequences diverge. It is these misalignments which carry through from the
+early alignment steps that cause the local minimum problem. The only way
+to correct this is to use an iterative or stochastic sampling procedure (e.g.
+7,9,10). We do not directly address this problem in this paper.
+
+The alignment parameter choice problem is, in our view, at least as serious as
+the local minimum problem. Stochastic or iterative algorithms will be just
+as badly affected as progressive ones if the parameters are inappropriate: they
+will arrive at a false global minimum. Traditionally, one chooses one weight
+matrix and two gap penalties (one for opening a new gap and one for
+extending an existing gap) and hopes that these will work well over all parts of
+all the sequences in the data set. When the sequences are all closely related,
+this works. The first reason is that virtually all residue weight matrices give
+most weight to identities. When identities dominate an alignment, almost
+any weight matrix will find approximately the correct solution. With very
+divergent sequences, however, the scores given to non-identical residues will
+become critically important; there will be more mismatches than identities.
+Different weight matrices will be optimal at different evolutionary distances
+or for different classes of proteins.
+
+The second reason is that the range of gap penalty values that will find the
+correct or best possible solution can be very broad for highly similar sequences
+(11). As more and more divergent sequences are used, however, the exact
+values of the gap penalties become important for success. In each case, there
+may be a very narrow range of values which will deliver the best alignment.
+Further, in protein alignments, gaps do not occur randomly (i.e. with equal
+probability at all positions). They occur far more often between the major
+secondary structural elements of alpha helices and beta strands than within
+(12).
+
+The major improvements described in this paper attempt to address the
+alignment parameter choice problem. We dynamically vary the gap
+penalties in a position and residue specific manner. The observed relative
+frequencies of gaps adjacent to each of the 20 amino acids (12) are used to
+locally adjust the gap opening penalty after each residue. Short stretches of
+hydrophilic residues (e.g. 5 or more) usually indicate loop or random coil
+regions and the gap opening penalties are locally reduced in these stretches.
+In addition, the locations of the gaps found in the early alignments are also
+given reduced gap opening penalties. It has been observed in alignments
+between sequences of known structure that gaps tend not to be closer than
+roughly eight residues on average (12). We increase the gap opening penalty
+within eight residues of existing gaps. The two main series of amino acid
+weight matrices that are used today are the PAM series (3) and the BLOSUM
+series (4). In each case, there is a range of matrices to choose from. Some
+matrices are appropriate for aligning very closely related sequences where
+most weight by far is given to identities, with only the most frequent
+conservative substitutions receiving high scores. Other matrices work better
+at greater evolutionary distances where less importance is attached to
+identities (13). We choose different weight matrices, as the alignment
+proceeds, depending on the estimated divergence of the sequences to be
+aligned at each stage.
+
+Sequences are weighted to correct for unequal sampling across all
+evolutionary distances in the data set (14). This downweights sequences that
+are very similar to other sequences in the data set and upweights the most
+divergent ones. The weights are calculated directly from the branch lengths
+in the initial guide tree (15). Sequence weighting has already been shown to
+be effective in improving the sensitivity of profile searches (15,16). In the
+original CLUSTAL programs (17-19), the initial guide trees, used to guide the
+multiple alignment, were calculated using the UPGMA method (20). We
+now use the Neighbour-Joining method (21) which is more robust against the
+effects of unequal evolutionary rates in different lineages and which gives
+better estimates of individual branch lengths. This is useful because it is these
+branch lengths which are used to derive the sequence weights. We also allow
+users to choose between fast approximate alignments (22) or full dynamic
+programming for the distance calculations used to make the guide tree.
+
+The new improvements dramatically improve the sensitivity of the
+progressive alignment method for difficult alignments involving highly
+diverged sequences. We show one very demanding test case of over 60 SH3
+domains (23) which includes sequence pairs with as little as 12% identity and
+where there is only one exactly conserved residue across all of the sequences.
+Using default parameters, we can achieve an alignment that is almost exactly
+correct, according to available structural information (24). Using the program
+in a wide variety of situations, we find that it will normally find the correct
+alignment, in all but the most difficult and pathological of cases.
+
+
+MATERIAL AND METHODS
+
+
+The basic alignment method
+
+The basic multiple alignment algorithm consists of three main stages: 1) all
+pairs of sequences are aligned separately in order to calculate a distance matrix
+giving the divergence of each pair of sequences; 2) a guide tree is calculated
+from the distance matrix; 3) the sequences are progressively aligned according
+to the branching order in the guide tree. An example using 7 globin
+sequences of known tertiary structure (25) is given in figure 1.
+
+
+1) The distance matrix/pairwise alignments
+
+In the original CLUSTAL programs, the pairwise distances were calculated
+using a fast approximate method (22). This allows very large numbers of
+sequences to be aligned, even on a microcomputer. The scores are calculated
+as the number of k-tuple matches (runs of identical residues, typically 1 or 2
+long for proteins or 2 to 4 long for nucleotide sequences) in the best alignment
+between two sequences minus a fixed penalty for every gap. We now offer a
+choice between this method and the slower but more accurate scores from full
+dynamic programming alignments using two gap penalties (for opening or
+extending gaps) and a full amino acid weight matrix. These scores are
+calculated as the number of identities in the best alignment divided by the
+number of residues compared (gap positions are excluded). Both of these
+scores are initially calculated as percent identity scores and are converted to
+distances by dividing by 100 and subtracting from 1.0 to give number of
+differences per site. We do not correct for multiple substitutions in these
+initial distances. In figure 1 we give the 7x7 distance matrix between the 7
+globin sequences calculated using the full dynamic programming method.
+
+
+2) The guide tree
+
+The trees used to guide the final multiple alignment process are calculated
+from the distance matrix of step 1 using the Neighbour-Joining method (21).
+This produces unrooted trees with branch lengths proportional to estimated
+divergence along each branch. The root is placed by a "mid-point" method
+(15) at a position where the means of the branch lengths on either side of the
+root are equal. These trees are also used to derive a weight for each sequence
+(15). The weights are dependent upon the distance from the root of the tree
+but sequences which have a common branch with other sequences share the
+weight derived from the shared branch. In the example in figure 1, the
+leghaemoglobin (Lgb2_Luplu) gets a weight of 0.442 which is equal to the
+length of the branch from the root to it. The Human beta globin
+(Hbb_Human) gets a weight consisting of the length of the branch leading to
+it that is not shared with any other sequences (0.081) plus half the length of
+the branch shared with the horse beta globin (0.226/2) plus one quarter the
+length of the branch shared by all four haemoglobins (0.061/4) plus one fifth
+the branch shared between the haemoglobins and the myoglobin (0.015/5)
+plus one sixth the branch leading to all the vertebrate globins (0.062/6). This
+sums to a total of 0.221. By contrast, in the normal progressive alignment
+algorithm, all sequences would be equally weighted. The rooted tree with
+branch lengths and sequence weights for the 7 globins is given in figure 1.
+
+
+3) Progressive alignment
+
+The basic procedure at this stage is to use a series of pairwise alignments to
+align larger and larger groups of sequences, following the branching order in
+the guide tree. You proceed from the tips of the rooted tree towards the root.
+In the globin example in figure 1 you align the sequences in the following
+order: human vs. horse beta globin; human vs. horse alpha globin; the 2
+alpha globins vs. the 2 beta globins; the myoglobin vs. the haemoglobins; the
+cyanohaemoglobin vs. the haemoglobins plus myoglobin; the leghaemoglobin
+vs. all the rest. At each stage a full dynamic programming (26,27) algorithm is
+used with a residue weight matrix and penalties for opening and extending
+gaps. Each step consists of aligning two existing alignments or sequences.
+Gaps that are present in older alignments remain fixed. In the basic
+algorithm, new gaps that are introduced at each stage get full gap opening and
+extension penalties, even if they are introduced inside old gap positions (see
+the section on gap penalties below for modifications to this rule). In order to
+calculate the score between a position from one sequence or alignment and
+one from another, the average of all the pairwise weight matrix scores from
+the amino acids in the two sets of sequences is used i.e. if you align 2
+alignments with 2 and 4 sequences respectively, the score at each position is
+the average of 8 (2x4) comparisons. This is illustrated in figure 2. If either set
+of sequences contains one or more gaps in one of the positions being
+considered, each gap versus a residue is scored as zero. The default amino
+acid weight matrices we use are rescored to have only positive values.
+Therefore, this treatment of gaps treats the score of a residue versus a gap as
+having the worst possible score. When sequences are weighted (see
+improvements to progressive alignment, below), each weight matrix value is
+multiplied by the weights from the 2 sequences, as illustrated in figure 2.
+
+
+Improvements to progressive alignment
+
+All of the remaining modifications apply only to the final progressive
+alignment stage. Sequence weighting is relatively straightforward and is
+already widely used in profile searches (15,16). The treatment of gap penalties
+is more complicated. Initial gap penalties are calculated depending on the
+weight matrix, the similarity of the sequences, and the length of the
+sequences. Then, an attempt is made to derive sensible local gap opening
+penalties at every position in each pre-aligned group of sequences that will
+vary as new sequences are added. The use of different weight matrices as the
+alignment progresses is novel and largely by-passes the problem of initial
+choice of weight matrix. The final modification allows us to delay the
+addition of very divergent sequences until the end of the alignment process
+when all of the more closely related sequences have already been aligned.
+
+
+Sequence weighting
+
+Sequence weights are calculated directly from the guide tree. The weights
+are normalised such that the biggest one is set to 1.0 and the rest are all less
+than one. Groups of closely related sequences receive lowered weights
+because they contain much duplicated information. Highly divergent
+sequences without any close relatives receive high weights. These weights
+are used as simple multiplication factors for scoring positions from different
+sequences or prealigned groups of sequences. The method is illustrated in
+figure 2. In the globin example in figure 1, the two alpha globins get
+downweighted because they are almost duplicate sequences (as do the two
+beta globins); they receive a combined weight of only slightly more than if a
+single alpha globin was used.
+
+
+Initial gap penalties
+
+Initially, two gap penalties are used: a gap opening penalty (GOP) which gives
+the cost of opening a new gap of any length and a gap extension penalty (GEP)
+which gives the cost of every item in a gap. Initial values can be set by the
+user from a menu. The software then automatically attempts to choose
+appropriate gap penalties for each sequence alignment, depending on the
+following factors.
+
+1) Dependence on the weight matrix
+
+It has been shown (16,28) that varying the gap penalties used with different
+weight matrices can improve the accuracy of sequence alignments. Here, we
+use the average score for two mismatched residues (i.e. off-diagonal values in
+the matrix) as a scaling factor for the GOP.
+
+2) Dependence on the similarity of the sequences
+
+The percent identity of the two (groups of) sequences to be aligned is used to
+increase the GOP for closely related sequences and decrease it for more
+divergent sequences on a linear scale.
+
+3) Dependence on the lengths of the sequences
+
+The scores for both true and false sequence alignments grow with the length
+of the sequences. We use the logarithm of the length of the shorter sequence
+to increase the GOP with sequence length.
+
+Using these three modifications, the initial GOP calculated by the program is:
+
+GOP->(GOP+log(MIN(N,M))) * (average residue mismatch score) *
+ (percent identity scaling factor)
+where N, M are the lengths of the two sequences.
+
+4) Dependence on the difference in the lengths of the sequences
+
+The GEP is modified depending on the difference between the lengths of the
+two sequences to be aligned. If one sequence is much shorter than the other,
+the GEP is increased to inhibit too many long gaps in the shorter sequence.
+The initial GEP calculated by the program is:
+
+GEP -> GEP*(1.0+|log(N/M)|)
+where N, M are the lengths of the two sequences.
+
+
+Position-specific gap penalties
+
+ In most dynamic programming applications, the initial gap opening and
+extension penalties are applied equally at every position in the sequence,
+regardless of the location of a gap, except for terminal gaps which are usually
+allowed at no cost. In CLUSTAL W, before any pair of sequences or
+prealigned groups of sequences are aligned, we generate a table of gap opening
+penalties for every position in the two (sets of) sequences. An example is
+shown in figure 3. We manipulate the initial gap opening penalty in a
+position specific manner, in order to make gaps more or less likely at different
+positions.
+
+The local gap penalty modification rules are applied in a hierarchical manner.
+The exact details of each rule are given below. Firstly, if there is a gap at a
+position, the gap opening and gap extension penalties are lowered; the other
+rules do not apply. This makes gaps more likely at positions where there are
+already gaps. If there is no gap at a position, then the gap opening penalty is
+increased if the position is within 8 residues of an existing gap. This
+discourages gaps that are too close together. Finally, at any position within a
+run of hydrophilic residues, the penalty is decreased. These runs usually
+indicate loop regions in protein structures. If there is no run of hydrophilic
+residues, the penalty is modified using a table of residue specific gap
+propensities (12). These propensities were derived by counting the frequency
+of each residue at either end of gaps in alignments of proteins of known
+structure. An illustration of the application of these rules from one part of
+the globin example, in figure 1, is given in figure 3.
+
+1) Lowered gap penalties at existing gaps
+
+If there are already gaps at a position, then the GOP is reduced in proportion
+to the number of sequences with a gap at this position and the GEP is lowered
+by a half. The new gap opening penalty is calculated as:
+
+GOP -> GOP*0.3*(no. of sequences without a gap/no. of sequences).
+
+2) Increased gap penalties near existing gaps
+
+If a position does not have any gaps but is within 8 residues of an existing gap,
+the GOP is increased by:
+
+GOP -> GOP*(2+((8-distance from gap)*2)/8)
+
+3) Reduced gap penalties in hydrophilic stretches
+
+Any run of 5 hydrophilic residues is considered to be a hydrophilic stretch.
+The residues that are to be considered hydrophilic may be set by the user but
+are conservatively set to D, E, G, K, N, Q, P, R or S by default. If, at any
+position, there are no gaps and any of the sequences has such a stretch, the
+GOP is reduced by one third.
+
+
+4) Residue specific penalties
+
+If there is no hydrophilic stretch and the position does not contain any gaps,
+then the GOP is multiplied by one of the 20 numbers in table 1, depending on
+the residue. If there is a mixture of residues at a position, the multiplication
+factor is the average of all the contributions from each sequence.
+
+
+Weight matrices
+
+Two main series of weight matrices are offered to the user: the Dayhoff PAM
+series (3) and the BLOSUM series (4). The default is the BLOSUM series. In
+each case, there is a choice of matrix ranging from strict ones, useful for
+comparing very closely related sequences to very "soft" ones that are useful
+for comparing very distantly related sequences. Depending on the distance
+between the two sequences or groups of sequences to be compared, we switch
+between 4 different matrices. The distances are measured directly from the
+guide tree. The ranges of distances and tables used with the PAM series of
+matrices are: 80-100%:PAM20, 60-80%:PAM60, 40-60%:PAM120, 0-40%:PAM350.
+The range used with the BLOSUM series is: 80-100%:BLOSUM80,
+60-80%:BLOSUM62, 30-60%:BLOSUM45, 0-30%:BLOSUM30.
+
+
+Divergent sequences
+
+The most divergent sequences (most different, on average from all of the
+other sequences) are usually the most difficult to align correctly. It is
+sometimes better to delay the incorporation of these sequences until all of the
+more easily aligned sequences are merged first. This may give a better chance
+of correctly placing the gaps and matching weakly conserved positions against
+the rest of the sequences. A choice is offered to set a cut off (default is 40%
+identity or less with any other sequence) that will delay the alignment of the
+divergent sequences until all of the rest have been aligned.
+
+
+Software and Algorithms
+
+
+Dynamic Programming
+
+The most demanding part of the multiple alignment strategy, in terms of
+computer processing and memory usage, is the alignment of two (groups of)
+sequences at each step in the final progressive alignment. To make it
+possible to align very long sequences (e.g. dynein heavy chains at ~ 5,000
+residues) in a reasonable amount of memory, we use the memory efficient
+dynamic programming algorithm of Myers and Miller (26). This sacrifices
+some processing time but makes very large alignments practical in very little
+memory. One disadvantage of this algorithm is that it does not allow
+different gap opening and extension penalties at each position. We have
+modified the algorithm so as to allow this and the details are described in a
+separate paper (27).
+
+
+
+Menus/file formats
+
+Six different sequence input formats are detected automatically and read by
+the program: EMBL/Swiss Prot, NBRF/PIR, Pearson/FASTA (29), GCG/MSF
+(30), GDE (Steven Smith, Harvard University Genome Center) and CLUSTAL
+format alignments. The last three formats allow users to read in complete
+alignments (e.g. for calculating phylogenetic trees or for addition of new
+sequences to an existing alignment). Alignment output may be requested in
+standard CLUSTAL format (self-explanatory blocked alignments) or in
+formats compatible with the GDE, PHYLIP (31) or GCG (30) packages. The
+program offers the user the ability to calculate Neighbour-Joining
+phylogenetic trees from existing alignments with options to correct for
+multiple hits (32,33) and to estimate confidence levels using a bootstrap
+resampling procedure (34). The trees may be output in the "New
+Hampshire" format that is compatible with the PHYLIP package (31).
+
+Alignment to an alignment
+
+Profile alignment is used to align two existing alignments (either of which
+may consist of just one sequence) or to add a series of new sequences to an
+existing alignment. This is useful because one may wish to build up a
+multiple alignment gradually, choosing different parameters manually, or
+correcting intermediate errors as the alignment proceeds. Often, just a few
+sequences cause misalignments in the progressive algorithm and these can be
+removed from the process and then added at the end by profile alignment. A
+second use is where one has a high quality reference alignment and wishes to
+keep it fixed while adding new sequences automatically.
+
+
+Portability/Availability
+
+The full source code of the package is provided free to academic users. The
+program will run on any machine with a full ANSI conforming C compiler.
+It has been tested on the following hardware/software combinations:
+Decstation/Ultrix, Vax or ALPHA/VMS, Silicon Graphics/IRIX. The source
+code and documentation are available by E-mail from the EMBL file server
+(send the words HELP and HELP SOFTWARE on two lines to the internet
+address:
+Netserv at EMBL-Heidelberg.DE) or by anonymous FTP from
+FTP.EMBL-Heidelberg.DE. Queries may be addressed by E-mail to
+Des.Higgins at EBI.AC.UK or Gibson at EMBL-Heidelberg.DE.
+
+
+RESULTS AND DISCUSSION
+
+
+Alignment of SH3 Domains
+
+The ~60 residue SH3 domain was chosen to illustrate the performance of
+CLUSTAL W, as there is a reference manual alignment (23) and the fold is
+known (24). SH3 domains, with a minimum similarity below 12% identity,
+are poorly aligned by progressive alignment programs such as CLUSTAL V
+and PILEUP: neither program can generate the correct blocks corresponding to
+the secondary structure elements.
+
+Figure 4 shows an alignment generated by CLUSTAL W of the example set of
+SH3 domains. The alignment was generated in two steps. After progressive
+alignment, five blocks were produced, corresponding to structural elements,
+with gaps inserted exclusively in the known loop regions. The beta strands in
+blocks 1, 4 and 5 were all correctly superposed. However, four sequences in
+block 2 and one sequence in block 3 were misaligned by 1-2 residues
+(underlined in figure 4). A second progressive alignment of the aligned
+sequences, including the gaps, improved this alignment: A single misaligned
+sequence, H_P55, remains in block 2 (boxed in figure 4), while block 3 is now
+completely aligned. This alignment corrects several errors (e.g. P85A, P85B
+and FUS1) in the manual alignment (23).
+
+The SH3 alignment illustrates several features of CLUSTAL W usage. Firstly,
+in a practical application involving divergent sequences, the initial
+progressive alignment is likely to be a good but not perfect approximation to
+the correct alignment. The alignment quality can be improved in a number of
+ways. If the block structure of the alignment appears to be correct, realignment
+of the alignment will usually improve most of the misaligned blocks: the
+existing gaps allow the blocks to "float" cheaply to a locally optimal position
+without disturbing the rest of the alignment. Remaining sequences which are
+doubtfully aligned can then be individually tested by profile alignment to the
+remainder: the misaligned H_P55 SH3 domain can be correctly aligned by
+profile (with GOP <= 8). The indel regions in the final alignment can then be
+manually cleaned up: Usually the exact alignment in the loop regions is not
+determinable, and may have no meaning in structural terms. It is then
+desirable to have a single gap per structural loop. CLUSTAL W achieved this
+for two of the four SH3 loop regions (figure 4).
+
+If the block structure of the alignment appears suspect, greater intervention by
+the user may be required. The most divergent sequences, especially if they
+have large insertions (which can be discerned with the aid of dot matrix
+plots), should be left out of the progressive alignment. If there are sets of
+closely related sequences that are deeply diverged from other sets, these can be
+separately aligned and then merged by profile alignment. Incorrectly
+determined sequences, containing frameshifts, can also confound regions of
+an alignment: these can be hard to detect but sometimes they have been
+grouped within the excluded divergent sequences: then they may be revealed
+when they are individually compared to the alignment as having apparently
+nonsense segments with respect to the other sequences.
+
+
+
+Finding the best alignment
+
+In cases where all of the sequences in a data set are very similar (e.g. no pair
+less than 35% identical), CLUSTAL W will find an alignment which is
+difficult to improve by eye. In this sense, the alignment is optimal with
+regard to the alternative of manual alignment. Mathematically, this is vague
+and can only be put on a more systematic footing by finding an objective
+function (a measure of multiple alignment quality) that exactly mirrors the
+information used by an "expert" to evaluate an alignment. Nonetheless, if an
+alignment is impossible to improve by eye, then the program has achieved a
+very useful result.
+
+In more difficult cases, as more divergent sequences are included, it becomes
+increasingly difficult to find good alignments and to evaluate them. What
+we find with CLUSTAL W is that the basic block-like structure of the
+alignment (corresponding to the major secondary structure elements) is
+usually recovered, with some of the most divergent sequences misaligned in
+small regions. This is a very useful starting point for manual refinement as it
+helps define the major blocks of similarity. The problem sequences can be
+removed from the analysis and realigned to the rest of the sequences
+automatically or with different parameter settings. An examination of the
+tree used to guide the alignment will usually show which sequences will be
+most unreliably placed (those that branch off closest to the root and/or those
+that align to other single sequences at a very low level of sequence identity
+rather than align to a group of pre-aligned sequences). Finally, one can
+simply iterate the multiple alignment process by feeding an output alignment
+back into CLUSTAL W and repeating the multiple alignment process (using
+the same or different parameters). The SH3 domain alignment in figure 4
+was derived in this way by 2 passes using default parameters. In the second
+pass, the local gap penalties are dominated by the placement of the initial
+major gap positions. The alignment will either remain unchanged or will
+converge rapidly (after 1 or 2 extra passes) on a better solution. If the
+placement of the initial gaps is approximately correct but some of the
+sequences are locally misaligned, this works well.
+
+
+Comparison with other methods
+
+Recently, several papers have addressed the problem of position specific
+parameters for multiple alignment. In one case (35), local gap penalties are
+increased in alpha helical and beta strand regions, when the 3-D structures of
+one or more of the sequences are known. In a second case (36), a hidden
+Markov model was used to estimate position specific gap penalties and
+residue substitution weight matrices when large numbers of examples of a
+protein domain were known. With CLUSTAL W, we attempt to derive the
+same information purely from the set of sequences to be aligned. Therefore,
+we can apply the method to any set of sequences. The success of this approach
+will depend on the number of available sequences and their evolutionary
+relationships. It will also depend on the decision making process during
+multiple alignment (e.g. when to change weight matrix) and the accuracy and
+appropriateness of our parameterisation. In the long term, this can only be
+evaluated by exhaustive testing of sets of sequences where the correct
+alignment (or parts of it) are known from structural information. What is
+clear, however, is that the modifications described here significantly improve
+the sensitivity of the progressive multiple alignment approach. This is
+achieved with almost no sacrifice in speed and efficiency.
+
+There are several areas where further improvements in sensitivity and
+accuracy can be made. Firstly, the residue weight matrices and gap settings
+can be made more accurate as more and more data accumulate, while
+matrices for specific sequence types can be derived (e.g. for transmembrane
+regions (37)). Secondly, stochastic or iterative optimisation methods can be
+used to refine initial alignments (7,9,10). CLUSTAL W could be run with
+several sets of starting parameters and in each case, the alignments refined
+according to an objective function. The search for a good objective function,
+that takes into account the sequence and position specific information used in
+CLUSTAL W is a key area of research. Finally, the average number of
+examples of each protein domain or family is growing steadily. It is not only
+important that programs can cope with the large volumes of data that are
+being generated, they should be able to exploit the new information to make
+the alignments more and more accurate. Globally optimal alignments
+(according to an objective function) may not always be possible but the
+problem may be avoided if sufficiently large volumes of data become
+available. CLUSTAL W is a step in this direction.
+
+ACKNOWLEDGEMENTS
+
+Numerous people have offered advice and suggestions for improvements to
+earlier versions of the CLUSTAL programs. D.H. wishes to apologise to all of
+the irate CLUSTAL V users who had to live with the bugs and lack of facilities
+for getting trees in the New Hampshire format. We wish to specifically thank
+Jeroen Coppieters who suggested using a series of weight matrices and Steven
+Henikoff for advice on using the BLOSUM matrices. We are grateful to Rein
+Aasland, Peer Bork, Ariel Blocker and Bertrand Seraphin for providing
+challenging alignment problems. T.G. and J.T. thank Kevin Leonard for
+support and encouragement. Finally, we thank all of the people who were
+involved with various CLUSTAL programs over the years, namely: Paul
+Sharp, Rainer Fuchs and Alan Bleasby.
+
+
+REFERENCES
+
+ 1.Feng, D.-F. and Doolittle, R.F. (1987). J. Mol. Evol. 25, 351-360.
+ 2.Needleman, S.B. and Wunsch, C.D. (1970). J. Mol. Biol. 48, 443-453.
+ 3.Dayhoff, M.O., Schwartz, R.M. and Orcutt, B.C. (1978) in Atlas of Protein
+Sequence and Structure, vol. 5, suppl. 3 (Dayhoff, M.O., ed.), pp 345-352,
+NBRF, Washington.
+ 4.Henikoff, S. and Henikoff, J.G. (1992). Proc. Natl. Acad. Sci. USA 89, 10915-
+10919.
+ 5.Lipman, D.J., Altschul, S.F. and Kececioglu, J.D. (1989). Proc. Natl. Acad. Sci.
+USA 86, 4412-4415.
+ 6.Barton, G.J. and Sternberg, M.J.E. (1987). J. Mol. Biol. 198, 327-337.
+ 7.Gotoh, O. (1993). CABIOS 9, 361-370.
+ 8.Altschul, S.F. (1989). J. Theor. Biol. 138, 297-309.
+ 9.Lukashin, A.V., Engelbrecht, J. and Brunak, S. (1992). Nucl. Acids Res. 20,
+2511-2516.
+10.Lawrence, C.E., Altschul, S.F., Boguski, M.S., Liu, J.S., Neuwald, A.F. and
+Wootton, J.C. (1993). Science, 262, 208-214.
+11.Vingron, M. and Waterman, M.S. (1993). J. Mol. Biol. 234, 1-12.
+12.Pascarella, S. and Argos, P. (1992). J. Mol. Biol. 224, 461-471.
+13.Collins, J.F. and Coulson, A.F.W. (1987). In Nucleic acid and protein
+sequence analysis a practical approach, Bishop, M.J. and Rawlings, C.J. ed.,
+chapter 13, pp. 323-358.
+14.Vingron, M. and Sibbald, P.R. (1993). Proc. Natl. Acad. Sci. USA, 90, 8777-
+8781.
+15.Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994). CABIOS, 10, 19-29.
+16.Lüthy, R., Xenarios, I. and Bucher, P. (1994). Protein Science, 3, 139-146.
+17.Higgins, D.G. and Sharp, P.M. (1988). Gene, 73, 237-244.
+18.Higgins, D.G. and Sharp, P.M. (1989). CABIOS, 5, 151-153.
+19.Higgins, D.G., Bleasby, A.J. and Fuchs, R. (1992). CABIOS, 8, 189-191.
+20.Sneath, P.H.A. and Sokal, R.R. (1973). Numerical Taxonomy, W.H.
+Freeman, San Francisco.
+21.Saitou, N. and Nei, M. (1987). Mol. Biol. Evol. 4, 406-425.
+22.Wilbur, W.J. and Lipman, D.J. (1983). Proc. Natl. Acad. Sci. USA, 80, 726-
+730.
+23.Musacchio, A., Gibson, T., Lehto, V.-P. and Saraste, M. (1992). FEBS Lett.
+307, 55-61.
+24.Musacchio, A., Noble, M., Pauptit, R., Wierenga, R. and Saraste, M. (1992).
+Nature, 359, 851-855.
+25.Bashford, D., Chothia, C. and Lesk, A.M. (1987). J. Mol. Biol. 196, 199-216.
+26.Myers, E.W. and Miller, W. (1988). CABIOS, 4, 11-17.
+27.Thompson, J.D. (1994). CABIOS, (Submitted).
+28.Smith, T.F., Waterman, M.S. and Fitch, W.M. (1981). J. Mol. Evol. 18, 38-46.
+29.Pearson, W.R. and Lipman, D.J. (1988). Proc. Natl. Acad. Sci. USA. 85, 2444-
+2448.
+30.Devereux, J., Haeberli, P. and Smithies, O. (1984). Nucleic Acids Res. 12,
+387-395.
+31.Felsenstein, J. (1989). Cladistics 5, 164-166.
+32.Kimura, M. (1980). J. Mol. Evol. 16, 111-120.
+33.Kimura, M. (1983). The Neutral Theory of Molecular Evolution.
+Cambridge University Press, Cambridge.
+34.Felsenstein, J. (1985). Evolution 39, 783-791.
+35.Smith, R.F. and Smith, T.F. (1992) Protein Engineering 5, 35-41.
+36.Krogh, A., Brown, M., Mian, S., Sjölander, K. and Haussler, D. (1994) J. Mol.
+Biol. 235, 1501-1531.
+37.Jones, D.T., Taylor, W.R. and Thornton, J.M. (1994). FEBS Lett. 339, 269-275.
+38.Bairoch, A. and Böckmann, B. (1992) Nucleic Acids Res., 20, 2019-2022.
+39.Noble, M.E.M., Musacchio, A., Saraste, M., Courtneidge, S.A. and
+Wierenga, R.K. (1993) EMBO J. 12, 2617-2624.
+40.Kabsch, W. and Sander, C. (1983) Biopolymers, 22, 2577-2637.
+
+FIGURE LEGENDS
+
+Figure 1. The basic progressive alignment procedure, illustrated using a set of
+7 globins of known tertiary structure. The sequence names are from Swiss
+Prot (38): Hba_Horse: horse alpha globin; Hba_Human: human alpha globin;
+Hbb_Horse: horse beta globin; Hbb_Human: human beta globin; Myg_Phyca:
+sperm whale myoglobin; Glb5_Petma: lamprey cyanohaemoglobin;
+Lgb2_Luplu: lupin leghaemoglobin. In the distance matrix, the mean
+number of differences per residue is given. The unrooted tree shows all
+branch lengths drawn to scale. In the rooted tree, all branch lengths (mean
+number of differences per residue along each branch) are given as well as
+weights for each sequence. In the multiple alignment, the approximate
+positions of the 7 alpha helices, common to all 7 proteins are shown. This
+alignment was derived using CLUSTAL W with default parameters and the
+PAM (3) series of weight matrices.
+
+Figure 2. The scoring scheme for comparing two positions from two
+alignments. Two sections of alignment with 4 and 2 sequences respectively
+are shown. The score of the position with amino acids T,L,K,K versus the
+position with amino acids V and I is given with and without sequence
+weights. M(X,Y) is the weight matrix entry for amino acid X versus amino
+acid Y. Wn is the weight for sequence n.
+
+Figure 3. The variation in local gap opening penalty is plotted for a section of
+alignment. The initial gap opening penalty is indicated by a dotted line. Two
+hydrophilic stretches are underlined. The lowest penalties correspond to the
+ends of the alignment, the hydrophilic stretches and the two positions with
+gaps. The highest values are within 8 residues of the two gap positions. The
+rest of the variation is caused by the residue specific gap penalties (12).
+
+Figure 4. CLUSTAL W Alignment of a set of SH3 domains taken from (23).
+Secondary structure assignments for the solved Spectrin (24) and Fyn (39)
+domains are according to DSSP (40). The alignment was generated in two
+steps using default parameters. After full multiple alignment, the aligned
+sequences were realigned. Segments which were correctly aligned in the
+second pass are underlined. The single misaligned segment in H_P55 and the
+misaligned residue in H_NCK/2 are boxed.
+
+The sequences are coloured to illustrate significant features. All G (orange)
+and P (yellow) are coloured. Other residues matching a frequent occurrence of
+a property in a column are coloured: hydrophobic = blue; hydrophobic
+tendency = light blue; basic = red; acidic = purple; hydrophilic = green; White
+= unconserved. The alignment figure was prepared with the GDE sequence
+editor (S. Smith, Harvard University) and COLORMASK (J. Thompson,
+EMBL).
+
+
+
+
+Table 1. Pascarella and Argos residue specific gap modification factors.
+-----------------------------------------------------------------------------------
+A 1.13 M 1.29
+C 1.13 N 0.63
+D 0.96 P 0.74
+E 1.31 Q 1.07
+F 1.20 R 0.72
+G 0.61 S 0.76
+H 1.00 T 0.89
+I 1.32 V 1.25
+K 0.96 Y 1.00
+L 1.21 W 1.23
+-----------------------------------------------------------------------------------
+The values are normalised around a mean value of 1.0 for H. The lower the
+value, the greater the chance of having an adjacent gap. These are derived
+from the original table of relative frequencies of gaps adjacent to each residue
+(12) by subtraction from 2.0.
+
+
Property changes on: trunk/packages/clustalw/branches/upstream/current/clustalw.ms
___________________________________________________________________
Name: svn:executable
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalw.new
===================================================================
(Binary files differ)
Property changes on: trunk/packages/clustalw/branches/upstream/current/clustalw.new
___________________________________________________________________
Name: svn:executable
+
Name: svn:mime-type
+ application/octet-stream
Added: trunk/packages/clustalw/branches/upstream/current/clustalw_help
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalw_help 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalw_help 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,697 @@
+
+This is the on-line help file for CLUSTAL W ( version 1.83).
+
+It should be named or defined as: clustalw_help
+except with MSDOS in which case it should be named CLUSTALW.HLP
+
+For full details of usage and algorithms, please read the CLUSTALW.DOC file.
+
+
+Toby Gibson EMBL, Heidelberg, Germany.
+Des Higgins UCC, Cork, Ireland.
+Julie Thompson IGBMC, Strasbourg, France.
+
+
+
+>>NEW <<
+
+ Fasta output
+ ===========
+
+ Write/Read sequence with range specified. The command line syntax
+ for range specification is flexible. You can use one of the following
+ syntax.
+
+ -range=n:m
+ -range=n-m
+ -range="n m"
+
+ where n is the starting position and m is the length of the sequence.
+
+ Range and range numbers.
+ =======================
+
+ Include range numbers in the output.
+
+ -seqno_range=on/off
+
+ The sequence range will be appended to the names of the sequences.
+
+
+ PIM: Percentage Identity Matrix
+ ===============================
+
+
+
+>>HELP 1 << General help for CLUSTAL W (1.83)
+
+Clustal W is a general purpose multiple alignment program for DNA or proteins.
+
+SEQUENCE INPUT: all sequences must be in 1 file, one after another.
+7 formats are automatically recognised: NBRF-PIR, EMBL-SWISSPROT,
+Pearson (Fasta), Clustal (*.aln), GCG-MSF (Pileup), GCG9-RSF and GDE flat file.
+All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
+except "-" which is used to indicate a GAP ("." in MSF-RSF).
+
+To do a MULTIPLE ALIGNMENT on a set of sequences, use item 1 from this menu to
+INPUT them; go to menu item 2 to do the multiple alignment.
+
+PROFILE ALIGNMENTS (menu item 3) are used to align 2 alignments. Use this to
+add a new sequence to an old alignment, or to use secondary structure to guide
+the alignment process. GAPS in the old alignments are indicated using the "-"
+character. PROFILES can be input in ANY of the allowed formats; just
+use "-" (or "." for MSF-RSF) for each gap position.
+
+PHYLOGENETIC TREES (menu item 4) can be calculated from old alignments (read in
+with "-" characters to indicate gaps) OR after a multiple alignment while the
+alignment is still in memory.
+
+
+The program tries to automatically recognise the different file formats used
+and to guess whether the sequences are amino acid or nucleotide. This is not
+always foolproof.
+
+FASTA and NBRF-PIR formats are recognised by having a ">" as the first
+character in the file.
+
+EMBL-Swiss Prot formats are recognised by the letters
+ID at the start of the file (the token for the entry name field).
+
+CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
+
+GCG-MSF format is recognised by one of the following:
+ - the word PileUp at the start of the file.
+ - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ at the start of the file.
+ - the word MSF on the first line of the file, and the characters ..
+ at the end of this line.
+
+GCG-RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
+the file.
+
+
+If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
+sequence will be assumed to be nucleotide. This works in 97.3% of cases
+but watch out!
+
+>>HELP 2 << Help for multiple alignments
+
+If you have already loaded sequences, use menu item 1 to do the complete
+multiple alignment. You will be prompted for 2 output files: 1 for the
+alignment itself; another to store a dendrogram that describes the similarity
+of the sequences to each other.
+
+Multiple alignments are carried out in 3 stages (automatically done from menu
+item 1 ...Do complete multiple alignments now):
+
+1) all sequences are compared to each other (pairwise alignments);
+
+2) a dendrogram (like a phylogenetic tree) is constructed, describing the
+approximate groupings of the sequences by similarity (stored in a file).
+
+3) the final multiple alignment is carried out, using the dendrogram as a guide.
+
+
+PAIRWISE ALIGNMENT parameters control the speed-sensitivity of the initial
+alignments.
+
+MULTIPLE ALIGNMENT parameters control the gaps in the final multiple alignments.
+
+
+RESET GAPS (menu item 7) will remove any new gaps introduced into the sequences
+during multiple alignment if you wish to change the parameters and try again.
+This only takes effect just before you do a second multiple alignment. You
+can make phylogenetic trees after alignment whether or not this is ON.
+If you turn this OFF, the new gaps are kept even if you do a second multiple
+alignment. This allows you to iterate the alignment gradually. Sometimes, the
+alignment is improved by a second or third pass.
+
+SCREEN DISPLAY (menu item 8) can be used to send the output alignments to the
+screen as well as to the output file.
+
+You can skip the first stages (pairwise alignments; dendrogram) by using an
+old dendrogram file (menu item 3); or you can just produce the dendrogram
+with no final multiple alignment (menu item 2).
+
+
+OUTPUT FORMAT: Menu item 9 (format options) allows you to choose from 7
+different alignment formats (CLUSTAL, GCG, NBRF-PIR, PHYLIP, GDE, NEXUS, and FASTA).
+
+
+>>HELP 3 << Help for pairwise alignment parameters
+A distance is calculated between every pair of sequences and these are used to
+construct the dendrogram which guides the final multiple alignment. The scores
+are calculated from separate pairwise alignments. These can be calculated using
+2 methods: dynamic programming (slow but accurate) or by the method of Wilbur
+and Lipman (extremely fast but approximate).
+
+You can choose between the 2 alignment methods using menu option 8. The
+slow-accurate method is fine for short sequences but will be VERY SLOW for
+many (e.g. >100) long (e.g. >1000 residue) sequences.
+
+SLOW-ACCURATE alignment parameters:
+ These parameters do not have any effect on the speed of the alignments.
+They are used to give initial alignments which are then rescored to give percent
+identity scores. These % scores are the ones which are displayed on the
+screen. The scores are converted to distances for the trees.
+
+1) Gap Open Penalty: the penalty for opening a gap in the alignment.
+2) Gap extension penalty: the penalty for extending a gap by 1 residue.
+3) Protein weight matrix: the scoring table which describes the similarity
+ of each amino acid to each other.
+4) DNA weight matrix: the scores assigned to matches and mismatches
+ (including IUB ambiguity codes).
+
+
+FAST-APPROXIMATE alignment parameters:
+
+These similarity scores are calculated from fast, approximate, global align-
+ments, which are controlled by 4 parameters. 2 techniques are used to make
+these alignments very fast: 1) only exactly matching fragments (k-tuples) are
+considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
+are used.
+
+K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
+INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
+For longer sequences (e.g. >1000 residues) you may need to increase the default.
+
+GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
+little effect on the speed or sensitivity except for extreme values.
+
+TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
+dot-matrix plot) is calculated. Only the best ones (with most matches) are
+used in the alignment. This parameter specifies how many. Decrease for speed;
+increase for sensitivity.
+
+WINDOW SIZE: This is the number of diagonals around each of the 'best'
+diagonals that will be used. Decrease for speed; increase for sensitivity.
+
+
+>>HELP 4 << Help for multiple alignment parameters
+
+These parameters control the final multiple alignment. This is the core of the
+program and the details are complicated. To fully understand the use of the
+parameters and the scoring system, you will have to refer to the documentation.
+
+Each step in the final multiple alignment consists of aligning two alignments
+or sequences. This is done progressively, following the branching order in
+the GUIDE TREE. The basic parameters to control this are two gap penalties and
+the scores for various identical-non-identical residues.
+
+1) and 2) The GAP PENALTIES are set by menu items 1 and 2. These control the
+cost of opening up every new gap and the cost of every item in a gap.
+Increasing the gap opening penalty will make gaps less frequent. Increasing
+the gap extension penalty will make gaps shorter. Terminal gaps are not
+penalised.
+
+3) The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most
+distantly related sequences until after the most closely related sequences have
+been aligned. The setting shows the percent identity level required to delay
+the addition of a sequence; sequences that are less identical than this level
+to any other sequences will be aligned later.
+
+
+
+4) The TRANSITION WEIGHT gives transitions (A <--> G or C <--> T
+i.e. purine-purine or pyrimidine-pyrimidine substitutions) a weight between 0
+and 1; a weight of zero means that the transitions are scored as mismatches,
+while a weight of 1 gives the transitions the match score. For distantly related
+DNA sequences, the weight should be near to zero; for closely related sequences
+it can be useful to assign a higher score.
+
+
+5) PROTEIN WEIGHT MATRIX leads to a new menu where you are offered a choice of
+weight matrices. The default for proteins in version 1.8 is the PAM series
+derived by Gonnet and colleagues. Note, a series is used! The actual matrix
+that is used depends on how similar the sequences to be aligned at this
+alignment step are. Different matrices work differently at each evolutionary
+distance.
+
+6) DNA WEIGHT MATRIX leads to a new menu where a single matrix (not a series)
+can be selected. The default is the matrix used by BESTFIT for comparison of
+nucleic acid sequences.
+
+Further help is offered in the weight matrix menu.
+
+
+7) In the weight matrices, you can use negative as well as positive values if
+you wish, although the matrix will be automatically adjusted to all positive
+scores, unless the NEGATIVE MATRIX option is selected.
+
+8) PROTEIN GAP PARAMETERS displays a menu allowing you to set some Gap Penalty
+options which are only used in protein alignments.
+
+
+>>HELP A << Help for protein gap parameters.
+1) RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce
+or increase the gap opening penalties at each position in the alignment or
+sequence. See the documentation for details. As an example, positions that
+are rich in glycine are more likely to have an adjacent gap than positions that
+are rich in valine.
+
+2) 3) HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within
+a run (5 or more residues) of hydrophilic amino acids; these are likely to
+be loop or random coil regions where gaps are more common. The residues that
+are "considered" to be hydrophilic are set by menu item 3.
+
+4) GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too
+close to each other. Gaps that are less than this distance apart are penalised
+more than other gaps. This does not prevent close gaps; it makes them less
+frequent, promoting a block-like appearance of the alignment.
+
+5) END GAP SEPARATION treats end gaps just like internal gaps for the purposes
+of avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above).
+If you turn this off, end gaps will be ignored for this purpose. This is
+useful when you wish to align fragments where the end gaps are not biologically
+meaningful.
+>>HELP 5 << Help for output format options.
+
+Six output formats are offered. You can choose any (or all 6 if you wish).
+
+CLUSTAL format output is a self explanatory alignment format. It shows the
+sequences aligned in blocks. It can be read in again at a later date to
+(for example) calculate a phylogenetic tree or add a new sequence with a
+profile alignment.
+
+GCG output can be used by any of the GCG programs that can work on multiple
+alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
+.msf format files (multiple sequence file); new in version 7 of GCG.
+
+PHYLIP format output can be used for input to the PHYLIP package of Joe
+Felsenstein. This is an extremely widely used package for doing every
+imaginable form of phylogenetic analysis (MUCH more than the modest intro-
+duction offered by this program).
+
+NBRF-PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
+characters "-" are used to indicate the positions of gaps in the multiple
+alignment. These files can be re-used as input in any part of clustal that
+allows sequences (or alignments or profiles) to be read in.
+
+GDE: this is the flat file format used by the GDE package of Steven Smith.
+
+NEXUS: the format used by several phylogeny programs, including PAUP and
+MacClade.
+
+GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
+lower case.
+
+CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
+alignment lines in clustalw format.
+
+OUTPUT ORDER is used to control the order of the sequences in the output
+alignments. By default, the order corresponds to the order in which the
+sequences were aligned (from the guide tree-dendrogram), thus automatically
+grouping closely related sequences. This switch can be used to set the order
+to the same as the input file.
+
+PARAMETER OUTPUT: This option allows you to save all your parameter settings
+in a parameter file. This file can be used subsequently to rerun Clustal W
+using the same parameters.
+
+>>HELP 6 << Help for profile and structure alignments
+
+By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
+alignments allow you to store alignments of your favourite sequences and add
+new sequences to them in small bunches at a time. A profile is simply an
+alignment of one or more sequences (e.g. an alignment output file from CLUSTAL
+W). Each input can be a single sequence. One or both sets of input sequences
+may include secondary structure assignments or gap penalty masks to guide the
+alignment.
+
+The profiles can be in any of the allowed input formats with "-" characters
+used to specify gaps (except for MSF-RSF where "." is used).
+
+You have to specify the 2 profiles by choosing menu items 1 and 2 and giving
+2 file names. Then Menu item 3 will align the 2 profiles to each other.
+Secondary structure masks in either profile can be used to guide the alignment.
+
+Menu item 4 will take the sequences in the second profile and align them to
+the first profile, 1 at a time. This is useful to add some new sequences to
+an existing alignment, or to align a set of sequences to a known structure.
+In this case, the second profile would not be pre-aligned.
+
+
+The alignment parameters can be set using menu items 5, 6 and 7. These are
+EXACTLY the same parameters as used by the general, automatic multiple
+alignment procedure. The general multiple alignment procedure is simply a
+series of profile alignments. Carrying out a series of profile alignments on
+larger and larger groups of sequences, allows you to manually build up a
+complete alignment, if necessary editing intermediate alignments.
+
+SECONDARY STRUCTURE OPTIONS. Menu Option 0 allows you to set 2D structure
+parameters. If a solved structure is available, it can be used to guide the
+alignment by raising gap penalties within secondary structure elements, so
+that gaps will preferentially be inserted into unstructured surface loops.
+Alternatively, a user-specified gap penalty mask can be supplied directly.
+
+A gap penalty mask is a series of numbers between 1 and 9, one per position in
+the alignment. Each number specifies how much the gap opening penalty is to be
+raised at that position (raised by multiplying the basic gap opening penalty
+by the number) i.e. a mask figure of 1 at a position means no change
+in gap opening penalty; a figure of 4 means that the gap opening penalty is
+four times greater at that position, making gaps 4 times harder to open.
+
+The format for gap penalty masks and secondary structure masks is explained
+in the help under option 0 (secondary structure options).
+>>HELP B << Help for secondary structure - gap penalty masks
+
+The use of secondary structure-based penalties has been shown to improve the
+accuracy of multiple alignment. Therefore CLUSTAL W now allows gap penalty
+masks to be supplied with the input sequences. The masks work by raising gap
+penalties in specified regions (typically secondary structure elements) so that
+gaps are preferentially opened in the less well conserved regions (typically
+surface loops).
+
+Options 1 and 2 control whether the input secondary structure information or
+gap penalty masks will be used.
+
+Option 3 controls whether the secondary structure and gap penalty masks should
+be included in the output alignment.
+
+Options 4 and 5 provide the value for raising the gap penalty at core Alpha
+Helical (A) and Beta Strand (B) residues. In CLUSTAL format, capital residues
+denote the A and B core structure notation. The basic gap penalties are
+multiplied by the amount specified.
+
+Option 6 provides the value for the gap penalty in Loops. By default this
+penalty is not raised. In CLUSTAL format, loops are specified by "." in the
+secondary structure notation.
+
+Option 7 provides the value for setting the gap penalty at the ends of
+secondary structures. Ends of secondary structures are observed to grow
+and-or shrink in related structures. Therefore by default these are given
+intermediate values, lower than the core penalties. All secondary structure
+read in as lower case in CLUSTAL format gets the reduced terminal penalty.
+
+Options 8 and 9 specify the range of structure termini for the intermediate
+penalties. In the alignment output, these are indicated as lower case.
+For Alpha Helices, by default, the range spans the end helical turn. For
+Beta Strands, the default range spans the end residue and the adjacent loop
+residue, since sequence conservation often extends beyond the actual H-bonded
+Beta Strand.
+
+CLUSTAL W can read the masks from SWISS-PROT, CLUSTAL or GDE format input
+files. For many 3-D protein structures, secondary structure information is
+recorded in the feature tables of SWISS-PROT database entries. You should
+always check that the assignments are correct - some are quite inaccurate.
+CLUSTAL W looks for SWISS-PROT HELIX and STRAND assignments e.g.
+
+FT HELIX 100 115
+FT STRAND 118 119
+
+The structure and penalty masks can also be read from CLUSTAL alignment format
+as comment lines beginning "!SS_" or "!GM_" e.g.
+
+!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
+!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
+HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
+
+Note that the mask itself is a set of numbers between 1 and 9 each of which is
+assigned to the residue(s) in the same column below.
+
+In GDE flat file format, the masks are specified as text and the names must
+begin with "SS_" or "GM_".
+
+Either a structure or penalty mask or both may be used. If both are included in
+an alignment, the user will be asked which is to be used.
+
+>>HELP C << Help for secondary structure - gap penalty mask output options
+
+ The options in this menu let you choose whether or not to include the masks
+in the CLUSTAL W output alignments. Showing both is useful for understanding
+how the masks work. The secondary structure information is itself very useful
+in judging the alignment quality and in seeing how residue conservation
+patterns vary with secondary structure.
+
+
+>>HELP 7 << Help for phylogenetic trees
+
+1) Before calculating a tree, you must have an ALIGNMENT in memory. This can be
+input in any format or you should have just carried out a full multiple
+alignment and the alignment is still in memory.
+
+
+*************** Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!! ***************
+
+
+The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
+you calculate distances (percent divergence) between all pairs of sequences from
+a multiple alignment; second you apply the NJ method to the distance matrix.
+
+2) EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
+ANY of the sequences have a gap will be ignored. This means that 'like' will be
+compared to 'like' in all distances, which is highly desirable. It also
+automatically throws away the most ambiguous parts of the alignment, which are
+concentrated around gaps (usually). The disadvantage is that you may throw away
+much of the data if there are many gaps (which is why it is difficult for us to
+make it the default).
+
+
+
+3) CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this
+option makes no difference. For greater divergence, it corrects for the fact
+that observed distances underestimate actual evolutionary distances. This is
+because, as sequences diverge, more than one substitution will happen at many
+sites. However, you only see one difference when you look at the present day
+sequences. Therefore, this option has the effect of stretching branch lengths
+in trees (especially long branches). The corrections used here (for DNA or
+proteins) are both due to Motoo Kimura. See the documentation for details.
+
+Where possible, this option should be used. However, for VERY divergent
+sequences, the distances cannot be reliably corrected. You will be warned if
+this happens. Even if none of the distances in a data set exceed the reliable
+threshold, if you bootstrap the data, some of the bootstrap distances may
+randomly exceed the safe limit.
+
+4) To calculate a tree, use option 4 (DRAW TREE NOW). This gives an UNROOTED
+tree and all branch lengths. The root of the tree can only be inferred by
+using an outgroup (a sequence that you are certain branches at the outside
+of the tree .... certain on biological grounds) OR if you assume a degree
+of constancy in the 'molecular clock', you can place the root in the 'middle'
+of the tree (roughly equidistant from all tips).
+
+5) TOGGLE PHYLIP BOOTSTRAP POSITIONS
+By default, the bootstrap values are correctly placed on the tree branches of
+the phylip format output tree. The toggle allows them to be placed on the
+nodes, which is incorrect, but some display packages (e.g. TreeTool, TreeView
+and Phylowin) only support node labelling but not branch labelling. Care
+should be taken to note which branches and labels go together.
+
+6) OUTPUT FORMATS: four different formats are allowed. None of these displays
+the tree visually. Useful display programs accepting PHYLIP format include
+NJplot (from Manolo Gouy and supplied with Clustal W), TreeView (Mac-PC), and
+PHYLIP itself - OR get the PHYLIP package and use the tree drawing facilities
+there. (Get the PHYLIP package anyway if you are interested in trees). The
+NEXUS format can be read into PAUP or MacClade.
+
+>>HELP 8 << Help for choosing a weight matrix
+
+For protein alignments, you use a weight matrix to determine the similarity of
+non-identical amino acids. For example, Tyr aligned with Phe is usually judged
+to be 'better' than Tyr aligned with Pro.
+
+There are three 'in-built' series of weight matrices offered. Each consists of
+several matrices which work differently at different evolutionary distances. To
+see the exact details, read the documentation. Crudely, we store several
+matrices in memory, spanning the full range of amino acid distance (from almost
+identical sequences to highly divergent ones). For very similar sequences, it
+is best to use a strict weight matrix which only gives a high score to
+identities and the most favoured conservative substitutions. For more divergent
+sequences, it is appropriate to use "softer" matrices which give a high score
+to many other frequent substitutions.
+
+1) BLOSUM (Henikoff). These matrices appear to be the best available for
+carrying out database similarity (homology searches). The matrices used are:
+Blosum 80, 62, 45 and 30. (BLOSUM was the default in earlier Clustal W
+versions)
+
+2) PAM (Dayhoff). These have been extremely widely used since the late '70s.
+We use the PAM 20, 60, 120 and 350 matrices.
+
+3) GONNET. These matrices were derived using almost the same procedure as the
+Dayhoff one (above) but are much more up to date and are based on a far larger
+data set. They appear to be more sensitive than the Dayhoff series. We use the
+GONNET 80, 120, 160, 250 and 350 matrices. This series is the default for
+Clustal W version 1.8.
+
+We also supply an identity matrix which gives a score of 1.0 to two identical
+amino acids and a score of zero otherwise. This matrix is not very useful.
+Alternatively, you can read in your own (just one matrix, not a series).
+
+A new matrix can be read from a file on disk, if the filename consists only
+of lower case characters. The values in the new weight matrix must be integers
+and the scores should be similarities. You can use negative as well as positive
+values if you wish, although the matrix will be automatically adjusted to all
+positive scores.
+
+
+
+For DNA, a single matrix (not a series) is used. Two hard-coded matrices are
+available:
+
+
+1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
+of nucleic acid sequences. X's and N's are treated as matches to any IUB
+ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
+
+
+2) CLUSTALW(1.6). The previous system used by Clustal W, in which matches score
+1.0 and mismatches score 0. All matches for IUB symbols also score 0.
+
+INPUT FORMAT The format used for a new matrix is the same as the BLAST program.
+Any lines beginning with a # character are assumed to be comments. The first
+non-comment line should contain a list of amino acids in any order, using the
+1 letter code, followed by a * character. This should be followed by a square
+matrix of integer scores, with one row and one column for each amino acid. The
+last row and column of the matrix (corresponding to the * character) contain
+the minimum score over the whole matrix.
+
+>>HELP 9 << Help for command line parameters
+ DATA (sequences)
+
+-INFILE=file.ext :input sequences.
+-PROFILE1=file.ext and -PROFILE2=file.ext :profiles (old alignment).
+
+
+ VERBS (do things)
+
+-OPTIONS :list the command line parameters
+-HELP or -CHECK :outline the command line params.
+-ALIGN :do full multiple alignment
+-TREE :calculate NJ tree.
+-BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
+-CONVERT :output the input sequences in a different file format.
+
+
+ PARAMETERS (set things)
+
+***General settings:****
+-INTERACTIVE :read command line, then enter normal interactive menus
+-QUICKTREE :use FAST algorithm for the alignment guide tree
+-TYPE= :PROTEIN or DNA sequences
+-NEGATIVE :protein alignment with negative values in matrix
+-OUTFILE= :sequence alignment file name
+-OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
+-OUTORDER= :INPUT or ALIGNED
+-CASE :LOWER or UPPER (for GDE output only)
+-SEQNOS= :OFF or ON (for Clustal output only)
+-SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
+-RANGE=m,n :sequence range to write starting m to m+n.
+
+***Fast Pairwise Alignments:***
+-KTUPLE=n :word size
+-TOPDIAGS=n :number of best diags.
+-WINDOW=n :window around best diags.
+-PAIRGAP=n :gap penalty
+-SCORE :PERCENT or ABSOLUTE
+
+
+***Slow Pairwise Alignments:***
+-PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
+-PWGAPOPEN=f :gap opening penalty
+-PWGAPEXT=f :gap extension penalty
+
+
+***Multiple Alignments:***
+-NEWTREE= :file for new guide tree
+-USETREE= :file for old guide tree
+-MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
+-GAPOPEN=f :gap opening penalty
+-GAPEXT=f :gap extension penalty
+-ENDGAPS :no end gap separation pen.
+-GAPDIST=n :gap separation pen. range
+-NOPGAP :residue-specific gaps off
+-NOHGAP :hydrophilic gaps off
+-HGAPRESIDUES= :list hydrophilic res.
+-MAXDIV=n :% ident. for delay
+-TYPE= :PROTEIN or DNA
+-TRANSWEIGHT=f :transitions weighting
+
+
+***Profile Alignments:***
+-PROFILE :Merge two alignments by profile alignment
+-NEWTREE1= :file for new guide tree for profile1
+-NEWTREE2= :file for new guide tree for profile2
+-USETREE1= :file for old guide tree for profile1
+-USETREE2= :file for old guide tree for profile2
+
+
+***Sequence to Profile Alignments:***
+-SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
+-NEWTREE= :file for new guide tree
+-USETREE= :file for old guide tree
+
+
+***Structure Alignments:***
+-NOSECSTR1 :do not use secondary structure-gap penalty mask for profile 1
+-NOSECSTR2 :do not use secondary structure-gap penalty mask for profile 2
+-SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
+-HELIXGAP=n :gap penalty for helix core residues
+-STRANDGAP=n :gap penalty for strand core residues
+-LOOPGAP=n :gap penalty for loop regions
+-TERMINALGAP=n :gap penalty for structure termini
+-HELIXENDIN=n :number of residues inside helix to be treated as terminal
+-HELIXENDOUT=n :number of residues outside helix to be treated as terminal
+-STRANDENDIN=n :number of residues inside strand to be treated as terminal
+-STRANDENDOUT=n:number of residues outside strand to be treated as terminal
+
+
+***Trees:***
+-OUTPUTTREE=nj OR phylip OR dist OR nexus
+-SEED=n :seed number for bootstraps.
+-KIMURA :use Kimura's correction.
+-TOSSGAPS :ignore positions with gaps.
+-BOOTLABELS=node OR branch :position of bootstrap values in tree display
+
+>>HELP 0 << Help for tree output format options
+
+Four output formats are offered: 1) Clustal, 2) Phylip, 3) Just the distances
+4) Nexus
+
+None of these formats displays the results graphically. Many packages can
+display trees in the PHYLIP format 2) below. It can also be imported into
+the PHYLIP programs RETREE, DRAWTREE and DRAWGRAM for graphical display.
+NEXUS format trees can be read by PAUP and MacClade.
+
+1) Clustal format output.
+This format is verbose and lists all of the distances between the sequences and
+the number of alignment positions used for each. The tree is described at the
+end of the file. It lists the sequences that are joined at each alignment step
+and the branch lengths. After two sequences are joined, it is referred to later
+as a NODE. The number of a NODE is the number of the lowest sequence in that
+NODE.
+
+2) Phylip format output.
+This format is the New Hampshire format, used by many phylogenetic analysis
+packages. It consists of a series of nested parentheses, describing the
+branching order, with the sequence names and branch lengths. It can be used by
+the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see the
+trees graphically. This is the same format used during multiple alignment for
+the guide trees.
+
+Use this format with NJplot (Manolo Gouy), supplied with Clustal W. Some other
+packages that can read and display New Hampshire format are TreeView (Mac/PC),
+TreeTool (UNIX), and Phylowin.
+
+3) The distances only.
+This format just outputs a matrix of all the pairwise distances in a format
+that can be used by the Phylip package. It used to be useful when one could not
+produce distances from protein sequences in the Phylip package but is now
+redundant (Protdist of Phylip 3.5 now does this).
+
+4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
+including PAUP and MacClade. The format is described fully in:
+Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
+NEXUS: an extensible file format for systematic information.
+Systematic Biology 46:590-621.
+
+5) TOGGLE PHYLIP BOOTSTRAP POSITIONS
+By default, the bootstrap values are placed on the nodes of the phylip format
+output tree. This is inaccurate as the bootstrap values should be associated
+with the tree branches and not the nodes. However, this format can be read and
+displayed by TreeTool, TreeView and Phylowin. An option is available to
+correctly place the bootstrap values on the branches with which they are
+associated.
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalx.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalx.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalx.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,129 @@
+#include <string.h>
+#include <stdlib.h>
+#include <vibrant.h>
+
+#include "clustalw.h"
+
+/*
+* Prototypes of the external functions Main() calls. ckfree() is also called
+* but not declared here - presumably clustalw.h provides it (TODO confirm).
+*/
+extern void *ckalloc(size_t);
+extern void init_interface(void);
+extern void init_matrix(void);
+extern void fill_chartab(void);
+extern void parse_params(Boolean);
+extern void x_menu(void);
+
+/*
+* Global variables. Only args, usemenu and interactive are used in this file;
+* the others are merely defined here (presumably for the other modules).
+*/
+double **tmat;
+
+char revision_level[] = "X (1.83)"; /* JULIE feb 2001*/
+Boolean interactive=TRUE; /* Main() sets this to TRUE again before x_menu() */
+#ifdef MSDOS /* the help file name differs on MSDOS builds */
+ char *help_file_name = "clustalx.hlp";
+#else
+ char *help_file_name = "clustalx_help";
+#endif
+
+sint max_names; /* maximum length of names in current alignment file */
+
+float gap_open, gap_extend; /* presumably the multiple alignment gap penalties - confirm */
+float pw_go_penalty, pw_ge_penalty; /* presumably the pairwise gap open/extend penalties - confirm */
+
+FILE *tree;
+FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile,
+ *gde_outfile, *nexus_outfile;
+FILE *fasta_outfile; /* Ramu */
+sint *seqlen_array;
+sint max_aln_length;
+short usermat[NUMRES][NUMRES], pw_usermat[NUMRES][NUMRES];
+short score_matrix[NUMRES][NUMRES],score_dnamatrix[NUMRES][NUMRES];
+short segment_matrix[NUMRES][NUMRES],segment_dnamatrix[NUMRES][NUMRES];
+short def_aa_xref[NUMRES+1], aa_xref[NUMRES+1], pw_aa_xref[NUMRES+1];
+short userdnamat[NUMRES][NUMRES], pw_userdnamat[NUMRES][NUMRES];
+short def_dna_xref[NUMRES+1], dna_xref[NUMRES+1], pw_dna_xref[NUMRES+1];
+short score_aa_xref[NUMRES+1],score_dna_xref[NUMRES+1];
+short segment_aa_xref[NUMRES+1],segment_dna_xref[NUMRES+1];
+sint nseqs;
+sint nsets;
+sint *output_index;
+sint **sets;
+sint *seq_weight;
+sint max_aa;
+sint gap_pos1;
+sint gap_pos2;
+sint mat_avscore;
+sint profile_no;
+
+Boolean usemenu=FALSE; /* Main() assigns FALSE again just before parse_params() */
+Boolean dnaflag;
+Boolean distance_tree;
+
+char **seq_array;
+char **names,**titles;
+char **args; /* copies of argv[1..argc-1]; allocated and freed inside Main() */
+char seqname[FILENAMELEN+1];
+
+char *gap_penalty_mask1 = NULL, *gap_penalty_mask2 = NULL;
+char *sec_struct_mask1 = NULL, *sec_struct_mask2 = NULL;
+sint struct_penalties;
+char *ss_name1 = NULL, *ss_name2 = NULL;
+
+Boolean user_series = FALSE;
+UserMatSeries matseries;
+short usermatseries[MAXMAT][NUMRES][NUMRES];
+short aa_xrefseries[MAXMAT][NUMRES+1];
+
+
+/*
+* Main - ClustalX program entry point.
+*
+* Runs the interface, matrix and character-table initialisers. When command
+* line arguments are present (non-WIN_MAC builds) they are copied into the
+* global args array around a parse_params(TRUE) call. Finally sets
+* interactive and enters x_menu(). Always returns 0.
+*/
+extern Int2 Main(void)
+{
+    int n;
+#ifndef WIN_MAC
+#ifdef GetArgc
+    int argc;
+    char **argv;
+    argc = GetArgc();
+    argv = GetArgv();
+#else
+    extern int argc;
+    extern char **argv;
+#endif
+#endif
+
+    init_interface();
+    init_matrix();
+    fill_chartab();
+
+#ifndef WIN_MAC
+    if (argc > 1) {
+        /* args[k] is a private copy of argv[k+1]; argv[0] is skipped */
+        args = (char **)ckalloc(argc * sizeof(char *));
+        for (n = 0; n < argc - 1; n++) {
+            args[n] = (char *)ckalloc((strlen(argv[n + 1]) + 1) * sizeof(char));
+            strcpy(args[n], argv[n + 1]);
+        }
+        usemenu = FALSE;
+        parse_params(TRUE);
+        /* the copies are released as soon as parse_params() returns */
+        for (n = 0; n < argc - 1; ++n) {
+            ckfree(args[n]);
+        }
+        ckfree(args);
+    }
+#endif
+    interactive = TRUE;
+    x_menu();
+    return 0;
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/clustalx.html
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalx.html 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalx.html 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,2112 @@
+<HEAD>
+<TITLE>ClustalX Help</TITLE>
+</HEAD>
+<BODY BGCOLOR=white>
+<CENTER><H1>ClustalX Help</H1></CENTER>
+<P>
+You can get the latest version of the ClustalX program here:
+</P>
+<DL><DD>
+<A HREF="ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/">
+ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/</A>
+</DL>
+<P>For full details of usage and algorithms, please read the <A HREF="clustalw.doc"><EM>ClustalW.Doc</EM></A> file.</P>
+<PRE><EM>
+Toby Gibson EMBL, Heidelberg, Germany.
+Des Higgins UCC, Cork, Ireland.
+Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
+</EM></PRE>
+<CENTER><H2><A NAME="Index">Index</A></H2></CENTER>
+<OL>
+<LI><A HREF="#G"> General help for CLUSTAL X (1.8)
+</A></LI>
+<LI><A HREF="#F"> Input / Output Files
+</A></LI>
+<LI><A HREF="#E"> Editing Alignments
+</A></LI>
+<LI><A HREF="#M"> Multiple Alignments
+</A></LI>
+<LI><A HREF="#P"> Profile and Structure Alignments
+</A></LI>
+<LI><A HREF="#B"> Secondary Structure / Gap Penalty Masks
+</A></LI>
+<LI><A HREF="#T"> Phylogenetic Trees
+</A></LI>
+<LI><A HREF="#C"> Colors
+</A></LI>
+<LI><A HREF="#Q"> Alignment Quality Analysis
+</A></LI>
+<LI><A HREF="#9"> Command Line Parameters
+</A></LI>
+<LI><A HREF="#R"> References
+</A></LI>
+</OL>
+<CENTER><H2><A NAME="G"> General help for CLUSTAL X (1.8)
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+Clustal X is a windows interface for the ClustalW multiple sequence alignment
+program. It provides an integrated environment for performing multiple sequence
+and profile alignments and analysing the results. The sequence alignment is
+displayed in a window on the screen. A versatile coloring scheme has been
+incorporated allowing you to highlight conserved features in the alignment.
+The pull-down menus at the top of the window allow you to select all the
+options required for traditional multiple sequence and profile alignment.
+</P>
+<P>
+You can cut-and-paste sequences to change the order of the alignment; you can
+select a subset of sequences to be aligned; you can select a sub-range of the
+alignment to be realigned and inserted back into the original alignment.
+</P>
+<P>
+Alignment quality analysis can be performed and low-scoring segments or
+exceptional residues can be highlighted.
+</P>
+<P>
+ClustalX is available for a number of different platforms including: SUN
+Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
+Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
+the README file for Installation instructions.)
+</P>
+<P>
+</P>
+<P>
+<H4>
+SEQUENCE INPUT
+</H4>
+</P>
+<P>
+Sequences and profiles (a term for pre-existing alignments) are input using
+the FILE menu. Invalid options will be disabled. All sequences must be included
+into 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
+Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
+All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
+except "-" which is used to indicate a GAP ("." in MSF/RSF).
+</P>
+<P>
+<H4>
+SEQUENCE / PROFILE ALIGNMENTS
+</H4>
+</P>
+<P>
+Clustal X has two modes which can be selected using the switch directly above
+the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
+</P>
+<P>
+To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
+MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
+menu then allows you to either produce a guide tree for the alignment, or to do
+a multiple alignment following the guide tree, or to do a full multiple
+alignment.
+</P>
+<P>
+In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
+to align 2 alignments (termed profiles). Profiles are also used to add a new
+sequence to an old alignment, or to use secondary structure to guide the
+alignment process. GAPS in the old alignments are indicated using the "-"
+character. PROFILES can be input in ANY of the allowed formats; just use "-"
+(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
+"Lock Scroll" is displayed which allows you to scroll the two profiles together
+using a single scroll bar. When the Lock Scroll is turned off, the two profiles
+can be scrolled independently.
+</P>
+<P>
+<H4>
+PHYLOGENETIC TREES
+</H4>
+</P>
+<P>
+Phylogenetic trees can be calculated from old alignments (read in with "-"
+characters to indicate gaps) OR after a multiple alignment while the alignment
+is still displayed.
+</P>
+<P>
+<H4>
+ALIGNMENT DISPLAY
+</H4>
+</P>
+<P>
+The alignment is displayed on the screen with the sequence names on the left
+hand side. The sequence alignment is for display only, it cannot be edited here
+(except for changing the sequence order by cutting-and-pasting on the sequence
+names).
+</P>
+<P>
+A ruler is displayed below the sequences, starting at 1 for the first residue
+position (residue numbers in the sequence input file are ignored).
+</P>
+<P>
+A line above the alignment is used to mark strongly conserved positions. Three
+characters ('*', ':' and '.') are used:
+</P>
+<P>
+'*' indicates positions which have a single, fully conserved residue
+</P>
+<P>
+':' indicates that one of the following 'strong' groups is fully conserved:-
+<PRE>
+ STA
+ NEQK
+ NHQK
+ NDEQ
+ QHRK
+ MILV
+ MILF
+ HY
+ FYW
+</PRE>
+</P>
+<P>
+'.' indicates that one of the following 'weaker' groups is fully conserved:-
+<PRE>
+ CSA
+ ATV
+ SAG
+ STNK
+ STPA
+ SGND
+ SNDEQK
+ NDEQHK
+ NEQHRK
+ FVLIM
+ HFY
+</PRE>
+</P>
+<P>
+These are all the positively scoring groups that occur in the Gonnet Pam250
+matrix. The strong and weak groups are defined as strong score >0.5 and weak
+score =<0.5 respectively.
+</P>
+<P>
+For profile alignments, secondary structure and gap penalty masks are displayed
+above the sequences, if any data is found in the profile input file.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#Index"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="F"> Input / Output Files
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
+sequences that are already loaded. All sequences must be in 1 file. The formats
+that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
+(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
+non-alphabetic characters (spaces, digits, punctuation marks) are ignored
+except "-" which is used to indicate a GAP ("." in MSF/RSF).
+</P>
+<P>
+The program tries to automatically recognise the different file formats used
+and to guess whether the sequences are amino acid or nucleotide. This is not
+always foolproof.
+</P>
+<P>
+FASTA and NBRF/PIR formats are recognised by having a ">" as the first
+character in the file.
+</P>
+<P>
+EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
+file (the token for the entry name field).
+</P>
+<P>
+CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
+</P>
+<P>
+GCG/MSF format is recognised by one of the following:
+<UL>
+<LI>
+ - the word PileUp at the start of the file.
+</LI><LI>
+ - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ at the start of the file.
+</LI><LI>
+ - the word MSF on the first line of the file, and the characters ..
+ at the end of this line.
+</LI>
+</UL>
+</P>
+<P>
+GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
+the file.
+</P>
+<P>
+</P>
+<P>
+If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
+sequence will be assumed to be nucleotide. This works in 97.3% of cases but
+watch out!
+</P>
+<P>
+APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
+do not replace those already loaded, but are appended at the end of the
+alignment.
+</P>
+<P>
+SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
+CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
+to a single file. Options are available to save a range of the alignment,
+switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
+for CLUSTAL files.
+</P>
+<P>
+LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
+sequences already loaded as Profile 1. This option will also remove any
+sequences which are loaded in Profile 2.
+</P>
+<P>
+LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
+sequences already loaded as Profile 2.
+</P>
+<P>
+SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
+those sequences in Profile 1 will be written to the output file.
+</P>
+<P>
+SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
+those sequences in Profile 2 will be written to the output file.
+</P>
+<P>
+WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
+format file. This will include any secondary structure / gap penalty mask
+information and the consensus and ruler lines which are displayed on the
+screen. The Alignment Quality curve can be optionally included in the output
+file.
+</P>
+<P>
+WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
+except that only the profile 1 display will be printed.
+</P>
+<P>
+WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
+except that only the profile 2 display will be printed.
+</P>
+<P>
+</P>
+<P>
+<H4>
+POSTSCRIPT PARAMETERS
+</H4>
+</P>
+<P>
+A number of options are available to allow you to configure your postscript
+output file.
+</P>
+<P>
+PS COLORS FILE:
+</P>
+<P>
+The exact RGB values required to reproduce the colors used in the alignment
+window will vary from printer to printer. A PS colors file can be specified
+that contains the RGB values for all the colors required by each of your
+postscript printers.
+</P>
+<P>
+By default, Clustal X looks for a file called 'colprint.par' in the current
+directory (if you are running under UNIX, it then looks in your home directory,
+and finally in the directories in your PATH environment variable). If no PS
+colors file is found or a color used on the screen is not defined here, the
+screen RGB values (from the Color Parameter File) are used.
+</P>
+<P>
+The PS colors file consists of one line for each color to be defined, with the
+color name followed by the RGB values (on a scale of 0 to 1). For example,
+</P>
+<P>
+RED 0.9 0.1 0.1
+</P>
+<P>
+Blank lines and comments (lines beginning with a '#' character) are ignored.
+</P>
+<P>
+</P>
+<P>
+PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
+pages.
+</P>
+<P>
+ORIENTATION: The alignment can be displayed on either a landscape or portrait
+page.
+</P>
+<P>
+PRINT HEADER: An optional header including the postscript filename, and
+creation date can be printed at the top of each page.
+</P>
+<P>
+PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
+the alignment on the screen can be included in the postscript output.
+</P>
+<P>
+PRINT RULER: The ruler which is displayed underneath the alignment on the
+screen can be included in the postscript output.
+</P>
+<P>
+PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
+hand side of the alignment.
+</P>
+<P>
+RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
+selected. This option can be turned off, in which case a font size of 10 will
+be used for the sequences.
+</P>
+<P>
+PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
+is to print the full alignment. The first and last residues to be printed are
+specified here.
+</P>
+<P>
+USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
+number of residues in a block is specified here. More than one block may then
+be printed on a single page. This is useful for long alignments of a small
+number of sequences. If the block length is set to 0, the alignment will not
+be divided into blocks, but printed across a number of pages.
+</P>
+<P>
+</P>
+<A HREF="#Index"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="E"> Editing Alignments
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+Clustal X allows you to change the order of the sequences in the alignment, by
+cutting-and-pasting the sequence names.
+</P>
+<P>
+To select a group of sequences to be moved, click on a sequence name and drag
+the cursor until all the required sequences are highlighted. Holding down the
+Shift key when clicking on the first name will add new sequences to those
+already selected.
+</P>
+<P>
+(Options are provided to Select All Sequences, Select Profile 1 or Select
+Profile 2.)
+</P>
+<P>
+The selected sequences can be removed from the alignment by using the EDIT
+menu, CUT option.
+</P>
+<P>
+To add the cut sequences back into an alignment, select a sequence by clicking
+on the sequence name. The cut sequences will be added to the alignment,
+immediately following the selected sequence, by the EDIT menu, PASTE option.
+</P>
+<P>
+To add the cut sequences to an empty alignment (eg. when cutting sequences from
+Profile 1 and pasting them to Profile 2), click on the empty sequence name
+display area, and select the EDIT menu, PASTE option as before.
+</P>
+<P>
+The sequence selection and sequence range selection can be cleared using the
+EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
+respectively.
+</P>
+<P>
+To search for a string of residues in the sequences, select the sequences to be
+searched by clicking on the sequence names. You can then enter the string to
+search for by selecting the SEARCH FOR STRING option. If the string is found in
+any of the sequences selected, the sequence name and column number is printed
+below the sequence display.
+</P>
+<P>
+In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
+alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
+displayed as Profile 2 will be appended to Profile 1.
+</P>
+<P>
+The REMOVE ALL GAPS option will remove all gaps from the sequences currently
+selected.
+WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
+but also those that were read from the input alignment file. Any secondary
+structure information associated with the alignment will NOT be automatically
+realigned.
+</P>
+<P>
+The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
+contain gaps in all sequences. This can occur as a result of removing divergent
+sequences from an alignment, or if an alignment has been realigned.
+</P>
+<P>
+</P>
+<A HREF="#Index"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="M"> Multiple Alignments
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
+the sequence display area. Then, use the ALIGNMENT menu to do multiple
+alignments.
+</P>
+<P>
+Multiple alignments are carried out in 3 stages:
+</P>
+<P>
+1) all sequences are compared to each other (pairwise alignments);
+</P>
+<P>
+2) a dendrogram (like a phylogenetic tree) is constructed, describing the
+approximate groupings of the sequences by similarity (stored in a file).
+</P>
+<P>
+3) the final multiple alignment is carried out, using the dendrogram as a guide.
+</P>
+<P>
+The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
+You can skip the first stages (pairwise alignments; guide tree) by using an old
+guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
+guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
+</P>
+<P>
+</P>
+<P>
+REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
+alignment. Sequences can be selected by clicking on the sequence names - see
+Editing Alignments for more details. The unselected sequences are then 'fixed'
+and a profile is made including only the unselected sequences. Each of the
+selected sequences in turn is then realigned to this profile. The realigned
+sequences will be displayed as a group at the end of the alignment.
+</P>
+<P>
+</P>
+<P>
+REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
+alignment. A residue range can be selected by clicking on the sequence display
+area. A multiple alignment is then performed, following the 3 stages described
+above, but only using the selected residue range. Finally the new alignment of
+the range is pasted back into the full sequence alignment.
+</P>
+<P>
+By default, gap penalties are used at each end of the subrange in order to
+penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
+switched off, gaps can be introduced at the ends of the residue range at no
+cost.
+</P>
+<P>
+</P>
+<P>
+ALIGNMENT PARAMETERS displays a sub-menu with the following options:
+</P>
+<P>
+RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
+sequences during multiple alignment if you wish to change the parameters and
+try again. This only takes effect just before you do a second multiple
+alignment. You can make phylogenetic trees after alignment whether or not this
+is ON. If you turn this OFF, the new gaps are kept even if you do a second
+multiple alignment. This allows you to iterate the alignment gradually.
+Sometimes, the alignment is improved by a second or third pass.
+</P>
+<P>
+RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
+gaps which were read in from the sequence input file. This only takes effect
+just before you do a second multiple alignment. You can make phylogenetic
+trees after alignment whether or not this is ON. If you turn this OFF, all
+gaps are kept even if you do a second multiple alignment. This allows you to
+iterate the alignment gradually. Sometimes, the alignment is improved by a
+second or third pass.
+</P>
+<P>
+</P>
+<P>
+PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
+alignments.
+</P>
+<P>
+MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
+alignments.
+</P>
+<P>
+PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
+various parameters only used in the alignment of protein sequences.
+</P>
+<P>
+(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
+allows you to set various parameters only used with gap penalty masks.)
+</P>
+<P>
+SAVE LOG FILE will write the alignment calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+</P>
+<P>
+</P>
+<P>
+<H4>
+OUTPUT FORMAT OPTIONS
+</H4>
+</P>
+<P>
+You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
+PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
+</P>
+<P>
+CLUSTAL format output is a self-explanatory alignment format. It shows the
+sequences aligned in blocks. It can be read in again at a later date to (for
+example) calculate a phylogenetic tree or add in new sequences by profile
+alignment.
+</P>
+<P>
+GCG output can be used by any of the GCG programs that can work on multiple
+alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
+.msf format files (multiple sequence file); new in version 7 of GCG.
+</P>
+<P>
+NEXUS format is used by several phylogeny programs, including PAUP and
+MacClade.
+</P>
+<P>
+PHYLIP format output can be used for input to the PHYLIP package of Joe
+Felsenstein. This is a very widely used package for doing every imaginable
+form of phylogenetic analysis (MUCH more than the modest introduction
+offered by this program).
+</P>
+<P>
+NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
+characters "-" are used to indicate the positions of gaps in the multiple
+alignment. These files can be re-used as input in any part of clustal that
+allows sequences (or alignments or profiles) to be read in.
+</P>
+<P>
+GDE: this format is used by the GDE package of Steven Smith and is understood
+by SEQLAB in GCG 9 or later.
+</P>
+<P>
+GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
+lower case.
+</P>
+<P>
+CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
+alignment lines in clustalw format.
+</P>
+<P>
+OUTPUT ORDER is used to control the order of the sequences in the output
+alignments. By default, it uses the order in which the sequences were aligned
+(from the guide tree/dendrogram), thus automatically grouping closely related
+sequences. It can be switched to be the same as the original input order.
+</P>
+<P>
+PARAMETER OUTPUT: This option will save all your parameter settings in a
+parameter file (suffix .par) during alignment. The file can be subsequently
+used to rerun ClustalW using the same parameters.
+</P>
+<P>
+</P>
+<P>
+<H3>
+ALIGNMENT PARAMETERS
+</H3>
+</P>
+<P>
+<STRONG>
+PAIRWISE ALIGNMENT PARAMETERS
+</STRONG>
+</P>
+<P>
+A distance is calculated between every pair of sequences and these are used to
+construct the phylogenetic tree which guides the final multiple alignment. The
+scores are calculated from separate pairwise alignments. These can be
+calculated using 2 methods: dynamic programming (slow but accurate) or by the
+method of Wilbur and Lipman (extremely fast but approximate).
+</P>
+<P>
+You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
+option. The slow/accurate method is fast enough for short sequences but will be
+VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+SLOW-ACCURATE alignment parameters:
+</STRONG>
+</P>
+<P>
+These parameters do not have any effect on the speed of the alignments. They
+are used to give initial alignments which are then rescored to give percent
+identity scores. These % scores are the ones which are displayed on the
+screen. The scores are converted to distances for the trees.
+</P>
+<P>
+Gap Open Penalty: the penalty for opening a gap in the alignment.
+</P>
+<P>
+Gap Extension Penalty: the penalty for extending a gap by 1 residue.
+</P>
+<P>
+Protein Weight Matrix: the scoring table which describes the similarity of
+each amino acid to each other.
+</P>
+<P>
+Load protein matrix: allows you to read in a comparison table from a file.
+</P>
+<P>
+DNA weight matrix: the scores assigned to matches and mismatches (including
+IUB ambiguity codes).
+</P>
+<P>
+Load DNA matrix: allows you to read in a comparison table from a file.
+</P>
+<P>
+See the Multiple alignment parameters, MATRIX option below for details of the
+matrix input format.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+FAST-APPROXIMATE alignment parameters:
+</STRONG>
+</P>
+<P>
+These similarity scores are calculated from fast, approximate, global
+alignments, which are controlled by 4 parameters. 2 techniques are used to make
+these alignments very fast: 1) only exactly matching fragments (k-tuples) are
+considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
+are used.
+</P>
+<P>
+GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
+little effect on the speed or sensitivity except for extreme values.
+</P>
+<P>
+K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
+INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
+For longer sequences (e.g. >1000 residues) you may wish to increase the
+default.
+</P>
+<P>
+TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
+dot-matrix plot) is calculated. Only the best ones (with most matches) are used
+in the alignment. This parameter specifies how many. Decrease for speed;
+increase for sensitivity.
+</P>
+<P>
+WINDOW SIZE: This is the number of diagonals around each of the 'best'
+diagonals that will be used. Decrease for speed; increase for sensitivity.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+MULTIPLE ALIGNMENT PARAMETERS
+</STRONG>
+</P>
+<P>
+These parameters control the final multiple alignment. This is the core of the
+program and the details are complicated. To fully understand the use of the
+parameters and the scoring system, you will have to refer to the documentation.
+</P>
+<P>
+Each step in the final multiple alignment consists of aligning two alignments
+or sequences. This is done progressively, following the branching order in the
+GUIDE TREE. The basic parameters to control this are two gap penalties and the
+scores for various identical/non-identical residues.
+</P>
+<P>
+The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
+cost of opening up every new gap and the cost of every item in a gap.
+Increasing the gap opening penalty will make gaps less frequent. Increasing
+the gap extension penalty will make gaps shorter. Terminal gaps are not
+penalised.
+</P>
+<P>
+The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
+related sequences until after the most closely related sequences have been
+aligned. The setting shows the percent identity level required to delay the
+addition of a sequence; sequences that are less identical than this level to
+any other sequences will be aligned later.
+</P>
+<P>
+The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
+pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
+means that the transitions are scored as mismatches, while a weight of 1 gives
+the transitions the match score. For distantly related DNA sequences, the
+weight should be near to zero; for closely related sequences it can be useful
+to assign a higher score. The default is set to 0.5.
+</P>
+<P>
+</P>
+<P>
+The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
+matrices. For protein alignments, you use a weight matrix to determine the
+similarity of non-identical amino acids. For example, Tyr aligned with Phe is
+usually judged to be 'better' than Tyr aligned with Pro.
+</P>
+<P>
+There are three 'in-built' series of weight matrices offered. Each consists of
+several matrices which work differently at different evolutionary distances. To
+see the exact details, read the documentation. Crudely, we store several
+matrices in memory, spanning the full range of amino acid distance (from almost
+identical sequences to highly divergent ones). For very similar sequences, it
+is best to use a strict weight matrix which only gives a high score to
+identities and the most favoured conservative substitutions. For more divergent
+sequences, it is appropriate to use "softer" matrices which give a high score
+to many other frequent substitutions.
+</P>
+<P>
+1) BLOSUM (Henikoff). These matrices appear to be the best available for
+carrying out data base similarity (homology searches). The matrices currently
+used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
+versions.
+</P>
+<P>
+2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
+currently use the PAM 20, 60, 120, 350 matrices.
+</P>
+<P>
+3) GONNET. These matrices were derived using almost the same procedure as the
+Dayhoff one (above) but are much more up to date and are based on a far larger
+data set. They appear to be more sensitive than the Dayhoff series. We
+currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
+default for Clustal X version 1.8.
+</P>
+<P>
+We also supply an identity matrix which gives a score of 10 to two identical
+amino acids and a score of zero otherwise. This matrix is not very useful.
+</P>
+<P>
+Load protein matrix: allows you to read in a comparison matrix from a file.
+This can be either a single matrix or a series of matrices (see below for
+format).
+</P>
+<P>
+</P>
+<P>
+DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
+used for aligning nucleic acid sequences. Two hard-coded matrices are available:
+</P>
+<P>
+1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
+of nucleic acid sequences. X's and N's are treated as matches to any IUB
+ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
+</P>
+<P>
+2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
+1.0 and mismatches score 0. All matches for IUB symbols also score 0.
+</P>
+<P>
+Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
+file (just one matrix, not a series).
+</P>
+<P>
+</P>
+<P>
+SINGLE MATRIX INPUT FORMAT
+The format used for a single matrix is the same as the BLAST program. The
+scores in the new weight matrix should be similarities. You can use negative as
+well as positive values if you wish, although the matrix will be automatically
+adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
+Any lines beginning with a # character are assumed to be comments. The first
+non-comment line should contain a list of amino acids in any order, using the 1
+letter code, followed by a * character. This should be followed by a square
+matrix of scores, with one row and one column for each amino acid. The last row
+and column of the matrix (corresponding to the * character) contain the minimum
+score over the whole matrix.
+</P>
+<P>
+MATRIX SERIES INPUT FORMAT
+ClustalX uses different matrices depending on the mean percent identity of the
+sequences to be aligned. You can specify a series of matrices and the range of
+the percent identity for each matrix in a matrix series file. The file is
+automatically recognised by the word CLUSTAL_SERIES at the beginning of the
+file. Each matrix in the series is then specified on one line which should
+start with the word MATRIX. This is followed by the lower and upper limits of
+the sequence percent identities for which you want to apply the matrix. The
+final entry on the matrix line is the filename of a Blast format matrix file
+(see above for details of the single matrix file format).
+</P>
+<P>
+Example.
+</P>
+<P>
+CLUSTAL_SERIES
+</P>
+<P>
+MATRIX 81 100 /us1/user/julie/matrices/blosum80
+MATRIX 61 80 /us1/user/julie/matrices/blosum62
+MATRIX 31 60 /us1/user/julie/matrices/blosum45
+MATRIX 0 30 /us1/user/julie/matrices/blosum30
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+PROTEIN GAP PARAMETERS
+</STRONG>
+</P>
+<P>
+RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
+increase the gap opening penalties at each position in the alignment or
+sequence. See the documentation for details. As an example, positions that are
+rich in glycine are more likely to have an adjacent gap than positions that are
+rich in valine.
+</P>
+<P>
+HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
+run (5 or more residues) of hydrophilic amino acids; these are likely to be
+loop or random coil regions where gaps are more common. The residues that are
+"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
+</P>
+<P>
+GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
+to each other. Gaps that are less than this distance apart are penalised more
+than other gaps. This does not prevent close gaps; it makes them less frequent,
+promoting a block-like appearance of the alignment.
+</P>
+<P>
+END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
+avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
+turn this off, end gaps will be ignored for this purpose. This is useful when
+you wish to align fragments where the end gaps are not biologically meaningful.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="P"> Profile and Structure Alignments
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
+alignments allow you to store alignments of your favourite sequences and add
+new sequences to them in small bunches at a time. A profile is simply an
+alignment of one or more sequences (e.g. an alignment output file from Clustal
+X). Each input can be a single sequence. One or both sets of input sequences
+may include secondary structure assignments or gap penalty masks to guide the
+alignment.
+</P>
+<P>
+Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
+the sequence display area. Then, use the ALIGNMENT menu to do profile and
+secondary structure alignments.
+</P>
+<P>
+The profiles can be in any of the allowed input formats with "-" characters
+used to specify gaps (except for GCG/MSF where "." is used).
+</P>
+<P>
+You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
+PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
+profiles to each other. Secondary structure masks in either profile can be used
+to guide the alignment. This option compares all the sequences in profile 1
+with all the sequences in profile 2 in order to build guide trees which will be
+used to calculate sequence weights, and select appropriate alignment parameters
+for the final profile alignment.
+</P>
+<P>
+You can skip the first stage (pairwise alignments; guide trees) by using old
+guide tree files (ALIGN PROFILES FROM GUIDE TREES).
+</P>
+<P>
+The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
+profile and align them to the first profile, 1 at a time. This is useful to
+add some new sequences to an existing alignment, or to align a set of sequences
+to a known structure. In this case, the second profile set need not be
+pre-aligned.
+</P>
+<P>
+You can skip the first stage (pairwise alignments; guide tree) by using an old
+guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
+</P>
+<P>
+SAVE LOG FILE will write the alignment calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+</P>
+<P>
+The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
+Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
+These are EXACTLY the same parameters as used by the general, automatic
+multiple alignment procedure. The general multiple alignment procedure is
+simply a series of profile alignments. Carrying out a series of profile
+alignments on larger and larger groups of sequences, allows you to manually
+build up a complete alignment, if necessary editing intermediate alignments.
+</P>
+<P>
+<STRONG>
+SECONDARY STRUCTURE PARAMETERS
+</STRONG>
+</P>
+<P>
+Use this menu to set secondary structure options. If a solved structure is
+known, it can be used to guide the alignment by raising gap penalties within
+secondary structure elements, so that gaps will preferentially be inserted into
+unstructured surface loop regions. Alternatively, a user-specified gap penalty
+mask can be supplied for a similar purpose.
+</P>
+<P>
+A gap penalty mask is a series of numbers between 1 and 9, one per position in
+the alignment. Each number specifies how much the gap opening penalty is to be
+raised at that position (raised by multiplying the basic gap opening penalty
+by the number) i.e. a mask figure of 1 at a position means no change
+in gap opening penalty; a figure of 4 means that the gap opening penalty is
+four times greater at that position, making gaps 4 times harder to open.
+</P>
+<P>
+The format for gap penalty masks and secondary structure masks is explained in
+a separate help section.
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="B"> Secondary Structure / Gap Penalty Masks
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+The use of secondary structure-based penalties has been shown to improve the
+accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
+penalty masks to be supplied with the input sequences used during profile
+alignment. (NB. The secondary structure information is NOT used during multiple
+sequence alignment). The masks work by raising gap penalties in specified
+regions (typically secondary structure elements) so that gaps are
+preferentially opened in the less well conserved regions (typically surface
+loops).
+</P>
+<P>
+The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
+whether the input 2D-structure information or gap penalty masks will be used
+during the profile alignment.
+</P>
+<P>
+The OUTPUT options control whether the secondary structure and gap penalty
+masks should be included in the Clustal X output alignments. Showing both is
+useful for understanding how the masks work. The 2D-structure information is
+itself useful in judging the alignment quality and in seeing how residue
+conservation patterns vary with secondary structure.
+</P>
+<P>
+The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
+penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
+format, capital residues denote the A and B core structure notation. Basic gap
+penalties are multiplied by the amount specified.
+</P>
+<P>
+The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
+By default this penalty is not raised. In CLUSTAL format, loops are specified
+by "." in the secondary structure notation.
+</P>
+<P>
+The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
+penalty at the ends of secondary structures. Ends of secondary structures are
+known to grow or shrink, comparing related structures. Therefore by default
+these are given intermediate values, lower than the core penalties. All
+secondary structure read in as lower case in CLUSTAL format gets the reduced
+terminal penalty.
+</P>
+<P>
+The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
+termini for the intermediate penalties. In the alignment output, these are
+indicated as lower case. For Alpha Helices, by default, the range spans the
+end-helical turn (3 residues). For Beta Strands, the default range spans the
+end residue and the adjacent loop residue, since sequence conservation often
+extends beyond the actual H-bonded Beta Strand.
+</P>
+<P>
+Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
+files. For many 3-D protein structures, secondary structure information is
+recorded in the feature tables of SWISS-PROT database entries. You should
+always check that the assignments are correct - some are quite inaccurate.
+Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
+</P>
+<P>
+</P>
+<P>
+<PRE>
+FT HELIX 100 115
+FT STRAND 118 119
+</PRE>
+</P>
+<P>
+The structure and penalty masks can also be read from CLUSTAL alignment format
+as comment lines beginning "!SS_" or "!GM_" e.g.
+</P>
+<P>
+<PRE>
+!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
+!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
+HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
+</PRE>
+</P>
+<P>
+Note that the mask itself is a set of numbers between 1 and 9 each of which is
+assigned to the residue(s) in the same column below.
+</P>
+<P>
+In GDE flat file format, the masks are specified as text and the names must
+begin with "SS_" or "GM_".
+</P>
+<P>
+Either a structure or penalty mask or both may be used. If both are included
+in an alignment, the user will be asked which is to be used.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="T"> Phylogenetic Trees
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+Before calculating a tree, you must have an ALIGNMENT in memory. This can be
+input using the FILE menu, LOAD SEQUENCES option or you should have just
+carried out a full multiple alignment and the alignment is still in memory.
+Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
+</P>
+<P>
+The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
+you calculate distances (percent divergence) between all pairs of sequences from
+a multiple alignment; second you apply the NJ method to the distance matrix.
+</P>
+<P>
+To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
+and all branch lengths. The root of the tree can only be inferred by using an
+outgroup (a sequence that you are certain branches at the outside of the tree
+.... certain on biological grounds) OR if you assume a degree of constancy in
+the 'molecular clock', you can place the root in the 'middle' of the tree
+(roughly equidistant from all tips).
+</P>
+<P>
+BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
+groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
+making N random samples of sites from the alignment (N should be LARGE, e.g.
+500 - 1000); drawing N trees (1 from each sample) and counting how many times
+each grouping from the original tree occurs in the sample trees. You can set N
+using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
+practice, you should use a large number of bootstrap replicates (1000 is
+recommended, even if it means running the program for an hour on a slow
+computer). You can also supply a seed number for the random number generator
+here. Different runs with the same seed will give the same answer. See the
+documentation for more details.
+</P>
+<P>
+EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
+ANY of the sequences have a gap will be ignored. This means that 'like' will
+be compared to 'like' in all distances, which is highly desirable. It also
+automatically throws away the most ambiguous parts of the alignment, which are
+concentrated around gaps (usually). The disadvantage is that you may throw away
+much of the data if there are many gaps (which is why it is difficult for us to
+make it the default).
+</P>
+<P>
+CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
+makes no difference. For greater divergence, this option corrects for the fact
+that observed distances underestimate actual evolutionary distances. This is
+because, as sequences diverge, more than one substitution will happen at many
+sites. However, you only see one difference when you look at the present day
+sequences. Therefore, this option has the effect of stretching branch lengths
+in trees (especially long branches). The corrections used here (for DNA or
+proteins) are both due to Motoo Kimura. See the documentation for details.
+</P>
+<P>
+Where possible, this option should be used. However, for VERY divergent
+sequences, the distances cannot be reliably corrected. You will be warned if
+this happens. Even if none of the distances in a data set exceed the reliable
+threshold, if you bootstrap the data, some of the bootstrap distances may
+randomly exceed the safe limit.
+</P>
+<P>
+SAVE LOG FILE will write the tree calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+</P>
+<P>
+<H4>
+OUTPUT FORMAT OPTIONS
+</H4>
+</P>
+<P>
+Four different formats are allowed. None of these displays the tree visually.
+You can display the tree using the NJPLOT program distributed with Clustal X
+OR get the PHYLIP package and use the tree drawing facilities there.
+</P>
+<P>
+1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
+between the sequences and the number of alignment positions used for each. The
+tree is described at the end of the file. It lists the sequences that are
+joined at each alignment step and the branch lengths. After two sequences are
+joined, the pair is referred to later as a NODE. The number of a NODE is the number
+of the lowest sequence in that NODE.
+</P>
+<P>
+2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
+phylogenetic analysis packages. It consists of a series of nested parentheses,
+describing the branching order, with the sequence names and branch lengths. It
+can be read by the NJPLOT program distributed with ClustalX. It can also be
+used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
+the trees graphically. This is the same format used during multiple alignment
+for the guide trees. Some other packages that can read and display New
+Hampshire format are TreeTool, TreeView, and Phylowin.
+</P>
+<P>
+3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
+pairwise distances in a format that can be used by the PHYLIP package. It used
+to be useful when one could not produce distances from protein sequences in the
+Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
+</P>
+<P>
+4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
+including PAUP and MacClade. The format is described fully in:
+Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
+NEXUS: an extensible file format for systematic information.
+Systematic Biology 46:590-621.
+</P>
+<P>
+BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
+the tree branches of the phylip format output tree. The toggle allows them to
+be placed on the nodes, which is incorrect, but some display packages (e.g.
+TreeTool, TreeView and Phylowin) only support node labelling but not branch
+labelling. Care should be taken to note which branches and labels go together.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="C"> Colors
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+Clustal X provides a versatile coloring scheme for the sequence alignment
+display. The sequences (or profiles) are colored automatically, when they are
+loaded. Sequences can be colored either by assigning a color to specific
+residues, or on the basis of an alignment consensus. In the latter case, the
+alignment consensus is calculated automatically, and the residues in each
+column are colored according to the consensus character assigned to that
+column. In this way, you can choose to highlight, for example, conserved
+hydrophilic or hydrophobic positions in the alignment.
+</P>
+<P>
+The 'rules' used to color the alignment are specified in a COLOR PARAMETER
+FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
+sequences or 'coldna.par' for DNA, in the current directory. (If you are running
+under UNIX, it then looks in your home directory, and finally in the
+directories in your PATH environment variable).
+</P>
+<P>
+By default, if no color parameter file is found, protein sequences are colored
+by residue as follows:
+</P>
+<P>
+<PRE>
+ Color Residue Code
+</P>
+<P>
+ ORANGE GPST
+ RED HKR
+ BLUE FWY
+ GREEN ILMV
+</PRE>
+</P>
+<P>
+In the case of DNA sequences, the default colors are as follows:
+</P>
+<P>
+<PRE>
+ Color Residue Code
+</P>
+<P>
+ ORANGE A
+ RED C
+ BLUE T
+ GREEN G
+</PRE>
+</P>
+<P>
+</P>
+<P>
+The default BACKGROUND COLORING option shows the sequence residues using a
+black character on a colored background. It can be switched off to show
+residues as a colored character on a white background.
+</P>
+<P>
+Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
+option looks first for the color parameter file (as described above) and, if no
+file is found, uses the default residue-specific colors.
+</P>
+<P>
+You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
+option. The format of the color parameter file is described below.
+</P>
+<P>
+<H4>
+COLOR PARAMETER FILE
+</H4>
+</P>
+<P>
+This file is divided into 3 sections:
+</P>
+<P>
+1) the names and rgb values of the colors
+2) the rules for calculating the consensus
+3) the rules for assigning colors to the residues
+</P>
+<P>
+An example file is given here.
+</P>
+<P>
+<PRE>
+ --------------------------------------------------------------------
+@rgbindex
+RED 0.9 0.1 0.1
+BLUE 0.1 0.1 0.9
+GREEN 0.1 0.9 0.1
+YELLOW 0.9 0.9 0.0
+</P>
+<P>
+@consensus
+% = 60% w:l:v:i:m:a:f:c:y:h:p
+# = 80% w:l:v:i:m:a:f:c:y:h:p
+- = 50% e:d
++ = 60% k:r
+q = 50% q:e
+p = 50% p
+n = 50% n
+t = 50% t:s
+</P>
+<P>
+@color
+g = RED
+p = YELLOW
+t = GREEN if t:%:#
+n = GREEN if n
+w = BLUE if %:#:p
+k = RED if +
+ --------------------------------------------------------------------
+</PRE>
+</P>
+<P>
+The first section is optional and is identified by the header @rgbindex. If
+this section exists, each color used in the file must be named and the rgb
+values specified (on a scale from 0 to 1). If the rgb index section is not
+found, the following set of hard-coded colors will be used.
+</P>
+<P>
+<PRE>
+RED 0.9 0.1 0.1
+BLUE 0.1 0.1 0.9
+GREEN 0.1 0.9 0.1
+ORANGE 0.9 0.7 0.3
+CYAN 0.1 0.9 0.9
+PINK 0.9 0.5 0.5
+MAGENTA 0.9 0.1 0.9
+YELLOW 0.9 0.9 0.0
+</PRE>
+</P>
+<P>
+The second section is optional and is identified by the header @consensus. It
+defines how the consensus is calculated.
+</P>
+<P>
+The format of each consensus parameter is:-
+</P>
+<P>
+<PRE>
+c = n% residue_list
+</P>
+<P>
+ where
+ c is a character used to identify the parameter.
+ n is an integer value used as the percentage cutoff
+ point.
+ residue_list is a list of residues denoted by a single
+ character, delimited by a colon (:).
+</PRE>
+</P>
+<P>
+For example: # = 60% w:l:v:i
+</P>
+<P>
+will assign a consensus character # to any column in the alignment which
+contains more than 60% of the residues w,l,v and i.
+</P>
+<P>
+</P>
+<P>
+The third section is identified by the header @color, and defines how colors
+are assigned to each residue in the alignment.
+</P>
+<P>
+The color parameters can take one of two formats:
+</P>
+<P>
+<PRE>
+1) r = color
+2) r = color if consensus_list
+</P>
+<P>
+ where
+ r is a character used to denote a residue.
+ color is one of the colors in the GDE color lookup table.
+ consensus_list is a list of consensus characters, each denoted by a
+ single character, delimited by a colon (:).
+</PRE>
+</P>
+<P>
+Examples:
+1) g = ORANGE
+</P>
+<P>
+will color all glycines ORANGE, regardless of the consensus.
+</P>
+<P>
+2) w = BLUE if w:%:#
+</P>
+<P>
+will color BLUE any tryptophan which is found in a column with a consensus of
+w, % or #.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="Q"> Alignment Quality Analysis
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+<H3>
+QUALITY SCORES
+</H3>
+</P>
+<P>
+Clustal X provides an indication of the quality of an alignment by plotting
+a 'conservation score' for each column of the alignment. A high score indicates
+a well-conserved column; a low score indicates low conservation. The quality
+curve is drawn below the alignment.
+</P>
+<P>
+Two methods are also provided to indicate single residues or sequence segments
+which score badly in the alignment.
+</P>
+<P>
+Low-scoring residues are expected to occur at a moderate frequency in all the
+sequences because of their steady divergence due to the natural processes of
+evolution. The most divergent sequences are likely to have the most outliers.
+However, the highlighted residues are especially useful in pointing to
+sequence misalignments. Note that clustering of highlighted residues is a
+strong indication of misalignment. This can arise due to various reasons, for
+example:
+</P>
+<P>
+ 1. Partial or total misalignments caused by a failure in the
+ alignment algorithm. Usually only in difficult alignment cases.
+</P>
+<P>
+ 2. Partial or total misalignments because at least one of the
+ sequences in the given set is partly or completely unrelated to the
+ other sequences. It is up to the user to check that the set of
+ sequences are alignable.
+</P>
+<P>
+ 3. Frameshift translation errors in a protein sequence causing local
+ mismatched regions to be heavily highlighted. These are surprisingly
+ common in database entries. If suspected, a 3-frame translation of
+ the source DNA needs to be examined.
+</P>
+<P>
+Occasionally, highlighted residues may point to regions of some biological
+significance. This might happen for example if a protein alignment contains a
+sequence which has acquired new functions relative to the main sequence set. It
+is important to exclude other explanations, such as error or the natural
+divergence of sequences, before invoking a biological explanation.
+</P>
+<P>
+</P>
+<P>
+<H3>
+LOW-SCORING SEGMENTS
+</H3>
+</P>
+<P>
+Unreliable regions in the alignment can be highlighted using the Low-Scoring
+Segments option. A sequence-weighted profile is used to indicate any segments
+in the sequences which score badly. Because the profile calculation may take
+some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
+segment display can then be toggled on or off without having to repeat the
+time-consuming calculations.
+</P>
+<P>
+For details of the low-scoring segment calculation, see the CALCULATION section
+below.
+</P>
+<P>
+</P>
+<P>
+<H4>
+LOW-SCORING SEGMENT PARAMETERS
+</H4>
+</P>
+<P>
+MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
+hidden by increasing the minimum length of segments which will be displayed.
+</P>
+<P>
+DNA MARKING SCALE is used to remove less significant segments from the
+highlighted display. Increase the scale to display more segments; decrease the
+scale to remove the least significant.
+</P>
+<P>
+</P>
+<P>
+PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
+amino acid to each other. The matrix is used to calculate the sequence-
+weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
+the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
+gives a high score to identities and the most favoured conservative
+substitutions, may be more suitable when the sequences are closely related. For
+more divergent sequences, it is appropriate to use "softer" matrices which give
+a high score to many other frequent substitutions. This option automatically
+recalculates the low-scoring segments.
+</P>
+<P>
+</P>
+<P>
+DNA WEIGHT MATRIX: Two hard-coded matrices are available:
+</P>
+<P>
+1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
+of nucleic acid sequences. X's and N's are treated as matches to any IUB
+ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score
+0.
+</P>
+<P>
+2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
+1.0 and mismatches score 0. All matches for IUB symbols also score 0.
+</P>
+<P>
+A new matrix can be read from a file on disk, if the filename consists only
+of lower case characters. The values in the new weight matrix should be
+similarities and should be NEGATIVE for infrequent substitutions.
+</P>
+<P>
+INPUT FORMAT. The format used for a new matrix is the same as the BLAST
+program. Any lines beginning with a # character are assumed to be comments. The
+first non-comment line should contain a list of amino acids in any order, using
+the 1 letter code, followed by a * character. This should be followed by a
+square matrix of scores, with one row and one column for each amino acid. The
+last row and column of the matrix (corresponding to the * character) contain
+the minimum score over the whole matrix.
+</P>
+<P>
+<H4>
+QUALITY SCORE PARAMETERS
+</H4>
+</P>
+<P>
+You can customise the column 'quality scores' plotted underneath the alignment
+display using the following options.
+</P>
+<P>
+SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
+change the scale of the quality score plot.
+</P>
+<P>
+RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
+used to change the number of residue exceptions which are highlighted in the
+alignment display. (For an explanation of this cutoff, see the CALCULATION OF
+RESIDUE EXCEPTIONS section below.)
+</P>
+<P>
+PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
+each amino acid to each other.
+</P>
+<P>
+DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
+</P>
+<P>
+For more information about the weight matrices, see the help above for
+the Low-scoring Segments Weight Matrix.
+</P>
+<P>
+For details of the quality score calculations, see the CALCULATION section
+below.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+SHOW LOW-SCORING SEGMENTS
+</STRONG>
+</P>
+<P>
+The low-scoring segment display can be toggled on or off. This option does not
+recalculate the profile scores.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+SHOW EXCEPTIONAL RESIDUES
+</STRONG>
+</P>
+<P>
+This option highlights individual residues which score badly in the alignment
+quality calculations. Residues which score exceptionally low are highlighted by
+using a white character on a grey background.
+</P>
+<P>
+<STRONG>
+SAVE QUALITY SCORES TO FILE
+</STRONG>
+</P>
+<P>
+The quality scores that are plotted underneath the alignment display can also
+be saved in a text file. Each column in the alignment is written on one line in
+the output file, with the value of the quality score at the end of the line.
+Only the sequences currently selected in the display are written to the file.
+One use for quality scores is to color residues in a protein structure by
+sequence conservation. In this way conserved surface residues can be
+highlighted to locate functional regions such as ligand-binding sites.
+</P>
+<P>
+</P>
+<P>
+<H3>
+CALCULATION OF QUALITY SCORES
+</H3>
+</P>
+<P>
+Suppose we have an alignment of m sequences of length n. Then, the alignment
+can be written as:
+</P>
+<P>
+<PRE>
+ A11 A12 A13 .......... A1n
+ A21 A22 A23 .......... A2n
+ .
+ .
+ Am1 Am2 Am3 .......... Amn
+</PRE>
+</P>
+<P>
+We also have a residue comparison matrix of size R where C(i,j) is the score
+for aligning residue i with residue j.
+</P>
+<P>
+We want to calculate a score for the conservation of the jth position in the
+alignment.
+</P>
+<P>
+To do this, we define an R-dimensional sequence space. For the jth position in
+the alignment, each sequence consists of a single residue which is assigned a
+point S in the space. S has R dimensions, and for sequence i, the rth dimension
+is defined as:
+</P>
+<P>
+<PRE>
+ Sr = C(r,Aij)
+</PRE>
+</P>
+<P>
+We then calculate a consensus value for the jth position in the alignment. This
+value X also has R dimensions, and the rth dimension is defined as:
+</P>
+<P>
+<PRE>
+ Xr = ( SUM (Fij * C(i,r)) ) / m
+ 1<=i<=R
+</PRE>
+</P>
+<P>
+where Fij is the count of residues i at position j in the alignment.
+</P>
+<P>
+Now we can calculate the distance Di between each sequence i and the consensus
+position X in the R-dimensional space.
+</P>
+<P>
+<PRE>
+ Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
+ 1<=r<=R
+</P>
+<P>
+</PRE>
+</P>
+<P>
+The quality score for the jth position in the alignment is defined as the mean
+of the sequence distances Di.
+</P>
+<P>
+The score is normalised by multiplying by the percentage of sequences which
+have residues (and not gaps) at this position.
+</P>
+<P>
+<H3>
+CALCULATION OF RESIDUE EXCEPTIONS
+</H3>
+</P>
+<P>
+The jth residue of the ith sequence is considered as an exception if the
+distance Di of the sequence from the consensus value X is greater than (Upper
+Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
+displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
+value will only display very significant exceptions; a low value will allow
+more, less significant, exceptions to be highlighted.
+</P>
+<P>
+(NB. Sequences which contain gaps at this position are not included in the
+exception calculation.)
+</P>
+<P>
+</P>
+<P>
+<H3>
+CALCULATION OF LOW-SCORING SEGMENTS
+</H3>
+</P>
+<P>
+Suppose we have an alignment of m sequences of length n. Then, the alignment
+can be written as:
+</P>
+<P>
+<PRE>
+ A11 A12 A13 .......... A1n
+ A21 A22 A23 .......... A2n
+ .
+ .
+ Am1 Am2 Am3 .......... Amn
+</PRE>
+</P>
+<P>
+We also have a residue comparison matrix of size R where C(i,j) is the score
+for aligning residue i with residue j.
+</P>
+<P>
+We calculate sequence weights by building a neighbour-joining tree, in which
+branch lengths are proportional to divergence. Summing the branches by branch
+ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
+Henikoff et al., JMB, 243, 574 (1994).
+</P>
+<P>
+To find the low-scoring segments in a sequence Si, we build a weighted profile
+of the remaining sequences in the alignment. Suppose we find residue r at
+position j in the sequence; then the score for the jth position in the sequence
+is defined as
+</P>
+<P>
+<PRE>
+ Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
+ for residue r at position j in the
+ alignment.
+</PRE>
+</P>
+<P>
+These residue scores are summed along the sequence in both forward and backward
+directions. If the sum of the scores is positive, then it is reset to zero.
+Segments which score negatively in both directions are considered as
+'low-scoring' and will be highlighted in the alignment display.
+</P>
+<P>
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="9"> Command Line Parameters
+</A></H2></CENTER>
+<CENTER><H3> DATA (sequences)
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-PROFILE1=file.ext and -PROFILE2=file.ext </TT></TD>
+<TD><EM>profiles (aligned sequences)</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3> VERBS (do things)
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-HELP or -CHECK </TT></TD>
+<TD><EM>outline the command line parameters</EM></TD>
+</TR>
+<TR>
+<TD><TT>-ALIGN </TT></TD>
+<TD><EM>do full multiple alignment </EM></TD>
+</TR>
+<TR>
+<TD><TT>-TREE </TT></TD>
+<TD><EM>calculate NJ tree</EM></TD>
+</TR>
+<TR>
+<TD><TT>-BOOTSTRAP(=n) </TT></TD>
+<TD><EM>bootstrap a NJ tree (n= number of bootstraps; def. = 1000)</EM></TD>
+</TR>
+<TR>
+<TD><TT>-CONVERT </TT></TD>
+<TD><EM>output the input sequences in a different file format</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3> PARAMETERS (set things)
+</H3></CENTER>
+<CENTER><P><STRONG>***General settings:***
+</STRONG></P></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-INTERACTIVE </TT></TD>
+<TD><EM>read command line, then enter normal interactive menus</EM></TD>
+</TR>
+<TR>
+<TD><TT>-QUICKTREE </TT></TD>
+<TD><EM>use FAST algorithm for the alignment guide tree</EM></TD>
+</TR>
+<TR>
+<TD><TT>-TYPE= </TT></TD>
+<TD><EM>PROTEIN or DNA sequences</EM></TD>
+</TR>
+<TR>
+<TD><TT>-NEGATIVE </TT></TD>
+<TD><EM>protein alignment with negative values in matrix</EM></TD>
+</TR>
+<TR>
+<TD><TT>-OUTFILE= </TT></TD>
+<TD><EM>sequence alignment file name</EM></TD>
+</TR>
+<TR>
+<TD><TT>-OUTPUT= </TT></TD>
+<TD><EM>GCG, GDE, PHYLIP, PIR or NEXUS</EM></TD>
+</TR>
+<TR>
+<TD><TT>-OUTORDER= </TT></TD>
+<TD><EM>INPUT or ALIGNED</EM></TD>
+</TR>
+<TR>
+<TD><TT>-CASE= </TT></TD>
+<TD><EM>LOWER or UPPER (for GDE output only)</EM></TD>
+</TR>
+<TR>
+<TD><TT>-SEQNOS= </TT></TD>
+<TD><EM>OFF or ON (for Clustal output only)</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Fast Pairwise Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-TOPDIAGS=n </TT></TD>
+<TD><EM>number of best diags.</EM></TD>
+</TR>
+<TR>
+<TD><TT>-WINDOW=n </TT></TD>
+<TD><EM>window around best diags.</EM></TD>
+</TR>
+<TR>
+<TD><TT>-PAIRGAP=n </TT></TD>
+<TD><EM>gap penalty</EM></TD>
+</TR>
+<TR>
+<TD><TT>-SCORE= </TT></TD>
+<TD><EM>PERCENT or ABSOLUTE</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Slow Pairwise Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-PWDNAMATRIX= </TT></TD>
+<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
+</TR>
+<TR>
+<TD><TT>-PWGAPOPEN=f </TT></TD>
+<TD><EM>gap opening penalty</EM></TD>
+</TR>
+<TR>
+<TD><TT>-PWGAPEXT=f </TT></TD>
+<TD><EM>gap extension penalty</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Multiple Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-USETREE= </TT></TD>
+<TD><EM>file for old guide tree</EM></TD>
+</TR>
+<TR>
+<TD><TT>-MATRIX= </TT></TD>
+<TD><EM>Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename</EM></TD>
+</TR>
+<TR>
+<TD><TT>-DNAMATRIX= </TT></TD>
+<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
+</TR>
+<TR>
+<TD><TT>-GAPOPEN=f </TT></TD>
+<TD><EM>gap opening penalty</EM></TD>
+</TR>
+<TR>
+<TD><TT>-GAPEXT=f </TT></TD>
+<TD><EM>gap extension penalty</EM></TD>
+</TR>
+<TR>
+<TD><TT>-ENDGAPS </TT></TD>
+<TD><EM>no end gap separation pen.</EM></TD>
+</TR>
+<TR>
+<TD><TT>-GAPDIST=n </TT></TD>
+<TD><EM>gap separation pen. range</EM></TD>
+</TR>
+<TR>
+<TD><TT>-NOPGAP </TT></TD>
+<TD><EM>residue-specific gaps off</EM></TD>
+</TR>
+<TR>
+<TD><TT>-NOHGAP </TT></TD>
+<TD><EM>hydrophilic gaps off</EM></TD>
+</TR>
+<TR>
+<TD><TT>-HGAPRESIDUES= </TT></TD>
+<TD><EM>list hydrophilic res.</EM></TD>
+</TR>
+<TR>
+<TD><TT>-MAXDIV=n </TT></TD>
+<TD><EM>% ident. for delay</EM></TD>
+</TR>
+<TR>
+<TD><TT>-TYPE= </TT></TD>
+<TD><EM>PROTEIN or DNA</EM></TD>
+</TR>
+<TR>
+<TD><TT>-TRANSWEIGHT=f </TT></TD>
+<TD><EM>transitions weighting</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Profile Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-NEWTREE1= </TT></TD>
+<TD><EM>file for new guide tree for profile1</EM></TD>
+</TR>
+<TR>
+<TD><TT>-NEWTREE2= </TT></TD>
+<TD><EM>file for new guide tree for profile2</EM></TD>
+</TR>
+<TR>
+<TD><TT>-USETREE1= </TT></TD>
+<TD><EM>file for old guide tree for profile1</EM></TD>
+</TR>
+<TR>
+<TD><TT>-USETREE2= </TT></TD>
+<TD><EM>file for old guide tree for profile2</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Sequence to Profile Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-NEWTREE= </TT></TD>
+<TD><EM>file for new guide tree</EM></TD>
+</TR>
+<TR>
+<TD><TT>-USETREE= </TT></TD>
+<TD><EM>file for old guide tree</EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Structure Alignments:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-NOSECSTR2 </TT></TD>
+<TD><EM>do not use secondary structure/gap penalty mask for profile 2</EM></TD>
+</TR>
+<TR>
+<TD><TT>-SECSTROUT=STRUCTURE or MASK or BOTH or NONE </TT></TD>
+<TD><EM>output in alignment file</EM></TD>
+</TR>
+<TR>
+<TD><TT>-HELIXGAP=n </TT></TD>
+<TD><EM>gap penalty for helix core residues </EM></TD>
+</TR>
+<TR>
+<TD><TT>-STRANDGAP=n </TT></TD>
+<TD><EM>gap penalty for strand core residues</EM></TD>
+</TR>
+<TR>
+<TD><TT>-LOOPGAP=n </TT></TD>
+<TD><EM>gap penalty for loop regions</EM></TD>
+</TR>
+<TR>
+<TD><TT>-TERMINALGAP=n </TT></TD>
+<TD><EM>gap penalty for structure termini</EM></TD>
+</TR>
+<TR>
+<TD><TT>-HELIXENDIN=n </TT></TD>
+<TD><EM>number of residues inside helix to be treated as terminal</EM></TD>
+</TR>
+<TR>
+<TD><TT>-HELIXENDOUT=n </TT></TD>
+<TD><EM>number of residues outside helix to be treated as terminal</EM></TD>
+</TR>
+<TR>
+<TD><TT>-STRANDENDIN=n </TT></TD>
+<TD><EM>number of residues inside strand to be treated as terminal</EM></TD>
+</TR>
+<TR>
+<TD><TT>-STRANDENDOUT=n</TT></TD>
+<TD><EM>number of residues outside strand to be treated as terminal </EM></TD>
+</TR>
+</TABLE></CENTER>
+<CENTER><H3>***Trees:***
+</H3></CENTER>
+<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
+<TR>
+<TD><STRONG>Parameter</STRONG></TD>
+<TD><STRONG><EM>Description</EM></STRONG></TD>
+</TR>
+<TR>
+<TD><TT>-SEED=n </TT></TD>
+<TD><EM>seed number for bootstraps</EM></TD>
+</TR>
+<TR>
+<TD><TT>-KIMURA </TT></TD>
+<TD><EM>use Kimura's correction</EM></TD>
+</TR>
+<TR>
+<TD><TT>-TOSSGAPS </TT></TD>
+<TD><EM>ignore positions with gaps</EM></TD>
+</TR>
+<TR>
+<TD><TT>-BOOTLABELS=node OR branch </TT></TD>
+<TD><EM>position of bootstrap values in tree display</EM></TD>
+</TR>
+</TABLE></CENTER>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
+<CENTER><H2><A NAME="R"> References
+</A></H2></CENTER>
+<P>
+</P>
+<P>
+<STRONG>
+The ClustalX program is described in the manuscript:
+</STRONG>
+</P>
+<P>
+Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
+The ClustalX windows interface: flexible strategies for multiple sequence
+alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+The ClustalW program is described in the manuscript:
+</STRONG>
+</P>
+<P>
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
+sensitivity of progressive multiple sequence alignment through sequence
+weighting, position-specific gap penalties and weight matrix choice. Nucleic
+Acids Research, 22:4673-4680.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+The ClustalV program is described in the manuscript:
+</STRONG>
+</P>
+<P>
+Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
+multiple sequence alignment. CABIOS 8,189-191.
+</P>
+<P>
+</P>
+<P>
+<STRONG>
+The original Clustal program is described in the manuscripts:
+</STRONG>
+</P>
+<P>
+Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
+alignments on a microcomputer.
+CABIOS 5,151-153.
+</P>
+<P>
+Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
+sequence alignment on a microcomputer. Gene 73,237-244.
+</P>
+<P>
+<STRONG>
+Some tips on using Clustal X:
+</STRONG>
+</P>
+<P>
+Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
+Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
+</P>
+<P>
+<STRONG>
+Some tips on using Clustal W:
+</STRONG>
+</P>
+<P>
+Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
+multiple sequence alignments. Methods Enzymol., 266, 383-402.
+</P>
+<P>
+<STRONG>
+You can get the latest version of the ClustalX program by anonymous ftp to:
+</STRONG>
+</P>
+<P>
+ftp-igbmc.u-strasbg.fr
+ftp.embl-heidelberg.de
+ftp.ebi.ac.uk
+</P>
+<P>
+<STRONG>
+Or, have a look at the following WWW site:
+</STRONG>
+</P>
+<P>
+http://www-igbmc.u-strasbg.fr/BioInfo/
+</P>
+<P>
+</P>
+<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
Added: trunk/packages/clustalw/branches/upstream/current/clustalx_help
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/clustalx_help 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/clustalx_help 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1523 @@
+
+This is the on-line help file for Clustal X (version 1.83), using the NCBI
+Vibrant Toolkit.
+
+It should be named or defined as: clustalx_help
+except with MSDOS in which case it should be named ClustalX.HLP
+
+For full details of usage and algorithms, please read the CLUSTALW.DOC file.
+
+
+Toby Gibson EMBL, Heidelberg, Germany.
+Des Higgins UCC, Cork, Ireland.
+Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
+
+
+
+
+>>HELP G <<
+ General help for CLUSTAL X (1.83)
+
+Clustal X is a windows interface for the ClustalW multiple sequence alignment
+program. It provides an integrated environment for performing multiple sequence
+and profile alignments and analysing the results. The sequence alignment is
+displayed in a window on the screen. A versatile coloring scheme has been
+incorporated allowing you to highlight conserved features in the alignment.
+The pull-down menus at the top of the window allow you to select all the
+options required for traditional multiple sequence and profile alignment.
+
+You can cut-and-paste sequences to change the order of the alignment; you can
+select a subset of sequences to be aligned; you can select a sub-range of the
+alignment to be realigned and inserted back into the original alignment.
+
+Alignment quality analysis can be performed and low-scoring segments or
+exceptional residues can be highlighted.
+
+ClustalX is available for a number of different platforms including: SUN
+Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
+Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
+the README file for Installation instructions.)
+
+
+<H4>
+SEQUENCE INPUT
+</H4>
+
+Sequences and profiles (a term for pre-existing alignments) are input using
+the FILE menu. Invalid options will be disabled. All sequences must be included
+in 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
+Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
+All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
+except "-" which is used to indicate a GAP ("." in MSF/RSF).
+
+<H4>
+SEQUENCE / PROFILE ALIGNMENTS
+</H4>
+
+Clustal X has two modes which can be selected using the switch directly above
+the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
+
+To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
+MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
+menu then allows you to either produce a guide tree for the alignment, or to do
+a multiple alignment following the guide tree, or to do a full multiple
+alignment.
+
+In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
+to align 2 alignments (termed profiles). Profiles are also used to add a new
+sequence to an old alignment, or to use secondary structure to guide the
+alignment process. GAPS in the old alignments are indicated using the "-"
+character. PROFILES can be input in ANY of the allowed formats; just use "-"
+(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
+"Lock Scroll" is displayed which allows you to scroll the two profiles together
+using a single scroll bar. When the Lock Scroll is turned off, the two profiles
+can be scrolled independently.
+
+<H4>
+PHYLOGENETIC TREES
+</H4>
+
+Phylogenetic trees can be calculated from old alignments (read in with "-"
+characters to indicate gaps) OR after a multiple alignment while the alignment
+is still displayed.
+
+<H4>
+ALIGNMENT DISPLAY
+</H4>
+
+The alignment is displayed on the screen with the sequence names on the left
+hand side. The sequence alignment is for display only, it cannot be edited here
+(except for changing the sequence order by cutting-and-pasting on the sequence
+names).
+
+A ruler is displayed below the sequences, starting at 1 for the first residue
+position (residue numbers in the sequence input file are ignored).
+
+A line above the alignment is used to mark strongly conserved positions. Three
+characters ('*', ':' and '.') are used:
+
+'*' indicates positions which have a single, fully conserved residue
+
+':' indicates that one of the following 'strong' groups is fully conserved:-
+<PRE>
+ STA
+ NEQK
+ NHQK
+ NDEQ
+ QHRK
+ MILV
+ MILF
+ HY
+ FYW
+</PRE>
+
+'.' indicates that one of the following 'weaker' groups is fully conserved:-
+<PRE>
+ CSA
+ ATV
+ SAG
+ STNK
+ STPA
+ SGND
+ SNDEQK
+ NDEQHK
+ NEQHRK
+ FVLIM
+ HFY
+</PRE>
+
+These are all the positively scoring groups that occur in the Gonnet Pam250
+matrix. The strong and weak groups are defined as strong score >0.5 and weak
+score =<0.5 respectively.
+
+For profile alignments, secondary structure and gap penalty masks are displayed
+above the sequences, if any data is found in the profile input file.
+
+
+>>HELP F <<
+ Input / Output Files
+
+LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
+sequences that are already loaded. All sequences must be in 1 file. The formats
+that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
+(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
+non-alphabetic characters (spaces, digits, punctuation marks) are ignored
+except "-" which is used to indicate a GAP ("." in MSF/RSF).
+
+The program tries to automatically recognise the different file formats used
+and to guess whether the sequences are amino acid or nucleotide. This is not
+always foolproof.
+
+FASTA and NBRF/PIR formats are recognised by having a ">" as the first
+character in the file.
+
+EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
+file (the token for the entry name field).
+
+CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
+
+GCG/MSF format is recognised by one of the following:
+<UL>
+<LI>
+ - the word PileUp at the start of the file.
+</LI><LI>
+ - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
+ at the start of the file.
+</LI><LI>
+ - the word MSF on the first line of the file, and the characters ..
+ at the end of this line.
+</LI>
+</UL>
+
+GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
+the file.
+
+
+If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
+sequence will be assumed to be nucleotide. This works in 97.3% of cases but
+watch out!
+
+APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
+do not replace those already loaded, but are appended at the end of the
+alignment.
+
+SAVE SEQUENCES AS... offers the user a choice of one of seven output formats:
+CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS, GDE or FASTA. All sequences are written
+to a single file. Options are available to save a range of the alignment,
+switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
+for CLUSTAL files. Users can also choose to include the residue range numbers
+by appending them to the sequence names.
+
+LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
+sequences already loaded as Profile 1. This option will also remove any
+sequences which are loaded in Profile 2.
+
+LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
+sequences already loaded as Profile 2.
+
+SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
+those sequences in Profile 1 will be written to the output file.
+
+SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
+those sequences in Profile 2 will be written to the output file.
+
+WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
+format file. This will include any secondary structure / gap penalty mask
+information and the consensus and ruler lines which are displayed on the
+screen. The Alignment Quality curve can be optionally included in the output
+file.
+
+WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
+except that only the profile 1 display will be printed.
+
+WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
+except that only the profile 2 display will be printed.
+
+
+<H4>
+POSTSCRIPT PARAMETERS
+</H4>
+
+A number of options are available to allow you to configure your postscript
+output file.
+
+PS COLORS FILE:
+
+The exact RGB values required to reproduce the colors used in the alignment
+window will vary from printer to printer. A PS colors file can be specified
+that contains the RGB values for all the colors required by each of your
+postscript printers.
+
+By default, Clustal X looks for a file called 'colprint.par' in the current
+directory (if you're running under UNIX, it then looks in your home directory,
+and finally in the directories in your PATH environment variable). If no PS
+colors file is found or a color used on the screen is not defined here, the
+screen RGB values (from the Color Parameter File) are used.
+
+The PS colors file consists of one line for each color to be defined, with the
+color name followed by the RGB values (on a scale of 0 to 1). For example,
+
+RED 0.9 0.1 0.1
+
+Blank lines and comments (lines beginning with a '#' character) are ignored.
+
+
+PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
+pages.
+
+ORIENTATION: The alignment can be displayed on either a landscape or portrait
+page.
+
+PRINT HEADER: An optional header including the postscript filename, and
+creation date can be printed at the top of each page.
+
+PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
+the alignment on the screen can be included in the postscript output.
+
+PRINT RULER: The ruler which is displayed underneath the alignment on the
+screen can be included in the postscript output.
+
+PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
+hand side of the alignment.
+
+RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
+selected. This option can be turned off, in which case a font size of 10 will
+be used for the sequences.
+
+PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
+is to print the full alignment. The first and last residues to be printed are
+specified here.
+
+USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
+number of residues in a block is specified here. More than one block may then
+be printed on a single page. This is useful for long alignments of a small
+number of sequences. If the block length is set to 0, the alignment will not
+be divided into blocks, but printed across a number of pages.
+
+>>HELP E <<
+ Editing Alignments
+
+Clustal X allows you to change the order of the sequences in the alignment, by
+cutting-and-pasting the sequence names.
+
+To select a group of sequences to be moved, click on a sequence name and drag
+the cursor until all the required sequences are highlighted. Holding down the
+Shift key when clicking on the first name will add new sequences to those
+already selected.
+
+(Options are provided to Select All Sequences, Select Profile 1 or Select
+Profile 2.)
+
+The selected sequences can be removed from the alignment by using the EDIT
+menu, CUT option.
+
+To add the cut sequences back into an alignment, select a sequence by clicking
+on the sequence name. The cut sequences will be added to the alignment,
+immediately following the selected sequence, by the EDIT menu, PASTE option.
+
+To add the cut sequences to an empty alignment (eg. when cutting sequences from
+Profile 1 and pasting them to Profile 2), click on the empty sequence name
+display area, and select the EDIT menu, PASTE option as before.
+
+The sequence selection and sequence range selection can be cleared using the
+EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
+respectively.
+
+To search for a string of residues in the sequences, select the sequences to be
+searched by clicking on the sequence names. You can then enter the string to
+search for by selecting the SEARCH FOR STRING option. If the string is found in
+any of the sequences selected, the sequence name and column number are printed
+below the sequence display.
+
+In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
+alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
+displayed as Profile 2 will be appended to Profile 1.
+
+The REMOVE ALL GAPS option will remove all gaps from the sequences currently
+selected.
+WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
+but also those that were read from the input alignment file. Any secondary
+structure information associated with the alignment will NOT be automatically
+realigned.
+
+The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
+contain gaps in all sequences. This can occur as a result of removing divergent
+sequences from an alignment, or if an alignment has been realigned.
+
+>>HELP M <<
+ Multiple Alignments
+
+Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
+the sequence display area. Then, use the ALIGNMENT menu to do multiple
+alignments.
+
+Multiple alignments are carried out in 3 stages:
+
+1) all sequences are compared to each other (pairwise alignments);
+
+2) a dendrogram (like a phylogenetic tree) is constructed, describing the
+approximate groupings of the sequences by similarity (stored in a file).
+
+3) the final multiple alignment is carried out, using the dendrogram as a guide.
+
+The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
+You can skip the first stages (pairwise alignments; guide tree) by using an old
+guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
+guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
+
+
+REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
+alignment. Sequences can be selected by clicking on the sequence names - see
+Editing Alignments for more details. The unselected sequences are then 'fixed'
+and a profile is made including only the unselected sequences. Each of the
+selected sequences in turn is then realigned to this profile. The realigned
+sequences will be displayed as a group at the end of the alignment.
+
+
+REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
+alignment. A residue range can be selected by clicking on the sequence display
+area. A multiple alignment is then performed, following the 3 stages described
+above, but only using the selected residue range. Finally the new alignment of
+the range is pasted back into the full sequence alignment.
+
+By default, gap penalties are used at each end of the subrange in order to
+penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
+switched off, gaps can be introduced at the ends of the residue range at no
+cost.
+
+
+ALIGNMENT PARAMETERS displays a sub-menu with the following options:
+
+RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
+sequences during multiple alignment if you wish to change the parameters and
+try again. This only takes effect just before you do a second multiple
+alignment. You can make phylogenetic trees after alignment whether or not this
+is ON. If you turn this OFF, the new gaps are kept even if you do a second
+multiple alignment. This allows you to iterate the alignment gradually.
+Sometimes, the alignment is improved by a second or third pass.
+
+RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
+gaps which were read in from the sequence input file. This only takes effect
+just before you do a second multiple alignment. You can make phylogenetic
+trees after alignment whether or not this is ON. If you turn this OFF, all
+gaps are kept even if you do a second multiple alignment. This allows you to
+iterate the alignment gradually. Sometimes, the alignment is improved by a
+second or third pass.
+
+
+PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
+alignments.
+
+MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
+alignments.
+
+PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
+various parameters only used in the alignment of protein sequences.
+
+(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
+allows you to set various parameters only used with gap penalty masks.)
+
+SAVE LOG FILE will write the alignment calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+
+
+<H4>
+OUTPUT FORMAT OPTIONS
+</H4>
+
+You can choose from 7 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
+PHYLIP, GDE, NEXUS, FASTA). You can choose more than one (or all 7 if you wish).
+
+CLUSTAL format output is a self explanatory alignment format. It shows the
+sequences aligned in blocks. It can be read in again at a later date to (for
+example) calculate a phylogenetic tree or add in new sequences by profile
+alignment.
+
+GCG output can be used by any of the GCG programs that can work on multiple
+alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
+.msf format files (multiple sequence file); new in version 7 of GCG.
+
+NEXUS format is used by several phylogeny programs, including PAUP and
+MacClade.
+
+PHYLIP format output can be used for input to the PHYLIP package of Joe
+Felsenstein. This is a very widely used package for doing every imaginable
+form of phylogenetic analysis (MUCH more than the modest introduction
+offered by this program).
+
+NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
+characters "-" are used to indicate the positions of gaps in the multiple
+alignment. These files can be re-used as input in any part of clustal that
+allows sequences (or alignments or profiles) to be read in.
+
+FASTA: this is included for compatibility with numerous sequence analysis programs.
+
+GDE: this format is used by the GDE package of Steven Smith and is understood
+by SEQLAB in GCG 9 or later.
+
+GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
+lower case.
+
+CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
+alignment lines in clustalw format.
+
+OUTPUT ORDER is used to control the order of the sequences in the output
+alignments. By default, it uses the order in which the sequences were aligned
+(from the guide tree/dendrogram), thus automatically grouping closely related
+sequences. It can be switched to be the same as the original input order.
+
+PARAMETER OUTPUT: This option will save all your parameter settings in a
+parameter file (suffix .par) during alignment. The file can be subsequently
+used to rerun ClustalW using the same parameters.
+
+
+<H3>
+ALIGNMENT PARAMETERS
+</H3>
+--------------------
+
+<STRONG>
+PAIRWISE ALIGNMENT PARAMETERS
+</STRONG>
+
+A distance is calculated between every pair of sequences and these are used to
+construct the phylogenetic tree which guides the final multiple alignment. The
+scores are calculated from separate pairwise alignments. These can be
+calculated using 2 methods: dynamic programming (slow but accurate) or by the
+method of Wilbur and Lipman (extremely fast but approximate).
+
+You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
+option. The slow/accurate method is fast enough for short sequences but will be
+VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
+
+
+<STRONG>
+SLOW-ACCURATE alignment parameters:
+</STRONG>
+
+These parameters do not have any effect on the speed of the alignments. They
+are used to give initial alignments which are then rescored to give percent
+identity scores. These % scores are the ones which are displayed on the
+screen. The scores are converted to distances for the trees.
+
+Gap Open Penalty: the penalty for opening a gap in the alignment.
+
+Gap Extension Penalty: the penalty for extending a gap by 1 residue.
+
+Protein Weight Matrix: the scoring table which describes the similarity of
+each amino acid to each other.
+
+Load protein matrix: allows you to read in a comparison table from a file.
+
+DNA weight matrix: the scores assigned to matches and mismatches (including
+IUB ambiguity codes).
+
+Load DNA matrix: allows you to read in a comparison table from a file.
+
+See the Multiple alignment parameters, MATRIX option below for details of the
+matrix input format.
+
+
+<STRONG>
+FAST-APPROXIMATE alignment parameters:
+</STRONG>
+
+These similarity scores are calculated from fast, approximate, global align-
+ments, which are controlled by 4 parameters. 2 techniques are used to make
+these alignments very fast: 1) only exactly matching fragments (k-tuples) are
+considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
+are used.
+
+GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
+little effect on the speed or sensitivity except for extreme values.
+
+K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
+INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
+For longer sequences (e.g. >1000 residues) you may wish to increase the
+default.
+
+TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
+dot-matrix plot) is calculated. Only the best ones (with most matches) are used
+in the alignment. This parameter specifies how many. Decrease for speed;
+increase for sensitivity.
+
+WINDOW SIZE: This is the number of diagonals around each of the 'best'
+diagonals that will be used. Decrease for speed; increase for sensitivity.
+
+
+<STRONG>
+MULTIPLE ALIGNMENT PARAMETERS
+</STRONG>
+
+These parameters control the final multiple alignment. This is the core of the
+program and the details are complicated. To fully understand the use of the
+parameters and the scoring system, you will have to refer to the documentation.
+
+Each step in the final multiple alignment consists of aligning two alignments
+or sequences. This is done progressively, following the branching order in the
+GUIDE TREE. The basic parameters to control this are two gap penalties and the
+scores for various identical/non-identical residues.
+
+The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
+cost of opening up every new gap and the cost of every item in a gap.
+Increasing the gap opening penalty will make gaps less frequent. Increasing
+the gap extension penalty will make gaps shorter. Terminal gaps are not
+penalised.
+
+The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
+related sequences until after the most closely related sequences have been
+aligned. The setting shows the percent identity level required to delay the
+addition of a sequence; sequences that are less identical than this level to
+any other sequences will be aligned later.
+
+The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
+pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
+means that the transitions are scored as mismatches, while a weight of 1 gives
+the transitions the match score. For distantly related DNA sequences, the
+weight should be near to zero; for closely related sequences it can be useful
+to assign a higher score. The default is set to 0.5.
+
+
+The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
+matrices. For protein alignments, you use a weight matrix to determine the
+similarity of non-identical amino acids. For example, Tyr aligned with Phe is
+usually judged to be 'better' than Tyr aligned with Pro.
+
+There are three 'in-built' series of weight matrices offered. Each consists of
+several matrices which work differently at different evolutionary distances. To
+see the exact details, read the documentation. Crudely, we store several
+matrices in memory, spanning the full range of amino acid distance (from almost
+identical sequences to highly divergent ones). For very similar sequences, it
+is best to use a strict weight matrix which only gives a high score to
+identities and the most favoured conservative substitutions. For more divergent
+sequences, it is appropriate to use "softer" matrices which give a high score
+to many other frequent substitutions.
+
+1) BLOSUM (Henikoff). These matrices appear to be the best available for
+carrying out data base similarity (homology searches). The matrices currently
+used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
+versions.
+
+2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
+currently use the PAM 20, 60, 120, 350 matrices.
+
+3) GONNET. These matrices were derived using almost the same procedure as the
+Dayhoff one (above) but are much more up to date and are based on a far larger
+data set. They appear to be more sensitive than the Dayhoff series. We
+currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
+default for Clustal X version 1.8.
+
+We also supply an identity matrix which gives a score of 10 to two identical
+amino acids and a score of zero otherwise. This matrix is not very useful.
+
+Load protein matrix: allows you to read in a comparison matrix from a file.
+This can be either a single matrix or a series of matrices (see below for
+format).
+
+
+DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
+used for aligning nucleic acid sequences. Two hard-coded matrices are available:
+
+1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
+of nucleic acid sequences. X's and N's are treated as matches to any IUB
+ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
+
+2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
+1.0 and mismatches score 0. All matches for IUB symbols also score 0.
+
+Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
+file (just one matrix, not a series).
+
+
+SINGLE MATRIX INPUT FORMAT
+The format used for a single matrix is the same as the BLAST program. The
+scores in the new weight matrix should be similarities. You can use negative as
+well as positive values if you wish, although the matrix will be automatically
+adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
+Any lines beginning with a # character are assumed to be comments. The first
+non-comment line should contain a list of amino acids in any order, using the 1
+letter code, followed by a * character. This should be followed by a square
+matrix of scores, with one row and one column for each amino acid. The last row
+and column of the matrix (corresponding to the * character) contain the minimum
+score over the whole matrix.
+
+MATRIX SERIES INPUT FORMAT
+ClustalX uses different matrices depending on the mean percent identity of the
+sequences to be aligned. You can specify a series of matrices and the range of
+the percent identity for each matrix in a matrix series file. The file is
+automatically recognised by the word CLUSTAL_SERIES at the beginning of the
+file. Each matrix in the series is then specified on one line which should
+start with the word MATRIX. This is followed by the lower and upper limits of
+the sequence percent identities for which you want to apply the matrix. The
+final entry on the matrix line is the filename of a Blast format matrix file
+(see above for details of the single matrix file format).
+
+Example.
+
+CLUSTAL_SERIES
+
+MATRIX 81 100 /us1/user/julie/matrices/blosum80
+MATRIX 61 80 /us1/user/julie/matrices/blosum62
+MATRIX 31 60 /us1/user/julie/matrices/blosum45
+MATRIX 0 30 /us1/user/julie/matrices/blosum30
+
+
+<STRONG>
+PROTEIN GAP PARAMETERS
+</STRONG>
+
+RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
+increase the gap opening penalties at each position in the alignment or
+sequence. See the documentation for details. As an example, positions that are
+rich in glycine are more likely to have an adjacent gap than positions that are
+rich in valine.
+
+HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
+run (5 or more residues) of hydrophilic amino acids; these are likely to be
+loop or random coil regions where gaps are more common. The residues that are
+"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
+
+GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
+to each other. Gaps that are less than this distance apart are penalised more
+than other gaps. This does not prevent close gaps; it makes them less frequent,
+promoting a block-like appearance of the alignment.
+
+END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
+avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
+turn this off, end gaps will be ignored for this purpose. This is useful when
+you wish to align fragments where the end gaps are not biologically meaningful.
+
+
+>>HELP P <<
+ Profile and Structure Alignments
+
+By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
+alignments allow you to store alignments of your favourite sequences and add
+new sequences to them in small bunches at a time. A profile is simply an
+alignment of one or more sequences (e.g. an alignment output file from Clustal
+X). Each input can be a single sequence. One or both sets of input sequences
+may include secondary structure assignments or gap penalty masks to guide the
+alignment.
+
+Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
+the sequence display area. Then, use the ALIGNMENT menu to do profile and
+secondary structure alignments.
+
+The profiles can be in any of the allowed input formats with "-" characters
+used to specify gaps (except for GCG/MSF where "." is used).
+
+You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
+PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
+profiles to each other. Secondary structure masks in either profile can be used
+to guide the alignment. This option compares all the sequences in profile 1
+with all the sequences in profile 2 in order to build guide trees which will be
+used to calculate sequence weights, and select appropriate alignment parameters
+for the final profile alignment.
+
+You can skip the first stage (pairwise alignments; guide trees) by using old
+guide tree files (ALIGN PROFILES FROM GUIDE TREES).
+
+The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
+profile and align them to the first profile, 1 at a time. This is useful to
+add some new sequences to an existing alignment, or to align a set of sequences
+to a known structure. In this case, the second profile set need not be
+pre-aligned.
+
+You can skip the first stage (pairwise alignments; guide tree) by using an old
+guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
+
+SAVE LOG FILE will write the alignment calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+
+The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
+Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
+These are EXACTLY the same parameters as used by the general, automatic
+multiple alignment procedure. The general multiple alignment procedure is
+simply a series of profile alignments. Carrying out a series of profile
+alignments on larger and larger groups of sequences, allows you to manually
+build up a complete alignment, if necessary editing intermediate alignments.
+
+<STRONG>
+SECONDARY STRUCTURE PARAMETERS
+</STRONG>
+
+Use this menu to set secondary structure options. If a solved structure is
+known, it can be used to guide the alignment by raising gap penalties within
+secondary structure elements, so that gaps will preferentially be inserted into
+unstructured surface loop regions. Alternatively, a user-specified gap penalty
+mask can be supplied for a similar purpose.
+
+A gap penalty mask is a series of numbers between 1 and 9, one per position in
+the alignment. Each number specifies how much the gap opening penalty is to be
+raised at that position (raised by multiplying the basic gap opening penalty
+by the number) i.e. a mask figure of 1 at a position means no change
+in gap opening penalty; a figure of 4 means that the gap opening penalty is
+four times greater at that position, making gaps 4 times harder to open.
+
+The format for gap penalty masks and secondary structure masks is explained in
+a separate help section.
+
+>>HELP B <<
+ Secondary Structure / Gap Penalty Masks
+
+The use of secondary structure-based penalties has been shown to improve the
+accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
+penalty masks to be supplied with the input sequences used during profile
+alignment. (NB. The secondary structure information is NOT used during multiple
+sequence alignment). The masks work by raising gap penalties in specified
+regions (typically secondary structure elements) so that gaps are
+preferentially opened in the less well conserved regions (typically surface
+loops).
+
+The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
+whether the input 2D-structure information or gap penalty masks will be used
+during the profile alignment.
+
+The OUTPUT options control whether the secondary structure and gap penalty
+masks should be included in the Clustal X output alignments. Showing both is
+useful for understanding how the masks work. The 2D-structure information is
+itself useful in judging the alignment quality and in seeing how residue
+conservation patterns vary with secondary structure.
+
+The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
+penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
+format, capital residues denote the A and B core structure notation. Basic gap
+penalties are multiplied by the amount specified.
+
+The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
+By default this penalty is not raised. In CLUSTAL format, loops are specified
+by "." in the secondary structure notation.
+
+The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
+penalty at the ends of secondary structures. Ends of secondary structures are
+known to grow or shrink when related structures are compared. Therefore by default
+these are given intermediate values, lower than the core penalties. All
+secondary structure read in as lower case in CLUSTAL format gets the reduced
+terminal penalty.
+
+The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
+termini for the intermediate penalties. In the alignment output, these are
+indicated as lower case. For Alpha Helices, by default, the range spans the
+end-helical turn (3 residues). For Beta Strands, the default range spans the
+end residue and the adjacent loop residue, since sequence conservation often
+extends beyond the actual H-bonded Beta Strand.
+
+Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
+files. For many 3-D protein structures, secondary structure information is
+recorded in the feature tables of SWISS-PROT database entries. You should
+always check that the assignments are correct - some are quite inaccurate.
+Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
+
+
+<PRE>
+FT HELIX 100 115
+FT STRAND 118 119
+</PRE>
+
+The structure and penalty masks can also be read from CLUSTAL alignment format
+as comment lines beginning "!SS_" or "!GM_" e.g.
+
+<PRE>
+!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
+!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
+HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
+</PRE>
+
+Note that the mask itself is a set of numbers between 1 and 9 each of which is
+assigned to the residue(s) in the same column below.
+
+In GDE flat file format, the masks are specified as text and the names must
+begin with "SS_" or "GM_".
+
+Either a structure or penalty mask or both may be used. If both are included
+in an alignment, the user will be asked which is to be used.
+
+
+>>HELP T <<
+ Phylogenetic Trees
+
+Before calculating a tree, you must have an ALIGNMENT in memory. This can be
+input using the FILE menu, LOAD SEQUENCES option or you should have just
+carried out a full multiple alignment and the alignment is still in memory.
+Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
+
+The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
+you calculate distances (percent divergence) between all pairs of sequences from
+a multiple alignment; second you apply the NJ method to the distance matrix.
+
+To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
+and all branch lengths. The root of the tree can only be inferred by using an
+outgroup (a sequence that you are certain branches at the outside of the tree
+.... certain on biological grounds) OR if you assume a degree of constancy in
+the 'molecular clock', you can place the root in the 'middle' of the tree
+(roughly equidistant from all tips).
+
+BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
+groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
+making N random samples of sites from the alignment (N should be LARGE, e.g.
+500 - 1000); drawing N trees (1 from each sample) and counting how many times
+each grouping from the original tree occurs in the sample trees. You can set N
+using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
+practice, you should use a large number of bootstrap replicates (1000 is
+recommended, even if it means running the program for an hour on a slow
+computer). You can also supply a seed number for the random number generator
+here. Different runs with the same seed will give the same answer. See the
+documentation for more details.
+
+EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
+ANY of the sequences have a gap will be ignored. This means that 'like' will
+be compared to 'like' in all distances, which is highly desirable. It also
+automatically throws away the most ambiguous parts of the alignment, which are
+concentrated around gaps (usually). The disadvantage is that you may throw away
+much of the data if there are many gaps (which is why it is difficult for us to
+make it the default).
+
+CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
+makes no difference. For greater divergence, this option corrects for the fact
+that observed distances underestimate actual evolutionary distances. This is
+because, as sequences diverge, more than one substitution will happen at many
+sites. However, you only see one difference when you look at the present day
+sequences. Therefore, this option has the effect of stretching branch lengths
+in trees (especially long branches). The corrections used here (for DNA or
+proteins) are both due to Motoo Kimura. See the documentation for details.
+
+Where possible, this option should be used. However, for VERY divergent
+sequences, the distances cannot be reliably corrected. You will be warned if
+this happens. Even if none of the distances in a data set exceed the reliable
+threshold, if you bootstrap the data, some of the bootstrap distances may
+randomly exceed the safe limit.
+
+SAVE LOG FILE will write the tree calculation scores to a file. The log
+filename is the same as the input sequence filename, with an extension .log
+appended.
+
+<H4>
+OUTPUT FORMAT OPTIONS
+</H4>
+
+Four different formats are allowed. None of these displays the tree visually.
+You can display the tree using the NJPLOT program distributed with Clustal X
+OR get the PHYLIP package and use the tree drawing facilities there.
+
+1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
+between the sequences and the number of alignment positions used for each. The
+tree is described at the end of the file. It lists the sequences that are
+joined at each alignment step and the branch lengths. After two sequences are
+joined, it is referred to later as a NODE. The number of a NODE is the number
+of the lowest sequence in that NODE.
+
+2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
+phylogenetic analysis packages. It consists of a series of nested parentheses,
+describing the branching order, with the sequence names and branch lengths. It
+can be read by the NJPLOT program distributed with ClustalX. It can also be
+used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
+the trees graphically. This is the same format used during multiple alignment
+for the guide trees. Some other packages that can read and display New
+Hampshire format are TreeTool, TreeView, and Phylowin.
+
+3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
+pairwise distances in a format that can be used by the PHYLIP package. It used
+to be useful when one could not produce distances from protein sequences in the
+Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
+
+4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
+including PAUP and MacClade. The format is described fully in:
+Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
+NEXUS: an extensible file format for systematic information.
+Systematic Biology 46:590-621.
+
+BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
+the tree branches of the phylip format output tree. The toggle allows them to
+be placed on the nodes, which is incorrect, but some display packages (e.g.
+TreeTool, TreeView and Phylowin) only support node labelling but not branch
+labelling. Care should be taken to note which branches and labels go together.
+
+
+>>HELP C <<
+ Colors
+
+Clustal X provides a versatile coloring scheme for the sequence alignment
+display. The sequences (or profiles) are colored automatically, when they are
+loaded. Sequences can be colored either by assigning a color to specific
+residues, or on the basis of an alignment consensus. In the latter case, the
+alignment consensus is calculated automatically, and the residues in each
+column are colored according to the consensus character assigned to that
+column. In this way, you can choose to highlight, for example, conserved
+hydrophilic or hydrophobic positions in the alignment.
+
+The 'rules' used to color the alignment are specified in a COLOR PARAMETER
+FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
+sequences or 'coldna.par' for DNA, in the current directory. (If you are running
+under UNIX, it then looks in your home directory, and finally in the
+directories in your PATH environment variable).
+
+By default, if no color parameter file is found, protein sequences are colored
+by residue as follows:
+
+<PRE>
+ Color Residue Code
+
+ ORANGE GPST
+ RED HKR
+ BLUE FWY
+ GREEN ILMV
+</PRE>
+
+In the case of DNA sequences, the default colors are as follows:
+
+<PRE>
+ Color Residue Code
+
+ ORANGE A
+ RED C
+ BLUE T
+ GREEN G
+</PRE>
+
+
+The default BACKGROUND COLORING option shows the sequence residues using a
+black character on a colored background. It can be switched off to show
+residues as a colored character on a white background.
+
+Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
+option looks first for the color parameter file (as described above) and, if no
+file is found, uses the default residue-specific colors.
+
+You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
+option. The format of the color parameter file is described below.
+
+<H4>
+COLOR PARAMETER FILE
+</H4>
+
+This file is divided into 3 sections:
+
+1) the names and rgb values of the colors
+2) the rules for calculating the consensus
+3) the rules for assigning colors to the residues
+
+An example file is given here.
+
+<PRE>
+ --------------------------------------------------------------------
+@rgbindex
+RED 0.9 0.1 0.1
+BLUE 0.1 0.1 0.9
+GREEN 0.1 0.9 0.1
+YELLOW 0.9 0.9 0.0
+
+@consensus
+% = 60% w:l:v:i:m:a:f:c:y:h:p
+# = 80% w:l:v:i:m:a:f:c:y:h:p
+- = 50% e:d
++ = 60% k:r
+q = 50% q:e
+p = 50% p
+n = 50% n
+t = 50% t:s
+
+@color
+g = RED
+p = YELLOW
+t = GREEN if t:%:#
+n = GREEN if n
+w = BLUE if %:#:p
+k = RED if +
+ --------------------------------------------------------------------
+</PRE>
+
+The first section is optional and is identified by the header @rgbindex. If
+this section exists, each color used in the file must be named and the rgb
+values specified (on a scale from 0 to 1). If the rgb index section is not
+found, the following set of hard-coded colors will be used.
+
+<PRE>
+RED 0.9 0.1 0.1
+BLUE 0.1 0.1 0.9
+GREEN 0.1 0.9 0.1
+ORANGE 0.9 0.7 0.3
+CYAN 0.1 0.9 0.9
+PINK 0.9 0.5 0.5
+MAGENTA 0.9 0.1 0.9
+YELLOW 0.9 0.9 0.0
+</PRE>
+
+The second section is optional and is identified by the header @consensus. It
+defines how the consensus is calculated.
+
+The format of each consensus parameter is:-
+
+<PRE>
+c = n% residue_list
+
+ where
+ c is a character used to identify the parameter.
+ n is an integer value used as the percentage cutoff
+ point.
+ residue_list is a list of residues denoted by a single
+ character, delimited by a colon (:).
+</PRE>
+
+For example: # = 60% w:l:v:i
+
+will assign a consensus character # to any column in the alignment which
+contains more than 60% of the residues w,l,v and i.
+
+
+The third section is identified by the header @color, and defines how colors
+are assigned to each residue in the alignment.
+
+The color parameters can take one of two formats:
+
+<PRE>
+1) r = color
+2) r = color if consensus_list
+
+ where
+ r is a character used to denote a residue.
+ color is one of the colors in the GDE color lookup table.
+ consensus_list is a list of consensus characters, each denoted by a
+ single character, delimited by a colon (:).
+</PRE>
+
+Examples:
+1) g = ORANGE
+
+will color all glycines ORANGE, regardless of the consensus.
+
+2) w = BLUE if w:%:#
+
+will color BLUE any tryptophan which is found in a column with a consensus of
+w, % or #.
+
+
+>>HELP Q <<
+ Alignment Quality Analysis
+
+<H3>
+QUALITY SCORES
+</H3>
+--------------
+
+Clustal X provides an indication of the quality of an alignment by plotting
+a 'conservation score' for each column of the alignment. A high score indicates
+a well-conserved column; a low score indicates low conservation. The quality
+curve is drawn below the alignment.
+
+Two methods are also provided to indicate single residues or sequence segments
+which score badly in the alignment.
+
+Low-scoring residues are expected to occur at a moderate frequency in all the
+sequences because of their steady divergence due to the natural processes of
+evolution. The most divergent sequences are likely to have the most outliers.
+However, the highlighted residues are especially useful in pointing to
+sequence misalignments. Note that clustering of highlighted residues is a
+strong indication of misalignment. This can arise due to various reasons, for
+example:
+
+ 1. Partial or total misalignments caused by a failure in the
+ alignment algorithm. Usually only in difficult alignment cases.
+
+ 2. Partial or total misalignments because at least one of the
+ sequences in the given set is partly or completely unrelated to the
+ other sequences. It is up to the user to check that the set of
+ sequences are alignable.
+
+ 3. Frameshift translation errors in a protein sequence causing local
+ mismatched regions to be heavily highlighted. These are surprisingly
+ common in database entries. If suspected, a 3-frame translation of
+ the source DNA needs to be examined.
+
+Occasionally, highlighted residues may point to regions of some biological
+significance. This might happen for example if a protein alignment contains a
+sequence which has acquired new functions relative to the main sequence set. It
+is important to exclude other explanations, such as error or the natural
+divergence of sequences, before invoking a biological explanation.
+
+
+<H3>
+LOW-SCORING SEGMENTS
+</H3>
+--------------------
+
+Unreliable regions in the alignment can be highlighted using the Low-Scoring
+Segments option. A sequence-weighted profile is used to indicate any segments
+in the sequences which score badly. Because the profile calculation may take
+some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
+segment display can then be toggled on or off without having to repeat the
+time-consuming calculations.
+
+For details of the low-scoring segment calculation, see the CALCULATION section
+below.
+
+
+<H4>
+LOW-SCORING SEGMENT PARAMETERS
+</H4>
+------------------------------
+
+MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
+hidden by increasing the minimum length of segments which will be displayed.
+
+DNA MARKING SCALE is used to remove less significant segments from the
+highlighted display. Increase the scale to display more segments; decrease the
+scale to remove the least significant.
+
+
+PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
+amino acid to each other. The matrix is used to calculate the sequence-
+weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
+the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
+gives a high score to identities and the most favoured conservative
+substitutions, may be more suitable when the sequences are closely related. For
+more divergent sequences, it is appropriate to use "softer" matrices which give
+a high score to many other frequent substitutions. This option automatically
+recalculates the low-scoring segments.
+
+
+DNA WEIGHT MATRIX: Two hard-coded matrices are available:
+
+1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
+of nucleic acid sequences. X's and N's are treated as matches to any IUB
+ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
+0.9.
+
+2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
+1.0 and mismatches score 0. All matches for IUB symbols also score 0.
+
+A new matrix can be read from a file on disk, if the filename consists only
+of lower case characters. The values in the new weight matrix should be
+similarities and should be NEGATIVE for infrequent substitutions.
+
+INPUT FORMAT. The format used for a new matrix is the same as the BLAST
+program. Any lines beginning with a # character are assumed to be comments. The
+first non-comment line should contain a list of amino acids in any order, using
+the 1 letter code, followed by a * character. This should be followed by a
+square matrix of scores, with one row and one column for each amino acid. The
+last row and column of the matrix (corresponding to the * character) contain
+the minimum score over the whole matrix.
+
+<H4>
+QUALITY SCORE PARAMETERS
+</H4>
+------------------------
+
+You can customise the column 'quality scores' plotted underneath the alignment
+display using the following options.
+
+SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
+change the scale of the quality score plot.
+
+RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
+used to change the number of residue exceptions which are highlighted in the
+alignment display. (For an explanation of this cutoff, see the CALCULATION OF
+RESIDUE EXCEPTIONS section below.)
+
+PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
+each amino acid to each other.
+
+DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
+
+For more information about the weight matrices, see the help above for
+the Low-scoring Segments Weight Matrix.
+
+For details of the quality score calculations, see the CALCULATION section
+below.
+
+
+<STRONG>
+SHOW LOW-SCORING SEGMENTS
+</STRONG>
+
+The low-scoring segment display can be toggled on or off. This option does not
+recalculate the profile scores.
+
+
+<STRONG>
+SHOW EXCEPTIONAL RESIDUES
+</STRONG>
+
+This option highlights individual residues which score badly in the alignment
+quality calculations. Residues which score exceptionally low are highlighted by
+using a white character on a grey background.
+
+<STRONG>
+SAVE QUALITY SCORES TO FILE
+</STRONG>
+
+The quality scores that are plotted underneath the alignment display can also
+be saved in a text file. Each column in the alignment is written on one line in
+the output file, with the value of the quality score at the end of the line.
+Only the sequences currently selected in the display are written to the file.
+One use for quality scores is to color residues in a protein structure by
+sequence conservation. In this way conserved surface residues can be
+highlighted to locate functional regions such as ligand-binding sites.
+
+
+<H3>
+CALCULATION OF QUALITY SCORES
+</H3>
+-----------------------------
+
+Suppose we have an alignment of m sequences of length n. Then, the alignment
+can be written as:
+
+<PRE>
+ A11 A12 A13 .......... A1n
+ A21 A22 A23 .......... A2n
+ .
+ .
+ Am1 Am2 Am3 .......... Amn
+</PRE>
+
+We also have a residue comparison matrix of size R where C(i,j) is the score
+for aligning residue i with residue j.
+
+We want to calculate a score for the conservation of the jth position in the
+alignment.
+
+To do this, we define an R-dimensional sequence space. For the jth position in
+the alignment, each sequence consists of a single residue which is assigned a
+point S in the space. S has R dimensions, and for sequence i, the rth dimension
+is defined as:
+
+<PRE>
+ Sr = C(r,Aij)
+</PRE>
+
+We then calculate a consensus value for the jth position in the alignment. This
+value X also has R dimensions, and the rth dimension is defined as:
+
+<PRE>
+ Xr = ( SUM (Fij * C(i,r)) ) / m
+ 1<=i<=R
+</PRE>
+
+where Fij is the count of residues i at position j in the alignment.
+
+Now we can calculate the distance Di between each sequence i and the consensus
+position X in the R-dimensional space.
+
+<PRE>
+ Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
+ 1<=r<=R
+
+</PRE>
+
+The quality score for the jth position in the alignment is defined as the mean
+of the sequence distances Di.
+
+The score is normalised by multiplying by the percentage of sequences which
+have residues (and not gaps) at this position.
+
+<H3>
+CALCULATION OF RESIDUE EXCEPTIONS
+</H3>
+---------------------------------
+
+The jth residue of the ith sequence is considered as an exception if the
+distance Di of the sequence from the consensus value X is greater than (Upper
+Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
+displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
+value will only display very significant exceptions; a low value will allow
+more, less significant, exceptions to be highlighted.
+
+(NB. Sequences which contain gaps at this position are not included in the
+exception calculation.)
+
+
+<H3>
+CALCULATION OF LOW-SCORING SEGMENTS
+</H3>
+-----------------------------------
+
+Suppose we have an alignment of m sequences of length n. Then, the alignment
+can be written as:
+
+<PRE>
+ A11 A12 A13 .......... A1n
+ A21 A22 A23 .......... A2n
+ .
+ .
+ Am1 Am2 Am3 .......... Amn
+</PRE>
+
+We also have a residue comparison matrix of size R where C(i,j) is the score
+for aligning residue i with residue j.
+
+We calculate sequence weights by building a neighbour-joining tree, in which
+branch lengths are proportional to divergence. Summing the branches by branch
+ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
+Henikoff et al., JMB, 243, 574 (1994).
+
+To find the low-scoring segments in a sequence Si, we build a weighted profile
+of the remaining sequences in the alignment. Suppose we find residue r at
+position j in the sequence; then the score for the jth position in the sequence
+is defined as
+
+<PRE>
+ Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
+ for residue r at position j in the
+ alignment.
+</PRE>
+
+These residue scores are summed along the sequence in both forward and backward
+directions. If the sum of the scores is positive, then it is reset to zero.
+Segments which score negatively in both directions are considered as
+'low-scoring' and will be highlighted in the alignment display.
+
+
+>>HELP 9 <<
+ Command Line Parameters
+
+ DATA (sequences)
+
+-INFILE=file.ext :input sequences
+-PROFILE1=file.ext and -PROFILE2=file.ext :profiles (aligned sequences)
+
+
+ VERBS (do things)
+
+-OPTIONS :list the command line parameters
+-HELP or -CHECK :outline the command line parameters
+-ALIGN :do full multiple alignment
+-TREE :calculate NJ tree
+-BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000)
+-CONVERT :output the input sequences in a different file format
+
+
+ PARAMETERS (set things)
+
+***General settings:****
+-INTERACTIVE :read command line, then enter normal interactive menus
+-QUICKTREE :use FAST algorithm for the alignment guide tree
+-TYPE= :PROTEIN or DNA sequences
+-NEGATIVE :protein alignment with negative values in matrix
+-OUTFILE= :sequence alignment file name
+-OUTPUT= :CLUSTAL, GCG, GDE, PHYLIP, PIR, NEXUS, FASTA
+-OUTORDER= :INPUT or ALIGNED
+-CASE= :LOWER or UPPER (for GDE output only)
+-SEQNOS= :OFF or ON (for Clustal output only)
+
+
+***Fast Pairwise Alignments:***
+-KTUPLE=n :word size
+-TOPDIAGS=n :number of best diags.
+-WINDOW=n :window around best diags.
+-PAIRGAP=n :gap penalty
+-SCORE= :PERCENT or ABSOLUTE
+
+
+***Slow Pairwise Alignments:***
+-PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
+-PWGAPOPEN=f :gap opening penalty
+-PWGAPEXT=f :gap extension penalty
+
+
+***Multiple Alignments:***
+-NEWTREE= :file for new guide tree
+-USETREE= :file for old guide tree
+-MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
+-GAPOPEN=f :gap opening penalty
+-GAPEXT=f :gap extension penalty
+-ENDGAPS :no end gap separation pen.
+-GAPDIST=n :gap separation pen. range
+-NOPGAP :residue-specific gaps off
+-NOHGAP :hydrophilic gaps off
+-HGAPRESIDUES= :list hydrophilic res.
+-MAXDIV=n :% ident. for delay
+-TYPE= :PROTEIN or DNA
+-TRANSWEIGHT=f :transitions weighting
+
+
+***Profile Alignments:***
+-PROFILE :Merge two alignments by profile alignment
+-NEWTREE1= :file for new guide tree for profile1
+-NEWTREE2= :file for new guide tree for profile2
+-USETREE1= :file for old guide tree for profile1
+-USETREE2= :file for old guide tree for profile2
+
+
+***Sequence to Profile Alignments:***
+-SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
+-NEWTREE= :file for new guide tree
+-USETREE= :file for old guide tree
+
+
+***Structure Alignments:***
+-NOSECSTR1 :do not use secondary structure/gap penalty mask for profile 1
+-NOSECSTR2 :do not use secondary structure/gap penalty mask for profile 2
+-SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
+-HELIXGAP=n :gap penalty for helix core residues
+-STRANDGAP=n :gap penalty for strand core residues
+-LOOPGAP=n :gap penalty for loop regions
+-TERMINALGAP=n :gap penalty for structure termini
+-HELIXENDIN=n :number of residues inside helix to be treated as terminal
+-HELIXENDOUT=n :number of residues outside helix to be treated as terminal
+-STRANDENDIN=n :number of residues inside strand to be treated as terminal
+-STRANDENDOUT=n:number of residues outside strand to be treated as terminal
+
+
+***Trees:***
+-OUTPUTTREE=nj OR phylip OR dist OR nexus
+-SEED=n :seed number for bootstraps
+-KIMURA :use Kimura's correction
+-TOSSGAPS :ignore positions with gaps
+-BOOTLABELS=node OR branch :position of bootstrap values in tree display
+
+
+>>HELP R <<
+ References
+
+<STRONG>
+The ClustalX program is described in the manuscript:
+</STRONG>
+
+Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
+The ClustalX windows interface: flexible strategies for multiple sequence
+alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
+
+
+<STRONG>
+The ClustalW program is described in the manuscript:
+</STRONG>
+
+Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
+sensitivity of progressive multiple sequence alignment through sequence
+weighting, position-specific gap penalties and weight matrix choice. Nucleic
+Acids Research, 22:4673-4680.
+
+
+<STRONG>
+The ClustalV program is described in the manuscript:
+</STRONG>
+
+Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
+multiple sequence alignment. CABIOS 8,189-191.
+
+
+<STRONG>
+The original Clustal program is described in the manuscripts:
+</STRONG>
+
+Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
+alignments on a microcomputer.
+CABIOS 5,151-153.
+
+Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
+sequence alignment on a microcomputer. Gene 73,237-244.
+
+-------------------------------------------------------------------------------
+<STRONG>
+Some tips on using Clustal X:
+</STRONG>
+
+Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
+Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
+
+<STRONG>
+Some tips on using Clustal W:
+</STRONG>
+
+Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
+multiple sequence alignments. Methods Enzymol., 266, 383-402.
+
+-------------------------------------------------------------------------------
+<STRONG>
+You can get the latest version of the ClustalX program by anonymous ftp to:
+</STRONG>
+
+ftp-igbmc.u-strasbg.fr
+ftp.embl-heidelberg.de
+ftp.ebi.ac.uk
+
+<STRONG>
+Or, have a look at the following WWW site:
+</STRONG>
+
+http://www-igbmc.u-strasbg.fr/BioInfo/
+
Added: trunk/packages/clustalw/branches/upstream/current/coldna.par
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/coldna.par 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/coldna.par 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,16 @@
+# color lookup table - this is optional, if no rgbindex is specified, 8
+# hardcoded colors will be used.
+# A maximum of 16 colors can be specified - any more will be ignored!
+@rgbindex
+RED 0.9 0.2 0.1
+BLUE 0.1 0.5 0.9
+GREEN 0.1 0.8 0.1
+ORANGE 0.9 0.6 0.3
+
+
+@color
+a = RED
+c = BLUE
+g = ORANGE
+t = GREEN
+u = GREEN
Added: trunk/packages/clustalw/branches/upstream/current/colprint.par
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/colprint.par 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/colprint.par 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,15 @@
+WHITE 1.0 1.0 1.0
+YELLOW 1.0 1.0 0.0
+VIOLET 0.4 0.1 0.9
+RED 0.9 0.5 0.4
+BLUE 0.4 0.9 0.9
+PURPLE 0.7 0.6 0.9
+BLACK 0.0 0.0 0.0
+GREY 0.6 0.7 0.7
+PINK 0.8 0.3 0.8
+ORANGE 0.9 0.7 0.3
+CYAN 0.1 0.7 0.7
+PINK 0.9 0.5 0.5
+MAGENTA 0.8 0.3 0.8
+ORANGE 0.9 0.6 0.3
+
Added: trunk/packages/clustalw/branches/upstream/current/colprot.par
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/colprot.par 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/colprot.par 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,66 @@
+# color lookup table - this is optional, if no rgbindex is specified, 8
+# hardcoded colors will be used.
+# A maximum of 16 colors can be specified - any more will be ignored!
+@rgbindex
+RED 0.9 0.2 0.1
+BLUE 0.1 0.5 0.9
+GREEN 0.1 0.8 0.1
+CYAN 0.1 0.7 0.7
+PINK 0.9 0.5 0.5
+MAGENTA 0.8 0.3 0.8
+YELLOW 0.8 0.8 0.0
+ORANGE 0.9 0.6 0.3
+
+@consensus
+% = 60% w:l:v:i:m:a:f:c:y:h:p
+# = 80% w:l:v:i:m:a:f:c:y:h:p
+- = 50% e:d
++ = 60% k:r
+g = 50% g
+n = 50% n
+q = 50% q:e
+p = 50% p
+t = 50% t:s
+A = 85% a
+C = 85% c
+D = 85% d
+E = 85% e
+F = 85% f
+G = 85% g
+H = 85% h
+I = 85% i
+K = 85% k
+L = 85% l
+M = 85% m
+N = 85% n
+P = 85% p
+Q = 85% q
+R = 85% r
+S = 85% s
+T = 85% t
+V = 85% v
+W = 85% w
+Y = 85% y
+
+@color
+g = ORANGE
+p = YELLOW
+t = GREEN if t:S:T:%:#
+s = GREEN if t:S:T:#
+n = GREEN if n:N:D
+q = GREEN if q:Q:E:+:K:R
+w = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+l = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+v = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+i = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+m = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+a = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p:T:S:s:G
+f = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+c = BLUE if %:#:A:F:H:I:L:M:V:W:Y:S:P:p
+c = PINK if C
+h = CYAN if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+y = CYAN if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
+e = MAGENTA if -:D:E:q:Q
+d = MAGENTA if -:D:E:n:N
+k = RED if +:K:R:Q
+r = RED if +:K:R:Q
Added: trunk/packages/clustalw/branches/upstream/current/dayhoff.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/dayhoff.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/dayhoff.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,45 @@
+/* DAYHOFF.H
+
+ Table of estimated PAMS (actual no. of substitutions per 100 residues)
+ for a range of observed amino acid distances from 75.0% (the first entry
+ in the array), in 0.1% increments, up to 93.0%.
+
+ These values are used to correct for multiple hits in protein alignments.
+ The values below are for observed distances above 74.9%. For values above
+ 93%, an arbitrary value of 1000 PAMS (1000% substitution) is used.
+
+ These values are derived from a Dayhoff model (1978) of amino acid
+ substitution and assume average amino acid composition and that amino
+ acids replace each other at the same rate as in the original Dayhoff model.
+
+ Up to 75% observed distance, use Kimura's empirical formula to derive
+ the correction. For 75% or greater, use this table. Kimura's formula
+ is accurate up to about 75% and fails completely above 85%.
+*/
+
+/*
+ * dayhoff_pams[i] is the estimated number of PAMs for an observed distance
+ * of (75.0 + 0.1 * i) percent; the table holds 181 entries covering
+ * 75.0% .. 93.0% inclusive.
+ *
+ * NOTE(review): this is a non-static definition inside a header, so the
+ * header can only be included from one translation unit without causing
+ * duplicate-definition problems at link time.
+ */
+int dayhoff_pams[]={
+ 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
+ 196, /* 75.1% observed d; 196 PAMs estimated */
+ 197, 198, 199, 200, 200, 201, 202, 203,
+ 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
+ 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
+ 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
+ 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
+ 252, 253, 254, 255, 257, 258,
+ 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
+ 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
+ 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
+ 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
+ 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
+ 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
+ 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
+ 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
+ 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
+ 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
+ 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
+ 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
+ /* 92.9% observed; 945 PAMs */
+ 988 /* 93.0% observed; 988 PAMs */
+};
+
Added: trunk/packages/clustalw/branches/upstream/current/gcgcheck.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/gcgcheck.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/gcgcheck.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,15 @@
+#include <ctype.h> /* because of toupper() */
+int SeqGCGCheckSum(char *seq, int len);
+
+/*
+ * SeqGCGCheckSum - position-weighted checksum of a sequence.
+ *
+ * seq  the residues to checksum (need not be NUL-terminated)
+ * len  number of characters of seq to include
+ *
+ * Each character is upper-cased and multiplied by a weight that cycles
+ * 1..57 with its position; the sum is returned modulo 10000.
+ */
+int SeqGCGCheckSum(char *seq, int len)
+{
+    int i;
+    long check = 0;
+
+    for (i = 0; i < len; i++, seq++) {
+        /* toupper() is only defined for unsigned char values (or EOF) */
+        check += (long)((i % 57) + 1) * toupper((unsigned char)*seq);
+        /* reduce every step so the running sum cannot overflow a long */
+        check %= 10000;
+    }
+
+    return (int)check;
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/general.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/general.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/general.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,50 @@
+/* General purpose header file - rf 12/90 */
+
+/* Include guard: the whole header is skipped on a second inclusion. */
+#ifndef _H_general
+#define _H_general
+
+
+
+/* Macintosh specific */
+#ifdef MAC /* rf 12/9/94 */
+
+/* The three qualifiers below are defined to nothing, i.e. erased. */
+#define const /* THINK C doesn't know about these identifiers */
+#define signed
+#define volatile
+/* Every later use of the keyword int is rewritten to long.
+   NOTE(review): presumably because int is only 16 bits under THINK C -
+   confirm. */
+#define int long
+#ifndef Boolean
+#define Boolean char
+#endif
+#define pint short /* cast ints in printf statements as pint */
+/* sint and lint expand to int, which the macro above turns into long. */
+#define sint int /* cast ints for sequence lengths */
+#define lint int /* cast ints for profile scores */
+
+#else /* not Macintosh */
+
+/* On every other platform the three helper types are plain int. */
+#define pint int /* cast ints in printf statements as pint */
+#define sint int /* cast ints for sequence lengths */
+#define lint int /* cast ints for profile scores */
+#ifndef Boolean
+#define Boolean char
+#endif
+
+#endif /* ifdef MAC */
+
+/* definitions for all machines */
+
+#undef TRUE /* Boolean values; first undef them, just in case */
+#undef FALSE
+#define TRUE 1
+#define FALSE 0
+
+#define EOS '\0' /* End-Of-String */
+#define MAXLINE 512 /* Max. line length */
+
+
+/* VMS builds also erase the signed keyword. */
+#ifdef VMS
+#define signed
+#endif
+
+
+#endif /* ifndef _H_general */
+
Added: trunk/packages/clustalw/branches/upstream/current/globin.pep
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/globin.pep 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/globin.pep 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,86 @@
+>P1;HBB_HUMAN
+Sw:Hbb_Human => HBB_HUMAN
+ VHLTPEEKSA VTALWGKVNV DEVGGEALGR LLVVYPWTQR FFESFGDLST
+ PDAVMGNPKV KAHGKKVLGA FSDGLAHLDN LKGTFATLSE LHCDKLHVDP
+ ENFRLLGNVL VCVLAHHFGK EFTPPVQAAY QKVVAGVANA LAHKYH*
+C;ID HBB_HUMAN STANDARD; PRT; 146 AA.
+C;AC P02023;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-APR-1993 (REL. 25, LAST ANNOTATION UPDATE)
+C;DE HEMOGLOBIN BETA CHAIN. . . .
+
+>P1;HBB_HORSE
+Sw:Hbb_Horse => HBB_HORSE
+ VQLSGEEKAA VLALWDKVNE EEVGGEALGR LLVVYPWTQR FFDSFGDLSN
+ PGAVMGNPKV KAHGKKVLHS FGEGVHHLDN LKGTFAALSE LHCDKLHVDP
+ ENFRLLGNVL VVVLARHFGK DFTPELQASY QKVVAGVANA LAHKYH*
+C;ID HBB_HORSE STANDARD; PRT; 146 AA.
+C;AC P02062;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
+C;DE HEMOGLOBIN BETA CHAIN. . . .
+
+>P1;HBA_HUMAN
+Sw:Hba_Human => HBA_HUMAN
+ VLSPADKTNV KAAWGKVGAH AGEYGAEALE RMFLSFPTTK TYFPHFDLSH
+ GSAQVKGHGK KVADALTNAV AHVDDMPNAL SALSDLHAHK LRVDPVNFKL
+ LSHCLLVTLA AHLPAEFTPA VHASLDKFLA SVSTVLTSKY R*
+C;ID HBA_HUMAN STANDARD; PRT; 141 AA.
+C;AC P01922;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-FEB-1994 (REL. 28, LAST ANNOTATION UPDATE)
+C;DE HEMOGLOBIN ALPHA CHAIN. . . .
+
+>P1;HBA_HORSE
+Sw:Hba_Horse => HBA_HORSE
+ VLSAADKTNV KAAWSKVGGH AGEYGAEALE RMFLGFPTTK TYFPHFDLSH
+ GSAQVKAHGK KVGDALTLAV GHLDDLPGAL SNLSDLHAHK LRVDPVNFKL
+ LSHCLLSTLA VHLPNDFTPA VHASLDKFLS SVSTVLTSKY R*
+C;ID HBA_HORSE STANDARD; PRT; 141 AA.
+C;AC P01958;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
+C;DE HEMOGLOBIN ALPHA CHAINS (SLOW AND FAST). . . .
+
+>P1;MYG_PHYCA
+Sw:Myg_Phyca => MYG_PHYCA
+ VLSEGEWQLV LHVWAKVEAD VAGHGQDILI RLFKSHPETL EKFDRFKHLK
+ TEAEMKASED LKKHGVTVLT ALGAILKKKG HHEAELKPLA QSHATKHKIP
+ IKYLEFISEA IIHVLHSRHP GDFGADAQGA MNKALELFRK DIAAKYKELG
+ YQG*
+C;ID MYG_PHYCA STANDARD; PRT; 153 AA.
+C;AC P02185;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-MAY-1992 (REL. 22, LAST ANNOTATION UPDATE)
+C;DE MYOGLOBIN. . . .
+
+>P1;GLB5_PETMA
+Sw:Glb5_Petma => GLB5_PETMA
+ PIVDTGSVAP LSAAEKTKIR SAWAPVYSTY ETSGVDILVK FFTSTPAAQE
+ FFPKFKGLTT ADQLKKSADV RWHAERIINA VNDAVASMDD TEKMSMKLRD
+ LSGKHAKSFQ VDPQYFKVLA AVIADTVAAG DAGFEKLMSM ICILLRSAY*
+C;ID GLB5_PETMA STANDARD; PRT; 149 AA.
+C;AC P02208;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
+C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
+C;DE GLOBIN V. . . .
+
+>P1;LGB2_LUPLU
+Sw:Lgb2_Luplu => LGB2_LUPLU
+ GALTESQAAL VKSSWEEFNA NIPKHTHRFF ILVLEIAPAA KDLFSFLKGT
+ SEVPQNNPEL QAHAGKVFKL VYEAAIQLQV TGVVVTDATL KNLGSVHVSK
+ GVADAHFPVV KEAILKTIKE VVGAKWSEEL NSAWTIAYDE LAIVIKKEMN
+ DAA*
+C;ID LGB2_LUPLU STANDARD; PRT; 153 AA.
+C;AC P02240;
+C;DT 21-JUL-1986 (REL. 01, CREATED)
+C;DT 01-NOV-1988 (REL. 09, LAST SEQUENCE UPDATE)
+C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
+C;DE LEGHEMOGLOBIN II. . . .
+
Property changes on: trunk/packages/clustalw/branches/upstream/current/globin.pep
___________________________________________________________________
Name: svn:executable
+
Added: trunk/packages/clustalw/branches/upstream/current/gon90.bla
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/gon90.bla 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/gon90.bla 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,24 @@
+#
+#
+ C S T P A G N D E Q H R K M I L V F Y W *
+ 15.10 -1.20-3.00 -8.50 -0.70 -5.60 -5.10 -8.60 -8.60 -7.00-3.90 -5.50 -8.10 -3.50 -4.80 -5.10 -1.80 -3.50 -2.80 -3.90 0.0
+ -1.20 7.302.60 -1.10 1.50 -1.20 0.70 -1.10 -1.60 -1.20-2.10 -2.30 -1.70 -3.70 -5.70 -5.90 -3.90 -7.10 -4.30 -6.80 0.0
+ -3.00 2.60 7.70 -1.70 -0.30 -4.70 -0.50 -2.00 -2.30 -1.50-2.10 -2.40 -1.20 -2.20 -2.40 -4.40 -0.70 -5.80 -5.30 -8.10 0.0
+ -8.50 -1.10-1.70 11.20 -1.30 -5.30 -4.30 -3.80 -3.10 -2.10-4.00 -4.10 -3.30 -7.10 -6.90 -5.40 -5.30 -8.70 -7.10 -10.50 0.0
+ -0.70 1.50 -0.30 -1.30 7.10 -0.80 -2.90 -2.70 -1.40 -1.90-3.50 -3.20 -2.70 -2.30 -3.80 -3.90 -0.40 -5.90 -6.00 -8.20 0.0
+ -5.60 -1.20-4.70 -5.30 -0.80 9.50 -1.40 -2.30 -4.20 -4.00-4.80 -3.90 -4.50 -7.70 -10.60 -9.80 -8.00 -10.80-8.80 -7.60 0.0
+ -5.10 0.70 -0.50 -4.30 -2.90 -1.40 9.30 2.30 -0.90 -0.401.10 -1.90 0.10 -5.70 -6.70 -7.40 -6.30 -7.00 -3.60 -8.10 0.0
+ -8.60 -1.10-2.00 -3.80 -2.70 -2.30 2.30 9.30 3.30 -0.60-1.40 -3.90 -1.70 -8.10 -9.80 -10.10 -8.10 -10.40-6.40 -11.60 0.0
+ -8.60 -1.60-2.30 -3.10 -1.40 -4.20 -0.90 3.30 8.40 2.40 -1.40 -1.90 0.70 -4.90 -6.80 -7.00 -4.70 -9.50 -6.90 -9.20 0.0
+ -7.00 -1.20-1.50 -2.10 -1.90 -4.00 -0.40 -0.60 2.40 8.901.60 1.30 1.80 -1.80 -5.50 -3.60 -4.70 -6.50 -4.90 -5.80 0.0
+ -3.90 -2.10-2.10 -4.00 -3.50 -4.80 1.10 -1.40 -1.40 1.60 12.30 -0.70 -0.90 -3.50 -5.90 -5.30 -6.00 -2.20 2.30 -3.80 0.0
+ -5.50 -2.30-2.40 -4.10 -3.20 -3.90 -1.90 -3.90 -1.90 1.30 -0.70 9.30 3.50 -4.80 -6.50 -5.40 -5.50 -8.40 -4.60 -3.80 0.0
+ -8.10 -1.70-1.20 -3.30 -2.70 -4.50 0.10 -1.70 0.70 1.80 -0.90 3.50 8.10 -3.40 -5.50 -5.40 -4.90 -8.30 -5.50 -8.30 0.0
+ -3.50 -3.70-2.20 -7.10 -2.30 -7.70 -5.70 -8.10 -4.90 -1.80-3.50 -4.80 -3.40 11.10 2.70 3.20 0.40 0.60 -3.30 -4.10 0.0
+ -4.80 -5.70-2.40 -6.90 -3.80 -10.60 -6.70 -9.80 -6.80 -5.50-5.90 -6.50 -5.50 2.70 8.20 2.40 4.20 -1.10 -4.20 -5.80 0.0
+ -5.10 -5.90-4.40 -5.40 -3.90 -9.80 -7.40 -10.10 -7.00 -3.60-5.30 -5.40 -5.40 3.20 2.40 7.40 0.60 1.00 -3.10 -3.90 0.0
+ -1.80 -3.90-0.70 -5.30 -0.40 -8.00 -6.30 -8.10 -4.70 -4.70-6.00 -5.50 -4.90 0.40 4.20 0.60 7.60 -2.70 -4.30 -7.30 0.0
+ -3.50 -7.10-5.80 -8.70 -5.90 -10.80 -7.00 -10.40 -9.50 -6.50-2.20 -8.40 -8.30 0.60 -1.10 1.00 -2.70 11.105.10 2.00 0.0
+ -2.80 -4.30-5.30 -7.10 -6.00 -8.80 -3.60 -6.40 -6.90 -4.902.30 -4.60 -5.50 -3.30 -4.20 -3.10 -4.30 5.10 12.00 2.60 0.0
+ -3.90 -6.80-8.10 -10.50 -8.20 -7.60 -8.10 -11.60 -9.20 -5.80-3.80 -3.80 -8.30 -4.10 -5.80 -3.90 -7.30 2.00 2.60 17.10 0.0
+0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Added: trunk/packages/clustalw/branches/upstream/current/interface.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/interface.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/interface.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,4391 @@
+/* command line interface for Clustal W */
+/* DES was here MARCH. 1994 */
+/* DES was here SEPT. 1994 */
+/* Fixed memory allocation bug in check_param() . Alan Bleasby Dec 2002 */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <setjmp.h>
+#include "clustalw.h"
+#include "param.h"
+
+/*
+* Prototypes
+*/
+
+#ifdef UNIX
+FILE *open_path(char *);
+#endif
+
+
+char *nameonly(char *s) ;
+
+static sint check_param(char **args,char *params[], char *param_arg[]);
+static void set_optional_param(void);
+static sint find_match(char *probe, char *list[], sint n);
+static void show_aln(void);
+static void create_parameter_output(void);
+static void reset_align(void);
+static void reset_prf1(void);
+static void reset_prf2(void);
+static void calc_gap_penalty_mask(int prf_length,char *struct_mask,char *gap_mask);
+void print_sec_struct_mask(int prf_length,char *mask,char *struct_mask);
+/*
+* Global variables
+*/
+
+extern sint max_names;
+
+extern Boolean interactive;
+
+extern double **tmat;
+extern float gap_open, gap_extend;
+extern float dna_gap_open, dna_gap_extend;
+extern float prot_gap_open, prot_gap_extend;
+extern float pw_go_penalty, pw_ge_penalty;
+extern float dna_pw_go_penalty, dna_pw_ge_penalty;
+extern float prot_pw_go_penalty, prot_pw_ge_penalty;
+extern char revision_level[];
+extern sint wind_gap,ktup,window,signif;
+extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
+extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
+extern sint boot_ntrials; /* number of bootstrap trials */
+extern sint nseqs;
+extern sint new_seq;
+extern sint *seqlen_array;
+extern sint divergence_cutoff;
+extern sint debug;
+extern Boolean no_weights;
+extern Boolean neg_matrix;
+extern Boolean quick_pairalign;
+extern Boolean reset_alignments_new; /* DES */
+extern Boolean reset_alignments_all; /* DES */
+extern sint gap_dist;
+extern Boolean no_hyd_penalties, no_pref_penalties;
+extern sint max_aa;
+extern sint gap_pos1, gap_pos2;
+extern sint max_aln_length;
+extern sint *output_index, output_order;
+extern sint profile_no;
+extern short usermat[], pw_usermat[];
+extern short aa_xref[], pw_aa_xref[];
+extern short userdnamat[], pw_userdnamat[];
+extern short dna_xref[], pw_dna_xref[];
+extern sint *seq_weight;
+
+extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
+extern Boolean cl_seq_numbers;
+
+extern Boolean seqRange; /*Ramu */
+
+extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus, output_fasta;
+extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus;
+extern sint bootstrap_format;
+extern Boolean tossgaps, kimura;
+extern Boolean percent;
+extern Boolean explicit_dnaflag; /* Explicit setting of sequence type on comm.line*/
+extern Boolean usemenu;
+extern Boolean showaln, save_parameters;
+extern Boolean dnaflag;
+extern float transition_weight;
+extern unsigned sint boot_ran_seed;
+
+
+extern FILE *tree;
+extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile, *nexus_outfile;
+extern FILE *fasta_outfile; /* Ramu */
+extern FILE *gde_outfile;
+
+extern char hyd_residues[];
+extern char *amino_acid_codes;
+extern char **args;
+extern char seqname[];
+
+extern char **seq_array;
+extern char **names, **titles;
+
+extern char *gap_penalty_mask1,*gap_penalty_mask2;
+extern char *sec_struct_mask1,*sec_struct_mask2;
+extern sint struct_penalties,struct_penalties1,struct_penalties2;
+extern sint output_struct_penalties;
+extern Boolean use_ss1, use_ss2;
+extern char *ss_name1,*ss_name2;
+
+
+/* Secondary-structure name and mask pointers, all initially NULL.
+   NOTE(review): presumably pointed at the profile 1 / profile 2 variants
+   (ss_name1, sec_struct_mask1, ...) declared extern above - confirm. */
+char *ss_name = NULL;
+char *sec_struct_mask = NULL;
+char *gap_penalty_mask = NULL;
+
+/* File-name buffers for the two profiles. */
+char profile1_name[FILENAMELEN+1];
+char profile2_name[FILENAMELEN+1];
+
+/* All three flags are set to TRUE by init_interface(). */
+Boolean empty;
+Boolean profile1_empty, profile2_empty; /* whether or not profiles have been filled */
+
+/* Copied from the /outfile= option in set_optional_param(). */
+char outfile_name[FILENAMELEN+1]="";
+
+/* One output file-name buffer per alignment format (private to this file). */
+static char clustal_outname[FILENAMELEN+1], gcg_outname[FILENAMELEN+1];
+static char phylip_outname[FILENAMELEN+1],nbrf_outname[FILENAMELEN+1];
+static char gde_outname[FILENAMELEN+1], nexus_outname[FILENAMELEN+1];
+static char fasta_outname[FILENAMELEN+1]; /* Ramu */
+/* Tree output file names; all start out as empty strings. */
+char clustal_tree_name[FILENAMELEN+1]="";
+char dist_tree_name[FILENAMELEN+1]="";
+char phylip_tree_name[FILENAMELEN+1]="";
+char nexus_tree_name[FILENAMELEN+1]="";
+char p1_tree_name[FILENAMELEN+1]="";
+char p2_tree_name[FILENAMELEN+1]="";
+
+char pim_name[FILENAMELEN+1]=""; /* Ramu */
+
+/* Keyword and value strings of each command-line word; the functions in
+   this file pass them to check_param(), which fills them in, and
+   set_optional_param() reads param_arg[]. */
+static char *params[MAXARGS];
+static char *param_arg[MAXARGS];
+
+/* Value-type suffixes, terminated by an empty string.
+   NOTE(review): presumably indexed by the option type code (NOARG ...
+   FILARG) when listing the options - confirm. */
+static char *cmd_line_type[] = {
+ " ",
+ "=n ",
+ "=f ",
+ "=string ",
+ "=filename ",
+ ""};
+
+static sint numparams;
+static Boolean check_tree = TRUE;
+
+sint profile1_nseqs; /* the no. of seqs in prof 1 */
+Boolean use_tree_file = FALSE,new_tree_file = FALSE;
+Boolean use_tree1_file = FALSE, use_tree2_file = FALSE;
+Boolean new_tree1_file = FALSE, new_tree2_file = FALSE;
+
+/* Line buffer of MAXLINE+1 chars, allocated by init_interface(). */
+static char *lin2;
+
+/*
+ * Menu tables for the selectable weight matrices.  Each initialiser is an
+ * entry count followed by that many (title, keyword) string pairs; the
+ * keywords are the names that set_optional_param() accepts for the
+ * matching matrix option, and the final "User defined" entry has an empty
+ * keyword.
+ */
+
+/* DNA matrices (keywords "iub" and "clustalw"). */
+MatMenu dnamatrix_menu = {3,
+ "IUB","iub",
+ "CLUSTALW(1.6)","clustalw",
+ "User defined",""
+ };
+
+/* Protein matrix series. */
+MatMenu matrix_menu = {5,
+ "BLOSUM series","blosum",
+ "PAM series","pam",
+ "Gonnet series","gonnet",
+ "Identity matrix","id",
+ "User defined",""
+ };
+
+/* Protein matrices for the pairwise stage; same keywords as matrix_menu,
+   but the titles name one specific matrix each. */
+MatMenu pw_matrix_menu = {5,
+ "BLOSUM 30","blosum",
+ "PAM 350","pam",
+ "Gonnet 250","gonnet",
+ "Identity matrix","id",
+ "User defined",""
+ };
+
+
+/*
+ * init_interface - put the interface state into its initial condition:
+ * no sequences and no profiles loaded, and a fresh line buffer allocated.
+ */
+void init_interface(void)
+{
+    /* nothing has been read in yet */
+    empty = profile1_empty = profile2_empty = TRUE;
+
+    /* scratch line buffer: MAXLINE characters plus the terminator */
+    lin2 = (char *)ckalloc((MAXLINE + 1) * sizeof *lin2);
+}
+
+
+
+
+/*
+ * check_param - split the raw command-line words into keywords and values
+ * and look every keyword up in the tables of recognised options.
+ *
+ * args       NULL-terminated list of command-line words.  args[0] is the
+ *            input file name unless it starts with COMMANDSEP, in which
+ *            case it is the first option.
+ * params     receives one newly allocated string per word.  For options
+ *            this is the keyword: the word without its first character,
+ *            with non-printable characters dropped (words after the first)
+ *            and cut at the first '='.  Keywords that carry a value are
+ *            folded to lower case.
+ * param_arg  receives, for every option, the newly allocated text that
+ *            followed '=' or NULL when there was none.
+ *
+ * Keywords are searched in cmd_line_para, then cmd_line_file, then
+ * cmd_line_verb; the flag of the matching entry is set to the index of the
+ * word.  Values of cmd_line_para options are folded to lower case, except
+ * file-name values on UNIX builds.
+ *
+ * Returns the number of words processed (the file name included), 0 when
+ * args is empty and -1 when there are too many words.  An unknown option,
+ * or an option lacking a required value, terminates the program via exit(1).
+ */
+static sint check_param(char **args,char *params[], char *param_arg[])
+{
+    sint len,i,j,k,s,n,match[MAXARGS];
+    sint ajb;
+    Boolean name1 = FALSE;
+    Boolean fold_case;
+
+    /* nothing on the command line: report zero words */
+    if(args[0]==NULL) return(0);
+
+    params[0]=(char *)ckalloc((strlen(args[0])+1)*sizeof(char));
+    if (args[0][0]!=COMMANDSEP)
+    {
+        name1 = TRUE;
+        strcpy(params[0],args[0]);
+    }
+    else
+        strcpy(params[0],&args[0][1]);
+
+    /* copy the remaining words, dropping the first character of each and
+       any non-printable characters; an empty word yields an empty keyword */
+    for (i=1;i<MAXARGS;i++) {
+        if(args[i]==NULL) break;
+        len = (sint)strlen(args[i]);
+        params[i]=(char *)ckalloc((len+1)*sizeof(char));
+        ajb=0;
+        for(j=1;j<len;j++)
+            if(isprint((unsigned char)args[i][j])) params[i][ajb++]=args[i][j];
+        params[i][ajb]='\0';
+    }
+
+    if (i==MAXARGS) {
+        fprintf(stdout,"Error: too many command line arguments\n");
+        return(-1);
+    }
+/*
+   special case - first parameter is input filename
+*/
+    s = 0;
+    if(name1 == TRUE) {
+        /* NOTE(review): the size of seqname is not visible here, so this
+           copy is unbounded - confirm against its definition */
+        strcpy(seqname, params[0]);
+#ifndef UNIX
+        /* non-UNIX builds fold the file name to lower case */
+        for(k=0;seqname[k]!=EOS;++k)
+            seqname[k]=tolower((unsigned char)seqname[k]);
+#endif
+        s++;
+    }
+
+    /* split every option at its first '=' into keyword and value */
+    n = i;
+    for (i=s;i<n;i++) {
+        param_arg[i] = NULL;
+        len = (sint)strlen(params[i]);
+        for(j=0; j<len; j++)
+            if(params[i][j] == '=') {
+                /* the value is len-j-1 characters long, plus its terminator */
+                param_arg[i] = (char *)ckalloc((len-j) * sizeof(char));
+                strcpy(param_arg[i],&params[i][j+1]);
+                params[i][j] = EOS;
+                /* convert the keyword to lower case */
+                for(k=0;k<j;++k)
+                    params[i][k]=tolower((unsigned char)params[i][k]);
+                break;
+            }
+    }
+
+/*
+   for each parameter given on the command line, first search the list of
+   recognised optional parameters....
+*/
+    for (i=0;i<n;i++) {
+        if ((i==0) && (name1 == TRUE)) continue;
+        match[i] = -1;
+        for(j=0; cmd_line_para[j].str[0] != '\0'; j++) {
+            if (strcmp(params[i],cmd_line_para[j].str) != 0) continue;
+            match[i] = j;
+            *cmd_line_para[j].flag = i;
+            if ((cmd_line_para[j].type != NOARG) &&
+                (param_arg[i] == NULL)) {
+                fprintf(stdout,
+                    "Error: parameter required for /%s\n",params[i]);
+                exit(1);
+            }
+            /* convert the value to lower case, unless (on UNIX) it is a
+               file name */
+#ifdef UNIX
+            fold_case = (cmd_line_para[j].type != FILARG);
+#else
+            fold_case = TRUE;
+#endif
+            if (fold_case && param_arg[i] != NULL) {
+                for(k=0;param_arg[i][k]!=EOS;++k)
+                    param_arg[i][k]=tolower((unsigned char)param_arg[i][k]);
+            }
+            break;
+        }
+    }
+/*
+   ....then the list of recognised input files,....
+*/
+    for (i=0;i<n;i++) {
+        if ((i==0) && (name1 == TRUE)) continue;
+        if (match[i] != -1) continue;
+        for(j=0; cmd_line_file[j].str[0] != '\0'; j++) {
+            if (strcmp(params[i],cmd_line_file[j].str) != 0) continue;
+            match[i] = j;
+            *cmd_line_file[j].flag = i;
+            if ((cmd_line_file[j].type != NOARG) &&
+                (param_arg[i] == NULL)) {
+                fprintf(stdout,
+                    "Error: parameter required for /%s\n",params[i]);
+                exit(1);
+            }
+            break;
+        }
+    }
+/*
+   ....and finally the recognised verbs.
+*/
+    for (i=0;i<n;i++) {
+        if ((i==0) && (name1 == TRUE)) continue;
+        if (match[i] != -1) continue;
+        for(j=0; cmd_line_verb[j].str[0] != '\0'; j++) {
+            if (strcmp(params[i],cmd_line_verb[j].str) != 0) continue;
+            match[i] = j;
+            *cmd_line_verb[j].flag = i;
+            if ((cmd_line_verb[j].type != NOARG) &&
+                (param_arg[i] == NULL)) {
+                fprintf(stdout,
+                    "Error: parameter required for /%s\n",params[i]);
+                exit(1);
+            }
+            break;
+        }
+    }
+
+/*
+   check for any unrecognised parameters.
+*/
+    for (i=0;i<n;i++) {
+        /* the input file name is never looked up, so match[0] holds no
+           value for it and must not be examined */
+        if ((i==0) && (name1 == TRUE)) continue;
+        if (match[i] == -1) {
+            fprintf(stdout,
+                "Error: unknown option %c%s\n",COMMANDSEP,params[i]);
+            exit(1);
+        }
+    }
+    return(n);
+}
+
+static void set_optional_param(void)
+{
+ int i,temp;
+ int c;
+ float ftemp;
+ char tstr[100];
+
+ /****************************************************************************/
+ /* look for parameters on command line e.g. gap penalties, k-tuple etc. */
+ /****************************************************************************/
+
+ /*** ? /score=percent or /score=absolute */
+ if(setscore != -1)
+ if(strlen(param_arg[setscore]) > 0) {
+ temp = find_match(param_arg[setscore],score_arg,2);
+ if(temp == 0)
+ percent = TRUE;
+ else if(temp == 1)
+ percent = FALSE;
+ else
+ fprintf(stdout,"\nUnknown SCORE type: %s\n",
+ param_arg[setscore]);
+ }
+
+ /*** ? /seed=n */
+ if(setseed != -1) {
+ temp = 0;
+ if(strlen(param_arg[setseed]) > 0)
+ if (sscanf(param_arg[setseed],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /seed (must be integer)\n");
+ temp = 0;
+ }
+ if(temp > 0) boot_ran_seed = temp;
+ fprintf(stdout,"\ntemp = %d; seed = %u;\n",(pint)temp,boot_ran_seed);
+ }
+
+
+/*** ? /output=PIR, GCG, GDE or PHYLIP */
+ if(setoutput != -1)
+ if(strlen(param_arg[setoutput]) > 0) {
+ temp = find_match(param_arg[setoutput],output_arg,6);
+ if (temp >= 0 && temp <= 5) {
+ output_clustal = FALSE;
+ output_gcg = FALSE;
+ output_phylip = FALSE;
+ output_nbrf = FALSE;
+ output_gde = FALSE;
+ output_nexus = FALSE;
+ output_fasta = FALSE;
+ }
+ switch (temp) {
+ case 0: /* GCG */
+ output_gcg = TRUE;
+ break;
+ case 1: /* GDE */
+ output_gde = TRUE;
+ break;
+ case 2: /* PIR */
+ output_nbrf = TRUE;
+ break;
+ case 3: /* PHYLIP */
+ output_phylip = TRUE;
+ break;
+ case 4: /* NEXUS */
+ output_nexus = TRUE;
+ break;
+ case 5: /* NEXUS */
+ output_fasta = TRUE;
+ break;
+ default:
+ fprintf(stdout,"\nUnknown OUTPUT type: %s\n",
+ param_arg[setoutput]);
+ }
+ }
+
+/*** ? /outputtree=NJ or PHYLIP or DIST or NEXUS */
+ if(setoutputtree != -1)
+ if(strlen(param_arg[setoutputtree]) > 0) {
+ temp = find_match(param_arg[setoutputtree],outputtree_arg,4);
+ switch (temp) {
+ case 0: /* NJ */
+ output_tree_clustal = TRUE;
+ break;
+ case 1: /* PHYLIP */
+ output_tree_phylip = TRUE;
+ break;
+ case 2: /* DIST */
+ output_tree_distances = TRUE;
+ break;
+ case 3: /* NEXUS */
+ output_tree_nexus = TRUE;
+ break;
+ default:
+ fprintf(stdout,"\nUnknown OUTPUT TREE type: %s\n",
+ param_arg[setoutputtree]);
+ }
+ }
+
+/*** ? /profile (sets type of second input file to profile) */
+ if(setprofile != -1)
+ profile_type = PROFILE;
+
+ /*** ? /sequences (sets type of second input file to list of sequences) */
+ if(setsequences != -1)
+ profile_type = SEQUENCE;
+
+
+
+ /*** ? /ktuple=n */
+ if(setktuple != -1) {
+ temp = 0;
+ if(strlen(param_arg[setktuple]) > 0)
+ if (sscanf(param_arg[setktuple],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /ktuple (must be integer)\n");
+ temp = 0;
+ }
+ if(temp > 0) {
+ if(dnaflag) {
+ if(temp <= 4) {
+ ktup = temp;
+ dna_ktup = ktup;
+ wind_gap = ktup + 4;
+ dna_wind_gap = wind_gap;
+ }
+ }
+ else {
+ if(temp <= 2) {
+ ktup = temp;
+ prot_ktup = ktup;
+ wind_gap = ktup + 3;
+ prot_wind_gap = wind_gap;
+ }
+ }
+ }
+ }
+
+ /*** ? /pairgap=n */
+ if(setpairgap != -1) {
+ temp = 0;
+ if(strlen(param_arg[setpairgap]) > 0)
+ if (sscanf(param_arg[setpairgap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /pairgap (must be integer)\n");
+ temp = 0;
+ }
+ if(temp > 0)
+ if(dnaflag) {
+ if(temp > ktup) {
+ wind_gap = temp;
+ dna_wind_gap = wind_gap;
+ }
+ }
+ else {
+ if(temp > ktup) {
+ wind_gap = temp;
+ prot_wind_gap = wind_gap;
+ }
+ }
+ }
+
+
+/*** ? /topdiags=n */
+ if(settopdiags != -1) {
+ temp = 0;
+ if(strlen(param_arg[settopdiags]) > 0)
+ if (sscanf(param_arg[settopdiags],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /topdiags (must be integer)\n");
+ temp = 0;
+ }
+ if(temp > 0)
+ if(dnaflag) {
+ if(temp > ktup) {
+ signif = temp;
+ dna_signif = signif;
+ }
+ }
+ else {
+ if(temp > ktup) {
+ signif = temp;
+ prot_signif = signif;
+ }
+ }
+ }
+
+
+/*** ? /window=n */
+ if(setwindow != -1) {
+ temp = 0;
+ if(strlen(param_arg[setwindow]) > 0)
+ if (sscanf(param_arg[setwindow],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /window (must be integer)\n");
+ temp = 0;
+ }
+ if(temp > 0)
+ if(dnaflag) {
+ if(temp > ktup) {
+ window = temp;
+ dna_window = window;
+ }
+ }
+ else {
+ if(temp > ktup) {
+ window = temp;
+ prot_window = window;
+ }
+ }
+ }
+
+/*** ? /kimura */
+ if(setkimura != -1)
+ kimura = TRUE;
+
+ /*** ? /tossgaps */
+ if(settossgaps != -1)
+ tossgaps = TRUE;
+
+
+ /*** ? /negative */
+ if(setnegative != -1)
+ neg_matrix = TRUE;
+
+ /*** ? /noweights */
+ if(setnoweights!= -1)
+ no_weights = TRUE;
+
+
+ /*** ? /pwmatrix=ID (user's file) */
+ if(setpwmatrix != -1)
+ {
+ temp=strlen(param_arg[setpwmatrix]);
+ if(temp > 0) {
+ for(i=0;i<temp;i++)
+ if (isupper(param_arg[setpwmatrix][i]))
+ tstr[i]=tolower(param_arg[setpwmatrix][i]);
+ else
+ tstr[i]=param_arg[setpwmatrix][i];
+ tstr[i]='\0';
+ if (strcmp(tstr,"blosum")==0) {
+ strcpy(pw_mtrxname, tstr);
+ pw_matnum = 1;
+ }
+ else if (strcmp(tstr,"pam")==0) {
+ strcpy(pw_mtrxname, tstr);
+ pw_matnum = 2;
+ }
+ else if (strcmp(tstr,"gonnet")==0) {
+ strcpy(pw_mtrxname, tstr);
+ pw_matnum = 3;
+ }
+ else if (strcmp(tstr,"id")==0) {
+ strcpy(pw_mtrxname, tstr);
+ pw_matnum = 4;
+ }
+ else {
+ if(user_mat(param_arg[setpwmatrix], pw_usermat, pw_aa_xref))
+ {
+ strcpy(pw_mtrxname,param_arg[setpwmatrix]);
+ strcpy(pw_usermtrxname,param_arg[setpwmatrix]);
+ pw_matnum=5;
+ }
+ else exit(1);
+ }
+
+ }
+ }
+
+/*** ? /matrix=ID (user's file) */
+ if(setmatrix != -1)
+ {
+ temp=strlen(param_arg[setmatrix]);
+ if(temp > 0) {
+ for(i=0;i<temp;i++)
+ if (isupper(param_arg[setmatrix][i]))
+ tstr[i]=tolower(param_arg[setmatrix][i]);
+ else
+ tstr[i]=param_arg[setmatrix][i];
+ tstr[i]='\0';
+ if (strcmp(tstr,"blosum")==0) {
+ strcpy(mtrxname, tstr);
+ matnum = 1;
+ }
+ else if (strcmp(tstr,"pam")==0) {
+ strcpy(mtrxname, tstr);
+ matnum = 2;
+ }
+ else if (strcmp(tstr,"gonnet")==0) {
+ strcpy(mtrxname, tstr);
+ matnum = 3;
+ }
+ else if (strcmp(tstr,"id")==0) {
+ strcpy(mtrxname, tstr);
+ matnum = 4;
+ }
+ else {
+ if(user_mat_series(param_arg[setmatrix], usermat, aa_xref))
+ {
+ strcpy(mtrxname,param_arg[setmatrix]);
+ strcpy(usermtrxname,param_arg[setmatrix]);
+ matnum=5;
+ }
+ else exit(1);
+ }
+
+ }
+ }
+
+/*** ? /pwdnamatrix=ID (user's file) */
+ if(setpwdnamatrix != -1)
+ {
+ temp=strlen(param_arg[setpwdnamatrix]);
+ if(temp > 0) {
+ for(i=0;i<temp;i++)
+ if (isupper(param_arg[setpwdnamatrix][i]))
+ tstr[i]=tolower(param_arg[setpwdnamatrix][i]);
+ else
+ tstr[i]=param_arg[setpwdnamatrix][i];
+ tstr[i]='\0';
+ if (strcmp(tstr,"iub")==0) {
+ strcpy(pw_dnamtrxname, tstr);
+ pw_dnamatnum = 1;
+ }
+ else if (strcmp(tstr,"clustalw")==0) {
+ strcpy(pw_dnamtrxname, tstr);
+ pw_dnamatnum = 2;
+ }
+ else {
+ if(user_mat(param_arg[setpwdnamatrix], pw_userdnamat, pw_dna_xref))
+ {
+ strcpy(pw_dnamtrxname,param_arg[setpwdnamatrix]);
+ strcpy(pw_dnausermtrxname,param_arg[setpwdnamatrix]);
+ pw_dnamatnum=3;
+ }
+ else exit(1);
+ }
+
+ }
+ }
+
+/*** ? /matrix=ID (user's file) */
+ if(setdnamatrix != -1)
+ {
+ temp=strlen(param_arg[setdnamatrix]);
+ if(temp > 0) {
+ for(i=0;i<temp;i++)
+ if (isupper(param_arg[setdnamatrix][i]))
+ tstr[i]=tolower(param_arg[setdnamatrix][i]);
+ else
+ tstr[i]=param_arg[setdnamatrix][i];
+ tstr[i]='\0';
+ if (strcmp(tstr,"iub")==0) {
+ strcpy(dnamtrxname, tstr);
+ dnamatnum = 1;
+ }
+ else if (strcmp(tstr,"clustalw")==0) {
+ strcpy(dnamtrxname, tstr);
+ dnamatnum = 2;
+ }
+ else {
+ if(user_mat(param_arg[setdnamatrix], userdnamat, dna_xref))
+ {
+ strcpy(dnamtrxname,param_arg[setdnamatrix]);
+ strcpy(dnausermtrxname,param_arg[setdnamatrix]);
+ dnamatnum=3;
+ }
+ else exit(1);
+ }
+
+ }
+ }
+/*** ? /maxdiv= n */
+ if(setmaxdiv != -1) {
+ temp = 0;
+ if(strlen(param_arg[setmaxdiv]) > 0)
+ if (sscanf(param_arg[setmaxdiv],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /maxdiv (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0)
+ divergence_cutoff = temp;
+ }
+
+/*** ? /gapdist= n */
+ if(setgapdist != -1) {
+ temp = 0;
+ if(strlen(param_arg[setgapdist]) > 0)
+ if (sscanf(param_arg[setgapdist],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /gapdist (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0)
+ gap_dist = temp;
+ }
+
+/*** ? /debug= n */
+ if(setdebug != -1) {
+ temp = 0;
+ if(strlen(param_arg[setdebug]) > 0)
+ if (sscanf(param_arg[setdebug],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /debug (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0)
+ debug = temp;
+ }
+
+/*** ? /outfile= (user's file) */
+ if(setoutfile != -1)
+ if(strlen(param_arg[setoutfile]) > 0) {
+ strcpy(outfile_name, param_arg[setoutfile]);
+ }
+
+/*** ? /case= lower/upper */
+ if(setcase != -1)
+ if(strlen(param_arg[setcase]) > 0) {
+ temp = find_match(param_arg[setcase],case_arg,2);
+ if(temp == 0) {
+ lowercase = TRUE;
+ }
+ else if(temp == 1) {
+ lowercase = FALSE;
+ }
+ else
+ fprintf(stdout,"\nUnknown case %s\n",
+ param_arg[setcase]);
+ }
+
+/*** ? /seqnos=off/on */
+ if(setseqno != -1)
+ if(strlen(param_arg[setseqno]) > 0) {
+ temp = find_match(param_arg[setseqno],seqno_arg,2);
+ if(temp == 0) {
+ cl_seq_numbers = FALSE;
+ }
+ else if(temp == 1) {
+ cl_seq_numbers = TRUE;
+ }
+ else
+ fprintf(stdout,"\nUnknown SEQNO option %s\n",
+ param_arg[setseqno]);
+ }
+
+
+
+ if(setseqno_range != -1)
+ if(strlen(param_arg[setseqno_range]) > 0) {
+ temp = find_match(param_arg[setseqno_range],seqno_range_arg,2);
+ printf("\n comparing ");
+ printf("\nparam_arg[setseqno_range]= %s", param_arg[setseqno_range]);
+ /* printf("\nseqno_range_arg = %s ",seqno_range_arg); */
+ printf("\n comparing \n ");
+
+ if(temp == 0) {
+ seqRange = FALSE;
+ }
+ else if(temp == 1) {
+ seqRange = TRUE;
+
+ }
+ else
+ fprintf(stdout,"\nUnknown Sequence range option %s\n",
+ param_arg[setseqno_range]);
+ }
+
+
+/*** ? /range=n:m */
+ if(setrange != -1) {
+ temp = 0;
+ if(strlen(param_arg[setrange]) > 0)
+ if (sscanf(param_arg[setrange],"%d:%d",&temp,&temp)!=2) {
+ fprintf(stdout,"setrange: Syntax Error: Cannot set range, should be from:to \n");
+ temp = 0;
+ }
+ }
+
+/*** ? /range=n:m */
+
+
+
+/*** ? /gapopen=n */
+ if(setgapopen != -1) {
+ ftemp = 0.0;
+ if(strlen(param_arg[setgapopen]) > 0)
+ if (sscanf(param_arg[setgapopen],"%f",&ftemp)!=1) {
+ fprintf(stdout,"Bad option for /gapopen (must be real number)\n");
+ ftemp = 0.0;
+ }
+ if(ftemp >= 0.0)
+ if(dnaflag) {
+ gap_open = ftemp;
+ dna_gap_open = gap_open;
+ }
+ else {
+ gap_open = ftemp;
+ prot_gap_open = gap_open;
+ }
+ }
+
+
+/*** ? /gapext=n */
+ if(setgapext != -1) {
+ ftemp = 0.0;
+ if(strlen(param_arg[setgapext]) > 0)
+ if (sscanf(param_arg[setgapext],"%f",&ftemp)!=1) {
+ fprintf(stdout,"Bad option for /gapext (must be real number)\n");
+ ftemp = 0.0;
+ }
+ if(ftemp >= 0)
+ if(dnaflag) {
+ gap_extend = ftemp;
+ dna_gap_extend = gap_extend;
+ }
+ else {
+ gap_extend = ftemp;
+ prot_gap_extend = gap_extend;
+ }
+ }
+
+/*** ? /transweight=n*/
+ if(settransweight != -1) {
+ ftemp = 0.0;
+ if(strlen(param_arg[settransweight]) > 0)
+ if (sscanf(param_arg[settransweight],"%f",&ftemp)!=1) {
+ fprintf(stdout,"Bad option for /transweight (must be real number)\n");
+ ftemp = 0.0;
+ }
+ transition_weight=ftemp;
+ }
+
+/*** ? /pwgapopen=n */
+ if(setpwgapopen != -1) {
+ ftemp = 0.0;
+ if(strlen(param_arg[setpwgapopen]) > 0)
+ if (sscanf(param_arg[setpwgapopen],"%f",&ftemp)!=1) {
+ fprintf(stdout,"Bad option for /pwgapopen (must be real number)\n");
+ ftemp = 0.0;
+ }
+ if(ftemp >= 0.0)
+ if(dnaflag) {
+ pw_go_penalty = ftemp;
+ dna_pw_go_penalty = pw_go_penalty;
+ }
+ else {
+ pw_go_penalty = ftemp;
+ prot_pw_go_penalty = pw_go_penalty;
+ }
+ }
+
+
+/*** ? /gapext=n */
+ if(setpwgapext != -1) {
+ ftemp = 0.0;
+ if(strlen(param_arg[setpwgapext]) > 0)
+ if (sscanf(param_arg[setpwgapext],"%f",&ftemp)!=1) {
+ fprintf(stdout,"Bad option for /pwgapext (must be real number)\n");
+ ftemp = 0.0;
+ }
+ if(ftemp >= 0)
+ if(dnaflag) {
+ pw_ge_penalty = ftemp;
+ dna_pw_ge_penalty = pw_ge_penalty;
+ }
+ else {
+ pw_ge_penalty = ftemp;
+ prot_pw_ge_penalty = pw_ge_penalty;
+ }
+ }
+
+
+
+/*** ? /outorder=n */
+ if(setoutorder != -1) {
+ if(strlen(param_arg[setoutorder]) > 0)
+ temp = find_match(param_arg[setoutorder],outorder_arg,2);
+ if(temp == 0) {
+ output_order = INPUT;
+ }
+ else if(temp == 1) {
+ output_order = ALIGNED;
+ }
+ else
+ fprintf(stdout,"\nUnknown OUTPUT ORDER type %s\n",
+ param_arg[setoutorder]);
+ }
+
+/*** ? /bootlabels=n */
+ if(setbootlabels != -1) {
+ if(strlen(param_arg[setbootlabels]) > 0)
+ temp = find_match(param_arg[setbootlabels],bootlabels_arg,2);
+ if(temp == 0) {
+ bootstrap_format = BS_NODE_LABELS;
+ }
+ else if(temp == 1) {
+ bootstrap_format = BS_BRANCH_LABELS;
+ }
+ else
+ fprintf(stdout,"\nUnknown bootlabels type %s\n",
+ param_arg[setoutorder]);
+ }
+
+/*** ? /endgaps */
+ if(setuseendgaps != -1)
+ use_endgaps = FALSE;
+
+/*** ? /nopgap */
+ if(setnopgap != -1)
+ no_pref_penalties = TRUE;
+
+/*** ? /nohgap */
+ if(setnohgap != -1)
+ no_hyd_penalties = TRUE;
+
+/*** ? /novgap */
+ if(setnovgap != -1)
+ no_var_penalties = FALSE;
+
+/*** ? /hgapresidues="string" */
+ if(sethgapres != -1)
+ if(strlen(param_arg[sethgapres]) > 0) {
+ for (i=0;i<strlen(hyd_residues) && i<26;i++) {
+ c = param_arg[sethgapres][i];
+ if (isalpha(c))
+ hyd_residues[i] = (char)toupper(c);
+ else
+ break;
+ }
+ }
+
+
+/*** ? /nosecstr1 */
+ if(setsecstr1 != -1)
+ use_ss1 = FALSE;
+
+/*** ? /nosecstr2 */
+ if(setsecstr2 != -1)
+ use_ss2 = FALSE;
+
+/*** ? /secstroutput */
+ if(setsecstroutput != -1)
+ if(strlen(param_arg[setsecstroutput]) > 0) {
+ temp = find_match(param_arg[setsecstroutput],outputsecstr_arg,4);
+ if(temp >= 0 && temp <= 3)
+ output_struct_penalties = temp;
+ else
+ fprintf(stdout,"\nUnknown case %s\n",
+ param_arg[setsecstroutput]);
+ }
+
+
+/*** ? /helixgap= n */
+ if(sethelixgap != -1) {
+ temp = 0;
+ if(strlen(param_arg[sethelixgap]) > 0)
+ if (sscanf(param_arg[sethelixgap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /helixgap (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 1 && temp <= 9)
+ helix_penalty = temp;
+ }
+
+/*** ? /strandgap= n */
+ if(setstrandgap != -1) {
+ temp = 0;
+ if(strlen(param_arg[setstrandgap]) > 0)
+ if (sscanf(param_arg[setstrandgap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /strandgap (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 1 && temp <= 9)
+ strand_penalty = temp;
+ }
+
+/*** ? /loopgap= n */
+ if(setloopgap != -1) {
+ temp = 0;
+ if(strlen(param_arg[setloopgap]) > 0)
+ if (sscanf(param_arg[setloopgap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /loopgap (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 1 && temp <= 9)
+ loop_penalty = temp;
+ }
+
+/*** ? /terminalgap= n */
+ if(setterminalgap != -1) {
+ temp = 0;
+ if(strlen(param_arg[setterminalgap]) > 0)
+ if (sscanf(param_arg[setterminalgap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /terminalgap (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 1 && temp <= 9) {
+ helix_end_penalty = temp;
+ strand_end_penalty = temp;
+ }
+ }
+
+/*** ? /helixendin= n */
+ if(sethelixendin != -1) {
+ temp = 0;
+ if(strlen(param_arg[sethelixendin]) > 0)
+ if (sscanf(param_arg[sethelixendin],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /helixendin (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0 && temp <= 3)
+ helix_end_minus = temp;
+ }
+
+/*** ? /helixendout= n */
+ if(sethelixendout != -1) {
+ temp = 0;
+ if(strlen(param_arg[sethelixendout]) > 0)
+ if (sscanf(param_arg[sethelixendout],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /helixendout (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0 && temp <= 3)
+ helix_end_plus = temp;
+ }
+
+/*** ? /strandendin= n */
+ if(setstrandendin != -1) {
+ temp = 0;
+ if(strlen(param_arg[setstrandendin]) > 0)
+ if (sscanf(param_arg[setstrandendin],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /strandendin (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0 && temp <= 3)
+ strand_end_minus = temp;
+ }
+
+/*** ? /strandendout= n */
+ if(setstrandendout != -1) {
+ temp = 0;
+ if(strlen(param_arg[setstrandendout]) > 0)
+ if (sscanf(param_arg[setstrandendout],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /strandendout (must be integer)\n");
+ temp = 0;
+ }
+ if (temp >= 0 && temp <= 3)
+ strand_end_plus = temp;
+ }
+
+}
+
+#ifdef UNIX
+/*
+ * open_path - open fname read-only, searching a list of directories.
+ *
+ * The search list is the PATH environment variable (when it is set)
+ * followed by /usr/share/clustalx and /usr/local/share/clustalx (added for
+ * the File System Standards).  Each directory is tried in order as
+ * "<dir>/<fname>"; a candidate that would not fit in the Mxdir-character
+ * work buffer is skipped.
+ *
+ * Returns the stream of the first candidate fopen() accepts, or NULL when
+ * fname is NULL or no candidate could be opened.
+ */
+FILE *open_path(char *fname)
+{
+#define Mxdir 70
+    static const char share_dirs[] = "/usr/share/clustalx:/usr/local/share/clustalx";
+    char dir[Mxdir+1];
+    char *env_path, *path1, *deb, *fin;
+    FILE *fich = NULL;
+    size_t lf, ltot;
+
+    if (fname == NULL) return NULL;
+
+    env_path = getenv("PATH");
+    if (env_path == NULL) env_path = "";    /* PATH may be unset */
+
+    /* room for PATH + ':' + share_dirs (sizeof already counts the NUL) */
+    path1 = (char *)ckalloc((strlen(env_path) + 1 + sizeof share_dirs) * sizeof(char));
+    strcpy(path1, env_path);
+    if (*path1 != '\0') strcat(path1, ":"); /* keep the last PATH entry separate */
+    strcat(path1, share_dirs);
+
+    lf = strlen(fname);
+    deb = path1;
+    do {
+        fin = strchr(deb, ':');
+        ltot = (fin != NULL) ? (size_t)(fin - deb) : strlen(deb);
+
+        /* check the length BEFORE copying anything into dir */
+        if (ltot + lf + 1 <= Mxdir) {
+            strncpy(dir, deb, ltot);
+            dir[ltot] = '/';
+            strcpy(dir + ltot + 1, fname);  /* dir now holds "<dir>/<fname>" */
+            if ((fich = fopen(dir, "r")) != NULL) break;
+        }
+        if (fin != NULL) deb = fin + 1;
+    } while (fin != NULL);
+
+    ckfree(path1);
+    return fich;
+}
+#endif
+
+
+/*
+ * get_help - print one section of the help file to stdout.
+ *
+ * help_pointer: the one-character section id.  A section starts at a line
+ * containing ">>HELP"; its id is the first character from "0-9A-Z" found at
+ * column 6 or 7 of that line.  The section ends at the next ">>HELP" line
+ * or at end of file.  Lines starting with '<' are not printed.
+ *
+ * When usemenu is set the output is paged every PAGE_LEN lines (answering
+ * X stops the listing) and a final "Press [RETURN]" prompt is shown.
+ * The help file is always closed before returning.
+ */
+void get_help(char help_pointer)
+{
+    FILE *help_file;
+    sint i, nlines;
+    char temp[MAXLINE+1];
+    char token = '\0';
+    char *digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    char *help_marker = ">>HELP";
+
+    extern char *help_file_name;
+
+#ifdef VMS
+    if((help_file=fopen(help_file_name,"r","rat=cr","rfm=var"))==NULL) {
+        error("Cannot open help file [%s]",help_file_name);
+        return;
+    }
+#else
+
+#ifdef UNIX
+    /* search the path directories first, then fall back to the plain name */
+    if((help_file=open_path(help_file_name))==NULL) {
+        if((help_file=fopen(help_file_name,"r"))==NULL) {
+            error("Cannot open help file [%s]",help_file_name);
+            return;
+        }
+    }
+
+#else
+    if((help_file=fopen(help_file_name,"r"))==NULL) {
+        error("Cannot open help file [%s]",help_file_name);
+        return;
+    }
+#endif
+
+#endif
+    nlines = 0;
+
+    while(fgets(temp,MAXLINE+1,help_file) != NULL) {
+        if(strstr(temp,help_marker)) {
+            token = ' ';
+            /* stop at the end of the line so a short marker line is never
+               read past its terminator */
+            for(i=(sint)strlen(help_marker); i<8 && temp[i]!='\0'; i++)
+                if(strchr(digits, temp[i])) {
+                    token = temp[i];
+                    break;
+                }
+        }
+        if(token != help_pointer)
+            continue;
+
+        /* wanted section found: print it up to the next marker or EOF */
+        while(fgets(temp,MAXLINE+1,help_file)) {
+            if(strstr(temp, help_marker))
+                break;
+            if(temp[0]!='<') {
+                fputs(temp,stdout);
+                ++nlines;
+            }
+            if(usemenu && nlines >= PAGE_LEN) {
+                fprintf(stdout,"\n");
+                getstr("Press [RETURN] to continue or X to stop",lin2);
+                if(toupper(*lin2) == 'X') {
+                    fclose(help_file);
+                    return;
+                }
+                nlines = 0;
+            }
+        }
+        if(usemenu) {
+            fprintf(stdout,"\n");
+            getstr("Press [RETURN] to continue",lin2);
+        }
+        fclose(help_file);
+        /* return here: the stream is closed, so the outer loop must not
+           touch it again (the section may have ended at end of file) */
+        return;
+    }
+
+    /* end of file reached without ever entering the wanted section */
+    error("No help found in help file");
+    fclose(help_file);
+}
+
+/*
+ * show_aln - page the most recently written alignment file to the screen.
+ *
+ * The file shown is the output name of the first enabled format, tested in
+ * the order clustal, nbrf, gcg, phylip, gde, nexus, fasta.  Output pauses
+ * every PAGE_LEN lines; answering X stops the listing.
+ */
+static void show_aln(void)
+{
+    FILE *file;
+    sint nlines;
+    char temp[MAXLINE+1];
+    char file_name[FILENAMELEN+1];
+
+    /* start from an empty name so that, when no output format is enabled,
+       fopen() fails cleanly instead of reading an uninitialised buffer */
+    file_name[0] = EOS;
+
+    if(output_clustal) strcpy(file_name,clustal_outname);
+    else if(output_nbrf) strcpy(file_name,nbrf_outname);
+    else if(output_gcg) strcpy(file_name,gcg_outname);
+    else if(output_phylip) strcpy(file_name,phylip_outname);
+    else if(output_gde) strcpy(file_name,gde_outname);
+    else if(output_nexus) strcpy(file_name,nexus_outname);
+    else if(output_fasta) strcpy(file_name,fasta_outname);
+
+#ifdef VMS
+    if((file=fopen(file_name,"r","rat=cr","rfm=var"))==NULL) {
+#else
+    if((file=fopen(file_name,"r"))==NULL) {
+#endif
+        error("Cannot open file [%s]",file_name);
+        return;
+    }
+
+    fprintf(stdout,"\n\n");
+    nlines = 0;
+
+    while(fgets(temp,MAXLINE+1,file)) {
+        fputs(temp,stdout);
+        ++nlines;
+        if(nlines >= PAGE_LEN) {
+            fprintf(stdout,"\n");
+            getstr("Press [RETURN] to continue or X to stop",lin2);
+            if(toupper(*lin2) == 'X') {
+                fclose(file);
+                return;
+            }
+            else
+                nlines = 0;
+        }
+    }
+    fclose(file);
+    fprintf(stdout,"\n");
+    getstr("Press [RETURN] to continue",lin2);
+}
+
+
+/*
+ * parse_params - act on the command line and, in batch mode, run the job.
+ *
+ * xmenus: when TRUE the start-up banner is not printed and usemenu is not
+ * switched on by the interactive return path.
+ *
+ * Flow, as visible in this function:
+ *  1. check_param(args, params, param_arg) supplies numparams; a negative
+ *     value exits with status 1.
+ *  2. The help and options switches print their text and exit with 1.
+ *  3. An explicit sequence type, the input file and the two profile files
+ *     are processed; failures to read them terminate the program.
+ *  4. The action switches are turned into do_* flags; when none is set and
+ *     sequences are loaded (empty is FALSE), alignment is the default.
+ *  5. In interactive mode the function returns to its caller.  Otherwise
+ *     it runs at most one action and calls exit(0); it does not return.
+ *
+ * The set* variables are compared with -1 and otherwise used as indices
+ * into param_arg (-1 presumably means "switch not given" -- confirm
+ * against check_param).
+ */
+void parse_params(Boolean xmenus)
+{
+ sint i,j,len,temp;
+ static sint cl_error_code=0;
+ char path[FILENAMELEN];
+
+
+ Boolean do_align, do_convert, do_align_only, do_tree_only, do_tree, do_boot, do_profile, do_something;
+
+ if (!xmenus)
+ {
+ fprintf(stdout,"\n\n\n");
+ fprintf(stdout," CLUSTAL %s Multiple Sequence Alignments\n\n\n",revision_level);
+ }
+
+ do_align = do_convert = do_align_only = do_tree_only = do_tree = do_boot = do_profile = do_something = FALSE;
+
+ *seqname=EOS;
+
+/* JULIE
+ len=(sint)strlen(paramstr);
+ Stop converting command line to lower case - unix, mac, pc are case sensitive
+ for(i=0;i<len;++i) paramstr[i]=tolower(paramstr[i]);
+*/
+
+ numparams = check_param(args, params, param_arg);
+ if (numparams <0) exit(1);
+
+ /* help switch: print help section '9' and stop */
+ if(sethelp != -1) {
+ get_help('9');
+ exit(1);
+ }
+
+ /* options switch: list the three option tables (verbs, files, parameters).
+    Each table ends at an entry whose str is empty; for OPTARG entries the
+    permitted argument values are listed too. */
+ if(setoptions != -1) {
+ fprintf(stdout,"clustalw option list:-\n");
+ for (i=0;cmd_line_verb[i].str[0] != '\0';i++) {
+ fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_verb[i].str,cmd_line_type[cmd_line_verb[i].type]);
+ if (cmd_line_verb[i].type == OPTARG) {
+ if (cmd_line_verb[i].arg[0][0] != '\0')
+ fprintf(stdout,"=%s",cmd_line_verb[i].arg[0]);
+ for (j=1;cmd_line_verb[i].arg[j][0] != '\0';j++)
+ fprintf(stdout," OR %s",cmd_line_verb[i].arg[j]);
+ }
+ fprintf(stdout,"\n");
+ }
+ for (i=0;cmd_line_file[i].str[0] != '\0';i++) {
+ fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_file[i].str,cmd_line_type[cmd_line_file[i].type]);
+ if (cmd_line_file[i].type == OPTARG) {
+ if (cmd_line_file[i].arg[0][0] != '\0')
+ fprintf(stdout,"=%s",cmd_line_file[i].arg[0]);
+ for (j=1;cmd_line_file[i].arg[j][0] != '\0';j++)
+ fprintf(stdout," OR %s",cmd_line_file[i].arg[j]);
+ }
+ fprintf(stdout,"\n");
+ }
+ for (i=0;cmd_line_para[i].str[0] != '\0';i++) {
+ fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_para[i].str,cmd_line_type[cmd_line_para[i].type]);
+ if (cmd_line_para[i].type == OPTARG) {
+ if (cmd_line_para[i].arg[0][0] != '\0')
+ fprintf(stdout,"=%s",cmd_line_para[i].arg[0]);
+ for (j=1;cmd_line_para[i].arg[j][0] != '\0';j++)
+ fprintf(stdout," OR %s",cmd_line_para[i].arg[j]);
+ }
+ fprintf(stdout,"\n");
+ }
+ exit(1);
+ }
+
+
+/*****************************************************************************/
+/* Check to see if sequence type is explicitely stated..override ************/
+/* the automatic checking (DNA or Protein). /type=d or /type=p *************/
+/*****************************************************************************/
+ if(settype != -1)
+ if(strlen(param_arg[settype])>0) {
+ temp = find_match(param_arg[settype],type_arg,2);
+ if(temp == 0) {
+ dnaflag = FALSE;
+ explicit_dnaflag = TRUE;
+ info("Sequence type explicitly set to Protein");
+ }
+ else if(temp == 1) {
+ info("Sequence type explicitly set to DNA");
+ dnaflag = TRUE;
+ explicit_dnaflag = TRUE;
+ }
+ else
+ fprintf(stdout,"\nUnknown sequence type %s\n",
+ param_arg[settype]);
+ }
+
+
+/***************************************************************************
+* check to see if 1st parameter does not start with '/' i.e. look for an *
+* input file as first parameter. The input file can also be specified *
+* by /infile=fname. *
+****************************************************************************/
+/* JULIE - moved to check_param()
+ if(paramstr[0] != '/') {
+ strcpy(seqname, params[0]);
+ }
+*/
+
+/**************************************************/
+/* Look for /infile=file.ext on the command line */
+/**************************************************/
+
+ if(setinfile != -1) {
+ if(strlen(param_arg[setinfile]) <= 0) {
+ error("Bad sequence file name");
+ exit(1);
+ }
+ strcpy(seqname, param_arg[setinfile]);
+ }
+
+ /* a sequence file was named: read it now.  Fewer than two sequences is
+    fatal; the exit status is 2 (negative count), 3 (zero) or 4 (one). */
+ if(*seqname != EOS) {
+ profile_no = 0;
+ nseqs = readseqs((sint)1);
+ if(nseqs < 2) {
+ if(nseqs < 0) cl_error_code = 2;
+ else if(nseqs == 0) cl_error_code = 3;
+ else cl_error_code = 4;
+ fprintf(stdout,
+ "\nNo. of seqs. read = %d. No alignment!\n",(pint)nseqs);
+ exit(cl_error_code);
+ }
+ for(i = 1; i<=nseqs; i++)
+ info("Sequence %d: %-*s %6.d %s",
+ (pint)i,max_names,names[i],(pint)seqlen_array[i],dnaflag?"bp":"aa");
+ empty = FALSE;
+ do_something = TRUE;
+ }
+
+ /* called after the sequences are read, so dnaflag is already set when
+    the optional parameters are applied */
+ set_optional_param();
+
+/*********************************************************/
+/* Look for /profile1=file.ext AND /profile2=file2.ext */
+/* You must give both file names OR neither. */
+/*********************************************************/
+
+ if(setprofile1 != -1) {
+ if(strlen(param_arg[setprofile1]) <= 0) {
+ error("Bad profile 1 file name");
+ exit(1);
+ }
+ strcpy(seqname, param_arg[setprofile1]);
+ profile_no = 1;
+ profile_input();
+ if(nseqs <= 0) {
+ if(nseqs<0) cl_error_code=2;
+ else if(nseqs==0) cl_error_code=3;
+ exit(cl_error_code);
+ }
+ strcpy(profile1_name,seqname);
+ }
+
+ if(setprofile2 != -1) {
+ if(strlen(param_arg[setprofile2]) <= 0) {
+ error("Bad profile 2 file name");
+ exit(1);
+ }
+ if(profile1_empty) {
+ error("Only 1 profile file (profile 2) specified.");
+ exit(1);
+ }
+ strcpy(seqname, param_arg[setprofile2]);
+ profile_no = 2;
+ profile_input();
+ if(nseqs > profile1_nseqs)
+ do_something = do_profile = TRUE;
+ else {
+ /* NOTE(review): when nseqs is positive here cl_error_code keeps its
+    previous value, which may still be 0 -- confirm the intended status */
+ if(nseqs<0) cl_error_code=2;
+ else if(nseqs==0) cl_error_code=3;
+ error("No sequences read from profile 2");
+ exit(cl_error_code);
+ }
+ strcpy(profile2_name,seqname);
+ }
+
+/*************************************************************************/
+/* Look for /tree or /bootstrap or /align or /usetree ******************/
+/*************************************************************************/
+
+ if (setbatch != -1)
+ interactive=FALSE;
+
+ if (setinteractive != -1)
+ interactive=TRUE;
+
+ /* interactive mode: discard the action switches listed here so none of
+    the corresponding do_* flags gets set below */
+ if (interactive) {
+ settree = -1;
+ setbootstrap = -1;
+ setalign = -1;
+ setusetree = -1;
+ setusetree1 = -1;
+ setusetree2 = -1;
+ setnewtree = -1;
+ setconvert = -1;
+ }
+
+ /* In each test below the else pairs with the INNER if (empty /
+    profileN_empty), not with the outer switch test. */
+ if(settree != -1 )
+ if(empty) {
+ error("Cannot draw tree. No input alignment file");
+ exit(1);
+ }
+ else
+ do_tree = TRUE;
+
+ if(setbootstrap != -1)
+ if(empty) {
+ error("Cannot bootstrap tree. No input alignment file");
+ exit(1);
+ }
+ else {
+ temp = 0;
+ /* NOTE(review): unlike the other numeric options this tests for a NULL
+    pointer rather than an empty string -- confirm which check_param gives */
+ if(param_arg[setbootstrap] != NULL)
+ if (sscanf(param_arg[setbootstrap],"%d",&temp)!=1) {
+ fprintf(stdout,"Bad option for /bootstrap (must be integer)\n");
+ temp = 0;
+ };
+ if(temp > 0) boot_ntrials = temp;
+ do_boot = TRUE;
+ }
+
+ if(setalign != -1)
+ if(empty) {
+ error("Cannot align sequences. No input file");
+ exit(1);
+ }
+ else
+ do_align = TRUE;
+
+ if(setconvert != -1)
+ if(empty) {
+ error("Cannot convert sequences. No input file");
+ exit(1);
+ }
+ else
+ do_convert = TRUE;
+
+ if(setusetree != -1)
+ if(empty) {
+ error("Cannot align sequences. No input file");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setusetree]) == 0) {
+ error("Cannot align sequences. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(phylip_tree_name, param_arg[setusetree]);
+ }
+ use_tree_file = TRUE;
+ do_align_only = TRUE;
+ }
+
+ if(setnewtree != -1)
+ if(empty) {
+ error("Cannot align sequences. No input file");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setnewtree]) == 0) {
+ error("Cannot align sequences. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(phylip_tree_name, param_arg[setnewtree]);
+ }
+ new_tree_file = TRUE;
+ do_tree_only = TRUE;
+ }
+
+ if(setusetree1 != -1)
+ if(profile1_empty) {
+ error("Cannot align profiles. No input file");
+ exit(1);
+ }
+ else if(profile_type == SEQUENCE) {
+ error("Invalid option /usetree1.");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setusetree1]) == 0) {
+ error("Cannot align profiles. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(p1_tree_name, param_arg[setusetree1]);
+ }
+ use_tree1_file = TRUE;
+ do_align_only = TRUE;
+ }
+
+ if(setnewtree1 != -1)
+ if(profile1_empty) {
+ error("Cannot align profiles. No input file");
+ exit(1);
+ }
+ else if(profile_type == SEQUENCE) {
+ error("Invalid option /newtree1.");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setnewtree1]) == 0) {
+ error("Cannot align profiles. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(p1_tree_name, param_arg[setnewtree1]);
+ }
+ new_tree1_file = TRUE;
+ }
+
+ if(setusetree2 != -1)
+ if(profile2_empty) {
+ error("Cannot align profiles. No input file");
+ exit(1);
+ }
+ else if(profile_type == SEQUENCE) {
+ error("Invalid option /usetree2.");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setusetree2]) == 0) {
+ error("Cannot align profiles. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(p2_tree_name, param_arg[setusetree2]);
+ }
+ use_tree2_file = TRUE;
+ do_align_only = TRUE;
+ }
+
+ if(setnewtree2 != -1)
+ if(profile2_empty) {
+ error("Cannot align profiles. No input file");
+ exit(1);
+ }
+ else if(profile_type == SEQUENCE) {
+ error("Invalid option /newtree2.");
+ exit(1);
+ }
+ else {
+ if(strlen(param_arg[setnewtree2]) == 0) {
+ error("Cannot align profiles. No tree file specified");
+ exit(1);
+ }
+ else {
+ strcpy(p2_tree_name, param_arg[setnewtree2]);
+ }
+ new_tree2_file = TRUE;
+ }
+
+
+ /* no explicit action but sequences are loaded: default to a full alignment */
+ if( (!do_tree) && (!do_boot) && (!empty) && (!do_profile) && (!do_align_only) && (!do_tree_only) && (!do_convert))
+ do_align = TRUE;
+
+/*** ? /quicktree */
+ if(setquicktree != -1)
+ quick_pairalign = TRUE;
+
+ /* load the working parameters from the DNA or the protein set */
+ if(dnaflag) {
+ gap_open = dna_gap_open;
+ gap_extend = dna_gap_extend;
+ pw_go_penalty = dna_pw_go_penalty;
+ pw_ge_penalty = dna_pw_ge_penalty;
+ ktup = dna_ktup;
+ window = dna_window;
+ signif = dna_signif;
+ wind_gap = dna_wind_gap;
+
+ }
+ else {
+ gap_open = prot_gap_open;
+ gap_extend = prot_gap_extend;
+ pw_go_penalty = prot_pw_go_penalty;
+ pw_ge_penalty = prot_pw_ge_penalty;
+ ktup = prot_ktup;
+ window = prot_window;
+ signif = prot_signif;
+ wind_gap = prot_wind_gap;
+ }
+
+ /* interactive mode: this is the only path that returns to the caller */
+ if(interactive) {
+ if (!xmenus) usemenu = TRUE;
+ return;
+ }
+
+
+ if(!do_something) {
+ error("No input file(s) specified");
+ exit(1);
+ }
+
+
+
+
+/****************************************************************************/
+/* Now do whatever has been requested ***************************************/
+/****************************************************************************/
+
+
+ /* the chain below runs at most one action, in this order of precedence */
+ if(do_profile) {
+ if (profile_type == PROFILE) profile_align(p1_tree_name,p2_tree_name);
+ else new_sequence_align(phylip_tree_name);
+ }
+
+ else if(do_align)
+ align(phylip_tree_name);
+
+ else if(do_convert) {
+ get_path(seqname,path);
+ if(!open_alignment_output(path)) exit(1);
+ create_alignment_output(1,nseqs);
+ }
+
+ else if (do_align_only)
+ get_tree(phylip_tree_name);
+
+ else if(do_tree_only)
+ make_tree(phylip_tree_name);
+
+ else if(do_tree)
+ phylogenetic_tree(phylip_tree_name,clustal_tree_name,dist_tree_name,nexus_tree_name,pim_name);
+
+ else if(do_boot)
+ bootstrap_tree(phylip_tree_name,clustal_tree_name,nexus_tree_name);
+
+ fprintf(stdout,"\n");
+ exit(0);
+
+/*******whew!***now*go*home****/
+}
+
+
+/*
+ * user_mat - load a user-supplied comparison matrix.
+ *
+ * str:  in/out file name.  In menu mode the name is prompted for, otherwise
+ *       str is used as given; on success the name actually used is copied
+ *       back into str.
+ * mat, xref: passed through unchanged to read_user_matrix().
+ *
+ * Returns TRUE when the file can be opened and read_user_matrix() returns
+ * a positive value, FALSE otherwise (including an empty name).
+ */
+Boolean user_mat(char *str, short *mat, short *xref)
+{
+    sint maxres;
+    FILE *infile;
+
+    if(usemenu)
+        getstr("Enter name of the matrix file",lin2);
+    else
+        strcpy(lin2,str);
+
+    if(*lin2 == EOS) return FALSE;
+
+    if((infile=fopen(lin2,"r"))==NULL) {
+        error("Cannot find matrix file [%s]",lin2);
+        return FALSE;
+    }
+    /* the stream is only an existence check: read_user_matrix() is given
+       the file name, not this handle, so close it instead of leaking it */
+    fclose(infile);
+
+    strcpy(str, lin2);
+
+    maxres = read_user_matrix(str, mat, xref);
+    if (maxres <= 0) return FALSE;
+
+    return TRUE;
+}
+
+/*
+ * user_mat_series - load a user-supplied series of comparison matrices.
+ *
+ * str:  in/out file name.  In menu mode the name is prompted for, otherwise
+ *       str is used as given; on success the name actually used is copied
+ *       back into str.
+ * mat, xref: passed through unchanged to read_matrix_series().
+ *
+ * Returns TRUE when the file can be opened and read_matrix_series()
+ * returns a positive value, FALSE otherwise (including an empty name).
+ */
+Boolean user_mat_series(char *str, short *mat, short *xref)
+{
+    sint maxres;
+    FILE *infile;
+
+    if(usemenu)
+        getstr("Enter name of the matrix file",lin2);
+    else
+        strcpy(lin2,str);
+
+    if(*lin2 == EOS) return FALSE;
+
+    if((infile=fopen(lin2,"r"))==NULL) {
+        error("Cannot find matrix file [%s]",lin2);
+        return FALSE;
+    }
+    /* the stream is only an existence check: read_matrix_series() is given
+       the file name, not this handle, so close it instead of leaking it */
+    fclose(infile);
+
+    strcpy(str, lin2);
+
+    maxres = read_matrix_series(str, mat, xref);
+    if (maxres <= 0) return FALSE;
+
+    return TRUE;
+}
+
+
+
+
+
+
+/*
+ * seq_input - read a set of sequences through readseqs().
+ *
+ * append: when TRUE the new sequences are read in after the nseqs already
+ *         held and nseqs grows by the number read; when FALSE reading
+ *         starts at sequence 1 and nseqs is replaced.
+ *
+ * Returns the value of readseqs(): negative or zero on failure (nothing
+ * else is changed), otherwise the number of sequences read.  On success
+ * the per-profile structure masks are released, gap_open/gap_extend are
+ * reloaded from the DNA or protein set and empty is cleared.
+ */
+sint seq_input(Boolean append)
+{
+    sint k;
+    sint first_seq;
+    sint nread;
+
+    if (usemenu) {
+        fprintf(stdout,"\n\nSequences should all be in 1 file.\n");
+        fprintf(stdout,"\n7 formats accepted: \n");
+        fprintf(stdout,"NBRF/PIR, EMBL/SwissProt, Pearson (Fasta), GDE, Clustal, GCG/MSF, RSF.\n\n\n");
+    }
+
+    /* index of the first sequence slot to fill */
+    first_seq = append ? nseqs+(sint)1 : (sint)1;
+    nread = readseqs(first_seq);
+
+    if (nread < 0)          /* file could not be opened */
+        return nread;
+
+    if (nread == 0) {       /* no sequences */
+        error("No sequences in file! Bad format?");
+        return nread;
+    }
+
+    /* new sequences invalidate any profile structure information */
+    struct_penalties1 = struct_penalties2 = NONE;
+    if (sec_struct_mask1 != NULL)
+        sec_struct_mask1 = ckfree(sec_struct_mask1);
+    if (sec_struct_mask2 != NULL)
+        sec_struct_mask2 = ckfree(sec_struct_mask2);
+    if (gap_penalty_mask1 != NULL)
+        gap_penalty_mask1 = ckfree(gap_penalty_mask1);
+    if (gap_penalty_mask2 != NULL)
+        gap_penalty_mask2 = ckfree(gap_penalty_mask2);
+    if (ss_name1 != NULL)
+        ss_name1 = ckfree(ss_name1);
+    if (ss_name2 != NULL)
+        ss_name2 = ckfree(ss_name2);
+
+    nseqs = append ? nseqs + nread : nread;
+
+    info("Sequences assumed to be %s", dnaflag?"DNA":"PROTEIN");
+
+    if (usemenu) {
+        fprintf(stdout,"\n\n");
+        for (k = 1; k <= nseqs; k++)
+            info("Sequence %d: %-*s %6.d %s",
+                 (pint)k,max_names,names[k],(pint)seqlen_array[k],dnaflag?"bp":"aa");
+    }
+
+    if (dnaflag) {
+        gap_open = dna_gap_open;
+        gap_extend = dna_gap_extend;
+    }
+    else {
+        gap_open = prot_gap_open;
+        gap_extend = prot_gap_extend;
+    }
+    empty = FALSE;
+
+    return nread;
+}
+
+
+
+
+
+
+
+/*
+ * profile_input - read the sequences of profile 1 or profile 2 through
+ * readseqs().  Which profile is read is selected by the global profile_no:
+ * 1 reads starting at sequence 1, any other value reads profile 2 starting
+ * at sequence profile1_nseqs+1.
+ *
+ * Returns 0 when profile 2 is requested while profile 1 is still empty,
+ * the readseqs() value when that is negative or zero, and otherwise the
+ * updated global nseqs (the total, not just the number read).
+ *
+ * On success, when struct_penalties is not NONE, the shared
+ * sec_struct_mask / gap_penalty_mask / ss_name data are copied into the
+ * copies owned by the profile just read; the shared ones are then freed.
+ */
+sint profile_input(void) /* read a profile */
+{ /* profile_no is 1 or 2 */
+ sint local_nseqs, i;
+
+ if(profile_no == 2 && profile1_empty)
+ {
+ error("You must read in profile number 1 first");
+ return 0;
+ }
+
+ if(profile_no == 1) /* for the 1st profile */
+ {
+ local_nseqs = readseqs((sint)1); /* (1) means 1st seq to be read = no. 1 */
+ if(local_nseqs < 0) /* file could not be opened */
+ {
+ return local_nseqs;
+ }
+ else if(local_nseqs == 0) /* no sequences */
+ {
+ error("No sequences in file! Bad format?");
+ return local_nseqs;
+ }
+ else if (local_nseqs > 0)
+ { /* success; found some seqs. */
+ /* drop whatever structure data profile 1 held before */
+ struct_penalties1 = NONE;
+ if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
+ if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
+ if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
+ if (struct_penalties != NONE) /* feature table / mask in alignment */
+ {
+ struct_penalties1 = struct_penalties;
+ /* the secondary structure mask is copied only for type SECST;
+    the gap penalty mask and the name are copied in every case */
+ if (struct_penalties == SECST) {
+ sec_struct_mask1 = (char *)ckalloc((max_aln_length) * sizeof (char));
+ for (i=0;i<max_aln_length;i++)
+ sec_struct_mask1[i] = sec_struct_mask[i];
+ }
+ gap_penalty_mask1 = (char *)ckalloc((max_aln_length) * sizeof (char));
+ for (i=0;i<max_aln_length;i++)
+ gap_penalty_mask1[i] = gap_penalty_mask[i];
+ ss_name1 = (char *)ckalloc( (MAXNAMES+1) * sizeof (char));
+
+ strcpy(ss_name1,ss_name);
+if (debug>0) {
+for (i=0;i<seqlen_array[1];i++)
+ fprintf(stdout,"%c",gap_penalty_mask1[i]);
+fprintf(stdout,"\n");
+}
+ }
+ nseqs = profile1_nseqs = local_nseqs;
+ info("No. of seqs=%d",(pint)nseqs);
+ /* reading profile 1 marks profile 2 as empty again */
+ profile1_empty=FALSE;
+ profile2_empty=TRUE;
+ }
+ }
+ else
+ { /* first seq to be read = profile1_nseqs + 1 */
+ local_nseqs = readseqs(profile1_nseqs+(sint)1);
+ if(local_nseqs < 0) /* file could not be opened */
+ {
+ return local_nseqs;
+ }
+ else if(local_nseqs == 0) /* no sequences */
+ {
+ error("No sequences in file! Bad format?");
+ return local_nseqs;
+ }
+ else if(local_nseqs > 0)
+ {
+ /* drop whatever structure data profile 2 held before */
+ struct_penalties2 = NONE;
+ if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
+ if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
+ if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
+ if (struct_penalties != NONE) /* feature table / mask in alignment */
+ {
+ struct_penalties2 = struct_penalties;
+ if (struct_penalties == SECST) {
+ sec_struct_mask2 = (char *)ckalloc((max_aln_length) * sizeof (char));
+ for (i=0;i<max_aln_length;i++)
+ sec_struct_mask2[i] = sec_struct_mask[i];
+ }
+ gap_penalty_mask2 = (char *)ckalloc((max_aln_length) * sizeof (char));
+ for (i=0;i<max_aln_length;i++)
+ gap_penalty_mask2[i] = gap_penalty_mask[i];
+ ss_name2 = (char *)ckalloc( (MAXNAMES+1) * sizeof (char));
+ strcpy(ss_name2,ss_name);
+if (debug>0) {
+for (i=0;i<seqlen_array[profile1_nseqs+1];i++)
+ fprintf(stdout,"%c",gap_penalty_mask2[i]);
+fprintf(stdout,"\n");
+}
+ }
+ info("No. of seqs in profile=%d",(pint)local_nseqs);
+ nseqs = profile1_nseqs + local_nseqs;
+ info("Total no. of seqs =%d",(pint)nseqs);
+ profile2_empty=FALSE;
+ empty = FALSE;
+ }
+
+ }
+ /* only the success paths reach this point (the failures returned above):
+    the shared masks have been copied where needed, so release them */
+ if (sec_struct_mask != NULL) sec_struct_mask=ckfree(sec_struct_mask);
+ if (gap_penalty_mask != NULL) gap_penalty_mask=ckfree(gap_penalty_mask);
+ if (ss_name != NULL) ss_name=ckfree(ss_name);
+
+ /* cannot trigger: every non-positive count has already returned */
+ if(local_nseqs<=0) return local_nseqs;
+
+ info("Sequences assumed to be %s",
+ dnaflag?"DNA":"PROTEIN");
+ if (usemenu) fprintf(stdout,"\n\n");
+ /* list the sequences just read: all of them after profile 1, only the
+    new ones after profile 2 */
+ for(i=profile2_empty?1:profile1_nseqs+1; i<=nseqs; i++) {
+ info("Sequence %d: %-*s %6.d %s",
+ (pint)i,max_names,names[i],(pint)seqlen_array[i],dnaflag?"bp":"aa");
+ }
+ if(dnaflag) {
+ gap_open = dna_gap_open;
+ gap_extend = dna_gap_extend;
+ }
+ else {
+ gap_open = prot_gap_open;
+ gap_extend = prot_gap_extend;
+ }
+
+ return nseqs;
+}
+
+
+
+/*
+ * calc_gap_penalty_mask - turn a secondary structure mask into a gap
+ * penalty mask.
+ *
+ * prf_length: number of positions in mask and gap_mask.
+ * mask:       input; 'a'/'A' marks helix, 'b'/'B' marks strand, '$' and
+ *             '%' are the helix and strand markers that end a core run.
+ * gap_mask:   output; position i receives the character penalty+'0' where
+ *             the penalty is helix_penalty or strand_penalty for the core
+ *             of an element, helix_end_penalty or strand_end_penalty for
+ *             its termini, and loop_penalty everywhere else.
+ *
+ * For each element the termini are the first and last *_end_minus
+ * positions inside it and the *_end_plus positions on either side of it
+ * (positions in front are only claimed when not already helix or strand).
+ */
+static void calc_gap_penalty_mask(int prf_length, char *mask, char *gap_mask)
+{
+    int i,j;
+    char *struct_mask;
+
+    struct_mask = (char *)ckalloc((prf_length+1) * sizeof(char));
+    /* struct_mask is read before every position has been written; clear it
+       explicitly rather than depend on what ckalloc returns */
+    for (i=0; i<=prf_length; i++)
+        struct_mask[i] = '\0';
+
+/*
+    calculate the gap penalty mask from the secondary structures
+*/
+    i=0;
+    while (i<prf_length) {
+        if (tolower(mask[i]) == 'a' || mask[i] == '$') {
+            /* positions in front of the helix */
+            for (j = -helix_end_plus; j<0; j++) {
+                if ((i+j>=0) && (tolower(struct_mask[i+j]) != 'a')
+                             && (tolower(struct_mask[i+j]) != 'b'))
+                    struct_mask[i+j] = 'a';
+            }
+            /* first positions inside the helix */
+            for (j = 0; j<helix_end_minus; j++) {
+                if (i+j>=prf_length || (tolower(mask[i+j]) != 'a'
+                                    && mask[i+j] != '$')) break;
+                struct_mask[i+j] = 'a';
+            }
+            i += j;
+            /* helix core; the bound is tested before mask[i] is read */
+            while (i<prf_length && (tolower(mask[i]) == 'a'
+                                    || mask[i] == '$')) {
+                if (mask[i] == '$') {
+                    struct_mask[i] = 'A';
+                    i++;
+                    break;
+                }
+                else struct_mask[i] = mask[i];
+                i++;
+            }
+            /* last positions inside the helix */
+            for (j = 0; j<helix_end_minus; j++) {
+                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'a'
+                                   || mask[i-j-1] == '$'))
+                    struct_mask[i-j-1] = 'a';
+            }
+            /* positions after the helix */
+            for (j = 0; j<helix_end_plus; j++) {
+                if (i+j>=prf_length) break;
+                struct_mask[i+j] = 'a';
+            }
+        }
+        else if (tolower(mask[i]) == 'b' || mask[i] == '%') {
+            /* positions in front of the strand */
+            for (j = -strand_end_plus; j<0; j++) {
+                if ((i+j>=0) && (tolower(struct_mask[i+j]) != 'a')
+                             && (tolower(struct_mask[i+j]) != 'b'))
+                    struct_mask[i+j] = 'b';
+            }
+            /* first positions inside the strand */
+            for (j = 0; j<strand_end_minus; j++) {
+                if (i+j>=prf_length || (tolower(mask[i+j]) != 'b'
+                                    && mask[i+j] != '%')) break;
+                struct_mask[i+j] = 'b';
+            }
+            i += j;
+            /* strand core; the bound is tested before mask[i] is read */
+            while (i<prf_length && (tolower(mask[i]) == 'b'
+                                    || mask[i] == '%')) {
+                if (mask[i] == '%') {
+                    struct_mask[i] = 'B';
+                    i++;
+                    break;
+                }
+                else struct_mask[i] = mask[i];
+                i++;
+            }
+            /* last positions inside the strand */
+            for (j = 0; j<strand_end_minus; j++) {
+                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'b'
+                                   || mask[i-j-1] == '%'))
+                    struct_mask[i-j-1] = 'b';
+            }
+            /* positions after the strand */
+            for (j = 0; j<strand_end_plus; j++) {
+                if (i+j>=prf_length) break;
+                struct_mask[i+j] = 'b';
+            }
+        }
+        else i++;
+    }
+
+    /* translate the structure classes into penalty characters */
+    for(i=0;i<prf_length;i++) {
+        switch (struct_mask[i]) {
+            case 'A':
+                gap_mask[i] = helix_penalty+'0';
+                break;
+            case 'a':
+                gap_mask[i] = helix_end_penalty+'0';
+                break;
+            case 'B':
+                gap_mask[i] = strand_penalty+'0';
+                break;
+            case 'b':
+                gap_mask[i] = strand_end_penalty+'0';
+                break;
+            default:
+                gap_mask[i] = loop_penalty+'0';
+                break;
+        }
+    }
+
+    struct_mask=ckfree(struct_mask);
+
+}
+
+/*
+ * print_sec_struct_mask - classify the positions of a secondary structure
+ * mask for display.
+ *
+ * prf_length:  number of positions in mask and struct_mask.
+ * mask:        input; 'a'/'A' marks helix, 'b'/'B' marks strand, '$' and
+ *              '%' are the helix and strand markers that end a core run.
+ * struct_mask: output, written only at helix and strand positions: the
+ *              first and last helix_end_minus / strand_end_minus positions
+ *              of an element become 'a' / 'b', a '$' / '%' position
+ *              becomes 'A' / 'B', and other core positions keep the
+ *              character from mask.  Positions outside any element are
+ *              left as the caller supplied them.
+ */
+void print_sec_struct_mask(int prf_length, char *mask, char *struct_mask)
+{
+    int i,j;
+
+    i=0;
+    while (i<prf_length) {
+        if (tolower(mask[i]) == 'a' || mask[i] == '$') {
+            /* first positions inside the helix */
+            for (j = 0; j<helix_end_minus; j++) {
+                if (i+j>=prf_length || (tolower(mask[i+j]) != 'a'
+                                    && mask[i+j] != '$')) break;
+                struct_mask[i+j] = 'a';
+            }
+            i += j;
+            /* helix core; the bound is tested before mask[i] is read */
+            while (i<prf_length && (tolower(mask[i]) == 'a'
+                                    || mask[i] == '$')) {
+                if (mask[i] == '$') {
+                    struct_mask[i] = 'A';
+                    i++;
+                    break;
+                }
+                else struct_mask[i] = mask[i];
+                i++;
+            }
+            /* last positions inside the helix */
+            for (j = 0; j<helix_end_minus; j++) {
+                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'a'
+                                   || mask[i-j-1] == '$'))
+                    struct_mask[i-j-1] = 'a';
+            }
+        }
+        else if (tolower(mask[i]) == 'b' || mask[i] == '%') {
+            /* first positions inside the strand */
+            for (j = 0; j<strand_end_minus; j++) {
+                if (i+j>=prf_length || (tolower(mask[i+j]) != 'b'
+                                    && mask[i+j] != '%')) break;
+                struct_mask[i+j] = 'b';
+            }
+            i += j;
+            /* strand core; the bound is tested before mask[i] is read */
+            while (i<prf_length && (tolower(mask[i]) == 'b'
+                                    || mask[i] == '%')) {
+                if (mask[i] == '%') {
+                    struct_mask[i] = 'B';
+                    i++;
+                    break;
+                }
+                else struct_mask[i] = mask[i];
+                i++;
+            }
+            /* last positions inside the strand */
+            for (j = 0; j<strand_end_minus; j++) {
+                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'b'
+                                   || mask[i-j-1] == '%'))
+                    struct_mask[i-j-1] = 'b';
+            }
+        }
+        else i++;
+    }
+}
+
+
+
+/*
+ * open_output_file - build an output file name and open it for writing.
+ *
+ * prompt:         text shown (menu mode only) when asking for the name.
+ * path:           leading part of the default name.
+ * file_name:      out; receives path followed by file_extension, or the
+ *                 name typed by the user in menu mode.
+ * file_extension: trailing part of the default name.
+ *
+ * A warning is issued when the default name equals seqname.  In menu mode
+ * the user may type a replacement; an empty answer keeps the default.
+ *
+ * Returns the opened stream, or NULL (after reporting the error) when the
+ * file cannot be opened.
+ */
+FILE * open_output_file(char *prompt, char *path,
+                        char *file_name, char *file_extension)
+
+{
+    char temp[FILENAMELEN+1];
+    FILE * file_handle;
+    Boolean same_as_input;
+    size_t len;
+    int ch;
+
+    strcpy(file_name,path);
+    strcat(file_name,file_extension);
+
+    same_as_input = (strcmp(file_name,seqname)==0) ? TRUE : FALSE;
+    if(same_as_input)
+        warning("Output file name is the same as input file.");
+
+    if (usemenu) {
+        /* the prompt is printed as data, never used as a format string */
+        if(same_as_input)
+            fprintf(stdout,"\n\nEnter new name to avoid overwriting  [%s]: ",file_name);
+        else
+            fprintf(stdout,"%s [%s]: ",prompt,file_name);
+
+        /* bounded read in place of gets() */
+        *temp = EOS;
+        if(fgets(temp,(int)sizeof temp,stdin) != NULL) {
+            len = strcspn(temp,"\r\n");
+            if(temp[len] == EOS) {
+                /* no line end seen: discard the rest of an overlong line */
+                while((ch = getchar()) != '\n' && ch != EOF)
+                    ;
+            }
+            temp[len] = EOS;
+        }
+        if(*temp != EOS) strcpy(file_name,temp);
+    }
+
+#ifdef VMS
+    if((file_handle=fopen(file_name,"w","rat=cr","rfm=var"))==NULL) {
+#else
+    if((file_handle=fopen(file_name,"w"))==NULL) {
+#endif
+        error("Cannot open output file [%s]",file_name);
+        return NULL;
+    }
+    return file_handle;
+}
+
+
+
+/*
+ * open_explicit_file: open the named file for writing.
+ *
+ * Returns the open FILE, or NULL after reporting through error() when the
+ * name is empty or the file cannot be opened.
+ */
+FILE * open_explicit_file(char *file_name)
+{
+    FILE *out = NULL;
+
+    if (file_name[0] != EOS) {
+#ifdef VMS
+        out = fopen(file_name,"w","rat=cr","rfm=var");
+#else
+        out = fopen(file_name,"w");
+#endif
+        if (out == NULL)
+            error("Cannot open output file [%s]",file_name);
+    }
+    else {
+        error("Bad output file [%s]",file_name);
+    }
+    return out;
+}
+
+
+/* Ramu void */
+
+/*
+ * align: pairwise-align the loaded sequences, write a guide tree file and
+ * run the multiple alignment.
+ *
+ *   phylip_name  guide tree file name; when empty a default name is built
+ *                from the sequence file path (and may be changed by the
+ *                user in menu mode).  Reset to the empty string after a
+ *                successful alignment.
+ */
+void align(char *phylip_name)
+{
+    char path[FILENAMELEN+1];
+    FILE *tree = NULL;
+    sint count;
+
+    if (usemenu && empty) {
+        error("No sequences in memory. Load sequences first.");
+        return;
+    }
+
+    /* drop secondary structure / gap penalty mask data */
+    struct_penalties2 = NONE;
+    struct_penalties1 = NONE;
+    if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
+    if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
+    if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
+    if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
+    if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
+    if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
+
+    get_path(seqname,path);
+
+    if (usemenu || !interactive) {
+        if (!open_alignment_output(path)) return;
+    }
+
+    /* open the guide tree output file before any alignment work starts */
+    if (nseqs >= 2) {
+        get_path(seqname,path);
+        if (phylip_name[0] == EOS)
+            tree = open_output_file(
+                "\nEnter name for new GUIDE TREE file ",path,
+                phylip_name,"dnd");
+        else
+            tree = open_explicit_file(phylip_name);
+        if (tree == NULL) return;
+    }
+
+    if (save_parameters) create_parameter_output();
+
+    if (reset_alignments_new || reset_alignments_all) reset_align();
+
+    info("Start of Pairwise alignments");
+    info("Aligning...");
+
+    /* select the parameter set that matches the sequence type */
+    if (!dnaflag) {
+        gap_open = prot_gap_open;
+        gap_extend = prot_gap_extend;
+        pw_go_penalty = prot_pw_go_penalty;
+        pw_ge_penalty = prot_pw_ge_penalty;
+        ktup = prot_ktup;
+        window = prot_window;
+        signif = prot_signif;
+        wind_gap = prot_wind_gap;
+    }
+    else {
+        gap_open = dna_gap_open;
+        gap_extend = dna_gap_extend;
+        pw_go_penalty = dna_pw_go_penalty;
+        pw_ge_penalty = dna_pw_ge_penalty;
+        ktup = dna_ktup;
+        window = dna_window;
+        signif = dna_signif;
+        wind_gap = dna_wind_gap;
+    }
+
+    if (!quick_pairalign)
+        pairalign((sint)0,nseqs,(sint)0,nseqs);
+    else
+        show_pair((sint)0,nseqs,(sint)0,nseqs);
+
+    if (nseqs >= 2) {
+        guide_tree(tree,1,nseqs);
+        info("Guide tree file created: [%s]",
+             phylip_name);
+    }
+
+    count = malign((sint)0,phylip_name);
+    if (count <= 0) return;
+
+    if (usemenu) fprintf(stdout,"\n\n\n");
+
+    create_alignment_output(1,nseqs);
+    if (showaln && usemenu) show_aln();
+    phylip_name[0]=EOS;
+}
+
+
+
+
+
+/*
+ * new_sequence_align: align the sequences of the second input against the
+ * first profile.
+ *
+ *   phylip_name  guide tree file name.  Set to the existing "<path>dnd"
+ *                file when the user accepts it in menu mode; otherwise
+ *                used (or built, when empty) as the name of a new tree
+ *                file.  Reset to the empty string after a successful
+ *                alignment.
+ *
+ * use_ss2 is temporarily forced to FALSE when the second input holds more
+ * than one sequence; it is restored on every exit path after that point.
+ */
+void new_sequence_align(char *phylip_name)
+{
+    char path[FILENAMELEN+1];
+    char tree_name[FILENAMELEN+1],temp[MAXLINE+1];
+    Boolean use_tree;
+    FILE *tree = NULL;
+    sint i,j,count;
+    float dscore;
+    Boolean save_ss2;
+
+    if(profile1_empty && usemenu) {
+        error("No profile in memory. Input 1st profile first.");
+        return;
+    }
+
+    if(profile2_empty && usemenu) {
+        error("No sequences in memory. Input sequences first.");
+        return;
+    }
+
+    get_path(profile2_name,path);
+
+    if(usemenu || !interactive) {
+        if(!open_alignment_output(path)) return;
+    }
+
+    new_seq = profile1_nseqs+1;
+
+/* check for secondary structure information for list of sequences */
+
+    save_ss2 = use_ss2;
+    if (struct_penalties2 != NONE && use_ss2 == TRUE
+        && (nseqs - profile1_nseqs > 1)) {
+        if (struct_penalties2 == SECST)
+            warning("Warning: ignoring secondary structure for a list of sequences");
+        else if (struct_penalties2 == GMASK)
+            warning("Warning: ignoring gap penalty mask for a list of sequences");
+        use_ss2 = FALSE;
+    }
+
+    /* distances from percent identity */
+    for (i=1;i<=new_seq;i++) {
+        for (j=i+1;j<=new_seq;j++) {
+            dscore = countid(i,j);
+            tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0;
+            tmat[j][i] = tmat[i][j];
+        }
+    }
+
+    tree_name[0] = EOS;
+    use_tree = FALSE;
+    if (nseqs >= 2) {
+        if (check_tree && usemenu) {
+            strcpy(tree_name,path);
+            strcat(tree_name,"dnd");
+#ifdef VMS
+            tree = fopen(tree_name,"r","rat=cr","rfm=var");
+#else
+            tree = fopen(tree_name,"r");
+#endif
+            if (tree != NULL) {
+                /* usemenu is known to be true in this branch */
+                fprintf(stdout,"\nUse the existing GUIDE TREE file,  %s  (y/n) ? [y]: ",
+                        tree_name);
+                /* bounded read replaces gets(); EOF counts as the default answer */
+                if (fgets(temp,sizeof temp,stdin) == NULL) temp[0] = EOS;
+                temp[strcspn(temp,"\r\n")] = EOS;
+                if(*temp != 'n' && *temp != 'N') {
+                    strcpy(phylip_name,tree_name);
+                    use_tree = TRUE;
+                }
+                fclose(tree);
+            }
+        }
+        else if (!usemenu && use_tree_file) {
+            use_tree = TRUE;
+        }
+    }
+
+    if (save_parameters) create_parameter_output();
+
+    if(reset_alignments_new || reset_alignments_all) {
+/*
+        reset_prf1();
+*/
+        reset_prf2();
+    }
+    else fix_gaps();
+
+    if (struct_penalties1 == SECST)
+        calc_gap_penalty_mask(seqlen_array[1],sec_struct_mask1,gap_penalty_mask1);
+
+    if (struct_penalties2 == SECST)
+        calc_gap_penalty_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,gap_penalty_mask2);
+
+/* create the new tree file, if necessary */
+
+    if (use_tree == FALSE) {
+
+        if (nseqs >= 2) {
+            get_path(profile2_name,path);
+            if (phylip_name[0]!=EOS)
+                tree = open_explicit_file(phylip_name);
+            else
+                tree = open_output_file(
+                    "\nEnter name for new GUIDE TREE file ",path,
+                    phylip_name,"dnd");
+            if (tree == NULL) {
+                use_ss2 = save_ss2;     /* do not leave the setting forced off */
+                return;
+            }
+        }
+        info("Start of Pairwise alignments");
+        info("Aligning...");
+        if(dnaflag) {
+            gap_open = dna_gap_open;
+            gap_extend = dna_gap_extend;
+            pw_go_penalty = dna_pw_go_penalty;
+            pw_ge_penalty = dna_pw_ge_penalty;
+            ktup = dna_ktup;
+            window = dna_window;
+            signif = dna_signif;
+            wind_gap = dna_wind_gap;
+        }
+        else {
+            gap_open = prot_gap_open;
+            gap_extend = prot_gap_extend;
+            pw_go_penalty = prot_pw_go_penalty;
+            pw_ge_penalty = prot_pw_ge_penalty;
+            ktup = prot_ktup;
+            window = prot_window;
+            signif = prot_signif;
+            wind_gap = prot_wind_gap;
+        }
+
+        if (quick_pairalign)
+            show_pair((sint)0,nseqs,new_seq-2,nseqs);
+        else
+            pairalign((sint)0,nseqs,new_seq-2,nseqs);
+
+        if (nseqs >= 2) {
+            guide_tree(tree,1,nseqs);
+            info("Guide tree file created: [%s]",
+                 phylip_name);
+        }
+    }
+
+    if (new_tree_file) {
+        use_ss2 = save_ss2;
+        return;
+    }
+
+    count = seqalign(new_seq-2,phylip_name);
+
+    use_ss2 = save_ss2;
+
+    if (count <= 0) return;
+
+    if (usemenu) fprintf(stdout,"\n\n\n");
+
+    create_alignment_output(1,nseqs);
+    if (showaln && usemenu) show_aln();
+
+    phylip_name[0]=EOS;
+}
+
+
+
+
+
+/*
+ * make_tree: pairwise-align the loaded sequences and write the guide tree
+ * file, without running the multiple alignment.
+ *
+ *   phylip_name  guide tree file name; when empty a default name is built
+ *                from the sequence file path.  Reset to the empty string
+ *                once the tree has been written.
+ */
+void make_tree(char *phylip_name)
+{
+    char path[FILENAMELEN+1];
+    FILE *tree;
+
+    if (empty) {
+        error("No sequences in memory. Load sequences first.");
+        return;
+    }
+
+    /* drop secondary structure / gap penalty mask data */
+    struct_penalties2 = NONE;
+    struct_penalties1 = NONE;
+    if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
+    if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
+    if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
+    if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
+    if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
+    if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
+
+    if (reset_alignments_new || reset_alignments_all) reset_align();
+
+    get_path(seqname,path);
+
+    if (nseqs < 2) {
+        error("Less than 2 sequences in memory. Phylogenetic tree cannot be built.");
+        return;
+    }
+
+    if (save_parameters) create_parameter_output();
+
+    info("Start of Pairwise alignments");
+    info("Aligning...");
+
+    /* select the parameter set that matches the sequence type */
+    if (!dnaflag) {
+        gap_open = prot_gap_open;
+        gap_extend = prot_gap_extend;
+        pw_go_penalty = prot_pw_go_penalty;
+        pw_ge_penalty = prot_pw_ge_penalty;
+        ktup = prot_ktup;
+        window = prot_window;
+        signif = prot_signif;
+        wind_gap = prot_wind_gap;
+    }
+    else {
+        gap_open = dna_gap_open;
+        gap_extend = dna_gap_extend;
+        pw_go_penalty = dna_pw_go_penalty;
+        pw_ge_penalty = dna_pw_ge_penalty;
+        ktup = dna_ktup;
+        window = dna_window;
+        signif = dna_signif;
+        wind_gap = dna_wind_gap;
+    }
+
+    if (!quick_pairalign)
+        pairalign((sint)0,nseqs,(sint)0,nseqs);
+    else
+        show_pair((sint)0,nseqs,(sint)0,nseqs);
+
+    if (nseqs >= 2) {
+        get_path(seqname,path);
+        if (phylip_name[0] == EOS)
+            tree = open_output_file(
+                "\nEnter name for new GUIDE TREE file ",path,
+                phylip_name,"dnd");
+        else
+            tree = open_explicit_file(phylip_name);
+        if (tree == NULL) return;
+
+        guide_tree(tree,1,nseqs);
+        info("Guide tree file created: [%s]",
+             phylip_name);
+    }
+
+    if (reset_alignments_new || reset_alignments_all) reset_align();
+
+    phylip_name[0]=EOS;
+}
+
+
+
+
+
+
+
+
+
+/*
+ * get_tree: run the multiple alignment using an existing guide tree file.
+ *
+ *   phylip_name  guide tree file name.  In menu mode it is replaced by
+ *                "<path>dnd" or by the name the user types.  Reset to the
+ *                empty string after a successful alignment.
+ *
+ * NOTE(review): `tree` is not declared in this function, so it must be a
+ * file-scope variable; the handle opened here is not closed in this
+ * function - confirm who owns it.
+ * NOTE(review): the typed name may be up to MAXLINE characters and is
+ * copied into phylip_name unchecked - confirm the caller's buffer size.
+ */
+void get_tree(char *phylip_name)
+{
+    char path[FILENAMELEN+1],temp[MAXLINE+1];
+    sint count;
+
+    if(empty) {
+        error("No sequences in memory. Load sequences first.");
+        return;
+    }
+    struct_penalties1 = struct_penalties2 = NONE;
+    if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
+    if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
+    if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
+    if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
+    if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
+    if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
+
+    get_path(seqname,path);
+
+    if(usemenu || !interactive) {
+        if(!open_alignment_output(path)) return;
+    }
+
+    if(reset_alignments_new || reset_alignments_all) reset_align();
+
+    get_path(seqname,path);
+
+    if (nseqs >= 2) {
+
+        if(usemenu) {
+            strcpy(phylip_name,path);
+            strcat(phylip_name,"dnd");
+
+            fprintf(stdout,"\nEnter a name for the guide tree file [%s]: ",
+                    phylip_name);
+            /* bounded read replaces gets(); EOF or an empty line keeps the default */
+            if (fgets(temp,sizeof temp,stdin) == NULL) temp[0] = EOS;
+            temp[strcspn(temp,"\r\n")] = EOS;
+            if(*temp != EOS)
+                strcpy(phylip_name,temp);
+        }
+
+        if(usemenu || !interactive) {
+            /* make sure the tree file is readable before aligning */
+#ifdef VMS
+            tree = fopen(phylip_name,"r","rat=cr","rfm=var");
+#else
+            tree = fopen(phylip_name,"r");
+#endif
+            if (tree == NULL) {
+                error("Cannot open tree file [%s]",phylip_name);
+                return;
+            }
+        }
+    }
+    else {
+        info("Start of Pairwise alignments");
+        info("Aligning...");
+        if(dnaflag) {
+            gap_open = dna_gap_open;
+            gap_extend = dna_gap_extend;
+            pw_go_penalty = dna_pw_go_penalty;
+            pw_ge_penalty = dna_pw_ge_penalty;
+            ktup = dna_ktup;
+            window = dna_window;
+            signif = dna_signif;
+            wind_gap = dna_wind_gap;
+        }
+        else {
+            gap_open = prot_gap_open;
+            gap_extend = prot_gap_extend;
+            pw_go_penalty = prot_pw_go_penalty;
+            pw_ge_penalty = prot_pw_ge_penalty;
+            ktup = prot_ktup;
+            window = prot_window;
+            signif = prot_signif;
+            wind_gap = prot_wind_gap;
+        }
+
+        if (quick_pairalign)
+            show_pair((sint)0,nseqs,(sint)0,nseqs);
+        else
+            pairalign((sint)0,nseqs,(sint)0,nseqs);
+    }
+
+    if (save_parameters) create_parameter_output();
+
+    count = malign(0,phylip_name);
+    if (count <= 0) return;
+
+    if (usemenu) fprintf(stdout,"\n\n\n");
+
+    create_alignment_output(1,nseqs);
+    if (showaln && usemenu) show_aln();
+
+    phylip_name[0]=EOS;
+}
+
+
+
+/*
+ * profile_align: align profile 2 against profile 1.
+ *
+ *   p1_tree_name  guide tree file name for profile 1
+ *   p2_tree_name  guide tree file name for profile 2
+ *
+ * For each profile holding at least two sequences an existing "<path>dnd"
+ * tree may be reused (asked in menu mode, or selected by use_tree1_file /
+ * use_tree2_file otherwise); if not, a new tree file is written.  Both
+ * names are reset to the empty string after a successful alignment.
+ */
+void profile_align(char *p1_tree_name,char *p2_tree_name)
+{
+    char path[FILENAMELEN+1];
+    char tree_name[FILENAMELEN+1];
+    char temp[MAXLINE+1];
+    Boolean use_tree1,use_tree2;
+    FILE *tree;
+    sint count,i,j,dscore;
+
+    if(profile1_empty || profile2_empty) {
+        error("No sequences in memory. Load sequences first.");
+        return;
+    }
+
+    get_path(profile1_name,path);
+
+    if(usemenu || !interactive) {
+        if(!open_alignment_output(path)) return;
+    }
+
+    if(reset_alignments_new || reset_alignments_all) {
+        reset_prf1();
+        reset_prf2();
+    }
+    else fix_gaps();
+
+    /* profile 1: reuse an existing guide tree? */
+    tree_name[0] = EOS;
+    use_tree1 = FALSE;
+    if (profile1_nseqs >= 2) {
+        if (check_tree && usemenu) {
+            strcpy(tree_name,path);
+            strcat(tree_name,"dnd");
+#ifdef VMS
+            tree = fopen(tree_name,"r","rat=cr","rfm=var");
+#else
+            tree = fopen(tree_name,"r");
+#endif
+            if (tree != NULL) {
+                fprintf(stdout,"\nUse the existing GUIDE TREE file for Profile 1,  %s  (y/n) ? [y]: ",
+                        tree_name);
+                /* bounded read replaces gets(); EOF counts as the default answer */
+                if (fgets(temp,sizeof temp,stdin) == NULL) temp[0] = EOS;
+                temp[strcspn(temp,"\r\n")] = EOS;
+                if(*temp != 'n' && *temp != 'N') {
+                    strcpy(p1_tree_name,tree_name);
+                    use_tree1 = TRUE;
+                }
+                fclose(tree);
+            }
+        }
+        else if (!usemenu && use_tree1_file) {
+            use_tree1 = TRUE;
+        }
+    }
+
+    /* profile 2: reuse an existing guide tree? */
+    tree_name[0] = EOS;
+    use_tree2 = FALSE;
+    get_path(profile2_name,path);
+    if (nseqs-profile1_nseqs >= 2) {
+        if (check_tree && usemenu) {
+            strcpy(tree_name,path);
+            strcat(tree_name,"dnd");
+#ifdef VMS
+            tree = fopen(tree_name,"r","rat=cr","rfm=var");
+#else
+            tree = fopen(tree_name,"r");
+#endif
+            if (tree != NULL) {
+                fprintf(stdout,"\nUse the existing GUIDE TREE file for Profile 2,  %s  (y/n) ? [y]: ",
+                        tree_name);
+                if (fgets(temp,sizeof temp,stdin) == NULL) temp[0] = EOS;
+                temp[strcspn(temp,"\r\n")] = EOS;
+                if(*temp != 'n' && *temp != 'N') {
+                    strcpy(p2_tree_name,tree_name);
+                    use_tree2 = TRUE;
+                }
+                fclose(tree);
+            }
+        }
+        else if (!usemenu && use_tree2_file) {
+            use_tree2 = TRUE;
+        }
+    }
+
+    if (save_parameters) create_parameter_output();
+
+    if (struct_penalties1 == SECST)
+        calc_gap_penalty_mask(seqlen_array[1],sec_struct_mask1,gap_penalty_mask1);
+
+    if (struct_penalties2 == SECST)
+        calc_gap_penalty_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,gap_penalty_mask2);
+
+    /* new guide tree for profile 1 */
+    if (use_tree1 == FALSE && profile1_nseqs >= 2) {
+        for (i=1;i<=profile1_nseqs;i++) {
+            for (j=i+1;j<=profile1_nseqs;j++) {
+                dscore = countid(i,j);
+                tmat[i][j] = (100.0 - dscore)/100.0;
+                tmat[j][i] = tmat[i][j];
+            }
+        }
+        get_path(profile1_name,path);
+        if (p1_tree_name[0]!=EOS) {
+            if((tree = open_explicit_file(p1_tree_name))==NULL) return;
+        }
+        else {
+            if((tree = open_output_file(
+                "\nEnter name for new GUIDE TREE file for profile 1 ",path,
+                p1_tree_name,"dnd")) == NULL) return;
+        }
+
+        guide_tree(tree,1,profile1_nseqs);
+        info("Guide tree file created: [%s]",
+             p1_tree_name);
+    }
+
+    /* new guide tree for profile 2 */
+    if (use_tree2 == FALSE && nseqs-profile1_nseqs >= 2) {
+        for (i=1+profile1_nseqs;i<=nseqs;i++) {
+            for (j=i+1;j<=nseqs;j++) {
+                dscore = countid(i,j);
+                tmat[i][j] = (100.0 - dscore)/100.0;
+                tmat[j][i] = tmat[i][j];
+            }
+        }
+        if (p2_tree_name[0]!=EOS) {
+            if((tree = open_explicit_file(p2_tree_name))==NULL) return;
+        }
+        else {
+            get_path(profile2_name,path);
+            if((tree = open_output_file(
+                "\nEnter name for new GUIDE TREE file for profile 2 ",path,
+                p2_tree_name,"dnd")) == NULL) return;
+        }
+        guide_tree(tree,profile1_nseqs+1,nseqs-profile1_nseqs);
+        info("Guide tree file created: [%s]",
+             p2_tree_name);
+    }
+
+    if (new_tree1_file || new_tree2_file) return;
+
+/* do an initial alignment to get the pairwise identities between the two
+profiles - used to set parameters for the final alignment */
+    count = palign1();
+    if (count == 0) return;
+
+    reset_prf1();
+    reset_prf2();
+
+    count = palign2(p1_tree_name,p2_tree_name);
+
+    if (count == 0) return;
+
+    if(usemenu) fprintf(stdout,"\n\n\n");
+
+    create_alignment_output(1,nseqs);
+    if (showaln && usemenu) show_aln();
+
+    p1_tree_name[0]=EOS;
+    p2_tree_name[0]=EOS;
+}
+
+
+
+
+
+
+ typedef struct rangeNum { /* residue range appended to a sequence name as "/start-end" by the output routines */
+ int start; /* first number of the range, filled in by fillrange() */
+ int end; /* last number of the range, filled in by fillrange() */
+ } rangeNum;
+
+
+/************************************************************************
+ *
+ * fillrange: work out the "start-end" numbers to print after the name
+ * of one sequence for the alignment columns fres .. fres+len-1.
+ *
+ * INPUT:   rnum   range object to fill in
+ *          fres   first alignment column to output
+ *          len    number of columns to output
+ *          fseq   position of the sequence in output_index[]
+ *
+ * RETURNS: nothing; rnum->start and rnum->end are set.
+ *
+ * If the sequence name already has the form "name/from-to", `from` is
+ * used as the offset of the numbering.
+ *
+ * The best thing would be to couple this with the seqnames structure
+ * (there is no struct for seqnames yet!)
+ *
+ * NOTE(review): this routine prints its working to stdout on every call;
+ * that output is kept exactly as it was.
+ */
+void fillrange(rangeNum *rnum, sint fres, sint len, sint fseq)
+{
+    sint val;
+    sint i;
+    sint j;
+
+    int istart = 0;
+    int iend = 0;       /* to print sequence start-end with names */
+    int found = 0;
+    int ngaps = 0;
+    int tmpStart = 0;
+    int tmpEnd = 0;
+    int ntermgaps = 0;
+    int pregaps = 0;
+    int tmpk = 0;
+    int isRange = 0;
+    int formula = 0;
+
+    i = output_index[fseq];
+
+    /* the name part is skipped, not stored: only the two numbers are needed,
+       and an unbounded %[ into a fixed buffer could overflow it */
+    if (sscanf(names[i],"%*[^/]/%d-%d",&tmpStart,&tmpEnd) == 2) {
+        isRange = 1;
+    }
+
+    /* gaps in the columns before fres - counted irrespective of the sscanf */
+    for (tmpk=1; tmpk<fres; tmpk++) {
+        val = seq_array[i][tmpk];
+        if ((val < 0) || (val > max_aa)) {  /* it is a gap */
+            pregaps++;
+        }
+    }
+
+    for (j=fres; j<fres+len; j++) {
+        val = seq_array[i][j];
+        if ((val == -3) || (val == 253))
+            break;
+        else if ((val < 0) || (val > max_aa)) {
+            ngaps++;
+        }
+        else {
+            found = j;
+        }
+        /* remember the first residue column and the gaps seen before it */
+        if ( found && (istart == 0) ) {
+            istart = found;
+            ntermgaps = ngaps;
+        }
+    }
+
+    if (seqRange) {
+        printf("Name : %s ",names[i]);
+        printf("\n fres = %d ",(int)fres);
+        printf(" len = %d ",(int)len);
+        printf("\n istart = %d ",istart);
+        printf("\n tmpStart = %d ",tmpStart);
+        printf("\n ngaps = %d ",ngaps);
+        printf("\n pregaps = %d ",pregaps);
+        if (!isRange)
+            formula = istart - pregaps;
+        else
+            formula = istart - pregaps + ( tmpStart == 1 ? 0: tmpStart-1) ;
+
+        printf("\n\nsuggestion istart - pregaps + tmpStart - ntermgaps = %d - %d + %d - %d",istart,
+               pregaps,tmpStart,ntermgaps);
+        printf(" formula %d ",formula);
+    }
+    else {
+        printf("\n no range found .... strange, istart = %d",istart);
+        formula = 1;
+    }
+
+    if (pregaps == fres-1)      /* all gaps - now the conditions........ */
+        formula = tmpStart ;    /* keep the previous start... */
+    formula = (formula <= 0) ? 1: formula;
+    if (pregaps ==0 && tmpStart == 0) {
+        formula = fres;
+    }
+    iend = formula + len - ngaps -1;
+
+    rnum->start = formula;
+    rnum->end = iend;
+    printf("\n check... %s %d - %d",names[i],rnum->start,rnum->end);
+    printf(" Done checking.........");
+}
+
+
+/*
+ * fasta_out: write sequences fseq..lseq (positions in output_index[]),
+ * alignment columns fres..fres+len-1, to fastaout in FASTA format.
+ *
+ * Each record is ">name" (with "/start-end" appended when seqRange is
+ * set) followed by the residues, line_length characters per line.  Gap
+ * positions are written as '-'.
+ *
+ * NOTE(review): residues are lower-cased when `lowercase` is set but are
+ * then passed through toupper() on output - confirm which is intended.
+ */
+void fasta_out(FILE *fastaout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char *seq, residue;
+    sint val;
+    sint i,ii;
+    sint j,slen;
+    sint line_length;
+    rangeNum rnum;      /* automatic storage: no allocation that can fail */
+
+    seq = (char *)ckalloc((len+1) * sizeof(char));
+
+    line_length=PAGEWIDTH-max_names;
+    line_length=line_length-line_length % 10; /* round to a multiple of 10*/
+    if (line_length > LINELENGTH) line_length=LINELENGTH;
+
+    for(ii=fseq; ii<=lseq; ii++) {
+        i = output_index[ii];
+        slen = 0;
+        for(j=fres; j<fres+len; j++) {
+            val = seq_array[i][j];
+            if((val == -3) || (val == 253))
+                break;
+            else if((val < 0) || (val > max_aa)) {
+                residue = '-';
+            }
+            else {
+                residue = amino_acid_codes[val];
+            }
+            if (lowercase)
+                seq[j-fres] = (char)tolower((int)residue);
+            else
+                seq[j-fres] = residue;
+            slen++;
+        }
+        fprintf(fastaout, ">%-s",nameonly(names[i]));
+        if(seqRange) {
+            fillrange(&rnum,fres, len, ii);
+            fprintf(fastaout,"/%d-%d",rnum.start, rnum.end);
+        }
+        fprintf(fastaout,"\n");
+        for(j=1; j<=slen; j++) {
+            fprintf(fastaout,"%c",toupper(seq[j-1]));
+            if((j % line_length == 0) || (j == slen))
+                fprintf(fastaout,"\n");
+        }
+    }
+    seq=ckfree((void *)seq);
+}
+
+
+/*
+ * clustal_out: write sequences fseq..lseq, alignment columns
+ * fres..fres+len-1, to clusout in CLUSTAL format.
+ *
+ * The alignment is written in blocks of line_length columns.  Each block
+ * holds the optional "!SS_" / "!GM_" secondary structure and gap penalty
+ * mask lines, one line per sequence (optionally followed by the running
+ * residue count when cl_seq_numbers is set), and a conservation line
+ * using '*' (column identical), ':' (all residues within one res_cat1
+ * group) and '.' (all within one res_cat2 group).
+ *
+ * The output file is deliberately not closed and reopened here: doing so
+ * opened duplicate files on VMS (DES).
+ */
+void clustal_out(FILE *clusout, sint fres, sint len, sint fseq, sint lseq)
+{
+    static char *seq1;
+    static sint *seq_no;
+    static sint *print_seq_no;
+    char *ss_mask1 = NULL, *ss_mask2 = NULL;
+    char temp[MAXLINE];
+    char c;
+    sint val;
+    sint ii,lv1,catident1[NUMRES],catident2[NUMRES],ident,chunks;
+    sint i,j,k,l;
+    sint pos,ptr;
+    sint line_length;
+
+    rangeNum rnum;                  /* automatic storage: no allocation that can fail */
+    char tmpStr[FILENAMELEN+32];    /* name + "/%d-%d" at full int width */
+
+    seq_no = (sint *)ckalloc((nseqs+1) * sizeof(sint));
+    print_seq_no = (sint *)ckalloc((nseqs+1) * sizeof(sint));
+    for (i=fseq;i<=lseq;i++)
+    {
+        print_seq_no[i] = seq_no[i] = 0;
+        for(j=1;j<fres;j++) {
+            val = seq_array[i][j];
+            /* count residues only: with || the test was always true and
+               gaps before fres were counted as residues */
+            if((val >=0) && (val <=max_aa)) seq_no[i]++;
+        }
+    }
+
+    seq1 = (char *)ckalloc((max_aln_length+1) * sizeof(char));
+
+    if (struct_penalties1 == SECST && use_ss1 == TRUE) {
+        ss_mask1 = (char *)ckalloc((seqlen_array[1]+10) * sizeof(char));
+        for (i=0;i<seqlen_array[1];i++)
+            ss_mask1[i] = sec_struct_mask1[i];
+        print_sec_struct_mask(seqlen_array[1],sec_struct_mask1,ss_mask1);
+    }
+    if (struct_penalties2 == SECST && use_ss2 == TRUE) {
+        ss_mask2 = (char *)ckalloc((seqlen_array[profile1_nseqs+1]+10) * sizeof(char));
+        for (i=0;i<seqlen_array[profile1_nseqs+1];i++)
+            ss_mask2[i] = sec_struct_mask2[i];
+        print_sec_struct_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,ss_mask2);
+    }
+
+    fprintf(clusout,"CLUSTAL %s multiple sequence alignment\n\n",
+            revision_level);
+
+    /* decide the line length for this alignment - maximum is LINELENGTH */
+    line_length=PAGEWIDTH-max_names;
+    line_length=line_length-line_length % 10; /* round to a multiple of 10*/
+    if (line_length > LINELENGTH) line_length=LINELENGTH;
+
+    chunks = len/line_length;
+    if(len % line_length != 0)
+        ++chunks;
+
+    for(lv1=1;lv1<=chunks;++lv1) {
+        pos = ((lv1-1)*line_length)+1;
+        ptr = (len<pos+line_length-1) ? len : pos+line_length-1;
+
+        fprintf(clusout,"\n");
+
+        /* secondary structure / gap penalty mask lines for profile 1 */
+        if (output_struct_penalties == 0 || output_struct_penalties == 2) {
+            if (struct_penalties1 == SECST && use_ss1 == TRUE) {
+                for(i=pos;i<=ptr;++i) {
+                    val=ss_mask1[i+fres-2];
+                    if (val == gap_pos1 || val == gap_pos2)
+                        temp[i-pos]='-';
+                    else
+                        temp[i-pos]=val;
+                }
+                temp[ptr-pos+1]=EOS;
+                if(seqRange) /*Ramu*/
+                    fprintf(clusout,"!SS_%-*s %s\n",max_names+15,ss_name1,temp);
+                else
+                    fprintf(clusout,"!SS_%-*s %s\n",max_names,ss_name1,temp);
+            }
+        }
+        if (output_struct_penalties == 1 || output_struct_penalties == 2) {
+            if (struct_penalties1 != NONE && use_ss1 == TRUE) {
+                for(i=pos;i<=ptr;++i) {
+                    val=gap_penalty_mask1[i+fres-2];
+                    if (val == gap_pos1 || val == gap_pos2)
+                        temp[i-pos]='-';
+                    else
+                        temp[i-pos]=val;
+                }
+                temp[ptr-pos+1]=EOS;
+                fprintf(clusout,"!GM_%-*s %s\n",max_names,ss_name1,temp);
+            }
+        }
+
+        /* the same two lines for profile 2 */
+        if (output_struct_penalties == 0 || output_struct_penalties == 2) {
+            if (struct_penalties2 == SECST && use_ss2 == TRUE) {
+                for(i=pos;i<=ptr;++i) {
+                    val=ss_mask2[i+fres-2];
+                    if (val == gap_pos1 || val == gap_pos2)
+                        temp[i-pos]='-';
+                    else
+                        temp[i-pos]=val;
+                }
+                temp[ptr-pos+1]=EOS;
+                if (seqRange )
+                    fprintf(clusout,"!SS_%-*s %s\n",max_names+15,ss_name2,temp);
+                else
+                    fprintf(clusout,"!SS_%-*s %s\n",max_names,ss_name2,temp);
+            }
+        }
+        if (output_struct_penalties == 1 || output_struct_penalties == 2) {
+            if (struct_penalties2 != NONE && use_ss2 == TRUE) {
+                for(i=pos;i<=ptr;++i) {
+                    val=gap_penalty_mask2[i+fres-2];
+                    if (val == gap_pos1 || val == gap_pos2)
+                        temp[i-pos]='-';
+                    else
+                        temp[i-pos]=val;
+                }
+                temp[ptr-pos+1]=EOS;
+                fprintf(clusout,"!GM_%-*s %s\n",max_names,ss_name2,temp);
+            }
+        }
+
+        /* one line per sequence */
+        for(ii=fseq;ii<=lseq;++ii) {
+            i=output_index[ii];
+            print_seq_no[i] = 0;
+            for(j=pos;j<=ptr;++j) {
+                if (j+fres-1<=seqlen_array[i])
+                    val = seq_array[i][j+fres-1];
+                else val = -3;
+                if((val == -3) || (val == 253)) break;
+                else if((val < 0) || (val > max_aa)){
+                    seq1[j]='-';
+                }
+                else {
+                    seq1[j]=amino_acid_codes[val];
+                    seq_no[i]++;
+                    print_seq_no[i]=1;
+                }
+            }
+            /* pad a short sequence with gaps to the end of the block */
+            for(;j<=ptr;++j) seq1[j]='-';
+            strncpy(temp,&seq1[pos],ptr-pos+1);
+            temp[ptr-pos+1]=EOS;
+            if (!seqRange) {
+                fprintf(clusout,"%-*s",max_names+5,names[i]);
+            }
+            else {
+                fillrange(&rnum,fres, len, ii);
+                sprintf(tmpStr,"%s/%d-%d", nameonly(names[i]), rnum.start, rnum.end);
+                fprintf(clusout,"%-*s",max_names+15,tmpStr);
+            }
+            fprintf(clusout," %s",temp);
+            if (cl_seq_numbers && print_seq_no[i])
+                fprintf(clusout," %d",seq_no[i]);
+            fprintf(clusout,"\n");
+        }
+
+        /* conservation line */
+        for(i=pos;i<=ptr;++i) {
+            seq1[i]=' ';
+            ident=0;
+            for(j=1;res_cat1[j-1]!=NULL;j++) catident1[j-1] = 0;
+            for(j=1;res_cat2[j-1]!=NULL;j++) catident2[j-1] = 0;
+            for(j=fseq;j<=lseq;++j) {
+                if((seq_array[fseq][i+fres-1] >=0) &&
+                   (seq_array[fseq][i+fres-1] <= max_aa)) {
+                    if(seq_array[fseq][i+fres-1] == seq_array[j][i+fres-1])
+                        ++ident;
+                    for(k=1;res_cat1[k-1]!=NULL;k++) {
+                        for(l=0;(c=res_cat1[k-1][l]);l++) {
+                            if (amino_acid_codes[seq_array[j][i+fres-1]]==c)
+                            {
+                                catident1[k-1]++;
+                                break;
+                            }
+                        }
+                    }
+                    for(k=1;res_cat2[k-1]!=NULL;k++) {
+                        for(l=0;(c=res_cat2[k-1][l]);l++) {
+                            if (amino_acid_codes[seq_array[j][i+fres-1]]==c)
+                            {
+                                catident2[k-1]++;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            if(ident==lseq-fseq+1)
+                seq1[i]='*';
+            else if (!dnaflag) {
+                for(k=1;res_cat1[k-1]!=NULL;k++) {
+                    if (catident1[k-1]==lseq-fseq+1) {
+                        seq1[i]=':';
+                        break;
+                    }
+                }
+                if(seq1[i]==' ')
+                    for(k=1;res_cat2[k-1]!=NULL;k++) {
+                        if (catident2[k-1]==lseq-fseq+1) {
+                            seq1[i]='.';
+                            break;
+                        }
+                    }
+            }
+        }
+        strncpy(temp,&seq1[pos],ptr-pos+1);
+        temp[ptr-pos+1]=EOS;
+        for(k=0;k<max_names+6;k++) fprintf(clusout," ");
+        if(seqRange) /*<ramu>*/
+            fprintf(clusout," "); /*</ramu>*/
+        fprintf(clusout,"%s\n",temp);
+    }
+
+    seq1=ckfree((void *)seq1);
+    /* the two counters were allocated on every call and never released */
+    seq_no=ckfree((void *)seq_no);
+    print_seq_no=ckfree((void *)print_seq_no);
+    if (struct_penalties1 == SECST && use_ss1 == TRUE) ckfree(ss_mask1);
+    if (struct_penalties2 == SECST && use_ss2 == TRUE) ckfree(ss_mask2);
+    /* DES ckfree(output_index); */
+}
+
+
+
+
+/*
+ * gcg_out: write sequences fseq..lseq, alignment columns
+ * fres..fres+len-1, to gcgout in GCG/MSF format.
+ *
+ * A checksum is computed per sequence with SeqGCGCheckSum() over the
+ * gap-padded columns; the header carries their sum modulo 10000.  The
+ * alignment follows in blocks of GCG_LINELENGTH columns, with gaps and
+ * positions past the end of a sequence written as '.'.
+ */
+void gcg_out(FILE *gcgout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char *seq, residue;
+    sint val;
+    sint *all_checks;
+    sint i,ii,chunks,block;
+    sint j,pos1,pos2;
+    long grand_checksum;
+
+    /*<ramu>*/
+    rangeNum rnum;                  /* automatic storage: no allocation that can fail */
+    char tmpStr[FILENAMELEN+32];    /* name + "/%d-%d" at full int width */
+
+    seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
+    all_checks = (sint *)ckalloc((lseq+1) * sizeof(sint));
+
+    for(i=fseq; i<=lseq; i++) {
+        for(j=fres; j<=fres+len-1; j++) {
+            val = seq_array[i][j];
+            if((val == -3) || (val == 253)) break;
+            else if((val < 0) || (val > max_aa))
+                residue = '.';
+            else {
+                residue = amino_acid_codes[val];
+            }
+            seq[j-fres+1] = residue;
+        }
+        /* pad any short sequences with gaps, to make all sequences the same length */
+        for(; j<=fres+len-1; j++)
+            seq[j-fres+1] = '.';
+        all_checks[i] = SeqGCGCheckSum(seq+1, (int)len);
+    }
+
+    /* sum only the sequences being written: all_checks[] has lseq+1
+       entries and is filled for fseq..lseq, not for 1..nseqs */
+    grand_checksum = 0;
+    for(ii=fseq; ii<=lseq; ii++) grand_checksum += all_checks[output_index[ii]];
+    grand_checksum = grand_checksum % 10000;
+    fprintf(gcgout,"PileUp\n\n");
+    fprintf(gcgout,"\n\n MSF:%5d Type: ",(pint)len);
+    if(dnaflag)
+        fprintf(gcgout,"N");
+    else
+        fprintf(gcgout,"P");
+    fprintf(gcgout," Check:%6ld .. \n\n", (long)grand_checksum);
+    for(ii=fseq; ii<=lseq; ii++) {
+        i = output_index[ii];
+        fprintf(gcgout,
+                " Name: %s oo Len:%5d Check:%6ld Weight: %.1f\n",
+                names[i],(pint)len,(long)all_checks[i],(float)seq_weight[i-1]*100.0/(float)INT_SCALE_FACTOR);
+    }
+    fprintf(gcgout,"\n//\n");
+
+    chunks = len/GCG_LINELENGTH;
+    if(len % GCG_LINELENGTH != 0) ++chunks;
+
+    for(block=1; block<=chunks; block++) {
+        fprintf(gcgout,"\n\n");
+        pos1 = ((block-1) * GCG_LINELENGTH) + 1;
+        pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
+        for(ii=fseq; ii<=lseq; ii++) {
+            i = output_index[ii];
+            if (!seqRange) {
+                fprintf(gcgout,"\n%-*s ",max_names+5,names[i]);
+            }
+            else {
+                fillrange(&rnum,fres, len, ii);
+                sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
+                fprintf(gcgout,"\n%-*s",max_names+15,tmpStr);
+            }
+            for(j=pos1; j<=pos2; j++) {
+                /*
+                   JULIE -
+                   check for short sequences - pad out with '.' characters to end of alignment
+                */
+                if (j+fres-1<=seqlen_array[i])
+                    val = seq_array[i][j+fres-1];
+                else val = -3;
+                if((val == -3) || (val == 253))
+                    residue = '.';
+                else if((val < 0) || (val > max_aa))
+                    residue = '.';
+                else {
+                    residue = amino_acid_codes[val];
+                }
+                fprintf(gcgout,"%c",residue);
+                if(j % 10 == 0) fprintf(gcgout," ");
+            }
+        }
+    }
+    /* DES ckfree(output_index); */
+
+    seq=ckfree((void *)seq);
+    all_checks=ckfree((void *)all_checks);
+    fprintf(gcgout,"\n\n");
+}
+
+
+/* <Ramu> */
+/************************************************************************
+ *
+ * nameonly: remove the sequence range from a sequence name.
+ *
+ * INPUT:   sequence name (e.g. finc_rat/1-200 )
+ *
+ * RETURNS: pointer to a static buffer holding the part of the name
+ *          before the first '/' (the whole name if there is none),
+ *          cut to FILENAMELEN characters.  The buffer is overwritten
+ *          by the next call.
+ */
+
+char *nameonly(char *s)
+{
+    static char tmp[FILENAMELEN+1];
+    int i = 0;
+
+    /* stop at FILENAMELEN so tmp always has room for the terminator */
+    while (*s != '/' && *s != '\0' && i < FILENAMELEN) {
+        tmp[i++] = *s++;
+    }
+    tmp[i] = '\0';
+    return &tmp[0];
+}
+
+
+/*
+ * startFind: debugging aid.  Prints s, then a '-' for every element of s
+ * whose value lies outside 0..max_aa, and returns how many such elements
+ * there are.
+ *
+ * The pointer now advances on every element (it used to advance only on
+ * out-of-range ones, so any in-range value looped forever), and the
+ * out-of-range value is no longer used to index amino_acid_codes[].
+ */
+int startFind(char *s)
+{
+    int i = 0;
+    sint val;
+    printf("\n Debug.....\n %s",s);
+
+    for (; *s; s++) {
+        val = *s;
+        if ( (val <0 ) || (val > max_aa)) {
+            i++;
+            printf("%c",'-');
+        }
+    }
+    return i;
+}
+
+/*
+void fasta_out(FILE *fastaout, sint fres, sint len, sint fseq, sint lseq)
+{
+ char residue;
+ sint val;
+ sint i,ii;
+ sint j,k;
+
+ for(ii=fseq; ii<=lseq; ii++) {
+ i = output_index[ii];
+ fprintf(fastaout,">%-s",names[i],len);
+ j = 1;
+ while(j<len) {
+ if ( ! (j%80) ) {
+ fprintf(fastaout,"\n");
+ }
+ val = seq_array[i][j];
+ if((val < 0) || (val > max_aa))
+ residue = '-';
+ else {
+ residue = amino_acid_codes[val];
+ }
+ fprintf(fastaout,"%c",residue);
+ j++;
+ }
+ fprintf(fastaout,"\n");
+ }
+
+}
+*/
+
+/* </Ramu> */
+
+/*
+ * nexus_out: write sequences fseq..lseq, alignment columns
+ * fres..fres+len-1, to nxsout as an interleaved NEXUS data block.
+ *
+ * The matrix is written in blocks of GCG_LINELENGTH columns; gaps are
+ * written as '-' and a sequence line stops at the end of the sequence.
+ */
+void nexus_out(FILE *nxsout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char residue;
+    sint val;
+    sint i,ii,chunks,block;
+    sint j,pos1,pos2;
+
+    /*<ramu>*/
+    rangeNum rnum;                  /* automatic storage: no allocation that can fail */
+    char tmpStr[FILENAMELEN+32];    /* name + "/%d-%d" at full int width */
+
+    chunks = len/GCG_LINELENGTH;
+    if(len % GCG_LINELENGTH != 0) ++chunks;
+
+    fprintf(nxsout,"#NEXUS\n");
+    fprintf(nxsout,"BEGIN DATA;\n");
+    fprintf(nxsout,"dimensions ntax=%d nchar=%d;\n",(pint)nseqs,(pint)len);
+    fprintf(nxsout,"format missing=?\n");
+    fprintf(nxsout,"symbols=\"");
+    for(i=0;i<=max_aa;i++)
+        fprintf(nxsout,"%c",amino_acid_codes[i]);
+    fprintf(nxsout,"\"\n");
+    fprintf(nxsout,"interleave datatype=");
+    fprintf(nxsout, dnaflag ? "DNA " : "PROTEIN ");
+    fprintf(nxsout,"gap= -;\n");
+    fprintf(nxsout,"\nmatrix");
+
+    for(block=1; block<=chunks; block++) {
+        pos1 = ((block-1) * GCG_LINELENGTH)+1;
+        pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
+        for(ii=fseq; ii<=lseq; ii++) {
+            i = output_index[ii];
+            if (!seqRange) {
+                fprintf(nxsout,"\n%-*s ",max_names+1,names[i]);
+            }
+            else {
+                fillrange(&rnum,fres, len, ii);
+                sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
+                fprintf(nxsout,"\n%-*s",max_names+15,tmpStr);
+            }
+            for(j=pos1; j<=pos2; j++) {
+                if (j+fres-1<=seqlen_array[i])
+                    val = seq_array[i][j+fres-1];
+                else val = -3;
+                if((val == -3) || (val == 253))
+                    break;
+                else if((val < 0) || (val > max_aa))
+                    residue = '-';
+                else {
+                    residue = amino_acid_codes[val];
+                }
+                fprintf(nxsout,"%c",residue);
+            }
+        }
+        fprintf(nxsout,"\n");
+    }
+    fprintf(nxsout,";\nend;\n");
+    /* DES ckfree(output_index); */
+}
+
+
+
+
+/*
+ * phylip_out: write sequences fseq..lseq, alignment columns
+ * fres..fres+len-1, to phyout in interleaved PHYLIP format.
+ *
+ * Names are cut to 10 characters (a warning is issued when any name is
+ * longer, and a stronger one when the cut names are no longer unique).
+ * Names are printed in the first block only; the alignment is written in
+ * blocks of GCG_LINELENGTH columns with a space after every 10th column.
+ */
+void phylip_out(FILE *phyout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char residue;
+    sint val;
+    sint i,ii,chunks,block;
+    sint j,pos1,pos2;
+    sint name_len;
+    Boolean warn;
+    char **snames;
+
+    /*<ramu>*/
+    rangeNum rnum;                  /* automatic storage: no allocation that can fail */
+    char tmpStr[FILENAMELEN+32];    /* name + "/%d-%d" at full int width */
+
+    /* snames[] is indexed by sequence number (fseq..lseq), so it needs
+       lseq+1 entries, not lseq-fseq+2 */
+    snames=(char **)ckalloc((lseq+1)*sizeof(char *));
+    name_len=0;
+    for(i=fseq; i<=lseq; i++) {
+        snames[i]=(char *)ckalloc((11)*sizeof(char));
+        ii=strlen(names[i]);
+        strncpy(snames[i],names[i],10);
+        snames[i][10]=EOS;      /* strncpy does not terminate a cut name */
+        if(name_len<ii) name_len=ii;
+    }
+    if(name_len>10) {
+        warn=FALSE;
+        for(i=fseq; i<=lseq; i++) {
+            for(j=i+1;j<=lseq;j++) {
+                if (strcmp(snames[i],snames[j]) == 0)
+                    warn=TRUE;
+            }
+        }
+        if(warn)
+            warning("Truncating sequence names to 10 characters for PHYLIP output.\n"
+                    "Names in the PHYLIP format file are NOT unambiguous.");
+        else
+            warning("Truncating sequence names to 10 characters for PHYLIP output.");
+    }
+
+    chunks = len/GCG_LINELENGTH;
+    if(len % GCG_LINELENGTH != 0) ++chunks;
+
+    fprintf(phyout,"%6d %6d",(pint)nseqs,(pint)len);
+
+    for(block=1; block<=chunks; block++) {
+        pos1 = ((block-1) * GCG_LINELENGTH)+1;
+        pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
+        for(ii=fseq; ii<=lseq; ii++) {
+            i = output_index[ii];
+            if(block == 1) {
+                if(!seqRange) {
+                    fprintf(phyout,"\n%-10s ",snames[i]);
+                }
+                else
+                {
+                    fillrange(&rnum,fres, len, ii);
+                    sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
+                    fprintf(phyout,"\n%-*s",max_names+15,tmpStr);
+                }
+            }
+            else
+                fprintf(phyout,"\n ");
+            for(j=pos1; j<=pos2; j++) {
+                if (j+fres-1<=seqlen_array[i])
+                    val = seq_array[i][j+fres-1];
+                else val = -3;
+                if((val == -3) || (val == 253))
+                    break;
+                else if((val < 0) || (val > max_aa))
+                    residue = '-';
+                else {
+                    residue = amino_acid_codes[val];
+                }
+                fprintf(phyout,"%c",residue);
+                if(j % 10 == 0) fprintf(phyout," ");
+            }
+        }
+        fprintf(phyout,"\n");
+    }
+    /* DES ckfree(output_index); */
+
+    for(i=fseq;i<=lseq;i++)
+        ckfree(snames[i]);
+    ckfree(snames);
+}
+
+
+
+
+
+/*
+ * nbrf_out: write sequences output_index[fseq..lseq] to `nbout` as NBRF/PIR
+ * entries.
+ *
+ *   nbout       open output stream (not closed here)
+ *   fres        first column of each sequence row to write
+ *   len         number of columns to write, starting at fres
+ *   fseq, lseq  first and last positions in output_index[] to write
+ *
+ * Each entry is ">DL;" (dnaflag set) or ">P1;" followed by the name, the
+ * title line, the residues wrapped at line_length characters, and a closing
+ * "*" line.  With seqRange set the name is written as "name/start-end",
+ * the range being supplied by fillrange().
+ */
+void nbrf_out(FILE *nbout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char *seq, residue;
+    sint val;
+    sint i,ii;
+    sint j,slen;
+    sint line_length;
+
+    /*<ramu>*/
+    /* Automatic storage instead of malloc: the old code reported an
+       allocation failure and then went on to dereference the NULL pointer. */
+    rangeNum rnum;
+    char tmpStr[FILENAMELEN+15];
+
+    seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
+
+    /* decide the line length for this alignment - maximum is LINELENGTH */
+    line_length=PAGEWIDTH-max_names;
+    line_length=line_length-line_length % 10; /* round to a multiple of 10*/
+    if (line_length > LINELENGTH) line_length=LINELENGTH;
+
+    for(ii=fseq; ii<=lseq; ii++) {
+        i = output_index[ii];
+        fprintf(nbout, dnaflag ? ">DL;" : ">P1;");
+        if (!seqRange) {
+            fprintf(nbout, "%s\n%s\n", names[i], titles[i]);
+        }
+        else {
+            fillrange(&rnum,fres, len, ii);
+            sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
+            fprintf(nbout,"%s\n%s\n",tmpStr,titles[i]);
+        }
+
+        /* translate residue codes to characters until an end marker is hit */
+        slen = 0;
+        for(j=fres; j<fres+len; j++) {
+            val = seq_array[i][j];
+            if((val == -3) || (val == 253))
+                break;
+            else if((val < 0) || (val > max_aa))
+                residue = '-';
+            else {
+                residue = amino_acid_codes[val];
+            }
+            seq[j-fres] = residue;
+            slen++;
+        }
+
+        /* emit the translated row, breaking every line_length characters */
+        for(j=1; j<=slen; j++) {
+            fprintf(nbout,"%c",seq[j-1]);
+            if((j % line_length == 0) || (j == slen))
+                fprintf(nbout,"\n");
+        }
+        fprintf(nbout,"*\n");
+    }
+    /* DES ckfree(output_index); */
+
+    seq=ckfree((void *)seq);
+
+}
+
+
+/*
+ * gde_write_wrapped: print the first n characters of buf to `out`, ending
+ * the line after every `width` characters and after the last character.
+ */
+static void gde_write_wrapped(FILE *out, const char *buf, sint n, sint width)
+{
+    sint p;
+
+    for(p=1; p<=n; p++) {
+        fprintf(out,"%c",buf[p-1]);
+        if((p % width == 0) || (p == n))
+            fprintf(out,"\n");
+    }
+}
+
+/*
+ * gde_out: write sequences output_index[fseq..lseq] to `gdeout` in GDE
+ * format, followed by any secondary-structure / gap-penalty mask rows.
+ *
+ *   gdeout      open output stream (not closed here)
+ *   fres        first column of each sequence row to write
+ *   len         number of columns to write, starting at fres
+ *   fseq, lseq  first and last positions in output_index[] to write
+ *
+ * Each sequence is introduced by "#" (dnaflag set) or "%" and its name
+ * ("name/start-end" when seqRange is set).  Residues are lower-cased when
+ * `lowercase` is set.  Mask rows are labelled "SS_<name> and "GM_<name>
+ * and are selected by output_struct_penalties (0 or 2: SS, 1 or 2: GM).
+ */
+void gde_out(FILE *gdeout, sint fres, sint len, sint fseq, sint lseq)
+{
+    char *seq, residue;
+    sint val;
+    char *ss_mask1 = NULL, *ss_mask2 = NULL;
+    sint i,ii;
+    sint j,slen;
+    sint line_length;
+
+    /*<ramu>*/
+    /* Automatic storage instead of malloc: the old code reported an
+       allocation failure and then went on to dereference the NULL pointer. */
+    rangeNum rnum;
+
+    seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
+
+    /* decide the line length for this alignment - maximum is LINELENGTH */
+    line_length=PAGEWIDTH-max_names;
+    line_length=line_length-line_length % 10; /* round to a multiple of 10*/
+    if (line_length > LINELENGTH) line_length=LINELENGTH;
+
+    /* working copies of the secondary structure masks, post-processed by
+       print_sec_struct_mask() */
+    if (struct_penalties1 == SECST && use_ss1 == TRUE) {
+        ss_mask1 = (char *)ckalloc((seqlen_array[1]+10) * sizeof(char));
+        for (i=0;i<seqlen_array[1];i++)
+            ss_mask1[i] = sec_struct_mask1[i];
+        print_sec_struct_mask(seqlen_array[1],sec_struct_mask1,ss_mask1);
+    }
+    if (struct_penalties2 == SECST && use_ss2 == TRUE) {
+        ss_mask2 = (char *)ckalloc((seqlen_array[profile1_nseqs+1]+10) *
+                                   sizeof(char));
+        for (i=0;i<seqlen_array[profile1_nseqs+1];i++)
+            ss_mask2[i] = sec_struct_mask2[i];
+        print_sec_struct_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,ss_mask2);
+    }
+
+
+    for(ii=fseq; ii<=lseq; ii++) {
+        i = output_index[ii];
+        fprintf(gdeout, dnaflag ? "#" : "%%");
+        if(!seqRange) {
+            fprintf(gdeout, "%s\n", names[i]);
+        }
+        else {
+            fillrange(&rnum,fres, len, ii);
+            fprintf(gdeout,"%s/%d-%d\n",nameonly(names[i]),rnum.start,rnum.end);
+        }
+
+        /* translate residue codes to characters until an end marker is hit */
+        slen = 0;
+        for(j=fres; j<fres+len; j++) {
+            val = seq_array[i][j];
+            if((val == -3) || (val == 253))
+                break;
+            else if((val < 0) || (val > max_aa))
+                residue = '-';
+            else {
+                residue = amino_acid_codes[val];
+            }
+            if (lowercase)
+                seq[j-fres] = (char)tolower((int)residue);
+            else
+                seq[j-fres] = residue;
+            slen++;
+        }
+        gde_write_wrapped(gdeout,seq,slen,line_length);
+    }
+    /* DES ckfree(output_index); */
+
+    /* In every mask section below the terminator goes at seq[i-fres]
+       (== seq[len]).  Three of the four sections used to write seq[i],
+       i.e. seq[fres+len], which lies outside the buffer when fres > 1. */
+    if (output_struct_penalties == 0 || output_struct_penalties == 2) {
+        if (struct_penalties1 == SECST && use_ss1 == TRUE) {
+            fprintf(gdeout,"\"SS_%-*s\n",max_names,ss_name1);
+            for(i=fres; i<fres+len; i++) {
+                val=ss_mask1[i-1];
+                if (val == gap_pos1 || val == gap_pos2)
+                    seq[i-fres]='-';
+                else
+                    seq[i-fres]=val;
+            }
+            seq[i-fres]=EOS;
+            gde_write_wrapped(gdeout,seq,len,line_length);
+        }
+
+        if (struct_penalties2 == SECST && use_ss2 == TRUE) {
+            fprintf(gdeout,"\"SS_%-*s\n",max_names,ss_name2);
+            for(i=fres; i<fres+len; i++) {
+                val=ss_mask2[i-1];
+                if (val == gap_pos1 || val == gap_pos2)
+                    seq[i-fres]='-';
+                else
+                    seq[i-fres]=val;
+            }
+            seq[i-fres]=EOS;
+            gde_write_wrapped(gdeout,seq,len,line_length);
+        }
+    }
+    if (output_struct_penalties == 1 || output_struct_penalties == 2) {
+        if (struct_penalties1 != NONE && use_ss1 == TRUE) {
+            fprintf(gdeout,"\"GM_%-*s\n",max_names,ss_name1);
+            for(i=fres; i<fres+len; i++) {
+                val=gap_penalty_mask1[i-1];
+                if (val == gap_pos1 || val == gap_pos2)
+                    seq[i-fres]='-';
+                else
+                    seq[i-fres]=val;
+            }
+            seq[i-fres]=EOS;
+            gde_write_wrapped(gdeout,seq,len,line_length);
+        }
+        if (struct_penalties2 != NONE && use_ss2 == TRUE) {
+            fprintf(gdeout,"\"GM_%-*s\n",max_names,ss_name2);
+            for(i=fres; i<fres+len; i++) {
+                val=gap_penalty_mask2[i-1];
+                if (val == gap_pos1 || val == gap_pos2)
+                    seq[i-fres]='-';
+                else
+                    seq[i-fres]=val;
+            }
+            seq[i-fres]=EOS;
+            gde_write_wrapped(gdeout,seq,len,line_length);
+        }
+    }
+
+    if (struct_penalties1 == SECST && use_ss1 == TRUE) ckfree(ss_mask1);
+    if (struct_penalties2 == SECST && use_ss2 == TRUE) ckfree(ss_mask2);
+    seq=ckfree((void *)seq);
+
+}
+
+
+/*
+ * open_alignment_output: open one output file per selected alignment format.
+ *
+ * For every output_* flag that is set, the matching *_outfile stream is
+ * opened.  If outfile_name is non-empty it is copied into the format's
+ * *_outname and opened with open_explicit_file(); otherwise
+ * open_output_file() is called with a prompt, `path` and the format's
+ * default extension.
+ *
+ * Returns FALSE when no format is selected (after reporting an error) or as
+ * soon as any file fails to open; TRUE otherwise.
+ */
+Boolean open_alignment_output(char *path)
+{
+    if(!(output_clustal || output_nbrf || output_gcg ||
+         output_phylip || output_gde || output_nexus || output_fasta)) {
+        error("You must select an alignment output format");
+        return FALSE;
+    }
+
+    if(output_clustal) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(clustal_outname,outfile_name);
+            clustal_outfile = open_explicit_file(clustal_outname);
+        }
+        else {
+            clustal_outfile = open_output_file(
+                "\nEnter a name for the CLUSTAL output file ",path,
+                clustal_outname,"aln");
+        }
+        if (clustal_outfile == NULL) return FALSE;
+    }
+
+    if(output_nbrf) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(nbrf_outname,outfile_name);
+            nbrf_outfile = open_explicit_file(nbrf_outname);
+        }
+        else {
+            nbrf_outfile = open_output_file(
+                "\nEnter a name for the NBRF/PIR output file",path,
+                nbrf_outname,"pir");
+        }
+        if (nbrf_outfile == NULL) return FALSE;
+    }
+
+    if(output_gcg) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(gcg_outname,outfile_name);
+            gcg_outfile = open_explicit_file(gcg_outname);
+        }
+        else {
+            gcg_outfile = open_output_file(
+                "\nEnter a name for the GCG output file ",path,
+                gcg_outname,"msf");
+        }
+        if (gcg_outfile == NULL) return FALSE;
+    }
+
+    if(output_phylip) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(phylip_outname,outfile_name);
+            phylip_outfile = open_explicit_file(phylip_outname);
+        }
+        else {
+            phylip_outfile = open_output_file(
+                "\nEnter a name for the PHYLIP output file ",path,
+                phylip_outname,"phy");
+        }
+        if (phylip_outfile == NULL) return FALSE;
+    }
+
+    if(output_gde) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(gde_outname,outfile_name);
+            gde_outfile = open_explicit_file(gde_outname);
+        }
+        else {
+            gde_outfile = open_output_file(
+                "\nEnter a name for the GDE output file ",path,
+                gde_outname,"gde");
+        }
+        if (gde_outfile == NULL) return FALSE;
+    }
+
+    if(output_nexus) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(nexus_outname,outfile_name);
+            nexus_outfile = open_explicit_file(nexus_outname);
+        }
+        else {
+            nexus_outfile = open_output_file(
+                "\nEnter a name for the NEXUS output file ",path,
+                nexus_outname,"nxs");
+        }
+        if (nexus_outfile == NULL) return FALSE;
+    }
+
+    /* Ramu */
+    if(output_fasta) {
+        if (outfile_name[0]!=EOS) {
+            strcpy(fasta_outname,outfile_name);
+            fasta_outfile = open_explicit_file(fasta_outname);
+        }
+        else {
+            fasta_outfile = open_output_file(
+                "\nEnter a name for the Fasta output file ",path,
+                fasta_outname,"fasta");
+        }
+        if (fasta_outfile == NULL) return FALSE;
+    }
+
+    return TRUE;
+}
+
+
+
+
+/*
+ * create_alignment_output: write the alignment of sequences fseq..lseq in
+ * every selected output format and close each output file.
+ *
+ * The residue range defaults to 1..length, where length is the longest
+ * seqlen_array[] entry among fseq..lseq.  When setrange != -1 the range is
+ * parsed from param_arg[setrange] as "<first><sep><last>", <sep> being one
+ * or more of ' ', ':', ',', '-'.  An unparsable range, or a first residue
+ * below 1, falls back to the default; a last residue beyond length is
+ * clamped to length.
+ *
+ * NOTE(review): the writers receive ilres (an end position) as their `len`
+ * argument, while e.g. nbrf_out() loops over fres..fres+len-1 - confirm
+ * which meaning is intended before relying on ranges that do not start at 1.
+ */
+void create_alignment_output(sint fseq, sint lseq)
+{
+    sint i,length;
+
+    sint ifres; /* starting sequence range - Ramu */
+    sint ilres; /* ending sequence range */
+    Boolean rangeOK;
+
+    length=0;
+    for (i=fseq;i<=lseq;i++)
+        if (length < seqlen_array[i])
+            length = seqlen_array[i];
+
+    ifres = 1;
+    ilres = length;
+    rangeOK = FALSE;
+
+    if (setrange != -1 ) {
+        /* The separator run is matched but discarded with %*[...].  It used
+           to be stored through a pointer to a single char, which overflowed
+           (a scanset stores the matched text plus a terminating NUL). */
+        if ( sscanf(param_arg[setrange],"%d%*[ :,-]%d",&ifres,&ilres) !=2
+             || ifres < 1 ) {
+            info("seqrange numers are not set properly, using default....");
+            ifres = 1;
+            ilres = length;
+        }
+        else
+            rangeOK = TRUE;
+    }
+    if ( rangeOK && ilres > length ) {
+        /* report the requested value before it is overwritten by the clamp */
+        info("Seqrange %d is more than the %d setting it to %d ",
+             (pint)ilres,(pint)length,(pint)length);
+        ilres = length; /* if asked for more, set the limit, Ramui */
+    }
+
+    if (usemenu) info("Consensus length = %d",(pint)ilres); /* Ramu */
+
+    if(output_clustal) {
+        clustal_out(clustal_outfile, ifres, ilres, fseq, lseq);
+        fclose(clustal_outfile);
+        info("CLUSTAL-Alignment file created [%s]",clustal_outname);
+    }
+    if(output_nbrf) {
+        nbrf_out(nbrf_outfile, ifres, ilres, /*1, length */ fseq, lseq);
+        fclose(nbrf_outfile);
+        info("NBRF/PIR-Alignment file created [%s]",nbrf_outname);
+    }
+    if(output_gcg) {
+        gcg_out(gcg_outfile, ifres, ilres, /*1, length */ fseq, lseq);
+        fclose(gcg_outfile);
+        info("GCG-Alignment file created [%s]",gcg_outname);
+    }
+    if(output_phylip) {
+        phylip_out(phylip_outfile, ifres, ilres, /*1, length */ fseq, lseq);
+        fclose(phylip_outfile);
+        info("PHYLIP-Alignment file created [%s]",phylip_outname);
+    }
+    if(output_gde) {
+        gde_out(gde_outfile, ifres, ilres /*1, length */, fseq, lseq);
+        fclose(gde_outfile);
+        info("GDE-Alignment file created [%s]",gde_outname);
+    }
+    if(output_nexus) {
+        nexus_out(nexus_outfile, ifres, ilres /*1, length */, fseq, lseq);
+        fclose(nexus_outfile);
+        info("NEXUS-Alignment file created [%s]",nexus_outname);
+    }
+    /* Ramu */
+    if(output_fasta) {
+        fasta_out(fasta_outfile, ifres, ilres /*1, length */, fseq, lseq);
+        fclose(fasta_outfile);
+        info("Fasta-Alignment file created [%s]",fasta_outname);
+    }
+}
+
+
+/*
+ * reset_align: squeeze gap codes out of every sequence 1..nseqs in place.
+ *
+ * gap_pos1 entries are removed when reset_alignments_new or
+ * reset_alignments_all is set; gap_pos2 entries are removed only when
+ * reset_alignments_all is set.  The surviving codes are packed towards
+ * index 1 and seqlen_array[] is updated to the packed length.
+ */
+static void reset_align(void)
+{
+    sint seq, src, kept;
+    int drop_new, drop_input;
+
+    drop_new = (reset_alignments_new || reset_alignments_all);
+    drop_input = (reset_alignments_all) ? 1 : 0;
+
+    for (seq = 1; seq <= nseqs; seq++) {
+        kept = 0;
+        for (src = 1; src <= seqlen_array[seq]; src++) {
+            if (drop_new && seq_array[seq][src] == gap_pos1)
+                continue;
+            if (drop_input && seq_array[seq][src] == gap_pos2)
+                continue;
+            kept++;
+            seq_array[seq][kept] = seq_array[seq][src];
+        }
+        seqlen_array[seq] = kept;
+    }
+}
+
+
+
+/*
+ * reset_prf1: squeeze gap codes out of profile 1 in place.
+ *
+ * Applies to gap_penalty_mask1 (when struct_penalties1 != NONE),
+ * sec_struct_mask1 (when struct_penalties1 == SECST) and sequences
+ * 1..profile1_nseqs.  gap_pos1 entries are removed when
+ * reset_alignments_new or reset_alignments_all is set; gap_pos2 entries
+ * only when reset_alignments_all is set.  The masks are 0-based and are
+ * scanned over the length of sequence 1 *before* that length is updated
+ * by the sequence pass, so the order of the three passes matters.
+ */
+static void reset_prf1(void)
+{
+    sint seq, src, dst;
+    int drop_new, drop_input;
+
+    drop_new = (reset_alignments_new || reset_alignments_all);
+    drop_input = (reset_alignments_all) ? 1 : 0;
+
+    if (struct_penalties1 != NONE) {
+        dst = 0;
+        for (src = 0; src < seqlen_array[1]; src++) {
+            if (drop_new && gap_penalty_mask1[src] == gap_pos1)
+                continue;
+            if (drop_input && gap_penalty_mask1[src] == gap_pos2)
+                continue;
+            gap_penalty_mask1[dst] = gap_penalty_mask1[src];
+            dst++;
+        }
+    }
+
+    if (struct_penalties1 == SECST) {
+        dst = 0;
+        for (src = 0; src < seqlen_array[1]; src++) {
+            if (drop_new && sec_struct_mask1[src] == gap_pos1)
+                continue;
+            if (drop_input && sec_struct_mask1[src] == gap_pos2)
+                continue;
+            sec_struct_mask1[dst] = sec_struct_mask1[src];
+            dst++;
+        }
+    }
+
+    /* sequences are 1-based: pack towards index 1 and record the new length */
+    for (seq = 1; seq <= profile1_nseqs; seq++) {
+        dst = 0;
+        for (src = 1; src <= seqlen_array[seq]; src++) {
+            if (drop_new && seq_array[seq][src] == gap_pos1)
+                continue;
+            if (drop_input && seq_array[seq][src] == gap_pos2)
+                continue;
+            dst++;
+            seq_array[seq][dst] = seq_array[seq][src];
+        }
+        seqlen_array[seq] = dst;
+    }
+}
+
+
+
+/*
+ * reset_prf2: squeeze gap codes out of profile 2 in place.
+ *
+ * Applies to gap_penalty_mask2 (when struct_penalties2 != NONE),
+ * sec_struct_mask2 (when struct_penalties2 == SECST) and sequences
+ * profile1_nseqs+1..nseqs.  gap_pos1 entries are removed when
+ * reset_alignments_new or reset_alignments_all is set; gap_pos2 entries
+ * only when reset_alignments_all is set.  The masks are 0-based and are
+ * scanned over the length of sequence profile1_nseqs+1 *before* that
+ * length is updated by the sequence pass, so the order of the three passes
+ * matters.
+ */
+static void reset_prf2(void)
+{
+    sint seq, src, dst;
+    int drop_new, drop_input;
+
+    drop_new = (reset_alignments_new || reset_alignments_all);
+    drop_input = (reset_alignments_all) ? 1 : 0;
+
+    if (struct_penalties2 != NONE) {
+        dst = 0;
+        for (src = 0; src < seqlen_array[profile1_nseqs+1]; src++) {
+            if (drop_new && gap_penalty_mask2[src] == gap_pos1)
+                continue;
+            if (drop_input && gap_penalty_mask2[src] == gap_pos2)
+                continue;
+            gap_penalty_mask2[dst] = gap_penalty_mask2[src];
+            dst++;
+        }
+    }
+
+    if (struct_penalties2 == SECST) {
+        dst = 0;
+        for (src = 0; src < seqlen_array[profile1_nseqs+1]; src++) {
+            if (drop_new && sec_struct_mask2[src] == gap_pos1)
+                continue;
+            if (drop_input && sec_struct_mask2[src] == gap_pos2)
+                continue;
+            sec_struct_mask2[dst] = sec_struct_mask2[src];
+            dst++;
+        }
+    }
+
+    /* sequences are 1-based: pack towards index 1 and record the new length */
+    for (seq = profile1_nseqs+1; seq <= nseqs; seq++) {
+        dst = 0;
+        for (src = 1; src <= seqlen_array[seq]; src++) {
+            if (drop_new && seq_array[seq][src] == gap_pos1)
+                continue;
+            if (drop_input && seq_array[seq][src] == gap_pos2)
+                continue;
+            dst++;
+            seq_array[seq][dst] = seq_array[seq][src];
+        }
+        seqlen_array[seq] = dst;
+    }
+}
+
+
+
+/*
+ * fix_gaps: recode every gap_pos1 entry as gap_pos2.
+ *
+ * Covers gap_penalty_mask1 (when struct_penalties1 != NONE),
+ * sec_struct_mask1 (when struct_penalties1 == SECST), both over the length
+ * of sequence 1, and all sequences 1..nseqs.
+ */
+void fix_gaps(void)
+{
+    sint seq, pos;
+
+    if (struct_penalties1 != NONE) {
+        pos = 0;
+        while (pos < seqlen_array[1]) {
+            if (gap_penalty_mask1[pos] == gap_pos1)
+                gap_penalty_mask1[pos] = gap_pos2;
+            pos++;
+        }
+    }
+
+    if (struct_penalties1 == SECST) {
+        pos = 0;
+        while (pos < seqlen_array[1]) {
+            if (sec_struct_mask1[pos] == gap_pos1)
+                sec_struct_mask1[pos] = gap_pos2;
+            pos++;
+        }
+    }
+
+    for (seq = 1; seq <= nseqs; seq++) {
+        for (pos = 1; pos <= seqlen_array[seq]; pos++) {
+            if (seq_array[seq][pos] != gap_pos1)
+                continue;
+            seq_array[seq][pos] = gap_pos2;
+        }
+    }
+}
+
+/*
+ * find_match: look `probe` up in the n strings of `list`.
+ *
+ * The probe is scanned one character position at a time.  At each position
+ * the number of list entries carrying the same character at that position
+ * is counted: none -> return -1; exactly one -> return its index; otherwise
+ * move on to the next position.  Returns -1 if the probe is exhausted
+ * without a position that singles out one entry.
+ *
+ * Note that each position is judged on its own; entries are not required
+ * to have matched at earlier positions.
+ */
+static sint find_match(char *probe, char *list[], sint n)
+{
+    sint i,j,len;
+    sint count,match=0;
+
+    len = (sint)strlen(probe);
+    for (i=0;i<len;i++) {
+        count = 0;
+        for (j=0;j<n;j++) {
+            /* an entry shorter than i+1 characters cannot match here, and
+               must not be indexed past its terminator */
+            if ((sint)strlen(list[j]) > i && probe[i] == list[j][i]) {
+                match = j;
+                count++;
+            }
+        }
+        if (count == 0) return((sint)-1);
+        if (count == 1) return(match);
+    }
+    return((sint)-1);
+}
+
+/*
+ * create_parameter_output: write the current settings to a parameter file
+ * as a "clustalw" command line, one option per backslash-continued line.
+ *
+ * The default file name is the path derived from seqname by get_path()
+ * with "par" appended; in menu mode the user may type another name.  Any
+ * existing file of that name is removed first.  Returns silently if the
+ * file cannot be opened.
+ */
+static void create_parameter_output(void)
+{
+    /* room for the "par" suffix on top of a full-length path */
+    char parname[FILENAMELEN+4], temp[FILENAMELEN+1];
+    char path[FILENAMELEN+1];
+    FILE *parout;
+    size_t n;
+    int c;
+
+    get_path(seqname,path);
+    strcpy(parname,path);
+    strcat(parname,"par");
+
+    if(usemenu) {
+        fprintf(stdout,"\nEnter a name for the parameter output file [%s]: ",
+                parname);
+        /* bounded read; gets() cannot limit its input and is gone from C11 */
+        if (fgets(temp,sizeof(temp),stdin) == NULL)
+            temp[0] = EOS;
+        n = strlen(temp);
+        if (n > 0 && temp[n-1] == '\n')
+            temp[n-1] = EOS;
+        else if (n == sizeof(temp)-1) {
+            /* over-long line: discard the rest so it is not taken as the
+               answer to a later prompt */
+            while ((c = getchar()) != '\n' && c != EOF)
+                ;
+        }
+        if(*temp != EOS)
+            strcpy(parname,temp);
+    }
+
+/* create a file with execute permissions first */
+    remove(parname);
+    /*
+    fd = creat(parname, 0777);
+    close(fd);
+    */
+
+    if((parout = open_explicit_file(parname))==NULL) return;
+
+    fprintf(parout,"clustalw \\\n");
+    if (!empty && profile1_empty) fprintf(parout,"-infile=%s \\\n",seqname);
+    /* the blank before the backslash matters: without it the shell joins
+       the next option directly onto the profile file name */
+    if (!profile1_empty) fprintf(parout,"-profile1=%s \\\n",profile1_name);
+    if (!profile2_empty) fprintf(parout,"-profile2=%s \\\n",profile2_name);
+    if (dnaflag == TRUE)
+        fprintf(parout,"-type=dna \\\n");
+    else
+        fprintf(parout,"-type=protein \\\n");
+
+    /* pairwise alignment parameters */
+    if (quick_pairalign) {
+        fprintf(parout,"-quicktree \\\n");
+        fprintf(parout,"-ktuple=%d \\\n",(pint)ktup);
+        fprintf(parout,"-window=%d \\\n",(pint)window);
+        fprintf(parout,"-pairgap=%d \\\n",(pint)wind_gap);
+        fprintf(parout,"-topdiags=%d \\\n",(pint)signif);
+        if (percent) fprintf(parout,"-score=percent \\\n");
+        else
+            fprintf(parout,"-score=absolute \\\n");
+    }
+    else {
+        if (!dnaflag) {
+            fprintf(parout,"-pwmatrix=%s \\\n",pw_mtrxname);
+            fprintf(parout,"-pwgapopen=%.2f \\\n",prot_pw_go_penalty);
+            fprintf(parout,"-pwgapext=%.2f \\\n",prot_pw_ge_penalty);
+        }
+        else {
+            fprintf(parout,"-pwgapopen=%.2f \\\n",pw_go_penalty);
+            fprintf(parout,"-pwgapext=%.2f \\\n",pw_ge_penalty);
+        }
+    }
+
+    /* multiple alignment parameters */
+    if (!dnaflag) {
+        fprintf(parout,"-matrix=%s \\\n",mtrxname);
+        fprintf(parout,"-gapopen=%.2f \\\n",prot_gap_open);
+        fprintf(parout,"-gapext=%.2f \\\n",prot_gap_extend);
+    }
+    else {
+        fprintf(parout,"-gapopen=%.2f \\\n",dna_gap_open);
+        fprintf(parout,"-gapext=%.2f \\\n",dna_gap_extend);
+    }
+
+    fprintf(parout,"-maxdiv=%d \\\n",(pint)divergence_cutoff);
+    if (!use_endgaps) fprintf(parout,"-endgaps \\\n");
+
+    if (!dnaflag) {
+        if (neg_matrix) fprintf(parout,"-negative \\\n");
+        if (no_pref_penalties) fprintf(parout,"-nopgap \\\n");
+        if (no_hyd_penalties) fprintf(parout,"-nohgap \\\n");
+        if (no_var_penalties) fprintf(parout,"-novgap \\\n");
+        fprintf(parout,"-hgapresidues=%s \\\n",hyd_residues);
+        fprintf(parout,"-gapdist=%d \\\n",(pint)gap_dist);
+    }
+    else {
+        fprintf(parout,"-transweight=%.2f \\\n",transition_weight);
+    }
+
+    /* output options: at most one -output= line is written */
+    if (output_gcg) fprintf(parout,"-output=gcg \\\n");
+    else if (output_gde) fprintf(parout,"-output=gde \\\n");
+    else if (output_nbrf) fprintf(parout,"-output=pir \\\n");
+    else if (output_phylip) fprintf(parout,"-output=phylip \\\n");
+    else if (output_nexus) fprintf(parout,"-output=nexus \\\n");
+    if (outfile_name[0]!=EOS) fprintf(parout,"-outfile=%s \\\n",outfile_name);
+    if (output_order==ALIGNED) fprintf(parout,"-outorder=aligned \\\n");
+    else fprintf(parout,"-outorder=input \\\n");
+    /* -case is only written for GDE output */
+    if (output_gde) {
+        if (lowercase) fprintf(parout,"-case=lower \\\n");
+        else fprintf(parout,"-case=upper \\\n");
+    }
+
+
+    fprintf(parout,"-interactive\n");
+
+    /*
+    if (kimura) fprintf(parout,"-kimura \\\n");
+    if (tossgaps) fprintf(parout,"-tossgaps \\\n");
+    fprintf(parout,"-seed=%d \\\n",(pint)boot_ran_seed);
+    fprintf(parout,"-bootstrap=%d \\\n",(pint)boot_ntrials);
+    */
+    fclose(parout);
+}
+
+
+/*
+ * Residue-code tests used by calc_percidentity().
+ *   isgap(v): v lies outside 0..max_aa.
+ *   isend(v): v is -3 or 253.
+ * Arguments are parenthesized so that an expression argument is evaluated
+ * as a unit; note that each macro evaluates its argument twice.
+ */
+#define isgap(val1) ( ((val1) < 0) || ((val1) > max_aa) )
+#define isend(val1) ( ((val1) == -3) || ((val1) == 253) )
+
+/*
+ * calc_percidentity: write a percent identity matrix for sequences
+ * 1..nseqs to `pfile`.
+ *
+ * For each pair (i,j) the rows of seq_array are compared column by column,
+ * up to the longest seqlen_array[] entry.  The scan stops at the first
+ * end marker (isend) in either row; columns where either row holds a gap
+ * code (isgap) are skipped.  The score is
+ *     100 * identical columns / compared columns
+ * and is 0 when no column could be compared.
+ */
+void calc_percidentity(FILE *pfile)
+{
+    double **pmat;
+
+    float ident;
+    int nmatch;
+
+    sint val1, val2;
+
+    sint i,j,k, length_longest;
+
+    /* find the longest sequence length; it bounds the column scan */
+    length_longest=0;
+    for (i=1;i<=nseqs;i++) {
+        if (length_longest < seqlen_array[i])
+            length_longest = seqlen_array[i];
+    }
+
+    /* (nseqs+1) x (nseqs+1) matrix so that it can be indexed from 1 */
+    pmat = (double **)ckalloc((nseqs+1) * sizeof(double *));
+    for (i=0;i<=nseqs;i++)
+        pmat[i] = (double *)ckalloc((nseqs+1) * sizeof(double));
+    for (i = 0; i <= nseqs; i++)
+        for (j = 0; j <= nseqs; j++)
+            pmat[i][j] = 0.0;
+
+    for (i=1; i <= nseqs; i++) {
+        for (j=i; j<=nseqs ; j++) {
+            /* NOTE(review): this goes to stdout, not pfile; it looks like
+               leftover progress/debug output - kept to preserve behavior */
+            printf("\n %s ",names[j]);
+            ident = 0;
+            nmatch = 0;
+            for(k=1; k<=length_longest; k++) {
+                val1 = seq_array[i][k];
+                val2 = seq_array[j][k];
+                if ( isend(val1) || isend(val2)) break; /* end of sequence ????? */
+                if ( isgap(val1) || isgap(val2) ) continue; /* residue = '-'; */
+                if (val1 == val2)
+                    ident++ ;
+                nmatch++;
+            }
+            /* no comparable column: report 0 rather than dividing by zero */
+            if (nmatch > 0)
+                ident = ident/nmatch * 100.0 ;
+            else
+                ident = 0;
+            pmat[i][j] = ident;
+            pmat[j][i]= ident;
+        }
+
+    }
+
+    fprintf(pfile,"#\n#\n# Percent Identity Matrix - created by Clustal%s \n#\n#\n",revision_level);
+    for(i=1;i<=nseqs;i++) {
+        fprintf(pfile,"\n %5d: %-*s",i,max_names,names[i]);
+        for(j=1;j<=nseqs;j++) {
+            fprintf(pfile,"%8.0f",pmat[i][j]);
+        }
+    }
+    fprintf(pfile,"\n");
+
+    /* nseqs+1 rows were allocated, so row nseqs must be released as well */
+    for (i=0;i<=nseqs;i++)
+        pmat[i]=ckfree((void *)pmat[i]);
+    pmat=ckfree((void *)pmat);
+
+}
Added: trunk/packages/clustalw/branches/upstream/current/makefile
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/makefile 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/makefile 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,61 @@
+# Generic makefile for clustalw (command line) and clustalx (X interface).
+# `install` is the first target and therefore the default goal; it only
+# builds both executables, nothing is copied anywhere.
+install: clustalx clustalw
+
+# Remove object files (rm reports an error when there are none).
+clean:
+ rm *.o
+
+# Objects linked into both executables.
+OBJECTS = interface.o sequence.o showpair.o malign.o \
+ util.o trees.o gcgcheck.o prfalign.o pairalign.o \
+ calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
+ readmat.o alnscore.o random.o
+
+# Additional objects linked into clustalx only.
+XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
+
+HEADERS = general.h clustalw.h
+
+# CFLAGS/LFLAGS apply to every compile/link; CXFLAGS/LXFLAGS are added for
+# the clustalx sources and link.  NCBI_INC/NCBI_LIB point at the headers and
+# libraries used for -lvibrant/-lncbi (presumably site-specific - adjust).
+CC = cc
+CFLAGS = -c -O
+LFLAGS = -O -lm
+NCBI_INC = /dec/biolo/ncbi/include
+NCBI_LIB = /dec/biolo/ncbi/lib
+CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
+LXFLAGS = -L$(NCBI_LIB) -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
+
+# Command-line executable: shared objects plus amenu.o and clustalw.o.
+clustalw : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+interface.o : interface.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+amenu.o : amenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+# X executable: shared objects plus the X objects and clustalx.o.
+clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+ $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
+
+# The X sources are compiled with the extra CXFLAGS.
+clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xmenu.o : xmenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xdisplay.o : xdisplay.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xutils.o : xutils.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xcolor.o : xcolor.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xscore.o : xscore.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+readmat.o : readmat.c $(HEADERS) matrices.h
+ $(CC) $(CFLAGS) $*.c
+
+trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+# Suffix rule for every object without an explicit rule above.
+.c.o :
+ $(CC) $(CFLAGS) $?
+
Added: trunk/packages/clustalw/branches/upstream/current/makefile.alpha
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/makefile.alpha 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/makefile.alpha 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,65 @@
+# makefile.alpha: builds clustalw (command line) and clustalx (X interface).
+# `install` is the first target and therefore the default goal; it only
+# builds both executables, nothing is copied anywhere.
+install: clustalx clustalw
+
+# Remove object files (rm reports an error when there are none).
+clean:
+ rm *.o
+
+# Objects linked into both executables.
+OBJECTS = interface.o sequence.o showpair.o malign.o \
+ util.o trees.o gcgcheck.o prfalign.o pairalign.o \
+ calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
+ readmat.o alnscore.o random.o
+
+# Additional objects linked into clustalx only.
+XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
+
+HEADERS = general.h clustalw.h
+
+# CFLAGS/LFLAGS apply to every compile/link; CXFLAGS/LXFLAGS are added for
+# the clustalx sources and link.  NCBI_INC/NCBI_LIB point at the headers and
+# libraries used for -lvibrant/-lncbi (presumably site-specific - adjust).
+CC = cc
+CFLAGS = -c -O
+LFLAGS = -O -lm
+NCBI_INC = /dec/biolo/ncbi/include
+NCBI_LIB = /dec/biolo/ncbi/lib
+CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
+LXFLAGS = -L$(NCBI_LIB) -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
+
+# Command-line executable: shared objects plus amenu.o and clustalw.o.
+clustalw : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+# Same link as clustalw, written to an executable named clustalt.
+clustalt : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o clustalt $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+
+interface.o : interface.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+amenu.o : amenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+# X executable: shared objects plus the X objects and clustalx.o.
+clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+ $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
+
+# The X sources are compiled with the extra CXFLAGS.
+clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xmenu.o : xmenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xdisplay.o : xdisplay.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xutils.o : xutils.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xcolor.o : xcolor.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xscore.o : xscore.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+readmat.o : readmat.c $(HEADERS) matrices.h
+ $(CC) $(CFLAGS) $*.c
+
+trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+# Suffix rule for every object without an explicit rule above.
+.c.o :
+ $(CC) $(CFLAGS) $?
+
Added: trunk/packages/clustalw/branches/upstream/current/makefile.linux
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/makefile.linux 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/makefile.linux 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,58 @@
+# makefile.linux: builds clustalw (command line) and clustalx (X interface).
+# `install` is the first target and therefore the default goal; it only
+# builds both executables, nothing is copied anywhere.
+install: clustalx clustalw
+
+# Remove object files (rm reports an error when there are none).
+clean:
+ rm *.o
+
+# Objects linked into both executables.
+OBJECTS = interface.o sequence.o showpair.o malign.o util.o trees.o gcgcheck.o prfalign.o pairalign.o calcgapcoeff.o calcprf1.o calcprf2.o calctree.o readmat.o alnscore.o random.o
+
+# Additional objects linked into clustalx only.
+XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
+
+HEADERS = general.h clustalw.h
+
+# CFLAGS/LFLAGS apply to every compile/link; CXFLAGS/LXFLAGS are added for
+# the clustalx sources and link.  The NCBI and X11 directories are hard-coded
+# here (presumably site-specific - adjust).
+CC = cc
+CFLAGS = -c -O
+LFLAGS = -O -lm
+CXFLAGS = -DWIN_MOTIF -I/usr/bio/src/ncbi/include
+LXFLAGS = -L/usr/bio/src/ncbi/lib -L/usr/ccs/lib -L/usr/X11R6/lib -lvibrant -lncbi -lXm -lXmu -lXpm -lXt -lX11 -lm
+
+
+# Links a -static clustalx (written to clustalx.static) and then clustalw.
+static: $(OBJECTS) amenu.o clustalw.o $(XOBJECTS) clustalx.o
+ $(CC) -o clustalx.static $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS) -lXext -lX11 -lSM -static /usr/X11R6/lib/libICE.a
+ $(CC) -o clustalw $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+# Command-line executable: shared objects plus amenu.o and clustalw.o.
+clustalw : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+amenu.o : amenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+# X executable: shared objects plus the X objects and clustalx.o.
+clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+ $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
+
+# The X sources are compiled with the extra CXFLAGS.
+clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xmenu.o : xmenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xdisplay.o : xdisplay.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xutils.o : xutils.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xcolor.o : xcolor.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xscore.o : xscore.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+readmat.o : readmat.c $(HEADERS) matrices.h
+ $(CC) $(CFLAGS) $*.c
+
+trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+# Suffix rule for every object without an explicit rule above
+# (this includes interface.o in this makefile).
+.c.o :
+ $(CC) $(CFLAGS) $?
+
Added: trunk/packages/clustalw/branches/upstream/current/makefile.sgi
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/makefile.sgi 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/makefile.sgi 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,58 @@
+# makefile.sgi: builds clustalw (command line) and clustalx (X interface).
+# `install` is the first target and therefore the default goal; it only
+# builds both executables, nothing is copied anywhere.
+install: clustalx clustalw
+
+# Remove object files (rm reports an error when there are none).
+clean:
+ rm *.o
+
+# Objects linked into both executables.
+OBJECTS = interface.o sequence.o showpair.o malign.o \
+ util.o trees.o gcgcheck.o prfalign.o pairalign.o \
+ calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
+ readmat.o alnscore.o random.o
+
+# Additional objects linked into clustalx only.
+XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
+
+HEADERS = general.h clustalw.h
+
+# CFLAGS/LFLAGS apply to every compile/link; CXFLAGS/LXFLAGS are added for
+# the clustalx sources and link.  NCBI_INC/NCBI_LIB point at the headers and
+# libraries used for -lvibrant/-lncbi (presumably site-specific - adjust).
+CC = cc
+CFLAGS = -c -O
+LFLAGS = -O -lm
+NCBI_INC = /biolo/ncbi/include
+NCBI_LIB = /biolo/ncbi/lib
+CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
+LXFLAGS = -L$(NCBI_LIB) -L/usr/ccs/lib/ -lvibrant -lncbi -lXm -lXt -lX11 -lXmu -lm
+
+# Command-line executable: shared objects plus amenu.o and clustalw.o.
+clustalw : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+amenu.o : amenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+# X executable: shared objects plus the X objects and clustalx.o.
+clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+ $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
+
+# The X sources are compiled with the extra CXFLAGS.
+clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xmenu.o : xmenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xdisplay.o : xdisplay.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xutils.o : xutils.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xcolor.o : xcolor.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xscore.o : xscore.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+readmat.o : readmat.c $(HEADERS) matrices.h
+ $(CC) $(CFLAGS) $*.c
+
+trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+# Suffix rule for every object without an explicit rule above
+# (this includes interface.o in this makefile).
+.c.o :
+ $(CC) $(CFLAGS) $?
+
Added: trunk/packages/clustalw/branches/upstream/current/makefile.sun
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/makefile.sun 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/makefile.sun 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,61 @@
+# makefile.sun: builds clustalw (command line) and clustalx (X interface).
+# `install` is the first target and therefore the default goal; it only
+# builds both executables, nothing is copied anywhere.
+install: clustalx clustalw
+
+# Remove object files (rm reports an error when there are none).
+clean:
+ rm *.o
+
+# Objects linked into both executables.
+OBJECTS = interface.o sequence.o showpair.o malign.o \
+ util.o trees.o gcgcheck.o prfalign.o pairalign.o \
+ calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
+ readmat.o alnscore.o random.o
+
+# Additional objects linked into clustalx only.
+XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
+
+HEADERS = general.h clustalw.h
+
+# CFLAGS/LFLAGS apply to every compile/link; CXFLAGS/LXFLAGS are added for
+# the clustalx sources and link.  NCBI_INC/NCBI_LIB and the SUNWmotif
+# directories are hard-coded (presumably site-specific - adjust).
+# LXFLAGS places -lvibrant and -lncbi between -Bstatic and -Bdynamic.
+CC = cc
+CFLAGS = -c -O
+LFLAGS = -O -lm
+NCBI_INC = /workbench/include/ncbi
+NCBI_LIB = /workbench/lib/ncbi
+CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC) -I/opt/SUNWmotif/include
+LXFLAGS = -L$(NCBI_LIB) -L/usr/ccs/lib/ -L/opt/SUNWmotif/lib -Bstatic -lvibrant -lncbi -Bdynamic -lXm -lXmu -Bdynamic -lXt -lX11 -lgen
+
+# Command-line executable: shared objects plus amenu.o and clustalw.o.
+clustalw : $(OBJECTS) amenu.o clustalw.o
+ $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+interface.o : interface.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+amenu.o : amenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $*.c
+
+# X executable: shared objects plus the X objects and clustalx.o.
+clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+ $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
+
+# The X sources are compiled with the extra CXFLAGS.
+clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xmenu.o : xmenu.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xdisplay.o : xdisplay.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xutils.o : xutils.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xcolor.o : xcolor.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+xscore.o : xscore.c $(HEADERS) param.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+
+readmat.o : readmat.c $(HEADERS) matrices.h
+ $(CC) $(CFLAGS) $*.c
+
+trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+# Suffix rule for every object without an explicit rule above.
+.c.o :
+ $(CC) $(CFLAGS) $?
+
Added: trunk/packages/clustalw/branches/upstream/current/malign.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/malign.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/malign.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,654 @@
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "clustalw.h"
+
+
+/*
+ * Prototypes
+ */
+
+/*
+ * Global Variables
+ */
+
+/* pairwise matrix, indexed [1..nseqs][1..nseqs] by the code in this file;
+   palign1() stores (100 - percent identity)/100 in it and palign2()
+   rewrites it in place as 100 - 100*value */
+extern double **tmat;
+extern Boolean no_weights;
+extern sint debug;
+extern sint max_aa;
+extern sint nseqs;
+extern sint profile1_nseqs;
+extern sint nsets;
+/* allocated and released inside malign() as (nseqs+1) rows of (nseqs+1) */
+extern sint **sets;
+extern sint divergence_cutoff;
+/* per-sequence weights, indexed 0..nseqs-1 by the code in this file */
+extern sint *seq_weight;
+/* output_index is indexed 1..nseqs by the code in this file */
+extern sint output_order, *output_index;
+extern Boolean distance_tree;
+extern char seqname[];
+/* both indexed by sequence number starting at 1; countid() reads the
+   residues of a sequence from position 1 upwards */
+extern sint *seqlen_array;
+extern char **seq_array;
+
+/*
+ * malign - full progressive multiple alignment of sequences 1..nseqs.
+ *
+ * istart       0: group the sequences with create_sets() and align the
+ *              groups whose members pass divergence_cutoff first (first
+ *              pass); otherwise sequences 1..istart+1 are marked as already
+ *              aligned and only the second pass is run.
+ * phylip_name  tree file name handed to read_tree().
+ *
+ * The second pass repeatedly picks the unaligned sequence with the highest
+ * tmat value against any aligned sequence and adds it with prfalign().
+ *
+ * Returns nseqs on completion, 0 when read_tree() or calc_similarities()
+ * returns 0, and -1 when prfalign() returns a negative score during the
+ * first pass (everything allocated up to that point is released first).
+ */
+sint malign(sint istart,char *phylip_name) /* full progressive alignment*/
+{
+	static sint *aligned;
+	static sint *group;
+	static sint ix;
+
+	sint *maxid, max, sum;
+	sint *tree_weight;
+	sint i,j,set,iseq=0;
+	sint status,entries;
+	lint score = 0;
+
+
+	info("Start of Multiple Alignment");
+
+/* get the phylogenetic tree from *.ph */
+
+	if (nseqs >= 2)
+	{
+		status = read_tree(phylip_name, (sint)0, nseqs);
+		if (status == 0) return((sint)0);
+	}
+
+/* calculate sequence weights according to branch lengths of the tree -
+   weights in global variable seq_weight normalised to sum to 100 */
+
+	calc_seq_weights((sint)0, nseqs, seq_weight);
+
+/* recalculate tmat matrix as percent similarity matrix */
+
+	status = calc_similarities(nseqs);
+	if (status == 0) return((sint)0);
+
+/* for each sequence, find the most closely related sequence */
+
+	maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+	for (i=1;i<=nseqs;i++)
+	{
+		maxid[i] = -1;
+		for (j=1;j<=nseqs;j++)
+			if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
+	}
+
+/* group the sequences according to their relative divergence */
+
+	if (istart == 0)
+	{
+		sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );
+		for(i=0;i<=nseqs;i++)
+			sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+
+		create_sets((sint)0,nseqs);
+		info("There are %d groups",(pint)nsets);
+
+/* clear the memory used for the phylogenetic tree */
+
+		if (nseqs >= 2)
+			clear_tree(NULL);
+
+/* start the multiple alignments......... */
+
+		info("Aligning...");
+
+/* first pass, align closely related sequences first.... */
+
+		ix = 0;
+		aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+		for (i=0;i<=nseqs;i++) aligned[i] = 0;
+
+		for(set=1;set<=nsets;++set)
+		{
+			entries=0;
+			for (i=1;i<=nseqs;i++)
+			{
+				if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
+				{
+					entries++;
+					if (aligned[i] == 0)
+					{
+						if (output_order==INPUT)
+						{
+							++ix;
+							output_index[i] = i;
+						}
+						else output_index[++ix] = i;
+						aligned[i] = 1;
+					}
+				}
+			}
+
+			if(entries > 0) score = prfalign(sets[set], aligned);
+			else score = 0;
+
+
+/* negative score means fatal error... release everything this call has
+   allocated so far (sets, aligned, maxid) and exit now! */
+
+			if (score < 0)
+			{
+				for (i=0;i<=nseqs;i++)
+					sets[i]=ckfree((void *)sets[i]);
+				sets=ckfree(sets);
+				aligned=ckfree((void *)aligned);
+				maxid=ckfree((void *)maxid);
+				return(-1);
+			}
+			if ((entries > 0) && (score > 0))
+				info("Group %d: Sequences:%4d Score:%d",
+				(pint)set,(pint)entries,(pint)score);
+			else
+				info("Group %d: Delayed",
+				(pint)set);
+		}
+
+		for (i=0;i<=nseqs;i++)
+			sets[i]=ckfree((void *)sets[i]);
+		sets=ckfree(sets);
+	}
+	else
+	{
+/* clear the memory used for the phylogenetic tree */
+
+		if (nseqs >= 2)
+			clear_tree(NULL);
+
+/* sequences 1..istart+1 count as aligned already, the rest do not */
+
+		aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+		ix = 0;
+		for (i=1;i<=istart+1;i++)
+		{
+			aligned[i] = 1;
+			++ix;
+			output_index[i] = i;
+		}
+		for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
+	}
+
+/* second pass - align remaining, more divergent sequences..... */
+
+/* if not all sequences were aligned, for each unaligned sequence,
+   find its closest pair amongst the aligned sequences. */
+
+	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+
+/* keep a copy of the tree weights: seq_weight is overwritten on every
+   iteration of the loop below */
+
+	tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
+	for (i=0;i<nseqs;i++)
+		tree_weight[i] = seq_weight[i];
+
+/* if we haven't aligned any sequences, in the first pass - align the
+two most closely related sequences now */
+	if(ix==0)
+	{
+		max = -1;
+		iseq = 0;
+		for (i=1;i<=nseqs;i++)
+		{
+			for (j=i+1;j<=nseqs;j++)
+			{
+				if (max < tmat[i][j])
+				{
+					max = tmat[i][j];
+					iseq = i;
+				}
+			}
+		}
+		aligned[iseq]=1;
+		if (output_order == INPUT)
+		{
+			++ix;
+			output_index[iseq] = iseq;
+		}
+		else
+			output_index[++ix] = iseq;
+	}
+
+	while (ix < nseqs)
+	{
+/* for every unaligned sequence, its best score against the aligned ones */
+
+		for (i=1;i<=nseqs;i++) {
+			if (aligned[i] == 0)
+			{
+				maxid[i] = -1;
+				for (j=1;j<=nseqs;j++)
+					if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
+						maxid[i] = tmat[i][j];
+			}
+		}
+/* find the most closely related sequence to those already aligned */
+
+		max = -1;
+		iseq = 0;
+		for (i=1;i<=nseqs;i++)
+		{
+			if ((aligned[i] == 0) && (maxid[i] > max))
+			{
+				max = maxid[i];
+				iseq = i;
+			}
+		}
+
+
+/* align this sequence to the existing alignment */
+/* weight sequences with percent identity with profile*/
+/* OR...., multiply sequence weights from tree by percent identity with new sequence */
+		if(no_weights==FALSE) {
+			for (j=0;j<nseqs;j++)
+				if (aligned[j+1] != 0)
+					seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
+/*
+  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+*/
+
+			sum = 0;
+			for (j=0;j<nseqs;j++)
+				if (aligned[j+1] != 0)
+					sum += seq_weight[j];
+			if (sum == 0)
+			{
+				for (j=0;j<nseqs;j++)
+					seq_weight[j] = 1;
+				sum = j;
+			}
+			for (j=0;j<nseqs;j++)
+				if (aligned[j+1] != 0)
+				{
+					seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
+					if (seq_weight[j] < 1) seq_weight[j] = 1;
+				}
+		}
+
+/* group 1 = sequences already aligned, group 2 = the sequence to add */
+
+		entries = 0;
+		for (j=1;j<=nseqs;j++)
+			if (aligned[j] != 0)
+			{
+				group[j] = 1;
+				entries++;
+			}
+			else if (iseq==j)
+			{
+				group[j] = 2;
+				entries++;
+			}
+		aligned[iseq] = 1;
+
+		score = prfalign(group, aligned);
+		info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
+		if (output_order == INPUT)
+		{
+			++ix;
+			output_index[iseq] = iseq;
+		}
+		else
+			output_index[++ix] = iseq;
+	}
+
+	group=ckfree((void *)group);
+	aligned=ckfree((void *)aligned);
+	maxid=ckfree((void *)maxid);
+	tree_weight=ckfree((void *)tree_weight);
+
+	aln_score();
+
+/* make the rest (output stuff) into routine clustal_out in file amenu.c */
+
+	return(nseqs);
+
+}
+
+/*
+ * seqalign - align new sequences, one at a time, to an existing profile.
+ *
+ * istart       sequences 1..istart+1 are marked as already aligned; the
+ *              remaining ones are added in order of decreasing tmat value
+ *              against the sequences aligned so far.
+ * phylip_name  tree file name handed to read_tree().
+ *
+ * Before each prfalign() call seq_weight is rebuilt as tree weight times
+ * tmat value with the sequence being added, then scaled so the weights of
+ * the aligned group sum to INT_SCALE_FACTOR (minimum weight 1).
+ *
+ * Returns nseqs on completion, 0 when read_tree() or calc_similarities()
+ * returns 0.
+ */
+sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */
+{
+	static sint *aligned, *tree_weight;
+	static sint *group;
+	static sint ix;
+
+	sint *maxid, max;
+	sint i,j,status;
+	sint iseq = 0;	/* defined even if the first search finds no candidate */
+	sint sum,entries;
+	lint score = 0;
+
+
+	info("Start of Multiple Alignment");
+
+/* get the phylogenetic tree from *.ph */
+
+	if (nseqs >= 2)
+	{
+		status = read_tree(phylip_name, (sint)0, nseqs);
+		if (status == 0) return(0);
+	}
+
+/* calculate sequence weights according to branch lengths of the tree -
+   weights in global variable seq_weight normalised to sum to 100 */
+
+	calc_seq_weights((sint)0, nseqs, seq_weight);
+
+/* keep a copy of the tree weights: seq_weight is overwritten for every
+   sequence that is added below */
+
+	tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
+	for (i=0;i<nseqs;i++)
+		tree_weight[i] = seq_weight[i];
+
+/* recalculate tmat matrix as percent similarity matrix */
+
+	status = calc_similarities(nseqs);
+	if (status == 0)
+	{
+/* do not leak the copy of the weights on the failure path */
+		tree_weight=ckfree((void *)tree_weight);
+		return((sint)0);
+	}
+
+/* for each sequence, find the most closely related sequence */
+
+	maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+	for (i=1;i<=nseqs;i++)
+	{
+		maxid[i] = -1;
+		for (j=1;j<=nseqs;j++)
+			if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
+	}
+
+/* clear the memory used for the phylogenetic tree */
+
+	if (nseqs >= 2)
+		clear_tree(NULL);
+
+	aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+	ix = 0;
+	for (i=1;i<=istart+1;i++)
+	{
+		aligned[i] = 1;
+		++ix;
+		output_index[i] = i;
+	}
+	for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
+
+/* for each unaligned sequence, find its closest pair amongst the
+   aligned sequences. */
+
+	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+
+	while (ix < nseqs)
+	{
+		if (ix > 0)
+		{
+			for (i=1;i<=nseqs;i++) {
+				if (aligned[i] == 0)
+				{
+					maxid[i] = -1;
+					for (j=1;j<=nseqs;j++)
+						if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
+							maxid[i] = tmat[i][j];
+				}
+			}
+		}
+
+/* find the most closely related sequence to those already aligned */
+
+		max = -1;
+		for (i=1;i<=nseqs;i++)
+		{
+			if ((aligned[i] == 0) && (maxid[i] > max))
+			{
+				max = maxid[i];
+				iseq = i;
+			}
+		}
+
+/* align this sequence to the existing alignment */
+/* group 1 = sequences already aligned, group 2 = the sequence to add */
+
+		entries = 0;
+		for (j=1;j<=nseqs;j++)
+			if (aligned[j] != 0)
+			{
+				group[j] = 1;
+				entries++;
+			}
+			else if (iseq==j)
+			{
+				group[j] = 2;
+				entries++;
+			}
+		aligned[iseq] = 1;
+
+
+/* EITHER....., set sequence weights equal to percent identity with new sequence */
+/*
+		for (j=0;j<nseqs;j++)
+			seq_weight[j] = tmat[j+1][iseq];
+*/
+/* OR...., multiply sequence weights from tree by percent identity with new sequence */
+		for (j=0;j<nseqs;j++)
+			seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
+if (debug>1)
+		for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]);
+/*
+  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+*/
+
+		sum = 0;
+		for (j=0;j<nseqs;j++)
+			if (group[j+1] == 1) sum += seq_weight[j];
+		if (sum == 0)
+		{
+			for (j=0;j<nseqs;j++)
+				seq_weight[j] = 1;
+			sum = j;
+		}
+		for (j=0;j<nseqs;j++)
+		{
+			seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
+			if (seq_weight[j] < 1) seq_weight[j] = 1;
+		}
+
+if (debug > 1) {
+		fprintf(stdout,"new weights\n");
+		for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
+}
+
+		score = prfalign(group, aligned);
+		info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
+		if (output_order == INPUT)
+		{
+			++ix;
+			output_index[iseq] = iseq;
+		}
+		else
+			output_index[++ix] = iseq;
+	}
+
+	group=ckfree((void *)group);
+	aligned=ckfree((void *)aligned);
+	maxid=ckfree((void *)maxid);
+/* previously never released: one nseqs-sized array leaked per call */
+	tree_weight=ckfree((void *)tree_weight);
+
+	aln_score();
+
+/* make the rest (output stuff) into routine clustal_out in file amenu.c */
+
+	return(nseqs);
+
+}
+
+
+/*
+ * palign1 - initial alignment of profile 1 against the remaining sequences.
+ *
+ * Every sequence gets the same weight (INT_SCALE_FACTOR/nseqs), sequences
+ * 1..profile1_nseqs form group 1 and the rest form group 2, and the two
+ * groups are aligned with one prfalign() call.  Afterwards tmat is filled
+ * symmetrically with (100 - countid(i,j))/100 for every pair i < j.
+ *
+ * Always returns nseqs.
+ */
+sint palign1(void) /* a profile alignment */
+{
+	sint *group, *aligned;
+	sint row, col;
+	sint equal_weight;
+	sint entries;
+	float pcid;
+	lint score;
+
+	info("Start of Initial Alignment");
+
+/* no tree is used here: all sequences share one equal weight */
+
+	equal_weight = INT_SCALE_FACTOR/nseqs;
+	row = 0;
+	while (row < nseqs)
+	{
+		seq_weight[row] = equal_weight;
+		row++;
+	}
+
+	distance_tree = FALSE;
+
+/* profile 1 is group 1, everything after it is group 2 */
+
+	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+
+	row = 1;
+	while (row <= profile1_nseqs)
+	{
+		group[row] = 1;
+		++row;
+	}
+	row = profile1_nseqs+1;
+	while (row <= nseqs)
+	{
+		group[row] = 2;
+		++row;
+	}
+	entries = nseqs;
+
+/* every sequence takes part in the alignment */
+
+	aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+	row = 1;
+	while (row <= nseqs)
+	{
+		aligned[row] = 1;
+		row++;
+	}
+
+	score = prfalign(group, aligned);
+	info("Sequences:%d Score:%d",(pint)entries,(pint)score);
+	group=ckfree((void *)group);
+	aligned=ckfree((void *)aligned);
+
+/* rebuild tmat from the percent identities of the aligned sequences */
+
+	for (row=1;row<=nseqs;row++)
+	{
+		for (col=row+1;col<=nseqs;col++)
+		{
+			pcid = countid(row,col);
+			tmat[row][col] = ((double)100.0 - (double)pcid)/(double)100.0;
+			tmat[col][row] = tmat[row][col];
+		}
+	}
+
+	return(nseqs);
+}
+
+/*
+ * countid - percent identity between sequences s1 and s2.
+ *
+ * Positions 1..min(seqlen_array[s1], seqlen_array[s2]) are compared.  Only
+ * positions where s1 holds a code in the range 0..max_aa-1 are counted;
+ * of those, the ones where s2 holds the same code are matches.
+ *
+ * Returns 100 * matches / counted positions, or 0 if nothing was counted.
+ */
+float countid(sint s1, sint s2)
+{
+	sint pos;
+	sint matches = 0;
+	sint compared = 0;
+
+	for (pos=1; pos<=seqlen_array[s1] && pos<=seqlen_array[s2]; pos++)
+	{
+		char res1 = seq_array[s1][pos];
+		char res2 = seq_array[s2][pos];
+
+/* skip positions where s1 has no code in 0..max_aa-1 */
+		if ((res1<0) || (res1>=max_aa))
+			continue;
+
+		compared++;
+		if (res1 == res2)
+			matches++;
+	}
+
+	if (compared==0)
+		return(0);
+
+	return(100.0 * (float)matches / (float)compared);
+}
+
+/*
+ * palign2 - align profile 1 (sequences 1..profile1_nseqs) against
+ * profile 2 (the remaining sequences) using tree-based weights.
+ *
+ * p1_tree_name  tree file for profile 1, read when it has >= 2 sequences.
+ * p2_tree_name  tree file for profile 2, read when it has >= 2 sequences.
+ *
+ * Each sequence weight is its tree weight multiplied by its highest tmat
+ * value (after conversion of tmat to 100 - 100*value) against any sequence
+ * of the other profile; the weights are then scaled to sum to
+ * INT_SCALE_FACTOR (minimum weight 1).  output_index is reset to 1..nseqs.
+ *
+ * Returns nseqs on completion, 0 when read_tree() returns 0 for either
+ * tree.
+ */
+sint palign2(char *p1_tree_name,char *p2_tree_name) /* a profile alignment */
+{
+	sint i,j,sum,entries,status;
+	lint score;
+	sint *aligned, *group;
+	sint *maxid,*p1_weight,*p2_weight;
+
+	info("Start of Multiple Alignment");
+
+/* get the phylogenetic trees from *.ph */
+
+	if (profile1_nseqs >= 2)
+	{
+		status = read_tree(p1_tree_name, (sint)0, profile1_nseqs);
+		if (status == 0) return(0);
+	}
+
+/* calculate sequence weights according to branch lengths of the tree -
+   weights in global variable seq_weight normalised to sum to 100 */
+
+	p1_weight = (sint *) ckalloc( (profile1_nseqs) * sizeof(sint) );
+
+	calc_seq_weights((sint)0, profile1_nseqs, p1_weight);
+
+/* clear the memory for the phylogenetic tree */
+
+	if (profile1_nseqs >= 2)
+		clear_tree(NULL);
+
+	if (nseqs-profile1_nseqs >= 2)
+	{
+		status = read_tree(p2_tree_name, profile1_nseqs, nseqs);
+		if (status == 0)
+		{
+/* the profile 1 weights are already allocated: do not leak them */
+			p1_weight=ckfree((void *)p1_weight);
+			return(0);
+		}
+	}
+
+/* p2_weight is sized for all nseqs sequences; only the entries from
+   profile1_nseqs upwards are read below */
+
+	p2_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
+
+	calc_seq_weights(profile1_nseqs,nseqs, p2_weight);
+
+
+/* clear the memory for the phylogenetic tree */
+
+	if (nseqs-profile1_nseqs >= 2)
+		clear_tree(NULL);
+
+/* convert tmat distances to similarities */
+
+	for (i=1;i<nseqs;i++)
+		for (j=i+1;j<=nseqs;j++) {
+			tmat[i][j]=100.0-tmat[i][j]*100.0;
+			tmat[j][i]=tmat[i][j];
+		}
+
+
+/* weight sequences with max percent identity with other profile*/
+/* (maxid and the weight arrays are 0-based, tmat is 1-based) */
+
+	maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+	for (i=0;i<profile1_nseqs;i++) {
+		maxid[i] = 0;
+		for (j=profile1_nseqs+1;j<=nseqs;j++)
+			if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
+		seq_weight[i] = maxid[i]*p1_weight[i];
+	}
+
+	for (i=profile1_nseqs;i<nseqs;i++) {
+		maxid[i] = -1;
+		for (j=1;j<=profile1_nseqs;j++)
+			if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
+		seq_weight[i] = maxid[i]*p2_weight[i];
+	}
+/*
+  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+*/
+
+	sum = 0;
+	for (j=0;j<nseqs;j++)
+		sum += seq_weight[j];
+	if (sum == 0)
+	{
+		for (j=0;j<nseqs;j++)
+			seq_weight[j] = 1;
+		sum = j;
+	}
+	for (j=0;j<nseqs;j++)
+	{
+		seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
+		if (seq_weight[j] < 1) seq_weight[j] = 1;
+	}
+if (debug > 1) {
+	fprintf(stdout,"new weights\n");
+	for (j=0;j<nseqs;j++) fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
+}
+
+
+/* do the alignment......... */
+
+	info("Aligning...");
+
+/* profile 1 is group 1, profile 2 is group 2 */
+
+	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
+
+	for(i=1; i<=profile1_nseqs; ++i)
+		group[i] = 1;
+	for(i=profile1_nseqs+1; i<=nseqs; ++i)
+		group[i] = 2;
+	entries = nseqs;
+
+	aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+	for (i=1;i<=nseqs;i++) aligned[i] = 1;
+
+	score = prfalign(group, aligned);
+	info("Sequences:%d Score:%d",(pint)entries,(pint)score);
+	group=ckfree((void *)group);
+	p1_weight=ckfree((void *)p1_weight);
+	p2_weight=ckfree((void *)p2_weight);
+	aligned=ckfree((void *)aligned);
+	maxid=ckfree((void *)maxid);
+
+/* DES	output_index = (int *)ckalloc( (nseqs+1) * sizeof (int)); */
+	for (i=1;i<=nseqs;i++) output_index[i] = i;
+
+	return(nseqs);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/matrices.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/matrices.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/matrices.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,854 @@
+/* 23 residue letters; presumably the row/column order of the triangular
+   score tables defined below (each has 23 rows) -- confirm in the code
+   that reads them */
+char *amino_acid_order = "ABCDEFGHIKLMNPQRSTVWXYZ";
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are BLOSUM30 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short blosum30mt[]={
+ 4,
+ 0, 5,
+ -3, -2, 17,
+ 0, 5, -3, 9,
+ 0, 0, 1, 1, 6,
+ -2, -3, -3, -5, -4, 10,
+ 0, 0, -4, -1, -2, -3, 8,
+ -2, -2, -5, -2, 0, -3, -3, 14,
+ 0, -2, -2, -4, -3, 0, -1, -2, 6,
+ 0, 0, -3, 0, 2, -1, -1, -2, -2, 4,
+ -1, -1, 0, -1, -1, 2, -2, -1, 2, -2, 4,
+ 1, -2, -2, -3, -1, -2, -2, 2, 1, 2, 2, 6,
+ 0, 4, -1, 1, -1, -1, 0, -1, 0, 0, -2, 0, 8,
+ -1, -2, -3, -1, 1, -4, -1, 1, -3, 1, -3, -4, -3, 11,
+ 1, -1, -2, -1, 2, -3, -2, 0, -2, 0, -2, -1, -1, 0, 8,
+ -1, -2, -2, -1, -1, -1, -2, -1, -3, 1, -2, 0, -2, -1, 3, 8,
+ 1, 0, -2, 0, 0, -1, 0, -1, -1, 0, -2, -2, 0, -1, -1, -1, 4,
+ 1, 0, -2, -1, -2, -2, -2, -2, 0, -1, 0, 0, 1, 0, 0, -3, 2, 5,
+ 1, -2, -2, -2, -3, 1, -3, -3, 4, -2, 1, 0, -2, -4, -3, -1, -1, 1, 5,
+ -5, -5, -2, -4, -1, 1, 1, -5, -3, -2, -2, -3, -7, -3, -1, 0, -3, -5, -3, 20,
+ 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -2, -1,
+ -4, -3, -6, -1, -2, 3, -3, 0, -1, -1, 3, -1, -4, -2, -1, 0, -2, -1, 1, 5, -1, 9,
+ 0, 0, 0, 0, 5, -4, -2, 0, -3, 1, -1, -1, -1, 0, 4, 0, -1, -1, -3, -1, 0, -2, 4};
+
+/*
+short blosum35mt[]={
+ 5,
+ -1, 5,
+ -2, -2, 15,
+ -1, 5, -3, 8,
+ -1, 0, -1, 2, 6,
+ -2, -2, -4, -3, -3, 8,
+ 0, 0, -3, -2, -2, -3, 7,
+ -2, 0, -4, 0, -1, -3, -2, 12,
+ -1, -2, -4, -3, -3, 1, -3, -3, 5,
+ 0, 0, -2, -1, 1, -1, -1, -2, -2, 5,
+ -2, -2, -2, -2, -1, 2, -3, -2, 2, -2, 5,
+ 0, -2, -4, -3, -2, 0, -1, 1, 1, 0, 3, 6,
+ -1, 4, -1, 1, -1, -1, 1, 1, -1, 0, -2, -1, 7,
+ -2, -1, -4, -1, 0, -4, -2, -1, -1, 0, -3, -3, -2, 10,
+ 0, 0, -3, -1, 2, -4, -2, -1, -2, 0, -2, -1, 1, 0, 7,
+ -1, -1, -3, -1, -1, -1, -2, -1, -3, 2, -2, 0, -1, -2, 2, 8,
+ 1, 0, -3, -1, 0, -1, 1, -1, -2, 0, -2, -1, 0, -2, 0, -1, 4,
+ 0, -1, -1, -1, -1, -1, -2, -2, -1, 0, 0, 0, 0, 0, 0, -2, 2, 5,
+ 0, -2, -2, -2, -2, 1, -3, -4, 4, -2, 2, 1, -2, -3, -3, -1, -1, 1, 5,
+ -2, -3, -5, -3, -1, 1, -1, -4, -1, 0, 0, 1, -2, -4, -1, 0, -2, -2, -2, 16,
+ 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1,
+ -1, -2, -5, -2, -1, 3, -2, 0, 0, -1, 0, 0, -2, -3, 0, 0, -1, -2, 0, 3, -1, 8,
+ -1, 0, -2, 1, 5, -3, -2, -1, -3, 1, -2, -2, 0, 0, 4, 0, 0, -1, -2, -1, 0, -1, 4};
+*/
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are BLOSUM40 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short blosum40mt[]={
+ 5,
+ -1, 5,
+ -2, -2, 16,
+ -1, 6, -2, 9,
+ -1, 1, -2, 2, 7,
+ -3, -3, -2, -4, -3, 9,
+ 1, -1, -3, -2, -3, -3, 8,
+ -2, 0, -4, 0, 0, -2, -2, 13,
+ -1, -3, -4, -4, -4, 1, -4, -3, 6,
+ -1, 0, -3, 0, 1, -3, -2, -1, -3, 6,
+ -2, -3, -2, -3, -2, 2, -4, -2, 2, -2, 6,
+ -1, -3, -3, -3, -2, 0, -2, 1, 1, -1, 3, 7,
+ -1, 4, -2, 2, -1, -3, 0, 1, -2, 0, -3, -2, 8,
+ -2, -2, -5, -2, 0, -4, -1, -2, -2, -1, -4, -2, -2, 11,
+ 0, 0, -4, -1, 2, -4, -2, 0, -3, 1, -2, -1, 1, -2, 8,
+ -2, -1, -3, -1, -1, -2, -3, 0, -3, 3, -2, -1, 0, -3, 2, 9,
+ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -3, -2, 1, -1, 1, -1, 5,
+ 0, 0, -1, -1, -1, -1, -2, -2, -1, 0, -1, -1, 0, 0, -1, -2, 2, 6,
+ 0, -3, -2, -3, -3, 0, -4, -4, 4, -2, 2, 1, -3, -3, -3, -2, -1, 1, 5,
+ -3, -4, -6, -5, -2, 1, -2, -5, -3, -2, -1, -2, -4, -4, -1, -2, -5, -4, -3, 19,
+ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -2, -1, -1, 0, 0, -1, -2, -1,
+ -2, -3, -4, -3, -2, 4, -3, 2, 0, -1, 0, 1, -2, -3, -1, -1, -2, -1, -1, 3, -1, 9,
+ -1, 2, -3, 1, 5, -4, -2, 0, -4, 1, -2, -2, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 5};
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are BLOSUM45 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short blosum45mt[]={
+ 5,
+ -1, 4,
+ -1, -2, 12,
+ -2, 5, -3, 7,
+ -1, 1, -3, 2, 6,
+ -2, -3, -2, -4, -3, 8,
+ 0, -1, -3, -1, -2, -3, 7,
+ -2, 0, -3, 0, 0, -2, -2, 10,
+ -1, -3, -3, -4, -3, 0, -4, -3, 5,
+ -1, 0, -3, 0, 1, -3, -2, -1, -3, 5,
+ -1, -3, -2, -3, -2, 1, -3, -2, 2, -3, 5,
+ -1, -2, -2, -3, -2, 0, -2, 0, 2, -1, 2, 6,
+ -1, 4, -2, 2, 0, -2, 0, 1, -2, 0, -3, -2, 6,
+ -1, -2, -4, -1, 0, -3, -2, -2, -2, -1, -3, -2, -2, 9,
+ -1, 0, -3, 0, 2, -4, -2, 1, -2, 1, -2, 0, 0, -1, 6,
+ -2, -1, -3, -1, 0, -2, -2, 0, -3, 3, -2, -1, 0, -2, 1, 7,
+ 1, 0, -1, 0, 0, -2, 0, -1, -2, -1, -3, -2, 1, -1, 0, -1, 4,
+ 0, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5,
+ 0, -3, -1, -3, -3, 0, -3, -3, 3, -2, 1, 1, -3, -3, -3, -2, -1, 0, 5,
+ -2, -4, -5, -4, -3, 1, -2, -3, -2, -2, -2, -2, -4, -3, -2, -2, -4, -3, -3, 15,
+ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -2, -1,
+ -2, -2, -3, -2, -2, 3, -3, 2, 0, -1, 0, 0, -2, -3, -1, -1, -2, -1, -1, 3, -1, 8,
+ -1, 2, -3, 1, 4, -3, -2, 0, -3, 1, -2, -1, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 4};
+
+/*
+short blosum50mt[]={
+ 5,
+ -2, 5,
+ -1, -3, 13,
+ -2, 5, -4, 8,
+ -1, 1, -3, 2, 6,
+ -3, -4, -2, -5, -3, 8,
+ 0, -1, -3, -1, -3, -4, 8,
+ -2, 0, -3, -1, 0, -1, -2, 10,
+ -1, -4, -2, -4, -4, 0, -4, -4, 5,
+ -1, 0, -3, -1, 1, -4, -2, 0, -3, 6,
+ -2, -4, -2, -4, -3, 1, -4, -3, 2, -3, 5,
+ -1, -3, -2, -4, -2, 0, -3, -1, 2, -2, 3, 7,
+ -1, 4, -2, 2, 0, -4, 0, 1, -3, 0, -4, -2, 7,
+ -1, -2, -4, -1, -1, -4, -2, -2, -3, -1, -4, -3, -2, 10,
+ -1, 0, -3, 0, 2, -4, -2, 1, -3, 2, -2, 0, 0, -1, 7,
+ -2, -1, -4, -2, 0, -3, -3, 0, -4, 3, -3, -2, -1, -3, 1, 7,
+ 1, 0, -1, 0, -1, -3, 0, -1, -3, 0, -3, -2, 1, -1, 0, -1, 5,
+ 0, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5,
+ 0, -4, -1, -4, -3, -1, -4, -4, 4, -3, 1, 1, -3, -3, -3, -3, -2, 0, 5,
+ -3, -5, -5, -5, -3, 1, -3, -3, -3, -3, -2, -1, -4, -4, -1, -3, -4, -3, -3, 15,
+ -1, -1, -2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, 0, -1, -3, -1,
+ -2, -3, -3, -3, -2, 4, -3, 2, -1, -2, -1, 0, -2, -3, -1, -1, -2, -2, -1, 2, -1, 8,
+ -1, 2, -3, 1, 5, -4, -2, 0, -3, 1, -3, -1, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 5};
+
+short blosum55mt[]={
+ 5,
+ -2, 5,
+ 0, -4, 13,
+ -2, 5, -4, 8,
+ -1, 1, -4, 2, 7,
+ -3, -5, -3, -5, -4, 9,
+ 0, -1, -3, -2, -3, -4, 8,
+ -2, 0, -4, -1, -1, -1, -2, 11,
+ -2, -4, -2, -4, -4, 0, -5, -4, 6,
+ -1, 0, -4, -1, 1, -4, -2, 0, -4, 6,
+ -2, -4, -2, -5, -4, 1, -5, -3, 2, -3, 6,
+ -1, -3, -2, -4, -3, 0, -3, -2, 2, -2, 3, 8,
+ -2, 4, -3, 2, 0, -4, 0, 1, -4, 0, -4, -3, 8,
+ -1, -2, -3, -2, -1, -5, -3, -3, -3, -1, -4, -3, -2, 10,
+ -1, 0, -4, 0, 2, -4, -2, 1, -4, 2, -3, 0, 0, -1, 7,
+ -2, -1, -4, -2, 0, -3, -3, 0, -4, 3, -3, -2, -1, -3, 1, 8,
+ 2, 0, -1, 0, 0, -3, 0, -1, -3, 0, -3, -2, 1, -1, 0, -1, 5,
+ 0, -1, -1, -1, -1, -3, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 2, 6,
+ 0, -4, -1, -4, -3, -1, -4, -4, 4, -3, 1, 1, -4, -3, -3, -3, -2, 0, 5,
+ -4, -5, -4, -5, -3, 2, -3, -3, -3, -4, -3, -2, -5, -5, -2, -3, -4, -3, -4, 15,
+ -1, -1, -2, -2, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
+ -2, -3, -3, -3, -2, 4, -4, 2, -1, -2, -1, -1, -2, -4, -1, -2, -2, -2, -2, 3, -1, 9,
+ -1, 2, -4, 1, 5, -4, -3, 0, -4, 1, -3, -2, 0, -1, 4, 0, 0, -1, -3, -3, -1, -2, 5};
+
+
+short blosum62mt[]={
+ 4,
+ -2, 4,
+ 0, -3, 9,
+ -2, 4, -3, 6,
+ -1, 1, -4, 2, 5,
+ -2, -3, -2, -3, -3, 6,
+ 0, -1, -3, -1, -2, -3, 6,
+ -2, 0, -3, -1, 0, -1, -2, 8,
+ -1, -3, -1, -3, -3, 0, -4, -3, 4,
+ -1, 0, -3, -1, 1, -3, -2, -1, -3, 5,
+ -1, -4, -1, -4, -3, 0, -4, -3, 2, -2, 4,
+ -1, -3, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5,
+ -2, 3, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6,
+ -1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7,
+ -1, 0, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5,
+ -1, -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5,
+ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4,
+ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5,
+ 0, -3, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4,
+ -3, -4, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11,
+ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1,
+ -2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, -1, 7,
+ -1, 1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4};
+*/
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * The rows spot-checked equal twice the matching rows of the commented-out
+ * blosum62mt table above.  Rows are presumably in amino_acid_order --
+ * confirm against the code that reads the table.
+ */
+short blosum62mt2[]={
+ 8,
+ -4, 8,
+ 0, -6, 18,
+ -4, 8, -6, 12,
+ -2, 2, -8, 4, 10,
+ -4, -6, -4, -6, -6, 12,
+ 0, -2, -6, -2, -4, -6, 12,
+ -4, 0, -6, -2, 0, -2, -4, 16,
+ -2, -6, -2, -6, -6, 0, -8, -6, 8,
+ -2, 0, -6, -2, 2, -6, -4, -2, -6, 10,
+ -2, -8, -2, -8, -6, 0, -8, -6, 4, -4, 8,
+ -2, -6, -2, -6, -4, 0, -6, -4, 2, -2, 4, 10,
+ -4, 6, -6, 2, 0, -6, 0, 2, -6, 0, -6, -4, 12,
+ -2, -4, -6, -2, -2, -8, -4, -4, -6, -2, -6, -4, -4, 14,
+ -2, 0, -6, 0, 4, -6, -4, 0, -6, 2, -4, 0, 0, -2, 10,
+ -2, -2, -6, -4, 0, -6, -4, 0, -6, 4, -4, -2, 0, -4, 2, 10,
+ 2, 0, -2, 0, 0, -4, 0, -2, -4, 0, -4, -2, 2, -2, 0, -2, 8,
+ 0, -2, -2, -2, -2, -4, -4, -4, -2, -2, -2, -2, 0, -2, -2, -2, 2, 10,
+ 0, -6, -2, -6, -4, -2, -6, -6, 6, -4, 2, 2, -6, -4, -4, -6, -4, 0, 8,
+ -6, -8, -4, -8, -6, 2, -4, -4, -6, -6, -4, -2, -8, -8, -4, -6, -6, -4, -6, 22,
+ 0, -2, -4, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -4, -2, -2, 0, 0, -2, -4, -2,
+ -4, -6, -4, -6, -4, 6, -6, 4, -2, -4, -2, -2, -4, -6, -2, -4, -4, -4, -2, 4, -2, 14,
+ -2, 2, -6, 2, 8, -6, -4, 0, -6, 2, -6, -2, 0, -2, 6, 0, 0, -2, -4, -6, -2, -4, 8};
+
+/*
+short blosum65mt[]={
+ 4,
+ -2, 4,
+ 0, -3, 9,
+ -2, 4, -4, 6,
+ -1, 1, -4, 2, 5,
+ -2, -3, -2, -4, -3, 6,
+ 0, -1, -3, -1, -2, -3, 6,
+ -2, 0, -3, -1, 0, -1, -2, 8,
+ -1, -3, -1, -3, -3, 0, -4, -3, 4,
+ -1, 0, -3, -1, 1, -3, -2, -1, -3, 5,
+ -2, -4, -1, -4, -3, 0, -4, -3, 2, -3, 4,
+ -1, -3, -2, -3, -2, 0, -3, -2, 1, -2, 2, 6,
+ -2, 3, -3, 1, 0, -3, -1, 1, -3, 0, -4, -2, 6,
+ -1, -2, -3, -2, -1, -4, -2, -2, -3, -1, -3, -3, -2, 8,
+ -1, 0, -3, 0, 2, -3, -2, 1, -3, 1, -2, 0, 0, -1, 6,
+ -1, -1, -4, -2, 0, -3, -2, 0, -3, 2, -2, -2, 0, -2, 1, 6,
+ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -3, -2, 1, -1, 0, -1, 4,
+ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5,
+ 0, -3, -1, -3, -3, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4,
+ -3, -4, -2, -5, -3, 1, -3, -2, -2, -3, -2, -2, -4, -4, -2, -3, -3, -3, -3, 10,
+ -1, -1, -2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -2, -1,
+ -2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -2, -2, -2, -2, -1, 2, -1, 7,
+ -1, 1, -4, 1, 4, -3, -2, 0, -3, 1, -3, -2, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4};
+
+short blosum70mt[]={
+ 4,
+ -2, 4,
+ -1, -4, 9,
+ -2, 4, -4, 6,
+ -1, 1, -4, 1, 5,
+ -2, -4, -2, -4, -4, 6,
+ 0, -1, -3, -2, -2, -4, 6,
+ -2, -1, -4, -1, 0, -1, -2, 8,
+ -2, -4, -1, -4, -4, 0, -4, -4, 4,
+ -1, -1, -4, -1, 1, -3, -2, -1, -3, 5,
+ -2, -4, -2, -4, -3, 0, -4, -3, 2, -3, 4,
+ -1, -3, -2, -3, -2, 0, -3, -2, 1, -2, 2, 6,
+ -2, 3, -3, 1, 0, -3, -1, 0, -4, 0, -4, -2, 6,
+ -1, -2, -3, -2, -1, -4, -3, -2, -3, -1, -3, -3, -2, 8,
+ -1, 0, -3, -1, 2, -3, -2, 1, -3, 1, -2, 0, 0, -2, 6,
+ -2, -1, -4, -2, 0, -3, -3, 0, -3, 2, -3, -2, -1, -2, 1, 6,
+ 1, 0, -1, 0, 0, -3, -1, -1, -3, 0, -3, -2, 0, -1, 0, -1, 4,
+ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 1, 5,
+ 0, -3, -1, -4, -3, -1, -4, -3, 3, -3, 1, 1, -3, -3, -2, -3, -2, 0, 4,
+ -3, -4, -3, -5, -4, 1, -3, -2, -3, -3, -2, -2, -4, -4, -2, -3, -3, -3, -3, 11,
+ -1, -1, -2, -2, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
+ -2, -3, -3, -4, -3, 3, -4, 2, -1, -2, -1, -1, -2, -3, -2, -2, -2, -2, -2, 2, -2, 7,
+ -1, 0, -4, 1, 4, -4, -2, 0, -3, 1, -3, -2, 0, -1, 3, 0, 0, -1, -3, -3, -1, -2, 4};
+
+short blosum75mt[]={
+ 4,
+ -2, 4,
+ -1, -4, 9,
+ -2, 4, -4, 6,
+ -1, 1, -5, 1, 5,
+ -3, -4, -2, -4, -4, 6,
+ 0, -1, -3, -2, -3, -4, 6,
+ -2, -1, -4, -1, 0, -2, -2, 8,
+ -2, -4, -1, -4, -4, 0, -5, -4, 4,
+ -1, -1, -4, -1, 1, -4, -2, -1, -3, 5,
+ -2, -4, -2, -4, -4, 0, -4, -3, 1, -3, 4,
+ -1, -3, -2, -4, -2, 0, -3, -2, 1, -2, 2, 6,
+ -2, 3, -3, 1, -1, -4, -1, 0, -4, 0, -4, -3, 6,
+ -1, -2, -4, -2, -1, -4, -3, -2, -3, -1, -3, -3, -3, 8,
+ -1, 0, -3, -1, 2, -4, -2, 1, -3, 1, -3, 0, 0, -2, 6,
+ -2, -1, -4, -2, 0, -3, -3, 0, -3, 2, -3, -2, -1, -2, 1, 6,
+ 1, 0, -1, -1, 0, -3, -1, -1, -3, 0, -3, -2, 0, -1, 0, -1, 5,
+ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 1, 5,
+ 0, -4, -1, -4, -3, -1, -4, -4, 3, -3, 1, 1, -3, -3, -2, -3, -2, 0, 4,
+ -3, -5, -3, -5, -4, 1, -3, -2, -3, -4, -2, -2, -4, -5, -2, -3, -3, -3, -3, 11,
+ -1, -2, -2, -2, -1, -2, -2, -1, -2, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
+ -2, -3, -3, -4, -3, 3, -4, 2, -2, -2, -1, -2, -3, -4, -2, -2, -2, -2, -2, 2, -2, 7,
+ -1, 0, -4, 1, 4, -4, -2, 0, -4, 1, -3, -2, 0, -2, 3, 0, 0, -1, -3, -3, -1, -3, 4};
+*/
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are BLOSUM80 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short blosum80mt[]={
+ 7,
+ -3, 6,
+ -1, -6, 13,
+ -3, 6, -7, 10,
+ -2, 1, -7, 2, 8,
+ -4, -6, -4, -6, -6, 10,
+ 0, -2, -6, -3, -4, -6, 9,
+ -3, -1, -7, -2, 0, -2, -4, 12,
+ -3, -6, -2, -7, -6, -1, -7, -6, 7,
+ -1, -1, -6, -2, 1, -5, -3, -1, -5, 8,
+ -3, -7, -3, -7, -6, 0, -7, -5, 2, -4, 6,
+ -2, -5, -3, -6, -4, 0, -5, -4, 2, -3, 3, 9,
+ -3, 5, -5, 2, -1, -6, -1, 1, -6, 0, -6, -4, 9,
+ -1, -4, -6, -3, -2, -6, -5, -4, -5, -2, -5, -4, -4, 12,
+ -2, -1, -5, -1, 3, -5, -4, 1, -5, 2, -4, -1, 0, -3, 9,
+ -3, -2, -6, -3, -1, -5, -4, 0, -5, 3, -4, -3, -1, -3, 1, 9,
+ 2, 0, -2, -1, -1, -4, -1, -2, -4, -1, -4, -3, 1, -2, -1, -2, 7,
+ 0, -1, -2, -2, -2, -4, -3, -3, -2, -1, -3, -1, 0, -3, -1, -2, 2, 8,
+ -1, -6, -2, -6, -4, -2, -6, -5, 4, -4, 1, 1, -5, -4, -4, -4, -3, 0, 7,
+ -5, -8, -5, -8, -6, 0, -6, -4, -5, -6, -4, -3, -7, -7, -4, -5, -6, -5, -5, 16,
+ -1, -3, -4, -3, -2, -3, -3, -2, -2, -2, -2, -2, -2, -3, -2, -2, -1, -1, -2, -5, -2,
+ -4, -5, -5, -6, -5, 4, -6, 3, -3, -4, -2, -3, -4, -6, -3, -4, -3, -3, -3, 3, -3, 11,
+ -2, 0, -7, 1, 6, -6, -4, 0, -6, 1, -5, -3, -1, -2, 5, 0, -1, -2, -4, -5, -1, -4, 6};
+
+/*
+short blosum85mt[]={
+ 5,
+ -2, 4,
+ -1, -4, 9,
+ -2, 4, -5, 7,
+ -1, 0, -5, 1, 6,
+ -3, -4, -3, -4, -4, 7,
+ 0, -1, -4, -2, -3, -4, 6,
+ -2, -1, -5, -2, -1, -2, -3, 8,
+ -2, -5, -2, -5, -4, -1, -5, -4, 5,
+ -1, -1, -4, -1, 0, -4, -2, -1, -3, 6,
+ -2, -5, -2, -5, -4, 0, -5, -3, 1, -3, 4,
+ -2, -4, -2, -4, -3, -1, -4, -3, 1, -2, 2, 7,
+ -2, 4, -4, 1, -1, -4, -1, 0, -4, 0, -4, -3, 7,
+ -1, -3, -4, -2, -2, -4, -3, -3, -4, -2, -4, -3, -3, 8,
+ -1, -1, -4, -1, 2, -4, -3, 1, -4, 1, -3, 0, 0, -2, 6,
+ -2, -2, -4, -2, -1, -4, -3, 0, -4, 2, -3, -2, -1, -2, 1, 6,
+ 1, 0, -2, -1, -1, -3, -1, -1, -3, -1, -3, -2, 0, -1, -1, -1, 5,
+ 0, -1, -2, -2, -1, -3, -2, -2, -1, -1, -2, -1, 0, -2, -1, -2, 1, 5,
+ -1, -4, -1, -4, -3, -1, -4, -4, 3, -3, 0, 0, -4, -3, -3, -3, -2, 0, 5,
+ -3, -5, -4, -6, -4, 0, -4, -3, -3, -5, -3, -2, -5, -5, -3, -4, -4, -4, -3, 11,
+ -1, -2, -3, -2, -1, -2, -2, -2, -2, -1, -2, -1, -2, -2, -1, -2, -1, -1, -1, -3, -2,
+ -3, -4, -3, -4, -4, 3, -5, 2, -2, -3, -2, -2, -3, -4, -2, -3, -2, -2, -2, 2, -2, 7,
+ -1, 0, -5, 1, 4, -4, -3, 0, -4, 1, -4, -2, -1, -2, 4, 0, -1, -1, -3, -4, -1, -3, 4};
+
+short blosum90mt[]={
+ 5,
+ -2, 4,
+ -1, -4, 9,
+ -3, 4, -5, 7,
+ -1, 0, -6, 1, 6,
+ -3, -4, -3, -5, -5, 7,
+ 0, -2, -4, -2, -3, -5, 6,
+ -2, -1, -5, -2, -1, -2, -3, 8,
+ -2, -5, -2, -5, -4, -1, -5, -4, 5,
+ -1, -1, -4, -1, 0, -4, -2, -1, -4, 6,
+ -2, -5, -2, -5, -4, 0, -5, -4, 1, -3, 5,
+ -2, -4, -2, -4, -3, -1, -4, -3, 1, -2, 2, 7,
+ -2, 4, -4, 1, -1, -4, -1, 0, -4, 0, -4, -3, 7,
+ -1, -3, -4, -3, -2, -4, -3, -3, -4, -2, -4, -3, -3, 8,
+ -1, -1, -4, -1, 2, -4, -3, 1, -4, 1, -3, 0, 0, -2, 7,
+ -2, -2, -5, -3, -1, -4, -3, 0, -4, 2, -3, -2, -1, -3, 1, 6,
+ 1, 0, -2, -1, -1, -3, -1, -2, -3, -1, -3, -2, 0, -2, -1, -1, 5,
+ 0, -1, -2, -2, -1, -3, -3, -2, -1, -1, -2, -1, 0, -2, -1, -2, 1, 6,
+ -1, -4, -2, -5, -3, -2, -5, -4, 3, -3, 0, 0, -4, -3, -3, -3, -2, -1, 5,
+ -4, -6, -4, -6, -5, 0, -4, -3, -4, -5, -3, -2, -5, -5, -3, -4, -4, -4, -3, 11,
+ -1, -2, -3, -2, -2, -2, -2, -2, -2, -1, -2, -1, -2, -2, -1, -2, -1, -1, -2, -3, -2,
+ -3, -4, -4, -4, -4, 3, -5, 1, -2, -3, -2, -2, -3, -4, -3, -3, -3, -2, -3, 2, -2, 8,
+ -1, 0, -5, 0, 4, -4, -3, 0, -4, 1, -4, -2, -1, -2, 4, 0, -1, -1, -3, -4, -1, -3, 4};
+*/
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are PAM20 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short pam20mt[]={
+ 6,
+ -5, 6,
+ -8,-14, 10,
+ -4, 6,-16, 8,
+ -3, 0,-16, 2, 8,
+ -9,-12,-15,-17,-16, 9,
+ -3, -4,-11, -4, -5,-10, 7,
+ -8, -2, -8, -5, -6, -7,-10, 9,
+ -6, -7, -7, -9, -6, -3,-13,-11, 9,
+ -8, -3,-16, -6, -5,-16, -8, -8, -7, 7,
+ -7,-10,-17,-15,-10, -4,-12, -7, -2, -9, 7,
+ -6,-12,-16,-13, -8, -5,-10,-13, -2, -3, 0, 11,
+ -5, 6,-13, 1, -3,-10, -4, -1, -6, -2, -8,-11, 8,
+ -2, -8, -9, -9, -7,-11, -7, -5,-10, -8, -8, -9, -7, 8,
+ -5, -4,-16, -4, 0,-15, -8, 0, -9, -4, -6, -5, -5, -4, 9,
+ -8, -9, -9,-12,-11,-10,-11, -3, -6, -1,-10, -5, -7, -5, -2, 9,
+ -1, -2, -4, -5, -5, -7, -3, -7, -8, -5, -9, -6, -1, -3, -6, -4, 7,
+ -1, -4, -9, -6, -7,-10, -7, -8, -3, -4, -8, -5, -3, -5, -7, -8, 0, 7,
+ -3, -9, -7, -9, -8, -9, -7, -7, 1,-10, -3, -2, -9, -7, -8, -9, -8, -4, 7,
+-16,-11,-18,-17,-19, -6,-17, -8,-16,-14, -7,-15, -9,-16,-15, -3, -6,-15,-18, 13,
+ -4, -6,-11, -7, -6, -9, -6, -6, -6, -6, -7, -6, -4, -6, -6, -7, -4, -5, -6,-13, -6,
+ -9, -7, -5,-13, -9, 1,-16, -4, -7,-10, -8,-13, -5,-16,-14,-11, -8, -7, -8, -6, -9, 10,
+ -4, -1,-16, 0, 6,-16, -6, -2, -7, -5, -8, -6, -4, -5, 7, -5, -6, -7, -8,-17, -6,-11, 6};
+
+/*
+ * Lower triangle (diagonal included) of a 23 x 23 score table, stored row
+ * by row: row k (counting from 1) holds k values, 276 values in all.
+ * Going by the name these are PAM60 scores, with rows presumably in
+ * amino_acid_order -- confirm against the code that reads the table.
+ */
+short pam60mt[]={
+ 5,
+ -2, 5,
+ -5, -9, 9,
+ -2, 5,-10, 7,
+ -1, 2,-10, 3, 7,
+ -6, -8, -9,-11,-10, 8,
+ 0, -2, -7, -2, -2, -7, 6,
+ -5, 0, -6, -2, -3, -4, -6, 8,
+ -3, -4, -4, -5, -4, -1, -7, -6, 7,
+ -5, -1,-10, -2, -3,-10, -5, -4, -4, 6,
+ -4, -7,-11, -9, -7, -1, -8, -4, 0, -6, 6,
+ -3, -6,-10, -7, -5, -2, -6, -7, 1, 0, 2, 10,
+ -2, 5, -7, 2, 0, -6, -1, 1, -4, 0, -5, -6, 6,
+ 0, -4, -6, -5, -3, -7, -4, -2, -6, -4, -5, -6, -4, 7,
+ -3, -1,-10, -1, 2, -9, -5, 2, -5, -1, -3, -2, -2, -1, 7,
+ -5, -5, -6, -6, -6, -7, -7, 0, -4, 2, -6, -2, -3, -2, 0, 8,
+ 1, 0, -1, -2, -2, -5, 0, -4, -4, -2, -6, -4, 1, 0, -3, -2, 5,
+ 1, -2, -5, -3, -4, -6, -3, -5, -1, -2, -5, -2, -1, -2, -4, -4, 1, 6,
+ -1, -5, -4, -6, -4, -5, -4, -5, 3, -6, -1, 0, -5, -4, -5, -5, -4, -1, 6,
+-10, -8,-12,-11,-12, -3,-11, -5,-10, -8, -4, -9, -6,-10, -9, 0, -4, -9,-11, 13,
+ -2, -3, -6, -3, -3, -5, -3, -3, -3, -3, -4, -3, -2, -3, -3, -4, -2, -2, -3, -8, -3,
+ -6, -5, -2, -8, -7, 3,-10, -2, -4, -7, -5, -7, -3,-10, -8, -8, -5, -5, -5, -3, -5, 9,
+ -2, 1,-10, 2, 5,-10, -3, 0, -4, -2, -5, -4, -1, -2, 6, -2, -3, -4, -5,-11, -3, -7, 5};
+
+short pam120mt[]={
+ 3,
+ 0, 4,
+ -3, -6, 9,
+ 0, 4, -7, 5,
+ 0, 3, -7, 3, 5,
+ -4, -5, -6, -7, -7, 8,
+ 1, 0, -4, 0, -1, -5, 5,
+ -3, 1, -4, 0, -1, -3, -4, 7,
+ -1, -3, -3, -3, -3, 0, -4, -4, 6,
+ -2, 0, -7, -1, -1, -7, -3, -2, -3, 5,
+ -3, -4, -7, -5, -4, 0, -5, -3, 1, -4, 5,
+ -2, -4, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8,
+ -1, 3, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4,
+ 1, -2, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6,
+ -1, 0, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6,
+ -3, -2, -4, -3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6,
+ 1, 0, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3,
+ 1, 0, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4,
+ 0, -3, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5,
+ -7, -6, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12,
+ -1, -1, -4, -2, -1, -3, -2, -2, -1, -2, -2, -2, -1, -2, -1, -2, -1, -1, -1, -5, -2,
+ -4, -3, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, -3, 8,
+ -1, 2, -7, 3, 4, -6, -2, 1, -3, -1, -3, -2, 0, -1, 4, -1, -1, -2, -3, -7, -1, -5, 4};
+
+/*
+short pam160mt[]={
+ 2,
+ 0, 3,
+ -2, -4, 9,
+ 0, 3, -5, 4,
+ 0, 2, -5, 3, 4,
+ -3, -4, -5, -6, -5, 7,
+ 1, 0, -3, 0, 0, -4, 4,
+ -2, 1, -3, 0, 0, -2, -3, 6,
+ -1, -2, -2, -3, -2, 0, -3, -3, 5,
+ -2, 0, -5, 0, -1, -5, -2, -1, -2, 4,
+ -2, -4, -6, -4, -3, 1, -4, -2, 2, -3, 5,
+ -1, -3, -5, -3, -2, 0, -3, -3, 2, 0, 3, 7,
+ 0, 2, -4, 2, 1, -3, 0, 2, -2, 1, -3, -2, 3,
+ 1, -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -2, -1, 5,
+ -1, 1, -5, 1, 2, -5, -2, 2, -2, 0, -2, -1, 0, 0, 5,
+ -2, -1, -3, -2, -2, -4, -3, 1, -2, 3, -3, -1, -1, -1, 1, 6,
+ 1, 0, 0, 0, 0, -3, 1, -1, -2, -1, -3, -2, 1, 1, -1, -1, 2,
+ 1, 0, -2, -1, -1, -3, -1, -2, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3,
+ 0, -2, -2, -3, -2, -2, -2, -2, 3, -3, 1, 1, -2, -2, -2, -3, -1, 0, 4,
+ -5, -5, -7, -6, -7, -1, -7, -3, -5, -4, -2, -4, -4, -5, -5, 1, -2, -5, -6, 12,
+ 0, -1, -3, -1, -1, -3, -1, -1, -1, -1, -2, -1, 0, -1, -1, -1, 0, 0, -1, -4, -1,
+ -3, -3, 0, -4, -4, 5, -5, 0, -2, -4, -2, -3, -2, -5, -4, -4, -3, -3, -3, -1, -3, 8,
+ 0, 2, -5, 2, 3, -5, -1, 1, -2, 0, -3, -2, 1, -1, 3, 0, -1, -1, -2, -6, -1, -4, 3};
+
+short pam250mt[]={
+ 2,
+ 0, 3,
+ -2, -4, 12,
+ 0, 3, -5, 4,
+ 0, 3, -5, 3, 4,
+ -3, -4, -4, -6, -5, 9,
+ 1, 0, -3, 1, 0, -5, 5,
+ -1, 1, -3, 1, 1, -2, -2, 6,
+ -1, -2, -2, -2, -2, 1, -3, -2, 5,
+ -1, 1, -5, 0, 0, -5, -2, 0, -2, 5,
+ -2, -3, -6, -4, -3, 2, -4, -2, 2, -3, 6,
+ -1, -2, -5, -3, -2, 0, -3, -2, 2, 0, 4, 6,
+ 0, 2, -4, 2, 1, -3, 0, 2, -2, 1, -3, -2, 2,
+ 1, -1, -3, -1, -1, -5, 0, 0, -2, -1, -3, -2, 0, 6,
+ 0, 1, -5, 2, 2, -5, -1, 3, -2, 1, -2, -1, 1, 0, 4,
+ -2, -1, -4, -1, -1, -4, -3, 2, -2, 3, -3, 0, 0, 0, 1, 6,
+ 1, 0, 0, 0, 0, -3, 1, -1, -1, 0, -3, -2, 1, 1, -1, 0, 2,
+ 1, 0, -2, 0, 0, -3, 0, -1, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3,
+ 0, -2, -2, -2, -2, -1, -1, -2, 4, -2, 2, 2, -2, -1, -2, -2, -1, 0, 4,
+ -6, -5, -8, -7, -7, 0, -7, -3, -5, -3, -2, -4, -4, -6, -5, 2, -2, -5, -6, 17,
+ 0, -1, -3, -1, -1, -2, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 0, 0, -1, -4, -1,
+ -3, -3, 0, -4, -4, 7, -5, 0, -1, -4, -1, -2, -2, -5, -4, -4, -3, -3, -2, 0, -2, 10,
+ 0, 2, -5, 3, 3, -5, 0, 2, -2, 0, -3, -2, 1, 0, 3, 0, 0, -1, -2, -6, -1, -4, 3};
+*/
+short pam350mt[]={
+ 2,
+ 1, 3,
+ -2, -5, 18,
+ 1, 3, -6, 4,
+ 1, 3, -6, 4, 4,
+ -4, -5, -5, -6, -6, 13,
+ 2, 1, -4, 1, 1, -6, 5,
+ -1, 1, -4, 1, 1, -2, -2, 7,
+ 0, -2, -3, -2, -2, 2, -2, -2, 5,
+ -1, 1, -6, 1, 0, -6, -1, 1, -2, 5,
+ -2, -4, -7, -4, -4, 3, -4, -2, 4, -3, 8,
+ -1, -2, -6, -3, -2, 1, -3, -2, 3, 0, 5, 6,
+ 0, 2, -4, 2, 2, -4, 1, 2, -2, 1, -3, -2, 2,
+ 1, 0, -3, 0, 0, -5, 0, 0, -2, -1, -3, -2, 0, 6,
+ 0, 2, -6, 2, 3, -5, -1, 3, -2, 1, -2, -1, 1, 1, 4,
+ -1, 0, -4, -1, 0, -5, -2, 2, -2, 4, -3, 0, 1, 0, 2, 7,
+ 1, 1, 0, 1, 0, -4, 1, -1, -1, 0, -3, -2, 1, 1, 0, 0, 1,
+ 1, 0, -2, 0, 0, -3, 1, -1, 0, 0, -2, -1, 1, 1, 0, -1, 1, 2,
+ 0, -2, -2, -2, -2, -1, -1, -2, 4, -2, 3, 2, -2, -1, -2, -3, -1, 0, 5,
+ -7, -6,-10, -8, -8, 1, -8, -3, -6, -4, -2, -5, -5, -7, -5, 4, -3, -6, -7, 27,
+ 0, 0, -3, -1, 0, -2, -1, 0, 0, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, -5, -1,
+ -4, -4, 1, -5, -5, 11, -6, 0, 0, -5, 0, -2, -3, -6, -5, -5, -3, -3, -2, 1, -2, 14,
+ 0, 2, -6, 3, 3, -6, 0, 2, -2, 1, -3, -2, 2, 0, 3, 1, 0, 0, -2, -7, 0, -5, 3};
+
+/*
+short md_40mt[]={
+ 9,
+ 0, 0,
+ -7, 0, 16,
+ -6, 0,-13, 11,
+ -5, 0,-15, 3, 11,
+-11, 0, -5,-15,-16, 13,
+ -3, 0, -7, -4, -4,-15, 10,
+ -9, 0, -6, -4, -8, -7,-10, 14,
+ -6, 0,-11,-12,-12, -5,-13,-11, 11,
+ -8, 0,-12, -8, -3,-16, -9, -6,-11, 11,
+ -9, 0,-10,-14,-13, -1,-14, -7, -1,-12, 9,
+ -6, 0, -9,-12,-11, -7,-12, -9, 1, -7, 1, 14,
+ -6, 0, -8, 1, -5,-12, -5, 0, -8, -1,-12, -9, 12,
+ -2, 0,-11,-11,-11,-11, -9, -4,-11,-10, -5,-10, -9, 12,
+ -7, 0,-12, -6, 0,-14, -9, 2,-12, -1, -6, -8, -5, -3, 12,
+ -7, 0, -5,-10, -8,-15, -4, 0,-10, 3, -9, -8, -6, -6, 0, 11,
+ 0, 0, -2, -6, -8, -6, -2, -6, -8, -7, -7, -8, 1, -1, -7, -5, 9,
+ 1, 0, -7, -8, -8,-11, -7, -7, -2, -5, -9, -2, -2, -4, -7, -6, 1, 10,
+ -1, 0, -7, -9, -8, -6, -8,-12, 4,-12, -2, 0,-10, -9,-11,-11, -7, -4, 10,
+-14, 0, -4,-15,-15, -7, -7,-13,-13,-13, -8,-11,-14,-14,-11, -4, -9,-12,-10, 18,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+-13, 0, -2, -8,-14, 2,-13, 2, -9,-13, -9,-11, -6,-13, -9,-10, -7,-10,-11, -6, 0, 14,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short md_120mt[]={
+ 6,
+ 0, 0,
+ -3, 0, 14,
+ -2, 0, -7, 8,
+ -2, 0, -8, 5, 8,
+ -6, 0, -2, -9,-10, 11,
+ 0, 0, -3, 0, -1, -9, 8,
+ -4, 0, -2, -1, -3, -2, -4, 11,
+ -1, 0, -5, -7, -7, -1, -6, -6, 7,
+ -4, 0, -6, -2, 0, -9, -4, -1, -6, 8,
+ -4, 0, -5, -8, -8, 2, -8, -4, 2, -6, 7,
+ -2, 0, -5, -7, -6, -2, -6, -5, 3, -4, 3, 10,
+ -1, 0, -3, 3, -1, -6, -1, 2, -4, 1, -6, -5, 8,
+ 0, 0, -5, -5, -5, -5, -4, -1, -5, -4, -2, -5, -3, 9,
+ -3, 0, -6, -1, 2, -7, -4, 4, -6, 2, -3, -4, -1, 0, 9,
+ -3, 0, -2, -4, -3, -8, -1, 2, -6, 4, -5, -4, -2, -2, 2, 8,
+ 2, 0, 0, -2, -3, -3, 0, -2, -3, -3, -3, -3, 2, 1, -3, -2, 5,
+ 2, 0, -3, -3, -4, -6, -2, -3, 0, -2, -4, 0, 1, 0, -3, -3, 2, 6,
+ 1, 0, -3, -5, -5, -2, -4, -6, 5, -6, 1, 2, -5, -4, -6, -6, -3, 0, 7,
+ -8, 0, 0, -9, -9, -3, -3, -6, -7, -6, -4, -6, -8, -8, -6, -1, -5, -7, -6, 17,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -7, 0, 2, -4, -7, 5, -8, 4, -5, -7, -4, -6, -2, -7, -4, -5, -3, -6, -6, -2, 0, 12,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short md_250mt[]={
+ 2,
+ 0, 0,
+ -1, 0, 11,
+ -1, 0, -3, 5,
+ -1, 0, -4, 4, 5,
+ -3, 0, 0, -5, -5, 8,
+ 1, 0, -1, 1, 1, -5, 5,
+ -2, 0, 0, 0, 0, 0, -2, 6,
+ 0, 0, -2, -3, -3, 0, -3, -3, 4,
+ -1, 0, -3, 0, 1, -5, -1, 1, -3, 5,
+ -1, 0, -2, -4, -4, 2, -4, -2, 2, -3, 5,
+ 0, 0, -2, -3, -3, 0, -3, -2, 3, -2, 3, 6,
+ 0, 0, -1, 2, 1, -3, 0, 1, -2, 1, -3, -2, 3,
+ 1, 0, -2, -2, -2, -2, -1, 0, -2, -1, 0, -2, -1, 6,
+ -1, 0, -3, 0, 2, -4, -1, 3, -3, 2, -2, -2, 0, 0, 5,
+ -1, 0, -1, -1, 0, -4, 0, 2, -3, 4, -3, -2, 0, -1, 2, 5,
+ 1, 0, 1, 0, -1, -2, 1, -1, -1, -1, -2, -1, 1, 1, -1, -1, 2,
+ 2, 0, -1, -1, -1, -2, 0, -1, 1, -1, -1, 0, 1, 1, -1, -1, 1, 2,
+ 1, 0, -2, -3, -2, 0, -2, -3, 4, -3, 2, 2, -2, -1, -3, -3, -1, 0, 4,
+ -4, 0, 1, -5, -5, -1, -1, -3, -4, -3, -2, -3, -4, -4, -3, 0, -3, -4, -3, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -3, 0, 2, -2, -4, 5, -4, 4, -2, -3, -1, -3, -1, -3, -2, -2, -1, -3, -3, 0, 0, 9,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short md_350mt[]={
+ 1,
+ 0, 0,
+ 0, 0, 9,
+ 0, 0, -2, 3,
+ 0, 0, -2, 3, 3,
+ -2, 0, 1, -3, -4, 6,
+ 1, 0, 0, 1, 1, -3, 4,
+ -1, 0, 0, 0, 0, 0, -1, 3,
+ 0, 0, -1, -2, -2, 1, -2, -2, 3,
+ -1, 0, -1, 0, 1, -3, 0, 1, -2, 3,
+ -1, 0, -1, -3, -3, 2, -2, -1, 2, -2, 3,
+ 0, 0, -1, -2, -2, 1, -2, -1, 2, -2, 2, 3,
+ 0, 0, -1, 1, 1, -2, 0, 1, -1, 1, -2, -1, 2,
+ 1, 0, -1, -1, -1, -2, -1, 0, -1, -1, 0, -1, 0, 4,
+ -1, 0, -2, 1, 1, -2, 0, 2, -2, 2, -1, -1, 0, 0, 3,
+ -1, 0, 0, 0, 0, -3, 0, 1, -2, 3, -2, -1, 0, 0, 2, 3,
+ 1, 0, 0, 0, 0, -1, 1, 0, -1, 0, -1, -1, 1, 1, 0, 0, 1,
+ 1, 0, 0, 0, -1, -1, 0, -1, 0, 0, -1, 0, 0, 1, -1, 0, 1, 1,
+ 0, 0, -1, -2, -2, 0, -1, -2, 2, -2, 1, 2, -1, -1, -2, -2, 0, 0, 2,
+ -3, 0, 1, -4, -3, 0, -1, -2, -3, -2, -1, -2, -3, -3, -2, 0, -2, -3, -2, 14,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -2, 0, 2, -2, -2, 5, -3, 3, -1, -2, 0, -1, -1, -2, -1, -1, -1, -2, -2, 0, 0, 7,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+*/
+
+short idmat[]={
+10,
+ 0, 10,
+ 0, 0, 10,
+ 0, 0, 0, 10,
+ 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10};
+
+short gon40mt[]={
+ 92,
+ 0, 0,
+ -31, 0, 163,
+ -56, 0,-135, 111,
+ -37, 0,-140, 16, 105,
+ -92, 0, -64,-152,-143, 126,
+ -32, 0, -91, -51, -76,-152, 105,
+ -65, 0, -67, -41, -40, -50, -81, 145,
+ -76, 0, -87,-150,-106, -39,-158, -94, 104,
+ -54, 0,-132, -47, -13,-127, -79, -34, -86, 103,
+ -68, 0, -85,-155,-108, -13,-141, -85, 5, -85, 89,
+ -45, 0, -63,-130, -80, -16,-114, -60, 10, -57, 16, 140,
+ -62, 0, -83, 6, -38,-104, -40, -7, -99, -20,-112, -91, 115,
+ -37, 0,-137, -69, -60,-128, -87, -71,-108, -62, -83,-119, -78, 124,
+ -43, 0,-113, -32, 10,-100, -71, 0, -91, 2, -60, -35, -25, -46, 118,
+ -61, 0, -86, -77, -50,-130, -69, -31,-103, 19, -84, -81, -47, -73, -6, 112,
+ 0, 0, -35, -36, -41,-111, -37, -48, -95, -43, -95, -64, -11, -35, -35, -51, 99,
+ -25, 0, -59, -47, -52, -90, -85, -46, -51, -34, -78, -44, -27, -42, -39, -52, 13, 100,
+ -22, 0, -43,-133, -74, -58,-122, -98, 28, -82, -18, -22,-103, -86, -79, -88, -74, -25, 97,
+-120, 0, -68,-171,-131, -6,-108, -70, -93,-127, -71, -72,-119,-149, -87, -63, -98,-120,-115, 181,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -95, 0, -56, -98,-107, 31,-129, 5, -76, -88, -64, -66, -62,-106, -81, -75, -69, -87, -73, 1, 0, 135,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon80mt[]={
+ 75,
+ 0, 0,
+ -10, 0, 154,
+ -31, 0, -93, 96,
+ -17, 0, -94, 31, 88,
+ -64, 0, -39,-111,-102, 114,
+ -11, 0, -61, -26, -47,-115, 97,
+ -39, 0, -43, -17, -17, -26, -53, 127,
+ -43, 0, -54,-106, -73, -15,-114, -64, 86,
+ -30, 0, -88, -21, 4, -89, -50, -12, -59, 85,
+ -43, 0, -55,-109, -75, 7,-104, -57, 22, -58, 77,
+ -26, 0, -39, -88, -53, 3, -83, -38, 25, -37, 31, 117,
+ -34, 0, -55, 21, -13, -75, -18, 9, -71, -2, -79, -62, 97,
+ -16, 0, -93, -42, -35, -93, -58, -45, -75, -37, -58, -78, -48, 114,
+ -22, 0, -76, -9, 23, -70, -44, 14, -60, 17, -39, -19, -6, -24, 95,
+ -36, 0, -60, -44, -23, -90, -43, -10, -71, 33, -58, -53, -22, -45, 11, 97,
+ 14, 0, -15, -14, -19, -77, -16, -25, -62, -20, -64, -41, 5, -14, -15, -27, 78,
+ -5, 0, -34, -24, -27, -62, -52, -24, -28, -15, -49, -25, -7, -20, -18, -27, 25, 81,
+ -6, 0, -21, -89, -51, -31, -86, -65, 41, -54, 3, 1, -69, -57, -51, -60, -43, -9, 80,
+ -87, 0, -43,-124, -98, 16, -81, -43, -63, -89, -44, -45, -86,-112, -62, -41, -72, -87, -80, 173,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -65, 0, -32, -69, -74, 49, -94, 21, -47, -60, -35, -37, -39, -76, -53, -50, -46, -58, -47, 23, 0, 123,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon120mt[]={
+ 59,
+ 0, 0,
+ -1, 0, 144,
+ -18, 0, -69, 82,
+ -9, 0, -68, 35, 72,
+ -48, 0, -26, -87, -78, 102,
+ -3, 0, -45, -14, -31, -92, 90,
+ -26, 0, -31, -7, -6, -14, -37, 110,
+ -27, 0, -36, -80, -55, -3, -87, -48, 72,
+ -19, 0, -64, -8, 11, -67, -34, -2, -44, 69,
+ -30, 0, -39, -82, -57, 15, -82, -42, 28, -44, 66,
+ -17, 0, -26, -64, -40, 11, -65, -28, 29, -27, 34, 95,
+ -20, 0, -41, 26, -1, -58, -7, 14, -55, 5, -61, -46, 80,
+ -6, 0, -68, -28, -22, -72, -41, -31, -56, -24, -44, -56, -32, 105,
+ -12, 0, -56, 1, 25, -53, -30, 17, -43, 20, -30, -14, 1, -14, 74,
+ -23, 0, -45, -27, -10, -68, -30, -1, -53, 36, -44, -38, -10, -30, 16, 83,
+ 16, 0, -7, -5, -9, -58, -6, -14, -44, -10, -47, -29, 10, -5, -7, -15, 60,
+ 2, 0, -21, -13, -15, -47, -35, -14, -17, -6, -34, -16, 0, -10, -9, -16, 26, 64,
+ 0, 0, -11, -65, -38, -17, -65, -47, 42, -39, 13, 10, -50, -42, -36, -44, -28, -3, 65,
+ -68, 0, -29, -96, -78, 27, -66, -28, -46, -68, -29, -31, -68, -89, -49, -30, -57, -67, -59, 166,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -48, 0, -20, -53, -56, 55, -74, 26, -31, -44, -20, -22, -28, -59, -38, -37, -35, -42, -33, 33, 0, 111,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon160mt[]={
+ 46,
+ 0, 0,
+ 3, 0, 135,
+ -11, 0, -53, 70,
+ -4, 0, -52, 34, 59,
+ -38, 0, -18, -70, -62, 91,
+ 2, 0, -34, -7, -21, -76, 82,
+ -18, 0, -23, -1, -1, -7, -27, 93,
+ -18, 0, -25, -62, -43, 3, -70, -37, 59,
+ -12, 0, -48, -1, 13, -53, -24, 2, -35, 55,
+ -22, 0, -29, -65, -45, 19, -67, -32, 30, -34, 57,
+ -12, 0, -19, -50, -31, 14, -52, -21, 29, -21, 34, 76,
+ -12, 0, -31, 26, 5, -47, -2, 15, -44, 8, -48, -36, 65,
+ -1, 0, -52, -19, -14, -58, -30, -22, -43, -16, -35, -42, -22, 96,
+ -7, 0, -42, 6, 23, -41, -21, 17, -32, 20, -24, -12, 5, -8, 56,
+ -16, 0, -35, -16, -3, -53, -21, 3, -41, 35, -35, -29, -4, -21, 17, 71,
+ 16, 0, -2, 0, -3, -45, -1, -8, -33, -4, -36, -23, 11, 0, -2, -9, 44,
+ 5, 0, -14, -6, -8, -36, -24, -8, -12, -2, -24, -11, 3, -4, -4, -9, 23, 50,
+ 1, 0, -6, -49, -30, -8, -52, -35, 40, -30, 17, 14, -38, -32, -27, -34, -20, 0, 53,
+ -55, 0, -21, -78, -64, 32, -55, -19, -34, -54, -20, -22, -55, -74, -40, -24, -47, -54, -45, 158,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -37, 0, -13, -42, -44, 56, -60, 27, -20, -35, -11, -13, -22, -48, -29, -29, -28, -32, -24, 38, 0, 100,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon250mt[]={
+ 24,
+ 0, 0,
+ 5, 0, 115,
+ -3, 0, -32, 47,
+ 0, 0, -30, 27, 36,
+ -23, 0, -8, -45, -39, 70,
+ 5, 0, -20, 1, -8, -52, 66,
+ -8, 0, -13, 4, 4, -1, -14, 60,
+ -8, 0, -11, -38, -27, 10, -45, -22, 40,
+ -4, 0, -28, 5, 12, -33, -11, 6, -21, 32,
+ -12, 0, -15, -40, -28, 20, -44, -19, 28, -21, 40,
+ -7, 0, -9, -30, -20, 16, -35, -13, 25, -14, 28, 43,
+ -3, 0, -18, 22, 9, -31, 4, 12, -28, 8, -30, -22, 38,
+ 3, 0, -31, -7, -5, -38, -16, -11, -26, -6, -23, -24, -9, 76,
+ -2, 0, -24, 9, 17, -26, -10, 12, -19, 15, -16, -10, 7, -2, 27,
+ -6, 0, -22, -3, 4, -32, -10, 6, -24, 27, -22, -17, 3, -9, 15, 47,
+ 11, 0, 1, 5, 2, -28, 4, -2, -18, 1, -21, -14, 9, 4, 2, -2, 22,
+ 6, 0, -5, 0, -1, -22, -11, -3, -6, 1, -13, -6, 5, 1, 0, -2, 15, 25,
+ 1, 0, 0, -29, -19, 1, -33, -20, 31, -17, 18, 16, -22, -18, -15, -20, -10, 0, 34,
+ -36, 0, -10, -52, -43, 36, -40, -8, -18, -35, -7, -10, -36, -50, -27, -16, -33, -35, -26, 142,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -22, 0, -5, -28, -27, 51, -40, 22, -7, -21, 0, -2, -14, -31, -17, -18, -19, -19, -11, 41, 0, 78,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon300mt[]={
+ 16,
+ 0, 0,
+ 5, 0, 104,
+ -1, 0, -24, 37,
+ 1, 0, -23, 23, 27,
+ -18, 0, -5, -37, -31, 60,
+ 5, 0, -15, 3, -4, -42, 58,
+ -6, 0, -10, 5, 4, 0, -10, 45,
+ -6, 0, -7, -30, -21, 11, -36, -16, 33,
+ -2, 0, -21, 6, 11, -26, -7, 5, -17, 24,
+ -9, 0, -10, -32, -22, 19, -36, -14, 25, -17, 33,
+ -5, 0, -6, -24, -16, 15, -28, -10, 22, -11, 24, 31,
+ -1, 0, -14, 18, 9, -25, 5, 10, -22, 8, -24, -17, 27,
+ 3, 0, -23, -4, -2, -30, -11, -8, -20, -3, -18, -19, -6, 66,
+ -1, 0, -18, 9, 14, -20, -6, 9, -15, 13, -13, -8, 7, -1, 18,
+ -4, 0, -17, 0, 5, -25, -6, 6, -19, 22, -18, -13, 4, -6, 13, 37,
+ 8, 0, 1, 5, 3, -22, 4, -1, -14, 2, -17, -11, 7, 4, 2, 0, 15,
+ 5, 0, -3, 1, 1, -17, -7, -1, -4, 2, -9, -5, 4, 2, 1, -1, 11, 17,
+ 0, 0, 1, -23, -15, 4, -26, -15, 26, -13, 17, 15, -17, -14, -12, -15, -8, 0, 26,
+ -29, 0, -7, -42, -36, 36, -34, -5, -13, -28, -4, -6, -30, -41, -23, -14, -27, -28, -19, 132,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -17, 0, -3, -22, -22, 46, -33, 18, -3, -17, 3, 1, -12, -25, -14, -14, -15, -15, -7, 40, 0, 67,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+short gon350mt[]={
+ 10,
+ 0, 0,
+ 4, 0, 93,
+ 0, 0, -19, 29,
+ 1, 0, -17, 19, 20,
+ -14, 0, -3, -30, -25, 51,
+ 5, 0, -12, 4, -2, -35, 51,
+ -4, 0, -8, 5, 4, 1, -7, 33,
+ -4, 0, -5, -24, -17, 11, -29, -13, 27,
+ -1, 0, -16, 6, 9, -21, -4, 5, -13, 18,
+ -7, 0, -7, -25, -18, 18, -30, -11, 22, -14, 28,
+ -4, 0, -4, -19, -13, 14, -23, -8, 19, -9, 21, 23,
+ 0, 0, -11, 15, 9, -20, 5, 8, -18, 7, -19, -14, 20,
+ 3, 0, -18, -2, 0, -25, -7, -5, -16, -2, -15, -14, -3, 56,
+ 0, 0, -14, 8, 11, -16, -4, 7, -11, 10, -11, -7, 6, 0, 12,
+ -2, 0, -13, 2, 6, -20, -4, 6, -15, 18, -14, -11, 4, -4, 10, 28,
+ 6, 0, 1, 5, 3, -18, 5, 0, -11, 2, -13, -9, 6, 4, 2, 1, 10,
+ 4, 0, -2, 2, 1, -13, -5, -1, -3, 2, -7, -4, 4, 2, 1, 0, 8, 11,
+ 0, 0, 2, -18, -12, 5, -21, -11, 22, -10, 16, 14, -13, -11, -9, -12, -6, 0, 21,
+ -24, 0, -4, -35, -29, 35, -30, -3, -9, -23, -1, -3, -24, -34, -19, -12, -22, -23, -14, 124,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -14, 0, -1, -18, -17, 42, -27, 15, -1, -14, 5, 2, -10, -20, -11, -12, -12, -12, -4, 39, 0, 57,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+
+/*
+ * The 17 one-letter nucleotide codes, in alphabetical order.
+ * NOTE(review): presumably this is the row/column order of the triangular
+ * DNA tables that follow (they have 17 rows) -- confirm against the code
+ * that builds the cross-reference arrays.
+ */
+char *nucleic_acid_order = "ABCDGHKMNRSTUVWXY";
+
+/*
+ * DNA score table stored as a lower triangle: 17 rows, row r holding r
+ * entries (the last entry of each row is the diagonal).
+ * Every entry is 0 except a 10 on the diagonal of rows 1, 3, 5, 12 and 13.
+ * NOTE(review): if rows follow nucleic_acid_order those are A, C, G, T and
+ * U, i.e. only exact matches of unambiguous bases score -- confirm.
+ * pairalign() selects this table when pw_dnamtrxname is "clustalw".
+ */
+short clustalvdnamt[]={
+ 10,
+ 0, 0,
+ 0, 0, 10,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+/*
+ * DNA score table stored as a lower triangle: 17 rows, row r holding r
+ * entries (the last entry of each row is the diagonal).
+ * Every entry is either 10 or -9; the whole diagonal is 10.
+ * NOTE(review): presumably 10 marks pairs of codes that can denote the same
+ * base and -9 pairs that cannot, with rows in nucleic_acid_order -- confirm.
+ * pairalign() selects this table when pw_dnamtrxname is "iub".
+ */
+short swgapdnamt[]={
+ 10,
+ -9, 10,
+ -9, 10, 10,
+ 10, 10, -9, 10,
+ -9, 10, -9, 10, 10,
+ 10, 10, 10, 10, -9, 10,
+ -9, 10, -9, 10, 10, 10, 10,
+ 10, 10, 10, 10, -9, 10, -9, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, -9, 10, 10, 10, 10, 10, 10, 10,
+ -9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ -9, 10, -9, 10, -9, 10, 10, -9, 10, -9, -9, 10,
+ -9, 10, -9, 10, -9, 10, 10, -9, 10, -9, -9, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -9, -9, 10,
+ 10, 10, -9, 10, -9, 10, 10, 10, 10, 10, -9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ -9, 10, 10, 10, -9, 10, 10, 10, 10, -9, 10, 10, 10, 10, 10, 10, 10};
+
Added: trunk/packages/clustalw/branches/upstream/current/matrixseries.gon
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/matrixseries.gon 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/matrixseries.gon 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,7 @@
+CLUSTAL_SERIES
+
+
+MATRIX 61 100 /us1/user/julie/matrices/gon80.bla
+MATRIX 41 60 /us1/user/julie/matrices/gon120.bla
+MATRIX 21 40 /us1/user/julie/matrices/gon250.bla
+MATRIX 0 40 /us1/user/julie/matrices/gon350.bla
Added: trunk/packages/clustalw/branches/upstream/current/pairalign.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/pairalign.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/pairalign.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,615 @@
+/* Change int h to int gh everywhere DES June 1994 */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "clustalw.h"
+
+/*
+ * Smaller / larger of two values. These are function-like macros, so an
+ * argument may be evaluated twice: do not pass expressions with side effects.
+ */
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#define MAX(a,b) ((a)>(b)?(a):(b))
+
+/*
+ * Cost of a gap of k residues: 0 when k <= 0, otherwise an opening term plus
+ * gh per residue. gap() opens with g, tbgap() with tb and tegap() with te.
+ * g and gh are file-scope variables; tb and te are not, so tbgap()/tegap()
+ * only expand correctly where names tb and te are in scope (they are
+ * parameters of diff()).
+ */
+#define gap(k) ((k) <= 0 ? 0 : g + gh * (k))
+#define tbgap(k) ((k) <= 0 ? 0 : tb + gh * (k))
+#define tegap(k) ((k) <= 0 ? 0 : te + gh * (k))
+
+/*
+ * Prototypes
+ */
+static void add(sint v);
+static sint calc_score(sint iat, sint jat, sint v1, sint v2);
+static float tracepath(sint tsb1,sint tsb2);
+static void forward_pass(char *ia, char *ib, sint n, sint m);
+static void reverse_pass(char *ia, char *ib);
+static sint diff(sint A, sint B, sint M, sint N, sint tb, sint te);
+static void del(sint k);
+
+/*
+ * Global variables
+ */
+#ifdef MAC
+#define pwint short
+#else
+#define pwint int
+#endif
+static sint int_scale;
+
+extern double **tmat;
+extern float pw_go_penalty;
+extern float pw_ge_penalty;
+extern float transition_weight;
+extern sint nseqs;
+extern sint max_aa;
+extern sint gap_pos1,gap_pos2;
+extern sint max_aln_length;
+extern sint *seqlen_array;
+extern sint debug;
+extern sint mat_avscore;
+extern short blosum30mt[],pam350mt[],idmat[],pw_usermat[],pw_userdnamat[];
+extern short clustalvdnamt[],swgapdnamt[];
+extern short gon250mt[];
+extern short def_dna_xref[],def_aa_xref[],pw_dna_xref[],pw_aa_xref[];
+extern Boolean dnaflag;
+extern char **seq_array;
+extern char *amino_acid_codes;
+extern char pw_mtrxname[];
+extern char pw_dnamtrxname[];
+
+/* Value returned by tracepath() (100 * number of identical aligned pairs);
+   pairalign() then divides it by the shorter ungapped sequence length. */
+static float mm_score;
+/* print_ptr: index of the next free slot in displ.
+   last_print: copy of the entry most recently written by add()/del()/diff(). */
+static sint print_ptr,last_print;
+/* Alignment script, filled from index 1. As read by tracepath():
+   0 = one aligned pair, k > 0 = k residues of the second sequence opposite a
+   gap, k < 0 = -k residues of the first sequence opposite a gap. */
+static sint *displ;
+/* Work rows shared by forward_pass(), reverse_pass() and diff();
+   allocated and freed by pairalign(). */
+static pwint *HH, *DD, *RR, *SS;
+/* Gap penalties computed per sequence pair in pairalign():
+   g is the opening term, gh the per-residue term. */
+static sint g, gh;
+/* Indices into seq_array of the two sequences currently being aligned. */
+static sint seq1, seq2;
+/* Residue-by-residue score table filled by get_matrix(). */
+static sint matrix[NUMRES][NUMRES];
+/* Best score found by forward_pass(); pairalign() later overwrites it with
+   the value returned by diff(). */
+static pwint maxscore;
+/* Start (sb1, sb2) and end (se1, se2) positions of the best-scoring segment
+   in the first and second sequence, set by reverse_pass()/forward_pass(). */
+static sint sb1, sb2, se1, se2;
+
+
+/*
+ * Release the work arrays allocated by pairalign(). Each pointer is
+ * overwritten with the value ckfree() returns, exactly as the cleanup at the
+ * end of pairalign() has always done.
+ */
+static void free_pw_arrays(void)
+{
+    displ = ckfree((void *)displ);
+    HH = ckfree((void *)HH);
+    DD = ckfree((void *)DD);
+    RR = ckfree((void *)RR);
+    SS = ckfree((void *)SS);
+}
+
+/*
+ * pairalign: align every pair of sequences in the requested range and store
+ * a distance for each pair in tmat.
+ *
+ * The first index si runs over [MAX(0,istart), MIN(nseqs,iend)) and the
+ * second index sj over [MAX(si+1,jstart+1), MIN(nseqs,jend)); the sequences
+ * aligned are seq_array[si+1] and seq_array[sj+1].
+ *
+ * For each pair the best-scoring segment is located with forward_pass() and
+ * reverse_pass(), aligned with diff(), and tracepath() counts the identical
+ * aligned residues. The distance written to tmat[si+1][sj+1] and
+ * tmat[sj+1][si+1] is (100 - percent identity) / 100, where percent identity
+ * is relative to the shorter ungapped length. A pair in which either
+ * sequence has length 0 gets distance 1.0 without being aligned.
+ *
+ * Returns 1 on success, -1 if get_matrix() reports 0 residues. The work
+ * arrays are released on every return path.
+ */
+sint pairalign(sint istart, sint iend, sint jstart, sint jend)
+{
+    short *mat_xref;
+    static sint si, sj, i;
+    static sint n,m,len1,len2;
+    static sint maxres;
+    static short *matptr;
+    static char c;
+    static float gscale,ghscale;
+    sint minlen;
+
+    /*
+     * forward_pass() and diff() index HH/DD/RR/SS from 0 up to and including
+     * the sequence length, so each array gets one element more than
+     * max_aln_length.
+     */
+    displ = (sint *)ckalloc((2*max_aln_length+1) * sizeof(sint));
+    HH = (pwint *)ckalloc((max_aln_length+1) * sizeof(pwint));
+    DD = (pwint *)ckalloc((max_aln_length+1) * sizeof(pwint));
+    RR = (pwint *)ckalloc((max_aln_length+1) * sizeof(pwint));
+    SS = (pwint *)ckalloc((max_aln_length+1) * sizeof(pwint));
+
+#ifdef MAC
+    int_scale = 10;
+#else
+    int_scale = 100;
+#endif
+    gscale=ghscale=1.0;
+    if (dnaflag)
+    {
+        if (debug>1) fprintf(stdout,"matrix %s\n",pw_dnamtrxname);
+        if (strcmp(pw_dnamtrxname, "iub") == 0)
+        {
+            matptr = swgapdnamt;
+            mat_xref = def_dna_xref;
+        }
+        else if (strcmp(pw_dnamtrxname, "clustalw") == 0)
+        {
+            matptr = clustalvdnamt;
+            mat_xref = def_dna_xref;
+            gscale=0.6667;
+            ghscale=0.751;
+        }
+        else
+        {
+            matptr = pw_userdnamat;
+            mat_xref = pw_dna_xref;
+        }
+        maxres = get_matrix(matptr, mat_xref, matrix, TRUE, int_scale);
+        if (maxres == 0)
+        {
+            /* do not leak the work arrays on the failure path */
+            free_pw_arrays();
+            return((sint)-1);
+        }
+
+        /*
+         * Overwrite three symmetric pairs of entries with transition_weight
+         * times the [0][0] score.
+         * NOTE(review): presumably indices 0/4, 2/11 and 2/12 are A/G, C/T
+         * and C/U (the transitions) -- confirm against the residue coding.
+         */
+        matrix[0][4]=transition_weight*matrix[0][0];
+        matrix[4][0]=transition_weight*matrix[0][0];
+        matrix[2][11]=transition_weight*matrix[0][0];
+        matrix[11][2]=transition_weight*matrix[0][0];
+        matrix[2][12]=transition_weight*matrix[0][0];
+        matrix[12][2]=transition_weight*matrix[0][0];
+    }
+    else
+    {
+        if (debug>1) fprintf(stdout,"matrix %s\n",pw_mtrxname);
+        if (strcmp(pw_mtrxname, "blosum") == 0)
+        {
+            matptr = blosum30mt;
+            mat_xref = def_aa_xref;
+        }
+        else if (strcmp(pw_mtrxname, "pam") == 0)
+        {
+            matptr = pam350mt;
+            mat_xref = def_aa_xref;
+        }
+        else if (strcmp(pw_mtrxname, "gonnet") == 0)
+        {
+            matptr = gon250mt;
+            /* the Gonnet table is loaded with a ten times smaller scale */
+            int_scale /= 10;
+            mat_xref = def_aa_xref;
+        }
+        else if (strcmp(pw_mtrxname, "id") == 0)
+        {
+            matptr = idmat;
+            mat_xref = def_aa_xref;
+        }
+        else
+        {
+            matptr = pw_usermat;
+            mat_xref = pw_aa_xref;
+        }
+
+        maxres = get_matrix(matptr, mat_xref, matrix, TRUE, int_scale);
+        if (maxres == 0)
+        {
+            /* do not leak the work arrays on the failure path */
+            free_pw_arrays();
+            return((sint)-1);
+        }
+    }
+
+
+    for (si=MAX(0,istart);si<nseqs && si<iend;si++)
+    {
+        n = seqlen_array[si+1];
+        /* len1: number of positions of the first sequence that are not gaps */
+        len1 = 0;
+        for (i=1;i<=n;i++) {
+            c = seq_array[si+1][i];
+            if ((c!=gap_pos1) && (c != gap_pos2)) len1++;
+        }
+
+        for (sj=MAX(si+1,jstart+1);sj<nseqs && sj<jend;sj++)
+        {
+            m = seqlen_array[sj+1];
+            if(n==0 || m==0) {
+                tmat[si+1][sj+1]=1.0;
+                tmat[sj+1][si+1]=1.0;
+                continue;
+            }
+            /* len2: number of positions of the second sequence that are not gaps */
+            len2 = 0;
+            for (i=1;i<=m;i++) {
+                c = seq_array[sj+1][i];
+                if ((c!=gap_pos1) && (c != gap_pos2)) len2++;
+            }
+
+            if (dnaflag) {
+                g = 2 * (float)pw_go_penalty * int_scale*gscale;
+                gh = pw_ge_penalty * int_scale*ghscale;
+            }
+            else {
+                if (mat_avscore <= 0)
+                    g = 2 * (float)(pw_go_penalty + log((double)(MIN(n,m))))*int_scale;
+                else
+                    g = 2 * mat_avscore * (float)(pw_go_penalty +
+                        log((double)(MIN(n,m))))*gscale;
+                gh = pw_ge_penalty * int_scale;
+            }
+
+            if (debug>1) fprintf(stdout,"go %d ge %d\n",(pint)g,(pint)gh);
+
+            /*
+               align the sequences
+            */
+            seq1 = si+1;
+            seq2 = sj+1;
+
+            /* find the end (se1,se2) and then the start (sb1,sb2) of the
+               best-scoring segment */
+            forward_pass(&seq_array[seq1][0], &seq_array[seq2][0],
+                n, m);
+
+            reverse_pass(&seq_array[seq1][0], &seq_array[seq2][0]);
+
+            last_print = 0;
+            print_ptr = 1;
+/*
+            sb1 = sb2 = 1;
+            se1 = n-1;
+            se2 = m-1;
+*/
+
+/* use Myers and Miller to align two sequences */
+
+            maxscore = diff(sb1-1, sb2-1, se1-sb1+1, se2-sb2+1,
+                (sint)0, (sint)0);
+
+/* calculate percentage residue identity */
+
+            mm_score = tracepath(sb1,sb2);
+
+            minlen = MIN(len1,len2);
+            if(len1==0 || len2==0) mm_score=0;
+            else
+                mm_score /= (float)minlen;
+
+            tmat[si+1][sj+1] = ((float)100.0 - mm_score)/(float)100.0;
+            tmat[sj+1][si+1] = ((float)100.0 - mm_score)/(float)100.0;
+
+            if (debug>1)
+            {
+                /* minlen is 0 for an all-gap sequence: report 0 instead of
+                   dividing by zero */
+                fprintf(stdout,"Sequences (%d:%d) Aligned. Score: %d CompScore: %d\n",
+                    (pint)si+1,(pint)sj+1,
+                    (pint)mm_score,
+                    (minlen > 0) ? (pint)maxscore/(minlen*100) : 0);
+            }
+            else
+            {
+                info("Sequences (%d:%d) Aligned. Score: %d",
+                    (pint)si+1,(pint)sj+1,
+                    (pint)mm_score);
+            }
+
+        }
+    }
+    free_pw_arrays();
+
+
+    return((sint)1);
+}
+
+/*
+ * add: record v in the alignment script displ.
+ *
+ * If the last recorded entry is negative, v is stored in its slot and the
+ * negative entry is moved one slot further, so v ends up in front of it;
+ * last_print keeps the negative value. Otherwise v is appended and becomes
+ * last_print.
+ */
+static void add(sint v)
+{
+    if (last_print >= 0) {
+        /* plain append */
+        displ[print_ptr] = v;
+        last_print = v;
+        print_ptr++;
+        return;
+    }
+
+    /* slide the trailing negative entry up by one and put v before it */
+    displ[print_ptr-1] = v;
+    displ[print_ptr] = last_print;
+    print_ptr++;
+}
+
+/*
+ * calc_score: look up the score of position v1+iat of sequence seq1 against
+ * position v2+jat of sequence seq2 in the file-scope score table.
+ */
+static sint calc_score(sint iat,sint jat,sint v1,sint v2)
+{
+    sint pos_a = v1 + iat;
+    sint pos_b = v2 + jat;
+    int res_a = (int)seq_array[seq1][pos_a];
+    int res_b = (int)seq_array[seq2][pos_b];
+
+    return(matrix[res_a][res_b]);
+}
+
+
+/*
+ * tracepath: walk the alignment script displ[1 .. print_ptr-1], starting at
+ * position tsb1 of sequence seq1 and tsb2 of sequence seq2, and count the
+ * aligned pairs whose two residues are equal and are not gap codes.
+ *
+ * Script entries: 0 consumes one residue of each sequence, k > 0 skips k
+ * residues of seq2, k < 0 skips -k residues of seq1.
+ *
+ * When debug > 0 the two aligned rows are also printed to stdout. The row
+ * buffers are sized from the script itself; the earlier fixed 600-character
+ * arrays were overrun by any alignment longer than 600 columns.
+ *
+ * Returns 100.0 * (number of identical pairs).
+ */
+static float tracepath(sint tsb1,sint tsb2)
+{
+    char c1,c2;
+    sint i1,i2,r;
+    sint i,k,pos,to_do;
+    sint count;
+    sint aln_len;
+    float score;
+    char *s1 = NULL;
+    char *s2 = NULL;
+
+    to_do=print_ptr-1;
+    i1 = tsb1;
+    i2 = tsb2;
+
+    if (debug>0)
+    {
+        /* number of columns the script will produce: one per 0 entry,
+           |k| per gap entry */
+        aln_len = 0;
+        for(i=1;i<=to_do;++i) {
+            k = displ[i];
+            if (k==0) aln_len += 1;
+            else if (k>0) aln_len += k;
+            else aln_len -= k;
+        }
+        /* +1 keeps the request non-zero for an empty script */
+        s1 = (char *)ckalloc((aln_len+1) * sizeof(char));
+        s2 = (char *)ckalloc((aln_len+1) * sizeof(char));
+    }
+
+    pos = 0;
+    count = 0;
+    for(i=1;i<=to_do;++i) {
+
+        if (debug>1) fprintf(stdout,"%d ",(pint)displ[i]);
+        if(displ[i]==0) {
+            c1 = seq_array[seq1][i1];
+            c2 = seq_array[seq2][i2];
+
+            if (debug>0)
+            {
+                if (c1>max_aa) s1[pos] = '-';
+                else s1[pos]=amino_acid_codes[c1];
+                if (c2>max_aa) s2[pos] = '-';
+                else s2[pos]=amino_acid_codes[c2];
+            }
+
+            if ((c1!=gap_pos1) && (c1 != gap_pos2) &&
+                (c1 == c2)) count++;
+            ++i1;
+            ++i2;
+            ++pos;
+        }
+        else {
+            if((k=displ[i])>0) {
+
+                /* k residues of seq2 opposite a gap in seq1 */
+                if (debug>0)
+                    for (r=0;r<k;r++)
+                    {
+                        s1[pos+r]='-';
+                        if (seq_array[seq2][i2+r]>max_aa) s2[pos+r] = '-';
+                        else s2[pos+r]=amino_acid_codes[seq_array[seq2][i2+r]];
+                    }
+
+                i2 += k;
+                pos += k;
+            }
+            else {
+
+                /* -k residues of seq1 opposite a gap in seq2 */
+                if (debug>0)
+                    for (r=0;r<(-k);r++)
+                    {
+                        s2[pos+r]='-';
+                        if (seq_array[seq1][i1+r]>max_aa) s1[pos+r] = '-';
+                        else s1[pos+r]=amino_acid_codes[seq_array[seq1][i1+r]];
+                    }
+
+                i1 -= k;
+                pos -= k;
+            }
+        }
+    }
+    if (debug>0) fprintf(stdout,"\n");
+    if (debug>0)
+    {
+        for (i=0;i<pos;i++) fprintf(stdout,"%c",s1[i]);
+        fprintf(stdout,"\n");
+        for (i=0;i<pos;i++) fprintf(stdout,"%c",s2[i]);
+        fprintf(stdout,"\n");
+
+        s1 = ckfree((void *)s1);
+        s2 = ckfree((void *)s2);
+    }
+    /*
+    if (count <= 0) count = 1;
+    */
+    score = 100.0 * (float)count;
+    return(score);
+}
+
+
+/*
+ * forward_pass: scan sequence ia (positions 1..n) against ib (positions
+ * 1..m) and find the highest-scoring cell.
+ *
+ * Cell scores use matrix[][] for a residue pair and charge g + gh for the
+ * first gap position and gh for each further one; a cell score is never
+ * allowed below 0. On return maxscore holds the highest cell score and
+ * (se1, se2) the positions in ia and ib where it first occurred
+ * (se1 = se2 = 0 and maxscore = 0 if no cell scored above 0).
+ *
+ * Uses HH[0..m] and DD[0..m] as work rows.
+ */
+static void forward_pass(char *ia, char *ib, sint n, sint m)
+{
+
+ sint i,j;
+ pwint f,hh,p,t;
+
+ maxscore = 0;
+ se1 = se2 = 0;
+ for (i=0;i<=m;i++)
+ {
+ HH[i] = 0;
+ DD[i] = -g;
+ }
+
+ for (i=1;i<=n;i++)
+ {
+ /* hh: score of the cell to the left; p: score of the diagonal cell
+    (previous row, previous column); f: best score ending in a gap that
+    runs along the current row */
+ hh = p = 0;
+ f = -g;
+
+ for (j=1;j<=m;j++)
+ {
+
+ /* extend the gap along the row, or open a new one from the left cell */
+ f -= gh;
+ t = hh - g - gh;
+ if (f<t) f = t;
+
+ /* same for the gap running down column j; HH[j] still holds the
+    previous row here */
+ DD[j] -= gh;
+ t = HH[j] - g - gh;
+ if (DD[j]<t) DD[j] = t;
+
+ /* best of: residue pair on the diagonal, either gap state, or 0 */
+ hh = p + matrix[(int)ia[i]][(int)ib[j]];
+ if (hh<f) hh = f;
+ if (hh<DD[j]) hh = DD[j];
+ if (hh<0) hh = 0;
+
+ /* save the previous row's value as the next diagonal before it is
+    overwritten */
+ p = HH[j];
+ HH[j] = hh;
+
+ if (hh > maxscore)
+ {
+ maxscore = hh;
+ se1 = i;
+ se2 = j;
+ }
+ }
+ }
+
+}
+
+
+/*
+ * reverse_pass: starting from the end point (se1, se2) found by
+ * forward_pass(), scan ia and ib backwards to locate the start of the
+ * best-scoring segment.
+ *
+ * The same scoring as forward_pass() is applied, but without the floor at 0
+ * and with every cell except the one at (se1, se2) seeded from -1. Each time
+ * a cell beats the best score seen so far, (sb1, sb2) is set to that cell;
+ * the scan stops as soon as that score reaches maxscore. If no cell scores
+ * above 0, (sb1, sb2) stays (1, 1).
+ *
+ * Uses HH[1..se2] and DD[1..se2] as work rows.
+ */
+static void reverse_pass(char *ia, char *ib)
+{
+
+ sint i,j;
+ pwint f,hh,p,t;
+ pwint cost;
+
+ cost = 0;
+ sb1 = sb2 = 1;
+ for (i=se2;i>0;i--)
+ {
+ HH[i] = -1;
+ DD[i] = -1;
+ }
+
+ for (i=se1;i>0;i--)
+ {
+ hh = f = -1;
+ /* only the first cell visited, (se1, se2), gets a diagonal value of 0 */
+ if (i == se1) p = 0;
+ else p = -1;
+
+ for (j=se2;j>0;j--)
+ {
+
+ /* extend the gap along the row, or open a new one from the cell just
+    visited */
+ f -= gh;
+ t = hh - g - gh;
+ if (f<t) f = t;
+
+ /* same for the gap along column j; HH[j] still holds the row visited
+    before this one */
+ DD[j] -= gh;
+ t = HH[j] - g - gh;
+ if (DD[j]<t) DD[j] = t;
+
+ hh = p + matrix[(int)ia[i]][(int)ib[j]];
+ if (hh<f) hh = f;
+ if (hh<DD[j]) hh = DD[j];
+
+ p = HH[j];
+ HH[j] = hh;
+
+ if (hh > cost)
+ {
+ cost = hh;
+ sb1 = i;
+ sb2 = j;
+ /* the forward score has been reproduced: this cell is the start */
+ if (cost >= maxscore) break;
+ }
+ }
+ /* leave the outer loop too once the inner loop stopped on a match */
+ if (cost >= maxscore) break;
+ }
+
+}
+
+/*
+ * diff: align M residues of sequence seq1 that follow position A with N
+ * residues of sequence seq2 that follow position B, appending the result to
+ * the alignment script through add(), del() and direct writes to displ.
+ * The comment at the call site names this the Myers and Miller method.
+ *
+ * tb and te replace g as the opening term for a gap at the beginning and at
+ * the end of this sub-problem (see tbgap()/tegap()).
+ *
+ * The problem is split at row midi = M/2: a forward sweep over rows
+ * 1..midi fills HH/DD, a backward sweep over rows M..midi+1 fills RR/SS,
+ * the best crossing column midj is chosen, and the two halves are solved
+ * recursively.
+ *
+ * Returns the score of the alignment found.
+ *
+ * The return type is sint to agree with the prototype at the top of the
+ * file, and integer literals are used where the original passed 0.0 to
+ * integer parameters.
+ */
+static sint diff(sint A,sint B,sint M,sint N,sint tb,sint te)
+{
+    sint type;
+    sint midi,midj,i,j;
+    sint midh;
+    static pwint f, hh, e, s, t;
+
+    /* nothing left in seq2: everything left in seq1 is one deletion */
+    if(N<=0) {
+        if(M>0) {
+            del(M);
+        }
+
+        return(-(sint)tbgap(M));
+    }
+
+    if(M<=1) {
+        /* nothing left in seq1: everything left in seq2 is one insertion */
+        if(M<=0) {
+            add(N);
+            return(-(sint)tbgap(N));
+        }
+
+        /*
+         * Exactly one residue of seq1. Start from the better of the two
+         * ways of leaving it unpaired, then try pairing it with each of the
+         * N residues of seq2.
+         */
+        midh = -(tb+gh) - tegap(N);
+        hh = -(te+gh) - tbgap(N);
+        if (hh>midh) midh = hh;
+        midj = 0;
+        for(j=1;j<=N;j++) {
+            hh = calc_score(1,j,A,B)
+                - tegap(N-j) - tbgap(j-1);
+            if(hh>midh) {
+                midh = hh;
+                midj = j;
+            }
+        }
+
+        if(midj==0) {
+            del(1);
+            add(N);
+        }
+        else {
+            if(midj>1)
+                add(midj-1);
+            displ[print_ptr++] = last_print = 0;
+            if(midj<N)
+                add(N-midj);
+        }
+        return midh;
+    }
+
+/* Divide: Find optimum midpoint (midi,midj) of cost midh */
+
+    midi = M / 2;
+
+    /* forward sweep, rows 1..midi: HH[j] is the best score for column j,
+       DD[j] the best score that ends in a gap running down column j */
+    HH[0] = 0;
+    t = -tb;
+    for(j=1;j<=N;j++) {
+        HH[j] = t = t-gh;
+        DD[j] = t-g;
+    }
+
+    t = -tb;
+    for(i=1;i<=midi;i++) {
+        s=HH[0];
+        HH[0] = hh = t = t-gh;
+        f = t-g;
+        for(j=1;j<=N;j++) {
+            if ((hh=hh-g-gh) > (f=f-gh)) f=hh;
+            if ((hh=HH[j]-g-gh) > (e=DD[j]-gh)) e=hh;
+            hh = s + calc_score(i,j,A,B);
+            if (f>hh) hh = f;
+            if (e>hh) hh = e;
+
+            /* s carries the previous row's HH[j] forward as the next diagonal */
+            s = HH[j];
+            HH[j] = hh;
+            DD[j] = e;
+        }
+    }
+
+    DD[0]=HH[0];
+
+    /* backward sweep, rows M..midi+1: RR/SS mirror HH/DD, using te */
+    RR[N]=0;
+    t = -te;
+    for(j=N-1;j>=0;j--) {
+        RR[j] = t = t-gh;
+        SS[j] = t-g;
+    }
+
+    t = -te;
+    for(i=M-1;i>=midi;i--) {
+        s = RR[N];
+        RR[N] = hh = t = t-gh;
+        f = t-g;
+
+        for(j=N-1;j>=0;j--) {
+
+            if ((hh=hh-g-gh) > (f=f-gh)) f=hh;
+            if ((hh=RR[j]-g-gh) > (e=SS[j]-gh)) e=hh;
+            hh = s + calc_score(i+1,j+1,A,B);
+            if (f>hh) hh = f;
+            if (e>hh) hh = e;
+
+            s = RR[j];
+            RR[j] = hh;
+            SS[j] = e;
+
+        }
+    }
+
+    SS[N]=RR[N];
+
+    /* type 1: the two halves simply meet at column midj */
+    midh=HH[0]+RR[0];
+    midj=0;
+    type=1;
+    for(j=0;j<=N;j++) {
+        hh = HH[j] + RR[j];
+        if(hh>=midh)
+            if(hh>midh || (HH[j]!=DD[j] && RR[j]==SS[j])) {
+                midh=hh;
+                midj=j;
+            }
+    }
+
+    /* type 2: a gap in column j runs across the split row; g is added back
+       because DD[j] and SS[j] each already include an opening term */
+    for(j=N;j>=0;j--) {
+        hh = DD[j] + SS[j] + g;
+        if(hh>midh) {
+            midh=hh;
+            midj=j;
+            type=2;
+        }
+    }
+
+    /* Conquer recursively around midpoint */
+
+
+    if(type==1) { /* Type 1 gaps */
+        diff(A,B,midi,midj,tb,g);
+        diff(A+midi,B+midj,M-midi,N-midj,g,te);
+    }
+    else {
+        /* rows midi and midi+1 form the middle of the crossing gap */
+        diff(A,B,midi-1,midj,tb,(sint)0);
+        del(2);
+        diff(A+midi+1,B+midj,M-midi-1,N-midj,(sint)0,te);
+    }
+
+    return midh; /* Return the score of the best alignment */
+}
+
+/*
+ * del: record k residues of the first sequence opposite a gap.
+ *
+ * If the last recorded entry is already negative, it is made k more
+ * negative; otherwise a new entry -k is appended. last_print ends up equal
+ * to the entry that was written.
+ */
+static void del(sint k)
+{
+    if (last_print >= 0) {
+        /* start a new negative entry */
+        displ[print_ptr] = -(k);
+        last_print = displ[print_ptr];
+        print_ptr++;
+        return;
+    }
+
+    /* lengthen the negative entry written last */
+    displ[print_ptr-1] -= k;
+    last_print = displ[print_ptr-1];
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/param.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/param.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/param.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,383 @@
+#define MAXARGS 100
+/*
+ * One row of a command line switch table (see cmd_line_file[],
+ * cmd_line_verb[] and cmd_line_para[] at the end of this file).
+ */
+typedef struct {
+ char *str; /* switch name; "" in the terminating row */
+ sint *flag; /* address of the set... variable paired with the switch */
+ int type; /* argument kind: NOARG, INTARG, FLTARG, STRARG, FILARG or OPTARG; -1 in the terminating row */
+ char **arg; /* list of accepted values for OPTARG switches, NULL otherwise */
+} cmd_line_data;
+
+/*
+ command line switches
+
+ One variable per switch.  Each is referenced by address from the
+ cmd_line_file[], cmd_line_verb[] and cmd_line_para[] tables at the end
+ of this file, and each starts at -1.
+ NOTE(review): -1 presumably means "switch not given"; confirm against
+ the command line parser that fills these in.
+*/
+sint setoptions = -1;
+sint sethelp = -1;
+sint setinteractive = -1;
+sint setbatch = -1;
+sint setgapopen = -1;
+sint setgapext = -1;
+sint setpwgapopen = -1;
+sint setpwgapext = -1;
+sint setoutorder = -1;
+sint setbootlabels = -1;
+sint setpwmatrix = -1;
+sint setmatrix = -1;
+sint setpwdnamatrix = -1;
+sint setdnamatrix = -1;
+sint setnegative = -1;
+sint setnoweights = -1;
+sint setoutput = -1;
+sint setoutputtree = -1;
+sint setquicktree = -1;
+sint settype = -1;
+sint setcase = -1;
+sint setseqno = -1;
+
+sint setseqno_range = -1;
+sint setrange = -1;
+
+sint settransweight = -1;
+sint setseed = -1;
+sint setscore = -1;
+sint setwindow = -1;
+sint setktuple = -1;
+sint setkimura = -1;
+sint settopdiags = -1;
+sint setpairgap = -1;
+sint settossgaps = -1;
+sint setnopgap = -1;
+sint setnohgap = -1;
+sint setnovgap = -1;
+sint sethgapres = -1;
+sint setvgapres = -1; /* not referenced by any switch table in this file */
+sint setuseendgaps = -1;
+sint setmaxdiv = -1;
+sint setgapdist = -1;
+sint setdebug = -1;
+sint setoutfile = -1;
+sint setinfile = -1;
+sint setprofile1 = -1;
+sint setprofile2 = -1;
+sint setalign = -1;
+sint setconvert = -1;
+sint setnewtree = -1;
+sint setusetree = -1;
+sint setnewtree1 = -1;
+sint setusetree1 = -1;
+sint setnewtree2 = -1;
+sint setusetree2 = -1;
+sint setbootstrap = -1;
+sint settree = -1;
+sint setprofile = -1;
+sint setsequences = -1;
+sint setsecstr1 = -1;
+sint setsecstr2 = -1;
+sint setsecstroutput = -1;
+sint sethelixgap = -1;
+sint setstrandgap = -1;
+sint setloopgap = -1;
+sint setterminalgap = -1;
+sint sethelixendin = -1;
+sint sethelixendout = -1;
+sint setstrandendin = -1;
+sint setstrandendout = -1;
+
+/*
+ multiple alignment parameters
+
+ These are definitions (not extern declarations) with their initial
+ values, so this header can only be included from one translation unit.
+*/
+float dna_gap_open = 15.0, dna_gap_extend = 6.66;
+float prot_gap_open = 10.0, prot_gap_extend = 0.2;
+sint profile_type = PROFILE;
+sint gap_dist = 4;
+sint output_order = ALIGNED;
+sint divergence_cutoff = 30;
+sint matnum = 3;
+char mtrxname[FILENAMELEN+1] = "gonnet"; /* protein matrix name; compared against "blosum", "pam", "gonnet", "id" in prfalign() */
+sint dnamatnum = 1;
+char dnamtrxname[FILENAMELEN+1] = "iub"; /* DNA matrix name; compared against "iub" and "clustalw" in prfalign() */
+char hyd_residues[] = "GPSNDQEKR"; /* presumably the residues treated as hydrophilic for gap penalties - TODO confirm */
+Boolean no_weights = FALSE;
+Boolean neg_matrix = FALSE;
+Boolean no_hyd_penalties = FALSE;
+Boolean no_var_penalties = TRUE;
+Boolean no_pref_penalties = FALSE;
+Boolean use_endgaps = FALSE;
+Boolean endgappenalties = FALSE; /* when FALSE the prfalign.c penalty helpers return 0 at profile ends */
+Boolean reset_alignments_new = FALSE; /* DES */
+Boolean reset_alignments_all = FALSE; /* DES */
+sint output_struct_penalties = 0;
+sint struct_penalties1 = NONE;
+sint struct_penalties2 = NONE;
+Boolean use_ss1 = TRUE;
+Boolean use_ss2 = TRUE;
+sint helix_penalty = 4;
+sint strand_penalty = 4;
+sint loop_penalty = 1;
+sint helix_end_minus = 3;
+sint helix_end_plus = 0;
+sint strand_end_minus = 1;
+sint strand_end_plus = 1;
+sint helix_end_penalty = 2;
+sint strand_end_penalty = 2;
+Boolean use_ambiguities = FALSE;
+
+/*
+ pairwise alignment parameters
+*/
+float dna_pw_go_penalty = 15.0, dna_pw_ge_penalty = 6.66;
+float prot_pw_go_penalty = 10.0, prot_pw_ge_penalty = 0.1;
+sint pw_matnum = 3;
+char pw_mtrxname[FILENAMELEN+1] = "gonnet";
+sint pw_dnamatnum = 1;
+char pw_dnamtrxname[FILENAMELEN+1] = "iub";
+char usermtrxname[FILENAMELEN+1], pw_usermtrxname[FILENAMELEN+1];
+char dnausermtrxname[FILENAMELEN+1], pw_dnausermtrxname[FILENAMELEN+1];
+
+Boolean quick_pairalign = FALSE;
+float transition_weight = 0.5; /* factor applied to matrix[0][0] for the DNA transition pairs in prfalign() */
+sint new_seq;
+
+/*
+ quick pairwise alignment parameters
+*/
+sint dna_ktup = 2; /* default parameters for DNA */
+sint dna_wind_gap = 5;
+sint dna_signif = 4;
+sint dna_window = 4;
+
+sint prot_ktup = 1; /* default parameters for proteins */
+sint prot_wind_gap = 3;
+sint prot_signif = 5;
+sint prot_window = 5;
+Boolean percent=TRUE;
+Boolean tossgaps = FALSE;
+Boolean kimura = FALSE;
+
+
+sint boot_ntrials = 1000;
+unsigned sint boot_ran_seed = 111; /* NOTE(review): "unsigned sint" is only valid if sint is a macro rather than a typedef - confirm where sint is defined */
+
+
+sint debug = 0; /* verbosity level; prfalign.c prints diagnostics when debug > 0, > 1, > 2 and > 4 */
+
+Boolean explicit_dnaflag = FALSE; /* Explicit setting of sequence type on comm.line*/
+Boolean lowercase = TRUE; /* Flag for GDE output - set on comm. line*/
+Boolean cl_seq_numbers = FALSE;
+
+Boolean seqRange = FALSE; /* Ramu */
+
+/* alignment output formats: only the CLUSTAL format is switched on initially */
+Boolean output_clustal = TRUE;
+Boolean output_gcg = FALSE;
+Boolean output_phylip = FALSE;
+Boolean output_nbrf = FALSE;
+Boolean output_gde = FALSE;
+Boolean output_nexus = FALSE;
+Boolean output_fasta = FALSE;
+
+Boolean showaln = TRUE;
+Boolean save_parameters = FALSE;
+
+/* DES */
+/* tree output formats: only the PHYLIP format is switched on initially */
+Boolean output_tree_clustal = FALSE;
+Boolean output_tree_phylip = TRUE;
+Boolean output_tree_distances = FALSE;
+Boolean output_tree_nexus = FALSE;
+Boolean output_pim = FALSE;
+
+
+sint bootstrap_format = BS_BRANCH_LABELS;
+
+/* These are all the positively scoring groups that occur in the Gonnet Pam250
+matrix. There are strong and weak groups, defined as strong score > 0.5 and
+weak score <= 0.5. Strongly matching columns are assigned ':' and weakly
+matching columns '.' in the clustal output format.
+Both lists are terminated by a NULL entry.
+*/
+
+/* strong groups */
+char *res_cat1[] = {
+ "STA",
+ "NEQK",
+ "NHQK",
+ "NDEQ",
+ "QHRK",
+ "MILV",
+ "MILF",
+ "HY",
+ "FYW",
+ NULL };
+
+/* weak groups */
+char *res_cat2[] = {
+ "CSA",
+ "ATV",
+ "SAG",
+ "STNK",
+ "STPA",
+ "SGND",
+ "SNDEQK",
+ "NDEQHK",
+ "NEQHRK",
+ "FVLIM",
+ "HFY",
+ NULL };
+
+
+/*
+ * Accepted values for the OPTARG switches.  Each list is terminated by an
+ * empty string and is attached to its switch in cmd_line_para[].
+ */
+static char *type_arg[] = { /* "type" */
+ "protein",
+ "dna",
+ ""};
+
+static char *bootlabels_arg[] = { /* "bootlabels" */
+ "node",
+ "branch",
+ ""};
+
+static char *outorder_arg[] = { /* "outorder" */
+ "input",
+ "aligned",
+ ""};
+
+static char *case_arg[] = { /* "case" */
+ "lower",
+ "upper",
+ ""};
+
+static char *seqno_arg[] = { /* "seqnos" */
+ "off",
+ "on",
+ ""};
+
+static char *seqno_range_arg[] = { /* "seqno_range" */
+ "off",
+ "on",
+ ""};
+
+static char *score_arg[] = { /* "score" */
+ "percent",
+ "absolute",
+ ""};
+
+static char *output_arg[] = { /* "output" */
+ "gcg",
+ "gde",
+ "pir",
+ "phylip",
+ "nexus",
+ "fasta",
+ ""};
+
+static char *outputtree_arg[] = { /* "outputtree" */
+ "nj",
+ "phylip",
+ "dist",
+ "nexus",
+ ""};
+
+static char *outputsecstr_arg[] = { /* "secstrout" */
+ "structure",
+ "mask",
+ "both",
+ "none",
+ ""};
+
+/*
+ command line initialisation
+
+ Values stored in the "type" member of cmd_line_data:
+
+ type = 0 no argument
+ type = 1 integer argument
+ type = 2 float argument
+ type = 3 string argument
+ type = 4 filename
+ type = 5 opts (one value out of the list in the "arg" member)
+*/
+#define NOARG 0
+#define INTARG 1
+#define FLTARG 2
+#define STRARG 3
+#define FILARG 4
+#define OPTARG 5
+
+
+/* command line switches for DATA **************************/
+/* Each row is: switch name, flag variable, argument kind, value list.
+ Every table ends with a row whose name is "" and whose type is -1
+ (the omitted last member of that row is zero-initialised). */
+cmd_line_data cmd_line_file[] = {
+ "infile", &setinfile, FILARG, NULL,
+ "profile1", &setprofile1, FILARG, NULL,
+ "profile2", &setprofile2, FILARG, NULL,
+ "", NULL, -1};
+/* command line switches for VERBS **************************/
+cmd_line_data cmd_line_verb[] = {
+ "help", &sethelp, NOARG, NULL,
+ "check", &sethelp, NOARG, NULL, /* shares the flag of "help" */
+ "options", &setoptions, NOARG, NULL,
+ "align", &setalign, NOARG, NULL,
+ "newtree", &setnewtree, FILARG, NULL,
+ "usetree", &setusetree, FILARG, NULL,
+ "newtree1", &setnewtree1, FILARG, NULL,
+ "usetree1", &setusetree1, FILARG, NULL,
+ "newtree2", &setnewtree2, FILARG, NULL,
+ "usetree2", &setusetree2, FILARG, NULL,
+ "bootstrap", &setbootstrap, NOARG, NULL,
+ "tree", &settree, NOARG, NULL,
+ "quicktree", &setquicktree, NOARG, NULL,
+ "convert", &setconvert, NOARG, NULL,
+ "interactive", &setinteractive, NOARG, NULL,
+ "batch", &setbatch, NOARG, NULL,
+ "", NULL, -1};
+/* command line switches for PARAMETERS **************************/
+cmd_line_data cmd_line_para[] = {
+ "type", &settype, OPTARG, type_arg,
+ "profile", &setprofile, NOARG, NULL,
+ "sequences", &setsequences, NOARG, NULL,
+ "matrix", &setmatrix, FILARG, NULL,
+ "dnamatrix", &setdnamatrix, FILARG, NULL,
+ "negative", &setnegative, NOARG, NULL,
+ "noweights", &setnoweights, NOARG, NULL,
+ "gapopen", &setgapopen, FLTARG, NULL,
+ "gapext", &setgapext, FLTARG, NULL,
+ "endgaps", &setuseendgaps, NOARG, NULL,
+ "nopgap", &setnopgap, NOARG, NULL,
+ "nohgap", &setnohgap, NOARG, NULL,
+ "novgap", &setnovgap, NOARG, NULL,
+ "hgapresidues", &sethgapres, STRARG, NULL,
+ "maxdiv", &setmaxdiv, INTARG, NULL,
+
+ "gapdist", &setgapdist, INTARG, NULL,
+ "pwmatrix", &setpwmatrix, FILARG, NULL,
+ "pwdnamatrix", &setpwdnamatrix, FILARG, NULL,
+ "pwgapopen", &setpwgapopen, FLTARG, NULL,
+ "pwgapext", &setpwgapext, FLTARG, NULL,
+ "ktuple", &setktuple, INTARG, NULL,
+ "window", &setwindow, INTARG, NULL,
+ "pairgap", &setpairgap, INTARG, NULL,
+ "topdiags", &settopdiags, INTARG, NULL,
+ "score", &setscore, OPTARG, score_arg,
+ "transweight", &settransweight, FLTARG, NULL,
+ "seed", &setseed, INTARG, NULL,
+ "kimura", &setkimura, NOARG, NULL,
+ "tossgaps", &settossgaps, NOARG, NULL,
+ "bootlabels", &setbootlabels, OPTARG, bootlabels_arg,
+ "debug", &setdebug, INTARG, NULL,
+ "output", &setoutput, OPTARG, output_arg,
+ "outputtree", &setoutputtree, OPTARG, outputtree_arg,
+ "outfile", &setoutfile, FILARG, NULL,
+ "outorder", &setoutorder, OPTARG, outorder_arg,
+ "case", &setcase, OPTARG, case_arg,
+ "seqnos", &setseqno, OPTARG, seqno_arg,
+
+ "seqno_range", &setseqno_range, OPTARG, seqno_range_arg, /* this one should be on/off and */
+ "range", &setrange, STRARG, NULL, /* this one should be like 10:20 , messy option settings */
+
+ "nosecstr1", &setsecstr1, NOARG, NULL,
+ "nosecstr2", &setsecstr2, NOARG, NULL,
+ "secstrout", &setsecstroutput, OPTARG, outputsecstr_arg,
+ "helixgap", &sethelixgap, INTARG, NULL,
+ "strandgap", &setstrandgap, INTARG, NULL,
+ "loopgap", &setloopgap, INTARG, NULL,
+ "terminalgap", &setterminalgap, INTARG, NULL,
+ "helixendin", &sethelixendin, INTARG, NULL,
+ "helixendout", &sethelixendout, INTARG, NULL,
+ "strandendin", &setstrandendin, INTARG, NULL,
+ "strandendout",&setstrandendout, INTARG, NULL,
+
+ "", NULL, -1};
+
+
Added: trunk/packages/clustalw/branches/upstream/current/prfalign.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/prfalign.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/prfalign.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "clustalw.h"
+#define ENDALN 127
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+/*
+ * Prototypes
+ */
+static lint pdiff(sint A,sint B,sint i,sint j,sint go1,sint go2);
+static lint prfscore(sint n, sint m);
+static sint gap_penalty1(sint i, sint j,sint k);
+static sint open_penalty1(sint i, sint j);
+static sint ext_penalty1(sint i, sint j);
+static sint gap_penalty2(sint i, sint j,sint k);
+static sint open_penalty2(sint i, sint j);
+static sint ext_penalty2(sint i, sint j);
+static void padd(sint k);
+static void pdel(sint k);
+static void palign(void);
+static void ptracepath(sint *alen);
+static void add_ggaps(void);
+static char * add_ggaps_mask(char *mask, int len, char *path1, char *path2);
+
+/*
+ * Global variables
+ */
+extern double **tmat;
+extern float gap_open, gap_extend;
+extern float transition_weight;
+extern sint gap_pos1, gap_pos2;
+extern sint max_aa;
+extern sint nseqs;
+extern sint *seqlen_array;
+extern sint *seq_weight;
+extern sint debug;
+extern Boolean neg_matrix;
+extern sint mat_avscore;
+extern short blosum30mt[], blosum40mt[], blosum45mt[];
+extern short blosum62mt2[], blosum80mt[];
+extern short pam20mt[], pam60mt[];
+extern short pam120mt[], pam160mt[], pam350mt[];
+extern short gon40mt[], gon80mt[];
+extern short gon120mt[], gon160mt[], gon250mt[], gon350mt[];
+extern short clustalvdnamt[],swgapdnamt[];
+extern short idmat[];
+extern short usermat[];
+extern short userdnamat[];
+extern Boolean user_series;
+extern UserMatSeries matseries;
+
+extern short def_dna_xref[],def_aa_xref[],dna_xref[],aa_xref[];
+extern sint max_aln_length;
+extern Boolean distance_tree;
+extern Boolean dnaflag;
+extern char mtrxname[];
+extern char dnamtrxname[];
+extern char **seq_array;
+extern char *amino_acid_codes;
+extern char *gap_penalty_mask1,*gap_penalty_mask2;
+extern char *sec_struct_mask1,*sec_struct_mask2;
+extern sint struct_penalties1, struct_penalties2;
+extern Boolean use_ss1, use_ss2;
+extern Boolean endgappenalties;
+/*
+ * File-scope working state shared by prfalign() and its helpers.
+ */
+static sint print_ptr,last_print; /* next free slot in displ[]; last value recorded by pdel/padd/palign */
+static sint *displ; /* edit script: 0 = aligned column, >0 = run taken from profile 2 only, <0 = run taken from profile 1 only */
+
+static char **alignment; /* working copies of the sequences: profile 1 rows first, then profile 2 rows */
+static sint *aln_len; /* length of each alignment[] row */
+static sint *aln_weight; /* seq_weight entry copied for each alignment[] row */
+static char *aln_path1, *aln_path2; /* per output column: 2 = copy next column of that profile, 1 = insert a gap */
+static sint alignment_len; /* number of output columns, set by ptracepath() */
+static sint **profile1, **profile2; /* prf_length+2 rows of LENCOL+2 entries each */
+static lint *HH, *DD, *RR, *SS; /* forward (HH,DD) and reverse (RR,SS) score vectors of pdiff() */
+static lint *gS; /* open penalties remembered by the reverse pass of pdiff() */
+static sint matrix[NUMRES][NUMRES]; /* substitution matrix filled in by get_matrix() */
+static sint nseqs1, nseqs2; /* number of sequences in profile 1 and in profile 2 */
+static sint prf_length1, prf_length2; /* number of columns in each profile */
+static sint *gaps; /* work array handed to calc_gap_coeff() and calc_prf1() */
+static sint gapcoef1,gapcoef2; /* gap opening coefficients handed to calc_gap_coeff() */
+static sint lencoef1,lencoef2; /* gap extension coefficients handed to calc_gap_coeff() */
+static Boolean switch_profiles; /* TRUE when prfalign() swapped the group 1 / group 2 labels */
+
+/*
+ * Release the working copies set up at the top of prfalign().
+ * nrows is the number of alignment[] rows allocated so far.
+ * Used on the early-exit paths of prfalign().
+ */
+static void prf_free_work(sint nrows)
+{
+ sint r;
+
+ for (r=0;r<nrows;r++)
+ alignment[r]=ckfree((void *)alignment[r]);
+ alignment=ckfree((void *)alignment);
+ aln_len=ckfree((void *)aln_len);
+ aln_weight=ckfree((void *)aln_weight);
+}
+
+/*
+ * prfalign: align the two groups of sequences selected by group[] as
+ * profiles and write the gapped result back into seq_array/seqlen_array.
+ *
+ * group   - indexed from 1; 1 marks members of profile 1, 2 members of
+ *           profile 2.  Entries whose aligned[] value is 0 are reset to 0,
+ *           and the 1/2 labels are swapped in place when profile 2 holds
+ *           more sequences than profile 1.
+ * aligned - indexed from 1; 0 excludes the sequence from this call.
+ *
+ * Returns the pdiff() score divided by 100, 0 when either group is empty,
+ * or -1 when get_matrix() reports that no matrix is available.
+ *
+ * Changes from the previous version:
+ *  - the three early returns now free alignment, aln_len and aln_weight;
+ *  - minlen is reset when a profile has zero mean length, so the static
+ *    no longer carries a value over from an earlier call;
+ *  - the unchecked realloc() that preceded realloc_seq() for group 2 is
+ *    gone; group 1 already relies on realloc_seq() alone.
+ */
+lint prfalign(sint *group, sint *aligned)
+{
+
+ static Boolean found;
+ static Boolean negative;
+ static Boolean error_given=FALSE;
+ static sint i, j, count = 0;
+ static sint NumSeq;
+ static sint len, len1, len2, is, minlen;
+ static sint se1, se2, sb1, sb2;
+ static sint maxres;
+ static sint int_scale;
+ static short *matptr;
+ static short *mat_xref;
+ static char c;
+ static lint score;
+ static float scale;
+ static double logmin,logdiff;
+ static double pcid;
+
+
+ alignment = (char **) ckalloc( nseqs * sizeof (char *) );
+ aln_len = (sint *) ckalloc( nseqs * sizeof (sint) );
+ aln_weight = (sint *) ckalloc( nseqs * sizeof (sint) );
+
+ for (i=0;i<nseqs;i++)
+ if (aligned[i+1] == 0) group[i+1] = 0;
+
+ nseqs1 = nseqs2 = 0;
+ for (i=0;i<nseqs;i++)
+ {
+ if (group[i+1] == 1) nseqs1++;
+ else if (group[i+1] == 2) nseqs2++;
+ }
+
+ if ((nseqs1 == 0) || (nseqs2 == 0))
+ {
+/* nothing to align: release the arrays allocated above before leaving */
+ prf_free_work((sint)0);
+ return(0);
+ }
+
+/* keep the larger group as profile 1 */
+ if (nseqs2 > nseqs1)
+ {
+ switch_profiles = TRUE;
+ for (i=0;i<nseqs;i++)
+ {
+ if (group[i+1] == 1) group[i+1] = 2;
+ else if (group[i+1] == 2) group[i+1] = 1;
+ }
+ }
+ else
+ switch_profiles = FALSE;
+
+ int_scale = 100;
+
+/*
+ calculate the mean of the sequence pc identities between the two groups
+*/
+ count = 0;
+ pcid = 0.0;
+ negative=neg_matrix;
+ for (i=0;i<nseqs;i++)
+ {
+ if (group[i+1] == 1)
+ for (j=0;j<nseqs;j++)
+ if (group[j+1] == 2)
+ {
+ count++;
+ pcid += tmat[i+1][j+1];
+ }
+ }
+
+/* count is at least 1 here because both groups are non-empty */
+ pcid = pcid/(float)count;
+
+if (debug > 0) fprintf(stdout,"mean tmat %3.1f\n", pcid);
+
+
+/*
+ Make the first profile.
+*/
+ prf_length1 = 0;
+ for (i=0;i<nseqs;i++)
+ if (group[i+1] == 1)
+ if(seqlen_array[i+1]>prf_length1) prf_length1=seqlen_array[i+1];
+
+ nseqs1 = 0;
+if (debug>0) fprintf(stdout,"sequences profile 1:\n");
+ for (i=0;i<nseqs;i++)
+ {
+ if (group[i+1] == 1)
+ {
+if (debug>0) {
+extern char **names;
+fprintf(stdout,"%s\n",names[i+1]);
+}
+ len = seqlen_array[i+1];
+ alignment[nseqs1] = (char *) ckalloc( (prf_length1+2) * sizeof (char) );
+ for (j=0;j<len;j++)
+ alignment[nseqs1][j] = seq_array[i+1][j+1];
+/* pad shorter sequences with gaps up to the profile length */
+ for(j=len;j<prf_length1;j++)
+ alignment[nseqs1][j]=gap_pos1;
+ alignment[nseqs1][j] = ENDALN;
+ aln_len[nseqs1] = prf_length1;
+ aln_weight[nseqs1] = seq_weight[i];
+ nseqs1++;
+ }
+ }
+
+/*
+ Make the second profile.
+*/
+ prf_length2 = 0;
+ for (i=0;i<nseqs;i++)
+ if (group[i+1] == 2)
+ if(seqlen_array[i+1]>prf_length2) prf_length2=seqlen_array[i+1];
+
+ nseqs2 = 0;
+if (debug>0) fprintf(stdout,"sequences profile 2:\n");
+ for (i=0;i<nseqs;i++)
+ {
+ if (group[i+1] == 2)
+ {
+if (debug>0) {
+extern char **names;
+fprintf(stdout,"%s\n",names[i+1]);
+}
+ len = seqlen_array[i+1];
+ alignment[nseqs1+nseqs2] =
+ (char *) ckalloc( (prf_length2+2) * sizeof (char) );
+ for (j=0;j<len;j++)
+ alignment[nseqs1+nseqs2][j] = seq_array[i+1][j+1];
+ for(j=len;j<prf_length2;j++)
+ alignment[nseqs1+nseqs2][j]=gap_pos1;
+ alignment[nseqs1+nseqs2][j] = ENDALN;
+ aln_len[nseqs1+nseqs2] = prf_length2;
+ aln_weight[nseqs1+nseqs2] = seq_weight[i];
+ nseqs2++;
+ }
+ }
+
+ max_aln_length = prf_length1 + prf_length2+2;
+
+/*
+ calculate real length of profiles - removing gaps!
+*/
+ len1=0;
+ for (i=0;i<nseqs1;i++)
+ {
+ is=0;
+ for (j=0; j<MIN(aln_len[i],prf_length1); j++)
+ {
+ c = alignment[i][j];
+ if ((c !=gap_pos1) && (c != gap_pos2)) is++;
+ }
+ len1+=is;
+ }
+ len1/=(float)nseqs1;
+
+ len2=0;
+ for (i=nseqs1;i<nseqs2+nseqs1;i++)
+ {
+ is=0;
+ for (j=0; j<MIN(aln_len[i],prf_length2); j++)
+ {
+ c = alignment[i][j];
+ if ((c !=gap_pos1) && (c != gap_pos2)) is++;
+ }
+ len2+=is;
+ }
+ len2/=(float)nseqs2;
+
+ if (dnaflag)
+ {
+ scale=1.0;
+ if (strcmp(dnamtrxname, "iub") == 0)
+ {
+ matptr = swgapdnamt;
+ mat_xref = def_dna_xref;
+ }
+ else if (strcmp(dnamtrxname, "clustalw") == 0)
+ {
+ matptr = clustalvdnamt;
+ mat_xref = def_dna_xref;
+ scale=0.66;
+ }
+ else
+ {
+ matptr = userdnamat;
+ mat_xref = dna_xref;
+ }
+ maxres = get_matrix(matptr, mat_xref, matrix, neg_matrix, int_scale);
+ if (maxres == 0)
+ {
+ prf_free_work(nseqs1+nseqs2);
+ return((sint)-1);
+ }
+/*
+ matrix[0][4]=transition_weight*matrix[0][0];
+ matrix[4][0]=transition_weight*matrix[0][0];
+ matrix[2][11]=transition_weight*matrix[0][0];
+ matrix[11][2]=transition_weight*matrix[0][0];
+ matrix[2][12]=transition_weight*matrix[0][0];
+ matrix[12][2]=transition_weight*matrix[0][0];
+*/
+/* fix suggested by Chanan Rubin at Compugen */
+ matrix[mat_xref[0]][mat_xref[4]]=transition_weight*matrix[0][0];
+ matrix[mat_xref[4]][mat_xref[0]]=transition_weight*matrix[0][0];
+ matrix[mat_xref[2]][mat_xref[11]]=transition_weight*matrix[0][0];
+ matrix[mat_xref[11]][mat_xref[2]]=transition_weight*matrix[0][0];
+ matrix[mat_xref[2]][mat_xref[12]]=transition_weight*matrix[0][0];
+ matrix[mat_xref[12]][mat_xref[2]]=transition_weight*matrix[0][0];
+
+ gapcoef1 = gapcoef2 = 100.0 * gap_open *scale;
+ lencoef1 = lencoef2 = 100.0 * gap_extend *scale;
+ }
+ else
+ {
+ if(len1==0 || len2==0) {
+ logmin=1.0;
+ logdiff=1.0;
+/* minlen is static: reset it so the gonnet branch below does not see a
+ value left over from an earlier call */
+ minlen = 0;
+ }
+ else {
+ minlen = MIN(len1,len2);
+ logmin = 1.0/log10((double)minlen);
+ if (len2<len1)
+ logdiff = 1.0+0.5*log10((double)((float)len2/(float)len1));
+ else if (len1<len2)
+ logdiff = 1.0+0.5*log10((double)((float)len1/(float)len2));
+ else logdiff=1.0;
+ if(logdiff<0.9) logdiff=0.9;
+ }
+if(debug>0) fprintf(stdout,"%d %d logmin %f logdiff %f\n",
+(pint)len1,(pint)len2, logmin,logdiff);
+ scale=0.75;
+ if (strcmp(mtrxname, "blosum") == 0)
+ {
+ scale=0.75;
+ if (negative || distance_tree == FALSE) matptr = blosum40mt;
+ else if (pcid > 80.0)
+ {
+ matptr = blosum80mt;
+ }
+ else if (pcid > 60.0)
+ {
+ matptr = blosum62mt2;
+ }
+ else if (pcid > 40.0)
+ {
+ matptr = blosum45mt;
+ }
+ else if (pcid > 30.0)
+ {
+ scale=0.5;
+ matptr = blosum45mt;
+ }
+ else if (pcid > 20.0)
+ {
+ scale=0.6;
+ matptr = blosum45mt;
+ }
+ else
+ {
+ scale=0.6;
+ matptr = blosum30mt;
+ }
+ mat_xref = def_aa_xref;
+
+ }
+ else if (strcmp(mtrxname, "pam") == 0)
+ {
+ scale=0.75;
+ if (negative || distance_tree == FALSE) matptr = pam120mt;
+ else if (pcid > 80.0) matptr = pam20mt;
+ else if (pcid > 60.0) matptr = pam60mt;
+ else if (pcid > 40.0) matptr = pam120mt;
+ else matptr = pam350mt;
+ mat_xref = def_aa_xref;
+ }
+ else if (strcmp(mtrxname, "gonnet") == 0)
+ {
+ scale/=2.0;
+ if (negative || distance_tree == FALSE) matptr = gon250mt;
+ else if (pcid > 35.0)
+ {
+ matptr = gon80mt;
+ scale/=2.0;
+ }
+ else if (pcid > 25.0)
+ {
+ if(minlen<100) matptr = gon250mt;
+ else matptr = gon120mt;
+ }
+ else
+ {
+ if(minlen<100) matptr = gon350mt;
+ else matptr = gon160mt;
+ }
+ mat_xref = def_aa_xref;
+ int_scale /= 10;
+ }
+ else if (strcmp(mtrxname, "id") == 0)
+ {
+ matptr = idmat;
+ mat_xref = def_aa_xref;
+ }
+ else if(user_series)
+ {
+ matptr=NULL;
+ found=FALSE;
+ for(i=0;i<matseries.nmat;i++)
+ if(pcid>=matseries.mat[i].llimit && pcid<=matseries.mat[i].ulimit)
+ {
+ j=i;
+ found=TRUE;
+ break;
+ }
+ if(found==FALSE)
+ {
+ if(!error_given)
+ warning(
+"\nSeries matrix not found for sequence percent identity = %d.\n"
+"(Using first matrix in series as a default.)\n"
+"This alignment may not be optimal!\n"
+"SUGGESTION: Check your matrix series input file and try again.",(int)pcid);
+ error_given=TRUE;
+ j=0;
+ }
+if (debug>0) fprintf(stdout,"pcid %d matrix %d\n",(pint)pcid,(pint)j+1);
+
+ matptr = matseries.mat[j].matptr;
+ mat_xref = matseries.mat[j].aa_xref;
+/* this gives a scale of 0.5 for pcid=llimit and 1.0 for pcid=ulimit */
+ scale=0.5+(pcid-matseries.mat[j].llimit)/((matseries.mat[j].ulimit-matseries.mat[j].llimit)*2.0);
+ }
+ else
+ {
+ matptr = usermat;
+ mat_xref = aa_xref;
+ }
+if(debug>0) fprintf(stdout,"pcid %3.1f scale %3.1f\n",pcid,scale);
+ maxres = get_matrix(matptr, mat_xref, matrix, negative, int_scale);
+ if (maxres == 0)
+ {
+ fprintf(stdout,"Error: matrix %s not found\n", mtrxname);
+ prf_free_work(nseqs1+nseqs2);
+ return(-1);
+ }
+
+ if (negative) {
+ gapcoef1 = gapcoef2 = 100.0 * (float)(gap_open);
+ lencoef1 = lencoef2 = 100.0 * gap_extend;
+ }
+ else {
+ if (mat_avscore <= 0)
+ gapcoef1 = gapcoef2 = 100.0 * (float)(gap_open + logmin);
+ else
+ gapcoef1 = gapcoef2 = scale * mat_avscore * (float)(gap_open/(logdiff*logmin));
+ lencoef1 = lencoef2 = 100.0 * gap_extend;
+ }
+ }
+if (debug>0)
+{
+fprintf(stdout,"matavscore %d\n",mat_avscore);
+fprintf(stdout,"Gap Open1 %d Gap Open2 %d Gap Extend1 %d Gap Extend2 %d\n",
+ (pint)gapcoef1,(pint)gapcoef2, (pint)lencoef1,(pint)lencoef2);
+fprintf(stdout,"Matrix %s\n", mtrxname);
+}
+
+ profile1 = (sint **) ckalloc( (prf_length1+2) * sizeof (sint *) );
+ for(i=0; i<prf_length1+2; i++)
+ profile1[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
+
+ profile2 = (sint **) ckalloc( (prf_length2+2) * sizeof (sint *) );
+ for(i=0; i<prf_length2+2; i++)
+ profile2[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
+
+/*
+ calculate the Gap Coefficients.
+*/
+ gaps = (sint *) ckalloc( (max_aln_length+1) * sizeof (sint) );
+
+/* after a label swap, profile 1 holds the caller's group 2, so it takes
+ the second structure mask */
+ if (switch_profiles == FALSE)
+ calc_gap_coeff(alignment, gaps, profile1, (struct_penalties1 && use_ss1), gap_penalty_mask1,
+ (sint)0, nseqs1, prf_length1, gapcoef1, lencoef1);
+ else
+ calc_gap_coeff(alignment, gaps, profile1, (struct_penalties2 && use_ss2), gap_penalty_mask2,
+ (sint)0, nseqs1, prf_length1, gapcoef1, lencoef1);
+/*
+ calculate the profile matrix.
+*/
+ calc_prf1(profile1, alignment, gaps, matrix,
+ aln_weight, prf_length1, (sint)0, nseqs1);
+
+if (debug>4)
+{
+extern char *amino_acid_codes;
+ for (j=0;j<=max_aa;j++)
+ fprintf(stdout,"%c ", amino_acid_codes[j]);
+ fprintf(stdout,"\n");
+ for (i=0;i<prf_length1;i++)
+ {
+ for (j=0;j<=max_aa;j++)
+ fprintf(stdout,"%d ", (pint)profile1[i+1][j]);
+ fprintf(stdout,"%d ", (pint)profile1[i+1][gap_pos1]);
+ fprintf(stdout,"%d ", (pint)profile1[i+1][gap_pos2]);
+ fprintf(stdout,"%d %d\n",(pint)profile1[i+1][GAPCOL],(pint)profile1[i+1][LENCOL]);
+ }
+}
+
+/*
+ calculate the Gap Coefficients.
+*/
+
+ if (switch_profiles == FALSE)
+ calc_gap_coeff(alignment, gaps, profile2, (struct_penalties2 && use_ss2), gap_penalty_mask2,
+ nseqs1, nseqs1+nseqs2, prf_length2, gapcoef2, lencoef2);
+ else
+ calc_gap_coeff(alignment, gaps, profile2, (struct_penalties1 && use_ss1), gap_penalty_mask1,
+ nseqs1, nseqs1+nseqs2, prf_length2, gapcoef2, lencoef2);
+/*
+ calculate the profile matrix.
+*/
+ calc_prf2(profile2, alignment, aln_weight,
+ prf_length2, nseqs1, nseqs1+nseqs2);
+
+ aln_weight=ckfree((void *)aln_weight);
+
+if (debug>4)
+{
+extern char *amino_acid_codes;
+ for (j=0;j<=max_aa;j++)
+ fprintf(stdout,"%c ", amino_acid_codes[j]);
+ fprintf(stdout,"\n");
+ for (i=0;i<prf_length2;i++)
+ {
+ for (j=0;j<=max_aa;j++)
+ fprintf(stdout,"%d ", (pint)profile2[i+1][j]);
+ fprintf(stdout,"%d ", (pint)profile2[i+1][gap_pos1]);
+ fprintf(stdout,"%d ", (pint)profile2[i+1][gap_pos2]);
+ fprintf(stdout,"%d %d\n",(pint)profile2[i+1][GAPCOL],(pint)profile2[i+1][LENCOL]);
+ }
+}
+
+ aln_path1 = (char *) ckalloc( (max_aln_length+1) * sizeof(char) );
+ aln_path2 = (char *) ckalloc( (max_aln_length+1) * sizeof(char) );
+
+
+/*
+ align the profiles
+*/
+/* use Myers and Miller to align two sequences */
+
+ last_print = 0;
+ print_ptr = 1;
+
+ sb1 = sb2 = 0;
+ se1 = prf_length1;
+ se2 = prf_length2;
+
+ HH = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
+ DD = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
+ RR = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
+ SS = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
+ gS = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
+ displ = (sint *) ckalloc( (max_aln_length+1) * sizeof (sint) );
+
+ score = pdiff(sb1, sb2, se1-sb1, se2-sb2, profile1[0][GAPCOL], profile1[prf_length1][GAPCOL]);
+
+ HH=ckfree((void *)HH);
+ DD=ckfree((void *)DD);
+ RR=ckfree((void *)RR);
+ SS=ckfree((void *)SS);
+ gS=ckfree((void *)gS);
+
+/* turn the edit script into the two column paths, then gap the rows */
+ ptracepath( &alignment_len);
+
+ displ=ckfree((void *)displ);
+
+ add_ggaps();
+
+ for (i=0;i<prf_length1+2;i++)
+ profile1[i]=ckfree((void *)profile1[i]);
+ profile1=ckfree((void *)profile1);
+
+ for (i=0;i<prf_length2+2;i++)
+ profile2[i]=ckfree((void *)profile2[i]);
+ profile2=ckfree((void *)profile2);
+
+ prf_length1 = alignment_len;
+
+ aln_path1=ckfree((void *)aln_path1);
+ aln_path2=ckfree((void *)aln_path2);
+
+/* copy the gapped rows back: group 1 rows first, then group 2 rows, in
+ the same order in which they were loaded into alignment[] */
+ NumSeq = 0;
+ for (j=0;j<nseqs;j++)
+ {
+ if (group[j+1] == 1)
+ {
+ seqlen_array[j+1] = prf_length1;
+ realloc_seq(j+1,prf_length1);
+ for (i=0;i<prf_length1;i++)
+ seq_array[j+1][i+1] = alignment[NumSeq][i];
+ NumSeq++;
+ }
+ }
+ for (j=0;j<nseqs;j++)
+ {
+ if (group[j+1] == 2)
+ {
+ seqlen_array[j+1] = prf_length1;
+ realloc_seq(j+1,prf_length1);
+ for (i=0;i<prf_length1;i++)
+ seq_array[j+1][i+1] = alignment[NumSeq][i];
+ NumSeq++;
+ }
+ }
+
+ for (i=0;i<nseqs1+nseqs2;i++)
+ alignment[i]=ckfree((void *)alignment[i]);
+ alignment=ckfree((void *)alignment);
+
+ aln_len=ckfree((void *)aln_len);
+ gaps=ckfree((void *)gaps);
+
+ return(score/100);
+}
+
+/*
+ * Rewrite alignment[row] with the gaps described by path[]: code 2 copies
+ * the next residue of the row (ENDALN once the row is exhausted), code 1
+ * inserts gap_pos1.  ta is a scratch buffer of alignment_len+1 chars.
+ * A failed realloc() is reported and the program terminates; previously
+ * the NULL result was stored and written through.
+ */
+static void add_ggaps_row(sint row, char *path, char *ta)
+{
+ sint i,ix;
+ char *grown;
+
+ ix = 0;
+ for (i=0;i<alignment_len;i++)
+ {
+ if (path[i] == 2)
+ {
+ if (ix < aln_len[row])
+ ta[i] = alignment[row][ix];
+ else
+ ta[i] = ENDALN;
+ ix++;
+ }
+ else if (path[i] == 1)
+ {
+/*
+ insertion in this alignment...
+*/
+ ta[i] = gap_pos1;
+ }
+ else
+ {
+ fprintf(stdout,"Error in aln_path\n");
+ }
+ }
+ ta[i] = ENDALN;
+
+ grown = (char *)realloc(alignment[row], (alignment_len+2) * sizeof (char));
+ if (grown == NULL)
+ {
+ fprintf(stdout,"Error: out of memory in add_ggaps\n");
+ exit(1);
+ }
+ alignment[row] = grown;
+ for (i=0;i<alignment_len;i++)
+ alignment[row][i] = ta[i];
+ alignment[row][alignment_len] = ENDALN;
+ aln_len[row] = alignment_len;
+}
+
+/*
+ * add_ggaps: apply aln_path1 to every row of profile 1 and aln_path2 to
+ * every row of profile 2, so that all rows end up alignment_len long, then
+ * stretch the structure masks the same way when they are in use.
+ * With debug > 0 the gapped rows are printed to stdout.
+ */
+static void add_ggaps(void)
+{
+ sint j;
+ sint i;
+ char *ta;
+
+ ta = (char *) ckalloc( (alignment_len+1) * sizeof (char) );
+
+ for (j=0;j<nseqs1;j++)
+ add_ggaps_row(j, aln_path1, ta);
+
+ for (j=nseqs1;j<nseqs1+nseqs2;j++)
+ add_ggaps_row(j, aln_path2, ta);
+
+ ta=ckfree((void *)ta);
+
+ if (struct_penalties1 != NONE)
+ gap_penalty_mask1 = add_ggaps_mask(gap_penalty_mask1,alignment_len,aln_path1,aln_path2);
+ if (struct_penalties1 == SECST)
+ sec_struct_mask1 = add_ggaps_mask(sec_struct_mask1,alignment_len,aln_path1,aln_path2);
+
+ if (struct_penalties2 != NONE)
+ gap_penalty_mask2 = add_ggaps_mask(gap_penalty_mask2,alignment_len,aln_path2,aln_path1);
+ if (struct_penalties2 == SECST)
+ sec_struct_mask2 = add_ggaps_mask(sec_struct_mask2,alignment_len,aln_path2,aln_path1);
+
+if (debug>0)
+{
+ char c;
+ extern char *amino_acid_codes;
+
+ for (i=0;i<nseqs1+nseqs2;i++)
+ {
+ for (j=0;j<alignment_len;j++)
+ {
+ if (alignment[i][j] == ENDALN) break;
+ else if ((alignment[i][j] == gap_pos1) || (alignment[i][j] == gap_pos2)) c = '-';
+ else c = amino_acid_codes[alignment[i][j]];
+ fprintf(stdout,"%c", c);
+ }
+ fprintf(stdout,"\n\n");
+ }
+}
+
+}
+
+/*
+ * add_ggaps_mask: stretch a per-column mask to len columns.
+ *
+ * mask  - heap buffer holding the current mask; it is reallocated here
+ * len   - number of columns in the new alignment
+ * path1 - column path used when switch_profiles is FALSE
+ * path2 - column path used otherwise
+ *
+ * In the chosen path, code 2 copies the next mask character and code 1
+ * writes gap_pos1.  Returns the reallocated mask, terminated with '\0'.
+ * A failed realloc() is reported and the program terminates; previously
+ * the NULL result overwrote mask and was written through.
+ */
+static char * add_ggaps_mask(char *mask, int len, char *path1, char *path2)
+{
+ int i,ix;
+ char *ta;
+ char *path;
+ char *grown;
+
+ ta = (char *) ckalloc( (len+1) * sizeof (char) );
+
+/* the two original loops differed only in which path they read */
+ path = (switch_profiles == FALSE) ? path1 : path2;
+
+ ix = 0;
+ for (i=0;i<len;i++)
+ {
+ if (path[i] == 2)
+ {
+ ta[i] = mask[ix];
+ ix++;
+ }
+ else if (path[i] == 1)
+ ta[i] = gap_pos1;
+ }
+
+ grown = (char *)realloc(mask,(len+2) * sizeof (char));
+ if (grown == NULL)
+ {
+ fprintf(stdout,"Error: out of memory in add_ggaps_mask\n");
+ exit(1);
+ }
+ mask = grown;
+ for (i=0;i<len;i++)
+ mask[i] = ta[i];
+ mask[i] ='\0';
+
+ ta=ckfree((void *)ta);
+
+ return(mask);
+}
+
+/*
+ * Score column n of profile1 against column m of profile2: the sum of the
+ * products of matching entries 0..max_aa plus the gap_pos1 and gap_pos2
+ * entries, divided by 10.
+ */
+static lint prfscore(sint n, sint m)
+{
+ sint *col1 = profile1[n];
+ sint *col2 = profile2[m];
+ lint total = 0;
+ sint res = 0;
+
+ while (res <= max_aa)
+ {
+  total += (col1[res] * col2[res]);
+  res++;
+ }
+ total += (col1[gap_pos1] * col2[gap_pos1]);
+ total += (col1[gap_pos2] * col2[gap_pos2]);
+
+ return(total/10);
+}
+
+/*
+ * Expand the edit script displ[1..print_ptr-1] into aln_path1/aln_path2.
+ * A 0 entry gives one column with code 2 in both paths; a positive entry k
+ * gives k columns with code 2 in aln_path2 and 1 in aln_path1; a negative
+ * entry -k gives k columns with code 2 in aln_path1 and 1 in aln_path2.
+ * The total number of columns is stored through alen.
+ */
+static void ptracepath(sint *alen)
+{
+ sint idx, n, run;
+ sint col = 0;
+ sint last = print_ptr-1;
+
+ for(idx=1;idx<=last;++idx) {
+if (debug>1) fprintf(stdout,"%d ",(pint)displ[idx]);
+  run = displ[idx];
+  if(run==0) {
+   aln_path1[col]=2;
+   aln_path2[col]=2;
+   ++col;
+   continue;
+  }
+  if(run>0) {
+   for(n=0;n<run;++n) {
+    aln_path2[col+n]=2;
+    aln_path1[col+n]=1;
+   }
+  }
+  else {
+   run = run * -1;
+   for(n=0;n<run;++n) {
+    aln_path1[col+n]=2;
+    aln_path2[col+n]=1;
+   }
+  }
+  col += run;
+ }
+if (debug>1) fprintf(stdout,"\n");
+
+ (*alen) = col;
+
+}
+
+/*
+ * Record a run of k columns taken from profile 1 only.  If the last value
+ * stored was negative, that entry is extended by k; otherwise a new entry
+ * holding -k is appended to displ[].
+ */
+static void pdel(sint k)
+{
+ if(last_print>=0) {
+  last_print = displ[print_ptr++] = -(k);
+  return;
+ }
+ displ[print_ptr-1] -= k;
+ last_print = displ[print_ptr-1];
+}
+
+/*
+ * Record a run of k columns taken from profile 2 only.  If the last value
+ * stored was negative, k is written into that slot and the negative value
+ * is moved to the next one (last_print is left as it was); otherwise k is
+ * appended and becomes last_print.
+ */
+static void padd(sint k)
+{
+ if(last_print>=0) {
+  last_print = displ[print_ptr++] = k;
+  return;
+ }
+
+ displ[print_ptr-1] = k;
+ displ[print_ptr] = last_print;
+ print_ptr++;
+}
+
+/*
+ * Record one aligned column: append a 0 entry to displ[].
+ */
+static void palign(void)
+{
+ last_print = 0;
+ displ[print_ptr] = 0;
+ print_ptr++;
+}
+
+/*
+ * pdiff: linear-space divide and conquer alignment of rows A+1..A+M of
+ * profile1 with rows B+1..B+N of profile2 (the caller's comment names the
+ * method: Myers and Miller).  The edit operations are recorded through
+ * padd(), pdel() and palign().
+ *
+ * A, B     - offsets of the two sub-profiles
+ * M, N     - lengths of the two sub-profiles
+ * go1, go2 - when 0, the open penalty at the start (go1) or the end (go2)
+ *            of the sub-problem is not charged; the recursion passes 0 on
+ *            either side of a type 2 split, where pdel(2) is recorded.
+ *
+ * Returns the score of the best alignment found for the sub-problem.
+ *
+ * The work variables of the inner block are static.  They are only used
+ * before the recursive calls; midi, midj, type and midh, which are needed
+ * afterwards, are automatic.
+ */
+static lint pdiff(sint A,sint B,sint M,sint N,sint go1, sint go2)
+{
+ sint midi,midj,type;
+ lint midh;
+
+ static lint t, tl, g, h;
+
+{ static sint i,j;
+ static lint hh, f, e, s;
+
+/* Boundary cases: M <= 1 or N == 0 */
+if (debug>2) fprintf(stdout,"A %d B %d M %d N %d midi %d go1 %d go2 %d\n",
+(pint)A,(pint)B,(pint)M,(pint)N,(pint)M/2,(pint)go1,(pint)go2);
+
+/* if sequence B is empty.... */
+
+ if(N<=0) {
+
+/* if sequence A is not empty.... */
+
+ if(M>0) {
+
+/* delete residues A[1] to A[M] */
+
+ pdel(M);
+ }
+ return(-gap_penalty1(A,B,M));
+ }
+
+/* if sequence A is empty.... */
+
+ if(M<=1) {
+ if(M<=0) {
+
+/* insert residues B[1] to B[N] */
+
+ padd(N);
+ return(-gap_penalty2(A,B,N));
+ }
+
+/* if sequence A has just one residue.... */
+
+/* midj==0 stands for "the single column of A is aligned with nothing" */
+ if (go1 == 0)
+ midh = -gap_penalty1(A+1,B+1,N);
+ else
+ midh = -gap_penalty2(A+1,B,1)-gap_penalty1(A+1,B+1,N);
+ midj = 0;
+/* try aligning the single column of A with each column j of B */
+ for(j=1;j<=N;j++) {
+ hh = -gap_penalty1(A,B+1,j-1) + prfscore(A+1,B+j)
+ -gap_penalty1(A+1,B+j+1,N-j);
+ if(hh>midh) {
+ midh = hh;
+ midj = j;
+ }
+ }
+
+ if(midj==0) {
+ padd(N);
+ pdel(1);
+ }
+ else {
+ if(midj>1) padd(midj-1);
+ palign();
+ if(midj<N) padd(N-midj);
+ }
+ return midh;
+ }
+
+
+/* Divide sequence A in half: midi */
+
+ midi = M / 2;
+
+/* In a forward phase, calculate all HH[j] and DD[j] */
+
+ HH[0] = 0.0;
+ t = -open_penalty1(A,B+1);
+ tl = -ext_penalty1(A,B+1);
+ for(j=1;j<=N;j++) {
+ HH[j] = t = t+tl;
+ DD[j] = t-open_penalty2(A+1,B+j);
+ }
+
+ if (go1 == 0) t = 0;
+ else t = -open_penalty2(A+1,B);
+ tl = -ext_penalty2(A+1,B);
+ for(i=1;i<=midi;i++) {
+ s = HH[0]; /* s carries the previous row's HH[j-1] */
+ HH[0] = hh = t = t+tl;
+ f = t-open_penalty1(A+i,B+1);
+
+ for(j=1;j<=N;j++) {
+ g = open_penalty1(A+i,B+j);
+ h = ext_penalty1(A+i,B+j);
+ if ((hh=hh-g-h) > (f=f-h)) f=hh;
+ g = open_penalty2(A+i,B+j);
+ h = ext_penalty2(A+i,B+j);
+ if ((hh=HH[j]-g-h) > (e=DD[j]-h)) e=hh;
+ hh = s + prfscore(A+i, B+j);
+ if (f>hh) hh = f;
+ if (e>hh) hh = e;
+
+ s = HH[j];
+ HH[j] = hh;
+ DD[j] = e;
+
+ }
+ }
+
+ DD[0]=HH[0];
+
+/* In a reverse phase, calculate all RR[j] and SS[j] */
+
+ RR[N]=0.0;
+ tl = 0.0;
+ for(j=N-1;j>=0;j--) {
+ g = -open_penalty1(A+M,B+j+1);
+ tl -= ext_penalty1(A+M,B+j+1);
+ RR[j] = g+tl;
+ SS[j] = RR[j]-open_penalty2(A+M,B+j);
+ gS[j] = open_penalty2(A+M,B+j);
+ }
+
+ tl = 0.0;
+ for(i=M-1;i>=midi;i--) {
+ s = RR[N];
+ if (go2 == 0) g = 0;
+ else g = -open_penalty2(A+i+1,B+N);
+ tl -= ext_penalty2(A+i+1,B+N);
+ RR[N] = hh = g+tl;
+ t = open_penalty1(A+i,B+N);
+ f = RR[N]-t;
+
+ for(j=N-1;j>=0;j--) {
+ g = open_penalty1(A+i,B+j+1);
+ h = ext_penalty1(A+i,B+j+1);
+ if ((hh=hh-g-h) > (f=f-h-g+t)) f=hh;
+ t = g;
+ g = open_penalty2(A+i+1,B+j);
+ h = ext_penalty2(A+i+1,B+j);
+ hh=RR[j]-g-h;
+ if (i==(M-1)) {
+ e=SS[j]-h;
+ }
+ else {
+ e=SS[j]-h-g+open_penalty2(A+i+2,B+j);
+ gS[j] = g; /* remember the open penalty added back in the type 2 test below */
+ }
+ if (hh > e) e=hh;
+ hh = s + prfscore(A+i+1, B+j+1);
+ if (f>hh) hh = f;
+ if (e>hh) hh = e;
+
+ s = RR[j];
+ RR[j] = hh;
+ SS[j] = e;
+
+ }
+ }
+ SS[N]=RR[N];
+ gS[N] = open_penalty2(A+midi+1,B+N);
+
+/* find midj, such that HH[j]+RR[j] or DD[j]+SS[j]+gap is the maximum */
+
+ midh=HH[0]+RR[0];
+ midj=0;
+ type=1;
+ for(j=0;j<=N;j++) {
+ hh = HH[j] + RR[j];
+ if(hh>=midh)
+ if(hh>midh || (HH[j]!=DD[j] && RR[j]==SS[j])) {
+ midh=hh;
+ midj=j;
+ }
+ }
+
+/* type 2: the DD[j]+SS[j] sum counts the open penalty twice, so gS[j] is added back once */
+ for(j=N;j>=0;j--) {
+ hh = DD[j] + SS[j] + gS[j];
+ if(hh>midh) {
+ midh=hh;
+ midj=j;
+ type=2;
+ }
+ }
+}
+
+/* Conquer recursively around midpoint */
+
+
+ if(type==1) { /* Type 1 gaps */
+if (debug>2) fprintf(stdout,"Type 1,1: midj %d\n",(pint)midj);
+ pdiff(A,B,midi,midj,go1,1);
+if (debug>2) fprintf(stdout,"Type 1,2: midj %d\n",(pint)midj);
+ pdiff(A+midi,B+midj,M-midi,N-midj,1,go2);
+ }
+ else { /* Type 2: rows midi and midi+1 of A are recorded as one deletion of length 2 */
+if (debug>2) fprintf(stdout,"Type 2,1: midj %d\n",(pint)midj);
+ pdiff(A,B,midi-1,midj,go1, 0);
+ pdel(2);
+if (debug>2) fprintf(stdout,"Type 2,2: midj %d\n",(pint)midj);
+ pdiff(A+midi+1,B+midj,M-midi-1,N-midj,0,go2);
+ }
+
+ return midh; /* Return the score of the best alignment */
+}
+
+/*
+ * Gap opening penalty at row i of profile1 and row j of profile2: the sum
+ * of the two rows' GAPCOL entries.  Returns 0 at either end of profile1
+ * (i==0 or i==prf_length1) unless endgappenalties is set.
+ */
+static sint open_penalty1(sint i, sint j)
+{
+ Boolean at_end = (i==0 || i==prf_length1);
+
+ if (at_end && !endgappenalties) return(0);
+
+ return(profile2[j][GAPCOL] + profile1[i][GAPCOL]);
+}
+
+/*
+ * Gap extension penalty at row i of profile1 and row j of profile2: the
+ * LENCOL entry of profile2 row j.  Returns 0 at either end of profile1
+ * (i==0 or i==prf_length1) unless endgappenalties is set.
+ */
+static sint ext_penalty1(sint i, sint j)
+{
+ Boolean at_end = (i==0 || i==prf_length1);
+
+ if (at_end && !endgappenalties) return(0);
+
+ return(profile2[j][LENCOL]);
+}
+
+/*
+ * Penalty for a gap of length k at row i of profile1 and row j of
+ * profile2: the GAPCOL entries of both rows plus the LENCOL entries of
+ * profile2 rows j..j+k-1, stopping before row prf_length2.
+ * Returns 0 when k <= 0, and at either end of profile1 unless
+ * endgappenalties is set.
+ */
+static sint gap_penalty1(sint i, sint j, sint k)
+{
+ sint total;
+ sint row;
+ sint left;
+
+ if (k <= 0) return(0);
+ if ((i==0 || i==prf_length1) && !endgappenalties) return(0);
+
+ total = profile2[j][GAPCOL] + profile1[i][GAPCOL];
+
+ row = j;
+ left = k;
+ while (left > 0 && row < prf_length2)
+ {
+  total += profile2[row][LENCOL];
+  row++;
+  left--;
+ }
+
+ return(total);
+}
+/*
+ * Gap opening penalty at row i of profile1 and row j of profile2: the sum
+ * of the two rows' GAPCOL entries.  Returns 0 at either end of profile2
+ * (j==0 or j==prf_length2) unless endgappenalties is set.
+ */
+static sint open_penalty2(sint i, sint j)
+{
+ Boolean at_end = (j==0 || j==prf_length2);
+
+ if (at_end && !endgappenalties) return(0);
+
+ return(profile1[i][GAPCOL] + profile2[j][GAPCOL]);
+}
+
+/*
+ * Gap extension penalty at row i of profile1 and row j of profile2: the
+ * LENCOL entry of profile1 row i.  Returns 0 at either end of profile2
+ * (j==0 or j==prf_length2) unless endgappenalties is set.
+ */
+static sint ext_penalty2(sint i, sint j)
+{
+ Boolean at_end = (j==0 || j==prf_length2);
+
+ if (at_end && !endgappenalties) return(0);
+
+ return(profile1[i][LENCOL]);
+}
+
+/*
+ * Penalty for a gap of length k at row i of profile1 and row j of
+ * profile2: the GAPCOL entries of both rows plus the LENCOL entries of
+ * profile1 rows i..i+k-1, stopping before row prf_length1.
+ * Returns 0 when k <= 0, and at either end of profile2 unless
+ * endgappenalties is set.
+ */
+static sint gap_penalty2(sint i, sint j, sint k)
+{
+ sint total;
+ sint row;
+ sint left;
+
+ if (k <= 0) return(0);
+ if ((j==0 || j==prf_length2) && !endgappenalties) return(0);
+
+ total = profile1[i][GAPCOL] + profile2[j][GAPCOL];
+
+ row = i;
+ left = k;
+ while (left > 0 && row < prf_length1)
+ {
+  total += profile1[row][LENCOL];
+  row++;
+  left--;
+ }
+
+ return(total);
+}
Added: trunk/packages/clustalw/branches/upstream/current/random.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/random.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/random.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,81 @@
+/*
+*
+* Rand.c
+*
+* - linear and additive congruential random number generators
+* (see R. Sedgewick, Algorithms, Chapter 35)
+*
+* Implementation: R. Fuchs, EMBL Data Library, 1991
+*
+*/
+#include <stdio.h>
+
+unsigned long linrand(unsigned long r);
+unsigned long addrand(unsigned long r);
+void addrandinit(unsigned long s);
+
+static unsigned long mult(unsigned long p,unsigned long q);
+
+
+#define m1 10000
+#define m 100000000
+
+static unsigned long mult(unsigned long p, unsigned long q);
+
+/* linear congruential method
+*
+* linrand() returns an unsigned long random number in the range 0 to r-1.
+* The generator state is kept in a function-local static, so successive
+* calls continue the same sequence (initial state 1234567).
+*/
+
+
+unsigned long linrand(unsigned long r)
+{
+    static unsigned long state=1234567;
+    unsigned long upper;
+
+    /* next state: (state * 31415821 + 1) mod m, with the product formed by mult() */
+    state = mult(state,31415821) + 1;
+    state %= m;
+
+    /* scale the upper digits of the state into 0..r-1 */
+    upper = state / m1;
+    return( (upper * r) / m1 );
+}
+
+/*
+ * mult: multiply p by q modulo m.
+ *
+ * Both operands are split into an upper and a lower part around m1 and only
+ * the partial products that contribute to the result modulo m are formed
+ * (the upper*upper term is dropped).
+ */
+static unsigned long mult(unsigned long p, unsigned long q)
+{
+    unsigned long p_hi = p / m1;
+    unsigned long p_lo = p % m1;
+    unsigned long q_hi = q / m1;
+    unsigned long q_lo = q % m1;
+    unsigned long cross;
+
+    cross = (p_lo*q_hi + p_hi*q_lo) % m1;
+    return((cross * m1 + p_lo*q_lo) % m);
+}
+
+
+/* additive congruential method
+*
+* addrand() returns an unsigned long random number in the range 0 to r-1
+* The random number generator is initialized by addrandinit()
+*/
+
+static unsigned long j;
+static unsigned long a[55];
+
+/*
+ * addrand: advance the additive generator by one step and return a value
+ * in the range 0 to r-1.  Uses the file-level table a[55] and cursor j.
+ */
+unsigned long addrand(unsigned long r)
+{
+    unsigned long first, second;
+
+    j = (j + 1) % 55;
+
+    /* the two table entries that are summed to form the new entry */
+    first = a[(j+23)%55];
+    second = a[(j+54)%55];
+
+    a[j] = (first + second) % m;
+    return( ((a[j] / m1) * r) / m1 );
+}
+
+/*
+ * addrandinit: seed the additive generator.
+ *
+ * a[0] is set to s and each later table entry is derived from the previous
+ * one as (31 * previous + 1) mod m.  On return the cursor j is 54.
+ */
+void addrandinit(unsigned long s)
+{
+    unsigned long slot;
+
+    a[0] = s;
+    for (slot = 1; slot < 55; slot++)
+        a[slot] = (mult(31,a[slot-1]) + 1) % m;
+
+    j = 54;
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/readmat.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/readmat.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/readmat.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,476 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "clustalw.h"
+#include "matrices.h"
+
+
+/*
+ * Prototypes
+ */
+static Boolean commentline(char *line);
+
+
+/*
+ * Global variables
+ */
+
+extern char *amino_acid_codes;
+extern sint gap_pos1, gap_pos2;
+extern sint max_aa;
+extern short def_dna_xref[],def_aa_xref[];
+extern sint mat_avscore;
+extern sint debug;
+extern Boolean dnaflag;
+
+extern Boolean user_series;
+extern UserMatSeries matseries;
+extern short usermatseries[MAXMAT][NUMRES][NUMRES];
+extern short aa_xrefseries[MAXMAT][NUMRES+1];
+
+
+/*
+ * init_matrix: set the residue-code globals (max_aa, gap_pos1, gap_pos2) and
+ * build the cross-reference tables for the default matrices in matrices.h.
+ *
+ * For each position in amino_acid_order / nucleic_acid_order the matching
+ * index in amino_acid_codes is stored in def_aa_xref / def_dna_xref; entries
+ * with no match stay -1 and, unless the character is '*', are reported
+ * through error().
+ */
+void init_matrix(void)
+{
+    short row, col, maxres;
+    char wanted;
+
+    max_aa = strlen(amino_acid_codes)-2;
+    gap_pos1 = NUMRES-2;          /* code for gaps inserted by clustalw */
+    gap_pos2 = NUMRES-1;          /* code for gaps already in alignment */
+
+    /* start with every cross-reference entry marked as "no match" */
+    for (row = 0; row < NUMRES; row++)
+    {
+        def_aa_xref[row] = -1;
+        def_dna_xref[row] = -1;
+    }
+
+    /* protein residues */
+    maxres = 0;
+    for (row = 0; (wanted = amino_acid_order[row]); row++)
+    {
+        for (col = 0; amino_acid_codes[col]; col++)
+        {
+            if (amino_acid_codes[col] != wanted)
+                continue;
+            def_aa_xref[row] = col;
+            maxres++;
+            break;
+        }
+        if (def_aa_xref[row] == -1 && amino_acid_order[row] != '*')
+            error("residue %c in matrices.h is not recognised",
+                  amino_acid_order[row]);
+    }
+
+    /* nucleic acid residues */
+    maxres = 0;
+    for (row = 0; (wanted = nucleic_acid_order[row]); row++)
+    {
+        for (col = 0; amino_acid_codes[col]; col++)
+        {
+            if (amino_acid_codes[col] != wanted)
+                continue;
+            def_dna_xref[row] = col;
+            maxres++;
+            break;
+        }
+        if (def_dna_xref[row] == -1 && nucleic_acid_order[row] != '*')
+            error("nucleic acid %c in matrices.h is not recognised",
+                  nucleic_acid_order[row]);
+    }
+}
+
+/*
+ * get_matrix: expand a packed score table into a full comparison matrix.
+ *
+ * matptr    packed scores, read sequentially in lower-triangular order
+ * xref      maps table position -> residue code, -1 for "not used"
+ * matrix    output; entries 0..max_aa are cleared first, then filled
+ *           symmetrically with score * scale
+ * neg_flag  if FALSE and the minimum score is negative, the minimum is
+ *           subtracted from every cross-referenced entry
+ * scale     multiplier applied to every score read from matptr
+ *
+ * Side effects: sets the global mat_avscore to minus the average
+ * off-diagonal score, and fills the gap rows/columns (gap_pos1, gap_pos2).
+ *
+ * Returns the number of diagonal entries filled, plus one.
+ *
+ * The averages are only divided when the divisor is non-zero; the original
+ * code divided unconditionally and so divided by zero when fewer than three
+ * residues were cross-referenced.
+ */
+sint get_matrix(short *matptr, short *xref, sint matrix[NUMRES][NUMRES], Boolean neg_flag, sint scale)
+{
+    sint gg_score = 0;
+    sint gr_score = 0;
+    sint i, j, k, ix = 0;
+    sint ti, tj;
+    sint maxres;
+    sint av1,av2,av3,min, max;
+/*
+   default - set all scores to 0
+*/
+    for (i=0;i<=max_aa;i++)
+        for (j=0;j<=max_aa;j++)
+            matrix[i][j] = 0;
+
+    ix = 0;
+    maxres = 0;
+    for (i=0;i<=max_aa;i++)
+    {
+        ti = xref[i];
+        for (j=0;j<=i;j++)
+        {
+            tj = xref[j];
+            if ((ti != -1) && (tj != -1))
+            {
+                k = matptr[ix];
+                if (ti==tj)
+                {
+                    matrix[ti][ti] = k * scale;
+                    maxres++;
+                }
+                else
+                {
+                    matrix[ti][tj] = k * scale;
+                    matrix[tj][ti] = k * scale;
+                }
+                ix++;
+            }
+        }
+    }
+
+    --maxres;
+
+    av1 = av2 = av3 = 0;
+    for (i=0;i<=max_aa;i++)
+    {
+        for (j=0;j<=i;j++)
+        {
+            av1 += matrix[i][j];
+            if (i==j)
+            {
+                av2 += matrix[i][j];
+            }
+            else
+            {
+                av3 += matrix[i][j];
+            }
+        }
+    }
+
+/*
+   each divisor can be zero for very small residue sets; in that case the
+   accumulated sum is left as it is instead of dividing by zero
+*/
+    if ((maxres*maxres)/2 != 0)
+        av1 /= (maxres*maxres)/2;
+    if (maxres != 0)
+        av2 /= maxres;
+    if ((maxres*maxres-maxres) != 0)
+        av3 /= ((float)(maxres*maxres-maxres))/2;
+    mat_avscore = -av3;
+
+/*
+   NOTE(review): the inner loop starts at j=1, so column 0 (apart from
+   matrix[0][0]) never takes part in the min/max search - confirm whether
+   that is intended before changing it, as it affects the positive shift.
+*/
+    min = max = matrix[0][0];
+    for (i=0;i<=max_aa;i++)
+        for (j=1;j<=i;j++)
+        {
+            if (matrix[i][j] < min) min = matrix[i][j];
+            if (matrix[i][j] > max) max = matrix[i][j];
+        }
+if (debug>1) fprintf(stdout,"maxres %d\n",(pint)max_aa);
+if (debug>1) fprintf(stdout,"average mismatch score %d\n",(pint)av3);
+if (debug>1) fprintf(stdout,"average match score %d\n",(pint)av2);
+if (debug>1) fprintf(stdout,"average score %d\n",(pint)av1);
+
+/*
+   if requested, make a positive matrix - add -(lowest score) to every entry
+*/
+    if (neg_flag == FALSE)
+    {
+
+if (debug>1) fprintf(stdout,"min %d max %d\n",(pint)min,(pint)max);
+        if (min < 0)
+        {
+            for (i=0;i<=max_aa;i++)
+            {
+                ti = xref[i];
+                if (ti != -1)
+                {
+                    for (j=0;j<=max_aa;j++)
+                    {
+                        tj = xref[j];
+                        if (tj != -1) matrix[ti][tj] -= min;
+                    }
+                }
+            }
+        }
+    }
+
+/*
+   gap versus residue and gap versus gap scores (both currently 0)
+*/
+    for (i=0;i<gap_pos1;i++)
+    {
+        matrix[i][gap_pos1] = gr_score;
+        matrix[gap_pos1][i] = gr_score;
+        matrix[i][gap_pos2] = gr_score;
+        matrix[gap_pos2][i] = gr_score;
+    }
+    matrix[gap_pos1][gap_pos1] = gg_score;
+    matrix[gap_pos2][gap_pos2] = gg_score;
+    matrix[gap_pos2][gap_pos1] = gg_score;
+    matrix[gap_pos1][gap_pos2] = gg_score;
+
+    maxres += 2;
+
+    return(maxres);
+}
+
+
+/*
+ * read_matrix_series: read either a single user matrix or a CLUSTAL_SERIES
+ * file listing several matrices with the identity range each applies to.
+ *
+ * filename  file to read
+ * usermat   destination for the scores when the file is a single matrix
+ * xref      destination for the cross-reference when it is a single matrix
+ *
+ * When the first non-comment line is a CLUSTAL_SERIES line, every following
+ * "MATRIX <lower> <upper> <file>" line is loaded into the global
+ * usermatseries/aa_xrefseries tables and recorded in matseries; the global
+ * user_series flag is set accordingly.
+ *
+ * Returns the value returned by read_user_matrix() for the (last) matrix
+ * read, or 0 on any error (also when a series contains no MATRIX line).
+ */
+sint read_matrix_series(char *filename, short *usermat, short *xref)
+{
+    FILE *fd = NULL;
+    char mat_filename[FILENAMELEN];
+    char name_buf[1024];
+    char inline1[1024];
+    sint maxres = 0;
+    sint nmat;
+    sint n = 0;          /* stays 0 if the series holds no MATRIX line */
+    sint llimit,ulimit;
+
+    if (filename[0] == '\0')
+    {
+        error("comparison matrix not specified");
+        return((sint)0);
+    }
+    if ((fd=fopen(filename,"r"))==NULL)
+    {
+        error("cannot open %s", filename);
+        return((sint)0);
+    }
+
+/* check the first line to see if it's a series or a single matrix */
+    user_series=FALSE;   /* an empty file must not inherit an earlier setting */
+    while (fgets(inline1,1024,fd) != NULL)
+    {
+        if (commentline(inline1)) continue;
+        if(linetype(inline1,"CLUSTAL_SERIES"))
+            user_series=TRUE;
+        else
+            user_series=FALSE;
+        break;
+    }
+
+/* it's a single matrix */
+    if(user_series == FALSE)
+    {
+        fclose(fd);
+        maxres=read_user_matrix(filename,usermat,xref);
+        return(maxres);
+    }
+
+/* it's a series of matrices, find the next MATRIX line */
+    nmat=0;
+    matseries.nmat=0;
+    while (fgets(inline1,1024,fd) != NULL)
+    {
+        if (commentline(inline1)) continue;
+        if(linetype(inline1,"MATRIX"))
+        {
+/* the series tables only have room for MAXMAT matrices */
+            if(nmat >= MAXMAT)
+            {
+                error("too many matrices in file %s (maximum is %d)\n",filename,(pint)MAXMAT);
+                fclose(fd);
+                return((sint)0);
+            }
+/* read the name into a line-sized buffer first so it cannot overrun mat_filename */
+            if(sscanf(inline1+6,"%d %d %s",&llimit,&ulimit,name_buf)!=3)
+            {
+                error("Bad format in file %s\n",filename);
+                fclose(fd);
+                return((sint)0);
+            }
+            if(strlen(name_buf) >= FILENAMELEN)
+            {
+                error("Bad format in file %s\n",filename);
+                fclose(fd);
+                return((sint)0);
+            }
+            strcpy(mat_filename,name_buf);
+            if(llimit<0 || llimit > 100 || ulimit <0 || ulimit>100)
+            {
+                error("Bad format in file %s\n",filename);
+                fclose(fd);
+                return((sint)0);
+            }
+            if(ulimit<=llimit)
+            {
+                error("in file %s: lower limit is greater than upper (%d-%d)\n",filename,llimit,ulimit);
+                fclose(fd);
+                return((sint)0);
+            }
+            n=read_user_matrix(mat_filename,&usermatseries[nmat][0][0],&aa_xrefseries[nmat][0]);
+            if(n<=0)
+            {
+                error("Bad format in matrix file %s\n",mat_filename);
+                fclose(fd);
+                return((sint)0);
+            }
+            matseries.mat[nmat].llimit=llimit;
+            matseries.mat[nmat].ulimit=ulimit;
+            matseries.mat[nmat].matptr=&usermatseries[nmat][0][0];
+            matseries.mat[nmat].aa_xref=&aa_xrefseries[nmat][0];
+            nmat++;
+        }
+    }
+    fclose(fd);
+    matseries.nmat=nmat;
+
+    maxres=n;
+    return(maxres);
+
+}
+
+/*
+ * read_user_matrix: read one comparison matrix from a text file.
+ *
+ * filename  file to read
+ * usermat   receives the scores of all cross-referenced residue pairs, row
+ *           by row in lower-triangular order
+ * xref      receives, for each column of the file, the index of that residue
+ *           in amino_acid_codes, or -1 if it is not recognised
+ *
+ * File layout as parsed here: comment/blank lines, then one line whose
+ * alphabetic characters are the residue codes, then one row of scores per
+ * line (optionally starting with a residue letter).  Exactly one more row
+ * than there are alphabetic codes is expected.  Scores in a row whose first
+ * value contains a '.' are multiplied by 10 before being stored.
+ *
+ * Returns the number of recognised residues plus 2, or 0 on error.
+ *
+ * Compared with the original code, the header line can no longer overrun
+ * codes[], rows with too few values are rejected instead of dereferencing a
+ * NULL argument, and surplus rows are rejected as soon as they are seen so
+ * that they cannot overrun usermat.
+ */
+sint read_user_matrix(char *filename, short *usermat, short *xref)
+{
+    double f;
+    FILE *fd;
+    sint numargs,farg;
+    sint i, j, k = 0;
+    char codes[NUMRES];
+    char inline1[1024];
+    char *args[NUMRES+4];
+    char c1,c2;
+    sint ix1, ix = 0;
+    sint maxres = 0;
+    float scale;
+
+    if (filename[0] == '\0')
+    {
+        error("comparison matrix not specified");
+        return((sint)0);
+    }
+
+    if ((fd=fopen(filename,"r"))==NULL)
+    {
+        error("cannot open %s", filename);
+        return((sint)0);
+    }
+    maxres = 0;
+    while (fgets(inline1,1024,fd) != NULL)
+    {
+        if (commentline(inline1)) continue;
+        if(linetype(inline1,"CLUSTAL_SERIES"))
+        {
+            error("in %s - single matrix expected.", filename);
+            fclose(fd);
+            return((sint)0);
+        }
+/*
+   read residue characters.
+*/
+        k = 0;
+        for (j=0;inline1[j] != '\0';j++)
+        {
+            if (!isalpha((unsigned char)inline1[j])) continue;
+/* check before storing, and keep one slot free for the terminator */
+            if (k >= NUMRES-1)
+            {
+                error("too many entries in matrix %s",filename);
+                fclose(fd);
+                return((sint)0);
+            }
+            codes[k++] = inline1[j];
+        }
+        codes[k] = '\0';
+        break;
+    }
+
+    if (k == 0)
+    {
+        error("wrong format in matrix %s",filename);
+        fclose(fd);
+        return((sint)0);
+    }
+
+/*
+   cross-reference the residues
+*/
+    for (i=0;i<NUMRES;i++) xref[i] = -1;
+
+    maxres = 0;
+    for (i=0;(c1=codes[i]);i++)
+    {
+        for (j=0;(c2=amino_acid_codes[j]);j++)
+            if (c1 == c2)
+            {
+                xref[i] = j;
+                maxres++;
+                break;
+            }
+        if ((xref[i] == -1) && (codes[i] != '*'))
+        {
+            warning("residue %c in matrix %s not recognised",
+                    codes[i],filename);
+        }
+    }
+
+
+/*
+   get the weights
+*/
+
+    ix = ix1 = 0;
+    while (fgets(inline1,1024,fd) != NULL)
+    {
+        if (inline1[0] == '\n') continue;
+        if (inline1[0] == '#' ||
+            inline1[0] == '!') break;
+/* only k+1 rows are valid; refuse further rows before storing anything */
+        if (ix > k)
+        {
+            error("wrong format in matrix %s",filename);
+            fclose(fd);
+            return((sint)0);
+        }
+        numargs = getargs(inline1, args, (int)(k+1));
+        if (numargs < 1 || numargs < maxres)
+        {
+            error("wrong format in matrix %s",filename);
+            fclose(fd);
+            return((sint)0);
+        }
+        if (isalpha((unsigned char)args[0][0])) farg=1;
+        else farg=0;
+/* a row holding nothing but its residue label has no value to inspect */
+        if (farg >= numargs)
+        {
+            error("wrong format in matrix %s",filename);
+            fclose(fd);
+            return((sint)0);
+        }
+
+/* decide whether the matrix values are float or decimal */
+        scale=1.0;
+        for(i=0;args[farg][i] != '\0';i++)
+            if(args[farg][i]=='.')
+            {
+/* we've found a float value */
+                scale=10.0;
+                break;
+            }
+
+        for (i=0;i<=ix;i++)
+        {
+            if (xref[i] != -1)
+            {
+/* the value for a recognised residue must actually be present on the row */
+                if (i+farg >= numargs)
+                {
+                    error("wrong format in matrix %s",filename);
+                    fclose(fd);
+                    return((sint)0);
+                }
+                f = atof(args[i+farg]);
+                usermat[ix1++] = (short)(f*scale);
+            }
+        }
+        ix++;
+    }
+    if (ix != k+1)
+    {
+        error("wrong format in matrix %s",filename);
+        fclose(fd);
+        return((sint)0);
+    }
+
+
+    maxres += 2;
+    fclose(fd);
+
+    return(maxres);
+}
+
+/*
+ * getargs: split inline1 in place into tokens separated by blanks, tabs or
+ * newlines, storing a pointer to each token in args[].
+ *
+ * At most max+1 tokens are stored (args[0]..args[max]).  When the line runs
+ * out first, the slot after the last token is set to NULL.  Returns the
+ * number of tokens stored.  The input string is modified by strtok().
+ */
+int getargs(char *inline1,char *args[],int max)
+{
+    char *cursor = inline1;
+    int count = 0;
+
+    while (count <= max)
+    {
+        char *tok = strtok(cursor," \t\n");
+
+        args[count] = tok;
+        if (tok == NULL)
+            return(count);
+
+        cursor = NULL;      /* later calls continue on the same string */
+        count++;
+    }
+
+    return(count);
+}
+
+
+/*
+ * commentline: TRUE if the line starts with '#' or holds nothing but white
+ * space up to its newline / end of string, FALSE otherwise.
+ */
+static Boolean commentline(char *line)
+{
+    char *p;
+
+    if (*line == '#')
+        return TRUE;
+
+    for (p = line; *p != '\n' && *p != EOS; p++)
+    {
+        if (!isspace(*p))
+            return FALSE;
+    }
+    return TRUE;
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/sequence.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/sequence.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/sequence.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1379 @@
+/********* Sequence input routines for CLUSTAL W *******************/
+/* DES was here. FEB. 1994 */
+/* Now reads PILEUP/MSF and CLUSTAL alignment files */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "clustalw.h"
+
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+
+
+/*
+* Prototypes
+*/
+
+static char * get_seq(char *,sint *,char *);
+static char * get_clustal_seq(char *,sint *,char *,sint);
+static char * get_msf_seq(char *,sint *,char *,sint);
+static void check_infile(sint *);
+static void p_encode(char *, char *, sint);
+static void n_encode(char *, char *, sint);
+static sint res_index(char *,char);
+static Boolean check_dnaflag(char *, sint);
+static sint count_clustal_seqs(void);
+static sint count_pir_seqs(void);
+static sint count_msf_seqs(void);
+static sint count_rsf_seqs(void);
+static void get_swiss_feature(char *line,sint len);
+static void get_rsf_feature(char *line,sint len);
+static void get_swiss_mask(char *line,sint len);
+static void get_clustal_ss(sint length);
+static void get_embl_ss(sint length);
+static void get_rsf_ss(sint length);
+static void get_gde_ss(sint length);
+static Boolean cl_blankline(char *line);
+
+/*
+ * Global variables
+ */
+extern sint max_names;
+FILE *fin;
+extern Boolean usemenu, dnaflag, explicit_dnaflag;
+extern Boolean interactive;
+extern char seqname[];
+extern sint nseqs;
+extern sint *seqlen_array;
+extern sint *output_index;
+extern char **names,**titles;
+extern char **seq_array;
+extern Boolean profile1_empty, profile2_empty;
+extern sint gap_pos2;
+extern sint max_aln_length;
+extern char *gap_penalty_mask, *sec_struct_mask;
+extern sint struct_penalties;
+extern char *ss_name;
+extern sint profile_no;
+extern sint debug;
+
+char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */
+static sint seqFormat;
+static char chartab[128];
+static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR",
+ "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER","PHYLIP","NEXUS"};
+
+/*
+ * fill_chartab: create the translation and check table.
+ *
+ * Every entry of chartab is cleared, then for each character in
+ * amino_acid_codes both the character itself and its lower-case form are
+ * mapped to that character.
+ */
+void fill_chartab(void)
+{
+    sint pos;
+    char code;
+
+    for (pos = 0; pos < 128; pos++)
+        chartab[pos] = 0;
+
+    for (pos = 0; (code = amino_acid_codes[pos]); pos++)
+    {
+        chartab[tolower(code)] = code;
+        chartab[(int)code] = code;
+    }
+}
+
+/*
+ * get_msf_seq: read the seqno-th sequence from a PILEUP/MSF multiple
+ * alignment file (global stream fin), rewinding the file first.
+ *
+ * sname  receives the sequence name (at most MAXNAMES characters)
+ * len    receives the number of residues stored
+ * tit    not used by this function
+ * seqno  1-based index of the sequence inside each alignment block
+ *
+ * Returns a buffer obtained from ckalloc/ckrealloc with the residues stored
+ * in seq[1..*len], or NULL if the "//" header terminator is never found.
+ * '.' and '~' are read as '-', '*' as 'X'; characters not in chartab are
+ * dropped.
+ *
+ * Compared with the original code, bytes >= 128 no longer index past the
+ * end of chartab, a short block at end of file is no longer parsed from a
+ * stale line, and the scan for the name never runs past the terminator.
+ */
+static char * get_msf_seq(char *sname,sint *len,char *tit,sint seqno)
+{
+    static char line[MAXLINE+1];
+    char *seq = NULL;
+    sint i,j,k;
+    sint linelen;
+    unsigned char c;
+
+    fseek(fin,0,0);             /* start at the beginning */
+
+    *len=0;                     /* initialise length to zero */
+    for(i=0;;i++) {
+        if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/
+        if(linetype(line,"//") ) break;                 /* lines...ignore*/
+    }
+
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        if(!blankline(line)) {
+
+            /* step forward to the line of the wanted sequence */
+            for(i=1;i<seqno;i++)
+                if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
+
+            /* name = first run of non-blank characters on the line */
+            linelen=strlen(line);
+            for(j=0;j<linelen;j++) if(line[j] != ' ') break;
+            for(k=j;k<linelen;k++) if(line[k] == ' ') break;
+            strncpy(sname,line+j,MIN(MAXNAMES,k-j));
+            sname[MIN(MAXNAMES,k-j)]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+            if(seq==NULL)
+                seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+            else
+                seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+            for(i=k;i<=MAXLINE;i++) {
+                c=line[i];
+                if(c == '.' || c == '~' ) c = '-';
+                if(c == '*') c = 'X';
+                if(c == '\n' || c == EOS) break; /* EOL */
+                c=(c < 128) ? chartab[c] : 0;    /* chartab has 128 entries */
+                if(c) seq[++(*len)]=c;
+            }
+
+            /* skip the remaining sequences of this block */
+            for(i=0;;i++) {
+                if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
+                if(blankline(line)) break;
+            }
+        }
+    }
+    return seq;
+}
+
+/*
+ * cl_blankline: TRUE if the line starts with '!' or, up to its newline / end
+ * of string, contains only digits, white space, '*', ':' and '.'; FALSE as
+ * soon as any other character is seen.
+ */
+static Boolean cl_blankline(char *line)
+{
+    char *p;
+
+    if (*line == '!')
+        return TRUE;
+
+    for (p = line; *p != '\n' && *p != EOS; p++) {
+        if (isdigit(*p) || isspace(*p))
+            continue;
+        if (*p == '*' || *p == ':' || *p == '.')
+            continue;
+        return FALSE;
+    }
+    return TRUE;
+}
+
+/*
+ * get_clustal_seq: read the seqno-th sequence from a clustal multiple
+ * alignment file (global stream fin), rewinding the file first.
+ *
+ * sname  receives the sequence name (at most MAXNAMES characters)
+ * len    receives the number of residues stored
+ * tit    not used by this function
+ * seqno  1-based index of the sequence inside each alignment block
+ *
+ * Returns a buffer obtained from ckalloc/ckrealloc with the residues stored
+ * in seq[1..*len], or NULL if no sequence line was found.
+ *
+ * Compared with the original code, the name is scanned into a line-sized
+ * buffer and then truncated (an over-long name used to overrun sname), a
+ * line without a sequence column no longer reuses the previous line's data,
+ * bytes >= 128 no longer index past the end of chartab, and a short block
+ * at end of file is no longer parsed from a stale line.
+ */
+static char * get_clustal_seq(char *sname,sint *len,char *tit,sint seqno)
+{
+    static char line[MAXLINE+1];
+    static char tseq[MAXLINE+1];
+    static char tname[MAXLINE+1];
+    char *seq = NULL;
+    sint i;
+    unsigned char c;
+
+    fseek(fin,0,0);             /* start at the beginning */
+
+    *len=0;                     /* initialise length to zero */
+    if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title line...ignore it */
+
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        if(!cl_blankline(line)) {
+
+            /* step forward to the line of the wanted sequence */
+            for(i=1;i<seqno;i++)
+                if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
+
+            /* a token can never be longer than the line it came from */
+            tname[0]=EOS;
+            tseq[0]=EOS;
+            sscanf(line,"%s%s",tname,tseq);
+            strncpy(sname,tname,MAXNAMES);
+            sname[MAXNAMES]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+            if(seq==NULL)
+                seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+            else
+                seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+            for(i=0;i<=MAXLINE;i++) {
+                c=tseq[i];
+                if(isspace(c) || c == EOS) break; /* EOL */
+                c=(c < 128) ? chartab[c] : 0;     /* chartab has 128 entries */
+                if(c) seq[++(*len)]=c;
+            }
+
+            /* skip the remaining sequences of this block */
+            for(i=0;;i++) {
+                if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
+                if(cl_blankline(line)) break;
+            }
+        }
+    }
+
+    return seq;
+}
+
+/*
+ * split_ss_line: split the text after an "!SS"/"!GM" tag into its name and
+ * data columns.  The name is truncated to MAXNAMES characters; both outputs
+ * are empty strings when the corresponding column is missing.
+ */
+static void split_ss_line(char *text, char *name, char *data)
+{
+    static char token[MAXLINE+1];
+
+    token[0] = EOS;
+    data[0] = EOS;
+    /* text comes from a MAXLINE+1 buffer, so neither token can overflow */
+    sscanf(text,"%s%s",token,data);
+    strncpy(name,token,MAXNAMES);
+    name[MAXNAMES] = EOS;
+}
+
+/*
+ * get_clustal_ss: read the structure data from a clustal multiple alignment
+ * file (global stream fin), rewinding the file first.
+ *
+ * length  number of mask positions to fill
+ *
+ * The first "!SS" (secondary structure) or "!GM" (gap penalty mask) line
+ * that is accepted - always in batch mode, after a prompt when interactive -
+ * selects what is read: struct_penalties is set to SECST or GMASK, ss_name
+ * to the entry name, and sec_struct_mask / gap_penalty_mask are filled from
+ * the matching line of every alignment block until length characters have
+ * been stored or the file ends.  On a malformed later block an error is
+ * reported and struct_penalties is reset to NONE.
+ *
+ * Compared with the original code, entry names longer than MAXNAMES are
+ * truncated instead of overrunning the name buffer, and a tag line without
+ * a data column no longer reuses the data of an earlier line.
+ */
+static void get_clustal_ss(sint length)
+{
+    static char title[MAXLINE+1];
+    static char line[MAXLINE+1];
+    static char lin2[MAXLINE+1];
+    static char tseq[MAXLINE+1];
+    static char sname[MAXNAMES+1];
+    sint i,j,len,ix,struct_index=0;
+    char c;
+
+
+    fseek(fin,0,0);             /* start at the beginning */
+
+    len=0;                      /* initialise length to zero */
+    if (fgets(line,MAXLINE+1,fin) == NULL) return;  /* read the title line...ignore it */
+
+    if (fgets(line,MAXLINE+1,fin) == NULL) return;  /* read the next line... */
+/* skip any blank lines */
+    for (;;) {
+        if(fgets(line,MAXLINE+1,fin)==NULL) return;
+        if(!blankline(line)) break;
+    }
+
+/* look for structure table lines */
+    ix = -1;
+    for(;;) {
+        if(line[0] != '!') break;
+        if(strncmp(line,"!SS",3) == 0) {
+            ix++;
+            split_ss_line(line+4,sname,tseq);
+            for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break;
+            sname[j]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+            if (interactive) {
+                strcpy(title,"Found secondary structure in alignment file: ");
+                strcat(title,sname);
+                (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+            }
+            else (*lin2) = 'y';
+            if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                struct_penalties = SECST;
+                struct_index = ix;
+                for (i=0;i<length;i++)
+                {
+                    sec_struct_mask[i] = '.';
+                    gap_penalty_mask[i] = '.';
+                }
+                strcpy(ss_name,sname);
+                for(i=0;len < length;i++) {
+                    c = tseq[i];
+                    if(c == '\n' || c == EOS) break; /* EOL */
+                    if (!isspace(c)) sec_struct_mask[len++] = c;
+                }
+            }
+        }
+        else if(strncmp(line,"!GM",3) == 0) {
+            ix++;
+            split_ss_line(line+4,sname,tseq);
+            for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break;
+            sname[j]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+            if (interactive) {
+                strcpy(title,"Found gap penalty mask in alignment file: ");
+                strcat(title,sname);
+                (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+            }
+            else (*lin2) = 'y';
+            if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                struct_penalties = GMASK;
+                struct_index = ix;
+                for (i=0;i<length;i++)
+                    gap_penalty_mask[i] = '1';
+                strcpy(ss_name,sname);
+                for(i=0;len < length;i++) {
+                    c = tseq[i];
+                    if(c == '\n' || c == EOS) break; /* EOL */
+                    if (!isspace(c)) gap_penalty_mask[len++] = c;
+                }
+            }
+        }
+        if (struct_penalties != NONE) break;
+        if(fgets(line,MAXLINE+1,fin)==NULL) return;
+    }
+
+    if (struct_penalties == NONE) return;
+
+/* skip any more comment lines */
+    while (line[0] == '!') {
+        if(fgets(line,MAXLINE+1,fin)==NULL) return;
+    }
+
+/* skip the sequence lines and any comments after the alignment */
+    for (;;) {
+        if(isspace(line[0])) break;
+        if(fgets(line,MAXLINE+1,fin)==NULL) return;
+    }
+
+
+/* read the rest of the alignment */
+
+    for (;;) {
+/* skip any blank lines */
+        for (;;) {
+            if(!blankline(line)) break;
+            if(fgets(line,MAXLINE+1,fin)==NULL) return;
+        }
+/* get structure table line: step over the tag lines before the chosen one */
+        for(ix=0;ix<struct_index;ix++) {
+            if (line[0] != '!') {
+                if(struct_penalties == SECST)
+                    error("bad secondary structure format");
+                else
+                    error("bad gap penalty mask format");
+                struct_penalties = NONE;
+                return;
+            }
+            if(fgets(line,MAXLINE+1,fin)==NULL) return;
+        }
+        if(struct_penalties == SECST) {
+            if (strncmp(line,"!SS",3) != 0) {
+                error("bad secondary structure format");
+                struct_penalties = NONE;
+                return;
+            }
+            split_ss_line(line+4,sname,tseq);
+            for(i=0;len < length;i++) {
+                c = tseq[i];
+                if(c == '\n' || c == EOS) break; /* EOL */
+                if (!isspace(c)) sec_struct_mask[len++] = c;
+            }
+        }
+        else if (struct_penalties == GMASK) {
+            if (strncmp(line,"!GM",3) != 0) {
+                error("bad gap penalty mask format");
+                struct_penalties = NONE;
+                return;
+            }
+            split_ss_line(line+4,sname,tseq);
+            for(i=0;len < length;i++) {
+                c = tseq[i];
+                if(c == '\n' || c == EOS) break; /* EOL */
+                if (!isspace(c)) gap_penalty_mask[len++] = c;
+            }
+        }
+
+/* skip any more comment lines */
+        while (line[0] == '!') {
+            if(fgets(line,MAXLINE+1,fin)==NULL) return;
+        }
+
+/* skip the sequence lines */
+        for (;;) {
+            if(isspace(line[0])) break;
+            if(fgets(line,MAXLINE+1,fin)==NULL) return;
+        }
+    }
+}
+
+/*
+ * get_embl_ss: scan an EMBL/Swiss-Prot file (global stream fin) for
+ * secondary structure or gap penalty mask information.
+ *
+ * length  number of mask positions available
+ *
+ * For each entry (started by an "ID" line) the lines up to "SQ" are
+ * searched for "FT" HELIX/STRAND features or "GM" lines.  The first set
+ * that is accepted - always in batch mode, after a prompt when
+ * interactive - sets struct_penalties to SECST or GMASK, fills the
+ * corresponding mask through get_swiss_feature()/get_swiss_mask() and
+ * copies the entry name to ss_name.  The function returns when the end of
+ * the file is reached.
+ *
+ * Compared with the original code, the loops that consume consecutive
+ * "FT"/"GM" lines stop at end of file (they used to spin forever on the
+ * last line), the entry name is terminated before its length is taken, and
+ * a short "ID" line no longer makes the name start past the terminator.
+ */
+static void get_embl_ss(sint length)
+{
+    static char title[MAXLINE+1];
+    static char line[MAXLINE+1];
+    static char lin2[MAXLINE+1];
+    static char sname[MAXNAMES+1];
+    char feature[MAXLINE+1];
+    sint i;
+
+/* find the start of the sequence entry */
+    for (;;) {
+        while( !linetype(line,"ID") )
+            if (fgets(line,MAXLINE+1,fin) == NULL) return;
+
+        for(i=5;i<=strlen(line);i++)  /* DES */
+            if(line[i] != ' ') break;
+        if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+        strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+        sname[MAXNAMES]=EOS;
+        for(i=0;i<=strlen(sname);i++)
+            if(sname[i] == ' ') {
+                sname[i]=EOS;
+                break;
+            }
+        sname[MAXNAMES]=EOS;
+        rtrim(sname);
+        blank_to_(sname);
+
+/* look for secondary structure feature table / gap penalty mask */
+        while(fgets(line,MAXLINE+1,fin) != NULL) {
+            if (linetype(line,"FT")) {
+                feature[0]=EOS;
+                sscanf(line+2,"%s",feature);
+                if (strcmp(feature,"HELIX") == 0 ||
+                    strcmp(feature,"STRAND") == 0)
+                {
+
+                    if (interactive) {
+                        strcpy(title,"Found secondary structure in alignment file: ");
+                        strcat(title,sname);
+                        (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+                    }
+                    else (*lin2) = 'y';
+                    if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                        struct_penalties = SECST;
+                        for (i=0;i<length;i++)
+                            sec_struct_mask[i] = '.';
+                        do {
+                            get_swiss_feature(&line[2],length);
+                            if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                        } while( linetype(line,"FT") );
+                    }
+                    else {
+                        do {
+                            if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                        } while( linetype(line,"FT") );
+                    }
+                    strcpy(ss_name,sname);
+                }
+            }
+            else if (linetype(line,"GM")) {
+                if (interactive) {
+                    strcpy(title,"Found gap penalty mask in alignment file: ");
+                    strcat(title,sname);
+                    (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+                }
+                else (*lin2) = 'y';
+                if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                    struct_penalties = GMASK;
+                    for (i=0;i<length;i++)
+                        gap_penalty_mask[i] = '1';
+                    do {
+                        get_swiss_mask(&line[2],length);
+                        if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                    } while( linetype(line,"GM") );
+                }
+                else {
+                    do {
+                        if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                    } while( linetype(line,"GM") );
+                }
+                strcpy(ss_name,sname);
+            }
+            if (linetype(line,"SQ"))
+                break;
+
+            if (struct_penalties != NONE) break;
+        }
+
+    }
+
+}
+
+/*
+ * get_rsf_ss: scan an RSF file (global stream fin) for secondary structure
+ * features.
+ *
+ * length  number of mask positions available
+ *
+ * The header is skipped up to the line ending in "..", then each entry
+ * (opened by a '{' line) is searched for "feature" lines.  The first set
+ * that is accepted - always in batch mode, after a prompt when
+ * interactive - sets struct_penalties to SECST, fills sec_struct_mask
+ * through get_rsf_feature() and copies the entry name to ss_name.  The
+ * function returns when the end of the file is reached.
+ *
+ * Compared with the original code, header lines shorter than three
+ * characters no longer cause reads before the start of the buffer, the
+ * loops that run up to the "sequence" keyword stop at end of file (they
+ * used to spin forever), the entry name is terminated before its length is
+ * taken, and a short "name" line no longer makes the name start past the
+ * terminator.
+ */
+static void get_rsf_ss(sint length)
+{
+    static char title[MAXLINE+1];
+    static char line[MAXLINE+1];
+    static char lin2[MAXLINE+1];
+    static char sname[MAXNAMES+1];
+    sint i;
+    size_t n;
+
+/* skip the comments */
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        n = strlen(line);
+        if(n >= 3 &&
+           line[n-2]=='.' &&
+           line[n-3]=='.')
+            break;
+    }
+
+/* find the start of the sequence entry */
+    for (;;) {
+        while (fgets(line,MAXLINE+1,fin) != NULL)
+            if( *line == '{' ) break;
+
+        while( !keyword(line,"name") )
+            if (fgets(line,MAXLINE+1,fin) == NULL) return;
+
+        for(i=5;i<=strlen(line);i++)  /* DES */
+            if(line[i] != ' ') break;
+        if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+        strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+        sname[MAXNAMES]=EOS;
+        for(i=0;i<=strlen(sname);i++)
+            if(sname[i] == ' ') {
+                sname[i]=EOS;
+                break;
+            }
+        sname[MAXNAMES]=EOS;
+        rtrim(sname);
+        blank_to_(sname);
+
+/* look for secondary structure feature table / gap penalty mask */
+        while(fgets(line,MAXLINE+1,fin) != NULL) {
+            if (keyword(line,"feature")) {
+                if (interactive) {
+                    strcpy(title,"Found secondary structure in alignment file: ");
+                    strcat(title,sname);
+                    (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+                }
+                else (*lin2) = 'y';
+                if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                    struct_penalties = SECST;
+                    for (i=0;i<length;i++)
+                        sec_struct_mask[i] = '.';
+                    do {
+                        if(keyword(line,"feature"))
+                            get_rsf_feature(&line[7],length);
+                        if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                    } while( !keyword(line,"sequence") );
+                }
+                else {
+                    do {
+                        if (fgets(line,MAXLINE+1,fin) == NULL) break;
+                    } while( !keyword(line,"sequence") );
+                }
+                strcpy(ss_name,sname);
+            }
+            else if (keyword(line,"sequence"))
+                break;
+
+            if (struct_penalties != NONE) break;
+        }
+
+    }
+
+}
+
+/*
+ * get_gde_ss: scan a GDE file (global stream fin) for secondary structure
+ * or gap penalty mask entries.
+ *
+ * length  number of mask positions available
+ *
+ * Entries are introduced by a line starting with '"'; a name starting with
+ * "SS_" is a secondary structure, "GM_" a gap penalty mask.  The first one
+ * that is accepted - always in batch mode, after a prompt when
+ * interactive - sets struct_penalties to SECST or GMASK, copies the
+ * following lines (up to the next '%', '#' or '"' line) into the matching
+ * mask and stores the entry name in ss_name.  Returns at end of file.
+ *
+ * Compared with the original code, at most length characters are written
+ * to a mask, lines are never read past their terminator, and the name
+ * buffer is not read at index -1 when the name is empty.
+ *
+ * NOTE(review): the '(' test below inspects the last character copied into
+ * sname, but the copy loop stops before any '(' - so offset appears to be
+ * always 0.  Left as it is; confirm the intended offset syntax.
+ */
+static void get_gde_ss(sint length)
+{
+    static char title[MAXLINE+1];
+    static char line[MAXLINE+1];
+    static char lin2[MAXLINE+1];
+    static char sname[MAXNAMES+1];
+    sint i, len, linelen, offset = 0;
+    unsigned char c;
+
+    for (;;) {
+        line[0] = '\0';
+/* search for the next comment line */
+        while(*line != '"')
+            if (fgets(line,MAXLINE+1,fin) == NULL) return;
+
+/* is it a secondary structure entry? */
+        if (strncmp(&line[1],"SS_",3) == 0) {
+            for (i=1;i<=MAXNAMES-3;i++) {
+                if (line[i+3] == '(' || line[i+3] == '\n' || line[i+3] == EOS)
+                    break;
+                sname[i-1] = line[i+3];
+            }
+            i--;
+            sname[i]=EOS;
+            if (i > 0 && sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset);
+            else offset = 0;
+            for(i--;i > 0;i--)
+                if(isspace(sname[i])) {
+                    sname[i]=EOS;
+                }
+                else break;
+            blank_to_(sname);
+
+            if (interactive) {
+                strcpy(title,"Found secondary structure in alignment file: ");
+                strcat(title,sname);
+                (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+            }
+            else (*lin2) = 'y';
+            if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                struct_penalties = SECST;
+                for (i=0;i<length;i++)
+                    sec_struct_mask[i] = '.';
+                len = 0;
+                while(fgets(line,MAXLINE+1,fin)) {
+                    if(*line == '%' || *line == '#' || *line == '"') break;
+                    linelen = strlen(line);
+                    /* stay inside both the line and the mask */
+                    for(i=(offset < 0) ? 0 : offset;
+                        i < length && i < linelen && len < length;i++) {
+                        c=line[i];
+                        if(c == '\n' || c == EOS)
+                            break;      /* EOL */
+                        sec_struct_mask[len++]=c;
+                    }
+                }
+                strcpy(ss_name,sname);
+            }
+        }
+/* or is it a gap penalty mask entry? */
+        else if (strncmp(&line[1],"GM_",3) == 0) {
+            for (i=1;i<=MAXNAMES-3;i++) {
+                if (line[i+3] == '(' || line[i+3] == '\n' || line[i+3] == EOS)
+                    break;
+                sname[i-1] = line[i+3];
+            }
+            i--;
+            sname[i]=EOS;
+            if (i > 0 && sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset);
+            else offset = 0;
+            for(i--;i > 0;i--)
+                if(isspace(sname[i])) {
+                    sname[i]=EOS;
+                }
+                else break;
+            blank_to_(sname);
+
+            if (interactive) {
+                strcpy(title,"Found gap penalty mask in alignment file: ");
+                strcat(title,sname);
+                (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
+            }
+            else (*lin2) = 'y';
+            if ((*lin2 != 'n') && (*lin2 != 'N')) {
+                struct_penalties = GMASK;
+                for (i=0;i<length;i++)
+                    gap_penalty_mask[i] = '1';
+                len = 0;
+                while(fgets(line,MAXLINE+1,fin)) {
+                    if(*line == '%' || *line == '#' || *line == '"') break;
+                    linelen = strlen(line);
+                    /* stay inside both the line and the mask */
+                    for(i=(offset < 0) ? 0 : offset;
+                        i < length && i < linelen && len < length;i++) {
+                        c=line[i];
+                        if(c == '\n' || c == EOS)
+                            break;      /* EOL */
+                        gap_penalty_mask[len++]=c;
+                    }
+                }
+                strcpy(ss_name,sname);
+            }
+        }
+        if (struct_penalties != NONE) break;
+    }
+
+}
+
+/*
+ * get_swiss_feature: apply one Swiss-Prot feature line to sec_struct_mask.
+ *
+ * line  text after the "FT" tag: feature name, start position, end position
+ * len   number of mask positions available
+ *
+ * HELIX features are written as 'A' with '$' at both ends, STRAND features
+ * as 'B' with '%' at both ends; positions in the file are 1-based.  Lines
+ * that do not parse, other feature types and positions outside 1..len-1
+ * are ignored (positions below 1 used to write before the start of the
+ * mask).
+ */
+static void get_swiss_feature(char *line, sint len)
+{
+    char c, s, feature[MAXLINE+1];
+    int i, start_pos, end_pos;
+
+    if (sscanf(line,"%s%d%d",feature,&start_pos,&end_pos) != 3) {
+        return;
+    }
+
+    if (strcmp(feature,"HELIX") == 0) {
+        c = 'A';
+        s = '$';
+    }
+    else if (strcmp(feature,"STRAND") == 0) {
+        c = 'B';
+        s = '%';
+    }
+    else
+        return;
+
+    if(start_pos < 1 || end_pos < 1) return;
+    if(start_pos >=len || end_pos>=len) return;
+
+    sec_struct_mask[start_pos-1] = s;
+    for (i=start_pos;i<end_pos-1;i++)
+        sec_struct_mask[i] = c;
+    sec_struct_mask[end_pos-1] = s;
+
+}
+
+/*
+ * get_rsf_feature: apply one RSF feature line to sec_struct_mask.
+ *
+ * line  text after the "feature" keyword: start position, end position, a
+ *       third number and two words (all three unused here), feature name
+ * len   number of mask positions available
+ *
+ * HELIX features are written as 'A' with '$' at both ends, STRAND features
+ * as 'B' with '%' at both ends; positions in the file are 1-based.  Lines
+ * that do not parse, other feature types and positions outside 1..len-1
+ * are ignored (positions below 1 used to write before the start of the
+ * mask).
+ */
+static void get_rsf_feature(char *line, sint len)
+{
+    char c, s;
+    char str1[MAXLINE+1],str2[MAXLINE+1],feature[MAXLINE+1];
+    int i, tmp,start_pos, end_pos;
+
+    if (sscanf(line,"%d%d%d%s%s%s",&start_pos,&end_pos,&tmp,str1,str2,feature) != 6) {
+        return;
+    }
+
+    if (strcmp(feature,"HELIX") == 0) {
+        c = 'A';
+        s = '$';
+    }
+    else if (strcmp(feature,"STRAND") == 0) {
+        c = 'B';
+        s = '%';
+    }
+    else
+        return;
+
+    if(start_pos < 1 || end_pos < 1) return;
+    if(start_pos>=len || end_pos >= len) return;
+    sec_struct_mask[start_pos-1] = s;
+    for (i=start_pos;i<end_pos-1;i++)
+        sec_struct_mask[i] = c;
+    sec_struct_mask[end_pos-1] = s;
+
+}
+
+/*
+ * get_swiss_mask: apply one "GM" line to gap_penalty_mask.
+ *
+ * line  text after the "GM" tag: mask value (1..9), start position, end
+ *       position (1-based, inclusive)
+ * len   number of mask positions available
+ *
+ * The digit character for the value is written to every position of the
+ * range.  Lines that do not parse, values outside 1..9 and positions
+ * outside 1..len-1 are ignored (a start position below 1 used to write
+ * before the start of the mask).
+ */
+static void get_swiss_mask(char *line, sint len)
+{
+    int i, value, start_pos, end_pos;
+
+    if (sscanf(line,"%d%d%d",&value,&start_pos,&end_pos) != 3) {
+        return;
+    }
+
+    if (value < 1 || value > 9) return;
+
+    if(start_pos < 1) return;
+    if(start_pos>=len || end_pos >= len) return;
+    for (i=start_pos-1;i<end_pos;i++)
+        gap_penalty_mask[i] = value+'0';
+
+}
+
+/*
+ * get_seq: read the next sequence from the global stream fin, in the format
+ * selected by the file-level seqFormat (EMBLSWISS, PIR, PEARSON, GDE, RSF).
+ *
+ * sname  receives the entry name (at most MAXNAMES characters)
+ * len    receives the number of residues stored
+ * tit    receives the title line for PIR, an empty string for PEARSON and
+ *        GDE; untouched for the other formats
+ *
+ * Returns a buffer obtained from ckalloc/ckrealloc with the residues in
+ * seq[1..*len] and a terminator at seq[*len+1], or NULL when the end of the
+ * file is reached before the start of the sequence data is found.
+ * Characters that are not in chartab are dropped.
+ *
+ * The line buffer is static on purpose: for several formats the line that
+ * ends one entry is the line that starts the next one, and is examined
+ * again on the following call.
+ *
+ * Compared with the original code:
+ *  - every loop that waits for a marker line stops at end of file instead
+ *    of spinning forever on the last line read;
+ *  - bytes >= 128 no longer index past the end of chartab;
+ *  - names are terminated before their length is taken and can no longer
+ *    be written past MAXNAMES;
+ *  - an entry with no sequence lines (or an unknown format) yields an empty
+ *    sequence instead of dereferencing a NULL buffer;
+ *  - the PIR title and the GDE name are not indexed at -1 when empty.
+ */
+static char * get_seq(char *sname,sint *len,char *tit)
+{
+    static char line[MAXLINE+1];
+    char *seq = NULL;
+    sint i, offset = 0;
+    unsigned char c=EOS;
+    Boolean got_seq=FALSE;
+
+    switch(seqFormat) {
+
+/************************************/
+        case EMBLSWISS:
+            while( !linetype(line,"ID") )
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            for(i=5;i<=strlen(line);i++)  /* DES */
+                if(line[i] != ' ') break;
+            if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+            strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+            sname[MAXNAMES]=EOS;
+            for(i=0;i<=strlen(sname);i++)
+                if(sname[i] == ' ') {
+                    sname[i]=EOS;
+                    break;
+                }
+
+            sname[MAXNAMES]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+
+            while( !linetype(line,"SQ") )
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            *len=0;
+            while(fgets(line,MAXLINE+1,fin)) {
+                if(got_seq && blankline(line)) break;
+                if( strlen(line) > 2 && line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.' )
+                    continue;
+                if(seq==NULL)
+                    seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+                else
+                    seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+                for(i=0;i<=MAXLINE;i++) {
+                    c=line[i];
+                    if(c == '\n' || c == EOS || c == '/')
+                        break;  /* EOL */
+                    c=(c < 128) ? chartab[c] : 0;
+                    if(c) {
+                        got_seq=TRUE;
+                        seq[++(*len)]=c;
+                    }
+                }
+                if(c == '/') break;
+            }
+            break;
+
+/************************************/
+        case PIR:
+            while(*line != '>')
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+            for(i=4;i<=strlen(line);i++)  /* DES */
+                if(line[i] != ' ') break;
+            if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+            strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+            sname[MAXNAMES]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+            if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+            strncpy(tit,line,MAXTITLES);
+            tit[MAXTITLES]=EOS;
+            i=strlen(tit);
+            if(i > 0 && tit[i-1]=='\n') tit[i-1]=EOS;
+
+            *len=0;
+            while(fgets(line,MAXLINE+1,fin)) {
+                if(seq==NULL)
+                    seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+                else
+                    seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+                for(i=0;i<=MAXLINE;i++) {
+                    c=line[i];
+                    if(c == '\n' || c == EOS || c == '*')
+                        break;  /* EOL */
+
+                    c=(c < 128) ? chartab[c] : 0;
+                    if(c) seq[++(*len)]=c;
+                }
+                if(c == '*') break;
+            }
+            break;
+/***********************************************/
+        case PEARSON:
+            while(*line != '>')
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            for(i=1;i<=strlen(line);i++)  /* DES */
+                if(line[i] != ' ') break;
+            if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+            strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+            sname[MAXNAMES]=EOS;
+            for(i=1;i<=strlen(sname);i++)  /* DES */
+                if(sname[i] == ' ') break;
+            if(i > MAXNAMES) i=MAXNAMES;   /* never write past the name buffer */
+            sname[i]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+            *tit=EOS;
+
+            *len=0;
+            while(fgets(line,MAXLINE+1,fin)) {
+                if(seq==NULL)
+                    seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+                else
+                    seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+                for(i=0;i<=MAXLINE;i++) {
+                    c=line[i];
+                    if(c == '\n' || c == EOS || c == '>')
+                        break;  /* EOL */
+
+                    c=(c < 128) ? chartab[c] : 0;
+                    if(c) seq[++(*len)]=c;
+                }
+                if(c == '>') break;
+            }
+            break;
+/**********************************************/
+        case GDE:
+            if (dnaflag) {
+                while(*line != '#')
+                    if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+            }
+            else {
+                while(*line != '%')
+                    if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+            }
+
+            for (i=1;i<=MAXNAMES;i++) {
+                if (line[i] == '(' || line[i] == '\n' || line[i] == EOS)
+                    break;
+                sname[i-1] = line[i];
+            }
+            i--;
+            sname[i]=EOS;
+            /* NOTE(review): the copy loop stops before any '(' so this test
+               looks as if it can never succeed - confirm the offset syntax */
+            if (i > 0 && sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
+            else offset = 0;
+            for(i--;i > 0;i--)
+                if(isspace(sname[i])) {
+                    sname[i]=EOS;
+                }
+                else break;
+            blank_to_(sname);
+
+            *tit=EOS;
+
+            *len=0;
+            /* the leading gaps need a buffer before the first line is read */
+            if (offset > 0)
+                seq=(char *)ckalloc((offset+MAXLINE+2)*sizeof(char));
+            for (i=0;i<offset;i++) seq[++(*len)] = '-';
+            while(fgets(line,MAXLINE+1,fin)) {
+                if(*line == '%' || *line == '#' || *line == '"') break;
+                if(seq==NULL)
+                    seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+                else
+                    seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+                for(i=0;i<=MAXLINE;i++) {
+                    c=line[i];
+                    if(c == '\n' || c == EOS)
+                        break;  /* EOL */
+
+                    c=(c < 128) ? chartab[c] : 0;
+                    if(c) seq[++(*len)]=c;
+                }
+            }
+            break;
+/***********************************************/
+        case RSF:
+            while(*line != '{')
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            while( !keyword(line,"name") )
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            for(i=5;i<=strlen(line);i++)  /* DES */
+                if(line[i] != ' ') break;
+            if(i > strlen(line)) i=strlen(line);  /* short line: empty name */
+            strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+            sname[MAXNAMES]=EOS;
+            for(i=0;i<=strlen(sname);i++)
+                if(sname[i] == ' ') {
+                    sname[i]=EOS;
+                    break;
+                }
+
+            sname[MAXNAMES]=EOS;
+            rtrim(sname);
+            blank_to_(sname);
+
+
+            while( !keyword(line,"sequence") )
+                if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
+
+            *len=0;
+            while(fgets(line,MAXLINE+1,fin)) {
+                if(seq==NULL)
+                    seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+                else
+                    seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
+                for(i=0;i<=MAXLINE;i++) {
+                    c=line[i];
+                    if(c == EOS || c == '}')
+                        break;  /* EOL */
+                    if( c=='.')
+                        seq[++(*len)]='-';
+                    c=(c < 128) ? chartab[c] : 0;
+                    if(c) seq[++(*len)]=c;
+                }
+                if(c == '}') break;
+            }
+            break;
+/***********************************************/
+    }
+
+    /* no sequence line was read: hand back an empty sequence */
+    if (seq == NULL) {
+        *len=0;
+        seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
+    }
+
+    seq[*len+1]=EOS;
+
+    return seq;
+}
+
+
+/*
+ * readseqs - read every sequence of the input file into the global
+ * alignment arrays (names[], titles[], seqlen_array[], seq_array[]).
+ *
+ * The file name comes from the user (menu mode) or from the global
+ * seqname; a leading "file://" is stripped.  check_infile() detects the
+ * format and counts the sequences, then each sequence is read with the
+ * format-specific reader and encoded with n_encode()/p_encode().
+ *
+ * first_seq: slot in which the first sequence read is stored.  1 means a
+ *            new alignment (previous data is freed); any other value means
+ *            the new sequences are appended after the ones already held.
+ *
+ * Returns -1 if no file name was given or the file could not be opened,
+ *          0 if no sequences were counted or two sequences share a name,
+ *          otherwise the number of sequences counted in the file.
+ *
+ * The input stream fin is closed on every path that opened it.
+ */
+sint readseqs(sint first_seq) /*first_seq is the #no. of the first seq. to read */
+{
+    char line[FILENAMELEN+1];
+    char fileName[FILENAMELEN+1];
+
+    static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1];
+    sint i,j;
+    sint no_seqs;
+    static sint l1;
+    static Boolean dnaflag1;
+
+    if(usemenu)
+        getstr("Enter the name of the sequence file",line);
+    else
+        strcpy(line,seqname);
+    if(*line == EOS) return -1;
+
+    /* accept URL-style names: keep only what follows "file://" */
+    if ((sscanf(line,"file://%s",fileName) == 1 )) {
+        strcpy(line,fileName);
+    }
+
+    if((fin=fopen(line,"r"))==NULL) {
+        error("Could not open sequence file (%s) ",line);
+        return -1; /* DES -1 => file not found */
+    }
+    strcpy(seqname,line);
+    no_seqs=0;
+    check_infile(&no_seqs);
+    info("Sequence format is %s",formatNames[seqFormat]);
+    if(seqFormat==NEXUS)
+        error("Cannot read nexus format");
+
+    if(no_seqs == 0) {
+        fclose(fin);    /* do not leak the stream on the failure path */
+        return 0;       /* return the number of seqs. (zero here)*/
+    }
+
+    max_aln_length = 0;
+
+/* if this is a multiple alignment, or profile 1 - free any memory used
+by previous alignments, then allocate memory for the new alignment */
+    if(first_seq == 1) {
+        max_names = 0;
+        free_aln(nseqs);
+        alloc_aln(no_seqs);
+    }
+/* otherwise, this is a profile 2, and we need to reallocate the arrays,
+leaving the data for profile 1 intact */
+    else realloc_aln(first_seq,no_seqs);
+
+    /* start the maxima from the sequences that are already in memory */
+    for(i=1;i<first_seq;i++)
+    {
+        if(seqlen_array[i]>max_aln_length)
+            max_aln_length=seqlen_array[i];
+        if((sint)strlen(names[i])>max_names)
+            max_names=(sint)strlen(names[i]);
+    }
+
+    for(i=first_seq;i<=first_seq+no_seqs-1;i++) { /* get the seqs now*/
+        output_index[i] = i; /* default output order */
+        if(seqFormat == CLUSTAL)
+            seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1);
+        else if(seqFormat == MSF)
+            seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1);
+        else
+            seq1=get_seq(sname1,&l1,title);
+
+        if(seq1==NULL) break;
+/* JULIE */
+/* Set max length of dynamically allocated arrays in prfalign.c */
+        if (l1 > max_aln_length) max_aln_length = l1;
+        seqlen_array[i]=l1; /* store the length */
+        strcpy(names[i],sname1); /* " " name */
+        strcpy(titles[i],title); /* " " title */
+
+        if(!explicit_dnaflag) {
+            dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
+            if(i == 1) dnaflag = dnaflag1;
+        } /* type decided by first seq*/
+        else
+            dnaflag1 = dnaflag;
+
+        alloc_seq(i,l1);
+
+        if(dnaflag)
+            n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
+        else /* as ints */
+            p_encode(seq1,seq_array[i],l1);
+        if(seq1!=NULL) seq1=ckfree(seq1);
+    }
+
+
+    max_aln_length *= 2;
+/*
+ JULIE
+ check sequence names are all different - otherwise phylip tree is
+ confused.
+*/
+    for(i=1;i<=first_seq+no_seqs-1;i++) {
+        for(j=i+1;j<=first_seq+no_seqs-1;j++) {
+            if (strncmp(names[i],names[j],MAXNAMES) == 0) {
+                error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES);
+                fclose(fin);    /* do not leak the stream on the failure path */
+                return 0;
+            }
+        }
+    }
+    for(i=first_seq;i<=first_seq+no_seqs-1;i++)
+    {
+        if(seqlen_array[i]>max_aln_length)
+            max_aln_length=seqlen_array[i];
+    }
+
+/* look for a feature table / gap penalty mask (only if this is a profile) */
+    if (profile_no > 0) {
+        rewind(fin);
+        struct_penalties = NONE;
+        gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
+        sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
+        ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
+
+        if (seqFormat == CLUSTAL) {
+            get_clustal_ss(max_aln_length);
+        }
+        else if (seqFormat == GDE) {
+            get_gde_ss(max_aln_length);
+        }
+        else if (seqFormat == EMBLSWISS) {
+            get_embl_ss(max_aln_length);
+        }
+        else if (seqFormat == RSF) {
+            get_rsf_ss(max_aln_length);
+        }
+    }
+
+    for(i=first_seq;i<=first_seq+no_seqs-1;i++)
+    {
+        if((sint)strlen(names[i])>max_names)
+            max_names=(sint)strlen(names[i]);
+    }
+
+    if(max_names<10) max_names=10;
+
+    fclose(fin);
+
+    return no_seqs; /* return the number of seqs. read in this call */
+}
+
+
+static Boolean check_dnaflag(char *seq, sint slen)
+/* Decide whether a sequence is DNA or protein.
+   Positions 1..slen are examined; '-' characters are ignored.  Every other
+   character is counted, and those that are one of A,C,G,T,U,N count as
+   bases.  Returns TRUE (DNA) when bases make up >= 85% of the counted
+   characters, FALSE otherwise (including when nothing was counted). */
+{
+    char *base_codes="ACGTUN";
+    sint pos;
+    sint n_counted = 0;
+    sint n_bases = 0;
+    float fraction;
+
+    for(pos=1; pos <= slen; pos++) {
+        if(seq[pos] == '-')
+            continue;               /* gaps do not take part in the vote */
+        n_counted++;
+        if(seq[pos] == 'N' || res_index(base_codes, seq[pos]) >= 0)
+            n_bases++;
+    }
+
+    if(n_bases == 0 || n_counted == 0)
+        return FALSE;
+
+    fraction = (float)n_bases/(float)n_counted;
+    return (fraction >= 0.85) ? TRUE : FALSE;
+}
+
+
+
+/*
+ * check_infile - work out the format of the open input stream fin and
+ * count the sequences it holds.
+ *
+ * The first non-blank line decides the format, which is stored in the
+ * global seqFormat.  MSF "!!AA"/"!!NA" headers and GDE '%'/'#' lines also
+ * set the global dnaflag.  The count is stored in *nseqs.
+ *
+ * For NEXUS and unrecognised input the function returns at once with
+ * *nseqs == 0 and without rewinding; in every other case fin is rewound
+ * to the start of the file before returning.
+ */
+static void check_infile(sint *nseqs)
+{
+    /* zero-filled so that an empty file leaves a valid (empty) string
+       instead of uninitialised memory */
+    char line[MAXLINE+1] = "";
+    sint i;
+
+    *nseqs=0;
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        if(!blankline(line))
+            break;
+    }
+
+    /* strip trailing white space / control characters */
+    for(i=(sint)strlen(line)-1;i>=0;i--)
+        if(isgraph((unsigned char)line[i])) break;
+    line[i+1]=EOS;
+
+    /* upper-case the first (up to) seven characters for the keyword tests */
+    for(i=0;i<=6 && line[i]!=EOS;i++)
+        line[i] = toupper((unsigned char)line[i]);
+
+    if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */
+        seqFormat=EMBLSWISS;
+        (*nseqs)++;
+    }
+    else if( linetype(line,"CLUSTAL") ) {
+        seqFormat=CLUSTAL;
+    }
+    else if( linetype(line,"PILEUP") ) {
+        seqFormat = MSF;
+    }
+    else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) {
+        seqFormat = MSF;
+        dnaflag = FALSE;
+    }
+    else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) {
+        seqFormat = MSF;
+        dnaflag = TRUE;
+    }
+    else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' &&
+             line[strlen(line)-2]=='.' ) {
+        seqFormat = MSF;
+    }
+    else if( linetype(line,"!!RICH_SEQUENCE") ) {
+        seqFormat = RSF;
+    }
+    else if( linetype(line,"#NEXUS") ) {
+        seqFormat=NEXUS;
+        return;
+    }
+    else if(*line == '>') { /* no */
+        /* distinguish PIR and Pearson: PIR has ';' as the 4th character;
+           only look at it when the line is long enough to have one */
+        seqFormat=(strlen(line) > 3 && line[3] == ';')?PIR:PEARSON;
+        (*nseqs)++;
+    }
+    else if((*line == '"') || (*line == '%') || (*line == '#')) {
+        seqFormat=GDE; /* GDE format */
+        if (*line == '%') {
+            (*nseqs)++;
+            dnaflag = FALSE;
+        }
+        else if (*line == '#') {
+            (*nseqs)++;
+            dnaflag = TRUE;
+        }
+    }
+    else {
+        seqFormat=UNKNOWN;
+        return;
+    }
+
+    /* Count the remaining entries.  Line-oriented formats are counted here
+       one header line at a time; block formats hand over to a dedicated
+       counter on the first pass, rewind and return. */
+    while(fgets(line,MAXLINE+1,fin) != NULL) {
+        switch(seqFormat) {
+            case EMBLSWISS:
+                if( linetype(line,"ID") )
+                    (*nseqs)++;
+                break;
+            case PIR:
+                *nseqs = count_pir_seqs();
+                fseek(fin,0L,SEEK_SET);
+                return;
+            case PEARSON:
+                if( *line == '>' )
+                    (*nseqs)++;
+                break;
+            case GDE:
+                if(( *line == '%' ) && ( dnaflag == FALSE))
+                    (*nseqs)++;
+                else if (( *line == '#') && ( dnaflag == TRUE))
+                    (*nseqs)++;
+                break;
+            case CLUSTAL:
+                *nseqs = count_clustal_seqs();
+                fseek(fin,0L,SEEK_SET);
+                return;
+            case MSF:
+                *nseqs = count_msf_seqs();
+                fseek(fin,0L,SEEK_SET);
+                return;
+            case RSF:
+                /* the RSF counter scans from the top of the file */
+                fseek(fin,0L,SEEK_SET);
+                *nseqs = count_rsf_seqs();
+                fseek(fin,0L,SEEK_SET);
+                return;
+            case USER:
+            default:
+                break;
+        }
+    }
+    fseek(fin,0L,SEEK_SET);
+}
+
+
+static sint count_pir_seqs(void)
+/* Count the sequences in a PIR file, reading from the current position of
+   fin.  Every entry must contain a '*' end marker before the next '>'
+   header (or the end of the file for the last one); returns 0 after
+   reporting an error when a marker is missing. */
+{
+    char line[MAXLINE+1];
+    char *p;
+    sint count;
+    Boolean terminated = FALSE;
+
+    /* first entry: its header has already been consumed by the caller */
+    while (!terminated && fgets(line,MAXLINE+1,fin) != NULL) {
+        if(*line == '>') break;
+        for(p=line; *p != EOS && *p != '\n' && *p != '*'; p++)
+            ;
+        if(*p == '*')
+            terminated = TRUE;      /* ok - end of sequence found */
+    }
+    if (!terminated) {
+        error("PIR format sequence end marker '*'\nmissing for one or more sequences.");
+        return (sint)0; /* funny format*/
+    }
+
+    count = 1;
+
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        if(*line != '>')
+            continue;               /* not the start of an entry */
+
+        /* inside an entry: read on until its '*' turns up */
+        while (fgets(line,MAXLINE+1,fin) != NULL) {
+            if(*line == '>') {
+                error("PIR format sequence end marker '*' missing for one or more sequences.");
+                return (sint)0; /* funny format*/
+            }
+            for(p=line; *p != EOS && *p != '\n' && *p != '*'; p++)
+                ;
+            if(*p == '*') {
+                count++;
+                break;
+            }
+        }
+    }
+    return (sint)count;
+}
+
+
+static sint count_clustal_seqs(void)
+/* Count the sequences in a clustal alignment file: the number of lines in
+   the first block, i.e. from the first non-blank line (as judged by
+   cl_blankline) up to the next blank one.  Returns 0 if the file ends
+   before that blank line is seen. */
+{
+    char line[MAXLINE+1];
+    sint count = 1;     /* the first non-blank line is sequence no. 1 */
+
+    /* skip forward to the first line of the first block */
+    while (fgets(line,MAXLINE+1,fin) != NULL && cl_blankline(line))
+        ;
+
+    /* one sequence per further line, until the block is closed */
+    for(;;) {
+        if(fgets(line,MAXLINE+1,fin) == NULL)
+            return (sint)0; /* if you got to here-funny format/no seqs.*/
+        if(cl_blankline(line))
+            return count;
+        count++;
+    }
+}
+
+static sint count_msf_seqs(void)
+{
+/* Count the sequences in a PILEUP (MSF) alignment file: skip to the "//"
+   line, then count the lines of the first block that follows it.  Returns
+   0 if the file ends before the blank line closing that block. */
+
+    char line[MAXLINE+1];
+    sint count = 1;     /* the first non-blank line is sequence no. 1 */
+
+    /* skip the header, up to and including the "//" separator */
+    while (fgets(line,MAXLINE+1,fin) != NULL && !linetype(line,"//"))
+        ;
+
+    /* skip blank lines up to the first line of the first block */
+    while (fgets(line,MAXLINE+1,fin) != NULL && blankline(line))
+        ;
+
+    for(;;) {
+        if(fgets(line,MAXLINE+1,fin) == NULL)
+            return (sint)0; /* if you got to here-funny format/no seqs.*/
+        if(blankline(line))
+            return count;
+        count++;
+    }
+}
+
+static sint count_rsf_seqs(void)
+{
+/* Count the sequences in a GCG RSF alignment file, reading fin from its
+   current position.  The header is skipped up to the first line whose two
+   characters before the final one are both '.'; after that every line
+   starting with '{' is counted as one sequence. */
+
+    char line[MAXLINE+1];
+    sint nseqs;
+    size_t len;
+
+    nseqs = 0;
+/* skip the comments */
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        len = strlen(line);
+        /* lines shorter than 3 characters cannot match; testing them
+           would index in front of the buffer */
+        if(len >= 3 && line[len-2]=='.' && line[len-3]=='.')
+            break;
+    }
+
+    while (fgets(line,MAXLINE+1,fin) != NULL) {
+        if( *line == '{' )
+            nseqs++;
+    }
+    return (sint)nseqs;
+}
+
+static void p_encode(char *seq, char *naseq, sint l)
+{ /* Code seq[1..l] as small integers in naseq[1..l]: '-' becomes gap_pos2,
+     any other character its index in amino_acid_codes (-1 if absent).
+     naseq[l+1] receives the end marker -3. */
+    register sint pos;
+
+    for(pos=1;pos<=l;pos++)
+        naseq[pos] = (seq[pos] == '-') ? gap_pos2
+                                       : res_index(amino_acid_codes,seq[pos]);
+    naseq[pos] = -3;
+}
+
+static void n_encode(char *seq,char *naseq,sint l)
+{ /* Code seq[1..l] as small integers in naseq[1..l]; the gap character
+     '-' of the input files is coded as gap_pos2, every other character as
+     its index in amino_acid_codes.  naseq[l+1] receives the end marker -3. */
+    register sint pos = 1;
+
+    while(pos<=l) {
+        if(seq[pos] != '-')
+            naseq[pos] = res_index(amino_acid_codes,seq[pos]);
+        else
+            naseq[pos] = gap_pos2;
+        pos++;
+    }
+    naseq[pos] = -3;
+}
+
+static sint res_index(char *t,char c)
+{ /* Return the position (from 0) of the first occurrence of c in the
+     string t, or -1 if c does not occur before the terminator. */
+    register sint pos = 0;
+
+    while(t[pos]) {
+        if(t[pos] == c)
+            return(pos);
+        pos++;
+    }
+    return -1;
+}
Added: trunk/packages/clustalw/branches/upstream/current/showpair.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/showpair.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/showpair.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,486 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include "clustalw.h"
+
+static void make_p_ptrs(sint *tptr, sint *pl, sint naseq, sint l);
+static void make_n_ptrs(sint *tptr, sint *pl, sint naseq, sint len);
+static void put_frag(sint fs, sint v1, sint v2, sint flen);
+static sint frag_rel_pos(sint a1, sint b1, sint a2, sint b2);
+static void des_quick_sort(sint *array1, sint *array2, sint array_size);
+static void pair_align(sint seq_no, sint l1, sint l2);
+
+
+/*
+* Prototypes
+*/
+
+/*
+* Global variables
+*/
+extern sint *seqlen_array;
+extern char **seq_array;
+extern sint dna_ktup, dna_window, dna_wind_gap, dna_signif; /* params for DNA */
+extern sint prot_ktup,prot_window,prot_wind_gap,prot_signif; /* params for prots */
+extern sint nseqs;
+extern Boolean dnaflag;
+extern double **tmat;
+extern sint max_aa;
+extern sint max_aln_length;
+
+/* cursor into the fragment chain (accum[4][]); shared by put_frag() and pair_align() */
+static sint next;
+/* curr_frag: index of the fragment created most recently by pair_align()
+   maxsf:     index of the head of the fragment chain, 0 while the chain is empty
+   vatend:    set to 1 when pair_align() stops at the fragment limit, 0 otherwise */
+static sint curr_frag,maxsf,vatend;
+/* fragment table, five rows filled by put_frag():
+   [0] fs, [1] v1, [2] v2, [3] flen, [4] index of the next fragment in the chain */
+static sint **accum;
+/* diagonal numbers, permuted together with displ[] by des_quick_sort() */
+static sint *diag_index;
+/* per diagonal: set to 1 when it lies within `window` of a selected diagonal */
+static char *slopes;
+
+sint ktup,window,wind_gap,signif; /* Pairwise aln. params */
+/* per diagonal: first the k-tuple match count, later the index of the
+   last fragment put on that diagonal (see pair_align()) */
+sint *displ;
+/* k-tuple tables built by make_p_ptrs()/make_n_ptrs(): show_pair() passes
+   zza (tptr) and zzc (pl) for sequence i, zzb (tptr) and zzd (pl) for sequence j */
+sint *zza, *zzb, *zzc, *zzd;
+
+/* when set, show_pair() scales the score by the shorter sequence length, times 100 */
+extern Boolean percent;
+
+
+/*
+ * make_p_ptrs - index the k-tuples of protein sequence no. naseq.
+ *
+ * Each window of ktup residues starting at position 1..l-ktup+1 is turned
+ * into a code in 1..(max_aa+1)^ktup.  On return pl[code] holds the last
+ * start position with that code (0 if none) and tptr[pos] the previous
+ * start position with the same code as pos (0 if none).  Windows holding a
+ * residue outside 0..max_aa are left out.
+ */
+static void make_p_ptrs(sint *tptr,sint *pl,sint naseq,sint l)
+{
+    static sint weight[10];
+    sint k,start,last_start,n_codes,code;
+    char residue;
+
+    for (k=1;k<=ktup;k++)
+        weight[k] = (sint) pow((double)(max_aa+1),(double)(k-1));
+
+    n_codes = (sint) pow((double)(max_aa+1),(double)ktup);
+    for(k=1;k<=n_codes;++k)
+        pl[k]=0;
+    for(k=1;k<=l;++k)
+        tptr[k]=0;
+
+    last_start = l-ktup+1;
+    for(start=1;start<=last_start;++start) {
+        code=0;
+        for(k=1;k<=ktup;++k) {
+            residue = seq_array[naseq][start+k-1];
+            if((residue<0) || (residue > max_aa))
+                break;              /* not codable: abandon this window */
+            code += ((residue) * weight[k]);
+        }
+        if(k<=ktup)
+            continue;               /* the inner loop was cut short */
+        ++code;
+        if(pl[code]!=0)
+            tptr[start]=pl[code];
+        pl[code]=start;
+    }
+}
+
+
+/*
+ * make_n_ptrs - index the k-tuples of nucleotide sequence no. naseq.
+ *
+ * Same tables as make_p_ptrs(), but with base-4 weights taken from a fixed
+ * table: codes run over 1..4^ktup and windows holding a residue code
+ * outside 0..4 are left out.
+ */
+static void make_n_ptrs(sint *tptr,sint *pl,sint naseq,sint len)
+{
+    static sint pot[]={ 0, 1, 4, 16, 64, 256, 1024, 4096 };
+    sint k,start,last_start,n_codes,code;
+    char residue;
+
+    n_codes = (sint) pow((double)4,(double)ktup);
+
+    for(k=1;k<=n_codes;++k)
+        pl[k]=0;
+    for(k=1;k<=len;++k)
+        tptr[k]=0;
+
+    last_start = len-ktup+1;
+    for(start=1;start<=last_start;++start) {
+        code=0;
+        for(k=1;k<=ktup;++k) {
+            residue = seq_array[naseq][start+k-1];
+            if((residue<0) || (residue>4))
+                break;              /* not codable: abandon this window */
+            code += ((residue) * pot[k]); /* DES */
+        }
+        if(k<=ktup)
+            continue;               /* the inner loop was cut short */
+        ++code;
+        if(pl[code]!=0)
+            tptr[start]=pl[code];
+        pl[code]=start;
+    }
+}
+
+
+/*
+ * put_frag - store fragment no. curr_frag (fs, v1, v2, flen) in accum[][]
+ * and link it into the chain headed by maxsf, which is kept in order of
+ * decreasing fs through the links in accum[4][].
+ * The file-level cursor `next` is used (and left modified) by the search.
+ */
+static void put_frag(sint fs,sint v1,sint v2,sint flen)
+{
+    sint prev;
+
+    accum[0][curr_frag]=fs;
+    accum[1][curr_frag]=v1;
+    accum[2][curr_frag]=v2;
+    accum[3][curr_frag]=flen;
+
+    /* empty chain: this fragment becomes the only element */
+    if(maxsf == 0) {
+        maxsf=1;
+        accum[4][curr_frag]=0;
+        return;
+    }
+
+    /* at least as good as the current head: becomes the new head */
+    if(fs >= accum[0][maxsf]) {
+        accum[4][curr_frag]=maxsf;
+        maxsf=curr_frag;
+        return;
+    }
+
+    /* otherwise walk down the chain to the first entry it does not lose to */
+    next=maxsf;
+    do {
+        prev=next;
+        next=accum[4][next];
+    } while(fs < accum[0][next]);
+
+    accum[4][curr_frag]=next;
+    accum[4][prev]=curr_frag;
+}
+
+
+/*
+ * frag_rel_pos - TRUE if fragment (a2,b2) lies before fragment (a1,b1):
+ * on the same diagonal it must simply start earlier; on different
+ * diagonals it must end (ktup residues long) before (a1,b1) starts in both
+ * sequences.  FALSE otherwise.
+ */
+static sint frag_rel_pos(sint a1,sint b1,sint a2,sint b2)
+{
+    if(a1-b1 == a2-b2)
+        return (a2<a1) ? TRUE : FALSE;
+
+    return (a2+ktup-1<a1 && b2+ktup-1<b1) ? TRUE : FALSE;
+}
+
+
+static void des_quick_sort(sint *array1, sint *array2, sint array_size)
+/* */
+/* Quicksort routine, adapted from chapter 4, page 115 of software tools */
+/* by Kernighan and Plauger, (1986) */
+/* Sort the elements of array1 into ascending order and apply the same */
+/* exchanges to array2, so that array2 follows array1. */
+/* Non-recursive: pending sub-ranges are kept on the lst[]/ust[] stack. */
+/* The range sorted is array1[1] .. array1[array_size-1] (see ust[1]). */
+/* NOTE(review): pair_align() passes tl1 as array_size and then treats */
+/* index tl1 as the top entry, although that index is outside the range */
+/* sorted here - confirm whether ust[1] should be array_size. */
+/* */
+{
+ sint temp1, temp2;
+ sint p, pivlin;
+ sint i, j;
+ sint lst[50], ust[50]; /* stack of lower/upper bounds of pending ranges; */
+ /* p never exceeds 49 only while the stack depth stays below 50 */
+ /* (original note: "the maximum no. of elements must be < log(base2) of 50", */
+ /* presumably meaning log2(no. of elements) < 50 - TODO confirm) */
+
+ lst[1] = 1;
+ ust[1] = array_size-1;
+ p = 1;
+
+ while(p > 0) {
+ /* ranges of fewer than two elements are already sorted: pop */
+ if(lst[p] >= ust[p])
+ p--;
+ else {
+ /* partition around the last element of the range */
+ i = lst[p] - 1;
+ j = ust[p];
+ pivlin = array1[j];
+ while(i < j) {
+ /* the pivot itself, at ust[p], stops this upward scan */
+ for(i=i+1; array1[i] < pivlin; i++)
+ ;
+ for(j=j-1; j > i; j--)
+ if(array1[j] <= pivlin) break;
+ if(i < j) {
+ temp1 = array1[i];
+ array1[i] = array1[j];
+ array1[j] = temp1;
+
+ temp2 = array2[i];
+ array2[i] = array2[j];
+ array2[j] = temp2;
+ }
+ }
+
+ /* move the pivot to its final position i */
+ j = ust[p];
+
+ temp1 = array1[i];
+ array1[i] = array1[j];
+ array1[j] = temp1;
+
+ temp2 = array2[i];
+ array2[i] = array2[j];
+ array2[j] = temp2;
+
+ /* push the smaller part on top so that it is handled first; */
+ /* the larger part stays in slot p */
+ if(i-lst[p] < ust[p] - i) {
+ lst[p+1] = lst[p];
+ ust[p+1] = i - 1;
+ lst[p] = i + 1;
+ }
+ else {
+ lst[p+1] = i + 1;
+ ust[p+1] = ust[p];
+ ust[p] = i - 1;
+ }
+ p = p + 1;
+ }
+ }
+ return;
+
+}
+
+
+
+
+
+/*
+ * pair_align - k-tuple comparison of sequence seq_no (length l1) with the
+ * sequence whose k-tuple tables are currently in zzb/zzd (length l2).
+ * The tables for seq_no itself must be in zza/zzc.
+ *
+ * 1. count k-tuple matches per diagonal (displ[]),
+ * 2. sort the diagonals and flag those within `window` of the `signif`
+ *    best ones that have any match (slopes[]),
+ * 3. chain the k-tuple matches on flagged diagonals into fragments with
+ *    put_frag(); the head of the chain (maxsf) carries the best score in
+ *    accum[0][maxsf].
+ *
+ * vatend is set to 1 if the fragment table fills up (the function then
+ * returns early), to 0 on normal completion.
+ */
+static void pair_align(sint seq_no,sint l1,sint l2)
+{
+ sint pot[8],i,j,l,m,flag,limit,pos,tl1,vn1,vn2,flen,osptr,fs;
+ sint tv1,tv2,encrypt,subt1,subt2,rmndr;
+ char residue;
+
+ /* positional weights and number of distinct k-tuple codes, computed the */
+ /* same way as in make_n_ptrs()/make_p_ptrs() */
+ if(dnaflag) {
+ for(i=1;i<=ktup;++i)
+ pot[i] = (sint) pow((double)4,(double)(i-1));
+ limit = (sint) pow((double)4,(double)ktup);
+ }
+ else {
+ for (i=1;i<=ktup;i++)
+ pot[i] = (sint) pow((double)(max_aa+1),(double)(i-1));
+ limit = (sint) pow((double)(max_aa+1),(double)ktup);
+ }
+
+ /* number of diagonals; diagonal index used below is pos1 - pos2 + l2 */
+ tl1 = (l1+l2)-1;
+
+ for(i=1;i<=tl1;++i) {
+ slopes[i]=displ[i]=0;
+ diag_index[i] = i;
+ }
+
+
+/* increment diagonal score for each k_tuple match */
+
+ for(i=1;i<=limit;++i) {
+ vn1=zzc[i];
+ while(TRUE) {
+ if(!vn1) break;
+ vn2=zzd[i];
+ while(vn2 != 0) {
+ osptr=vn1-vn2+l2;
+ ++displ[osptr];
+ vn2=zzb[vn2];
+ }
+ vn1=zza[vn1];
+ }
+ }
+
+/* choose the top SIGNIF diagonals */
+
+ des_quick_sort(displ, diag_index, tl1);
+
+ j = tl1 - signif + 1;
+ if(j < 1) j = 1;
+
+/* flag all diagonals within WINDOW of a top diagonal */
+
+ for(i=tl1; i>=j; i--)
+ if(displ[i] > 0) {
+ pos = diag_index[i];
+ l = (1 >pos-window) ? 1 : pos-window;
+ m = (tl1<pos+window) ? tl1 : pos+window;
+ for(; l <= m; l++)
+ slopes[l] = 1;
+ }
+
+ /* displ[] is reused from here on: last fragment index per diagonal */
+ for(i=1; i<=tl1; i++) displ[i] = 0;
+
+
+ curr_frag=maxsf=0;
+
+ for(i=1;i<=(l1-ktup+1);++i) {
+ /* code the k-tuple starting at position i of seq_no */
+ /* NOTE(review): the residue range test uses max_aa even when dnaflag */
+ /* is set, whereas make_n_ptrs() tests against 4 - confirm intended */
+ encrypt=flag=0;
+ for(j=1;j<=ktup;++j) {
+ residue = seq_array[seq_no][i+j-1];
+ if((residue<0) || (residue>max_aa)) {
+ flag=TRUE;
+ break;
+ }
+ encrypt += ((residue)*pot[j]);
+ }
+ if(flag) continue;
+ ++encrypt;
+
+ /* walk all positions vn2 of the other sequence with the same k-tuple */
+ vn2=zzd[encrypt];
+
+ flag=FALSE;
+ while(TRUE) {
+ if(!vn2) {
+ flag=TRUE;
+ break;
+ }
+ osptr=i-vn2+l2;
+ /* ignore matches on diagonals that were not flagged above */
+ if(slopes[osptr]!=1) {
+ vn2=zzb[vn2];
+ continue;
+ }
+ flen=0;
+ fs=ktup;
+ next=maxsf;
+
+ /*
+ * A-loop
+ */
+
+ /* search the fragment chain for a fragment this match can follow; */
+ /* once one is chosen (or the chain is exhausted) next is 0 and the */
+ /* new fragment is stored with the score fs worked out so far */
+ while(TRUE) {
+ if(!next) {
+ ++curr_frag;
+ if(curr_frag>=2*max_aln_length) {
+ info("(Partial alignment)");
+ vatend=1;
+ return;
+ }
+ displ[osptr]=curr_frag;
+ put_frag(fs,i,vn2,flen);
+ }
+ else {
+ tv1=accum[1][next];
+ tv2=accum[2][next];
+ if(frag_rel_pos(i,vn2,tv1,tv2)) {
+ /* same diagonal: extend the score of that fragment */
+ if(i-vn2==accum[1][next]-accum[2][next]) {
+ if(i>accum[1][next]+(ktup-1))
+ fs=accum[0][next]+ktup;
+ else {
+ rmndr=i-accum[1][next];
+ fs=accum[0][next]+rmndr;
+ }
+ flen=next;
+ next=0;
+ continue;
+ }
+ /* different diagonal: compare staying on this diagonal (subt1) */
+ /* with jumping from the other fragment at a cost of wind_gap (subt2) */
+ else {
+ if(displ[osptr]==0)
+ subt1=ktup;
+ else {
+ if(i>accum[1][displ[osptr]]+(ktup-1))
+ subt1=accum[0][displ[osptr]]+ktup;
+ else {
+ rmndr=i-accum[1][displ[osptr]];
+ subt1=accum[0][displ[osptr]]+rmndr;
+ }
+ }
+ subt2=accum[0][next]-wind_gap+ktup;
+ if(subt2>subt1) {
+ flen=next;
+ fs=subt2;
+ }
+ else {
+ flen=displ[osptr];
+ fs=subt1;
+ }
+ next=0;
+ continue;
+ }
+ }
+ else {
+ next=accum[4][next];
+ continue;
+ }
+ }
+ break;
+ }
+ /*
+ * End of Aloop
+ */
+
+ vn2=zzb[vn2];
+ }
+ }
+ vatend=0;
+}
+
+/*
+ * show_pair - fast (k-tuple) pairwise comparison of sequences.
+ *
+ * For every i in istart+1..iend and j in jstart+2..jend the two sequences
+ * are compared with pair_align() and the resulting distance
+ * (100 - score)/100 is stored in tmat[i][j] and tmat[j][i].  When the
+ * global `percent` is set the score is first scaled to a percentage of
+ * the shorter sequence length.  A progress line is reported through
+ * info() for every pair.
+ *
+ * All work arrays are allocated on entry and released before returning.
+ */
+void show_pair(sint istart, sint iend, sint jstart, sint jend)
+{
+    sint i,j,dsr;
+    sint n_codes,table_len;
+    double calc_score;
+
+    if(dnaflag) {
+        ktup = dna_ktup;
+        window = dna_window;
+        signif = dna_signif;
+        wind_gap = dna_wind_gap;
+    }
+    else {
+        ktup = prot_ktup;
+        window = prot_window;
+        signif = prot_signif;
+        wind_gap = prot_wind_gap;
+    }
+
+    /* zzc/zzd are indexed by k-tuple code, 1..n_codes, in make_n_ptrs(),
+       make_p_ptrs() and pair_align().  n_codes does not depend on the
+       sequence lengths, so size the tables for whichever is larger;
+       max_aln_length+1 alone can be too small for short alignments. */
+    if(dnaflag)
+        n_codes = (sint) pow((double)4,(double)ktup);
+    else
+        n_codes = (sint) pow((double)(max_aa+1),(double)ktup);
+    table_len = (n_codes > max_aln_length) ? n_codes : max_aln_length;
+
+    accum = (sint **)ckalloc( 5*sizeof (sint *) );
+    for (i=0;i<5;i++)
+        accum[i] = (sint *) ckalloc((2*max_aln_length+1) * sizeof (sint) );
+
+    displ = (sint *) ckalloc( (2*max_aln_length +1) * sizeof (sint) );
+    slopes = (char *)ckalloc( (2*max_aln_length +1) * sizeof (char));
+    diag_index = (sint *) ckalloc( (2*max_aln_length +1) * sizeof (sint) );
+
+    /* zza/zzb are indexed by sequence position */
+    zza = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) );
+    zzb = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) );
+
+    zzc = (sint *)ckalloc( (table_len+1) * sizeof (sint) );
+    zzd = (sint *)ckalloc( (table_len+1) * sizeof (sint) );
+
+    fprintf(stdout,"\n\n");
+
+    for(i=istart+1;i<=iend;++i) {
+        if(dnaflag)
+            make_n_ptrs(zza,zzc,i,seqlen_array[i]);
+        else
+            make_p_ptrs(zza,zzc,i,seqlen_array[i]);
+        for(j=jstart+2;j<=jend;++j) {
+            if(dnaflag)
+                make_n_ptrs(zzb,zzd,j,seqlen_array[j]);
+            else
+                make_p_ptrs(zzb,zzd,j,seqlen_array[j]);
+            pair_align(i,seqlen_array[i],seqlen_array[j]);
+            if(!maxsf)
+                calc_score=0.0;     /* no fragment was found at all */
+            else {
+                calc_score=(double)accum[0][maxsf];
+                if(percent) {
+                    dsr=(seqlen_array[i]<seqlen_array[j]) ?
+                        seqlen_array[i] : seqlen_array[j];
+                    calc_score = (calc_score/(double)dsr) * 100.0;
+                }
+            }
+
+            /* store as a distance rather than a score */
+            tmat[i][j] = (100.0 - calc_score)/100.0;
+            tmat[j][i] = (100.0 - calc_score)/100.0;
+            if(calc_score>0.1)
+                info("Sequences (%d:%d) Aligned. Score: %lg",
+                     (pint)i,(pint)j,calc_score);
+            else
+                info("Sequences (%d:%d) Not Aligned",
+                     (pint)i,(pint)j);
+        }
+    }
+
+    for (i=0;i<5;i++)
+        accum[i]=ckfree((void *)accum[i]);
+    accum=ckfree((void *)accum);
+
+    displ=ckfree((void *)displ);
+    slopes=ckfree((void *)slopes);
+    diag_index=ckfree((void *)diag_index);
+
+    zza=ckfree((void *)zza);
+    zzb=ckfree((void *)zzb);
+    zzc=ckfree((void *)zzc);
+    zzd=ckfree((void *)zzd);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/trees.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/trees.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/trees.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,2166 @@
+/* File of phylogenetic tree calculating functions for CLUSTAL W */
+/* DES was here FEB. 1994 */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include "clustalw.h"
+#include "dayhoff.h" /* set correction for amino acid distances >= 75% */
+
+
+/*
+ * Prototypes
+ */
+Boolean transition(sint base1, sint base2);
+void tree_gap_delete(void);
+void distance_matrix_output(FILE *ofile);
+void nj_tree(char **tree_description, FILE *tree);
+void compare_tree(char **tree1, char **tree2, sint *hits, sint n);
+void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap);
+void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap);
+sint two_way_split(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
+sint two_way_split_nexus(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
+void print_tree(char **tree_description, FILE *tree, sint *totals);
+static Boolean is_ambiguity(char c);
+static void overspill_message(sint overspill,sint total_dists);
+
+
+/*
+ * Global variables
+ */
+
+extern sint max_names;
+
+extern double **tmat; /* general nxn array of reals; allocated from main */
+ /* this is used as a distance matrix */
+extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */
+extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
+extern Boolean kimura; /* Use correction for multiple substitutions */
+extern Boolean output_tree_clustal; /* clustal text output for trees */
+extern Boolean output_tree_phylip; /* phylip nested parentheses format */
+extern Boolean output_tree_distances; /* phylip distance matrix */
+extern Boolean output_tree_nexus; /* nexus format tree */
+extern Boolean output_pim; /* perc identity matrix output Ramu */
+
+extern sint bootstrap_format; /* bootstrap file format */
+extern Boolean empty; /* any sequences in memory? */
+extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */
+extern sint nseqs;
+extern sint max_aln_length;
+extern sint *seqlen_array; /* the lengths of the sequences */
+extern char **seq_array; /* the sequences */
+extern char **names; /* the seq. names */
+extern char seqname[]; /* name of input file */
+extern sint gap_pos1,gap_pos2;
+extern Boolean use_ambiguities;
+extern char *amino_acid_codes;
+
+static double *av;
+static double *left_branch, *right_branch;
+static double *save_left_branch, *save_right_branch;
+static sint *boot_totals;
+static sint *tkill;
+/*
+ The next line is a fossil from the days of using the cc ran()
+static int ran_factor;
+*/
+static sint *boot_positions;
+static FILE *phylip_phy_tree_file;
+static FILE *clustal_phy_tree_file;
+static FILE *distances_phy_tree_file;
+static FILE *nexus_phy_tree_file;
+static FILE *pim_file; /* Ramu */
+static Boolean verbose;
+static char *tree_gaps;
+static sint first_seq, last_seq;
+ /* array of weights; 1 for use this posn.; 0 don't */
+
+extern sint boot_ntrials; /* number of bootstrap trials */
+extern unsigned sint boot_ran_seed; /* random number generator seed */
+
+void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name)
+/*
+ Calculate a tree using the distances in the nseqs*nseqs array tmat.
+ This is the routine for getting the REAL trees after alignment.
+
+ One output file is opened for each output_* flag that is set.  Each
+ *_name argument is the file name to use; when it is empty the name is
+ obtained through open_output_file().  The distance matrix is computed
+ again for each selected output, then the neighbour-joining tree is built
+ and written in the selected formats.
+
+ Nothing is done (after an error message) when no alignment is loaded or
+ it has fewer than two sequences.  If one of the output files cannot be
+ opened, the files opened so far by this call are closed again and the
+ function returns without computing anything.
+*/
+{
+    char path[FILENAMELEN+1];
+    sint i, j;
+    sint overspill = 0;
+    sint total_dists;
+    static char **standard_tree;
+    static char **save_tree;
+    /* which output files this call has opened so far (for clean-up) */
+    Boolean clustal_open = FALSE;
+    Boolean phylip_open = FALSE;
+    Boolean dist_open = FALSE;
+    Boolean nexus_open = FALSE;
+
+    if(empty) {
+        error("You must load an alignment first");
+        return;
+    }
+
+    if(nseqs<2) {
+        error("Alignment has only %d sequences",(pint)nseqs);
+        return;
+    }
+    first_seq=1;
+    last_seq=nseqs;
+
+    get_path(seqname,path);
+
+    if(output_tree_clustal) {
+        if (clustal_name[0]!=EOS)
+            clustal_phy_tree_file = open_explicit_file(clustal_name);
+        else
+            clustal_phy_tree_file = open_output_file(
+                "\nEnter name for CLUSTAL tree output file ",path,
+                clustal_name,"nj");
+        if(clustal_phy_tree_file == NULL) goto open_failed;
+        clustal_open = TRUE;
+    }
+
+    if(output_tree_phylip) {
+        if (phylip_name[0]!=EOS)
+            phylip_phy_tree_file = open_explicit_file(phylip_name);
+        else
+            phylip_phy_tree_file = open_output_file(
+                "\nEnter name for PHYLIP tree output file ",path,
+                phylip_name,"ph");
+        if(phylip_phy_tree_file == NULL) goto open_failed;
+        phylip_open = TRUE;
+    }
+
+    if(output_tree_distances) {
+        if (dist_name[0]!=EOS)
+            distances_phy_tree_file = open_explicit_file(dist_name);
+        else
+            distances_phy_tree_file = open_output_file(
+                "\nEnter name for distance matrix output file ",path,
+                dist_name,"dst");
+        if(distances_phy_tree_file == NULL) goto open_failed;
+        dist_open = TRUE;
+    }
+
+    if(output_tree_nexus) {
+        if (nexus_name[0]!=EOS)
+            nexus_phy_tree_file = open_explicit_file(nexus_name);
+        else
+            nexus_phy_tree_file = open_output_file(
+                "\nEnter name for NEXUS tree output file ",path,
+                nexus_name,"tre");
+        if(nexus_phy_tree_file == NULL) goto open_failed;
+        nexus_open = TRUE;
+    }
+
+    if(output_pim) {
+        if (pim_name[0]!=EOS)
+            pim_file = open_explicit_file(pim_name);
+        else
+            pim_file = open_output_file(
+                "\nEnter name for % Identity matrix output file ",path,
+                pim_name,"pim");
+        if(pim_file == NULL) goto open_failed;
+    }
+
+    /* identity mapping of alignment columns (no resampling here) */
+    boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) );
+
+    for(j=1; j<=seqlen_array[first_seq]; ++j)
+        boot_positions[j] = j;
+
+    if(output_tree_clustal) {
+        verbose = TRUE; /* Turn on file output */
+        if(dnaflag)
+            overspill = dna_distance_matrix(clustal_phy_tree_file);
+        else
+            overspill = prot_distance_matrix(clustal_phy_tree_file);
+    }
+
+    if(output_tree_phylip) {
+        verbose = FALSE; /* Turn off file output */
+        if(dnaflag)
+            overspill = dna_distance_matrix(phylip_phy_tree_file);
+        else
+            overspill = prot_distance_matrix(phylip_phy_tree_file);
+    }
+
+    if(output_tree_nexus) {
+        verbose = FALSE; /* Turn off file output */
+        if(dnaflag)
+            overspill = dna_distance_matrix(nexus_phy_tree_file);
+        else
+            overspill = prot_distance_matrix(nexus_phy_tree_file);
+    }
+
+    if(output_pim) { /* Ramu */
+        verbose = FALSE; /* Turn off file output */
+        calc_percidentity(pim_file);    /* same call for DNA and protein */
+    }
+
+
+    if(output_tree_distances) {
+        verbose = FALSE; /* Turn off file output */
+        if(dnaflag)
+            overspill = dna_distance_matrix(distances_phy_tree_file);
+        else
+            overspill = prot_distance_matrix(distances_phy_tree_file);
+        distance_matrix_output(distances_phy_tree_file);
+    }
+
+/* check if any distances overflowed the distance corrections */
+    if ( overspill > 0 ) {
+        total_dists = (nseqs*(nseqs-1))/2;
+        overspill_message(overspill,total_dists);
+    }
+
+    if(output_tree_clustal) verbose = TRUE; /* Turn on file output */
+
+    standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
+    for(i=0; i<nseqs+1; i++)
+        standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
+    save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
+    for(i=0; i<nseqs+1; i++)
+        save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
+
+    if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
+        nj_tree(standard_tree,clustal_phy_tree_file);
+
+    /* keep a copy of the tree: it is restored after the phylip output
+       and before the nexus output */
+    for(i=1; i<nseqs+1; i++)
+        for(j=1; j<nseqs+1; j++)
+            save_tree[i][j] = standard_tree[i][j];
+
+    if(output_tree_phylip)
+        print_phylip_tree(standard_tree,phylip_phy_tree_file,0);
+
+    for(i=1; i<nseqs+1; i++)
+        for(j=1; j<nseqs+1; j++)
+            standard_tree[i][j] = save_tree[i][j];
+
+    if(output_tree_nexus)
+        print_nexus_tree(standard_tree,nexus_phy_tree_file,0);
+
+/*
+ print_tree(standard_tree,phy_tree_file);
+*/
+    tree_gaps=ckfree((void *)tree_gaps);
+    boot_positions=ckfree((void *)boot_positions);
+    if (left_branch != NULL) left_branch=ckfree((void *)left_branch);
+    if (right_branch != NULL) right_branch=ckfree((void *)right_branch);
+    if (tkill != NULL) tkill=ckfree((void *)tkill);
+    if (av != NULL) av=ckfree((void *)av);
+    for (i=0;i<nseqs+1;i++)
+        standard_tree[i]=ckfree((void *)standard_tree[i]);
+    standard_tree=ckfree((void *)standard_tree);
+
+    for (i=0;i<nseqs+1;i++)
+        save_tree[i]=ckfree((void *)save_tree[i]);
+    save_tree=ckfree((void *)save_tree);
+
+    if(output_tree_clustal) {
+        fclose(clustal_phy_tree_file);
+        info("Phylogenetic tree file created: [%s]",clustal_name);
+    }
+
+    if(output_tree_phylip) {
+        fclose(phylip_phy_tree_file);
+        info("Phylogenetic tree file created: [%s]",phylip_name);
+    }
+
+    if(output_tree_distances) {
+        fclose(distances_phy_tree_file);
+        info("Distance matrix file created: [%s]",dist_name);
+    }
+
+    if(output_tree_nexus) {
+        fclose(nexus_phy_tree_file);
+        info("Nexus tree file created: [%s]",nexus_name);
+    }
+
+    if(output_pim) {
+        fclose(pim_file);
+        info(" perc identity matrix file created: [%s]",pim_name);
+    }
+
+    return;
+
+open_failed:
+    /* an output file could not be opened: close the ones opened before it */
+    if(clustal_open) fclose(clustal_phy_tree_file);
+    if(phylip_open) fclose(phylip_phy_tree_file);
+    if(dist_open) fclose(distances_phy_tree_file);
+    if(nexus_open) fclose(nexus_phy_tree_file);
+}
+
+/*
+ * overspill_message - warn that `overspill` of the `total_dists` pairwise
+ * distances were out of range for the distance correction, and suggest
+ * what to do about it.  The text is passed to warning().
+ */
+static void overspill_message(sint overspill,sint total_dists)
+{
+    char err_mess[1024]="";
+
+    /* one format string, built from adjacent literals */
+    sprintf(err_mess,
+        "%d of the distances out of a total of %d"
+        "\n were out of range for the distance correction."
+        "\n"
+        "\n SUGGESTIONS: 1) remove the most distant sequences"
+        "\n or 2) use the PHYLIP package"
+        "\n or 3) turn off the correction."
+        "\n Note: Use option 3 with caution! With this degree"
+        "\n of divergence you will have great difficulty"
+        "\n getting robust and reliable trees."
+        "\n\n",
+        (pint)overspill,(pint)total_dists);
+    warning(err_mess);
+}
+
+
+
+Boolean transition(sint base1, sint base2) /* TRUE if transition; else FALSE */
+/*
+
+ The two arguments are base codes.  The pairs tested are 0 <--> 6
+ (commented as A <--> G) and 2 <--> 17 (commented as C <--> T), i.e. the
+ coding a,A = 0; c,C = 2; g,G = 6; t,T,u,U = 17.
+
+ Those two pairs, in either order, are transitions; every other
+ combination gives FALSE.
+
+*/
+{
+    sint lo, hi;
+
+    /* order the pair so that each transition needs only one test */
+    if(base1 <= base2) {
+        lo = base1;
+        hi = base2;
+    }
+    else {
+        lo = base2;
+        hi = base1;
+    }
+
+    if((lo == 0) && (hi == 6))
+        return TRUE; /* A <--> G */
+    if((lo == 2) && (hi == 17))
+        return TRUE; /* T <--> C */
+    return FALSE;
+}
+
+
+void tree_gap_delete(void)
+/* Flag all positions in the alignment that have a gap in ANY sequence.
+   A new tree_gaps array is allocated; for each position up to the length
+   of sequence first_seq, tree_gaps[pos] is 1 if any of the sequences
+   first_seq..last_seq holds gap_pos1 or gap_pos2 there, else 0. */
+{
+    sint col;
+    sint row;
+
+    tree_gaps = (char *)ckalloc( (max_aln_length+1) * sizeof (char) );
+
+    for(col=1; col<=seqlen_array[first_seq]; ++col) {
+        tree_gaps[col] = 0;
+        for(row=first_seq; row<=last_seq; ++row) {
+            if((seq_array[row][col] == gap_pos1) ||
+               (seq_array[row][col] == gap_pos2)) {
+                tree_gaps[col] = 1;
+                break;      /* one gap is enough to flag the column */
+            }
+        }
+    }
+
+}
+
+/*
+ * distance_matrix_output - write the distance matrix tmat to ofile:
+ * first the number of sequences, then one row per sequence starting with
+ * its name (padded to max_names) followed by the distances, with a line
+ * break after every eighth value except the last one of a row.
+ */
+void distance_matrix_output(FILE *ofile)
+{
+    sint row,col;
+    sint n_seqs = last_seq-first_seq+1;
+
+    fprintf(ofile,"%6d",(pint)last_seq-first_seq+1);
+    for(row=1;row<=n_seqs;row++) {
+        fprintf(ofile,"\n%-*s ",max_names,names[row]);
+        for(col=1;col<=n_seqs;col++) {
+            fprintf(ofile,"%6.3f ",tmat[row][col]);
+            /* wrap long rows, but never after the final value */
+            if(col % 8 == 0 && col != n_seqs) {
+                fprintf(ofile,"\n");
+                fprintf(ofile," ");
+            }
+        }
+    }
+}
+
+
+
+#ifdef ORIGINAL_NJ_TREE
+void nj_tree(char **tree_description, FILE *tree)
+{ /*
+   * Original (unoptimised) neighbor-joining implementation; this variant is
+   * compiled only when ORIGINAL_NJ_TREE is defined.
+   *
+   * Works on the global distance matrix tmat[][] for the indices
+   * 1..(last_seq-first_seq+1).  One pair is joined per cycle until three
+   * entries remain; those are resolved in a final "trichotomy" step.
+   *
+   * tree_description: row nc receives the grouping made in cycle nc
+   *                   (1 = member of the joined group, 0 = not).  The row
+   *                   written by the last cycle holds 1, 2 or 3 to mark
+   *                   the three final groups.
+   * tree:             stream that receives the cycle-by-cycle log when the
+   *                   global 'verbose' is set.
+   *
+   * Side effects: overwrites entries of tmat[][]; allocates the globals
+   * left_branch, right_branch, tkill and av with ckalloc() and does not
+   * release them here.  When exactly two sequences are selected the
+   * function returns early, before any of those allocations.
+   */
+ register int i;
+ sint l[4],nude,k;
+ sint nc,mini,minj,j,ii,jj;
+ double fnseqs,fnseqs2=0,sumd;
+ double diq,djq,dij,d2r,dr,dio,djo,da;
+ double tmin,total,dmin;
+ double bi,bj,b1,b2,b3,branch[4];
+ sint typei,typej; /* 0 = node; 1 = OTU */
+
+ fnseqs = (double)last_seq-first_seq+1; /* number of entries still to be joined */
+
+/*********************** First initialisation ***************************/
+
+ if(verbose) {
+ fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
+ fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
+ fprintf(tree," The Neighbor-joining Method:");
+ fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
+ fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
+ fprintf(tree,"\n\n This is an UNROOTED tree\n");
+ fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
+ }
+
+ if (fnseqs == 2) { /* two sequences: nothing to join, only log the single distance */
+ if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
+ return;
+ }
+
+ mini = minj = 0;
+
+ left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
+ av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
+
+ for(i=1;i<=last_seq-first_seq+1;++i)
+ {
+ tmat[i][i] = av[i] = 0.0;
+ tkill[i] = 0; /* 0 = entry still active, 1 = merged away */
+ }
+
+/*********************** Enter The Main Cycle ***************************/
+
+ /* for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { */ /**start main cycle**/
+ for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) {
+ sumd = 0.0; /* becomes the sum of all pairwise distances */
+ for(j=2; j<=last_seq-first_seq+1; ++j)
+ for(i=1; i<j; ++i) {
+ tmat[j][i] = tmat[i][j]; /* mirror the upper triangle into the lower one */
+ sumd = sumd + tmat[i][j];
+ }
+
+ tmin = 99999.0; /* starting value for the minimum search below */
+
+/*.................compute SMATij values and find the smallest one ........*/
+
+ for(jj=2; jj<=last_seq-first_seq+1; ++jj)
+ if(tkill[jj] != 1) /* skip entries that were merged away */
+ for(ii=1; ii<jj; ++ii)
+ if(tkill[ii] != 1) {
+ diq = djq = 0.0;
+
+ for(i=1; i<=last_seq-first_seq+1; ++i) { /* column sums for ii and jj */
+ diq = diq + tmat[i][ii];
+ djq = djq + tmat[i][jj];
+ }
+
+ dij = tmat[ii][jj];
+ d2r = diq + djq - (2.0*dij);
+ dr = sumd - dij -d2r;
+ fnseqs2 = fnseqs - 2.0;
+ total= d2r+ fnseqs2*dij +dr*2.0;
+ total= total / (2.0*fnseqs2);
+
+ if(total < tmin) { /* remember the pair with the smallest score so far */
+ tmin = total;
+ mini = ii;
+ minj = jj;
+ }
+ }
+
+
+/*.................compute branch lengths and print the results ........*/
+
+
+ dio = djo = 0.0;
+ for(i=1; i<=last_seq-first_seq+1; ++i) {
+ dio = dio + tmat[i][mini];
+ djo = djo + tmat[i][minj];
+ }
+
+ dmin = tmat[mini][minj];
+ dio = (dio - dmin) / fnseqs2;
+ djo = (djo - dmin) / fnseqs2;
+ bi = (dmin + dio - djo) * 0.5;
+ bj = dmin - bi;
+ bi = bi - av[mini];
+ bj = bj - av[minj];
+
+ if( av[mini] > 0.0 ) /* av[] is made non-zero only once an entry has been joined (see below) */
+ typei = 0;
+ else
+ typei = 1;
+ if( av[minj] > 0.0 )
+ typej = 0;
+ else
+ typej = 1;
+
+ if(verbose)
+ fprintf(tree,"\n Cycle%4d = ",(pint)nc);
+
+/*
+ Set branch lengths whose absolute value is below 0.0001 to zero.
+ NOTE(review): negative lengths of larger magnitude are left unchanged
+ by this test.
+*/ if( fabs(bi) < 0.0001) bi = 0.0;
+ if( fabs(bj) < 0.0001) bj = 0.0;
+
+ if(verbose) {
+ if(typei == 0)
+ fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
+ else
+ fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);
+
+ if(typej == 0)
+ fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
+ else
+ fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);
+
+ fprintf(tree,"\n");
+ }
+
+
+ left_branch[nc] = bi;
+ right_branch[nc] = bj;
+
+ for(i=1; i<=last_seq-first_seq+1; i++)
+ tree_description[nc][i] = 0;
+
+ if(typei == 0) { /* mini is a node: copy the members of the latest earlier row containing it */
+ for(i=nc-1; i>=1; i--)
+ if(tree_description[i][mini] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[i][j] == 1)
+ tree_description[nc][j] = 1;
+ break;
+ }
+ }
+ else
+ tree_description[nc][mini] = 1;
+
+ if(typej == 0) { /* same handling for minj */
+ for(i=nc-1; i>=1; i--)
+ if(tree_description[i][minj] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[i][j] == 1)
+ tree_description[nc][j] = 1;
+ break;
+ }
+ }
+ else
+ tree_description[nc][minj] = 1;
+
+
+/*
+ Here is where the -0.00005 branch lengths come from for 3 or more
+ identical seqs.
+*/
+/* if(dmin <= 0.0) dmin = 0.0001; */
+ if(dmin <= 0.0) dmin = 0.000001;
+ av[mini] = dmin * 0.5; /* always > 0 here, which marks mini as a node from now on */
+
+/*........................Re-initialisation................................*/
+
+ fnseqs = fnseqs - 1.0;
+ tkill[minj] = 1; /* minj is merged into mini */
+
+ for(j=1; j<=last_seq-first_seq+1; ++j)
+ if( tkill[j] != 1 ) {
+ da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5; /* distance of the joined entry to j */
+ if( (mini - j) < 0 )
+ tmat[mini][j] = da;
+ if( (mini - j) > 0)
+ tmat[j][mini] = da;
+ }
+
+ for(j=1; j<=last_seq-first_seq+1; ++j)
+ tmat[minj][j] = tmat[j][minj] = 0.0;
+
+
+/****/ } /**end main cycle**/
+
+/******************************Last Cycle (3 Seqs. left)********************/
+
+ nude = 1;
+
+ for(i=1; i<=last_seq-first_seq+1; ++i) /* collect the indices of the remaining entries in l[1..] */
+ if( tkill[i] != 1 ) {
+ l[nude] = i;
+ nude = nude + 1;
+ }
+
+ b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
+ b2 = tmat[l[1]][l[2]] - b1;
+ b3 = tmat[l[1]][l[3]] - b1;
+
+ branch[1] = b1 - av[l[1]];
+ branch[2] = b2 - av[l[2]];
+ branch[3] = b3 - av[l[3]];
+
+/* Reset tiny negative and positive branch lengths to zero */
+ if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
+ if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
+ if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
+
+ left_branch[last_seq-first_seq+1-2] = branch[1];
+ left_branch[last_seq-first_seq+1-1] = branch[2];
+ left_branch[last_seq-first_seq+1] = branch[3];
+
+ for(i=1; i<=last_seq-first_seq+1; i++)
+ tree_description[last_seq-first_seq+1-2][i] = 0;
+
+ if(verbose)
+ fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);
+
+ for(i=1; i<=3; ++i) { /* label the members of each final group with i (1, 2 or 3) */
+ if( av[l[i]] > 0.0) {
+ if(verbose)
+ fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
+ for(k=last_seq-first_seq+1-3; k>=1; k--)
+ if(tree_description[k][l[i]] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[k][j] == 1)
+ tree_description[last_seq-first_seq+1-2][j] = i;
+ break;
+ }
+ }
+ else {
+ if(verbose)
+ fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
+ tree_description[last_seq-first_seq+1-2][l[i]] = i;
+ }
+ if(i < 3) {
+ if(verbose)
+ fprintf(tree,"joins");
+ }
+ }
+
+ if(verbose)
+ fprintf(tree,"\n");
+
+}
+
+#else /* ORIGINAL_NJ_TREE */
+
+/*
+ * Default nj_tree(): used when ORIGINAL_NJ_TREE is not defined.  It keeps
+ * the public name and signature and forwards both arguments unchanged to
+ * fast_nj_tree().
+ */
+void nj_tree(char **tree_description, FILE *tree) {
+    /*
+     * Declare the callee with a full prototype so the compiler checks the
+     * arguments; an empty parameter list "()" leaves them unchecked.
+     */
+    void fast_nj_tree(char **tree_description, FILE *tree);
+
+    fast_nj_tree(tree_description, tree);
+}
+
+
+/****************************************************************************
+ * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
+ * written by Tadashi Koike
+ * (takoike at genes.nig.ac.jp)
+ *******************
+ * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
+ * and use again and again.
+ *
+ * In the main cycle, these are calculated again and again :
+ * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1),
+ * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1),
+ * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
+ * djo = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
+ * // 'last_seq' and 'first_seq' are both constant values //
+ * and the result of above calculations is always same until
+ * a best pair of neighbour nodes is joined.
+ *
+ * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
+ * (n:1 to last_seq-first_seq+1)) and store it to array, before
+ * beginning to find a best pair of neighbour nodes, and after that
+ * we use them again and again.
+ *
+ * tmat[i][j]
+ * 1 2 3 4 5
+ * +---+---+---+---+---+
+ * 1 | | | | | |
+ * +---+---+---+---+---+
+ * 2 | | | | | | 1) calculate sum of tmat[n][i]
+ * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1)
+ * 3 | | | | | | 2) store that sum value to sum[i]
+ * +---+---+---+---+---+
+ * 4 | | | | | | 3) use sum[i] during finding a best
+ * +---+---+---+---+---+ pair of neighbour nodes.
+ * 5 | | | | | |
+ * +---+---+---+---+---+
+ * | | | | |
+ * V V V V V Calculate sum , and store it to sum[i]
+ * +---+---+---+---+---+
+ * sum[i] | | | | | |
+ * +---+---+---+---+---+
+ *
+ * At this time, we thought that we use upper triangle of the matrix
+ * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
+ * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
+ * of sum[i] for storing the sum value.
+ *
+ * tmat[i][j]
+ * 1 2 3 4 5 sum_cols[i]
+ * +---+---+---+---+---+ +---+
+ * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5]
+ * + - +---+---+---+---+ +---+
+ * 2 | # | # | # | --> | | ... sum of tmat[2][3..5]
+ * + - + - +---+---+---+ +---+
+ * 3 | # | # | --> | | ... sum of tmat[3][4..5]
+ * + - + - + - +---+---+ +---+
+ * 4 | # | --> | | ... sum of tmat[4][5]
+ * + - + - + - + - +---+ +---+
+ * 5 | --> | | ... zero
+ * + - + - + - + - + - + +---+
+ * | | | | |
+ * V V V V V Calculate sum, store to sum_rows[i]
+ * +---+---+---+---+---+
+ * sum_rows[i] | | | | | |
+ * +---+---+---+---+---+
+ * | | | | |
+ * | | | | +----- sum of tmat[1..4][5]
+ * | | | +--------- sum of tmat[1..3][4]
+ * | | +------------- sum of tmat[1..2][3]
+ * | +----------------- sum of tmat[1][2]
+ * +--------------------- zero
+ *
+ * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
+ *
+ *******************
+ * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
+ * tkill[i] flag array.
+ *
+ * In the original logic, nodes that become invalid ("killed") after a
+ * join are tracked with the tkill[i] flag array (set to 1 when killed).
+ * With that method the search loops still visit every killed node only
+ * to skip it, which becomes increasingly wasteful in the later cycles
+ * of finding a best pair of neighbor nodes.
+ *
+ * So, we thought that we managed valid nodes by using a chain list
+ * as below:
+ *
+ * 1) declare the list structure.
+ * struct {
+ * sint n; // entry number of node.
+ * void *prev; // pointer to previous entry.
+ * void *next; // pointer to next entry.
+ * }
+ * 2) construct a valid node list.
+ *
+ * +-----+ +-----+ +-----+ +-----+ +-----+
+ * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
+ * | 0 | | 1 | | 2 | | 3 | | n |
+ * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
+ * +-----+ +-----+ +-----+ +-----+ +-----+
+ *
+ * 3) when finding a best pair of neighbor nodes, we use
+ * this chain list as loop counter.
+ *
+ * 4) If an entry was killed by node-joining, this chain list is
+ * modified to remove that entry.
+ *
+ * EX) remove the entry No 2.
+ * +-----+ +-----+ +-----+ +-----+
+ * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
+ * | 0 | | 1 | | 3 | | n |
+ * | next|--->| next|-------------->| next|- - - ->| next|->NULL
+ * +-----+ +-----+ +-----+ +-----+
+ * +-----+
+ * NULL<-|prev |
+ * | 2 |
+ * | next|->NULL
+ * +-----+
+ *
+ * With this method the search for a best pair of neighbor nodes gets
+ * faster in the later cycles, because removed entries are never visited.
+ *
+ *******************
+ * <IMPROVEMENT 3> : Cut the frequency of division.
+ *
+ * In the original main cycle, 'total' is divided by (2.0*fnseqs2) before
+ * it is compared with 'tmin'. If N nodes are available, that division
+ * happens on the order of (N*(N-1))/2 times per cycle.
+ *
+ * Comparing total/(2.0*fnseqs2) with tmin gives the same result as
+ * comparing total with (tmin*2.0*fnseqs2), because (2.0*fnseqs2) is
+ * positive. The product (tmin*2.0*fnseqs2) needs to be computed only once
+ * per cycle, so we stop dividing 'total' and scale the initial 'tmin' by
+ * (2.0*fnseqs2) instead.
+ *
+ *******************
+ * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
+ *
+ * We transform an equation of calculating 'total' in the main cycle.
+ *
+ */
+
+
+void fast_nj_tree(char **tree_description, FILE *tree)
+{ /*
+   * Neighbor-joining tree construction using cached sums and a linked list
+   * of the entries that are still valid.
+   *
+   * Works on the upper triangle of the global distance matrix tmat[][] for
+   * the indices 1..(last_seq-first_seq+1).  One pair is joined per cycle
+   * until three entries remain; those are resolved in a final "trichotomy"
+   * step.
+   *
+   * tree_description: row nc receives the grouping made in cycle nc
+   *                   (1 = member of the joined group, 0 = not).  The row
+   *                   written by the last cycle holds 1, 2 or 3 to mark
+   *                   the three final groups.
+   * tree:             stream that receives the cycle-by-cycle log when the
+   *                   global 'verbose' is set.
+   *
+   * Side effects: overwrites entries of tmat[][]; allocates the globals
+   * left_branch, right_branch, tkill and av with ckalloc() and does not
+   * release them here.  The work arrays sum_cols, sum_rows, join and
+   * tvalid are local and are released before the normal return.  When
+   * exactly two sequences are selected the function returns early, before
+   * any allocation.
+   *
+   * tkill[] is still maintained but is not read in this function; the
+   * locals d2r and dr are declared but not used here.
+   */
+ register int i;
+ sint l[4],nude,k;
+ sint nc,mini,minj,j,ii,jj;
+ double fnseqs,fnseqs2=0,sumd;
+ double diq,djq,dij,d2r,dr,dio,djo,da;
+ double tmin,total,dmin;
+ double bi,bj,b1,b2,b3,branch[4];
+ sint typei,typej; /* 0 = node; 1 = OTU */
+
+ /* IMPROVEMENT 1, STEP 0 : declare variables */
+ double *sum_cols, *sum_rows, *join;
+
+ /* IMPROVEMENT 2, STEP 0 : declare variables */
+ sint loop_limit;
+ typedef struct _ValidNodeID {
+ sint n;
+ struct _ValidNodeID *prev;
+ struct _ValidNodeID *next;
+ } ValidNodeID;
+ ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
+
+ /*
+ * correspondence of the loop counter variables.
+ * i .. lpi->n, ii .. lpii->n
+ * j .. lpj->n, jj .. lpjj->n
+ */
+
+ fnseqs = (double)last_seq-first_seq+1; /* number of entries still to be joined */
+
+/*********************** First initialisation ***************************/
+
+ if(verbose) {
+ fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
+ fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
+ fprintf(tree," The Neighbor-joining Method:");
+ fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
+ fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
+ fprintf(tree,"\n\n This is an UNROOTED tree\n");
+ fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
+ }
+
+ if (fnseqs == 2) { /* two sequences: nothing to join, only log the single distance */
+ if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
+ return;
+ }
+
+ mini = minj = 0;
+
+ left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
+ av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
+
+ /* IMPROVEMENT 1, STEP 1 : Allocate memory */
+ sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) );
+ sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) );
+ join = (double *) ckalloc( (nseqs+1) * sizeof (double) );
+
+ /* IMPROVEMENT 2, STEP 1 : Allocate memory */
+ tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
+ /* tvalid[0] is a special entry in the array: it is the header of the valid entry list */
+ tvalid[0].n = 0;
+ tvalid[0].prev = NULL;
+ tvalid[0].next = &tvalid[1];
+
+ /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
+ for(i=1, loop_limit = last_seq-first_seq+1,
+ lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
+ i<=loop_limit ;
+ ++i, ++lpi, ++lp_prev, ++lp_next)
+ {
+ tmat[i][i] = av[i] = 0.0;
+ tkill[i] = 0;
+ lpi->n = i;
+ lpi->prev = lp_prev;
+ lpi->next = lp_next;
+
+ /* IMPROVEMENT 1, STEP 2 : Initialize arrays */
+ sum_cols[i] = sum_rows[i] = join[i] = 0.0;
+ }
+ tvalid[loop_limit].next = NULL; /* terminate the list at the last entry */
+
+ /*
+ * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
+ * is sequence[i] to others.
+ */
+ sumd = 0.0;
+ for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
+ double tmp_sum = 0.0;
+ j = lpj->n;
+ /* calculate sum_rows[j] : sum of tmat[i][j] over listed i < j */
+ for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) {
+ i = lpi->n;
+ tmp_sum += tmat[i][j];
+ /* tmat[j][i] = tmat[i][j]; */
+ }
+ sum_rows[j] = tmp_sum;
+
+ tmp_sum = 0.0;
+ /* Set lpi to that lpi->n is greater than j */
+ if ((lpi != NULL) && (lpi->n == j)) {
+ lpi = lpi->next;
+ }
+ /* calculate sum_cols[j] : sum of tmat[j][i] over listed i > j */
+ for( ; lpi!=NULL ; lpi = lpi->next) {
+ i = lpi->n;
+ tmp_sum += tmat[j][i];
+ /* tmat[i][j] = tmat[j][i]; */
+ }
+ sum_cols[j] = tmp_sum;
+ }
+
+/*********************** Enter The Main Cycle ***************************/
+
+ for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) {
+
+ sumd = 0.0;
+ /* IMPROVEMENT 1, STEP 4 : use sum value */
+ for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
+ sumd += sum_cols[lpj->n];
+ }
+
+ /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
+ fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */
+ tmin = 99999.0 * 2.0 * fnseqs2;
+
+
+/*.................compute SMATij values and find the smallest one ........*/
+
+ mini = minj = 0;
+
+ /* jj must starts at least 2 */
+ if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) {
+ lpjj = tvalid[0].next->next;
+ } else {
+ lpjj = tvalid[0].next;
+ }
+
+ for( ; lpjj != NULL; lpjj = lpjj->next) {
+ jj = lpjj->n;
+ for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) {
+ ii = lpii->n;
+ diq = djq = 0.0;
+
+ /* IMPROVEMENT 1, STEP 4 : use sum value */
+ diq = sum_cols[ii] + sum_rows[ii];
+ djq = sum_cols[jj] + sum_rows[jj];
+ /*
+ * always ii < jj in this point. Use upper
+ * triangle of score matrix.
+ */
+ dij = tmat[ii][jj];
+
+ /*
+ * IMPROVEMENT 3, STEP 1 : fnseqs2 is
+ * already calculated.
+ */
+ /* fnseqs2 = fnseqs - 2.0 */
+
+ /* IMPROVEMENT 4 : transform the equation */
+ /*-------------------------------------------------------------------*
+ * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' *
+ * total = d2r + fnseq2*dij + 2.0*dr *
+ * = d2r + fnseq2*dij + 2(sumd - dij - d2r) *
+ * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r *
+ * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r *
+ * = fnseq2*dij + 2*sumd - 2*dij - d2r *
+ * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) *
+ * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij *
+ * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq *
+ * = fnseq2*dij + 2*sumd - diq - djq *
+ *-------------------------------------------------------------------*/
+ total = fnseqs2*dij + 2.0*sumd - diq - djq;
+
+ /*
+ * IMPROVEMENT 3, STEP 2 : abbreviate
+ * the division on comparison between
+ * total and tmin.
+ */
+ /* total = total / (2.0*fnseqs2); */
+
+ if(total < tmin) { /* remember the pair with the smallest score so far */
+ tmin = total;
+ mini = ii;
+ minj = jj;
+ }
+ }
+ }
+
+ /* MEMO: always ii < jj in above loop, so mini < minj */
+
+/*.................compute branch lengths and print the results ........*/
+
+
+ dio = djo = 0.0;
+
+ /* IMPROVEMENT 1, STEP 4 : use sum value */
+ dio = sum_cols[mini] + sum_rows[mini];
+ djo = sum_cols[minj] + sum_rows[minj];
+
+ dmin = tmat[mini][minj];
+ dio = (dio - dmin) / fnseqs2;
+ djo = (djo - dmin) / fnseqs2;
+ bi = (dmin + dio - djo) * 0.5;
+ bj = dmin - bi;
+ bi = bi - av[mini];
+ bj = bj - av[minj];
+
+ if( av[mini] > 0.0 ) /* av[] is made non-zero only once an entry has been joined (see below) */
+ typei = 0;
+ else
+ typei = 1;
+ if( av[minj] > 0.0 )
+ typej = 0;
+ else
+ typej = 1;
+
+ if(verbose)
+ fprintf(tree,"\n Cycle%4d = ",(pint)nc);
+
+/*
+ Set branch lengths whose absolute value is below 0.0001 to zero.
+ NOTE(review): negative lengths of larger magnitude are left unchanged
+ by this test.
+*/ if( fabs(bi) < 0.0001) bi = 0.0;
+ if( fabs(bj) < 0.0001) bj = 0.0;
+
+ if(verbose) {
+ if(typei == 0)
+ fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
+ else
+ fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);
+
+ if(typej == 0)
+ fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
+ else
+ fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);
+
+ fprintf(tree,"\n");
+ }
+
+
+ left_branch[nc] = bi;
+ right_branch[nc] = bj;
+
+ for(i=1; i<=last_seq-first_seq+1; i++)
+ tree_description[nc][i] = 0;
+
+ if(typei == 0) { /* mini is a node: copy the members of the latest earlier row containing it */
+ for(i=nc-1; i>=1; i--)
+ if(tree_description[i][mini] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[i][j] == 1)
+ tree_description[nc][j] = 1;
+ break;
+ }
+ }
+ else
+ tree_description[nc][mini] = 1;
+
+ if(typej == 0) { /* same handling for minj */
+ for(i=nc-1; i>=1; i--)
+ if(tree_description[i][minj] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[i][j] == 1)
+ tree_description[nc][j] = 1;
+ break;
+ }
+ }
+ else
+ tree_description[nc][minj] = 1;
+
+
+/*
+ Here is where the -0.00005 branch lengths come from for 3 or more
+ identical seqs.
+*/
+/* if(dmin <= 0.0) dmin = 0.0001; */
+ if(dmin <= 0.0) dmin = 0.000001;
+ av[mini] = dmin * 0.5; /* always > 0 here, which marks mini as a node from now on */
+
+/*........................Re-initialisation................................*/
+
+ fnseqs = fnseqs - 1.0;
+ tkill[minj] = 1;
+
+ /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
+ /* [ Before ]
+ * +---------+ +---------+ +---------+
+ * |prev |<-------|prev |<-------|prev |<---
+ * | n | | n(=minj)| | n |
+ * | next|------->| next|------->| next|----
+ * +---------+ +---------+ +---------+
+ *
+ * [ After ]
+ * +---------+ +---------+
+ * |prev |<--------------------------|prev |<---
+ * | n | | n |
+ * | next|-------------------------->| next|----
+ * +---------+ +---------+
+ * +---------+
+ * NULL---|prev |
+ * | n(=minj)|
+ * | next|---NULL
+ * +---------+
+ */
+ (tvalid[minj].prev)->next = tvalid[minj].next;
+ if (tvalid[minj].next != NULL) {
+ (tvalid[minj].next)->prev = tvalid[minj].prev;
+ }
+ tvalid[minj].prev = tvalid[minj].next = NULL;
+
+ /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
+ for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) {
+ double tmp_di = 0.0;
+ double tmp_dj = 0.0;
+ j = lpj->n;
+
+ /*
+ * subtract a score value related with 'minj' from
+ * sum arrays .
+ */
+ if (j < minj) {
+ tmp_dj = tmat[j][minj];
+ sum_cols[j] -= tmp_dj;
+ } else if (j > minj) {
+ tmp_dj = tmat[minj][j];
+ sum_rows[j] -= tmp_dj;
+ } /* nothing to do when j is equal to minj. */
+
+
+ /*
+ * subtract a score value related with 'mini' from
+ * sum arrays .
+ */
+ if (j < mini) {
+ tmp_di = tmat[j][mini];
+ sum_cols[j] -= tmp_di;
+ } else if (j > mini) {
+ tmp_di = tmat[mini][j];
+ sum_rows[j] -= tmp_di;
+ } /* nothing to do when j is equal to mini. */
+
+ /*
+ * calculate a score value of the new inner node.
+ * then, store it temporary to join[] array.
+ */
+ join[j] = (tmp_dj + tmp_di) * 0.5;
+ }
+
+ /*
+ * 1)
+ * Set the score values (stored in join[]) into the matrix,
+ * row/column position is 'mini'.
+ * 2)
+ * Add a score value of the new inner node to sum arrays.
+ */
+ for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) {
+ j = lpj->n;
+ if (j < mini) {
+ tmat[j][mini] = join[j];
+ sum_cols[j] += join[j];
+ } else if (j > mini) {
+ tmat[mini][j] = join[j];
+ sum_rows[j] += join[j];
+ } /* nothing to do when j is equal to mini. */
+ }
+
+ /* Re-calculate sum_rows[mini],sum_cols[mini]. */
+ sum_cols[mini] = sum_rows[mini] = 0.0;
+
+ /* calculate sum_rows[mini] */
+ da = 0.0;
+ for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) {
+ da += join[lpj->n];
+ }
+ sum_rows[mini] = da;
+
+ /* skip if 'lpj->n' is equal to 'mini' */
+ if ((lpj != NULL) && (lpj->n == mini)) {
+ lpj = lpj->next;
+ }
+
+ /* calculate sum_cols[mini] */
+ da = 0.0;
+ for( ; lpj != NULL; lpj = lpj->next) {
+ da += join[lpj->n];
+ }
+ sum_cols[mini] = da;
+
+ /*
+ * Clean up sum_rows[minj], sum_cols[minj] and score matrix
+ * related with 'minj'.
+ */
+ sum_cols[minj] = sum_rows[minj] = 0.0;
+ for(j=1; j<=last_seq-first_seq+1; ++j)
+ tmat[minj][j] = tmat[j][minj] = join[j] = 0.0;
+
+
+/****/ } /**end main cycle**/
+
+/******************************Last Cycle (3 Seqs. left)********************/
+
+ nude = 1;
+
+ for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) { /* collect the indices still in the list in l[1..] */
+ l[nude] = lpi->n;
+ ++nude;
+ }
+
+ b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
+ b2 = tmat[l[1]][l[2]] - b1;
+ b3 = tmat[l[1]][l[3]] - b1;
+
+ branch[1] = b1 - av[l[1]];
+ branch[2] = b2 - av[l[2]];
+ branch[3] = b3 - av[l[3]];
+
+/* Reset tiny negative and positive branch lengths to zero */
+ if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
+ if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
+ if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
+
+ left_branch[last_seq-first_seq+1-2] = branch[1];
+ left_branch[last_seq-first_seq+1-1] = branch[2];
+ left_branch[last_seq-first_seq+1] = branch[3];
+
+ for(i=1; i<=last_seq-first_seq+1; i++)
+ tree_description[last_seq-first_seq+1-2][i] = 0;
+
+ if(verbose)
+ fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);
+
+ for(i=1; i<=3; ++i) { /* label the members of each final group with i (1, 2 or 3) */
+ if( av[l[i]] > 0.0) {
+ if(verbose)
+ fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
+ for(k=last_seq-first_seq+1-3; k>=1; k--)
+ if(tree_description[k][l[i]] == 1) {
+ for(j=1; j<=last_seq-first_seq+1; j++)
+ if(tree_description[k][j] == 1)
+ tree_description[last_seq-first_seq+1-2][j] = i;
+ break;
+ }
+ }
+ else {
+ if(verbose)
+ fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
+ tree_description[last_seq-first_seq+1-2][l[i]] = i;
+ }
+ if(i < 3) {
+ if(verbose)
+ fprintf(tree,"joins");
+ }
+ }
+
+ if(verbose)
+ fprintf(tree,"\n");
+
+
+ /* IMPROVEMENT 1, STEP 6 : release memory area */
+ ckfree(sum_cols);
+ ckfree(sum_rows);
+ ckfree(join);
+
+ /* IMPROVEMENT 2, STEP 4 : release memory area */
+ ckfree(tvalid);
+
+}
+#endif /* ORIGINAL_NJ_TREE */
+
+
+
+void bootstrap_tree(char *phylip_name,char *clustal_name, char *nexus_name)
+{
+ sint i,j;
+ int ranno;
+ char path[MAXLINE+1];
+ char dummy[10];
+ char err_mess[1024];
+ static char **sample_tree;
+ static char **standard_tree;
+ static char **save_tree;
+ sint total_dists, overspill = 0, total_overspill = 0;
+ sint nfails = 0;
+
+ if(empty) {
+ error("You must load an alignment first");
+ return;
+ }
+
+ if(nseqs<4) {
+ error("Alignment has only %d sequences",nseqs);
+ return;
+ }
+
+ if(!output_tree_clustal && !output_tree_phylip && !output_tree_nexus) {
+ error("You must select either clustal or phylip or nexus tree output format");
+ return;
+ }
+ get_path(seqname, path);
+
+ if (output_tree_clustal) {
+ if (clustal_name[0]!=EOS) {
+ if((clustal_phy_tree_file = open_explicit_file(
+ clustal_name))==NULL) return;
+ }
+ else {
+ if((clustal_phy_tree_file = open_output_file(
+ "\nEnter name for bootstrap output file ",path,
+ clustal_name,"njb")) == NULL) return;
+ }
+ }
+
+ first_seq=1;
+ last_seq=nseqs;
+
+ if (output_tree_phylip) {
+ if (phylip_name[0]!=EOS) {
+ if((phylip_phy_tree_file = open_explicit_file(
+ phylip_name))==NULL) return;
+ }
+ else {
+ if((phylip_phy_tree_file = open_output_file(
+ "\nEnter name for bootstrap output file ",path,
+ phylip_name,"phb")) == NULL) return;
+ }
+ }
+
+ if (output_tree_nexus) {
+ if (nexus_name[0]!=EOS) {
+ if((nexus_phy_tree_file = open_explicit_file(
+ nexus_name))==NULL) return;
+ }
+ else {
+ if((nexus_phy_tree_file = open_output_file(
+ "\nEnter name for bootstrap output file ",path,
+ nexus_name,"treb")) == NULL) return;
+ }
+ }
+
+ boot_totals = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
+ for(i=0;i<nseqs+1;i++)
+ boot_totals[i]=0;
+
+ boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) );
+
+ for(j=1; j<=seqlen_array[first_seq]; ++j) /* First select all positions for */
+ boot_positions[j] = j; /* the "standard" tree */
+
+ if(output_tree_clustal) {
+ verbose = TRUE; /* Turn on file output */
+ if(dnaflag)
+ overspill = dna_distance_matrix(clustal_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(clustal_phy_tree_file);
+ }
+
+ if(output_tree_phylip) {
+ verbose = FALSE; /* Turn off file output */
+ if(dnaflag)
+ overspill = dna_distance_matrix(phylip_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(phylip_phy_tree_file);
+ }
+
+ if(output_tree_nexus) {
+ verbose = FALSE; /* Turn off file output */
+ if(dnaflag)
+ overspill = dna_distance_matrix(nexus_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(nexus_phy_tree_file);
+ }
+
+/* check if any distances overflowed the distance corrections */
+ if ( overspill > 0 ) {
+ total_dists = (nseqs*(nseqs-1))/2;
+ overspill_message(overspill,total_dists);
+ }
+
+ tree_gaps=ckfree((void *)tree_gaps);
+
+ if (output_tree_clustal) verbose = TRUE; /* Turn on screen output */
+
+ standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
+ for(i=0; i<nseqs+1; i++)
+ standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
+
+/* compute the standard tree */
+
+ if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
+ nj_tree(standard_tree,clustal_phy_tree_file);
+
+ if (output_tree_clustal)
+ fprintf(clustal_phy_tree_file,"\n\n\t\t\tBootstrap Confidence Limits\n\n");
+
+/* save the left_branch and right_branch for phylip output */
+ save_left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ save_right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ for (i=1;i<=nseqs;i++) {
+ save_left_branch[i] = left_branch[i];
+ save_right_branch[i] = right_branch[i];
+ }
+/*
+ The next line is a fossil from the days of using the cc ran()
+ ran_factor = RAND_MAX / seqlen_array[first_seq];
+*/
+
+ if(usemenu)
+ boot_ran_seed =
+getint("\n\nEnter seed no. for random number generator ",1,1000,boot_ran_seed);
+
+/* do not use the native cc ran()
+ srand(boot_ran_seed);
+*/
+ addrandinit((unsigned long) boot_ran_seed);
+
+ if (output_tree_clustal)
+ fprintf(clustal_phy_tree_file,"\n Random number generator seed = %7u\n",
+ boot_ran_seed);
+
+ if(usemenu)
+ boot_ntrials =
+getint("\n\nEnter number of bootstrap trials ",1,10000,boot_ntrials);
+
+ if (output_tree_clustal) {
+ fprintf(clustal_phy_tree_file,"\n Number of bootstrap trials = %7d\n",
+ (pint)boot_ntrials);
+
+ fprintf(clustal_phy_tree_file,
+ "\n\n Diagrammatic representation of the above tree: \n");
+ fprintf(clustal_phy_tree_file,"\n Each row represents 1 tree cycle;");
+ fprintf(clustal_phy_tree_file," defining 2 groups.\n");
+ fprintf(clustal_phy_tree_file,"\n Each column is 1 sequence; ");
+ fprintf(clustal_phy_tree_file,"the stars in each line show 1 group; ");
+ fprintf(clustal_phy_tree_file,"\n the dots show the other\n");
+ fprintf(clustal_phy_tree_file,"\n Numbers show occurences in bootstrap samples.");
+ }
+/*
+ print_tree(standard_tree, clustal_phy_tree_file, boot_totals);
+*/
+ verbose = FALSE; /* Turn OFF screen output */
+
+ left_branch=ckfree((void *)left_branch);
+ right_branch=ckfree((void *)right_branch);
+ tkill=ckfree((void *)tkill);
+ av=ckfree((void *)av);
+
+ sample_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
+ for(i=0; i<nseqs+1; i++)
+ sample_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
+
+ if (usemenu)
+ fprintf(stdout,"\n\nEach dot represents 10 trials\n\n");
+ total_overspill = 0;
+ nfails = 0;
+ for(i=1; i<=boot_ntrials; ++i) {
+ for(j=1; j<=seqlen_array[first_seq]; ++j) { /* select alignment */
+ /* positions for */
+ ranno = addrand( (unsigned long) seqlen_array[1]) + 1;
+ boot_positions[j] = ranno; /* bootstrap sample */
+ }
+ if(output_tree_clustal) {
+ if(dnaflag)
+ overspill = dna_distance_matrix(clustal_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(clustal_phy_tree_file);
+ }
+
+ if(output_tree_phylip) {
+ if(dnaflag)
+ overspill = dna_distance_matrix(phylip_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(phylip_phy_tree_file);
+ }
+
+ if(output_tree_nexus) {
+ if(dnaflag)
+ overspill = dna_distance_matrix(nexus_phy_tree_file);
+ else
+ overspill = prot_distance_matrix(nexus_phy_tree_file);
+ }
+
+ if( overspill > 0) {
+ total_overspill = total_overspill + overspill;
+ nfails++;
+ }
+
+ tree_gaps=ckfree((void *)tree_gaps);
+
+ if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
+ nj_tree(sample_tree,clustal_phy_tree_file);
+
+ left_branch=ckfree((void *)left_branch);
+ right_branch=ckfree((void *)right_branch);
+ tkill=ckfree((void *)tkill);
+ av=ckfree((void *)av);
+
+ compare_tree(standard_tree, sample_tree, boot_totals, last_seq-first_seq+1);
+ if (usemenu) {
+ if(i % 10 == 0) fprintf(stdout,".");
+ if(i % 100 == 0) fprintf(stdout,"\n");
+ }
+ }
+
+/* check if any distances overflowed the distance corrections */
+ if ( nfails > 0 ) {
+ total_dists = (nseqs*(nseqs-1))/2;
+ fprintf(stdout,"\n");
+ fprintf(stdout,"\n WARNING: %ld of the distances out of a total of %ld times %ld",
+ (long)total_overspill,(long)total_dists,(long)boot_ntrials);
+ fprintf(stdout,"\n were out of range for the distance correction.");
+ fprintf(stdout,"\n This affected %d out of %d bootstrap trials.",
+ (pint)nfails,(pint)boot_ntrials);
+ fprintf(stdout,"\n This may not be fatal but you have been warned!");
+ fprintf(stdout,"\n");
+ fprintf(stdout,"\n SUGGESTIONS: 1) turn off the correction");
+ fprintf(stdout,"\n or 2) remove the most distant sequences");
+ fprintf(stdout,"\n or 3) use the PHYLIP package.");
+ fprintf(stdout,"\n\n");
+ if (usemenu)
+ getstr("Press [RETURN] to continue",dummy);
+ }
+
+
+ boot_positions=ckfree((void *)boot_positions);
+
+ for (i=1;i<nseqs+1;i++)
+ sample_tree[i]=ckfree((void *)sample_tree[i]);
+ sample_tree=ckfree((void *)sample_tree);
+/*
+ fprintf(clustal_phy_tree_file,"\n\n Bootstrap totals for each group\n");
+*/
+ if (output_tree_clustal)
+ print_tree(standard_tree, clustal_phy_tree_file, boot_totals);
+
+ save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
+ for(i=0; i<nseqs+1; i++)
+ save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
+
+ for(i=1; i<nseqs+1; i++)
+ for(j=1; j<nseqs+1; j++)
+ save_tree[i][j] = standard_tree[i][j];
+
+ if(output_tree_phylip) {
+ left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ for (i=1;i<=nseqs;i++) {
+ left_branch[i] = save_left_branch[i];
+ right_branch[i] = save_right_branch[i];
+ }
+ print_phylip_tree(standard_tree,phylip_phy_tree_file,
+ bootstrap_format);
+ left_branch=ckfree((void *)left_branch);
+ right_branch=ckfree((void *)right_branch);
+ }
+
+ for(i=1; i<nseqs+1; i++)
+ for(j=1; j<nseqs+1; j++)
+ standard_tree[i][j] = save_tree[i][j];
+
+ if(output_tree_nexus) {
+ left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
+ for (i=1;i<=nseqs;i++) {
+ left_branch[i] = save_left_branch[i];
+ right_branch[i] = save_right_branch[i];
+ }
+ print_nexus_tree(standard_tree,nexus_phy_tree_file,
+ bootstrap_format);
+ left_branch=ckfree((void *)left_branch);
+ right_branch=ckfree((void *)right_branch);
+ }
+
+ boot_totals=ckfree((void *)boot_totals);
+ save_left_branch=ckfree((void *)save_left_branch);
+ save_right_branch=ckfree((void *)save_right_branch);
+
+ for (i=1;i<nseqs+1;i++)
+ standard_tree[i]=ckfree((void *)standard_tree[i]);
+ standard_tree=ckfree((void *)standard_tree);
+
+ for (i=0;i<nseqs+1;i++)
+ save_tree[i]=ckfree((void *)save_tree[i]);
+ save_tree=ckfree((void *)save_tree);
+
+ if (output_tree_clustal)
+ fclose(clustal_phy_tree_file);
+
+ if (output_tree_phylip)
+ fclose(phylip_phy_tree_file);
+
+ if (output_tree_nexus)
+ fclose(nexus_phy_tree_file);
+
+ if (output_tree_clustal)
+ info("Bootstrap output file completed [%s]"
+ ,clustal_name);
+ if (output_tree_phylip)
+ info("Bootstrap output file completed [%s]"
+ ,phylip_name);
+ if (output_tree_nexus)
+ info("Bootstrap output file completed [%s]"
+ ,nexus_name);
+}
+
+
+/*
+* compare_tree()
+*
+* For each row i (1..n-3) of tree1, adds to hits[i] the number of rows j
+* (1..n-3) of tree2 that describe the same grouping.  Two rows are taken
+* to match when, over columns 1..n, the count of equal entries or the
+* count of unequal entries reaches last_seq-first_seq+1.
+*/
+void compare_tree(char **tree1, char **tree2, sint *hits, sint n)
+{
+    sint row1, row2, col;
+    sint same, differ;
+
+    for (row1 = 1; row1 <= n-3; row1++) {
+        for (row2 = 1; row2 <= n-3; row2++) {
+            same = 0;
+            differ = 0;
+            for (col = 1; col <= n; col++) {
+                if (tree1[row1][col] == tree2[row2][col])
+                    same++;
+                else
+                    differ++;
+            }
+            if ((same == last_seq-first_seq+1) ||
+                (differ == last_seq-first_seq+1))
+                hits[row1]++;
+        }
+    }
+}
+
+
+/*
+* print_nexus_tree()
+*
+* Writes the tree held in tree_description to "tree" as a NEXUS TREES
+* block: a TRANSLATE table listing sequences 1..nseqs with their names,
+* then one UTREE statement.  With exactly two sequences the tree is the
+* pair joined by tmat[first_seq][first_seq+1]; otherwise the three
+* top-level branches (flags 1, 2 and 3 in row last_seq-first_seq+1-2)
+* are expanded by two_way_split_nexus(), which modifies tree_description.
+*
+* bootstrap == BS_BRANCH_LABELS appends "[count]" (from boot_totals) to
+* branches; bootstrap == BS_NODE_LABELS appends "TRICHOTOMY" at the root.
+*/
+void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap)
+{
+    sint i;
+    sint old_row;
+
+    fprintf(tree,"#NEXUS\n\n");
+
+    fprintf(tree,"BEGIN TREES;\n\n");
+    fprintf(tree,"\tTRANSLATE\n");
+    for(i=1;i<nseqs;i++) {
+        fprintf(tree,"\t\t%d %s,\n",(pint)i,names[i]);
+    }
+    fprintf(tree,"\t\t%d %s\n",(pint)nseqs,names[nseqs]);
+    fprintf(tree,"\t\t;\n");
+
+    fprintf(tree,"\tUTREE PAUP_1= ");
+
+    if(last_seq-first_seq+1==2) {
+        /* cast to pint like every other %d argument in this file */
+        fprintf(tree,"(%d:%7.5f,%d:%7.5f);",
+            (pint)first_seq,tmat[first_seq][first_seq+1],
+            (pint)(first_seq+1),tmat[first_seq][first_seq+1]);
+    }
+    else {
+
+        fprintf(tree,"(");
+
+        /* first top-level branch */
+        old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,1,bootstrap);
+        fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-2]);
+        if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
+            fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+        fprintf(tree,",");
+
+        /* second top-level branch */
+        old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,2,bootstrap);
+        fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-1]);
+        if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
+            fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+        fprintf(tree,",");
+
+        /* third top-level branch */
+        old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,3,bootstrap);
+        fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1]);
+        if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
+            fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+        fprintf(tree,")");
+        if (bootstrap==BS_NODE_LABELS) fprintf(tree,"TRICHOTOMY");
+        fprintf(tree,";");
+    }
+    fprintf(tree,"\nENDBLOCK;\n");
+}
+
+
+/*
+* two_way_split_nexus()
+*
+* Recursively writes the subtree described by row start_row of
+* tree_description to "tree", using sequence numbers
+* (column + first_seq - 1) as leaf labels.  "flag" is the value that marks
+* the columns of interest in row start_row.
+*
+* For the top row (start_row == last_seq-first_seq+1-2) only the first
+* member is written, without parentheses or branch length; the function
+* then returns 0 for a single sequence or the row of the nested group.
+* For any other row it writes "(left:len,right:len)" using
+* left_branch[start_row] and right_branch[start_row] and returns start_row.
+*
+* Entries of tree_description are cleared as they are consumed, so the
+* matrix is modified by the call.
+*/
+sint two_way_split_nexus
+(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap)
+{
+ sint row, new_row = 0, old_row, col, test_col = 0;
+ Boolean single_seq;
+
+ if(start_row != last_seq-first_seq+1-2) fprintf(tree,"(");
+
+/* first column of this row carrying the flag (test_col stays 0 if none) */
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if(tree_description[start_row][col] == flag) {
+ test_col = col;
+ break;
+ }
+ }
+
+/* nearest earlier row that also contains test_col, if any */
+ single_seq = TRUE;
+ for(row=start_row-1; row>=1; row--)
+ if(tree_description[row][test_col] == 1) {
+ single_seq = FALSE;
+ new_row = row;
+ break;
+ }
+
+ if(single_seq) {
+ tree_description[start_row][test_col] = 0;
+ fprintf(tree,"%d",test_col+first_seq-1);
+ if(start_row == last_seq-first_seq+1-2) {
+ return(0);
+ }
+
+ fprintf(tree,":%7.5f,",left_branch[start_row]);
+ }
+ else {
+/* remove the nested group's columns from this row, then expand the group */
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if((tree_description[start_row][col]==1)&&
+ (tree_description[new_row][col]==1))
+ tree_description[start_row][col] = 0;
+ }
+ old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap);
+ if(start_row == last_seq-first_seq+1-2) {
+ return(new_row);
+ }
+
+ fprintf(tree,":%7.5f",left_branch[start_row]);
+ if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
+ fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+
+ fprintf(tree,",");
+ }
+
+
+/* second member: same search over whatever is still flagged in this row */
+ for(col=1; col<=last_seq-first_seq+1; col++)
+ if(tree_description[start_row][col] == flag) {
+ test_col = col;
+ break;
+ }
+
+ single_seq = TRUE;
+ new_row = 0;
+ for(row=start_row-1; row>=1; row--)
+ if(tree_description[row][test_col] == 1) {
+ single_seq = FALSE;
+ new_row = row;
+ break;
+ }
+
+ if(single_seq) {
+ tree_description[start_row][test_col] = 0;
+ fprintf(tree,"%d",test_col+first_seq-1);
+ fprintf(tree,":%7.5f)",right_branch[start_row]);
+ }
+ else {
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if((tree_description[start_row][col]==1)&&
+ (tree_description[new_row][col]==1))
+ tree_description[start_row][col] = 0;
+ }
+ old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap);
+ fprintf(tree,":%7.5f",right_branch[start_row]);
+ if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
+ fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+
+ fprintf(tree,")");
+ }
+/* node-label style: the bootstrap count follows the closing parenthesis */
+ if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0))
+ fprintf(tree,"%d",(pint)boot_totals[start_row]);
+
+ return(start_row);
+}
+
+
+/*
+* print_phylip_tree()
+*
+* Writes the tree in tree_description to "tree" in PHYLIP (nested
+* parenthesis) format.  Two sequences are written directly from
+* tmat[first_seq][first_seq+1]; otherwise the three top-level branches
+* (flags 1..3 of row nseq-2) are expanded through two_way_split(), which
+* modifies tree_description.  Bootstrap labelling follows "bootstrap"
+* (BS_BRANCH_LABELS or BS_NODE_LABELS).
+*/
+void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap)
+{
+    sint nseq = last_seq-first_seq+1;
+    sint top_row = nseq-2;
+    sint branch;
+    sint group_row;
+
+    if (nseq == 2) {
+        fprintf(tree,"(%s:%7.5f,%s:%7.5f);",names[first_seq],tmat[first_seq][first_seq+1],names[first_seq+1],tmat[first_seq][first_seq+1]);
+        return;
+    }
+
+    fprintf(tree,"(\n");
+
+    /* the three branches use left_branch[nseq-2], [nseq-1] and [nseq] */
+    for (branch = 1; branch <= 3; branch++) {
+        group_row = two_way_split(tree_description, tree, top_row, branch, bootstrap);
+        fprintf(tree,":%7.5f",left_branch[top_row + branch - 1]);
+        if ((bootstrap==BS_BRANCH_LABELS) && (group_row>0) && (boot_totals[group_row]>0))
+            fprintf(tree,"[%d]",(pint)boot_totals[group_row]);
+        if (branch < 3)
+            fprintf(tree,",\n");
+        else
+            fprintf(tree,")");
+    }
+
+    if (bootstrap==BS_NODE_LABELS)
+        fprintf(tree,"TRICHOTOMY");
+    fprintf(tree,";\n");
+}
+
+
+/*
+* two_way_split()
+*
+* Recursively writes the subtree described by row start_row of
+* tree_description to "tree" in PHYLIP format, using sequence names
+* (at most max_names characters) as leaf labels.  "flag" is the value that
+* marks the columns of interest in row start_row.
+*
+* For the top row (start_row == last_seq-first_seq+1-2) only the first
+* member is written, without parentheses or branch length; the function
+* then returns 0 for a single sequence or the row of the nested group.
+* For any other row it writes "(left:len,right:len)" using
+* left_branch[start_row] and right_branch[start_row] and returns start_row.
+*
+* Entries of tree_description are cleared as they are consumed, so the
+* matrix is modified by the call.
+*/
+sint two_way_split
+(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap)
+{
+ sint row, new_row = 0, old_row, col, test_col = 0;
+ Boolean single_seq;
+
+ if(start_row != last_seq-first_seq+1-2) fprintf(tree,"(\n");
+
+/* first column of this row carrying the flag (test_col stays 0 if none) */
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if(tree_description[start_row][col] == flag) {
+ test_col = col;
+ break;
+ }
+ }
+
+/* nearest earlier row that also contains test_col, if any */
+ single_seq = TRUE;
+ for(row=start_row-1; row>=1; row--)
+ if(tree_description[row][test_col] == 1) {
+ single_seq = FALSE;
+ new_row = row;
+ break;
+ }
+
+ if(single_seq) {
+ tree_description[start_row][test_col] = 0;
+ fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]);
+ if(start_row == last_seq-first_seq+1-2) {
+ return(0);
+ }
+
+ fprintf(tree,":%7.5f,\n",left_branch[start_row]);
+ }
+ else {
+/* remove the nested group's columns from this row, then expand the group */
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if((tree_description[start_row][col]==1)&&
+ (tree_description[new_row][col]==1))
+ tree_description[start_row][col] = 0;
+ }
+ old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap);
+ if(start_row == last_seq-first_seq+1-2) {
+ return(new_row);
+ }
+
+ fprintf(tree,":%7.5f",left_branch[start_row]);
+ if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
+ fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+
+ fprintf(tree,",\n");
+ }
+
+
+/* second member: same search over whatever is still flagged in this row */
+ for(col=1; col<=last_seq-first_seq+1; col++)
+ if(tree_description[start_row][col] == flag) {
+ test_col = col;
+ break;
+ }
+
+ single_seq = TRUE;
+ new_row = 0;
+ for(row=start_row-1; row>=1; row--)
+ if(tree_description[row][test_col] == 1) {
+ single_seq = FALSE;
+ new_row = row;
+ break;
+ }
+
+ if(single_seq) {
+ tree_description[start_row][test_col] = 0;
+ fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]);
+ fprintf(tree,":%7.5f)\n",right_branch[start_row]);
+ }
+ else {
+ for(col=1; col<=last_seq-first_seq+1; col++) {
+ if((tree_description[start_row][col]==1)&&
+ (tree_description[new_row][col]==1))
+ tree_description[start_row][col] = 0;
+ }
+ old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap);
+ fprintf(tree,":%7.5f",right_branch[start_row]);
+ if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
+ fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
+
+ fprintf(tree,")\n");
+ }
+/* node-label style: the bootstrap count follows the closing parenthesis */
+ if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0))
+ fprintf(tree,"%d",(pint)boot_totals[start_row]);
+
+ return(start_row);
+}
+
+
+
+/*
+* print_tree()
+*
+* Writes the grouping matrix to "tree" in the Clustal text layout: one
+* line per row 1..nseq-3 with '*' for a zero entry and '.' for any other
+* entry, followed by totals[row] (width 7) when it is positive, then a
+* final line holding the raw values of row nseq-2.
+*/
+void print_tree(char **tree_description, FILE *tree, sint *totals)
+{
+    sint r, c;
+    sint nseq = last_seq-first_seq+1;
+
+    fputs("\n", tree);
+
+    for (r = 1; r <= nseq-3; r++) {
+        fputs(" \n", tree);
+        for (c = 1; c <= nseq; c++)
+            fputc((tree_description[r][c] == 0) ? '*' : '.', tree);
+        if (totals[r] > 0)
+            fprintf(tree,"%7d",(pint)totals[r]);
+    }
+
+    fputs(" \n", tree);
+    for (c = 1; c <= nseq; c++)
+        fprintf(tree,"%1d",(pint)tree_description[nseq-2][c]);
+    fputs("\n", tree);
+}
+
+
+
+/*
+* dna_distance_matrix()
+*
+* Fills tmat[m][n] (symmetrically) with the pairwise distance between
+* every pair of sequences m,n in 1..last_seq-first_seq+1.  Alignment
+* columns are visited through boot_positions[], and a column is skipped
+* when it is flagged in tree_gaps (with tossgaps set), when either residue
+* is a gap, or when either residue is an ambiguity code (unless
+* use_ambiguities is set).
+*
+* Without "kimura" the distance is (p+q)/e, where p counts transitions,
+* q transversions and e compared sites.  With "kimura" it is
+* 0.5*log(a) + 0.25*log(b), a = 1/(1-2p-q), b = 1/(1-2q), using p and q as
+* fractions of e; when a or b is not positive the distance is set to 3.5
+* and counted as off-scale.
+*
+* When "verbose" is set, a legend and one line per pair are written to
+* "tree".  Returns the number of off-scale pairs.
+*/
+sint dna_distance_matrix(FILE *tree)
+{
+ sint m,n;
+ sint j,i;
+ sint res1, res2;
+ sint overspill = 0;
+ double p,q,e,a,b,k;
+
+ tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */
+
+ if(verbose) {
+ fprintf(tree,"\n");
+ fprintf(tree,"\n DIST = percentage divergence (/100)");
+ fprintf(tree,"\n p = rate of transition (A <-> G; C <-> T)");
+ fprintf(tree,"\n q = rate of transversion");
+ fprintf(tree,"\n Length = number of sites used in comparison");
+ fprintf(tree,"\n");
+ if(tossgaps) {
+ fprintf(tree,"\n All sites with gaps (in any sequence) deleted!");
+ fprintf(tree,"\n");
+ }
+ if(kimura) {
+ fprintf(tree,"\n Distances corrected by Kimura's 2 parameter model:");
+ fprintf(tree,"\n\n Kimura, M. (1980)");
+ fprintf(tree," A simple method for estimating evolutionary ");
+ fprintf(tree,"rates of base");
+ fprintf(tree,"\n substitutions through comparative studies of ");
+ fprintf(tree,"nucleotide sequences.");
+ fprintf(tree,"\n J. Mol. Evol., 16, 111-120.");
+ fprintf(tree,"\n\n");
+ }
+ }
+
+ for(m=1; m<last_seq-first_seq+1; ++m) /* for every pair of sequence */
+ for(n=m+1; n<=last_seq-first_seq+1; ++n) {
+ p = q = e = 0.0;
+ tmat[m][n] = tmat[n][m] = 0.0;
+/* the column range is taken from the length of the first sequence in range */
+ for(i=1; i<=seqlen_array[first_seq]; ++i) {
+ j = boot_positions[i];
+ if(tossgaps && (tree_gaps[j] > 0) )
+ goto skip; /* gap position */
+ res1 = seq_array[m+first_seq-1][j];
+ res2 = seq_array[n+first_seq-1][j];
+ if( (res1 == gap_pos1) || (res1 == gap_pos2) ||
+ (res2 == gap_pos1) || (res2 == gap_pos2))
+ goto skip; /* gap in a seq*/
+ if(!use_ambiguities)
+ if( is_ambiguity(res1) || is_ambiguity(res2))
+ goto skip; /* ambiguity code in a seq*/
+ e = e + 1.0;
+ if(res1 != res2) {
+ if(transition(res1,res2))
+ p = p + 1.0;
+ else
+ q = q + 1.0;
+ }
+ skip:;
+ }
+
+
+ /* Kimura's 2 parameter correction for multiple substitutions */
+
+ if(!kimura) {
+/* uncorrected: plain fraction of differing sites */
+ if (e == 0) {
+ fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n);
+ k = 0.0;
+ p = 0.0;
+ q = 0.0;
+ }
+ else {
+ k = (p+q)/e;
+ if(p > 0.0)
+ p = p/e;
+ else
+ p = 0.0;
+ if(q > 0.0)
+ q = q/e;
+ else
+ q = 0.0;
+ }
+ tmat[m][n] = tmat[n][m] = k;
+ if(verbose) /* if screen output */
+ fprintf(tree,
+ "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n"
+ ,(pint)m,(pint)n,k,p,q,e);
+ }
+ else {
+ if (e == 0) {
+ fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n);
+ p = 0.0;
+ q = 0.0;
+ }
+ else {
+ if(p > 0.0)
+ p = p/e;
+ else
+ p = 0.0;
+ if(q > 0.0)
+ q = q/e;
+ else
+ q = 0.0;
+ }
+
+/* a and b are forced to 0 where the reciprocal would divide by zero; */
+/* the a<=0 / b<=0 test below then treats the pair as off-scale */
+ if( ((2.0*p)+q) == 1.0 )
+ a = 0.0;
+ else
+ a = 1.0/(1.0-(2.0*p)-q);
+
+ if( q == 0.5 )
+ b = 0.0;
+ else
+ b = 1.0/(1.0-(2.0*q));
+
+/* watch for values going off the scale for the correction. */
+ if( (a<=0.0) || (b<=0.0) ) {
+ overspill++;
+ k = 3.5; /* arbitrary high score */
+ }
+ else
+ k = 0.5*log(a) + 0.25*log(b);
+ tmat[m][n] = tmat[n][m] = k;
+ if(verbose) /* if screen output */
+ fprintf(tree,
+ "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n"
+ ,(pint)m,(pint)n,k,p,q,e);
+
+ }
+ }
+ return overspill; /* return the number of off-scale values */
+}
+
+
+/*
+* prot_distance_matrix()
+*
+* Fills tmat[m][n] (symmetrically) for every pair of sequences 1..nseqs
+* with the fraction of differing residues over the compared sites.
+* Alignment columns are visited through boot_positions[]; a column is
+* skipped when it is flagged in tree_gaps (with tossgaps set) or when
+* either residue is a gap.
+*
+* With "kimura" set the fraction k is corrected:
+*   k < 0.75          -> -log(1 - k - k*k/5)
+*   0.75 <= k <= 0.93 -> dayhoff_pams[(int)(k*1000 - 750)] / 100
+*   k > 0.93          -> 10.0, counted as off-scale
+*
+* When "verbose" is set, a legend and one line per pair are written to
+* "tree".  Returns the number of off-scale pairs.
+*
+* NOTE(review): this routine indexes 1..nseqs and seqlen_array[1], while
+* dna_distance_matrix() works relative to first_seq/last_seq - confirm
+* that callers always use the full range for protein data.
+*/
+sint prot_distance_matrix(FILE *tree)
+{
+    sint m,n;
+    sint j,i;
+    sint res1, res2;
+    sint overspill = 0;
+    double p,e,k, table_entry;
+
+
+    tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */
+
+    if(verbose) {
+        fprintf(tree,"\n");
+        fprintf(tree,"\n DIST = percentage divergence (/100)");
+        fprintf(tree,"\n Length = number of sites used in comparison");
+        fprintf(tree,"\n\n");
+        if(tossgaps) {
+            fprintf(tree,"\n All sites with gaps (in any sequence) deleted");
+            fprintf(tree,"\n");
+        }
+        if(kimura) {
+            /* report text previously read "up tp 0.75" */
+            fprintf(tree,"\n Distances up to 0.75 corrected by Kimura's empirical method:");
+            fprintf(tree,"\n\n Kimura, M. (1983)");
+            fprintf(tree," The Neutral Theory of Molecular Evolution.");
+            fprintf(tree,"\n Page 75. Cambridge University Press, Cambridge, England.");
+            fprintf(tree,"\n\n");
+        }
+    }
+
+    for(m=1; m<nseqs; ++m) /* for every pair of sequence */
+        for(n=m+1; n<=nseqs; ++n) {
+            p = e = 0.0;
+            tmat[m][n] = tmat[n][m] = 0.0;
+            for(i=1; i<=seqlen_array[1]; ++i) {
+                j = boot_positions[i];
+                if(tossgaps && (tree_gaps[j] > 0) ) goto skip; /* gap position */
+                res1 = seq_array[m][j];
+                res2 = seq_array[n][j];
+                if( (res1 == gap_pos1) || (res1 == gap_pos2) ||
+                    (res2 == gap_pos1) || (res2 == gap_pos2))
+                    goto skip; /* gap in a seq*/
+                e = e + 1.0;
+                if(res1 != res2) p = p + 1.0;
+                skip:;
+            }
+
+            /* p > 0 implies e > 0, so the division is safe */
+            if(p <= 0.0)
+                k = 0.0;
+            else
+                k = p/e;
+
+/* DES debug */
+/* fprintf(stdout,"Seq1=%4d Seq2=%4d k =%7.4f \n",(pint)m,(pint)n,k); */
+/* DES debug */
+
+            if(kimura) {
+                if(k < 0.75) { /* use Kimura's formula */
+                    if(k > 0.0) k = - log(1.0 - k - (k * k/5.0) );
+                }
+                else {
+                    if(k > 0.930) {
+                        overspill++;
+                        k = 10.0; /* arbitrarily set to 1000% */
+                    }
+                    else {
+                        /* table is indexed in steps of 0.001 from k = 0.75 */
+                        table_entry = (k*1000.0) - 750.0;
+                        k = (double)dayhoff_pams[(int)table_entry];
+                        k = k/100.0;
+                    }
+                }
+            }
+
+            tmat[m][n] = tmat[n][m] = k;
+            if(verbose) /* if screen output */
+                fprintf(tree,
+                    "%4d vs.%4d DIST = %6.4f; length = %6.0f\n",
+                    (pint)m,(pint)n,k,e);
+        }
+    return overspill;
+}
+
+
+void guide_tree(FILE *tree,sint firstseq,sint numseqs)
+/*
+ Routine for producing unrooted NJ trees from separately aligned
+ pairwise distances. This produces the GUIDE DENDROGRAMS in
+ PHYLIP format.
+
+ tree     - open output file; it is closed before returning.
+ firstseq - index of the first sequence of the range.
+ numseqs  - number of sequences in the range.
+
+ Side effects: sets the file-level phylip_phy_tree_file, verbose (FALSE),
+ first_seq and last_seq, and releases left_branch, right_branch, tkill
+ and av when they are non-NULL after the tree has been built.
+*/
+{
+    static char **standard_tree;
+    sint i;
+    float dist;
+
+    phylip_phy_tree_file=tree;
+    verbose = FALSE;
+    first_seq=firstseq;
+    last_seq=first_seq+numseqs-1;
+
+    if(numseqs==2) {
+        /* two sequences: each branch gets half of the pairwise distance */
+        dist=tmat[firstseq][firstseq+1]/2.0;
+        fprintf(tree,"(%s:%0.5f,%s:%0.5f);\n",
+            names[firstseq],dist,names[firstseq+1],dist);
+    }
+    else {
+        standard_tree = (char **) ckalloc( (last_seq-first_seq+2) * sizeof (char *) );
+        for(i=0; i<last_seq-first_seq+2; i++)
+            standard_tree[i] = (char *) ckalloc( (last_seq-first_seq+2) * sizeof(char));
+
+        nj_tree(standard_tree,clustal_phy_tree_file);
+
+        print_phylip_tree(standard_tree,phylip_phy_tree_file,0);
+
+        if(left_branch != NULL) left_branch=ckfree((void *)left_branch);
+        if(right_branch != NULL) right_branch=ckfree((void *)right_branch);
+        if(tkill != NULL) tkill=ckfree((void *)tkill);
+        if(av != NULL) av=ckfree((void *)av);
+        /* rows were allocated from index 0, so free from 0 as well;
+           starting at 1 leaked standard_tree[0] on every call */
+        for (i=0;i<last_seq-first_seq+2;i++)
+            standard_tree[i]=ckfree((void *)standard_tree[i]);
+        standard_tree=ckfree((void *)standard_tree);
+    }
+    fclose(phylip_phy_tree_file);
+
+}
+
+/*
+* is_ambiguity()
+*
+* Returns FALSE when use_ambiguities is TRUE, or when amino_acid_codes[c]
+* is one of 'A', 'C', 'G', 'T', 'U'; returns TRUE otherwise.
+*/
+static Boolean is_ambiguity(char c)
+{
+    char codes[]="ACGTU";
+    int idx = 0;
+
+    if (use_ambiguities==TRUE)
+        return FALSE;
+
+    while (idx < 5) {
+        if (amino_acid_codes[c]==codes[idx])
+            return FALSE;
+        idx++;
+    }
+
+    return TRUE;
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/util.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/util.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/util.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,405 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include "clustalw.h"
+
+extern char **seq_array;
+extern sint *seqlen_array;
+extern char **names,**titles;
+extern sint *output_index;
+extern sint *seq_weight;
+extern double **tmat;
+
+
+/*
+* ckalloc()
+*
+* Allocates "bytes" bytes of zero-filled memory (calloc).  Calls fatal()
+* with "Out of memory" when the allocation fails.
+* Return value:
+*	Generic pointer to the newly allocated memory.
+*/
+
+void *ckalloc(size_t bytes)
+{
+    void *block = calloc(bytes, sizeof(char));
+
+    if (block == NULL)
+        fatal("Out of memory\n");
+
+    return block;
+}
+
+/*
+* ckrealloc()
+*
+* Resizes the block at "ptr" to "bytes" bytes.  Calls fatal() when ptr is
+* NULL ("Bad call to ckrealloc") or when realloc fails ("Out of memory").
+* Return value:
+*	Generic pointer to the re-allocated memory.
+*/
+
+void *ckrealloc(void *ptr, size_t bytes)
+{
+    void *resized = NULL;
+
+    if (ptr == NULL) {
+        fatal("Bad call to ckrealloc\n");
+        return resized;
+    }
+
+    resized = realloc(ptr, bytes);
+    if (resized == NULL)
+        fatal("Out of memory\n");
+
+    return resized;
+}
+
+/*
+* ckfree()
+*
+* Frees memory allocated by ckalloc.  A NULL argument is reported with
+* warning() and otherwise ignored.
+* Return value:
+*	Always NULL, so callers can write  p = ckfree(p);
+*/
+
+void *ckfree(void *ptr)
+{
+    if (ptr != NULL) {
+        free(ptr);
+        return NULL;
+    }
+
+    warning("Bad call to ckfree\n");
+    return ptr;
+}
+
+
+/*
+* rtrim()
+*
+* Removes trailing white space (as defined by isspace) from a string,
+* in place.  Empty and all-blank strings are reduced to "".
+*
+* Return values:
+*	Pointer to the processed string
+*/
+
+char * rtrim(char *str)
+{
+    size_t len = strlen(str);
+
+    /* stop at the start of the string: the previous version walked below
+       index 0 for empty or all-blank input */
+    while (len > 0 && isspace((unsigned char)str[len - 1]))
+        len--;
+
+    str[len] = '\0';
+
+    return str;
+}
+
+
+/*
+* blank_to_()
+*
+* Replaces every blank, ';', ',', '(', ')' and ':' in a string with an
+* underscore, in place.
+*
+* Return value:
+*	Pointer to the processed string
+*/
+
+char * blank_to_(char *str)
+{
+    char *cursor;
+
+    for (cursor = str; *cursor != '\0'; cursor++) {
+        switch (*cursor) {
+        case ' ':
+        case ';':
+        case ',':
+        case '(':
+        case ')':
+        case ':':
+            *cursor = '_';
+            break;
+        default:
+            break;
+        }
+    }
+
+    return str;
+}
+
+
+/*
+* upstr()
+*
+* Converts string str to uppercase, in place.
+* Return values:
+*	Pointer to the converted string.
+*/
+
+char * upstr(char *str)
+{
+    char *s;
+
+    /* toupper() is only defined for unsigned char values and EOF, so a
+       plain char must be converted before the call */
+    for (s = str; *s != '\0'; s++)
+        *s = (char)toupper((unsigned char)*s);
+
+    return str;
+}
+
+/*
+* lowstr()
+*
+* Converts string str to lower case, in place.
+* Return values:
+*	Pointer to the converted string.
+*/
+
+char * lowstr(char *str)
+{
+    char *s;
+
+    /* tolower() is only defined for unsigned char values and EOF, so a
+       plain char must be converted before the call */
+    for (s = str; *s != '\0'; s++)
+        *s = (char)tolower((unsigned char)*s);
+
+    return str;
+}
+
+/*
+* getstr()
+*
+* Writes the prompt "instr" followed by ": " to stdout and reads one line
+* from standard input into "outstr" with gets().
+*
+* NOTE(review): gets() places no bound on the number of characters stored,
+* and this interface carries no buffer size, so an over-long input line
+* overruns outstr.  The result of gets() is not checked either, so at end
+* of input outstr may be left unmodified.  A bounded replacement needs a
+* size parameter - confirm the buffer sizes used by the callers.
+*/
+void getstr(char *instr,char *outstr)
+{
+ fprintf(stdout,"%s: ",instr);
+ gets(outstr);
+}
+
+/*
+* getreal()
+*
+* Prompts on stdout with "instr", the permitted range and the default,
+* then reads one line from standard input and parses a number from it.
+* The prompt is repeated until the value lies within [minx, maxx].
+*
+* Return value:
+*	The value entered, or "def" for an empty line or when no more
+*	input can be read.
+*/
+double getreal(char *instr,double minx,double maxx,double def)
+{
+    int status;
+    int ch;
+    float ret;
+    char line[MAXLINE];
+
+    while(TRUE) {
+        fprintf(stdout,"%s (%.1f-%.1f) [%.1f]: ",instr,minx,maxx,def);
+        /* bounded read; the previous gets() could overrun line[] */
+        if (fgets(line, sizeof line, stdin) == NULL)
+            return def;
+        /* discard the rest of an over-long line so that it is not taken
+           as the answer to the next prompt */
+        if (strchr(line, '\n') == NULL) {
+            while ((ch = getchar()) != EOF && ch != '\n')
+                ;
+        }
+        status=sscanf(line,"%f",&ret);
+        if(status == EOF) return def;
+        if(status != 1) {
+            /* nothing was converted: ret must not be read */
+            fprintf(stdout,"ERROR: Invalid number\n\n");
+            continue;
+        }
+        if(ret>maxx) {
+            fprintf(stdout,"ERROR: Max. value=%.1f\n\n",maxx);
+            continue;
+        }
+        if(ret<minx) {
+            fprintf(stdout,"ERROR: Min. value=%.1f\n\n",minx);
+            continue;
+        }
+        break;
+    }
+    return (double)ret;
+}
+
+
+/*
+* getint()
+*
+* Prompts on stdout with "instr", the permitted range and the default,
+* then reads one line from standard input and parses an integer from it.
+* The prompt is repeated until the value lies within [minx, maxx].
+*
+* Return value:
+*	The value entered, or "def" for an empty line or when no more
+*	input can be read.
+*/
+int getint(char *instr,int minx,int maxx, int def)
+{
+    int ret,status;
+    int ch;
+    char line[MAXLINE];
+
+    while(TRUE) {
+        fprintf(stdout,"%s (%d..%d) [%d]: ",
+            instr,(pint)minx,(pint)maxx,(pint)def);
+        /* bounded read; the previous gets() could overrun line[] */
+        if (fgets(line, sizeof line, stdin) == NULL)
+            return def;
+        /* discard the rest of an over-long line so that it is not taken
+           as the answer to the next prompt */
+        if (strchr(line, '\n') == NULL) {
+            while ((ch = getchar()) != EOF && ch != '\n')
+                ;
+        }
+        status=sscanf(line,"%d",&ret);
+        if(status == EOF) return def;
+        if(status != 1) {
+            /* nothing was converted: ret must not be read */
+            fprintf(stdout,"ERROR: Invalid number\n\n");
+            continue;
+        }
+        if(ret>maxx) {
+            fprintf(stdout,"ERROR: Max. value=%d\n\n",(pint)maxx);
+            continue;
+        }
+        if(ret<minx) {
+            fprintf(stdout,"ERROR: Min. value=%d\n\n",(pint)minx);
+            continue;
+        }
+        break;
+    }
+    return ret;
+}
+
+/*
+* do_system()
+*
+* Prompts for a command line and, unless it is empty, passes it to
+* system().  The command comes from the interactive user and is run
+* unmodified.
+*/
+void do_system(void)
+{
+    char line[MAXLINE];
+
+    /* start from an empty string so that a failed read (e.g. end of
+       input) is treated as "no command" instead of reading an
+       uninitialised buffer */
+    line[0] = EOS;
+
+    getstr("\n\nEnter system command",line);
+    if(*line != EOS)
+        system(line);
+    fprintf(stdout,"\n\n");
+}
+
+
+/*
+* linetype()
+*
+* Returns non-zero when "line" begins with the string "code".
+*/
+Boolean linetype(char *line,char *code)
+{
+    size_t code_len = strlen(code);
+    Boolean matched = ( strncmp(line,code,code_len) == 0 );
+
+    return matched;
+}
+
+/*
+* keyword()
+*
+* Returns non-zero when the first word of "line" (the characters up to
+* the first white space or the end of the string) is exactly "code".
+*/
+Boolean keyword(char *line,char *code)
+{
+    size_t word_len = 0;
+
+    /* measure the first word in place; the previous version copied it
+       into a MAXLINE buffer without a bound */
+    while (line[word_len] != EOS && !isspace((unsigned char)line[word_len]))
+        word_len++;
+
+    return( (strlen(code) == word_len) &&
+            (strncmp(line,code,word_len) == 0) );
+}
+
+/*
+* blankline()
+*
+* Returns TRUE when "line", up to its first newline or terminator,
+* contains only digits, white space, '*', ':' and '.'; FALSE otherwise.
+*/
+Boolean blankline(char *line)
+{
+    int i;
+    unsigned char ch;
+
+    for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
+        /* the <ctype.h> tests are only defined for unsigned char values */
+        ch = (unsigned char)line[i];
+        if( isdigit(ch) ||
+            isspace(ch) ||
+            (ch == '*') ||
+            (ch == ':') ||
+            (ch == '.'))
+            continue;
+        return FALSE;
+    }
+    return TRUE;
+}
+
+
+/*
+* get_path()
+*
+* Copies "str" to "path" and cuts it after the last '.' of the final path
+* component, so "path" ends with that '.'.  When the final component (the
+* part after the last DIRDELIM) contains no '.', a '.' is appended
+* instead.  "path" must be large enough for str plus one character.
+*/
+void get_path(char *str,char *path)
+{
+    int pos;
+    int dot = -1;
+
+    strcpy(path,str);
+
+    /* scan backwards; a directory delimiter ends the search */
+    pos = strlen(path)-1;
+    while (pos > -1) {
+        if (str[pos]==DIRDELIM)
+            break;
+        if (str[pos]=='.') {
+            dot = pos;
+            break;
+        }
+        --pos;
+    }
+
+    if (dot < 0)
+        strcat(path,".");
+    else
+        path[dot+1]=EOS;
+}
+
+/*
+* alloc_aln()
+*
+* Allocates the per-alignment arrays for "nseqs" sequences, each with
+* nseqs+1 entries: seqlen_array, seq_array (entries set to NULL), names
+* and titles (entries 1..nseqs allocated with MAXNAMES+1 / MAXTITLES+1
+* characters), output_index, the square matrix tmat (rows 1..nseqs,
+* entries set to 0.0) and seq_weight (entries 1..nseqs set to 100).
+*/
+void alloc_aln(sint nseqs)
+{
+    sint s, t;
+    sint slots = nseqs+1;
+
+    seqlen_array = (sint *)ckalloc( slots * sizeof (sint));
+
+    seq_array = (char **)ckalloc( slots * sizeof (char *) );
+    for (s = 0; s < slots; s++) {
+        seq_array[s] = NULL;
+    }
+
+    names = (char **)ckalloc( slots * sizeof (char *) );
+    for (s = 1; s <= nseqs; s++) {
+        names[s] = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
+    }
+
+    titles = (char **)ckalloc( slots * sizeof (char *) );
+    for (s = 1; s <= nseqs; s++) {
+        titles[s] = (char *)ckalloc((MAXTITLES+1) * sizeof (char));
+    }
+
+    output_index = (sint *)ckalloc( slots * sizeof (sint));
+
+    tmat = (double **) ckalloc( slots * sizeof (double *) );
+    for (s = 1; s <= nseqs; s++) {
+        tmat[s] = (double *)ckalloc( slots * sizeof (double) );
+    }
+    for (s = 1; s <= nseqs; s++) {
+        for (t = 1; t <= nseqs; t++) {
+            tmat[s][t] = 0.0;
+        }
+    }
+
+    seq_weight = (sint *)ckalloc( slots * sizeof (sint));
+    for (s = 1; s <= nseqs; s++) {
+        seq_weight[s] = 100;
+    }
+}
+
+/*
+* realloc_aln()
+*
+* Grows the per-alignment arrays so that "nseqs" further sequences can be
+* stored at indices first_seq .. first_seq+nseqs-1.  Every array is
+* resized to first_seq+nseqs+1 entries; the new seq_array slots are set
+* to NULL, new names/titles entries are allocated, new seq_weight entries
+* are set to 100, and the tmat entries pairing an existing sequence with
+* a new one are set to 0.0.
+*/
+void realloc_aln(sint first_seq,sint nseqs)
+{
+ sint i,j;
+
+ seqlen_array = (sint *)ckrealloc(seqlen_array, (first_seq+nseqs+1) * sizeof (sint));
+
+ seq_array = (char **)ckrealloc(seq_array, (first_seq+nseqs+1) * sizeof (char *) );
+ for(i=first_seq;i<first_seq+nseqs+1;i++)
+ seq_array[i]=NULL;
+
+ names = (char **)ckrealloc(names, (first_seq+nseqs+1) * sizeof (char *) );
+ for(i=first_seq;i<first_seq+nseqs;i++)
+ names[i] = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
+
+ titles = (char **)ckrealloc(titles, (first_seq+nseqs+1) * sizeof (char *) );
+ for(i=first_seq;i<first_seq+nseqs;i++)
+ titles[i] = (char *)ckalloc((MAXTITLES+1) * sizeof (char));
+
+ output_index = (sint *)ckrealloc(output_index, (first_seq+nseqs+1) * sizeof (sint));
+
+ seq_weight = (sint *)ckrealloc(seq_weight, (first_seq+nseqs+1) * sizeof (sint));
+ for(i=first_seq;i<first_seq+nseqs;i++)
+ seq_weight[i]=100;
+
+/* existing rows are widened, new rows are freshly allocated by ckalloc */
+ tmat = (double **) ckrealloc(tmat, (first_seq+nseqs+1) * sizeof (double *) );
+ for(i=1;i<first_seq;i++)
+ tmat[i] = (double *)ckrealloc(tmat[i], (first_seq+nseqs+1) * sizeof (double) );
+ for(i=first_seq;i<first_seq+nseqs;i++)
+ tmat[i] = (double *)ckalloc( (first_seq+nseqs+1) * sizeof (double) );
+/* clear the old-vs-new entries in both triangles */
+ for(i=1;i<first_seq;i++)
+ for(j=first_seq;j<first_seq+nseqs;j++)
+ {
+ tmat[i][j]=0.0;
+ tmat[j][i]=0.0;
+ }
+}
+
+/*
+* free_aln()
+*
+* Releases the per-alignment arrays for "nseqs" sequences: entries
+* 1..nseqs of seq_array, names, titles and tmat, then the arrays
+* themselves, together with seqlen_array, output_index and seq_weight.
+* Each pointer is replaced by the value returned from ckfree().
+* Does nothing when nseqs is not positive.
+*/
+void free_aln(sint nseqs)
+{
+    sint s;
+
+    if (nseqs < 1)
+        return;
+
+    seqlen_array = ckfree(seqlen_array);
+
+    for (s = 1; s <= nseqs; s++) {
+        seq_array[s] = ckfree(seq_array[s]);
+    }
+    seq_array = ckfree(seq_array);
+
+    for (s = 1; s <= nseqs; s++) {
+        names[s] = ckfree(names[s]);
+    }
+    names = ckfree(names);
+
+    for (s = 1; s <= nseqs; s++) {
+        titles[s] = ckfree(titles[s]);
+    }
+    titles = ckfree(titles);
+
+    output_index = ckfree(output_index);
+
+    seq_weight = ckfree(seq_weight);
+
+    for (s = 1; s <= nseqs; s++) {
+        tmat[s] = ckfree(tmat[s]);
+    }
+    tmat = ckfree(tmat);
+}
+
+/*
+* alloc_seq()
+*
+* Allocates storage of length+2 characters for sequence "seq_no" and
+* stores it in seq_array[seq_no].
+*/
+void alloc_seq(sint seq_no,sint length)
+{
+    char *residues = (char *)ckalloc((length+2) * sizeof (char));
+
+    seq_array[seq_no] = residues;
+}
+
+/*
+* realloc_seq()
+*
+* Resizes the storage of sequence "seq_no" to length+2 characters.
+* Calls fatal() with "Out of memory" when the reallocation fails, like
+* the other allocation helpers in this file.  realloc() is called
+* directly so that a NULL seq_array[seq_no] is still accepted, as before.
+*/
+void realloc_seq(sint seq_no,sint length)
+{
+    char *grown = (char *)realloc(seq_array[seq_no], (length+2) * sizeof (char));
+
+    /* the result was previously stored unchecked */
+    if (grown == NULL)
+        fatal("Out of memory\n");
+
+    seq_array[seq_no] = grown;
+}
+
+/*
+* free_seq()
+*
+* Releases the storage of sequence "seq_no" and stores the value returned
+* by ckfree() back into seq_array[seq_no].
+*/
+void free_seq(sint seq_no)
+{
+    char *residues = seq_array[seq_no];
+
+    seq_array[seq_no] = ckfree(residues);
+}
+
Added: trunk/packages/clustalw/branches/upstream/current/xcolor.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xcolor.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xcolor.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1191 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <time.h>
+#include <ctype.h>
+
+#include <vibrant.h>
+
+#include "clustalw.h"
+#include "xmenu.h"
+
+#define SIMPLE 1
+#define COMPOUND 2
+
+#define LEFTMARGIN 20
+#define SEPARATION 2
+#define CHARHEIGHT 10
+#define CHARWIDTH 6
+#define A4X 564
+#define A4Y 800
+#define A3X 832
+#define A3Y 1159
+#define USLETTERX 564
+#define USLETTERY 750
+#define SCOREY 3
+#define HEADER 7
+#define NOHEADER 0
+#define MAXRESNO 6
+
+#define MAXPARLEN 10
+#define MAXPAR 100
+
+static void print_ps_info(FILE *fd,int pagesize);
+static void print_page_header(FILE *fd,int ps_rotation,int maxx,int maxy,
+int page,int numpages,Boolean header,char *str_time,
+char *ps_file,int ps_xtrans,int ps_ytrans,float ps_scale);
+static void print_header_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int ix,int fr,int lr);
+static void print_footer_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int ix,int fr,int lr);
+static void print_quality_curve(FILE *fd,panel_data seq_data
+,int fr,int lr,int score_height);
+static void print_seq_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int row,int seq,int fr,int lr,int res_number);
+
+
+/* One consensus rule; Con_Par[] holds an array of these. */
+/* NOTE(review): presumably one entry per line of the "@consensus" section */
+/* of the color parameter file - confirm against SaveConPara(). */
+typedef struct consensus_parameters
+{
+char consensus;
+int cutoff;
+int length;
+char cutoff_list[20];
+} consensus_para;
+
+/* One coloring rule; Col_Par[] holds an array of these.  "type" takes */
+/* SIMPLE for the built-in defaults, which set only type, residue and */
+/* color (an index into color_lut). */
+typedef struct color_parameters
+{
+int type;
+char residue;
+int color;
+int length;
+char cons_list[20];
+} color_para;
+
+static void init_color_lut(FILE *fd);
+static int init_printer_lut(char *filename);
+static char *init_consensus(panel_data data);
+static int SaveColPara(char word[MAXPAR][MAXPARLEN],int num_words,int count);
+static int SaveConPara(char word[MAXPAR][MAXPARLEN],int num_words,int count);
+static int get_line(char *sinline,char word[MAXPAR][MAXPARLEN]);
+static int residue_color(char res,char consensus);
+static Boolean commentline(char *line);
+
+#define DEF_NCOLORS 4
+#define MAX_NCOLORS 8
+#define DEFAULT_COLOR 0
+
+/* A named color with red, green and blue components. */
+typedef struct rgb_color {
+ char name[20];
+ float r,g,b;
+} rgb_color;
+
+/* Built-in palette: copied into color_lut[1..MAX_NCOLORS] by */
+/* init_color_lut() when no rgb index is read from a parameter file. */
+rgb_color def_color_lut[MAX_NCOLORS]={
+ "RED" ,0.9, 0.1, 0.1,
+ "BLUE" ,0.1, 0.1, 0.7,
+ "GREEN" ,0.1, 0.9, 0.1,
+ "ORANGE" ,0.9, 0.6, 0.3,
+ "CYAN" ,0.1, 0.9, 0.9,
+ "PINK" ,0.9, 0.5, 0.5,
+ "MAGENTA" ,0.9, 0.1, 0.9,
+ "YELLOW" ,0.9, 0.9, 0.0,
+};
+
+/* Default residue groups for protein data: row i (of the first */
+/* DEF_NCOLORS rows) lists the residues given color i+1 by */
+/* init_color_parameters() when no color rules were read. */
+char def_aacolor[MAX_NCOLORS][26]={"krh",
+ "fwy",
+ "ilmv",
+ "gpst"};
+
+/* Default residue groups for nucleotide data, used the same way. */
+char def_dnacolor[MAX_NCOLORS][26]={"a",
+ "c",
+ "tu",
+ "g"};
+
+extern char revision_level[];
+
+extern int max_names;
+
+extern int ncolors;
+extern int ncolor_pars;
+extern color color_lut[];
+extern int inverted;
+extern Boolean residue_exceptions;
+extern Boolean segment_exceptions;
+extern Boolean dnaflag;
+
+int NumColParas;
+int NumConParas;
+
+color_para Col_Par[100];
+consensus_para Con_Par[100];
+
+
+/*
+* make_colormask()
+*
+* Sets every entry of data.colormask (data.nseqs rows by data.ncols
+* columns) to DEFAULT_COLOR.  When more than one color is defined
+* (ncolors > 1) a consensus is built with init_consensus() and each entry
+* is then set from residue_color() for the residue in data.lines and the
+* consensus character of its column.
+*
+* NOTE(review): "data" is passed by value, so the assignment to
+* data.consensus is not visible to the caller, and the colormask writes
+* only reach the caller if colormask is a pointer member - confirm against
+* the definition of panel_data and the ownership of the string returned by
+* init_consensus().
+*/
+void make_colormask(panel_data data)
+{
+ int i,j;
+
+ for(i=0;i<data.nseqs;i++)
+ for(j=0;j<data.ncols;j++)
+ data.colormask[i][j] = DEFAULT_COLOR;
+
+ if (ncolors > 1)
+ {
+ data.consensus=init_consensus(data);
+
+ for(i=0;i<data.nseqs;i++)
+ for(j=0;j<data.ncols;j++)
+ data.colormask[i][j] = residue_color(data.lines[i][j],data.consensus[j]);
+
+ }
+}
+
+/*
+* init_color_lut()
+*
+* Fills the color look-up table color_lut[] and sets ncolors.
+* Entry 0 is the default color: "BLACK" (0.4, 0.4, 0.4), or "WHITE"
+* (1.0, 1.0, 1.0) when "inverted" is set.  Further entries are read from
+* the lines following an "@rgbindex" line in fd (name r g b, up to
+* MAXCOLORS entries in total, ending at the next line starting with '@').
+* When fd is NULL or yields no entries, the built-in def_color_lut palette
+* is used instead.
+*
+* NOTE(review): the color name is copied with strcpy(); the size of the
+* destination field is not visible here - confirm it can hold the names
+* accepted from the file.
+*/
+static void init_color_lut(FILE *fd)
+{
+    char sinline[1025];
+    char *args[10];
+    int i,numargs;
+    size_t len;
+    Boolean found=FALSE;
+
+    if (inverted==FALSE)
+    {
+        strcpy(color_lut[0].name,"BLACK");
+        color_lut[0].r=0.4;
+        color_lut[0].g=0.4;
+        color_lut[0].b=0.4;
+    }
+    else
+    {
+        strcpy(color_lut[0].name,"WHITE");
+        color_lut[0].r=1.0;
+        color_lut[0].g=1.0;
+        color_lut[0].b=1.0;
+    }
+    SelectColor(color_lut[0].r*255, color_lut[0].g*255, color_lut[0].b*255);
+    color_lut[0].val=GetColor();
+
+    ncolors=1;
+    if (fd != NULL)
+    {
+        /* look for the start of the rgb index */
+        for (;fgets(sinline,1024,fd)!=NULL;)
+        {
+            /* strip the newline only when there is one: the last line of
+               a file may lack it, and an empty string has no last
+               character to remove */
+            len = strlen(sinline);
+            if (len > 0 && sinline[len-1] == '\n')
+                sinline[len-1] = '\0';
+            if (strcmp(sinline,"@rgbindex")==0)
+            {
+                found = TRUE;
+                break;
+            }
+        }
+    }
+    if (found == TRUE)
+    {
+        for (;fgets(sinline,1024,fd)!=NULL;)
+        {
+            if (commentline(sinline)) continue;
+            if (sinline[0]=='@') break;
+            numargs = getargs(sinline, args, 4);
+            if (numargs != 4)
+            {
+                error("Problem in color rgb index - line %d\n",ncolors+1);
+                break;
+            }
+            else
+            {
+                strcpy(color_lut[ncolors].name, args[0]);
+                color_lut[ncolors].r=atof(args[1]);
+                color_lut[ncolors].g=atof(args[2]);
+                color_lut[ncolors].b=atof(args[3]);
+                SelectColor(color_lut[ncolors].r*255, color_lut[ncolors].g*255, color_lut[ncolors].b*255);
+                color_lut[ncolors].val=GetColor();
+                ncolors++;
+                if (ncolors>=MAXCOLORS)
+                {
+                    warning("Only using first %d colors in rgb index.",MAXCOLORS);
+                    break;
+                }
+            }
+        }
+
+    }
+
+/* if we can't find a table, use the hard-coded colors */
+    if (ncolors==1)
+    {
+        ncolors=MAX_NCOLORS+1;
+        for(i=1;i<ncolors;i++)
+        {
+            strcpy(color_lut[i].name,def_color_lut[i-1].name);
+            color_lut[i].r=def_color_lut[i-1].r;
+            color_lut[i].g=def_color_lut[i-1].g;
+            color_lut[i].b=def_color_lut[i-1].b;
+            SelectColor(color_lut[i].r*255, color_lut[i].g*255, color_lut[i].b*255);
+            color_lut[i].val=GetColor();
+        }
+    }
+
+}
+
+/*
+* init_color_parameters()
+*
+* Loads the coloring set-up from the parameter file "par_file" (may be
+* NULL).  The color table is initialised through init_color_lut(); the
+* file is then read again from the start, passing the lines of the
+* "@consensus" section to SaveConPara() and those of the "@color" section
+* to SaveColPara().  On return NumConParas, NumColParas and ncolor_pars
+* hold the number of rules stored.
+*
+* When the file cannot be opened, or no color rules were stored, the
+* default groups in def_dnacolor (nucleotide data) or def_aacolor are
+* installed in Col_Par as SIMPLE rules with colors 1..DEF_NCOLORS.
+*
+* NOTE(review): exceeding "maxparas" is only reported through error();
+* reading continues afterwards - confirm that SaveColPara()/SaveConPara()
+* cannot write past the end of Col_Par[]/Con_Par[].
+*/
+void init_color_parameters(char *par_file)
+{
+
+    int i,j,err;
+    char sinline[1025];
+    int maxparas = 50;
+    char inword[MAXPAR][MAXPARLEN];
+    int num_words;
+    int in_consensus=FALSE,in_color=FALSE;
+    int color_found=FALSE;
+    size_t len;
+    FILE *par_fd=NULL;
+
+    if(par_file!=NULL)
+        par_fd=fopen(par_file,"r");
+    if(par_fd==NULL)
+    {
+        info("No color file found - using defaults");
+        ncolor_pars=0;
+    }
+
+    init_color_lut(par_fd);
+    if (par_fd != NULL) rewind(par_fd);
+    if (ncolors==0)
+    {
+        /* do not leave the parameter file open on the early exit */
+        if (par_fd != NULL) fclose(par_fd);
+        return;
+    }
+
+    NumColParas=0;
+    NumConParas=0;
+    if (par_fd != NULL)
+    {
+        for(;fgets(sinline,1024,par_fd) != NULL;)
+        {
+            /* strip the newline only when there is one: the last line of
+               a file may lack it, and an empty string has no last
+               character to remove */
+            len = strlen(sinline);
+            if (len > 0 && sinline[len-1] == '\n')
+                sinline[len-1] = '\0';
+            if (commentline(sinline)) continue;
+            switch(sinline[0])
+            {
+                case '\0':
+                    break;
+                case '@':
+                    /* section header: selects where the following lines go */
+                    if (strcmp((char*)(sinline+1),"consensus")==0)
+                    {
+                        in_consensus = TRUE;
+                        in_color = FALSE;
+                    }
+                    else if (strcmp((char*)(sinline+1),"color")==0)
+                    {
+                        in_consensus = FALSE;
+                        in_color = TRUE;
+                        color_found = TRUE;
+                    }
+                    break;
+                default:
+                    num_words = get_line(sinline,inword);
+                    if (in_consensus == TRUE)
+                    {
+                        err = SaveConPara(inword,num_words,NumConParas);
+                        if (err == 0) NumConParas++;
+                    }
+                    else if (in_color == TRUE)
+                    {
+                        err = SaveColPara(inword,num_words,NumColParas);
+                        if (err == 0) NumColParas++;
+                    }
+
+                    if((NumColParas>maxparas) || (NumConParas>maxparas))
+                        error("Too many parameters in color file");
+
+            }
+        }
+        if (color_found == FALSE)
+        {
+            error("@color not found in parameter file - using defaults\n");
+            ncolor_pars=0;
+        }
+        fclose(par_fd);
+    }
+    ncolor_pars=NumColParas;
+
+/* if no color parameters found, use the default aa groupings */
+    if(ncolor_pars==0)
+    {
+        if (dnaflag)
+        {
+            for(i=0;i<DEF_NCOLORS;i++)
+            {
+                for(j=0;j<strlen(def_dnacolor[i]);j++)
+                {
+                    Col_Par[ncolor_pars].type=SIMPLE;
+                    Col_Par[ncolor_pars].residue=def_dnacolor[i][j];
+                    Col_Par[ncolor_pars].color=i+1;
+                    ncolor_pars++;
+                }
+            }
+        }
+        else
+        {
+            for(i=0;i<DEF_NCOLORS;i++)
+            {
+                for(j=0;j<strlen(def_aacolor[i]);j++)
+                {
+                    Col_Par[ncolor_pars].type=SIMPLE;
+                    Col_Par[ncolor_pars].residue=def_aacolor[i][j];
+                    Col_Par[ncolor_pars].color=i+1;
+                    ncolor_pars++;
+                }
+            }
+        }
+    }
+    NumColParas=ncolor_pars;
+}
+
+/*
+ * Locate a readable copy of def_file.
+ *
+ * The name is tried as given first.  When UNIX is defined the search then
+ * continues in $HOME, in every directory of $PATH and finally in
+ * /usr/share/clustalx and /usr/local/share/clustalx.
+ *
+ * Returns a ckalloc()ed copy of the first name that could be opened for
+ * reading (the caller releases it with ckfree()), or NULL when nothing was
+ * found, def_file is NULL, or the name does not fit in FILENAMELEN bytes.
+ * Candidate names that would not fit in the buffer are skipped.
+ */
+char *find_file(char *def_file)
+{
+    char filename[FILENAMELEN];
+    char *retname;
+    FILE *fd=NULL;
+    Boolean found=FALSE;
+    size_t lf;
+#ifdef UNIX
+    /* added for File System Standards - Francois */
+    static const char share_dirs[]="/usr/share/clustalx:/usr/local/share/clustalx";
+    char *path, *path1, *deb, *fin;
+    size_t ltot;
+    char *home;
+#endif
+
+    if (def_file == NULL)
+        return(NULL);
+    lf=strlen(def_file);
+    if (lf+1 > sizeof(filename))
+        return(NULL);
+
+    strcpy(filename,def_file);
+    fd = fopen(filename,"r");
+    if (fd != NULL)
+        found=TRUE;
+#ifdef UNIX
+    if (found == FALSE)
+    {
+        home = getenv("HOME");
+        /* room needed: home + '/' + name + NUL */
+        if (home != NULL && strlen(home)+1+lf+1 <= sizeof(filename))
+        {
+            sprintf(filename,"%s/%s",home,def_file);
+            fd = fopen(filename,"r");
+            if (fd != NULL)
+                found=TRUE;
+        }
+    }
+    if (found == FALSE)
+    {
+        path=getenv("PATH");    /* list of directories separated by : */
+        if (path == NULL)
+            path="";
+        /* room needed: PATH + ':' + share_dirs (sizeof counts its NUL) */
+        path1=(char *)ckalloc((strlen(path)+1+sizeof(share_dirs))*sizeof(char));
+        strcpy(path1,path);
+        /* the separator keeps the last PATH entry and the first share
+           directory from running together */
+        if (path1[0] != '\0')
+            strcat(path1,":");
+        strcat(path1,share_dirs);
+
+        deb=path1;
+        for (;;)
+        {
+            fin=strchr(deb,':');
+            ltot = (fin != NULL) ? (size_t)(fin-deb) : strlen(deb);
+            /* room needed: directory + '/' + name + NUL */
+            if (ltot+1+lf+1 <= sizeof(filename))
+            {
+                memcpy(filename,deb,ltot);
+                filename[ltot]='/';
+                strcpy(filename+ltot+1,def_file);
+                if ((fd = fopen(filename,"r")) != NULL)
+                {
+                    found=TRUE;
+                    break;
+                }
+            }
+            if (fin == NULL)
+                break;
+            deb=fin+1;
+        }
+        ckfree((void *)path1);
+    }
+#endif
+    if (found == TRUE)
+    {
+        fclose(fd);
+        retname=(char *)ckalloc((strlen(filename)+1)*sizeof(char));
+        strcpy(retname,filename);
+    }
+    else
+        retname=NULL;
+    return(retname);
+}
+
+/*
+ * Build the consensus line for the alignment in `data`.
+ *
+ * Every column starts as '.'.  Each consensus rule in Con_Par is then
+ * applied in turn: the rule's symbol is written when the percentage of
+ * residues matching its cutoff_list (compared case-insensitively, relative
+ * to the number of alphabetic characters in the column) reaches the rule's
+ * cutoff.  A later rule that also passes overwrites an earlier one.
+ *
+ * Returns a ckalloc()ed array of data.ncols+1 characters.
+ */
+static char *init_consensus(panel_data data)
+{
+    char *cons_data;
+    int col,rule,row,k;
+    int n_letters,n_hits;
+    char c;
+
+    cons_data=(char *)ckalloc((data.ncols+1)*sizeof(char));
+
+    for (col=0;col<data.ncols;col++)
+    {
+        cons_data[col] = '.';
+        for (rule=0;rule<NumConParas;rule++)
+        {
+            n_letters = 0;
+            n_hits = 0;
+            for (row=0;row<data.nseqs;row++)
+            {
+                c=tolower(data.lines[row][col]);
+                if (isalpha(c))
+                    n_letters++;
+                for (k=0;k<Con_Par[rule].length;k++)
+                {
+                    if (c==tolower(Con_Par[rule].cutoff_list[k]))
+                        n_hits++;
+                }
+            }
+            /* a column without letters cannot satisfy any rule */
+            if (n_letters == 0)
+                continue;
+            if (((n_hits*100)/n_letters) >= Con_Par[rule].cutoff)
+                cons_data[col] = Con_Par[rule].consensus;
+        }
+    }
+
+    return(cons_data);
+}
+
+/*
+ * Store one parsed "@color" line in Col_Par[count].
+ *
+ * Accepted forms are
+ *     residue = colorname                      (SIMPLE rule)
+ *     residue = colorname if c1 c2 ...         (COMPOUND rule)
+ * where only the first character of the residue word and of each c word is
+ * kept.  A color name missing from color_lut is reported and replaced by
+ * entry 0; when a name occurs more than once the last entry wins.
+ *
+ * Returns 0 on success, or 1 (fewer than three words), 2 (second word does
+ * not start with '=') or 3 (fourth word is not "if"); nothing is stored in
+ * those cases.
+ */
+static int SaveColPara(char word[MAXPAR][MAXPARLEN],int num_words,int count)
+{
+    int k;
+    int color_ix = -1;
+    int is_compound;
+
+    if (num_words < 3)
+    {
+        error("Wrong format in color list");
+        return(1);
+    }
+    if (word[1][0] != '=')
+    {
+        error("Wrong format in color list");
+        return(2);
+    }
+
+    /* anything longer than "residue = color" must continue with "if" */
+    is_compound = (num_words != 3);
+    if (is_compound && strcmp(word[3],"if")!=0)
+    {
+        error("Wrong format in color list");
+        return(3);
+    }
+
+    Col_Par[count].type = is_compound ? COMPOUND : SIMPLE;
+    Col_Par[count].residue = word[0][0];
+
+    /* look the color name up in the rgb index */
+    for (k=0;k<ncolors;k++)
+    {
+        if (strcmp(word[2],color_lut[k].name)==0)
+            color_ix = k;
+    }
+    if (color_ix == -1)
+    {
+        error("%s not found in rgb index - using %s",word[2],color_lut[0].name);
+        color_ix = 0;
+    }
+    Col_Par[count].color = color_ix;
+
+    if (is_compound)
+    {
+        /* words after "if" are the consensus symbols enabling the rule */
+        Col_Par[count].length = num_words - 4;
+        for (k=4;k<num_words;k++)
+            Col_Par[count].cons_list[k-4] = word[k][0];
+    }
+
+    return(0);
+}
+
+
+/*
+ * Store one parsed "@consensus" line in Con_Par[count].
+ *
+ * Expected form:  symbol = NN% r1 r2 ...
+ * The first character of the first word becomes the consensus symbol, the
+ * third word (with every '%' overwritten by NUL, in place) is converted
+ * with atoi() into the cutoff, and the first character of each remaining
+ * word goes into cutoff_list.
+ *
+ * Returns 0 on success, 1 when there are fewer than three words and 2 when
+ * the second word does not start with '='.
+ */
+static int SaveConPara(char word[MAXPAR][MAXPARLEN],int num_words,int count)
+{
+    int k;
+    char *pc;
+
+    if (num_words < 3)
+    {
+        error("Wrong format in consensus list");
+        return(1);
+    }
+    if (word[1][0] != '=')
+    {
+        error("Wrong format in consensus list");
+        return(2);
+    }
+
+    /* cut the percentage word at its '%' sign(s) */
+    for (pc=word[2];pc<word[2]+(MAXPARLEN-1);pc++)
+    {
+        if (*pc=='%')
+            *pc = '\0';
+    }
+
+    Con_Par[count].consensus = word[0][0];
+    Con_Par[count].cutoff = atoi(word[2]);
+    Con_Par[count].length = num_words - 3;
+    for (k=0;k<num_words-3;k++)
+        Con_Par[count].cutoff_list[k] = word[k+3][0];
+
+    return(0);
+}
+
+/*
+ * Split sinline into words separated by blanks, tabs or ':'.
+ *
+ * The words are copied, NUL-terminated, into word[0], word[1], ...; the
+ * whole array is cleared first.  At most MAXPAR words are produced and each
+ * is truncated to MAXPARLEN-1 characters, so an over-long line can no longer
+ * write outside the array.
+ *
+ * Returns the number of words stored.
+ */
+static int get_line(char *sinline,char word[MAXPAR][MAXPARLEN])
+{
+    int i, word_count=0, char_count=0;
+    int in_word=FALSE;
+    char c;
+
+    memset(word,0,sizeof(char)*MAXPAR*MAXPARLEN);
+
+    for (i=0;;i++)
+    {
+        c=sinline[i];
+        if (c==' ' || c=='\t' || c==':' || c=='\0')
+        {
+            if (in_word)
+            {
+                word[word_count][char_count] = '\0';
+                word_count++;
+                char_count = 0;
+                in_word = FALSE;
+            }
+            /* stop at the end of the line or when the table is full */
+            if (c=='\0' || word_count>=MAXPAR)
+                break;
+        }
+        else
+        {
+            in_word = TRUE;
+            /* keep one byte free for the terminator */
+            if (char_count < MAXPARLEN-1)
+            {
+                word[word_count][char_count] = c;
+                char_count++;
+            }
+        }
+    }
+    return(word_count);
+}
+
+/*
+ * Return the color index for residue `res` in a column whose consensus
+ * symbol is `consensus`.
+ *
+ * The rules in Col_Par are scanned in order; residues are compared without
+ * regard to case.  A SIMPLE rule for the residue decides immediately.  A
+ * COMPOUND rule applies only when the consensus symbol is in its cons_list,
+ * otherwise the scan continues with the next rule.  DEFAULT_COLOR is
+ * returned when no rule applies or a rule has an unknown type.
+ */
+static int residue_color(char res,char consensus)
+{
+    int rule,k;
+
+    for (rule=0;rule<NumColParas;rule++)
+    {
+        if (tolower(res) != tolower(Col_Par[rule].residue))
+            continue;
+
+        if (Col_Par[rule].type == SIMPLE)
+            return(Col_Par[rule].color);
+        if (Col_Par[rule].type != COMPOUND)
+            return(DEFAULT_COLOR);
+
+        for (k=0;k<Col_Par[rule].length;k++)
+        {
+            if (consensus == Col_Par[rule].cons_list[k])
+                return(Col_Par[rule].color);
+        }
+    }
+    return(DEFAULT_COLOR);
+}
+
+/*
+ * Tell whether `line` carries no data: returns TRUE when it starts with '#'
+ * or holds only white space up to the first newline or terminator, FALSE
+ * otherwise.
+ */
+static Boolean commentline(char *line)
+{
+    int i;
+
+    if (line[0] == '#') return TRUE;
+    for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
+        /* isspace() is only defined for unsigned char values and EOF */
+        if( !isspace((unsigned char)line[i]) )
+            return FALSE;
+    }
+    return TRUE;
+}
+
+int block_height,block_left,block_top; /* print block currently being written by write_ps_file: its height, the x of its first residue column, the y of its top line */
+int header_top,seq_top,footer_top,curve_top; /* y origins used by print_header_line, print_seq_line, print_footer_line and print_quality_curve */
+
+/*
+ * Write the alignment held in panel `p` to `ps_file` as PostScript.
+ *
+ *   par_file        printer colour file handed to init_printer_lut()
+ *   pagesize        A4, A3 or USLETTER
+ *   orientation     PORTRAIT; any other value swaps the page dimensions and
+ *                   rotates the output by -90 degrees
+ *   header          print the title / file / date / page banner
+ *   ruler           when FALSE one footer line fewer is printed
+ *   resno           print a residue count after each sequence line
+ *   resize          scale the output to the page
+ *   first_printres,
+ *   last_printres   1-based residue range; out-of-range values are replaced
+ *                   by the ends of the alignment
+ *   blength         residues per block, 0 for one block over the range
+ *   show_curve      draw the column score curve below each block
+ *
+ * With resize FALSE, or when the range fits in a single block, the
+ * sequences are split over as many pages as needed.  Otherwise the range is
+ * cut into blocks of blocklen residues and scaled so that whole blocks fit
+ * on a page.
+ *
+ * Problems are reported through error(); the output file and the residue
+ * counters are released on every exit path.
+ */
+void write_ps_file(spanel p,char *ps_file,char *par_file,int pagesize,
+int orientation,Boolean header, Boolean ruler, Boolean resno, Boolean resize,
+int first_printres,int last_printres,
+int blength,Boolean show_curve)
+{
+    int i,j,bn,seq,numseqs;
+    int err;
+    int blocklen,numpages;
+    int fr,lr;
+    int page,row;
+    int ps_rotation=0,ps_xtrans=0,ps_ytrans=0;
+    float ps_scale,hscale,wscale;
+    int maxseq;
+    int maxx=0,maxy=0;
+    int known_size=TRUE;
+    int score_height=0;
+    int main_header=0;
+    int numelines,numecols;
+    int nhead,nfoot;
+    int ppix_width;    /* width of the page in pixels */
+    int pchar_height;  /* height of the page in chars for sequences */
+    int ppix_height;   /* height of the page in pixels for sequences */
+    int blocksperpage,numblocks;
+    int *res_number=NULL;
+    panel_data name_data,seq_data;
+    FILE *fd;
+
+    time_t ttime;
+    char *str_time;
+
+/* open the output file */
+    if ((fd=fopen(ps_file,"w"))==NULL)
+    {
+        error("Cannot open file %s",ps_file);
+        return;
+    }
+
+/* check for printer-specific rgb values */
+    err=init_printer_lut(par_file);
+    if(err>0) warning("No PS Colors file: using default colors\n");
+
+/* get the page size parameters */
+    if (pagesize==A4)
+    {
+        maxx=A4X;
+        maxy=A4Y;
+    }
+    else if (pagesize==A3)
+    {
+        maxx=A3X;
+        maxy=A3Y;
+    }
+    else if (pagesize==USLETTER)
+    {
+        maxx=USLETTERX;
+        maxy=USLETTERY;
+    }
+    else
+        known_size=FALSE;
+    if (known_size && orientation!=PORTRAIT)
+    {
+        /* landscape: swap the dimensions and rotate */
+        i=maxx;
+        maxx=maxy;
+        maxy=i;
+        ps_rotation=-90;
+    }
+
+    if(show_curve) score_height=SCOREY;
+    if(header) main_header=HEADER;
+    else main_header=NOHEADER;
+    ppix_width=maxx-LEFTMARGIN*2;
+    ppix_height=maxy-main_header*CHARHEIGHT;
+
+/* get the name data */
+    GetPanelExtra(p.names,&name_data);
+
+/* get the sequence data */
+    GetPanelExtra(p.seqs,&seq_data);
+    numseqs=seq_data.nseqs;
+    nhead=seq_data.nhead;
+    if(ruler)
+        nfoot=seq_data.nfoot;
+    else
+        nfoot=seq_data.nfoot-1;
+    numelines=nhead+nfoot+score_height+SEPARATION;
+
+/* check the block length, residue range parameters */
+    if(first_printres<=0)
+        first_printres=1;
+    if((last_printres<=0) || (last_printres>seq_data.ncols))
+        last_printres=seq_data.ncols;
+    if(first_printres>last_printres)
+    {
+        error("Bad residue range - cannot write postscript");
+        goto cleanup;
+    }
+    if (blength==0 || last_printres-first_printres+1<blength)
+        blocklen=last_printres-first_printres+1;
+    else
+        blocklen=blength;
+
+/* count the residues that precede the printed range in each sequence */
+    res_number=(int *)ckalloc((name_data.nseqs+1)*sizeof(int));
+    for(i=0;i<name_data.nseqs;i++)
+    {
+        res_number[i]=0;
+        for(j=0;j<first_printres-1;j++)
+            if(isalpha(seq_data.lines[i][j])) res_number[i]++;
+    }
+    if(resno)
+        numecols=MAXRESNO+1+max_names;
+    else
+        numecols=1+max_names;
+
+/* print out the PS revision level etc. */
+    ttime = time(NULL);
+    str_time = ctime(&ttime);
+    if (str_time==NULL)
+        str_time="";
+    print_ps_info(fd,pagesize);
+
+/* calculate scaling factors, block sizes to fit the page etc. */
+
+    if (resize==FALSE || blocklen==last_printres-first_printres+1)
+    {
+/* split the alignment into blocks of sequences. If the blocks are too long
+for the page - tough! */
+        if(resize==FALSE)
+            ps_scale=1.0;
+        else
+            ps_scale=(float)ppix_width/(float)((blocklen+numecols)*CHARWIDTH);
+        if (ps_scale<=0.0)
+        {
+            error("illegal combination of print parameters");
+            goto cleanup;
+        }
+        ps_xtrans= LEFTMARGIN * (1-ps_scale);
+        ps_ytrans= ppix_height * (1-ps_scale);
+        if (pagesize!=A3 && orientation==LANDSCAPE)
+            ps_xtrans-=LEFTMARGIN;
+
+        pchar_height=((maxy/CHARHEIGHT)-main_header)/ps_scale;
+        maxseq=pchar_height-numelines;
+        if (maxseq<=0)
+        {
+            /* no room for even one sequence line per page */
+            error("illegal combination of print parameters");
+            goto cleanup;
+        }
+        block_height = (maxseq+numelines) * CHARHEIGHT;
+        /* round up: an exact multiple of maxseq must not add an empty page */
+        numpages = (numseqs+maxseq-1)/maxseq;
+        if (numpages<1)
+            numpages=1;
+        seq=0;
+        for (page=0;page<numpages;page++)
+        {
+/* print the top of page header */
+            print_page_header(fd,ps_rotation,maxx,maxy,
+                page,numpages,header,str_time,
+                ps_file,ps_xtrans,ps_ytrans,ps_scale);
+
+            block_top = maxy - main_header*CHARHEIGHT;
+            block_left = LEFTMARGIN + (1+max_names)*CHARWIDTH;
+            header_top = block_top;
+
+            fr=first_printres-1;
+            lr=last_printres-1;
+/* show the header lines */
+            for (i=0;i<nhead;i++)
+                print_header_line(fd,name_data,seq_data,i,fr,lr);
+
+            seq_top = block_top-nhead*CHARHEIGHT;
+/* show the sequence lines; row ends up as the number of lines printed */
+            for (row=0;row<maxseq && seq<numseqs;row++)
+            {
+                if(resno)
+                {
+                    for(i=fr;i<=lr;i++)
+                        if(isalpha(seq_data.lines[seq][i]))
+                            res_number[seq]++;
+                }
+                /* a count of 0 suppresses the number in print_seq_line */
+                print_seq_line(fd,name_data,seq_data,row,seq,fr,lr,
+                    resno ? res_number[seq] : 0);
+                seq++;
+            }
+
+            footer_top = seq_top-row*CHARHEIGHT;
+/* show the footer lines */
+            for (i=0;i<nfoot;i++)
+                print_footer_line(fd,name_data,seq_data,i,fr,lr);
+
+            curve_top = footer_top-nfoot*CHARHEIGHT;
+/* show the quality curve */
+            if(show_curve)
+                print_quality_curve(fd,seq_data,fr,lr,score_height);
+
+            fprintf(fd,"\nshowpage\n");
+            fprintf(fd,"restore\n");
+        }
+    }
+    else
+    {
+/* split the alignment into blocks of residues, and scale the blocks to fit the page */
+        if (numseqs<=0)
+        {
+            error("illegal combination of print parameters");
+            goto cleanup;
+        }
+        maxseq=ppix_height/CHARHEIGHT-numelines-main_header;
+        hscale=(float)maxseq/(float)numseqs;
+        wscale=(float)ppix_width/(float)((blocklen+numecols)*CHARWIDTH);
+        ps_scale=MIN(hscale,wscale);
+        if (ps_scale<=0.0)
+        {
+            error("illegal combination of print parameters");
+            goto cleanup;
+        }
+        ps_xtrans= LEFTMARGIN * (1-ps_scale);
+        ps_ytrans= ppix_height * (1-ps_scale);
+        if (pagesize!=A3 && orientation==LANDSCAPE)
+            ps_xtrans-=LEFTMARGIN;
+
+        pchar_height=((maxy/CHARHEIGHT)-main_header)/ps_scale;
+        maxseq=pchar_height-numelines;
+        block_height = (numseqs+numelines) * CHARHEIGHT;
+        blocksperpage = pchar_height/(numseqs+numelines);
+        if (blocksperpage<=0)
+        {
+            error("illegal combination of print parameters");
+            goto cleanup;
+        }
+        numblocks = (last_printres-first_printres) / blocklen + 1;
+        if (numblocks % blocksperpage == 0)
+            numpages = numblocks / blocksperpage;
+        else
+            numpages = numblocks / blocksperpage + 1;
+
+        for (bn=0;bn<numblocks;bn++)
+        {
+            page = bn / blocksperpage;
+/* print the top of page header */
+            if (bn % blocksperpage == 0)
+                print_page_header(fd,ps_rotation,maxx,maxy,
+                    page,numpages,header,str_time,
+                    ps_file,ps_xtrans,ps_ytrans,ps_scale);
+
+            block_top = maxy - main_header*CHARHEIGHT-block_height*(bn%blocksperpage);
+            block_left = LEFTMARGIN + (1+max_names)*CHARWIDTH;
+            header_top = block_top;
+            seq_top = block_top-nhead*CHARHEIGHT;
+            footer_top = block_top-(nhead+numseqs)*CHARHEIGHT;
+            curve_top = block_top-(nhead+numseqs+nfoot)*CHARHEIGHT;
+
+            fr=first_printres-1 + blocklen*bn;
+            lr=fr+blocklen-1;
+            if(lr>=last_printres) lr=last_printres-1;
+/* show the header lines */
+            for (i=0;i<nhead;i++)
+                print_header_line(fd,name_data,seq_data,i,fr,lr);
+
+/* show the sequence lines */
+            for (i=0;i<numseqs;i++)
+            {
+                row = i % maxseq;
+                if(resno)
+                {
+                    for(j=fr;j<=lr;j++)
+                        if(isalpha(seq_data.lines[i][j]))
+                            res_number[i]++;
+                }
+                /* a count of 0 suppresses the number in print_seq_line */
+                print_seq_line(fd,name_data,seq_data,row,i,fr,lr,
+                    resno ? res_number[i] : 0);
+            }
+/* show the footer lines */
+            for (i=0;i<nfoot;i++)
+                print_footer_line(fd,name_data,seq_data,i,fr,lr);
+
+/* show the quality curve */
+            if(show_curve)
+                print_quality_curve(fd,seq_data,fr,lr,score_height);
+
+            if ((bn == (numblocks-1)) || ((bn % blocksperpage == blocksperpage-1)))
+            {
+                fprintf(fd,"\nshowpage\n");
+                fprintf(fd,"restore\n");
+            }
+        }
+    }
+
+cleanup:
+    if (res_number!=NULL)
+        ckfree((void *)res_number);
+    fclose(fd);
+    return;
+}
+
+/*
+ * Set the printer rgb values (pr, pg, pb) of every entry in color_lut.
+ *
+ * The printer values are first reset to the screen values (r, g, b).  When
+ * `filename` names a printer colour file that find_file() can locate, each
+ * non-comment line "name r g b" overrides the printer values of the entries
+ * with that name.  Reading stops at the first line that does not split into
+ * four fields.
+ *
+ * Returns 1 when no file was given, found or opened, otherwise 0.
+ */
+static int init_printer_lut(char *filename)
+{
+    FILE *fd;
+    char sinline[1025];
+    char *args[10];
+    int i,numargs;
+    int lineno=0;
+    char *par_file=NULL;
+
+/* reset the printer rgb colors to the color file rgb values */
+    for(i=0;i<ncolors;i++)
+    {
+        color_lut[i].pr=color_lut[i].r;
+        color_lut[i].pg=color_lut[i].g;
+        color_lut[i].pb=color_lut[i].b;
+    }
+
+/* search for the printer color file */
+    if(filename==NULL || filename[0]==EOS) return 1;
+    par_file=find_file(filename);
+    if(par_file==NULL)
+    {
+        error("Cannot find printer file %s",filename);
+        return 1;
+    }
+    if ((fd=fopen(par_file,"r"))==NULL)
+    {
+        error("Cannot open printer file %s",par_file);
+        ckfree((void *)par_file);
+        return 1;
+    }
+
+    while (fgets(sinline,1024,fd)!=NULL)
+    {
+        lineno++;
+        if (commentline(sinline)) continue;
+        numargs = getargs(sinline, args, 4);
+        if (numargs != 4)
+        {
+            /* report the position in the file, not the color count */
+            error("Problem in parameter file - line %d\n",lineno);
+            break;
+        }
+/* we've found a color - find the index the color lut; the name is compared
+   in place so that its length is not limited by a local buffer */
+        for(i=0;i<ncolors;i++)
+        {
+            if(strcmp(args[0],color_lut[i].name)==0)
+            {
+                color_lut[i].pr=atof(args[1]);
+                color_lut[i].pg=atof(args[2]);
+                color_lut[i].pb=atof(args[3]);
+            }
+        }
+    }
+    fclose(fd);
+    ckfree((void *)par_file);
+    return 0;
+}
+
+/*
+ * Write the PostScript prologue to fd: the document comments followed by
+ * the definitions of the box, color_char, cbox, color_inv and white_inv
+ * procedures used by the other print_* routines.  For A3 pages the
+ * "statusdict begin a3 end" line is added.
+ */
+static void print_ps_info(FILE *fd,int pagesize)
+{
+    /* document header comments */
+    fprintf(fd,
+        "%%!PS-Adobe-1.0\n"
+        "%%%%Creator: Julie Thompson\n"
+        "%%%%Title:ClustalX Alignment\n"
+        "%%%%EndComments\n");
+
+    /* box: path around one character cell */
+    fprintf(fd,
+        "/box { newpath\n"
+        "\t-0 -3 moveto\n"
+        "\t-0 %d lineto\n"
+        "\t%d %d lineto\n"
+        "\t%d -3 lineto\n"
+        "\tclosepath\n"
+        " } def\n\n",
+        CHARHEIGHT-3,
+        CHARWIDTH,CHARHEIGHT-3,
+        CHARWIDTH);
+
+    fprintf(fd,
+        "/color_char { gsave\n"
+        "\tsetrgbcolor\n"
+        "\tmoveto\n"
+        "\tshow\n"
+        "\tgrestore\n"
+        " } def\n\n");
+
+    fprintf(fd,
+        "/cbox { gsave\n"
+        "\ttranslate\n"
+        "\tnewpath\n"
+        "\t0 0 moveto\n"
+        "\tlineto\n"
+        "\tlineto\n"
+        "\tlineto\n"
+        "\tclosepath\n"
+        "\tfill\n"
+        "\tgrestore\n"
+        " } def\n\n");
+
+    fprintf(fd,
+        "/color_inv { gsave\n"
+        "\tsetrgbcolor\n"
+        "\ttranslate\n"
+        "\tbox fill\n"
+        "\tgrestore\n"
+        "\tmoveto\n"
+        "\tshow\n"
+        " } def\n\n");
+
+    fprintf(fd,
+        "/white_inv { gsave\n"
+        "\tsetrgbcolor\n"
+        "\ttranslate\n"
+        "\tbox fill\n"
+        "\tgrestore\n"
+        "\tgsave\n"
+        "\tsetrgbcolor\n"
+        "\tmoveto\n"
+        "\tshow\n"
+        "\tgrestore\n"
+        " } def\n\n");
+
+/* For canon color printer, use a3tray instead of a3!! */
+    if (pagesize==A3)
+        fprintf(fd,"statusdict begin a3 end\n\n");
+}
+
+/*
+ * Start a new PostScript page on fd.
+ *
+ * Emits the %%Page comment and "save", rotates the page when ps_rotation is
+ * -90, optionally prints the banner (program title, output file name, date
+ * and "Page n of m"), then sets the translation, the scale and the
+ * Courier-Bold font used for the alignment itself.
+ *
+ * `page` is 0-based.  `str_time` is the date text; a trailing newline, as
+ * produced by ctime(), is removed so that it is neither printed nor counted
+ * in the width used for right alignment.
+ */
+static void print_page_header(FILE *fd,int ps_rotation,int maxx,int maxy,
+int page,int numpages,Boolean header,char *str_time,
+char *ps_file,int ps_xtrans,int ps_ytrans,float ps_scale)
+{
+    int ps_x,ps_y;
+    char tstr[50];
+    size_t len;
+    const char *pc;
+
+    fprintf(fd,"%%%%Page: P%d\n",page);
+    fprintf(fd,"save\n\n");
+
+    if (ps_rotation==-90)
+    {
+        fprintf(fd,"0 %d translate\n",maxx);
+        fprintf(fd,"%d rotate\n",ps_rotation);
+    }
+
+    if (header)
+    {
+        /* 36 fixed characters + at most 13 from revision_level + NUL
+           fill tstr exactly */
+        sprintf(tstr,"CLUSTAL %.13s MULTIPLE SEQUENCE ALIGNMENT",revision_level);
+        /* signed arithmetic: a narrow page must not wrap around */
+        ps_x = (maxx-(int)strlen(tstr)*10)/2;
+        ps_y = maxy - 2*CHARHEIGHT;
+        fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
+        fprintf(fd,"/Times-Bold findfont 14 scalefont setfont\n");
+        fprintf(fd,"(%s) show\n\n",tstr);
+
+        ps_x = 20;
+        ps_y = maxy - 4*CHARHEIGHT;
+        fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
+        /* parentheses and backslashes are special inside a PostScript
+           string and have to be escaped */
+        fprintf(fd,"(File: ");
+        for (pc=(ps_file!=NULL ? ps_file : "");*pc!='\0';pc++)
+        {
+            if (*pc=='(' || *pc==')' || *pc=='\\')
+                fputc('\\',fd);
+            fputc(*pc,fd);
+        }
+        fprintf(fd,") show\n\n");
+
+        /* 6 fixed characters + at most 43 from str_time + NUL */
+        sprintf(tstr,"Date: %.43s",(str_time!=NULL ? str_time : ""));
+        len=strlen(tstr);
+        while (len>0 && tstr[len-1]=='\n')
+            tstr[--len]='\0';
+        ps_x = maxx-(int)len*8-20;
+        ps_y = maxy - 4*CHARHEIGHT;
+        fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
+        fprintf(fd,"(%s) show\n\n",tstr);
+
+        sprintf(tstr,"Page %d of %d",page+1,numpages);
+        ps_x = 20;
+        ps_y = maxy - 5*CHARHEIGHT-4;
+        fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
+        fprintf(fd,"(%s) show\n\n",tstr);
+    }
+    fprintf(fd,"%d %d translate\n",ps_xtrans,ps_ytrans);
+    fprintf(fd,"%#3.2f %#3.2f scale\n",ps_scale,ps_scale);
+    fprintf(fd,"/Courier-Bold findfont 10 scalefont setfont\n");
+}
+
+/*
+ * Print header line `ix` of the current block: its label from
+ * name_data.header, right-justified in max_names columns at the left
+ * margin, then columns fr..lr of seq_data.header drawn one character per
+ * cell on a white background through color_inv.
+ */
+static void print_header_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int ix,int fr,int lr)
+{
+    int col;
+    int x;
+    int y = header_top - (ix * CHARHEIGHT);
+
+    fprintf(fd,"%d %d moveto\n",LEFTMARGIN,y);
+    fprintf(fd,"(%*s ) show\n",max_names,name_data.header[ix]);
+
+    col=fr;
+    while (col<=lr)
+    {
+        x = block_left + (col-fr) * CHARWIDTH;
+        fprintf(fd,"(");
+        fprintf(fd,"%c",seq_data.header[ix][col]);
+        fprintf(fd,") ");
+        fprintf(fd,"%d %d %d %d 1.0 1.0 1.0 color_inv\n",x,y,x,y);
+        col++;
+    }
+    fprintf(fd,"\n");
+}
+
+/*
+ * Print footer line `ix` of the current block: its label from
+ * name_data.footer, right-justified in max_names columns at the left
+ * margin, then columns fr..lr of seq_data.footer drawn one character per
+ * cell on a white background through color_inv.
+ */
+static void print_footer_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int ix,int fr,int lr)
+{
+    int col;
+    int x;
+    int y = footer_top - (ix * CHARHEIGHT);
+
+    fprintf(fd,"%d %d moveto\n",LEFTMARGIN,y);
+    fprintf(fd,"(%*s ) show\n",max_names,name_data.footer[ix]);
+
+    col=fr;
+    while (col<=lr)
+    {
+        x = block_left + (col-fr) * CHARWIDTH;
+        fprintf(fd,"(");
+        fprintf(fd,"%c",seq_data.footer[ix][col]);
+        fprintf(fd,") ");
+        fprintf(fd,"%d %d %d %d 1.0 1.0 1.0 color_inv\n",x,y,x,y);
+        col++;
+    }
+    fprintf(fd,"\n");
+}
+
+/*
+ * Draw the column score histogram of the current block in dark grey.
+ *
+ * One bar of width CHARWIDTH is drawn with cbox for each column from fr+1
+ * to lr, starting one cell to the right of block_left.  A bar's height is
+ * the column's colscore taken as a percentage of score_height character
+ * rows, with a minimum of 1.  The color is set back to black afterwards.
+ */
+static void print_quality_curve(FILE *fd,panel_data seq_data,
+int fr,int lr,int score_height)
+{
+    int col;
+    int bar_w,bar_h;
+    int x,base_y;
+
+    bar_w=CHARWIDTH;
+    base_y=curve_top-score_height*CHARHEIGHT;
+
+    fprintf(fd,"0.3 0.3 0.3 setrgbcolor\n");
+    for (col=fr+1, x=block_left+CHARWIDTH; col<=lr; col++, x+=CHARWIDTH)
+    {
+        fprintf(fd,"%d %d moveto\n",x,base_y);
+        bar_h=score_height*CHARHEIGHT*((float)seq_data.colscore[col]/100.0);
+        if (bar_h<1)
+            bar_h=1;
+        fprintf(fd,"%d 0 %d %d 0 %d %d %d cbox\n",bar_w,bar_w,bar_h,bar_h,x,base_y);
+    }
+    fprintf(fd,"0.0 0.0 0.0 setrgbcolor\n");
+}
+
+/*
+ * Print sequence `seq` on line `row` of the current block.
+ *
+ * The name is written right-justified in max_names columns at the left
+ * margin, followed by residues fr..lr, one per cell:
+ *   - flagged in segment_exception (and segment_exceptions is set):
+ *     white_inv with the values 1.0 and 0.1;
+ *   - flagged in residue_exception (and residue_exceptions is set):
+ *     white_inv with the values 1.0 and 0.4;
+ *   - otherwise in the printer color selected by colormask, as a filled
+ *     cell (color_inv) when `inverted` is set, else as a colored letter
+ *     (color_char).
+ * When res_number is greater than 0 it is printed after the last residue in
+ * a field of MAXRESNO characters.
+ */
+static void print_seq_line(FILE *fd,panel_data name_data, panel_data seq_data,
+int row,int seq,int fr,int lr,int res_number)
+{
+    int col,lut_ix;
+    int x;
+    int y = seq_top - (row * CHARHEIGHT);
+    float red, green, blue;
+
+    fprintf(fd,"%d %d moveto\n",LEFTMARGIN,y);
+    fprintf(fd,"(%*s ) show\n",max_names,name_data.lines[seq]);
+
+    for (col=fr;col<=lr;col++)
+    {
+        lut_ix = seq_data.colormask[seq][col];
+        red = color_lut[lut_ix].pr;
+        green = color_lut[lut_ix].pg;
+        blue = color_lut[lut_ix].pb;
+        x = block_left + (col-fr) * CHARWIDTH;
+
+        fprintf(fd,"(");
+        fprintf(fd,"%c",seq_data.lines[seq][col]);
+        fprintf(fd,") ");
+
+        if (segment_exceptions && seq_data.segment_exception[seq][col] > 0)
+        {
+            fprintf(fd,"%d %d %1.1f %1.1f %1.1f %d %d %1.1f %1.1f %1.1f white_inv\n",
+                x,y,1.0,1.0,1.0,x,y,0.1,0.1,0.1);
+            continue;
+        }
+        if (residue_exceptions && seq_data.residue_exception[seq][col] == TRUE)
+        {
+            fprintf(fd,"%d %d %1.1f %1.1f %1.1f %d %d %1.1f %1.1f %1.1f white_inv\n",
+                x,y,1.0,1.0,1.0,x,y,0.4,0.4,0.4);
+            continue;
+        }
+
+        if (inverted)
+            fprintf(fd,"%d %d %d %d %1.1f %1.1f %1.1f color_inv\n",
+                x,y,x,y,red,green,blue);
+        else
+            fprintf(fd,"%d %d %1.1f %1.1f %1.1f color_char\n",
+                x,y,red,green,blue);
+    }
+
+    if (res_number>0)
+    {
+        /* the count goes in the cell right after the last residue */
+        x = block_left + (lr-fr+1) * CHARWIDTH;
+        fprintf(fd,"%d %d moveto\n",x,y);
+        fprintf(fd,"(%*d) show\n",MAXRESNO,res_number);
+    }
+    fprintf(fd,"\n");
+}
Added: trunk/packages/clustalw/branches/upstream/current/xdisplay.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xdisplay.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xdisplay.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,2191 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include <vibrant.h>
+#include <document.h>
+
+#include "clustalw.h"
+#include "xmenu.h"
+
+static void VscrollMulti(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollMultiN(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollMultiS(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void VscrollPrf1(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollPrf1N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollPrf1S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void VscrollPrf2(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollPrf2N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+static void HscrollPrf2S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+
+static void NameClick(PaneL panel, PoinT pt);
+static void NameDrag(PaneL panel, PoinT pt);
+static void NameRelease(PaneL panel, PoinT pt);
+static void SeqClick(PaneL panel, PoinT pt);
+static void SeqDrag(PaneL panel, PoinT pt);
+static void SeqRelease(PaneL panel, PoinT pt);
+
+static void fit_seq_display(RecT wr,Boolean mv_message);
+static void fit_prf_displays(RecT wr,int numseqs1,int numseqs2,Boolean mv_message);
+
+static void vscrollnames(BaR bar, int newval, int oldval);
+static void hscrollnames(BaR bar, int newval, int oldval);
+static void vscrollseqs(BaR bar, int newval, int oldval);
+static void hscrollseqs(BaR bar, int newval, int oldval);
+
+static void correct_scrollbar(BaR b,int visible,int total,int value,Boolean reset);
+
+static PaneL make_panel(int type,GrouP g,int width,int height,int firstline,int tlines);
+static panel_data free_panel_data(panel_data data);
+static panel_data alloc_name_data(panel_data data);
+static panel_data alloc_seq_data(panel_data data);
+
+extern sint max_names;
+
+extern int mheader; /* maximum header lines */
+extern int mfooter; /* maximum footer lines */
+extern int max_plines; /* profile align display length */
+extern int min_plines1; /* profile align display length */
+extern int min_plines2; /* profile align display length */
+extern int loffset,boffset,toffset;
+extern int roffset;
+extern int poffset;
+
+extern Boolean aln_mode;
+extern Boolean fixed_prf_scroll;
+extern Boolean window_displayed;
+
+extern PrompT message; /* used in temporary message window */
+
+extern spanel seq_panel; /* data for multiple alignment area */
+extern spanel prf_panel[]; /* data for profile alignment areas */
+extern spanel active_panel; /* 'in-use' panel -scrolling,clicking etc. */
+extern FonT datafont;
+extern WindoW mainw;
+extern GrouP seq_display,prf1_display,prf2_display;
+
+extern int ncolors;
+extern int inverted;
+
+extern Boolean dnaflag;
+extern sint nseqs;
+extern sint profile1_nseqs;
+extern sint output_order;
+extern sint *output_index;
+extern sint *seqlen_array;
+extern char **seq_array;
+extern char **names, **titles;
+extern char *amino_acid_codes;
+extern sint gap_pos1, gap_pos2;
+extern char *gap_penalty_mask1,*gap_penalty_mask2;
+extern char *sec_struct_mask1,*sec_struct_mask2;
+extern sint struct_penalties1,struct_penalties2;
+extern sint output_struct_penalties;
+extern Boolean use_ss1, use_ss2;
+
+extern char *explicit_par_file;
+extern char *par_file;
+extern char def_protpar_file[];
+extern char def_dnapar_file[];
+extern sint ncutseqs;
+extern Boolean residue_exceptions;
+extern Boolean segment_exceptions;
+extern color color_lut[];
+extern char *res_cat1[];
+extern char *res_cat2[];
+
+static range selected_seqs; /* sequences selected by clicking on names */
+static range selected_res; /* residues selected by clicking on seqs */
+
+
+static int fromvscroll,fromhscroll; /* set by scrolling functions,
+ used by DrawPanel, draw_names, draw_seqs */
+
+
+/*
+ * Refit the multiple alignment display to the current rectangle of the
+ * main window, leaving the message line where it is.
+ */
+void resize_multi_window(void)
+{
+    RecT win_rect;
+
+    ObjectRect(mainw,&win_rect);
+    fit_seq_display(win_rect,FALSE);
+}
+
+/*
+ * Refit the two profile displays to the current rectangle of the main
+ * window, sharing the height in the proportion numseqs1 : numseqs2.
+ *
+ * The data font is selected and stdCharWidth / stdLineHeight are refreshed
+ * from it first.  Each count is limited to max_plines and raised to
+ * min_plines1 / min_plines2 before use.  The message line is not moved.
+ */
+void resize_prf_window(int numseqs1,int numseqs2)
+{
+    RecT win_rect;
+
+    SelectFont(datafont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+
+    /* the upper limit is tested first, as in the original if/else chain */
+    numseqs1 = (numseqs1>max_plines) ? max_plines
+             : (numseqs1<min_plines1) ? min_plines1 : numseqs1;
+    numseqs2 = (numseqs2>max_plines) ? max_plines
+             : (numseqs2<min_plines2) ? min_plines2 : numseqs2;
+
+    ObjectRect(mainw,&win_rect);
+    fit_prf_displays(win_rect,numseqs1,numseqs2,FALSE);
+}
+
+/*
+ * Lay out the multiple alignment display inside window rectangle `wr`.
+ *
+ * The names panel keeps its width and is placed at (loffset, toffset); the
+ * sequence panel starts at its right edge and extends to roffset from the
+ * right of the window.  Both get the height left after toffset and boffset.
+ * The visible line, sequence and column counts stored with each panel are
+ * recomputed from the new size.  When mv_message is TRUE the message line
+ * is moved so that it keeps its previous distance below the names panel.
+ * Finally the scrollbars are repositioned and reset.
+ */
+static void fit_seq_display(RecT wr,Boolean mv_message)
+{
+    int name_w,seq_w,panel_h;
+    int msg_gap,msg_h;
+    RecT nr,sr,mr;
+    panel_data pd;
+
+    /* measure the current layout before anything is moved */
+    ObjectRect(seq_panel.names,&nr);
+    ObjectRect(message,&mr);
+    msg_gap=mr.top-nr.bottom;
+    name_w=nr.right-nr.left;
+    panel_h=wr.bottom-wr.top-boffset-toffset;
+
+    /* names panel */
+    nr.top=toffset;
+    nr.left=loffset;
+    nr.bottom=nr.top+panel_h;
+    nr.right=nr.left+name_w;
+    SetPosition(seq_panel.names,&nr);
+
+    GetPanelExtra(seq_panel.names,&pd);
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(seq_panel.names,&pd);
+
+    /* sequence panel, to the right of the names */
+    sr.top=nr.top;
+    sr.left=nr.right;
+    sr.bottom=sr.top+panel_h;
+    sr.right=wr.right-wr.left-roffset;
+    seq_w=sr.right-sr.left;
+    SetPosition(seq_panel.seqs,&sr);
+
+    GetPanelExtra(seq_panel.seqs,&pd);
+    pd.vcols=seq_w/pd.charwidth - MARGIN*2;
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(seq_panel.seqs,&pd);
+
+    if (mv_message)
+    {
+        msg_h=mr.bottom-mr.top;
+        mr.top=nr.bottom+msg_gap;
+        mr.bottom=mr.top+msg_h;
+        SetPosition(message,&mr);
+    }
+
+    position_scrollbars(seq_panel);
+    correct_name_bars(TRUE);
+    correct_seq_bars(TRUE);
+}
+
+/*
+ * Lay out the two profile displays inside window rectangle `wr`.
+ *
+ * The height left after toffset, boffset and poffset is shared between
+ * profile 1 and profile 2 in the proportion numseqs1 : numseqs2, with
+ * profile 2 placed poffset below profile 1.  In each profile the names
+ * panel keeps the width of the first names panel and the sequence panel
+ * runs from its right edge to roffset from the right of the window.  The
+ * visible line, sequence and column counts stored with each panel are
+ * recomputed and the scrollbars repositioned.  When mv_message is TRUE the
+ * message line keeps its previous distance below the second names panel.
+ */
+static void fit_prf_displays(RecT wr,int numseqs1,int numseqs2,Boolean mv_message)
+{
+    int name_w,seq_w,panel_h;
+    int usable_h,total_seqs;
+    int msg_gap,msg_h;
+    RecT mr,nr,sr;
+    panel_data pd;
+
+    /* distance between the lower names panel and the message line */
+    ObjectRect(prf_panel[1].names,&nr);
+    ObjectRect(message,&mr);
+    msg_gap=mr.top-nr.bottom;
+
+    ObjectRect(prf_panel[0].names,&nr);
+    name_w=nr.right-nr.left;
+
+    usable_h=wr.bottom-wr.top-boffset-toffset-poffset;
+    total_seqs=numseqs1+numseqs2;
+
+    /* ---- first profile ---- */
+    nr.top=toffset;
+    nr.left=loffset;
+    panel_h=usable_h*numseqs1/total_seqs;
+    nr.bottom=nr.top+panel_h;
+    nr.right=nr.left+name_w;
+    SetPosition(prf_panel[0].names,&nr);
+
+    GetPanelExtra(prf_panel[0].names,&pd);
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(prf_panel[0].names,&pd);
+
+    sr.top=nr.top;
+    sr.left=nr.right;
+    sr.bottom=sr.top+panel_h;
+    sr.right=wr.right-wr.left-roffset;
+    seq_w=sr.right-sr.left;
+    SetPosition(prf_panel[0].seqs,&sr);
+
+    GetPanelExtra(prf_panel[0].seqs,&pd);
+    pd.vcols=seq_w/pd.charwidth - MARGIN*2;
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(prf_panel[0].seqs,&pd);
+    position_scrollbars(prf_panel[0]);
+
+    /* ---- second profile, poffset below the first ---- */
+    nr.top=nr.bottom+poffset;
+    panel_h=usable_h*numseqs2/total_seqs;
+    nr.bottom=nr.top+panel_h;
+    SetPosition(prf_panel[1].names,&nr);
+
+    GetPanelExtra(prf_panel[1].names,&pd);
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(prf_panel[1].names,&pd);
+
+    sr.top=nr.top;
+    sr.bottom=sr.top+panel_h;
+    SetPosition(prf_panel[1].seqs,&sr);
+
+    GetPanelExtra(prf_panel[1].seqs,&pd);
+    pd.vcols=seq_w/pd.charwidth - MARGIN*2;
+    pd.vlines=(panel_h-SCOREHEIGHT)/pd.lineheight - MARGIN;
+    pd.vseqs=pd.vlines-pd.nhead-pd.nfoot;
+    SetPanelExtra(prf_panel[1].seqs,&pd);
+    position_scrollbars(prf_panel[1]);
+
+    if (mv_message)
+    {
+        msg_h=mr.bottom-mr.top;
+        mr.top=nr.bottom+msg_gap;
+        mr.bottom=mr.top+msg_h;
+        SetPosition(message,&mr);
+    }
+
+    correct_name_bars(TRUE);
+    correct_seq_bars(TRUE);
+}
+
+/*
+ * Resize handler for window `w`.
+ *
+ * Does nothing until the window has been displayed.  In multiple alignment
+ * mode the single display is refitted; otherwise both profile displays are
+ * refitted, sharing the height according to their sequence counts (each
+ * raised to min_plines1 / min_plines2 and limited to max_plines).  The
+ * message line is moved along.  If the window is too small for the fixed
+ * offsets, the displays and the message line are hidden instead.
+ */
+void ResizeWindowProc(WindoW w)
+{
+    int rows1,rows2;
+    int win_height;
+    RecT wr;
+    panel_data data;
+
+    if(window_displayed==FALSE) return;
+
+    ObjectRect(w,&wr);
+    win_height=wr.bottom-wr.top;
+
+    if (aln_mode==MULTIPLEM)
+    {
+/* if the window is too small, hide everything */
+        if(win_height < toffset+boffset)
+        {
+            Hide(seq_display);
+            Hide(message);
+            return;
+        }
+        fit_seq_display(wr,TRUE);
+        Show(seq_display);
+        Show(message);
+        return;
+    }
+
+/* profile mode: if the window is too small, hide everything */
+    if(win_height < toffset+boffset+2*poffset)
+    {
+        Hide(prf1_display);
+        Hide(prf2_display);
+        Hide(message);
+        return;
+    }
+
+    /* the lower limit is tested first, as in the original if/else chain */
+    GetPanelExtra(prf_panel[0].names,&data);
+    rows1=data.nseqs;
+    if(rows1<min_plines1)
+        rows1=min_plines1;
+    else if(rows1>max_plines)
+        rows1=max_plines;
+
+    GetPanelExtra(prf_panel[1].names,&data);
+    rows2=data.nseqs;
+    if(rows2<min_plines2)
+        rows2=min_plines2;
+    else if(rows2>max_plines)
+        rows2=max_plines;
+
+    fit_prf_displays(wr,rows1,rows2,TRUE);
+    Show(prf1_display);
+    Show(prf2_display);
+    Show(message);
+}
+
+/*
+ * Move the scrollbars of panel pair `p` to follow the panels.
+ *
+ * The horizontal bar of the names panel and the horizontal bar of the
+ * sequence panel are each placed directly under their panel, spanning its
+ * width and keeping their own height.  The vertical bar of the sequence
+ * panel keeps its left and right edges and is stretched from the top to the
+ * bottom of the sequence panel.
+ */
+void position_scrollbars(spanel p)
+{
+    int bar_h;
+    RecT bar_rect,vbar_rect,names_rect,seqs_rect;
+    panel_data pd;
+
+    /* horizontal bar under the names panel */
+    ObjectRect(p.names,&names_rect);
+    GetPanelExtra(p.names,&pd);
+    ObjectRect(pd.hscrollbar,&bar_rect);
+    bar_h=bar_rect.bottom-bar_rect.top;
+    LoadRect(&bar_rect,names_rect.left,names_rect.bottom,names_rect.right,names_rect.bottom+bar_h);
+    SetPosition(pd.hscrollbar,&bar_rect);
+#ifdef WIN_MAC
+    AdjustPrnt(pd.hscrollbar,&bar_rect,FALSE);
+#endif
+
+    /* horizontal bar under the sequence panel */
+    ObjectRect(p.seqs,&seqs_rect);
+    GetPanelExtra(p.seqs,&pd);
+    ObjectRect(pd.hscrollbar,&bar_rect);
+    bar_h=bar_rect.bottom-bar_rect.top;
+    LoadRect(&bar_rect,seqs_rect.left,seqs_rect.bottom,seqs_rect.right,seqs_rect.bottom+bar_h);
+    SetPosition(pd.hscrollbar,&bar_rect);
+#ifdef WIN_MAC
+    AdjustPrnt(pd.hscrollbar,&bar_rect,FALSE);
+#endif
+
+    /* vertical bar of the sequence panel */
+    ObjectRect(pd.vscrollbar,&vbar_rect);
+    LoadRect(&vbar_rect,vbar_rect.left,seqs_rect.top,vbar_rect.right,seqs_rect.bottom);
+    SetPosition(pd.vscrollbar,&vbar_rect);
+#ifdef WIN_MAC
+    AdjustPrnt(pd.vscrollbar,&vbar_rect,FALSE);
+#endif
+}
+
+
+
+
+/*
+ * Rebuild the data attached to the names and sequence panels of `p` from
+ * the global alignment arrays, for sequences fs..ls.  The arrays names,
+ * seqlen_array and seq_array are read at index i+1 for sequence i.
+ * When reset is TRUE the first visible line and column of both panels are
+ * set to 0.  If at least one sequence was loaded the color parameters are
+ * (re)initialised and the color mask is rebuilt.
+ */
+void load_aln_data(spanel p,int fs,int ls,Boolean reset)
+{
+ int i,j,slength=0;
+ int nhead;
+ sint val;
+ panel_data name_data,seq_data;
+ /* show the busy cursor until ArrowCursor() at the end */
+ WatchCursor();
+ /* release the old panel contents and store the emptied records back */
+ GetPanelExtra(p.names,&name_data);
+ GetPanelExtra(p.seqs,&seq_data);
+ name_data=free_panel_data(name_data);
+ seq_data=free_panel_data(seq_data);
+ SetPanelExtra(p.names,&name_data);
+ SetPanelExtra(p.seqs,&seq_data);
+
+ name_data.nseqs=ls-fs+1;
+ seq_data.nseqs=name_data.nseqs;
+ name_data.firstseq=fs;
+ seq_data.firstseq=fs;
+
+/* find the maximum length of sequence */
+ for(i=fs;i<=ls;i++)
+ if (slength < seqlen_array[i+1]) slength = seqlen_array[i+1];
+ name_data.ncols=max_names;
+ seq_data.ncols=slength;
+
+ if (name_data.nseqs>0)
+ {
+ name_data=alloc_name_data(name_data);
+ seq_data=alloc_seq_data(seq_data);
+ /* copy each name and turn each coded sequence into display characters */
+
+ for(i=fs;i<=ls;i++)
+ {
+ strncpy(name_data.lines[i-fs],names[i+1],MAXNAMES);
+ name_data.lines[i-fs][MAXNAMES]='\0';
+ for(j=0;j<seqlen_array[i+1];j++)
+ {
+ val = seq_array[i+1][j+1];
+ if((val == -3) || (val == 253)) /* NOTE(review): looks like an end-of-sequence code (253 == -3 as an unsigned byte) - confirm */
+ break;
+ else if((val == gap_pos1) || (val == gap_pos2))
+ seq_data.lines[i-fs][j] = '-';
+ else {
+ seq_data.lines[i-fs][j] = amino_acid_codes[val];
+ }
+ } /* NOTE(review): after an early break the cells from j up to seqlen_array[i+1] are not written here */
+ for(j=seqlen_array[i+1];j<slength;j++) /* pad shorter sequences with blanks up to the longest one */
+ seq_data.lines[i-fs][j] = ' ';
+ seq_data.lines[i-fs][j]='\0';
+
+ name_data.selected[i-fs]=FALSE;
+
+ }
+ /* header line 0 is filled by make_consensus; header line 1 by
+    make_struct_data or, when that returns 0, by make_gp_data */
+
+ make_consensus(seq_data,name_data.header[0],seq_data.header[0]);
+ nhead=make_struct_data(seq_data.prf_no,slength,name_data.header[1],seq_data.header[1]);
+ if (nhead==0)
+ nhead=make_gp_data(seq_data.prf_no,slength,name_data.header[1],seq_data.header[1]);
+ seq_data.nhead=name_data.nhead=nhead+1;
+ /* a single footer line, filled by make_ruler */
+ seq_data.nfoot=name_data.nfoot=1;
+ seq_data.consensus=NULL;
+ make_ruler(slength,name_data.footer[0],seq_data.footer[0]);
+ make_colscores(seq_data);
+ }
+ else
+ {
+ seq_data.ncols=name_data.ncols=0;
+ }
+
+ if(reset==TRUE)
+ {
+ name_data.firstvline=0;
+ name_data.firstvcol=0;
+ seq_data.firstvline=0;
+ seq_data.firstvcol=0;
+ }
+ name_data.vseqs=name_data.vlines-name_data.nhead-name_data.nfoot; /* visible sequence lines = visible lines minus header and footer lines */
+ seq_data.vseqs=seq_data.vlines-seq_data.nhead-seq_data.nfoot;
+
+ if(seq_data.nseqs>0)
+ {
+/* try to find the user's color parameter file */
+ if (explicit_par_file == NULL)
+ {
+ if (par_file != NULL)
+ ckfree(par_file);
+ if(dnaflag)
+ par_file=find_file(def_dnapar_file);
+ else
+ par_file=find_file(def_protpar_file);
+ }
+ init_color_parameters(par_file);
+ make_colormask(seq_data);
+ }
+ /* store the rebuilt records back in the panels */
+ SetPanelExtra(p.names,&name_data);
+ SetPanelExtra(p.seqs,&seq_data);
+
+ ArrowCursor();
+}
+
+/*
+ * Reload sequences fs..ls into panel pair p, redraw both panels and then
+ * bring the name and sequence scrollbars in line with the new data.
+ * 'reset' is handed unchanged to the loader and to both scrollbar updates.
+ */
+void load_aln(spanel p,int fs,int ls,Boolean reset)
+{
+/* refresh the panel contents */
+    load_aln_data(p,fs,ls,reset);
+
+/* repaint names first, then sequences */
+    DrawPanel(p.names);
+    DrawPanel(p.seqs);
+
+/* finally resynchronise the scrollbars */
+    correct_name_bars(reset);
+    correct_seq_bars(reset);
+}
+
+/*
+ * Allocate the buffers of a names panel record and return the updated copy:
+ *   lines[]    - nseqs name strings of MAXNAMES+1 chars, pre-filled from
+ *                names[1..nseqs]
+ *   selected[] - nseqs+1 ints
+ *   header[] / footer[] - mheader / mfooter strings of MAXNAMES+1 chars
+ * colormask is set to NULL.
+ */
+static panel_data alloc_name_data(panel_data data)
+{
+    int k;
+
+    data.colormask=NULL;
+    data.lines=(char **)ckalloc((data.nseqs+1)*sizeof(char *));
+    data.selected=(int *)ckalloc((data.nseqs+1)*sizeof(int));
+
+/* one fixed-width, always-terminated copy of every name */
+    k=0;
+    while(k<data.nseqs)
+    {
+        char *dst=(char *)ckalloc((MAXNAMES+1)*sizeof(char));
+
+        data.lines[k]=dst;
+        strncpy(dst,names[k+1],MAXNAMES);
+        dst[MAXNAMES]='\0';
+        k++;
+    }
+
+/* header rows */
+    data.header=(char **)ckalloc((mheader+1)*sizeof(char *));
+    for(k=0;k<mheader;k++)
+    {
+        data.header[k]=(char *)ckalloc((MAXNAMES+1)*sizeof(char));
+    }
+
+/* footer rows */
+    data.footer=(char **)ckalloc((mfooter+1)*sizeof(char *));
+    for(k=0;k<mfooter;k++)
+    {
+        data.footer[k]=(char *)ckalloc((MAXNAMES+1)*sizeof(char));
+    }
+
+    return data;
+}
+
+/*
+ * Allocate the buffers of a sequence panel record and return the updated
+ * copy.  Row tables hold nseqs+1 pointers, each row holds ncols+1 elements:
+ *   lines, colormask, residue_exception, segment_exception - one row per sequence
+ *   selected, colscore - one entry per column (selected[] is cleared to FALSE)
+ *   header, footer - mheader / mfooter rows
+ * The column selection range (firstsel/lastsel) is reset to -1.
+ */
+static panel_data alloc_seq_data(panel_data data)
+{
+    int row,col;
+
+    data.firstsel=data.lastsel=-1;
+
+/* residue characters and their colour indices */
+    data.lines=(char **)ckalloc((data.nseqs+1)*sizeof(char *));
+    data.colormask=(char **)ckalloc((data.nseqs+1)*sizeof(char *));
+    row=0;
+    while(row<data.nseqs)
+    {
+        data.lines[row]=(char *)ckalloc((data.ncols+1)*sizeof(char));
+        data.colormask[row]=(char *)ckalloc((data.ncols+1)*sizeof(char));
+        row++;
+    }
+
+/* per-column selection flags, initially nothing selected */
+    data.selected=(int *)ckalloc((data.ncols+1)*sizeof(int));
+    col=0;
+    while(col<data.ncols)
+    {
+        data.selected[col]=FALSE;
+        col++;
+    }
+
+/* header rows */
+    data.header=(char **)ckalloc((mheader+1)*sizeof(char *));
+    for(row=0;row<mheader;row++)
+    {
+        data.header[row]=(char *)ckalloc((data.ncols+1)*sizeof(char));
+    }
+
+/* column scores and the two exception tables */
+    data.colscore=(sint *)ckalloc((data.ncols+1)*sizeof(sint));
+    data.residue_exception=(Boolean **)ckalloc((data.nseqs+1)*sizeof(Boolean *));
+    for(row=0;row<data.nseqs;row++)
+    {
+        data.residue_exception[row]=(Boolean *)ckalloc((data.ncols+1)*sizeof(Boolean));
+    }
+    data.segment_exception=(short **)ckalloc((data.nseqs+1)*sizeof(short *));
+    for(row=0;row<data.nseqs;row++)
+    {
+        data.segment_exception[row]=(short *)ckalloc((data.ncols+1)*sizeof(short));
+    }
+
+/* footer rows */
+    data.footer=(char **)ckalloc((mfooter+1)*sizeof(char *));
+    for(row=0;row<mfooter;row++)
+    {
+        data.footer[row]=(char *)ckalloc((data.ncols+1)*sizeof(char));
+    }
+
+    return data;
+}
+
+/*
+ * Bring the scrollbars of the name panel(s) in line with the panel data.
+ * In profile mode both profile name panels are updated, otherwise only the
+ * single sequence name panel.  When 'reset' is TRUE the first visible
+ * column and line are set back to 0 before the bars are corrected.
+ *
+ * Fix: the horizontal bar of profile 2 is now positioned from profile 2's
+ * own firstvcol; it used to be given profile 1's value.
+ */
+void correct_name_bars(Boolean reset)
+{
+    panel_data data,data1;
+
+    if(aln_mode==PROFILEM)
+    {
+        GetPanelExtra(prf_panel[0].names,&data);
+        GetPanelExtra(prf_panel[1].names,&data1);
+        if(reset==TRUE)
+        {
+            data.firstvcol=0;
+            data1.firstvcol=0;
+            data.firstvline=0;
+            data1.firstvline=0;
+        }
+/* each bar is driven by the scroll position of its own panel */
+        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
+        correct_scrollbar(data1.hscrollbar,data1.vcols,data1.ncols,data1.firstvcol,reset);
+        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
+        correct_scrollbar(data1.vscrollbar,data1.vseqs,data1.nseqs,data1.firstvline,reset);
+        SetPanelExtra(prf_panel[0].names,&data);
+        SetPanelExtra(prf_panel[1].names,&data1);
+    }
+    else
+    {
+        GetPanelExtra(seq_panel.names,&data);
+        if(reset==TRUE)
+        {
+            data.firstvcol=0;
+            data.firstvline=0;
+        }
+        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
+        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
+
+        SetPanelExtra(seq_panel.names,&data);
+    }
+
+}
+
+/*
+ * Bring the scrollbars of the sequence panel(s) in line with the panel data.
+ *
+ * Profile mode, fixed_prf_scroll TRUE: profile 1's horizontal bar is hidden
+ * and profile 2's bar covers the combined column range of both profiles;
+ * lockoffset records how far each panel lags behind the other.
+ * Profile mode, fixed_prf_scroll FALSE: both horizontal bars are shown and
+ * each follows its own panel.
+ * Otherwise only the single sequence panel is updated.
+ *
+ * When 'reset' is TRUE the first visible column/line are set back to 0.
+ *
+ * Fix: in free-scroll mode the horizontal bar of profile 2, and in both
+ * profile modes the vertical bar of profile 2, are now positioned from
+ * profile 2's own firstvcol/firstvline; they used to be given profile 1's.
+ */
+void correct_seq_bars(Boolean reset)
+{
+    int maxcols,m1,m2;
+    panel_data data,data1;
+
+    if(aln_mode==PROFILEM)
+    {
+        GetPanelExtra(prf_panel[0].seqs,&data);
+        GetPanelExtra(prf_panel[1].seqs,&data1);
+        if(fixed_prf_scroll==TRUE)
+        {
+            Hide(data.hscrollbar);
+/* m1/m2 are taken from the scroll positions as they were before any reset */
+            m1=MAX(data.firstvcol,data1.firstvcol);
+            m2=MAX(data.ncols-data.firstvcol,data1.ncols-data1.firstvcol);
+            maxcols=m1+m2;
+            if(reset==TRUE)
+            {
+                data.firstvcol=0;
+                data1.firstvcol=0;
+            }
+            data.lockoffset= -MAX(data1.firstvcol-data.firstvcol,0);
+            data1.lockoffset= -MAX(data.firstvcol-data1.firstvcol,0);
+            correct_scrollbar(data1.hscrollbar,data1.vcols,maxcols,m1,TRUE);
+        }
+        else
+        {
+            Show(data.hscrollbar);
+            if(reset==TRUE)
+            {
+                data.firstvcol=0;
+                data1.firstvcol=0;
+            }
+            data.lockoffset=0;
+            data1.lockoffset=0;
+            correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
+            correct_scrollbar(data1.hscrollbar,data1.vcols,data1.ncols,data1.firstvcol,reset);
+        }
+        if(reset==TRUE)
+        {
+            data.firstvline=0;
+            data1.firstvline=0;
+        }
+        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
+        correct_scrollbar(data1.vscrollbar,data1.vseqs,data1.nseqs,data1.firstvline,reset);
+        SetPanelExtra(prf_panel[0].seqs,&data);
+        SetPanelExtra(prf_panel[1].seqs,&data1);
+    }
+    else
+    {
+        GetPanelExtra(seq_panel.seqs,&data);
+        if(reset==TRUE)
+        {
+            data.firstvcol=0;
+            data.firstvline=0;
+        }
+        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
+        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
+
+        SetPanelExtra(seq_panel.seqs,&data);
+    }
+
+}
+
+/*
+ * Update one scrollbar; does nothing when b is NULL.
+ *   visible - number of items shown at once (used for both page arguments)
+ *   total   - number of items available
+ *   value   - position to show
+ *   reset   - when TRUE the bar value is first set to 0
+ * The bar maximum becomes total-visible when something is visible and the
+ * data does not fit, otherwise 0.
+ */
+static void correct_scrollbar(BaR b,int visible,int total,int value,Boolean reset)
+{
+    int upper;
+
+    if (b==NULL) return;
+
+    upper = (visible > 0 && total > visible) ? total-visible : 0;
+
+    if (reset==TRUE)
+    {
+        CorrectBarValue(b,0);
+    }
+    CorrectBarPage(b,visible,visible);
+    CorrectBarValue(b,value);
+    CorrectBarMax(b,upper);
+}
+
+
+/*
+ * Recompute the colour mask of the main sequence panel and redraw it,
+ * reporting progress through info().  Nothing happens when the panel
+ * holds no sequences.
+ */
+void color_seqs(void)
+{
+    panel_data seq_info;
+
+    GetPanelExtra(seq_panel.seqs,&seq_info);
+    if (seq_info.nseqs != 0)
+    {
+        info("Coloring sequences...");
+        make_colormask(seq_info);
+        DrawPanel(seq_panel.seqs);
+        info("Done.");
+    }
+}
+
+/*
+ * Recompute the colour mask of the profile 1 sequence panel and redraw it.
+ * Nothing happens when the panel holds no sequences.
+ */
+void color_prf1(void)
+{
+    panel_data prf_info;
+
+    GetPanelExtra(prf_panel[0].seqs,&prf_info);
+    if (prf_info.nseqs != 0)
+    {
+        make_colormask(prf_info);
+        info("Coloring profile 1...");
+        DrawPanel(prf_panel[0].seqs);
+        info("Done.");
+    }
+}
+
+/*
+ * Recompute the colour mask of the profile 2 sequence panel and redraw it.
+ * Nothing happens when the panel holds no sequences.
+ */
+void color_prf2(void)
+{
+    panel_data prf_info;
+
+    GetPanelExtra(prf_panel[1].seqs,&prf_info);
+    if (prf_info.nseqs != 0)
+    {
+        make_colormask(prf_info);
+        info("Coloring profile 2...");
+        DrawPanel(prf_panel[1].seqs);
+        info("Done.");
+    }
+}
+
+/*
+ * Delete every alignment column in which all sequences fseq..lseq hold a
+ * gap code (gap_pos1 or gap_pos2).
+ *
+ * Columns are 1-based in seq_array.  For each deleted column the residues
+ * to its right (including the element just past the end) are moved one
+ * place left and seqlen_array[] is decremented for every sequence.  The
+ * secondary-structure and gap-penalty masks of the given profile (prf_no
+ * 1 or 2) are shifted as well when they exist.
+ * Returns immediately unless fseq < lseq.
+ */
+void remove_gap_pos(int fseq, int lseq,int prf_no)
+{
+    int col,s,k;
+    int all_gaps;
+
+    if (fseq>=lseq) return;
+
+    col=1;
+    while (col<=seqlen_array[fseq])
+    {
+/* is this column a gap in every sequence of the range? */
+        all_gaps=1;
+        for (s=fseq;s<=lseq;s++)
+        {
+            if (seq_array[s][col]!=gap_pos1 && seq_array[s][col]!=gap_pos2)
+            {
+                all_gaps=0;
+                break;
+            }
+        }
+        if (!all_gaps)
+        {
+            col++;
+            continue;
+        }
+
+/* close the column; col is not advanced so the shifted-in column is tested next */
+        for (s=fseq;s<=lseq;s++)
+        {
+            for (k=col;k<=seqlen_array[s];k++)
+                seq_array[s][k]=seq_array[s][k+1];
+            seqlen_array[s]--;
+        }
+
+/* masks are shifted using the already shortened length of sequence fseq */
+        if (prf_no==1)
+        {
+            if (sec_struct_mask1 != NULL)
+                for (k=col;k<=seqlen_array[fseq];k++)
+                    sec_struct_mask1[k-1]=sec_struct_mask1[k];
+            if (gap_penalty_mask1 != NULL)
+                for (k=col;k<=seqlen_array[fseq];k++)
+                    gap_penalty_mask1[k-1]=gap_penalty_mask1[k];
+        }
+        else if (prf_no==2)
+        {
+            if (sec_struct_mask2 != NULL)
+                for (k=col;k<=seqlen_array[fseq];k++)
+                    sec_struct_mask2[k-1]=sec_struct_mask2[k];
+            if (gap_penalty_mask2 != NULL)
+                for (k=col;k<=seqlen_array[fseq];k++)
+                    gap_penalty_mask2[k-1]=gap_penalty_mask2[k];
+        }
+
+        if (seqlen_array[fseq]<=0) break;
+    }
+}
+
+/* width and height passed here are in pixels */
+
+/*
+ * Create one autonomous panel (type NAMES or anything else for sequences)
+ * inside group g and attach a freshly initialised panel_data record to it.
+ *
+ * width, height - panel size in pixels
+ * firstseq      - index of the first sequence, as used with names[] and
+ *                 seqlen_array[]; stored in the record as firstseq-1
+ * nseqs         - number of sequences
+ *
+ * ncols is set to the longest name (NAMES) or longest sequence (otherwise)
+ * in the range.  All buffer pointers and both scrollbars start as NULL.
+ */
+static PaneL make_panel(int type,GrouP g,int width,int height,int firstseq,int nseqs)
+{
+    panel_data data;
+    PaneL p;
+    int seq,len,widest=0;
+    int lastseq=firstseq+nseqs-1;
+
+/* font metrics of the data font */
+    data.type=type;
+    SelectFont(datafont);
+    data.lineheight=LineHeight();
+    data.charwidth=CharWidth('A');
+    data.ascent=Ascent();
+    data.descent=Descent();
+
+    if(type==NAMES)
+    {
+/* widest sequence name; the names panel also reserves DNUMBER columns */
+        for (seq=firstseq;seq<=lastseq;seq++)
+        {
+            len=strlen(names[seq]);
+            if (len>widest) widest=len;
+        }
+        data.vcols=width/data.charwidth - MARGIN*2 - DNUMBER;
+    }
+    else
+    {
+/* longest sequence */
+        for (seq=firstseq;seq<=lastseq;seq++)
+        {
+            if (seqlen_array[seq]>widest) widest=seqlen_array[seq];
+        }
+        data.vcols=width/data.charwidth - MARGIN*2;
+    }
+
+/* geometry and scroll state */
+    data.nhead=0;
+    data.nfoot=0;
+    data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
+    data.vseqs=data.vlines-data.nhead-data.nfoot;
+    data.nseqs=nseqs;
+    data.ncols=widest;
+    data.firstseq=firstseq-1;
+    data.firstvline=0;
+    data.firstvcol=0;
+    data.lockoffset=0;
+    data.firstsel=-1;
+    data.lastsel=-1;
+
+/* no buffers and no scrollbars yet */
+    data.lines=NULL;
+    data.header=NULL;
+    data.footer=NULL;
+    data.consensus=NULL;
+    data.colormask=NULL;
+    data.selected=NULL;
+    data.colscore=NULL;
+    data.seqweight=NULL;
+    data.subgroup=NULL;
+    data.residue_exception=NULL;
+    data.segment_exception=NULL;
+    data.vscrollbar=NULL;
+    data.hscrollbar=NULL;
+
+    p=AutonomousPanel(g, width, height, DrawPanel, NULL,NULL,sizeof(panel_data), NULL, NULL);
+    SetPanelExtra(p, &data);
+
+    return p;
+}
+
+/*
+ * Draw procedure of a names or sequence panel.
+ *
+ * When neither scroll flag (fromvscroll/fromhscroll) is set this is a full
+ * redraw: the visible line/column counts are recomputed from the panel's
+ * current pixel size, stored back in the panel data, and the frame is drawn
+ * and its interior erased.  An empty panel (nseqs == 0) stops there.
+ * Header, footer and column scores are drawn unless a vertical scroll is in
+ * progress; finally the name or sequence lines are drawn.
+ *
+ * Fix: the panel data is now fetched before the scroll-flag test.  It used
+ * to be fetched only inside the full-redraw branch, so 'data.type' below
+ * was read from an uninitialised struct whenever a scroll flag was set.
+ */
+void DrawPanel(PaneL p)
+{
+    RecT r;
+    panel_data data;
+    int pixelwidth,pixelheight;
+
+    UseWindow(mainw);
+    Select(p);
+    GetPanelExtra(p, &data);
+
+    if (fromvscroll==0 && fromhscroll==0)
+    {
+        ObjectRect(p,&r);
+        pixelwidth=r.right-r.left;
+        pixelheight=r.bottom-r.top;
+
+/* recompute how many columns and lines fit in the panel */
+        SelectFont(datafont);
+        data.lineheight=LineHeight();
+        data.charwidth=CharWidth('A');
+        if (data.type==NAMES)
+            data.vcols=pixelwidth/data.charwidth-MARGIN*2-DNUMBER;
+        else
+            data.vcols=pixelwidth/data.charwidth-MARGIN*2;
+        data.vlines=(pixelheight-SCOREHEIGHT)/data.lineheight - MARGIN;
+        data.vseqs=data.vlines-data.nhead-data.nfoot;
+        if(data.vseqs<0)data.vseqs=0;
+        if(data.vcols<0)data.vcols=0;
+        SetPanelExtra(p, &data);
+/* draw the outside frame */
+        ObjectRect (p, &r);
+        Black();
+        FrameRect(&r);
+        InsetRect(&r,1,1);
+        black_on_white();
+        EraseRect(&r);
+        if(data.nseqs == 0) return;
+    }
+
+/* draw the structure and gap penalty data */
+/* draw the footer */
+    if (fromvscroll==0)
+    {
+        draw_header(p);
+        draw_footer(p);
+        draw_colscores(p);
+    }
+
+/* draw the data lines */
+    if (data.type==NAMES)
+        draw_names(p);
+    else
+        draw_seqs(p);
+
+
+}
+
+/*
+ * Horizontal scroll action for the active names panel: store newval as the
+ * first visible column and redraw the names when at least one sequence line
+ * is visible.  'bar' and 'oldval' are not used.
+ */
+void hscrollnames(BaR bar, int newval, int oldval)
+{
+    PaneL target=active_panel.names;
+    panel_data info;
+
+    GetPanelExtra(target,&info);
+    info.firstvcol=newval;
+    SetPanelExtra(target,&info);
+    Select(target);
+
+    if (info.vseqs>0)
+    {
+        draw_names(target);
+    }
+}
+
+/*
+ * Vertical scroll action for the active names panel: store newval as the
+ * first visible line and redraw the names when at least one sequence line
+ * is visible.  'bar' and 'oldval' are not used.
+ */
+void vscrollnames(BaR bar, int newval, int oldval)
+{
+    PaneL target=active_panel.names;
+    panel_data info;
+
+    GetPanelExtra(target,&info);
+    info.firstvline=newval;
+    SetPanelExtra(target,&info);
+    Select(target);
+
+    if (info.vseqs>0)
+    {
+        draw_names(target);
+    }
+}
+
+/*
+ * Vertical scroll action for the active sequence panel.
+ *
+ * newval becomes the first visible line.  With fewer than 3 visible
+ * sequence lines, or when the previous position left fewer than vseqs
+ * sequences below it, the lines are simply redrawn.  Otherwise a move of
+ * exactly one line up or down scrolls the existing pixels and sets
+ * fromvscroll to +1 / -1 before draw_seqs() is called; any other move sets
+ * fromvscroll to 0.
+ */
+void vscrollseqs(BaR bar, int newval, int oldval)
+{
+    PaneL target=active_panel.seqs;
+    panel_data info;
+    RecT area,frame;
+    int prev_first;
+    int step;
+
+    GetPanelExtra(target,&info);
+    prev_first=info.firstvline;
+    info.firstvline=newval;
+    SetPanelExtra(target,&info);
+    Select(target);
+
+    if (info.vseqs<=0) return;
+
+    if (info.vseqs<3 || info.nseqs-prev_first < info.vseqs)
+    {
+        fromvscroll=0;
+        draw_seqs(target);
+        return;
+    }
+
+/* +1: moved one line down, -1: one line up, 0: anything else */
+    if (newval == oldval + 1) step=1;
+    else if (newval == oldval - 1) step=-1;
+    else step=0;
+
+    fromvscroll=step;
+    if (step != 0)
+    {
+/* shift the sequence-line area by one line height in the opposite direction */
+        ObjectRect(target,&frame);
+        InsetRect(&frame,1,1);
+        area.top=frame.top+(info.nhead)*info.lineheight+info.descent+1;
+        area.bottom=area.top+(info.vseqs)*info.lineheight;
+        area.left=frame.left;
+        area.right=frame.right;
+        if (step == 1)
+            ScrollRect(&area, 0, -info.lineheight);
+        else
+            ScrollRect(&area, 0, info.lineheight);
+    }
+    draw_seqs(target);
+}
+
+/*
+ * Horizontal scroll action for the active sequence panel.
+ *
+ * The first visible column becomes newval plus the panel's lockoffset.
+ * With at least 3 visible columns, a move of exactly one column right or
+ * left scrolls the existing pixels and sets fromhscroll to +1 / -1; in
+ * every other case fromhscroll is 0.  Header, sequences, footer and column
+ * scores are then redrawn in that order.
+ */
+void hscrollseqs(BaR bar, int newval, int oldval)
+{
+    PaneL target=active_panel.seqs;
+    panel_data info;
+    RecT area;
+
+    GetPanelExtra(target,&info);
+    info.firstvcol=newval+info.lockoffset;
+    SetPanelExtra(target,&info);
+    Select(target);
+
+    if (info.vcols<=0) return;
+
+    if (info.vcols<3)
+    {
+/* too narrow to be worth scrolling pixels */
+        fromhscroll=0;
+    }
+    else if (newval == oldval + 1)
+    {
+        fromhscroll=1;
+        ObjectRect(target,&area);
+        InsetRect(&area,1,1);
+        area.left+=info.charwidth;
+        ScrollRect(&area, -info.charwidth, 0);
+    }
+    else if (newval == oldval - 1)
+    {
+        fromhscroll=-1;
+        ObjectRect(target,&area);
+        InsetRect(&area,1,1);
+        area.right=area.left+(info.vcols+1)*info.charwidth;
+        ScrollRect(&area, info.charwidth, 0);
+    }
+    else
+    {
+        fromhscroll=0;
+    }
+
+    draw_header(target);
+    draw_seqs(target);
+    draw_footer(target);
+    draw_colscores(target);
+}
+
+/*
+ * Redraw name lines of a names panel.
+ * fromvscroll == 0  : all visible lines
+ * fromvscroll == -1 : only the first visible line
+ * otherwise         : only the last visible line
+ * Lines past the last sequence are skipped; selected names are drawn
+ * highlighted.  Does nothing when the panel has no line buffer.
+ */
+void draw_names(PaneL p)
+{
+    panel_data info;
+    int first,last,row;
+
+    UseWindow(mainw);
+    Select(p);
+    GetPanelExtra(p,&info);
+    if (info.lines==NULL) return;
+    SelectFont(datafont);
+
+    last=info.firstvline+info.vseqs-1;
+    switch (fromvscroll)
+    {
+        case 0:
+            first=info.firstvline;
+            break;
+        case -1:
+            first=last=info.firstvline;
+            break;
+        default:
+            first=last;
+            break;
+    }
+
+    if (last>=info.nseqs) last=info.nseqs-1;
+
+    for (row=first;row<=last;row++)
+    {
+        draw_nameline(p,row,row,(info.selected[row]==TRUE) ? HIGHLIGHT : NORMAL);
+    }
+}
+
+/*
+ * Redraw the sequence lines of a sequence panel, guided by the global
+ * scroll flags:
+ *   fromhscroll == -1 : only the first visible column is redrawn
+ *   fromhscroll ==  1 : only the last visible column is redrawn
+ *   fromvscroll == -1 : only the first visible line is redrawn
+ *   fromvscroll ==  1 : only the last visible line is redrawn
+ *   both 0            : every visible line is redrawn
+ * Both flags are cleared on the normal exit path.  The early return for an
+ * empty panel (nseqs == 0) leaves them untouched.
+ * Does nothing when the panel has no line buffer.
+ */
+void draw_seqs(PaneL p)
+{
+ int i,f,l,s,x,y,format;
+ int fs,ls;
+ panel_data data;
+ PoinT pt;
+ RecT r,block;
+
+ UseWindow(mainw);
+ Select(p);
+ GetPanelExtra(p,&data);
+ if(data.lines==NULL) return;
+ SelectFont(datafont);
+ black_on_white();
+/* one-column scroll: draw just the newly exposed column, highlighted if */
+/* it lies inside the current column selection */
+ if (fromhscroll==-1)
+ {
+ f=data.firstvcol;
+ if ((f>=data.firstsel) && (f<=data.lastsel))
+ format=HIGHLIGHT;
+ else format=NORMAL;
+ draw_seqcol(p,f,format);
+ }
+ else if (fromhscroll==1)
+ {
+ f=data.firstvcol+data.vcols-1;
+ if ((f>=data.firstsel) && (f<=data.lastsel))
+ format=HIGHLIGHT;
+ else format=NORMAL;
+ draw_seqcol(p,f,format);
+ }
+ else
+ {
+/* choose the range of lines f..l to redraw */
+ if (fromvscroll==-1)
+ {
+ f=l=data.firstvline;
+ }
+ else if (fromvscroll==1)
+ {
+ f=l=data.firstvline+data.vseqs-1;
+ }
+ else
+ {
+ f=data.firstvline;
+ l=data.firstvline+data.vseqs-1;
+ }
+
+ if(l>=data.nseqs) l=data.nseqs-1;
+/* s = offset of the first redrawn line inside the visible window */
+ s=f-data.firstvline;
+ ObjectRect (p, &r);
+ InsetRect(&r,1,1);
+ data_colors();
+/* clear the pixel rows occupied by lines f..l */
+ block.top=r.top+((s+data.nhead)*data.lineheight)+data.descent+1;
+ block.bottom=block.top+(l-f+1)*data.lineheight;
+ block.left=r.left;
+ block.right=r.right;
+ EraseRect(&block);
+ if(data.nseqs == 0) return;
+
+/* if either end of the selected column range is visible, erase that part */
+/* of the block again with the text colours */
+ if(data.firstsel != -1)
+ {
+ if ((data.firstsel>=data.firstvcol && data.firstsel<data.firstvcol+data.vcols)||
+ (data.lastsel>=data.firstvcol && data.lastsel<data.firstvcol+data.vcols))
+ {
+ fs=data.firstsel-data.firstvcol;
+ if (fs<0) fs=0;
+ if (fs>=data.vcols) fs=data.vcols-1;
+ ls=data.lastsel-data.firstvcol;
+ if (ls<0) ls=0;
+ if (ls>=data.vcols) ls=data.vcols-1;
+ block.left=r.left+(fs+1)*data.charwidth;
+ block.right=r.left+(ls+2)*data.charwidth;
+ text_colors();
+ EraseRect(&block);
+ }
+ }
+/* text starts one character width in from the left edge */
+ x=r.left+data.charwidth;
+
+ for(i=f;i<=l;i++)
+ {
+ y=block.top+(i-f+1)*data.lineheight-data.descent-1;
+ LoadPt(&pt,x,y);
+ draw_seqline(data,i,pt,data.firstvcol,data.firstvcol+data.vcols-1,NORMAL);
+ }
+ }
+
+ black_on_white();
+ fromvscroll=fromhscroll=0;
+}
+
+/*
+ * Mouse-click action for a names panel.
+ *
+ * If the panel belongs to profile 1 or 2, the name selection shown in the
+ * other profile's names panel is first drawn back to normal.  Unless the
+ * shift key is down, the existing selection in this panel is cleared too.
+ * The clicked line is converted to a sequence index (clamped to
+ * 0..nseqs-1), stored in selected_seqs and highlighted.  In an empty panel
+ * with ncutseqs > 0 a highlighted pointer bar is drawn at line 0 instead.
+ */
+static void NameClick(PaneL panel, PoinT pt)
+{
+    panel_data data;
+    RecT r;
+    int i;
+    int other;
+
+/* which profile panel, if any, has to lose its selection? */
+    GetPanelExtra(panel,&data);
+    other=-1;
+    if (data.prf_no==1) other=1;
+    else if (data.prf_no==2) other=0;
+
+    if (other>=0)
+    {
+/* revert selected area in the other profile to normal */
+        GetPanelExtra(prf_panel[other].names,&data);
+        if (data.nseqs==0)
+            draw_seq_pointer(prf_panel[other].names,0,NORMAL);
+        for (i=0;i<data.nseqs;i++)
+        {
+            if (data.selected[i]==TRUE)
+                draw_nameline(prf_panel[other].names,i,i,NORMAL);
+        }
+        SetPanelExtra(prf_panel[other].names,&data);
+    }
+
+    GetPanelExtra(panel,&data);
+    Select(panel);
+    ObjectRect(panel,&r);
+
+    if (!shftKey)
+    {
+/* revert existing selected area to normal */
+        for (i=0;i<data.nseqs;i++)
+        {
+            if (data.selected[i]==TRUE)
+                draw_nameline(panel,i,i,NORMAL);
+        }
+    }
+
+/* pixel row -> sequence index, clamped to the available sequences */
+    selected_seqs.first = (pt.y - r.top-data.lineheight/2)/data.lineheight + data.firstvline-data.nhead;
+    if (selected_seqs.first <0) selected_seqs.first=0;
+    if (selected_seqs.first >=data.nseqs) selected_seqs.first=data.nseqs-1;
+
+    if (selected_seqs.first==-1 && ncutseqs > 0)
+    {
+/* empty panel: show the pointer bar on line 0 */
+        selected_seqs.last=selected_seqs.first=0;
+        draw_seq_pointer(panel,0,HIGHLIGHT);
+    }
+    else
+    {
+        selected_seqs.last=selected_seqs.first;
+        draw_nameline(panel,selected_seqs.first,selected_seqs.last,HIGHLIGHT);
+    }
+    black_on_white();
+}
+
+/*
+ * Mouse-drag action for a names panel: extend or shrink the highlighted
+ * range between the anchor (selected_seqs.first) and the line now under the
+ * mouse, redrawing only the lines whose state changes, then record the new
+ * end of the range in selected_seqs.last.
+ */
+static void NameDrag(PaneL panel, PoinT pt)
+{
+    panel_data info;
+    RecT frame;
+    int cur;
+    int anchor,prev;
+
+    GetPanelExtra(panel,&info);
+    Select(panel);
+    ObjectRect(panel,&frame);
+
+/* pixel row -> sequence index, clamped to the available sequences */
+    cur = (pt.y - frame.top-info.lineheight/2)/info.lineheight + info.firstvline-info.nhead;
+    if (cur<0) cur=0;
+    if (cur>=info.nseqs) cur=info.nseqs-1;
+
+    anchor=selected_seqs.first;
+    prev=selected_seqs.last;
+
+    if (cur!=prev)
+    {
+        if (cur==anchor)
+        {
+/* back on the anchor: collapse the range to that single line */
+            draw_nameline(panel,anchor,prev,NORMAL);
+            draw_nameline(panel,anchor,cur,HIGHLIGHT);
+        }
+        else if (cur>anchor)
+        {
+            if (cur>prev)
+                draw_nameline(panel,prev+1,cur,HIGHLIGHT);
+            else
+                draw_nameline(panel,cur+1,prev,NORMAL);
+        }
+        else
+        {
+            if (cur<prev)
+                draw_nameline(panel,cur,prev-1,HIGHLIGHT);
+            else
+                draw_nameline(panel,prev,cur-1,NORMAL);
+        }
+    }
+    selected_seqs.last=cur;
+
+    black_on_white();
+}
+
+/*
+ * Mouse-release action for a names panel: put selected_seqs into ascending
+ * order and make this panel, together with the panel stored in its 'index'
+ * field, the active pair.
+ */
+static void NameRelease(PaneL panel, PoinT pt)
+{
+    panel_data info;
+
+    if (selected_seqs.first > selected_seqs.last)
+    {
+        int hold=selected_seqs.first;
+
+        selected_seqs.first=selected_seqs.last;
+        selected_seqs.last=hold;
+    }
+
+    active_panel.names=panel;
+    GetPanelExtra(panel,&info);
+    active_panel.seqs=info.index;
+}
+
+/*
+ * Paint a full-width bar over display line 'seq' (counted below the header
+ * lines) of the panel: black for HIGHLIGHT, white for any other format.
+ */
+void draw_seq_pointer(PaneL panel,int seq,int format)
+{
+    panel_data info;
+    RecT frame,bar;
+
+    Select(panel);
+    GetPanelExtra(panel,&info);
+
+    ObjectRect(panel,&frame);
+    InsetRect(&frame,1,1);
+
+    bar.left=frame.left;
+    bar.right=frame.right;
+    bar.top=frame.top+((seq+info.nhead)*info.lineheight)+info.descent+1;
+    bar.bottom=bar.top+info.lineheight;
+
+    if (format!=HIGHLIGHT)
+        White();
+    else
+        Black();
+    PaintRect(&bar);
+}
+
+/*
+ * Mouse-click action for a sequence panel.
+ *
+ * If the panel belongs to profile 1 or 2, the column selection of the other
+ * profile's sequence panel is cleared and drawn back to normal.  The
+ * clicked pixel is converted to a column, clamped to the data and to the
+ * visible window.  With shift held and a selection present the selection is
+ * extended or shrunk to that column; otherwise the old selection is cleared
+ * and the clicked column becomes a new one-column selection.  The result is
+ * stored in the panel data and in selected_res.
+ */
+static void SeqClick(PaneL panel, PoinT pt)
+{
+    panel_data data;
+    RecT r;
+    int s;
+    int f,l;
+    int other;
+
+    GetPanelExtra(panel,&data);
+    other=-1;
+    if (data.prf_no==1) other=1;
+    else if (data.prf_no==2) other=0;
+
+    if (other>=0)
+    {
+/* revert selected area in the other profile to normal */
+        GetPanelExtra(prf_panel[other].seqs,&data);
+        f=data.firstsel;
+        l=data.lastsel;
+        data.firstsel=-1;
+        data.lastsel=-1;
+        SetPanelExtra(prf_panel[other].seqs,&data);
+        if (f != -1) highlight_seqrange(prf_panel[other].seqs,f,l,NORMAL);
+    }
+
+    GetPanelExtra(panel,&data);
+    Select(panel);
+    ObjectRect(panel,&r);
+
+/* pixel column -> residue column, clamped to the data and the window */
+    s = (pt.x - r.left-data.charwidth)/data.charwidth + data.firstvcol;
+    if (s<0) s=0;
+    if (s<data.firstvcol) s=data.firstvcol;
+    if (s>=data.ncols) s=data.ncols-1;
+    if (s>=data.firstvcol+data.vcols) s=data.firstvcol+data.vcols-1;
+
+    if (!(shftKey && data.firstsel != -1))
+    {
+/* revert existing selected area to normal, then select the single column */
+        f=data.firstsel;
+        l=data.lastsel;
+        data.firstsel=-1;
+        data.lastsel=-1;
+        SetPanelExtra(panel,&data);
+        if (f != -1) highlight_seqrange(panel,f,l,NORMAL);
+        selected_res.first=selected_res.last=s;
+        highlight_seqrange(panel,selected_res.first,selected_res.last,HIGHLIGHT);
+        data.firstsel=selected_res.first;
+        data.lastsel=selected_res.last;
+    }
+    else
+    {
+        if (s>data.lastsel)
+        {
+/* grow to the right */
+            highlight_seqrange(panel,data.firstsel,s,HIGHLIGHT);
+            data.lastsel=s;
+        }
+        else if (s<data.firstsel)
+        {
+/* grow to the left */
+            highlight_seqrange(panel,s,data.lastsel,HIGHLIGHT);
+            data.firstsel=s;
+        }
+        else
+        {
+/* click inside the selection: it now ends at the clicked column */
+            highlight_seqrange(panel,s+1,data.lastsel,NORMAL);
+            highlight_seqrange(panel,data.firstsel,s,HIGHLIGHT);
+            data.lastsel=s;
+        }
+        selected_res.first=data.firstsel;
+        selected_res.last=data.lastsel;
+    }
+
+    SetPanelExtra(panel,&data);
+    black_on_white();
+}
+
+/*
+ * Mouse-drag action for a sequence panel: extend or shrink the highlighted
+ * column range between the anchor (selected_res.first) and the column now
+ * under the mouse, redrawing only the columns whose state changes, then
+ * record the new end of the range in selected_res.last.
+ */
+static void SeqDrag(PaneL panel, PoinT pt)
+{
+    panel_data info;
+    RecT frame;
+    int cur;
+    int anchor,prev;
+
+    GetPanelExtra(panel,&info);
+    Select(panel);
+    ObjectRect(panel,&frame);
+
+/* pixel column -> residue column, clamped to the data and the window */
+    cur = (pt.x - frame.left-info.charwidth)/info.charwidth + info.firstvcol;
+    if (cur<0) cur=0;
+    if (cur<info.firstvcol) cur=info.firstvcol;
+    if (cur>=info.ncols) cur=info.ncols-1;
+    if (cur>=info.firstvcol+info.vcols) cur=info.firstvcol+info.vcols-1;
+
+    anchor=selected_res.first;
+    prev=selected_res.last;
+
+    if (cur!=prev)
+    {
+        if (cur==anchor)
+        {
+/* back on the anchor: collapse the range to that single column */
+            highlight_seqrange(panel,anchor,prev,NORMAL);
+            highlight_seqrange(panel,anchor,cur,HIGHLIGHT);
+        }
+        else if (cur>anchor)
+        {
+            if (cur>prev)
+                highlight_seqrange(panel,prev+1,cur,HIGHLIGHT);
+            else
+                highlight_seqrange(panel,cur+1,prev,NORMAL);
+        }
+        else
+        {
+            if (cur<prev)
+                highlight_seqrange(panel,cur,prev-1,HIGHLIGHT);
+            else
+                highlight_seqrange(panel,prev,cur-1,NORMAL);
+        }
+    }
+    selected_res.last=cur;
+
+    black_on_white();
+}
+
+/*
+ * Mouse-release action for a sequence panel: put selected_res into
+ * ascending order, make this panel and the panel stored in its 'index'
+ * field the active pair, and store the selected column range in the
+ * panel data.
+ */
+static void SeqRelease(PaneL panel, PoinT pt)
+{
+    panel_data info;
+
+    if (selected_res.first > selected_res.last)
+    {
+        int hold=selected_res.first;
+
+        selected_res.first=selected_res.last;
+        selected_res.last=hold;
+    }
+
+    active_panel.seqs=panel;
+
+    GetPanelExtra(panel,&info);
+    active_panel.names=info.index;
+    info.firstsel=selected_res.first;
+    info.lastsel=selected_res.last;
+    SetPanelExtra(panel,&info);
+}
+
+/*
+ * Draw the header lines at the top of a panel.
+ * Nothing is drawn for an empty panel, a panel without header buffers, a
+ * panel with fewer visible lines than header lines, or no visible columns.
+ * Only the visible column window is painted; columns with a negative index
+ * are shown as blanks.
+ */
+void draw_header(PaneL p)
+{
+    panel_data info;
+    RecT frame,area;
+    PoinT pen;
+    char *text;
+    int row,k,src,x,y;
+
+    UseWindow(mainw);
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p,&info);
+    if (info.nseqs == 0 || info.header == NULL) return;
+    if (info.vlines<info.nhead) return;
+    if (info.vcols<=0) return;
+
+    text=(char *)ckalloc((info.vcols+1) * sizeof(char));
+
+/* clear the header area with the text colours */
+    ObjectRect(p,&frame);
+    InsetRect(&frame,1,1);
+    area.left=frame.left;
+    area.right=frame.right;
+    area.top=frame.top+info.descent/2;
+    area.bottom=area.top+(info.nhead*info.lineheight);
+    text_colors();
+    EraseRect(&area);
+
+/* names panels leave room for the sequence numbers */
+    x=(info.type==NAMES) ? frame.left+DNUMBER*info.charwidth : frame.left+info.charwidth;
+    y=frame.top+info.lineheight-info.descent/2;
+
+    for (row=0;row<info.nhead;row++)
+    {
+/* copy the visible window of this header line */
+        for (k=0;k<info.vcols && k+info.firstvcol<info.ncols;k++)
+        {
+            src=k+info.firstvcol;
+            text[k]=(src>=0) ? info.header[row][src] : ' ';
+        }
+        text[k]='\0';
+
+        LoadPt(&pen,x,y);
+        SetPen(pen);
+        PaintString(text);
+        y+=info.lineheight;
+    }
+
+    black_on_white();
+    ckfree(text);
+}
+
+/*
+ * Draw the footer lines below the sequence lines of a panel.
+ * Nothing is drawn for an empty panel, a panel without footer buffers, a
+ * panel with fewer visible lines than footer lines, or no visible columns.
+ * Only the visible column window is painted; columns with a negative index
+ * are shown as blanks.
+ */
+void draw_footer(PaneL p)
+{
+    panel_data info;
+    RecT frame,area;
+    PoinT pen;
+    char *text;
+    int row,k,src,x,y;
+
+    UseWindow(mainw);
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p,&info);
+    if (info.nseqs == 0 || info.footer == NULL) return;
+    if (info.vlines<info.nfoot) return;
+    if (info.vcols<=0) return;
+
+    text=(char *)ckalloc((info.vcols+1) * sizeof(char));
+
+/* clear the footer area with the text colours */
+    ObjectRect(p,&frame);
+    InsetRect(&frame,1,1);
+    area.left=frame.left;
+    area.right=frame.right;
+    area.top=frame.top+((info.vlines-info.nfoot)*info.lineheight)+info.descent+info.ascent/2;
+    area.bottom=area.top+info.nfoot*info.lineheight;
+    text_colors();
+    EraseRect(&area);
+
+/* names panels leave room for the sequence numbers */
+    x=(info.type==NAMES) ? area.left+DNUMBER*info.charwidth : area.left+info.charwidth;
+    y=area.top+info.lineheight-1;
+
+    for (row=0;row<info.nfoot;row++)
+    {
+/* copy the visible window of this footer line */
+        for (k=0;k<info.vcols && k+info.firstvcol<info.ncols;k++)
+        {
+            src=k+info.firstvcol;
+            text[k]=(src>=0) ? info.footer[row][src] : ' ';
+        }
+        text[k]='\0';
+
+        LoadPt(&pen,x,y);
+        SetPen(pen);
+        PaintString(text);
+        y+=info.lineheight;
+    }
+
+    black_on_white();
+    ckfree(text);
+}
+
+
+/*
+ * Mark sequences fseq..lseq of a names panel as selected (format ==
+ * HIGHLIGHT) or unselected (any other format) and repaint the part of that
+ * range that falls inside the visible window.  fseq and lseq may be given
+ * in either order.  For each repainted line the sequence number is drawn
+ * in gray, right-aligned to the digit count of nseqs, followed by the name
+ * (white on black when highlighted).  Does nothing for an empty panel.
+ *
+ * Fix: the scratch buffer used to hold vcols+1 chars, but the sequence
+ * number is printed into it with a field width of n digits.  A panel
+ * narrower than the number width (vcols < n, e.g. vcols clamped to 0)
+ * overflowed the buffer.  It is now sized for the larger of the two.
+ */
+void draw_nameline(PaneL p,int fseq,int lseq,int format)
+{
+    RecT block,r;
+    PoinT pt;
+    int n,i, j, t, f,l,x, y,ix;
+    int bufchars;
+    panel_data data;
+    char *line;
+
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p, &data);
+    if(data.nseqs == 0) return;
+
+/* n = number of decimal digits needed for the largest sequence number */
+    n=1;
+    for(i=data.nseqs/10;i!=0;i/=10)
+        n++;
+
+/* the buffer holds either a number (n chars) or a name window (vcols chars) */
+    bufchars=(data.vcols>n) ? data.vcols : n;
+    line=(char *)ckalloc((bufchars+1) * sizeof(char));
+    if (fseq > lseq)
+    {
+        t=fseq;
+        fseq=lseq;
+        lseq=t;
+    }
+
+/* record the new selection state for the whole range, visible or not */
+    if (format==HIGHLIGHT)
+        for(i=fseq;i<=lseq;i++) data.selected[i]=TRUE;
+    else
+        for(i=fseq;i<=lseq;i++) data.selected[i]=FALSE;
+    SetPanelExtra(p,&data);
+
+/* clip the range to the visible lines */
+    if (fseq<data.firstvline)
+        fseq=data.firstvline;
+    if (fseq>=data.firstvline+data.vseqs)
+        fseq=data.firstvline+data.vseqs;
+    if (lseq<data.firstvline)
+        lseq=data.firstvline;
+    if (lseq>=data.firstvline+data.vseqs)
+        lseq=data.firstvline+data.vseqs-1;
+    f=fseq-data.firstvline;
+    l=lseq-data.firstvline;
+
+/* clear the background of the affected lines */
+    ObjectRect (p, &r);
+    InsetRect(&r,1,1);
+    block.top=r.top+((f+data.nhead)*data.lineheight)+data.descent+1;
+    block.bottom=block.top+((l-f+1)*data.lineheight);
+    block.left=r.left;
+    block.right=r.right;
+    if (format==HIGHLIGHT)
+        white_on_black();
+    else
+        data_colors();
+    EraseRect(&block);
+
+/* first pass: sequence numbers in gray */
+    y=block.top+data.lineheight-data.descent-1;
+    for(i=fseq;i<=lseq;i++)
+    {
+        x=r.left+data.charwidth;
+        sprintf(line,"%*d",n,i+1);
+        LoadPt(&pt, x, y);
+        SetPen(pt);
+        Gray();
+        PaintString(line);
+        y+=data.lineheight;
+    }
+
+/* second pass: the visible window of each name, looked up via output_index */
+    y=block.top+data.lineheight-data.descent-1;
+    for(i=fseq;i<=lseq;i++)
+    {
+        ix=output_index[i+1]-1;
+        x=r.left+DNUMBER*data.charwidth;
+        for(j=0;j<data.vcols && j<data.ncols-data.firstvcol;j++)
+            line[j]=data.lines[ix][j+data.firstvcol];
+        line[j]='\0';
+        LoadPt(&pt, x, y);
+        SetPen(pt);
+        if(format==HIGHLIGHT) White();
+        else Black();
+        PaintString(line);
+        y+=data.lineheight;
+    }
+    black_on_white();
+    ckfree(line);
+}
+
+/*
+ * Paint columns fcol..lcol of display line 'seq', starting at pen position
+ * pt and advancing one character width per column.  Columns with a negative
+ * index are skipped (the pen still advances); drawing stops at ncols.
+ *
+ * Per residue:
+ *  - segment exception (when segment_exceptions is on): dark gray cell, white char
+ *  - residue exception (when residue_exceptions is on): mid gray cell, white char
+ *  - inverted mode: black char; on a colour-filled cell unless the column
+ *    is highlighted or inside the current selection
+ *  - otherwise: char drawn in its colour-mask colour
+ *
+ * Change: the original allocated, blank-filled and freed one vcols+1 char
+ * buffer per colour on every call without ever reading them (and wrote
+ * outside the allocation if vcols was negative).  They are removed.
+ */
+void draw_seqline(panel_data data,int seq,PoinT pt,int fcol,int lcol,int format)
+{
+    RecT r;
+    int j, ix;
+
+    if(data.nseqs == 0) return;
+
+/* the row to display is looked up through output_index */
+    ix=output_index[seq+1]-1;
+
+/* vertical extent of one character cell on this line */
+    r.top=pt.y-data.lineheight+data.descent+1;
+    r.bottom=r.top+data.lineheight;
+    for(j=fcol;j<=lcol && j<data.ncols;j++)
+    {
+        if(j>=0)
+        {
+            if(segment_exceptions && data.segment_exception[ix][j] > 0)
+            {
+                r.left=pt.x;
+                r.right=r.left+data.charwidth;
+                DkGray();
+                PaintRect(&r);
+                White();
+            }
+            else if(residue_exceptions && data.residue_exception[ix][j] == TRUE)
+            {
+                r.left=pt.x;
+                r.right=r.left+data.charwidth;
+                /* LtGray(); */
+                SelectColor(150,150,150);
+                PaintRect(&r);
+                White();
+            }
+            else
+            {
+                if(inverted)
+                {
+                    if(format==HIGHLIGHT || (j>=data.firstsel && j<=data.lastsel))
+                        Black();
+                    else
+                    {
+                        r.left=pt.x;
+#ifdef UNIX
+                        r.right=r.left+data.charwidth-1;
+#else
+                        r.right=r.left+data.charwidth;
+#endif
+                        SetColor(color_lut[(int)data.colormask[ix][j]].val);
+                        PaintRect(&r);
+                        Black();
+                    }
+                }
+                else
+                    SetColor(color_lut[(int)data.colormask[ix][j]].val);
+
+            }
+            SetPen(pt);
+            PaintChar(data.lines[ix][j]);
+        }
+        pt.x+=data.charwidth;
+    }
+    Black();
+}
+
+/*
+ * Repaint one column of a sequence panel for all visible sequence lines.
+ * 'col' is clamped to the visible column window.  The column strip is
+ * erased (text colours for HIGHLIGHT, data colours otherwise) and each
+ * residue is drawn with the same exception / inverted / colour-mask rules
+ * as draw_seqline().
+ *
+ * Fixes:
+ *  - In both exception branches the cell rectangle's bottom was computed
+ *    from the panel rectangle (r.top) instead of the cell's own top
+ *    (r2.top), so the gray cell was only right on the first line.
+ *  - After clamping, 'col' can still lie outside the data (negative when
+ *    the panel is offset by lockoffset, or >= ncols).  Such a column is now
+ *    erased and left blank instead of indexing outside the line buffers,
+ *    matching how draw_seqline() skips those columns.
+ *  - A SetPanelExtra() call that wrote back the unmodified data was dropped.
+ */
+void draw_seqcol(PaneL p,int col,int format)
+{
+    RecT block,r, r2;
+    PoinT pt;
+    int totseqs,i, c,x,y,ix;
+    panel_data data;
+
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p, &data);
+    if(data.nseqs == 0) return;
+    if(data.ncols == 0) return;
+
+/* keep the column inside the visible window */
+    if (col<data.firstvcol)
+        col=data.firstvcol;
+    if (col>=data.firstvcol+data.vcols)
+        col=data.firstvcol+data.vcols-1;
+    c=col-data.firstvcol;
+    totseqs=data.vseqs;
+    if (totseqs>data.nseqs) totseqs=data.nseqs;
+
+/* erase the strip occupied by this column */
+    ObjectRect (p, &r);
+    InsetRect(&r,1,1);
+    block.top=r.top+(data.nhead*data.lineheight)+data.descent+1;
+    block.bottom=block.top+(totseqs)*data.lineheight;
+    block.left=r.left+(c+1)*data.charwidth;
+    block.right=block.left+data.charwidth;
+    if (format==HIGHLIGHT)
+        text_colors();
+    else
+        data_colors();
+    EraseRect(&block);
+
+/* a column outside the data has nothing to draw */
+    if (col<0 || col>=data.ncols)
+    {
+        Black();
+        return;
+    }
+
+    x=r.left+(c+1)*data.charwidth;
+    y=block.top+data.lineheight-data.descent-1;
+    r2.left=x;
+    r2.right=r2.left+data.charwidth;
+    for(i=data.firstvline;i<data.firstvline+data.vseqs && i<data.nseqs;i++)
+    {
+        ix=output_index[i+1]-1;
+/* cell rectangle of this line */
+        r2.top=y-data.lineheight+data.descent+1;
+        r2.bottom=r2.top+data.lineheight;
+        if(segment_exceptions && data.segment_exception[ix][col] > 0)
+        {
+            DkGray();
+            PaintRect(&r2);
+            White();
+        }
+        else if(residue_exceptions && data.residue_exception[ix][col] == TRUE)
+        {
+            /* LtGray(); */
+            SelectColor(150,150,150);
+            PaintRect(&r2);
+            White();
+        }
+        else
+        {
+            if(inverted)
+            {
+                if(format==HIGHLIGHT)
+                {
+                    LtGray();
+                }
+                else
+                    SetColor(color_lut[(int)data.colormask[ix][col]].val);
+                PaintRect(&r2);
+                Black();
+            }
+            else
+                SetColor(color_lut[(int)data.colormask[ix][col]].val);
+
+        }
+        LoadPt(&pt,x,y);
+        SetPen(pt);
+        PaintChar(data.lines[ix][col]);
+        y+=data.lineheight;
+    }
+    Black();
+}
+
+/*
+ * Repaint columns fcol..lcol (given in either order) of every visible
+ * sequence line, highlighted (HIGHLIGHT) or normal.  When at least one end
+ * of the range is visible, both ends are clamped to the visible column
+ * window; otherwise the range is used as given.  Does nothing for a panel
+ * without sequences or columns.
+ */
+void highlight_seqrange(PaneL p,int fcol,int lcol, int format)
+{
+    panel_data info;
+    RecT frame,area;
+    PoinT pen;
+    int row,first_row,last_row;
+    int left_vis,right_vis;
+    int x,y,hold;
+
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p,&info);
+    if (info.nseqs == 0 || info.ncols == 0) return;
+
+    if (fcol > lcol)
+    {
+        hold=fcol;
+        fcol=lcol;
+        lcol=hold;
+    }
+
+/* first and last visible column */
+    left_vis=info.firstvcol;
+    right_vis=info.firstvcol+info.vcols-1;
+    if ((fcol>=left_vis && fcol<=right_vis) || (lcol>=left_vis && lcol<=right_vis))
+    {
+        if (fcol<left_vis) fcol=left_vis;
+        if (fcol>right_vis) fcol=right_vis;
+        if (lcol<left_vis) lcol=left_vis;
+        if (lcol>right_vis) lcol=right_vis;
+    }
+
+/* visible sequence lines */
+    first_row=info.firstvline;
+    last_row=info.firstvline+info.vseqs-1;
+    if (last_row>=info.nseqs) last_row=info.nseqs-1;
+
+    ObjectRect(p,&frame);
+    InsetRect(&frame,1,1);
+    if (format==HIGHLIGHT)
+        text_colors();
+    else
+        data_colors();
+
+/* clear the rectangle covering the column range on all visible lines */
+    area.top=frame.top+(info.nhead*info.lineheight)+info.descent+1;
+    area.bottom=area.top+(last_row-first_row+1)*info.lineheight;
+    area.left=frame.left+(fcol-info.firstvcol+1)*info.charwidth;
+    area.right=frame.left+(lcol-info.firstvcol+2)*info.charwidth;
+    EraseRect(&area);
+
+/* redraw the residues line by line */
+    x=area.left;
+    y=area.top+info.lineheight-info.descent-1;
+    for (row=first_row;row<=last_row;row++)
+    {
+        LoadPt(&pen,x,y);
+        draw_seqline(info,row,pen,fcol,lcol,format);
+        y+=info.lineheight;
+    }
+    black_on_white();
+}
+
+/*
+ * Build one scrollable alignment display inside group w: a vertical
+ * scrollbar, a names panel, a sequence panel and a horizontal scrollbar
+ * under each panel.
+ *
+ * prf_no         - 0 selects the multiple-alignment scroll callbacks,
+ *                  1 the profile 1 callbacks, anything else profile 2
+ * nwidth, swidth - pixel widths of the names and sequence panels (the names
+ *                  panel is made 5*max_names pixels wider than nwidth)
+ * height         - pixel height of both panels
+ * firstseq,nseqs - passed on to make_panel()
+ * p              - receives the two new panels
+ *
+ * Returns the group holding the display; it is hidden while being built
+ * and shown before returning.
+ */
+GrouP make_scroll_area(GrouP w,int prf_no,int nwidth,int swidth,int height,int firstseq,int nseqs,spanel *p)
+{
+ panel_data ndata,sdata;
+ GrouP display;
+ RecT rect;
+ PoinT pt;
+ PaneL names,seqs;
+ BaR vscrollbar,hnscrollbar,hsscrollbar;
+ BarScrlProc hscrollnameproc, hscrollseqproc, vscrollproc;
+
+/* pick the scroll callbacks that belong to this display */
+ if(prf_no==0)
+ {
+ hscrollnameproc=HscrollMultiN;
+ hscrollseqproc=HscrollMultiS;
+ vscrollproc=VscrollMulti;
+ }
+ else if (prf_no==1)
+ {
+ hscrollnameproc=HscrollPrf1N;
+ hscrollseqproc=HscrollPrf1S;
+ vscrollproc=VscrollPrf1;
+ }
+ else
+ {
+ hscrollnameproc=HscrollPrf2N;
+ hscrollseqproc=HscrollPrf2S;
+ vscrollproc=VscrollPrf2;
+ }
+
+ display=HiddenGroup(w, 0, 0, NULL);
+ SetGroupSpacing(display, 0, 0);
+ Hide(display);
+
+ vscrollbar=ScrollBar(display, -1, 1, vscrollproc);
+
+/* the names panel goes immediately to the right of the vertical bar */
+ ObjectRect(vscrollbar, &rect);/* vscrollbar for names */
+ pt.x=rect.right; /*how near they should be with name panel Ramu */
+ pt.y=rect.top;
+ SetNextPosition(display, pt);
+ names=make_panel(NAMES,display, nwidth+(5*max_names), height, firstseq,nseqs); /* 5*max_names Ramu */
+
+/* the sequence panel goes immediately to the right of the names panel */
+ ObjectRect(names, &rect);
+ pt.x=rect.right;
+ pt.y=rect.top;
+ SetNextPosition(display, pt);
+ seqs=make_panel(SEQS,display, swidth, height, firstseq,nseqs);
+
+/* horizontal scroll bars */
+/* (one at the bottom-left corner of each panel) */
+ ObjectRect(names, &rect);
+ pt.x=rect.left;
+ pt.y=rect.bottom;
+ SetNextPosition(display, pt);
+ hnscrollbar=ScrollBar(display, 1, -1, hscrollnameproc);
+ ObjectRect(seqs, &rect);
+ pt.x=rect.left;
+ pt.y=rect.bottom;
+ SetNextPosition(display, pt);
+ hsscrollbar=ScrollBar(display, 1, -1, hscrollseqproc);
+
+ SetRange(hsscrollbar,1,1,0);
+ SetRange(hnscrollbar,1,1,0);
+ SetRange(vscrollbar,1,1,0);
+
+/* cross-link the two panels through 'index'; the names panel only gets a */
+/* horizontal bar, the sequence panel gets both bars */
+ GetPanelExtra(names,&ndata);
+ ndata.hscrollbar=hnscrollbar;
+ ndata.index=seqs;
+ ndata.prf_no=prf_no;
+
+ GetPanelExtra(seqs,&sdata);
+ sdata.vscrollbar=vscrollbar;
+ sdata.hscrollbar=hsscrollbar;
+ sdata.index=names;
+ sdata.prf_no=prf_no;
+
+ SetPanelClick(names,NameClick, NameDrag, NULL, NameRelease);
+ SetPanelClick(seqs,SeqClick, SeqDrag, NULL, SeqRelease);
+
+ p->names = names;
+ p->seqs = seqs;
+
+/* allocate the line buffers and store the completed records in the panels */
+ ndata=alloc_name_data(ndata);
+ sdata=alloc_seq_data(sdata);
+ SetPanelExtra(names,&ndata);
+ SetPanelExtra(seqs,&sdata);
+
+ Show(display);
+ return(display);
+}
+
+
+/*
+ * Calls Black(), InvertColors(), White() in that order.
+ * Going by the name this leaves white drawing on a black background
+ * (assumes InvertColors swaps foreground and background - TODO confirm).
+ */
+void white_on_black(void)
+{
+ Black(); InvertColors(); White();
+}
+/*
+ * Calls White(), InvertColors(), Black() in that order.
+ * Going by the name this leaves black drawing on a white background
+ * (assumes InvertColors swaps foreground and background - TODO confirm).
+ * highlight_seqrange() calls it to reset the colours when it has finished.
+ */
+void black_on_white(void)
+{
+ White(); InvertColors(); Black();
+}
+/*
+ * Colour set-up used by highlight_seqrange() for format == HIGHLIGHT:
+ * selects RGB (220,220,220), inverts, then selects black.
+ * Presumably this gives black drawing on a light grey background - TODO confirm.
+ */
+void text_colors(void)
+{
+ SelectColor(220,220,220);
+ InvertColors();
+ Black();
+}
+/*
+ * Colour set-up used by highlight_seqrange() for every format other than
+ * HIGHLIGHT.  Issues the same three calls as black_on_white().
+ */
+void data_colors(void)
+{
+ White();
+ InvertColors();
+ Black();
+}
+
+
+
+
+/*
+ * Build the column ruler for an alignment of `length` columns.
+ *
+ * name receives the string "ruler" (needs room for 6 chars).
+ * seq receives `length` characters plus a terminating NUL: column 1 is '1',
+ * every column number that is a multiple of ten is written right-aligned so
+ * that its last digit sits on that column, and all other columns are '.'.
+ * Example for length 20:  "1.......10........20"
+ * A negative length is treated as 0 and yields an empty ruler.
+ */
+void make_ruler(int length, char *name,char *seq)
+{
+    /* room for the decimal text of any int; the old 5-byte buffer
+       overflowed as soon as a marker reached 10000 */
+    char marker[3 * sizeof(int) + 2];
+    int marker_len;
+    int i,j;
+
+    if (length < 0)
+        length = 0;
+
+    strcpy(name,"ruler");
+    seq[0] = '1';
+    for (i=1;i<length;i++)
+    {
+        if ((i+1)%10 > 0)
+        {
+            seq[i] = '.';
+            continue;
+        }
+
+        /* column i+1 is a multiple of ten: overwrite the preceding dots so
+           the number ends exactly on this column */
+        sprintf(marker,"%d",((i+1)/10)*10);
+        marker_len = (int)strlen(marker);
+        for (j=0;j<marker_len && i+1+j-marker_len < length;j++)
+            seq[i+1+j-marker_len] = marker[j];
+    }
+    seq[length]='\0';
+}
+
+panel_data free_panel_data(panel_data data)
+{
+ int i;
+
+ if (data.header!=NULL)
+ {
+ for (i=0;i<mheader;i++)
+ {
+ if(data.header[i] != NULL) ckfree(data.header[i]);
+ data.header[i]=NULL;
+ }
+ ckfree(data.header);
+ data.header=NULL;
+ }
+ if (data.footer!=NULL)
+ {
+ for (i=0;i<mfooter;i++)
+ {
+ if(data.footer[i] != NULL) ckfree(data.footer[i]);
+ data.footer[i]=NULL;
+ }
+ ckfree(data.footer);
+ data.footer=NULL;
+ }
+ if (data.consensus!=NULL)
+ {
+ ckfree(data.consensus);
+ data.consensus=NULL;
+ }
+ if (data.lines!=NULL)
+ {
+ for (i=0;i<data.nseqs;i++)
+ {
+ if(data.lines[i] != NULL) ckfree(data.lines[i]);
+ data.lines[i]=NULL;
+ }
+ ckfree(data.lines);
+ data.lines=NULL;
+ }
+ if (data.colormask!=NULL)
+ {
+ for (i=0;i<data.nseqs;i++)
+ {
+ if(data.colormask[i] != NULL) ckfree(data.colormask[i]);
+ data.colormask[i]=NULL;
+ }
+ ckfree(data.colormask);
+ data.colormask=NULL;
+ }
+ if (data.selected!=NULL) ckfree(data.selected);
+ data.selected=NULL;
+
+ if (data.seqweight!=NULL) ckfree(data.seqweight);
+ data.seqweight=NULL;
+ if (data.subgroup!=NULL) ckfree(data.subgroup);
+ data.subgroup=NULL;
+ if (data.colscore!=NULL) ckfree(data.colscore);
+ data.colscore=NULL;
+ if (data.residue_exception!=NULL)
+ {
+ for (i=0;i<data.nseqs;i++)
+ {
+ if(data.residue_exception[i] != NULL) ckfree(data.residue_exception[i]);
+ data.residue_exception[i]=NULL;
+ }
+ ckfree(data.residue_exception);
+ data.residue_exception=NULL;
+ }
+ if (data.segment_exception!=NULL)
+ {
+ for (i=0;i<data.nseqs;i++)
+ {
+ if(data.segment_exception[i] != NULL) ckfree(data.segment_exception[i]);
+ data.segment_exception[i]=NULL;
+ }
+ ckfree(data.segment_exception);
+ data.segment_exception=NULL;
+ }
+
+ return(data);
+}
+
+
+/*
+ * Build the conservation line for the alignment held in data.
+ *
+ * name is set to the empty string.  For each of the data.ncols columns
+ * seq1[i] becomes:
+ *   '*'  the number of sequences matching the first sequence's letter equals
+ *        data.nseqs,
+ *   ':'  (only when dnaflag is false) some res_cat1 group contains the
+ *        residue of every sequence,
+ *   '.'  (only when dnaflag is false) some res_cat2 group contains the
+ *        residue of every sequence,
+ *   ' '  otherwise.
+ * A column is only scored when the first sequence holds a letter there.
+ * seq1 needs room for data.ncols characters and is NOT NUL-terminated here.
+ */
+void make_consensus(panel_data data,char *name,char *seq1)
+{
+    char c,res;
+    sint catident1[NUMRES],catident2[NUMRES],ident;
+    sint i,j,k,l;
+
+    strcpy(name,"");
+    for(i=0; i<data.ncols; i++) {
+        seq1[i]=' ';
+        ident=0;
+        for(k=0;res_cat1[k]!=NULL;k++) catident1[k] = 0;
+        for(k=0;res_cat2[k]!=NULL;k++) catident2[k] = 0;
+
+        for(j=0;j<data.nseqs;++j) {
+            /* The test only looks at the first sequence, so a failure
+               rules out the whole column.  The cast is required because
+               the <ctype.h> functions are undefined for negative char
+               values. */
+            if(!isalpha((unsigned char)data.lines[0][i]))
+                break;
+
+            res=data.lines[j][i];
+            if(data.lines[0][i] == res)
+                ++ident;
+
+            /* count the sequence once for each group that lists its residue */
+            for(k=0;res_cat1[k]!=NULL;k++) {
+                for(l=0;(c=res_cat1[k][l])!='\0';l++) {
+                    if (res==c) {
+                        catident1[k]++;
+                        break;
+                    }
+                }
+            }
+            for(k=0;res_cat2[k]!=NULL;k++) {
+                for(l=0;(c=res_cat2[k][l])!='\0';l++) {
+                    if (res==c) {
+                        catident2[k]++;
+                        break;
+                    }
+                }
+            }
+        }
+
+        if(ident==data.nseqs)
+            seq1[i]='*';
+        else if (!dnaflag) {
+            for(k=0;res_cat1[k]!=NULL;k++) {
+                if (catident1[k]==data.nseqs) {
+                    seq1[i]=':';
+                    break;
+                }
+            }
+            /* res_cat2 is only consulted when no res_cat1 group matched */
+            if(seq1[i]==' ') {
+                for(k=0;res_cat2[k]!=NULL;k++) {
+                    if (catident2[k]==data.nseqs) {
+                        seq1[i]='.';
+                        break;
+                    }
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Produce the "Structures" display line for profile prf_no (1 or 2).
+ *
+ * A line is produced only when the profile's struct_penalties value is SECST
+ * and its use_ss flag is TRUE.  In that case name becomes "Structures", the
+ * first len characters of the mask written by print_sec_struct_mask() are
+ * copied into seq (a value equal to gap_pos1 is shown as '-'), seq is
+ * terminated with EOS and 1 is returned.
+ * In every other case name and seq are left empty and 0 is returned.
+ */
+int make_struct_data(int prf_no,int len, char *name,char *seq)
+{
+    char *ss_mask;
+    char val;
+    int pos;
+
+    seq[0]='\0';
+    name[0]='\0';
+
+    if (prf_no == 1)
+    {
+        if (struct_penalties1 != SECST || use_ss1 != TRUE)
+            return(0);
+
+        strcpy(name,"Structures");
+        ss_mask = (char *)ckalloc((seqlen_array[1]+10) * sizeof(char));
+        for (pos=0;pos<seqlen_array[1];pos++)
+            ss_mask[pos] = sec_struct_mask1[pos];
+        print_sec_struct_mask(seqlen_array[1],sec_struct_mask1,ss_mask);
+
+        for (pos=0; pos<len; pos++) {
+            val = ss_mask[pos];
+            seq[pos] = (val == gap_pos1) ? '-' : val;
+        }
+        seq[pos]=EOS;
+        ckfree(ss_mask);
+        return(1);
+    }
+
+    if (prf_no == 2)
+    {
+        if (struct_penalties2 != SECST || use_ss2 != TRUE)
+            return(0);
+
+        /* the first sequence of profile 2 sits right after profile 1 */
+        strcpy(name,"Structures");
+        ss_mask = (char *)ckalloc((seqlen_array[profile1_nseqs+1]+10) * sizeof(char));
+        for (pos=0;pos<seqlen_array[profile1_nseqs+1];pos++)
+            ss_mask[pos] = sec_struct_mask2[pos];
+        print_sec_struct_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,ss_mask);
+
+        for (pos=0; pos<len; pos++) {
+            val = ss_mask[pos];
+            seq[pos] = (val == gap_pos1) ? '-' : val;
+        }
+        seq[pos]=EOS;
+        ckfree(ss_mask);
+        return(1);
+    }
+
+    return(0);
+}
+
+/*
+ * Produce the "Gap Penalties" display line for profile prf_no (1 or 2).
+ *
+ * A line is produced only when the profile's struct_penalties value is GMASK
+ * and its use_ss flag is TRUE.  In that case name becomes "Gap Penalties",
+ * the first len entries of the profile's gap_penalty_mask are copied into
+ * seq (a value equal to gap_pos1 is shown as '-'), seq is terminated with
+ * EOS and 1 is returned.
+ * In every other case name and seq are left empty and 0 is returned.
+ */
+int make_gp_data(int prf_no,int len, char *name,char *seq)
+{
+    char val;
+    int pos;
+
+    seq[0]='\0';
+    name[0]='\0';
+
+    if (prf_no == 1)
+    {
+        if (struct_penalties1 != GMASK || use_ss1 != TRUE)
+            return(0);
+
+        strcpy(name,"Gap Penalties");
+        for (pos=0; pos<len; pos++) {
+            val = gap_penalty_mask1[pos];
+            seq[pos] = (val == gap_pos1) ? '-' : val;
+        }
+        seq[pos]=EOS;
+        return(1);
+    }
+
+    if (prf_no == 2)
+    {
+        if (struct_penalties2 != GMASK || use_ss2 != TRUE)
+            return(0);
+
+        strcpy(name,"Gap Penalties");
+        for (pos=0; pos<len; pos++) {
+            val = gap_penalty_mask2[pos];
+            seq[pos] = (val == gap_pos1) ? '-' : val;
+        }
+        seq[pos]=EOS;
+        return(1);
+    }
+
+    return(0);
+}
+
+/*
+ * Vertical scroll callback installed by make_scroll_area() for prf_no 0:
+ * makes seq_panel the active panel, then scrolls the names and the
+ * sequences.  The GraphiC argument is not used.
+ */
+static void VscrollMulti(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=seq_panel;
+ vscrollnames(bar, newval, oldval);
+ vscrollseqs(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the names panel when prf_no is 0:
+ * makes seq_panel the active panel and scrolls the names.
+ */
+static void HscrollMultiN(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=seq_panel;
+ hscrollnames(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the sequence panel when prf_no is 0:
+ * makes seq_panel the active panel and scrolls the sequences.
+ */
+static void HscrollMultiS(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=seq_panel;
+ hscrollseqs(bar, newval, oldval);
+}
+
+/*
+ * Vertical scroll callback for profile 1: makes prf_panel[0] the active
+ * panel, then scrolls its names and sequences.
+ */
+static void VscrollPrf1(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=prf_panel[0];
+ vscrollnames(bar, newval, oldval);
+ vscrollseqs(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the names panel of profile 1:
+ * makes prf_panel[0] the active panel and scrolls the names.
+ */
+static void HscrollPrf1N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=prf_panel[0];
+ hscrollnames(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the sequence panel of profile 1.
+ * Scrolls profile 1 and, when fixed_prf_scroll is TRUE, applies the same
+ * scroll to profile 2 as well.  active_panel is left on whichever panel was
+ * scrolled last (profile 2 when the scroll is locked).
+ */
+static void HscrollPrf1S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=prf_panel[0];
+ hscrollseqs(bar, newval, oldval);
+ if(fixed_prf_scroll==TRUE)
+ {
+ active_panel=prf_panel[1];
+ hscrollseqs(bar, newval, oldval);
+ }
+}
+
+/*
+ * Vertical scroll callback for profile 2: makes prf_panel[1] the active
+ * panel, then scrolls its names and sequences.
+ */
+static void VscrollPrf2(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=prf_panel[1];
+ vscrollnames(bar, newval, oldval);
+ vscrollseqs(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the names panel of profile 2:
+ * makes prf_panel[1] the active panel and scrolls the names.
+ */
+static void HscrollPrf2N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ active_panel=prf_panel[1];
+ hscrollnames(bar, newval, oldval);
+}
+
+/*
+ * Horizontal scroll callback for the sequence panel of profile 2.
+ * When fixed_prf_scroll is TRUE profile 1 is scrolled first with the same
+ * values; profile 2 is always scrolled last, so active_panel ends up as
+ * prf_panel[1].
+ */
+static void HscrollPrf2S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+ if(fixed_prf_scroll==TRUE)
+ {
+ active_panel=prf_panel[0];
+ hscrollseqs(bar, newval, oldval);
+ }
+ active_panel=prf_panel[1];
+ hscrollseqs(bar, newval, oldval);
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/xmenu.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xmenu.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xmenu.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,4636 @@
+/***********************************************************************************************
+ *
+ *
+ *
+ * History
+ *
+ * 27.3.2002 - color parameter chooser can browse filenames like load sequences - Jose
+ * 16.1.2002 - remove the 'cut sequences' dialog box, not needed - Jose
+ * 17.1.2002 - 'Remove positions that contain gaps in all sequences ?' removed, no need for confirmation - Toby
+ *
+ *
+ *
+*/
+
+#include <stdarg.h>
+#include <string.h>
+
+#include <vibrant.h>
+#include <document.h>
+
+/* #include <ncbi.h> ramu for time funs */
+
+#include "clustalw.h"
+#include "xmenu.h"
+
+
+static void RemoveWin(WindoW w);
+static void QuitWinW(WindoW w);
+static void QuitWinI(IteM i);
+static void QuitHelpW(WindoW w);
+static void QuitHelpB(ButtoN b);
+static void SearchStrWin (IteM item);
+static void SavePSSeqWin (IteM item);
+static void SavePSPrf1Win (IteM item);
+static void SavePSPrf2Win (IteM item);
+static void SaveSeqFileWin (IteM item);
+static void SavePrf1FileWin (IteM item);
+static void SavePrf2FileWin (IteM item);
+static void OpenColorParWin (IteM item);
+static void SearchStr(ButtoN but);
+static void SavePSSeqFile(ButtoN but);
+static void SavePSPrf1File(ButtoN but);
+static void SavePSPrf2File(ButtoN but);
+static void SaveSeqFile(ButtoN but);
+static void SavePrf1File(ButtoN but);
+static void SavePrf2File(ButtoN but);
+static void SaveScoresWin (IteM item);
+static void SaveScores(ButtoN but);
+static void OpenColorPar(ButtoN but);
+static void CancelWin(ButtoN but);
+static void SaveTreeWin (IteM item);
+static void CAlignWin (IteM item);
+static void RealignSeqsWin (IteM item);
+static void RealignSeqRangeWin (IteM item);
+static void DrawTreeWin (IteM item);
+static void AlignFromTreeWin(IteM item);
+static void PrfPrfAlignWin(IteM item);
+static void PrfPrfTreeAlignWin(IteM item);
+static void SeqPrfAlignWin(IteM item);
+static void SeqPrfTreeAlignWin(IteM item);
+static void BootstrapTreeWin (IteM item);
+static void CreateAlignTree(ButtoN but);
+static void CompleteAlign(ButtoN but);
+static void RealignSeqs(ButtoN but);
+static void RealignSeqRange(ButtoN but);
+static void DrawTree(ButtoN but);
+static void AlignFromTree(ButtoN but);
+static void PrfPrfAlign(ButtoN but);
+static void PrfPrfTreeAlign(ButtoN but);
+static void SeqPrfAlign(ButtoN but);
+static void SeqPrfTreeAlign(ButtoN but);
+static void BootstrapTree(ButtoN but);
+static void OpenSeqFile (IteM item);
+static void AppendSeqFile (IteM item);
+static void OpenPrf1File (IteM item);
+static void OpenPrf2File (IteM item);
+static void ScoreWin(IteM item);
+static void SegmentWin(IteM item);
+static void ScoreSegments(ButtoN but);
+static void PWParameters(IteM item);
+static void MultiParameters(IteM item);
+static void GapParameters(IteM item);
+static void SSParameters(IteM item);
+static void OutputParameters(IteM item);
+static void OutputTreeParameters(IteM item);
+static void HelpProc(IteM item);
+static void DefColorPar(IteM item);
+static void BlackandWhite(IteM item);
+static void set_reset_new_gaps(IteM i);
+static void set_reset_all_gaps(IteM i);
+static void SearchStringAgain(ButtoN but);
+
+static PopuP make_toggle(GrouP g,CharPtr title,CharPtr true_text, CharPtr false_text,
+ Boolean *value,PupActnProc SetProc);
+static PrompT make_scale(GrouP g,CharPtr title,int length,int value,int max,BarScrlProc SetProc);
+static PrompT make_prompt(GrouP g,CharPtr title);
+
+static void CutSequences(IteM item);
+static void PasteSequences(IteM item);
+static void RemoveGaps(IteM item);
+static void RemoveGapPos(IteM item);
+
+static void SelectSeqs(IteM item);
+static void SelectPrf1(IteM item);
+static void SelectPrf2(IteM item);
+static void MergeProfiles(IteM item);
+static void ClearSeqs(IteM item);
+
+static void cut_multiplem(void);
+static void cut_profile1(void);
+static void cut_profile2(void);
+static void ssave(int j);
+static void sscpy(int i,int j);
+static void sload(int i);
+static void clear_seqrange(spanel p);
+static void select_seqs(spanel p,Boolean flag);
+static void clear_seg_exceptions(spanel p);
+
+static void make_menu_headers(WindoW w);
+static void make_help_menu(void);
+static void make_score_menu(void);
+static void make_file_menu(void);
+static void make_edit_menu(void);
+static void make_align_menu(void);
+static void make_tree_menu(void);
+static void make_color_menu(void);
+
+static void save_aln_window(int prf_no,char *title,char *prompt,void save_proc(ButtoN but));
+static void save_ps_window(int prf_no,char *prompt,void save_proc(ButtoN but));
+static void read_file_window(char *title,char *prompt,char *filename,void read_proc(ButtoN but));
+static void do_align_window(WindoW *alignw,TexT *treetext,Boolean treestatus,char *title,void align_proc(ButtoN but));
+static void do_palign_window(WindoW *alignw,TexT *tree1text,TexT *tree2test,Boolean treestatus,char *title,void align_proc(ButtoN but));
+static Boolean open_aln_files(void);
+static void write_file(int fseq,int lseq,int fres,int lres);
+
+
+Boolean x_menus=FALSE;
+
+int mheader = 2; /* maximum number of header lines */
+int mfooter = 1; /* maximum number of footer lines */
+int max_mlines = 20; /* multiple alignment display length in lines; x_menu() sizes the initial panel height from it */
+int min_mlines = 10; /* multiple alignment display length (presumably the minimum - TODO confirm where it is used) */
+int max_plines = 8; /* profile alignment display length in lines; x_menu() sizes the initial panel height from it */
+int min_plines1 = 5; /* profile alignment display length (presumably the minimum for profile 1 - TODO confirm) */
+int min_plines2 = 3; /* profile alignment display length (presumably the minimum for profile 2 - TODO confirm) */
+
+Boolean aln_mode = MULTIPLEM;
+Boolean window_displayed = FALSE;
+
+int save_format = CLUSTAL;
+Boolean fixed_prf_scroll = FALSE;
+int loffset,boffset,toffset;
+int roffset;
+int poffset;
+
+int score_cutoff=5; /* cutoff for residue exceptions */
+int score_hwin=5; /* half window for summing alignment column scores */
+int score_scale=5;
+int segment_dnascale=5;
+int length_cutoff=1; /* length cutoff for segment exceptions */
+Boolean residue_exceptions=FALSE;
+Boolean segment_exceptions=FALSE;
+int score_matnum=4;
+char score_mtrxname[FILENAMELEN];
+int segment_matnum=3;
+char segment_mtrxname[FILENAMELEN];
+int score_dnamatnum=1;
+char score_dnamtrxname[FILENAMELEN];
+int segment_dnamatnum=1;
+char segment_dnamtrxname[FILENAMELEN];
+
+Boolean output_ss;
+Boolean output_gp;
+
+extern char revision_level[];
+extern Boolean interactive;
+
+extern char seqname[];
+extern char outfile_name[];
+extern char profile1_name[];
+extern char profile2_name[];
+extern char usermtrxname[], pw_usermtrxname[];
+extern char dnausermtrxname[], pw_dnausermtrxname[];
+
+extern Boolean usemenu;
+extern Boolean use_tree_file;
+extern Boolean use_tree1_file,use_tree2_file;
+extern Boolean dnaflag;
+extern sint nseqs;
+extern sint profile1_nseqs;
+extern sint profile_no;
+extern sint max_aa;
+extern sint *seqlen_array;
+extern char **seq_array;
+extern char **names, **titles;
+extern Boolean empty;
+extern Boolean profile1_empty, profile2_empty;
+extern sint gap_pos1, gap_pos2;
+extern Boolean use_ambiguities;
+
+
+extern float gap_open, gap_extend;
+extern float dna_gap_open, dna_gap_extend;
+extern float prot_gap_open, prot_gap_extend;
+extern float pw_go_penalty, pw_ge_penalty;
+extern float dna_pw_go_penalty, dna_pw_ge_penalty;
+extern float prot_pw_go_penalty, prot_pw_ge_penalty;
+extern sint wind_gap,ktup,window,signif;
+extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
+extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
+extern sint helix_penalty;
+extern sint strand_penalty;
+extern sint loop_penalty;
+extern sint helix_end_minus;
+extern sint helix_end_plus;
+extern sint strand_end_minus;
+extern sint strand_end_plus;
+extern sint helix_end_penalty;
+extern sint strand_end_penalty;
+extern sint divergence_cutoff;
+extern sint gap_dist;
+extern sint boot_ntrials; /* number of bootstrap trials */
+extern unsigned sint boot_ran_seed; /* random number generator seed */
+
+extern sint matnum,pw_matnum;
+extern char mtrxname[], pw_mtrxname[];
+extern sint dnamatnum,pw_dnamatnum;
+extern char dnamtrxname[], pw_dnamtrxname[];
+
+extern MatMenu matrix_menu;
+extern MatMenu pw_matrix_menu;
+extern MatMenu dnamatrix_menu;
+
+extern Boolean quick_pairalign;
+extern sint matnum,pw_matnum;
+extern Boolean neg_matrix;
+extern float transition_weight;
+extern char hyd_residues[];
+extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
+extern Boolean use_endgaps;
+extern Boolean endgappenalties;
+extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
+extern Boolean output_fasta; /* Ramu */
+
+extern Boolean save_parameters;
+extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus, output_pim;
+extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
+extern Boolean cl_seq_numbers;
+
+extern Boolean seqRange;
+
+extern sint output_order;
+extern sint *output_index;
+extern Boolean reset_alignments_new; /* DES */
+extern Boolean reset_alignments_all; /* DES */
+
+extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile;
+extern FILE *gde_outfile, *nexus_outfile;
+extern FILE *fasta_outfile;
+
+extern sint max_aln_length;
+
+extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
+extern Boolean kimura; /* Use correction for multiple substitutions */
+extern sint bootstrap_format; /* bootstrap file format */
+
+extern sint output_struct_penalties;
+extern Boolean use_ss1, use_ss2;
+extern char *res_cat1[];
+extern char *res_cat2[];
+
+extern char *amino_acid_codes;
+
+PrompT message; /* used in temporary message window */
+
+static Char filename[FILENAMELEN]; /* used in temporary file selection window */
+
+Boolean mess_output=TRUE;
+Boolean save_log=FALSE;
+FILE *save_log_fd=NULL;
+static char save_log_filename[FILENAMELEN];
+static IteM save_item1,save_item2,exc_item;
+
+spanel seq_panel; /* data for multiple alignment area */
+spanel prf_panel[2]; /* data for profile alignment areas */
+spanel active_panel; /* 'in-use' panel -scrolling,clicking etc. */
+static range selected_seqs; /* sequences selected by clicking on names */
+static range selected_res; /* residues selected by clicking on seqs */
+int firstres, lastres; /* range of alignment for saving as ... */
+
+/* data for Search function */
+
+char find_string[MAXFINDSTR]="";
+aln_pos find_pos;
+
+/* arrays for storing clustalw data for cut-and-paste sequences */
+static sint *saveseqlen_array=NULL;
+static char **saveseq_array=NULL;
+static char **savenames=NULL, **savetitles=NULL;
+sint ncutseqs=0;
+
+FonT datafont,helpfont;
+WindoW mainw=NULL;
+WindoW messagew=NULL;
+WindoW readfilew=NULL;
+WindoW savealnw=NULL;
+WindoW savescoresw=NULL;
+WindoW savepsw=NULL;
+WindoW findw=NULL;
+WindoW calignw=NULL;
+WindoW ralignw=NULL;
+WindoW rralignw=NULL;
+WindoW talignw=NULL;
+WindoW palignw=NULL;
+WindoW salignw=NULL;
+WindoW scorew=NULL;
+WindoW exceptionw=NULL;
+TexT savealntext;
+TexT savescorestext;
+TexT savepstext;
+TexT findtext;
+TexT pspartext;
+TexT ctreetext;
+TexT rtreetext;
+TexT rrtreetext;
+TexT ttreetext;
+TexT ptree1text,ptree2text;
+TexT streetext;
+TexT readfiletext;
+WindoW savetreew=NULL;
+TexT savetreetext;
+WindoW drawtreew=NULL;
+TexT drawnjtreetext;
+TexT drawphtreetext;
+TexT drawdsttreetext;
+TexT drawnxstreetext;
+
+TexT drawpimtext;
+
+WindoW boottreew=NULL;
+TexT bootnjtreetext;
+TexT bootphtreetext;
+TexT bootnxstreetext;
+TexT blocklentext;
+PrompT mattext,pwmattext,dnamattext,pwdnamattext,scoremattext,segmentmattext;
+PrompT scorednamattext,segmentdnamattext;
+GrouP seg_matrix_list,score_matrix_list;
+GrouP seg_dnamatrix_list,score_dnamatrix_list;
+GrouP matrix_list,pw_matrix_list,dnamatrix_list,pw_dnamatrix_list;
+
+TexT cl_outtext,pir_outtext,msf_outtext,phylip_outtext,gde_outtext,nexus_outtext;
+TexT fasta_outtext; /* Ramu */
+
+GrouP slow_para,fast_para;
+GrouP seq_display,prf1_display,prf2_display;
+
+MenU filem,alignm,editm,treem,colorm;
+menu_item file_item,align_item,edit_item,tree_item,color_item;
+MenU scorem,helpmenu;
+menu_item score_item,help_item;
+IteM segment_item;
+IteM bw_item,defcol_item,usercol_item;
+IteM new_gaps_item,all_gaps_item;
+WindoW helpw[MAXHELPW];
+int numhelp=0;
+
+PopuP modetext,flisttext;
+ButtoN pscrolltext;
+
+ButtoN selFonts;
+
+PopuP show_seg_toggle;
+PrompT residue_cutofftext;
+PrompT length_cutofftext;
+PrompT scorescaletext;
+PrompT segmentdnascaletext;
+
+#define MAXFONTS 6
+int nfonts=6; /* number of entries of av_font offered in the font-size popup; should be MAXFONTS - ramu */
+int av_font[MAXFONTS]={8,10,12,14,18,24}; /* selectable font sizes */
+int font_size=1; /* index into av_font of the size currently in use */
+
+int ncolors=0;
+int ncolor_pars=0;
+color color_lut[MAXCOLORS+1];
+char def_protpar_file[]="colprot.par";
+char def_dnapar_file[]="coldna.par";
+char *explicit_par_file = NULL;
+char *par_file = NULL;
+int inverted = TRUE;
+int usebw=FALSE,usedefcolors=TRUE,useusercolors=FALSE;
+
+char ps_par_file[FILENAMELEN]="colprint.par";
+int pagesize=A4;
+int orientation=LANDSCAPE;
+Boolean ps_header=TRUE;
+Boolean ps_ruler=TRUE;
+Boolean resize=TRUE;
+int first_printres=0,last_printres=0,blocklen;
+Boolean ps_curve=TRUE;
+Boolean ps_resno=TRUE;
+PoinT display_pos;
+int namewidth,seqwidth; /* fixed widths of sequence display areas */
+
+Boolean realign_endgappenalties=TRUE;
+Boolean align_endgappenalties=FALSE;
+
+char helptext[MAXHELPLENGTH];
+
+
+
+
+/* ramu */
+
+#include <time.h>
+#include <math.h>
+#include <unistd.h>
+#include <pwd.h>
+#include <sys/times.h>
+
+float cputime(float *seconds); /* Ramu , need's reset function */
+
+/*
+ * Report the CPU time (user + system) used by this process.
+ *
+ * The first call defines time zero and returns 0.  Every later call returns
+ * the CPU seconds consumed since that first call.  When seconds is not NULL
+ * it receives the CPU seconds consumed since the previous call (0 on the
+ * first call).  If times() fails, the previously reported total is returned
+ * and *seconds is set to 0.
+ *
+ * Uses static state, so there is one shared clock and no way to reset it.
+ */
+float cputime(float *seconds)
+{
+    static clock_t first_ticks = 0; /* CPU ticks at the first call */
+    static clock_t last_ticks = 0;  /* elapsed ticks reported by the previous call */
+    static int started = 0;
+    struct tms usage;
+    clock_t now;
+    long hertz = sysconf(_SC_CLK_TCK);
+
+    /* get the current number of user and system cpu ticks */
+    if (times(&usage) == (clock_t)-1)
+    {
+        if (seconds)
+            *seconds = 0.0f;
+        return ((float)last_ticks)/(float)hertz;
+    }
+    now = usage.tms_utime + usage.tms_stime;
+
+    /* if this is the first call then this is time zero */
+    if (!started)
+    {
+        first_ticks = now;
+        started = 1;
+    }
+
+    /* always measure relative to the first call; the old code skipped this
+       on the first call, which made the next interval wrong */
+    now -= first_ticks;
+
+    if (seconds)
+        *seconds = ((float)(now - last_ticks))/(float)hertz;
+    last_ticks = now;
+    return ((float)now)/(float)hertz;
+}
+
+/* Ramu */
+
+
+/*
+ * Main subroutine called from clustalx.c: initialises the windows and then
+ * enters the event loop that monitors user input.
+ *
+ * In order it
+ *   - creates the main document window and the pull-down menus,
+ *   - adds the alignment-mode popup, the font-size popup and the
+ *     "Lock Scroll" check box,
+ *   - builds one scroll area for the multiple alignment and two for the
+ *     profiles,
+ *   - shows and loads either the multiple alignment or the two profiles,
+ *     depending on whether profile 1 is empty,
+ *   - records the pixel offsets boffset, loffset, toffset, roffset and
+ *     poffset for later resize handling,
+ *   - initialises blocklen from orientation and pagesize,
+ *   - sets window_displayed and calls ProcessEvents().
+ */
+
+void x_menu(void)
+{
+ int i,n;
+ char font[30];
+ char tstr[30];
+ int height;
+ PrompT fsize;
+ RecT wr,r,r1;
+
+
+/* make the pulldown menu bar */
+
+#ifdef WIN_MAC
+ MenU m;
+
+ m=AppleMenu (NULL);
+ DeskAccGroup (m);
+ make_menu_headers(NULL);
+#endif
+#ifndef UNIX
+ ProcessUpdatesFirst(FALSE);
+#endif
+
+/* window title is "Clustal" followed by revision_level; tstr holds 30 bytes */
+ sprintf(tstr,"Clustal%s",revision_level);
+/*#ifdef WIN_MSWIN
+ mainw = FixedWindow (-50,-33,-10,-10,tstr,QuitWinW);
+#else*/
+ mainw = DocumentWindow (-50,-33,-10,-10,tstr,QuitWinW,ResizeWindowProc);
+/* NOTE: SetGroupSpacing is issued twice in a row, once on the line that
+   closes the commented-out #endif and once on the line after it */
+/*#endif*/ SetGroupSpacing(mainw,0,10);
+ SetGroupSpacing(mainw,0,10);
+
+ x_menus=TRUE;
+
+#ifndef WIN_MAC
+ make_menu_headers(mainw);
+#endif
+/* decide if we're starting in profile or sequence mode */
+ if (!profile1_empty) aln_mode=PROFILEM;
+ else aln_mode=MULTIPLEM;
+
+ make_file_menu();
+ make_edit_menu();
+ make_align_menu();
+ make_tree_menu();
+ make_color_menu();
+ make_score_menu();
+ make_help_menu();
+
+/* add a button to switch between multiple and profile alignment modes */
+
+ modetext=PopupList(mainw,TRUE,set_aln_mode);
+ PopupItem(modetext,"Multiple Alignment Mode");
+ PopupItem(modetext,"Profile Alignment Mode");
+ if(aln_mode==MULTIPLEM)
+ SetValue(modetext,1);
+ else
+ SetValue(modetext,2);
+
+/* alignment font: courier at the size selected by font_size */
+ sprintf(font, "%s,%d,%c", "courier", av_font[font_size], 'm');
+ datafont=ParseFont(font);
+
+/* help font: courier, fixed size 10 */
+ sprintf(font, "%s,%d,%c", "courier", 10, 'm');
+ helpfont=ParseFont(font);
+
+ Advance(mainw);
+ shift(mainw,20,0);
+
+/* add a button to select font size */
+ fsize=StaticPrompt(mainw,"Font Size:",0,dialogTextHeight,systemFont,'r');
+ Advance(mainw);
+ flisttext=PopupList(mainw,TRUE,set_font_size);
+ for(i=0;i<nfonts;i++)
+ {
+ sprintf(tstr,"%d",av_font[i]);
+ PopupItem(flisttext,tstr);
+ }
+/* popup values are 1-based, font_size is a 0-based index */
+ SetValue(flisttext,font_size+1);
+
+ Advance(mainw);
+ shift(mainw,20,0);
+
+ /* ramu .........
+ selFonts = PushButton(mainw,"Select Fonts",VSeqMgrFontProc);
+ Advance(mainw);
+ shift(mainw,20,0);
+
+ end ramu ........... */
+
+/* add a button to switch profile scrolling modes */
+ pscrolltext=CheckBox(mainw,"Lock Scroll",set_pscroll_mode);
+ if(fixed_prf_scroll) SetStatus(pscrolltext,TRUE);
+ Break(mainw);
+
+
+/* nothing is selected to begin with */
+ selected_seqs.first=selected_seqs.last=-1;
+ selected_res.first=selected_res.last=-1;
+
+
+/* initialise the multiple alignment display area */
+
+ SelectFont(datafont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+
+ GetNextPosition(mainw,&display_pos);
+
+/* calculate initial pixel width and height of displays */
+ namewidth=(DNAMES+DNUMBER+1)*stdCharWidth;
+ seqwidth=(DCOLS+2*MARGIN)*stdCharWidth+2;
+/* clip the sequence panel so that names + sequences fit the screen width */
+ n=screenRect.right-screenRect.left;
+ if(seqwidth+namewidth>n) seqwidth=n-namewidth;
+
+ height=(max_mlines+mfooter+MARGIN)*stdLineHeight+2+SCOREHEIGHT;
+/* from here on n is the screen height; it also clips the profile height below */
+ n=screenRect.bottom-screenRect.top;
+ if(height>n) height=n;
+
+ seq_display=make_scroll_area(mainw,0,namewidth+20,seqwidth,height,1,nseqs,&seq_panel);
+ position_scrollbars(seq_panel);
+
+/* initialise the profile alignment display area */
+
+/* the profile displays start at the same position as the multiple alignment display */
+ SetNextPosition(mainw,display_pos);
+ height=(max_plines+MARGIN)*stdLineHeight+2+SCOREHEIGHT;
+ if(height>n) height=n;
+ prf1_display=make_scroll_area(mainw,1,namewidth,seqwidth,height,1,profile1_nseqs,&prf_panel[0]);
+ position_scrollbars(prf_panel[0]);
+
+ prf2_display=make_scroll_area(mainw,2,namewidth,seqwidth,height,profile1_nseqs+1,nseqs-profile1_nseqs,&prf_panel[1]);
+ position_scrollbars(prf_panel[1]);
+
+/* add the message line */
+ Break(mainw);
+ Advance(mainw);
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ message = StaticPrompt(mainw, "",500, 0,systemFont,'l');
+
+/* show the displays for the current mode, load them, and
+   save some pixel sizes for future resizing events */
+ if(aln_mode==PROFILEM)
+ {
+ Hide(seq_display);
+ profile_no=1;
+ Show(prf1_display);
+ Show(prf2_display);
+ Show(pscrolltext);
+ active_panel=prf_panel[0];
+ Select(prf1_display);
+ load_aln(prf_panel[0],0,profile1_nseqs-1,TRUE);
+ load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
+
+ Show(mainw);
+ ObjectRect(mainw,&wr);
+ ObjectRect(prf_panel[0].names,&r);
+ ObjectRect(prf_panel[1].names,&r1);
+/* bottom offset is measured from the second profile's names panel */
+ boffset=wr.bottom-wr.top-r1.bottom;
+ loffset=r.left;
+ toffset=r.top;
+ ObjectRect(prf_panel[0].seqs,&r);
+ roffset=wr.right-wr.left-r.right;
+ }
+ else
+ {
+ Hide(prf1_display);
+ Hide(prf2_display);
+ Hide(pscrolltext);
+ profile_no=0;
+ Show(seq_display);
+ active_panel=seq_panel;
+
+ Select(seq_display);
+ load_aln(seq_panel,0,nseqs-1,TRUE);
+
+ Show(mainw);
+ ObjectRect(mainw,&wr);
+ ObjectRect(seq_panel.names,&r);
+ boffset=wr.bottom-wr.top-r.bottom;
+ loffset=r.left;
+ toffset=r.top;
+ ObjectRect(seq_panel.seqs,&r);
+ roffset=wr.right-wr.left-r.right;
+ }
+/* poffset: distance from the bottom of profile 1's names panel to the top of profile 2's */
+ ObjectRect(prf_panel[0].names,&r);
+ ObjectRect(prf_panel[1].names,&r1);
+ poffset=r1.top-r.bottom;
+
+/* initialise some variables before we display the window */
+/* default block length for PostScript output, by orientation and page size */
+ if(orientation==LANDSCAPE)
+ {
+ if(pagesize==A4) blocklen=150;
+ else if (pagesize==A3) blocklen=250;
+ else blocklen=150;
+ }
+ else
+ {
+ if(pagesize==A4) blocklen=80;
+ else if (pagesize==A3) blocklen=150;
+ else blocklen=150;
+ }
+
+/* ok - Go! */
+ window_displayed=TRUE;
+ ProcessEvents();
+
+}
+
+
+/*
+ * Close callback passed to FixedWindow() for the temporary dialog windows:
+ * simply removes the window.
+ */
+static void RemoveWin(WindoW w)
+{
+ Remove(w);
+}
+
+
+/*
+ * Close callback of the main window.
+ *
+ * Before quitting, the user is asked to confirm for every panel of the
+ * current alignment mode that is flagged as modified (the alignment in
+ * multiple mode, profile 1 and then profile 2 in profile mode).  Answering
+ * "no" to any question cancels the quit.
+ */
+static void QuitWinW(WindoW w)
+{
+    if (aln_mode == MULTIPLEM)
+    {
+        if (seq_panel.modified &&
+            Message(MSG_YN,"Alignment has not been saved.\n"
+                           "Quit program anyway?") == ANS_NO)
+            return;
+    }
+    else if (aln_mode == PROFILEM)
+    {
+        if (prf_panel[0].modified &&
+            Message(MSG_YN,"Profile 1 has not been saved.\n"
+                           "Quit program anyway?") == ANS_NO)
+            return;
+
+        if (prf_panel[1].modified &&
+            Message(MSG_YN,"Profile 2 has not been saved.\n"
+                           "Quit program anyway?") == ANS_NO)
+            return;
+    }
+
+    QuitProgram ();
+}
+
+/*
+ * Menu callback: open the "search in selected sequences" dialog.
+ *
+ * Nothing is opened, and a message is shown instead, when the active panel
+ * holds no sequences or when none of its sequences is selected.
+ * The dialog consists of a text field (findtext) and three buttons:
+ * search from start, search again, and close.
+ */
+static void SearchStrWin (IteM item)
+{
+    panel_data data;
+    Boolean sel=FALSE;
+    int i;
+
+    GetPanelExtra(active_panel.names,&data);
+    if (data.nseqs==0)
+    {
+        Message(MSG_OK,"No file loaded.");
+        return;
+    }
+
+    /* at least one sequence must be selected */
+    for (i=0;i<data.nseqs;i++)
+    {
+        if(data.selected[i]==TRUE)
+        {
+            sel=TRUE;
+            break;
+        }
+    }
+    if(sel==FALSE)
+    {
+        Message(MSG_OK,"Select sequences by clicking on the names.");
+        return;
+    }
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    findw=FixedWindow(-50, -33, -10, -10, "SEARCH IN SELECTED SEQUENCES",RemoveWin);
+    stdLineHeight=18;
+    SelectFont(programFont);
+    findtext=DialogText(findw, "", 35, NULL);
+    Break(findw);
+    /* the button handles are not needed afterwards, so they are not kept */
+    PushButton(findw, "SEARCH FROM START", SearchStr);
+    Break(findw);
+    PushButton(findw, "SEARCH AGAIN", SearchStringAgain);
+    Break(findw);
+    PushButton(findw, "CLOSE", CancelWin);
+
+    Show(findw);
+}
+
+/*
+ * Menu callback: open the PostScript output dialog for the multiple
+ * alignment, or report an error when no sequences are loaded.
+ */
+static void SavePSSeqWin (IteM item)
+{
+    if (!empty)
+        save_ps_window(0,"WRITE SEQUENCES TO:",SavePSSeqFile);
+    else
+        error("No file loaded");
+}
+
+/*
+ * Menu callback: open the PostScript output dialog for profile 1,
+ * or report an error when profile 1 is empty.
+ */
+static void SavePSPrf1Win (IteM item)
+{
+    if (!profile1_empty)
+        save_ps_window(1,"WRITE PROFILE 1 TO:",SavePSPrf1File);
+    else
+        error("No file loaded");
+}
+
+/*
+ * Menu callback: open the PostScript output dialog for profile 2,
+ * or report an error when profile 2 is empty.
+ */
+static void SavePSPrf2Win (IteM item)
+{
+    if (!profile2_empty)
+        save_ps_window(2,"WRITE PROFILE 2 TO:",SavePSPrf2File);
+    else
+        error("No file loaded");
+}
+
+/*
+ * Build and show the "WRITE POSTSCRIPT FILE" dialog.
+ *
+ * prf_no     0 = multiple alignment, 1 = profile 1, any other value =
+ *            profile 2.  It selects both the panel whose column count gives
+ *            the default print range and the input file name from which the
+ *            default output name is derived.
+ * prompt     label shown in front of the output file name field.
+ * save_proc  callback attached to the OK button.
+ *
+ * Side effects: sets savepsw, savepstext, pspartext and blocklentext, and
+ * resets first_printres to 1 and last_printres to the panel's column count.
+ */
+static void save_ps_window(int prf_no,char *prompt,void save_proc(ButtoN but))
+{
+    GrouP savegr;
+    PopuP size_popup,orient_popup;
+    char path[FILENAMELEN];
+    char str[FILENAMELEN];
+    panel_data data;
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    savepsw=FixedWindow(-50, -33, -10, -10, "WRITE POSTSCRIPT FILE",RemoveWin);
+    make_prompt(savepsw, prompt);
+    stdLineHeight=18;
+    SelectFont(programFont);
+    savepstext=DialogText(savepsw, "", 35, NULL);
+    Break(savepsw);
+    make_prompt(savepsw, "PS Colors File :");
+    pspartext=DialogText(savepsw, ps_par_file, 35, NULL);
+    Break(savepsw);
+
+    /* page size popup, preset to the current pagesize */
+    make_prompt(savepsw, "Page Size");
+    Advance(savepsw);
+    size_popup=PopupList(savepsw,TRUE,set_pagesize);
+    PopupItem(size_popup,"A4");
+    PopupItem(size_popup,"A3");
+    PopupItem(size_popup,"US Letter");
+    if (pagesize == A4)
+        SetValue(size_popup,1);
+    else if (pagesize == A3)
+        SetValue(size_popup,2);
+    else if (pagesize == USLETTER)
+        SetValue(size_popup,3);
+    Break(savepsw);
+
+    /* orientation popup, preset to the current orientation */
+    make_prompt(savepsw, "Orientation");
+    Advance(savepsw);
+    orient_popup=PopupList(savepsw,TRUE,set_orientation);
+    PopupItem(orient_popup,"LANDSCAPE");
+    PopupItem(orient_popup,"PORTRAIT");
+    if (orientation == LANDSCAPE)
+        SetValue(orient_popup,1);
+    else if (orientation == PORTRAIT)
+        SetValue(orient_popup,2);
+    Break(savepsw);
+
+    make_toggle(savepsw,"Print Header :","YES","NO",&ps_header,set_header);
+    Advance(savepsw);
+    make_toggle(savepsw,"Print Quality Curve :","YES","NO",&ps_curve,set_curve);
+    Break(savepsw);
+    make_toggle(savepsw,"Print Ruler :","YES","NO",&ps_ruler,set_ruler);
+    Advance(savepsw);
+    make_toggle(savepsw,"Print Residue Numbers :","YES","NO",&ps_resno,set_resno);
+    Break(savepsw);
+    make_toggle(savepsw,"Resize to fit page:","YES","NO",&resize,set_resize);
+    Break(savepsw);
+
+    /* default print range: the whole alignment of the chosen panel */
+    first_printres=1;
+    if (prf_no==0)
+        GetPanelExtra(seq_panel.seqs,&data);
+    else if (prf_no==1)
+        GetPanelExtra(prf_panel[0].seqs,&data);
+    else
+        GetPanelExtra(prf_panel[1].seqs,&data);
+    last_printres=data.ncols;
+
+    make_prompt(savepsw, "Print from position :");
+    Advance(savepsw);
+    sprintf(str,"%5d",first_printres);
+    DialogText(savepsw, str, 5,set_fpres);
+    Advance(savepsw);
+    make_prompt(savepsw, "to :");
+    Advance(savepsw);
+    sprintf(str,"%5d",last_printres);
+    DialogText(savepsw, str, 5,set_lpres);
+    Break(savepsw);
+    make_prompt(savepsw, "Use block length :");
+    Advance(savepsw);
+    sprintf(str,"%5d",blocklen);
+    blocklentext=DialogText(savepsw, str, 5,set_blocklen);
+    Break(savepsw);
+
+    savegr=HiddenGroup(savepsw, 2, 0, NULL);
+    shift(savegr, 60, 20);
+    PushButton(savegr, " OK ", save_proc);
+    shift(savegr, 20,0);
+    PushButton(savegr, "CLOSE", CancelWin);
+
+    /* Default output name: get_path() result for the input file with "ps"
+       appended.  The last branch is a plain else, matching the panel
+       selection above, so that path is initialised for every prf_no before
+       strcat() touches it. */
+    if(prf_no==0)
+        get_path(seqname,path);
+    else if(prf_no==1)
+        get_path(profile1_name,path);
+    else
+        get_path(profile2_name,path);
+    strcat(path,"ps");
+    SetTitle(savepstext, path);
+    Show(savepsw);
+}
+
+static void SaveScoresWin (IteM item)
+{
+    /* Menu callback: open the "SAVE QUALITY SCORES" dialog.  Requires a
+     * loaded alignment and at least one selected sequence name in the
+     * active panel; the file name field is preset from seqname. */
+    int idx;
+    Boolean any_selected=FALSE;
+    GrouP button_row;
+    ButtoN cancel_btn,ok_btn;
+    char default_name[FILENAMELEN];
+    panel_data names_info;
+
+    if (empty)
+    {
+        error("No file loaded");
+        return;
+    }
+
+    /* look for the first selected name */
+    GetPanelExtra(active_panel.names,&names_info);
+    idx=0;
+    while (idx<names_info.nseqs && any_selected==FALSE)
+    {
+        if (names_info.selected[idx]==TRUE)
+            any_selected=TRUE;
+        idx++;
+    }
+    if (any_selected==FALSE)
+    {
+        Message(MSG_OK,"Select sequences to be written by clicking on the names.");
+        return;
+    }
+
+    get_path(seqname,default_name);
+    strcat(default_name,"qscores");
+
+    /* widgets are placed in creation order */
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    savescoresw=FixedWindow(-50, -33, -10, -10, "SAVE QUALITY SCORES",RemoveWin);
+    stdLineHeight=18;
+    SelectFont(programFont);
+    make_prompt(savescoresw, "SAVE QUALITY SCORES TO:");
+    stdLineHeight=18;
+    SelectFont(programFont);
+    Break(savescoresw);
+    savescorestext=DialogText(savescoresw, "", 35, NULL);
+    Break(savescoresw);
+    button_row=HiddenGroup(savescoresw, 2, 0, NULL);
+    shift(button_row, 60, 20);
+    ok_btn=PushButton(button_row, " OK ", SaveScores);
+    shift(button_row, 20,0);
+    cancel_btn=PushButton(button_row, "CANCEL", CancelWin);
+
+    SetTitle(savescorestext, default_name);
+    Show(savescoresw);
+
+    Advance(savescoresw);
+    Show(savescoresw);
+}
+
+static void SaveScores(ButtoN but)
+{
+    /* OK-button callback of the "SAVE QUALITY SCORES" dialog.
+     *
+     * Writes one line per alignment column that is not all-gap among the
+     * selected sequences: the residue (or '-') of each selected sequence
+     * followed by a tab and the column score taken from the active
+     * sequence panel.  The dialog is removed afterwards.
+     *
+     * Fixes over the previous version:
+     *  - bail out when the output file cannot be opened instead of
+     *    writing through a NULL stream;
+     *  - test j against seqlen_array[i] BEFORE indexing seq_array[i][j],
+     *    so shorter sequences are never read past their end.
+     */
+    char c;
+    int i,j,val;
+    int length=0;
+    FILE *outfile;
+    panel_data name_data,seq_data;
+    Boolean gap;
+
+    GetPanelExtra(active_panel.names,&name_data);
+    GetPanelExtra(active_panel.seqs,&seq_data);
+
+    GetTitle(savescorestext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    outfile=open_explicit_file(filename);
+    if (outfile==NULL)
+    {
+        /* NOTE(review): open_explicit_file() presumably reports the
+         * failure itself (open_aln_files() relies on that) - confirm.
+         * The dialog is left open so the name can be corrected. */
+        return;
+    }
+
+/* get the maximum length of the selected sequences */
+    for (i=1;i<=nseqs;i++)
+        if (name_data.selected[i-1]==TRUE && length < seqlen_array[i]) length = seqlen_array[i];
+
+    for(j=1;j<=length;j++)
+    {
+/* first check for a column of gaps */
+        gap=TRUE;
+        for (i=1;i<=nseqs;i++)
+        {
+            if (name_data.selected[i-1]!=TRUE) continue;
+            if (j>seqlen_array[i]) continue;  /* past the end counts as a gap */
+            val = seq_array[i][j];
+            if ((val != gap_pos1) && (val != gap_pos2))
+            {
+                gap=FALSE;
+                break;
+            }
+        }
+        if (gap==TRUE) continue;
+
+        for (i=1;i<=nseqs;i++)
+        {
+            if (name_data.selected[i-1]!=TRUE) continue;
+            if (j>seqlen_array[i])
+                c = '-';
+            else
+            {
+                val = seq_array[i][j];
+                if ((val == gap_pos1) || (val == gap_pos2))
+                    c = '-';
+                else
+                    c = amino_acid_codes[val];
+            }
+            fprintf(outfile,"%c ",c);
+        }
+        fprintf(outfile,"\t%3d\n",seq_data.colscore[j-1]);
+    }
+    fclose(outfile);
+
+    if (Visible(savescoresw))
+    {
+        Remove(savescoresw);
+        savescoresw=NULL;
+    }
+
+    info("File %s saved",filename);
+}
+
+static void SaveSeqFileWin (IteM item)
+{
+    /* Menu callback: open the alignment save dialog for the main
+     * sequence panel, or complain when nothing has been loaded. */
+    if (!empty)
+        save_aln_window(0,"SAVE SEQUENCES","SAVE SEQUENCES AS:",SaveSeqFile);
+    else
+        error("No file loaded");
+}
+
+static void SavePrf1FileWin (IteM item)
+{
+    /* Menu callback: open the alignment save dialog for profile 1,
+     * or complain when profile 1 is empty. */
+    if (!profile1_empty)
+        save_aln_window(1,"SAVE PROFILE","SAVE PROFILE 1 AS:",SavePrf1File);
+    else
+        error("No file loaded");
+}
+static void SavePrf2FileWin (IteM item)
+{
+    /* Menu callback: open the alignment save dialog for profile 2,
+     * or complain when profile 2 is empty. */
+    if (!profile2_empty)
+        save_aln_window(2,"SAVE PROFILE","SAVE PROFILE 2 AS:",SavePrf2File);
+    else
+        error("No file loaded");
+}
+
+static void save_aln_window(int prf_no,char *title,char *prompt,void save_proc(ButtoN but))
+{ /* Build and show the alignment save dialog.
+   *
+   * prf_no    - selects the name the default path is derived from:
+   *             0 seqname, 1 profile1_name, 2 profile2_name.
+   * title     - window title.
+   * prompt    - label shown above the file name field.
+   * save_proc - callback attached to the OK button.
+   *
+   * The radio group is preset from save_format and the matching
+   * extension is appended to the default path.  Widgets are laid out in
+   * creation order, so the sequence of calls is significant.
+   */
+ GrouP savegr;
+ ButtoN save_ok, save_can;
+ GrouP maing;
+ GrouP format_list;
+ ButtoN formatb[6+1]; /* + 1 for fasta */
+ PopuP case_toggle,snos_toggle;
+ PopuP seqRange_toggle; /* Ramu */
+ char path[FILENAMELEN+1];
+ char str[FILENAMELEN+1];
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+
+ savealnw=FixedWindow(-50, -33, -10, -10, title,RemoveWin);
+
+ format_list=NormalGroup(savealnw,3,0,"Format",systemFont,set_format);
+ formatb[0]=RadioButton(format_list,"CLUSTAL");
+ formatb[1]=RadioButton(format_list,"NBRF/PIR");
+ formatb[2]=RadioButton(format_list,"GCG/MSF");
+ formatb[3]=RadioButton(format_list,"PHYLIP");
+ formatb[4]=RadioButton(format_list,"GDE");
+ formatb[5]=RadioButton(format_list,"NEXUS");
+ formatb[6]=RadioButton(format_list,"FASTA");
+
+ if(prf_no==0) /* NOTE(review): path stays uninitialised for any other prf_no; visible callers pass 0, 1 or 2 */
+ get_path(seqname,path);
+ else if(prf_no==1)
+ get_path(profile1_name,path);
+ else if(prf_no==2)
+ get_path(profile2_name,path);
+
+ if (save_format==CLUSTAL) /* radio values are 1-based, in the order the buttons were created */
+ {
+ SetValue(format_list,1);
+ strcat(path,"aln");
+ }
+ else if (save_format==PIR)
+ {
+ SetValue(format_list,2);
+ strcat(path,"pir");
+ }
+ else if (save_format==MSF)
+ {
+ SetValue(format_list,3);
+ strcat(path,"msf");
+ }
+ else if (save_format==PHYLIP)
+ {
+ SetValue(format_list,4);
+ strcat(path,"phy");
+ }
+ else if (save_format==GDE)
+ {
+ SetValue(format_list,5);
+ strcat(path,"gde");
+ }
+ else if (save_format==NEXUS)
+ {
+ SetValue(format_list,6);
+ strcat(path,"nxs");
+ }
+ else if (save_format==FASTA)
+ {
+ SetValue(format_list,7);
+ strcat(path,"fasta");
+ }
+
+ maing=HiddenGroup(savealnw,0,0,NULL);
+ SetGroupSpacing(maing,0,10);
+
+ case_toggle=make_toggle(maing,"GDE output case :","Lower","Upper",&lowercase,set_case);
+ Break(maing);
+ snos_toggle=make_toggle(maing,"CLUSTALW sequence numbers :","ON","OFF",&cl_seq_numbers,set_snos);
+
+ Break(maing);
+ make_prompt(maing, "Save range from :");
+ Advance(maing);
+ firstres = 0; /* init always ramu */
+ lastres = 0; /* init always ramu */
+ sprintf(str,"%5d",firstres);
+ DialogText(maing, str, 5,set_fres);
+ Advance(maing);
+ make_prompt(maing, "to :");
+ Advance(maing);
+ sprintf(str,"%5d",lastres);
+ DialogText(maing, str, 5,set_lres);
+ /* <Ramu> */
+ Advance(maing);
+ seqRange_toggle=make_toggle(maing," and include range numbers :","ON","OFF",&seqRange,setRange);
+ /*</Ramu>*/
+
+ Break(maing);
+ shift(savealnw, 0, 20);
+ make_prompt(savealnw, prompt);
+ stdLineHeight=18;
+ SelectFont(programFont);
+ Break(savealnw);
+ savealntext=DialogText(savealnw, "", 35, NULL); /* filled in with the default path just before Show() */
+ Break(savealnw);
+ savegr=HiddenGroup(savealnw, 2, 0, NULL);
+ shift(savegr, 60, 20);
+ save_ok=PushButton(savegr, " OK ", save_proc);
+ shift(savegr, 20,0);
+ save_can=PushButton(savegr, "CANCEL", CancelWin);
+
+ SetTitle(savealntext, path);
+ Show(savealnw);
+
+}
+
+static void read_file_window(char *title,char *prompt,char *filename,void read_proc(ButtoN but))
+{
+    /* Build and show a small "enter a file name" dialog.
+     *
+     * title     - window title.
+     * prompt    - label shown above the text field.
+     * filename  - initial content of the text field; may be NULL, in
+     *             which case the field starts empty.
+     * read_proc - callback attached to the OK button.
+     *
+     * Widgets are placed in creation order.
+     */
+    GrouP button_row;
+    ButtoN ok_btn, cancel_btn;
+    GrouP spacer;
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    readfilew=FixedWindow(-50, -33, -10, -10, title,RemoveWin);
+
+    spacer=HiddenGroup(readfilew,2,0,NULL);
+    SetGroupSpacing(spacer,0,10);
+
+    shift(readfilew, 0, 20);
+    make_prompt(readfilew, prompt);
+    stdLineHeight=18;
+    SelectFont(programFont);
+    Break(readfilew);
+    readfiletext=DialogText(readfilew, "", 35, NULL);
+    if (filename != NULL)
+    {
+        SetTitle(readfiletext, filename);
+    }
+    Break(readfilew);
+
+    button_row=HiddenGroup(readfilew, 2, 0, NULL);
+    shift(button_row, 60, 20);
+    ok_btn=PushButton(button_row, " OK ", read_proc);
+    shift(button_row, 20,0);
+    cancel_btn=PushButton(button_row, "CANCEL", CancelWin);
+
+    Show(readfilew);
+}
+
+static void CancelWin (ButtoN but)
+{
+    /* Shared CANCEL/CLOSE callback: remove the window that owns the
+     * button that was pressed. */
+    Remove( ParentWindow(but) );
+}
+
+static void SearchStr(ButtoN but)
+{
+    /* Start a fresh search: rewind the saved position to just before the
+     * first residue of the first sequence, then run the ordinary
+     * "find next" search. */
+    find_pos.res=-1;
+    find_pos.seq=0;
+
+    SearchStringAgain(but);
+}
+
+static void SearchStringAgain(ButtoN but)
+{
+    /* Find the next occurrence of the search text after find_pos.
+     *
+     * The text is read from the find dialog, upper-cased, and matched
+     * against the selected sequences of the active panel.  '-' characters
+     * in a sequence are skipped while advancing, so a match may span
+     * gaps.  On success find_pos is updated and the hit is reported; an
+     * empty search text returns silently.
+     *
+     * Fixes over the previous version:
+     *  - find_string is always NUL-terminated after strncpy();
+     *  - toupper() receives an unsigned char value;
+     *  - the partial-match state is reset when moving on to the next
+     *    sequence, so a match can no longer begin at the end of one
+     *    sequence and finish at the start of the next.
+     */
+    int i,ix,length;
+    int seq,res,start_res;
+    Boolean in_string,found;
+    panel_data ndata,sdata;
+
+    GetTitle(findtext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    strncpy(find_string,filename,MAXFINDSTR);
+    /* strncpy does not terminate on truncation.
+     * NOTE(review): writes inside the first MAXFINDSTR bytes, which is
+     * safe whether find_string holds MAXFINDSTR or MAXFINDSTR+1 chars. */
+    find_string[MAXFINDSTR-1]='\0';
+    length=strlen(find_string);
+    if(length==0) return;
+    for(i=0;i<length;i++)
+        find_string[i]=(char)toupper((unsigned char)find_string[i]);
+
+    GetPanelExtra(active_panel.names,&ndata);
+    GetPanelExtra(active_panel.seqs,&sdata);
+
+    in_string=FALSE;
+    found=FALSE;
+    start_res=0;
+    ix=0;
+    seq=find_pos.seq;
+    res=find_pos.res+1;
+    while (seq<ndata.nseqs)
+    {
+        if(ndata.selected[seq]==TRUE)
+        {
+            while (res<sdata.ncols)
+            {
+                if(sdata.lines[seq][res]==find_string[ix])
+                {
+                    if(in_string==FALSE)
+                        start_res=res;
+                    ix++;
+                    in_string=TRUE;
+                }
+                else if(in_string==TRUE)
+                {
+                    /* mismatch inside a candidate: retry one past its start */
+                    res=start_res;
+                    ix=0;
+                    in_string=FALSE;
+                }
+                if(ix==length)
+                {
+                    find_pos.seq=seq;
+                    find_pos.res=start_res;
+                    found=TRUE;
+                    break;
+                }
+                res++;
+                while(res<sdata.ncols && sdata.lines[seq][res]=='-')
+                    res++;
+            }
+        }
+        if(found) break;
+        seq++;
+        res=0;
+        /* a candidate never continues into the next sequence */
+        ix=0;
+        in_string=FALSE;
+    }
+
+    if(found==FALSE)
+        info("String %s not found",find_string);
+    else
+    {
+        info("String %s in sequence %s, column %d",find_string,names[find_pos.seq+1],find_pos.res+1);
+    }
+}
+
+static void SavePSSeqFile(ButtoN but)
+{
+    /* OK-button callback of the PostScript dialog for the main sequence
+     * panel: read the output and colour-file names from the dialog,
+     * remember the colour-file name in ps_par_file, and write the
+     * PostScript file with the current print settings. */
+    char *out_name;
+
+    /* output file name */
+    GetTitle(savepstext, filename, FILENAMELEN);
+    stripspace(filename);
+    out_name=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(out_name,filename);
+
+    /* colour parameter file name */
+    GetTitle(pspartext, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(ps_par_file,filename);
+
+    write_ps_file(seq_panel,out_name,ps_par_file,pagesize,orientation,
+                  ps_header,ps_ruler,ps_resno,
+                  resize,first_printres,last_printres,blocklen,ps_curve);
+
+    info("Postscript file %s written",out_name);
+    ckfree(out_name);
+}
+
+static void SavePSPrf1File(ButtoN but)
+{
+    /* OK-button callback of the PostScript dialog for profile 1: read
+     * the output and colour-file names from the dialog and write the
+     * PostScript file for prf_panel[0] with the current print settings.
+     *
+     * Fix: the locally allocated colour-file name buffer is now released;
+     * it used to leak on every call. */
+    char *ps_file;
+    char *ps_par_file;  /* local copy of the colour-file name */
+
+    GetTitle(savepstext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    ps_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(ps_file,filename);
+
+    GetTitle(pspartext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    ps_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(ps_par_file,filename);
+
+    write_ps_file(prf_panel[0],ps_file,ps_par_file,pagesize,orientation,
+                  ps_header,ps_ruler,ps_resno,
+                  resize,first_printres,last_printres,blocklen,ps_curve);
+
+    info("Postscript file %s written",ps_file);
+    ckfree(ps_par_file);
+    ckfree(ps_file);
+}
+
+static void SavePSPrf2File(ButtoN but)
+{
+    /* OK-button callback of the PostScript dialog for profile 2: read
+     * the output and colour-file names from the dialog and write the
+     * PostScript file for prf_panel[1] with the current print settings.
+     *
+     * Fix: the locally allocated colour-file name buffer is now released;
+     * it used to leak on every call. */
+    char *ps_file;
+    char *ps_par_file;  /* local copy of the colour-file name */
+
+    GetTitle(savepstext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    ps_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(ps_file,filename);
+
+    GetTitle(pspartext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    ps_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(ps_par_file,filename);
+
+    write_ps_file(prf_panel[1],ps_file,ps_par_file,pagesize,orientation,
+                  ps_header,ps_ruler,ps_resno,
+                  resize,first_printres,last_printres,blocklen,ps_curve);
+
+    info("Postscript file %s written",ps_file);
+    ckfree(ps_par_file);
+    ckfree(ps_file);
+}
+
+static void SaveSeqFile(ButtoN but)
+{
+    /* OK-button callback: write sequences 1..nseqs over the residue
+     * range chosen in the dialog, then mark the sequence panel as
+     * unmodified and report the file name. */
+    write_file( 1, nseqs, firstres, lastres );
+    seq_panel.modified = FALSE;
+    info( "File %s saved", filename );
+}
+
+static void SavePrf1File(ButtoN but)
+{
+    /* OK-button callback: write sequences 1..profile1_nseqs (profile 1)
+     * over the chosen residue range, then mark the first profile panel
+     * as unmodified and report the file name. */
+    write_file( 1, profile1_nseqs, firstres, lastres );
+    prf_panel[0].modified = FALSE;
+    info( "File %s saved", filename );
+}
+
+static void SavePrf2File(ButtoN but)
+{
+    /* OK-button callback: write sequences profile1_nseqs+1..nseqs
+     * (profile 2) over the chosen residue range, then mark the second
+     * profile panel as unmodified and report the file name. */
+    write_file( profile1_nseqs+1, nseqs, firstres, lastres );
+    prf_panel[1].modified = FALSE;
+    info( "File %s saved", filename );
+}
+
+/* this is equivalent to open_alignment_output(), but uses the window
+interface to input file names */
+
+static Boolean open_aln_files(void)
+{
+    /* Open every alignment output file whose format flag is set, taking
+     * the file names from the corresponding dialog text fields, and
+     * optionally open the log file in append mode.
+     *
+     * Returns FALSE when no output format is selected or when one of the
+     * output files cannot be opened; TRUE otherwise.  A log file that
+     * cannot be opened is reported but does not fail the call.
+     *
+     * Fix: FASTA now counts as a selected format.  The guard used to
+     * ignore output_fasta, so choosing FASTA alone was rejected even
+     * though the FASTA file is opened further down.
+     */
+    char path[FILENAMELEN];
+
+    if(!output_clustal && !output_nbrf && !output_gcg &&
+       !output_phylip && !output_gde && !output_nexus && !output_fasta) {
+        error("You must select an alignment output format");
+        return FALSE;
+    }
+
+    if(output_clustal) {
+        GetTitle(cl_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((clustal_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+    if(output_nbrf) {
+        GetTitle(pir_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((nbrf_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+    if(output_gcg) {
+        GetTitle(msf_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((gcg_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+    if(output_phylip) {
+        GetTitle(phylip_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((phylip_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+    if(output_gde) {
+        GetTitle(gde_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((gde_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+    if(output_nexus) {
+        GetTitle(nexus_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((nexus_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+
+/* <Ramu> */
+    if(output_fasta) {
+        GetTitle(fasta_outtext,filename,FILENAMELEN);
+        stripspace(filename);
+        if((fasta_outfile = open_explicit_file(
+            filename))==NULL) return FALSE;
+    }
+/* </Ramu> */
+    if(save_log)
+    {
+        /* the log file name is derived from the sequence file name */
+        get_path(seqname,path);
+        strcpy(save_log_filename,path);
+        strcat(save_log_filename,"log");
+        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
+            error("Cannot open log file %s",save_log_filename);
+    }
+
+    return TRUE;
+}
+
+static void write_file(int fseq, int lseq, int fres, int lres)
+{
+    /* Write sequences fseq..lseq, residues fres..lres, to the file named
+     * in the save dialog, in the format selected by save_format, then
+     * remove the dialog.
+     *
+     * fres < 1 means "from the first residue"; lres < 1 means "up to the
+     * longest sequence in the range".
+     *
+     * Fixes over the previous version:
+     *  - return early when the output file cannot be opened instead of
+     *    handing a NULL stream to the writers and fclose();
+     *  - close the stream when save_format matches none of the known
+     *    formats (it used to be leaked).
+     */
+    int i,length=0;
+    FILE *outfile;
+
+    GetTitle(savealntext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    outfile=open_explicit_file(filename);
+    if (outfile==NULL)
+    {
+        /* NOTE(review): open_explicit_file() presumably reports the
+         * failure itself - confirm.  The dialog stays open so the name
+         * can be corrected. */
+        return;
+    }
+
+    for (i=fseq;i<=lseq;i++)
+        if (length < seqlen_array[i]) length = seqlen_array[i];
+
+    if(fres<1) fres=1;
+    if(lres<1) lres=length;
+    length=lres-fres+1;
+
+    if(save_format==CLUSTAL) {
+        clustal_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("CLUSTAL format file created [%s]",filename);
+    }
+    else if(save_format==PIR) {
+        nbrf_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("NBRF/PIR format file created [%s]",filename);
+    }
+    else if(save_format==MSF) {
+        gcg_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("GCG/MSF format file created [%s]",filename);
+    }
+    else if(save_format==PHYLIP) {
+        phylip_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("PHYLIP format file created [%s]",filename);
+    }
+    else if(save_format==GDE) {
+        gde_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("GDE format file created [%s]",filename);
+    }
+    else if(save_format==NEXUS) {
+        nexus_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("NEXUS format file created [%s]",filename);
+    }
+/* <Ramu> */
+    else if(save_format==FASTA) {
+        fasta_out(outfile, fres, length, fseq, lseq);
+        fclose(outfile);
+        info("FASTA format file created [%s]",filename);
+    }
+/* </Ramu> */
+    else {
+        /* unknown format: nothing written, but do not leak the stream */
+        fclose(outfile);
+    }
+
+    if (Visible(savealnw))
+    {
+        Remove(savealnw);
+        savealnw=NULL;
+    }
+}
+
+static void SaveTreeWin (IteM item)
+{
+    /* Menu callback: open the "CREATE TREE" dialog asking where the tree
+     * should be saved.  Needs a loaded alignment with at least two
+     * sequences; the file name field is preset from seqname plus "dnd". */
+    GrouP button_row;
+    ButtoN ok_btn, cancel_btn;
+    char default_name[FILENAMELEN];
+
+    if (empty)
+    {
+        error("No file loaded");
+        return;
+    }
+    if (nseqs < 2)
+    {
+        error("Alignment has only %d sequences",nseqs);
+        return;
+    }
+
+    /* widgets are placed in creation order */
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    savetreew=FixedWindow(-50, -33, -10, -10, "CREATE TREE",RemoveWin);
+    shift(savetreew, 0, 20);
+    make_prompt(savetreew, "SAVE TREE AS :");
+    Advance(savetreew);
+    shift(savetreew, 0, -10);
+    stdLineHeight=18;
+    SelectFont(programFont);
+    savetreetext=DialogText(savetreew, "", 35, NULL);
+    SelectFont(systemFont);
+    stdLineHeight=15;
+    Break(savetreew);
+
+    button_row=HiddenGroup(savetreew, 2, 0, NULL);
+    shift(button_row, 140, 20);
+    ok_btn=PushButton(button_row, " OK ", CreateAlignTree);
+    shift(button_row, 20, 0);
+    cancel_btn=PushButton(button_row, "CANCEL", CancelWin);
+
+    get_path(seqname,default_name);
+    strcat(default_name,"dnd");
+
+    SetTitle(savetreetext, default_name);
+    Show(savetreew);
+}
+
+static void DrawTreeWin (IteM item)
+{ /* Menu callback: open the "DRAW TREE" dialog.
+   *
+   * Needs a loaded alignment with at least two sequences.  One file
+   * name field is created for each enabled output (output_tree_clustal,
+   * output_tree_phylip, output_tree_distances, output_tree_nexus,
+   * output_pim), preset to the path derived from seqname plus an
+   * output-specific suffix.  Widgets are laid out in creation order.
+   */
+ GrouP drawgr;
+ GrouP output_list;
+ ButtoN draw_ok, draw_can;
+ char path[FILENAMELEN];
+ char name[FILENAMELEN];
+
+ if (empty)
+ {
+ error("No file loaded");
+ return;
+ }
+ if (nseqs < 2)
+ {
+ error("Alignment has only %d sequences",nseqs);
+ return;
+ }
+
+ get_path(seqname,path); /* common prefix for all default names below */
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ drawtreew=FixedWindow(-50, -33, -10, -10, "DRAW TREE",RemoveWin);
+ output_list=HiddenGroup(drawtreew, 2, 0, NULL);
+ if (output_tree_clustal)
+ {
+ make_prompt(output_list, "SAVE CLUSTAL TREE AS :");
+ drawnjtreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"nj");
+ SetTitle(drawnjtreetext, name);
+ Break(output_list);
+ }
+ if (output_tree_phylip)
+ {
+ make_prompt(output_list, "SAVE PHYLIP TREE AS :");
+ drawphtreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"ph");
+ SetTitle(drawphtreetext, name);
+ Break(output_list);
+ }
+ if (output_tree_distances)
+ {
+ make_prompt(output_list, "SAVE DISTANCE MATRIX AS :");
+ drawdsttreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"dst");
+ SetTitle(drawdsttreetext, name);
+ Break(output_list);
+ }
+ if (output_tree_nexus)
+ {
+ make_prompt(output_list, "SAVE NEXUS TREE AS :");
+ drawnxstreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"tre");
+ SetTitle(drawnxstreetext, name);
+ Break(output_list);
+ }
+
+ if (output_pim)
+ {
+ make_prompt(output_list, "SAVE % IDENTITY MATRIX AS :");
+ drawpimtext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"pim");
+ SetTitle(drawpimtext, name);
+ Break(output_list);
+ }
+
+ SelectFont(systemFont);
+ stdLineHeight=15;
+ Break(drawtreew);
+ drawgr=HiddenGroup(drawtreew, 2, 0, NULL);
+ shift(drawgr, 140, 20);
+ draw_ok=PushButton(drawgr, " OK ", DrawTree);
+ shift(drawgr, 20, 0);
+ draw_can=PushButton(drawgr, "CANCEL", CancelWin);
+
+ Show(drawtreew);
+}
+
+static void BootstrapTreeWin (IteM item)
+{ /* Menu callback: open the "BOOTSTRAP TREE" dialog.
+   *
+   * Needs a loaded alignment with at least two sequences.  Shows the
+   * current random seed and trial count in editable fields, plus one
+   * file name field for each enabled tree output (output_tree_clustal,
+   * output_tree_phylip, output_tree_nexus), preset to the path derived
+   * from seqname plus an output-specific suffix.  Widgets are laid out
+   * in creation order.
+   */
+ GrouP bootgr;
+ ButtoN boot_ok, boot_can;
+ TexT seed,ntrials;
+ char name[FILENAMELEN];
+ char path[FILENAMELEN];
+ char str[FILENAMELEN];
+ GrouP output_list;
+
+ if (empty)
+ {
+ error("No file loaded");
+ return;
+ }
+ if (nseqs < 2)
+ {
+ error("Alignment has only %d sequences",nseqs);
+ return;
+ }
+
+ get_path(seqname,path); /* common prefix for all default names below */
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ boottreew=FixedWindow(-50, -33, -10, -10, "BOOTSTRAP TREE",RemoveWin);
+ make_prompt(boottreew, "Random number generator seed [1-1000] :");
+ Advance(boottreew);
+ sprintf(str,"%4d",boot_ran_seed);
+ seed=DialogText(boottreew, str, 4,set_ran_seed);
+ Break(boottreew);
+ make_prompt(boottreew, "Number of bootstrap trials [1-10000] :");
+ Advance(boottreew);
+ sprintf(str,"%5d",boot_ntrials);
+ ntrials=DialogText(boottreew, str, 5,set_ntrials);
+ Break(boottreew);
+
+ output_list=HiddenGroup(boottreew, 2, 0, NULL);
+ if (output_tree_clustal)
+ {
+ make_prompt(output_list, "SAVE CLUSTAL TREE AS :");
+ bootnjtreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"njb");
+ SetTitle(bootnjtreetext, name);
+ Break(output_list);
+ }
+ if (output_tree_phylip)
+ {
+ make_prompt(output_list, "SAVE PHYLIP TREE AS :");
+ bootphtreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"phb");
+ SetTitle(bootphtreetext, name);
+ Break(output_list);
+ }
+ if (output_tree_nexus)
+ {
+ make_prompt(output_list, "SAVE NEXUS TREE AS :");
+ bootnxstreetext=DialogText(output_list, "", 35, NULL);
+ strcpy(name,path);
+ strcat(name,"treb");
+ SetTitle(bootnxstreetext, name);
+ Break(output_list);
+ }
+ SelectFont(systemFont);
+ stdLineHeight=15;
+ Break(boottreew);
+ bootgr=HiddenGroup(boottreew, 2, 0, NULL);
+ shift(bootgr, 140, 20);
+ boot_ok=PushButton(bootgr, " OK ", BootstrapTree);
+ shift(bootgr, 20, 0);
+ boot_can=PushButton(bootgr, "CANCEL", CancelWin);
+
+
+ Show(boottreew);
+}
+
+static void CreateAlignTree(ButtoN but)
+{
+    /* OK-button callback of the "CREATE TREE" dialog: read the tree file
+     * name, optionally open the log file, remove the dialog and build the
+     * tree with make_tree().
+     *
+     * Fix: the name is now stripped of spaces BEFORE it is copied into
+     * phylip_name.  Previously make_tree() received the unstripped text
+     * while the final message showed the stripped one; the sibling
+     * callbacks (DrawTree, BootstrapTree) already strip first.
+     */
+    char path[FILENAMELEN];
+    char phylip_name[FILENAMELEN];
+
+    GetTitle(savetreetext, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(phylip_name,filename);
+
+    info("Doing pairwise alignments...");
+    if(save_log)
+    {
+        /* the log file name is derived from the sequence file name */
+        get_path(seqname,path);
+        strcpy(save_log_filename,path);
+        strcat(save_log_filename,"log");
+        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
+            error("Cannot open log file %s",save_log_filename);
+    }
+
+    WatchCursor();
+    if (Visible(savetreew))
+    {
+        Remove(savetreew);
+        savetreew=NULL;
+    }
+    make_tree(phylip_name);
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    ArrowCursor();
+    info("Tree %s created",filename);
+}
+
+static void DrawTree(ButtoN but)
+{
+    /* OK-button callback of the "DRAW TREE" dialog: collect the file name
+     * for every enabled output, optionally open the log file, remove the
+     * dialog and call phylogenetic_tree().
+     *
+     * Fix: all name buffers start out as empty strings.  Buffers belonging
+     * to disabled outputs used to be handed to phylogenetic_tree()
+     * uninitialised.
+     */
+    char path[FILENAMELEN];
+    char phylip_name[FILENAMELEN]="";
+    char clustal_name[FILENAMELEN]="";
+    char dist_name[FILENAMELEN]="";
+    char nexus_name[FILENAMELEN]="";
+    char pim_name[FILENAMELEN]="";
+
+    if(output_tree_clustal)
+    {
+        GetTitle(drawnjtreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(clustal_name,filename);
+    }
+    if(output_tree_phylip)
+    {
+        GetTitle(drawphtreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(phylip_name,filename);
+    }
+    if(output_tree_distances)
+    {
+        GetTitle(drawdsttreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(dist_name,filename);
+    }
+    if(output_tree_nexus)
+    {
+        GetTitle(drawnxstreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(nexus_name,filename);
+    }
+
+    if(output_pim) /* if this is absent, no file gets created ??? */
+    {
+        GetTitle(drawpimtext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(pim_name,filename);
+    }
+
+    info("Calculating tree...");
+    WatchCursor();
+    if(save_log)
+    {
+        /* the log file name is derived from the sequence file name */
+        get_path(seqname,path);
+        strcpy(save_log_filename,path);
+        strcat(save_log_filename,"log");
+        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
+            error("Cannot open log file %s",save_log_filename);
+    }
+    if (Visible(drawtreew))
+    {
+        Remove(drawtreew);
+        drawtreew=NULL;
+    }
+    phylogenetic_tree(phylip_name,clustal_name,dist_name,nexus_name,pim_name);
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    ArrowCursor();
+    /* filename holds whichever name was read last */
+    info("Tree %s created",filename);
+}
+
+static void BootstrapTree(ButtoN but)
+{
+    /* OK-button callback of the "BOOTSTRAP TREE" dialog: collect the file
+     * name for every enabled tree output, optionally open the log file,
+     * remove the dialog and call bootstrap_tree().
+     *
+     * Fix: all name buffers start out as empty strings.  Buffers belonging
+     * to disabled outputs used to be handed to bootstrap_tree()
+     * uninitialised.
+     */
+    char phylip_name[FILENAMELEN]="";
+    char clustal_name[FILENAMELEN]="";
+    char nexus_name[FILENAMELEN]="";
+    char path[FILENAMELEN];
+
+    if(output_tree_clustal)
+    {
+        GetTitle(bootnjtreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(clustal_name,filename);
+    }
+    if(output_tree_phylip)
+    {
+        GetTitle(bootphtreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(phylip_name,filename);
+    }
+    if(output_tree_nexus)
+    {
+        GetTitle(bootnxstreetext, filename, FILENAMELEN);
+        stripspace(filename);
+        strcpy(nexus_name,filename);
+    }
+
+    info("Bootstrapping tree...");
+
+    WatchCursor();
+    if(save_log)
+    {
+        /* the log file name is derived from the sequence file name */
+        get_path(seqname,path);
+        strcpy(save_log_filename,path);
+        strcat(save_log_filename,"log");
+        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
+            warning("Cannot open log file %s",save_log_filename);
+    }
+    if (Visible(boottreew))
+    {
+        Remove(boottreew);
+        boottreew=NULL;
+    }
+    bootstrap_tree(phylip_name,clustal_name,nexus_name);
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    /* filename holds whichever name was read last */
+    info("Bootstrap tree %s created",filename);
+    ArrowCursor();
+}
+
+
+static void OpenSeqFile (IteM item)
+{
+    /* Menu callback: load a sequence file into the main sequence panel,
+     * replacing whatever is there (after confirmation when sequences are
+     * already loaded).  Also clears the cut-sequence count. */
+    int loaded;
+    panel_data pd;
+
+    if (nseqs>0 && Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
+        return;
+
+    if (!GetInputFileName (filename,FILENAMELEN,"",""))
+        return;
+
+    strcpy(seqname,filename);
+
+    /* reset the sequence counts held by the names and sequence panels */
+    GetPanelExtra(seq_panel.names,&pd);
+    pd.nseqs=0;
+    pd.vseqs=0;
+    SetPanelExtra(seq_panel.names,&pd);
+    GetPanelExtra(seq_panel.seqs,&pd);
+    pd.nseqs=0;
+    pd.vseqs=0;
+    SetPanelExtra(seq_panel.seqs,&pd);
+
+    loaded=seq_input(FALSE);
+    if (loaded<=0)
+    {
+        info("File %s not loaded.",seqname);
+        return;
+    }
+
+    load_aln(seq_panel,0,nseqs-1,TRUE);
+
+    ncutseqs=0;
+
+    info("File %s loaded.",seqname);
+}
+
+static void AppendSeqFile (IteM item)
+{
+    /* Menu callback: read another sequence file and append its sequences
+     * to those already in the main sequence panel. */
+    int loaded;
+    panel_data pd;
+
+    if (!GetInputFileName (filename,FILENAMELEN,"",""))
+        return;
+
+    strcpy(seqname,filename);
+
+    /* only the names panel has its sequence count reset here */
+    GetPanelExtra(seq_panel.names,&pd);
+    pd.nseqs=0;
+    SetPanelExtra(seq_panel.names,&pd);
+
+    loaded=seq_input(TRUE);
+    if (loaded<=0)
+    {
+        info("File %s not loaded.",seqname);
+        return;
+    }
+
+    load_aln(seq_panel,0,nseqs-1,FALSE);
+
+    info("File %s appended.",seqname);
+}
+
+static void OpenPrf1File (IteM item)
+{
+    /* Menu callback: load a file as profile 1.
+     *
+     * When profile 2 is present its sequences (names, titles, lengths,
+     * output indices and residues) are copied aside first, because they
+     * sit after profile 1 in the global arrays; once the new profile 1
+     * has been read they are put back behind it.
+     *
+     * Fix: the saved copy is now released on every path.  It used to leak
+     * when profile_input() failed (early return), and the container
+     * arrays leaked when profile 2 was flagged non-empty but held no
+     * sequences.
+     */
+    int i,j,k,n,tmpn=0,tmpfs=0;
+    sint *tmplen_array=NULL;
+    sint *tmpindex=NULL;
+    char **tmp_array=NULL;
+    char **tmpnames=NULL;
+    char **tmptitles=NULL;
+    panel_data data;
+
+    if (profile1_nseqs>0)
+    {
+        if (Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
+            return;
+    }
+
+    if (!GetInputFileName (filename,FILENAMELEN,"","")) return;
+
+    if(!profile2_empty)
+    {
+        /* copy profile 2 aside; slot k holds sequence profile1_nseqs+1+k */
+        tmpn=nseqs-profile1_nseqs;
+        tmpfs=profile1_nseqs;
+        tmpnames=(char **)ckalloc((tmpn+1)*sizeof(char *));
+        tmptitles=(char **)ckalloc((tmpn+1)*sizeof(char *));
+        tmplen_array=(sint *)ckalloc((tmpn+1)*sizeof(sint));
+        tmpindex=(sint *)ckalloc((tmpn+1)*sizeof(sint));
+        tmp_array=(char **)ckalloc((tmpn+1)*sizeof(char *));
+        for(i=profile1_nseqs+1;i<=nseqs;i++)
+        {
+            k=i-profile1_nseqs-1;
+            tmpnames[k]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
+            tmptitles[k]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
+            strcpy(tmpnames[k],names[i]);
+
+            strcpy(tmptitles[k],titles[i]);
+            tmplen_array[k]=seqlen_array[i];
+            tmpindex[k]=output_index[i]-tmpfs+profile1_nseqs;
+            tmp_array[k]=(char *)ckalloc((seqlen_array[i]+2)*sizeof(char));
+            for(j=1;j<=seqlen_array[i];j++)
+                tmp_array[k][j]=seq_array[i][j];
+        }
+    }
+
+    strcpy(seqname,filename);
+    GetPanelExtra(prf_panel[0].names,&data);
+    data.nseqs=0;
+    data.vseqs=0;
+    SetPanelExtra(prf_panel[0].names,&data);
+    GetPanelExtra(prf_panel[0].seqs,&data);
+    data.nseqs=0;
+    data.vseqs=0;
+    SetPanelExtra(prf_panel[0].seqs,&data);
+    profile_no = 1;
+    n=profile_input();
+    if (n<=0)
+    {
+        info("File %s not loaded.",seqname);
+        /* NOTE(review): the saved profile 2 is only released here, not
+         * restored, as before - the state of the global arrays after a
+         * failed profile_input() is not visible from this function. */
+        goto cleanup;
+    }
+    strcpy(profile1_name,seqname);
+    load_aln(prf_panel[0],0,profile1_nseqs-1,TRUE);
+
+    if(tmpn!=0)
+    {
+        /* put profile 2 back behind the freshly loaded profile 1 */
+        nseqs=tmpn+profile1_nseqs;
+        realloc_aln(profile1_nseqs+1,nseqs);
+        for(i=profile1_nseqs+1;i<=nseqs;i++)
+        {
+            k=i-profile1_nseqs-1;
+            names[i]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
+            titles[i]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
+
+            strcpy(names[i],tmpnames[k]);
+            strcpy(titles[i],tmptitles[k]);
+            seqlen_array[i]=tmplen_array[k];
+            /* shift the index by the change in profile 1 size */
+            output_index[i]=tmpindex[k]-tmpfs+profile1_nseqs;
+            seq_array[i]=(char *)ckalloc((seqlen_array[i]+2)*sizeof(char));
+            for(j=1;j<=seqlen_array[i];j++)
+                seq_array[i][j]=tmp_array[k][j];
+        }
+        profile2_empty=FALSE;
+    }
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
+
+    ncutseqs=0;
+
+    info("File %s loaded.",profile1_name);
+
+cleanup:
+    /* release the saved copy of profile 2, if one was made */
+    if (tmpnames!=NULL)
+    {
+        for(k=0;k<tmpn;k++)
+        {
+            ckfree(tmpnames[k]);
+            ckfree(tmptitles[k]);
+            ckfree(tmp_array[k]);
+        }
+        ckfree(tmpnames);
+        ckfree(tmptitles);
+        ckfree(tmplen_array);
+        ckfree(tmpindex);
+        ckfree(tmp_array);
+    }
+}
+
+static void OpenPrf2File (IteM item)
+{
+    /* Menu callback: load a file as profile 2.  Profile 1 must already be
+     * present; an existing profile 2 is replaced after confirmation. */
+    int loaded;
+    panel_data pd;
+
+    if (profile1_empty)
+    {
+        error("You must load profile 1 first.");
+        return;
+    }
+
+    /* sequences beyond profile 1 mean a profile 2 is already loaded */
+    if (nseqs>profile1_nseqs && Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
+        return;
+
+    if (!GetInputFileName (filename,FILENAMELEN,"",""))
+        return;
+
+    strcpy(seqname,filename);
+
+    /* reset the sequence counts held by the second profile's panels */
+    GetPanelExtra(prf_panel[1].names,&pd);
+    pd.nseqs=0;
+    pd.vseqs=0;
+    SetPanelExtra(prf_panel[1].names,&pd);
+    GetPanelExtra(prf_panel[1].seqs,&pd);
+    pd.nseqs=0;
+    pd.vseqs=0;
+    SetPanelExtra(prf_panel[1].seqs,&pd);
+
+    profile_no = 2;
+    loaded=profile_input();
+    if (loaded<=0)
+    {
+        info("File %s not loaded.",seqname);
+        return;
+    }
+    strcpy(profile2_name,seqname);
+    ncutseqs=0;
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
+
+    info("File %s loaded.",profile2_name);
+}
+
+
+static void BlackandWhite(IteM item)
+{
+    /* Menu callback: switch the display to black and white by reducing
+     * the colour count to one, recolouring the visible panel(s) and
+     * updating the three colour-mode menu items. */
+    ncolors=1;
+
+    if (aln_mode != MULTIPLEM)
+    {
+        color_prf1();
+        color_prf2();
+    }
+    else
+    {
+        color_seqs();
+    }
+
+    usebw=TRUE;
+    usedefcolors=FALSE;
+    useusercolors=FALSE;
+    SetStatus(bw_item,usebw);
+    SetStatus(defcol_item,usedefcolors);
+    SetStatus(usercol_item,useusercolors);
+    info("Done.");
+}
+
+
+static void DefColorPar(IteM item)
+{
+    /* Menu callback: drop any user-chosen colour file, load the default
+     * DNA or protein colour parameters (depending on dnaflag), recolour
+     * the visible panel(s) and update the colour-mode menu items. */
+    if (explicit_par_file != NULL)
+    {
+        ckfree(explicit_par_file);
+    }
+    explicit_par_file=NULL;
+
+    if (dnaflag)
+    {
+        par_file=find_file(def_dnapar_file);
+    }
+    else
+    {
+        par_file=find_file(def_protpar_file);
+    }
+    init_color_parameters(par_file);
+
+    if (aln_mode != MULTIPLEM)
+    {
+        color_prf1();
+        color_prf2();
+    }
+    else
+    {
+        color_seqs();
+    }
+
+    usebw=FALSE;
+    usedefcolors=TRUE;
+    useusercolors=FALSE;
+    SetStatus(bw_item,usebw);
+    SetStatus(defcol_item,usedefcolors);
+    SetStatus(usercol_item,useusercolors);
+    info("Done.");
+}
+
+void set_reset_new_gaps(IteM i)
+{
+    /* Menu toggle for "reset new gaps".  Switching it on switches the
+     * "reset all gaps" option (and its menu item) off, so the two are
+     * never on together. */
+    reset_alignments_new=GetStatus(i);
+    if (reset_alignments_new!=TRUE)
+        return;
+
+    reset_alignments_all=FALSE;
+    SetStatus(all_gaps_item,reset_alignments_all);
+}
+void set_reset_all_gaps(IteM i)
+{
+    /* Menu toggle for "reset all gaps".  Switching it on switches the
+     * "reset new gaps" option (and its menu item) off, so the two are
+     * never on together. */
+    reset_alignments_all=GetStatus(i);
+    if (reset_alignments_all!=TRUE)
+        return;
+
+    reset_alignments_new=FALSE;
+    SetStatus(new_gaps_item,reset_alignments_new);
+}
+
+
+static void OpenColorParWin(IteM item)
+{
+    /* Menu callback: ask for a colour parameter file, offering the
+     * current explicit_par_file (possibly NULL) as the initial name. */
+    read_file_window("Input Color File",
+                     "COLOR PARAMETER FILE NAME:",
+                     explicit_par_file,
+                     OpenColorPar);
+}
+
+static void OpenColorPar(ButtoN but)
+{
+    /* OK-button callback of the colour-file dialog: let the user pick a
+     * colour parameter file, make it both par_file and
+     * explicit_par_file, load it, recolour the visible panel(s) and
+     * update the colour-mode menu items.
+     *
+     * Fix: the chosen name is read into a local buffer first.  The old
+     * code freed par_file and replaced it with a fresh buffer BEFORE
+     * asking for a name, so cancelling the file chooser lost the current
+     * colour file name.
+     */
+    char chosen[FILENAMELEN];
+
+    /*<ramu> this might do to open a file selection window */
+    chosen[0]='\0';
+    if (!GetInputFileName (chosen,FILENAMELEN,"par","")) return;
+
+    /* a name was picked: only now replace the stored names */
+    if (par_file != NULL)
+        ckfree(par_file);
+    par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(par_file,chosen);
+
+    if (explicit_par_file != NULL)
+        ckfree(explicit_par_file);
+    explicit_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
+    strcpy(explicit_par_file,par_file);
+
+    info("Loading color file: %s\n",par_file);
+    init_color_parameters(par_file);
+    if (Visible(readfilew))
+    {
+        Remove(readfilew);
+        readfilew=NULL;
+    }
+    if (aln_mode == MULTIPLEM)
+        color_seqs();
+    else
+    {
+        color_prf1();
+        color_prf2();
+    }
+    usebw=FALSE;
+    usedefcolors=FALSE;
+    useusercolors=TRUE;
+    SetStatus(bw_item,usebw);
+    SetStatus(defcol_item,usedefcolors);
+    SetStatus(usercol_item,useusercolors);
+    info("Done.");
+}
+
+static void RemoveGapPos(IteM item)
+{
+    /* Menu callback: remove gap-only columns via remove_gap_pos() and
+     * reload the affected panel(s).  In multiple-alignment mode all
+     * sequences are treated together; otherwise the two profiles are
+     * processed one after the other.  No confirmation is requested. */
+    if (nseqs==0)
+    {
+        Message(MSG_OK,"No file loaded.");
+        return;
+    }
+
+    if (aln_mode!=MULTIPLEM)
+    {
+        remove_gap_pos(1,profile1_nseqs,1);
+        load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+        remove_gap_pos(profile1_nseqs+1,nseqs,2);
+        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+    }
+    else
+    {
+        remove_gap_pos(1,nseqs,0);
+        load_aln(seq_panel,0,nseqs-1,FALSE);
+    }
+
+    info("All the columns that contains only the gaps, are removed!");
+}
+
+
+/*
+ * Menu callback: after confirmation, strips every gap_pos1/gap_pos2 entry
+ * from the sequences selected in the active panel, compacting each sequence
+ * in place, writing a -3 marker after its last residue and updating
+ * seqlen_array.  The active panel is then reloaded and flagged as modified.
+ */
+static void RemoveGaps(IteM item)
+{
+    int idx, seq, col, kept;
+    panel_data data;
+    Boolean any_selected = FALSE;
+
+    if (nseqs == 0)
+    {
+        Message(MSG_OK,"No file loaded.");
+        return;
+    }
+
+    GetPanelExtra(active_panel.names,&data);
+    for (idx = 0; idx < data.nseqs && any_selected == FALSE; idx++)
+    {
+        if (data.selected[idx] == TRUE)
+            any_selected = TRUE;
+    }
+    if (any_selected == FALSE)
+    {
+        Message(MSG_OK,"Select sequences by clicking on the names.");
+        return;
+    }
+
+    if (Message(MSG_YN,"Remove gaps from selected sequences ?") == ANS_NO)
+        return;
+
+    /* panel row idx corresponds to clustal sequence data.firstseq+idx+1 */
+    for (idx = 0; idx < data.nseqs; idx++)
+    {
+        if (data.selected[idx] != TRUE)
+            continue;
+
+        seq = data.firstseq + idx + 1;
+        kept = 0;
+        for (col = 1; col <= seqlen_array[seq]; col++)
+        {
+            if (seq_array[seq][col] != gap_pos1 &&
+                seq_array[seq][col] != gap_pos2)
+            {
+                kept++;
+                seq_array[seq][kept] = seq_array[seq][col];
+            }
+        }
+        seq_array[seq][kept+1] = -3;
+        seqlen_array[seq] = kept;
+    }
+
+    load_aln(active_panel,data.firstseq,data.firstseq+data.nseqs-1,FALSE);
+    active_panel.modified = TRUE;
+    info("Gaps in selected sequences removed.");
+}
+
+/*
+ * Menu callback: cuts the sequences selected in the active panel.
+ * Any previously saved cut buffers (first ncutseqs entries) are freed, new
+ * save arrays sized for the panel are allocated, and the actual cut is
+ * delegated to cut_multiplem / cut_profile1 / cut_profile2 according to
+ * data.prf_no.  Afterwards the row at the position of the first previously
+ * selected sequence (clamped to the last row) is selected again.
+ */
+static void CutSequences(IteM item)
+{
+    int k;
+    int first_sel = -1;   /* panel row of the first selected sequence */
+    panel_data data;
+
+    if (nseqs == 0)
+    {
+        Message(MSG_OK,"No file loaded.");
+        return;
+    }
+
+    GetPanelExtra(active_panel.names,&data);
+    for (k = 0; k < data.nseqs && first_sel < 0; k++)
+    {
+        if (data.selected[k] == TRUE)
+            first_sel = k;
+    }
+    if (first_sel < 0)
+    {
+        Message(MSG_OK,"Select sequences to be cut by clicking on the names.");
+        return;
+    }
+
+    /* discard whatever is left over from an earlier cut */
+    for (k = 0; k < ncutseqs; k++)
+    {
+        if (saveseq_array != NULL && saveseq_array[k] != NULL)
+            ckfree(saveseq_array[k]);
+        if (savetitles != NULL && savetitles[k] != NULL)
+            ckfree(savetitles[k]);
+        if (savenames != NULL && savenames[k] != NULL)
+            ckfree(savenames[k]);
+    }
+    if (saveseqlen_array != NULL) ckfree(saveseqlen_array);
+    if (saveseq_array != NULL) ckfree(saveseq_array);
+    if (savetitles != NULL) ckfree(savetitles);
+    if (savenames != NULL) ckfree(savenames);
+    ncutseqs = 0;
+
+    /* new save area, one slot per sequence in the panel (plus one spare) */
+    savenames = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    savetitles = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    saveseq_array = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    saveseqlen_array = (sint *)ckalloc((data.nseqs+1) * sizeof(sint));
+    for (k = 0; k < data.nseqs; k++)
+    {
+        savenames[k] = NULL;
+        savetitles[k] = NULL;
+        saveseq_array[k] = NULL;
+    }
+
+    switch (data.prf_no)
+    {
+    case 0:
+        cut_multiplem();
+        break;
+    case 1:
+        cut_profile1();
+        break;
+    case 2:
+        cut_profile2();
+        break;
+    default:
+        break;
+    }
+
+    /* re-read the panel state and keep one row selected */
+    GetPanelExtra(active_panel.names,&data);
+    if (first_sel >= data.nseqs)
+        first_sel = data.nseqs-1;
+    if (data.nseqs > 0)
+        data.selected[first_sel] = TRUE;
+    SetPanelExtra(active_panel.names,&data);
+    DrawPanel(active_panel.names);
+
+    active_panel.modified = TRUE;
+    info("Cut %d sequences.",ncutseqs);
+}
+
+/*
+ * Cuts the selected sequences of the active panel in multiple-alignment
+ * mode.  Walking from the last row to the first, each selected sequence is
+ * saved with ssave() and the following sequences are shifted down one slot
+ * with sscpy().  nseqs is reduced by the number cut and the panel reloaded;
+ * load_aln() gets TRUE as its last argument when the remaining sequences
+ * number no more than data.vseqs.
+ */
+static void cut_multiplem(void)
+{
+    int seq, dst;
+    panel_data data;
+
+    GetPanelExtra(active_panel.names,&data);
+
+    seq = data.nseqs;
+    while (seq > 0)
+    {
+        if (data.selected[seq-1] == TRUE)
+        {
+            ssave(seq);
+            for (dst = seq; dst < data.nseqs; dst++)
+                sscpy(dst,dst+1);
+        }
+        seq--;
+    }
+
+    nseqs -= ncutseqs;
+    if (nseqs <= 0)
+        empty = TRUE;
+
+    if (ncutseqs > 0)
+    {
+        if (nseqs <= data.vseqs)
+            load_aln(active_panel,0,nseqs-1,TRUE);
+        else
+            load_aln(active_panel,0,nseqs-1,FALSE);
+    }
+}
+
+/*
+ * Cuts the selected sequences of profile 1 (the active panel).  Each
+ * selected sequence is saved with ssave() and every later sequence up to
+ * nseqs - i.e. including those of profile 2 - is shifted down one slot.
+ * Both profile1_nseqs and nseqs shrink by the number cut; the profile 1
+ * panel is reloaded, and the profile 2 panel too unless profile 2 is empty.
+ */
+static void cut_profile1(void)
+{
+    int seq, dst;
+    panel_data data;
+
+    GetPanelExtra(active_panel.names,&data);
+
+    seq = data.nseqs;
+    while (seq > 0)
+    {
+        if (data.selected[seq-1] == TRUE)
+        {
+            ssave(seq);
+            for (dst = seq; dst < nseqs; dst++)
+                sscpy(dst,dst+1);
+        }
+        seq--;
+    }
+
+    profile1_nseqs -= ncutseqs;
+    nseqs -= ncutseqs;
+    if (profile1_nseqs <= 0)
+        profile1_empty = TRUE;
+    if (nseqs <= 0)
+        empty = TRUE;
+
+    if (ncutseqs <= 0)
+        return;
+
+    if (profile1_nseqs <= data.vseqs)
+        load_aln(active_panel,0,profile1_nseqs-1,TRUE);
+    else
+        load_aln(active_panel,0,profile1_nseqs-1,FALSE);
+
+    if (!profile2_empty)
+        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+}
+
+/*
+ * Cuts the selected sequences of profile 2 (the active panel).  Panel row
+ * i-1 corresponds to clustal sequence i+profile1_nseqs; each selected
+ * sequence is saved with ssave() and the later sequences are shifted down
+ * one slot with sscpy().  nseqs shrinks by the number cut and the panel is
+ * reloaded.
+ *
+ * The last argument of load_aln() is TRUE when the remaining profile 2
+ * sequences number no more than data.vseqs, matching cut_multiplem() and
+ * cut_profile1(); the two flags were previously swapped in this function.
+ * NOTE(review): load_aln() is not visible here - confirm the swap was
+ * unintended.
+ */
+static void cut_profile2(void)
+{
+    int i,j;
+    panel_data data;
+
+    GetPanelExtra(active_panel.names,&data);
+    for (i=data.nseqs;i>0;i--)
+    {
+        if(data.selected[i-1]==TRUE)
+        {
+            ssave(i+profile1_nseqs);
+            for(j=i+profile1_nseqs;j<nseqs;j++)
+                sscpy(j,j+1);
+        }
+    }
+    nseqs-=ncutseqs;
+    if (nseqs-profile1_nseqs<=0) profile2_empty=TRUE;
+    if (nseqs<=0) empty=TRUE;
+    if (ncutseqs>0)
+    {
+        if(nseqs-profile1_nseqs<=data.vseqs)
+            load_aln(active_panel,profile1_nseqs,nseqs-1,TRUE);
+        else
+            load_aln(active_panel,profile1_nseqs,nseqs-1,FALSE);
+    }
+}
+
+/*
+ * Menu callback: pastes the previously cut sequences into the active panel,
+ * directly after the last selected row (or at the top when the panel holds
+ * no sequences).  Existing sequences behind the insertion point are moved
+ * up by ncutseqs slots with sscpy(), the saved sequences are loaded with
+ * sload() until ncutseqs reaches 0, and the sequence counters and panels
+ * for the panel's profile number are updated.
+ */
+static void PasteSequences(IteM item)
+{
+    int after;        /* 0-based row the paste goes after, -1 for "none" */
+    int last;         /* highest clustal index that has to be shifted */
+    int i, slot;
+    int npasted;
+    panel_data data;
+
+    if (ncutseqs <= 0)
+    {
+        Message(MSG_OK,"No sequences available for pasting.\n"
+                " Cut selected sequences first.");
+        return;
+    }
+
+    GetPanelExtra(active_panel.names,&data);
+    npasted = ncutseqs;
+    after = -1;
+    if (data.nseqs > 0)
+    {
+        /* the last selected row wins */
+        i = data.nseqs-1;
+        while (i >= 0 && after == -1)
+        {
+            if (data.selected[i] == TRUE)
+                after = i;
+            i--;
+        }
+        if (after == -1)
+        {
+            Message(MSG_OK,"Select a sequence by clicking on the name.\n"
+                    " Cut sequences will be pasted after this one.");
+            return;
+        }
+    }
+
+    /* profile 2 rows sit behind the profile 1 sequences in the arrays */
+    if (data.prf_no == 2)
+    {
+        after += profile1_nseqs;
+        last = profile1_nseqs+data.nseqs;
+    }
+    else
+    {
+        last = nseqs;
+    }
+
+    /* open a gap of ncutseqs slots, then fill it from the save arrays */
+    for (i = last; i > after+1; i--)
+        sscpy(i+ncutseqs,i);
+    slot = after+2;
+    while (ncutseqs > 0)
+    {
+        sload(slot);
+        slot++;
+    }
+
+    switch (data.prf_no)
+    {
+    case 0:
+        nseqs = data.nseqs+npasted;
+        if (nseqs > 0) empty = FALSE;
+        load_aln(seq_panel,0,nseqs-1,FALSE);
+        break;
+    case 1:
+        profile1_nseqs = data.nseqs+npasted;
+        nseqs += npasted;
+        if (profile1_nseqs > 0) profile1_empty = FALSE;
+        load_aln(active_panel,0,profile1_nseqs-1,FALSE);
+        if (!profile2_empty)
+            load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+        break;
+    case 2:
+        nseqs = profile1_nseqs+data.nseqs+npasted;
+        if (profile1_nseqs < nseqs)
+        {
+            profile2_empty = FALSE;
+            empty = FALSE;
+        }
+        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+        break;
+    default:
+        break;
+    }
+
+    active_panel.modified = TRUE;
+    info("Pasted %d sequences.",npasted);
+}
+
+/*
+ * Appends clustal sequence j (name, title, length and residues) to the temp
+ * save arrays at slot ncutseqs and increments ncutseqs.  Residues are stored
+ * from index 1, followed by a -3 marker.  Buffers already present in the
+ * slot are freed first.
+ */
+static void ssave(int j)
+{
+    int pos;
+    int slot = ncutseqs;
+
+    if (saveseq_array[slot] != NULL)
+        ckfree(saveseq_array[slot]);
+    if (savenames[slot] != NULL)
+        ckfree(savenames[slot]);
+    if (savetitles[slot] != NULL)
+        ckfree(savetitles[slot]);
+
+    savenames[slot] = (char *)ckalloc((MAXNAMES+2)*sizeof(char));
+    savetitles[slot] = (char *)ckalloc((MAXTITLES+2)*sizeof(char));
+    strcpy(savenames[slot],names[j]);
+    strcpy(savetitles[slot],titles[j]);
+
+    saveseqlen_array[slot] = seqlen_array[j];
+    saveseq_array[slot] = (char *)ckalloc((seqlen_array[j]+2)*sizeof(char));
+    pos = 1;
+    while (pos <= seqlen_array[j])
+    {
+        saveseq_array[slot][pos] = seq_array[j][pos];
+        pos++;
+    }
+    saveseq_array[slot][pos] = -3;   /* marker after the last residue */
+
+    ncutseqs = slot+1;
+}
+
+/*
+ * Copies the sequence at clustal array position j INTO position i
+ * (destination first, source second): name, title, length and residues.
+ * The destination is resized with realloc_seq() and a -3 marker is written
+ * after the last copied residue.
+ */
+static void sscpy(int i,int j)
+{
+    int pos;
+
+    strcpy(names[i],names[j]);
+    strcpy(titles[i],titles[j]);
+
+    seqlen_array[i] = seqlen_array[j];
+    realloc_seq(i,seqlen_array[i]);
+
+    pos = 1;
+    while (pos <= seqlen_array[j])
+    {
+        seq_array[i][pos] = seq_array[j][pos];
+        pos++;
+    }
+    seq_array[i][pos] = -3;
+}
+
+/*
+ * Pops the most recently saved sequence off the temp save arrays and writes
+ * it to clustal array position i: name, title, length and residues, with a
+ * -3 marker after the last residue.  ncutseqs is decremented; nothing
+ * happens when no saved sequence is left.
+ *
+ * The popped slot's buffers are released and reset to NULL here.  The code
+ * that frees the save arrays only walks the first ncutseqs slots, so once
+ * ncutseqs has dropped to 0 these buffers would otherwise never be freed.
+ */
+static void sload(int i)
+{
+    int k;
+    int slot;
+
+    if (ncutseqs<1) return;
+
+    ncutseqs--;
+    slot=ncutseqs;
+
+    strcpy(names[i],savenames[slot]);
+    strcpy(titles[i],savetitles[slot]);
+    seqlen_array[i]=saveseqlen_array[slot];
+    realloc_seq(i,seqlen_array[i]);
+    for(k=1;k<=seqlen_array[i];k++)
+        seq_array[i][k]=saveseq_array[slot][k];
+    seq_array[i][k]= -3;
+
+    /* the slot has been consumed: release its buffers */
+    if (saveseq_array[slot] != NULL)
+    {
+        ckfree(saveseq_array[slot]);
+        saveseq_array[slot]=NULL;
+    }
+    if (savenames[slot] != NULL)
+    {
+        ckfree(savenames[slot]);
+        savenames[slot]=NULL;
+    }
+    if (savetitles[slot] != NULL)
+    {
+        ckfree(savetitles[slot]);
+        savetitles[slot]=NULL;
+    }
+}
+
+/* Menu callback: marks every sequence of seq_panel as selected. */
+static void SelectSeqs(IteM item)
+{
+    select_seqs(seq_panel, TRUE);
+}
+
+/* Menu callback: marks every sequence of the first profile panel as selected. */
+static void SelectPrf1(IteM item)
+{
+    select_seqs(prf_panel[0], TRUE);
+}
+
+/* Menu callback: marks every sequence of the second profile panel as selected. */
+static void SelectPrf2(IteM item)
+{
+    select_seqs(prf_panel[1], TRUE);
+}
+
+/*
+ * Menu callback: folds profile 2 into profile 1 by setting profile1_nseqs
+ * to nseqs and flagging profile 2 as empty, then reloads both profile
+ * panels and makes the first one active.  Reports an error and changes
+ * nothing when profile 2 is not loaded.
+ */
+static void MergeProfiles(IteM item)
+{
+    if (!profile2_empty)
+    {
+        profile_no = 1;
+        profile1_nseqs = nseqs;
+        profile2_empty = TRUE;
+
+        load_aln(prf_panel[0], 0, profile1_nseqs-1, FALSE);
+        load_aln(prf_panel[1], profile1_nseqs, nseqs-1, FALSE);
+        active_panel = prf_panel[0];
+
+        info("Added Profile 2 to Profile 1.");
+    }
+    else
+    {
+        error("Profile 2 not loaded");
+    }
+}
+
+
+/*
+ * Menu callback: clears the selected residue range - in seq_panel for
+ * multiple-alignment mode, otherwise in both profile panels (second panel
+ * first).
+ */
+static void ClearSeqRange(IteM item)
+{
+    if (aln_mode != MULTIPLEM)
+    {
+        clear_seqrange(prf_panel[1]);
+        clear_seqrange(prf_panel[0]);
+        return;
+    }
+    clear_seqrange(seq_panel);
+}
+
+/*
+ * Menu callback: deselects all sequences - in seq_panel for
+ * multiple-alignment mode, otherwise in both profile panels (second panel
+ * first).
+ */
+static void ClearSeqs(IteM item)
+{
+    if (aln_mode != MULTIPLEM)
+    {
+        select_seqs(prf_panel[1], FALSE);
+        select_seqs(prf_panel[0], FALSE);
+        return;
+    }
+    select_seqs(seq_panel, FALSE);
+}
+
+/*
+ * Resets the residue-range selection stored with panel p.seqs (firstsel and
+ * lastsel become -1) and redraws the previously selected range with the
+ * NORMAL highlight state.
+ */
+static void clear_seqrange(spanel p)
+{
+    panel_data data;
+    int old_first, old_last;
+
+    GetPanelExtra(p.seqs, &data);
+
+    /* remember the old range so it can be redrawn after the reset */
+    old_first = data.firstsel;
+    old_last = data.lastsel;
+
+    data.firstsel = -1;
+    data.lastsel = -1;
+    SetPanelExtra(p.seqs, &data);
+
+    highlight_seqrange(p.seqs, old_first, old_last, NORMAL);
+}
+
+/*
+ * Sets the selection state of every sequence name in panel p to flag and
+ * redraws the names.  When flag is TRUE, p also becomes the active panel.
+ * Panels without sequences are left untouched.
+ */
+static void select_seqs(spanel p,Boolean flag)
+{
+    panel_data data;
+    int row;
+
+    GetPanelExtra(p.names, &data);
+    if (data.nseqs == 0)
+        return;
+
+    row = 0;
+    while (row < data.nseqs)
+    {
+        data.selected[row] = flag;
+        row++;
+    }
+    SetPanelExtra(p.names, &data);
+    draw_names(p.names);
+
+    if (flag == TRUE)
+        active_panel = p;
+}
+
+/*
+ * Menu callback: opens the "Complete Alignment" dialog via
+ * do_align_window(), provided sequences are loaded and there are at least
+ * two of them; otherwise an error is reported.
+ */
+static void CAlignWin (IteM item)
+{
+    if (empty)
+        error("No sequences loaded");
+    else if (nseqs <= 1)
+        error("Alignment has only %d sequences",nseqs);
+    else
+        do_align_window(&calignw,&ctreetext,NEW,"Complete Alignment",CompleteAlign);
+}
+
+/*
+ * ALIGN action of the "Complete Alignment" dialog.  Reads the guide tree
+ * file name from ctreetext, opens the alignment output files, removes the
+ * dialog, runs align() and reloads seq_panel.  The value returned by
+ * cputime(0) after the run is reported as the elapsed time.
+ * Returns without aligning when open_aln_files() fails.
+ */
+void CompleteAlign(ButtoN but)
+{
+    char phylip_name[FILENAMELEN];
+
+    GetTitle(ctreetext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    strcpy(phylip_name,filename);
+
+    if (!open_aln_files()) return;
+
+    WatchCursor();
+    if (Visible(calignw))
+    {
+        Remove(calignw);
+        calignw=NULL;
+    }
+    cputime(0); /* Start timing ........ Ramu */
+
+    align(phylip_name);
+
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    /* reorder the sequence arrays to match the output order (INPUT or
+       ALIGNED); reload_alignment() prints no messages */
+    reload_alignment();
+
+    load_aln(seq_panel,0,nseqs-1,FALSE);
+    ArrowCursor();
+    info("\n Elapsed time : %7.2f Secs ",cputime(0)); /* Ramu */
+}
+
+/*
+ * Menu callback: opens the "Realign Sequences" dialog via do_align_window().
+ * Requires loaded sequences, at least two of them, and at least one
+ * sequence selected in seq_panel; otherwise the user is told what is missing.
+ */
+static void RealignSeqsWin (IteM item)
+{
+    int row;
+    Boolean any_selected = FALSE;
+    panel_data data;
+
+    if (empty)
+    {
+        error("No sequences loaded");
+        return;
+    }
+    if (nseqs <= 1)
+    {
+        error("Alignment has only %d sequences",nseqs);
+        return;
+    }
+
+    /* at least one sequence name must be selected */
+    GetPanelExtra(seq_panel.names,&data);
+    for (row = 0; row < data.nseqs && any_selected == FALSE; row++)
+    {
+        if (data.selected[row] == TRUE)
+            any_selected = TRUE;
+    }
+    if (any_selected == FALSE)
+    {
+        Message(MSG_OK,"Select sequences to be realigned\n"
+                "by clicking on the names.");
+        return;
+    }
+
+    do_align_window(&ralignw,&rtreetext,NEW,"Realign Sequences",RealignSeqs);
+}
+
+/*
+ * ALIGN action of the "Realign Sequences" dialog.
+ *   1. cuts the sequences selected in seq_panel into the temp save arrays
+ *      (same procedure as the cut menu command);
+ *   2. re-appends them behind the remaining sequences, so that the unselected
+ *      sequences form "profile 1" (profile1_nseqs) and the selected ones
+ *      follow it;
+ *   3. calls new_sequence_align() and reloads the panel;
+ *   4. leaves the sequences from index profile1_nseqs onwards selected.
+ * Returns without changes when open_aln_files() fails.
+ */
+static void RealignSeqs(ButtoN but)
+{
+    int dest;         /* clustal slot receiving the next re-appended sequence */
+    int row, seq, src;
+    panel_data data;
+    char phylip_name[FILENAMELEN];
+
+    GetTitle(rtreetext, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(phylip_name,filename);
+
+    if (!open_aln_files())
+        return;
+
+    /* ---- cut the selected sequences ---- */
+    GetPanelExtra(seq_panel.names,&data);
+
+    for (row = 0; row < ncutseqs; row++)
+    {
+        if (saveseq_array != NULL && saveseq_array[row] != NULL)
+            ckfree(saveseq_array[row]);
+        if (savetitles != NULL && savetitles[row] != NULL)
+            ckfree(savetitles[row]);
+        if (savenames != NULL && savenames[row] != NULL)
+            ckfree(savenames[row]);
+    }
+    if (saveseqlen_array != NULL) ckfree(saveseqlen_array);
+    if (saveseq_array != NULL) ckfree(saveseq_array);
+    if (savetitles != NULL) ckfree(savetitles);
+    if (savenames != NULL) ckfree(savenames);
+    ncutseqs = 0;
+
+    savenames = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    savetitles = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    saveseq_array = (char **)ckalloc((data.nseqs+1) * sizeof(char *));
+    saveseqlen_array = (sint *)ckalloc((data.nseqs+1) * sizeof(sint));
+    for (row = 0; row < data.nseqs; row++)
+    {
+        savenames[row] = NULL;
+        savetitles[row] = NULL;
+        saveseq_array[row] = NULL;
+    }
+
+    for (seq = data.nseqs; seq > 0; seq--)
+    {
+        if (data.selected[seq-1] != TRUE)
+            continue;
+        ssave(seq);
+        for (src = seq; src < data.nseqs; src++)
+            sscpy(src,src+1);
+    }
+    nseqs = data.nseqs-ncutseqs;
+    if (nseqs <= 0)
+        empty = TRUE;
+
+    /* ---- paste them back at the end ----
+       The insertion point is the last remaining sequence, so nothing has
+       to be shifted out of the way before loading. */
+    profile1_nseqs = nseqs;
+    nseqs = profile1_nseqs+ncutseqs;
+    dest = profile1_nseqs+1;
+    while (ncutseqs > 0)
+    {
+        sload(dest);
+        dest++;
+    }
+
+    /* ---- align the re-appended sequences to profile 1 ---- */
+    WatchCursor();
+    if (Visible(ralignw))
+    {
+        Remove(ralignw);
+        ralignw = NULL;
+    }
+    new_sequence_align(phylip_name);
+
+    if (save_log && save_log_fd != NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd = NULL;
+    }
+
+    /* bring the arrays into output order (INPUT or ALIGNED) */
+    reload_alignment();
+    load_aln(seq_panel,0,nseqs-1,FALSE);
+
+    GetPanelExtra(seq_panel.names,&data);
+    for (row = 0; row < profile1_nseqs; row++)
+        data.selected[row] = FALSE;
+    for (row = profile1_nseqs; row < nseqs; row++)
+        data.selected[row] = TRUE;
+    SetPanelExtra(seq_panel.names,&data);
+    draw_names(seq_panel.names);
+
+    ArrowCursor();
+    info("Selected sequences realigned.");
+}
+
+/*
+ * Menu callback: builds and shows the "Realign Residue Range" dialog.
+ * Requires loaded sequences, at least two of them, and a residue range
+ * selected in seq_panel (data.firstsel != -1).
+ *
+ * The dialog holds a guide-tree file field, one file-name field for each
+ * enabled output format, the end-gap-penalty toggle and ALIGN / CANCEL
+ * buttons.  All default names are the path derived from seqname plus a
+ * format extension.
+ *
+ * The Fasta field is created here as well, so that this dialog offers the
+ * same output fields as do_align_window() and do_palign_window().
+ */
+static void RealignSeqRangeWin (IteM item)
+{
+    panel_data data;
+    GrouP aligngr;
+    GrouP output_list;
+    ButtoN align_ok, align_can;
+    GrouP maing;
+    PopuP end_gap_toggle;
+    char name[FILENAMELEN+1];
+    char path[FILENAMELEN+1];
+
+    if (empty)
+    {
+        error("No sequences loaded");
+        return;
+    }
+    if (nseqs <= 1)
+    {
+        error("Alignment has only %d sequences",nseqs);
+        return;
+    }
+
+    /* check a range has been selected */
+    GetPanelExtra(seq_panel.seqs,&data);
+    if(data.firstsel==-1)
+    {
+        Message(MSG_OK,"Select residue range to be realigned\n"
+                "by clicking in the sequence display area.");
+        return;
+    }
+
+    get_path(seqname,path);
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    rralignw=FixedWindow(-50, -33, -10, -10,"Realign Residue Range",RemoveWin);
+
+    maing=HiddenGroup(rralignw,2,0,NULL);
+    SetGroupSpacing(maing,0,10);
+
+    /* guide tree file */
+    make_prompt(rralignw, "Output Guide Tree File:");
+    stdLineHeight=18;
+    SelectFont(programFont);
+    Break(rralignw);
+    rrtreetext=DialogText(rralignw, "", 35, NULL);
+    strcpy(name,path);
+    strcat(name,"dnd");
+    SetTitle(rrtreetext, name);
+    Break(rralignw);
+
+    /* one file-name field per enabled output format */
+    make_prompt(rralignw, "Output Alignment Files:");
+    output_list=HiddenGroup(rralignw, 2, 0, NULL);
+    if(output_clustal) {
+        make_prompt(output_list,"Clustal: ");
+        cl_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"aln");
+        SetTitle(cl_outtext, name);
+        Break(output_list);
+    }
+    if(output_nbrf) {
+        make_prompt(output_list,"NBRF/PIR: ");
+        pir_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"pir");
+        SetTitle(pir_outtext, name);
+        Break(output_list);
+    }
+    if(output_gcg) {
+        make_prompt(output_list,"GCG/MSF: ");
+        msf_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"msf");
+        SetTitle(msf_outtext, name);
+        Break(output_list);
+    }
+    if(output_phylip) {
+        make_prompt(output_list,"Phylip: ");
+        phylip_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"phy");
+        SetTitle(phylip_outtext, name);
+        Break(output_list);
+    }
+    if(output_gde) {
+        make_prompt(output_list,"GDE: ");
+        gde_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"gde");
+        SetTitle(gde_outtext, name);
+        Break(output_list);
+    }
+    if(output_nexus) {
+        make_prompt(output_list,"Nexus: ");
+        nexus_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"nxs");
+        SetTitle(nexus_outtext, name);
+        Break(output_list);
+    }
+    /* same Fasta field as in do_align_window() / do_palign_window() */
+    if(output_fasta) {
+        make_prompt(output_list,"Fasta: ");
+        fasta_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"fasta");
+        SetTitle(fasta_outtext, name);
+        Break(output_list);
+    }
+
+    Break(rralignw);
+    end_gap_toggle=make_toggle(rralignw,"Realign Segment End Gap Penalties","ON","OFF",&realign_endgappenalties,set_realign_endgappenalties);
+    Break(rralignw);
+
+    aligngr=HiddenGroup(rralignw, 2, 0, NULL);
+    shift(aligngr, 60, 20);
+    align_ok=PushButton(aligngr, " ALIGN ", RealignSeqRange);
+    shift(aligngr, 20,0);
+    align_can=PushButton(aligngr, "CANCEL", CancelWin);
+
+    Show(rralignw);
+}
+
+static void RealignSeqRange(ButtoN but)
+{ /* ALIGN action of the "Realign Residue Range" dialog: saves the whole
+   * alignment, copies the selected column range (data.firstsel..data.lastsel
+   * of seq_panel.seqs) into the clustal arrays, runs align() on it with the
+   * output order forced to INPUT, and splices the result back between the
+   * untouched left and right parts.  Returns early when open_aln_files()
+   * fails. */
+ int i,j;
+ int fs,save_order,length,length1,length2;
+ panel_data data;
+ sint *tmplen_array;
+ char **tmp_array;
+ sint *newlen_array;
+ char **new_array;
+ char phylip_name[FILENAMELEN];
+
+ GetTitle(rrtreetext, filename, FILENAMELEN);
+ stripspace(filename);
+
+ strcpy(phylip_name,filename);
+
+ if (!open_aln_files()) return;
+
+ WatchCursor();
+ if (Visible(rralignw))
+ {
+ Remove(rralignw);
+ rralignw=NULL;
+ }
+/* save the alignment into a temporary area; rows are indexed from 1 and
+   every row is padded with gap_pos2 out to data.ncols columns */
+
+ GetPanelExtra(seq_panel.seqs,&data);
+
+ tmplen_array=(sint *)ckalloc((data.nseqs+2) * sizeof(sint));
+ tmp_array=(char **)ckalloc((data.nseqs+2) * sizeof(char *));
+ for (i=1;i<=data.nseqs;i++)
+ {
+ tmplen_array[i]=seqlen_array[i];
+ tmp_array[i]=(char *)ckalloc((data.ncols+2) * sizeof(char));
+ for(j=1;j<=seqlen_array[i];j++)
+ tmp_array[i][j]=seq_array[i][j];
+ for(j=seqlen_array[i]+1;j<=data.ncols;j++)
+ tmp_array[i][j]=gap_pos2;
+ }
+
+/* copy the selected residue range to the clustal alignment arrays.
+   The source index is j+1: firstsel/lastsel look 0-based while the sequence
+   arrays are 1-based - NOTE(review): confirm against the panel code. */
+
+ fs=data.firstsel; /* remembered so the selection can be restored at the end */
+ length=data.lastsel-data.firstsel+1; /* number of selected columns */
+ max_aln_length=2*length;
+ for (i=1;i<=data.nseqs;i++)
+ {
+ seqlen_array[i]=length;
+ realloc_seq(i,length);
+ for(j=data.firstsel;j<=data.lastsel;j++)
+ seq_array[i][j-data.firstsel+1]=tmp_array[i][j+1];
+ seq_array[i][j-data.firstsel+1]=-3; /* -3 marker after the last copied residue */
+ }
+/* temporarily set the output order to be the same as the input */
+ save_order=output_order;
+ output_order=INPUT;
+/* set the end gaps penalties to the value chosen for range realignment */
+ endgappenalties=realign_endgappenalties;
+
+/* align the residue range */
+ align(phylip_name);
+ if(save_log && save_log_fd!=NULL)
+ {
+ fclose(save_log_fd);
+ save_log_fd=NULL;
+ }
+
+ output_order=save_order;
+/* reset the end gaps penalties to the normal alignment setting */
+ endgappenalties=align_endgappenalties;
+
+/* remove positions that contain just gaps */
+ remove_gap_pos(1,nseqs,0);
+
+
+/* save the new alignment into another temporary area (no terminator is
+   copied; lengths are kept in newlen_array) */
+ newlen_array=(sint *)ckalloc((data.nseqs+2) * sizeof(sint));
+ new_array=(char **)ckalloc((data.nseqs+2) * sizeof(char *));
+ for (i=1;i<=data.nseqs;i++)
+ {
+ newlen_array[i]=seqlen_array[i];
+ new_array[i]=(char *)ckalloc((seqlen_array[i]+2) * sizeof(char));
+ for(j=1;j<=seqlen_array[i];j++)
+ new_array[i][j]=seq_array[i][j];
+ }
+
+/* paste the realigned range back into the alignment:
+   length1 = new total length of row i (old length - old range + new range),
+   length2 = length of the realigned range for row i */
+ max_aln_length=0;
+ length1=length2=0;
+ for (i=1;i<=data.nseqs;i++)
+ {
+ length1=tmplen_array[i]-length+newlen_array[i];
+ if(length1>max_aln_length) max_aln_length=length1;
+ length2=newlen_array[i];
+ seqlen_array[i]=length1;
+ realloc_seq(i,length1);
+ for(j=1;j<=data.firstsel;j++) /* columns left of the range, unchanged */
+ seq_array[i][j]=tmp_array[i][j];
+ for(j=data.firstsel+1;j<=data.firstsel+length2;j++) /* the realigned range */
+ seq_array[i][j]=new_array[i][j-data.firstsel];
+ for(j=data.firstsel+length2+1;j<=length1;j++) /* columns right of the old range */
+ seq_array[i][j]=tmp_array[i][data.lastsel+j-data.firstsel-length2+1];
+ }
+ max_aln_length*=2;
+ ckfree(tmplen_array);
+ for(i=1;i<=data.nseqs;i++)
+ ckfree(tmp_array[i]);
+ ckfree(tmp_array);
+ ckfree(newlen_array);
+ for(i=1;i<=data.nseqs;i++)
+ ckfree(new_array[i]);
+ ckfree(new_array);
+
+ if (open_aln_files()) /* second call, made after rralignw was removed above */
+ create_alignment_output(1,data.nseqs);
+
+ load_aln(seq_panel,0,nseqs-1,FALSE);
+ GetPanelExtra(seq_panel.seqs,&data);
+ data.firstsel=fs;
+ data.lastsel=data.firstsel+length2-1; /* length2 still holds the value from the last row */
+ SetPanelExtra(seq_panel.seqs,&data);
+ highlight_seqrange(seq_panel.seqs,data.firstsel,data.lastsel,HIGHLIGHT);
+ ArrowCursor();
+ info("Selected sequence range realigned.");
+}
+
+
+/*
+ * Menu callback: opens the "Alignment from Guide Tree" dialog via
+ * do_align_window() with an existing (OLD) tree file, provided sequences are
+ * loaded and there are at least two of them; otherwise an error is reported.
+ */
+void AlignFromTreeWin(IteM item)
+{
+    if (empty)
+        error("No sequences loaded");
+    else if (nseqs < 2)
+        error("Alignment has only %d sequences",nseqs);
+    else
+        do_align_window(&talignw,&ttreetext,OLD,"Alignment from Guide Tree",AlignFromTree);
+}
+
+/*
+ * Builds and shows a generic alignment dialog.
+ *   ralignw    - receives the created window
+ *   rtreetext  - receives the guide-tree file-name field
+ *   treestatus - NEW: the tree file is labelled as output, otherwise as input
+ *   title      - window title
+ *   align_proc - action attached to the ALIGN button
+ * Besides the tree field the dialog holds one file-name field for each
+ * enabled output format; every default name is the path derived from
+ * seqname plus a format extension.
+ */
+static void do_align_window(WindoW *ralignw,TexT *rtreetext,Boolean treestatus,char *title,void align_proc(ButtoN but))
+{
+    WindoW win;
+    TexT tree_field;
+    GrouP main_group;
+    GrouP out_group;
+    GrouP button_group;
+    ButtoN ok_button, cancel_button;
+    char name[FILENAMELEN+1];
+    char path[FILENAMELEN+1];
+
+    get_path(seqname,path);
+    SelectFont(systemFont);
+    stdCharWidth = CharWidth('A');
+    stdLineHeight = LineHeight();
+    win = FixedWindow(-50, -33, -10, -10,title,RemoveWin);
+
+    main_group = HiddenGroup(win,2,0,NULL);
+    SetGroupSpacing(main_group,0,10);
+
+    /* guide tree file */
+    if (treestatus != NEW)
+        make_prompt(win, "Input Guide Tree File:");
+    else
+        make_prompt(win, "Output Guide Tree File:");
+    stdLineHeight = 18;
+    SelectFont(programFont);
+    Break(win);
+    strcpy(name,path);
+    strcat(name,"dnd");
+    tree_field = DialogText(win, "", 35, NULL);
+    SetTitle(tree_field, name);
+    Break(win);
+
+    /* one file-name field per enabled output format */
+    make_prompt(win, "Output Alignment Files:");
+    out_group = HiddenGroup(win, 2, 0, NULL);
+    if (output_clustal)
+    {
+        strcpy(name,path);
+        strcat(name,"aln");
+        make_prompt(out_group,"Clustal: ");
+        cl_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(cl_outtext, name);
+        Break(out_group);
+    }
+    if (output_nbrf)
+    {
+        strcpy(name,path);
+        strcat(name,"pir");
+        make_prompt(out_group,"NBRF/PIR: ");
+        pir_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(pir_outtext, name);
+        Break(out_group);
+    }
+    if (output_gcg)
+    {
+        strcpy(name,path);
+        strcat(name,"msf");
+        make_prompt(out_group,"GCG/MSF: ");
+        msf_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(msf_outtext, name);
+        Break(out_group);
+    }
+    if (output_phylip)
+    {
+        strcpy(name,path);
+        strcat(name,"phy");
+        make_prompt(out_group,"Phylip: ");
+        phylip_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(phylip_outtext, name);
+        Break(out_group);
+    }
+    if (output_gde)
+    {
+        strcpy(name,path);
+        strcat(name,"gde");
+        make_prompt(out_group,"GDE: ");
+        gde_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(gde_outtext, name);
+        Break(out_group);
+    }
+    if (output_nexus)
+    {
+        strcpy(name,path);
+        strcat(name,"nxs");
+        make_prompt(out_group,"Nexus: ");
+        nexus_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(nexus_outtext, name);
+        Break(out_group);
+    }
+    /* Fasta output (added by Ramu) */
+    if (output_fasta)
+    {
+        strcpy(name,path);
+        strcat(name,"fasta");
+        make_prompt(out_group,"Fasta: ");
+        fasta_outtext = DialogText(out_group, "", 35, NULL);
+        SetTitle(fasta_outtext, name);
+        Break(out_group);
+    }
+
+    /* ALIGN / CANCEL buttons */
+    Break(win);
+    button_group = HiddenGroup(win, 2, 0, NULL);
+    shift(button_group, 60, 20);
+    ok_button = PushButton(button_group, " ALIGN ", align_proc);
+    shift(button_group, 20,0);
+    cancel_button = PushButton(button_group, "CANCEL", CancelWin);
+
+    *ralignw = win;
+    *rtreetext = tree_field;
+    Show(win);
+}
+
+
+/*
+ * Builds and shows a profile alignment dialog with two guide-tree fields.
+ *   ralignw     - receives the created window
+ *   rtree1text  - receives the tree field whose default comes from profile1_name
+ *   rtree2text  - receives the tree field whose default comes from profile2_name
+ *   treestatus  - NEW: the tree files are labelled as output, otherwise input
+ *   title       - window title
+ *   align_proc  - action attached to the ALIGN button
+ * The default output file names use the path left in `path` by the last
+ * get_path() call, i.e. the one derived from profile2_name.
+ */
+static void do_palign_window(WindoW *ralignw,TexT *rtree1text,TexT *rtree2text,Boolean treestatus,char *title,void align_proc(ButtoN but))
+{
+    WindoW alignw;
+    TexT tree1text,tree2text;
+    GrouP aligngr;
+    GrouP output_list;
+    ButtoN align_ok, align_can;
+    GrouP maing;
+    char name[FILENAMELEN+1];
+    char path[FILENAMELEN+1];
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    alignw=FixedWindow(-50, -33, -10, -10,title,RemoveWin);
+
+    maing=HiddenGroup(alignw,2,0,NULL);
+    SetGroupSpacing(maing,0,10);
+
+    if(treestatus==NEW)
+        make_prompt(alignw, "Output Guide Tree Files:");
+    else
+        make_prompt(alignw, "Input Guide Tree Files:");
+    stdLineHeight=18;
+    SelectFont(programFont);
+    Break(alignw);
+
+    /* tree file for profile 1 */
+    tree1text=DialogText(alignw, "", 35, NULL);
+    get_path(profile1_name,path);
+    strcpy(name,path);
+    strcat(name,"dnd");
+    SetTitle(tree1text, name);
+    Break(alignw);
+
+    /* tree file for profile 2 */
+    tree2text=DialogText(alignw, "", 35, NULL);
+    get_path(profile2_name,path);
+    strcpy(name,path);
+    strcat(name,"dnd");
+    SetTitle(tree2text, name);
+    Break(alignw);
+
+    /* one file-name field per enabled output format */
+    make_prompt(alignw, "Output Alignment Files:");
+    output_list=HiddenGroup(alignw, 2, 0, NULL);
+    if(output_clustal) {
+        make_prompt(output_list,"Clustal: ");
+        cl_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"aln");
+        SetTitle(cl_outtext, name);
+        Break(output_list);
+    }
+    if(output_nbrf) {
+        make_prompt(output_list,"NBRF/PIR: ");
+        pir_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"pir");
+        SetTitle(pir_outtext, name);
+        Break(output_list);
+    }
+    if(output_gcg) {
+        make_prompt(output_list,"GCG/MSF: ");
+        msf_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"msf");
+        SetTitle(msf_outtext, name);
+        Break(output_list);
+    }
+    if(output_phylip) {
+        make_prompt(output_list,"Phylip: ");
+        phylip_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"phy");
+        SetTitle(phylip_outtext, name);
+        Break(output_list);
+    }
+    if(output_gde) {
+        make_prompt(output_list,"GDE: ");
+        gde_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"gde");
+        SetTitle(gde_outtext, name);
+        Break(output_list);
+    }
+    if(output_nexus) {
+        make_prompt(output_list,"Nexus: ");
+        nexus_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"nxs");
+        SetTitle(nexus_outtext, name);
+        Break(output_list);
+    }
+    /* Fasta output (added by Ramu) */
+    if(output_fasta) {
+        make_prompt(output_list,"Fasta: ");
+        fasta_outtext=DialogText(output_list, "", 35, NULL);
+        strcpy(name,path);
+        strcat(name,"fasta");
+        SetTitle(fasta_outtext, name);
+        Break(output_list);
+    }
+
+    /* ALIGN / CANCEL buttons */
+    Break(alignw);
+    aligngr=HiddenGroup(alignw, 2, 0, NULL);
+    shift(aligngr, 60, 20);
+    align_ok=PushButton(aligngr, " ALIGN ", align_proc);
+    shift(aligngr, 20,0);
+    align_can=PushButton(aligngr, "CANCEL", CancelWin);
+
+    *ralignw=alignw;
+    *rtree1text=tree1text;
+    *rtree2text=tree2text;
+    Show(alignw);
+}
+
+
+/*
+ * ALIGN action of the "Alignment from Guide Tree" dialog.  Reads the tree
+ * file name from ttreetext, checks that the file can be opened for reading,
+ * opens the alignment output files, removes the dialog, runs
+ * get_tree() and reloads seq_panel.
+ *
+ * The file is opened here only as a readability check, so the stream is
+ * closed again straight away; it used to be left open on every call.
+ */
+void AlignFromTree(ButtoN but)
+{
+    FILE *tree;
+    char phylip_name[FILENAMELEN];
+
+    GetTitle(ttreetext, filename, FILENAMELEN);
+    stripspace(filename);
+
+    strcpy(phylip_name,filename);
+#ifdef VMS
+    tree=fopen(phylip_name,"r","rat=cr","rfm=var");
+#else
+    tree=fopen(phylip_name,"r");
+#endif
+    if(tree==NULL) {
+        error("Cannot open tree file [%s]",phylip_name);
+        return;
+    }
+    fclose(tree); /* existence check only; get_tree() is given the name */
+
+    if (!open_aln_files()) return;
+
+    WatchCursor();
+    info("Doing alignments from guide tree...");
+    if (Visible(talignw))
+    {
+        Remove(talignw);
+        talignw=NULL;
+    }
+    get_tree(phylip_name);
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    /* bring the arrays into output order (INPUT or ALIGNED) */
+    reload_alignment();
+    load_aln(seq_panel,0,nseqs-1,FALSE);
+    ArrowCursor();
+    info("Done.");
+}
+
+/*
+ * Menu callback: opens the "Profile to Profile Alignment" dialog (new tree
+ * files) once both profiles are loaded; otherwise reports which one is
+ * missing.
+ */
+static void PrfPrfAlignWin (IteM item)
+{
+    if (profile1_empty)
+        error("Profile 1 not loaded");
+    else if (profile2_empty)
+        error("Profile 2 not loaded");
+    else
+        do_palign_window(&palignw,&ptree1text,&ptree2text,NEW,"Profile to Profile Alignment",PrfPrfAlign);
+}
+
+/*
+ * Menu callback: opens the "Profile Alignment from Tree" dialog (existing
+ * tree files) once both profiles are loaded; otherwise reports which one is
+ * missing.
+ */
+static void PrfPrfTreeAlignWin (IteM item)
+{
+    if (profile1_empty)
+        error("Profile 1 not loaded");
+    else if (profile2_empty)
+        error("Profile 2 not loaded");
+    else
+        do_palign_window(&palignw,&ptree1text,&ptree2text,OLD,"Profile Alignment from Tree",PrfPrfTreeAlign);
+}
+
+/*
+ * Menu callback: opens the "Sequence to Profile Alignment" dialog (new tree
+ * file) once both profiles are loaded; otherwise reports which one is
+ * missing.
+ */
+static void SeqPrfAlignWin (IteM item)
+{
+    if (profile1_empty)
+        error("Profile 1 not loaded");
+    else if (profile2_empty)
+        error("Profile 2 not loaded");
+    else
+        do_align_window(&salignw,&streetext,NEW,"Sequence to Profile Alignment",SeqPrfAlign);
+}
+
+/*
+ * Menu callback: opens the "Sequence to Profile Alignment from Tree" dialog
+ * (existing tree file) once both profiles are loaded; otherwise reports
+ * which one is missing.
+ */
+static void SeqPrfTreeAlignWin (IteM item)
+{
+    if (profile1_empty)
+        error("Profile 1 not loaded");
+    else if (profile2_empty)
+        error("Profile 2 not loaded");
+    else
+        do_align_window(&salignw,&streetext,OLD,"Sequence to Profile Alignment from Tree",SeqPrfTreeAlign);
+}
+
+/*
+ * ALIGN action of the "Profile to Profile Alignment" dialog.  Reads both
+ * tree file names from the dialog, clears use_tree1_file / use_tree2_file,
+ * opens the output files, removes the dialog, runs profile_align() and
+ * reloads both profile panels.  Returns early when open_aln_files() fails.
+ */
+static void PrfPrfAlign(ButtoN but)
+{
+    char tree1_name[FILENAMELEN];
+    char tree2_name[FILENAMELEN];
+
+    /* tree file for profile 1 */
+    GetTitle(ptree1text, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(tree1_name,filename);
+    use_tree1_file = FALSE;
+
+    /* tree file for profile 2 */
+    GetTitle(ptree2text, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(tree2_name,filename);
+    use_tree2_file = FALSE;
+
+    if (!open_aln_files())
+        return;
+
+    WatchCursor();
+    if (Visible(palignw))
+    {
+        Remove(palignw);
+        palignw = NULL;
+    }
+
+    profile_align(tree1_name,tree2_name);
+
+    if (save_log && save_log_fd != NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd = NULL;
+    }
+
+    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+    ArrowCursor();
+}
+
+/*
+ * ALIGN action of the "Profile Alignment from Tree" dialog.  Reads both tree
+ * file names from the dialog, opens the output files, removes the dialog,
+ * runs profile_align() and reloads both profile panels.  Returns early when
+ * open_aln_files() fails.
+ *
+ * use_tree1_file / use_tree2_file are set from the current field contents:
+ * TRUE for a non-empty name, FALSE for an empty one.  Previously they were
+ * only ever set to TRUE here, so a TRUE left over from an earlier call
+ * survived even when the field had been emptied.
+ */
+static void PrfPrfTreeAlign(ButtoN but)
+{
+    char p1_tree_name[FILENAMELEN];
+    char p2_tree_name[FILENAMELEN];
+
+    GetTitle(ptree1text, filename, FILENAMELEN);
+    stripspace(filename);
+    if(filename[0]!=EOS)
+        use_tree1_file=TRUE;
+    else
+        use_tree1_file=FALSE;
+    strcpy(p1_tree_name,filename);
+
+    GetTitle(ptree2text, filename, FILENAMELEN);
+    stripspace(filename);
+    if(filename[0]!=EOS)
+        use_tree2_file=TRUE;
+    else
+        use_tree2_file=FALSE;
+    strcpy(p2_tree_name,filename);
+
+    if (!open_aln_files()) return;
+
+    WatchCursor();
+    if (Visible(palignw))
+    {
+        Remove(palignw);
+        palignw=NULL;
+    }
+    profile_align(p1_tree_name,p2_tree_name);
+    if(save_log && save_log_fd!=NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd=NULL;
+    }
+    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+    ArrowCursor();
+}
+
+/*
+ * ALIGN action of the "Sequence to Profile Alignment" dialog.  Reads the
+ * tree file name from streetext, clears use_tree_file, opens the output
+ * files, removes the dialog, runs new_sequence_align() and reloads both
+ * profile panels.  Returns early when open_aln_files() fails.
+ */
+static void SeqPrfAlign(ButtoN but)
+{
+    char tree_name[FILENAMELEN];
+
+    GetTitle(streetext, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(tree_name,filename);
+    use_tree_file = FALSE;
+
+    if (!open_aln_files())
+        return;
+
+    WatchCursor();
+    if (Visible(salignw))
+    {
+        Remove(salignw);
+        salignw = NULL;
+    }
+
+    new_sequence_align(tree_name);
+
+    if (save_log && save_log_fd != NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd = NULL;
+    }
+
+    /* bring the arrays into output order (INPUT or ALIGNED) */
+    reload_alignment();
+
+    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+    ArrowCursor();
+}
+
+/*
+ * ALIGN action of the "Sequence to Profile Alignment from Tree" dialog.
+ * Reads the tree file name from streetext, sets use_tree_file, opens the
+ * output files, removes the dialog, runs new_sequence_align() and reloads
+ * both profile panels.  Returns early when open_aln_files() fails.
+ */
+static void SeqPrfTreeAlign(ButtoN but)
+{
+    char tree_name[FILENAMELEN];
+
+    GetTitle(streetext, filename, FILENAMELEN);
+    stripspace(filename);
+    strcpy(tree_name,filename);
+    use_tree_file = TRUE;
+
+    if (!open_aln_files())
+        return;
+
+    WatchCursor();
+    if (Visible(salignw))
+    {
+        Remove(salignw);
+        salignw = NULL;
+    }
+
+    new_sequence_align(tree_name);
+
+    if (save_log && save_log_fd != NULL)
+    {
+        fclose(save_log_fd);
+        save_log_fd = NULL;
+    }
+
+    /* bring the arrays into output order (INPUT or ALIGNED) */
+    reload_alignment();
+
+    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+    ArrowCursor();
+}
+/*
+ * Permutes the clustal name/title/length/sequence arrays in place so that
+ * position i holds what used to be at output_index[i], then resets
+ * output_index to the identity and sets ncutseqs to 0.
+ * Nothing is done when no sequences are loaded or when the output order is
+ * INPUT.
+ */
+void reload_alignment(void)
+{
+    int row, col, src;
+    sint *len_copy;
+    char **seq_copy;
+    char **name_copy, **title_copy;
+
+    if (nseqs == 0 || output_order == INPUT)
+        return;
+
+    name_copy = (char **)ckalloc((nseqs+2) * sizeof(char *));
+    title_copy = (char **)ckalloc((nseqs+2) * sizeof(char *));
+    seq_copy = (char **)ckalloc((nseqs+2) * sizeof(char *));
+    len_copy = (sint *)ckalloc((nseqs+2) * sizeof(sint));
+
+    /* pass 1: snapshot every sequence in its output position */
+    for (row = 1; row <= nseqs; row++)
+    {
+        src = output_index[row];
+        name_copy[row] = (char *)ckalloc((MAXNAMES+2)*sizeof(char));
+        title_copy[row] = (char *)ckalloc((MAXTITLES+2)*sizeof(char));
+        seq_copy[row] = (char *)ckalloc((seqlen_array[src]+2)*sizeof(char));
+
+        strcpy(name_copy[row],names[src]);
+        strcpy(title_copy[row],titles[src]);
+        len_copy[row] = seqlen_array[src];
+        for (col = 1; col <= seqlen_array[src]; col++)
+            seq_copy[row][col] = seq_array[src][col];
+    }
+
+    /* pass 2: write the snapshot back and reset the index */
+    for (row = 1; row <= nseqs; row++)
+    {
+        strcpy(names[row],name_copy[row]);
+        strcpy(titles[row],title_copy[row]);
+        seqlen_array[row] = len_copy[row];
+        realloc_seq(row,seqlen_array[row]);
+        for (col = 1; col <= seqlen_array[row]; col++)
+            seq_array[row][col] = seq_copy[row][col];
+        output_index[row] = row;
+    }
+
+    /* release the snapshot */
+    for (row = 1; row <= nseqs; row++)
+    {
+        ckfree(seq_copy[row]);
+        ckfree(title_copy[row]);
+        ckfree(name_copy[row]);
+    }
+    ckfree(len_copy);
+    ckfree(seq_copy);
+    ckfree(title_copy);
+    ckfree(name_copy);
+
+    ncutseqs = 0;
+}
+
+/*
+ * Menu callback: build and show the "Low-Scoring Segment Parameters" window.
+ *
+ * The window holds a minimum-segment-length scale, a DNA marking scale
+ * (disabled unless dnaflag is set), and a protein and a DNA weight-matrix
+ * radio group, each followed by a "Load ... matrix" button and a prompt
+ * showing the current matrix name.  Widgets needed later are stored in
+ * variables declared outside this function: length_cutofftext,
+ * segmentdnascaletext, seg_matrix_list, segmentmattext, seg_dnamatrix_list
+ * and segmentdnamattext.  The item argument is not used.
+ */
+static void SegmentWin(IteM item)
+{
+    WindoW w;
+    GrouP maing;
+    ButtoN closeb;
+    GrouP mat_list;
+    ButtoN matrixb[5];
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    w=FixedWindow(-50, -33, -10, -10, "Low-Scoring Segment Parameters",RemoveWin);
+    maing=HiddenGroup(w,0,0,NULL);
+    SetGroupSpacing(maing,20,10);
+    closeb=PushButton(maing, "CLOSE", CancelWin);
+    Break(maing);
+
+    length_cutofftext=make_scale(maing,"Minimum Length of Segments:",9,length_cutoff,19,set_lengthcutoff);
+    Break(maing);
+
+    segmentdnascaletext=make_scale(maing,"DNA Marking Scale:",9,segment_dnascale,9,set_segment_dnascale);
+    if(!dnaflag) Disable(segmentdnascaletext);
+    Break(maing);
+
+    /* protein weight matrix choice */
+    mat_list=NormalGroup(maing,4,0,"Protein Weight Matrix",systemFont,set_segment_matrix);
+    matrixb[0]=RadioButton(mat_list,"Gonnet PAM 80");
+    matrixb[1]=RadioButton(mat_list,"Gonnet PAM 120");
+    matrixb[2]=RadioButton(mat_list,"Gonnet PAM 250");
+    matrixb[3]=RadioButton(mat_list,"Gonnet PAM 350");
+    matrixb[4]=RadioButton(mat_list,"User defined");
+    SetValue(mat_list,segment_matnum);
+    seg_matrix_list=mat_list;
+    Break(maing);
+    PushButton(maing, "Load protein matrix: ", set_segment_user_matrix);
+    Advance(maing);
+    segmentmattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+    SetTitle(segmentmattext,segment_mtrxname);
+    Break(maing);
+
+    /* DNA weight matrix choice; matrixb[] slots are reused for the new buttons */
+    mat_list=NormalGroup(maing,4,0,"DNA Weight Matrix",systemFont,set_segment_dnamatrix);
+    matrixb[0]=RadioButton(mat_list,"IUB");
+    matrixb[1]=RadioButton(mat_list,"CLUSTALW(1.6)");
+    matrixb[2]=RadioButton(mat_list,"User defined");
+    SetValue(mat_list,segment_dnamatnum);
+    /* Record the group created just above.  This previously stored
+       matrix_list, which is not the radio group built in this window. */
+    seg_dnamatrix_list=mat_list;
+    Break(maing);
+    PushButton(maing, "Load DNA matrix: ", set_segment_user_dnamatrix);
+    Advance(maing);
+    segmentdnamattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+    SetTitle(segmentdnamattext,segment_dnamtrxname);
+    Break(maing);
+
+    Show(w);
+}
+
+static void ScoreWin(IteM item) /* Menu callback: build and show the "Score Parameters" window.
+ * Creates two scales (score plot scale and residue exception cutoff) and a
+ * protein and a DNA weight-matrix radio group, each followed by a
+ * "Load ... matrix" button and a prompt showing the current matrix name.
+ * The created widgets are stored in variables declared outside this
+ * function (scorescaletext, residue_cutofftext, score_matrix_list,
+ * scoremattext, score_dnamatrix_list, scorednamattext).
+ * The item argument is not used. */
+{
+ WindoW w;
+ GrouP maing;
+ ButtoN closeb;
+ GrouP mat_list;
+ PopuP show_exceptions; /* never referenced in this function */
+ ButtoN matrixb[6]; /* button handles are stored but not read again here */
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ w=FixedWindow(-50, -33, -10, -10, "Score Parameters",RemoveWin);
+ maing=HiddenGroup(w,0,0,NULL);
+ SetGroupSpacing(maing,20,10);
+ closeb=PushButton(maing, "CLOSE", CancelWin);
+ Break(maing);
+
+
+/* add a scale to set the scaling value for the alignment scoring function */
+ scorescaletext=make_scale(maing,"Score Plot Scale:",9,score_scale,9,set_scorescale);
+ Break(maing);
+
+ residue_cutofftext=make_scale(maing,"Residue Exception Cutoff:",9,score_cutoff,9,set_scorecutoff);
+ Break(maing);
+
+ mat_list=NormalGroup(maing,4,0,"Protein Weight Matrix",systemFont,set_score_matrix); /* protein matrix radio group */
+ matrixb[0]=RadioButton(mat_list,"Identity");
+ matrixb[1]=RadioButton(mat_list,"Gonnet PAM 80");
+ matrixb[2]=RadioButton(mat_list,"Gonnet PAM 120");
+ matrixb[3]=RadioButton(mat_list,"Gonnet PAM 250");
+ matrixb[4]=RadioButton(mat_list,"Gonnet PAM 350");
+ matrixb[5]=RadioButton(mat_list,"User defined");
+ SetValue(mat_list,score_matnum); /* preselect the current choice */
+ score_matrix_list=mat_list;
+ Break(maing);
+ PushButton(maing, "Load protein matrix: ", set_score_user_matrix);
+ Advance(maing);
+ scoremattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+ SetTitle(scoremattext,score_mtrxname);
+ Break(maing);
+
+ mat_list=NormalGroup(maing,4,0,"DNA Weight Matrix",systemFont,set_score_dnamatrix); /* DNA matrix radio group; mat_list and matrixb[] are reused */
+ matrixb[0]=RadioButton(mat_list,"IUB");
+ matrixb[1]=RadioButton(mat_list,"CLUSTALW(1.6)");
+ matrixb[2]=RadioButton(mat_list,"User defined");
+ SetValue(mat_list,score_dnamatnum);
+ score_dnamatrix_list=mat_list;
+ Break(maing);
+ PushButton(maing, "Load DNA matrix: ", set_score_user_dnamatrix);
+ Advance(maing);
+ scorednamattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+ SetTitle(scorednamattext,score_dnamtrxname);
+ Break(maing);
+
+ Show (w);
+}
+
+
+static void PWParameters(IteM item) /* Menu callback: build and show the "Pairwise Parameters" window.
+ * First copies either the DNA or the protein settings (chosen by dnaflag)
+ * into the working parameter variables, then builds two parameter groups:
+ * slow_para (gap penalties plus protein/DNA weight matrices) and fast_para
+ * (gap penalty, k-tuple size, top diagonals, window size).  Both groups are
+ * placed at the same position and only the one matching quick_pairalign is
+ * left visible.  The item argument is not used. */
+{
+ int i;
+ WindoW w;
+ PoinT pt;
+ GrouP maing;
+ ButtoN closeb;
+ TexT go_scale,ge_scale;
+ TexT gp_scale,ktuple_scale,topdiags_scale,window_scale;
+ PopuP fs_toggle;
+ GrouP mat_list;
+ ButtoN matrixb[5]; /* NOTE(review): filled by loops bounded by the menus' noptions; assumes noptions <= 5 - confirm */
+ char str[FILENAMELEN];
+
+ if(dnaflag) { /* DNA: load the dna_* settings into the working variables */
+ gap_open = dna_gap_open;
+ gap_extend = dna_gap_extend;
+ pw_go_penalty = dna_pw_go_penalty;
+ pw_ge_penalty = dna_pw_ge_penalty;
+ ktup = dna_ktup;
+ window = dna_window;
+ signif = dna_signif;
+ wind_gap = dna_wind_gap;
+
+ }
+ else { /* otherwise load the prot_* settings */
+ gap_open = prot_gap_open;
+ gap_extend = prot_gap_extend;
+ pw_go_penalty = prot_pw_go_penalty;
+ pw_ge_penalty = prot_pw_ge_penalty;
+ ktup = prot_ktup;
+ window = prot_window;
+ signif = prot_signif;
+ wind_gap = prot_wind_gap;
+
+ }
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ w=FixedWindow(-50, -33, -10, -10, "Pairwise Parameters",RemoveWin);
+ maing=HiddenGroup(w,0,0,NULL);
+ SetGroupSpacing(maing,0,10);
+ closeb=PushButton(maing, "CLOSE", CancelWin);
+ Break(maing);
+ fs_toggle=make_toggle(maing,"Pairwise Alignments :","Fast-Approximate","Slow-Accurate",&quick_pairalign,set_fs_toggle);
+ Break(maing);
+
+ GetNextPosition(maing,&pt); /* remember where the slow group starts so the fast group can reuse the spot */
+ slow_para=NormalGroup(maing,0,0,"Pairwise Parameters",systemFont,NULL);
+ SetGroupSpacing(slow_para,0,10);
+
+ make_prompt(slow_para, "Gap Opening [0-100] :");
+ Advance(slow_para);
+ sprintf(str,"%.2f",pw_go_penalty);
+ go_scale=DialogText(slow_para, str, 5, set_pw_go_penalty);
+ Break(slow_para);
+
+ make_prompt(slow_para, "Gap Extension [0-100] :");
+ Advance(slow_para);
+ sprintf(str,"%.2f",pw_ge_penalty);
+ ge_scale=DialogText(slow_para, str, 5, set_pw_ge_penalty);
+ Break(slow_para);
+ mat_list=NormalGroup(slow_para,4,0,"Protein Weight Matrix",systemFont,set_pw_matrix);
+ for(i=0;i<pw_matrix_menu.noptions;i++) /* one radio button per entry of pw_matrix_menu */
+ matrixb[i]=RadioButton(mat_list,pw_matrix_menu.opt[i].title);
+ SetValue(mat_list,pw_matnum);
+ pw_matrix_list=mat_list;
+ Break(slow_para);
+ PushButton(slow_para, "Load protein matrix: ", set_pw_user_matrix);
+ Advance(slow_para);
+ pwmattext=StaticPrompt(slow_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+ SetTitle(pwmattext,pw_usermtrxname);
+
+ Break(slow_para);
+ mat_list=NormalGroup(slow_para,4,0,"DNA Weight Matrix",systemFont,set_pw_dnamatrix);
+ for(i=0;i<dnamatrix_menu.noptions;i++) /* one radio button per entry of dnamatrix_menu */
+ matrixb[i]=RadioButton(mat_list,dnamatrix_menu.opt[i].title);
+ SetValue(mat_list,pw_dnamatnum);
+ pw_dnamatrix_list=mat_list;
+ Break(slow_para);
+ PushButton(slow_para, "Load DNA matrix: ", set_pw_user_dnamatrix);
+ Advance(slow_para);
+ pwdnamattext=StaticPrompt(slow_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+ SetTitle(pwdnamattext,pw_dnausermtrxname);
+
+ Break(slow_para);
+
+
+/* fast parameters */
+ SetNextPosition(maing,pt); /* place the fast group at the position saved above */
+ fast_para=NormalGroup(maing,2,0,"Pairwise Parameters",systemFont,NULL);
+ SetGroupSpacing(fast_para,0,10);
+ make_prompt(fast_para, "Gap Penalty [1-500]:");
+ sprintf(str,"%d",wind_gap);
+ gp_scale=DialogText(fast_para, str, 3, set_gp);
+ make_prompt(fast_para, "K-Tuple Size [1-2]:");
+ sprintf(str,"%d",ktup);
+ ktuple_scale=DialogText(fast_para, str, 1, set_ktuple);
+ make_prompt(fast_para, "Top Diagonals [1-50]:");
+ sprintf(str,"%d",signif);
+ topdiags_scale=DialogText(fast_para, str, 2, set_topdiags);
+ make_prompt(fast_para, "Window Size [1-50]:");
+ sprintf(str,"%d",window);
+ window_scale=DialogText(fast_para, str, 2, set_window);
+
+ if (quick_pairalign) /* show only the group matching the current pairwise method */
+ {
+ Hide(slow_para);
+ Show(fast_para);
+ }
+ else
+ {
+ Hide(fast_para);
+ Show(slow_para);
+ }
+
+ Break(maing);
+
+ Show (w);
+}
+
+
+/*
+ * Menu callback: build and show the "Alignment Parameters" window.
+ *
+ * First copies either the DNA or the protein settings (chosen by dnaflag)
+ * into the working parameter variables.  The window then offers text fields
+ * for gap opening, gap extension, the divergent-sequence delay and the DNA
+ * transition weight, a toggle for the negative matrix, and protein and DNA
+ * weight-matrix radio groups, each with a "Load ..." button and a prompt
+ * showing the current user matrix name.  matrix_list, mattext,
+ * dnamatrix_list and dnamattext (declared outside this function) receive
+ * the created widgets.  The item argument is not used.
+ */
+static void MultiParameters(IteM item)
+{
+    int i;
+    WindoW w;
+    GrouP maing;
+    ButtoN closeb;
+    TexT go_scale,ge_scale;
+    GrouP mat_list;
+    ButtoN matrixb[5];   /* assumes each matrix menu has at most 5 options - TODO confirm */
+    GrouP multi_para;
+    TexT div_seq;
+    TexT transitions;
+    char str[FILENAMELEN];
+
+    if(dnaflag) {
+        /* DNA: load the dna_* settings into the working variables */
+        gap_open = dna_gap_open;
+        gap_extend = dna_gap_extend;
+        pw_go_penalty = dna_pw_go_penalty;
+        pw_ge_penalty = dna_pw_ge_penalty;
+        ktup = dna_ktup;
+        window = dna_window;
+        signif = dna_signif;
+        wind_gap = dna_wind_gap;
+    }
+    else {
+        /* otherwise load the prot_* settings */
+        gap_open = prot_gap_open;
+        gap_extend = prot_gap_extend;
+        pw_go_penalty = prot_pw_go_penalty;
+        pw_ge_penalty = prot_pw_ge_penalty;
+        ktup = prot_ktup;
+        window = prot_window;
+        signif = prot_signif;
+        wind_gap = prot_wind_gap;
+    }
+
+    SelectFont(systemFont);
+    stdCharWidth=CharWidth('A');
+    stdLineHeight=LineHeight();
+    w=FixedWindow(-50, -33, -10, -10, "Alignment Parameters",RemoveWin);
+    maing=HiddenGroup(w,0,0,NULL);
+    SetGroupSpacing(maing,0,10);
+    closeb=PushButton(maing, "CLOSE", CancelWin);
+    Break(maing);
+
+/* multiple alignment parameters */
+
+    multi_para=NormalGroup(maing,0,0,"Multiple Parameters",systemFont,NULL);
+    SetGroupSpacing(multi_para,0,10);
+    make_prompt(multi_para, "Gap Opening [0-100] :");
+    Advance(multi_para);
+    sprintf(str,"%.2f",gap_open);
+    go_scale=DialogText(multi_para, str, 5, set_go_penalty);
+    Advance(multi_para);
+    /* label spelling corrected from "Extention"; it now matches the
+       "Gap Extension" label used in the pairwise parameters window */
+    make_prompt(multi_para, "Gap Extension [0-100] :");
+    Advance(multi_para);
+    sprintf(str,"%.2f",gap_extend);
+    ge_scale=DialogText(multi_para, str, 5, set_ge_penalty);
+    Break(multi_para);
+    make_prompt(multi_para, "Delay Divergent Sequences (%) :");
+    Advance(multi_para);
+    sprintf(str,"%d",divergence_cutoff);
+    div_seq=DialogText(multi_para, str, 3, set_div_seq);
+    Break(multi_para);
+
+    make_prompt(multi_para, "DNA Transition Weight [0-1] :");
+    Advance(multi_para);
+    sprintf(str,"%.2f",transition_weight);
+    transitions=DialogText(multi_para, str, 5, set_transitions);
+    Break(multi_para);
+
+    /* the returned popup handle is not needed afterwards */
+    make_toggle(multi_para,"Use Negative Matrix","ON","OFF",&neg_matrix,set_neg_matrix);
+
+    Break(multi_para);
+    /* protein weight matrix: one radio button per entry of matrix_menu */
+    mat_list=NormalGroup(multi_para,2,0,"Protein Weight Matrix",systemFont,set_matrix);
+    for(i=0;i<matrix_menu.noptions;i++)
+        matrixb[i]=RadioButton(mat_list,matrix_menu.opt[i].title);
+    SetValue(mat_list,matnum);
+    matrix_list=mat_list;
+    Break(multi_para);
+    PushButton(multi_para, "Load protein matrix: ", set_user_matrix);
+    Advance(multi_para);
+    mattext=StaticPrompt(multi_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+    SetTitle(mattext,usermtrxname);
+
+    Break(multi_para);
+    /* DNA weight matrix: one radio button per entry of dnamatrix_menu */
+    mat_list=NormalGroup(multi_para,2,0,"DNA Weight Matrix",systemFont,set_dnamatrix);
+    for(i=0;i<dnamatrix_menu.noptions;i++)
+        matrixb[i]=RadioButton(mat_list,dnamatrix_menu.opt[i].title);
+    SetValue(mat_list,dnamatnum);
+    dnamatrix_list=mat_list;
+    Break(multi_para);
+    PushButton(multi_para, "Load DNA: ", set_user_dnamatrix);
+    Advance(multi_para);
+    dnamattext=StaticPrompt(multi_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
+    SetTitle(dnamattext,dnausermtrxname);
+    Show (w);
+}
+
+static void GapParameters(IteM item) /* Menu callback: build and show the "Protein Gap Parameters" window.
+ * Adds toggles for residue-specific and hydrophilic penalties, a text field
+ * for the hydrophilic residue list, a text field for the gap separation
+ * distance and a toggle for end gap separation.  Each control is given its
+ * own set_* callback.  The item argument is not used. */
+{
+ WindoW gapparaw;
+ GrouP maing;
+ ButtoN closeb;
+ PopuP rp_toggle,vp_toggle,hp_toggle,end_gap_toggle; /* vp_toggle is never assigned or used here */
+ TexT gdist,hyd_text;
+ char str[80];
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ gapparaw=FixedWindow(-50, -33, -10, -10, "Protein Gap Parameters",RemoveWin);
+ maing=HiddenGroup(gapparaw,0,0,NULL);
+ SetGroupSpacing(maing,0,10);
+ closeb=PushButton(maing, "CLOSE", CancelWin);
+ Break(maing);
+ rp_toggle=make_toggle(maing,"Residue-specific Penalties","OFF","ON",&no_pref_penalties,set_pref_penalties); /* the flag is a "no_..." value, so a TRUE flag selects "OFF" */
+ Break(maing);
+ hp_toggle=make_toggle(maing,"Hydrophilic Penalties","OFF","ON",&no_hyd_penalties,set_hyd_penalties); /* same inverted sense as above */
+ Break(maing);
+ make_prompt(maing, "Hydrophilic Residues :");
+ Advance(maing);
+ hyd_text=DialogText(maing, hyd_residues, 20, set_hyd_res);
+ Break(maing);
+ make_prompt(maing, "Gap Separation Distance [0-100] :");
+ Advance(maing);
+ sprintf(str,"%d",gap_dist); /* show the current value as the initial field text */
+ gdist=DialogText(maing, str, 3, set_gap_dist);
+ Break(maing);
+ end_gap_toggle=make_toggle(maing,"End Gap Separation","ON","OFF",&use_endgaps,set_endgaps);
+
+ Show (gapparaw);
+}
+
+static void SSParameters(IteM item) /* Menu callback: build and show the "Secondary Structure Options" window.
+ * Adds toggles for using the profile 1 / profile 2 structure masks, two
+ * output check boxes whose initial state is derived from
+ * output_struct_penalties, and one-character text fields for the helix,
+ * strand, loop and terminal penalties and the terminal position counts.
+ * Also sets the variables output_ss and output_gp as a side effect.
+ * The item argument is not used. */
+{
+ WindoW ssparaw;
+ GrouP maing;
+ ButtoN closeb;
+ PopuP use_p1,use_p2;
+ TexT helix_gp,strand_gp,loop_gp,terminal_gp,helix_minus,helix_plus;
+ TexT strand_minus,strand_plus;
+ GrouP output_list;
+ ButtoN outputb[4]; /* only slots 0 and 1 are used */
+ char str[80];
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ ssparaw=FixedWindow(-50, -33, -10, -10, "Secondary Structure Options",RemoveWin);
+ closeb=PushButton(ssparaw, "CLOSE", CancelWin);
+ Break(ssparaw);
+
+ use_p1=make_toggle(ssparaw,"Use profile 1 secondary structure / penalty mask","YES","NO",&use_ss1,set_use_ss1);
+ Break(ssparaw);
+ use_p2=make_toggle(ssparaw,"Use profile 2 secondary structure / penalty mask","YES","NO",&use_ss2,set_use_ss2);
+
+ Break(ssparaw);
+ output_list=NormalGroup(ssparaw,2,0,"Output ",systemFont,NULL)
+;
+ outputb[0]=CheckBox(output_list,"Secondary Structure",set_ss_output);
+ if(output_struct_penalties==0 || output_struct_penalties==2) /* values 0 and 2 turn the structure output on */
+ output_ss=TRUE;
+ else
+ output_ss=FALSE;
+ SetStatus(outputb[0],output_ss);
+ outputb[1]=CheckBox(output_list,"Gap Penalty Mask",set_gp_output);
+ if(output_struct_penalties==1 || output_struct_penalties==2) /* values 1 and 2 turn the gap mask output on */
+ output_gp=TRUE;
+ else
+ output_gp=FALSE;
+ SetStatus(outputb[1],output_gp);
+
+ Break(ssparaw);
+ maing=HiddenGroup(ssparaw,2,0,NULL); /* prompt / text field pairs follow; each field starts with the current value */
+ SetGroupSpacing(maing,0,10);
+ make_prompt(maing, "Helix Gap Penalty [0-9] :");
+ sprintf(str,"%d",helix_penalty);
+ helix_gp=DialogText(maing, str, 1, set_helix_gp);
+ make_prompt(maing, "Strand Gap Penalty [0-9] :");
+ sprintf(str,"%d",strand_penalty);
+ strand_gp=DialogText(maing, str, 1, set_strand_gp);
+ make_prompt(maing, "Loop Gap Penalty [0-9] :");
+ sprintf(str,"%d",loop_penalty);
+ loop_gp=DialogText(maing, str, 1, set_loop_gp);
+ make_prompt(maing, "Secondary Structure Terminal Penalty [0-9] :");
+ sprintf(str,"%d",helix_end_penalty);
+ terminal_gp=DialogText(maing, str, 1, set_terminal_gp);
+ make_prompt(maing, "Helix Terminal Positions [0-3] within:");
+ sprintf(str,"%d",helix_end_minus);
+ helix_minus=DialogText(maing, str, 1, set_helix_minus);
+ make_prompt(maing, "outside:");
+ sprintf(str,"%d",helix_end_plus);
+ helix_plus=DialogText(maing, str, 1, set_helix_plus);
+ make_prompt(maing, "Strand Terminal Penalty [0-3] within:");
+ sprintf(str,"%d",strand_end_minus);
+ strand_minus=DialogText(maing, str, 1, set_strand_minus);
+ make_prompt(maing, "outside:");
+ sprintf(str,"%d",strand_end_plus);
+ strand_plus=DialogText(maing, str, 1, set_strand_plus);
+
+
+ Show (ssparaw);
+}
+
+static void OutputParameters(IteM item) /* Menu callback: build and show the "Output Format Options" window.
+ * Adds one check box per alignment output format, ticked when the matching
+ * output_* flag is set, followed by toggles for GDE case, CLUSTALW sequence
+ * numbers and parameter output, and a popup for the output order.
+ * The item argument is not used. */
+{
+ WindoW outputparaw;
+ GrouP maing;
+ ButtoN closeb;
+ GrouP output_list;
+ ButtoN outputb[6+1]; /* +1 for fasta */
+ PopuP order_toggle,para_toggle;
+ PopuP case_toggle,snos_toggle;
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ outputparaw=FixedWindow(-50, -33, -10, -10, "Output Format Options",RemoveWin);
+ closeb=PushButton(outputparaw, "CLOSE", CancelWin);
+ Break(outputparaw);
+ output_list=NormalGroup(outputparaw,2,0,"Output Files",systemFont,NULL);
+
+ SelectFont(systemFont);
+ outputb[0]=CheckBox(output_list,"CLUSTAL format",set_output_clustal); /* each box below is ticked only if its flag is currently set */
+ if(output_clustal) SetStatus(outputb[0],TRUE);
+
+ outputb[1]=CheckBox(output_list,"NBRF/PIR format",set_output_nbrf);
+ if(output_nbrf) SetStatus(outputb[1],TRUE);
+
+ outputb[2]=CheckBox(output_list,"GCG/MSF format",set_output_gcg);
+ if(output_gcg) SetStatus(outputb[2],TRUE);
+
+ outputb[3]=CheckBox(output_list,"PHYLIP format",set_output_phylip);
+ if(output_phylip) SetStatus(outputb[3],TRUE);
+
+ outputb[4]=CheckBox(output_list,"GDE format",set_output_gde);
+ if(output_gde) SetStatus(outputb[4],TRUE);
+
+ outputb[5]=CheckBox(output_list,"NEXUS format",set_output_nexus);
+ if(output_nexus) SetStatus(outputb[5],TRUE);
+
+/* Ramu */
+ outputb[6]=CheckBox(output_list,"FASTA format",set_output_fasta);
+ if(output_fasta) SetStatus(outputb[6],TRUE);
+/* Ramu */
+
+ maing=HiddenGroup(outputparaw,2,0,NULL);
+ SetGroupSpacing(maing,0,10);
+
+
+ case_toggle=make_toggle(maing,"GDE output case :","Lower","Upper",&lowercase,set_case);
+ snos_toggle=make_toggle(maing,"CLUSTALW sequence numbers :","ON","OFF",&cl_seq_numbers,set_snos);
+ make_prompt(maing, "Output order");
+ order_toggle=PopupList(maing,TRUE,set_output_order);
+ PopupItem(order_toggle,"INPUT");
+ PopupItem(order_toggle,"ALIGNED");
+ if (output_order == INPUT) /* preselect the entry matching output_order; neither is selected for any other value */
+ SetValue(order_toggle,1);
+ else if (output_order == ALIGNED)
+ SetValue(order_toggle,2);
+ para_toggle=make_toggle(maing,"Parameter output","ON","OFF",&save_parameters,set_save_paras);
+
+ Show (outputparaw);
+}
+
+
+static void OutputTreeParameters(IteM item) /* Menu callback: build and show the "Output Tree Format Options" window.
+ * Adds one check box per tree output file, ticked when the matching
+ * output_* flag is set, and a popup choosing whether bootstrap labels go on
+ * nodes or branches.  The item argument is not used. */
+{
+ WindoW outputtreeparaw;
+ ButtoN closeb;
+ GrouP output_list;
+ ButtoN outputb[5]; /* ButtoN outputb[3]; Ramu, crash????; 5th one for perc_ident mat */
+ PopuP boot_format;
+
+ SelectFont(systemFont);
+ stdCharWidth=CharWidth('A');
+ stdLineHeight=LineHeight();
+ outputtreeparaw=FixedWindow(-50, -33, -10, -10, "Output Tree Format Options",RemoveWin);
+ closeb=PushButton(outputtreeparaw, "CLOSE", CancelWin);
+ Break(outputtreeparaw);
+ output_list=NormalGroup(outputtreeparaw,2,0,"Output Files",systemFont,NULL);
+ outputb[0]=CheckBox(output_list,"CLUSTAL format tree",set_output_tree_clustal); /* each box below is ticked only if its flag is currently set */
+ if(output_tree_clustal) SetStatus(outputb[0],TRUE);
+
+ outputb[1]=CheckBox(output_list,"Phylip format tree",set_output_tree_phylip);
+ if(output_tree_phylip) SetStatus(outputb[1],TRUE);
+
+ outputb[2]=CheckBox(output_list,"Phylip distance matrix",set_output_tree_distances);
+ if(output_tree_distances) SetStatus(outputb[2],TRUE);
+
+ outputb[3]=CheckBox(output_list,"Nexus format tree",set_output_tree_nexus);
+ if(output_tree_nexus) SetStatus(outputb[3],TRUE);
+
+
+ outputb[4]=CheckBox(output_list,"% identity matrix",set_output_pim); /* pim? perc ident matr */
+ if(output_pim) SetStatus(outputb[4],TRUE);
+
+ Break(outputtreeparaw);
+ make_prompt(outputtreeparaw, "Bootstrap labels on:");
+ Advance(outputtreeparaw);
+ boot_format=PopupList(outputtreeparaw,TRUE,set_boot_format);
+ PopupItem(boot_format ,"NODE");
+ PopupItem(boot_format ,"BRANCH");
+ if (bootstrap_format == BS_NODE_LABELS) /* preselect the entry matching bootstrap_format */
+ SetValue(boot_format ,1);
+ if (bootstrap_format == BS_BRANCH_LABELS)
+ SetValue(boot_format ,2);
+ Show (outputtreeparaw);
+}
+
+/*
+ * Add a static text prompt showing title to group g.
+ * Returns the new prompt, or NULL (creating nothing) when title is NULL.
+ */
+static PrompT make_prompt(GrouP g,CharPtr title)
+{
+    if (title == NULL)
+        return NULL;
+
+    return StaticPrompt(g, title, 0, dialogTextHeight, systemFont, 'l');
+}
+
+/*
+ * Add a labelled scroll bar to group g.
+ *
+ * The label is "<title> <value>" with the value right-aligned in 3 columns.
+ * The bar is created with the given length and SetProc callback, its page
+ * settings are set to 1, its maximum to max and its position to value.
+ * Returns the label prompt (not the bar).
+ */
+static PrompT make_scale(GrouP g,CharPtr title,int length, int value,int max,BarScrlProc SetProc)
+{
+    char label[FILENAMELEN];
+    PrompT prompt;
+    BaR bar;
+
+    sprintf(label, "%s %3d", title, value);
+    prompt = make_prompt(g, label);
+    Advance(g);
+
+    bar = ScrollBar(g, length, -1, SetProc);
+    CorrectBarPage(bar, (Int4)1, (Int4)1);
+    CorrectBarMax(bar, (Int4)max);
+    CorrectBarValue(bar, (Int4)value);
+
+    return prompt;
+}
+
+/*
+ * Add a two-entry popup to group g, preceded by a prompt when title is
+ * not NULL.  Entry 1 is true_text and entry 2 is false_text; entry 1 is
+ * preselected when *value is non-zero, otherwise entry 2.  SetProc is
+ * installed as the popup's callback.  Returns the popup.
+ */
+static PopuP make_toggle(GrouP g,CharPtr title,CharPtr true_text, CharPtr false_text, Boolean *value,PupActnProc SetProc)
+{
+    PopuP popup;
+
+    if (title != NULL)
+    {
+        make_prompt(g, title);
+    }
+    Advance(g);
+
+    popup = PopupList(g, TRUE, SetProc);
+    PopupItem(popup, true_text);
+    PopupItem(popup, false_text);
+    SetValue(popup, (*value) ? 1 : 2);
+
+    return popup;
+}
+
+
+/*
+ * Rearrange the main window for the current value of aln_mode.
+ *
+ * MULTIPLEM: hide the two profile displays, disable the menu items tagged
+ * PROFILEM, make seq_panel the active panel and reload it with all
+ * sequences.
+ * PROFILEM: hide the sequence display, treat every loaded sequence as
+ * profile 1 (profile 2 starts empty), disable the menu items tagged
+ * MULTIPLEM, derive the two profile names from the path of seqname and
+ * reload both profile panels.
+ * Any other value of aln_mode leaves everything untouched.
+ */
+void switch_mode(void)
+{
+    char dir[FILENAMELEN];
+
+    switch (aln_mode)
+    {
+    case MULTIPLEM:
+        Hide(prf1_display);
+        Hide(prf2_display);
+        Hide(pscrolltext);
+        SetValue(modetext, 1);
+        resize_multi_window();
+        profile_no = 0;
+
+        check_menus(file_item, PROFILEM);
+        check_menus(align_item, PROFILEM);
+        check_menus(edit_item, PROFILEM);
+        check_menus(tree_item, PROFILEM);
+        check_menus(color_item, PROFILEM);
+
+        active_panel = seq_panel;
+        fix_gaps();
+        load_aln_data(seq_panel, 0, nseqs-1, TRUE);
+        Show(seq_display);
+        break;
+
+    case PROFILEM:
+        Hide(seq_display);
+        resize_prf_window(nseqs, 0);
+        SetValue(modetext, 2);
+        profile_no = 1;
+
+        /* everything loaded so far becomes profile 1 */
+        profile1_nseqs = nseqs;
+        if (profile1_nseqs > 0)
+            profile1_empty = FALSE;
+        profile2_empty = TRUE;
+
+        check_menus(file_item, MULTIPLEM);
+        check_menus(align_item, MULTIPLEM);
+        check_menus(edit_item, MULTIPLEM);
+        check_menus(tree_item, MULTIPLEM);
+        check_menus(color_item, MULTIPLEM);
+
+        active_panel = prf_panel[0];
+
+        /* profile names are "<path>1." and "<path>2." */
+        get_path(seqname, dir);
+        strcpy(profile1_name, dir);
+        strcat(profile1_name, "1.");
+        strcpy(profile2_name, dir);
+        strcat(profile2_name, "2.");
+
+        fix_gaps();
+        load_aln_data(prf_panel[0], 0, profile1_nseqs-1, TRUE);
+        load_aln_data(prf_panel[1], profile1_nseqs, nseqs-1, TRUE);
+        Show(prf1_display);
+        Show(prf2_display);
+        Show(pscrolltext);
+        break;
+
+    default:
+        break;
+    }
+}
+
+
+static void make_menu_headers(WindoW w) /* Create the seven pulldown menus of window w (File, Edit,
+ * Alignment, Trees, Colors, Quality, Help) and store their handles in
+ * filem, editm, alignm, treem, colorm, scorem and helpmenu. */
+{
+ filem = PulldownMenu (w,"File/FFF");
+ editm = PulldownMenu (w,"Edit/EEE");
+ alignm = PulldownMenu (w,"Alignment/AAA");
+ treem = PulldownMenu(w,"Trees/TTT");
+ colorm = PulldownMenu(w,"Colors/CCC");
+ scorem = PulldownMenu(w,"Quality/UUU");
+ helpmenu = PulldownMenu(w,"Help/HHH");
+
+}
+
+static void make_file_menu(void) /* Fill the File menu (filem) and record every item in file_item.
+ * n is the running index of the next slot: for each item, mode[n] tags the
+ * alignment mode the item belongs to and i[n] receives the item handle.
+ * The final check_menus() call disables the items tagged with the mode
+ * that is not the current aln_mode. */
+{
+ int n=0;
+
+ file_item.mode[n] = MULTIPLEM;
+
+ file_item.i[n] = CommandItem (filem,"Load Sequences/OOO", OpenSeqFile); n++;
+
+ file_item.mode[n] = MULTIPLEM;
+ file_item.i[n] = CommandItem (filem,"Append Sequences", AppendSeqFile); n++;
+ file_item.mode[n] = MULTIPLEM;
+ file_item.i[n] = CommandItem (filem,"Save Sequences as.../SSS", SaveSeqFileWin); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Load Profile 1", OpenPrf1File); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Load Profile 2", OpenPrf2File); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Save Profile 1 as...", SavePrf1FileWin); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Save Profile 2 as...", SavePrf2FileWin); n++;
+ file_item.mode[n] = MULTIPLEM;
+ file_item.i[n] = CommandItem (filem,"Write Alignment as PostScript/PPP", SavePSSeqWin); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Write Profile 1 as PostScript", SavePSPrf1Win); n++;
+ file_item.mode[n] = PROFILEM;
+ file_item.i[n] = CommandItem (filem,"Write Profile 2 as PostScript", SavePSPrf2Win); n++;
+ file_item.i[n] = CommandItem (filem,"Quit/QQQ", QuitWinI); n++; /* no mode is assigned for Quit here */
+ file_item.num = n; /* number of items recorded */
+ if(aln_mode==MULTIPLEM)
+ check_menus(file_item,PROFILEM);
+ else
+ check_menus(file_item,MULTIPLEM);
+}
+
+/*
+ * "Quit" menu callback.  Before quitting, asks for confirmation about any
+ * modified data of the current mode: the alignment in MULTIPLEM mode, or
+ * profile 1 and then profile 2 in PROFILEM mode.  A "no" answer to any
+ * question cancels the quit.  The i argument is not used.
+ */
+static void QuitWinI (IteM i)
+{
+    if (aln_mode == MULTIPLEM)
+    {
+        if (seq_panel.modified
+            && Message(MSG_YN,"Alignment has not been saved.\n"
+                              "Quit program anyway?") == ANS_NO)
+            return;
+    }
+    else if (aln_mode == PROFILEM)
+    {
+        if (prf_panel[0].modified
+            && Message(MSG_YN,"Profile 1 has not been saved.\n"
+                              "Quit program anyway?") == ANS_NO)
+            return;
+
+        if (prf_panel[1].modified
+            && Message(MSG_YN,"Profile 2 has not been saved.\n"
+                              "Quit program anyway?") == ANS_NO)
+            return;
+    }
+
+    QuitProgram ();
+}
+
+
+
+static void make_score_menu(void) /* Fill the Quality menu (scorem) and record every item in score_item.
+ * The two status items start with the current values of segment_exceptions
+ * and residue_exceptions; the handle of the first one is also kept in
+ * segment_item.  Only the last item is given a mode tag, and check_menus()
+ * is not called from here. */
+{
+ int n=0;
+
+ score_item.i[n] = CommandItem (scorem,"Calculate Low-Scoring Segments", calc_segment_exceptions); n++;
+ segment_item=score_item.i[n]=StatusItem(scorem, "Show Low-Scoring Segments", set_show_segments);
+ SetStatus(score_item.i[n],segment_exceptions); n++;
+ score_item.i[n]=StatusItem(scorem, "Show Exceptional Residues", set_residue_exceptions);
+ SetStatus(score_item.i[n],residue_exceptions); n++;
+ score_item.i[n] = CommandItem (scorem,"Low-Scoring Segment Parameters",SegmentWin); n++;
+ score_item.i[n] = CommandItem (scorem,"Column Score Parameters",ScoreWin); n++;
+ score_item.mode[n] = MULTIPLEM;
+ score_item.i[n]=CommandItem(scorem, "Save Column Scores to File", SaveScoresWin); n++;
+
+
+ score_item.num = n; /* number of items recorded */
+}
+
+static void make_help_menu(void) /* Fill the Help menu (helpmenu) and record every item in help_item.
+ * For each entry, ptr[n] holds the single character that HelpProc compares
+ * with the token following a ">>HELP" marker in the help file, and i[n]
+ * holds the item handle.  All entries share the HelpProc callback. */
+{
+ int n=0;
+
+ help_item.ptr[n] = 'G';
+ help_item.i[n] = CommandItem (helpmenu,"General",HelpProc); n++;
+ help_item.ptr[n] = 'F';
+ help_item.i[n] = CommandItem (helpmenu,"Input & Output Files",HelpProc); n++;
+ help_item.ptr[n] = 'E';
+ help_item.i[n] = CommandItem (helpmenu,"Editing Alignments",HelpProc); n++;
+ help_item.ptr[n] = 'M';
+ help_item.i[n] = CommandItem (helpmenu,"Multiple Alignments",HelpProc); n++;
+ help_item.ptr[n] = 'P';
+ help_item.i[n] = CommandItem (helpmenu,"Profile Alignments",HelpProc); n++;
+ help_item.ptr[n] = 'B';
+ help_item.i[n] = CommandItem (helpmenu,"Secondary Structures",HelpProc); n++;
+ help_item.ptr[n] = 'T';
+ help_item.i[n] = CommandItem (helpmenu,"Trees",HelpProc); n++;
+ help_item.ptr[n] = 'C';
+ help_item.i[n] = CommandItem (helpmenu,"Colors",HelpProc); n++;
+ help_item.ptr[n] = 'Q';
+ help_item.i[n] = CommandItem (helpmenu,"Alignment Quality",HelpProc); n++;
+ help_item.ptr[n] = '9';
+ help_item.i[n] = CommandItem (helpmenu,"Command Line Parameters",HelpProc); n++;
+ help_item.ptr[n] = 'R';
+ help_item.i[n] = CommandItem (helpmenu,"References",HelpProc); n++;
+
+ help_item.num = n; /* number of items recorded */
+}
+
+/*
+ * Help menu callback: show the help section that belongs to item.
+ *
+ * The item is looked up in help_item to get its section character.  The
+ * help file is then scanned for a line containing ">>HELP" whose following
+ * token (first character from digits within columns 6-7 of the line)
+ * equals that character.  The lines after it, up to the next marker, are
+ * collected in helptext with trailing control/space characters removed and
+ * a platform line terminator appended; lines starting with '<' are skipped.
+ * The text is finally shown in a new help window.
+ *
+ * Errors (unknown item, help file not found or not readable, section not
+ * found, too many help windows) are reported through error() and the
+ * function returns without opening a window.
+ */
+static void HelpProc(IteM item)
+{
+    int n,index=-1;
+    FILE *fd;
+    int i;
+    Boolean found_help;
+    char temp[MAXLINE+1];
+    char token;
+    char *digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    char *help_marker = ">>HELP";
+    /* line terminator appended to every collected help line */
+#ifdef WIN_MAC
+    char *eol = "\r";
+#else
+#ifdef WIN32
+    char *eol = "\r\n";
+#else
+    char *eol = "\n";
+#endif
+#endif
+
+    TexT htext;
+    char *help_file = NULL;
+
+    extern char *help_file_name;
+
+    helptext[0]='\0';
+
+    /* which help menu entry was chosen? */
+    for(n=0;n<help_item.num;n++)
+        if (item==help_item.i[n])
+        {
+            index=n;
+            break;
+        }
+
+    if (index==-1)
+    {
+        error("Problem with HELP routines\n");
+        return;
+    }
+
+    help_file=find_file(help_file_name);
+    if(help_file==NULL) {
+        error("Cannot find help file");
+        return;
+    }
+
+    if((fd=fopen(help_file,"r"))==NULL) {
+        error("Cannot open help file [%s]",help_file);
+        return;
+    }
+    found_help = FALSE;
+
+    while(TRUE) {
+        if(fgets(temp,MAXLINE+1,fd) == NULL) {
+            if(!found_help)
+                error("No help found in help file");
+            fclose(fd);
+            return;
+        }
+        if(strstr(temp,help_marker)) {
+            token = ' ';
+            /* stop at the end of the line so that bytes left over from an
+               earlier, longer line are never examined */
+            for(i=(int)strlen(help_marker); i<8 && temp[i]!='\0'; i++)
+                if(strchr(digits, temp[i])) {
+                    token = temp[i];
+                    break;
+                }
+            if(token == help_item.ptr[index]) {
+                found_help = TRUE;
+                while(fgets(temp,MAXLINE+1,fd)) {
+                    if(strstr(temp, help_marker)) break;
+                    /* strip trailing control and white-space characters;
+                       the cast keeps the ctype calls defined for bytes
+                       with the high bit set */
+                    for(i=(int)strlen(temp)-1;i>=0;i--)
+                        if(iscntrl((unsigned char)temp[i])||isspace((unsigned char)temp[i]))
+                            temp[i]='\0';
+                        else break;
+/* ignore lines starting with < - these are for html output processing */
+                    if(temp[0]!='<') {
+                        /* count the line terminator too, so the collected
+                           text never grows beyond MAXHELPLENGTH characters
+                           (NOTE(review): the declared size of helptext is
+                           not visible here - confirm it is larger) */
+                        if(strlen(helptext)+strlen(temp)+strlen(eol) >MAXHELPLENGTH)
+                            break;
+                        strcat(helptext,temp);
+                        strcat(helptext,eol);
+                    }
+                }
+                fclose(fd);
+#ifdef WIN_MAC
+                if(numhelp>=1)
+#else
+                if(numhelp>=MAXHELPW)
+#endif
+                {
+                    error("Too many help windows");
+                    return;
+                }
+                numhelp++;
+
+                helpw[numhelp]=FixedWindow(-50, -33, -10, -10, "", QuitHelpW);
+                SelectFont(helpfont);
+#ifdef WIN_MAC
+                htext=ScrollText(helpw[numhelp], 60, 20, helpfont, TRUE, NULL);
+#else
+                htext=ScrollText(helpw[numhelp], 80, 30, helpfont, TRUE, NULL);
+#endif
+                Break(helpw[numhelp]);
+                PushButton(helpw[numhelp], "OK", QuitHelpB);
+                SetTitle(htext, helptext);
+                Show(helpw[numhelp]);
+                return;
+            }
+        }
+    }
+}
+
+/*
+ * Button callback of a help window: remove the window that contains
+ * button b and decrement the count of open help windows (numhelp).
+ */
+void QuitHelpB(ButtoN b)
+{
+    Remove(ParentWindow(b));
+    --numhelp;
+}
+
+/*
+ * Close callback of a help window: remove window w and decrement the
+ * count of open help windows (numhelp).
+ */
+void QuitHelpW(WindoW w)
+{
+    Remove(w);
+    --numhelp;
+}
+static void make_edit_menu(void) /* Fill the Edit menu (editm) and record every item in edit_item.
+ * Items preceded by a mode[n] assignment are tagged with that alignment
+ * mode; the others get no tag here.  Separators are added to the menu but
+ * not recorded.  The final check_menus() call disables the items tagged
+ * with the mode that is not the current aln_mode. */
+{
+ int n=0;
+
+ edit_item.i[n] = CommandItem (editm,"Cut Sequences/XXX", CutSequences); n++;
+ edit_item.i[n] = CommandItem (editm,"Paste Sequences/VVV", PasteSequences); n++;
+ edit_item.mode[n] = MULTIPLEM;
+ edit_item.i[n] = CommandItem (editm,"Select All Sequences/AAA", SelectSeqs); n++;
+ edit_item.mode[n] = PROFILEM;
+ edit_item.i[n] = CommandItem (editm,"Select Profile 1", SelectPrf1); n++;
+ edit_item.mode[n] = PROFILEM;
+ edit_item.i[n] = CommandItem (editm,"Select Profile 2", SelectPrf2); n++;
+ edit_item.mode[n] = PROFILEM;
+ edit_item.i[n] = CommandItem (editm,"Add Profile 2 to Profile 1", MergeProfiles); n++;
+ edit_item.i[n] = CommandItem (editm,"Clear Sequence Selection",ClearSeqs); n++;
+ edit_item.i[n] = CommandItem (editm,"Clear Range Selection",ClearSeqRange); n++;
+ SeparatorItem(editm);
+ edit_item.i[n] = CommandItem (editm,"Search for String/FFF", SearchStrWin); n++;
+ SeparatorItem(editm);
+ edit_item.i[n] = CommandItem (editm,"Remove All Gaps", RemoveGaps); n++;
+ edit_item.i[n] = CommandItem (editm,"Remove Gap-Only Columns", RemoveGapPos); n++;
+ edit_item.num = n; /* number of items recorded */
+ if(aln_mode==MULTIPLEM)
+ check_menus(edit_item,PROFILEM);
+ else
+ check_menus(edit_item,MULTIPLEM);
+}
+
+static void make_align_menu(void) /* Fill the Alignment menu (alignm) and its "Alignment Parameters"
+ * submenu, recording every item of both in align_item.  Items preceded by a
+ * mode[n] assignment are tagged with that alignment mode.  The handles of
+ * the two "Reset ... Gaps" status items and of "Save Log File" are also
+ * kept in new_gaps_item, all_gaps_item and save_item1.  The final
+ * check_menus() call disables the items tagged with the mode that is not
+ * the current aln_mode. */
+{
+ MenU parasm;
+ int n=0;
+
+ align_item.mode[n] = MULTIPLEM;
+ align_item.i[n] = CommandItem (alignm,"Do Complete Alignment/LLL",CAlignWin); n++;
+ align_item.mode[n] = MULTIPLEM;
+ align_item.i[n] = CommandItem (alignm,"Produce Guide Tree Only/GGG",SaveTreeWin); n++;
+ align_item.mode[n] = MULTIPLEM;
+ align_item.i[n] = CommandItem (alignm,"Do Alignment from Guide Tree",AlignFromTreeWin); n++;
+ SeparatorItem(alignm);
+ align_item.mode[n] = MULTIPLEM;
+ align_item.i[n] = CommandItem (alignm,"Realign Selected Sequences",RealignSeqsWin); n++;
+ align_item.mode[n] = MULTIPLEM;
+ align_item.i[n] = CommandItem (alignm,"Realign Selected Residue Range",RealignSeqRangeWin); n++;
+ align_item.mode[n] = PROFILEM;
+ align_item.i[n] = CommandItem (alignm,"Align Profile 2 to Profile 1",PrfPrfAlignWin); n++;
+ align_item.mode[n] = PROFILEM;
+ align_item.i[n] = CommandItem (alignm,"Align Profiles from Guide Trees",PrfPrfTreeAlignWin); n++;
+ align_item.mode[n] = PROFILEM;
+ align_item.i[n] = CommandItem (alignm,"Align Sequences to Profile 1",SeqPrfAlignWin); n++;
+ align_item.mode[n] = PROFILEM;
+ align_item.i[n] = CommandItem (alignm,"Align Sequences to Profile 1 from Tree",SeqPrfTreeAlignWin); n++;
+
+ SeparatorItem(alignm);
+ parasm=SubMenu(alignm,"Alignment Parameters"); /* the items down to "Secondary Structure Parameters" go into this submenu */
+ new_gaps_item=align_item.i[n]=StatusItem(parasm, "Reset New Gaps before Alignment", set_reset_new_gaps);
+ SetStatus(align_item.i[n],reset_alignments_new); n++;
+ all_gaps_item=align_item.i[n]=StatusItem(parasm, "Reset All Gaps before Alignment", set_reset_all_gaps);
+ SetStatus(align_item.i[n],reset_alignments_all); n++;
+ align_item.i[n] = CommandItem (parasm,"Pairwise Alignment Parameters",PWParameters); n++;
+ align_item.i[n] = CommandItem (parasm,"Multiple Alignment Parameters",MultiParameters); n++;
+ align_item.i[n] = CommandItem (parasm,"Protein Gap Parameters",GapParameters); n++;
+ align_item.mode[n] = PROFILEM;
+ align_item.i[n] = CommandItem (parasm,"Secondary Structure Parameters",SSParameters); n++;
+ align_item.i[n]=StatusItem(alignm, "Save Log File", set_save_log); /* back in the main Alignment menu */
+ save_item1=align_item.i[n];
+ SetStatus(save_item1,save_log); n++;
+ align_item.i[n] = CommandItem (alignm,"Output Format Options",OutputParameters); n++;
+ align_item.num = n; /* number of items recorded */
+ if(aln_mode==MULTIPLEM)
+ check_menus(align_item,PROFILEM);
+ else
+ check_menus(align_item,MULTIPLEM);
+
+}
+
+/*
+ * Status-item callback shared by the two "Save Log File" menu entries:
+ * copy the status of the clicked item i into save_log and mirror it on
+ * both entries (save_item1 and save_item2) so they stay in step.
+ */
+void set_save_log(IteM i)
+{
+    save_log = GetStatus(i);
+
+    SetStatus(save_item1, save_log);
+    SetStatus(save_item2, save_log);
+}
+
+static void make_tree_menu(void) /* Fill the Trees menu (treem) and record every item in tree_item.
+ * Every item is tagged MULTIPLEM.  The status items start with the current
+ * values of tossgaps, kimura and save_log; the "Save Log File" handle is
+ * also kept in save_item2.  The final check_menus() call disables the
+ * items tagged with the mode that is not the current aln_mode. */
+{
+ int n=0;
+
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n] = CommandItem (treem,"Draw N-J Tree/RRR",DrawTreeWin); n++;
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n] = CommandItem (treem,"Bootstrap N-J Tree/BBB",BootstrapTreeWin); n++;
+ SeparatorItem(treem);
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n]=StatusItem(treem, "Exclude Positions with Gaps", set_tossgaps);
+ SetStatus(tree_item.i[n],tossgaps); n++;
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n]=StatusItem(treem, "Correct for Multiple Substitutions", set_kimura);
+ SetStatus(tree_item.i[n],kimura); n++;
+ SeparatorItem(treem);
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n]=StatusItem(treem, "Save Log File", set_save_log);
+ save_item2=tree_item.i[n];
+ SetStatus(save_item2,save_log); n++;
+ tree_item.mode[n] = MULTIPLEM;
+ tree_item.i[n] = CommandItem (treem,"Output Format Options",OutputTreeParameters); n++;
+ tree_item.mode[n] = MULTIPLEM; /* NOTE(review): tags slot n, one past the last item created; check_menus() only looks at slots below num, so this has no effect here */
+ tree_item.num = n; /* number of items recorded */
+ if(aln_mode==MULTIPLEM)
+ check_menus(tree_item,PROFILEM);
+ else
+ check_menus(tree_item,MULTIPLEM);
+
+}
+
+static void make_color_menu(void) /* Fill the Colors menu (colorm) with four status items and record
+ * them in color_item.  Each item starts with the current value of its flag
+ * (inverted, usebw, usedefcolors, useusercolors); the last three handles
+ * are also kept in bw_item, defcol_item and usercol_item.  No mode tags are
+ * assigned and check_menus() is not called from here. */
+{
+ int n=0;
+
+ color_item.i[n]=StatusItem(colorm, "Background Coloring", set_inverted);
+ SetStatus(color_item.i[n],inverted); n++;
+ SeparatorItem(colorm);
+ bw_item=color_item.i[n] = StatusItem (colorm,"Black and White",BlackandWhite);
+ SetStatus(color_item.i[n],usebw); n++;
+ defcol_item=color_item.i[n] = StatusItem (colorm,"Default Colors",DefColorPar);
+ SetStatus(color_item.i[n],usedefcolors); n++;
+ /*usercol_item=color_item.i[n] = StatusItem (colorm,"Load Color Parameter File",OpenColorParWin);*/
+ usercol_item=color_item.i[n] = StatusItem (colorm,"Load Color Parameter File",OpenColorPar);
+ SetStatus(color_item.i[n],useusercolors); n++;
+ color_item.num = n; /* number of items recorded */
+
+}
+
+/*
+ * Enable or disable the items of menu m according to mode: an item whose
+ * mode tag equals mode is disabled, every other item is enabled.  Only the
+ * first m.num items are visited.
+ */
+void check_menus(menu_item m,int mode)
+{
+    int k;
+
+    for (k = 0; k < m.num; k++)
+    {
+        if (m.mode[k] != mode)
+            Enable(m.i[k]);
+        else
+            Disable(m.i[k]);
+    }
+}
+
+
+
Added: trunk/packages/clustalw/branches/upstream/current/xmenu.h
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xmenu.h 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xmenu.h 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,278 @@
+
+/* Help-viewer limits (presumably buffer length and number of help windows - TODO confirm). */
+#define MAXHELPLENGTH 50000
+#define MAXHELPW 2
+
+/* Geometry of the alignment display. */
+#define DCOLS 60 /* sequence display width */
+#define DNAMES 15
+#define DNUMBER 6
+#define SCREENMARGIN 10
+#define SCOREHEIGHT 50 /* pixel height of the column-score strip painted by draw_colscores() */
+
+#define MAXMENU 20 /* capacity of the per-menu arrays in menu_item */
+#define MAXCOLORS 16
+
+/* Message / prompt sizes (presumably characters and lines - TODO confirm). */
+#define MESSLENGTH 70
+#define MESSLINES 10
+#define MAXPROMPTLEN 300
+
+/* Alignment modes: compared against aln_mode and against menu_item.mode[] in check_menus(). */
+#define MULTIPLEM 1
+#define PROFILEM 2
+
+#define NEW 1
+#define OLD 0
+
+/* Drawing formats (presumably the `format` argument of the draw_* routines - TODO confirm). */
+#define HIGHLIGHT 1
+#define NORMAL 0
+
+#define MARGIN 1
+
+/* Paper sizes (presumably the `pagesize` argument of write_ps_file - TODO confirm). */
+#define A4 0
+#define A3 1
+#define USLETTER 2
+
+/* Page orientations (presumably the `orientation` argument of write_ps_file - TODO confirm). */
+#define PORTRAIT 1
+#define LANDSCAPE 0
+
+#define MAXFINDSTR 20
+/* A position in the alignment; presumably (sequence index, residue index) - TODO confirm against users. */
+typedef struct aln_pos {
+ int seq;
+ int res;
+} aln_pos;
+
+/*
+ * Description of one pull-down menu: up to MAXMENU entries, of which the
+ * first `num` are in use.  check_menus() enables/disables entry k by
+ * comparing mode[k] with an alignment mode (MULTIPLEM / PROFILEM).
+ */
+typedef struct menu_item {
+ int num; /* number of entries filled in */
+ IteM i[MAXMENU]; /* toolkit handle of each entry */
+ int mode[MAXMENU]; /* mode tag tested by check_menus() */
+ char ptr[MAXMENU]; /* NOTE(review): use not visible in this part of the file */
+} menu_item;
+
+/*
+ * One named colour.  NOTE(review): r,g,b and pr,pg,pb look like screen and
+ * printer RGB components respectively - confirm against the colour
+ * parameter loader.
+ */
+typedef struct color {
+ char name[20];
+ Uint4 val;
+ float r,g,b;
+ float pr,pg,pb;
+} color;
+
+/* Values of panel_data.type */
+#define NAMES 0
+#define SEQS 1
+
+/*
+ * Per-panel state, stored as the panel's "extra" data and copied in and
+ * out with GetPanelExtra()/SetPanelExtra().  The struct is copied by value,
+ * but the arrays it points to are shared, so writes through the pointers
+ * are visible to every copy.
+ */
+typedef struct panel_data {
+ int type; /* = NAMES or SEQS */
+ PaneL index;
+ int prf_no;
+ char **lines;
+ char **header;
+ char **footer;
+ char **colormask;
+ int nhead;
+ int nfoot;
+ PnlActnProc callback;
+ int pixelheight;
+ int pixelwidth;
+ int vlines;
+ int vseqs;
+ int vcols; /* number of columns shown; draw_colscores() plots firstvcol .. firstvcol+vcols-1 */
+ int nseqs; /* number of sequences in the panel */
+ int ncols; /* number of alignment columns in the panel */
+ int firstseq; /* offset of the panel's first sequence: panel row k is seq_array[firstseq+k+1] */
+ int firstvline;
+ int firstvcol; /* first column plotted by draw_colscores() */
+ int lockoffset;
+ int *selected;
+ int firstsel;
+ int lastsel;
+ int lineheight, charwidth, ascent, descent;
+ BaR vscrollbar,hscrollbar;
+ int *seqweight; /* per-sequence weights filled in by calc_weights(); NULL until computed */
+ int *subgroup;
+ int *colscore; /* per-column score written by make_colscores(), drawn as a percentage of SCOREHEIGHT */
+ char *consensus;
+ Boolean **residue_exception; /* [row][column] flags written by make_colscores() */
+ short **segment_exception; /* [row][column]: 0 = none, 1 = displayed, -1 = remembered but hidden */
+} panel_data;
+
+/*
+ * A scrolling alignment area made of two panels: `names` and `seqs`.
+ * The scoring code in xscore.c works on the `seqs` panel.
+ */
+typedef struct spanel {
+ PaneL names;
+ PaneL seqs;
+ Boolean modified;
+} spanel;
+
+/* A pair of bounds; presumably an inclusive first..last interval - TODO confirm against users. */
+typedef struct range {
+ int first;
+ int last;
+} range;
+
+/*
+ PROTOTYPES - subroutines with capitalised names are defined in NCBI toolkit
+ and cannot be modified!
+*/
+void x_menu(void);
+void ResizeWindowProc(WindoW w);
+
+void shift(Handle a, int dx, int dy);
+/* swap exchanges scores[s1] and scores[s2]; sort_scores sorts scores[f..l] into descending order */
+void swap(float *scores,int s1, int s2);
+void sort_scores(float *scores,int f,int l);
+void reload_alignment(void);
+
+void DrawPanel(PaneL p);
+
+void resize_multi_window(void);
+void resize_prf_window(int numseqs1,int numseqs2);
+void position_scrollbars(spanel p);
+
+void color_seqs(void);
+void color_prf1(void);
+void color_prf2(void);
+
+void select_panel(spanel p);
+void deselect_panel(spanel p);
+void correct_name_bars(Boolean reset);
+void correct_seq_bars(Boolean reset);
+void load_aln_data(spanel p, int fs, int ls, Boolean reset);
+void load_aln(spanel p, int fs, int ls, Boolean reset);
+void remove_gap_pos(int fseq,int lseq,int prf_no);
+
+GrouP make_scroll_area(GrouP w,int prf_no,int nwidth,int swidth,int height,int firstseq,int nseqs,spanel *p);
+
+/* panel drawing routines */
+void draw_seq_pointer(PaneL p,int pos,int format);
+void draw_names(PaneL p);
+void draw_seqs(PaneL p);
+void draw_header(PaneL p);
+void draw_footer(PaneL p);
+void draw_colscores(PaneL p);
+void draw_allseqs(PaneL p,int fseq,int lseq);
+void draw_nameline(PaneL p,int fseq,int lseq,int format);
+void draw_seqline(panel_data data,int seq,PoinT pt,int fcol,int lcol,int format);
+void draw_seqcol(PaneL p,int col,int format);
+void highlight_seqrange(PaneL p,int fcol,int lcol,int format);
+void make_ruler(int length,char *name,char *seq);
+void make_consensus(panel_data data,char *name,char *seq);
+/* make_colscores fills data.colscore[] and data.residue_exception[][] (see xscore.c) */
+void make_colscores(panel_data data);
+int calc_colscore(sint matrix[NUMRES][NUMRES],int col);
+/* recompute the segment exceptions of the active panel(s) (see xscore.c) */
+void calc_seg_exceptions(void);
+int make_struct_data(int prf_no,int length,char *name,char *seq);
+int make_gp_data(int prf_no,int length,char *name,char *seq);
+void make_colormask(panel_data data);
+void init_color_parameters(char *filename);
+char *find_file(char *filename);
+void white_on_black(void);
+void black_on_white(void);
+void text_colors(void);
+void data_colors(void);
+void switch_mode(void);
+/* redraw the sequence panel(s) of the current alignment mode (see xscore.c) */
+void show_segment_exceptions(void);
+/* disable the entries of m tagged with `mode`, enable all others */
+void check_menus(menu_item m,int mode);
+FILE * open_input_file(char *file_name);
+void stripspace(char *str);
+
+/*
+ * User-interface handlers.  The parameter type shows which kind of
+ * toolkit object each handler receives (BaR, TexT, GrouP, ButtoN, PopuP
+ * or IteM).
+ */
+void set_scorecutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+void set_lengthcutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+void set_scorescale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+void set_go_penalty(TexT t);
+void set_ge_penalty(TexT t);
+void set_gap_dist(TexT t);
+void set_ntrials(TexT t);
+void set_ran_seed(TexT t);
+void set_div_seq(TexT t);
+void set_pw_go_penalty(TexT t);
+void set_pw_ge_penalty(TexT t);
+void set_gp(TexT t);
+void set_ktuple(TexT t);
+void set_topdiags(TexT t);
+void set_window(TexT t);
+void set_hyd_res(TexT t);
+void set_matrix(GrouP g);
+void set_dnamatrix(GrouP g);
+void set_user_matrix(ButtoN but);
+void set_user_dnamatrix(ButtoN but);
+/* returns non-zero when a user matrix was obtained (callers in xscore.c test the result) */
+int get_user_matrixname(char *usermtrxname,short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext);
+void set_format(GrouP g);
+void set_button(ButtoN l,Boolean *value);
+void set_toggle(PopuP l,Boolean *value);
+void set_pref_penalties(PopuP l);
+void set_hyd_penalties(PopuP l);
+void set_var_penalties(PopuP l);
+void set_endgaps(PopuP l);
+void set_align_endgappenalties(PopuP l);
+void set_realign_endgappenalties(PopuP l);
+void set_case(PopuP l);
+void set_snos(PopuP l);
+
+void setRange(PopuP l);
+
+void set_save_paras(PopuP l);
+void set_transitions(TexT t);
+void set_save_log(IteM i);
+void set_neg_matrix(PopuP l);
+void set_ambiguities(PopuP l);
+void set_aln_mode(PopuP g);
+void set_pscroll_mode(ButtoN l);
+void set_show_segments(IteM l);
+void set_font_size(PopuP g);
+void set_residue_exceptions(IteM i);
+void set_segment_exceptions(IteM i);
+void set_segment_dnascale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
+void set_fs_toggle(PopuP l);
+/* matrix choice for the column scores and for the segment exceptions (xscore.c) */
+void set_score_matrix(GrouP g);
+void set_segment_matrix(GrouP g);
+void set_score_user_matrix(ButtoN but);
+void set_segment_user_matrix(ButtoN but);
+void set_score_dnamatrix(GrouP g);
+void set_segment_dnamatrix(GrouP g);
+void set_score_user_dnamatrix(ButtoN but);
+void set_segment_user_dnamatrix(ButtoN but);
+void set_pagesize(PopuP g);
+void set_orientation(PopuP g);
+void set_header(PopuP l);
+void set_ruler(PopuP l);
+void set_curve(PopuP l);
+void set_resno(PopuP l);
+void set_resize(PopuP l);
+void set_fres(TexT t);
+void set_lres(TexT t);
+void set_fpres(TexT t);
+void set_lpres(TexT t);
+void set_blocklen(TexT t);
+void set_output_clustal(ButtoN l);
+void set_output_nbrf(ButtoN l);
+void set_output_phylip(ButtoN l);
+void set_output_gcg(ButtoN l);
+void set_output_gde(ButtoN l);
+void set_output_nexus(ButtoN l);
+
+void set_output_pim(ButtoN l); /* Ramu */
+
+void set_output_fasta(ButtoN l); /* Ramu */
+
+void set_pw_matrix(GrouP g);
+void set_pw_dnamatrix(GrouP g);
+void set_pw_user_matrix(ButtoN but);
+void set_pw_user_dnamatrix(ButtoN but);
+void set_output_order(PopuP g);
+void set_output_tree_clustal(ButtoN l);
+void set_output_tree_phylip(ButtoN l);
+void set_output_tree_distances(ButtoN l);
+void set_output_tree_nexus(ButtoN l);
+void set_inverted(IteM i);
+void set_tossgaps(IteM i);
+void set_kimura(IteM i);
+void set_boot_format(PopuP g);
+void set_use_ss1(PopuP l);
+void set_use_ss2(PopuP l);
+void set_helix_gp(TexT t);
+void set_strand_gp(TexT t);
+void set_loop_gp(TexT t);
+void set_terminal_gp(TexT t);
+void set_helix_minus(TexT t);
+void set_helix_plus(TexT t);
+void set_strand_minus(TexT t);
+void set_strand_plus(TexT t);
+void set_ss_output(ButtoN b);
+void set_gp_output(ButtoN b);
+/* menu handler: turns segment exceptions on, recomputes and displays them (xscore.c) */
+void calc_segment_exceptions(IteM i);
+
+/*
+ * PostScript output of an alignment area.  NOTE(review): pagesize and
+ * orientation presumably take the A4/A3/USLETTER and PORTRAIT/LANDSCAPE
+ * constants defined in this header - confirm against the definition.
+ */
+void write_ps_file(spanel p,char *ps_file,char *par_file,int pagesize,
+int orientation,Boolean header, Boolean ruler, Boolean resno, Boolean resize,
+int first_printres,int last_printres,
+int blocklen,Boolean show_curve);
+
Added: trunk/packages/clustalw/branches/upstream/current/xscore.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xscore.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xscore.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1017 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include <vibrant.h>
+#include <document.h>
+
+#include "clustalw.h"
+#include "xmenu.h"
+
+/* Helpers private to this file. */
+/* build_profile: weighted residue profile of sequences first_seq..last_seq-1, written to profile[1..prf_length] */
+static void build_profile(int prf_length,int first_seq,int last_seq,sint matrix[NUMRES][NUMRES],
+ sint *weight,sint **profile);
+/* calc_colscores: recompute column scores for the active panel(s), then redraw as requested */
+static void calc_colscores(Boolean update_seqs,Boolean update_scores);
+/* calc_panel_segment_exceptions: mark low-scoring segments of every sequence in one panel */
+static void calc_panel_segment_exceptions(PaneL p);
+/* calc_weights: compute and cache the sequence weights of one panel */
+static void calc_weights(PaneL p);
+/* remove_short_segments: hide exception segments shorter than length_cutoff */
+static void remove_short_segments(PaneL p);
+
+
+/* State shared with the rest of the program; all of it is defined in other source files. */
+/* NOTE(review): aln_mode is declared Boolean here but compared against MULTIPLEM (1) / PROFILEM (2) below. */
+extern Boolean aln_mode;
+extern Boolean profile1_empty,profile2_empty;
+
+extern int score_cutoff; /* cutoff for residue exceptions */
+extern int score_hwin; /* half window for summing alignment column scores */
+extern int score_scale;
+extern int segment_dnascale;
+extern int length_cutoff; /* cutoffs for segment exceptions */
+extern Boolean residue_exceptions;
+extern Boolean segment_exceptions;
+/* selected matrix numbers and user matrix file names, for column scores and for segment exceptions */
+extern int score_matnum;
+extern char score_mtrxname[];
+extern int segment_matnum;
+extern char segment_mtrxname[];
+extern int score_dnamatnum;
+extern char score_dnamtrxname[];
+extern int segment_dnamatnum;
+extern char segment_dnamtrxname[];
+extern IteM segment_item;
+
+extern double **tmat; /* pairwise distances filled in by calc_weights() */
+
+/* user-supplied matrices and their residue cross-reference tables */
+extern short score_matrix[];
+extern short score_aa_xref[];
+extern short segment_matrix[];
+extern short segment_aa_xref[];
+extern short score_dnamatrix[];
+extern short score_dna_xref[];
+extern short segment_dnamatrix[];
+extern short segment_dna_xref[];
+
+extern WindoW mainw;
+extern FonT datafont;
+extern short idmat[];
+extern short def_dna_xref[],def_aa_xref[];
+extern short swgapdnamt[],clustalvdnamt[]; /* used for alignment scores */
+extern short gon80mt[],gon120mt[],gon250mt[],gon350mt[];
+extern Boolean dnaflag;
+extern sint max_aa;
+extern sint *seqlen_array; /* indexed from 1 in this file */
+extern char **seq_array; /* [sequence][position], both indexed from 1 in this file */
+extern sint gap_pos1, gap_pos2;
+extern sint *output_index;
+extern spanel seq_panel; /* data for multiple alignment area */
+extern spanel prf_panel[]; /* data for profile alignment areas */
+
+/* prompts whose titles are refreshed by the handlers below */
+extern PrompT residue_cutofftext;
+extern PrompT length_cutofftext;
+extern PrompT scorescaletext;
+extern PrompT segmentdnascaletext;
+extern PrompT scoremattext;
+extern PrompT segmentmattext;
+extern PrompT scorednamattext;
+extern PrompT segmentdnamattext;
+extern PopuP show_seg_toggle;
+
+extern GrouP score_matrix_list,seg_matrix_list;
+extern GrouP score_dnamatrix_list,seg_dnamatrix_list;
+
+/* NOTE(review): not referenced anywhere in this file's visible code. */
+static Char filename[FILENAMELEN]; /* used in temporary file selection window */
+
+
+
+/*
+ * Paint the column-score plot of a panel.
+ *
+ * A strip SCOREHEIGHT+1 pixels high along the bottom of the panel (inside
+ * a one-pixel inset) is erased, then one gray bar per plotted column is
+ * painted.  Bars are one character cell wide, start one cell in from the
+ * left edge, and their height is colscore/100 of SCOREHEIGHT.  Columns
+ * firstvcol .. firstvcol+vcols-1 are plotted, clipped to ncols.
+ * Nothing is drawn for an empty panel, a panel without scores, or when no
+ * columns are visible.
+ */
+void draw_colscores(PaneL p)
+{
+    RecT strip, bar;
+    int col, end_col, baseline, x;
+    panel_data data;
+
+    UseWindow(mainw);
+    Select(p);
+    SelectFont(datafont);
+    GetPanelExtra(p, &data);
+
+    if (data.nseqs == 0 || data.colscore == NULL || data.vcols <= 0)
+        return;
+
+    /* clear the strip that holds the plot */
+    ObjectRect(p, &bar);
+    InsetRect(&bar, 1, 1);
+    strip.bottom = bar.bottom;
+    strip.top = strip.bottom - SCOREHEIGHT - 1;
+    strip.left = bar.left;
+    strip.right = bar.right;
+    data_colors();
+    EraseRect(&strip);
+
+    Gray();
+    baseline = strip.bottom;
+    bar.bottom = baseline;
+    bar.left = strip.left + data.charwidth;
+
+    /* last plotted column is limited both by the view and by the alignment */
+    end_col = data.firstvcol + data.vcols;
+    if (end_col > data.ncols)
+        end_col = data.ncols;
+
+    for (col = data.firstvcol; col < end_col; col++)
+    {
+        x = bar.left;
+        MoveTo(x, baseline);
+        bar.right = bar.left + data.charwidth;
+        bar.top = strip.bottom-SCOREHEIGHT*(float)data.colscore[col]/100.0;
+        PaintRect(&bar);
+        bar.left += data.charwidth;
+    }
+
+    black_on_white();
+}
+
+
+/*
+ * Compute the column scores and residue exceptions of one panel.
+ *
+ * data is passed by value, but results are written through its pointers:
+ *   data.colscore[p]               score of column p
+ *   data.residue_exception[s][p]   TRUE when row s is an outlier in column p
+ *
+ * Method, per column p:
+ *  1. profile[p][r] is the mean, over the panel's sequences, of the matrix
+ *     row of the residue found in the column (the matrix is chosen by
+ *     dnaflag and score_dnamatnum / score_matnum).
+ *  2. For each sequence the distance between its own matrix vector and the
+ *     profile is computed (differences scaled by 1/1000).
+ *  3. The score is exp(-mean distance, scaled by score_scale) * 100,
+ *     multiplied by the fraction of sequences that have a residue (codes
+ *     0 .. max_aa-1) in the column.
+ *  4. The distances of those sequences are sorted in DESCENDING order and
+ *     an upper limit ul = q3 + (q3-q1)*score_cutoff/2 is derived from the
+ *     quartiles (ul = largest distance when fewer than 4 values).  A
+ *     residue whose distance exceeds ul is flagged as an exception.
+ *
+ * Nothing is done when the matrix cannot be loaded (an error is reported),
+ * when the panel has no sequences or columns, or when the result arrays
+ * are missing.
+ */
+void make_colscores(panel_data data)
+{
+    int n,i,s,p,r,r1;
+    int res;
+    short *mat_xref, *matptr;
+    float mean;
+    float t,q1,q3,ul;
+    float *seqdist,*sorteddist,diff;
+    sint maxres;
+    sint *seqvector;
+    sint *freq,**profile;
+    sint matrix[NUMRES][NUMRES];
+    Boolean include_gaps=FALSE;
+
+    /* choose the comparison matrix and its residue cross-reference */
+    if(dnaflag)
+    {
+        if (score_dnamatnum==1)
+        {
+            matptr = swgapdnamt;
+            mat_xref = def_dna_xref;
+        }
+        else if (score_dnamatnum==2)
+        {
+            matptr = clustalvdnamt;
+            mat_xref = def_dna_xref;
+        }
+        else
+        {
+            matptr = score_dnamatrix;
+            mat_xref = score_dna_xref;
+        }
+    }
+    else if (score_matnum==1)
+    {
+        matptr = idmat;
+        mat_xref = def_aa_xref;
+    }
+    else if (score_matnum==2)
+    {
+        matptr = gon80mt;
+        mat_xref = def_aa_xref;
+    }
+    else if (score_matnum==3)
+    {
+        matptr = gon120mt;
+        mat_xref = def_aa_xref;
+    }
+    else if (score_matnum==4)
+    {
+        matptr = gon250mt;
+        mat_xref = def_aa_xref;
+    }
+    else if (score_matnum==5)
+    {
+        matptr = gon350mt;
+        mat_xref = def_aa_xref;
+    }
+    else
+    {
+        matptr = score_matrix;
+        mat_xref = score_aa_xref;
+    }
+    maxres = get_matrix(matptr, mat_xref, matrix, FALSE, 100);
+    if (maxres == 0)
+    {
+        error("matrix not found for aln score");
+        return;
+    }
+
+    /* an empty panel has nothing to score; this also keeps the divisions
+       by data.nseqs below well defined */
+    if (data.nseqs<=0 || data.ncols<=0) return;
+    if (data.colscore==NULL || data.residue_exception==NULL) return;
+
+    profile = (sint **) ckalloc( (data.ncols+2) * sizeof (sint *) );
+    for(p=0; p<data.ncols; p++)
+        profile[p] = (sint *) ckalloc( (max_aa+2) * sizeof(sint) );
+    freq = (sint *) ckalloc( (max_aa+2) * sizeof (sint) );
+
+    /* step 1: column profiles */
+    for(p=0;p<data.ncols;p++)
+    {
+        for(r=0;r<max_aa;r++)
+            freq[r]=0;
+        for(s=data.firstseq;s<data.firstseq+data.nseqs;s++)
+            if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
+            {
+                freq[seq_array[s+1][p+1]]++;
+            }
+        for(r=0;r<max_aa;r++)
+        {
+            profile[p][r]=0;
+            for(r1=0;r1<max_aa;r1++)
+                profile[p][r]+=freq[r1]*matrix[r1][r];
+            profile[p][r]/=(float)data.nseqs;
+        }
+    }
+
+    seqvector = (sint *) ckalloc( (max_aa+2) * sizeof(sint) );
+    seqdist=(float *)ckalloc((data.nseqs+1)*sizeof(float));
+    sorteddist=(float *)ckalloc((data.nseqs+1)*sizeof(float));
+
+    for(p=0; p<data.ncols; p++)
+    {
+        /* step 2: distance of every sequence from the column profile */
+        for(s=data.firstseq; s<data.firstseq+data.nseqs; s++)
+        {
+            /* positions past the end of a sequence count as gap_pos1; so do
+               codes that cannot index the matrix (this avoids reading
+               outside matrix[][]) */
+            res = gap_pos1;
+            if (p<seqlen_array[s+1])
+            {
+                res = (int)seq_array[s+1][p+1];
+                if (res<0 || res>=NUMRES) res = gap_pos1;
+            }
+            for (r=0;r<max_aa; r++)
+                seqvector[r]=matrix[r][res];
+
+            seqdist[s-data.firstseq]=0.0;
+            for(r=0;r<max_aa;r++)
+            {
+                diff=profile[p][r]-seqvector[r];
+                diff/=1000.0;
+                seqdist[s-data.firstseq]+=diff*diff;
+            }
+            seqdist[s-data.firstseq]=sqrt((double)seqdist[s-data.firstseq]);
+        }
+
+        /* step 3: mean distance; n = number of distances that take part */
+        mean=0.0;
+        if(include_gaps)
+        {
+            for(s=0; s<data.nseqs; s++)
+                mean+=seqdist[s];
+            mean/=data.nseqs;
+            n=data.nseqs;
+            for(s=0; s<data.nseqs; s++)
+                sorteddist[s]=seqdist[s];
+        }
+        else
+        {
+            n=0;
+            for(s=data.firstseq; s<data.firstseq+data.nseqs; s++)
+                if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
+                {
+                    mean+=seqdist[s-data.firstseq];
+                    n++;
+                }
+            if(n>0) mean/=n;
+            for(s=data.firstseq,i=0; s<data.firstseq+data.nseqs; s++)
+                if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
+                    sorteddist[i++]=seqdist[s-data.firstseq];
+        }
+        sort_scores(sorteddist,0,n-1);
+
+        if(score_scale<=5)
+            data.colscore[p]=exp((double)(-mean*(6-score_scale)/4.0))*100.0*n/data.nseqs;
+        else
+            data.colscore[p]=exp((double)(-mean/(4.0*(score_scale-4))))*100.0*n/data.nseqs;
+
+        /* step 4: upper limit from the quartiles of the sorted distances;
+           the array is in descending order, so the low index is q3 */
+        if(n==0)
+        {
+            ul=0;
+        }
+        else
+        {
+            t = n/4.0 + 0.5;
+            if(t - (int)t == 0.5)
+            {
+                q3=(sorteddist[(int)t]+sorteddist[(int)t+1])/2.0;
+                q1=(sorteddist[n-(int)t]+sorteddist[n-(int)t-1])/2.0;
+            }
+            else if(t - (int)t > 0.5)
+            {
+                q3=sorteddist[(int)t+1];
+                q1=sorteddist[n-(int)t-1];
+            }
+            else
+            {
+                q3=sorteddist[(int)t];
+                q1=sorteddist[n-(int)t];
+            }
+            if (n<4)ul=sorteddist[0];
+            else ul=q3+(q3-q1)*((float)score_cutoff/2.0);
+        }
+
+        for(s=data.firstseq;s<data.firstseq+data.nseqs;s++)
+            if(seqdist[s-data.firstseq]>ul && p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
+                data.residue_exception[s-data.firstseq][p]=TRUE;
+            else
+                data.residue_exception[s-data.firstseq][p]=FALSE;
+    }
+
+    for(p=0;p<data.ncols;p++)
+        ckfree(profile[p]);
+    ckfree(profile);
+    ckfree(freq);
+    ckfree(seqvector);
+    ckfree(seqdist);
+    ckfree(sorteddist);
+}
+
+
+/*
+ * Quicksort scores[f..l] (inclusive bounds) into DESCENDING order.
+ * The middle element is used as pivot; the left part is sorted by a
+ * recursive call and the right part by the next pass of the loop.
+ * An empty or single-element range is left untouched.
+ */
+void sort_scores(float *scores,int f,int l)
+{
+    int scan, boundary;
+
+    while (f < l)
+    {
+        /* move the pivot to the front, then gather larger values behind it */
+        swap(scores, f, (f+l)/2);
+        boundary = f;
+        for (scan = f+1; scan <= l; scan++)
+        {
+            if (scores[scan] > scores[f])
+                swap(scores, ++boundary, scan);
+        }
+        swap(scores, f, boundary);
+
+        sort_scores(scores, f, boundary-1);
+        f = boundary+1;
+    }
+}
+
+/* Exchange the two elements scores[s1] and scores[s2]. */
+void swap(float *scores,int s1, int s2)
+{
+    float *first = scores + s1;
+    float *second = scores + s2;
+    float held = *first;
+
+    *first = *second;
+    *second = held;
+}
+
+
+/*
+ * Handler for the score-plot scale control: stores newval+1 in
+ * score_scale, recomputes the column scores (redrawing the score plot
+ * only) and refreshes the prompt text.  bar, p and oldval are not used.
+ */
+void set_scorescale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+    char label[FILENAMELEN];
+
+    score_scale = newval+1;
+    calc_colscores(FALSE,TRUE);
+
+    sprintf(label,"Score Plot Scale: %2d",score_scale);
+    SetTitle(scorescaletext,label);
+}
+
+/*
+ * Handler for the residue-exception cutoff control: stores newval+1 in
+ * score_cutoff, recomputes the column scores (redrawing the sequences
+ * only when residue exceptions are switched on) and refreshes the prompt
+ * text.  bar, p and oldval are not used.
+ */
+void set_scorecutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+    char label[FILENAMELEN];
+
+    score_cutoff = newval+1;
+    calc_colscores(residue_exceptions,FALSE);
+
+    sprintf(label,"Residue Exception Cutoff: %2d",score_cutoff);
+    SetTitle(residue_cutofftext,label);
+}
+
+
+
+
+/*
+ * Menu handler: switch segment exceptions on, compute them for the
+ * current alignment mode, redraw, and bring the toggle and the menu item
+ * status in line with the new state.  The watch cursor is shown while
+ * this runs.  The item argument is not used.
+ */
+void calc_segment_exceptions(IteM i)
+{
+    WatchCursor();
+
+    segment_exceptions=TRUE;
+    calc_seg_exceptions();
+    show_segment_exceptions();
+
+    /* reflect the new state in the user interface */
+    SetValue(show_seg_toggle,1);
+    SetStatus(segment_item,segment_exceptions);
+
+    ArrowCursor();
+}
+
+/*
+ * Handler for the minimum-segment-length control: stores newval+1 in
+ * length_cutoff, re-applies the length filter to the panel(s) of the
+ * current alignment mode, redraws when segment exceptions are shown and
+ * refreshes the prompt text.  bar, p and oldval are not used.
+ */
+void set_lengthcutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+    char caption[100];
+
+    length_cutoff = newval+1;
+    sprintf(caption,"Minimum Length of Segments: %2d",length_cutoff);
+
+    if(aln_mode!=MULTIPLEM)
+    {
+        remove_short_segments(prf_panel[0].seqs);
+        remove_short_segments(prf_panel[1].seqs);
+    }
+    else
+    {
+        remove_short_segments(seq_panel.seqs);
+    }
+
+    if(segment_exceptions)
+        show_segment_exceptions();
+    SetTitle(length_cutofftext,caption);
+}
+
+
+/*
+ * Button handler: ask for a user protein matrix for the column scores
+ * (matrix number 6).  When one was obtained, recompute the scores and
+ * select the matching entry of the matrix list; otherwise do nothing.
+ */
+void set_score_user_matrix(ButtoN but)
+{
+    if(!get_user_matrixname(score_mtrxname,score_matrix,score_aa_xref,6,&score_matnum,scoremattext))
+        return;
+
+    calc_colscores(residue_exceptions,TRUE);
+    SetValue(score_matrix_list,score_matnum);
+}
+
+/*
+ * Group handler for the protein column-score matrix list.  Choices 1..5
+ * select a built-in matrix.  Any other choice means the user matrix
+ * (number 6): it is used directly when a name is already known, otherwise
+ * the user is asked for one.  The scores are then recomputed and the list
+ * is set to the matrix number actually in effect.
+ */
+void set_score_matrix(GrouP g)
+{
+    int choice = GetValue(g);
+
+    if(choice>0 && choice<6)
+        score_matnum=choice;
+    else if (score_mtrxname[0]!='\0')
+        score_matnum=6;
+    else
+        get_user_matrixname(score_mtrxname,score_matrix,score_aa_xref,6,&score_matnum,scoremattext);
+
+    calc_colscores(residue_exceptions,TRUE);
+
+    SetValue(score_matrix_list,score_matnum);
+}
+
+/*
+ * Button handler: ask for a user protein matrix for the segment
+ * exceptions (matrix number 5).  When one was obtained, recompute the
+ * exceptions, redraw if they are shown, and select the matching entry of
+ * the matrix list; otherwise do nothing.
+ */
+void set_segment_user_matrix(ButtoN but)
+{
+    if(!get_user_matrixname(segment_mtrxname,segment_matrix,segment_aa_xref,5,&segment_matnum,segmentmattext))
+        return;
+
+    calc_seg_exceptions();
+    if(segment_exceptions)
+        show_segment_exceptions();
+    SetValue(seg_matrix_list,segment_matnum);
+}
+
+
+/*
+ * Group handler for the protein segment-exception matrix list.  Choices
+ * 1..4 select a built-in matrix.  Any other choice means the user matrix
+ * (number 5): it is used directly when a name is already known, otherwise
+ * the user is asked for one.  The exceptions are then recomputed, redrawn
+ * if shown, and the list is set to the matrix number actually in effect.
+ */
+void set_segment_matrix(GrouP g)
+{
+    int choice = GetValue(g);
+
+    if(choice>0 && choice<5)
+        segment_matnum=choice;
+    else if (segment_mtrxname[0]!='\0')
+        segment_matnum=5;
+    else
+        get_user_matrixname(segment_mtrxname,segment_matrix,segment_aa_xref,5,&segment_matnum,segmentmattext);
+
+    calc_seg_exceptions();
+    if(segment_exceptions)
+        show_segment_exceptions();
+
+    SetValue(seg_matrix_list,segment_matnum);
+}
+
+
+/*
+ * Button handler: ask for a user DNA matrix for the column scores
+ * (matrix number 3).  When one was obtained, recompute the scores and
+ * select the matching entry of the DNA matrix list; otherwise do nothing.
+ */
+void set_score_user_dnamatrix(ButtoN but)
+{
+    if(!get_user_matrixname(score_dnamtrxname,score_dnamatrix,score_dna_xref,3,&score_dnamatnum,scorednamattext))
+        return;
+
+    calc_colscores(residue_exceptions,TRUE);
+    SetValue(score_dnamatrix_list,score_dnamatnum);
+}
+
+/*
+ * Group handler for the DNA column-score matrix list.  Choices 1..2
+ * select a built-in matrix.  Any other choice means the user matrix
+ * (number 3): it is used directly when a name is already known, otherwise
+ * the user is asked for one.  The scores are then recomputed and the list
+ * is set to the matrix number actually in effect.
+ */
+void set_score_dnamatrix(GrouP g)
+{
+    int choice = GetValue(g);
+
+    if(choice>0 && choice<3)
+        score_dnamatnum=choice;
+    else if (score_dnamtrxname[0]!='\0')
+        score_dnamatnum=3;
+    else
+        get_user_matrixname(score_dnamtrxname,score_dnamatrix,score_dna_xref,3,&score_dnamatnum,scorednamattext);
+
+    calc_colscores(residue_exceptions,TRUE);
+
+    SetValue(score_dnamatrix_list,score_dnamatnum);
+}
+
+/*
+ * Button handler: ask for a user DNA matrix for the segment exceptions
+ * (matrix number 3).  When one was obtained, recompute the exceptions,
+ * redraw if they are shown, and select the matching entry of the DNA
+ * matrix list.  When none was obtained nothing is changed or redrawn,
+ * which matches set_segment_user_matrix() and the two score handlers.
+ */
+void set_segment_user_dnamatrix(ButtoN but)
+{
+
+    if(get_user_matrixname(segment_dnamtrxname,segment_dnamatrix,segment_dna_xref,3,&segment_dnamatnum,segmentdnamattext))
+    {
+        calc_seg_exceptions();
+        if(segment_exceptions) show_segment_exceptions();
+        SetValue(seg_dnamatrix_list,segment_dnamatnum);
+    }
+}
+
+/*
+ * Group handler for the DNA segment-exception matrix list.  Choices 1..2
+ * select a built-in matrix.  Any other choice means the user matrix
+ * (number 3): it is used directly when a name is already known, otherwise
+ * the user is asked for one.  The exceptions are then recomputed, redrawn
+ * if shown, and the list is set to the matrix number actually in effect.
+ */
+void set_segment_dnamatrix(GrouP g)
+{
+    int choice = GetValue(g);
+
+    if(choice>0 && choice<3)
+        segment_dnamatnum=choice;
+    else if (segment_dnamtrxname[0]!='\0')
+        segment_dnamatnum=3;
+    else
+        get_user_matrixname(segment_dnamtrxname,segment_dnamatrix,segment_dna_xref,3,&segment_dnamatnum,segmentdnamattext);
+
+    calc_seg_exceptions();
+    if(segment_exceptions)
+        show_segment_exceptions();
+
+    SetValue(seg_dnamatrix_list,segment_dnamatnum);
+}
+
+/*
+ * Recompute the column scores of a single sequence panel, store the panel
+ * data back, and redraw the sequences and/or the score plot on request.
+ */
+static void refresh_panel_colscores(PaneL seqs,Boolean update_seqs,Boolean update_scores)
+{
+    panel_data data;
+
+    GetPanelExtra(seqs,&data);
+    make_colscores(data);
+    SetPanelExtra(seqs,&data);
+    if(update_seqs)
+        draw_seqs(seqs);
+    if(update_scores)
+        draw_colscores(seqs);
+}
+
+/*
+ * Recompute the column scores for the current alignment mode: the single
+ * multiple-alignment panel, or both profile panels (first, then second).
+ */
+static void calc_colscores(Boolean update_seqs,Boolean update_scores)
+{
+    if(aln_mode!=MULTIPLEM)
+    {
+        refresh_panel_colscores(prf_panel[0].seqs,update_seqs,update_scores);
+        refresh_panel_colscores(prf_panel[1].seqs,update_seqs,update_scores);
+        return;
+    }
+
+    refresh_panel_colscores(seq_panel.seqs,update_seqs,update_scores);
+}
+
+/*
+ * Recompute the segment exceptions for the current alignment mode: the
+ * multiple-alignment panel, or both profile panels (first, then second).
+ */
+void calc_seg_exceptions(void)
+{
+    if(aln_mode!=MULTIPLEM)
+    {
+        calc_panel_segment_exceptions(prf_panel[0].seqs);
+        calc_panel_segment_exceptions(prf_panel[1].seqs);
+        return;
+    }
+
+    calc_panel_segment_exceptions(seq_panel.seqs);
+}
+
+/*
+ * Redraw the sequence panel(s) of the current alignment mode: the
+ * multiple-alignment panel, or both profile panels (first, then second).
+ */
+void show_segment_exceptions(void)
+{
+    if(aln_mode!=MULTIPLEM)
+    {
+        draw_seqs(prf_panel[0].seqs);
+        draw_seqs(prf_panel[1].seqs);
+        return;
+    }
+
+    draw_seqs(seq_panel.seqs);
+}
+
+/*
+ * Apply the minimum-length filter (length_cutoff) to the segment
+ * exceptions of one panel.
+ *
+ * segment_exception[row][col] holds 0 (no exception), 1 (exception that is
+ * displayed) or -1 (exception remembered but hidden).  All hidden entries
+ * are first made visible again; then every run of consecutive 1s shorter
+ * than length_cutoff is hidden by setting it to -1.  Does nothing for a
+ * panel without sequences.
+ */
+static void remove_short_segments(PaneL p)
+{
+    int i,j,k,start;
+    panel_data data;
+
+    GetPanelExtra(p,&data);
+    if(data.nseqs<=0) return;
+
+    /* un-hide everything so the filter starts from the full set */
+    for(i=0;i<data.nseqs;i++)
+        for(j=0;j<data.ncols;j++)
+            if(data.segment_exception[i][j] == -1)
+                data.segment_exception[i][j] = 1;
+
+    for(i=0;i<data.nseqs;i++)
+    {
+        start = -1;    /* first column of the run being scanned, -1 = none */
+        /* j runs one past the last column so that a run reaching the end
+           of the row is closed too */
+        for(j=0;j<=data.ncols;j++)
+        {
+            if(start == -1)
+            {
+                /* j==ncols is only the end marker: never index the row
+                   with it */
+                if(j<data.ncols && data.segment_exception[i][j]==1)
+                    start=j;
+            }
+            else if(j==data.ncols || data.segment_exception[i][j]==0)
+            {
+                if(j-start<length_cutoff)
+                    for(k=start;k<j;k++)
+                        data.segment_exception[i][k] = -1;
+                start = -1;
+            }
+        }
+    }
+
+    SetPanelExtra(p,&data);
+}
+
+/*
+ * Compute the sequence weights of one panel and cache them in
+ * data.seqweight (nseqs+1 entries, indexed from 0).
+ *
+ * For two or more sequences a guide tree is built from pairwise percent
+ * identities: the distances go into tmat, the tree is written to a
+ * temporary file (tree_name) and read back before the weights are
+ * calculated.  The tree is cleared and the file removed afterwards.
+ *
+ * Nothing is done for an empty panel, or when weights are already cached
+ * (data.seqweight is set to NULL when new sequences are loaded).  If the
+ * tree file cannot be opened or read back, the function gives up and
+ * leaves data.seqweight NULL; the arrow cursor is restored on those paths
+ * as well.
+ */
+static void calc_weights(PaneL p)
+{
+    int i,j;
+    int status;
+    sint *weight;
+    float dscore;
+    FILE *tree;
+    panel_data data;
+
+#ifdef UNIX
+    char tree_name[FILENAMELEN]=".score.ph";
+#else
+    char tree_name[FILENAMELEN]="tmp.ph";
+#endif
+
+    GetPanelExtra(p,&data);
+    if(data.nseqs<=0) return;
+
+    /* weights are expensive to compute: reuse them when present */
+    if(data.seqweight!=NULL) return;
+
+    WatchCursor();
+    info("Calculating sequence weights...");
+
+    /* count pairwise percent identities to make a phylogenetic tree */
+    if(data.nseqs>=2)
+    {
+        for (i=1;i<=data.nseqs;i++) {
+            for (j=i+1;j<=data.nseqs;j++) {
+                dscore = countid(i+data.firstseq,j+data.firstseq);
+                tmat[i][j] = (100.0 - dscore)/100.0;
+                tmat[j][i] = tmat[i][j];
+            }
+        }
+
+        if((tree = open_explicit_file(tree_name))==NULL)
+        {
+            ArrowCursor();
+            return;
+        }
+
+        guide_tree(tree,data.firstseq+1,data.nseqs);
+
+        status = read_tree(tree_name, data.firstseq, data.firstseq+data.nseqs);
+        if (status == 0)
+        {
+            ArrowCursor();
+            return;
+        }
+    }
+
+    /* get the sequence weights; `weight` is indexed by absolute sequence
+       number, the cached copy by row within the panel */
+    weight = (sint *) ckalloc( (data.firstseq+data.nseqs+1) * sizeof(sint) );
+    calc_seq_weights(data.firstseq, data.firstseq+data.nseqs,weight);
+    /* data.seqweight is known to be NULL here (checked above) */
+    data.seqweight=(sint *)ckalloc((data.nseqs+1) * sizeof(sint));
+    for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
+        data.seqweight[i-data.firstseq]=weight[i];
+
+    /* clear the memory for the phylogenetic tree */
+    if (data.nseqs >= 2)
+    {
+        clear_tree(NULL);
+        remove(tree_name);
+    }
+    ckfree(weight);
+    SetPanelExtra(p,&data);
+    info("Done.");
+    ArrowCursor();
+}
+
+/*
+ * Mark, for every sequence of one panel, the segments that score badly
+ * against the profile of the whole panel.
+ *
+ * Steps:
+ *  1. Sequence weights are obtained through calc_weights().
+ *  2. A comparison matrix is loaded.  For DNA a positive matrix is read
+ *     and max*segment_dnascale/20 is subtracted from every entry; for
+ *     protein a negative matrix is requested directly.
+ *  3. A weighted profile of the panel is built (build_profile()).
+ *  4. For every residue a score is derived from the profile with the
+ *     sequence's own contribution removed, scaled by the fraction of
+ *     non-gap sequences in the column.  Running sums of these scores are
+ *     formed forwards and backwards, each reset to 0 whenever it becomes
+ *     positive or a gap is met.  Residues whose forward AND backward sums
+ *     are negative are marked in data.segment_exception (as -1).
+ *  5. remove_short_segments() turns the marks into displayed exceptions
+ *     and hides segments shorter than length_cutoff.
+ *
+ * Nothing is done for an empty panel, when no sequence weights are
+ * available (calc_weights() failed), or when the matrix cannot be loaded
+ * (an error is reported).
+ *
+ * NOTE(review): the column tests `j<seqlen_array[i]` here and
+ * `r+1<seqlen_array[i+1]` in build_profile() leave out the last residue of
+ * each sequence, whereas make_colscores() includes it - confirm whether
+ * that is intended.
+ */
+static void calc_panel_segment_exceptions(PaneL p)
+{
+    int i,j;
+    float sum,prev_sum;
+    float gscale;
+    sint **profile;
+    sint *weight,sweight;
+    sint *gaps;
+    sint maxres;
+    int max=0,offset;
+    short *mat_xref, *matptr;
+    sint matrix[NUMRES][NUMRES];
+    float *fsum;
+    float *bsum;
+    float *pscore;
+    panel_data data;
+
+    /* First, calculate sequence weights which will be used to build the
+       profile */
+    calc_weights(p);
+
+    GetPanelExtra(p,&data);
+    if(data.nseqs<=0) return;
+    /* calc_weights() leaves seqweight NULL when it could not build the
+       guide tree; without weights there is nothing to compute */
+    if(data.seqweight==NULL) return;
+
+    WatchCursor();
+    info("Calculating profile scores...");
+
+    for(i=0;i<data.nseqs;i++)
+        for(j=0;j<data.ncols;j++)
+            data.segment_exception[i][j]=0;
+
+    /* get the comparison matrix for building the profile */
+    if(dnaflag)
+    {
+        if (segment_dnamatnum==1)
+        {
+            matptr = swgapdnamt;
+            mat_xref = def_dna_xref;
+        }
+        else if (segment_dnamatnum==2)
+        {
+            matptr = clustalvdnamt;
+            mat_xref = def_dna_xref;
+        }
+        else
+        {
+            matptr = segment_dnamatrix;
+            mat_xref = segment_dna_xref;
+        }
+    }
+    else
+    {
+        if (segment_matnum==1)
+        {
+            matptr = gon80mt;
+            mat_xref = def_aa_xref;
+        }
+        else if (segment_matnum==2)
+        {
+            matptr = gon120mt;
+            mat_xref = def_aa_xref;
+        }
+        else if (segment_matnum==3)
+        {
+            matptr = gon250mt;
+            mat_xref = def_aa_xref;
+        }
+        else if (segment_matnum==4)
+        {
+            matptr = gon350mt;
+            mat_xref = def_aa_xref;
+        }
+        else
+        {
+            matptr = segment_matrix;
+            mat_xref = segment_aa_xref;
+        }
+    }
+
+    /* DNA: get a positive matrix (adjusted by the scale below);
+       protein: get a negative matrix */
+    if(dnaflag)
+        maxres = get_matrix(matptr, mat_xref, matrix, FALSE, 100);
+    else
+        maxres = get_matrix(matptr, mat_xref, matrix, TRUE, 100);
+    if (maxres == 0)
+    {
+        /* same check as make_colscores(): do not work on an unset matrix */
+        error("matrix not found for segment exceptions");
+        ArrowCursor();
+        return;
+    }
+
+    if(dnaflag)
+    {
+        /* find the maximum value */
+        for(i=0;i<=max_aa;i++)
+            for(j=0;j<=max_aa;j++)
+                if(matrix[i][j]>max) max=matrix[i][j];
+        /* subtract max*scale/20 from each matrix value */
+        offset=(float)(max*segment_dnascale)/20.0;
+
+        for(i=0;i<=max_aa;i++)
+            for(j=0;j<=max_aa;j++)
+                matrix[i][j]-=offset;
+    }
+
+    profile = (sint **) ckalloc( (data.ncols+2) * sizeof (sint *) );
+    for(i=0; i<data.ncols+1; i++)
+        profile[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
+
+    /* count the gaps in every column; sequences of the panel are
+       seq_array[firstseq+1 .. firstseq+nseqs], the same range as the main
+       loop below */
+    gaps = (sint *) ckalloc( (data.ncols+1) * sizeof (sint) );
+    for (j=1; j<=data.ncols; j++)
+    {
+        gaps[j-1] = 0;
+        for(i=data.firstseq+1;i<data.firstseq+data.nseqs+1;i++)
+            if (j<seqlen_array[i])
+                if ((seq_array[i][j] < 0) || (seq_array[i][j] > max_aa))
+                    gaps[j-1]++;
+    }
+
+    /* calculate the profile */
+    weight = (sint *) ckalloc( (data.firstseq+data.nseqs+1) * sizeof(sint) );
+    for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
+        weight[i]=data.seqweight[i-data.firstseq];
+
+    build_profile(data.ncols,data.firstseq,data.firstseq+data.nseqs,matrix,weight,profile);
+
+    sweight=0;
+    for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
+        sweight+=weight[i];
+
+    /* Now, use the profile scores to mark segments of each sequence which
+       score badly. */
+    fsum = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
+    bsum = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
+    pscore = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
+    for(i=data.firstseq+1;i<data.firstseq+data.nseqs+1;i++)
+    {
+        /* In a forward phase, sum the profile scores. Mark negative sums as
+           exceptions. If the sum is positive, then it gets reset to 0. */
+        sum=0.0;
+        for(j=1;j<=seqlen_array[i];j++)
+        {
+            gscale = (float)(data.nseqs-gaps[j-1]) / (float)data.nseqs;
+            if(seq_array[i][j]<0 || seq_array[i][j]>=max_aa)
+            {
+                pscore[j-1]=0.0;
+                sum=0.0;
+            }
+            else
+                pscore[j-1]=(profile[j][seq_array[i][j]]-
+                    weight[i-1]*matrix[seq_array[i][j]][seq_array[i][j]])*gscale/sweight;
+            sum+=pscore[j-1];
+            if(sum>0.0) sum=0.0;
+            fsum[j-1]=sum;
+        }
+        /* trim off any positive scoring residues from the end of the segments */
+        prev_sum=0;
+        for(j=seqlen_array[i]-1;j>=0;j--)
+        {
+            if(prev_sum>=0.0 && fsum[j]<0.0 && pscore[j]>=0.0)
+                fsum[j]=0.0;
+            prev_sum=fsum[j];
+        }
+
+        /* Now, in a backward phase, do the same summing process. */
+        sum=0.0;
+        for(j=seqlen_array[i];j>=1;j--)
+        {
+            if(seq_array[i][j]<0 || seq_array[i][j]>=max_aa)
+                sum=0;
+            else
+                sum+=pscore[j-1];
+            if(sum>0.0) sum=0.0;
+            bsum[j-1]=sum;
+        }
+        /* trim off any positive scoring residues from the start of the segments */
+        prev_sum=0;
+        for(j=0;j<seqlen_array[i];j++)
+        {
+            if(prev_sum>=0.0 && bsum[j]<0.0 && pscore[j]>=0.0)
+                bsum[j]=0.0;
+            prev_sum=bsum[j];
+        }
+        /* Mark residues as exceptions if they score negative in the forward
+           AND backward directions. */
+        for(j=1;j<=seqlen_array[i];j++)
+            if(fsum[j-1]<0.0 && bsum[j-1]<0.0)
+                if(seq_array[i][j]>=0 && seq_array[i][j]<max_aa)
+                    data.segment_exception[i-data.firstseq-1][j-1]=-1;
+    }
+
+    for(i=0; i<data.ncols+1; i++)
+        ckfree(profile[i]);
+    ckfree(profile);
+    ckfree(weight);
+    ckfree(gaps);
+    ckfree(pscore);
+    ckfree(fsum);
+    ckfree(bsum);
+
+    SetPanelExtra(p,&data);
+
+    /* Finally, apply the length cutoff to the segments - removing segments
+       shorter than the cutoff */
+    remove_short_segments(p);
+
+    info("Done.");
+    ArrowCursor();
+}
+
+
+/*
+ * Handler for the DNA marking scale control: stores newval+1 in
+ * segment_dnascale, recomputes the segment exceptions, redraws when they
+ * are shown and refreshes the prompt text.  bar, p and oldval are not
+ * used.
+ */
+void set_segment_dnascale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
+{
+    char label[FILENAMELEN];
+
+    segment_dnascale = newval+1;
+
+    calc_seg_exceptions();
+    if(segment_exceptions)
+        show_segment_exceptions();
+
+    sprintf(label,"DNA Marking Scale: %2d",segment_dnascale);
+    SetTitle(segmentdnascaletext,label);
+}
+
+
+/*
+ * Build a weighted profile of the sequences first_seq .. last_seq-1
+ * (0-based; sequence k is seq_array[k+1]).
+ *
+ *   prf_length  number of alignment columns
+ *   matrix      comparison matrix
+ *   weight      sequence weights, indexed by 0-based sequence number
+ *   profile     output; row pos+1 receives column pos, so the caller must
+ *               provide rows 1..prf_length, each wide enough to be indexed
+ *               by gap_pos2
+ *
+ * For every column the summed weight of each residue code (0..max_aa,
+ * gap_pos1, gap_pos2) is collected, and profile[pos+1][res] becomes the
+ * weight-summed matrix score of code `res` against the column.
+ *
+ * The temporary weighting table is released completely on return; the
+ * earlier version freed only rows 0..max_aa of the NUMRES+2 rows it had
+ * allocated.
+ */
+static void build_profile(int prf_length,int first_seq,int last_seq,sint matrix[NUMRES][NUMRES],sint *weight,sint **profile)
+{
+    sint **weighting, d, i, res;
+    sint r, pos;
+    int f;
+
+    weighting = (sint **) ckalloc( (NUMRES+2) * sizeof (sint *) );
+    for (i=0;i<NUMRES+2;i++)
+        weighting[i] = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
+
+    /* summed sequence weight of every residue code in every column */
+    for (r=0; r<prf_length; r++)
+    {
+        for (d=0; d<=max_aa; d++)
+        {
+            weighting[d][r] = 0;
+            for (i=first_seq; i<last_seq; i++)
+                if (r+1<seqlen_array[i+1])
+                    if (d == seq_array[i+1][r+1]) weighting[d][r] += weight[i];
+        }
+        weighting[gap_pos1][r] = 0;
+        for (i=first_seq; i<last_seq; i++)
+            if (r+1<seqlen_array[i+1])
+                if (gap_pos1 == seq_array[i+1][r+1]) weighting[gap_pos1][r] += weight[i];
+        weighting[gap_pos2][r] = 0;
+        for (i=first_seq; i<last_seq; i++)
+            if (r+1<seqlen_array[i+1])
+                if (gap_pos2 == seq_array[i+1][r+1]) weighting[gap_pos2][r] += weight[i];
+    }
+
+    /* profile entry = weights folded with the matrix column of the code */
+    for (pos=0; pos< prf_length; pos++)
+    {
+        for (res=0; res<=max_aa; res++)
+        {
+            f = 0;
+            for (d=0; d<=max_aa; d++)
+                f += (weighting[d][pos] * matrix[d][res]);
+            f += (weighting[gap_pos1][pos] * matrix[gap_pos1][res]);
+            f += (weighting[gap_pos2][pos] * matrix[gap_pos2][res]);
+            profile[pos+1][res] = f;
+        }
+        f = 0;
+        for (d=0; d<=max_aa; d++)
+            f += (weighting[d][pos] * matrix[d][gap_pos1]);
+        f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos1]);
+        f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos1]);
+        profile[pos+1][gap_pos1] = f;
+        f = 0;
+        for (d=0; d<=max_aa; d++)
+            f += (weighting[d][pos] * matrix[d][gap_pos2]);
+        f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos2]);
+        f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos2]);
+        profile[pos+1][gap_pos2] = f;
+    }
+
+    /* release every row that was allocated above, then the row table */
+    for (i=0;i<NUMRES+2;i++)
+        weighting[i]=ckfree((void *)weighting[i]);
+    weighting=ckfree((void *)weighting);
+
+}
+
+
Added: trunk/packages/clustalw/branches/upstream/current/xutils.c
===================================================================
--- trunk/packages/clustalw/branches/upstream/current/xutils.c 2006-11-29 14:30:13 UTC (rev 162)
+++ trunk/packages/clustalw/branches/upstream/current/xutils.c 2006-12-04 00:55:49 UTC (rev 163)
@@ -0,0 +1,1340 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include <vibrant.h>
+
+#include "clustalw.h"
+#include "xmenu.h"
+
+/* Text form of the last font chosen in VSeqMgrFontProc() (filled by FontSpecToStr). */
+char fontbuf[80];
+/* Font most recently created by VSeqMgrFontProc(); NULL until a font has been chosen. */
+FonT tmpFont=NULL;
+/* Forward declaration; the definition is at the end of this file. */
+static void VSeqMgrFontProc ();
+
+/* Forward declaration of the file-local matrix-series file chooser defined below. */
+static int get_series_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext);
+
+extern Boolean x_menus;
+extern WindoW mainw;
+extern GrouP matrix_list,pw_matrix_list;
+extern GrouP dnamatrix_list,pw_dnamatrix_list;
+extern GrouP seg_matrix_list,seg_dnamatrix_list;
+extern GrouP score_matrix_list,score_dnamatrix_list;
+extern Boolean interactive;
+extern Boolean dnaflag;
+extern char hyd_residues[];
+extern sint gap_dist;
+extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
+extern Boolean use_endgaps;
+extern Boolean realign_endgappenalties;
+extern Boolean align_endgappenalties;
+extern sint divergence_cutoff;
+extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
+extern Boolean cl_seq_numbers;
+
+extern Boolean seqRange;
+
+extern sint output_order;
+extern Boolean save_log;
+extern Boolean quick_pairalign;
+extern Boolean neg_matrix;
+extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
+extern Boolean output_fasta;
+extern Boolean save_parameters;
+extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus, output_pim;
+extern char seqname[];
+extern float transition_weight;
+extern float gap_open, gap_extend;
+extern float dna_gap_open, dna_gap_extend;
+extern float prot_gap_open, prot_gap_extend;
+extern float pw_go_penalty, pw_ge_penalty;
+extern float dna_pw_go_penalty, dna_pw_ge_penalty;
+extern float prot_pw_go_penalty, prot_pw_ge_penalty;
+extern sint wind_gap,ktup,window,signif;
+extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
+extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
+extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
+extern Boolean kimura; /* Use correction for multiple substitutions */
+extern sint boot_ntrials; /* number of bootstrap trials */
+extern unsigned sint boot_ran_seed; /* random number generator seed */
+extern sint bootstrap_format;
+extern sint struct_penalties,struct_penalties1,struct_penalties2;
+extern sint output_struct_penalties;
+extern sint profile1_nseqs;
+extern sint nseqs;
+extern Boolean use_ss1, use_ss2;
+extern int inverted;
+extern char mtrxname[], pw_mtrxname[];
+extern char usermtrxname[], pw_usermtrxname[];
+extern sint matnum,pw_matnum;
+extern short usermat[], pw_usermat[];
+extern short aa_xref[], pw_aa_xref[];
+extern char dnamtrxname[], pw_dnamtrxname[];
+extern char dnausermtrxname[], pw_dnausermtrxname[];
+extern sint dnamatnum,pw_dnamatnum;
+extern short userdnamat[], pw_userdnamat[];
+extern short dna_xref[], pw_dna_xref[];
+extern Boolean use_ambiguities;
+
+extern MatMenu matrix_menu;
+extern MatMenu dnamatrix_menu;
+extern MatMenu pw_matrix_menu;
+
+extern sint helix_penalty;
+extern sint strand_penalty;
+extern sint loop_penalty;
+extern sint helix_end_minus;
+extern sint helix_end_plus;
+extern sint strand_end_minus;
+extern sint strand_end_plus;
+extern sint helix_end_penalty;
+extern sint strand_end_penalty;
+
+extern TexT savealntext;
+extern GrouP slow_para,fast_para;
+
+extern PrompT message; /* used in temporary message window */
+extern Boolean mess_output;
+extern FILE *save_log_fd;
+extern color color_lut[];
+extern spanel seq_panel; /* data for multiple alignment area */
+extern spanel prf_panel[]; /* data for profile alignment areas */
+extern Boolean aln_mode;
+extern Boolean fixed_prf_scroll;
+extern Boolean output_ss;
+extern Boolean output_gp;
+extern PrompT mattext,pwmattext;
+extern PrompT dnamattext,pwdnamattext;
+extern int save_format;
+extern Boolean residue_exceptions;
+extern Boolean segment_exceptions;
+extern int font_size;
+extern FonT datafont;
+extern int av_font[];
+extern TexT blocklentext;
+extern IteM segment_item;
+
+extern int pagesize;
+extern int orientation;
+extern Boolean ps_ruler,ps_header,resize,ps_curve,ps_resno;
+extern int first_printres,last_printres,blocklen;
+extern int firstres,lastres;
+
+/*
+* set_go_penalty()
+*
+* Text-field callback.  Parses the field's title with atof(); a value
+* below 0 or above 100 is ignored.  Otherwise the value is stored in
+* gap_open and copied to dna_gap_open or prot_gap_open, whichever
+* dnaflag selects.
+*/
+void set_go_penalty(TexT t)
+{
+    char text[10];
+    float value;
+
+    GetTitle(t, text, 10);
+    value = atof(text);
+    if (value < 0 || value > 100)
+        return;
+
+    gap_open = value;
+    *(dnaflag ? &dna_gap_open : &prot_gap_open) = gap_open;
+}
+
+/*
+* set_ge_penalty()
+*
+* Text-field callback.  Parses the field's title with atof(); a value
+* below 0 or above 100 is ignored.  Otherwise the value is stored in
+* gap_extend and copied to dna_gap_extend or prot_gap_extend, whichever
+* dnaflag selects.
+*/
+void set_ge_penalty(TexT t)
+{
+    char text[10];
+    float value;
+
+    GetTitle(t, text, 10);
+    value = atof(text);
+    if (value < 0 || value > 100)
+        return;
+
+    gap_extend = value;
+    *(dnaflag ? &dna_gap_extend : &prot_gap_extend) = gap_extend;
+}
+
+/*
+* set_gap_dist()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in gap_dist when it lies in the range 0..100; any other
+* value leaves gap_dist untouched.
+*/
+void set_gap_dist(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+        gap_dist = value;
+}
+
+/*
+* set_ntrials()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in boot_ntrials when it lies in the range 0..10000; any
+* other value leaves boot_ntrials untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_ntrials(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 10000)
+        return;
+    boot_ntrials = temp;
+}
+
+/*
+* set_ran_seed()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in boot_ran_seed when it lies in the range 0..1000; any
+* other value leaves boot_ran_seed untouched.
+*/
+void set_ran_seed(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 1000)
+        boot_ran_seed = value;
+}
+
+/*
+* set_div_seq()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in divergence_cutoff when it lies in the range 0..100; any
+* other value leaves divergence_cutoff untouched.
+*/
+void set_div_seq(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+        divergence_cutoff = value;
+}
+
+/*
+* set_pw_go_penalty()
+*
+* Text-field callback.  Parses the field's title with atof(); a value
+* below 0 or above 100 is ignored.  Otherwise the value is stored in
+* pw_go_penalty and copied to dna_pw_go_penalty or prot_pw_go_penalty,
+* whichever dnaflag selects.
+*/
+void set_pw_go_penalty(TexT t)
+{
+    char text[10];
+    float value;
+
+    GetTitle(t, text, 10);
+    value = atof(text);
+    if (value < 0 || value > 100)
+        return;
+
+    pw_go_penalty = value;
+    *(dnaflag ? &dna_pw_go_penalty : &prot_pw_go_penalty) = pw_go_penalty;
+}
+
+/*
+* set_pw_ge_penalty()
+*
+* Text-field callback.  Parses the field's title with atof(); a value
+* below 0 or above 100 is ignored.  Otherwise the value is stored in
+* pw_ge_penalty and copied to dna_pw_ge_penalty or prot_pw_ge_penalty,
+* whichever dnaflag selects.
+*/
+void set_pw_ge_penalty(TexT t)
+{
+    char text[10];
+    float value;
+
+    GetTitle(t, text, 10);
+    value = atof(text);
+    if (value < 0 || value > 100)
+        return;
+
+    pw_ge_penalty = value;
+    *(dnaflag ? &dna_pw_ge_penalty : &prot_pw_ge_penalty) = pw_ge_penalty;
+}
+
+/*
+* set_gp()
+*
+* Text-field callback.  Parses the field's title with atoi(); when the
+* result lies in 0..100 it is stored in wind_gap and copied to
+* dna_wind_gap or prot_wind_gap, whichever dnaflag selects.  Other
+* values are ignored.
+*/
+void set_gp(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+    {
+        wind_gap = value;
+        *(dnaflag ? &dna_wind_gap : &prot_wind_gap) = wind_gap;
+    }
+}
+
+/*
+* set_ktuple()
+*
+* Text-field callback.  Parses the field's title with atoi(); when the
+* result lies in 0..100 it is stored in ktup and copied to dna_ktup or
+* prot_ktup, whichever dnaflag selects.  Other values are ignored.
+*/
+void set_ktuple(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+    {
+        ktup = value;
+        *(dnaflag ? &dna_ktup : &prot_ktup) = ktup;
+    }
+}
+
+/*
+* set_topdiags()
+*
+* Text-field callback.  Parses the field's title with atoi(); when the
+* result lies in 0..100 it is stored in signif and copied to dna_signif
+* or prot_signif, whichever dnaflag selects.  Other values are ignored.
+*/
+void set_topdiags(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+    {
+        signif = value;
+        *(dnaflag ? &dna_signif : &prot_signif) = signif;
+    }
+}
+
+/*
+* set_window()
+*
+* Text-field callback.  Parses the field's title with atoi(); when the
+* result lies in 0..100 it is stored in window and copied to dna_window
+* or prot_window, whichever dnaflag selects.  Other values are ignored.
+*/
+void set_window(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 100)
+    {
+        window = value;
+        *(dnaflag ? &dna_window : &prot_window) = window;
+    }
+}
+
+/*
+* set_hyd_res()
+*
+* Text-field callback.  Copies the alphabetic characters of the field's
+* title into hyd_residues, dropping every other character, and
+* terminates the result.
+*
+* At most strlen(hyd_residues) characters of the title are examined, so
+* the new list can never be longer than the one it replaces.  The size
+* of hyd_residues is not visible in this file, so that existing bound is
+* kept as the only one known to be safe.
+* Scanning also stops at the end of the entered text, so bytes of tstr
+* beyond its terminator are never read.
+*/
+void set_hyd_res(TexT t)
+{
+    char tstr[27] = "";     /* zero-filled: the scan always meets a NUL */
+    size_t limit;
+    size_t i, j;
+
+    GetTitle(t,tstr,27);
+    tstr[26] = '\0';        /* guarantee termination whatever GetTitle wrote */
+
+    limit = strlen(hyd_residues);
+    j = 0;
+    for (i = 0; i < limit && tstr[i] != '\0'; i++)
+    {
+        /* <ctype.h> classifiers require a value in unsigned char range */
+        if (isalpha((unsigned char)tstr[i]))
+            hyd_residues[j++] = tstr[i];
+    }
+    hyd_residues[j] = '\0';
+}
+
+/*
+* set_button()
+*
+* Writes TRUE to *value when GetStatus() reports TRUE for button l,
+* and FALSE otherwise.
+*/
+void set_button(ButtoN l,Boolean *value)
+{
+    int state = GetStatus(l);
+
+    *value = (state == TRUE) ? TRUE : FALSE;
+}
+
+/*
+* set_toggle()
+*
+* Writes TRUE to *value when GetValue() returns 1 for popup l,
+* and FALSE for any other value.
+*/
+void set_toggle(PopuP l,Boolean *value)
+{
+    int choice = GetValue(l);
+
+    *value = (choice == 1) ? TRUE : FALSE;
+}
+
+/* Popup callback: sets no_pref_penalties to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_pref_penalties(PopuP l)
+{
+ set_toggle(l,&no_pref_penalties);
+}
+
+/* Popup callback: sets no_hyd_penalties to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_hyd_penalties(PopuP l)
+{
+ set_toggle(l,&no_hyd_penalties);
+}
+/* Popup callback: sets no_var_penalties to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_var_penalties(PopuP l)
+{
+ set_toggle(l,&no_var_penalties);
+}
+/* Popup callback: sets use_endgaps to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_endgaps(PopuP l)
+{
+ set_toggle(l,&use_endgaps);
+}
+/* Popup callback: sets align_endgappenalties to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_align_endgappenalties(PopuP l)
+{
+ set_toggle(l,&align_endgappenalties);
+}
+/* Popup callback: sets realign_endgappenalties to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_realign_endgappenalties(PopuP l)
+{
+ set_toggle(l,&realign_endgappenalties);
+}
+/* Popup callback: sets lowercase to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_case(PopuP l)
+{
+ set_toggle(l,&lowercase);
+}
+/* Popup callback: sets cl_seq_numbers to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_snos(PopuP l)
+{
+ set_toggle(l,&cl_seq_numbers);
+}
+
+
+/* Popup callback: sets seqRange to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void setRange(PopuP l)
+{
+ set_toggle(l, &seqRange);
+}
+
+/* Popup callback: sets save_parameters to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_save_paras(PopuP l)
+{
+ set_toggle(l,&save_parameters);
+}
+/*
+* set_transitions()
+*
+* Text-field callback.  Parses the field's title with atof(); a value
+* below 0 or above 100 is ignored, any other value is stored in
+* transition_weight.
+*/
+void set_transitions(TexT t)
+{
+    char text[10];
+    float value;
+
+    GetTitle(t, text, 10);
+    value = atof(text);
+    if (value < 0 || value > 100)
+        return;
+
+    transition_weight = value;
+}
+
+/* Popup callback: sets use_ambiguities to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_ambiguities(PopuP l)
+{
+ set_toggle(l,&use_ambiguities);
+}
+
+/* Popup callback: sets neg_matrix to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_neg_matrix(PopuP l)
+{
+ set_toggle(l,&neg_matrix);
+}
+
+/* Button callback: sets output_nbrf to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_nbrf(ButtoN l)
+{
+ set_button(l,&output_nbrf);
+}
+/* Button callback: sets output_phylip to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_phylip(ButtoN l)
+{
+ set_button(l,&output_phylip);
+}
+/* Button callback: sets output_gcg to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_gcg(ButtoN l)
+{
+ set_button(l,&output_gcg);
+}
+
+/*
+* set_output_order()
+*
+* Popup callback: output_order becomes INPUT when the popup's value is
+* 1 and ALIGNED for any other value.
+*/
+void set_output_order(PopuP g)
+{
+    int choice = GetValue(g);
+
+    output_order = (choice == 1) ? INPUT : ALIGNED;
+}
+
+/*
+* set_pagesize()
+*
+* Popup callback.  Maps the popup's value to pagesize (1 -> A4, 2 -> A3,
+* anything else -> USLETTER), then recomputes blocklen from pagesize and
+* the current orientation and shows it in the blocklentext field:
+*
+*              LANDSCAPE   otherwise
+*    A4           150          80
+*    A3           250         150
+*    other        150         150
+*/
+void set_pagesize(PopuP g)
+{
+    char shown[10];
+    int choice;
+    int landscape;
+
+    choice = GetValue(g);
+    switch (choice)
+    {
+        case 1:  pagesize = A4;       break;
+        case 2:  pagesize = A3;       break;
+        default: pagesize = USLETTER; break;
+    }
+
+    landscape = (orientation == LANDSCAPE);
+    if (pagesize == A4)
+        blocklen = landscape ? 150 : 80;
+    else if (pagesize == A3)
+        blocklen = landscape ? 250 : 150;
+    else
+        blocklen = 150;
+
+    sprintf(shown, "%d", blocklen);
+    SetTitle(blocklentext, shown);
+}
+/*
+* set_orientation()
+*
+* Popup callback.  orientation becomes LANDSCAPE when the popup's value
+* is 1 and PORTRAIT otherwise; blocklen is then recomputed from the
+* current pagesize and the new orientation and shown in the
+* blocklentext field:
+*
+*              LANDSCAPE   PORTRAIT
+*    A4           150          80
+*    A3           250         150
+*    other        150         150
+*/
+void set_orientation(PopuP g)
+{
+    char shown[10];
+    int landscape;
+
+    orientation = (GetValue(g) == 1) ? LANDSCAPE : PORTRAIT;
+
+    landscape = (orientation == LANDSCAPE);
+    if (pagesize == A4)
+        blocklen = landscape ? 150 : 80;
+    else if (pagesize == A3)
+        blocklen = landscape ? 250 : 150;
+    else
+        blocklen = 150;
+
+    sprintf(shown, "%d", blocklen);
+    SetTitle(blocklentext, shown);
+}
+/* Popup callback: sets ps_resno to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_resno(PopuP l)
+{
+ set_toggle(l,&ps_resno);
+}
+/* Popup callback: sets ps_curve to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_curve(PopuP l)
+{
+ set_toggle(l,&ps_curve);
+}
+/* Popup callback: sets ps_ruler to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_ruler(PopuP l)
+{
+ set_toggle(l,&ps_ruler);
+}
+/* Popup callback: sets ps_header to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_header(PopuP l)
+{
+ set_toggle(l,&ps_header);
+}
+/* Popup callback: sets resize to TRUE when the popup's value is 1, FALSE otherwise (via set_toggle). */
+void set_resize(PopuP l)
+{
+ set_toggle(l,&resize);
+}
+/*
+* set_fres()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in firstres when it lies in the range 0..100000; any other
+* value leaves firstres untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_fres(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 100000)
+        return;
+    firstres = temp;
+}
+/*
+* set_lres()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in lastres when it lies in the range 0..100000; any other
+* value leaves lastres untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_lres(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 100000)
+        return;
+    lastres = temp;
+}
+/*
+* set_fpres()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in first_printres when it lies in the range 0..10000; any
+* other value leaves first_printres untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_fpres(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 10000)
+        return;
+    first_printres = temp;
+}
+/*
+* set_lpres()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in last_printres when it lies in the range 0..10000; any
+* other value leaves last_printres untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_lpres(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 10000)
+        return;
+    last_printres = temp;
+}
+/*
+* set_blocklen()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in blocklen when it lies in the range 0..10000; any other
+* value leaves blocklen untouched.
+*
+* Text that atoi() cannot convert (including an empty field) yields 0,
+* which is inside the accepted range.
+*/
+void set_blocklen(TexT t)
+{
+    char str[10];   /* local array: its address is never null, so no null test */
+    int temp;
+
+    GetTitle(t,str,10);
+    temp = atoi(str);
+    if (temp < 0 || temp > 10000)
+        return;
+    blocklen = temp;
+}
+
+/* Button callback: sets output_tree_nexus to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_tree_nexus(ButtoN l)
+{
+ set_button(l,&output_tree_nexus);
+}
+
+
+/* Button callback: sets output_pim to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_pim(ButtoN l)
+{
+ set_button(l,&output_pim);
+}
+
+/* Button callback: sets output_tree_clustal to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_tree_clustal(ButtoN l)
+{
+ set_button(l,&output_tree_clustal);
+}
+/* Button callback: sets output_tree_phylip to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_tree_phylip(ButtoN l)
+{
+ set_button(l,&output_tree_phylip);
+}
+/* Button callback: sets output_tree_distances to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_tree_distances(ButtoN l)
+{
+ set_button(l,&output_tree_distances);
+}
+/* Menu-item callback: stores the item's current GetStatus() result in tossgaps. */
+void set_tossgaps(IteM i)
+{
+ tossgaps=GetStatus(i);
+}
+/* Menu-item callback: stores the item's current GetStatus() result in kimura. */
+void set_kimura(IteM i)
+{
+ kimura=GetStatus(i);
+}
+/*
+* set_boot_format()
+*
+* Popup callback: bootstrap_format becomes BS_NODE_LABELS when the
+* popup's value is 1 and BS_BRANCH_LABELS for any other value.
+*/
+void set_boot_format(PopuP g)
+{
+    int choice = GetValue(g);
+
+    bootstrap_format = (choice == 1) ? BS_NODE_LABELS : BS_BRANCH_LABELS;
+}
+
+/*
+* prompt_for_yes_no()
+*
+* Asks a yes/no question through Message(MSG_YN, ...).  The text shown
+* is "<title>.\n<prompt>?", truncated if necessary to fit the local
+* buffer of MESSLENGTH*MESSLINES bytes.
+*
+* Return values:
+*    'n' when Message() returns ANS_NO, 'y' for any other answer.
+*    When x_menus is not set no dialog is shown and 'y' is returned.
+*/
+char prompt_for_yes_no(char *title,char *prompt)
+{
+    char lin2[MESSLENGTH*MESSLINES];
+
+    /*
+     * A char function must always return a value.
+     * NOTE(review): 'y' is an assumed default for the no-dialog case;
+     * confirm it suits every caller.
+     */
+    if(!x_menus) return('y');
+
+    /* Build the message piecewise, never writing past the end of lin2. */
+    lin2[0] = '\0';
+    strncat(lin2,title,sizeof(lin2)-1);
+    strncat(lin2,".\n",sizeof(lin2)-1-strlen(lin2));
+    strncat(lin2,prompt,sizeof(lin2)-1-strlen(lin2));
+    strncat(lin2,"?",sizeof(lin2)-1-strlen(lin2));
+
+    if (Message(MSG_YN,lin2)==ANS_NO)
+        return('n');
+    else
+        return('y');
+}
+
+
+/*
+* fatal()
+*
+* Formats msg and its variadic arguments, prefixes the result with
+* "FATAL ERROR: " and passes it to Message() with MSG_FATAL.
+* This function does not itself terminate the program.
+* NOTE(review): presumably Message(MSG_FATAL, ...) does; confirm.
+*
+* NOTE(review): vsprintf() is unbounded; a formatted message longer than
+* 999 characters overruns vstr.
+*
+* Return values:
+* none
+*/
+
+void fatal( char *msg,...)
+{
+    char vstr[1000];
+    char istr[MESSLENGTH*MESSLINES];
+    va_list ap;
+
+    va_start(ap,msg);
+    vsprintf(vstr,msg,ap);
+    va_end(ap);
+
+    strcpy(istr,"FATAL ERROR: ");
+    strncat(istr,vstr,MESSLENGTH*MESSLINES-20);
+    Message(MSG_FATAL,istr);
+}
+
+/*
+* error()
+*
+* Formats msg and its variadic arguments and prefixes the result with
+* "ERROR: ".  In interactive mode the text goes to Message() with
+* MSG_ERROR; otherwise it is written to stdout without a trailing
+* newline.
+*
+* NOTE(review): vsprintf() is unbounded; a formatted message longer than
+* 999 characters overruns vstr.
+*
+* Return values:
+* none
+*/
+
+void error( char *msg,...)
+{
+    char vstr[1000];
+    char istr[MESSLENGTH*MESSLINES];
+    va_list ap;
+
+    va_start(ap,msg);
+    vsprintf(vstr,msg,ap);
+    va_end(ap);
+
+    strcpy(istr,"ERROR: ");
+    strncat(istr,vstr,MESSLENGTH*MESSLINES-10);
+
+    if (interactive)
+        Message(MSG_ERROR,istr);
+    else
+        fputs(istr,stdout);
+}
+
+/*
+* warning()
+*
+* Formats msg and its variadic arguments and prefixes the result with
+* "WARNING: ".  In interactive mode the text goes to Message();
+* otherwise it is written to stdout without a trailing newline.
+*
+* NOTE(review): the interactive path uses MSG_ERROR, the same key as
+* error(); confirm that is intended for warnings.
+* NOTE(review): vsprintf() is unbounded; a formatted message longer than
+* 999 characters overruns vstr.
+*
+* Return values:
+* none
+*/
+
+void warning( char *msg,...)
+{
+    char vstr[1000];
+    char istr[MESSLENGTH*MESSLINES];
+    va_list ap;
+
+    va_start(ap,msg);
+    vsprintf(vstr,msg,ap);
+    va_end(ap);
+
+    strcpy(istr,"WARNING: ");
+    strncat(istr,vstr,MESSLENGTH*MESSLINES-10);
+
+    if (interactive)
+        Message(MSG_ERROR,istr);
+    else
+        fputs(istr,stdout);
+}
+
+/*
+* info()
+*
+* Does nothing unless mess_output is set.  Otherwise formats msg and its
+* variadic arguments and keeps at most MESSLENGTH characters of the
+* result.  In non-interactive mode the text is printed on stdout
+* followed by a newline.  In interactive mode it becomes the title of
+* the `message' prompt in the main window and, when save_log is set and
+* save_log_fd is open, is also appended to the log file.
+*
+* NOTE(review): vsprintf() is unbounded; a formatted message longer than
+* 999 characters overruns vstr.
+*
+* Return values:
+* none
+*/
+
+void info( char *msg,...)
+{
+    char vstr[1000];
+    char istr[MESSLENGTH+10];
+    va_list ap;
+
+    if (mess_output)
+    {
+        va_start(ap,msg);
+        vsprintf(vstr,msg,ap);
+        va_end(ap);
+
+        istr[0] = '\0';
+        strncat(istr,vstr,MESSLENGTH);
+
+        if (interactive)
+        {
+            UseWindow(mainw);
+            SelectFont(systemFont);
+            SetTitle(message,istr);
+            if (save_log && save_log_fd!=NULL)
+                fprintf(save_log_fd,"%s\n",istr);
+            Update();
+        }
+        else
+            fprintf(stdout,"%s\n",istr);
+    }
+}
+
+
+/*
+* set_helix_gp()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in helix_penalty when it lies in the range 0..9; any other
+* value is ignored.
+*/
+void set_helix_gp(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        helix_penalty = value;
+}
+
+/*
+* set_strand_gp()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in strand_penalty when it lies in the range 0..9; any other
+* value is ignored.
+*/
+void set_strand_gp(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        strand_penalty = value;
+}
+
+/*
+* set_loop_gp()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in loop_penalty when it lies in the range 0..9; any other
+* value is ignored.
+*/
+void set_loop_gp(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        loop_penalty = value;
+}
+
+/*
+* set_terminal_gp()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in helix_end_penalty when it lies in the range 0..9; any
+* other value is ignored.
+*
+* NOTE(review): only helix_end_penalty is written; strand_end_penalty is
+* declared in this file but not touched here.  Confirm that is intended.
+*/
+void set_terminal_gp(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        helix_end_penalty = value;
+}
+
+/*
+* set_helix_minus()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in helix_end_minus when it lies in the range 0..9; any
+* other value is ignored.
+*/
+void set_helix_minus(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        helix_end_minus = value;
+}
+
+/*
+* set_helix_plus()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in helix_end_plus when it lies in the range 0..9; any other
+* value is ignored.
+*/
+void set_helix_plus(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        helix_end_plus = value;
+}
+
+/*
+* set_strand_plus()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in strand_end_plus when it lies in the range 0..9; any
+* other value is ignored.
+*/
+void set_strand_plus(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        strand_end_plus = value;
+}
+
+/*
+* set_strand_minus()
+*
+* Text-field callback.  Parses the field's title with atoi() and stores
+* the result in strand_end_minus when it lies in the range 0..9; any
+* other value is ignored.
+*/
+void set_strand_minus(TexT t)
+{
+    char text[10];
+    int value;
+
+    GetTitle(t, text, 10);
+    value = atoi(text);
+    if (value >= 0 && value <= 9)
+        strand_end_minus = value;
+}
+
+
+/*
+* set_inverted()
+*
+* Menu-item callback.  Stores the item's status in `inverted' and
+* rewrites entry 0 of color_lut: when inverted is FALSE the entry is
+* named "BLACK" with r=g=b=0.4, otherwise "WHITE" with r=g=b=1.0.  The
+* entry's val is refreshed through SelectColor()/GetColor(), and the
+* sequence panel(s) of the current alignment mode are redrawn.
+*/
+void set_inverted(IteM i)
+{
+    const char *label;
+    double level;
+
+    inverted=GetStatus(i);
+    if (inverted==FALSE)
+    {
+        label = "BLACK";
+        level = 0.4;
+    }
+    else
+    {
+        label = "WHITE";
+        level = 1.0;
+    }
+
+    strcpy(color_lut[0].name,label);
+    color_lut[0].r=level;
+    color_lut[0].g=level;
+    color_lut[0].b=level;
+    SelectColor(color_lut[0].r*255, color_lut[0].g*255, color_lut[0].b*255);
+    color_lut[0].val=GetColor();
+
+    if (aln_mode!=MULTIPLEM)
+    {
+        DrawPanel(prf_panel[0].seqs);
+        DrawPanel(prf_panel[1].seqs);
+    }
+    else
+        DrawPanel(seq_panel.seqs);
+}
+
+/*
+* set_ss_output()
+*
+* Button callback.  Sets output_ss from the button's status, then
+* recomputes output_struct_penalties from output_ss and output_gp:
+*
+*    output_ss  output_gp   output_struct_penalties
+*      TRUE       TRUE               2
+*      TRUE       FALSE              0
+*      FALSE      TRUE               1
+*      FALSE      FALSE              3
+*/
+void set_ss_output(ButtoN b)
+{
+    output_ss = GetStatus(b) ? TRUE : FALSE;
+
+    if (output_ss)
+        output_struct_penalties = output_gp ? 2 : 0;
+    else
+        output_struct_penalties = output_gp ? 1 : 3;
+}
+/*
+* set_gp_output()
+*
+* Button callback.  Sets output_gp from the button's status, then
+* recomputes output_struct_penalties from output_ss and output_gp:
+*
+*    output_ss  output_gp   output_struct_penalties
+*      TRUE       TRUE               2
+*      TRUE       FALSE              0
+*      FALSE      TRUE               1
+*      FALSE      FALSE              3
+*/
+void set_gp_output(ButtoN b)
+{
+    output_gp = GetStatus(b) ? TRUE : FALSE;
+
+    if (output_ss)
+        output_struct_penalties = output_gp ? 2 : 0;
+    else
+        output_struct_penalties = output_gp ? 1 : 3;
+}
+
+/*
+* Button callback: asks for a matrix-series file through
+* get_series_matrixname(); on success the chosen name is copied to
+* mtrxname.  matrix_list is set to matnum whether or not a file was
+* loaded.
+* NOTE(review): the matrix number passed here is the literal 5, while
+* set_matrix() passes matrix_menu.noptions; confirm the two agree.
+*/
+void set_user_matrix(ButtoN but)
+{
+ if(get_series_matrixname(usermtrxname,usermat,aa_xref,5,&matnum,mattext))
+ strcpy(mtrxname,usermtrxname);
+ SetValue(matrix_list,matnum);
+}
+
+
+/*
+* Button callback: asks for a pairwise matrix file through
+* get_user_matrixname(); on success the chosen name is copied to
+* pw_mtrxname.  pw_matrix_list is set to pw_matnum whether or not a file
+* was loaded.
+* NOTE(review): the matrix number passed here is the literal 5, while
+* set_pw_matrix() passes pw_matrix_menu.noptions; confirm the two agree.
+*/
+void set_pw_user_matrix(ButtoN but)
+{
+ if(get_user_matrixname(pw_usermtrxname,pw_usermat,pw_aa_xref,5,&pw_matnum,pwmattext))
+ strcpy(pw_mtrxname,pw_usermtrxname);
+ SetValue(pw_matrix_list,pw_matnum);
+}
+
+/*
+* set_pw_matrix()
+*
+* Group callback for the pairwise matrix list.  A selection strictly
+* between 0 and pw_matrix_menu.noptions picks the built-in matrix of
+* that number and copies its menu string to pw_mtrxname.  Any other
+* selection means the user matrix: if no user matrix name is stored yet
+* the user is asked for a file, otherwise pw_matnum is set to
+* pw_matrix_menu.noptions.  pw_matrix_list is finally set to pw_matnum.
+*/
+void set_pw_matrix(GrouP g)
+{
+    int choice;
+
+    choice = GetValue(g);
+    if (choice<=0 || choice>=pw_matrix_menu.noptions)
+    {
+        if (pw_usermtrxname[0]!='\0')
+            pw_matnum=pw_matrix_menu.noptions;
+        else if (get_user_matrixname(pw_usermtrxname,pw_usermat,pw_aa_xref,pw_matrix_menu.noptions,&pw_matnum,pwmattext))
+            strcpy(pw_mtrxname,pw_usermtrxname);
+    }
+    else
+    {
+        pw_matnum = choice;
+        strcpy(pw_mtrxname,pw_matrix_menu.opt[choice-1].string);
+    }
+    SetValue(pw_matrix_list,pw_matnum);
+}
+/*
+* set_matrix()
+*
+* Group callback for the multiple-alignment matrix list.  A selection
+* strictly between 0 and matrix_menu.noptions picks the built-in matrix
+* of that number and copies its menu string to mtrxname.  Any other
+* selection means the user matrix: if no user matrix name is stored yet
+* the user is asked for a matrix-series file, otherwise matnum is set to
+* matrix_menu.noptions.  matrix_list is finally set to matnum.
+*/
+void set_matrix(GrouP g)
+{
+    int tmp;
+
+    tmp = GetValue(g);
+    if (tmp>0 && tmp<matrix_menu.noptions)
+    {
+        matnum = tmp;
+        strcpy(mtrxname,matrix_menu.opt[tmp-1].string);
+    }
+    else if(usermtrxname[0]=='\0')
+    {
+        /* no user matrix loaded yet: ask for one now */
+        if(get_series_matrixname(usermtrxname,usermat,aa_xref,matrix_menu.noptions,&matnum,mattext))
+            strcpy(mtrxname,usermtrxname);
+    }
+    else
+        matnum=matrix_menu.noptions;
+
+    SetValue(matrix_list,matnum);
+}
+
+/*
+* get_series_matrixname()
+*
+* Asks for an input file name with GetInputFileName() and passes it to
+* user_mat_series() together with usermat and aa_xref.  When both calls
+* succeed, the file name is copied to usermtrxname, *matnum is set to
+* usermatnum and the name is shown in mattext.
+*
+* Return values:
+*    1 when a matrix series was loaded, 0 otherwise.
+*/
+static int get_series_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext)
+{
+    static Char filename[FILENAMELEN];
+
+    if (!GetInputFileName(filename,FILENAMELEN,"",""))
+        return 0;
+    if (!user_mat_series(filename, usermat, aa_xref))
+        return 0;
+
+    strcpy(usermtrxname,filename);
+    *matnum=usermatnum;
+    SetTitle(mattext,usermtrxname);
+    return 1;
+}
+/*
+* get_user_matrixname()
+*
+* Asks for an input file name with GetInputFileName() and passes it to
+* user_mat() together with usermat and aa_xref.  When both calls
+* succeed, the file name is copied to usermtrxname, *matnum is set to
+* usermatnum and the name is shown in mattext.
+*
+* Return values:
+*    1 when a matrix was loaded, 0 otherwise.
+*/
+int get_user_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext)
+{
+    static Char filename[FILENAMELEN];
+
+    if (!GetInputFileName(filename,FILENAMELEN,"",""))
+        return 0;
+    if (!user_mat(filename, usermat, aa_xref))
+        return 0;
+
+    strcpy(usermtrxname,filename);
+    *matnum=usermatnum;
+    SetTitle(mattext,usermtrxname);
+    return 1;
+}
+
+/*
+* Button callback: asks for a DNA matrix file through
+* get_user_matrixname(); on success the chosen name is copied to
+* dnamtrxname.  dnamatrix_list is set to dnamatnum whether or not a file
+* was loaded.
+* NOTE(review): the matrix number passed here is the literal 3, while
+* set_dnamatrix() passes dnamatrix_menu.noptions; confirm the two agree.
+*/
+void set_user_dnamatrix(ButtoN but)
+{
+ if(get_user_matrixname(dnausermtrxname,userdnamat,dna_xref,3,&dnamatnum,dnamattext))
+ strcpy(dnamtrxname,dnausermtrxname);
+ SetValue(dnamatrix_list,dnamatnum);
+}
+
+
+/*
+* Button callback: asks for a pairwise DNA matrix file through
+* get_user_matrixname(); on success the chosen name is copied to
+* pw_dnamtrxname.  pw_dnamatrix_list is set to pw_dnamatnum whether or
+* not a file was loaded.
+* NOTE(review): the matrix number passed here is the literal 3, while
+* set_pw_dnamatrix() passes dnamatrix_menu.noptions; confirm the two agree.
+*/
+void set_pw_user_dnamatrix(ButtoN but)
+{
+ if(get_user_matrixname(pw_dnausermtrxname,pw_userdnamat,pw_dna_xref,3,&pw_dnamatnum,pwdnamattext))
+ strcpy(pw_dnamtrxname,pw_dnausermtrxname);
+ SetValue(pw_dnamatrix_list,pw_dnamatnum);
+}
+
+/*
+* set_pw_dnamatrix()
+*
+* Group callback for the pairwise DNA matrix list.  A selection strictly
+* between 0 and dnamatrix_menu.noptions picks the built-in matrix of
+* that number and copies its menu string to pw_dnamtrxname.  Any other
+* selection means the user matrix: if no user matrix name is stored yet
+* the user is asked for a file, otherwise pw_dnamatnum is set to
+* dnamatrix_menu.noptions.  pw_dnamatrix_list is finally set to
+* pw_dnamatnum.
+*/
+void set_pw_dnamatrix(GrouP g)
+{
+    int choice;
+
+    choice = GetValue(g);
+    if (choice<=0 || choice>=dnamatrix_menu.noptions)
+    {
+        if (pw_dnausermtrxname[0]!='\0')
+            pw_dnamatnum=dnamatrix_menu.noptions;
+        else if (get_user_matrixname(pw_dnausermtrxname,pw_userdnamat,pw_dna_xref,dnamatrix_menu.noptions,&pw_dnamatnum,pwdnamattext))
+            strcpy(pw_dnamtrxname,pw_dnausermtrxname);
+    }
+    else
+    {
+        pw_dnamatnum = choice;
+        strcpy(pw_dnamtrxname,dnamatrix_menu.opt[choice-1].string);
+    }
+    SetValue(pw_dnamatrix_list,pw_dnamatnum);
+}
+/*
+* set_dnamatrix()
+*
+* Group callback for the DNA matrix list.  A selection strictly between
+* 0 and dnamatrix_menu.noptions picks the built-in matrix of that number
+* and copies its menu string to dnamtrxname.  Any other selection means
+* the user matrix: if no user matrix name is stored yet the user is
+* asked for a file, otherwise dnamatnum is set to
+* dnamatrix_menu.noptions.  dnamatrix_list is finally set to dnamatnum.
+*/
+void set_dnamatrix(GrouP g)
+{
+    int choice;
+
+    choice = GetValue(g);
+    if (choice<=0 || choice>=dnamatrix_menu.noptions)
+    {
+        if (dnausermtrxname[0]!='\0')
+            dnamatnum=dnamatrix_menu.noptions;
+        else if (get_user_matrixname(dnausermtrxname,userdnamat,dna_xref,dnamatrix_menu.noptions,&dnamatnum,dnamattext))
+            strcpy(dnamtrxname,dnausermtrxname);
+    }
+    else
+    {
+        dnamatnum = choice;
+        strcpy(dnamtrxname,dnamatrix_menu.opt[choice-1].string);
+    }
+    SetValue(dnamatrix_list,dnamatnum);
+}
+
+/*
+* open_input_file()
+*
+* Opens file_name for reading.  An empty name, or a file that cannot be
+* opened, is reported through error().
+*
+* Return values:
+*    the open stream, or NULL on failure.
+*/
+FILE * open_input_file(char *file_name)
+{
+    FILE *fp;
+
+    if (*file_name == EOS) {
+        error("Bad input file [%s]",file_name);
+        return NULL;
+    }
+
+#ifdef VMS
+    fp = fopen(file_name,"r","rat=cr","rfm=var");
+#else
+    fp = fopen(file_name,"r");
+#endif
+    if (fp == NULL)
+        error("Cannot open input file [%s]",file_name);
+
+    return fp;
+}
+
+
+/*
+* Popup callback: sets use_ss1 to TRUE when the popup's value is 1 and
+* FALSE otherwise, then reloads both profile panels with load_aln()
+* (sequences 0..profile1_nseqs-1 and profile1_nseqs..nseqs-1).
+*/
+void set_use_ss1(PopuP l)
+{
+ set_toggle(l,&use_ss1);
+ load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+ load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+}
+/*
+* Popup callback: sets use_ss2 to TRUE when the popup's value is 1 and
+* FALSE otherwise, then reloads both profile panels with load_aln()
+* (sequences 0..profile1_nseqs-1 and profile1_nseqs..nseqs-1).
+*/
+void set_use_ss2(PopuP l)
+{
+ set_toggle(l,&use_ss2);
+ load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
+ load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
+}
+
+
+/* Button callback: sets output_clustal to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_clustal(ButtoN l)
+{
+ set_button(l,&output_clustal);
+}
+/* Button callback: sets output_gde to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_gde(ButtoN l)
+{
+ set_button(l,&output_gde);
+}
+/* Button callback: sets output_nexus to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_nexus(ButtoN l)
+{
+ set_button(l,&output_nexus);
+}
+
+
+/* Button callback: sets output_fasta to TRUE when GetStatus() reports TRUE for the button, FALSE otherwise (via set_button). */
+void set_output_fasta(ButtoN l)
+{
+ set_button(l,&output_fasta);
+}
+
+/*
+* set_format()
+*
+* Group callback for the save-format selector.  Reads the file name
+* shown in savealntext, cuts it at its last '.', sets save_format from
+* the group's value and appends the matching extension:
+*
+*    1 CLUSTAL ".aln"    2 PIR   ".pir"    3 MSF   ".msf"
+*    4 PHYLIP  ".phy"    5 GDE   ".gde"    6 NEXUS ".nxs"
+*    7 FASTA   ".fasta"
+*
+* Any other value leaves save_format alone and adds no extension.  The
+* resulting name is written back to savealntext.
+*
+* The extension is appended only when it fits completely into the
+* FILENAMELEN-byte buffer, so a long name can no longer overrun it.
+*/
+void set_format(GrouP g)
+{
+    const char *ext = NULL;
+    char path[FILENAMELEN];
+    char *dot;
+    int choice;
+
+    get_path(seqname,path);
+    GetTitle(savealntext, path,FILENAMELEN);
+    path[FILENAMELEN-1] = '\0';     /* guarantee termination */
+
+    /* remove the current extension */
+    dot = strrchr(path,'.');
+    if (dot != NULL)
+        *dot = '\0';
+
+    choice = GetValue(g);
+    switch (choice)
+    {
+        case 1: save_format=CLUSTAL; ext=".aln";   break;
+        case 2: save_format=PIR;     ext=".pir";   break;
+        case 3: save_format=MSF;     ext=".msf";   break;
+        case 4: save_format=PHYLIP;  ext=".phy";   break;
+        case 5: save_format=GDE;     ext=".gde";   break;
+        case 6: save_format=NEXUS;   ext=".nxs";   break;
+        case 7: save_format=FASTA;   ext=".fasta"; break;
+        default: break;
+    }
+
+    /* append only if extension and terminator both fit */
+    if (ext != NULL && strlen(path) + strlen(ext) < sizeof(path))
+        strcat(path,ext);
+
+    SetTitle(savealntext, path);
+}
+
+/*
+* set_residue_exceptions()
+*
+* Menu-item callback.  Flips residue_exceptions (FALSE becomes TRUE,
+* anything else becomes FALSE) and redraws the sequence panel(s) of the
+* current alignment mode.  The item argument itself is not consulted.
+*/
+void set_residue_exceptions(IteM i)
+{
+    residue_exceptions = (residue_exceptions==FALSE) ? TRUE : FALSE;
+
+    if (aln_mode!=MULTIPLEM)
+    {
+        DrawPanel(prf_panel[0].seqs);
+        DrawPanel(prf_panel[1].seqs);
+    }
+    else
+        DrawPanel(seq_panel.seqs);
+}
+
+
+/*
+* set_fs_toggle()
+*
+* Popup callback.  Sets quick_pairalign from the popup (TRUE when its
+* value is 1), then hides one of the slow_para / fast_para groups and
+* shows the other: fast_para is shown when quick_pairalign is set,
+* slow_para otherwise.
+*/
+void set_fs_toggle(PopuP l)
+{
+    set_toggle(l,&quick_pairalign);
+
+    if (!quick_pairalign)
+    {
+        Hide(fast_para);
+        Show(slow_para);
+    }
+    else
+    {
+        Hide(slow_para);
+        Show(fast_para);
+    }
+}
+
+/*
+* set_font_size()
+*
+* Popup callback.  font_size becomes the popup's value minus one and is
+* used as an index into av_font[]; datafont is rebuilt with ParseFont()
+* from the spec "courier,<av_font[font_size]>,m".  The name and sequence
+* panels of the current alignment mode are then redrawn and the scroll
+* bars corrected.
+*
+* NOTE(review): font_size is not range-checked before indexing av_font;
+* confirm the popup can only return values valid for that array.
+*/
+void set_font_size(PopuP g)
+{
+    char spec[30];
+
+    font_size = GetValue(g) - 1;
+    sprintf(spec, "%s,%d,%c", "courier", av_font[font_size], 'm');
+    datafont=ParseFont(spec);
+
+    if (aln_mode!=MULTIPLEM)
+    {
+        DrawPanel(prf_panel[0].names);
+        DrawPanel(prf_panel[0].seqs);
+
+        DrawPanel(prf_panel[1].names);
+        DrawPanel(prf_panel[1].seqs);
+    }
+    else
+    {
+        DrawPanel(seq_panel.names);
+        DrawPanel(seq_panel.seqs);
+    }
+    correct_name_bars(FALSE);
+    correct_seq_bars(FALSE);
+}
+
+/*
+* set_pscroll_mode()
+*
+* Button callback.  Sets fixed_prf_scroll from the button, then updates
+* the extra data of both profile sequence panels: lockoffset becomes the
+* panel's firstvcol when fixed_prf_scroll is set and 0 otherwise.  The
+* sequence scroll bars are corrected afterwards.
+*/
+void set_pscroll_mode(ButtoN l)
+{
+    panel_data data;
+    int k;
+
+    set_button(l,&fixed_prf_scroll);
+
+    for (k=0; k<2; k++)
+    {
+        GetPanelExtra(prf_panel[k].seqs,&data);
+        if (fixed_prf_scroll)
+            data.lockoffset=data.firstvcol;
+        else
+            data.lockoffset=0;
+        SetPanelExtra(prf_panel[k].seqs,&data);
+    }
+    correct_seq_bars(FALSE);
+}
+
+/*
+* set_aln_mode()
+*
+* Popup callback: aln_mode becomes MULTIPLEM when the popup's value is 1
+* and PROFILEM otherwise; switch_mode() is then called.
+*/
+void set_aln_mode(PopuP g)
+{
+    int choice = GetValue(g);
+
+    aln_mode = (choice == 1) ? MULTIPLEM : PROFILEM;
+    switch_mode();
+}
+
+/*
+* set_show_segments()
+*
+* Menu-item callback.  Flips segment_exceptions (FALSE becomes TRUE,
+* anything else becomes FALSE), recalculates the segment exceptions,
+* writes the new state to segment_item and refreshes the display via
+* show_segment_exceptions().
+*/
+void set_show_segments(IteM l)
+{
+    segment_exceptions = (segment_exceptions==FALSE) ? TRUE : FALSE;
+
+    calc_seg_exceptions();
+    SetStatus(segment_item,segment_exceptions);
+    show_segment_exceptions();
+}
+
+
+/*
+* shift()
+*
+* Moves the next position of handle a by (dx,dy): the point obtained
+* from GetNextPosition() is offset and handed to SetNextPosition().
+*/
+void shift(Handle a, int dx, int dy)
+{
+    PoinT next;
+
+    GetNextPosition (a, &next);
+    next.x+=dx;
+    next.y+=dy;
+    SetNextPosition(a, next);
+}
+
+/*
+* stripspace()
+*
+* Removes every whitespace character (as classified by isspace()) from
+* str, in place.  Has an effect only when UNIX is defined; otherwise
+* str is left untouched.
+*
+* An empty or all-whitespace string becomes the empty string.  The
+* string is compacted in place, so no temporary buffer is allocated.
+*/
+void stripspace(char *str)
+{
+#ifdef UNIX
+    register int src,dst;
+
+    dst = 0;
+    for (src=0; str[src]!=EOS; src++)
+    {
+        /* <ctype.h> classifiers require a value in unsigned char range */
+        if (!isspace((unsigned char)str[src]))
+            str[dst++]=str[src];
+    }
+    str[dst] = EOS;
+#else
+    (void)str;
+#endif
+}
+
+/* extra code */
+
+/*
+* VSeqMgrFontProc()
+*
+* Reads the spec of tmpFont, lets the user pick a font with
+* ChooseFont() and, if one is chosen, creates it, stores it in tmpFont,
+* selects it, writes its textual spec to fontbuf and rebuilds datafont
+* from that text.  Progress is traced on stdout with printf().
+*/
+static void VSeqMgrFontProc ()
+{
+    Nlm_FontSpec spec;
+
+    printf(" before getfontspec \n");
+    GetFontSpec(tmpFont, &spec);
+    printf(" done getfontspec \n");
+
+    if (!ChooseFont(&spec, CFF_READ_FSP, NULL))
+        return;
+
+    tmpFont = CreateFont(&spec);
+    SelectFont(tmpFont);
+    FontSpecToStr(&spec, fontbuf, 80);
+    datafont = ParseFont(fontbuf);
+    printf(" font info \n ( %s ) \n", fontbuf);
+}
More information about the debian-med-commit
mailing list